def get_collection(self):
    name = config.USERNAME
    sql = "select * from collection where(name='%s') order by id DESC limit 5" % (name)
    mysqlClient = MysqlClient()
    find_res = mysqlClient.find_all(sql)
    return find_res

def get_history(self):
    name = config.USERNAME
    sql = "select * from content where(name='%s') order by id DESC limit 10" % (name)
    mysqlClient = MysqlClient()
    find_res = mysqlClient.find_all(sql)
    return find_res
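# The query helpers above assume a small MysqlClient wrapper exposing find_all/find_one/save.
# The class below is only a minimal, hypothetical sketch of such a wrapper built on pymysql;
# the connection settings (config.MYSQL_HOST and friends) are assumptions, not part of the
# original code.
import pymysql

import config


class MysqlClient(object):
    def __init__(self):
        # Assumed config names; adjust to the real config module.
        self.conn = pymysql.connect(host=config.MYSQL_HOST,
                                    user=config.MYSQL_USER,
                                    password=config.MYSQL_PASSWORD,
                                    db=config.MYSQL_DB,
                                    charset='utf8mb4')

    def find_all(self, sql):
        # Run a SELECT and return all rows as tuples.
        with self.conn.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchall()

    def find_one(self, sql):
        # Run a SELECT and return the first row, or None.
        with self.conn.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchone()

    def save(self, sql):
        # Run an INSERT/UPDATE and commit.
        with self.conn.cursor() as cursor:
            cursor.execute(sql)
        self.conn.commit()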
import html as HT
import json
import re
import time

from lxml import etree
from lxml.etree import HTML

# config, Download, MysqlClient, RedisClient and the URL templates
# (SHI_URL, QU_URL, POSITION_URL, NEW_POSITION_URL) come from elsewhere in the project.


class Scheduler(object):
    def __init__(self):
        self.download = Download()
        self.db = MysqlClient()
        self.redisClient = RedisClient()

    def run(self):
        # self.get_qu()
        # self.get_zhen()
        # self.push_url_to_redis()
        self.get_position()

    def get_qu(self):
        sql = 'select * from shi'
        results = self.db.find_all(sql)
        for res in results:
            shi_id = res[2]
            url = SHI_URL.format(shi_id='c' + shi_id)
            print(url)
            html = self.download.get_html(url)
            # Check for None before touching status_code.
            if html is not None and html.status_code == 200:
                html = HTML(html.text)
                qu_id_list = html.xpath(
                    '//dl[@class="condition-district show-condition-district"]/dd/a/@href')
                qu_name_list = html.xpath(
                    '//dl[@class="condition-district show-condition-district"]/dd/a/text()')
                for qu_id, name in zip(qu_id_list[1:], qu_name_list[1:]):
                    qu_id = qu_id.split('/')
                    qu_id = qu_id[2]
                    sql = '''insert into qu(pid,qu_id,name) VALUES ('{pid}','{qu_id}','{name}')'''\
                        .format(pid=shi_id, qu_id=qu_id, name=name)
                    print(sql)
                    self.db.save(sql)
            else:
                print('No data at this url')

    def get_zhen(self):
        sql = 'select * from qu'
        results = self.db.find_all(sql)
        for res in results:
            shi_id = res[1]
            qu_id = res[2]
            url = QU_URL.format(shi_id='c' + shi_id, qu_id=qu_id)
            print(url)
            html = self.download.get_html(url)
            if html is not None and html.status_code == 200:
                html = HTML(html.text)
                zhen_id_list = html.xpath(
                    '//dl[@class="condition-area show-condition-area"]/dd/a/@href')
                zhen_name_list = html.xpath(
                    '//dl[@class="condition-area show-condition-area"]/dd/a/text()')
                for zhen_id, name in zip(zhen_id_list[1:], zhen_name_list[1:]):
                    zhen_id = zhen_id.split('/')
                    zhen_id = zhen_id[2]
                    sql = '''insert into zhen(pid,qu_id,zhen_id,name) VALUES ('{pid}','{qu_id}','{zhen_id}','{name}')'''\
                        .format(pid=shi_id, qu_id=qu_id, zhen_id=zhen_id, name=name)
                    print(sql)
                    self.db.save(sql)
            else:
                print('No data at this url')

    def get_position(self):
        redis_results = self.redisClient.pop('employment')
        try:
            json_obj = json.loads(redis_results[1].decode('utf8'))
        except:
            return None
        if json_obj:
            flag = True
            pageToken = 1
            # Handle pagination.
            while flag:
                detail_url_list = []
                url = json_obj['url']
                pre_page = re.search(r'/\?page=(.*?)&', url).group(1)
                if int(pageToken) > 10:
                    break
                url = url.replace(
                    'page=' + pre_page + '&sort=2&ka=page-' + pre_page,
                    'page=' + str(pageToken) + '&sort=2&ka=page-' + str(pageToken))
                cityId = json_obj['cityId']
                zhiweiId = json_obj['zhiweiId']
                print(url)
                html = self.download.get_html(url)
                if html is not None and html.status_code == 200:
                    html = HTML(html.text)
                    # If a listing was posted today, queue its detail page; skip it
                    # if the record already exists in the database (for now).
                    li_xpath = html.xpath('//div[@class="job-list"]/ul/li')
                    for li in li_xpath:
                        content = etree.tostring(li)
                        content = HT.unescape(content.decode())
                        content = HTML(content)
                        li_time = content.xpath('string(//div[@class="info-publis"]/p)')
                        href_url = content.xpath('string(//div[@class="info-primary"]//h3/a/@href)')
                        try:
                            last_str = li_time.split('发布于')[1]
                            minute = last_str.split(':')[1]
                            # Posted today?
                            if minute:
                                # Does the record already exist in the database?
                                try:
                                    cid = re.match(r'^/job_detail/(.*?)\.html', href_url).group(1)
                                    sql = "select * from positions where cid='%s'" % (cid)
                                    find_one_res = self.db.find_one(sql)
                                    if find_one_res is None:
                                        # Insert the cid first to avoid crawling it twice.
                                        sql = "insert into positions(cid) values ('%s')" % (cid)
                                        self.db.save(sql)
                                        detail_url_list.append(config.HOST_URL + href_url)
                                    elif find_one_res[2] is None:
                                        detail_url_list.append(config.HOST_URL + href_url)
                                    else:
                                        print('Record already in database: ' + str(cid))
                                except:
                                    print('Database lookup failed: ' + str(cid))
                        except:
                            print('Posted before today: ' + config.HOST_URL + href_url)
                    results = self.get_detail(detail_url_list, cityId, zhiweiId)
                    # Decide whether to move to the next page.
                    try:
                        last_li = html.xpath(
                            'string(//div[@class="job-list"]/ul/li[last()]//div[@class="info-publis"]/p)')
                        last_str = last_li.split('发布于')[1]
                        minute = last_str.split(':')[1]
                        if minute:
                            pageToken = str(int(pageToken) + 1)
                    except:
                        flag = False
                else:
                    print('No data at this url')

    def get_detail(self, detail_url_list, cityId, zhiweiId):
        for url in detail_url_list:
            print('Fetching detail page: ' + url)
            html = self.download.get_html(url)
            if html is not None and html.status_code == 200:
                html = HTML(html.text)
                try:
                    cid = re.match(r'^https://www.zhipin.com/job_detail/(.*?)\.html', url).group(1)
                except:
                    print('Failed to extract cid')
                    continue
                title = html.xpath('string(//h1)')
                url = url
                try:
                    publishDateStr = html.xpath('string(//span[@class="time"])').split('发布于')[1]
                    publishDate = int(time.mktime(time.strptime(publishDateStr, "%Y-%m-%d %H:%M")))
                except:
                    publishDateStr = None
                    publishDate = None
                try:
                    info = html.xpath(
                        'string(//div[@class="job-banner"]//div[@class="info-primary"]/p)')
                    info = info.split(':')
                    city = info[1][:-2]
                    jingyan = info[2][:-2]
                    xueli = info[3]
                except:
                    city = None
                    jingyan = None
                    xueli = None
                price = html.xpath('string(//div[@class="info-primary"]//span[@class="badge"])')
                posterName = html.xpath('string(//h2)')
                posterId = None
                posterUrl = html.xpath('string(//div[@class="detail-figure"]/img/@src)')
                content = html.xpath('string(//div[@class="job-sec"]/div[@class="text"])').strip()
                try:
                    company_text = html.xpath('string(//a[@ka="job-cominfo"]/@href)')
                    companyID = re.match(r'/gongsi/(.*?)\.html', company_text).group(1)
                except:
                    companyID = None
                createDate = int(time.time())
                # Keep only postings published today.
                temp_time = time.localtime(int(time.time()))
                now_DateStr = time.strftime("%Y-%m-%d", temp_time)
                lt = time.strptime(now_DateStr, "%Y-%m-%d")
                now_timestamp = int(time.mktime(lt))
                if publishDate is None or publishDate < now_timestamp or publishDate >= (now_timestamp + 86400):
                    print('Skipping, not published today: ' + str(url))
                    continue
                res_obj = {
                    'cid': cid,
                    'title': title,
                    'url': url,
                    'publishDateStr': publishDateStr,
                    'publishDate': publishDate,
                    'city': city,
                    'jingyan': jingyan,
                    'xueli': xueli,
                    'price': price,
                    'posterName': posterName,
                    'posterId': posterId,
                    'posterUrl': posterUrl,
                    'content': content,
                    'companyID': companyID,
                    'createDate': createDate,
                    'cityId': cityId,
                    'zhiweiId': zhiweiId
                }
                print(res_obj)
                # Upsert: note the space before ON DUPLICATE KEY UPDATE so the SQL stays valid.
                sql = "insert into positions(cid,title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId,zhiweiId)" \
                      " VALUES ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
                      % (cid, title, url, publishDate, publishDateStr, city, jingyan, xueli, price, posterName, posterId, posterUrl, content, companyID, createDate, cityId, zhiweiId) \
                      + " ON DUPLICATE KEY UPDATE title='%s', url='%s', publishDate='%s', publishDateStr='%s', city='%s', jingyan='%s', xueli='%s', price='%s', posterName='%s', posterId='%s', posterUrl='%s', content='%s', companyID='%s', createDate='%s', cityId='%s', zhiweiId='%s'" \
                      % (title, url, publishDate, publishDateStr, city, jingyan, xueli, price, posterName, posterId, posterUrl, content, companyID, createDate, cityId, zhiweiId)
                self.db.save(sql)
            else:
                print('Failed to fetch detail page: ' + str(url))

    def push_url_to_redis(self):
        # Old town-level (zhen) variant, kept for reference:
        # zhiwei_list = []
        # zhiwei_sql = 'select * from zhiwei'
        # zhiwei_results = self.db.find_all(zhiwei_sql)
        # for zhiwei in zhiwei_results:
        #     zhiwei_list.append(zhiwei[2])
        #
        # zhen_sql = 'select * from zhen'
        # zhen_results = self.db.find_all(zhen_sql)
        #
        # for res in zhen_results:
        #     pid = res[1]
        #     zhen_id = res[2]
        #     for zhiwei_id in zhiwei_list:
        #         url = POSITION_URL.format(pid=pid, zhen_id=zhen_id, zhiwei_id=zhiwei_id, pageToken='1')
        #         self.redisClient.push('employment', url)
        zhiwei_list = []
        zhiwei_sql = 'select * from zhiwei'
        zhiwei_results = self.db.find_all(zhiwei_sql)
        for zhiwei in zhiwei_results:
            zhiwei_list.append(zhiwei[2])
        shi_sql = 'select * from shi'
        shi_results = self.db.find_all(shi_sql)
        for res in shi_results:
            pid = res[2]
            for zhiwei_id in zhiwei_list:
                url = NEW_POSITION_URL.format(pid=pid, zhiwei_id=zhiwei_id, pageToken='1')
                url_obj = {"url": url, "cityId": pid, "zhiweiId": zhiwei_id}
                self.redisClient.push('employment', json.dumps(url_obj))
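# Scheduler assumes a thin RedisClient wrapper: push() enqueues a value, and pop() returns a
# (key, value) pair whose value is bytes, which is why get_position() indexes [1] and decodes
# it inside a try/except. A minimal sketch using redis-py follows; the connection settings
# (config.REDIS_HOST, config.REDIS_PORT) are assumed names, not part of the original code.
import redis

import config


class RedisClient(object):
    def __init__(self):
        # Assumed config names; adjust to the real config module.
        self.client = redis.Redis(host=config.REDIS_HOST,
                                  port=config.REDIS_PORT,
                                  db=0)

    def push(self, key, value):
        # Append a task (e.g. a JSON-encoded url_obj) to the queue.
        self.client.lpush(key, value)

    def pop(self, key, timeout=30):
        # Block until a task is available; returns (key, value) or None on timeout.
        return self.client.brpop(key, timeout=timeout)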
import hashlib
import json
import math
import re
import time

from lxml import etree
from lxml.etree import HTML

# Download and MysqlClient are project-level helpers imported from elsewhere in the repo.


class Bestseller(object):
    def __init__(self):
        self.download = Download()
        self.mysql = MysqlClient()

    def start(self):
        # res1 = self.get_url('onedepa')
        # res2 = self.get_url('twodepa')
        res3 = self.get_url('threedepa')
        # self.get_html(res1)
        # self.get_html(res2)
        self.get_html(res3[4317:4319])

    def get_url(self, typename):
        sql = "select * from %s" % (typename)
        results = self.mysql.find_all(sql)
        return results

    def get_html(self, results):
        for res in results:
            url = res[5]
            typeid = res[1]
            temp_url_lit = []
            url_one = url
            # Build the page-2 URL by swapping the ref segment.
            page2_replace = re.search(r'https://www.amazon.com.*?ref=zg_bs_(.*?/\d+-\d+-\d+)', url).group(1)
            url_tow = url.replace(page2_replace, 'pg_2?&pg=2')
            temp_url_lit.append(url_one)
            temp_url_lit.append(url_tow)
            for url in temp_url_lit:
                response = self.download.get_html(url)
                if response:
                    html = HTML(response.text)
                    url_list = html.xpath(
                        '//div[@id="zg-center-div"]/ol/li//a[@class="a-link-normal a-text-normal"]/@href')
                    for detail_url in url_list:
                        spider_url = 'https://www.amazon.com' + detail_url
                        detail_response = self.download.get_html(spider_url)
                        if detail_response:
                            detail_html = HTML(detail_response.text)
                            sellrank = re.search(
                                r'https://www.amazon.com/.*?/dp/.*?ref=.*?_(\d+)/\d+-\d+-\d+\?',
                                spider_url).group(1)
                            print('sellrank:' + sellrank)
                            product_id = hashlib.md5(detail_url.encode()).hexdigest()
                            title = detail_html.xpath('string(//h1[@id="title"])').strip()
                            price = detail_html.xpath(
                                'string(//span[@id="priceblock_ourprice"])').replace(',', '').replace('$', '')
                            if price == '':
                                price = 0
                            color = detail_html.xpath('string(//div[@id="variation_color_name"]//span)').strip()
                            size = detail_html.xpath('string(//div[@id="variation_size_name"]//span)').strip()
                            commentCount = detail_html.xpath(
                                'string(//span[@id="acrCustomerReviewText"])').split(' ')[0].replace(',', '')
                            if commentCount == '':
                                commentCount = 0
                            commentRating = detail_html.xpath(
                                'string(//a[@class="a-popover-trigger a-declarative"]/i/span)').split(' ')[0]
                            if commentRating == '':
                                commentRating = 0
                            crawled_timestamp = int(time.time())
                            crawled_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                            crawled_date = time.strftime("%Y-%m-%d", time.localtime())
                            # ASIN
                            try:
                                asin = re.search(r'https://www.amazon.com/.*?/dp/(.*?)/ref=.*?', spider_url).group(1)
                            except:
                                asin = None
                            # Category ranks: parsing rank1/rank2 out of the
                            # productDetails_detailBullets_sections1 "Best Sellers Rank"
                            # block is currently disabled.
                            rank1 = None
                            rank2 = None
                            # Save image info.
                            try:
                                imageUrls = []
                                img_res = re.search(
                                    r"var data = {};.*?var obj = jQuery.parseJSON\('(.*?)'\);",
                                    detail_response.text, re.S)
                                img_obj = json.loads(img_res.group(1))
                                key_one = list(img_obj['colorImages'].keys())[0]
                                for data in img_obj['colorImages'][key_one]:
                                    imageUrls.append(data['large'])
                                for img in imageUrls:
                                    img_id = hashlib.md5(img.encode()).hexdigest()
                                    img_url = img
                                    sql = "insert into image(product_id,img_id,img_url,crawled_timestamp,crawled_time) values ('%s','%s','%s','%s','%s')" \
                                          % (asin, img_id, img_url, crawled_timestamp, crawled_time) \
                                          + " ON DUPLICATE KEY UPDATE crawled_timestamp='%s',crawled_time='%s'" % (crawled_timestamp, crawled_time)
                                    print(sql)
                                    self.mysql.save(sql)
                            except:
                                pass
                            # Save follow-sale (other sellers) info.
                            have_follow_sale = '0'
                            follow_sale_num = 0
                            follow_sale_str = detail_html.xpath(
                                'string(//div[@id="olpPocs_feature_div"]/div/span)')
                            if follow_sale_str != '':
                                have_follow_sale = '1'
                                follow_sale_num = re.search(r'\((\d+)\)', follow_sale_str).group(1)
                                follow_sale_url = detail_html.xpath(
                                    'string(//div[@id="olpPocs_feature_div"]/div/span/a/@href)')
                                if follow_sale_url[0:4] == 'http':
                                    follow_sale_url = follow_sale_url
                                else:
                                    follow_sale_url = 'https://www.amazon.com' + follow_sale_url + '&startIndex={startIndex}'
                                follow_response = self.get_follow_sale(follow_sale_url, follow_sale_num)
                                for item in follow_response:
                                    # Use item-local names so the product price above is not overwritten.
                                    follow_sale_id = item['follow_sale_id']
                                    item_price = item['price']
                                    item_seller = item['seller']
                                    item_type = item['type']
                                    sql = "insert into follow_sale(product_id,follow_sale_id,price,seller,type,crawled_timestamp,crawled_time) values ('%s','%s','%s','%s','%s','%s','%s')" \
                                          % (asin, follow_sale_id, item_price, item_seller, item_type, crawled_timestamp, crawled_time) \
                                          + " ON DUPLICATE KEY UPDATE crawled_timestamp='%s',crawled_time='%s'" % (crawled_timestamp, crawled_time)
                                    print(sql)
                                    self.mysql.save(sql)
                            # Save product info.
                            sql = "insert into bestseller(typeid,sellrank,product_id,title,url,price,color,size,commentCount,commentRating,have_follow_sale,follow_sale_num,asin,rank1,rank2,crawled_timestamp,crawled_time,crawled_date) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
                                  % (typeid, sellrank, product_id, title, url, price, color, size, commentCount, commentRating, have_follow_sale, follow_sale_num, asin, rank1, rank2, crawled_timestamp, crawled_time, crawled_date) \
                                  + " ON DUPLICATE KEY UPDATE sellrank='%s',title='%s', url='%s', price='%s',commentCount='%s',crawled_timestamp='%s',crawled_time='%s',crawled_date='%s',follow_sale_num='%s'" % (
                                      sellrank, title, spider_url, price, commentCount, crawled_timestamp, crawled_time, crawled_date, follow_sale_num)
                            print(sql)
                            self.mysql.save(sql)

    def get_follow_sale(self, url, follow_sale_num):
        if int(follow_sale_num) == 0:
            return []
        if int(follow_sale_num) > 10:
            pageNum = math.ceil(int(follow_sale_num) / 10)
        else:
            pageNum = 1
        item_list = []
        for page in range(0, pageNum):
            startIndex = page * 10
            # Keep the URL template intact so every page gets its own startIndex.
            page_url = url.format(startIndex=startIndex)
            print(page_url)
            follow_response = self.download.get_html(page_url)
            if follow_response is None:
                return []
            follow_html = HTML(follow_response.text)
            html_list = follow_html.xpath('//div[@class="a-row a-spacing-mini olpOffer"]')
            for html in html_list:
                html = etree.tostring(html).decode()
                html = HTML(html)
                price = html.xpath(
                    'string(//div[@class="a-column a-span2 olpPriceColumn"]/span)').strip().replace('$', '')
                seller = html.xpath('string(//h3/span)').strip()
                FBA = html.xpath('string(//div[@class="olpBadge"])')
                type = 'FBM'
                if FBA != '':
                    type = 'FBA'
                follow_sale_id = hashlib.md5((seller + price + type).encode()).hexdigest()
                obj = {
                    'follow_sale_id': follow_sale_id,
                    'price': price,
                    'seller': seller,
                    'type': type
                }
                print(obj)
                item_list.append(obj)
        return item_list
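# Both Scheduler and Bestseller rely on a Download helper whose get_html() returns a requests
# response (they check .status_code and read .text) or a falsy value on failure. The sketch
# below is an assumption built on requests; the header value and timeout are illustrative,
# not taken from the original repo.
import requests


class Download(object):
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }

    def get_html(self, url):
        # Fetch a page and return the response object, or None if the request fails.
        try:
            return requests.get(url, headers=self.headers, timeout=10)
        except requests.RequestException:
            return None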
def get_all_message(self):
    sql = "select * from content order by id DESC limit 15"
    mysqlClient = MysqlClient()
    find_res = mysqlClient.find_all(sql)
    return find_res
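# A possible entry point tying the pieces together: seed the Redis queue once, let
# Scheduler.run() drain it, and let Bestseller.start() crawl the Amazon lists. Whether these
# run in one process or as separate jobs is an assumption here, not part of the original code.
if __name__ == '__main__':
    scheduler = Scheduler()
    # scheduler.push_url_to_redis()  # seed the 'employment' queue (run once)
    scheduler.run()

    bestseller = Bestseller()
    bestseller.start()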