def get_html(self, url, ua='', referer='', cookie=None, url_type='', asin=''):
    if not url_type:
        url_type = self.url_type
    html = ''
    cookiesObj = {}
    headers = {}
    is_error = False
    try:
        html, resp, need_param = get_product(url=url, ua=ua, cookies=cookie,
                                             debug_log=self.debug_log,
                                             referer=referer, url_type=url_type)
        if resp.status_code == 404:
            if url_type == 'goods' and asin:
                DataOutput.record_not_found_goods(asin)
                DataOutput.record_not_found_tosell(asin)
        cookiesObj = resp.cookies
        headers = need_param.get('headers')
        return html, (cookiesObj, headers), is_error
    except Exception as e:
        is_error = True
        return html, (cookiesObj, headers), is_error

def bsrData_save(dataQ, debug_log, db_log):
    print('\nbsrData_save init\n')
    data_type = 'bsr'
    if dataQ.RedisQ.llen('bsrData') > 0:
        dbObj = GetDbObj().get_db_obj()
        cur = dbObj.cursor()
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        db_name = SqlConfig.bsrData_db_name
        update_sql = SqlConfig.bsrData_update_sql
        insert_sql = SqlConfig.bsrData_insert_sql
        while True:
            datas = dataQ.get_new_bsrData()
            if not datas:
                if dataQ.RedisQ.llen('bsrData') > 0:
                    datas = dataQ.get_new_bsrData()
                else:
                    break
            for item in datas:
                asin = item
                tuple_list = datas[item]
                tm = int(DataOutput.get_redis_time())
                # print('asin tuple_list: ', asin, tuple_list)
                for bsr_item in tuple_list:  # renamed from `item` to avoid shadowing the outer loop variable
                    if bsr_item and type(bsr_item) is tuple:
                        # print('bsr item: ', bsr_item)
                        itemLen = len(bsr_item)
                        bsr = bsr_item[0]
                        bsrc = bsr_item[1]
                        aday = bsr_item[2]
                        # if itemLen == 4:
                        #     tm = bsr_item[3]
                        # else:
                        #     tm = int(time.time() * 1000)
                        data_dict = dict(asin=asin, bsr=bsr, bsrc=bsrc, tm=tm, aday=aday)
                        data = dataOutput.save_data_to_db(update_sql, insert_sql, asin,
                                                          data_dict, db_name=db_name)
                        # print('bsrData: ', data)
        cur.close()
        dbObj.close()
        db_log.war('%s, %s thread task finished\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
    else:
        db_log.war('%s, %s data queue is empty\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))

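# --- Illustrative sketch (not part of the original module) ---
# `save_data_to_db` is called above but its body is not shown here. Assuming it follows
# the common update-then-insert ("upsert") pattern with psycopg2, it would look roughly
# like the sketch below; the function name, SQL templates and columns are hypothetical
# placeholders, not the project's actual implementation.
import psycopg2


def save_data_to_db_sketch(conn, update_sql, insert_sql, data_dict):
    """Try the UPDATE first; if no row matched, fall back to the INSERT."""
    with conn.cursor() as cur:
        # e.g. "update ... set bsr=%(bsr)s where asin=%(asin)s and aday=%(aday)s"
        cur.execute(update_sql, data_dict)
        if cur.rowcount == 0:
            # nothing was updated, so the row does not exist yet
            cur.execute(insert_sql, data_dict)
    conn.commit()
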
def download(self, ip, asin_or_kw, url_dict):
    time_now = lambda: time.time()
    url_type = self.url_type
    kw = asin_or_kw
    cid = url_dict.get('cid') or 0
    print(ip, kw, cid, url_dict)
    # time.sleep(30)
    kw_url = self.make_search_url(quote(kw), cid)
    url = kw_url
    print('\nkeyword_url_tuple: ', kw_url)
    self.debug_log.debug('url_type: %s, asin: %s, monitor_type: %s, url %s: ' %
                         (url_type, kw, cid, url))
    if url:
        self.debug_log.debug('[ip %s] working... [%s]' % (ip, url))
        startTime = return_PST().strftime("%Y-%m-%d %H:%M:%S")
        time1 = time_now()
        referer = 'https://www.amazon.com'
        self.debug_log.debug('keyword referer: %s' % (referer))
        ua = self.get_ua()
        self.debug_log.debug('keyword ua: %s' % (ua))
        # value_str = ip + ua
        # self.debug_log.debug('keyword value_str: %s' % (value_str))
        url_md5key = url_dict.get('md5') or ''
        if not url_md5key:
            url_md5key = self.get_md5_key(kw + url_type)
        # cookMd5key = self.get_md5_key(value_str)
        cookMd5key = None
        cookie = self.get_cookie(cookMd5key)
        self.debug_log.debug('keyword cookie: %s' % (cookie))
        # Download the url.
        html_list, url_list, cookiesObj, is_error_lsit = \
            self.get_keyword_html_lsit(url, ua, ip, cookie, referer, kw=kw)
        old_dnum = url_dict.get('dnum') or 0
        durl = url_dict.get('durl') or []
        url_dict['durl'] = list(set(durl + url_list))
        url_dict['dnum'] = old_dnum + 1
        # If the keyword was judged not to exist, stop here.
        if self.not_found:
            DataOutput.record_not_found_keyword(kw)
            self.dataQ.record_keyword_not_fund_times()
            msgInt = 0
            proxyInfo = 'the keyword not found'
            self.record_log(kw, time1, msgInt, url_type, startTime, ip, proxyInfo)
            return self.debug_log.war('keyword %s does not exist' % (kw))
        i = -1
        keyword_html_list = []
        if len(html_list) > 0:
            for html in html_list:
                i += 1
                is_error = is_error_lsit[i]
                url = url_list[i]
                if is_error:
                    msgInt = 6
                    proxyInfo = 'get Html error'
                    self.record_log(kw, time1, msgInt, url_type, startTime, ip, proxyInfo)
                else:
                    analyze = self.analyze_html(html, cookie, cookiesObj, ip, kw, url_dict,
                                                cookMd5key, time1, startTime, html_type=url_type)
                    if analyze and analyze != 404:
                        keyword_html_list.append(html)
            if len(html_list) == len(keyword_html_list):
                result, is_error = self.kw_parser(keyword_html_list, kw, cid, ip=ip, url=url)
                if is_error:
                    msgInt = 3
                    proxyInfo = 'get data error'
                    self.record_log(kw, time1, msgInt, url_type, startTime, ip, proxyInfo)
                else:
                    if not result:
                        self.the_url_is_discard(kw, url_dict, url_type, url_md5key)
                        msgInt = 2
                        proxyInfo = 'get data defeated'
                        self.record_log(kw, time1, msgInt, url_type, startTime, ip, proxyInfo)
                    else:
                        keyword_datas = result[0]
                        if not keyword_datas:
                            self.add_url_to_queue(url_dict, url_type=url_type, retry_type=True)
                        else:
                            self.save_success_asin_keyword(kw, url_type=url_type)
                            msgInt = 1
                            proxyInfo = 'get data success'
                            self.record_log(kw, time1, msgInt, url_type, startTime, ip, proxyInfo)
                            data_bytes = pickle.dumps(keyword_datas)
                            self.dataQ.add_keyword_to_queue(data_bytes)
                            self.dataQ.record_data_ok_times()
                            self.dataQ.record_keyword_ok_times()
            else:
                self.the_url_is_discard(kw, url_dict, url_type, url_md5key)
        else:
            self.the_url_is_discard(kw, url_dict, url_type, url_md5key)
            time.sleep(1)
    else:
        print(url_type, 'no url')
        self.add_url_to_set(url_dict, url_type, retry_type=True)
        time.sleep(1)

def goods_data_save(dataQ, debug_log, db_log):
    print('\ngoods_save init\n')
    data_type = 'goods'
    if dataQ.RedisQ.llen('goodsData') > 0:
        dbObj = GetDbObj().get_db_obj()
        cur = dbObj.cursor()
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        db_name = SqlConfig.goods_db_name
        update_sql = SqlConfig.goods_update_sql
        insert_sql = SqlConfig.goods_insert_sql
        while True:
            datas = dataQ.get_new_goods_data()
            the_hour = return_PST().hour
            if not datas:
                if dataQ.RedisQ.llen('goodsData') > 0:
                    datas = dataQ.get_new_goods_data()
                else:
                    break
            tm = DataOutput.get_redis_time()
            for k, v in datas.items():
                asin = k
                data = v
                # print('data', data)
                # If the stock download failed, do not write it to the history table yet.
                from pprint import pprint
                pprint(data)
                print(data['getinfo_tm'], 1)
                data['getinfo_tm'] = tm
                print(data['getinfo_tm'], 2)
                print('rc1: ', data['rc'])
                print('quantity1', data['quantity'])
                sql = "select rc, quantity, price, title, bsr from public.amazon_product_data where asin=%(asin)s and getinfo_tm>%(the_tm)s ;"
                select_dict = {
                    'asin': data['asin'],
                    'the_tm': (tm / 1000 - 3600 * 24 * 3) * 1000
                }
                cur.execute(sql, select_dict)
                select_rows = cur.fetchall()
                if len(select_rows) > 0:
                    print(select_rows, type(select_rows), type(select_rows[0]), type(select_rows[0][0]))
                    the_old_rc, the_old_qty, the_old_price, the_old_title, the_old_bsr = select_rows[0]
                    print(the_old_rc, the_old_qty, the_old_price, the_old_title, the_old_bsr)
                    the_new_qty = data['quantity']
                    print('price1', data['price'], the_old_price)
                    # If there is no price, fall back to the previous day's price.
                    if data['price'] <= 0 and the_old_price > 0 and data['asin_state'] == 3:
                        data['price'] = the_old_price
                    # If there is no title, fall back to the previous day's title.
                    if not data['title'] and the_old_title:
                        data['title'] = the_old_title
                    # If there is no bsr, fall back to the previous day's bsr.
                    # if data['bsr'] < 1 and the_old_bsr > 0:
                    #     data['bsr'] = the_old_bsr
                    print('the_old_rc', the_old_rc, type(the_old_rc))
                    print('old quantity', the_old_qty, type(the_old_qty))
                    print('new quantity', the_new_qty, type(the_new_qty))
                    # If the review count is lower than the previous day's, keep the old value.
                    print("data['rc']", data['rc'], type(data['rc']))
                    if data.get('rc', 0) < the_old_rc:
                        data['rc'] = the_old_rc
                    # If scraping the stock failed, fall back to the previous day's stock.
                    if the_new_qty == -1 and the_old_qty >= 0 and data['asin_state'] == 3:
                        data['quantity'] = the_old_qty
                        data['qtydt'] = 4
                        with open('quantity_fail.csv', 'a', encoding='utf-8') as f:
                            f.write('asin, %s, old qty, %s, new qty, %s\n' %
                                    (data['asin'], the_old_qty, the_new_qty))
                if data['asin_state'] == 2:
                    data['quantity'] = 0
                    data['byb'] = 0
                    data['qtydt'] = 5  # not sellable
                # If there is no dpre, use price.
                if data['dpre'] <= 0 and data['price'] > 0:
                    data['dpre'] = data['price']
                # If there is no cart_price, use price.
                if data['cart_price'] <= 0 and data['price'] > 0:
                    data['cart_price'] = data['price']
                print('price2', data['price'])
                print('quantity2', data['quantity'])
                print('rc2: ', data['rc'])
                if the_hour < 9 and data['quantity'] < 0 and data['asin_state'] == 3:
                    # Skip the update for now.
                    pass
                    # # Pop the fields the update table does not need,
                    # data.pop('dpre')
                    # data.pop('bs1')
                    # data.pop('qc')
                    # data.pop('qtydt')
                    # data.pop('aday')
                    # # then write to the update table.
                    # dataOutput.save_data_to_db(update_sql, insert_sql, asin, data, db_name=db_name)
                else:
                    # Write a copy to the history table first.
                    druidData_to_db(asin, data, dataOutput)
                    # Pop the fields the update table does not need,
                    data.pop('dpre')
                    data.pop('bs1')
                    data.pop('qc')
                    data.pop('qtydt')
                    data.pop('aday')
                    # then write to the update table.
                    dataOutput.save_data_to_db(update_sql, insert_sql, asin, data, db_name=db_name)
                # Record the update time.
                dataOutput.crawler_tm(asin, data_type)
        cur.close()
        dbObj.close()
        db_log.war('%s, %s thread task finished\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
    else:
        db_log.war('%s, %s data queue is empty\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))

def keyword_data_save(dataQ, debug_log, db_log):
    print('\nkeyword_data_save init\n')
    data_type = 'keyword'
    if dataQ.RedisQ.llen('keywordData') > 0:
        pool = psycopg2.pool.SimpleConnectionPool(5, 51, **DATADB_CONFIG[BASE_TYPE])
        # dbObj1 = GetDbObj().get_db_obj()
        # cur1 = dbObj1.cursor()
        # dataOutput = DataOutput(dbObj1, cur1, db_log, debug_log, dataQ)
        keyword_data_db_name = SqlConfig.keyword_data_db_name
        keyword_data_update_sql = SqlConfig.keyword_data_update_sql
        keyword_data_insert_sql = SqlConfig.keyword_data_insert_sql
        druid_keyword_db_name = SqlConfig.druid_keyword_db_name
        # druid_keyword_update_sql = SqlConfig.druid_keyword_update_sql
        druid_keyword_update_sql = None
        druid_keyword_insert_sql = SqlConfig.druid_keyword_insert_sql
        i = 0
        while True:
            i += 1
            dbObj = pool.getconn(i)
            cur = dbObj.cursor()
            datas = dataQ.get_new_keywordData()
            if not datas:
                if dataQ.RedisQ.llen('keywordData') > 0:
                    print(dataQ.RedisQ.llen('keywordData'), type(dataQ.RedisQ.llen('keywordData')))
                    datas = dataQ.get_new_keywordData()
                else:
                    break
            for k, v in datas.items():
                kw = k
                tm = DataOutput.get_redis_time()
                keyword_data_dict = v[0]
                keyword_druid_data_list = v[1]
                aday = keyword_druid_data_list[0]['aday'] if len(keyword_druid_data_list) > 0 \
                    else return_PST().strftime("%Y%m%d")
                if len(keyword_druid_data_list) < 50 and keyword_data_dict['search_num'] < 1000:
                    if keyword_data_dict['search_num'] != len(keyword_druid_data_list):
                        keyword_data_dict['search_num'] = len(keyword_druid_data_list)
                        for data in keyword_druid_data_list:
                            data['srn'] = len(keyword_druid_data_list)
                # print('keyword_data_dict: ', keyword_data_dict)
                # print(keyword_data_dict['getinfo_tm'], 1)
                keyword_data_dict['getinfo_tm'] = tm
                # print(keyword_data_dict['getinfo_tm'], 2)
                sql = "select kw from public.amazon_druid_keyword_data where kw=%(kw)s and aday=%(aday)s limit 1;"
                the_data = dict(kw=kw, aday=aday)
                cur.execute(sql, the_data)
                asin_rows = cur.fetchall()
                print('asin_rows: ', len(asin_rows))
                print('keyword_druid_data_list len: ', len(keyword_druid_data_list))
                if len(asin_rows) < 1:
                    data0 = DataOutput.save_data_to_db_pool(
                        dbObj, cur, db_log, debug_log, dataQ,
                        keyword_data_update_sql, keyword_data_insert_sql,
                        kw, keyword_data_dict, db_name=keyword_data_db_name)
                    # print('keyword_druid_data_list: ', keyword_druid_data_list)
                    if len(keyword_druid_data_list) > 0:
                        for druid in keyword_druid_data_list:
                            # print(druid)
                            druid['tm'] = tm
                            data1 = DataOutput.save_data_to_db_pool(
                                dbObj, cur, db_log, debug_log, dataQ,
                                druid_keyword_update_sql, druid_keyword_insert_sql,
                                kw, druid, db_name=druid_keyword_db_name)
                        # time.sleep(20)
                # Record the update time.
                data_dict = {'kw': kw, 'crawler_tm': keyword_data_dict['getinfo_tm'] / 1000}
                db_name = 'public.amazon_keyword_monitor.crawler_tm'
                insert_sql = ''
                update_sql = "update public.amazon_keyword_monitor set crawler_tm=%(crawler_tm)s where kw=%(kw)s;"
                DataOutput.save_data_to_db_pool(dbObj, cur, db_log, debug_log, dataQ,
                                                update_sql, insert_sql, kw, data_dict,
                                                db_name=db_name)
            dbObj.commit()
            pool.putconn(dbObj, i)
            if i == 50:
                i = 0
        pool.closeall()
        db_log.war('%s, %s thread task finished\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))

def tosell_save(dataQ, debug_log, db_log):
    print('\ntosell_save init\n')
    data_type = 'tosell'
    if dataQ.RedisQ.llen('tosellData') > 0:
        dbObj = GetDbObj().get_db_obj()
        cur = dbObj.cursor()
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        data_tosell_db_name = SqlConfig.data_tosell_db_name
        data_tosell_update_sql = SqlConfig.data_tosell_update_sql
        data_tosell_insert_sql = SqlConfig.data_tosell_insert_sql
        druid_tosell_db_name = SqlConfig.druid_tosell_db_name
        # druid_tosell_update_sql = SqlConfig.druid_tosell_update_sql
        druid_tosell_update_sql = None  # SqlConfig.druid_tosell_update_sql
        druid_tosell_insert_sql = SqlConfig.druid_tosell_insert_sql
        while True:
            datas = dataQ.get_new_tosellData()
            pprint(datas)
            # Example payload:
            # datas = {'B01F0QQN8Q': ({'asin': 'B01F0QQN8Q',
            #                          'fba_sn': 1,
            #                          'getinfo_tm': 1542018763364,
            #                          'plow': 1,
            #                          'plows': 'largeshop',
            #                          'plows_id': 'df',
            #                          'seller_id': 'A1XEMYOCVN4TN8',
            #                          'sn': 1,
            #                          'sname': 'Gemschest'},
            #                         [{'aday': '20181112',
            #                           'asin': 'B01F0QQN8Q',
            #                           'condition': 'New',
            #                           'crawler_state': 1,
            #                           'delivery': 'Fulfillment by Amazon',
            #                           'demo': '5 out of 5 stars 99% positive over the past 12 months. (722 total ratings)',
            #                           'fba': 1,
            #                           'is_limit': 0,
            #                           'offering_id': 'tXTG86Zk6%2Bfn3YW0ITpD7nE1mscbzOgJAAhDW3VHDrP8cWV%2F1fd0DDtk7FV8eHIOKghI7PqYtkyapr23dSShe%2Fec6EMnW30fniLCM2fd1hkZKMTSUhqBYCuO87D2zljdYwfuDuVCDTm%2FQbjYnRPPhVBBs82MwpT9',
            #                           'positive': 99,
            #                           'price': 2199,
            #                           'qty': 11,
            #                           'qtydt': 0,
            #                           'rank': 1,
            #                           'reivew_count': 50,
            #                           'seller_id': 'A21P7EI9UKXT1Y',
            #                           'sn': 1,
            #                           'sname': 'largeshop',
            #                           'srank': 0,
            #                           'stype': 'FREE Shipping',
            #                           'tm': 1542018647,
            #                           'total_ratings': 722}])}
            if not datas:
                if dataQ.RedisQ.llen('tosellData') > 0:
                    datas = dataQ.get_new_tosellData()
                else:
                    break
            # print('\ntosell_save datas: [= %s =] \n' % (datas))
            tm = DataOutput.get_redis_time()
            for item in datas:
                asin = item
                tosell_datas = datas[item][0]
                tosell_list = datas[item][1]
                pprint(tosell_datas)
                pprint(tosell_list)
                print(tosell_datas['getinfo_tm'], 1)
                tosell_datas['getinfo_tm'] = tm
                print(tosell_datas['getinfo_tm'], 2)
                sql = "select asin, aday from public.amazon_product_tosell where asin=%(asin)s and aday=%(aday)s limit 1;"
                aday = tosell_list[0]['aday'] if len(tosell_list) > 0 else return_PST().strftime('%Y%m%d')
                select_dict = {'asin': asin, 'aday': aday}
                cur.execute(sql, select_dict)
                select_rows = cur.fetchall()
                dbObj.commit()
                if len(select_rows) < 1:
                    if not tosell_datas.get('sname'):
                        print(222222)
                        sql1 = "select sname, seller_id from public.amazon_product_data where asin='%s' and getinfo_tm > %s" % (
                            asin, tm - 24 * 3600 * 1000)
                        cur.execute(sql1)
                        select_rows = cur.fetchall()
                        dbObj.commit()
                        select_rows = select_rows[0] if len(select_rows) == 1 else ('', '')
                        sname, seller_id = select_rows
                        print('seller_id: ', seller_id)
                        print('sname ', sname)
                        tosell_datas['sname'] = sname
                        tosell_datas['seller_id'] = seller_id
                    data0 = dataOutput.save_data_to_db(
                        data_tosell_update_sql, data_tosell_insert_sql,
                        asin, tosell_datas, db_name=data_tosell_db_name)
                    for tosell_item in tosell_list:  # renamed from `item` to avoid shadowing the outer loop variable
                        tosell_item['tm'] = int(tm / 1000)
                        data = dataOutput.save_data_to_db(
                            druid_tosell_update_sql, druid_tosell_insert_sql,
                            asin, tosell_item, db_name=druid_tosell_db_name)
                # Record the update time.
                dataOutput.crawler_tm(asin, data_type)
        cur.close()
        dbObj.close()
        db_log.war('%s, %s thread task finished\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
    else:
        db_log.war('%s, %s data queue is empty\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))

def tosell_save(dataQ, debug_log, db_log):
    print('\ntosell_save init\n')
    data_type = 'tosell'
    if dataQ.RedisQ.llen('tosellData') > 0:
        dbObj = GetDbObj().get_db_obj()
        cur = dbObj.cursor()
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        data_tosell_db_name = SqlConfig.data_tosell_db_name
        data_tosell_update_sql = SqlConfig.data_tosell_update_sql
        data_tosell_insert_sql = SqlConfig.data_tosell_insert_sql
        druid_tosell_db_name = SqlConfig.druid_tosell_db_name
        # druid_tosell_update_sql = SqlConfig.druid_tosell_update_sql
        druid_tosell_update_sql = None  # SqlConfig.druid_tosell_update_sql
        druid_tosell_insert_sql = SqlConfig.druid_tosell_insert_sql
        while True:
            datas = dataQ.get_new_tosellData()
            if not datas:
                if dataQ.RedisQ.llen('tosellData') > 0:
                    datas = dataQ.get_new_tosellData()
                else:
                    break
            # print('\ntosell_save datas: [= %s =] \n' % (datas))
            tm = DataOutput.get_redis_time()
            for item in datas:
                asin = item
                tosell_datas = datas[item][0]
                tosell_list = datas[item][1]
                # print('tosell_datas: ', tosell_datas)
                print(tosell_datas['getinfo_tm'], 1)
                tosell_datas['getinfo_tm'] = tm
                print(tosell_datas['getinfo_tm'], 2)
                # sql = "select asin, getinfo_tm from public.amazon_product_data_tosell where asin=%(asin)s and getinfo_tm>%(the_tm)s;"
                # # select_dict = {'asin': asin, 'the_tm': (tm / 1000 - 120) * 1000}
                # the_tm = dataQ._get_value_from_string('initUpdateTm', 'initTime')
                # print('the_tm1', the_tm)
                # if not the_tm:
                #     _, the_tm = BaseCrawler.get_the_time()
                #     print('the_tm2', the_tm)
                # else:
                #     the_tm = str(the_tm, encoding='utf-8')
                #     print('the_tm3', the_tm)
                # select_dict = {'asin': asin, 'the_tm': int(the_tm) * 1000}
                # cur.execute(sql, select_dict)
                # select_rows = cur.fetchall()
                sql = "select asin, aday from public.amazon_product_tosell where asin=%(asin)s and aday=%(aday)s limit 1;"
                aday = tosell_list[0]['aday'] if len(tosell_list) > 0 else return_PST().strftime('%Y%m%d')
                select_dict = {'asin': asin, 'aday': aday}
                cur.execute(sql, select_dict)
                select_rows = cur.fetchall()
                dbObj.commit()
                if len(select_rows) < 1:
                    print(tosell_datas)
                    if not tosell_datas.get('sname'):
                        sql1 = "select sname, seller_id from public.amazon_product_data where asin='%s' and getinfo_tm > %s" % (
                            asin, tm - 24 * 3600 * 1000)
                        cur.execute(sql1)
                        select_rows = cur.fetchall()
                        dbObj.commit()
                        select_rows = select_rows[0] if len(select_rows) == 1 else ('', '')
                        sname, seller_id = select_rows
                        print('seller_id: ', seller_id)
                        print('sname ', sname)
                        tosell_datas['sname'] = sname
                        tosell_datas['seller_id'] = seller_id
                    data0 = dataOutput.save_data_to_db(
                        data_tosell_update_sql, data_tosell_insert_sql,
                        asin, tosell_datas, db_name=data_tosell_db_name)
                    for tosell_item in tosell_list:  # renamed from `item` to avoid shadowing the outer loop variable
                        tosell_item['tm'] = int(tm / 1000)
                        data = dataOutput.save_data_to_db(
                            druid_tosell_update_sql, druid_tosell_insert_sql,
                            asin, tosell_item, db_name=druid_tosell_db_name)
                # Record the update time.
                dataOutput.crawler_tm(asin, data_type)
        cur.close()
        dbObj.close()
        db_log.war('%s, %s thread task finished\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
    else:
        db_log.war('%s, %s data queue is empty\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))

def start(**kwargs):
    '''
    :param kwargs: required keys: asin, user_anget, goods_html, cookies, goods_datas,
                   crawler_obj, url_dict
    :return:
    '''
    # Required parameters
    asin = kwargs['asin']
    user_anget = kwargs['user_anget']
    goods_html = kwargs['goods_html']
    cookies = kwargs['cookies']
    goods_datas = kwargs['goods_datas']
    crawler_obj = kwargs['crawler_obj']
    url_dict = kwargs['url_dict']
    # If the asin check fails, mark the item as no longer available.
    if kwargs.get('asin_') and kwargs.get('asin_') != asin:
        DataOutput.record_not_found_goods(asin)
        DataOutput.record_not_found_tosell(asin)
    # # Debug printing
    from pprint import pprint
    # print('*' * 20)
    # print('productTosellCrawler.goods_datas:')
    # pprint(goods_datas)
    # Optional parameters
    log_param = kwargs.get('log_param')
    # print(log_param, type(log_param))
    # print(crawler_obj)
    if type(log_param) is tuple and crawler_obj:
        crawler_obj.record_log(*log_param)
    crawler_param = (crawler_obj.urlQ, crawler_obj.kwQ, crawler_obj.dataQ,
                     crawler_obj.info_log, crawler_obj.debug_log)
    try:
        tosell_datas = tosell_crawler(asin, url_dict, crawler_obj, goods_html, crawler_param)
    except Exception as e:
        # If the tosell download fails, end this thread.
        return
    # Unpack the tosell data.
    tosell_data, tosell_list = tosell_datas.get(asin, ({}, []))
    # Fold the inventory info into the tosell data package.
    inv_list = []
    is_limit = 0
    qty_list = []
    tosell_sum = len(tosell_list)
    # Re-assign the seller count.
    goods_datas[asin]['to_sell'] = tosell_sum
    # Only fetch inventory when there is at least one seller.
    if tosell_sum >= 1:
        for tosell in tosell_list:
            inv_dict = scrape_inventory(tosell)
            # If the inventory is greater than 0, record it;
            if inv_dict['qty'] > 0:
                qty_list.append(inv_dict['qty'])
            else:
                # otherwise record 0.
                qty_list.append(0)
            if inv_dict['qtydt'] == 2:
                is_limit = 1
            inv_list.append(inv_dict)
    # Mark the restricted-sale state.
    goods_datas[asin]['is_limit'] = is_limit
    # Re-assign the stock quantity.
    goods_datas[asin]['quantity'] = sum(qty_list)
    if sum(qty_list) > 0:
        goods_datas[asin]['qtydt'] = 0
    else:
        goods_datas[asin]['qtydt'] = 1
    print('*' * 20)
    print('add to queue goods_datas:')
    pprint(goods_datas)
    goods_bytes = pickle.dumps(goods_datas)
    crawler_obj.dataQ.add_goods_data_to_queue(goods_bytes)
    # Unpack the bsr detail data.
    bsr_info = goods_datas[asin]['bsr_info']
    if bsr_info:
        # Add the bsr detail data to the data queue.
        bsr_bytes = pickle.dumps({asin: bsr_info})
        crawler_obj.dataQ.add_bsrData_to_queue(bsr_bytes)
    # Add the tosell (with inventory) data to the data queue.
    tosell_qty_datas = {asin: (tosell_data, inv_list)}
    pprint(tosell_qty_datas)
    tosell_bytes = pickle.dumps(tosell_qty_datas)
    crawler_obj.dataQ.add_tosell_to_queue(tosell_bytes)

def get_tosell_html_lsit(self, asin, goodsUrl, ua, ip, cookie, referer):
    self.html_list = []
    self.url_list = []
    self.is_error_list = []
    self.not_found = False
    self.cookies = None
    goodshtml, cookiesObj, is_error = self.get_html(goodsUrl, ua, ip, cookie, referer,
                                                    url_type=self.url_type, asin=asin)
    self.goods_html = goodshtml
    # if cookiesObj and not cookie:
    #     cookie = cookiesObj
    tosellSum = TosellParser.get_to_sell_sum(goodshtml)
    print('tosellSum1', tosellSum)
    if tosellSum < 1 and not self.is_page_not_found(goodshtml):
        tosellSum = 1
    print('tosellSum2', tosellSum)
    if tosellSum > 0:
        print('tosellSum: ', tosellSum)
        page_url_list = self.get_page_urls(asin, tosellSum)
        # tList = []
        # tStart = 0
        # Download each tosell page in turn.
        if page_url_list:
            i = 0
            referer = goodsUrl
            for page_url in page_url_list:
                i += 1
                print('tosell page%s: [%s]' % (i, page_url))
                print('tosell referer %s: [%s]' % (i, referer))
                self.get_page_html(page_url, ua, ip, None, referer)
                if re.search('There are currently no listings for this search',
                             self.html_list[i - 1]):
                    print('There are currently no listings for this search')
                    break
                referer = page_url
    else:
        # If we have the product page source and it is not a captcha page,
        # there are no other sellers to find.
        if goodshtml and self.is_page_not_found(goodshtml):
            # print(goodshtml)
            DataOutput.update_getdata_tm(
                {'tosell_tm_crawler': int(time.time()), 'asin': asin}, 'tosell')
        elif goodshtml and not self.is_RobotCheck(goodshtml):
            # Record a miss.
            self.not_found = True
            tosell_info = TosellNotFoundParser(goodshtml).parser_not_found(asin, goodshtml)
            if tosell_info:
                self.save_success_asin_keyword(asin, url_type=self.url_type)
                data_bytes = pickle.dumps(tosell_info)
                self.dataQ.add_tosell_to_queue(data_bytes)
                self.dataQ.record_data_ok_times()
                self.dataQ.record_tosell_ok_times()
        else:
            tosellSum = -1
    return self.html_list, self.url_list, self.cookies, self.is_error_list, tosellSum

def get_reviews_html_list(self, asin, url, referer, **kwargs):
    print('\n' * 4, '*' * 20)
    # Print kwargs as a positional argument; unpacking it into print() would raise a TypeError.
    print(self.get_reviews_html_list.__name__, asin, url, referer, kwargs)
    print('*' * 20, '\n' * 4)
    self.html_list = []
    self.url_list = []
    self.is_error_list = []
    self.cookies = None
    html, cookiesObj, is_error = self.get_html(url, referer=referer)
    reviewsUrl = ''
    url_asin = asin
    if self.looking_something(html) or not html:
        goodsUrl_tuple = self.make_url(asin, url_type='goods')
        goodsUrl, goodsReferer = goodsUrl_tuple[0], goodsUrl_tuple[1]
        print('get_reviews_html_list.goodsUrl: ', goodsUrl)
        goods_html, cookiesObj, is_error = self.get_html(
            goodsUrl, referer=goodsReferer, url_type=self.url_type, asin=asin)
        url_asin = self.get_reviews_url_asin(goods_html)
        reviewsUrl = self.make_url(url_asin, url_type=self.url_type)[0]
        # print(reviewsUrl, goodsUrl)
        if not reviewsUrl:
            DataOutput.record_not_found_reviews(asin)
            self.is_error_list.append(404)
            return self.html_list, self.url_list, self.cookies, self.is_error_list
    if reviewsUrl:
        print('%s get_reviews_html_list.reviewsUrl: ' % (asin), reviewsUrl)
        goodsUrl_tuple = self.make_url(url_asin, url_type='goods')
        goodsUrl = goodsUrl_tuple[0]
        # html, cookiesObj, is_error = self.get_html(reviewsUrl, ua, cookie, goodsUrl, url_type=self.url_type, asin=asin)
        html, cookiesObj, is_error = self.get_html(reviewsUrl, url_type=self.url_type, asin=asin)
    if ReviewsParser.is_page_not_found(html):
        DataOutput.record_not_found_reviews(asin)
        self.is_error_list.append(404)
        return self.html_list, self.url_list, self.cookies, self.is_error_list
    self.url_list.append(url)
    self.html_list.append(html)
    self.is_error_list.append(is_error)
    md5value = asin + 'reviewsFirst'
    md5key = self.get_md5_key(md5value)
    first = self.dataQ.is_first_download(md5key)
    # Get the total review count first.
    reviewsSum = ReviewsParser.get_review_count(html)
    print('reviewsSum: ', reviewsSum, asin)
    if reviewsSum > 10:
        # If no paging is needed, return directly to avoid unnecessary network requests.
        if self.is_not_turn_the_page(first, html, page_num=1, asin=asin):
            return self.html_list, self.url_list, self.cookies, self.is_error_list
        # Whether this is the first review download, e.g. review: ['174']
        if first:
            is_frist = 1
        else:
            is_frist = 0
        # Build the paging urls.
        page_url_list = self.get_page_urls(url_asin, reviewsSum, frist=is_frist)
        tList = []
        tStart = 0
        # Download the review pages (multi-threaded).
        if page_url_list:
            i = 1
            j = 1
            referer = url
            for page_url in page_url_list:
                i += 1
                j += 1
                print('reviews page%s: [%s]' % (i, page_url))
                print('referer %s: [%s]' % (i, referer))
                self.get_page_html(page_url, referer)
                referer = page_url
                for html in self.html_list:
                    if self.is_not_turn_the_page(first, html, page_num=j, asin=asin):
                        return self.html_list, self.url_list, self.cookies, self.is_error_list
    return self.html_list, self.url_list, self.cookies, self.is_error_list

def download(self, asin_or_kw, url_dict):
    time_now = lambda: time.time()
    url_type = self.url_type
    kw = asin_or_kw
    print(kw, url_type)
    cid = url_dict.get('cid') or 0
    # print(self.download.__name__, kw, cid, url_dict)
    kw_url = self.make_url(quote(kw), cid=cid, url_type=url_type)
    url = kw_url[0]
    print('\nkeyword_url_tuple: ', kw_url)
    if url:
        startTime = return_PST().strftime("%Y-%m-%d %H:%M:%S")
        time1 = time_now()
        referer = 'https://www.amazon.com'
        url_md5key = url_dict.get('md5', '')
        if not url_md5key:
            url_md5key = self.get_md5_key(kw + url_type)
        html_list, url_list, cookiesObj, is_error_lsit = self.get_keyword_html_lsit(
            url, kw, referer=referer)
        old_dnum = url_dict.get('dnum') or 0
        durl = url_dict.get('durl') or []
        url_dict['durl'] = list(set(durl + url_list))
        url_dict['dnum'] = old_dnum + 1
        # If the keyword was judged not to exist, stop here.
        if self.not_found:
            DataOutput.record_not_found_keyword(kw)
            self.dataQ.record_keyword_not_fund_times()
            msgInt = 0
            proxyInfo = 'the keyword not found'
            self.record_log(kw, time1, msgInt, url_type, startTime, proxyInfo)
            return self.debug_log.war('keyword %s does not exist' % (kw))
        i = -1
        keyword_html_list = []
        if len(html_list) > 0:
            for html in html_list:
                i += 1
                is_error = is_error_lsit[i]
                url = url_list[i]
                if is_error:
                    msgInt = 6
                    proxyInfo = 'get Html error'
                    self.record_log(kw, time1, msgInt, url_type, startTime, proxyInfo)
                else:
                    analyze = self.analyze_html(html, kw, url_dict, time1, startTime,
                                                html_type=url_type)
                    if analyze and analyze != 404:
                        keyword_html_list.append(html)
            if len(html_list) == len(keyword_html_list):
                result, is_error = self.kw_parser(keyword_html_list, kw, cid, url=url)
                if is_error:
                    msgInt = 3
                    proxyInfo = 'get data error'
                    self.record_log(kw, time1, msgInt, url_type, startTime, proxyInfo)
                else:
                    if not result:
                        self.the_url_is_discard(kw, url_dict, url_type, url_md5key)
                        msgInt = 2
                        proxyInfo = 'get data defeated'
                        self.record_log(kw, time1, msgInt, url_type, startTime, proxyInfo)
                    else:
                        self.save_success_asin_keyword(kw, url_type=url_type)
                        msgInt = 1
                        proxyInfo = 'get data success'
                        self.record_log(kw, time1, msgInt, url_type, startTime, proxyInfo)
                        keyword_datas = result[0]
                        from pprint import pprint
                        pprint(keyword_datas)
                        data_bytes = pickle.dumps(keyword_datas)
                        self.dataQ.add_keyword_to_queue(data_bytes)
                        self.dataQ.record_data_ok_times()
                        self.dataQ.record_keyword_ok_times()
            else:
                self.the_url_is_discard(kw, url_dict, url_type, url_md5key)
        else:
            self.the_url_is_discard(kw, url_dict, url_type, url_md5key)
            time.sleep(1)
    else:
        print(url_type, 'no url')
        # self.add_url_to_set(url_dict, url_type, retry_type=True)
        time.sleep(1)

def get_html_useRequest(url, ua, ip, cookie, debug_log, referer, ipQ, urlQ=None,
                        timeout=90, retry=1, goodsUrl='', url_type='', asin=''):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'close',
        'User-Agent': ua,
        'Host': 'www.amazon.com',
        'Referer': referer,
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0',
    }
    proxy = {'https': PROXY_HTTPS, 'http': PROXY_HTTP}
    print(proxy)
    html = ''
    cookies = {}
    status_code = 0
    session = requests
    print('\nheaders: ', headers)
    is_error = False
    if url.startswith('https://www.amazon.com') or url.startswith('http://www.amazon.com'):
        try:
            get_parmas = dict(url=url, headers=headers, proxies=proxy, timeout=timeout)
            if 'proxy.crawlera.com' in proxy.get('https', ''):
                get_parmas['verify'] = PROXY_VERIFY
            response = session.get(**get_parmas)
            status_code = response.status_code
            print('status_code', status_code)
            if status_code == 200 or status_code == 302 or status_code == 404:
                response.encoding = 'utf-8'
                responseCookies = response.cookies
                if not cookie:
                    cookies = responseCookies
                if status_code == 404:
                    if url_type == 'goods' and asin:
                        DataOutput.record_not_found_goods(asin)
                    if url_type == 'tosell' and asin:
                        DataOutput.record_not_found_tosell(asin)
                    html = response.text
                else:
                    html = response.text
                if "Enter the characters you see below" in html:
                    raise Exception("Exception: Captcha")
            if 'proxy.crawlera.com' not in proxy.get('https', ''):
                time.sleep(3)
            return html, cookies, is_error
        except Exception as e:
            if status_code != 404:
                is_error = True
            debug_log.error('[%s] get_html_useRequest error while downloading [%s]: [%s]' % (ip, url, e))
            if "NotFound" in str(e):
                raise Exception("NOT_FOUND")
    else:
        debug_log.error('[%s] get_html_useRequest: url [%s] is invalid' % (ip, url))
    return html, cookies, is_error

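# --- Illustrative call (not part of the original module) ---
# Based only on the signature above; the ASIN, user agent and queue/logger objects are
# placeholder values supplied by whatever context calls this helper.
# html, cookies, is_error = get_html_useRequest(
#     url='https://www.amazon.com/dp/B01F0QQN8Q',
#     ua='Mozilla/5.0',
#     ip='127.0.0.1',
#     cookie=None,
#     debug_log=debug_log,
#     referer='https://www.amazon.com',
#     ipQ=None,
#     url_type='goods',
#     asin='B01F0QQN8Q')
# if is_error:
#     pass  # retry or discard the URL via the caller's queue logic
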
def the_url_is_discard(self, asin_or_kw, url_dict, url_type, url_md5key):
    result = False
    url_list = url_dict.get('durl') or []
    if not url_md5key:
        url_md5key = url_dict.get('md5') or ''
    if not asin_or_kw:
        asin_or_kw = url_dict.get('asin') or url_dict.get('kw') or ''
    print('the_url_is_discard.asin_or_kw: ', asin_or_kw)
    # Look up the url failure count and captcha count.
    disabled_num = 10
    if url_type != 'keyword':
        disabled_num = disabled_num + 5
    new_url_set_name = 'NewUrl'
    if not url_md5key:
        url_md5key = self.get_md5_key(asin_or_kw + url_type)
    is_new_url = self.urlQ._the_member_is_exist_in_set(new_url_set_name, url_md5key)
    if is_new_url:
        disabled_num = disabled_num + 5
    if disabled_num > 20:
        disabled_num = 20
    url_RobotChek_times = url_dict.get('rnum') or 0
    if not url_RobotChek_times:
        url_RobotChek_times = self.urlQ.get_RobotCheck_url_times(url_md5key)
    # url_disabled_times = url_dict.get('dnum') or 0
    # if not url_disabled_times:
    #     url_disabled_times = self.urlQ.get_defeated_url_times(url_md5key)
    zset_name = '%s%s' % (self.url_type, 'fail')
    url_disabled_times = self.urlQ._return_zscore(zset_name, asin_or_kw)
    self.urlQ._record_member_times(zset_name, asin_or_kw)
    print('url_RobotChek_times', url_RobotChek_times)
    print('url_disabled_times', url_disabled_times)
    if url_RobotChek_times >= 50:
        # Mark an update time.
        if url_type == 'goods':
            DataOutput.record_disabled_goods(asin_or_kw, 'discard')
        if url_type == 'tosell':
            DataOutput.record_disabled_tosell(asin_or_kw)
        if url_type == 'keyword':
            DataOutput.record_disabled_keyword(asin_or_kw)
        self.urlQ.pop_RobotCheck_url_times(url_md5key)
        # Add to the failed-download set.
        self.urlQ.add_defeated_url_to_set(url_md5key)
        self.save_discard_url(asin_or_kw, url_list, url_RobotChek_times, 'too many captcha checks')
    if url_disabled_times >= disabled_num:
        # Mark an update time.
        if url_type == 'goods':
            DataOutput.record_disabled_goods(asin_or_kw, 'discard')
        if url_type == 'tosell':
            DataOutput.record_disabled_tosell(asin_or_kw)
        if url_type == 'keyword':
            DataOutput.record_disabled_keyword(asin_or_kw)
        # Add to the failed-download set.
        self.urlQ.add_defeated_url_to_set(url_md5key)
        self.urlQ.pop_defeated_url_times(url_md5key)
        self.save_discard_url(asin_or_kw, url_list, url_disabled_times, 'connection failed')
    # else:
    #     result = self.add_url_to_queue(url_dict, url_type, retry_type=True)
    return result

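# --- Illustrative sketch (not part of the original module) ---
# The `_return_zscore` / `_record_member_times` helpers used above are not shown. They
# presumably wrap Redis sorted-set bookkeeping for per-member failure counts; a minimal
# sketch with redis-py follows. The connection parameters and function names are
# illustrative assumptions, not the project's actual implementation.
import redis

r = redis.Redis(host='localhost', port=6379, db=0)


def return_zscore(zset_name, member):
    """Return the current failure count for `member`, or 0 if it has none."""
    score = r.zscore(zset_name, member)
    return int(score) if score else 0


def record_member_times(zset_name, member):
    """Increment the failure count for `member` by one."""
    r.zincrby(zset_name, 1, member)
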
def get_reviews_html_list(self, asin, url, ua, ip, cookie, referer):
    self.html_list = []
    self.url_list = []
    self.is_error_list = []
    self.cookies = None
    print(url, ua)
    # url = 'https://www.amazon.com/product-reviews/B000TZ8TEU/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews&sortBy=recent'
    html, cookiesObj, is_error = self.get_html(url, ua, ip, cookie, referer)
    reviewsUrl = ''
    url_asin = asin
    if self.looking_something(html) or not html:
        goodsUrl_tuple = self.make_url(asin, url_type='goods')
        goodsUrl, goodsReferer = goodsUrl_tuple[0], goodsUrl_tuple[1]
        print('get_reviews_html_list.goodsUrl: ', goodsUrl)
        goods_html, cookiesObj, is_error = self.get_html(
            goodsUrl, ua, ip, cookie, goodsReferer, url_type=self.url_type, asin=asin)
        url_asin = self.get_reviews_url_asin(goods_html)
        reviewsUrl = self.make_url(url_asin, url_type=self.url_type)[0]
        # print(reviewsUrl, goodsUrl)
        if not reviewsUrl:
            DataOutput.record_not_found_reviews(asin)
            self.is_error_list.append(404)
            return self.html_list, self.url_list, self.cookies, self.is_error_list
    if reviewsUrl:
        print('%s get_reviews_html_list.reviewsUrl: ' % (asin), reviewsUrl)
        goodsUrl_tuple = self.make_url(url_asin, url_type='goods')
        goodsUrl = goodsUrl_tuple[0]
        html, cookiesObj, is_error = self.get_html(reviewsUrl, ua, ip, cookie, goodsUrl,
                                                   url_type=self.url_type, asin=asin)
    if ReviewsParser.is_page_not_found(html):
        DataOutput.record_not_found_reviews(asin)
        self.is_error_list.append(404)
        return self.html_list, self.url_list, self.cookies, self.is_error_list
    if cookiesObj and not cookie:
        cookie = cookiesObj
    self.url_list.append(url)
    self.html_list.append(html)
    self.is_error_list.append(is_error)
    md5value = asin + 'reviewsFirst'
    md5key = self.get_md5_key(md5value)
    first = self.dataQ.is_first_download(md5key)
    # Get the total review count first.
    reviewsSum = ReviewsParser.get_review_count(html)
    print('reviewsSum: ', reviewsSum, asin)
    if reviewsSum > 10:
        # If no paging is needed, return directly to avoid unnecessary network requests.
        if self.is_not_turn_the_page(first, html, page_num=1, asin=asin):
            return self.html_list, self.url_list, cookie, self.is_error_list
        # Whether this is the first review download, e.g. review: ['174']
        if first:
            is_frist = 1
        else:
            is_frist = 0
        # Build the paging urls.
        page_url_list = self.get_page_urls(url_asin, reviewsSum, frist=is_frist)
        tList = []
        tStart = 0
        # Download the review pages (multi-threaded).
        if page_url_list:
            i = 1
            j = 1
            referer = url
            for page_url in page_url_list:
                i += 1
                j += 1
                print('reviews page%s: [%s]' % (i, page_url))
                print('referer %s: [%s]' % (i, referer))
                self.get_page_html(page_url, ua, ip, None, referer)
                referer = page_url
                for html in self.html_list:
                    if self.is_not_turn_the_page(first, html, page_num=j, asin=asin):
                        return self.html_list, self.url_list, cookie, self.is_error_list
    return self.html_list, self.url_list, self.cookies, self.is_error_list

def reviews_save(dataQ, debug_log, db_log):
    print('\nreviews_save init\n')
    data_type = 'reviews'
    if dataQ.RedisQ.llen('reviewsData') > 0:
        dbObj = GetDbObj().get_db_obj()
        cur = dbObj.cursor()
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        reviews_db_name = SqlConfig.reivews_db_name
        reviews_update_sql = SqlConfig.reivews_update_sql
        reviews_insert_sql = SqlConfig.reivews_insert_sql
        now_pst = return_PST()  # renamed from `datetime` to avoid shadowing the datetime module
        oldDate = now_pst - timedelta(days=90)
        yesterdate = now_pst - timedelta(days=1)
        yesterday = yesterdate.strftime('%Y%m%d')
        theYesterDete = int(yesterday)
        theMon = oldDate.strftime('%Y%m%d')
        three_mon_date = int(theMon)
        while True:
            datas = dataQ.get_new_reviewsData()
            if not datas:
                if dataQ.RedisQ.llen('reviewsData') > 0:
                    datas = dataQ.get_new_reviewsData()
                else:
                    break
            for item in datas:
                asin = item
                dict_list = datas[item]
                # print('tuple_list: ', dict_list)
                md5value = asin + 'reviewsFirst'
                md5key = DataOutput.get_md5_key(md5value)
                first = dataQ.is_first_download(md5key)
                i = 0
                for review_item in dict_list:  # renamed from `item` to avoid shadowing the outer loop variable
                    i += 1
                    # Only pass md5key for the first few rows, to avoid pointless repeat writes
                    # while still covering the case where the first writes fail.
                    if i < 3:
                        theMd5key = md5key
                    else:
                        theMd5key = None
                    # On the first download, write reviews from the last three months;
                    if first:
                        if review_item['date'] >= three_mon_date:
                            # print('reviews item: ', review_item)
                            data0 = dataOutput.save_data_to_db(
                                reviews_update_sql, reviews_insert_sql, asin, review_item,
                                db_name=reviews_db_name, md5key=theMd5key)
                    # otherwise only write reviews from yesterday onward.
                    else:
                        if review_item['date'] >= theYesterDete:
                            # print('reviews item: ', review_item)
                            data1 = dataOutput.save_data_to_db(
                                reviews_update_sql, reviews_insert_sql, asin, review_item,
                                db_name=reviews_db_name)
                # Record the update time.
                dataOutput.crawler_tm(asin, data_type)
        cur.close()
        dbObj.close()
        db_log.war('%s, %s thread task finished\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
    else:
        db_log.war('%s, %s data queue is empty\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))