def record_disabled_tosell(the_asin, data_type=''):
    data_dict = {
        'asin': the_asin,
        'getinfo_tm': int(DataOutput.get_redis_time())
    }
    conn = psycopg2.connect(**DATADB_CONFIG[BASE_TYPE])
    cur = conn.cursor()
    try:
        sql = ("update public.amazon_product_data_tosell set is_sync=0, crawler_state=2, "
               "getinfo_tm=%(getinfo_tm)s where asin=%(asin)s;")
        print(sql)
        cur.execute(sql, data_dict)
        row = cur.rowcount
        print(1, row)
        if row > 0:
            conn.commit()
            print('\namazon_product_data_tosell,%s,%s,%s rows, update succeeded\n'
                  % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), the_asin, row))
            sql1 = "update public.amazon_product_monitor set tosell_tm_crawler=%(tosell_tm_crawler)s where asin=%(asin)s;"
            data_dict1 = {
                'asin': the_asin,
                'tosell_tm_crawler': int(DataOutput.get_redis_time() / 1000)
            }
            cur.execute(sql1, data_dict1)
            conn.commit()
            # cur.execute("select * from amazon_product_data where asin=%s;", (the_asin,))
            # print(cur.fetchall())
        else:
            conn.rollback()
            sql = ("INSERT INTO public.amazon_product_data_tosell(asin, getinfo_tm, crawler_state) "
                   "VALUES (%(asin)s, %(getinfo_tm)s, 2);")
            cur.execute(sql, data_dict)
            row = cur.rowcount
            print(2, row)
            if row > 0:
                conn.commit()
                sql1 = "update public.amazon_product_monitor set tosell_tm_crawler=%(tosell_tm_crawler)s where asin=%(asin)s;"
                data_dict1 = {
                    'asin': the_asin,
                    'tosell_tm_crawler': int(DataOutput.get_redis_time() / 1000)
                }
                cur.execute(sql1, data_dict1)
                conn.commit()
                print('\namazon_product_data_tosell,%s,%s,%s rows, insert succeeded\n'
                      % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), the_asin, row))
            else:
                conn.rollback()
    except Exception as e:
        conn.rollback()
        datas = {the_asin: data_dict}
        print('failed amazon_product_data ', e, datas)
    finally:
        # Close once, in one place, instead of duplicating close calls in every branch.
        cur.close()
        conn.close()
def record_not_found_keyword(the_kw, data_type=''):
    data_dict = {
        'kw': the_kw,
        'search_num': -2,
        'getinfo_tm': int(DataOutput.get_redis_time())
    }
    conn = psycopg2.connect(**DATADB_CONFIG[BASE_TYPE])
    cur = conn.cursor()
    try:
        sql = ("update public.amazon_keyword_data set search_num=%(search_num)s, "
               "getinfo_tm=%(getinfo_tm)s, is_sync=0, crawler_state=1 where kw=%(kw)s;")
        cur.execute(sql, data_dict)
        row = cur.rowcount
        if row > 0:
            conn.commit()
            print('\namazon_keyword_data,%s,%s,%s rows, update succeeded\n'
                  % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), the_kw, row))
            sql1 = "update public.amazon_keyword_monitor set crawler_tm=%(crawler_tm)s where kw=%(kw)s;"
            # The key must be 'kw' to match the %(kw)s placeholder (it was 'asin', a copy-paste bug).
            data_dict1 = {
                'kw': the_kw,
                'crawler_tm': int(DataOutput.get_redis_time() / 1000)
            }
            cur.execute(sql1, data_dict1)
            conn.commit()
        else:
            conn.rollback()
            sql = ("INSERT INTO public.amazon_keyword_data(kw, search_num, getinfo_tm, crawler_state) "
                   "VALUES (%(kw)s, %(search_num)s, %(getinfo_tm)s, 1);")
            cur.execute(sql, data_dict)
            row = cur.rowcount
            if row > 0:
                conn.commit()
                sql1 = "update public.amazon_keyword_monitor set crawler_tm=%(crawler_tm)s where kw=%(kw)s;"
                data_dict1 = {
                    'kw': the_kw,
                    'crawler_tm': int(DataOutput.get_redis_time() / 1000)
                }
                cur.execute(sql1, data_dict1)
                conn.commit()
                print('\namazon_keyword_data,%s,%s,%s rows, insert succeeded\n'
                      % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), the_kw, row))
            else:
                conn.rollback()
    except Exception as e:
        conn.rollback()
        datas = {the_kw: data_dict}
        # print('failed amazon_keyword_data ', e, datas)
    finally:
        cur.close()
        conn.close()
def bsrData_save(dataQ, debug_log, db_log):
    print('\nbsrData_save init\n')
    data_type = 'bsr'
    if dataQ.RedisQ.llen('bsrData') > 0:
        dbObj = GetDbObj().get_db_obj()
        cur = dbObj.cursor()
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        db_name = SqlConfig.bsrData_db_name
        update_sql = SqlConfig.bsrData_update_sql
        insert_sql = SqlConfig.bsrData_insert_sql
        while True:
            datas = dataQ.get_new_bsrData()
            if not datas:
                if dataQ.RedisQ.llen('bsrData') > 0:
                    datas = dataQ.get_new_bsrData()
                else:
                    break
            if not datas:
                # Guard against a second empty fetch racing the llen check.
                continue
            for asin in datas:
                tuple_list = datas[asin]
                tm = int(DataOutput.get_redis_time())
                # print('asin tuple_list: ', asin, tuple_list)
                for item in tuple_list:
                    if item and type(item) is tuple:
                        # print('bsr item: ', item)
                        bsr = item[0]
                        bsrc = item[1]
                        aday = item[2]
                        # if len(item) == 4:
                        #     tm = item[3]
                        # else:
                        #     tm = int(time.time() * 1000)
                        data_dict = dict(asin=asin, bsr=bsr, bsrc=bsrc, tm=tm, aday=aday)
                        data = dataOutput.save_data_to_db(update_sql, insert_sql, asin,
                                                          data_dict, db_name=db_name)
                        # print('bsrData: ', data)
        cur.close()
        dbObj.close()
        db_log.war('%s, %s thread finished\n' % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
    else:
        db_log.war('%s, %s data queue is empty\n' % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
def goods_retry(urlQ):
    url_type = 'goods'
    Qname = 'goodsUrlQueue'
    empty_url_queue(urlQ.RedisQ, Qname)
    sql_times = get_init_updae_tm(urlQ)
    # sql_times = 0
    if sql_times:
        # Retry items that were never downloaded.
        sql = "select asin, monitor_type, aid, info_tm from public.amazon_product_monitor where state = 1 " \
              "and monitor_type > 0 and info_tm_crawler < %s and asin not in " \
              "(select asin from public.amazon_druid_product_data where aday='%s') " \
              "order by info_tm_crawler limit 8000;" % (sql_times, return_PST().strftime('%Y%m%d'))
        # Retry items whose inventory download failed.
        # sql1 = "select asin, monitor_type, aid, info_tm from public.amazon_product_monitor where asin in (select asin from amazon_product_data where asin in (select asin from amazon_product_monitor where state=1 and monitor_type > 0) and asin not in (select asin from amazon_druid_product_data where aday='%s')) order by info_tm_crawler limit 8000;" % (return_PST().strftime("%Y%m%d"))
        # print(sql)
        print('\ngoods retry process %s\n' % (sql_times))
        url_tuple_list = urlQ.retrieve_asin(sql)
        # url_tuple_list2 = urlQ.retrieve_asin(sql1)
        # url_list_tuple = set(url_tuple_list + url_tuple_list2)
        url_list_tuple = url_tuple_list
        print('number of goods to retry: %s' % (len(url_list_tuple)))
        # print('monitoring_goods_Now: ', url_tuple_list)
        if len(url_list_tuple) > 0:
            add_url_to_queue(urlQ, url_list_tuple, url_type=url_type, sql_times=sql_times)
def update_getdata_tm(data_dict, data_type, dataQ=None, db_log=None):
    sql = ''
    db_type = ''
    if data_type == 'goods':
        db_type = 'public.amazon_product_monitor.info_tm_crawler'
        sql = "update public.amazon_product_monitor set info_tm_crawler=%(info_tm_crawler)s where asin=%(asin)s;"
    elif data_type == 'reviews':
        db_type = 'public.amazon_product_monitor.comment_tm_crawler'
        sql = "update public.amazon_product_monitor set comment_tm_crawler=%(comment_tm_crawler)s where asin=%(asin)s;"
    elif data_type == 'tosell':
        db_type = 'public.amazon_product_monitor.tosell_tm_crawler'
        sql = "update public.amazon_product_monitor set tosell_tm_crawler=%(tosell_tm_crawler)s where asin=%(asin)s;"
    elif data_type == 'keyword':
        db_type = 'public.amazon_keyword_monitor.crawler_tm'
        sql = "update public.amazon_keyword_monitor set crawler_tm=%(crawler_tm)s where kw=%(kw)s;"
    if sql:
        conn = psycopg2.connect(**DATADB_CONFIG[BASE_TYPE])
        cur = conn.cursor()
        cur.execute(sql, data_dict)
        row = cur.rowcount
        if row > 0:
            if db_log:
                db_log.info('crawler update time %s,%s,%s,%s rows, marked successfully'
                            % (db_type, return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type, row))
            conn.commit()
        else:
            conn.rollback()
        cur.close()
        conn.close()
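# Usage sketch for update_getdata_tm (hypothetical asin and timestamp): the keys in
# data_dict must match the placeholders selected by data_type, e.g. 'goods' needs
# 'asin' and 'info_tm_crawler' (seconds).
def _demo_update_getdata_tm():
    data_dict = {'asin': 'B000000000', 'info_tm_crawler': int(time.time())}  # placeholder values
    update_getdata_tm(data_dict, 'goods')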
def select_analyzer_state(asins, kws, urlQ, aid, tid, aday=None):
    if not aday:
        aday = return_PST().strftime("%Y%m%d")
    _, init_time = get_the_time()
    asin_tuple = tuple(set(asins))
    kw_tuple = tuple(set(kws))
    # Note the leading space before "and": without it the SQL was invalid ("...1234and asin in...").
    asin_sql = ('select count(*) from amazon_product_data where getinfo_tm > '
                + str(init_time - 3600 * 2) + ' and asin in %s;')
    the_value = lambda lst: lst[0][0] if len(lst) > 0 and type(lst[0]) is tuple and len(lst[0]) > 0 else 0
    asin_count = the_value(urlQ.retrieve_asin(asin_sql, [asin_tuple]))
    print(asin_count, type(asin_count))
    # kw_sql = ('select count(*) from amazon_keyword_data where getinfo_tm > '
    #           + str(init_time - 3600 * 2) + ' and kw in %s;')
    # kw_count = the_value(urlQ.retrieve_asin(kw_sql, [kw_tuple]))
    # print(kw_count, type(kw_count))
    sql1 = "select kw from public.amazon_druid_keyword_data where tm > " + str(init_time - 3600 * 2)
    sql2 = sql1 + " and kw in %s group by kw;"
    print(sql2)
    get_values = lambda lst: [x[0] for x in lst if type(x) is tuple and len(x) > 0]
    rows = get_values(urlQ.retrieve_asin(sql2, [kw_tuple]))
    # rows = list(set(rows))
    finish_kws = json.dumps(rows)
    finish_count = len(rows)
    print(finish_kws, type(finish_kws))
    print('%s asins and %s keywords in total\n%s asins and %s keywords updated'
          % (len(asin_tuple), len(kw_tuple), asin_count, finish_count))
    # Once 90% of the keywords are done, the task can be marked finished.
    if len(kw_tuple) - finish_count < len(kw_tuple) * 0.1:
        # Mark the crawler finished (crawler_state=2).
        update_analyzer_state(tid, aid, 2, finish_count, finish_kws=finish_kws)
        return True
def get_the_time():
    # Date format
    date_fmt = '%Y%m%d'
    # Datetime format
    time_fmt = '%Y%m%d%H%M%S'
    # Current local and Pacific datetimes
    the_day = datetime.now()
    the_hour = the_day.hour
    pstNow = return_PST()
    pstHour = pstNow.hour
    # print(1.1, the_day)
    # Today's date string
    date_str = the_day.strftime(date_fmt)
    # Today at 15:00 as a string
    the_day_str = '%s150000' % (date_str)
    # Today at 15:00 as a struct_time
    time_day = time.strptime(the_day_str, time_fmt)
    # print(1, time_day)
    the_time = time.mktime(time_day)
    # Timestamp of today at 15:00
    the_date_time = the_time
    # Timestamp of yesterday at 15:00
    old_date_time = the_date_time - 86400
    # Once Pacific-time midnight has passed, the boundary shifts forward a day.
    if 10 >= pstHour >= 0 and 15 <= the_hour <= 23:
        the_date_time = the_time + 86400
        old_date_time = the_time
    return the_date_time, old_date_time
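# Usage sketch (assumes a host clock at UTC+8, which makes the 15:00 local boundary
# above line up with Pacific-time midnight): get_the_time() returns the (current,
# previous) boundary timestamps; records with ts >= previous belong to the current
# crawl day. The demo function name is hypothetical.
def _demo_get_the_time():
    cur_boundary, old_boundary = get_the_time()
    fmt = lambda ts: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))
    print('crawl-day window: %s -> %s' % (fmt(old_boundary), fmt(cur_boundary)))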
def record_not_found_reviews(the_asin, data_type=''):
    data_dict = {
        'asin': the_asin,
        'comment_tm_crawler': int(DataOutput.get_redis_time() / 1000)
    }
    db_name = 'public.amazon_product_monitor.comment_tm_crawler'
    update_sql = "update public.amazon_product_monitor set comment_tm_crawler=%(comment_tm_crawler)s where asin=%(asin)s;"
    conn = psycopg2.connect(**DATADB_CONFIG[BASE_TYPE])
    cur = conn.cursor()
    try:
        cur.execute(update_sql, data_dict)
        row = cur.rowcount
        if row > 0:
            conn.commit()
            print('%s,%s,%s,%s rows, update succeeded'
                  % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), db_name, the_asin, row))
    except Exception as e:
        conn.rollback()
        # print('failed amazon_product_data ', e, data_dict)
    finally:
        cur.close()
        conn.close()
def __init__(self):
    # Config
    self.db_config = deepcopy(DATADB_CONFIG[BASE_TYPE])
    # Database name
    self.database = self.db_config['database']
    super(ProductModel, self).__init__(self.db_config)
    # amazon_product_data table
    self.product_name = 'amazon_product_data'
    self.product_class = self.get_class(self.product_name)  # for CRUD
    # amazon_druid_product_data table
    self.druid_name = 'amazon_druid_product_data'
    self.druid_class = self.get_class(self.druid_name)  # for CRUD
    # amazon_druid_product_data_bsr table
    self.bsr_name = 'amazon_druid_product_data_bsr'
    self.bsr_class = self.get_class(self.bsr_name)
    # amazon_product_monitor table
    self.monitor_name = 'amazon_product_monitor'
    self.monitor_class = self.get_class(self.monitor_name)
    # Time-related attributes.
    self.date_fmt = '%Y%m%d'
    self.today = return_PST()
    self.old_date = self.today + timedelta(days=-3)
def record_not_found_goods(the_asin, data_type=''):
    data_dict = {'asin': the_asin, 'asin_state': 0, 'getinfo_tm': int(DataOutput.get_redis_time())}
    conn = psycopg2.connect(**DATADB_CONFIG[BASE_TYPE])
    cur = conn.cursor()
    try:
        sql = ("update public.amazon_product_data set "
               "price=0, sale_price=0, sname='', ts_min_price=0, to_sell=0, byb=0, bsr=0, rc=0, rrg=0, "
               "r5p=0, r4p=0, r3p=0, r2p=0, r1p=0, feature='', brand='', release_date=0, "
               "collect_tm=0, variant='', cart_price=0, quantity=0, seller_id='', "
               "asin_state=%(asin_state)s, getinfo_tm=%(getinfo_tm)s, is_sync=0, crawler_state=1 "
               "where asin=%(asin)s;")
        cur.execute(sql, data_dict)
        row = cur.rowcount
        if row > 0:
            conn.commit()
            print('\namazon_product_data,%s,%s,%s rows, update succeeded\n'
                  % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), the_asin, row))
            sql1 = "update public.amazon_product_monitor set info_tm_crawler=%(info_tm_crawler)s where asin=%(asin)s;"
            data_dict1 = {'asin': the_asin, 'info_tm_crawler': int(DataOutput.get_redis_time() / 1000)}
            cur.execute(sql1, data_dict1)
            conn.commit()
            # cur.execute("select * from amazon_product_data where asin=%s;", (the_asin,))
            # print(cur.fetchall())
        else:
            conn.rollback()
            sql = ("INSERT INTO public.amazon_product_data(asin, asin_state, getinfo_tm, crawler_state) "
                   "VALUES (%(asin)s, %(asin_state)s, %(getinfo_tm)s, 1);")
            cur.execute(sql, data_dict)
            row = cur.rowcount
            if row > 0:
                conn.commit()
                sql1 = "update public.amazon_product_monitor set info_tm_crawler=%(info_tm_crawler)s where asin=%(asin)s;"
                data_dict1 = {'asin': the_asin, 'info_tm_crawler': int(DataOutput.get_redis_time() / 1000)}
                cur.execute(sql1, data_dict1)
                conn.commit()
                print('\namazon_product_data,%s,%s,%s rows, insert succeeded\n'
                      % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), the_asin, row))
            else:
                conn.rollback()
    except Exception as e:
        conn.rollback()
        datas = {the_asin: data_dict}
        # print('failed amazon_product_data ', e, datas)
    finally:
        cur.close()
        conn.close()
def record_log(self, asin_keyword, time1, msgInt, msgType, startTime, ip, proxyInfo):
    '''msgInt for data: 1 success, 2 failure, 3 error; for html: 4 failure, 5 captcha, 6 error, 7 page not found.'''
    time2 = time.time()
    endTime = return_PST().strftime("%Y-%m-%d %H:%M:%S")
    diffTime = time2 - time1
    self.info_log.info('%s, %s, %s, %s, %s, %s, %s, %s'
                       % (ip, asin_keyword, msgType, startTime, endTime, diffTime, msgInt, proxyInfo))
def save_discard_url(self, asin, url, num, discard_type):
    pstNow = return_PST()
    timeNow = pstNow.strftime("%Y-%m-%d %H:%M:%S")
    dateNow = pstNow.strftime("%Y_%m_%d")
    filepath = os.path.join(DATA_DIR, 'discard_url_%s.log' % (dateNow))
    msg = '[%s][%s][%s] [%s] [%s] [discarded]\n' % (timeNow, asin, url, discard_type, num)
    # Open the file as UTF-8; the bare `msg.encode('utf-8')` in the original discarded its result.
    with open(filepath, 'a', encoding='utf-8') as f:
        f.write(msg)
def save_data_to_db(self, update_sql, insert_sql, the_asin_or_kw, data_dict, db_name='', md5key=''):
    self.dataQ.record_dbSum_times()
    # print(the_asin_or_kw, data_dict)
    try:
        if update_sql and insert_sql:
            # Upsert: try the update first, fall back to an insert when no row matched.
            self.cur.execute(update_sql, data_dict)
            row = self.cur.rowcount
            if row > 0:
                self.dbObj.commit()
                self.dataQ.record_db_ok_times()
                self.db_log.info('%s,%s,%s,%s rows, update succeeded'
                                 % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), db_name, the_asin_or_kw, row))
                # The reviews process marks the first download.
                if md5key:
                    self.dataQ.the_reviews_first_download(md5key)
            else:
                self.dbObj.rollback()
                self.cur.execute(insert_sql, data_dict)
                row = self.cur.rowcount
                if row > 0:
                    self.dbObj.commit()
                    self.dataQ.record_db_ok_times()
                    self.db_log.info('%s,%s,%s,%s rows, insert succeeded'
                                     % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), db_name, the_asin_or_kw, row))
                    # The reviews process marks the first download.
                    if md5key:
                        self.dataQ.the_reviews_first_download(md5key)
                else:
                    self.dbObj.rollback()
        elif update_sql:
            self.cur.execute(update_sql, data_dict)
            row = self.cur.rowcount
            if row > 0:
                self.dbObj.commit()
                self.dataQ.record_db_ok_times()
                self.db_log.info('%s,%s,%s,%s rows, update succeeded\n'
                                 % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), db_name, the_asin_or_kw, row))
            else:
                self.dbObj.rollback()
        elif insert_sql:
            self.cur.execute(insert_sql, data_dict)
            row = self.cur.rowcount
            if row > 0:
                self.dbObj.commit()
                self.dataQ.record_db_ok_times()
                self.db_log.info('%s,%s,%s,%s rows, insert succeeded'
                                 % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), db_name, the_asin_or_kw, row))
            else:
                self.dbObj.rollback()
    except Exception as e:
        self.dbObj.rollback()
        datas = {the_asin_or_kw: data_dict}
        self.debug_log.error('%s,%s,%s, save failed, reason %s, failed data [%s];'
                             % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), db_name, the_asin_or_kw, e, datas))
        self.db_log.error('%s,%s,%s, save failed, reason %s, failed data [%s];'
                          % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), db_name, the_asin_or_kw, e, datas))
        return datas
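# Usage sketch (hypothetical SQL and row; the real statements live in SqlConfig):
# save_data_to_db runs the UPDATE first and only INSERTs when no row matched, so an
# update/insert pair behaves like an upsert keyed on asin. It returns None on
# success and {asin_or_kw: data_dict} on failure.
def _demo_save_data_to_db(dataOutput):
    update_sql = "update public.amazon_product_data set bsr=%(bsr)s where asin=%(asin)s;"
    insert_sql = "INSERT INTO public.amazon_product_data(asin, bsr) VALUES (%(asin)s, %(bsr)s);"
    row = {'asin': 'B000000000', 'bsr': 1234}  # placeholder values
    return dataOutput.save_data_to_db(update_sql, insert_sql, row['asin'], row,
                                      db_name='public.amazon_product_data')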
def wrap(*args, **kwargs):
    '''Inner wrapper of the HTML-saving decorator; it decorates the product-detail
    and inventory parsing methods.'''
    # Run the wrapped function first.
    result = func(*args, **kwargs)
    try:
        if 'qty' in func.__name__:
            # Inventory function: pull the arguments out of kwargs.
            asin = kwargs.get('asin', '')
            html = kwargs.get('html_code', '')
            html_type = 'inventory'
            print(asin, html)
        else:
            # Any other function: pull them out of the positional args.
            asin = args[2]
            html = args[1]
            html_type = 'product'
        print('*' * 20)
        print(func.__name__, html_type, asin, len(html), type(html))
        print('*' * 20)
        ## Only save asins that appear in save_asin_list:
        # if asin in save_asin_list:
        # With the line above commented out, every valid HTML page is saved.
        if type(html) is str and html:
            # Pacific time, used in the file name.
            datenow = return_PST()
            # Base directory for the saved files.
            # base_dir = '/data3/var/devtest/'
            base_dir = '../../data/devtest/'
            # Create the directory tree if it does not exist yet.
            save_dir = os.path.join(base_dir, 'save_asin_html', datenow.strftime('%Y%m%d'))
            os.makedirs(save_dir, exist_ok=True)
            # Build the file name.
            file_path = os.path.join(
                save_dir,
                '%s_%s_%s.html' % (asin, html_type, datenow.strftime('%Y%m%d_%H_%M_%S')))
            # Write the HTML to the file.
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(html)
    except Exception as e:
        print(e)
    return result
def wrap(*args, **kwargs):
    '''Inner wrapper of the HTML-saving decorator; it decorates the keyword parsing method.'''
    # Run the wrapped function first.
    result = func(*args, **kwargs)
    try:
        keyword = args[2]
        html_list = args[1]
        html_type = 'keyword'
        # print(keyword, html_list)
        print('*' * 20)
        print(func.__name__, html_type, keyword, len(html_list), type(html_list))
        print('*' * 20)
        ## Only save asins that appear in save_asin_list:
        # if asin in save_asin_list:
        # With the line above commented out, every valid HTML page is saved.
        if type(html_list) is list and html_list:
            i = 1
            for html in html_list:
                # Pacific time, used in the file name.
                datenow = return_PST()
                # Base directory for the saved files.
                base_dir = '../../data/devtest/'
                # Create the directory tree if it does not exist yet.
                save_dir = os.path.join(base_dir, 'save_asin_html', datenow.strftime('%Y%m%d'))
                os.makedirs(save_dir, exist_ok=True)
                # Build the file name.
                keyword = '_'.join(keyword.split(' '))
                print(keyword, i)
                file_path = os.path.join(
                    save_dir,
                    '%s_%s_%s_%s.html' % (keyword, html_type, datenow.strftime('%Y%m%d_%H_%M_%S'), i))
                # Write the HTML to the file.
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(html)
                i += 1
    except Exception as e:
        print(e)
    return result
def all_url_init(urlQ, kwQ):
    kwNum = kw_init(kwQ)
    goodsNum = goods_init(urlQ)
    reviewsNum = reviews_init(urlQ)
    tosellNum = tosell_init(urlQ)
    urlNum = goodsNum + reviewsNum + tosellNum + kwNum
    urlQ.update_mission_attempts(urlNum)
    pstNow = return_PST()
    startTime = pstNow.strftime("%Y-%m-%d %H:%M:%S")
    dateNow = pstNow.strftime("%Y_%m_%d")
    statFile = os.path.join(REPORT_DIR, 'statistics_%s.csv' % (dateNow))
    msg = '\n[,%s,] [,init report,], total tasks, %s, added to goods queue, %s, ' % (startTime, urlNum, goodsNum)
    msg1 = 'added to reviews queue, %s, ' % (reviewsNum)
    msg2 = 'added to tosell queue, %s,' % (tosellNum)
    msg3 = 'added to keyword queue, %s, tasks started!' % (kwNum)
    msgs = msg + msg1 + msg2 + msg3
    with open(statFile, 'a') as f:
        f.write(msgs)
def analyzer_start(urlQ, dataQ, kwQ, info_log, debug_log, i):
    print('\nanalyzer_start%s started' % (i))
    while True:
        urllen = urlQ._get_queue_len(queue_name)
        print('current analyzer task queue length: %s' % (urllen))
        if urllen < 1:
            sys.exit()
        # Fetch a task dict.
        task_dict = get_task(urlQ)
        # Monitoring timestamp.
        mtm = task_dict.get('mtm', int(time.time()))
        # Analyzer id.
        aid = task_dict.get('aid', -1)
        # Task id.
        tid = task_dict.get('tid')
        # Unpack the asins and repack them one dict per asin.
        asin_list = task_dict.get('asins', [])
        for asin in asin_list:
            asin_dict = dict(asin=asin, monitor_tm=mtm, aid=aid, utp='goods')
            # Put the asin under monitoring.
            save_task_to_db(urlQ, asin_dict, url_type='goods')
        # Unpack the keywords and repack them one dict per keyword.
        kws_dict = task_dict.get('kws', {})
        # print(kws_dict, type(kws_dict))
        kw_list = []
        for k, v in kws_dict.items():
            kw_list.extend(v)
        for kw in kw_list:
            kw_dict = dict(aid=aid, monitor_tm=mtm, kw=kw, utp='keyword')
            # Put the keyword under monitoring.
            save_task_to_db(urlQ, kw_dict, url_type='keyword')
        # Update the analyzer state (1 = analyzing).
        update_analyzer_state(tid, aid, 1)
        aday = return_PST().strftime("%Y%m%d")
        while True:
            if select_analyzer_state(asin_list, kw_list, urlQ, aid, tid, aday=aday):
                break
            time.sleep(60 * 5)
def is_not_turn_the_page(self, first, html, page_num=0, asin=''):
    '''Paging rules: if this is not the first download and a single page already
    contains a review from before yesterday, stop paging. If it is the first
    download, stop once a page contains a review older than three months.'''
    reviews_date_list = ReviewsParser.get_reviews_date_list(html)
    pst_now = return_PST()
    min_reviews_date = (min(reviews_date_list) if len(reviews_date_list) > 0
                        else int(pst_now.strftime('%Y%m%d')))
    oldDate = pst_now - timedelta(days=90)
    yesterdate = pst_now - timedelta(days=1)
    theYesterDate = int(yesterdate.strftime('%Y%m%d'))
    three_mon_date = int(oldDate.strftime('%Y%m%d'))
    print('\n%s: min_reviews_date: %s\ntheYesterDate: %s\nthree_mon_date: %s\n'
          % (asin, min_reviews_date, theYesterDate, three_mon_date))
    # Not the first download.
    # If redis crashed and this flag was lost, it should be repaired from the database.
    if not first:
        if min_reviews_date < theYesterDate:
            print('%s < %s' % (min_reviews_date, theYesterDate))
            print('Not first download: page %s of %s reviews, no more paging needed' % (page_num, asin))
            return True
        else:
            print('Not first download: page %s of %s reviews, keep paging' % (page_num, asin))
            return False
    else:
        if min_reviews_date < three_mon_date:
            print('%s < %s' % (min_reviews_date, three_mon_date))
            print('First download: page %s of %s reviews, no more paging needed' % (page_num, asin))
            return True
        else:
            print('First download: page %s of %s reviews, keep paging' % (page_num, asin))
            return False
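# A minimal sketch of the threshold arithmetic the paging rule above relies on:
# review dates are compared as yyyymmdd integers, so "older than X" is a plain <
# between integers (the 20180101 review date below is hypothetical).
def _demo_paging_thresholds():
    now = return_PST()
    yesterday = int((now - timedelta(days=1)).strftime('%Y%m%d'))
    three_months_ago = int((now - timedelta(days=90)).strftime('%Y%m%d'))
    review_date = 20180101
    print(review_date < yesterday, review_date < three_months_ago)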
def parser_not_found(self, the_asin, goods_html, html_code_list=None):
    self.html_code = goods_html
    self.head_html = self.get_head_html(goods_html)
    self.xpath_obj = etree.HTML(goods_html)
    self.desc_html = self.get_description_html(goods_html)
    self.buy_box_html = self.get_buy_box_html(goods_html)
    tosell_list = []
    tosell_info = {}
    sn = 1
    if self._is_Currently_unavailable():
        sn = 0
    sname = self._get_seller_name()  # seller
    seller_id = self._get_seller_id()
    if not sname:
        seller_id = ''
    price = self._get_discount_price()
    total_ratings = self._get_review_count()  # number of reviews
    reivew_count = self._get_review_rating()  # overall rating
    r5p = self._get_review_5_percent()  # share of 5-star reviews
    r4p = self._get_review_4_percent()  # share of 4-star reviews
    positive = r5p + r4p
    byb = self._has_buy_button()
    fba = self._get_fba()
    tosell_dict = dict(
        asin=the_asin,
        condition='',  # item condition
        sname=sname,  # seller
        stype='',  # shipping type
        price=price,  # price
        demo='',  # description
        positive=positive,  # positive-feedback rate
        total_ratings=total_ratings,  # total ratings
        tm=int(BaseParser.get_redis_time() / 1000),  # update time
        fba=fba,  # whether FBA
        seller_id=seller_id,  # seller id
        reivew_count=reivew_count,  # rating (key spelling kept to match the DB column)
        delivery='',  # delivery method
        aday=return_PST().strftime("%Y%m%d"),  # Pacific date the data was fetched
    )
    tosell_list.append(tosell_dict)
    fba_sn = 1 if fba > 0 else 0
    if byb > 0:
        the_sname = sname
        the_seller_id = seller_id
    else:
        the_sname = ''
        the_seller_id = ''
    if not the_sname:
        the_seller_id = ''
    tosell_datas = dict(
        asin=the_asin,
        sn=sn,  # number of competing sellers
        fba_sn=fba_sn,  # number of FBA competing sellers
        plow=price,  # lowest price
        plows=sname,  # lowest-price seller name
        plows_id=seller_id,  # lowest-price seller id
        getinfo_tm=int(BaseParser.get_redis_time()),  # fetch time
        sname=the_sname,  # buy-box seller
        seller_id=the_seller_id,  # buy-box seller id
    )
    tosell_html = html_code_list[0] if type(html_code_list) is list and len(html_code_list) > 0 else ''
    if re.search('There are currently no listings for this search', tosell_html):
        # No "new" offers: zero sellers, zero FBA sellers, sentinel price, empty lowest-price seller.
        tosell_datas['sn'] = 0
        tosell_datas['fba_sn'] = 0
        tosell_datas['plow'] = -1
        tosell_datas['plows'] = ''
        tosell_datas['plows_id'] = ''
        tosell_info[the_asin] = (tosell_datas, [])
    elif self._is_Currently_unavailable():
        # Not for sale: zero sellers, zero FBA sellers, sentinel price, empty sellers.
        tosell_datas['sn'] = 0
        tosell_datas['fba_sn'] = 0
        tosell_datas['plow'] = -1
        tosell_datas['plows'] = ''
        tosell_datas['plows_id'] = ''
        tosell_datas['sname'] = ''
        tosell_datas['seller_id'] = ''
        tosell_info[the_asin] = (tosell_datas, [])
    else:
        tosell_info[the_asin] = (tosell_datas, tosell_list)
    return tosell_info
def tosell_parser(self, html_code_list, the_asin, tosellSum=None, ip='', download_url='', goods_html_code=''):
    fba_list = []
    sn_list = []
    plow_list = []
    plow_dict = {}
    tosell_list = []
    tosell_info = {}
    the_sname, the_seller_id = self._get_byb_merchant(goods_html_code)
    print(the_sname, the_seller_id)
    ts_price = self.get_to_sell_price(goods_html_code)
    print(ts_price)
    for html_code in html_code_list:
        xpathObj_list = self.get_tosell_html(html_code)
        print('xpathObj_list: ', xpathObj_list)
        for xpathObj in xpathObj_list:
            self.html_code = str(tostring(xpathObj), encoding='utf-8')
            # print(self.html_code)
            condition = self._get_condition()
            sname = self._get_sname()
            stype = self._get_stype()
            price = self._get_price()
            demo = self._get_demo()
            positive = self._get_positive(demo)
            total_ratings = self._get_total_ratings(demo)
            reivew_count = self._get_reivew_count(demo)
            seller_id = self._get_seller_id()
            if not sname:
                seller_id = ''
            fba = self._get_fba()
            delivery = self._get_delivery()
            tosell_dict = dict(
                asin=the_asin,
                condition=condition,  # item condition
                sname=sname,  # seller
                stype=stype,  # shipping type
                price=price,  # price
                demo=demo,  # description
                positive=positive,  # positive-feedback rate
                total_ratings=total_ratings,  # total ratings
                tm=int(BaseParser.get_redis_time() / 1000),  # query time
                fba=fba,  # whether FBA
                seller_id=seller_id,  # seller id
                reivew_count=reivew_count,  # rating
                delivery=delivery,  # delivery method
                aday=return_PST().strftime("%Y%m%d"),  # Pacific date the data was fetched
            )
            if fba:
                if seller_id:
                    fba_list.append(str(seller_id))
                else:
                    fba_list.append(str(sname))
            # if seller_id:
            #     sn_list.append(str(seller_id))
            # else:
            #     sn_list.append(str(sname))
            plow_list.append(price)
            price_str = str(price)
            if price_str not in plow_dict:
                plow_dict[price_str] = dict(sname=sname, seller_id=seller_id)
            tosell_list.append(tosell_dict)
    sn = len(tosell_list)
    # print('\ntosellSum: ', tosellSum)
    # print('sn: ', sn, '\n')
    fba_sn = len(fba_list)  # number of FBA offers
    plow = min(plow_list) if len(plow_list) > 0 else 0
    # if ts_price > 0 and ts_price - plow > ts_price * 0.7:
    #     plow = ts_price
    plows1 = plow_dict.get(str(plow)) or {}
    plows = plows1.get('sname') or ''
    plows_id = plows1.get('seller_id') or ''
    tosell_datas = dict(
        asin=the_asin,
        sn=sn,  # number of competing sellers
        fba_sn=fba_sn,  # number of FBA competing sellers
        plow=plow,  # lowest price
        plows=plows,  # lowest-price seller name
        plows_id=plows_id,  # lowest-price seller id
        getinfo_tm=int(BaseParser.get_redis_time()),  # fetch time
        sname=the_sname,  # buy-box seller
        seller_id=the_seller_id,  # buy-box seller id
    )
    if len(tosell_list) > 0:
        tosell_info[the_asin] = (tosell_datas, tosell_list)
    else:
        tosell_info = TosellNotFoundParser(goods_html_code).parser_not_found(
            the_asin, goods_html_code, html_code_list)
    print(tosell_info)
    return tosell_info
def add_url_to_queue(theQueue, url_tuple_list, url_type='', sql_times=None):
    used = 'useInterval'
    aday = return_PST().strftime('%Y%m%d')

    def add_to_queue(theQueue, url_tuple_bytes, url_type=''):
        result = False
        if url_type == 'goods':
            result = theQueue.add_goods_url_to_queue(url_tuple_bytes)
        if url_type == 'reviews':
            result = theQueue.add_reviews_url_to_queue(url_tuple_bytes)
        if url_type == 'tosell':
            result = theQueue.add_tosell_url_to_queue(url_tuple_bytes)
        if url_type == 'keyword':
            result = theQueue.add_keyword_to_queue(url_tuple_bytes)
        return result

    # print(url_tuple_list)
    url_data_dict = {}
    for urlTuple in url_tuple_list:
        # print(urlTuple)
        kw_asin = urlTuple[0]
        cid_monitorType = urlTuple[1]
        aid = urlTuple[2]
        # Monitoring time.
        monitor_tm = urlTuple[3]
        md5value = kw_asin + url_type
        md5key = theQueue.get_md5_key(md5value)
        usedMd5key = theQueue.get_md5_key(md5value + used)
        url_dict = dict(
            aid=aid,
            mtm=monitor_tm,
            md5=md5key,
            umd5=usedMd5key,
            utp=url_type,
        )
        if url_type == 'keyword':
            url_dict['kw'] = kw_asin
            url_dict['cid'] = cid_monitorType
        else:
            url_dict['asin'] = kw_asin
            url_dict['mtp'] = cid_monitorType
        if not url_data_dict.get(kw_asin):
            url_data_dict[kw_asin] = url_dict
    url_data_list = []
    for url in url_data_dict:
        url_data_list.append(url_data_dict[url])
    url_dict_list = sorted(url_data_list, key=lambda x: x.get('mtm', int(time.time())))
    filter_list = lambda lst: [x[0] for x in lst if type(x) is tuple and len(x) > 0]
    asin_set_list = []
    # Guard on sql_times: without it, sql_times * 1000 would raise on the default None.
    if url_type == 'keyword' and sql_times:
        sql1 = "select kw from public.amazon_druid_keyword_data where tm > %(tm)s and aday=%(aday)s group by kw;"
        # print(sql1)
        data = dict(tm=sql_times * 1000, aday=aday)
        asin_set_list = filter_list(theQueue.retrieve_asin(sql1, data))
        if len(asin_set_list) > 10:
            print(asin_set_list[:10])
        print('len asin_set_list', len(asin_set_list))
    if url_type == 'goods' and sql_times:
        sql1 = "select asin from public.amazon_druid_product_data where tm > %s and aday='%s';" % (
            sql_times * 1000, return_PST().strftime('%Y%m%d'))
        # print(sql1)
        asin_set_list = filter_list(theQueue.retrieve_asin(sql1))
        if len(asin_set_list) > 10:
            print(asin_set_list[:10])
        print('len asin_set_list', len(asin_set_list))
    for url_dict in url_dict_list:
        url_tuple_bytes = pickle.dumps(url_dict)
        result = add_to_queue(theQueue, url_tuple_bytes, url_type)
        if result:
            theQueue.add_asinAndKw_to_set(url_dict.get('md5'))
            # theQueue.srem_successAsinSet_from_set(md5key)
        if url_type == 'goods':
            # If the asin was marked downloaded but never landed in druid, un-mark it.
            if sql_times and theQueue.is_downloaded(url_dict.get('md5')):
                if url_dict.get('asin') not in asin_set_list:
                    print('asin not actually downloaded:', url_dict.get('asin'))
                    theQueue.srem_successAsinSet_from_set(url_dict.get('md5'))
        if url_type == 'keyword':
            # Same re-check for keywords.
            if sql_times and theQueue.is_downloaded(url_dict.get('md5')):
                if url_dict.get('kw') not in asin_set_list:
                    print('kw not actually downloaded:', url_dict.get('kw'))
                    theQueue.srem_successAsinSet_from_set(url_dict.get('md5'))
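# Usage sketch (hypothetical tuple values): each task tuple is
# (asin_or_kw, cid_or_monitor_type, aid, monitor_tm), and the dedup key for the
# queue is the md5 of "<asin_or_kw><url_type>". With sql_times=None the
# "actually downloaded" re-check against the druid tables is skipped.
def _demo_add_url_to_queue(theQueue):
    tasks = [('B000000000', 1, -1, int(time.time()))]
    add_url_to_queue(theQueue, tasks, url_type='goods', sql_times=None)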
def download(self, asin_or_kw, url_dict, **kwargs):
    url_type = self.url_type
    asin = asin_or_kw
    monitor_type = url_dict.get('mtp') or 5
    print('url type: ', url_type)
    startTime = return_PST().strftime("%Y-%m-%d %H:%M:%S")
    time1 = time.time()
    url_md5key = url_dict.get('md5') or ''
    if not url_md5key:
        url_md5key = self.get_md5_key(asin + url_type)
    goodsUrl, referer = self.make_url(asin, url_type='goods')
    if goodsUrl:
        html_list, url_list, cookiesObj, is_error_list, tosellSum = \
            self.get_tosell_html_lsit(asin, goodsUrl, referer, **kwargs)
        old_dnum = url_dict.get('dnum') or 0
        durl = url_dict.get('durl') or []
        url_dict['durl'] = list(set(durl + url_list))
        url_dict['dnum'] = old_dnum + 1
        # If the asin is judged to have no competing offers, stop here.
        if self.not_found:
            self.urlQ.record_tosell_notFound_times()
            msgInt = 0
            proxyInfo = 'the asin not tosell'
            self.record_log(asin, time1, msgInt, url_type, startTime, proxyInfo)
            return self.debug_log.war('%s has no competing offers' % (asin))
        i = -1
        tosell_html_list = []
        if len(html_list) > 0:
            for html in html_list:
                i += 1
                is_error = is_error_list[i]
                print(is_error_list, is_error_list[i])
                url = url_list[i]
                if is_error:
                    self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
                    msgInt = 6
                    proxyInfo = 'get Html error'
                    self.record_log(asin, time1, msgInt, url_type, startTime, proxyInfo)
                else:
                    analyze = self.analyze_html(html, asin, url_dict, time1, startTime, html_type=url_type)
                    if analyze and analyze != 404:
                        tosell_html_list.append(html)
            print('html num: ', len(html_list), 'tosell_html num: ', len(tosell_html_list))
            if len(tosell_html_list) == len(html_list):
                result, is_error = self.parser(tosell_html_list, html_type=url_type, asin=asin,
                                               monitor_type=monitor_type, tosellSum=tosellSum,
                                               goods_html_code=self.goods_html)
                # from pprint import pprint
                # pprint(('tosell_result', result))
                if is_error:
                    self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
                    msgInt = 3
                    proxyInfo = 'get data error'
                    self.record_log(asin, time1, msgInt, url_type, startTime, proxyInfo)
                elif not result:
                    self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
                    msgInt = 2
                    proxyInfo = 'get data defeated'
                    self.record_log(asin, time1, msgInt, url_type, startTime, proxyInfo)
                else:
                    self.save_success_asin_keyword(asin, url_type=url_type)
                    msgInt = 1
                    proxyInfo = 'get data success'
                    self.record_log(asin, time1, msgInt, url_type, startTime, proxyInfo)
                    tosell_datas = result[0]
                    # from pprint import pprint
                    # pprint(tosell_datas)
                    data_bytes = pickle.dumps(tosell_datas)
                    self.dataQ.add_tosell_to_queue(data_bytes)
                    self.dataQ.record_data_ok_times()
                    self.dataQ.record_tosell_ok_times()
            else:
                self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
        else:
            if tosellSum == -1:
                self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
            if tosellSum > 0:
                self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
    else:
        print(url_type, 'no url')
        self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
        time.sleep(1)
def statistics(urlQ):
    the_date = return_PST()
    file_name = os.path.join(REPORT_DIR, 'statistics_info_%s.csv' % (the_date.strftime('%Y%m%d')))
    print(file_name)
    # Keywords: total, saved, failed to save
    kw_sum, kw_num, kw_fail_sum = keyword_retry(urlQ)
    # Products: total, saved, failed to save
    # goods_sum, goods_num, goods_fail_sum, fail_goods = goods_retry(urlQ)
    goods_sum, goods_num, goods_fail_sum = goods_retry(urlQ)
    # Failed inventory lookups
    not_qty_sum = qty_retry(urlQ)
    # Reviews: total, saved, failed to save
    reviews_sum, reviews_num, reviews_fail_sum = reviews_retry(urlQ)
    # Tosell: total, saved, failed to save
    tosell_sum, tosell_num, tosell_fail_sum = tosell_retry(urlQ)
    # Inventory rows saved
    inventory_num = goods_num - not_qty_sum
    # Not yet saved, per queue
    goods_unfinished = goods_sum - goods_num
    kw_unfinished = kw_sum - kw_num
    tosell_unfinished = tosell_sum - tosell_num
    reviews_unfinished = reviews_sum - reviews_num
    # Completion rates (percent)
    inventory_complete_rate = inventory_num / goods_sum * 100
    goods_complete_rate = goods_num / goods_sum * 100
    kw_complete_rate = kw_num / kw_sum * 100
    tosell_complete_rate = tosell_num / tosell_sum * 100
    reviews_complete_rate = reviews_num / reviews_sum * 100
    # Retry rates (percent)
    inventory_retry_rate = not_qty_sum / goods_sum * 100
    goods_retry_rate = goods_fail_sum / goods_sum * 100
    kw_retry_rate = kw_fail_sum / kw_sum * 100
    tosell_retry_rate = tosell_fail_sum / tosell_sum * 100
    reviews_retry_rate = reviews_fail_sum / reviews_sum * 100
    with open(file_name, 'a', encoding='utf-8') as f:
        f.write("\n , ,GTM+8,%s\n" % (time.strftime('%Y-%m-%d,%H:%M:%S', time.localtime(time.time()))))
        f.write("%s,%s,%s,%s,%s,%s\n" % (the_date.strftime('%Y-%m-%d,%H:%M:%S'),
                                         'inventory', 'product', 'keyword', 'tosell', 'reviews'))
        f.write("%s,%s,%s,%s,%s,%s\n" % (' ,total', goods_sum, goods_sum, kw_sum, tosell_sum, reviews_sum))
        f.write("%s,%s,%s,%s,%s,%s\n" % (' ,saved', inventory_num, goods_num, kw_num, tosell_num, reviews_num))
        f.write("%s,%s,%s,%s,%s,%s\n" % (' ,unfinished', goods_sum - inventory_num, goods_unfinished,
                                         kw_unfinished, tosell_unfinished, reviews_unfinished))
        f.write("%s,%s,%s,%s,%s,%s\n" % (' ,completion', '%.2f%%' % inventory_complete_rate,
                                         '%.2f%%' % goods_complete_rate, '%.2f%%' % kw_complete_rate,
                                         '%.2f%%' % tosell_complete_rate, '%.2f%%' % reviews_complete_rate))
        f.write("%s,%s,%s,%s,%s,%s\n" % (' ,to retry', not_qty_sum, goods_fail_sum, kw_fail_sum,
                                         tosell_fail_sum, reviews_fail_sum))
        f.write("%s,%s,%s,%s,%s,%s\n" % (' ,retry rate', '%.2f%%' % inventory_retry_rate,
                                         '%.2f%%' % goods_retry_rate, '%.2f%%' % kw_retry_rate,
                                         '%.2f%%' % tosell_retry_rate, '%.2f%%' % reviews_retry_rate))
    # Email parameters.
    # Row format for the report email.
    msg_format = "%s queue: %s total, %s saved, %s unsaved, completion %s, %s to retry, retry rate %s"
    # Lines to send.
    msg_list = []
    # Recipient address.
    to_addr = '*****@*****.**'
    war_msg = 'data save completion exceeded 90%'
    title_format = 'Data Save Warning! \nPT: %s'
    # Send the product report (threshold currently >= 0, i.e. always).
    if goods_complete_rate >= 0:
        msg_list.append(msg_format % ('product', goods_sum, goods_num, goods_unfinished,
                                      goods_complete_rate, goods_fail_sum, goods_retry_rate))
        send_func(msg_list, war_msg, title_format, to_addr)
        # Reset the list.
        msg_list = []
    # Send the keyword report (threshold currently >= 0, i.e. always).
    if kw_complete_rate >= 0:
        msg_list.append(msg_format % ('keyword', kw_sum, kw_num, kw_unfinished,
                                      kw_complete_rate, kw_fail_sum, kw_retry_rate))
        send_func(msg_list, war_msg, title_format, to_addr)
        msg_list = []
    # Send the tosell report (threshold currently >= 0, i.e. always).
    if tosell_complete_rate >= 0:
        msg_list.append(msg_format % ('tosell', tosell_sum, tosell_num, tosell_unfinished,
                                      tosell_complete_rate, tosell_fail_sum, tosell_retry_rate))
        send_func(msg_list, war_msg, title_format, to_addr)
        msg_list = []
    # Send the reviews report once completion exceeds 90%.
    if reviews_complete_rate >= 90:
        msg_list.append(msg_format % ('reviews', reviews_sum, reviews_num, reviews_unfinished,
                                      reviews_complete_rate, reviews_fail_sum, reviews_retry_rate))
        send_func(msg_list, war_msg, title_format, to_addr)
        msg_list = []
def _get_date(self):
    return int(return_PST().strftime('%Y%m%d'))
def tosell_save(dataQ, debug_log, db_log):
    print('\ntosell_save init\n')
    data_type = 'tosell'
    if dataQ.RedisQ.llen('tosellData') > 0:
        dbObj = GetDbObj().get_db_obj()
        cur = dbObj.cursor()
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        data_tosell_db_name = SqlConfig.data_tosell_db_name
        data_tosell_update_sql = SqlConfig.data_tosell_update_sql
        data_tosell_insert_sql = SqlConfig.data_tosell_insert_sql
        druid_tosell_db_name = SqlConfig.druid_tosell_db_name
        # Updates to the druid table are disabled; rows are insert-only here.
        druid_tosell_update_sql = None  # SqlConfig.druid_tosell_update_sql
        druid_tosell_insert_sql = SqlConfig.druid_tosell_insert_sql
        while True:
            datas = dataQ.get_new_tosellData()
            if not datas:
                if dataQ.RedisQ.llen('tosellData') > 0:
                    datas = dataQ.get_new_tosellData()
                else:
                    break
            if not datas:
                # Guard against a second empty fetch racing the llen check.
                continue
            # print('\ntosell_save datas: [= %s =] \n' % (datas))
            tm = DataOutput.get_redis_time()
            for asin in datas:
                tosell_datas = datas[asin][0]
                tosell_list = datas[asin][1]
                # print('tosell_datas: ', tosell_datas)
                print(tosell_datas['getinfo_tm'], 1)
                tosell_datas['getinfo_tm'] = tm
                print(tosell_datas['getinfo_tm'], 2)
                # Older variant that deduped against amazon_product_data_tosell by getinfo_tm:
                # sql = "select asin, getinfo_tm from public.amazon_product_data_tosell where asin=%(asin)s and getinfo_tm>%(the_tm)s;"
                # the_tm = dataQ._get_value_from_string('initUpdateTm', 'initTime')
                # if not the_tm:
                #     _, the_tm = BaseCrawler.get_the_time()
                # else:
                #     the_tm = str(the_tm, encoding='utf-8')
                # select_dict = {'asin': asin, 'the_tm': int(the_tm) * 1000}
                # cur.execute(sql, select_dict)
                # select_rows = cur.fetchall()
                sql = "select asin, aday from public.amazon_product_tosell where asin=%(asin)s and aday=%(aday)s limit 1;"
                aday = tosell_list[0]['aday'] if len(tosell_list) > 0 else return_PST().strftime('%Y%m%d')
                select_dict = {'asin': asin, 'aday': aday}
                cur.execute(sql, select_dict)
                select_rows = cur.fetchall()
                dbObj.commit()
                if len(select_rows) < 1:
                    print(tosell_datas)
                    if not tosell_datas.get('sname'):
                        # Fall back to the buy-box seller recorded in amazon_product_data within the last 24h.
                        sql1 = ("select sname, seller_id from public.amazon_product_data "
                                "where asin='%s' and getinfo_tm > %s" % (asin, tm - 24 * 3600 * 1000))
                        cur.execute(sql1)
                        select_rows = cur.fetchall()
                        dbObj.commit()
                        select_rows = select_rows[0] if len(select_rows) == 1 else ('', '')
                        sname, seller_id = select_rows
                        print('seller_id: ', seller_id)
                        print('sname ', sname)
                        tosell_datas['sname'] = sname
                        tosell_datas['seller_id'] = seller_id
                    data0 = dataOutput.save_data_to_db(data_tosell_update_sql, data_tosell_insert_sql,
                                                       asin, tosell_datas, db_name=data_tosell_db_name)
                    for item in tosell_list:
                        item['tm'] = int(tm / 1000)
                        data = dataOutput.save_data_to_db(druid_tosell_update_sql, druid_tosell_insert_sql,
                                                          asin, item, db_name=druid_tosell_db_name)
                # Record the crawler update time.
                dataOutput.crawler_tm(asin, data_type)
        cur.close()
        dbObj.close()
        db_log.war('%s, %s thread finished\n' % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
    else:
        db_log.war('%s, %s data queue is empty\n' % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
def goods_data_save(dataQ, debug_log, db_log):
    print('\ngoods_save init\n')
    data_type = 'goods'
    if dataQ.RedisQ.llen('goodsData') > 0:
        dbObj = GetDbObj().get_db_obj()
        cur = dbObj.cursor()
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        db_name = SqlConfig.goods_db_name
        update_sql = SqlConfig.goods_update_sql
        insert_sql = SqlConfig.goods_insert_sql
        while True:
            datas = dataQ.get_new_goods_data()
            the_hour = return_PST().hour
            if not datas:
                if dataQ.RedisQ.llen('goodsData') > 0:
                    datas = dataQ.get_new_goods_data()
                else:
                    break
            if not datas:
                # Guard against a second empty fetch racing the llen check.
                continue
            tm = DataOutput.get_redis_time()
            for asin, data in datas.items():
                # print('data', data)
                # If the inventory download failed, hold the record out of the history table for now.
                from pprint import pprint
                pprint(data)
                print(data['getinfo_tm'], 1)
                data['getinfo_tm'] = tm
                print(data['getinfo_tm'], 2)
                print('rc1: ', data['rc'])
                print('quantity1', data['quantity'])
                sql = ("select rc, quantity, price, title, bsr from public.amazon_product_data "
                       "where asin=%(asin)s and getinfo_tm>%(the_tm)s ;")
                select_dict = {
                    'asin': data['asin'],
                    'the_tm': (tm / 1000 - 3600 * 24 * 3) * 1000
                }
                cur.execute(sql, select_dict)
                select_rows = cur.fetchall()
                if len(select_rows) > 0:
                    print(select_rows, type(select_rows), type(select_rows[0]), type(select_rows[0][0]))
                    the_old_rc, the_old_qty, the_old_price, the_old_title, the_old_bsr = select_rows[0]
                    print(the_old_rc, the_old_qty, the_old_price, the_old_title, the_old_bsr)
                    the_new_qty = data['quantity']
                    print('price1', data['price'], the_old_price)
                    # If price is missing, reuse the previous value.
                    if data['price'] <= 0 and the_old_price > 0 and data['asin_state'] == 3:
                        data['price'] = the_old_price
                    # If title is missing, reuse the previous value.
                    if not data['title'] and the_old_title:
                        data['title'] = the_old_title
                    # If bsr is missing, reuse the previous value.
                    # if data['bsr'] < 1 and the_old_bsr > 0:
                    #     data['bsr'] = the_old_bsr
                    print('the_old_rc', the_old_rc, type(the_old_rc))
                    print('old quantity', the_old_qty, type(the_old_qty))
                    print('new quantity', the_new_qty, type(the_new_qty))
                    # If the review count dropped below the previous value, keep the previous value.
                    print("data['rc']", data['rc'], type(data['rc']))
                    if data.get('rc', 0) < the_old_rc:
                        data['rc'] = the_old_rc
                    # If the inventory crawl failed, reuse the previous quantity.
                    if the_new_qty == -1 and the_old_qty >= 0 and data['asin_state'] == 3:
                        data['quantity'] = the_old_qty
                        data['qtydt'] = 4
                        with open('quantity_fail.csv', 'a', encoding='utf-8') as f:
                            f.write('asin, %s, old qty, %s, new qty, %s\n'
                                    % (data['asin'], the_old_qty, the_new_qty))
                if data['asin_state'] == 2:
                    data['quantity'] = 0
                    data['byb'] = 0
                    data['qtydt'] = 5  # not for sale
                # If dpre is missing, fall back to price.
                if data['dpre'] <= 0 and data['price'] > 0:
                    data['dpre'] = data['price']
                # If cart_price is missing, fall back to price.
                if data['cart_price'] <= 0 and data['price'] > 0:
                    data['cart_price'] = data['price']
                print('price2', data['price'])
                print('quantity2', data['quantity'])
                print('rc2: ', data['rc'])
                if the_hour < 9 and data['quantity'] < 0 and data['asin_state'] == 3:
                    # Skip the update for now.
                    pass
                    # # Drop the fields the update table does not need
                    # data.pop('dpre')
                    # data.pop('bs1')
                    # data.pop('qc')
                    # data.pop('qtydt')
                    # data.pop('aday')
                    # # then write to the update table
                    # dataOutput.save_data_to_db(update_sql, insert_sql, asin, data, db_name=db_name)
                else:
                    # First write a copy to the history table.
                    druidData_to_db(asin, data, dataOutput)
                    # Drop the fields the update table does not need.
                    data.pop('dpre')
                    data.pop('bs1')
                    data.pop('qc')
                    data.pop('qtydt')
                    data.pop('aday')
                    # Then write to the update table.
                    dataOutput.save_data_to_db(update_sql, insert_sql, asin, data, db_name=db_name)
                # Record the crawler update time.
                dataOutput.crawler_tm(asin, data_type)
        cur.close()
        dbObj.close()
        db_log.war('%s, %s thread finished\n' % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
    else:
        db_log.war('%s, %s data queue is empty\n' % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
def keyword_data_save(dataQ, debug_log, db_log):
    print('\nkeyword_data_save init\n')
    data_type = 'keyword'
    if dataQ.RedisQ.llen('keywordData') > 0:
        pool = psycopg2.pool.SimpleConnectionPool(5, 51, **DATADB_CONFIG[BASE_TYPE])
        # dbObj1 = GetDbObj().get_db_obj()
        # cur1 = dbObj1.cursor()
        # dataOutput = DataOutput(dbObj1, cur1, db_log, debug_log, dataQ)
        keyword_data_db_name = SqlConfig.keyword_data_db_name
        keyword_data_update_sql = SqlConfig.keyword_data_update_sql
        keyword_data_insert_sql = SqlConfig.keyword_data_insert_sql
        druid_keyword_db_name = SqlConfig.druid_keyword_db_name
        # Updates to the druid table are disabled; rows are insert-only here.
        druid_keyword_update_sql = None  # SqlConfig.druid_keyword_update_sql
        druid_keyword_insert_sql = SqlConfig.druid_keyword_insert_sql
        i = 0
        while True:
            i += 1
            dbObj = pool.getconn(i)
            cur = dbObj.cursor()
            datas = dataQ.get_new_keywordData()
            if not datas:
                if dataQ.RedisQ.llen('keywordData') > 0:
                    print(dataQ.RedisQ.llen('keywordData'), type(dataQ.RedisQ.llen('keywordData')))
                    datas = dataQ.get_new_keywordData()
                else:
                    # Return the connection before exiting (the original leaked it here).
                    pool.putconn(dbObj, i)
                    break
            for kw, v in datas.items():
                tm = DataOutput.get_redis_time()
                keyword_data_dict = v[0]
                keyword_druid_data_list = v[1]
                aday = (keyword_druid_data_list[0]['aday'] if len(keyword_druid_data_list) > 0
                        else return_PST().strftime("%Y%m%d"))
                if len(keyword_druid_data_list) < 50 and keyword_data_dict['search_num'] < 1000:
                    if keyword_data_dict['search_num'] != len(keyword_druid_data_list):
                        keyword_data_dict['search_num'] = len(keyword_druid_data_list)
                        for data in keyword_druid_data_list:
                            data['srn'] = len(keyword_druid_data_list)
                # print('keyword_data_dict: ', keyword_data_dict)
                keyword_data_dict['getinfo_tm'] = tm
                sql = "select kw from public.amazon_druid_keyword_data where kw=%(kw)s and aday=%(aday)s limit 1;"
                the_data = dict(kw=kw, aday=aday)
                cur.execute(sql, the_data)
                asin_rows = cur.fetchall()
                print('asin_rows: ', len(asin_rows))
                print('keyword_druid_data_list len: ', len(keyword_druid_data_list))
                if len(asin_rows) < 1:
                    data0 = DataOutput.save_data_to_db_pool(
                        dbObj, cur, db_log, debug_log, dataQ,
                        keyword_data_update_sql, keyword_data_insert_sql, kw,
                        keyword_data_dict, db_name=keyword_data_db_name)
                    # print('keyword_druid_data_list: ', keyword_druid_data_list)
                    if len(keyword_druid_data_list) > 0:
                        for druid in keyword_druid_data_list:
                            # print(druid)
                            druid['tm'] = tm
                            data1 = DataOutput.save_data_to_db_pool(
                                dbObj, cur, db_log, debug_log, dataQ,
                                druid_keyword_update_sql, druid_keyword_insert_sql, kw,
                                druid, db_name=druid_keyword_db_name)
                    # time.sleep(20)
                # Record the crawler update time.
                data_dict = {
                    'kw': kw,
                    'crawler_tm': keyword_data_dict['getinfo_tm'] / 1000
                }
                db_name = 'public.amazon_keyword_monitor.crawler_tm'
                insert_sql = ''
                update_sql = "update public.amazon_keyword_monitor set crawler_tm=%(crawler_tm)s where kw=%(kw)s;"
                DataOutput.save_data_to_db_pool(dbObj, cur, db_log, debug_log, dataQ,
                                                update_sql, insert_sql, kw, data_dict, db_name=db_name)
            dbObj.commit()
            pool.putconn(dbObj, i)
            if i == 50:
                i = 0
        pool.closeall()
        db_log.war('%s, %s thread finished\n' % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
def tosell_save(dataQ, debug_log, db_log):
    print('\ntosell_save init\n')
    data_type = 'tosell'
    if dataQ.RedisQ.llen('tosellData') > 0:
        dbObj = GetDbObj().get_db_obj()
        cur = dbObj.cursor()
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        data_tosell_db_name = SqlConfig.data_tosell_db_name
        data_tosell_update_sql = SqlConfig.data_tosell_update_sql
        data_tosell_insert_sql = SqlConfig.data_tosell_insert_sql
        druid_tosell_db_name = SqlConfig.druid_tosell_db_name
        # Updates to the druid table are disabled; rows are insert-only here.
        druid_tosell_update_sql = None  # SqlConfig.druid_tosell_update_sql
        druid_tosell_insert_sql = SqlConfig.druid_tosell_insert_sql
        while True:
            datas = dataQ.get_new_tosellData()
            pprint(datas)
            # Sample payload:
            # datas = {'B01F0QQN8Q': ({'asin': 'B01F0QQN8Q',
            #                          'fba_sn': 1,
            #                          'getinfo_tm': 1542018763364,
            #                          'plow': 1,
            #                          'plows': 'largeshop',
            #                          'plows_id': 'df',
            #                          'seller_id': 'A1XEMYOCVN4TN8',
            #                          'sn': 1,
            #                          'sname': 'Gemschest'},
            #                         [{'aday': '20181112',
            #                           'asin': 'B01F0QQN8Q',
            #                           'condition': 'New',
            #                           'crawler_state': 1,
            #                           'delivery': 'Fulfillment by Amazon',
            #                           'demo': '5 out of 5 stars 99% positive over the past 12 months. (722 total ratings)',
            #                           'fba': 1,
            #                           'is_limit': 0,
            #                           'offering_id': 'tXTG86Zk6%2Bfn3YW0ITpD7nE1mscbzOgJAAhDW3VHDrP8cWV%2F1fd0DDtk7FV8eHIOKghI7PqYtkyapr23dSShe%2Fec6EMnW30fniLCM2fd1hkZKMTSUhqBYCuO87D2zljdYwfuDuVCDTm%2FQbjYnRPPhVBBs82MwpT9',
            #                           'positive': 99,
            #                           'price': 2199,
            #                           'qty': 11,
            #                           'qtydt': 0,
            #                           'rank': 1,
            #                           'reivew_count': 50,
            #                           'seller_id': 'A21P7EI9UKXT1Y',
            #                           'sn': 1,
            #                           'sname': 'largeshop',
            #                           'srank': 0,
            #                           'stype': 'FREE Shipping',
            #                           'tm': 1542018647,
            #                           'total_ratings': 722}])}
            if not datas:
                if dataQ.RedisQ.llen('tosellData') > 0:
                    datas = dataQ.get_new_tosellData()
                else:
                    break
            if not datas:
                # Guard against a second empty fetch racing the llen check.
                continue
            # print('\ntosell_save datas: [= %s =] \n' % (datas))
            tm = DataOutput.get_redis_time()
            for asin in datas:
                tosell_datas = datas[asin][0]
                tosell_list = datas[asin][1]
                pprint(tosell_datas)
                pprint(tosell_list)
                print(tosell_datas['getinfo_tm'], 1)
                tosell_datas['getinfo_tm'] = tm
                print(tosell_datas['getinfo_tm'], 2)
                sql = "select asin, aday from public.amazon_product_tosell where asin=%(asin)s and aday=%(aday)s limit 1;"
                aday = tosell_list[0]['aday'] if len(tosell_list) > 0 else return_PST().strftime('%Y%m%d')
                select_dict = {'asin': asin, 'aday': aday}
                cur.execute(sql, select_dict)
                select_rows = cur.fetchall()
                dbObj.commit()
                if len(select_rows) < 1:
                    if not tosell_datas.get('sname'):
                        # Fall back to the buy-box seller recorded in amazon_product_data within the last 24h.
                        sql1 = ("select sname, seller_id from public.amazon_product_data "
                                "where asin='%s' and getinfo_tm > %s" % (asin, tm - 24 * 3600 * 1000))
                        cur.execute(sql1)
                        select_rows = cur.fetchall()
                        dbObj.commit()
                        select_rows = select_rows[0] if len(select_rows) == 1 else ('', '')
                        sname, seller_id = select_rows
                        print('seller_id: ', seller_id)
                        print('sname ', sname)
                        tosell_datas['sname'] = sname
                        tosell_datas['seller_id'] = seller_id
                    data0 = dataOutput.save_data_to_db(data_tosell_update_sql, data_tosell_insert_sql,
                                                       asin, tosell_datas, db_name=data_tosell_db_name)
                    for item in tosell_list:
                        item['tm'] = int(tm / 1000)
                        data = dataOutput.save_data_to_db(druid_tosell_update_sql, druid_tosell_insert_sql,
                                                          asin, item, db_name=druid_tosell_db_name)
                # Record the crawler update time.
                dataOutput.crawler_tm(asin, data_type)
        cur.close()
        dbObj.close()
        db_log.war('%s, %s thread finished\n' % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
    else:
        db_log.war('%s, %s data queue is empty\n' % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
def download(self, ip, asin_or_kw, url_dict):
    url_type = self.url_type
    kw = asin_or_kw
    cid = url_dict.get('cid') or 0
    print(ip, kw, cid, url_dict)
    # time.sleep(30)
    url = self.make_search_url(quote(kw), cid)
    print('\nkeyword_url_tuple: ', url)
    self.debug_log.debug('url_type: %s, kw: %s, cid: %s, url %s: ' % (url_type, kw, cid, url))
    if url:
        self.debug_log.debug('[ip %s] working... [%s]' % (ip, url))
        startTime = return_PST().strftime("%Y-%m-%d %H:%M:%S")
        time1 = time.time()
        referer = 'https://www.amazon.com'
        self.debug_log.debug('keyword referer: %s' % (referer))
        ua = self.get_ua()
        self.debug_log.debug('keyword ua: %s' % (ua))
        # value_str = ip + ua
        # self.debug_log.debug('keyword value_str: %s' % (value_str))
        url_md5key = url_dict.get('md5') or ''
        if not url_md5key:
            url_md5key = self.get_md5_key(kw + url_type)
        # cookMd5key = self.get_md5_key(value_str)
        cookMd5key = None
        cookie = self.get_cookie(cookMd5key)
        self.debug_log.debug('keyword cookie: %s' % (cookie))
        # Download the url.
        html_list, url_list, cookiesObj, is_error_list = \
            self.get_keyword_html_lsit(url, ua, ip, cookie, referer, kw=kw)
        old_dnum = url_dict.get('dnum') or 0
        durl = url_dict.get('durl') or []
        url_dict['durl'] = list(set(durl + url_list))
        url_dict['dnum'] = old_dnum + 1
        # If the keyword is judged not to exist, stop here.
        if self.not_found:
            DataOutput.record_not_found_keyword(kw)
            self.dataQ.record_keyword_not_fund_times()
            msgInt = 0
            proxyInfo = 'the keyword not found'
            self.record_log(kw, time1, msgInt, url_type, startTime, ip, proxyInfo)
            return self.debug_log.war('keyword %s does not exist' % (kw))
        i = -1
        keyword_html_list = []
        if len(html_list) > 0:
            for html in html_list:
                i += 1
                is_error = is_error_list[i]
                url = url_list[i]
                if is_error:
                    msgInt = 6
                    proxyInfo = 'get Html error'
                    self.record_log(kw, time1, msgInt, url_type, startTime, ip, proxyInfo)
                else:
                    analyze = self.analyze_html(html, cookie, cookiesObj, ip, kw, url_dict,
                                                cookMd5key, time1, startTime, html_type=url_type)
                    if analyze and analyze != 404:
                        keyword_html_list.append(html)
            if len(html_list) == len(keyword_html_list):
                result, is_error = self.kw_parser(keyword_html_list, kw, cid, ip=ip, url=url)
                if is_error:
                    msgInt = 3
                    proxyInfo = 'get data error'
                    self.record_log(kw, time1, msgInt, url_type, startTime, ip, proxyInfo)
                elif not result:
                    self.the_url_is_discard(kw, url_dict, url_type, url_md5key)
                    msgInt = 2
                    proxyInfo = 'get data defeated'
                    self.record_log(kw, time1, msgInt, url_type, startTime, ip, proxyInfo)
                else:
                    keyword_datas = result[0]
                    if not keyword_datas:
                        self.add_url_to_queue(url_dict, url_type=url_type, retry_type=True)
                    else:
                        self.save_success_asin_keyword(kw, url_type=url_type)
                        msgInt = 1
                        proxyInfo = 'get data success'
                        self.record_log(kw, time1, msgInt, url_type, startTime, ip, proxyInfo)
                        data_bytes = pickle.dumps(keyword_datas)
                        self.dataQ.add_keyword_to_queue(data_bytes)
                        self.dataQ.record_data_ok_times()
                        self.dataQ.record_keyword_ok_times()
            else:
                self.the_url_is_discard(kw, url_dict, url_type, url_md5key)
        else:
            self.the_url_is_discard(kw, url_dict, url_type, url_md5key)
            time.sleep(1)
    else:
        print(url_type, 'no url')
        self.add_url_to_set(url_dict, url_type, retry_type=True)
        time.sleep(1)
def kw_parser(self, html_code_list, keyword, cid, not_match=False):
    result_dict = {}
    print('not_match', not_match)
    price_list = []
    rrg_list = []
    rc_list = []
    keyword_data_list = []
    search_num = self._get_search_num(html_code_list[0])
    print('html_code_list len: ', len(html_code_list))
    j = 0
    for html_code in html_code_list:
        j += 1
        self.html_code = html_code
        resultId = KwParser.get_results_tag(html_code)
        # Keep an HTML sample, for testing:
        # print('resultId len: ', len(resultId))
        # file = '%s%s.html' % (keyword, j)
        # with open(file, 'w', encoding='utf-8') as f:
        #     f.write(html_code)
        i = -1
        for result in resultId:
            i += 1
            html = str(tostring(result), encoding='utf-8')
            result = etree.HTML(html)
            asin = self._get_asin(result, i)
            title = self._get_title(result, i)
            brand = self._get_brand(result, i)
            price = self._get_price(result, i)
            rrg = self._get_rrg(result, i)
            rc = self._get_rc(result, i)
            img = self._get_img(result, i)
            issp = self._get_issp(result, i)
            prime = self._get_prime(result, i)
            keyword_data_dict = dict(
                kw=keyword,  # keyword
                cid=cid,  # category id (0 means all categories)
                asin=asin,  # product
                title=title,  # product title
                img=img,  # product image
                brand=brand,  # brand
                msn=0,  # monthly search volume
                issp=issp,  # whether sponsored
                srn=search_num,  # number of search results
                price=price,  # product price
                rrg=rrg,  # product rating
                rc=rc,  # product review count
                special=1,  # produced by the load-balancing crawler
                tm=int(BaseParser.get_redis_time()),  # fetch time (ms)
                aday=return_PST().strftime("%Y%m%d"),  # Pacific date the data was fetched
                is_prime=prime,  # whether it carries the Prime badge
                fba=prime,
                category=None,  # category
                bsr=None,  # bsr, handled in 1.3
            )
            if asin:
                keyword_data_list.append(keyword_data_dict)
        print(j, 'data length: ', len(keyword_data_list))
    i = 0
    for item in keyword_data_list:
        i += 1
        item['pr'] = i
    print(keyword, 'keyword_data_list len: ', len(keyword_data_list))
    if len(keyword_data_list) > 50:
        keyword_data_list = keyword_data_list[0:50]
    for item in keyword_data_list:
        price = item.get('price')
        rrg = item.get('rrg')
        rc = item.get('rc')
        # Price
        if type(price) is int and price > 0:
            price_list.append(price)
        # Review count
        if type(rc) is int and rc >= 0:
            rc_list.append(rc)
        # Rating
        if type(rrg) is int and rrg > 0:
            rrg_list.append(rrg)
    print('price_list: ', len(price_list), price_list)
    print('rrg: ', len(rrg_list), rrg_list)
    print('rc: ', len(rc_list), rc_list)
    price_max = self._get_price_max(price_list)
    price_min = self._get_price_min(price_list)
    price_ave = self._get_price_ave(price_list)
    # print('rrg_list: ', rrg_list)
    rrg_max = self._get_rrg_max(rrg_list)
    rrg_min = self._get_rrg_min(rrg_list)
    rrg_ave = self._get_rrg_ave(rrg_list)
    # print('rc_list: ', rc_list)
    rc_max = self._get_rc_max(rc_list)
    rc_min = self._get_rc_min(rc_list)
    rc_ave = self._get_rc_ave(rc_list)
    date = self._get_date()
    # print(date, type(date))
    if search_num < 50 and search_num < len(keyword_data_list):
        search_num = len(keyword_data_list)
        for data in keyword_data_list:
            data['srn'] = search_num
    kwData_dict = dict(
        kw=keyword,  # keyword
        cid=cid,  # category id (0 means all categories)
        mon_search=0,  # monthly search volume (deprecated)
        search_num=search_num,  # number of search results
        price_max=price_max,  # highest price
        price_min=price_min,  # lowest price
        price_ave=price_ave,  # average price
        rrg_max=rrg_max,  # highest rating
        rrg_min=rrg_min,  # lowest rating
        rrg_ave=rrg_ave,  # average rating
        rc_max=rc_max,  # highest review count
        rc_min=rc_min,  # lowest review count
        rc_ave=rc_ave,  # average review count
        date=date,  # crawl date (Ymd)
        mon_search_state=0,  # monthly-search crawl state
        other_state=1,  # other-data crawl state
        getinfo_tm=int(BaseParser.get_redis_time()),  # fetch time (ms)
    )
    print(keyword, 'keyword_data_list len1: ', len(keyword_data_list))
    if search_num > 100:
        if len(keyword_data_list) == 50:
            result_dict[keyword] = (kwData_dict, keyword_data_list)
        elif not_match and len(keyword_data_list) <= 50:
            result_dict[keyword] = (kwData_dict, keyword_data_list)
    else:
        if len(keyword_data_list) <= 50:
            result_dict[keyword] = (kwData_dict, keyword_data_list)
    if search_num == 0 and price_max == 0 and price_ave == 0 and rrg_max == 0 and rc_max == 0:
        result_dict = {}
    from pprint import pprint
    pprint(result_dict)
    return result_dict