def bsrData_save(dataQ, debug_log, db_log):
    """Drain the ``bsrData`` queue and store every BSR tuple in the database.

    Each batch returned by ``dataQ.get_new_bsrData()`` is iterated as a
    mapping of asin -> list of tuples. The first three elements of every
    non-empty tuple are stored as ``bsr``, ``bsrc`` and ``aday``; ``tm`` is
    taken from ``DataOutput.get_redis_time()`` once per asin.

    Args:
        dataQ: queue wrapper exposing ``RedisQ.llen`` and ``get_new_bsrData``.
        debug_log: logger handed through to ``DataOutput``.
        db_log: logger whose ``war`` method receives the status messages.

    The cursor and the connection are closed even when a save raises; the
    exception itself still propagates to the caller.
    """
    from contextlib import closing

    print('\nbsrData_save init\n')
    data_type = 'bsr'
    if dataQ.RedisQ.llen('bsrData') <= 0:
        db_log.war('%s, %s数据队列为空\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
        return

    db_name = SqlConfig.bsrData_db_name
    update_sql = SqlConfig.bsrData_update_sql
    insert_sql = SqlConfig.bsrData_insert_sql

    # closing() guarantees cur.close() then dbObj.close(), in that order,
    # on both the normal and the error path.
    with closing(GetDbObj().get_db_obj()) as dbObj, closing(dbObj.cursor()) as cur:
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        while True:
            datas = dataQ.get_new_bsrData()
            if not datas:
                if dataQ.RedisQ.llen('bsrData') <= 0:
                    break
                # The queue still reports entries, so try once more.
                datas = dataQ.get_new_bsrData()
                if not datas:
                    # Still nothing usable: re-check the queue instead of
                    # iterating a falsy (possibly None) batch.
                    continue

            for asin in datas:
                tuple_list = datas[asin]
                tm = int(DataOutput.get_redis_time())
                for record in tuple_list:
                    # Skip empty placeholders and anything that is not a tuple.
                    if not record or not isinstance(record, tuple):
                        continue
                    data_dict = dict(asin=asin, bsr=record[0], bsrc=record[1],
                                     tm=tm, aday=record[2])
                    dataOutput.save_data_to_db(update_sql, insert_sql, asin,
                                               data_dict, db_name=db_name)

    db_log.war('%s, %s线程任务已完成\n' %
               (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
def _goods_backfill_from_previous(data, previous_row):
    """Patch gaps in a freshly crawled goods record from an earlier stored row.

    ``previous_row`` is one ``(rc, quantity, price, title, bsr)`` row as
    selected from ``public.amazon_product_data``. ``data`` is modified in
    place.
    """
    the_old_rc, the_old_qty, the_old_price, the_old_title, the_old_bsr = previous_row
    print(the_old_rc, the_old_qty, the_old_price, the_old_title, the_old_bsr)
    the_new_qty = data['quantity']
    print('price1', data['price'], the_old_price)

    # No price this time: reuse the stored price.
    if data['price'] <= 0 and the_old_price > 0 and data['asin_state'] == 3:
        data['price'] = the_old_price
    # No title this time: reuse the stored title.
    if not data['title'] and the_old_title:
        data['title'] = the_old_title

    print('the_old_rc', the_old_rc, type(the_old_rc))
    print('old quantity', the_old_qty, type(the_old_qty))
    print('new quantity', the_new_qty, type(the_new_qty))
    print("data['rc']", data['rc'], type(data['rc']))

    # The review count is never allowed to drop below the stored value.
    if data['rc'] < the_old_rc:
        data['rc'] = the_old_rc

    # Stock crawl failed (-1): fall back to the stored stock, tag it with
    # qtydt=4 and append a line to quantity_fail.csv for later inspection.
    if the_new_qty == -1 and the_old_qty >= 0 and data['asin_state'] == 3:
        data['quantity'] = the_old_qty
        data['qtydt'] = 4
        with open('quantity_fail.csv', 'a', encoding='utf-8') as f:
            f.write('asin, %s, old qty, %s, new qty, %s\n' %
                    (data['asin'], the_old_qty, the_new_qty))


def _goods_apply_fallbacks(data):
    """Apply the per-record defaults that do not depend on stored history."""
    if data['asin_state'] == 2:
        data['quantity'] = 0
        data['byb'] = 0
        data['qtydt'] = 5  # not for sale (per the original comment)
    # Without dpre, use price.
    if data['dpre'] <= 0 and data['price'] > 0:
        data['dpre'] = data['price']
    # Without cart_price, use price.
    if data['cart_price'] <= 0 and data['price'] > 0:
        data['cart_price'] = data['price']


def goods_data_save(dataQ, debug_log, db_log):
    """Drain the ``goodsData`` queue and persist every product record.

    For each record the function stamps ``getinfo_tm`` with
    ``DataOutput.get_redis_time()``, looks for a stored row of the same asin
    newer than three days, back-fills price/title/review count/stock from
    the first row returned (the query has no ORDER BY), applies the
    state/price fallbacks, then hands the record to ``druidData_to_db`` and
    to ``dataOutput.save_data_to_db``.

    Records whose stock is still negative with ``asin_state == 3`` are not
    written while the PST hour is below 9.

    Args:
        dataQ: queue wrapper exposing ``RedisQ.llen`` and ``get_new_goods_data``.
        debug_log: logger handed through to ``DataOutput``.
        db_log: logger whose ``war`` method receives the status messages.

    The cursor and the connection are closed even when processing raises;
    the exception itself still propagates.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source. In particular, ``crawler_tm`` is assumed to run for every record
    (including deferred ones) and the state/price fallbacks are assumed to
    run whether or not a stored row was found -- confirm.
    """
    from contextlib import closing
    from pprint import pprint

    print('\ngoods_save init\n')
    data_type = 'goods'
    if dataQ.RedisQ.llen('goodsData') <= 0:
        db_log.war('%s, %s数据队列为空\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
        return

    db_name = SqlConfig.goods_db_name
    update_sql = SqlConfig.goods_update_sql
    insert_sql = SqlConfig.goods_insert_sql
    previous_sql = "select rc, quantity, price, title, bsr from public.amazon_product_data where asin=%(asin)s and getinfo_tm>%(the_tm)s ;"
    # Three days in milliseconds; subtracting directly keeps the bound exact
    # instead of going through float division and multiplication.
    lookback_ms = 3 * 24 * 3600 * 1000
    # Fields removed before the record goes to the update table.
    update_excluded = ('dpre', 'bs1', 'qc', 'qtydt', 'aday')

    with closing(GetDbObj().get_db_obj()) as dbObj, closing(dbObj.cursor()) as cur:
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        while True:
            datas = dataQ.get_new_goods_data()
            if not datas:
                if dataQ.RedisQ.llen('goodsData') <= 0:
                    break
                # The queue still reports entries, so try once more.
                datas = dataQ.get_new_goods_data()
                if not datas:
                    # Still nothing usable: re-check the queue instead of
                    # calling .items() on a falsy (possibly None) batch.
                    continue

            the_hour = return_PST().hour
            tm = DataOutput.get_redis_time()
            for asin, data in datas.items():
                pprint(data)
                print(data['getinfo_tm'], 1)
                data['getinfo_tm'] = tm
                print(data['getinfo_tm'], 2)
                print('rc1: ', data['rc'])
                print('quantity1', data['quantity'])

                cur.execute(previous_sql,
                            {'asin': data['asin'], 'the_tm': tm - lookback_ms})
                select_rows = cur.fetchall()
                if len(select_rows) > 0:
                    print(select_rows, type(select_rows),
                          type(select_rows[0]), type(select_rows[0][0]))
                    _goods_backfill_from_previous(data, select_rows[0])

                _goods_apply_fallbacks(data)
                print('price2', data['price'])
                print('quantity2', data['quantity'])
                print('rc2: ', data['rc'])

                # If the stock download failed, hold the record back for now.
                hold_back = (the_hour < 9 and data['quantity'] < 0
                             and data['asin_state'] == 3)
                if not hold_back:
                    # History store first, while the record is still complete.
                    druidData_to_db(asin, data, dataOutput)
                    # Then strip what the update table does not need.
                    for field in update_excluded:
                        data.pop(field)
                    dataOutput.save_data_to_db(update_sql, insert_sql, asin,
                                               data, db_name=db_name)
                # Record the update time.
                dataOutput.crawler_tm(asin, data_type)

    db_log.war('%s, %s线程任务已完成\n' %
               (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
def tosell_save(dataQ, debug_log, db_log):
    """Drain the ``tosellData`` queue and persist seller/offer data (debug variant).

    Each batch is iterated as a mapping of asin -> sequence whose element 0
    is a summary dict (keys read here: ``getinfo_tm``, ``sname``,
    ``seller_id``) and whose element 1 is a list of offer dicts (keys read
    or written here: ``aday``, ``tm``).

    The summary and the offers are written only when
    ``public.amazon_product_tosell`` has no row yet for the asin and day.

    NOTE(review): a second ``tosell_save`` is defined later in this file and
    replaces this one when the module is imported, so this version is not the
    one callers get. Consider removing one of the two.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; both saves are assumed to sit inside the "no row yet" branch and
    ``crawler_tm`` is assumed to run for every asin -- confirm.

    Args:
        dataQ: queue wrapper exposing ``RedisQ.llen`` and ``get_new_tosellData``.
        debug_log: logger handed through to ``DataOutput``.
        db_log: logger whose ``war`` method receives the status messages.

    The cursor and the connection are closed even when processing raises;
    the exception itself still propagates.
    """
    from contextlib import closing
    from pprint import pprint  # used below; no import is visible at this point of the file

    print('\ntosell_save init\n')
    data_type = 'tosell'
    if dataQ.RedisQ.llen('tosellData') <= 0:
        db_log.war('%s, %s数据队列为空\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
        return

    data_tosell_db_name = SqlConfig.data_tosell_db_name
    data_tosell_update_sql = SqlConfig.data_tosell_update_sql
    data_tosell_insert_sql = SqlConfig.data_tosell_insert_sql
    druid_tosell_db_name = SqlConfig.druid_tosell_db_name
    # No update statement is passed for the druid table (insert only).
    druid_tosell_update_sql = None
    druid_tosell_insert_sql = SqlConfig.druid_tosell_insert_sql

    exists_sql = "select asin, aday from public.amazon_product_tosell where asin=%(asin)s and aday=%(aday)s limit 1;"
    # Bound parameters instead of %-formatting the asin into the statement.
    seller_sql = "select sname, seller_id from public.amazon_product_data where asin=%(asin)s and getinfo_tm > %(the_tm)s"
    one_day_ms = 24 * 3600 * 1000

    with closing(GetDbObj().get_db_obj()) as dbObj, closing(dbObj.cursor()) as cur:
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        while True:
            datas = dataQ.get_new_tosellData()
            pprint(datas)
            if not datas:
                if dataQ.RedisQ.llen('tosellData') <= 0:
                    break
                # The queue still reports entries, so try once more.
                datas = dataQ.get_new_tosellData()
                if not datas:
                    # Still nothing usable: re-check the queue instead of
                    # iterating a falsy (possibly None) batch.
                    continue

            tm = DataOutput.get_redis_time()
            for asin in datas:
                entry = datas[asin]
                tosell_datas = entry[0]
                tosell_list = entry[1]
                pprint(tosell_datas)
                pprint(tosell_list)
                print(tosell_datas['getinfo_tm'], 1)
                tosell_datas['getinfo_tm'] = tm
                print(tosell_datas['getinfo_tm'], 2)

                # Day key: taken from the first offer, else today's PST date.
                if len(tosell_list) > 0:
                    aday = tosell_list[0]['aday']
                else:
                    aday = return_PST().strftime('%Y%m%d')
                cur.execute(exists_sql, {'asin': asin, 'aday': aday})
                existing_rows = cur.fetchall()
                dbObj.commit()

                if len(existing_rows) < 1:
                    if not tosell_datas.get('sname'):
                        print(222222)
                        # Seller name missing: look it up in the product
                        # table, restricted to rows from the last 24 hours.
                        cur.execute(seller_sql,
                                    {'asin': asin, 'the_tm': tm - one_day_ms})
                        seller_rows = cur.fetchall()
                        dbObj.commit()
                        # Only an unambiguous single match is used.
                        if len(seller_rows) == 1:
                            sname, seller_id = seller_rows[0]
                        else:
                            sname, seller_id = '', ''
                        print('seller_id: ', seller_id)
                        print('sname ', sname)
                        tosell_datas['sname'] = sname
                        tosell_datas['seller_id'] = seller_id

                    dataOutput.save_data_to_db(
                        data_tosell_update_sql, data_tosell_insert_sql,
                        asin, tosell_datas, db_name=data_tosell_db_name)
                    for offer in tosell_list:
                        offer['tm'] = int(tm / 1000)
                        dataOutput.save_data_to_db(
                            druid_tosell_update_sql, druid_tosell_insert_sql,
                            asin, offer, db_name=druid_tosell_db_name)
                # Record the update time.
                dataOutput.crawler_tm(asin, data_type)

    db_log.war('%s, %s线程任务已完成\n' %
               (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
def tosell_save(dataQ, debug_log, db_log):
    """Drain the ``tosellData`` queue and persist seller/offer data.

    Each batch is iterated as a mapping of asin -> sequence whose element 0
    is a summary dict (keys read here: ``getinfo_tm``, ``sname``,
    ``seller_id``) and whose element 1 is a list of offer dicts (keys read
    or written here: ``aday``, ``tm``).

    The summary and the offers are written only when
    ``public.amazon_product_tosell`` has no row yet for the asin and day.
    If the summary has no ``sname``, the seller name and id are looked up in
    ``public.amazon_product_data`` (rows from the last 24 hours) first.

    NOTE(review): an earlier function with the same name exists in this
    file; this later definition is the one in effect after import.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; both saves are assumed to sit inside the "no row yet" branch and
    ``crawler_tm`` is assumed to run for every asin -- confirm.

    Args:
        dataQ: queue wrapper exposing ``RedisQ.llen`` and ``get_new_tosellData``.
        debug_log: logger handed through to ``DataOutput``.
        db_log: logger whose ``war`` method receives the status messages.

    The cursor and the connection are closed even when processing raises;
    the exception itself still propagates.
    """
    from contextlib import closing

    print('\ntosell_save init\n')
    data_type = 'tosell'
    if dataQ.RedisQ.llen('tosellData') <= 0:
        db_log.war('%s, %s数据队列为空\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
        return

    data_tosell_db_name = SqlConfig.data_tosell_db_name
    data_tosell_update_sql = SqlConfig.data_tosell_update_sql
    data_tosell_insert_sql = SqlConfig.data_tosell_insert_sql
    druid_tosell_db_name = SqlConfig.druid_tosell_db_name
    # No update statement is passed for the druid table (insert only).
    druid_tosell_update_sql = None
    druid_tosell_insert_sql = SqlConfig.druid_tosell_insert_sql

    exists_sql = "select asin, aday from public.amazon_product_tosell where asin=%(asin)s and aday=%(aday)s limit 1;"
    # Bound parameters instead of %-formatting the asin into the statement.
    seller_sql = "select sname, seller_id from public.amazon_product_data where asin=%(asin)s and getinfo_tm > %(the_tm)s"
    one_day_ms = 24 * 3600 * 1000

    with closing(GetDbObj().get_db_obj()) as dbObj, closing(dbObj.cursor()) as cur:
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        while True:
            datas = dataQ.get_new_tosellData()
            if not datas:
                if dataQ.RedisQ.llen('tosellData') <= 0:
                    break
                # The queue still reports entries, so try once more.
                datas = dataQ.get_new_tosellData()
                if not datas:
                    # Still nothing usable: re-check the queue instead of
                    # iterating a falsy (possibly None) batch.
                    continue

            tm = DataOutput.get_redis_time()
            for asin in datas:
                entry = datas[asin]
                tosell_datas = entry[0]
                tosell_list = entry[1]
                print(tosell_datas['getinfo_tm'], 1)
                tosell_datas['getinfo_tm'] = tm
                print(tosell_datas['getinfo_tm'], 2)

                # Day key: taken from the first offer, else today's PST date.
                if len(tosell_list) > 0:
                    aday = tosell_list[0]['aday']
                else:
                    aday = return_PST().strftime('%Y%m%d')
                cur.execute(exists_sql, {'asin': asin, 'aday': aday})
                existing_rows = cur.fetchall()
                dbObj.commit()

                if len(existing_rows) < 1:
                    print(tosell_datas)
                    if not tosell_datas.get('sname'):
                        cur.execute(seller_sql,
                                    {'asin': asin, 'the_tm': tm - one_day_ms})
                        seller_rows = cur.fetchall()
                        dbObj.commit()
                        # Only an unambiguous single match is used.
                        if len(seller_rows) == 1:
                            sname, seller_id = seller_rows[0]
                        else:
                            sname, seller_id = '', ''
                        print('seller_id: ', seller_id)
                        print('sname ', sname)
                        tosell_datas['sname'] = sname
                        tosell_datas['seller_id'] = seller_id

                    dataOutput.save_data_to_db(
                        data_tosell_update_sql, data_tosell_insert_sql,
                        asin, tosell_datas, db_name=data_tosell_db_name)
                    for offer in tosell_list:
                        offer['tm'] = int(tm / 1000)
                        dataOutput.save_data_to_db(
                            druid_tosell_update_sql, druid_tosell_insert_sql,
                            asin, offer, db_name=druid_tosell_db_name)
                # Record the update time.
                dataOutput.crawler_tm(asin, data_type)

    db_log.war('%s, %s线程任务已完成\n' %
               (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
def reviews_save(dataQ, debug_log, db_log):
    """Drain the ``reviewsData`` queue and persist recent reviews.

    Each batch is iterated as a mapping of asin -> list of review dicts; the
    ``date`` of every review is compared against integer ``YYYYMMDD``
    cut-offs computed once from ``return_PST()`` when the function starts:

    * first download of an asin (``dataQ.is_first_download``): reviews dated
      within the last 90 days are saved;
    * otherwise: only reviews dated yesterday or later are saved.

    Args:
        dataQ: queue wrapper exposing ``RedisQ.llen``, ``get_new_reviewsData``
            and ``is_first_download``.
        debug_log: logger handed through to ``DataOutput``.
        db_log: logger whose ``war`` method receives the status messages.

    The cursor and the connection are closed even when processing raises;
    the exception itself still propagates.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; ``crawler_tm`` is assumed to run once per asin -- confirm.
    """
    from contextlib import closing

    print('\nreviews_save init\n')
    data_type = 'reviews'
    if dataQ.RedisQ.llen('reviewsData') <= 0:
        db_log.war('%s, %s数据队列为空\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
        return

    # Attribute names on SqlConfig are spelled "reivews"; kept as they are.
    reviews_db_name = SqlConfig.reivews_db_name
    reviews_update_sql = SqlConfig.reivews_update_sql
    reviews_insert_sql = SqlConfig.reivews_insert_sql

    now_pst = return_PST()
    three_mon_date = int((now_pst - timedelta(days=90)).strftime('%Y%m%d'))
    theYesterDete = int((now_pst - timedelta(days=1)).strftime('%Y%m%d'))

    with closing(GetDbObj().get_db_obj()) as dbObj, closing(dbObj.cursor()) as cur:
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        while True:
            datas = dataQ.get_new_reviewsData()
            if not datas:
                if dataQ.RedisQ.llen('reviewsData') <= 0:
                    break
                # The queue still reports entries, so try once more.
                datas = dataQ.get_new_reviewsData()
                if not datas:
                    # Still nothing usable: re-check the queue instead of
                    # iterating a falsy (possibly None) batch.
                    continue

            for asin in datas:
                dict_list = datas[asin]
                md5key = DataOutput.get_md5_key(asin + 'reviewsFirst')
                first = dataQ.is_first_download(md5key)

                for position, review in enumerate(dict_list, start=1):
                    if first:
                        # First download: keep the last three months.
                        if review['date'] >= three_mon_date:
                            # The md5 key accompanies only the leading items
                            # to avoid pointless repeated writes.
                            # NOTE(review): the original comment said "three
                            # times", but this condition (unchanged) covers
                            # positions 1 and 2 only -- confirm the intent.
                            theMd5key = md5key if position < 3 else None
                            dataOutput.save_data_to_db(
                                reviews_update_sql, reviews_insert_sql,
                                asin, review, db_name=reviews_db_name,
                                md5key=theMd5key)
                    elif review['date'] >= theYesterDete:
                        # Later downloads: only the newest reviews.
                        dataOutput.save_data_to_db(
                            reviews_update_sql, reviews_insert_sql,
                            asin, review, db_name=reviews_db_name)
                # Record the update time.
                dataOutput.crawler_tm(asin, data_type)

    db_log.war('%s, %s线程任务已完成\n' %
               (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))