def crawl_sku_comment_count(sku_list):
    clist = jd_API.getCommentCount_JD(sku_list)
    if len(clist) == 0:
        return {'status': -1, 'msg': 'jd api returned no result for sku_list'}
    if len(clist) != len(set(sku_list)):
        return {
            'status': -1,
            'msg': 'jd api return size mismatch, size of sku:%s, size of api:%s'
                   % (len(set(sku_list)), len(clist))
        }
    vlist = []
    dt = timeHelper.getNow()
    for cdict in clist:
        tp = []
        cdict['dt'] = dt
        # Rows are built by plain dict iteration, so the column order is
        # whatever the dict yields; it must line up with the
        # jd_item_comment_count schema.
        for key in cdict:
            tp.append(cdict[key])
        vlist.append(tp)
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_comment_count',
        num_cols=len(clist[0]),
        value_list=vlist,
        is_many=True)
def configLogging(log_name, log_level=data_config.LOGGING_LEVEL):
    timenow = timeHelper.getNow()
    # filename = '/tmp/%s_worthy_%s.log' % (log_name, timenow)
    filename = '/tmp/%s_worthy.log' % log_name
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
        datefmt='%a, %d %b %Y %H:%M:%S',
        filename=filename,
        filemode='a')
    logging.info('START logging : %s' % log_name)
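# Illustrative usage (hedged, not from the original source): a worker script
# would call configLogging() once at startup so that everything it logs lands
# in /tmp/<log_name>_worthy.log. The job name below is an example value.
def __example_worker_init__():
    configLogging('jd_crawler')
    logging.info('worker started')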
def __get_task_already_done__(self):
    if self.is_daily:
        sql = 'select task_id from task_status where job_name="%s" and update_time>="%s 0:00:00" group by task_id' % (
            self.job_name, timeHelper.getNow())
    else:
        stime = timeHelper.getTimeAheadOfNowHours(
            self.interval_hours, format='%Y-%m-%d %H:%M:%S')
        sql = 'select task_id from task_status where job_name="%s" and update_time>="%s" group by task_id' % (
            self.job_name, stime)
    print sql
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)
    catlist = []
    for row in retrows:
        catlist.append("%s" % row['task_id'])
    logging.info("Task already done: %s" % len(catlist))
    print("Task already done: %s" % len(catlist))
    return catlist
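# Illustrative sketch (hedged): a companion method on the same class could use
# the done-list to filter pending work. Note that __get_task_already_done__()
# returns task ids as strings, so candidates are stringified before the
# lookup; 'all_task_ids' is a hypothetical input.
def __example_get_pending_tasks__(self, all_task_ids):
    done = set(self.__get_task_already_done__())
    return [tid for tid in all_task_ids if '%s' % tid not in done]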
def crawl_sku_price(sku_list, sleep_time):
    # no more than 5000 items here per design
    rdict = jd_API.getPrices_JD(sku_list, sleep_time=sleep_time)
    # Guard against an empty API result; vlist[0] below would raise otherwise.
    if len(rdict) == 0:
        return {'status': -1, 'msg': 'jd api returned no price for sku_list'}
    vlist = []
    dt = timeHelper.getNow()
    dtlong = timeHelper.getNowLong()
    for key in rdict:
        price, price_m, price_pcp = rdict[key]
        vlist.append([key, dt, dtlong, price, price_m, price_pcp])
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_price',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=True,
        need_flow=True,
    )
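# Illustrative sketch (hedged): per the comment in crawl_sku_price, the price
# API is designed for at most 5000 items per call, so a caller with a larger
# SKU list would chunk it. Batch size and sleep_time are example values.
def __example_crawl_prices_chunked__(sku_list, sleep_time=1):
    results = []
    for i in xrange(0, len(sku_list), 5000):
        results.append(crawl_sku_price(sku_list[i:i + 5000], sleep_time))
    return results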
def crawl_category_promo(category_id):
    rdict = jd_API.get_Promo_Category(category_id)
    dt = timeHelper.getNow()
    if len(rdict) == 0:
        return {
            'status': 0,
            'msg': 'empty in return, category_id=%s' % category_id
        }
    quan = json.dumps(rdict['quan'])
    ads = json.dumps(rdict['ads'])
    prom = json.dumps(rdict['prom'])
    vlist = [[
        category_id, dt,
        quan if quan != '[]' else None,
        ads if ads != '[]' else None,
        prom if prom != '[]' else None
    ]]
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_promo_category',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True)
def crawl_detail_images(sku_id):
    html = __get_detail_page_content__(sku_id)
    img_list = jd_detail_resolver.resolve_Images(html)
    # logging.debug(img_list)
    if len(img_list) == 0:
        return {'status': -1}
    vlist = []
    update_time = timeHelper.getNow()
    for img in img_list:
        tp = (sku_id, update_time, img)
        vlist.append(tp)
    # sql = 'replace into jd_item_images values(%s,%s,%s)'
    # affected_rows = dbhelper.executeSqlWriteMany(sql, vlist)
    sql2 = 'replace into jd_item_images_latest values(%s,%s,%s)'
    affected_rows2 = dbhelper.executeSqlWriteMany(sql2, vlist)
    ret = {
        'status': -1,
        # 'affected_rows': affected_rows,
        'affected_rows2': affected_rows2
    }
    if affected_rows2 > 0:
        ret['status'] = 0
    return ret
def crawl_detail_property(sku_id):
    html = __get_detail_page_content__(sku_id)
    prop_map = jd_detail_resolver.resolve_Properties(html)
    update_time = timeHelper.getNow()
    vlist = []
    if len(prop_map) == 0:
        return {'status': 0}
    for p_key in prop_map:
        p_value = prop_map[p_key]
        tp = (sku_id, update_time, p_key, p_value)
        vlist.append(tp)
    # sql = 'replace into jd_item_property values(%s,%s,%s,%s)'
    # affected_rows = dbhelper.executeSqlWriteMany(sql, vlist)
    sql2 = 'replace into jd_item_property_latest values(%s,%s,%s,%s)'
    affected_rows2 = dbhelper.executeSqlWriteMany(sql2, vlist)
    ret = {
        'status': -1,
        # 'affected_rows': affected_rows,
        'affected_rows2': affected_rows2
    }
    if affected_rows2 > 0:
        ret['status'] = 0
    return ret
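# Illustrative sketch (hedged): a detail-page driver would typically crawl
# images and properties for each SKU in turn; 'sku_ids' is a hypothetical
# input. Note that each helper fetches the detail page independently, so this
# costs two HTTP round-trips per SKU.
def __example_crawl_details__(sku_ids):
    for sku_id in sku_ids:
        ret_img = crawl_detail_images(sku_id)
        ret_prop = crawl_detail_property(sku_id)
        if ret_img['status'] != 0 or ret_prop['status'] != 0:
            logging.error('detail crawl failed, sku_id = %s' % sku_id)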
def resolveProductListFromPage(html):
    product_list = []
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    try:
        doc = libxml2.htmlReadDoc(html, None, 'utf8', PARSE_OPTIONS)
        sku_docs = doc.xpathEval('//div[@data-sku]')
        for sku in sku_docs:
            sku_doc = None
            try:
                sku_doc = libxml2.htmlReadDoc('%s' % sku, None, 'utf8',
                                              PARSE_OPTIONS)
                sku_id = int(sku_doc.xpathEval('//@data-sku')[0].content)
                # Check whether this is a JD self-operated item
                if sku_id > 99999999:
                    # Not self-operated -- skip it
                    continue
                sku_url = sku_doc.xpathEval(
                    '//div[@class="p-img"]/a/@href')[0].content
                try:
                    sku_thumnail_url = sku_doc.xpathEval(
                        '//div[@class="p-img"]/a/img/@data-lazy-img')[0].content
                except:
                    sku_thumnail_url = sku_doc.xpathEval(
                        '//div[@class="p-img"]/a/img/@src')[0].content
                sku_title = ""
                try:
                    sku_title = sku_doc.xpathEval(
                        '//div[@class="p-name"]/a/@title')[0].content
                except:
                    pass
                if len(sku_title) == 0:
                    sku_title = sku_doc.xpathEval(
                        '//div[@class="p-name"]/a/em')[0].content
                comment_count = int(
                    sku_doc.xpathEval(
                        '//div[@class="p-commit"]/strong/a')[0].content)
                sku_icon_url = ""
                icon_doc = sku_doc.xpathEval('//div[@class="p-img"]/a/div/@style')
                if len(icon_doc) > 0:
                    sku_icon_url = url_utils.getStringBetween(
                        icon_doc[0].content, 'url("', '")')
                is_global = is_free_gift = is_pay_on_delivery = 0
                price_items = sku_doc.xpathEval('//div[@class="p-price"]/div/i')
                for pitem in price_items:
                    txt = pitem.content
                    if '全球购' in txt:      # "global purchase" badge
                        is_global = 1
                    elif '货到付款' in txt:  # "pay on delivery" badge
                        is_pay_on_delivery = 1
                    elif '赠品' in txt:      # "free gift" badge
                        is_free_gift = 1
                    else:
                        print 'new-mark found:'
                        print txt
                sku_stock = -1
                try:
                    sku_stock = int(
                        sku_doc.xpathEval(
                            '//div[@data-stock_v]/@data-stock_v')[0].content)
                except:
                    pass
                sku_url = __makeUrl__(sku_url)
                sku_thumnail_url = __makeUrl__(sku_thumnail_url)
                tp = (sku_id, nowdate, nowtime, sku_title, sku_url,
                      sku_thumnail_url, sku_stock, comment_count, is_global,
                      is_pay_on_delivery, is_free_gift, sku_icon_url)
                product_list.append(tp)
            except Exception as e:
                logging.error('resolveProductListError: %s, error = %s'
                              % (sku, e))
                continue
            finally:
                # sku_doc stays None when htmlReadDoc itself fails
                if sku_doc is not None:
                    sku_doc.freeDoc()
        return product_list
    finally:
        doc.freeDoc()
def crawl_category(category_id):
    logging.debug('category_id = %s -- page 1' % category_id)
    url = __get_category_page_url__(category_id, 1)
    # print url
    html = url_utils.getWebResponse(url, 'utf-8')
    if html == "":
        html = url_utils.getWebResponse(url, 'gb18030')
    if html == "":
        html = url_utils.getWebResponse(url, 'gbk')
    total_pages = jd_list_resolver.resolveTotalPageNum(html)
    product_list = jd_list_resolver.resolveProductListFromPage(html)
    # If the first page yields nothing, roll up to the parent category and
    # retry; stop once there is no parent left.
    if len(product_list) == 0:
        category_id = __up_roll_category_id__(category_id)
        if category_id is None:
            return {'status': -1, 'msg': 'No item in category product list'}
        return crawl_category(category_id)
    for page_iter in range(2, total_pages + 1):
        logging.debug('category_id = %s -- page %s' % (category_id, page_iter))
        url = __get_category_page_url__(category_id, page_iter)
        html = url_utils.getWebResponse(url, 'utf-8')
        product_list = product_list + jd_list_resolver.resolveProductListFromPage(html)
        time.sleep(SLEEP_TIME)
    sku_list = []
    for product_tp in product_list:
        sku_id = product_tp[0]
        sku_list.append(sku_id)
    # Get price of all products
    # price_obj = jd_API.getPrices_JD(sku_list, sleep_time=SLEEP_PRICE_API)
    total_goods_num = len(product_list)
    # Combine the product list with price placeholders; timestamps are already
    # in each tuple from the resolver.
    for i in xrange(total_goods_num):
        product_id = product_list[i][0]
        pkey = '%s' % product_id
        # if pkey in price_obj:
        #     product_list[i] = product_list[i] + (price_obj[pkey][0], price_obj[pkey][1], price_obj[pkey][2],)
        # else:
        #     logging.error('Error: product_id=%s cannot get result' % product_id)
        #     continue
        product_list[i] = product_list[i] + (0, 0, 0,)
    # Persist in database. Row layout (from resolveProductListFromPage, plus
    # the three price placeholders): sku_id, update_date, update_time, title,
    # url, thumbnail_url, stock_status, comment_count, is_global,
    # is_pay_on_delivery, has_free_gift, icon_url, price, price_m, price_pcp
    # sql = '''
    # replace into jd_item_dynamic (sku_id,title,url,thumbnail_url,stock_status,comment_count,is_global,is_pay_on_delivery,
    # has_free_gift,icon_url,price,price_m,price_pcp,update_date,update_time) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    # '''
    # affected_rows = dbhelper.executeSqlWriteMany(sql, product_list)
    ret = crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_dynamic',
        num_cols=len(product_list[0]),
        value_list=product_list,
        is_many=True,
        need_history=False,  # was True - changed 01/03
        need_flow=False,     # was True - changed 12/23
    )
    logging.debug('Saved to DB -- category_id = %s -- sku_count=%s'
                  % (category_id, total_goods_num))
    logging.debug('%s' % ret)
    # HANDLE JD_ITEM_CATEGORY
    item_cat_list = []
    for prod in product_list:
        item_cat_list.append((prod[0], category_id,))
    sql2 = 'replace into jd_item_category values (%s,%s)'
    affected_rows2 = dbhelper.executeSqlWriteMany(sql2, item_cat_list)
    logging.debug('Saved to DB - item_category - affected rows = %s' % affected_rows2)
    if affected_rows2 <= 0:
        logging.error('Saving to item_category error, category_id = %s' % category_id)
    # HANDLE JD_ITEM_FIRSTSEEN
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    sql3 = 'insert ignore into jd_item_firstseen values(%s,"%s","%s")'
    ftlist = []
    for item in product_list:
        ftlist.append([item[0], nowtime, nowdate])
    affected_rows3 = dbhelper.executeSqlWriteMany(sql3, ftlist)
    ret_obj = {
        'status': 0 if ret['status'] == 0 and affected_rows2 > 0 else -1,
        'item_dynamic': ret,
        'item_category': affected_rows2,
        'item_first_seen': affected_rows3,
    }
    return ret_obj
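# Illustrative sketch (hedged): crawl_category() reports success through the
# 'status' field of its return dict, so a task runner might retry a failed
# category a bounded number of times. The retry count is an example value.
def __example_run_category_task__(category_id, max_attempts=3):
    ret = {'status': -1}
    for attempt in xrange(max_attempts):
        ret = crawl_category(category_id)
        if ret['status'] == 0:
            return ret
        logging.error('crawl_category failed (attempt %s): %s'
                      % (attempt + 1, ret))
    return ret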
def processRecord(consentJsonStreamTable, requestDict, consentJson):
    requestId = localId = status = errorMessage = None
    baseTable = consentJsonStreamTable.TABLE
    baseTableAttr = baseTable.ATTRIBUTE
    print(f"CustomerMK: '{baseTable.INCOMING_DATA.CUSTOMER_MK}', requestDict: {requestDict}")
    customerMK = requestDict[baseTable.INCOMING_DATA.CUSTOMER_MK]
    sortKey = timeHelper.getUTCDateTimeString()

    # ----- INSERT the record -----
    insertRecordDict = {}
    # Tag the start DateTime
    startTime = timeHelper.getNow()
    # Set the primary and sort key
    addPrimaryAndSortKey(insertRecordDict, baseTableAttr, customerMK, sortKey)
    # Set the original Json
    insertRecordDict[baseTableAttr.ORIGINAL_DATA] = requestDict
    validationResult = transformConsentJson.validate(requestDict, consentJson)
    if API.isResultFailure(validationResult):
        status = API.getResultStatus(validationResult)
        errorMessage = API.getErrorPrintMessage(validationResult)
    else:
        # Transform and set the new Json
        transformResult = transformConsentJson.transform(requestDict, consentJson)
        if API.isResultFailure(transformResult):
            status = API.getResultStatus(transformResult)
            errorMessage = API.getErrorPrintMessage(transformResult)
        else:
            insertRecordDict[baseTableAttr.TRANSFORMED_DATA] = API.getResultData(transformResult)

    # Set the trace and stats dictionary
    timeLogTuple = timeHelper.getTimeLogTupleString(startTime)
    insertRecordDict[baseTableAttr.TRACE] = helper.getTraceDict(
        originId=requestId, localId=localId, status=status, message=errorMessage,
        startDateTime=timeLogTuple[0], endDateTime=timeLogTuple[1],
        duration=timeLogTuple[2])

    # Insert into the table
    insertResult = consentJsonStreamTable.insert(insertRecordDict)
    if API.isResultFailure(insertResult):
        # TODO What do we do in this scenario?
        return insertResult if insertResult is not None \
            else consentJsonStreamTable.composeResult(API.STATUS_CODE.FAILED)

    # ----- UPDATE the record trace (overall processing time until the insert
    # DB operation succeeded) -----
    status = API.getResultStatus(insertResult)
    updateRecordDict = {}
    addPrimaryAndSortKey(updateRecordDict, baseTableAttr, customerMK, sortKey)
    # Update the end DateTime
    timeLogTuple = timeHelper.getTimeLogTupleString(startTime)
    updateRecordDict[baseTableAttr.TRACE] = helper.getTraceDict(
        originId=requestId, localId=localId, status=status, message=errorMessage,
        startDateTime=timeLogTuple[0], endDateTime=timeLogTuple[1],
        duration=timeLogTuple[2])
    # Update ConsentProcessStream
    updateResult = consentJsonStreamTable.update(updateRecordDict)
    # Ignore an update error here, since the insert (the key operation) succeeded
    # if API.isResultFailure(updateResult):
    #     return updateResult if updateResult is not None else consentJsonStreamTable.composeResult(API.STATUS_CODE.FAILED)

    # ----- RETURN the insertResult -----
    return insertResult if insertResult is not None \
        else consentJsonStreamTable.composeResult(API.STATUS_CODE.FAILED)
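# Illustrative sketch (hedged): processRecord() handles a single record, so a
# stream-style entry point would loop over a batch and collect per-record
# results. 'records' and this handler are assumptions, not part of the
# original source.
def exampleHandleBatch(consentJsonStreamTable, records, consentJson):
    results = []
    for requestDict in records:
        result = processRecord(consentJsonStreamTable, requestDict, consentJson)
        if API.isResultFailure(result):
            print(f"processRecord failed: {API.getErrorPrintMessage(result)}")
        results.append(result)
    return results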