def _get_deduction_dict():
    """Fetch each SKU's strongest recent deduction promo rows.

    The query pairs every sku_id's maximum ``single_discount_rate`` with the
    row(s) of ``jd_analytic_promo_deduction_latest`` carrying that rate
    (within 0.001) and keeps rows whose ``origin_time`` is newer than the
    cutoff obtained from ``timeHelper.getTimeAheadOfNowHours``.  When
    ``IF_DEBUG_SKU`` is truthy only ``DEBUG_SKU_ID`` is queried.

    Every row gets a ``content_deduction`` copy of its ``content`` value, and
    the rows are then handed to ``rows_helper.transform_retrows_to_dict``
    together with ``'sku_id'`` (presumably the key column); that helper's
    result is returned unchanged.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')

    if IF_DEBUG_SKU:
        sku_filter = " and a.sku_id=%s" % DEBUG_SKU_ID
    else:
        sku_filter = ""

    query = '''
        select * from
        (
            select
                sku_id,
                max(single_discount_rate) as max_deduction_ratio
            FROM
                jd_analytic_promo_deduction_latest
            group by sku_id
            -- having max(single_discount_rate)>0
        ) a
        left join
            jd_analytic_promo_deduction_latest b
        on
            a.sku_id = b.sku_id
            and ABS(a.max_deduction_ratio-b.single_discount_rate)<0.001
        where origin_time>'%s'
        %s
    ''' % (cutoff, sku_filter)

    rows = dbhelper.executeSqlRead(
        query, is_dirty=True, isolation_type='read-committed')
    for record in rows:
        # Expose the promo text under a deduction-specific key.
        record['content_deduction'] = record['content']

    return rows_helper.transform_retrows_to_dict(rows, 'sku_id')
def getWorthyInfo_of_skuid_list(sku_id_list):
    """Read the ``jd_worthy_latest`` rows for the given SKU ids.

    Args:
        sku_id_list: sized collection of SKU ids; each id is formatted with
            ``"%s"`` before being placed into the SQL text.

    Returns:
        ``[]`` when ``sku_id_list`` is empty, otherwise whatever
        ``dbhelper_read.executeSqlRead`` returns for the query.  Rows are
        ordered by ``instr(<comma-joined ids>, sku_id)``, i.e. by where each
        sku_id first occurs in the joined id string.
    """
    if len(sku_id_list) == 0:
        return []

    id_strings = ["%s" % sku for sku in sku_id_list]
    cutoff = timeHelper.getTimeAheadOfNowHours(
        service_config.SKU_LIST_APP_WORTHY_RECENCY_HOURS,
        timeHelper.FORMAT_LONG)
    joined_ids = ','.join(id_strings)
    or_filter = "(" + ' OR '.join(
        ["sku_id = %s" % sku for sku in id_strings]) + ")"

    # The two commented-out SQL lines still contain placeholders, so the
    # cutoff and the joined ids are passed even though MySQL ignores them.
    query = '''
        select
            *,
            instr('%s',sku_id) as dd
        from
            jd_worthy_latest
        where
            -- this_update_time > '%s'
            -- and sku_id in (%s)
            %s
        order by dd ASC
    ''' % (joined_ids, cutoff, joined_ids, or_filter)

    return dbhelper_read.executeSqlRead(query, is_dirty=True)
def _get_deduction_dict():
    """Fetch each SKU's strongest recent deduction promo rows.

    The query pairs every sku_id's maximum ``single_discount_rate`` with the
    row(s) of ``jd_analytic_promo_deduction_latest`` carrying that rate
    (within 0.001) and keeps rows whose ``origin_time`` is newer than the
    cutoff obtained from ``timeHelper.getTimeAheadOfNowHours``.  When
    ``IF_DEBUG_SKU`` is truthy only ``DEBUG_SKU_ID`` is queried.

    Every row gets a ``content_deduction`` copy of its ``content`` value, and
    the rows are then handed to ``rows_helper.transform_retrows_to_dict``
    together with ``'sku_id'`` (presumably the key column); that helper's
    result is returned unchanged.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')

    if IF_DEBUG_SKU:
        sku_filter = " and a.sku_id=%s" % DEBUG_SKU_ID
    else:
        sku_filter = ""

    query = '''
        select * from
        (
            select
                sku_id,
                max(single_discount_rate) as max_deduction_ratio
            FROM
                jd_analytic_promo_deduction_latest
            group by sku_id
            -- having max(single_discount_rate)>0
        ) a
        left join
            jd_analytic_promo_deduction_latest b
        on
            a.sku_id = b.sku_id
            and ABS(a.max_deduction_ratio-b.single_discount_rate)<0.001
        where origin_time>'%s'
        %s
    ''' % (cutoff, sku_filter)

    rows = dbhelper.executeSqlRead(
        query, is_dirty=True, isolation_type='read-committed')
    for record in rows:
        # Expose the promo text under a deduction-specific key.
        record['content_deduction'] = record['content']

    return rows_helper.transform_retrows_to_dict(rows, 'sku_id')
def _get_gift_dict():
    """Load recent rows of ``jd_analytic_promo_gift_valued``.

    Rows whose ``dt`` is newer than the cutoff obtained from
    ``timeHelper.getTimeAheadOfNowHours`` are read (restricted to
    ``DEBUG_SKU_ID`` when ``IF_DEBUG_SKU`` is truthy) and passed to
    ``rows_helper.transform_retrows_to_dict`` together with ``'sku_id'``
    (presumably the key column); that helper's result is returned unchanged.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')

    if IF_DEBUG_SKU:
        sku_filter = " and sku_id=%s" % DEBUG_SKU_ID
    else:
        sku_filter = ""

    # In the gift_valued table, dt is the original crawl time
    # (the other tables call it origin_time).
    query = '''
        select
            *
        from
            jd_analytic_promo_gift_valued
        where
            dt>'%s'
            %s
    ''' % (cutoff, sku_filter)

    gift_rows = dbhelper.executeSqlRead(
        query, is_dirty=True, isolation_type='read-committed')
    return rows_helper.transform_retrows_to_dict(gift_rows, 'sku_id')
def _get_discount_dict():
    """Load recent rows of ``jd_analytic_promo_discount_latest``.

    Rows whose ``origin_dt`` is newer than the cutoff obtained from
    ``timeHelper.getTimeAheadOfNowHours`` are read (restricted to
    ``DEBUG_SKU_ID`` when ``IF_DEBUG_SKU`` is truthy).  Each row gets a
    ``content_discount`` copy of its ``content`` value before the rows are
    passed to ``rows_helper.transform_retrows_to_dict`` together with
    ``'sku_id'`` (presumably the key column); that helper's result is
    returned unchanged.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')

    if IF_DEBUG_SKU:
        sku_filter = " and sku_id=%s" % DEBUG_SKU_ID
    else:
        sku_filter = ""

    query = '''
        select
            *
        from
            jd_analytic_promo_discount_latest
        where
            origin_dt>'%s'
            %s
    ''' % (cutoff, sku_filter)

    discount_rows = dbhelper.executeSqlRead(
        query, is_dirty=True, isolation_type='read-committed')
    for record in discount_rows:
        # Expose the promo text under a discount-specific key.
        record['content_discount'] = record['content']

    return rows_helper.transform_retrows_to_dict(discount_rows, 'sku_id')
def _get_gift_dict():
    """Load recent rows of ``jd_analytic_promo_gift_valued``.

    Rows whose ``dt`` is newer than the cutoff obtained from
    ``timeHelper.getTimeAheadOfNowHours`` are read (restricted to
    ``DEBUG_SKU_ID`` when ``IF_DEBUG_SKU`` is truthy) and passed to
    ``rows_helper.transform_retrows_to_dict`` together with ``'sku_id'``
    (presumably the key column); that helper's result is returned unchanged.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')

    if IF_DEBUG_SKU:
        sku_filter = " and sku_id=%s" % DEBUG_SKU_ID
    else:
        sku_filter = ""

    # In the gift_valued table, dt is the original crawl time
    # (the other tables call it origin_time).
    query = '''
        select
            *
        from
            jd_analytic_promo_gift_valued
        where
            dt>'%s'
            %s
    ''' % (cutoff, sku_filter)

    gift_rows = dbhelper.executeSqlRead(
        query, is_dirty=True, isolation_type='read-committed')
    return rows_helper.transform_retrows_to_dict(gift_rows, 'sku_id')
def _get_discount_dict():
    """Load recent rows of ``jd_analytic_promo_discount_latest``.

    Rows whose ``origin_dt`` is newer than the cutoff obtained from
    ``timeHelper.getTimeAheadOfNowHours`` are read (restricted to
    ``DEBUG_SKU_ID`` when ``IF_DEBUG_SKU`` is truthy).  Each row gets a
    ``content_discount`` copy of its ``content`` value before the rows are
    passed to ``rows_helper.transform_retrows_to_dict`` together with
    ``'sku_id'`` (presumably the key column); that helper's result is
    returned unchanged.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')

    if IF_DEBUG_SKU:
        sku_filter = " and sku_id=%s" % DEBUG_SKU_ID
    else:
        sku_filter = ""

    query = '''
        select
            *
        from
            jd_analytic_promo_discount_latest
        where
            origin_dt>'%s'
            %s
    ''' % (cutoff, sku_filter)

    discount_rows = dbhelper.executeSqlRead(
        query, is_dirty=True, isolation_type='read-committed')
    for record in discount_rows:
        # Expose the promo text under a discount-specific key.
        record['content_discount'] = record['content']

    return rows_helper.transform_retrows_to_dict(discount_rows, 'sku_id')
def getSku_ID_ListByCatalogID(
        category_id="_ALL_",
        startpos=0,
        min_allowed_price=service_config.SKU_LIST_MIN_ALLOWED_PRICE,
        min_allowed_discount_rate=service_config.SKU_LIST_MIN_ALLOWED_WORTHY_VALUE
):
    """Return the sku_ids of ``jd_worthy_latest`` matching a category selector.

    Args:
        category_id: one of the special selectors ``"_ALL_"``,
            ``"_EXPENSIVE_"``, ``"_HISTORY_LOWEST_"``, ``"HOT"``, or a
            concrete catalog id that is placed into the SQL text.
        startpos: only substituted into the commented-out ``limit`` clause of
            the default query, so it currently has no effect on the result.
        min_allowed_price: lower bound on ``median_price``; replaced by
            ``service_config.SKU_LIST_MIN_PRICE_FOR_EXPENSIVE`` for
            ``"_EXPENSIVE_"``.
        min_allowed_discount_rate: exclusive upper bound on ``worthy_value1``.

    Returns:
        A list with the ``sku_id`` of every row returned by
        ``dbhelper_read.executeSqlRead``, in query order.
    """
    _started = time.time()  # captured as in the original; never read

    # Shared exclusion filter: preset catalog ids and black-listed category
    # words.  Every fragment ends with "AND" so more conditions can follow.
    excluded_ids = "".join(
        " catalog_id <> %s AND " % idc
        for idc in service_config.PRESET_CATALOG_ID_CONSTRAINTS)
    excluded_words = "".join(
        " category_name not like '%%%s%%' AND " % word.strip()
        for word in
        service_config.PRESET_CATALOG_CATEGORY_WILDCARD_BLACK_WORDS)
    catalog_constraint = " catalog_id is not null AND %s %s " % (
        excluded_ids, excluded_words)

    dt = timeHelper.getTimeAheadOfNowHours(
        service_config.SKU_LIST_APP_WORTHY_RECENCY_HOURS,
        timeHelper.FORMAT_LONG)

    if category_id == '_HISTORY_LOWEST_':
        sql = '''
            select
                sku_id
            from
                jd_worthy_latest
            where
                %s
                min_price_reached = 2
                and this_update_time > '%s'
                and a<>34
            order by
                worthy_value1 ASC
        ''' % (catalog_constraint, dt)
    elif category_id == 'HOT':
        dt_hot = timeHelper.getTimeAheadOfNowHours(
            service_config.SKU_LIST_DISCOVERY_RECENCY_HOURS,
            format=timeHelper.FORMAT_LONG)
        sql = '''
            select
                distinct a.sku_id
            from
                jd_notification_history_lowest a
            left join
                jd_worthy_latest b
                using(sku_id)
            where
                %s
                a.update_time > '%s'
                and b.a<>34
            order by
                a.update_time DESC,
                worthy_value1 ASC
        ''' % (catalog_constraint, dt_hot)
    else:
        if category_id == "_EXPENSIVE_":
            min_allowed_price = service_config.SKU_LIST_MIN_PRICE_FOR_EXPENSIVE
        if category_id in ("_ALL_", "_EXPENSIVE_"):
            catalog_sql_part = catalog_constraint
        else:
            catalog_sql_part = 'catalog_id = %s and ' % category_id
        # The commented-out limit line still holds two placeholders, so
        # startpos and the frame size are supplied although MySQL ignores them.
        sql = '''
            select
                sku_id
                -- ,if(a=34,0,1) as stock_bit
            from
                jd_worthy_latest
            where
                %s
                worthy_value1 < %s
                and median_price >= %s
                and median_price < %s
                and this_update_time > '%s'
            order by
                -- stock_bit DESC,
                worthy_value1 ASC
            -- limit %s, %s
        ''' % (catalog_sql_part, min_allowed_discount_rate, min_allowed_price,
               service_config.SKU_LIST_MAX_ALLOWED_PRICE, dt, startpos,
               service_config.SKU_LIST_FRAME_SIZE)

    return [record['sku_id'] for record in dbhelper_read.executeSqlRead(sql)]
def process_gift_value(for_date=None):
    """Rebuild ``jd_analytic_promo_gift_valued`` inside one transaction.

    The table is emptied and refilled from ``jd_analytic_promo_gift_latest``
    joined with the latest prices of the item and of the gift, adding the
    gift's value and its ratio to the item price.  If the insert affects no
    rows, or any statement fails, the transaction is rolled back and the
    error is logged.

    Args:
        for_date: accepted for interface compatibility; not used.

    Returns:
        dict with ``status`` (0 on success, -1 otherwise),
        ``affected_rows`` (cursor row count after commit, -1 on failure),
        ``rows deleted`` and ``rows_inserted`` (values returned by the two
        ``execute`` calls, or -1 for a statement that never completed).  Note
        that after a rollback ``rows deleted`` still shows what the delete
        statement reported.
    """
    today = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS,
        format='%Y-%m-%d %H:%M:%S')

    sql1 = 'delete from jd_analytic_promo_gift_valued'
    sql2 = '''
        insert into jd_analytic_promo_gift_valued
        select
            a.*,
            b.price as price,
            c.price as gift_price,
            c.price*a.gift_num as gift_value,
            (c.price*a.gift_num)/b.price as gift_ratio
        from
            jd_analytic_promo_gift_latest a
        left join
            jd_item_dynamic_latest b
            on a.sku_id = b.sku_id
        left join
            jd_item_dynamic_latest c
            on a.gift_sku_id = c.sku_id
        where
            a.update_date >= '%s'
            and b.price is not NULL
            and c.price is not NULL
            and b.price>0
        order by gift_value DESC
    ''' % today

    afr = -1
    # Pre-set so the result dict can always be built, even when the cursor
    # cannot be created or the first statement raises.
    retrows = -1
    retrows2 = -1

    # AS TRANSACTION
    conn = dbhelper.getConnection()
    try:
        cursor1 = conn.cursor(MySQLdb.cursors.DictCursor)
        retrows = cursor1.execute(sql1)
        retrows2 = cursor1.execute(sql2)
        if retrows2 <= 0:
            # Abort so the delete is rolled back rather than leaving the
            # table empty.
            raise Exception("process_gift_value: nothing to insert")
        conn.commit()
        afr = cursor1.rowcount
    except Exception as e:
        conn.rollback()
        logging.error(e)
    finally:
        conn.close()

    logging.debug("affected rows: %s" % afr)
    return {
        'status': 0 if afr > 0 else -1,
        'affected_rows': afr,
        'rows deleted': retrows,
        'rows_inserted': retrows2,
    }
def _removeOldNotifications():
    """Delete old rows from ``jd_notification_history_lowest``.

    Rows whose ``update_time`` is at or before the cutoff returned by
    ``timeHelper.getTimeAheadOfNowHours(24, ...)`` are removed.

    Returns:
        Whatever ``dbhelper.executeSqlWrite1`` returns for the delete
        (presumably the affected row count).
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        24, format=timeHelper.FORMAT_LONG)
    delete_sql = (
        'delete from jd_notification_history_lowest where update_time <= "%s"'
        % cutoff
    )
    return dbhelper.executeSqlWrite1(delete_sql)
def process_promo_detail():
    """Derive the deduction (code 15) and discount (code 19) promo tables.

    Reads recent rows of ``jd_analytic_promo_item_latest`` joined with the
    latest price and the category, then:

    * code 15: for every ``(reach, deduction, is_repeat)`` entry returned by
      ``_extract_reach_deduction_array`` computes how much could be deducted
      when buying one item at the current price and stores one row per entry
      for ``jd_analytic_promo_deduction``.
    * code 19: extracts the numbers from the promo text and stores one row
      for ``jd_analytic_promo_discount``.

    Entries that cannot be parsed are logged and skipped instead of aborting
    the whole run.  A table is only persisted when at least one row was
    produced for it; otherwise its result is ``None``, the same convention
    ``processItemPromo`` uses for empty lists.

    Returns:
        Whatever ``_generate_mixed_ret`` builds from the two persist results.
    """
    today = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, '%Y-%m-%d %H:%M:%S')
    sql = '''
        select
            a.*, b.price, d.id as category_id, d.name as category_name
        from
            jd_analytic_promo_item_latest a
        left join
            jd_item_price_latest b
            on a.sku_id = b.sku_id
        left JOIN
            jd_item_category c
            on a.sku_id = c.sku_id
        left join
            jd_category d
            on c.category_id = d.id
        where
            a.dt >= "%s"
            and b.sku_id is not NULL
            and b.price is not NULL
    ''' % today
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)

    vlist = []
    vlist19 = []
    dt = timeHelper.getNowLong()
    logging.debug('num total promo_item rows: %s' % len(retrows))

    num_15 = 0
    num_19 = 0
    num_15_repeated = 0

    # Loop invariants hoisted out of the row loop.
    number_pattern = re.compile(u'[\d.]+', re.U)
    type_word_list = ["总价打", "商品价格"]

    for row in retrows:
        sku_id = row['sku_id']
        code = int(row['code'])
        content = row['content'] if 'content' in row else ""
        adurl = row['adurl'] if 'adurl' in row else ""
        origin_dt = row['dt']
        pid = row['pid']
        name = row['name'] if 'name' in row else ""
        price = float("%s" % row['price'])
        category_id = row['category_id']
        category_name = row['category_name']

        if code == 15:
            num_15 += 1
            ret = _extract_reach_deduction_array(content)
            stat_has_repeat = False
            max_deduction = float(ret['max'])
            for item in ret['data']:
                # Pre-set so the error log below never hits an unbound name.
                reach = deduction = None
                try:
                    reach = float(item[0])
                    deduction = float(item[1])
                    is_repeat = item[2]
                    if is_repeat == 1:
                        stat_has_repeat = True
                    dr_ratio = deduction * 1.0 / reach
                    if max_deduction > 0:
                        maxp_ratio = max_deduction * 1.0 / price
                    else:
                        maxp_ratio = 1.0
                    could_deduct = 0
                except Exception as e:
                    logging.error(
                        "reach:%s, deduction:%s" % (reach, deduction))
                    logging.error(e)
                    continue

                # Deduction obtainable when buying a single item.
                if price >= reach and reach > 0:
                    if is_repeat:
                        times = price // reach
                    else:
                        times = 1
                    could_deduct = times * deduction
                    if could_deduct > max_deduction:
                        could_deduct = max_deduction

                # A zero price would otherwise raise ZeroDivisionError here.
                if price > 0:
                    single_discount_rate = could_deduct / price
                else:
                    single_discount_rate = 0.0

                vlist.append([
                    sku_id, dt, price, is_repeat, reach, deduction,
                    max_deduction, dr_ratio, maxp_ratio,
                    single_discount_rate, category_id, category_name, pid,
                    code, name, content, adurl, origin_dt,
                ])

            if stat_has_repeat:
                num_15_repeated += 1

        elif code == 19:
            num_19 += 1
            # Promo text of the form "buy N items for a discount or a price
            # reduction".  deduct_type is the index of the first keyword
            # found in the text:
            #   0: direct discount
            #   1: item price subtracted
            #   2: anything else
            deduct_type = 0
            for type_word in type_word_list:
                if content.find(type_word) >= 0:
                    break
                deduct_type += 1
            if deduct_type == 2:
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
                logging.error(content)
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")

            pts = number_pattern.findall(content)
            if len(pts) != 2:
                if '可购买热销商品' not in content:
                    logging.error(content)
                    logging.error("NEW PATTERN ABOVE")

            reach_num = discount = free_num = rf_ratio = None
            try:
                reach_num = float(pts[0])
                if deduct_type == 0:
                    discount = pts[1]
                elif deduct_type == 1:
                    free_num = float(pts[1])
                    rf_ratio = float(free_num * 1.0 / reach_num)
            except (IndexError, ValueError, ZeroDivisionError) as e:
                # Too few numbers, a non-numeric match such as "." or a zero
                # threshold: skip this promo rather than abort the run.
                logging.error(
                    "cannot parse discount promo for sku_id = %s: %s"
                    % (sku_id, e))
                continue

            vlist19.append([
                sku_id, dt, price, deduct_type, reach_num, discount,
                free_num, rf_ratio, category_id, category_name, pid, code,
                name, content, adurl, origin_dt,
            ])

    logging.debug("code = 15, cases = %s" % num_15)
    logging.debug("code = 15, repeated = %s" % num_15_repeated)
    logging.debug("rows to insert = %s" % len(vlist))

    sql_cb_deduction = '''
        CREATE TABLE jd_analytic_promo_deduction_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) NOT NULL,
          price float NOT NULL,
          is_repeat tinyint(4) NOT NULL,
          reach float NOT NULL,
          deduction float NOT NULL,
          max_deduction float NOT NULL,
          dr_ratio float NOT NULL,
          maxp_ratio float NOT NULL,
          single_discount_rate float NOT NULL,
          category_id varchar(255) NOT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_time datetime NOT NULL,
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_discount = '''
        CREATE TABLE jd_analytic_promo_discount_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) DEFAULT NULL,
          price float DEFAULT NULL,
          deduct_type smallint(6) DEFAULT NULL,
          reach_num smallint(6) DEFAULT NULL,
          discount float DEFAULT NULL,
          free_num smallint(6) DEFAULT NULL,
          rf_ratio float DEFAULT NULL,
          category_id varchar(255) DEFAULT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_dt datetime DEFAULT NULL,
          PRIMARY KEY (sku_id,pid),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # Persist only non-empty lists: num_cols is taken from the first row, so
    # an empty list used to raise IndexError here.
    pret15 = pret19 = None
    if len(vlist) > 0:
        pret15 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_deduction',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_deduction,
        )

    logging.debug("code = 19, cases = %s" % num_19)
    logging.debug("rows to insert = %s" % len(vlist19))

    if len(vlist19) > 0:
        pret19 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_discount',
            num_cols=len(vlist19[0]),
            value_list=vlist19,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_discount,
        )

    return _generate_mixed_ret([pret15, pret19])
def process_gift_value(for_date=None):
    """Rebuild ``jd_analytic_promo_gift_valued`` inside one transaction.

    The table is emptied and refilled from ``jd_analytic_promo_gift_latest``
    joined with the latest prices of the item and of the gift, adding the
    gift's value and its ratio to the item price.  If the insert affects no
    rows, or any statement fails, the transaction is rolled back and the
    error is logged.

    Args:
        for_date: accepted for interface compatibility; not used.

    Returns:
        dict with ``status`` (0 on success, -1 otherwise),
        ``affected_rows`` (cursor row count after commit, -1 on failure),
        ``rows deleted`` and ``rows_inserted`` (values returned by the two
        ``execute`` calls, or -1 for a statement that never completed).  Note
        that after a rollback ``rows deleted`` still shows what the delete
        statement reported.
    """
    today = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS,
        format='%Y-%m-%d %H:%M:%S')

    sql1 = 'delete from jd_analytic_promo_gift_valued'
    sql2 = '''
        insert into jd_analytic_promo_gift_valued
        select
            a.*,
            b.price as price,
            c.price as gift_price,
            c.price*a.gift_num as gift_value,
            (c.price*a.gift_num)/b.price as gift_ratio
        from
            jd_analytic_promo_gift_latest a
        left join
            jd_item_dynamic_latest b
            on a.sku_id = b.sku_id
        left join
            jd_item_dynamic_latest c
            on a.gift_sku_id = c.sku_id
        where
            a.update_date >= '%s'
            and b.price is not NULL
            and c.price is not NULL
            and b.price>0
        order by gift_value DESC
    ''' % today

    afr = -1
    # Pre-set so the result dict can always be built, even when the cursor
    # cannot be created or the first statement raises.
    retrows = -1
    retrows2 = -1

    # AS TRANSACTION
    conn = dbhelper.getConnection()
    try:
        cursor1 = conn.cursor(MySQLdb.cursors.DictCursor)
        retrows = cursor1.execute(sql1)
        retrows2 = cursor1.execute(sql2)
        if retrows2 <= 0:
            # Abort so the delete is rolled back rather than leaving the
            # table empty.
            raise Exception("process_gift_value: nothing to insert")
        conn.commit()
        afr = cursor1.rowcount
    except Exception as e:
        conn.rollback()
        logging.error(e)
    finally:
        conn.close()

    logging.debug("affected rows: %s" % afr)
    return {
        'status': 0 if afr > 0 else -1,
        'affected_rows': afr,
        'rows deleted': retrows,
        'rows_inserted': retrows2,
    }
def processItemPromo():
    """Explode the crawled promo JSON into promo-item and promo-gift rows.

    Recent ``jd_promo_item_latest`` rows are read and their ``promo_json``
    payload is parsed:

    * every entry of ``pickOneTag`` (except code ``"17"``) and every entry of
      ``tags`` whose code is not ``"10"`` becomes a row for
      ``jd_analytic_promo_item``;
    * every gift of a ``tags`` entry with code ``"10"`` becomes a row for
      ``jd_analytic_promo_gift``.

    Payloads that are not valid JSON objects are counted and skipped, and a
    missing ``pickOneTag``/``tags`` key is treated as an empty list, so one
    malformed payload no longer aborts the whole batch.  Finally the sku_ids
    that have gifts are recorded in ``jd_analytic_sku_gift``.

    Returns:
        Whatever ``_generate_mixed_ret`` builds from the two persist results
        (``None`` for a list that was empty) and the gift-recording status.
    """
    vlist = []
    glist = []
    update_date = timeHelper.getNowLong()
    recent = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, timeHelper.FORMAT_LONG)

    logging.debug('Reading jd_promo_item_latest...')
    sql = '''
        select
            sku_id, dt, promo_json
        from
            jd_promo_item_latest
        where
            promo_json is not NULL
            and LENGTH(promo_json)>100
            and dt>="%s"
    ''' % recent
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)

    num_error = 0
    num17 = 0
    logging.debug('completed!')
    logging.debug("Total rows with promo_json: %s" % len(retrows))

    for row in retrows:
        sku_id = row['sku_id']
        dt = row['dt']
        try:
            obj = json.loads(row['promo_json'])
        except (ValueError, TypeError):
            # Invalid JSON text or a non-string payload.
            num_error += 1
            continue
        if not isinstance(obj, dict):
            # Valid JSON, but not the object layout the code below reads.
            num_error += 1
            continue

        for tag in obj.get('pickOneTag') or []:
            pid = tag['pid']
            code = tag['code']
            # Code "17" promos (add-on purchase at extra cost) are not
            # recorded.
            if code == "17":
                num17 += 1
                continue
            name = tag['name']
            content = tag['content']
            adurl = tag['adurl'] if 'adurl' in tag else ""
            vlist.append(
                [sku_id, dt, pid, code, name, content, adurl, update_date])

        for tag in obj.get('tags') or []:
            pid = tag['pid']
            code = tag['code']
            name = tag['name'] if 'name' in tag else ""
            if code == "10":
                # gift
                for gift in tag['gifts']:
                    try:
                        gift_name = gift['nm']
                        gift_num = gift['num'] if 'num' in gift else 1
                        gift_image = gift['mp'] if 'mp' in gift else ""
                        gift_sku_id = gift['sid'] if 'sid' in gift else ""
                        gift_gt = gift['gt'] if 'gt' in gift else ""
                        gift_gs = gift['gs'] if 'gs' in gift else ""
                        glist.append([
                            sku_id, dt, pid, code, name, gift_name,
                            gift_num, gift_image, gift_sku_id, gift_gt,
                            gift_gs, update_date,
                        ])
                    except Exception as e:
                        logging.debug(
                            "error in extracting gift info for sku_id = %s"
                            % sku_id)
                        logging.debug("%s" % e)
            else:
                content = tag['content']
                adurl = tag['adurl'] if 'adurl' in tag else ""
                vlist.append(
                    [sku_id, dt, pid, code, name, content, adurl,
                     update_date])

    logging.error(
        "IGNOR-ABLE: num of errors: %s (like json.loads error)" % num_error)
    logging.debug('num17: %s' % num17)
    logging.debug('vlist len: %s' % len(vlist))
    logging.debug('glist len: %s' % len(glist))

    sql_cb_promo_item = '''
        CREATE TABLE jd_analytic_promo_item_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_promo_gift = '''
        CREATE TABLE jd_analytic_promo_gift_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          gift_name varchar(255) NOT NULL,
          gift_num int(11) NOT NULL,
          gift_image varchar(255) DEFAULT NULL,
          gift_sku_id bigint(20) NOT NULL,
          gift_gt varchar(255) DEFAULT NULL,
          gift_gs varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # persist in DB; num_cols comes from the first row, hence the guards
    ret1 = ret2 = None
    if len(vlist) > 0:
        ret1 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_item',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_item,
        )

    if len(glist) > 0:
        ret2 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_gift',
            num_cols=len(glist[0]),
            value_list=glist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_gift,
        )

    # record gift: one (sku_id, timestamp) pair per gift row
    cur_time = timeHelper.getNowLong()
    sglist = [[gg[0], cur_time] for gg in glist]
    sql_gg = 'insert ignore into jd_analytic_sku_gift values(%s,%s)'
    afr = dbhelper.executeSqlWriteMany(sql_gg, sglist)
    ret3 = {
        'status': 0 if afr > 0 else -1,
        'msg': "",
    }

    return _generate_mixed_ret([ret1, ret2, ret3])
def calculate_min_max_price():
    """Recompute per-SKU price statistics into ``jd_analytic_price_stat``.

    Aggregates ``jd_item_price`` (rows with ``price > 0``) per sku_id into
    average, minimum, median (via ``percentile_minx``), maximum, latest update
    time, sample count, ``min_ratio`` and ``LPDR``.  Only SKUs whose latest
    update is at or after the cutoff from
    ``timeHelper.getTimeAheadOfNowHours`` are kept.  The result is handed to
    ``crawler_helper.persist_db_history_and_lastest_empty_first``.

    Returns:
        The persist helper's result.  When the query yields no rows nothing
        is persisted and ``{'status': -1, 'msg': ...}`` is returned instead
        (shape modelled on the status dicts built elsewhere in this file;
        NOTE(review): confirm callers only rely on ``status``).
    """
    logging.debug(
        'Reading item_dynamic history and calculate min/max/avg/median '
        'price for skus...')
    t1 = time.time()
    dt = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PRICE_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    sql1 = '''
        select
            sku_id,
            AVG(price) as average_price,
            min(price) as min_price,
            -- median(price) as median_price,
            -- changed 12/22
            percentile_minx(price) as median_price,
            max(price) as max_price,
            max(update_time) as origin_time,
            count(1) as sample_count,
            min_ratio(price) as min_ratio,
            LPDR(price) as LPDR
        from
            -- jd_item_dynamic
            -- changed 12/22
            jd_item_price
        where
            -- update_time > '2015-11-14 0:00:00' and -- 双十一期间价格
            price > 0
        group by
            sku_id
        having
            max(update_time) >= '%s'
    ''' % (dt)
    logging.debug(sql1)
    retrows = dbhelper.executeSqlRead2(sql1, is_dirty=True)
    logging.debug("Done, rows to insert: %s" % len(retrows))
    t2 = time.time()
    logging.debug('using seconds: %0.1f' % (t2 - t1))

    if len(retrows) == 0:
        # num_cols below is read from the first row; without this guard an
        # empty result raised IndexError.
        msg = 'calculate_min_max_price: no rows to insert'
        logging.error(msg)
        return {'status': -1, 'msg': msg}

    sql_cb = '''
        CREATE TABLE jd_analytic_price_stat_latest (
          sku_id bigint(20) NOT NULL,
          average_price float NOT NULL,
          min_price float NOT NULL,
          median_price float NOT NULL,
          max_price float NOT NULL,
          origin_time datetime NOT NULL,
          sample_count int(11) NOT NULL,
          min_ratio float NOT NULL,
          LPDR float NOT NULL,
          PRIMARY KEY (sku_id),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_price_stat',
        num_cols=len(retrows[0]),
        value_list=retrows,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb,
    )
    return ret
def process_promo_detail():
    """Derive the deduction (code 15) and discount (code 19) promo tables.

    Reads recent rows of ``jd_analytic_promo_item_latest`` joined with the
    latest price and the category, then:

    * code 15: for every ``(reach, deduction, is_repeat)`` entry returned by
      ``_extract_reach_deduction_array`` computes how much could be deducted
      when buying one item at the current price and stores one row per entry
      for ``jd_analytic_promo_deduction``.
    * code 19: extracts the numbers from the promo text and stores one row
      for ``jd_analytic_promo_discount``.

    Entries that cannot be parsed are logged and skipped instead of aborting
    the whole run.  A table is only persisted when at least one row was
    produced for it; otherwise its result is ``None``, the same convention
    ``processItemPromo`` uses for empty lists.

    Returns:
        Whatever ``_generate_mixed_ret`` builds from the two persist results.
    """
    today = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, '%Y-%m-%d %H:%M:%S')
    sql = '''
        select
            a.*, b.price, d.id as category_id, d.name as category_name
        from
            jd_analytic_promo_item_latest a
        left join
            jd_item_price_latest b
            on a.sku_id = b.sku_id
        left JOIN
            jd_item_category c
            on a.sku_id = c.sku_id
        left join
            jd_category d
            on c.category_id = d.id
        where
            a.dt >= "%s"
            and b.sku_id is not NULL
            and b.price is not NULL
    ''' % today
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)

    vlist = []
    vlist19 = []
    dt = timeHelper.getNowLong()
    logging.debug('num total promo_item rows: %s' % len(retrows))

    num_15 = 0
    num_19 = 0
    num_15_repeated = 0

    # Loop invariants hoisted out of the row loop.
    number_pattern = re.compile(u'[\d.]+', re.U)
    type_word_list = ["总价打", "商品价格"]

    for row in retrows:
        sku_id = row['sku_id']
        code = int(row['code'])
        content = row['content'] if 'content' in row else ""
        adurl = row['adurl'] if 'adurl' in row else ""
        origin_dt = row['dt']
        pid = row['pid']
        name = row['name'] if 'name' in row else ""
        price = float("%s" % row['price'])
        category_id = row['category_id']
        category_name = row['category_name']

        if code == 15:
            num_15 += 1
            ret = _extract_reach_deduction_array(content)
            stat_has_repeat = False
            max_deduction = float(ret['max'])
            for item in ret['data']:
                # Pre-set so the error log below never hits an unbound name.
                reach = deduction = None
                try:
                    reach = float(item[0])
                    deduction = float(item[1])
                    is_repeat = item[2]
                    if is_repeat == 1:
                        stat_has_repeat = True
                    dr_ratio = deduction * 1.0 / reach
                    if max_deduction > 0:
                        maxp_ratio = max_deduction * 1.0 / price
                    else:
                        maxp_ratio = 1.0
                    could_deduct = 0
                except Exception as e:
                    logging.error(
                        "reach:%s, deduction:%s" % (reach, deduction))
                    logging.error(e)
                    continue

                # Deduction obtainable when buying a single item.
                if price >= reach and reach > 0:
                    if is_repeat:
                        times = price // reach
                    else:
                        times = 1
                    could_deduct = times * deduction
                    if could_deduct > max_deduction:
                        could_deduct = max_deduction

                # A zero price would otherwise raise ZeroDivisionError here.
                if price > 0:
                    single_discount_rate = could_deduct / price
                else:
                    single_discount_rate = 0.0

                vlist.append([
                    sku_id, dt, price, is_repeat, reach, deduction,
                    max_deduction, dr_ratio, maxp_ratio,
                    single_discount_rate, category_id, category_name, pid,
                    code, name, content, adurl, origin_dt,
                ])

            if stat_has_repeat:
                num_15_repeated += 1

        elif code == 19:
            num_19 += 1
            # Promo text of the form "buy N items for a discount or a price
            # reduction".  deduct_type is the index of the first keyword
            # found in the text:
            #   0: direct discount
            #   1: item price subtracted
            #   2: anything else
            deduct_type = 0
            for type_word in type_word_list:
                if content.find(type_word) >= 0:
                    break
                deduct_type += 1
            if deduct_type == 2:
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
                logging.error(content)
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")

            pts = number_pattern.findall(content)
            if len(pts) != 2:
                if '可购买热销商品' not in content:
                    logging.error(content)
                    logging.error("NEW PATTERN ABOVE")

            reach_num = discount = free_num = rf_ratio = None
            try:
                reach_num = float(pts[0])
                if deduct_type == 0:
                    discount = pts[1]
                elif deduct_type == 1:
                    free_num = float(pts[1])
                    rf_ratio = float(free_num * 1.0 / reach_num)
            except (IndexError, ValueError, ZeroDivisionError) as e:
                # Too few numbers, a non-numeric match such as "." or a zero
                # threshold: skip this promo rather than abort the run.
                logging.error(
                    "cannot parse discount promo for sku_id = %s: %s"
                    % (sku_id, e))
                continue

            vlist19.append([
                sku_id, dt, price, deduct_type, reach_num, discount,
                free_num, rf_ratio, category_id, category_name, pid, code,
                name, content, adurl, origin_dt,
            ])

    logging.debug("code = 15, cases = %s" % num_15)
    logging.debug("code = 15, repeated = %s" % num_15_repeated)
    logging.debug("rows to insert = %s" % len(vlist))

    sql_cb_deduction = '''
        CREATE TABLE jd_analytic_promo_deduction_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) NOT NULL,
          price float NOT NULL,
          is_repeat tinyint(4) NOT NULL,
          reach float NOT NULL,
          deduction float NOT NULL,
          max_deduction float NOT NULL,
          dr_ratio float NOT NULL,
          maxp_ratio float NOT NULL,
          single_discount_rate float NOT NULL,
          category_id varchar(255) NOT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_time datetime NOT NULL,
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_discount = '''
        CREATE TABLE jd_analytic_promo_discount_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) DEFAULT NULL,
          price float DEFAULT NULL,
          deduct_type smallint(6) DEFAULT NULL,
          reach_num smallint(6) DEFAULT NULL,
          discount float DEFAULT NULL,
          free_num smallint(6) DEFAULT NULL,
          rf_ratio float DEFAULT NULL,
          category_id varchar(255) DEFAULT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_dt datetime DEFAULT NULL,
          PRIMARY KEY (sku_id,pid),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # Persist only non-empty lists: num_cols is taken from the first row, so
    # an empty list used to raise IndexError here.
    pret15 = pret19 = None
    if len(vlist) > 0:
        pret15 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_deduction',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_deduction,
        )

    logging.debug("code = 19, cases = %s" % num_19)
    logging.debug("rows to insert = %s" % len(vlist19))

    if len(vlist19) > 0:
        pret19 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_discount',
            num_cols=len(vlist19[0]),
            value_list=vlist19,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_discount,
        )

    return _generate_mixed_ret([pret15, pret19])
def calculate_min_max_price():
    """Recompute per-SKU price statistics into ``jd_analytic_price_stat``.

    Aggregates ``jd_item_price`` (rows with ``price > 0``) per sku_id into
    average, minimum, median (via ``percentile_minx``), maximum, latest update
    time, sample count, ``min_ratio`` and ``LPDR``.  Only SKUs whose latest
    update is at or after the cutoff from
    ``timeHelper.getTimeAheadOfNowHours`` are kept.  The result is handed to
    ``crawler_helper.persist_db_history_and_lastest_empty_first``.

    Returns:
        The persist helper's result.  When the query yields no rows nothing
        is persisted and ``{'status': -1, 'msg': ...}`` is returned instead
        (shape modelled on the status dicts built elsewhere in this file;
        NOTE(review): confirm callers only rely on ``status``).
    """
    logging.debug(
        'Reading item_dynamic history and calculate min/max/avg/median '
        'price for skus...')
    t1 = time.time()
    dt = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PRICE_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    sql1 = '''
        select
            sku_id,
            AVG(price) as average_price,
            min(price) as min_price,
            -- median(price) as median_price,
            -- changed 12/22
            percentile_minx(price) as median_price,
            max(price) as max_price,
            max(update_time) as origin_time,
            count(1) as sample_count,
            min_ratio(price) as min_ratio,
            LPDR(price) as LPDR
        from
            -- jd_item_dynamic
            -- changed 12/22
            jd_item_price
        where
            -- update_time > '2015-11-14 0:00:00' and -- 双十一期间价格
            price > 0
        group by
            sku_id
        having
            max(update_time) >= '%s'
    ''' % (dt)
    logging.debug(sql1)
    retrows = dbhelper.executeSqlRead2(sql1, is_dirty=True)
    logging.debug("Done, rows to insert: %s" % len(retrows))
    t2 = time.time()
    logging.debug('using seconds: %0.1f' % (t2 - t1))

    if len(retrows) == 0:
        # num_cols below is read from the first row; without this guard an
        # empty result raised IndexError.
        msg = 'calculate_min_max_price: no rows to insert'
        logging.error(msg)
        return {'status': -1, 'msg': msg}

    sql_cb = '''
        CREATE TABLE jd_analytic_price_stat_latest (
          sku_id bigint(20) NOT NULL,
          average_price float NOT NULL,
          min_price float NOT NULL,
          median_price float NOT NULL,
          max_price float NOT NULL,
          origin_time datetime NOT NULL,
          sample_count int(11) NOT NULL,
          min_ratio float NOT NULL,
          LPDR float NOT NULL,
          PRIMARY KEY (sku_id),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_price_stat',
        num_cols=len(retrows[0]),
        value_list=retrows,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb,
    )
    return ret
def processItemPromo():
    """Explode the crawled promo JSON into promo-item and promo-gift rows.

    Recent ``jd_promo_item_latest`` rows are read and their ``promo_json``
    payload is parsed:

    * every entry of ``pickOneTag`` (except code ``"17"``) and every entry of
      ``tags`` whose code is not ``"10"`` becomes a row for
      ``jd_analytic_promo_item``;
    * every gift of a ``tags`` entry with code ``"10"`` becomes a row for
      ``jd_analytic_promo_gift``.

    Payloads that are not valid JSON objects are counted and skipped, and a
    missing ``pickOneTag``/``tags`` key is treated as an empty list, so one
    malformed payload no longer aborts the whole batch.  Finally the sku_ids
    that have gifts are recorded in ``jd_analytic_sku_gift``.

    Returns:
        Whatever ``_generate_mixed_ret`` builds from the two persist results
        (``None`` for a list that was empty) and the gift-recording status.
    """
    vlist = []
    glist = []
    update_date = timeHelper.getNowLong()
    recent = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, timeHelper.FORMAT_LONG)

    logging.debug('Reading jd_promo_item_latest...')
    sql = '''
        select
            sku_id, dt, promo_json
        from
            jd_promo_item_latest
        where
            promo_json is not NULL
            and LENGTH(promo_json)>100
            and dt>="%s"
    ''' % recent
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)

    num_error = 0
    num17 = 0
    logging.debug('completed!')
    logging.debug("Total rows with promo_json: %s" % len(retrows))

    for row in retrows:
        sku_id = row['sku_id']
        dt = row['dt']
        try:
            obj = json.loads(row['promo_json'])
        except (ValueError, TypeError):
            # Invalid JSON text or a non-string payload.
            num_error += 1
            continue
        if not isinstance(obj, dict):
            # Valid JSON, but not the object layout the code below reads.
            num_error += 1
            continue

        for tag in obj.get('pickOneTag') or []:
            pid = tag['pid']
            code = tag['code']
            # Code "17" promos (add-on purchase at extra cost) are not
            # recorded.
            if code == "17":
                num17 += 1
                continue
            name = tag['name']
            content = tag['content']
            adurl = tag['adurl'] if 'adurl' in tag else ""
            vlist.append(
                [sku_id, dt, pid, code, name, content, adurl, update_date])

        for tag in obj.get('tags') or []:
            pid = tag['pid']
            code = tag['code']
            name = tag['name'] if 'name' in tag else ""
            if code == "10":
                # gift
                for gift in tag['gifts']:
                    try:
                        gift_name = gift['nm']
                        gift_num = gift['num'] if 'num' in gift else 1
                        gift_image = gift['mp'] if 'mp' in gift else ""
                        gift_sku_id = gift['sid'] if 'sid' in gift else ""
                        gift_gt = gift['gt'] if 'gt' in gift else ""
                        gift_gs = gift['gs'] if 'gs' in gift else ""
                        glist.append([
                            sku_id, dt, pid, code, name, gift_name,
                            gift_num, gift_image, gift_sku_id, gift_gt,
                            gift_gs, update_date,
                        ])
                    except Exception as e:
                        logging.debug(
                            "error in extracting gift info for sku_id = %s"
                            % sku_id)
                        logging.debug("%s" % e)
            else:
                content = tag['content']
                adurl = tag['adurl'] if 'adurl' in tag else ""
                vlist.append(
                    [sku_id, dt, pid, code, name, content, adurl,
                     update_date])

    logging.error(
        "IGNOR-ABLE: num of errors: %s (like json.loads error)" % num_error)
    logging.debug('num17: %s' % num17)
    logging.debug('vlist len: %s' % len(vlist))
    logging.debug('glist len: %s' % len(glist))

    sql_cb_promo_item = '''
        CREATE TABLE jd_analytic_promo_item_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_promo_gift = '''
        CREATE TABLE jd_analytic_promo_gift_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          gift_name varchar(255) NOT NULL,
          gift_num int(11) NOT NULL,
          gift_image varchar(255) DEFAULT NULL,
          gift_sku_id bigint(20) NOT NULL,
          gift_gt varchar(255) DEFAULT NULL,
          gift_gs varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # persist in DB; num_cols comes from the first row, hence the guards
    ret1 = ret2 = None
    if len(vlist) > 0:
        ret1 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_item',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_item,
        )

    if len(glist) > 0:
        ret2 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_gift',
            num_cols=len(glist[0]),
            value_list=glist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_gift,
        )

    # record gift: one (sku_id, timestamp) pair per gift row
    cur_time = timeHelper.getNowLong()
    sglist = [[gg[0], cur_time] for gg in glist]
    sql_gg = 'insert ignore into jd_analytic_sku_gift values(%s,%s)'
    afr = dbhelper.executeSqlWriteMany(sql_gg, sglist)
    ret3 = {
        'status': 0 if afr > 0 else -1,
        'msg': "",
    }

    return _generate_mixed_ret([ret1, ret2, ret3])