def _get_deduction_dict():
    """Return {sku_id: row} of each sku's strongest promo deduction.

    Per sku, keeps the row whose single_discount_rate equals the per-sku
    maximum (float comparison with 0.001 tolerance), restricted to rows
    newer than FINAL_DISCOUNT_RECENCY_HOURS.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')
    sku_filter = "" if not IF_DEBUG_SKU else " and a.sku_id=%s" % DEBUG_SKU_ID
    sql = '''
        select * from (
            select sku_id, max(single_discount_rate) as max_deduction_ratio
            FROM jd_analytic_promo_deduction_latest
            group by sku_id
            -- having max(single_discount_rate)>0
        ) a
        left join jd_analytic_promo_deduction_latest b
            on a.sku_id = b.sku_id
            and ABS(a.max_deduction_ratio-b.single_discount_rate)<0.001
        where origin_time>'%s' %s
    ''' % (cutoff, sku_filter)
    rows = dbhelper.executeSqlRead(
        sql, is_dirty=True, isolation_type='read-committed')
    # Re-expose the promo text under a join-unique column name.
    for row in rows:
        row['content_deduction'] = row['content']
    return rows_helper.transform_retrows_to_dict(rows, 'sku_id')
def _get_deduction_dict():
    """Fetch, per sku, the deduction row carrying the maximal
    single_discount_rate (within a 0.001 tolerance) and return the rows
    keyed by sku_id. Only rows newer than FINAL_DISCOUNT_RECENCY_HOURS
    are considered.
    """
    hours_ahead = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')
    if IF_DEBUG_SKU:
        debug_clause = " and a.sku_id=%s" % DEBUG_SKU_ID
    else:
        debug_clause = ""
    query = '''
        select * from (
            select sku_id, max(single_discount_rate) as max_deduction_ratio
            FROM jd_analytic_promo_deduction_latest
            group by sku_id
            -- having max(single_discount_rate)>0
        ) a
        left join jd_analytic_promo_deduction_latest b
            on a.sku_id = b.sku_id
            and ABS(a.max_deduction_ratio-b.single_discount_rate)<0.001
        where origin_time>'%s' %s
    ''' % (hours_ahead, debug_clause)
    retrows = dbhelper.executeSqlRead(
        query, is_dirty=True, isolation_type='read-committed')
    # Copy the promo text to a column name that survives later merging.
    for item in retrows:
        item['content_deduction'] = item['content']
    result = rows_helper.transform_retrows_to_dict(retrows, 'sku_id')
    return result
def match_discounts():
    """Join price rows with deduction, discount, gift, first-seen and rating
    data, compute "worthy" values, and persist the result into jd_worthy.

    Returns whatever crawler_helper.persist_db_history_and_latest returns.

    Fix vs. original: the progress labels counted "x/8" but the function has
    nine steps (the last one printed ">>> 9/9"); labels are now a consistent
    "x/9". Also fixed the "worhty_values" typo in the step-8 message.
    """
    debug_sku_str = "" if not IF_DEBUG_SKU else " where sku_id=%s" % DEBUG_SKU_ID
    print('>>> 1/9 >>> Reading jd_price_temp_latest...')
    sql_price = 'select * from jd_price_temp_latest %s' % debug_sku_str
    retrows_price = dbhelper.executeSqlRead(sql_price, is_dirty=True)
    print('rows read: %s' % len(retrows_price))
    print('>>> 2/9 >>> Reading strongest deductions of each sku...')
    deduction_dict = _get_deduction_dict()
    print('rows read: %s' % len(deduction_dict))
    print('>>> 3/9 >>> Reading discounts of each sku...')
    discount_dict = _get_discount_dict()
    print('rows read: %s' % len(discount_dict))
    print('>>> 4/9 >>> Reading gifts of each sku...')
    gift_dict = _get_gift_dict()
    print('rows read: %s' % len(gift_dict))
    print('>>> 5/9 >>> Reading first seen date of each sku...')
    first_seen_dict = _get_item_firstseen_dict()
    print('rows read: %s' % len(first_seen_dict))
    print('>>> 6/9 >>> Reading ratings of each sku...')
    rating_dict = _get_rating_dict()
    print('rows read: %s' % len(rating_dict))
    print('>>> 7/9 >>> Joining results in memory...')
    # Fold the auxiliary per-sku dicts into the deduction dict first, then
    # left-join the price rows against the merged dict.
    _merge_dict_under_key(deduction_dict, [
        discount_dict,
        gift_dict,
        first_seen_dict,
        rating_dict,
    ])
    tlist = _memory_left_join(retrows_price, deduction_dict,
                              col_name_list_left=cols_left,
                              col_name_list_right=cols_deduction)
    print('rows generated: %s' % len(tlist))
    print('>>> 8/9 >>> Calculating worthy_values...')
    _calculate_worthy_values(tlist)
    print('num cols = %s ' % len(tlist[0]))
    print('>>> 9/9 >>> Saving to DB...')
    ret = crawler_helper.persist_db_history_and_latest(
        table_name='jd_worthy',
        num_cols=len(tlist[0]),
        value_list=tlist,
        is_many=True,
        need_history=False)
    return ret
def _get_rating_dict():
    """Return jd_analytic_item_rating_diff rows keyed by sku_id."""
    sku_filter = "" if not IF_DEBUG_SKU else " where sku_id=%s" % DEBUG_SKU_ID
    sql = '''
        select * from jd_analytic_item_rating_diff %s
    ''' % (sku_filter)
    rows = dbhelper.executeSqlRead(sql, is_dirty=True)
    return rows_helper.transform_retrows_to_dict(rows, 'sku_id')
def _get_item_firstseen_dict():
    """Return {sku_id: row} with each sku's first_seen_date."""
    sku_filter = "" if not IF_DEBUG_SKU else " where sku_id=%s" % DEBUG_SKU_ID
    sql = '''
        select sku_id,first_seen_date from jd_item_firstseen %s
    ''' % (sku_filter)
    rows = dbhelper.executeSqlRead(sql, is_dirty=True)
    return rows_helper.transform_retrows_to_dict(rows, 'sku_id')
def _get_category_all():
    """Return {category_id: name}, with '/' in names replaced by spaces."""
    retrows = dbhelper.executeSqlRead('select * from jd_category')
    return {row['id']: row['name'].replace('/', ' ') for row in retrows}
def _get_rating_dict():
    """Load the per-sku rating-diff table and hash it by sku_id."""
    if IF_DEBUG_SKU:
        where_clause = " where sku_id=%s" % DEBUG_SKU_ID
    else:
        where_clause = ""
    query = '''
        select * from jd_analytic_item_rating_diff %s
    ''' % (where_clause)
    retrows = dbhelper.executeSqlRead(query, is_dirty=True)
    rating_by_sku = rows_helper.transform_retrows_to_dict(retrows, 'sku_id')
    return rating_by_sku
def _get_category_all():
    """Build a category-id -> display-text map ('/' normalised to spaces)."""
    rows = dbhelper.executeSqlRead('select * from jd_category')
    catdict = {}
    for entry in rows:
        catdict[entry['id']] = entry['name'].replace('/', ' ')
    return catdict
def _get_item_firstseen_dict():
    """Map each sku_id to its jd_item_firstseen row (first_seen_date)."""
    if IF_DEBUG_SKU:
        where_clause = " where sku_id=%s" % DEBUG_SKU_ID
    else:
        where_clause = ""
    query = '''
        select sku_id,first_seen_date from jd_item_firstseen %s
    ''' % (where_clause)
    retrows = dbhelper.executeSqlRead(query, is_dirty=True)
    firstseen_by_sku = rows_helper.transform_retrows_to_dict(retrows, 'sku_id')
    return firstseen_by_sku
def _load_catalog_map_as_dict_key_category_id_prefix():
    """Return jd_catalog_map keyed by category_id, memoised in g_catalog_map.

    The first call hits the database; later calls return the cached dict.
    """
    global g_catalog_map
    if g_catalog_map:
        return g_catalog_map
    rows = dbhelper.executeSqlRead("select * from jd_catalog_map")
    g_catalog_map = rows_helper.transform_retrows_to_dict(rows, "category_id")
    return g_catalog_map
def _get_display_name_dict():
    """Return {category_id_prefix: "<prefix name> <display name>"}."""
    rows = dbhelper.executeSqlRead('select * from jd_category_show')
    return {
        row['category_id_prefix']:
            "%s %s" % (row['category_prefix_name'], row['display_name'])
        for row in rows
    }
def match_discounts():
    """Assemble the jd_worthy table: join prices with deduction, discount,
    gift, first-seen and rating data, compute worthy values, persist.

    Returns the result of crawler_helper.persist_db_history_and_latest.

    Fix vs. original: progress labels said "x/8" while a ninth step existed
    (">>> 9/9"); numbering is now uniformly "x/9", and the "worhty_values"
    typo in the step-8 message is corrected.
    """
    debug_sku_str = "" if not IF_DEBUG_SKU else " where sku_id=%s" % DEBUG_SKU_ID
    print('>>> 1/9 >>> Reading jd_price_temp_latest...')
    sql_price = 'select * from jd_price_temp_latest %s' % debug_sku_str
    retrows_price = dbhelper.executeSqlRead(sql_price, is_dirty=True)
    print('rows read: %s' % len(retrows_price))
    print('>>> 2/9 >>> Reading strongest deductions of each sku...')
    deduction_dict = _get_deduction_dict()
    print('rows read: %s' % len(deduction_dict))
    print('>>> 3/9 >>> Reading discounts of each sku...')
    discount_dict = _get_discount_dict()
    print('rows read: %s' % len(discount_dict))
    print('>>> 4/9 >>> Reading gifts of each sku...')
    gift_dict = _get_gift_dict()
    print('rows read: %s' % len(gift_dict))
    print('>>> 5/9 >>> Reading first seen date of each sku...')
    first_seen_dict = _get_item_firstseen_dict()
    print('rows read: %s' % len(first_seen_dict))
    print('>>> 6/9 >>> Reading ratings of each sku...')
    rating_dict = _get_rating_dict()
    print('rows read: %s' % len(rating_dict))
    print('>>> 7/9 >>> Joining results in memory...')
    # Merge all auxiliary per-sku dicts into deduction_dict, then left-join
    # the price rows against it.
    _merge_dict_under_key(deduction_dict, [
        discount_dict,
        gift_dict,
        first_seen_dict,
        rating_dict,
    ])
    tlist = _memory_left_join(retrows_price, deduction_dict,
                              col_name_list_left=cols_left,
                              col_name_list_right=cols_deduction)
    print('rows generated: %s' % len(tlist))
    print('>>> 8/9 >>> Calculating worthy_values...')
    _calculate_worthy_values(tlist)
    print('num cols = %s ' % len(tlist[0]))
    print('>>> 9/9 >>> Saving to DB...')
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_worthy',
        num_cols=len(tlist[0]),
        value_list=tlist,
        is_many=True,
        need_history=False)
def _get_display_name_dict():
    """Build the display-name lookup: category_id_prefix -> combined name."""
    retrows = dbhelper.executeSqlRead('select * from jd_category_show')
    namedict = {}
    for row in retrows:
        prefix_id = row['category_id_prefix']
        combined = "%s %s" % (row['category_prefix_name'], row['display_name'])
        namedict[prefix_id] = combined
    return namedict
def genPropertyTable():
    """Flatten item properties into one searchable string per sku and
    persist the result into jd_index_property."""
    print("reading...")
    rows = dbhelper.executeSqlRead(
        'select * from jd_item_property_latest', is_dirty=True)
    pdict = {}
    for row in rows:
        p_key = row['p_key']
        if p_key is None:
            continue
        if p_key == '__DEFAULT__' or p_key == u'__DEFAULT__':
            continue
        if len(p_key) > 60:
            # overly long keys are treated as junk
            continue
        p_value = row['p_value']
        if p_value is None:
            continue
        if p_value == u'无':
            # placeholder meaning "none"
            continue
        p_value_nf = multi_replace(p_value, PROPERTY_SPLITTER_LIST, ' ')
        # Skip values whose normalisation shrank them by more than 5 chars
        # (assumed to be splitter-heavy noise — TODO confirm intent).
        if len(p_value) - len(p_value_nf) > 5:
            continue
        sku_id = row['sku_id']
        if sku_id in pdict:
            pdict[sku_id] = "%s %s" % (pdict[sku_id], p_value_nf)
        else:
            pdict[sku_id] = p_value_nf
    vlist = [[sku, pdict[sku]] for sku in pdict]
    print("writing to db...")
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_index_property',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=False,
        need_flow=False,
    )
def genPropertyTable():
    """Concatenate each sku's usable property values into one string and
    store them in jd_index_property (latest only, no history)."""
    print("reading...")
    sql = 'select * from jd_item_property_latest'
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)
    props_by_sku = {}
    for entry in retrows:
        key = entry['p_key']
        # Drop rows with missing, placeholder or absurdly long keys.
        if key is None or key == '__DEFAULT__' or key == u'__DEFAULT__':
            continue
        if len(key) > 60:
            continue
        value = entry['p_value']
        if value is None or value == u'无':
            continue
        normalised = multi_replace(value, PROPERTY_SPLITTER_LIST, ' ')
        # More than 5 characters removed by normalisation -> treat as noise
        # (assumed splitter-heavy value — TODO confirm threshold intent).
        if len(value) - len(normalised) > 5:
            continue
        sku = entry['sku_id']
        if sku in props_by_sku:
            props_by_sku[sku] = "%s %s" % (props_by_sku[sku], normalised)
        else:
            props_by_sku[sku] = normalised
    vlist = []
    for sku in props_by_sku:
        vlist.append([sku, props_by_sku[sku]])
    print("writing to db...")
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_index_property',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=False,
        need_flow=False,
    )
def _get_gift_dict():
    """Return recent valued-gift rows keyed by sku_id."""
    cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')
    sku_filter = "" if not IF_DEBUG_SKU else " and sku_id=%s" % DEBUG_SKU_ID
    # In gift_valued, `dt` is the original crawl time (other tables use
    # origin_time).
    sql = '''
        select * from jd_analytic_promo_gift_valued
        where dt>'%s' %s
    ''' % (cutoff, sku_filter)
    rows = dbhelper.executeSqlRead(
        sql, is_dirty=True, isolation_type='read-committed')
    return rows_helper.transform_retrows_to_dict(rows, 'sku_id')
def _get_discount_dict():
    """Return recent discount promo rows keyed by sku_id, with the promo
    text copied into a join-unique 'content_discount' column."""
    cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')
    sku_filter = "" if not IF_DEBUG_SKU else " and sku_id=%s" % DEBUG_SKU_ID
    sql = '''
        select * from jd_analytic_promo_discount_latest
        where origin_dt>'%s' %s
    ''' % (cutoff, sku_filter)
    rows = dbhelper.executeSqlRead(
        sql, is_dirty=True, isolation_type='read-committed')
    for row in rows:
        row['content_discount'] = row['content']
    return rows_helper.transform_retrows_to_dict(rows, 'sku_id')
def _get_gift_dict():
    """Load valued gifts crawled within FINAL_DISCOUNT_RECENCY_HOURS and
    hash them by sku_id."""
    hours_ahead = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')
    if IF_DEBUG_SKU:
        debug_clause = " and sku_id=%s" % DEBUG_SKU_ID
    else:
        debug_clause = ""
    # Note: in the gift_valued table `dt` is the original crawl time,
    # whereas sibling tables call it origin_time.
    query = '''
        select * from jd_analytic_promo_gift_valued
        where dt>'%s' %s
    ''' % (hours_ahead, debug_clause)
    retrows = dbhelper.executeSqlRead(
        query, is_dirty=True, isolation_type='read-committed')
    gifts_by_sku = rows_helper.transform_retrows_to_dict(retrows, 'sku_id')
    return gifts_by_sku
def _get_discount_dict():
    """Load recent rows from jd_analytic_promo_discount_latest, aliasing the
    promo text as 'content_discount', keyed by sku_id."""
    hours_ahead = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')
    if IF_DEBUG_SKU:
        debug_clause = " and sku_id=%s" % DEBUG_SKU_ID
    else:
        debug_clause = ""
    query = '''
        select * from jd_analytic_promo_discount_latest
        where origin_dt>'%s' %s
    ''' % (hours_ahead, debug_clause)
    retrows = dbhelper.executeSqlRead(
        query, is_dirty=True, isolation_type='read-committed')
    for item in retrows:
        item['content_discount'] = item['content']
    discounts_by_sku = rows_helper.transform_retrows_to_dict(retrows, 'sku_id')
    return discounts_by_sku
def do_log_user_event(device_id, query, catalog_id, remote_ip):
    """Record one user search/browse event in user_events.

    A non-empty query takes precedence over catalog_id (the catalog is
    blanked); an empty query is normalised to "". The catalog name is
    looked up from jd_catalog, defaulting to 'Unknown'.

    Returns the affected-row count from dbhelper.executeSqlWrite1.

    SECURITY fix vs. original: device_id/query/catalog_id/remote_ip arrive
    from the client and were interpolated raw into SQL (injection risk).
    dbhelper exposes no parameter binding here, so values are now escaped
    for embedding inside double-quoted MySQL string literals.
    """
    def _esc(v):
        # Minimal MySQL escaping for values placed inside "..." literals.
        s = v if isinstance(v, basestring) else str(v)
        return s.replace('\\', '\\\\').replace('"', '\\"')

    catalog_name = ""
    if query is not None and len(query) > 0:
        catalog_id = ""
    else:
        query = ""
    sql2 = 'select * from jd_catalog where catalog_id="%s"' % _esc(catalog_id)
    retrows = dbhelper.executeSqlRead(sql2)
    if len(retrows) > 0:
        catalog_name = retrows[0]['catalog_name']
    else:
        catalog_name = 'Unknown'
    sql_user_event = 'insert into user_events values("%s","%s","%s","%s","%s","%s")' % (
        _esc(device_id), _esc(query), _esc(catalog_id), _esc(catalog_name),
        timeHelper.getNowLong(), _esc(remote_ip))
    afr = dbhelper.executeSqlWrite1(sql_user_event)
    return afr
def calculate_base_rating_for_categories(): today = timeHelper.getNow() sql = getSqlCatRating() retrows = dbhelper.executeSqlRead(sql, is_dirty=True) # print sql print "rows of data selected for insert: %s" %len(retrows) # print len(retrows[0]) # print retrows[0] vlist = [] for row in retrows: tp = [] tp.append(row['category_id']) tp.append(row['sample_count']) tp.append(row['sum_1']) tp.append(row['sum_2']) tp.append(row['sum_3']) tp.append(row['sum_4']) tp.append(row['sum_5']) tp.append(row['comment_count']) tp.append(row['rating_score']) tp.append(row['rate_1']) tp.append(row['rate_2']) tp.append(row['rate_3']) tp.append(row['rate_4']) tp.append(row['rate_5']) tp.append(row['rate_good']) tp.append(row['rate_bad']) tp.append(row['origin_dt']) tp.append(row['dt']) tp.append(row['name']) # print row['category_id'] vlist.append(tp) return crawler_helper.persist_db_history_and_latest( table_name='jd_analytic_category_rating', num_cols=len(vlist[0]), value_list=vlist, is_many=True )
def do_log_user_event(device_id, query, catalog_id, remote_ip):
    """Insert a user event row (device, query, catalog, timestamp, ip).

    Behaviour: a non-empty query blanks catalog_id; a missing query becomes
    "". Catalog name resolves via jd_catalog, else 'Unknown'. Returns the
    write helper's affected-row count.

    SECURITY fix vs. original: client-controlled values were formatted
    straight into SQL strings (injection). Without parameter binding in
    dbhelper, values are escaped for double-quoted MySQL literals instead.
    """
    def _escape(value):
        # Escape backslash and double-quote for "..." SQL literals.
        text = value if isinstance(value, basestring) else str(value)
        return text.replace('\\', '\\\\').replace('"', '\\"')

    catalog_name = ""
    if query is not None and len(query) > 0:
        catalog_id = ""
    else:
        query = ""
    sql2 = 'select * from jd_catalog where catalog_id="%s"' % _escape(catalog_id)
    retrows = dbhelper.executeSqlRead(sql2)
    if len(retrows) > 0:
        catalog_name = retrows[0]['catalog_name']
    else:
        catalog_name = 'Unknown'
    sql_user_event = 'insert into user_events values("%s","%s","%s","%s","%s","%s")' % (
        _escape(device_id), _escape(query), _escape(catalog_id),
        _escape(catalog_name), timeHelper.getNowLong(), _escape(remote_ip))
    afr = dbhelper.executeSqlWrite1(sql_user_event)
    return afr
def _load_category_map():
    """Return jd_category_show rows keyed by category_prefix_name."""
    sql = ('select category_id_prefix,category_prefix_name '
           'from jd_category_show')
    rows = dbhelper.executeSqlRead(sql)
    return rows_helper.transform_retrows_to_dict(rows, 'category_prefix_name')
def calculatePercentile(): """ (1) load data (2) hash by key: category_id, ordered dict (3) for each item, find it's category_id array, got it's pos and percentile (4) store results in db :return: """ t1 = time.time() # STEP (1) print "step 1/4: reading data from rating_score_latest" sql = ''' select * from jd_analytic_rating_score_latest where rating_score is NOT NULL and comment_count is not NULL AND comment_count >= %s -- and category_id like "670-729-%%" order by comment_count DESC order by category_id -- limit 1000 ''' % datamining_config.MIN_SKU_NUM_PER_CATEGORY_SO_STATISTICALLY_SIGNIFICANT retrows = dbhelper.executeSqlRead(sql) t2 = time.time() print "Done, rows read: %s, seconds used: %0.1f" % (len(retrows), t2 - t1) # STEP (2) print "step 2/4: sorting category scores..." print "" key_col = 'rating_score' tdict = rows_helper.transform_retrows_to_hashed_arrays( retrows, key_col_name='category_id') odict = {} for cat in tdict: array = tdict[cat] _get_ordered_array(array, key_col) odict[cat] = array t3 = time.time() print "Done, ordered_dict generated, num of keys = %s, time used = %0.1f" % ( len(odict), t3 - t2) print "" # STEP (3) print "step 3/4: calculate rating percentile for each sku..." #sku_dict = rows_helper.transform_retrows_to_dict(retrows, key_col_name='sku_id') for row in retrows: catid = row['category_id'] myval = row[key_col] pt = _getPercentileGreaterThan(myval, odict[catid], key_col) row['percentile_' + key_col] = pt row['sample_num'] = len( odict[catid]) if odict[catid] is not None else 0 # print "myval: %s\tpt: %s" %(myval,pt) t4 = time.time() print "Done, using seconds: %0.1f" % (t4 - t3) print "" # Step (4) print 'step 4/4: storing results in db...' 
# for item in retrows: # for key in item: # print key # break sql_cb = ''' CREATE TABLE jd_analytic_rating_percentile_latest ( sku_id bigint(20) NOT NULL, comment_count int(11) NOT NULL, this_update_time datetime NOT NULL, rating_score float NOT NULL, category_id varchar(255) NOT NULL, rating_sample_num int(11) DEFAULT 0, percentile_rating_score float DEFAULT NULL, PRIMARY KEY (sku_id), KEY skuid (sku_id) -- KEY cat_score (rating_score,category_id), -- KEY score (rating_score), -- KEY category (category_id) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 ''' vlist = rows_helper.transform_retrows_arrayofdicts_to_arrayoftuples( retrows) ret = crawler_helper.persist_db_history_and_lastest_empty_first( table_name='jd_analytic_rating_percentile', num_cols=len(vlist[0]), value_list=vlist, is_many=True, need_history=False, sql_create_table=sql_cb, ) t5 = time.time() print "Done, rows affected: %s, time used: %0.1f" % (ret, t5 - t4) print "" return ret
def processItemPromo():
    """Parse promo_json from jd_promo_item_latest into flat promo rows and
    gift rows, persist both tables, and record which skus carry gifts."""
    promo_rows = []
    gift_rows = []
    update_date = timeHelper.getNowLong()
    recent = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    logging.debug('Reading jd_promo_item_latest...')
    sql = '''
        select sku_id, dt, promo_json from jd_promo_item_latest
        where promo_json is not NULL
        and LENGTH(promo_json)>100
        and dt>="%s"
    ''' % recent
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)
    num_error = 0
    num17 = 0
    logging.debug('completed!')
    logging.debug("Total rows with promo_json: %s" % len(retrows))
    for row in retrows:
        sku_id = row['sku_id']
        dt = row['dt']
        try:
            obj = json.loads(row['promo_json'])
        except:
            num_error += 1
            continue
        for tag in obj['pickOneTag']:
            pid = tag['pid']
            code = tag['code']
            # Skip "pay extra to buy" promos (加价购).
            if code == "17":
                num17 += 1
                continue
            promo_rows.append([sku_id, dt, pid, code, tag['name'],
                               tag['content'],
                               tag['adurl'] if 'adurl' in tag else "",
                               update_date])
        for tag in obj['tags']:
            pid = tag['pid']
            code = tag['code']
            name = tag['name'] if 'name' in tag else ""
            if code == "10":
                # Gift promo: emit one row per gift item.
                for gift in tag['gifts']:
                    gift_name = "赠品"
                    try:
                        gift_name = gift['nm']
                        gift_num = gift['num'] if 'num' in gift else 1
                        gift_image = gift['mp'] if 'mp' in gift else ""
                        gift_sku_id = gift['sid'] if 'sid' in gift else ""
                        gift_gt = gift['gt'] if 'gt' in gift else ""
                        gift_gs = gift['gs'] if 'gs' in gift else ""
                        gift_rows.append([sku_id, dt, pid, code, name,
                                          gift_name, gift_num, gift_image,
                                          gift_sku_id, gift_gt, gift_gs,
                                          update_date])
                    except Exception as e:
                        logging.debug(
                            "error in extracting gift info for sku_id = %s" % sku_id)
                        logging.debug("%s" % e)
            else:
                promo_rows.append([sku_id, dt, pid, code, name,
                                   tag['content'],
                                   tag['adurl'] if 'adurl' in tag else "",
                                   update_date])
    logging.error("IGNOR-ABLE: num of errors: %s (like json.loads error)" % num_error)
    logging.debug('num17: %s' % num17)
    logging.debug('vlist len: %s' % len(promo_rows))
    logging.debug('glist len: %s' % len(gift_rows))
    sql_cb_promo_item = '''
        CREATE TABLE jd_analytic_promo_item_latest (
            sku_id bigint(20) NOT NULL,
            dt datetime NOT NULL,
            pid varchar(255) NOT NULL,
            code varchar(255) NOT NULL,
            name varchar(255) NOT NULL,
            content varchar(255) NOT NULL,
            adurl varchar(255) DEFAULT NULL,
            update_date datetime NOT NULL,
            PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    sql_cb_promo_gift = '''
        CREATE TABLE jd_analytic_promo_gift_latest (
            sku_id bigint(20) NOT NULL,
            dt datetime NOT NULL,
            pid varchar(255) NOT NULL,
            code varchar(255) NOT NULL,
            name varchar(255) NOT NULL,
            gift_name varchar(255) NOT NULL,
            gift_num int(11) NOT NULL,
            gift_image varchar(255) DEFAULT NULL,
            gift_sku_id bigint(20) NOT NULL,
            gift_gt varchar(255) DEFAULT NULL,
            gift_gs varchar(255) DEFAULT NULL,
            update_date datetime NOT NULL,
            PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    # Persist both tables (skip when empty).
    ret1 = ret2 = None
    if promo_rows:
        ret1 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_item',
            num_cols=len(promo_rows[0]),
            value_list=promo_rows,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_item,
        )
    if gift_rows:
        ret2 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_gift',
            num_cols=len(gift_rows[0]),
            value_list=gift_rows,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_gift,
        )
    # Record the set of gift-bearing skus.
    cur_time = timeHelper.getNowLong()
    sglist = [[gg[0], cur_time] for gg in gift_rows]
    sql_gg = 'insert ignore into jd_analytic_sku_gift values(%s,%s)'
    afr = dbhelper.executeSqlWriteMany(sql_gg, sglist)
    ret3 = {
        'status': 0 if afr > 0 else -1,
        'msg': "",
    }
    return _generate_mixed_ret([ret1, ret2, ret3])
def _get_merged_tables():
    """Run the large join assembling one candidate row per sku: category,
    current/statistical prices, stock, deductions, discounts, gifts,
    ratings, first-seen date and catalog mapping. Skus already known to
    carry gifts are excluded. Returns the raw result rows."""
    sql = """
        select
        a.sku_id as sku_id,
        CURRENT_TIMESTAMP() as this_update_time,
        j.category_id,
        h.name as category_name,
        pp.price as current_price,
        a.average_price as average_price,
        a.median_price,
        a.min_price,
        a.max_price,
        a.min_ratio,
        a.LPDR,
        pp.price/a.median_price as discount_rate,
        k.a, k.b, k.c, k.j, k.l,
        b.title,
        b.thumbnail_url,
        b.icon_url,
        c.content as content_deduction,
        c.adurl as adurl_deduction,
        c.is_repeat,
        c.reach,
        c.deduction,
        c.max_deduction,
        c.dr_ratio,
        c.maxp_ratio,
        c.max_deduction_ratio,
        c.reach_2,
        c.deduction_2,
        c.max_dr_ratio,
        c.discount_score_2 as deduction_score,
        d.content as content_discount,
        d.adurl as adurl_discount,
        d.deduct_type,
        d.reach_num,
        d.discount,
        d.free_num,
        d.rf_ratio,
        e.gift_name,
        e.gift_num,
        e.gift_image,
        e.gift_sku_id,
        e.gift_price,
        e.gift_value,
        e.gift_ratio,
        f.comment_count,
        f.rating_score,
        f.rating_sample_num as category_rating_score,
        f.percentile_rating_score as rating_score_diff,
        g.first_seen_date,
        a.sample_count,
        CURRENT_DATE() as this_update_date,
        m.catalog_id,
        m.catalog_name
        FROM jd_item_category j
        left join jd_category h on j.category_id = h.id
        inner join jd_catalog_map m on h.id = m.category_id
        inner join jd_analytic_price_stat_latest a on a.sku_id = j.sku_id
        left join jd_item_price_latest pp on a.sku_id = pp.sku_id
        left join jd_item_dynamic_latest b on a.sku_id = b.sku_id
        left join jd_analytic_promo_deduction_max c on a.sku_id = c.sku_id
        left join jd_analytic_promo_discount_latest d on a.sku_id = d.sku_id
        left join jd_analytic_promo_gift_valued e on a.sku_id = e.sku_id
        left join jd_analytic_rating_percentile_latest f on a.sku_id = f.sku_id
        left join jd_item_firstseen g on a.sku_id = g.sku_id
        left join jd_item_stock_latest k on a.sku_id = k.sku_id
        where pp.price > 0
        and a.sku_id not in (select sku_id from jd_analytic_sku_gift)
        -- limit 100
    """
    if IS_SKU_LEVEL_DEBUGGING:
        # Narrow to the single sku under inspection.
        sql += "\n and a.sku_id = %s" % DEBUG_SKU_ID
    return dbhelper.executeSqlRead(sql, is_dirty=True)
def calculatePercentile(): """ (1) load data (2) hash by key: category_id, ordered dict (3) for each item, find it's category_id array, got it's pos and percentile (4) store results in db :return: """ t1 = time.time() # STEP (1) print "step 1/4: reading data from rating_score_latest" sql = ''' select * from jd_analytic_rating_score_latest where rating_score is NOT NULL and comment_count is not NULL AND comment_count >= %s -- and category_id like "670-729-%%" order by comment_count DESC order by category_id -- limit 1000 ''' %datamining_config.MIN_SKU_NUM_PER_CATEGORY_SO_STATISTICALLY_SIGNIFICANT retrows = dbhelper.executeSqlRead(sql) t2 = time.time() print "Done, rows read: %s, seconds used: %0.1f" %(len(retrows), t2-t1) # STEP (2) print "step 2/4: sorting category scores..." print "" key_col = 'rating_score' tdict = rows_helper.transform_retrows_to_hashed_arrays(retrows, key_col_name='category_id') odict = {} for cat in tdict: array = tdict[cat] _get_ordered_array(array, key_col) odict[cat] = array t3 = time.time() print "Done, ordered_dict generated, num of keys = %s, time used = %0.1f" %(len(odict),t3-t2) print "" # STEP (3) print "step 3/4: calculate rating percentile for each sku..." #sku_dict = rows_helper.transform_retrows_to_dict(retrows, key_col_name='sku_id') for row in retrows: catid = row['category_id'] myval = row[key_col] pt = _getPercentileGreaterThan(myval,odict[catid],key_col) row['percentile_'+key_col] = pt row['sample_num'] = len(odict[catid]) if odict[catid] is not None else 0 # print "myval: %s\tpt: %s" %(myval,pt) t4 = time.time() print "Done, using seconds: %0.1f" %(t4-t3) print "" # Step (4) print 'step 4/4: storing results in db...' 
# for item in retrows: # for key in item: # print key # break sql_cb = ''' CREATE TABLE jd_analytic_rating_percentile_latest ( sku_id bigint(20) NOT NULL, comment_count int(11) NOT NULL, this_update_time datetime NOT NULL, rating_score float NOT NULL, category_id varchar(255) NOT NULL, rating_sample_num int(11) DEFAULT 0, percentile_rating_score float DEFAULT NULL, PRIMARY KEY (sku_id), KEY skuid (sku_id) -- KEY cat_score (rating_score,category_id), -- KEY score (rating_score), -- KEY category (category_id) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 ''' vlist = rows_helper.transform_retrows_arrayofdicts_to_arrayoftuples(retrows) ret = crawler_helper.persist_db_history_and_lastest_empty_first( table_name='jd_analytic_rating_percentile', num_cols=len(vlist[0]), value_list=vlist, is_many=True, need_history=False, sql_create_table=sql_cb, ) t5 = time.time() print "Done, rows affected: %s, time used: %0.1f" %(ret, t5-t4) print "" return ret
def process_promo_detail():
    """Expand code-15 ("reach X deduct Y") and code-19 (bulk-discount)
    promos from jd_analytic_promo_item_latest into the deduction and
    discount tables, annotated with price and category info.

    Returns the mixed persist result for both tables.

    Fixes vs. original:
    - the except-handler logged `reach`/`deduction`, which may be unbound
      (raising NameError and masking the real error) — they are now
      initialised before the try;
    - when the number-extraction regex found fewer than two numbers the
      code still indexed pts[0]/pts[1] (IndexError) — such rows are now
      skipped after logging;
    - removed unused locals (`sku_str`).
    """
    today = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, '%Y-%m-%d %H:%M:%S')
    sql = '''
        select a.*, b.price, d.id as category_id, d.name as category_name
        from jd_analytic_promo_item_latest a
        left join jd_item_price_latest b on a.sku_id = b.sku_id
        left JOIN jd_item_category c on a.sku_id = c.sku_id
        left join jd_category d on c.category_id = d.id
        where a.dt >= "%s"
        and b.sku_id is not NULL
        and b.price is not NULL
    ''' % today
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)
    vlist = []
    vlist19 = []
    dt = timeHelper.getNowLong()
    logging.debug('num total promo_item rows: %s' % len(retrows))
    num_15 = 0
    num_19 = 0
    num_15_repeated = 0
    for row in retrows:
        sku_id = row['sku_id']
        code = int(row['code'])
        content = row['content'] if 'content' in row else ""
        adurl = row['adurl'] if 'adurl' in row else ""
        origin_dt = row['dt']
        pid = row['pid']
        name = row['name'] if 'name' in row else ""
        price = float("%s" % row['price'])
        category_id = row['category_id']
        category_name = row['category_name']
        if code == 15:
            # "reach X, deduct Y" promo; may contain several tiers.
            num_15 += 1
            ret = _extract_reach_deduction_array(content)
            stat_has_repeat = False
            max_deduction = float(ret['max'])
            for item in ret['data']:
                reach = deduction = None  # defined even if float() fails
                try:
                    reach = float(item[0])
                    deduction = float(item[1])
                    is_repeat = item[2]
                    if is_repeat == 1:
                        stat_has_repeat = True
                    dr_ratio = deduction * 1.0 / reach
                    maxp_ratio = max_deduction * 1.0 / price if max_deduction > 0 else 1.0
                    could_deduct = 0
                except Exception as e:
                    logging.error("reach:%s, deduction:%s" % (reach, deduction))
                    logging.error(e)
                    continue
                if price >= reach and reach > 0:
                    # Repeatable tiers apply once per full multiple of reach.
                    if is_repeat:
                        times = price // reach
                    else:
                        times = 1
                    could_deduct = times * deduction
                    if could_deduct > max_deduction:
                        could_deduct = max_deduction
                single_discount_rate = could_deduct / price
                tp = [sku_id, dt, price, is_repeat, reach, deduction,
                      max_deduction, dr_ratio, maxp_ratio,
                      single_discount_rate, category_id, category_name,
                      pid, code, name, content, adurl, origin_dt]
                vlist.append(tp)
            if stat_has_repeat:
                num_15_repeated += 1
        elif code == 19:
            num_19 += 1
            # Bulk discount: "buy N, get a percentage off" (总价打) or
            # "buy N, deduct a price" (商品价格).
            type_word_list = ["总价打", "商品价格"]
            # deduct_type: 0 = percentage discount, 1 = price deduction,
            # 2 = unrecognised wording.
            deduct_type = 0
            for type_word in type_word_list:
                if content.find(type_word) >= 0:
                    break
                deduct_type += 1
            if deduct_type == 2:
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
                logging.error(content)
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
            pt = re.compile(u'[\d.]+', re.U)
            pts = pt.findall(content)
            if len(pts) != 2:
                if '可购买热销商品' not in content:
                    logging.error(content)
                    logging.error("NEW PATTERN ABOVE")
                if len(pts) < 2:
                    # Not enough numbers to build a row; previously this
                    # crashed on pts[0]/pts[1].
                    continue
            reach_num = discount = free_num = rf_ratio = None
            reach_num = float(pts[0])
            if deduct_type == 0:
                discount = pts[1]
            elif deduct_type == 1:
                free_num = float(pts[1])
                rf_ratio = float(free_num * 1.0 / reach_num)
            tp19 = [sku_id, dt, price, deduct_type, reach_num, discount,
                    free_num, rf_ratio, category_id, category_name,
                    pid, code, name, content, adurl, origin_dt]
            vlist19.append(tp19)
        else:
            pass
    logging.debug("code = 15, cases = %s" % num_15)
    logging.debug("code = 15, repeated = %s" % num_15_repeated)
    logging.debug("rows to insert = %s" % len(vlist))
    sql_cb_deduction = '''
        CREATE TABLE jd_analytic_promo_deduction_latest (
            sku_id bigint(20) NOT NULL,
            add_time datetime NOT NULL,
            -- title varchar(255) NOT NULL,
            price float NOT NULL,
            is_repeat tinyint(4) NOT NULL,
            reach float NOT NULL,
            deduction float NOT NULL,
            max_deduction float NOT NULL,
            dr_ratio float NOT NULL,
            maxp_ratio float NOT NULL,
            single_discount_rate float NOT NULL,
            category_id varchar(255) NOT NULL,
            category_name varchar(255) DEFAULT NULL,
            pid varchar(255) NOT NULL,
            code varchar(255) NOT NULL,
            name varchar(255) NOT NULL,
            content varchar(255) NOT NULL,
            adurl varchar(255) DEFAULT NULL,
            origin_time datetime NOT NULL,
            KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    sql_cb_discount = '''
        CREATE TABLE jd_analytic_promo_discount_latest (
            sku_id bigint(20) NOT NULL,
            add_time datetime NOT NULL,
            -- title varchar(255) DEFAULT NULL,
            price float DEFAULT NULL,
            deduct_type smallint(6) DEFAULT NULL,
            reach_num smallint(6) DEFAULT NULL,
            discount float DEFAULT NULL,
            free_num smallint(6) DEFAULT NULL,
            rf_ratio float DEFAULT NULL,
            category_id varchar(255) DEFAULT NULL,
            category_name varchar(255) DEFAULT NULL,
            pid varchar(255) NOT NULL,
            code varchar(255) NOT NULL,
            name varchar(255) NOT NULL,
            content varchar(255) NOT NULL,
            adurl varchar(255) DEFAULT NULL,
            origin_dt datetime DEFAULT NULL,
            PRIMARY KEY (sku_id,pid),
            KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    # NOTE(review): len(vlist[0]) / len(vlist19[0]) raise IndexError when a
    # run yields no rows of one kind — confirm whether empty runs can occur
    # before guarding, as callers may rely on both persists happening.
    pret15 = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_promo_deduction',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb_deduction,
    )
    logging.debug("code = 19, cases = %s" % num_19)
    logging.debug("rows to insert = %s" % len(vlist19))
    pret19 = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_promo_discount',
        num_cols=len(vlist19[0]),
        value_list=vlist19,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb_discount,
    )
    return _generate_mixed_ret([pret15, pret19])
def processItemPromo():
    """Flatten recent ``jd_promo_item_latest.promo_json`` blobs into rows.

    For every sku crawled within PROMO_ITEM_RECENCY_HOURS, parse the raw
    promo JSON and collect:

      * ``vlist`` -- one row per promotion tag, taken from both the
        ``pickOneTag`` and ``tags`` arrays; code "17" tags (加价购 /
        pay-extra promos) are counted and skipped.
      * ``glist`` -- one row per gift attached to a code "10" (gift) tag.

    Both lists are persisted via
    ``crawler_helper.persist_db_history_and_lastest_empty_first`` (tables
    ``jd_analytic_promo_item`` / ``jd_analytic_promo_gift``), and every sku
    that carries a gift is recorded in ``jd_analytic_sku_gift``.

    Returns:
        The combined status dict built by ``_generate_mixed_ret``.
    """
    vlist = []
    glist = []
    update_date = timeHelper.getNowLong()
    recent = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    logging.debug('Reading jd_promo_item_latest...')
    # LENGTH(promo_json)>100 filters out empty/placeholder JSON payloads
    sql = '''
    select sku_id, dt, promo_json
    from jd_promo_item_latest
    where promo_json is not NULL and LENGTH(promo_json)>100 and dt>="%s"
    ''' % recent
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)
    num_error = 0
    num17 = 0
    logging.debug('completed!')
    logging.debug("Total rows with promo_json: %s" % len(retrows))
    for row in retrows:
        sku_id = row['sku_id']
        dt = row['dt']
        try:
            obj = json.loads(row['promo_json'])
        except (ValueError, TypeError):
            # Malformed JSON from the crawler (was a bare `except:`, which
            # also swallowed SystemExit/KeyboardInterrupt) -- count and skip.
            num_error += 1
            continue
        rtags = obj['pickOneTag']
        for tag in rtags:
            pid = tag['pid']
            code = tag['code']
            # Do not record pay-extra (加价购) promos.
            if code == "17":
                num17 += 1
                continue
            name = tag['name']
            content = tag['content']
            adurl = tag['adurl'] if 'adurl' in tag else ""
            tp = [sku_id, dt, pid, code, name, content, adurl, update_date]
            vlist.append(tp)
        tags = obj['tags']
        for tag in tags:
            pid = tag['pid']
            code = tag['code']
            name = tag['name'] if 'name' in tag else ""
            if code == "10":
                # Gift promo: emit one row per gift item.
                gifts = tag['gifts']
                for gift in gifts:
                    gift_name = "赠品"
                    try:
                        gift_name = gift['nm']
                        gift_num = gift['num'] if 'num' in gift else 1
                        gift_image = gift['mp'] if 'mp' in gift else ""
                        gift_sku_id = gift['sid'] if 'sid' in gift else ""
                        gift_gt = gift['gt'] if 'gt' in gift else ""
                        gift_gs = gift['gs'] if 'gs' in gift else ""
                        tp_gift = [
                            sku_id, dt, pid, code, name, gift_name, gift_num,
                            gift_image, gift_sku_id, gift_gt, gift_gs,
                            update_date
                        ]
                        glist.append(tp_gift)
                    except Exception as e:
                        logging.debug(
                            "error in extracting gift info for sku_id = %s" % sku_id)
                        logging.debug("%s" % e)
            else:
                content = tag['content']
                adurl = tag['adurl'] if 'adurl' in tag else ""
                tp = [sku_id, dt, pid, code, name, content, adurl, update_date]
                vlist.append(tp)
    logging.error(
        "IGNOR-ABLE: num of errors: %s (like json.loads error)" % num_error)
    logging.debug('num17: %s' % num17)
    logging.debug('vlist len: %s' % len(vlist))
    logging.debug('glist len: %s' % len(glist))
    sql_cb_promo_item = '''
    CREATE TABLE jd_analytic_promo_item_latest (
        sku_id bigint(20) NOT NULL,
        dt datetime NOT NULL,
        pid varchar(255) NOT NULL,
        code varchar(255) NOT NULL,
        name varchar(255) NOT NULL,
        content varchar(255) NOT NULL,
        adurl varchar(255) DEFAULT NULL,
        update_date datetime NOT NULL,
        PRIMARY KEY (sku_id,pid)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    sql_cb_promo_gift = '''
    CREATE TABLE jd_analytic_promo_gift_latest (
        sku_id bigint(20) NOT NULL,
        dt datetime NOT NULL,
        pid varchar(255) NOT NULL,
        code varchar(255) NOT NULL,
        name varchar(255) NOT NULL,
        gift_name varchar(255) NOT NULL,
        gift_num int(11) NOT NULL,
        gift_image varchar(255) DEFAULT NULL,
        gift_sku_id bigint(20) NOT NULL,
        gift_gt varchar(255) DEFAULT NULL,
        gift_gs varchar(255) DEFAULT NULL,
        update_date datetime NOT NULL,
        PRIMARY KEY (sku_id,pid)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    # Persist in DB; skip a table entirely when there is nothing to write.
    ret1 = ret2 = None
    if len(vlist) > 0:
        ret1 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_item',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_item,
        )
    if len(glist) > 0:
        ret2 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_gift',
            num_cols=len(glist[0]),
            value_list=glist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_gift,
        )
    # Record which skus carry gifts (dedup via `insert ignore`).
    sglist = []
    cur_time = timeHelper.getNowLong()
    for gg in glist:
        sglist.append([gg[0], cur_time])
    sql_gg = 'insert ignore into jd_analytic_sku_gift values(%s,%s)'
    # Fix: previously an empty sglist still hit the DB and reported
    # status -1 because the affected-row count was 0.
    ret3 = {'status': 0, 'msg': ""}
    if sglist:
        afr = dbhelper.executeSqlWriteMany(sql_gg, sglist)
        # NOTE(review): with `insert ignore`, afr == 0 also occurs when all
        # rows are duplicates -- status -1 then is arguably a false alarm;
        # kept as-is to preserve the caller-visible contract.
        ret3 = {
            'status': 0 if afr > 0 else -1,
            'msg': "",
        }
    return _generate_mixed_ret([ret1, ret2, ret3])
def _load_category_map():
    """Return the jd_category_show rows as a dict keyed by category_prefix_name.

    Each value is the full row (including category_id_prefix), as produced
    by rows_helper.transform_retrows_to_dict.
    """
    query = 'select category_id_prefix,category_prefix_name from jd_category_show'
    rows = dbhelper.executeSqlRead(query)
    return rows_helper.transform_retrows_to_dict(rows, 'category_prefix_name')
def process_promo_detail():
    """Parse promo texts into structured deduction / discount rows.

    Joins ``jd_analytic_promo_item_latest`` with price and category tables
    for rows within PROMO_ITEM_RECENCY_HOURS, then splits by promo code:

      * code 15 ("spend X, save Y"): expands the reach/deduction pairs from
        ``_extract_reach_deduction_array`` into ``jd_analytic_promo_deduction``
        rows, computing per-row ratios and ``single_discount_rate`` (the
        deduction actually achievable at the item's current price).
      * code 19 ("buy N items" promos): extracts the two numbers from the
        promo text into ``jd_analytic_promo_discount`` rows -- either a
        percent discount (deduct_type 0) or a free-item count (deduct_type 1).

    Returns:
        The combined status dict built by ``_generate_mixed_ret``.
    """
    today = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, '%Y-%m-%d %H:%M:%S')
    sql = '''
    select a.*, b.price, d.id as category_id, d.name as category_name
    from jd_analytic_promo_item_latest a
    left join jd_item_price_latest b on a.sku_id = b.sku_id
    left JOIN jd_item_category c on a.sku_id = c.sku_id
    left join jd_category d on c.category_id = d.id
    where a.dt >= "%s" and b.sku_id is not NULL and b.price is not NULL
    ''' % today
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)
    vlist = []
    vlist19 = []
    dt = timeHelper.getNowLong()
    logging.debug('num total promo_item rows: %s' % len(retrows))
    num_15 = 0
    num_19 = 0
    num_15_repeated = 0
    # Hoisted out of the row loop: the pattern is loop-invariant.
    pt = re.compile(u'[\d.]+', re.U)
    for row in retrows:
        sku_id = row['sku_id']
        code = int(row['code'])
        content = row['content'] if 'content' in row else ""
        adurl = row['adurl'] if 'adurl' in row else ""
        origin_dt = row['dt']
        pid = row['pid']
        name = row['name'] if 'name' in row else ""
        price = float("%s" % row['price'])
        category_id = row['category_id']
        category_name = row['category_name']
        if code == 15:
            num_15 += 1
            ret = _extract_reach_deduction_array(content)
            stat_has_repeat = False
            max_deduction = float(ret['max'])
            for item in ret['data']:
                # Pre-bind so the except-logger below cannot NameError when
                # float(item[0]) itself is what raised.
                reach = deduction = None
                try:
                    reach = float(item[0])
                    deduction = float(item[1])
                    is_repeat = item[2]
                    if is_repeat == 1:
                        stat_has_repeat = True
                    dr_ratio = deduction * 1.0 / reach
                    maxp_ratio = max_deduction * 1.0 / price if max_deduction > 0 else 1.0
                    could_deduct = 0
                except Exception as e:
                    logging.error("reach:%s, deduction:%s" % (reach, deduction))
                    logging.error(e)
                    continue
                # Deduction applies only once the price reaches the threshold;
                # repeatable promos apply once per full multiple of `reach`,
                # capped at max_deduction.
                if price >= reach and reach > 0:
                    if is_repeat:
                        times = price // reach
                    else:
                        times = 1
                    could_deduct = times * deduction
                    if could_deduct > max_deduction:
                        could_deduct = max_deduction
                # price==0 previously raised an uncaught ZeroDivisionError here.
                single_discount_rate = could_deduct / price if price > 0 else 0.0
                tp = [
                    sku_id, dt, price, is_repeat, reach, deduction,
                    max_deduction, dr_ratio, maxp_ratio, single_discount_rate,
                    category_id, category_name, pid, code, name, content,
                    adurl, origin_dt
                ]
                vlist.append(tp)
            if stat_has_repeat:
                num_15_repeated += 1
        elif code == 19:
            num_19 += 1
            # "Buy N items" promos: either the order total gets a percent
            # discount (总价打) or item prices are reduced / items are free
            # (商品价格).
            # deduct_type -- 0: percent discount, 1: free items, 2: unknown
            type_word_list = ["总价打", "商品价格"]
            deduct_type = 0
            for type_word in type_word_list:
                if content.find(type_word) >= 0:
                    break
                deduct_type += 1
            if deduct_type == 2:
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
                logging.error(content)
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
            pts = pt.findall(content)
            if len(pts) != 2:
                if '可购买热销商品' not in content:
                    logging.error(content)
                    logging.error("NEW PATTERN ABOVE")
            if len(pts) < 2:
                # Fewer than two numbers: no (reach, value) pair can be built.
                # Previously this fell through and raised IndexError below,
                # killing the whole run.
                continue
            reach_num = discount = free_num = rf_ratio = None
            reach_num = float(pts[0])
            if deduct_type == 0:
                discount = pts[1]
            elif deduct_type == 1:
                free_num = float(pts[1])
                # Guard: reach_num==0 previously raised ZeroDivisionError.
                rf_ratio = float(free_num * 1.0 / reach_num) if reach_num > 0 else None
            tp19 = [
                sku_id, dt, price, deduct_type, reach_num, discount,
                free_num, rf_ratio, category_id, category_name, pid, code,
                name, content, adurl, origin_dt
            ]
            vlist19.append(tp19)
        else:
            # Other promo codes are handled elsewhere (e.g. gifts in
            # processItemPromo) or intentionally ignored.
            pass
    logging.debug("code = 15, cases = %s" % num_15)
    logging.debug("code = 15, repeated = %s" % num_15_repeated)
    logging.debug("rows to insert = %s" % len(vlist))
    sql_cb_deduction = '''
    CREATE TABLE jd_analytic_promo_deduction_latest (
        sku_id bigint(20) NOT NULL,
        add_time datetime NOT NULL,
        -- title varchar(255) NOT NULL,
        price float NOT NULL,
        is_repeat tinyint(4) NOT NULL,
        reach float NOT NULL,
        deduction float NOT NULL,
        max_deduction float NOT NULL,
        dr_ratio float NOT NULL,
        maxp_ratio float NOT NULL,
        single_discount_rate float NOT NULL,
        category_id varchar(255) NOT NULL,
        category_name varchar(255) DEFAULT NULL,
        pid varchar(255) NOT NULL,
        code varchar(255) NOT NULL,
        name varchar(255) NOT NULL,
        content varchar(255) NOT NULL,
        adurl varchar(255) DEFAULT NULL,
        origin_time datetime NOT NULL,
        KEY skuid (sku_id)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    sql_cb_discount = '''
    CREATE TABLE jd_analytic_promo_discount_latest (
        sku_id bigint(20) NOT NULL,
        add_time datetime NOT NULL,
        -- title varchar(255) DEFAULT NULL,
        price float DEFAULT NULL,
        deduct_type smallint(6) DEFAULT NULL,
        reach_num smallint(6) DEFAULT NULL,
        discount float DEFAULT NULL,
        free_num smallint(6) DEFAULT NULL,
        rf_ratio float DEFAULT NULL,
        category_id varchar(255) DEFAULT NULL,
        category_name varchar(255) DEFAULT NULL,
        pid varchar(255) NOT NULL,
        code varchar(255) NOT NULL,
        name varchar(255) NOT NULL,
        content varchar(255) NOT NULL,
        adurl varchar(255) DEFAULT NULL,
        origin_dt datetime DEFAULT NULL,
        PRIMARY KEY (sku_id,pid),
        KEY skuid (sku_id)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    # Guarded persists: previously empty lists crashed on vlist[0] /
    # vlist19[0]. Mirrors the `if len(...) > 0` pattern in processItemPromo.
    pret15 = pret19 = None
    if len(vlist) > 0:
        pret15 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_deduction',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_deduction,
        )
    logging.debug("code = 19, cases = %s" % num_19)
    logging.debug("rows to insert = %s" % len(vlist19))
    if len(vlist19) > 0:
        pret19 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_discount',
            num_cols=len(vlist19[0]),
            value_list=vlist19,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_discount,
        )
    return _generate_mixed_ret([pret15, pret19])