def get_job_update_time():
    """Return the last recorded run time of the notification job.

    Reads jd_notification_job_status for NOTIFICATION_JOB_NAME and falls
    back to the zero timestamp '0000-00-00 0:00:00' when no row exists.
    """
    sql = 'select job_time from jd_notification_job_status where job_name="%s"' % (NOTIFICATION_JOB_NAME)
    rows = dbhelper.executeSqlRead2(sql)
    # Guard-style early return instead of mutate-then-return.
    if not rows:
        return '0000-00-00 0:00:00'
    return rows[0][0]
def _get_all_devices():
    """Return the list of all device_ids from user_notification_device_latest."""
    sql = 'select device_id from user_notification_device_latest'
    retrows = dbhelper.executeSqlRead2(sql)
    # Idiom fix: list comprehension instead of a manual append loop.
    return [row[0] for row in retrows]
def calculateSkuRatingScores():
    """Compute a weighted-average rating score per sku and persist it.

    Joins comment-count rows with their category and writes the result into
    jd_analytic_rating_score (latest table only, no history). Only skus with
    at least MIN_COMMENT_NUM_SO_RATING_SCORE_STATISTICALLY_SIGNIFICANT
    comments are included.

    NOTE(review): this function is defined twice in this file; the later
    definition silently overrides this one at import time — delete one copy.

    Returns the persist helper's result, or None when no sku qualifies.
    """
    sql_cb = '''
    CREATE TABLE jd_analytic_rating_score_latest (
      sku_id bigint(20) NOT NULL,
      comment_count int(11) NOT NULL,
      rating_score float DEFAULT NULL,
      category_id varchar(255) NOT NULL,
      this_update_time datetime NOT NULL,
      PRIMARY KEY (sku_id)
      -- KEY skuid (sku_id),
      -- KEY cat_score (rating_score,category_id),
      -- KEY score (rating_score),
      -- KEY category (category_id)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    # Weighted average of the 1..5-star counts over the total comment count.
    sql = '''
    select
      skuid as sku_id,
      CommentCount,
      ((a.Score1Count)*1.0+(a.Score2Count)*2.0+(a.Score3Count)*3.0+(a.Score4Count)*4.0+(a.Score5Count)*5.0)/a.CommentCount as rating_score,
      category_id,
      CURRENT_TIMESTAMP() as this_update_time
    from jd_item_comment_count_latest a
    left join jd_item_category b on a.SkuId = b.sku_id
    where a.CommentCount is not null and a.CommentCount >= %s
    ''' % (datamining_config.MIN_COMMENT_NUM_SO_RATING_SCORE_STATISTICALLY_SIGNIFICANT)
    retrows = dbhelper.executeSqlRead2(sql, is_dirty=True)
    # BUG FIX: len(retrows[0]) raised IndexError when the query matched
    # nothing; bail out gracefully instead.
    if not retrows:
        return None
    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_rating_score',
        num_cols=len(retrows[0]),
        value_list=retrows,
        need_history=False,
        is_many=True,
        sql_create_table=sql_cb,
    )
    return ret
def calculateSkuRatingScores():
    """Compute a weighted-average rating score per sku and persist it.

    Joins comment-count rows with their category and writes the result into
    jd_analytic_rating_score (latest table only, no history). Only skus with
    at least MIN_COMMENT_NUM_SO_RATING_SCORE_STATISTICALLY_SIGNIFICANT
    comments are included.

    NOTE(review): this function is defined twice in this file; this later
    copy is the one that wins at import time — delete the other.

    Returns the persist helper's result, or None when no sku qualifies.
    """
    sql_cb = '''
    CREATE TABLE jd_analytic_rating_score_latest (
      sku_id bigint(20) NOT NULL,
      comment_count int(11) NOT NULL,
      rating_score float DEFAULT NULL,
      category_id varchar(255) NOT NULL,
      this_update_time datetime NOT NULL,
      PRIMARY KEY (sku_id)
      -- KEY skuid (sku_id),
      -- KEY cat_score (rating_score,category_id),
      -- KEY score (rating_score),
      -- KEY category (category_id)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    # Weighted average of the 1..5-star counts over the total comment count.
    sql = '''
    select
      skuid as sku_id,
      CommentCount,
      ((a.Score1Count)*1.0+(a.Score2Count)*2.0+(a.Score3Count)*3.0+(a.Score4Count)*4.0+(a.Score5Count)*5.0)/a.CommentCount as rating_score,
      category_id,
      CURRENT_TIMESTAMP() as this_update_time
    from jd_item_comment_count_latest a
    left join jd_item_category b on a.SkuId = b.sku_id
    where a.CommentCount is not null and a.CommentCount >= %s
    ''' % (datamining_config.MIN_COMMENT_NUM_SO_RATING_SCORE_STATISTICALLY_SIGNIFICANT)
    retrows = dbhelper.executeSqlRead2(sql, is_dirty=True)
    # BUG FIX: len(retrows[0]) raised IndexError when the query matched
    # nothing; bail out gracefully instead.
    if not retrows:
        return None
    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_rating_score',
        num_cols=len(retrows[0]),
        value_list=retrows,
        need_history=False,
        is_many=True,
        sql_create_table=sql_cb,
    )
    return ret
def _get_category_id_prefix_given_category_name(category_name) : # global CATEGORY_MAP # if len(CATEGORY_MAP)==0: # CATEGORY_MAP = _load_category_map() category_name = category_name.decode('utf-8') CATEGORY_MAP = _load_category_map() if category_name in CATEGORY_MAP: return [CATEGORY_MAP[category_name]['category_id_prefix'],CATEGORY_MAP[category_name]['category_prefix_name'] ] else: sql2 = 'select id from jd_category where name="%s" limit 1' %category_name retrows = dbhelper.executeSqlRead2(sql2) if len(retrows)==0: err = "Not found in category_map : %s, returning this category_name" %category_name print err return [retrows[0][0],category_name]
def getUpdatedSkuIds():
    """Return skus whose lowest-price notification row changed since the
    job's last recorded run time.

    Returns a dict: {'num': <count>, 'data': [sku_id, ...]}.
    """
    ut = get_job_update_time()
    sql = '''
    select distinct sku_id
    FROM jd_notification_history_lowest
    WHERE update_time >= "%s"
    ''' % (ut)
    retrows = dbhelper.executeSqlRead2(sql)
    # Idiom fix: list comprehension instead of a manual append loop.
    vlist = [row[0] for row in retrows]
    return {
        'num': len(vlist),
        'data': vlist,
    }
def _get_category_id_prefix_given_category_name(category_name): # global CATEGORY_MAP # if len(CATEGORY_MAP)==0: # CATEGORY_MAP = _load_category_map() category_name = category_name.decode('utf-8') CATEGORY_MAP = _load_category_map() if category_name in CATEGORY_MAP: return [ CATEGORY_MAP[category_name]['category_id_prefix'], CATEGORY_MAP[category_name]['category_prefix_name'] ] else: sql2 = 'select id from jd_category where name="%s" limit 1' % category_name retrows = dbhelper.executeSqlRead2(sql2) if len(retrows) == 0: err = "Not found in category_map : %s, returning this category_name" % category_name print err return [retrows[0][0], category_name]
def calculate_min_max_price():
    """Aggregate per-sku price statistics over the recent price history and
    persist them into jd_analytic_price_stat (latest table only).

    Stats per sku: avg/min/median/max price, last update time, sample count,
    plus the custom aggregates min_ratio and LPDR (defined server-side —
    semantics not visible here). Only skus updated within the last
    PRICE_RECENCY_HOURS are kept.

    NOTE(review): this function is defined twice in this file; the later
    definition overrides this one at import time — delete one copy.

    Returns the persist helper's result, or None when no row qualifies.
    """
    logging.debug('Reading item_dynamic history and calculate min/max/avg/median price for skus...')
    t1 = time.time()
    dt = timeHelper.getTimeAheadOfNowHours(datamining_config.PRICE_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    sql1 = '''
    select
      sku_id,
      AVG(price) as average_price,
      min(price) as min_price,
      -- median(price) as median_price, -- changed 12/22
      percentile_minx(price) as median_price,
      max(price) as max_price,
      max(update_time) as origin_time,
      count(1) as sample_count,
      min_ratio(price) as min_ratio,
      LPDR(price) as LPDR
    from
      -- jd_item_dynamic -- changed 12/22
      jd_item_price
    where
      -- update_time > '2015-11-14 0:00:00' and -- prices during the 11.11 sale
      price > 0
    group by sku_id
    having max(update_time) >= '%s'
    ''' % (dt)
    logging.debug(sql1)
    retrows = dbhelper.executeSqlRead2(sql1, is_dirty=True)
    logging.debug("Done, rows to insert: %s" % len(retrows))
    t2 = time.time()
    logging.debug('using seconds: %0.1f' % (t2 - t1))
    # BUG FIX: len(retrows[0]) raised IndexError when no sku was recent
    # enough; bail out gracefully instead.
    if not retrows:
        return None
    sql_cb = '''
    CREATE TABLE jd_analytic_price_stat_latest (
      sku_id bigint(20) NOT NULL,
      average_price float NOT NULL,
      min_price float NOT NULL,
      median_price float NOT NULL,
      max_price float NOT NULL,
      origin_time datetime NOT NULL,
      sample_count int(11) NOT NULL,
      min_ratio float NOT NULL,
      LPDR float NOT NULL,
      PRIMARY KEY (sku_id),
      KEY skuid (sku_id)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_price_stat',
        num_cols=len(retrows[0]),
        value_list=retrows,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb,
    )
    return ret
def generate_category_property_mapping(): # sql = 'select * from jd_category' # retrows = dbhelper.executeSqlRead(sql) # # for row in retrows: # category_id = row['id'] # category_name = row['name'] black_list_clause = '","'.join(PROPERTY_KEY_BLACK_WORD_LIST) black_list_clause = '"%s"' %black_list_clause sql2 = ''' select sku_id, p_key, p_value, category_id from jd_item_property_latest a left join jd_item_category b using (sku_id) where LENGTH(p_value)>3 and LENGTH(p_value)<=30 and not (p_value like '%%个' and length(p_value)<=10) and p_key<>'__DEFAULT__' and LENGTH(p_key)>=6 and LENGTH(p_key)<=21 and p_key not like '%%重%%' and p_key not like '%%尺寸%%' and p_key not like '%%厚度%%' and p_key not like '%%宽度%%' and p_key not like '%%长宽高%%' and p_key not like '%%mm%%' and p_key <> '上架时间' and p_key NOT IN (%s) ''' %(black_list_clause) vlist = dbhelper.executeSqlRead2(sql2, is_dirty=True) sql_cb = ''' CREATE TABLE jd_analytic_property_latest ( sku_id bigint(20) DEFAULT NULL, p_key varchar(255) DEFAULT NULL, p_value varchar(255) DEFAULT NULL, category_id varchar(255) DEFAULT NULL, KEY skuid_categoryid (sku_id,category_id) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 ''' print "now writing to db..." t1 = time.time() ret = crawler_helper.persist_db_history_and_lastest_empty_first( table_name='jd_analytic_property', num_cols=len(vlist[0]), value_list=vlist, is_many=True, need_history=False, sql_create_table=sql_cb, ) t2 = time.time() print "using : %0.0f" %(t2-t1) return ret
def calculate_min_max_price():
    """Aggregate per-sku price statistics over the recent price history and
    persist them into jd_analytic_price_stat (latest table only).

    Stats per sku: avg/min/median/max price, last update time, sample count,
    plus the custom aggregates min_ratio and LPDR (defined server-side —
    semantics not visible here). Only skus updated within the last
    PRICE_RECENCY_HOURS are kept.

    NOTE(review): this function is defined twice in this file; this later
    copy is the one that wins at import time — delete the other.

    Returns the persist helper's result, or None when no row qualifies.
    """
    logging.debug(
        'Reading item_dynamic history and calculate min/max/avg/median price for skus...'
    )
    t1 = time.time()
    dt = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PRICE_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    sql1 = '''
    select
      sku_id,
      AVG(price) as average_price,
      min(price) as min_price,
      -- median(price) as median_price, -- changed 12/22
      percentile_minx(price) as median_price,
      max(price) as max_price,
      max(update_time) as origin_time,
      count(1) as sample_count,
      min_ratio(price) as min_ratio,
      LPDR(price) as LPDR
    from
      -- jd_item_dynamic -- changed 12/22
      jd_item_price
    where
      -- update_time > '2015-11-14 0:00:00' and -- prices during the 11.11 sale
      price > 0
    group by sku_id
    having max(update_time) >= '%s'
    ''' % (dt)
    logging.debug(sql1)
    retrows = dbhelper.executeSqlRead2(sql1, is_dirty=True)
    logging.debug("Done, rows to insert: %s" % len(retrows))
    t2 = time.time()
    logging.debug('using seconds: %0.1f' % (t2 - t1))
    # BUG FIX: len(retrows[0]) raised IndexError when no sku was recent
    # enough; bail out gracefully instead.
    if not retrows:
        return None
    sql_cb = '''
    CREATE TABLE jd_analytic_price_stat_latest (
      sku_id bigint(20) NOT NULL,
      average_price float NOT NULL,
      min_price float NOT NULL,
      median_price float NOT NULL,
      max_price float NOT NULL,
      origin_time datetime NOT NULL,
      sample_count int(11) NOT NULL,
      min_ratio float NOT NULL,
      LPDR float NOT NULL,
      PRIMARY KEY (sku_id),
      KEY skuid (sku_id)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_price_stat',
        num_cols=len(retrows[0]),
        value_list=retrows,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb,
    )
    return ret
def generate_category_property_mapping(): # sql = 'select * from jd_category' # retrows = dbhelper.executeSqlRead(sql) # # for row in retrows: # category_id = row['id'] # category_name = row['name'] black_list_clause = '","'.join(PROPERTY_KEY_BLACK_WORD_LIST) black_list_clause = '"%s"' % black_list_clause sql2 = ''' select sku_id, p_key, p_value, category_id from jd_item_property_latest a left join jd_item_category b using (sku_id) where LENGTH(p_value)>3 and LENGTH(p_value)<=30 and not (p_value like '%%个' and length(p_value)<=10) and p_key<>'__DEFAULT__' and LENGTH(p_key)>=6 and LENGTH(p_key)<=21 and p_key not like '%%重%%' and p_key not like '%%尺寸%%' and p_key not like '%%厚度%%' and p_key not like '%%宽度%%' and p_key not like '%%长宽高%%' and p_key not like '%%mm%%' and p_key <> '上架时间' and p_key NOT IN (%s) ''' % (black_list_clause) vlist = dbhelper.executeSqlRead2(sql2, is_dirty=True) sql_cb = ''' CREATE TABLE jd_analytic_property_latest ( sku_id bigint(20) DEFAULT NULL, p_key varchar(255) DEFAULT NULL, p_value varchar(255) DEFAULT NULL, category_id varchar(255) DEFAULT NULL, KEY skuid_categoryid (sku_id,category_id) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 ''' print "now writing to db..." t1 = time.time() ret = crawler_helper.persist_db_history_and_lastest_empty_first( table_name='jd_analytic_property', num_cols=len(vlist[0]), value_list=vlist, is_many=True, need_history=False, sql_create_table=sql_cb, ) t2 = time.time() print "using : %0.0f" % (t2 - t1) return ret