def getSqlCatRating():
    """Build the SQL text that aggregates rating statistics per category.

    The statement reads rows of ``jd_item_comment_count_latest`` whose
    ``CommentCount`` is at least 100, left-joins them to
    ``jd_item_category`` on the SKU id, groups by ``b.category_id`` and
    finally left-joins ``jd_category`` to pick up the category name.

    ``sample_count`` is ``count(1)`` over the grouped rows, i.e. the number
    of SKU rows that passed the ``CommentCount`` filter and therefore take
    part in the statistics.  It is NOT the number of comments; that value
    is exposed as ``comment_count``.

    The value of ``timeHelper.getNow()`` is interpolated into the statement
    as the literal ``dt`` column.

    Returns:
        The formatted SQL string.  Nothing is executed here.
    """
    stamp = timeHelper.getNow()

    # NOTE(review): rate_1..rate_5, rate_good and rate_bad divide by
    # (total + 1) while rating_score divides by the plain total.  Presumably
    # the +1 guards against division by zero -- confirm it is intended.
    query_template = '''
    select
        e.*,'%s' as dt,c.name
    from (
        select
            category_id,
            count(1) as sample_count,
            sum(a.Score1Count) as sum_1,
            sum(a.Score2Count) as sum_2,
            sum(a.Score3Count) as sum_3,
            sum(a.Score4Count) as sum_4,
            sum(a.Score5Count) as sum_5,
            sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count) as comment_count,
            (sum(a.Score1Count)*1+sum(a.Score2Count)*2+sum(a.Score3Count)*3+sum(a.Score4Count)*4+sum(a.Score5Count)*5)/sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count) as rating_score,
            sum(a.Score1Count)/(sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count)+1) as rate_1,
            sum(a.Score2Count)/(sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count)+1) as rate_2,
            sum(a.Score3Count)/(sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count)+1) as rate_3,
            sum(a.Score4Count)/(sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count)+1) as rate_4,
            sum(a.Score5Count)/(sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count)+1) as rate_5,
            sum(a.Score5Count+a.Score4Count)/(sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count)+1) as rate_good,
            sum(a.Score1Count+a.Score2Count)/(sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count)+1) as rate_bad,
            max(a.dt) as origin_dt
        from
            (select * from jd_item_comment_count_latest where CommentCount>=100) a
        left join
            jd_item_category b
        on
            a.SkuId = b.sku_id
        -- where a.dt > '2015-10-1'
        group by
            b.category_id
    ) e
    left JOIN
        jd_category c
    on
        e.category_id = c.id
    '''
    return query_template % (stamp)
def calculate_base_rating_for_categories():
    """Compute per-category rating statistics and persist them.

    Runs the query produced by ``getSqlCatRating()``, converts every result
    row into a list of values in a fixed column order, and hands the batch
    to ``crawler_helper.persist_db_history_and_latest`` for the
    ``jd_analytic_category_rating`` table.

    Returns:
        Whatever ``crawler_helper.persist_db_history_and_latest`` returns,
        or ``None`` when the query produced no rows (nothing is written in
        that case).
    """
    # Order of the values handed to the persistence helper; it mirrors the
    # select list of the query built by getSqlCatRating().
    columns = (
        'category_id',
        'sample_count',
        'sum_1',
        'sum_2',
        'sum_3',
        'sum_4',
        'sum_5',
        'comment_count',
        'rating_score',
        'rate_1',
        'rate_2',
        'rate_3',
        'rate_4',
        'rate_5',
        'rate_good',
        'rate_bad',
        'origin_dt',
        'dt',
        'name',
    )

    retrows = dbhelper.executeSqlRead(getSqlCatRating(), is_dirty=True)
    print("rows of data selected for insert: %s" % len(retrows))

    if not retrows:
        # The previous implementation indexed vlist[0] unconditionally and
        # raised IndexError on an empty result; skip the write instead.
        return None

    vlist = [[row[col] for col in columns] for row in retrows]

    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_analytic_category_rating',
        num_cols=len(columns),
        value_list=vlist,
        is_many=True
    )
def loadCategoryList():
    """Download the category list and store it in ``jd_category``.

    The response of ``JD_CATEGORY_WEBSERVICE_URL`` is stripped of its JSONP
    wrapper, parsed as JSON and flattened by
    ``__extractCategoryList_fromJson__`` into '|'-separated lines.  For each
    line with at least four fields, the first field is treated as a
    category id unless it contains '.com' (in which case it is a URL and
    the line is not stored).  Lines with an empty first field are not
    stored either.

    NOTE(review): this file contains a second ``loadCategoryList``
    definition further down with the same logic; Python binds the name to
    the later one.

    Returns:
        Always 0.
    """
    raw_response = url_utils.getWebResponse(JD_CATEGORY_WEBSERVICE_URL, JD_ENC)
    payload = json.loads(url_utils.removeJsonP(raw_response))
    lines = __extractCategoryList_fromJson__(payload)

    records = []
    for line in lines:
        print(line)
        fields = line.split('|')
        if len(fields) < 4:
            print('error in length of category line')
            print(line)
            continue

        head = fields[0]
        # A first field containing '.com' is a URL, not an id; empty ids
        # are dropped as well.
        if '.com' in head or len(head) == 0:
            continue

        records.append((head, fields[1], timeHelper.getNow()))

    # persist categories
    insert_sql = 'replace into jd_category values(%s,%s,%s)'
    written = dbhelper.executeSqlWriteMany(insert_sql, records)
    print('rows affected : jd_category : %s' % written)
    return 0
def loadCategoryList():
    """Fetch the category list from the web service and persist it.

    Steps:
      1. Download ``JD_CATEGORY_WEBSERVICE_URL`` and strip the JSONP wrapper.
      2. Parse the JSON and let ``__extractCategoryList_fromJson__`` turn it
         into '|'-separated lines.
      3. For every line with at least four fields, take field 0 as the
         category id (unless it contains '.com', which marks a URL) and
         field 1 as the category name.
      4. ``replace into jd_category`` one (id, name, timestamp) row per
         category that has a non-empty id.

    All rows written by one call share a single timestamp taken before the
    loop, instead of one ``timeHelper.getNow()`` call per row.

    NOTE(review): ``loadCategoryList`` is defined twice in this file with
    the same logic; this later definition is the one Python keeps.

    Returns:
        Always 0.
    """
    html = url_utils.getWebResponse(JD_CATEGORY_WEBSERVICE_URL, JD_ENC)
    json_str = url_utils.removeJsonP(html)
    obj = json.loads(json_str)
    clist = __extractCategoryList_fromJson__(obj)

    # Loop-invariant: one timestamp for the whole batch.
    now = timeHelper.getNow()

    cat_list = []
    for item in clist:
        print(item)
        vals = item.split('|')
        if len(vals) < 4:
            print('error in length of category line')
            print(item)
            continue

        cat_id = vals[0]
        cat_name = vals[1]
        # Entries whose first field is a URL carry no category id; entries
        # with an empty id cannot be keyed.  Neither is stored.
        if '.com' in cat_id or not cat_id:
            continue

        cat_list.append((cat_id, cat_name, now))

    # persist categories
    sql = 'replace into jd_category values(%s,%s,%s)'
    affected_rows = dbhelper.executeSqlWriteMany(sql, cat_list)
    print('rows affected : jd_category : %s' % affected_rows)
    return 0
def calculate_rating_diff():
    """Refresh ``jd_analytic_item_rating_diff`` with item-vs-category ratings.

    For every row of ``jd_item_comment_count_latest`` with
    ``CommentCount > 0`` the statement computes the item's own rating score
    and score-share rates, pulls the matching category figures from
    ``jd_analytic_category_rating_latest`` (via ``jd_item_category``) and
    writes both, plus their comparison columns, with ``replace into``.

    Column notes, as written in the SQL:
      * ``rating_score_diff`` is item rating divided by category rating.
      * ``rate_*_diff`` is the relative difference in percent, passed
        through ``Format(..., 1)``.

    NOTE(review): MySQL ``FORMAT()`` returns a string that may include
    thousands separators -- confirm the ``*_diff`` target columns are meant
    to receive that.
    NOTE(review): this file contains a second ``calculate_rating_diff``
    definition with the same statement; the later one is what Python binds.

    Returns:
        The result of ``dbhelper.executeSqlWrite1`` (presumably the
        affected row count -- confirm against the helper).
    """
    run_stamp = timeHelper.getNow()

    statement = '''
    replace into jd_analytic_item_rating_diff
    select
        a.SkuId,
        '%s' as dt,
        c.category_id,
        d.name as category_name,
        a.CommentCount,
        c.sample_count as category_sample_count,

        ((a.Score1Count)*1.0+(a.Score2Count)*2.0+(a.Score3Count)*3.0+(a.Score4Count)*4.0+(a.Score5Count)*5.0)/a.CommentCount as rating_score,
        c.rating_score as category_rating_score,
        ((a.Score1Count)*1.0+(a.Score2Count)*2.0+(a.Score3Count)*3.0+(a.Score4Count)*4.0+(a.Score5Count)*5.0)/a.CommentCount*1.0/c.rating_score as rating_score_diff,

        (a.Score4Count+a.Score5Count)/a.CommentCount as rate_good,
        c.rate_good as category_rate_good,
        Format(((a.Score4Count+a.Score5Count)/a.CommentCount - c.rate_good)*100/c.rate_good,1) as rate_good_diff,

        (a.Score1Count+a.Score2Count)/a.CommentCount as rate_bad,
        c.rate_bad as category_rate_bad,
        Format(((a.Score1Count+a.Score2Count)/a.CommentCount - c.rate_bad)*100/c.rate_bad,1) as rate_bad_diff,

        a.Score1Count/a.CommentCount as rate_1,
        c.rate_1 as category_rate_1,
        Format((a.Score1Count/a.CommentCount - c.rate_1)*100/c.rate_1,1) as rate_1_diff,

        a.Score2Count/a.CommentCount as rate_2,
        c.rate_2 as category_rate_2,
        Format((a.Score2Count/a.CommentCount - c.rate_2)*100/c.rate_2,1) as rate_2_diff,

        a.Score3Count/a.CommentCount as rate_3,
        c.rate_3 as category_rate_3,
        Format((a.Score3Count/a.CommentCount - c.rate_3)*100/c.rate_3,1) as rate_3_diff,

        a.Score4Count/a.CommentCount as rate_4,
        c.rate_4 as category_rate_4,
        Format((a.Score4Count/a.CommentCount - c.rate_4)*100/c.rate_4,1) as rate_4_diff,

        a.Score5Count/a.CommentCount as rate_5,
        c.rate_5 as category_rate_5,
        Format((a.Score5Count/a.CommentCount - c.rate_5)*100/c.rate_5,1) as rate_5_diff,

        a.dt as item_origin_dt,
        c.dt as category_origin_dt,
        c.origin_dt as raw_origin_dt

    FROM
        jd_item_comment_count_latest a
    left JOIN
        jd_item_category b
    on
        a.SkuId = b.sku_id and b.sku_id is not NULL
    left join
        jd_analytic_category_rating_latest c
    on
        b.category_id = c.category_id
    left join
        jd_category d
    on
        d.id = c.category_id
    where
        a.CommentCount>0
    ''' % run_stamp

    return dbhelper.executeSqlWrite1(
        statement,
        is_dirty=True,
        isolation_type='serializable'
    )
def calculate_rating_diff():
    """Rebuild the per-item rating comparison table.

    Executes one ``replace into jd_analytic_item_rating_diff ... select``
    statement.  The select walks ``jd_item_comment_count_latest`` (rows with
    ``CommentCount > 0``), left-joins ``jd_item_category``,
    ``jd_analytic_category_rating_latest`` and ``jd_category``, and emits
    for each SKU:

      * its own rating score and rate_1..rate_5 / rate_good / rate_bad,
      * the same figures for its category,
      * comparison columns: ``rating_score_diff`` (item score divided by
        category score) and ``rate_*_diff`` (percentage difference relative
        to the category rate, wrapped in ``Format(..., 1)``).

    The current value of ``timeHelper.getNow()`` is written as ``dt``.

    NOTE(review): because the joins are left joins, SKUs without a category
    match still produce a row with NULL category columns -- confirm that is
    wanted.
    NOTE(review): ``calculate_rating_diff`` appears twice in this file; this
    later definition is the one in effect.

    Returns:
        The value returned by ``dbhelper.executeSqlWrite1``.
    """
    template = '''
    replace into jd_analytic_item_rating_diff
    select
        a.SkuId,
        '%s' as dt,
        c.category_id,
        d.name as category_name,
        a.CommentCount,
        c.sample_count as category_sample_count,

        ((a.Score1Count)*1.0+(a.Score2Count)*2.0+(a.Score3Count)*3.0+(a.Score4Count)*4.0+(a.Score5Count)*5.0)/a.CommentCount as rating_score,
        c.rating_score as category_rating_score,
        ((a.Score1Count)*1.0+(a.Score2Count)*2.0+(a.Score3Count)*3.0+(a.Score4Count)*4.0+(a.Score5Count)*5.0)/a.CommentCount*1.0/c.rating_score as rating_score_diff,

        (a.Score4Count+a.Score5Count)/a.CommentCount as rate_good,
        c.rate_good as category_rate_good,
        Format(((a.Score4Count+a.Score5Count)/a.CommentCount - c.rate_good)*100/c.rate_good,1) as rate_good_diff,

        (a.Score1Count+a.Score2Count)/a.CommentCount as rate_bad,
        c.rate_bad as category_rate_bad,
        Format(((a.Score1Count+a.Score2Count)/a.CommentCount - c.rate_bad)*100/c.rate_bad,1) as rate_bad_diff,

        a.Score1Count/a.CommentCount as rate_1,
        c.rate_1 as category_rate_1,
        Format((a.Score1Count/a.CommentCount - c.rate_1)*100/c.rate_1,1) as rate_1_diff,

        a.Score2Count/a.CommentCount as rate_2,
        c.rate_2 as category_rate_2,
        Format((a.Score2Count/a.CommentCount - c.rate_2)*100/c.rate_2,1) as rate_2_diff,

        a.Score3Count/a.CommentCount as rate_3,
        c.rate_3 as category_rate_3,
        Format((a.Score3Count/a.CommentCount - c.rate_3)*100/c.rate_3,1) as rate_3_diff,

        a.Score4Count/a.CommentCount as rate_4,
        c.rate_4 as category_rate_4,
        Format((a.Score4Count/a.CommentCount - c.rate_4)*100/c.rate_4,1) as rate_4_diff,

        a.Score5Count/a.CommentCount as rate_5,
        c.rate_5 as category_rate_5,
        Format((a.Score5Count/a.CommentCount - c.rate_5)*100/c.rate_5,1) as rate_5_diff,

        a.dt as item_origin_dt,
        c.dt as category_origin_dt,
        c.origin_dt as raw_origin_dt

    FROM
        jd_item_comment_count_latest a
    left JOIN
        jd_item_category b
    on
        a.SkuId = b.sku_id and b.sku_id is not NULL
    left join
        jd_analytic_category_rating_latest c
    on
        b.category_id = c.category_id
    left join
        jd_category d
    on
        d.id = c.category_id
    where
        a.CommentCount>0
    '''
    final_sql = template % timeHelper.getNow()

    outcome = dbhelper.executeSqlWrite1(
        final_sql, is_dirty=True, isolation_type='serializable')
    return outcome