def getSqlCatRating(min_comment_count=100):
    """Build the SQL that aggregates rating statistics per category.

    The query joins ``jd_item_comment_count_latest`` (restricted to rows with
    ``CommentCount >= min_comment_count``) to ``jd_item_category``, groups by
    category and then attaches the category name from ``jd_category``.

    Notes on the selected columns:
        * ``sample_count`` is ``count(1)`` per category, i.e. the number of
          SKU rows that passed the comment-count threshold. It is NOT the
          number of comments; that is ``comment_count``.
        * ``rate_*`` columns divide by ``total + 1`` (presumably to avoid a
          zero denominator), whereas ``rating_score`` divides by the plain
          total. NOTE(review): confirm this asymmetry is intended.
        * ``dt`` is the value of ``timeHelper.getNow()`` at call time,
          embedded as a string literal.

    Args:
        min_comment_count: Minimum ``CommentCount`` an item needs in order to
            be included in the statistics. Defaults to 100, the value that
            was previously hard-coded in the query.

    Returns:
        The SQL text as a string.

    Raises:
        ValueError/TypeError: if ``min_comment_count`` cannot be converted
            to an int.
    """
    today = timeHelper.getNow()
    # Coerce to int so only a number can ever be spliced into the SQL text.
    min_comment_count = int(min_comment_count)

    sql  = '''
        select e.*,'%s' as dt,c.name from (
        select
        category_id,
        count(1) as sample_count,
        sum(a.Score1Count) as sum_1,
        sum(a.Score2Count) as sum_2,
        sum(a.Score3Count) as sum_3,
        sum(a.Score4Count) as sum_4,
        sum(a.Score5Count) as sum_5,
        sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count) as comment_count,
        (sum(a.Score1Count)*1+sum(a.Score2Count)*2+sum(a.Score3Count)*3+sum(a.Score4Count)*4+sum(a.Score5Count)*5)/sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count) as rating_score,
        sum(a.Score1Count)/(sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count)+1) as rate_1,
        sum(a.Score2Count)/(sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count)+1) as rate_2,
        sum(a.Score3Count)/(sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count)+1) as rate_3,
        sum(a.Score4Count)/(sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count)+1) as rate_4,
        sum(a.Score5Count)/(sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count)+1) as rate_5,
        sum(a.Score5Count+a.Score4Count)/(sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count)+1) as rate_good,
        sum(a.Score1Count+a.Score2Count)/(sum(a.Score1Count+a.Score2Count+a.Score3Count+a.Score4Count+a.Score5Count)+1) as rate_bad,
        max(a.dt) as origin_dt

        from

        (select * from jd_item_comment_count_latest where CommentCount>=%d) a
        left join
        jd_item_category b
        on a.SkuId = b.sku_id
        -- where a.dt > '2015-10-1'

        group by b.category_id
        ) e
        left JOIN
        jd_category c
        on e.category_id = c.id

    ''' % (today, min_comment_count)
    return sql
def calculate_base_rating_for_categories():
    """Compute the per-category baseline ratings and persist them.

    Runs the query produced by ``getSqlCatRating()``, converts every result
    row into a positional value list and passes the batch to
    ``crawler_helper.persist_db_history_and_latest`` for the
    ``jd_analytic_category_rating`` table.

    Returns:
        Whatever ``persist_db_history_and_latest`` returns, or ``0`` when the
        query yielded no rows and therefore nothing was persisted.
    """
    # Values are written positionally, so this order must stay in sync with
    # the column order expected for jd_analytic_category_rating.
    columns = (
        'category_id',
        'sample_count',
        'sum_1',
        'sum_2',
        'sum_3',
        'sum_4',
        'sum_5',
        'comment_count',
        'rating_score',
        'rate_1',
        'rate_2',
        'rate_3',
        'rate_4',
        'rate_5',
        'rate_good',
        'rate_bad',
        'origin_dt',
        'dt',
        'name',
    )

    retrows = dbhelper.executeSqlRead(getSqlCatRating(), is_dirty=True)
    print("rows of data selected for insert: %s" % len(retrows))

    if not retrows:
        # Previously an empty result crashed with IndexError on vlist[0];
        # with nothing to write, skip the persistence step entirely.
        return 0

    vlist = [[row[col] for col in columns] for row in retrows]

    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_analytic_category_rating',
        num_cols=len(columns),
        value_list=vlist,
        is_many=True
    )
# Example #3
# 0
def loadCategoryList():
    """Download the JD category list and upsert it into ``jd_category``.

    The web-service response is stripped of its JSONP wrapper, parsed, and
    flattened by ``__extractCategoryList_fromJson__`` into '|'-separated
    lines. For each line with at least four fields, the first field is taken
    as the category id and the second as the category name. Lines whose first
    field is empty or contains '.com' (a URL rather than an id) are skipped.

    Returns:
        Always ``0``.
    """
    html = url_utils.getWebResponse(JD_CATEGORY_WEBSERVICE_URL, JD_ENC)
    json_str = url_utils.removeJsonP(html)
    obj = json.loads(json_str)
    clist = __extractCategoryList_fromJson__(obj)

    cat_list = []

    for item in clist:
        print(item)
        vals = item.split('|')
        if len(vals) < 4:
            print('error in length of category line')
            print(item)
            continue

        cat_id, cat_name = vals[0], vals[1]
        # A first field containing '.com' is a link, not a category id, so
        # such entries (and empty ids) produce no row.
        if not cat_id or '.com' in cat_id:
            continue

        cat_list.append((cat_id, cat_name, timeHelper.getNow()))

    # persist categories
    sql = 'replace into jd_category values(%s,%s,%s)'
    affected_rows = dbhelper.executeSqlWriteMany(sql, cat_list)

    print('rows affected : jd_category : %s' % affected_rows)

    return 0
def loadCategoryList():
    """Download the JD category list and upsert it into ``jd_category``.

    The web-service response is stripped of its JSONP wrapper, parsed, and
    flattened by ``__extractCategoryList_fromJson__`` into '|'-separated
    lines. For each line with at least four fields, the first field is taken
    as the category id and the second as the category name. Lines whose first
    field is empty or contains '.com' (a URL rather than an id) are skipped.

    Returns:
        Always ``0``.
    """
    html = url_utils.getWebResponse(JD_CATEGORY_WEBSERVICE_URL, JD_ENC)
    json_str = url_utils.removeJsonP(html)
    obj = json.loads(json_str)
    clist = __extractCategoryList_fromJson__(obj)

    cat_list = []

    for item in clist:
        print(item)
        vals = item.split('|')
        if len(vals) < 4:
            print('error in length of category line')
            print(item)
            continue

        cat_id, cat_name = vals[0], vals[1]
        # A first field containing '.com' is a link, not a category id, so
        # such entries (and empty ids) produce no row.
        if not cat_id or '.com' in cat_id:
            continue

        cat_list.append((cat_id, cat_name, timeHelper.getNow()))

    # persist categories
    sql = 'replace into jd_category values(%s,%s,%s)'
    affected_rows = dbhelper.executeSqlWriteMany(sql, cat_list)

    print('rows affected : jd_category : %s' % affected_rows)

    return 0
# Example #5
# 0
def calculate_rating_diff():
    """Refresh ``jd_analytic_item_rating_diff`` from the latest comment counts.

    Issues a single ``replace into ... select`` statement that, for every
    item in ``jd_item_comment_count_latest`` with ``CommentCount > 0``,
    stores the item's own rating metrics next to its category's baseline
    values from ``jd_analytic_category_rating_latest``:

        * ``rating_score_diff`` is the ratio item score / category score.
        * ``rate_*_diff`` columns are the relative difference to the category
          rate, in percent, passed through ``Format(..., 1)``.
        * ``dt`` is ``timeHelper.getNow()`` at call time.

    Returns:
        The result of ``dbhelper.executeSqlWrite1`` (presumably the number of
        affected rows -- verify against dbhelper).
    """
    run_stamp = timeHelper.getNow()

    statement = '''
        replace into jd_analytic_item_rating_diff

        select

        a.SkuId,
        '%s' as dt,
        c.category_id,
        d.name as category_name,
        a.CommentCount,
        c.sample_count as category_sample_count,

        ((a.Score1Count)*1.0+(a.Score2Count)*2.0+(a.Score3Count)*3.0+(a.Score4Count)*4.0+(a.Score5Count)*5.0)/a.CommentCount as rating_score,
        c.rating_score as category_rating_score,
         ((a.Score1Count)*1.0+(a.Score2Count)*2.0+(a.Score3Count)*3.0+(a.Score4Count)*4.0+(a.Score5Count)*5.0)/a.CommentCount*1.0/c.rating_score as rating_score_diff,

        (a.Score4Count+a.Score5Count)/a.CommentCount as rate_good,
        c.rate_good as category_rate_good,
        Format(((a.Score4Count+a.Score5Count)/a.CommentCount - c.rate_good)*100/c.rate_good,1) as rate_good_diff,

        (a.Score1Count+a.Score2Count)/a.CommentCount as rate_bad,
        c.rate_bad as category_rate_bad,
        Format(((a.Score1Count+a.Score2Count)/a.CommentCount - c.rate_bad)*100/c.rate_bad,1) as rate_bad_diff,

        a.Score1Count/a.CommentCount as rate_1,
        c.rate_1 as category_rate_1,
        Format((a.Score1Count/a.CommentCount - c.rate_1)*100/c.rate_1,1) as rate_1_diff,

        a.Score2Count/a.CommentCount as rate_2,
        c.rate_2 as category_rate_2,
        Format((a.Score2Count/a.CommentCount - c.rate_2)*100/c.rate_2,1) as rate_2_diff,

        a.Score3Count/a.CommentCount as rate_3,
        c.rate_3 as category_rate_3,
        Format((a.Score3Count/a.CommentCount - c.rate_3)*100/c.rate_3,1) as rate_3_diff,

        a.Score4Count/a.CommentCount as rate_4,
        c.rate_4 as category_rate_4,
        Format((a.Score4Count/a.CommentCount - c.rate_4)*100/c.rate_4,1) as rate_4_diff,

        a.Score5Count/a.CommentCount as rate_5,
        c.rate_5 as category_rate_5,
        Format((a.Score5Count/a.CommentCount - c.rate_5)*100/c.rate_5,1) as rate_5_diff,

        a.dt as item_origin_dt,
        c.dt as category_origin_dt,
        c.origin_dt as raw_origin_dt

        FROM

        jd_item_comment_count_latest a
        left JOIN
        jd_item_category b
        on a.SkuId = b.sku_id and b.sku_id is not NULL
        left join
        jd_analytic_category_rating_latest c
        on b.category_id = c.category_id
        left join
        jd_category d
        on d.id = c.category_id
        where a.CommentCount>0

    ''' % run_stamp

    # Hand the finished statement to the project's write helper.
    write_options = {
        'is_dirty': True,
        'isolation_type': 'serializable',
    }
    return dbhelper.executeSqlWrite1(statement, **write_options)
def calculate_rating_diff():
    """Refresh ``jd_analytic_item_rating_diff`` from the latest comment counts.

    Issues a single ``replace into ... select`` statement that, for every
    item in ``jd_item_comment_count_latest`` with ``CommentCount > 0``,
    stores the item's own rating metrics next to its category's baseline
    values from ``jd_analytic_category_rating_latest``:

        * ``rating_score_diff`` is the ratio item score / category score.
        * ``rate_*_diff`` columns are the relative difference to the category
          rate, in percent, passed through ``Format(..., 1)``.
        * ``dt`` is ``timeHelper.getNow()`` at call time.

    Returns:
        The result of ``dbhelper.executeSqlWrite1`` (presumably the number of
        affected rows -- verify against dbhelper).
    """
    run_stamp = timeHelper.getNow()

    statement = '''
        replace into jd_analytic_item_rating_diff

        select

        a.SkuId,
        '%s' as dt,
        c.category_id,
        d.name as category_name,
        a.CommentCount,
        c.sample_count as category_sample_count,

        ((a.Score1Count)*1.0+(a.Score2Count)*2.0+(a.Score3Count)*3.0+(a.Score4Count)*4.0+(a.Score5Count)*5.0)/a.CommentCount as rating_score,
        c.rating_score as category_rating_score,
         ((a.Score1Count)*1.0+(a.Score2Count)*2.0+(a.Score3Count)*3.0+(a.Score4Count)*4.0+(a.Score5Count)*5.0)/a.CommentCount*1.0/c.rating_score as rating_score_diff,

        (a.Score4Count+a.Score5Count)/a.CommentCount as rate_good,
        c.rate_good as category_rate_good,
        Format(((a.Score4Count+a.Score5Count)/a.CommentCount - c.rate_good)*100/c.rate_good,1) as rate_good_diff,

        (a.Score1Count+a.Score2Count)/a.CommentCount as rate_bad,
        c.rate_bad as category_rate_bad,
        Format(((a.Score1Count+a.Score2Count)/a.CommentCount - c.rate_bad)*100/c.rate_bad,1) as rate_bad_diff,

        a.Score1Count/a.CommentCount as rate_1,
        c.rate_1 as category_rate_1,
        Format((a.Score1Count/a.CommentCount - c.rate_1)*100/c.rate_1,1) as rate_1_diff,

        a.Score2Count/a.CommentCount as rate_2,
        c.rate_2 as category_rate_2,
        Format((a.Score2Count/a.CommentCount - c.rate_2)*100/c.rate_2,1) as rate_2_diff,

        a.Score3Count/a.CommentCount as rate_3,
        c.rate_3 as category_rate_3,
        Format((a.Score3Count/a.CommentCount - c.rate_3)*100/c.rate_3,1) as rate_3_diff,

        a.Score4Count/a.CommentCount as rate_4,
        c.rate_4 as category_rate_4,
        Format((a.Score4Count/a.CommentCount - c.rate_4)*100/c.rate_4,1) as rate_4_diff,

        a.Score5Count/a.CommentCount as rate_5,
        c.rate_5 as category_rate_5,
        Format((a.Score5Count/a.CommentCount - c.rate_5)*100/c.rate_5,1) as rate_5_diff,

        a.dt as item_origin_dt,
        c.dt as category_origin_dt,
        c.origin_dt as raw_origin_dt

        FROM

        jd_item_comment_count_latest a
        left JOIN
        jd_item_category b
        on a.SkuId = b.sku_id and b.sku_id is not NULL
        left join
        jd_analytic_category_rating_latest c
        on b.category_id = c.category_id
        left join
        jd_category d
        on d.id = c.category_id
        where a.CommentCount>0

    ''' % run_stamp

    # Hand the finished statement to the project's write helper.
    write_options = {
        'is_dirty': True,
        'isolation_type': 'serializable',
    }
    return dbhelper.executeSqlWrite1(statement, **write_options)