Example #1
def calculateSkuRatingScores():
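    """Compute a score-count weighted average rating (1-5) per SKU for SKUs with
    enough comments, join with the item category, and persist the result into
    jd_analytic_rating_score (latest only, no history)."""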

    sql_cb = '''
        CREATE TABLE jd_analytic_rating_score_latest (
          sku_id bigint(20) NOT NULL,
          comment_count int(11) NOT NULL,
          rating_score float DEFAULT NULL,
          category_id varchar(255) NOT NULL,
          this_update_time datetime NOT NULL,
          PRIMARY KEY (sku_id)
          -- KEY skuid (sku_id),
          -- KEY cat_score (rating_score,category_id),
          -- KEY score (rating_score),
          -- KEY category (category_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
        '''

    sql = '''
        select

        skuid as sku_id,
        CommentCount,
        ((a.Score1Count)*1.0+(a.Score2Count)*2.0+(a.Score3Count)*3.0+(a.Score4Count)*4.0+(a.Score5Count)*5.0)/a.CommentCount as rating_score,
        category_id,
        CURRENT_TIMESTAMP() as this_update_time

        from

        jd_item_comment_count_latest a
        left join
        jd_item_category b
        on a.SkuId = b.sku_id

        where a.CommentCount is not null and a.CommentCount >= %s

        ''' % (datamining_config.
               MIN_COMMENT_NUM_SO_RATING_SCORE_STATISTICALLY_SIGNIFICANT)

    retrows = dbhelper.executeSqlRead2(sql, is_dirty=True)

    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_rating_score',
        num_cols=len(retrows[0]),
        value_list=retrows,
        need_history=False,
        is_many=True,
        sql_create_table=sql_cb,
    )

    return ret
Example #3
def generate_index():
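    """Persist the merged category_id -> category_text mapping into
    jd_index_category (latest only)."""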
    catdict = getMergedCategoryInfo()
    vlist = []
    for key in catdict:
        vlist.append([key, catdict[key]])
    sqlcb = '''
        CREATE TABLE jd_index_category_latest (
          category_id varchar(255) NOT NULL,
          category_text varchar(255) NOT NULL,
          PRIMARY KEY (category_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    return crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_index_category',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=False,
        sql_create_table=sqlcb,
    )
def calculate_min_max_price():
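    """Aggregate recent per-SKU price statistics (avg/min/median/max, sample count,
    min_ratio, LPDR) from jd_item_price and persist them into jd_analytic_price_stat
    (latest only). percentile_minx(), min_ratio() and LPDR() look like custom SQL
    aggregate functions, assumed to be registered on the database server."""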
    logging.debug('Reading jd_item_price history and calculating min/max/avg/median price for skus...')
    t1 = time.time()
    dt = timeHelper.getTimeAheadOfNowHours(datamining_config.PRICE_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    sql1 = '''
        select
            sku_id,
            AVG(price) as average_price,
            min(price) as min_price,
            -- median(price) as median_price,           -- changed 12/22
            percentile_minx(price) as median_price,
            max(price) as max_price,
            max(update_time) as origin_time,
            count(1) as sample_count,
            min_ratio(price) as min_ratio,
            LPDR(price) as LPDR

        from
        -- jd_item_dynamic                              -- changed 12/22
        jd_item_price

        where

        -- update_time > '2015-11-14 0:00:00' and  -- prices during the "Double 11" (Nov 11) sale period
        price > 0

        group by sku_id
        having max(update_time) >= '%s'
    ''' %(dt)

    logging.debug(sql1)
    retrows = dbhelper.executeSqlRead2(sql1, is_dirty=True)
    logging.debug("Done, rows to insert: %s" %len(retrows) )
    t2 = time.time()
    logging.debug('using seconds: %0.1f' %(t2-t1) )

    sql_cb = '''

        CREATE TABLE jd_analytic_price_stat_latest (
          sku_id bigint(20) NOT NULL,
          average_price float NOT NULL,
          min_price float NOT NULL,
          median_price float NOT NULL,
          max_price float NOT NULL,
          origin_time datetime NOT NULL,
          sample_count int(11) NOT NULL,
          min_ratio float NOT NULL,
          LPDR float NOT NULL,
          PRIMARY KEY (sku_id),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8

    '''

    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_price_stat',
        num_cols=len(retrows[0]),
        value_list=retrows,
        is_many=True,
        need_history=False,
        sql_create_table= sql_cb,
    )
    return ret
def generate_category_property_mapping():
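    """Map SKU properties to categories, filtering out dimension/weight-style keys
    and blacklisted keys, and persist the result into jd_analytic_property
    (latest only)."""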

    # sql = 'select * from jd_category'
    # retrows = dbhelper.executeSqlRead(sql)
    #
    # for row in retrows:
    #     category_id = row['id']
    #     category_name = row['name']
    black_list_clause = '","'.join(PROPERTY_KEY_BLACK_WORD_LIST)
    black_list_clause = '"%s"' %black_list_clause
    sql2 = '''
        select
            sku_id,
            p_key,
            p_value,
            category_id
        from
        jd_item_property_latest a
        left join
        jd_item_category b
        using (sku_id)
		where LENGTH(p_value)>3
		and LENGTH(p_value)<=30
		and not (p_value like '%%个' and length(p_value)<=10)
		and p_key<>'__DEFAULT__'
		and LENGTH(p_key)>=6
		and LENGTH(p_key)<=21
		and p_key not like '%%重%%'
		and p_key not like '%%尺寸%%'
		and p_key not like '%%厚度%%'
		and p_key not like '%%宽度%%'
		and p_key not like '%%长宽高%%'
		and p_key not like '%%mm%%'
		and p_key <> '上架时间'
		and p_key NOT IN (%s)
    ''' %(black_list_clause)

    vlist = dbhelper.executeSqlRead2(sql2, is_dirty=True)

    sql_cb = '''
        CREATE TABLE jd_analytic_property_latest (
          sku_id bigint(20) DEFAULT NULL,
          p_key varchar(255) DEFAULT NULL,
          p_value varchar(255) DEFAULT NULL,
          category_id varchar(255) DEFAULT NULL,
          KEY skuid_categoryid (sku_id,category_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    print "now writing to db..."
    t1 = time.time()

    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_property',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb,
    )
    t2 = time.time()
    print "time used: %0.0f seconds" %(t2-t1)

    return ret
def calculatePercentile():
    """
    (1) load data
    (2) hash by key: category_id, ordered dict
    (3) for each item, find it's category_id array, got it's pos and percentile
    (4) store results in db
    :return:
    """

    t1 = time.time()

    # STEP (1)
    print "step 1/4: reading data from rating_score_latest"
    sql = '''
    select * from jd_analytic_rating_score_latest
    where
    rating_score is NOT NULL and
    comment_count is not NULL AND comment_count >= %s
    -- and category_id like "670-729-%%" order by comment_count DESC
    order by category_id
    -- limit 1000
    ''' %datamining_config.MIN_SKU_NUM_PER_CATEGORY_SO_STATISTICALLY_SIGNIFICANT

    retrows = dbhelper.executeSqlRead(sql)
    t2 = time.time()
    print "Done, rows read: %s, seconds used: %0.1f" %(len(retrows), t2-t1)

    # STEP (2)

    print "step 2/4: sorting category scores..."
    print ""

    key_col = 'rating_score'

    tdict = rows_helper.transform_retrows_to_hashed_arrays(retrows, key_col_name='category_id')
    odict = {}
    for cat in tdict:
        array = tdict[cat]
        _get_ordered_array(array, key_col)
        odict[cat] = array
    t3 = time.time()
    print "Done, ordered_dict generated, num of keys = %s, time used = %0.1f" %(len(odict),t3-t2)
    print ""

    # STEP (3)

    print "step 3/4: calculating rating percentile for each sku..."
    #sku_dict = rows_helper.transform_retrows_to_dict(retrows, key_col_name='sku_id')
    for row in retrows:
        catid = row['category_id']
        myval = row[key_col]
        pt = _getPercentileGreaterThan(myval,odict[catid],key_col)
        row['percentile_'+key_col] = pt
        row['sample_num'] = len(odict[catid]) if odict[catid] is not None else 0
        # print "myval: %s\tpt: %s" %(myval,pt)
    t4 = time.time()
    print "Done, using seconds: %0.1f" %(t4-t3)
    print ""

    # Step (4)
    print 'step 4/4: storing results in db...'
    # for item in retrows:
    #     for key in item:
    #         print key
    #     break

    sql_cb = '''
    CREATE TABLE jd_analytic_rating_percentile_latest (
          sku_id bigint(20) NOT NULL,
          comment_count int(11) NOT NULL,
          this_update_time datetime NOT NULL,
          rating_score float NOT NULL,
          category_id varchar(255) NOT NULL,
          rating_sample_num int(11) DEFAULT 0,
          percentile_rating_score float DEFAULT NULL,
          PRIMARY KEY (sku_id),
          KEY skuid (sku_id)
          -- KEY cat_score (rating_score,category_id),
          -- KEY score (rating_score),
          -- KEY category (category_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8

    '''
    vlist = rows_helper.transform_retrows_arrayofdicts_to_arrayoftuples(retrows)
    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_rating_percentile',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb,
    )
    t5 = time.time()
    print "Done, rows affected: %s, time used: %0.1f" %(ret, t5-t4)
    print ""
    return ret
def process_promo_detail():
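    """Join recent promo items (jd_analytic_promo_item_latest) with latest prices and
    categories, derive metrics for code-15 "reach/deduction" promos and code-19
    "discount" promos, and persist them into jd_analytic_promo_deduction and
    jd_analytic_promo_discount (latest only)."""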
    today = timeHelper.getTimeAheadOfNowHours(datamining_config.PROMO_ITEM_RECENCY_HOURS,'%Y-%m-%d %H:%M:%S')
    # today = timeHelper.getTimeAheadOfNowDays(1)
    sql = '''
        select a.*, b.price, d.id as category_id, d.name as category_name from

        jd_analytic_promo_item_latest a
        left join
        jd_item_price_latest b
        on a.sku_id = b.sku_id

        left JOIN
        jd_item_category c
        on a.sku_id = c.sku_id

        left join
        jd_category d
        on c.category_id = d.id

        where a.dt >= "%s"
        and b.sku_id is not NULL
        and b.price is not NULL
    ''' %today
    # logging.debug(sql)
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)

    vlist = []
    vlist19 = []

    dt = timeHelper.getNowLong()

    logging.debug('num total promo_item rows: %s' %len(retrows) )
    # exit()

    num_15 = 0
    num_19 = 0
    num_15_repeated = 0

    for row in retrows:
        sku_id = row['sku_id']
        code = int(row['code'])
        content = row['content'] if 'content' in row else ""
        adurl = row['adurl'] if 'adurl' in row else ""
        origin_dt = row['dt']
        pid = row['pid']
        name = row['name'] if 'name' in row else ""
        price = float("%s" %row['price'])
        category_id = row['category_id']
        category_name = row['category_name']
        # title = row['title']
        if code == 15:
            num_15 += 1
            ret = _extract_reach_deduction_array(content)

            stat_has_repeat = False
            max_deduction = float(ret['max'])
            for item in ret['data']:
                try:
                    reach = float(item[0])
                    deduction = float(item[1])

                    is_repeat = item[2]
                    if is_repeat==1:
                        stat_has_repeat = True
                    dr_ratio = deduction*1.0/reach
                    maxp_ratio = max_deduction*1.0/price if max_deduction > 0 else 1.0
                    could_deduct = 0
                except Exception as e:
                    logging.error("bad reach/deduction item: %s" %(item,) )
                    logging.error(e)
                    continue

                if price >= reach and reach>0:
                    if is_repeat:
                        times = price // reach
                    else:
                        times = 1
                    could_deduct = times * deduction
                    if could_deduct > max_deduction:
                        could_deduct = max_deduction
                single_discount_rate = could_deduct/price
                tp =[sku_id, dt, price, is_repeat, reach, deduction, max_deduction, dr_ratio, maxp_ratio, single_discount_rate, category_id, category_name, pid, code, name, content, adurl, origin_dt]
                vlist.append(tp)

            if stat_has_repeat:
                num_15_repeated += 1

        elif code == 19:

            sku_str = "%s" %sku_id

            num_19 += 1
            # buy-N-items promos: either a discount on the total price or a reduction of the item price
            type_word_list = ["总价打","商品价格"]
            # 0: discount on total price
            # 1: reduce the item price
            # 2: other
            deduct_type = 0
            for type_word in type_word_list:
                if content.find(type_word) >= 0:
                    # deduct_type = 0
                    break
                deduct_type += 1

            if deduct_type==2:
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
                logging.error(content)
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")

            pt = re.compile(u'[\d.]+',re.U)
            pts = pt.findall(content)
            if len(pts) != 2:
                if '可购买热销商品' not in content:
                    logging.error(content)
                    logging.error("NEW PATTERN ABOVE")
            reach_num = discount = free_num = rf_ratio = None
            reach_num = float(pts[0])
            if deduct_type==0:
                discount = pts[1]
            elif deduct_type==1:
                free_num = float(pts[1])
                rf_ratio = float(free_num*1.0/reach_num)

            # tp19 =[sku_id, dt, title, price, deduct_type, reach_num, discount, free_num, rf_ratio, category_id, category_name, pid, code, name, content, adurl, origin_dt]
            tp19 =[sku_id, dt, price, deduct_type, reach_num, discount, free_num, rf_ratio, category_id, category_name, pid, code, name, content, adurl, origin_dt]
            vlist19.append(tp19)


        else:
            pass

    logging.debug("code = 15, cases = %s" %num_15)
    logging.debug("code = 15, repeated = %s" %num_15_repeated)
    logging.debug("rows to insert = %s" %len(vlist) )

    sql_cb_deduction = '''
        CREATE TABLE jd_analytic_promo_deduction_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) NOT NULL,
          price float NOT NULL,
          is_repeat tinyint(4) NOT NULL,
          reach float NOT NULL,
          deduction float NOT NULL,
          max_deduction float NOT NULL,
          dr_ratio float NOT NULL,
          maxp_ratio float NOT NULL,
          single_discount_rate float NOT NULL,
          category_id varchar(255) NOT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_time datetime NOT NULL,
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_discount = '''
        CREATE TABLE jd_analytic_promo_discount_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) DEFAULT NULL,
          price float DEFAULT NULL,
          deduct_type smallint(6) DEFAULT NULL,
          reach_num smallint(6) DEFAULT NULL,
          discount float DEFAULT NULL,
          free_num smallint(6) DEFAULT NULL,
          rf_ratio float DEFAULT NULL,
          category_id varchar(255) DEFAULT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_dt datetime DEFAULT NULL,
          PRIMARY KEY (sku_id,pid),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    pret15 = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_promo_deduction',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb_deduction,
    )

    logging.debug("code = 19, cases = %s" %num_19 )
    logging.debug("rows to insert = %s" %len(vlist19) )

    pret19 = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_promo_discount',
        num_cols=len(vlist19[0]),
        value_list=vlist19,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb_discount,
    )

    return _generate_mixed_ret([pret15, pret19])
def processItemPromo():
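    """Flatten promo_json from jd_promo_item_latest into promo rows and gift rows,
    skipping code-17 (pay-extra-to-add-item) promos, and persist them into
    jd_analytic_promo_item / jd_analytic_promo_gift; SKUs with gifts are also
    recorded in jd_analytic_sku_gift."""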
    vlist = []
    glist = []
    update_date = timeHelper.getNowLong()
    recent = timeHelper.getTimeAheadOfNowHours(datamining_config.PROMO_ITEM_RECENCY_HOURS,timeHelper.FORMAT_LONG)
    logging.debug('Reading jd_promo_item_latest...' )
    sql = '''
        select sku_id, dt, promo_json from jd_promo_item_latest
        where promo_json is not NULL and LENGTH(promo_json)>100
        and dt>="%s"
    ''' %recent
    retrows = dbhelper.executeSqlRead(sql,is_dirty=True)
    # total_rows = len(retrows)
    num_error = 0
    num17 = 0
    logging.debug('completed!')
    logging.debug("Total rows with promo_json: %s" %len(retrows))
    for row in retrows:
        sku_id = row['sku_id']
        dt = row['dt']
        obj = None
        try:
            obj = json.loads(row['promo_json'])
        except:
            num_error += 1
            continue
        rtags = obj['pickOneTag']
        for tag in rtags:
            pid = tag['pid']
            code = tag['code']
            # do not record "加价购" (pay-extra-to-add-item) promotions
            if code == "17":
                num17 += 1
                continue
            name = tag['name']
            content = tag['content']
            adurl = tag['adurl'] if 'adurl' in tag else ""
            tp = [sku_id, dt, pid, code, name, content, adurl, update_date]
            vlist.append(tp)
        tags = obj['tags']
        for tag in tags:
            pid = tag['pid']
            code = tag['code']
            name = tag['name'] if 'name' in tag else ""
            if code == "10":
                # gift
                gifts = tag['gifts']
                for gift in gifts:
                    gift_name = "赠品"
                    try:
                        gift_name = gift['nm']
                        gift_num = gift['num'] if 'num' in gift else 1
                        gift_image = gift['mp'] if 'mp' in gift else ""
                        gift_sku_id = gift['sid'] if 'sid' in gift else ""
                        gift_gt = gift['gt'] if 'gt' in gift else ""
                        gift_gs = gift['gs'] if 'gs' in gift else ""
                        tp_gift = [sku_id,dt,pid,code, name, gift_name, gift_num, gift_image, gift_sku_id, gift_gt, gift_gs, update_date]
                        glist.append(tp_gift)
                    except Exception as e:
                        logging.debug("error in extracting gift info for sku_id = %s"%sku_id)
                        logging.debug("%s" %e)
            else:
                content = tag['content']
                adurl = tag['adurl'] if 'adurl' in tag else ""
                tp = [sku_id, dt, pid, code, name, content, adurl, update_date]
                vlist.append(tp)

    logging.error("IGNORABLE: num of errors: %s (e.g. json.loads failures)" %num_error)
    logging.debug('num17: %s' %num17 )
    logging.debug('vlist len: %s' %len(vlist) )
    logging.debug('glist len: %s' %len(glist) )

    sql_cb_promo_item = '''
        CREATE TABLE jd_analytic_promo_item_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_promo_gift = '''
        CREATE TABLE jd_analytic_promo_gift_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          gift_name varchar(255) NOT NULL,
          gift_num int(11) NOT NULL,
          gift_image varchar(255) DEFAULT NULL,
          gift_sku_id bigint(20) NOT NULL,
          gift_gt varchar(255) DEFAULT NULL,
          gift_gs varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # persist in DB
    ret1 = ret2 = ret3 = None
    if len(vlist)>0:
        ret1 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_item',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_item,
        )
    if len(glist)>0:
        ret2 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_gift',
            num_cols=len(glist[0]),
            value_list=glist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_gift,
        )

        # record gift
        sglist = []
        cur_time = timeHelper.getNowLong()
        for gg in glist:
            sglist.append([gg[0],cur_time])
        sql_gg = 'insert ignore into jd_analytic_sku_gift values(%s,%s)'
        afr = dbhelper.executeSqlWriteMany(sql_gg,sglist)
        ret3 = {
            'status': 0 if afr > 0 else -1,
            'msg': "",
        }

    return _generate_mixed_ret([ret1,ret2, ret3])