def _get_deduction_dict():
    """Load recent strongest-deduction promo rows, keyed by sku_id.

    The query joins each sku's maximum ``single_discount_rate`` back onto
    ``jd_analytic_promo_deduction_latest`` (rates matching within 0.001) and
    keeps rows whose ``origin_time`` is later than the timestamp returned by
    ``timeHelper.getTimeAheadOfNowHours(FINAL_DISCOUNT_RECENCY_HOURS, ...)``.
    Every row gets its ``content`` copied to ``content_deduction`` before the
    rows are handed to ``rows_helper.transform_retrows_to_dict``.

    :return: result of ``rows_helper.transform_retrows_to_dict(rows, 'sku_id')``.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')

    # Optional single-sku restriction, only active in debug mode.
    if IF_DEBUG_SKU:
        sku_filter = " and a.sku_id=%s" % DEBUG_SKU_ID
    else:
        sku_filter = ""

    query = '''
        select * from
        (
        select
        sku_id,
        max(single_discount_rate) as max_deduction_ratio
        FROM
        jd_analytic_promo_deduction_latest
        group by sku_id
        -- having max(single_discount_rate)>0
        ) a

        left join

        jd_analytic_promo_deduction_latest b

        on
        a.sku_id = b.sku_id
        and ABS(a.max_deduction_ratio-b.single_discount_rate)<0.001

        where origin_time>'%s' %s
    ''' % (cutoff, sku_filter)

    rows = dbhelper.executeSqlRead(
        query, is_dirty=True, isolation_type='read-committed')
    # Expose the promo text under a deduction-specific key.
    for rec in rows:
        rec['content_deduction'] = rec['content']
    return rows_helper.transform_retrows_to_dict(rows, 'sku_id')
Beispiel #2
0
def _get_deduction_dict():
    """Return the recent maximum-deduction promo rows indexed by sku_id.

    Reads ``jd_analytic_promo_deduction_latest`` joined against the per-sku
    maximum of ``single_discount_rate``, filtered to rows with ``origin_time``
    after the cut-off obtained from ``timeHelper.getTimeAheadOfNowHours``.
    Each row's ``content`` value is duplicated under ``content_deduction``.

    :return: result of ``rows_helper.transform_retrows_to_dict`` keyed on
        ``sku_id``.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    recency_cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format=time_format)

    # Extra predicate limiting the query to one sku while debugging.
    debug_predicate = ""
    if IF_DEBUG_SKU:
        debug_predicate = " and a.sku_id=%s" % DEBUG_SKU_ID

    sql_template = '''
        select * from
        (
        select
        sku_id,
        max(single_discount_rate) as max_deduction_ratio
        FROM
        jd_analytic_promo_deduction_latest
        group by sku_id
        -- having max(single_discount_rate)>0
        ) a

        left join

        jd_analytic_promo_deduction_latest b

        on
        a.sku_id = b.sku_id
        and ABS(a.max_deduction_ratio-b.single_discount_rate)<0.001

        where origin_time>'%s' %s
    '''
    statement = sql_template % (recency_cutoff, debug_predicate)

    deduction_rows = dbhelper.executeSqlRead(
        statement, is_dirty=True, isolation_type='read-committed')
    for entry in deduction_rows:
        entry['content_deduction'] = entry['content']
    return rows_helper.transform_retrows_to_dict(deduction_rows, 'sku_id')
def match_discounts():
    """Join price rows with promo, first-seen and rating data and save them as ``jd_worthy``.

    Order of work: read ``jd_price_temp_latest``; load the deduction,
    discount, gift, first-seen and rating lookups; pass them through
    ``_merge_dict_under_key`` and ``_memory_left_join``; run
    ``_calculate_worthy_values`` on the joined rows; persist the rows with
    ``crawler_helper.persist_db_history_and_latest``.

    :return: the value returned by
        ``crawler_helper.persist_db_history_and_latest``.
    """
    # Optional single-sku restriction, only active in debug mode.
    if IF_DEBUG_SKU:
        sku_filter = " where sku_id=%s" % DEBUG_SKU_ID
    else:
        sku_filter = ""

    print('>>> 1/8 >>> Reading jd_price_temp_latest...')
    price_rows = dbhelper.executeSqlRead(
        'select * from jd_price_temp_latest %s' % sku_filter, is_dirty=True)
    print('rows read: %s' % len(price_rows))

    # (progress banner, loader) pairs, executed strictly in this order.
    loading_plan = [
        ('>>> 2/8 >>> Reading strongest deductions of each sku...', _get_deduction_dict),
        ('>>> 3/8 >>> Reading discounts of each sku...', _get_discount_dict),
        ('>>> 4/8 >>> Reading gifts of each sku...', _get_gift_dict),
        ('>>> 5/8 >>> Reading first seen date of each sku...', _get_item_firstseen_dict),
        ('>>> 6/8 >>> Reading ratings of each sku...', _get_rating_dict),
    ]
    lookups = []
    for banner, loader in loading_plan:
        print(banner)
        lookup = loader()
        print('rows read: %s' % len(lookup))
        lookups.append(lookup)
    deduction_dict = lookups[0]

    print('>>> 7/8 >>> Joining results in memory...')

    # The deduction lookup is the merge target; the others are merged under it.
    _merge_dict_under_key(deduction_dict, lookups[1:])

    joined_rows = _memory_left_join(
        price_rows,
        deduction_dict,
        col_name_list_left=cols_left,
        col_name_list_right=cols_deduction)
    print('rows generated: %s' % len(joined_rows))

    print('>>> 8/8 >>> Calculating worhty_values...')
    _calculate_worthy_values(joined_rows)
    print('num cols = %s ' % len(joined_rows[0]))

    print('>>> 9/9 >>> Saving to DB...')
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_worthy',
        num_cols=len(joined_rows[0]),
        value_list=joined_rows,
        is_many=True,
        need_history=False)
def _get_rating_dict():
    """Read ``jd_analytic_item_rating_diff`` and index the rows by sku_id.

    :return: result of ``rows_helper.transform_retrows_to_dict(rows, 'sku_id')``.
    """
    # Optional single-sku restriction, only active in debug mode.
    if IF_DEBUG_SKU:
        sku_filter = " where sku_id=%s" % DEBUG_SKU_ID
    else:
        sku_filter = ""

    query = '''
        select * from jd_analytic_item_rating_diff %s
    ''' % sku_filter
    rating_rows = dbhelper.executeSqlRead(query, is_dirty=True)
    return rows_helper.transform_retrows_to_dict(rating_rows, 'sku_id')
def _get_item_firstseen_dict():
    """Read ``sku_id`` / ``first_seen_date`` from ``jd_item_firstseen``, indexed by sku_id.

    :return: result of ``rows_helper.transform_retrows_to_dict(rows, 'sku_id')``.
    """
    # Optional single-sku restriction, only active in debug mode.
    if IF_DEBUG_SKU:
        sku_filter = " where sku_id=%s" % DEBUG_SKU_ID
    else:
        sku_filter = ""

    query = '''
        select sku_id,first_seen_date from jd_item_firstseen %s
    ''' % sku_filter
    first_seen_rows = dbhelper.executeSqlRead(query, is_dirty=True)
    return rows_helper.transform_retrows_to_dict(first_seen_rows, 'sku_id')
Beispiel #6
0
def _get_category_all():
    """Map every ``jd_category`` id to its name, with '/' replaced by a space.

    :return: dict of ``row['id']`` -> cleaned ``row['name']``.
    """
    category_rows = dbhelper.executeSqlRead('select * from jd_category')
    return dict(
        (rec['id'], rec['name'].replace('/', ' '))
        for rec in category_rows)
Beispiel #7
0
def _get_rating_dict():
    """Return the rows of ``jd_analytic_item_rating_diff`` keyed by sku_id.

    :return: result of ``rows_helper.transform_retrows_to_dict`` keyed on
        ``sku_id``.
    """
    # In debug mode the read is narrowed to the configured sku.
    where_clause = ""
    if IF_DEBUG_SKU:
        where_clause = " where sku_id=%s" % DEBUG_SKU_ID

    sql_template = '''
        select * from jd_analytic_item_rating_diff %s
    '''
    fetched = dbhelper.executeSqlRead(sql_template % where_clause, is_dirty=True)
    return rows_helper.transform_retrows_to_dict(fetched, 'sku_id')
def _get_category_all():
    """Build an id -> name lookup for all rows of ``jd_category``.

    Slashes in the name are replaced by single spaces.

    :return: dict keyed by ``row['id']``.
    """
    fetched = dbhelper.executeSqlRead('select * from jd_category')
    id_name_pairs = [
        (entry['id'], entry['name'].replace('/', ' ')) for entry in fetched
    ]
    return dict(id_name_pairs)
Beispiel #9
0
def _get_item_firstseen_dict():
    """Return the first-seen date rows of ``jd_item_firstseen`` keyed by sku_id.

    :return: result of ``rows_helper.transform_retrows_to_dict`` keyed on
        ``sku_id``.
    """
    # In debug mode the read is narrowed to the configured sku.
    where_clause = ""
    if IF_DEBUG_SKU:
        where_clause = " where sku_id=%s" % DEBUG_SKU_ID

    sql_template = '''
        select sku_id,first_seen_date from jd_item_firstseen %s
    '''
    fetched = dbhelper.executeSqlRead(sql_template % where_clause, is_dirty=True)
    return rows_helper.transform_retrows_to_dict(fetched, 'sku_id')
Beispiel #10
0
def _load_catalog_map_as_dict_key_category_id_prefix():
    """Return the ``jd_catalog_map`` rows as a dict, loading them once per process.

    The result is cached in the module-level ``g_catalog_map``; the table is
    only read while that cache is empty.

    NOTE(review): the function name mentions ``category_id_prefix`` but the
    rows are keyed on the ``category_id`` column -- confirm which is intended.

    :return: the cached ``g_catalog_map``.
    """
    global g_catalog_map
    if len(g_catalog_map) == 0:
        # Cache miss: read the table and remember the keyed result.
        map_rows = dbhelper.executeSqlRead("select * from jd_catalog_map")
        g_catalog_map = rows_helper.transform_retrows_to_dict(
            map_rows, "category_id")
    return g_catalog_map
def _get_display_name_dict():
    """Map each ``category_id_prefix`` in ``jd_category_show`` to a display label.

    The label is ``"<category_prefix_name> <display_name>"``.

    :return: dict of ``category_id_prefix`` -> label.
    """
    show_rows = dbhelper.executeSqlRead('select * from jd_category_show')
    return dict(
        (rec['category_id_prefix'],
         "%s %s" % (rec['category_prefix_name'], rec['display_name']))
        for rec in show_rows)
Beispiel #12
0
def match_discounts():
    """Build the ``jd_worthy`` rows from price, promo, first-seen and rating data.

    Reads ``jd_price_temp_latest``, loads five sku-keyed lookups, feeds them
    to ``_merge_dict_under_key`` and ``_memory_left_join``, applies
    ``_calculate_worthy_values`` to the joined rows and stores them through
    ``crawler_helper.persist_db_history_and_latest``.

    :return: the value returned by
        ``crawler_helper.persist_db_history_and_latest``.
    """

    def _announce_and_load(banner, loader):
        # Print the progress banner, run the loader, report its size.
        print(banner)
        loaded = loader()
        print('rows read: %s' % len(loaded))
        return loaded

    # In debug mode the price read is narrowed to the configured sku.
    where_clause = ""
    if IF_DEBUG_SKU:
        where_clause = " where sku_id=%s" % DEBUG_SKU_ID

    print('>>> 1/8 >>> Reading jd_price_temp_latest...')
    price_sql = 'select * from jd_price_temp_latest %s' % where_clause
    prices = dbhelper.executeSqlRead(price_sql, is_dirty=True)
    print('rows read: %s' % len(prices))

    deductions = _announce_and_load(
        '>>> 2/8 >>> Reading strongest deductions of each sku...',
        _get_deduction_dict)
    discounts = _announce_and_load(
        '>>> 3/8 >>> Reading discounts of each sku...',
        _get_discount_dict)
    gifts = _announce_and_load(
        '>>> 4/8 >>> Reading gifts of each sku...',
        _get_gift_dict)
    first_seen = _announce_and_load(
        '>>> 5/8 >>> Reading first seen date of each sku...',
        _get_item_firstseen_dict)
    ratings = _announce_and_load(
        '>>> 6/8 >>> Reading ratings of each sku...',
        _get_rating_dict)

    print('>>> 7/8 >>> Joining results in memory...')

    others = [discounts, gifts, first_seen, ratings]
    _merge_dict_under_key(deductions, others)

    worthy_rows = _memory_left_join(
        prices, deductions,
        col_name_list_left=cols_left,
        col_name_list_right=cols_deduction)
    print('rows generated: %s' % len(worthy_rows))

    print('>>> 8/8 >>> Calculating worhty_values...')
    _calculate_worthy_values(worthy_rows)
    print('num cols = %s ' % len(worthy_rows[0]))

    print('>>> 9/9 >>> Saving to DB...')
    outcome = crawler_helper.persist_db_history_and_latest(
        table_name='jd_worthy',
        num_cols=len(worthy_rows[0]),
        value_list=worthy_rows,
        is_many=True,
        need_history=False)
    return outcome
Beispiel #13
0
def _get_display_name_dict():
    """Return display labels for the rows of ``jd_category_show``.

    Each label joins ``category_prefix_name`` and ``display_name`` with one
    space and is stored under the row's ``category_id_prefix``.

    :return: dict of ``category_id_prefix`` -> label.
    """
    fetched = dbhelper.executeSqlRead('select * from jd_category_show')
    labelled = [
        (entry['category_id_prefix'],
         "%s %s" % (entry['category_prefix_name'], entry['display_name']))
        for entry in fetched
    ]
    return dict(labelled)
def genPropertyTable():
    """Concatenate each sku's property values and store them in ``jd_index_property``.

    Rows of ``jd_item_property_latest`` are skipped when the key is missing,
    equals ``__DEFAULT__`` or is longer than 60 characters, when the value is
    missing or equals the placeholder literal, or when replacing the
    ``PROPERTY_SPLITTER_LIST`` entries shortens the value by more than 5
    characters. Surviving values are joined per sku with single spaces.

    :return: the value returned by
        ``crawler_helper.persist_db_history_and_latest``.
    """
    print("reading...")
    property_rows = dbhelper.executeSqlRead(
        'select * from jd_item_property_latest', is_dirty=True)

    text_by_sku = {}
    for rec in property_rows:
        key = rec['p_key']
        if key is None or key == '__DEFAULT__' or key == u'__DEFAULT__' or len(key) > 60:
            continue

        raw_value = rec['p_value']
        if raw_value is None or raw_value == u'无':
            continue

        cleaned = multi_replace(raw_value, PROPERTY_SPLITTER_LIST, ' ')
        # Drop values that lost more than 5 characters to splitter replacement.
        if len(raw_value) - len(cleaned) > 5:
            continue

        sku = rec['sku_id']
        if sku in text_by_sku:
            text_by_sku[sku] = "%s %s" % (text_by_sku[sku], cleaned)
        else:
            text_by_sku[sku] = cleaned

    value_rows = [[sku, text_by_sku[sku]] for sku in text_by_sku]

    print("writing to db...")
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_index_property',
        num_cols=len(value_rows[0]),
        value_list=value_rows,
        is_many=True,
        need_history=False,
        need_flow=False,
    )
Beispiel #15
0
def genPropertyTable():
    """Build one space-joined property string per sku and persist it.

    Source table is ``jd_item_property_latest``; the destination table name
    passed to ``crawler_helper.persist_db_history_and_latest`` is
    ``jd_index_property``. A row contributes its cleaned ``p_value`` only if
    its ``p_key`` is present, is not ``__DEFAULT__`` and has at most 60
    characters, its ``p_value`` is present and is not the placeholder
    literal, and cleaning removed at most 5 characters.

    :return: the value returned by
        ``crawler_helper.persist_db_history_and_latest``.
    """
    print("reading...")
    source_sql = 'select * from jd_item_property_latest'
    fetched = dbhelper.executeSqlRead(source_sql, is_dirty=True)

    merged = {}
    for entry in fetched:
        prop_key = entry['p_key']
        if prop_key is None:
            continue
        key_is_default = prop_key == '__DEFAULT__' or prop_key == u'__DEFAULT__'
        if key_is_default:
            continue
        if not len(prop_key) <= 60:
            continue

        prop_value = entry['p_value']
        if prop_value is None:
            continue
        if prop_value == u'无':
            continue

        normalized = multi_replace(prop_value, PROPERTY_SPLITTER_LIST, ' ')
        shrink = len(prop_value) - len(normalized)
        if shrink > 5:
            continue

        owner = entry['sku_id']
        try:
            so_far = merged[owner]
        except KeyError:
            # First value seen for this sku.
            merged[owner] = normalized
        else:
            merged[owner] = "%s %s" % (so_far, normalized)

    output_rows = []
    for owner in merged:
        output_rows.append([owner, merged[owner]])

    print("writing to db...")
    outcome = crawler_helper.persist_db_history_and_latest(
        table_name='jd_index_property',
        num_cols=len(output_rows[0]),
        value_list=output_rows,
        is_many=True,
        need_history=False,
        need_flow=False,
    )
    return outcome
def _get_gift_dict():
    """Load recent rows of ``jd_analytic_promo_gift_valued`` indexed by sku_id.

    Only rows whose ``dt`` is later than the timestamp returned by
    ``timeHelper.getTimeAheadOfNowHours(FINAL_DISCOUNT_RECENCY_HOURS, ...)``
    are read.

    :return: result of ``rows_helper.transform_retrows_to_dict(rows, 'sku_id')``.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')

    # Optional single-sku restriction, only active in debug mode.
    if IF_DEBUG_SKU:
        sku_filter = " and sku_id=%s" % DEBUG_SKU_ID
    else:
        sku_filter = ""

    # In gift_valued, dt is the original crawl time (other tables use origin_time).
    query = '''
        select * from
        jd_analytic_promo_gift_valued
        where dt>'%s' %s
    ''' % (cutoff, sku_filter)

    gift_rows = dbhelper.executeSqlRead(
        query, is_dirty=True, isolation_type='read-committed')
    return rows_helper.transform_retrows_to_dict(gift_rows, 'sku_id')
def _get_discount_dict():
    """Load recent rows of ``jd_analytic_promo_discount_latest`` indexed by sku_id.

    Only rows whose ``origin_dt`` is later than the timestamp returned by
    ``timeHelper.getTimeAheadOfNowHours(FINAL_DISCOUNT_RECENCY_HOURS, ...)``
    are read. Each row's ``content`` is copied to ``content_discount``.

    :return: result of ``rows_helper.transform_retrows_to_dict(rows, 'sku_id')``.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')

    # Optional single-sku restriction, only active in debug mode.
    if IF_DEBUG_SKU:
        sku_filter = " and sku_id=%s" % DEBUG_SKU_ID
    else:
        sku_filter = ""

    query = '''
        select * from
        jd_analytic_promo_discount_latest
        where origin_dt>'%s' %s
    ''' % (cutoff, sku_filter)

    discount_rows = dbhelper.executeSqlRead(
        query, is_dirty=True, isolation_type='read-committed')
    # Expose the promo text under a discount-specific key.
    for rec in discount_rows:
        rec['content_discount'] = rec['content']
    return rows_helper.transform_retrows_to_dict(discount_rows, 'sku_id')
Beispiel #18
0
def _get_gift_dict():
    """Return recent valued-gift promo rows keyed by sku_id.

    Reads ``jd_analytic_promo_gift_valued`` for rows with ``dt`` after the
    cut-off obtained from ``timeHelper.getTimeAheadOfNowHours``.

    :return: result of ``rows_helper.transform_retrows_to_dict`` keyed on
        ``sku_id``.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    recency_cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format=time_format)

    # In debug mode the read is narrowed to the configured sku.
    debug_predicate = ""
    if IF_DEBUG_SKU:
        debug_predicate = " and sku_id=%s" % DEBUG_SKU_ID

    # For gift_valued the original crawl time lives in dt
    # (the other tables call it origin_time).
    sql_template = '''
        select * from
        jd_analytic_promo_gift_valued
        where dt>'%s' %s
    '''
    statement = sql_template % (recency_cutoff, debug_predicate)

    fetched = dbhelper.executeSqlRead(
        statement, is_dirty=True, isolation_type='read-committed')
    return rows_helper.transform_retrows_to_dict(fetched, 'sku_id')
Beispiel #19
0
def _get_discount_dict():
    """Return recent discount promo rows keyed by sku_id.

    Reads ``jd_analytic_promo_discount_latest`` for rows with ``origin_dt``
    after the cut-off obtained from ``timeHelper.getTimeAheadOfNowHours`` and
    duplicates each row's ``content`` under ``content_discount``.

    :return: result of ``rows_helper.transform_retrows_to_dict`` keyed on
        ``sku_id``.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    recency_cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format=time_format)

    # In debug mode the read is narrowed to the configured sku.
    debug_predicate = ""
    if IF_DEBUG_SKU:
        debug_predicate = " and sku_id=%s" % DEBUG_SKU_ID

    sql_template = '''
        select * from
        jd_analytic_promo_discount_latest
        where origin_dt>'%s' %s
    '''
    statement = sql_template % (recency_cutoff, debug_predicate)

    fetched = dbhelper.executeSqlRead(
        statement, is_dirty=True, isolation_type='read-committed')
    for entry in fetched:
        entry['content_discount'] = entry['content']
    return rows_helper.transform_retrows_to_dict(fetched, 'sku_id')
def do_log_user_event(device_id, query, catalog_id, remote_ip):
    """Insert one row into ``user_events`` describing a search or a catalog visit.

    With a non-empty ``query`` the event is stored with an empty catalog id
    and name. Otherwise the query is stored as an empty string and the
    catalog name is looked up in ``jd_catalog`` (``'Unknown'`` when no row
    matches).

    NOTE(review): both statements are built by string interpolation of the
    arguments. If these values come from end users this is open to SQL
    injection -- switch to a parameterized call if dbhelper offers one.

    :param device_id: value stored in the first column.
    :param query: search text, may be None or empty.
    :param catalog_id: catalog identifier, used only when there is no query.
    :param remote_ip: value stored in the last column.
    :return: the value returned by ``dbhelper.executeSqlWrite1``.
    """
    catalog_name = ""

    if query is None or len(query) == 0:
        # Catalog visit: resolve the catalog's name.
        query = ""
        lookup_sql = 'select * from jd_catalog where catalog_id="%s"' % catalog_id
        matches = dbhelper.executeSqlRead(lookup_sql)
        catalog_name = matches[0]['catalog_name'] if len(matches) > 0 else 'Unknown'
    else:
        # Search event: the catalog id is deliberately blanked.
        catalog_id = ""

    insert_sql = 'insert into user_events values("%s","%s","%s","%s","%s","%s")' % (
        device_id, query, catalog_id, catalog_name,
        timeHelper.getNowLong(), remote_ip)
    return dbhelper.executeSqlWrite1(insert_sql)
def calculate_base_rating_for_categories():
    """Persist per-category rating statistics into ``jd_analytic_category_rating``.

    Runs the query produced by ``getSqlCatRating()``, projects every result
    row onto a fixed column order and hands the rows to
    ``crawler_helper.persist_db_history_and_latest``.

    :return: the value returned by
        ``crawler_helper.persist_db_history_and_latest``.
    :raises IndexError: when the query returns no rows (``num_cols`` is taken
        from the first row), same as before.
    """
    retrows = dbhelper.executeSqlRead(getSqlCatRating(), is_dirty=True)
    print("rows of data selected for insert: %s" % len(retrows))

    # Column order of the persisted rows; must stay in sync with the target table.
    columns = (
        'category_id',
        'sample_count',
        'sum_1',
        'sum_2',
        'sum_3',
        'sum_4',
        'sum_5',
        'comment_count',
        'rating_score',
        'rate_1',
        'rate_2',
        'rate_3',
        'rate_4',
        'rate_5',
        'rate_good',
        'rate_bad',
        'origin_dt',
        'dt',
        'name',
    )
    vlist = [[row[col] for col in columns] for row in retrows]

    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_analytic_category_rating',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True
    )
Beispiel #22
0
def do_log_user_event(device_id, query, catalog_id, remote_ip):
    """Record a user event (search or catalog browse) in ``user_events``.

    A non-empty ``query`` marks a search: catalog id and name are stored as
    empty strings. Without a query, the catalog name is read from
    ``jd_catalog`` for ``catalog_id`` and defaults to ``'Unknown'``.

    NOTE(review): the SQL text is assembled with ``%`` interpolation of the
    arguments; if callers pass untrusted input this allows SQL injection.

    :param device_id: value stored in the first column.
    :param query: search text, may be None or empty.
    :param catalog_id: catalog identifier, used only when there is no query.
    :param remote_ip: value stored in the last column.
    :return: the value returned by ``dbhelper.executeSqlWrite1``.
    """
    has_query = query is not None and len(query) > 0

    if has_query:
        catalog_id = ""
        catalog_name = ""
    else:
        query = ""
        found = dbhelper.executeSqlRead(
            'select * from jd_catalog where catalog_id="%s"' % catalog_id)
        if len(found) > 0:
            catalog_name = found[0]['catalog_name']
        else:
            catalog_name = 'Unknown'

    event_values = (
        device_id,
        query,
        catalog_id,
        catalog_name,
        timeHelper.getNowLong(),
        remote_ip,
    )
    statement = 'insert into user_events values("%s","%s","%s","%s","%s","%s")' % event_values
    affected = dbhelper.executeSqlWrite1(statement)
    return affected
def _load_category_map():
    """Read the prefix id/name pairs of ``jd_category_show`` keyed by prefix name.

    :return: result of ``rows_helper.transform_retrows_to_dict`` keyed on
        ``category_prefix_name``.
    """
    prefix_rows = dbhelper.executeSqlRead(
        'select category_id_prefix,category_prefix_name from jd_category_show')
    return rows_helper.transform_retrows_to_dict(
        prefix_rows, 'category_prefix_name')
Beispiel #24
0
def calculatePercentile():
    """
    Compute, for every sku, the percentile of its rating_score within its
    category and store the result.

    (1) load rows from jd_analytic_rating_score_latest
    (2) group the rows by category_id and order each group by rating_score
    (3) for each row, look up its category's array and compute its percentile
    (4) store the enriched rows via
        crawler_helper.persist_db_history_and_lastest_empty_first
    :return: the value returned by the persist helper
    """

    t1 = time.time()

    # STEP (1)
    # Only rows with a rating_score and with comment_count at or above the
    # configured minimum are read.
    print "step 1/4: reading data from rating_score_latest"
    sql = '''
    select * from jd_analytic_rating_score_latest
    where
    rating_score is NOT NULL and
    comment_count is not NULL AND comment_count >= %s
    -- and category_id like "670-729-%%" order by comment_count DESC
    order by category_id
    -- limit 1000
    ''' % datamining_config.MIN_SKU_NUM_PER_CATEGORY_SO_STATISTICALLY_SIGNIFICANT

    retrows = dbhelper.executeSqlRead(sql)
    t2 = time.time()
    print "Done, rows read: %s, seconds used: %0.1f" % (len(retrows), t2 - t1)

    # STEP (2)

    print "step 2/4: sorting category scores..."
    print ""

    # Column the percentile is computed on.
    key_col = 'rating_score'

    tdict = rows_helper.transform_retrows_to_hashed_arrays(
        retrows, key_col_name='category_id')
    odict = {}
    for cat in tdict:
        array = tdict[cat]
        # The return value is ignored, so this presumably orders `array`
        # in place -- TODO confirm against _get_ordered_array.
        _get_ordered_array(array, key_col)
        odict[cat] = array
    t3 = time.time()
    print "Done, ordered_dict generated, num of keys = %s, time used = %0.1f" % (
        len(odict), t3 - t2)
    print ""

    # STEP (3)

    print "step 3/4: calculate rating percentile for each sku..."
    #sku_dict = rows_helper.transform_retrows_to_dict(retrows, key_col_name='sku_id')
    for row in retrows:
        catid = row['category_id']
        myval = row[key_col]
        pt = _getPercentileGreaterThan(myval, odict[catid], key_col)
        # Two keys are added to every row: 'percentile_rating_score' and
        # 'sample_num'.
        row['percentile_' + key_col] = pt
        # NOTE(review): the None check runs after odict[catid] was already
        # passed to _getPercentileGreaterThan above.
        row['sample_num'] = len(
            odict[catid]) if odict[catid] is not None else 0
        # print "myval: %s\tpt: %s" %(myval,pt)
    t4 = time.time()
    print "Done, using seconds: %0.1f" % (t4 - t3)
    print ""

    # Step (4)
    print 'step 4/4: storing results in db...'
    # for item in retrows:
    #     for key in item:
    #         print key
    #     break

    # DDL handed to the persist helper as sql_create_table.
    # NOTE(review): the row key is 'sample_num' while the column is named
    # rating_sample_num; the mapping presumably relies on value order in
    # transform_retrows_arrayofdicts_to_arrayoftuples -- verify.
    sql_cb = '''
    CREATE TABLE jd_analytic_rating_percentile_latest (
          sku_id bigint(20) NOT NULL,
          comment_count int(11) NOT NULL,
          this_update_time datetime NOT NULL,
          rating_score float NOT NULL,
          category_id varchar(255) NOT NULL,
          rating_sample_num int(11) DEFAULT 0,
          percentile_rating_score float DEFAULT NULL,
          PRIMARY KEY (sku_id),
          KEY skuid (sku_id)
          -- KEY cat_score (rating_score,category_id),
          -- KEY score (rating_score),
          -- KEY category (category_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8

    '''
    vlist = rows_helper.transform_retrows_arrayofdicts_to_arrayoftuples(
        retrows)
    # vlist[0] raises IndexError when the query returned no rows.
    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_rating_percentile',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb,
    )
    t5 = time.time()
    print "Done, rows affected: %s, time used: %0.1f" % (ret, t5 - t4)
    print ""
    return ret
def processItemPromo():
    """Explode ``promo_json`` of recent promo rows into promo and gift rows and persist them.

    Reads ``jd_promo_item_latest`` rows whose ``dt`` is at or after the
    timestamp returned by ``timeHelper.getTimeAheadOfNowHours`` for
    ``datamining_config.PROMO_ITEM_RECENCY_HOURS``. For every parsable JSON
    document:

    * each ``pickOneTag`` entry becomes a promo row, except code ``"17"``;
    * each ``tags`` entry with code ``"10"`` yields one gift row per gift;
    * every other ``tags`` entry becomes a promo row.

    Promo rows go to ``jd_analytic_promo_item``, gift rows to
    ``jd_analytic_promo_gift``; skus that have gifts are additionally
    inserted into ``jd_analytic_sku_gift``.

    :return: ``_generate_mixed_ret([ret1, ret2, ret3])`` where an entry is
        ``None`` when the corresponding write was skipped.
    """
    vlist = []  # promo rows
    glist = []  # gift rows
    update_date = timeHelper.getNowLong()
    recent = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    logging.debug('Reading jd_promo_item_latest...' )
    sql = '''
        select sku_id, dt, promo_json from jd_promo_item_latest
        where promo_json is not NULL and LENGTH(promo_json)>100
        and dt>="%s"
    ''' %recent
    retrows = dbhelper.executeSqlRead(sql,is_dirty=True)
    num_error = 0
    num17 = 0
    logging.debug('completed!')
    logging.debug("Total rows with promo_json: %s" %len(retrows))
    for row in retrows:
        sku_id = row['sku_id']
        dt = row['dt']
        try:
            obj = json.loads(row['promo_json'])
        except (ValueError, TypeError):
            # Malformed JSON is counted and skipped; a bare except here would
            # also swallow KeyboardInterrupt/SystemExit.
            num_error += 1
            continue

        for tag in obj['pickOneTag']:
            pid = tag['pid']
            code = tag['code']
            # Add-on purchase promos (code 17) are not recorded.
            if code == "17":
                num17 += 1
                continue
            name = tag['name']
            content = tag['content']
            adurl = tag.get('adurl', "")
            vlist.append([sku_id, dt, pid, code, name, content, adurl, update_date])

        for tag in obj['tags']:
            pid = tag['pid']
            code = tag['code']
            name = tag.get('name', "")
            if code == "10":
                # Gift promo: one output row per gift.
                for gift in tag['gifts']:
                    try:
                        gift_name = gift['nm']
                        gift_num = gift.get('num', 1)
                        gift_image = gift.get('mp', "")
                        gift_sku_id = gift.get('sid', "")
                        gift_gt = gift.get('gt', "")
                        gift_gs = gift.get('gs', "")
                        tp_gift = [sku_id, dt, pid, code, name, gift_name,
                                   gift_num, gift_image, gift_sku_id,
                                   gift_gt, gift_gs, update_date]
                        glist.append(tp_gift)
                    except Exception as e:
                        # Best effort: a broken gift entry is logged and skipped.
                        logging.debug("error in extracting gift info for sku_id = %s"%sku_id)
                        logging.debug("%s" %e)
            else:
                content = tag['content']
                adurl = tag.get('adurl', "")
                vlist.append([sku_id, dt, pid, code, name, content, adurl, update_date])

    logging.error("IGNOR-ABLE: num of errors: %s (like json.loads error)" %num_error)
    logging.debug('num17: %s' %num17 )
    logging.debug('vlist len: %s' %len(vlist) )
    logging.debug('glist len: %s' %len(glist) )

    sql_cb_promo_item = '''
        CREATE TABLE jd_analytic_promo_item_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_promo_gift = '''
        CREATE TABLE jd_analytic_promo_gift_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          gift_name varchar(255) NOT NULL,
          gift_num int(11) NOT NULL,
          gift_image varchar(255) DEFAULT NULL,
          gift_sku_id bigint(20) NOT NULL,
          gift_gt varchar(255) DEFAULT NULL,
          gift_gs varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # persist in DB
    # ret3 must exist even when there are no gifts; it used to be assigned
    # only inside the gift branch, making the final return raise NameError.
    ret1 = ret2 = ret3 = None
    if len(vlist)>0:
        ret1 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_item',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_item,
        )
    if len(glist)>0:
        ret2 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_gift',
            num_cols=len(glist[0]),
            value_list=glist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_gift,
        )

        # record gift: remember every sku that carried a gift promo
        cur_time = timeHelper.getNowLong()
        sglist = [[gg[0], cur_time] for gg in glist]
        sql_gg = 'insert ignore into jd_analytic_sku_gift values(%s,%s)'
        afr = dbhelper.executeSqlWriteMany(sql_gg,sglist)
        ret3 = {
            'status': 0 if afr > 0 else -1,
            'msg': "",
        }

    return _generate_mixed_ret([ret1,ret2, ret3])
Beispiel #26
0
def _get_merged_tables():
    """Run the wide join that gathers per-sku price, promo, rating and catalog columns.

    The query starts from ``jd_item_category`` and joins category, catalog
    map, price statistics, current price, dynamic item data, deduction,
    discount, gift, rating percentile, first-seen and stock tables. Only rows
    with ``pp.price > 0`` whose sku is not listed in ``jd_analytic_sku_gift``
    are returned. When ``IS_SKU_LEVEL_DEBUGGING`` is set, the result is
    further limited to ``DEBUG_SKU_ID``.

    :return: the rows returned by ``dbhelper.executeSqlRead``.
    """
    base_query = """
    select
    a.sku_id as sku_id,
    CURRENT_TIMESTAMP() as this_update_time,
    j.category_id,
    h.name as category_name,
    pp.price as current_price,
    a.average_price as average_price,
    a.median_price,
    a.min_price,
    a.max_price,
    a.min_ratio,
    a.LPDR,
    pp.price/a.median_price as discount_rate,
    k.a,
    k.b,
    k.c,
    k.j,
    k.l,
    b.title,
    b.thumbnail_url,
    b.icon_url,
    c.content as content_deduction,
    c.adurl as adurl_deduction,
    c.is_repeat,
    c.reach,
    c.deduction,
    c.max_deduction,
    c.dr_ratio,
    c.maxp_ratio,
    c.max_deduction_ratio,
    c.reach_2,
    c.deduction_2,
    c.max_dr_ratio,
    c.discount_score_2 as deduction_score,
    d.content as content_discount,
    d.adurl as adurl_discount,
    d.deduct_type,
    d.reach_num,
    d.discount,
    d.free_num,
    d.rf_ratio,
    e.gift_name,
    e.gift_num,
    e.gift_image,
    e.gift_sku_id,
    e.gift_price,
    e.gift_value,
    e.gift_ratio,
    f.comment_count,
    f.rating_score,
    f.rating_sample_num as category_rating_score,
    f.percentile_rating_score as rating_score_diff,
    g.first_seen_date,
    a.sample_count,
    CURRENT_DATE() as this_update_date,
	m.catalog_id,
	m.catalog_name

    FROM

	jd_item_category j
	left join jd_category h
	on j.category_id = h.id

	inner join jd_catalog_map m
	on h.id = m.category_id

	inner join
    jd_analytic_price_stat_latest a
	on a.sku_id = j.sku_id

    left join jd_item_price_latest pp
    on a.sku_id = pp.sku_id

    left join jd_item_dynamic_latest b
    on a.sku_id = b.sku_id

    left join jd_analytic_promo_deduction_max c
    on a.sku_id = c.sku_id

    left join jd_analytic_promo_discount_latest d
    on a.sku_id = d.sku_id

    left join jd_analytic_promo_gift_valued e
    on a.sku_id = e.sku_id

    left join jd_analytic_rating_percentile_latest f
    on a.sku_id = f.sku_id

    left join jd_item_firstseen g
    on a.sku_id = g.sku_id

    left join jd_item_stock_latest k
    on a.sku_id = k.sku_id

    where
    pp.price > 0
    and a.sku_id not in (select sku_id from jd_analytic_sku_gift)
    -- limit 100
    """

    # The clause starts with a newline so it is not swallowed by the trailing
    # "-- limit 100" SQL comment.
    debug_clause = "\n and a.sku_id = %s" % DEBUG_SKU_ID if IS_SKU_LEVEL_DEBUGGING else ""

    return dbhelper.executeSqlRead(base_query + debug_clause, is_dirty=True)
def calculatePercentile():
    """
    Compute, for every sku, the percentile of its rating_score within its
    category and store the result.

    (1) load rows from jd_analytic_rating_score_latest
    (2) group the rows by category_id and order each group by rating_score
    (3) for each row, look up its category's array and compute its percentile
    (4) store the enriched rows via
        crawler_helper.persist_db_history_and_lastest_empty_first
    :return: the value returned by the persist helper
    """

    t1 = time.time()

    # STEP (1)
    # Only rows with a rating_score and with comment_count at or above the
    # configured minimum are read.
    print "step 1/4: reading data from rating_score_latest"
    sql = '''
    select * from jd_analytic_rating_score_latest
    where
    rating_score is NOT NULL and
    comment_count is not NULL AND comment_count >= %s
    -- and category_id like "670-729-%%" order by comment_count DESC
    order by category_id
    -- limit 1000
    ''' %datamining_config.MIN_SKU_NUM_PER_CATEGORY_SO_STATISTICALLY_SIGNIFICANT

    retrows = dbhelper.executeSqlRead(sql)
    t2 = time.time()
    print "Done, rows read: %s, seconds used: %0.1f" %(len(retrows), t2-t1)

    # STEP (2)

    print "step 2/4: sorting category scores..."
    print ""

    # Column the percentile is computed on.
    key_col = 'rating_score'

    tdict = rows_helper.transform_retrows_to_hashed_arrays(retrows, key_col_name='category_id')
    odict = {}
    for cat in tdict:
        array = tdict[cat]
        # The return value is ignored, so this presumably orders `array`
        # in place -- TODO confirm against _get_ordered_array.
        _get_ordered_array(array, key_col)
        odict[cat] = array
    t3 = time.time()
    print "Done, ordered_dict generated, num of keys = %s, time used = %0.1f" %(len(odict),t3-t2)
    print ""

    # STEP (3)

    print "step 3/4: calculate rating percentile for each sku..."
    #sku_dict = rows_helper.transform_retrows_to_dict(retrows, key_col_name='sku_id')
    for row in retrows:
        catid = row['category_id']
        myval = row[key_col]
        pt = _getPercentileGreaterThan(myval,odict[catid],key_col)
        # Two keys are added to every row: 'percentile_rating_score' and
        # 'sample_num'.
        row['percentile_'+key_col] = pt
        # NOTE(review): the None check runs after odict[catid] was already
        # passed to _getPercentileGreaterThan above.
        row['sample_num'] = len(odict[catid]) if odict[catid] is not None else 0
        # print "myval: %s\tpt: %s" %(myval,pt)
    t4 = time.time()
    print "Done, using seconds: %0.1f" %(t4-t3)
    print ""

    # Step (4)
    print 'step 4/4: storing results in db...'
    # for item in retrows:
    #     for key in item:
    #         print key
    #     break

    # DDL handed to the persist helper as sql_create_table.
    # NOTE(review): the row key is 'sample_num' while the column is named
    # rating_sample_num; the mapping presumably relies on value order in
    # transform_retrows_arrayofdicts_to_arrayoftuples -- verify.
    sql_cb = '''
    CREATE TABLE jd_analytic_rating_percentile_latest (
          sku_id bigint(20) NOT NULL,
          comment_count int(11) NOT NULL,
          this_update_time datetime NOT NULL,
          rating_score float NOT NULL,
          category_id varchar(255) NOT NULL,
          rating_sample_num int(11) DEFAULT 0,
          percentile_rating_score float DEFAULT NULL,
          PRIMARY KEY (sku_id),
          KEY skuid (sku_id)
          -- KEY cat_score (rating_score,category_id),
          -- KEY score (rating_score),
          -- KEY category (category_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8

    '''
    vlist = rows_helper.transform_retrows_arrayofdicts_to_arrayoftuples(retrows)
    # vlist[0] raises IndexError when the query returned no rows.
    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_rating_percentile',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb,
    )
    t5 = time.time()
    print "Done, rows affected: %s, time used: %0.1f" %(ret, t5-t4)
    print ""
    return ret
def process_promo_detail():
    """Split recent promo items into deduction (code 15) and discount (code 19) rows.

    Reads jd_analytic_promo_item_latest rows whose ``dt`` is at or after the
    cutoff returned by ``timeHelper.getTimeAheadOfNowHours`` for
    ``datamining_config.PROMO_ITEM_RECENCY_HOURS``, joined with price and
    category, and keeps only rows that have a price.

    * code 15: every entry of ``_extract_reach_deduction_array(content)['data']``
      becomes one row for table_name 'jd_analytic_promo_deduction'.
    * code 19: the numbers found in the promo text become one row for
      table_name 'jd_analytic_promo_discount'.
    * any other code is ignored.

    Returns:
        The value of ``_generate_mixed_ret([pret15, pret19])``. An entry is
        None when there were no rows to persist for that table.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(datamining_config.PROMO_ITEM_RECENCY_HOURS, '%Y-%m-%d %H:%M:%S')
    sql = '''
        select a.*, b.price, d.id as category_id, d.name as category_name from

        jd_analytic_promo_item_latest a
        left join
        jd_item_price_latest b
        on a.sku_id = b.sku_id

        left JOIN
        jd_item_category c
        on a.sku_id = c.sku_id

        left join
        jd_category d
        on c.category_id = d.id

        where a.dt >= "%s"
        and b.sku_id is not NULL
        and b.price is not NULL
    ''' % cutoff
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)

    vlist = []
    vlist19 = []

    dt = timeHelper.getNowLong()

    logging.debug('num total promo_item rows: %s', len(retrows))

    num_15 = 0
    num_19 = 0
    num_15_repeated = 0

    # Compiled once instead of once per row; same pattern value as before,
    # written with an escaped backslash so it is not an invalid string escape.
    number_re = re.compile(u'[\\d.]+', re.U)

    # Keywords searched in code-19 promo text. The index of the first keyword
    # found is the deduct_type:
    #   0: direct discount on the total price
    #   1: reduction of the item price
    #   2: anything else (neither keyword present)
    type_word_list = ["总价打", "商品价格"]

    for row in retrows:
        sku_id = row['sku_id']
        code = int(row['code'])
        content = row['content'] if 'content' in row else ""
        adurl = row['adurl'] if 'adurl' in row else ""
        origin_dt = row['dt']
        pid = row['pid']
        name = row['name'] if 'name' in row else ""
        price = float("%s" % row['price'])
        category_id = row['category_id']
        category_name = row['category_name']

        if code == 15:
            num_15 += 1
            ret = _extract_reach_deduction_array(content)

            stat_has_repeat = False
            max_deduction = float(ret['max'])
            for item in ret['data']:
                # Reset per entry so the error log below never hits an unbound
                # name or reports values left over from the previous entry.
                reach = deduction = None
                try:
                    reach = float(item[0])
                    deduction = float(item[1])

                    is_repeat = item[2]
                    if is_repeat == 1:
                        stat_has_repeat = True
                    dr_ratio = deduction * 1.0 / reach
                    maxp_ratio = max_deduction * 1.0 / price if max_deduction > 0 else 1.0
                except Exception as e:
                    logging.error("reach:%s, deduction:%s", reach, deduction)
                    logging.error(e)
                    continue

                if price == 0:
                    # could_deduct / price is undefined for a zero price. When
                    # max_deduction > 0 the try block above already skips such
                    # an entry; do the same here instead of crashing the run.
                    logging.error("sku_id:%s has price 0, deduction entry skipped", sku_id)
                    continue

                could_deduct = 0
                if price >= reach and reach > 0:
                    # A repeatable promo applies once per full multiple of reach.
                    times = price // reach if is_repeat else 1
                    could_deduct = times * deduction
                    if could_deduct > max_deduction:
                        could_deduct = max_deduction
                single_discount_rate = could_deduct / price
                tp = [sku_id, dt, price, is_repeat, reach, deduction, max_deduction, dr_ratio, maxp_ratio, single_discount_rate, category_id, category_name, pid, code, name, content, adurl, origin_dt]
                vlist.append(tp)

            if stat_has_repeat:
                num_15_repeated += 1

        elif code == 19:
            num_19 += 1
            # "Buy N items, get a discount or a price reduction" promotions.
            deduct_type = 0
            for type_word in type_word_list:
                if content.find(type_word) >= 0:
                    break
                deduct_type += 1

            if deduct_type == 2:
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
                logging.error(content)
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")

            pts = number_re.findall(content)
            if len(pts) != 2:
                if '可购买热销商品' not in content:
                    logging.error(content)
                    logging.error("NEW PATTERN ABOVE")

            # Fields that cannot be parsed stay None; the corresponding table
            # columns are declared DEFAULT NULL.
            reach_num = discount = free_num = rf_ratio = None
            try:
                if pts:
                    reach_num = float(pts[0])
                if len(pts) > 1:
                    if deduct_type == 0:
                        discount = pts[1]
                    elif deduct_type == 1:
                        free_num = float(pts[1])
                        rf_ratio = float(free_num * 1.0 / reach_num)
            except (ValueError, ZeroDivisionError) as e:
                # The pattern also matches tokens such as "." or "1.2.3", and
                # reach_num may be 0; keep what was parsed and carry on.
                logging.error("cannot parse numbers %s for sku_id:%s", pts, sku_id)
                logging.error(e)

            tp19 = [sku_id, dt, price, deduct_type, reach_num, discount, free_num, rf_ratio, category_id, category_name, pid, code, name, content, adurl, origin_dt]
            vlist19.append(tp19)

    logging.debug("code = 15, cases = %s", num_15)
    logging.debug("code = 15, repeated = %s", num_15_repeated)
    logging.debug("rows to insert = %s", len(vlist))

    sql_cb_deduction = '''
        CREATE TABLE jd_analytic_promo_deduction_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) NOT NULL,
          price float NOT NULL,
          is_repeat tinyint(4) NOT NULL,
          reach float NOT NULL,
          deduction float NOT NULL,
          max_deduction float NOT NULL,
          dr_ratio float NOT NULL,
          maxp_ratio float NOT NULL,
          single_discount_rate float NOT NULL,
          category_id varchar(255) NOT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_time datetime NOT NULL,
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_discount = '''
        CREATE TABLE jd_analytic_promo_discount_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) DEFAULT NULL,
          price float DEFAULT NULL,
          deduct_type smallint(6) DEFAULT NULL,
          reach_num smallint(6) DEFAULT NULL,
          discount float DEFAULT NULL,
          free_num smallint(6) DEFAULT NULL,
          rf_ratio float DEFAULT NULL,
          category_id varchar(255) DEFAULT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_dt datetime DEFAULT NULL,
          PRIMARY KEY (sku_id,pid),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # vlist[0] / vlist19[0] raise IndexError on an empty list, so each table is
    # only persisted when it has rows.
    # NOTE(review): a skipped step is reported as None, the same way
    # processItemPromo passes None for a skipped step - confirm that
    # _generate_mixed_ret accepts None entries.
    pret15 = pret19 = None
    if vlist:
        pret15 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_deduction',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_deduction,
        )

    logging.debug("code = 19, cases = %s", num_19)
    logging.debug("rows to insert = %s", len(vlist19))

    if vlist19:
        pret19 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_discount',
            num_cols=len(vlist19[0]),
            value_list=vlist19,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_discount,
        )

    return _generate_mixed_ret([pret15, pret19])
Beispiel #29
0
def processItemPromo():
    """Flatten the promo_json of recent jd_promo_item_latest rows into two tables.

    Each row's promo_json is parsed and its 'pickOneTag' and 'tags' lists are
    walked:

    * 'pickOneTag' entries become promo rows, except code "17", which is
      counted and skipped;
    * 'tags' entries with code "10" yield one gift row per element of
      'gifts'; every other 'tags' entry becomes a promo row.

    Promo rows are persisted under table_name 'jd_analytic_promo_item' and gift
    rows under 'jd_analytic_promo_gift'. When gift rows exist, the sku ids are
    also inserted (insert ignore) into jd_analytic_sku_gift.

    Returns:
        The value of ``_generate_mixed_ret([ret1, ret2, ret3])``. An entry is
        None when the corresponding step had nothing to write.
    """
    vlist = []
    glist = []
    update_date = timeHelper.getNowLong()
    recent = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    logging.debug('Reading jd_promo_item_latest...')
    sql = '''
        select sku_id, dt, promo_json from jd_promo_item_latest
        where promo_json is not NULL and LENGTH(promo_json)>100
        and dt>="%s"
    ''' % recent
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)
    num_error = 0
    num17 = 0
    logging.debug('completed!')
    logging.debug("Total rows with promo_json: %s" % len(retrows))
    for row in retrows:
        sku_id = row['sku_id']
        dt = row['dt']
        try:
            obj = json.loads(row['promo_json'])
        except Exception:
            # Best effort: a row that cannot be parsed is counted and skipped.
            # "except Exception" instead of a bare except, so that
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            num_error += 1
            continue

        for tag in obj['pickOneTag']:
            pid = tag['pid']
            code = tag['code']
            # Code "17" (add-on purchase at extra cost) is not recorded.
            if code == "17":
                num17 += 1
                continue
            name = tag['name']
            content = tag['content']
            adurl = tag['adurl'] if 'adurl' in tag else ""
            vlist.append(
                [sku_id, dt, pid, code, name, content, adurl, update_date])

        for tag in obj['tags']:
            pid = tag['pid']
            code = tag['code']
            name = tag['name'] if 'name' in tag else ""
            if code == "10":
                # Gift promotion: one output row per gift.
                for gift in tag['gifts']:
                    gift_name = "赠品"
                    try:
                        gift_name = gift['nm']
                        gift_num = gift['num'] if 'num' in gift else 1
                        gift_image = gift['mp'] if 'mp' in gift else ""
                        gift_sku_id = gift['sid'] if 'sid' in gift else ""
                        gift_gt = gift['gt'] if 'gt' in gift else ""
                        gift_gs = gift['gs'] if 'gs' in gift else ""
                        tp_gift = [
                            sku_id, dt, pid, code, name, gift_name, gift_num,
                            gift_image, gift_sku_id, gift_gt, gift_gs,
                            update_date
                        ]
                        glist.append(tp_gift)
                    except Exception as e:
                        # A malformed gift entry is logged and skipped.
                        logging.debug(
                            "error in extracting gift info for sku_id = %s" %
                            sku_id)
                        logging.debug("%s" % e)
            else:
                content = tag['content']
                adurl = tag['adurl'] if 'adurl' in tag else ""
                vlist.append(
                    [sku_id, dt, pid, code, name, content, adurl, update_date])

    logging.error("IGNOR-ABLE: num of errors: %s (like json.loads error)" %
                  num_error)
    logging.debug('num17: %s' % num17)
    logging.debug('vlist len: %s' % len(vlist))
    logging.debug('glist len: %s' % len(glist))

    sql_cb_promo_item = '''
        CREATE TABLE jd_analytic_promo_item_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # NOTE(review): one tag can carry several gifts, all sharing (sku_id, pid),
    # while the primary key below is (sku_id, pid) - confirm how the persist
    # helper deals with duplicate keys.
    sql_cb_promo_gift = '''
        CREATE TABLE jd_analytic_promo_gift_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          gift_name varchar(255) NOT NULL,
          gift_num int(11) NOT NULL,
          gift_image varchar(255) DEFAULT NULL,
          gift_sku_id bigint(20) NOT NULL,
          gift_gt varchar(255) DEFAULT NULL,
          gift_gs varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # Persist in DB. ret3 is initialised together with ret1/ret2: it used to be
    # assigned only inside the gift branch, so the final return raised
    # NameError whenever no gift rows were extracted.
    ret1 = ret2 = ret3 = None
    if len(vlist) > 0:
        ret1 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_item',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_item,
        )
    if len(glist) > 0:
        ret2 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_gift',
            num_cols=len(glist[0]),
            value_list=glist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_gift,
        )

        # Record which skus have a gift: (sku_id, time of this run).
        cur_time = timeHelper.getNowLong()
        sglist = [[gg[0], cur_time] for gg in glist]
        sql_gg = 'insert ignore into jd_analytic_sku_gift values(%s,%s)'
        afr = dbhelper.executeSqlWriteMany(sql_gg, sglist)
        ret3 = {
            'status': 0 if afr > 0 else -1,
            'msg': "",
        }

    return _generate_mixed_ret([ret1, ret2, ret3])
Beispiel #30
0
def _load_category_map():
    """Load jd_category_show and hand the rows to the dict transformer.

    Selects category_id_prefix and category_prefix_name from
    jd_category_show and returns the result of
    ``rows_helper.transform_retrows_to_dict`` called with
    'category_prefix_name' as its second argument (presumably the key
    column of the resulting dict - verify against rows_helper).
    """
    query = 'select category_id_prefix,category_prefix_name from jd_category_show'
    category_rows = dbhelper.executeSqlRead(query)
    return rows_helper.transform_retrows_to_dict(category_rows,
                                                 'category_prefix_name')
Beispiel #31
0
def process_promo_detail():
    """Split recent promo items into deduction (code 15) and discount (code 19) rows.

    Reads jd_analytic_promo_item_latest rows whose ``dt`` is at or after the
    cutoff returned by ``timeHelper.getTimeAheadOfNowHours`` for
    ``datamining_config.PROMO_ITEM_RECENCY_HOURS``, joined with price and
    category, and keeps only rows that have a price.

    * code 15: every entry of ``_extract_reach_deduction_array(content)['data']``
      becomes one row for table_name 'jd_analytic_promo_deduction'.
    * code 19: the numbers found in the promo text become one row for
      table_name 'jd_analytic_promo_discount'.
    * any other code is ignored.

    Returns:
        The value of ``_generate_mixed_ret([pret15, pret19])``. An entry is
        None when there were no rows to persist for that table.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, '%Y-%m-%d %H:%M:%S')
    sql = '''
        select a.*, b.price, d.id as category_id, d.name as category_name from

        jd_analytic_promo_item_latest a
        left join
        jd_item_price_latest b
        on a.sku_id = b.sku_id

        left JOIN
        jd_item_category c
        on a.sku_id = c.sku_id

        left join
        jd_category d
        on c.category_id = d.id

        where a.dt >= "%s"
        and b.sku_id is not NULL
        and b.price is not NULL
    ''' % cutoff
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)

    vlist = []
    vlist19 = []

    dt = timeHelper.getNowLong()

    logging.debug('num total promo_item rows: %s', len(retrows))

    num_15 = 0
    num_19 = 0
    num_15_repeated = 0

    # Compiled once instead of once per row; same pattern value as before,
    # written with an escaped backslash so it is not an invalid string escape.
    number_re = re.compile(u'[\\d.]+', re.U)

    # Keywords searched in code-19 promo text. The index of the first keyword
    # found is the deduct_type:
    #   0: direct discount on the total price
    #   1: reduction of the item price
    #   2: anything else (neither keyword present)
    type_word_list = ["总价打", "商品价格"]

    for row in retrows:
        sku_id = row['sku_id']
        code = int(row['code'])
        content = row['content'] if 'content' in row else ""
        adurl = row['adurl'] if 'adurl' in row else ""
        origin_dt = row['dt']
        pid = row['pid']
        name = row['name'] if 'name' in row else ""
        price = float("%s" % row['price'])
        category_id = row['category_id']
        category_name = row['category_name']

        if code == 15:
            num_15 += 1
            ret = _extract_reach_deduction_array(content)

            stat_has_repeat = False
            max_deduction = float(ret['max'])
            for item in ret['data']:
                # Reset per entry so the error log below never hits an unbound
                # name or reports values left over from the previous entry.
                reach = deduction = None
                try:
                    reach = float(item[0])
                    deduction = float(item[1])

                    is_repeat = item[2]
                    if is_repeat == 1:
                        stat_has_repeat = True
                    dr_ratio = deduction * 1.0 / reach
                    maxp_ratio = max_deduction * 1.0 / price if max_deduction > 0 else 1.0
                except Exception as e:
                    logging.error("reach:%s, deduction:%s", reach, deduction)
                    logging.error(e)
                    continue

                if price == 0:
                    # could_deduct / price is undefined for a zero price. When
                    # max_deduction > 0 the try block above already skips such
                    # an entry; do the same here instead of crashing the run.
                    logging.error(
                        "sku_id:%s has price 0, deduction entry skipped",
                        sku_id)
                    continue

                could_deduct = 0
                if price >= reach and reach > 0:
                    # A repeatable promo applies once per full multiple of reach.
                    times = price // reach if is_repeat else 1
                    could_deduct = times * deduction
                    if could_deduct > max_deduction:
                        could_deduct = max_deduction
                single_discount_rate = could_deduct / price
                tp = [
                    sku_id, dt, price, is_repeat, reach, deduction,
                    max_deduction, dr_ratio, maxp_ratio, single_discount_rate,
                    category_id, category_name, pid, code, name, content,
                    adurl, origin_dt
                ]
                vlist.append(tp)

            if stat_has_repeat:
                num_15_repeated += 1

        elif code == 19:
            num_19 += 1
            # "Buy N items, get a discount or a price reduction" promotions.
            deduct_type = 0
            for type_word in type_word_list:
                if content.find(type_word) >= 0:
                    break
                deduct_type += 1

            if deduct_type == 2:
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
                logging.error(content)
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")

            pts = number_re.findall(content)
            if len(pts) != 2:
                if '可购买热销商品' not in content:
                    logging.error(content)
                    logging.error("NEW PATTERN ABOVE")

            # Fields that cannot be parsed stay None; the corresponding table
            # columns are declared DEFAULT NULL.
            reach_num = discount = free_num = rf_ratio = None
            try:
                if pts:
                    reach_num = float(pts[0])
                if len(pts) > 1:
                    if deduct_type == 0:
                        discount = pts[1]
                    elif deduct_type == 1:
                        free_num = float(pts[1])
                        rf_ratio = float(free_num * 1.0 / reach_num)
            except (ValueError, ZeroDivisionError) as e:
                # The pattern also matches tokens such as "." or "1.2.3", and
                # reach_num may be 0; keep what was parsed and carry on.
                logging.error("cannot parse numbers %s for sku_id:%s", pts,
                              sku_id)
                logging.error(e)

            tp19 = [
                sku_id, dt, price, deduct_type, reach_num, discount, free_num,
                rf_ratio, category_id, category_name, pid, code, name, content,
                adurl, origin_dt
            ]
            vlist19.append(tp19)

    logging.debug("code = 15, cases = %s", num_15)
    logging.debug("code = 15, repeated = %s", num_15_repeated)
    logging.debug("rows to insert = %s", len(vlist))

    sql_cb_deduction = '''
        CREATE TABLE jd_analytic_promo_deduction_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) NOT NULL,
          price float NOT NULL,
          is_repeat tinyint(4) NOT NULL,
          reach float NOT NULL,
          deduction float NOT NULL,
          max_deduction float NOT NULL,
          dr_ratio float NOT NULL,
          maxp_ratio float NOT NULL,
          single_discount_rate float NOT NULL,
          category_id varchar(255) NOT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_time datetime NOT NULL,
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_discount = '''
        CREATE TABLE jd_analytic_promo_discount_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) DEFAULT NULL,
          price float DEFAULT NULL,
          deduct_type smallint(6) DEFAULT NULL,
          reach_num smallint(6) DEFAULT NULL,
          discount float DEFAULT NULL,
          free_num smallint(6) DEFAULT NULL,
          rf_ratio float DEFAULT NULL,
          category_id varchar(255) DEFAULT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_dt datetime DEFAULT NULL,
          PRIMARY KEY (sku_id,pid),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # vlist[0] / vlist19[0] raise IndexError on an empty list, so each table is
    # only persisted when it has rows.
    # NOTE(review): a skipped step is reported as None, the same way
    # processItemPromo passes None for a skipped step - confirm that
    # _generate_mixed_ret accepts None entries.
    pret15 = pret19 = None
    if vlist:
        pret15 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_deduction',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_deduction,
        )

    logging.debug("code = 19, cases = %s", num_19)
    logging.debug("rows to insert = %s", len(vlist19))

    if vlist19:
        pret19 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_discount',
            num_cols=len(vlist19[0]),
            value_list=vlist19,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_discount,
        )

    return _generate_mixed_ret([pret15, pret19])