Example #1
0
def _get_deduction_dict():
    """Fetch the recent top-deduction promo rows and index them by sku_id.

    The query picks, per SKU, the maximum ``single_discount_rate`` found in
    ``jd_analytic_promo_deduction_latest`` and joins it back onto the same
    table (rates within 0.001 of the maximum match), keeping only rows whose
    ``origin_time`` is later than the cutoff string obtained from
    ``timeHelper.getTimeAheadOfNowHours(FINAL_DISCOUNT_RECENCY_HOURS, ...)``.
    When ``IF_DEBUG_SKU`` is truthy the result is further limited to
    ``DEBUG_SKU_ID``.

    Every fetched row gets its ``content`` value copied to a
    ``content_deduction`` key before the rows are handed to
    ``rows_helper.transform_retrows_to_dict`` keyed on ``'sku_id'``; that
    helper's result is returned as is.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')

    # Optional single-SKU restriction used while debugging.
    sku_filter = ""
    if IF_DEBUG_SKU:
        sku_filter = " and a.sku_id=%s" % DEBUG_SKU_ID

    query = '''
        select * from
        (
        select
        sku_id,
        max(single_discount_rate) as max_deduction_ratio
        FROM
        jd_analytic_promo_deduction_latest
        group by sku_id
        -- having max(single_discount_rate)>0
        ) a

        left join

        jd_analytic_promo_deduction_latest b

        on
        a.sku_id = b.sku_id
        and ABS(a.max_deduction_ratio-b.single_discount_rate)<0.001

        where origin_time>'%s' %s
    ''' % (cutoff, sku_filter)

    rows = dbhelper.executeSqlRead(
        query, is_dirty=True, isolation_type='read-committed')
    # Expose the promo text under a deduction-specific key.
    for record in rows:
        record['content_deduction'] = record['content']
    return rows_helper.transform_retrows_to_dict(rows, 'sku_id')
Example #2
0
def getWorthyInfo_of_skuid_list(sku_id_list):
    """Read the ``jd_worthy_latest`` rows for the given SKU ids.

    Args:
        sku_id_list: sized collection of SKU ids; each id is rendered with
            ``"%s"`` before being placed in the SQL text.

    Returns:
        ``[]`` when ``sku_id_list`` is empty, otherwise whatever
        ``dbhelper_read.executeSqlRead`` returns for the query. Rows are
        ordered by ``instr(<comma-joined ids>, sku_id)``, i.e. by where the
        id first occurs in the joined id string.

    The recency cutoff and the ``in (...)`` id list are still substituted
    into the statement, but only into lines that are SQL comments (``--``);
    the effective filter is the OR-chain of ``sku_id = <id>`` terms.
    """
    if not len(sku_id_list):
        return []

    id_strings = ["%s" % sku for sku in sku_id_list]
    cutoff = timeHelper.getTimeAheadOfNowHours(
        service_config.SKU_LIST_APP_WORTHY_RECENCY_HOURS,
        timeHelper.FORMAT_LONG)
    joined_ids = ','.join(id_strings)

    or_terms = ["sku_id = %s" % sku for sku in id_strings]
    or_clause = "(" + ' OR '.join(or_terms) + ")"

    query = '''
            select
            *, instr('%s',sku_id) as dd
            from
            jd_worthy_latest
            where
            -- this_update_time > '%s'
            -- and sku_id in (%s)
             %s
            order by dd ASC
        ''' % (joined_ids, cutoff, joined_ids, or_clause)
    return dbhelper_read.executeSqlRead(query, is_dirty=True)
def _get_deduction_dict():
    """Return recent maximum-deduction promo rows indexed by sku_id.

    For every SKU the statement computes the largest
    ``single_discount_rate`` in ``jd_analytic_promo_deduction_latest`` and
    left-joins the table onto that aggregate (a row matches when its rate is
    within 0.001 of the maximum). Only rows with ``origin_time`` greater
    than the cutoff string from
    ``timeHelper.getTimeAheadOfNowHours(FINAL_DISCOUNT_RECENCY_HOURS, ...)``
    are kept, optionally narrowed to ``DEBUG_SKU_ID`` when ``IF_DEBUG_SKU``
    is truthy.

    Each row's ``content`` is duplicated under ``content_deduction`` and the
    rows are then passed to ``rows_helper.transform_retrows_to_dict`` with
    ``'sku_id'`` as the key column; the helper's result is returned.
    """
    since = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')

    # Extra predicate that pins the query to one SKU in debug mode.
    if IF_DEBUG_SKU:
        debug_predicate = " and a.sku_id=%s" % DEBUG_SKU_ID
    else:
        debug_predicate = ""

    statement = '''
        select * from
        (
        select
        sku_id,
        max(single_discount_rate) as max_deduction_ratio
        FROM
        jd_analytic_promo_deduction_latest
        group by sku_id
        -- having max(single_discount_rate)>0
        ) a

        left join

        jd_analytic_promo_deduction_latest b

        on
        a.sku_id = b.sku_id
        and ABS(a.max_deduction_ratio-b.single_discount_rate)<0.001

        where origin_time>'%s' %s
    ''' % (since, debug_predicate)

    fetched = dbhelper.executeSqlRead(
        statement, is_dirty=True, isolation_type='read-committed')
    for entry in fetched:
        entry['content_deduction'] = entry['content']
    return rows_helper.transform_retrows_to_dict(fetched, 'sku_id')
def _get_gift_dict():
    """Return recent rows of ``jd_analytic_promo_gift_valued`` keyed by sku_id.

    Rows are selected when their ``dt`` is greater than the cutoff string
    from ``timeHelper.getTimeAheadOfNowHours(FINAL_DISCOUNT_RECENCY_HOURS,
    ...)``; when ``IF_DEBUG_SKU`` is truthy only ``DEBUG_SKU_ID`` is read.
    The rows are indexed with ``rows_helper.transform_retrows_to_dict`` on
    ``'sku_id'`` and that helper's result is returned.
    """
    since = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')

    if IF_DEBUG_SKU:
        debug_predicate = " and sku_id=%s" % DEBUG_SKU_ID
    else:
        debug_predicate = ""

    # In the gift_valued table, dt is the original crawl time
    # (the other tables call it origin_time).
    statement = '''
        select * from
        jd_analytic_promo_gift_valued
        where dt>'%s' %s
    ''' % (since, debug_predicate)

    fetched = dbhelper.executeSqlRead(
        statement, is_dirty=True, isolation_type='read-committed')
    return rows_helper.transform_retrows_to_dict(fetched, 'sku_id')
def _get_discount_dict():
    """Return recent rows of ``jd_analytic_promo_discount_latest`` by sku_id.

    Rows are selected when ``origin_dt`` is greater than the cutoff string
    from ``timeHelper.getTimeAheadOfNowHours(FINAL_DISCOUNT_RECENCY_HOURS,
    ...)``; when ``IF_DEBUG_SKU`` is truthy only ``DEBUG_SKU_ID`` is read.
    Each row's ``content`` is copied to ``content_discount`` before the rows
    are indexed with ``rows_helper.transform_retrows_to_dict`` on
    ``'sku_id'``; that helper's result is returned.
    """
    since = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')

    if IF_DEBUG_SKU:
        debug_predicate = " and sku_id=%s" % DEBUG_SKU_ID
    else:
        debug_predicate = ""

    statement = '''
        select * from
        jd_analytic_promo_discount_latest
        where origin_dt>'%s' %s
    ''' % (since, debug_predicate)

    fetched = dbhelper.executeSqlRead(
        statement, is_dirty=True, isolation_type='read-committed')
    # Expose the promo text under a discount-specific key.
    for entry in fetched:
        entry['content_discount'] = entry['content']
    return rows_helper.transform_retrows_to_dict(fetched, 'sku_id')
Example #6
0
def _get_gift_dict():
    """Load recent valued-gift promo rows and index them by sku_id.

    Reads every column of ``jd_analytic_promo_gift_valued`` for rows whose
    ``dt`` exceeds the cutoff string produced by
    ``timeHelper.getTimeAheadOfNowHours(FINAL_DISCOUNT_RECENCY_HOURS, ...)``.
    With ``IF_DEBUG_SKU`` truthy, the read is limited to ``DEBUG_SKU_ID``.
    Returns the result of ``rows_helper.transform_retrows_to_dict`` keyed on
    ``'sku_id'``.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')

    sku_filter = ""
    if IF_DEBUG_SKU:
        sku_filter = " and sku_id=%s" % DEBUG_SKU_ID

    # In the gift_valued table, dt holds the original crawl time
    # (other tables name that column origin_time).
    query = '''
        select * from
        jd_analytic_promo_gift_valued
        where dt>'%s' %s
    ''' % (cutoff, sku_filter)

    rows = dbhelper.executeSqlRead(
        query, is_dirty=True, isolation_type='read-committed')
    return rows_helper.transform_retrows_to_dict(rows, 'sku_id')
Example #7
0
def _get_discount_dict():
    """Load recent discount promo rows and index them by sku_id.

    Reads every column of ``jd_analytic_promo_discount_latest`` for rows
    whose ``origin_dt`` exceeds the cutoff string produced by
    ``timeHelper.getTimeAheadOfNowHours(FINAL_DISCOUNT_RECENCY_HOURS, ...)``.
    With ``IF_DEBUG_SKU`` truthy, the read is limited to ``DEBUG_SKU_ID``.
    Each row's ``content`` is mirrored into ``content_discount``, then the
    rows go through ``rows_helper.transform_retrows_to_dict`` keyed on
    ``'sku_id'`` and that result is returned.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        FINAL_DISCOUNT_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')

    sku_filter = ""
    if IF_DEBUG_SKU:
        sku_filter = " and sku_id=%s" % DEBUG_SKU_ID

    query = '''
        select * from
        jd_analytic_promo_discount_latest
        where origin_dt>'%s' %s
    ''' % (cutoff, sku_filter)

    rows = dbhelper.executeSqlRead(
        query, is_dirty=True, isolation_type='read-committed')
    for record in rows:
        record['content_discount'] = record['content']
    return rows_helper.transform_retrows_to_dict(rows, 'sku_id')
Example #8
0
def getSku_ID_ListByCatalogID(
    category_id="_ALL_",
    startpos=0,
    min_allowed_price=service_config.SKU_LIST_MIN_ALLOWED_PRICE,
    min_allowed_discount_rate=service_config.SKU_LIST_MIN_ALLOWED_WORTHY_VALUE
):
    """Return the sku_ids of a listing, as selected by ``category_id``.

    Args:
        category_id: one of the special listing names or a concrete catalog
            id:

            * ``"_ALL_"`` - all catalogs, minus the preset exclusions.
            * ``"_EXPENSIVE_"`` - like ``"_ALL_"`` but the minimum price is
              replaced by ``service_config.SKU_LIST_MIN_PRICE_FOR_EXPENSIVE``.
            * ``"_HISTORY_LOWEST_"`` - rows with ``min_price_reached = 2``
              and ``a<>34``.
            * ``"HOT"`` - distinct SKUs from
              ``jd_notification_history_lowest`` joined with
              ``jd_worthy_latest``.
            * anything else - used verbatim as ``catalog_id = <value>``.
        startpos: only substituted into a commented-out ``-- limit`` line of
            the default query, so it currently does not page the result.
        min_allowed_price: lower bound (inclusive) on ``median_price`` for
            the default query.
        min_allowed_discount_rate: exclusive upper bound on
            ``worthy_value1`` for the default query.

    Returns:
        list: the ``sku_id`` value of every row returned by
        ``dbhelper_read.executeSqlRead``.
    """
    # Exclusion filter shared by every listing except a concrete catalog id.
    excluded_ids = "".join(
        " catalog_id <> %s AND " % idc
        for idc in service_config.PRESET_CATALOG_ID_CONSTRAINTS)
    excluded_words = "".join(
        " category_name not like '%%%s%%' AND " % blackword.strip()
        for blackword in
        service_config.PRESET_CATALOG_CATEGORY_WILDCARD_BLACK_WORDS)
    catalog_constraint = " catalog_id is not null AND %s %s " % (
        excluded_ids, excluded_words)

    dt = timeHelper.getTimeAheadOfNowHours(
        service_config.SKU_LIST_APP_WORTHY_RECENCY_HOURS,
        timeHelper.FORMAT_LONG)

    if category_id == '_HISTORY_LOWEST_':
        sql = '''
        select
        sku_id
        from
        jd_worthy_latest
        where
        %s
        min_price_reached = 2
        and this_update_time > '%s'
        and a<>34
        order by
        worthy_value1 ASC
        ''' % (catalog_constraint, dt)

    elif category_id == 'HOT':
        dt_hot = timeHelper.getTimeAheadOfNowHours(
            service_config.SKU_LIST_DISCOVERY_RECENCY_HOURS,
            format=timeHelper.FORMAT_LONG)
        sql = '''
        select

        distinct a.sku_id

        from

        jd_notification_history_lowest a
        left join
        jd_worthy_latest b
        using(sku_id)

        where
        %s
        a.update_time > '%s'
        and b.a<>34

        order by
        a.update_time DESC, worthy_value1 ASC
        ''' % (catalog_constraint, dt_hot)

    else:
        # Default listing: only build this query when it is actually used.
        if category_id == "_ALL_":
            catalog_sql_part = catalog_constraint
        elif category_id == "_EXPENSIVE_":
            min_allowed_price = service_config.SKU_LIST_MIN_PRICE_FOR_EXPENSIVE
            catalog_sql_part = catalog_constraint
        else:
            catalog_sql_part = 'catalog_id = %s and ' % category_id

        # The last two arguments land in the commented-out "-- limit" line.
        sql = '''
        select
        sku_id
        -- ,if(a=34,0,1) as stock_bit
        from
        jd_worthy_latest
        where
        %s
        worthy_value1 < %s
        and median_price >= %s
        and median_price < %s
        and this_update_time > '%s'
        order by
        -- stock_bit DESC,
        worthy_value1 ASC
        -- limit %s, %s
    ''' % (catalog_sql_part, min_allowed_discount_rate, min_allowed_price,
           service_config.SKU_LIST_MAX_ALLOWED_PRICE, dt, startpos,
           service_config.SKU_LIST_FRAME_SIZE)

    retrows = dbhelper_read.executeSqlRead(sql)
    return [row['sku_id'] for row in retrows]
Example #9
0
def process_gift_value(for_date=None):
    """Rebuild ``jd_analytic_promo_gift_valued`` from the latest gift promos.

    On one connection, deletes every row of ``jd_analytic_promo_gift_valued``
    and re-inserts the rows of ``jd_analytic_promo_gift_latest`` (with
    ``update_date`` at or after the recency cutoff) joined with the item and
    gift prices from ``jd_item_dynamic_latest``. The work is committed only
    when the insert reports more than zero rows; on any exception a rollback
    is requested and the error is logged.

    Args:
        for_date: unused; kept so existing callers keep working.

    Returns:
        dict: ``status`` (0 when the committed row count is positive, else
        -1), ``affected_rows``, ``rows deleted`` and ``rows_inserted``. The
        three counters are -1 when the corresponding step never completed.
    """
    today = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, format='%Y-%m-%d %H:%M:%S')

    sql1 = 'delete from jd_analytic_promo_gift_valued'

    sql2 = '''
    insert into jd_analytic_promo_gift_valued

    select

    a.*,
    b.price as price,
    c.price as gift_price,
    c.price*a.gift_num as gift_value,
    (c.price*a.gift_num)/b.price as gift_ratio

    from

    jd_analytic_promo_gift_latest a
    left join
    jd_item_dynamic_latest b
    on a.sku_id = b.sku_id

    left join
    jd_item_dynamic_latest c
    on a.gift_sku_id = c.sku_id

    where
    a.update_date >= '%s'
    and b.price is not NULL
    and c.price is not NULL
    and b.price>0

    order by gift_value DESC

    ''' % today

    afr = -1
    # Pre-bind the counters: if creating the cursor or the first statement
    # raises, they would otherwise be undefined when the summary is built
    # and the function would die with UnboundLocalError instead of
    # reporting status -1.
    retrows = -1
    retrows2 = -1

    # Run delete + insert as one transaction.
    conn = dbhelper.getConnection()
    try:
        cursor1 = conn.cursor(MySQLdb.cursors.DictCursor)
        retrows = cursor1.execute(sql1)
        retrows2 = cursor1.execute(sql2)
        if retrows2 <= 0:
            # Do not commit an emptied table.
            raise Exception("process_gift_value: nothing to insert")
        conn.commit()
        afr = cursor1.rowcount
    except Exception as e:
        conn.rollback()
        logging.error(e)
    finally:
        conn.close()

    logging.debug("affected rows: %s" % afr)
    ret = {
        'status': 0 if afr > 0 else -1,
        'affected_rows': afr,
        'rows deleted': retrows,
        'rows_inserted': retrows2
    }
    return ret
def _removeOldNotifications():
    """Delete old rows from ``jd_notification_history_lowest``.

    Removes every row whose ``update_time`` is less than or equal to the
    cutoff string obtained from
    ``timeHelper.getTimeAheadOfNowHours(24, format=timeHelper.FORMAT_LONG)``.

    Returns:
        Whatever ``dbhelper.executeSqlWrite1`` returns for the delete.
    """
    cutoff = timeHelper.getTimeAheadOfNowHours(
        24, format=timeHelper.FORMAT_LONG)
    statement = (
        'delete from jd_notification_history_lowest where update_time <= "%s"'
        % cutoff)
    return dbhelper.executeSqlWrite1(statement)
def process_promo_detail():
    """Derive deduction (code 15) and discount (code 19) rows from promos.

    Reads recent rows of ``jd_analytic_promo_item_latest`` joined with the
    latest price and the category, then:

    * code 15: for every (reach, deduction, is_repeat) entry returned by
      ``_extract_reach_deduction_array(content)`` computes the achievable
      deduction for a single item and its ratio to the price;
    * code 19: classifies the promo text by keyword and extracts the numbers
      it contains (threshold count plus either a discount or a free count).

    Both row sets are persisted through
    ``crawler_helper.persist_db_history_and_lastest_empty_first`` into
    ``jd_analytic_promo_deduction`` and ``jd_analytic_promo_discount``.

    Returns:
        The value of ``_generate_mixed_ret([pret15, pret19])``. An entry is
        ``None`` when there was nothing of that kind to persist (the same
        convention ``processItemPromo`` uses for its skipped steps).
    """
    today = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, '%Y-%m-%d %H:%M:%S')
    sql = '''
        select a.*, b.price, d.id as category_id, d.name as category_name from

        jd_analytic_promo_item_latest a
        left join
        jd_item_price_latest b
        on a.sku_id = b.sku_id

        left JOIN
        jd_item_category c
        on a.sku_id = c.sku_id

        left join
        jd_category d
        on c.category_id = d.id

        where a.dt >= "%s"
        and b.sku_id is not NULL
        and b.price is not NULL
    ''' % today
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)

    vlist = []
    vlist19 = []

    dt = timeHelper.getNowLong()

    logging.debug('num total promo_item rows: %s' % len(retrows))

    num_15 = 0
    num_19 = 0
    num_15_repeated = 0

    # Loop invariants for the code-19 branch: the number extractor and the
    # keywords whose position in the list is the deduct_type.
    pt = re.compile(u'[\d.]+', re.U)
    type_word_list = ["总价打", "商品价格"]

    for row in retrows:
        sku_id = row['sku_id']
        code = int(row['code'])
        content = row['content'] if 'content' in row else ""
        adurl = row['adurl'] if 'adurl' in row else ""
        origin_dt = row['dt']
        pid = row['pid']
        name = row['name'] if 'name' in row else ""
        price = float("%s" % row['price'])
        category_id = row['category_id']
        category_name = row['category_name']

        if code == 15:
            num_15 += 1
            ret = _extract_reach_deduction_array(content)

            stat_has_repeat = False
            max_deduction = float(ret['max'])
            for item in ret['data']:
                # Reset per item so the error log below never reports a
                # value left over from a previous item (or an unbound name).
                reach = deduction = None
                try:
                    reach = float(item[0])
                    deduction = float(item[1])

                    is_repeat = item[2]
                    if is_repeat == 1:
                        stat_has_repeat = True
                    dr_ratio = deduction * 1.0 / reach
                    maxp_ratio = max_deduction * 1.0 / price if max_deduction > 0 else 1.0
                    could_deduct = 0
                except Exception as e:
                    logging.error("reach:%s, deduction:%s" %
                                  (reach, deduction))
                    logging.error(e)
                    continue

                if price == 0:
                    # The rate below divides by price. A zero price with a
                    # positive max_deduction is already skipped by the
                    # handler above; skip the remaining case too instead of
                    # aborting the whole run with ZeroDivisionError.
                    logging.error(
                        "zero price, cannot compute discount rate for "
                        "sku_id = %s", sku_id)
                    continue

                if price >= reach and reach > 0:
                    # A repeatable promo applies once per full multiple of
                    # the threshold; otherwise it applies once.
                    if is_repeat:
                        times = price // reach
                    else:
                        times = 1
                    could_deduct = times * deduction
                    if could_deduct > max_deduction:
                        could_deduct = max_deduction
                single_discount_rate = could_deduct / price
                tp = [
                    sku_id, dt, price, is_repeat, reach, deduction,
                    max_deduction, dr_ratio, maxp_ratio, single_discount_rate,
                    category_id, category_name, pid, code, name, content,
                    adurl, origin_dt
                ]
                vlist.append(tp)

            if stat_has_repeat:
                num_15_repeated += 1

        elif code == 19:
            num_19 += 1
            # Buy N items to get a discount or a price reduction.
            # deduct_type:
            #   0: direct discount on the total
            #   1: reduce by item price
            #   2: other (no keyword matched)
            deduct_type = 0
            for type_word in type_word_list:
                if content.find(type_word) >= 0:
                    break
                deduct_type += 1

            if deduct_type == 2:
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
                logging.error(content)
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")

            pts = pt.findall(content)
            if len(pts) != 2:
                if '可购买热销商品' not in content:
                    logging.error(content)
                    logging.error("NEW PATTERN ABOVE")

            reach_num = discount = free_num = rf_ratio = None
            # Index defensively: the text may hold fewer than two numbers
            # (the very case reported above). Missing values stay None,
            # which the nullable columns of the discount table accept.
            if len(pts) >= 1:
                reach_num = float(pts[0])
            if len(pts) >= 2:
                if deduct_type == 0:
                    discount = pts[1]
                elif deduct_type == 1:
                    free_num = float(pts[1])
                    if reach_num:
                        rf_ratio = float(free_num * 1.0 / reach_num)

            tp19 = [
                sku_id, dt, price, deduct_type, reach_num, discount, free_num,
                rf_ratio, category_id, category_name, pid, code, name, content,
                adurl, origin_dt
            ]
            vlist19.append(tp19)

    logging.debug("code = 15, cases = %s" % num_15)
    logging.debug("code = 15, repeated = %s" % num_15_repeated)
    logging.debug("rows to insert = %s" % len(vlist))

    sql_cb_deduction = '''
        CREATE TABLE jd_analytic_promo_deduction_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) NOT NULL,
          price float NOT NULL,
          is_repeat tinyint(4) NOT NULL,
          reach float NOT NULL,
          deduction float NOT NULL,
          max_deduction float NOT NULL,
          dr_ratio float NOT NULL,
          maxp_ratio float NOT NULL,
          single_discount_rate float NOT NULL,
          category_id varchar(255) NOT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_time datetime NOT NULL,
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_discount = '''
        CREATE TABLE jd_analytic_promo_discount_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) DEFAULT NULL,
          price float DEFAULT NULL,
          deduct_type smallint(6) DEFAULT NULL,
          reach_num smallint(6) DEFAULT NULL,
          discount float DEFAULT NULL,
          free_num smallint(6) DEFAULT NULL,
          rf_ratio float DEFAULT NULL,
          category_id varchar(255) DEFAULT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_dt datetime DEFAULT NULL,
          PRIMARY KEY (sku_id,pid),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # Persist only non-empty lists: vlist[0] on an empty list would raise
    # IndexError, and skipping leaves the existing table untouched.
    pret15 = None
    if vlist:
        pret15 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_deduction',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_deduction,
        )

    logging.debug("code = 19, cases = %s" % num_19)
    logging.debug("rows to insert = %s" % len(vlist19))

    pret19 = None
    if vlist19:
        pret19 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_discount',
            num_cols=len(vlist19[0]),
            value_list=vlist19,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_discount,
        )

    return _generate_mixed_ret([pret15, pret19])
def process_gift_value(for_date = None):
    """Rebuild ``jd_analytic_promo_gift_valued`` from the latest gift promos.

    On one connection, deletes every row of ``jd_analytic_promo_gift_valued``
    and re-inserts the rows of ``jd_analytic_promo_gift_latest`` (with
    ``update_date`` at or after the recency cutoff) joined with the item and
    gift prices from ``jd_item_dynamic_latest``. The work is committed only
    when the insert reports more than zero rows; on any exception a rollback
    is requested and the error is logged.

    Args:
        for_date: unused; kept so existing callers keep working.

    Returns:
        dict: ``status`` (0 when the committed row count is positive, else
        -1), ``affected_rows``, ``rows deleted`` and ``rows_inserted``. The
        three counters are -1 when the corresponding step never completed.
    """
    today = timeHelper.getTimeAheadOfNowHours(datamining_config.PROMO_ITEM_RECENCY_HOURS,format='%Y-%m-%d %H:%M:%S')

    sql1 = 'delete from jd_analytic_promo_gift_valued'

    sql2 = '''
    insert into jd_analytic_promo_gift_valued

    select

    a.*,
    b.price as price,
    c.price as gift_price,
    c.price*a.gift_num as gift_value,
    (c.price*a.gift_num)/b.price as gift_ratio

    from

    jd_analytic_promo_gift_latest a
    left join
    jd_item_dynamic_latest b
    on a.sku_id = b.sku_id

    left join
    jd_item_dynamic_latest c
    on a.gift_sku_id = c.sku_id

    where
    a.update_date >= '%s'
    and b.price is not NULL
    and c.price is not NULL
    and b.price>0

    order by gift_value DESC

    ''' %today

    afr = -1
    # Pre-bind the counters: if creating the cursor or the first statement
    # raises, they would otherwise be undefined when the summary is built
    # and the function would die with UnboundLocalError instead of
    # reporting status -1.
    retrows = -1
    retrows2 = -1

    # Run delete + insert as one transaction.
    conn = dbhelper.getConnection()
    try:
        cursor1 = conn.cursor(MySQLdb.cursors.DictCursor)
        retrows = cursor1.execute(sql1)
        retrows2 = cursor1.execute(sql2)
        if retrows2 <= 0:
            # Do not commit an emptied table.
            raise Exception("process_gift_value: nothing to insert")
        conn.commit()
        afr = cursor1.rowcount
    except Exception as e:
        conn.rollback()
        logging.error(e)
    finally:
        conn.close()

    logging.debug("affected rows: %s" %afr )
    ret = {
        'status': 0 if afr > 0 else -1,
        'affected_rows': afr,
        'rows deleted': retrows,
        'rows_inserted': retrows2
    }
    return ret
def processItemPromo():
    """Flatten recent ``promo_json`` blobs into promo-item and gift rows.

    Reads ``sku_id``, ``dt`` and ``promo_json`` from ``jd_promo_item_latest``
    (non-null JSON longer than 100 characters, ``dt`` at or after the recency
    cutoff). For every row whose JSON parses:

    * each entry of ``pickOneTag`` becomes a promo-item row, except entries
      with code ``"17"`` which are only counted;
    * each entry of ``tags`` with code ``"10"`` yields one gift row per
      element of its ``gifts`` list; every other tag becomes a promo-item
      row.

    Promo-item rows are persisted to ``jd_analytic_promo_item`` and gift rows
    to ``jd_analytic_promo_gift``; for gifts, (sku_id, current time) pairs
    are also inserted into ``jd_analytic_sku_gift`` with ``insert ignore``.

    Returns:
        The value of ``_generate_mixed_ret([ret1, ret2, ret3])``; an entry is
        ``None`` when the corresponding step was skipped because there was
        nothing to persist.
    """
    vlist = []
    glist = []
    update_date = timeHelper.getNowLong()
    recent = timeHelper.getTimeAheadOfNowHours(datamining_config.PROMO_ITEM_RECENCY_HOURS,timeHelper.FORMAT_LONG)
    logging.debug('Reading jd_promo_item_latest...' )
    sql = '''
        select sku_id, dt, promo_json from jd_promo_item_latest
        where promo_json is not NULL and LENGTH(promo_json)>100
        and dt>="%s"
    ''' %recent
    retrows = dbhelper.executeSqlRead(sql,is_dirty=True)
    num_error = 0
    num17 = 0
    logging.debug('completed!')
    logging.debug("Total rows with promo_json: %s" %len(retrows))
    for row in retrows:
        sku_id = row['sku_id']
        dt = row['dt']
        try:
            obj = json.loads(row['promo_json'])
        except (TypeError, ValueError):
            # Unparseable JSON is expected now and then; count it and move
            # on. Narrowed from a bare except so that KeyboardInterrupt and
            # genuine programming errors are no longer swallowed.
            num_error += 1
            continue
        rtags = obj['pickOneTag']
        for tag in rtags:
            pid = tag['pid']
            code = tag['code']
            # Add-on purchase promos (code 17) are not recorded.
            if code == "17":
                num17 += 1
                continue
            name = tag['name']
            content = tag['content']
            adurl = tag['adurl'] if 'adurl' in tag else ""
            tp = [sku_id, dt, pid, code, name, content, adurl, update_date]
            vlist.append(tp)
        tags = obj['tags']
        for tag in tags:
            pid = tag['pid']
            code = tag['code']
            name = tag['name'] if 'name' in tag else ""
            if code == "10":
                # Gift promo: one row per gift entry.
                gifts = tag['gifts']
                for gift in gifts:
                    gift_name = "赠品"
                    try:
                        gift_name = gift['nm']
                        gift_num = gift['num'] if 'num' in gift else 1
                        gift_image = gift['mp'] if 'mp' in gift else ""
                        gift_sku_id = gift['sid'] if 'sid' in gift else ""
                        gift_gt = gift['gt'] if 'gt' in gift else ""
                        gift_gs = gift['gs'] if 'gs' in gift else ""
                        tp_gift = [sku_id,dt,pid,code, name, gift_name, gift_num, gift_image, gift_sku_id, gift_gt, gift_gs, update_date]
                        glist.append(tp_gift)
                    except Exception as e:
                        # Best effort: a malformed gift entry is logged and
                        # skipped.
                        logging.debug("error in extracting gift info for sku_id = %s"%sku_id)
                        logging.debug("%s" %e)
            else:
                content = tag['content']
                adurl = tag['adurl'] if 'adurl' in tag else ""
                tp = [sku_id, dt, pid, code, name, content, adurl, update_date]
                vlist.append(tp)

    logging.error("IGNOR-ABLE: num of errors: %s (like json.loads error)" %num_error)
    logging.debug('num17: %s' %num17 )
    logging.debug('vlist len: %s' %len(vlist) )
    logging.debug('glist len: %s' %len(glist) )

    sql_cb_promo_item = '''
        CREATE TABLE jd_analytic_promo_item_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_promo_gift = '''
        CREATE TABLE jd_analytic_promo_gift_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          gift_name varchar(255) NOT NULL,
          gift_num int(11) NOT NULL,
          gift_image varchar(255) DEFAULT NULL,
          gift_sku_id bigint(20) NOT NULL,
          gift_gt varchar(255) DEFAULT NULL,
          gift_gs varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # Persist in DB. ret3 is pre-bound together with ret1/ret2: it is only
    # assigned when there are gifts, and the return statement below would
    # otherwise raise NameError for a run without any gift rows.
    ret1 = ret2 = ret3 = None
    if len(vlist)>0:
        ret1 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_item',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_item,
        )
    if len(glist)>0:
        ret2 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_gift',
            num_cols=len(glist[0]),
            value_list=glist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_gift,
        )

        # Record which SKUs carry a gift, stamped with the current time.
        cur_time = timeHelper.getNowLong()
        sglist = [[gg[0], cur_time] for gg in glist]
        sql_gg = 'insert ignore into jd_analytic_sku_gift values(%s,%s)'
        afr = dbhelper.executeSqlWriteMany(sql_gg,sglist)
        ret3 = {
            'status': 0 if afr > 0 else -1,
            'msg': "",
        }

    return _generate_mixed_ret([ret1,ret2, ret3])
Example #14
0
def calculate_min_max_price():
    """Aggregate per-SKU price statistics and persist them.

    Runs one grouped query over ``jd_item_price`` (rows with ``price > 0``)
    that yields, per ``sku_id``: average, minimum, ``percentile_minx`` (stored
    as ``median_price``), maximum, latest ``update_time``, sample count,
    ``min_ratio`` and ``LPDR``. Only SKUs whose latest ``update_time`` is at
    or after the recency cutoff are kept. The rows are then written through
    ``crawler_helper.persist_db_history_and_lastest_empty_first`` for table
    ``jd_analytic_price_stat``.

    Returns:
        Whatever the persist helper returns.

    NOTE(review): ``len(rows[0])`` raises IndexError when the query returns
    no rows - confirm whether an empty result can occur here.
    """
    logging.debug(
        'Reading item_dynamic history and calculate min/max/avg/median price for skus...'
    )
    started = time.time()
    cutoff = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PRICE_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    stats_query = '''
        select
            sku_id,
            AVG(price) as average_price,
            min(price) as min_price,
            -- median(price) as median_price,           -- changed 12/22
            percentile_minx(price) as median_price,
            max(price) as max_price,
            max(update_time) as origin_time,
            count(1) as sample_count,
            min_ratio(price) as min_ratio,
            LPDR(price) as LPDR

        from
        -- jd_item_dynamic                              -- changed 12/22
        jd_item_price

        where

        -- update_time > '2015-11-14 0:00:00' and  -- 双十一期间价格
        price > 0

        group by sku_id
        having max(update_time) >= '%s'
    ''' % (cutoff)

    logging.debug(stats_query)
    rows = dbhelper.executeSqlRead2(stats_query, is_dirty=True)
    logging.debug("Done, rows to insert: %s", len(rows))
    finished = time.time()
    logging.debug('using seconds: %0.1f', finished - started)

    create_table_sql = '''

        CREATE TABLE jd_analytic_price_stat_latest (
          sku_id bigint(20) NOT NULL,
          average_price float NOT NULL,
          min_price float NOT NULL,
          median_price float NOT NULL,
          max_price float NOT NULL,
          origin_time datetime NOT NULL,
          sample_count int(11) NOT NULL,
          min_ratio float NOT NULL,
          LPDR float NOT NULL,
          PRIMARY KEY (sku_id),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8

    '''

    # Column count is taken from the first result row.
    column_count = len(rows[0])
    return crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_price_stat',
        num_cols=column_count,
        value_list=rows,
        is_many=True,
        need_history=False,
        sql_create_table=create_table_sql,
    )
Example #15
0
def process_promo_detail():
    """Derive deduction (code 15) and discount (code 19) rows from promos.

    Reads recent rows of ``jd_analytic_promo_item_latest`` joined with the
    latest price and the category, then:

    * code 15: for every (reach, deduction, is_repeat) entry returned by
      ``_extract_reach_deduction_array(content)`` computes the achievable
      deduction for a single item and its ratio to the price;
    * code 19: classifies the promo text by keyword and extracts the numbers
      it contains (threshold count plus either a discount or a free count).

    Both row sets are persisted through
    ``crawler_helper.persist_db_history_and_lastest_empty_first`` into
    ``jd_analytic_promo_deduction`` and ``jd_analytic_promo_discount``.

    Returns:
        The value of ``_generate_mixed_ret([pret15, pret19])``. An entry is
        ``None`` when there was nothing of that kind to persist (the same
        convention ``processItemPromo`` uses for its skipped steps).
    """
    today = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, '%Y-%m-%d %H:%M:%S')
    sql = '''
        select a.*, b.price, d.id as category_id, d.name as category_name from

        jd_analytic_promo_item_latest a
        left join
        jd_item_price_latest b
        on a.sku_id = b.sku_id

        left JOIN
        jd_item_category c
        on a.sku_id = c.sku_id

        left join
        jd_category d
        on c.category_id = d.id

        where a.dt >= "%s"
        and b.sku_id is not NULL
        and b.price is not NULL
    ''' % today
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)

    vlist = []
    vlist19 = []

    dt = timeHelper.getNowLong()

    logging.debug('num total promo_item rows: %s' % len(retrows))

    num_15 = 0
    num_19 = 0
    num_15_repeated = 0

    # Loop invariants for the code-19 branch: the number extractor and the
    # keywords whose position in the list is the deduct_type.
    pt = re.compile(u'[\d.]+', re.U)
    type_word_list = ["总价打", "商品价格"]

    for row in retrows:
        sku_id = row['sku_id']
        code = int(row['code'])
        content = row['content'] if 'content' in row else ""
        adurl = row['adurl'] if 'adurl' in row else ""
        origin_dt = row['dt']
        pid = row['pid']
        name = row['name'] if 'name' in row else ""
        price = float("%s" % row['price'])
        category_id = row['category_id']
        category_name = row['category_name']

        if code == 15:
            num_15 += 1
            ret = _extract_reach_deduction_array(content)

            stat_has_repeat = False
            max_deduction = float(ret['max'])
            for item in ret['data']:
                # Reset per item so the error log below never reports a
                # value left over from a previous item (or an unbound name).
                reach = deduction = None
                try:
                    reach = float(item[0])
                    deduction = float(item[1])

                    is_repeat = item[2]
                    if is_repeat == 1:
                        stat_has_repeat = True
                    dr_ratio = deduction * 1.0 / reach
                    maxp_ratio = max_deduction * 1.0 / price if max_deduction > 0 else 1.0
                    could_deduct = 0
                except Exception as e:
                    logging.error("reach:%s, deduction:%s" %
                                  (reach, deduction))
                    logging.error(e)
                    continue

                if price == 0:
                    # The rate below divides by price. A zero price with a
                    # positive max_deduction is already skipped by the
                    # handler above; skip the remaining case too instead of
                    # aborting the whole run with ZeroDivisionError.
                    logging.error(
                        "zero price, cannot compute discount rate for "
                        "sku_id = %s", sku_id)
                    continue

                if price >= reach and reach > 0:
                    # A repeatable promo applies once per full multiple of
                    # the threshold; otherwise it applies once.
                    if is_repeat:
                        times = price // reach
                    else:
                        times = 1
                    could_deduct = times * deduction
                    if could_deduct > max_deduction:
                        could_deduct = max_deduction
                single_discount_rate = could_deduct / price
                tp = [
                    sku_id, dt, price, is_repeat, reach, deduction,
                    max_deduction, dr_ratio, maxp_ratio, single_discount_rate,
                    category_id, category_name, pid, code, name, content,
                    adurl, origin_dt
                ]
                vlist.append(tp)

            if stat_has_repeat:
                num_15_repeated += 1

        elif code == 19:
            num_19 += 1
            # Buy N items to get a discount or a price reduction.
            # deduct_type:
            #   0: direct discount on the total
            #   1: reduce by item price
            #   2: other (no keyword matched)
            deduct_type = 0
            for type_word in type_word_list:
                if content.find(type_word) >= 0:
                    break
                deduct_type += 1

            if deduct_type == 2:
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
                logging.error(content)
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")

            pts = pt.findall(content)
            if len(pts) != 2:
                if '可购买热销商品' not in content:
                    logging.error(content)
                    logging.error("NEW PATTERN ABOVE")

            reach_num = discount = free_num = rf_ratio = None
            # Index defensively: the text may hold fewer than two numbers
            # (the very case reported above). Missing values stay None,
            # which the nullable columns of the discount table accept.
            if len(pts) >= 1:
                reach_num = float(pts[0])
            if len(pts) >= 2:
                if deduct_type == 0:
                    discount = pts[1]
                elif deduct_type == 1:
                    free_num = float(pts[1])
                    if reach_num:
                        rf_ratio = float(free_num * 1.0 / reach_num)

            tp19 = [
                sku_id, dt, price, deduct_type, reach_num, discount, free_num,
                rf_ratio, category_id, category_name, pid, code, name, content,
                adurl, origin_dt
            ]
            vlist19.append(tp19)

    logging.debug("code = 15, cases = %s" % num_15)
    logging.debug("code = 15, repeated = %s" % num_15_repeated)
    logging.debug("rows to insert = %s" % len(vlist))

    sql_cb_deduction = '''
        CREATE TABLE jd_analytic_promo_deduction_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) NOT NULL,
          price float NOT NULL,
          is_repeat tinyint(4) NOT NULL,
          reach float NOT NULL,
          deduction float NOT NULL,
          max_deduction float NOT NULL,
          dr_ratio float NOT NULL,
          maxp_ratio float NOT NULL,
          single_discount_rate float NOT NULL,
          category_id varchar(255) NOT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_time datetime NOT NULL,
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_discount = '''
        CREATE TABLE jd_analytic_promo_discount_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) DEFAULT NULL,
          price float DEFAULT NULL,
          deduct_type smallint(6) DEFAULT NULL,
          reach_num smallint(6) DEFAULT NULL,
          discount float DEFAULT NULL,
          free_num smallint(6) DEFAULT NULL,
          rf_ratio float DEFAULT NULL,
          category_id varchar(255) DEFAULT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_dt datetime DEFAULT NULL,
          PRIMARY KEY (sku_id,pid),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # Persist only non-empty lists: vlist[0] on an empty list would raise
    # IndexError, and skipping leaves the existing table untouched.
    pret15 = None
    if vlist:
        pret15 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_deduction',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_deduction,
        )

    logging.debug("code = 19, cases = %s" % num_19)
    logging.debug("rows to insert = %s" % len(vlist19))

    pret19 = None
    if vlist19:
        pret19 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_discount',
            num_cols=len(vlist19[0]),
            value_list=vlist19,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_discount,
        )

    return _generate_mixed_ret([pret15, pret19])
Example #16
0
def calculate_min_max_price():
    """Aggregate per-SKU price statistics and persist them.

    Runs one aggregate query over ``jd_item_price`` (rows with ``price > 0``,
    grouped by ``sku_id``) and keeps only SKUs whose newest ``update_time`` is
    at or after the cut-off derived from
    ``datamining_config.PRICE_RECENCY_HOURS``.  The resulting rows are handed
    unchanged to ``crawler_helper.persist_db_history_and_lastest_empty_first``
    for table ``jd_analytic_price_stat`` (no history copy).

    NOTE(review): ``percentile_minx``, ``min_ratio`` and ``LPDR`` are not
    standard MySQL aggregates -- presumably user-defined functions installed
    on the database; confirm before running elsewhere.

    Returns:
        Whatever the persist helper returns.  When the query yields no rows
        nothing is written and a ``{'status': -1, 'msg': ...}`` dict is
        returned instead (same shape this module builds by hand for other
        write results -- TODO confirm callers only read ``status``/``msg``).
    """
    logging.debug('Reading item_dynamic history and calculate min/max/avg/median price for skus...')
    t1 = time.time()
    dt = timeHelper.getTimeAheadOfNowHours(datamining_config.PRICE_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    sql1 = '''
        select
            sku_id,
            AVG(price) as average_price,
            min(price) as min_price,
            -- median(price) as median_price,           -- changed 12/22
            percentile_minx(price) as median_price,
            max(price) as max_price,
            max(update_time) as origin_time,
            count(1) as sample_count,
            min_ratio(price) as min_ratio,
            LPDR(price) as LPDR

        from
        -- jd_item_dynamic                              -- changed 12/22
        jd_item_price

        where

        -- update_time > '2015-11-14 0:00:00' and  -- 双十一期间价格
        price > 0

        group by sku_id
        having max(update_time) >= '%s'
    ''' %(dt)

    logging.debug(sql1)
    retrows = dbhelper.executeSqlRead2(sql1, is_dirty=True)
    logging.debug("Done, rows to insert: %s", len(retrows))
    t2 = time.time()
    logging.debug('using seconds: %0.1f', t2 - t1)

    # The column count below is taken from the first row, so an empty result
    # used to crash with IndexError.  Bail out before touching the target
    # table instead.
    if not retrows:
        return {
            'status': -1,
            'msg': "no price rows to persist",
        }

    # DDL passed to the persist helper; column order mirrors the select list
    # of sql1 above.
    sql_cb = '''

        CREATE TABLE jd_analytic_price_stat_latest (
          sku_id bigint(20) NOT NULL,
          average_price float NOT NULL,
          min_price float NOT NULL,
          median_price float NOT NULL,
          max_price float NOT NULL,
          origin_time datetime NOT NULL,
          sample_count int(11) NOT NULL,
          min_ratio float NOT NULL,
          LPDR float NOT NULL,
          PRIMARY KEY (sku_id),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8

    '''

    return crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_price_stat',
        num_cols=len(retrows[0]),
        value_list=retrows,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb,
    )
Example #17
0
def processItemPromo():
    """Flatten recent promo JSON into promo-item and gift rows and persist them.

    Reads ``sku_id, dt, promo_json`` from ``jd_promo_item_latest`` for rows
    whose ``dt`` is at or after the cut-off derived from
    ``datamining_config.PROMO_ITEM_RECENCY_HOURS`` and whose JSON is longer
    than 100 characters.  For every row:

    * each entry of ``pickOneTag`` becomes a promo-item row, except entries
      with ``code == "17"`` which are counted and skipped;
    * each entry of ``tags`` with ``code == "10"`` is expanded into one gift
      row per element of its ``gifts`` list; every other entry becomes a
      promo-item row.

    Rows whose ``promo_json`` cannot be parsed are counted and skipped.
    Promo-item rows go to ``jd_analytic_promo_item``, gift rows to
    ``jd_analytic_promo_gift``; additionally every gift row's ``sku_id`` is
    inserted (``insert ignore``) into ``jd_analytic_sku_gift`` together with
    the current time.

    Returns:
        The result of ``_generate_mixed_ret`` over the three write results.
        A write that was skipped because there was nothing to persist
        contributes ``None``.
    """
    vlist = []  # promo-item rows
    glist = []  # gift rows
    update_date = timeHelper.getNowLong()
    recent = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    logging.debug('Reading jd_promo_item_latest...')
    sql = '''
        select sku_id, dt, promo_json from jd_promo_item_latest
        where promo_json is not NULL and LENGTH(promo_json)>100
        and dt>="%s"
    ''' % recent
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)
    num_error = 0
    num17 = 0
    logging.debug('completed!')
    logging.debug("Total rows with promo_json: %s" % len(retrows))
    for row in retrows:
        sku_id = row['sku_id']
        dt = row['dt']
        try:
            obj = json.loads(row['promo_json'])
        except (ValueError, TypeError):
            # Malformed JSON (ValueError) or a non-string payload (TypeError):
            # count it and move on.  A bare ``except`` here would also swallow
            # KeyboardInterrupt / SystemExit.
            num_error += 1
            continue

        for tag in obj['pickOneTag']:
            pid = tag['pid']
            code = tag['code']
            # Add-on purchase promotions (code 17) are deliberately not recorded.
            if code == "17":
                num17 += 1
                continue
            name = tag['name']
            content = tag['content']
            adurl = tag.get('adurl', "")
            vlist.append(
                [sku_id, dt, pid, code, name, content, adurl, update_date])

        for tag in obj['tags']:
            pid = tag['pid']
            code = tag['code']
            name = tag.get('name', "")
            if code != "10":
                content = tag['content']
                adurl = tag.get('adurl', "")
                vlist.append(
                    [sku_id, dt, pid, code, name, content, adurl,
                     update_date])
                continue

            # code 10 == gift promotion: one output row per gift.
            for gift in tag['gifts']:
                try:
                    # 'nm' is the only mandatory gift field; a gift without
                    # it is logged and skipped.
                    tp_gift = [
                        sku_id, dt, pid, code, name,
                        gift['nm'],
                        gift.get('num', 1),
                        gift.get('mp', ""),
                        gift.get('sid', ""),
                        gift.get('gt', ""),
                        gift.get('gs', ""),
                        update_date,
                    ]
                except Exception as e:
                    # Best-effort: one broken gift entry must not abort the
                    # whole run.
                    logging.debug(
                        "error in extracting gift info for sku_id = %s" %
                        sku_id)
                    logging.debug("%s" % e)
                else:
                    glist.append(tp_gift)

    logging.error("IGNOR-ABLE: num of errors: %s (like json.loads error)" %
                  num_error)
    logging.debug('num17: %s' % num17)
    logging.debug('vlist len: %s' % len(vlist))
    logging.debug('glist len: %s' % len(glist))

    sql_cb_promo_item = '''
        CREATE TABLE jd_analytic_promo_item_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    sql_cb_promo_gift = '''
        CREATE TABLE jd_analytic_promo_gift_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          gift_name varchar(255) NOT NULL,
          gift_num int(11) NOT NULL,
          gift_image varchar(255) DEFAULT NULL,
          gift_sku_id bigint(20) NOT NULL,
          gift_gt varchar(255) DEFAULT NULL,
          gift_gs varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # persist in DB
    # All three results default to None so the final call is always valid;
    # previously ret3 was unbound (UnboundLocalError) when glist was empty.
    ret1 = ret2 = ret3 = None
    if vlist:
        ret1 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_item',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_item,
        )
    if glist:
        ret2 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_gift',
            num_cols=len(glist[0]),
            value_list=glist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_gift,
        )

        # record gift: remember every SKU that carried a gift, with the
        # current time.
        cur_time = timeHelper.getNowLong()
        sglist = [[gift_row[0], cur_time] for gift_row in glist]
        sql_gg = 'insert ignore into jd_analytic_sku_gift values(%s,%s)'
        afr = dbhelper.executeSqlWriteMany(sql_gg, sglist)
        ret3 = {
            'status': 0 if afr > 0 else -1,
            'msg': "",
        }

    return _generate_mixed_ret([ret1, ret2, ret3])