def get_job_update_time():
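    """Return the job_time recorded for NOTIFICATION_JOB_NAME in jd_notification_job_status,
    or a zero timestamp if no row exists."""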
    sql = 'select job_time from jd_notification_job_status where job_name="%s"' %(NOTIFICATION_JOB_NAME)
    retrows = dbhelper.executeSqlRead2(sql)
    ret = '0000-00-00 0:00:00'
    if len(retrows) > 0:
        ret = retrows[0][0]
    return ret
def _get_all_devices():
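    """Return the list of device ids stored in user_notification_device_latest."""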
    sql = 'select device_id from user_notification_device_latest'
    retrows = dbhelper.executeSqlRead2(sql)
    vlist = []
    for row in retrows:
        device_id = row[0]
        vlist.append(device_id)
    return vlist
def calculateSkuRatingScores():
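    """Compute the average star rating per SKU (weighted by its 1-5 star comment counts)
    for SKUs with a statistically significant number of comments, then persist the rows
    into the jd_analytic_rating_score tables (latest only, no history)."""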

    sql_cb = '''
        CREATE TABLE jd_analytic_rating_score_latest (
          sku_id bigint(20) NOT NULL,
          comment_count int(11) NOT NULL,
          rating_score float DEFAULT NULL,
          category_id varchar(255) NOT NULL,
          this_update_time datetime NOT NULL,
          PRIMARY KEY (sku_id)
          -- KEY skuid (sku_id),
          -- KEY cat_score (rating_score,category_id),
          -- KEY score (rating_score),
          -- KEY category (category_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
        '''

    sql = '''
        select

        skuid as sku_id,
        CommentCount,
        ((a.Score1Count)*1.0+(a.Score2Count)*2.0+(a.Score3Count)*3.0+(a.Score4Count)*4.0+(a.Score5Count)*5.0)/a.CommentCount as rating_score,
        category_id,
        CURRENT_TIMESTAMP() as this_update_time

        from

        jd_item_comment_count_latest a
        left join
        jd_item_category b
        on a.SkuId = b.sku_id

        where a.CommentCount is not null and a.CommentCount >= %s

        ''' % (datamining_config.MIN_COMMENT_NUM_SO_RATING_SCORE_STATISTICALLY_SIGNIFICANT)

    retrows = dbhelper.executeSqlRead2(sql, is_dirty=True)

    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_rating_score',
        num_cols=len(retrows[0]),
        value_list=retrows,
        need_history=False,
        is_many=True,
        sql_create_table=sql_cb,
    )

    return ret
def _get_category_id_prefix_given_category_name(category_name):
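    """Map a category name to [category_id_prefix, category_prefix_name], falling back
    to a direct jd_category lookup when the name is not in the category map."""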
    # global CATEGORY_MAP
    # if len(CATEGORY_MAP)==0:
    #     CATEGORY_MAP = _load_category_map()
    category_name = category_name.decode('utf-8')
    CATEGORY_MAP = _load_category_map()
    if category_name in CATEGORY_MAP:
        return [CATEGORY_MAP[category_name]['category_id_prefix'],CATEGORY_MAP[category_name]['category_prefix_name'] ]
    else:
        sql2 = 'select id from jd_category where name="%s" limit 1' %category_name
        retrows = dbhelper.executeSqlRead2(sql2)
        if len(retrows) == 0:
            err = "Not found in category_map: %s, returning this category_name" % category_name
            print err
            # fall back to the raw name so the lookup cannot raise an IndexError
            return [category_name, category_name]
        return [retrows[0][0], category_name]
def getUpdatedSkuIds():
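    """Return the distinct sku_ids from jd_notification_history_lowest that were updated
    since the last recorded job run, as {'num': count, 'data': [sku_id, ...]}."""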
    ut = get_job_update_time()
    sql = '''
    select distinct sku_id FROM
    jd_notification_history_lowest
    WHERE
    update_time >= "%s"
    ''' %(ut)
    retrows = dbhelper.executeSqlRead2(sql)
    vlist = []
    for row in retrows:
        vlist.append(row[0])
    ret = {
        'num': len(vlist),
        'data': vlist,
    }
    return ret
def calculate_min_max_price():
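    """Aggregate recent price history per SKU (avg/min/median/max price, sample count,
    min_ratio and LPDR) and persist the statistics into the jd_analytic_price_stat
    tables (latest only, no history)."""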
    logging.debug('Reading item price history and calculating min/max/avg/median price for skus...')
    t1 = time.time()
    dt = timeHelper.getTimeAheadOfNowHours(datamining_config.PRICE_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    sql1 = '''
        select
            sku_id,
            AVG(price) as average_price,
            min(price) as min_price,
            -- median(price) as median_price,           -- changed 12/22
            percentile_minx(price) as median_price,
            max(price) as max_price,
            max(update_time) as origin_time,
            count(1) as sample_count,
            min_ratio(price) as min_ratio,
            LPDR(price) as LPDR

        from
        -- jd_item_dynamic                              -- changed 12/22
        jd_item_price

        where

        -- update_time > '2015-11-14 0:00:00' and  -- prices during the Double 11 (Singles' Day) sale
        price > 0

        group by sku_id
        having max(update_time) >= '%s'
    ''' %(dt)

    logging.debug(sql1)
    retrows = dbhelper.executeSqlRead2(sql1, is_dirty=True)
    logging.debug("Done, rows to insert: %s" %len(retrows) )
    t2 = time.time()
    logging.debug('using seconds: %0.1f' %(t2-t1) )

    sql_cb = '''

        CREATE TABLE jd_analytic_price_stat_latest (
          sku_id bigint(20) NOT NULL,
          average_price float NOT NULL,
          min_price float NOT NULL,
          median_price float NOT NULL,
          max_price float NOT NULL,
          origin_time datetime NOT NULL,
          sample_count int(11) NOT NULL,
          min_ratio float NOT NULL,
          LPDR float NOT NULL,
          PRIMARY KEY (sku_id),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8

    '''

    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_price_stat',
        num_cols=len(retrows[0]),
        value_list=retrows,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb,
    )
    return ret
def generate_category_property_mapping():
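    """Join item properties with their categories, filter out size/weight/blacklisted
    property keys, and persist the (sku_id, p_key, p_value, category_id) mapping into
    the jd_analytic_property tables (latest only, no history)."""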

    # sql = 'select * from jd_category'
    # retrows = dbhelper.executeSqlRead(sql)
    #
    # for row in retrows:
    #     category_id = row['id']
    #     category_name = row['name']
    black_list_clause = '","'.join(PROPERTY_KEY_BLACK_WORD_LIST)
    black_list_clause = '"%s"' %black_list_clause
    sql2 = '''
        select
            sku_id,
            p_key,
            p_value,
            category_id
        from
        jd_item_property_latest a
        left join
        jd_item_category b
        using (sku_id)
        where LENGTH(p_value)>3
        and LENGTH(p_value)<=30
        and not (p_value like '%%个' and length(p_value)<=10)
        and p_key<>'__DEFAULT__'
        and LENGTH(p_key)>=6
        and LENGTH(p_key)<=21
        and p_key not like '%%重%%'
        and p_key not like '%%尺寸%%'
        and p_key not like '%%厚度%%'
        and p_key not like '%%宽度%%'
        and p_key not like '%%长宽高%%'
        and p_key not like '%%mm%%'
        and p_key <> '上架时间'
        and p_key NOT IN (%s)
    ''' %(black_list_clause)

    vlist = dbhelper.executeSqlRead2(sql2, is_dirty=True)

    sql_cb = '''
        CREATE TABLE jd_analytic_property_latest (
          sku_id bigint(20) DEFAULT NULL,
          p_key varchar(255) DEFAULT NULL,
          p_value varchar(255) DEFAULT NULL,
          category_id varchar(255) DEFAULT NULL,
          KEY skuid_categoryid (sku_id,category_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    print "now writing to db..."
    t1 = time.time()

    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_property',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb,
    )
    t2 = time.time()
    print "using : %0.0f" %(t2-t1)

    return ret
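
# --- Not part of the original module: a minimal, hypothetical driver sketch. ---
# It simply runs the analytics jobs defined above in sequence; the name
# run_all_analytics, the ordering, and the logging are assumptions for
# illustration only, not the author's scheduling code.
def run_all_analytics():
    for job in (calculateSkuRatingScores, calculate_min_max_price, generate_category_property_mapping):
        logging.info('running %s' % job.__name__)
        job()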