Exemple #1
0
def crawl_sku_comment_count(sku_list):
    """Fetch comment counts for ``sku_list`` and persist them.

    Returns an error dict (``status`` -1 plus ``msg``) when the JD API yields
    nothing, or a different number of entries than there are distinct SKUs.
    Otherwise returns whatever
    ``crawler_helper.persist_db_history_and_latest`` returns.
    """
    comment_dicts = jd_API.getCommentCount_JD(sku_list)
    api_size = len(comment_dicts)
    if api_size == 0:
        return {'status': -1, 'msg': 'jd api returned no result for sku_list'}

    sku_size = len(set(sku_list))
    if api_size != sku_size:
        msg = 'jd api return size mismatch, size of sku:%s, size of api:%s' % (
            sku_size, api_size)
        return {'status': -1, 'msg': msg}

    now = timeHelper.getNow()
    rows = []
    for entry in comment_dicts:
        # Stamp the entry itself so 'dt' becomes one more column of its row;
        # the row follows the entry's own key iteration order.
        entry['dt'] = now
        rows.append([entry[field] for field in entry])

    # The first entry already carries 'dt', so its size is the column count.
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_comment_count',
        num_cols=len(comment_dicts[0]),
        value_list=rows,
        is_many=True)
Exemple #2
0
def crawl_sku_stock_status(sku_list):
    """Fetch the resolved stock status of ``sku_list`` and persist it.

    Args:
        sku_list: SKU ids forwarded to ``jd_API.get_Stock_Status_Resolved``.

    Returns:
        The result of ``crawler_helper.persist_db_history_and_latest``, or an
        error dict (``status`` -1 plus ``msg``) when the API produced no rows.
    """
    vlist = jd_API.get_Stock_Status_Resolved(sku_list)
    # Without this guard, vlist[0] below raises on an empty (or None) result.
    # The error shape mirrors the one crawl_sku_comment_count returns.
    if not vlist:
        return {'status': -1, 'msg': 'jd api returned no result for sku_list'}
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_stock',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True)
def crawl_sku_stock_status(sku_list):
    """Persist the resolved stock status rows for the given SKUs.

    Args:
        sku_list: SKU ids forwarded to ``jd_API.get_Stock_Status_Resolved``.

    Returns:
        An error dict (``status`` -1 plus ``msg``) when the API produced no
        rows; otherwise the result of
        ``crawler_helper.persist_db_history_and_latest``.
    """
    vlist = jd_API.get_Stock_Status_Resolved(sku_list)
    if not vlist:
        # An empty (or None) result would otherwise crash on vlist[0] below;
        # report it the same way crawl_sku_comment_count does.
        return {'status': -1, 'msg': 'jd api returned no result for sku_list'}
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_stock',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True
    )
def crawl_sku_price(sku_list, sleep_time):
    """Fetch prices for ``sku_list`` and persist them to ``jd_item_price``.

    Args:
        sku_list: SKU ids forwarded to ``jd_API.getPrices_JD``.
        sleep_time: passed through to ``jd_API.getPrices_JD`` as
            ``sleep_time``.

    Returns:
        The result of ``crawler_helper.persist_db_history_and_latest`` (called
        with history and flow enabled), or an error dict (``status`` -1 plus
        ``msg``) when the API returned nothing.
    """
    # no more than 5000 items here per design
    rdict = jd_API.getPrices_JD(sku_list, sleep_time=sleep_time)
    # An empty result would leave vlist empty and make vlist[0] below raise.
    if not rdict:
        return {'status': -1, 'msg': 'jd api returned no result for sku_list'}

    dt = timeHelper.getNow()
    dtlong = timeHelper.getNowLong()
    # Row layout: sku key, date, long timestamp, then the first three items of
    # the API value (price, price_m, price_pcp).
    vlist = [
        [key, dt, dtlong, tp[0], tp[1], tp[2]]
        for key, tp in rdict.items()
    ]
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_price',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=True,
        need_flow=True,
    )
Exemple #5
0
def crawl_category_promo(category_id):
    """Fetch the promotion data of a category and store it as one row.

    Returns a ``status`` 0 dict with a message when the API result is empty;
    otherwise returns the result of
    ``crawler_helper.persist_db_history_and_latest`` for ``jd_promo_category``.
    """
    promo = jd_API.get_Promo_Category(category_id)
    today = timeHelper.getNow()
    if len(promo) == 0:
        msg = 'empty in return, category_id=%s' % category_id
        return {'status': 0, 'msg': msg}

    row = [category_id, today]
    for field in ('quan', 'ads', 'prom'):
        dumped = json.dumps(promo[field])
        # An empty JSON list is stored as NULL rather than the text '[]'.
        row.append(None if dumped == '[]' else dumped)

    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_promo_category',
        num_cols=len(row),
        value_list=[row],
        is_many=True)
Exemple #6
0
def crawl_sku_price(sku_list, sleep_time):
    """Persist the current JD prices of the given SKUs.

    Args:
        sku_list: SKU ids forwarded to ``jd_API.getPrices_JD``.
        sleep_time: passed through to ``jd_API.getPrices_JD`` as
            ``sleep_time``.

    Returns:
        An error dict (``status`` -1 plus ``msg``) when the API returned
        nothing; otherwise the result of
        ``crawler_helper.persist_db_history_and_latest`` for
        ``jd_item_price`` (history and flow enabled).
    """
    # no more than 5000 items here per design
    rdict = jd_API.getPrices_JD(sku_list, sleep_time=sleep_time)
    if not rdict:
        # Nothing to store; without this guard vlist[0] below would raise.
        return {'status': -1, 'msg': 'jd api returned no result for sku_list'}

    dt = timeHelper.getNow()
    dtlong = timeHelper.getNowLong()
    vlist = []
    for key, tp in rdict.items():
        # The first three items of the API value are price, price_m, price_pcp.
        vlist.append([key, dt, dtlong, tp[0], tp[1], tp[2]])

    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_price',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=True,
        need_flow=True,
    )
def crawl_sku_comment_count(sku_list):
    """Persist the JD comment counts of the given SKUs.

    An error dict (``status`` -1 plus ``msg``) is returned when the API gives
    no entries, or when the number of entries differs from the number of
    distinct SKUs. Otherwise the result of
    ``crawler_helper.persist_db_history_and_latest`` is returned.
    """
    clist = jd_API.getCommentCount_JD(sku_list)
    received = len(clist)
    if received == 0:
        return {'status': -1, 'msg': 'jd api returned no result for sku_list'}

    expected = len(set(sku_list))
    if received != expected:
        return {
            'status': -1,
            'msg': 'jd api return size mismatch, size of sku:%s, size of api:%s'
                   % (expected, received),
        }

    stamp = timeHelper.getNow()

    def as_row(comment):
        # Adds the 'dt' column to the entry, then flattens it in key order.
        comment['dt'] = stamp
        return [comment[name] for name in comment]

    vlist = [as_row(comment) for comment in clist]

    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_comment_count',
        num_cols=len(clist[0]),
        value_list=vlist,
        is_many=True
    )
Exemple #8
0
def crawl_item_promo(sku_id):
    """Fetch the promotion data of one SKU and store it in ``jd_promo_item``.

    The stored row is ``[sku_id, timestamp, quan, ads, prom]`` where ``quan``
    is always the empty string, ``ads`` is the ``'ad'`` text of the first
    ``'ads'`` entry (empty string when unavailable) and ``prom`` is the JSON
    dump of ``rdict['prom']`` (``None`` when that dump is ``'[]'``).

    Args:
        sku_id: SKU id forwarded to ``jd_API.get_Promo_Sku``.

    Returns:
        The result of ``crawler_helper.persist_db_history_and_latest``.
    """
    rdict = jd_API.get_Promo_Sku(sku_id)
    dt = timeHelper.getNowLong()
    # 'quan' is not collected here; the column is written as "".
    quan = ""
    try:
        ads = rdict['ads'][0]['ad']
    except (KeyError, IndexError, TypeError):
        # Best effort: a missing, empty or oddly-shaped 'ads' entry falls back
        # to "". Only lookup failures are swallowed, so unrelated errors
        # (e.g. KeyboardInterrupt) are no longer hidden by a bare except.
        ads = ""
    prom = json.dumps(rdict['prom'])
    vlist = [[
        sku_id, dt, quan,
        ads if ads != '[]' else None, prom if prom != '[]' else None
    ]]
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_promo_item',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        # need_history=True
    )
def crawl_item_promo(sku_id):
    """Persist the promotion data of a single SKU to ``jd_promo_item``.

    One row ``[sku_id, timestamp, quan, ads, prom]`` is written: ``quan`` is
    always the empty string, ``ads`` is ``rdict['ads'][0]['ad']`` or the empty
    string when that lookup fails, and ``prom`` is the JSON dump of
    ``rdict['prom']`` (replaced by ``None`` when the dump is ``'[]'``).

    Args:
        sku_id: SKU id forwarded to ``jd_API.get_Promo_Sku``.

    Returns:
        The result of ``crawler_helper.persist_db_history_and_latest``.
    """
    rdict = jd_API.get_Promo_Sku(sku_id)
    dt = timeHelper.getNowLong()
    quan = ""   # 'quan' is not collected here; stored as an empty string
    ads = ""
    try:
        ads = rdict['ads'][0]['ad']
    except (KeyError, IndexError, TypeError):
        # Deliberate best effort: keep "" when 'ads' is missing, empty or not
        # shaped as expected. The previous bare except also hid unrelated
        # errors such as KeyboardInterrupt.
        pass
    prom = json.dumps(rdict['prom'])
    vlist = [[
        sku_id,
        dt,
        quan,
        ads if ads != '[]' else None,
        prom if prom != '[]' else None
    ]]
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_promo_item',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        # need_history=True
    )
def crawl_category_promo(category_id):
    """Persist the promotion data of one category to ``jd_promo_category``.

    When the API result is empty, a ``status`` 0 dict carrying a message is
    returned and nothing is written. Otherwise the result of
    ``crawler_helper.persist_db_history_and_latest`` is returned.
    """
    rdict = jd_API.get_Promo_Category(category_id)
    dt = timeHelper.getNow()
    if len(rdict) == 0:
        return {'status': 0,
                'msg': 'empty in return, category_id=%s' % category_id}

    def dump_or_none(field):
        # JSON text of the field, or None when it is an empty list.
        text = json.dumps(rdict[field])
        return text if text != '[]' else None

    record = [
        category_id,
        dt,
        dump_or_none('quan'),
        dump_or_none('ads'),
        dump_or_none('prom'),
    ]
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_promo_category',
        num_cols=len(record),
        value_list=[record],
        is_many=True
    )
def crawl_category(category_id):
    """Crawl every listing page of a JD category and persist its products.

    Page 1 is fetched first, trying utf-8, then gb18030, then gbk for as long
    as the response is an empty string. If page 1 lists no products, the crawl
    is retried on the id returned by ``__up_roll_category_id__``. Otherwise the
    remaining pages are fetched and the products are written to
    ``jd_item_dynamic``, ``jd_item_category`` and ``jd_item_firstseen``.

    Args:
        category_id: id of the category to crawl. ``None`` yields the
            "no item" error result without any web request.

    Returns:
        dict: ``{'status': -1, 'msg': ...}`` when there is nothing to crawl.
        Otherwise a dict with ``status`` (0 only when the item_dynamic persist
        reported status 0 and the item_category write affected at least one
        row, else -1), ``item_dynamic``, ``item_category`` and
        ``item_first_seen``.
    """
    if category_id is None:
        # Presumably reached when the roll-up below runs out of ids. The
        # previous flow returned this same result, but only after fetching
        # and parsing a page for the None id.
        return {'status': -1, 'msg': 'No item in category product list'}

    logging.debug('category_id = %s -- page 1', category_id)
    url = __get_category_page_url__(category_id, 1)
    html = ""
    for encoding in ('utf-8', 'gb18030', 'gbk'):
        html = url_utils.getWebResponse(url, encoding)
        if html != "":
            break
    total_pages = jd_list_resolver.resolveTotalPageNum(html)
    product_list = jd_list_resolver.resolveProductListFromPage(html)

    if len(product_list) == 0:
        # Empty listing: retry on the rolled-up category id.
        return crawl_category(__up_roll_category_id__(category_id))

    for page_iter in range(2, total_pages + 1):
        logging.debug('category_id = %s -- page %s', category_id, page_iter)
        url = __get_category_page_url__(category_id, page_iter)
        # NOTE(review): unlike page 1, only utf-8 is tried here -- confirm
        # that later pages never need the gb18030/gbk fallback.
        html = url_utils.getWebResponse(url, 'utf-8')
        product_list = product_list + jd_list_resolver.resolveProductListFromPage(html)
        time.sleep(SLEEP_TIME)

    # The price lookup that used to append three values to each product is
    # disabled; three zero placeholders keep the row width unchanged.
    product_list = [prod + (0, 0, 0) for prod in product_list]
    total_goods_num = len(product_list)

    # persist in database
    ret = crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_dynamic',
        num_cols=len(product_list[0]),
        value_list=product_list,
        is_many=True,
        need_history=False, # was True - changed 01/03
        need_flow=False,    # was True - changed 12/23
    )

    logging.debug('Saved to DB -- category_id = %s -- sku_count=%s',
                  category_id, total_goods_num)
    logging.debug('%s', ret)

    # HANDLE JD_ITEM_CATEGORY
    item_cat_list = [(prod[0], category_id) for prod in product_list]
    sql2 = 'replace into jd_item_category values (%s,%s)'
    affected_rows2 = dbhelper.executeSqlWriteMany(sql2, item_cat_list)
    logging.debug('Saved to DB - item_category - affected rows = %s', affected_rows2)
    if affected_rows2 <= 0:
        logging.error('Saving to item_category error, category_id = %s', category_id)

    # HANDLE JD_ITEM_FIRSTSEEN
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    # NOTE(review): the quoted "%s" placeholders depend on how
    # dbhelper.executeSqlWriteMany substitutes parameters -- confirm the
    # values do not end up double-quoted.
    sql3 = 'insert ignore into jd_item_firstseen values(%s,"%s","%s")'
    ftlist = [[item[0], nowtime, nowdate] for item in product_list]
    affected_rows3 = dbhelper.executeSqlWriteMany(sql3, ftlist)

    return {
        'status': 0 if ret['status'] == 0 and affected_rows2 > 0 else -1,
        'item_dynamic': ret,
        'item_category': affected_rows2,
        'item_first_seen': affected_rows3,
    }
def crawl_category(category_id):
    """Crawl all listing pages of a JD category and store the products found.

    The first page is requested as utf-8, then gb18030, then gbk, moving on
    only while the response is an empty string. When the first page lists no
    products, the crawl restarts on the id returned by
    ``__up_roll_category_id__``. Otherwise pages 2..total are fetched and the
    products are written to ``jd_item_dynamic``, ``jd_item_category`` and
    ``jd_item_firstseen``.

    Args:
        category_id: id of the category to crawl. ``None`` returns the
            "no item" error result immediately, without a web request.

    Returns:
        dict: ``{'status': -1, 'msg': ...}`` when there is nothing to crawl.
        Otherwise a dict with ``status`` (0 only when the item_dynamic persist
        reported status 0 and the item_category write affected at least one
        row, else -1), ``item_dynamic``, ``item_category`` and
        ``item_first_seen``.
    """
    if category_id is None:
        # Presumably the roll-up below ran out of ids. The earlier flow ended
        # with this same result, but only after fetching a page for None.
        return {'status': -1, 'msg': 'No item in category product list'}

    logging.debug('category_id = %s -- page 1', category_id)
    url = __get_category_page_url__(category_id, 1)
    html = url_utils.getWebResponse(url, 'utf-8')
    for fallback_encoding in ('gb18030', 'gbk'):
        if html != "":
            break
        html = url_utils.getWebResponse(url, fallback_encoding)
    total_pages = jd_list_resolver.resolveTotalPageNum(html)

    product_list = jd_list_resolver.resolveProductListFromPage(html)

    if len(product_list) == 0:
        # Nothing listed for this id: try again with the rolled-up id.
        category_id = __up_roll_category_id__(category_id)
        return crawl_category(category_id)

    for page_iter in range(2, total_pages + 1):
        logging.debug('category_id = %s -- page %s', category_id, page_iter)
        url = __get_category_page_url__(category_id, page_iter)
        # NOTE(review): only utf-8 is tried for these pages, unlike page 1 --
        # confirm the gb18030/gbk fallback is not needed here.
        html = url_utils.getWebResponse(url, 'utf-8')
        product_list = product_list + jd_list_resolver.resolveProductListFromPage(
            html)
        time.sleep(SLEEP_TIME)

    # Each product gets three trailing zeros in place of the values the
    # (disabled) price lookup used to append, so the row width is unchanged.
    product_list = [product + (0, 0, 0) for product in product_list]
    total_goods_num = len(product_list)

    # persist in database
    ret = crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_dynamic',
        num_cols=len(product_list[0]),
        value_list=product_list,
        is_many=True,
        need_history=False,  # was True - changed 01/03
        need_flow=False,  # was True - changed 12/23
    )

    logging.debug('Saved to DB -- category_id = %s -- sku_count=%s',
                  category_id, total_goods_num)
    logging.debug('%s', ret)

    # HANDLE JD_ITEM_CATEGORY
    item_cat_list = [(prod[0], category_id) for prod in product_list]
    sql2 = 'replace into jd_item_category values (%s,%s)'
    affected_rows2 = dbhelper.executeSqlWriteMany(sql2, item_cat_list)
    logging.debug('Saved to DB - item_category - affected rows = %s',
                  affected_rows2)
    if affected_rows2 <= 0:
        logging.error('Saving to item_category error, category_id = %s',
                      category_id)

    # HANDLE JD_ITEM_FIRSTSEEN
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    # NOTE(review): the quoted "%s" placeholders depend on how
    # dbhelper.executeSqlWriteMany substitutes parameters -- confirm the
    # values are not stored with an extra pair of quotes.
    sql3 = 'insert ignore into jd_item_firstseen values(%s,"%s","%s")'
    ftlist = [[item[0], nowtime, nowdate] for item in product_list]
    affected_rows3 = dbhelper.executeSqlWriteMany(sql3, ftlist)

    ret_obj = {
        'status': 0 if ret['status'] == 0 and affected_rows2 > 0 else -1,
        'item_dynamic': ret,
        'item_category': affected_rows2,
        'item_first_seen': affected_rows3,
    }

    return ret_obj