Exemple #1
0
def get_Stock_Status_Resolved(sku_list):
    """Fetch stock status for the given SKUs and shape rows for DB insert.

    Returns a list of value rows (one per SKU, ordered per key_list) on
    success, or a {'status': -1, 'msg': ...} dict when the API result is
    empty or its size does not match the de-duplicated request.
    """
    clist = get_Stock_Status(sku_list)
    if len(clist) == 0:
        return {'status': -1, 'msg': 'jd api returned no result for sku_list'}
    if len(clist) != len(set(sku_list)):
        return {
            'status':
            -1,
            'msg':
            'jd api return size mismatch, size of sku:%s, size of api:%s' %
            (len(set(sku_list)), len(clist))
        }
    vlist = []
    dt = timeHelper.getNowLong()
    key_list = ['sku_id', 'dt', 'a', 'b', 'c', 'l', 'j', 'stock_json']
    for cdict in clist:
        # Snapshot the raw API dict before stamping the crawl time onto it.
        cdict['stock_json'] = json.dumps(cdict)
        cdict['dt'] = dt
        # dict.get() yields None for absent keys -- same effect as the old
        # `if key in cdict.keys()` branch, without the redundant lookup.
        vlist.append([cdict.get(key) for key in key_list])
    return vlist
Exemple #2
0
    def doTask(self, M=1, N=1):
        """Run doTaskOnce in an endless loop, sleeping between runs.

        Daily jobs sleep until tomorrow; interval jobs sleep out the
        remainder of repeat_interval hours (floored at 0), plus 10s padding.
        """
        while True:
            retry = 0
            print('start job - %s' % timeHelper.getNowLong())
            started = time.time()
            is_success = self.doTaskOnce(M, N)
            elapsed = time.time() - started

            remaining = 600
            if self.is_daily:
                remaining = timeHelper.getTimeLeftTillTomorrow()
            else:
                # Sleep only for what is left of the configured interval.
                remaining = max(int(self.repeat_interval * 3600 - elapsed), 0)
            remaining += 10
            logging.info('=' * 80)
            logging.info('Finished crawling, using time: %s seconds' % elapsed)
            logging.info('Has Errors? %s' %
                         ('NO' if is_success == 1 else 'YES'))
            logging.info(
                'Now sleeping for %s seconds for next run (%.1f hours)' %
                (remaining, remaining / 3600))
            logging.info('=' * 80)
            time.sleep(remaining)
Exemple #3
0
 def __record_task_complete__(self, task_list):
     """Mark every task in task_list complete for this job in task_status."""
     ut = timeHelper.getNowLong()
     rows = [(self.job_name, task_id, ut) for task_id in task_list]
     sql = 'insert into task_status(job_name,task_id,update_time) values(%s,%s,%s)'
     return dbhelper.executeSqlWriteMany(sql, rows)
 def __record_task_complete__(self, task_list):
     """Persist completion rows (job_name, task_id, timestamp) for each task."""
     update_time = timeHelper.getNowLong()
     vlist = []
     for task_id in task_list:
         vlist.append((self.job_name, task_id, update_time))
     sql = 'insert into task_status(job_name,task_id,update_time) values(%s,%s,%s)'
     affected = dbhelper.executeSqlWriteMany(sql, vlist)
     return affected
def crawl_sku_price(sku_list, sleep_time):
    """Fetch JD prices for the given SKUs and persist them to jd_item_price.

    Returns the persist helper's result dict, or a {'status': -1, 'msg': ...}
    dict when the price API returns no rows (previously this path crashed
    with IndexError on vlist[0]).
    """
    # no more than 5000 items here per design
    rdict = jd_API.getPrices_JD(sku_list, sleep_time=sleep_time)
    vlist = []
    dt = timeHelper.getNow()
    dtlong = timeHelper.getNowLong()
    for key in rdict:
        # Each value is (price, price_m, price_pcp, ...) per the indexing below.
        price, price_m, price_pcp = rdict[key][:3]
        vlist.append([key, dt, dtlong, price, price_m, price_pcp])
    if not vlist:
        # Guard: an empty API result would make len(vlist[0]) raise IndexError.
        return {'status': -1, 'msg': 'jd price api returned no result'}
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_price',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=True,
        need_flow=True,
    )
Exemple #6
0
def crawl_sku_price(sku_list, sleep_time):
    """Fetch JD prices for the given SKUs and persist them to jd_item_price.

    Returns the persist helper's result dict, or a {'status': -1, 'msg': ...}
    dict when the price API returns no rows (previously this path crashed
    with IndexError on vlist[0]).
    """
    # no more than 5000 items here per design
    rdict = jd_API.getPrices_JD(sku_list, sleep_time=sleep_time)
    vlist = []
    dt = timeHelper.getNow()
    dtlong = timeHelper.getNowLong()
    for key in rdict:
        # Each value is (price, price_m, price_pcp, ...) per the indexing below.
        price, price_m, price_pcp = rdict[key][:3]
        vlist.append([key, dt, dtlong, price, price_m, price_pcp])
    if not vlist:
        # Guard: an empty API result would make len(vlist[0]) raise IndexError.
        return {'status': -1, 'msg': 'jd price api returned no result'}
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_price',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=True,
        need_flow=True,
    )
Exemple #7
0
def crawl_item_promo(sku_id):
    """Fetch promo info (quan/ads/prom) for one SKU and persist it."""
    rdict = jd_API.get_Promo_Sku(sku_id)
    dt = timeHelper.getNowLong()
    quan = ""  # json.dumps(rdict['quan'])  -- quan capture currently disabled
    ads = ""
    try:
        ads = rdict['ads'][0]['ad']  # json.dumps(rdict['ads'])
    except (KeyError, IndexError, TypeError):
        # Best-effort: missing/empty 'ads' leaves ads as "".  Narrowed from a
        # bare except that also swallowed KeyboardInterrupt/SystemExit.
        pass
    prom = json.dumps(rdict['prom'])
    # '[]' is an empty serialized list -- store NULL instead of the literal.
    vlist = [[
        sku_id, dt, quan if quan != '[]' else None,
        ads if ads != '[]' else None, prom if prom != '[]' else None
    ]]
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_promo_item',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        # need_history=True
    )
Exemple #8
0
def get_Stock_Status_Resolved(sku_list):
    """Resolve stock status rows for sku_list, validating the API result size.

    Returns a list of rows ordered per the column list below, or a
    {'status': -1, 'msg': ...} dict on an empty or mismatched API result.
    """
    clist = get_Stock_Status(sku_list)
    if not clist:
        return {'status': -1, 'msg': 'jd api returned no result for sku_list'}
    unique_count = len(set(sku_list))
    if len(clist) != unique_count:
        msg = ('jd api return size mismatch, size of sku:%s, size of api:%s'
               % (unique_count, len(clist)))
        return {'status': -1, 'msg': msg}
    now = timeHelper.getNowLong()
    columns = ['sku_id', 'dt', 'a', 'b', 'c', 'l', 'j', 'stock_json']
    rows = []
    for entry in clist:
        # Serialize the raw dict first, then stamp the crawl time on it.
        entry['stock_json'] = json.dumps(entry)
        entry['dt'] = now
        rows.append([entry[col] if col in entry else None for col in columns])
    return rows
    def doTask(self, M=1, N=1):
        """Run the crawl task in an endless loop with inter-run sleeps."""
        while True:
            retry = 0
            print('start job - %s' % timeHelper.getNowLong())
            start_ts = time.time()
            ok = self.doTaskOnce(M, N)
            end_ts = time.time()
            duration = end_ts - start_ts

            sleep_secs = 600
            if self.is_daily:
                # Daily jobs wait until the next calendar day.
                sleep_secs = timeHelper.getTimeLeftTillTomorrow()
            else:
                sleep_secs = int(self.repeat_interval * 3600 - duration)
                if sleep_secs < 0:
                    sleep_secs = 0
            sleep_secs += 10
            logging.info('=' * 80)
            logging.info('Finished crawling, using time: %s seconds' % duration)
            logging.info('Has Errors? %s' % ('NO' if ok == 1 else 'YES'))
            logging.info('Now sleeping for %s seconds for next run (%.1f hours)'
                         % (sleep_secs, sleep_secs / 3600))
            logging.info('=' * 80)
            time.sleep(sleep_secs)
def crawl_item_promo(sku_id):
    """Fetch promo info (quan/ads/prom) for one SKU and persist it."""
    rdict = jd_API.get_Promo_Sku(sku_id)
    dt = timeHelper.getNowLong()
    quan = ""   # json.dumps(rdict['quan'])  -- quan capture currently disabled
    ads = ""
    try:
        ads = rdict['ads'][0]['ad']    # json.dumps(rdict['ads'])
    except (KeyError, IndexError, TypeError):
        # Best-effort: missing/empty 'ads' leaves ads as "".  Narrowed from a
        # bare except that also swallowed KeyboardInterrupt/SystemExit.
        pass
    prom = json.dumps(rdict['prom'])
    # '[]' is an empty serialized list -- store NULL instead of the literal.
    vlist = [[
        sku_id,
        dt,
        quan if quan != '[]' else None,
        ads if ads != '[]' else None,
        prom if prom != '[]' else None
    ]]
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_promo_item',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        # need_history=True
    )
Exemple #11
0
def resolveProductListFromPage(html):
    """Parse a JD category listing page into product tuples.

    Each tuple is (sku_id, date, time, title, url, thumbnail_url, stock,
    comment_count, is_global, is_pay_on_delivery, is_free_gift, icon_url).
    SKUs with id > 99999999 (not JD self-operated) are skipped; per-SKU
    parse failures are logged and skipped.
    """
    product_list = []
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    doc = None
    try:
        doc = libxml2.htmlReadDoc(html, None, 'utf8', PARSE_OPTIONS)
        sku_docs = doc.xpathEval('//div[@data-sku]')
        for sku in sku_docs:
            sku_doc = None
            try:
                sku_doc = libxml2.htmlReadDoc('%s' % sku, None, 'utf8',
                                              PARSE_OPTIONS)

                sku_id = int(sku_doc.xpathEval('//@data-sku')[0].content)
                # Skip items that are not JD self-operated.
                if sku_id > 99999999:
                    continue

                sku_url = sku_doc.xpathEval(
                    '//div[@class="p-img"]/a/@href')[0].content
                try:
                    sku_thumnail_url = sku_doc.xpathEval(
                        '//div[@class="p-img"]/a/img/@data-lazy-img'
                    )[0].content
                except IndexError:
                    # Lazy-load attribute absent -- fall back to plain src.
                    sku_thumnail_url = sku_doc.xpathEval(
                        '//div[@class="p-img"]/a/img/@src')[0].content

                sku_title = ""
                try:
                    sku_title = sku_doc.xpathEval(
                        '//div[@class="p-name"]/a/@title')[0].content
                except IndexError:
                    pass

                if len(sku_title) == 0:
                    sku_title = sku_doc.xpathEval(
                        '//div[@class="p-name"]/a/em')[0].content
                comment_count = int(
                    sku_doc.xpathEval('//div[@class="p-commit"]/strong/a')
                    [0].content)

                sku_icon_url = ""
                icon_doc = sku_doc.xpathEval(
                    '//div[@class="p-img"]/a/div/@style')
                if len(icon_doc) > 0:
                    sku_icon_url = url_utils.getStringBetween(
                        icon_doc[0].content, 'url("', '")')

                is_global = is_free_gift = is_pay_on_delivery = 0
                price_items = sku_doc.xpathEval(
                    '//div[@class="p-price"]/div/i')
                for pitem in price_items:
                    txt = pitem.content
                    if '全球购' in txt:  # "global purchase" badge
                        is_global = 1
                    elif '货到付款' in txt:  # "pay on delivery" badge
                        is_pay_on_delivery = 1
                    elif '赠品' in txt:  # "free gift" badge
                        is_free_gift = 1
                    else:
                        print('new-mark found:')
                        print(txt)

                sku_stock = -1
                try:
                    sku_stock = int(
                        sku_doc.xpathEval('//div[@data-stock_v]/@data-stock_v')
                        [0].content)
                except (IndexError, ValueError):
                    pass

                sku_url = __makeUrl__(sku_url)
                sku_thumnail_url = __makeUrl__(sku_thumnail_url)

                tp = (sku_id, nowdate, nowtime, sku_title, sku_url,
                      sku_thumnail_url, sku_stock, comment_count, is_global,
                      is_pay_on_delivery, is_free_gift, sku_icon_url)
                product_list.append(tp)

            except Exception as e:
                # BUGFIX: '%' used to be applied to logging.error()'s None
                # return value, raising TypeError inside this handler.
                logging.error('resolveProductListError: %s, error = %s' %
                              (sku, e))
                continue
            finally:
                # Guard: htmlReadDoc may have raised before sku_doc was bound.
                if sku_doc is not None:
                    sku_doc.freeDoc()

        return product_list
    finally:
        if doc is not None:
            doc.freeDoc()
def crawl_category(category_id):
    """Crawl every listing page of a JD category and persist the products.

    Writes rows to jd_item_dynamic, jd_item_category and jd_item_firstseen
    and returns a status dict summarizing all three writes.  If page 1
    yields no products, recurses into the parent category (via
    __up_roll_category_id__) until one does or category_id becomes None.
    """

    logging.debug('category_id = %s -- page 1' %(category_id))
    url = __get_category_page_url__(category_id,1)
    # print url
    # Retry the fetch with fallback encodings when the first decode fails.
    html = url_utils.getWebResponse(url,'utf-8')
    if html == "":
        html = url_utils.getWebResponse(url,'gb18030')
    if html == "":
        html = url_utils.getWebResponse(url, 'gbk')
    total_pages = jd_list_resolver.resolveTotalPageNum(html)

    product_list = jd_list_resolver.resolveProductListFromPage(html)

    # NOTE(review): this 'while' always returns on its first iteration, so it
    # acts as an 'if' that recurses into the parent category.
    while len(product_list) == 0 and category_id is not None:
        category_id = __up_roll_category_id__(category_id)
        return crawl_category(category_id)

    if category_id is None or len(product_list)==0:
        return {'status':-1, 'msg': 'No item in category product list'}

    # Pages 2..total_pages (page 1 was already parsed above).
    for page_iter in range(2,total_pages+1):
        logging.debug('category_id = %s -- page %s' %(category_id,page_iter))
        url = __get_category_page_url__(category_id,page_iter)
        html = url_utils.getWebResponse(url,'utf-8')
        product_list = product_list + jd_list_resolver.resolveProductListFromPage(html)
        time.sleep(SLEEP_TIME)

    # sku_id is the first field of each product tuple.
    # NOTE(review): sku_list is only consumed by the commented-out price call.
    sku_list = []
    for product_tp in product_list:
        sku_id = product_tp[0]
        sku_list.append(sku_id)

    # Get price of all products
    #price_obj = jd_API.getPrices_JD(sku_list,sleep_time=SLEEP_PRICE_API)

    ret_obj = {
        'status': -1,
        'affected_rows': -1,
        'sku_count': -1
    }
    total_goods_num = len(product_list)

    # for item in product_list:
    #     print item[0]
    # print '='*80

    # combine product list and price list, timestamp, category_id
    for i in xrange(total_goods_num):
        product_id = product_list[i][0]
        pkey = '%s' %product_id
        # if pkey in price_obj:
        #     product_list[i] = product_list[i] + (price_obj[pkey][0],price_obj[pkey][1],price_obj[pkey][2],) #nowdate,nowtime,)
        # else:
        #     logging.error('Error: product_id=%s cannot get result' %(product_id,price_id))
        #     continue
        # Price crawling is disabled: pad with zero price columns so row
        # width still matches the jd_item_dynamic schema.
        product_list[i] = product_list[i] + (0,0,0,)

    # persist in database
    # (sku_id,sku_title,sku_url,sku_thumnail_url,sku_stock,comment_count,is_global,is_pay_on_delivery,is_free_gift,sku_icon_url, price, price_m, update_date,update_time, category_id)
    # sql = '''
    #   replace into jd_item_dynamic (sku_id,title,url,thumbnail_url,stock_status,comment_count,is_global,is_pay_on_delivery,
    #   has_free_gift,icon_url,price,price_m,price_pcp,update_date,update_time) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    #   '''
    # affected_rows = dbhelper.executeSqlWriteMany(sql,product_list)

    ret = crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_dynamic',
        num_cols=len(product_list[0]),
        value_list=product_list,
        is_many=True,
        need_history=False, # was True - changed 01/03
        need_flow=False,    # was True - changed 12/23
    )

    logging.debug('Saved to DB -- category_id = %s -- sku_count=%s' %(category_id,total_goods_num))
    logging.debug('%s' %ret)

    # HANDLE JD_ITEM_CATEGORY
    item_cat_list = []
    for prod in product_list:
        item_cat_list.append((prod[0],category_id,))
    sql2 = 'replace into jd_item_category values (%s,%s)'
    affected_rows2 = dbhelper.executeSqlWriteMany(sql2,item_cat_list)
    logging.debug('Saved to DB - item_category - affected rows = %s' %affected_rows2)
    if affected_rows2<=0:
        logging.error('Saving to item_category error, category_id = %s' %category_id)


    # HANDLE JD_ITEM_FIRSTSEEN
    # 'insert ignore' keeps the first-seen timestamp of already-known SKUs.
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    sql3 = 'insert ignore into jd_item_firstseen values(%s,"%s","%s")'
    ftlist = []
    for item in product_list:
        ftlist.append([item[0],nowtime,nowdate])
    affected_rows3 = dbhelper.executeSqlWriteMany(sql3,ftlist)

    # Overall status is 0 only when both main writes succeeded.
    ret_obj = {
        'status': 0 if ret['status']==0 and affected_rows2>0 else -1,
        'item_dynamic': ret,
        'item_category': affected_rows2,
        'item_first_seen': affected_rows3,
    }

    return ret_obj
def resolveProductListFromPage(html):
    """Parse a JD category listing page into product tuples.

    Each tuple is (sku_id, date, time, title, url, thumbnail_url, stock,
    comment_count, is_global, is_pay_on_delivery, is_free_gift, icon_url).
    SKUs with id > 99999999 (not JD self-operated) are skipped; per-SKU
    parse failures are logged and skipped.
    """
    product_list = []
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    doc = None
    try:
        doc = libxml2.htmlReadDoc(html, None, "utf8", PARSE_OPTIONS)
        sku_docs = doc.xpathEval("//div[@data-sku]")
        for sku in sku_docs:
            sku_doc = None
            try:
                sku_doc = libxml2.htmlReadDoc("%s" % sku, None, "utf8", PARSE_OPTIONS)

                sku_id = int(sku_doc.xpathEval("//@data-sku")[0].content)
                # Skip items that are not JD self-operated.
                if sku_id > 99999999:
                    continue

                sku_url = sku_doc.xpathEval('//div[@class="p-img"]/a/@href')[0].content
                try:
                    sku_thumnail_url = sku_doc.xpathEval('//div[@class="p-img"]/a/img/@data-lazy-img')[0].content
                except IndexError:
                    # Lazy-load attribute absent -- fall back to plain src.
                    sku_thumnail_url = sku_doc.xpathEval('//div[@class="p-img"]/a/img/@src')[0].content

                sku_title = ""
                try:
                    sku_title = sku_doc.xpathEval('//div[@class="p-name"]/a/@title')[0].content
                except IndexError:
                    pass

                if len(sku_title) == 0:
                    sku_title = sku_doc.xpathEval('//div[@class="p-name"]/a/em')[0].content
                comment_count = int(sku_doc.xpathEval('//div[@class="p-commit"]/strong/a')[0].content)

                sku_icon_url = ""
                icon_doc = sku_doc.xpathEval('//div[@class="p-img"]/a/div/@style')
                if len(icon_doc) > 0:
                    sku_icon_url = url_utils.getStringBetween(icon_doc[0].content, 'url("', '")')

                is_global = is_free_gift = is_pay_on_delivery = 0
                price_items = sku_doc.xpathEval('//div[@class="p-price"]/div/i')
                for pitem in price_items:
                    txt = pitem.content
                    if "全球购" in txt:  # "global purchase" badge
                        is_global = 1
                    elif "货到付款" in txt:  # "pay on delivery" badge
                        is_pay_on_delivery = 1
                    elif "赠品" in txt:  # "free gift" badge
                        is_free_gift = 1
                    else:
                        print("new-mark found:")
                        print(txt)

                sku_stock = -1
                try:
                    sku_stock = int(sku_doc.xpathEval("//div[@data-stock_v]/@data-stock_v")[0].content)
                except (IndexError, ValueError):
                    pass

                sku_url = __makeUrl__(sku_url)
                sku_thumnail_url = __makeUrl__(sku_thumnail_url)

                tp = (
                    sku_id,
                    nowdate,
                    nowtime,
                    sku_title,
                    sku_url,
                    sku_thumnail_url,
                    sku_stock,
                    comment_count,
                    is_global,
                    is_pay_on_delivery,
                    is_free_gift,
                    sku_icon_url,
                )
                product_list.append(tp)

            except Exception as e:
                # BUGFIX: '%' used to be applied to logging.error()'s None
                # return value, raising TypeError inside this handler.
                logging.error("resolveProductListError: %s, error = %s" % (sku, e))
                continue
            finally:
                # Guard: htmlReadDoc may have raised before sku_doc was bound.
                if sku_doc is not None:
                    sku_doc.freeDoc()

        return product_list
    finally:
        if doc is not None:
            doc.freeDoc()
def crawl_category(category_id):
    """Crawl every listing page of a JD category and persist the products.

    Writes rows to jd_item_dynamic, jd_item_category and jd_item_firstseen
    and returns a status dict summarizing all three writes.  If page 1
    yields no products, recurses into the parent category (via
    __up_roll_category_id__) until one does or category_id becomes None.
    """

    logging.debug('category_id = %s -- page 1' % (category_id))
    url = __get_category_page_url__(category_id, 1)
    # print url
    # Retry the fetch with fallback encodings when the first decode fails.
    html = url_utils.getWebResponse(url, 'utf-8')
    if html == "":
        html = url_utils.getWebResponse(url, 'gb18030')
    if html == "":
        html = url_utils.getWebResponse(url, 'gbk')
    total_pages = jd_list_resolver.resolveTotalPageNum(html)

    product_list = jd_list_resolver.resolveProductListFromPage(html)

    # NOTE(review): this 'while' always returns on its first iteration, so it
    # acts as an 'if' that recurses into the parent category.
    while len(product_list) == 0 and category_id is not None:
        category_id = __up_roll_category_id__(category_id)
        return crawl_category(category_id)

    if category_id is None or len(product_list) == 0:
        return {'status': -1, 'msg': 'No item in category product list'}

    # Pages 2..total_pages (page 1 was already parsed above).
    for page_iter in range(2, total_pages + 1):
        logging.debug('category_id = %s -- page %s' % (category_id, page_iter))
        url = __get_category_page_url__(category_id, page_iter)
        html = url_utils.getWebResponse(url, 'utf-8')
        product_list = product_list + jd_list_resolver.resolveProductListFromPage(
            html)
        time.sleep(SLEEP_TIME)

    # sku_id is the first field of each product tuple.
    # NOTE(review): sku_list is only consumed by the commented-out price call.
    sku_list = []
    for product_tp in product_list:
        sku_id = product_tp[0]
        sku_list.append(sku_id)

    # Get price of all products
    #price_obj = jd_API.getPrices_JD(sku_list,sleep_time=SLEEP_PRICE_API)

    ret_obj = {'status': -1, 'affected_rows': -1, 'sku_count': -1}
    total_goods_num = len(product_list)

    # for item in product_list:
    #     print item[0]
    # print '='*80

    # combine product list and price list, timestamp, category_id
    for i in xrange(total_goods_num):
        product_id = product_list[i][0]
        pkey = '%s' % product_id
        # if pkey in price_obj:
        #     product_list[i] = product_list[i] + (price_obj[pkey][0],price_obj[pkey][1],price_obj[pkey][2],) #nowdate,nowtime,)
        # else:
        #     logging.error('Error: product_id=%s cannot get result' %(product_id,price_id))
        #     continue
        # Price crawling is disabled: pad with zero price columns so row
        # width still matches the jd_item_dynamic schema.
        product_list[i] = product_list[i] + (
            0,
            0,
            0,
        )

    # persist in database
    # (sku_id,sku_title,sku_url,sku_thumnail_url,sku_stock,comment_count,is_global,is_pay_on_delivery,is_free_gift,sku_icon_url, price, price_m, update_date,update_time, category_id)
    # sql = '''
    #   replace into jd_item_dynamic (sku_id,title,url,thumbnail_url,stock_status,comment_count,is_global,is_pay_on_delivery,
    #   has_free_gift,icon_url,price,price_m,price_pcp,update_date,update_time) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    #   '''
    # affected_rows = dbhelper.executeSqlWriteMany(sql,product_list)

    ret = crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_dynamic',
        num_cols=len(product_list[0]),
        value_list=product_list,
        is_many=True,
        need_history=False,  # was True - changed 01/03
        need_flow=False,  # was True - changed 12/23
    )

    logging.debug('Saved to DB -- category_id = %s -- sku_count=%s' %
                  (category_id, total_goods_num))
    logging.debug('%s' % ret)

    # HANDLE JD_ITEM_CATEGORY
    item_cat_list = []
    for prod in product_list:
        item_cat_list.append((
            prod[0],
            category_id,
        ))
    sql2 = 'replace into jd_item_category values (%s,%s)'
    affected_rows2 = dbhelper.executeSqlWriteMany(sql2, item_cat_list)
    logging.debug('Saved to DB - item_category - affected rows = %s' %
                  affected_rows2)
    if affected_rows2 <= 0:
        logging.error('Saving to item_category error, category_id = %s' %
                      category_id)

    # HANDLE JD_ITEM_FIRSTSEEN
    # 'insert ignore' keeps the first-seen timestamp of already-known SKUs.
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    sql3 = 'insert ignore into jd_item_firstseen values(%s,"%s","%s")'
    ftlist = []
    for item in product_list:
        ftlist.append([item[0], nowtime, nowdate])
    affected_rows3 = dbhelper.executeSqlWriteMany(sql3, ftlist)

    # Overall status is 0 only when both main writes succeeded.
    ret_obj = {
        'status': 0 if ret['status'] == 0 and affected_rows2 > 0 else -1,
        'item_dynamic': ret,
        'item_category': affected_rows2,
        'item_first_seen': affected_rows3,
    }

    return ret_obj