Ejemplo n.º 1
0
def get_Promo_Sku(sku_id):
    """Fetch the promotion sections JD reports for a single SKU.

    Calls the ``cd.jd.com/promotion/v2`` endpoint (area and category are
    hard-coded in the URL) and keeps each of the ``quan``, ``ads`` and
    ``prom`` sections whose matching ``<name>Status`` field equals 200.

    The download is attempted up to three times: twice passing ``'gbk'`` to
    ``url_utils.getWebResponse`` (presumably the response encoding -- confirm
    against that helper) and once with no second argument.  If every attempt
    fails, the placeholder ``"{}"`` is parsed instead, which ends in the
    logged-error path below.

    :param sku_id: SKU identifier interpolated into the request URL.
    :return: dict holding any of the keys ``'quan'``, ``'ads'``, ``'prom'``.
        Empty when nothing could be fetched or parsed.  If a status field is
        missing, sections collected before it are still returned.
    """
    # http://cd.jd.com/promotion/v2?skuId=1279827&area=1_72_2799_123&cat=670,729,7311199999
    api_url = 'http://cd.jd.com/promotion/v2?skuId=%s&area=1_72_2799_123&cat=670,729,7311199999' % sku_id
    json_str = "{}"
    for extra_args in (('gbk',), ('gbk',), ()):
        try:
            json_str = url_utils.getWebResponse(api_url, *extra_args)
            break
        except Exception:
            # Best effort: move on to the next attempt.  ``Exception`` (not a
            # bare ``except``) lets KeyboardInterrupt/SystemExit stop a crawl.
            # A total failure is reported by the parse step below.
            continue

    ret_map = {}
    try:
        obj = json.loads(json_str)
        for section in ('quan', 'ads', 'prom'):
            if obj[section + 'Status'] == 200:
                ret_map[section] = obj[section]
    except Exception:
        logging.error('JD_API::get_Promo_Sku() failed, sku_id = %s', sku_id)
    return ret_map
Ejemplo n.º 2
0
def get_Promo_Sku(sku_id):
    """Fetch the promotion sections JD reports for a single SKU.

    Calls the ``cd.jd.com/promotion/v2`` endpoint (area and category are
    hard-coded in the URL) and keeps each of the ``quan``, ``ads`` and
    ``prom`` sections whose matching ``<name>Status`` field equals 200.

    The download is attempted up to three times: twice passing ``'gbk'`` to
    ``url_utils.getWebResponse`` (presumably the response encoding -- confirm
    against that helper) and once with no second argument.  If every attempt
    fails, the placeholder ``"{}"`` is parsed instead, which ends in the
    logged-error path below.

    :param sku_id: SKU identifier interpolated into the request URL.
    :return: dict holding any of the keys ``'quan'``, ``'ads'``, ``'prom'``.
        Empty when nothing could be fetched or parsed.  If a status field is
        missing, sections collected before it are still returned.
    """
    # http://cd.jd.com/promotion/v2?skuId=1279827&area=1_72_2799_123&cat=670,729,7311199999
    api_url = 'http://cd.jd.com/promotion/v2?skuId=%s&area=1_72_2799_123&cat=670,729,7311199999' % sku_id
    json_str = "{}"
    for extra_args in (('gbk',), ('gbk',), ()):
        try:
            json_str = url_utils.getWebResponse(api_url, *extra_args)
            break
        except Exception:
            # Best effort: move on to the next attempt.  ``Exception`` (not a
            # bare ``except``) lets KeyboardInterrupt/SystemExit stop a crawl.
            # A total failure is reported by the parse step below.
            continue

    ret_map = {}
    try:
        obj = json.loads(json_str)
        for section in ('quan', 'ads', 'prom'):
            if obj[section + 'Status'] == 200:
                ret_map[section] = obj[section]
    except Exception:
        logging.error('JD_API::get_Promo_Sku() failed, sku_id = %s', sku_id)
    return ret_map
Ejemplo n.º 3
0
def __getPrices_JD_100__(sku_list):
    """Fetch and normalise the price records for one batch of SKUs.

    The request URL is built by ``__get_price_call_url___`` and the JSON
    response is expected to be a list of dicts.  Each dict is converted in
    place: ``id`` to int, ``p`` and ``m`` to float, and ``pcp`` to float when
    present or ``None`` when absent.  (The meaning of the individual price
    fields is not visible here -- presumably price variants; confirm against
    the API.)

    :param sku_list: sized collection of SKU ids; an empty one short-circuits.
    :return: the decoded list with converted fields, or ``[]`` for empty input.
    """
    if len(sku_list) == 0:
        return []
    api_url = __get_price_call_url___(sku_list)
    response_text = url_utils.getWebResponse(api_url)
    price_records = json.loads(response_text)
    for record in price_records:
        # Mandatory fields, converted in the same order as they are listed.
        for field, convert in (('id', int), ('p', float), ('m', float)):
            record[field] = convert(record[field])
        # Optional field: always present afterwards, None when not supplied.
        record['pcp'] = float(record['pcp']) if 'pcp' in record else None
    return price_records
Ejemplo n.º 4
0
def __getPrices_JD_100__(sku_list):
    """Fetch and normalise the price records for one batch of SKUs.

    The request URL is built by ``__get_price_call_url___`` and the JSON
    response is expected to be a list of dicts.  Each dict is converted in
    place: ``id`` to int, ``p`` and ``m`` to float, and ``pcp`` to float when
    present or ``None`` when absent.  (The meaning of the individual price
    fields is not visible here -- presumably price variants; confirm against
    the API.)

    :param sku_list: sized collection of SKU ids; an empty one short-circuits.
    :return: the decoded list with converted fields, or ``[]`` for empty input.
    """
    if len(sku_list) == 0:
        return []
    api_url = __get_price_call_url___(sku_list)
    response_text = url_utils.getWebResponse(api_url)
    price_records = json.loads(response_text)
    for record in price_records:
        # Mandatory fields, converted in the same order as they are listed.
        for field, convert in (('id', int), ('p', float), ('m', float)):
            record[field] = convert(record[field])
        # Optional field: always present afterwards, None when not supplied.
        record['pcp'] = float(record['pcp']) if 'pcp' in record else None
    return price_records
Ejemplo n.º 5
0
def __getCommentCount_JD__(sku_list):
    """Fetch the comment-count records for a batch of SKUs.

    The SKU ids are joined with commas into the ``referenceIds`` parameter of
    the ``club.jd.com`` ``GetCommentsCount`` service, and the
    ``CommentsCount`` entry of the JSON response is returned.

    Errors raised by the download itself propagate to the caller; only a
    response that cannot be parsed (or lacks ``CommentsCount``) is handled
    here.

    :param sku_list: iterable of SKU ids (each is formatted with ``%s``).
    :return: the value of ``CommentsCount`` from the response, or ``[]`` when
        the response could not be parsed.
    """
    # http://club.jd.com/clubservice.aspx?method=GetCommentsCount&referenceIds=1279827
    sku_str = ','.join(['%s' % sku for sku in sku_list])
    api_url = 'http://club.jd.com/clubservice.aspx?method=GetCommentsCount&referenceIds=%s' % sku_str
    json_str = url_utils.getWebResponse(api_url)
    ret_list = []
    try:
        ret_list = json.loads(json_str)['CommentsCount']
    except Exception:
        # This function handles a batch, so there is no single ``sku_id`` in
        # scope; log the whole requested id string instead.
        logging.error('JD_API::getCommentCount_JD() failed, sku_id = %s', sku_str)
    return ret_list
Ejemplo n.º 6
0
def _get_Stock_Status(sku_list):
    """Query the stock state of several SKUs in a single request.

    The ids are joined with ``;`` into the ``skuNum`` parameter of the
    ``ss.3.cn`` ``areaStockState/mget`` service (the area is hard-coded in the
    URL).  The JSON response is iterated as a mapping from SKU key to a dict;
    each of those dicts is tagged in place with its key under ``'sku_id'``.

    :param sku_list: iterable of SKU ids (each is formatted with ``%s``).
    :return: list of the per-SKU dicts from the response, each carrying an
        added ``'sku_id'`` entry.
    """
    # Example request:
    # http://ss.3.cn/ss/areaStockState/mget?app=search_pc&ch=1&skuNum=1861098;1856588;1867038&area=1,2901,2906,0
    joined_ids = ';'.join(['%s' % sku_id for sku_id in sku_list])
    api_url = "http://ss.3.cn/ss/areaStockState/mget?app=search_pc&ch=1&skuNum=%s&area=1,2901,2906,0" % joined_ids
    stock_by_sku = json.loads(url_utils.getWebResponse(api_url))

    def _tagged(sku_key):
        # Attach the mapping key to its own record so the list is self-describing.
        record = stock_by_sku[sku_key]
        record['sku_id'] = sku_key
        return record

    return [_tagged(sku_key) for sku_key in stock_by_sku]
Ejemplo n.º 7
0
def __getCommentCount_JD__(sku_list):
    """Fetch the comment-count records for a batch of SKUs.

    The SKU ids are joined with commas into the ``referenceIds`` parameter of
    the ``club.jd.com`` ``GetCommentsCount`` service, and the
    ``CommentsCount`` entry of the JSON response is returned.

    Errors raised by the download itself propagate to the caller; only a
    response that cannot be parsed (or lacks ``CommentsCount``) is handled
    here.

    :param sku_list: iterable of SKU ids (each is formatted with ``%s``).
    :return: the value of ``CommentsCount`` from the response, or ``[]`` when
        the response could not be parsed.
    """
    # http://club.jd.com/clubservice.aspx?method=GetCommentsCount&referenceIds=1279827
    sku_str = ','.join(['%s' % sku for sku in sku_list])
    api_url = 'http://club.jd.com/clubservice.aspx?method=GetCommentsCount&referenceIds=%s' % sku_str
    json_str = url_utils.getWebResponse(api_url)
    ret_list = []
    try:
        ret_list = json.loads(json_str)['CommentsCount']
    except Exception:
        # This function handles a batch, so there is no single ``sku_id`` in
        # scope; log the whole requested id string instead.
        logging.error('JD_API::getCommentCount_JD() failed, sku_id = %s', sku_str)
    return ret_list
Ejemplo n.º 8
0
def _get_Stock_Status(sku_list):
    """Query the stock state of several SKUs in a single request.

    The ids are joined with ``;`` into the ``skuNum`` parameter of the
    ``ss.3.cn`` ``areaStockState/mget`` service (the area is hard-coded in the
    URL).  The JSON response is iterated as a mapping from SKU key to a dict;
    each of those dicts is tagged in place with its key under ``'sku_id'``.

    :param sku_list: iterable of SKU ids (each is formatted with ``%s``).
    :return: list of the per-SKU dicts from the response, each carrying an
        added ``'sku_id'`` entry.
    """
    # Example request:
    # http://ss.3.cn/ss/areaStockState/mget?app=search_pc&ch=1&skuNum=1861098;1856588;1867038&area=1,2901,2906,0
    joined_ids = ';'.join(['%s' % sku_id for sku_id in sku_list])
    api_url = "http://ss.3.cn/ss/areaStockState/mget?app=search_pc&ch=1&skuNum=%s&area=1,2901,2906,0" % joined_ids
    stock_by_sku = json.loads(url_utils.getWebResponse(api_url))

    def _tagged(sku_key):
        # Attach the mapping key to its own record so the list is self-describing.
        record = stock_by_sku[sku_key]
        record['sku_id'] = sku_key
        return record

    return [_tagged(sku_key) for sku_key in stock_by_sku]
Ejemplo n.º 9
0
def __get_detail_page_content__(sku_id):
    """Return the HTML of a SKU's detail page, going through the ``mc`` cache.

    A value cached under ``JD_DETAIL_HTML9_<sku_id>`` is returned as-is.
    Otherwise the page is downloaded and decoded as GBK, falling back to
    GB18030.  A non-empty result is stored in the cache with
    ``MEMCACHE_DETAIL_HTML_TIMEOUT`` before being returned.

    Ordinary errors in the download and decode steps do not propagate: a
    failed download is logged and yields an empty result (which is not
    cached); a page that decodes with neither codec is returned undecoded
    after a warning.

    :param sku_id: SKU identifier used for both the cache key and the URL.
    :return: the page content, or an empty string when the download failed.
    """
    mc_key = 'JD_DETAIL_HTML9_%s' % sku_id
    cached = mc.get(mc_key)
    if cached is not None:
        return cached

    html = ""
    url = None  # bound up front so the log calls below can always format it
    try:
        url = __get_detail_page_url__(sku_id)
        html = url_utils.getWebResponse(url)
    except Exception:
        # Report the failure instead of letting it disappear into the decode
        # fallback; ``html`` stays empty, so nothing gets cached below.
        logging.exception('JD detail page fetch failed, sku_id = %s, url = %s',
                          sku_id, url)

    for encoding in ('gbk', 'gb18030'):
        try:
            html = html.decode(encoding)
            break
        except Exception:
            continue
    else:
        # Neither codec worked: keep the undecoded content.
        logging.warning('url=%s, failed decoding using GBK or GB18030, using utf-8 now... may cause problems', url)

    if len(html) > 0:
        mc.set(mc_key, html, MEMCACHE_DETAIL_HTML_TIMEOUT)
    return html
Ejemplo n.º 10
0
def get_Promo_Category(category_id):
    """Fetch the promotion sections JD reports for a category.

    ``category_id`` has its ``-`` separators replaced by URL-encoded commas
    (``%2C``) and is sent as the ``cat`` parameter of the
    ``cd.jd.com/promotion/v2`` endpoint (``skuId`` and area are hard-coded).
    Each of the ``quan``, ``ads`` and ``prom`` sections is kept when its
    matching ``<name>Status`` field equals 200.

    Errors raised by the download itself propagate to the caller; a response
    that cannot be parsed is logged.

    :param category_id: string of category ids separated by ``-``.
    :return: dict holding any of the keys ``'quan'``, ``'ads'``, ``'prom'``.
        If a status field is missing, sections collected before it are still
        returned.
    """
    # http://cd.jd.com/promotion/v2?skuId=1&area=1_72_2799_123&cat=737%2C794%2C798
    cat_id = category_id.replace('-', '%2C')
    api_url = 'http://cd.jd.com/promotion/v2?skuId=1&area=1_72_2799_123&cat=%s' % cat_id
    json_str = url_utils.getWebResponse(api_url, 'gbk')
    ret_map = {}
    try:
        obj = json.loads(json_str)
        for section in ('quan', 'ads', 'prom'):
            if obj[section + 'Status'] == 200:
                ret_map[section] = obj[section]
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt/SystemExit are not swallowed here.
        logging.error('JD_API::get_Promo_Category() failed, category_id = %s', category_id)
    return ret_map
Ejemplo n.º 11
0
def get_Promo_Category(category_id):
    """Fetch the promotion sections JD reports for a category.

    ``category_id`` has its ``-`` separators replaced by URL-encoded commas
    (``%2C``) and is sent as the ``cat`` parameter of the
    ``cd.jd.com/promotion/v2`` endpoint (``skuId`` and area are hard-coded).
    Each of the ``quan``, ``ads`` and ``prom`` sections is kept when its
    matching ``<name>Status`` field equals 200.

    Errors raised by the download itself propagate to the caller; a response
    that cannot be parsed is logged.

    :param category_id: string of category ids separated by ``-``.
    :return: dict holding any of the keys ``'quan'``, ``'ads'``, ``'prom'``.
        If a status field is missing, sections collected before it are still
        returned.
    """
    # http://cd.jd.com/promotion/v2?skuId=1&area=1_72_2799_123&cat=737%2C794%2C798
    cat_id = category_id.replace('-', '%2C')
    api_url = 'http://cd.jd.com/promotion/v2?skuId=1&area=1_72_2799_123&cat=%s' % cat_id
    json_str = url_utils.getWebResponse(api_url, 'gbk')
    ret_map = {}
    try:
        obj = json.loads(json_str)
        for section in ('quan', 'ads', 'prom'):
            if obj[section + 'Status'] == 200:
                ret_map[section] = obj[section]
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt/SystemExit are not swallowed here.
        logging.error('JD_API::get_Promo_Category() failed, category_id = %s', category_id)
    return ret_map
Ejemplo n.º 12
0
def loadCategoryList():
    """Download JD's category list and store it in the ``jd_category`` table.

    The JSONP payload at ``JD_CATEGORY_WEBSERVICE_URL`` is unwrapped, parsed
    and flattened by ``__extractCategoryList_fromJson__`` into ``|``-separated
    lines.  Field 0 of a line is treated as the category id and field 1 as its
    name.  Lines with fewer than four fields are reported and skipped; lines
    whose first field is empty or contains ``.com`` (a URL rather than an id)
    are not stored.

    Progress is printed to stdout.

    :return: always ``0``.
    """
    payload = url_utils.getWebResponse(JD_CATEGORY_WEBSERVICE_URL, JD_ENC)
    parsed = json.loads(url_utils.removeJsonP(payload))
    category_lines = __extractCategoryList_fromJson__(parsed)

    rows = []
    for line in category_lines:
        print(line)
        fields = line.split('|')
        if len(fields) < 4:
            print('error in length of category line')
            print(line)
            continue
        first_field = fields[0]
        # Only a non-empty first field that is not a URL counts as an id.
        if '.com' in first_field or len(first_field) == 0:
            continue
        rows.append((first_field, fields[1], timeHelper.getNow()))

    # persist categories
    sql = 'replace into jd_category values(%s,%s,%s)'
    affected_rows = dbhelper.executeSqlWriteMany(sql, rows)

    print('rows affected : jd_category : %s' % affected_rows)

    return 0
def loadCategoryList():
    """Download JD's category list and store it in the ``jd_category`` table.

    The JSONP payload at ``JD_CATEGORY_WEBSERVICE_URL`` is unwrapped, parsed
    and flattened by ``__extractCategoryList_fromJson__`` into ``|``-separated
    lines.  Field 0 of a line is treated as the category id and field 1 as its
    name.  Lines with fewer than four fields are reported and skipped; lines
    whose first field is empty or contains ``.com`` (a URL rather than an id)
    are not stored.

    Progress is printed to stdout.

    :return: always ``0``.
    """
    payload = url_utils.getWebResponse(JD_CATEGORY_WEBSERVICE_URL, JD_ENC)
    parsed = json.loads(url_utils.removeJsonP(payload))
    category_lines = __extractCategoryList_fromJson__(parsed)

    rows = []
    for line in category_lines:
        print(line)
        fields = line.split('|')
        if len(fields) < 4:
            print('error in length of category line')
            print(line)
            continue
        first_field = fields[0]
        # Only a non-empty first field that is not a URL counts as an id.
        if '.com' in first_field or len(first_field) == 0:
            continue
        rows.append((first_field, fields[1], timeHelper.getNow()))

    # persist categories
    sql = 'replace into jd_category values(%s,%s,%s)'
    affected_rows = dbhelper.executeSqlWriteMany(sql, rows)

    print('rows affected : jd_category : %s' % affected_rows)

    return 0
Ejemplo n.º 14
0
def crawl_category(category_id):
    """Crawl every listing page of a JD category and persist its products.

    Steps:

    1. Fetch page 1 as UTF-8, retrying as GB18030 and then GBK while the
       response is empty, and resolve the page count and first products.
    2. If page 1 yields no products, start over with the id returned by
       ``__up_roll_category_id__`` (presumably the parent category -- confirm
       against that helper).
    3. Fetch the remaining pages, sleeping ``SLEEP_TIME`` after each.
    4. Append three zero placeholders to every product row (the price lookup
       that used to fill them is disabled) and write the rows to
       ``jd_item_dynamic``, ``jd_item_category`` and ``jd_item_firstseen``.

    :param category_id: category identifier, or ``None``.
    :return: ``{'status': -1, 'msg': ...}`` when there is no category left to
        crawl.  Otherwise a dict with ``'status'`` (0 when the dynamic-table
        write reports status 0 and the category write affected rows, else
        -1), ``'item_dynamic'``, ``'item_category'`` and
        ``'item_first_seen'``.
    """
    # A missing category can never yield products, and this is also where the
    # roll-up recursion below ends, so answer before spending a request on it.
    if category_id is None:
        return {'status': -1, 'msg': 'No item in category product list'}

    logging.debug('category_id = %s -- page 1' % (category_id))
    url = __get_category_page_url__(category_id, 1)
    html = url_utils.getWebResponse(url, 'utf-8')
    for fallback_encoding in ('gb18030', 'gbk'):
        if html != "":
            break
        html = url_utils.getWebResponse(url, fallback_encoding)
    total_pages = jd_list_resolver.resolveTotalPageNum(html)
    product_list = jd_list_resolver.resolveProductListFromPage(html)

    # Nothing listed here: retry one level up.
    if len(product_list) == 0:
        return crawl_category(__up_roll_category_id__(category_id))

    # NOTE(review): unlike page 1, later pages get no encoding fallback.
    for page_iter in range(2, total_pages + 1):
        logging.debug('category_id = %s -- page %s' % (category_id, page_iter))
        url = __get_category_page_url__(category_id, page_iter)
        html = url_utils.getWebResponse(url, 'utf-8')
        product_list = product_list + jd_list_resolver.resolveProductListFromPage(html)
        time.sleep(SLEEP_TIME)

    total_goods_num = len(product_list)

    # Price columns are zero-filled while the price API call stays disabled.
    for idx, row in enumerate(product_list):
        product_list[idx] = row + (0, 0, 0)

    # persist in database
    ret = crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_dynamic',
        num_cols=len(product_list[0]),
        value_list=product_list,
        is_many=True,
        need_history=False,  # was True - changed 01/03
        need_flow=False,  # was True - changed 12/23
    )

    logging.debug('Saved to DB -- category_id = %s -- sku_count=%s' % (category_id, total_goods_num))
    logging.debug('%s' % ret)

    # HANDLE JD_ITEM_CATEGORY
    item_cat_list = [(prod[0], category_id) for prod in product_list]
    sql2 = 'replace into jd_item_category values (%s,%s)'
    affected_rows2 = dbhelper.executeSqlWriteMany(sql2, item_cat_list)
    logging.debug('Saved to DB - item_category - affected rows = %s' % affected_rows2)
    if affected_rows2 <= 0:
        logging.error('Saving to item_category error, category_id = %s' % category_id)

    # HANDLE JD_ITEM_FIRSTSEEN
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    # NOTE(review): the last two placeholders are wrapped in double quotes;
    # whether that is correct depends on how dbhelper.executeSqlWriteMany
    # substitutes parameters -- confirm.
    sql3 = 'insert ignore into jd_item_firstseen values(%s,"%s","%s")'
    ftlist = [[prod[0], nowtime, nowdate] for prod in product_list]
    affected_rows3 = dbhelper.executeSqlWriteMany(sql3, ftlist)

    return {
        'status': 0 if ret['status'] == 0 and affected_rows2 > 0 else -1,
        'item_dynamic': ret,
        'item_category': affected_rows2,
        'item_first_seen': affected_rows3,
    }
Ejemplo n.º 15
0
def crawl_category(category_id):
    """Crawl every listing page of a JD category and persist its products.

    Steps:

    1. Fetch page 1 as UTF-8, retrying as GB18030 and then GBK while the
       response is empty, and resolve the page count and first products.
    2. If page 1 yields no products, start over with the id returned by
       ``__up_roll_category_id__`` (presumably the parent category -- confirm
       against that helper).
    3. Fetch the remaining pages, sleeping ``SLEEP_TIME`` after each.
    4. Append three zero placeholders to every product row (the price lookup
       that used to fill them is disabled) and write the rows to
       ``jd_item_dynamic``, ``jd_item_category`` and ``jd_item_firstseen``.

    :param category_id: category identifier, or ``None``.
    :return: ``{'status': -1, 'msg': ...}`` when there is no category left to
        crawl.  Otherwise a dict with ``'status'`` (0 when the dynamic-table
        write reports status 0 and the category write affected rows, else
        -1), ``'item_dynamic'``, ``'item_category'`` and
        ``'item_first_seen'``.
    """
    # A missing category can never yield products, and this is also where the
    # roll-up recursion below ends, so answer before spending a request on it.
    if category_id is None:
        return {'status': -1, 'msg': 'No item in category product list'}

    logging.debug('category_id = %s -- page 1' % (category_id))
    url = __get_category_page_url__(category_id, 1)
    html = url_utils.getWebResponse(url, 'utf-8')
    for fallback_encoding in ('gb18030', 'gbk'):
        if html != "":
            break
        html = url_utils.getWebResponse(url, fallback_encoding)
    total_pages = jd_list_resolver.resolveTotalPageNum(html)
    product_list = jd_list_resolver.resolveProductListFromPage(html)

    # Nothing listed here: retry one level up.
    if len(product_list) == 0:
        return crawl_category(__up_roll_category_id__(category_id))

    # NOTE(review): unlike page 1, later pages get no encoding fallback.
    for page_iter in range(2, total_pages + 1):
        logging.debug('category_id = %s -- page %s' % (category_id, page_iter))
        url = __get_category_page_url__(category_id, page_iter)
        html = url_utils.getWebResponse(url, 'utf-8')
        product_list = product_list + jd_list_resolver.resolveProductListFromPage(html)
        time.sleep(SLEEP_TIME)

    total_goods_num = len(product_list)

    # Price columns are zero-filled while the price API call stays disabled.
    for idx, row in enumerate(product_list):
        product_list[idx] = row + (0, 0, 0)

    # persist in database
    ret = crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_dynamic',
        num_cols=len(product_list[0]),
        value_list=product_list,
        is_many=True,
        need_history=False,  # was True - changed 01/03
        need_flow=False,  # was True - changed 12/23
    )

    logging.debug('Saved to DB -- category_id = %s -- sku_count=%s' % (category_id, total_goods_num))
    logging.debug('%s' % ret)

    # HANDLE JD_ITEM_CATEGORY
    item_cat_list = [(prod[0], category_id) for prod in product_list]
    sql2 = 'replace into jd_item_category values (%s,%s)'
    affected_rows2 = dbhelper.executeSqlWriteMany(sql2, item_cat_list)
    logging.debug('Saved to DB - item_category - affected rows = %s' % affected_rows2)
    if affected_rows2 <= 0:
        logging.error('Saving to item_category error, category_id = %s' % category_id)

    # HANDLE JD_ITEM_FIRSTSEEN
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    # NOTE(review): the last two placeholders are wrapped in double quotes;
    # whether that is correct depends on how dbhelper.executeSqlWriteMany
    # substitutes parameters -- confirm.
    sql3 = 'insert ignore into jd_item_firstseen values(%s,"%s","%s")'
    ftlist = [[prod[0], nowtime, nowdate] for prod in product_list]
    affected_rows3 = dbhelper.executeSqlWriteMany(sql3, ftlist)

    return {
        'status': 0 if ret['status'] == 0 and affected_rows2 > 0 else -1,
        'item_dynamic': ret,
        'item_category': affected_rows2,
        'item_first_seen': affected_rows3,
    }