def persist_db_history_and_latest(table_name,
                                  num_cols,
                                  value_list,
                                  is_many=True,
                                  need_history=False,
                                  need_flow=False):
    """Write value_list into <table_name>_latest and, optionally, into the
    history table (<table_name>) and flow table (<table_name>_flow), each
    via REPLACE INTO through dbhelper.executeSqlWriteMany.

    Args:
        table_name: base table name; derived table names are <table_name>,
            <table_name>_flow and <table_name>_latest.
        num_cols: number of columns per row, used to build the placeholder
            list "%s,%s,...,%s".
        value_list: sequence of row tuples handed to executeSqlWriteMany.
        is_many: unused; kept only for backward compatibility with callers.
        need_history: when True, also REPLACE into the history table.
        need_flow: when True, also REPLACE into the flow table.

    Returns:
        dict with 'status' (0 when every executed write affected >0 rows,
        -1 otherwise) plus the affected-row count of each write.
    """
    tbl_latest = '%s_latest' % table_name
    # Build "%s,%s,...,%s" with num_cols placeholders in one step.
    values_str = ','.join(['%s'] * num_cols)

    t1 = time.time()
    tcur = t1
    # Positive sentinels so a skipped history/flow write does not drag the
    # status check below down to -1.
    affected_rows = 99999
    affected_rows3 = 99999
    if need_history:
        sql = 'replace into %s values(%s)' % (table_name, values_str)
        affected_rows = dbhelper.executeSqlWriteMany(sql,
                                                     value_list,
                                                     is_dirty=True)
        t2 = time.time()
        tcur = t2
        logging.debug('persist_db_history_and_latest, history using time: %s' %
                      (t2 - t1))
    if need_flow:
        sql = 'replace into %s values(%s)' % (table_name + '_flow', values_str)
        affected_rows3 = dbhelper.executeSqlWriteMany(sql,
                                                      value_list,
                                                      is_dirty=True)
        t21 = time.time()
        logging.debug('persist_db_history_and_latest, flow using time: %s' %
                      (t21 - tcur))
        tcur = t21
    # The *_latest table is always written.
    sql2 = 'replace into %s values(%s)' % (tbl_latest, values_str)
    affected_rows2 = dbhelper.executeSqlWriteMany(sql2,
                                                  value_list,
                                                  is_dirty=True)
    t3 = time.time()
    logging.debug('persist_db_history_and_latest, latest using time: %s' %
                  (t3 - tcur))
    status = -1
    if affected_rows > 0 and affected_rows2 > 0 and affected_rows3 > 0:
        status = 0
    ret = {
        'status': status,
        'affected_rows_latest': affected_rows2,
        'affected_rows_history': affected_rows,
        'affected_rows_flow': affected_rows3,
    }
    return ret
# Example #2
 def __record_task_complete__(self, task_list):
     """Record every task id in task_list as complete for this job by
     inserting (job_name, task_id, update_time) rows into task_status."""
     now = timeHelper.getNowLong()
     # One row per completed task, all stamped with the same time.
     rows = [(self.job_name, task_id, now) for task_id in task_list]
     sql = 'insert into task_status(job_name,task_id,update_time) values(%s,%s,%s)'
     return dbhelper.executeSqlWriteMany(sql, rows)
# Example #3
 def __record_task_complete__(self, task_list):
     """Insert a completion record into task_status for each task id in
     task_list, returning the affected-row count of the bulk insert."""
     stamp = timeHelper.getNowLong()
     value_rows = []
     for task_id in task_list:
         # (job_name, task_id, update_time) per completed task.
         value_rows.append((self.job_name, task_id, stamp))
     sql = 'insert into task_status(job_name,task_id,update_time) values(%s,%s,%s)'
     return dbhelper.executeSqlWriteMany(sql, value_rows)
def persist_db_history_and_latest(table_name, num_cols, value_list, is_many=True, need_history=False, need_flow=False):
    """REPLACE value_list into the <table_name>_latest table and, when the
    corresponding flags are set, into <table_name> (history) and
    <table_name>_flow. Returns a dict with 'status' and per-table
    affected-row counts. `is_many` is unused (kept for callers)."""
    placeholders = ','.join(['%s'] * num_cols)
    latest_table = '%s_latest' % table_name

    start = time.time()
    last_tick = start
    # Positive defaults so skipped optional writes still pass the check below.
    history_rows = 99999
    flow_rows = 99999

    if need_history:
        history_sql = 'replace into %s values(%s)' % (table_name, placeholders)
        history_rows = dbhelper.executeSqlWriteMany(history_sql, value_list, is_dirty=True)
        tick = time.time()
        logging.debug('persist_db_history_and_latest, history using time: %s' % (tick - start))
        last_tick = tick

    if need_flow:
        flow_sql = 'replace into %s values(%s)' % (table_name + '_flow', placeholders)
        flow_rows = dbhelper.executeSqlWriteMany(flow_sql, value_list, is_dirty=True)
        tick = time.time()
        logging.debug('persist_db_history_and_latest, flow using time: %s' % (tick - last_tick))
        last_tick = tick

    latest_sql = 'replace into %s values(%s)' % (latest_table, placeholders)
    latest_rows = dbhelper.executeSqlWriteMany(latest_sql, value_list, is_dirty=True)
    logging.debug('persist_db_history_and_latest, latest using time: %s' % (time.time() - last_tick))

    ok = history_rows > 0 and latest_rows > 0 and flow_rows > 0
    return {
        'status': 0 if ok else -1,
        'affected_rows_latest': latest_rows,
        'affected_rows_history': history_rows,
        'affected_rows_flow': flow_rows,
    }
def crawl_detail_images(sku_id):
    """Fetch the detail page for sku_id, extract its image URLs, and
    REPLACE them into jd_item_images_latest.

    Returns a dict with 'status' (0 on success, -1 on no images or a
    failed write) and, on a write attempt, 'affected_rows2'.
    """
    html = __get_detail_page_content__(sku_id)
    img_list = jd_detail_resolver.resolve_Images(html)
    if not img_list:
        return {'status': -1}
    update_time = timeHelper.getNow()
    # One (sku_id, update_time, image_url) row per image.
    rows = [(sku_id, update_time, img) for img in img_list]
    sql2 = 'replace into jd_item_images_latest values(%s,%s,%s)'
    affected_rows2 = dbhelper.executeSqlWriteMany(sql2, rows)
    return {
        'status': 0 if affected_rows2 > 0 else -1,
        'affected_rows2': affected_rows2,
    }
def crawl_detail_property(sku_id):
    """Fetch the detail page for sku_id, extract its property key/value
    pairs, and REPLACE them into jd_item_property_latest.

    Returns a dict with 'status' (0 on success — note an empty property
    map also counts as success) and, on a write attempt, 'affected_rows2'.
    """
    html = __get_detail_page_content__(sku_id)
    prop_map = jd_detail_resolver.resolve_Properties(html)
    update_time = timeHelper.getNow()
    if not prop_map:
        # No properties is not treated as an error here.
        return {'status': 0}
    # One (sku_id, update_time, key, value) row per property.
    rows = [(sku_id, update_time, key, prop_map[key]) for key in prop_map]
    sql2 = 'replace into jd_item_property_latest values(%s,%s,%s,%s)'
    affected_rows2 = dbhelper.executeSqlWriteMany(sql2, rows)
    result = {'status': -1, 'affected_rows2': affected_rows2}
    if affected_rows2 > 0:
        result['status'] = 0
    return result
# Example #7
def loadCategoryList():
    html = url_utils.getWebResponse(JD_CATEGORY_WEBSERVICE_URL, JD_ENC)
    json_str = url_utils.removeJsonP(html)
    obj = json.loads(json_str)
    clist = __extractCategoryList_fromJson__(obj)

    cat_list = []

    for item in clist:
        print item
        vals = item.split('|')
        if len(vals) < 4:
            print 'error in length of category line'
            print item
            continue
        cat_name = vals[1]
        vals0 = vals[0]
        cat_id = cat_url = cat_memo = ""
        if '.com' in vals0:
            cat_url = vals0
        else:
            cat_id = vals0
        if len(vals[2]) > 0:
            cat_memo = vals[2]

        if len(cat_id) > 0:
            tp = (cat_id, cat_name, timeHelper.getNow())
            cat_list.append(tp)

    # persist categories
    sql = 'replace into jd_category values(%s,%s,%s)'
    affected_rows = dbhelper.executeSqlWriteMany(sql, cat_list)

    print 'rows affected : jd_category : %s' % affected_rows

    return 0
def loadCategoryList():
    html = url_utils.getWebResponse(JD_CATEGORY_WEBSERVICE_URL,JD_ENC)
    json_str = url_utils.removeJsonP(html)
    obj = json.loads(json_str)
    clist = __extractCategoryList_fromJson__(obj)

    cat_list = []

    for item in clist:
        print item
        vals = item.split('|')
        if len(vals)<4:
            print 'error in length of category line'
            print item
            continue
        cat_name = vals[1]
        vals0 = vals[0]
        cat_id = cat_url = cat_memo = ""
        if '.com' in vals0:
            cat_url = vals0
        else:
            cat_id = vals0
        if len(vals[2]) > 0:
            cat_memo = vals[2]

        if len(cat_id) > 0:
            tp = (cat_id,cat_name, timeHelper.getNow())
            cat_list.append(tp)

    # persist categories
    sql = 'replace into jd_category values(%s,%s,%s)'
    affected_rows = dbhelper.executeSqlWriteMany(sql,cat_list)

    print 'rows affected : jd_category : %s' %affected_rows

    return 0
def crawl_category(category_id):
    """Crawl every listing page of a JD category, persist the products to
    jd_item_dynamic_latest, and record item->category and first-seen rows.

    Returns a dict with overall 'status' (0 only if the item write and the
    item_category write both succeeded) plus the per-table write results.
    """

    logging.debug('category_id = %s -- page 1' %(category_id))
    url = __get_category_page_url__(category_id,1)
    # print url
    # Fall back through alternative encodings when the fetch comes back empty.
    html = url_utils.getWebResponse(url,'utf-8')
    if html == "":
        html = url_utils.getWebResponse(url,'gb18030')
    if html == "":
        html = url_utils.getWebResponse(url, 'gbk')
    total_pages = jd_list_resolver.resolveTotalPageNum(html)

    product_list = jd_list_resolver.resolveProductListFromPage(html)

    # NOTE(review): this `while` always returns on its first iteration, so it
    # behaves as an `if`: on an empty listing, retry once with the rolled-up
    # (presumably parent) category via recursion.
    while len(product_list) == 0 and category_id is not None:
        category_id = __up_roll_category_id__(category_id)
        return crawl_category(category_id)

    if category_id is None or len(product_list)==0:
        return {'status':-1, 'msg': 'No item in category product list'}

    # Pages 2..total_pages, throttled between fetches.
    for page_iter in range(2,total_pages+1):
        logging.debug('category_id = %s -- page %s' %(category_id,page_iter))
        url = __get_category_page_url__(category_id,page_iter)
        html = url_utils.getWebResponse(url,'utf-8')
        product_list = product_list + jd_list_resolver.resolveProductListFromPage(html)
        time.sleep(SLEEP_TIME)

    # product_tp[0] is the sku id throughout this function.
    sku_list = []
    for product_tp in product_list:
        sku_id = product_tp[0]
        sku_list.append(sku_id)

    # Get price of all products
    #price_obj = jd_API.getPrices_JD(sku_list,sleep_time=SLEEP_PRICE_API)

    # NOTE(review): this initial ret_obj is dead — it is unconditionally
    # replaced near the end of the function.
    ret_obj = {
        'status': -1,
        'affected_rows': -1,
        'sku_count': -1
    }
    total_goods_num = len(product_list)

    # for item in product_list:
    #     print item[0]
    # print '='*80

    # combine product list and price list, timestamp, category_id
    for i in xrange(total_goods_num):
        product_id = product_list[i][0]
        # pkey was the lookup key for the (now disabled) price API branch.
        pkey = '%s' %product_id
        # if pkey in price_obj:
        #     product_list[i] = product_list[i] + (price_obj[pkey][0],price_obj[pkey][1],price_obj[pkey][2],) #nowdate,nowtime,)
        # else:
        #     logging.error('Error: product_id=%s cannot get result' %(product_id,price_id))
        #     continue
        # Pad with zeros in place of the disabled price fields so the row
        # width still matches the jd_item_dynamic column count.
        product_list[i] = product_list[i] + (0,0,0,)

    # persist in database
    # (sku_id,sku_title,sku_url,sku_thumnail_url,sku_stock,comment_count,is_global,is_pay_on_delivery,is_free_gift,sku_icon_url, price, price_m, update_date,update_time, category_id)
    # sql = '''
    #   replace into jd_item_dynamic (sku_id,title,url,thumbnail_url,stock_status,comment_count,is_global,is_pay_on_delivery,
    #   has_free_gift,icon_url,price,price_m,price_pcp,update_date,update_time) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    #   '''
    # affected_rows = dbhelper.executeSqlWriteMany(sql,product_list)

    # Only the *_latest table is written; history/flow were turned off.
    ret = crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_dynamic',
        num_cols=len(product_list[0]),
        value_list=product_list,
        is_many=True,
        need_history=False, # was True - changed 01/03
        need_flow=False,    # was True - changed 12/23
    )

    logging.debug('Saved to DB -- category_id = %s -- sku_count=%s' %(category_id,total_goods_num))
    logging.debug('%s' %ret)

    # HANDLE JD_ITEM_CATEGORY
    item_cat_list = []
    for prod in product_list:
        item_cat_list.append((prod[0],category_id,))
    sql2 = 'replace into jd_item_category values (%s,%s)'
    affected_rows2 = dbhelper.executeSqlWriteMany(sql2,item_cat_list)
    logging.debug('Saved to DB - item_category - affected rows = %s' %affected_rows2)
    if affected_rows2<=0:
        logging.error('Saving to item_category error, category_id = %s' %category_id)


    # HANDLE JD_ITEM_FIRSTSEEN
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    # NOTE(review): the "%s" placeholders here are wrapped in literal double
    # quotes — confirm dbhelper does plain string substitution here, since a
    # DB-API driver would quote the parameters itself.
    sql3 = 'insert ignore into jd_item_firstseen values(%s,"%s","%s")'
    ftlist = []
    for item in product_list:
        ftlist.append([item[0],nowtime,nowdate])
    affected_rows3 = dbhelper.executeSqlWriteMany(sql3,ftlist)

    # Overall success requires the item write and the category write; the
    # first-seen count is reported but not part of the status.
    ret_obj = {
        'status': 0 if ret['status']==0 and affected_rows2>0 else -1,
        'item_dynamic': ret,
        'item_category': affected_rows2,
        'item_first_seen': affected_rows3,
    }

    return ret_obj
def crawl_category(category_id):
    """Crawl all listing pages of a JD category and persist the results.

    Writes the product rows via crawler_helper.persist_db_history_and_latest
    (latest table only — history/flow disabled), then the item->category
    mapping and first-seen timestamps. Returns a dict whose 'status' is 0
    only when both the item write and the category write succeeded.
    """

    logging.debug('category_id = %s -- page 1' % (category_id))
    url = __get_category_page_url__(category_id, 1)
    # print url
    # Retry the page fetch with other encodings if the first attempt is empty.
    html = url_utils.getWebResponse(url, 'utf-8')
    if html == "":
        html = url_utils.getWebResponse(url, 'gb18030')
    if html == "":
        html = url_utils.getWebResponse(url, 'gbk')
    total_pages = jd_list_resolver.resolveTotalPageNum(html)

    product_list = jd_list_resolver.resolveProductListFromPage(html)

    # NOTE(review): the loop returns on its first pass, so this is really an
    # `if`: an empty first page triggers one recursive retry with the
    # rolled-up category id.
    while len(product_list) == 0 and category_id is not None:
        category_id = __up_roll_category_id__(category_id)
        return crawl_category(category_id)

    if category_id is None or len(product_list) == 0:
        return {'status': -1, 'msg': 'No item in category product list'}

    # Fetch the remaining pages, sleeping between requests to throttle.
    for page_iter in range(2, total_pages + 1):
        logging.debug('category_id = %s -- page %s' % (category_id, page_iter))
        url = __get_category_page_url__(category_id, page_iter)
        html = url_utils.getWebResponse(url, 'utf-8')
        product_list = product_list + jd_list_resolver.resolveProductListFromPage(
            html)
        time.sleep(SLEEP_TIME)

    # Element 0 of each product tuple is its sku id.
    sku_list = []
    for product_tp in product_list:
        sku_id = product_tp[0]
        sku_list.append(sku_id)

    # Get price of all products
    #price_obj = jd_API.getPrices_JD(sku_list,sleep_time=SLEEP_PRICE_API)

    # NOTE(review): dead assignment — ret_obj is rebuilt before returning.
    ret_obj = {'status': -1, 'affected_rows': -1, 'sku_count': -1}
    total_goods_num = len(product_list)

    # for item in product_list:
    #     print item[0]
    # print '='*80

    # combine product list and price list, timestamp, category_id
    for i in xrange(total_goods_num):
        product_id = product_list[i][0]
        # pkey fed the disabled price-API lookup below.
        pkey = '%s' % product_id
        # if pkey in price_obj:
        #     product_list[i] = product_list[i] + (price_obj[pkey][0],price_obj[pkey][1],price_obj[pkey][2],) #nowdate,nowtime,)
        # else:
        #     logging.error('Error: product_id=%s cannot get result' %(product_id,price_id))
        #     continue
        # Zero-pad the three disabled price fields to keep the row width
        # consistent with the target table schema.
        product_list[i] = product_list[i] + (
            0,
            0,
            0,
        )

    # persist in database
    # (sku_id,sku_title,sku_url,sku_thumnail_url,sku_stock,comment_count,is_global,is_pay_on_delivery,is_free_gift,sku_icon_url, price, price_m, update_date,update_time, category_id)
    # sql = '''
    #   replace into jd_item_dynamic (sku_id,title,url,thumbnail_url,stock_status,comment_count,is_global,is_pay_on_delivery,
    #   has_free_gift,icon_url,price,price_m,price_pcp,update_date,update_time) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    #   '''
    # affected_rows = dbhelper.executeSqlWriteMany(sql,product_list)

    ret = crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_dynamic',
        num_cols=len(product_list[0]),
        value_list=product_list,
        is_many=True,
        need_history=False,  # was True - changed 01/03
        need_flow=False,  # was True - changed 12/23
    )

    logging.debug('Saved to DB -- category_id = %s -- sku_count=%s' %
                  (category_id, total_goods_num))
    logging.debug('%s' % ret)

    # HANDLE JD_ITEM_CATEGORY
    item_cat_list = []
    for prod in product_list:
        item_cat_list.append((
            prod[0],
            category_id,
        ))
    sql2 = 'replace into jd_item_category values (%s,%s)'
    affected_rows2 = dbhelper.executeSqlWriteMany(sql2, item_cat_list)
    logging.debug('Saved to DB - item_category - affected rows = %s' %
                  affected_rows2)
    if affected_rows2 <= 0:
        logging.error('Saving to item_category error, category_id = %s' %
                      category_id)

    # HANDLE JD_ITEM_FIRSTSEEN
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    # NOTE(review): placeholders wrapped in literal quotes — verify dbhelper
    # substitutes raw strings here, otherwise values get double-quoted.
    sql3 = 'insert ignore into jd_item_firstseen values(%s,"%s","%s")'
    ftlist = []
    for item in product_list:
        ftlist.append([item[0], nowtime, nowdate])
    affected_rows3 = dbhelper.executeSqlWriteMany(sql3, ftlist)

    # First-seen affected rows are reported but do not gate the status.
    ret_obj = {
        'status': 0 if ret['status'] == 0 and affected_rows2 > 0 else -1,
        'item_dynamic': ret,
        'item_category': affected_rows2,
        'item_first_seen': affected_rows3,
    }

    return ret_obj