def crawl_one_shop(shop, failed):
    """Crawl one shop's listing and, when requested, store the result.

    Args:
        shop: mapping with an 'is_commit' flag and a 'shop' sequence.
            Indexes 0, 1, 4 and 5 of that sequence are read as the shop
            id, url, type and nick.
        failed: list that receives a ``{'shopid': ..., 'err': ...}`` dict
            when crawling or storing fails with an unexpected error.

    Errors raised while crawling or storing are recorded in ``failed``
    (and counted/logged) instead of propagating. When the crawler raises
    ShopOfflineException the shop is re-checked through the Taobao API
    and, if that answers with error 560 and committing is enabled, the
    shop row is updated to status=2.
    """
    # Bound up front so the exception handlers can always reference them,
    # even when the failure happens while unpacking `shop` itself.
    is_commit = False
    shop_id = None
    shop_nick = None
    try:
        is_commit = shop['is_commit']
        shop_row = shop['shop']
        shop_id = shop_row[0]
        shop_url = shop_row[1]
        shop_type = shop_row[4]
        shop_nick = shop_row[5]

        tb = TaobaoListHtml(shop_id, shop_url)
        tb.crawl()
        logger.debug("crawl result %s count %s total %s", tb.id, tb.count, len(tb.total_items))

        if is_commit:
            batch_size = 100
            total_items = tb.total_items

            db = get_rawdb_conn()
            try:
                update_shop_items(batch_size, db, shop_id, total_items)
                update_taobao_volume(db, shop_id, shop_type, total_items)
            finally:
                # Release the connection even when one of the updates fails.
                db.close()

            Statsd.increment('guang.crawl.shop_list_succ')
    except ShopOfflineException:
        # Double check the shop status through the Taobao API.
        shopinfo = get_taobao_shops(get_rand_top(), shop_nick)
        # NOTE(review): 560 presumably means "shop not found/offline" and
        # status=2 the matching local state - confirm against the API docs.
        if shopinfo.get('error', 0) == 560 and is_commit:
            db = get_rawdb_conn()
            try:
                do_query(db, "update shop set status=2 where id=%s" % shop_id)
                db.commit()
            finally:
                db.close()
    except:
        # Deliberately broad: this is the worker boundary, every failure is
        # reported through `failed` rather than raised to the caller.
        err = traceback.format_exc()
        Statsd.increment('guang.crawl.shop_list_failed')
        logger.error("crawl shop failed %s %s", shop_id, err, extra={'tags': ['crawlShopException']})
        failed.append({'shopid': shop_id, 'err': err})
Example #2
0
def main():
    """Derive item.crawl_status from the rows of crawl_html.

    Reads every crawl_html row with id > 3000 (ordered by id) and updates
    the item row whose id equals the row's item_id:

    * result == 1 and is_image_crawled == 1 -> crawl_status = 2
    * result == 1 and is_image_crawled == 0 -> crawl_status = 1
    * result == 0                           -> crawl_status = 0

    Rows matching none of these cases are left untouched. Updates are
    committed every 1000 rows and once more at the end; the connection is
    closed even when a query fails.
    """
    db = get_rawdb_conn()
    try:
        logger.debug("querying")
        db.query(
            "select item_id, result, is_image_crawled, id from crawl_html where id>3000 order by id"
        )
        results = db.store_result()

        db.autocommit(False)
        db.query("set autocommit=0;")

        # maxrows=0 fetches the whole buffered result set at once, so the
        # real total is known and can be reported in the progress log.
        rows = results.fetch_row(maxrows=0)
        total = len(rows)

        for i, row in enumerate(rows, 1):
            item_id = row[0]
            result = row[1]
            is_image_crawled = row[2]

            if result == 1 and is_image_crawled == 1:
                crawl_status = 2
            elif result == 1 and is_image_crawled == 0:
                crawl_status = 1
            elif result == 0:
                crawl_status = 0
            else:
                crawl_status = None  # unknown combination: leave the item as is

            if crawl_status is not None:
                try_query(db,
                          "update item set crawl_status=%s where id=%s" % (crawl_status, item_id))

            if i % 1000 == 0:
                logger.debug("processing %s %s %s/%s", row[3], item_id, i, total)
                db.commit()
        db.commit()
    finally:
        db.close()
def main():
    """Recompute item.crawl_status from the crawl_html table.

    Every crawl_html row with id > 3000 is read in id order and mapped to
    a crawl_status for the item identified by the row's item_id:

    * result == 1, is_image_crawled == 1 -> 2
    * result == 1, is_image_crawled == 0 -> 1
    * result == 0                        -> 0

    Any other combination leaves the item unchanged. Work is committed in
    batches of 1000 rows plus a final commit, and the connection is always
    closed, including when a query raises.
    """
    db = get_rawdb_conn()
    try:
        logger.debug("querying")
        db.query("select item_id, result, is_image_crawled, id from crawl_html where id>3000 order by id")
        results = db.store_result()

        db.autocommit(False)
        db.query("set autocommit=0;")

        # The full result set is fetched in one call (maxrows=0); its length
        # replaces the previously hard-coded total in the progress message.
        all_rows = results.fetch_row(maxrows=0)
        row_count = len(all_rows)

        for i, row in enumerate(all_rows, 1):
            item_id = row[0]
            result = row[1]
            is_image_crawled = row[2]

            new_status = None
            if result == 0:
                new_status = 0
            elif result == 1:
                if is_image_crawled == 1:
                    new_status = 2
                elif is_image_crawled == 0:
                    new_status = 1

            if new_status is not None:
                try_query(db, "update item set crawl_status=%s where id=%s" % (new_status, item_id))

            if i % 1000 == 0:
                logger.debug("processing %s %s %s/%s", row[3], item_id, i, row_count)
                db.commit()
        db.commit()
    finally:
        db.close()
Example #4
0
def crawl_one_shop(shop, failed):
    """Crawl one shop's listing and, when requested, store the result.

    Args:
        shop: mapping with an 'is_commit' flag and a 'shop' sequence.
            Indexes 0, 1, 4 and 5 of that sequence are read as the shop
            id, url, type and nick (the nick is encoded to UTF-8).
        failed: list that receives a ``{'shopid': ..., 'err': ...}`` dict
            when crawling or storing fails with an unexpected error.

    Errors raised while crawling or storing are recorded in ``failed``
    (and counted/logged) instead of propagating. When the crawler raises
    ShopOfflineException the shop is looked up through the Taobao API and
    the outcome is only logged.
    """
    # Bound up front so the exception handlers can always reference them,
    # even when the failure happens while unpacking `shop` itself.
    is_commit = False
    shop_id = None
    shop_url = None
    shop_nick = None
    try:
        is_commit = shop['is_commit']
        shop_row = shop['shop']
        shop_id = shop_row[0]
        shop_url = shop_row[1]
        shop_type = shop_row[4]
        shop_nick = shop_row[5].encode('utf-8')

        tb = TaobaoListHtml(shop_id, shop_url)
        tb.crawl()
        logger.debug("crawl result %s count %s total %s", tb.id, tb.count,
                     len(tb.total_items))

        if is_commit:
            batch_size = 100
            total_items = tb.total_items

            db = get_rawdb_conn()
            try:
                update_shop_items(batch_size, db, shop_id, total_items)
                update_taobao_volume(db, shop_id, shop_type, total_items)
            finally:
                # Release the connection even when one of the updates fails.
                db.close()

            Statsd.increment('guang.crawl.shop_list_succ')
    except ShopOfflineException:
        # Double check the shop status through the Taobao API.
        shopinfo = get_taobao_shops(get_rand_top(), [shop_nick])
        if not shopinfo and is_commit:
            # The "update shop set status=2" write that used to run here is
            # disabled; the condition is only reported.
            logger.warning("Shop %s: %s not is taobaoke", shop_id, shop_url)
        else:
            logger.error("Shop %s: %s url is error!", shop_id, shop_url)
    except:
        # Deliberately broad: this is the worker boundary, every failure is
        # reported through `failed` rather than raised to the caller.
        err = traceback.format_exc()
        Statsd.increment('guang.crawl.shop_list_failed')
        logger.error("crawl shop failed %s %s",
                     shop_id,
                     err,
                     extra={'tags': [
                         'crawlShopException',
                     ]})
        failed.append({'shopid': shop_id, 'err': err})