Esempio n. 1
0
def crawl_one_shop(shop, failed):
    """Crawl one shop's item list and, when committing, store the results.

    Args:
        shop: mapping with an ``'is_commit'`` flag and a ``'shop'`` sequence;
            indexes 0, 1, 4 and 5 of that sequence are read as the shop id,
            url, type and nick.
        failed: list that receives ``{'shopid': ..., 'err': ...}`` whenever
            the crawl ends in an unexpected exception.

    When ``is_commit`` is true the crawled items are passed to
    ``update_shop_items`` and ``update_taobao_volume`` and the
    ``guang.crawl.shop_list_succ`` counter is incremented.

    A ``ShopOfflineException`` triggers a second look-up through
    ``get_taobao_shops``; if that reports error 560 and ``is_commit`` is
    true, the shop row is updated to ``status=2``.
    """
    # Bound before the try so the catch-all handler can always report an id,
    # even when reading ``shop`` itself is what failed.
    shop_id = None
    try:
        is_commit = shop['is_commit']
        shop_row = shop['shop']
        shop_id = shop_row[0]
        shop_url = shop_row[1]
        shop_type = shop_row[4]
        shop_nick = shop_row[5]

        tb = TaobaoListHtml(shop_id, shop_url)
        tb.crawl()
        logger.debug("crawl result %s count %s total %s", tb.id, tb.count, len(tb.total_items))

        if is_commit:
            batch_size = 100
            total_items = tb.total_items

            db = get_rawdb_conn()
            try:
                update_shop_items(batch_size, db, shop_id, total_items)
                update_taobao_volume(db, shop_id, shop_type, total_items)
            finally:
                # Release the connection even when one of the updates raises.
                db.close()

            Statsd.increment('guang.crawl.shop_list_succ')
    except ShopOfflineException:
        # Double check the shop status through the taobao api.
        shopinfo = get_taobao_shops(get_rand_top(), shop_nick)
        if shopinfo.get('error', 0) == 560 and is_commit:
            db = get_rawdb_conn()
            try:
                # NOTE(review): the id is interpolated into the SQL text;
                # switch to a bound parameter if do_query supports one.
                do_query(db, "update shop set status=2 where id=%s" % shop_id)
                db.commit()
            finally:
                db.close()
    except Exception:
        # ``Exception`` rather than a bare except, so KeyboardInterrupt and
        # SystemExit still stop the crawler.
        err = traceback.format_exc()
        Statsd.increment('guang.crawl.shop_list_failed')
        logger.error("crawl shop failed %s %s", shop_id, err, extra={'tags': ['crawlShopException']})
        failed.append({'shopid': shop_id, 'err': err})
Esempio n. 2
0
def check(shop, failed):
    """Record a shop as failed when the Taobao shop lookup returns nothing.

    ``shop`` is indexed at 0 (id), 1 (url) and 5 (nick). The nick is encoded
    to UTF-8 and passed, wrapped in a list, to ``get_taobao_shops``. A falsy
    result appends the shop id to ``failed`` and logs an error.
    """
    shopid, shop_url = shop[0], shop[1]
    encoded_nick = shop[5].encode("utf-8")

    # Guard clause: a non-empty lookup result means there is nothing to record.
    if get_taobao_shops(get_rand_top(), [encoded_nick]):
        return

    failed.append(shopid)
    logger.error("shop %s url %s : not is taobaoke", shopid, shop_url)
Esempio n. 3
0
def check(shop, failed):
    """Look the shop up through ``get_taobao_shops`` and flag empty results.

    Reads the id, url and nick from positions 0, 1 and 5 of ``shop``. When
    the lookup result is falsy, the id is appended to ``failed`` and an
    error is logged; otherwise nothing happens.
    """
    shopid = shop[0]
    shop_url = shop[1]
    nick_list = [shop[5].encode('utf-8')]

    lookup = get_taobao_shops(get_rand_top(), nick_list)
    if lookup:
        return

    failed.append(shopid)
    logger.error("shop %s url %s : not is taobaoke", shopid, shop_url)
Esempio n. 4
0
def process_shop(db, shop, failed):
    """Refresh one shop's stored attributes from Taobao.

    Args:
        db: object whose ``execute`` method runs the SQL updates.
        shop: 12-item sequence unpacked as id, url, level, nick, sid, cid,
            taobao_created, taobao_modified, taobao_title, item_score,
            service_score, delivery_score.
        failed: list that receives the formatted traceback when the update
            ends in an unexpected exception.

    Returns:
        None.

    Raises:
        KeyboardInterrupt: re-raised untouched so the run can be aborted.
    """
    (shop_id, url, level, nick, sid, cid, taobao_created, taobao_modified,
     taobao_title, item_score, service_score, delivery_score) = shop
    # Stored values keyed by column name. This replaces the former
    # ``locals()[key]`` lookup, which silently depended on local variable
    # names matching the column names.
    stored = {
        'level': level,
        'sid': sid,
        'cid': cid,
        'taobao_created': taobao_created,
        'taobao_modified': taobao_modified,
        'taobao_title': taobao_title,
        'item_score': item_score,
        'service_score': service_score,
        'delivery_score': delivery_score,
    }
    try:
        shopinfo = get_taobao_shops(get_rand_top(), nick)
        if shopinfo.get("error", 0) == 560:
            logger.warning("Shop nick maybe error! %s", shop_id)
        new_shop = {}
        # ``in`` instead of dict.has_key(), which Python 3 removed.
        if 'shop' in shopinfo:
            info = shopinfo['shop']
            scores = info['shop_score']
            new_shop['sid'] = info['sid']
            new_shop['cid'] = info['cid']
            # Scores are kept as integers: the API value multiplied by ten.
            for score_name in ('delivery_score', 'item_score', 'service_score'):
                new_shop[score_name] = int(float(scores[score_name]) * 10)

            new_shop['taobao_created'] = info['created']
            new_shop['taobao_modified'] = info['modified']
            new_shop['taobao_title'] = info['title']

        tb = TaobaoListHtml(shop_id, url)
        tb.crawl(maxpage=1)
        if url.startswith('http://shop'):
            nick_url = tb.nick_url[0]
            # NOTE(review): the first placeholder is wrapped in double quotes;
            # if the db layer quotes bound parameters itself this may store
            # extra quote characters -- confirm against the driver in use.
            db.execute('update shop set nick_url="%s" where id=%s',
                       nick_url, shop_id)
            logger.debug("nick url is %s", nick_url)
        new_shop['level'] = tb.get_level()

        # (column, new value, old value) for every column that changed.
        update_fields = [(key, value, stored[key])
                         for key, value in new_shop.items()
                         if value != stored[key]]
        if update_fields:
            update_sql = "update shop set %s where id=%s" % (",".join(
                [get_set_sql(f) for f in update_fields]), shop_id)
            logger.debug(update_sql)
            db.execute(update_sql)
    except KeyboardInterrupt:
        raise
    except Exception:
        # Format once so the log entry and ``failed`` carry the same text.
        err = traceback.format_exc()
        logger.warning("update shop(id=%s) level hash unknown exception %s",
                       shop_id, err)
        failed.append(err)
        return None
Esempio n. 5
0
def crawl_one_shop(shop, failed):
    """Crawl one shop's item list and, when committing, store the results.

    Args:
        shop: mapping with an ``'is_commit'`` flag and a ``'shop'`` sequence;
            indexes 0, 1, 4 and 5 of that sequence are read as the shop id,
            url, type and nick (the nick is encoded to UTF-8).
        failed: list that receives ``{'shopid': ..., 'err': ...}`` whenever
            the crawl ends in an unexpected exception.

    When ``is_commit`` is true the crawled items are passed to
    ``update_shop_items`` and ``update_taobao_volume`` and the
    ``guang.crawl.shop_list_succ`` counter is incremented.

    A ``ShopOfflineException`` triggers a second look-up through
    ``get_taobao_shops``; the outcome is only logged.
    """
    # Bound before the try so the catch-all handler can always report an id,
    # even when reading ``shop`` itself is what failed.
    shop_id = None
    try:
        is_commit = shop['is_commit']
        shop_row = shop['shop']
        shop_id = shop_row[0]
        shop_url = shop_row[1]
        shop_type = shop_row[4]
        shop_nick = shop_row[5].encode('utf-8')

        tb = TaobaoListHtml(shop_id, shop_url)
        tb.crawl()
        logger.debug("crawl result %s count %s total %s", tb.id, tb.count,
                     len(tb.total_items))

        if is_commit:
            batch_size = 100
            total_items = tb.total_items

            db = get_rawdb_conn()
            try:
                update_shop_items(batch_size, db, shop_id, total_items)
                update_taobao_volume(db, shop_id, shop_type, total_items)
            finally:
                # Release the connection even when one of the updates raises.
                db.close()

            Statsd.increment('guang.crawl.shop_list_succ')
    except ShopOfflineException:
        # Double check the shop status through the taobao api.
        shopinfo = get_taobao_shops(get_rand_top(), [shop_nick])
        if not shopinfo and is_commit:
            # The "update shop set status=2" write that used to live here is
            # disabled; an empty lookup is only reported.
            logger.warning("Shop %s: %s not is taobaoke", shop_id, shop_url)
        else:
            # Reached when the lookup returned data or is_commit is false.
            logger.error("Shop %s: %s url is error!", shop_id, shop_url)
    except Exception:
        # ``Exception`` rather than a bare except, so KeyboardInterrupt and
        # SystemExit still stop the crawler.
        err = traceback.format_exc()
        Statsd.increment('guang.crawl.shop_list_failed')
        logger.error("crawl shop failed %s %s",
                     shop_id,
                     err,
                     extra={'tags': [
                         'crawlShopException',
                     ]})
        failed.append({'shopid': shop_id, 'err': err})
def process_shop(db, shop, failed):
    """Refresh one shop's stored attributes from Taobao.

    Args:
        db: object whose ``execute`` method runs the SQL updates.
        shop: 12-item sequence unpacked as id, url, level, nick, sid, cid,
            taobao_created, taobao_modified, taobao_title, item_score,
            service_score, delivery_score.
        failed: list that receives the formatted traceback when the update
            ends in an unexpected exception.

    Returns:
        None.

    Raises:
        KeyboardInterrupt: re-raised untouched so the run can be aborted.
    """
    if FLAGS.debug_parser:
        # Debugging hook: drop into pdb before touching the shop.
        import pdb
        pdb.set_trace()
    (shop_id, url, level, nick, sid, cid, taobao_created, taobao_modified,
     taobao_title, item_score, service_score, delivery_score) = shop
    # Stored values keyed by column name. This replaces the former
    # ``locals()[key]`` lookup, which silently depended on local variable
    # names matching the column names.
    stored = {
        'level': level,
        'sid': sid,
        'cid': cid,
        'taobao_created': taobao_created,
        'taobao_modified': taobao_modified,
        'taobao_title': taobao_title,
        'item_score': item_score,
        'service_score': service_score,
        'delivery_score': delivery_score,
    }
    try:
        shopinfo = get_taobao_shops(get_rand_top(), nick)
        if shopinfo.get("error", 0) == 560:
            logger.warning("Shop nick maybe error! %s", shop_id)
        new_shop = {}
        # ``in`` instead of dict.has_key(), which Python 3 removed.
        if 'shop' in shopinfo:
            info = shopinfo['shop']
            scores = info['shop_score']
            new_shop['sid'] = info['sid']
            new_shop['cid'] = info['cid']
            # Scores are kept as integers: the API value multiplied by ten.
            new_shop['delivery_score'] = int(float(scores['delivery_score']) * 10)
            new_shop['item_score'] = int(float(scores['item_score']) * 10)
            new_shop['service_score'] = int(float(scores['service_score']) * 10)

            new_shop['taobao_created'] = info['created']
            new_shop['taobao_modified'] = info['modified']
            new_shop['taobao_title'] = info['title']

        tb = TaobaoListHtml(shop_id, url)
        tb.crawl(maxpage=1)
        if url.startswith('http://shop'):
            nick_url = tb.nick_url[0]
            # NOTE(review): the first placeholder is wrapped in double quotes;
            # if the db layer quotes bound parameters itself this may store
            # extra quote characters -- confirm against the driver in use.
            db.execute('update shop set nick_url="%s" where id=%s', nick_url, shop_id)
            logger.debug("nick url is %s", nick_url)
        new_shop['level'] = tb.get_level()

        # Collect (column, new value, old value) for every changed column.
        update_fields = []
        for key, value in new_shop.items():
            old_val = stored[key]
            if value != old_val:
                update_fields.append((key, value, old_val))
        if update_fields:
            set_clause = ",".join([get_set_sql(f) for f in update_fields])
            update_sql = "update shop set %s where id=%s" % (set_clause, shop_id)
            logger.debug(update_sql)
            db.execute(update_sql)
    except KeyboardInterrupt:
        raise
    except Exception:
        # Format once so the log entry and ``failed`` carry the same text.
        err = traceback.format_exc()
        logger.warning("update shop(id=%s) level hash unknown exception %s", shop_id, err)
        failed.append(err)
        return None
Esempio n. 7
0
def check_one_shop(shop, failed):
    """Crawl the first list page of a shop and compare it with the database.

    ``shop`` is indexed at 0 (id), 1 (url) and 5 (nick); ``failed`` is
    accepted but not used here. The Taobao lookup is made up front. If the
    crawl raises ``ShopOfflineException`` the page length is taken as 0 and,
    when the lookup reported error 560, the shop row is set to ``status=2``.
    The page length is then handed to ``compare_item_indb``.
    """
    shopid, shop_url, shop_nick = shop[0], shop[1], shop[5]

    api_reply = get_taobao_shops(get_rand_top(), shop_nick)
    engine = get_db_engine()
    try:
        listing = TaobaoListHtml(shopid, shop_url)
        listing.crawl(maxpage=1)
        page_len = listing.count
    except ShopOfflineException:
        page_len = 0
        if api_reply.get('error', 0) != 560:
            logger.error("Shop %s url is error! %s --> %s", shopid, shop_url, api_reply)
        else:
            # Error 560 from the lookup: mark the shop row with status 2.
            logger.error("Shop %s url is offline! %s", shopid, shop_url)
            engine.execute("update shop set status=2 where id=%s", shopid)

    compare_item_indb(engine, page_len, shop_url, shopid)
Esempio n. 8
0
def check_one_shop(shop, failed):
    """Compare a shop's first crawled list page against the stored items.

    Reads the id, url and nick from positions 0, 1 and 5 of ``shop``;
    ``failed`` is not touched. A ``ShopOfflineException`` during the crawl
    sets the page length to 0; the earlier ``get_taobao_shops`` result then
    decides whether the shop is flagged ``status=2`` (error 560) or only
    logged. ``compare_item_indb`` is called with the resulting page length.
    """
    shopid = shop[0]
    shop_url = shop[1]
    shop_nick = shop[5]

    top_client = get_rand_top()
    shopinfo = get_taobao_shops(top_client, shop_nick)
    db = get_db_engine()
    try:
        crawler = TaobaoListHtml(shopid, shop_url)
        crawler.crawl(maxpage=1)
        page_len = crawler.count
    except ShopOfflineException:
        page_len = 0
        shop_gone = shopinfo.get('error', 0) == 560
        if shop_gone:
            logger.error("Shop %s url is offline! %s", shopid, shop_url)
            db.execute("update shop set status=2 where id=%s", shopid)
        else:
            logger.error("Shop %s url is error! %s --> %s", shopid, shop_url, shopinfo)

    compare_item_indb(db, page_len, shop_url, shopid)