Example #1
0
def crawl_one_shop(shop, tb_category, term_factory, db):
    """Crawl one Taobao shop and sync its items into the ``item`` table.

    Steps:
        1. Look up the shop's default campaign; log an error and return
           (without touching the crawl status) when there is none.
        2. Mark the shop with ``SHOP_CRAWL_STATUS_CRAWLING``.
        3. Read the shop's online num ids from ``tb_shop_item`` and split them
           with ``filterNumIds`` into new / off-shelf / common sets.
        4. Crawl the new ids with ``doCrawl``. Ids that already have a row in
           ``item`` are switched back to active and refreshed (blacklisted
           rows are skipped); the others are created.
        5. Set ``status=2`` on the off-shelf ids.
        6. Reset the shop to ``SHOP_CRAWL_STATUS_NONE``. This happens in a
           ``finally`` so a failure during the crawl does not leave the shop
           flagged as crawling.

    Args:
        shop: indexable shop row; ``[0]`` is the shop id, ``[2]`` the shop url
            and ``[3]`` the term limits. ``[1]`` is not used here.
        tb_category: object providing ``getCategoryPath(cid)``.
        term_factory: passed through to ``TaobaoItem.matchTaobaoTerms``.
        db: database handle whose ``execute(sql, *params)`` returns an
            iterable of rows.

    Returns:
        None.
    """
    # Local import: the module's import block is not visible from here.
    import re

    shop_id = shop[0]
    shop_url = shop[2]
    shop_termLimits = shop[3]

    # Whitelist mode (shop.mode) is not used yet.

    defaultCampaign = list(db.execute(
        "select id, default_uctrac_price from campaign where shop_id=%s and system_status = 1 and delete_flag = 0",
        shop_id))
    if not defaultCampaign:
        logger.error("can not get the default campaign for shop: %s", shop_id)
        return

    # Matches the "spm" tracking parameter and its dotted numeric value.
    spm_pattern = re.compile(r"spm=[.\d]*")

    db.execute("update shop set crawl_status=%s where id=%s", SHOP_CRAWL_STATUS_CRAWLING, shop_id)
    try:
        # All num ids of the shop's items, taken from the new-arrivals crawl.
        # Note: there may be several rows per shop.
        tb_numids_set = set()
        for row in db.execute("SELECT itemids FROM tb_shop_item WHERE shopid = %s", shop_id):
            if not row[0]:
                # NULL / empty itemids would otherwise yield a bogus '' id.
                continue
            for raw_id in row[0].split(','):
                raw_id = raw_id.strip()
                if raw_id:
                    tb_numids_set.add(raw_id)
        logger.info("crawling shop: %s %s, taobao online num %s", shop_id, shop_url, len(tb_numids_set))

        # Partition the online ids against what is already stored.
        new_numids_set, offShelf_numids_set, common_numids_set = filterNumIds(db, shop_id, tb_numids_set)
        logger.info("stat taobao shop %s: new_num:%s, offline_num:%s, common_num:%s",
                    shop_id, len(new_numids_set), len(offShelf_numids_set), len(common_numids_set))

        new_num = 0
        off2On_num = 0
        black_num = 0
        new_item_list = doCrawl(shop_id, new_numids_set) if len(new_numids_set) > 0 else None
        for dict_item in new_item_list or ():
            num_id = str(dict_item['num_iid'])
            n_cid = dict_item['cid']
            tb_title = dict_item['title']
            tb_detail_url = str(dict_item['detail_url'])
            tb_price = float(dict_item['price'])
            tb_pic_url = str(dict_item['pic_url'])
            volume = dict_item.get('volume', 0)

            try:
                # Check whether this item has been put back on the shelf.
                db_item = list(db.execute(
                    "select id, title, pic_url, local_pic_url, price, manual_set, status from item where shop_id=%s and num_id=%s",
                    shop_id, num_id))
                if db_item:
                    # update
                    db_row = db_item[0]
                    db_item_id = int(db_row[0])
                    db_title = db_row[1]
                    db_pic_url = db_row[2]
                    db_local_pic_url = db_row[3]
                    db_price = float(db_row[4])
                    db_manual_set = int(db_row[5])
                    db_status = int(db_row[6])

                    if db_status == ITEM_STATUS_BLACKLIST:
                        black_num += 1
                        continue

                    item = TaobaoItem(shop_id, db_item_id, num_id)
                    # Set it back to active first, then check the other attributes.
                    item.status = ITEM_STATUS_ACTIVE

                    if db_manual_set == 1:
                        # Picture and title were set manually: keep the title,
                        # re-download the picture only if the local file is gone.
                        if not imgExists(shop_id, db_local_pic_url):
                            item.local_pic_url = db_local_pic_url
                            item.setPicUrl(tb_pic_url)
                        if tb_price != db_price and quickUpdatePrice(db_item_id, db):
                            item.price = tb_price
                    else:
                        if tb_title != db_title:
                            item.title = tb_title
                        if tb_price != db_price and quickUpdatePrice(db_item_id, db):
                            item.price = tb_price
                        # Re-download when the picture url changed or the
                        # local picture no longer exists.
                        if tb_pic_url != db_pic_url or not imgExists(shop_id, db_local_pic_url):
                            item.local_pic_url = db_local_pic_url
                            item.setPicUrl(tb_pic_url)

                    # TODO: the db item was offline, so its terms may need to be
                    # re-matched. item_term rows are not deleted when an item
                    # goes offline, but another channel might have removed
                    # them; handle this when the need arises.

                    item.db_update(db)
                    off2On_num += 1
                else:
                    # add
                    item = TaobaoItem(shop_id, 0, num_id)
                    item.title = tb_title
                    # Regex substitution: str.replace() would treat the
                    # pattern as literal text and never match.
                    item.detail_url = spm_pattern.sub("spm=2014.12669715.0.0", tb_detail_url)
                    item.price = tb_price
                    item.pic_url = tb_pic_url
                    item.volume = volume
                    item.category = tb_category.getCategoryPath(n_cid)
                    item.termIds = item.matchTaobaoTerms(term_factory, str(shop_termLimits))
                    item.setPicUrl(tb_pic_url)
                    item.setCampaign(defaultCampaign)
                    item.status = ITEM_STATUS_ACTIVE

                    item.db_create(db)
                    new_num += 1
            except Exception:
                # Best effort per item: log and move on to the next one.
                logger.error("%s: %s creating failed %s", shop_id, num_id, traceback.format_exc())
                continue
        logger.info("shop %s crawler: new %s, back on line %s, black %s", shop_id, new_num, off2On_num, black_num)

        if offShelf_numids_set:
            # offline
            off_ids = [str(s) for s in offShelf_numids_set]
            placeholders = ', '.join(['%s'] * len(off_ids))
            db.execute("update item set status=2 where num_id in (" + placeholders + ")", *off_ids)
        logger.info("shop %s crawler: offline %s", shop_id, len(offShelf_numids_set))

        # NOTE: items in common_numids_set (already crawled and still online)
        # are deliberately left untouched here. A disabled variant used to
        # re-crawl them as well to refresh title/price/pic_url/volume faster;
        # restore it from version control if that is needed.
    finally:
        # Always clear the crawling flag, even if the crawl raised, so the
        # shop is not left stuck in the crawling state.
        db.execute("update shop set crawl_status=%s where id=%s", SHOP_CRAWL_STATUS_NONE, shop_id)
Example #2
0
def crawl_one_shop(shop, tb_category, term_factory, db):
    """Crawl one Taobao shop and sync its items into the ``item`` table.

    Steps:
        1. Look up the shop's default campaign; log an error and return
           (without touching the crawl status) when there is none.
        2. Mark the shop with ``SHOP_CRAWL_STATUS_CRAWLING``.
        3. Read the shop's online num ids from ``tb_shop_item`` and split them
           with ``filterNumIds`` into new / off-shelf / common sets.
        4. Crawl the new ids with ``doCrawl``. Ids that already have a row in
           ``item`` are switched back to active and refreshed (blacklisted
           rows are skipped); the others are created.
        5. Set ``status=2`` on the off-shelf ids.
        6. Reset the shop to ``SHOP_CRAWL_STATUS_NONE``. This happens in a
           ``finally`` so a failure during the crawl does not leave the shop
           flagged as crawling.

    Args:
        shop: indexable shop row; ``[0]`` is the shop id, ``[2]`` the shop url
            and ``[3]`` the term limits. ``[1]`` is not used here.
        tb_category: object providing ``getCategoryPath(cid)``.
        term_factory: passed through to ``TaobaoItem.matchTaobaoTerms``.
        db: database handle whose ``execute(sql, *params)`` returns an
            iterable of rows.

    Returns:
        None.
    """
    # Local import: the module's import block is not visible from here.
    import re

    shop_id = shop[0]
    shop_url = shop[2]
    shop_termLimits = shop[3]

    # Whitelist mode (shop.mode) is not used yet.

    defaultCampaign = list(db.execute(
        "select id, default_uctrac_price from campaign where shop_id=%s and system_status = 1 and delete_flag = 0",
        shop_id))
    if not defaultCampaign:
        logger.error("can not get the default campaign for shop: %s", shop_id)
        return

    # Matches the "spm" tracking parameter and its dotted numeric value.
    spm_pattern = re.compile(r"spm=[.\d]*")

    db.execute("update shop set crawl_status=%s where id=%s", SHOP_CRAWL_STATUS_CRAWLING, shop_id)
    try:
        # All num ids of the shop's items, taken from the new-arrivals crawl.
        # Note: there may be several rows per shop.
        tb_numids_set = set()
        for row in db.execute("SELECT itemids FROM tb_shop_item WHERE shopid = %s", shop_id):
            if not row[0]:
                # NULL / empty itemids would otherwise yield a bogus '' id.
                continue
            for raw_id in row[0].split(','):
                raw_id = raw_id.strip()
                if raw_id:
                    tb_numids_set.add(raw_id)
        logger.info("crawling shop: %s %s, taobao online num %s", shop_id, shop_url, len(tb_numids_set))

        # Partition the online ids against what is already stored.
        new_numids_set, offShelf_numids_set, common_numids_set = filterNumIds(db, shop_id, tb_numids_set)
        logger.info("stat taobao shop %s: new_num:%s, offline_num:%s, common_num:%s",
                    shop_id, len(new_numids_set), len(offShelf_numids_set), len(common_numids_set))

        new_num = 0
        off2On_num = 0
        black_num = 0
        new_item_list = doCrawl(shop_id, new_numids_set) if len(new_numids_set) > 0 else None
        for dict_item in new_item_list or ():
            num_id = str(dict_item['num_iid'])
            n_cid = dict_item['cid']
            tb_title = dict_item['title']
            tb_detail_url = str(dict_item['detail_url'])
            tb_price = float(dict_item['price'])
            tb_pic_url = str(dict_item['pic_url'])
            volume = dict_item.get('volume', 0)

            try:
                # Check whether this item has been put back on the shelf.
                db_item = list(db.execute(
                    "select id, title, pic_url, local_pic_url, price, manual_set, status from item where shop_id=%s and num_id=%s",
                    shop_id, num_id))
                if db_item:
                    # update
                    db_row = db_item[0]
                    db_item_id = int(db_row[0])
                    db_title = db_row[1]
                    db_pic_url = db_row[2]
                    db_local_pic_url = db_row[3]
                    db_price = float(db_row[4])
                    db_manual_set = int(db_row[5])
                    db_status = int(db_row[6])

                    if db_status == ITEM_STATUS_BLACKLIST:
                        black_num += 1
                        continue

                    item = TaobaoItem(shop_id, db_item_id, num_id)
                    # Set it back to active first, then check the other attributes.
                    item.status = ITEM_STATUS_ACTIVE

                    if db_manual_set == 1:
                        # Picture and title were set manually: keep the title,
                        # re-download the picture only if the local file is gone.
                        if not imgExists(shop_id, db_local_pic_url):
                            item.local_pic_url = db_local_pic_url
                            item.setPicUrl(tb_pic_url)
                        if tb_price != db_price and quickUpdatePrice(db_item_id, db):
                            item.price = tb_price
                    else:
                        if tb_title != db_title:
                            item.title = tb_title
                        if tb_price != db_price and quickUpdatePrice(db_item_id, db):
                            item.price = tb_price
                        # Re-download when the picture url changed or the
                        # local picture no longer exists.
                        if tb_pic_url != db_pic_url or not imgExists(shop_id, db_local_pic_url):
                            item.local_pic_url = db_local_pic_url
                            item.setPicUrl(tb_pic_url)

                    # TODO: the db item was offline, so its terms may need to be
                    # re-matched. item_term rows are not deleted when an item
                    # goes offline, but another channel might have removed
                    # them; handle this when the need arises.

                    item.db_update(db)
                    off2On_num += 1
                else:
                    # add
                    item = TaobaoItem(shop_id, 0, num_id)
                    item.title = tb_title
                    # Regex substitution: str.replace() would treat the
                    # pattern as literal text and never match.
                    item.detail_url = spm_pattern.sub("spm=2014.12669715.0.0", tb_detail_url)
                    item.price = tb_price
                    item.pic_url = tb_pic_url
                    item.volume = volume
                    item.category = tb_category.getCategoryPath(n_cid)
                    item.termIds = item.matchTaobaoTerms(term_factory, str(shop_termLimits))
                    item.setPicUrl(tb_pic_url)
                    item.setCampaign(defaultCampaign)
                    item.status = ITEM_STATUS_ACTIVE

                    item.db_create(db)
                    new_num += 1
            except Exception:
                # Best effort per item: log and move on to the next one.
                logger.error("%s: %s creating failed %s", shop_id, num_id, traceback.format_exc())
                continue
        logger.info("shop %s crawler: new %s, back on line %s, black %s", shop_id, new_num, off2On_num, black_num)

        if offShelf_numids_set:
            # offline
            off_ids = [str(s) for s in offShelf_numids_set]
            placeholders = ', '.join(['%s'] * len(off_ids))
            db.execute("update item set status=2 where num_id in (" + placeholders + ")", *off_ids)
        logger.info("shop %s crawler: offline %s", shop_id, len(offShelf_numids_set))

        # NOTE: items in common_numids_set (already crawled and still online)
        # are deliberately left untouched here. A disabled variant used to
        # re-crawl them as well to refresh title/price/pic_url/volume faster;
        # restore it from version control if that is needed.
    finally:
        # Always clear the crawling flag, even if the crawl raised, so the
        # shop is not left stuck in the crawling state.
        db.execute("update shop set crawl_status=%s where id=%s", SHOP_CRAWL_STATUS_NONE, shop_id)