コード例 #1
0
def _apply_item_update(db, item):
    """Update one item's row when its Taobao title and/or picture changed.

    Args:
        db: Engine used to run the ``update item`` statements.
        item: Mapping with a ``'req'`` sequence indexed below as (id, num_id,
            shop_id, title, pic_url, local picture file name) and a
            ``'resp'`` mapping with ``'title'`` and ``'pic_url'`` (falsy when
            no response was obtained for this item).
    """
    req = item['req']
    item_id = req[0]
    item_iid = req[1]
    shop_id = req[2]
    item_title = req[3]
    item_picurl = req[4]
    # Reuse the file name already stored in the database (it is not
    # regenerated), e.g. "18142957186_28924096.jpg".
    local_pic_url = req[5]

    resp = item['resp']
    if not resp:
        return

    taobao_title = resp['title']
    taobao_picurl = resp['pic_url']

    if FLAGS.forcibly:
        # Forced refresh: rewrite both fields even if nothing changed.
        is_title_update = True
        is_picurl_update = True
    else:
        is_title_update = item_title != taobao_title
        is_picurl_update = item_picurl != taobao_picurl

    if not (is_title_update or is_picurl_update):
        return

    if is_picurl_update:
        # The picture changed, so fetch it again before touching the row;
        # the returned size is stored in pic_width / pic_height.
        width, height = download_image({
            'item_id': item_id,
            'num_id': item_iid,
            'shop_id': shop_id,
            'pic_url': taobao_picurl,
            'image_name': local_pic_url,
            'crawl_path': FLAGS.crawl_path,
        })

    if is_title_update and is_picurl_update:
        db.execute("update item set modified=now(), title=%s, pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_title, taobao_picurl, width, height, item_id)
        logger.info("item %s num_id %s update title from %s to %s, pic_url from %s to %s", item_id, item_iid, item_title, taobao_title, item_picurl, taobao_picurl)
    elif is_title_update:
        db.execute("update item set modified=now(), title=%s where id=%s", taobao_title, item_id)
        logger.info("item %s update title from %s to %s", item_id, item_title, taobao_title)
    else:
        db.execute("update item set modified=now(), pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_picurl, width, height, item_id)
        logger.info("item %s num_id %s update pic_url from %s to %s", item_id, item_iid, item_picurl, taobao_picurl)


def update_item(sql):
    """Refresh item titles and pictures from Taobao.

    Runs ``sql`` to select the items to check, looks them up through
    ``get_taobao_items`` and updates the ``item`` table for every item whose
    title and/or picture URL differs from the Taobao response (or for every
    item with a response when ``FLAGS.forcibly`` is set).

    Args:
        sql: Query whose rows are read as (id, num_id, shop_id, title,
            pic_url, local picture file name).

    Returns:
        None. A failure on one item is logged and the run continues.
    """
    t = time.time()
    db = get_db_engine()
    rows = db.execute(sql)

    results = get_taobao_items(
        get_top(), rows,
        fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]),
        calllimit=60)

    for batch_item in results:
        # .items() works on both Python 2 and 3, unlike .iteritems().
        for _iid, item in batch_item.items.items():
            try:
                _apply_item_update(db, item)
            except Exception:
                # Exception (not a bare except) so Ctrl-C / SystemExit can
                # still stop a long-running crawl.
                logger.error("update failed %s", traceback.format_exc())
    spent = time.time() - t
    logger.info("update_item_title_image use time : %s", spent*1000)
コード例 #2
0
def _apply_item_update(db, item):
    """Update one item's row when its Taobao title and/or picture changed.

    Args:
        db: Engine used to run the ``update item`` statements.
        item: Mapping with a ``'req'`` sequence indexed below as (id, num_id,
            shop_id, title, pic_url, local picture file name) and a
            ``'resp'`` mapping with ``'title'`` and ``'pic_url'`` (falsy when
            no response was obtained for this item).
    """
    req = item['req']
    item_id = req[0]
    item_iid = req[1]
    shop_id = req[2]
    item_title = req[3]
    item_picurl = req[4]
    # Reuse the file name already stored in the database (it is not
    # regenerated), e.g. "18142957186_28924096.jpg".
    local_pic_url = req[5]

    resp = item['resp']
    if not resp:
        return

    taobao_title = resp['title']
    taobao_picurl = resp['pic_url']

    if FLAGS.forcibly:
        # Forced refresh: rewrite both fields even if nothing changed.
        is_title_update = True
        is_picurl_update = True
    else:
        is_title_update = item_title != taobao_title
        is_picurl_update = item_picurl != taobao_picurl

    if not (is_title_update or is_picurl_update):
        return

    if is_picurl_update:
        # The picture changed, so fetch it again before touching the row;
        # the returned size is stored in pic_width / pic_height.
        width, height = download_image({
            'item_id': item_id,
            'num_id': item_iid,
            'shop_id': shop_id,
            'pic_url': taobao_picurl,
            'image_name': local_pic_url,
            'crawl_path': FLAGS.crawl_path,
        })

    if is_title_update and is_picurl_update:
        db.execute("update item set modified=now(), title=%s, pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_title, taobao_picurl, width, height, item_id)
        logger.info("item %s num_id %s update title from %s to %s, pic_url from %s to %s", item_id, item_iid, item_title, taobao_title, item_picurl, taobao_picurl)
    elif is_title_update:
        db.execute("update item set modified=now(), title=%s where id=%s", taobao_title, item_id)
        logger.info("item %s update title from %s to %s", item_id, item_title, taobao_title)
    else:
        db.execute("update item set modified=now(), pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_picurl, width, height, item_id)
        logger.info("item %s num_id %s update pic_url from %s to %s", item_id, item_iid, item_picurl, taobao_picurl)


def update_item(sql):
    """Refresh item titles and pictures from Taobao.

    Runs ``sql`` to select the items to check, looks them up through
    ``get_taobao_items`` and updates the ``item`` table for every item whose
    title and/or picture URL differs from the Taobao response (or for every
    item with a response when ``FLAGS.forcibly`` is set).

    Args:
        sql: Query whose rows are read as (id, num_id, shop_id, title,
            pic_url, local picture file name).

    Returns:
        None. A failure on one item is logged and the run continues.
    """
    t = time.time()
    db = get_db_engine()
    rows = db.execute(sql)

    results = get_taobao_items(
        get_top(), rows,
        fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]),
        calllimit=60)

    for batch_item in results:
        # .items() works on both Python 2 and 3, unlike .iteritems().
        for _iid, item in batch_item.items.items():
            try:
                _apply_item_update(db, item)
            except Exception:
                # Exception (not a bare except) so Ctrl-C / SystemExit can
                # still stop a long-running crawl.
                logger.error("update failed %s", traceback.format_exc())
    spent = time.time() - t
    logger.info("update_item_title_image use time : %s", spent*1000)
コード例 #3
0
def crawl_main():
    """Quick status and price check of the hottest items against Taobao.

    Selects up to ``FLAGS.limit`` active items from ``item_hotest`` and
    looks them up through ``get_taobao_items``. For each item with a
    response:

    * if Taobao no longer reports it as ``onsale``, the row is set to
      ``status=2``;
    * otherwise, if the price moved by more than 20% or by more than 2.0,
      the change is counted and, when ``FLAGS.commit_price`` is set,
      written back.

    A failure on one item is logged and the run continues. A summary line
    is logged at the end.
    """
    write_db, read_db = get_db_engines(**{'dbconnstrs': FLAGS.xdbconnstrs})

    sql = "select item.id, item.num_id, item.price, item.pic_url, item.volume from item_hotest, item, shop where item_hotest.item_id = item.id and item.status = 1 and item.shop_id = shop.id and shop.type <= 2 and shop.status=1 limit %s" % FLAGS.limit

    rows = read_db.execute(sql)

    counter = 0
    off_counter = 0
    change_counter = 0
    # Never incremented here; kept because the summary log reports it.
    vol_change_counter = 0
    total = rows.rowcount
    results = get_taobao_items(
        get_top(),
        rows,
        fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]),
        calllimit=300)
    for batch_item in results:
        for _iid, item in batch_item.items.items():
            try:
                counter += 1
                item_id = item['req'][0]
                item_iid = item['req'][1]
                item_price = item['req'][2]
                if item['resp']:
                    if item['resp']['approve_status'] != 'onsale':
                        logger.debug("Item %s/%s %s %s is offshelf", counter,
                                     total, item_id, item_iid)
                        off_counter += 1
                        # Bind parameters instead of %-interpolating the
                        # value into the SQL text.
                        write_db.execute(
                            "update item set status=2, modified=now()  where id=%s",
                            item_id)
                    else:
                        price = float(item['resp']['price'])
                        diff = abs(item_price - price)
                        # The tiny epsilon avoids dividing by zero when the
                        # stored price is 0.
                        if diff / (item_price + 0.0000001) > 0.2 or diff > 2.0:
                            change_counter += 1
                            logger.debug("Item %s/%s %s %s price %s -> %s",
                                         counter, total, item_id, item_iid,
                                         item_price, price)
                            if FLAGS.commit_price:
                                write_db.execute(
                                    "update item set price=%s where id=%s",
                                    price, item_id)
                logger.debug("req %s resp %s", item['req'], item['resp'])
            except Exception:
                # Exception (not a bare except) so Ctrl-C / SystemExit can
                # still stop the crawl.
                logger.error("update failed %s", traceback.format_exc())
    logger.info(
        "Taobao quickupdate, total %s, off %s, price change %s, volume change %s",
        total, off_counter, change_counter, vol_change_counter)
コード例 #4
0
def doCrawl(shop_id, numids_set):
    """Fetch Taobao item data for a set of numeric item ids.

    Args:
        shop_id: Value placed at index 0 of every request tuple; this
            function does not use it otherwise.
        numids_set: Iterable of Taobao numeric item ids to look up.

    Returns:
        list of dict: a copy of the response for every item that came back
        with a truthy ``'resp'``.
    """
    # get_taobao_items takes row-like sequences as its second argument and
    # fn_join_iids reads index 1 of each, so wrap every id in a tuple that
    # mimics a database row. The wrapping has no other purpose.
    num_iids = [(shop_id, num_id) for num_id in numids_set]

    results = get_taobao_items(get_top(), num_iids, fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]))

    return_item_list = []
    for r in results:
        # .items() works on both Python 2 and 3, unlike .iteritems().
        for _iid, item in r.items.items():
            if item['resp']:
                return_item_list.append(dict(item['resp']))
    return return_item_list
コード例 #5
0
def doCrawl(shop_id, numids_set):
    """Fetch Taobao item data for a set of numeric item ids.

    Args:
        shop_id: Value placed at index 0 of every request tuple; this
            function does not use it otherwise.
        numids_set: Iterable of Taobao numeric item ids to look up.

    Returns:
        list of dict: a copy of the response for every item that came back
        with a truthy ``'resp'``.
    """
    # get_taobao_items takes row-like sequences as its second argument and
    # fn_join_iids reads index 1 of each, so wrap every id in a tuple that
    # mimics a database row. The wrapping has no other purpose.
    num_iids = [(shop_id, num_id) for num_id in numids_set]

    results = get_taobao_items(get_top(), num_iids, fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]))

    # .items() works on both Python 2 and 3, unlike .iteritems().
    return [
        dict(item['resp'])
        for r in results
        for _iid, item in r.items.items()
        if item['resp']
    ]
コード例 #6
0
ファイル: crawl_taobao.py プロジェクト: ljb-2000/tb-crawler
def doCrawl(shop_id, numids_set):
    """Fetch Taobao item data for a set of numeric item ids via taobaoke.

    Args:
        shop_id: Value placed at index 0 of every request tuple; this
            function does not use it otherwise.
        numids_set: Iterable of Taobao numeric item ids to look up.

    Returns:
        list of dict: a copy of the response for every item that came back
        with a truthy ``"resp"``.
    """
    # get_taobao_items takes row-like sequences as its second argument and
    # fn_join_iids reads index 1 of each, so wrap every id in a tuple that
    # mimics a database row. The wrapping has no other purpose.
    num_iids = [(shop_id, num_id) for num_id in numids_set]

    # The taobao.tbk.items.detail.get API is rate limited, so pause before
    # each call.
    time.sleep(3)
    results = get_taobao_items(get_taobaoke_top(), num_iids, fn_join_iids=lambda x: ",".join([str(i[1]) for i in x]))

    return_item_list = []
    for r in results:
        # .items() works on both Python 2 and 3, unlike .iteritems().
        for _iid, item in r.items.items():
            if item["resp"]:
                return_item_list.append(dict(item["resp"]))
    return return_item_list
コード例 #7
0
def crawl_main():
    """Quick status and price check of the hottest items against Taobao.

    Selects up to ``FLAGS.limit`` active items from ``item_hotest`` and
    looks them up through ``get_taobao_items``. For each item with a
    response:

    * if Taobao no longer reports it as ``onsale``, the row is set to
      ``status=2``;
    * otherwise, if the price moved by more than 20% or by more than 2.0,
      the change is counted and, when ``FLAGS.commit_price`` is set,
      written back.

    A failure on one item is logged and the run continues. A summary line
    is logged at the end.
    """
    write_db, read_db = get_db_engines(**{'dbconnstrs' : FLAGS.xdbconnstrs})

    sql = "select item.id, item.num_id, item.price, item.pic_url, item.volume from item_hotest, item, shop where item_hotest.item_id = item.id and item.status = 1 and item.shop_id = shop.id and shop.type <= 2 and shop.status=1 limit %s" % FLAGS.limit

    rows = read_db.execute(sql)

    counter = 0
    off_counter = 0
    change_counter = 0
    # Never incremented here; kept because the summary log reports it.
    vol_change_counter = 0
    total = rows.rowcount
    results = get_taobao_items(get_top(), rows, fn_join_iids=lambda x:','.join([str(i[1]) for i in x]), calllimit=300)
    for batch_item in results:
        for _iid, item in batch_item.items.items():
            try:
                counter += 1
                item_id = item['req'][0]
                item_iid = item['req'][1]
                item_price = item['req'][2]
                if item['resp']:
                    if item['resp']['approve_status'] != 'onsale':
                        logger.debug("Item %s/%s %s %s is offshelf", counter, total, item_id, item_iid)
                        off_counter += 1
                        # Bind parameters instead of %-interpolating the
                        # value into the SQL text.
                        write_db.execute("update item set status=2, modified=now()  where id=%s", item_id)
                    else:
                        price = float(item['resp']['price'])
                        diff = abs(item_price - price)
                        # The tiny epsilon avoids dividing by zero when the
                        # stored price is 0.
                        if diff / (item_price + 0.0000001) > 0.2 or diff > 2.0:
                            change_counter += 1
                            logger.debug("Item %s/%s %s %s price %s -> %s", counter, total, item_id, item_iid, item_price, price)
                            if FLAGS.commit_price:
                                write_db.execute("update item set price=%s where id=%s", price, item_id)
                logger.debug("req %s resp %s", item['req'], item['resp'])
            except Exception:
                # Exception (not a bare except) so Ctrl-C / SystemExit can
                # still stop the crawl.
                logger.error("update failed %s", traceback.format_exc())
    logger.info("Taobao quickupdate, total %s, off %s, price change %s, volume change %s", total, off_counter, change_counter, vol_change_counter)