def update_item(sql):
    """Refresh stored item titles and pictures from Taobao.

    Runs ``sql`` on the engine returned by ``get_db_engine()``, hands the
    rows to ``get_taobao_items`` and, for every item that comes back with a
    truthy response, compares the stored title / picture URL with the ones
    in the response.  A changed picture is fetched again through
    ``download_image`` and the ``item`` row is updated with the new values.
    When ``FLAGS.forcibly`` is set both fields are refreshed without
    comparing.

    Each row is read positionally as (id, num_id, shop_id, title, pic_url,
    local picture file name), so ``sql`` must select its columns in that
    order.  An exception raised while handling one item is logged and the
    loop moves on to the next item.
    """
    t = time.time()
    db = get_db_engine()
    rows = db.execute(sql)

    results = get_taobao_items(
        get_top(), rows,
        fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]),
        calllimit=60)

    for batch_item in results:
        for iid, item in batch_item.items.items():
            try:
                item_id = item['req'][0]
                item_iid = item['req'][1]
                shop_id = item['req'][2]
                item_title = item['req'][3]
                item_picurl = item['req'][4]
                # The file name stored in the database is reused as-is (never
                # regenerated); it looks like "18142957186_28924096.jpg".
                local_pic_url = item['req'][5]

                if not item['resp']:
                    continue

                taobao_title = item['resp']['title']
                taobao_picurl = item['resp']['pic_url']

                if FLAGS.forcibly:
                    # Forced refresh: treat both fields as changed.
                    is_title_update = True
                    is_picurl_update = True
                else:
                    is_title_update = item_title != taobao_title
                    is_picurl_update = item_picurl != taobao_picurl

                if is_picurl_update:
                    # A different pic_url means the image has to be fetched
                    # again before the row is updated with its new size.
                    width, height = download_image({'item_id': item_id, 'num_id': item_iid, 'shop_id': shop_id, 'pic_url': taobao_picurl, 'image_name': local_pic_url, 'crawl_path': FLAGS.crawl_path})

                if is_title_update and is_picurl_update:
                    db.execute("update item set modified=now(), title=%s, pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_title, taobao_picurl, width, height, item_id)
                    logger.info("item %s num_id %s update title from %s to %s, pic_url from %s to %s", item_id, item_iid, item_title, taobao_title, item_picurl, taobao_picurl)
                elif is_title_update:
                    db.execute("update item set modified=now(), title=%s where id=%s", taobao_title, item_id)
                    logger.info("item %s update title from %s to %s", item_id, item_title, taobao_title)
                elif is_picurl_update:
                    db.execute("update item set modified=now(), pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_picurl, width, height, item_id)
                    logger.info("item %s num_id %s update pic_url from %s to %s", item_id, item_iid, item_picurl, taobao_picurl)
            except Exception:
                # Best effort per item; KeyboardInterrupt/SystemExit still
                # propagate so the job can be stopped.
                logger.error("update failed %s", traceback.format_exc())
    spent = time.time() - t
    logger.info("update_item_title_image use time : %s", spent*1000)
Ejemplo n.º 2
0
def update_item(sql):
    """Refresh stored item titles and pictures from Taobao.

    Runs ``sql`` on the engine returned by ``get_db_engine()``, hands the
    rows to ``get_taobao_items`` and, for every item that comes back with a
    truthy response, compares the stored title / picture URL with the ones
    in the response.  A changed picture is fetched again through
    ``download_image`` and the ``item`` row is updated with the new values.
    When ``FLAGS.forcibly`` is set both fields are refreshed without
    comparing.

    Each row is read positionally as (id, num_id, shop_id, title, pic_url,
    local picture file name), so ``sql`` must select its columns in that
    order.  An exception raised while handling one item is logged and the
    loop moves on to the next item.
    """
    t = time.time()
    db = get_db_engine()
    rows = db.execute(sql)

    results = get_taobao_items(
        get_top(), rows,
        fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]),
        calllimit=60)

    for batch_item in results:
        for iid, item in batch_item.items.items():
            try:
                item_id = item['req'][0]
                item_iid = item['req'][1]
                shop_id = item['req'][2]
                item_title = item['req'][3]
                item_picurl = item['req'][4]
                # The file name stored in the database is reused as-is (never
                # regenerated); it looks like "18142957186_28924096.jpg".
                local_pic_url = item['req'][5]

                if not item['resp']:
                    continue

                taobao_title = item['resp']['title']
                taobao_picurl = item['resp']['pic_url']

                if FLAGS.forcibly:
                    # Forced refresh: treat both fields as changed.
                    is_title_update = True
                    is_picurl_update = True
                else:
                    is_title_update = item_title != taobao_title
                    is_picurl_update = item_picurl != taobao_picurl

                if is_picurl_update:
                    # A different pic_url means the image has to be fetched
                    # again before the row is updated with its new size.
                    width, height = download_image({'item_id': item_id, 'num_id': item_iid, 'shop_id': shop_id, 'pic_url': taobao_picurl, 'image_name': local_pic_url, 'crawl_path': FLAGS.crawl_path})

                if is_title_update and is_picurl_update:
                    db.execute("update item set modified=now(), title=%s, pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_title, taobao_picurl, width, height, item_id)
                    logger.info("item %s num_id %s update title from %s to %s, pic_url from %s to %s", item_id, item_iid, item_title, taobao_title, item_picurl, taobao_picurl)
                elif is_title_update:
                    db.execute("update item set modified=now(), title=%s where id=%s", taobao_title, item_id)
                    logger.info("item %s update title from %s to %s", item_id, item_title, taobao_title)
                elif is_picurl_update:
                    db.execute("update item set modified=now(), pic_url=%s, pic_width=%s, pic_height=%s where id=%s", taobao_picurl, width, height, item_id)
                    logger.info("item %s num_id %s update pic_url from %s to %s", item_id, item_iid, item_picurl, taobao_picurl)
            except Exception:
                # Best effort per item; KeyboardInterrupt/SystemExit still
                # propagate so the job can be stopped.
                logger.error("update failed %s", traceback.format_exc())
    spent = time.time() - t
    logger.info("update_item_title_image use time : %s", spent*1000)
Ejemplo n.º 3
0
def crawl_main():
    """Check the hottest items against Taobao for shelf status and price.

    Reads up to ``FLAGS.limit`` rows (id, num_id, price, pic_url, volume)
    joined from ``item_hotest``, ``item`` and ``shop`` on the read engine and
    passes them to ``get_taobao_items``.  For every item with a truthy
    response:

    * if ``approve_status`` is not ``'onsale'`` the item row is set to
      ``status=2`` on the write engine;
    * otherwise the stored price is compared with the response price, and a
      difference of more than 20% (relative) or more than 2.0 (absolute) is
      counted and, when ``FLAGS.commit_price`` is set, written back.

    An exception raised while handling one item is logged and the loop
    continues.  A summary line is logged at the end.
    """
    write_db, read_db = get_db_engines(**{'dbconnstrs': FLAGS.xdbconnstrs})

    sql = "select item.id, item.num_id, item.price, item.pic_url, item.volume from item_hotest, item, shop where item_hotest.item_id = item.id and item.status = 1 and item.shop_id = shop.id and shop.type <= 2 and shop.status=1 limit %s" % FLAGS.limit

    rows = read_db.execute(sql)

    counter = 0
    off_counter = 0
    change_counter = 0
    # NOTE(review): nothing in this function increments this counter, so the
    # summary always reports 0 volume changes.
    vol_change_counter = 0
    total = rows.rowcount
    results = get_taobao_items(
        get_top(),
        rows,
        fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]),
        calllimit=300)
    for batch_item in results:
        for iid, item in batch_item.items.items():
            try:
                counter += 1
                item_id = item['req'][0]
                item_iid = item['req'][1]
                item_price = item['req'][2]
                if item['resp']:
                    if item['resp']['approve_status'] != 'onsale':
                        logger.debug("Item %s/%s %s %s is offshelf", counter,
                                     total, item_id, item_iid)
                        off_counter += 1
                        write_db.execute(
                            "update item set status=2, modified=now()  where id=%s"
                            % item_id)
                    else:
                        price = float(item['resp']['price'])
                        price_diff = abs(item_price - price)
                        # The small epsilon keeps the relative check from
                        # dividing by zero when the stored price is 0.
                        if (price_diff / (item_price + 0.0000001) > 0.2
                                or price_diff > 2.0):
                            change_counter += 1
                            logger.debug("Item %s/%s %s %s price %s -> %s",
                                         counter, total, item_id, item_iid,
                                         item_price, price)
                            if FLAGS.commit_price:
                                write_db.execute(
                                    "update item set price=%s where id=%s" %
                                    (price, item_id))
                logger.debug("req %s resp %s", item['req'], item['resp'])
            except Exception:
                # Best effort per item; KeyboardInterrupt/SystemExit still
                # propagate so the job can be stopped.
                logger.error("update failed %s", traceback.format_exc())
    logger.info(
        "Taobao quickupdate, total %s, off %s, price change %s, volume change %s",
        total, off_counter, change_counter, vol_change_counter)
Ejemplo n.º 4
0
    def buildPath(self, cid):
        """Save category ``cid`` and, recursively, each of its ancestors.

        Looks the category up with ``get_taobao_itemcats``, stores it through
        ``self.saveCategory(cid, parent_cid, name)`` and repeats the same for
        the parent until a category reports ``parent_cid == 0`` or the lookup
        returns nothing.

        Returns the comma-joined results of the recursive calls.
        NOTE(review): no cid is ever put into the joined list itself, so as
        written the result appears to always be an empty string -- confirm
        whether callers expect a real path here.
        """
        itemcats = get_taobao_itemcats(get_top(), cid)
        if not itemcats:
            return ""

        # Only the first returned category entry is used.
        category = itemcats['item_cats']['item_cat'][0]
        name = category['name']
        parent_cid = category['parent_cid']

        self.saveCategory(cid, parent_cid, name)

        if parent_cid == 0:
            return ""
        # Note: this recurses, walking up one parent per call.
        return ",".join([self.buildPath(parent_cid)])
Ejemplo n.º 5
0
    def buildPath(self, cid):
        """Save category ``cid`` and, recursively, each of its ancestors.

        Looks the category up with ``get_taobao_itemcats``, stores it through
        ``self.saveCategory(cid, parent_cid, name)`` and repeats the same for
        the parent until a category reports ``parent_cid == 0`` or the lookup
        returns nothing.

        Returns the comma-joined results of the recursive calls.
        NOTE(review): no cid is ever put into the joined list itself, so as
        written the result appears to always be an empty string -- confirm
        whether callers expect a real path here.
        """
        itemcats = get_taobao_itemcats(get_top(), cid)
        if not itemcats:
            return ""

        # Only the first returned category entry is used.
        category = itemcats['item_cats']['item_cat'][0]
        name = category['name']
        parent_cid = category['parent_cid']

        self.saveCategory(cid, parent_cid, name)

        if parent_cid == 0:
            return ""
        # Note: this recurses, walking up one parent per call.
        return ",".join([self.buildPath(parent_cid)])
Ejemplo n.º 6
0
def doCrawl(shop_id, numids_set):
    """Fetch the Taobao responses for a collection of item num_ids.

    Per the original author's note, ``get_taobao_items`` wants its second
    argument shaped like a database result, so each num_id is wrapped in a
    ``(shop_id, num_id)`` tuple purely to satisfy that; only index 1 is read
    when the ids are joined for the request.

    Returns a list holding a plain ``dict`` copy of the response for every
    item whose response is truthy.
    """
    num_iids = [(shop_id, num_id) for num_id in numids_set]

    results = get_taobao_items(
        get_top(), num_iids,
        fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]))

    return_item_list = []
    for r in results:
        for iid, item in r.items.items():
            if item['resp']:
                return_item_list.append(dict(item['resp']))
    return return_item_list
Ejemplo n.º 7
0
def doCrawl(shop_id, numids_set):
    """Fetch the Taobao responses for a collection of item num_ids.

    Per the original author's note, ``get_taobao_items`` wants its second
    argument shaped like a database result, so each num_id is wrapped in a
    ``(shop_id, num_id)`` tuple purely to satisfy that; only index 1 is read
    when the ids are joined for the request.

    Returns a list holding a plain ``dict`` copy of the response for every
    item whose response is truthy.
    """
    num_iids = [(shop_id, num_id) for num_id in numids_set]

    results = get_taobao_items(
        get_top(), num_iids,
        fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]))

    return_item_list = []
    for r in results:
        for iid, item in r.items.items():
            if item['resp']:
                return_item_list.append(dict(item['resp']))
    return return_item_list
def crawl_main():
    """Check the hottest items against Taobao for shelf status and price.

    Reads up to ``FLAGS.limit`` rows (id, num_id, price, pic_url, volume)
    joined from ``item_hotest``, ``item`` and ``shop`` on the read engine and
    passes them to ``get_taobao_items``.  For every item with a truthy
    response:

    * if ``approve_status`` is not ``'onsale'`` the item row is set to
      ``status=2`` on the write engine;
    * otherwise the stored price is compared with the response price, and a
      difference of more than 20% (relative) or more than 2.0 (absolute) is
      counted and, when ``FLAGS.commit_price`` is set, written back.

    An exception raised while handling one item is logged and the loop
    continues.  A summary line is logged at the end.
    """
    write_db, read_db = get_db_engines(**{'dbconnstrs' : FLAGS.xdbconnstrs})

    sql = "select item.id, item.num_id, item.price, item.pic_url, item.volume from item_hotest, item, shop where item_hotest.item_id = item.id and item.status = 1 and item.shop_id = shop.id and shop.type <= 2 and shop.status=1 limit %s" % FLAGS.limit

    rows = read_db.execute(sql)

    counter = 0
    off_counter = 0
    change_counter = 0
    # NOTE(review): nothing in this function increments this counter, so the
    # summary always reports 0 volume changes.
    vol_change_counter = 0
    total = rows.rowcount
    results = get_taobao_items(get_top(), rows, fn_join_iids=lambda x:','.join([str(i[1]) for i in x]), calllimit=300)
    for batch_item in results:
        for iid, item in batch_item.items.items():
            try:
                counter += 1
                item_id = item['req'][0]
                item_iid = item['req'][1]
                item_price = item['req'][2]
                if item['resp']:
                    if item['resp']['approve_status'] != 'onsale':
                        logger.debug("Item %s/%s %s %s is offshelf", counter, total, item_id, item_iid)
                        off_counter += 1
                        write_db.execute("update item set status=2, modified=now()  where id=%s" % item_id)
                    else:
                        price = float(item['resp']['price'])
                        price_diff = abs(item_price - price)
                        # The small epsilon keeps the relative check from
                        # dividing by zero when the stored price is 0.
                        if price_diff / (item_price + 0.0000001) > 0.2 or price_diff > 2.0:
                            change_counter += 1
                            logger.debug("Item %s/%s %s %s price %s -> %s", counter, total, item_id, item_iid, item_price, price)
                            if FLAGS.commit_price:
                                write_db.execute("update item set price=%s where id=%s" % (price, item_id))
                logger.debug("req %s resp %s", item['req'], item['resp'])
            except Exception:
                # Best effort per item; KeyboardInterrupt/SystemExit still
                # propagate so the job can be stopped.
                logger.error("update failed %s", traceback.format_exc())
    logger.info("Taobao quickupdate, total %s, off %s, price change %s, volume change %s", total, off_counter, change_counter, vol_change_counter)
Ejemplo n.º 9
0
def main():
    """Download Taobaoke report records into the database and/or a CSV file.

    For every date produced by ``dates()`` (wrapped in ``waitlimit``) the
    report is requested page by page, 100 records per call, until a call
    returns nothing or fewer records than the page size.  Unless
    ``FLAGS.dryrun`` is set, a record whose ``trade_id`` already exists in
    ``taobao_report`` is skipped and every other record is inserted.  With
    ``FLAGS.csv`` set, each processed record is also written to
    ``FLAGS.csv_filename``.

    Exceptions derived from ``Exception`` are logged and not propagated: a
    failing record does not abort its page and a failing date does not abort
    the run.
    """
    if FLAGS.sessionid == "":
        logger.error(
            "Get SESSION from http://container.api.taobao.com/container?appkey=12525923"
        )
    db = None
    csv_w = None
    if not FLAGS.dryrun:
        db = get_db_engine()

    # Values are passed as bound parameters (same calling style as the other
    # db.execute(sql, *args) calls in this file) so quotes or '%' inside
    # titles and nicks coming from the API cannot break the statement.
    check_sql = """select outer_code, commission_rate, item_title, seller_nick, num_iid,
        shop_title, app_key, commission, trade_id, pay_time, item_num,
        category_id, pay_price, real_pay_fee, category_name, create_time,
        confirm_time, status from taobao_report
        where trade_id=%s"""
    insert_sql = """insert into taobao_report (outer_code, commission_rate, item_title, seller_nick,
        num_iid, shop_title, app_key, commission, trade_id, pay_time, item_num,
        category_id, pay_price, real_pay_fee, category_name) values (
        %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
        )"""

    # "wb" is the mode the csv module expects on Python 2.
    csv_file = open(FLAGS.csv_filename, "wb") if FLAGS.csv else None
    try:
        if csv_file is not None:
            csv_w = csv.writer(csv_file,
                               delimiter=FLAGS.csv_split,
                               quotechar=FLAGS.csv_quote,
                               quoting=csv.QUOTE_NONNUMERIC)
            csv_w.writerow([
                "report_date", "outer_code", "commission_rate", "item_title",
                "seller_nick", "num_iid", "shop_title", "app_key", "commission",
                "trade_id", "pay_time", "item_num", "category_id", "pay_price",
                "real_pay_fee", "category_name"
            ])
        for d in waitlimit(FLAGS.limit, 60.0, dates()):
            logger.info("Fetching %s %s", d, FLAGS.sessionid)
            try:
                pageno = 1
                total = 100
                result_len = 100
                got = 0
                # A short page means the last page has been reached.
                while result_len >= total:
                    report = get_report(get_top(), d, FLAGS.sessionid, pageno,
                                        total)
                    if not report:
                        logger.info("result %s %s null", d, pageno)
                        break
                    logger.info("result %s %s", d, pageno)
                    pageno += 1
                    members = report['taobaoke_report'][
                        'taobaoke_report_members']['taobaoke_report_member']
                    result_len = len(members)
                    got += result_len
                    logger.info("result %s %s -> %s %s", d, pageno, got,
                                result_len)
                    for m in members:
                        try:
                            # The duplicate check needs a database, so it is
                            # skipped in dry-run mode (db is None there).
                            if db:
                                result = list(
                                    db.execute(check_sql, m['trade_id']))
                                if result:
                                    if result[0][0] == m.get(
                                            'outer_code',
                                            '') and result[0][4] == m['num_iid']:
                                        logger.debug(
                                            "already exists in db, skip %s vs %s",
                                            result[0], m)
                                    else:
                                        logger.warn(
                                            "same trade id, something wrong! %s %s"
                                            % (m, result))
                                    continue
                            # Same order as the insert columns and as the CSV
                            # columns that follow report_date.
                            values = [
                                m.get('outer_code', ''),
                                m['commission_rate'],
                                m['item_title'],
                                m['seller_nick'],
                                m['num_iid'],
                                m['shop_title'],
                                m['app_key'],
                                m['commission'],
                                m['trade_id'],
                                m['pay_time'],
                                m['item_num'],
                                m['category_id'],
                                m['pay_price'],
                                m['real_pay_fee'],
                                m.get('category_name', ''),
                            ]
                            logger.debug("%s %s", insert_sql, values)
                            if db:
                                try:
                                    db.execute(insert_sql, *values)
                                except Exception:
                                    logger.warn(
                                        "insert failed sql %s %s --> err %s",
                                        insert_sql, values,
                                        traceback.format_exc())
                            if csv_w:
                                writecsv(csv_w, [d] + values)
                        except Exception:
                            logger.error("Got error %s %s", m,
                                         traceback.format_exc())
            except Exception:
                logger.error("Got fatal error %s %s", d, traceback.format_exc())
    finally:
        # Make sure buffered CSV rows reach the disk and the handle is freed.
        if csv_file is not None:
            csv_file.close()
Ejemplo n.º 10
0
def main():
    """Download Taobaoke payment records into the database and/or a CSV file.

    For every date produced by ``dates()`` (wrapped in ``waitlimit``) the
    report is requested page by page, 100 records per call, until a call
    returns nothing or fewer records than the page size.  Unless
    ``FLAGS.dryrun`` is set, a record whose ``trade_id`` already exists in
    ``taobao_report`` is skipped and every other record is inserted with
    ``create_time`` set to ``now()``.  With ``FLAGS.csv`` set, each processed
    record is also written to ``FLAGS.csv_filename``.

    Exceptions derived from ``Exception`` are logged and not propagated: a
    failing record does not abort its page and a failing date does not abort
    the run.
    """
    if FLAGS.sessionid == "":
        logger.error("Get SESSION from http://container.api.taobao.com/container?appkey=12525923")
    db = None
    csv_w = None
    if not FLAGS.dryrun:
        db = get_db_engine()

    # Values are passed as bound parameters (same calling style as the other
    # db.execute(sql, *args) calls in this file) so quotes or '%' inside
    # titles and nicks coming from the API cannot break the statement.
    check_sql = """select outer_code, commission_rate, item_title, seller_nick, num_iid,
        shop_title, app_key, commission, trade_id, pay_time, item_num,
        category_id, pay_price, real_pay_fee, category_name, create_time,
        confirm_time, status from taobao_report
        where trade_id=%s"""
    insert_sql = """insert into taobao_report (outer_code, commission_rate, item_title, seller_nick,
        num_iid, shop_title, app_key, commission, trade_id, pay_time, item_num,
        category_id, pay_price, real_pay_fee, category_name, create_time) values (
        %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, now()
        )"""

    # "wb" is the mode the csv module expects on Python 2.
    csv_file = open(FLAGS.csv_filename, "wb") if FLAGS.csv else None
    try:
        if csv_file is not None:
            csv_w = csv.writer(csv_file, delimiter=FLAGS.csv_split,
                quotechar=FLAGS.csv_quote, quoting=csv.QUOTE_NONNUMERIC)
            csv_w.writerow(["report_date", "outer_code", "commission_rate", "item_title", "seller_nick", "num_iid",
                            "shop_title", "app_key", "commission", "trade_id", "pay_time", "item_num",
                            "category_id", "pay_price", "real_pay_fee", "category_name"])
        for d in waitlimit(FLAGS.limit, 60.0, dates()):
            logger.info("Fetching %s %s", d, FLAGS.sessionid)
            try:
                pageno = 1
                total = 100
                result_len = 100
                got = 0
                # A short page means the last page has been reached.
                while result_len >= total:
                    report = get_report(get_top(), d, FLAGS.sessionid, pageno, total)
                    if not report:
                        logger.info("result %s %s null", d, pageno)
                        break
                    logger.info("result %s %s", d, pageno)
                    members = report['taobaoke_payments']['taobaoke_payment']
                    result_len = len(members)
                    got += result_len
                    logger.info("result %s %s -> %s %s", d, pageno, got, result_len)
                    for m in members:
                        try:
                            # The duplicate check needs a database, so it is
                            # skipped in dry-run mode (db is None there).
                            if db:
                                result = list(db.execute(check_sql, m['trade_id']))
                                if result:
                                    if result[0][0] == m.get('outer_code', '') and result[0][4] == m['num_iid']:
                                        logger.debug("already exists in db, skip %s vs %s", result[0], m)
                                    else:
                                        logger.warn("same trade id, something wrong! %s %s" % (m, result))
                                    continue
                            # Same order as the insert columns and as the CSV
                            # columns that follow report_date.
                            values = [
                                m.get('outer_code', ''),
                                m['commission_rate'],
                                m['item_title'],
                                m['seller_nick'],
                                m['num_iid'],
                                m['shop_title'],
                                m['app_key'],
                                m['commission'],
                                m['trade_id'],
                                m['pay_time'],
                                m['item_num'],
                                m['category_id'],
                                m['pay_price'],
                                m['real_pay_fee'],
                                m.get('category_name', ''),
                            ]
                            logger.debug("%s %s", insert_sql, values)
                            if db:
                                try:
                                    db.execute(insert_sql, *values)
                                except Exception:
                                    logger.warn("insert failed sql %s %s --> err %s", insert_sql, values, traceback.format_exc())
                            if csv_w:
                                writecsv(csv_w, [d] + values)
                        except Exception:
                            logger.error("Got error %s %s", m, traceback.format_exc())
                    pageno += 1
            except Exception:
                logger.error("Got fatal error %s %s", d, traceback.format_exc())
    finally:
        # Make sure buffered CSV rows reach the disk and the handle is freed.
        if csv_file is not None:
            csv_file.close()