def update_vip_shop(shop_id, db=None):
    """Convert VIP-shop item URLs to taobaoke click URLs (item_re table).

    Selects active items for shop_id (or all FLAGS.vipshopids when shop_id
    is falsy), runs them through convert_taobaoke_widget, normalizes the
    spm tracking parameter, and upserts the converted click URL into
    item_re.  A second pass inserts a plain fallback taobao URL for items
    that still have no item_re row.

    shop_id -- shop primary key; falsy means "all configured VIP shops".
    db      -- optional DB engine; defaults to get_db_engine().

    NOTE(review): all SQL here is built by string interpolation -- assumes
    ids/urls are trusted internal data (injection-prone otherwise); confirm.
    """
    if not db:
        db = get_db_engine()

    # Optional "limit N" clause taken from command-line flags.
    limitsql = ""
    if FLAGS.limit:
        limitsql += " limit " + str(FLAGS.limit)

    # WHERE fragment: one explicit shop, or the configured VIP shop list.
    if shop_id:
        shop_str = " shop.id = %s and " % shop_id
    else:
        shop_str = " shop.id in (%s) and " % ','.join(map(str, FLAGS.vipshopids))

    # Restrict to rows created/modified in the last FLAGS.interval days.
    if FLAGS.interval > 0:
        from_date = datetime.datetime.strftime(datetime.datetime.now() - datetime.timedelta(FLAGS.interval), "%Y-%m-%d %H:%M:%S")
        shop_str += " (created > '%s' or modified > '%s') and " % (from_date, from_date)

    sql = "select item.id,item.num_id,shop.type,item.detail_url,item_re.detail_url from shop,item left join item_re on item.id=item_re.item_id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id" % shop_str
    results = db.connect().execute(sql + limitsql)

    total = results.rowcount
    logger.debug("Processing %s result %s", sql, total)
    if total == 0:
        logger.info("nothing to do with shop %s", shop_str)
        return

    pos = 0        # input rows consumed so far (for progress logging)
    converted = 0  # items successfully converted so far
    # Captures an existing "spm=<value>" query parameter for replacement.
    SPMRE = re.compile("^(.*spm=)([^&]*)(.*)$")
    # convert_taobaoke_widget yields (input_batch, jsonp_response_str) pairs.
    for input, outputstr in convert_taobaoke_widget(list(filter_tbk_items(results)), fn_join_iids=join_iids, calllimit=60, outer_code=None, appkey='21315963', appsec='549d623e612832df7720101f83f951b9'):
        if not outputstr:
            logger.debug("Converted failed %s null %s progress %s/%s/%s -> in %s" % (input, shop_id, converted, pos, total, len(input)))
            continue
        output = extract_json_from_jsonp(outputstr)
        pos += len(input)
        if not output:
            logger.debug("Converted failed empty %s %s progress %s/%s/%s -> in %s" % (input, shop_id, converted, pos, total, len(input)))
            continue
        if output['total_results'] == 0 or not output['taobaoke_items']:
            logger.debug("No output %s %s %s/%s/%s", input, shop_id, converted, pos, total)
            continue
        succ_len = len(output['taobaoke_items']['taobaoke_item'])
        logger.info("Converted shop %s progress %s/%s/%s -> in %s out %s %s" % (shop_id, converted, pos, total, len(input), output['total_results'], succ_len))
        converted += succ_len
        Statsd.update_stats('guang.taobaoapi.convert', delta=succ_len)
        try:
            # Map numeric taobao id back to our item primary key.
            numid2id = dict([(int(num_id), id) for id, num_id, shop_type, jn_url, re_url in input])
            for result in output['taobaoke_items']['taobaoke_item']:
                isql = ""
                try:
                    num_iid = result['num_iid']
                    click_url = result['click_url'] + "&u=re_UCTRAC_CLK_&unid=re_UCTRAC_CLK_"
                    # conver spm to xtao
                    # Replace any existing spm value, else append our own.
                    if click_url.find('spm=') > 0:
                        click_url = SPMRE.subn(r'\g<1>2014.21315963.1.0\g<3>', click_url)[0]
                    else:
                        click_url += '&spm=2014.21315963.1.0'
                    id = numid2id[num_iid]
                    isql = "insert into item_re (item_id, detail_url) values (%s, '%s') on duplicate key update detail_url='%s'" % (id, click_url, click_url)
                    logger.debug("process %s %s/%s -> %s", shop_id, pos, total, isql)
                    if not FLAGS.dryrun:
                        # Escape '%' so the driver does not treat it as a bind marker.
                        db.execute(isql.replace('%', '%%'))
                except KeyboardInterrupt:
                    raise
                except Exception, e:
                    # Per-item failures are logged and skipped.
                    logger.debug("in %s out %s" % (numid2id, result))
                    logger.warn("convert failed %s %s" % (isql, traceback.format_exc()))
        except KeyboardInterrupt:
            raise
        except:
            # Catch-all keeps one bad batch from aborting the whole run.
            logger.warn("process failed %s %s reason %s" % (input, output, traceback.format_exc()))
    logger.info("Convert result %s - %s", converted, total)

    # retry sql
    # Second pass: items still lacking an item_re row get a plain taobao
    # URL so downstream code always finds a detail_url.
    # NOTE(review): unlike the loop above, this insert ignores FLAGS.dryrun
    # and does not escape '%' -- confirm that is intentional.
    results = db.connect().execute(sql + limitsql)
    for row in filter_tbk_items(results):
        id, num_id, shop_type, jn_url, re_url = row
        if not re_url:
            sql = "insert into item_re (item_id, detail_url) values (%s, '%s')" % (id, 'http://item.taobao.com/item.htm?id=%s&spm=2014.21315963.1.0' % num_id)
            db.execute(sql)
def crawl_item2(kwargs):
    """Re-crawl images for one item from its stored crawl_html row.

    Parses the saved page HTML (and desc_content) to collect thumbnail and
    description image URLs, then hands them to ItemCrawler to download and
    (optionally) commit.  Emits Statsd counters for success/failure.

    kwargs -- dict with keys: item (row: [0]=item_id, [1]=num_id),
              is_commit, crawl_path, server_path, org_server_path,
              is_remove, statshost, statsport, plus DB engine options
              forwarded to get_db_engine().
    Returns a 1-tuple ((item_id, summary_dict),) on the early-exit path;
    otherwise falls off the end after logging (returns None).
    """
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']

    item_id = item[0]
    num_id = item[1]
    is_success = False
    # Default (all-zero) summary used if crawling never ran.
    crawl_result = ((item_id, {'suc1': 0, 'count1': 0, 'suc': 0, 'count': 0}),)
    try:
        conn = get_db_engine(**kwargs).connect()
        try:
            items = conn.execute("select html, desc_content from crawl_html where crawl_html.item_id=%s;" % item_id)
            result = list(items)
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1] 
                html_obj = parse_html(html)
                # Primary thumbnails; fall back to li/@style URLs, then to
                # taobao's lazy-load @data-src attribute.
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
                    # taobao @src to @data-src
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")

                if len(thumbImages) == 0:
                    logger.error("crawl item %s %s not found thumb images html size %s", item_id, num_id, len(html), extra={'tags':['crawl_thumb_empty',]})
                    return crawl_result

                # Strips the "var desc='...';" JS wrapper around desc HTML.
                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M|re.S)
                # Strips the _WxH.jpg thumbnail suffix to get the original image.
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                tr_new = re.compile("(.+\.(jpg|png|gif))[^$]*.jpg$")
                # NOTE(review): chained assignment -- all three names start
                # bound to the SAME empty list; harmless since each is only
                # rebound below, never mutated in place.
                desc_thumbs = desc_table_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        desc_table_thumbs = desc_html_obj.xpath("//table/@background")
                        desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!", item_id, num_id, extra={'tags':['crawl_nodesc',]})

                # Build (url, position, kind) triples:
                # kind 1 = main thumb, 2 = description image, 3 = lazy-loaded.
                images = []
                pos = 1
                for url in thumbImages:
                    ori_url = None
                    if tr.match(url):
                        ori_url = tr.sub(r'\1', url)
                    else:
                        if tr_new.match(url):
                            ori_url = tr_new.sub(r'\1', url)
                        else:
                            logger.error("crawl item %s %s thumb image urls can not be parsed!", item_id, num_id, extra={'tags':['crawl_exception',]})

                    # NOTE(review): if neither regex matched, ori_url is still
                    # None here and (None, pos, 1) is appended -- confirm
                    # ItemCrawler tolerates None URLs.
                    images.append((ori_url, pos, 1))
                    pos += 1
                for url in desc_table_thumbs:
                    images.append((url, pos, 2))
                    pos += 1
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                        pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                        pos += 1

                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path, server_path, org_server_path, kwargs['statshost'], kwargs['statsport'])
                # (710, 10000) appears to be a (width, height) constraint -- TODO confirm.
                item_crawler.crawl(images, ((710,10000),), is_commit, conn, is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary),)
        except Exception, e:
            logger.error("crawl item %s %s got exception %s", item_id, num_id, traceback.format_exc(), extra={'tags':['crawl_exception',]})
        finally:
            conn.close()
        Statsd.update_stats("guang.crawl.downimgcount", crawl_result[0][1]['suc1'] + crawl_result[0][1]['suc'],
            host = kwargs['statshost'], port = kwargs['statsport'])
        if is_success:
            logger.info("crawl item %s %s success %s", item_id, num_id, crawl_result)
            Statsd.increment('guang.crawl.itemimg.succ', host = kwargs['statshost'], port = kwargs['statsport'])
        else:
            logger.warn("crawl item %s %s failed %s", item_id, num_id, crawl_result, extra={'tags':['crawl_failed',]})
            Statsd.increment('guang.crawl.itemimg.failed', host = kwargs['statshost'], port = kwargs['statsport'])
def update_shop(shop_id, db):
    """Convert a shop's item detail URLs to taobaoke click URLs in place.

    Selects active items (optionally only those not already converted,
    unless FLAGS.force), runs them through convert_taobaoke_widget,
    normalizes the spm parameter, and writes the click URL back to
    item.detail_url.  Failed conversions are tallied per item in
    tbk_item_convert; successful ones clear that row.

    shop_id -- shop primary key; falsy selects all qualifying shops.
    db      -- DB engine, or falsy to use get_db_engine().

    NOTE(review): SQL is assembled via string interpolation throughout --
    assumes trusted internal inputs; confirm.
    """
    if not db:
        db = get_db_engine()

    # Per-shop taobaoke pid, defaulting to the global flag.
    # NOTE(review): tbk_pid is computed but never used in this function.
    tbk = list(db.execute("select * from tbk where shop_id=%s" % shop_id))
    if tbk:
        tbk_pid = str(tbk[0][1])
    else:
        tbk_pid = FLAGS.pid

    # Optional "limit N" clause from command-line flags.
    limitsql = ""
    if FLAGS.limit:
        limitsql += " limit " + str(FLAGS.limit)

    if shop_id:
        shop_str = " shop.id = %s and " % shop_id
    else:
        shop_str = ""

    # Restrict to rows created/modified in the last FLAGS.interval days.
    if FLAGS.interval > 0:
        from_date = datetime.datetime.strftime(datetime.datetime.now() - datetime.timedelta(FLAGS.interval), "%Y-%m-%d %H:%M:%S")
        shop_str += " (created > '%s' or modified > '%s') and " % (from_date, from_date)

    # Without --force, skip items whose detail_url already points at
    # *.click.taobao.com (i.e. already converted).
    if not FLAGS.force:
        sql = "select item.id,item.num_id,tbk_item_convert.failed_count,tbk_item_convert.last_time from shop,item left join tbk_item_convert on tbk_item_convert.item_id=item.id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id and item.detail_url not like '%%%%s.click.taobao.com%%%%'" % shop_str
        results = db.connect().execute(sql + limitsql)
    else:
        sql = "select item.id,item.num_id,tbk_item_convert.failed_count,tbk_item_convert.last_time from shop,item left join tbk_item_convert on tbk_item_convert.item_id=item.id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id" % shop_str
        results = db.connect().execute(sql + limitsql)

    total = results.rowcount
    if total == 0:
        logger.info("nothing to do with shop %s", shop_id)
        return

    pos = 0        # input rows consumed so far (for progress logging)
    converted = 0  # items successfully converted so far
    # Captures an existing "spm=<value>" query parameter for replacement.
    SPMRE = re.compile("^(.*spm=)([^&]*)(.*)$")
    # convert_taobaoke_widget yields (input_batch, jsonp_response_str) pairs.
    for input, outputstr in convert_taobaoke_widget(list(filter_retry_items(results)), fn_join_iids=join_iids, calllimit=60, outer_code=None):
        if not outputstr:
            logger.debug("Converted failed null %s progress %s/%s/%s -> in %s" % (shop_id, converted, pos, total, len(input)))
            continue
        output = extract_json_from_jsonp(outputstr)
        pos += len(input)
        if not output:
            logger.debug("Converted failed empty %s progress %s/%s/%s -> in %s" % (shop_id, converted, pos, total, len(input)))
            continue
        if output['total_results'] == 0 or not output['taobaoke_items']:
            logger.debug("No output %s %s/%s/%s", shop_id, converted, pos, total)
            # Record one more failed attempt for every item in the batch.
            for row in input:
                if not FLAGS.dryrun:
                    db.execute("insert into tbk_item_convert(item_id, failed_count, last_time) values(%s, 1, now()) on duplicate key update failed_count=failed_count+1, last_time=now()" % row[0])
            continue
        succ_len = len(output['taobaoke_items']['taobaoke_item'])
        logger.info("Converted shop %s progress %s/%s/%s -> in %s out %s %s" % (shop_id, converted, pos, total, len(input), output['total_results'], succ_len))
        converted += succ_len
        Statsd.update_stats('guang.taobaoapi.convert', delta=succ_len)
        try:
            # Map numeric taobao id back to our item primary key.
            numid2id = dict([(int(num_id), id) for id, num_id, failed_count, last_time in input])
            for result in output['taobaoke_items']['taobaoke_item']:
                sql = ""
                try:
                    num_iid = result['num_iid']
                    click_url = result['click_url'] + "&u=jn_UCTRAC_CLK_&unid=jn_UCTRAC_CLK_"
                    # conver spm to xtao
                    # Replace any existing spm value, else append our own.
                    if click_url.find('spm=') > 0:
                        click_url = SPMRE.subn(r'\g<1>2014.12669715.1.0\g<3>', click_url)[0]
                    else:
                        click_url += '&spm=2014.12669715.1.0'
                    id = numid2id[num_iid]
                    sql = "update item set detail_url='%s' where id=%s" % (click_url, id)
                    logger.debug("process %s %s/%s -> %s", shop_id, pos, total, sql)
                    if not FLAGS.dryrun:
                        # Escape '%' so the driver does not treat it as a bind marker.
                        db.execute(sql.replace('%', '%%'))
                        # Success: clear the failure-tracking row.
                        db.execute("delete from tbk_item_convert where item_id=%s" % id)
                except KeyboardInterrupt:
                    raise
                except Exception, e:
                    # Per-item failures are logged and skipped.
                    logger.debug("in %s out %s" % (numid2id, result))
                    logger.warn("convert failed %s %s" % (sql, traceback.format_exc()))
        except KeyboardInterrupt:
            raise
        except:
            # Catch-all keeps one bad batch from aborting the whole run.
            logger.warn("process failed %s %s reason %s" % (input, output, traceback.format_exc()))
    logger.info("Convert result %s - %s", converted, total)
Esempio n. 4
0
def update_shop(shop_id, db):
    """Convert a shop's item detail URLs to taobaoke click URLs in place.

    Selects active items (optionally only those not already converted,
    unless FLAGS.force), runs them through convert_taobaoke_widget,
    normalizes the spm parameter, and writes the click URL back to
    item.detail_url.  Failed conversions are tallied per item in
    tbk_item_convert; successful ones clear that row.

    shop_id -- shop primary key; falsy selects all qualifying shops.
    db      -- DB engine, or falsy to use get_db_engine().

    NOTE(review): SQL is assembled via string interpolation throughout --
    assumes trusted internal inputs; confirm.
    """
    if not db:
        db = get_db_engine()

    # Per-shop taobaoke pid, defaulting to the global flag.
    # NOTE(review): tbk_pid is computed but never used in this function.
    tbk = list(db.execute("select * from tbk where shop_id=%s" % shop_id))
    if tbk:
        tbk_pid = str(tbk[0][1])
    else:
        tbk_pid = FLAGS.pid

    # Optional "limit N" clause from command-line flags.
    limitsql = ""
    if FLAGS.limit:
        limitsql += " limit " + str(FLAGS.limit)

    if shop_id:
        shop_str = " shop.id = %s and " % shop_id
    else:
        shop_str = ""

    # Restrict to rows created/modified in the last FLAGS.interval days.
    if FLAGS.interval > 0:
        from_date = datetime.datetime.strftime(
            datetime.datetime.now() - datetime.timedelta(FLAGS.interval),
            "%Y-%m-%d %H:%M:%S")
        shop_str += " (created > '%s' or modified > '%s') and " % (from_date,
                                                                   from_date)

    # Without --force, skip items whose detail_url already points at
    # *.click.taobao.com (i.e. already converted).
    if not FLAGS.force:
        sql = "select item.id,item.num_id,tbk_item_convert.failed_count,tbk_item_convert.last_time from shop,item left join tbk_item_convert on tbk_item_convert.item_id=item.id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id and item.detail_url not like '%%%%s.click.taobao.com%%%%'" % shop_str
        results = db.connect().execute(sql + limitsql)
    else:
        sql = "select item.id,item.num_id,tbk_item_convert.failed_count,tbk_item_convert.last_time from shop,item left join tbk_item_convert on tbk_item_convert.item_id=item.id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id" % shop_str
        results = db.connect().execute(sql + limitsql)

    total = results.rowcount
    if total == 0:
        logger.info("nothing to do with shop %s", shop_id)
        return

    pos = 0        # input rows consumed so far (for progress logging)
    converted = 0  # items successfully converted so far
    # Captures an existing "spm=<value>" query parameter for replacement.
    SPMRE = re.compile("^(.*spm=)([^&]*)(.*)$")
    # convert_taobaoke_widget yields (input_batch, jsonp_response_str) pairs.
    for input, outputstr in convert_taobaoke_widget(list(
            filter_retry_items(results)),
                                                    fn_join_iids=join_iids,
                                                    calllimit=60,
                                                    outer_code=None):
        if not outputstr:
            logger.debug(
                "Converted failed null %s progress %s/%s/%s -> in %s" %
                (shop_id, converted, pos, total, len(input)))
            continue
        output = extract_json_from_jsonp(outputstr)
        pos += len(input)
        if not output:
            logger.debug(
                "Converted failed empty %s progress %s/%s/%s -> in %s" %
                (shop_id, converted, pos, total, len(input)))
            continue
        if output['total_results'] == 0 or not output['taobaoke_items']:
            logger.debug("No output %s %s/%s/%s", shop_id, converted, pos,
                         total)
            # Record one more failed attempt for every item in the batch.
            for row in input:
                if not FLAGS.dryrun:
                    db.execute(
                        "insert into tbk_item_convert(item_id, failed_count, last_time) values(%s, 1, now()) on duplicate key update failed_count=failed_count+1, last_time=now()"
                        % row[0])
            continue
        succ_len = len(output['taobaoke_items']['taobaoke_item'])
        logger.info("Converted shop %s progress %s/%s/%s -> in %s out %s %s" %
                    (shop_id, converted, pos, total, len(input),
                     output['total_results'], succ_len))
        converted += succ_len
        Statsd.update_stats('guang.taobaoapi.convert', delta=succ_len)
        try:
            # Map numeric taobao id back to our item primary key.
            numid2id = dict([(int(num_id), id)
                             for id, num_id, failed_count, last_time in input])
            for result in output['taobaoke_items']['taobaoke_item']:
                sql = ""
                try:
                    num_iid = result['num_iid']
                    click_url = result[
                        'click_url'] + "&u=jn_UCTRAC_CLK_&unid=jn_UCTRAC_CLK_"
                    # conver spm to xtao
                    # Replace any existing spm value, else append our own.
                    if click_url.find('spm=') > 0:
                        click_url = SPMRE.subn(r'\g<1>2014.12669715.1.0\g<3>',
                                               click_url)[0]
                    else:
                        click_url += '&spm=2014.12669715.1.0'
                    id = numid2id[num_iid]
                    sql = "update item set detail_url='%s' where id=%s" % (
                        click_url, id)
                    logger.debug("process %s %s/%s -> %s", shop_id, pos, total,
                                 sql)
                    if not FLAGS.dryrun:
                        # Escape '%' so the driver does not treat it as a bind marker.
                        db.execute(sql.replace('%', '%%'))
                        # Success: clear the failure-tracking row.
                        db.execute(
                            "delete from tbk_item_convert where item_id=%s" %
                            id)
                except KeyboardInterrupt:
                    raise
                except Exception, e:
                    # Per-item failures are logged and skipped.
                    logger.debug("in %s out %s" % (numid2id, result))
                    logger.warn("convert failed %s %s" %
                                (sql, traceback.format_exc()))
        except KeyboardInterrupt:
            raise
        except:
            # Catch-all keeps one bad batch from aborting the whole run.
            logger.warn("process failed %s %s reason %s" %
                        (input, output, traceback.format_exc()))
    logger.info("Convert result %s - %s", converted, total)
Esempio n. 5
0
def update_vip_shop(shop_id, db=None):
    """Convert VIP-shop item URLs to taobaoke click URLs (item_re table).

    Selects active items for shop_id (or all FLAGS.vipshopids when shop_id
    is falsy), runs them through convert_taobaoke_widget, normalizes the
    spm tracking parameter, and upserts the converted click URL into
    item_re.  A second pass inserts a plain fallback taobao URL for items
    that still have no item_re row.

    shop_id -- shop primary key; falsy means "all configured VIP shops".
    db      -- optional DB engine; defaults to get_db_engine().

    NOTE(review): all SQL here is built by string interpolation -- assumes
    ids/urls are trusted internal data (injection-prone otherwise); confirm.
    """
    if not db:
        db = get_db_engine()

    # Optional "limit N" clause taken from command-line flags.
    limitsql = ""
    if FLAGS.limit:
        limitsql += " limit " + str(FLAGS.limit)

    # WHERE fragment: one explicit shop, or the configured VIP shop list.
    if shop_id:
        shop_str = " shop.id = %s and " % shop_id
    else:
        shop_str = " shop.id in (%s) and " % ','.join(
            map(str, FLAGS.vipshopids))

    # Restrict to rows created/modified in the last FLAGS.interval days.
    if FLAGS.interval > 0:
        from_date = datetime.datetime.strftime(
            datetime.datetime.now() - datetime.timedelta(FLAGS.interval),
            "%Y-%m-%d %H:%M:%S")
        shop_str += " (created > '%s' or modified > '%s') and " % (from_date,
                                                                   from_date)

    sql = "select item.id,item.num_id,shop.type,item.detail_url,item_re.detail_url from shop,item left join item_re on item.id=item_re.item_id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id" % shop_str
    results = db.connect().execute(sql + limitsql)

    total = results.rowcount
    logger.debug("Processing %s result %s", sql, total)
    if total == 0:
        logger.info("nothing to do with shop %s", shop_str)
        return

    pos = 0        # input rows consumed so far (for progress logging)
    converted = 0  # items successfully converted so far
    # Captures an existing "spm=<value>" query parameter for replacement.
    SPMRE = re.compile("^(.*spm=)([^&]*)(.*)$")
    # convert_taobaoke_widget yields (input_batch, jsonp_response_str) pairs.
    for input, outputstr in convert_taobaoke_widget(
            list(filter_tbk_items(results)),
            fn_join_iids=join_iids,
            calllimit=60,
            outer_code=None,
            appkey='21315963',
            appsec='549d623e612832df7720101f83f951b9'):
        if not outputstr:
            logger.debug(
                "Converted failed %s null %s progress %s/%s/%s -> in %s" %
                (input, shop_id, converted, pos, total, len(input)))
            continue
        output = extract_json_from_jsonp(outputstr)
        pos += len(input)
        if not output:
            logger.debug(
                "Converted failed empty %s %s progress %s/%s/%s -> in %s" %
                (input, shop_id, converted, pos, total, len(input)))
            continue
        if output['total_results'] == 0 or not output['taobaoke_items']:
            logger.debug("No output %s %s %s/%s/%s", input, shop_id, converted,
                         pos, total)
            continue
        succ_len = len(output['taobaoke_items']['taobaoke_item'])
        logger.info("Converted shop %s progress %s/%s/%s -> in %s out %s %s" %
                    (shop_id, converted, pos, total, len(input),
                     output['total_results'], succ_len))
        converted += succ_len
        Statsd.update_stats('guang.taobaoapi.convert', delta=succ_len)
        try:
            # Map numeric taobao id back to our item primary key.
            numid2id = dict([(int(num_id), id)
                             for id, num_id, shop_type, jn_url, re_url in input
                             ])
            for result in output['taobaoke_items']['taobaoke_item']:
                isql = ""
                try:
                    num_iid = result['num_iid']
                    click_url = result[
                        'click_url'] + "&u=re_UCTRAC_CLK_&unid=re_UCTRAC_CLK_"
                    # conver spm to xtao
                    # Replace any existing spm value, else append our own.
                    if click_url.find('spm=') > 0:
                        click_url = SPMRE.subn(r'\g<1>2014.21315963.1.0\g<3>',
                                               click_url)[0]
                    else:
                        click_url += '&spm=2014.21315963.1.0'
                    id = numid2id[num_iid]
                    isql = "insert into item_re (item_id, detail_url) values (%s, '%s') on duplicate key update detail_url='%s'" % (
                        id, click_url, click_url)
                    logger.debug("process %s %s/%s -> %s", shop_id, pos, total,
                                 isql)
                    if not FLAGS.dryrun:
                        # Escape '%' so the driver does not treat it as a bind marker.
                        db.execute(isql.replace('%', '%%'))
                except KeyboardInterrupt:
                    raise
                except Exception, e:
                    # Per-item failures are logged and skipped.
                    logger.debug("in %s out %s" % (numid2id, result))
                    logger.warn("convert failed %s %s" %
                                (isql, traceback.format_exc()))
        except KeyboardInterrupt:
            raise
        except:
            # Catch-all keeps one bad batch from aborting the whole run.
            logger.warn("process failed %s %s reason %s" %
                        (input, output, traceback.format_exc()))
    logger.info("Convert result %s - %s", converted, total)

    # retry sql
    # Second pass: items still lacking an item_re row get a plain taobao
    # URL so downstream code always finds a detail_url.
    # NOTE(review): unlike the loop above, this insert ignores FLAGS.dryrun
    # and does not escape '%' -- confirm that is intentional.
    results = db.connect().execute(sql + limitsql)
    for row in filter_tbk_items(results):
        id, num_id, shop_type, jn_url, re_url = row
        if not re_url:
            sql = "insert into item_re (item_id, detail_url) values (%s, '%s')" % (
                id,
                'http://item.taobao.com/item.htm?id=%s&spm=2014.21315963.1.0' %
                num_id)
            db.execute(sql)
Esempio n. 6
0
def crawl_item2(kwargs):
    """Re-crawl images for one item from its stored crawl_html row.

    Parses the saved page HTML (and desc_content) to collect thumbnail and
    description image URLs, then hands them to ItemCrawler to download and
    (optionally) commit.  Emits Statsd counters for success/failure.

    kwargs -- dict with keys: item (row: [0]=item_id, [1]=num_id),
              is_commit, crawl_path, server_path, org_server_path,
              is_remove, statshost, statsport, plus DB engine options
              forwarded to get_db_engine().
    Returns a 1-tuple ((item_id, summary_dict),) on the early-exit path;
    otherwise falls off the end after logging (returns None).
    """
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']

    item_id = item[0]
    num_id = item[1]
    is_success = False
    # Default (all-zero) summary used if crawling never ran.
    crawl_result = ((item_id, {
        'suc1': 0,
        'count1': 0,
        'suc': 0,
        'count': 0
    }), )
    try:
        conn = get_db_engine(**kwargs).connect()
        try:
            items = conn.execute(
                "select html, desc_content from crawl_html where crawl_html.item_id=%s;"
                % item_id)
            result = list(items)
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1]

                html_obj = parse_html(html)
                # Primary thumbnails; fall back to li/@style URLs, then to
                # taobao's lazy-load @data-src attribute.
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    thumbImages = [
                        IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in
                        html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")
                    ]
                    # taobao @src to @data-src
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath(
                            "//ul[@id='J_UlThumb']//img/@data-src")

                if len(thumbImages) == 0:
                    logger.error(
                        "crawl item %s %s not found thumb images html size %s",
                        item_id,
                        num_id,
                        len(html),
                        extra={'tags': [
                            'crawl_thumb_empty',
                        ]})
                    return crawl_result

                # Strips the "var desc='...';" JS wrapper around desc HTML.
                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M | re.S)
                # Strips the _WxH.jpg thumbnail suffix to get the original image.
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                # NOTE(review): chained assignment -- both names start bound
                # to the SAME empty list; harmless since each is only
                # rebound below, never mutated in place.
                desc_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        desc_thumbs = desc_html_obj.xpath(
                            "//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath(
                            "//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!",
                                item_id,
                                num_id,
                                extra={'tags': [
                                    'crawl_nodesc',
                                ]})

                # Build (url, position, kind) triples:
                # kind 1 = main thumb, 2 = description image, 3 = lazy-loaded.
                images = []
                pos = 1
                for url in thumbImages:
                    images.append((tr.sub(r'\1', url), pos, 1))
                    pos += 1
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                        pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                        pos += 1

                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path,
                                           server_path, org_server_path,
                                           kwargs['statshost'],
                                           kwargs['statsport'])
                # (710, 10000) appears to be a (width, height) constraint -- TODO confirm.
                item_crawler.crawl(images, ((710, 10000), ), is_commit, conn,
                                   is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary), )
        except Exception, e:
            logger.error("crawl item %s %s got exception %s",
                         item_id,
                         num_id,
                         traceback.format_exc(),
                         extra={'tags': [
                             'crawl_exception',
                         ]})
        finally:
            conn.close()
        Statsd.update_stats("guang.crawl.downimgcount",
                            crawl_result[0][1]['suc1'] +
                            crawl_result[0][1]['suc'],
                            host=kwargs['statshost'],
                            port=kwargs['statsport'])
        if is_success:
            logger.info("crawl item %s %s success %s", item_id, num_id,
                        crawl_result)
            Statsd.increment('guang.crawl.itemimg.succ',
                             host=kwargs['statshost'],
                             port=kwargs['statsport'])
        else:
            logger.warn("crawl item %s %s failed %s",
                        item_id,
                        num_id,
                        crawl_result,
                        extra={'tags': [
                            'crawl_failed',
                        ]})
            Statsd.increment('guang.crawl.itemimg.failed',
                             host=kwargs['statshost'],
                             port=kwargs['statsport'])
0
def crawl_item2(kwargs):
    """Crawl one live taobao item page and sync its data into the DB.

    Uses TaobaoHtml to fetch the page; marks offline items (status=2),
    updates price/volume when changed, optionally stores the raw HTML
    (FLAGS.update_main + FLAGS.commit_html) and replaces the item's
    comments in redis (FLAGS.update_comments).  Emits Statsd counters.

    kwargs -- dict with keys: item (row: [0]=item_id, [2]=num_id),
              is_commit, max_comments, i, total.
    Returns ((item_id, stats_tuple),) where stats_tuple is
    (len(data), len(promoteContent), len(descContent), len(thumbImages),
     len(buyButton), price, len(comments)).
    """
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    is_success = False
    item_id = item[0]
    num_id = item[2]
    # Default all-zero stats used when crawling fails before completion.
    crawl_result = ((item_id, (0, 0, 0, 0, 0, 0.0, 0)), )

    tb = TaobaoHtml(item_id, num_id, max_comments=kwargs['max_comments'])

    # DB engine is only needed when committing results.
    db = None
    if is_commit:
        db = get_db_engine()

    try:
        logger.info("progress %s/%s id %s iid %s", kwargs['i'],
                    kwargs['total'], item_id, num_id)
        tb.crawl()
        # Item no longer on sale: mark it status=2.
        if tb.is_offline and is_commit:
            db.execute("update item set status=2, modified=now() where id=%s" %
                       item_id)
        if tb.detailDiv and not tb.is_offline:
            tb.crawl_price()

            if is_commit:
                # check old price and volume
                pv = list(
                    db.execute("select price, volume from item where id=%s",
                               item_id))
                price = pv[0][0]
                volume = pv[0][1]
                # Only treat a changed, non-trivial price as an update.
                if tb.price != price and tb.price > 0.001:
                    is_price_update = True
                else:
                    is_price_update = False
                if tb.volume > 0 and tb.volume != volume:
                    is_volume_update = True
                else:
                    is_volume_update = False

                if is_price_update:
                    # Track when the price last changed (upsert).
                    db.execute(
                        "insert into price_update_track (item_id,time) values (%s,now()) on duplicate key update time=now()"
                        % item_id)
                    if is_volume_update:
                        db.execute(
                            "update item set price=%s, volume=%s where id=%s",
                            tb.price, tb.volume, item_id)
                    else:
                        db.execute("update item set price=%s where id=%s",
                                   tb.price, item_id)
                elif is_volume_update:
                    db.execute("update item set volume=%s where id=%s",
                               tb.volume, item_id)
                if is_price_update:
                    Statsd.increment("taobao.crawl.price_update")
                if is_volume_update:
                    Statsd.increment("taobao.crawl.volume_update")

            if FLAGS.update_main:
                tb.crawl_desc()

                # Replace the stored raw HTML for this item.
                # NOTE(review): delete+insert is not transactional here --
                # confirm autocommit semantics are acceptable.
                if len(tb.thumbImages) > 0 and is_commit and FLAGS.commit_html:
                    db.execute("delete from crawl_html where item_id=%s" %
                               item_id)
                    db.execute(
                        "insert into crawl_html (item_id,desc_url,promo_url,html,desc_content,promo_content,result,reason) values (%s, %s, %s, %s, %s, %s, %s, %s)",
                        item_id, tb.descUrl, tb.promoteUrl,
                        tb.data.decode('gb18030').encode('utf8'),
                        tb.descContent.decode('gb18030').encode('utf8'),
                        tb.promoteContent.decode('gb18030').encode('utf8'), 1,
                        "")
                    db.execute("update item set crawl_status=1 where id=%s" %
                               item_id)
                    Statsd.increment("taobao.crawl.html_update")

            ############### processing comments ###########
            if FLAGS.update_comments:
                # Replace the item's redis comment list wholesale.
                rediscli = get_redis(FLAGS.redishost, FLAGS.redisport)
                key = "guang:rate:%s" % item_id
                l = rediscli.llen(key)
                tb.crawl_rate()
                logger.info("replace comments %s %s -> %s", item_id, l,
                            len(tb.comments))
                #rediscli.lrange(key, 0, l)
                rediscli.delete(key)
                for c in tb.comments:
                    rediscli.rpush(key, c.SerializeToString())
                    # if limit size
                    #p = rediscli.pipeline()
                    #p.rpush(key, c.SerializeToString())
                    #p.ltrim(0, 99)
                    #p.execute()
                Statsd.increment("taobao.crawl.comments_update")
                Statsd.update_stats("taobao.crawl.comments_update_total",
                                    len(tb.comments))

            is_success = True
            crawl_result = ((item_id, (len(tb.data), len(tb.promoteContent),
                                       len(tb.descContent),
                                       len(tb.thumbImages), len(tb.buyButton),
                                       tb.price, len(tb.comments))), )
        else:
            logger.warn("crawl %s failed, no detail content or is_offline=%s",
                        item_id, tb.is_offline)
            crawl_result = ((item_id, (len(tb.data), 0, 0, 0, 0, 0.0, 0)), )
    except:
        # Catch-all so one bad item never kills the crawl batch.
        logger.error("crawling %s unknown exception %s",
                     item_id,
                     traceback.format_exc(),
                     extra={'tags': [
                         'crawlItemException',
                     ]})
    logger.info("crawling %s result %s - %s", item_id, is_success,
                crawl_result)
    if is_success:
        Statsd.increment("taobao.crawl.itemhtml.succ")
    else:
        Statsd.increment("taobao.crawl.itemhtml.failed")
    return crawl_result
Esempio n. 8
0
def crawl_item2(kwargs):
    """Crawl one Taobao item page and optionally persist the results.

    Args:
        kwargs: dict with keys:
            'item'         - row tuple; item[0] is the item id, item[2] the numeric id
            'is_commit'    - when truthy, write price/volume/html updates to the DB
            'max_comments' - cap passed through to TaobaoHtml
            'i', 'total'   - progress counters, used only for logging

    Returns:
        A one-element tuple ((item_id, (data_len, promote_len, desc_len,
        thumb_count, buy_button_count, price, comment_count)),) describing
        what was crawled; all-zero counters when the crawl failed.
    """
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    is_success = False
    item_id = item[0]
    num_id = item[2]
    # Default result reported when the crawl fails before producing any data.
    crawl_result = ((item_id, (0, 0, 0, 0, 0, 0.0, 0)),)

    tb = TaobaoHtml(item_id, num_id, max_comments=kwargs['max_comments'])

    # Only open a DB connection when we intend to commit.
    db = get_db_engine() if is_commit else None

    try:
        logger.info("progress %s/%s id %s iid %s", kwargs['i'], kwargs['total'], item_id, num_id)
        tb.crawl()
        if tb.is_offline and is_commit:
            # Parameterized query (was %-interpolated: SQL-injection prone and
            # inconsistent with the parameterized calls below).
            db.execute("update item set status=2, modified=now() where id=%s", item_id)
        if tb.detailDiv and not tb.is_offline:
            tb.crawl_price()

            if is_commit:
                # Compare freshly crawled price/volume against the stored values.
                pv = list(db.execute("select price, volume from item where id=%s", item_id))
                price = pv[0][0]
                volume = pv[0][1]
                # 0.001 threshold guards against bogus zero/near-zero crawled prices.
                is_price_update = tb.price != price and tb.price > 0.001
                is_volume_update = tb.volume > 0 and tb.volume != volume

                if is_price_update:
                    # Track when the price last changed (upsert on item_id).
                    db.execute("insert into price_update_track (item_id,time) values (%s,now()) on duplicate key update time=now()", item_id)
                    if is_volume_update:
                        db.execute("update item set price=%s, volume=%s where id=%s", tb.price, tb.volume, item_id)
                    else:
                        db.execute("update item set price=%s where id=%s", tb.price, item_id)
                elif is_volume_update:
                    db.execute("update item set volume=%s where id=%s", tb.volume, item_id)
                if is_price_update:
                    Statsd.increment("taobao.crawl.price_update")
                if is_volume_update:
                    Statsd.increment("taobao.crawl.volume_update")

            if FLAGS.update_main:
                tb.crawl_desc()

                if len(tb.thumbImages) > 0 and is_commit and FLAGS.commit_html:
                    # Replace any previously stored page content for this item.
                    db.execute("delete from crawl_html where item_id=%s", item_id)
                    db.execute("insert into crawl_html (item_id,desc_url,promo_url,html,desc_content,promo_content,result,reason) values (%s, %s, %s, %s, %s, %s, %s, %s)", item_id, tb.descUrl, tb.promoteUrl, tb.data.decode('gb18030').encode('utf8'), tb.descContent.decode('gb18030').encode('utf8'), tb.promoteContent.decode('gb18030').encode('utf8'), 1, "")
                    db.execute("update item set crawl_status=1 where id=%s", item_id)
                    Statsd.increment("taobao.crawl.html_update")

            # ----- replace cached comments in redis -----
            if FLAGS.update_comments:
                rediscli = get_redis(FLAGS.redishost, FLAGS.redisport)
                key = "guang:rate:%s" % item_id
                l = rediscli.llen(key)
                tb.crawl_rate()
                logger.info("replace comments %s %s -> %s", item_id, l, len(tb.comments))
                # Full replace: drop the old list, then push every new comment.
                rediscli.delete(key)
                for c in tb.comments:
                    rediscli.rpush(key, c.SerializeToString())
                Statsd.increment("taobao.crawl.comments_update")
                Statsd.update_stats("taobao.crawl.comments_update_total", len(tb.comments))

            is_success = True
            crawl_result = ((item_id, (len(tb.data), len(tb.promoteContent), len(tb.descContent), len(tb.thumbImages), len(tb.buyButton), tb.price, len(tb.comments))),)
        else:
            logger.warn("crawl %s failed, no detail content or is_offline=%s", item_id, tb.is_offline)
            crawl_result = ((item_id, (len(tb.data), 0, 0, 0, 0, 0.0, 0)),)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still
        # propagate; everything else is logged (best-effort crawl) and the
        # default/partial crawl_result is returned below.
        logger.error("crawling %s unknown exception %s", item_id, traceback.format_exc(), extra={'tags': ['crawlItemException', ]})
    logger.info("crawling %s result %s - %s", item_id, is_success, crawl_result)
    if is_success:
        Statsd.increment("taobao.crawl.itemhtml.succ")
    else:
        Statsd.increment("taobao.crawl.itemhtml.failed")
    return crawl_result