def download_image(self):
        headers = {
            'Referer': str(self.url),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip,deflate,sdch',
            'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
            'User-Agent': "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
        }
        big_path = "%s/%s/big/%s" % (FLAGS.path, self.shop_id, self.local_pic_url)
        mid2_path = "%s/%s/mid2/%s" % (FLAGS.path, self.shop_id, self.local_pic_url)
        mid_path = "%s/%s/mid/%s" % (FLAGS.path, self.shop_id, self.local_pic_url)
        sma_path = "%s/%s/sma/%s" % (FLAGS.path, self.shop_id, self.local_pic_url)
        small2_path = "%s/%s/small2/%s" % (FLAGS.path, self.shop_id, self.local_pic_url)
        small3_path = "%s/%s/small3/%s" % (FLAGS.path, self.shop_id, self.local_pic_url)

        # try/except is done by the caller, which makes stats collection easier
        data = download(self.pic_url, headers)
        if not data:
            time.sleep(2)
            data = download(self.pic_url, headers)
        self.save_image(big_path, data)

        self.imagemagick_resize(300, 300, big_path, mid2_path)
        self.imagemagick_resize(210, 210, big_path, mid_path)
        self.imagemagick_resize(60, 60, big_path, sma_path)
        self.imagemagick_resize(100, 100, big_path, small2_path)
        self.imagemagick_resize(145, 145, big_path, small3_path)

        return self.get_image_size(big_path)
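
Every snippet in this listing calls a shared download() helper that is defined elsewhere in the project and not shown here. The sketch below is only an assumption of what such a helper could look like, pieced together from the call sites in these examples (optional headers, max_retry, fn_is_banned and throw_on_banned are all parameters that appear above); it is not the project's actual implementation and it ignores details such as gzip decoding.

import time
import urllib2

class BannedException(Exception):
    pass

def download(url, headers=None, max_retry=3, fn_is_banned=None, throw_on_banned=False):
    # Hypothetical sketch: fetch url with optional headers, retry on failure,
    # and optionally detect a "banned" page via a caller-supplied predicate.
    for attempt in range(max_retry):
        try:
            req = urllib2.Request(url, headers=headers or {})
            data = urllib2.urlopen(req, timeout=30).read()
            if fn_is_banned and fn_is_banned(data):
                if throw_on_banned:
                    raise BannedException(url)
                return None
            return data
        except urllib2.URLError:
            time.sleep(2 ** attempt)
    return None
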
Example #2
def download_image(pic_url, shop_id, item_id, local_pic_url, fdfs_client):
    big_path = "%s/%s/big/%s" % (FLAGS.path, shop_id, local_pic_url)
    mid2_path = "%s/%s/mid2/%s" % (FLAGS.path, shop_id, local_pic_url)
    mid_path = "%s/%s/mid/%s" % (FLAGS.path, shop_id, local_pic_url)
    sma_path = "%s/%s/sma/%s" % (FLAGS.path, shop_id, local_pic_url)
    sma2_path = "%s/%s/small2/%s" % (FLAGS.path, shop_id, local_pic_url)
    sma3_path = "%s/%s/small3/%s" % (FLAGS.path, shop_id, local_pic_url)
    headers = {
        'Referer': "http://www.j.cn/product/%s.htm" % item_id,
        'User-Agent': "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
    }

    # if pic_url contains http, download over HTTP; otherwise fetch it from FastDFS
    if "http://" in pic_url:
        data = download(pic_url, headers)
        save_image(big_path, data)
    else:
        try:
            if not os.path.exists(os.path.dirname(big_path)):
                make_dirs_for_file(big_path)

            fdfs_client.download_to_file(big_path, pic_url)
        except (ConnectionError, ResponseError, DataError), e:
            fastdfs_filename = "http://image2.guang.j.cn/images/%s/big/%s" % (
                shop_id, local_pic_url)
            data = download(fastdfs_filename, headers)
            save_image(big_path, data)
            logger.info("%s:%s fdfs get failed: %s, falling back to http download",
                        item_id, pic_url, e)
def download_image(pic_url, shop_id, item_id, local_pic_url, fdfs_client):
    big_path = "%s/%s/big/%s" % (FLAGS.path, shop_id, local_pic_url)
    mid2_path = "%s/%s/mid2/%s" % (FLAGS.path, shop_id, local_pic_url)
    mid_path = "%s/%s/mid/%s" % (FLAGS.path, shop_id, local_pic_url)
    sma_path = "%s/%s/sma/%s" % (FLAGS.path, shop_id, local_pic_url)
    sma2_path = "%s/%s/small2/%s" % (FLAGS.path, shop_id, local_pic_url)
    sma3_path = "%s/%s/small3/%s" % (FLAGS.path, shop_id, local_pic_url)
    headers = {
        'Referer': "http://www.j.cn/product/%s.htm" % item_id,
        'User-Agent': "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
    }

    # if pic_url contains http, download over HTTP; otherwise fetch it from FastDFS
    if "http://" in pic_url:
        data = download(pic_url, headers)
        save_image(big_path, data)
    else:
        try:
            if not os.path.exists(os.path.dirname(big_path)):
                make_dirs_for_file(big_path)

            fdfs_client.download_to_file(big_path, pic_url)
        except (ConnectionError, ResponseError, DataError), e:
            fastdfs_filename = "http://image2.guang.j.cn/images/%s/big/%s" % (shop_id, local_pic_url)
            data = download(fastdfs_filename, headers)
            save_image(big_path, data)
            logger.info("%s:%s fdfs get failed: %s, falling back to http download", item_id, pic_url, e)
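
The two fdfs variants above also rely on save_image() and make_dirs_for_file(), neither of which is included in this listing. Assuming they simply create the parent directories and write the downloaded bytes to disk, a minimal sketch might be:

import os

def make_dirs_for_file(file_path):
    # Create the parent directory of file_path if it does not exist yet.
    dir_path = os.path.dirname(file_path)
    if dir_path and not os.path.exists(dir_path):
        os.makedirs(dir_path)

def save_image(path, data):
    # Hypothetical sketch: persist raw image bytes, creating directories first.
    if not data:
        raise ValueError("empty image data for %s" % path)
    make_dirs_for_file(path)
    with open(path, "wb") as f:
        f.write(data)
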
Example #4
def crawler(sql):
    db = get_db_engine()
    items = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for item in items:
        shop_id = item[0]
        shop_type = item[1]
        item_id = item[2]
        url = item[3]

        try:
            htm = get_item_htm(item_id, url, db)
            if shop_type == 1:
                htm_obj = parse_html(htm, encoding='gb18030')
                discount_url = htm_obj.xpath("//div[@id='promote']/@data-default")
                if discount_url and len(discount_url) > 0:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(discount_url[0], item_headers)
                    if disc_content.strip():
                        disc_obj = parse_html(disc_content, encoding='gb18030')
                        content = disc_obj.xpath("//div[@id='J_MjsData']/h3/text()")[0].strip()
                        dates = disc_obj.xpath("//div[@id='J_MjsData']/h3/span[@class='tb-indate']/text()")[0].strip()
                        st = dates.encode('utf-8').replace("--","—").split("—")
                        start_time = datetime.datetime.strptime(st[0].strip().replace('年','-').replace("月","-").replace("日",""),'%Y-%m-%d')
                        end_time = datetime.datetime.strptime(st[1].strip().replace('年','-').replace("月","-").replace("日",""),'%Y-%m-%d')

                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, discount_url[0])
                        logger.info("taobao shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s not discount.", shop_id, url)
                else:
                    logger.warning("taobao shop %s:%s not discount.", shop_id, url)
            elif shop_type == 2:
                d_url = get_val(htm, "initApi")
                if d_url:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(d_url, item_headers)
                    cjson = loads(disc_content.decode('gb18030').encode('utf8'))
                    shop_prom = cjson['defaultModel']['itemPriceResultDO']['tmallShopProm']
                    if shop_prom:
                        st = int(shop_prom['startTime'])/1000
                        et = int(shop_prom['endTime'])/1000
                        start_time = time.strftime("%Y-%m-%d", time.localtime(st))
                        end_time = time.strftime("%Y-%m-%d", time.localtime(et))
                        content = shop_prom['promPlan'][0]['msg']
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                            shop_id, content.encode('utf-8'), start_time, end_time, d_url)
                        logger.info("tmall shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s not discount.", shop_id, url)
        except:
            logger.error("shop %s:%s xpath failed:%s", shop_id, url, traceback.format_exc())
    def download_image(self, shop_id, num_id, pic_url, local_pic_url):
        headers = {'Referer': "http://item.taobao.com/item.htm?id=%s" % num_id,
                   'User-Agent': DEFAULT_UA
        }
        big_path = "%s/%s/big/%s" % (FLAGS.path, shop_id, local_pic_url)
        mid2_path = "%s/%s/mid2/%s" % (FLAGS.path, shop_id, local_pic_url)
        mid_path = "%s/%s/mid/%s" % (FLAGS.path, shop_id, local_pic_url)
        sma_path = "%s/%s/sma/%s" % (FLAGS.path, shop_id, local_pic_url)
        small2_path = "%s/%s/small2/%s" % (FLAGS.path, shop_id, local_pic_url)
        small3_path = "%s/%s/small3/%s" % (FLAGS.path, shop_id, local_pic_url)
        try:
            data = download(pic_url, headers)
        except KeyboardInterrupt:
            raise
        except:
            logger.error("download %s:%s failed reason: %s", num_id, pic_url, traceback.format_exc())
            return None

        self.save_image(big_path, data)

        self.imagemagick_resize(300, 300, big_path, mid2_path)
        self.imagemagick_resize(210, 210, big_path, mid_path)
        self.imagemagick_resize(60, 60, big_path, sma_path)
        self.imagemagick_resize(100, 100, big_path, small2_path)
        self.imagemagick_resize(145, 145, big_path, small3_path)

        return self.get_image_size(big_path)
Example #6
    def download_image(self, shop_id, num_id, pic_url, local_pic_url):
        headers = {
            'Referer': "http://item.taobao.com/item.htm?id=%s" % num_id,
            'User-Agent': DEFAULT_UA
        }
        big_path = "%s/%s/big/%s" % (FLAGS.path, shop_id, local_pic_url)
        mid2_path = "%s/%s/mid2/%s" % (FLAGS.path, shop_id, local_pic_url)
        mid_path = "%s/%s/mid/%s" % (FLAGS.path, shop_id, local_pic_url)
        sma_path = "%s/%s/sma/%s" % (FLAGS.path, shop_id, local_pic_url)
        small2_path = "%s/%s/small2/%s" % (FLAGS.path, shop_id, local_pic_url)
        small3_path = "%s/%s/small3/%s" % (FLAGS.path, shop_id, local_pic_url)
        try:
            data = download(pic_url, headers)
        except KeyboardInterrupt:
            raise
        except:
            logger.error("download %s:%s failed reason: %s", num_id, pic_url,
                         traceback.format_exc())
            return None

        self.save_image(big_path, data)

        self.imagemagick_resize(300, 300, big_path, mid2_path)
        self.imagemagick_resize(210, 210, big_path, mid_path)
        self.imagemagick_resize(60, 60, big_path, sma_path)
        self.imagemagick_resize(100, 100, big_path, small2_path)
        self.imagemagick_resize(145, 145, big_path, small3_path)

        return self.get_image_size(big_path)
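
imagemagick_resize() is called in several examples but never defined here. If it shells out to ImageMagick's convert command, as the name suggests, a minimal sketch (assuming the destination directory already exists) could be:

import subprocess

def imagemagick_resize(width, height, src_path, dst_path):
    # Hypothetical sketch: resize src_path into dst_path with ImageMagick,
    # keeping the aspect ratio within a width x height bounding box.
    cmd = ["convert", src_path, "-resize", "%dx%d" % (width, height), dst_path]
    return subprocess.call(cmd) == 0
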
def convert_taobaoke_widget(items, fn_join_iids=lambda x:','.join(x), batch_size=40, calllimit=60, expire=600, outer_code='jcn', appkey=TAOBAOKE_APPKEY, appsec=TAOBAOKE_APPSECRET):
    ts = int(time.time()*1000)
    msg = appsec + 'app_key' + str(appkey) + "timestamp" + str(ts) + appsec
    sign = hmac.HMAC(appsec, msg).hexdigest().upper()
    headers = {'User-Agent' : DEFAULT_UA, 'Referer' : "http://www.j.cn/"}
    for chunk in waitlimit(calllimit, 60.0, chunks(items, batch_size)): # calllimit for minutes
        params = {'app_key' : appkey,
                  '_t_sys' : 'args=4',
                  'method' : 'taobao.taobaoke.widget.items.convert',
                  'sign' : sign,
                  'timestamp' : ts,
                  'fields' : "num_iid,nick,price,click_url,commission,commission_rate,commission_num,commission_volume,shop_click_url,seller_credit_score",
                  'callback' : 'TOP.io.jsonpCbs.t%s' % md5( str(random.random()) ).hexdigest()[:13],
                  'partner_id' : 'top-sdk-js-20120801',
        }
        params['num_iids'] = fn_join_iids(chunk)
        if outer_code:
            params['outer_code'] = outer_code
        url = "http://gw.api.taobao.com/widget/rest?%s" % urllib.urlencode(params)
        results = download(url, headers=headers)
        if results:
            Statsd.increment('guang.taobaoapi.widget_succ')
        else:
            Statsd.increment('guang.taobaoapi.widget_err')
        #logger.debug('Calling %s(%s) -> %s', request.method_name, request.api_params, results)
        yield (chunk, results)
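
convert_taobaoke_widget() drives its batching through chunks() and waitlimit(), two helpers outside this listing. From the call waitlimit(calllimit, 60.0, chunks(items, batch_size)) and the "calllimit for minutes" comment, they presumably split the items into fixed-size batches and cap how many batches are emitted per 60-second window; a rough sketch under that assumption:

import time

def chunks(items, size):
    # Yield consecutive slices of `items`, each with at most `size` elements.
    items = list(items)
    for i in range(0, len(items), size):
        yield items[i:i + size]

def waitlimit(limit, period, iterable):
    # Yield items from `iterable`, but never more than `limit` per `period` seconds.
    window_start = time.time()
    emitted = 0
    for item in iterable:
        if emitted >= limit:
            elapsed = time.time() - window_start
            if elapsed < period:
                time.sleep(period - elapsed)
            window_start = time.time()
            emitted = 0
        emitted += 1
        yield item
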
Example #8
def download_with_referer(url, referer):
    """抓取店铺扩展信息时 强制要求加refer 如果不需要加 则refer赋值为None"""
    if referer:
        headers = {
            'Referer': referer,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
            'User-Agent': "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
        }
    else:
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
            'User-Agent': "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
        }

    return download(url, headers)
Example #9
def download_image(kwargs):
    item_id = kwargs['item_id']
    num_id = kwargs['num_id']
    shop_id = kwargs['shop_id']
    crawl_path = kwargs['crawl_path']
    image_name = kwargs['image_name']
    pic_url = kwargs['pic_url']
    # download the main image first and store it in the big directory
    headers = {
        'Referer': "http://item.taobao.com/item.htm?id=%s" % num_id,
        'User-Agent': DEFAULT_UA
    }
    try:
        data = download(pic_url, headers)
    except KeyboardInterrupt:
        raise
    except:
        logger.info("download %s:%s failed reason: %s", item_id, pic_url, traceback.format_exc())
        return

    shop_image_path = "%s/%s" % (crawl_path, shop_id)
    if not os.path.exists(shop_image_path):
        os.mkdir(shop_image_path)
    shop_image_big_path = "%s/big" % shop_image_path
    if not os.path.exists(shop_image_big_path):
        os.mkdir(shop_image_big_path)
    big_image_fullpath = "%s/%s" % (shop_image_big_path, image_name)
    f = open(big_image_fullpath, "wb")
    f.write(data)
    f.close()

    try:
        image = Image.open(cStringIO.StringIO(open(big_image_fullpath).read()))
    except IOError, e:
        logger.info("Open image failed %s:%s %s", item_id, pic_url, e.message)
Example #10
 def download_page(self, url, max_retry_count=5):
     result = download(
         url,
         max_retry=max_retry_count,
         fn_is_banned=lambda data: data.find(u"您的访问受到限制".encode('gbk')) > 0,
         throw_on_banned=True)
     return result
def download_image(kwargs):
    item_id = kwargs['item_id']
    num_id = kwargs['num_id']
    shop_id = kwargs['shop_id']
    crawl_path = kwargs['crawl_path']
    image_name = kwargs['image_name']
    pic_url = kwargs['pic_url']
    # download the main image first and store it in the big directory
    headers = {
        'Referer': "http://item.taobao.com/item.htm?id=%s" % num_id,
        'User-Agent': DEFAULT_UA
    }
    try:
        data = download(pic_url, headers)
    except KeyboardInterrupt:
        raise
    except:
        logger.info("download %s:%s failed reason: %s", item_id, pic_url, traceback.format_exc())
        return

    shop_image_path = "%s/%s" % (crawl_path, shop_id)
    if not os.path.exists(shop_image_path):
        os.mkdir(shop_image_path)
    shop_image_big_path = "%s/big" % shop_image_path
    if not os.path.exists(shop_image_big_path):
        os.mkdir(shop_image_big_path)
    big_image_fullpath = "%s/%s" % (shop_image_big_path, image_name)
    f = open(big_image_fullpath, "wb")
    f.write(data)
    f.close()

    try:
        image = Image.open(cStringIO.StringIO(open(big_image_fullpath).read()))
    except IOError, e:
        logger.info("Open image failed %s:%s %s", item_id, pic_url, e.message)
Example #12
 def GET(self):
     params = web.input(term_id=74, start=0, rows=120,
         sortby='region_ctr_0111_4',
         edismax=False,
         bfs='sum(mul(__SORT__,2.5),mul(__SIMI__,5.0))',
         qf='item_title^1.4 shop_name^0.5',
         boost=None,
         xks=12,
         wd='',
         debugQuery='on')
     bfs = boost = ''
     tagmatch = get_xks_tagmatch(params.xks)
     if params.edismax and params.bfs:
         bfs = replace_meta(params.bfs, params.sortby, tagmatch)
     if params.edismax and params.boost:
         boost = replace_meta(params.boost, params.sortby, tagmatch)
     url = build_solr_qs(params.term_id, params.start, params.rows, params.sortby,
         params.edismax, params.qf, bfs.split('|'), boost, params.debugQuery, params.wd)
     logger.debug('fetching %s', url)
     results = simplejson.loads(download(url))
     #import pdb; pdb.set_trace()
     return render_html("list.htm", {'results' : results,
             'solrurl' : url,
             'xksinfo' : 'xks %s : tagmatch %s' % (params.xks, tagmatch),
             'params' : params,
             })
Example #13
def crawl_tao123(shops):
    base_url = "http://dianpu.tao123.com/nvzhuang/%s.php"
    end = 22
    for i in range(1, end+1):
        url = base_url % i
        html = download(url)
        html_obj = parse_html(html)
        shops.update(html_obj.xpath("//div[@class='cg_shop_info']//a/@href"))
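
parse_html(), used by all of the crawl_* functions (sometimes with an explicit encoding such as 'gb18030' or 'gbk'), is assumed here to be a thin wrapper around lxml so that callers can run xpath() queries on the result; a minimal sketch consistent with those calls:

import lxml.html

def parse_html(html, encoding=None):
    # Hypothetical sketch: parse an HTML string into an lxml tree so that
    # callers can run xpath() queries on the result.
    parser = lxml.html.HTMLParser(encoding=encoding) if encoding else None
    return lxml.html.document_fromstring(html, parser=parser)
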
Example #14
def crawl_tao123(shops):
    base_url = "http://www.meilishuo.com/shop/top/0/%s"
    end = 203
    for i in range(end):
        logger.debug("processing %s", i)
        url = base_url % i
        html = download(url)
        html_obj = parse_html(html)
        shops.update(html_obj.xpath("//div[@class='shop_item']//a/@href"))
Example #15
def crawl_dirtbshop(shops):
    base_url = "http://dirtbshop.com/list_shop_%s_1_1.html"
    end = 251
    for i in range(1, end+1):
        url = base_url % i
        html = download(url)
        html_obj = parse_html(html)
        import pdb; pdb.set_trace()
        urls = html_obj.xpath("//span[@class='grebtn_in']/a/@href")
Example #16
 def get_dynamicStock(self):
     # extract the embedded URL from the script tag and assemble the request URL
     # http://detailskip.taobao.com/json/sib.htm?
     s = self.html_obj.xpath("//script[contains(text(),'var b=')]/text()")
     s_re = re.compile("b=\"([^<>]*)\",a=")
     dynamicStock_url = s_re.search(str(s)).group(1)
     if dynamicStock_url:
         dynamicStock_url += "&ref=" + urllib.quote(self.url)
         self.dynamicStockData = download(dynamicStock_url, self.headers)
 def get_dynamicStock(self):
     # extract the embedded URL from the script tag and assemble the request URL
     # http://detailskip.taobao.com/json/sib.htm?
     s = self.html_obj.xpath("//script[contains(text(),'var b=')]/text()")
     s_re = re.compile("b=\"([^<>]*)\",a=")
     dynamicStock_url = s_re.search(str(s)).group(1)
     if dynamicStock_url:
         dynamicStock_url += "&ref=" + urllib.quote(self.url);
         self.dynamicStockData = download(dynamicStock_url, self.headers)
Example #18
def check_graphite(server, target, n, warnv=0.0, errorv=0.0, gt=True, since="-1days", until="-"):
    url = "http://%s/render?format=json&from=%s&until=%s&target=%s" % (server, since, until, target)
    logger.debug("Fetching %s", url)
    data = download(url)
    json_data = simplejson.loads(data)
    data_points = json_data[0]['datapoints']
    lastn_datapoints = list(takelastn(data_points, FLAGS.lastn, lambda x:not x[0]))
    logger.debug("Last n data point %s", lastn_datapoints)
    is_warn = all_matched(lambda x:not ((x[0]>warnv) ^ gt), lastn_datapoints)
    is_error = all_matched(lambda x:not ((x[0]>errorv) ^ gt), lastn_datapoints)
    return is_warn, is_error, lastn_datapoints
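
check_graphite() depends on takelastn() and all_matched(), which are not part of this listing. Judging from how they are used (keep the last FLAGS.lastn valid datapoints, then test a predicate against each of them), plausible stand-in definitions are:

def takelastn(iterable, n, fn_skip=lambda x: False):
    # Return the last n items of `iterable`, ignoring items for which fn_skip
    # is true (here: graphite datapoints whose value is None).
    kept = [x for x in iterable if not fn_skip(x)]
    return kept[-n:]

def all_matched(predicate, items):
    # True if `predicate` holds for every item (and there is at least one item).
    items = list(items)
    return bool(items) and all(predicate(x) for x in items)
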
def crawl_tao123(shops):
    for line in open(FLAGS.path):
        try:
            line = line.strip()
            url = "http://www.meilishuo.com%s" % line
            html = download(url)
            html_obj = parse_html(html)
            shop_url = html_obj.xpath("//div[@class='shop_summary']/a/@href")
            logger.debug("processing %s -> %s", line, shop_url)
            shops.update(shop_url)
        except:
            logger.error("processing %s failed", line)
Example #20
def crawl_tao123(shops):
    for line in open(FLAGS.path):
        try:
            line = line.strip()
            url = "http://www.meilishuo.com%s" % line
            html = download(url)
            html_obj = parse_html(html)
            shop_url = html_obj.xpath("//div[@class='shop_summary']/a/@href")
            logger.debug("processing %s -> %s", line, shop_url)
            shops.update(shop_url)
        except:
            logger.error("processing %s failed", line)
Example #21
    def download_image(self):
        headers = {
            'Referer': str(self.url),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip,deflate,sdch',
            'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
            'User-Agent': "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
        }
        big_path = "%s/%s/big/%s" % (FLAGS.path, self.shop_id,
                                     self.local_pic_url)
        mid2_path = "%s/%s/mid2/%s" % (FLAGS.path, self.shop_id,
                                       self.local_pic_url)
        mid_path = "%s/%s/mid/%s" % (FLAGS.path, self.shop_id,
                                     self.local_pic_url)
        sma_path = "%s/%s/sma/%s" % (FLAGS.path, self.shop_id,
                                     self.local_pic_url)
        small2_path = "%s/%s/small2/%s" % (FLAGS.path, self.shop_id,
                                           self.local_pic_url)
        small3_path = "%s/%s/small3/%s" % (FLAGS.path, self.shop_id,
                                           self.local_pic_url)

        # try/except is done by the caller, which makes stats collection easier
        data = download(self.pic_url, headers)
        if not data:
            time.sleep(2)
            data = download(self.pic_url, headers)
        self.save_image(big_path, data)

        self.imagemagick_resize(300, 300, big_path, mid2_path)
        self.imagemagick_resize(210, 210, big_path, mid_path)
        self.imagemagick_resize(60, 60, big_path, sma_path)
        self.imagemagick_resize(100, 100, big_path, small2_path)
        self.imagemagick_resize(145, 145, big_path, small3_path)

        return self.get_image_size(big_path)
Example #22
def get_item_htm(id, url, db):
    sql = "select html,last_modified from crawl_html where item_id=%s" % id
    item_htm = list(db.execute(sql))
    last_modified = item_htm[0][1]
    now = datetime.datetime.now()
    days = now - last_modified
    if days > datetime.timedelta(days=7):
        item_headers = {'Referer': url,'User-Agent': DEFAULT_UA}
        item_htm = download(url, item_headers)
        db.execute("update crawl_html set html=%s,last_modified=now() where item_id=%s", item_htm.decode('gb18030').encode('utf8'), id) 
        return item_htm
    else:
        return item_htm[0][0]
Example #23
def main():
    url = "http://%s:7080%s" % (FLAGS.solr_host, SOLR_URL)
    #import pdb; pdb.set_trace()
    results = simplejson.loads(download(url))
    db = get_db_engine()
    counts = []
    for doc in results['response']['docs']:
        item_id = doc['item_id']
        count = db.execute("select count(id) from favourite where itemid=%s and acttime>'2012-12-01' and favstatus=1 and firstchoose=0;" % item_id)
        if count.rowcount:
            counts.append(list(count)[0][0])
        else:
            counts.append(0)
    cs = Series(counts)
    logger.info(cs.describe())
Example #24
 def GET(self):
     params = web.input(term_id=74, start=0, rows=120,
         sortby='region_ctr_0111_4',
         xks=12,
         wd='',
         debugQuery='on')
     tagmatch = get_xks_tagmatch(params.xks)
     url = build_solr_custom_qs(params.term_id, params.start, params.rows, params.sortby,
         params.debugQuery, params.wd, tagmatch)
     logger.debug('fetching %s', url)
     results = simplejson.loads(download(url))
     #import pdb; pdb.set_trace()
     return render_html("custlist.htm", {'results' : results,
             'solrurl' : url,
             'xksinfo' : 'xks %s : tagmatch %s' % (params.xks, tagmatch),
             'params' : params,
             })
Example #25
def convert_taobaoke_widget(items,
                            fn_join_iids=lambda x: ','.join(x),
                            batch_size=40,
                            calllimit=60,
                            expire=600,
                            outer_code='jcn',
                            appkey=TAOBAOKE_APPKEY,
                            appsec=TAOBAOKE_APPSECRET):
    ts = int(time.time() * 1000)
    msg = appsec + 'app_key' + str(appkey) + "timestamp" + str(ts) + appsec
    sign = hmac.HMAC(appsec, msg).hexdigest().upper()
    headers = {'User-Agent': DEFAULT_UA, 'Referer': "http://www.j.cn/"}
    for chunk in waitlimit(calllimit, 60.0,
                           chunks(items, batch_size)):  # calllimit for minutes
        params = {
            'app_key': appkey,
            '_t_sys': 'args=4',
            'method': 'taobao.taobaoke.widget.items.convert',
            'sign': sign,
            'timestamp': ts,
            'fields': "num_iid,nick,price,click_url,commission,commission_rate,commission_num,commission_volume,shop_click_url,seller_credit_score",
            'callback': 'TOP.io.jsonpCbs.t%s' % md5(str(random.random())).hexdigest()[:13],
            'partner_id': 'top-sdk-js-20120801',
        }
        params['num_iids'] = fn_join_iids(chunk)
        if outer_code:
            params['outer_code'] = outer_code
        url = "http://gw.api.taobao.com/widget/rest?%s" % urllib.urlencode(
            params)
        results = download(url, headers=headers)
        if results:
            Statsd.increment('guang.taobaoapi.widget_succ')
        else:
            Statsd.increment('guang.taobaoapi.widget_err')
        #logger.debug('Calling %s(%s) -> %s', request.method_name, request.api_params, results)
        yield (chunk, results)
def download_with_referer(url, referer):
    """抓取店铺扩展信息时 强制要求加refer 如果不需要加 则refer赋值为None"""
    if referer:
        headers = {
            'Referer': referer,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
            'User-Agent': "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
        }
    else:
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
            'User-Agent': "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
        }

    return download(url, headers)
def crawl_main():
    for host in open(FLAGS.path):
        url = "http://%s" % (host.strip())
        try:
            html = download(url)
            #import pdb; pdb.set_trace()
            html_obj = parse_html(html, 'gbk')
            if url.find('tmall.com') > 0:
                shop_url = html_obj.xpath("//h3[@class='shop-title']/a/@href")[0]
                shop_name = html_obj.xpath("//h3[@class='shop-title']/a/text()")[0]
                print shop_url, shop_name.encode('utf8')
            else:
                shop_url = html_obj.xpath("//div[@class='shop-info-simple']/a/@href")[0]
                shop_name = html_obj.xpath("//div[@class='shop-info-simple']/a/text()")[0]
                shop_rank = html_obj.xpath("//span[@class='shop-rank']//img/@src")[0]
                #good_rate = html_obj.xpath("//li[@class='goodrate']/text()")[0]
                print shop_url, shop_name.encode('utf8'), shop_rank
        except KeyboardInterrupt:
            raise
        except:
            logger.warn("processing %s failed, %s", url, traceback.format_exc())
Example #28
def check_graphite(server,
                   target,
                   n,
                   warnv=0.0,
                   errorv=0.0,
                   gt=True,
                   since="-1days",
                   until="-"):
    url = "http://%s/render?format=json&from=%s&until=%s&target=%s" % (
        server, since, until, target)
    logger.debug("Fetching %s", url)
    data = download(url)
    json_data = simplejson.loads(data)
    data_points = json_data[0]['datapoints']
    lastn_datapoints = list(
        takelastn(data_points, FLAGS.lastn, lambda x: not x[0]))
    logger.debug("Last n data point %s", lastn_datapoints)
    is_warn = all_matched(lambda x: not ((x[0] > warnv) ^ gt),
                          lastn_datapoints)
    is_error = all_matched(lambda x: not ((x[0] > errorv) ^ gt),
                           lastn_datapoints)
    return is_warn, is_error, lastn_datapoints
Example #29
def download_page(url, headers, max_retry_count=5):
    return download(url,
                    headers,
                    max_retry=max_retry_count,
                    throw_on_banned=True)
Example #30
def download_page(url, headers, max_retry_count=5):
    return download(url, headers, max_retry=max_retry_count, throw_on_banned=True)
Example #31
def crawler(sql):
    db = get_db_engine()
    shops = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for shop in shops:
        shop_id = shop[0]
        url = str(shop[1])
        type = shop[2]
        if url[-1] != '/':
            url += "/"
        try:
            shop_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
            dongtai_url = url + "dongtai.htm"
            dongtai_data = download(dongtai_url, shop_headers)
            if dongtai_data:
                dongtai_obj = parse_html(dongtai_data, encoding="gb18030")
                dongtai_title = dongtai_obj.xpath("//title/text()")[0].encode(
                    'utf-8')
                if '店铺动态' in dongtai_title:
                    microscope_data = dongtai_obj.xpath(
                        "//*[@name='microscope-data']/@content")
                    userId = get_val(str(microscope_data), "userId")

                    if userId:
                        dongtai_headers = {
                            'Referer': dongtai_url,
                            'User-Agent': DEFAULT_UA
                        }
                        promotion_url = "http://shuo.taobao.com/feed/v_front_feeds.htm?_input_charset=utf-8&page=1" \
                                        "&userId=%s&vfeedTabId=115" % userId
                        promotion_data = download(promotion_url,
                                                  dongtai_headers)

                        if promotion_data:
                            promotion_obj = parse_html(promotion_data,
                                                       encoding="gb18030")
                            i = 0
                            while i < 10:
                                feedInfo = promotion_obj.xpath(
                                    "//div[@class='fd-item show-detail']//div[@class='fd-text']//span[@class='J_FeedInfo']/text()"
                                )[i].encode('utf-8')
                                if '店铺促销中' in feedInfo or '限时折扣' in feedInfo or '折扣限时' in feedInfo:
                                    #title = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dt//a/text()")[i]
                                    link = promotion_obj.xpath(
                                        "//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//a[@class='fd-view-detail']/@href"
                                    )[i]
                                    promotion_price = promotion_obj.xpath(
                                        "//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price']/strong/text()"
                                    )[i]
                                    price = promotion_obj.xpath(
                                        "//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price g_price-original']/strong/text()"
                                    )[i]
                                    promotion_time = promotion_obj.xpath(
                                        u"//div[@class='fd-item show-detail']//div[@class='fd-container']//dd[contains(text(),'起止日期')]/text()"
                                    )[i]
                                    pt = promotion_time.encode(
                                        'utf-8').replace("起止日期:",
                                                         "").split(" - ")
                                    start_time = pt[0].replace(".", "-")
                                    end_time = pt[1].replace(".", "-")
                                    if '2013' not in pt[1] and '2014' not in pt[1]:
                                        end_time = '2013-' + end_time

                                    if start_time > end_time:
                                        end_time = end_time.replace(
                                            "2013", "2014")

                                    num_id = get_numiid(link, dongtai_headers)
                                    if num_id:
                                        sql = "select id from shop_promotion where shop_id=%s and num_id=%s" % (
                                            shop_id, num_id)
                                        re = list(db.execute(sql))
                                        if not re:
                                            db.execute(
                                                "insert into shop_promotion (shop_id, num_id, price, "
                                                "promotion_price, start_time, end_time, create_time, "
                                                "last_update_time) values (%s,'%s',%s,%s,'%s','%s',now(),now())"
                                                % (shop_id, num_id,
                                                   price.replace(',', ''),
                                                   promotion_price.replace(
                                                       ',', ''), start_time,
                                                   end_time))
                                    else:
                                        logger.error(
                                            "shop %s:%s crawler num_id failed",
                                            shop_id, url)

                                i += 1
                                logger.info(
                                    "shop %s:%s crawler promotion item num=%s",
                                    shop_id, url, i)

                        else:
                            logger.warning("shop %s:%s not promotion info",
                                           shop_id, url)
                    else:
                        logger.error("shop %s:%s crawler userId failed",
                                     shop_id, url)
                else:
                    logger.error("shop %s:%s not dongtai page", shop_id, url)
        except:
            logger.error("shop %s:%s crawler failed %s", shop_id, url,
                         traceback.format_exc())
def crawler(sql):
    db = get_db_engine()
    shops = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for shop in shops:
        shop_id = shop[0]
        url = str(shop[1])
        type = shop[2]
        if url[-1] != '/':
            url += "/"
        try:
            shop_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
            dongtai_url = url + "dongtai.htm"
            dongtai_data = download(dongtai_url, shop_headers)
            if dongtai_data:
                dongtai_obj = parse_html(dongtai_data, encoding="gb18030")
                dongtai_title = dongtai_obj.xpath("//title/text()")[0].encode('utf-8')
                if '店铺动态' in dongtai_title:
                    microscope_data = dongtai_obj.xpath("//*[@name='microscope-data']/@content")
                    userId = get_val(str(microscope_data), "userId")

                    if userId:
                        dongtai_headers = {'Referer': dongtai_url, 'User-Agent': DEFAULT_UA}
                        promotion_url = "http://shuo.taobao.com/feed/v_front_feeds.htm?_input_charset=utf-8&page=1" \
                                        "&userId=%s&vfeedTabId=115" % userId
                        promotion_data = download(promotion_url, dongtai_headers)

                        if promotion_data:
                            promotion_obj = parse_html(promotion_data, encoding="gb18030")
                            i = 0
                            while i < 10:
                                feedInfo = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-text']//span[@class='J_FeedInfo']/text()")[i].encode('utf-8')
                                if '店铺促销中' in feedInfo or '限时折扣' in feedInfo or '折扣限时' in feedInfo:
                                    #title = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dt//a/text()")[i]
                                    link = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//a[@class='fd-view-detail']/@href")[i]
                                    promotion_price = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price']/strong/text()")[i]
                                    price = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price g_price-original']/strong/text()")[i]
                                    promotion_time = promotion_obj.xpath(u"//div[@class='fd-item show-detail']//div[@class='fd-container']//dd[contains(text(),'起止日期')]/text()")[i]
                                    pt = promotion_time.encode('utf-8').replace("起止日期:","").split(" - ")
                                    start_time = pt[0].replace(".", "-")
                                    end_time = pt[1].replace(".", "-")
                                    if '2013' not in pt[1] and '2014' not in pt[1]:
                                        end_time = '2013-' + end_time

                                    if start_time > end_time:
                                        end_time = end_time.replace("2013", "2014")

                                    num_id = get_numiid(link, dongtai_headers)
                                    if num_id:
                                        sql = "select id from shop_promotion where shop_id=%s and num_id=%s" % (shop_id, num_id)
                                        re = list(db.execute(sql))
                                        if not re:
                                            db.execute("insert into shop_promotion (shop_id, num_id, price, "
                                                       "promotion_price, start_time, end_time, create_time, "
                                                       "last_update_time) values (%s,'%s',%s,%s,'%s','%s',now(),now())"
                                                       % (shop_id, num_id, price.replace(',', ''), promotion_price.replace(',', ''), start_time, end_time))
                                    else:
                                        logger.error("shop %s:%s crawler num_id failed", shop_id, url)

                                i += 1
                                logger.info("shop %s:%s crawler promotion item num=%s", shop_id, url, i)

                        else:
                            logger.warning("shop %s:%s not promotion info", shop_id, url)
                    else:
                        logger.error("shop %s:%s crawler userId failed", shop_id, url)
                else:
                    logger.error("shop %s:%s not dongtai page", shop_id, url)
        except:
            logger.error("shop %s:%s crawler failed %s", shop_id, url, traceback.format_exc())
 def download_page(self, url, max_retry_count=5):
     result = download(url, max_retry=max_retry_count,
                       fn_is_banned=lambda data:data.find(u"您的访问受到限制".encode('gbk')) > 0,
                       throw_on_banned=True)
     return result
Example #34
    def crawl_price(self):
        self.bidPrice = self.html_obj.xpath(
            "//input[@name='current_price']/@value")
        self.originPrice = self.html_obj.xpath(
            "//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()")
        if not self.originPrice:
            self.originPrice = self.html_obj.xpath(
                "//strong[@class='J_originalPrice']/text()")

        self.promoteUrl2 = get_val(self.data, "apiPromoData")
        if self.promoteUrl2:
            self.promoteUrl2 = self.promoteUrl2.replace(r'''\/''', "/")

        price = ""
        if self.is_tmall and self.tmallInitApi and self.tmallInitApijson:
            try:
                priceInfo = self.tmallInitApijson['defaultModel'][
                    'itemPriceResultDO']['priceInfo']
                if priceInfo:
                    if priceInfo.has_key('def'):
                        defaultPriceInfo = priceInfo['def']
                    else:
                        defaultPriceInfo = priceInfo[priceInfo.keys()[0]]
                    # 2013-11-22: changed to fetch the real promotion price instead of the price after commission deduction
                    if defaultPriceInfo.has_key(
                            'promotionList'
                    ) and defaultPriceInfo['promotionList']:
                        price = defaultPriceInfo['promotionList'][0]['price']
                    if not price:
                        if defaultPriceInfo.has_key('price'):
                            price = defaultPriceInfo['price']
                    if not price:
                        if defaultPriceInfo.has_key('promPrice'):
                            price = defaultPriceInfo['promPrice']['price']
                        elif defaultPriceInfo.has_key(
                                'promotionList'
                        ) and defaultPriceInfo['promotionList']:
                            price = str(
                                min([
                                    float(x.get('price', '100000000.0'))
                                    for x in defaultPriceInfo['promotionList']
                                ]))
            except:
                logger.warn("Parse tmall json price failed, %s", self.item_id)

        if not price:
            if self.promoteUrl2:
                self.promoteContent = self.crawl_page(
                    self.promoteUrl2).replace('&quot;', '"')
                tag = "low:"
                if self.promoteContent.find(tag) > 0:
                    pos = self.promoteContent.find(tag) + len(tag)
                    pos2 = self.promoteContent.find(',', pos)
                    price = self.promoteContent[pos:pos2]
                if not price:
                    price = get_num_val(self.promoteContent, 'price')
            else:
                self.promoteUrl = "http://marketing.taobao.com/home/promotion/item_promotion_list.do?itemId=%s" % self.num_id
                self.promoteContent = self.crawl_page(self.promoteUrl)
                if self.promoteContent:
                    self.promoteContent = self.promoteContent.replace(
                        '"', '&quot;')
                    tag = "promPrice&quot;:&quot;"
                    if self.promoteContent.find(tag) > 0:
                        pos = self.promoteContent.find(tag) + len(tag)
                        pos2 = self.promoteContent.find('&quot;', pos)
                        price = self.promoteContent[pos:pos2]

        if not price:
            tbPrice = self.html_obj.xpath("//strong[@class='tb-price']/text()")
            tbPrice1 = self.html_obj.xpath("//span[@class='tb-price']/text()")
            if tbPrice and tbPrice[0].strip():
                price = tbPrice[0].strip()
            elif tbPrice1 and tbPrice1[0].strip():
                price = tbPrice1[0].strip()

        if price.find("-") > 0:
            price = price.split('-')[0].strip()

        if not price:
            rg_m = re.compile('price:\"[0-9]+[.][0-9]+\"', re.IGNORECASE
                              | re.DOTALL).search(self.dynamicStockData)
            if rg_m:
                price_str = rg_m.group(0).split(":")[1].replace("\"", "")
                price = Decimal(price_str)

        # 2013-09-03  get price url
        if not price:
            # this part is slightly fiddly: it is mostly string handling
            price_url = "http://ajax.tbcdn.cn/json/umpStock.htm?itemId=%s&p=1" % self.num_id
            response = download(price_url, self.headers)
            rg = re.compile('price:\"[0-9]+[.][0-9]+\"',
                            re.IGNORECASE | re.DOTALL)
            m = rg.search(response.decode('gb18030').encode('utf8'))
            if m:
                price_str = m.group(0).split(":")[1].replace("\"", "")
                price = Decimal(price_str)

        # not chuxiao price, set origin price
        if not price:
            if self.originPrice:
                price = self.originPrice[0].strip()
            elif self.bidPrice:
                price = self.bidPrice[0].strip()
            if price.find("-") > 0:
                price = price.split('-')[0].strip()

        self.price = float(price)
        logger.debug("%s price is %s", self.item_id, self.price)
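
crawl_price() extracts values from inline JavaScript with get_val() and get_num_val(), helpers that are not shown in this listing. A rough, purely illustrative regex-based sketch of what they appear to do (return the quoted, respectively numeric, value that follows a given key):

import re

def get_val(data, key):
    # Hypothetical sketch: return the quoted string value following `key` in a
    # JS/JSON-like blob, e.g. apiPromoData:"..." -> the quoted string.
    m = re.search(r'%s\s*[:=]\s*["\']([^"\']*)["\']' % re.escape(key), data or "")
    return m.group(1) if m else ""

def get_num_val(data, key):
    # Same idea for an unquoted or quoted numeric value, e.g. price:12.5 -> "12.5".
    m = re.search(r'%s["\']?\s*[:=]\s*["\']?([0-9]+(?:\.[0-9]+)?)' % re.escape(key), data or "")
    return m.group(1) if m else ""
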
Example #35
 def crawl_page(self, url):
     result = download(url, self.headers)
     return result
Example #36
 def download_image(self, url):
     t = time.time()
     data = download(url, headers=self.headers)
     spent = time.time() - t
     Statsd.timing("guang.crawl.image", spent * 1000, host=self.statshost, port=self.statsport)
     return data
 def crawl_page(self, url):
     result = download(url, self.headers)
     return result
    def crawl_price(self):
        self.promoteUrl2 = get_val(self.data, "apiPromoData")
        if self.promoteUrl2:
            self.promoteUrl2 = self.promoteUrl2.replace(r'''\/''', "/")

        price = ""
        if self.is_tmall and self.tmallInitApi and self.tmallInitApijson:
            try:
                priceInfo = self.tmallInitApijson['defaultModel']['itemPriceResultDO']['priceInfo']
                if priceInfo:
                    if priceInfo.has_key('def'):
                        defaultPriceInfo = priceInfo['def']
                    else:
                        defaultPriceInfo = priceInfo[priceInfo.keys()[0]]

                    if defaultPriceInfo.has_key('promPrice'):
                        price = defaultPriceInfo['promPrice']['price']
                    elif defaultPriceInfo.has_key('promotionList') and defaultPriceInfo['promotionList']:
                        price = str(min([float(x.get('price','100000000.0')) for x in defaultPriceInfo['promotionList']]))
                    else:
                        price = defaultPriceInfo['price']
            except:
                logger.warn("Parse tmall json price failed, %s", self.item_id)

        if not price:
            if self.promoteUrl2:
                self.promoteContent = self.crawl_page(self.promoteUrl2).replace('&quot;', '"')
                tag = "low:"
                if self.promoteContent.find(tag) > 0:
                    pos = self.promoteContent.find(tag) + len(tag)
                    pos2 = self.promoteContent.find(',', pos)
                    price = self.promoteContent[pos:pos2]
                if not price:
                    price = get_num_val(self.promoteContent, 'price')
            else:
                self.promoteUrl = "http://marketing.taobao.com/home/promotion/item_promotion_list.do?itemId=%s" % self.num_id
                self.promoteContent = self.crawl_page(self.promoteUrl).replace('"', '&quot;')
                tag = "promPrice&quot;:&quot;"
                if self.promoteContent.find(tag) > 0:
                    pos = self.promoteContent.find(tag) + len(tag)
                    pos2 = self.promoteContent.find('&quot;', pos)
                    price = self.promoteContent[pos:pos2]

        if not price:
            tbPrice = self.html_obj.xpath("//strong[@class='tb-price']/text()")
            tbPrice1 = self.html_obj.xpath("//span[@class='tb-price']/text()")
            if tbPrice and tbPrice[0].strip():
                price = tbPrice[0].strip()
            elif tbPrice1 and tbPrice1[0].strip():
                price = tbPrice1[0].strip()

        if price.find("-") > 0:
            price = price.split('-')[0].strip()

        # 2013-09-03  get price url
        if not price:
            # this part is slightly fiddly: it is mostly string handling
            price_url = "http://ajax.tbcdn.cn/json/umpStock.htm?itemId=%s&p=1" % self.num_id
            response = download(price_url, self.headers)
            rg = re.compile('price:\"[0-9]+[.][0-9]+\"', re.IGNORECASE|re.DOTALL)
            m = rg.search(response.decode('gb18030').encode('utf8'))
            if m:
                price_str = m.group(0).split(":")[1].replace("\"", "")
                price = Decimal(price_str)

        # not chuxiao price, set origin price
        if not price:
            if self.originPrice:
                price = self.originPrice[0].strip()
            elif self.bidPrice:
                price = self.bidPrice[0].strip()
            if price.find("-") > 0:
                price = price.split('-')[0].strip()

        self.price = float(price)
        logger.debug("%s price is %s", self.item_id, self.price)