def name_en(cls, input_value):
    """Extract the English part of a movie title from an <h1> HTML fragment.

    Titles look like "vn-part - en-part"; the segment after the last dash is
    returned.  Relies on Python-2 ``str()`` semantics: encoding a non-ASCII
    (non-English) title raises UnicodeEncodeError, in which case ``None`` is
    returned.
    """
    try:
        if isinstance(input_value, list):
            input_value = ''.join(input_value)
        name = Selector(text=input_value).xpath("//h1//text()")
        if len(name.re("-.*-(.*)")) > 0:
            # Double-dash title: keep the trailing segment.
            try:
                return str(''.join(name.re("-.*-(.*)")))
            except UnicodeEncodeError:
                # Non-ASCII -> not an English name.
                return
        elif len(name.re("-(.*)")) > 0:
            # Single dash: return the match list after the dash.
            return name.re("-(.*)")
        else:
            # No dash at all: fall back to the raw <h1> text.
            name = name.extract()
            if isinstance(name, list):
                name = ''.join(name)
            try:
                return str(name)
            except UnicodeEncodeError:
                return
    except Exception as e:
        # BUGFIX: was the Python-2-only statement form ``print e``; the rest
        # of this file uses the print() function, which works on both 2 and 3.
        print(e)
def name_vn(cls, input_value):
    """Extract the Vietnamese part of a movie title from an HTML fragment.

    Relies on Python-2 ``str()`` semantics: encoding a Vietnamese (non-ASCII)
    title raises UnicodeEncodeError, which is the signal that the text really
    is Vietnamese and should be returned; a title that encodes cleanly is
    treated as English and ``None`` is returned instead.
    """
    if isinstance(input_value, list):
        input_value = ''.join(input_value)
    crumbs = Selector(text=input_value).xpath(
        "//span[@itemprop='title']//text()").extract()
    try:
        crumbs = str(''.join(crumbs))
        title = Selector(text=input_value).xpath("//h1//text()")
        double_dash = title.re("(.*-.*-.*)")
        if len(double_dash) > 0:
            try:
                str(''.join(double_dash))
            except UnicodeEncodeError:
                # Non-ASCII: this is the Vietnamese title.
                return ''.join(double_dash)
            # ASCII-only -> English title, no Vietnamese name.
            return
        single_dash = title.re("(.*)-")
        if len(single_dash) > 0:
            # Part before the dash; returned as a match list like the original.
            return single_dash
        raw = title.extract()
        if isinstance(raw, list):
            raw = ''.join(raw)
        try:
            str(raw)
        except UnicodeEncodeError:
            return raw
        return
    except UnicodeEncodeError:
        # Breadcrumb title itself is non-ASCII: return it joined.
        return ''.join(crumbs)
def neirong(self, response):
    """Parse one movie detail page into a DianyingItem (name, cast, dates,
    synopsis, Douban score and download links)."""
    sel = Selector(response)
    item = DianyingItem()
    item['moviename'] = sel.xpath('//h1[@class="font14w"]/text()').extract()
    # Short intro: first two <span>s of the info block next to the title.
    item['jianjie'] = ''.join(
        sel.xpath(
            '//div[@class="info" and child::h1[@class="font14w"]]/span/text()'
        )[0:2].extract()).strip()
    item['actor'] = sel.xpath(
        '//span/a[contains(@href,"actor")]/text()').extract()
    item['kind'] = sel.xpath(
        '//span/a[contains(@href,"----")]/text()').extract()
    item['country'] = sel.xpath(
        '//span[child::span[contains(text(),"地区")]]/a/text()').extract()
    item['language'] = sel.xpath(
        '//span[child::span[contains(text(),"语言")]]/a/text()').extract()
    item['daoyan'] = sel.xpath(
        '//span/a[contains(@href,"dir")]/text()').extract()
    # Release / runtime / update-date come from regexes over the raw body.
    item['sysj'] = sel.re('上映日期:.*?(\d{4}-\d{2}-\d{2})')
    item['pc'] = sel.re('片长:\D+?(\d+[\u4E00-\u9FA5]+)')
    item['gxsj'] = sel.re('更新日期:.*?(\d{4}-\d{2}-\d{2})')
    item['jqjs'] = ''.join(
        sel.xpath('//div[@id="movie_content"]/text()').extract()).strip()
    item['dbpf'] = sel.xpath(
        '//span[child::span[contains(text(),"豆瓣评分")]]/text()').re('\d+.\d+')
    # Download links come from two places: anchor hrefs plus checkbox values.
    links = sel.xpath(
        '//div[@id="cpdl2list"]//a[@rel="nofollow"]/@href').extract()
    links.extend(sel.xpath('//input[@class="checkone"]/@value').extract())
    item['downlink'] = links
    print(item)
    return item
def _enrich_same_part(self, item, response):
    """Fill the fields shared by every variant of this product from the
    inline ``var model_nbr/model/styles`` JS assignments, then count the page."""
    selector = Selector(response)
    item['NBR'] = ''.join(selector.re(r'var model_nbr = (.*);')).strip()
    # model and styles are JSON blobs; strip spaces before parsing.
    for field, pattern in (('model', r'var model = (.*);'),
                           ('styles', r'var styles = (.*);')):
        raw = ''.join(selector.re(pattern)).strip().replace(' ', '')
        item[field] = json.loads(raw)
    meta = response.meta
    self.crawler.stats.inc_crawled_pages(
        crawlid=meta['crawlid'],
        spiderid=meta['spiderid'],
        appid=meta['appid'])
def _enrich_same_part(self, item, response):
    """Populate NBR/model/styles from inline JS variables, then bump the
    crawled-pages counter for this crawl."""
    sel = Selector(response)
    item['NBR'] = ''.join(sel.re(r'var model_nbr = (.*);')).strip()
    model_raw = ''.join(sel.re(r'var model = (.*);')).strip().replace(' ', '')
    item['model'] = json.loads(model_raw)
    styles_raw = ''.join(sel.re(r'var styles = (.*);')).strip().replace(' ', '')
    item['styles'] = json.loads(styles_raw)
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid'])
def _enrich_same_part(self, item, response):
    """Extract current/previous price, the JS price list and the product id."""
    sel = Selector(response)
    now_parts = sel.xpath(
        '//div[@id="productPrice"]/div/span[@class="nowPrice"]/text()').extract()
    item['nowPrice'] = ''.join(now_parts).strip()
    was_parts = sel.xpath(
        '//div[@id="productPrice"]/div/span[@class="wasPrice"]/text()').extract()
    item['wasPrice'] = ''.join(was_parts).strip()
    # price comes straight from the inline JS; note this one stays a list.
    item['price'] = sel.re(r'price: \"(.*?)\"')
    item['product_id'] = ''.join(sel.re(r'"product_id" : \["(.*?)\"'))
def parse_item(self, response):
    """Parse a Finishline product page, then chain a request for the
    scene7 image set of the current color/style."""
    print("FinishlineSpider#parse_item ...")
    self._logger.debug("FinishlineSpider#parse_item ...")
    sel = Selector(response)
    item = FinishlineItem()
    self._enrich_base_data(item, response, is_update=False)
    self._enrich_same_part(item, response)
    item['title'] = ''.join(
        sel.xpath('//h1[@id="title"]/text()').extract()).strip()
    # Each size becomes an [element-id, label] pair.
    item['size'] = [
        [''.join(size.xpath('@id').extract()),
         ''.join(size.xpath('text()').extract())]
        for size in sel.xpath('//div[@id="productSizes"]/div[@class="size"]')
    ]
    item['productDescription'] = format_html_string(
        ''.join(sel.xpath('//div[@id="productDescription"]').extract()))
    item['product_images'] = json.loads(
        ''.join(sel.re(r"JSON.parse\(\'(.*?)\'")).strip())
    item['links'] = ''.join(sel.re(r"links: \'(.*?)\'")).split(';')
    item['product_color'] = ''.join(sel.re(r'"product_color" : \["(.*?)\"'))
    item['style_color_ids'] = ''.join(sel.xpath(
        '//div[@id="styleColors"]/span[@class="styleColorIds"]/text()'
    ).extract())
    colorid = ''.join(sel.xpath('//h1[@id="title"]/@data-colorid').extract())
    styleid = ''.join(sel.xpath('//h1[@id="title"]/@data-styleid').extract())
    imageset_url = ('http://www.finishline.com/store/api/scene7/imageset/'
                    '?colorId=%s&styleId=%s' % (colorid, styleid))
    # Carry the half-built item along to parse_images via meta.
    meta = response.meta
    meta['item-half'] = item
    request = Request(url=imageset_url,
                      meta=meta,
                      callback=self.parse_images,
                      dont_filter=response.request.dont_filter)
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid'])
    print('self.crawler.stats.inc_crawled_pages::::::::::', )
    yield request
def get_comment_url(response, page_num=1):
    """Build the off-site comment-feed URL for a china.com forum thread.

    The thread id is chopped into path segments (all-but-5, 2, 2, 1 trailing
    characters), mirroring the comment server's directory layout.
    """
    sel = Selector(response)
    forum_id = sel.re(r'var\s+forumid\s*=\s*(.*);')[0]
    thread_id = sel.re(r'var\s+threadid\s*=\s*(.*);')[0]
    thread_path = '/'.join(
        [thread_id[:-5], thread_id[-5:-3], thread_id[-3:-1], thread_id[-1:]])
    return ('http://st01.club.china.com/data/thread/' + forum_id + '/'
            + thread_path + '_' + str(page_num) + '_re.js')
def parse_car_conf(self, response):
    """Locate the price/config AJAX URL embedded in the page JS and follow it."""
    car_id = response.meta.get('id')
    matches = Selector(response).re(r'url:(.*Price.*seriesId.*)')
    # Strip quotes/commas from the JS snippet, then rebase onto the main host.
    raw_url = matches[0].strip().replace('"', '')
    conf_url = self.mian_urls + raw_url.replace(',', '').split('/', 1)[1]
    yield Request(conf_url, callback=self.prase_car_conf, meta={'id': car_id})
def get_comment_list(response, post_id):
    """Yield a CommentItem per comment in a comment-feed JS response.

    The feed is almost-JSON; a series of targeted replacements repairs its
    escaping before json.loads.  Parse failures are printed and swallowed,
    matching the original best-effort behavior.
    """
    sel = Selector(response)
    # noinspection PyBroadException
    try:
        comment_json = sel.re(r'page_obj\s*=\s*(.*);\s*printReCallBack')[0]
    except:
        comment_json = None
        print('comment page error: ' + response.url)
    if not comment_json:
        return
    # noinspection PyBroadException
    try:
        repaired = (comment_json
                    .replace('"%5C%22', '\\"')
                    .replace('%5C%22"', '\\"')
                    .replace('"\\', "")
                    .replace(';" src=', ';\\" src=')
                    .replace('onerror="', "onerror='"))
        comment_obj = json.loads(repaired)
        if comment_obj:
            for comment in comment_obj['l']:
                comment_item = CommentItem()
                comment_item['post_id'] = post_id
                comment_item['comment_id'] = comment['mi']
                comment_item['author_id'] = comment['ui']
                comment_item['author_name'] = comment['nc']
                comment_item['date_time'] = comment['cd']
                comment_item['floor'] = comment['lc']
                comment_item['content'] = comment['nr']
                yield comment_item
    except:
        print('json: ' + comment_json + ' error!!')
def parse_item(self, response):
    """Scrape one Finishline product page and yield a follow-up request for
    its scene7 image set."""
    print("FinishlineSpider#parse_item ...")
    self._logger.debug("FinishlineSpider#parse_item ...")
    sel = Selector(response)
    item = FinishlineItem()
    self._enrich_base_data(item, response, is_update=False)
    self._enrich_same_part(item, response)
    title_parts = sel.xpath('//h1[@id="title"]/text()').extract()
    item['title'] = ''.join(title_parts).strip()
    size_rows = []
    for size in sel.xpath('//div[@id="productSizes"]/div[@class="size"]'):
        size_id = ''.join(size.xpath('@id').extract())
        size_label = ''.join(size.xpath('text()').extract())
        size_rows.append([size_id, size_label])
    item['size'] = size_rows
    description_html = ''.join(
        sel.xpath('//div[@id="productDescription"]').extract())
    item['productDescription'] = format_html_string(description_html)
    item['product_images'] = json.loads(
        ''.join(sel.re(r"JSON.parse\(\'(.*?)\'")).strip())
    item['links'] = ''.join(sel.re(r"links: \'(.*?)\'")).split(';')
    item['product_color'] = ''.join(sel.re(r'"product_color" : \["(.*?)\"'))
    item['style_color_ids'] = ''.join(sel.xpath(
        '//div[@id="styleColors"]/span[@class="styleColorIds"]/text()'
    ).extract())
    colorid = ''.join(sel.xpath('//h1[@id="title"]/@data-colorid').extract())
    styleid = ''.join(sel.xpath('//h1[@id="title"]/@data-styleid').extract())
    imageset_url = 'http://www.finishline.com/store/api/scene7/imageset/?colorId=%s&styleId=%s' % (
        colorid, styleid)
    # Pass the half-built item along to parse_images.
    meta = response.meta
    meta['item-half'] = item
    self.crawler.stats.inc_crawled_pages(
        crawlid=meta['crawlid'],
        spiderid=meta['spiderid'],
        appid=meta['appid'])
    print('self.crawler.stats.inc_crawled_pages::::::::::', )
    yield Request(url=imageset_url,
                  meta=meta,
                  callback=self.parse_images,
                  dont_filter=response.request.dont_filter)
def _enrich_same_part(self, item, response):
    """Extract title, description and the inline-JS JSON blobs that are the
    same for every variant of the product."""
    sel = Selector(response)
    item['title'] = ' '.join(
        sel.xpath('//*[@id="prdImage"]/h1/*//text()').extract()).strip()
    if len(item['title']) < 2:
        # Alternate layout: title lives under the product stage instead.
        item['title'] = ' '.join(
            sel.xpath('//*[@id="productStage"]/h1/*/text()').extract()).strip()
    item['productDescription'] = format_html_string(
        ''.join(sel.xpath('//div[@id="prdInfoText"]').extract()).strip())
    if len(item['productDescription']) == 0:
        item['productDescription'] = format_html_string(
            ''.join(sel.xpath('//div[@id="productDescription"]').extract()).strip())
    # "var xxx = ...;" JSON blobs, whitespace-stripped before parsing.
    for key, pattern in (
            ('stockJSON', r'var stockJSON =(.*);'),
            ('dimensions', r'var dimensions =(.*);'),
            ('dimToUnitToValJSON', r'var dimToUnitToValJSON =(.*);'),
            ('dimensionIdToNameJson', r'var dimensionIdToNameJson =(.*);'),
            ('valueIdToNameJSON', r'var valueIdToNameJSON =(.*);')):
        item[key] = json.loads(
            ''.join(sel.re(pattern)).strip().replace(' ', ''))
    # These four are matched non-greedily against the raw body instead.
    for key, pattern in (
            ('colorNames', r'var colorNames =(.*?);'),
            ('colorPrices', r'var colorPrices =(.*?);'),
            ('styleIds', r'var styleIds =(.*?);'),
            ('colorIds', r'var colorIds =(.*?);')):
        item[key] = json.loads(re_search(pattern, response.body))
def get_price(self):
    """Fetch the quote page for this symbol and return (timestamp, price);
    both are None when the last-trade-time span is missing."""
    resp = requests.get(_BASE_URL.format(self.symbol), timeout=20)
    sel = Selector(text=resp.text)
    company_id = sel.re(r"_chartConfigObject.companyId = '(\d+)';")[0]
    last_trade = sel.xpath(
        "//span[@id='ref_{}_ltt']/text()".format(company_id)).extract()
    if not last_trade:
        return None, None
    stamp = '{} {}'.format(self.date, last_trade[0].strip())
    price = sel.xpath(
        "//span[@id='ref_{}_l']/text()".format(company_id)).extract()[0]
    return stamp, price
async def get_film(self, url):
    """Resolve a film page into (film_name, bt_name, download_url).

    Returns None implicitly when either the film title or the torrent link
    text is missing from the page.
    """
    source = await html_source(url)
    bt_url = re.findall(r'href="(attach-dialog-fid-.*\.htm)"', source)
    sel = Selector(text=source)
    title_matches = sel.re(r'\[BT下载\].*B\b')
    film_name = title_matches[0] if title_matches else ''
    bt_name = sel.css('td:nth-child(1) > a::text').extract_first()
    if not (film_name and bt_name):
        return
    # The attachment "dialog" URL doubles as the download URL.
    download_url = self.domain + bt_url[0].replace('dialog', 'download')
    return film_name, bt_name, download_url
def _enrich_same_part(self, item, response):
    """Pull suggested/our price and the dtm product id out of the page."""
    sel = Selector(response)
    suggested_parts = sel.xpath(
        '//*[@id="divPricing"]/span[1]/s/text()').extract()
    item['suggested_price'] = ''.join(suggested_parts).strip()
    our_parts = sel.xpath('//*[@id="productprice"]/span/text()').extract()
    item['our_price'] = ''.join(our_parts).strip()
    item['product_id'] = ''.join(
        sel.re(r"dtmProductId = \'(.*?)\'")).strip()
def get_prise_num_dict(self, response):
    """Return the comment-votes mapping embedded in the page JS.

    Reads the single-quoted JSON string assigned to ``commentsVotes`` and
    parses it; returns {} when the captured value is empty.
    """
    sel = Selector(response)
    # Raw string: the original non-raw pattern only worked because '\s'
    # happens to survive as a literal backslash, which is an invalid-escape
    # warning in Python 3.6+; the compiled regex is unchanged.
    comment_vote = sel.re(r"var\s+commentsVotes\s*=\s*\'(.*)\',")[0]
    if comment_vote:
        return json.loads(comment_vote)
    return {}
def get_game_id_name_map(self):
    """Fetch game.js and return a {game name: game id} mapping."""
    url = 'https://m.dailiantong.com/js/game.js'
    resp = requests.get(url)
    doc = Selector(text=resp.text)
    # BUGFIX: escape the dot — the original 'JSON.parse' pattern let '.'
    # match any character, so it could also match e.g. 'JSONxparse'.
    raw = doc.re(r'JSON\.parse\(\'(.*)\'\);')[0]
    games = json.loads(raw)
    return {game['GameName']: game['GameID'] for game in games}
def get_order_list(self, game_code):
    """Fetch the JSONP-wrapped levelling-order list for one game and return
    the decoded 'LevelOrderList' payload."""
    url = 'https://server.dailiantong.com/API/AppService.ashx?Action=LevelOrderList&callback=callback&'
    params = 'IsPub=1&GameID={}&ZoneID=0&ServerID=0&SearchStr=&Sort_Str=&PageIndex=1&PageSize=20000&Price_Str=&PubCancel=0&SettleHour=0&UserID=0&TimeStamp={}&Ver=1.0&AppOS=webapp&AppID=webapp'.format(
        game_code, self.timestamp)
    # Sign is computed over the unsigned query string, then appended.
    sign = self.get_sgin("LevelOrderList", params)
    params += '&Sign={}'.format(sign)
    resp = requests.get(url + params)
    # Strip the callback(...) JSONP wrapper before decoding.
    payload = Selector(text=resp.text).re(r'callback\((.*)\)')[0]
    return json.loads(payload)['LevelOrderList']
def parse_item(self, response):
    """Parse an Amazon product detail page into an AmazonItem: category
    nodes, titles, descriptions, brand and the variation JSON blobs."""
    self._logger.info("start response in parse_item -> response type:%s"
                      % type(response).__name__)
    sel = Selector(response)
    item = AmazonItem()
    self._enrich_base_data(item, response, is_update=False)
    node_id_re = re.compile(r'node=(?P<node_id>\w+)')
    # Breadcrumb category node ids.
    breadcrumb_hrefs = sel.xpath(
        '//div[@id="wayfinding-breadcrumbs_feature_div"]//a/@href').extract()
    item['node_ids'] = [
        node_id_re.search(href).group('node_id')
        for href in breadcrumb_hrefs if node_id_re.search(href)
    ]
    # "Look for Similar Items by Category" link groups.
    similar_link_groups = [
        p.xpath('a/@href').extract()
        for p in sel.xpath('//div[@id="browse_feature_div"]/div/p')
    ]
    item['similar_node_ids'] = [
        [node_id_re.search(href).group('node_id') for href in links]
        for links in similar_link_groups
    ]
    item['parent_asin'] = ''.join(sel.re(r'"parent_asin":"(.*?)"')).strip()
    if len(item['parent_asin']) == 0:
        # Fall back to the add-to-cart form's ASIN field.
        item['parent_asin'] = ''.join(sel.xpath(
            '//form[@id="addToCart"]/input[@id="ASIN"]/@value'
        ).extract()).strip()
    item['title'] = ''.join(
        sel.xpath('//span[@id="productTitle"]/text()').extract()).strip()
    item['product_specifications'] = format_html_string(''.join(sel.xpath(
        '//div[@id="technicalSpecifications_feature_div"]//table'
    ).extract()).strip())
    item['product_description'] = format_html_string(''.join(sel.xpath(
        '//div[@id="productDescription"]//p/text()').extract()).strip())
    # Brand: prefer the /<brand>/b/ href, else the visible link text.
    brand_href = ''.join(sel.xpath('//a[@id="brand"]/@href').extract()).strip()
    brand_match = re.compile(r'^/(?P<brand>.*)/b/').search(brand_href)
    if brand_match:
        item['brand'] = brand_match.group('brand')
    else:
        item['brand'] = ''.join(
            sel.xpath('//a[@id="brand"]/text()').extract()).strip()
    item['feature'] = format_html_string(''.join(
        sel.xpath('//div[@id="feature-bullets"]').extract()).strip())
    item['dimensions_display'] = safely_json_loads(format_html_string(
        ''.join(sel.re(r'"dimensionsDisplay":(.*?]),')).strip()))
    item['variations_data'] = safely_json_loads(''.join(
        sel.re(r'"dimensionValuesDisplayData":(.*?]}),')).strip())
    enrich_color_images(item, sel)
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid'])
    return item
def get_info(self):
    """Log in via a stored cookie, scrape the market-stats heading and
    persist the registration/application counters through write_sql."""
    # Masked credentials / login-page locators for the cookie helper.
    xpath_info = {
        "username": '******',
        "password": '******',
        "login_button": '//*[@id="login"]',
        "check_code": "",
        "code_image_url": "",
        "success_ele": ""
    }
    session = Session()
    # Obtain the login cookie (no-captcha flow) and attach it to the session.
    cookie = self.no_check_get_cookie(xpath_info)
    print(cookie)
    session.cookies.update(cookie_to_dict(cookie))
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36",
    }
    resp = session.get(
        "http://demand.lianfen360.com/market?id=mk103BP3r&login=true",
        headers=headers)
    stats_node = Selector(
        text=resp.text).xpath('/html/body/div/div[6]/div/h2/text()')
    print(stats_node)
    self.write_sql({
        "注册人数": stats_node.re('注册数:(\d+)')[0],
        "实名人数": "null",
        "申请人数": stats_node.re('申请数:(\d+)')[0],
        "放款人数": "null",
        "备注": ''
    })
def prase_url(self, response):
    """Reassemble a URL that the page builds piecewise via JS "url += '...'"
    statements, then queue a GET for it."""
    self.captcha_check(response)
    sel = Selector(text=response.text)
    # Grab everything between the url declaration and the first url.replace.
    script = sel.re("var url = '';([\s\S]*?)url.replace")[0]
    script = script.replace("\r\n", "").replace(" ", "")
    pieces = []
    for stmt in script.split(";"):
        if stmt != "":
            # Each non-empty statement is "url+='fragment'".
            pieces.append(stmt.split("+=")[1].replace("\'", ""))
    url = "".join(pieces)
    self.add_request(
        Request(url=url,
                method="GET",
                verify=False,
                headers=self.headers,
                callback=self.prase_content,
                meta=response.meta))
def parse_zsptbs_bbs_no_boardid(self, response):
    """Discover the bbs board id on a GBK-encoded page and request page 1
    of that board's chat bbs."""
    hd_id = response.meta["hd_id"]
    stockcode = response.meta["stockcode"]
    content = response.body.decode('gbk').encode("utf-8")
    sel = Selector(text=content)
    # Raw string avoids the Python-3 invalid-escape warning for '\d';
    # the compiled regex is unchanged.
    rid_pattern = sel.re(r"boardid=\d+']")
    # 'boardid=' is 8 chars; [8:] takes everything after it.
    # NOTE(review): this keeps the trailing "']" of the match inside
    # boardid (and thus inside the URL) — confirm the target server
    # really tolerates that before tightening the slice.
    boardid = rid_pattern[0][8:]
    zsptbs_url = "http://zsptbs.p5w.net/bbs/chatbbs/left.asp?boardid=%s&pageNo=1" % boardid
    yield Request(url=zsptbs_url,
                  meta={
                      "hd_id": hd_id,
                      "pageNo": '1',
                      "boardid": boardid,
                      "stockcode": stockcode
                  },
                  callback=self.parse_zsptbs_bbs)
def parse_no_rid(self, response):
    """Find the numeric rid embedded in the page and post the first
    question-page request for it."""
    hd_id = response.meta["hd_id"]
    sel = Selector(response)
    # Raw string avoids the Python-3 invalid-escape warning for '\d';
    # 'rid=' is 4 chars, so [4:] keeps just the digits.
    rid = sel.re(r"rid=\d+")[0][4:]
    question_url = "http://ircs.p5w.net/ircs/topicInteraction/questionPage.do"
    yield FormRequest(url=question_url,
                      meta={
                          "hd_id": hd_id,
                          "pageNo": '1'
                      },
                      formdata={
                          'pageNo': str(1),
                          'rid': rid
                      },
                      callback=self.parseIndex)
def get_comment_prise(response):
    """Parse the page's 'dingJson' JS blob into a {messageid: ding} dict."""
    sel = Selector(response)
    prise_json = sel.re(r'var\s+dingJson\s*=\s*(.*);')[0]
    # The blob uses unquoted keys and single quotes; repair it into JSON.
    repaired = (('{"l":' + prise_json + '}')
                .replace('messageid', '"messageid"')
                .replace('ding', '"ding"')
                .replace('\'', '"'))
    prise_list = json.loads(repaired)['l']
    return {prise['messageid']: prise['ding'] for prise in prise_list}
def first_parse_item(self, response):
    """Emit a price item per row on page 1, then schedule pages 2..N using
    the v_PageCount JS variable."""
    sel = Selector(response)
    rows = sel.xpath('//section/div/div[1]/table/tr')
    times = rows.xpath("td[1]/text()").extract()
    prices = rows.xpath("td[3]/span/text()").extract()
    for index, tt in enumerate(times):
        res_item = ScprojectItem()
        res_item['time'] = tt
        res_item['price'] = prices[index]
        # Fixed site/market ids for this source.
        res_item['sid'] = '1111'
        res_item['mid'] = '111111'
        yield res_item
    page = sel.re(r'var v_PageCount = (\d*);')
    surl = response.url + '&page=%d'
    # BUGFIX: sel.re() returns a list and is never None, so the original
    # 'if not page is None' was always true and 'page[0]' raised IndexError
    # whenever the page count was absent. Test for a non-empty match.
    if page:
        for x in range(2, int(page[0]) + 1):
            url = surl % x
            print(url)
            yield Request(url, callback=self.parse_item)
def _enrich_same_part(self, item, response):
    """Record the now/was display prices, the JS price list and the id."""
    sel = Selector(response)
    item['nowPrice'] = ''.join(sel.xpath(
        '//div[@id="productPrice"]/div/span[@class="nowPrice"]/text()'
    ).extract()).strip()
    item['wasPrice'] = ''.join(sel.xpath(
        '//div[@id="productPrice"]/div/span[@class="wasPrice"]/text()'
    ).extract()).strip()
    # Note: 'price' intentionally stays the raw match list.
    item['price'] = sel.re(r'price: \"(.*?)\"')
    item['product_id'] = ''.join(sel.re(r'"product_id" : \["(.*?)\"'))
def parse_item_update(self, response):
    # Re-crawl pass for an existing Amazon product: refreshes price, stock
    # and shipping fields and classifies availability. Returns an item, or a
    # follow-up Request when the price lives on the "unqualified buy box" page.
    self._logger.info("start response in parse_item_update -> response type:%s" % type(response).__name__)
    item = AmazonItem()
    meta = response.meta
    self._enrich_base_data(item, response, is_update=True)
    # ASIN comes from the URL path segment .../product/<asin>/...
    item['asin'] = re_search(r'product/(.*)/', response.url)
    sel = Selector(response)
    asin_divs = sel.xpath('//input[@id="ASIN"]/@value').extract()
    if len(asin_divs) > 0:
        item['parent_asin'] = ''.join(asin_divs[0]).strip()
    else:
        item['parent_asin'] = ''
    # Size list for this ASIN, pulled out of the variations JSON blob.
    item['size'] = re_search(r'\"%s\":\[(.*?)\]' % item['asin'], ''.join(sel.re(r'"dimensionValuesDisplayData":(.*?]}),')).strip())
    item['dimensions_display'] = safely_json_loads(format_html_string(''.join(sel.re(r'"dimensionsDisplay":(.*?]),')).strip()))
    item['merchants'] = sel.xpath('//div[@id="merchant-info"]/a/text()').extract()
    # Third-party (3P) seller name / price / shipping cost.
    item['merchant_3p'] = ''.join(sel.xpath('//div[@id="soldByThirdParty"]/b/text()').extract()).strip()
    item['price_3p'] = ''.join(sel.xpath('//div[@id="soldByThirdParty"]/span[contains(@class, "price3P")]/text()').extract()).strip()
    shipping_cost_3p_string = ''.join(sel.xpath('//div[@id="soldByThirdParty"]/span[contains(@class, "shipping3P")]/text()').extract()).strip()
    item['shipping_cost_3p'] = extract_shipping_cost_price_from_shipping_cost_string(shipping_cost_3p_string)
    item['from_price'] = ''.join(sel.xpath('//div[@id="mbc"]/div[@class="a-box"]/div/span/span[@class="a-color-price"]/text()').extract()).strip()
    # Availability text can appear in several spans depending on page layout.
    availability_divs = [
        ''.join(sel.xpath('//div[@id="availability"]/span/text()').extract()),
        ''.join(sel.xpath('//span[@class="availRed"]/text()').extract()),
        ''.join(sel.xpath('//span[@class="availGreen"]/text()').extract())
    ]
    availability_str = ''.join(availability_divs).strip().lower()
    merchant_info_str = ''.join(sel.xpath('//div[@id="merchant-info"]/text()').extract()).strip().lower()
    # NOTE(review): availability_divs always has 3 elements, so the first
    # clause below is always false — presumably the joined string's length
    # was meant; confirm before changing.
    if (
        (len(availability_divs) <= 0) or
        availability_str.startswith('only') or
        availability_str.startswith('in stock') or
        availability_str.startswith('usually')
    ):
        item['availability'] = 'true'
        item['availability_reason'] = "001: %s" % availability_str
    elif (
        merchant_info_str.startswith('ships from and sold by')
    ):
        item['availability'] = 'true'
        item['availability_reason'] = "002: %s" % merchant_info_str
    elif (
        availability_str.startswith('available from')
    ):
        # Sold only by other sellers; price must be fetched from the buy box.
        item['availability'] = 'other'
        item['availability_reason'] = "003: %s" % availability_str
    elif availability_str.startswith('currently unavailable'):
        item['availability'] = 'false'
        item['availability_reason'] = "004: %s" % availability_str
    else:
        item['availability'] = 'false'
        item['availability_reason'] = '000: _'
    if item['availability'] in ['true']:
        # In stock: gather list price plus the various sale-price spans.
        item['list_price'] = ''.join([
            ''.join(sel.xpath('//div[@id="price"]//tr[1]/td[2]/text()').extract()).strip(),
            ''.join(sel.xpath('//span[@id="listPriceValue"]/text()').extract()).strip()
        ])
        item['price'] = ''.join([
            ''.join(sel.xpath('//span[@id="priceblock_ourprice"]/text()').extract()).strip(),
            ''.join(sel.xpath('//span[@id="priceblock_saleprice"]/text()').extract()).strip(),
            ''.join(sel.xpath('//span[@id="priceblock_dealprice"]/text()').extract()).strip(),
            ''.join(sel.xpath('//span[@id="actualPriceValue"]/b/text()').extract()).strip()
        ])
        if ((len(item['list_price']) + len(item['price'])) <= 0):
            # No price found anywhere: dump the body for offline debugging.
            #self.log("response body ILLEGAL: %s, %d, %d. Dumping ..." % (item['asin'], response.status, len(response.body)))
            self._logger.info("response body ILLEGAL: %s, %d, %d. Dumping ..."
                              % (item['asin'], response.status, len(response.body)))
            dump_response_body(item['asin'], response.body)
        shipping_cost_string_ourprice = ''.join(sel.xpath('//*[@id="ourprice_shippingmessage"]/span/text()').extract()).strip()
        shipping_cost_string_saleprice = ''.join(sel.xpath('//*[@id="saleprice_shippingmessage"]/span/text()').extract()).strip()
        shipping_cost_string = shipping_cost_string_ourprice or shipping_cost_string_saleprice
        item['shipping_cost'] = extract_shipping_cost_price_from_shipping_cost_string(shipping_cost_string)
        self._logger.info("Spiderid: %s Crawlid: %s yield item in parse, asin: %s" % (response.meta['spiderid'],response.meta['crawlid'],item.get("asin", "unknow")))
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid']
        )
        return item
    elif item['availability'] in ['other']:
        # Price only in the "unqualified buy box": follow its link to fetch
        # shipping cost on a second page, carrying the half-built item along.
        item['price'] = ''.join([
            ''.join(sel.xpath('//*[@id="unqualifiedBuyBox"]//span[@class="a-color-price"]/text()').extract()).strip()
        ])
        new_url = ''.join(sel.xpath('//div[@id="unqualifiedBuyBox"]/div/div[1]/a/@href').extract()).strip()
        new_url = urljoin(response.url, new_url)
        meta['item_half'] = item
        req = Request(
            url=new_url,
            meta=meta,
            callback=self.parse_shipping_cost,
            dont_filter=response.request.dont_filter
        )
        self._logger.info("Spiderid: %s Crawlid: %s yield request in parse, asin: %s" % (response.meta['spiderid'],response.meta['crawlid'],req.meta.get("asin", "unknow")))
        return req
    else:
        # Unavailable: return whatever was gathered.
        self._logger.info("yield item in parse, asin: %s" % item.get("asin", "unknow"))
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid']
        )
        return item
def _enrich_same_part(self, item, response):
    """Grab MSRP, the sale price and the dtmProductId JS variable."""
    sel = Selector(response)
    msrp_nodes = sel.xpath('//*[@id="divPricing"]/span[1]/s/text()').extract()
    item['suggested_price'] = ''.join(msrp_nodes).strip()
    price_nodes = sel.xpath('//*[@id="productprice"]/span/text()').extract()
    item['our_price'] = ''.join(price_nodes).strip()
    product_ids = sel.re(r"dtmProductId = \'(.*?)\'")
    item['product_id'] = ''.join(product_ids).strip()
def get_post_item(response):
    # Build a PostItem from a forum post page: ids, breadcrumb path, title,
    # author, level/points, timestamps, content and picture links.
    url = response.url
    sel = Selector(response)
    post_item = PostItem()
    post_item['url'] = url
    # forum/thread ids are defined as JS variables on the page.
    forum_id = sel.re(r'var\s+forumid\s*=\s*(.*);')[0]
    thread_id = sel.re(r'var\s+threadid\s*=\s*(.*);')[0]
    post_item['post_id'] = forum_id + '_' + thread_id
    path_text = sel.xpath(
        '//div[contains(@class, "breadcrumbs")]/a/text()').extract()
    path_href = sel.xpath(
        '//div[contains(@class, "breadcrumbs")]/a/@href').extract()
    post_item['path_text'] = ', '.join(path_text)
    # Breadcrumb hrefs are made absolute before being joined.
    post_item['path_href'] = ', '.join(
        [response.urljoin(p_href) for p_href in path_href if p_href])
    # string(.) flattens any nested markup in the title node to plain text.
    title = sel.xpath('//*[@id="chan_newsTitle"]').xpath(
        'string(.)').extract_first()
    post_item['title'] = check_value(title)
    key_words = sel.xpath('//meta[@name="keywords"]/@content').extract_first()
    post_item['key_words'] = check_value(key_words)
    hot_words = sel.xpath('//div[@class="hotWords"]/a/text()').extract()
    post_item['hot_words'] = ', '.join(hot_words)
    author_id = sel.xpath(
        '//span[@class="author"]/a[@name="onlineIcon"]/@_webim_ppid'
    ).extract_first()
    post_item['author_id'] = check_value(author_id)
    author_name = sel.xpath('//span[@class="author"]/a/text()').extract_first()
    post_item['author_name'] = check_value(author_name)
    level = sel.xpath('//span[@class="level"]/img/@title').extract_first()
    post_item['level'] = check_value(level)
    point = sel.xpath(
        '//div[@class="grade"]/span[not(@class)]/text()').extract_first()
    # Skips a 3-character prefix in the points text (presumably a label —
    # verify against the live page).
    if point and len(point) > 3:
        post_item['point'] = point[3:]
    else:
        post_item['point'] = ''
    date_time = sel.xpath('//li[@class="time"]/span/text()').extract_first()
    # Likewise skips the first 4 characters of the time text.
    if date_time and len(date_time) > 4:
        post_item['date_time'] = date_time[4:]
    else:
        post_item['date_time'] = ''
    # View/reply counters are loaded by a separate script; record its URL.
    num_href = sel.xpath(
        '//div[@class="postStaticData"]/span/script/@src').extract_first()
    post_item['_num_href'] = check_value(num_href)
    content, picture_hrefs = get_content(response)
    post_item['content'] = check_value(content)
    post_item['picture_hrefs'] = picture_hrefs
    # Comment ids are filled in later by the comment parser.
    post_item['comment_ids'] = []
    post_item['parse_time'] = time.time()
    return post_item
def parse_item(self, response):
    # Full-crawl pass for an Amazon product page: category nodes, title,
    # specifications/description, brand and the variation JSON blobs.
    self._logger.info("start response in parse_item -> response type:%s" %
                      type(response).__name__)
    sel = Selector(response)
    item = AmazonItem()
    self._enrich_base_data(item, response, is_update=False)
    node_id_re = re.compile(r'node=(?P<node_id>\w+)')
    # breadcrum
    node_id_hrefs = sel.xpath(
        '//div[@id="wayfinding-breadcrumbs_feature_div"]//a/@href').extract()
    item['node_ids'] = [
        node_id_re.search(x).group('node_id') for x in node_id_hrefs
        if node_id_re.search(x)
    ]
    # Look for Similar Items by Category
    similar_node_id_links = [
        x.xpath('a/@href').extract()
        for x in sel.xpath('//div[@id="browse_feature_div"]/div/p')
    ]
    # NOTE(review): unlike node_ids above, this comprehension does not filter
    # on a successful match, so a non-matching href raises AttributeError.
    item['similar_node_ids'] = [[
        node_id_re.search(x).group('node_id') for x in links
    ] for links in [links for links in similar_node_id_links]]
    item['parent_asin'] = ''.join(sel.re(r'"parent_asin":"(.*?)"')).strip()
    if len(item['parent_asin']) == 0:
        # Fall back to the add-to-cart form's ASIN field.
        item['parent_asin'] = ''.join(
            sel.xpath('//form[@id="addToCart"]/input[@id="ASIN"]/@value').
            extract()).strip()
    item['title'] = ''.join(
        sel.xpath('//span[@id="productTitle"]/text()').extract()).strip()
    item['product_specifications'] = format_html_string(''.join(
        sel.xpath('//div[@id="technicalSpecifications_feature_div"]//table'
                  ).extract()).strip())
    item['product_description'] = format_html_string(''.join(
        sel.xpath('//div[@id="productDescription"]//p/text()').extract()).
        strip())
    # Brand: prefer the /<brand>/b/ href, else the visible link text.
    brand_href = ''.join(
        sel.xpath('//a[@id="brand"]/@href').extract()).strip()
    brand_re = re.compile(r'^/(?P<brand>.*)/b/')
    m = brand_re.search(brand_href)
    if m:
        brand = brand_re.search(brand_href).group('brand')
    else:
        brand = ''.join(
            sel.xpath('//a[@id="brand"]/text()').extract()).strip()
    item['brand'] = brand
    item['feature'] = format_html_string(''.join(
        sel.xpath('//div[@id="feature-bullets"]').extract()).strip())
    item['dimensions_display'] = safely_json_loads(
        format_html_string(''.join(
            sel.re(r'"dimensionsDisplay":(.*?]),')).strip()))
    item['variations_data'] = safely_json_loads(''.join(
        sel.re(r'"dimensionValuesDisplayData":(.*?]}),')).strip())
    enrich_color_images(item, sel)
    self.crawler.stats.inc_crawled_pages(
        crawlid=response.meta['crawlid'],
        spiderid=response.meta['spiderid'],
        appid=response.meta['appid'])
    return item
def parse_item_update(self, response):
    # Re-crawl pass for an existing Amazon product: refreshes price, stock
    # and shipping fields and classifies availability. Returns an item, or a
    # follow-up Request when the price lives on the "unqualified buy box" page.
    self._logger.info(
        "start response in parse_item_update -> response type:%s" %
        type(response).__name__)
    item = AmazonItem()
    meta = response.meta
    self._enrich_base_data(item, response, is_update=True)
    # ASIN comes from the URL path segment .../product/<asin>/...
    item['asin'] = re_search(r'product/(.*)/', response.url)
    sel = Selector(response)
    asin_divs = sel.xpath('//input[@id="ASIN"]/@value').extract()
    if len(asin_divs) > 0:
        item['parent_asin'] = ''.join(asin_divs[0]).strip()
    else:
        item['parent_asin'] = ''
    # Size list for this ASIN, pulled out of the variations JSON blob.
    item['size'] = re_search(
        r'\"%s\":\[(.*?)\]' % item['asin'],
        ''.join(sel.re(r'"dimensionValuesDisplayData":(.*?]}),')).strip())
    item['dimensions_display'] = safely_json_loads(
        format_html_string(''.join(
            sel.re(r'"dimensionsDisplay":(.*?]),')).strip()))
    item['merchants'] = sel.xpath(
        '//div[@id="merchant-info"]/a/text()').extract()
    # Third-party (3P) seller name / price / shipping cost.
    item['merchant_3p'] = ''.join(
        sel.xpath(
            '//div[@id="soldByThirdParty"]/b/text()').extract()).strip()
    item['price_3p'] = ''.join(
        sel.xpath(
            '//div[@id="soldByThirdParty"]/span[contains(@class, "price3P")]/text()'
        ).extract()).strip()
    shipping_cost_3p_string = ''.join(
        sel.xpath(
            '//div[@id="soldByThirdParty"]/span[contains(@class, "shipping3P")]/text()'
        ).extract()).strip()
    item[
        'shipping_cost_3p'] = extract_shipping_cost_price_from_shipping_cost_string(
            shipping_cost_3p_string)
    item['from_price'] = ''.join(
        sel.xpath(
            '//div[@id="mbc"]/div[@class="a-box"]/div/span/span[@class="a-color-price"]/text()'
        ).extract()).strip()
    # Availability text can appear in several spans depending on page layout.
    availability_divs = [
        ''.join(
            sel.xpath('//div[@id="availability"]/span/text()').extract()),
        ''.join(sel.xpath('//span[@class="availRed"]/text()').extract()),
        ''.join(sel.xpath('//span[@class="availGreen"]/text()').extract())
    ]
    availability_str = ''.join(availability_divs).strip().lower()
    merchant_info_str = ''.join(
        sel.xpath('//div[@id="merchant-info"]/text()').extract()).strip(
        ).lower()
    # NOTE(review): availability_divs always has 3 elements, so the first
    # clause below is always false — presumably the joined string's length
    # was meant; confirm before changing.
    if ((len(availability_divs) <= 0)
            or availability_str.startswith('only')
            or availability_str.startswith('in stock') or
            availability_str.startswith('usually')):
        item['availability'] = 'true'
        item['availability_reason'] = "001: %s" % availability_str
    elif (merchant_info_str.startswith('ships from and sold by')):
        item['availability'] = 'true'
        item['availability_reason'] = "002: %s" % merchant_info_str
    elif (availability_str.startswith('available from')):
        # Sold only by other sellers; price must be fetched from the buy box.
        item['availability'] = 'other'
        item['availability_reason'] = "003: %s" % availability_str
    elif availability_str.startswith('currently unavailable'):
        item['availability'] = 'false'
        item['availability_reason'] = "004: %s" % availability_str
    else:
        item['availability'] = 'false'
        item['availability_reason'] = '000: _'
    if item['availability'] in ['true']:
        # In stock: gather list price plus the various sale-price spans.
        item['list_price'] = ''.join([
            ''.join(
                sel.xpath('//div[@id="price"]//tr[1]/td[2]/text()').
                extract()).strip(),
            ''.join(
                sel.xpath('//span[@id="listPriceValue"]/text()').
                extract()).strip()
        ])
        item['price'] = ''.join([
            ''.join(
                sel.xpath('//span[@id="priceblock_ourprice"]/text()').
                extract()).strip(),
            ''.join(
                sel.xpath('//span[@id="priceblock_saleprice"]/text()').
                extract()).strip(),
            ''.join(
                sel.xpath('//span[@id="priceblock_dealprice"]/text()').
                extract()).strip(),
            ''.join(
                sel.xpath('//span[@id="actualPriceValue"]/b/text()').
                extract()).strip()
        ])
        if ((len(item['list_price']) + len(item['price'])) <= 0):
            # No price found anywhere: dump the body for offline debugging.
            #self.log("response body ILLEGAL: %s, %d, %d. Dumping ..." % (item['asin'], response.status, len(response.body)))
            self._logger.info(
                "response body ILLEGAL: %s, %d, %d. Dumping ..." %
                (item['asin'], response.status, len(response.body)))
            dump_response_body(item['asin'], response.body)
        shipping_cost_string_ourprice = ''.join(
            sel.xpath('//*[@id="ourprice_shippingmessage"]/span/text()').
            extract()).strip()
        shipping_cost_string_saleprice = ''.join(
            sel.xpath('//*[@id="saleprice_shippingmessage"]/span/text()').
            extract()).strip()
        shipping_cost_string = shipping_cost_string_ourprice or shipping_cost_string_saleprice
        item[
            'shipping_cost'] = extract_shipping_cost_price_from_shipping_cost_string(
                shipping_cost_string)
        self._logger.info(
            "Spiderid: %s Crawlid: %s yield item in parse, asin: %s" %
            (response.meta['spiderid'], response.meta['crawlid'],
             item.get("asin", "unknow")))
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid'])
        return item
    elif item['availability'] in ['other']:
        # Price only in the "unqualified buy box": follow its link to fetch
        # shipping cost on a second page, carrying the half-built item along.
        item['price'] = ''.join([
            ''.join(
                sel.xpath(
                    '//*[@id="unqualifiedBuyBox"]//span[@class="a-color-price"]/text()'
                ).extract()).strip()
        ])
        new_url = ''.join(
            sel.xpath('//div[@id="unqualifiedBuyBox"]/div/div[1]/a/@href').
            extract()).strip()
        new_url = urljoin(response.url, new_url)
        meta['item_half'] = item
        req = Request(url=new_url,
                      meta=meta,
                      callback=self.parse_shipping_cost,
                      dont_filter=response.request.dont_filter)
        self._logger.info(
            "Spiderid: %s Crawlid: %s yield request in parse, asin: %s" %
            (response.meta['spiderid'], response.meta['crawlid'],
             req.meta.get("asin", "unknow")))
        return req
    else:
        # Unavailable: return whatever was gathered.
        self._logger.info("yield item in parse, asin: %s" %
                          item.get("asin", "unknow"))
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid'])
        return item