Example #1
 def parse_developer(self, response):
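     # Each developer stat sits in the <p> immediately following its
     # <div class="tab-title"> label, so every field reuses the same
     # label-then-sibling XPath pattern (CoinGecko's developer tab layout).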
     item = response.meta.get("item")
     hxs = HtmlXPathSelector(response)
     item["stars"] = hxs.xpath(
         '//div[@class="tab-title"][contains(text(), "Stars")]/following-sibling::p[1]/text()'
     ).extract_first()
     item["watchers"] = hxs.xpath(
         '//div[@class="tab-title"][contains(text(), "Watchers")]/following-sibling::p[1]/text()'
     ).extract_first()
     item["forks"] = hxs.xpath(
         '//div[@class="tab-title"][contains(text(), "Forks")]/following-sibling::p[1]/text()'
     ).extract_first()
     item["merged_pull_requests"] = hxs.xpath(
         '//div[@class="tab-title"][contains(text(), "Merged Pull Requests")]/following-sibling::p[1]/text()'
     ).extract_first()
     item["total_issues"] = hxs.xpath(
         '//div[@class="tab-title"][contains(text(), "Total Issues")]/following-sibling::p[1]/text()'
     ).extract_first()
     item["closed_issues"] = hxs.xpath(
         '//div[@class="tab-title"][contains(text(), "Closed Issues")]/following-sibling::p[1]/text()'
     ).extract_first()
     item["contributors"] = hxs.xpath(
         '//div[@class="tab-title"][contains(text(), "Contributors")]/following-sibling::p[1]/text()'
     ).extract_first()
     item["total_new_commits"] = hxs.xpath(
         '//div[@class="tab-title"][contains(text(), "Total new commits")]/following-sibling::p[1]/text()'
     ).extract_first()
     yield item
     self.item_counts += 1
     self.logger.info("current item counts <{}>".format(self.item_counts))
Example #2
    def parse_item(self, response):
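        # One listing row per div.list-infoBox; "writer" is assumed to be a
        # module-level csv.writer opened elsewhere in this spider module.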

        hxs = HtmlXPathSelector(response)

        meta = response.meta

        nodes = hxs.xpath("//div[@class='list-infoBox']")

        city = "".join(hxs.xpath('//a[@class="choose-city"]/span/text()').re(r'\S+'))

        for node in nodes:

            items_list = []

            title = "".join(node.xpath('.//a[1]/@title').extract())

            nowprice = "".join(node.xpath(".//i[@class='fc-org priType']/text()").extract())

            url = urljoin(self.start_urls[0],"".join(node.xpath('.//a[1]/@href').extract()))

            oldprice = "".join(node.xpath('.//p[@class="priType-s"]/s/text()').extract())

            drivetime = "".join(node.xpath('.//p[@class="fc-gray"]/descendant::text()').extract())

            items_list.append([url,title,nowprice,oldprice,drivetime,city,meta['brand_name']])

            writer.writerow([x.encode("utf8").replace("\n","").replace("\t","").replace("\r","").replace(" ","") for x in items_list[0]])

        next_page = hxs.xpath('//a[@class="next"]/@href').extract()

        if next_page:

            url = urljoin(self.start_urls[0],next_page[0])

            yield Request(url,callback=self.parse_item,meta=meta)
Example #3
    def parse(self, response):
        #print response.url
        response_selector = HtmlXPathSelector(response)
        list = response_selector.select(self.list_xpath)
        # If meta carries no 'page', this is the first fetch, i.e. page 1
        try:
            page = response.meta['page']
        except KeyError:
            page = 1
        # Request each entry's home page
        for sel in list:
            home = sel.select(self.home_xpath).extract()[0]
            home_link = home[4:]
            #print home_link
            yield scrapy.Request(url=home_link,
                                 callback=self.parse_home,
                                 meta={"sub_url": response.url},
                                 headers=DEFAULT_REQUEST_HEADERS)
        # The next-page info is a string like PageList(1,10,10,104,'gyyclass=0&keyword=&CY_id=0&city=0',8)
        nextMsg = response_selector.select(self.nextpage_xpath).extract()[0]
        # Parse the current page number out of that string
        currentPage = nextMsg.split(",")[0].split("(")[1]
        # If the current page equals the page carried in meta, request the next page; otherwise stop. E.g. when page is 12 but no page 12 exists, currentPage stays at 11, meaning the last page has been reached.
        if cmp(str(page), str(currentPage)) == 0:
            if page == 1:
                url = response.url + "?page=" + str(page + 1)
            else:
                url = response.url.split("=")[0] + "=" + str(page + 1)
            # Request the next page
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 meta={"page": page + 1},
                                 headers=DEFAULT_REQUEST_HEADERS)
Example #4
    def parse_home(self, response):
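        # home_field_xpath is assumed to be a dict of field name -> XPath on
        # the spider; every field defaults to "" when nothing matches.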
        response_selector = HtmlXPathSelector(response)
        field_value = {}
        field_value['url'] = response.url
        field_value['sub_url'] = response.meta["sub_url"]
        for field_name in self.home_field_xpath:
            field_value[field_name] = ""
            if len(
                    response_selector.select(
                        self.home_field_xpath[field_name]).extract()) > 0:
                if field_name == "industry":
                    # 'industry' can hold several values; join them into a comma-separated string
                    industries = response_selector.select(
                        self.home_field_xpath[field_name]).extract()
                    field_value[field_name] = ",".join(industries)
                else:
                    field_value[field_name] = response_selector.select(
                        self.home_field_xpath[field_name]).extract()[0]
        # Build the absolute image URL from the response host
        field_value["image"] = "http://" + response.url.split(
            "/")[2] + field_value["image"]

        yield scrapy.Request(url=response.url + "detail/",
                             callback=self.parse_detail,
                             meta=field_value,
                             headers=DEFAULT_REQUEST_HEADERS)
Example #5
    def parse_company(self, response):
        response_selector = HtmlXPathSelector(response)
        list = response_selector.select(self.companylist_xpath)
        #print list
        for sel in list:
            item = ParkCompanyItem()
            company_name = sel.select(self.companyname_xpath).extract()[0]
            #print company_name
            # print response.meta["park_md5"]
            item['company_name'] = company_name
            item["park_md5"] = response.meta["park_md5"]
            yield item
        # Next page
        page = response.meta['page']
        # Pagination works the same way as on the park page
        if len(response_selector.select(self.nextpage_xpath).extract()) > 0:
            nextMsg = response_selector.select(
                self.nextpage_xpath).extract()[0]
            currentPage = nextMsg.split(",")[0].split("(")[1]
            if cmp(str(page), str(currentPage)) == 0:

                if page == 1:
                    url = response.url + "?page=" + str(page + 1)
                else:
                    url = response.url.split("=")[0] + "=" + str(page + 1)
                print url
                yield scrapy.Request(url=url,
                                     callback=self.parse_company,
                                     meta={
                                         "park_md5": item["park_md5"],
                                         "page": page + 1
                                     },
                                     headers=DEFAULT_REQUEST_HEADERS)
Example #6
    def parse_yhzc(self, response):
        response_selector = HtmlXPathSelector(response)
        yhzcList = response_selector.select(self.yhzc_xpath).extract()
        yhzcStr = "".join(yhzcList)

        message = response.meta
        # Assemble the park item
        item = ParkBaseItem()
        item['preferential'] = yhzcStr.decode("utf-8")
        item['name'] = message['name']
        item['level'] = message['level']
        item['address'] = message['address']
        item['area'] = message['area']
        item['industry'] = message['industry']
        item['image'] = message['image']
        item['detail'] = message['detail']
        item['plan'] = message['plan']
        item['url'] = message['url']
        item["sub_url"] = message["sub_url"]
        m2 = hashlib.md5()
        m2.update(item["url"])
        item["url_md5"] = m2.hexdigest()

        item['created'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                        time.localtime(time.time()))

        # print item
        yield item
        yield scrapy.Request(url=item['url'] + "company/",
                             callback=self.parse_company,
                             meta={
                                 "park_md5": item["url_md5"],
                                 "page": 1
                             },
                             headers=DEFAULT_REQUEST_HEADERS)
Example #7
 def test_htmlxpathselector(self):
     with warnings.catch_warnings(record=True):
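         # The deprecated selector still answers .select() (under a warning),
         # but .css() must raise: XPath-flavoured selectors reject CSS queries.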
         hs = HtmlXPathSelector(text=self.text)
         self.assertEqual(
             hs.select("//div").extract(),
             [u'<div><img src="a.jpg"><p>Hello</p></div>'])
         self.assertRaises(RuntimeError, hs.css, 'div')
Example #8
 def test_htmlxpathselector(self):
     with warnings.catch_warnings():
         warnings.simplefilter('ignore', ScrapyDeprecationWarning)
         hs = HtmlXPathSelector(text=self.text)
         self.assertEqual(hs.select("//div").extract(),
                          [u'<div><img src="a.jpg"><p>Hello</p></div>'])
         self.assertRaises(RuntimeError, hs.css, 'div')
Example #9
 def test_htmlxpathselector(self):
     with warnings.catch_warnings():
         warnings.simplefilter('ignore', ScrapyDeprecationWarning)
         hs = HtmlXPathSelector(text=self.text)
         self.assertEqual(
             hs.select("//div").extract(),
             [u'<div><img src="a.jpg"><p>Hello</p></div>'])
         self.assertRaises(RuntimeError, hs.css, 'div')
Example #10
    def parse_cats(self, response):

        hxs = HtmlXPathSelector(response)
        item = Link2LinkItem(response.meta['item'])
        all_product_links = hxs.select("//div[@id='contentFull']/fieldset[2]/div/p/a/@href").extract()

        for product_link in all_product_links:
            yield Request(self.get_url(product_link), callback=self.parse_products, meta={'item': item})
Example #11
  def parse_list(self, response):
    hxs = HtmlXPathSelector(response)
    activity_nodes = hxs.select("//li[@class='list-entry']/div[2]/div/a")

    for node in activity_nodes:
      activity_url = self.href(node)
      self.log('Found activity url: %s' % activity_url)
      yield Request(activity_url, callback = self.parse_activity)
Example #12
   def parse(self, response):
      hxs = HtmlXPathSelector(response)
      items = hxs.select('//*[@id="col3_content"]/div/div/div/a')

      for item in items:
         url = urljoin(response.url, item.select('./@href').extract()[0])
         self.log("Following URL %s" % (url), level=log.DEBUG)
         yield Request(url, callback=self.parseLineType)
Example #13
    def parse_torrent(self, response):
        x = HtmlXPathSelector(response)
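        # Straightforward field scrape in the style of the old Scrapy tutorial;
        # .extract() returns a list of strings, so each field is stored as one.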

        torrent = TorrentItem()
        torrent['url'] = response.url
        torrent['name'] = x.select("//h1/text()").extract()
        torrent['description'] = x.select("//div[@id='description']").extract()
        torrent['size'] = x.select("//div[@id='info-left']/p[2]/text()[2]").extract()
        return torrent
Example #14
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     brands = hxs.select("//div[@id='contentFull']/div/p/a/@href")
     # self.item = Link2LinkItem()
     item = Link2LinkItem()
     for brand in brands:
         brand_page = brand.extract()
         request = Request(self.get_url(brand_page), callback=self.parse_brands,meta={'item':item})
         yield request
Example #15
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     appsl = hxs.select('//div[contains(@class,"loops-wrapper list-thumb-image")]/div')
     for entry in appsl:
         item = AppslistItem()
         item["title"] = entry.select("div/h2/a/text()").extract()
         item["image_link"] = entry.select("p/a/img/@src").extract()
         item["desc"] = entry.select("div/p/text()").extract()
         yield item
Example #16
 def parse_plan(self, response):
     response_selector = HtmlXPathSelector(response)
     planList = response_selector.select(self.plan_xpath).extract()
     planStr = "".join(planList)
     baseUrl = response.meta['url']
     response.meta['plan'] = planStr.decode("utf-8")
     yield scrapy.Request(url=baseUrl + "yhzc/",
                          callback=self.parse_yhzc,
                          meta=response.meta,
                          headers=DEFAULT_REQUEST_HEADERS)
Example #17
 def parse_carro(self, response):
     hxs = HtmlXPathSelector(response)
     carro = Carro()
     carro['modelo'] = hxs.select('//div[@class="section primary"]/h1/text()').extract()[0]
     anio, tipo = hxs.select('//div[@class="section primary"]/h4/text()').extract()[0].split("|")
     carro['anio'] = anio
     carro['tipo'] = tipo
     carro['precio'] = hxs.select('//div[@class="section primary"]/h3/text()').extract()[0].split(" ")[-1]
     carro['url'] = response.url
     yield carro
Example #18
    def parse_brands(self, response):

        hxs = HtmlXPathSelector(response)
        brands = hxs.select("//div[@id='contentFull']/fieldset[2]/div/p/a/@href")
        # for brand in brands:
        item = Link2LinkItem(response.meta['item'])
        products_category = hxs.select("//*[@id='contentFull']/fieldset[2]/div/p[2]/a/text()").extract()
        item['Brand'] = hxs.select("//*[@id='contentFull']/h1/text()").extract()
        if "Products" in hxs.select("//*[@id='contentFull']/fieldset[2]/legend/text()").extract()[0]:
            #Catagory exsist, i.e. Dog, Cat
            all_catagories_links = hxs.select("//*[@id='contentFull']/fieldset[2]/div/p[2]/a/@href").extract()
            index=0
            # c_list=[]
            for product in products_category:
                item = Link2LinkItem(response.meta['item'])
                item['Brand'] = hxs.select("//*[@id='contentFull']/h1/text()").extract()
                catatory_link = all_catagories_links[index]
                item['Products_Category'] = product
                index = index + 1
                # yield  item
                yield Request(self.get_url(catatory_link), callback=self.parse_cats, meta={'item': item})

        else:

            #direct product link is available.
            item['Brand'] = hxs.select("//*[@id='contentFull']/h1/text()").extract()
            if "Products_Category" not in item:
                item['Products_Category'] = "Not Available"
            all_product_links = hxs.select("//div[@id='contentFull']/fieldset[2]/div/p/a/@href").extract()

            for product_link in all_product_links:
                yield Request(self.get_url(product_link), callback=self.parse_products, meta={'item': item})
Example #19
    def parse_detail(self, response):

        response_selector = HtmlXPathSelector(response)
        detailList = response_selector.select(self.detail_xpath).extract()
        detailStr = "".join(detailList)
        baseUrl = response.meta['url']
        response.meta['detail'] = detailStr.decode("utf-8")
        yield scrapy.Request(url=baseUrl + "plan/",
                             callback=self.parse_plan,
                             meta=response.meta,
                             headers=DEFAULT_REQUEST_HEADERS)
Example #20
    def parse(self, response):

        hxs = HtmlXPathSelector(response)

        print "*" * 66

        print hxs.xpath("//script[@class='J_auto-load']/text()").extract()

        print "-" * 66

        return
Example #21
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     item = DeputeInfoItem()
     item["party"] = hxs.select('//td[@id="cbfv_55"]/text()').extract()
     item["name"] = hxs.select('//h1[@class="title"]/span/span/text()').extract()
     account = hxs.select('//td[@id="cbfv_60"]/a/text()').extract()
     print(account)
     if(len(account) != 0):
         item["twitter"] = account[0].split('/')[-1]
     else:
         item["twitter"] = []
     yield item
Example #22
    def parse_renqizhishu(self, response):

        html = HtmlXPathSelector(response)

        popularity_ranking = ''.join(
            html.xpath(u"//*[contains(text(),'第')]/text()").extract())

        item = {"popularity_ranking": popularity_ranking}

        self.con.hmset(self.jobid, item)

        del response
Example #23
    def parse_district_num(self, response):

        html = HtmlXPathSelector(response)

        district_num = ''.join(
            html.xpath("//span[@class='num']/text()").extract()).replace(
                "(", "").replace(")", "")

        item = {"district_num": district_num}

        self.con.hmset(self.jobid, item)

        del response
Example #24
 def parseLineType(self, response):
    hxs = HtmlXPathSelector(response)
    lineType = hxs.select('//*[@id="pagebar"]/h1/text()').extract()[0].strip()
    self.log('Processing %s...' % (lineType), level=log.DEBUG)
    
    items = hxs.select('//*[@id="tbl_fahrplaene"]/tbody/tr/td[2]/a')
    for item in items:
       url = urljoin(response.url, item.select('./@href').extract()[0])
       req = Request(url, callback=self.parseFahrplan)
       req.meta['lineType'] = lineType
       req.meta['lineName'] = item.select('./text()').extract()[0].strip()
       self.log("Following URL %s" % (url), level=log.DEBUG)
       yield req
Example #25
    def parse(self, response):

        hxs = HtmlXPathSelector(response)

        all_city_name = hxs.xpath('//div[@class="all-city"]/descendant::a[contains(@data-gzlog,"select_city")]/text()').re(r'\S+')

        all_city_url = hxs.xpath('//div[@class="all-city"]/descendant::a[contains(@data-gzlog,"select_city")]/@href').extract()

        for item in zip(all_city_name,all_city_url):

            if len(item) ==2:

                yield Request(urljoin(self.start_urls[0],item[1]),callback=self.parse_list)
Example #26
    def parse_order_year(self, response):

        hxs = HtmlXPathSelector(response)
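        # The year filter links keep their query fragment in a custom "_val"
        # attribute; only the first two (presumably the most recent) years are kept.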

        order_urls = hxs.xpath(
            "//div[@class='time-list']/ul/li[position()>1]/a/@_val").extract(
            )[0:2]

        order_url = "https://order.jd.com/center/list.action?search=0&d="

        for urls in order_urls:

            yield Request(url=order_url + urls, callback=self.parse_order_list)
Example #27
    def parse_list(self,response):

        hxs = HtmlXPathSelector(response)

        all_brand_name = hxs.xpath('//span[contains(@class,"brand-all")]/descendant::a/text()').re(r'\S+')

        all_brand_url = hxs.xpath('//span[contains(@class,"brand-all")]/descendant::a/@href').extract()

        for item in zip(all_brand_name,all_brand_url):

            if len(item) ==2:

                yield Request(urljoin(self.start_urls[0], item[1]), callback=self.parse_item,meta={'brand_name':item[0]})
Example #28
 def parse_community(self, response):
     item = response.meta.get("item")
     coin_name = response.meta.get("coin_name")
     hxs = HtmlXPathSelector(response)
     item['subscribers'] = hxs.xpath(
         '//a[@rel="nofollow"][contains(text(),"Subscribers")]/../following-sibling::p[1]/text()'
     ).extract_first()
     item["followers"] = hxs.xpath(
         '//a[@rel="nofollow"][contains(text(),"Followers")]/../following-sibling::p[1]/text()'
     ).extract_first()
     item["likes"] = hxs.xpath(
         '//a[@rel="nofollow"][contains(text(),"Likes")]/../following-sibling::p[1]/text()'
     ).extract_first()
     item["avg_users_online"] = hxs.xpath(
         '//div[contains(@class, "social-media")][contains(text(), "Online")]/p[1]/text()'
     ).extract_first()
     item["avg_new_hot_posts_per_hour"] = hxs.xpath(
         '//div[contains(@class, "social-media")][contains(text(), "New Hot")]/p[1]/text()'
     ).extract_first()
     item["avg_new_comments_on_hot_posts_per_hour"] = hxs.xpath(
         '//div[contains(@class, "col-md")][contains(text(), "Comments")]/p[1]/text()'
     ).extract_first()
     url = "https://www.coingecko.com/en/coins/{}/developer#panel".format(
         coin_name)
     yield Request(url=url,
                   callback=self.parse_developer,
                   meta={"item": item},
                   dont_filter=True)
Example #29
    def parse_products(self, response):
        hxs = HtmlXPathSelector(response)
        item = Link2LinkItem(response.meta["item"])

        item['Specification'] = hxs.select("//div[@id='tab1']/p/text()").extract()
        item['Product_Name'] = hxs.select("//*[@id='contentFull']/h1/text()").extract()
        ga = hxs.select("//*[@id='tab2']/p/text()").extract()
        if ga:
            item['Guaranteed_Analysis'] = ga
        else:
            item['Guaranteed_Analysis'] = "Not Available"


        item['Product_Description'] = hxs.select(".//*[@id='contentFull']/p/text()").extract()
        yield item
Example #30
    def scrapeFlights(self, response):
        """Scrape the flights into a Fare() object."""

        html = HtmlXPathSelector(response)
        errors = html.select("//ul[@id='errors']/li/text()")

        # Catch errors given by Southwest's page
        if (len(errors) > 0):
            self.log("Error: %s" % theError, level=log.ERROR)
            return

        # Conveniently packaged flight info in string form for form submission
        xpath = '//div[@class="productPricing"]//input/@title'
        selectors = [
            '//table[@id="faresOutbound"]//td[@class="price_column "]' +
            xpath,  # business select
            '//table[@id="faresOutbound"]//td[@class="price_column"][1]' +
            xpath,  # anytime
            '//table[@id="faresOutbound"]//td[@class="price_column"][2]' +
            xpath  # wanna get away
        ]
        fareList = []
        for selector in selectors:
            fareList.append(html.select(selector).extract())

        # Process that info and load into a Fare() item.
        i = 0
        fareType = ["Business Select", "Anytime", "Wanna Get Away"]
        for fare in fareList:
            self.log("Faretype: %d %s" % (i, fareType[i]))
            for flightString in fare:
                if (flightString[0] == 'D'):
                    flightData = Util.parseFlight(flightString,
                                                  self.outDate.date())
                    self.log("Found: %s" % flightString)
                    flight = Fare()

                    for key in flightData:
                        flight[key] = flightData[key]
                    flight['origin'] = self.origin
                    flight['destination'] = self.destination
                    flight['date'] = self.outDate
                    flight['faretype'] = fareType[i]
                    self.log('Added')
                    yield flight
                else:
                    continue
            i += 1
Example #31
  def parse(self, response):
    hxs = HtmlXPathSelector(response)
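    # The last paginator link encodes the final offset; listing pages then
    # step by 10 through the "?start=N" query parameter.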
    last_page_node = hxs.select("//div[@class='paginator']/a[last()]")

    last_page = int(self.href(last_page_node).split('=')[1])
    pages = range(10, last_page, 10)
    #pages = []

    for page in pages:
      page_url = response.url + '?start=' + str(page)
      self.log('Found page url: %s' % page_url)
      yield Request(page_url, callback = self.parse_list)

    # process first page
    # self.parse_list(response)
    yield Request(response.url, callback=self.parse_list)
Example #32
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
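     # MongoDbUtil.MongoDbConnect is assumed to be a project helper acting as a
     # crawl frontier: finished URLs are marked s=1, and the next unvisited
     # URL (s=0) is pulled back out to keep the crawl going.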
     data = {}
     data['key'] = response.url
     data['s'] = 1
     if (MongoDbUtil.MongoDbConnect.count(self.modle, data) == 0):
         print '-------------------------------------------', response.url
         self.parseAllA(response.url, hxs)
         try:
             MongoDbUtil.MongoDbConnect.update(self.modle, data)
             self.finishUrls.append(response.url)
         except Exception as e:
             print response.url, ":", e
             data = {}
             data['key'] = response.url
             data['type'] = self.error_url
             data['e'] = e
             MongoDbUtil.MongoDbConnect.save(self.modle_error, data)
     data = {}
     data['s'] = 0
     urls = MongoDbUtil.MongoDbConnect.list(self.modle, data)
     try:
         yield Request(urls[0]['key'], callback=self.parse)
     except Exception as e:
         print "yield Request :", e
Example #33
def commons_speech_feeder(working_dir, _fetch_url=None):
    """Return a generator that yields file urls"""
    # TODO: find a faster way of doing this
    if not _fetch_url:
        _fetch_url = fetch_url
    list_url = 'http://ukparse.kforge.net/parldata/scrapedxml/debates/'
    log.debug("Fetching index")
    data = _fetch_url(list_url, "Failed to fetch index.")
    if data:
        hxs = HtmlXPathSelector(text=unicode(data, errors="ignore"))
        selector = hxs.select(r'//table//td//a/@href')
        check_href = create_href_checker(re.compile(r'^debates\d{4}'), working_dir)
        urls = selector.extract()
        log.debug("Fetched %s urls from index" % len(urls))
        for href in urls:
            if check_href(href):
                yield urlparse.urljoin(list_url, href)
Example #34
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        car_lis = hxs.select('//li[@class="standard_200 t gal-normal-mot"]')
        # Parse each car <li>
        for li in car_lis:
            carro = Carro()
            carro['modelo'] = li.select('div/h3/a/text()').extract()[0]
            anio, tipo = li.select('div[@class="itemInfo"]//h4/strong/text()').extract()[0].split("|")
            carro['anio'] = anio
            carro['tipo'] = tipo
            carro['precio'] = li.select('div[@class="itemInfo"]//li[@class="precio_gal"]/strong/text()'
                                        ).extract()[0].split(" ")[-1]
            carro['url'] = li.select('div/h3/a/@href').extract()[0]
            yield carro

        url_paginas = hxs.select("//div[@id='paginador']/a/@href").extract()
        for pagina in url_paginas:
            yield(Request(pagina, callback=self.parse))
Example #35
	def scrapeFlights(self, response):
		"""Scrape the flights into a Fare() object."""
		
		html = HtmlXPathSelector(response)
		errors = html.select("//ul[@id='errors']/li/text()")
		
		# Catch errors given by Southwest's page
		if ( len(errors) > 0 ):
			self.log("Error: %s" % theError , level=log.ERROR)
			return

		# Conveniently packaged flight info in string form for form submission
		xpath = '//div[@class="productPricing"]//input/@title'
		selectors = [ 
			'//table[@id="faresOutbound"]//td[@class="price_column "]' + xpath,   # business select
			'//table[@id="faresOutbound"]//td[@class="price_column"][1]' + xpath, # anytime
			'//table[@id="faresOutbound"]//td[@class="price_column"][2]' + xpath  # wanna get away
			]
		fareList = []
		for selector in selectors:
			fareList.append( html.select(selector).extract() )

		# Process that info and load into a Fare() item.
		i = 0
		fareType = ["Business Select", "Anytime", "Wanna Get Away"]
		for fare in fareList:
			self.log("Faretype: %d %s" % (i, fareType[i]) )
			for flightString in fare:
				if ( flightString[0] == 'D' ):
					flightData = Util.parseFlight(flightString, self.outDate.date())
					self.log("Found: %s" % flightString)
					flight = Fare()		
			
					for	key in flightData:
						flight[key] = flightData[key]
					flight['origin'] = self.origin
					flight['destination'] = self.destination
					flight['date'] = self.outDate
					flight['faretype'] = fareType[i]
					self.log('Added')
					yield flight
				else:
					continue
			i += 1		
Example #36
    def parse_safetycenter(self, response):

        hxs = HtmlXPathSelector(response)

        item = response.meta["item"]

        item["userphone"] = "".join(
            hxs.xpath("//strong[@id='mobile']/text()").re(r'\S+'))

        item["useridcard"] = "".join(
            hxs.xpath(
                u"//span[contains(text(),'您认证的实名信息:')]/following::strong[2]/text()"
            ).extract())

        order_url = "https://order.jd.com/center/list.action"

        self.items.append(item)

        yield Request(url=order_url, callback=self.parse_order_year)
Example #37
    def parse_list(self, response):
        hxs = HtmlXPathSelector(response)

        # print(hxs.extract())
        body = hxs.select('//div[@id="contents"]')

        # print(body.extract())

        item = MovieItem()

        item['title'] = body.select('//div[@id="bo_v_title"]/h1/text()').extract()[0]
        item['link'] = body.select('//div[@class="bo_v_file"]/a/@href').extract()[1]
        item['date'] = body.select('//div[@id="bo_v_info"]/table/tbody/tr/td/table/tbody/tr/td/text()').extract()[2]

        if not item['title']:
            print("Table Title")
        else:
            print(item)
            yield item
Example #38
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
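     # Walk the coin table row by row; the coin slug regexed out of the chart
     # link seeds the follow-up detail-page request chain.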
     for data in hxs.xpath("//*[@id='gecko-table']/tbody/tr"):
         item = IcoItem()
         item["name"] = data.xpath(
             './/td[@class="coin-name"]//span[@class="coin-content-name"]/text()'
         ).extract_first()
         item["symbol"] = data.xpath(
             './/td[@class="coin-name"]//span[@class="coin-content-symbol"]/text()'
         ).extract_first()
         item["img_url"] = "https:" + data.xpath(
             './/td[@class="coin-name"]//img/@data-src').extract_first()
         item["other"] = data.xpath(
             './/td[@class="coin-name"]//small/text()').extract_first()
         item["developer"] = data.xpath(
             './/td[@class="td-developer_score dev"]/div[1]/text()'
         ).extract_first()
         item["community"] = data.xpath(
             './/td[@class="td-community_score community"]/div[1]/text()'
         ).extract_first()
         item["public_interest"] = data.xpath(
             './/td[@class="td-public_interest_score pb-interest"]/div[1]/text()'
         ).extract_first()
         item["total"] = data.xpath(
             './/td[@class="total"]/div[1]/text()').extract_first()
         coin_name = data.xpath(
             './/td[@class="coin-name"]//a[@class="currency_exchangable_chart_link"]/@href'
         ).re(r'price_charts/([\S\s]+?)/usd')
         url = "https://www.coingecko.com/en/coins/{}#panel".format(
             coin_name[0])
         yield Request(url=url,
                       callback=self.parse_baseinfo,
                       meta={
                           "item": item,
                           "coin_name": coin_name[0]
                       },
                       dont_filter=True)
     next_page_url = hxs.xpath('//link[@rel="next"]/@href').extract_first()
     self.logger.info("current page url <{}>".format(next_page_url))
     if next_page_url:
         yield response.follow(next_page_url, self.parse)
Example #39
    def parse_order_list(self, response):

        hxs = HtmlXPathSelector(response)

        orders_urls = hxs.xpath("//a[@name='orderIdLinks']/@href").extract()

        headers = dict(response.request.headers)

        headers.update({"Referer": None})

        sess = {}
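        # Rebuild the session cookies as a dict so they can be re-sent
        # explicitly through the cookies= argument on each order request.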

        cookie = response.request.headers.getlist('Cookie')[0].split(";")

        for cook in cookie:

            sess.update({
                cook[:cook.index("=")]:
                cook[cook.index("=") + 1:].replace('"', "")
            })

        for order_url in orders_urls:

            if "orderId=" in order_url or "orderid" in order_url:

                #self.queues.push(Request(url=urljoin(self.start_urls[0],order_url),meta={"jobid":self.jobid},headers=headers))

                yield Request(url=urljoin(self.start_urls[0], order_url),
                              cookies=sess,
                              meta={"jobid": self.jobid},
                              callback=self.parse_items)

        next_page_url = hxs.xpath("//a[@class='next']/@href").extract()

        if next_page_url:

            for next_url in next_page_url:

                yield Request(url=urljoin(self.start_urls[0], next_url),
                              callback=self.parse_order_list)
Example #40
    def parse_userinfo(self, response):

        hxs = HtmlXPathSelector(response)

        item = response.meta["item"]

        item["userloginname"] = "".join(
            hxs.xpath("//div[@id='aliasBefore']/strong/text()").extract())

        item["usermail"] = "".join(
            hxs.xpath(
                u"//span[contains(text(),'邮箱:')]/following-sibling::div[1]/div/strong/text()"
            ).re(r'\S+'))

        item["userrealname"] = "".join(
            hxs.xpath("//input[@id='realName']/@value").extract())

        item["usertype"] = "".join(
            hxs.xpath(
                u"translate(//div[@class='info-m']/div[contains(text(),'会员类型:')]/text(),'会员类型:','')"
            ).extract())

        safetycenter_url = "https://safe.jd.com/user/paymentpassword/safetyCenter.action"

        yield Request(url=safetycenter_url,
                      callback=self.parse_safetycenter,
                      meta={"item": item})
Example #41
    def _parse_jd(self, response):
        r = response.request
        hxs = HtmlXPathSelector(response)
        items = hxs.select("//li[@class='item']/div[@class='store']")
        if items:
            yield self._jd_request(r.meta['level'], r.meta['id'], r.meta['name'], r.meta['page']+1)
        for item in items:
            name = item.select('.//h3/a/text()').extract()[0]
            rating = item.select(".//div[@class='rank']/span/@class").extract()[0]
            tags = item.select(".//div[@class='tag']/a/text()").extract()
            path = item.select('.//h3/a/@href').extract()[0]
            place = MafengwoPlace()
            place['name'] = name
            if rating:
                place['rating'] = int(rating[4:])

            place['tags'] = tags
            place['id'] = self._extract_id(path)
            place['p_id'] = r.meta['id']
            place['level'] = self._sub_level(r.meta['level'])

            yield self._jd_detail_request(place, path)
Example #42
    def parse_detail(self, response):
        print "--------------------------"
        self.logger.info("--------------------------")
        hxs = HtmlXPathSelector(response)
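        # get_xpath_val is assumed to be a spider helper returning the first
        # match of an XPath evaluated relative to the given element.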
        items = []
        price_url_pre = "http://p.3.cn/prices/mgets?skuIds=J_"
        for gl_item in hxs.xpath("//*[@id='plist']/ul/li[@class='gl-item']"):
            # self.logger.info("GGGGGGGGGGGGGGGGGGGGGGGGG: %s" % gl_item.extract())
            book_element = gl_item.xpath("div[@class='tab-content-item j-sku-item tab-cnt-i-selected']")
            if book_element is None or len(book_element) == 0:
                book_element = gl_item.xpath("div")

            data_sku_id = self.get_xpath_val(book_element, "@data-sku")
            price_url = price_url_pre + data_sku_id
            item = JdbookItem()
            item["name"] = self.get_xpath_val(book_element, "div[3]/a/em/text()")
            item["publisher"] = self.get_xpath_val(book_element, "div[4]/span[2]/a/text()")
            item["author"] = self.get_xpath_val(book_element, "div[4]/span[1]/span[1]/a[1]/text()")
            item["commit"] = self.get_xpath_val(book_element, "div[6]/strong/a/text()")
            item["shop"] = self.get_xpath_val(book_element, "div[7]/span/text()")
            r = Request(price_url, callback=self.parse_price, dont_filter=True, meta={"item": item})
            items.append(item)
            yield r
Example #43
    def sendphonecode(self, response):

        hxs = HtmlXPathSelector(response)
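        # XPath translate() strips every character of the
        # "javascript:sendFindPwdCode(|);" wrapper, leaving just the key
        # needed to request an SMS verification code.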

        sendphonecodekey = "".join(
            hxs.xpath(
                "translate(//*[@id='sendMobileCode']/@href,'javascript:sendFindPwdCode(|);','')"
            ).extract()).replace("'", "")

        if self.vercode is None:

            item = JdItem()

            logging.warning(
                msg=
                "Login jd need phone vercode, will send phone code to user phone "
            )

            item["status"] = 2

            jobid = self.settings.get("jobid", None)

            item["jobid"] = jobid

            #self.con.lpush(jobid, item)

            self.items.append(item)

            yield Request(url=self.sendcodeurl % sendphonecodekey,
                          dont_filter=True)

        else:

            yield Request(url=self.validatecodeurl %
                          (self.vercode, sendphonecodekey),
                          callback=self.checkphonekey)
Example #44
  def parse_activity(self, response):
      hxs = HtmlXPathSelector(response)
      event_info_node = hxs.select('//div[@class="event-info"]')
      title = self.text(event_info_node.select('h1'))
      event_detail_nodes = event_info_node.select('div[@class="event-detail"]')
      details = {}
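      # Each event-detail row is "label: value"; strip the tags and split on
      # ':' to build the details dict.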
      for n in event_detail_nodes:
        self.log(n.extract())
        key = self.text(n.select('span')).strip().replace(':','')
        value = re.sub('<[^<]+?>', '', n.extract()).split(':')[1].strip()
        details[key] = value

      description = '\n'.join(hxs.select('//div[@class="related_info"]/div/div/text()').extract())
      photo_urls = hxs.select('//ul[contains(@class,"event-detail-photo")]/li/a/img/@src').extract()
      photo_urls = map(lambda x:x.replace('albumicon', 'photo'), photo_urls)

      entry = Activity()
      entry['id'] = response.url
      entry['title'] = title
      entry['description'] = description
      entry['images'] = photo_urls
      entry['details'] = details

      return entry
Example #45
    def parse_next_page(self, response):
        print ("Fetch group home page: %s" % response.url)

        hxs = HtmlXPathSelector(response)
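        # The pager widget yields the total page count plus a template URL;
        # the regex splits that URL around its page=N parameter.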

        pages_num = hxs.select("//*[@id='J_bottomPage']/span[2]/em[1]/b/text()").extract()[0]
        print pages_num
        page_url = hxs.select("//*[@id='J_bottomPage']/span[1]/a[2]/@href").extract()[0]
        print page_url
        PAGE_PATTERN = r"(.+page=)[\d]+(.+)"
        parse = re.compile(PAGE_PATTERN, re.UNICODE | re.VERBOSE)
        match = parse.search(page_url)
        if match:

            pre_url = match.group(1)
            post_url = match.group(2)

            for i in xrange(1, int(pages_num) + 1):
                # iterate each page
                page_url = "http://list.jd.com" + pre_url + str(i) + post_url
                r = Request(page_url, callback=self.parse_detail)
                yield r
        else:
            print "NONONONO!"
Example #46
 def parse_baseinfo(self, response):
     item = response.meta.get("item")
     coin_name = response.meta.get("coin_name")
     hxs = HtmlXPathSelector(response)
     item['liquidity'] = hxs.xpath(
         '//div[@class="score"][contains(text(), "Liquidity")]/span/text()'
     ).extract_first()
     item["hash_algorithm"] = hxs.xpath(
         '//div[@class="tab-title"][contains(text(), "Hashing Algorithm")]/following-sibling::p[1]/text()'
     ).extract_first()
     item["hash_rate"] = hxs.xpath(
         '//div[@class="hashrate"]/p/text()').extract_first()
     item["block_time"] = hxs.xpath(
         '//div[@class="tab-title"][contains(text(), "Block Time")]/following-sibling::p[1]/text()'
     ).extract_first()
     item["homepage"] = hxs.xpath(
         '//div[@class="tab-title"][contains(text(), "Homepage")]/following-sibling::p[1]/a/text()'
     ).extract_first()
     item["block_chain_supply"] = hxs.xpath(
         '//div[@class="tab-title"][contains(text(), "Blockchain/Supply")]/following-sibling::p[1]/a/text()'
     ).extract_first()
     item["discussion_forum"] = hxs.xpath(
         '//div[@class="tab-title"][contains(text(), "Discussion Forum")]/following-sibling::p[1]/a/text()'
     ).extract_first()
     item["available_total_supply"] = hxs.xpath(
         '//div[@class="tab-title"][contains(text(), "Available/Total Supply")]/following-sibling::p[1]/text()'
     ).extract_first()
     url = "https://www.coingecko.com/en/coins/{}/social#panel".format(
         coin_name)
     yield Request(url=url,
                   callback=self.parse_community,
                   meta={
                       "item": item,
                       "coin_name": coin_name
                   },
                   dont_filter=True)
Example #47
    def parse_starts(self, response):

        jobid = response.meta["jobid"]

        html = HtmlXPathSelector(response)

        meta = response.meta
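        # Star-rating counts sit in an <em> such as "(123)" next to each
        # "N星" (N-star) filter link; the parentheses are stripped off.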

        meta["start_5"] = ''.join(
            html.xpath(u"//a[text()='5星']/following-sibling::em/text()").
            extract()).replace(")", "").replace("(", "")

        meta["strat_4"] = ''.join(
            html.xpath(u"//a[text()='4星']/following-sibling::em/text()").
            extract()).replace(")", "").replace("(", "")

        meta["start_3"] = ''.join(
            html.xpath(u"//a[text()='3星']/following-sibling::em/text()").
            extract()).replace(")", "").replace("(", "")

        meta["start_2"] = ''.join(
            html.xpath(u"//a[text()='2星']/following-sibling::em/text()").
            extract()).replace(")", "").replace("(", "")

        meta["start_1"] = ''.join(
            html.xpath(u"//a[text()='1星']/following-sibling::em/text()").
            extract()).replace(")", "").replace("(", "")

        next_page = ''.join(html.xpath(u"//a[text()='下一页']/@href").extract())

        item = {}

        item["business_details"] = meta

        self.items.append(item)

        yield Request(urljoin(meta["review_urls"], "?pageno=1"),
                      callback=self.parse_item,
                      meta=meta)

        del response
Example #48
    def parse_ballancecount(self, response):

        item = JdItem()

        hxs = HtmlXPathSelector(response)

        item["spidertime"] = time.strftime('%Y%m%d%H%M%S',
                                           time.localtime(time.time()))

        item["username"] = self.username

        item["passwd"] = self.passwd

        item["usernickname"] = "".join(
            hxs.xpath("//div[@class='u-name']/a/text()").extract())

        item["userrank"] = "".join(
            hxs.xpath("//div[@class='u-level']/span/a/text()").extract())

        item["balance"] = "".join(
            hxs.xpath("//a[@id='BalanceCount']/text()").extract())

        item["baitiaobalance"] = "".join(
            hxs.xpath("//span[@class='baitiao-limit']/text()").extract())

        item["wallet"] = "".join(
            hxs.xpath("//div[@id='balance']/a[2]/em/text()").extract())

        item["yesprofit"] = "".join(
            hxs.xpath("//div[@class='ftx01 profit']/a/text()").extract())

        userinfo_url = 'https://i.jd.com/user/info'

        yield Request(url=userinfo_url,
                      callback=self.parse_userinfo,
                      meta={"item": item})
Example #49
    def parse_xiangxi(self, response):

        html = HtmlXPathSelector(response)

        meta = response.meta
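        # Detail page scraped by element position; the chained .replace()
        # calls scrub all whitespace out of each extracted field.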

        meta["title"] = ''.join(
            html.xpath("//*[@id='basic-info']/h1/text()").extract()).replace(
                "\r", "").replace("\n", "").replace("\t", "").replace(" ", "")

        meta["start"] = ''.join(
            html.xpath("//*[@id='basic-info']/div[1]/span[1]/@title").extract(
            )).replace("\r", "").replace("\n",
                                         "").replace("\t",
                                                     "").replace(" ", "")

        meta["mean_price"] = ''.join(
            html.xpath("//*[@id='basic-info']/div[1]/span[3]/text()").extract(
            )).replace("\r", "").replace("\n",
                                         "").replace("\t",
                                                     "").replace(" ", "")

        meta["address"] = ''.join(
            html.xpath("//*[@id='basic-info']/div[2]/span[2]/text()").extract(
            )).replace("\r", "").replace("\n",
                                         "").replace("\t",
                                                     "").replace(" ", "")

        meta["taste"] = ''.join(
            html.xpath("//*[@id='basic-info']/div[1]/span[4]/text()").extract(
            )).replace("\r", "").replace("\n",
                                         "").replace("\t",
                                                     "").replace(" ", "")

        meta["environment"] = ''.join(
            html.xpath("//*[@id='basic-info']/div[1]/span[5]/text()").extract(
            )).replace("\r", "").replace("\n",
                                         "").replace("\t",
                                                     "").replace(" ", "")

        meta["service"] = ''.join(
            html.xpath("//*[@id='basic-info']/div[1]/span[6]/text()").extract(
            )).replace("\r", "").replace("\n",
                                         "").replace("\t",
                                                     "").replace(" ", "")

        meta["tel"] = ''.join(
            html.xpath("//*[@id='basic-info']/p[1]/span[2]/text()").extract()
        ).replace("\r", "").replace("\n", "").replace("\t",
                                                      "").replace(" ", "")

        meta["review_num"] = ''.join(
            html.xpath(
                "//*[@id='comment']/h2/a[2]/span/text()").extract()).replace(
                    ")", "").replace("(", "").replace("\r", "").replace(
                        "\n", "").replace("\t", "").replace(" ", "")

        # more_review = ''.join(html.xpath("//*[@id='comment']/p/a/@href").extract())
        more_review = ''.join(
            html.xpath(u"//a[contains(text(),'更多点评')]/@href").extract())

        review_url = meta["url"] + "/review_more#start=10"

        meta["review_urls"] = meta["url"] + "/review_more"

        yield Request(review_url, callback=self.parse_starts, meta=meta)

        del response
Example #50
 def test_htmlxpathselector(self):
     with warnings.catch_warnings(record=True):
         hs = HtmlXPathSelector(text=self.text)
         self.assertEqual(hs.select("//div").extract(),
                          [u'<div><img src="a.jpg"><p>Hello</p></div>'])
         self.assertRaises(RuntimeError, hs.css, 'div')
Example #51
    def parse_fund(self, response):
        x = HtmlXPathSelector(response)
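        # Each row of the fund-details table is keyed by its <th> label, so
        # every field reuses the same normalize-space(label -> sibling td) XPath.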
        
        fund = HlscraperItem()
        fund['Url'] = response.url
        fund['Name'] = x.select("normalize-space(/html/body/div[@id='container']/div[@id='content']/div[@class='spacer-left-dbl']/div[@id='fund-section-content']/div[@class='spacer-bottom']/div[@id='security-title']/h1[@class='underline']/text())").extract()
        fund['ExdividendDate'] = x.select("normalize-space(//tr/th[text()[contains(., 'Ex-dividend date')]]/../td/text())").extract()
        fund['PaymentDate'] = x.select("normalize-space(//tr/th[text()[contains(., 'Payment date')]]/../td/text())").extract()
        fund['RunningYield'] = x.select("normalize-space(//tr/th[text()[contains(., 'Running yield')]]/../td/text())").extract()
        fund['HistoricYield'] = x.select("normalize-space(//tr/th[text()[contains(., 'Historic yield')]]/../td/text())").extract()
        fund['IncomePaid'] = x.select("normalize-space(//tr/th[text()[contains(., 'Income paid')]]/../td/text())").extract()
        fund['TypeOfPayment'] = x.select("normalize-space(//tr/th[text()[contains(., 'Type of payment')]]/../td/text())").extract()
        fund['LaunchDate'] = x.select("normalize-space(//tr/th[text()[contains(., 'Launch date')]]/../td/text())").extract()
        fund['Sector'] = x.select("normalize-space(//tr/th[text()[contains(., 'Sector')]]/../td/text())").extract()
        fund['FundSize'] = x.select("normalize-space(//tr/th[text()[contains(., 'Fund size')]]/../td/text())").extract()
        fund['NumberOfHoldings'] = x.select("normalize-space(//tr/th[text()[contains(., 'Number of holdings')]]/../td/text())").extract()
        fund['TypeOfUnits'] = x.select("normalize-space(//tr/th[text()[contains(., 'Type of units')]]/../td/text())").extract()
        fund['FundType'] = x.select("normalize-space(//tr/th[text()[contains(., 'Fund type')]]/../td/text())").extract()
        fund['NetInitialCharge'] = x.select("normalize-space(//tr/th[text()[contains(., 'Net initial charge')]]/../td/text())").extract()
        fund['NetAnnualCharge'] = x.select("normalize-space(//tr/th[text()[contains(., 'Net Annual charge')]]/../td/text())").extract()
        fund['OtherExpenses'] = x.select("normalize-space(//tr/th[text()[contains(., \"Fund manager's other expenses\")]]/../td/text())").extract()
        fund['PerformanceFee'] = x.select("normalize-space(//tr/th[text()[contains(., 'Performance fee')]]/../td/text())").extract()
        fund['PlatformFee'] = x.select("normalize-space(//tr/th[text()[contains(., 'HL Platform charge')]]/../td/text())").extract()
        
        fund['Wealth150'] = x.select("/html/body/div[@id='container']/div[@id='content']/div[@class='spacer-left-dbl']/div[@id='fund-section-content']/div[@class='spacer-bottom']/div[@id='security-title']/h1[@class='underline']/a/img/@src").extract()

        return fund
Example #52
    def parse_item(self, response):

        jobid = response.meta["jobid"]

        html = HtmlXPathSelector(response)

        meta = response.meta

        item = {}

        _item = []

        for comment in html.xpath("//*[@class='comment-list']/ul/li"):

            comments = {}

            comments["start"] = ''.join(
                comment.xpath(
                    ".//*[@class='user-info']/span/@title").extract()).replace(
                        "\r", "").replace("\n",
                                          "").replace("\t",
                                                      "").replace(" ", "")

            comments["taste"] = ''.join(
                comment.xpath(u".//*[contains(text(),'口味')]/em/text()").
                extract()).replace(")", "").replace("(", "").replace(
                    "\r", "").replace("\n", "").replace("\t",
                                                        "").replace(" ", "")

            comments["environment"] = ''.join(
                comment.xpath(u".//*[contains(text(),'环境')]/em/text()").
                extract()).replace(")", "").replace("(", "").replace(
                    "\r", "").replace("\n", "").replace("\t",
                                                        "").replace(" ", "")

            comments["service"] = ''.join(
                comment.xpath(u".//*[contains(text(),'服务')]/em/text()").
                extract()).replace(")", "").replace("(", "").replace(
                    "\r", "").replace("\n", "").replace("\t",
                                                        "").replace(" ", "")

            comments["review_text"] = ''.join(
                comment.xpath(".//*[@class='comment-txt']/div/text()").extract(
                )).replace("\r", "").replace("\n",
                                             "").replace("\t",
                                                         "").replace(" ", "")

            comments["review_time"] = ''.join(
                comment.xpath(".//*[@class='time']/text()").extract()).replace(
                    "\r", "").replace("\n", "").replace("\t",
                                                        "").replace(" ", "")

            comments["discussant"] = ''.join(
                comment.xpath(
                    ".//*[@class='name']/a/text()").extract()).replace(
                        "\r", "").replace("\n",
                                          "").replace("\t",
                                                      "").replace(" ", "")

            comments["discussant_contribution"] = ''.join(
                comment.xpath(".//*[@class='contribution']/span/@title").
                extract()).replace("\r", "").replace("\n", "").replace(
                    "\t", "").replace(" ", "")

            _item.append(comments)

        item["review_details"] = _item

        item["review_page"] = ''.join(
            html.xpath("//span[@class='PageSel']/text()").extract())

        self.items.append(item)

        next_page = ''.join(html.xpath(u"//a[text()='下一页']/@href").extract())

        if next_page:

            yield Request(urljoin(meta["review_urls"], next_page),
                          callback=self.parse_item,
                          meta=meta)

        del response
Example #53
    def parse_fund(self, response):
        x = HtmlXPathSelector(response)

        fund = HlscraperItem()
        fund['Url'] = response.url
        fund['Name'] = x.select(
            "normalize-space(/html/body/div[@id='container']/div[@id='content']/div[@class='spacer-left-dbl']/div[@id='fund-section-content']/div[@class='spacer-bottom']/div[@id='security-title']/h1[@class='underline']/text())"
        ).extract()
        fund['ExdividendDate'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Ex-dividend date')]]/../td/text())"
        ).extract()
        fund['PaymentDate'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Payment date')]]/../td/text())"
        ).extract()
        fund['RunningYield'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Running yield')]]/../td/text())"
        ).extract()
        fund['HistoricYield'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Historic yield')]]/../td/text())"
        ).extract()
        fund['IncomePaid'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Income paid')]]/../td/text())"
        ).extract()
        fund['TypeOfPayment'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Type of payment')]]/../td/text())"
        ).extract()
        fund['LaunchDate'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Launch date')]]/../td/text())"
        ).extract()
        fund['Sector'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Sector')]]/../td/text())"
        ).extract()
        fund['FundSize'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Fund size')]]/../td/text())"
        ).extract()
        fund['NumberOfHoldings'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Number of holdings')]]/../td/text())"
        ).extract()
        fund['TypeOfUnits'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Type of units')]]/../td/text())"
        ).extract()
        fund['FundType'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Fund type')]]/../td/text())"
        ).extract()
        fund['NetInitialCharge'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Net initial charge')]]/../td/text())"
        ).extract()
        fund['NetAnnualCharge'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Net Annual charge')]]/../td/text())"
        ).extract()
        fund['OtherExpenses'] = x.select(
            "normalize-space(//tr/th[text()[contains(., \"Fund manager's other expenses\")]]/../td/text())"
        ).extract()
        fund['PerformanceFee'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Performance fee')]]/../td/text())"
        ).extract()
        fund['PlatformFee'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'HL Platform charge')]]/../td/text())"
        ).extract()

        fund['Wealth150'] = x.select(
            "/html/body/div[@id='container']/div[@id='content']/div[@class='spacer-left-dbl']/div[@id='fund-section-content']/div[@class='spacer-bottom']/div[@id='security-title']/h1[@class='underline']/a/img/@src"
        ).extract()

        return fund