Example #1
	def parse(self, response):

		selector = HtmlXPathSelector(response)

		# iterate over deals
		for deal in selector.select(self.deals_list_xpath):
			loader = XPathItemLoader(LivingSocialDeal(), selector=deal)

			# define processors
			loader.default_input_processor = MapCompose(unicode.strip)
			loader.default_output_processor = Join()

			for field, xpath in self.item_fields.iteritems():
				loader.add_xpath(field, xpath)
			yield loader.load_item()
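
Most of these examples target the legacy Scrapy API: HtmlXPathSelector, XPathItemLoader, iteritems and unicode are all Python 2 / pre-1.0 Scrapy. For comparison, a minimal sketch of Example #1 on the current API (Python 3, ItemLoader, response.xpath), assuming the same LivingSocialDeal item and spider attributes:

# Sketch of Example #1 against modern Scrapy (>= 2.2): ItemLoader replaces the
# deprecated XPathItemLoader and response.xpath() replaces HtmlXPathSelector;
# the item class and spider attributes are assumed from Example #1.
from itemloaders.processors import Join, MapCompose
from scrapy.loader import ItemLoader

    def parse(self, response):
        # iterate over deals
        for deal in response.xpath(self.deals_list_xpath):
            loader = ItemLoader(item=LivingSocialDeal(), selector=deal)

            # define processors
            loader.default_input_processor = MapCompose(str.strip)
            loader.default_output_processor = Join()

            for field, xpath in self.item_fields.items():
                loader.add_xpath(field, xpath)
            yield loader.load_item()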
Example #2
    def parseAPage(self, response):
        if self.need_help(response):
            return

        hxs = HtmlXPathSelector(response)
        viewstate = self.getViewState(hxs)

        self.pages += 1
        for tr in hxs.select(
                '//table[@id="ctl00_ContentPlaceHolder1_gvOrdenes"]/tr[position() > 1]'
        ):
            detalle = self.postBackArgs(tr.select('td[9]/a'))
            if detalle:
                i = CompraItem()
                l = XPathItemLoader(item=i, selector=tr)
                l.add_xpath('orden_compra', 'td[1]/text()')
                l.add_xpath('fecha', 'td[2]/text()')
                l.add_xpath('importe', 'td[3]/text()')
                l.add_xpath('proveedor', 'td[4]/text()')
                l.add_xpath('destino', 'td[5]/text()')
                l.add_xpath('suministro', 'td[6]/text()')
                l.add_xpath('anio', 'td[7]/text()')
                l.add_xpath('tipo', 'td[8]/text()')
                compra = l.load_item()
                compra['compra_linea_items'] = []

                detalle = self.formdata(viewstate, *detalle)
                req = FormRequest(url,
                                  formdata=detalle,
                                  callback=self.parseDetalle)
                req.meta['compra'] = compra
                req.meta['viewstate'] = viewstate
                yield req

        # Get previous page
        if self.pages < MAX_PAGES:
            prev = None
            for td in hxs.select('//td[@colspan="9"]//td'):
                args = self.postBackArgs(td.select('a'))
                if args:
                    prev = args
                elif prev:
                    # the td without a link marks the current page; replay the
                    # postback of the page just before it and stop
                    prev = self.formdata(viewstate, *prev)
                    req = FormRequest(url,
                                      formdata=prev,
                                      callback=self.parseAPage)
                    yield req
                    break
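
Examples #2 and #7 page through an ASP.NET WebForms grid by replaying __VIEWSTATE postbacks. The helpers they call (getViewState, postBackArgs, formdata) and the module-level url are not shown, so the following is only a guess at their shape, built on the standard WebForms hidden fields and the __doPostBack('target', 'argument') link convention:

# Hedged sketch of the WebForms postback helpers Examples #2 and #7 assume;
# the originals are not shown, so these bodies are assumptions.
import re

    def getViewState(self, hxs, save=True):
        # collect the hidden ASP.NET state fields needed to replay a postback
        return {
            '__VIEWSTATE': hxs.select('//input[@name="__VIEWSTATE"]/@value').extract()[0],
            '__EVENTVALIDATION': hxs.select('//input[@name="__EVENTVALIDATION"]/@value').extract()[0],
        }

    def postBackArgs(self, link):
        # parse __doPostBack('target', 'argument') out of an href, if any
        href = link.select('@href').extract()
        if not href:
            return None
        m = re.search(r"__doPostBack\('([^']*)','([^']*)'\)", href[0])
        return m.groups() if m else None

    def formdata(self, viewstate, target, argument):
        # merge the saved state fields with the postback target/argument
        data = dict(viewstate)
        data['__EVENTTARGET'] = target
        data['__EVENTARGUMENT'] = argument
        return data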
Example #3
 def get_answer(self, selector, response):
     answer_loader = XPathItemLoader(item=LazyTweetAnswer(),
             selector=selector)
     answer_loader.add_value('question_id', response.url.split('/')[-1])
     answer_loader.add_value(
         'answerer',
         self.get_user(selector.select('.//span[@class="answer-meta"]')))
     answer_loader.add_xpath(
         'answer_content', ''.join([
             './/span[@class="answer-body"]',
             '//span[@class="answer-status"]//descendant-or-self::text()'
         ]))
     return answer_loader.load_item()
Example #4
    def parse_full_report(self, response):
        # need to work around weird bug where lxml can't handle encode=WINDOWS-1252
        # so pull out the body, convert to utf-8 and create a new TextResponse object to contain it
        # since XPathItemLoader requires a Response object
        text = unicode(response.body, response.encoding)
        t = TextResponse(url=response.url, body=text.encode('utf-8'), encoding='utf-8')

        l = XPathItemLoader(NrcScrapedFullReport(), response=t)
        url_parts = urlsplit(response.url)
        l.add_value('reportnum', parse_qs(url_parts.query)['standard_web inc_seq'])
        l.add_xpath('full_report_body', '//body')
        l.add_value('full_report_url', response.url)
        item = l.load_item()
        reportnum = item['reportnum']
        yield item
        self.db.setBotTaskStatus(reportnum, self.name, 'DONE')
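
Examples #4 and #24 inline the same WINDOWS-1252 workaround; factored out, it is just a decode/re-encode wrapper. A small sketch (the helper name reencode_utf8 is made up for illustration):

from scrapy.http import TextResponse

def reencode_utf8(response):
    # decode the raw body with the declared encoding, then re-wrap it as a
    # UTF-8 TextResponse so lxml-based XPath parsing sees clean UTF-8
    text = unicode(response.body, response.encoding)
    return TextResponse(url=response.url, body=text.encode('utf-8'),
                        encoding='utf-8')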
Example #5
    def parse(self, response):
        """Get response from start_urls"""

        selector = HtmlXPathSelector(response)

        for deal in selector.xpath(self.xpath_for_deals):
            loader = XPathItemLoader(LivingSocial(), selector=deal)

            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath.strip())
            yield loader.load_item()
Example #6
    def search_results(self, response):
        text = unicode (response.body, response.encoding)
        hxs = HtmlXPathSelector(text=text)
        reports = hxs.select('//table[@class="t16Standard"]/tr')
        if len(reports) == 0:
            self.log('Incident report data not present in response', log.ERROR)
        else:
            # Skip the first report record because this is the header row
            reports.pop(0)
            if len(reports) == 0:
                self.log('No incident reports found in response', log.WARNING)
            else:
                self.log('Retrieved {0} incident reports'.format(len(reports)), log.INFO)

        for report in reports:
            l = XPathItemLoader(NrcScrapedReport(), report)
            l.context['base_url'] = response.url
            for name, params in NrcScrapedReport.fields.items():
                l.add_xpath(name, params['xpath'])
            item = l.load_item()
            if self.db.reportExists(item['reportnum']):
                self.log('Report {0} already exists.  Skipping to next report.'.format(item['reportnum']), log.INFO)
            else:
                f_request = Request(
                    item['full_report_url'],
                    callback=self.parse_full_report)
                m_request = Request(
                    item['materials_url'],
                    callback=self.parse_materials)
                yield item
                self.db.setBotTaskStatus(item['reportnum'], self.name, 'DONE')

#                if self.db.fullReportExists (item['reportnum']):
#                    self.log('Full report Report {0} already exists.  Skipping download.'.format(item['reportnum']), log.INFO)
#                else:
#                    yield f_request
#
#                if self.db.materialExists (item['reportnum']):
#                    self.log('Materials record(s) already exist for report {0}.  Skipping download.'.format(item['reportnum']), log.INFO)
#               else:
#                    yield m_request

        # get next page of results
        next_page = hxs.select('//td[@class="pagination"][4]/a/@href')
        if len(next_page) > 0:
            yield Request(urljoin(response.url, next_page[0].extract()), callback=self.search_results)
Example #7
    def parseDetalle(self, response):
        # Page 253, Orden 2665 has multiple pages
        if self.need_help(response):
            return

        hxs = HtmlXPathSelector(response)

        viewstate = self.getViewState(hxs, save=False)

        orden_compra = response.request.meta['compra']
        for tr in hxs.select(
                '//table[@id="ctl00_ContentPlaceHolder1_gvDetalle"]/tr[position() > 1]'
        ):
            i = CompraLineaItem()
            l = XPathItemLoader(item=i, selector=tr)
            l.add_xpath('cantidad', 'td[1]/text()')
            l.add_xpath('unidad_medida', 'td[2]/text()')
            l.add_xpath('detalle', 'td[3]/text()')
            l.add_xpath('importe', 'td[4]/text()')
            x = l.load_item()

            if 'cantidad' in x:
                orden_compra['compra_linea_items'].append(x)

        lastPage = True  # when no paging
        foundCurrent = False
        for td in hxs.select('//td[@colspan="4"]//td'):
            lastPage = False  # only commit in the last page
            args = self.postBackArgs(td.select('a'))
            if not args:  # page with no links
                lastPage = True
                foundCurrent = True
            elif foundCurrent:
                args = self.formdata(viewstate, *args)
                req = FormRequest(url,
                                  formdata=args,
                                  callback=self.parseDetalle)
                req.meta['compra'] = orden_compra
                yield req
                break

        if lastPage:
            yield orden_compra
Example #8
    def parse(self, response):

        selector = HtmlXPathSelector(response)

        # iterate over data_list
        for data in selector.select(self.data_list):
            loader = XPathItemLoader(TeoniteItem(), selector=data)

            loader.default_input_processor = MapCompose(str.strip)
            loader.default_output_processor = Join()

            # add xpath to loader
            for field, xpath in self.item_fields.items():
                loader.add_xpath(field, xpath)
            yield loader.load_item()

        for nextp in selector.select(self.next_page):
            yield response.follow(nextp, callback=self.parse)
Example #9
    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        for qxs in hxs.select(self.lista_linhas_xpath):
            loader = XPathItemLoader(LinhaItem(), selector=qxs)
            loader.add_xpath('linha', './td[1]/p//text()')
            loader.add_xpath('nome', './td[3]/p//text()')

            link = self.base_url + qxs.select('./td[3]//a/@href').extract()[0]
            # TODO: should keep the context and return the data from the next
            # page, but it does not seem to be returning them; the original
            # read request.meta['ida'] and ['volta'] before the request had
            # ever run, which can only raise KeyError, and never yielded the
            # request at all - so schedule it and yield the item without them
            request = Request(link, callback=self.parse_item)
            yield request

            yield loader.load_item()
Example #10
    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        for qxs in hxs.select(self.question_list_xpath):
            loader = XPathItemLoader(QuestionItem(), selector=qxs)
            loader.add_xpath('title', './/h3/a/text()')
            loader.add_xpath('summary', './/h3/a/@title')
            loader.add_xpath('tags', './/a[@rel="tag"]/text()')
            loader.add_xpath('user', './/div[@class="started"]/a[2]/text()')
            loader.add_xpath('posted',
                             './/div[@class="started"]/a[1]/span/@title')
            loader.add_xpath('votes', './/div[@class="votes"]/div[1]/text()')
            loader.add_xpath(
                'answers',
                './/div[contains(@class, "answered")]/div[1]/text()')
            loader.add_xpath('views', './/div[@class="views"]/div[1]/text()')

            yield loader.load_item()
Example #11
    def get_user(self, selector, response, label):
        user_loader = XPathItemLoader(item = StackOverflowUser(),
                selector = selector)
        user_loader.add_xpath('user_name', ''.join([
            './/div[contains(@class, "user-details")]',
            '/a/text()'
            ]))
        user_loader.add_xpath('user_link', ''.join([
            './/div[contains(@class, "user-details")]',
            '/a/@href'
            ]))

        user_link = user_loader.get_output_value('user_link')
        if user_link:
            user_loader.add_value('user_id', user_link)

        return user_loader.load_item()
Example #12
 def parse(self, response):
   response.body = response.body.replace('\\','').replace('\xa0','')
   p = XPathItemLoader(item=PersonItem(), response=response)
   
   try:
     p.add_value('first_name', re.findall('&qf=(\w+)&', response.url)[0])
     p.add_value('middle_name', re.findall('&qmi=(\w+)&', response.url)[0])
     p.add_value('last_name', re.findall('&qn=(\w+)&', response.url)[0])
     p.add_value('city', re.findall('&qc=(\w+)&', response.url)[0])
     p.add_value('state', re.findall('&qs=(\w+)&', response.url)[0])
     p.add_value('zipcode', re.findall('&qz=(\d+)&', response.url)[0])
     p.add_value('prop_ref', re.findall('&prop_ref=(\d+)', response.url)[0])
     
     p.add_xpath('cities', '//div[@class="addresses"]/p/b/text()[1]', re="([^\(]+)")
     p.add_xpath('age','//div[@class="greenTopBoxLeft round12_12_0_0"]/p[@class="nameAge"]/text()[2]', re=", Age (\d+)")
   except IndexError:
     pass
   else:
     return p.load_item()
Example #13
    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses
        """
        selector = HtmlXPathSelector(response)

        #iterate over events
        for event in selector.select(self.events_list_xpath):
            loader = XPathItemLoader(CrunchBaseEvent(), selector=event)

            #define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            #iterate over fields and add xpaths to the loader.
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)

            yield loader.load_item()
Example #14
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        item_name = hxs.select(
            "//input[@id='ctl00_ctlPagePlaceHolder_Keywords']/@value").extract()
        item_hash = hashlib.md5(
            '%s::%s::%s' %
            (self.auction_id, item_name, self.name)).hexdigest()

        loader = XPathItemLoader(item=SearchResultItem(), response=response)
        loader.add_value("id", item_hash)
        loader.add_value("auction_id", self.auction_id)
        loader.add_value("site", self.name)
        loader.add_xpath(
            "name", "//input[@id='ctl00_ctlPagePlaceHolder_Keywords']/@value")
        loader.add_value("link", response.url)
        loader.add_xpath("price", "//td[7]/text()")

        return loader.load_item()
Example #15
    def parse_rental(self, response):
        l = XPathItemLoader(item=RentalItem(), response=response)

        l.add_value('url', response.url)
        l.add_xpath('address', '//th[text()="Address:"]/../td/text()')

        l.add_xpath('price', '//th[text()="Price:"]/../td/div/text()')
        l.add_xpath('price_period',
                    '//th[text()="Price:"]/../td/div/span/text()')

        l.add_xpath('bedrooms', '//th[text()="Bedrooms:"]/../td/text()')
        l.add_xpath('bathrooms',
                    '//th[text()="Bathrooms:"]/../td/text()',
                    re=r'(\d+)')
        l.add_xpath('powder_rooms',
                    '//th[text()="Bathrooms:"]/../td/text()',
                    re=r', (\d+)')
        l.add_xpath('property_type',
                    '//th[text()="Property type:"]/../td/text()')
        l.add_xpath('size',
                    '//th[text()="Size:"]/../td/text()',
                    re=r'([\d|,]+) sqft')
        l.add_xpath('lot', '//th[text()="Lot:"]/../td/text()')
        l.add_xpath('year_built', '//th[text()="Year built:"]/../td/text()')
        l.add_xpath('lease_term',
                    '//th[text()="Terms of lease:"]/../td/text()')
        l.add_xpath('pets_allowed', '//th[text()="Pets:"]/../td/text()')
        l.add_xpath('date_listed',
                    '//th[text()="Added on Trulia:"]/../td/text()')
        l.add_xpath('mls_id', '//th[text()="MLS/ID:"]/../td/text()')

        l.add_xpath('descriptive_title',
                    '//h2[@class="descriptive_title"]/text()')
        l.add_xpath('description',
                    '//div[@class="listing_description_module"]/text()')

        l.add_xpath('additional_fields',
                    'id("property_listing_details_module")/ul/li/span/text()')

        l.add_xpath('public_records',
                    'id("property_public_info_module")/ul/li/span/text()')

        return l.load_item()
Example #16
    def parse_item(self, response):
        sel = Selector(response)
        print response.url
        app_loader = XPathItemLoader(item=AppItem(),
                                     selector=sel)  # init the item loader
        # set app id
        app_loader.add_value('app_id', parse_id(response.url))
        # composite the title
        app_loader.add_xpath(
            'title', '//div[contains(@class, "document-title")]//text()')
        app_loader.add_xpath(
            'description',
            '//div[contains(@class, "id-app-orig-desc")]//text()')
        app_loader.add_xpath('score',
                             '//meta[@itemprop="ratingValue"]//@content')
        app_loader.add_xpath(
            'icon_url',
            '//div[contains(@class, "details-info")]//img[contains(@class, "cover-image")]/@src'
        )
        app_loader.add_xpath(
            'author',
            '//div[@itemprop="author"]//span[@itemprop="name"]//text()')
        app_loader.add_xpath(
            'app_type',
            '//div[contains(@class, "details-info")]//span[@itemprop="genre"]/text()'
        )

        # get the similarities and the more from developers
        app_loader.add_xpath(
            'similarity',
            '//div[contains(@class, "recommendation")]//div[contains(@class, "details-section-contents")]/div[@class="rec-cluster" and position()=1]//div[contains(@class, "card")]/@data-docid'
        )

        app_loader.add_xpath(
            'more_from_devs',
            '//div[contains(@class, "recommendation")]//div[contains(@class, "details-section-contents")]/div[@class="rec-cluster" and position()=2]//div[contains(@class, "card")]/@data-docid'
        )

        # print app_loader.load_item()
        # print app_loader.get_output_value('app_id')

        return app_loader.load_item()
Example #17
    def myparse(self, response):
        print "myParse"
        selector = HtmlXPathSelector(response)
        # l = selector.select(self.deals_list_xpath)
        l = selector.select('//div[@id="detailed"]')
        ll = l.select('.//div[@class="title4"]/a/text()').extract()
        with open(ll[0].strip() + '.html', 'wb') as f:
            f.write(response.body)
        print ll[0].strip()
        for deal in l:

            #loader = XPathItemLoader(LivingSocialDeal(),selector=deal)
            loader = XPathItemLoader(MoviesClass(), selector=deal)
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = TakeFirst()

            for field, xpath in self.mov_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()
Example #18
    def parse_movie_info(self, response):
        """Scrapes movie information"""
        self.log("Parsing Movie Info")
        hxs = HtmlXPathSelector(response)
        selector = hxs.select('//div[@class="maindetails"]')

        item = MovieItem()
        # set url
        item['url'] = response.url

        # use item loader for other attributes
        l = XPathItemLoader(item=item, selector=selector)
        l.add_xpath('title', './/h1/text()')
        l.add_xpath(
            'release_date', './/h5[text()="Release Date:"]'
            '/following-sibling::div/text()')
        l.add_xpath(
            'tagline', './/h5[text()="Tagline:"]'
            '/following-sibling::div/text()')

        yield l.load_item()
Example #19
    def parse_category(self, response):
        # The main selector we're using to extract data from the page
        main_selector = HtmlXPathSelector(response)

        # The XPath to website links in the directory page
        xpath = '//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font'

        # Get a list of (sub) selectors to each website node pointed by the XPath
        sub_selectors = main_selector.select(xpath)

        # Iterate over the sub-selectors to extract data for each website
        for selector in sub_selectors:
            item = GoogledirItem()

            l = XPathItemLoader(item=item, selector=selector)
            l.add_xpath('name', 'a/text()')
            l.add_xpath('url', 'a/@href')
            l.add_xpath('description', 'font[2]/text()')

            # Here we populate the item and yield it
            yield l.load_item()
Example #20
    def get_answer(self, selector, response):
        answer_loader = XPathItemLoader(item = StackOverflowAnswer(),
                selector = selector)
        answer_loader.add_xpath('answer_content', ''.join([
            ".//td[@class='answercell']/div[@class='post-text']",
            "/p/text()"
            ]))
        answer_loader.add_xpath('answer_id', './@data-answerid')
        answer_loader.add_xpath('marks',
                ".//span[contains(@class, 'vote-count-post')]/text()")
        # is best answer?
        if selector.select('./@class').extract()[0].find('accepted-answer') != -1:
            answer_loader.add_value('is_best_answer', 1)
        else:
            answer_loader.add_value('is_best_answer', 0)
        # get user name
        answer_loader.add_value('answerer', self.get_user(selector, response, 'answer'))

        return answer_loader.load_item()
Example #21
    def parse_materials(self, response):
        text = unicode(response.body, response.encoding)
        hxs = HtmlXPathSelector(text=text)
        materials = hxs.select('//table[@class="t16Standard"]/tr')
        if len(materials) == 0:
            self.log('Materials data not present in response from {0}'.format(response.url), log.INFO)
        else:
            # Skip the first record because this is the header row
            materials.pop(0)
            if len(materials) == 0:
                self.log('No materials records found in response', log.INFO)
            else:
                self.log('Retrieved {0} materials records'.format(len(materials)), log.INFO)

        for material in materials:
            l = XPathItemLoader(NrcScrapedMaterial(), material)
            l.add_value('reportnum', response.url, TakeFirst(), re='P3_SEQNOS:(\d+)')
            for name, params in NrcScrapedMaterial.fields.items():
                if 'xpath' in params:
                    l.add_xpath(name, params['xpath'])
            item = l.load_item()
            yield item
Example #22
    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses
        Testing contracts:
        @url http://www.livingsocial.com/cities/15-san-francisco
        @returns items 1
        @scrapes title link
        """
        selector = HtmlXPathSelector(response)

        # iterate over deals
        for deal in selector.xpath(self.deals_list_xpath):
            loader = XPathItemLoader(LivingSocialDeal(), selector=deal)

            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)
            yield loader.load_item()
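
The docstring above doubles as a set of Scrapy contracts: when the spider is part of a project, running scrapy check on it fetches the @url, then verifies that the callback returns at least one item (@returns items 1) and that the items populate the title and link fields (@scrapes).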
Example #23
    def scrape_content_items(self, response):
        hxs = HtmlXPathSelector(response)
        stats = self.crawler.stats
        page_num = hxs.select(
            '//*[@id="MainContent_DocumentList1_GridView1_PageCurrent"]/@value'
        ).extract()
        if page_num:
            page_num = page_num[0]
            self.log(
                '%s Scraping page %s' % (response.meta['cookiejar'], page_num),
                log.INFO)
        else:
            self.log('%s No page number found' % (response.meta['cookiejar']),
                     log.WARNING)

        stats.inc_value('_pages', spider=self)
        reports = hxs.select(
            '//table[@id="MainContent_DocumentList1_GridView1"]//tr')

        for report in reports:
            l = XPathItemLoader(FracFocusScrape(), report)
            l.state_in = lambda slist: [s[:20] for s in slist]
            l.county_in = lambda slist: [s[:20] for s in slist]
            for name, params in FracFocusScrape.fields.items():
                l.add_xpath(name, params['xpath'])
            item = l.load_item()
            if item.get('api'):
                if self.db.itemExists(item):
                    stats.inc_value('_existing_count', spider=self)
                else:
                    stats.inc_value('_new_count', spider=self)
                    #                print item['operator']
                    yield item
        if not stats.get_value('_existing_count') and not stats.get_value(
                '_new_count'):
            self.log('%s No records found' % (response.meta['cookiejar']),
                     log.WARNING)
Example #24
    def parse_full_report(self, response):
        reportnum = response.request.meta['reportnum']

        # need to work around weird bug where lxml can't handle encode=WINDOWS-1252
        # so pull out the body, convert to utf-8 and create a new TextResponse object to contain it
        # since XPathItemLoader requires a Response object
        text = unicode(response.body, response.encoding)
        # check for an empty response; if so, bail out - we'll try again next time around
        if len(text) < 1000:
            return

        t = TextResponse(url=response.url,
                         body=text.encode('utf-8'),
                         encoding='utf-8')

        l = XPathItemLoader(NrcScrapedFullReport(), response=t)
        l.add_value('reportnum', reportnum)
        l.add_xpath('full_report_body', '//body')
        l.add_value('full_report_url', response.url)
        item = l.load_item()
        yield item
        self.db.setBotTaskStatus(reportnum, self.name, 'DONE')
Example #25
    def get_question(self, selector, response):
        hxs = HtmlXPathSelector(response)
        number_of_answers = hxs.select(''.join([
            '//div[@id="answers"]',
            '//div[contains(@class, "answers-subheader")]',
            '/h2/text()'
            ])).extract()

        question_loader = XPathItemLoader(item = StackOverflowQuestion(),
                selector = selector)
        question_loader.add_xpath('question_content', ''.join([
            ".//td[@class='postcell']",
            "//div[@class='post-text']/p/text()"
            ]))
        question_loader.add_xpath('question_tags', ''.join([
            ".//div[@class='post-taglist']",
            "//a[@class='post-tag']/text()"
            ]))
        question_loader.add_xpath('question_id', './@data-questionid')
        question_loader.add_xpath('marks',
                ".//span[contains(@class, 'vote-count-post')]/text()")
        question_loader.add_value('asker', self.get_user(selector, response, 'question'))
        question_loader.add_value('number_of_answers',
                int(number_of_answers[0].strip().split(' ')[0]))

        question_title = hxs.select(''.join([
            '//div[contains(@id, "question-header")]',
            '//a[contains(@class, "question-hyperlink")]/text()'
            ])).extract()
        question_loader.add_value('question_title', question_title)
        # print  question_loader.get_output_value('question_title')

        return question_loader.load_item()
Example #26
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     teams = hxs.select('//tbody/tr')  #get the table rows
     season = hxs.select('//small/text()').extract()
     day = hxs.select('//h3/text()').extract()
     items = []
     for index, team in enumerate(teams):
         l = XPathItemLoader(item=TeamItem(),
                             response=response,
                             selector=team)
         l.add_xpath('name', 'td[@class="equipo"]/text()')
         l.add_xpath('name', 'td[@class="equipo"]/a/text()')
         l.add_value('season', season)
         l.add_value('day', day)
         l.add_value('position', str(index + 1))
         l.add_xpath('pj', 'td[@class="pj"]/text()')
         l.add_xpath('pg', 'td[@class="pg"]/text()')
         l.add_xpath('pe', 'td[@class="pe"]/text()')
         l.add_xpath('pp', 'td[@class="pp"]/text()')
         l.add_xpath('gf', 'td[@class="gf"]/text()')
         l.add_xpath('gc', 'td[@class="gc"]/text()')
         l.add_xpath('points', 'td[@class="pts seleccionado"]/text()')
         items.append(l.load_item())
     return items
Example #27
    def parse_product(self, response):
        '''
        Gather all the information from the product

        name
        price
        description
        image_urls

        '''
        l = XPathItemLoader(item=Product(), response=response)

        l.add_xpath('name', XPATHS['product']['name'])

        l.add_xpath('description', XPATHS['product']['description'])

        # price
        for xpath in XPATHS['product']['prices']:
            l.add_xpath('price', xpath)

        l.add_xpath('image_urls', XPATHS['product']['image_urls'],
                    re='\'(.*?)\'')

        return l.load_item()
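
Example #27 drives the loader from an external XPATHS mapping rather than hard-coding expressions in the callback. The original mapping is not shown; its shape, implied by the lookups above, would be roughly:

# Illustrative shape of the XPATHS mapping Example #27 reads from; the keys
# mirror the lookups above, but the XPath values here are placeholders, not
# the original project's expressions.
XPATHS = {
    'product': {
        'name': '//h1[@class="product-name"]/text()',
        'description': '//div[@id="description"]//text()',
        'prices': [
            '//span[@class="sale-price"]/text()',
            '//span[@class="regular-price"]/text()',
        ],
        'image_urls': '//script[contains(., "imageGallery")]/text()',
    },
}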
Example #28
    def parse_doctor(self, response):
        response_url = response.url
        doctor_id = re.search('doctor/([^\.]*)\.htm', response_url).group(1)

        hxs = Selector(response)

        #parse doctor name
        name_list = hxs.xpath("//input[@name='doctor_name']/@value")
        doctor_name = ''
        if len(name_list) != 0:
            doctor_name = name_list[0].extract()

        #hospital department
        hospital_department_selectors = hxs.xpath("//meta[@name='keywords']/@content")
        hospital = ''
        department = ''
        if len(hospital_department_selectors) != 0:
            hospital_re = r',(?P<hospital>.*?)' + doctor_name
            hospital_match = re.search(hospital_re, hospital_department_selectors[0].extract())
            if hospital_match != None:
                hospital = hospital_match.group('hospital')

            department_re = hospital + r'(?P<department>.*?)' + doctor_name + ','
            department_match = re.search(department_re, hospital_department_selectors[0].extract())
            if department_match != None:
                department = department_match.group('department')

        #title
        title = ''
        title_selectors = hxs.xpath('//meta[@name="description"]/@content')
        if len(title_selectors) != 0:
            title_re_str = doctor_name + r'(?P<doctor_title>.*?)' + u'简介'
            title = re.search(title_re_str, title_selectors[0].extract()).group(1)

        doctor_about_dict = None
        tag_doctor_about_selectors = hxs.xpath('//div[@id="bp_doctor_about"]/div[@class="doctor_about"]')
        if len(tag_doctor_about_selectors) != 0:
            doctor_about_dict = self.parse_doctor_about(tag_doctor_about_selectors)
        else:
            doctor_about_match_list = hxs.xpath(
                '//script[@type="text/javascript"]/text()').re(
                'BigPipe.onPageletArrive\((?P<doctor_about>\{"id":"bp_doctor_about".*\})\);')
            if doctor_about_match_list:
                da_dict = json.loads(doctor_about_match_list[0])
                if 'content' in da_dict:
                    doctor_about_hxs = Selector(HtmlResponse(url=response.url, body=da_dict['content'].encode('utf-8')))
                    doctor_about_dict = self.parse_doctor_about(doctor_about_hxs)


        #schedule
        doctor_schedule = []
        trs = hxs.xpath("//table[@class='doctortimefrom1']/tr")
        day_part = 0
        for itr in trs:
            if 0 != day_part:
                doctor_schedule.extend(self.weekday_operation(itr, day_part))  # morning
            day_part += 1

        # #disease
        # disease_list = list()
        # disease_ht_selector = hxs.xpath('//div[@class="ltdiv"]//table[@class="jbsm"]//td')
        # if len(disease_ht_selector) == 1:
        #     disease_list = self.parse_disease_from_td_selector(disease_ht_selector, doctor_id=doctor_id)
        # else:
        #     disease_match_list = hxs.xpath(
        #         '//script[@type="text/javascript"]/text()').re(
        #         'BigPipe.onPageletArrive\((?P<dict_content>\{"id":"bp_doctor_getvote".*\})\);')

        #     if disease_match_list:
        #         disease_match = disease_match_list[0]
        #         d_dict = json.loads(disease_match)

        #         if 'content' in d_dict:
        #             disease_hxs = Selector(HtmlResponse(url=response.url, body=d_dict['content'].encode('utf-8')))
        #             disease_selector = disease_hxs.xpath('//div[@class="ltdiv"]//table[@class="jbsm"]//td')
        #             if len(disease_selector) == 1:
        #                 disease_list = self.parse_disease_from_td_selector(disease_selector, doctor_id=doctor_id)


        zanwu_re = re.compile(u'暂无')  # matches the "none available yet" placeholder
        empty_sub_re = re.compile(r'(<!--.*?-->|\n|\t|\r|[ ])')

        item = XPathItemLoader(DoctorDetailItem(), hxs)
        item.add_value('doctor_id', doctor_id)
        if doctor_name:
            item.add_value('_name', doctor_name)
        if response.meta['city']:
            item.add_value('city', response.meta['city'])
        if hospital:
            item.add_value('hospital', hospital)
        if department:
            item.add_value('department', department)
        if title:
            item.add_value('title', title)
        if doctor_schedule:
            item.add_value('schedule', doctor_schedule)
        else:
            if len(hxs.xpath('//table[@class="doctortimefrom1"]')) == 0:
                for content in hxs.xpath('//script[@type="text/javascript"]/text()').extract():
                    if content.find('doctortimefrom1') != -1:
                        item.add_value('schedule', '')  # shouldn't exist in js
                        break

        if doctor_about_dict:
            if 'image_url' in doctor_about_dict:
                item.add_value('image', doctor_about_dict['image_url'])
            if 'bio' in doctor_about_dict:
                bio = doctor_about_dict['bio']
                if zanwu_re.search(bio) is not None:
                    bio = ''
                if bio:
                    item.add_value('bio', empty_sub_re.sub('', bio))
            if 'feature' in doctor_about_dict:
                feature = doctor_about_dict['feature']
                if zanwu_re.search(feature) is not None:
                    feature = ''
                if feature:
                    item.add_value('feature', empty_sub_re.sub('', feature))

        yield item.load_item()



        url = u'http://www.haodf.com/doctor/' + doctor_id + u'/jingyan/1.htm'

        l = LetterItem()
        l['doctor_id'] = doctor_id
        letter = []

        disease_item = DoctorDiseaseItem()
        disease_item['doctor_id'] = doctor_id

        req = Request(url, callback=self.parse_letter)
        req.meta['item'] = l
        req.meta['letter'] = letter
        req.meta['disease'] = disease_item
        yield req
Example #29
    def parse(self, response):
        response.body = response.body.replace('\\', '').replace('\xa0', '')
        parcel = XPathItemLoader(item=TCADParcelItem(), response=response)

        parcel.add_value('url', response.url)
        parcel.add_xpath(
            'prop_id',
            '//font[text()="Property ID Number:"]/../../td[3]/font/b/text()')
        parcel.add_xpath(
            'owner',
            '//td[text()="Owner\'s Name"]/../td[@class="reports_blacktxt"]/font/b/text()'
        )
        parcel.add_xpath(
            'owner_address',
            '//td[text()="Owner\'s Name"]/../../tr[2]/td[2]/text()')
        parcel.add_xpath(
            'address', '//td[text()="Owner\'s Name"]/../../tr[3]/td[2]/text()')

        parcel.add_xpath(
            'land_value',
            '//font[text()="Land Value"]/../../td[@class="reports_blacktxt"]/p/text()'
        )
        parcel.add_xpath(
            'improvement_value',
            '//font[text()="Improvement Value"]/../../td[@class="reports_blacktxt"]/p/text()'
        )
        parcel.add_xpath(
            'market_value',
            '//font[text()="Total Value"]/../../td[@class="reports_blacktxt"]/p/text()'
        )

        parcel.add_xpath(
            'acreage',
            '//font[text()="Land Acres"]/../../td[@class="reports_blacktxt"]/p/text()'
        )
        parcel.add_xpath(
            'neighborhood',
            '//font[text()="Neighborhood Code"]/../../td[@class="reports_blacktxt"]/text()'
        )

        parcel.add_xpath(
            'improvement_area',
            '//font[text()="Total Living Area"]/../../td[2]//b/text()')

        def improvement(text, url):
            response = http.TextResponse(url=url, body=str(text))
            i = XPathItemLoader(item=TCADImprovementItem(), response=response)

            i.add_xpath('id', '//td[1]/text()')
            i.add_xpath('state_category', '//td[2]/text()')
            i.add_xpath('description', '//td[3]/text()')

            return i.load_item()

        def segment(text, url):
            response = http.TextResponse(url=url,
                                         body=str(text.replace(u'\xa0', '')))
            s = XPathItemLoader(item=TCADSegmentItem(), response=response)

            s.add_xpath('improvement_id', '//td[1]/text()')
            s.add_xpath('id', '//td[2]/text()')
            s.add_xpath('type_code', '//td[3]/text()')
            s.add_xpath('description', '//td[4]/text()')
            s.add_xpath('klass', '//td[5]/text()')
            s.add_xpath('year_built', '//td[6]/text()')
            s.add_xpath('area', '//td[7]/text()')

            return s.load_item()

        def history(text, url):
            response = http.TextResponse(url=url,
                                         body=str(text.replace(u'\xa0', '')))
            h = XPathItemLoader(item=TCADValueHistoryItem(), response=response)

            h.add_xpath('year', '//td[1]/text()')
            h.add_xpath('value', '//td[4]/text()')

            return h.load_item()

        hxs = HtmlXPathSelector(response)
        values = hxs.select(
            '//font[text()="Improvement ID"]/../../../../tr[position()>1]'
        ).extract()
        parcel.add_value(
            'improvements',
            map(improvement, values, [
                response.url,
            ] * len(values)))

        values = hxs.select(
            '//font[text()="Imp ID"]/../../../../tr[position()>1 and position()<last()]'
        ).extract()
        parcel.add_value('segments',
                         map(segment, values, [
                             response.url,
                         ] * len(values)))

        values = hxs.select(
            '//td[text()="Certified Value History"]/../../../..//td[@colspan="5"]/following::tr[1]'
        ).extract()
        parcel.add_value('historical_values',
                         map(history, values, [
                             response.url,
                         ] * len(values)))

        return parcel.load_item()
Example #30
    def parse_item(self, response):
        loader = XPathItemLoader(item=ImageItem(), response=response)
        loader.add_xpath('image_urls', '//img/@src')

        return loader.load_item()
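
Finally, since nearly every example above relies on the default_input_processor / default_output_processor pair, a tiny standalone illustration of what that pair actually does, using the itemloaders package that modern Scrapy delegates to:

# MapCompose runs per extracted value on the way in; Join concatenates the
# collected values on the way out, when load_item() is called.
from itemloaders import ItemLoader
from itemloaders.processors import Join, MapCompose

loader = ItemLoader(item={})
loader.default_input_processor = MapCompose(str.strip)
loader.default_output_processor = Join()
loader.add_value('title', ['  Hello ', ' world '])
print(loader.load_item())  # {'title': 'Hello world'}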