Beispiel #1
0
    def processPaper(self, response):
        """Extract the abstract from a paper page and request the paper's URL.

        The abstract is taken as the page text between the first occurrence
        of the word "abstract" and the following "[abs]" marker (both matched
        case-insensitively). Newlines are replaced by spaces and surrounding
        whitespace is stripped.

        Args:
            response: Response for the paper page. ``response.meta['item']``
                must hold the item being built; its ``'url'`` entry is the
                address requested next.

        Yields:
            A ``scrapy.Request`` for ``item['url']`` carrying the updated
            item in ``meta['item']``.
        """
        # Collect every text node of the page into one string.
        text = ''.join(response.xpath('//text()').extract())
        lowered = text.lower()

        start_marker = "abstract"
        end_marker = "[abs]"

        start = lowered.find(start_marker)
        if start == -1:
            # str.find returns -1 when the marker is absent; without this
            # guard the slice would start at an arbitrary offset.
            abstract = ""
        else:
            start += len(start_marker)
            # Only look for the end marker after the start marker, and fall
            # back to the end of the text instead of dropping the last
            # character when it is missing.
            end = lowered.find(end_marker, start)
            if end == -1:
                end = len(text)
            abstract = text[start:end]
        abstract = abstract.replace("\n", " ").strip()

        item = response.meta['item']
        item['abstract'] = abstract

        # Follow the link stored on the item, passing the item along.
        # NOTE(review): processPDF is referenced as a bare name; if it is a
        # method of this spider it should be self.processPDF - confirm.
        request = scrapy.Request(item['url'], callback=processPDF)
        request.meta['item'] = item
        yield request
    def parse(self, response):
        """Parse the search result of an author.

        Looks up the author named in ``response.meta['author']`` in the
        "Authors" row of the matching-pages panel and resolves the absolute
        URL of that author's page into ``author_href``.

        If the panel, the "Authors" entry or a link whose text equals the
        author name (case-insensitively) cannot be found, the error is
        printed, the author name is appended to ``missing_authors_file`` and
        the method returns early.

        Args:
            response: Search-result response; ``response.meta['author']``
                must hold the author name that was searched for.
        """
        # NOTE(review): author_href is not used within this part of the
        # method; presumably it is consumed by code that follows - confirm.
        author_name = response.meta['author']
        print("Parsing author {}".format(author_name))
        try:
            # Extract the panel on the right containing author results
            panel = response.xpath(
                '//*[@class="six columns omega"]').extract_first()

            # Check that the Matching Pages includes 'Authors'
            matching_pages = Selector(
                text=panel).xpath('//*[@class="bq_s"]').extract_first()
            matching_pages_text = Selector(
                text=matching_pages).xpath('//text()').extract()

            if "Authors" not in matching_pages_text:
                raise ValueError("No author page found!")

            # Get the list of authors by extracting the first row
            matches_row = Selector(text=matching_pages).xpath(
                '//*[@class="row"]').extract_first()
            matches = Selector(
                text=matches_row).xpath('//*[@class="bqLn"]').extract()

            # Find matching author and extract their url
            author_href = None
            wanted_name = author_name.lower()
            for match in matches:
                match_selector = Selector(text=match)
                match_name = match_selector.xpath(
                    '//a//text()').extract_first()
                if match_name is None:
                    # An entry without link text must not abort the whole
                    # search; a later entry may still be the right author.
                    continue
                if match_name.lower() == wanted_name:
                    author_href = match_selector.xpath(
                        '//a/@href').extract_first()
                    author_href = response.urljoin(author_href)
                    break
            if not author_href:
                raise ValueError("No matching author found!")

        except Exception as e:
            # Best-effort boundary: any failure while locating the author is
            # reported and recorded so the author can be retried later.
            print("Error parsing author: {}".format(author_name))
            print(str(e))
            with open(missing_authors_file, 'a') as f:
                f.write(author_name + '\n')
            return