def processPaper(self, response):
    """Extract a paper's abstract from its detail page, then chain a
    request for the paper's PDF.

    The abstract is assumed to lie between the word "abstract" and the
    "[abs]" marker in the page's flattened text.  The enriched item is
    forwarded to processPDF via request.meta.
    """
    # Flatten every text node of the page into a single string.
    text = Selector(text=response.body).xpath('//text()').extract()
    text = ''.join(text)
    # Case-insensitive search for the two markers; slicing is done on
    # the original-cased text.
    abstract_index = text.lower().find("abstract")
    abs_index = text.lower().find("[abs]")
    # +8 skips over the word "abstract" itself.
    # NOTE(review): if either marker is absent, find() returns -1 and
    # the slice silently yields wrong text -- confirm markers always
    # exist on these pages.
    abstract = text[abstract_index + 8:abs_index]
    abstract = abstract.replace("\n", " ").strip()
    item = response.meta['item']
    item['abstract'] = abstract
    # Chain a request for the PDF link, carrying the item along.
    # FIX: the original passed the bare name `processPDF`, which is not
    # in scope inside a method; the callback is the sibling spider
    # method, so it must be accessed through self.
    request = scrapy.Request(item['url'], callback=self.processPDF)
    request.meta['item'] = item
    yield request
def parse(self, response): """Parse the search result of an author""" author_name = response.meta['author'] print "Parsing author {}".format(author_name) try: # Extract the panel on the right containing author results panel = response.xpath( '//*[@class="six columns omega"]').extract_first() # Check that the Matching Pages includes 'Authors' matching_pages = Selector( text=panel).xpath('//*[@class="bq_s"]').extract_first() matching_pages_text = Selector( text=matching_pages).xpath('//text()').extract() if "Authors" not in matching_pages_text: raise ValueError("No author page found!") # Get the list of authors by extracting the first row matches_row = Selector(text=matching_pages).xpath( '//*[@class="row"]').extract_first() matches = Selector( text=matches_row).xpath('//*[@class="bqLn"]').extract() # Find matching author and extract their url author_href = None for match in matches: match_name = Selector( text=match).xpath('//a//text()').extract_first() if match_name.lower() == author_name.lower(): author_href = Selector( text=match).xpath('//a/@href').extract_first() author_href = response.urljoin(author_href) break if not author_href: raise ValueError("No matching author found!") except Exception, e: print "Error parsing author: {}".format(author_name) print str(e) with open(missing_authors_file, 'a') as f: f.write(author_name + '\n') return