def fetch_url(self, url):
    """Fetch *url*, following HTTP redirects and <meta> refresh redirects.

    Returns a tuple ``(urls, page_content, page_url)``:
      - ``urls``: the list of URLs followed (as produced by
        ``utils.get_response_chain``, with ``(0, url)`` appended for the
        requested URL) -- presumably ``(depth, url)`` pairs; confirm against
        ``utils.get_response_chain``.
      - ``page_content``: the body of the final page reached.
      - ``page_url``: the resolved URL of that final page.
    """
    # Construct the request with our standard scraper headers.
    req = urllib2.Request(url, headers=utils.headers)

    # Open the request and follow HTTP redirects to the final location.
    # get_response_chain returns a tuple: (urls, page).
    urls, page = utils.get_response_chain(req)
    urls.append((0, url))

    page_content = page.read()
    page_url = page.geturl()

    # Check for a meta refresh tag, e.g.
    # <meta http-equiv=refresh content="0; url=/13/10339/2013/acp-13-10339-2013.html">
    if 'http-equiv' in page_content and 'refresh' in page_content:
        tree = lxml.html.fromstring(page_content)
        try:
            redirect_url = tree.xpath('.//meta[@http-equiv="refresh"]/@content')[0]
        except IndexError:
            # No refresh meta tag after all; fall through with this page.
            pass
        else:
            bits = redirect_url.split(';')
            if len(bits) > 1:
                # BUGFIX: split on the FIRST '=' only, so a redirect target
                # containing '=' (e.g. a query string) is not truncated.
                url = bits[1].split('=', 1)[1]
                url = urlparse.urljoin(page_url, url)
                # Recurse to follow the meta refresh redirect.
                iter_urls, page_content, page_url = self.fetch_url(url)
                urls = urls + iter_urls

    return urls, page_content, page_url
def scrape(abstract_url):
    """Scrape an NPG (Nature Publishing Group) abstract page.

    Fetches *abstract_url*, parses the returned HTML, and returns an
    article dict (as produced by ``make_blank_article``) populated with
    title, publisher, authors, abstract, citation details, DOI, and
    publication date.
    """
    req = urllib2.Request(abstract_url, headers=utils.headers)
    urls, response = utils.get_response_chain(req)

    # Transparently decompress gzip-encoded responses.
    if response.info().get('Content-Encoding') == 'gzip':
        buf = StringIO(response.read())
        data = gzip.GzipFile(fileobj=buf).read()
    else:
        data = response.read()

    page_text = data.decode('utf-8')
    tree = lxml.html.fromstring(page_text)

    article = make_blank_article()
    article['scraper'] = 'npg'
    article['source_urls'] = [uri for _, uri in urls]

    # NPG pages use either upper- or lower-case Dublin Core meta names,
    # so fall back to the lower-case variant when the first lookup misses.
    article['title'] = get_meta('DC.title', tree)
    if article['title'] is None:
        article['title'] = get_meta('dc.title', tree)

    article['publisher'] = get_meta('DC.publisher', tree)
    if article['publisher'] is None:
        article['publisher'] = get_meta('dc.publisher', tree)

    article['author_names'] = get_meta_list('DC.creator', tree)
    if article['author_names'] is None:
        article['author_names'] = get_meta_list('dc.creator', tree)

    # Abstract: prefer the description meta tag, then two known page layouts.
    # Catch only IndexError (no matching element) rather than a bare except
    # that would hide real parse errors.
    article['abstract'] = get_meta('description', tree)
    if not article['abstract']:
        try:
            article['abstract'] = tree.xpath("//div[@class='content']/p")[0].text_content()
        except IndexError:
            pass
    if not article['abstract']:
        try:
            article['abstract'] = tree.xpath("//div[@id='abs']/p")[0].text_content()
        except IndexError:
            pass

    article['citation']['journal'] = get_meta('citation_journal_title', tree)
    article['citation']['volume'] = get_meta('prism.volume', tree)
    article['citation']['page'] = get_meta('prism.startingPage', tree)
    article['journal'] = get_meta('prism.publicationName', tree)

    year = get_meta('citation_date', tree)
    if year:
        # citation_date starts with the four-digit year.
        article['citation']['year'] = year[0:4]

    # Strip the leading 'doi:' prefix from the citation_doi meta content.
    article['ids'] = {'doi': tree.xpath("//meta[@name='citation_doi']/@content")[0][4:]}

    pub_date = get_meta('DC.date', tree)
    if pub_date is None:
        pub_date = get_meta('dc.date', tree)
    if pub_date:
        # DC.date is YYYY-MM-DD; make_datestamp takes (day, month, year).
        split = pub_date.split('-')
        article['date_published'] = make_datestamp(split[2], split[1], split[0])

    # Subsidiary journals keep the abstract in a lead paragraph instead of
    # the locations handled above; dispatch on journal name.
    lead_xpaths = {
        'The EMBO Journal': "//p[@class='lead']",
        'EMBO reports': "//p[@class='lead']",
        'Oncogene': "//p[@class='abs lead']",
    }
    lead_xpath = lead_xpaths.get(article['journal'])
    if lead_xpath is not None:
        try:
            article['abstract'] = tree.xpath(lead_xpath)[0].text_content()
        except IndexError:
            pass

    return article