Esempio n. 1
0
    def fetch_url(self, url, max_refreshes=10):
        """
        Fetch ``url`` and return ``(urls, page_content, page_url)``.

        The request goes through ``utils.get_response_chain``.  If the page
        that comes back carries a
        ``<meta http-equiv="refresh" content="N; url=...">`` tag, the target
        is resolved against the final page URL and fetched recursively.

        Args:
            url: Address to fetch.
            max_refreshes: Number of meta-refresh hops still allowed.  Once it
                reaches zero the current page is returned as-is, so a page
                that keeps refreshing cannot recurse without bound.

        Returns:
            A tuple ``(urls, page_content, page_url)``.  ``urls`` is the list
            returned by ``utils.get_response_chain`` with ``(0, url)``
            appended, followed by the same entries for every meta-refresh hop.
            ``page_content`` is the body of the last page read and
            ``page_url`` is what ``geturl()`` reported for it.
        """

        # Construct request
        req = urllib2.Request(url, headers=utils.headers)
        # Open request and follow redirects to final location of content
        # get_response_chain returns a tuple: urls, page
        urls, page = utils.get_response_chain(req)

        # Record the requested URL itself alongside the response chain.
        urls.append((0, url))

        page_content = page.read()
        page_url = page.geturl()

        # Cheap substring test so the document is only parsed when it could
        # possibly contain a meta refresh tag such as
        # <meta http-equiv=refresh content="0; url=/13/10339/2013/acp-13-10339-2013.html">
        if 'http-equiv' not in page_content or 'refresh' not in page_content:
            return urls, page_content, page_url

        # Refresh budget exhausted: hand back what we have instead of looping.
        if max_refreshes <= 0:
            return urls, page_content, page_url

        tree = lxml.html.fromstring(page_content)
        contents = tree.xpath('.//meta[@http-equiv="refresh"]/@content')
        if not contents:
            return urls, page_content, page_url

        # The attribute looks like "<delay>; url=<target>".  Split only on the
        # first ';' and the first '=' because the target itself may contain
        # both characters (e.g. a query string such as "?id=3").
        _delay, _sep, directive = contents[0].partition(';')
        key, _sep, target = directive.partition('=')
        target = target.strip().strip('\'"')

        if key.strip().lower() != 'url' or not target:
            # No usable refresh target (e.g. a plain "30" auto-reload).
            return urls, page_content, page_url

        next_url = urlparse.urljoin(page_url, target)

        # Recurse!
        iter_urls, page_content, page_url = self.fetch_url(
            next_url, max_refreshes - 1)

        return urls + iter_urls, page_content, page_url
Esempio n. 2
0
def _npg_meta_fallback(getter, names, tree):
    """Return ``getter(name, tree)`` for the first name that does not yield None."""
    value = None
    for name in names:
        value = getter(name, tree)
        if value is not None:
            break
    return value


def _npg_first_text(tree, query):
    """Return the text content of the first node matching ``query``, or None."""
    nodes = tree.xpath(query)
    if nodes:
        return nodes[0].text_content()
    return None


def scrape(abstract_url):
    """
    Download ``abstract_url`` and build an article dict from its metadata.

    The page is fetched through ``utils.get_response_chain``, gunzipped when
    the response declares ``Content-Encoding: gzip``, decoded as UTF-8 and
    parsed with lxml.  Fields are then filled from ``<meta>`` tags, with a few
    XPath fallbacks for the abstract.  Every field is best-effort: a value
    that cannot be found is left as it was rather than raising.

    Args:
        abstract_url: URL of the article's abstract page.

    Returns:
        The dict produced by ``make_blank_article()`` with ``scraper``,
        ``source_urls``, ``title``, ``publisher``, ``author_names``,
        ``abstract``, ``journal`` and the ``citation`` sub-dict populated.
        ``ids`` and ``date_published`` are set only when the page provides
        the corresponding metadata.
    """
    req = urllib2.Request(abstract_url, headers=utils.headers)
    urls, response = utils.get_response_chain(req)

    data = response.read()
    if response.info().get('Content-Encoding') == 'gzip':
        data = gzip.GzipFile(fileobj=StringIO(data)).read()

    page_text = data.decode('utf-8')
    tree = lxml.html.fromstring(page_text)

    article = make_blank_article()
    article['scraper'] = 'npg'
    article['source_urls'] = [uri for _, uri in urls]

    # Dublin Core names appear both as "DC.x" and "dc.x"; try both spellings.
    article['title'] = _npg_meta_fallback(
        get_meta, ('DC.title', 'dc.title'), tree)
    article['publisher'] = _npg_meta_fallback(
        get_meta, ('DC.publisher', 'dc.publisher'), tree)
    article['author_names'] = _npg_meta_fallback(
        get_meta_list, ('DC.creator', 'dc.creator'), tree)

    # Abstract: meta description first, then page-body fallbacks in order.
    article['abstract'] = get_meta('description', tree)
    for query in ("//div[@class='content']/p", "//div[@id='abs']/p"):
        if article['abstract']:
            break
        text = _npg_first_text(tree, query)
        if text is not None:
            article['abstract'] = text

    article['citation']['journal'] = get_meta('citation_journal_title', tree)
    article['citation']['volume'] = get_meta('prism.volume', tree)
    article['citation']['page'] = get_meta('prism.startingPage', tree)

    article['journal'] = get_meta('prism.publicationName', tree)

    year = get_meta('citation_date', tree)
    if year:
        article['citation']['year'] = year[0:4]

    # Strip the "doi:" prefix only when it is really there, and tolerate
    # pages without a citation_doi tag instead of raising IndexError.
    doi_values = tree.xpath("//meta[@name='citation_doi']/@content")
    if doi_values:
        doi = doi_values[0].strip()
        if doi.lower().startswith('doi:'):
            doi = doi[4:]
        article['ids'] = {'doi': doi}

    pub_date = _npg_meta_fallback(get_meta, ('DC.date', 'dc.date'), tree)
    if pub_date:
        # Presumably formatted YYYY-MM-DD; a partial date (e.g. "2013-05") is
        # skipped rather than crashing on a missing component.
        parts = pub_date.split('-')
        if len(parts) >= 3:
            article['date_published'] = make_datestamp(
                parts[2], parts[1], parts[0])

    # Specific abstract scrapers for subsidiary journals
    journal_abstract_queries = {
        'The EMBO Journal': "//p[@class='lead']",
        'EMBO reports': "//p[@class='lead']",
        'Oncogene': "//p[@class='abs lead']",
    }
    query = journal_abstract_queries.get(article['journal'])
    if query is not None:
        text = _npg_first_text(tree, query)
        if text is not None:
            article['abstract'] = text

    return article