Example #1
def scrape(url):
    # Ask for the plain-text version of the abstract.
    if "?fmt=txt" not in url:
        url = url + "?fmt=txt"

    # Follow any redirects, keeping the chain of URLs passed through.
    urls, response = get_response_chain(url)

    abstxt = response.read()

    paper = build_paper(abstxt)
    # Record every URL in the redirect chain plus the original abstract URL.
    paper["source_urls"] = [uri for _, uri in urls] + [url.replace("?fmt=txt", "")]

    return paper
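
These scrapers all rely on a get_response_chain helper (reached via utils in the later examples) that is not shown here. Below is a minimal sketch of what such a helper could look like, assuming it records each (status code, URL) hop while following redirects and returns that list together with the final response; the real helper may differ.

import urllib2

class _RecordingRedirectHandler(urllib2.HTTPRedirectHandler):
    def __init__(self, chain):
        self.chain = chain

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        # Remember each redirect hop before letting urllib2 follow it.
        self.chain.append((code, newurl))
        return urllib2.HTTPRedirectHandler.redirect_request(
            self, req, fp, code, msg, headers, newurl)

def get_response_chain(request):
    # 'request' may be a URL string or a urllib2.Request object.
    chain = []
    opener = urllib2.build_opener(_RecordingRedirectHandler(chain))
    response = opener.open(request)
    # Include the final URL so the chain always ends at the page that was read.
    chain.append((response.getcode(), response.geturl()))
    return chain, response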
Example #2
import gzip
import urllib2
from StringIO import StringIO

import lxml.html

import utils


def scrape(abstract_url):
  # Fetch the abstract page, following redirects and keeping the URL chain.
  req = urllib2.Request(abstract_url, headers=utils.headers)
  urls, response = utils.get_response_chain(req)

  # Some responses come back gzip-compressed; decompress before decoding.
  if response.info().get('Content-Encoding') == 'gzip':
    buf = StringIO(response.read())
    data = gzip.GzipFile(fileobj=buf).read()
  else:
    data = response.read()

  page_text = data.decode('utf-8')
  tree = lxml.html.fromstring(page_text)

  article = make_blank_article()
  article['scraper'] = 'npg'
  article['source_urls'] = [uri for _, uri in urls]

  # The Dublin Core meta names appear in both upper and lower case, so fall
  # back to the lowercase variant whenever the uppercase one is missing.
  article['title'] = get_meta('DC.title', tree)
  if article['title'] is None:
    article['title'] = get_meta('dc.title', tree)

  article['publisher'] = get_meta('DC.publisher', tree)
  if article['publisher'] is None:
    article['publisher'] = get_meta('dc.publisher', tree)

  article['author_names'] = get_meta_list('DC.creator', tree)
  if article['author_names'] is None:
    article['author_names'] = get_meta_list('dc.creator', tree)
  
  # Prefer the description meta tag for the abstract, then fall back to the
  # page body.
  article['abstract'] = get_meta('description', tree)
  if not article['abstract']:
    try:
      article['abstract'] = tree.xpath("//div[@class='content']/p")[0].text_content()
    except IndexError:
      pass
  if not article['abstract']:
    try:
      article['abstract'] = tree.xpath("//div[@id='abs']/p")[0].text_content()
    except IndexError:
      pass

  article['citation']['journal'] = get_meta('citation_journal_title', tree)
  article['citation']['volume'] = get_meta('prism.volume', tree)
  article['citation']['page'] = get_meta('prism.startingPage', tree)

  article['journal'] = get_meta('prism.publicationName', tree)

  # citation_date starts with the four-digit year.
  year = get_meta('citation_date', tree)
  if year:
    article['citation']['year'] = year[0:4]

  # Drop the first four characters (the "doi:" prefix) of the citation_doi tag.
  article['ids'] = {'doi': tree.xpath("//meta[@name='citation_doi']/@content")[0][4:]}

  pub_date = get_meta('DC.date', tree)
  if pub_date is None:
    pub_date = get_meta('dc.date', tree)

  # DC.date looks like YYYY-MM-DD; pass the pieces as (day, month, year).
  if pub_date:
    split = pub_date.split('-')
    article['date_published'] = make_datestamp(split[2], split[1], split[0])

  # Specific abstract scrapers for subsidiary journals.
  if article['journal'] in ('The EMBO Journal', 'EMBO reports'):
    try:
      article['abstract'] = tree.xpath("//p[@class='lead']")[0].text_content()
    except IndexError:
      pass

  elif article['journal'] == 'Oncogene':
    try:
      article['abstract'] = tree.xpath("//p[@class='abs lead']")[0].text_content()
    except IndexError:
      pass

  return article
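
The get_meta and get_meta_list helpers used above are not shown in these examples. Here is a hypothetical sketch, assuming they simply read <meta name="..." content="..."> tags from the parsed tree and return None when nothing matches; the real helpers may behave differently.

def get_meta(name, tree):
  # Return the content of the first matching <meta> tag, or None.
  values = tree.xpath("//meta[@name='%s']/@content" % name)
  return values[0] if values else None

def get_meta_list(name, tree):
  # Return the contents of all matching <meta> tags, or None if there are none.
  values = tree.xpath("//meta[@name='%s']/@content" % name)
  return values if values else None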
Example #3
import datetime
import re
import time
import urllib2

import lxml.html

import utils


def scrape(abstract_url):
  req = urllib2.Request(abstract_url, headers=headers)
  urls, page = utils.get_response_chain(req)

  # Parse the HTML into a tree we can query
  page_text = page.read().decode('utf-8')
  tree = lxml.html.fromstring(page_text, base_url=abstract_url)

  # The first <h1> holds the article title and the second <h2> holds the
  # "cite as" line.
  title = tree.xpath('//h1')[0].text_content().strip()
  cite_as = tree.xpath('//h2')[1].text.strip()

  # Scrub the citation: collapse whitespace and drop bracketed notes.
  cite_as = re.sub(r'\s+', ' ', cite_as)
  cite_as = re.sub(r' \[.*?\]', '', cite_as)

  # Make our article object
  article = {}
  article['scraper'] = 'pr'
  article['title'] = title
  article['cite'] = cite_as
  
  try:
    article['citation'] = parse_citation(cite_as)
  except Exception:
    pass

  # Grab all links inside the <div> with id='aps-authors' and take their text as the author list.
  article['author_names'] = [author.text.strip() for author in tree.xpath("//div[@id='aps-authors']//a")]

  # Drop the "Hide All Authors/Affiliations" toggle if it was picked up as an author.
  try:
    article['author_names'].remove('Hide All Authors/Affiliations')
  except ValueError:
    pass

  # Find the div with class 'aps-abstractbox' and grab the text of the first <p> within it as the abstract
  try:
    article['abstract'] = tree.xpath("//div[@class='aps-abstractbox']/p")[0].text_content()
  except IndexError:
    pass

  months = {'January':1, 'February':2, 'March':3, 'April':4, 'May':5, 'June':6, 'July':7, 'August':8, 'September':9, 'October':10, 'November':11, 'December':12}

  # Received 21 December 2011; revised 18 February 2012; published  9 April 2012

  date_received = re.findall(r'Received\s+([0-9]+)\s+([A-Za-z]+)\s+([0-9]+)', page_text)
  date_revised = re.findall(r'revised\s+([0-9]+)\s+([A-Za-z]+)\s+([0-9]+)', page_text)
  date_published = re.findall(r'published\s+([0-9]+)\s+([A-Za-z]+)\s+([0-9]+)', page_text)

  def make_datestamp(date_tuple):
    # Convert a (day, month name, year) tuple into a Unix timestamp.
    year = int(date_tuple[2])
    month = months[date_tuple[1]]
    day = int(date_tuple[0])
    return time.mktime(datetime.date(year, month, day).timetuple())

  if date_received:
    article['date_received'] = make_datestamp(date_received[0])
  if date_revised:
    article['date_revised'] = make_datestamp(date_revised[0])
  if date_published:
    article['date_published'] = make_datestamp(date_published[0])

  # In the <div> with id 'aps-article-info', pair each bold label cell with the
  # value cell next to it to build the id-type -> id mapping.
  article['ids'] = dict(zip([e.text.strip().lower().replace(':', '') for e in tree.xpath("//div[@id='aps-article-info']//div[@class='table-cell bold']")],
                            [e.text.strip() for e in tree.xpath("//div[@id='aps-article-info']//div[@class='table-cell']")]))

  if 'subject areas' in article['ids']:
    article['pr_subject_areas'] = article['ids']['subject areas']
    del article['ids']['subject areas']

  article['journal'] = recognise_journal(page.geturl())
  article['source_urls'] = [uri for _, uri in urls]

  # PACS will be recognised as an id, even though it's actually a list of categories.
  # Split them out into their own custom field and delete them from ids.
  if 'pacs' in article['ids']:
    article['categories'] = {'PACS':[c.strip() for c in
                                     article['ids']['pacs'].split(',')]}
    del article['ids']['pacs']

  
  return article
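
A hedged usage sketch for the last scraper: the URL below is only a placeholder for a Physical Review abstract page, and the printed fields follow the assignments made in scrape above.

if __name__ == '__main__':
  # Placeholder URL; substitute a real abstract page before running.
  article = scrape('http://journals.aps.org/some-abstract-url')
  print article['title']
  print article['author_names']
  print article['journal']
  print article['source_urls']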