Example #1
0
def scrape(abstract_url):
  """Build an article record (scraper tag 'rup') for ``abstract_url``.

  The page is loaded with ``get_tree``; title, journal, publisher, authors,
  citation details and DOI are looked up by name through ``get_meta`` /
  ``get_meta_list``.  The abstract is the text of the first paragraph found
  under the 'section abstract' div, passed through ``strip_space``.

  Returns the populated article dict.

  Indexing is unguarded: a page with no abstract paragraph, or a DC.Date
  value with fewer than three '-' separated parts, fails with an indexing
  error (IndexError, assuming ``xpath`` returns a list).
  """
  tree, urls, _page_text = get_tree(abstract_url)

  article = make_blank_article()
  article['scraper'] = 'rup'
  article['source_urls'] = [link for _unused, link in urls]

  # Simple one-to-one copies from named meta entries.
  for field, meta_name in (('title', 'DC.Title'),
                           ('journal', 'citation_journal_title'),
                           ('publisher', 'DC.Publisher')):
    article[field] = get_meta(meta_name, tree)
  article['author_names'] = get_meta_list('DC.Contributor', tree)

  abstract_paragraphs = tree.xpath("//div[@class='section abstract']/p")
  article['abstract'] = strip_space(abstract_paragraphs[0].text_content())

  citation = article['citation']
  for field, meta_name in (('journal', 'citation_journal_abbrev'),
                           ('volume', 'citation_volume'),
                           ('page', 'citation_firstpage')):
    citation[field] = get_meta(meta_name, tree)

  article['ids'] = {'doi': get_meta('DC.Identifier', tree)}

  pub_date = get_meta('DC.Date', tree)
  if not pub_date:
    return article

  # The date is split on '-' and read as year, month, day (in that order).
  parts = pub_date.split('-')
  article['date_published'] = make_datestamp(parts[2], parts[1], parts[0])
  citation['year'] = parts[0]

  return article
Example #2
0
def scrape(abstract_url):
  """Build an article record (scraper tag 'iop') for ``abstract_url``.

  All fields come from named meta entries read with ``get_meta`` /
  ``get_meta_list``.  The title falls back from 'dc.title' to 'dc.Title',
  and the author list from 'dc.creator' to 'dc.contributor', whenever the
  first lookup yields None.

  Returns the populated article dict.  A publication date with fewer than
  three '-' separated parts raises IndexError.
  """
  tree, urls, _page_text = get_tree(abstract_url)

  article = make_blank_article()
  article['scraper'] = 'iop'
  article['source_urls'] = [link for _unused, link in urls]

  article['publisher'] = get_meta('citation_publisher', tree)

  title = get_meta('dc.title', tree)
  if title is None:
    # Second attempt with the differently-cased meta name.
    title = get_meta('dc.Title', tree)
  article['title'] = title

  authors = get_meta_list('dc.creator', tree)
  if authors is None:
    authors = get_meta_list('dc.contributor', tree)
  article['author_names'] = authors

  article['abstract'] = get_meta('dc.description', tree)
  article['journal'] = get_meta('citation_journal_title', tree)

  citation = article['citation']
  for field, meta_name in (('journal', 'citation_journal_abbrev'),
                           ('volume', 'citation_volume'),
                           ('page', 'citation_firstpage')):
    citation[field] = get_meta(meta_name, tree)

  article['ids'] = {'doi': get_meta('citation_doi', tree)}

  pub_date = get_meta('citation_publication_date', tree)
  if not pub_date:
    return article

  # The date is split on '-' and read as year, month, day (in that order).
  parts = pub_date.split('-')
  article['date_published'] = make_datestamp(parts[2], parts[1], parts[0])
  citation['year'] = parts[0]

  return article
Example #3
0
def scrape(abstract_url):
    """Return the article from ``sn.scrape`` with a journal-specific abstract.

    The article produced by ``sn.scrape(abstract_url)`` has its 'abstract'
    reset to None.  For the journals listed below the abstract is then read
    from the first paragraph matching that journal's XPath in the tree
    returned by ``get_tree``.  Extraction is best-effort: if the paragraph
    cannot be read the abstract stays None.

    Returns the (modified) article.
    """
    tree, urls, page_text = get_tree(abstract_url)

    article = sn.scrape(abstract_url)
    article['abstract'] = None

    # XPath of the abstract paragraph, keyed by the article's journal name.
    abstract_xpaths = {
        'The EMBO Journal': "//p[@class='lead']",
        'EMBO reports': "//p[@class='lead']",
        'Oncogene': "//p[@class='abs lead']",
    }

    xpath = abstract_xpaths.get(article['journal'])
    if xpath is not None:
        try:
            article['abstract'] = tree.xpath(xpath)[0].text_content()
        except Exception:
            # Best-effort: a page without the expected paragraph simply has
            # no abstract.  Unlike a bare ``except:``, this lets
            # KeyboardInterrupt / SystemExit propagate.
            pass

    return article
Example #4
0
def scrape(abstract_url):
  """Build an article record (scraper tag 'rsc') for ``abstract_url``.

  Title, publisher, authors, DOI and citation details are looked up by name
  through ``get_meta`` / ``get_meta_list``.  The journal name (title
  attribute of the 'imgLoader' image) and the abstract (first <p> matching
  the rscart38 xmlns predicate) are read from the tree on a best-effort
  basis: when either cannot be read, the value set by
  ``make_blank_article`` is left in place.

  Returns the populated article dict.  A publication date with fewer than
  three '-' separated parts raises IndexError.
  """
  tree, urls, page_text = get_tree(abstract_url)

  article = make_blank_article()
  article['scraper'] = 'rsc'
  article['source_urls'] = [uri for _, uri in urls]
  article['ids'] = {'doi': get_meta('DC.Identifier', tree)}

  article['title'] = get_meta('DC.title', tree)
  article['publisher'] = get_meta('DC.publisher', tree)
  article['author_names'] = get_meta_list('DC.Creator', tree)

  # ``except Exception`` keeps these lookups best-effort without also
  # swallowing KeyboardInterrupt / SystemExit as a bare ``except:`` would.
  try:
    article['journal'] = tree.xpath("//img[@id='imgLoader']/@title")[0]
  except Exception:
    pass

  try:
    article['abstract'] = tree.xpath("//p[@xmlns='http://www.rsc.org/schema/rscart38']")[0].text_content()
  except Exception:
    pass

  article['citation']['journal'] = get_meta('citation_journal_title', tree)
  article['citation']['volume'] = get_meta('citation_volume', tree)
  article['citation']['page'] = get_meta('citation_firstpage', tree)

  pub_date = get_meta('citation_publication_date', tree)
  if pub_date:
    # The date is split on '-' and read as year, month, day (in that order).
    split = pub_date.split('-')
    article['date_published'] = make_datestamp(split[2], split[1], split[0])
    article['citation']['year'] = split[0]

  return article
Example #5
0
def scrape(abstract_url):
  """Build an article record (scraper tag 'wiley') for ``abstract_url``.

  The URL is first normalised with ``fix_wiley_url``.  Journal, title, DOI
  and authors are read on a best-effort basis from named meta entries.  The
  abstract is taken from the 'abstract' div, falling back to the
  'graphicalAbstract' div; if neither can be read the abstract keeps the
  value set by ``make_blank_article``.

  The publication date comes from 'citation_publication_date', falling back
  to 'citation_online_date', and is split on '/' into year, month, day.
  When no date is available, 'date_published' and the citation year are
  left untouched.  The citation page range is only set when both first and
  last page are present (and neither is the string '0').

  Returns the populated article dict.  A date that does not split into
  exactly three '/' separated parts raises ValueError.
  """
  abstract_url = fix_wiley_url(abstract_url)
  tree, urls, page_text = get_tree(abstract_url)

  article = make_blank_article()
  article['scraper'] = 'wiley'
  article['source_urls'] = [uri for _, uri in urls]

  # Best-effort meta lookups: a failing lookup leaves the blank default.
  for field, meta_name in (('journal', 'citation_journal_title'),
                           ('title', 'citation_title')):
    try:
      article[field] = get_meta(meta_name, tree)
    except Exception:
      pass
  try:
    article['ids'] = {'doi': get_meta('citation_doi', tree)}
  except Exception:
    pass
  try:
    article['author_names'] = get_meta_list('citation_author', tree)
  except Exception:
    pass

  # Try the regular abstract first, then the graphical one.  Both attempts
  # are guarded, so a page with neither no longer aborts the whole scrape.
  for abstract_xpath in ("//div[@id='abstract']/div/p",
                         "//div[@id='graphicalAbstract']/div/p"):
    try:
      article['abstract'] = tree.xpath(abstract_xpath)[0].text_content()
    except Exception:
      continue
    break

  pub_date = get_meta('citation_publication_date', tree)
  if pub_date is None:
    pub_date = get_meta('citation_online_date', tree)

  if pub_date:
    year, month, day = pub_date.split('/')
    article['date_published'] = make_datestamp(day, month, year)
    article['citation']['year'] = year

  article['citation']['journal'] = article['journal']
  article['citation']['volume'] = get_meta('citation_volume', tree)

  first_page = get_meta('citation_firstpage', tree)
  last_page = get_meta('citation_lastpage', tree)
  # A missing page and a literal '0' are both treated as "no page".
  if first_page not in (None, '0') and last_page not in (None, '0'):
    article['citation']['page'] = first_page + '-' + last_page

  return article
def scrape(abstract_url):
  """Scrape ``abstract_url`` and check that every necessary field is filled.

  The page is loaded with ``get_tree`` and handed to ``scrape_tree``.  Each
  name in ``NECESSARY_FIELDS`` must then be present in the article with a
  truthy value.

  Returns the article.

  Raises ScraperNotFound at the first missing or empty necessary field; the
  field name is printed beforehand.
  """
  tree, urls, page_text = get_tree(abstract_url)

  article = scrape_tree(tree, urls, page_text)

  for field in NECESSARY_FIELDS:
    if field not in article or not article[field]:
      # Call form of print: same output under Python 2, valid in Python 3.
      print(field)
      raise ScraperNotFound

  return article
Example #7
0
def scrape(abstract_url):
    """Build an article record (scraper tag 'AA') for ``abstract_url``.

    Everything is parsed out of the text of the page's 'head' div:

    * title: the <h2> inside the head div (best-effort);
    * author_names: text of the first <p> inside the head div, stored as a
      single string;
    * journal: first comma-separated entry of the 'keywords' meta content;
    * abstract: text between "Abstract" and "Key words";
    * date_published / citation year: the "Accepted: <day> <month> <year>"
      entry, with the month translated through the ``months`` mapping;
    * citation journal / volume: first two space-separated words of the
      text preceding the title;
    * ids: ``{'doi': ...}`` taken from the 'doi=' query parameter of the
      first source URL.

    Returns the populated article dict.  Apart from the title lookup, the
    parsing is unguarded and raises (IndexError / KeyError / ValueError)
    when the page does not have the expected layout.
    """
    tree, urls, page_text = utils.get_tree(abstract_url)

    article = make_blank_article()
    article['scraper'] = 'AA'
    article['source_urls'] = [uri for _, uri in urls]

    try:
        article['title'] = tree.xpath("//div[@id='head']/h2")[0].text_content()
    except Exception:
        # Best-effort: keep the blank title if the heading is absent.
        pass

    article['author_names'] = tree.xpath("//div[@id='head']/p")[0].text_content()

    article['journal'] = tree.xpath("//meta[@name='keywords']/@content")[0].split(',')[0]

    info = tree.xpath("//div[@id='head']")[0].text_content()

    article['abstract'] = info.split("Abstract")[1].split("Key words")[0]

    # The date previously parsed from the 'history' paragraph was always
    # overwritten by this one, so only the "Accepted:" date is computed.
    accepted = info.split("Accepted: ")[1].split("\nAbstract")[0]
    day, month, year = accepted.split(' ')
    article['date_published'] = make_datestamp(day, months[month], year)
    article['citation']['year'] = year

    # Text before the title starts with "<journal> <volume>, ...".
    issue_words = info.split(article['title'])[0].split(' ')
    article['citation']['journal'] = issue_words[0]
    article['citation']['volume'] = issue_words[1].split(',')[0]

    doi = article['source_urls'][0].split('doi=')[1].split('&')[0]
    # Stored as a dict keyed by 'doi', like the other scrapers' 'ids' value.
    article['ids'] = {'doi': doi}

    return article
Example #8
0
def scrape(abstract_url):
    """Return the ``smt.scrape`` article with a Cell-specific abstract.

    The abstract is whatever follows the first "Summary" marker (up to the
    next one, if any) in the text of the page's 'load' div.  The article's
    scraper tag is set to "cell".

    Raises an indexing error when the 'load' div or the "Summary" marker is
    missing.
    """
    tree, urls, page_text = utils.get_tree(abstract_url)
    article = smt.scrape(abstract_url)

    load_nodes = tree.xpath("//div[@id='load']")
    pieces = load_nodes[0].text_content().split("Summary")

    article["abstract"] = pieces[1]
    article["scraper"] = "cell"
    return article
Example #9
0
def scrape(abstract_url):
    """Build an article record (scraper tag "acs") for ``abstract_url``.

    Title, publisher, authors and DOI are looked up by name through
    ``utils.get_meta`` / ``utils.get_meta_list``.  Journal, abstract,
    citation journal and citation volume are read from the tree on a
    best-effort basis.  The first page is the number following "pp " in the
    text of the 'citation' div, when present.

    The 'dc.Date' meta value is split on whitespace; a missing or empty
    value leaves 'date_published' and the citation year untouched.

    Returns the populated article dict.  A non-empty date with fewer than
    three parts, or a month missing from ``months``, still raises.
    """
    tree, urls, page_text = utils.get_tree(abstract_url)

    article = make_blank_article()
    article["scraper"] = "acs"
    article["source_urls"] = [uri for _, uri in urls]

    article["title"] = utils.get_meta("dc.Title", tree)
    article["publisher"] = utils.get_meta("dc.Publisher", tree)
    article["author_names"] = utils.get_meta_list("dc.Creator", tree)

    article["ids"] = {"doi": utils.get_meta("dc.Identifier", tree)}

    # ``except Exception`` keeps these lookups best-effort without also
    # swallowing KeyboardInterrupt / SystemExit as a bare ``except:`` would.
    try:
        article["journal"] = tree.xpath("//div[@id='journalTop']/div/a/img/@alt")[0]
    except Exception:
        pass

    try:
        article["abstract"] = tree.xpath("//div[@id='abstractBox']/p")[0].text_content()
    except Exception:
        pass

    try:
        article["citation"]["journal"] = tree.xpath("//div[@id='citation']/cite")[0].text
    except Exception:
        pass

    try:
        article["citation"]["volume"] = tree.xpath("//span[@class='citation_volume']")[0].text
    except Exception:
        pass

    page_cite = tree.xpath("//div[@id='citation']")
    if page_cite:
        page = re.findall(r"pp\s([0-9]+)", page_cite[0].text_content())
        if page:
            article["citation"]["page"] = page[0]

    # Guard before splitting: a missing meta value must not be dereferenced.
    raw_date = utils.get_meta("dc.Date", tree)
    date = raw_date.split() if raw_date else []
    if date:
        # Parts are used as month name, day (last character dropped), year
        # -- presumably "Month D, YYYY"; confirm against real pages.
        article["date_published"] = utils.make_datestamp(date[1][:-1], months[date[0]], date[2])
        article["citation"]["year"] = date[2]

    return article
Example #10
0
def scrape(abstract_url):
  """Build an article record (scraper tag 'MIT') for ``abstract_url``.

  Title, publisher, authors and identifiers are looked up by name through
  ``get_meta`` / ``get_meta_list``.  The journal name is the alt text of the
  image in the 'journalTitle' heading, and the abstract is the first text
  node of the first 'abstractSection' paragraph.

  Volume, number and page range are parsed from the text that follows the
  'journalTitle' heading.  These are added to the existing citation dict,
  so the citation journal set earlier is kept.

  A missing 'dc.Publisher' or 'dc.Date' meta value is tolerated.

  Returns the populated article dict.  Raises IndexError when the journal
  image, abstract or citation text is absent, and ValueError when the
  citation text does not contain exactly five numbers.
  """
  tree, urls, page_text = get_tree(abstract_url)

  article = make_blank_article()
  article['scraper'] = 'MIT'
  article['source_urls'] = [uri for _, uri in urls]

  article['title'] = get_meta('dc.Title', tree)
  publisher = get_meta('dc.Publisher', tree)
  article['publisher'] = publisher.strip() if publisher else publisher
  article['author_names'] = get_meta_list('dc.Creator', tree)

  # Two identifier schemes used --- do we want both?
  article['ids'] = dict(zip(['publisher-id', 'doi'], get_meta_list('dc.Identifier', tree)))

  article['journal'] = tree.xpath("//h1[@class='journalTitle']/a/img/@alt")[0]

  # dc.Description is present, but contains an abbreviated abstract
  # --- this gets the full abstract
  article['abstract'] = tree.xpath("//div[@class='abstractSection']/p/text()")[0]

  article['citation']['journal'] = article['journal']

  # Citation details (volume, number, pages) given as text
  # immediately following the h1 tag.
  # example: December 2012, Vol. 24, No. 2, Pages 1-35
  citation_text = tree.xpath("//h1[@class='journalTitle']/following-sibling::text()")[0]

  # The leading number (the year in the example above) is not used here;
  # the citation year is taken from dc.Date below.
  _pub_year, volume, number, page_first, page_last = re.findall(r'\d+', citation_text)

  article['citation']['volume'] = volume
  article['citation']['number'] = number
  article['citation']['page_first'] = page_first
  article['citation']['page_last'] = page_last

  # Guard before splitting: a missing meta value must not be dereferenced.
  raw_date = get_meta('dc.Date', tree)
  date = raw_date.split('-') if raw_date else []
  if date:
    # The date is split on '-' and read as year, month, day (in that order).
    article['date_published'] = make_datestamp(date[2], date[1], date[0])
    article['citation']['year'] = date[0]

  return article