def scrape(abstract_url):
    """Scrape a Rockefeller University Press ('rup') abstract page.

    Returns an article dict; fields that cannot be found on the page are
    left at their make_blank_article() defaults.
    """
    tree, urls, page_text = get_tree(abstract_url)
    article = make_blank_article()
    article['scraper'] = 'rup'
    article['source_urls'] = [uri for _, uri in urls]

    article['title'] = get_meta('DC.Title', tree)
    article['journal'] = get_meta('citation_journal_title', tree)
    article['publisher'] = get_meta('DC.Publisher', tree)
    article['author_names'] = get_meta_list('DC.Contributor', tree)

    # Guard the abstract lookup: the other scrapers in this package treat
    # a missing abstract node as "no abstract", not as a hard failure.
    try:
        article['abstract'] = strip_space(
            tree.xpath("//div[@class='section abstract']/p")[0].text_content())
    except IndexError:
        pass

    article['citation']['journal'] = get_meta('citation_journal_abbrev', tree)
    article['citation']['volume'] = get_meta('citation_volume', tree)
    article['citation']['page'] = get_meta('citation_firstpage', tree)

    article['ids'] = {'doi': get_meta('DC.Identifier', tree)}

    pub_date = get_meta('DC.Date', tree)
    if pub_date:
        # DC.Date looks like YYYY-MM-DD.
        year, month, day = pub_date.split('-')
        article['date_published'] = make_datestamp(day, month, year)
        article['citation']['year'] = year

    return article
def scrape(abstract_url):
    """Scrape an IOP ('iop') abstract page.

    Returns an article dict; fields that cannot be found on the page are
    left at their make_blank_article() defaults.
    """
    tree, urls, page_text = get_tree(abstract_url)
    article = make_blank_article()
    article['scraper'] = 'iop'
    article['source_urls'] = [uri for _, uri in urls]
    article['publisher'] = get_meta('citation_publisher', tree)

    # IOP pages are inconsistent about meta-tag capitalisation; fall back
    # to the alternate spelling when the first lookup finds nothing.
    article['title'] = get_meta('dc.title', tree)
    if article['title'] is None:
        article['title'] = get_meta('dc.Title', tree)

    article['author_names'] = get_meta_list('dc.creator', tree)
    if article['author_names'] is None:
        article['author_names'] = get_meta_list('dc.contributor', tree)

    article['abstract'] = get_meta('dc.description', tree)
    article['journal'] = get_meta('citation_journal_title', tree)
    article['citation']['journal'] = get_meta('citation_journal_abbrev', tree)
    article['citation']['volume'] = get_meta('citation_volume', tree)
    article['citation']['page'] = get_meta('citation_firstpage', tree)

    article['ids'] = {'doi': get_meta('citation_doi', tree)}

    pub_date = get_meta('citation_publication_date', tree)
    if pub_date:
        # citation_publication_date looks like YYYY-MM-DD.
        year, month, day = pub_date.split('-')
        article['date_published'] = make_datestamp(day, month, year)
        article['citation']['year'] = year

    return article
def scrape(abstract_url):
    """Scrape an NPG/EMBO-family abstract page.

    Delegates the bulk of the work to the `sn` scraper, then replaces the
    abstract using a journal-specific XPath.
    """
    tree, urls, page_text = get_tree(abstract_url)
    article = sn.scrape(abstract_url)
    article['abstract'] = None

    # Per-journal XPath for the abstract paragraph. The two EMBO titles
    # share the same markup; Oncogene uses a different class.
    abstract_xpaths = {
        'The EMBO Journal': "//p[@class='lead']",
        'EMBO reports': "//p[@class='lead']",
        'Oncogene': "//p[@class='abs lead']",
    }
    xpath = abstract_xpaths.get(article['journal'])
    if xpath is not None:
        try:
            article['abstract'] = tree.xpath(xpath)[0].text_content()
        except IndexError:
            # No abstract node on the page; leave it as None.
            pass

    return article
def scrape(abstract_url):
    """Scrape an RSC ('rsc') abstract page.

    Returns an article dict; fields that cannot be found on the page are
    left at their make_blank_article() defaults.
    """
    tree, urls, page_text = get_tree(abstract_url)
    article = make_blank_article()
    article['scraper'] = 'rsc'
    article['source_urls'] = [uri for _, uri in urls]

    article['ids'] = {'doi': get_meta('DC.Identifier', tree)}
    article['title'] = get_meta('DC.title', tree)
    article['publisher'] = get_meta('DC.publisher', tree)
    article['author_names'] = get_meta_list('DC.Creator', tree)

    # Journal name is carried in the title of the banner image; catch only
    # the IndexError a missing node produces instead of a bare except.
    try:
        article['journal'] = tree.xpath("//img[@id='imgLoader']/@title")[0]
    except IndexError:
        pass

    try:
        article['abstract'] = tree.xpath(
            "//p[@xmlns='http://www.rsc.org/schema/rscart38']")[0].text_content()
    except IndexError:
        pass

    article['citation']['journal'] = get_meta('citation_journal_title', tree)
    article['citation']['volume'] = get_meta('citation_volume', tree)
    article['citation']['page'] = get_meta('citation_firstpage', tree)

    pub_date = get_meta('citation_publication_date', tree)
    if pub_date:
        # citation_publication_date looks like YYYY-MM-DD.
        year, month, day = pub_date.split('-')
        article['date_published'] = make_datestamp(day, month, year)
        article['citation']['year'] = year

    return article
def scrape(abstract_url):
    """Scrape a Wiley Online Library ('wiley') abstract page.

    Returns an article dict; fields that cannot be found on the page are
    left at their make_blank_article() defaults.
    """
    abstract_url = fix_wiley_url(abstract_url)
    tree, urls, page_text = get_tree(abstract_url)
    article = make_blank_article()
    article['scraper'] = 'wiley'
    article['source_urls'] = [uri for _, uri in urls]

    # get_meta lookups need no try/except; a miss yields None as used
    # unguarded throughout the other scrapers in this package.
    article['journal'] = get_meta('citation_journal_title', tree)
    article['title'] = get_meta('citation_title', tree)
    article['ids'] = {'doi': get_meta('citation_doi', tree)}
    article['author_names'] = get_meta_list('citation_author', tree)

    # Regular articles keep the abstract under #abstract; some pages only
    # have a graphical abstract. If both are missing, leave the default
    # (the original raised from inside the except clause in that case).
    for div_id in ('abstract', 'graphicalAbstract'):
        paragraphs = tree.xpath("//div[@id='%s']/div/p" % div_id)
        if paragraphs:
            article['abstract'] = paragraphs[0].text_content()
            break

    # Prefer the print publication date, fall back to the online date;
    # guard the split so a page with neither does not crash.
    pub_date = get_meta('citation_publication_date', tree)
    if pub_date is None:
        pub_date = get_meta('citation_online_date', tree)
    if pub_date:
        # Wiley dates look like YYYY/MM/DD.
        year, month, day = pub_date.split('/')
        article['date_published'] = make_datestamp(day, month, year)
        article['citation']['year'] = year

    article['citation']['journal'] = article['journal']
    article['citation']['volume'] = get_meta('citation_volume', tree)

    # Only record a page range when both endpoints are known.
    first_page = get_meta('citation_firstpage', tree) or '0'
    last_page = get_meta('citation_lastpage', tree) or '0'
    if first_page != '0' and last_page != '0':
        article['citation']['page'] = first_page + '-' + last_page

    return article
def scrape(abstract_url):
    """Scrape `abstract_url` and validate the result.

    Delegates to scrape_tree(), then checks that every field listed in
    NECESSARY_FIELDS is present and non-empty.

    Raises:
        ScraperNotFound: if any required field is missing or falsy; the
            offending field name is printed first as a diagnostic.
    """
    tree, urls, page_text = get_tree(abstract_url)
    article = scrape_tree(tree, urls, page_text)
    for field in NECESSARY_FIELDS:
        if field not in article or not article[field]:
            # print(field) is valid in both Python 2 and 3; the original
            # `print field` statement is Python-2-only syntax.
            print(field)
            raise ScraperNotFound
    return article
def scrape(abstract_url):
    """Scrape an Astronomy & Astrophysics ('AA') abstract page.

    Most metadata is pulled out of the page's #head block by string
    splitting, since the site exposes little structured metadata.
    """
    tree, urls, page_text = utils.get_tree(abstract_url)
    article = make_blank_article()
    article['scraper'] = 'AA'
    article['source_urls'] = [uri for _, uri in urls]

    try:
        article['title'] = tree.xpath("//div[@id='head']/h2")[0].text_content()
    except IndexError:
        pass

    article['author_names'] = tree.xpath("//div[@id='head']/p")[0].text_content()
    # The first entry of the keywords meta tag is the journal name.
    article['journal'] = \
        tree.xpath("//meta[@name='keywords']/@content")[0].split(',')[0]

    # Full header text: holds the abstract, the acceptance date and the
    # issue information. (A second date derivation from the 'history'
    # paragraph in the original was dead code -- its result was
    # immediately overwritten below -- and has been dropped.)
    info = tree.xpath("//div[@id='head']")[0].text_content()

    article['abstract'] = info.split("Abstract")[1].split("Key words")[0]

    # Use the "Accepted:" date (e.g. "12 March 2013") as the publication
    # date; `months` maps the month name to its number.
    accepted = info.split("Accepted: ")[1].split("\nAbstract")[0]
    day, month_name, year = accepted.split(' ')
    article['date_published'] = make_datestamp(day, months[month_name], year)
    article['citation']['year'] = year

    # Issue info precedes the title in the header text.
    issue_parts = info.split(article['title'])[0].split(' ')
    article['citation']['journal'] = issue_parts[0]
    article['citation']['volume'] = issue_parts[1].split(',')[0]

    # The DOI travels in the URL query string (?...doi=...&...).
    doi = article['source_urls'][0].split('doi=')[1].split('&')[0]
    # Store ids as a dict, matching every other scraper in this package
    # (the original assigned the bare DOI string here).
    article['ids'] = {'doi': doi}

    return article
def scrape(abstract_url):
    """Scrape a Cell Press ('cell') abstract page.

    Delegates to the `smt` scraper, then swaps in the abstract taken from
    the text after "Summary" in the page's #load block.
    """
    tree, urls, page_text = utils.get_tree(abstract_url)
    article = smt.scrape(abstract_url)
    page_body = tree.xpath("//div[@id='load']")[0].text_content()
    article["abstract"] = page_body.split("Summary")[1]
    article["scraper"] = "cell"
    return article
def scrape(abstract_url):
    """Scrape an ACS ('acs') abstract page.

    Returns an article dict; fields that cannot be found on the page are
    left at their make_blank_article() defaults.
    """
    tree, urls, page_text = utils.get_tree(abstract_url)
    article = make_blank_article()
    article["scraper"] = "acs"
    article["source_urls"] = [uri for _, uri in urls]

    article["title"] = utils.get_meta("dc.Title", tree)
    article["publisher"] = utils.get_meta("dc.Publisher", tree)
    article["author_names"] = utils.get_meta_list("dc.Creator", tree)
    article["ids"] = {"doi": utils.get_meta("dc.Identifier", tree)}

    # Narrow the guards to the IndexError a missing node produces,
    # instead of the original bare excepts.
    try:
        article["journal"] = tree.xpath("//div[@id='journalTop']/div/a/img/@alt")[0]
    except IndexError:
        pass

    try:
        article["abstract"] = tree.xpath("//div[@id='abstractBox']/p")[0].text_content()
    except IndexError:
        pass

    try:
        article["citation"]["journal"] = tree.xpath("//div[@id='citation']/cite")[0].text
    except IndexError:
        pass

    try:
        article["citation"]["volume"] = tree.xpath("//span[@class='citation_volume']")[0].text
    except IndexError:
        pass

    # Page number appears as "pp NNN" inside the citation block.
    page_cite = tree.xpath("//div[@id='citation']")
    if page_cite:
        page = re.findall(r"pp\s([0-9]+)", page_cite[0].text_content())
        if page:
            article["citation"]["page"] = page[0]

    # dc.Date looks like "March 5, 2013"; guard against a missing tag
    # (the original crashed with AttributeError on None).
    date_meta = utils.get_meta("dc.Date", tree)
    if date_meta:
        date = date_meta.split()
        article["date_published"] = utils.make_datestamp(
            date[1][:-1], months[date[0]], date[2])
        article["citation"]["year"] = date[2]

    return article
def scrape(abstract_url):
    """Scrape an MIT Press Journals ('MIT') abstract page.

    Returns an article dict; citation details are parsed out of the text
    that follows the journal-title heading.
    """
    tree, urls, page_text = get_tree(abstract_url)
    article = make_blank_article()
    article['scraper'] = 'MIT'
    article['source_urls'] = [uri for _, uri in urls]

    article['title'] = get_meta('dc.Title', tree)
    article['publisher'] = get_meta('dc.Publisher', tree).strip()
    article['author_names'] = get_meta_list('dc.Creator', tree)

    # Two identifier schemes used --- do we want both?
    article['ids'] = dict(zip(['publisher-id', 'doi'],
                              get_meta_list('dc.Identifier', tree)))

    article['journal'] = tree.xpath("//h1[@class='journalTitle']/a/img/@alt")[0]

    # dc.Description is present, but contains an abbreviated abstract
    # --- this gets the full abstract
    article['abstract'] = tree.xpath("//div[@class='abstractSection']/p/text()")[0]

    # Citation details (volume, number, pages) given as text
    # immediately following the h1 tag.
    # example: December 2012, Vol. 24, No. 2, Pages 1-35
    citation_text = tree.xpath(
        "//h1[@class='journalTitle']/following-sibling::text()")[0]
    pub_year, volume, number, page_first, page_last = \
        re.findall(r'\d+', citation_text)

    # Fill the existing citation dict rather than rebinding it to {}:
    # the original reset article['citation'] here, which silently threw
    # away the 'journal' entry it had just set.
    article['citation']['journal'] = article['journal']
    article['citation']['volume'] = volume
    article['citation']['number'] = number
    article['citation']['page_first'] = page_first
    article['citation']['page_last'] = page_last

    # dc.Date looks like YYYY-MM-DD; guard against a missing tag (the
    # original crashed with AttributeError on None before its truthiness
    # check could run).
    pub_date = get_meta('dc.Date', tree)
    if pub_date:
        date = pub_date.split('-')
        article['date_published'] = make_datestamp(date[2], date[1], date[0])
        article['citation']['year'] = date[0]

    return article