def testOverlap():
    """Check whether the linked-paper lists of two theses by the same
    author name ('Williams, B', 2002 and 2010) share any papers."""
    author_name = 'Williams, B'
    theses = []
    for phd_year in (2002, 2010):
        hits = ads.query('bibstem:*PhDT', authors=author_name,
                         dates=phd_year, database='astronomy', rows='all')
        theses.append(list(hits)[0])

    # Pull the linked-paper list for each thesis and intersect them.
    linked_a = phdArticle2row(theses[0], returnLinkedPapers=True)
    linked_b = phdArticle2row(theses[1], returnLinkedPapers=True)
    overlap = list(set(linked_a) & set(linked_b))
    print('number of overlap papers in linked lists = %i' % len(overlap))
def generate_df(arXiv_dict):
    """Build a DataFrame of ADS metadata for a set of arXiv entries.

    Parameters
    ----------
    arXiv_dict : dict
        Maps an arXiv key to a dict with at least 'comment' and 'title'.

    Returns
    -------
    pandas.DataFrame
        One row per arXiv entry whose title matched exactly one ADS
        record. Entries whose ADS lookup fails, or that match zero or
        multiple records, are skipped (best effort).
    """
    rows = []
    # Iterate items directly instead of the py2-only arXiv_dict.keys()[i].
    for key, entry in arXiv_dict.items():
        pages, figures = search_comments(entry['comment'])
        try:
            ads_paper = list(ads.query(title=str(entry['title'])))
            if len(ads_paper) != 1:
                continue
            paper = ads_paper[0]
            rows.append({
                'Arxiv_key': key,
                'Title': paper.title,
                'Author': paper.author,
                'Citation_count': paper.citation_count,
                'Year': paper.year,
                'Pub': paper.pub,
                'Pages': pages,
                'Figures': figures,
            })
        except Exception:
            # Best effort: a failed query or missing field skips the entry.
            # (Narrowed from a bare except so Ctrl-C still works.)
            continue
    # One construction at the end avoids the quadratic cost of
    # repeatedly calling DataFrame.append inside the loop.
    return pd.DataFrame(rows)
def test_citation_tree():
    """Build a two-level citation tree for one article and export it as
    JSON, then remove the exported file."""
    tree_path = "citation-tree.json"

    # Named helpers in place of inline lambdas.
    def _first_author(article):
        return article.author[0]

    def _branch(article, branch):
        return {"name": article.author[0], "children": branch}

    def _leaf(article):
        return {"name": article.author[0]}

    query = ads.query(author="^Casey, Andrew R.", sort="citations",
                      order="desc", rows=1)
    paper = list(query)[0]
    paper.build_citation_tree(depth=2)
    ads.network.export(paper, "citations", tree_path,
                       article_repr=_first_author,
                       new_branch_repr=_branch,
                       end_branch_repr=_leaf,
                       indent=2, clobber=True)
    # Clean up so the test leaves no artifact behind.
    if os.path.exists(tree_path):
        os.unlink(tree_path)
def testOverlap():
    """Check whether the linked-paper lists for two PhD theses matching
    the same author name ('Williams, B', years 2002 and 2010) overlap.

    Prints the number of papers that appear in both linked lists.
    """
    name = 'Williams, B'
    # First thesis: 2002.
    year = 2002
    phd1 = list(ads.query('bibstem:*PhDT', authors=name, dates=year,
                          database='astronomy', rows='all'))[0]
    # Second thesis: same name, 2010.
    year = 2010
    phd2 = list(ads.query('bibstem:*PhDT', authors=name, dates=year,
                          database='astronomy', rows='all'))[0]
    # returnLinkedPapers=True makes phdArticle2row return the list of
    # papers linked to each thesis instead of the summary row.
    list1 = phdArticle2row(phd1, returnLinkedPapers=True)
    list2 = phdArticle2row(phd2, returnLinkedPapers=True)
    overlap = list(set(list1).intersection(list2))
    print 'number of overlap papers in linked lists = %i' % len(overlap)
def testPerson(name, phdyear):
    """Run the row-building pipeline for one author/PhD-year and print
    the resulting institutions and last publication year."""
    theses = list(ads.query('bibstem:*PhDT', authors=name, dates=phdyear,
                            database='astronomy', rows='all'))
    # Verbose/plot on, USA filter off, so the run can be inspected.
    row = phdArticle2row(theses[0], checkUSA=False, verbose=True, plot=True)
    print('PhD Institution: %s' % row['phd aff'])
    print('Latest Institution: %s' % row['latest aff'].encode('utf-8'))
    print('Last year: %i' % row['latest year'])
def test_citation_tree():
    """Build a depth-2 citation tree for the most-cited matching article
    and export it to JSON, then delete the file."""
    output_filename = "citation-tree.json"
    # Most-cited article by this (first-)author, single row.
    paper = list(ads.query(author="^Casey, Andrew R.", sort="citations",
                           order="desc", rows=1))[0]
    paper.build_citation_tree(depth=2)
    # Export the tree; nodes are represented by their first-author name.
    ads.network.export(paper, "citations", output_filename,
                       article_repr=lambda article: article.author[0],
                       new_branch_repr=lambda article, branch: {
                           "name": article.author[0], "children": branch},
                       end_branch_repr=lambda article: {
                           "name": article.author[0]},
                       indent=2, clobber=True)
    # Remove the artifact so repeated runs start clean.
    if os.path.exists(output_filename):
        os.unlink(output_filename)
def __getitem__(self, bibcode):
    """Return the publication for *bibcode*, using the cache when possible.

    Parameters
    ----------
    bibcode : str
        ADS bibcode identifying the paper.

    Returns
    -------
    ADSPub
        The cached publication, or one freshly built from the first
        ADS query result (and cached, when a cache is configured).
    """
    # Serve from the cache when one is configured and it holds the entry.
    if self._ads_cache is not None:
        if bibcode in self._ads_cache:
            return self._ads_cache[bibcode]
    # Cache miss (or no cache): query ADS and take the first result.
    # next(it) works on both Python 2 and 3; .next() was py2-only.
    ads_query = ads.query(query=bibcode)
    pub = ADSPub(next(ads_query))
    # Store for future lookups if we can.
    if self._ads_cache is not None:
        self._ads_cache.insert(pub)
    return pub
def add_by_bibcode(self, bibcode, interactive=False, **kwargs):
    """Query ADS for *bibcode* and add the returned articles to the db,
    skipping duplicates and non-article records."""
    if ads is None:
        log.error("This action requires the ADS key to be setup.")
        return
    for article in ads.query(bibcode):
        # Skip records we already hold.
        if article in self:
            log.warning("{} is already in the db.".format(article.bibcode))
            continue
        # Data products are sometimes returned as NONARTICLE entries.
        if 'NONARTICLE' in article.property:
            log.warning("{} is not an article.".format(article.bibcode))
            continue
        if interactive:
            self.add_interactively(article)
        else:
            self.add(article, **kwargs)
def abstract(request, bibcode):
    """Render the abstract page for a single ADS paper.

    Parameters
    ----------
    request : HttpRequest
        The incoming Django request.
    bibcode : str
        ADS bibcode of the paper to display.

    Returns
    -------
    HttpResponse
        The rendered 'abstract.html' template.
    """
    fields = [
        'bibcode', 'title', 'author', 'aff', 'doi', 'pub', 'pubdate',
        'citation_count', 'abstract', 'arxiv_class', 'volume', 'issue',
        'page', 'year', 'keyword', 'orcid_pub', 'orcid_user', 'orcid_other'
    ]
    q = list(ads.query(bibcode, fl=fields))
    assert len(q) == 1, "Non-unique bibcode"
    paper = q[0]

    bibtex = ads.ExportQuery(bibcode).execute()
    # Pull the arXiv eprint id out of the BibTeX record, if present.
    try:
        eprint = re.search(r'eprint = \{(.+)\}', bibtex)[1]
    except TypeError:
        # re.search returned None: record has no eprint field.
        eprint = None

    # Start from the publisher-supplied ORCIDs and fill in the '-'
    # placeholders from user-claimed, then "other", sources. Each step
    # is best effort: missing/None lists leave the previous value.
    orcid = paper.orcid_pub
    try:
        orcid = [
            pub if pub != '-' else auth
            for pub, auth in zip(paper.orcid_pub, paper.orcid_user)
        ]
    except (AttributeError, TypeError):
        pass
    try:
        orcid = [
            o if o != '-' else other
            for o, other in zip(orcid, paper.orcid_other)
        ]
    except (AttributeError, TypeError):
        pass

    template = loader.get_template('abstract.html')
    context = {
        'paper': paper,
        'eprint': eprint,
        'bibtex': bibtex,
        'authors': zip(paper.author, paper.aff, orcid)
    }
    return HttpResponse(template.render(context, request))
def testRow():
    """Smoke test: build a summary row for one known PhD-thesis bibcode."""
    records = list(ads.query('bibcode:2007PhDT.........3Y',
                             database='astronomy', rows='all'))
    phdArticle2row(records[0])
def phdArticle2row(phdArticle, yearsPrePhD=7, verbose=False, checkUSA=True,
                   justKeys=False, plot=False, returnNetwork=False,
                   returnLinkedPapers=False):
    """Summarize an ADS PhD-thesis article as a dict of career metrics.

    Take an ads article object and return a dict of information with keys:
    [name, phd year, phd bibcode, phd aff, latest year, latest aff,
    latest 1st year, latest 1st aff, largest publication gap, numRecords,
    numLinked, uniqueName, latest year unlinked, noAstroJournal, nonUS,
    hindex, 1st auth hindex]

    Note: Currently not making any cut based on peer-review. Thus, latest
    1st author paper could be a AAS poster, SPIE paper, arXiv posting, etc.

    Parameters: phdArticle is an ads article for the thesis; yearsPrePhD
    widens the search window before the PhD year. justKeys returns only the
    key list; returnLinkedPapers returns the linked-paper list instead of
    the row; returnNetwork also returns the linkage graph.

    XXX-consider pulling some metrics from ADS and putting them in the row.
    """
    if verbose:
        print 'searching for papers linked to:', phdArticle
    result = {}
    resultKeys = ['name', 'phd year', 'phd bibcode', 'phd aff', 'latest year',
                  'latest aff', 'latest 1st year', 'latest 1st aff',
                  'largest publication gap', 'numRecords', 'numLinked',
                  'uniqueName', 'latest year unlinked', 'noAstroJournal',
                  'nonUS', 'hindex', '1st auth hindex']
    if justKeys:
        return resultKeys
    # Pre-fill every key so early returns still carry the full schema.
    for key in resultKeys:
        result[key] = None
    # Search window: a few years before the PhD through today.
    maxYear = datetime.date.today().year
    minYear = int(phdArticle.year) - yearsPrePhD
    years = '%i-%i' % (minYear, maxYear)
    result['name'] = authSimple(phdArticle.author[0])
    result['phd year'] = int(phdArticle.year)
    result['phd aff'] = phdArticle.aff[0]
    result['phd bibcode'] = phdArticle.bibcode
    result['phd aff'] = phdArticle.aff[0]
    # Check that phd is from the US; bail out early otherwise.
    if checkUSA:
        if not checkUSAff(phdArticle.aff[0]):
            result['nonUS'] = True
            if verbose:
                print '%s does not test as a USA affiliation' % phdArticle.aff[0].encode('utf-8')
            return result
    # Query for all the papers by this author name in the window.
    paperList = authorsPapers(phdArticle.author[0], years=years)
    if verbose:
        print 'Found %i papers' % len(paperList)
    result['numRecords'] = len(paperList)
    # Check that there's an astro paper in here.
    if not inAstroJ(paperList):
        result['noAstroJournal'] = True
        if verbose:
            print 'Did not find an astro paper in results'
        return result
    # Find all the papers linked to the PhD in question.
    linkedPapers, linkedGraph = authorGroup(paperList, phdArticle,
                                            authSimple(phdArticle.author[0]))
    citations = [paper.citation_count for paper in linkedPapers]
    result['hindex'] = hindex(citations)
    if returnLinkedPapers:
        return linkedPapers
    result['numLinked'] = len(linkedPapers)
    if plot:
        years = [float(paper.year) for paper in linkedPapers]
        nx.draw_spring(linkedGraph)  # , node_color=np.array(years))
    if verbose:
        print 'Found %i papers linked to phd' % len(linkedPapers)
    # Make sure there's still a publication in an astro journal.
    if not inAstroJ(linkedPapers):
        result['noAstroJournal'] = True
        if verbose:
            print 'Did not find an astro paper in linked results'
        return result
    linkedYears = []
    linked1stA = []
    linked1stAYears = []
    latestPaper = linkedPapers[0]
    latest1stApaper = phdArticle
    latestAff = phdArticle.aff[0]
    # Date of the PhD itself; clamp month 0 (unknown) to January.
    affDate = phdArticle.pubdate.split('-')
    month = int(affDate[1])
    if month < 1:
        month = 1
    affDate = datetime.date(year=int(phdArticle.year), month=month, day=1)
    # Scan the linked papers for latest paper, latest 1st-author paper,
    # and the author's most recent affiliation after the PhD date.
    # NOTE(review): nesting reconstructed from mangled source — confirm
    # against the original file.
    for paper in linkedPapers:
        if hasattr(paper, 'year'):
            linkedYears.append(int(paper.year))
            if int(paper.year) > int(latestPaper.year):
                latestPaper = paper
        if authSimple(paper.author[0]) == authSimple(phdArticle.author[0]):
            linked1stA.append(paper)
            if hasattr(paper, 'year'):
                linked1stAYears.append(int(paper.year))
                if int(paper.year) > int(latest1stApaper.year):
                    latest1stApaper = paper
        paperDate = int(paper.pubdate.split('-')[1])
        if paperDate < 1:
            paperDate = 1
        if hasattr(paper, 'year'):
            paperDate = datetime.date(int(paper.year), paperDate, 1)
            if paperDate >= affDate:
                # Take this author's affiliation when it looks non-empty.
                for auth, aff in zip(paper.author, paper.aff):
                    if authSimple(auth) == authSimple(phdArticle.author[0]):
                        if aff is not None:
                            if len(aff) > 3:
                                latestAff = aff
                                affYear = int(paper.year)
    result['largest publication gap'] = np.max(np.diff(np.sort(linkedYears)))
    result['latest year'] = int(latestPaper.year)
    result['latest 1st year'] = int(latest1stApaper.year)
    result['latest aff'] = latestAff
    allYears = [int(paper.year) for paper in paperList
                if hasattr(paper, 'year')]
    result['latest year unlinked'] = np.max(allYears)
    citations = [paper.citation_count for paper in linked1stA]
    result['1st auth hindex'] = hindex(citations)
    # Test to see if this is the only person with this name and a phd
    # in astro.
    ack = list(ads.query('bibstem:"*PhDT", author:"%s"'
                         % authSimple(phdArticle.author[0]),
                         database='astronomy'))
    titles = []
    if len(ack) > 1:
        # Make sure the titles are different.
        for paper in ack:
            if hasattr(paper, 'title'):
                if paper.title is not None:
                    titles.append(paper.title[0].lower())
        titles = set(titles)
        if len(titles) > 1:
            if verbose:
                print authSimple(phdArticle.author[0]) + ' returns multiple PhDT.'
            result['uniqueName'] = False
    else:
        result['uniqueName'] = True
    if returnNetwork:
        return result, linkedGraph
    return result
def update(self, month=None,
           exclude=('keplerian', 'johannes', 'k<sub>2</sub>',
                    "kepler equation", "kepler's equation", "xmm-newton",
                    "kepler's law", "kepler's third law", "kepler problem",
                    "kepler crater", "kepler's supernova", "kepler's snr")):
    """Query ADS for new publications.

    Parameters
    ----------
    month : str
        Of the form "YYYY-MM".  Defaults to the current month.
    exclude : sequence of str
        Ignore articles if they contain any of the strings given in
        this list. (Case-insensitive.)  The default is a tuple rather
        than a list to avoid the mutable-default-argument pitfall; it
        is only iterated, so callers are unaffected.
    """
    if ads is None:
        log.error("This action requires the ADS key to be setup.")
        return
    if month is None:
        month = datetime.datetime.now().strftime("%Y-%m")
    # First show all the papers with the Kepler funding message in the ack.
    log.info("Querying ADS for acknowledgements (month={}).".format(month))
    qry = ads.query("""ack:"Kepler mission" OR ack:"K2 mission" OR ack:"Kepler team" OR ack:"K2 team" -ack:"partial support from" """,
                    dates=month, rows='all', database='astronomy')
    articles = list(qry)
    for idx, article in enumerate(articles):
        statusmsg = ("Showing article {} out of {} that mentions Kepler "
                     "in the acknowledgements.\n\n".format(
                         idx + 1, len(articles)))
        self.add_interactively(article, statusmsg=statusmsg)
    # Then search for keywords in the title and abstracts.
    log.info(
        "Querying ADS for titles and abstracts (month={}).".format(month))
    qry = ads.query("""abs:"Kepler" OR abs:"K2" OR abs:"KIC" OR abs:"EPIC" OR abs:"KOI" OR title:"Kepler" OR title:"K2" """,
                    dates=month, rows='all',
                    database='astronomy')  # ,property='refereed')
    articles = list(qry)
    for idx, article in enumerate(articles):
        # Ignore articles without abstract.
        if not hasattr(article, 'abstract'):
            continue
        abstract_lower = article.abstract.lower()
        ignore = False
        # Ignore articles containing any of the excluded terms.
        for term in exclude:
            if term.lower() in abstract_lower:
                ignore = True
        # Ignore articles already in the database.
        if article in self:
            ignore = True
        # Ignore all the unrefereed non-arxiv stuff.
        try:
            if "NOT REFEREED" in article.property and article.pub != "ArXiv e-prints":
                ignore = True
        except AttributeError:
            pass  # no .pub attribute
        # Ignore proposals and cospar abstracts.
        if ".prop." in article.bibcode or "cosp.." in article.bibcode:
            ignore = True
        if not ignore:
            # Propose the article to the user.
            statusmsg = '(Reviewing article {} out of {}.)\n\n'.format(
                idx + 1, len(articles))
            self.add_interactively(article, statusmsg=statusmsg)
    log.info('Finished reviewing all articles for {}.'.format(month))
page_numbers = int(comments[page_number_ind]) if figure_number_ind!=-1: if comments[figure_number_ind].isdigit(): figure_numbers = int(comments[figure_number_ind]) return [page_numbers,figure_numbers] # In[273]: df = pd.DataFrame() for i,key in enumerate(arXiv_dict.keys()): pages,figures = search_comments(arXiv_dict[key]['comment']) try: ads_paper = list(ads.query(title=str(arXiv_dict[key]['title']))) if len(ads_paper)==1: new_row = pd.DataFrame({'Title':[ads_paper[0].title], 'Author':[ads_paper[0].author], 'Citation_count':[ads_paper[0].citation_count], 'Year':[ads_paper[0].year], 'Pub':[ads_paper[0].pub], 'Pages':[pages],'Figures':[figures]}) else: print i,'key',key,'has multiple query returns' df = df.append(new_row) except: continue # except error: # continue # In[282]: df.plot(x='Pages',y='Citation_count',kind='scatter')
def recommend_references(bibcode, num=3, ratio=0.5):
    """
    Return bibcodes of recommended articles that might have been worth
    citing in the bibcode provided.

    :param bibcode:
        The bibcode of the article that you would like to have recommended
        citations for.
    :type bibcode: str
    :param num:
        Number of article bibcodes to recommend.
    :type num: int
    :param ratio:
        Self-similarity ratio in names in order to identify two author names
        as being the same person.
    :type ratio: float
    :returns:
        Article bibcodes that probably should have been cited.
    :raises ValueError:
        If num is not positive, ratio is outside (0, 1], or the original
        article cannot be found.
    """
    num = int(num)
    if 1 > num:
        raise ValueError("number of requested articles must be a positive integer")
    if not (1 >= ratio > 0):
        raise ValueError("self-similarity ratio must be between (0, 1]")

    # First get the article.
    try:
        original_article = list(ads.query(bibcode))[0]
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed.
        raise ValueError("could not find original article with bibcode {0}".format(bibcode))

    # Find all the citations that the paper made.
    articles_i_cited = list(original_article.references)
    # A set makes the "did I already cite this?" test O(1) per lookup.
    bibcodes_of_articles_i_cited = set(each.bibcode for each in articles_i_cited)

    # Who else cited the papers that I cited?
    which_articles_cite_what_we_cite = []
    for article in articles_i_cited:
        which_articles_cite_what_we_cite.extend(
            [each.bibcode for each in article.citations])

    # Keep only papers the original article did not already cite.
    most_popular_articles_cited = [
        article for article in which_articles_cite_what_we_cite
        if article not in bibcodes_of_articles_i_cited]

    # Count occurrences and sort by descending popularity.
    most_popular_articles_cited = collections.Counter(
        most_popular_articles_cited)
    most_popular_articles_cited = sorted(
        most_popular_articles_cited.items(),
        key=operator.itemgetter(1))[::-1]

    # Let's not include papers written by the current author.
    recommended_bibcodes = []
    original_author = original_article.author[0].lower().replace(".", "")
    for bibcode, popularity in most_popular_articles_cited:
        article = list(ads.query(bibcode))[0]
        this_author = article.author[0].lower().replace(".", "")
        if difflib.SequenceMatcher(a=this_author,
                                   b=original_author).ratio() > ratio:
            # Same author as the original one, so let's ignore it.
            continue
        recommended_bibcodes.append(bibcode)
        if len(recommended_bibcodes) == num:
            break
    return recommended_bibcodes
continue else: recommended_bibcodes.append(bibcode) if len(recommended_bibcodes) == num: break return recommended_bibcodes if __name__ == "__main__": # citation-buddy bibtex_code # bibtex_code = 2014MNRAS.443..828C # STEPS # (1) Find all the citations that the paper made # (2) Go to all of those papers, and find out who else cited those papers # (3) Go to all of those those papers, and find out which papers they cited that # you didn't. Count the frequency and find the highest ones if len(sys.argv) > 1: bibcodes = recommend_references(sys.argv[1]) for bibcode in bibcodes: article = list(ads.query(bibcode))[0] et_al = ["", " et al"][len(article.author) > 1] print("I recommend this paper: {0} by {1}{2} at {3}".format( article.title[0], article.author[0], et_al, article.url))
def update(self, month=None,
           exclude=['keplerian', 'johannes', 'k<sub>2</sub>',
                    "kepler equation", "kepler's equation", "xmm-newton",
                    "kepler's law", "kepler's third law", "kepler problem",
                    "kepler crater", "kepler's supernova", "kepler's snr"]):
    """Query ADS for new publications.

    Parameters
    ----------
    month : str
        Of the form "YYYY-MM".  Defaults to the current month.
    exclude : list of str
        Ignore articles if they contain any of the strings given in
        this list. (Case-insensitive.)
    """
    # NOTE(review): mutable default argument; the list is only iterated
    # here, never mutated, so it is safe in practice.
    if ads is None:
        log.error("This action requires the ADS key to be setup.")
        return
    if month is None:
        month = datetime.datetime.now().strftime("%Y-%m")
    # First show all the papers with the Kepler funding message in the ack
    log.info("Querying ADS for acknowledgements (month={}).".format(month))
    qry = ads.query("""ack:"Kepler mission" OR ack:"K2 mission" OR ack:"Kepler team" OR ack:"K2 team" -ack:"partial support from" """,
                    dates=month, rows='all', database='astronomy')
    articles = list(qry)
    for idx, article in enumerate(articles):
        statusmsg = ("Showing article {} out of {} that mentions Kepler "
                     "in the acknowledgements.\n\n".format(
                         idx+1, len(articles)))
        self.add_interactively(article, statusmsg=statusmsg)
    # Then search for keywords in the title and abstracts
    log.info("Querying ADS for titles and abstracts (month={}).".format(month))
    qry = ads.query("""abs:"Kepler" OR abs:"K2" OR abs:"KIC" OR abs:"EPIC" OR abs:"KOI" OR title:"Kepler" OR title:"K2" """,
                    dates=month, rows='all',
                    database='astronomy')  # ,property='refereed')
    articles = list(qry)
    for idx, article in enumerate(articles):
        # Ignore articles without abstract
        if not hasattr(article, 'abstract'):
            continue
        abstract_lower = article.abstract.lower()
        ignore = False
        # Ignore articles containing any of the excluded terms
        for term in exclude:
            if term.lower() in abstract_lower:
                ignore = True
        # Ignore articles already in the database
        if article in self:
            ignore = True
        # Ignore all the unrefereed non-arxiv stuff
        try:
            if "NOT REFEREED" in article.property and article.pub != "ArXiv e-prints":
                ignore = True
        except AttributeError:
            pass  # no .pub attribute
        # Ignore proposals and cospar abstracts
        if ".prop." in article.bibcode or "cosp.." in article.bibcode:
            ignore = True
        if not ignore:
            # Propose to the user
            statusmsg = '(Reviewing article {} out of {}.)\n\n'.format(
                idx+1, len(articles))
            self.add_interactively(article, statusmsg=statusmsg)
    log.info('Finished reviewing all articles for {}.'.format(month))
def phdArticle2row(phdArticle, yearsPrePhD=7, verbose=False, checkUSA=True,
                   justKeys=False, plot=False, returnNetwork=False,
                   returnLinkedPapers=False):
    """Summarize an ADS PhD-thesis article as a dict of career metrics.

    Take an ads article object and return a dict of information with keys:
    [name, phd year, phd bibcode, phd aff, latest year, latest aff,
    latest 1st year, latest 1st aff, largest publication gap, numRecords,
    numLinked, uniqueName, latest year unlinked, noAstroJournal, nonUS,
    hindex, 1st auth hindex]

    Note: Currently not making any cut based on peer-review. Thus, latest
    1st author paper could be a AAS poster, SPIE paper, arXiv posting, etc.

    justKeys returns only the key list; returnLinkedPapers returns the
    linked-paper list instead of the row; returnNetwork also returns the
    linkage graph.

    XXX-consider pulling some metrics from ADS and putting them in the row.
    """
    if verbose:
        print 'searching for papers linked to:', phdArticle
    result = {}
    resultKeys = [
        'name', 'phd year', 'phd bibcode', 'phd aff', 'latest year',
        'latest aff', 'latest 1st year', 'latest 1st aff',
        'largest publication gap', 'numRecords', 'numLinked', 'uniqueName',
        'latest year unlinked', 'noAstroJournal', 'nonUS', 'hindex',
        '1st auth hindex'
    ]
    if justKeys:
        return resultKeys
    # Pre-fill every key so early returns still carry the full schema.
    for key in resultKeys:
        result[key] = None
    # Search window: a few years before the PhD through today.
    maxYear = datetime.date.today().year
    minYear = int(phdArticle.year) - yearsPrePhD
    years = '%i-%i' % (minYear, maxYear)
    result['name'] = authSimple(phdArticle.author[0])
    result['phd year'] = int(phdArticle.year)
    result['phd aff'] = phdArticle.aff[0]
    result['phd bibcode'] = phdArticle.bibcode
    result['phd aff'] = phdArticle.aff[0]
    # Check that phd is from the US; bail out early otherwise.
    if checkUSA:
        if not checkUSAff(phdArticle.aff[0]):
            result['nonUS'] = True
            if verbose:
                print '%s does not test as a USA affiliation' % phdArticle.aff[
                    0].encode('utf-8')
            return result
    # Query for all the papers by this author name in the window.
    paperList = authorsPapers(phdArticle.author[0], years=years)
    if verbose:
        print 'Found %i papers' % len(paperList)
    result['numRecords'] = len(paperList)
    # Check that there's an astro paper in here.
    if not inAstroJ(paperList):
        result['noAstroJournal'] = True
        if verbose:
            print 'Did not find an astro paper in results'
        return result
    # Find all the papers linked to the PhD in question.
    linkedPapers, linkedGraph = authorGroup(paperList, phdArticle,
                                            authSimple(phdArticle.author[0]))
    citations = [paper.citation_count for paper in linkedPapers]
    result['hindex'] = hindex(citations)
    if returnLinkedPapers:
        return linkedPapers
    result['numLinked'] = len(linkedPapers)
    if plot:
        years = [float(paper.year) for paper in linkedPapers]
        nx.draw_spring(linkedGraph)  # , node_color=np.array(years)
    if verbose:
        print 'Found %i papers linked to phd' % len(linkedPapers)
    # Make sure there's still a publication in an astro journal.
    if not inAstroJ(linkedPapers):
        result['noAstroJournal'] = True
        if verbose:
            print 'Did not find an astro paper in linked results'
        return result
    linkedYears = []
    linked1stA = []
    linked1stAYears = []
    latestPaper = linkedPapers[0]
    latest1stApaper = phdArticle
    latestAff = phdArticle.aff[0]
    # Date of the PhD itself; clamp month 0 (unknown) to January.
    affDate = phdArticle.pubdate.split('-')
    month = int(affDate[1])
    if month < 1:
        month = 1
    affDate = datetime.date(year=int(phdArticle.year), month=month, day=1)
    # Scan the linked papers for latest paper, latest 1st-author paper,
    # and the author's most recent affiliation after the PhD date.
    # NOTE(review): nesting reconstructed from mangled source — confirm
    # against the original file.
    for paper in linkedPapers:
        if hasattr(paper, 'year'):
            linkedYears.append(int(paper.year))
            if int(paper.year) > int(latestPaper.year):
                latestPaper = paper
        if authSimple(paper.author[0]) == authSimple(phdArticle.author[0]):
            linked1stA.append(paper)
            if hasattr(paper, 'year'):
                linked1stAYears.append(int(paper.year))
                if int(paper.year) > int(latest1stApaper.year):
                    latest1stApaper = paper
        paperDate = int(paper.pubdate.split('-')[1])
        if paperDate < 1:
            paperDate = 1
        if hasattr(paper, 'year'):
            paperDate = datetime.date(int(paper.year), paperDate, 1)
            if paperDate >= affDate:
                # Take this author's affiliation when it looks non-empty.
                for auth, aff in zip(paper.author, paper.aff):
                    if authSimple(auth) == authSimple(phdArticle.author[0]):
                        if aff is not None:
                            if len(aff) > 3:
                                latestAff = aff
                                affYear = int(paper.year)
    result['largest publication gap'] = np.max(np.diff(np.sort(linkedYears)))
    result['latest year'] = int(latestPaper.year)
    result['latest 1st year'] = int(latest1stApaper.year)
    result['latest aff'] = latestAff
    allYears = [
        int(paper.year) for paper in paperList if hasattr(paper, 'year')
    ]
    result['latest year unlinked'] = np.max(allYears)
    citations = [paper.citation_count for paper in linked1stA]
    result['1st auth hindex'] = hindex(citations)
    # Test to see if this is the only person with this name and a phd
    # in astro.
    ack = list(
        ads.query('bibstem:"*PhDT", author:"%s"'
                  % authSimple(phdArticle.author[0]),
                  database='astronomy'))
    titles = []
    if len(ack) > 1:
        # Make sure the titles are different.
        for paper in ack:
            if hasattr(paper, 'title'):
                if paper.title is not None:
                    titles.append(paper.title[0].lower())
        titles = set(titles)
        if len(titles) > 1:
            if verbose:
                print authSimple(phdArticle.author[0]) + ' returns multiple PhDT.'
            result['uniqueName'] = False
    else:
        result['uniqueName'] = True
    if returnNetwork:
        return result, linkedGraph
    return result
def testRow():
    """Smoke test: build a summary row for one known PhD-thesis bibcode."""
    myphd = list(
        ads.query('bibcode:2007PhDT.........3Y',
                  database='astronomy',
                  rows='all'))[0]
    # Result is discarded; this only checks the pipeline runs end to end.
    phdArticle2row(myphd)