def get_papers_from_paper_citations(paper_title: str):
    """
    Gets the papers that cited the paper given as a parameter.
    Registers the found papers in the articles folder and registers
    the citation relationship in the citations folder.
    """
    # Search by title as a keyword and take the first result.
    target_paper_generator = scholarly.search_pubs(paper_title)
    print("=======> getting the target paper")
    target_paper = next(target_paper_generator)
    print('##########################')

    publications_generator = scholarly.citedby(target_paper)
    try:
        citations_count = 0
        while citations_count <= NB_MAX_CITATIONS_PER_PAPERS:
            publication = next(publications_generator)
            # filled_publication = scholarly.fill(publication)
            mydict = publication_to_dict(publication)
            write_publication(mydict, PUBLICATIONS_CSV_FILE_OUTPUT)
            register_citation(target_paper['citedby_url'], mydict['citedby_url'])
            citations_count += 1
    except StopIteration:
        pass  # fewer citations available than NB_MAX_CITATIONS_PER_PAPERS
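# --- Hypothetical support code (not from the original snippet) ---
# The function above assumes helpers and constants defined elsewhere:
# NB_MAX_CITATIONS_PER_PAPERS, PUBLICATIONS_CSV_FILE_OUTPUT, publication_to_dict,
# write_publication, and register_citation. A minimal sketch, inferred only
# from how they are called, so the snippet can run end to end:
import csv

NB_MAX_CITATIONS_PER_PAPERS = 50  # assumed cap on citations fetched per paper
PUBLICATIONS_CSV_FILE_OUTPUT = 'articles/publications.csv'   # assumed output path
CITATIONS_CSV_FILE_OUTPUT = 'citations/citations.csv'        # assumed output path

def publication_to_dict(publication):
    # Keep only the fields the caller reads; the real helper may keep more.
    return {
        'title': publication['bib'].get('title', ''),
        'citedby_url': publication.get('citedby_url', ''),
    }

def write_publication(pub_dict, csv_path):
    # Append one publication per CSV row.
    with open(csv_path, 'a', newline='') as f:
        csv.DictWriter(f, fieldnames=list(pub_dict.keys())).writerow(pub_dict)

def register_citation(cited_url, citing_url):
    # Record the "citing_url cites cited_url" relationship.
    with open(CITATIONS_CSV_FILE_OUTPUT, 'a', newline='') as f:
        csv.writer(f).writerow([cited_url, citing_url])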
def grab_related(cp, max_no=10):
    # Grab up to max_no of the papers that cite the initial paper cp
    # (the first ones returned by Google Scholar), with their information.
    cites = []
    count = 0
    for citation in tqdm(scholarly.citedby(cp), total=max_no):
        if count >= max_no:
            break
        cites.append(citation)
        count += 1
    return cites
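# Usage sketch for grab_related (not from the original snippet): the imports
# and the example query are assumptions for illustration only.
from scholarly import scholarly
from tqdm import tqdm

seed = next(scholarly.search_pubs('Machine-learned epidemiology'))  # example query
citing = grab_related(seed, max_no=5)
print([c['bib']['title'] for c in citing])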
def test_search_pubs_citedby(self):
    """
    Test that when we retrieve the list of publications that cite a
    publication, the number of citing publications is the same as the
    number of papers that are returned. We use a publication with a
    small number of citations, so that the test runs quickly.
    The 'Machine-learned epidemiology' paper had 11 citations as of
    June 1, 2020.
    """
    query = 'Machine-learned epidemiology: real-time detection of foodborne illness at scale'
    pubs = [p for p in scholarly.search_pubs(query)]
    self.assertGreaterEqual(len(pubs), 1)
    filled = scholarly.fill(pubs[0])
    cites = [c for c in scholarly.citedby(filled)]
    self.assertEqual(len(cites), filled['num_citations'])
def download_citations():
    # Retrieve the author's data, fill-in, and print
    # search_query = scholarly.search_author(NAME)
    search_query = scholarly.search_author_id(AUTHOR_ID)
    # author = scholarly.fill(next(search_query))
    author = scholarly.fill(search_query)
    print(author)

    # Print the titles of the author's publications
    print([pub['bib']['title'] for pub in author['publications']])

    # Take a closer look at the first publication
    # pub = scholarly.fill(author['publications'][1])
    # print(pub)

    independent_citations = []
    for pub in author['publications'][:]:
        res_dict = {}
        time.sleep(random.randint(WAIT, WAIT * 2))
        pub = scholarly.fill(pub)
        res_dict["title"] = pub['bib']["title"]
        res_dict["year"] = pub['bib']["pub_year"]
        print(pub['bib']["title"])
        res_dict["author"] = [name.strip() for name in pub['bib']["author"].split("and")]

        time.sleep(random.randint(WAIT, WAIT * 2))
        cited_this = scholarly.citedby(pub)
        if cited_this:
            res_dict['cited_this'] = [{"author": citation['bib']["author"],
                                       "title": citation['bib']["title"]}
                                      for citation in cited_this]
            indep_citations = print_citations(res_dict)
            res_dict['independent_citations'] = indep_citations
            independent_citations.append({"title": res_dict["title"],
                                          "author": res_dict["author"],
                                          'independent_citations': indep_citations})
            save_json(res_dict['title'], res_dict)
        else:
            break

    save_json("independent_citations.json", independent_citations)
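# --- Hypothetical support code (not from the original snippet) ---
# download_citations relies on module-level names that are not shown:
# AUTHOR_ID, WAIT, print_citations, and save_json, plus the json/random/time
# imports. A minimal sketch, inferred only from how they are used:
import json
import random
import time

AUTHOR_ID = 'REPLACE_WITH_SCHOLAR_AUTHOR_ID'  # placeholder Google Scholar author ID
WAIT = 10  # assumed base delay (seconds) between requests

def print_citations(res_dict):
    # Print each citing paper and return those that share no author with the
    # cited paper ("independent" citations).
    independent = []
    for citation in res_dict.get('cited_this', []):
        print(citation['title'])
        authors = citation['author']
        if isinstance(authors, str):
            authors = [name.strip() for name in authors.split('and')]
        if not set(authors) & set(res_dict['author']):
            independent.append(citation)
    return independent

def save_json(filename, data):
    # Dump a result dictionary to disk as JSON.
    if not filename.endswith('.json'):
        filename += '.json'
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)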
def search_cited_papers(pub):
    # Only consider the ten most prominent citing papers.
    return [_ for _, __ in zip(scholarly.citedby(pub), range(10))]
from scholarly import scholarly, ProxyGenerator
import json
from dotenv import load_dotenv
from pathlib import Path
import os

env_path = Path('../') / '.env'
load_dotenv(dotenv_path=env_path)
SCRAPER = os.getenv("SCRAPER")

proxy_generator = ProxyGenerator()
proxy_generator.ScraperAPI(SCRAPER)
scholarly.set_timeout(60)
scholarly.use_proxy(proxy_generator)

search_query = scholarly.search_author('Maël Montévil')
author = scholarly.fill(next(search_query))
pubs = [
    scholarly.fill(pub)
    for pub in author['publications']
    if pub['num_citations'] > 0
]
pubs2 = [[pub, list(scholarly.citedby(pub))]
         for pub in pubs if 'citedby_url' in pub]
print(json.dumps(pubs2, indent=2, default=lambda o: '<not serializable>'))
all_pubs = []
all_cites = []
for paper_title in flame_pubs:
    results = scholarly.search_pubs(paper_title)
    pubs = [p for p in results]
    assert len(pubs) > 0  # Paper not found?
    print(f"Found '{paper_title}'.")

    # fill by querying site
    pub = scholarly.fill(pubs[0])
    all_pubs.append(pub)
    print(f"Details returned for '{paper_title}'.")

    # get all publications that cite the current paper
    cites = [
        dict(c, **{'flame_paper': paper_title})
        for c in scholarly.citedby(pub)
    ]
    all_cites.extend(cites)
    print(f"Found {len(cites)} citations for '{paper_title}'\n")

    # dump to file
    # f_pubs.write(yaml.dump([pubs]))
    # f_cites.write(yaml.dump([cites]))

# remove duplicates from citations list
unique_cites = []
for p in all_cites:
    if p not in unique_cites:
        unique_cites.append(p)

# remove cross refs to pubs
# anvil.server.wait_forever()

busca_publicaciones(
    ['MADAIN PEREZ PATRICIO', 'Abiel Aguilar-González', 'Steven A Cholewiak'])

# Retrieve the author's data, fill-in, and print
# print(author)

# Print the titles of the author's publications
# print([pub.bib['title'] for pub in author.publications])

# Take a closer look at the first publication
# pub = author.publications[0].fill()
# print(pub)

# Which papers cited that publication?
# print([citation.bib['title'] for citation in pub.citedby])

# Retrieve the author's data, fill-in, and print
search_query = scholarly.search_author('Steven A Cholewiak')
author = scholarly.fill(next(search_query))
print(author)

# Print the titles of the author's publications
print([pub['bib']['title'] for pub in author['publications']])

# Take a closer look at the first publication
pub = scholarly.fill(author['publications'][0])
print(pub)

# Which papers cited that publication?
print([citation['bib']['title'] for citation in scholarly.citedby(pub)])
from scholarly import scholarly, ProxyGenerator
import json
from dotenv import load_dotenv
from pathlib import Path

proxy_generator = ProxyGenerator()
proxy_generator.Tor_Internal(tor_cmd='tor')
scholarly.use_proxy(proxy_generator)

search_query = scholarly.search_author('Maël Montévil')
author = scholarly.fill(next(search_query))
pubs = [
    scholarly.fill(pub)
    for pub in author['publications']
    if pub['num_citations'] > 0
]
# Collect each cited publication together with the list of papers that cite it.
pubs2 = []
for pub in pubs:
    if 'citedby_url' in pub:
        pubs2.append([pub, list(scholarly.citedby(pub))])
print(json.dumps(pubs2, indent=2, default=lambda o: '<not serializable>'))
print("Searching on Google scholar") author = scholarly.search_author_id('_7AMrKgAAAAJ') # _7AMrKgAAAAJ is Quasar quasar_stats = scholarly.fill( author, sections=['basics', 'indices', 'counts', 'publications']) scholarly.pprint(quasar_stats) # What papers cited our publications? cit = [] for pub in quasar_stats['publications']: print(pub) cit.append( [citation for citation in scholarly.citedby(pub)] ) # limit the number of test runs because this will get blocked bu Google quickly print( f'There are currently {len(quasar_stats["publications"])} Quasar papers.') for pub in quasar_stats['publications']: print(' ', pub['bib']['title']) fcit = [item for sublist in cit for item in sublist] # this is a flat list now print(f'\nWe have {len(fcit)} citations so far for our Quasar papers.') # I wonder if this can be done in fewer lines. :D authors = [c["author_id"] for c in fcit] citing_authors = [item for sublist in authors for item in sublist] citing_authors = set([c for c in citing_authors if c]) # citing authors with Google Scholar profile
# Retrieve the author's data, fill-in, and print
search_query = scholarly.search_author('Steven A Cholewiak')
author = next(search_query)
author_filled = scholarly.fill(author)
scholarly.pprint(author)
print(author)

# Take a closer look at the first publication
publication = author["publications"][0]
pub = scholarly.fill(publication)

# citations = pub.citedby()
citations_iterator = scholarly.citedby(pub)
# Write up to the first 100 citing publications to a file; open the file once
# so each citation is appended rather than overwriting the previous one, and
# stop early if the iterator runs out.
with open('citedby.txt', 'w+') as file:
    for i, citation in enumerate(citations_iterator):
        if i >= 100:
            break
        print(citation, file=file)

# print(pub)
# Which papers cited that publication?
# print([citation.bib['title'] for citation in pub.citedby])

# Free Proxy
# pg = ProxyGenerator()
# pg.FreeProxies()
# Assumes: from scholarly import scholarly, ProxyGenerator; a ScraperAPIClient
# from the ScraperAPI SDK; and SCRAPER holding the ScraperAPI key.
class ScraperAPI(ProxyGenerator):
    def __init__(self, api_key):
        assert api_key is not None
        self._api_key = api_key
        self._client = ScraperAPIClient(api_key)
        super(ScraperAPI, self).__init__()
        self._TIMEOUT = 120
        self._session = self._client
        self._session.proxies = {}

    def _new_session(self):
        self.got_403 = False
        return self._session

    def _close_session(self):
        pass  # no need to close the ScraperAPI client


pg = ScraperAPI(SCRAPER)
scholarly.use_proxy(pg)
scholarly.set_timeout(120)

search_query = scholarly.search_author('Maël Montévil')
author = scholarly.fill(next(search_query))
pub = scholarly.fill(author['publications'][16])
print(pub)
print(list(scholarly.citedby(pub)))
from scholarly import scholarly, ProxyGenerator
import json
from dotenv import load_dotenv
from pathlib import Path

proxy_generator = ProxyGenerator()
proxy_generator.Tor_Internal(tor_cmd='tor')
scholarly.use_proxy(proxy_generator)

search_query = scholarly.search_author('Maël Montévil')
author = scholarly.fill(next(search_query))
pubs = [scholarly.fill(pub)
        for pub in author['publications']
        if pub['num_citations'] > 0]
pubs2 = [[pub, list(scholarly.citedby(pub))]
         for pub in pubs if 'citedby_url' in pub]
print(json.dumps(pubs2, indent=2, default=lambda o: '<not serializable>'))