def main(author_name):
    """ Print all publications as JSON to STDOUT """
    data = {}
    data['publications'] = []
    author = scholarly.fill(next(scholarly.search_author(author_name)))
    for pub in author['publications']:
        pub_details = scholarly.fill(pub)['bib']
        data['publications'].append({
            'authors': reformat_coauthors(pub_details['author'].split(' and ')),
            'year': pub_details.get('pub_year', ''),
            'title': pub_details.get('title', ''),
            'journal': pub_details.get('journal', ''),
            'volume': pub_details.get('volume', ''),
            'issue': pub_details.get('issue', ''),
            'pages': pub_details.get('pages', ''),
            'citations': pub.get('num_citations', 0),
            'pub_url': pub.get('pub_url', ''),
            # 'eprint_url': pub.get('pub_url', '')  # seems to be same as pub_url
        })
    output_json(data)
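The helpers reformat_coauthors and output_json are referenced above but not shown. A minimal sketch of what they might look like, assuming the coauthor names only need whitespace trimmed and the JSON is printed to STDOUT:

import json
import sys

def reformat_coauthors(coauthors):
    # hypothetical helper: tidy each name from the ' and '-separated author string
    return [name.strip() for name in coauthors]

def output_json(data):
    # hypothetical helper: print the collected data as JSON to STDOUT
    json.dump(data, sys.stdout, indent=2)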
def get_all_coauthors(author_name, min_year, max_year, max_coauthors, include_no_year):
    """ Get a set of all coauthors """
    author = scholarly.fill(next(scholarly.search_author(author_name)))
    all_coauthors = set()
    for pub in author['publications']:
        # Evaluate if publication year is indicated (if not, ignore depending
        # on presence of --include_no_year flag)
        if 'pub_year' in pub['bib']:
            pub_year = int(pub['bib']['pub_year'])
        elif include_no_year:
            pub_year = max_year
        else:
            pub_year = min_year - 1
        # Evaluate whether publication falls within indicated timerange
        if min_year <= pub_year <= max_year:
            coauthors = scholarly.fill(pub)['bib']['author'].split(' and ')
            # Evaluate if number of coauthors meets optional threshold
            if len(coauthors) <= max_coauthors:
                for coauthor in coauthors:
                    all_coauthors.add(reformat_name(coauthor))
    return all_coauthors
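reformat_name is likewise not defined in this snippet; a one-line sketch under the assumption that it only normalizes whitespace in a single name:

def reformat_name(coauthor):
    # hypothetical helper: collapse extra whitespace in a single coauthor name
    return ' '.join(coauthor.split())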
def main(args):
    gs_author = {}
    gs_pubs = []
    if not args.force:
        gs_author = load('google_scholar_author.pkl') or {}
        gs_pubs = load('google_scholar_publications.pkl') or []

    gs_author = scholarly.search_author_id('SZR6mXsAAAAJ')
    gs_author = scholarly.fill(gs_author)
    save(gs_author, 'google_scholar_author.pkl')

    current_pubs_ids = {p['author_pub_id'] for p in gs_pubs}
    author_pubs_ids = {p['author_pub_id'] for p in gs_author['publications']}
    new_pubs_ids = author_pubs_ids - current_pubs_ids
    new_pubs = [
        p for p in gs_author['publications']
        if p['author_pub_id'] in new_pubs_ids
    ]

    # TODO update based on info available on gs_author
    for p in tqdm(new_pubs):
        p = scholarly.fill(p)
        gs_pubs.append(p)
        # checkpoint after each filled publication
        save(gs_pubs, 'google_scholar_publications.pkl')
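load and save are external helpers here. Given the .pkl file names, a plausible pickle-based sketch (the helpers themselves are assumptions, not part of the original source):

import os
import pickle

def load(path):
    # hypothetical helper: return the pickled object, or None if the cache file is missing
    if not os.path.exists(path):
        return None
    with open(path, 'rb') as f:
        return pickle.load(f)

def save(obj, path):
    # hypothetical helper: overwrite the cache file with the pickled object
    with open(path, 'wb') as f:
        pickle.dump(obj, f)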
def find_bbe_coauthors(name, institution="Caltech", start=2015, verbose=True):
    search_query = scholarly.search_author(name + ", " + institution)
    author = scholarly.fill(next(search_query))
    coauthors = set()
    for i, pub in enumerate(author['publications']):
        # Make sure this is within the date range that we care about
        try:
            if int(pub['bib']['pub_year']) < start:
                # Skip this entry
                continue
            elif verbose:
                print(i, end=" ", flush=True)
        except KeyError:
            continue

        # Get the full data; split on ' and ' (with spaces) so names that
        # merely contain "and" are not chopped up
        pub = scholarly.fill(pub)
        for coauthor in pub['bib']['author'].split(" and "):
            coauthors.add(coauthor.strip())

    print("")
    return coauthors
def test_search_author_single_author(self):
    query = 'Steven A. Cholewiak'
    authors = [a for a in scholarly.search_author(query)]
    self.assertGreaterEqual(len(authors), 1)
    author = scholarly.fill(authors[0])
    self.assertEqual(author['name'], u'Steven A. Cholewiak, PhD')
    self.assertEqual(author['scholar_id'], u'4bahYMkAAAAJ')
    pub = scholarly.fill(author['publications'][2])
    self.assertEqual(pub['author_pub_id'], u'4bahYMkAAAAJ:LI9QrySNdTsC')
def scholarly_request(search_string: str) -> Dict:
    '''Takes a search keyword string and requests information about the
    corresponding article via scholarly'''
    # Get all available information
    search_query = scholarly.search_pubs(search_string)
    article_info = next(search_query)
    scholarly.fill(article_info)
    article_dict = article_info['bib']
    article_dict = normalize_scholarly_dict(article_dict)
    article_dict = add_retrieval_information(article_dict, 'Scholarly',
                                             'unstructured_ID', search_string)
    return article_dict
def fetch_citations(author, filesave="citations.json", proxy="", proxy_list=""):
    """ Fetch citations from google scholar using scholarly """
    if proxy != "":
        print("Setting up proxy ", proxy)
        scholarly.use_proxy(scholarly.SingleProxy(http=proxy, https=proxy))
    if proxy_list != "":
        lproxies = open(proxy_list, 'r').readlines()

        def proxy_gen():
            if proxy_gen.counter >= len(lproxies):
                raise IndexError("We ran out of proxies...")
            proxy = lproxies[proxy_gen.counter]
            if not proxy.startswith("http"):
                proxy = "http://" + proxy
            proxy_gen.counter += 1
            return proxy

        proxy_gen.counter = 0
        scholarly.use_proxy(proxy_gen)

    print("Looking up " + author)
    search = scholarly.search_author(author)
    author = scholarly.fill(next(search))
    publications = []
    for i, pub in enumerate(author['publications']):
        cites = pub['num_citations']  # often this gets messed up upon .fill()
        if "pub_year" in pub['bib']:
            pubyear = pub['bib']["pub_year"]  # also this gets messed up upon .fill()
            pub = scholarly.fill(pub)
            pub['bib']["pub_year"] = pubyear
        else:
            pub = scholarly.fill(pub)
            if "pub_year" not in pub['bib']:
                # skip publications that really don't have a year,
                # they probably are crap that was picked up by the search robot
                continue
        pub['num_citations'] = cites
        print("Fetching: " + str(i) + "/" + str(len(author['publications'])) +
              ": " + pub['bib']["title"] + " (" + str(pub['bib']["pub_year"]) + ")")
        pub['bib'].pop("abstract", None)
        pub.pop("source", None)
        publications.append(pub)

    f = open(filesave, "w")
    f.write(json.dumps(publications))
    f.close()
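The scholarly.use_proxy(scholarly.SingleProxy(...)) call above reflects an older API; recent scholarly releases configure proxies through a ProxyGenerator object instead. A minimal sketch of the equivalent setup, assuming scholarly >= 1.0 and reusing the same proxy string passed to fetch_citations:

from scholarly import scholarly, ProxyGenerator

pg = ProxyGenerator()
pg.SingleProxy(http=proxy, https=proxy)  # route all scholarly requests through one proxy
scholarly.use_proxy(pg)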
def quick_fetch_author(name):
    search_query = scholarly.search_author(name)
    author = scholarly.fill(next(search_query), sections=['publications', 'coauthors'])
    iterator = 0
    data = {}
    publications = []
    coauthors = []
    for auth in author['coauthors']:
        coauthors.append(auth['name'])
    for pub in author['publications']:
        pub_info = {}
        make_attribute(pub_info, 'title', pub, 'bib')
        make_attribute(pub_info, 'num_citations', pub, 'plain')
        make_attribute(pub_info, 'pub_year', pub, 'bib')
        pub_info['_id'] = iterator
        iterator += 1
        publications.append(pub_info)
    make_attribute(data, 'name', author, 'plain')
    make_attribute(data, 'coauthors', coauthors, 'obj')
    make_attribute(data, 'affiliation', author, 'plain')
    make_attribute(data, 'email_domain', author, 'plain')
    make_attribute(data, 'interests', author, 'plain')
    make_attribute(data, 'citedby', author, 'plain')
    make_attribute(data, 'number_of_publications', len(publications), 'obj')
    make_attribute(data, 'publications', publications, 'obj')
    return data
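make_attribute is used here and again in fetch_publication_from_id below but is never defined. Judging from the call sites, it copies a field only when present; a sketch under that assumption:

def make_attribute(target, key, source, mode):
    # hypothetical helper inferred from the call sites:
    #   'bib'   -> copy source['bib'][key] when present
    #   'plain' -> copy source[key] when present
    #   'obj'   -> store source itself under key
    if mode == 'bib':
        if key in source.get('bib', {}):
            target[key] = source['bib'][key]
    elif mode == 'plain':
        if key in source:
            target[key] = source[key]
    elif mode == 'obj':
        target[key] = source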
def extract_coauthors_by_id(author_id):
    """ Extracts the co-authors of the currently existing authors in the dataset """
    # create the output file
    author = scholarly.search_author_id(author_id)
    filled_coauthors = scholarly.fill(author, ['coauthors'])
    coauthors_list = filled_coauthors['coauthors']
    for author in coauthors_list:
        filled_author = scholarly.fill(author, ['indices'])
        register_coauthering(author_id, filled_author['scholar_id'])
        print(filled_author)
        mydict = filled_author_to_dict(filled_author)
        write_author(mydict, AUTHORS_CSV_FILE_OUTPUT_COAUTHORS)
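filled_author_to_dict and write_author are external helpers shared with register_authors_from_generator below; a CSV-append sketch of what they could look like (the field selection and header handling are assumptions):

import csv

def filled_author_to_dict(filled_author):
    # hypothetical helper: keep only the scalar fields worth putting in a CSV row
    return {
        'scholar_id': filled_author.get('scholar_id', ''),
        'name': filled_author.get('name', ''),
        'affiliation': filled_author.get('affiliation', ''),
        'citedby': filled_author.get('citedby', ''),
        'hindex': filled_author.get('hindex', ''),
        'i10index': filled_author.get('i10index', ''),
    }

def write_author(author_dict, csv_path):
    # hypothetical helper: append one row, writing a header only for an empty file
    with open(csv_path, 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=list(author_dict.keys()))
        if f.tell() == 0:
            writer.writeheader()
        writer.writerow(author_dict)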
def add_publications(self, request):
    owner_id = self.request.user.id
    owner = get_object_or_404(User, pk=owner_id)
    Publication.objects.filter(owner=owner).delete()
    author_id = request.GET.get('author_id', None)
    author_basic = scholarly.search_author_id(author_id)
    author = scholarly.fill(author_basic)
    data = {}
    for publication in author['publications'][:25]:
        publication_info = publication['bib']
        if 'title' in publication_info:
            data['title'] = publication_info['title']
        if 'pub_year' in publication_info:
            data['publication_year'] = publication_info['pub_year']
        if 'author_pub_id' in publication:
            elem = publication['author_pub_id'].split(':')
            base_link = "https://scholar.google.com/citations?user={}" + \
                "#d=gs_md_cita-d&u=%2Fcitations%3Fview_op%3D" + \
                "view_citation%26user%3D{}%26citation_for_view%3D{}%3A{}"
            data['link'] = base_link.format(elem[0], elem[0], elem[0], elem[1])
        if 'num_citations' in publication:
            data['citation_number'] = publication['num_citations']
        data['owner'] = owner_id
        serializer = PublicationSerializer(data=data)
        serializer.is_valid(raise_exception=True)
        Publication.objects.create(**serializer.validated_data)
    return Response(status=status.HTTP_201_CREATED)
def get_schoolar_data(author_name, cache_folder="scholarly", affiliation='UBC'):
    output_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 "..", "resources", cache_folder)
    cached = os.path.join(output_folder, format_author(author_name))
    from_cache = False
    final_data = []
    if not os.path.isfile(cached):
        try:
            # Retrieve the author's data, fill-in, and print
            search_query = scholarly.search_author(f'{author_name} {affiliation}')
            author = scholarly.fill(next(search_query))
            # Print the titles of the author's publications
            titles = [pub['bib']['title'] for pub in author['publications']]
            final_data = []
            for title in titles:
                logger.info("Processing " + Fore.YELLOW + title + Style.RESET_ALL)
                ret = get_publication(title)
                retries = 0
                while not ret['success'] and retries < MAX_RETRIES_ON_ERROR:
                    retries += 1
                    msg = "Error while querying CrossRef API ({}), retrying ({})...".format(
                        ret["exception"], retries)
                    logger.info(Fore.RED + msg + Style.RESET_ALL)
                    ret = get_publication(title)
                    sleep(3)
                if ret['success']:
                    ret['original_title'] = title
                    final_data.append(ret)
                else:
                    logger.info(Fore.RED + '> Failed' + Style.RESET_ALL)
            final_data = list(
                filter(lambda k: k['result']['similarity'] >= 0.7, final_data))
            final_data = sorted(final_data,
                                key=lambda k: k['result']['similarity'],
                                reverse=True)
            with open(cached, 'w') as fo:
                json.dump(final_data, fo, indent=4, sort_keys=True)
        except StopIteration:
            logger.info(Fore.RED + 'no more schoolar data available' + Style.RESET_ALL)
            with open(cached, 'w') as fo:
                json.dump(final_data, fo, indent=4, sort_keys=True)
        except Exception as ex:
            logger.exception(str(ex))
    else:
        with open(cached, 'r') as fo:
            final_data = json.load(fo)
            from_cache = True
    return final_data, from_cache
def test_search_pubs_filling_publication_contents(self):
    '''
    This checks the process of filling a publication that is derived
    from the search publication snippets.
    '''
    query = 'Creating correct blur and its effect on accommodation'
    results = scholarly.search_pubs(query)
    pubs = [p for p in results]
    self.assertGreaterEqual(len(pubs), 1)
    f = scholarly.fill(pubs[0])
    self.assertTrue(
        f['bib']['author'] ==
        u'Cholewiak, Steven A and Love, Gordon D and Banks, Martin S')
    self.assertTrue(
        f['author_id'] == ['4bahYMkAAAAJ', '3xJXtlwAAAAJ', 'Smr99uEAAAAJ'])
    self.assertTrue(f['bib']['journal'] == u'Journal of vision')
    self.assertTrue(f['bib']['number'] == '9')
    self.assertTrue(f['bib']['pages'] == u'1--1')
    self.assertTrue(
        f['bib']['publisher'] ==
        u'The Association for Research in Vision and Ophthalmology')
    self.assertTrue(
        f['bib']['title'] ==
        u'Creating correct blur and its effect on accommodation')
    self.assertTrue(
        f['pub_url'] ==
        u'https://jov.arvojournals.org/article.aspx?articleid=2701817')
    self.assertTrue(f['bib']['volume'] == '18')
    self.assertTrue(f['bib']['pub_year'] == u'2018')
def _add_or_update_academic(google_scholar_id, user_id):
    user = User.query.get(user_id)
    current_app.logger.info(f'Adding or Updating Academic: {google_scholar_id} as user {user}')
    Entrez.email = user.email

    resp = scholarly.fill(scholarly.search_author_id(google_scholar_id), sections=['indices'])

    if resp:
        a = Academic.query.filter(Academic.google_scholar_id == google_scholar_id).one_or_none()
        if a is None:
            a = Academic(google_scholar_id=google_scholar_id)
        a.name = resp['name']
        a.affiliation = resp['affiliation']
        a.cited_by = resp['citedby']
        a.h_index = resp['hindex']
        a.i10_index = resp['i10index']
        a.last_update_date = datetime.utcnow()
        a.is_updating = True
        db.session.add(a)
        db.session.commit()

        _update_publications(a)

        a = Academic.query.filter(Academic.google_scholar_id == google_scholar_id).one()
        a.is_updating = False
        db.session.add(a)
        db.session.commit()

    current_app.logger.info(f'Adding or Updating Academic Completed: {google_scholar_id}')
def fetch_publication_from_id(name, id):
    search_query = scholarly.search_author(name)
    author = scholarly.fill(next(search_query), sections=['publications'])
    scholarly.fill(author['publications'][id])
    result = author['publications'][id]
    data = {}
    make_attribute(data, 'title', result, 'bib')
    make_attribute(data, 'author', result, 'bib')
    make_attribute(data, 'pub_year', result, 'bib')
    make_attribute(data, 'abstract', result, 'bib')
    make_attribute(data, 'journal', result, 'bib')
    make_attribute(data, 'number', result, 'bib')
    make_attribute(data, 'pages', result, 'bib')
    make_attribute(data, 'publisher', result, 'bib')
    make_attribute(data, 'volume', result, 'bib')
    make_attribute(data, 'num_citations', result, 'plain')
    return data
def fetch_scholar_author(gsID, fill=True):
    '''Queries google scholar for a given author.
    Also fills all stats if fill is True (a bit slower).'''
    author = scholarly.search_author_id(gsID)
    if fill:
        author = scholarly.fill(
            author, sections=['publications', 'basics', 'indices', 'counts'])
    return author
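A possible call pattern for the function above, purely illustrative and reusing the Cholewiak scholar ID that appears elsewhere on this page; since the record is filled with the 'indices' section, hindex and i10index should be available:

author = fetch_scholar_author('4bahYMkAAAAJ')
print(author['name'], author['hindex'], author['i10index'])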
def get_papers_for_author(author_id):
    ''' Gets and registers the papers of an author '''
    print("getting papers for author " + author_id)
    author = scholarly.search_author_id(author_id)
    filled_publications = scholarly.fill(author, ['publications'])
    publications_list = filled_publications['publications']
    nbpubs_counter = 0
    for publication in publications_list:
        filled_publication = scholarly.fill(publication)
        mydict = tiny_publication_to_dict(filled_publication)
        write_publication(mydict, PUBLICATIONS_CSV_FILE_OUTPUT)
        nbpubs_counter += 1
        print("nbpubs_counter =====>")
        print(nbpubs_counter)
        if nbpubs_counter > NB_MAX_PAPERS_PER_AUTHOR:
            break
def main(args):
    print('Querying Google Scholar ...')
    author = scholarly.search_author_id('SZR6mXsAAAAJ')
    author = scholarly.fill(author, sections=['publications'])
    pubs = map(_parse, author['publications'])
    cites = {x[0]: x[1] for x in pubs if x[1] > 0}
    # jsonp = 'cites = {};'.format()
    jsonp = json.dumps(cites)
    with open(args.output, 'w') as out:
        out.write(jsonp)
def register_authors_from_generator(author_generator):
    """ Goes through the author generator, gets all the authors,
    and registers them in the authors dataset """
    # create (truncate) the output file
    open(AUTHORS_CSV_FILE_OUTPUT, 'w').close()
    # iterating the generator directly ends the loop cleanly when it is
    # exhausted, instead of raising StopIteration from a bare next() call
    for author in author_generator:
        filled_author = scholarly.fill(author, ['indices'])
        mydict = filled_author_to_dict(filled_author)
        write_author(mydict, AUTHORS_CSV_FILE_OUTPUT)
def save_csv():
    _file = open("output.csv", "w+")
    search_query = scholarly.search_author("Mayken Espinoza-Andaluz")
    author = scholarly.fill(next(search_query))
    _file.write("title|authors|year|abstract\n")
    for pub in author["publications"]:
        # author and abstract are only present in bib after filling the publication
        pub = scholarly.fill(pub)
        title = pub["bib"]["title"]
        year = pub["bib"].get("pub_year", "")
        authors = pub["bib"].get("author", "")
        abstract = pub["bib"].get("abstract", "")
        _file.write(f"{title}|{authors}|{year}|{abstract}\n")
    _file.close()
def get_author(author, university=""):
    url_part = "https://scholar.google.co.in/citations?user="
    # the search call was redacted in the original source; this line is a
    # reconstruction inferred from the surviving fragment and the use of
    # authorSearch below
    authorSearch = scholarly.search_author(author + (", " + university if university != '' else ''))
    try:
        authorResult = next(authorSearch)
    except:
        return "Not Found"
    authorRaw = scholarly.fill(authorResult, sections=['basics', 'indices', 'publications'])
    authorDetails = {
        'name': authorRaw['name'],
        'affiliation': authorRaw['affiliation'],
        'email_domain': authorRaw['email_domain'],
        'interests': authorRaw['interests'],
        'publications': len(authorRaw['publications']),
        'citedby': authorRaw['citedby'],
        'hindex': authorRaw['hindex'],
        'i10index': authorRaw['i10index'],
        'gscholar_url': url_part + authorRaw['scholar_id'],
    }
    return authorDetails
def composeTweet(authorName, authorHandles_dic, pubTitle, pub):
    pub = scholarly.fill(pub)
    pubURL = pub['pub_url']
    if 'pub_year' in pub['bib'].keys():
        if pub['bib']['pub_year'] == str(currentYear):
            if authorName in authorHandles_dic.keys():
                tweet = ('[' + authorName + ']' + ' just published a new paper: "' +
                         pubTitle + '"\n' + pubURL + '\n' + authorHandles_dic[authorName])
                return pubURL, tweet
            else:
                tweet = ('[' + authorName + ']' + ' just published a new paper: "' +
                         pubTitle + '"\n' + pubURL)
                return pubURL, tweet
    return None, None
def get_publications_sch(author_name):
    # This block is useful for debugging and development
    # reset = False
    # if reset:
    #     search_query = scholarly.search_author('Nathan Pemberton')
    #     author = scholarly.fill(next(search_query))
    #     with open('author.pickle', 'wb') as f:
    #         pickle.dump(author, f)
    #
    #     pubs = [scholarly.fill(p) for p in author['publications']]
    #     with open('pubs.pickle', 'wb') as f:
    #         pickle.dump(pubs, f)
    # else:
    #     with open('author.pickle', 'rb') as f:
    #         author = pickle.load(f)
    #     with open('pubs.pickle', 'rb') as f:
    #         pubs = pickle.load(f)

    search_query = scholarly.search_author('Nathan Pemberton')
    author = scholarly.fill(next(search_query))
    pubs = [scholarly.fill(p) for p in author['publications']]

    for pub in pubs:
        bib = pub['bib']
        bib['ENTRYTYPE'] = 'article'
        bib['ID'] = pub['author_pub_id']
        bib['url'] = pub['eprint_url']
        if 'pub_year' in bib:
            bib['year'] = bib['pub_year']
        special_cases(pub)
        normalize_journal(pub)

    pubs.sort(key=sort_by_year, reverse=True)

    with open(BIBPATH, 'w') as f:
        f.write('@preamble{"{"name" : "' + author_name + '"}"}\n')
        for p in pubs:
            f.write(scholarly.bibtex(p))
            f.write('\n')
def download_citations():
    # Retrieve the author's data, fill-in, and print
    # search_query = scholarly.search_author(NAME)
    search_query = scholarly.search_author_id(AUTHOR_ID)
    # author = scholarly.fill(next(search_query))
    author = scholarly.fill(search_query)
    print(author)
    # Print the titles of the author's publications
    print([pub['bib']['title'] for pub in author['publications']])
    # Take a closer look at the first publication
    # pub = scholarly.fill(author['publications'][1])
    # print(pub)
    independent_citations = []
    for pub in author['publications'][:]:
        res_dict = {}
        time.sleep(random.randint(WAIT, WAIT * 2))
        pub = scholarly.fill(pub)
        res_dict["title"] = pub['bib']["title"]
        res_dict["year"] = pub['bib']["pub_year"]
        print(pub['bib']["title"])
        # split on ' and ' (with spaces) so names containing "and" stay intact
        res_dict["author"] = [name.strip() for name in pub['bib']["author"].split(" and ")]
        time.sleep(random.randint(WAIT, WAIT * 2))
        cited_this = scholarly.citedby(pub)
        if cited_this:
            res_dict['cited_this'] = [{"author": citation['bib']["author"],
                                       "title": citation['bib']["title"]}
                                      for citation in cited_this]
            indep_citations = print_citations(res_dict)
            res_dict['independent_citations'] = indep_citations
            independent_citations.append({"title": res_dict["title"],
                                          "author": res_dict["author"],
                                          'independent_citations': indep_citations})
            save_json(res_dict['title'], res_dict)
        else:
            break
    save_json("independent_citations.json", independent_citations)
def author_to_affiliations(NAME):
    try:
        with open('affilations.p', 'rb') as f:
            affiliations = pickle.load(f)
        for k, v in affiliations.items():
            if isinstance(v, list):
                affiliations[k] = v[0]['name']
    except:
        pass
    response = requests.get("https://dissem.in/api/search/?authors=" + str(NAME))
    author_papers = response.json()
    visit_urls = []
    coauthors = []
    titles = []
    affiliations = {}
    orcids = {}
    for p in author_papers["papers"]:
        coauthors_ = p["authors"]
        records = p["records"][0]
        if "doi" in records.keys():
            visit_urls.append(records["doi"])
            doi_to_author_affil_list = crossref_commons.retrieval.get_publication_as_json(
                records["doi"])
            for al in doi_to_author_affil_list["author"]:
                key = al['given'] + " " + al['family']
                # if key not in affiliations.keys():
                if len(al['affiliation']):
                    affiliations[key] = al['affiliation'][0]['name']
                if "ORCID" in al.keys():
                    orcids[key] = al["ORCID"]
                # if not len(al['affiliation']):
                search_query = list(scholarly.search_author(key))
                # sq = search_query[0]
                if len(search_query):
                    sq = search_query[0]
                    res_author_search = scholarly.fill(sq)
                    afil = res_author_search['affiliation']
                    # if "university" in afil or "state" in afil or "universidad" in afil or "college" in afil or "school" in afil:
                    if len(al['affiliation']):
                        # if al['affiliation'] in res_author_search['affiliation']:
                        print(al['affiliation'], res_author_search['affiliation'])
                        affiliations[key] = res_author_search['affiliation']
                        # print(affiliations[key], key)
    # print(affiliations)
    with open('affilations.p', 'wb') as f:
        pickle.dump(affiliations, f)
    return affiliations
def busca_publicaciones(lista):
    listapub = []
    for row in lista:
        print(row)
        search_query = scholarly.search_author(row)
        author = scholarly.fill(next(search_query))
        for index in range(len(author['publications'])):
            pub = scholarly.fill(author['publications'][index])
            print(pub['bib'])
            try:
                listapub.append({
                    'investigador': row,
                    'year': pub['bib']['pub_year'],
                    'title': pub['bib']['title'],
                    'author': pub['bib']['author'],
                    'journal': pub['bib']['journal']
                })
            except:
                pass
    print(listapub)
    return listapub
def test_search_pubs_citedby(self):
    """
    Testing that when we retrieve the list of publications that cite a
    publication, the number of citing publications is the same as the
    number of papers that are returned. We use a publication with a
    small number of citations, so that the test runs quickly.
    The 'Machine-learned epidemiology' paper had 11 citations as of
    June 1, 2020.
    """
    query = 'Machine-learned epidemiology: real-time detection of foodborne illness at scale'
    pubs = [p for p in scholarly.search_pubs(query)]
    self.assertGreaterEqual(len(pubs), 1)
    filled = scholarly.fill(pubs[0])
    cites = [c for c in scholarly.citedby(filled)]
    self.assertEqual(len(cites), filled['num_citations'])
def get_author_statistics(name):
    OUT = np.zeros((1,), dtype=my_dtype)
    search_query = scholarly.search_author(name)
    # fill the author record so the indices and cites_per_year fields are available
    author = scholarly.fill(next(search_query))
    for s in statistics[:6]:
        OUT[s] = author[s]
    for year in YEARS:
        yearstr = "cit%s" % (year)
        try:
            OUT[yearstr] = author['cites_per_year'][year]
        except:
            pass
    return OUT
from scholarly import scholarly

author = scholarly.search_author_id('V4ycRTQAAAAJ')
pubs = scholarly.fill(author)
pub = pubs['publications'][0]['bib']['title']
query = next(scholarly.search_pubs(pub))
# after a few calls, Google starts blocking requests...
bib = scholarly.bibtex(query)
print(bib)

# query = scholarly.search_pubs("A density-based algorithm for discovering clusters in large spatial databases with noise")
# pub = next(query)
# print(pub)
# print(scholarly.bibtex(pub))
# Add new author and populate previous pubs into pubfile
from scholarly import scholarly

newAuthors = ['Jerome Buhl', 'Steven Strogatz']  # Replace Me

with open('authors.csv', 'a') as authorFile:
    for newAuthor in newAuthors:
        authorFile.write(newAuthor + '\n')
        search_query = scholarly.search_author(newAuthor)
        author = scholarly.fill(next(search_query))
        pubIDs = []
        for pub in author['publications']:
            authPubID = pub['author_pub_id']
            iColon = authPubID.find(':')
            pubID = authPubID[iColon + 1:]
            pubIDs.append(pubID)
        with open('pubs.csv', 'a') as pubFile:
            for pubID in pubIDs:
                pubFile.write(pubID + '\n')
from scholarly import scholarly as schl
import pandas as pd
import requests

labull = schl.search_author_id('Lszt1B4AAAAJ')
schl.pprint(labull)

# pubs associated w labull
auth = schl.fill(labull, sections=['publications', 'indices'], sortby='year')

# fill pub info
pub_info = [
    schl.fill(pub, sections=['bib', 'pub_url'])
    for pub in auth['publications']
]

# co-authors
pub_auth = [pub['bib']['author'] for pub in pub_info]
pub_surnames = []
for auth_list in pub_auth:
    sn = [auth.split()[-1] for auth in auth_list.split(' and ')]
    sn[sn.index('Bull')] = '**Bull**'
    pub_surnames.append(sn)

# journal
# pub_jnl = [pub['bib'].get('journal') for pub in pub_info]

# pub info string
publ_entry = [
    ', '.join(pub_surnames[pp]) + ' ' +                                        # author surnames
    '[' + pub['bib']['title'] + ']' + '(' + pub.get('pub_url') + ')' + ' ' +   # title w links
    '(' + str(pub['bib']['pub_year']) + ')'                                     # yr
    for pp, pub in enumerate(pub_info)
]