def _add_or_update_academic(google_scholar_id, user_id):
    """Create or refresh the Academic row for a Google Scholar profile.

    Fetches the author's citation indices from Google Scholar, upserts the
    Academic record, refreshes its publications, and finally clears the
    ``is_updating`` flag.

    :param google_scholar_id: Google Scholar profile id to import.
    :param user_id: id of the User on whose behalf the import runs; their
        email is registered with Entrez for NCBI requests.
    """
    user = User.query.get(user_id)
    current_app.logger.info(f'Adding or Updating Academic: {google_scholar_id} as user {user}')
    # Entrez requires a contact email before making NCBI API calls.
    Entrez.email = user.email
    # Only the 'indices' section is needed here (citedby / h-index / i10-index).
    resp = scholarly.fill(scholarly.search_author_id(google_scholar_id), sections=['indices'])
    if resp:
        a = Academic.query.filter(Academic.google_scholar_id == google_scholar_id).one_or_none()
        if a is None:
            # First import of this profile: create a fresh row.
            a = Academic(google_scholar_id=google_scholar_id)
        a.name=resp['name']
        a.affiliation=resp['affiliation']
        a.cited_by=resp['citedby']
        a.h_index=resp['hindex']
        a.i10_index=resp['i10index']
        a.last_update_date=datetime.utcnow()
        # Mark busy and commit BEFORE the (slow) publications refresh so other
        # requests can observe the in-progress state.
        a.is_updating = True
        db.session.add(a)
        db.session.commit()
        _update_publications(a)
        # Re-query instead of reusing `a` — presumably _update_publications
        # commits/expires the session; TODO confirm.
        a = Academic.query.filter(Academic.google_scholar_id == google_scholar_id).one()
        a.is_updating = False
        db.session.add(a)
        db.session.commit()
    current_app.logger.info(f'Adding or Updating Academic Completed: {google_scholar_id}')
def plot_citations(author_name):
    """Plot the affiliations of authors citing *author_name*'s first paper.

    Looks up the author, walks the citations of their first listed
    publication, geolocates each citing author's affiliation, and marks it on
    a world map. Sleeps between Scholar calls to avoid being rate-limited.

    :param author_name: free-text author name to search on Google Scholar.
    """
    m = Basemap(projection='mill', lon_0=180)
    m.drawmapboundary(fill_color='aqua')
    m.fillcontinents(color='coral', lake_color='aqua')
    search_query = scholarly.search_author(author_name)
    author = next(search_query).fill()
    print(author)
    for pub in [author.publications[0]]:
        print('Title: ', pub.bib['title'])
        pub = pub.fill()
        sleep(45)
        for citation in pub.citedby:
            print(citation)
            sleep(45)
            # BUG FIX: the original popped from citation.bib['author_id'] in a
            # while-loop whose condition re-checked the shrinking list
            # (len(...) == 0), so it raised IndexError once the list emptied.
            # Take the last non-empty id (pop() returned the last element).
            firstAuthorId = None
            for candidate_id in reversed(citation.bib.get('author_id', [])):
                if candidate_id:
                    firstAuthorId = candidate_id
                    break
            if firstAuthorId is None:
                # Citing entry has no usable author id; skip it.
                continue
            print(firstAuthorId)
            author = scholarly.search_author_id(firstAuthorId)
            sleep(45)
            lat, lon = get_location(author.affiliation)
            # Basemap expects (lon, lat) and returns map-projection coords.
            x, y = m(float(lon), float(lat))
            m.plot(x, y, marker='D')
    plt.show()
def fetch_citer_by_author(author):
    """Return the titles of all works citing any of *author*'s publications.

    :param author: author name to search for (must resolve to exactly one
        Google Scholar profile).
    :raises AssertionError: if the name does not resolve to a unique author.
    :return: list of citing-publication titles across ALL of the author's
        publications.
    """
    # First, get the author entry.
    # TODO: also accept a scholar id and dispatch to
    # scholarly.search_author_id(author) — the original had a dead
    # `if True:` placeholder for this.
    author_gen = proxied_search_author(author)
    # The query must be unambiguous; bail out otherwise.
    matches = list(author_gen)
    assert (len(matches) == 1), "Author query not unique"
    author = matches[0]
    author = proxied_author_fill(author)
    # BUG FIX: the original rebuilt `citers` on every loop iteration, so only
    # one publication's citers ever survived; accumulate across all of them.
    citers = []
    for kk, pub in enumerate(author.publications):
        print(kk)
        # Fetch the full publication entry to get its citations.
        pub = proxied_pub_fill(pub)
        citers.extend(citation.bib['title'] for citation in pub.citedby)
    return citers
def main(args):
    """Sync the local pickle cache of Google Scholar data for SZR6mXsAAAAJ.

    The author record is always re-fetched; publications are filled only for
    entries not already present in the cache (unless --force clears it).
    """
    gs_author = {}
    gs_pubs = []
    if not args.force:
        # Reuse previously pickled data unless a full refresh was requested.
        gs_author = load('google_scholar_author.pkl') or {}
        gs_pubs = load('google_scholar_publications.pkl') or []
    # Always refresh and re-save the author record itself.
    gs_author = scholarly.fill(scholarly.search_author_id('SZR6mXsAAAAJ'))
    save(gs_author, 'google_scholar_author.pkl')
    # Work out which publications are new relative to the cache.
    cached_ids = {p['author_pub_id'] for p in gs_pubs}
    fetched_ids = {p['author_pub_id'] for p in gs_author['publications']}
    missing_ids = fetched_ids - cached_ids
    fresh_pubs = [
        p for p in gs_author['publications'] if p['author_pub_id'] in missing_ids
    ]
    # TODO update based on info available on gs_author
    for pub in tqdm(fresh_pubs):
        gs_pubs.append(scholarly.fill(pub))
    save(gs_pubs, 'google_scholar_publications.pkl')
def add_publications(self, request):
    """Replace the requesting user's publications with their Scholar ones.

    Deletes any existing Publication rows for the user, fetches the Google
    Scholar profile given by the ``author_id`` query parameter, and stores up
    to 25 of its publications.

    :param request: DRF request; ``author_id`` is read from the query string.
    :return: 201 response on success.
    :raises ValidationError: via the serializer when a record is invalid.
    """
    owner_id = self.request.user.id
    owner = get_object_or_404(User, pk=owner_id)
    # Wipe previously imported publications for this owner.
    Publication.objects.filter(
        owner=owner
    ).delete()
    author_id = request.GET.get('author_id', None)
    author_basic = scholarly.search_author_id(author_id)
    author = scholarly.fill(author_basic)
    for publication in author['publications'][:25]:
        # BUG FIX: the dict was created once OUTSIDE the loop, so fields from
        # earlier publications leaked into later ones whenever a key (title,
        # pub_year, link, ...) was missing; build it fresh per publication.
        data = {}
        publication_info = publication['bib']
        if 'title' in publication_info:
            data['title'] = publication_info['title']
        if 'pub_year' in publication_info:
            data['publication_year'] = publication_info['pub_year']
        if 'author_pub_id' in publication:
            # author_pub_id is "<user>:<citation>", used to build a deep link.
            elem = publication['author_pub_id'].split(':')
            base_link = "https://scholar.google.com/citations?user={}" + \
                "#d=gs_md_cita-d&u=%2Fcitations%3Fview_op%3D" +\
                "view_citation%26user%3D{}%26citation_for_view%3D{}%3A{}"
            data['link'] = base_link.format(
                elem[0], elem[0], elem[0], elem[1])
        if 'num_citations' in publication:
            data['citation_number'] = publication['num_citations']
        data['owner'] = owner_id
        serializer = PublicationSerializer(data=data)
        serializer.is_valid(raise_exception=True)
        Publication.objects.create(
            **serializer.validated_data)
    return Response(status=status.HTTP_201_CREATED)
def fetch_scholar_author(gsID, fill=True):
    """Look up a Google Scholar author by id.

    :param gsID: Google Scholar profile id.
    :param fill: when True (default), also populate publications, basics,
        indices and citation counts — a bit slower.
    :return: the (optionally filled) scholarly author object.
    """
    found = scholarly.search_author_id(gsID)
    if not fill:
        return found
    return scholarly.fill(
        found, sections=['publications', 'basics', 'indices', 'counts'])
def test_search_author_id(self):
    """
    Test the search by author ID.

    Marie Skłodowska-Curie's ID is EmD_lTEAAAAJ and these IDs are
    permanent, so the lookup should keep resolving to her profile.
    """
    author = scholarly.search_author_id('EmD_lTEAAAAJ')
    self.assertEqual(author.name, u'Marie Skłodowska-Curie')
    self.assertEqual(author.affiliation, u'Institut du radium, University of Paris')
def main(args):
    """Dump non-zero citation counts per publication to a JSON file.

    Fetches the publications of author SZR6mXsAAAAJ, parses each with
    ``_parse`` into a (key, citation-count) pair, keeps the ones with a
    positive count, and writes the mapping to ``args.output``.
    """
    print('Querying Google Scholar ...')
    author = scholarly.search_author_id('SZR6mXsAAAAJ')
    author = scholarly.fill(author, sections=['publications'])
    parsed = (_parse(p) for p in author['publications'])
    cites = {key: count for key, count in parsed if count > 0}
    with open(args.output, 'w') as out:
        out.write(json.dumps(cites))
def background(self):
    """Populate self.data with each tracked author's total citation count.

    In test mode, short-circuits with canned numbers instead of hitting
    Google Scholar.
    """
    self.data = []
    if self.test:
        self.data = [25, 30]
        return
    # Matthew, Jorgen, Igor
    scholar_ids = ("wxM0Gh8AAAAJ", "hfeXoYMAAAAJ", "rSVxxwsAAAAJ")
    for scholar_id in scholar_ids:
        self.data.append(scholarly.search_author_id(scholar_id).get("citedby"))
def background(self):
    """Populate self.data with per-author citation totals summed over papers.

    In test mode, short-circuits with canned numbers instead of hitting
    Google Scholar.
    """
    self.data = []
    if self.test:
        self.data = [25, 30]
        return
    # Matthew, Jorgen, Igor
    for scholar_id in ("wxM0Gh8AAAAJ", "hfeXoYMAAAAJ"):  # , "rSVxxwsAAAAJ"
        filled = scholarly.search_author_id(scholar_id).fill()
        total = sum(int(paper.bib["cites"]) for paper in filled.publications)
        self.data.append(total)
def test_search_author_id_filled(self):
    """
    Test the search by author ID with ``filled=True``.

    Marie Skłodowska-Curie's ID is EmD_lTEAAAAJ and these IDs are
    permanent. As of July 2020 she has 1963 citations and 179 publications
    on Google Scholar, so the filled profile must report at least that many.
    """
    author = scholarly.search_author_id('EmD_lTEAAAAJ', filled=True)
    self.assertEqual(author.name, u'Marie Skłodowska-Curie')
    self.assertEqual(author.affiliation, u'Institut du radium, University of Paris')
    self.assertGreaterEqual(author.citedby, 1963)
    self.assertGreaterEqual(len(author.publications), 179)
def extract_coauthors_by_id(author_id):
    """Record the co-authors of the given author into the co-authors CSV.

    Fetches the author's co-author list from Google Scholar, fills each
    co-author's indices, registers the co-authoring relation, and appends the
    co-author to the output CSV.

    :param author_id: Google Scholar id of the author to expand.
    """
    base_author = scholarly.search_author_id(author_id)
    with_coauthors = scholarly.fill(base_author, ['coauthors'])
    for coauthor in with_coauthors['coauthors']:
        filled = scholarly.fill(coauthor, ['indices'])
        register_coauthering(author_id, filled['scholar_id'])
        print(filled)
        row = filled_author_to_dict(filled)
        write_author(row, AUTHORS_CSV_FILE_OUTPUT_COAUTHORS)
def get_papers_for_author(author_id):
    """Fetch and register an author's papers, up to NB_MAX_PAPERS_PER_AUTHOR.

    Fills each publication from Google Scholar and appends it to the
    publications CSV.

    :param author_id: Google Scholar id of the author.
    """
    print("getting paper for author " + author_id)
    author = scholarly.search_author_id(author_id)
    filled_publications = scholarly.fill(author, ['publications'])
    publications_list = filled_publications['publications']
    nbpubs_counter = 0
    for publication in publications_list:
        filled_publication = scholarly.fill(publication)
        mydict = tiny_publication_to_dict(filled_publication)
        write_publication(mydict, PUBLICATIONS_CSV_FILE_OUTPUT)
        nbpubs_counter += 1
        print("nbpubs_counter =====>")
        print(nbpubs_counter)
        # BUG FIX: '>' allowed NB_MAX_PAPERS_PER_AUTHOR + 1 papers to be
        # written before breaking; '>=' caps the output at the maximum.
        if nbpubs_counter >= NB_MAX_PAPERS_PER_AUTHOR:
            break
def download_citations():
    """Download, per publication of AUTHOR_ID, the works citing it.

    For each publication: fills it, collects its citing entries, derives the
    independent citations via ``print_citations``, saves a per-publication
    JSON, and finally writes an aggregate ``independent_citations.json``.
    Sleeps randomly between Scholar calls to avoid rate limiting.
    """
    # Retrieve the author's data, fill-in, and print
    # search_query = scholarly.search_author(NAME)
    search_query = scholarly.search_author_id(AUTHOR_ID)
    # author = scholarly.fill(next(search_query))
    author = scholarly.fill(search_query)
    print(author)
    # Print the titles of the author's publications
    print([pub['bib']['title'] for pub in author['publications']])
    # Take a closer look at the first publication
    # pub = scholarly.fill(author['publications'][1])
    # print(pub)
    independent_citations = []
    for pub in author['publications'][:]:
        res_dict = {}
        time.sleep(random.randint(WAIT, WAIT * 2))
        pub = scholarly.fill(pub)
        res_dict["title"] = pub['bib']["title"]
        res_dict["year"] = pub['bib']["pub_year"]
        print(pub['bib']["title"])
        # Author string is "A and B and C"; split into individual names.
        res_dict["author"] = [name.strip() for name in pub['bib']["author"].split("and")]
        time.sleep(random.randint(WAIT, WAIT * 2))
        cited_this = scholarly.citedby(pub)
        if cited_this:
            res_dict['cited_this'] = [{"author": citation['bib']["author"],
                                       "title": citation['bib']["title"]}
                                      for citation in cited_this]
            indep_citations = print_citations(res_dict)
            res_dict['independent_citations'] = indep_citations
            independent_citations.append(
                {"title": res_dict["title"],
                 "author": res_dict["author"],
                 'independent_citations': indep_citations})
            save_json(res_dict['title'], res_dict)
        else:
            # NOTE(review): this aborts the WHOLE loop at the first
            # publication without citations — confirm `continue` was not
            # intended here.
            break
    save_json("independent_citations.json", independent_citations)
def extract_interests(input_output_file):
    """Fill in missing 'interests' and 'url_picture' columns of an authors CSV.

    For every row whose ``interests`` cell is empty/NaN, fetches the author's
    Google Scholar profile and writes the interests (pipe-joined) and the
    picture URL back to the dataframe, saving the file after each row.

    :param input_output_file: path of the authors CSV, read and updated
        in place via the get/update dataframe helpers.
    """
    df = get_authors_dataframe(input_output_file)
    df = df.astype({"interests": str})
    df = df.astype({"url_picture": str})
    print("file readed successfully")
    for index, row in df.iterrows():
        print('interest====>' + str(row['interests']))
        if row['interests'] in ("", None, "nan") or pd.isna(row['interests']):
            print("Getting interests of author :" + row['scholar_id'])
            try:
                author = scholarly.search_author_id(row['scholar_id'])
                # BUG FIX: the original assigned df.at[...] from local
                # variables that were only bound inside the `in author`
                # guards, so a profile missing either key raised NameError
                # (swallowed below and mis-recorded as 'error'). Assign
                # inside the guards instead.
                if 'interests' in author:
                    df.at[index, 'interests'] = '|'.join(author['interests'])
                if 'url_picture' in author:
                    df.at[index, 'url_picture'] = author['url_picture']
                update_authors_dataframe(input_output_file, df)
            except Exception as identifier:
                print(
                    "An exception happened while getting interests of : " +
                    row['scholar_id'])
                df.at[index, 'interests'] = 'error'
                print(identifier.args)
                update_authors_dataframe(input_output_file, df)
def fetch_auth_data_google_scholar(self):
    """Build a pandas DataFrame of Google Scholar stats for a set of authors.

    NOTE(review): the first statement below is CORRUPTED in this dump — a
    redaction ('******') fused the ``base_url`` assignment with the header of
    a for-loop over author ids (presumably something like
    ``for aid in ...['author_id']:`` preceded by ``authors = []``). Recover
    the original line from version control before using this code.
    """
    base_url = 'https://scholar.google.com/citations?user='******'author_id']:
        if aid is not None:
            author = scholarly.search_author_id(aid)
            authors.append(
                author.fill(
                    sections=['basics', 'indices', 'publications']))
    author_info = []
    for author in authors:
        # One summary dict per author: id, publication count, citations, h-index.
        a_dict = {'authorId': author.id}
        auth_url = base_url + a_dict['authorId']
        a_dict['Publications'] = len(author.publications)
        a_dict['Citations'] = author.citedby
        a_dict['h-index'] = author.hindex
        # Payload shaped for the front end (socket emit currently disabled).
        front_end_info = {
            'meta': 'auth_meta',
            'name': author.name,
            'publications': a_dict['Publications'],
            'url': auth_url,
            'citations': a_dict['Citations'],
            'h_index': a_dict['h-index']
        }
        print("GScholar Info: ", front_end_info)
        # self.socketio.emit('server_response', front_end_info, namespace='')
        author_info.append(a_dict)
    if len(author_info) == 0:
        # Tell the front end nothing was found.
        front_end_info = {
            'meta': 'auth_meta',
            'author_info_not_found': '1'
        }
        # self.socketio.emit('server_response', front_end_info, namespace='')
        print("GScholar Info: ", front_end_info)
    return pd.DataFrame(author_info)
def pubs(WIDTH):
    """Build a rich 'Publications' report panel from Google Scholar data.

    Fetches the hard-coded author's filled profile, sorts publications by
    year (descending), and renders year headers, titles, highlighted author
    lists, journal + URL, and citation counts into a ``pi.Report``.

    :param WIDTH: render width for the report panel.
    :return: the populated ``pi.Report``.
    """
    def sort_items(item):
        """Sort key: publication year as int, 0 when the year is missing."""
        try:
            return int(item["bib"]["pub_year"])
        except KeyError:
            return 0

    myid = "8eDOmAQAAAAJ"  # google scholar ID
    me = scholarly.search_author_id(myid, filled=True)
    pubs = pi.Report("Publications", accent=orange, dim=amber)
    pubs.width = WIDTH
    pubs.add(gscholar_bio(me, WIDTH), "rich")
    pubs.spacer(2)
    year = None
    for n, pub in enumerate(
            sorted(me["publications"], key=sort_items, reverse=True)):
        cites = pub["num_citations"]
        pub = scholarly.fill(pub)["bib"]
        if "pub_year" not in pub.keys():
            # Undated publications are skipped entirely.
            continue
        # Mark year — emit a divider + centered year header on each change.
        if pub["pub_year"] != year:
            if n > 0:
                pubs.line(amber)
                pubs.spacer()
            pubs.add(f'[bold {salmon_light}]{pub["pub_year"]}', justify="center")
            year = pub["pub_year"]
            pubs.spacer()
        # add title
        pubs.add(f"[bold italic {orange_light}]" + pub["title"])
        # add authors — own name variants are highlighted in pink.
        try:
            auths = pub["author"].replace(" and", ",")
        except KeyError:
            auths = ""
        names = ["F Claudi", "Federico Claudi", "F. Claudi"]
        formatted_auths = ""
        for author in auths.split(","):
            if author.strip() in names:
                formatted_auths += f"[bold {pink}]{author}[/bold {pink}],"
            else:
                formatted_auths += f"[{blue_grey}]{author}[/{blue_grey}],"
        pubs.add(formatted_auths)
        # Add journal — prefer the eprint link, then url, else no link.
        if "eprint" in pub.keys():
            url = pub["eprint"]
        elif "url" in pub.keys():
            url = pub["url"]
        else:
            url = ""
        try:
            journal = pub["journal"]
        except KeyError:
            journal = ""
        pubs.add(f"[i {blue_grey_light}]" + journal +
                 f"[/i {blue_grey_light}]" + "[dim]\n" + url)
        # Add citations
        pubs.add(
            f"[{blue_grey_light}]Citations: [{orange}]{cites}",
            justify="right",
        )
        pubs.spacer()
    return pubs
from scholarly import scholarly
import jsonpickle
import json
from datetime import datetime
import os

# Fetch the author profile identified by the GOOGLE_SCHOLAR_ID environment
# variable and fill in basics, indices, citation counts and publications.
author: dict = scholarly.search_author_id(os.environ['GOOGLE_SCHOLAR_ID'])
scholarly.fill(author, sections=['basics', 'indices', 'counts', 'publications'])
name = author['name']
# Timestamp this refresh.
author['updated'] = str(datetime.now())
# Re-key publications by their author_pub_id for stable lookups.
author['publications'] = {v['author_pub_id']:v for v in author['publications']}
print(json.dumps(author, indent=2))
os.makedirs('results', exist_ok=True)
# Full profile dump.
with open(f'results/gs_data.json', 'w') as outfile:
    json.dump(author, outfile, ensure_ascii=False)

# Minimal shields.io endpoint payload exposing the citation count as a badge.
shieldio_data = {
    "schemaVersion": 1,
    "label": "citations",
    "message": f"{author['citedby']}",
}
with open(f'results/gs_data_shieldsio.json', 'w') as outfile:
    json.dump(shieldio_data, outfile, ensure_ascii=False)
return first, itertools.chain([first], iterable) for index, row in data.iterrows(): # Retrieve the author's data, fill-in, and print # search_query = scholarly.search_author('Steven A Cholewiak') # do not use generators coz we'll always get a single profile name = row['Last name'].strip() + " " + row['First name'].strip() url = row['Google scholar'] print("Author: ", name) if not pd.isna(url): print("Research Profile: ", url) parsed = urlparse.urlparse(url) search_query = scholarly.search_author_id( parse_qs(parsed.query)['user'][0]) author = search_query.fill() print(search_query) print(author) # break # Print the titles of the author's publications # print([pub.bib['title'] for pub in author.publications]) # Take a closer look at the first publication # pub = author.publications[0].fill() # print(pub) # Which papers cited that publication? # print([citation.bib['title'] for citation in pub.citedby]) # break
num_possible = 0 for instructor_name in names: if not instructors_internal_db.check_its_time( instructor_name, 'gscholar_last_search', UPDATE_MIN_DAYS, UPDATE_MAX_DAYS): continue if instructors[instructor_name]['gscholar'] \ and instructors_internal_db.check_its_time(instructor_name, 'gscholar_last_update', UPDATE_MIN_DAYS, UPDATE_MAX_DAYS): # skipping cause not expired 'gscholar_last_update' and gscholar already exists continue # search or update if instructors[instructor_name]['gscholar']: logger.info("Updating instructor: %s", instructor_name) result = scholarly.search_author_id( instructors[instructor_name]['gscholar']['scholar_id']) found = True else: logger.info("Searching instructor: %s", instructor_name) results = scholarly.search_author(instructor_name) # print("Our db:", instructors[instructor_name]) found = False for result in results: # print("Google Scholar result: ", result) if ('columbia' in result['affiliation'].lower() and 'british' not in result['affiliation'].lower()) \ or 'columbia' in result['email_domain'].lower() \ or 'barnard' in result['email_domain'].lower(): found = True break elif words_match2(instructor_name, result['name']): logger.info(
def newGenerateCitedGoogleScholarCSV(list_of_researchers): grid = [] #titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL'] #grid.append(titles_array) outputGrid = [] output_columns = ['Name', 'URL', 'SuccessfullyScraped'] for i in range(0, len(list_of_researchers)): #researcher = list_of_researchers.iloc[i][0] researcher = "" researcher_URL = list_of_researchers.iloc[i][1] researcherID = researcher_URL.replace("https://scholar.google.com/citations?user="******"") researcherID = researcherID.replace("&hl=en&oi=ao", "") #print(researcher) search_query = '' author = '' publications = [] try: search_query = scholarly.search_author_id(researcherID) author = search_query.fill() researcher = author.name print(researcher) if author != None: publications = author.publications outputGrid.append([researcher, str(researcher_URL), "True"]) else: search_query = 'error' author = 'error' outputGrid.append([researcher, str(researcher_URL), "False"]) except: try: search_query = next(scholarly.search_author(list_of_researchers.iloc[i][0])) author = search_query.fill() researcher = author.name print(researcher) if author != None: publications = author.publications outputGrid.append([researcher, str(researcher_URL), "True"]) else: search_query = 'error' author = 'error' outputGrid.append([researcher, str(researcher_URL), "False"]) except: search_query = 'error' author = 'error' outputGrid.append([researcher, str(researcher_URL), "False"]) if search_query != 'error': # Gather the author, URL, title, abstract, keywords, citations, affiliation, publication year, # and picture URL from each researcher's publication for i in range(0, 50): print(i) name = researcher url = '' try: url = researcher_URL except: url = 'error' pictureURL = '' try: pictureURL = "https://scholar.googleusercontent.com/citations?view_op=medium_photo&user="******"' + title + '"' title = title.replace(",", "") #title = regex.sub(' ', title) except: title = 'error' 
#title = title.replace(',', ' ') abstract = '' try: #abstract = clean_abstract(str(author.publications[i].fill().bib['abstract'])) abstract = str(author.publications[i].fill().bib['abstract']) #abstract = '"' + abstract + '"' abstract = abstract.replace(",", "") #abstract = regex.sub(' ', abstract) except: abstract = 'error' #abstract = abstract.replace(',', ' ') interests = '' try: for i in range(0, len(author.interests)): if len(interests) == 0: interests = str(author.interests[i]) #interests = regex.sub(' ', interests) else: nextInterest = str(author.interests[i]) #nextInterest = regex.sub(' ', nextInterest) interests = interests + '/' + nextInterest except: interests = 'error' interests = interests.replace(',',' ') citations = '' try: citations = str(author.citedby) except: citations = 'error' citations = citations.replace(',',' ') affiliation = '' try: affiliation = str(author.affiliation) # Take note of position of commas using unusual alphabetic series affiliation = affiliation.replace(',','XYZ') #affiliation = regex.sub(' ', affiliation) affiliation = affiliation.replace('XYZ',',') except: affiliation = 'error' # Default set the year to -1 since the information # isn't relevant for analysis of this publication set year = -1 affiliation = affiliation.replace(',',' / ') test_array = [str(name), url, str(title), str(abstract), str(interests), citations, str(affiliation), year, pictureURL] grid.append(test_array) return grid, outputGrid
import sys
import toml
from scholarly import scholarly

# Fetch the Quasar project's Google Scholar profile and fill in basics,
# indices, citation counts and publications.
print("Searching on Google scholar")
author = scholarly.search_author_id('_7AMrKgAAAAJ')  # _7AMrKgAAAAJ is Quasar
quasar_stats = scholarly.fill(
    author, sections=['basics', 'indices', 'counts', 'publications'])
scholarly.pprint(quasar_stats)

# What papers cited our publications?
cit = []
for pub in quasar_stats['publications']:
    print(pub)
    cit.append(
        [citation for citation in scholarly.citedby(pub)]
    )
    # limit the number of test runs because this will get blocked by Google quickly
print(
    f'There are currently {len(quasar_stats["publications"])} Quasar papers.')
for pub in quasar_stats['publications']:
    print(' ', pub['bib']['title'])
# Flatten the per-publication citation lists into one list.
fcit = [item for sublist in cit for item in sublist]  # this is a flat list now
print(f'\nWe have {len(fcit)} citations so far for our Quasar papers.')
# I wonder if this can be done in fewer lines. :D
from scholarly import scholarly as schl import pandas as pd import requests labull = schl.search_author_id('Lszt1B4AAAAJ') schl.pprint(labull) # pubs associated w labull auth = schl.fill(labull, sections=['publications', 'indices'], sortby='year') # fill pub info pub_info = [ schl.fill(pub, sections=['bib', 'pub_url']) for pub in auth['publications'] ] # co-authors pub_auth = [pub['bib']['author'] for pub in pub_info] pub_surnames = [] for auth_list in pub_auth: sn = [auth.split()[-1] for auth in auth_list.split(' and ')] sn[sn.index('Bull')] = '**Bull**' pub_surnames.append(sn) # journal # pub_jnl = [pub['bib'].get('journal') for pub in pub_info] # pub info string publ_entry = [ ', '.join(pub_surnames[pp]) + ' ' + # author surnames '[' + pub['bib']['title'] + ']' + '(' + pub.get('pub_url') + ')' + ' ' + # title w links '(' + str(pub['bib']['pub_year']) + ')' # yr
while hasSignificantFaculty: r = requests.get(url) htmlSrc = r.text facultyIDSet = set() index = htmlSrc.find(USER_PROFILE_PREFIX) while index != -1: quote = htmlSrc.find("\"", index) facultyID = htmlSrc[index + len(USER_PROFILE_PREFIX):quote] if facultyID not in facultyIDSet: facultyIDSet.add(facultyID) author = scholarly.search_author_id(facultyID) print("Filling publication and citation data for " + author.name) author.fill(['publications', 'counts']) print("Checking if faculty is significant enough to continue") citedby = 0 for i in author.cites_per_year.values(): citedby += i if citedby < lastUpdatedH: hasSignificantFaculty = False break print("Adding " + author.name + "\'s publications: ", end="") c = 0 for pub in author.publications:
def newGenerateRecentGoogleScholarCSV(list_of_researchers):
    """Scrape each researcher's 50 most recent publications into a grid.

    NOTE(review): two statements below are CORRUPTED in this dump — a
    redaction ('******') truncated the ``researcherID`` replace() call and
    fused the ``pictureURL`` assignment with the title-quoting line. Recover
    the originals from version control before running.

    :param list_of_researchers: dataframe-like; column 1 holds profile URLs.
    :return: (grid, outputGrid) — publication rows and per-researcher
        scrape-success rows.
    """
    grid = []
    #titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
    #grid.append(titles_array)
    outputGrid = []
    output_columns = ['Name', 'URL', 'SuccessfullyScraped']
    for i in range(0, len(list_of_researchers)):
        #researcher = list_of_researchers.iloc[i][0]
        researcher = ""
        researcher_URL = list_of_researchers.iloc[i][1]
        # CORRUPTED: redaction swallowed part of this replace() call.
        researcherID = researcher_URL.replace("https://scholar.google.com/citations?user="******"")
        researcherID = researcherID.replace("&hl=en&oi=ao", "")
        #print(researcher)
        search_query = ''
        author = ''
        publications = []
        try:
            search_query = scholarly.search_author_id(researcherID)
            author = search_query.fill()
            researcher = author.name
            print(researcher)
            if author != None:
                publications = author.publications
                outputGrid.append([researcher, str(researcher_URL), "True"])
            else:
                search_query = 'error'
                author = 'error'
                outputGrid.append([researcher, str(researcher_URL), "False"])
        except:
            search_query = 'error'
            author = 'error'
            outputGrid.append([researcher, str(researcher_URL), "False"])
        if search_query != 'error':
            # Find the min year and the max year of publication
            max_year = 0
            min_year = 3000
            for publication in publications:
                try:
                    if int(publication.bib['year']) > max_year:
                        max_year = int(publication.bib['year'])
                    if int(publication.bib['year']) < min_year:
                        min_year = int(publication.bib['year'])
                except:
                    max_year = max_year
            # Only go to 50 publications
            counter = 50
            # Go through the publications in descending order of publication
            for i in range(0, max_year - min_year + 1):
                if counter == 0:
                    # NOTE(review): reassigning the loop variable does NOT
                    # terminate a Python for-loop — confirm `break` was meant.
                    i = max_year - min_year + 1
                else:
                    # Look through publications for all publications in that year
                    print("Year " + str(max_year - i))
                    for publication in publications:
                        try:
                            if int(publication.bib['year']) == max_year - i and counter > 0:
                                counter = counter - 1
                                print(counter)
                                name = researcher
                                url = ''
                                try:
                                    url = researcher_URL
                                except:
                                    url = 'error'
                                pictureURL = ''
                                try:
                                    # CORRUPTED: redaction fused pictureURL and title handling.
                                    pictureURL = "https://scholar.googleusercontent.com/citations?view_op=medium_photo&user="******"' + title + '"'
                                    title = title.replace(",", "")
                                    #title = regex.sub(' ', title)
                                except:
                                    title = 'error'
                                #title = title.replace(',', ' ')
                                abstract = ''
                                try:
                                    #abstract = clean_abstract(str(publication.fill().bib['abstract']))
                                    abstract = str(publication.fill().bib['abstract'])
                                    #abstract = '"' + abstract + '"'
                                    abstract = abstract.replace(",", "")
                                    #abstract = regex.sub(' ', abstract)
                                except:
                                    abstract = 'error'
                                #abstract = abstract.replace(',', ' ')
                                interests = ''
                                try:
                                    for j in range(0, len(author.interests)):
                                        if len(interests) == 0:
                                            interests = str(author.interests[j])
                                            #interests = regex.sub(' ', interests)
                                        else:
                                            nextInterest = str(author.interests[j])
                                            #nextInterest = regex.sub(' ', nextInterest)
                                            interests = interests + '/' + nextInterest
                                except:
                                    interests = 'error'
                                interests = interests.replace(',',' ')
                                citations = ''
                                try:
                                    citations = str(author.citedby)
                                except:
                                    citations = 'error'
                                citations = citations.replace(',',' ')
                                affiliation = ''
                                try:
                                    affiliation = str(author.affiliation)
                                    # Take note of position of commas using unusual alphabetic series
                                    affiliation = affiliation.replace(',','XYZ')
                                    #affiliation = regex.sub(' ', affiliation)
                                    affiliation = affiliation.replace('XYZ',',')
                                except:
                                    affiliation = 'error'
                                affiliation = affiliation.replace(',','/')
                                year = 0
                                try:
                                    year = int(publication.bib['year'])
                                except:
                                    year = -1
                                test_array = [str(name), url, str(title), str(abstract), str(interests), citations, str(affiliation), year, pictureURL]
                                grid.append(test_array)
                        except:
                            test_array = []
    return grid, outputGrid
def get_research(self, lod: List[ExcelData]) -> List[ResearchData]: """ Takes list of professor data, hands it to scholarly, returns list of research data """ # List of professors with their attached research lor = [] # List[Research] data: ExcelData for data in lod: if DEBUG: print(f"Trying {data.lab}...") lab = None if DEBUG and False: # Don't make extra API calls (worried about 429), instead load in "shelved" data # If you don't have any data shelved, add the else case here one once; # you could also run p.fill() on the pubs if you want to save # filled data data = shelve.open("data") lab = data["biot"] else: lab = scholarly.search_author_id(data.lab_id) try: lab.fill() except AttributeError: print(f"Lab came out as {lab}, which wasn't fillable!") continue # Compile all publications lop = [] # List[Publication] count = 0 # p's type is given by scholarly max: int if DEBUG: max = 9 else: max = 50 for publication in lab.publications: added = False # Only collect 50 publications max per lab if count > max: break try: publication.fill() bib = publication.bib custom_pub = None try: custom_pub = PublicationData( bib["title"], bib["author"], int(bib["year"]), self.get_citations(publication.cites_per_year), bib["publisher"], ) added = True except KeyError: continue # custom_pub = PublicationData( # bib["title"], # "n/a", # int(bib["year"]), # self.get_citations(publication.cites_per_year), # "n/a", # ) # print(bib["title"] + " was missing information") if custom_pub is not None: lop.append(custom_pub) if added: count += 1 except Exception: continue # Attach professor to (sorted) publications research = ResearchData(data.lab, data.lab_id, lop) lor.append(research) print( f"""{TerminalColors.OKGREEN.value}Done gathering research! Now creating the output file...{TerminalColors.ENDC.value}""" ) return lor
from scholarly import scholarly

# Look up the author by Google Scholar id and fill in the full profile
# (publications included).
author = scholarly.search_author_id('V4ycRTQAAAAJ')
pubs = scholarly.fill(author)
# Title of the author's first listed publication.
pub = pubs['publications'][0]['bib']['title']
# Search Scholar for that title and take the first hit.
query = next(scholarly.search_pubs(pub))
# after a few calls, Google starts blocking...
bib = scholarly.bibtex(query)
print(bib)

#query = scholarly.search_pubs("A density-based algorithm for discovering clusters in large spatial databases with noise")
#pub = next(query)
#print(pub)
#print(scholarly.bibtex(pub))
from scholarly import scholarly
import jsonpickle
import json
from datetime import datetime

# NOTE(review): 'RwMPs-8AAAAJ&hl' looks like a URL fragment; Scholar ids are
# normally just 'RwMPs-8AAAAJ' — confirm the trailing '&hl' is intended.
scholar_ids = ['RwMPs-8AAAAJ&hl']

# Fetch each profile, fill it, and dump it to '<author name>.json'.
for id in scholar_ids:
    author = scholarly.search_author_id(id)
    scholarly.fill(author,
                   sections=['basics', 'indices', 'counts', 'publications'])
    name = author['name']
    # BUG FIX: str(datetime.now) stored the repr of the bound method
    # ('<built-in method now ...>'); call it to record an actual timestamp.
    author['updated'] = str(datetime.now())
    scholarly.pprint(author)
    # NOTE(review): jsonpickle.encode already returns a JSON string, so
    # json.dump re-encodes it as one quoted string — confirm the double
    # encoding is wanted.
    author = jsonpickle.encode(author)
    with open(f'{name}.json', 'w') as outfile:
        json.dump(author, outfile)