Example 1
def _add_or_update_academic(google_scholar_id, user_id):
    user = User.query.get(user_id)

    current_app.logger.info(f'Adding or Updating Academic: {google_scholar_id} as user {user}')

    Entrez.email = user.email

    resp = scholarly.fill(scholarly.search_author_id(google_scholar_id), sections=['indices'])

    if resp:
        a = Academic.query.filter(Academic.google_scholar_id == google_scholar_id).one_or_none()

        if a is None:
            a = Academic(google_scholar_id=google_scholar_id)
        
        a.name = resp['name']
        a.affiliation = resp['affiliation']
        a.cited_by = resp['citedby']
        a.h_index = resp['hindex']
        a.i10_index = resp['i10index']
        a.last_update_date = datetime.utcnow()
        a.is_updating = True

        db.session.add(a)
        db.session.commit()

        _update_publications(a)

        a = Academic.query.filter(Academic.google_scholar_id == google_scholar_id).one()
        a.is_updating = False
        db.session.add(a)
        db.session.commit()

    current_app.logger.info(f'Adding or Updating Academic Completed: {google_scholar_id}')
Example 2
def plot_citations(author_name):
    m = Basemap(projection='mill', lon_0=180)
    m.drawmapboundary(fill_color='aqua')
    m.fillcontinents(color='coral', lake_color='aqua')

    search_query = scholarly.search_author(author_name)
    author = next(search_query).fill()
    print(author)
    for pub in [author.publications[0]]:
        print('Title: ', pub.bib['title'])
        pub = pub.fill()
        sleep(45)
        for citation in pub.citedby:
            print(citation)
            sleep(45)
            # pop author IDs until a non-empty one turns up; skip the citation
            # when the list is missing or exhausted (the original loop could
            # pop from an empty list and raise an IndexError)
            author_ids = list(citation.bib.get('author_id', []))
            firstAuthorId = None
            while author_ids and not firstAuthorId:
                firstAuthorId = author_ids.pop()
            if not firstAuthorId:
                continue
            print(firstAuthorId)
            author = scholarly.search_author_id(firstAuthorId)
            sleep(45)
            lat, lon = get_location(author.affiliation)
            x, y = m(float(lon), float(lat))
            m.plot(x, y, marker='D')
    plt.show()
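
Note: the fixed sleep(45) pauses above are a crude guard against Google's rate limiting. A sketch of an alternative, assuming a scholarly release that ships ProxyGenerator (the free-proxy pool is unreliable, so treat this as illustrative):

from scholarly import scholarly, ProxyGenerator

# Route scholarly's requests through rotating free proxies instead of sleeping.
pg = ProxyGenerator()
if pg.FreeProxies():  # returns False if no working proxy could be found
    scholarly.use_proxy(pg)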
Example 3
def fetch_citer_by_author(author):
    # first, get the author entry

    # check whether `author` is a name or an ID
    # (placeholder condition: the name branch is always taken for now)
    if True:
        author_gen = proxied_search_author(author)
    else:  # ID lookup
        author_gen = scholarly.search_author_id(author)

    # what happens if there is more than one author?
    # check for that and raise an error
    matches = list(author_gen)

    assert (len(matches) == 1), "Author query not unique"
    author = matches[0]
    author = proxied_author_fill(author)

    # second, fetch all publications

    #authors_publications = [pub.bib['title'] for pub in author.publications]
    #print(authors_publications)

    # third, use these publications to query for citations
    citers = []
    for kk, pub in enumerate(author.publications):
        print(kk)
        # fetch the full publication entry to get citations
        pub = proxied_pub_fill(pub)
        # accumulate across publications instead of overwriting each iteration
        citers.extend(citation.bib['title'] for citation in pub.citedby)

    return citers
Example 4
def main(args):
    gs_author = {}
    gs_pubs = []

    if not args.force:
        gs_author = load('google_scholar_author.pkl') or {}
        gs_pubs = load('google_scholar_publications.pkl') or []

    gs_author = scholarly.search_author_id('SZR6mXsAAAAJ')
    gs_author = scholarly.fill(gs_author)
    save(gs_author, 'google_scholar_author.pkl')

    current_pubs_ids = {p['author_pub_id'] for p in gs_pubs}
    author_pubs_ids = {p['author_pub_id'] for p in gs_author['publications']}
    new_pubs_ids = author_pubs_ids - current_pubs_ids
    new_pubs = [
        p for p in gs_author['publications']
        if p['author_pub_id'] in new_pubs_ids
    ]

    # TODO update based on info available on gs_author

    for p in tqdm(new_pubs):
        p = scholarly.fill(p)
        gs_pubs.append(p)
        save(gs_pubs, 'google_scholar_publications.pkl')
Example 5
    def add_publications(self, request):
        owner_id = self.request.user.id
        owner = get_object_or_404(User, pk=owner_id)
        Publication.objects.filter(
            owner=owner
        ).delete()
        author_id = request.GET.get('author_id', None)
        author_basic = scholarly.search_author_id(author_id)
        author = scholarly.fill(author_basic)
        for publication in author['publications'][:25]:
            # reset per publication so stale keys from the previous record don't leak
            data = {}
            publication_info = publication['bib']
            if 'title' in publication_info:
                data['title'] = publication_info['title']
            if 'pub_year' in publication_info:
                data['publication_year'] = publication_info['pub_year']
            if 'author_pub_id' in publication:
                elem = publication['author_pub_id'].split(':')
                base_link = "https://scholar.google.com/citations?user={}" + \
                    "#d=gs_md_cita-d&u=%2Fcitations%3Fview_op%3D" +\
                    "view_citation%26user%3D{}%26citation_for_view%3D{}%3A{}"
                data['link'] = base_link.format(
                    elem[0], elem[0], elem[0], elem[1])
            if 'num_citations' in publication:
                data['citation_number'] = publication['num_citations']
            data['owner'] = owner_id

            serializer = PublicationSerializer(data=data)
            serializer.is_valid(raise_exception=True)
            Publication.objects.create(
                **serializer.validated_data)

        return Response(status=status.HTTP_201_CREATED)
Example 6
def fetch_scholar_author(gsID, fill=True):
    '''Queries Google Scholar for a given author.
    Also fills all stats if fill is True (a bit slower).'''
    author = scholarly.search_author_id(gsID)
    if fill:
        author = scholarly.fill(
            author, sections=['publications', 'basics', 'indices', 'counts'])
    return author
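
A usage sketch for the helper above (reusing the Curie ID from the tests below):

author = fetch_scholar_author('EmD_lTEAAAAJ', fill=False)  # fast: basic profile only
author = fetch_scholar_author('EmD_lTEAAAAJ')              # slower: stats filled in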
Example 7
 def test_search_author_id(self):
     """
     Test the search by author ID. Marie Skłodowska-Curie's ID is
     EmD_lTEAAAAJ and these IDs are permanent.
     """
     author = scholarly.search_author_id('EmD_lTEAAAAJ')
     self.assertEqual(author.name, u'Marie Skłodowska-Curie')
     self.assertEqual(author.affiliation,
                      u'Institut du radium, University of Paris')
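
Note: the examples on this page mix two scholarly APIs. Older releases, as in the test above, returned Author objects with attributes and a .fill() method; since the 1.x rewrite, search_author_id returns a plain dict that scholarly.fill updates in place. A minimal sketch of the dict style, assuming scholarly >= 1.0:

from scholarly import scholarly

author = scholarly.search_author_id('EmD_lTEAAAAJ')  # a plain dict in scholarly >= 1.0
scholarly.fill(author, sections=['basics', 'indices'])
print(author['name'], author['affiliation'], author['hindex'])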
Example 8
def main(args):
    print('Querying Google Scholar ...')
    author = scholarly.search_author_id('SZR6mXsAAAAJ')
    author = scholarly.fill(author, sections=['publications'])
    pubs = map(_parse, author['publications'])
    cites = {x[0]: x[1] for x in pubs if x[1] > 0}
    # jsonp = 'cites = {};'.format()
    jsonp = json.dumps(cites)
    with open(args.output, 'w') as out:
        out.write(jsonp)
Example 9
    def background(self):
        self.data = []

        if self.test:
            self.data = [25, 30]
            return

        # Matthew, Jorgen, Igor
        for id in ["wxM0Gh8AAAAJ", "hfeXoYMAAAAJ", "rSVxxwsAAAAJ"]:
            self.data.append(scholarly.search_author_id(id).get("citedby"))
Example 10
    def background(self):
        self.data = []

        if self.test:
            self.data = [25, 30]
            return

        # Matthew, Jorgen, Igor
        for id in ["wxM0Gh8AAAAJ", "hfeXoYMAAAAJ"]:  # , "rSVxxwsAAAAJ"]:
            author = scholarly.search_author_id(id).fill()
            count = sum(int(paper.bib["cites"])
                        for paper in author.publications)
            self.data.append(count)
Example 11
 def test_search_author_id_filled(self):
     """
     Test the search by author ID. Marie Skłodowska-Curie's ID is
     EmD_lTEAAAAJ and these IDs are permanent.
     As of July 2020, Marie Skłodowska-Curie has 1963 citations
     on Google Scholar and 179 publications
     """
     author = scholarly.search_author_id('EmD_lTEAAAAJ', filled=True)
     self.assertEqual(author.name, u'Marie Skłodowska-Curie')
     self.assertEqual(author.affiliation,
                      u'Institut du radium, University of Paris')
     self.assertGreaterEqual(author.citedby, 1963)
     self.assertGreaterEqual(len(author.publications), 179)
Example 12
def extract_coauthors_by_id(author_id):
    """
        extracts the co-authors of the currently existing authors in the dataset
    """
    # create the output file

    author = scholarly.search_author_id(author_id)
    filled_coauthors = scholarly.fill(author, ['coauthors'])

    coauthors_list = filled_coauthors['coauthors']
    for author in coauthors_list:
        filled_author = scholarly.fill(author, ['indices'])
        register_coauthering(author_id, filled_author['scholar_id'])
        print(filled_author)
        mydict = filled_author_to_dict(filled_author)
        write_author(mydict, AUTHORS_CSV_FILE_OUTPUT_COAUTHORS)
Example 13
def get_papers_for_author(author_id):
    '''
        Gets and registers the papers of an author
    '''
    print("getting paper for author " + author_id)
    author = scholarly.search_author_id(author_id)
    filled_publications = scholarly.fill(author, ['publications'])
    publications_list = filled_publications['publications']
    nbpubs_counter = 0
    for publication in publications_list:
        filled_publication = scholarly.fill(publication)
        mydict = tiny_publication_to_dict(filled_publication)
        write_publication(mydict, PUBLICATIONS_CSV_FILE_OUTPUT)
        nbpubs_counter += 1
        print("nbpubs_counter =====>")
        print(nbpubs_counter)
        if nbpubs_counter >= NB_MAX_PAPERS_PER_AUTHOR:
            break
Example 14
def download_citations():
    # Retrieve the author's data, fill-in, and print
    # search_query = scholarly.search_author(NAME)
    search_query = scholarly.search_author_id(AUTHOR_ID)
    # author = scholarly.fill(next(search_query))
    author = scholarly.fill(search_query)
    print(author)

    # Print the titles of the author's publications
    print([pub['bib']['title'] for pub in author['publications']])

    # Take a closer look at the first publication
    # pub = scholarly.fill(author['publications'][1])
    # print(pub)
    independent_citations = []
    for pub in author['publications'][:]:
        res_dict = {}
        time.sleep(random.randint(WAIT, WAIT * 2))
        pub = scholarly.fill(pub)
        res_dict["title"] = pub['bib']["title"]
        res_dict["year"] = pub['bib']["pub_year"]
        print(pub['bib']["title"])
        res_dict["author"] = [name.strip() for name in pub['bib']["author"].split("and")]
        time.sleep(random.randint(WAIT, WAIT * 2))
        # citedby() returns a generator, which is always truthy; materialize it
        # so the emptiness check below actually works
        cited_this = list(scholarly.citedby(pub))
        if cited_this:
            res_dict['cited_this'] = [{"author": citation['bib']["author"], "title": citation['bib']["title"]} for
                                      citation
                                      in
                                      cited_this]
            indep_citations = print_citations(res_dict)
            res_dict['independent_citations'] = indep_citations
            independent_citations.append(
                {"title": res_dict["title"], "author": res_dict["author"], 'independent_citations': indep_citations})
            save_json(res_dict['title'], res_dict)
        else:
            break

    save_json("independent_citations.json", independent_citations)
Example 15
def extract_interests(input_output_file):
    df = get_authors_dataframe(input_output_file)
    df = df.astype({"interests": str})
    df = df.astype({"url_picture": str})
    print("file readed successfully")
    for index, row in df.iterrows():
        print('interest====>' + str(row['interests']))
        if row['interests'] in ("", None, "nan") or pd.isna(row['interests']):
            print("Getting interests of author :" + row['scholar_id'])
            try:
                author = scholarly.search_author_id(row['scholar_id'])
                # default to empty strings so the assignments below never
                # reference unbound names when a key is missing
                interests = ''
                url_picture = ''
                if 'interests' in author:
                    interests = '|'.join(author['interests'])
                if 'url_picture' in author:
                    url_picture = author['url_picture']

                df.at[index, 'interests'] = interests
                df.at[index, 'url_picture'] = url_picture
                update_authors_dataframe(input_output_file, df)
            except Exception as identifier:
                print(
                    "An exception happened while getting interests of : " + row['scholar_id'])
                df.at[index, 'interests'] = 'error'
                print(identifier.args)
                update_authors_dataframe(input_output_file, df)
Example 16
    def fetch_auth_data_google_scholar(self):
        base_url = 'https://scholar.google.com/citations?user='
        authors = []
        # the source masked this span; assuming the stored IDs live in self.auth_data
        for aid in self.auth_data['author_id']:
            if aid is not None:
                author = scholarly.search_author_id(aid)
                authors.append(
                    author.fill(
                        sections=['basics', 'indices', 'publications']))

        author_info = []
        for author in authors:
            a_dict = {'authorId': author.id}
            auth_url = base_url + a_dict['authorId']
            a_dict['Publications'] = len(author.publications)
            a_dict['Citations'] = author.citedby
            a_dict['h-index'] = author.hindex
            front_end_info = {
                'meta': 'auth_meta',
                'name': author.name,
                'publications': a_dict['Publications'],
                'url': auth_url,
                'citations': a_dict['Citations'],
                'h_index': a_dict['h-index']
            }
            print("GScholar Info: ", front_end_info)
            # self.socketio.emit('server_response', front_end_info, namespace='')
            author_info.append(a_dict)
        if len(author_info) == 0:
            front_end_info = {
                'meta': 'auth_meta',
                'author_info_not_found': '1'
            }
            # self.socketio.emit('server_response', front_end_info, namespace='')
            print("GScholar Info: ", front_end_info)
        return pd.DataFrame(author_info)
Example 17
def pubs(WIDTH):
    def sort_items(item):
        """Sort special variables first, then alphabetically."""
        try:
            return int(item["bib"]["pub_year"])
        except KeyError:
            return 0

    myid = "8eDOmAQAAAAJ"  # google scholar ID
    me = scholarly.search_author_id(myid, filled=True)

    pubs = pi.Report("Publications", accent=orange, dim=amber)
    pubs.width = WIDTH

    pubs.add(gscholar_bio(me, WIDTH), "rich")
    pubs.spacer(2)

    year = None
    for n, pub in enumerate(
            sorted(me["publications"], key=sort_items, reverse=True)):
        cites = pub["num_citations"]
        pub = scholarly.fill(pub)["bib"]

        if "pub_year" not in pub.keys():
            continue

        # Mark year
        if pub["pub_year"] != year:

            if n > 0:
                pubs.line(amber)
                pubs.spacer()

            pubs.add(f'[bold {salmon_light}]{pub["pub_year"]}',
                     justify="center")
            year = pub["pub_year"]
            pubs.spacer()

        # add title
        pubs.add(f"[bold italic {orange_light}]" + pub["title"])

        # add authors
        try:
            auths = pub["author"].replace(" and", ",")
        except KeyError:
            auths = ""

        names = ["F Claudi", "Federico Claudi", "F. Claudi"]
        formatted_auths = ""
        for author in auths.split(","):
            if author.strip() in names:
                formatted_auths += f"[bold {pink}]{author}[/bold {pink}],"
            else:
                formatted_auths += f"[{blue_grey}]{author}[/{blue_grey}],"

        pubs.add(formatted_auths)

        # Add journal
        if "eprint" in pub.keys():
            url = pub["eprint"]
        elif "url" in pub.keys():
            url = pub["url"]
        else:
            url = ""

        try:
            journal = pub["journal"]
        except KeyError:
            journal = ""

        pubs.add(f"[i {blue_grey_light}]" + journal +
                 f"[/i {blue_grey_light}]" + "[dim]\n" + url)

        # Add citations
        pubs.add(
            f"[{blue_grey_light}]Citations: [{orange}]{cites}",
            justify="right",
        )

        pubs.spacer()

    return pubs
Example 18
from scholarly import scholarly
import jsonpickle
import json
from datetime import datetime
import os

author: dict = scholarly.search_author_id(os.environ['GOOGLE_SCHOLAR_ID'])
scholarly.fill(author, sections=['basics', 'indices', 'counts', 'publications'])
name = author['name']
author['updated'] = str(datetime.now())
author['publications'] = {v['author_pub_id']:v for v in author['publications']}
print(json.dumps(author, indent=2))
os.makedirs('results', exist_ok=True)
with open(f'results/gs_data.json', 'w') as outfile:
    json.dump(author, outfile, ensure_ascii=False)

shieldio_data = {
  "schemaVersion": 1,
  "label": "citations",
  "message": f"{author['citedby']}",
}

with open(f'results/gs_data_shieldsio.json', 'w') as outfile:
    json.dump(shieldio_data, outfile, ensure_ascii=False)
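
The second file follows the shields.io endpoint-badge schema (schemaVersion/label/message), so once it is hosted at a public URL it can back a live citation badge, e.g. https://img.shields.io/endpoint?url=<raw URL of gs_data_shieldsio.json>.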
Example 19
    return first, itertools.chain([first], iterable)


for index, row in data.iterrows():
    # Retrieve the author's data, fill-in, and print
    # search_query = scholarly.search_author('Steven A Cholewiak')
    # no generator needed here: search_author_id always returns a single profile
    name = row['Last name'].strip() + " " + row['First name'].strip()
    url = row['Google scholar']

    print("Author: ", name)

    if not pd.isna(url):
        print("Research Profile: ", url)
        parsed = urlparse.urlparse(url)
        search_query = scholarly.search_author_id(
            parse_qs(parsed.query)['user'][0])
        author = search_query.fill()

        print(search_query)
        print(author)
        # break
        # Print the titles of the author's publications
        # print([pub.bib['title'] for pub in author.publications])

        # Take a closer look at the first publication
        # pub = author.publications[0].fill()
        # print(pub)

        # Which papers cited that publication?
        # print([citation.bib['title'] for citation in pub.citedby])
        # break
Example 20
num_possible = 0
for instructor_name in names:
    if not instructors_internal_db.check_its_time(
            instructor_name, 'gscholar_last_search', UPDATE_MIN_DAYS,
            UPDATE_MAX_DAYS):
        continue
    if instructors[instructor_name]['gscholar'] \
            and instructors_internal_db.check_its_time(instructor_name, 'gscholar_last_update',
                                                       UPDATE_MIN_DAYS, UPDATE_MAX_DAYS):
        # skip: 'gscholar_last_update' has not expired and gscholar data already exists
        continue

    # search or update
    if instructors[instructor_name]['gscholar']:
        logger.info("Updating instructor: %s", instructor_name)
        result = scholarly.search_author_id(
            instructors[instructor_name]['gscholar']['scholar_id'])
        found = True
    else:
        logger.info("Searching instructor: %s", instructor_name)
        results = scholarly.search_author(instructor_name)
        # print("Our db:", instructors[instructor_name])
        found = False
        for result in results:
            # print("Google Scholar result: ", result)
            if ('columbia' in result['affiliation'].lower() and 'british' not in result['affiliation'].lower()) \
                    or 'columbia' in result['email_domain'].lower() \
                    or 'barnard' in result['email_domain'].lower():
                found = True
                break
            elif words_match2(instructor_name, result['name']):
                logger.info(
Example 21
def newGenerateCitedGoogleScholarCSV(list_of_researchers):
    grid = []

    #titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
    #grid.append(titles_array)

    outputGrid = []
    output_columns = ['Name', 'URL', 'SuccessfullyScraped']

    for i in range(0, len(list_of_researchers)):

    	#researcher = list_of_researchers.iloc[i][0]
    	researcher = ""
    	researcher_URL = list_of_researchers.iloc[i][1]

    	# strip the profile-URL prefix and residue to recover the bare ID
    	researcherID = researcher_URL.replace("https://scholar.google.com/citations?user=", "")
    	researcherID = researcherID.replace("&hl=en&oi=ao", "")

    	#print(researcher)
    	search_query = ''
    	author = ''
    	publications = []

    	try:
    		search_query = scholarly.search_author_id(researcherID)
    		author = search_query.fill()

    		researcher = author.name
    		print(researcher)

    		if author is not None:
    			publications = author.publications
    			outputGrid.append([researcher, str(researcher_URL), "True"])
    		else:
    			search_query = 'error'
    			author = 'error'
    			outputGrid.append([researcher, str(researcher_URL), "False"])

    	except:
    		try:
    			search_query = next(scholarly.search_author(list_of_researchers.iloc[i][0]))
    			author = search_query.fill()

    			researcher = author.name
    			print(researcher)

    			if author is not None:
    				publications = author.publications
    				outputGrid.append([researcher, str(researcher_URL), "True"])
    			else:
    				search_query = 'error'
    				author = 'error'
    				outputGrid.append([researcher, str(researcher_URL), "False"])

    		except:
    			search_query = 'error'
    			author = 'error'
    			outputGrid.append([researcher, str(researcher_URL), "False"])

    	if search_query != 'error':

	    	# Gather the author, URL, title, abstract, keywords, citations, affiliation, publication year,
	    	# and picture URL from each researcher's publication
	    	for i in range(0, 50):
		        print(i)
		        name = researcher
		        url = ''
		        try:
		        	url = researcher_URL
		        except:
		        	url = 'error'
		        pictureURL = ''
		        try:
		        	# the source masked part of this line; presumably the researcher ID is appended
		        	pictureURL = "https://scholar.googleusercontent.com/citations?view_op=medium_photo&user=" + researcherID
		        except:
		        	pictureURL = 'error'
		        title = ''
		        try:
		        	title = str(author.publications[i].bib['title'])
		        	#title = '"' + title + '"'
		        	title = title.replace(",", "")
		        	#title = regex.sub(' ', title)
		        except:
		        	title = 'error'
		        #title = title.replace(',', ' ')
		        abstract = ''
		        try:
		        	#abstract = clean_abstract(str(author.publications[i].fill().bib['abstract']))
		        	abstract = str(author.publications[i].fill().bib['abstract'])
		        	#abstract = '"' + abstract + '"'
		        	abstract = abstract.replace(",", "")
		        	#abstract = regex.sub(' ', abstract)
		        except:
		        	abstract = 'error'
		        #abstract = abstract.replace(',', ' ')
		        interests = ''
		        try:
		        	for j in range(0, len(author.interests)):  # j, not i: avoid clobbering the publication index
		        		if len(interests) == 0:
		        			interests = str(author.interests[j])
		        			#interests = regex.sub(' ', interests)
		        		else:
		        			nextInterest = str(author.interests[j])
		        			#nextInterest = regex.sub(' ', nextInterest)
		        			interests = interests + '/' + nextInterest
		        except:
		        	interests = 'error'
		        interests = interests.replace(',',' ')

		        citations = ''
		        try:
		        	citations = str(author.citedby)
		        except:
		        	citations = 'error'
		        citations = citations.replace(',',' ')

		        affiliation = ''
		        try:
		        	affiliation = str(author.affiliation)

		        	# Take note of position of commas using unusual alphabetic series
		        	affiliation = affiliation.replace(',','XYZ')
		        	#affiliation = regex.sub(' ', affiliation)
		        	affiliation = affiliation.replace('XYZ',',')
		        except:
		        	affiliation = 'error'

		        # Default set the year to -1 since the information 
		        # isn't relevant for analysis of this publication set
		        year = -1

		        affiliation = affiliation.replace(',',' / ')

		        
		        test_array = [str(name), url, str(title), str(abstract), str(interests), citations, str(affiliation), year, pictureURL]
		        grid.append(test_array)

    return grid, outputGrid
Example 22
import sys

import toml
from scholarly import scholarly

print("Searching on Google scholar")

author = scholarly.search_author_id('_7AMrKgAAAAJ')  # _7AMrKgAAAAJ is Quasar

quasar_stats = scholarly.fill(
    author, sections=['basics', 'indices', 'counts', 'publications'])

scholarly.pprint(quasar_stats)

# What papers cited our publications?
cit = []
for pub in quasar_stats['publications']:
    print(pub)
    cit.append(
        [citation for citation in scholarly.citedby(pub)]
    )  # limit the number of test runs because this will get blocked by Google quickly

print(
    f'There are currently {len(quasar_stats["publications"])} Quasar papers.')
for pub in quasar_stats['publications']:
    print(' ', pub['bib']['title'])

fcit = [item for sublist in cit for item in sublist]  # this is a flat list now
print(f'\nWe have {len(fcit)} citations so far for our Quasar papers.')

# I wonder if this can be done in fewer lines. :D
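
The flattening at the end can indeed be done in fewer lines; a sketch using only the standard library (same behavior, still subject to Google blocking requests):

import itertools

# Flatten every publication's citation generator into one list in a single pass.
fcit = list(itertools.chain.from_iterable(
    scholarly.citedby(pub) for pub in quasar_stats['publications']))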
Example 23
from scholarly import scholarly as schl
import pandas as pd
import requests

labull = schl.search_author_id('Lszt1B4AAAAJ')
schl.pprint(labull)

# publications associated with labull
auth = schl.fill(labull, sections=['publications', 'indices'], sortby='year')
# fill pub info
pub_info = [
    schl.fill(pub, sections=['bib', 'pub_url']) for pub in auth['publications']
]

# co-authors
pub_auth = [pub['bib']['author'] for pub in pub_info]
pub_surnames = []
for auth_list in pub_auth:
    sn = [auth.split()[-1] for auth in auth_list.split(' and ')]
    sn[sn.index('Bull')] = '**Bull**'
    pub_surnames.append(sn)

# journal
# pub_jnl = [pub['bib'].get('journal') for pub in pub_info]

# pub info string
publ_entry = [
    ', '.join(pub_surnames[pp]) + ' ' +  # author surnames
    '[' + pub['bib']['title'] + ']' + '(' + pub.get('pub_url') + ')' +
    ' ' +  # title w links
    '(' + str(pub['bib']['pub_year']) + ')'  # yr
Example 24
while hasSignificantFaculty:
    r = requests.get(url)
    htmlSrc = r.text

    facultyIDSet = set()

    index = htmlSrc.find(USER_PROFILE_PREFIX)
    while index != -1:
        quote = htmlSrc.find("\"", index)
        facultyID = htmlSrc[index + len(USER_PROFILE_PREFIX):quote]

        if facultyID not in facultyIDSet:
            facultyIDSet.add(facultyID)

            author = scholarly.search_author_id(facultyID)
            print("Filling publication and citation data for " + author.name)
            author.fill(['publications', 'counts'])

            print("Checking if faculty is significant enough to continue")
            citedby = 0
            for i in author.cites_per_year.values():
                citedby += i

            if citedby < lastUpdatedH:
                hasSignificantFaculty = False
                break

            print("Adding " + author.name + "\'s publications: ", end="")
            c = 0
            for pub in author.publications:
Example 25
def newGenerateRecentGoogleScholarCSV(list_of_researchers):
	grid = []

	#titles_array = ['Author','URL','Title','Abstract','Keywords','Citations','Affiliation','Year', 'PictureURL']
	#grid.append(titles_array)

	outputGrid = []
	output_columns = ['Name', 'URL', 'SuccessfullyScraped']

	for i in range(0, len(list_of_researchers)):

		#researcher = list_of_researchers.iloc[i][0]
		researcher = ""
		researcher_URL = list_of_researchers.iloc[i][1]

		researcherID = researcher_URL.replace("https://scholar.google.com/citations?user=", "")
		researcherID = researcherID.replace("&hl=en&oi=ao", "")

		#print(researcher)
		search_query = ''
		author = ''
		publications = []

		try:
			search_query = scholarly.search_author_id(researcherID)
			author = search_query.fill()

			researcher = author.name
			print(researcher)

			if author is not None:
				publications = author.publications
				outputGrid.append([researcher, str(researcher_URL), "True"])
			else:
				search_query = 'error'
				author = 'error'
				outputGrid.append([researcher, str(researcher_URL), "False"])

		except:
			search_query = 'error'
			author = 'error'
			outputGrid.append([researcher, str(researcher_URL), "False"])

		if search_query != 'error':

			# Find the min year and the max year of publication
			max_year = 0
			min_year = 3000
			for publication in publications:
				try:

					if int(publication.bib['year']) > max_year:
						max_year = int(publication.bib['year'])
					if int(publication.bib['year']) < min_year:
						min_year = int(publication.bib['year'])
				except:
					pass  # publication has no usable year

			# Only go to 50 publications
			counter = 50

			# Go through the publications in descending order of publication
			for i in range(0, max_year - min_year + 1):
				if counter == 0:
					break  # assigning to the loop variable does not stop a range loop
				else:

					# Look through publications for all publications in that year
					print("Year " + str(max_year - i))
					for publication in publications:
						try:
							if int(publication.bib['year']) == max_year - i and counter > 0:

								counter = counter - 1
								print(counter)

								name = researcher
								url = ''

								try:
									url = researcher_URL
								except:
									url = 'error'

								pictureURL = ''
								try:
									pictureURL = "https://scholar.googleusercontent.com/citations?view_op=medium_photo&user="******"' + title + '"'
									title = title.replace(",", "")
									#title = regex.sub(' ', title)
								except:
									title = 'error'
								#title = title.replace(',', ' ')
								abstract = ''
								try:
									#abstract = clean_abstract(str(publication.fill().bib['abstract']))
									abstract = str(publication.fill().bib['abstract'])
									#abstract = '"' + abstract + '"'
									abstract = abstract.replace(",", "")
									#abstract = regex.sub(' ', abstract)
								except:
									abstract = 'error'
								#abstract = abstract.replace(',', ' ')
								interests = ''
								try:
									for j in range(0, len(author.interests)):
										if len(interests) == 0:
											interests = str(author.interests[j])
											#interests = regex.sub(' ', interests)
										else:
											nextInterest = str(author.interests[j])
											#nextInterest = regex.sub(' ', nextInterest)
											interests = interests + '/' + nextInterest
								except:
									interests = 'error'
								interests = interests.replace(',',' ')


								citations = ''
								try:
									citations = str(author.citedby)
								except:
									citations = 'error'
								citations = citations.replace(',',' ')

								affiliation = ''
								try:
									affiliation = str(author.affiliation)

									# Take note of position of commas using unusual alphabetic series
									affiliation = affiliation.replace(',','XYZ')
									#affiliation = regex.sub(' ', affiliation)
									affiliation = affiliation.replace('XYZ',',')

								except:
									affiliation = 'error'
								affiliation = affiliation.replace(',','/')

								year = 0
								try:
									year = int(publication.bib['year'])
								except:
									year = -1

								test_array = [str(name), url, str(title), str(abstract), str(interests), citations, str(affiliation), year, pictureURL]
								grid.append(test_array)
						except:
							test_array = []
	

	return grid, outputGrid
Example 26
    def get_research(self, lod: List[ExcelData]) -> List[ResearchData]:
        """
        Takes list of professor data, hands it to scholarly, returns list of research data
        """

        # List of professors with their attached research
        lor = []  # List[Research]

        data: ExcelData
        for data in lod:
            if DEBUG:
                print(f"Trying {data.lab}...")

            lab = None
            if DEBUG and False:
                # Don't make extra API calls (worried about 429); instead load in "shelved" data.
                # If you don't have any data shelved, add the else case here once;
                # you could also run p.fill() on the pubs if you want to save
                # filled data.
                shelf = shelve.open("data")  # separate name so the loop's `data` isn't clobbered
                lab = shelf["biot"]
            else:
                lab = scholarly.search_author_id(data.lab_id)
                try:
                    lab.fill()
                except AttributeError:
                    print(f"Lab came out as {lab}, which wasn't fillable!")
                    continue

            # Compile all publications
            lop = []  # List[Publication]

            count = 0
            # p's type is given by scholarly
            max_pubs: int  # renamed from `max` to avoid shadowing the builtin
            if DEBUG:
                max_pubs = 9
            else:
                max_pubs = 50
            for publication in lab.publications:
                added = False
                # Only collect `max_pubs` publications per lab
                if count > max_pubs:
                    break
                try:
                    publication.fill()
                    bib = publication.bib
                    custom_pub = None
                    try:
                        custom_pub = PublicationData(
                            bib["title"],
                            bib["author"],
                            int(bib["year"]),
                            self.get_citations(publication.cites_per_year),
                            bib["publisher"],
                        )
                        added = True
                    except KeyError:
                        continue
                        # custom_pub = PublicationData(
                        #     bib["title"],
                        #     "n/a",
                        #     int(bib["year"]),
                        #     self.get_citations(publication.cites_per_year),
                        #     "n/a",
                        # )
                        # print(bib["title"] + " was missing information")
                    if custom_pub is not None:
                        lop.append(custom_pub)
                    if added:
                        count += 1
                except Exception:
                    continue

            # Attach professor to (sorted) publications
            research = ResearchData(data.lab, data.lab_id, lop)
            lor.append(research)
        print(
            f"""{TerminalColors.OKGREEN.value}Done gathering research! Now creating the output file...{TerminalColors.ENDC.value}"""
        )
        return lor
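
The "shelved" data workaround described in the comments above can be made explicit; a minimal sketch for caching a fetched author locally so repeated debug runs avoid extra API calls (the "data" filename and "biot" key mirror the example; the ID reuses the Curie profile from the tests earlier):

import shelve
from scholarly import scholarly

# Fetch once, then reuse the shelved copy on later runs to avoid HTTP 429 errors.
with shelve.open("data") as db:
    if "biot" not in db:
        db["biot"] = scholarly.search_author_id("EmD_lTEAAAAJ")
    lab = db["biot"]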
Example 27
from scholarly import scholarly

author = scholarly.search_author_id('V4ycRTQAAAAJ')

author = scholarly.fill(author)
title = author['publications'][0]['bib']['title']

query = next(scholarly.search_pubs(title))  # after a few calls, Google starts blocking...
bib = scholarly.bibtex(query)
print(bib)

#query = scholarly.search_pubs("A density-based algorithm for discovering clusters in large spatial databases with noise")
#pub = next(query)
#print(pub)
#print(scholarly.bibtex(pub))
Example 28
from scholarly import scholarly
import jsonpickle
import json
from datetime import datetime

scholar_ids = ['RwMPs-8AAAAJ']  # ID only; the stray '&hl' URL fragment would break the lookup

for id in scholar_ids:
    author = scholarly.search_author_id(id)
    scholarly.fill(author,
                   sections=['basics', 'indices', 'counts', 'publications'])
    name = author['name']
    author['updated'] = str(datetime.now())
    scholarly.pprint(author)
    author = jsonpickle.encode(author)

    with open(f'{name}.json', 'w') as outfile:
        # jsonpickle already produced a JSON string; json.dump would double-encode it
        outfile.write(author)