def report(query, label, refresh=True):
    """Print out an org-mode report for search results.

    The report is written to standard output.  It contains a table of
    document types, author and journal publication counts, the most
    cited papers together with their h-index, a histogram of the number
    of authors per publication and a bibliography.  Only results whose
    ``aggregationType`` equals ``"Journal"`` enter the author, journal,
    citation, histogram and bibliography sections.

    As a side effect the histogram is saved as
    ``<label>-nauthors-per-publication.png`` (a relative path, so it is
    resolved against the current working directory).

    Parameters
    ----------
    query : str
        The search query based on which results the report should
        be generated.

    label : str
        The label used in the document title ("Report for ...").  It is
        also used in the table captions and in the name of the saved
        figure.

    refresh : bool (optional, default=True)
        Whether to refresh a cached file containing results of a
        previous query or not.  The value is forwarded to
        ``ScopusSearch`` and to ``get_subject_docs``.
    """
    def split_field(value):
        # NOTE(review): presumably a result without authors carries None
        # here; without the guard `.split` would raise AttributeError.
        return [] if value is None else value.split(";")

    # Header
    print('*** Report for {}\n'.format(label))
    print('#+attr_latex: :placement [H] :center nil')

    # Perform query
    s = ScopusSearch(query, refresh=refresh)
    journal_res = [p for p in s.results if p.aggregationType == "Journal"]

    # Parse results
    doc_types = Counter(p.aggregationType for p in s.results)
    # Keyed by (title, doi): results sharing both collapse into one entry.
    paper_cites = {(p.title, p.doi): int(p.citedby_count) for p in journal_res}
    Ncites = sum(paper_cites.values())
    papers = len(journal_res)
    author_count = [len(split_field(p.authid)) for p in journal_res]
    au_counts = Counter()  # {(name, scopus-id): number of papers}
    j_counts = Counter()   # {(journal name, source id, issn): number of papers}
    for p in journal_res:
        for auth_name, auth_id in zip(split_field(p.authname),
                                      split_field(p.authid)):
            au_counts[(auth_name, auth_id)] += 1
        j_counts[(p.publicationName, p.source_id, p.issn)] += 1

    # Document information
    print('#+caption: Types of documents found for {}.'.format(label))
    print('| Document type | count |\n|-')
    for key, value in doc_types.items():
        print('| {} | {} |'.format(key, value))

    print('\n\n{} articles ({} citations) '
          'found by {} authors'.format(papers, Ncites, len(au_counts)))

    # Author counts {(name, scopus-id): count}
    auth_url = "[[https://www.scopus.com/authid/detail.uri?authorId={}][{}]]"
    view = [(auth_url.format(k[1], k[0]), v, k[1])
            for k, v in au_counts.items()]
    view.sort(reverse=True, key=itemgetter(1))

    print('\n#+attr_latex: :placement [H] :center nil')
    print('#+caption: Author publication counts for {0}.'.format(label))
    print('| name | count | categories |\n|-')
    # Only the 20 most productive authors, each with up to three categories.
    for name, count, identifier in view[:20]:
        cats = ', '.join(
            '{} ({})'.format(cat[0], cat[1])
            for cat in get_subject_docs(identifier, refresh)[0:3])
        print('| {} | {} | {} |'.format(name, count, cats))

    # Journal information
    jour_url = '[[https://www.scopus.com/source/sourceInfo.url?sourceId={}][{}]]'
    # Journal names are truncated to 50 characters for the link text.
    jview = [(jour_url.format(k[1], k[0][0:50]), v)
             for k, v in j_counts.items()]
    jview.sort(reverse=True, key=itemgetter(1))

    print('\n\n#+attr_latex: :placement [H] :center nil')
    print('#+caption: Journal publication counts for {}.'.format(label))
    print('| Journal | count |\n|-')
    for journal, count in jview[0:12]:
        print('| {} | {} |'.format(journal, count))

    # Top cited papers (titles truncated to 60 characters)
    pview = [('[[{}][{}]]'.format(k[1], k[0][0:60]), int(v))
             for k, v in paper_cites.items()]
    pview.sort(reverse=True, key=itemgetter(1))

    h_index = hindex([p[1] for p in pview])

    print('\n\n#+attr_latex: :placement [H] :center nil')
    print('#+caption: Top cited publication'
          ' counts for {}. h-index = {}.'.format(label, h_index))
    print('| title | cite count |\n|-')
    for title, count in pview[0:10]:
        print('| {} | {} |'.format(title, count))

    # Plot authors per publication
    fig = plt.figure()
    try:
        plt.hist(author_count, 20)
        plt.xlabel('# authors')
        plt.ylabel('frequency')
        plt.savefig('{}-nauthors-per-publication.png'.format(label))
    finally:
        # Release the figure; otherwise every call leaves one open in pyplot.
        plt.close(fig)

    # Bibliography
    print('\n\n#+caption: Number of authors '
          'on each publication for {}.'.format(label))
    print('[[./{}-nauthors-per-publication.png]]'.format(label))
    # Each element of the property drawer has to sit on its own line.
    print('**** Bibliography :noexport:\n'
          ':PROPERTIES:\n'
          ':VISIBILITY: folded\n'
          ':END:')
    for i, p in enumerate(journal_res):
        abstract = AbstractRetrieval(p.eid)
        print('{}. {}\n'.format(i + 1, abstract))
def calculate_h_index(df_authors, db_path, has_key=False):
    """Return the h-index of every author name in ``df_authors``.

    Three kinds of information are collected: (i) each author's h-index,
    (ii) the citation list of each pmid and (iii) the pmids of each author.
    Authors already present under ``'h_indices'`` in the JSON database at
    ``db_path`` are answered from it.  For the remaining authors, names
    longer than five characters are looked up through ``pull_url`` and
    ``get_links_ids``; shorter names are assigned ``-1`` without a lookup,
    because short and very common names take too long to collect.

    After all authors are processed the new h-indices, pmids and citation
    lists are merged into the loaded database and written to
    ``data/author_h_indexes.json`` next to this module.

    Parameters
    ----------
    df_authors : iterable of str
        Author names.  A single leading space is stripped from each name.
    db_path : str
        Path of the JSON database; it must contain the keys
        ``'h_indices'``, ``'pmids'`` and ``'citations'``.
    has_key : bool (optional, default=False)
        If true, ``Entrez.api_key`` is passed to ``pull_url`` in addition
        to ``Entrez.email``.

    Returns
    -------
    dict
        Maps each processed author name to its h-index (``-1`` for names
        that were neither in the database nor looked up).
    """
    with open(db_path) as h_index_db_file:
        h_index_db = json.load(h_index_db_file)

    author_2_hindex = {}         # newly determined values, merged into the db
    author_2_hindex_return = {}  # everything handed back to the caller
    pmid_2_cite = {}             # pmid -> citation links fetched in this call
    author_2_pmids = {}          # author -> pmids fetched in this call

    for name in tqdm(df_authors):
        # Slicing instead of name[0] so an empty name does not raise.
        author = name[1:] if name[:1] == ' ' else name

        # First check if the name already exists in our db.
        if author in h_index_db['h_indices']:
            # TODO: a stored value of 0 is meant to trigger a new API call;
            # for the demo the stored value is returned unchanged.
            author_2_hindex_return[author] = h_index_db['h_indices'][author]
            continue

        print(f"{author} not found in current database ({db_path})")

        # This ensures that we are not checking short and very common names,
        # which takes forever to collect information.
        if len(author) <= 5:
            author_2_hindex[author] = -1
            author_2_hindex_return[author] = -1
            continue

        if has_key:
            page = pull_url(author, Entrez.email, Entrez.api_key)
        else:
            page = pull_url(author, Entrez.email)
        soup = BeautifulSoup(page.content, 'xml')
        author_pmids = [id_.get_text() for id_ in soup.find_all('Id', {})]
        author_2_pmids[author] = author_pmids

        pmids = [int(pmid) for pmid in author_pmids]
        # Papers whose citations were already fetched for an earlier author
        # are not requested again, but they still belong to this author's
        # h-index.  Collect them before pmid_2_cite is extended below.
        citations = [len(pmid_2_cite[pmid])
                     for pmid in dict.fromkeys(pmids) if pmid in pmid_2_cite]
        retrieved = get_links_ids(
            [pmid for pmid in pmids if pmid not in pmid_2_cite])
        for pmid, link_list in retrieved.items():
            pmid_2_cite[pmid] = link_list
            citations.append(len(link_list))

        author_2_hindex[author] = int(hindex(citations))
        author_2_hindex_return[author] = author_2_hindex[author]

    h_index_db['h_indices'].update(author_2_hindex)
    h_index_db['pmids'].update(author_2_pmids)
    h_index_db['citations'].update(pmid_2_cite)

    #TODO: make changes to path to overwrite current file
    output_path = os.path.join(os.path.dirname(__file__),
                               './data/author_h_indexes.json')
    with open(output_path, 'w') as output:
        # Only these three sections of the database are persisted.
        json.dump({
            'h_indices': h_index_db['h_indices'],
            'pmids': h_index_db['pmids'],
            'citations': h_index_db['citations']
        }, output)

    return author_2_hindex_return
def test_hindex_with_nan(self):
    """``hindex`` with default arguments yields 4 when one entry is NaN."""
    cites_with_gap = [6, 10, 5, 46, np.nan, 2]
    self.assertEqual(hindex(cites_with_gap), 4)
def test_hindex_with_only_nan(self):
    """An all-NaN input with ``ignore_nan=False`` must produce NaN."""
    only_missing = [np.nan, np.nan]
    outcome = hindex(only_missing, ignore_nan=False)
    self.assertTrue(np.isnan(outcome))
def test_hindex(self):
    """``hindex`` of a NaN-free citation list containing a zero is 4."""
    plain_cites = [6, 10, 5, 46, 0, 2]
    self.assertEqual(hindex(plain_cites), 4)
def calculateHIndex(self):
    """Return the h-index over the articles in ``self.citationArticles``.

    The value stored under the ``"Citations"`` key of every entry in
    ``self.citationArticles`` is collected, in order, and the resulting
    list is passed to ``hindex``, whose result is returned unchanged.
    """
    paperCitations = [
        article["Citations"] for article in self.citationArticles
    ]
    return hindex(paperCitations)