def count_self_cites(author, num_load):
    """Count, per paper, the reference entries that match the author's own last name.

    ``author.loadPapers`` is called with ``loadPaperPDFs=False`` and
    ``pubFilter=False``; each loaded paper is then asked to set its own PDF
    object before its references content is extracted and searched.

    Args:
        author: Object providing ``loadPapers``, ``getPapers`` and
            ``getLastName``.
        num_load: Value forwarded to ``author.loadPapers``.

    Returns:
        A list with one dict per processed paper, holding the keys
        ``'Paper Title'`` and ``'Self Cites'``. ``'Self Cites'`` is the value
        returned by ``getCitesToAuthor``, or the string ``'No PDF found'``
        when the paper has no PDF object or no references content.

    A ``KeyboardInterrupt`` during processing ends the loop early; whatever
    was collected up to that point is still printed and returned.
    """
    author.loadPapers(num_load, loadPaperPDFs=False, pubFilter=False)
    results = []
    print("Author fully loaded. Processing loaded papers...")

    try:
        for paper in author.getPapers():
            # The search term is the author's last name in title case.
            surname = author.getLastName().title()
            paper_title = paper.getInfo()['Title']
            print('Paper title: ' + str(paper_title))

            paper.setPdfObj()
            extractor = PaperReferenceExtractor()
            pdf_obj = paper.getPdfObj()
            if pdf_obj is None:
                print('No PDF object for this paper, skipping.')
                results.append({'Paper Title': paper_title,
                                'Self Cites': 'No PDF found'})
                continue

            references = extractor.getReferencesContent(pdf_obj)
            if references is None:
                # The progress line below still reports 0 in this case.
                cite_count = 0
                outcome = 'No PDF found'
            else:
                cite_count = extractor.getCitesToAuthor(surname, references)
                outcome = cite_count

            print('Paper title: ' + str(paper_title)
                  + ' has self cites: ' + str(cite_count))
            results.append({'Paper Title': paper_title, 'Self Cites': outcome})
    except KeyboardInterrupt:
        print('key board KeyboardInterrupt returninbg self cite array')

    print(results)
    return results
def findAllCitations(self):
    """Parse this paper's references section into ``Citation`` objects.

    The references content is extracted from ``self.__pdfObj`` (a
    name-mangled private attribute, so this method has to stay inside its
    class body) and handed to the parser matching the paper's publisher.

    Returns:
        The list returned by the parser's ``citeParse``, with every entry
        replaced in place by ``Citation(entry)``.

    Raises:
        Exception: If the publisher is neither ``'Springer US'`` nor
            ``'IEEE'``.
    """
    extractor = PaperReferenceExtractor()
    references = extractor.getReferencesContent(self.__pdfObj)

    publisher = self.getInfo()['Publisher']
    if publisher == 'Springer US':
        parser = SpringerReferenceParser()
    elif publisher == 'IEEE':
        parser = IeeeReferenceParser()
    else:
        raise Exception('Publisher not recognized; no citation parser for this format')

    citation_list = parser.citeParse(references)
    # Slice assignment keeps the same list object the parser returned.
    citation_list[:] = [Citation(raw) for raw in citation_list]
    return citation_list
def count_overcites_paper(paper, author, cite_num_to_load=40):
    """Count how often each paper citing ``paper`` cites ``author``.

    Args:
        paper: Object providing ``getCitingPdfs``.
        author: Object providing ``getLastName``; the title-cased last name
            is the search term.
        cite_num_to_load: Value forwarded to ``paper.getCitingPdfs``.

    Returns:
        A list of dicts with the keys ``'Citing Paper Number'`` (1-based
        position), ``'Title'`` and ``'Over-cite Count'``. The count is the
        value from ``getCitesToAuthor``, or the string ``"No PDF Found"``
        when no references content was extracted. Citing PDFs with neither
        content nor title are left out.

    ``AttributeError`` and ``KeyboardInterrupt`` are caught; in both cases
    the entries gathered so far are returned.
    """
    overcites_info = []
    try:
        citing_pdfs = paper.getCitingPdfs(cite_num_to_load)
        extractor = PaperReferenceExtractor()

        for position, citing_pdf in enumerate(citing_pdfs, start=1):
            references = extractor.getReferencesContent(citing_pdf)
            citing_title = citing_pdf.getTitle()

            if references is None:
                # Only report the miss when there is a title to report it under.
                if citing_title is not None:
                    print("Citing paper number " + str(position) + ": "
                          + citing_title + " had no PDF content found.")
                    overcites_info.append({
                        'Citing Paper Number': position,
                        'Title': citing_title,
                        'Over-cite Count': "No PDF Found",
                    })
                continue

            surname = author.getLastName().title()
            cite_count = extractor.getCitesToAuthor(surname, references)
            if citing_title is None:
                citing_title = 'Unknown Title'

            print("Citing paper number " + str(position) + ": " + citing_title
                  + " cites " + surname + " " + str(cite_count) + " times.")
            overcites_info.append({
                'Citing Paper Number': position,
                'Title': citing_title,
                'Over-cite Count': cite_count,
            })
    except AttributeError as e:
        print('google scholar possibly has blocked you, sending back collected data...')
        print(e)
    except KeyboardInterrupt:
        print('User ended program. Returning existing Data')

    return overcites_info
def count_self_cites(author, num_load):
    """Count, per paper, the reference entries that match the author's own last name.

    ``author.loadPapers`` is called with ``loadPaperPDFs=False`` and
    ``pubFilter=False``; each loaded paper is then asked to set its own PDF
    object before its references content is extracted and searched.

    Args:
        author: Object providing ``loadPapers``, ``getPapers`` and
            ``getLastName``.
        num_load: Value forwarded to ``author.loadPapers``.

    Returns:
        A list with one dict per processed paper, holding the keys
        ``'Paper Title'`` and ``'Self Cites'``. ``'Self Cites'`` is the value
        returned by ``getCitesToAuthor``, or the string ``'No PDF found'``
        when the paper has no PDF object or no references content.

    A ``KeyboardInterrupt`` during processing ends the loop early; whatever
    was collected up to that point is still printed and returned.
    """
    author.loadPapers(num_load, loadPaperPDFs=False, pubFilter=False)
    results = []
    print("Author fully loaded. Processing loaded papers...")

    try:
        for paper in author.getPapers():
            # The search term is the author's last name in title case.
            surname = author.getLastName().title()
            paper_title = paper.getInfo()['Title']
            print('Paper title: ' + str(paper_title))

            paper.setPdfObj()
            extractor = PaperReferenceExtractor()
            pdf_obj = paper.getPdfObj()
            if pdf_obj is None:
                print('No PDF object for this paper, skipping.')
                results.append({'Paper Title': paper_title,
                                'Self Cites': 'No PDF found'})
                continue

            references = extractor.getReferencesContent(pdf_obj)
            if references is None:
                # The progress line below still reports 0 in this case.
                cite_count = 0
                outcome = 'No PDF found'
            else:
                cite_count = extractor.getCitesToAuthor(surname, references)
                outcome = cite_count

            print('Paper title: ' + str(paper_title)
                  + ' has self cites: ' + str(cite_count))
            results.append({'Paper Title': paper_title, 'Self Cites': outcome})
    except KeyboardInterrupt:
        print('key board KeyboardInterrupt returninbg self cite array')

    print(results)
    return results
def findAllCitations(self):
    """Parse this paper's references section into ``Citation`` objects.

    The references content is extracted from ``self.__pdfObj`` (a
    name-mangled private attribute, so this method has to stay inside its
    class body) and handed to the parser matching the paper's publisher.

    Returns:
        The list returned by the parser's ``citeParse``, with every entry
        replaced in place by ``Citation(entry)``.

    Raises:
        Exception: If the publisher is neither ``'Springer US'`` nor
            ``'IEEE'``.
    """
    extractor = PaperReferenceExtractor()
    references = extractor.getReferencesContent(self.__pdfObj)

    publisher = self.getInfo()['Publisher']
    if publisher == 'Springer US':
        parser = SpringerReferenceParser()
    elif publisher == 'IEEE':
        parser = IeeeReferenceParser()
    else:
        raise Exception(
            'Publisher not recognized; no citation parser for this format')

    citation_list = parser.citeParse(references)
    # Slice assignment keeps the same list object the parser returned.
    citation_list[:] = [Citation(raw) for raw in citation_list]
    return citation_list
def count_overcites_paper(paper, author, cite_num_to_load=40):
    """Count how often each paper citing ``paper`` cites ``author``.

    Args:
        paper: Object providing ``getCitingPdfs``.
        author: Object providing ``getLastName``; the title-cased last name
            is the search term.
        cite_num_to_load: Value forwarded to ``paper.getCitingPdfs``.

    Returns:
        A list of dicts with the keys ``'Citing Paper Number'`` (1-based
        position), ``'Title'`` and ``'Over-cite Count'``. The count is the
        value from ``getCitesToAuthor``, or the string ``"No PDF Found"``
        when no references content was extracted. Citing PDFs with neither
        content nor title are left out.

    On ``KeyboardInterrupt`` the function calls
    ``WatLibSeleniumParser.reset()`` and returns the entries gathered so
    far. No other exception is caught here.
    """
    overcites_info = []
    try:
        citing_pdfs = paper.getCitingPdfs(cite_num_to_load)
        extractor = PaperReferenceExtractor()

        for position, citing_pdf in enumerate(citing_pdfs, start=1):
            references = extractor.getReferencesContent(citing_pdf)
            citing_title = citing_pdf.getTitle()

            if references is None:
                # Only report the miss when there is a title to report it under.
                if citing_title is not None:
                    print("Citing paper number " + str(position) + ": "
                          + citing_title + " had no PDF content found.")
                    overcites_info.append({
                        'Citing Paper Number': position,
                        'Title': citing_title,
                        'Over-cite Count': "No PDF Found",
                    })
                continue

            surname = author.getLastName().title()
            cite_count = extractor.getCitesToAuthor(surname, references)
            if citing_title is None:
                citing_title = 'Unknown Title'

            print("Citing paper number " + str(position) + ": " + citing_title
                  + " cites " + surname + " " + str(cite_count) + " times.")
            overcites_info.append({
                'Citing Paper Number': position,
                'Title': citing_title,
                'Over-cite Count': cite_count,
            })
    except KeyboardInterrupt:
        print('User ended program. Returning existing Data')
        WatLibSeleniumParser.reset()

    return overcites_info
def count_overcites_paper(paper, author, cite_num_to_load=30):
    """Count how often each paper citing ``paper`` cites ``author``.

    Builds "cited by" page URLs from ``paper.getCitedByUrl()`` with start
    offsets ``0, 10, 20, ...`` below ``cite_num_to_load``, sleeping 10
    seconds before each page request. The PDF objects found on those pages
    are then searched for the author's title-cased last name.

    Args:
        paper: Object providing ``getCitedByUrl``.
        author: Object providing ``getLastName``.
        cite_num_to_load: Upper bound (exclusive) for the page start offsets.

    Returns:
        A list of dicts with the keys ``'Citing Paper Number'`` (1-based
        position), ``'Title'`` and ``'Over-cite Count'``. The count is the
        value from ``getCitesToAuthor``, or the string ``"No PDF Found"``
        when no references content was extracted. Citing PDFs with neither
        content nor title are left out.

    ``AttributeError`` and ``KeyboardInterrupt`` are caught; in both cases
    the entries gathered so far (possibly none) are returned.
    """
    # Initialised before the ``try`` so both handlers can always return it,
    # even when the failure happens while the URL is still being built.
    overcites_info = []
    try:
        pdfExtractor = GscPdfExtractor()
        cited_by_url = paper.getCitedByUrl()
        url_part_one = SessionInitializer.ROOT_URL + '/scholar?start='
        url_part_two = '&hl=en&as_sdt=0,5&sciodt=0,5&cites='
        # Drop everything from the last '&', then keep what follows the last
        # remaining '='; that value is reused as the ``cites=`` parameter.
        cited_by_url = cited_by_url[:cited_by_url.rfind('&')]
        paper_code = cited_by_url[cited_by_url.rfind('=') + 1:]

        all_pdfObjs = []
        print('-----------------------------------LOADING CITING PAPERS-----------------------------------')
        for i in range(0, cite_num_to_load, 10):
            time.sleep(10)
            final_url = url_part_one + str(i) + url_part_two + paper_code
            print('page url for citations:')
            print(final_url)
            all_pdfObjs += pdfExtractor.findPapersFromCitations(final_url)
        print('-----------------------------------DONE CITING PAPERS-------------------------------------')
        print('Loaded: ' + str(len(all_pdfObjs)) + ' pdf objects.')

        analyzer = PaperReferenceExtractor()
        for number, pdf in enumerate(all_pdfObjs, start=1):
            content = analyzer.getReferencesContent(pdf)
            title = pdf.getTitle()

            if content is None:
                # Only report the miss when there is a title to report it under.
                if title is not None:
                    print("Citing paper number " + str(number) + ": " + title
                          + " had no PDF content found.")
                    overcites_info.append({
                        'Citing Paper Number': number,
                        'Title': title,
                        'Over-cite Count': "No PDF Found",
                    })
                continue

            lname = author.getLastName().title()
            numCites = analyzer.getCitesToAuthor(lname, content)
            if title is None:
                title = 'Unknown Title'
            print("Citing paper number " + str(number) + ": " + title
                  + " cites " + lname + " " + str(numCites) + " times.")
            overcites_info.append({
                'Citing Paper Number': number,
                'Title': title,
                'Over-cite Count': numCites,
            })
    except AttributeError as e:
        print('google scholar possibly has blocked you, sending back collected data...')
        print(e)
    except KeyboardInterrupt:
        print('User ended program. Returning existing Data')

    return overcites_info
def count_cross_cites_stage3(orig_author, author_dist, x_most_rel, top_x, y_most_rel):
    """Look up the most-cited authors and count how often they cite ``orig_author``.

    Stage 3 walks ``author_dist`` in order and tries to resolve each entry to
    an author object via ``GscHtmlFunctions.get_author_from_search``.
    Stage 4 loads papers for every resolved author and counts, per paper with
    a PDF object, the citations matching the original author.

    Args:
        orig_author: Object providing ``getFirstName`` and ``getLastName``.
        author_dist: Sequence of ``(author_name, {'freq': int, 'papers':
            [titles]})`` tuples, in the order they should be tried.
        x_most_rel: Passed through unchanged into the returned dict.
        top_x: Number of authors wanted. Raised by one for every candidate
            that cannot be resolved, so the next candidate is tried instead.
        y_most_rel: Number of papers requested per resolved author; it also
            caps how many entries of ``author_dist`` are examined.

    Returns:
        A dict with the keys ``'First Name'``, ``'Last Name'``,
        ``'Author_citation_frequency'`` (list of ``[author, first name,
        last name, frequency]``), ``'Cited_authors_overcite_frequency'``
        (list of ``[author, first name, last name, [[title, count], ...],
        number of papers with a PDF]``; a count of ``-1`` marks a paper whose
        references content could not be extracted), ``'x_most_rel'`` and
        ``'y_most_rel'``.
    """
    gsc_bot = GscHtmlFunctions()
    top_x_authors = []
    print('STAGE 3 CREATING NEW AUTHOR OBJECTS ---------------------------------------------------------')
    # Create author objects for the top cited authors.
    for index, author_info in enumerate(author_dist):
        # Check both limits before sleeping so no delay is spent on an
        # iteration that is only going to break.
        if index + 1 > y_most_rel or index > top_x - 1:
            break
        time.sleep(10)

        # author_info has the form ('author', {'freq': 5, 'papers': [...]}).
        author_name, details = author_info[0], author_info[1]
        first_paper_title = details['papers'][0]
        frequency = details['freq']
        print('Trying to find author: ' + str(author_info))
        returned_author = gsc_bot.get_author_from_search(author_name, first_paper_title)
        if returned_author is None:
            # No profile found: widen the window to try the next author.
            top_x += 1
        else:
            top_x_authors.append([returned_author,
                                  returned_author.getFirstName(),
                                  returned_author.getLastName(),
                                  frequency])
    print('DONE STAGE 3 --------------------------------------------------------------------------')
    print('Top citing authors: ')
    print(top_x_authors)

    print('STAGE 4 COUNTING NUMBER OF CITATIONS TO ORIGINAL AUTHOR --------------------------------')
    # One entry per resolved author, with how often each of their papers
    # cites the original author.
    cited_author_info_arr = []
    ORIG_FNAME = orig_author.getFirstName()
    ORIG_LNAME = orig_author.getLastName()
    for cited_author_freq_arr in top_x_authors:
        time.sleep(5)
        top_cited_author = cited_author_freq_arr[0]
        top_cited_author.loadPapers(y_most_rel, pubFilter=True, delay=True)
        cited_fname = top_cited_author.getFirstName()
        cited_lname = top_cited_author.getLastName()
        print('ANALYZING AUTHOR: ' + str(cited_fname) + ' ' + str(cited_lname))

        # Keep only the papers that have a PDF object.
        temp_paper_lst = [p for p in top_cited_author.getPapers()
                          if p.getPdfObj() is not None]
        pap_list_len = len(temp_paper_lst)
        total_paper_cites = []
        for paper in temp_paper_lst:
            pap_title = paper.getInfo()['Title']
            # str() keeps a missing (None) title from raising TypeError.
            print('Paper title: ' + str(pap_title))
            # The search term depends on the citing paper's publisher.
            auth_word = get_ref_author_format(ORIG_FNAME, ORIG_LNAME,
                                              paper.getInfo()['Publisher'])
            pdf_paper = paper.getPdfObj()
            analyzer = PaperReferenceExtractor()
            content = analyzer.getReferencesContent(pdf_paper)
            if content is None:
                total_paper_cites.append([pap_title, -1])
                continue
            if auth_word is None:
                print('for some reason, authword is none. Shouldnt be happening')
                continue
            num_cites = analyzer.getCitesToAuthor(auth_word, content)
            total_paper_cites.append([pap_title, num_cites])
        print(total_paper_cites)
        cited_author_info_arr.append([top_cited_author, cited_fname, cited_lname,
                                      total_paper_cites, pap_list_len])
    print('STAGE 4 COMPLETE ---------------------------------------------------------------------')
    print('cited_author_info_arr: ' + str(cited_author_info_arr))

    print('FINAL INFO DICTIONARY -------------------------------------------------------------')
    final_info_dict = {'First Name': ORIG_FNAME,
                       'Last Name': ORIG_LNAME,
                       'Author_citation_frequency': top_x_authors,
                       'Cited_authors_overcite_frequency': cited_author_info_arr,
                       'x_most_rel': x_most_rel,
                       'y_most_rel': y_most_rel}
    print(final_info_dict)
    return final_info_dict
def count_cross_cites(author, x_most_rel, top_x, y_most_rel):
    """Find whom ``author`` cites most and how often those authors cite back.

    Stage 1 parses the citations of the author's loaded papers (only the
    publishers ``'IEEE'`` and ``'Springer US'`` have a parser). Stage 2
    aggregates them per cited author, ordered by descending frequency.
    Stages 3 and 4 are delegated to ``count_cross_cites_stage3``.

    Args:
        author: Object providing ``loadPapers``, ``getPapers``,
            ``getFirstName`` and ``getLastName``.
        x_most_rel: Number of papers requested from ``author.loadPapers``;
            replaced by the number of papers actually loaded before being
            passed on.
        top_x: Forwarded to ``count_cross_cites_stage3``.
        y_most_rel: Forwarded to ``count_cross_cites_stage3``.

    Returns:
        The dict returned by ``count_cross_cites_stage3``.
    """
    author.loadPapers(x_most_rel, pubFilter=True, delay=True)
    paper_list = author.getPapers()
    x_most_rel = len(paper_list)
    print("Total number of valid GSC papers: " + str(len(paper_list)))

    citation_list = []
    springer_bot = SpringerReferenceParser()
    ieee_bot = IeeeReferenceParser()
    # Collect the citations from every paper in the list.
    print('STAGE 1 GETTING CITATIONS')
    print("-----------------------------------------------------------")
    for paper in paper_list:
        pub = paper.getInfo()['Publisher']
        pdf_paper = paper.getPdfObj()
        print('Paper title: ' + str(paper.getInfo()['Title']))
        if pdf_paper is None:
            print('paper object is none')
            continue
        extractor = PaperReferenceExtractor()
        ref_content = extractor.getReferencesContent(pdf_paper)
        if ref_content is None:
            continue
        try:
            if pub == 'IEEE':
                citations = ieee_bot.citeParse(ref_content)
            elif pub == 'Springer US':
                citations = springer_bot.citeParse(ref_content)
            else:
                print('Invalid publication format from: ' + str(pub))
                continue
        except Exception as e:
            print('An exception occured with parsing citations: ' + str(e))
            # Skip this paper: ``citations`` is either unbound or still holds
            # the previous paper's result, which must not be added again.
            continue
        citation_list += citations
    print("STAGE 1 COMPLETE -----------------------------------------------------------")
    print('From the valid top ' + str(top_x) +' papers, all the citations found: ' + str(citation_list))

    # Maps cited author -> {'freq': times cited, 'papers': [titles cited]}.
    author_dist = {}
    print('STAGE 2 AGGREGATING CITATION COUNTS BY AUTHOR ------------------------------------')
    for citation in citation_list:
        title = citation['title']
        for cited_author in citation['authors']:
            entry = author_dist.setdefault(cited_author, {'freq': 0, 'papers': []})
            entry['freq'] += 1
            if title not in entry['papers']:
                entry['papers'].append(title)

    # Now a list of ('author', {'freq': 5, 'papers': [...]}) tuples, highest
    # frequency first. reversed(sorted(...)) is kept on purpose: it orders
    # ties differently from sorted(..., reverse=True).
    author_dist = list(reversed(sorted(author_dist.items(), key=lambda x: x[1]['freq'])))
    print('STAGE 2 COMPLETE -----------------------------------------------------------------')
    print('sorted author list in tuples:')
    print(author_dist)

    # Hand the stage 3/4 result back to the caller instead of discarding it.
    return count_cross_cites_stage3(author, author_dist, x_most_rel, top_x, y_most_rel)
def count_overcites_paper(paper, author, cite_num_to_load=30):
    """Count how often each paper citing ``paper`` cites ``author``.

    Builds "cited by" page URLs from ``paper.getCitedByUrl()`` with start
    offsets ``0, 10, 20, ...`` below ``cite_num_to_load``, sleeping 10
    seconds before each page request. The PDF objects found on those pages
    are then searched for the author's title-cased last name.

    Args:
        paper: Object providing ``getCitedByUrl``.
        author: Object providing ``getLastName``.
        cite_num_to_load: Upper bound (exclusive) for the page start offsets.

    Returns:
        A list of dicts with the keys ``'Citing Paper Number'`` (1-based
        position), ``'Title'`` and ``'Over-cite Count'``. The count is the
        value from ``getCitesToAuthor``, or the string ``"No PDF Found"``
        when no references content was extracted. Citing PDFs with neither
        content nor title are left out.

    ``AttributeError`` and ``KeyboardInterrupt`` are caught; in both cases
    the entries gathered so far (possibly none) are returned.
    """
    # Initialised before the ``try`` so both handlers can always return it,
    # even when the failure happens while the URL is still being built.
    overcites_info = []
    try:
        pdfExtractor = GscPdfExtractor()
        cited_by_url = paper.getCitedByUrl()
        url_part_one = SessionInitializer.ROOT_URL + '/scholar?start='
        url_part_two = '&hl=en&as_sdt=0,5&sciodt=0,5&cites='
        # Drop everything from the last '&', then keep what follows the last
        # remaining '='; that value is reused as the ``cites=`` parameter.
        cited_by_url = cited_by_url[:cited_by_url.rfind('&')]
        paper_code = cited_by_url[cited_by_url.rfind('=') + 1:]

        all_pdfObjs = []
        print(
            '-----------------------------------LOADING CITING PAPERS-----------------------------------'
        )
        for i in range(0, cite_num_to_load, 10):
            time.sleep(10)
            final_url = url_part_one + str(i) + url_part_two + paper_code
            print('page url for citations:')
            print(final_url)
            all_pdfObjs += pdfExtractor.findPapersFromCitations(final_url)
        print(
            '-----------------------------------DONE CITING PAPERS-------------------------------------'
        )
        print('Loaded: ' + str(len(all_pdfObjs)) + ' pdf objects.')

        analyzer = PaperReferenceExtractor()
        for number, pdf in enumerate(all_pdfObjs, start=1):
            content = analyzer.getReferencesContent(pdf)
            title = pdf.getTitle()

            if content is None:
                # Only report the miss when there is a title to report it under.
                if title is not None:
                    print("Citing paper number " + str(number) + ": " + title +
                          " had no PDF content found.")
                    overcites_info.append({
                        'Citing Paper Number': number,
                        'Title': title,
                        'Over-cite Count': "No PDF Found",
                    })
                continue

            lname = author.getLastName().title()
            numCites = analyzer.getCitesToAuthor(lname, content)
            if title is None:
                title = 'Unknown Title'
            print("Citing paper number " + str(number) + ": " + title +
                  " cites " + lname + " " + str(numCites) + " times.")
            overcites_info.append({
                'Citing Paper Number': number,
                'Title': title,
                'Over-cite Count': numCites,
            })
    except AttributeError as e:
        print(
            'google scholar possibly has blocked you, sending back collected data...'
        )
        print(e)
    except KeyboardInterrupt:
        print('User ended program. Returning existing Data')

    return overcites_info
def count_cross_cites_stage3(orig_author, author_dist, x_most_rel, top_x,
                             y_most_rel):
    """Look up the most-cited authors and count how often they cite ``orig_author``.

    Stage 3 walks ``author_dist`` in order and tries to resolve each entry to
    an author object via ``GscHtmlFunctions.get_author_from_search``.
    Stage 4 loads papers for every resolved author and counts, per paper with
    a PDF object, the citations matching the original author.

    Args:
        orig_author: Object providing ``getFirstName`` and ``getLastName``.
        author_dist: Sequence of ``(author_name, {'freq': int, 'papers':
            [titles]})`` tuples, in the order they should be tried.
        x_most_rel: Passed through unchanged into the returned dict.
        top_x: Number of authors wanted. Raised by one for every candidate
            that cannot be resolved, so the next candidate is tried instead.
        y_most_rel: Number of papers requested per resolved author; it also
            caps how many entries of ``author_dist`` are examined.

    Returns:
        A dict with the keys ``'First Name'``, ``'Last Name'``,
        ``'Author_citation_frequency'`` (list of ``[author, first name,
        last name, frequency]``), ``'Cited_authors_overcite_frequency'``
        (list of ``[author, first name, last name, [[title, count], ...],
        number of papers with a PDF]``; a count of ``-1`` marks a paper whose
        references content could not be extracted), ``'x_most_rel'`` and
        ``'y_most_rel'``.
    """
    gsc_bot = GscHtmlFunctions()
    top_x_authors = []
    print(
        'STAGE 3 CREATING NEW AUTHOR OBJECTS ---------------------------------------------------------'
    )
    # Create author objects for the top cited authors.
    for index, author_info in enumerate(author_dist):
        # Check both limits before sleeping so no delay is spent on an
        # iteration that is only going to break.
        if index + 1 > y_most_rel or index > top_x - 1:
            break
        time.sleep(10)

        # author_info has the form ('author', {'freq': 5, 'papers': [...]}).
        author_name, details = author_info[0], author_info[1]
        first_paper_title = details['papers'][0]
        frequency = details['freq']
        print('Trying to find author: ' + str(author_info))
        returned_author = gsc_bot.get_author_from_search(
            author_name, first_paper_title)
        if returned_author is None:
            # No profile found: widen the window to try the next author.
            top_x += 1
        else:
            top_x_authors.append([
                returned_author,
                returned_author.getFirstName(),
                returned_author.getLastName(),
                frequency,
            ])
    print(
        'DONE STAGE 3 --------------------------------------------------------------------------'
    )
    print('Top citing authors: ')
    print(top_x_authors)

    print(
        'STAGE 4 COUNTING NUMBER OF CITATIONS TO ORIGINAL AUTHOR --------------------------------'
    )
    # One entry per resolved author, with how often each of their papers
    # cites the original author.
    cited_author_info_arr = []
    ORIG_FNAME = orig_author.getFirstName()
    ORIG_LNAME = orig_author.getLastName()
    for cited_author_freq_arr in top_x_authors:
        time.sleep(5)
        top_cited_author = cited_author_freq_arr[0]
        top_cited_author.loadPapers(y_most_rel, pubFilter=True, delay=True)
        cited_fname = top_cited_author.getFirstName()
        cited_lname = top_cited_author.getLastName()
        print('ANALYZING AUTHOR: ' + str(cited_fname) + ' ' + str(cited_lname))

        # Keep only the papers that have a PDF object.
        temp_paper_lst = [
            p for p in top_cited_author.getPapers()
            if p.getPdfObj() is not None
        ]
        pap_list_len = len(temp_paper_lst)
        total_paper_cites = []
        for paper in temp_paper_lst:
            pap_title = paper.getInfo()['Title']
            # str() keeps a missing (None) title from raising TypeError.
            print('Paper title: ' + str(pap_title))
            # The search term depends on the citing paper's publisher.
            auth_word = get_ref_author_format(ORIG_FNAME, ORIG_LNAME,
                                              paper.getInfo()['Publisher'])
            pdf_paper = paper.getPdfObj()
            analyzer = PaperReferenceExtractor()
            content = analyzer.getReferencesContent(pdf_paper)
            if content is None:
                total_paper_cites.append([pap_title, -1])
                continue
            if auth_word is None:
                print(
                    'for some reason, authword is none. Shouldnt be happening')
                continue
            num_cites = analyzer.getCitesToAuthor(auth_word, content)
            total_paper_cites.append([pap_title, num_cites])
        print(total_paper_cites)
        cited_author_info_arr.append([
            top_cited_author, cited_fname, cited_lname, total_paper_cites,
            pap_list_len
        ])
    print(
        'STAGE 4 COMPLETE ---------------------------------------------------------------------'
    )
    print('cited_author_info_arr: ' + str(cited_author_info_arr))

    print(
        'FINAL INFO DICTIONARY -------------------------------------------------------------'
    )
    final_info_dict = {
        'First Name': ORIG_FNAME,
        'Last Name': ORIG_LNAME,
        'Author_citation_frequency': top_x_authors,
        'Cited_authors_overcite_frequency': cited_author_info_arr,
        'x_most_rel': x_most_rel,
        'y_most_rel': y_most_rel
    }
    print(final_info_dict)
    return final_info_dict
def count_cross_cites(author, x_most_rel, top_x, y_most_rel):
    """Find whom ``author`` cites most and how often those authors cite back.

    Stage 1 parses the citations of the author's loaded papers (only the
    publishers ``'IEEE'`` and ``'Springer US'`` have a parser). Stage 2
    aggregates them per cited author, ordered by descending frequency.
    Stages 3 and 4 are delegated to ``count_cross_cites_stage3``.

    Args:
        author: Object providing ``loadPapers``, ``getPapers``,
            ``getFirstName`` and ``getLastName``.
        x_most_rel: Number of papers requested from ``author.loadPapers``;
            replaced by the number of papers actually loaded before being
            passed on.
        top_x: Forwarded to ``count_cross_cites_stage3``.
        y_most_rel: Forwarded to ``count_cross_cites_stage3``.

    Returns:
        The dict returned by ``count_cross_cites_stage3``.
    """
    author.loadPapers(x_most_rel, pubFilter=True, delay=True)
    paper_list = author.getPapers()
    x_most_rel = len(paper_list)
    print("Total number of valid GSC papers: " + str(len(paper_list)))

    citation_list = []
    springer_bot = SpringerReferenceParser()
    ieee_bot = IeeeReferenceParser()
    # Collect the citations from every paper in the list.
    print('STAGE 1 GETTING CITATIONS')
    print("-----------------------------------------------------------")
    for paper in paper_list:
        pub = paper.getInfo()['Publisher']
        pdf_paper = paper.getPdfObj()
        print('Paper title: ' + str(paper.getInfo()['Title']))
        if pdf_paper is None:
            print('paper object is none')
            continue
        extractor = PaperReferenceExtractor()
        ref_content = extractor.getReferencesContent(pdf_paper)
        if ref_content is None:
            continue
        try:
            if pub == 'IEEE':
                citations = ieee_bot.citeParse(ref_content)
            elif pub == 'Springer US':
                citations = springer_bot.citeParse(ref_content)
            else:
                print('Invalid publication format from: ' + str(pub))
                continue
        except Exception as e:
            print('An exception occured with parsing citations: ' + str(e))
            # Skip this paper: ``citations`` is either unbound or still holds
            # the previous paper's result, which must not be added again.
            continue
        citation_list += citations
    print(
        "STAGE 1 COMPLETE -----------------------------------------------------------"
    )
    print('From the valid top ' + str(top_x) +
          ' papers, all the citations found: ' + str(citation_list))

    # Maps cited author -> {'freq': times cited, 'papers': [titles cited]}.
    author_dist = {}
    print(
        'STAGE 2 AGGREGATING CITATION COUNTS BY AUTHOR ------------------------------------'
    )
    for citation in citation_list:
        title = citation['title']
        for cited_author in citation['authors']:
            entry = author_dist.setdefault(cited_author,
                                           {'freq': 0, 'papers': []})
            entry['freq'] += 1
            if title not in entry['papers']:
                entry['papers'].append(title)

    # Now a list of ('author', {'freq': 5, 'papers': [...]}) tuples, highest
    # frequency first. reversed(sorted(...)) is kept on purpose: it orders
    # ties differently from sorted(..., reverse=True).
    author_dist = list(
        reversed(sorted(author_dist.items(), key=lambda x: x[1]['freq'])))
    print(
        'STAGE 2 COMPLETE -----------------------------------------------------------------'
    )
    print('sorted author list in tuples:')
    print(author_dist)

    # Hand the stage 3/4 result back to the caller instead of discarding it.
    return count_cross_cites_stage3(author, author_dist, x_most_rel, top_x,
                                    y_most_rel)