def findAllCitations(self):
    """Return this paper's parsed references, each wrapped in Citation.

    The references content is pulled from the paper's PDF object with a
    PaperReferenceExtractor, parsed by the parser that matches the
    'Publisher' entry of getInfo(), and every parsed entry is then replaced
    by Citation(entry) in the sequence returned by the parser.

    Raises:
        Exception: if the publisher is neither 'Springer US' nor 'IEEE'.
    """
    raw_references = PaperReferenceExtractor().getReferencesContent(
        self.__pdfObj)

    # Only the Springer US and IEEE reference layouts have a parser.
    if self.getInfo()['Publisher'] == 'Springer US':
        reference_parser = SpringerReferenceParser()
    else:
        if not (self.getInfo()['Publisher'] == 'IEEE'):
            raise Exception(
                'Publisher not recognized; no citation parser for this format')
        reference_parser = IeeeReferenceParser()

    parsed = reference_parser.citeParse(raw_references)
    # Overwrite each slot in place: the object returned is the parser's own
    # sequence, now holding Citation instances.
    for slot, raw_entry in enumerate(parsed):
        parsed[slot] = Citation(raw_entry)
    return parsed
def findAllCitations(self):
    """Return this paper's parsed references, each wrapped in Citation.

    The references content is pulled from the paper's PDF object with a
    PaperReferenceExtractor, parsed by the parser that matches the
    'Publisher' entry of getInfo(), and every parsed entry is then replaced
    by Citation(entry) in the sequence returned by the parser.

    Raises:
        Exception: if the publisher is neither 'Springer US' nor 'IEEE'.
    """
    raw_references = PaperReferenceExtractor().getReferencesContent(
        self.__pdfObj)

    # Only the Springer US and IEEE reference layouts have a parser.
    if self.getInfo()['Publisher'] == 'Springer US':
        reference_parser = SpringerReferenceParser()
    else:
        if not (self.getInfo()['Publisher'] == 'IEEE'):
            raise Exception(
                'Publisher not recognized; no citation parser for this format')
        reference_parser = IeeeReferenceParser()

    parsed = reference_parser.citeParse(raw_references)
    # Overwrite each slot in place: the object returned is the parser's own
    # sequence, now holding Citation instances.
    for slot, raw_entry in enumerate(parsed):
        parsed[slot] = Citation(raw_entry)
    return parsed
def _collect_citations(paper_list):
    """Parse the references of every paper and return all citations found.

    Papers without a PDF object, without references content, with a
    publisher other than 'IEEE' / 'Springer US', or whose references fail
    to parse are reported on stdout and skipped.
    """
    springer_bot = SpringerReferenceParser()
    ieee_bot = IeeeReferenceParser()
    citation_list = []

    for paper in paper_list:
        pub = paper.getInfo()['Publisher']
        pdf_paper = paper.getPdfObj()
        print('Paper title: ' + str(paper.getInfo()['Title']))
        if pdf_paper is None:
            print('paper object is none')
            continue

        extractor = PaperReferenceExtractor()
        ref_content = extractor.getReferencesContent(pdf_paper)
        if ref_content is None:
            continue

        if pub == 'IEEE':
            parser = ieee_bot
        elif pub == 'Springer US':
            parser = springer_bot
        else:
            # str() so that a missing (non-string) publisher is reported
            # instead of raising TypeError during concatenation.
            print('Invalid publication format from: ' + str(pub))
            continue

        try:
            citations = parser.citeParse(ref_content)
        except Exception as e:
            # Best effort: report and skip this paper. Without the skip,
            # `citations` would be unbound (first paper) or still hold the
            # previous paper's citations, which would then be counted twice.
            print('An exception occured with parsing citations: ' + str(e))
            continue
        citation_list += citations

    return citation_list


def _rank_cited_authors(citation_list):
    """Tally citations per cited author and return them ranked by frequency.

    Each citation is read through its 'title' and 'authors' keys. The result
    is a list of (author, {'freq': int, 'papers': [titles]}) tuples ordered
    by descending 'freq'; 'papers' holds each title at most once.
    """
    author_dist = {}
    for citation in citation_list:
        title = citation['title']
        for cited_author in citation['authors']:
            entry = author_dist.setdefault(cited_author,
                                           {'freq': 0, 'papers': []})
            entry['freq'] += 1
            if title not in entry['papers']:
                entry['papers'].append(title)

    # Ascending stable sort followed by a reversal, rather than
    # sorted(..., reverse=True), so that authors with equal frequency keep
    # the same relative order as before.
    ranked = sorted(author_dist.items(), key=lambda item: item[1]['freq'])
    ranked.reverse()
    return ranked


def count_cross_cites(author, x_most_rel, top_x, y_most_rel):
    """Gather the citations in an author's papers and rank the cited authors.

    Stage 1 parses the references of every loaded paper; stage 2 aggregates
    them into a frequency ranking of cited authors; the ranking is then
    handed to count_cross_cites_stage3. Progress is printed to stdout.

    Args:
        author: object providing loadPapers() and getPapers().
        x_most_rel: passed to author.loadPapers(); stage 3 receives the
            number of papers actually loaded instead of this value.
        top_x: used in the progress message and forwarded to stage 3.
        y_most_rel: forwarded to stage 3 unchanged.

    Returns:
        None. The result of count_cross_cites_stage3 is not returned.
    """
    author.loadPapers(x_most_rel, pubFilter=True, delay=True)
    paper_list = author.getPapers()
    # From here on x_most_rel is the number of papers actually loaded.
    x_most_rel = len(paper_list)
    print("Total number of valid GSC papers: " + str(x_most_rel))

    # gets all the citations from all the papers in the list
    print('STAGE 1 GETTING CITATIONS')
    print("-----------------------------------------------------------")
    citation_list = _collect_citations(paper_list)
    print("STAGE 1 COMPLETE -----------------------------------------------------------")
    print('From the valid top ' + str(top_x) +
          ' papers, all the citations found: ' + str(citation_list))

    print('STAGE 2 AGGREGATING CITATION COUNTS BY AUTHOR ------------------------------------')
    # author_dist has the form [('author', {'freq': 5, 'papers': []}), ...]
    author_dist = _rank_cited_authors(citation_list)
    print('STAGE 2 COMPLETE -----------------------------------------------------------------')
    print('sorted author list in tuples:')
    print(author_dist)

    count_cross_cites_stage3(author, author_dist, x_most_rel, top_x,
                             y_most_rel)
def _collect_citations(paper_list):
    """Parse the references of every paper and return all citations found.

    Papers without a PDF object, without references content, with a
    publisher other than 'IEEE' / 'Springer US', or whose references fail
    to parse are reported on stdout and skipped.
    """
    springer_bot = SpringerReferenceParser()
    ieee_bot = IeeeReferenceParser()
    citation_list = []

    for paper in paper_list:
        pub = paper.getInfo()['Publisher']
        pdf_paper = paper.getPdfObj()
        print('Paper title: ' + str(paper.getInfo()['Title']))
        if pdf_paper is None:
            print('paper object is none')
            continue

        extractor = PaperReferenceExtractor()
        ref_content = extractor.getReferencesContent(pdf_paper)
        if ref_content is None:
            continue

        if pub == 'IEEE':
            parser = ieee_bot
        elif pub == 'Springer US':
            parser = springer_bot
        else:
            # str() so that a missing (non-string) publisher is reported
            # instead of raising TypeError during concatenation.
            print('Invalid publication format from: ' + str(pub))
            continue

        try:
            citations = parser.citeParse(ref_content)
        except Exception as e:
            # Best effort: report and skip this paper. Without the skip,
            # `citations` would be unbound (first paper) or still hold the
            # previous paper's citations, which would then be counted twice.
            print('An exception occured with parsing citations: ' + str(e))
            continue
        citation_list += citations

    return citation_list


def _rank_cited_authors(citation_list):
    """Tally citations per cited author and return them ranked by frequency.

    Each citation is read through its 'title' and 'authors' keys. The result
    is a list of (author, {'freq': int, 'papers': [titles]}) tuples ordered
    by descending 'freq'; 'papers' holds each title at most once.
    """
    author_dist = {}
    for citation in citation_list:
        title = citation['title']
        for cited_author in citation['authors']:
            entry = author_dist.setdefault(cited_author,
                                           {'freq': 0, 'papers': []})
            entry['freq'] += 1
            if title not in entry['papers']:
                entry['papers'].append(title)

    # Ascending stable sort followed by a reversal, rather than
    # sorted(..., reverse=True), so that authors with equal frequency keep
    # the same relative order as before.
    ranked = sorted(author_dist.items(), key=lambda item: item[1]['freq'])
    ranked.reverse()
    return ranked


def count_cross_cites(author, x_most_rel, top_x, y_most_rel):
    """Gather the citations in an author's papers and rank the cited authors.

    Stage 1 parses the references of every loaded paper; stage 2 aggregates
    them into a frequency ranking of cited authors; the ranking is then
    handed to count_cross_cites_stage3. Progress is printed to stdout.

    Args:
        author: object providing loadPapers() and getPapers().
        x_most_rel: passed to author.loadPapers(); stage 3 receives the
            number of papers actually loaded instead of this value.
        top_x: used in the progress message and forwarded to stage 3.
        y_most_rel: forwarded to stage 3 unchanged.

    Returns:
        None. The result of count_cross_cites_stage3 is not returned.
    """
    author.loadPapers(x_most_rel, pubFilter=True, delay=True)
    paper_list = author.getPapers()
    # From here on x_most_rel is the number of papers actually loaded.
    x_most_rel = len(paper_list)
    print("Total number of valid GSC papers: " + str(x_most_rel))

    # gets all the citations from all the papers in the list
    print('STAGE 1 GETTING CITATIONS')
    print("-----------------------------------------------------------")
    citation_list = _collect_citations(paper_list)
    print(
        "STAGE 1 COMPLETE -----------------------------------------------------------"
    )
    print('From the valid top ' + str(top_x) +
          ' papers, all the citations found: ' + str(citation_list))

    print(
        'STAGE 2 AGGREGATING CITATION COUNTS BY AUTHOR ------------------------------------'
    )
    # author_dist has the form [('author', {'freq': 5, 'papers': []}), ...]
    author_dist = _rank_cited_authors(citation_list)
    print(
        'STAGE 2 COMPLETE -----------------------------------------------------------------'
    )
    print('sorted author list in tuples:')
    print(author_dist)

    count_cross_cites_stage3(author, author_dist, x_most_rel, top_x,
                             y_most_rel)