def get_valid_venues_papers_ids_sframe_from_mag(min_ref_number, min_journal_papers_num):
    dataset_dir = pathlib.Path(STORAGE_PATH)
    mag_path = dataset_dir / "MAG"
    mag = MicrosoftAcademicGraph(mag_path)
    sf = mag.extended_papers['Journal ID mapped to venue name',
                             'Original venue name', 'Paper ID', 'Ref Number']
    sf = sf[sf['Ref Number'] >= min_ref_number]
    sf.materialize()
    sf['Journal name'] = sf['Original venue name'].apply(
        lambda n: n.lower().strip())
    sf.materialize()
    g = sf.groupby(
        ['Journal ID mapped to venue name'], {
            'Count': agg.COUNT(),
            'Paper IDs List': agg.CONCAT("Paper ID"),
            'Journals names': agg.CONCAT('Journal name')
        })
    g['Journals names'] = g['Journals names'].apply(lambda l: list(set(l)))
    g = g[g['Count'] >= min_journal_papers_num]
    g = g[g['Journals names'].apply(lambda l: len(l) == 1)]
    g['Journals names'] = g['Journals names'].apply(lambda l: l[0])
    g = g.rename({'Journals names': 'Journal name'})
    g.materialize()
    return g

def get_valid_venues_papers_ids_sframe(min_ref_number, min_journal_papers_num):
    # Criterion I: use only journals that have papers with valid DOIs that
    # appear in both the AMiner and MAG datasets
    sf = tc.load_sframe(str(AMINER_MAG_JOIN_SFRAME))
    sf['Original venue name'] = sf['Original venue name'].apply(
        lambda n: n.lower())
    g = sf.groupby(
        'Journal ID mapped to venue name', {
            'venue name': agg.CONCAT('Original venue name'),
            'issn': agg.CONCAT('issn')
        })
    g['issn'] = g['issn'].apply(lambda l: list(set(l)))
    g['venue name'] = g['venue name'].apply(lambda l: list(set(l)))

    # Criterion II: the journal has only a single name
    g = g[g['venue name'].apply(lambda l: len(l) == 1)]
    g.materialize()
    g['venue name'] = g['venue name'].apply(lambda l: l[0].strip())

    # Criterion III: the journal's name appears in SJR
    sjr_dict = VenueFetcher.get_sjr_journals_dict()
    g = g[g['venue name'].apply(lambda v: v in sjr_dict)]
    venues_ids = set(g['Journal ID mapped to venue name'])

    # Criterion IV: each venue needs at least min_journal_papers_num papers,
    # each with at least min_ref_number references
    dataset_dir = pathlib.Path(STORAGE_PATH)
    mag_path = dataset_dir / "MAG"
    mag = MicrosoftAcademicGraph(mag_path)
    sf = mag.extended_papers['Journal ID mapped to venue name',
                             'Original venue name', 'Paper ID', 'Ref Number']
    sf = sf[sf['Ref Number'] >= min_ref_number]
    sf.materialize()
    sf = sf[sf['Journal ID mapped to venue name'].apply(
        lambda i: i in venues_ids)]
    sf['Journal name'] = sf['Original venue name'].apply(
        lambda n: n.lower().strip())
    sf.materialize()

    # Notice that in the full Papers SFrame a journal can have several names
    g = sf.groupby(
        ['Journal ID mapped to venue name'], {
            'Count': agg.COUNT(),
            'Paper IDs List': agg.CONCAT("Paper ID"),
            'Journals names': agg.CONCAT('Journal name')
        })
    g['Journals names'] = g['Journals names'].apply(lambda l: list(set(l)))
    g = g[g['Count'] >= min_journal_papers_num]
    g = g[g['Journals names'].apply(lambda l: len(l) == 1)]
    g['Journals names'] = g['Journals names'].apply(lambda l: l[0])
    g = g.rename({'Journals names': 'Journal name'})
    g.materialize()
    return g

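# Usage sketch (the threshold values below are illustrative, not from the
# original code): unlike get_valid_venues_papers_ids_sframe_from_mag above,
# which applies only the reference/paper-count thresholds to the raw MAG
# data, this variant additionally requires each venue to appear in the
# AMiner/MAG join and in SJR, and to have a single consistent name.
#
#     g = get_valid_venues_papers_ids_sframe(min_ref_number=5,
#                                            min_journal_papers_num=100)
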
def _papers_citations_number_by_year(self, without_self_citation=True):
    """
    Get each paper's total number of citations per year
    :param without_self_citation: if True count only non-self citations,
        otherwise include self-citations
    :return: SFrame with a column that contains a citations-by-year dict
    """
    ref_sf = self.extended_references
    if without_self_citation:
        ref_sf = ref_sf[ref_sf['self citation'] == 0]
    sf = self.papers["PaperId", "Year"]
    sf = ref_sf.join(sf, on="PaperId")
    g = sf.groupby(["PaperReferenceId", "Year"],
                   {"Citation Number": agg.COUNT()})
    g = g.rename({"PaperReferenceId": "PaperId"})
    g['Citation by Year'] = g.apply(
        lambda r: (r["Year"], r["Citation Number"]))
    h = g.groupby('PaperId',
                  {'Citation by Years': agg.CONCAT('Citation by Year')})
    if without_self_citation:
        h['Total Citations by Year without Self Citations'] = h[
            'Citation by Years'].apply(
                lambda l: self._get_total_citation_by_year(l))
    else:
        h['Total Citations by Year'] = h['Citation by Years'].apply(
            lambda l: self._get_total_citation_by_year(l))
    return h.remove_column("Citation by Years")

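# A minimal sketch of the _get_total_citation_by_year helper referenced
# above (defined as a method on the class in the original source; written
# here as a standalone function for illustration). Assumption, not shown in
# this source: it turns the list of (year, citation count) pairs into a dict
# mapping each year to the cumulative number of citations received up to and
# including that year.
def _get_total_citation_by_year(citations_by_year):
    totals = {}
    running_total = 0
    # accumulate counts in chronological order
    for year, count in sorted(citations_by_year):
        running_total += count
        totals[year] = running_total
    return totals
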
def get_venue_median_number_of_authors_by_year(self):
    sf = self._all_papers_sf.groupby(
        "Paper publish year",
        {'Authors Number List': agg.CONCAT("Authors Number")})
    return {
        r["Paper publish year"]: np.median(r['Authors Number List'])
        for r in sf
    }

def get_venues_authors_ids(self, end_year):
    p_sf = get_papers_sframe(min_ref_num=self._min_ref_num, end_year=end_year)
    a_sf = get_authors_sframe(min_ref_num=self._min_ref_num, end_year=end_year)
    sf = a_sf.join(p_sf, on="Paper ID")
    return sf.groupby(self._venue_col_name,
                      {'authors_list': agg.CONCAT('Author ID')})

def urls(self):
    """
    Creating URLs SFrame from .txt.gz files
    """
    cols = ["PaperId", "SourceType", "SourceUrl", "LanguageCode"]
    urls = SFrame(
        pd.read_csv(self._dataset_dir / "PaperUrls.txt.gz",
                    sep="\t",
                    names=cols).replace({pd.NA: None}))
    return urls.groupby("PaperId", {"Urls": agg.CONCAT("SourceUrl")})

def create_paper_keywords_list_sframe():
    """
    Creating Paper Keywords List SFrame
    """
    logger.info("Creating Papers' Keywords List SFrame")
    if os.path.isdir(PAPER_KEYWORDS_LIST_SFRAME):
        return
    sf = tc.load_sframe(PAPER_KEYWORDS_SFRAME)
    g = sf.groupby("Paper ID", {"Keywords List": agg.CONCAT("Keyword name")})
    g.save(PAPER_KEYWORDS_LIST_SFRAME)

def create_urls_sframe():
    """
    Creating URLs SFrame from txt files
    """
    logger.info("Creating urls SFrame")
    if os.path.isdir(PAPER_URLS_SFRAME):
        return
    sf = tc.SFrame.read_csv(PAPER_URLS_TXT, header=False, delimiter="\t")
    sf = sf.rename({"X1": "Paper ID", "X2": "Url"})
    g = sf.groupby("Paper ID", {"Urls": agg.CONCAT("Url")})
    g.save(PAPER_URLS_SFRAME)

def get_co_authors_dict_sframe(self):
    """
    Create SFrame with each author's coauthors by year
    :return: SFrame with AuthorId and Coauthors by Years Dict
    :note: the function can take a considerable amount of time to execute
    """
    logger.info("Calculating authors' coauthors by year")
    sf = self.paper_authors_years
    sf = sf.join(sf, on='PaperId')
    sf = sf[sf['AuthorId'] != sf['AuthorId.1']]
    sf = sf.remove_column('Year.1')
    sf = sf.groupby(['AuthorId', 'Year'],
                    {'Coauthors List': agg.CONCAT('AuthorId.1')})
    sf['Coauthors Year'] = sf.apply(
        lambda r: (r['Year'], r['Coauthors List']))
    sf = sf.groupby("AuthorId",
                    {'Coauthors list': agg.CONCAT('Coauthors Year')})
    sf['Coauthors by Years Dict'] = sf['Coauthors list'].apply(
        lambda l: {y: coa_list for y, coa_list in l})
    sf = sf.remove_column('Coauthors list')
    return sf

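# Toy demonstration of the self-join trick used in get_co_authors_dict_sframe:
# joining paper_authors_years with itself on PaperId pairs every author of a
# paper with every other author of that paper, with the duplicated columns
# suffixed '.1'. The data below is made up for illustration.
import turicreate as tc

toy = tc.SFrame({'PaperId': [1, 1, 2],
                 'AuthorId': [10, 20, 10],
                 'Year': [2000, 2000, 2001]})
pairs = toy.join(toy, on='PaperId')
# drop rows that pair an author with themselves
pairs = pairs[pairs['AuthorId'] != pairs['AuthorId.1']]
# for paper 1, author 10 is paired with coauthor 20 and vice versa
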
def get_authors_papers_dict_sframe(self):
    """
    Create SFrame in which each row contains an AuthorId and a dict of the
    author's publications by year
    :return: SFrame with AuthorId and Papers by Years Dict columns
    :rtype: tc.SFrame
    """
    logger.info("Calculating authors' papers by year")
    a_sf = self.paper_authors_years
    a_sf['Paper Year'] = a_sf.apply(lambda r: (r["Year"], r["PaperId"]))
    g = a_sf.groupby("AuthorId", {"Papers List": agg.CONCAT("Paper Year")})
    g['Papers by Years Dict'] = g["Papers List"].apply(
        lambda l: _entities_years_list_to_dict(l))
    g = g.remove_column("Papers List")
    return g

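# A minimal sketch of the _entities_years_list_to_dict helper used above and
# in _get_author_feature_by_year_sframe below; its body is not part of this
# source. Assumption: it groups a list of (year, entity) pairs into a dict
# mapping each year to the list of that year's entities.
from collections import defaultdict


def _entities_years_list_to_dict(entities_years_list):
    years_dict = defaultdict(list)
    for year, entity in entities_years_list:
        years_dict[year].append(entity)
    return dict(years_dict)
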
def create_ground_truth_names(baby_names_path, wikitree_users_path, ratio=0.9):
    """
    Create SFrame with statistics on first-name gender probability using data
    from WikiTree and the SSA
    :param baby_names_path: path to the SSA baby names files
    :param wikitree_users_path: path to the file with WikiTree names
    :param ratio: the threshold above which a name's gender is considered male
    :return: SFrame with data regarding first-name gender
    :rtype: tc.SFrame
    :note: the first-names data files can be downloaded from
        http://www.ssa.gov/oact/babynames/names.zip and
        https://www.wikitree.com/wiki/Help:Database_Dumps
    """
    sf = tc.SFrame.read_csv("%s/*.txt" % baby_names_path, header=False)
    sf = sf.rename({'X1': 'First Name', 'X2': 'Gender', 'X3': 'Count'})
    w_sf = tc.SFrame.read_csv(wikitree_users_path, delimiter="\t", header=True)
    w_sf = w_sf[['Preferred Name', 'Gender']]
    w_sf = w_sf.rename({'Preferred Name': 'First Name'})
    w_sf = w_sf[w_sf['Gender'] != 0]
    w_sf['First Name'] = w_sf['First Name'].apply(
        lambda n: n.split()[0] if len(n) > 0 else '')
    w_sf = w_sf[w_sf['First Name'] != '']
    w_sf['Gender'] = w_sf['Gender'].apply(lambda g: 'M' if g == 1 else 'F')
    w_sf = w_sf.groupby(['First Name', 'Gender'], {'Count': agg.COUNT()})
    sf = sf.append(w_sf)
    sf['First Name'] = sf['First Name'].apply(lambda n: n.lower())
    g = sf.groupby(['First Name', 'Gender'], agg.SUM('Count'))
    g['stat'] = g.apply(lambda r: (r['Gender'], r['Sum of Count']))
    sf = g.groupby('First Name', {'Stats': agg.CONCAT('stat')})
    sf['Total Births'] = sf['Stats'].apply(lambda l: sum([i[1] for i in l]))
    sf['Total Males'] = sf['Stats'].apply(
        lambda l: sum([i[1] for i in l if i[0] == 'M']))
    sf['Percentage Males'] = sf.apply(
        lambda r: float(r['Total Males']) / r['Total Births'])
    sf = sf[sf['Total Births'] >= 5]

    def get_name_gender(p):
        if p >= ratio:
            return 'Male'
        if p <= (1 - ratio):
            return 'Female'
        return 'Unisex'

    sf['Gender'] = sf['Percentage Males'].apply(lambda p: get_name_gender(p))
    sf = sf.remove_column('Stats')
    return sf

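# Standalone restatement of the nested get_name_gender helper above, with a
# worked example of the ratio threshold: with the default ratio of 0.9, a
# name whose recorded births are at least 90% male is 'Male', at most 10%
# male is 'Female', and anything in between is 'Unisex'.
def classify_name_gender(percentage_males, ratio=0.9):
    if percentage_males >= ratio:
        return 'Male'
    if percentage_males <= (1 - ratio):
        return 'Female'
    return 'Unisex'


assert classify_name_gender(0.95) == 'Male'
assert classify_name_gender(0.05) == 'Female'
assert classify_name_gender(0.5) == 'Unisex'
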
def papers_authors_lists(self):
    """
    Create SFrame in which each row contains a PaperId and a sorted list
    of the paper's authors
    """
    authors_sf = self.paper_author_affiliations["PaperId", "AuthorId",
                                                "AuthorSequenceNumber"]
    authors_sf['Author_Seq'] = authors_sf.apply(
        lambda r: [r["AuthorId"], r["AuthorSequenceNumber"]])
    g = authors_sf.groupby("PaperId",
                           {"Authors List": agg.CONCAT('Author_Seq')})
    g['Authors List Sorted'] = g["Authors List"].apply(
        lambda l: sorted(l, key=lambda i: i[1]))
    g['Authors List Sorted'] = g['Authors List Sorted'].apply(
        lambda l: [i[0] for i in l])
    g = g.remove_column("Authors List")
    g = g["PaperId", 'Authors List Sorted']
    g['Authors Number'] = g['Authors List Sorted'].apply(lambda l: len(l))
    return g

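# Tiny worked example of the sort-by-sequence-number trick used in
# papers_authors_lists: each author is packed as [AuthorId, SequenceNumber],
# the pairs are sorted by the sequence number, and the ids are unpacked in
# author order. The values are made up.
author_seq_pairs = [[42, 2], [7, 1], [99, 3]]
sorted_ids = [pair[0] for pair in sorted(author_seq_pairs, key=lambda i: i[1])]
assert sorted_ids == [7, 42, 99]
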
def create_papers_fields_of_study(flevels=(0, 1, 2, 3)):
    """
    Create SFrame with each paper's fields of study by hierarchical levels
    :param flevels: list of levels; for each level, add the papers' fields
        of study in this level
    """
    logger.info("Creating Papers Fields of Study SFrame")
    if os.path.isdir(PAPERS_FIELDS_OF_STUDY_SFRAME):
        return
    k_sf = tc.load_sframe(KEYWORDS_SFRAME)
    g = k_sf.groupby('Paper ID', {
        'Field of study list':
        agg.CONCAT("Field of study ID mapped to keyword")
    })
    fh = FieldsHierarchyAnalyzer()

    # add fields of study names from IDs
    names = []
    for l in g['Field of study list']:
        names.append([fh.get_field_name(i) for i in l])
    g['Field of study list names'] = names

    for flevel in flevels:
        logger.info("Adding papers fields of study level %s" % flevel)
        parent_list = []
        for paper_field_of_study_list in g['Field of study list']:
            parent_list.append(
                list(
                    set.union(*[
                        fh.get_parents_field_of_study(field, flevel)
                        for field in paper_field_of_study_list
                    ])))
        g['Fields of study parent list (L%s)' % flevel] = parent_list

        names = []
        for paper_field_of_study_parents_list in g[
                'Fields of study parent list (L%s)' % flevel]:
            names.append([
                fh.get_field_name(field_of_study)
                for field_of_study in paper_field_of_study_parents_list
            ])
        g['Fields of study parent list names (L%s)' % flevel] = names
    g.save(PAPERS_FIELDS_OF_STUDY_SFRAME)

def _create_field_of_study_paper_ids(self, level):
    """
    Create SFrame in which each row contains a field of study and its
    matching list of PaperIds
    :param level: field of study level
    :return: SFrame with the fields of study in the input level and their
        papers ids
    :rtype: SFrame
    """
    col = 'Fields of study parent list (L%s)' % level
    sf = self.extended_papers
    new_col_name = "Field ID"
    sf = sf[sf[col] != None]
    sf = sf.stack(col, new_column_name=new_col_name)
    g = sf.groupby(new_col_name, {'PaperIds': agg.CONCAT("PaperId")})
    g[new_col_name] = g[new_col_name].astype(int)
    f_sf = self.fields_of_study
    g = g.join(f_sf, on={new_col_name: "FieldOfStudyId"})
    g['Number of Paper'] = g['PaperIds'].apply(lambda l: len(l))
    g['Level'] = level
    return g.rename({new_col_name: "Field of study ID"})

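# Toy demonstration of SFrame.stack as used in _create_field_of_study_paper_ids:
# stacking a list column yields one row per list element, so a paper with
# several parent fields contributes one row per field, ready to be inverted
# with groupby('Field ID', agg.CONCAT(...)). The data is made up.
import turicreate as tc

papers = tc.SFrame({'PaperId': [1, 2], 'Fields': [[10, 11], [10]]})
stacked = papers.stack('Fields', new_column_name='Field ID')
# stacked now has three rows: (1, 10), (1, 11), and (2, 10)
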
def create_papers_authors_lists_sframe():
    """
    Create SFrame in which each row contains a paper id and a sorted list
    of the paper's authors
    """
    logger.info("Creating Authors Lists SFrame")
    if os.path.isdir(PAPERS_ORDERED_AUTHORS_LIST_SFRAME):
        return
    authors_sf = tc.load_sframe(PAPER_AUTHOR_AFFILIATIONS_SFRAME)
    authors_sf = authors_sf["Paper ID", "Author ID", "Author sequence number"]
    authors_sf['Author_Seq'] = authors_sf.apply(
        lambda r: [r["Author ID"], r["Author sequence number"]])
    g = authors_sf.groupby("Paper ID",
                           {"Authors List": agg.CONCAT('Author_Seq')})
    g['Authors List Sorted'] = g["Authors List"].apply(
        lambda l: sorted(l, key=lambda i: i[1]))
    g['Authors List Sorted'] = g['Authors List Sorted'].apply(
        lambda l: [i[0] for i in l])
    g = g.remove_column("Authors List")
    g = g["Paper ID", 'Authors List Sorted']
    g['Authors Number'] = g['Authors List Sorted'].apply(lambda l: len(l))
    g.save(PAPERS_ORDERED_AUTHORS_LIST_SFRAME)

def _get_author_feature_by_year_sframe(self, feature_name, feature_col_name):
    """
    Create an SFrame with AuthorId and a dict of the author's input feature
    (feature_name) values over the years
    :param feature_name: input feature name
    :param feature_col_name: the SFrame column name which contains a dict
        with the author's feature_name values over the years
    :return: SFrame with AuthorId and feature_col_name columns
    :rtype: tc.SFrame
    """
    logger.info("Calculating authors feature %s by year" % feature_name)
    a_sf = self.paper_author_affiliation_sframe['AuthorId', 'Year',
                                                feature_name]
    a_sf['Feature Year'] = a_sf.apply(
        lambda r: (int(r["Year"]), r[feature_name]))
    g = a_sf.groupby("AuthorId", {"Feature List": agg.CONCAT("Feature Year")})
    g[feature_col_name] = g["Feature List"].apply(
        lambda l: _entities_years_list_to_dict(l))
    g = g.remove_column("Feature List")
    return g

def _create_field_of_study_paper_ids_sframe(level):
    """
    Create SFrame in which each row contains a field of study and its
    matching list of paper ids
    :param level: field of study level
    :return: SFrame with the fields of study in the input level and their
        papers ids
    :rtype: tc.SFrame
    """
    logger.info("Creating fields of study paper ids SFrame level - %s" % level)
    col = 'Fields of study parent list (L%s)' % level
    sf = tc.load_sframe(EXTENDED_PAPERS_SFRAME)
    new_col_name = "Field ID"
    # filter empty rows before stacking, since stack replaces col with
    # new_col_name (the original code filtered on col after the stack)
    sf = sf[sf[col] != None]
    sf = sf.stack(col, new_column_name=new_col_name)
    g = sf.groupby(new_col_name, {'Paper IDs': agg.CONCAT("Paper ID")})
    f_sf = tc.load_sframe(FIELDS_OF_STUDY_SFRAME)
    g = g.join(f_sf, on={new_col_name: "Field of study ID"})
    g['Number of Paper'] = g['Paper IDs'].apply(lambda l: len(l))
    g['Level'] = level
    g = g.rename({new_col_name: "Field of study ID"})
    return g

def papers_fields_of_study_level(self, flevels=(0, 1, 2, 3)):
    """
    Create SFrame with each paper's fields of study by hierarchical levels
    :param flevels: list of levels; for each level, add the papers' fields
        of study in this level
    """
    k_sf = self.paper_fields_of_study  # FieldOfStudyId
    g = k_sf.groupby('PaperId',
                     {'Field of study list': agg.CONCAT("FieldOfStudyId")})
    fh = FieldsHierarchyAnalyzer(self)

    # add fields of study names from IDs
    names = []
    for l in tqdm(g['Field of study list']):
        names.append([fh.get_field_name(i) for i in l])
    g['Field of study list names'] = names

    for flevel in flevels:
        parent_list = []
        for paper_field_of_study_list in tqdm(g['Field of study list']):
            parent_list.append(
                list(
                    set.union(*[
                        fh.get_parents_field_of_study(field, flevel)
                        for field in paper_field_of_study_list
                    ])))
        g[f'Fields of study parent list (L{flevel})'] = parent_list

        names = []
        for paper_field_of_study_parents_list in g[
                f'Fields of study parent list (L{flevel})']:
            names.append([
                fh.get_field_name(field_of_study)
                for field_of_study in paper_field_of_study_parents_list
            ])
        g[f'Fields of study parent list names (L{flevel})'] = names
    return g

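# Tiny worked example of the set.union pattern used above: the parents of
# each field at a given level form a set, and their union collects all the
# distinct parents of a paper's fields. The values are made up.
parents_per_field = [{1, 2}, {2, 3}]
assert set.union(*parents_per_field) == {1, 2, 3}
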
def get_venues_papers_ids(self, end_year):
    p_sf = get_papers_sframe(min_ref_num=self._min_ref_num, end_year=end_year)
    return p_sf.groupby(self._venue_col_name,
                        {'papers_list': agg.CONCAT('Paper ID')})

import re

import graphlab as gl
import graphlab.aggregate as agg

# r_sf is assumed to hold the ids of papers with at least five references,
# computed in an earlier step that is not part of this snippet.
p_sf = gl.load_sframe("./Papers.sframe/")  # 126,903,970 rows
p_sf = r_sf.join(p_sf)  # 22,082,741 rows
p_sf.save('./PapersMin5Ref.sframe')

p_sf = gl.load_sframe('./PapersMin5Ref.sframe')
a_sf = gl.load_sframe('./PaperAuthorAffiliations.sframe/')  # 337,000,127 rows
sf = p_sf[['Paper ID']].join(a_sf)  # 86,561,861 rows
sf = sf.join(p_sf, on="Paper ID")
sf.groupby(
    "Author ID", {
        'Papers Count': agg.COUNT_DISTINCT('Paper ID'),
        'start_year': agg.MIN('Paper publish year'),
        'last_year': agg.MAX('Paper publish year'),
        'mean_ref_count': agg.AVG('Ref Count'),
        'papers_list': agg.CONCAT('Paper ID'),
        'journals_list': agg.CONCAT('Journal ID mapped to venue name'),
        'conference_list': agg.CONCAT('Conference ID mapped to venue name'),
        'affilation_list': agg.CONCAT('Affiliation ID')
    })

# Build a single SFrame out of the yearly SJR csv files, extracting the year
# from each file name. l is assumed to be the list of file names and p the
# directory containing them (both defined outside this snippet).
sf = gl.SFrame()
r = re.compile(r"\d{4}")
for i in l:
    try:
        y = r.findall(i)[0]
        x = gl.SFrame.read_csv("%s/%s" % (p, i))
        x['Year'] = y
        x['Total Docs'] = x['Total Docs. (%s)' % y]
        x = x['Title', 'H index', 'SJR Best Quartile', 'SJR', 'Type', 'Rank',
              'Year', 'Total Docs']
        sf = sf.append(x)
    except IndexError:  # skip file names that contain no year
        continue