def get_valid_venues_papers_ids_sframe_from_mag(min_ref_number,
                                                    min_journal_papers_num):

        dataset_dir = pathlib.Path(STORAGE_PATH)
        mag_path = dataset_dir / "MAG"
        mag = MicrosoftAcademicGraph(mag_path)

        sf = mag.extended_papers['Journal ID mapped to venue name',
                                 'Original venue name', 'Paper ID',
                                 'Ref Number']
        sf = sf[sf['Ref Number'] >= min_ref_number]
        sf.materialize()
        sf['Journal name'] = sf['Original venue name'].apply(
            lambda n: n.lower().strip())
        sf.materialize()
        g = sf.groupby(
            ['Journal ID mapped to venue name'], {
                'Count': agg.COUNT(),
                'Paper IDs List': agg.CONCAT("Paper ID"),
                'Journals names': agg.CONCAT('Journal name')
            })
        g['Journals names'] = g['Journals names'].apply(lambda l: list(set(l)))
        g = g[g['Count'] >= min_journal_papers_num]
        g = g[g['Journals names'].apply(lambda l: len(l) == 1)]
        g['Journals names'] = g['Journals names'].apply(lambda l: l[0])
        g = g.rename({'Journals names': 'Journal name'})
        g.materialize()
        return g
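
# A minimal usage sketch, assuming STORAGE_PATH points at a directory that
# contains the "MAG" dataset; the threshold values below are illustrative only:
#   venues_sf = get_valid_venues_papers_ids_sframe_from_mag(
#       min_ref_number=5, min_journal_papers_num=100)
# Each row of venues_sf then holds a venue id, its single lowercase journal
# name, a paper count, and the list of matching paper ids.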
    def get_valid_venues_papers_ids_sframe(min_ref_number,
                                           min_journal_papers_num):

        # Criterion I: we use only journals that have papers with valid DOIs that appear in both the AMiner and MAG datasets
        sf = tc.load_sframe(str(AMINER_MAG_JOIN_SFRAME))
        sf['Original venue name'] = sf['Original venue name'].apply(
            lambda n: n.lower())
        g = sf.groupby(
            'Journal ID mapped to venue name', {
                'venue name': agg.CONCAT('Original venue name'),
                'issn': agg.CONCAT('issn')
            })

        g['issn'] = g['issn'].apply(lambda l: list(set(l)))
        g['venue name'] = g['venue name'].apply(lambda l: list(set(l)))

        # Criterion II: the journal has only a single name
        g = g[g['venue name'].apply(lambda l: len(l) == 1)]
        g.materialize()
        g['venue name'] = g['venue name'].apply(lambda l: l[0].strip())

        # Criterion III: the journal's name appears in SJR
        sjr_dict = VenueFetcher.get_sjr_journals_dict()
        g = g[g['venue name'].apply(lambda v: v in sjr_dict)]

        venues_ids = set(g['Journal ID mapped to venue name'])

        # Criterion IV: each venue needs to have at least min_journal_papers_num
        # papers, each with at least min_ref_number references
        dataset_dir = pathlib.Path(STORAGE_PATH)
        mag_path = dataset_dir / "MAG"
        mag = MicrosoftAcademicGraph(mag_path)

        sf = mag.extended_papers['Journal ID mapped to venue name',
                                 'Original venue name', 'Paper ID',
                                 'Ref Number']
        sf = sf[sf['Ref Number'] >= min_ref_number]
        sf.materialize()
        sf = sf[sf['Journal ID mapped to venue name'].apply(
            lambda i: i in venues_ids)]
        sf['Journal name'] = sf['Original venue name'].apply(
            lambda n: n.lower().strip())
        sf.materialize()
        # Notice that with the full Papers SFrame a journal can have several names
        g = sf.groupby(
            ['Journal ID mapped to venue name'], {
                'Count': agg.COUNT(),
                'Paper IDs List': agg.CONCAT("Paper ID"),
                'Journals names': agg.CONCAT('Journal name')
            })
        g['Journals names'] = g['Journals names'].apply(lambda l: list(set(l)))
        g = g[g['Count'] >= min_journal_papers_num]
        g = g[g['Journals names'].apply(lambda l: len(l) == 1)]
        g['Journals names'] = g['Journals names'].apply(lambda l: l[0])
        g = g.rename({'Journals names': 'Journal name'})
        g.materialize()

        return g
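
# A self-contained toy of the "single venue name" filter used in Criterion II
# above (made-up data; requires only turicreate):
import turicreate as tc
import turicreate.aggregate as agg

toy = tc.SFrame({'venue id': [1, 1, 2, 2],
                 'Original venue name': ['nature', 'nature', 'science', 'sci.']})
g = toy.groupby('venue id', {'venue name': agg.CONCAT('Original venue name')})
g['venue name'] = g['venue name'].apply(lambda l: list(set(l)))
g = g[g['venue name'].apply(lambda l: len(l) == 1)]  # keeps only venue id 1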
    def _papers_citations_number_by_year(self, without_self_citation=True):
        """
        Get papers total number of citation in each year
        :param without_self_citation: if True calculate only non-self citations, other calculate with self-citations
        :return: SFrame with a column that contains citations_dict by year
        """
        ref_sf = self.extended_references
        if without_self_citation:
            ref_sf = ref_sf[ref_sf['self citation'] == 0]

        sf = self.papers["PaperId", "Year"]
        sf = ref_sf.join(sf, on="PaperId")
        g = sf.groupby(["PaperReferenceId", "Year"],
                       {"Citation Number": agg.COUNT()})
        g = g.rename({"Year": "Year", "PaperReferenceId": "PaperId"})
        g['Citation by Year'] = g.apply(lambda r:
                                        (r["Year"], r["Citation Number"]))
        h = g.groupby('PaperId',
                      {'Citation by Years': agg.CONCAT('Citation by Year')})
        if without_self_citation:
            h['Total Citations by Year without Self Citations'] = h[
                'Citation by Years'].apply(
                    lambda l: self._get_total_citation_by_year(l))
        else:
            h['Total Citations by Year'] = h['Citation by Years'].apply(
                lambda l: self._get_total_citation_by_year(l))
        return h.remove_column("Citation by Years")
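
# _get_total_citation_by_year is defined elsewhere in the class; a hedged
# sketch of what it plausibly does, given the (year, count) pairs built above,
# is to accumulate citation counts into a {year: running total} dict:
def _get_total_citation_by_year_sketch(citations_by_year):
    totals, running = {}, 0
    for year, count in sorted(citations_by_year):
        running += count
        totals[year] = running
    return totals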
 def get_venue_median_number_of_authors_by_year(self):
     sf = self._all_papers_sf.groupby(
         "Paper publish year",
         {'Authors Number List': agg.CONCAT("Authors Number")})
     return {
         r["Paper publish year"]: np.median(r['Authors Number List'])
         for r in sf
     }
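
# The same median-per-year idiom on a toy SFrame (made-up data):
import numpy as np
import turicreate as tc
import turicreate.aggregate as agg

toy = tc.SFrame({'Paper publish year': [2000, 2000, 2001],
                 'Authors Number': [1, 3, 2]})
g = toy.groupby('Paper publish year',
                {'Authors Number List': agg.CONCAT('Authors Number')})
medians = {r['Paper publish year']: np.median(r['Authors Number List'])
           for r in g}  # {2000: 2.0, 2001: 2.0}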
Example #5
    def get_venues_authors_ids(self, end_year):
        p_sf = get_papers_sframe(min_ref_num=self._min_ref_num,
                                 end_year=end_year)
        a_sf = get_authors_sframe(min_ref_num=self._min_ref_num,
                                  end_year=end_year)
        sf = a_sf.join(p_sf, on="Paper ID")

        return sf.groupby(self._venue_col_name,
                          {'authors_list': agg.CONCAT('Author ID')})
 def urls(self):
     """
      Creating URLs SFrame from .txt.gz files
     """
     cols = ["PaperId", "SourceType", "SourceUrl", "LanguageCode"]
     urls = SFrame(
         pd.read_csv(self._dataset_dir / "PaperUrls.txt.gz",
                     sep="\t",
                     names=cols).replace({pd.NA: None}))
     return urls.groupby("PaperId", {"Urls": agg.CONCAT("SourceUrl")})
def create_paper_keywords_list_sframe():
    """
    Creating Paper Keywords List SFrame
    """
    logger.info("Creating Papers' Keywords List SFrame")
    if os.path.isdir(PAPER_KEYWORDS_LIST_SFRAME):
        return

    sf = tc.load_sframe(PAPER_KEYWORDS_SFRAME)
    g = sf.groupby("Paper ID", {"Keywords List": agg.CONCAT("Keyword name")})
    g.save(PAPER_KEYWORDS_LIST_SFRAME)
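
# The groupby + agg.CONCAT idiom above, in isolation (made-up data):
import turicreate as tc
import turicreate.aggregate as agg

toy = tc.SFrame({'Paper ID': ['p1', 'p1', 'p2'],
                 'Keyword name': ['graphs', 'mining', 'nlp']})
g = toy.groupby('Paper ID', {'Keywords List': agg.CONCAT('Keyword name')})
# -> p1: ['graphs', 'mining'], p2: ['nlp']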
def create_urls_sframe():
    """
    Creating URLs SFrame from txt files
    """
    logger.info("Creating urls SFrame")
    if os.path.isdir(PAPER_URLS_SFRAME):
        return
    sf = tc.SFrame.read_csv(PAPER_URLS_TXT, header=False, delimiter="\t")
    sf = sf.rename({"X1": "Paper ID", "X2": "Url"})
    g = sf.groupby("Paper ID", {"Urls": agg.CONCAT("Url")})
    g.save(PAPER_URLS_SFRAME)
 def get_co_authors_dict_sframe(self):
     """
     Create SFrame with each author's coauthors by year
     :return: SFrame with AuthorId and Coauthors by Years Dict
      :note: the function can take a considerable amount of time to execute
     """
     logger.info("Calcualting authors' coauthors by year")
     sf = self.paper_authors_years
     sf = sf.join(sf, on='PaperId')
     sf = sf[sf['AuthorId'] != sf['AuthorId.1']]
     sf = sf.remove_column('Year.1')
     sf = sf.groupby(['AuthorId', 'Year'],
                     {'Coauthors List': agg.CONCAT('AuthorId.1')})
     sf['Coauthors Year'] = sf.apply(lambda r:
                                     (r['Year'], r['Coauthors List']))
     sf = sf.groupby("AuthorId",
                     {'Coauthors list': agg.CONCAT('Coauthors Year')})
     sf['Coauthors by Years Dict'] = sf['Coauthors list'].apply(
         lambda l: {y: coa_list
                    for y, coa_list in l})
     sf = sf.remove_column('Coauthors list')
     return sf
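
# A self-contained toy of the coauthor self-join above: joining the
# paper-author table with itself on PaperId pairs every author with every
# other author of the same paper (duplicated columns get a ".1" suffix):
import turicreate as tc

pa = tc.SFrame({'PaperId': [1, 1, 1, 2],
                'AuthorId': [10, 11, 12, 10],
                'Year': [2000, 2000, 2000, 2001]})
j = pa.join(pa, on='PaperId')
j = j[j['AuthorId'] != j['AuthorId.1']]  # drop self-pairs
# author 10 in paper 1 is now paired with coauthors 11 and 12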
 def get_authors_papers_dict_sframe(self):
     """
     Create SFrame in which each row contains an AuthorId and a dict with the author's publication by year dict
     :return: SFrame with Authors ID and Papers by Years Dict columns
     :rtype: tc.SFrame
     """
     logger.info("Calcualting authors' papers by year")
     a_sf = self.paper_authors_years
     a_sf['Paper Year'] = a_sf.apply(lambda r: (r["Year"], r["PaperId"]))
     g = a_sf.groupby("AuthorId", {"Papers List": agg.CONCAT("Paper Year")})
     g['Papers by Years Dict'] = g["Papers List"].apply(
         lambda l: _entities_years_list_to_dict(l))
     g = g.remove_column("Papers List")
     return g
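
# _entities_years_list_to_dict is defined elsewhere in the project; a hedged
# sketch, given the (year, entity) pairs built above, is to group the entities
# into a {year: [entities]} dict:
from collections import defaultdict

def _entities_years_list_to_dict_sketch(entities_years):
    d = defaultdict(list)
    for year, entity in entities_years:
        d[year].append(entity)
    return dict(d)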
Example #11
def create_ground_truth_names(baby_names_path, wikitree_users_path, ratio=0.9):
    """
    Creating an SFrame with statistics on first-name gender probability, using data from WikiTree and SSA
    :param baby_names_path: path to the directory with the SSA baby names files
    :param wikitree_users_path: path to the file with WikiTree names
    :param ratio: the threshold above which a name's gender is considered male
    :return: SFrame with data regarding first-name gender
    :rtype: tc.SFrame
    :note: first names data files can be downloaded from http://www.ssa.gov/oact/babynames/names.zip and
            https://www.wikitree.com/wiki/Help:Database_Dumps
    """
    sf = tc.SFrame.read_csv("%s/*.txt" % baby_names_path, header=False)
    sf = sf.rename({'X1': 'First Name', 'X2': 'Gender', 'X3': 'Count'})

    w_sf = tc.SFrame.read_csv(wikitree_users_path, delimiter="\t", header=True)
    w_sf = w_sf[['Preferred Name', 'Gender']]
    w_sf = w_sf.rename({'Preferred Name': 'First Name'})
    w_sf = w_sf[w_sf['Gender'] != 0]
    w_sf['First Name'] = w_sf['First Name'].apply(lambda n: n.split()[0]
                                                  if len(n) > 0 else '')
    w_sf = w_sf[w_sf['First Name'] != '']
    w_sf['Gender'] = w_sf['Gender'].apply(lambda g: 'M' if g == 1 else 'F')
    w_sf = w_sf.groupby(['First Name', 'Gender'], {'Count': agg.COUNT()})

    sf = sf.append(w_sf)
    sf['First Name'] = sf['First Name'].apply(lambda n: n.lower())
    g = sf.groupby(['First Name', 'Gender'], agg.SUM('Count'))

    g['stat'] = g.apply(lambda r: (r['Gender'], r['Sum of Count']))
    sf = g.groupby('First Name', {'Stats': agg.CONCAT('stat')})
    sf['Total Births'] = sf['Stats'].apply(lambda l: sum([i[1] for i in l]))
    sf['Total Males'] = sf['Stats'].apply(
        lambda l: sum([i[1] for i in l if i[0] == 'M']))
    sf['Percentage Males'] = sf.apply(
        lambda r: float(r['Total Males']) / r['Total Births'])
    sf = sf[sf['Total Births'] >= 5]

    def get_name_gender(p):
        if p >= ratio:
            return 'Male'
        if p <= (1 - ratio):
            return 'Female'
        return 'Unisex'

    sf['Gender'] = sf['Percentage Males'].apply(lambda p: get_name_gender(p))
    sf = sf.remove_column('Stats')

    return sf
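
# A hedged usage sketch; both paths are hypothetical, and the data files must
# be downloaded first (see the :note: in the docstring above):
#   names_sf = create_ground_truth_names('/data/ssa_baby_names',
#                                        '/data/wikitree_users.tsv')
# names_sf then maps each lowercase first name to 'Male', 'Female', or 'Unisex'.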
    def papers_authors_lists(self):
        """
        Create SFrame in which each row contains PaperId and a sorted list of the paper's authors
        """

        authors_sf = self.paper_author_affiliations["PaperId", "AuthorId",
                                                    "AuthorSequenceNumber"]
        authors_sf['Author_Seq'] = authors_sf.apply(
            lambda r: [r["AuthorId"], r["AuthorSequenceNumber"]])
        g = authors_sf.groupby("PaperId",
                               {"Authors List": agg.CONCAT('Author_Seq')})
        g['Authors List Sorted'] = g["Authors List"].apply(
            lambda l: sorted(l, key=lambda i: i[1]))
        g['Authors List Sorted'] = g['Authors List Sorted'].apply(
            lambda l: [i[0] for i in l])
        g = g.remove_column("Authors List")
        g = g["PaperId", 'Authors List Sorted']
        g['Authors Number'] = g['Authors List Sorted'].apply(lambda l: len(l))
        return g
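
# The sort-by-sequence idiom above, in isolation: each element is an
# [AuthorId, AuthorSequenceNumber] pair, and sorting on the second item
# recovers the byline order of the authors:
pairs = [[42, 2], [7, 1], [99, 3]]
ordered = [i[0] for i in sorted(pairs, key=lambda i: i[1])]  # [7, 42, 99]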
def create_papers_fields_of_study(flevels=(0, 1, 2, 3)):
    """
    Create SFrame with each paper fields of study by hierarchical levels
    :param flevels: list of levels, for each level add the papers fields of study in this level
    """
    logger.info("Creating Papers Fields of Study SFrame")
    if os.path.isdir(PAPERS_FIELDS_OF_STUDY_SFRAME):
        return
    k_sf = tc.load_sframe(KEYWORDS_SFRAME)
    g = k_sf.groupby('Paper ID', {
        'Field of study list':
        agg.CONCAT("Field of study ID mapped to keyword")
    })
    fh = FieldsHierarchyAnalyzer()

    # add fields of study names from IDs
    names = []
    for l in g['Field of study list']:
        names.append([fh.get_field_name(i) for i in l])
    g['Field of study list names'] = names

    for flevel in flevels:
        logger.info("Adding papers fields of study level %s" % flevel)
        parent_list = []
        for paper_field_of_study_list in g['Field of study list']:
            parent_list.append(
                list(
                    set.union(*[
                        fh.get_parents_field_of_study(field, flevel)
                        for field in paper_field_of_study_list
                    ])))
        g['Fields of study parent list (L%s)' % flevel] = parent_list

        names = []
        for paper_field_of_study_parents_list in g[
                'Fields of study parent list (L%s)' % flevel]:
            names.append([
                fh.get_field_name(field_of_study)
                for field_of_study in paper_field_of_study_parents_list
            ])
        g['Fields of study parent list names (L%s)' % flevel] = names
    g.save(PAPERS_FIELDS_OF_STUDY_SFRAME)
    def _create_field_of_study_paper_ids(self, level):
        """
        Create SFrame in which each row contains a field of study and its matching list of PaperIds
        :param level: field of study level
        :return: SFrame with the fields of study in the input level and their paper ids
        :rtype: SFrame
        """

        col = 'Fields of study parent list (L%s)' % level
        sf = self.extended_papers
        new_col_name = "Field ID"
        sf = sf[sf[col] != None]
        sf = sf.stack(col, new_column_name=new_col_name)
        g = sf.groupby(new_col_name, {'PaperIds': agg.CONCAT("PaperId")})
        g[new_col_name] = g[new_col_name].astype(int)
        f_sf = self.fields_of_study
        g = g.join(f_sf, on={new_col_name: "FieldOfStudyId"})
        g['Number of Paper'] = g['PaperIds'].apply(lambda l: len(l))
        g['Level'] = level
        return g.rename({new_col_name: "Field of study ID"})
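
# A self-contained toy of the SFrame.stack idiom above: stack explodes a list
# column into one row per element, after which a groupby inverts the mapping
# (made-up data):
import turicreate as tc
import turicreate.aggregate as agg

toy = tc.SFrame({'PaperId': ['p1', 'p2'],
                 'Fields of study parent list (L1)': [[1, 2], [2]]})
s = toy.stack('Fields of study parent list (L1)', new_column_name='Field ID')
g = s.groupby('Field ID', {'PaperIds': agg.CONCAT('PaperId')})
# -> field 1: ['p1'], field 2: ['p1', 'p2']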
def create_papers_authors_lists_sframe():
    """
    Create SFrame in which each row contains paper id and a sorted list of the paper's authors
    """
    logger.info("Creating Authors Lists SFrame")
    if os.path.isdir(PAPERS_ORDERED_AUTHORS_LIST_SFRAME):
        return
    authors_sf = tc.load_sframe(PAPER_AUTHOR_AFFILIATIONS_SFRAME)
    authors_sf = authors_sf["Paper ID", "Author ID", "Author sequence number"]
    authors_sf['Author_Seq'] = authors_sf.apply(
        lambda r: [r["Author ID"], r["Author sequence number"]])
    g = authors_sf.groupby("Paper ID",
                           {"Authors List": agg.CONCAT('Author_Seq')})
    g['Authors List Sorted'] = g["Authors List"].apply(
        lambda l: sorted(l, key=lambda i: i[1]))
    g['Authors List Sorted'] = g['Authors List Sorted'].apply(
        lambda l: [i[0] for i in l])
    g = g.remove_column("Authors List")
    g = g["Paper ID", 'Authors List Sorted']
    g['Authors Number'] = g['Authors List Sorted'].apply(lambda l: len(l))
    g.save(PAPERS_ORDERED_AUTHORS_LIST_SFRAME)
    def _get_author_feature_by_year_sframe(self, feature_name,
                                           feature_col_name):
        """
        Create a SFrame with AuthorId and a dict with the author's input feature (feature_name) over the years values
        :param feature_name: input feature name
        :param feature_col_name: the Sframe column name which contains dict with the author feature_name values over the years
        :return: SFrame with AuthorId and feature_col_name columns
        :rtype: tc.SFrame
        """
        logger.info("Calcualting authors feature %s by year" % feature_name)
        a_sf = self.paper_author_affiliation_sframe['AuthorId', 'Year',
                                                    feature_name]
        a_sf['Feature Year'] = a_sf.apply(lambda r:
                                          (int(r["Year"]), r[feature_name]))
        g = a_sf.groupby("AuthorId",
                         {"Feature List": agg.CONCAT("Feature Year")})
        g[feature_col_name] = g["Feature List"].apply(
            lambda l: _entities_years_list_to_dict(l))
        g = g.remove_column("Feature List")

        return g
def _create_field_of_study_paper_ids_sframe(level):
    """
    Create SFrame in which each row contains a field of study and its matching list of paper ids
    :param level: field of study level
    :return: SFrame with the fields of study in the input level and their paper ids
    :rtype: tc.SFrame
    """
    logger.info("Creating fields os study paper ids SFrame level - %s " %
                level)

    col = 'Fields of study parent list (L%s)' % level
    sf = tc.load_sframe(EXTENDED_PAPERS_SFRAME)
    new_col_name = "Field ID"
    # filter missing values before stacking, since stack replaces the column
    sf = sf[sf[col] != None]
    sf = sf.stack(col, new_column_name=new_col_name)
    g = sf.groupby(new_col_name, {'Paper IDs': agg.CONCAT("Paper ID")})
    f_sf = tc.load_sframe(FIELDS_OF_STUDY_SFRAME)
    g = g.join(f_sf, on={new_col_name: "Field of study ID"})
    g['Number of Paper'] = g['Paper IDs'].apply(lambda l: len(l))
    g['Level'] = level
    g = g.rename({new_col_name: "Field of study ID"})
    return g
    def papers_fields_of_study_level(self, flevels=(0, 1, 2, 3)):
        """
        Create SFrame with each paper fields of study by hierarchical levels
        :param flevels: list of levels, for each level add the papers fields of study in this level
        """
        k_sf = self.paper_fields_of_study
        g = k_sf.groupby('PaperId',
                         {'Field of study list': agg.CONCAT("FieldOfStudyId")})
        fh = FieldsHierarchyAnalyzer(self)

        # add fields of study names from ID
        names = []
        for l in tqdm(g['Field of study list']):
            names.append([fh.get_field_name(i) for i in l])
        g['Field of study list names'] = names

        for flevel in flevels:
            parent_list = []
            for paper_field_of_study_list in tqdm(g['Field of study list']):
                parent_list.append(
                    list(
                        set.union(*[
                            fh.get_parents_field_of_study(field, flevel)
                            for field in paper_field_of_study_list
                        ])))
            g[f'Fields of study parent list (L{flevel})'] = parent_list

            names = []
            for paper_field_of_study_parents_list in g[
                    f'Fields of study parent list (L{flevel})']:
                names.append([
                    fh.get_field_name(field_of_study)
                    for field_of_study in paper_field_of_study_parents_list
                ])
            g[f'Fields of study parent list names (L{flevel})'] = names
        return g
Example #19
 def get_venues_papers_ids(self, end_year):
     p_sf = get_papers_sframe(min_ref_num=self._min_ref_num,
                              end_year=end_year)
     return p_sf.groupby(self._venue_col_name,
                         {'papers_list': agg.CONCAT('Paper ID')})
Example #20
p_sf = gl.load_sframe("./Papers.sframe/")  # 126,903,970 rows
# r_sf is assumed to be an SFrame, created earlier, of papers with at least
# five references (hence the PapersMin5Ref name below)
p_sf = r_sf.join(p_sf)  # 22,082,741 rows
p_sf.save('./PapersMin5Ref.sframe')

p_sf = gl.load_sframe('./PapersMin5Ref.sframe')
a_sf = gl.load_sframe('./PaperAuthorAffiliations.sframe/')  # 337000127
sf = p_sf[['Paper ID']].join(a_sf)  # 86,561,861 rows
sf = sf.join(p_sf, on="Paper ID")
sf.groupby(
    "Author ID", {
        'Papers Count': agg.COUNT_DISTINCT('Paper ID'),
        'start_year': agg.MIN('Paper publish year'),
        'last_year': agg.MAX('Paper publish year'),
        'mean_ref_count': agg.AVG('Ref Count'),
        'papers_list': agg.CONCAT('Paper ID'),
        'journals_list': agg.CONCAT('Journal ID mapped to venue name'),
        'conference_list': agg.CONCAT('Conference ID mapped to venue name'),
        'affilation_list': agg.CONCAT('Affiliation ID')
    })
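
# Note that the groupby result above is not bound to a name. The same
# multi-aggregator idiom on a toy SFrame (made-up data; shown with turicreate,
# the successor of graphlab's gl API):
import turicreate as tc
import turicreate.aggregate as agg

toy = tc.SFrame({'Author ID': [1, 1, 2],
                 'Paper ID': [10, 11, 10],
                 'Paper publish year': [1999, 2005, 2001],
                 'Ref Count': [5, 7, 9]})
stats = toy.groupby('Author ID', {
    'Papers Count': agg.COUNT_DISTINCT('Paper ID'),
    'start_year': agg.MIN('Paper publish year'),
    'last_year': agg.MAX('Paper publish year'),
    'mean_ref_count': agg.AVG('Ref Count'),
})  # author 1 -> 2 papers, 1999-2005, mean ref count 6.0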

sf = gl.SFrame()
r = re.compile(r"\d{4}")
# l is assumed to be a list of yearly SJR csv file names and p the directory
# that holds them (both defined earlier in the original script)
for i in l:
    try:
        y = r.findall(i)[0]
        x = gl.SFrame.read_csv("%s/%s" % (p, i))
        x['Year'] = y
        x['Total Docs'] = x['Total Docs. (%s)' % y]
        x = x['Title', 'H index', 'SJR Best Quartile', 'SJR', 'Type', 'Rank',
              'Year', 'Total Docs']