def aminer_mag_links_by_doi(self):
        """
        Create Links Sframe that match papers from the MAG dataset
        with papers from the AMiner dataset based on the papers DOI
        :return:
        """
        extended_papers = self.mag.extended_papers
        g1 = extended_papers.groupby('Paper Document Object Identifier (DOI)',
                                     {'Count': agg.COUNT()})
        s1 = set(g1[g1['Count'] > 1]['Paper Document Object Identifier (DOI)'])
        extended_papers = extended_papers[
            extended_papers['Paper Document Object Identifier (DOI)'].apply(
                lambda doi: doi not in s1)]
        extended_papers.materialize()

        aminer = self.aminer.data
        g2 = aminer.groupby('doi', {'Count': agg.COUNT()})
        s2 = set(g2[g2['Count'] > 1]['doi'])
        aminer = aminer[aminer['doi'].apply(lambda doi: doi not in s2)]
        aminer.materialize()

        aminer_mag = extended_papers.join(
            aminer, {'Paper Document Object Identifier (DOI)': 'doi'})
        aminer_mag['title_len'] = aminer_mag['title'].apply(lambda t: len(t))
        aminer_mag['title_len2'] = aminer_mag['Original paper title'].apply(
            lambda t: len(t))
        aminer_mag = aminer_mag[aminer_mag['title_len'] > 0]
        aminer_mag = aminer_mag[aminer_mag['title_len2'] > 0]

        aminer_mag = aminer_mag.rename({
            "Paper ID": "MAG Paper ID",
            "id": "Aminer Paper ID"
        })
        return aminer_mag.remove_columns(['title_len', 'title_len2'])
def create_aminer_mag_links_by_doi_sframe():
    """
    Create Links Sframe that match papers from the MAG dataset with papers from the AMiner dataset based on the papers
    DOI
    :return:
    """
    if os.path.isdir(AMINER_MAG_JOIN_SFRAME):
        return
    sf = tc.load_sframe(EXTENDED_PAPERS_SFRAME)
    g1 = sf.groupby('Paper Document Object Identifier (DOI)',
                    {'Count': agg.COUNT()})
    s1 = set(g1[g1['Count'] > 1]['Paper Document Object Identifier (DOI)'])
    sf = sf[sf['Paper Document Object Identifier (DOI)'].apply(
        lambda doi: doi not in s1)]
    sf.materialize()

    sf2 = tc.load_sframe(AMINER_PAPERS_SFRAME)
    g2 = sf2.groupby('doi', {'Count': agg.COUNT()})
    s2 = set(g2[g2['Count'] > 1]['doi'])
    sf2 = sf2[sf2['doi'].apply(lambda doi: doi not in s2)]
    sf2.materialize()

    j = sf.join(sf2, {'Paper Document Object Identifier (DOI)': 'doi'})
    j['title_len'] = j['title'].apply(lambda t: len(t))
    j['title_len2'] = j['Original paper title'].apply(lambda t: len(t))
    j = j[j['title_len'] > 0]
    j = j[j['title_len2'] > 0]

    j = j.rename({"Paper ID": "MAG Paper ID", "id": "Aminer Paper ID"})
    j = j.remove_columns(['title_len', 'title_len2'])
    j.save(AMINER_MAG_JOIN_SFRAME)
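
# The same dedup-then-join pattern in miniature: a hedged sketch on tiny in-memory SFrames.
# The column names below are illustrative only and are not the MAG/AMiner schema.
import turicreate as tc
import turicreate.aggregate as agg

mag_toy = tc.SFrame({'doi': ['a', 'a', 'b', 'c'], 'mag_id': [1, 2, 3, 4]})
aminer_toy = tc.SFrame({'doi': ['b', 'c', 'c'], 'aminer_id': [10, 20, 30]})

# Drop DOIs that appear more than once in each table, exactly as above.
g = mag_toy.groupby('doi', {'Count': agg.COUNT()})
mag_dups = set(g[g['Count'] > 1]['doi'])
mag_toy = mag_toy[mag_toy['doi'].apply(lambda d: d not in mag_dups)]

g = aminer_toy.groupby('doi', {'Count': agg.COUNT()})
aminer_dups = set(g[g['Count'] > 1]['doi'])
aminer_toy = aminer_toy[aminer_toy['doi'].apply(lambda d: d not in aminer_dups)]

# Join on DOI; in this toy example only 'b' has a unique DOI in both tables.
links_toy = mag_toy.join(aminer_toy, on='doi')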
    def _papers_citations_number_by_year(self, without_self_citation=True):
        """
        Get papers total number of citation in each year
        :param without_self_citation: if True calculate only non-self citations, other calculate with self-citations
        :return: SFrame with a column that contains citations_dict by year
        """
        ref_sf = self.extended_references
        if without_self_citation:
            ref_sf = ref_sf[ref_sf['self citation'] == 0]

        sf = self.papers["PaperId", "Year"]
        sf = ref_sf.join(sf, on="PaperId")
        g = sf.groupby(["PaperReferenceId", "Year"],
                       {"Citation Number": agg.COUNT()})
        g = g.rename({"Year": "Year", "PaperReferenceId": "PaperId"})
        g['Citation by Year'] = g.apply(lambda r:
                                        (r["Year"], r["Citation Number"]))
        h = g.groupby('PaperId',
                      {'Citation by Years': agg.CONCAT('Citation by Year')})
        if without_self_citation:
            h['Total Citations by Year without Self Citations'] = h[
                'Citation by Years'].apply(
                    lambda l: self._get_total_citation_by_year(l))
        else:
            h['Total Citations by Year'] = h['Citation by Years'].apply(
                lambda l: self._get_total_citation_by_year(l))
        return h.remove_column("Citation by Years")
def _papers_citations_number_by_year_sframe(without_self_citation=True):
    """
    Get papers total number of citation in each year
    :param without_self_citation: if True calculate only non-self citations, other calculate with self-citations
    :return: SFrame with a column that contains citations_dict by year
    """
    logger.info("Creating Paper Citations by Year (without_self_citation=%s)" %
                without_self_citation)
    ref_sf = tc.load_sframe(EXTENDED_PAPER_REFERENCES_SFRAME)
    if without_self_citation:
        ref_sf = ref_sf[ref_sf['self citation'] == 0]

    sf = tc.load_sframe(PAPERS_SFRAME)["Paper ID", "Paper publish year"]
    sf = ref_sf.join(sf, on="Paper ID")
    g = sf.groupby(["Paper reference ID", "Paper publish year"],
                   {"Citation Number": agg.COUNT()})
    g = g.rename({
        "Paper publish year": "Year",
        "Paper reference ID": "Paper ID"
    })
    g['Citation by Year'] = g.apply(lambda r:
                                    (r["Year"], r["Citation Number"]))
    h = g.groupby(
        'Paper ID',
        {'Citation by Years': tc.aggregate.CONCAT('Citation by Year')})
    if without_self_citation:
        h['Total Citations by Year without Self Citations'] = h[
            'Citation by Years'].apply(
                lambda l: _get_total_citation_by_year(l))
    else:
        h['Total Citations by Year'] = h['Citation by Years'].apply(
            lambda l: _get_total_citation_by_year(l))
    h = h.remove_column("Citation by Years")
    return h
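
# `_get_total_citation_by_year` is not shown in this snippet. Judging from the resulting
# column names, a plausible reading is that it turns the list of (year, count) pairs into a
# cumulative citations-per-year dict; the helper below is a hypothetical sketch of that
# behaviour, not the original implementation.
def _get_total_citation_by_year(year_count_pairs):
    totals = {}
    running = 0
    for year, count in sorted(year_count_pairs):
        running += count
        totals[int(year)] = running
    return totals

# e.g. [(2010, 3), (2012, 5)] -> {2010: 3, 2012: 8}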
    def get_valid_venues_papers_ids_sframe_from_mag(min_ref_number,
                                                    min_journal_papers_num):
        """
        Return an SFrame of valid MAG venues: journals with a single name that have at least
        min_journal_papers_num papers, each with at least min_ref_number references
        """
        dataset_dir = pathlib.Path(STORAGE_PATH)
        mag_path = dataset_dir / "MAG"
        mag = MicrosoftAcademicGraph(mag_path)

        sf = mag.extended_papers['Journal ID mapped to venue name',
                                 'Original venue name', 'Paper ID',
                                 'Ref Number']
        sf = sf[sf['Ref Number'] >= min_ref_number]
        sf.materialize()
        sf['Journal name'] = sf['Original venue name'].apply(
            lambda n: n.lower().strip())
        sf.materialize()
        g = sf.groupby(
            ['Journal ID mapped to venue name'], {
                'Count': agg.COUNT(),
                'Paper IDs List': agg.CONCAT("Paper ID"),
                'Journals names': agg.CONCAT('Journal name')
            })
        g['Journals names'] = g['Journals names'].apply(lambda l: list(set(l)))
        g = g[g['Count'] >= min_journal_papers_num]
        g = g[g['Journals names'].apply(lambda l: len(l) == 1)]
        g['Journals names'] = g['Journals names'].apply(lambda l: l[0])
        g = g.rename({'Journals names': 'Journal name'})
        g.materialize()
        return g
def create_references_count_sframe():
    """Creating SFrame with the number of references in each paper"""
    logger.info("Creating References Count SFrame")
    if os.path.isdir(PAPER_REFERENCES_COUNT_SFRAME):
        return
    r_sf = tc.load_sframe(PAPER_REFERENCES_SFRAME)
    sf = r_sf.groupby("Paper ID", {"Ref Number": agg.COUNT()})
    sf.save(PAPER_REFERENCES_COUNT_SFRAME)
    def get_valid_venues_papers_ids_sframe(min_ref_number,
                                           min_journal_papers_num):
        """
        Return an SFrame of valid venues that satisfy the four criteria below
        """
        # Criterion I: use only journals that have at least one paper with a valid DOI
        # appearing in both the AMiner and MAG datasets
        sf = tc.load_sframe(str(AMINER_MAG_JOIN_SFRAME))
        sf['Original venue name'] = sf['Original venue name'].apply(
            lambda n: n.lower())
        g = sf.groupby(
            'Journal ID mapped to venue name', {
                'venue name': agg.CONCAT('Original venue name'),
                'issn': agg.CONCAT('issn')
            })

        g['issn'] = g['issn'].apply(lambda l: list(set(l)))
        g['venue name'] = g['venue name'].apply(lambda l: list(set(l)))

        # Criterion II: the journal has only a single name
        g = g[g['venue name'].apply(lambda l: len(l) == 1)]
        g.materialize()
        g['venue name'] = g['venue name'].apply(lambda l: l[0].strip())

        # Criterion III: the journal's name appears in SJR
        sjr_dict = VenueFetcher.get_sjr_journals_dict()
        g = g[g['venue name'].apply(lambda v: v in sjr_dict)]

        venues_ids = set(g['Journal ID mapped to venue name'])

        # Criterion IV: each venue needs at least min_journal_papers_num papers,
        # each with at least min_ref_number references
        dataset_dir = pathlib.Path(STORAGE_PATH)
        mag_path = dataset_dir / "MAG"
        mag = MicrosoftAcademicGraph(mag_path)

        sf = mag.extended_papers['Journal ID mapped to venue name',
                                 'Original venue name', 'Paper ID',
                                 'Ref Number']
        sf = sf[sf['Ref Number'] >= min_ref_number]
        sf.materialize()
        sf = sf[sf['Journal ID mapped to venue name'].apply(
            lambda i: i in venues_ids)]
        sf['Journal name'] = sf['Original venue name'].apply(
            lambda n: n.lower().strip())
        sf.materialize()
        # Note that with the full Papers SFrame a journal can have several names
        g = sf.groupby(
            ['Journal ID mapped to venue name'], {
                'Count': agg.COUNT(),
                'Paper IDs List': agg.CONCAT("Paper ID"),
                'Journals names': agg.CONCAT('Journal name')
            })
        g['Journals names'] = g['Journals names'].apply(lambda l: list(set(l)))
        g = g[g['Count'] >= min_journal_papers_num]
        g = g[g['Journals names'].apply(lambda l: len(l) == 1)]
        g['Journals names'] = g['Journals names'].apply(lambda l: l[0])
        g = g.rename({'Journals names': 'Journal name'})
        g.materialize()

        return g
Example #8
def generate_blacklist_roles():
    firstnames = SFrame.read_csv(f"{DATA_PATH}/firstnames.csv",
                                 verbose=False)["Name"]
    surenames = SFrame.read_csv(f"{DATA_PATH}/surenames.csv",
                                verbose=False)["name"]
    surenames = surenames.apply(lambda n: n.title())
    sf = SFrame.read_csv(f"{OUTPUT_PATH}/title.principals.tsv.gz",
                         delimiter="\t",
                         column_type_hints={"characters": list},
                         na_values=["\\N"])
    sf = sf.filter_by(["actor", "actress"], "category")["tconst", "ordering",
                                                        "characters", "nconst"]
    sf = sf.join(imdb_data.title[imdb_data.title["titleType"] == "movie"])
    sf = sf.stack("characters", "character")
    sf["character"] = sf["character"].apply(lambda c: c.title())
    sf.export_csv(f"{TEMP_PATH}/roles3.csv")

    whitelist = sf.groupby(key_column_names=['character', "nconst"],
                           operations={'count': agg.COUNT()})
    whitelist = whitelist[whitelist["count"] > 1]['character']
    sf = sf.filter_by(whitelist, "character", True)
    sf = sf.groupby(key_column_names=['character'],
                    operations={
                        'ordering': agg.AVG("ordering"),
                        'count': agg.COUNT()
                    })
    sf["name"] = sf["character"].apply(lambda c: c.split(" ")[-1].strip())
    sf = sf.filter_by(names.words(), "name", exclude=True)
    sf = sf.filter_by(surenames, "name", exclude=True)
    sf = sf.filter_by(firstnames, "name", exclude=True)
    sf = sf.sort("count", False)
    sf = sf[sf['ordering'] > 3]
    w = {x.replace("_", " ").title()
         for x in wordnet.words()} - set(names.words())
    sf["set"] = sf["character"].apply(lambda x: x.split(" "))
    sf["set"] = sf["set"].apply(lambda x: w & set(x))
    sf = sf[sf['count'] > 11].append(sf[(sf['count'] > 1) & (sf['count'] < 10)
                                        & (sf["set"] != [])])
    sf[["character"]].export_csv(f"{OUTPUT_PATH}/blacklist_roles.csv")
def get_agg_cols(postfix,
                 agg_type,
                 agg_cols=['not_skipped', 'skip_1', 'skip_2', 'skip_3']):
    if agg_type == "mean":
        return {("%s_mean_%s" % (col, postfix)): agg.MEAN(col)
                for col in agg_cols}
    elif agg_type == "sum":
        return {("%s_sum_%s" % (col, postfix)): agg.SUM(col)
                for col in agg_cols}
    elif agg_type == "count":
        return {("cnt_%s" % postfix): agg.COUNT()}
    else:
        raise RuntimeError("Aggregation is not supported by this function!")
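
# A possible way to use get_agg_cols: build the aggregation spec once and hand it to
# SFrame.groupby. The `session_sf` SFrame and its `session_code` key column are assumptions
# for illustration and are not part of the snippet above.
ops = get_agg_cols(postfix="first_half", agg_type="mean")
# ops == {'not_skipped_mean_first_half': agg.MEAN('not_skipped'), ...}
session_stats = session_sf.groupby('session_code', operations=ops)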
Example #10
def triangles():
    triangles_gender = get_relationship_triangles()

    movie_triangles = triangles_gender.groupby(["movie", "year", "total"],
                                               operations={'count': agg.COUNT()})

    triangles_at_movie = movie_triangles.to_dataframe().pivot_table(
        index=["movie", "year"],
        values="count",
        columns='total',
        aggfunc=lambda x: x)
    triangles_at_movie = triangles_at_movie.fillna(0)

    triangles_at_movie = triangles_at_movie.reset_index()
    return triangles_at_movie
Example #11
def create_ground_truth_names(baby_names_path, wikitree_users_path, ratio=0.9):
    """
    Createing SFrame with statistics on first name gender probability using data from WikiTree and SSA
    :param baby_names_path: the file to SSA baby names files
    :param wikitree_users_path: link to file with WikiTree names
    :param ratio: the ratio that above it the name gender is considered male
    :return: SFrame with data regarding first name gender
    :rtype: tc.SFrame
    :note: first names data files can be downloaded from  http://www.ssa.gov/oact/babynames/names.zip and
            https://www.wikitree.com/wiki/Help:Database_Dumps
    """
    sf = tc.SFrame.read_csv("%s/*.txt" % baby_names_path, header=False)
    sf = sf.rename({'X1': 'First Name', 'X2': 'Gender', 'X3': 'Count'})

    w_sf = tc.SFrame.read_csv(wikitree_users_path, delimiter="\t", header=True)
    w_sf = w_sf[['Preferred Name', 'Gender']]
    w_sf = w_sf.rename({'Preferred Name': 'First Name'})
    w_sf = w_sf[w_sf['Gender'] != 0]
    w_sf['First Name'] = w_sf['First Name'].apply(lambda n: n.split()[0]
                                                  if len(n) > 0 else '')
    w_sf = w_sf[w_sf['First Name'] != '']
    w_sf['Gender'] = w_sf['Gender'].apply(lambda g: 'M' if g == 1 else 'F')
    w_sf = w_sf.groupby(['First Name', 'Gender'], {'Count': agg.COUNT()})

    sf = sf.append(w_sf)
    sf['First Name'] = sf['First Name'].apply(lambda n: n.lower())
    g = sf.groupby(['First Name', 'Gender'], agg.SUM('Count'))

    g['stat'] = g.apply(lambda r: (r['Gender'], r['Sum of Count']))
    sf = g.groupby('First Name', {'Stats': agg.CONCAT('stat')})
    sf['Total Births'] = sf['Stats'].apply(lambda l: sum([i[1] for i in l]))
    sf['Total Males'] = sf['Stats'].apply(
        lambda l: sum([i[1] for i in l if i[0] == 'M']))
    sf['Percentage Males'] = sf.apply(
        lambda r: float(r['Total Males']) / r['Total Births'])
    sf = sf[sf['Total Births'] >= 5]

    def get_name_gender(p):
        if p >= ratio:
            return 'Male'
        if p <= (1 - ratio):
            return 'Female'
        return 'Unisex'

    sf['Gender'] = sf['Percentage Males'].apply(lambda p: get_name_gender(p))
    sf = sf.remove_column('Stats')

    return sf
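
# A hedged usage sketch of create_ground_truth_names; the file locations below are assumptions
# pointing at locally extracted copies of the SSA and WikiTree dumps mentioned in the docstring.
names_sf = create_ground_truth_names("data/ssa_names",
                                     "data/dump_people_users.csv",
                                     ratio=0.9)
print(names_sf[names_sf['First Name'] == 'alex'])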
    def popular_actors(self):
        if self._actors is None:
            download_file(IMDB_PRINCIPALS_URL, f"{OUTPUT_PATH}/title.principals.tsv.gz", False)
            self._actors = SFrame.read_csv(f"{OUTPUT_PATH}/title.principals.tsv.gz", delimiter="\t", na_values=["\\N"],
                                           verbose=self._verbose)
            self._actors = self._actors.filter_by(["actor", "actress"], "category")["tconst", "nconst"]

            self._actors = self._actors.join(
                self.rating[(self.rating["titleType"] == "movie") & (self.rating["numVotes"] > 1000)])
            self._actors = self._actors.groupby("nconst", operations={'averageRating': agg.AVG("averageRating"),
                                                                      'count': agg.COUNT()})
            self._actors = self._actors.sort("averageRating", ascending=False)
            names = SFrame.read_csv(f"{OUTPUT_PATH}/name.basics.tsv.gz", delimiter="\t")

            self._actors = self._actors.join(names)
            self._actors["gender"] = self._actors.apply(lambda p: self.add_actor_gender(p))

        return self._actors
Example #13
    def triangles(self):
        triangles_gender = get_relationship_triangles()
        # triangles_gender["1"] = triangles_gender["X.0"] == "M"
        # triangles_gender["2"] = triangles_gender["X.1"] == "M"
        # triangles_gender["3"] = triangles_gender["X.2"] == "M"
        # triangles_gender["total"] = triangles_gender["1"] + triangles_gender["2"] + triangles_gender["3"]

        movie_triangles = triangles_gender.groupby(
            ["movie", "year", "total_men"], operations={'count': agg.COUNT()})
        triangles_at_movie = movie_triangles.to_dataframe().pivot_table(
            index=["movie", "year"],
            values="count",
            columns='total_men',
            aggfunc=lambda x: x)
        triangles_at_movie = triangles_at_movie.fillna(0)

        triangles_at_movie = triangles_at_movie.reset_index()
        # bechdel_triangles = SFrame(triangles_at_movie).join(self.bechdel_imdb, {"tconst": "tconst"})
        return triangles_at_movie
def get_papers_sframe(min_ref_num=None, start_year=None, end_year=None):
    """
    Return SFrame with Papers data accoring to the input filter variables
    :param min_ref_num:  paper's minimal references number
    :param start_year: start year (only include paper that were published after start year)
    :param end_year: end year (only include paper that were published before end year)
    :return: SFrame with paper data
    :rtype: tc.SFrame
    :note: after the SFrame is created it is saved to the TMP_DIR to future use
    """
    sf = tc.load_sframe(PAPER_REFERENCES_SFRAME)
    tmp_papers_sf_path = _get_tmp_papers_sframe_path(min_ref_num, start_year,
                                                     end_year)
    if os.path.isdir(tmp_papers_sf_path):
        return tc.load_sframe(tmp_papers_sf_path)

    if min_ref_num is not None:
        logger.info(
            f"Getting paper ids with at least {min_ref_num} references")
        sf = sf.groupby(
            'Paper ID',
            {'Ref Count': agg.COUNT()})  # There are 30058322 in the list
        sf = sf[sf['Ref Count'] >= min_ref_num]  # left with 22,083,058
        sf.__materialize__()
    p_sf = tc.load_sframe(PAPERS_SFRAME)
    sf = p_sf.join(sf)
    if start_year is not None:
        logger.info("Getting papers from %s onward" % start_year)
        sf = sf[sf['Paper publish year'] >= start_year]
    if end_year is not None:
        logger.info("Getting papers until %s" % end_year)
        sf = sf[sf['Paper publish year'] <= end_year]
    sf.__materialize__()

    if not os.path.isdir(tmp_papers_sf_path):
        sf.save(tmp_papers_sf_path)

    return sf
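
# `_get_tmp_papers_sframe_path` is not shown above; the sketch below is a hypothetical helper,
# assuming TMP_DIR holds the cached SFrames and the filter values are encoded in the file name.
import os


def _get_tmp_papers_sframe_path(min_ref_num, start_year, end_year):
    return os.path.join(TMP_DIR,
                        f"papers_{min_ref_num}_{start_year}_{end_year}.sframe")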
    def get_papers_sframe(self,
                          min_ref_num=None,
                          start_year=None,
                          end_year=None):
        """
        Return SFrame with Papers data according to the input filter variables
        :param min_ref_num:  paper's minimal references number
        :param start_year: start year (only include paper that were published after start year)
        :param end_year: end year (only include paper that were published before end year)
        :return: SFrame with paper data
        :rtype: SFrame
        :note: after the SFrame is created it is saved to the TMP_DIR to future use
        """
        sf = self.references
        tmp_papers_sf_path = self._get_tmp_papers_sframe_path(
            min_ref_num, start_year, end_year)
        if tmp_papers_sf_path.is_dir():
            return load_sframe(str(tmp_papers_sf_path))

        if min_ref_num is not None:
            sf = sf.groupby(
                'PaperId',
                {'Ref Count': agg.COUNT()})  # There are 30058322 in the list
            sf = sf[sf['Ref Count'] >= min_ref_num]  # left with 22,083,058
            sf.__materialize__()
        p_sf = self.papers
        sf = p_sf.join(sf)
        if start_year is not None:
            sf = sf[sf['Year'] >= start_year]
        if end_year is not None:
            sf = sf[sf['Year'] <= end_year]
        sf.__materialize__()

        if not tmp_papers_sf_path.is_dir():
            sf.save(str(tmp_papers_sf_path))

        return sf
Example #16
    def predict(self):
        frames = []
        with open(self.list_loc, 'r') as fp:
            stock_list = fp.read().splitlines()
            for stock in stock_list:
                df = self.download_news(stock, 1)
                frames.append(df)

        data = pd.concat(frames, ignore_index=True)
        print(data)
        sf = tc.SFrame(data)

        model = tc.load_model(self.model_loc)
        # Save predictions to an SArray
        predictions = model.predict(sf)
        sf['prediction'] = predictions
        #sf.explore()
        trade_list = sf.groupby(key_column_names='stock',
                                operations={
                                    'avg': agg.MEAN('prediction'),
                                    'count': agg.COUNT()
                                })
        #trade_list['label'] = trade_list.apply(lambda x: 'rise' if (x['avg'] >= 0.8 and x['count'] >= 10) else 'drop')
        self.shortlist = trade_list.to_dataframe()
Example #17
    parser.add_argument('--functions',
                        help='name of the functions directory',
                        required=True)
    parser.add_argument('--p',
                        help='partition number',
                        type=int,
                        required=True)
    parser.add_argument('--output', help='output path', required=True)
    args = parser.parse_args()

    path = setup_path(args)
    setup_logging(path=path, parser=parser)

    tc.config.set_runtime_config('TURI_FILEIO_MAXIMUM_CACHE_CAPACITY',
                                 5 * 2147483648)
    tc.config.set_runtime_config('TURI_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE',
                                 5 * 134217728)
    # following can reduce the memory footprint
    tc.config.set_runtime_config('TURI_DEFAULT_NUM_PYLAMBDA_WORKERS', 4)

    mw = load_functions_partition(directory=args.functions, name=args.p)
    logging.info(f"Read {mw.num_rows()} rows")

    ags = mw.groupby(key_column_names='apk',
                     operations={'fcount': agg.COUNT()})
    ags.save(f"{path}/apks.csv", format='csv')

    fgs = mw.groupby(key_column_names='function',
                     operations={'acount': agg.COUNT()})
    fgs.save(f"{path}/funcs.csv", format='csv')
Example #18
import turicreate as gl
import turicreate.aggregate as agg
import re
# working on papers with at least 5 references
r_sf = gl.load_sframe('./PaperReferences.sframe')

r_sf = r_sf.groupby(
    'Paper ID', {'Ref Count': agg.COUNT()})  # There are 30058322 in the list
r_sf.save('/data/sframes/PapersRefCount.sframe')
r_sf = r_sf[r_sf['Ref Count'] >= 5]  # left with 22,083,058

p_sf = gl.load_sframe("./Papers.sframe/")  # 126,903,970 rows
p_sf = r_sf.join(p_sf)  # 22,082,741
p_sf.save('./PapersMin5Ref.sframe')

p_sf = gl.load_sframe('./PapersMin5Ref.sframe')
a_sf = gl.load_sframe('./PaperAuthorAffiliations.sframe/')  # 337000127
sf = p_sf[['Paper ID']].join(a_sf)  # 86,561,861 rows
sf = sf.join(p_sf, on="Paper ID")
sf.groupby(
    "Author ID", {
        'Papers Count': agg.COUNT_DISTINCT('Paper ID'),
        'start_year': agg.MIN('Paper publish year'),
        'last_year': agg.MAX('Paper publish year'),
        'mean_ref_count': agg.AVG('Ref Count'),
        'papers_list': agg.CONCAT('Paper ID'),
        'journals_list': agg.CONCAT('Journal ID mapped to venue name'),
        'conference_list': agg.CONCAT('Conference ID mapped to venue name'),
        'affilation_list': agg.CONCAT('Affiliation ID')
    })
    def get_number_of_papers_by_year(self):
        sf = self._all_papers_sf.groupby("Paper publish year",
                                         {"Count": agg.COUNT()})
        return {r["Paper publish year"]: r["Count"] for r in sf}
random_tweets_sf['docs'] = docs

sample_random_tweets_sf = random_tweets_sf.sample(fraction=0.05)
# Predict topic probability:
topic_probability_array = topic_model.predict(sample_random_tweets_sf['docs'],
                                              output_type='probability')

topic_dict = defaultdict(list)
for probabilities in tqdm(topic_probability_array):
    topic_num = 1
    for topic_prob in probabilities:
        topic_dict['Topic ' + str(topic_num)].append(topic_prob)
        topic_num += 1

# Distribution of Topics in the Corpora:
import turicreate.aggregate as agg
topic_gb = random_tweets_sf.groupby(key_column_names='Topic',
                                    operations={'id': agg.COUNT()})
topic_gb = topic_gb.to_dataframe()
topic_gb = topic_gb.rename(columns={"id": "count"})
ax = sns.barplot(x="Topic", y='count', data=topic_gb)
ax.set_title('Distribution of Topics in the Corpora')
plt.show()

# Get daily topic distribution:
topic_prob_gb = topic_prob_sf.groupby(key_column_names='date',
                                      operations={cols[0]: agg.AVG(cols[0])})
for col in cols[1:]:
    gb = topic_prob_sf.groupby(key_column_names='date',
                               operations={col: agg.AVG(col)})
    topic_prob_gb = topic_prob_gb.add_columns(gb[[col]])
topic_prob_gb = topic_prob_gb.to_dataframe()

topic_prob_gb['date_str'] = topic_prob_gb['date'].astype(str)
daily_topic_dist_df = topic_prob_gb.merge(corona_per_dates_gb, left_on='date_str', right_on='date', how='left')
print("## ii.) Track info based aggregations")

track_stats_dir = "%s/train/%s/" % (experiment_dir, stats_experiment_id)

for part in ["first", "second", "both"]:
    track_infos = tc.load_sframe("%s/%s_track_infos" % (track_stats_dir, part))
    print(part, track_infos.shape)
    session_data = batch_join(session_data, track_infos, ["track_code"])
    print("#### free memory")
    del track_infos

print("## iii.) Repeat count of tracks")

track_repeat = session_data.groupby(["session_code", "track_code"],
                                    operations={"repeat_cnt": agg.COUNT()})

track_repeat = track_repeat[track_repeat["repeat_cnt"] > 1]
keys = list(zip(track_repeat["session_code"], track_repeat["track_code"]))
track_repeat_dict = dict(zip(keys, track_repeat["repeat_cnt"]))
del track_repeat

session_data["repeat_cnt"] = session_data.apply(
    lambda x: track_repeat_dict.get((x["session_code"], x["track_code"]), 1))
del track_repeat_dict

print("## iv.) Mahalanobis distance")


def get_dists(sf, cols, variance_dict):
    for i, col in enumerate(cols):
    def reference_count(self):
        return self.references.groupby("PaperId", {"Ref Number": agg.COUNT()})
    def get_directors_data(self):

        rating = self.rating[self.rating["numVotes"] > 10000]

        sf = self.crew.join(rating)

        title = self.title[self.title["titleType"] == "movie"]
        sf = sf.join(title)
        sf = sf.groupby(key_column_names='directors',
                        operations={'averageRating': agg.AVG("averageRating"), 'count': agg.COUNT()})

        sf = sf[sf["count"] > 5]

        names = SFrame.read_csv(f"{OUTPUT_PATH}/name.basics.tsv.gz", delimiter="\t")
        sf = sf.join(names, {"directors": "nconst"})
        return sf.sort("averageRating", ascending=False)
    print(part, track_infos.shape)
    session_data = batch_join(session_data, track_infos[list(track_inf_cols[part])], ["track_code"])
    print("#### free memory")
    del track_infos

print("## iii.) Repeat count of tracks")

calc_repeat_cnt = False
for m_col in model_features:
    if "repeat_cnt" in m_col:
        calc_repeat_cnt = True
        break
print("calculate repeat_cnt:", calc_repeat_cnt)

if calc_repeat_cnt:
    track_repeat = session_data.groupby(["session_code", "track_code"],
                                        operations={"repeat_cnt": agg.COUNT()})
    track_repeat = track_repeat[track_repeat["repeat_cnt"] > 1]
    keys = list(zip(track_repeat["session_code"], track_repeat["track_code"]))
    track_repeat_dict = dict(zip(keys, track_repeat["repeat_cnt"]))
    del track_repeat
    session_data["repeat_cnt"] = session_data.apply(
        lambda x: track_repeat_dict.get((x["session_code"], x["track_code"]), 1))
    del track_repeat_dict
    print("'repeat_cnt' generated!")
else:
    print("'repeat_cnt' skipped!")

print("## iv.) Mahalanobis distance")

calc_distances = False
for m_col in model_features:
    if "dist_from_sess_mean" in m_col: