def aminer_mag_links_by_doi(self):
    """
    Create a links SFrame that matches papers from the MAG dataset with papers
    from the AMiner dataset based on the papers' DOI
    :return: SFrame with one row per matched MAG/AMiner paper pair
    """
    # Drop MAG papers whose DOI appears more than once
    extended_papers = self.mag.extended_papers
    g1 = extended_papers.groupby('Paper Document Object Identifier (DOI)',
                                 {'Count': agg.COUNT()})
    s1 = set(g1[g1['Count'] > 1]['Paper Document Object Identifier (DOI)'])
    extended_papers = extended_papers[
        extended_papers['Paper Document Object Identifier (DOI)'].apply(
            lambda doi: doi not in s1)]
    extended_papers.materialize()

    # Drop AMiner papers whose DOI appears more than once
    aminer = self.aminer.data
    g2 = aminer.groupby('doi', {'Count': agg.COUNT()})
    s2 = set(g2[g2['Count'] > 1]['doi'])
    aminer = aminer[aminer['doi'].apply(lambda doi: doi not in s2)]
    aminer.materialize()

    # Join the two datasets on DOI and keep only rows with non-empty titles
    aminer_mag = extended_papers.join(
        aminer, {'Paper Document Object Identifier (DOI)': 'doi'})
    aminer_mag['title_len'] = aminer_mag['title'].apply(lambda t: len(t))
    aminer_mag['title_len2'] = aminer_mag['Original paper title'].apply(
        lambda t: len(t))
    aminer_mag = aminer_mag[aminer_mag['title_len'] > 0]
    aminer_mag = aminer_mag[aminer_mag['title_len2'] > 0]
    aminer_mag = aminer_mag.rename({
        "Paper ID": "MAG Paper ID",
        "id": "Aminer Paper ID"
    })
    return aminer_mag.remove_columns(['title_len', 'title_len2'])
def create_aminer_mag_links_by_doi_sframe():
    """
    Create a links SFrame that matches papers from the MAG dataset with papers
    from the AMiner dataset based on the papers' DOI
    :return: None; the joined SFrame is saved to AMINER_MAG_JOIN_SFRAME
    """
    if os.path.isdir(AMINER_MAG_JOIN_SFRAME):
        return
    # Keep only MAG papers with a unique DOI
    sf = tc.load_sframe(EXTENDED_PAPERS_SFRAME)
    g1 = sf.groupby('Paper Document Object Identifier (DOI)',
                    {'Count': agg.COUNT()})
    s1 = set(g1[g1['Count'] > 1]['Paper Document Object Identifier (DOI)'])
    sf = sf[sf['Paper Document Object Identifier (DOI)'].apply(
        lambda doi: doi not in s1)]
    sf.materialize()

    # Keep only AMiner papers with a unique DOI
    sf2 = tc.load_sframe(AMINER_PAPERS_SFRAME)
    g2 = sf2.groupby('doi', {'Count': agg.COUNT()})
    s2 = set(g2[g2['Count'] > 1]['doi'])
    sf2 = sf2[sf2['doi'].apply(lambda doi: doi not in s2)]
    sf2.materialize()

    # Join on DOI, drop rows with empty titles, and save the result
    j = sf.join(sf2, {'Paper Document Object Identifier (DOI)': 'doi'})
    j['title_len'] = j['title'].apply(lambda t: len(t))
    j['title_len2'] = j['Original paper title'].apply(lambda t: len(t))
    j = j[j['title_len'] > 0]
    j = j[j['title_len2'] > 0]
    j = j.rename({"Paper ID": "MAG Paper ID", "id": "Aminer Paper ID"})
    j = j.remove_columns(['title_len', 'title_len2'])
    j.save(AMINER_MAG_JOIN_SFRAME)
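# A minimal, self-contained sketch of the dedup-before-join pattern used by
# the two functions above: rows whose join key is duplicated on either side
# are dropped so the DOI join stays one-to-one. The toy data below is
# illustrative only and is not part of the original pipeline.
import turicreate as tc
import turicreate.aggregate as agg

left = tc.SFrame({'doi': ['a', 'b', 'b', 'c'],
                  'title': ['t1', 't2', 't3', 't4']})
right = tc.SFrame({'doi': ['a', 'c', 'c'], 'id': [1, 2, 3]})


def drop_duplicate_keys(sf, key):
    # Count rows per key and keep only keys that appear exactly once
    g = sf.groupby(key, {'Count': agg.COUNT()})
    dup_keys = set(g[g['Count'] > 1][key])
    return sf[sf[key].apply(lambda k: k not in dup_keys)]


joined = drop_duplicate_keys(left, 'doi').join(
    drop_duplicate_keys(right, 'doi'), 'doi')
# 'joined' now holds only DOI 'a'; 'b' and 'c' were duplicated on one side.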
def _papers_citations_number_by_year(self, without_self_citation=True):
    """
    Get each paper's total number of citations in each year
    :param without_self_citation: if True count only non-self citations,
        otherwise count self-citations as well
    :return: SFrame with a column that contains a citations dict by year
    """
    ref_sf = self.extended_references
    if without_self_citation:
        ref_sf = ref_sf[ref_sf['self citation'] == 0]
    sf = self.papers["PaperId", "Year"]
    sf = ref_sf.join(sf, on="PaperId")
    # Count how many times each referenced paper is cited in each year
    g = sf.groupby(["PaperReferenceId", "Year"],
                   {"Citation Number": agg.COUNT()})
    g = g.rename({"PaperReferenceId": "PaperId"})
    g['Citation by Year'] = g.apply(
        lambda r: (r["Year"], r["Citation Number"]))
    h = g.groupby('PaperId',
                  {'Citation by Years': agg.CONCAT('Citation by Year')})
    if without_self_citation:
        h['Total Citations by Year without Self Citations'] = h[
            'Citation by Years'].apply(
                lambda l: self._get_total_citation_by_year(l))
    else:
        h['Total Citations by Year'] = h['Citation by Years'].apply(
            lambda l: self._get_total_citation_by_year(l))
    return h.remove_column("Citation by Years")
def _papers_citations_number_by_year_sframe(without_self_citation=True):
    """
    Get each paper's total number of citations in each year
    :param without_self_citation: if True count only non-self citations,
        otherwise count self-citations as well
    :return: SFrame with a column that contains a citations dict by year
    """
    logger.info(
        "Creating Paper Citations by Year (without_self_citation=%s)" %
        without_self_citation)
    ref_sf = tc.load_sframe(EXTENDED_PAPER_REFERENCES_SFRAME)
    if without_self_citation:
        ref_sf = ref_sf[ref_sf['self citation'] == 0]
    sf = tc.load_sframe(PAPERS_SFRAME)["Paper ID", "Paper publish year"]
    sf = ref_sf.join(sf, on="Paper ID")
    g = sf.groupby(["Paper reference ID", "Paper publish year"],
                   {"Citation Number": agg.COUNT()})
    g = g.rename({
        "Paper publish year": "Year",
        "Paper reference ID": "Paper ID"
    })
    g['Citation by Year'] = g.apply(
        lambda r: (r["Year"], r["Citation Number"]))
    h = g.groupby(
        'Paper ID',
        {'Citation by Years': tc.aggregate.CONCAT('Citation by Year')})
    if without_self_citation:
        h['Total Citations by Year without Self Citations'] = h[
            'Citation by Years'].apply(
                lambda l: _get_total_citation_by_year(l))
    else:
        h['Total Citations by Year'] = h['Citation by Years'].apply(
            lambda l: _get_total_citation_by_year(l))
    h = h.remove_column("Citation by Years")
    return h
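# The helper _get_total_citation_by_year used by the two functions above is
# not shown in this section. A minimal plausible sketch, assuming "total"
# means a running (cumulative) sum over the per-year (year, count) pairs;
# the actual implementation may differ:
def _get_total_citation_by_year(citations_by_year):
    total = 0
    result = {}
    for year, count in sorted(citations_by_year):
        total += count
        result[year] = total
    return result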
def get_valid_venues_papers_ids_sframe_from_mag(min_ref_number,
                                                min_journal_papers_num):
    dataset_dir = pathlib.Path(STORAGE_PATH)
    mag_path = dataset_dir / "MAG"
    mag = MicrosoftAcademicGraph(mag_path)
    sf = mag.extended_papers['Journal ID mapped to venue name',
                             'Original venue name', 'Paper ID', 'Ref Number']
    sf = sf[sf['Ref Number'] >= min_ref_number]
    sf.materialize()
    sf['Journal name'] = sf['Original venue name'].apply(
        lambda n: n.lower().strip())
    sf.materialize()
    # Group papers by journal and keep only journals with a single name and
    # at least min_journal_papers_num papers
    g = sf.groupby(
        ['Journal ID mapped to venue name'], {
            'Count': agg.COUNT(),
            'Paper IDs List': agg.CONCAT("Paper ID"),
            'Journals names': agg.CONCAT('Journal name')
        })
    g['Journals names'] = g['Journals names'].apply(lambda l: list(set(l)))
    g = g[g['Count'] >= min_journal_papers_num]
    g = g[g['Journals names'].apply(lambda l: len(l) == 1)]
    g['Journals names'] = g['Journals names'].apply(lambda l: l[0])
    g = g.rename({'Journals names': 'Journal name'})
    g.materialize()
    return g
def create_references_count_sframe():
    """Create an SFrame with the number of references in each paper"""
    logger.info("Creating References Count SFrame")
    if os.path.isdir(PAPER_REFERENCES_COUNT_SFRAME):
        return
    r_sf = tc.load_sframe(PAPER_REFERENCES_SFRAME)
    sf = r_sf.groupby("Paper ID", {"Ref Number": agg.COUNT()})
    sf.save(PAPER_REFERENCES_COUNT_SFRAME)
def get_valid_venues_papers_ids_sframe(min_ref_number, min_journal_papers_num):
    # Criteria I: use only journals that have papers with a valid DOI that
    # appears in both the AMiner and MAG datasets
    sf = tc.load_sframe(str(AMINER_MAG_JOIN_SFRAME))
    sf['Original venue name'] = sf['Original venue name'].apply(
        lambda n: n.lower())
    g = sf.groupby(
        'Journal ID mapped to venue name', {
            'venue name': agg.CONCAT('Original venue name'),
            'issn': agg.CONCAT('issn')
        })
    g['issn'] = g['issn'].apply(lambda l: list(set(l)))
    g['venue name'] = g['venue name'].apply(lambda l: list(set(l)))

    # Criteria II: the journal has only a single name
    g = g[g['venue name'].apply(lambda l: len(l) == 1)]
    g.materialize()
    g['venue name'] = g['venue name'].apply(lambda l: l[0].strip())

    # Criteria III: the journal's name appears in SJR
    sjr_dict = VenueFetcher.get_sjr_journals_dict()
    g = g[g['venue name'].apply(lambda v: v in sjr_dict)]
    venues_ids = set(g['Journal ID mapped to venue name'])

    # Criteria IV: each venue needs at least min_journal_papers_num papers,
    # each with at least min_ref_number references
    dataset_dir = pathlib.Path(STORAGE_PATH)
    mag_path = dataset_dir / "MAG"
    mag = MicrosoftAcademicGraph(mag_path)
    sf = mag.extended_papers['Journal ID mapped to venue name',
                             'Original venue name', 'Paper ID', 'Ref Number']
    sf = sf[sf['Ref Number'] >= min_ref_number]
    sf.materialize()
    sf = sf[sf['Journal ID mapped to venue name'].apply(
        lambda i: i in venues_ids)]
    sf['Journal name'] = sf['Original venue name'].apply(
        lambda n: n.lower().strip())
    sf.materialize()
    # Notice that with the full Papers SFrame a journal can have several names
    g = sf.groupby(
        ['Journal ID mapped to venue name'], {
            'Count': agg.COUNT(),
            'Paper IDs List': agg.CONCAT("Paper ID"),
            'Journals names': agg.CONCAT('Journal name')
        })
    g['Journals names'] = g['Journals names'].apply(lambda l: list(set(l)))
    g = g[g['Count'] >= min_journal_papers_num]
    g = g[g['Journals names'].apply(lambda l: len(l) == 1)]
    g['Journals names'] = g['Journals names'].apply(lambda l: l[0])
    g = g.rename({'Journals names': 'Journal name'})
    g.materialize()
    return g
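# Hedged usage sketch for the function above; the threshold values are
# illustrative, not values taken from the original pipeline.
valid_venues = get_valid_venues_papers_ids_sframe(min_ref_number=5,
                                                  min_journal_papers_num=100)
# One row per journal that passed all four criteria, with 'Journal name',
# 'Count', and the 'Paper IDs List' of its qualifying papers.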
def generate_blacklist_roles():
    # Load first- and last-name lists used to filter out character names that
    # are also real person names (the surenames.csv filename is kept as-is)
    firstnames = SFrame.read_csv(f"{DATA_PATH}/firstnames.csv",
                                 verbose=False)["Name"]
    surnames = SFrame.read_csv(f"{DATA_PATH}/surenames.csv",
                               verbose=False)["name"]
    surnames = surnames.apply(lambda n: n.title())

    sf = SFrame.read_csv(f"{OUTPUT_PATH}/title.principals.tsv.gz",
                         delimiter="\t",
                         column_type_hints={"characters": list},
                         na_values=["\\N"])
    sf = sf.filter_by(["actor", "actress"],
                      "category")["tconst", "ordering", "characters", "nconst"]
    sf = sf.join(imdb_data.title[imdb_data.title["titleType"] == "movie"])
    sf = sf.stack("characters", "character")
    sf["character"] = sf["character"].apply(lambda c: c.title())
    sf.export_csv(f"{TEMP_PATH}/roles3.csv")

    # Characters played by the same actor more than once are whitelisted
    whitelist = sf.groupby(key_column_names=['character', "nconst"],
                           operations={'count': agg.COUNT()})
    whitelist = whitelist[whitelist["count"] > 1]['character']
    sf = sf.filter_by(whitelist, "character", True)
    sf = sf.groupby(key_column_names=['character'],
                    operations={
                        'ordering': agg.AVG("ordering"),
                        'count': agg.COUNT()
                    })
    sf["name"] = sf["character"].apply(lambda c: c.split(" ")[-1].strip())
    sf = sf.filter_by(names.words(), "name", exclude=True)
    sf = sf.filter_by(surnames, "name", exclude=True)
    sf = sf.filter_by(firstnames, "name", exclude=True)
    sf = sf.sort("count", False)
    sf = sf[sf['ordering'] > 3]
    # Keep characters whose names contain dictionary words that are not names
    w = {x.replace("_", " ").title() for x in wordnet.words()} - set(
        names.words())
    sf["set"] = sf["character"].apply(lambda x: x.split(" "))
    sf["set"] = sf["set"].apply(lambda x: w & set(x))
    sf = sf[sf['count'] > 11].append(
        sf[(sf['count'] > 1) & (sf['count'] < 10) & (sf["set"] != [])])
    sf[["character"]].export_csv(f"{OUTPUT_PATH}/blacklist_roles.csv")
def get_agg_cols(postfix,
                 agg_type,
                 agg_cols=['not_skipped', 'skip_1', 'skip_2', 'skip_3']):
    # Build a groupby operations dict that aggregates each column in agg_cols
    # with the requested aggregator, naming outputs '<col>_<agg>_<postfix>'
    if agg_type == "mean":
        return {("%s_mean_%s" % (col, postfix)): agg.MEAN(col)
                for col in agg_cols}
    elif agg_type == "sum":
        return {("%s_sum_%s" % (col, postfix)): agg.SUM(col)
                for col in agg_cols}
    elif agg_type == "count":
        return {("cnt_%s" % postfix): agg.COUNT()}
    else:
        raise RuntimeError("Aggregation type '%s' is not supported by this "
                           "function!" % agg_type)
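# Hedged usage sketch for get_agg_cols: the skip-column names follow the
# defaults above, but the 'sessions' SFrame itself is a made-up stand-in.
import turicreate as tc

sessions = tc.SFrame({
    'session_code': ['s1', 's1', 's2'],
    'not_skipped': [1, 0, 1],
    'skip_1': [0, 1, 0],
    'skip_2': [0, 0, 0],
    'skip_3': [0, 1, 1],
})
ops = get_agg_cols("session", "mean")
ops.update(get_agg_cols("session", "count"))
# Yields columns such as 'not_skipped_mean_session' and 'cnt_session'
per_session = sessions.groupby('session_code', ops)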
def triangles():
    triangles_gender = get_relationship_triangles()
    movie_triangle = triangles_gender.groupby(
        ["movie", "year", "total"], operations={'count': agg.COUNT()})
    # Pivot so each value of 'total' becomes its own column per movie/year
    triangles_at_movie = movie_triangle.to_dataframe().pivot_table(
        index=["movie", "year"],
        values="count",
        columns='total',
        aggfunc=lambda x: x)
    triangles_at_movie = triangles_at_movie.fillna(0)
    triangles_at_movie = triangles_at_movie.reset_index()
    return triangles_at_movie
def create_ground_truth_names(baby_names_path, wikitree_users_path, ratio=0.9):
    """
    Create an SFrame with statistics on first-name gender probability, using
    data from WikiTree and the SSA
    :param baby_names_path: path to the directory with the SSA baby names files
    :param wikitree_users_path: path to the file with WikiTree names
    :param ratio: the male-percentage threshold above which a name is
        considered male (and below 1 - ratio, female)
    :return: SFrame with data regarding first-name gender
    :rtype: tc.SFrame
    :note: first names data files can be downloaded from
        http://www.ssa.gov/oact/babynames/names.zip and
        https://www.wikitree.com/wiki/Help:Database_Dumps
    """
    sf = tc.SFrame.read_csv("%s/*.txt" % baby_names_path, header=False)
    sf = sf.rename({'X1': 'First Name', 'X2': 'Gender', 'X3': 'Count'})

    w_sf = tc.SFrame.read_csv(wikitree_users_path, delimiter="\t", header=True)
    w_sf = w_sf[['Preferred Name', 'Gender']]
    w_sf = w_sf.rename({'Preferred Name': 'First Name'})
    w_sf = w_sf[w_sf['Gender'] != 0]
    w_sf['First Name'] = w_sf['First Name'].apply(
        lambda n: n.split()[0] if len(n) > 0 else '')
    w_sf = w_sf[w_sf['First Name'] != '']
    w_sf['Gender'] = w_sf['Gender'].apply(lambda g: 'M' if g == 1 else 'F')
    w_sf = w_sf.groupby(['First Name', 'Gender'], {'Count': agg.COUNT()})

    sf = sf.append(w_sf)
    sf['First Name'] = sf['First Name'].apply(lambda n: n.lower())
    g = sf.groupby(['First Name', 'Gender'], agg.SUM('Count'))
    g['stat'] = g.apply(lambda r: (r['Gender'], r['Sum of Count']))
    sf = g.groupby('First Name', {'Stats': agg.CONCAT('stat')})
    sf['Total Births'] = sf['Stats'].apply(lambda l: sum([i[1] for i in l]))
    sf['Total Males'] = sf['Stats'].apply(
        lambda l: sum([i[1] for i in l if i[0] == 'M']))
    sf['Percentage Males'] = sf.apply(
        lambda r: float(r['Total Males']) / r['Total Births'])
    sf = sf[sf['Total Births'] >= 5]

    def get_name_gender(p):
        if p >= ratio:
            return 'Male'
        if p <= (1 - ratio):
            return 'Female'
        return 'Unisex'

    sf['Gender'] = sf['Percentage Males'].apply(lambda p: get_name_gender(p))
    sf = sf.remove_column('Stats')
    return sf
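# Hedged usage sketch for create_ground_truth_names; the file locations below
# are placeholders, not paths from the original project.
names_sf = create_ground_truth_names("/data/ssa_names",
                                     "/data/wikitree_users.tsv",
                                     ratio=0.9)
# Expected columns: 'First Name', 'Total Births', 'Total Males',
# 'Percentage Males', and the derived 'Gender' label (Male/Female/Unisex).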
def popular_actors(self):
    if self._actors is None:
        download_file(IMDB_PRINCIPALS_URL,
                      f"{OUTPUT_PATH}/title.principals.tsv.gz", False)
        self._actors = SFrame.read_csv(
            f"{OUTPUT_PATH}/title.principals.tsv.gz",
            delimiter="\t",
            na_values=["\\N"],
            verbose=self._verbose)
        self._actors = self._actors.filter_by(
            ["actor", "actress"], "category")["tconst", "nconst"]
        self._actors = self._actors.join(
            self.rating[(self.rating["titleType"] == "movie")
                        & (self.rating["numVotes"] > 1000)])
        self._actors = self._actors.groupby(
            "nconst",
            operations={
                'averageRating': agg.AVG("averageRating"),
                'count': agg.COUNT()
            })
        self._actors = self._actors.sort("averageRating", ascending=False)
        names = SFrame.read_csv(f"{OUTPUT_PATH}/name.basics.tsv.gz",
                                delimiter="\t")
        self._actors = self._actors.join(names)
        self._actors["gender"] = self._actors.apply(
            lambda p: self.add_actor_gender(p))
    return self._actors
def triangles(self):
    # 'total_men' is precomputed upstream as the number of male members in
    # each relationship triangle (summing gender == "M" over the three members)
    triangles_gender = get_relationship_triangles()
    movie_triangle = triangles_gender.groupby(
        ["movie", "year", "total_men"], operations={'count': agg.COUNT()})
    # Pivot so each value of 'total_men' becomes its own column per movie/year
    triangles_at_movie = movie_triangle.to_dataframe().pivot_table(
        index=["movie", "year"],
        values="count",
        columns='total_men',
        aggfunc=lambda x: x)
    triangles_at_movie = triangles_at_movie.fillna(0)
    triangles_at_movie = triangles_at_movie.reset_index()
    return triangles_at_movie
def get_papers_sframe(min_ref_num=None, start_year=None, end_year=None):
    """
    Return an SFrame with papers data according to the input filter variables
    :param min_ref_num: paper's minimal references number
    :param start_year: start year (only include papers published in or after start year)
    :param end_year: end year (only include papers published in or before end year)
    :return: SFrame with papers data
    :rtype: tc.SFrame
    :note: after the SFrame is created it is saved to the TMP_DIR for future use
    """
    sf = tc.load_sframe(PAPER_REFERENCES_SFRAME)
    tmp_papers_sf_path = _get_tmp_papers_sframe_path(min_ref_num, start_year,
                                                     end_year)
    if os.path.isdir(tmp_papers_sf_path):
        return tc.load_sframe(tmp_papers_sf_path)

    if min_ref_num is not None:
        logger.info(
            f"Getting papers ids with at least {min_ref_num} references")
        sf = sf.groupby(
            'Paper ID',
            {'Ref Count': agg.COUNT()})  # There are 30,058,322 in the list
        sf = sf[sf['Ref Count'] >= min_ref_num]  # left with 22,083,058
        sf.__materialize__()
    p_sf = tc.load_sframe(PAPERS_SFRAME)
    sf = p_sf.join(sf)
    if start_year is not None:
        logger.info("Getting papers from %s onward" % start_year)
        sf = sf[sf['Paper publish year'] >= start_year]
    if end_year is not None:
        logger.info("Getting papers until %s" % end_year)
        sf = sf[sf['Paper publish year'] <= end_year]
    sf.__materialize__()

    if not os.path.isdir(tmp_papers_sf_path):
        sf.save(tmp_papers_sf_path)
    return sf
def get_papers_sframe(self, min_ref_num=None, start_year=None, end_year=None):
    """
    Return an SFrame with papers data according to the input filter variables
    :param min_ref_num: paper's minimal references number
    :param start_year: start year (only include papers published in or after start year)
    :param end_year: end year (only include papers published in or before end year)
    :return: SFrame with papers data
    :rtype: SFrame
    :note: after the SFrame is created it is saved to the TMP_DIR for future use
    """
    sf = self.references
    tmp_papers_sf_path = self._get_tmp_papers_sframe_path(
        min_ref_num, start_year, end_year)
    if tmp_papers_sf_path.is_dir():
        return load_sframe(str(tmp_papers_sf_path))

    if min_ref_num is not None:
        sf = sf.groupby(
            'PaperId',
            {'Ref Count': agg.COUNT()})  # There are 30,058,322 in the list
        sf = sf[sf['Ref Count'] >= min_ref_num]  # left with 22,083,058
        sf.__materialize__()
    p_sf = self.papers
    sf = p_sf.join(sf)
    if start_year is not None:
        sf = sf[sf['Year'] >= start_year]
    if end_year is not None:
        sf = sf[sf['Year'] <= end_year]
    sf.__materialize__()

    if not tmp_papers_sf_path.is_dir():
        sf.save(str(tmp_papers_sf_path))
    return sf
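# Hedged usage sketch for the method above; 'mag' stands in for an object that
# exposes 'references', 'papers', and the temp-path helper.
papers_2000s = mag.get_papers_sframe(min_ref_num=5,
                                     start_year=2000,
                                     end_year=2009)
# Only papers with at least 5 references, published between 2000 and 2009;
# the result is cached on disk, so repeated calls load the saved SFrame.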
def predict(self):
    # Avoid shadowing the built-ins 'all' and 'list'
    frames = []
    with open(self.list_loc, 'r') as fp:
        stocks = fp.read().splitlines()
    for stock in stocks:
        df = self.download_news(stock, 1)
        frames.append(df)
    data = pd.concat(frames, ignore_index=True)
    print(data)
    sf = tc.SFrame(data)
    model = tc.load_model(self.model_loc)
    # Save predictions to an SArray
    predictions = model.predict(sf)
    sf['prediction'] = predictions
    # sf.explore()
    trade_list = sf.groupby(key_column_names='stock',
                            operations={
                                'avg': agg.MEAN('prediction'),
                                'count': agg.COUNT()
                            })
    # trade_list['label'] = trade_list.apply(
    #     lambda x: 'rise' if (x['avg'] >= 0.8 and x['count'] >= 10) else 'drop')
    self.shortlist = trade_list.to_dataframe()
parser.add_argument('--functions',
                    help='name of the functions directory',
                    required=True)
parser.add_argument('--p', help='partition number', type=int, required=True)
parser.add_argument('--output', help='output path', required=True)
args = parser.parse_args()

path = setup_path(args)
setup_logging(path=path, parser=parser)

tc.config.set_runtime_config('TURI_FILEIO_MAXIMUM_CACHE_CAPACITY',
                             5 * 2147483648)
tc.config.set_runtime_config('TURI_FILEIO_MAXIMUM_CACHE_CAPACITY_PER_FILE',
                             5 * 134217728)
# The following can reduce the memory footprint
tc.config.set_runtime_config('TURI_DEFAULT_NUM_PYLAMBDA_WORKERS', 4)

# Note: the original had 'args.funtions', which would raise AttributeError
mw = load_functions_partition(directory=args.functions, name=args.p)
logging.info(f"Read {mw.num_rows()} rows")
ags = mw.groupby(key_column_names='apk', operations={'fcount': agg.COUNT()})
ags.save(f"{path}/apks.csv", format='csv')
fgs = mw.groupby(key_column_names='function',
                 operations={'acount': agg.COUNT()})
fgs.save(f"{path}/funcs.csv", format='csv')
import turicreate as gl
import turicreate.aggregate as agg

# Working on papers with at least 5 references
r_sf = gl.load_sframe('./PaperReferences.sframe')
r_sf = r_sf.groupby(
    'Paper ID', {'Ref Count': agg.COUNT()})  # There are 30,058,322 in the list
r_sf.save('/data/sframes/PapersRefCount.sframe')
r_sf = r_sf[r_sf['Ref Count'] >= 5]  # left with 22,083,058

p_sf = gl.load_sframe("./Papers.sframe/")  # 126,903,970 rows
p_sf = r_sf.join(p_sf)  # 22,082,741
p_sf.save('./PapersMin5Ref.sframe')

p_sf = gl.load_sframe('./PapersMin5Ref.sframe')
a_sf = gl.load_sframe('./PaperAuthorAffiliations.sframe/')  # 337,000,127
sf = p_sf[['Paper ID']].join(a_sf)  # 86,561,861 rows
sf = sf.join(p_sf, on="Paper ID")
# Assign the per-author aggregation so the result is not discarded
authors_sf = sf.groupby(
    "Author ID", {
        'Papers Count': agg.COUNT_DISTINCT('Paper ID'),
        'start_year': agg.MIN('Paper publish year'),
        'last_year': agg.MAX('Paper publish year'),
        'mean_ref_count': agg.AVG('Ref Count'),
        'papers_list': agg.CONCAT('Paper ID'),
        'journals_list': agg.CONCAT('Journal ID mapped to venue name'),
        'conference_list': agg.CONCAT('Conference ID mapped to venue name'),
        'affiliation_list': agg.CONCAT('Affiliation ID')
    })
def get_number_of_papers_by_year(self):
    sf = self._all_papers_sf.groupby("Paper publish year",
                                     {"Count": agg.COUNT()})
    return {r["Paper publish year"]: r["Count"] for r in sf}
random_tweets_sf['docs'] = docs
sample_random_tweets_sf = random_tweets_sf.sample(fraction=0.05)

# Predict topic probability:
topic_probability_array = topic_model.predict(
    sample_random_tweets_sf['docs'], output_type='probability')
topic_dict = defaultdict(list)
for probabilities in tqdm(topic_probability_array):
    for topic_num, topic_prob in enumerate(probabilities, start=1):
        topic_dict['Topic ' + str(topic_num)].append(topic_prob)

# Distribution of topics in the corpora:
import turicreate.aggregate as agg

topic_gb = random_tweets_sf.groupby(key_column_names='Topic',
                                    operations={'id': agg.COUNT()})
topic_gb = topic_gb.to_dataframe()
topic_gb = topic_gb.rename(columns={"id": "count"})
ax = sns.barplot(x="Topic", y='count', data=topic_gb)
ax.set_title('Distribution of Topics in the Corpora')
plt.show()

# Get daily topic distribution. All AVG operations go into a single groupby
# so the per-topic averages stay aligned on the same 'date' rows (separate
# groupbys do not guarantee a shared row order):
topic_prob_gb = topic_prob_sf.groupby(
    key_column_names='date',
    operations={col: agg.AVG(col) for col in cols})
topic_prob_gb = topic_prob_gb.to_dataframe()
topic_prob_gb['date_str'] = topic_prob_gb['date'].astype(str)
daily_topic_dist_df = topic_prob_gb.merge(corona_per_dates_gb,
                                          left_on='date_str',
                                          right_on='date',
                                          how='left')
print("## ii.) Track info based aggregations") track_stats_dir = "%s/train/%s/" % (experiment_dir, stats_experiment_id) for part in ["first", "second", "both"]: track_infos = tc.load_sframe("%s/%s_track_infos" % (track_stats_dir, part)) print(part, track_infos.shape) session_data = batch_join(session_data, track_infos, ["track_code"]) print("#### free memory") del track_infos print("## iii.) Repeat count of tracks") track_repeat = session_data.groupby(["session_code", "track_code"], operations={"repeat_cnt": agg.COUNT()}) track_repeat = track_repeat[track_repeat["repeat_cnt"] > 1] keys = list(zip(track_repeat["session_code"], track_repeat["track_code"])) track_repeat_dict = dict(zip(keys, track_repeat["repeat_cnt"])) del track_repeat session_data["repeat_cnt"] = session_data.apply( lambda x: track_repeat_dict.get((x["session_code"], x["track_code"]), 1)) del track_repeat_dict print("## iv.) Mahalanobis distance") def get_dists(sf, cols, variance_dict): for i, col in enumerate(cols):
def reference_count(self):
    return self.references.groupby("PaperId", {"Ref Number": agg.COUNT()})
def get_directors_data(self):
    rating = self.rating[self.rating["numVotes"] > 10000]
    sf = self.crew.join(rating)
    title = self.title[self.title["titleType"] == "movie"]
    sf = sf.join(title)
    sf = sf.groupby(key_column_names='directors',
                    operations={
                        'averageRating': agg.AVG("averageRating"),
                        'count': agg.COUNT()
                    })
    sf = sf[sf["count"] > 5]
    names = SFrame.read_csv(f"{OUTPUT_PATH}/name.basics.tsv.gz",
                            delimiter="\t")
    sf = sf.join(names, {"directors": "nconst"})
    return sf.sort("averageRating", ascending=False)
    print(part, track_infos.shape)
    session_data = batch_join(session_data,
                              track_infos[list(track_inf_cols[part])],
                              ["track_code"])
    print("#### free memory")
    del track_infos

print("## iii.) Repeat count of tracks")
calc_repeat_cnt = False
for m_col in model_features:
    if "repeat_cnt" in m_col:
        calc_repeat_cnt = True
        break
print("calculate repeat_cnt:", calc_repeat_cnt)
if calc_repeat_cnt:
    track_repeat = session_data.groupby(
        ["session_code", "track_code"],
        operations={"repeat_cnt": agg.COUNT()})
    track_repeat = track_repeat[track_repeat["repeat_cnt"] > 1]
    keys = list(zip(track_repeat["session_code"], track_repeat["track_code"]))
    track_repeat_dict = dict(zip(keys, track_repeat["repeat_cnt"]))
    del track_repeat
    session_data["repeat_cnt"] = session_data.apply(
        lambda x: track_repeat_dict.get(
            (x["session_code"], x["track_code"]), 1))
    del track_repeat_dict
    print("'repeat_cnt' generated!")
else:
    print("'repeat_cnt' skipped!")

print("## iv.) Mahalanobis distance")
calc_distances = False
for m_col in model_features:
    if "dist_from_sess_mean" in m_col: