def get_platform_rank(self, df: pd.DataFrame) -> pd.DataFrame:
    """Score and rank every publisher within each platform.

    The score of a (Platform, Publisher) pair is the number of non-null
    ``Name`` entries multiplied by the mean ``Global_Sales`` of the pair.

    Args:
        df: Frame with ``Platform``, ``Publisher``, ``Name`` and
            ``Global_Sales`` columns. It is not modified.

    Returns:
        One row per (Platform, Publisher) pair with the columns
        ``Platform``, ``Publisher``, ``count``, ``mean_target``,
        ``platform_score``, ``max_score`` (highest score found on the same
        platform) and ``rank`` (descending rank of the score inside the
        platform, using pandas' default tie handling).
    """
    # One pass over the groups instead of two groupbys joined by a merge.
    df_platform = (
        df.groupby(['Platform', 'Publisher'])
        .agg(count=('Name', 'count'), mean_target=('Global_Sales', 'mean'))
        .reset_index()
    )
    df_platform['platform_score'] = (
        df_platform['count'] * df_platform['mean_target'])

    # Compute both per-platform statistics before adding any column so the
    # grouped selection is evaluated on the same frame.
    score_by_platform = df_platform.groupby('Platform')['platform_score']
    max_score = score_by_platform.transform('max')
    rank = score_by_platform.rank(ascending=False)

    df_platform['max_score'] = max_score
    df_platform['rank'] = rank
    return df_platform
def create(self,
           base: pd.DataFrame,
           others: Optional[Dict[str, pd.DataFrame]] = None,
           *args,
           **kwargs) -> pd.DataFrame:
    """Encode, per row of ``main``, the other platforms sharing its ``Name``.

    Platforms are label-encoded over the concatenation of ``others['main']``
    and ``others['another']``. For every row of ``main`` the encoded
    platforms of all rows with the same ``Name`` are collected, the row's
    own platform code is removed, and the remaining list is handed to
    ``self._onehot_platforms``.

    Args:
        base: Not used by this method.
        others: Must contain the frames ``'main'`` and ``'another'``, each
            with ``Name`` and ``Platform`` columns.

    Returns:
        Whatever ``self._onehot_platforms`` returns for the per-row lists,
        the number of distinct platform codes and the number of rows.
    """
    main_df = others['main'].copy()
    another_df = others['another'].copy()
    ordered = [main_df, another_df] if self.train else [another_df, main_df]
    whole_df = pd.concat(ordered)

    # Fit on everything so that codes exist for every platform in `main`.
    encoder = LabelEncoder()
    whole_df['le_Platform'] = encoder.fit_transform(
        whole_df.loc[:, 'Platform'])
    main_df['le_Platform'] = encoder.transform(main_df.loc[:, 'Platform'])

    platforms_by_name = (
        whole_df.groupby('Name')['le_Platform'].apply(list).reset_index())
    platforms_by_name = change_column_name(
        platforms_by_name, 'le_Platform', 'other_platforms')
    main_df = pd.merge(main_df, platforms_by_name, how='left', on='Name')

    def _drop_own_platform(row: pd.Series):
        # Remove the row's own platform from the list of platforms.
        candidates = row['other_platforms']
        # A row without a merged list carries NaN instead of a list.
        if np.isnan(candidates).any():
            return []
        own_code = row['le_Platform']
        return [code for code in candidates if code != own_code]

    pair = main_df[['other_platforms', 'le_Platform']]
    main_df['other_platforms'] = pair.apply(_drop_own_platform, axis=1)

    platform_total = len(whole_df['le_Platform'].unique())
    row_total = main_df.shape[0]
    return self._onehot_platforms(
        main_df.loc[:, 'other_platforms'], platform_total, row_total)
def _merge_agg_scores(self, df: pd.DataFrame,
                      merge_to_df: pd.DataFrame) -> pd.DataFrame:
    """Left-join per-platform score and count statistics onto a frame.

    Args:
        df: Source of the statistics; needs ``Platform``, ``User_Score``,
            ``Critic_Score``, ``User_Count`` and ``Critic_Count`` columns.
        merge_to_df: Frame that receives the statistics; joined on its
            ``Platform`` key. It is not modified.

    Returns:
        ``merge_to_df`` with six extra columns, in this order:
        ``mean_user_score_by_platform``, ``std_user_score_by_platform``,
        ``mean_critic_score_by_platform``, ``std_critic_score_by_platform``,
        ``mean_user_count_by_platform``, ``mean_critic_count_by_platform``.
        Platforms absent from ``df`` get NaN in all of them.
    """
    # Named aggregation produces the final column names directly, so no
    # Series/DataFrame has to be renamed afterwards and a single groupby
    # pass plus a single merge replace four of each.
    platform_stats = (
        df.groupby('Platform')
        .agg(
            mean_user_score_by_platform=('User_Score', 'mean'),
            std_user_score_by_platform=('User_Score', 'std'),
            mean_critic_score_by_platform=('Critic_Score', 'mean'),
            std_critic_score_by_platform=('Critic_Score', 'std'),
            mean_user_count_by_platform=('User_Count', 'mean'),
            mean_critic_count_by_platform=('Critic_Count', 'mean'),
        )
        .reset_index()
    )
    return pd.merge(merge_to_df, platform_stats, how='left', on='Platform')
def create(
    self,
    base: pd.DataFrame,
    others: Optional[Dict[str, pd.DataFrame]] = None,
    *args,
    **kwargs
) -> pd.DataFrame:
    """Expose the row labels of ``others['main']`` as a feature column.

    Args:
        base: Not used by this method.
        others: Must contain the frame ``'main'``; it is not modified.

    Returns:
        A frame with a fresh 0..n-1 index and the single column
        ``original_index`` holding the index labels of ``main`` in order.
    """
    # Build the column straight from the index. Going through
    # reset_index() only yields a column called 'index' when the index is
    # unnamed and no 'index' column exists yet; otherwise the subsequent
    # rename would miss or hit the wrong column.
    main_index = others['main'].index
    return pd.DataFrame({'original_index': main_index})
def test_change_column_name(self):
    """Exercise ``util.change_column_name`` on the dummy CSV fixture.

    Covers a rename given as plain strings, renames given as lists, and the
    ``ValueError`` expected when old and new name lists differ in length.
    """
    frame = pd.read_csv('./tests/data/dummy.csv')

    # Old/new names passed as plain strings.
    frame = util.change_column_name(frame, 'a', 'aaa')
    names = list(frame.columns)
    assert 'aaa' in names
    assert 'a' not in names
    assert 'b' in names  # columns that were not renamed must remain

    # Old/new names passed as one-element lists.
    frame = util.change_column_name(frame, ['b'], ['bbb'])
    names = list(frame.columns)
    assert 'bbb' in names
    assert 'b' not in names

    # Several columns renamed in one call.
    frame = util.change_column_name(frame, ['x', 'y'], ['xxx', 'yyy'])
    names = list(frame.columns)
    assert 'xxx' in names
    assert 'yyy' in names
    assert 'x' not in names
    assert 'y' not in names

    # Mismatched list lengths are rejected.
    with pytest.raises(ValueError):
        util.change_column_name(frame, ['x', 'y'], ['xxx', 'yyy', 'zzz'])
def _get_mean_target_per_day(self, df: pd.DataFrame):
    """Compute the mean target per day of month and a circular 5-day mean.

    The per-day means are ordered by day of month and treated as a ring:
    the centred 5-day window of the first days wraps around to the last
    days and vice versa.

    NOTE(review): ordering by day of month assumes the days present form
    one contiguous cycle (e.g. a single calendar month) -- confirm against
    the data.

    Args:
        df: Frame with ``imp_at`` (parseable by ``pd.to_datetime``) and
            ``target`` columns. Side effect: an ``imp_day`` column holding
            the day of month is added to (or overwritten on) ``df``.

    Returns:
        One row per distinct ``imp_day`` with the columns ``imp_day``,
        ``day_avg_target_x``, ``day_avg_target_y`` (both hold the per-day
        mean; the duplicate comes from merging two frames that share the
        column) and ``target_5days_mean``. At least four distinct days are
        needed for every ``target_5days_mean`` to be populated.
    """
    window = 5
    half_window = window // 2

    df['imp_day'] = pd.to_datetime(df.loc[:, 'imp_at']).dt.day
    per_day = (
        df.groupby('imp_day')['target']
        .mean()
        .reset_index(name='day_avg_target')
    )

    # Stack the series on itself so a centred window can wrap around the
    # end of the cycle.
    wrapped = pd.concat([per_day, per_day], axis=0, ignore_index=True)
    wrapped['target_5days_mean'] = (
        wrapped['day_avg_target'].rolling(window, center=True).mean())

    # Take one full cycle from the middle of the stacked frame, far enough
    # from both ends that every window is complete. For 30 days this is
    # rows 14..43, the slice that used to be hard-coded.
    n_days = len(per_day)
    start = max(n_days // 2 - 1, half_window)
    one_cycle = wrapped.iloc[start:start + n_days]

    return pd.merge(per_day, one_cycle, how='left', on='imp_day')
def create(self,
           base: pd.DataFrame,
           others: Optional[Dict[str, pd.DataFrame]] = None,
           *args,
           **kwargs) -> pd.DataFrame:
    """Count, for each row of ``main``, the rows sharing its ``Name``.

    The count is taken over ``others['main']`` and ``others['another']``
    together, so a title listed three times across both frames gets 3 on
    each of its rows in ``main``.

    Args:
        base: Not used by this method.
        others: Must contain the frames ``'main'`` and ``'another'``, each
            with a ``Name`` column. Neither frame is modified.

    Returns:
        A frame aligned row-by-row with ``main`` (fresh 0..n-1 index) whose
        only column is ``multi_pf_count``.
    """
    # The concatenation order does not influence a per-name count.
    df_whole = pd.concat([others['main'], others['another']])

    # Name the count column when it is created. The label that
    # value_counts().reset_index() gives this column differs between
    # pandas versions (0 before 2.0, 'count' from 2.0 on), so renaming it
    # afterwards by label is fragile.
    name_counts = (
        df_whole.groupby('Name').size().reset_index(name='multi_pf_count'))

    df_main = pd.merge(others['main'], name_counts, how='left', on='Name')
    return df_main.loc[:, ['multi_pf_count']]
def create(self,
           base: pd.DataFrame,
           others: Optional[Dict[str, pd.DataFrame]] = None,
           *args,
           **kwargs) -> pd.DataFrame:
    """Return the mean ``target`` of each row's publisher k-means cluster.

    The cluster id is read from the ``base`` column
    ``kmeans_cluster_by_Publisher_pivotby_all``, attached to a copy of
    ``others['main']`` as ``kmeans_cluster``, and the per-cluster mean of
    ``target`` is joined back onto every row.

    Args:
        base: Frame holding the precomputed cluster column; assigned onto
            ``main`` by index alignment.
        others: Must contain the frames ``'main'`` (with a ``target``
            column) and ``'another'``.

    Returns:
        A single-column frame: the last column produced by the
        ``BasicGroupByTransform`` aggregation, one row per row of ``main``.
    """
    main_df = others['main'].copy()
    another_df = others['another'].copy()
    ordered = [main_df, another_df] if self.train else [another_df, main_df]
    whole_df = pd.concat(ordered)

    # NOTE(review): the merged pivot below is never read afterwards. It
    # looks like a leftover from the step that produced the cluster column
    # in `base` -- confirm the transforms have no side effects before
    # deleting this loop.
    merged_pivot = None
    for pivot_column in COLUMNS:
        pivot = PivotTransform(indices=['Publisher'],
                               column=pivot_column,
                               target='id',
                               aggs=['count'],
                               fillna=0)
        publisher_pivot = pivot(whole_df)
        if merged_pivot is None:
            merged_pivot = publisher_pivot
            continue
        merged_pivot = pd.merge(merged_pivot, publisher_pivot,
                                how='left', on='Publisher')

    source_column = 'kmeans_cluster_by_Publisher_pivotby_all'
    main_df[source_column] = base.copy().loc[:, source_column]
    main_df = change_column_name(main_df, source_column, 'kmeans_cluster')

    cluster_mean = BasicGroupByTransform(keys=['kmeans_cluster'],
                                         targets=['target'],
                                         aggs=['mean'])
    cluster_target = cluster_mean(main_df)
    main_df = pd.merge(main_df, cluster_target,
                       how='left', on='kmeans_cluster')

    feature_column = cluster_target.columns[-1]
    return main_df.loc[:, [feature_column]]
def create(self,
           base: pd.DataFrame,
           others: Optional[Dict[str, pd.DataFrame]] = None,
           *args,
           **kwargs) -> pd.DataFrame:
    """Build three "series size" features from n-grams in cleaned titles.

    Frequent 1-, 2- and 3-grams are taken from the cleaned ``Name`` values
    of ``main`` and ``another`` combined. Each row of ``main`` receives the
    stored count of a matching n-gram (prefix match for 1- and 2-grams,
    substring/regex match for 3-grams), divided by the number of distinct
    platforms its ``Name`` appears on. Rows without any match end up as 1.

    Args:
        base: Not used by this method.
        others: Must contain the frames ``'main'`` and ``'another'``, each
            with ``Name`` and ``Platform`` columns.

    Returns:
        A frame aligned row-by-row with ``main`` (fresh 0..n-1 index)
        holding ``num_word_series_1``, ``num_word_series_2`` and
        ``num_word_series_3``.
    """
    df_main = others['main'].copy()
    df_another = others['another'].copy()
    if self.train:
        df_whole = pd.concat([df_main, df_another])
    else:
        df_whole = pd.concat([df_another, df_main])

    df_whole.loc[:, 'clean_name'] = self._clean(df_whole.loc[:, 'Name'])
    df_main.loc[:, 'clean_name'] = self._clean(df_main.loc[:, 'Name'])

    def top_ngrams(ngram_range, last_arg):
        # The meaning of 100000 and of the trailing number is defined by
        # _get_top_text_ngrams, which is outside this block.
        return self._get_top_text_ngrams(
            df_whole.loc[:, 'clean_name'], 100000, ngram_range, last_arg)

    # The result lists are reversed before becoming dicts, so entries that
    # came first in the returned list are applied last in the loops below
    # and win when several n-grams match one row.
    # NOTE(review): presumably the list is most-frequent-first -- confirm.
    unigram_counts = dict(top_ngrams((1, 1), 50)[::-1])
    # After the reversal, [:-2] discards the first two returned bigrams.
    bigram_counts = dict(top_ngrams((2, 2), 20)[::-1][:-2])
    trigram_counts = dict(top_ngrams((3, 3), 10)[::-1])

    def assign_counts(frame, column, counts, matches, min_length=0):
        """Write counts[ngram] into `column` for rows matching each ngram."""
        frame[column] = 0
        for ngram, count in counts.items():
            if len(ngram) < min_length:
                continue
            # Assign through the boolean mask itself: going through the
            # matching rows' index labels would also hit non-matching rows
            # that share a label when the index has duplicates.
            frame.loc[matches(ngram), column] = count

    clean_names = df_main.loc[:, 'clean_name'].str
    # Unigrams shorter than five characters are skipped.
    assign_counts(df_main, 'num_word_series_1', unigram_counts,
                  clean_names.startswith, min_length=5)
    assign_counts(df_main, 'num_word_series_2', bigram_counts,
                  clean_names.startswith)
    # str.contains interprets the trigram as a regular expression.
    assign_counts(df_main, 'num_word_series_3', trigram_counts,
                  clean_names.contains)

    # Name the count column at creation so no Series has to be renamed.
    platforms_per_name = (
        df_whole.groupby('Name')['Platform']
        .nunique()
        .reset_index(name='nunique_platform')
    )
    df_main = pd.merge(df_main, platforms_per_name, how='left', on='Name')

    feature_columns = [
        'num_word_series_1', 'num_word_series_2', 'num_word_series_3'
    ]
    for column in feature_columns:
        df_main[column] = (
            df_main.loc[:, column] / df_main.loc[:, 'nunique_platform'])
        # Rows that matched no n-gram are still 0 after the division.
        df_main.loc[df_main[column] == 0, column] = 1

    return df_main.loc[:, feature_columns]