Exemple #1
0
 def get_platform_rank(self, df: pd.DataFrame) -> pd.DataFrame:
     """Score and rank every (Platform, Publisher) pair within its platform.

     For each pair the non-null ``Name`` entries are counted and
     ``Global_Sales`` is averaged; the product of the two is stored as
     ``platform_score``.  The best score per platform is attached as
     ``max_score`` and ``rank`` orders the pairs inside each platform
     (1 = highest score).

     Args:
         df: Frame providing ``Platform``, ``Publisher``, ``Name`` and
             ``Global_Sales`` columns.

     Returns:
         One row per (Platform, Publisher) pair with the columns
         ``count``, ``mean_target``, ``platform_score``, ``max_score``
         and ``rank`` (assuming ``change_column_name`` renames the
         column it is given).
     """
     pair_keys = ['Platform', 'Publisher']
     by_pair = df.groupby(pair_keys)

     title_counts = change_column_name(
         by_pair['Name'].count().reset_index(), 'Name', 'count')
     mean_sales = change_column_name(
         by_pair['Global_Sales'].mean().reset_index(),
         'Global_Sales', 'mean_target')

     scored = pd.merge(title_counts, mean_sales, how='left', on=pair_keys)
     # count * mean: the score grows with both catalogue size and average sales.
     scored['platform_score'] = scored['count'] * scored['mean_target']

     best_per_platform = change_column_name(
         scored.groupby('Platform')['platform_score'].max().reset_index(),
         'platform_score', 'max_score')
     scored = pd.merge(scored, best_per_platform, how='left', on='Platform')

     scored['rank'] = scored.groupby('Platform')['platform_score'].rank(
         ascending=False)
     return scored
Exemple #2
0
    def create(self,
               base: pd.DataFrame,
               others: Optional[Dict[str, pd.DataFrame]] = None,
               *args,
               **kwargs) -> pd.DataFrame:
        """Encode, per row of ``main``, the other platforms sharing its ``Name``.

        Platforms are label-encoded over ``main`` and ``another`` combined.
        For every title the encoded platforms of all rows with the same
        ``Name`` are collected, the row's own platform code is removed, and
        the remaining list is passed to ``self._onehot_platforms`` (defined
        elsewhere) together with the number of distinct platform codes and
        the number of rows in ``main``.

        Args:
            base: Not used by this method.
            others: Mapping that must contain the ``'main'`` and
                ``'another'`` frames, each with ``Name`` and ``Platform``.

        Returns:
            Whatever ``self._onehot_platforms`` returns for ``main``.
        """
        df_main = others['main'].copy()
        df_another = others['another'].copy()
        # The concat order follows the train/test role of ``main``.
        ordered = [df_main, df_another] if self.train else [df_another, df_main]
        df_whole = pd.concat(ordered)

        encoder = LabelEncoder()
        df_whole['le_Platform'] = encoder.fit_transform(df_whole['Platform'])
        df_main['le_Platform'] = encoder.transform(df_main['Platform'])

        platforms_by_name = df_whole.groupby('Name')['le_Platform'].apply(list)
        platforms_by_name = change_column_name(
            platforms_by_name.reset_index(), 'le_Platform', 'other_platforms')
        df_main = pd.merge(df_main, platforms_by_name, how='left', on='Name')

        def _drop_own_platform(row):
            # Remove the row's own platform from the list of platforms.
            candidates = row['other_platforms']
            if np.isnan(candidates).any():
                # No platform list was merged in for this row.
                return []
            own = row['le_Platform']
            return [code for code in candidates if code != own]

        pair = df_main[['other_platforms', 'le_Platform']]
        df_main['other_platforms'] = pair.apply(_drop_own_platform, axis=1)

        num_platforms = df_whole['le_Platform'].nunique()
        return self._onehot_platforms(df_main['other_platforms'],
                                      num_platforms, len(df_main))
Exemple #3
0
    def _merge_agg_scores(self, df: pd.DataFrame, merge_to_df: pd.DataFrame) -> pd.DataFrame:
        """Attach per-platform score/count statistics to ``merge_to_df``.

        The statistics are computed from ``df`` in a single grouped
        aggregation and left-joined on ``Platform``, so every row of
        ``merge_to_df`` is kept; platforms absent from ``df`` get NaN.

        Args:
            df: Frame providing ``Platform``, ``User_Score``,
                ``Critic_Score``, ``User_Count`` and ``Critic_Count``.
            merge_to_df: Frame with a ``Platform`` column to enrich.

        Returns:
            A new frame equal to ``merge_to_df`` plus, in this order,
            ``mean_user_score_by_platform``, ``std_user_score_by_platform``,
            ``mean_critic_score_by_platform``, ``std_critic_score_by_platform``,
            ``mean_user_count_by_platform`` and
            ``mean_critic_count_by_platform``.
        """
        # Named aggregation yields a plain DataFrame with the final column
        # names, so no Series has to be pushed through a column-renaming
        # helper and the join key is a real column rather than an index level.
        platform_stats = df.groupby('Platform').agg(
            mean_user_score_by_platform=('User_Score', 'mean'),
            std_user_score_by_platform=('User_Score', 'std'),
            mean_critic_score_by_platform=('Critic_Score', 'mean'),
            std_critic_score_by_platform=('Critic_Score', 'std'),
            mean_user_count_by_platform=('User_Count', 'mean'),
            mean_critic_count_by_platform=('Critic_Count', 'mean'),
        ).reset_index()

        return pd.merge(merge_to_df, platform_stats, how='left', on='Platform')
Exemple #4
0
 def create(
     self,
     base: pd.DataFrame,
     others: Optional[Dict[str, pd.DataFrame]] = None,
     *args, **kwargs
 ) -> pd.DataFrame:
     """Expose the row labels of ``others['main']`` as a feature column.

     The column is built straight from the index instead of going through
     ``reset_index()`` plus a rename of ``'index'``.  That detour only
     works for an unnamed index on a frame without an ``index`` column:
     a named index produces a differently named column, and an existing
     ``index`` column would be the one that gets renamed.

     Args:
         base: Not used by this method.
         others: Mapping that must contain the ``'main'`` frame.

     Returns:
         A single-column frame ``original_index`` holding the index labels
         of ``main`` in row order, itself indexed 0..n-1.  ``main`` is not
         modified.
     """
     main_index = others['main'].index
     return main_index.to_frame(index=False, name='original_index')
Exemple #5
0
    def test_change_column_name(self):
        """Exercise ``util.change_column_name`` on the dummy CSV fixture.

        Covers a rename given as plain strings, as one-element lists and
        as multi-element lists, and expects ``ValueError`` when the old
        and new name lists differ in length.
        """
        frame = pd.read_csv('./tests/data/dummy.csv')

        # Plain string arguments.
        frame = util.change_column_name(frame, 'a', 'aaa')
        columns = list(frame.columns)
        assert 'aaa' in columns
        assert 'a' not in columns
        assert 'b' in columns  # columns that were not named stay in place

        # One-element lists.
        frame = util.change_column_name(frame, ['b'], ['bbb'])
        columns = list(frame.columns)
        assert 'bbb' in columns
        assert 'b' not in columns

        # Several columns in one call.
        frame = util.change_column_name(frame, ['x', 'y'], ['xxx', 'yyy'])
        columns = list(frame.columns)
        assert 'xxx' in columns
        assert 'yyy' in columns
        assert 'x' not in columns
        assert 'y' not in columns

        # Mismatched list lengths must be rejected.
        with pytest.raises(ValueError):
            util.change_column_name(frame, ['x', 'y'], ['xxx', 'yyy', 'zzz'])
Exemple #6
0
 def _get_mean_target_per_day(self, df: pd.DataFrame):
     """Average ``target`` per day of month and add a 5-day centred mean.

     Side effect: an ``imp_day`` column (day of month parsed from
     ``imp_at``) is written onto the caller's ``df``.

     The per-day table is stacked on top of itself so the centred rolling
     window can run across the end-of-month / start-of-month seam, and
     rows 14..43 of the stacked table are joined back on ``imp_day``.
     NOTE(review): the fixed ``14:44`` slice presumably assumes exactly
     30 distinct days -- confirm against the data.

     Args:
         df: Frame with ``imp_at`` (datetime-parseable) and ``target``.

     Returns:
         The per-day table left-joined with the sliced rolling table on
         ``imp_day``.  Both sides carry ``day_avg_target``, so pandas
         will presumably suffix that column (``_x`` / ``_y``) next to
         ``target_5days_mean``.
     """
     df['imp_day'] = pd.to_datetime(df['imp_at']).dt.day

     daily = change_column_name(
         df.groupby('imp_day')['target'].mean().reset_index(),
         'target', 'day_avg_target')

     # Two copies back to back give the rolling window wrap-around context.
     doubled = pd.concat([daily, daily], axis=0)
     rolling_mean = doubled.rolling(5, center=True)['day_avg_target'].mean()
     doubled['target_5days_mean'] = rolling_mean

     middle = doubled.iloc[14:44]
     return pd.merge(daily, middle, how='left', on='imp_day')
Exemple #7
0
    def create(self,
               base: pd.DataFrame,
               others: Optional[Dict[str, pd.DataFrame]] = None,
               *args,
               **kwargs) -> pd.DataFrame:
        """Count, for each row of ``main``, how many rows share its ``Name``.

        Rows of ``main`` and ``another`` are counted together.  The count
        is produced with ``groupby(...).size()`` and named explicitly,
        because the column name that ``value_counts().reset_index()``
        yields differs between pandas versions (``0`` before 2.0,
        ``count`` from 2.0 on).  The concatenation order does not affect
        the counts, so no train/test distinction is needed here.

        Args:
            base: Not used by this method.
            others: Mapping that must contain the ``'main'`` and
                ``'another'`` frames, each with a ``Name`` column.

        Returns:
            A single-column frame ``multi_pf_count`` aligned row-for-row
            with ``main`` (indexed 0..n-1).  Rows whose ``Name`` is
            missing get NaN.
        """
        df_main = others['main']
        df_whole = pd.concat([df_main, others['another']])

        # Missing names are dropped by groupby, matching value_counts().
        multi_platform = df_whole.groupby('Name').size().reset_index(
            name='multi_pf_count')

        df_main = pd.merge(df_main, multi_platform, how='left', on='Name')
        return df_main.loc[:, ['multi_pf_count']]
    def create(self,
               base: pd.DataFrame,
               others: Optional[Dict[str, pd.DataFrame]] = None,
               *args,
               **kwargs) -> pd.DataFrame:
        """Return the mean ``target`` of each row's publisher k-means cluster.

        The cluster id is read from ``base`` (column
        ``kmeans_cluster_by_Publisher_pivotby_all``), attached to a copy of
        ``main`` as ``kmeans_cluster``, aggregated with
        ``BasicGroupByTransform`` (mean of ``target`` per cluster) and
        joined back onto ``main``.

        No publisher pivot table is built here: the result depends only on
        the cluster ids already present in ``base``.

        Args:
            base: Frame holding the precomputed cluster-id column; it is
                aligned to ``main`` by index on assignment.
            others: Mapping that must contain the ``'main'`` frame, which
                needs a ``target`` column.

        Returns:
            A single-column frame containing the last column produced by
            the group-by transform, row-aligned with ``main``.
        """
        df_main = others['main'].copy()

        column_name = 'kmeans_cluster_by_Publisher_pivotby_all'
        df_main[column_name] = base.loc[:, column_name]
        df_main = change_column_name(df_main, column_name, 'kmeans_cluster')

        transform = BasicGroupByTransform(keys=['kmeans_cluster'],
                                          targets=['target'],
                                          aggs=['mean'])
        cluster_target = transform(df_main)
        df_main = pd.merge(df_main,
                           cluster_target,
                           how='left',
                           on='kmeans_cluster')
        # The aggregated value is expected to be the transform's last column.
        return df_main.loc[:, [cluster_target.columns[-1]]]
Exemple #9
0
    def create(self,
               base: pd.DataFrame,
               others: Optional[Dict[str, pd.DataFrame]] = None,
               *args,
               **kwargs) -> pd.DataFrame:
        """Build three n-gram "series size" features from cleaned titles.

        Frequent 1-, 2- and 3-grams are taken from the cleaned names of
        ``main`` and ``another`` combined (via ``self._clean`` and
        ``self._get_top_text_ngrams``, both defined elsewhere).  Each row
        of ``main`` receives the count of an n-gram its cleaned name starts
        with (1- and 2-grams) or contains (3-grams).  The counts are then
        divided by the number of distinct platforms the title appears on,
        and values that are still 0 are replaced by 1.

        Args:
            base: Not used by this method.
            others: Mapping that must contain the ``'main'`` and
                ``'another'`` frames, each with ``Name`` and ``Platform``.

        Returns:
            Frame with ``num_word_series_1``, ``num_word_series_2`` and
            ``num_word_series_3``.
        """
        df_main = others['main'].copy()
        df_another = others['another'].copy()
        ordered = [df_main, df_another] if self.train else [df_another, df_main]
        df_whole = pd.concat(ordered)

        df_whole.loc[:, 'clean_name'] = self._clean(df_whole['Name'])
        df_main.loc[:, 'clean_name'] = self._clean(df_main['Name'])

        # Each n-gram list is reversed before becoming a dict, so the dicts
        # are iterated in the opposite order to the one the helper returned.
        unigram_counts = dict(self._get_top_text_ngrams(
            df_whole['clean_name'], 100000, (1, 1), 50)[::-1])
        # For bigrams the last two entries of the reversed list are dropped.
        bigram_counts = dict(self._get_top_text_ngrams(
            df_whole['clean_name'], 100000, (2, 2), 20)[::-1][:-2])
        trigram_counts = dict(self._get_top_text_ngrams(
            df_whole['clean_name'], 100000, (3, 3), 10)[::-1])

        # (feature column, n-gram counts, string matcher, minimum n-gram length)
        feature_specs = (
            ('num_word_series_1', unigram_counts, 'startswith', 5),
            ('num_word_series_2', bigram_counts, 'startswith', 0),
            ('num_word_series_3', trigram_counts, 'contains', 0),
        )
        for column, ngram_counts, matcher, min_length in feature_specs:
            df_main[column] = 0
            for ngram, count in ngram_counts.items():
                if len(ngram) < min_length:
                    continue
                hits = getattr(df_main['clean_name'].str, matcher)(ngram)
                matched_labels = df_main[hits].index
                # Later n-grams overwrite earlier ones on the same row.
                df_main.loc[matched_labels, column] = count

        name_platform = df_whole.groupby('Name')['Platform'].nunique()
        name_platform = change_column_name(name_platform, 'Platform',
                                           'nunique_platform')
        df_main = pd.merge(df_main, name_platform, how='left', on='Name')

        feature_columns = [spec[0] for spec in feature_specs]
        for column in feature_columns:
            df_main[column] = df_main[column] / df_main['nunique_platform']
        for column in feature_columns:
            # Rows without any n-gram match fall back to 1.
            df_main.loc[df_main[column] == 0, column] = 1

        return df_main.loc[:, feature_columns]