Example #1
0
    def test_pivot(self):
        """Check ``PivotTransform.pivot`` with a single index column.

        Rows of the dummy CSV whose ``b`` is ``'aa'`` or ``'bb'`` are pivoted
        on ``b`` with ``a`` as the index, aggregating ``x`` by mean and sum.
        The output must carry the index column followed by the expected
        pivot columns, and rows 0 and 1 must hold the expected values.
        """
        frame = pd.read_csv('./tests/data/dummy.csv')
        frame = frame[frame['b'].isin(['aa', 'bb'])]
        indices = ['a']
        column, target = 'b', 'x'
        aggs = ['mean', 'sum']

        # Expected output column -> expected values at output rows 0 and 1.
        expected = {
            'mean_x_pivot_by_a_for_b_aa': [(0.1 + 0.2 + 0.4) / 3, 0.4],
            'mean_x_pivot_by_a_for_b_bb': [0.3, 0.6],
            'sum_x_pivot_by_a_for_b_aa': [0.1 + 0.2 + 0.4, 0.4],
            'sum_x_pivot_by_a_for_b_bb': [0.3, 0.6],
        }

        transform = PivotTransform(indices, column, target, aggs)
        result = transform.pivot(frame, indices, column, target, aggs)

        assert result.columns.tolist() == indices + list(expected)
        for name, wanted in expected.items():
            print(name, wanted)
            for row, value in enumerate(wanted):
                assert result.loc[row, name] == approx(value)
Example #2
0
    def create(self,
               base: pd.DataFrame,
               others: Optional[Dict[str, pd.DataFrame]] = None,
               *args,
               **kwargs) -> pd.DataFrame:
        """Build per-column PCA features of Platform pivot counts.

        For every column in ``COLUMNS`` the count of ``id`` is pivoted by
        ``Platform`` over the concatenation of ``others['main']`` and
        ``others['another']``; the pivot is passed to ``self._pca_transform``
        and the resulting columns are left-merged onto the main frame.

        Args:
            base: Not used by this method.
            others: Must contain the ``'main'`` and ``'another'`` frames
                (the ``None`` default is not handled).

        Returns:
            The rows of ``others['main']`` restricted to the generated
            ``pca_<n>_count_id_pivotby_Platform_for_<column>`` columns.
        """
        main_frame = others['main'].copy()
        other_frame = others['another'].copy()
        # The concatenation order depends on the train flag.
        if self.train:
            ordered = [main_frame, other_frame]
        else:
            ordered = [other_frame, main_frame]
        combined = pd.concat(ordered)

        # First column plus the trailing ``n_components`` columns, taken
        # last-to-first.
        # NOTE(review): because of the reversed order, ``pca_0`` receives the
        # last appended column -- confirm this labelling is intended.
        keep = [0] + [-(k + 1) for k in range(self.n_components)]

        feature_names = []
        for col in COLUMNS:
            pivoter = PivotTransform(indices=['Platform'],
                                     column=col,
                                     target='id',
                                     aggs=['count'],
                                     fillna=0)
            pivoted = pivoter(combined)
            components = pd.DataFrame(
                self._pca_transform(pivoted, self.n_components))
            widened = pd.concat([pivoted, components], axis=1)
            reduced = widened.iloc[:, keep]

            names = []
            for n in range(self.n_components):
                parts = ['pca', str(n), 'count_id_pivotby_Platform_for', col]
                names.append('_'.join(parts))
            feature_names.extend(names)

            reduced.columns = ['Platform'] + names
            main_frame = pd.merge(main_frame,
                                  reduced,
                                  how='left',
                                  on='Platform')
        return main_frame.loc[:, feature_names]
    def create(
        self,
        base: pd.DataFrame,
        others: Optional[Dict[str, pd.DataFrame]] = None,
        *args, **kwargs
    ) -> pd.DataFrame:
        """Build PCA features from all Publisher pivot counts combined.

        The count of ``id`` is pivoted by ``Publisher`` for every column in
        ``COLUMNS`` over the concatenation of ``others['main']`` and
        ``others['another']``. The pivots are left-merged into one table,
        missing values are filled with 0, and the table is passed to
        ``self._pca_transform``. The result is left-merged onto the main
        frame on ``Publisher``.

        Args:
            base: Not used by this method.
            others: Must contain the ``'main'`` and ``'another'`` frames
                (the ``None`` default is not handled).

        Returns:
            The rows of ``others['main']`` restricted to the
            ``pca_<n>_count_id_pivotby_Publisher_for_all`` columns.
        """
        main_frame = others['main'].copy()
        other_frame = others['another'].copy()
        # The concatenation order depends on the train flag.
        if self.train:
            ordered = [main_frame, other_frame]
        else:
            ordered = [other_frame, main_frame]
        combined = pd.concat(ordered)

        pivot_table = None
        for col in COLUMNS:
            pivoter = PivotTransform(
                indices=['Publisher'],
                column=col,
                target='id',
                aggs=['count'],
                fillna=0,
            )
            pivoted = pivoter(combined)
            if pivot_table is None:
                pivot_table = pivoted
                continue
            pivot_table = pd.merge(
                pivot_table, pivoted, how='left', on='Publisher')

        pivot_table = pivot_table.fillna(0)
        components = pd.DataFrame(
            self._pca_transform(pivot_table, self.n_components))
        widened = pd.concat([pivot_table, components], axis=1)

        # First column plus the trailing ``n_components`` columns, taken
        # last-to-first.
        # NOTE(review): because of the reversed order, ``pca_0`` receives the
        # last appended column -- confirm this labelling is intended.
        keep = [0] + [-(k + 1) for k in range(self.n_components)]
        reduced = widened.iloc[:, keep]

        pca_columns = []
        for n in range(self.n_components):
            parts = ['pca', str(n), 'count_id_pivotby_Publisher_for_all']
            pca_columns.append('_'.join(parts))
        reduced.columns = ['Publisher'] + pca_columns

        merged = pd.merge(main_frame, reduced, how='left', on='Publisher')
        return merged.loc[:, pca_columns]
    def create(
        self,
        base: pd.DataFrame,
        others: Optional[Dict[str, pd.DataFrame]] = None,
        *args, **kwargs
    ) -> pd.DataFrame:
        """Assign each row a KMeans cluster id derived from Publisher pivots.

        The count of ``id`` is pivoted by ``Publisher`` for every column in
        ``COLUMNS`` over the concatenation of ``others['main']`` and
        ``others['another']``. The pivots are left-merged into one table,
        missing values are filled with 0, the table is passed to
        ``self._pca_transform``, and the result is clustered with ``KMeans``
        (``self.n_clusters`` clusters, fixed ``random_state``). Cluster ids
        are left-merged onto the main frame on ``Publisher``.

        Args:
            base: Not used by this method.
            others: Must contain the ``'main'`` and ``'another'`` frames
                (the ``None`` default is not handled).

        Returns:
            A single-column frame ``kmeans_cluster_by_Publisher_pivotby_all``
            for the rows of ``others['main']``.
        """
        main_frame = others['main'].copy()
        other_frame = others['another'].copy()
        # The concatenation order depends on the train flag.
        if self.train:
            ordered = [main_frame, other_frame]
        else:
            ordered = [other_frame, main_frame]
        combined = pd.concat(ordered)

        pivot_table = None
        for col in COLUMNS:
            pivoter = PivotTransform(
                indices=['Publisher'],
                column=col,
                target='id',
                aggs=['count'],
                fillna=0,
            )
            pivoted = pivoter(combined)
            if pivot_table is None:
                pivot_table = pivoted
                continue
            pivot_table = pd.merge(
                pivot_table, pivoted, how='left', on='Publisher')

        pivot_table = pivot_table.fillna(0)
        reduced = self._pca_transform(pivot_table, self.n_components)
        model = KMeans(n_clusters=self.n_clusters, random_state=1019)

        column_name = 'kmeans_cluster_by_Publisher_pivotby_all'
        pivot_table[column_name] = model.fit_predict(reduced)

        merged = pd.merge(main_frame, pivot_table, how='left', on='Publisher')
        return merged.loc[:, [column_name]]
Example #5
0
    def test_prepare_columns(self):
        """Check the column names generated by ``_prepare_columns``.

        Uses the rows of the dummy CSV whose ``b`` is ``'aa'`` or ``'bb'``
        and expects one name per (aggregation, pivot value) pair, grouped by
        aggregation.
        """
        df = pd.read_csv('./tests/data/dummy.csv')
        df = df[(df['b'] == 'aa') | (df['b'] == 'bb')]
        indices = ['a']
        target = 'x'
        column = 'b'
        column_values = df[column].unique().tolist()
        aggs = ['mean', 'sum']

        expected_columns = [
            'mean_x_pivot_by_a_for_b_aa',
            'mean_x_pivot_by_a_for_b_bb',
            'sum_x_pivot_by_a_for_b_aa',
            'sum_x_pivot_by_a_for_b_bb',
        ]
        # Keyword arguments keep ``column`` and ``target`` from being swapped;
        # the previous positional call passed them in the wrong order.
        transform = PivotTransform(indices=indices,
                                   column=column,
                                   target=target,
                                   aggs=aggs)
        columns = transform._prepare_columns(indices, column, column_values,
                                             target, aggs)

        assert expected_columns == columns
Example #6
0
    def test_pivot_multi_indices(self):
        """Check ``PivotTransform.pivot`` with two index columns.

        Rows of the dummy CSV whose ``b`` is ``'aa'`` or ``'bb'`` are pivoted
        on ``b`` with ``a`` and ``c`` as the index, aggregating ``x`` by mean
        and sum. Index/value combinations that do not occur are expected to
        be NaN in the output.
        """
        frame = pd.read_csv('./tests/data/dummy.csv')
        frame = frame[frame['b'].isin(['aa', 'bb'])]
        indices = ['a', 'c']
        column, target = 'b', 'x'
        aggs = ['mean', 'sum']

        # The expected mean and sum values are identical in this fixture.
        nan = np.nan
        expected = {
            'mean_x_pivot_by_a_c_for_b_aa': [0.4, 0.2, 0.1, nan, 0.4],
            'mean_x_pivot_by_a_c_for_b_bb': [nan, 0.3, nan, 0.6, nan],
            'sum_x_pivot_by_a_c_for_b_aa': [0.4, 0.2, 0.1, nan, 0.4],
            'sum_x_pivot_by_a_c_for_b_bb': [nan, 0.3, nan, 0.6, nan],
        }

        transform = PivotTransform(indices, column, target, aggs)
        result = transform.pivot(frame, indices, column, target, aggs)

        assert result.columns.tolist() == indices + list(expected)
        for name, wanted in expected.items():
            print(name, wanted)
            for row, value in enumerate(wanted):
                actual = result.loc[row, name]
                # NaN never compares equal, so it is checked separately.
                if np.isnan(value):
                    assert np.isnan(actual)
                else:
                    assert actual == approx(value)
    def create(self,
               base: pd.DataFrame,
               others: Optional[Dict[str, pd.DataFrame]] = None,
               *args,
               **kwargs) -> pd.DataFrame:
        """Attach the per-cluster mean of ``target`` to every main row.

        The cluster id is read from the
        ``kmeans_cluster_by_Publisher_pivotby_all`` column of ``base``,
        renamed to ``kmeans_cluster`` via ``change_column_name``, and used as
        the key of a ``BasicGroupByTransform`` that aggregates ``target`` by
        mean. The aggregate is left-merged back onto the main frame.

        The earlier version also built a Publisher pivot table from
        ``others['another']`` whose result was never read; that dead
        computation has been removed, so only ``others['main']`` is needed.

        Args:
            base: Frame holding the precomputed
                ``kmeans_cluster_by_Publisher_pivotby_all`` column; it is
                assigned onto the main frame by index alignment.
            others: Must contain the ``'main'`` frame (the ``None`` default
                is not handled).

        Returns:
            A single-column frame holding the last column produced by the
            group-by transform, for the rows of ``others['main']``.
        """
        df_main = others['main'].copy()

        column_name = 'kmeans_cluster_by_Publisher_pivotby_all'
        # Column assignment stores its own aligned values, so ``base`` does
        # not need to be copied first.
        df_main[column_name] = base.loc[:, column_name]
        df_main = change_column_name(df_main, column_name, 'kmeans_cluster')

        transform = BasicGroupByTransform(keys=['kmeans_cluster'],
                                          targets=['target'],
                                          aggs=['mean'])
        cluster_target = transform(df_main)
        df_main = pd.merge(df_main,
                           cluster_target,
                           how='left',
                           on='kmeans_cluster')
        return df_main.loc[:, [cluster_target.columns[-1]]]
Example #8
0
    def create(self,
               base: pd.DataFrame,
               others: Optional[Dict[str, pd.DataFrame]] = None,
               *args,
               **kwargs) -> pd.DataFrame:
        """Attach raw Publisher pivot counts to every main row.

        For every column in ``COLUMNS`` the count of ``id`` is pivoted by
        ``Publisher`` over the concatenation of ``others['main']`` and
        ``others['another']``, and the pivot is left-merged onto the main
        frame on ``Publisher``.

        Args:
            base: Not used by this method.
            others: Must contain the ``'main'`` and ``'another'`` frames
                (the ``None`` default is not handled).

        Returns:
            The rows of ``others['main']`` restricted to the pivot columns,
            i.e. every pivot output column except the first.
        """
        main_frame = others['main'].copy()
        other_frame = others['another'].copy()
        # The concatenation order depends on the train flag.
        if self.train:
            ordered = [main_frame, other_frame]
        else:
            ordered = [other_frame, main_frame]
        combined = pd.concat(ordered)

        feature_names = []
        for col in COLUMNS:
            pivoter = PivotTransform(indices=['Publisher'],
                                     column=col,
                                     target='id',
                                     aggs=['count'],
                                     fillna=0)
            pivoted = pivoter(combined)
            # Skip the first column of the pivot output; keep the rest.
            feature_names.extend(pivoted.columns[1:])
            main_frame = pd.merge(main_frame,
                                  pivoted,
                                  how='left',
                                  on='Publisher')

        return main_frame.loc[:, feature_names]