def test_pivot(self):
    """Pivot `x` over the `aa`/`bb` values of `b`, indexed by `a`, and check names and values."""
    frame = pd.read_csv('./tests/data/dummy.csv')
    frame = frame[frame['b'].isin(['aa', 'bb'])]

    indices = ['a']
    column = 'b'
    target = 'x'
    aggs = ['mean', 'sum']

    # (output column name, expected values for output rows 0 and 1),
    # listed in the order the columns must appear after the index columns.
    expectations = [
        ('mean_x_pivot_by_a_for_b_aa', [(0.1 + 0.2 + 0.4) / 3, 0.4]),
        ('mean_x_pivot_by_a_for_b_bb', [0.3, 0.6]),
        ('sum_x_pivot_by_a_for_b_aa', [0.1 + 0.2 + 0.4, 0.4]),
        ('sum_x_pivot_by_a_for_b_bb', [0.3, 0.6]),
    ]
    expected_names = [name for name, _ in expectations]

    transform = PivotTransform(indices, column, target, aggs)
    df_output = transform.pivot(frame, indices, column, target, aggs)

    assert df_output.columns.tolist() == indices + expected_names
    for name, values in expectations:
        print(name, values)
        for row in (0, 1):
            assert df_output.loc[row, name] == approx(values[row])
def create(self,
           base: pd.DataFrame,
           others: Optional[Dict[str, pd.DataFrame]] = None,
           *args,
           **kwargs) -> pd.DataFrame:
    """Attach PCA-reduced per-``Platform`` pivot features to the main frame.

    For every entry of ``COLUMNS`` a ``PivotTransform`` (``indices=['Platform']``,
    ``target='id'``, ``aggs=['count']``, ``fillna=0``) is applied to the
    concatenation of ``others['main']`` and ``others['another']``. The pivot is
    passed through ``self._pca_transform`` and the resulting columns are
    left-merged onto the main frame on ``Platform``.

    Args:
        base: Not read by this method.
        others: Mapping that must provide the ``'main'`` and ``'another'`` frames.

    Returns:
        The merged main frame restricted to the generated ``pca_*`` columns.
    """
    main_frame = others['main'].copy()
    other_frame = others['another'].copy()

    # The concatenation order depends on ``self.train``.
    ordered = [main_frame, other_frame] if self.train else [other_frame, main_frame]
    combined = pd.concat(ordered)

    generated = []
    for col in COLUMNS:
        pivot = PivotTransform(indices=['Platform'],
                               column=col,
                               target='id',
                               aggs=['count'],
                               fillna=0)(combined)

        components = pd.DataFrame(self._pca_transform(pivot, self.n_components))
        widened = pd.concat([pivot, components], axis=1)

        # Keep the first column plus the trailing ``n_components`` columns.
        # Positions run -1, -2, ..., so the trailing columns are taken from
        # last to first: assuming ``_pca_transform`` yields ``n_components``
        # columns, ``pca_0`` ends up naming the last of them.
        keep = [0] + [-(k + 1) for k in range(self.n_components)]
        reduced = widened.iloc[:, keep]

        names = [
            '_'.join(('pca', str(k), 'count_id_pivotby_Platform_for', col))
            for k in range(self.n_components)
        ]
        reduced.columns = ['Platform'] + names
        generated += names

        main_frame = pd.merge(main_frame, reduced, how='left', on='Platform')

    return main_frame.loc[:, generated]
def create(
        self,
        base: pd.DataFrame,
        others: Optional[Dict[str, pd.DataFrame]] = None,
        *args,
        **kwargs
) -> pd.DataFrame:
    """Attach PCA features computed over all per-``Publisher`` pivots at once.

    One ``PivotTransform`` (``indices=['Publisher']``, ``target='id'``,
    ``aggs=['count']``, ``fillna=0``) is run per entry of ``COLUMNS`` on the
    concatenation of ``others['main']`` and ``others['another']``. The pivots
    are left-merged together on ``Publisher``, missing values are filled with
    0, and the combined table is passed through ``self._pca_transform``.

    Args:
        base: Not read by this method.
        others: Mapping that must provide the ``'main'`` and ``'another'`` frames.

    Returns:
        The merged main frame restricted to the generated ``pca_*`` columns.
    """
    main_frame = others['main'].copy()
    other_frame = others['another'].copy()

    # The concatenation order depends on ``self.train``.
    ordered = [main_frame, other_frame] if self.train else [other_frame, main_frame]
    combined = pd.concat(ordered)

    # Fold every per-column pivot into one wide table keyed by Publisher.
    pivot_all = None
    for col in COLUMNS:
        pivot = PivotTransform(indices=['Publisher'],
                               column=col,
                               target='id',
                               aggs=['count'],
                               fillna=0)(combined)
        if pivot_all is None:
            pivot_all = pivot
        else:
            pivot_all = pd.merge(pivot_all, pivot, how='left', on='Publisher')
    pivot_all = pivot_all.fillna(0)

    components = pd.DataFrame(self._pca_transform(pivot_all, self.n_components))
    pivot_all = pd.concat([pivot_all, components], axis=1)

    # Keep the first column plus the trailing ``n_components`` columns.
    # Positions run -1, -2, ..., so the trailing columns are taken from last
    # to first: assuming ``_pca_transform`` yields ``n_components`` columns,
    # ``pca_0`` ends up naming the last of them.
    keep = [0] + [-(k + 1) for k in range(self.n_components)]
    pivot_all = pivot_all.iloc[:, keep]

    pca_columns = [
        '_'.join(('pca', str(k), 'count_id_pivotby_Publisher_for_all'))
        for k in range(self.n_components)
    ]
    pivot_all.columns = ['Publisher'] + pca_columns

    main_frame = pd.merge(main_frame, pivot_all, how='left', on='Publisher')
    return main_frame.loc[:, pca_columns]
def create(
        self,
        base: pd.DataFrame,
        others: Optional[Dict[str, pd.DataFrame]] = None,
        *args,
        **kwargs
) -> pd.DataFrame:
    """Assign each row of the main frame the KMeans cluster of its ``Publisher``.

    One ``PivotTransform`` (``indices=['Publisher']``, ``target='id'``,
    ``aggs=['count']``, ``fillna=0``) is run per entry of ``COLUMNS`` on the
    concatenation of ``others['main']`` and ``others['another']``. The pivots
    are left-merged on ``Publisher``, missing values are filled with 0, the
    table is reduced with ``self._pca_transform`` and clustered with
    ``KMeans(n_clusters=self.n_clusters, random_state=1019)``.

    Args:
        base: Not read by this method.
        others: Mapping that must provide the ``'main'`` and ``'another'`` frames.

    Returns:
        The merged main frame restricted to the single column
        ``'kmeans_cluster_by_Publisher_pivotby_all'``.
    """
    df_main = others['main'].copy()
    df_another = others['another'].copy()

    # The concatenation order depends on ``self.train``.
    if self.train:
        df_whole = pd.concat([df_main, df_another])
    else:
        df_whole = pd.concat([df_another, df_main])

    # Fold every per-column pivot into one wide table keyed by Publisher.
    df_pivot = None
    for c in COLUMNS:
        transform = PivotTransform(indices=['Publisher'],
                                   column=c,
                                   target='id',
                                   aggs=['count'],
                                   fillna=0)
        pub_to_c = transform(df_whole)
        if df_pivot is None:
            df_pivot = pub_to_c
        else:
            df_pivot = pd.merge(df_pivot, pub_to_c, how='left', on='Publisher')
    df_pivot = df_pivot.fillna(0)

    pca = self._pca_transform(df_pivot, self.n_components)
    kmeans = KMeans(n_clusters=self.n_clusters, random_state=1019)

    column_name = 'kmeans_cluster_by_Publisher_pivotby_all'
    df_pivot[column_name] = kmeans.fit_predict(pca)

    # Only the cluster label is returned, so merge just the key and the label
    # instead of dragging every pivot column into the main frame.
    cluster_by_publisher = df_pivot[['Publisher', column_name]]
    df_main = pd.merge(df_main, cluster_by_publisher, how='left', on='Publisher')
    return df_main.loc[:, [column_name]]
def test_prepare_columns(self):
    """Check the column names `_prepare_columns` builds for two aggs over the `aa`/`bb` values of `b`."""
    df = pd.read_csv('./tests/data/dummy.csv')
    df = df[(df['b'] == 'aa') | (df['b'] == 'bb')]

    indices = ['a']
    target = 'x'
    column = 'b'
    column_values = df[column].unique().tolist()
    aggs = ['mean', 'sum']

    expected_columns = [
        'mean_x_pivot_by_a_for_b_aa',
        'mean_x_pivot_by_a_for_b_bb',
        'sum_x_pivot_by_a_for_b_aa',
        'sum_x_pivot_by_a_for_b_bb',
    ]

    # Constructor arguments are passed as (indices, column, target, aggs);
    # this call previously had `target` and `column` swapped.
    transform = PivotTransform(indices, column, target, aggs)
    columns = transform._prepare_columns(indices, column, column_values, target, aggs)

    assert expected_columns == columns
def test_pivot_multi_indices(self):
    """Pivot `x` over the `aa`/`bb` values of `b` with the two index columns `a` and `c`.

    Expected cells that are NaN are checked with `np.isnan`; all others with `approx`.
    """
    frame = pd.read_csv('./tests/data/dummy.csv')
    frame = frame[frame['b'].isin(['aa', 'bb'])]

    indices = ['a', 'c']
    column = 'b'
    target = 'x'
    aggs = ['mean', 'sum']

    # (output column name, expected values for output rows 0..4),
    # listed in the order the columns must appear after the index columns.
    expectations = [
        ('mean_x_pivot_by_a_c_for_b_aa', [0.4, 0.2, 0.1, np.nan, 0.4]),
        ('mean_x_pivot_by_a_c_for_b_bb', [np.nan, 0.3, np.nan, 0.6, np.nan]),
        ('sum_x_pivot_by_a_c_for_b_aa', [0.4, 0.2, 0.1, np.nan, 0.4]),
        ('sum_x_pivot_by_a_c_for_b_bb', [np.nan, 0.3, np.nan, 0.6, np.nan]),
    ]
    expected_names = [name for name, _ in expectations]

    transform = PivotTransform(indices, column, target, aggs)
    df_output = transform.pivot(frame, indices, column, target, aggs)

    assert df_output.columns.tolist() == indices + expected_names
    for name, values in expectations:
        print(name, values)
        for row, wanted in enumerate(values):
            if np.isnan(wanted):
                assert np.isnan(df_output.loc[row, name])
            else:
                assert df_output.loc[row, name] == approx(wanted)
def create(self,
           base: pd.DataFrame,
           others: Optional[Dict[str, pd.DataFrame]] = None,
           *args,
           **kwargs) -> pd.DataFrame:
    """Compute the mean of ``target`` per KMeans cluster for the main frame.

    The cluster label is read from the column
    ``'kmeans_cluster_by_Publisher_pivotby_all'`` of ``base``, attached to a
    copy of ``others['main']`` and renamed to ``'kmeans_cluster'``. A
    ``BasicGroupByTransform`` (``keys=['kmeans_cluster']``,
    ``targets=['target']``, ``aggs=['mean']``) is applied and its result is
    left-merged back onto the main frame.

    Args:
        base: Frame that must contain the column
            ``'kmeans_cluster_by_Publisher_pivotby_all'``.
        others: Mapping that must provide the ``'main'`` frame.
            NOTE(review): the group-by is configured with ``targets=['target']``,
            so ``'main'`` presumably needs a ``'target'`` column - confirm.

    Returns:
        The merged main frame restricted to the last column of the group-by
        output.
    """
    # The per-column Publisher pivot table that used to be built here was
    # never read afterwards, so that dead computation has been dropped.
    df_main = others['main'].copy()

    column_name = 'kmeans_cluster_by_Publisher_pivotby_all'
    df_main[column_name] = base.copy().loc[:, column_name]
    df_main = change_column_name(df_main, column_name, 'kmeans_cluster')

    transform = BasicGroupByTransform(keys=['kmeans_cluster'],
                                      targets=['target'],
                                      aggs=['mean'])
    cluster_target = transform(df_main)
    df_main = pd.merge(df_main,
                       cluster_target,
                       how='left',
                       on='kmeans_cluster')
    return df_main.loc[:, [cluster_target.columns[-1]]]
def create(self,
           base: pd.DataFrame,
           others: Optional[Dict[str, pd.DataFrame]] = None,
           *args,
           **kwargs) -> pd.DataFrame:
    """Attach the raw per-``Publisher`` pivot columns to the main frame.

    For every entry of ``COLUMNS`` a ``PivotTransform`` (``indices=['Publisher']``,
    ``target='id'``, ``aggs=['count']``, ``fillna=0``) is applied to the
    concatenation of ``others['main']`` and ``others['another']``, and the
    result is left-merged onto the main frame on ``Publisher``.

    Args:
        base: Not read by this method.
        others: Mapping that must provide the ``'main'`` and ``'another'`` frames.

    Returns:
        The merged main frame restricted to the pivot columns, i.e. every
        pivot column except the first one of each pivot.
    """
    main_frame = others['main'].copy()
    other_frame = others['another'].copy()

    # The concatenation order depends on ``self.train``.
    ordered = [main_frame, other_frame] if self.train else [other_frame, main_frame]
    combined = pd.concat(ordered)

    feature_names = []
    for col in COLUMNS:
        pivot = PivotTransform(indices=['Publisher'],
                               column=col,
                               target='id',
                               aggs=['count'],
                               fillna=0)(combined)
        # Skip the first pivot column; the rest are the generated features.
        feature_names += list(pivot.columns[1:])
        main_frame = pd.merge(main_frame, pivot, how='left', on='Publisher')

    return main_frame.loc[:, feature_names]