def test_int_index(self):
    """Reduce a 2-D ndarray along both axes, with and without a dummy Series."""
    from pandas.core.series import Series

    data = np.random.randn(100, 4)

    # column-wise reduction, no dummy
    out = reduction.reduce(data, np.sum, labels=Index(np.arange(4)))
    want = data.sum(0)
    assert_almost_equal(out, want)

    # row-wise reduction, no dummy
    out = reduction.reduce(data, np.sum, axis=1, labels=Index(np.arange(100)))
    want = data.sum(1)
    assert_almost_equal(out, want)

    # column-wise reduction with an explicit dummy Series
    placeholder = Series(0., index=np.arange(100))
    out = reduction.reduce(data, np.sum, dummy=placeholder,
                           labels=Index(np.arange(4)))
    want = data.sum(0)
    assert_almost_equal(out, want)

    # row-wise reduction with an explicit dummy Series
    placeholder = Series(0., index=np.arange(4))
    out = reduction.reduce(data, np.sum, axis=1, dummy=placeholder,
                           labels=Index(np.arange(100)))
    want = data.sum(1)
    assert_almost_equal(out, want)

    # repeat with the same dummy — presumably guards against the reducer
    # mutating/consuming the dummy on the first call
    out = reduction.reduce(data, np.sum, axis=1, dummy=placeholder,
                           labels=Index(np.arange(100)))
    assert_almost_equal(out, want)
def apply_standard(self):
    """Apply ``self.f``, attempting the fast reduction path first and
    falling back to the python series generator on any failure.
    """
    # The fast path only matters when the reduction of values is of a
    # different dtype; it is skipped entirely for extension dtypes, which
    # the reducer cannot handle (see gh-12244).
    wants_reduce = self.result_type in ['reduce', None]
    if wants_reduce and not self.dtypes.apply(is_extension_type).any():
        from pandas import Series

        values = self.values
        index = self.obj._get_axis(self.axis)
        labels = self.agg_axis
        # a dummy Series built from an empty array gives the reducer a
        # correctly-typed template
        dummy = Series(np.empty(len(index), dtype=values.dtype),
                       index=index, dtype=values.dtype)
        try:
            result = reduction.reduce(values, self.f, axis=self.axis,
                                      dummy=dummy, labels=labels)
            return self.obj._constructor_sliced(result, index=labels)
        except Exception:
            # best-effort: fall through to the slow path
            pass

    # slow path: compute the result using the series generator
    self.apply_series_generator()

    # wrap results
    return self.wrap_results()
def apply_raw(self):
    """Apply ``self.f`` directly to the raw ndarray, preferring the fast
    reducer and falling back to ``np.apply_along_axis`` on any failure.
    """
    try:
        reduced = reduction.reduce(self.values, self.f, axis=self.axis)
    except Exception:
        reduced = np.apply_along_axis(self.f, self.axis, self.values)

    # TODO: mixed type case
    from pandas import DataFrame, Series

    # a 2-D result keeps the original frame shape; 1-D collapses to the
    # aggregation axis
    if reduced.ndim == 2:
        return DataFrame(reduced, index=self.index, columns=self.columns)
    return Series(reduced, index=self.agg_axis)
def apply_raw(self):
    """ apply to the values as a numpy array """
    # fast reducer first; any failure falls back to the generic numpy path
    try:
        reduced = reduction.reduce(self.values, self.f, axis=self.axis)
    except Exception:
        reduced = np.apply_along_axis(self.f, self.axis, self.values)

    # TODO: mixed type case
    # 2-D -> frame-shaped result, 1-D -> sliced (series-shaped) result
    if reduced.ndim == 2:
        return self.obj._constructor(reduced, index=self.index,
                                     columns=self.columns)
    return self.obj._constructor_sliced(reduced, index=self.agg_axis)
def calculateStringDataFeaturesForGroup(self, df1, key_column,
                                        uniquable_columns, rename_label):
    """
    Method to calculate features for String/Textual fields in the data
    frame for a particular group.

    Parameters
    ----------
    df1: pandas.DataFrame
        dataframe containing the data
    key_column: string
        The column having the User ID
    uniquable_columns: list[string]
        The list of columns for which number of unique/distinct entries
        are to be calculated
    rename_label: string
        Prefix to be prepended to features.
        NOTE(review): currently unused by this method; kept for interface
        compatibility with callers — confirm before removing.

    Returns
    -------
    A dataframe with one 'Unique_<col>' column per eligible input column,
    outer-joined on ``key_column``.
    """
    # columns that must never be treated as feature columns
    exclude_list = [key_column, 'month', 'ChurnPeriod', 'Timestamp']

    # one distinct-count frame per eligible column in the uniquable list
    feat_dfs = []
    for feat in uniquable_columns:
        if feat not in exclude_list:
            print('Calculating number of unique entries for ', feat)
            df_temp = self.calculateDistinct_X(df1, key_column, feat)
            df_temp.columns = [key_column, 'Unique_' + feat]
            feat_dfs.append(df_temp)

    # outer-join all per-column frames on the key column
    df_final = reduce(
        lambda left, right: pd.merge(
            left, right, on=key_column, how='outer'),
        feat_dfs)
    return df_final
def apply_standard(self):
    """Apply the function via the series generator, first attempting the
    faster ``reduction.reduce`` path when reduction is requested.
    """
    from pandas import Series

    # ``None`` means "try to reduce by default"
    should_reduce = True if self.reduce is None else self.reduce

    # Reducing first only matters if the reduction of values is of a
    # different dtype; extension dtypes cannot go through the reducer,
    # as demonstrated in gh-12244.
    if should_reduce:
        values = self.values
        if not is_extension_type(values):
            # a dummy Series from an empty array gives the reducer a
            # correctly-typed template
            index = self.obj._get_axis(self.axis)
            dummy = Series(np.empty(len(index), dtype=values.dtype),
                           index=index, dtype=values.dtype)
            try:
                labels = self.agg_axis
                result = reduction.reduce(values, self.f, axis=self.axis,
                                          dummy=dummy, labels=labels)
                return Series(result, index=labels)
            except Exception:
                # best-effort: fall through to the python path
                pass

    # compute the result using the series generator
    results, res_index, res_columns = self._apply_series_generator()

    # wrap results
    return self.wrap_results(results, res_index, res_columns)
def calculateNumericalDataFeaturesForGroup(self, df1, key_column,
                                           summable_columns, max_date,
                                           rename_label):
    """
    Method to calculate features for Numerical fields in the data frame
    for a particular group.

    Parameters
    ----------
    df1: pandas.DataFrame
        dataframe containing the data
    key_column: string
        The column having the User ID
    summable_columns: list[string]
        The list of columns for which sums and standard deviations are
        to be calculated
    max_date:
        Reference date passed through to ``calculateRecency``
    rename_label: string
        Prefix to be prepended to features.
        NOTE(review): only logged here, never applied — confirm intent.

    Returns
    -------
    A dataframe with 'Total_<col>' and 'StDev_<col>' columns per eligible
    input column, plus recency and time-delta features, outer-joined on
    ``key_column``.
    """
    print(
        'calculateNumericalDataFeaturesForGroup: max_date is {}, rename_label is {}'
        .format(max_date, rename_label))

    # columns that must never be treated as feature columns
    exclude_list = [key_column, 'month', 'ChurnPeriod', 'Timestamp']
    feat_dfs = []

    # Generating Features corresponding to the summable list
    for feat in summable_columns:
        if feat not in exclude_list:
            df_temp = self.calculateSum_X(df1, key_column, feat)
            print('Calculating Sum for ', feat,
                  ' returned a data frame of shape', df_temp.shape)
            df_temp.columns = [key_column, 'Total_' + feat]
            feat_dfs.append(df_temp)

    # Calculating Standard Deviations for summable features
    for feat in summable_columns:
        if feat not in exclude_list:
            df_temp = self.calculateStDev_X(df1, key_column, feat)
            print('Calculating Stdev for ', feat,
                  ' returned a data frame of shape', df_temp.shape)
            df_temp.columns = [key_column, 'StDev_' + feat]
            feat_dfs.append(df_temp)

    # recency and time-delta frames join the feature list before the
    # single final merge (the original code merged once, discarded the
    # result, then merged again — that redundant pass is removed here)
    df_recency = self.calculateRecency(df1, key_column, max_date)
    df_timedelta = self.calculateTimeDelta(df1, key_column)
    feat_dfs.append(df_timedelta)
    feat_dfs.append(df_recency)

    # outer-join every per-feature frame on the key column
    df_final = reduce(
        lambda left, right: pd.merge(
            left, right, on=key_column, how='outer'),
        feat_dfs)
    print(
        'CalculateNumericalDataFeaturesForGroup returned a dataframe of shape',
        df_final.shape)
    return df_final
def calculateAverages(self, df1, df2, key_column, uniquable_columns,
                      summable_columns):
    """
    Method to calculate averages which are basically the ratio of the
    features returned by the calculateStringDataFeatures to the features
    returned by the calculateNumericalDataFeatures.

    Parameters
    ----------
    df1: pandas.DataFrame
        DataFrame containing the Numerical Features
    df2: pandas.DataFrame
        DataFrame containing the String Features
    key_column: String
        the column on which aggregation has to be made. ( UserId )
    uniquable_columns: list[String]
        name of the columns in the dataframe2 (used as denominators)
    summable_columns: list[String]
        name of the columns in the dataframe1 (used as numerators)

    Returns
    -------
    A dataframe containing the key column plus the generated ratio
    ('<num>_per_<denom>') columns.
    """
    # NOTE(review): base_features_list / df_profile are computed but never
    # used below — looks like leftover code; confirm before removing.
    base_features_list = [key_column]
    df_profile = df1[base_features_list].drop_duplicates()

    # columns that are never used as numerator or denominator
    exclude_list = [
        key_column, 'month', 'ChurnPeriod', 'Timestamp', 'Group'
    ]

    # outer-join the numerical and string feature frames on the key column
    df_final = reduce(
        lambda left, right: pd.merge(
            left, right, on=[key_column], how='outer'), [df1, df2])
    print('Uniquable columns', uniquable_columns)
    print('Summable columns', summable_columns)
    print('df_final.shape', df_final.shape)

    # Calculating normalized stats: each summable feature is divided by
    # each uniquable feature; the denominator gets +1.0 so an absent
    # (zero-count) denominator never divides by zero.
    for num_feat in summable_columns:
        if num_feat not in exclude_list:
            # numerator feature
            for denom_feat in uniquable_columns:
                if denom_feat not in exclude_list:
                    if 'Period' in num_feat or 'Period' in denom_feat:
                        # period-scoped features may only be ratio-ed
                        # against other period features with a matching
                        # period prefix
                        if 'Period' in num_feat and 'Period' in denom_feat:
                            print('Calculating {}'.format(num_feat +
                                                          '_ratio_' +
                                                          denom_feat))
                            # prefix = token after '-' in the first
                            # '_'-separated chunk; assumes names look like
                            # '<x>-<period>_<rest>' — TODO confirm format
                            num_prefix = num_feat.split('_')[0].split(
                                '-')[1]
                            denom_prefix = \
                                denom_feat.split('_')[0].split('-')[1]
                            if num_prefix == denom_prefix:
                                print('Calculating {}'.format(num_feat +
                                                              '_ratio_' +
                                                              denom_feat))
                                df_final[num_feat + '_per_' + denom_feat] = \
                                    df_final[num_feat] / (
                                        df_final[denom_feat] + 1.0)
                    elif 'StDev' in num_feat or 'StDev' in denom_feat or 'Recency' in num_feat or 'Recency' in denom_feat or 'AvgTimeDelta' in num_feat or 'AvgTimeDelta' in denom_feat:
                        # dispersion / recency / time-delta features are
                        # deliberately excluded from ratio generation
                        pass
                    else:
                        print(
                            'Calculating {}'.format(num_feat + '_ratio_' +
                                                    denom_feat))
                        df_final[num_feat + '_per_' + denom_feat] = \
                            df_final[num_feat] / (df_final[denom_feat] + 1.0)

    df_final = df_final.reset_index()
    # missing ratios (unmatched outer-join rows) become 0
    df_final.fillna(0, inplace=True)
    renamed_columns_list = []  # NOTE(review): unused — confirm and remove
    # keep only the key column and the generated ratio columns.
    # NOTE(review): substring test 'per' (not '_per_') would also match any
    # pre-existing column containing 'per' — verify against callers.
    to_keep_list = [key_column]
    for each in df_final.columns:
        if 'per' in each:
            to_keep_list.append(each)
    return df_final[to_keep_list]