def test_int_index(self):
        """Exercise reduction.reduce over both axes, with and without a dummy."""
        from pandas.core.series import Series

        data = np.random.randn(100, 4)

        # column-wise reduction (axis 0 is the default)
        assert_almost_equal(
            reduction.reduce(data, np.sum, labels=Index(np.arange(4))),
            data.sum(0))

        # row-wise reduction
        assert_almost_equal(
            reduction.reduce(data, np.sum, axis=1,
                             labels=Index(np.arange(100))),
            data.sum(1))

        # column-wise reduction with an explicit dummy Series
        col_dummy = Series(0., index=np.arange(100))
        assert_almost_equal(
            reduction.reduce(data, np.sum, dummy=col_dummy,
                             labels=Index(np.arange(4))),
            data.sum(0))

        # row-wise reduction with an explicit dummy Series
        row_dummy = Series(0., index=np.arange(4))
        expected = data.sum(1)
        assert_almost_equal(
            reduction.reduce(data, np.sum, axis=1, dummy=row_dummy,
                             labels=Index(np.arange(100))),
            expected)

        # repeating the call must give the same result (dummy is reusable)
        assert_almost_equal(
            reduction.reduce(data, np.sum, axis=1, dummy=row_dummy,
                             labels=Index(np.arange(100))),
            expected)
Esempio n. 2
0
    def apply_standard(self):
        """Apply ``self.f``, attempting the fast cython reduction first and
        falling back to the per-series generator path on any failure."""

        # The fast path exists only when reduction is allowed and every
        # column has a plain numpy dtype -- extension dtypes cannot be
        # reduced this way (see gh-12244).
        can_reduce = (self.result_type in ['reduce', None]
                      and not self.dtypes.apply(is_extension_type).any())

        if can_reduce:
            from pandas import Series

            # reduction.reduce needs a dummy Series to carry metadata;
            # build one from an empty array of the right dtype/index.
            values = self.values
            axis_index = self.obj._get_axis(self.axis)
            labels = self.agg_axis
            dummy = Series(np.empty(len(axis_index), dtype=values.dtype),
                           index=axis_index, dtype=values.dtype)

            try:
                result = reduction.reduce(values, self.f, axis=self.axis,
                                          dummy=dummy, labels=labels)
                return self.obj._constructor_sliced(result, index=labels)
            except Exception:
                # the cython path is best-effort; fall through to the
                # slower python implementation on any failure
                pass

        # slow path: apply the function one series at a time, then wrap
        self.apply_series_generator()
        return self.wrap_results()
Esempio n. 3
0
    def apply_standard(self):
        """Standard apply: try the cython reducer, else iterate series."""

        # Reduction is only attempted when requested (or defaulted) and
        # when no column holds an extension dtype (see gh-12244).
        if (self.result_type in ['reduce', None] and
                not self.dtypes.apply(is_extension_type).any()):
            from pandas import Series

            vals = self.values
            idx = self.obj._get_axis(self.axis)
            agg_labels = self.agg_axis

            # a dummy Series gives reduction.reduce the metadata it needs
            placeholder = np.empty(len(idx), dtype=vals.dtype)
            dummy = Series(placeholder, index=idx, dtype=vals.dtype)

            try:
                reduced = reduction.reduce(vals, self.f, axis=self.axis,
                                           dummy=dummy, labels=agg_labels)
                return self.obj._constructor_sliced(reduced,
                                                    index=agg_labels)
            except Exception:
                pass  # best effort -- fall back to the slow path below

        # slow path: compute via the series generator, then wrap results
        self.apply_series_generator()
        return self.wrap_results()
Esempio n. 4
0
    def apply_raw(self):
        """Apply ``self.f`` directly to the raw ndarray values."""
        from pandas import DataFrame, Series

        try:
            # fast cython reduction first
            result = reduction.reduce(self.values, self.f, axis=self.axis)
        except Exception:
            # fall back to numpy's generic per-slice application
            result = np.apply_along_axis(self.f, self.axis, self.values)

        # TODO: mixed type case
        if result.ndim != 2:
            # each slice was reduced to a scalar
            return Series(result, index=self.agg_axis)
        return DataFrame(result, index=self.index, columns=self.columns)
Esempio n. 5
0
    def apply_raw(self):
        """Apply the function to the underlying values as a raw array."""
        try:
            raw_result = reduction.reduce(self.values, self.f,
                                          axis=self.axis)
        except Exception:
            # cython reducer rejected the input; use numpy's generic path
            raw_result = np.apply_along_axis(self.f, self.axis,
                                             self.values)

        # TODO: mixed type case
        from pandas import DataFrame, Series
        if raw_result.ndim == 2:
            # per-cell result -> keep the frame shape
            return DataFrame(raw_result, index=self.index,
                             columns=self.columns)
        # per-slice scalar result -> build a Series over the agg axis
        return Series(raw_result, index=self.agg_axis)
Esempio n. 6
0
    def apply_raw(self):
        """ apply to the values as a numpy array """
        try:
            out = reduction.reduce(self.values, self.f, axis=self.axis)
        except Exception:
            # the reducer can fail for non-reducible functions;
            # apply along the axis in python/numpy instead
            out = np.apply_along_axis(self.f, self.axis, self.values)

        # TODO: mixed type case
        if out.ndim != 2:
            # reduced to one value per slice
            return self.obj._constructor_sliced(out, index=self.agg_axis)
        return self.obj._constructor(out, index=self.index,
                                     columns=self.columns)
Esempio n. 7
0
    def apply_raw(self):
        """ apply to the values as a numpy array """
        values, func = self.values, self.f
        try:
            result = reduction.reduce(values, func, axis=self.axis)
        except Exception:
            # cython path failed; apply slice-by-slice via numpy
            result = np.apply_along_axis(func, self.axis, values)

        # TODO: mixed type case
        if result.ndim == 2:
            return self.obj._constructor(
                result, index=self.index, columns=self.columns)
        return self.obj._constructor_sliced(result, index=self.agg_axis)
    def calculateStringDataFeaturesForGroup(self, df1, key_column,
                                            uniquable_columns, rename_label):
        """
        Calculate features for string/textual fields in the data frame
        for a particular group.

        Parameters
        ----------
        df1: pandas.DataFrame
            dataframe containing the data

        key_column: string
            The column having the User ID

        uniquable_columns: list[string]
            The list of columns for which the number of unique/distinct
            entries is to be calculated

        rename_label: string
            Prefix to be prepended to features
            (NOTE(review): currently unused in this method -- confirm intent)

        Returns
        -------
        pandas.DataFrame
            One 'Unique_<feat>' column per requested feature,
            outer-merged on ``key_column``.

        Raises
        ------
        ValueError
            If no feature column survives the exclusion list, so there
            is nothing to merge.
        """
        # columns that must never be treated as features
        exclude_list = [key_column, 'month', 'ChurnPeriod', 'Timestamp']

        # Generating Features corresponding to the uniquable list:
        # one distinct-count frame per usable column
        feat_dfs = []
        for feat in uniquable_columns:
            if feat not in exclude_list:
                print('Calculating number of unique entries for ', feat)
                df_temp = self.calculateDistinct_X(df1, key_column, feat)
                df_temp.columns = [key_column, 'Unique_' + feat]
                feat_dfs.append(df_temp)

        # reduce() over an empty list raises an opaque TypeError;
        # fail with a clear message instead
        if not feat_dfs:
            raise ValueError('calculateStringDataFeaturesForGroup: no '
                             'feature columns left after exclusions')

        # outer-merge all per-feature frames on the user-id column
        df_final = reduce(
            lambda left, right: pd.merge(
                left, right, on=key_column, how='outer'), feat_dfs)

        return df_final
Esempio n. 9
0
    def apply_standard(self):
        """Standard apply: attempt the cython reducer first (when
        reduction is enabled), otherwise use the series generator."""
        from pandas import Series

        # reduction defaults to on when the flag is unspecified
        should_reduce = self.reduce
        if should_reduce is None:
            should_reduce = True

        # the fast path only matters if the reduction in values is of a
        # different dtype; extension dtypes cannot be reduced (gh-12244)
        if should_reduce:
            values = self.values
            if not is_extension_type(values):
                # reduction.reduce needs a dummy Series for metadata;
                # create it from an empty array of the right dtype/index
                index = self.obj._get_axis(self.axis)
                dummy = Series(np.empty(len(index), dtype=values.dtype),
                               index=index, dtype=values.dtype)

                try:
                    labels = self.agg_axis
                    result = reduction.reduce(values, self.f,
                                              axis=self.axis,
                                              dummy=dummy,
                                              labels=labels)
                    return Series(result, index=labels)
                except Exception:
                    # best effort -- fall back to the slow path
                    pass

        # slow path: apply one series at a time, then wrap the results
        results, res_index, res_columns = self._apply_series_generator()
        return self.wrap_results(results, res_index, res_columns)
Esempio n. 10
0
    def apply_standard(self):
        """Apply ``self.f``, preferring the fast cython reduction path."""
        from pandas import Series

        # treat an unspecified reduce flag as True
        do_reduce = True if self.reduce is None else self.reduce

        # try to reduce first (by default); this only matters if the
        # reduction in values is of a different dtype.  Non-numpy
        # (extension) dtypes cannot be reduced -- see gh-12244.
        if do_reduce:
            vals = self.values
            if not is_extension_type(vals):
                # dummy Series from an empty array supplies metadata only
                axis_idx = self.obj._get_axis(self.axis)
                dummy = Series(np.empty(len(axis_idx), dtype=vals.dtype),
                               index=axis_idx, dtype=vals.dtype)

                try:
                    agg_labels = self.agg_axis
                    reduced = reduction.reduce(vals, self.f,
                                               axis=self.axis,
                                               dummy=dummy,
                                               labels=agg_labels)
                    return Series(reduced, index=agg_labels)
                except Exception:
                    pass  # any failure: use the generic path below

        # generic path: series generator + result wrapping
        results, res_index, res_columns = self._apply_series_generator()
        return self.wrap_results(results, res_index, res_columns)
    def calculateNumericalDataFeaturesForGroup(self, df1, key_column,
                                               summable_columns, max_date,
                                               rename_label):
        """
        Calculate features for numerical fields in the data frame for a
        particular group.

        Parameters
        ----------
        df1: pandas.DataFrame
            dataframe containing the data

        key_column: string
            The column having the User ID

        summable_columns: list[string]
            The list of columns for which totals and standard deviations
            are to be calculated

        max_date:
            Reference date passed through to ``calculateRecency``

        rename_label: string
            Prefix to be prepended to features
            (NOTE(review): only used in the log message here -- confirm)

        Returns
        -------
        pandas.DataFrame
            'Total_<feat>' and 'StDev_<feat>' columns plus the recency
            and time-delta frames, outer-merged on ``key_column``.
        """
        print(
            'calculateNumericalDataFeaturesForGroup: max_date is {}, rename_label is {}'
            .format(max_date, rename_label))
        exclude_list = [key_column, 'month', 'ChurnPeriod', 'Timestamp']
        feat_dfs = []

        # Generating Features corresponding to the summable list
        for feat in summable_columns:
            if feat not in exclude_list:
                df_temp = self.calculateSum_X(df1, key_column, feat)
                print('Calculating Sum for ', feat,
                      ' returned a data frame of shape', df_temp.shape)
                df_temp.columns = [key_column, 'Total_' + feat]
                feat_dfs.append(df_temp)

        # Calculating Standard Deviations for summable features
        for feat in summable_columns:
            if feat not in exclude_list:
                df_temp = self.calculateStDev_X(df1, key_column, feat)
                print('Calculating Stdev for ', feat,
                      ' returned a data frame of shape', df_temp.shape)
                df_temp.columns = [key_column, 'StDev_' + feat]
                feat_dfs.append(df_temp)

        # recency / time-delta features join the same merge list
        df_recency = self.calculateRecency(df1, key_column, max_date)
        df_timedelta = self.calculateTimeDelta(df1, key_column)
        feat_dfs.append(df_timedelta)
        feat_dfs.append(df_recency)

        # Single outer-merge over all per-feature frames.  (The original
        # code ran this reduce twice -- once before appending the recency
        # and time-delta frames -- and discarded the first result.)
        df_final = reduce(
            lambda left, right: pd.merge(
                left, right, on=key_column, how='outer'), feat_dfs)
        print(
            'CalculateNumericalDataFeaturesForGroup returned a dataframe of shape',
            df_final.shape)

        return df_final
    def calculateAverages(self, df1, df2, key_column, uniquable_columns,
                          summable_columns):
        """
        Calculate averages: the ratio of each numerical feature (from
        calculateNumericalDataFeatures) to each count feature (from
        calculateStringDataFeatures).

        Parameters
        ----------
        df1: pandas.DataFrame
            DataFrame containing the Numerical Features
        df2: pandas.DataFrame
            DataFrame containing the String Features
        key_column: string
            the column on which aggregation has to be made (UserId)
        uniquable_columns: list[string]
            names of the denominator columns in df2
        summable_columns: list[string]
            names of the numerator columns in df1

        Returns
        -------
        pandas.DataFrame
            ``key_column`` plus one '<num>_per_<denom>' ratio column per
            eligible feature pair.
        """
        exclude_list = [
            key_column, 'month', 'ChurnPeriod', 'Timestamp', 'Group'
        ]

        # join numerical and count features on the user id
        df_final = reduce(
            lambda left, right: pd.merge(
                left, right, on=[key_column], how='outer'), [df1, df2])

        print('Uniquable columns', uniquable_columns)
        print('Summable columns', summable_columns)
        print('df_final.shape', df_final.shape)

        # Calculating normalized stats: numerator / (denominator + 1);
        # the +1.0 avoids division by zero
        for num_feat in summable_columns:
            if num_feat in exclude_list:
                continue

            for denom_feat in uniquable_columns:
                if denom_feat in exclude_list:
                    continue

                if 'Period' in num_feat or 'Period' in denom_feat:
                    # period-scoped features are only comparable when both
                    # sides carry the same period prefix
                    if 'Period' in num_feat and 'Period' in denom_feat:
                        num_prefix = num_feat.split('_')[0].split('-')[1]
                        denom_prefix = \
                            denom_feat.split('_')[0].split('-')[1]

                        if num_prefix == denom_prefix:
                            print('Calculating {}'.format(
                                num_feat + '_ratio_' + denom_feat))
                            df_final[num_feat + '_per_' + denom_feat] = \
                                df_final[num_feat] / (
                                    df_final[denom_feat] + 1.0)

                elif ('StDev' in num_feat or 'StDev' in denom_feat or
                      'Recency' in num_feat or 'Recency' in denom_feat or
                      'AvgTimeDelta' in num_feat or
                      'AvgTimeDelta' in denom_feat):
                    # dispersion/recency features do not form meaningful
                    # ratios -- skip them
                    pass
                else:
                    print('Calculating {}'.format(
                        num_feat + '_ratio_' + denom_feat))
                    df_final[num_feat + '_per_' + denom_feat] = \
                        df_final[num_feat] / (df_final[denom_feat] + 1.0)

        df_final = df_final.reset_index()
        df_final.fillna(0, inplace=True)

        # Keep the id plus only the ratio columns created above.
        # Matching on '_per_' (not bare 'per') avoids accidentally
        # keeping source columns whose names merely contain 'per'
        # (e.g. 'experience') -- a bug in the original filter.
        to_keep_list = [key_column]
        for col in df_final.columns:
            if '_per_' in col:
                to_keep_list.append(col)

        return df_final[to_keep_list]