Beispiel #1
0
    def balance_data(self, data_model: DataModel,
                     target_column_name: str) -> DataModel:
        """Add a weights column to the data model's dataframe.

        The weights are obtained from ``get_weights_list`` using the
        label-to-weight mapping produced by ``map_weights_by_cat_label``
        and are stored under ``WEIGHTS_COLUMN``. A warning is emitted to
        remind the caller that the new column has to be handed on to the
        estimator and the data builder.

        Args:
            data_model: Data model whose dataframe receives the weights
                column.
            target_column_name: Name of the target column the weights
                are derived from.

        Returns:
            The same ``data_model`` instance, with its dataframe updated.
        """
        label_weights = self.map_weights_by_cat_label(
            data_model, target_column_name)

        frame = data_model.get_dataframe()
        row_weights = self.get_weights_list(
            frame, target_column_name, label_weights)

        # Warn before touching the dataframe so a warning that is turned
        # into an error leaves the data unmodified.
        warnings.warn(
            'note: adding weights column ({}), make sure it is passed to the estimator- and '
            'data builder!'.format(WEIGHTS_COLUMN))

        frame[WEIGHTS_COLUMN] = row_weights
        data_model.set_dataframe(frame)
        return data_model
Beispiel #2
0
    def balance_data(self, data_model: DataModel,
                     target_column_name: str) -> DataModel:
        """Balance the data by repeating the short stack to the long stack's length.

        ``prepare_data`` splits the data into a long and a short stack.
        The short stack is repeated as often as needed to reach at least
        the length of the long stack, trimmed to exactly that length via
        ``cut_df_to_length``, checked with ``validate_result`` and finally
        merged with the long stack into the new dataframe.

        Args:
            data_model: Data model holding the dataframe to balance.
            target_column_name: Name of the target column used to split
                the data into stacks.

        Returns:
            The same ``data_model`` instance, with the merged dataframe set.

        Raises:
            ValueError: If the short stack is empty while the long stack
                is not; repeating an empty stack can never reach the
                required length.
        """
        long_stack, short_stack = self.prepare_data(
            data_model=data_model, target_column_name=target_column_name)

        length_to_have = len(long_stack)
        short_length = len(short_stack)

        # Repeating an empty frame never grows it; fail loudly instead of
        # looping forever.
        if short_length == 0 and length_to_have > 0:
            raise ValueError(
                'cannot oversample an empty short stack to length {}'.format(
                    length_to_have))

        if short_length < length_to_have:
            # Ceil division: smallest number of copies whose combined
            # length reaches length_to_have.
            repetitions = -(-length_to_have // short_length)
            # One concat instead of one per repetition avoids re-copying
            # the growing frame on every pass.
            short_stack = pd.concat([short_stack] * repetitions)

        short_stack = self.cut_df_to_length(short_stack, length_to_have)

        self.validate_result(long_stack, short_stack)

        new_df = self.merge_stacks(long_stack, short_stack)
        data_model.set_dataframe(new_df)

        return data_model
Beispiel #3
0
 def randomize_data(data: DataModel, seed: int):
     """Shuffle the rows of the data model's dataframe in place.

     The dataframe is re-sampled with ``frac=1`` (every row, in random
     order) and written back to ``data``. Nothing is returned.

     Args:
         data: Data model whose dataframe is shuffled.
         seed: Value passed as ``random_state`` to make the shuffle
             reproducible.
     """
     shuffled = data.get_dataframe().sample(frac=1, random_state=seed)
     data.set_dataframe(shuffled)