Example #1
0
def impute_datawig_iterative(X, mask):
    """Corrupt X at the masked positions, impute with DataWig's iterative
    SimpleImputer.complete (no HPO, 5 iterations), and return the MSE
    computed by evaluate_mse on the masked cells."""
    corrupted = X.copy()
    corrupted[mask] = np.nan
    frame = pd.DataFrame(corrupted)
    # DataWig expects string column names
    frame.columns = [str(col) for col in frame.columns]
    frame = SimpleImputer.complete(frame, hpo=False, verbose=0, iterations=5)
    return evaluate_mse(frame.values, X, mask)
Example #2
0
 def get_imputer(cls):
     """Get the model object for this instance, loading it if it's not already loaded."""
     if cls.imputer is None:
         loaded = SimpleImputer.load(model_path)
         print(loaded.input_columns)
         loaded.load_hpo_model()
         # input columns may change after the HPO model is loaded; print both
         print(loaded.input_columns)
         # predict one row at a time
         loaded.imputer.batch_size = 1
         cls.imputer = loaded
     return cls.imputer
Example #3
0
def impute_datawig(X, mask):
    """Corrupt X at the masked positions, impute with DataWig's
    SimpleImputer.complete (HPO enabled, single iteration), delete the
    per-column model directories it writes, and return the masked-cell MSE."""
    corrupted = X.copy()
    corrupted[mask] = np.nan
    frame = pd.DataFrame(corrupted)
    # DataWig expects string column names
    frame.columns = [str(col) for col in frame.columns]
    model_dir = os.path.join(DIR_PATH,'datawig_imputers')
    frame = SimpleImputer.complete(frame, output_path=model_dir, hpo=True, verbose=0, iterations=1)
    # clean up every imputer directory written during completion
    for sub in glob.glob(os.path.join(model_dir,'*')):
        shutil.rmtree(sub)
    return evaluate_mse(frame.values, X, mask)
Example #4
0
    def process(self, store):
        """
        Runs check on provided columns.

        Trains a SimpleImputer to predict which source dataframe a row came
        from, then measures how much each column contributes via permuted
        accuracies.

        :param store: object exposing column_names(), df1 and df2 dataframes
        :return: tuple of (self.columns, result dict with y_true, y_pred,
                 base_accuracy and permuted_accuracies)
        :raises Exception: if any configured column is missing from the store
        """
        if any(column not in store.column_names() for column in self.columns):
            raise Exception(
                "Not all defined columns are present in both data frames. "
                "Defined: {}. Actual: {}".format(self.columns,
                                                 store.column_names()))

        # side effect: the trained imputer is kept on self for later use
        self.imputer = SimpleImputer(input_columns=self.columns,
                                     output_column=self.output_column,
                                     output_path=self.output_path)

        df1 = store.df1[self.columns]
        df2 = store.df2[self.columns]

        # label, sample to equal length, and split both frames
        df1_train, df1_test, df2_train, df2_test = self.prepare_dfs(df1, df2)

        train_df = pd.concat([df1_train, df2_train], ignore_index=True)
        test_df = pd.concat([df1_test, df2_test], ignore_index=True)

        self.imputer.fit(train_df, test_df, num_epochs=self.num_epochs)

        # DataWig appends '_imputed' to the output column for predictions
        imputed = self.imputer.predict(test_df)
        y_true, y_pred = imputed[self.output_column], imputed[
            self.output_column + '_imputed']

        base_accuracy, permuted_accuracies = self.calculate_permuted_accuracies(
            df1_test, df2_test, self.columns)

        result = {
            'y_true': y_true,
            'y_pred': y_pred,
            'base_accuracy': base_accuracy,
            'permuted_accuracies': permuted_accuracies
        }

        return self.columns, result
Example #5
0
def impute_datawig(X):
    """Impute every missing cell of X column-by-column: for each column,
    train a SimpleImputer on the rows where it is present, predict the rows
    where it is missing, then write all predictions back and return the
    completed array."""
    frame = pd.DataFrame(X)
    # DataWig expects string column names
    frame.columns = [str(col) for col in frame.columns]
    predictions = {}
    for target in frame.columns:
        feature_cols = sorted(set(frame.columns) - {target})
        missing = frame[target].isnull()
        model_dir = os.path.join(dir_path, target)
        model = SimpleImputer(input_columns=feature_cols,
                              output_column=target,
                              output_path=model_dir).fit(
                                  frame.loc[~missing, :],
                                  num_epochs=50,
                                  patience=5)
        predictions[target] = model.predict(frame.loc[missing, :])
        # the per-column model directory is only needed during prediction
        shutil.rmtree(model_dir)

    # second pass: fill the gaps from DataWig's '<col>_imputed' predictions
    for target in frame.columns:
        missing = frame[target].isnull()
        frame.loc[missing, target] = \
            predictions[target].loc[missing, target + "_imputed"]

    return frame.values
import pandas as pd
"""
Load Data
"""
# NOTE(review): this chunk also relies on SimpleImputer, random_split and
# f1_score being imported elsewhere in the file.
df = pd.read_csv('mae_train_dataset.csv').sample(n=1000)
df_train, df_test = random_split(df, split_ratios=[0.8, 0.2])

# ------------------------------------------------------------------------------------
"""
Run default SimpleImputer
"""
# Initialize a SimpleImputer model
imputer = SimpleImputer(
    input_columns=[
        'title', 'text'
    ],  # columns containing information about the column we want to impute
    output_column='finish',  # the column we'd like to impute values for
    output_path='imputer_model'  # stores model data and metrics
)

# Fit an imputer model on the train data
imputer.fit(train_df=df_train, num_epochs=5)

# Impute missing values and return original dataframe with predictions
predictions = imputer.predict(df_test)

# Calculate f1 score for true vs predicted values
# (true labels in 'finish', DataWig predictions in 'finish_imputed')
f1 = f1_score(predictions['finish'],
              predictions['finish_imputed'],
              average='weighted')
# permissions and limitations under the License.
# NOTE(review): the line above is the tail of a truncated license header.

import pandas as pd

from datawig import SimpleImputer
from datawig.utils import random_split
'''
Text Data
'''
df = pd.read_csv('../finish_val_data_sample.csv')
df_train, df_test = random_split(df, split_ratios=[0.8, 0.2])

#Fit a Model Without HPO
# Character tokenization with a large hash space for the text columns.
imputer_text = SimpleImputer(input_columns=['title', 'text'],
                             output_column='finish',
                             output_path='imputer_text_model',
                             num_hash_buckets=2**15,
                             tokens='chars')

imputer_text.fit(train_df=df_train,
                 learning_rate=1e-4,
                 num_epochs=50,
                 final_fc_hidden_units=[512])

#Fit a Model With HPO
# Defaults only — hyperparameters are chosen by HPO at fit time.
imputer_text = SimpleImputer(
    input_columns=['title', 'text'],
    output_column='finish',
    output_path='imputer_model',
)
def impute_values(df, i):
    """Compare IsolationForest anomaly detection on full, corrupted and
    DataWig-imputed usage data for one site.

    :param df: dataframe with at least 'site' and 'anomaly' columns
    :param i: site identifier used to filter df
    """
    df_model = df.loc[df.site == i]
    #Produce anomaly labels
    # Remap to sklearn's convention: -1 = anomaly, 1 = normal.
    df_model.loc[df_model['anomaly'] == 1, 'anomaly'] = -1
    df_model.loc[df_model['anomaly'] == 0, 'anomaly'] = 1
    # pivot() is defined elsewhere in this file; presumably it reshapes so
    # that 'usage' and 'anomaly' become column groups — TODO confirm.
    df_model = pivot(df_model)
    scaler = StandardScaler()
    # NOTE(review): the return value of fit_transform is discarded, so the
    # usage data is never actually scaled — confirm whether this is intended.
    scaler.fit_transform(df_model.usage)
    # Collapse the pivoted anomaly columns: any -1 marks the row anomalous.
    df_model.loc[df_model.anomaly.isin([-1]).any(axis=1), 'anomaly'] = -1
    labels = df_model.iloc[:, 0]
    print(labels)

    # shuffle=False keeps the temporal order of the usage series
    X_train, X_test, Y_train, Y_test = train_test_split(df_model.usage.values,
                                                        labels.values,
                                                        train_size=0.75,
                                                        shuffle=False)
    names = df_model.usage.columns.values
    df_train = pd.DataFrame(
        data=X_train,
        columns=names,
    )
    df_train["anomaly"] = Y_train
    # DataWig expects string column names
    df_train.columns = df_train.columns.astype(str)
    df_test = pd.DataFrame(
        data=X_test,
        columns=names,
    )
    df_test["anomaly"] = Y_test
    df_test.columns = df_test.columns.astype(str)

    # One imputer per usage column; each uses all columns except 'anomaly'.
    names = df_train.columns.values.tolist()
    names.remove("anomaly")
    imputer_1 = SimpleImputer(input_columns=names,
                              output_column=names[0],
                              output_path='data/imputer_model_1')
    imputer_2 = SimpleImputer(input_columns=names,
                              output_column=names[1],
                              output_path='data/imputer_model_2')
    print(imputer_1.__class__.__name__)
    EPOCHS = 20
    imputer_1.fit(
        train_df=df_train,
        learning_rate=1e-4,
        num_epochs=EPOCHS,
    )
    imputer_2.fit(
        train_df=df_train,
        learning_rate=1e-4,
        num_epochs=EPOCHS,
    )
    clf = IsolationForest()
    clf.fit(X_train, Y_train)

    def make_prediction(title, X_test):
        # Print mse / f1 / balanced accuracy of clf on X_test vs Y_test.
        print(title)
        Y_hat = clf.predict(X_test)
        pred_results = {
            'mse': mean_squared_error(Y_test, Y_hat),
            'f1': f1_score(Y_test, Y_hat),
            'balanced_accuracy': balanced_accuracy_score(Y_test, Y_hat),
        }
        print(pred_results)

    make_prediction("Full data", X_test)
    # NOTE(review): this is an alias, not a copy — the in-place write below
    # also corrupts X_test itself. Use X_test.copy() if that is not intended.
    X_test_missing = X_test
    p = 0.2
    # Knock out a random 20% of the entries, marking them with -1.
    indices = np.random.choice(np.arange(X_test_missing.size),
                               replace=False,
                               size=int(X_test_missing.size * p))
    X_test_missing[np.unravel_index(indices, X_test_missing.shape)] = -1
    make_prediction("Missing data", X_test_missing)
    predictions_1 = imputer_1.predict(df_test)
    predictions_2 = imputer_2.predict(df_test)
    # Rebuild a two-column matrix from the per-column imputer outputs.
    X_test_imputed = np.concatenate((predictions_1.iloc[:, 0].values.reshape(
        -1, 1), predictions_2.iloc[:, 1].values.reshape(-1, 1)),
                                    axis=1)
    make_prediction("Imputed data", X_test_imputed)
Example #9
0
class DistinctionPrecalculation(Precalculation):
    """Train a classifier (DataWig SimpleImputer) to distinguish rows of two
    dataframes, and measure each column's contribution by comparing the base
    accuracy against accuracies with that column permuted."""

    def __init__(self, columns, num_epochs=10):
        """
        :param columns: iterable of column names to run the check on
        :param num_epochs: positive int, training epochs for the imputer
        :raises TypeError: if columns is not a non-empty iterable of str,
            or num_epochs is not an int
        :raises ValueError: if num_epochs < 1
        """
        if not isinstance(columns, Iterable) \
                or any(not isinstance(column, str) for column in columns) \
                or len(columns) < 1:
            raise TypeError(
                "columns should be a list of strings. Received: {}".format(
                    columns))
        self.columns = list(columns)

        # NOTE(review): isinstance(num_epochs, int) also accepts bool, and
        # the message says "Number" although only int passes — confirm intent.
        if not isinstance(num_epochs, int):
            raise TypeError("num_epochs should be a Number. "
                            "Received: {} ({})".format(
                                num_epochs, num_epochs.__class__.__name__))
        if num_epochs < 1:
            raise ValueError("num_epochs should be greater than 0. "
                             "Received: {}.".format(num_epochs))
        self.num_epochs = num_epochs

        # synthetic label column marking which dataframe a row came from
        self.output_column = '__shift_detector__dataset'
        self.output_path = 'tmp/basicChecks_params'

        # trained lazily in process()
        self.imputer = None

    def __eq__(self, other):
        # Equal iff same class, same column set (order-insensitive) and
        # same number of epochs.
        if not isinstance(other, self.__class__):
            return False
        return set(self.columns) == set(
            other.columns) and self.num_epochs == other.num_epochs

    def __hash__(self):
        # Sort columns so hash is order-insensitive, matching __eq__.
        hash_items = sorted(self.columns) + [self.__class__, self.num_epochs]
        return hash(tuple(hash_items))

    def process(self, store):
        """
        Runs check on provided columns
        :param store: object exposing column_names(), df1 and df2 dataframes
        :return: tuple of (self.columns, result dict with y_true, y_pred,
                 base_accuracy and permuted_accuracies)
        :raises Exception: if any configured column is missing from the store
        """
        if any(column not in store.column_names() for column in self.columns):
            raise Exception(
                "Not all defined columns are present in both data frames. "
                "Defined: {}. Actual: {}".format(self.columns,
                                                 store.column_names()))

        # side effect: the trained imputer is kept on self for get_accuracy()
        self.imputer = SimpleImputer(input_columns=self.columns,
                                     output_column=self.output_column,
                                     output_path=self.output_path)

        df1 = store.df1[self.columns]
        df2 = store.df2[self.columns]

        df1_train, df1_test, df2_train, df2_test = self.prepare_dfs(df1, df2)

        train_df = pd.concat([df1_train, df2_train], ignore_index=True)
        test_df = pd.concat([df1_test, df2_test], ignore_index=True)

        self.imputer.fit(train_df, test_df, num_epochs=self.num_epochs)

        # DataWig appends '_imputed' to the output column for predictions
        imputed = self.imputer.predict(test_df)
        y_true, y_pred = imputed[self.output_column], imputed[
            self.output_column + '_imputed']

        base_accuracy, permuted_accuracies = self.calculate_permuted_accuracies(
            df1_test, df2_test, self.columns)

        result = {
            'y_true': y_true,
            'y_pred': y_pred,
            'base_accuracy': base_accuracy,
            'permuted_accuracies': permuted_accuracies
        }

        return self.columns, result

    def label_dfs(self, df1: pd.DataFrame,
                  df2: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Set labels of the first dataframe to 'A' and those of the second dataframe to 'B'
        :param df1: first DataFrame
        :param df2: second DataFrame
        :return: tuple of labeled DataFrames
        """
        # Change the logging mode of pandas in order to not show the warning that shouldn't actually be shown.
        mode = pd.options.mode.chained_assignment
        pd.options.mode.chained_assignment = None

        df1.loc[:, self.output_column] = 'A'
        df2.loc[:, self.output_column] = 'B'

        # restore the caller's chained-assignment setting
        pd.options.mode.chained_assignment = mode

        return df1, df2

    @staticmethod
    def sample_dfs(df1: pd.DataFrame,
                   df2: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Sample DataFrames to length of shorter DataFrame
        :param df1: first DataFrame
        :param df2: second DataFrame
        :return: tuple of sampled DataFrame
        """
        min_len = min(len(df1), len(df2))
        return df1.sample(n=min_len), df2.sample(n=min_len)

    def prepare_dfs(
        self, df1: pd.DataFrame, df2: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Create a train and a test dataset, in which the number number of tuples
        that come from the first and the number of those from the second dataset are equal
        :param df1: first dataset
        :param df2: second dataset
        :return: tuple of train and test dataset
        """
        # label, then downsample the larger frame so classes are balanced
        df1, df2 = self.label_dfs(df1, df2)
        df1_sampled, df2_sampled = self.sample_dfs(df1, df2)

        df1_train, df1_test = random_split(df1_sampled)
        df2_train, df2_test = random_split(df2_sampled)

        return df1_train, df1_test, df2_train, df2_test

    def get_accuracy(self, df):
        """
        Predict the label for df and calculate the accuracy.
        :param df: DataFrame
        :return: accuracy
        """
        # requires self.imputer to have been trained by process() first
        imputed = self.imputer.predict(df)
        y_true, y_pred = imputed[self.output_column], imputed[
            self.output_column + '_imputed']

        return accuracy_score(y_true, y_pred)

    def permuted_accuracy(self, df1, df2, column):
        """
        Shuffle the column of both dfs and then switch it.
        Calculate the accuracy for the new DataFrame.
        Do this multiple times to receive a meaningful average accuracy.
        :param df1: first DataFrame
        :param df2: second DataFrame
        :param column: the column that will be permuted
        :return: averaged accuracy
        """
        accuracies = []
        df = pd.concat([df1, df2], ignore_index=True)

        for _ in range(5):
            df1_col_rand = shuffle(df1[column])
            df2_col_rand = shuffle(df2[column])

            # deliberately swapped: df2's values land on df1's rows and
            # vice versa, breaking the column's link to the dataset label
            col_rand = pd.concat([df2_col_rand, df1_col_rand],
                                 ignore_index=True)
            df[column] = col_rand

            accuracy = self.get_accuracy(df)
            accuracies.append(accuracy)

        return np.array(accuracies).mean()

    def calculate_permuted_accuracies(self, df1, df2, columns):
        """
        Calculate the base accuracy and the permuted accuracy for all columns.
        :param df1: first DataFrame
        :param df2: second DataFrame
        :param columns: columns to calculate the permuted accuracy for
        :return: base accuracy and the permuted accuracies as a dictionary from column to accuracy
        """
        df = pd.concat([df1, df2], ignore_index=True)
        base_accuracy = self.get_accuracy(df)

        permuted_accuracies = {
            col: self.permuted_accuracy(df1, df2, col)
            for col in columns
        }

        return base_accuracy, permuted_accuracies
Example #10
0
import pandas as pd

from datawig import SimpleImputer
from datawig.utils import random_split

import numpy as np
"""
Text Data
"""
df = pd.read_csv('mae_train_dataset.csv').sample(n=1000)
df_train, df_test = random_split(df, split_ratios=[0.8, 0.2])

# Fit a Model Without HPO
# Character tokenization with a large hash space for the text columns.
imputer_text = SimpleImputer(input_columns=['title', 'text'],
                             output_column='finish',
                             output_path='imputer_text_model',
                             num_hash_buckets=2**15,
                             tokens='chars')

imputer_text.fit(train_df=df_train,
                 learning_rate=1e-4,
                 num_epochs=5,
                 final_fc_hidden_units=[512])

# Fit a Model With HPO
# Defaults only — hyperparameters are chosen by HPO at fit time.
imputer_text = SimpleImputer(
    input_columns=['title', 'text'],
    output_column='finish',
    output_path='imputer_model',
)
Example #11
0
 def get_imputer(cls):
     """Get the model object for this instance, loading it if it's not already loaded."""
     # Compare with `is`, not `==`: identity check for the None sentinel
     # (PEP 8), and safe even if a loaded model overrides __eq__.
     if cls.imputer is None:
         cls.imputer = SimpleImputer.load(model_path)
     return cls.imputer