コード例 #1
0
    def row_distance(self, n_samples: int = None) -> Tuple[float, float]:
        """
        Calculate mean and standard deviation of the nearest-row distances between `self.real` and `self.fake`.

        Both frames go through `numerical_encoding` and are aligned to the sorted columns of the encoded real frame. Every column with more than two
        distinct values in the real frame is standardised, each frame with its own mean and standard deviation. For each of the first `n_samples` real
        rows, the distance (as computed by `cdist` with its default metric) to the closest of the first `n_samples` fake rows is taken.

        :param n_samples: Number of rows of each frame to use; defaults to `len(self.real)`. The distance matrix has `n_samples ** 2` entries, so compute
            time and memory grow quadratically.
        :return: `(mean, std)` of these minimum distances.
        """
        if n_samples is None:
            n_samples = len(self.real)
        real = numerical_encoding(self.real,
                                  nominal_columns=self.categorical_columns)
        fake = numerical_encoding(self.fake,
                                  nominal_columns=self.categorical_columns)

        columns = sorted(real.columns.tolist())
        real = real[columns]

        # Columns missing from the fake encoding are added as all-zero columns;
        # columns that only exist in the fake encoding are dropped by the
        # selection below.
        fake_columns = set(fake.columns.tolist())
        for col in columns:
            if col not in fake_columns:
                fake[col] = 0
        fake = fake[columns]

        for column in columns:
            # Columns with at most two distinct values are left on their
            # original scale.
            if len(real[column].unique()) > 2:
                real[column] = (real[column] -
                                real[column].mean()) / real[column].std()
                fake_std = fake[column].std()
                # A constant (or single-row) fake column has a zero/NaN std.
                # Dividing by it would turn the column into NaN and poison
                # every distance, so only centre the column in that case.
                if not fake_std > 0:
                    fake_std = 1.0
                fake[column] = (fake[column] - fake[column].mean()) / fake_std
        assert real.columns.tolist() == fake.columns.tolist()

        distances = cdist(real.iloc[:n_samples], fake.iloc[:n_samples])
        # axis=1: for every real row keep the distance to its closest fake row.
        min_distances = np.min(distances, axis=1)
        min_mean = np.mean(min_distances)
        min_std = np.std(min_distances)
        return min_mean, min_std
コード例 #2
0
    def pca_correlation(self, lingress=False):
        """
        Compare the explained variances of two 5-component PCAs, one fitted on the encoded real data and one on the encoded fake data.

        The fitted models are stored on the instance as `self.pca_r` and `self.pca_f`. When `self.verbose` is truthy, both explained-variance vectors
        are printed side by side.

        :param lingress: if True, score the two explained-variance vectors with `self.comparison_metric` (e.g. Pearson's r) and return its first
            element. Otherwise use `mean_absolute_percentage_error`.
        :return: the correlation coefficient if lingress=True, otherwise `1 - mean_absolute_percentage_error(real_variance, fake_variance)`.
        """
        self.pca_r = PCA(n_components=5)
        self.pca_f = PCA(n_components=5)

        real = numerical_encoding(self.real, nominal_columns=self.categorical_columns)
        fake = numerical_encoding(self.fake, nominal_columns=self.categorical_columns)

        # Each PCA is fitted independently, so the two encodings do not need
        # to share the same columns.
        self.pca_r.fit(real)
        self.pca_f.fit(fake)

        real_variance = self.pca_r.explained_variance_
        fake_variance = self.pca_f.explained_variance_
        if self.verbose:
            results = pd.DataFrame({'real': real_variance, 'fake': fake_variance})
            print('\nTop 5 PCA components:')
            print(results.to_string())

        if lingress:
            # Take the first element instead of unpacking a fixed number of
            # values: metrics such as scipy.stats.pearsonr return two values,
            # so a three-way unpack raises ValueError.
            return self.comparison_metric(real_variance, fake_variance)[0]

        pca_error = mean_absolute_percentage_error(real_variance, fake_variance)
        return 1 - pca_error
コード例 #3
0
def test_numerical_encoding():
    """Compare numerical_encoding output for the real and fake samples with the stored CSV fixtures."""
    fixtures = (
        (real, 'real_test_sample_numerical_encoded.csv'),
        (fake, 'fake_test_sample_numerical_encoded.csv'),
    )
    for frame, csv_name in fixtures:
        encoded = numerical_encoding(frame, nominal_columns=cat_cols)
        # Widen uint8 columns to int64 before comparing (presumably because
        # the fixture read back from CSV carries int64 columns).
        narrow_cols = encoded.select_dtypes(include=['uint8']).columns.tolist()
        encoded[narrow_cols] = encoded[narrow_cols].astype('int64')
        expected = pd.read_csv(test_data_folder/csv_name)
        pd.testing.assert_frame_equal(encoded, expected)
コード例 #4
0
    def convert_numerical(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Encode `self.real` and `self.fake` with `numerical_encoding` and give both results the same columns in the same order.

        The column set and order are taken from the encoded real frame (sorted by name). Columns the encoded fake frame lacks are added filled with 0;
        columns only present in the encoded fake frame are dropped.

        :return: `(real, fake)` encoded dataframes with identical columns.
        """
        encoded_real = numerical_encoding(self.real, nominal_columns=self.categorical_columns)
        encoded_fake = numerical_encoding(self.fake, nominal_columns=self.categorical_columns)

        ordered = sorted(encoded_real.columns.tolist())
        already_present = set(encoded_fake.columns.tolist())
        missing = [name for name in ordered if name not in already_present]
        for name in missing:
            encoded_fake[name] = 0

        return encoded_real[ordered], encoded_fake[ordered]
コード例 #5
0
def test_numerical_encoding():
    """
    Check whether the dython numerical_encoding results are still computed as expected.

    The real and the fake sample are each encoded and compared with their stored CSV fixture.
    """
    def assert_matches_fixture(frame, fixture_name):
        encoded = numerical_encoding(frame, nominal_columns=cat_cols)
        # uint8 columns are cast to int64 before the comparison (presumably
        # to match the dtypes pandas infers when reading the fixture).
        uint8_columns = encoded.select_dtypes(include=['uint8']).columns.tolist()
        encoded[uint8_columns] = encoded[uint8_columns].astype('int64')
        fixture = pd.read_csv(test_data_folder / fixture_name)
        pd.testing.assert_frame_equal(encoded, fixture)

    assert_matches_fixture(real, 'real_test_sample_numerical_encoded.csv')
    assert_matches_fixture(fake, 'fake_test_sample_numerical_encoded.csv')
コード例 #6
0
    def plot_pca(self):
        """
        Show two side-by-side scatter plots of the first two PCA components: real data on the left, fake data on the right.

        Each dataset is encoded with `numerical_encoding` and projected by its own, independently fitted 2-component PCA.
        """
        encoded_frames = [
            numerical_encoding(data, nominal_columns=self.categorical_columns)
            for data in (self.real, self.fake)
        ]
        # Real is fitted first, then fake, each with a fresh PCA instance.
        projections = [
            PCA(n_components=2).fit_transform(frame)
            for frame in encoded_frames
        ]

        fig, axes = plt.subplots(1, 2, figsize=(12, 6))
        fig.suptitle('First two components of PCA', fontsize=16)
        titles = ('Real data', 'Fake data')
        for axis, points, title in zip(axes, projections, titles):
            sns.scatterplot(ax=axis, x=points[:, 0], y=points[:, 1])
            axis.set_title(title)
        plt.show()
コード例 #7
0
    def estimator_evaluation(self,
                             target_col: str,
                             target_type: str = 'class') -> float:
        """
        Run a full estimator evaluation, including training, on the real and the fake data.

        The features of both datasets are encoded with `numerical_encoding` and aligned to the sorted columns of the encoded real features. Each dataset
        is split 80/20 into train and test parts, which are stored on the instance. A list of regressors or classifiers (depending on `target_type`) is
        built and deep-copied into `self.r_estimators` and `self.f_estimators`. Fitting and scoring are delegated to `self.fit_estimators()` and
        `self.score_estimators()`; the resulting score table is printed.

        :param target_col: which column should be considered the target for both the regression and the classification task.
        :param target_type: what kind of task this is. Can be either ``class`` or ``regr``.
        :return: for ``regr`` the first element returned by `self.comparison_metric` on the ``real`` and ``fake`` score columns; for ``class``
            ``1 - mean_absolute_percentage_error(f1_real, f1_fake)``.
        :raises ValueError: if `target_type` is neither ``class`` nor ``regr``.
        """
        # Validate once, before any instance state is modified or any
        # expensive encoding is done.
        if target_type not in ('class', 'regr'):
            raise ValueError("target_type must be 'regr' or 'class'")

        self.target_col = target_col
        self.target_type = target_type

        # Convert both datasets to numerical representations and split x and y
        real_x = numerical_encoding(self.real.drop([target_col], axis=1),
                                    nominal_columns=self.categorical_columns)
        fake_x = numerical_encoding(self.fake.drop([target_col], axis=1),
                                    nominal_columns=self.categorical_columns)

        columns = sorted(real_x.columns.tolist())
        real_x = real_x[columns]

        # Columns the fake encoding lacks are added as all-zero columns;
        # columns only present in the fake encoding are dropped.
        fake_columns = set(fake_x.columns.tolist())
        for col in columns:
            if col not in fake_columns:
                fake_x[col] = 0
        fake_x = fake_x[columns]

        assert real_x.columns.tolist() == fake_x.columns.tolist(
        ), f'real and fake columns are different: \n{real_x.columns}\n{fake_x.columns}'

        if target_type == 'class':
            # Encode real and fake target the same: the integer codes come
            # from the real target and are re-used for the fake target.
            real_y, uniques = pd.factorize(self.real[target_col])
            mapping = {key: value for value, key in enumerate(uniques)}
            # NOTE: a fake label that never occurs in the real target maps to
            # None here (dict.get default).
            fake_y = [
                mapping.get(key) for key in self.fake[target_col].tolist()
            ]
        else:
            real_y = self.real[target_col]
            fake_y = self.fake[target_col]

        # For reproducibility: train_test_split is called without an explicit
        # random_state, so the global NumPy seed is set first.
        np.random.seed(self.random_seed)

        self.real_x_train, self.real_x_test, self.real_y_train, self.real_y_test = train_test_split(
            real_x, real_y, test_size=0.2)
        self.fake_x_train, self.fake_x_test, self.fake_y_train, self.fake_y_test = train_test_split(
            fake_x, fake_y, test_size=0.2)

        if target_type == 'regr':
            self.estimators = [
                RandomForestRegressor(n_estimators=20,
                                      max_depth=5,
                                      random_state=42),
                Lasso(random_state=42),
                Ridge(alpha=1.0, random_state=42),
                ElasticNet(random_state=42),
            ]
        else:
            self.estimators = [
                LogisticRegression(multi_class='auto',
                                   solver='lbfgs',
                                   max_iter=500,
                                   random_state=42),
                RandomForestClassifier(n_estimators=10, random_state=42),
                DecisionTreeClassifier(random_state=42),
                MLPClassifier([50, 50],
                              solver='adam',
                              activation='relu',
                              learning_rate='adaptive',
                              random_state=42),
            ]

        # Independent copies of the (unfitted) estimators, one set per dataset.
        self.r_estimators = copy.deepcopy(self.estimators)
        self.f_estimators = copy.deepcopy(self.estimators)
        self.estimator_names = [type(clf).__name__ for clf in self.estimators]

        for estimator in self.estimators:
            assert hasattr(estimator, 'fit')
            assert hasattr(estimator, 'score')

        self.fit_estimators()
        self.estimators_scores = self.score_estimators()
        if target_type == 'class':
            print('\nClassifier F1-scores and their Jaccard similarities:')
        else:
            print('\nRegressor MSE-scores and their Jaccard similarities:')
        print(self.estimators_scores.to_string())

        if target_type == 'regr':
            # Only the first returned value (the coefficient) is needed;
            # indexing works no matter how many extra values the metric returns.
            return self.comparison_metric(self.estimators_scores['real'],
                                          self.estimators_scores['fake'])[0]

        mean = mean_absolute_percentage_error(
            self.estimators_scores['f1_real'],
            self.estimators_scores['f1_fake'])
        return 1 - mean
コード例 #8
0
    def run(self):
        """
        Build the train and test CSV files from the input dataset.

        Steps, in order: read the input CSV, split it (stratified hold-out on ``stroke`` or `self._kfold_split`), impute missing ``smoking_status`` and
        ``bmi`` values according to the configured strategies, encode the nominal columns with `numerical_encoding`, optionally standardise ``age``,
        ``avg_glucose_level`` and ``bmi``, optionally rebalance the training set, and write both frames to the task outputs.

        All imputation values and scaler statistics are computed on the training split only and then applied to both splits.

        NOTE(review): train and test are encoded separately, so they may end up with different columns if a category is absent from one split - confirm
        against `numerical_encoding`.
        """
        os.makedirs(DATASET_DIR, exist_ok=True)

        df = pd.read_csv(self.input().path)

        if self.dataset_split_method == "holdout":
            train_df, test_df = train_test_split(df,
                                                 test_size=self.test_size,
                                                 stratify=df["stroke"],
                                                 random_state=self.seed)
        else:
            train_df, test_df = self._kfold_split(df)

        # The splits may be views/slices of `df`; copy them so the column
        # assignments below modify independent frames.
        train_df = train_df.copy()
        test_df = test_df.copy()

        genders = ("Male", "Female", "Other")

        def first_mode(series):
            """Return the first mode of `series`, or None when it has none."""
            # Series.mode() returns a Series (possibly with several values or
            # empty), not a scalar.
            modes = series.mode()
            return modes.iloc[0] if not modes.empty else None

        def fill_missing_by_gender(frame, column, value_by_gender):
            """Fill missing `column` entries of `frame` in place, looked up by each row's gender."""
            missing = frame[column].isna()
            if missing.any():
                # `.values` makes the assignment positional, so it does not
                # depend on index alignment. Genders without a usable value
                # leave the entry missing.
                frame.loc[missing, column] = frame.loc[
                    missing, "gender"].map(value_by_gender).values

        if self.smoking_status_imputation_strategy == "mode":
            smoking_status_mode = first_mode(train_df["smoking_status"])
            if smoking_status_mode is not None:
                train_df["smoking_status"] = train_df["smoking_status"].fillna(
                    smoking_status_mode)
                test_df["smoking_status"] = test_df["smoking_status"].fillna(
                    smoking_status_mode)
        elif self.smoking_status_imputation_strategy == "mode_by_gender":
            smoking_status_mode_dict = {
                gender: first_mode(
                    train_df.loc[train_df["gender"] == gender,
                                 "smoking_status"])
                for gender in genders
            }
            fill_missing_by_gender(train_df, "smoking_status",
                                   smoking_status_mode_dict)
            fill_missing_by_gender(test_df, "smoking_status",
                                   smoking_status_mode_dict)

        if self.bmi_imputation_strategy == "mean":
            bmi_mean = train_df["bmi"].mean()
            train_df["bmi"] = train_df["bmi"].fillna(bmi_mean)
            test_df["bmi"] = test_df["bmi"].fillna(bmi_mean)
        elif self.bmi_imputation_strategy == "mean_by_gender":
            bmi_mean_dict = {
                gender: train_df.loc[train_df["gender"] == gender,
                                     "bmi"].mean()
                for gender in genders
            }
            fill_missing_by_gender(train_df, "bmi", bmi_mean_dict)
            fill_missing_by_gender(test_df, "bmi", bmi_mean_dict)

        nominal_columns = [
            "gender", "hypertension", "heart_disease", "ever_married",
            "work_type", "Residence_type", "smoking_status"
        ]
        train_df = numerical_encoding(train_df,
                                      nominal_columns=nominal_columns,
                                      nan_strategy="SKIP")
        test_df = numerical_encoding(test_df,
                                     nominal_columns=nominal_columns,
                                     nan_strategy="SKIP")

        if self.normalize_numerical_features:
            for column in ("age", "avg_glucose_level", "bmi"):
                scaler = StandardScaler()
                # scikit-learn scalers require 2-D input, hence the
                # single-column frame `[[column]]`; `ravel()` flattens the
                # (n, 1) result back into a column.
                scaler.fit(train_df[[column]])
                train_df[column] = scaler.transform(
                    train_df[[column]]).ravel()
                test_df[column] = scaler.transform(test_df[[column]]).ravel()

        if self.sampling_strategy != "none":
            train_df = self._balance_dataset(train_df)

        train_df.to_csv(self.output()[0].path, index=False)
        test_df.to_csv(self.output()[1].path, index=False)