def row_distance(self, n_samples: int = None) -> Tuple[float, float]:
    """
    Calculate mean and standard deviation of the nearest-row distances between `self.real` and `self.fake`.

    Both tables are numerically encoded and the fake encoding is aligned to the (sorted) columns of
    the real encoding; columns missing from the fake encoding are filled with 0. Every column with
    more than two distinct values in the real data is standardised, each table with its own mean
    and standard deviation. For each of the first `n_samples` real rows the distance (``cdist``'s
    default metric) to the closest of the first `n_samples` fake rows is taken.

    :param n_samples: Number of rows of each table to use, defaults to ``len(self.real)``. A full
        pairwise distance matrix is built, so time and memory grow quadratically with this value.
    :return: `(mean, std)` of these minimum distances.
    """
    if n_samples is None:
        n_samples = len(self.real)

    def standardise(series):
        centred = series - series.mean()
        spread = series.std()
        # A constant (or single-row) column has a zero/NaN standard deviation. Dividing by it
        # would turn the column, and through cdist every distance, into NaN, so only centre it.
        if not spread > 0:
            return centred
        return centred / spread

    real = numerical_encoding(self.real, nominal_columns=self.categorical_columns)
    fake = numerical_encoding(self.fake, nominal_columns=self.categorical_columns)

    # reindex returns fresh frames with identical column order; columns the fake encoding lacks
    # (e.g. a category that never occurs in the fake data) are created and filled with 0.
    columns = sorted(real.columns.tolist())
    real = real.reindex(columns=columns)
    fake = fake.reindex(columns=columns, fill_value=0)

    for column in columns:
        # Binary / one-hot columns (at most two distinct values) are left on their 0/1 scale.
        if real[column].nunique(dropna=False) > 2:
            real[column] = standardise(real[column])
            fake[column] = standardise(fake[column])

    distances = cdist(real.iloc[:n_samples], fake.iloc[:n_samples])
    min_distances = np.min(distances, axis=1)
    return np.mean(min_distances), np.std(min_distances)
def pca_correlation(self, lingress=False):
    """
    Compare the explained variance of the top 5 PCA components of the real and the fake data.

    Both tables are numerically encoded and a separate 5-component PCA is fitted on each; the
    fitted models are stored on ``self.pca_r`` and ``self.pca_f``. When ``self.verbose`` is set,
    the two explained-variance vectors are printed side by side.

    NOTE(review): earlier documentation described the default result as ``1 - MAPE(log(real),
    log(fake))``. This method passes the raw explained variances to
    ``mean_absolute_percentage_error``; any log transform would have to happen inside that
    helper -- confirm.

    :param lingress: whether to compare the explained variances with ``self.comparison_metric``
        (a correlation such as Pearson's r) instead of the percentage error.
    :return: the first value returned by ``self.comparison_metric`` if ``lingress`` is True,
        otherwise ``1 - mean_absolute_percentage_error(real_variance, fake_variance)``.
    """
    self.pca_r = PCA(n_components=5)
    self.pca_f = PCA(n_components=5)

    real = numerical_encoding(self.real, nominal_columns=self.categorical_columns)
    fake = numerical_encoding(self.fake, nominal_columns=self.categorical_columns)

    self.pca_r.fit(real)
    self.pca_f.fit(fake)

    real_variance = self.pca_r.explained_variance_
    fake_variance = self.pca_f.explained_variance_

    if self.verbose:
        results = pd.DataFrame({'real': real_variance, 'fake': fake_variance})
        print('\nTop 5 PCA components:')
        print(results.to_string())

    if lingress:
        # Only the coefficient is needed. Indexing instead of unpacking a fixed number of values
        # works for metrics returning `(coefficient, p_value)` as well as for longer tuples;
        # unpacking three values fails for a metric that returns two.
        return self.comparison_metric(real_variance, fake_variance)[0]

    pca_error = mean_absolute_percentage_error(real_variance, fake_variance)
    return 1 - pca_error
def test_numerical_encoding():
    # Encode each sample and compare it with its stored reference CSV: real first, then fake.
    cases = (
        (real, 'real_test_sample_numerical_encoded.csv'),
        (fake, 'fake_test_sample_numerical_encoded.csv'),
    )
    for frame, reference_name in cases:
        encoded = numerical_encoding(frame, nominal_columns=cat_cols)
        # Widen uint8 columns to int64 before comparing (presumably because the stored CSV is
        # read back with int64 columns).
        narrow = encoded.select_dtypes(include=['uint8']).columns.tolist()
        encoded[narrow] = encoded[narrow].astype('int64')
        expected = pd.read_csv(test_data_folder / reference_name)
        pd.testing.assert_frame_equal(encoded, expected)
def convert_numerical(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Encode the real and fake datasets numerically and force both onto the same columns.

    Encoding the two tables independently can yield different columns, e.g. when a categorical
    column has many values or very unbalanced values and one of them never shows up in the fake
    data. The sorted columns of the real encoding are used as the reference: columns the fake
    encoding lacks are added and filled with 0, and any extra fake columns are dropped.

    :return: Real and fake dataframe in numerical representation with identical, sorted columns.
    """
    real = numerical_encoding(self.real, nominal_columns=self.categorical_columns)
    fake = numerical_encoding(self.fake, nominal_columns=self.categorical_columns)

    ordered = sorted(real.columns.tolist())
    for name in ordered:
        if name not in fake.columns:
            # This encoded column never occurred in the fake data.
            fake[name] = 0

    return real[ordered], fake[ordered]
def test_numerical_encoding():
    """
    Regression test: the dython numerical_encoding output for the real and the fake sample must
    still match the encodings stored on disk.
    """

    def check_against_stored(sample, csv_name):
        result = numerical_encoding(sample, nominal_columns=cat_cols)
        # Cast uint8 columns to int64 so dtypes can match the frame loaded from CSV.
        uint8_names = result.select_dtypes(include=['uint8']).columns.tolist()
        result[uint8_names] = result[uint8_names].astype('int64')
        reference = pd.read_csv(test_data_folder / csv_name)
        pd.testing.assert_frame_equal(result, reference)

    check_against_stored(real, 'real_test_sample_numerical_encoded.csv')
    check_against_stored(fake, 'fake_test_sample_numerical_encoded.csv')
def plot_pca(self):
    """
    Show side-by-side scatter plots of the first two PCA components of the real and fake data.

    Each dataset is numerically encoded and projected with its own 2-component PCA, so the axes
    of the two panels are fitted independently.
    """
    encoded = [
        numerical_encoding(frame, nominal_columns=self.categorical_columns)
        for frame in (self.real, self.fake)
    ]
    projections = [PCA(n_components=2).fit_transform(table) for table in encoded]

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    fig.suptitle('First two components of PCA', fontsize=16)
    for axis, points, title in zip(axes, projections, ('Real data', 'Fake data')):
        sns.scatterplot(ax=axis, x=points[:, 0], y=points[:, 1])
        axis.set_title(title)
    plt.show()
def estimator_evaluation(self, target_col: str, target_type: str = 'class') -> float:
    """
    Do a full estimator evaluation, including training.

    The features of the real and fake data are numerically encoded and aligned to the same
    columns, each dataset is split 80/20 into train and test, and a fixed list of regressors or
    classifiers (depending on ``target_type``) is created. ``self.r_estimators`` and
    ``self.f_estimators`` are two independent deep copies of that list;
    ``self.fit_estimators()`` and ``self.score_estimators()`` (not shown here) are expected to
    train one copy per dataset and score them on both test sets. The resulting score table is
    stored on ``self.estimators_scores`` and printed.

    If ``target_type`` is ``regr`` the ``real`` and ``fake`` score columns are compared with
    ``self.comparison_metric``. If it is ``class`` the result is ``1 - MAPE(f1_real, f1_fake)``.

    :param target_col: which column should be considered the target for both the regression and
        the classification task.
    :param target_type: what kind of task this is. Can be either ``class`` or ``regr``.
    :return: Correlation value or 1 - MAPE
    :raises ValueError: if ``target_type`` is neither ``class`` nor ``regr``.
    """
    self.target_col = target_col
    self.target_type = target_type

    # Validate once, before any encoding or training is started. ValueError is still an
    # Exception, so callers that caught the previous bare Exception keep working.
    if target_type not in ('class', 'regr'):
        raise ValueError('Target Type must be regr or class')

    # Convert both datasets to numerical representations and split x and y
    real_x = numerical_encoding(self.real.drop([target_col], axis=1), nominal_columns=self.categorical_columns)
    fake_x = numerical_encoding(self.fake.drop([target_col], axis=1), nominal_columns=self.categorical_columns)

    # Align both feature tables to the sorted real columns; encoded columns that never occur in
    # the fake data are created and filled with 0, which guarantees identical column lists.
    columns = sorted(real_x.columns.tolist())
    real_x = real_x.reindex(columns=columns)
    fake_x = fake_x.reindex(columns=columns, fill_value=0)

    if target_type == 'class':
        # Encode real and fake target the same
        real_y, uniques = pd.factorize(self.real[target_col])
        mapping = {label: code for code, label in enumerate(uniques)}
        # NOTE(review): a label that only occurs in the fake data maps to None here.
        fake_y = [mapping.get(label) for label in self.fake[target_col].tolist()]
    else:
        real_y = self.real[target_col]
        fake_y = self.fake[target_col]

    # For reproducibility:
    np.random.seed(self.random_seed)

    self.real_x_train, self.real_x_test, self.real_y_train, self.real_y_test = train_test_split(
        real_x, real_y, test_size=0.2)
    self.fake_x_train, self.fake_x_test, self.fake_y_train, self.fake_y_test = train_test_split(
        fake_x, fake_y, test_size=0.2)

    if target_type == 'regr':
        self.estimators = [
            RandomForestRegressor(n_estimators=20, max_depth=5, random_state=42),
            Lasso(random_state=42),
            Ridge(alpha=1.0, random_state=42),
            ElasticNet(random_state=42),
        ]
    else:
        self.estimators = [
            LogisticRegression(multi_class='auto', solver='lbfgs', max_iter=500, random_state=42),
            RandomForestClassifier(n_estimators=10, random_state=42),
            DecisionTreeClassifier(random_state=42),
            MLPClassifier([50, 50], solver='adam', activation='relu', learning_rate='adaptive', random_state=42),
        ]

    # Separate copies so that training on one dataset never touches the other set of models.
    self.r_estimators = copy.deepcopy(self.estimators)
    self.f_estimators = copy.deepcopy(self.estimators)
    self.estimator_names = [type(clf).__name__ for clf in self.estimators]

    for estimator in self.estimators:
        assert hasattr(estimator, 'fit')
        assert hasattr(estimator, 'score')

    self.fit_estimators()
    self.estimators_scores = self.score_estimators()

    if self.target_type == 'class':
        print('\nClassifier F1-scores and their Jaccard similarities:')
    else:
        print('\nRegressor MSE-scores and their Jaccard similarities:')
    print(self.estimators_scores.to_string())

    if self.target_type == 'regr':
        # Only the coefficient is used; indexing works for any tuple-like metric result.
        return self.comparison_metric(self.estimators_scores['real'], self.estimators_scores['fake'])[0]

    mean = mean_absolute_percentage_error(
        self.estimators_scores['f1_real'], self.estimators_scores['f1_fake'])
    return 1 - mean
def run(self):
    """
    Split the input CSV into train and test sets, impute, encode and scale them, and write both.

    The steps are controlled by attributes of the task:

    * ``dataset_split_method``: ``"holdout"`` makes a hold-out split stratified on ``stroke``;
      any other value delegates to ``self._kfold_split``.
    * ``smoking_status_imputation_strategy``: ``"mode"`` or ``"mode_by_gender"``; any other value
      leaves missing smoking statuses untouched.
    * ``bmi_imputation_strategy``: ``"mean"`` or ``"mean_by_gender"``; any other value leaves
      missing BMI values untouched.
    * The nominal columns are encoded with ``numerical_encoding`` (``nan_strategy="SKIP"``).
    * ``normalize_numerical_features``: standardise ``age``, ``avg_glucose_level`` and ``bmi``.
    * ``sampling_strategy``: anything but ``"none"`` passes the training set through
      ``self._balance_dataset``.

    All imputation values and scaling statistics are computed on the training split only and then
    applied to both splits. The train and test frames are written to ``self.output()[0]`` and
    ``self.output()[1]`` respectively.
    """
    genders = ("Male", "Female", "Other")

    def first_mode(values):
        # Series.mode() returns a Series (empty when there is no data, several entries on ties),
        # not a scalar; take its first entry, or NaN when there is none.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else float("nan")

    def fill_by_gender(frame, column, value_by_gender):
        # Look the replacement up by each row's gender. The mapped Series shares the frame's
        # index, so fillna only touches the missing rows; unknown genders stay missing.
        return frame[column].fillna(frame["gender"].map(value_by_gender))

    os.makedirs(DATASET_DIR, exist_ok=True)

    df = pd.read_csv(self.input().path)

    if self.dataset_split_method == "holdout":
        train_df, test_df = train_test_split(df,
                                             test_size=self.test_size,
                                             stratify=df["stroke"],
                                             random_state=self.seed)
    else:
        train_df, test_df = self._kfold_split(df)

    # Work on independent copies so the column assignments below never write into a view of `df`.
    train_df = train_df.copy()
    test_df = test_df.copy()

    if self.smoking_status_imputation_strategy == "mode":
        smoking_status_mode = first_mode(train_df["smoking_status"])
        train_df["smoking_status"] = train_df["smoking_status"].fillna(smoking_status_mode)
        test_df["smoking_status"] = test_df["smoking_status"].fillna(smoking_status_mode)
    elif self.smoking_status_imputation_strategy == "mode_by_gender":
        smoking_status_mode_dict = {
            gender: first_mode(train_df.loc[train_df["gender"] == gender, "smoking_status"])
            for gender in genders
        }
        train_df["smoking_status"] = fill_by_gender(train_df, "smoking_status", smoking_status_mode_dict)
        test_df["smoking_status"] = fill_by_gender(test_df, "smoking_status", smoking_status_mode_dict)

    if self.bmi_imputation_strategy == "mean":
        bmi_mean = train_df["bmi"].mean()
        train_df["bmi"] = train_df["bmi"].fillna(bmi_mean)
        test_df["bmi"] = test_df["bmi"].fillna(bmi_mean)
    elif self.bmi_imputation_strategy == "mean_by_gender":
        bmi_mean_dict = {
            gender: train_df.loc[train_df["gender"] == gender, "bmi"].mean()
            for gender in genders
        }
        train_df["bmi"] = fill_by_gender(train_df, "bmi", bmi_mean_dict)
        test_df["bmi"] = fill_by_gender(test_df, "bmi", bmi_mean_dict)

    nominal_columns = [
        "gender", "hypertension", "heart_disease", "ever_married",
        "work_type", "Residence_type", "smoking_status"
    ]

    # NOTE(review): the two splits are encoded independently, so a category present in only one
    # of them may give the train and test files different columns -- confirm this is acceptable.
    train_df = numerical_encoding(train_df, nominal_columns=nominal_columns, nan_strategy="SKIP")
    test_df = numerical_encoding(test_df, nominal_columns=nominal_columns, nan_strategy="SKIP")

    if self.normalize_numerical_features:
        for column in ("age", "avg_glucose_level", "bmi"):
            # StandardScaler expects 2-D input, hence the single-column frame; the (n, 1) result
            # is flattened again before it is stored back as a column.
            scaler = StandardScaler()
            scaler.fit(train_df[[column]])
            train_df[column] = scaler.transform(train_df[[column]]).ravel()
            test_df[column] = scaler.transform(test_df[[column]]).ravel()

    if self.sampling_strategy != "none":
        train_df = self._balance_dataset(train_df)

    train_df.to_csv(self.output()[0].path, index=False)
    test_df.to_csv(self.output()[1].path, index=False)