def impute_parameter_adjustment(method, param_grid, impute_radio, x_init, y_init, reference_x, reference_y):
    """Grid-search an imputation hyperparameter and plot CER curves.

    For each missing-data fraction in ``impute_radio``, this repeatedly
    (I = 20 rounds) corrupts ``x_init`` with MCAR missingness, imputes it
    with ``method`` for every candidate value in ``param_grid``, and
    accumulates two classification-error rates (CER) under a pre-trained
    voting classifier loaded from disk:

    * ``acc_1`` -- disagreement between predictions on imputed data and
      predictions on the clean data (imputation-induced error);
    * ``acc_2`` -- disagreement between predictions on imputed data and
      the true labels ``y_init``.

    Finally it plots both averaged curves side by side (one line per
    missing rate) and shows the figure. Returns nothing.

    Parameters
    ----------
    method : str
        One of 'knn', 'mice', 'em', 'som'.
    param_grid : iterable of int
        Candidate hyperparameter values: k for KNN, number of posterior
        draws for MICE, loop count for EM, and the parameter forwarded to
        ``impute_SOM`` (its exact meaning is defined there, not visible
        here).
    impute_radio : iterable of float
        Missing-data fractions; paired positionally with plot markers,
        so at most 5 values are distinguished.
    x_init, y_init
        Clean feature matrix and its labels.
    reference_x, reference_y
        Complete auxiliary data stacked under the corrupted rows to help
        the imputers; NOTE(review): ``reference_y`` is never used.
    """
    # Pre-trained hard-voting ensemble; path is relative to this module.
    model = joblib.load('..\\models\\vote_model_hard.joblib')
    markers = ['o', '*', '1', 's', '2']
    I = 20  # Monte-Carlo repetitions per (radio, n) cell
    for radio, marker in zip(impute_radio, markers):
        # Error accumulators keyed by candidate hyperparameter value.
        acc_1 = {i: 0 for i in param_grid}
        acc_2 = {i: 0 for i in param_grid}
        for m in range(I):
            # Fresh MCAR corruption each round.
            corruptor = Corruptor(x_init, radio)
            x_miss = getattr(corruptor, "mcar")()
            for n in param_grid:
                if method == 'knn':
                    # Stack reference rows below the corrupted block so the
                    # imputer can borrow from complete data, then keep only
                    # the first x_init.shape[0] (imputed) rows.
                    x_impute = fancyimpute.KNN(k=n).fit_transform(
                        np.vstack(
                            (x_miss, reference_x)))[range(x_init.shape[0])]
                if method == 'mice':
                    # Multiple imputation: average n posterior draws with
                    # fixed per-draw seeds for reproducibility.
                    data_impute_list = []
                    for i in range(n):
                        imputer = fancyimpute.IterativeImputer(
                            n_iter=13, sample_posterior=True, random_state=i)
                        data_impute_list.append(
                            imputer.fit_transform(
                                np.vstack(
                                    (x_miss, reference_x)))[range(x_init.shape[0])])
                    x_impute = np.mean(data_impute_list, 0)
                    # Progress trace (MICE is the slow branch).
                    print(radio, m, n)
                if method == 'em':
                    x_impute = em(np.vstack((x_miss, reference_x)),
                                  loops=n)[range(x_init.shape[0])]
                if method == 'som':
                    x_impute = impute_SOM(x_miss, n)[range(x_init.shape[0])]
                y_pred1 = model.predict(x_impute)
                # NOTE(review): loop-invariant -- could be hoisted out of
                # all three loops without changing results.
                y_pred2 = model.predict(x_init)
                # 1 - accuracy == classification error rate.
                acc_1[n] += 1 - accuracy_score(y_pred1, y_pred2)
                acc_2[n] += 1 - accuracy_score(y_pred1, y_init)
        # Average the accumulated errors over the I rounds.
        acc_1 = {i: (j / I) for i, j in acc_1.items()}
        acc_2 = {i: (j / I) for i, j in acc_2.items()}
        plt.subplot(121)
        plt.plot(acc_1.keys(), acc_1.values(), marker=marker,
                 label='%.1f%%' % (radio * 100))
        plt.xlabel('K')
        plt.ylabel('CER between imputation and prediction')
        plt.subplot(122)
        plt.plot(acc_2.keys(), acc_2.values(), marker=marker,
                 label='%.1f%%' % (radio * 100))
        plt.xlabel('K')
        plt.ylabel('CER between imputation and real label')
    plt.legend(loc=0, bbox_to_anchor=(0.3, -0.05), ncol=5)
    plt.show()
def multi_imp_conf(perc, c, m=10):
    """Multiple imputation for confidence intervals.

    Runs ``m`` imputation rounds. In each round the design matrix is
    re-imputed with a differently-seeded IterativeImputer, a Bayesian
    ridge is fit, and the Gaussian predictive density evaluated at the
    true drug responses is accumulated (averaged over rounds).

    Parameters
    ----------
    perc, c
        Forwarded to ``glm_testing.create_missing`` / ``test_drug``;
        their semantics are defined there (not visible here).
    m : int, default=10
        Number of imputation rounds.

    Returns
    -------
    int
        Number of test points whose averaged predictive density falls
        below 0.05 (i.e. outside the nominal 95 percent band).
    """
    df, y = glm_testing.create_missing(perc=perc, c=c)
    drug_vals, drug_true = glm_testing.test_drug(c=c)
    # Size the accumulator from the data rather than the previous
    # hardcoded np.zeros(20), so a different number of test points no
    # longer breaks the in-place addition below.
    prob_sum = np.zeros(len(drug_true))  # Gaussian probs sum
    for k in range(m):
        clf = BayesianRidge()
        # Distinct, deterministic seed per round keeps the m draws
        # independent yet reproducible; modulo clamps into int32 range.
        design = fancyimpute.IterativeImputer(
            n_iter=10,
            sample_posterior=True,
            random_state=int(k * 243624) % 2**(32 - 1)).fit_transform(df)
        clf.fit(design, y)
        drug_preds, std = clf.predict(drug_vals, return_std=True)
        prob_sum += (1 / m) * scipy.stats.norm(drug_preds, std).pdf(drug_true)
    return sum(prob_sum < 0.05)  # 95 percent
def multi_imp(perc, c, m=10):
    """Multiple imputation for log probability.

    Runs ``m`` imputation rounds; per round fits a Bayesian ridge on a
    freshly imputed design matrix and accumulates the *joint* Gaussian
    density of the true drug responses (product over test points),
    averaged over rounds.

    Parameters
    ----------
    perc, c
        Forwarded to ``glm_testing.create_missing`` / ``test_drug``.
    m : int, default=10
        Number of imputation rounds.

    Returns
    -------
    float
        Negative log of the averaged joint predictive density.
    """
    df, y = glm_testing.create_missing(perc=perc, c=c)
    drug_vals, drug_true = glm_testing.test_drug(c=c)
    # (removed unused local ``drug_dists``)
    prob_sum = 0  # Gaussian probs sum (scalar: joint density per round)
    for k in range(m):
        clf = BayesianRidge()
        # Distinct, deterministic seed per round; clamp into int32 range.
        design = fancyimpute.IterativeImputer(
            n_iter=10,
            sample_posterior=True,
            random_state=int(k * 243624) % 2**(32 - 1)).fit_transform(df)
        clf.fit(design, y)
        drug_preds, std = clf.predict(drug_vals, return_std=True)
        prob_sum += (1 / m) * scipy.stats.norm(drug_preds, std * 1).pdf(drug_true).prod()
    return -np.log(prob_sum)
def impute(data, method='knn', n=5):
    """Impute missing values in ``data`` with the chosen method.

    Parameters
    ----------
    data : array-like
        Matrix with NaNs marking missing entries.
    method : str, default='knn'
        One of 'knn', 'mice', 'em', 'mean'.
    n : int, default=5
        Number of neighbours for 'knn'; ignored by the other methods.

    Returns
    -------
    ndarray
        Copy of ``data`` with missing entries filled in.

    Raises
    ------
    ValueError
        If ``method`` is not recognised. (Previously the independent
        ``if`` chain fell through and raised NameError on the unbound
        result variable instead.)
    """
    if method == 'knn':
        data_impute = fancyimpute.KNN(k=n).fit_transform(data)
    elif method == 'mice':
        # Multiple imputation: average 11 posterior draws from the
        # iterative imputer; per-draw seeds fixed for reproducibility.
        data_impute_list = []
        for i in range(11):
            imputer = fancyimpute.IterativeImputer(n_iter=13,
                                                   sample_posterior=True,
                                                   random_state=i)
            data_impute_list.append(imputer.fit_transform(data))
        data_impute = np.mean(data_impute_list, 0)
    elif method == 'em':
        data_impute = em(data)
    elif method == 'mean':
        # Column-mean fill.
        data_impute = fancyimpute.simple_fill.SimpleFill(
            fill_method='mean').fit_transform(data)
    else:
        raise ValueError('unknown imputation method: %r' % (method,))
    return data_impute
def dec_multi(perc, c, m=10):
    """Decision tree classifier with multiple imputation.

    Binarizes the response (``> 1`` is the positive class), then over
    ``m`` rounds re-imputes the design matrix, fits the tree, and
    accumulates the averaged joint likelihood of the observed test
    labels.

    Parameters
    ----------
    perc, c
        Forwarded to ``glm_testing.create_missing`` / ``test_drug``.
    m : int, default=10
        Number of imputation rounds.

    Returns
    -------
    float
        Log of the averaged joint likelihood.
    """
    clf = DecisionTreeClassifier()
    df, y = glm_testing.create_missing(perc=perc, c=c)
    drug_vals, drug_true = glm_testing.test_drug(c=c)
    # Binarize: responses above 1 are the positive class.
    y = y.apply(lambda x: 1 if x > 1 else 0)
    drug_true = drug_true.apply(lambda x: 1 if x > 1 else 0)
    # (removed unused local ``drug_dists``)
    prob_sum = 0
    for k in range(m):
        # Distinct, deterministic seed per round; clamp into int32 range.
        design = fancyimpute.IterativeImputer(
            n_iter=10,
            sample_posterior=True,
            random_state=int(k * 243624) % 2**(32 - 1)).fit_transform(df)
        clf.fit(design, y)
        # Single predict_proba call (was computed twice); deterministic
        # on a fitted tree, so behavior is unchanged.
        proba = clf.predict_proba(drug_vals)
        # Joint likelihood of the observed labels under this round's fit.
        prob_sum += (1 / m) * (proba[:, 1][drug_true == 1].prod()
                               * proba[:, 0][drug_true == 0].prod())
    return np.log(prob_sum)
def log_multi(perc, c, m=10, extras=0):
    """Logistic regression classifier with multiple imputation.

    Binarizes the response (``> 1`` is the positive class), then over
    ``m`` rounds re-imputes the design matrix, fits the logistic
    regression, and accumulates the averaged joint likelihood of the
    observed test labels.

    Parameters
    ----------
    perc, c, extras
        Forwarded to ``glm_testing.create_missing`` / ``test_drug``.
    m : int, default=10
        Number of imputation rounds.

    Returns
    -------
    float
        Log of the averaged joint likelihood.
    """
    clf = LogisticRegression(solver="liblinear")
    df, y = glm_testing.create_missing(perc=perc, c=c, extras=extras)
    drug_vals, drug_true = glm_testing.test_drug(c=c, extras=extras)
    # Binarize: responses above 1 are the positive class.
    y = y.apply(lambda x: 1 if x > 1 else 0)
    drug_true = drug_true.apply(lambda x: 1 if x > 1 else 0)
    # (removed unused local ``drug_dists``)
    prob_sum = 0  # Gaussian probs sum
    for k in range(m):
        # Distinct, deterministic seed per round; clamp into int32 range.
        design = fancyimpute.IterativeImputer(
            n_iter=10,
            sample_posterior=True,
            random_state=int(k * 243624) % 2**(32 - 1)).fit_transform(df)
        clf.fit(design, y)
        # Single predict_proba call (was computed twice); deterministic
        # on a fitted model, so behavior is unchanged.
        proba = clf.predict_proba(drug_vals)
        # Joint likelihood of the observed labels under this round's fit.
        prob_sum += (1 / m) * (proba[:, 1][drug_true == 1].prod()
                               * proba[:, 0][drug_true == 0].prod())
    return np.log(prob_sum)
def impute(df, method, verbose=False):
    """
    Impute missing data using specified imputation method.

    Parameters
    ----------
    df: pd.DataFrame
        Stat DataFrame with source columns and player/team multi-index.
    method: str/bool
        Imputation method for missing data.

        - False: Do not impute missing data.
        - None: Do not impute missing data.
        - 'BiScaler'
        - 'IterativeImpute'
        - 'IterativeSVD'
        - 'KNN': Impute with nearest neighbors.
        - 'MatrixFactorization'
        - 'Mean': Impute missing with average of other sources.
        - 'Median': Impute missing with median of other sources.
        - 'NuclearNorm'
        - 'SoftImpute'
    verbose: bool, default=False
        If True, print debugging information.

    Returns
    -------
    df: pd.DataFrame
        Imputed DataFrame with no NaNs. Note the input ``df`` is also
        modified in place (its impute columns are overwritten).

    Raises
    ------
    ValueError
        If ``method`` is not one of the options above and the data
        contains missing values. (Previously an unknown method left
        ``imputed_vals`` unbound and raised NameError instead.)
    """
    warnings.filterwarnings('ignore', category=RuntimeWarning)

    # Subset DataFrame to only include only projection columns.
    ignored_cols = ['Player', 'Team', 'Pos', 'Week', 'STATS']
    impute_cols = [col for col in list(df) if col not in ignored_cols]
    # Transpose so fancyimpute treats each stat column as a row of
    # observations across sources.
    X = df[impute_cols].copy().T

    # Impute DataFrame.
    v = verbose
    if method in [None, False]:
        imputed_vals = X.values
    elif np.sum(np.sum(X.isnull())) == 0:
        # No missing values.
        imputed_vals = X.values
    elif method == 'BiScaler':
        imputed_vals = fi.BiScaler(verbose=v).fit_transform(X)
    elif method == 'IterativeImpute':
        imputed_vals = fi.IterativeImputer(verbose=v).fit_transform(X)
    elif method == 'IterativeSVD':
        imputed_vals = fi.IterativeSVD(verbose=v).fit_transform(X)
    elif method == 'KNN':
        imputed_vals = fi.KNN(k=3, verbose=v).fit_transform(X)
    elif method == 'MatrixFactorization':
        imputed_vals = fi.MatrixFactorization(verbose=v).fit_transform(X)
    elif method == 'Mean':
        imputed_vals = fi.SimpleFill('mean').fit_transform(X)
    elif method == 'Median':
        imputed_vals = fi.SimpleFill('median').fit_transform(X)
    elif method == 'NuclearNorm':
        imputed_vals = fi.NuclearNormMinimization(verbose=v).fit_transform(X)
    elif method == 'SoftImpute':
        imputed_vals = fi.SoftImpute(verbose=v).fit_transform(X)
    else:
        # Fail loudly instead of the previous NameError fall-through.
        raise ValueError(f'Unrecognized imputation method: {method!r}')

    # Recombine ignored columns with imputed data.
    imputed_df = pd.DataFrame(imputed_vals.T, columns=X.index)
    for col in impute_cols:
        if len(imputed_df[col]) != len(df[col]):
            # Debug aid for a length mismatch before the assignment fails.
            print(f'df: {len(df[col])}\nimp: {len(imputed_df[col])}')
        df[col] = imputed_df[col].values
    return df
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import fancyimpute

# Baseline random-forest model (separate from the pipeline below).
model_tree = RandomForestClassifier(random_state=100, n_estimators=50)

# Pipeline stages: impute missing values, scale, oversample the minority
# class (SMOTE), undersample the majority class, then classify.
# NOTE(review): StandardScaler is not imported in this chunk -- it is
# presumably imported elsewhere in the file; confirm.
steps = [('imputation', fancyimpute.IterativeImputer(verbose=0)),
         ('scaler', StandardScaler()),
         ("over", SMOTE(random_state=42)),
         ("under", RandomUnderSampler()),
         ('tree', RandomForestClassifier())]
# Create the pipeline: pipeline
# (imblearn's Pipeline is required so the samplers run only during fit.)
pipeline = Pipeline(steps)

#best result with selected parameters
# Single-value grids: these appear to be the already-tuned settings kept
# in grid form for a downstream search object.
# Number of trees in random forest
n_estimators = [1400]
# Number of features to consider at every split
max_features = ['auto']
# Maximum number of levels in tree
max_depth = [130]
# Minimum number of samples required to split a node
min_samples_split = [8]
def feature_create(config):
    """ Function to describe the modeling data.

    Reads the raw Titanic-style CSV named by ``config.raw_file``,
    engineers title/cabin/sex/family/embarked/ticket/age features
    (mostly one-hot encoded), imputes remaining missing values with
    IterativeImputer, scales columns via ``scaleColumns`` and writes the
    result to ``config.feature_file``. Returns None.
    """
    df = pandas.read_csv(config.raw_file, delimiter=',')

    # Feature creation title
    # Extract the honorific ("Mr", "Mrs", ...) following a space and
    # ending with a period in the Name column.
    # NOTE(review): non-raw string with '\.' -- emits a DeprecationWarning
    # on newer Pythons; consider r' ([A-Za-z]+)\.' .
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df.loc[df["Title"] == 'Mr', 'Title2'] = 'Mr'
    df.loc[df["Title"] == 'Mrs', 'Title2'] = 'Mrs'
    df.loc[df["Title"] == 'Miss', 'Title2'] = 'Miss'
    df.loc[df["Title"] == 'Master', 'Title2'] = 'Master'
    df.loc[df["Title"] == 'Ms', 'Title2'] = 'Ms'
    df.loc[df["Title"] == 'Mlle', 'Title2'] = 'Miss'
    # NOTE(review): this overwrites the 'Ms' -> 'Ms' mapping above, so
    # 'Ms' ultimately maps to 'Miss' -- confirm that is intended.
    df.loc[df["Title"] == 'Ms', 'Title2'] = 'Miss'
    df.loc[df["Title"] == 'Mme', 'Title2'] = 'Mrs'
    # Any title not mapped above is bucketed as 'Rare'.
    df.loc[df["Title2"].isna(), 'Title2'] = 'Rare'
    df_title = pandas.get_dummies(df.Title2, prefix='Title')
    df = pandas.concat([df, df_title], axis=1)
    df['Name_len'] = df.Name.str.len()
    df['Name_space'] = df.Name.str.count(' ')
    df = df.drop(['Name'], axis=1)
    df = df.drop(['Title'], axis=1)
    df = df.drop(['Title2'], axis=1)

    # Missing Value treatment for Cabin
    # Keep only the deck letter; NaN decks get their own dummy column.
    df['Cabin_new'] = df.Cabin.str[:1]
    df = df.drop(['Cabin'], axis=1)
    df_cabin = pandas.get_dummies(df.Cabin_new, prefix='Cabin', dummy_na=True)
    df = pandas.concat([df, df_cabin], axis=1)
    df = df.drop(['Cabin_new'], axis=1)

    # Feature creation Sex
    df['Gender'] = df['Sex'].map({'female': 1, 'male': 0}).astype(int)
    df_sex = pandas.get_dummies(df.Sex, prefix='Sex')
    df = pandas.concat([df, df_sex], axis=1)
    df = df.drop(['Sex'], axis=1)

    # Family
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['withsomebody'] = df['SibSp'] + df['Parch']
    df["isalone"] = df['withsomebody'].copy()
    # NOTE(review): chained .loc assignment -- triggers
    # SettingWithCopyWarning; df.loc[mask, "isalone"] = ... is the
    # supported form.
    df["isalone"].loc[df['withsomebody'] > 0] = 0
    df["isalone"].loc[df['withsomebody'] == 0] = 1

    # Missing Value treatment for Embarked
    # Missing embarkation ports are assumed to be 'C' -- TODO confirm.
    df['Embarked'] = df['Embarked'].fillna('C')
    df_Embarked = pandas.get_dummies(df.Embarked, prefix='Embarked')
    df = pandas.concat([df, df_Embarked], axis=1)
    df = df.drop(['Embarked'], axis=1)

    # Missing Value treatment for Ticket
    # Split "PREFIX NUMBER" tickets; column 3 / 'Ticket1' end up holding
    # the last (numeric) token regardless of how many parts the split
    # produced, and 'Ticket2' holds the alphabetic prefix if any.
    new = df["Ticket"].str.split(" ", n=2, expand=True)
    new[3] = numpy.where(new[2].isna(), new[1],
                         new[2])
    new['Ticket1'] = numpy.where(new[3].isna(), new[0], new[3])
    new['Ticket2'] = new[0].str.extract('([A-Za-z]+)', expand=False)
    new['T_length'] = new.Ticket1.str.len()
    new['T_First'] = new.Ticket1.str[:1]
    new = new.drop([0], axis=1)
    new = new.drop([1], axis=1)
    new = new.drop([2], axis=1)
    new = new.drop([3], axis=1)
    new = new.drop(['Ticket1'], axis=1)
    df = pandas.concat([df, new], axis=1)
    df = df.drop(['Ticket'], axis=1)
    # Bucket ticket-number length into Short/Medium/Long.
    df.loc[df['T_length'] < 5, 'T_l_new'] = 'S'
    df.loc[df['T_length'] == 5, 'T_l_new'] = 'M'
    df.loc[df['T_length'] > 5, 'T_l_new'] = 'L'
    # Bucket leading digit: '1' / '2' / everything else.
    df.loc[df['T_First'] == '1', 'T_f_new'] = 'S'
    df.loc[df['T_First'] == '2', 'T_f_new'] = 'M'
    df.loc[df['T_f_new'].isna(), 'T_f_new'] = 'L'
    # Prefixes empirically associated with expensive tickets.
    df['High_ticket'] = df['Ticket2'].isin(['PP', 'PC', 'C', 'P'])
    df_t1 = pandas.get_dummies(df.T_l_new, prefix='T_l')
    df = pandas.concat([df, df_t1], axis=1)
    df_t2 = pandas.get_dummies(df.T_f_new, prefix='T_F')
    df = pandas.concat([df, df_t2], axis=1)
    df = df.drop(['T_l_new'], axis=1)
    df = df.drop(['T_f_new'], axis=1)
    df = df.drop(['T_First'], axis=1)
    df = df.drop(['T_length'], axis=1)
    df = df.drop(['Ticket2'], axis=1)

    # interaction between class and age
    df['Age*Class'] = df["Age"] * df["Pclass"]
    # interaction between class and child
    df.loc[df['Age'] < 16, 'ischild'] = 1
    df.loc[df.ischild.isna(), 'ischild'] = 0
    df["Child*Class"] = df["ischild"] * df["Pclass"]
    # interaction between class and gender
    df["Gender*Class"] = df["Gender"] * df["Pclass"]

    # Missing Value treatment for Age
    # Impute all remaining numeric NaNs (Age in particular) on the
    # feature matrix without the target, then restore Survived.
    df_ii = pandas.DataFrame(fancyimpute.IterativeImputer().fit_transform(
        df.drop('Survived', axis=1)))
    df_ii.columns = df.drop('Survived', axis=1).columns
    df_ii.index = df.index
    df_ii = pandas.concat([df_ii, df.Survived], axis=1)
    # NOTE(review): '_data_' is not created in this function -- it is
    # presumably a train/test flag column already present in raw_file
    # (0 = test rows, whose Survived is blanked out here); verify.
    df_ii.loc[df_ii['_data_'] == 0, 'Survived'] = float('NaN')
    df = df_ii
    # Age buckets A..E.
    # NOTE(review): the first condition is '<= 26' but the 'B' bucket is
    # (16, 26], so ages 16-26 get 'A' then are overwritten with 'B' --
    # the first line likely intended '<= 16'; confirm.
    df.loc[df['Age'] <= 26, 'Age_new'] = 'A'
    df.loc[(df['Age'] > 16) & (df['Age'] <= 26), 'Age_new'] = 'B'
    df.loc[(df['Age'] > 26) & (df['Age'] <= 36), 'Age_new'] = 'C'
    df.loc[(df['Age'] > 36) & (df['Age'] <= 62),
           'Age_new'] = 'D'
    df.loc[df['Age'] > 62, 'Age_new'] = 'E'
    df_age = pandas.get_dummies(df.Age_new, prefix='Age')
    df = pandas.concat([df, df_age], axis=1)
    df = df.drop(['Age_new'], axis=1)

    # Scale and persist the finished feature set.
    df = scaleColumns(config, df)
    df.to_csv(config.feature_file, index=False)
    return None
from data.read_data import DataSet

# Pre-computed cross-validation splits (list of splitters, one per run).
with open('split_hf.pickle', 'rb') as f:
    sssList = pickle.load(f)

# Drug/medication table; GBK-encoded CSV (Chinese-language source data).
med_data = pd.read_csv('follow_up_data_2_1y_2_drug_8.csv', encoding='gbk')
med_data = med_data.values
# Treatment indicator taken from the last column, as a column vector.
t = med_data[:, -1]
t = t.reshape(-1, 1)

# Patient features: drop the leading ID column, keep columns 1..105,
# and impute missing entries.
patient_data = pd.read_csv('follow_up_data_2_1y_2_feature_drug.csv',
                           encoding='gbk')
patient_data = patient_data.values
patient_data = patient_data[:, 1:106]
patient_data = fi.IterativeImputer().fit_transform(patient_data)
#patient_data = minmax_scale(patient_data, feature_range=(0, 1))
x = patient_data

# Outcome labels from column 1, reshaped to a column vector.
outcome = pd.read_csv('follow_up_data_2_1y_2_outcome.csv', encoding='gbk')
outcome = outcome.values
outcome = outcome[:, 1]
y = outcome.reshape(-1, 1)

# Metric accumulators: 10 outer runs x 20 inner settings
# (presumably hyperparameter candidates -- confirm against loop body).
Acc_tol = np.zeros(shape=(10, 20))
precision_tol = np.zeros(shape=(10, 20))
recall_tol = np.zeros(shape=(10, 20))
f1_tol = np.zeros(shape=(10, 20))

# NOTE(review): the body of this loop continues beyond this chunk.
for i in range(0, 10):  #hyperparameter tuning
    print("iteration number: %d" % i)
    sss = sssList[i]