import pandas as pd
from imblearn.over_sampling import SMOTE


def smote(df, log=False):
    """Oversample the minority class of the binary TARGET column with SMOTE."""
    if log:
        section_timer = Timer(log="oversampling using SMOTE")  # project timing helper

    # Drop the indicator column left behind by a pandas merge, if present.
    if "_merge" in df.columns:
        df = df.drop("_merge", axis=1)

    # Recode the two target values as 0 (negative class) and 1 (positive class).
    target_values = sorted(pd.unique(df["TARGET"]).tolist())
    false_number, true_number = target_values
    df = df.replace(to_replace={false_number: 0, true_number: 1})

    # fit_resample returns features and labels separately; reattach the labels
    # explicitly rather than relying on left-to-right tuple-unpacking order.
    features, labels = SMOTE(n_jobs=4).fit_resample(df.drop(columns="TARGET"), df["TARGET"])
    df = features
    df["TARGET"] = labels

    if log:
        section_timer.end_timer(log=f"for a total shape of {df.shape}")
    return df
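# A minimal usage sketch for smote(); the toy frame and feature names below are
# illustrative, not part of the original pipeline.
if __name__ == "__main__":
    toy = pd.DataFrame({
        "feat_a": list(range(100)),
        "feat_b": [v * 0.5 for v in range(100)],
        "TARGET": [0] * 90 + [1] * 10,  # imbalanced binary target
    })
    balanced = smote(toy)
    print(balanced["TARGET"].value_counts())  # both classes now have 90 rows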
# print(sorted(Counter(temp_labels).items()))
temp_features, temp_labels = SMOTE().fit_resample(
    temp_features.loc[:, temp_features.columns != 'Longterm_TransplantOutcome'],
    temp_features['Longterm_TransplantOutcome'])
print(sorted(Counter(temp_labels).items()))
temp_features = temp_features.join(
    pd.DataFrame(temp_labels, columns=['Longterm_TransplantOutcome']))

# Drop features with zero variance in either the event or the censored group;
# such columns carry no information and can destabilise model fitting.
events = temp_features['Longterm_TransplantOutcome'].astype(bool)
for col in temp_features.columns:
    if (temp_features.loc[events, col].var() == 0.0
            or temp_features.loc[~events, col].var() == 0.0) \
            and col != 'Longterm_TransplantOutcome':
        print('Dropped column ' + col + ' (no variance)')
        temp_features.drop([col], axis=1, inplace=True)
feature_list = temp_features.columns

# DeepSurv hyperparameter-search configuration.
logdir = r'T:\tbase\logs'
box = r'T:\git\tbase\DeepSurvHyperParamBoxConstraints.json'
num_evals = 5
update_fn = 'sgd'
num_epochs = 10
num_folds = 5

x, ytemp = dataframe_to_deepsurv_ds(temp_features,
                                    event_col='Longterm_TransplantOutcome',
                                    time_col='tenure')
strata = None
ya = np.array(ytemp['e'])
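# The helper dataframe_to_deepsurv_ds used above is project-specific and not
# shown in this file. Below is a plausible sketch, assuming DeepSurv's
# documented dataset layout (a feature matrix 'x' plus durations 't' and event
# flags 'e'); this is an assumption about the helper, not its actual source.
def dataframe_to_deepsurv_ds_sketch(df, event_col='Longterm_TransplantOutcome',
                                    time_col='tenure'):
    x = df.drop(columns=[event_col, time_col]).values.astype(np.float32)
    y = {
        't': df[time_col].values.astype(np.float32),  # survival time
        'e': df[event_col].values.astype(np.int32),   # event indicator
    }
    return x, y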
def regression_logistique_OverSamp(nbiter, train_proportion, centered=False):
    """L1-penalised logistic regression with SMOTE oversampling, repeated nbiter times."""
    import numpy as np
    import pandas as pd
    from numpy.random import choice
    from sklearn.linear_model import LogisticRegression
    from imblearn.over_sampling import SMOTE

    if not centered:
        data = pd.read_csv("dataframe_regression.csv")
    else:
        data = pd.read_csv("dataframe_regression_centre.csv")
    data = data.sort_values(["passe_id", "receveur_potentiel"]).reset_index(drop=True)

    # Drop the CSV index column and identifier/leakage columns.
    data = data.drop(["Unnamed: 0"], axis=1)
    data = data.drop(["sender_id"], axis=1)
    data = data.drop(["premiere_distance_sender"], axis=1)
    data = data.drop(["seconde_distance_sender"], axis=1)

    # scaler = StandardScaler()
    method = LogisticRegression(penalty="l1", C=3.5, solver='saga')
    n_passes = 10039
    matrice_coef = np.zeros((nbiter, 9))  # 9 features remain after the drops
    liste_scores = np.zeros(nbiter)
    table = []

    for niter in range(nbiter):
        # Sample a random subset of passes for training; the rest is the test set.
        liste_passes_train = choice(range(1, n_passes + 1),
                                    size=int(train_proportion * n_passes),
                                    replace=False)
        liste_passes_train.sort()
        n_passes_train = int(train_proportion * n_passes)
        n_passes_test = n_passes - n_passes_train

        # data = scaler.fit_transform(data)
        X_train = data[data["passe_id"].isin(liste_passes_train)]
        y_train = X_train["passe"]
        X_test = data[~data["passe_id"].isin(liste_passes_train)]
        y_test = X_test["passe"]
        # X_train, X_test, y_train, y_test = train_test_split(data, Y, test_size=0.2, random_state=37)

        # Oversample the minority class (completed passes) in the training set.
        X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
        # print(sorted(Counter(y_resampled).items()))

        vect_receveur_potentiel = X_test["receveur_potentiel"]
        vect_vrai_receveur = X_test["receiver_id"]
        drop_cols = ["passe", "passe_id", "receveur_potentiel", "receiver_id"]
        X_resampled = X_resampled.drop(drop_cols, axis=1)
        X_test = X_test.drop(drop_cols, axis=1)

        method = method.fit(X_resampled, y_resampled)
        proba = method.predict_proba(X_test)
        pred = method.predict(X_test)
        score = method.score(X_test, y_test)
        coef = method.coef_
        table += [pd.crosstab(pred, y_test)]
        matrice_coef[niter, :] = coef
        result = proba[:, 1]

        # Store in prediction_indice the row index of the test dataframe with
        # the highest predicted probability for each pass (14 candidates each).
        prediction_indice = np.zeros(n_passes_test)
        for i in range(n_passes_test):
            prediction_indice[i] = int(np.argmax(result[i * 14:(i + 1) * 14]) + (i * 14))
        X_test["receveur_potentiel"] = vect_receveur_potentiel

        # Store in prediction the candidate receiver most likely to get the pass.
        prediction = np.zeros(n_passes_test)
        count = 0
        for i in prediction_indice:
            prediction[count] = X_test.iloc[int(i)]["receveur_potentiel"]
            count += 1

        verif = np.zeros(len(prediction))
        for i in range(len(prediction)):
            verif[i] = np.array(vect_vrai_receveur)[i * 14]
        # A hit when predicted and true receiver match modulo the 14 candidates.
        taux_reussite = np.mean((verif - prediction) % 14 == 0)
        liste_scores[niter] = taux_reussite

    moyenne_matrice_coef = matrice_coef.mean(axis=0)
    moyenne_table = sum(table) / nbiter  # average the confusion tables over all iterations
    return (liste_scores, np.mean(liste_scores), matrice_coef,
            moyenne_matrice_coef, proba, moyenne_table)
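# Illustrative call, assuming the two CSV files referenced above are present:
# ten SMOTE/refit iterations with 80% of passes used for training.
scores, mean_score, coefs, mean_coefs, last_proba, mean_table = \
    regression_logistique_OverSamp(nbiter=10, train_proportion=0.8)
print("mean top-1 receiver accuracy:", mean_score)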
# =============================================================================
from collections import Counter
from imblearn.over_sampling import SMOTE

# print(sorted(Counter(train_labels).items()))
train_features, train_labels = SMOTE().fit_resample(
    train_features.loc[:, train_features.columns != 'Longterm_TransplantOutcome'],
    train_features['Longterm_TransplantOutcome'])
print(sorted(Counter(train_labels).items()))
train_features = train_features.join(
    pd.DataFrame(train_labels, columns=['Longterm_TransplantOutcome']))

# =============================================================================
# # Drop features with no variance
# =============================================================================
events = train_features['Longterm_TransplantOutcome'].astype(bool)
for col in train_features.columns:
    if (train_features.loc[events, col].var() == 0.0
            or train_features.loc[~events, col].var() == 0.0) \
            and col != 'Longterm_TransplantOutcome':
        print('Dropped column ' + col + ' (no variance)')
        train_features.drop([col], axis=1, inplace=True)
        test_features.drop([col], axis=1, inplace=True)
feature_list = train_features.columns

# =============================================================================
# # Feature importance selection
# =============================================================================
# def fit_and_score_features2(X):
#     X = train_features.copy()
#     y = X[["Longterm_TransplantOutcome", "tenure"]]
#
#     X.drop(["tenure", "Longterm_TransplantOutcome"], axis=1, inplace=True)
#     n_features = X.shape[1]
#     scores = {'test': 0}
#     m = CoxPHFitter(penalizer=0.1)
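# One way to finish the univariate Cox screen sketched above, using lifelines'
# CoxPHFitter. The body below is an assumption built from the commented-out
# scaffolding, not the original implementation:
def fit_and_score_features2(X):
    from lifelines import CoxPHFitter
    y_cols = ['tenure', 'Longterm_TransplantOutcome']
    scores = {}
    for col in X.columns.drop(y_cols):
        # Fit a single-feature Cox model and record its concordance index.
        m = CoxPHFitter(penalizer=0.1)
        m.fit(X[[col] + y_cols], duration_col='tenure',
              event_col='Longterm_TransplantOutcome')
        scores[col] = m.concordance_index_
    return scores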
def create_model(temp_features, current_cluster, use_cluster_as_feature):
    # Visual separator between runs in the console output.
    for _ in range(5):
        print('-' * 124)

    # =============================================================================
    # # Keep TransplantationID in test data for error analysis
    # =============================================================================
    temp_labels = np.array(temp_features['Longterm_TransplantOutcome'])
    temp_features = temp_features.drop('TransplantationID', axis=1)
    temp_features = temp_features.drop('PatientID', axis=1)
    if use_cluster_as_feature:
        temp_features = pd.get_dummies(data=temp_features, columns=['cluster'])
        print('Creating model for all clusters with cluster as feature')
    else:
        temp_features = temp_features.drop('cluster', axis=1)
        print('Creating model for cluster ' + str(current_cluster))
    # for col in temp_features.columns:
    #     print(col)

    # =============================================================================
    # # Splitting the dataset into train and test sets
    # =============================================================================
    from sklearn.model_selection import train_test_split
    train_features, test_features, train_labels, test_labels = train_test_split(
        temp_features, temp_labels, test_size=0.25, random_state=42)

    # =============================================================================
    # # SMOTE for upsampling
    # =============================================================================
    from imblearn.over_sampling import SMOTE
    train_features, train_labels = SMOTE().fit_resample(train_features, train_labels)

    # =============================================================================
    # # Drop features with no variance
    # =============================================================================
    events = train_labels.astype(bool)
    for col in train_features.columns:
        if (train_features.loc[events, col].var() == 0.0
                or train_features.loc[~events, col].var() == 0.0) \
                and col != 'Longterm_TransplantOutcome':
            # print('Dropped column ' + col + ' (no variance)')
            train_features.drop([col], axis=1, inplace=True)
            test_features.drop([col], axis=1, inplace=True)

    # =============================================================================
    # # Cox regression model
    # =============================================================================
    cph = CoxPHFitter(penalizer=0.1)  # instantiate the fitter
    cph.fit(train_features, 'tenure', event_col='Longterm_TransplantOutcome',
            show_progress=False, step_size=0.1)  # fit on the training data
    print('concordance index: ' + str(cph.concordance_index_))
    tr_rows = test_features.loc[:, test_features.columns != 'Longterm_TransplantOutcome']
    predictions = cph.predict_survival_function(tr_rows)
    predictions = predictions.transpose()

    # =============================================================================
    # # Error analysis
    # =============================================================================
    # Pick the first time point beyond six years as the evaluation horizon.
    for col in predictions.columns:
        if float(col) > (365 * 6):
            col_use = col
            print(col_use)
            break
    predictions = predictions[col_use]
    predictions = predictions.to_frame(name='prediction')
    # Survival probability > 0.5 at the horizon means "no event"; invert so
    # that 1 marks a predicted event, matching the label encoding.
    predictions.loc[predictions['prediction'] > 0.5, ['prediction']] = 1
    predictions.loc[predictions['prediction'] <= 0.5, ['prediction']] = 0
    predictions = (~predictions.astype(bool)).astype(int)
    labels = pd.DataFrame(test_labels, columns=['label'])
    predictions.reset_index(drop=True, inplace=True)
    labels.reset_index(drop=True, inplace=True)

    # =============================================================================
    # # Confusion matrix
    # =============================================================================
    from sklearn.metrics import confusion_matrix
    conf_mat = confusion_matrix(labels, predictions)
    print(conf_mat)
    import seaborn
    seaborn.heatmap(conf_mat)
    labels_desc = [1, 0]
    # Keep the (true, predicted) argument order expected by confusion_matrix.
    cm = confusion_matrix(labels, predictions, labels=labels_desc)
    print_cm(cm, labels_desc)  # project helper for pretty-printing

    # =============================================================================
    # # Precision, recall, F1 score
    # =============================================================================
    from sklearn.metrics import classification_report
    print(classification_report(labels, predictions, digits=2))

    # =============================================================================
    # # ROC curve
    # =============================================================================
    import sklearn.metrics as metrics
    fpr, tpr, threshold = metrics.roc_curve(labels, predictions)
    roc_auc = metrics.auc(fpr, tpr)
    import matplotlib.pyplot as plt
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
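# A hedged usage sketch for create_model: build one model per cluster, then a
# pooled model with the cluster id one-hot encoded as a feature. Assumes
# temp_features carries the 'cluster', 'tenure', id and outcome columns used
# inside the function; this driver loop is illustrative, not the original code.
for k in sorted(temp_features['cluster'].unique()):
    create_model(temp_features[temp_features['cluster'] == k].copy(),
                 current_cluster=k, use_cluster_as_feature=False)
create_model(temp_features.copy(), current_cluster=None, use_cluster_as_feature=True)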