Code example #1
import pandas as pd
from imblearn.over_sampling import SMOTE


def smote(df, log=False):
    # `Timer` is a project-specific logging helper.
    if log:
        section_timer = Timer(log="oversampling using SMOTE")
    if "_merge" in df.columns:
        df = df.drop("_merge", axis=1)

    # Map the two target values onto 0/1 so SMOTE sees a clean binary label.
    target_values = sorted(pd.unique(df["TARGET"]).tolist())
    false_number, true_number = target_values
    df = df.replace(to_replace={false_number: 0, true_number: 1})

    # fit_resample returns (X_resampled, y_resampled); reattach the label.
    features, labels = SMOTE(n_jobs=4).fit_resample(df.drop(columns="TARGET"),
                                                    df["TARGET"])
    df = features
    df["TARGET"] = labels

    if log:
        section_timer.end_timer(log=f"for a total shape of {df.shape}")

    return df
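
# A minimal usage sketch with synthetic data (hypothetical, for illustration
# only). SMOTE's default k_neighbors=5 needs at least 6 minority samples.
import numpy as np

rng = np.random.default_rng(0)
example_df = pd.DataFrame(rng.normal(size=(40, 3)), columns=["f1", "f2", "f3"])
example_df["TARGET"] = [1] * 8 + [0] * 32        # imbalanced binary target
balanced = smote(example_df)
print(balanced["TARGET"].value_counts())         # classes are now 32/32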
#print(sorted(Counter(temp_labels).items()))
# Oversample the minority class with SMOTE, then reattach the label column.
temp_features, temp_labels = SMOTE().fit_resample(
    temp_features.loc[:,
                      temp_features.columns != 'Longterm_TransplantOutcome'],
    temp_features['Longterm_TransplantOutcome'])
print(sorted(Counter(temp_labels).items()))
temp_features = temp_features.join(
    pd.DataFrame(temp_labels, columns=['Longterm_TransplantOutcome']))

events = temp_features['Longterm_TransplantOutcome'].astype(bool)
# Iterate over a copy of the column index: dropping columns while iterating
# the live Index would skip entries.
for col in list(temp_features.columns):
    if (temp_features.loc[events, col].var() == 0.0
            or temp_features.loc[~events, col].var()
            == 0.0) and col != 'Longterm_TransplantOutcome':
        print('Dropped column ' + col + ' (no variance)')
        temp_features.drop([col], axis=1, inplace=True)

feature_list = temp_features.columns
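
# For comparison: sklearn offers VarianceThreshold, but it checks variance
# over the whole column rather than per outcome class as done above
# (a sketch, assuming purely numeric feature columns):
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0.0)  # drops constant columns only
# selector.fit_transform(temp_features.drop(columns=['Longterm_TransplantOutcome']))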

# Settings for the DeepSurv hyper-parameter search: log directory, JSON box
# constraints, number of evaluations, optimiser, epochs and CV folds.
logdir = r'T:\tbase\logs'
box = r'T:\git\tbase\DeepSurvHyperParamBoxConstraints.json'
num_evals = 5
update_fn = 'sgd'
num_epochs = 10
num_folds = 5

x, ytemp = dataframe_to_deepsurv_ds(temp_features,
                                    event_col='Longterm_TransplantOutcome',
                                    time_col='tenure')
strata = None
ya = np.array(ytemp['e'])
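
# `dataframe_to_deepsurv_ds` is a project-specific helper that is not shown
# here. A plausible minimal sketch, assuming DeepSurv's usual dataset layout
# (float32 covariate matrix plus event/time vectors); the name, dtypes and
# return shape are assumptions, not the author's actual implementation:
def dataframe_to_deepsurv_ds_sketch(df, event_col, time_col):
    x = df.drop(columns=[event_col, time_col]).values.astype("float32")
    return x, {"e": df[event_col].values.astype("int32"),
               "t": df[time_col].values.astype("float32")}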
Code example #3
def regression_logistique_OverSamp(nbiter, train_proportion, centered=False):
    if not centered:
        data = pd.read_csv("dataframe_regression.csv")
    else:
        data = pd.read_csv("dataframe_regression_centre.csv")
    data = data.sort_values(["passe_id",
                             "receveur_potentiel"]).reset_index(drop=True)
    # Drop the stray CSV index column (the original sliced it away as the
    # first column) and the sender-side columns not used as features.
    data = data.drop(columns=["Unnamed: 0",
                              "sender_id",
                              "premiere_distance_sender",
                              "seconde_distance_sender"])

    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from imblearn.over_sampling import SMOTE
    from numpy.random import choice  # `choice` was used below without import

    #scaler = StandardScaler()
    method = LogisticRegression(penalty="l1", C=3.5, solver='saga')

    n_passes = 10039  # total passes; each has 14 candidate receivers

    matrice_coef = np.zeros((nbiter, 9))  # 9 features remain after the drops
    liste_scores = np.zeros(nbiter)

    table = []

    for niter in range(nbiter):

        liste_passes_train = choice(range(1, n_passes + 1),
                                    size=int(train_proportion * n_passes),
                                    replace=False)
        liste_passes_train.sort()

        n_passes_train = int(train_proportion * n_passes)
        n_passes_test = n_passes - n_passes_train
        #data = scaler.fit_transform (data)
        train_mask = data["passe_id"].isin(liste_passes_train)
        X_train = data[train_mask]
        y_train = data[train_mask]["passe"]
        X_test = data[~train_mask]
        y_test = data[~train_mask]["passe"]
        #X_train, X_test, y_train, y_test = train_test_split(data, Y, test_size=0.2, random_state=37)
        X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
        #print(sorted(Counter(y_resampled).items()))
        vect_receveur_potentiel = X_test["receveur_potentiel"]
        vect_vrai_receveur = X_test["receiver_id"]

        # Drop the target and identifier columns before fitting.
        X_resampled = X_resampled.drop(
            columns=["passe", "passe_id", "receveur_potentiel", "receiver_id"])

        X_test = X_test.drop(
            columns=["passe", "passe_id", "receveur_potentiel", "receiver_id"])

        method = method.fit(X_resampled, y_resampled)
        proba = method.predict_proba(X_test)
        pred = method.predict(X_test)
        score = method.score(X_test, y_test)
        coef = method.coef_

        table += [pd.crosstab(pred, y_test)]

        matrice_coef[niter, :] = coef

        result = proba[:, 1]

        # Collect in prediction_indice the row index of the test dataframe
        # holding the max probability within each block of 14 candidates.
        prediction_indice = np.zeros(n_passes_test)
        for i in range(n_passes_test):
            prediction_indice[i] = int(
                np.argmax(result[i * 14:(i + 1) * 14]) + (i * 14))

        X_test["receveur_potentiel"] = vect_receveur_potentiel

        # Collect in prediction the potential receiver most likely to
        # actually receive the pass.
        prediction = np.zeros(n_passes_test)
        count = 0
        for i in prediction_indice:
            prediction[count] = X_test.iloc[int(i)]["receveur_potentiel"]
            count += 1

        # The true receiver is repeated on each of the 14 candidate rows,
        # so keep one value per pass.
        verif = np.asarray(vect_vrai_receveur)[::14]

        # Success rate: the predicted receiver matches the true one
        # (candidate slots repeat every 14 rows, hence the modulo comparison).
        taux_reussite = np.mean((verif - prediction) % 14 == 0)

        liste_scores[niter] = taux_reussite

    # Average the coefficients over all iterations (done once, after the loop).
    moyenne_matrice_coef = matrice_coef.mean(axis=0)

    # Average the confusion tables over all iterations instead of
    # hard-coding ten of them.
    moyenne_table = sum(table) / nbiter

    return liste_scores, np.mean(
        liste_scores), matrice_coef, moyenne_matrice_coef, proba, moyenne_table
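
# Usage sketch, assuming the CSVs read inside the function are available:
if __name__ == "__main__":
    (scores, mean_score, coefs, mean_coefs,
     proba, mean_table) = regression_logistique_OverSamp(
        nbiter=10, train_proportion=0.8)
    print("mean top-1 receiver accuracy:", mean_score)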
# =============================================================================
from collections import Counter
from imblearn.over_sampling import SMOTE
#print(sorted(Counter(train_labels).items()))
train_features, train_labels = SMOTE().fit_resample(
    train_features.loc[:, train_features.columns != 'Longterm_TransplantOutcome'],
    train_features['Longterm_TransplantOutcome'])
print(sorted(Counter(train_labels).items()))
train_features = train_features.join(
    pd.DataFrame(train_labels, columns=['Longterm_TransplantOutcome']))

# =============================================================================
# # Drop features with no variance
# =============================================================================
events = train_features['Longterm_TransplantOutcome'].astype(bool)
# Iterate over a copy of the column index; dropping columns while iterating
# the live Index would skip entries.
for col in list(train_features.columns):
    if ((train_features.loc[events, col].var() == 0.0
         or train_features.loc[~events, col].var() == 0.0)
            and col != 'Longterm_TransplantOutcome'):
        print('Dropped column ' + col + ' (no variance)')
        train_features.drop([col], axis=1, inplace=True)
        test_features.drop([col], axis=1, inplace=True)

feature_list = train_features.columns
# =============================================================================
#Feature importance selection
# =============================================================================
#def fit_and_score_features2(X):

#X = train_features.copy()
#y=X[["Longterm_TransplantOutcome","tenure"]]
#   
#X.drop(["tenure", "Longterm_TransplantOutcome"], axis=1, inplace=True)
#n_features = X.shape[1]
#scores = {'test':0}
#m = CoxPHFitter(penalizer=0.1) 
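
# A hedged completion of the commented-out idea above: score each feature
# with a univariate Cox model and record its concordance index. Assumes
# lifelines' CoxPHFitter; the function name mirrors the commented stub.
def fit_and_score_features2(X):
    from lifelines import CoxPHFitter
    y_cols = ['Longterm_TransplantOutcome', 'tenure']
    scores = {}
    for feat in X.columns.drop(y_cols):
        m = CoxPHFitter(penalizer=0.1)
        m.fit(X[[feat] + y_cols], 'tenure',
              event_col='Longterm_TransplantOutcome')
        scores[feat] = m.concordance_index_
    return scores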
def create_model(temp_features, current_cluster, use_cluster_as_feature):
    # Visual separator between successive cluster runs.
    for _ in range(5):
        print('-' * 124)
    
    # =============================================================================
    # # Separate labels and drop identifier columns before modelling
    # =============================================================================
    temp_labels = np.array(temp_features['Longterm_TransplantOutcome'])
    temp_features = temp_features.drop('TransplantationID', axis=1)
    temp_features = temp_features.drop('PatientID', axis=1)
    if use_cluster_as_feature:
        temp_features = pd.get_dummies(data=temp_features, columns=['cluster'])
        print('Creating model for all clusters with cluster as feature')
    else:
        temp_features = temp_features.drop('cluster', axis=1)
        print('Creating model for cluster ' + str(current_cluster))
    #for col in temp_features.columns:
    #    print(col)
    # =============================================================================
    # #Spliting datasets into train and test sets
    # =============================================================================
    from sklearn.model_selection import train_test_split
    train_features, test_features, train_labels, test_labels = train_test_split(
        temp_features, temp_labels, test_size=0.25, random_state=42)
    
    # =============================================================================
    # #SMOTE for upsampling
    # =============================================================================
    from imblearn.over_sampling import SMOTE
    train_features, train_labels = SMOTE().fit_resample(train_features, train_labels)
    
    
    # =============================================================================
    # # Drop features with no variance
    # =============================================================================
    events = train_labels.astype(bool)
    # Iterate over a copy of the column index; dropping while iterating
    # the live Index would skip entries.
    for col in list(train_features.columns):
        if ((train_features.loc[events, col].var() == 0.0
             or train_features.loc[~events, col].var() == 0.0)
                and col != 'Longterm_TransplantOutcome'):
            #print('Dropped column ' + col + ' (no variance)')
            train_features.drop([col], axis=1, inplace=True)
            test_features.drop([col], axis=1, inplace=True)

    # =============================================================================
    # #Cox Regression model
    # =============================================================================
    
    from lifelines import CoxPHFitter
    cph = CoxPHFitter(penalizer=0.1)  # instantiate the Cox proportional hazards model
    cph.fit(train_features, 'tenure', event_col='Longterm_TransplantOutcome',
            show_progress=False, step_size=0.1)  # fit on the training data
    
    print('concordance index: ' + str(cph.concordance_index_))
    
    tr_rows = test_features.loc[:, test_features.columns != 'Longterm_TransplantOutcome']
    predictions = cph.predict_survival_function(tr_rows)
    predictions = predictions.transpose()
    
    # =============================================================================
    # #Error analysis
    # =============================================================================
    # Use the first predicted time point beyond the six-year horizon.
    for col in predictions.columns:
        if float(col) > (365 * 6):
            col_use = col
            print(col_use)
            break

    predictions = predictions[col_use]
    predictions = predictions.to_frame(name='prediction')
    
    # Survival probability above 0.5 at that horizon means "no event";
    # invert so that 1 marks a predicted event.
    predictions = (predictions['prediction'] <= 0.5).astype(int).to_frame('prediction')
    
    labels = pd.DataFrame(test_labels, columns=['label'])
    
    predictions.reset_index(drop=True, inplace=True)
    labels.reset_index(drop=True, inplace=True)

    # =============================================================================
    # #Confusion matrix
    # =============================================================================
    from sklearn.metrics import confusion_matrix
    conf_mat = confusion_matrix(labels, predictions)
    print(conf_mat)
    import seaborn
    seaborn.heatmap(conf_mat)
    
    labels_desc = [1, 0]
    cm = confusion_matrix(predictions, labels, labels=labels_desc)
    print_cm(cm, labels_desc)  # print_cm: project-specific pretty-printer
    
    # =============================================================================
    # #Precision, Recall, F1-Score
    # =============================================================================
    from sklearn.metrics import classification_report
    print(classification_report(labels, predictions, digits=2))
    
    # =============================================================================
    # #ROC curve
    # =============================================================================
    import sklearn.metrics as metrics
    fpr, tpr, threshold = metrics.roc_curve(labels, predictions)
    roc_auc = metrics.auc(fpr, tpr)
    import matplotlib.pyplot as plt
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
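
# Usage sketch (hypothetical call; expects the columns referenced above,
# including 'cluster', 'TransplantationID', 'PatientID' and 'tenure'):
# create_model(temp_features.copy(), current_cluster=0,
#              use_cluster_as_feature=True)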