Example #1
def predict_fake_input(model, task, title):

    generated = data_handler.load_fake_input(task)
    print('Number of generated conditions: ', generated.shape)

    if (task == 0):
        # classification: keep the positive-class probability
        pred = model.predict_proba(generated)
        final_state = pd.Series(pred[:, 1], name='Pred_Result')
    elif (task == 1):
        # regression: keep the predicted value directly
        pred = model.predict(generated)
        final_state = pd.Series(pred, name='Pred_Result')
    else:
        raise ValueError('Error: invalid task spec!')

    result = pd.concat([generated, final_state], axis=1)
    data_handler.save_csv(result, title + 'pred_fake_input')
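A minimal usage sketch (hypothetical: `clf`, `X_train`, and `Y_train` are placeholders for a fitted model and its training data; `data_handler` is the project's own I/O module):

# hypothetical usage: task 0 expects a classifier exposing predict_proba()
clf = XGBClassifier().fit(X_train, Y_train)
predict_fake_input(clf, task=0, title='mos2_')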
Example #2
def extract_feature_importance(model, X, title):

    print('Feature importance...')
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)

    shap.summary_plot(shap_values, feature_names=X.columns, plot_type="bar")

    # normalize: each feature's share of the total |SHAP| mass
    sum_col = np.abs(shap_values).sum(axis=0)
    imp = np.array(sum_col / sum_col.sum())

    ind = np.argsort(imp)[::-1]
    sorted_imp = imp[ind]
    sorted_feature = X.columns[ind]

    feature_imp_sorted = pd.DataFrame([sorted_imp], columns=sorted_feature)

    print(feature_imp_sorted)
    data_handler.save_csv(feature_imp_sorted,
                          title=title + 'feature_imp_sorted')
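A usage sketch under the same assumptions (hypothetical names: `clf` is a fitted tree-based model, `X_df` the feature DataFrame):

# hypothetical usage: works with any model shap.TreeExplainer supports
extract_feature_importance(clf, X_df, title='mos2_')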
def plot_learning_curve_versus_tr_set_size(title='',
                                           save_csv=True,
                                           scoring='roc_auc'):
    # Nested cross-validation: the 5-fold outer loop holds out 20% of the data
    # as a validation set each time, while the inner 5-fold grid search tunes
    # the XGBoost hyperparameters.
    X, Y = data_handler.load_XY()

    _ylabel = 'Mean AUROC'
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=6)
    # random_state only takes effect when shuffle=True
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
    xgb_clf = XGBClassifier(objective="binary:logistic",
                            min_child_weight=1,
                            tree_method='exact',
                            verbosity=0,  # 'silent' was removed in newer XGBoost
                            n_jobs=1,
                            random_state=3,
                            seed=3)
    tuned_parameters = dict(learning_rate=[0.01, 0.1],
                            n_estimators=[100, 300, 500],
                            colsample_bylevel=[0.5, 0.7, 0.9],
                            gamma=[0, 0.2, 0.4],
                            max_depth=[3, 5, 7],
                            reg_lambda=[0.1, 1, 10],
                            subsample=[0.4, 0.7, 1])
    xgb_cv = GridSearchCV(xgb_clf,
                          tuned_parameters,
                          cv=inner_cv,
                          scoring=scoring,
                          verbose=0,
                          n_jobs=1)

    n_jobs = 4
    train_sizes = np.linspace(.2, 1.0, 5)

    # create learning curve values
    train_sizes, train_scores, test_scores = learning_curve(
        xgb_cv,
        X,
        Y,
        cv=outer_cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        scoring=scoring)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    tr_size_df = pd.Series(train_sizes, name='training_set_size')
    tr_sc_m_df = pd.Series(train_scores_mean, name='training_score_mean')
    cv_sc_m_df = pd.Series(test_scores_mean, name='cv_score_mean')
    tr_sc_std_df = pd.Series(train_scores_std, name='training_score_std')
    cv_sc_std_df = pd.Series(test_scores_std, name='cv_score_std')

    if (save_csv):
        res = pd.concat(
            [tr_size_df, tr_sc_m_df, cv_sc_m_df, tr_sc_std_df, cv_sc_std_df],
            axis=1)
        data_handler.save_csv(data=res, title=title + '_learning_curve')

    # plotting
    _ylim = (0.5, 1.01)

    fig = plt.figure(figsize=(12, 12 / 1.618))
    ax1 = fig.add_subplot(111)

    ax1.set_ylim(_ylim)
    ax1.set_xlabel("Number of Training Samples")
    ax1.set_ylabel(_ylabel)
    plt.grid(False)

    ax1.plot(tr_size_df, tr_sc_m_df, 'o-', color="r", label="Training")
    ax1.plot(tr_size_df,
             cv_sc_m_df,
             '^--',
             color="b",
             label="Cross-Validation")

    plt.setp(ax1.spines.values(), color='black')
    plt.legend(loc="lower right")

    plt.show()
    to_path = data_handler.format_title(to_dir, title + '_learning_curve',
                                        '.png')
    fig.savefig(to_path, dpi=1000, bbox_inches="tight", pad_inches=0)

    return to_path
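A usage sketch (assumes `data_handler`, `to_dir`, and the plotting imports are in scope, as in the rest of this module):

# hypothetical usage: saves '<title>_learning_curve.png' and returns its path
png_path = plot_learning_curve_versus_tr_set_size(title='mos2', save_csv=True)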
def plot_learning_curve_versus_tr_epoch(title='',
                                        ntrials=1,
                                        nfolds=10,
                                        save_csv=False,
                                        verbose=True,
                                        save_fig=False):
    X_df, Y_df = data_handler.load_XY()
    X = X_df.values
    Y = Y_df.values

    _ylabel = 'Mean AUROC'
    n_jobs = 4

    # cross-validation setup
    Ntrials = ntrials
    outer_nsplit = nfolds
    tot_count = Ntrials * outer_nsplit

    # results store: one row per (trial, fold), one column per boosting epoch
    n_epochs = 500  # must match n_estimators below
    train_mat = np.zeros((tot_count, n_epochs))
    test_mat = np.zeros((tot_count, n_epochs))

    for i in range(Ntrials):
        init_time = time.time()
        print("trial = ", i)
        train_index = []
        test_index = []

        outer_cv = StratifiedKFold(n_splits=outer_nsplit,
                                   shuffle=True,
                                   random_state=i)
        for train_ind, test_ind in outer_cv.split(X, Y):
            train_index.append(train_ind.tolist())
            test_index.append(test_ind.tolist())

        for j in range(outer_nsplit):
            count = i * outer_nsplit + j
            print(str(count), "  / ", str(tot_count))
            X_train = X[train_index[j]]
            Y_train = Y[train_index[j]]

            X_test = X[test_index[j]]
            Y_test = Y[test_index[j]]

            eval_sets = [(X_train, Y_train), (X_test, Y_test)]

            clf = XGBClassifier(objective="binary:logistic",
                                min_child_weight=1,
                                tree_method='exact',
                                verbosity=0,  # 'silent' was removed in newer XGBoost
                                n_jobs=4,
                                random_state=3,
                                seed=3,
                                learning_rate=0.01,
                                colsample_bylevel=0.9,
                                colsample_bytree=0.9,
                                n_estimators=n_epochs,
                                gamma=0.8,
                                max_depth=11,
                                reg_lambda=0.8,
                                subsample=0.4)
            clf.fit(X_train,
                    Y_train,
                    eval_metric=['auc'],
                    eval_set=eval_sets,
                    verbose=False)
            results = clf.evals_result()
            epochs = len(results['validation_0']['auc'])

            # record results
            train_mat[count] = results['validation_0']['auc']
            test_mat[count] = results['validation_1']['auc']

            if (verbose):
                print('Iter: %d, epochs: %d' % (count, epochs))
                print('training result: %.4f, testing result: %.4f' %
                      (train_mat[count][-1], test_mat[count][-1]))

        print('total time: %.4f mins' % ((time.time() - init_time) / 60))

    # collect per-epoch results into DataFrames
    epoch_lists = list(range(1, epochs + 1))
    train_results = pd.DataFrame(
        data=train_mat, columns=['epoch_' + str(i) for i in epoch_lists])
    test_results = pd.DataFrame(
        data=test_mat, columns=['epoch_' + str(i) for i in epoch_lists])

    if (save_csv):
        data_handler.save_csv(train_results,
                              title='mos2_learning_curve_train_raw')
        data_handler.save_csv(test_results,
                              title='mos2_learning_curve_test_raw')

    print('end')


    # create learning curve values
    train_scores_mean = np.mean(train_mat, axis=0)
    train_scores_std = np.std(train_mat, axis=0)
    test_scores_mean = np.mean(test_mat, axis=0)
    test_scores_std = np.std(test_mat, axis=0)

    tr_size_df = pd.Series(epoch_lists, name='training_epoch')
    tr_sc_m_df = pd.Series(train_scores_mean, name='training_score_mean')
    val_sc_m_df = pd.Series(test_scores_mean, name='val_score_mean')
    tr_sc_std_df = pd.Series(train_scores_std, name='training_score_std')
    val_sc_std_df = pd.Series(test_scores_std, name='val_score_std')

    if (save_csv):
        res = pd.concat(
            [tr_size_df, tr_sc_m_df, val_sc_m_df, tr_sc_std_df, val_sc_std_df],
            axis=1)
        data_handler.save_csv(data=res, title=title + '_learning_curve')

    # plotting
    _ylim = (0.5, 1.01)

    fig = plt.figure(figsize=(12, 12 / 1.618))
    ax1 = fig.add_subplot(111)

    ax1.set_ylim(_ylim)
    ax1.set_xlabel("Number of Training Epochs")
    ax1.set_ylabel(_ylabel)
    plt.grid(False)

    ax1.plot(tr_size_df, tr_sc_m_df, color="r", label="Training")
    ax1.plot(tr_size_df, val_sc_m_df, color="b", label="Validation")
    # error bars could be drawn with ax1.errorbar(..., yerr=tr_sc_std_df)

    plt.setp(ax1.spines.values(), color='black')
    plt.legend(loc="lower right")

    plt.show()
    to_path = None
    if save_fig:
        to_path = data_handler.format_title(to_dir, title + '_learning_curve',
                                            '.png')
        fig.savefig(to_path, dpi=1000, bbox_inches="tight", pad_inches=0.1)

    return to_path
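A usage sketch; the per-epoch AUC values come from XGBoost's eval_set history, so no extra scoring loop is needed:

# hypothetical usage: plot only, skipping the CSV and PNG outputs
plot_learning_curve_versus_tr_epoch(title='mos2', ntrials=1, nfolds=10)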
def plot_ROC_curve(pipe,
                   tuned_parameters,
                   title='roc_curve',
                   save_csv=True,
                   task=0):
    # cross-validation setup
    Ntrials = 1
    outer_nsplit = 10
    inner_nsplit = 10

    # results store
    Y_true = pd.Series(name='Y_true', dtype=float)
    pred_results = pd.Series(name='pred_prob', dtype=float)

    # load data
    assert (task == 0 or task == 2), 'Error: invalid task spec!'
    X_df, Y_df = data_handler.load_XY(task)
    X = X_df.values
    Y = Y_df.values

    for i in range(Ntrials):

        train_index = []
        test_index = []

        outer_cv = StratifiedKFold(n_splits=outer_nsplit,
                                   shuffle=True,
                                   random_state=i)
        for train_ind, test_ind in outer_cv.split(X, Y):
            train_index.append(train_ind.tolist())
            test_index.append(test_ind.tolist())

        for j in range(outer_nsplit):
            print("progress >> ", j, ' / ', outer_nsplit)
            X_train = X[train_index[j]]
            Y_train = Y[train_index[j]]

            X_test = X[test_index[j]]
            Y_test = Y[test_index[j]]

            # random_state only takes effect when shuffle=True
            inner_cv = StratifiedKFold(n_splits=inner_nsplit,
                                       shuffle=True,
                                       random_state=j)

            clf = GridSearchCV(pipe,
                               tuned_parameters,
                               cv=inner_cv,
                               scoring='roc_auc')
            clf.fit(X_train, Y_train)
            pred = pd.Series(clf.predict_proba(X_test)[:, 1])
            pred_results = pd.concat([pred_results, pred],
                                     axis=0,
                                     ignore_index=True)
            Y_test_df = pd.Series(Y_test, name='Y_test')
            Y_true = pd.concat([Y_true, Y_test_df], axis=0, ignore_index=True)

    # plotting
    fpr, tpr, thresholds = metrics.roc_curve(Y_true, pred_results)
    auc_value = metrics.auc(fpr, tpr)  # same value as metrics.roc_auc_score here

    fig = plt.figure(figsize=(12, 12 / 1.618))
    ax1 = fig.add_subplot(111)

    ticks = np.linspace(0, 1, 6)
    labels = [float("{0:.2f}".format(x)) for x in ticks]

    ax1.set_xticks(labels)
    ax1.set_xticklabels(labels)
    labels[0] = ''
    ax1.set_yticklabels(labels)
    plt.grid(False)

    ax1.plot(fpr,
             tpr,
             lw=2,
             label='ROC curve (area = {:.2f})'.format(auc_value),
             marker='.',
             linestyle='-',
             color='b')
    ax1.plot([0, 1], [0, 1], linestyle='--', color='k')

    ax1.set_xlabel('False Positive Rate')
    ax1.set_ylabel('True Positive Rate')
    ax1.set_xlim(0, 1)
    ax1.set_ylim(0, 1)
    ax1.legend(loc='lower right')

    color = 'black'

    plt.setp(ax1.spines.values(), color=color)
    ax1.yaxis.set_visible(True)
    ax1.xaxis.set_visible(True)
    ax1.yaxis.set_ticks_position('left')
    ax1.xaxis.set_ticks_position('bottom')
    ax1.get_yaxis().set_tick_params(direction='out', width=2)
    plt.show()
    fig.savefig(data_handler.format_title(to_dir, title + '_ROC_curve',
                                          '.png'),
                dpi=1000,
                bbox_inches="tight",
                pad_inches=0)

    # save results to csv if true
    if save_csv:
        data_mat = np.array([fpr, tpr]).T
        ret = pd.DataFrame(data_mat, columns=['fpr', 'tpr'])
        data_handler.save_csv(ret, title + '_ROC_curve')
    return True
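A usage sketch (the estimator and grid shown here are illustrative assumptions; any scikit-learn compatible classifier with predict_proba should work):

# hypothetical usage: nested-CV ROC curve for an XGBoost classifier
pipe = XGBClassifier(objective='binary:logistic', random_state=3)
tuned = dict(max_depth=[3, 5], n_estimators=[100, 300])
plot_ROC_curve(pipe, tuned, title='mos2', save_csv=True, task=0)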
Example #6
def PAM_regression(save_csv=False,
                   verbose=False,
                   to_break=True,
                   title='cqd_PAM_',
                   batch=1):

    ## start PAM-guided synthesis...
    init_time = time.time()
    Nc = 0  # critical point: sample count when the global max is first found

    # construct the initial training set
    results_mat = np.zeros(((totalSamp - init_train_size), 12))

    train_ind = random.sample(all_ind_wo_max, init_train_size)
    test_ind = [x for x in all_ind if x not in train_ind]
    if (verbose):
        print('initial training set indexes', train_ind)

    # set up result storage to compute eval metrics, in the order of PAM
    #  ignore the initial training set, as it is not determined by PAM
    pred_results = np.zeros(totalSamp - init_train_size)
    true_results = np.zeros(totalSamp - init_train_size)

    # set up the hyperparameter range for tuning
    tuned_parameters = dict(learning_rate=[0.01],
                            n_estimators=[300, 500, 700],
                            colsample_bylevel=[0.5, 0.7, 0.9],
                            gamma=[0, 0.2],
                            max_depth=[3, 7, 11],
                            reg_lambda=[0.1, 1, 10],
                            subsample=[0.4, 0.7, 1])

    j = 0
    loop_count = 0
    mean_y_only_init = np.mean(Y[train_ind])
    std_y_only_init = np.std(Y[train_ind])

    while (j < totalSamp - init_train_size):
        inner_cv = KFold(n_splits=inner_nsplits, shuffle=True, random_state=j)
        X_train = X[train_ind]
        Y_train = Y[train_ind]
        X_test = X[test_ind]
        Y_test = Y[test_ind]

        last_max = np.max(Y_train)

        # gradient-boosted regressor ('reg:linear' was renamed 'reg:squarederror')
        reg = xgb.XGBRegressor(objective="reg:squarederror",
                               min_child_weight=1,
                               tree_method='exact',
                               verbosity=0,
                               n_jobs=4,
                               random_state=3,
                               seed=3)

        gb_clf = GridSearchCV(reg,
                              tuned_parameters,
                              cv=inner_cv,
                              scoring='r2',
                              verbose=0,
                              n_jobs=4)
        gb_clf.fit(X_train, Y_train)
        y_pred = gb_clf.predict(X_test)

        # choose the batch of conditions with best predicted yield
        best_pos_ind = np.argsort(-y_pred)[:batch]
        best_prob = y_pred[best_pos_ind]
        next_ind = np.array(test_ind)[best_pos_ind]

        # update results storage
        train_size = len(Y_train)
        temp = list(range(0, len(y_pred)))
        ind_notbest = [x for x in temp if x not in best_pos_ind]

        start_ptr = j
        end_ptr = np.min([start_ptr + batch, totalSamp - init_train_size])
        pred_results[start_ptr:end_ptr] = best_prob
        pred_results[end_ptr:totalSamp - init_train_size] = y_pred[ind_notbest]
        true_results[start_ptr:end_ptr] = Y_test[best_pos_ind]
        true_results[end_ptr:totalSamp - init_train_size] = Y_test[ind_notbest]

        # `test` is a project helper (defined elsewhere) returning the metric
        # list [r2, pearson, p_value, mse, r2_s, pearson_s, p_value_s, mse_s]
        pred_metrics = test(pred_results, true_results, end_ptr - 1)

        # calculate results
        next_best_true_ind = next_ind[np.argmax(Y_test[best_pos_ind])]
        next_best_y_true = np.max(Y_test[best_pos_ind])
        result_list = [
            train_size,
            next_best_true_ind,
            next_best_y_true,
            best_prob[0],
        ] + pred_metrics
        results_mat[loop_count, :] = np.array(result_list)

        loop_count = loop_count + 1
        j = j + batch

        if (verbose):
            print(loop_count, '->', j, ', best_next_ind=', next_best_true_ind,
                  ' best_Y_true=', "{0:.6f}".format(next_best_y_true),
                  ' train_max=', "{0:.6f}".format(last_max), ' r2=',
                  pred_metrics[0])

        train_ind = [*train_ind, *next_ind]
        test_ind = [x for x in test_ind if x not in next_ind]

        ## critical point: the global optimum has just been sampled
        if (next_best_y_true == Y_global_max and Nc == 0):
            Nc = j + init_train_size
            if (to_break):
                break

    saved_title = '-'
    if (save_csv):
        results = pd.DataFrame(data=results_mat[0:j, :],
                               columns=[
                                   'sample_size', 'pred_ind',
                                   'best_pred_result', 'y_true', 'r2',
                                   'pearson', 'p_value', 'mse', 'r2_s',
                                   'pearson_s', 'p_value_s', 'mse_s'
                               ])
        saved_title = data_handler.save_csv(results, title=title)

    # compute stats
    mean_y_wo_init = np.mean(true_results[0:j])
    std_y_wo_init = np.std(true_results[0:j])

    mean_y_w_init = np.mean(Y[train_ind])
    std_y_w_init = np.std(Y[train_ind])

    run_time = (time.time() - init_time) / 60

    return [
        saved_title, Nc, mean_y_wo_init, std_y_wo_init, mean_y_w_init,
        std_y_w_init, mean_y_only_init, std_y_only_init, run_time
    ]
Example #7
# save the results after every batch of repetitions as a backup
for j in range(0, outer_loop):

    init_time = time.time()
    res_arr = []

    for i in range(0, inner_loop):

        loop_count = j * inner_loop + i
        result = PAM_regression(save_csv=False,
                                verbose=False,
                                to_break=True,
                                title='cqd_PAM_' + str(loop_count) +
                                'th_loop_')
        res_arr.append(result)
        print(str(loop_count), ' -> ', str(result[0]), '  time=', result[-1])

    PAM_df = pd.DataFrame(data=res_arr,
                          columns=[
                              'file-name', 'num_experiments', 'mean_y_wo_init',
                              'std_y_wo_init', 'mean_y_w_init', 'std_y_w_init',
                              'mean_y_only_init', 'std_y_only_init', 'run_time'
                          ])
    saved_path = data_handler.save_csv(PAM_df,
                                       title='cqd_PAM_' + str(inner_loop) +
                                       'times_')
    print('total = ', str((time.time() - init_time) / 3600),
          '  hrs  >>-------saved')
def PAM_classification(verbose=False,
                       save_csv=False,
                       to_break=True,
                       title='mos2_PAM_'):
    '''
        PAM for a classification problem.

        Arguments:
            verbose : Bool.
            save_csv: Bool. Whether to save detailed PAM results to a CSV file
            to_break: Bool. Whether to enforce the extra stopping condition once the critical point is found

        Return:
            [saved_title, Nc, *results_mat[j], run_time] : Nc is the critical point
    '''
    # critical point
    Nc = 0
    init_time = time.time()

    # setup initial sets
    init_sets = generate_init_sets()
    train_ind = init_sets['train_ind']
    test_ind = init_sets['test_ind']
    if (verbose):
        print('initial training set indexes', train_ind)

    # Results store
    init_train_size = len(train_ind)
    init_cnot_count = list(Y[train_ind]).count(0)
    init_can_count = init_train_size - init_cnot_count
    results_mat = np.zeros((totalSamp - init_train_size, 8))

    # set up the hyperparameter range to tune
    tuned_parameters = dict(learning_rate=[0.01],
                            n_estimators=[100, 300, 500],
                            gamma=[0, 0.2, 0.4],
                            max_depth=[5, 7, 9, 11],
                            reg_lambda=[0.1, 1, 10],
                            colsample_bylevel=[0.9],
                            subsample=[0.4, 0.7, 1])

    # start PAM-guided synthesis...
    for j in range(totalSamp):
        inner_cv = StratifiedKFold(n_splits=inner_nsplits,
                                   shuffle=True,
                                   random_state=j)
        X_train = X[train_ind]
        Y_train = Y[train_ind]
        X_test = X[test_ind]
        Y_test = Y[test_ind]

        #count pos/neg of training set
        tr_zero_count = list(Y_train).count(0)
        tr_total_count = len(train_ind)
        pos_tr = tr_total_count - tr_zero_count

        # gradient-boosted classifier
        pipe = xgb.XGBClassifier(objective='binary:logistic',
                                 min_child_weight=1,
                                 tree_method='exact',
                                 verbosity=0,
                                 n_jobs=4,
                                 random_state=3,
                                 seed=3,
                                 scale_pos_weight=1)

        gb_clf = GridSearchCV(pipe,
                              tuned_parameters,
                              cv=inner_cv,
                              scoring='roc_auc',
                              verbose=0,
                              n_jobs=4)
        gb_clf.fit(X_train, Y_train)
        # `test` is a project helper (defined elsewhere) that evaluates the
        # fitted model and picks the next condition to try
        result_list, next_ind, best_prob, fp_ts, fn_ts = test(
            gb_clf, X_test, Y_test)

        # calculate results
        type1_err = (fp_ts + tr_zero_count -
                     init_cnot_count) / (tot_cnot_count - init_cnot_count)
        type2_err = fn_ts / (tot_can_count - init_can_count)
        results_mat[j, :] = np.array([tr_total_count] + result_list +
                                     [best_prob, pos_tr, type1_err, type2_err])

        next_ind = test_ind[next_ind]
        if (verbose):
            print(j, 'loop, next_ind=', next_ind, ' #tr=', tr_total_count,
                  ' pos_tr=', pos_tr, ' best_prob=',
                  "{0:.6f}".format(best_prob), ' type1=',
                  "{0:.6f}".format(type1_err), ' type2=',
                  "{0:.6f}".format(type2_err))

        # critical point
        if ((best_prob < 0.5) and (Nc == 0)):
            Nc = tr_total_count
            if (to_break):
                break

        # stopping condition: all positive samples are already in the training set
        if (pos_tr == tot_can_count):
            break

        # update train/test sets
        train_ind = train_ind + [next_ind]
        test_ind.remove(next_ind)

    saved_title = '-'
    if (save_csv):
        results_df = pd.DataFrame(data=results_mat[0:j + 1],
                                  columns=[
                                      'sample_size', 'acc_ts', 'tpr_ts',
                                      'tnr_ts', 'best_prob', 'pos_tr',
                                      'type1_err', 'type2_err'
                                  ])
        saved_title = data_handler.save_csv(results_df, title=title)

    run_time = (time.time() - init_time) / 60

    return [saved_title, Nc] + results_mat[j].tolist() + [run_time]
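A usage sketch (as in Example #6, X, Y, totalSamp, inner_nsplits and the positive/negative count globals are module-level state here):

# hypothetical usage: one PAM pass; out[1] is the critical point Nc
out = PAM_classification(verbose=True, save_csv=True, title='mos2_PAM_')
print('critical point Nc =', out[1])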