Example #1
def findBestK(data, x_cols, y_cols):
    """
    Non-nested LOO search for the best k for kNN; also usable for quick accuracy testing.

    Arguments:
        data {DataFrame} -- data, including both feature and label columns
        x_cols {list} -- feature column names
        y_cols {list} -- label column names
    """

    best_k = 0
    best_accu = 0

    x = data.loc[:, x_cols]
    y = data.loc[:, y_cols]

    # Pick the best k
    for k in range(2, 11):  # from 2 to 10
        loo = LeaveOneOut()
        loo.get_n_splits(data)
        n = loo.split(data)
        knnClassifier = KNeighborsClassifier(n_neighbors=k, weights="uniform", metric="euclidean")

        accuracy_a = []
        real_label = []
        pred_label = []

        for train_index, test_index in n:  # each row is test data once
            xtrain, xtest = x.iloc[train_index], x.iloc[test_index]
            ytrain, ytest = y.iloc[train_index], y.iloc[test_index]

            knnClassifier.fit(xtrain, ytrain.values.ravel())
            ypred = knnClassifier.predict(xtest)
            pred_label.append(ypred)
            real_label.append(ytest)

            acc = accuracy_score(ytest, ypred)
            accuracy_a.append(acc)
        avg_acc = np.mean(accuracy_a)
        print(k, ": average accuracy ", avg_acc)

        if avg_acc > best_accu:  # update best_k if accuracy improves
            best_accu = avg_acc
            best_k = k

    print("Best k=", best_k)
    print("Best accuracy=", best_accu)

    return best_k
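
A minimal driver for findBestK, assuming it is defined at module level together with the imports below; the column names here are made up for illustration.

import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Synthetic two-feature binary dataset (hypothetical column names)
rng = np.random.RandomState(0)
df = pd.DataFrame({"f1": rng.randn(30), "f2": rng.randn(30),
                   "label": rng.randint(0, 2, 30)})
best_k = findBestK(df, x_cols=["f1", "f2"], y_cols=["label"])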
Example #2
def element_check(X, Y, num):
    if num == 2:
        clf = MLPClassifier(max_iter=500,
                            alpha=1.0,
                            random_state=21,
                            tol=0.000000001)
    elif num == 1:
        clf = KNeighborsClassifier(n_neighbors=3)
    TN = 0
    FP = 0
    FN = 0
    TP = 0
    F1 = 0
    Educate = 0.0
    Test = 0.0
    count = 0
    loo = LeaveOneOut()
    loo.get_n_splits(X)

    for train_index, test_index in loo.split(X):
        start_time = time.time()
        clf.fit(X[train_index], Y[train_index])
        education_time = time.time() - start_time
        start_time = time.time()
        proba = clf.predict(X[test_index])
        test_time = time.time() - start_time
        Educate += education_time
        Test += test_time

        tn, fp, fn, tp = confusion_matrix(Y[test_index], proba,
                                          labels=[0, 1]).ravel()
        TN += tn
        FP += fp
        FN += fn
        TP += tp
        count += 1
        F1 += (f1_score(Y[test_index], proba, average='binary'))

    summ = TP + TN + FP + FN

    print('TP: ', TP / summ)
    print('TN: ', TN / summ)
    print('FP: ', FP / summ)
    print('FN: ', FN / summ)
    print('Precision: ', TP / (TP + FP))  # precision of the positive class
    print('Recall: ', TP / (TP + FN))  # recall of the positive class
    print('F-measure: ', F1 / count)
    print('Training time: ', Educate)
    print('Testing time: ', Test)
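
element_check relies on imports not shown in the excerpt; a minimal sketch of driving it, under the assumption that the imports below match the ones in the original module:

import time
from sklearn.datasets import make_classification
from sklearn.model_selection import LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, f1_score

X, Y = make_classification(n_samples=40, n_features=5, random_state=0)
element_check(X, Y, num=1)  # num=1 selects the kNN branch, num=2 the MLP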
def save_regression_leave_one_out(X, y, classification_model, classification_model_name):
    y_preds_list = []
    y_list = []
    loo = LeaveOneOut()
    loo.get_n_splits(X)
    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        regressionFunction = classification_model.fit(X_train, y_train)
        y_pred = regressionFunction.predict(X_test)
        y_list.append(y_test)
        y_preds_list.append(y_pred)

    with open(f"./results/leave_one_out/{classification_model_name}.txt", "w") as f:
        f.write(get_model_report_for_multi_y_pred(y_list, y_preds_list))
def k_fold(reg, x_train, y_train, k=5):

    if k == -1:
        kf = LeaveOneOut()
    else:
        kf = KFold(n_splits=k)
    kf.get_n_splits(x_train)

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    i = 0
    for train_index, test_index in kf.split(x_train):
        _x_train, _x_test = x_train.values[train_index, :], x_train.values[test_index, :]
        _y_train, _y_test = y_train[train_index],    y_train[test_index]

        probas_ = reg.fit(_x_train, _y_train).predict_proba(_x_test)

        fpr, tpr, thresholds = roc_curve(_y_test, probas_[:, 1])
        tprs.append(np.interp(mean_fpr, fpr, tpr))  # np.interp replaces the removed scipy interp
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        # plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
        i += 1

    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, label='Chance', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr, mean_tpr, label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc), lw=2, alpha=.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")

    return reg
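
k_fold expects an estimator exposing predict_proba, a feature DataFrame (it indexes x_train.values) and a label array; a short synthetic-data sketch, assuming the module-level imports below (the estimator choice is arbitrary):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import KFold, LeaveOneOut

features, labels = make_classification(n_samples=60, n_features=4, random_state=1)
fitted = k_fold(LogisticRegression(max_iter=1000), pd.DataFrame(features), labels, k=5)
plt.show()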
Example #5
def find_squared_losses(N, X, Y, degree, btype):
    """ returns the mean leave-one-out test squared loss and the full-data squared loss """

    # init leave-one-out validator
    loo = LeaveOneOut()
    nb_splits = loo.get_n_splits(X)

    # run leave-one-out validation to calculate the losses
    squared_losses_test = []
    for train_index, test_index in loo.split(X):
        # prepare train and test sets
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        # fit on the training fold only; fitting parameters to the held-out
        # point itself would make the test loss trivially small
        dm_train = build_design_matrix(X_train, degree, btype)
        ot = find_optimal_parameters(dm_train, y_train)

        # find squared loss for the held-out point
        dm_test = build_design_matrix(X_test, degree, btype)
        squared_loss_test = calculate_squared_loss(y_test, dm_test, ot)
        squared_losses_test.append(squared_loss_test.item())  # np.asscalar was removed from NumPy

    # calculate mean value
    mean_squared_loss_test = sum(squared_losses_test) / nb_splits

    # find maximum likelihood for variance for the whole dataset
    dm = build_design_matrix(X, degree, btype)
    ot = find_optimal_parameters(dm, Y)
    squared_loss_mle_var = calculate_squared_loss(Y, dm, ot)
    squared_loss_mle_var_scalar = squared_loss_mle_var.item()

    return mean_squared_loss_test, squared_loss_mle_var_scalar
Example #6
def LOOCV(ohe_df, use_previous_years=False):
    '''
    Leave one out cross validation to check performance on
    multiple regression models.
    '''
    models = {
     'RFR': RandomForestRegressor(n_estimators=50, random_state=0),
     'GBR': GradientBoostingRegressor(max_depth=1, random_state=0),
     'LIR': LinearRegression(),
     'SVR': SVR(kernel='linear')
    }
    if use_previous_years is False:
        ordinal_columns = df[['INCOME_BINS', '2017 RATING']]
        ohe_df = pd.concat([categorical_columns, ordinal_columns], axis=1)

    df_x = ohe_df.iloc[:, :-1]
    df_y = ohe_df.iloc[:, -1]

    scoring = ['neg_mean_squared_error', 'neg_mean_absolute_error']
    loo = LeaveOneOut()  # instantiate the splitter and pass it directly as cv

    for name, model in models.items():
        scores = cross_validate(model, df_x, df_y, cv=loo,
                                scoring=scoring)
        rmse = (-1*mean(scores['test_neg_mean_squared_error']))**0.5
        mae = -1*mean(scores['test_neg_mean_absolute_error'])
        print(f'{name} RMSE: {rmse: 0.4f}, MAE: {mae: 0.4f}')
    def intra_set_cross_validation(self, pred_metric, target_metric):
        loo = LeaveOneOut()
        loo.get_n_splits(np.arange(self.num_samples))

        pred_intra_set_distance_cv = np.zeros(
            (self.num_samples, 1, self.num_samples - 1))
        target_intra_set_distance_cv = np.zeros(
            (self.num_samples, 1, self.num_samples - 1))

        for train_index, test_index in loo.split(np.arange(self.num_samples)):
            pred_intra_set_distance_cv[test_index[0]][0] = utils.c_dist(
                pred_metric[test_index], pred_metric[train_index])
            target_intra_set_distance_cv[test_index[0]][0] = utils.c_dist(
                target_metric[test_index], target_metric[train_index])

        return pred_intra_set_distance_cv, target_intra_set_distance_cv
Example #8
def loocv_logistic_retrain(subj_censored, features, DV, best_c):
    start = time.time()
    cv = LeaveOneOut()
    num_cv = cv.get_n_splits(subj_censored)
    res = []

    for i in range(num_cv):
        # define train, test data
        train_data, test_data = loocv_train_test_split_ith(subj_censored, i)

        # define model
        best_lasso_model = LogisticRegression(penalty='l1',
                                              solver='saga',
                                              C=best_c,
                                              fit_intercept=False)

        # fit model
        best_lasso_model.fit(train_data[features], train_data[DV])

        # predict prob
        yhat = best_lasso_model.predict(test_data[features])
        yprob = best_lasso_model.predict_proba(test_data[features])[:, 1]

        # save pred outcomes
        res.append([
            test_data['HCPID'].values[0], test_data[DV].values[0], yhat[0],
            yprob[0]
        ])

    res = pd.DataFrame(res, columns=['HCPID', 'ytrue', 'yhat', 'yprob'])

    print('Time Usage (s)', round((time.time() - start), 4))
    return res
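
loocv_train_test_split_ith is not shown in this excerpt; a plausible stand-in (an assumption, not the original helper) simply makes row i the single-row test set:

def loocv_train_test_split_ith(df, i):
    # Hypothetical reconstruction: hold out row i, train on the rest
    # (assumes unique index values in df).
    test_data = df.iloc[[i]]
    train_data = df.drop(df.index[i])
    return train_data, test_data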
Example #9
def call_SVM_LOOCV(X, y, verbose=0):
    """
	This function applies LDA on the data and returns the LOOCV scores in 2 ways.

	Created by: Loukas Serafeim, Nov 2017

	Args:
		X: A numpy array of the input features
		y: A numpy array of the target values. Note: this shoud have shape= [n_features, ]

	Returns:
 		The mean LOOCV scores of LDA classification
	"""

    ###### Standardize Data ###########
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('clf', SVC(kernel='linear', random_state=1))])
    #clf = SVC(kernel = 'linear', random_state = 1)
    #sc = StandardScaler()
    #pipe =  make_pipeline(sc, clf)
    loo = LeaveOneOut()

    if verbose:
        print("The number of splits is:{}\n".format(loo.get_n_splits(X)))

    ########################  1st WAY ######################
    test_fold_predictions = []
    y_test_all = []

    for i, j in loo.split(X):
        X_train, X_test = X[i], X[j]
        y_train, y_test = y[i], y[j]
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        test_fold_predictions.append(y_pred)
        y_test_all.append(y_test)
    if verbose:
        print('Confusion matrix \n{}\n'.format(
            metrics.confusion_matrix(y_test_all, test_fold_predictions)))
        print("Accuracy is %r \n" %
              metrics.accuracy_score(y_test_all, test_fold_predictions))

    ################ PLOT CONFUSION MATRIX PLOT #########################
    #plt.imshow(confusion_matrix(y_test_all, test_fold_predictions), interpolation='nearest', cmap=plt.cm.Blues)
    #plt.colorbar()
    #plt.xlabel("True label")
    #plt.ylabel("Predicted label")
    #plt.title(" The Confusion Matrix")
    ### stop blocking #########
    #plt.show(block = False)

    ###################### 2nd way using sklearn build-in functions ###################
    # sc = StandardScaler()
    # pipe =  make_pipeline(sc, clf)
    scores = cross_val_score(pipe, X, y, cv=loo, scoring="accuracy")
    if verbose:
        print("Accuracy of 2nd way is %r\n" % np.mean(scores))
    #plt.show()

    return np.mean(scores)
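
A quick sanity check on a built-in dataset, assuming the names used inside the function (Pipeline, StandardScaler, SVC, LeaveOneOut, metrics, cross_val_score, np) are imported; note that LOO refits the pipeline once per sample:

from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
print(call_SVM_LOOCV(X, y, verbose=1))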
Example #10
def LeaveOneOut_Onemodel(X, y, model):
    # LeaveOneOut for one model
    loo = LeaveOneOut()
    loo.get_n_splits(X)
    Prediction = []
    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        pre = model.predict(X_test)[0]
        print(pre)
        Prediction.append(pre)
    pred = pd.Series(Prediction)
    SST = sum((y - y.mean())**2)
    SSE = sum((pred - y)**2)
    print("LLO", 1 - SSE / SST)
Example #11
def leave_one_out(data, classlabel, n=3, loop=True):
    """
    tests leave-one-out accuracy of a k-nearest-neighbors model
    :param data: the test data without the class identifier
    :param classlabel: the label of the class for each instance
    :param n: the number of neighbors to be tested, default is 3
    :param loop: loop true is normal, loop false is for testing
    :return: the average overall accuracy for leave one out
    """
    # a classifier is used here: a regressor's averaged output would rarely
    # match a class label exactly in the comparison below
    knn = KNeighborsClassifier(n_neighbors=n)
    loo = LeaveOneOut()
    size = loo.get_n_splits(data, classlabel)

    rate = 0
    if loop:
        for training, testing in loo.split(data):
            x_train, x_test = data.iloc[training], data.iloc[testing]
            y_train, y_test = classlabel[training], classlabel[testing]

            knn.fit(x_train, y_train)
            if knn.predict(x_test)[0] == y_test.iloc[0]:
                rate = rate + 1
        accuracy = rate / size
    else:
        accuracy = random.uniform(0.0, 1.0)

    return accuracy
def classify2(X, y):
    from sklearn import svm

    clf = svm.LinearSVC(C=1.0,
                        dual=True,
                        fit_intercept=True,
                        intercept_scaling=1,
                        loss='squared_hinge',
                        max_iter=1000,
                        multi_class='ovr',
                        penalty='l2',
                        random_state=None,
                        tol=0.0001,
                        verbose=0)
    from sklearn.model_selection import LeaveOneOut
    loo = LeaveOneOut()
    loo.get_n_splits(X)
    pred = []
    target = []
    for train_index, test_index in loo.split(X):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        Y_pred = clf.predict(X_test)

        pred.append(Y_pred)
        target.append(y_test)

    # print (target)

    f_micro = sklearn.metrics.f1_score(target, pred, average='micro')
    p_micro = sklearn.metrics.precision_score(target, pred, average='micro')
    r_micro = sklearn.metrics.recall_score(target, pred, average='micro')

    # f_macro = sklearn.metrics.f1_score(target, pred, average='macro')
    # p_macro = sklearn.metrics.precision_score(target, pred, average='macro')
    # r_macro = sklearn.metrics.recall_score(target, pred, average='macro')

    accuracy = sklearn.metrics.accuracy_score(target, pred)

    print('Accuracy=%f' % accuracy)

    print('*' * 10 + ' Micro Score ' + '*' * 10)
    print('p=%f' % p_micro)
    print('r=%f' % r_micro)
    print('f-score=%f' % f_micro)
def evaluate(columns):
    modify("data.csv", translate(columns))

    df = pd.read_csv("modified.csv", header=0)

    dataset = df.values
    X = dataset[:, 1:]
    y = dataset[:, 0]
    y = y.astype('int')

    scale = StandardScaler().fit(X)
    X_std = scale.transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_std,
                                                        y,
                                                        train_size=.9)

    loo = LeaveOneOut()
    loo.get_n_splits(X_train)

    parameters = {
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
        'C': [1, 100, 1000]
    }
    log = LogisticRegression(multi_class='auto', max_iter=1000)
    clf = GridSearchCV(log, parameters, cv=loo)
    clf.fit(X_train, y_train)
    a = clf.best_score_
    p = clf.best_params_

    parameters = {'kernel': ['rbf', 'linear', 'poly'], 'C': [1, 100, 1000]}
    svc = SVC(gamma="scale")
    clf = GridSearchCV(svc, parameters, cv=loo)
    clf.fit(X_train, y_train)
    if clf.best_score_ > a:
        a = clf.best_score_
        p = clf.best_params_

    parameters = {'n_neighbors': [2, 3, 4, 5, 6], 'p': [1, 2]}
    knn = KNeighborsClassifier()
    clf = GridSearchCV(knn, parameters, cv=loo)
    clf.fit(X_train, y_train)
    if clf.best_score_ > a:
        a = clf.best_score_
        p = clf.best_params_

    return a,
Example #14
def loo_risk(X, y, regmod):
    """
    Construct the leave-one-out square error risk for a regression model
    
    Input: design matrix, X, response vector, y, a regression model, regmod
    Output: scalar LOO risk
    """
    loo = LeaveOneOut()
    loo.get_n_splits(X)
    loo_losses = []
    for train_index, test_index in loo.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        regmod.fit(X_train, y_train)
        y_hat = regmod.predict(X_test)
        loss = np.sum((y_hat - y_test)**2)
        loo_losses.append(loss)
    return np.mean(loo_losses)
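
For instance, the LOO risk of a linear fit can be compared against a kNN regressor on synthetic data (a sketch; the data here is made up):

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import LeaveOneOut
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.RandomState(0)
X = rng.uniform(size=(50, 1))
y = 2.0 * X.ravel() + 0.1 * rng.randn(50)
print(loo_risk(X, y, LinearRegression()))
print(loo_risk(X, y, KNeighborsRegressor(n_neighbors=5)))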
Example #15
def LeaveOneOut_test(dataset,min_support,min_threshold):
    score = 0
    tot = 0
    loo = LeaveOneOut()
    loo.get_n_splits(dataset)

    dataset = numpy.array(dataset)

    for train_index, test_index in loo.split(dataset):
        dataset_train = dataset[train_index]  # train
        dataset_test = dataset[test_index]  # ActiveUserTransactions
        rules = ARM_train(dataset_train,min_support,min_threshold)
        if not rules.empty:
            score += ARM_test(rules, dataset_test[0])

        tot = tot + 1
        #print(score)

    return float(score)/tot # this is accuracy
Example #16
def knn(n):

    x, y = getData()

    model = KNeighborsClassifier(n_neighbors=n)

    loo = LeaveOneOut()
    loo.get_n_splits(x)
    y_pred = []
    a = np.array(y)
    for train_index, test_index in loo.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = a[train_index], a[test_index]

        model.fit(x_train, y_train)
        y_pred.extend(model.predict(x_test))

    print("KNN com N =", n)
    print("y: ")
    print(y)
    print("y_pred: ")
    print(y_pred)

    print("recall score:")
    print("macro:", recall_score(y, y_pred, average='macro'))

    print("micro:", recall_score(y, y_pred, average='micro'))

    print(recall_score(y, y_pred, average=None))

    print("precision score:")
    print("macro:", precision_score(y, y_pred, average='macro'))

    print("micro", precision_score(y, y_pred, average='micro'))

    print("weighted:", precision_score(y, y_pred, average='weighted'))

    print(precision_score(y, y_pred, average=None))

    print("accuracy score:")
    print("normalizado:", accuracy_score(y, y_pred))

    print("nao normalizado:", accuracy_score(y, y_pred, normalize=False), "\n")
Example #17
def classification_within_modality(dataFrame, categoria, exposure):
    '''
    Within-modality classification: for each subject ("people" group),
    run leave-one-out over that subject's trials, mean-center the features
    with the training mean, reduce with PCA (99% variance) and classify the
    held-out trial with Gaussian Naive Bayes.
    '''
    dataFrame_result = []
    loo = LeaveOneOut()

    pbar = tqdm(total=loo.get_n_splits(dataFrame))

    for ind, pearson in dataFrame.groupby('people'):

        X = pearson.drop(['trial', 'group', 'people'], axis=1)
        y = pearson['group']

        loo = LeaveOneOut()

        for train_index, test_index in loo.split(X):

            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            #Normalize
            train_mean = average(X_train, axis=0)

            X_train_without_mean = subtract(X_train, train_mean)
            X_test_without_mean = subtract(X_test, train_mean)

            # pass the uniform priors to the constructor; assigning
            # class_prior_ by hand is overwritten by fit()
            clf = GaussianNB(priors=[1 / 6] * 6)

            pca_ = PCA(random_state=42, svd_solver='full', n_components=0.99)

            pca = pca_.fit(X_train_without_mean)

            X_train_pca = pca.transform(X_train_without_mean)

            X_test_pca = pca.transform(X_test_without_mean)

            clf = clf.fit(X_train_pca, y_train)

            y_pred = clf.predict(X_test_pca)

            dataFrame_result.append(
                [ind, y_pred, y_test.values, categoria, exposure])

            pbar.update(1)

    return dataFrame_result
Example #18
def main():
    columns = "age sex bmi map tc ldl hdl tch ltg glu".split()
    diabetes = datasets.load_diabetes()
    print(diabetes)
    print(columns)
    df = pd.DataFrame(diabetes.data, columns=columns)
    y = diabetes.target

    # create training and testing vars
    X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)
    lm = linear_model.LinearRegression()
    model = lm.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(predictions)

    # the linear model
    print('score', model.score(X_test, y_test))

    # KFold split example
    X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    y = np.array([1, 2, 3, 4])
    kf = KFold(n_splits=2)
    kf.get_n_splits(X)
    print(kf)
    for train_index, test_index in kf.split(X):
        print('TRAIN:', train_index, 'TEST:', test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    # leave one out cross validation
    X = np.array([[1, 2], [3, 4]])
    y = np.array([1, 2])
    loo = LeaveOneOut()
    loo.get_n_splits(X)
    for train_index, test_index in loo.split(X):
        print('TRAIN:', train_index, 'TEST:', test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(X_train, X_test, y_train, y_test)
Example #19
def find_parameters_evaluation(index_set, gene_expression, cell_count_aa):
    prediction = []
    actual_value = []
    n_splines_all = []
    lam_all = []

    # THIS IS OUTER LOOP: for VALIDATION/TESTING
    #train n models and evaluate their average performance
    gene_indexes = index_set
    y = cell_count_aa
    X = gene_expression[gene_expression.columns[gene_indexes]]
    loo = LeaveOneOut()
    loo.get_n_splits(X)
    gam = LinearGAM()
    gam = gam.gridsearch(X,
                         y,
                         n_splines=np.arange(10, 50),
                         lam=[0.4, 0.5, 0.6, 0.7, 0.8])

    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # THIS IS INNER LOOP: for TRAINING/VALIDATION
        #train model with given optimized parameters
        regr = gam.fit(X_train, y_train)
        #make a prediction on OUTER LOOP test set
        prediction_val = regr.predict(X_test)[0]
        # store predictions and actual values
        prediction.append(prediction_val)
        actual_value.append(y_test[0])
        # add optimal parameter values to arrays
        n_splines_all.append(regr.n_splines)
        lam_all.append(regr.lam)
        print(test_index)
        print(str(prediction_val), " ", str(y_test[0]))
    #calculate spearman correlation over all of the models
    rho, pval = spearmanr(actual_value, prediction)
    lams = np.array(lam_all)
    lams_mean = lams.mean()
    n_splines_all = np.array(n_splines_all)
    n_splines_mean = n_splines_all.mean()
    return lams_mean, n_splines_mean, rho, pval
Example #20
def run3():
    import warnings
    warnings.filterwarnings("ignore")
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    df=pd.read_csv("AdmissionDataset/data.csv")
    X=df.iloc[:,0:8].values  # .as_matrix() was removed from pandas
    #print(X.shape)
    y=df.iloc[:,8:9].values
    y=y.reshape((y.shape[0],))
    #print(y.shape)
    d_m=np.mean(X)
    d_s=np.std(X)
    d_n=(X-d_m)/d_s
    from sklearn.model_selection import LeaveOneOut
    kf = LeaveOneOut()
    kf.get_n_splits(d_n)
    msel=[]
    for train_index, test_index in kf.split(X):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = d_n[train_index], d_n[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model =RidgeRegression(0.0001, iters=3000, lrate=0.001)
        model.fit(X_train,y_train)
        y_pred=model.predict(X_test)
        from sklearn.metrics import mean_squared_error,r2_score
        mse = mean_squared_error(y_test, y_pred)
        msel.append(mse)
    print("Mean Error for Ridge Regression : "+str(sum(msel)/len(msel)))
    msel=[]
    for train_index, test_index in kf.split(X):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = d_n[train_index], d_n[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model =LassoRegression(0.0001, iters=3000, lrate=0.001)
        model.fit(X_train,y_train)
        y_pred=model.predict(X_test)
        from sklearn.metrics import mean_squared_error
        mse = mean_squared_error(y_test, y_pred)
        msel.append(mse)
    print("Mean Error for Lasso Regression : "+str(sum(msel)/len(msel)))
Example #21
def get_cross_validation_predictions(data_obj, data, target, tags, method):
    import numpy as np
    data = np.array(data)
    target = np.array(target)

    from sklearn.model_selection import LeaveOneOut
    loo = LeaveOneOut()
    loo.get_n_splits(data)

    preds = []
    for train_index, test_index in loo.split(data):
        indexes_to_leave_out, q_tag = get_all_questions_belonging_to_thread(
            data_obj, tags, index=list(test_index)[0])
        train_index = np.delete(train_index, indexes_to_leave_out, 0)
        train_target, test_target = target[train_index], target[test_index]
        train_data, test_data = data[train_index], data[test_index]
        pred = method(train_data, train_target, test_data, q_tag)
        preds.append(pred[0])

    return preds, target
Example #22
def runTest(X,Y):
    Y_pred = np.zeros((len(Y),), dtype='uint8')
    loo = LeaveOneOut()
    print("LOO : Total {:d} tests to perform".format(loo.get_n_splits(X)))
    for train_index, test_index in loo.split(X):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        clf = SVC(C=2)
        clf.fit(X_train, Y_train)
        Y_pred[test_index] = clf.predict(X_test)
    return Y_pred
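
runTest hard-codes an SVC(C=2) per fold and returns the vector of held-out predictions; a short driver on synthetic data, with the assumed imports shown:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import LeaveOneOut
from sklearn.svm import SVC

X, Y = make_classification(n_samples=30, n_features=4, random_state=0)
Y_pred = runTest(X, Y.astype('uint8'))
print((Y_pred == Y).mean())  # LOO accuracy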
Example #23
 def loadVideos(self):
     """
     Load the video data, Extract feature and train hmm model
     """
     mat_contents = sio.loadmat('data/gait.mat')
     mat_contents = mat_contents['gait']
     for category_name in self.categories:
         """Each  category"""
         images = []
         for person in self.persons:
             """Each person"""
             if person == 'lena_' and (category_name == 'run'
                                       or category_name == 'skip'
                                       or category_name == 'walk'):
                 """Person is Lena and category run, skip or walk"""
                 video = mat_contents[person + category_name + '1'][0][0]
                 if self.args.mhi:
                     data = self.extractMhiFeature(video)
                 else:
                     data = self.extractFeature(video)
                 images.append(data)
                 video = mat_contents[person + category_name + '2'][0][0]
                 if self.args.mhi:
                     data = self.extractMhiFeature(video)
                 else:
                     data = self.extractFeature(video)
                 images.append(data)
             else:
                 video = mat_contents[person + category_name][0][0]
                 if self.args.mhi:
                     data = self.extractMhiFeature(video)
                 else:
                     data = self.extractFeature(video)
                 images.append(data)
         if len(images) != 0:
             loo = LeaveOneOut()  # images.__len__()
             images = np.array(images)
             # train hmm with category all video
             self.fullDataTrainHmm[
                 category_name], std_scale, std_scale1 = self.train(images)
             self.model[category_name] = {}
             self.model[category_name]['hmm'] = []
             self.model[category_name]['std_scale'] = []
             self.model[category_name]['std_scale1'] = []
             self.model[category_name]['data'] = []
             print(loo.get_n_splits(images))
             for train, test in loo.split(images):
                 markov_model, std_scale, std_scale1 = self.train(
                     images[train])
                 self.model[category_name]['hmm'].append(markov_model)
                 self.model[category_name]['std_scale'].append(std_scale)
                 self.model[category_name]['std_scale1'].append(std_scale1)
                 self.model[category_name]['data'].append(images[test])
         self.target_names = self.categories
Example #24
def calculate_BAcc(features_array):
    '''For given feature(s), use NN to calculate its BAcc values with respect to their true labels. Leave-one-out'''
    features_array = features_array.T
    # Features subset used to calculate BAcc
    true_labels = 1*c_mic.T
    # true_labels =  Data set columns converted to boolean then 0/1.
    loo = LeaveOneOut()
    loo.get_n_splits(features_array)
    # The number of iterations.
    classifier = KNeighborsClassifier(n_neighbors=1)
    # KNN, k = 1
    y_pred = []
    for train_index, test_index in loo.split(features_array):
        X_train, X_test = features_array.iloc[train_index], features_array.iloc[test_index]
        y_train, y_test = true_labels[train_index], true_labels[test_index]
        classifier.fit(X_train, y_train)
        y_pred_i = classifier.predict(X_test)
        y_pred.append(y_pred_i[0])  # store the scalar prediction so y_pred stays 1-D
    BAcc = balanced_accuracy_score(true_labels, y_pred)
    return BAcc
Example #25
def get_loocv_accuracy(model, data, labels):
    loo = LeaveOneOut()
    print(
        f"\nCalculating LOOCV accuracy with {loo.get_n_splits(data)} iterations..."
    )
    total_accuracy = 0
    for training_indices, testing_indices in loo.split(data):
        model.fit(data[training_indices], labels[training_indices])
        y_predicted = model.predict(data[testing_indices])
        total_accuracy += accuracy_score(labels[testing_indices], y_predicted)
    return total_accuracy / loo.get_n_splits(data)
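
data and labels must support NumPy-style positional indexing; for example (the dataset and model choice here are arbitrary):

from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score
from sklearn.model_selection import LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier

data, labels = load_wine(return_X_y=True)
print(get_loocv_accuracy(KNeighborsClassifier(n_neighbors=3), data, labels))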
Example #26
def spatial_kfold(central_shape,year,thresh):
    
    data=central_shape[['Call_density','geometry', '{}_pop_density'.format(year)]].reset_index(drop=True)
    #Get the centroid for each TA
    data['centroid']=data['geometry'].centroid
    loo = LeaveOneOut()
    loo.get_n_splits(data)
    
    coef_sp=pd.DataFrame()

    for train_index, test_index in loo.split(data):
       print("TRAIN:", train_index, "TEST:", test_index)
       train=data.loc[train_index].reset_index(drop=True)
       test=data.loc[test_index].reset_index(drop=True)
       train_new=pd.DataFrame()
       
       #keep only training points whose centroid-to-centroid distance
       #from the test point exceeds the threshold
       for i,row in train.iterrows():
           if test.centroid.distance(train.centroid.iloc[i]).values> thresh:
               train_new=pd.concat([train_new, train.iloc[[i]]])  # DataFrame.append was removed in pandas 2.0
       
       #Get the train and test datasets
       X_train = pd.DataFrame(np.log(train_new['Call_density'])).reset_index(drop=True)
       y_train = pd.DataFrame(np.log(train_new['{}_pop_density'.format(year)])).reset_index(drop=True)
       X_test = pd.DataFrame(np.log(test['Call_density'])).reset_index(drop=True)
       y_test = pd.DataFrame(np.log(test['{}_pop_density'.format(year)])).reset_index(drop=True)
       
       #Fit regression model
       lm = LinearRegression()
       lm.fit(X_train,y_train)
       
       #Get RMSE train and test
       RMSEtrain=np.sqrt(mean_squared_error(y_train,lm.predict(X_train)))
       RMSEtest=np.sqrt(mean_squared_error(y_test,lm.predict(X_test)))
       
       #Get the coefficient for all iterations
       coef_sp=pd.concat([coef_sp, pd.DataFrame([{'Alpha':float(lm.intercept_),'Beta':float(lm.coef_),
                         'R^2':lm.score(X_train,y_train),'RMSE_train':RMSEtrain,'RMSE_test':RMSEtest}])],
                         ignore_index=True)
    
    return coef_sp
def makeItemCrossValidation(dataset, labels):

    print('[*] Item cross validation has started')

    tnSum, fpSum, fnSum, tpSum = 0, 0, 0, 0
    trainingTime = 0.0
    testingTime = 0.0
    fscore = 0.0

    myClassifier = RandomForestClassifier(min_samples_leaf=2,
                                          random_state=17,
                                          criterion='entropy')

    loo = LeaveOneOut()
    loo.get_n_splits(dataset)

    for trainIndex, testIndex in loo.split(dataset):

        # Training
        begin = time.time()
        myClassifier.fit(dataset[trainIndex], labels[trainIndex])
        trainingTime += time.time() - begin

        # Testing
        begin = time.time()
        y_pred = myClassifier.predict(dataset[testIndex])
        testingTime += time.time() - begin

        tn, fp, fn, tp = confusion_matrix(labels[testIndex],
                                          y_pred,
                                          labels=[0, 1]).ravel()
        tnSum += tn
        fpSum += fp
        fnSum += fn
        tpSum += tp

        fscore += f1_score(labels[testIndex], y_pred, average='binary')

    resultTesting(tpSum, tnSum, fpSum, fnSum, len(dataset), fscore,
                  len(dataset), trainingTime, testingTime)
def test_LeaveOneOut():
    '''
    Test the usage of LeaveOneOut
    :return: None
    '''
    # X = np.array([[1, 2, 3, 4],
    #               [11, 12, 13, 14],
    #               [21, 22, 23, 24],
    #               [31, 32, 33, 34]]
    #              )
    # y = np.array([1, 1, 0, 0])
    #
    # loo = LeaveOneOut()
    # loo.get_n_splits(X)
    # for train_index, test_index in loo.split(X):
    #     print("Train Index:", train_index)
    #     print("Test Index:", test_index)
    #     print("X_train:", X[train_index])
    #     print("X_test:", X[test_index])
    #     print("")

    from sklearn.datasets import load_digits
    from sklearn.svm import LinearSVC
    from sklearn.metrics import mean_squared_error

    digits = load_digits()  # load a dataset for a classification problem
    X = digits.data
    y = digits.target
    print(y)
    print(len(y))
    clf = LinearSVC()  # avoid shadowing the SVC class name
    loo = LeaveOneOut()
    loo.get_n_splits(X)
    mean_squared_error_list = []
    for train_index, test_index in loo.split(X):
        clf.fit(X[train_index], y[train_index])
        prediction = clf.predict(X[test_index])
        print(mean_squared_error(y[test_index], prediction))
        mean_squared_error_list.append(
            mean_squared_error(y[test_index], prediction))
    print(np.average(mean_squared_error_list))
Example #29
def meta_model(combined_meta, metric_df, algorithm_name):
    loo = LeaveOneOut()
    loo.get_n_splits(combined_meta)

    idx = metric_df.columns.get_loc(algorithm_name)
    m, n = combined_meta.shape
    pca = PCA(n_components=3)
    y_pred = np.zeros(shape=(m, 1))

    for train_index, test_index in loo.split(combined_meta):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = combined_meta.iloc[
            train_index, :], combined_meta.iloc[test_index, :]
        pca_train_data = pca.fit_transform(X_train)
        pca_test_data = pca.transform(X_test)
        y_train, y_test = metric_df.iloc[train_index,
                                         idx], metric_df.iloc[test_index, idx]

        # Calculate actual gamma values to test
        model = SVR(C=1, epsilon=0.1, gamma='scale')

        # model = linear_model.LinearRegression()
        model.fit(pca_train_data, y_train)

        y_pred[test_index] = model.predict(pca_test_data)

        # Uncomment the next two lines to manually get prediction.
        #print(get_pred(model, pca_test_data, pca_train_data))
        #print(y_pred[test_index])  # the same output from both

        # print("Train Data", X_train, X_test, "\n Response \n", y_train, y_test)
        # print(y_test, '\n')

        # y_pred = pd.DataFrame({
        #   algorithm_name: y_pred, }, index=y_test.index)

    data = pd.DataFrame(y_pred, index=metric_df.index)
    data.columns = [algorithm_name]

    return data
Example #30
def DecisionTree():
    print("Decision Tree")
    x, y = getData()
    clf = DecisionTreeClassifier()

    #tree.plot_tree(clf.fit(x, y))

    loo = LeaveOneOut()
    loo.get_n_splits(x)
    y_pred = []
    a = np.array(y)
    for train_index, test_index in loo.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = a[train_index], a[test_index]

        clf.fit(x_train, y_train)
        y_pred.extend(clf.predict(x_test))

    print(y)
    print(y_pred)
    print("recall score:")
    print("macro:", recall_score(y, y_pred, average='macro'))

    print("micro:", recall_score(y, y_pred, average='micro'))

    print(recall_score(y, y_pred, average=None))

    print("precision score:")
    print("macro:", precision_score(y, y_pred, average='macro'))

    print("micro", precision_score(y, y_pred, average='micro'))

    print("weighted:", precision_score(y, y_pred, average='weighted'))

    print(precision_score(y, y_pred, average=None))

    print("accuracy score:")
    print("normalizado:", accuracy_score(y, y_pred))

    print("nao normalizado:", accuracy_score(y, y_pred, normalize=False), "\n")
Example #31
def computeCVROC(df, model, outcomeVar, predVars, nFolds=10, LOO=False):
    """Apply model to df and return performance metrics in a cross-validation framework.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain outcome and predictor variables.
    model : sklearn or other model
        Model must have fit and predict methods.
    outcomeVar : str
    predVars : ndarray or list
        Predictor variables in the model.
    nFolds : int
        N-fold cross-validation (not required for LOO)

    Returns
    -------
    fpr : np.ndarray
        Pre-specified vector of FPR thresholds for interpolation
        fpr = np.linspace(0, 1, 100)
    meanTPR : np.ndarray
        Mean true-positive rate in test fraction.
    auc : float
        Area under the mean ROC curve.
    acc : float
        Mean accuracy score in test fraction.
    results : returned by model.fit()
        Training model results object for each fold
    prob : pd.Series
        Mean predicted probabilities on test data with index from df
    success : bool
        An indicator of whether the cross-validation was completed."""

    if not isinstance(predVars, list):
        predVars = list(predVars)
    
    tmp = df[[outcomeVar] + predVars].dropna()
    X,y = tmp[predVars].astype(float), tmp[outcomeVar].astype(float)

    if LOO:
        cv = LeaveOneOut()
        nFolds = cv.get_n_splits(y)
        cv_iter = cv.split(X=X, y=y)  # split() requires the data matrix; passing only y raises a TypeError
    else:
        cv = StratifiedKFold(n_splits=nFolds, shuffle=True)
        cv_iter = cv.split(X=X, y=y)
    
    fpr = np.linspace(0, 1, 100)
    tpr = np.nan * np.zeros((fpr.shape[0], nFolds))
    acc = np.nan * np.zeros(nFolds)
    auc = np.nan * np.zeros(nFolds)
    coefs = []
    probs = []

    for outi, (trainInd, testInd) in enumerate(cv_iter):
        Xtrain, Xtest = X.iloc[trainInd], X.iloc[testInd]
        ytrain, ytest = y.iloc[trainInd], y.iloc[testInd]

        results = model.fit(X=Xtrain, y=ytrain)
        prob = results.predict_proba(Xtest)
        
        class1Ind = np.nonzero(results.classes_ == 1)[0][0]
        fprTest, tprTest, _ = sklearn.metrics.roc_curve(ytest, prob[:, class1Ind])

        tpr[:, outi] = np.interp(fpr, fprTest, tprTest)
        auc[outi] = sklearn.metrics.auc(fprTest, tprTest)
        acc[outi] = sklearn.metrics.accuracy_score(ytest, np.round(prob[:, class1Ind]), normalize=True)
        coefs.append(results.coef_[None,:])
        probs.append(pd.Series(prob[:, class1Ind], index=Xtest.index))
    
    meanTPR = np.mean(tpr, axis=1)
    meanTPR[0], meanTPR[-1] = 0, 1
    meanACC = np.mean(acc)
    meanAUC = sklearn.metrics.auc(fpr, meanTPR)
    
    """Compute mean probability over test predictions in CV"""
    probS = pd.concat(probs).groupby(level=0).agg(np.mean)
    probS.name = 'Prob'

    """Refit all the data for final model"""
    result = model.fit(X=X, y=y)

    rocRes = rocStats(y, np.round(probS))
    
    outD = {'fpr':fpr,                      # (100, ) average FPR for ROC
            'tpr':meanTPR,                  # (100, ) average TPR for ROC
            'AUC':auc,                      # (CVfolds, ) AUC of ROC for each outer test fold
            'mAUC': meanAUC,                # (1, ) AUC of the average ROC
            'mACC': np.mean(acc),
            'ACC':acc,                      # (CVfolds, ) accuracy across outer test folds
            'finalResult': result,          # final fitted model with predict() exposed
            'prob':probS,                   # (N,) pd.Series of predicted probabilities avg over outer folds
            'coefs':np.concatenate(coefs),  # (CVfolds, predVars)
            'Xvars':predVars,
            'Yvar':outcomeVar,
            'nFolds':nFolds,
            'LOO':'Yes' if LOO else 'No',
            'N':tmp.shape[0]}                  
    outD.update(rocRes[['Sensitivity', 'Specificity']].to_dict())
    return outD