Example #1
def _iter_cv(n):  # XXX support sklearn < 0.18
    if hasattr(LeaveOneOut, 'split'):
        cv = LeaveOneOut()
        return cv.split(np.zeros((n, 1)))
    else:
        cv = LeaveOneOut(n)  # pre-0.18 API took the number of samples
        return cv
def run_leave_one_out_cv(features, labels, classifier=LinearDiscriminantAnalysis()):
    """
    Runs leave one out CV.
    :param features: Features shape(epoch, feature)
    :param labels: list of labels of length num epochs
    :param classifier: Sklearn classifier (Defaults to LDA)
    :return: A list of cross validation scores.  Use np.average on the result to find the average score.
    """
    loo = LeaveOneOut()
    scores = []
    for train_indexes, test_indexes in loo.split(features, labels):

        # Assert the split preserves the total number of epochs
        CCDLAssert.assert_equal(len(train_indexes) + len(test_indexes), features.shape[0])

        # Split features and labels into train/test sets
        X_train, X_test = features[train_indexes, :], features[test_indexes, :]
        Y_train, Y_test = np.asarray(labels)[train_indexes], np.asarray(labels)[test_indexes]

        # Assert our X_train and X_test have the same number of features
        CCDLAssert.assert_equal(X_train.shape[1], X_test.shape[1])

        # Fit our classifier to the training data
        classifier.fit(X_train, Y_train)

        score = classifier.score(X_test, Y_test)
        scores.append(score)

    return scores
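A minimal usage sketch, assuming the CCDLAssert helper referenced above is importable; the random data below is illustrative, not from the original:

import numpy as np

demo_features = np.random.rand(20, 5)   # shape (epoch, feature)
demo_labels = [0, 1] * 10               # one label per epoch
demo_scores = run_leave_one_out_cv(demo_features, demo_labels)
print(np.average(demo_scores))          # average LOO score, as the docstring suggests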
def main(argv):
    filename = argv[0]
    t = float(argv[1]) # threshold for logistic regression (default=0.5)
    dup = int(argv[2]) # if 1, bad queries will be duplicated
    subset = 'cache' # column title for precision of cache
    full = 'full' # column title for precision of full db
    df = pd.read_csv('../../data/cache_selection_structured/' + filename)
    df = df.drop(['query', 'freq'], axis = 1)
    df = df.fillna(0)
    df['label'] = np.where(df['full'] > df['cache'], 1, 0)
    if dup:
        print('duping..')
        bads = df[df['label'] == 1]
        df = df.append(bads, ignore_index=True)
    X = df.drop(['label'], axis = 1)
    y = df['label']
    df = df.drop(['label'], axis = 1)
    p20_mean = np.zeros([1, 6])
    bad_mean = np.zeros([1, 6])
    ml_average_rare = 0
    ql_average_rare = 0
    best_average_rare = 0
    loo = LeaveOneOut()
    bad_counter = 0
    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train = X_train.drop([subset, full], axis=1)
        p12 = X_test[subset].iloc[0]
        p100 = X_test[full].iloc[0]
        is_bad = p12 < p100
        X_test = X_test.drop([subset, full], axis=1)
        # compute query likelihood based effectiveness
        ql_cache = np.mean(X_test['ql_0_0'] + X_test['ql_0_1'] +
                           X_test['ql_1_0'] + X_test['ql_2_0'])
        ql_rest = np.mean(X_test['ql_rest_0_0'] + X_test['ql_rest_0_1'] +
                           X_test['ql_rest_1_0'] + X_test['ql_rest_2_0'])
        #ql_pred = X_test['ql_0_1'].iloc[0] < X_test['ql_rest_0_1'].iloc[0]
        ql_pred = 1 if ql_cache < ql_rest else 0
        ql = p12 if ql_pred == 0 else p100
        # learn the model
        print(X_train.shape)
        print(df.columns.shape)
        # y_pred = train_lr(X_train, y_train, X_test, y_test, t, df.columns.values[:-2])
        y_pred = train_lr(X_train, y_train, X_test, y_test, t)
        ml = p12 if y_pred[0] == 0 else p100
        best = p12 if y_test.iloc[0] == 0 else p100
        rnd = p12 if np.random.randint(0, 2) == 1 else p100
        p20_mean += [p12, p100, ml, ql,
                     best, rnd]
        if is_bad:
            #bad_mean += [p12[0], p100[0], ml[0], ql[0], best[0], rnd[0]]
            bad_mean += [p12, p100, ml, ql, best, rnd]
            bad_counter += 1
    print('final results:')
    print('\t'.join(map(str,['set', 'cache', 'db', 'ml', 'ql', 'best',
                              'rand'])))
    print('\t'.join(['bad'] + list(map(str, np.round(bad_mean[0] / bad_counter, 2)))))
    print('\t'.join(['all'] + list(map(str, np.round(p20_mean[0] / df.shape[0], 2)))))
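train_lr is not shown in this snippet; a minimal sketch of what it might look like, assuming a scikit-learn logistic regression whose positive-class probability is thresholded at t (hypothetical, for illustration only):

from sklearn.linear_model import LogisticRegression

def train_lr(X_train, y_train, X_test, y_test, t):
    # Hypothetical stand-in: fit, then threshold P(label=1) at t.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    probs = lr.predict_proba(X_test)[:, 1]
    return (probs >= t).astype(int)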
Example #4
def roc_data(X,Y,clf,n_iter=50,test_size=0.1):
    if n_iter is None and test_size is None:
        cv = LeaveOneOut()
    else:
        cv = ShuffleSplit(n_splits=n_iter, test_size=test_size)
    n_labels = Y.shape[1]
    Y_cv = {i:[] for i in range(n_labels)}
    p = {i:[] for i in range(n_labels)}
    p_1 = {i:[] for i in range(n_labels)}
    p_0 = {i:[] for i in range(n_labels)}
    for train, test in cv.split(Y):
        clf.fit(X[train,:], Y[train,:])
        Y_predicted = clf.predict_proba(X[test,:])
        for i in range(Y.shape[1]):
            if type(Y_predicted) is list:
                p_ = 1 - Y_predicted[i][:,0]
            else:
                p_ = Y_predicted[:,i]
            Y_cv[i] += list(Y[test,i])
            p[i] += list(p_)
            p_1[i] += list(p_[np.where(Y[test,i]==1)[0]])
            p_0[i] += list(p_[np.where(Y[test,i]==0)[0]])
    return Y_cv, p, p_1, p_0
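A hedged usage sketch; the classifier and random multilabel data are illustrative, not from the original:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

X_demo = np.random.rand(50, 4)
Y_demo = np.random.randint(0, 2, size=(50, 3))   # three binary labels
rf = RandomForestClassifier(n_estimators=10)
Y_cv, p, p_1, p_0 = roc_data(X_demo, Y_demo, rf, n_iter=5, test_size=0.2)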
Example #5
def _print_classification_results(classifier, regressors, response, regressors_test, response_test, regressor_names,
                                  messages):

    loo = LeaveOneOut()
    cv_score = cross_val_score(classifier, regressors, response, cv=loo.split(regressors))
    classifier.fit(regressors, response)
    messages.AddMessage("Adaboost classifier with " + str(classifier.n_estimators) + " estimators and learning rate "
                        + str(classifier.learning_rate))

    if regressors_test is None or response_test is None:
        regressors_test = regressors
        response_test = response
        t_set = "Train"
    else:
        t_set = "Test"

    messages.AddMessage("Score (" + t_set + " Set):" + str(classifier.score(regressors_test, response_test)))
    messages.AddMessage("Score (Leave one Out):" + str(cv_score.mean()))
    messages.AddMessage("Confusion Matrix (" + t_set + " Set):")

    confusion = confusion_matrix(response_test, classifier.predict(regressors_test))
    labels = ["Non Prospective", "Prospective"]
    row_format = "{:6}" + "{:^16}" * (len(labels) + 1)
    messages.AddMessage(row_format.format("", "", "Predicted", ""))
    messages.AddMessage(row_format.format("True", "", *labels))
    for label, row in zip(labels, confusion):
        messages.AddMessage(row_format.format("", label, *row))
    messages.AddMessage("Area Under the curve (AUC):" + str(roc_auc_score(response_test,
                                                            classifier.decision_function(regressors_test))))

    messages.AddMessage("Feature importances: ")
    importances = [[name, val] for name, val in zip(regressor_names, classifier.feature_importances_)]
    for elem in sorted(importances, key=lambda imp: imp[1], reverse=True):
        if elem[1] > 0:
            messages.AddMessage(elem[0] + ": \t" + str(elem[1]*100) + "%")
    return
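The messages argument follows the ArcGIS-style AddMessage interface; a hypothetical stub and synthetic run, assuming the module-level sklearn imports the function relies on:

import numpy as np
from sklearn.ensemble import AdaBoostClassifier

class StubMessages:
    # Hypothetical stand-in for the ArcGIS messages object.
    def AddMessage(self, msg):
        print(msg)

X_demo = np.random.rand(40, 3)
y_demo = np.random.randint(0, 2, size=40)
clf = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)
_print_classification_results(clf, X_demo, y_demo, None, None,
                              ['f0', 'f1', 'f2'], StubMessages())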
Example #6
    return outputs


l1 = add_layers(xs, 7, 20, activation_function=tf.nn.sigmoid)
predict = add_layers(l1, 20, 8, activation_function=tf.nn.softmax)

# Note: `predict` already has softmax applied, so passing it as `logits`
# applies softmax twice; ideally the last layer would output raw logits.
loss = tf.nn.softmax_cross_entropy_with_logits(labels=ys, logits=predict)
loss = tf.reduce_mean(loss)

train = tf.train.AdamOptimizer(0.01).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    output_res = []
    for i in range(20):
        loo = LeaveOneOut()
        lo = loo.split(X, y)
        pred = []
        for train_index, test_index in lo:
            train_X, train_Y = X[train_index], y[train_index]
            test_X, test_Y = X[test_index], y[test_index]
            sess.run(train, feed_dict={xs: train_X, ys: train_Y})
            test_res = tf.argmax(sess.run(predict, feed_dict={xs: test_X}), 1)
            res = sess.run(test_res)
            for r in res:
                pred.append(r)
        test_accuracy = accuracy_score(label, pred)  # 'label': integer class labels, defined outside this excerpt
        # print('----%s times------%f%%'%(i,test_accuracy*100))
        print(test_accuracy * 100)
        output_res.append(test_accuracy)
def allergies_distance_matrix(distance='spearman', clustering='spectral'):
    for i in range(0, df.shape[1]):
        for j in range(0, df.shape[1]):
            #Spearman correlation
            if distance == 'spearman':
                dist_mat.at[df.columns[i], df.columns[j]] = abs(
                    round(
                        scipy.stats.spearmanr(
                            np.array(df.iloc[:, i]).astype(float),
                            np.array(df.iloc[:, j]).astype(float))[0], 4))
            #Euclidean distance
            else:
                dist_mat.at[df.columns[i], df.columns[j]] = np.linalg.norm(
                    np.array(df.iloc[:, i]).astype(float) -
                    np.array(df.iloc[:, j]).astype(float))
    if clustering == 'spectral':
        clustering = SpectralClustering(n_clusters=2,
                                        affinity='precomputed',
                                        assign_labels='discretize',
                                        random_state=0)
    else:
        clustering = AgglomerativeClustering(affinity='precomputed',
                                             linkage='average')
    clustering.fit(dist_mat.values)
    bact_label1 = []
    bact_label0 = []
    bact_label = {0: [], 1: []}

    for i in range(0, df.shape[1]):
        if clustering.labels_[i] == 1:
            bact_label1.append(df.columns[i])
        else:
            bact_label0.append(df.columns[i])
    # Keep the dict in sync with the two lists; it is iterated over below.
    bact_label[0], bact_label[1] = bact_label0, bact_label1
    bact_label_name = {0: [], 1: []}
    bact_label_tmp = {0: [], 1: []}
    bact_level = level - 1
    for k in [0, 1]:
        for i in bact_label[k]:
            for key, value in dict_bact.items():
                for j in value:
                    if i == j:
                        bact_label_tmp[k].append(key)
        bact_label_tmp[k] = set(bact_label_tmp[k])
        for i in bact_label_tmp[k]:
            if i != 'else':
                for j in taxonomy:
                    try:
                        if j.split(';')[bact_level] == i:
                            bact_label_name[k].append(','.join(
                                j.split(';')[0:bact_level + 1]))
                            break
                    except:
                        continue
            else:
                bact_label_name[k].append('else')
        bact_label_name[k] = set(bact_label_name[k])
    df1 = df[bact_label1]
    df0 = df[bact_label0]
    pca = PCA(n_components=min(round(df0.shape[1] / 2) + 1, df0.shape[0]))
    pca.fit(df0)
    var_sum = 0
    num_comp = 0
    for (i, component) in enumerate(pca.explained_variance_ratio_):
        if var_sum <= 0.5:
            var_sum += component
        else:
            num_comp = i
            break
    if num_comp == 0:
        num_comp += 1

    otu_after_pca0, _ = apply_pca(df0, n_components=num_comp, print_data=False)
    merged_data0 = otu_after_pca0.join(mapping_file)
    X = merged_data0.drop(['disease'], axis=1)
    y = merged_data0['disease']
    loo = LeaveOneOut()
    accuracy = []
    y_pred_list = []
    for train_index, test_index in loo.split(X):
        train_index = list(train_index)
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        model = XGBClassifier(max_depth=5,
                              n_estimators=300,
                              learning_rate=15 / 100,
                              objective='binary:logistic',
                              scale_pos_weight=(np.sum(y_train == 0) /
                                                np.sum(y_train == 1)),
                              reg_lambda=450)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_list.append(y_pred)
    y_pred_train = model.predict(X_train)
    print('Train Precision: ' +
          str(round(precision_score(y_train, y_pred_train), 2)))
    print('Train Recall: ' +
          str(round(recall_score(y_train, y_pred_train), 2)))

    cnf_matrix = metrics.confusion_matrix(y_train, y_pred_train)
    class_names = ['Control', 'GVHD']
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=list(class_names),
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.show()

    y_pred_flat = np.ravel(y_pred_list)
    print('Precision: ' + str(round(precision_score(y, y_pred_flat), 2)))
    print('Recall: ' + str(round(recall_score(y, y_pred_flat), 2)))

    cnf_matrix = metrics.confusion_matrix(y, y_pred_flat)
    # # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=list(class_names),
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.show()
    #
    pca = PCA(n_components=min(round(df1.shape[1] / 2) + 1, df1.shape[0]))
    pca.fit(df1)
    var_sum = 0
    num_comp = 0
    for (i, component) in enumerate(pca.explained_variance_ratio_):
        if var_sum <= 0.5:
            var_sum += component
        else:
            num_comp = i
            break
    if num_comp == 0:
        num_comp += 1

    otu_after_pca1, _ = apply_pca(df1, n_components=num_comp, print_data=False)
    merged_data1 = otu_after_pca1.join(mapping_file)
    X = merged_data1.drop(['disease'], axis=1)
    y = merged_data1['disease']
    loo = LeaveOneOut()
    accuracy = []
    y_pred_list = []
    for train_index, test_index in loo.split(X):
        train_index = list(train_index)
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        model = XGBClassifier(max_depth=5,
                              n_estimators=300,
                              learning_rate=15 / 100,
                              objective='binary:logistic',
                              scale_pos_weight=(np.sum(y_train == 0) /
                                                np.sum(y_train == 1)),
                              reg_lambda=450)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_list.append(y_pred)

    y_pred_train = model.predict(X_train)
    print('Train Precision: ' +
          str(round(precision_score(y_train, y_pred_train), 2)))
    print('Train Recall: ' +
          str(round(recall_score(y_train, y_pred_train), 2)))

    cnf_matrix = metrics.confusion_matrix(y_train, y_pred_train)
    class_names = ['Control', 'GVHD']
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=list(class_names),
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.show()

    y_pred_flat = np.ravel(y_pred_list)
    print('Precision: ' + str(round(precision_score(y, y_pred_flat), 2)))
    print('Recall: ' + str(round(recall_score(y, y_pred_flat), 2)))

    cnf_matrix = metrics.confusion_matrix(y, y_pred_flat)
    # # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=list(class_names),
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.show()
Example #8
def pmo(alg_no, rm_38, type_filter, feature_selection):
    raw_data = np.loadtxt('data-PMO.csv', delimiter=',', skiprows=1)

    if rm_38:
        raw_data = raw_data[np.where(raw_data[:, 12] != 38.98)]

    index = np.array([], dtype=int)
    for it in type_filter:
        index = np.append(index, np.where(raw_data[:, it] == 1))
    raw_data = raw_data[index, :]
    data = preprocessing.StandardScaler().fit(raw_data).transform(raw_data)
    X = data[:, type_filter + feature_selection]
    y = data[:, 12]

    tot = np.linalg.norm(y - np.mean(y))**2

    mse_min = 10000
    hp_opt = 0
    py_opt = []

    for hp in range(1, 2):
        py = []
        ry = []

        loo = LeaveOneOut()
        for train, test in loo.split(X, y):
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]

            if alg_no == 0:
                # Linear regression
                model = linear_model.LinearRegression()
            elif alg_no == 1:
                # SVR
                model = svm.SVR()
            elif alg_no == 2:
                # KNR
                model = neighbors.KNeighborsRegressor(hp, weights='distance')
            else:
                # DTR
                model = tree.DecisionTreeRegressor(max_depth=hp,
                                                   random_state=0)

            model.fit(X_train, y_train)
            py.append(model.predict(X_test))
            ry.append(y_test)

        mse = np.linalg.norm(np.array(py) - np.array(ry))**2
        if mse < mse_min:
            mse_min = mse
            hp_opt = hp
            py_opt = py

    print(1 - mse_min / tot)
    print(hp_opt)
    plt.plot([min(np.min(y), np.min(py_opt)),
              max(np.max(y), np.max(py_opt))],
             [min(np.min(y), np.min(py_opt)),
              max(np.max(y), np.max(py_opt))])
    py_opt = np.array(py_opt)
    if 0 in type_filter:
        plt.scatter(y[np.where(raw_data[:, 0] == 1)],
                    py_opt[np.where(raw_data[:, 0] == 1)],
                    c='r',
                    label='20CrMnTi')
    if 1 in type_filter:
        plt.scatter(y[np.where(raw_data[:, 1] == 1)],
                    py_opt[np.where(raw_data[:, 1] == 1)],
                    c='g',
                    label='45#')
    if 2 in type_filter:
        plt.scatter(y[np.where(raw_data[:, 2] == 1)],
                    py_opt[np.where(raw_data[:, 2] == 1)],
                    c='b',
                    label='60Si2Mn')
    if 3 in type_filter:
        plt.scatter(y[np.where(raw_data[:, 3] == 1)],
                    py_opt[np.where(raw_data[:, 3] == 1)],
                    c='k',
                    label='AM2')
    if 4 in type_filter:
        plt.scatter(y[np.where(raw_data[:, 4] == 1)],
                    py_opt[np.where(raw_data[:, 4] == 1)],
                    c='y',
                    label='GCr15')
    if 5 in type_filter:
        plt.scatter(y[np.where(raw_data[:, 5] == 1)],
                    py_opt[np.where(raw_data[:, 5] == 1)],
                    c='c',
                    label='SA-210C')
    if 6 in type_filter:
        plt.scatter(y[np.where(raw_data[:, 6] == 1)],
                    py_opt[np.where(raw_data[:, 6] == 1)],
                    c='m',
                    label='ZTM-S2')
    plt.legend()
    plt.show()
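A hedged invocation sketch; the column indices below are placeholders, since the layout of data-PMO.csv is not shown in this snippet:

# Hypothetical call: alg_no=2 selects the k-nearest-neighbours regressor.
pmo(alg_no=2, rm_38=True, type_filter=[0, 1, 2], feature_selection=[7, 8, 9])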
Example #9
    def trainModelFV_LOOCV_Fusion(self, extension='*.*'):
        """
        This method contains the entire module
        required for training the Bag of Poses model
        Use of helper functions will be extensive.
        """
        self.name_dict, self.number_dict, self.count_class = self.file_helper.getLabelsFromFile(
            self.label_path)

        # read file. prepare file lists.
        self.files1, self.trainFilesCount1 = self.file_helper.getFilesFromDirectory(
            self.base_path, self.datasets, extension)

        self.files2, self.trainFilesCount2 = self.file_helper.getFilesFromDirectory(
            self.base_path2, self.datasets, extension)

        save = True
        self.parameters += 'Classifier Parameters\n'
        self.parameters += '%s' % self.classifier_helper.clf

        features_nd1 = np.asarray(self.files1)
        features_nd2 = np.asarray(self.files2)

        features_nd1.sort(axis=0)
        features_nd2.sort(axis=0)
        # build GMMs
        self.descriptor_list1 = []
        self.descriptor_list2 = []
        for f in features_nd1:
            feature = f[0]
            des1 = self.file_helper.loadFeaturesFromFile(feature)
            self.descriptor_list1.append(des1)

        for f in features_nd2:
            feature = f[0]
            des2 = self.file_helper.loadFeaturesFromFile(feature)
            self.descriptor_list2.append(des2)

        ft1 = self.classifier_helper.formatND(self.descriptor_list1)
        ft2 = self.classifier_helper.formatND(self.descriptor_list2)

        gmm1 = GMM(n_components=self.no_clusters,
                   covariance_type='diag',
                   verbose=0)
        gmm1.fit(ft1)

        gmm2 = GMM(n_components=self.no_clusters,
                   covariance_type='diag',
                   verbose=0)
        gmm2.fit(ft2)

        # Train Classifier
        loo = LeaveOneOut()
        predictions = []
        pre = []
        lab = []
        hits = 0
        c = 0
        for train, test in loo.split(features_nd1):
            feature_test_file1 = str(features_nd1[test][0][0])
            feature_test_file2 = str(features_nd2[test][0][0])

            class_name_test = feature_test_file1.split(os.sep)[-2]
            c += 1

            currentInvDate = datetime.datetime.now().strftime(
                "%d/%m/%Y %H:%M:%S")
            print('Step: %i/%i - %s\n%s\n%s' %
                  (c, features_nd1.shape[0], currentInvDate, feature_test_file1,
                   feature_test_file2))
            if c == 1 or c % 25 == 0:
                self.mail_helper.sendMail(
                    "Progress: %s - %s" % (self.test_name, self.OsName),
                    "Samples processed: %i" % c)

            self.descriptor_list1 = []
            self.descriptor_list2 = []
            self.train_labels = []
            for feature in features_nd1[train]:
                feature = feature[0]
                label_number = self.number_dict[feature.split(os.sep)[-2]]
                self.train_labels = np.append(self.train_labels, label_number)
                des1 = self.file_helper.loadFeaturesFromFile(feature)
                self.descriptor_list1.append(des1)

            for feature in features_nd2[train]:
                feature = feature[0]
                des2 = self.file_helper.loadFeaturesFromFile(feature)
                self.descriptor_list2.append(des2)

            # format data as nd array
            ft1 = self.classifier_helper.formatND(self.descriptor_list1)
            ft2 = self.classifier_helper.formatND(self.descriptor_list2)

            fv_dim1 = self.no_clusters + 2 * self.no_clusters * ft1.shape[1]
            fv_dim2 = self.no_clusters + 2 * self.no_clusters * ft2.shape[1]
            print(fv_dim1, fv_dim2)
            n_videos = train.shape[0]
            features1 = np.array([np.zeros(fv_dim1) for i in range(n_videos)])
            features2 = np.array([np.zeros(fv_dim2) for i in range(n_videos)])
            count1 = 0
            count2 = 0
            for i in range(n_videos):
                len_video1 = len(self.descriptor_list1[i])
                fv1 = fisher_vector(ft1[count1:count1 + len_video1], gmm1)
                features1[i] = fv1
                count1 += len_video1

                len_video2 = len(self.descriptor_list2[i])
                fv2 = fisher_vector(ft2[count2:count2 + len_video2], gmm2)
                features2[i] = fv2
                count2 += len_video2

            print(features1.shape)
            print('Data normalization. 1')
            scaler1 = StandardScaler()
            # train normalization
            features1 = scaler1.fit_transform(features1)
            features1 = power_normalize(features1, 0.5)
            features1 = L2_normalize(features1)

            print(features2.shape)
            print('Data normalization. 2')
            scaler2 = StandardScaler()
            # train normalization
            features2 = scaler2.fit_transform(features2)
            features2 = power_normalize(features2, 0.5)
            features2 = L2_normalize(features2)

            # real label
            lab.extend(
                [self.number_dict[feature_test_file1.split(os.sep)[-2]]])

            # test features 1
            feature_test1 = self.file_helper.loadFeaturesFromFile(
                feature_test_file1)
            test_fv1 = fisher_vector(feature_test1, gmm1)
            # train normalization
            test_fv1 = test_fv1.reshape(1, -1)
            test_fv1 = scaler1.transform(test_fv1)
            test_fv1 = power_normalize(test_fv1, 0.5)
            test_fv1 = L2_normalize(test_fv1)

            # test features 2
            feature_test2 = self.file_helper.loadFeaturesFromFile(
                feature_test_file2)
            test_fv2 = fisher_vector(feature_test2, gmm2)
            # train normalization
            test_fv2 = test_fv2.reshape(1, -1)
            test_fv2 = scaler2.transform(test_fv2)
            test_fv2 = power_normalize(test_fv2, 0.5)
            test_fv2 = L2_normalize(test_fv2)

            ## concatenate two fv test
            feature_test = np.concatenate((test_fv1, test_fv2),
                                          axis=1).reshape(1, -1)

            ## concatenate two fv train
            feature_train = np.concatenate((features1, features2), axis=1)

            # train classifiers
            self.classifier_helper.clf.fit(feature_train, self.train_labels)
            cl = int(self.classifier_helper.clf.predict(feature_test)[0])
            class_name_predict = self.name_dict[str(cl)]
            if class_name_test == class_name_predict:
                hits += 1

            error = c - hits
            msg_progress = 'Hits: %i/%i  -  Accuracy: %.4f  -   Error: %i\n\n' % (
                hits, c, hits / c, error)

            print(msg_progress)
            if c % 25 == 0:
                self.mail_helper.sendMail(
                    "Progress: %s - %s" % (self.test_name, self.OsName),
                    msg_progress)

            if error > 40:
                save = False
                print('Error exceeded')
                break

            # predicted label
            pre.extend([cl])
            predictions.append({
                'image1': feature_test_file1,
                'image2': feature_test_file2,
                'class': cl,
                'object_name': self.name_dict[str(cl)]
            })

        if save:
            self.saveResults(predictions, pre, lab, features_nd1.shape[0])
Example #10
            bar[i] = "T"
            output_test = "{}({}: {}) ".format(output_test, i, data[i])

        print("[ {} ]".format(" ".join(bar)))
        print("Train: {}".format(output_train))
        print("Test:  {}\n".format(output_test))


# Create some data to split with
data = numpy.array([[1, 2], [3, 4], [5, 6], [7, 8]])

# Our two methods
loocv = LeaveOneOut()
lpocv = LeavePOut(p=P_VAL)

split_loocv = loocv.split(data)
split_lpocv = lpocv.split(data)

print("""\
The Leave-P-Out method works by using every combination of P points as test data.

The following output shows the result of splitting some sample data by Leave-One-Out and Leave-P-Out methods.
A bar displaying the current train-test split, as well as the actual data points, is shown for each split.
In the bar, "-" is a training point and "T" is a test point.
""")

print("Data:\n{}\n".format(data))

print("Leave-One-Out:\n")
print_result(split_loocv)
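Only the tail of print_result appears at the top of this example; a plausible reconstruction of the whole helper, inferred from that tail (assumed, not from the original):

def print_result(split_data):
    # Render each train/test split as a bar of "-" (train) and "T" (test).
    for train, test in split_data:
        bar = ["-"] * len(data)
        output_train = ""
        output_test = ""
        for i in train:
            output_train = "{}({}: {}) ".format(output_train, i, data[i])
        for i in test:
            bar[i] = "T"
            output_test = "{}({}: {}) ".format(output_test, i, data[i])

        print("[ {} ]".format(" ".join(bar)))
        print("Train: {}".format(output_train))
        print("Test:  {}\n".format(output_test))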
def main():
    feature_array_all = np.loadtxt('x_189.txt', dtype=np.float32)
    f = open("y.txt", "rb")
    label_vector = f.read().decode()
    label_vector = list(label_vector)
    f.close()
    label_vector = np.array(label_vector, dtype=np.float32)

    loo = LeaveOneOut()
    X = feature_array_all
    y = label_vector
    predict_y_test = np.empty(0)
    predictions_test = np.empty(0)
    for train_index, test_index in loo.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = svm.SVC(probability=True,
                      C=20.6913808111479,
                      gamma=0.25118864315095824)
        clf = clf.fit(X_train, y_train)
        score_r = clf.score(X_test, y_test)
        predict_y_test_single = clf.predict(X_test)
        predict_y_test = np.append(predict_y_test,
                                   predict_y_test_single,
                                   axis=None)
        prob_predict_y_test = clf.predict_proba(X_test)
        predictions_test_single = prob_predict_y_test[:, 1]
        predictions_test = np.append(predictions_test,
                                     predictions_test_single,
                                     axis=None)
        print('Sequence ' + str(test_index[0] + 1) +
              ' has finished. (1805 sequences in total)')

    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for i in range(0, len(y)):
        if int(y[i]) == 1 and int(predict_y_test[i]) == 1:
            TP = TP + 1
        elif int(y[i]) == 1 and int(predict_y_test[i]) == 0:
            FN = FN + 1
        elif int(y[i]) == 0 and int(predict_y_test[i]) == 0:
            TN = TN + 1
        elif int(y[i]) == 0 and int(predict_y_test[i]) == 1:
            FP = FP + 1
    Sn = float(TP) / (TP + FN)
    Sp = float(TN) / (TN + FP)
    ACC = float((TP + TN)) / (TP + TN + FP + FN)
    y_validation = np.array(y, dtype=int)
    fpr, tpr, thresholds = metrics.roc_curve(y_validation,
                                             predictions_test,
                                             pos_label=1)
    roc_auc = auc(fpr, tpr)
    F1 = metrics.f1_score(y_validation, np.array(predict_y_test, int))
    MCC = metrics.matthews_corrcoef(y_validation,
                                    np.array(predict_y_test, int))
    print('svm ACC:%s' % ACC)
    print('svm AUC:%s' % roc_auc)
    print('svm Sn:%s' % Sn)
    print('svm Sp:%s' % Sp)
    print('svm F1:%s' % F1)
    print('svm MCC:%s' % MCC)
Example #12
def calculate_concrete_IH(X, y, full, clfList):
    ndata = X.shape[0]
    numClf = len(clfList)  # Num of classifiers
    knn_clf = KNeighborsClassifier(
        int(np.floor(np.sqrt(ndata) / 2)))  # k = sqrt(n)/2, cast to int
    tree_clf = DecisionTreeClassifier(max_depth=5)
    nb_clf = GaussianNB()
    lr_clf = LogisticRegression()
    lda_clf = LinearDiscriminantAnalysis()
    qda_clf = QuadraticDiscriminantAnalysis()

    # Matrix that record misclassification
    misclf_matrix = np.zeros((ndata, numClf))

    # If full = True, perform Leave-one-out cross validation for all classifiers
    if full == True:
        loo = LeaveOneOut()
        for train_index, test_index in loo.split(X):

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Classifier 0: kNN
            if 0 in clfList:
                knn_clf.fit(X_train, y_train)
                pred_knn = knn_clf.predict(X_test)
                if pred_knn != y_test:
                    misclf_matrix[test_index[0]][0] = 1

            # Classifier 1: Decision Tree
            if 1 in clfList:
                tree_clf.fit(X_train, y_train)
                pred_tree = tree_clf.predict(X_test)
                if pred_tree != y_test:
                    misclf_matrix[test_index[0]][1] = 1

            # Classifier 2: Naive Bayes
            if 2 in clfList:
                nb_clf.fit(X_train, y_train)
                pred_nb = nb_clf.predict(X_test)
                if pred_nb != y_test:
                    misclf_matrix[test_index[0]][2] = 1

            # Classifier 3: Logistic Regression
            if 3 in clfList:
                lr_clf.fit(X_train, y_train)
                pred_lr = lr_clf.predict(X_test)
                if pred_lr != y_test:
                    misclf_matrix[test_index[0]][3] = 1

            # Classifier 4: LDA
            if 4 in clfList:
                lda_clf.fit(X_train, y_train)
                pred_lda = lda_clf.predict(X_test)
                if pred_lda != y_test:
                    misclf_matrix[test_index[0]][4] = 1

            # Classifier 5: QDA
            if 5 in clfList:
                qda_clf.fit(X_train, y_train)
                pred_qda = qda_clf.predict(X_test)
                if pred_qda != y_test:
                    misclf_matrix[test_index[0]][5] = 1

        # Average misclassification rate per sample, computed once after
        # the LOO loop has filled misclf_matrix.
        ih_vector = np.zeros(ndata)
        for i in range(ndata):
            ih_vector[i] = sum(misclf_matrix[i, :]) / numClf

        return ih_vector, misclf_matrix

    # else perform niter by nfolds (default is 5 by 10) fold cross validation
    else:
        niter = 5  # Num of iterations
        nfolds = 10
        misclf = np.zeros(
            (ndata, numClf, niter)
        )  # For each data, misclassif by each classifier on each iteration

        for randseed in range(niter):
            np.random.seed(randseed)
            kf = KFold(n_splits=nfolds, shuffle=True)
            fold = 0
            for tr_idx, test_idx in kf.split(X):

                X_train, X_test = X[tr_idx], X[test_idx]
                y_train, y_test = y[tr_idx], y[test_idx]

                # Classifier 0: kNN
                if 0 in clfList:
                    knn_clf.fit(X_train, y_train)
                    pred_knn = knn_clf.predict(X_test)
                    for i in range(len(test_idx)):
                        if pred_knn[i] != y_test[i]:
                            misclf[test_idx[i]][0][randseed] = 1

                # Classifier 1: Decision Tree
                if 1 in clfList:
                    tree_clf.fit(X_train, y_train)
                    pred_tree = tree_clf.predict(X_test)
                    for i in range(len(test_idx)):
                        if pred_tree[i] != y_test[i]:
                            misclf[test_idx[i]][1][randseed] = 1

                # Classifier 2: Naive Bayes
                if 2 in clfList:
                    nb_clf.fit(X_train, y_train)
                    pred_nb = nb_clf.predict(X_test)
                    for i in range(len(test_idx)):
                        if pred_nb[i] != y_test[i]:
                            misclf[test_idx[i]][2][randseed] = 1

                # Classifier 3: Logistic Regression
                if 3 in clfList:
                    lr_clf.fit(X_train, y_train)
                    pred_lr = lr_clf.predict(X_test)
                    for i in range(len(test_idx)):
                        if pred_lr[i] != y_test[i]:
                            misclf[test_idx[i]][3][randseed] = 1

                # Classifier 4: LDA
                if 4 in clfList:
                    lda_clf.fit(X_train, y_train)
                    pred_lda = lda_clf.predict(X_test)
                    for i in range(len(test_idx)):
                        if pred_lda[i] != y_test[i]:
                            misclf[test_idx[i]][4][randseed] = 1

                # Classifier 5: QDA
                if 5 in clfList:
                    qda_clf.fit(X_train, y_train)
                    pred_qda = qda_clf.predict(X_test)
                    for i in range(len(test_idx)):
                        if pred_qda[i] != y_test[i]:
                            misclf[test_idx[i]][5][randseed] = 1

                fold = fold + 1

        ih_vector = np.zeros(ndata)
        for i in range(ndata):
            ih_vector[i] = sum(sum(misclf[i])) / (
                numClf * niter
            )  # Avg of matrix with numClf classifiers and niter iterations

        return ih_vector, misclf
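A small usage sketch on synthetic data, assuming the module-level sklearn imports the function relies on; classifier ids 0, 1, 2 select kNN, decision tree and naive Bayes:

import numpy as np

X_demo = np.random.rand(30, 4)
y_demo = np.random.randint(0, 2, size=30)
ih, mis = calculate_concrete_IH(X_demo, y_demo, full=True, clfList=[0, 1, 2])
print(ih[:5])   # per-instance hardness in [0, 1]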
    def fit(self, df):
        """Train the model using the given Pandas dataframe df as input. The dataframe
        has a hierarchical index where the outer index (ID) is over individuals,
        and the inner index (Time) is over time points. Many features are available.
        There are five binary label columns:

        ['label:LYING_DOWN', 'label:SITTING', 'label:FIX_walking', 'label:TALKING', 'label:OR_standing']

        The dataframe contains both missing feature values and missing label values
        indicated by nans. Your goal is to design and fit a probabilistic forecasting
        model that when given a dataframe containing a sequence of incomplete observations
        and a time stamp t, outputs the probability that each label is active (e.g.,
        equal to 1) at time t.

        Arguments:
            df: A Pandas data frame containing the feature and label data
        """
        """#print(df.to_string())
        print(df)
        print(df.shape)
        #df.to_excel("data.xlsx")
        print(df.columns[1:-5].values) #features -- without timestamp
        print(df.index.names)
        #with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(df.loc[pd.IndexSlice[2, :],:'discrete:time_of_day:between21and3'])
        #print(df.loc(axis=0)[pd.IndexSlice[0, :]])"""

        df_timestamps = df.iloc[:, 0]
        df_features = df.iloc[:, 1:-5]
        df_output = df.iloc[:, -5:]
        loo = LeaveOneOut()
        batch_size = 1
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        lr = 0.005
        criterion = nn.BCELoss()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

        for train, test in loo.split(range(len(df.groupby(level=0)))):
            df_train = df.loc[pd.IndexSlice[train, :], :]
            df_test = df.loc[pd.IndexSlice[test, :], :]
            df_test_timestamps = df_test.iloc[:, 0].values
            df_test_features = df_test.iloc[:, 1:-5].values
            df_test_output = df_test.iloc[:, -5:].values
            df_test_t = df_test_timestamps[-1] + torch.randint(
                low=1, high=61, size=(1, ))
            #sub = df_test_timestamps[0]
            #df_test_timestamps = [i - sub + 1 for i in df_test_timestamps]
            #df_test_features = (df_test_features.T*df_test_timestamps).T
            #df_train_timestamps, df_train_features, df_train_output = df_train.iloc[:,0], df_train.iloc[:,1:-5], df_train.iloc[:,-5:]

            for train_index in train:
                train_values = df_train.loc[pd.IndexSlice[train_index, :], :].values
                df_train_timestamps = train_values[:, 0]
                df_train_features = train_values[:, 1:-5]
                df_train_output = train_values[:, -5:]
                print(train_index)
                sub = df_train_timestamps[0]
                df_train_timestamps = [
                    i - sub + 1 for i in df_train_timestamps
                ]
                df_train_features = (df_train_features.T *
                                     df_train_timestamps).T
                #print(df_train_features.shape)
                #print(df_train_output.shape)

                for train_tuple in range(df_train_features.shape[0]):
                    sample = np.reshape(df_train_features[train_tuple, :],
                                        (1, 1, df_train_features.shape[1]))
                    result = np.reshape(df_train_output[train_tuple, :],
                                        (1, 1, df_train_output.shape[1]))
                    #print(df_train_output.shape)
                    self.model.train()
                    optimizer.zero_grad()
                    out = self.model(torch.tensor(sample).float())
                    loss = criterion(out, torch.tensor(result).float())
                    #print(loss)
                    loss.backward()
                    optimizer.step()

                    #input()

            self.forecast(df_test, df_test_t)
Example #14
def computeCVROC(df, model, outcomeVar, predVars, nFolds=10, LOO=False):
    """Apply model to df and return performance metrics in a cross-validation framework.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain outcome and predictor variables.
    model : sklearn or other model
        Model must have fit and predict methods.
    outcomeVar : str
    predVars : ndarray or list
        Predictor variables in the model.
    nFolds : int
        N-fold cross-validation (not required for LOO)

    Returns
    -------
    fpr : np.ndarray
        Pre-specified vector of FPR thresholds for interpolation
        fpr = np.linspace(0, 1, 100)
    meanTPR : np.ndarray
        Mean true-positive rate in test fraction.
    auc : float
        Area under the mean ROC curve.
    acc : float
        Mean accuracy score in test fraction.
    results : returned by model.fit()
        Training model results object for each fold
    prob : pd.Series
        Mean predicted probabilities on test data with index from df
    success : bool
        An indicator of whether the cross-validation was completed."""

    if not isinstance(predVars, list):
        predVars = list(predVars)
    
    tmp = df[[outcomeVar] + predVars].dropna()
    X,y = tmp[predVars].astype(float), tmp[outcomeVar].astype(float)

    if LOO:
        cv = LeaveOneOut()
        nFolds = cv.get_n_splits(X)
        cv_iter = cv.split(X, y)
    else:
        cv = StratifiedKFold(n_splits=nFolds, shuffle=True)
        cv_iter = cv.split(X=X, y=y)
    
    fpr = np.linspace(0, 1, 100)
    tpr = np.nan * np.zeros((fpr.shape[0], nFolds))
    acc = np.nan * np.zeros(nFolds)
    auc = np.nan * np.zeros(nFolds)
    coefs = []
    probs = []

    for outi, (trainInd, testInd) in enumerate(cv_iter):
        Xtrain, Xtest = X.iloc[trainInd], X.iloc[testInd]
        ytrain, ytest = y.iloc[trainInd], y.iloc[testInd]

        results = model.fit(X=Xtrain, y=ytrain)
        prob = results.predict_proba(Xtest)
        
        class1Ind = np.nonzero(results.classes_ == 1)[0][0]
        fprTest, tprTest, _ = sklearn.metrics.roc_curve(ytest, prob[:, class1Ind])

        tpr[:, outi] = np.interp(fpr, fprTest, tprTest)
        auc[outi] = sklearn.metrics.auc(fprTest, tprTest)
        acc[outi] = sklearn.metrics.accuracy_score(ytest, np.round(prob[:, class1Ind]), normalize=True)
        coefs.append(results.coef_[None,:])
        probs.append(pd.Series(prob[:, class1Ind], index=Xtest.index))
    
    meanTPR = np.mean(tpr, axis=1)
    meanTPR[0], meanTPR[-1] = 0, 1
    meanACC = np.mean(acc)
    meanAUC = sklearn.metrics.auc(fpr, meanTPR)
    
    """Compute mean probability over test predictions in CV"""
    probS = pd.concat(probs).groupby(level=0).agg(np.mean)
    probS.name = 'Prob'

    """Refit all the data for final model"""
    result = model.fit(X=X, y=y)

    rocRes = rocStats(y, np.round(probS))
    
    outD = {'fpr':fpr,                      # (100, ) average FPR for ROC
            'tpr':meanTPR,                  # (100, ) average TPR for ROC
            'AUC':auc,                      # (CVfolds, ) AUC of ROC for each outer test fold
            'mAUC': meanAUC,                # (1, ) AUC of the average ROC
            'mACC': np.mean(acc),
            'ACC':acc,                      # (CVfolds, ) accuracy across outer test folds
            'finalResult': result,          # final fitted model with predict() exposed
            'prob':probS,                   # (N,) pd.Series of predicted probabilities avg over outer folds
            'coefs':np.concatenate(coefs),  # (CVfolds, predVars)
            'Xvars':predVars,
            'Yvar':outcomeVar,
            'nFolds':nFolds,
            'LOO':'Yes' if LOO else 'No',
            'N':tmp.shape[0]}                  
    outD.update(rocRes[['Sensitivity', 'Specificity']].to_dict())
    return outD
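A hedged usage sketch; computeCVROC also calls a rocStats helper from the same module, so this assumes both are importable:

import numpy as np
import pandas as pd
import sklearn.linear_model

df_demo = pd.DataFrame(np.random.rand(60, 3), columns=['x1', 'x2', 'x3'])
df_demo['outcome'] = np.random.randint(0, 2, size=60)
model = sklearn.linear_model.LogisticRegression()
res = computeCVROC(df_demo, model, 'outcome', ['x1', 'x2', 'x3'], nFolds=5)
print(res['mAUC'], res['mACC'])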
            bar[i] = "T"
            output_test = "{}({}: {}) ".format(output_test, i, data[i])
            
        print("[ {} ]".format(" ".join(bar)))
        print("Train: {}".format(output_train))
        print("Test:  {}\n".format(output_test))


# Create some data to split with
data = numpy.array([[1, 2], [3, 4], [5, 6], [7, 8]])

# Our two methods
loocv = LeaveOneOut()
lpocv = LeavePOut(p=P_VAL)

split_loocv = loocv.split(data)
split_lpocv = lpocv.split(data)

print("""\
The Leave-P-Out method works by using every combination of P points as test data.

The following output shows the result of splitting some sample data by Leave-One-Out and Leave-P-Out methods.
A bar displaying the current train-test split as well as the actual data points are displayed for each split.
In the bar, "-" is a training point and "T" is a test point.
""")

print("Data:\n{}\n".format(data))

print("Leave-One-Out:\n")
print_result(split_loocv)
Example #16
File: 3.4a.py Project: kelent/-
#Iris data cross-validation

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import LeaveOneOut
from sklearn import datasets
import numpy as np
import pandas as pd

#tmp=pd.read_csv('iris.data',sep=',')
#iris=np.loadtxt('iris.data', delimiter=',')
iris = datasets.load_iris()
x = iris['data'][0:149]    # note: keeps only the first 149 of the 150 samples
y = iris['target'][0:149]


log_model = LogisticRegression()
m = np.shape(x)[0]

y_pred = cross_val_predict(log_model, x, y, cv=10)
print(metrics.accuracy_score(y, y_pred))
#print(y_pred)

loo = LeaveOneOut()
accuracy = 0
for train, test in loo.split(x):
    log_model.fit(x[train], y[train])
    y_pred1 = log_model.predict(x[test])
    if y_pred1 == y[test]:
        accuracy += 1
print(accuracy / m)
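The manual loop above can also be expressed with cross_val_score driving the same leave-one-out splits:

from sklearn.model_selection import cross_val_score
scores = cross_val_score(log_model, x, y, cv=LeaveOneOut())
print(scores.mean())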
y = iris.target

knn = KNeighborsClassifier()

# ==== Leave-one-out validation ====

from sklearn.model_selection import LeaveOneOut
# Instatiate `LeaveOneOut` class. See [here](http://scikit-learn.org/stable/modules/cross_validation.html#leave-one-out-loo)
# and [here](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LeaveOneOut.html#sklearn.model_selection.LeaveOneOut)
# for more details.
loo = LeaveOneOut()
# Keep track of successful predictions
successes = []

# the `split` method generates indices to split data into training and test set.
for train_index, test_index in loo.split(X):
    # `fit` classifier on training indices
    knn.fit(X[train_index], y[train_index])
    # `score` classifier on testing indices; since there will be only one
    # test index, the score will be either 1 (for a correct prediction) or
    # 0 (for an incorrect prediction).
    successes.append(knn.score(X[test_index], y[test_index]))
# Divide `successes` by the sample size to get the percentage score.
print("Accuracy for iris dataset with Leave-One-Out validation is {}.\n".
      format(np.mean(successes)))

# ==== Random permutation cross validation ====

from sklearn.model_selection import ShuffleSplit
# Instantiate ShuffleSplit class with `n_splits` (number of repetitions) and
# `test_size` (percentage of dataset to withhold for the test data). See
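The snippet is cut off here; a minimal continuation sketch mirroring the leave-one-out loop above, with ShuffleSplit parameters chosen for illustration:

shuffle_split = ShuffleSplit(n_splits=10, test_size=0.3)
scores = []
for train_index, test_index in shuffle_split.split(X):
    knn.fit(X[train_index], y[train_index])
    scores.append(knn.score(X[test_index], y[test_index]))
print("Accuracy for iris dataset with random permutation cross validation "
      "is {}.\n".format(np.mean(scores)))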
Example #18
#                # # Plot normalized confusion matrix
#                plt.figure()
#                plot_confusion_matrix(cnf_matrix, classes=list(class_names), normalize=True,
#                                      title='Normalized confusion matrix')
#
#                plt.show()
#                scores = np.array(metrics.accuracy_score(y, y_pred_list))
#                print(md, ne, lr, round(scores.mean(), 2), round(scores.std(), 2) * 2)

#SVM

clf = svm.SVC(kernel='linear', probability=True)
auc = []
y_pred1 = []
y_test1 = []
for train_index, test_index in loo.split(X):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)[:, 1]
    y_pred1.append(y_pred)
    y_test1.append(y_test.values[0])
W = clf.coef_[0]
try:
    df = pd.DataFrame({
        'Taxonome': preproccessed_data.columns[3:],
        'Coefficients': np.dot(clf.coef_[0], pca_components)
    })
except:
    df = pd.DataFrame({
        'Taxonome': preproccessed_data.columns[3:],
Example #19
CrossValidation5Fold = BaseCrossValidation(
    n_split='5',
    description="To determine the hyper-parameters (e.g. the number of "
    "features) of the model, we applied 5-fold cross-validation "
    "on the training data set. The hyper-parameters were set "
    "according to the model performance on the validation data set. ")
CrossValidation10Fold = BaseCrossValidation(
    n_split='10',
    description="To determine the hyper-parameters (e.g. the number of "
    "features) of the model, we applied 10-fold cross-validation "
    "on the training data set. The hyper-parameters were set "
    "according to the model performance on the validation data set. ")
CrossValidationLOO = BaseCrossValidation(
    n_split='all',
    description="To determine the hyper-parameters (e.g. the number of features) "
    "of the model, we applied leave-one-out cross-validation on the "
    "training data set. The hyper-parameters were set according to "
    "the model performance on the validation data set. ")

if __name__ == '__main__':
    import numpy as np

    data = np.random.random((100, 10))
    label = np.concatenate((np.ones((60, )), np.zeros((40, ))), axis=0)

    cv = LeaveOneOut()
    for train, val in cv.split(data, label):
        print(train)
        print(val)
        print('')
        if args.fold < 0:
            logger.info("Validation set")
            random.shuffle(duplicate_reports)

            sample_size = int(len(duplicate_reports) * 0.05)
            data_alpha = sorted(duplicate_reports[:sample_size],
                                key=lambda bug_id: int(bug_id))

            splits = [(sorted(duplicate_reports[sample_size:],
                              key=lambda bug_id: int(bug_id)), data_alpha)]
        elif args.fold == 0:
            logger.info("Leave one out")
            loo = LeaveOneOut()

            splits = loo.split(duplicate_reports)
        else:
            logger.info("K-folds: {}".format(args.fold))
            kf = KFold(n_splits=args.fold)

            splits = kf.split(duplicate_reports)

        base_filename = path.splitext(path.split(args.dt)[1])[0] + '_' + args.l
        max_bug_id = max(
            map(lambda bug_id: int(bug_id), trainingDataset.bugIds))
        masterSetById = bugReportDatabase.getMasterSetById(
            trainingDataset.bugIds)
        map_by_alpha = []
        n_queries = 0
        # Preprocess the reports before writing REP input file
        rep_reports = generate_input_vec(bugReportDatabase, max_bug_id)
Example #21
    model[j].add(Conv2D(j*32+32,kernel_size=5,activation='relu'))
    """
    model[j].add(MaxPool2D())
    model[j].add(Flatten())
    model[j].add(Dense(256, activation='relu'))
    model[j].add(Dense(2, activation='softmax'))
    model[j].compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])    
    
#LEAVE ONE OUT CROSS-VALIDATION 
history = [0] * nets
names = ["8 maps","16 maps","24 maps","32 maps","48 maps","64 maps"]
cv_result = []

for j in range(nets):
    cv_result = []
    for train_index, test_index in loo.split(X_train):
        clf = model[j]
        X_train2, X_val2 = X_train[train_index], X_train[test_index]
        Y_train2, Y_val2 = Y_train[train_index], Y_train[test_index]
        acc = clf.fit(X_train2,Y_train2, batch_size=None, epochs=5, verbose=0,
                      validation_data=(X_val2, Y_val2), workers=1, callbacks=[annealer])
        cv_result.append(acc.history['val_accuracy'])
    history[j] = np.mean(cv_result)
    print("CNN {0}: Validation accuracy={1:.5f}".format(names[j], history[j]))
    
#EXPERIMENT 3
nets = 4
model = [0] *nets
for j in range(4):
    model[j] = Sequential()
    model[j].add(Conv2D(j*8+8,kernel_size=5,activation='relu',input_shape=(64,64,1)))
Example #22
def main():
    x1 = getMatrix("data/Train915/result/negative/pssm_profile_uniref50")
    x2 = getMatrix("data/Train915/result/positive/pssm_profile_uniref50")
    x = np.vstack((x1, x2))
    y = [-1 for i in range(x1.shape[0])]
    y.extend([1 for i in range(x2.shape[0])])
    y = np.array(y)
    #
    N = x.shape[1]
    print(int(sqrt(N).real), N // 5, int(log(N, 2).real), N // 3, N // 2,
          N // 4, N // 10)
    param_grid = {
        'max_features': [
            int(sqrt(N).real), N // 5,
            int(log(N, 2).real), N // 3, N // 2, N // 4, N // 10
        ]
    }
    gs = GridSearchCV(RandomForestClassifier(n_estimators=1000,
                                             random_state=1),
                      param_grid,
                      cv=10)
    gs.fit(x, y)
    print(gs.best_estimator_)
    print(gs.best_score_)

    #
    clf = gs.best_estimator_
    loo = LeaveOneOut()
    score = cross_val_score(clf, x, y, cv=loo).mean()
    print("LOO:{}".format(score))
    #
    loo_probas_y = []  # predicted class probabilities per LOO fold
    loo_test_y = []  # true labels in LOO order
    loo_predict_y = []  # predicted labels per LOO fold
    for train, test in loo.split(x):
        clf.fit(x[train], y[train])
        loo_predict_y.extend(clf.predict(x[test]))  # predicted label
        loo_probas_y.extend(clf.predict_proba(x[test]))  # class probabilities
        loo_test_y.extend(y[test])  # true label
    loo_probas_y = np.array(loo_probas_y)
    loo_test_y = np.array(loo_test_y)
    print(loo_probas_y.shape)
    #np.savetxt("915-RFclassification-DWT-LOO-probas_y.csv", loo_probas_y, delimiter=",")
    #np.savetxt("915-RFclassification-DWT-LOO-test_y.csv", loo_test_y, delimiter=",")
    #
    confusion = sklearn.metrics.confusion_matrix(loo_test_y, loo_predict_y)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    print("ROC:{}".format(roc_auc_score(loo_test_y, loo_probas_y[:, 1])))
    print("SP:{}".format(TN / (TN + FP)))
    print("SN:{}".format(TP / (TP + FN)))
    n = (TP * TN - FP * FN) / (((TP + FP) * (TP + FN) * (TN + FP) *
                                (TN + FN))**0.5)
    print("PRE:{}".format(TP / (TP + FP)))
    print("MCC:{}".format(n))
    print("F-score:{}".format((2 * TP) / (2 * TP + FP + FN)))
    print("ACC:{}".format((TP + TN) / (TP + FP + TN + FN)))

    #
    test_x1 = getMatrix("data/Test850/result/negative/pssm_profile_uniref50")
    test_x2 = getMatrix("data/Test850/result/positive/pssm_profile_uniref50")
    test_x = np.vstack((test_x1, test_x2))
    test_y = [-1 for i in range(test_x1.shape[0])]
    test_y.extend([1 for i in range(test_x2.shape[0])])
    clf = gs.best_estimator_
    clf.fit(x, y)
    predict_y = clf.predict(test_x)
    print("IND:{}".format(accuracy_score(test_y, predict_y)))
Example #23
print('\nExperiment 7 - Fit a 5th order polynomial to female400')
print("Mean square Error - Female400 - Linear : %.3f" % mean_squared_error(train_time_female400, pred_time_female400_all))
print("Mean square Error - Female400 - Degree 3 : %.3f" % mean_squared_error(train_time_female400, pred_time_female400_all_3))
print("Mean square Error - Female400 - Degree 5 : %.3f" % mean_squared_error(train_time_female400, pred_time_female400_all_5))
print("The error does not improve,"
      "And slightly increases from degree 3 to degree 5")

# Experiment 8
# Use LOOCV for both 3rd and 5th order polynomials
# Reference - http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LeaveOneOut.html

print('\nExperiment 8 - Use LOOCV for both 3rd and 5th order polynomials')
poly_model_3_loo = make_pipeline(PolynomialFeatures(degree=3), LinearRegression(fit_intercept=False))
results_3 = []
loo = LeaveOneOut()
for train_index, test_index in loo.split(train_year_female400):
    train_year_f400_loo, test_year_f400_loo = train_year_female400[train_index], train_year_female400[test_index]
    train_time_f400_loo, test_time_f400_loo = train_time_female400[train_index], train_time_female400[test_index]
    poly_model_3_loo.fit(train_year_f400_loo, train_time_f400_loo)
    results_3.append(mean_squared_error(test_time_f400_loo, poly_model_3_loo.predict(test_year_f400_loo)))

formatted_result_3 = ["%.3f" % item for item in results_3]
pred_time_f400_all_3_loo = poly_model_3_loo.predict(train_year_female400)
# Plot to check fit
plt.scatter(train_year_female400, train_time_female400, c='r')
plt.plot(train_year_female400, pred_time_f400_all_3_loo, c='b')
plt.xlabel('Year')
plt.ylabel('Time(seconds)')
plt.title('Exp 8 - Use LOOCV for 3rd order polynomial')
plt.show()
print("Mean square Error - Female400 - Degree 3 - LOOCV : %.3f" % mean_squared_error(train_time_female400, pred_time_f400_all_3_loo))
Example #24
def main():
    ######################### Defining_Autism_brains #########################
    autism_ids = [28853, 28855, 28856, 28857, 28859, 28860, 28861, 28864,
                  28865, 28866, 28871, 28872, 28873, 28874, 28875, 28876,
                  28879, 28885, 28887, 28890, 28896, 28897, 28898, 28899,
                  28901, 28903, 28905, 28906, 28907, 28908, 28909]
    n = len(autism_ids)  # 31 subjects
    A = numpy.zeros((90, 90, n))
    for idx, subject_id in enumerate(autism_ids):
        A[:, :, idx] = numpy.loadtxt(open("%d.csv" % subject_id, "rb"),
                                     delimiter=",")
    ############################Defining_Normal_brains#################################
    normal_ids = [
        28854, 28858, 28862, 28863, 28867, 28868, 28870, 28877, 28878,
        28880, 28881, 28882, 28883, 28886, 28888, 28889, 28891, 28892,
        28893, 28894, 28895, 28900, 28902, 28904
    ]
    m = len(normal_ids)  # 24 normal brains
    B = numpy.zeros((90, 90, m))
    for i, brain_id in enumerate(normal_ids):
        B[:, :, i] = numpy.loadtxt(open("%d.csv" % brain_id, "rb"),
                                   delimiter=",")
    ###################################################################################
    # Define the Autism and Normal feature matrices after computing the four measures
    Autism = caculate_matrices(A, n)  # the four measures for the 31 autism brains
    Normal = caculate_matrices(B, m)  # the four measures for the 24 normal brains
    # Combine the two matrices into one (All_brains_matrices)
    All_brains_matrices = numpy.concatenate((Autism, Normal), axis=0)
    print("All_Brain_Matrices is ", All_brains_matrices)
    print("Dim_All_Brain_Matrices is", All_brains_matrices.shape)
    #############################################################################
    X = All_brains_matrices
    y = numpy.zeros(55)
    y[0:31] = 1
    y[31:55] = 0
    ################################ Leave-One-Out ####################################################
    loo = LeaveOneOut()
    score = numpy.zeros(55)
    count = 0
    for train_index, test_index in loo.split(X):
        train_X, test_X = X[train_index], X[test_index]
        train_y, test_y = y[train_index], y[test_index]
        clf = svm.SVC(kernel='linear', C=1,
                      probability=True).fit(train_X, train_y)
        # keep test_X 2-D (shape (1, 4)) so predict_proba accepts it
        probs = clf.predict_proba(test_X)
        score[count] = probs[0, 0]
        count += 1

############ Alessandro's ROC #############
    roc_x = []
    roc_y = []
    min_score = min(score)
    max_score = max(score)
    thr = numpy.linspace(min_score, max_score, 30)
    FP = 0
    TP = 0
    P = sum(y)
    N = len(y) - P

    for T in thr:
        for i in range(0, len(score)):
            if (score[i] > T):
                if (y[i] == 1):
                    TP = TP + 1
                if (y[i] == 0):
                    FP = FP + 1
        roc_x.append(FP / float(N))
        roc_y.append(TP / float(P))
        FP = 0
        TP = 0

    roc_auc = auc(roc_x, roc_y)
    ##############################################################################
    #Plot of a ROC curve for a specific class
    lw = 2
    plt.plot(roc_x,
             roc_y,
             color='darkorange',
             lw=lw,
             label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
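    # Aside (addition): sklearn can trace the same curve; `score` above holds
    # the class-0 probability, so the positive-class score is assumed to be
    # (1 - score).
    from sklearn.metrics import roc_curve
    fpr, tpr, _ = roc_curve(y, 1 - score)
    print("AUC (sklearn): %0.2f" % auc(fpr, tpr))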
Exemple #25
0
                    "SDML was not able to converge in less than {} attempts. ".
                    format(max_runs))
                continue
        else:
            is_sdml = False

        loo = LeaveOneOut()
        estimated_parameters = []
        true_parameters_list = []
        parameter_distances = []
        embedding_distances = []

        estimated_KL = []
        estimated_TV = []

        for ref_index, obs_index in loo.split(
                test_data, test_labels):  # this returns the indices

            # split the dataset
            n_samples = len(ref_index)

            output_ref = test_data[ref_index]
            param1_ref = param1_test[ref_index]
            param2_ref = param2_test[ref_index]

            observation = test_data[obs_index]
            param1_obs = param1_test[obs_index]
            param2_obs = param2_test[obs_index]
            true_parameters = np.array([param1_obs, param2_obs]).reshape(-1)

            if name == 'true':
                output_ref_transformed = np.column_stack(
Exemple #26
0
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_group_idx, va_group_idx in kf.split(unique_user_ids):
    # split the customer IDs into train/valid (training and validation data)
    tr_groups, va_groups = unique_user_ids[tr_group_idx], unique_user_ids[
        va_group_idx]

    # assign each record to train/valid according to which side its customer ID belongs to
    is_tr = user_id.isin(tr_groups)
    is_va = user_id.isin(va_groups)
    tr_x, va_x = train_x[is_tr], train_x[is_va]
    tr_y, va_y = train_y[is_tr], train_y[is_va]

# (For reference) GroupKFold does not support shuffling or a random seed, which makes it awkward to use
kf = GroupKFold(n_splits=4)
for tr_idx, va_idx in kf.split(train_x, train_y, user_id):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

# -----------------------------------
# leave-one-out
# -----------------------------------
# assume the data has only 100 rows
train_x = train_x.iloc[:100, :].copy()
# -----------------------------------
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
for tr_idx, va_idx in loo.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
Exemple #27
0
from sklearn import datasets
from sklearn.model_selection import LeaveOneOut
# import the custom logistic regression class from LogisticRegression.py
from LogisticRegression import LogisticRegression

import numpy as np

irisdata = datasets.load_iris()
data = np.delete(irisdata['data'][50:], [0, 1], axis=1)

lv = LeaveOneOut()
lv.get_n_splits(data)

labels = irisdata.target[50:].ravel()

error = []

for train_index, test_index in lv.split(data):

    #creating object for logistic regression
    model = LogisticRegression()

    train_data = data[train_index]
    train_labels = labels[train_index]
    test_data = data[test_index]
    test_labels = labels[test_index]
    #training the logistic regression object
    model.train(train_data, train_labels)

    error.append(1 - (test_labels == model.test(test_data, test_labels)[0]))

print(np.mean(error))
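# Aside (addition, a sketch for comparison): the same LOO error estimate with
# sklearn's own estimator; SkLogReg is a local alias, and the custom class
# above is left untouched.
from sklearn.linear_model import LogisticRegression as SkLogReg
from sklearn.model_selection import cross_val_score
acc = cross_val_score(SkLogReg(), data, labels, cv=LeaveOneOut())
print(1 - acc.mean())  # mean LOO error rate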
Exemple #28
0
    def trainModelFV_LOOCV_Classifiers(self, extension='*.txt'):
        """
        This method contains the entire module
        required for training the bag of visual words model
        Use of helper functions will be extensive.
        """

        print('trainModelFV_LOOCV_Classifiers')
        names = ["Linear SVM"]
        classifiers = [SVC(kernel='linear')]

        self.name_dict, self.number_dict, self.count_class = self.file_helper.getLabelsFromFile(
            self.label_path)

        # read file. prepare file lists.
        self.images, self.trainImageCount = self.file_helper.getFilesFromDirectory(
            self.base_path, self.datasets, extension)

        self.parameters += 'Classifier Parameters\n'
        self.parameters += '%s' % self.classifier_helper.clf

        features_nd = np.asarray(self.images)
        #features_nd.sort(axis=0)
        loo = LeaveOneOut()
        predictions = {}
        p = {}
        l = []
        hits = {}
        for name in names:
            predictions[name] = []
            p[name] = []
            hits[name] = 0

        c = 0
        for train, test in loo.split(features_nd):
            feature_test_file = str(features_nd[test][0][0])
            class_name_test = feature_test_file.split(os.sep)[-2]
            c += 1
            currentInvDate = datetime.datetime.now().strftime(
                "%d/%m/%Y %H:%M:%S")
            print('Step: %i/%i - %s - %s' %
                  (c, features_nd.shape[0], currentInvDate, feature_test_file))
            # if c == 1 or c % 25 == 0:
            #    self.mail_helper.sendMail("Progress: %s - %s" % (self.test_name, self.OsName), "Samples processed: %i" % c)

            self.descriptor_list = []
            self.train_labels = []
            for feature in features_nd[train]:
                feature = feature[0]
                label_number = self.number_dict[feature.split(os.sep)[-2]]
                self.train_labels = np.append(self.train_labels, label_number)
                des = self.file_helper.loadFeaturesFromFile(feature)
                self.descriptor_list.append(des)

            # format data as nd array
            self.classifier_helper.formatND(self.descriptor_list)

            gmm = GMM(n_components=self.no_clusters, covariance_type='diag')
            gmm.fit(self.classifier_helper.descriptor_vstack)

            fv_dim = (self.no_clusters +
                      2 * self.no_clusters *
                      self.classifier_helper.descriptor_vstack.shape[1])
            print(fv_dim)
            n_videos = train.shape[0]
            features = np.array([np.zeros(fv_dim) for i in range(n_videos)])
            count = 0
            for i in range(n_videos):
                len_video = len(self.descriptor_list[i])
                fv = fisher_vector(
                    self.classifier_helper.descriptor_vstack[count:count +
                                                             len_video], gmm)
                features[i] = fv
                count += len_video

            print(features.shape)
            print('Data normalization.')
            scaler = StandardScaler()
            # train normalization
            features = scaler.fit_transform(features)
            features = power_normalize(features, 0.5)
            features = L2_normalize(features)

            # real label
            l.extend([self.number_dict[feature_test_file.split(os.sep)[-2]]])

            # test features
            feature_test = self.file_helper.loadFeaturesFromFile(
                feature_test_file)
            test_fv = fisher_vector(feature_test, gmm)
            # train normalization
            test_fv = test_fv.reshape(1, -1)
            test_fv = scaler.transform(test_fv)
            test_fv = power_normalize(test_fv, 0.5)
            test_fv = L2_normalize(test_fv)

            # train classifiers
            for name, clf in zip(names, classifiers):
                print(name)
                clf.fit(features, self.train_labels)
                cl = int(clf.predict(test_fv)[0])
                class_name_predict = self.name_dict[str(cl)]
                if class_name_test == class_name_predict:
                    hits[name] += 1

                # predicted label
                p[name].extend([cl])
                predictions[name].append({
                    'image': feature_test_file,
                    'class': cl,
                    'object_name': self.name_dict[str(cl)]
                })
            msg_progress = ''
            for name in names:
                msg_progress += 'Classifier: %s - Hits:%i/%i - Accuracy: %.4f\n' % (
                    name.ljust(20), hits[name], c, hits[name] / c)

            print(msg_progress)
            print('\n\n')
            if c == 1 or c % 25 == 0:
                self.mail_helper.sendMail(
                    "Progress: %s - %s" % (self.test_name, self.OsName),
                    msg_progress)

        for name in names:
            print(name)
            self.saveResults(predictions[name],
                             p[name],
                             l,
                             features_nd.shape[0],
                             classifier_name=name)
Exemple #29
0
print("Importing data")
(train_x, train_y) = mmi()
print("Imported data, initiating training")
train_x = train_x[1:]
train_y = train_y[1:]

loo = LeaveOneOut()

vals_y = []
Poly_preds_y = []
Gaussian_preds_y = []

SVM1 = sklearn.svm.SVC(C=5.0, kernel='poly', coef0=1.0)
SVM2 = sklearn.svm.SVC(C=20.0, kernel='rbf')

for train_idx, val_idx in loo.split(train_x):
    X_train, X_val = train_x[train_idx], train_x[val_idx]
    y_train, y_val = train_y[train_idx], train_y[val_idx]

    SVM1.fit(X_train, y_train)
    SVM2.fit(X_train, np.ravel(y_train))
    Poly_pred_y = SVM1.predict(X_val)
    Gaussian_pred_y = SVM2.predict(X_val)

    vals_y.append(list(y_val))
    Poly_preds_y.append(list(Poly_pred_y))
    Gaussian_preds_y.append(list(Gaussian_pred_y))

vals_y = np.ravel(vals_y)
Poly_preds_y = np.ravel(Poly_preds_y)
Gaussian_preds_y = np.ravel(Gaussian_preds_y)
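# Follow-up sketch (addition): summarize the two kernels' LOO results,
# assuming sklearn.metrics.accuracy_score is acceptable here.
from sklearn.metrics import accuracy_score
print("Poly kernel LOO accuracy:", accuracy_score(vals_y, Poly_preds_y))
print("RBF kernel LOO accuracy:", accuracy_score(vals_y, Gaussian_preds_y))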
Exemple #30
0
def balance_tpr(cfg, featdata):
    """
    Find the threshold of class index 0 that yields an equal number of true positive samples in each class.
    Currently only available for binary classes.

    Params
    ======
    cfg: config module
    featdata: feature data computed using compute_features()
    """

    n_jobs = cfg.N_JOBS
    if n_jobs is None:
        n_jobs = mp.cpu_count()
    if n_jobs > 1:
        print('balance_tpr(): Using %d cores' % n_jobs)
        pool = mp.Pool(n_jobs)
        results = []

    # Init a classifier
    if cfg.CLASSIFIER == 'GB':
        cls = GradientBoostingClassifier(loss='deviance',
                                         learning_rate=cfg.GB['learning_rate'],
                                         n_estimators=cfg.GB['trees'],
                                         subsample=1.0,
                                         max_depth=cfg.GB['max_depth'],
                                         random_state=cfg.GB['seed'],
                                         max_features='sqrt',
                                         verbose=0,
                                         warm_start=False,
                                         presort='auto')
    elif cfg.CLASSIFIER == 'XGB':
        cls = XGBClassifier(loss='deviance',
                            learning_rate=cfg.GB['learning_rate'],
                            n_estimators=cfg.GB['trees'],
                            subsample=1.0,
                            max_depth=cfg.GB['max_depth'],
                            random_state=cfg.GB['seed'],
                            max_features='sqrt',
                            verbose=0,
                            warm_start=False,
                            presort='auto')
    elif cfg.CLASSIFIER == 'RF':
        cls = RandomForestClassifier(n_estimators=cfg.RF['trees'],
                                     max_features='auto',
                                     max_depth=cfg.RF['max_depth'],
                                     n_jobs=cfg.N_JOBS,
                                     random_state=cfg.RF['seed'],
                                     oob_score=True,
                                     class_weight='balanced_subsample')
    elif cfg.CLASSIFIER == 'LDA':
        cls = LDA()
    elif cfg.CLASSIFIER == 'rLDA':
        cls = rLDA(cfg.RLDA_REGULARIZE_COEFF)
    else:
        raise ValueError('Unknown classifier type %s' % cfg.CLASSIFIER)

    # Setup features
    X_data = featdata['X_data']
    Y_data = featdata['Y_data']
    wlen = featdata['wlen']
    if cfg.PSD['wlen'] is None:
        cfg.PSD['wlen'] = wlen

    # Choose CV type
    ntrials, nsamples, fsize = X_data.shape
    if cfg.CV_PERFORM == 'LeaveOneOut':
        print('\n>> %d-fold leave-one-out cross-validation' % ntrials)
        if SKLEARN_OLD:
            cv = LeaveOneOut(len(Y_data))
        else:
            cv = LeaveOneOut()
    elif cfg.CV_PERFORM == 'StratifiedShuffleSplit':
        print(
            '\n>> %d-fold stratified cross-validation with test set ratio %.2f'
            % (cfg.CV_FOLDS, cfg.CV_TEST_RATIO))
        if SKLEARN_OLD:
            cv = StratifiedShuffleSplit(Y_data[:, 0],
                                        cfg.CV_FOLDS,
                                        test_size=cfg.CV_TEST_RATIO,
                                        random_state=cfg.CV_RANDOM_SEED)
        else:
            cv = StratifiedShuffleSplit(n_splits=cfg.CV_FOLDS,
                                        test_size=cfg.CV_TEST_RATIO,
                                        random_state=cfg.CV_RANDOM_SEED)
    else:
        raise NotImplementedError('%s is not supported yet. Sorry.' %
                                  cfg.CV_PERFORM)
    print('%d trials, %d samples per trial, %d feature dimension' %
          (ntrials, nsamples, fsize))

    # For classifier itself, single core is usually faster
    cls.n_jobs = 1
    Y_preds = []

    if SKLEARN_OLD:
        splits = cv
    else:
        splits = cv.split(X_data, Y_data[:, 0])
    for cnum, (train, test) in enumerate(splits):
        X_train = np.concatenate(X_data[train])
        X_test = np.concatenate(X_data[test])
        Y_train = np.concatenate(Y_data[train])
        Y_test = np.concatenate(Y_data[test])
        if n_jobs > 1:
            results.append(
                pool.apply_async(
                    get_predict_proba,
                    [cls, X_train, Y_train, X_test, Y_test, cnum + 1]))
        else:
            Y_preds.append(
                get_predict_proba(cls, X_train, Y_train, X_test, Y_test,
                                  cnum + 1))

    # Aggregate predictions
    if n_jobs > 1:
        pool.close()
        pool.join()
        for r in results:
            Y_preds.append(r.get())
    Y_preds = np.concatenate(Y_preds, axis=0)

    # Find threshold for class index 0
    Y_preds = sorted(Y_preds)
    mid_idx = int(len(Y_preds) / 2)
    if len(Y_preds) == 1:
        return 0.5  # should not reach here in normal conditions
    elif len(Y_preds) % 2 == 0:
        thres = Y_preds[mid_idx -
                        1] + (Y_preds[mid_idx] - Y_preds[mid_idx - 1]) / 2
    else:
        thres = Y_preds[mid_idx]
    return thres
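# Equivalence sketch (addition): the hand-rolled midpoint above is just the
# median; a quick self-contained check with synthetic probabilities:
import numpy as np
_demo = np.sort(np.random.rand(6))
_mid = len(_demo) // 2
assert np.isclose(_demo[_mid - 1] + (_demo[_mid] - _demo[_mid - 1]) / 2,
                  np.median(_demo))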
Exemple #31
0
                        scoring=scoring,
                        cv=5,
                        return_train_score=True)
sorted(scores.keys())
print('Test results:', scores)  # scores is a dict with train scores, fit times, and score times

# ================================== K-fold, leave-one-out, leave-p-out, and shuffle-split cross-validation ==========================================
# k-fold split
kf = KFold(n_splits=2)
for train, test in kf.split(iris.data):
    print("k-fold split: %s %s" % (train.shape, test.shape))
    break

# leave-one-out split
loo = LeaveOneOut()
for train, test in loo.split(iris.data):
    print("leave-one-out split: %s %s" % (train.shape, test.shape))
    break

# leave-p-out split
lpo = LeavePOut(p=2)
for train, test in lpo.split(iris.data):  # fixed: iterate the LeavePOut splitter, not loo
    print("leave-p-out split: %s %s" % (train.shape, test.shape))
    break

# shuffle-split (random permutation) split
ss = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
for train_index, test_index in ss.split(iris.data):
    print("shuffle split: %s %s" % (train_index.shape, test_index.shape))
    break
Exemple #32
0
iris = sns.load_dataset("iris")
X = iris.values[50:150, 0:4]
y = iris.values[50:150, 4]

# logistic regression
log_model = LogisticRegression()
m = np.shape(X)[0]

# 10-fold cross-validation
y_pred = cross_val_predict(log_model, X, y, cv=10)
print(metrics.accuracy_score(y, y_pred))

# leave-one-out
loo = LeaveOneOut()
accuracy = 0
for train, test in loo.split(X):
    log_model.fit(X[train], y[train])  # fitting
    y_p = log_model.predict(X[test])
    if y_p == y[test]:
        accuracy += 1
print(accuracy / np.shape(X)[0])
'''
transfusion-blood data set analysis
'''
# import numpy as np  # for matrix calculation
dataset_transfusion = np.loadtxt('../data/transfusion.data',
                                 delimiter=",",
                                 skiprows=1)
X2 = dataset_transfusion[:, 0:4]
y2 = dataset_transfusion[:, 4]
Exemple #33
0
	print(c, ytest, a)
	if a>200:
		test_pred = [5]
	else:
		test_pred = [2]
	if(test_pred[0] == ytest[0]):
		return 1
	else:
		return 0

from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
# xall = x3
# yall = y3
xall = np.array(x1.tolist()+x2.tolist()+x3.tolist())
yall = np.array(y1.tolist()+y2.tolist()+y3.tolist())
score = [0,0,0,0,0]
total = [0,0,0,0,0]
for a,b in loo.split(xall,yall):
	#trainindices = a
	#testindex = b
	xtrain, xtest = xall[a], xall[b]
	ytrain, ytest = yall[a], yall[b]
	score[ytest[0]-1] +=get_score(xtrain,xtest,ytrain,ytest)
	total[ytest[0]-1] +=1

print(['Transport', 'Weather', 'Inflation', 'Fuel', 'Hoarding'])
print('correct = ', score, 'overall = ', sum(score))
print('total = ', total, 'total = ', sum(total))
print('accuracy = ', [score[i] * 1.0 / total[i] for i in range(0, 5)], 'overall = ', sum(score) * 1.0 / sum(total))
Exemple #34
0
    def evaluate_model_CNN_LSTM(self, df):

        verbose, epochs, batch_size = 0, 25, 64
        loo = LeaveOneOut()

        for train, test in loo.split(range(len(df.groupby(level=0)))):
            df_train = df.loc[pd.IndexSlice[train, :], :]
            df_test = df.loc[pd.IndexSlice[test, :], :]
            df_test_timestamps = df_test.iloc[:, 0]
            df_test_features = df_test.iloc[:, 1:-5]
            df_test_output = df_test.iloc[:, -5:]
            df_train_timestamps = df_train.iloc[:, 0]
            df_train_features = df_train.iloc[:, 1:-5]
            df_train_output = df_train.iloc[:, -5:]
            #df_train_timestamps.reset_index(level=1, drop=True, inplace=True)
            #df_train_features.index = df_train_features.index.droplevel(1)
            #df_train_output.index = df_train_output.index.droplevel(1)
            #print(df_train.loc[pd.IndexSlice[2, :],:])
            for train_index in train:
                [w, h] = df_train.loc[
                    pd.IndexSlice[train_index, :],
                    :'raw_acc:magnitude_stats:time_entropy'].shape
                temp = df_train.loc[
                    pd.IndexSlice[train_index, :],
                    :'raw_acc:magnitude_stats:time_entropy'].values.reshape(1, -1, w, h)
                print([w, h])
                #input()
                model = torch.nn.Sequential(
                    torch.nn.Conv2d(1, 32, 3, stride=2, bias=True, padding=3),
                    torch.nn.ReLU(), torch.nn.Dropout(0.5),
                    torch.nn.MaxPool2d(3),
                    torch.nn.Conv2d(32, 64, 3, stride=2, bias=True, padding=3),
                    torch.nn.ReLU(), torch.nn.Dropout(0.5),
                    torch.nn.MaxPool2d(3), torch.nn.Flatten(0, 1),
                    # note: nn.LSTM returns (output, (h_n, c_n)), so using it
                    # inside nn.Sequential needs a small adapter module in practice
                    torch.nn.LSTM(w, h, 100), torch.nn.Dropout(0.5),
                    torch.nn.Linear(100, 5), torch.nn.ReLU(),
                    torch.nn.Linear(5, 5), torch.nn.Softmax(dim=1))
                criterion = torch.nn.CrossEntropyLoss()
                optimizer = torch.optim.Adam(model.parameters())

                for epoch in range(epochs):
                    model = model.float()
                    model.train()
                    optimizer.zero_grad()
                    y_ = model(
                        torch.tensor(df_train.loc[
                            pd.IndexSlice[train_index, :], :
                            'raw_acc:magnitude_stats:time_entropy'].values.
                                     reshape(1, -1, w, h)).float())
                    print(y_)
                    input()
                    loss = criterion(y_, df_train.iloc[0, -5:].values)
                    loss.backward()
                    optimizer.step()
                    print(f"Epoch {epoch+1}/{n_epochs}, loss = {loss.item()}")

                [w, h] = (df_test.loc[pd.IndexSlice[
                    test, :], :'raw_acc:magnitude_stats:time_entropy']).shape
                temp = torch.tensor(
                    (df_test.loc[pd.IndexSlice[
                        test, :], :'raw_acc:magnitude_stats:time_entropy']
                     ).values.reshape(1, -1, w, h)).float()
                y_test_predict = model(temp)
                mae = np.sum(
                    np.absolute((df_test.loc[pd.IndexSlice[test, :],
                                             'label:LYING_DOWN':]).values -
                                y_test_predict))
Exemple #35
0
def func(x, a, b, k):
    C = np.log((b - a) / (b - x))
    y_pred = np.multiply(C, k)
    return y_pred


LAI_test_pred = []
NDVI_test = []
LAI_test = []
LAI_ALL_P = []
param = []
A = []
B = []
K = []
for train_index, test_index in loo.split(NDVI):
    X_train, X_test = NDVI[train_index], NDVI[test_index]
    y_train, y_test = LAI[train_index], LAI[test_index]

    param_bounds = ([0.01, 0.93, 1.3],
                    [0.15, 0.95, 1.8])  # parameter bounds: first bracket holds all lower bounds, second all upper bounds
    popt, pcov = curve_fit(func, X_train, y_train, bounds=param_bounds)
    param.append(popt)
    a = popt[0]
    b = popt[1]
    k = popt[2]
    A.append(a)
    B.append(b)
    K.append(k)
    NDVI_test.append(X_test)
    LAI_test.append(y_test)
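    # Completion sketch (addition): record each fold's held-out prediction so
    # the LAI_test_pred list declared above actually gets filled; afterwards
    # something like np.sqrt(np.mean((np.ravel(LAI_test_pred) -
    # np.ravel(LAI_test)) ** 2)) would give the LOO RMSE.
    LAI_test_pred.append(func(X_test, a, b, k))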
Exemple #36
0
    def evaluate_by_loo(energies_train,
                        energies_target,
                        regr=LinearRegression()):
        loo = LeaveOneOut()
        loo.get_n_splits(energies_train)

        train_r2_scores = np.array([])
        test_r2_scores = np.array([])
        train_rmse_scores = np.array([])
        test_rmse_scores = np.array([])
        predicted_powers = np.array([])
        actual_powers = np.array([])

        # Train the linear regression model with leave-one-out;
        # the dataset is small, so LOO is affordable
        for train_index, test_index in loo.split(energies_train):
            # print("Test index:{}".format(test_index))
            # print("TRAIN:", train_index, "TEST:", test_index)
            # regr = LinearRegression()

            x_train, x_test = energies_train[train_index], \
                              energies_train[test_index]
            y_train, y_test = energies_target.iloc[train_index], \
                              energies_target.iloc[test_index]
            regr.fit(x_train, y_train)
            # print(X_test, y_test)

            y_train_pred = execute(
                {
                    'pipeline': regr,
                    'statistics': features_statistics
                },
                features=x_train,
                prediction=True)
            y_test_pred = execute(
                {
                    'pipeline': regr,
                    'statistics': features_statistics
                },
                features=x_test,
                prediction=True)

            # print(y_test.values, y_test_pred)

            train_r2_score = regr.score(x_train, y_train)
            train_r2_scores = np.append(train_r2_scores, train_r2_score)
            test_r2_score = r2_score(y_test.values, y_test_pred)
            test_r2_scores = np.append(test_r2_scores, test_r2_score)

            train_rmse_score = rmse(y_train, y_train_pred)
            train_rmse_scores = np.append(train_rmse_scores, train_rmse_score)
            test_rmse_score = rmse(y_test.values, y_test_pred)
            test_rmse_scores = np.append(test_rmse_scores, test_rmse_score)

            actual_powers = np.append(actual_powers, y_test.values[0])
            predicted_powers = np.append(predicted_powers, y_test_pred[0])
            # print("Actual energy generation: {}\tPredicted energy generation: {}"
            #      .format(y_test.values[0], y_test_pred[0]))
            # print("Train R^2 score: {}\tTest R^2 score:{}"
            #      .format(train_r2_score, test_r2_score))
            # print("Train RMSE: {}\tTest RMSE:{}\n"
            #      .format(train_rmse_score, test_rmse_score))

        # Standard deviation of training data is base line of RMSE
        # print("Standard deviation: {}".format(pd.DataFrame.std(energies_target)))

        print("Train average RMSE: {}\tTest average RMSE:{}".format(
            np.average(train_rmse_scores), np.average(test_rmse_scores)))
        print("Train average R^2: {}\tTest average R^2:{}".format(
            np.average(train_r2_scores), np.average(test_r2_scores)))

        return actual_powers, predicted_powers
Exemple #37
0
def regress_loo(features, grades, method='ridge', standard=False, use_intercept=True, groups=None, convert='none', alpha=1.0):
    """Calculates linear regression with leave-one-out split and L2 regularization.

    Parameters
    ----------
    features : ndarray
        Input features used in creating regression model.
    grades : ndarray
        Ground truth for the model.
    method : str
        Regression model used. Defaults to ridge regression, but lasso is also possible. Ridge seems to perform better.
    standard : bool
        Choice whether to center features by the mean of training split.
        Defaults to false, since whitened PCA is assumed to be centered.
    use_intercept : bool
        Choice whether to use intercept term on the model.
        If the model does not provide very powerful predictions, it is better to center them by the intercept.
    groups : ndarray
        Patient groups. Used in leave-one-group-out split.
    convert : str
        Possibility to predict exp or log of ground truth. Defaults to no conversion.
    alpha : float
        Regularization strength of the model. Defaults to 1.0.
    Returns
    -------
    Array of model predictions, model coefficients and model intercept term.
    """

    # Convert grades
    if convert == 'exp':
        grades = np.exp(grades)
    elif convert == 'log':
        grades = np.log(grades)
    else:
        pass

    predictions = []
    # Get leave-one-out split
    loo = LeaveOneOut()
    loo.get_n_splits(features)
    for train_idx, test_idx in loo.split(features):
        # Train split
        x_train, x_test = features[train_idx], features[test_idx]
        y_train, y_test = grades[train_idx], grades[test_idx]

        # Normalize with mean and std
        if standard:
            x_test -= x_train.mean(0)
            x_train -= x_train.mean(0)

        # Linear regression
        if method == 'ridge':
            model = Ridge(alpha=alpha, normalize=True, random_state=42, fit_intercept=use_intercept)
        else:
            model = Lasso(alpha=alpha, normalize=True, random_state=42, fit_intercept=use_intercept)
        model.fit(x_train, y_train)

        # Evaluate on test sample
        predictions.append(model.predict(x_test))

    predictions_flat = []
    for group in predictions:
        for p in group:
            predictions_flat.append(p)

    return np.array(predictions).squeeze(), model.coef_, model.intercept_
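# Usage sketch (addition, with synthetic data; all names below are illustrative):
if __name__ == '__main__':
    import numpy as np
    rng = np.random.RandomState(42)
    demo_x = rng.randn(20, 5)
    demo_y = demo_x @ rng.randn(5) + 0.1 * rng.randn(20)
    demo_preds, demo_coef, demo_intercept = regress_loo(demo_x, demo_y)
    print(demo_preds.shape, demo_coef.shape, demo_intercept)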
Exemple #38
0
            # pass None as weights to np.average: uniform mean
            multioutput = None
    return np.average(output_errors, weights=multioutput)


'''Q1-1-1 PCA+PCR'''
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut
from sklearn import metrics

loo = LeaveOneOut()
ytests = []
ypreds = []
for train_idx, test_idx in loo.split(X_pca):
    X_train, X_test = X_pca[train_idx], X_pca[test_idx]  #requires arrays
    y_train, y_test = y[train_idx], y[test_idx]
    model = LinearRegression()

    model.fit(X=X_train, y=y_train)
    y_pred = model.predict(X_test)

    # there is only one y-test and y-pred per iteration over the loo.split,
    # so to get a proper graph, we append them to respective lists.
    ytests += list(y_test)
    ypreds += list(y_pred)

Score_R2 = metrics.r2_score(ytests, ypreds)
Score_MAE = metrics.mean_absolute_error(ytests, ypreds)
Score_MSE = metrics.mean_squared_error(ytests, ypreds)
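# Equivalence sketch (addition): the manual loop above is what
# cross_val_predict does for a LeaveOneOut splitter, so this should
# reproduce Score_R2.
from sklearn.model_selection import cross_val_predict
ypreds_cv = cross_val_predict(LinearRegression(), X_pca, y, cv=loo)
print('R2 (cross_val_predict):', metrics.r2_score(y, ypreds_cv))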
Exemple #39
0
clf = LinearSVC(C=10)
start_time = time.time()
clf = clf.fit(x_tr, y_tr)
predictions_tr = (clf.predict(x_ts))

#10-fold Cross-Validation
scores = cross_val_score(clf, x_tr, y_tr, cv=10)
test_acc = accuracy_score(y_ts, predictions_tr)

print("Training Accuracy: %0.4f (+/- %0.2f)" %
      (scores.mean(), scores.std() * 2))
print("Test Accuracy: %0.4f" % test_acc)
print("--- %s seconds ---" % (time.time() - start_time))

#Leave One Out Validation
if leave_one_out_validation:
    loo_train_acc = []
    loo = LeaveOneOut()
    for train_index, test_index in loo.split(x_tr):
        X_train, X_test = x_tr[train_index], x_tr[test_index]
        y_train, y_test = y_tr[train_index], y_tr[test_index]
        clf = clf.fit(X_train, y_train)
        predictions = (clf.predict(X_test))
        loo_train_acc.append(accuracy_score(y_test, predictions))

    loo_train_accuracy = np.asarray(loo_train_acc)
    print("LOO Accuracy: %0.4f" % loo_train_accuracy.mean())

#Save the model
pickle.dump(clf, open(f'{model_save_path}/model1_inception_svm.sav', 'wb'))
Exemple #40
0
def _print_train_results(classifier_name, classifier, regressors, response, regressor_names, leave_one_out):
    """
        _print_train_results
            Performs validation tests of the model and prints the results
             
        :param classifier_name: Name of the classifier method 
        :param classifier: Classifier object
        :param regressors: numpy array with the regressors used to train the model
        :param response: numpy array with the response used to train the model
        :param regressor_names: List with the name of the regressors
        :param leave_one_out: Boolean, true to perform leave-one-out cross-validation, otherwise perform default cross 
            validation
        :return: None 
    """
    global MESSAGES
    _verbose_print("classifier_name: {}".format(classifier_name))
    _verbose_print("classifier: {}".format(classifier))
    _verbose_print("regressor_names: {}".format(regressor_names))
    _verbose_print("leave_one_out: {}".format(leave_one_out))

    MESSAGES.AddMessage("{} classifier with parameters: \n {}".format(classifier_name,
                                                                      str(classifier.get_params()).replace("'", "")))

    if leave_one_out:
        # create a leave-one-out instance to execute the cross-validation
        loo = LeaveOneOut()
        start = timer()
        cv_score = cross_val_score(classifier, regressors, response, cv=loo.split(regressors))
        end = timer()
        n_tests = len(response)
        MESSAGES.AddMessage("Score (Leave one Out):" + str(cv_score.mean()))
    else:
        start = timer()
        cv_score = cross_val_score(classifier, regressors, response)
        end = timer()
        n_tests = 3
        MESSAGES.AddMessage("Score (3-Fold):" + str(cv_score.mean()))
    # Print validation time
    MESSAGES.AddMessage("Testing time: {:.3f} seconds, {:.3f} seconds per test".format(end - start,
                                                                                       (end - start) / n_tests))
    # Print confusion matrix
    MESSAGES.AddMessage("Confusion Matrix (Train Set):")

    confusion = confusion_matrix(response, classifier.predict(regressors))
    labels = ["Non Deposit", "Deposit"]
    row_format = "{:6}" + "{:^16}" * (len(labels) + 1)
    MESSAGES.AddMessage(row_format.format("", "", "Predicted", ""))
    MESSAGES.AddMessage(row_format.format("True", "", *labels))
    for label, row in zip(labels, confusion):
        MESSAGES.AddMessage(row_format.format("", label, *row))

    # Some classifiers do not have  decision_function attribute but count with predict_proba instead
    # TODO: Generalize to anything that does not have decision_function "Easier to ask for forgiveness than permission"
    if classifier_name in ["Random Forest"]:
        des_fun = classifier.predict_proba(regressors)[:, classifier.classes_ == 1]
    else:
        des_fun = classifier.decision_function(regressors)
    MESSAGES.AddMessage("Area Under the curve (AUC): {}".format(roc_auc_score(response, des_fun)))

    # Give the importance of the features if it is supported
    # TODO: Generalize to anything that does have feature_importances_ "Easier to ask for forgiveness than permission"
    if classifier_name == "Adaboost":
        MESSAGES.AddMessage("Feature importances: ")
        importances = [[name, val*100] for name, val in zip(regressor_names, classifier.feature_importances_)]
        long_word = max([len(x) for x in regressor_names])
        row_format = "{" + ":" + str(long_word) + "} {:4.1f}%"
        # Print regressors in descending importance, omit the ones with 0 importance
        for elem in sorted(importances, key=lambda imp: imp[1], reverse=True):
            if elem[1] > 0:
                MESSAGES.AddMessage(row_format.format(*elem))

    return
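# Sketch (addition) for the TODOs above: prefer decision_function and fall
# back to predict_proba, "easier to ask for forgiveness than permission".
def _positive_class_scores(classifier, regressors):
    """Return 1-D scores for the positive class, EAFP style."""
    try:
        return classifier.decision_function(regressors)
    except AttributeError:
        return classifier.predict_proba(regressors)[:, classifier.classes_ == 1].ravel()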
Exemple #41
0
from sklearn import metrics
from sklearn.model_selection import cross_val_predict

# log-regression lib model
log_model = LogisticRegression()
m = np.shape(X)[0]

# 10-folds CV
y_pred = cross_val_predict(log_model, X, y, cv=10)
print(metrics.accuracy_score(y, y_pred))
    
# LOOCV
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
accuracy = 0
for train, test in loo.split(X):
    log_model.fit(X[train], y[train])  # fitting
    y_p = log_model.predict(X[test])
    if y_p == y[test]: accuracy += 1
print(accuracy / np.shape(X)[0])

# m = np.shape(X)[0]
# scores_loo = cross_val_score(log_model, X, y, cv=m)
# print(scores_loo)
# # prediction using 10-folds
# y_pred_loo = cross_val_predict(log_model, X, y, cv=m)
# print(metrics.accuracy_score(y, y_pred_loo))

'''
transfusion-blood data set analysis
'''