Example #1
def classify_using_lda(feat1, feat2, num_comp=2):

    n_plus = len(feat1)
    n_minus = len(feat2)

    X = np.concatenate((feat1, feat2), axis=0)
    y = np.concatenate((np.zeros(n_plus), np.ones(n_minus)), axis=0)
    y += 1

    print(X.shape, y.shape, n_plus, n_minus, feat1.shape, feat2.shape)

    lda = LDA(n_components=num_comp)
    lda.fit(X, y)

    # TODO FIXME Why is this returning n_samples x 1, and not n_samples x 2?
    # Is it able to differentiate using just 1 component? Crazy!!
    X_tr = lda.transform(X)

    print(X_tr.shape, lda.score(X, y))

    # LDA only gives one component here, so add a constant second coordinate (0 or 1 per class) just for plotting
    X1 = np.concatenate((X_tr[0:n_plus], np.zeros((n_plus, 1))), axis=1)
    X2 = np.concatenate((X_tr[-n_minus:], np.ones((n_minus, 1))), axis=1)

    plt.plot(X1[:, 0], X1[:, 1], 'ro')
    plt.plot(X2[:, 0], X2[:, 1], 'g+')

    plt.ylim(-1, 3)
    plt.show()
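A note on the TODO above: scikit-learn caps LDA at min(n_features, n_classes - 1) discriminant directions, so a two-class problem can only ever produce one transformed column, and newer releases reject an explicit request for more. A minimal sketch with made-up synthetic data (feat_a/feat_b are placeholders, not the arrays from the example above):

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

rng = np.random.RandomState(0)
feat_a = rng.normal(loc=0.0, scale=1.0, size=(50, 4))   # class 0
feat_b = rng.normal(loc=3.0, scale=1.0, size=(50, 4))   # class 1
X = np.concatenate((feat_a, feat_b), axis=0)
y = np.concatenate((np.zeros(50), np.ones(50)))

lda = LDA().fit(X, y)           # n_components defaults to min(n_features, n_classes - 1)
print(lda.transform(X).shape)   # (100, 1): a single axis suffices for two classes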
Example #2
def computing_cv_accuracy_LDA(in_path=None, cv_n_fold=10):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    kf = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65, mean_u80 = 0, 0
    for idx_train, idx_test in kf.split(y):
        print("---k-FOLD-new-executing--")
        X_cv_train, y_cv_train = X[idx_train], y[idx_train]
        X_cv_test, y_cv_test = X[idx_test], y[idx_test]
        lda.fit(X_cv_train, y_cv_train)
        n_test = len(idx_test)
        sum_u65, sum_u80 = 0, 0
        for i, test in enumerate(X_cv_test):
            evaluate = lda.predict([test])
            print("-----TESTING-----", i)
            if y_cv_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    print("--->", mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
Example #3
def computing_performance_LDA(in_path=None, seeds=list([0])):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].tolist()
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65, mean_u80 = 0, 0
    n_times = len(seeds)
    for k in range(0, n_times):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=seeds[k])
        sum_u65, sum_u80 = 0, 0
        lda.fit(X_train, y_train)
        n, _ = X_test.shape
        for i, test in enumerate(X_test):
            evaluate = lda.predict([test])
            print("-----TESTING-----", i)
            if y_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        print("--k-->", k, sum_u65 / n, sum_u80 / n)
        mean_u65 += sum_u65 / n
        mean_u80 += sum_u80 / n
    print("--->", mean_u65 / n_times, mean_u80 / n_times)
def test_lda_predict():
    # Test LDA classification.
    # This checks that LDA implements fit and predict and returns correct
    # values for simple toy data.
    for test_case in solver_shrinkage:
        solver, shrinkage = test_case
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        y_pred = clf.fit(X, y).predict(X)
        assert_array_equal(y_pred, y, "solver %s" % solver)

        # Assert that it works with 1D data
        y_pred1 = clf.fit(X1, y).predict(X1)
        assert_array_equal(y_pred1, y, "solver %s" % solver)

        # Test probability estimates
        y_proba_pred1 = clf.predict_proba(X1)
        assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y, "solver %s" % solver)
        y_log_proba_pred1 = clf.predict_log_proba(X1)
        assert_array_almost_equal(np.exp(y_log_proba_pred1), y_proba_pred1, 8, "solver %s" % solver)

        # Primarily test for commit 2f34950 -- "reuse" of priors
        y_pred3 = clf.fit(X, y3).predict(X)
        # LDA shouldn't be able to separate those
        assert_true(np.any(y_pred3 != y3), "solver %s" % solver)

    # Test invalid shrinkages
    clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=-0.2231)
    assert_raises(ValueError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="eigen", shrinkage="dummy")
    assert_raises(ValueError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto")
    assert_raises(NotImplementedError, clf.fit, X, y)
    # Test unknown solver
    clf = LinearDiscriminantAnalysis(solver="dummy")
    assert_raises(ValueError, clf.fit, X, y)
def test_lda_explained_variance_ratio():
    # Test if the sum of the normalized eigen vectors values equals 1,
    # Also tests whether the explained_variance_ratio_ formed by the
    # eigen solver is the same as the explained_variance_ratio_ formed
    # by the svd solver

    state = np.random.RandomState(0)
    X = state.normal(loc=0, scale=100, size=(40, 20))
    y = state.randint(0, 3, size=(40,))

    clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")
    clf_lda_eigen.fit(X, y)
    assert_almost_equal(clf_lda_eigen.explained_variance_ratio_.sum(), 1.0, 3)

    clf_lda_svd = LinearDiscriminantAnalysis(solver="svd")
    clf_lda_svd.fit(X, y)
    assert_almost_equal(clf_lda_svd.explained_variance_ratio_.sum(), 1.0, 3)

    tested_length = min(clf_lda_svd.explained_variance_ratio_.shape[0],
                        clf_lda_eigen.explained_variance_ratio_.shape[0])

    # NOTE: clf_lda_eigen.explained_variance_ratio_ is not of n_components
    # length. Make it the same length as clf_lda_svd.explained_variance_ratio_
    # before comparison.
    assert_array_almost_equal(clf_lda_svd.explained_variance_ratio_,
                              clf_lda_eigen.explained_variance_ratio_[:tested_length])
class LinearDiscriminantAnalysisPredictor(PredictorBase):
    '''
    Linear Discriminant Analysis
    '''

    def __init__(self, animal_type):
        self.animal_type = animal_type
        self.clf = LinearDiscriminantAnalysis()

    def fit(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        predictions = self.clf.predict_proba(X_test)
        predictions_df = self.bundle_predictions(predictions)

        return predictions_df

    def find_best_params(self):
        parameters = {'solver': ['svd', 'lsqr', 'eigen']}
        knn = LinearDiscriminantAnalysis()
        clf = grid_search.GridSearchCV(knn, parameters)
        train_data = get_data('../data/train.csv')
        train_data = select_features(train_data, self.animal_type)
        X = train_data.drop(['OutcomeType'], axis=1)
        y = train_data['OutcomeType']
        clf.fit(X, y)
        print(clf.best_params_)
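The find_best_params method above relies on the long-removed sklearn.grid_search module. A rough modern equivalent using sklearn.model_selection, with the iris dataset standing in for the project's get_data/select_features helpers (a sketch, not the project's actual code):

from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV

X, y = load_iris(return_X_y=True)   # placeholder for the features built from train.csv
search = GridSearchCV(LinearDiscriminantAnalysis(),
                      param_grid={'solver': ['svd', 'lsqr', 'eigen']},
                      cv=5)
search.fit(X, y)
print(search.best_params_)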
Example #7
def main():
    """Read Train/test log."""
    df = pd.read_csv("train.csv")

    # train/test split using stratified sampling
    labels = df['label']
    df = df.drop(['label'], 1)
    sss = StratifiedShuffleSplit(labels, 10, test_size=0.2, random_state=23)
    for train_index, test_index in sss:
        x_train, x_test = df.values[train_index], df.values[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

    # classification algorithm
    classification(x_train, y_train, x_test, y_test)

    # Predict Test Set
    favorite_clf = LinearDiscriminantAnalysis()
    favorite_clf.fit(x_train, y_train)
    test = pd.read_csv('test.csv')
    test_predictions = favorite_clf.predict(test)
    print(test_predictions)

    # Format DataFrame
    submission = pd.DataFrame(test_predictions, columns=['Label'])
    submission.tail()
    submission.insert(0, 'ImageId', np.arange(len(test_predictions)) + 1)
    submission.reset_index()
    submission.tail()

    # Export Submission
    submission.to_csv('submission.csv', index=False)
    submission.tail()
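This example uses the pre-0.18 StratifiedShuffleSplit signature, where the labels were passed to the constructor and the object was iterated directly. A sketch of the same split with the current sklearn.model_selection API (synthetic df/labels stand in for the train.csv data):

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(100, 5))   # stand-in features
labels = pd.Series([0, 1] * 50)       # stand-in binary labels

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=23)
for train_index, test_index in sss.split(df.values, labels):
    x_train, x_test = df.values[train_index], df.values[test_index]
    y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]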
class LinearDiscriminantAnalysiscls(object):
    """docstring for ClassName"""
    def __init__(self):
        self.lda_cls = LinearDiscriminantAnalysis()
        self.prediction = None
        self.train_x = None
        self.train_y = None

    def train_model(self, train_x, train_y):
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.lda_cls.fit(train_x, train_y)
        except:
            print(traceback.format_exc())

    def predict(self, test_x):
        try:
            self.test_x = test_x
            self.prediction = self.lda_cls.predict(test_x)
            return self.prediction
        except:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        try:
            # return r2_score(test_y, self.prediction)
            return self.lda_cls.score(self.test_x, test_y)
        except:
            print(traceback.format_exc())
Example #9
def plot_lda_only(filename, title, filename_fig):

    df = pd.read_csv(path+filename, names=['x1','x2','y'], header=None)
    fig = plt.figure()
    fig.suptitle(title, fontsize=20)
    columns_ls = []
    for column in df.columns:
        columns_ls.append(column)

    X = df[columns_ls[0:len(columns_ls)-1]].values
    Y = df[columns_ls[len(columns_ls)-1]].values

    clf_lda = LinearDiscriminantAnalysis()
    clf_lda.fit(X, Y)
    w = clf_lda.coef_[0]
    a = -w[0]/w[1]

    xx = np.linspace(-12, 34)
    yy = a*xx-clf_lda.intercept_[0]/w[1]
    plt.plot(xx,yy, color="blue", label ="LDA decision boundary")

    print "Weights W0 %.2f and W1%.2f"%(w[0], w[1])
    plt.text(0, 0, "Y=+1", fontsize=12)
    plt.text(10, -20, "Y=-1", fontsize=12)
    # plt.plot(xx, yy_down, 'k--')
    # plt.plot(xx, yy_up, 'k--')
    # plt.plot(xx,yy,color="black", label ="svm decision boundary")


    plt.xlabel('X1', fontsize=18)
    plt.ylabel('X2', fontsize=16)
    # fig.savefig(filename_fig)
    # model = LogisticRegression()
    # model.fit(X, Y)
    # w = model.coef_[0]
    # a = -w[0]/w[1]
    #
    # xx = np.linspace(-12, 34)
    # yy = a*xx-model.intercept_[0]/w[1]
    #
    # plt.plot(xx,yy, label ="logistic decision boundary")
    #
    # clf_lda = LinearDiscriminantAnalysis()
    # clf_lda.fit(X, Y)
    # w = clf_lda.coef_[0]
    # a = -w[0]/w[1]
    #
    # xx = np.linspace(-12, 34)
    # yy = a*xx-clf_lda.intercept_[0]/w[1]
    # plt.plot(xx,yy, color="blue", label ="LDA decision boundary")

    # plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
    #         s=80, color='b')
    plt.scatter(X[:, 0], X[:, 1], c=Y)

    plt.axis('tight')
    plt.legend()

    plt.show()
def doLDA(x,digits,s):
    myLDA = LDA()
    myLDA.fit(x.PCA[:,:s],digits.train_Labels)
    newtest = digits.test_Images -x.centers
    newtest = newtest @ np.transpose(x.V[:s, :])
    labels = myLDA.predict(newtest)
    errors = class_error_rate(labels.reshape(1,labels.shape[0]),digits.test_Labels)
    return errors
Example #11
def lda(X, y, n):
	'''
		Returns optimal projection of the data
		LDA with n components
	'''
	selector = LinearDiscriminantAnalysis(n_components=n)
	selector.fit(X, y)
	return selector.transform(X), y
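A quick usage sketch for the helper above (iris as stand-in data; with three classes, n can be at most 2, and the module is assumed to import LinearDiscriminantAnalysis as in the snippet):

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
X_proj, y = lda(X, y, n=2)   # project onto the two discriminant axes
print(X_proj.shape)          # (150, 2)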
def Train(enhancedGeneSet, classLabels):
    enhancedGeneSet = np.array(enhancedGeneSet)
    classLabels = np.array(classLabels)
    classifier = LinearDiscriminantAnalysis()
    classifier.fit(enhancedGeneSet, classLabels)
    # del enhancedGeneSet
    # del classLabels
    return classifier
Example #13
    def train_model(self):
        ### Train spectrum data
        # form training data and labels
        X = np.empty((0, self.freq_cutoff), int)
        y = np.empty((0, 1), int)

        data_dir = 'clap_data/claps/spectrum/'
        for fname in os.listdir(data_dir):
            data = np.load("%s%s"% (data_dir, fname))
            X = np.append(X, data, axis=0)
            y = np.append(y, [1] * data.shape[0])

        data_dir = 'clap_data/noclaps/spectrum/'
        for fname in os.listdir(data_dir):
            data = np.load("%s%s"% (data_dir, fname))
            X = np.append(X, data, axis=0)
            y = np.append(y, [0] * data.shape[0])

        # pca = PCA(n_components=200)
        # X_pca = pca.fit_transform(X)

        # fit the model
        # clf = LogisticRegression(penalty='l1')
        clf = LinearDiscriminantAnalysis()
        clf.fit(X, y)
        preds = clf.predict(X)
        # X_new = clf.transform(X)

        # clf2 = LinearDiscriminantAnalysis()
        # clf2.fit(X_new, y)
        # preds2 = clf2.predict(X_new)

        # print X.shape, X_pca.shape
        print(preds)
        print(np.sum(preds), preds.size)
        # print(preds2, np.sum(preds2))

        # save model
        pickle.dump(clf, open(clap_model_dir + clap_classifier_fname, 'wb'))
        self.clap_clf = clf

        ### Train decay data
        X = np.empty((0, self.decay_samples // 10), int)

        data_dir = 'clap_data/claps/decay/'
        for fname in os.listdir(data_dir):
            if fname.endswith('npy'):
                data = np.load("%s%s"% (data_dir, fname))
                print(data.shape, X.shape)
                X = np.append(X, data, axis=0)

        print(X.shape)
        X_avg = np.mean(X, axis=0)
        plt.plot(X_avg)
        plt.show()

        # Average decay data
        np.save('%s%s' % (clap_model_dir, clap_decay_model_fname), X_avg)
Example #14
 def _get_lda(self, data, variables):
     domain = Domain(attributes=variables, class_vars=data.domain.class_vars)
     data = data.transform(domain)
     lda = LinearDiscriminantAnalysis(solver='eigen', n_components=2)
     lda.fit(data.X, data.Y)
     scalings = lda.scalings_[:, :2].T
     if scalings.shape == (1, 1):
         scalings = np.array([[1.], [0.]])
     return scalings
def test_raises_value_error_on_same_number_of_classes_and_samples(solver):
    """
    Tests that if the number of samples equals the number
    of classes, a ValueError is raised.
    """
    X = np.array([[0.5, 0.6], [0.6, 0.5]])
    y = np.array(["a", "b"])
    clf = LinearDiscriminantAnalysis(solver=solver)
    with pytest.raises(ValueError, match="The number of samples must be more"):
        clf.fit(X, y)
def test_lda_numeric_consistency_float32_float64():
    for (solver, shrinkage) in solver_shrinkage:
        clf_32 = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        clf_32.fit(X.astype(np.float32), y.astype(np.float32))
        clf_64 = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        clf_64.fit(X.astype(np.float64), y.astype(np.float64))

        # Check value consistency between types
        rtol = 1e-6
        assert_allclose(clf_32.coef_, clf_64.coef_, rtol=rtol)
def test_lda_explained_variance_ratio():
    # Test if the sum of the normalized eigen vectors values equals 1
    n_features = 2
    n_classes = 2
    n_samples = 1000
    X, y = make_blobs(n_samples=n_samples, n_features=n_features,
                      centers=n_classes, random_state=11)

    clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")
    clf_lda_eigen.fit(X, y)
    assert_almost_equal(clf_lda_eigen.explained_variance_ratio_.sum(), 1.0, 3)
 def testEvaluateLDA(self, trCList, teCList):
     # LDA object
     clf = LinearDiscriminantAnalysis()
     # fit lda model using training chromosomes
     clf.fit(numpy.asarray(trCList), numpy.asarray(trainGroupings))
     
     predicted = clf.predict(teCList)
         
     self.confusionMatrix(testGroupings, predicted, 'lda_test')
     
     # return precision ([0]), recall ([1]) or f1 score ([2]), replace with clf.score(numpy.asarray(teCList), testGroupings) for accuracy
     return precision_recall_fscore_support(testGroupings, predicted, average = 'weighted')[2] # fitness for test set
Example #19
def LD(pth):
     train_desc=np.load(pth+'/training_features.npy')
     nbr_occurences = np.sum( (train_desc > 0) * 1, axis = 0)
     idf = np.array(np.log((1.0*len(image_paths)+1) / (1.0*nbr_occurences + 1)), 'float32')

# Scaling the words
     stdSlr = StandardScaler().fit(train_desc)
     train_desc = stdSlr.transform(train_desc)
     modelLD=LinearDiscriminantAnalysis()
     modelLD.fit(train_desc,np.array(train_labels))
     joblib.dump((modelLD, img_classes, stdSlr), pth+"/ld-bof.pkl", compress=3) 
     test(pth, "ld-")
Example #20
 def lda_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     ##
     ## Plots
     ##
     ph = plot_helper()
     
     scores = []
     train_scores = []
     rng = range(1, X_train_scl.shape[1]+1)
     for i in rng:
         lda = LinearDiscriminantAnalysis(n_components=i)
         cv = KFold(X_train_scl.shape[0], 3, shuffle=True)
         
         # cross validation
         cv_scores = []
         for (train, test) in cv:
             lda.fit(X_train_scl[train], y_train[train])
             score = lda.score(X_train_scl[test], y_train[test])
             cv_scores.append(score)
         
         mean_score = np.mean(cv_scores)
         scores.append(mean_score)
         
         # train score
         lda = LinearDiscriminantAnalysis(n_components=i)
         lda.fit(X_train_scl, y_train)
         train_score = lda.score(X_train_scl, y_train)
         train_scores.append(train_score)
         
         print(i, mean_score)
         
     ##
     ## Score Plot
     ##
     title = 'Score Summary Plot (LDA) for ' + data_set_name
     name = data_set_name.lower() + '_lda_score'
     filename = './' + self.out_dir + '/' + name + '.png'
                 
     ph.plot_series(rng,
                    [scores, train_scores],
                    [None, None],
                    ['cross validation score', 'training score'],
                    cm.viridis(np.linspace(0, 1, 2)),
                    ['o', '*'],
                    title,
                    'n_components',
                    'Score',
                    filename)
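lda_analysis above uses the pre-0.18 KFold constructor (KFold(n, n_folds, ...)) and iterates the splitter directly. A condensed sketch of the same cross-validation sweep with the current API, using iris as stand-in data and cross_val_score in place of the manual loop (note that current scikit-learn rejects n_components larger than n_classes - 1):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import RobustScaler

X_train, y_train = load_iris(return_X_y=True)        # stand-in for the scaled training split
X_train_scl = RobustScaler().fit_transform(X_train)

cv = KFold(n_splits=3, shuffle=True, random_state=0)
for i in range(1, 3):                                 # n_components <= n_classes - 1 = 2 for iris
    lda = LinearDiscriminantAnalysis(n_components=i)
    print(i, np.mean(cross_val_score(lda, X_train_scl, y_train, cv=cv)))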
def test_lda_transform():
    # Test LDA transform.
    clf = LinearDiscriminantAnalysis(solver="svd", n_components=1)
    X_transformed = clf.fit(X, y).transform(X)
    assert_equal(X_transformed.shape[1], 1)
    clf = LinearDiscriminantAnalysis(solver="eigen", n_components=1)
    X_transformed = clf.fit(X, y).transform(X)
    assert_equal(X_transformed.shape[1], 1)

    clf = LinearDiscriminantAnalysis(solver="lsqr", n_components=1)
    clf.fit(X, y)
    msg = "transform not implemented for 'lsqr'"
    assert_raise_message(NotImplementedError, msg, clf.transform, X)
Example #22
    def learn(self, decoy_peaks, target_peaks, use_main_score=True):
        assert isinstance(decoy_peaks, Experiment)
        assert isinstance(target_peaks, Experiment)

        X0 = decoy_peaks.get_feature_matrix(use_main_score)
        X1 = target_peaks.get_feature_matrix(use_main_score)
        X = np.vstack((X0, X1))
        y = np.zeros((X.shape[0],))
        y[X0.shape[0] :] = 1.0
        classifier = LinearDiscriminantAnalysis()
        classifier.fit(X, y)
        self.classifier = classifier
        self.scalings = classifier.scalings_.flatten()
        return self
Example #23
def _dimReduce(df, method='pca', n_components=2, labels=None, standardize=False, smatFunc=None, ldaShrinkage='auto'):
    if method == 'kpca':
        """By using KernelPCA for dimensionality reduction we don't need to impute missing values"""
        if smatFunc is None:
            smatFunc = corrTSmatFunc
        pca = KernelPCA(kernel='precomputed', n_components=n_components)
        smat = smatFunc(df).values
        xy = pca.fit_transform(smat)
        pca.components_ = pca.alphas_
        pca.explained_variance_ratio_ = pca.lambdas_ / pca.lambdas_.sum()
        return xy, pca
    elif method == 'pca':
        if standardize:
            normed = df.apply(lambda vec: (vec - vec.mean())/vec.std(), axis=0)
        else:
            normed = df.apply(lambda vec: vec - vec.mean(), axis=0)
        pca = PCA(n_components=n_components)
        xy = pca.fit_transform(normed)
        return xy, pca
    elif method == 'lda':
        if labels is None:
            raise ValueError('labels needed to perform LDA')
        if standardize:
            normed = df.apply(lambda vec: (vec - vec.mean())/vec.std(), axis=0)
        else:
            normed = df.apply(lambda vec: vec - vec.mean(), axis=0)
        
        if df.shape[1] > df.shape[0]:
            """Pre-PCA step"""
            ppca = PCA(n_components=int(df.shape[0]/1.5))
            normed = ppca.fit_transform(df)

        lda = LinearDiscriminantAnalysis(solver='eigen', shrinkage=ldaShrinkage, n_components=n_components)
        lda.fit(normed, labels.values)
        lda.explained_variance_ratio_ = np.abs(lda.explained_variance_ratio_) / np.abs(lda.explained_variance_ratio_).sum()
        xy = lda.transform(normed)
        return xy, lda
    elif method == 'pls':
        if labels is None:
            raise ValueError('labels needed to perform PLS')
        if standardize:
            normed = df.apply(lambda vec: (vec - vec.mean())/vec.std(), axis=0)
        else:
            normed = df.apply(lambda vec: vec - vec.mean(), axis=0)
        
        pls = PLSRegression(n_components=n_components)
        pls.fit(normed, labels)
        
        pls.explained_variance_ratio_ = np.zeros(n_components)
        xy = pls.x_scores_
        return xy, pls
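A minimal usage sketch for _dimReduce (toy DataFrame with invented column names; the PCA branch is used because it needs no labels or similarity function):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
toy_df = pd.DataFrame(rng.normal(size=(30, 5)), columns=list('abcde'))

xy, model = _dimReduce(toy_df, method='pca', n_components=2, standardize=True)
print(xy.shape)   # (30, 2)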
Example #24
def importance_lda(data, kpi, max_features=10, **kwargs):
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
    """
    :param data: dataframe containing training data
    :param kpi: Name of the current kpi
    :param max_features: maximum number of features to return
    :return: list of the best metrics
    """
    columns = data[[col for col in set(data.columns) - {kpi}]].columns
    train, test, target_train, target_test = prepare_data_for_kpi(data, kpi)
    model = LDA(**kwargs)
    model.fit(train, target_train)

    print(model.coef_)
Example #25
def plot_linear_svm_lda(filename, title):
    df = pd.read_csv(path+filename, names=['x1','x2','y'], header=None)
    columns_ls = []
    for column in df.columns:
        columns_ls.append(column)

    X = df[columns_ls[0:len(columns_ls)-1]].values
    Y = df[columns_ls[len(columns_ls)-1]].values
    fig = plt.figure()
    fig.suptitle(title, fontsize=20)

    clf = svm.SVC(kernel='linear')

    clf.fit(X, Y)
    w = clf.coef_[0]
    print "Weights SVM W0=%.2f and W1=%.2f"%(w[0], w[1])
    a = -w[0]/w[1]
    xx =np.linspace(-12, 34)
    yy = a*xx-clf.intercept_[0]/w[1]
    b = clf.support_vectors_[0]
    yy_down = a * xx + (b[1] - a * b[0])
    b = clf.support_vectors_[-1]
    yy_up = a * xx + (b[1] - a * b[0])

    # plot the line, the points, and the nearest vectors to the plane
    # plt.plot(xx, yy, 'k-')
    plt.text(0, 10, "Y=+1", fontsize=12)
    plt.text(10, 0, "Y=-1", fontsize=12)
    plt.plot(xx, yy_down, 'k--')
    plt.plot(xx, yy_up, 'k--')
    plt.plot(xx,yy,color="black", label ="svm decision boundary")

    clf_lda = LinearDiscriminantAnalysis()
    clf_lda.fit(X, Y)
    w = clf_lda.coef_[0]
    a = -w[0]/w[1]
    print "Weights LDA W0=%.2f and W1=%.2f"%(w[0], w[1])
    xx = np.linspace(-12, 34)
    yy = a*xx-clf_lda.intercept_[0]/w[1]
    plt.plot(xx,yy, color="blue", label ="LDA decision boundary")

    plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=80, color='b')
    plt.scatter(X[:, 0], X[:, 1], c=Y)
    plt.xlabel('X1', fontsize=18)
    plt.ylabel('X2', fontsize=16)
    plt.axis('tight')
    plt.legend()
    plt.show()
Example #26
    def train_DA(self, X, y, lda_comp, qda_reg):
        '''
        Input: 
            qda_reg - reg_param
            lda_comp - n_components
            X - data matrix (train_num, feat_num)
            y - target labels matrix (train_num, label_num)

        Output: 
            best_clf - best classifier trained (QDA/LDA)
            best_score - CV score of best classifier

        Find best DA classifier.
        '''
        n_samples, n_feat = X.shape
        cv_folds = 10
        kf = KFold(n_samples, cv_folds, shuffle=False)

        
        
        lda = LinearDiscriminantAnalysis(n_components = lda_comp)
        qda = QuadraticDiscriminantAnalysis(reg_param = qda_reg)
        score_total_lda = 0 #running total of metric score over all cv runs
        score_total_qda = 0 #running total of metric score over all cv runs
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            
            lda.fit(X_train, y_train)
            cv_pred_lda = lda.predict(X_test)
            score_lda = eval(self.metric + '(y_test[:,None], cv_pred_lda[:,None], "' + self.task + '")')
            score_total_lda += score_lda
            
            qda.fit(X_train,y_train)
            cv_pred_qda = qda.predict(X_test)
            score_qda = eval(self.metric + '(y_test[:,None], cv_pred_qda[:,None], "' + self.task + '")')
            score_total_qda += score_qda

        score_lda = score_total_lda/cv_folds
        score_qda = score_total_qda/cv_folds
        
        # We keep the best one
        if(score_qda > score_lda):
            qda.fit(X,y)
            return qda, score_qda
        else:
            lda.fit(X,y)
            return lda, score_lda
Example #27
def plot_lda(features, labels):
    """
    Input
        features: features to get LDA and plot
        labels: labels of features
    Description
        plots the LDA of features
    """
    lda = LinearDiscriminantAnalysis(n_components=2)
    new_features = lda.fit(features, labels).transform(features)

    colors = list("rgbykrgbyk")
    markers = list("xxxxxooooo")

    plt.figure(len(genres)) # for all together
    for i, genre in enumerate(genres):
        plt.figure(i) # for one particular genre
        plt.scatter(new_features[i*num_songs:(i+1)*num_songs, 0],
                    new_features[i*num_songs:(i+1)*num_songs, 1],
                    c=colors[i], marker=markers[i], label=genre)
        plt.title(genre)

        plt.figure(len(genres)) # for all together
        plt.scatter(new_features[i*num_songs:(i+1)*num_songs, 0],
                    new_features[i*num_songs:(i+1)*num_songs, 1],
                    c=colors[i], marker=markers[i], label=genre)
    plt.legend()
    plt.title('LDA')
    plt.show()
Example #28
    def test(self):
        iris = datasets.load_iris()
        
        X = iris.data
        y = iris.target
        target_names = iris.target_names
        
        pca = PCA(n_components=3)
        X_r = pca.fit(X).transform(X)
        
        lda = LinearDiscriminantAnalysis(n_components=2)  # iris has 3 classes, so at most 2 components
        X_r2 = lda.fit(X, y).transform(X)
        
        # Percentage of variance explained for each components
        print('explained variance ratio (first three components): %s'
              % str(pca.explained_variance_ratio_))

        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
            ax.scatter(X_r[y == i, 0], X_r[y == i, 1], zs=X[y == i, 2], c=c, label=target_name)
        plt.legend()
        plt.title('PCA of IRIS dataset')
            
        fig2 = plt.figure()
        ax = fig2.add_subplot(111, projection='3d')
        for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
            ax.scatter(X_r2[y == i, 0], X_r2[y == i, 1], zs=X[y == i, 2], c=c, label=target_name)
        plt.legend()
        plt.title('LDA of IRIS dataset')
            
        plt.show()
def visualize_lda2D(X,y):
	"""
	Visualize the separation between classes using the two most discriminant features

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	"""
	labels=['Paid','Default']
	lda = LDA(n_components = 2,solver='eigen')
	# lda = LDA(n_components = 2)
	discriminative_attributes = lda.fit(X, y).transform(X)

	palette = sea.color_palette()
	# plt.plot(discriminative_attributes[:,0][y==0],'sg',label="Paid", alpha=0.5)
	# plt.plot(discriminative_attributes[:,0][y==1],'^r',label="Default", alpha=0.5)
	plt.scatter(discriminative_attributes[:,0][y==0],discriminative_attributes[:,1][y==0],marker='s',color='green',label="Paid", alpha=0.5)
	plt.scatter(discriminative_attributes[:,0][y==1],discriminative_attributes[:,1][y==1],marker='^',color='red',label="Default", alpha=0.5)
	plt.xlabel('First Linear Discriminant')
	plt.ylabel('Second Linear Discriminant')

	leg = plt.legend(loc='upper right', fancybox=True)
	leg.get_frame().set_alpha(0.5)
	plt.title("Linear Discriminant Analysis")
	plt.tight_layout()

	#save fig
	output_dir='img'
	save_fig(output_dir,'{}/lda.png'.format(output_dir))
Example #30
    def tuneSpatialFilters(self):

        print(colors.MAGENTA)
        num_total_spatial_filters = self.all_spatial_filters.shape[0]

        best_mean = 0
        best_num = 0
        best_score = None

        for i in range(num_total_spatial_filters):

            num_filters_to_try = i+1
            print "trying with first",num_filters_to_try,"spatial filters"
            trial_X = self.extractFeatures(self.epochs, self.all_spatial_filters[:num_filters_to_try])
            lda = LinearDiscriminantAnalysis()
            lda = lda.fit(trial_X, self.y)
            cross_validation_folds = 10
            xval = cross_val_score(lda, trial_X, self.y, cv=cross_validation_folds)
            # print(xval)
            this_mean = xval.mean()
            print("mean", this_mean)
            if this_mean > best_mean:
                best_mean = this_mean
                best_num = num_filters_to_try
                best_score = xval

        print "-----------------------------"
        print "best mean was", best_mean, "with", best_num, "filters used"
        print best_score

        print colors.ENDC
#evaluate created models
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    cv_results = cross_val_score(model,
                                 X_train,
                                 Y_train,
                                 cv=kfold,
                                 scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s : %f (%f)' % (name, cv_results.mean(), cv_results.std()))

#compare models

pyplot.boxplot(results, labels=names)
pyplot.title('Algorithm Comparison')
pyplot.show()

#make predictions on LDA
model = LinearDiscriminantAnalysis()
model.fit(X_train, Y_train)
pred = model.predict(X_validation)

#evaluate predictions
print(accuracy_score(Y_validation, pred))
print(confusion_matrix(Y_validation, pred))
print(classification_report(Y_validation, pred))
Example #32
def main(elec):
    """feature selection and permutations.

    For each separation of subjects with leave 2 subjects out, we train on the
    big set (feature selection) and test on the two remaining subjects.
    for each permutation, we just permute the labels at the trial level (we
    could use permutations at the subject level, but we wouldn't get as many
    permutations)
    """
    final_data = None

    print(STATE, elec)
    results_file_path = (
        SAVE_PATH / "results" /
        "EFS_NoGamma_{}_{}_{}_{:.2f}.mat".format(STATE, elec, WINDOW, OVERLAP))
    if not results_file_path.isfile():
        for freq in FREQS:
            data_file_path = SAVE_PATH / "PSD_{}_{}_{}_{}_{:.2f}.mat".format(
                STATE, freq, elec, WINDOW, OVERLAP)

            data = loadmat(data_file_path)["data"].ravel()
            if final_data is None:
                final_data = data
            else:
                for i, submat in enumerate(final_data):
                    final_data[i] = np.concatenate((submat, data[i]), axis=0)

        final_data = np.array(list(map(np.transpose, final_data)))

        lil_labels = [0] * 18 + [1] * 18
        lil_labels = np.asarray(lil_labels)
        lil_groups = list(range(36))
        sl2go = StratifiedLeave2GroupsOut()

        best_freqs = []
        pvalues, pscores = [], []
        test_scores, best_scores = [], []
        for train_subjects, test_subjects in sl2go.split(
                final_data, lil_labels, lil_groups):

            x_train, x_test = final_data[train_subjects], final_data[
                test_subjects]
            y_train, y_test = lil_labels[train_subjects], lil_labels[
                test_subjects]

            y_train = [[label] * len(x_train[i])
                       for i, label in enumerate(y_train)]
            y_train, groups = create_groups(y_train)
            x_train = np.concatenate(x_train[:], axis=0)

            nested_sl2go = StratifiedLeave2GroupsOut()
            clf = LDA()
            f_select = EFS(estimator=clf,
                           max_features=5,
                           cv=nested_sl2go,
                           n_jobs=-1)

            f_select = f_select.fit(x_train, y_train, groups)

            best_idx = f_select.best_idx_
            best_freqs.append(list(FREQS[list(best_idx)]))
            best_scores.append(f_select.best_score_)

            test_clf = LDA()
            test_clf.fit(x_train[:, best_idx], y_train)
            y_test = [[label] * len(x_test[i])
                      for i, label in enumerate(y_test)]
            y_test, groups = create_groups(y_test)
            x_test = np.concatenate(x_test[:], axis=0)
            test_score = test_clf.score(x_test[:, best_idx], y_test)
            test_scores.append(test_score)

            if PERM:
                pscores_cv = []
                for _ in range(N_PERM):
                    y_train = np.random.permutation(y_train)
                    y_test = np.random.permutation(y_test)

                    clf = LDA()
                    clf.fit(x_train[:, best_idx], y_train)
                    pscore = clf.score(x_test[:, best_idx], y_test)
                    pscores_cv.append(pscore)

                pvalue = compute_pval(test_score, pscores_cv)
                pvalues.append(pvalue)
                pscores.append(pscores_cv)

        score = np.mean(test_scores)
        data = {
            "score": score,
            "train_scores": best_scores,
            "test_scores": test_scores,
            "freqs": best_freqs,
            "pvalue": pvalues,
            "pscores": pscores,
        }

        savemat(results_file_path, data)
Example #33
### MAIN

class BertArgs:
  def __init__(self):
    self.bert_model_dir = "/content/drive/My Drive/DMT_HW3/bert/cased_L-12_H-768_A-12"
    self.bert_model_name = "bert-base-cased"
    self.bert_vocab_name = "vocab.txt"

args = BertArgs()
model = Bert(args)

# split train and dev data
train_file = '/content/drive/My Drive/DMT_HW3/train_fever.json'
dev_file = '/content/drive/My Drive/DMT_HW3/dev_fever.json'
data_train, label_train, map_idx_to_id_train = data_label_split(train_file, model)
data_dev, label_dev, map_idx_to_id_dev = data_label_split(dev_file, model)

# train model
clf = LinearDiscriminantAnalysis(solver='svd')
# fit model
clf.fit(data_train, label_train)
# get acc
LDA_acc = clf.score(data_dev, label_dev) # result: 0.6763485477178424
print(LDA_acc)

# get predictions and write to a file
test_file = '/content/drive/My Drive/DMT_HW3/singletoken_test_fever_homework_NLP.jsonl'
data_test, no_label_test, map_idx_to_id = data_label_split(test_file, model, train_set=False)
label_test = clf.predict(data_test)
write_test_prediction('final_test_pred.jsonl',map_idx_to_id,label_test)
plt.show()

# 1. PCA       
X=data
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X)
X_new = pca.transform(X)
plt.scatter(X_new[:, 0], X_new[:, 1],marker='o',c=y)
plt.show()

# 2. LDA
y=data[:,0]
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=2)
lda.fit(X,y)
X_new = lda.transform(X)
plt.scatter(X_new[:, 0], X_new[:, 1],marker='o',c=y)
plt.show()

# 3. KPCA
from sklearn.decomposition import KernelPCA
transformer = KernelPCA(n_components=7, kernel='rbf')
X_transformed = transformer.fit_transform(X)
X_transformed.shape
plt.scatter(X_transformed[:, 0], X_transformed[:, 1],marker='o',c=y)
plt.show()

# 4. Isomap
from sklearn.manifold import Isomap
embedding = Isomap(n_components=2)
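The Isomap block is cut off above; following the pattern of the PCA/LDA/KPCA blocks, the missing lines would presumably be along these lines:

X_transformed = embedding.fit_transform(X)
plt.scatter(X_transformed[:, 0], X_transformed[:, 1], marker='o', c=y)
plt.show()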
Example #35
df_2018 = df_2018.dropna()
df_2018['Week_Number'] = range(1, len(df_2018['Week_Number']) + 1)

X_2018 = df_2018[['mu', 'sd']].values
Y_2018 = df_2018['Label'].values

scaler = StandardScaler()
scaler.fit(X_2017)
X_2017 = scaler.transform(X_2017)
scaler.fit(X_2018)
X_2018 = scaler.transform(X_2018)

# Question 1

lda_classifier = LDA(n_components=2)
lda_classifier.fit(X_2017, Y_2017)

print()
print('Question 1\n')
print(
    f'Equation for linear classifier: ({lda_classifier.coef_[0][0]})x1 + ({lda_classifier.coef_[0][1]})x2 + ({lda_classifier.intercept_[0]}) = 0'
)

qda_classifier = QDA()
qda_classifier.fit(X_2017, Y_2017)

# Question 2
lda_accuracy = lda_classifier.score(X_2018, Y_2018)
qda_accuracy = qda_classifier.score(X_2018, Y_2018)
print()
print('Question 2\n')
Example #36
data = master_data[:, 0:300].T
classes = master_labels[0:300]

# Splits the data into a training set and randomized test set with accompanying labels
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    classes,
                                                    test_size=0.2)

# Scales the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Perform the LDA with one component
lda = LDA(n_components=1)
lda.fit(X_train, y_train)
result = lda.score(X_test, y_test)
print('Score: ' + str(result))
# X_test = lda.transform(X_test)

#%% Test with SVM
from sklearn import svm
# Splits the data into a training set and randomized test set with accompanying labels
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    classes,
                                                    test_size=0.2,
                                                    random_state=0)

# Scales the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
Example #37
def main():
    random.seed(64)
    pop = toolbox.population(n=100)
    
    CXPB, MUTPB, NGEN = 0.5, 0.2, 300
    
    print("Start of evolution")
    
    fitnesses = list(map(toolbox.evaluate, pop))
    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit
    
    print("  Evaluated %i individuals" % len(pop))
    
    for g in range(NGEN):
        print("-- Generation %i --" % g)
        
        offspring = toolbox.select(pop, len(pop))
        
        offspring = list(map(toolbox.clone, offspring))
    
        
        for child1, child2 in zip(offspring[::2], offspring[1::2]):

            if random.random() < CXPB:
                toolbox.mate(child1, child2)
                
                del child1.fitness.values
                del child2.fitness.values

        for mutant in offspring:

            if random.random() < MUTPB:
                toolbox.mutate(mutant)
                del mutant.fitness.values
    
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
        
        print("  Evaluated %i individuals" % len(invalid_ind))
        
        pop[:] = offspring
        
        fits = [ind.fitness.values[0] for ind in pop]
        
        length = len(pop)
        mean = sum(fits) / length
        sum2 = sum(x*x for x in fits)
        std = abs(sum2 / length - mean**2)**0.5
        
        print("  Min %s" % min(fits))
        print("  Max %s" % max(fits))
        print("  Avg %s" % mean)
        print("  Std %s" % std)
        print ('Total Time is  ' + str(time.time()-start_time) + ' seconds.')
    
    print("-- End of (successful) evolution --")
    
    best_ind = tools.selBest(pop, 1)[0]
    print("Best individual is %s, %s" % (best_ind, best_ind.fitness.values))
    
    new_features = []
    for i in range(0,len(best_ind)-1):
        if best_ind[i] == 1:
            new_features.append(i)
    ##############################################################
    ##############################################################
    ##############################################################
    ####Ensemble Learning
    print('#####################################################')
    new_X_train = Xtrain[new_features] 
    new_Y_train = Ytrain

    new_X_test = Xtest[new_features]
    new_Y_test = Ytest

    clf = LinearDiscriminantAnalysis()
    sam = RandomUnderSampler(random_state=42)
    new_X_train, new_Y_train = sam.fit_sample(new_X_train, new_Y_train)
    y_pred = clf.fit(new_X_train, new_Y_train).predict(new_X_test)
    f = open('sampler_GA.txt','w')
    print(classification_report(Ytest,y_pred))
    print(roc_auc_score(Ytest,y_pred))
    f.write(classification_report(Ytest,y_pred))
    f.write('\n')
    f.write('ROC = ' + str(roc_auc_score(Ytest,y_pred)))    
    f.write('\n')

    f.close()
skf = StratifiedKFold(n_splits=4)
acc_total = []
for train_index, test_index in skf.split(bag_instance_features, bag_labels):
    bag_train_labels, bag_train_features = bag_labels[train_index], bag_instance_features[train_index]
    bag_test_labels, bag_test_features = bag_labels[test_index], bag_instance_features[test_index]

    train_labels = []

    for i in range(0, len(bag_train_features)):
        if bag_train_labels[i] == 1:
            train_labels.append(np.ones(len(bag_train_features[i]), dtype=int))
        else:
            train_labels.append(np.zeros(len(bag_train_features[i]), dtype=int))

    train_labels = np.concatenate(train_labels)
    train_features = np.vstack(bag_train_features)
    lda = LinearDiscriminantAnalysis()
    lda.fit(train_features, train_labels)

    predict_label = []

    for i in range(0, len(bag_test_labels)):
        pred = lda.predict(bag_test_features[i])
        predict_label.append(combineinstlabels(pred))

    acc = accuracy_score(bag_test_labels, predict_label)
    acc_total.append(acc)


print(np.mean(acc_total))
Example #39
for i in range(len(column_index)):
    c = []
    a = X[column_index[i]]
    for j in range(len(a)):
        if a[j] != 0:
            c.append(1 / a[j])
        else:
            c.append(100000)
    name = '1/' + column_index[i]
    X.insert(dim, '%s' % name, c)
    dim = dim + 1
print("第二次升维:", X.shape)

lda = LinearDiscriminantAnalysis(n_components=4)
lda.fit(X, Y)
X = lda.transform(X)

# clf = ExtraTreesClassifier()
# X_new = clf.fit(X, Y)

#print(clf.feature_importances_ )

# from sklearn.feature_selection import SelectKBest, chi2  # feature values in X must be non-negative
# X_new=SelectKBest(chi2,k=2).fit_transform(X,Y)

# pca=PCA(n_components=10)
# pca.fit(X)
# X=pca.transform(X)  # dimensionality reduction with PCA

print("LDA降维后:", X.shape)
Example #40
def test_lda():
    # Load data
    X, y = load_iris(return_X_y=True)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=4242)

    # Create and fit model
    model = LinearDiscriminantAnalysis(store_covariance=True)
    model.fit(X_train, y_train)

    # Select data point for explaining its prediction
    x_orig = X_test[1:4][0, :]
    assert model.predict([x_orig]) == 2

    # Compute counterfactual
    features_whitelist = None

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization="l1",
        optimizer="mp",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        1,
        features_whitelist=features_whitelist,
        regularization="l1",
        optimizer="mp",
        return_as_dict=False)
    assert y_cf == 1
    assert model.predict(np.array([x_cf])) == 1

    cf = generate_counterfactual(model,
                                 x_orig,
                                 0,
                                 features_whitelist=features_whitelist,
                                 regularization="l1",
                                 optimizer="mp",
                                 return_as_dict=True)
    assert cf["y_cf"] == 0
    assert model.predict(np.array([cf["x_cf"]])) == 0

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization="l2",
        optimizer="mp",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization="l1",
        C=1.0,
        optimizer="bfgs",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization="l1",
        C=1.0,
        optimizer="nelder-mead",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization=None,
        optimizer="bfgs",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization=None,
        optimizer="nelder-mead",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0

    features_whitelist = [0, 1, 2]
    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization="l1",
        optimizer="mp",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0
    assert all([
        True if i in features_whitelist else delta[i] <= 10e-5
        for i in range(x_orig.shape[0])
    ])

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization="l2",
        optimizer="mp",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0
    assert all([
        True if i in features_whitelist else delta[i] <= 10e-5
        for i in range(x_orig.shape[0])
    ])

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization="l1",
        C=1.0,
        optimizer="bfgs",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0
    assert all([
        True if i in features_whitelist else delta[i] == 0.
        for i in range(x_orig.shape[0])
    ])

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization="l1",
        C=1.0,
        optimizer="nelder-mead",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0
    assert all([
        True if i in features_whitelist else delta[i] == 0.
        for i in range(x_orig.shape[0])
    ])

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization=None,
        optimizer="bfgs",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0
    assert all([
        True if i in features_whitelist else delta[i] == 0.
        for i in range(x_orig.shape[0])
    ])

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization=None,
        optimizer="nelder-mead",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0
    assert all([
        True if i in features_whitelist else delta[i] == 0.
        for i in range(x_orig.shape[0])
    ])

    # Other stuff
    from ceml.sklearn import LdaCounterfactual
    with pytest.raises(TypeError):
        LdaCounterfactual(sklearn.linear_model.LogisticRegression())

    model = LinearDiscriminantAnalysis()
    model.fit(X_train, y_train)
    with pytest.raises(AttributeError):
        LdaCounterfactual(model)
    with pytest.raises(AttributeError):
        generate_counterfactual(model, x_orig, 0)
Example #41
y = np.array(y)

np.save('X_data', X)
np.save('y_data', y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# make classifiers
LDA = LinearDiscriminantAnalysis()
SVM = SVC()
SVM_RBF = SVC(kernel='rbf', gamma='scale')
LogReg = LogisticRegression()
RFC = RandomForestClassifier(n_estimators=100, max_depth=100)

# fit classifiers
LDA.fit(X_train, y_train)
SVM.fit(X_train, y_train)
SVM_RBF.fit(X_train, y_train)
LogReg.fit(X_train, y_train)
RFC.fit(X_train, y_train)

# predict results
prediction_LDA = LDA.predict(X_test)
prediction_SVM = SVM.predict(X_test)
prediction_SVM_RBF = SVM_RBF.predict(X_test)
prediction_LogReg = LogReg.predict(X_test)
prediction_RFC = RFC.predict(X_test)

# accuracy score of classifiers
acc_LDA = accuracy_score(y_test, prediction_LDA)
acc_SVM = accuracy_score(y_test, prediction_SVM)
def plot_lda_cov(lda, splot):
    plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red')
    plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue')


def plot_qda_cov(qda, splot):
    plot_ellipse(splot, qda.means_[0], qda.covariance_[0], 'red')
    plot_ellipse(splot, qda.means_[1], qda.covariance_[1], 'blue')


plt.figure(figsize=(10, 8), facecolor='white')
plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis',
             y=0.98, fontsize=15)
for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
    # Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # Quadratic Discriminant Analysis
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)
    y_pred = qda.fit(X, y).predict(X)
    splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
    plot_qda_cov(qda, splot)
    plt.axis('tight')
plt.tight_layout()
plt.subplots_adjust(top=0.92)
plt.show()
plt.semilogx(C, testScore, label='test accuracy')
plt.semilogx(C, trainScore, label='train accuracy')
plt.legend()
plt.ylabel("Accuracy")
plt.xlabel("C")
plt.title('Logistic: training/test data accuracy over different C')
plt.show()

#4.2 LinearDiscriminant classifier
print('-------------------LinearDiscriminant classifier-------------------')
lda = LinearDiscriminantAnalysis()
trainScore = []
testScore = []
for i in range(nmc):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    trainFit = lda.fit(X_train, y_train)
    trainScore.append(trainFit.score(X_train, y_train))
    testScore.append(trainFit.score(X_test, y_test))
print('test accuracy {0:4.4f}, train accuracy {1:4.4f}'.format(
    np.mean(testScore), np.mean(trainScore)))

#4.3 Quadratic Discriminant classifier
print('-------------------Quadratic Discriminant classifier-------------------')
qda = QuadraticDiscriminantAnalysis()
trainScore = []
testScore = []
for i in range(nmc):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    trainFit = qda.fit(X_train, y_train)
    trainScore.append(trainFit.score(X_train, y_train))
    testScore.append(trainFit.score(X_test, y_test))
Example #44
def old_feature_merging(mode=['CSP', 'TDP'],
                        key_name=['csp', 'tdp'],
                        cls='lsvm'):
    import os
    if os.path.isfile('result_files/merging_original_/result_' + mode[0] +
                      '&' + mode[1] + '_' + cls + '_none.csv'):
        print(mode[0] + '&' + mode[1] + '_' + cls + ' already done.')
        return
    path1 = 'E:/Richard/MultiData/' + mode[0] + '/'
    path2 = 'E:/Richard/MultiData/' + mode[1] + '/'

    files = os.listdir(path1)
    for file in files:
        try:
            data1 = scipy.io.loadmat(path1 + file)[key_name[0]][0][0]
            data2 = scipy.io.loadmat(path2 + file)[key_name[1]][0][0]
        except:
            continue
        train_x1 = data1[0]
        train_y1 = data1[1]
        test_x1 = data1[2]
        test_y1 = data1[3]
        train_x2 = data2[0]
        train_y2 = data2[1]
        test_x2 = data2[2]
        test_y2 = data2[3]
        for i in range(5):

            tx1 = np.transpose(train_x1[i])
            tx2 = np.transpose(train_x2[i])
            ty = np.transpose(train_y1[i]).argmax(axis=1)
            vx1 = np.transpose(test_x1[i])
            vx2 = np.transpose(test_x2[i])
            vy = np.transpose(test_y1[i]).argmax(axis=1)

            if mode[0] == 'PSD':
                max_value = tx1.max()
                tx1 = tx1 / max_value
                vx1 = vx1 / max_value
            if mode[1] == 'PSD':
                max_value = tx2.max()
                tx2 = tx2 / max_value
                vx2 = vx2 / max_value
            tx = np.concatenate((tx1, tx2), axis=1)
            vx = np.concatenate((vx1, vx2), axis=1)
            from sklearn import svm, linear_model
            from sklearn import ensemble
            if cls == 'lsvm': lda = svm.LinearSVC()
            elif cls == 'ksvm': lda = svm.SVC(kernel='linear')
            elif cls == 'gb': lda = ensemble.GradientBoostingClassifier()
            elif cls == 'srlda':
                lda = LinearDiscriminantAnalysis(solver='lsqr',
                                                 shrinkage='auto')
            lda.fit(tx, ty)
            y_predict = lda.predict(vx)
            coh = cohen_kappa_score(vy, y_predict)
            acc = accuracy_score(vy, y_predict)
            pen = open(
                'result_files/merging_original_/result_' + mode[0] + '&' +
                mode[1] + '_' + cls + '_none.csv', 'a')
            pen.write(file + ',' + str(i) + ',' + str(coh) + ',' + str(acc) +
                      '\n')
            pen.close()
Example #45
def _initialize_components(n_components,
                           input,
                           y=None,
                           init='auto',
                           verbose=False,
                           random_state=None,
                           has_classes=True):
    """Returns the initial transformation to be used depending on the arguments.

  Parameters
  ----------
  n_components : int
    The number of components to take. (Note: it should have been checked
    before, meaning it should not be None and it should be a value in
    [1, X.shape[1]])

  input : array-like
    The input samples (can be tuples or regular samples).

  y : array-like or None
    The input labels (or not if there are no labels).

  init : string or numpy array, optional (default='auto')
    Initialization of the linear transformation. Possible options are
    'auto', 'pca', 'lda', 'identity', 'random', and a numpy array of shape
    (n_features_a, n_features_b).

    'auto'
      Depending on ``n_components``, the most reasonable initialization
      will be chosen. If ``n_components <= n_classes`` we use 'lda' (see
      the description of 'lda' init), as it uses labels information. If
      not, but ``n_components < min(n_features, n_samples)``, we use 'pca',
      as it projects data onto meaningful directions (those of higher
      variance). Otherwise, we just use 'identity'.

    'pca'
      ``n_components`` principal components of the inputs passed
      to :meth:`fit` will be used to initialize the transformation.
      (See `sklearn.decomposition.PCA`)

    'lda'
      ``min(n_components, n_classes)`` most discriminative
      components of the inputs passed to :meth:`fit` will be used to
      initialize the transformation. (If ``n_components > n_classes``,
      the rest of the components will be zero.) (See
      `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`).
      This initialization is possible only if `has_classes == True`.

    'identity'
      The identity matrix. If ``n_components`` is strictly smaller than the
      dimensionality of the inputs passed to :meth:`fit`, the identity
      matrix will be truncated to the first ``n_components`` rows.

    'random'
      The initial transformation will be a random array of shape
      `(n_components, n_features)`. Each value is sampled from the
      standard normal distribution.

    numpy array
      n_features_b must match the dimensionality of the inputs passed to
      :meth:`fit` and n_features_a must be less than or equal to that.
      If ``n_components`` is not None, n_features_a must match it.

  verbose : bool
    Whether to print the details of the initialization or not.

  random_state : int or `numpy.RandomState` or None, optional (default=None)
    A pseudo random number generator object or a seed for it if int. If
    ``init='random'``, ``random_state`` is used to initialize the random
    transformation. If ``init='pca'``, ``random_state`` is passed as an
    argument to PCA when initializing the transformation.

  has_classes : bool (default=True)
    Whether the labels are in fact classes. If true, the 'lda' initialization
    is allowed.

  Returns
  -------
  init_components : `numpy.ndarray`
    The initial transformation to use.
  """
    # if we are doing a regression we cannot use lda:
    n_features = input.shape[-1]
    authorized_inits = ['auto', 'pca', 'identity', 'random']
    if has_classes:
        authorized_inits.append('lda')

    if isinstance(init, np.ndarray):
        # copy the array so that later updates to the metric do not also
        # modify the user-provided init
        init = check_array(init, copy=True)

        # Assert that init.shape[1] = X.shape[1]
        if init.shape[1] != n_features:
            raise ValueError(
                'The input dimensionality ({}) of the given '
                'linear transformation `init` must match the '
                'dimensionality of the given inputs `X` ({}).'.format(
                    init.shape[1], n_features))

        # Assert that init.shape[0] <= init.shape[1]
        if init.shape[0] > init.shape[1]:
            raise ValueError(
                'The output dimensionality ({}) of the given '
                'linear transformation `init` cannot be '
                'greater than its input dimensionality ({}).'.format(
                    init.shape[0], init.shape[1]))

        # Assert that self.n_components = init.shape[0]
        if n_components != init.shape[0]:
            raise ValueError('The preferred dimensionality of the '
                             'projected space `n_components` ({}) does'
                             ' not match the output dimensionality of '
                             'the given linear transformation '
                             '`init` ({})!'.format(n_components,
                                                   init.shape[0]))
    elif init not in authorized_inits:
        raise ValueError(
            "`init` must be '{}' "
            "or a numpy array of shape (n_components, n_features).".format(
                "', '".join(authorized_inits)))

    random_state = check_random_state(random_state)
    if isinstance(init, np.ndarray):
        return init
    n_samples = input.shape[0]
    if init == 'auto':
        if has_classes:
            n_classes = len(np.unique(y))
        else:
            n_classes = -1
        init = _auto_select_init(has_classes, n_features, n_samples,
                                 n_components, n_classes)
    if init == 'identity':
        return np.eye(n_components, input.shape[-1])
    elif init == 'random':
        return random_state.randn(n_components, input.shape[-1])
    elif init in {'pca', 'lda'}:
        init_time = time.time()
        if init == 'pca':
            pca = PCA(n_components=n_components, random_state=random_state)
            if verbose:
                print('Finding principal components... ')
                sys.stdout.flush()
            pca.fit(input)
            transformation = pca.components_
        elif init == 'lda':
            lda = LinearDiscriminantAnalysis(n_components=n_components)
            if verbose:
                print('Finding most discriminative components... ')
                sys.stdout.flush()
            lda.fit(input, y)
            transformation = lda.scalings_.T[:n_components]
        if verbose:
            print('done in {:5.2f}s'.format(time.time() - init_time))
        return transformation
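

# Hedged sketch: `_auto_select_init` is called above but its body is not shown in this
# listing. Based on the 'auto' rule described in the docstring, a plausible body could
# look roughly like this (the real library's exact tie-breaking may differ):
def _auto_select_init(has_classes, n_features, n_samples, n_components, n_classes):
    if has_classes and n_components <= n_classes:
        # labels are available and few components are requested: use the discriminative init
        return 'lda'
    elif n_components < min(n_features, n_samples):
        # project onto the directions of highest variance
        return 'pca'
    else:
        return 'identity'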
    clf1.fit(X_train,y_train)
    pred = clf1.predict(X_test)
    print (accuracy_score(pred,y_test))
    showCON('DTC', clf1, X_test, y_test)
    
    clf2 = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial').fit(X_train,y_train)
    pred = clf2.predict(X_test)
    print (accuracy_score(pred,y_test))
    showCON('LR', clf2, X_test, y_test)
    
    clf3 = LinearDiscriminantAnalysis()
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    y_train = np.asarray(y_train)
    y_test = np.asarray(y_test)
    clf3.fit(X_train,y_train)
    clf3_score = clf3.score(X_test, y_test)
    print (clf3_score)
    showCON('LDA', clf3, X_test, y_test)
      
    clf4 = KNeighborsClassifier(n_neighbors=4)
    clf4.fit(X_train,y_train)
    clf4_score = clf4.score(X_test, y_test)
    print (clf4_score)
    showCON('KNN(K=4)', clf4, X_test, y_test)


    clf5 = tree.DecisionTreeClassifier()
    clf5 = clf5.fit(X_train,y_train)
    pred = clf5.predict(X_test)
    print (accuracy_score(pred,y_test))
def main():
    start = time.time()

    Data_path = "C:\\Users\\user\\Desktop\\Drone\\LDA\\Data\\"
    eegData_txt = Data_path + 'eegData.out'
    stims_txt = Data_path + 'stims.out'
    moveData_eeg = 'C:\\Users\\user\\Desktop\\Drone\\LDA\\Training\\eegData\\'
    moveData_stims = 'C:\\Users\\user\\Desktop\\Drone\\LDA\\Training\\stims\\'
    ##Generate Preprocessing Training data
    ctime = datetime.today().strftime("%m%d_%H%M")
    Classifier_path = 'C:/Users/user/Desktop/Drone/LDA/Model/' + ctime + 'Classifier.pickle'
    channelNum = 7
    samplingFreq = 300

    # Busy-wait until the EEG data file appears on disk
    while True:
        if os.path.isfile(eegData_txt):
            break

    # Busy-wait until both the EEG data and the stimulus files are available
    while True:
        if os.path.isfile(eegData_txt) and os.path.isfile(stims_txt):
            eegData = np.loadtxt(eegData_txt, delimiter=",")
            stims = np.loadtxt(stims_txt, delimiter=",")
            ctime = datetime.today().strftime("%m%d_%H%M%S")
            moveData_e = moveData_eeg + ctime + 'eegData.out'
            moveData_s = moveData_stims + ctime + 'stims.out'
            shutil.move(eegData_txt, moveData_e)
            shutil.move(stims_txt, moveData_s)
            break

    print("got process")

    ##Preprocessing process

    #Bandpass Filter
    eegData = butter_bandpass_filter(eegData, 0.1, 30, samplingFreq, order=4)

    #Epoching
    epochSampleNum = int(np.floor(1.0 * samplingFreq))
    offset = int(np.floor(0.0 * samplingFreq))
    baseline = int(np.floor(1.0 * samplingFreq))
    [EpochsT, NumT] = Epoching(eegData, stims, 1, samplingFreq, channelNum,
                               epochSampleNum, offset, baseline)
    [EpochsN, NumN] = Epoching(eegData, stims, 0, samplingFreq, channelNum,
                               epochSampleNum, offset, baseline)

    EpochsN_New = np.zeros((NumT, channelNum, epochSampleNum))
    NumN = NumT
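    # average the non-target epochs in groups of 5 so both classes end up with NumT trials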
    for i in range(NumN):
        EpochsN_New[i, :, :] = np.mean(EpochsN[i * 5:i * 5 + 5, :, :], axis=0)

    resampleRate = 100

    #Convert to feature vector
    [EpochsT_Aver, NumT_Aver, EpochsN_Aver,
     NumN_Aver] = Make_Average_Component(EpochsT, NumT, EpochsN_New, NumT,
                                         channelNum, epochSampleNum, 20)
    EpochsT_Aver = resampling(EpochsT_Aver, NumT_Aver, resampleRate,
                              channelNum)
    EpochsN_Aver = resampling(EpochsN_Aver, NumN_Aver, resampleRate,
                              channelNum)

    featureNum = channelNum * resampleRate
    [FeaturesT,
     FeaturesN] = Convert_to_featureVector(EpochsT_Aver, NumT_Aver,
                                           EpochsN_Aver, NumN_Aver, featureNum)
    TrainData = np.concatenate((FeaturesT, FeaturesN))
    TrainLabel = np.concatenate((np.ones(
        (NumT_Aver, 1)).astype(int), np.zeros(
            (NumN_Aver, 1)).astype(int))).ravel()

    #Saving LDA classifier
    lda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
    lda.fit(TrainData, TrainLabel)
    joblib.dump(lda, Classifier_path, protocol=2)

    print("time :", time.time() - start)
        # Start with a zero score for every (sample, class) pair.
        scores = np.zeros((x_tests.shape[0], class_label))
        # Compute a score for each class.
        for label in range(class_label):
            # logpdf gives the log of the class-conditional Gaussian density N(x; mu[label], sigma[label]).
            normal_distribution_prob = multivariate_normal(mean=self.mu[label], cov=self.sigma[label])
            # x_tests may contain several samples; score each one separately.
            for i, x in enumerate(x_tests):
                scores[i, label] = np.log(self.phi[label]) + normal_distribution_prob.logpdf(x)
        predictions = np.argmax(scores, axis=1)
        return predictions


if __name__ == '__main__':
    data = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target)
    gda = GaussianDiscriminantAnalysis()
    gda.fit(x_train, y_train)
    y_predict = gda.predict(x_test)
    score = f1_score(y_test, y_predict, average="weighted")
    print("f1 score of our model: ", score)

    # Test scikit learn model
    lda = LinearDiscriminantAnalysis()
    lda.fit(x_train, y_train)
    y_predict_sk = lda.predict(x_test)
    print("f1 score of scikit-learn model is: ", f1_score(y_test, y_predict_sk, average="weighted"))



    tslag["Lag_" + str(i + 1)] = tslag["Lag_" + str(i + 1)].pct_change()
tslag.fillna(0, inplace=True)
tslag["Direction"] = np.sign(tslag["returns"])

#Use the prior two days of returns as predictor values,
#with direction as the response: whether each lagged return
#is positive or negative is what informs the sign of the
#Direction label
X = tslag[["Lag_1", "Lag_2"]]
y = tslag["Direction"]

#Split into training and test sets at start_test
X_train = X[X.index < start_test]
X_test = X[X.index >= start_test]
y_train = y[y.index < start_test]
y_test = y[y.index >= start_test]

#Create dataframe of predictions
pred = pd.DataFrame(index=y_test.index)
lda = LDA()
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)

#Hit rate: fraction of days on which the predicted direction was correct
pred["Correct"] = (y_pred == y_test).astype(float)
hit_rate = pred["Correct"].mean()

print("Linear Discriminant Analysis {:.4f}".format(hit_rate))
"""
    Linear Discriminant Analysis 0.742
"""
    def predict(self, X):
        project = np.dot(X, self.w.T)

        w_u0 = np.dot(self.w, self.u0.T)
        w_u1 = np.dot(self.w, self.u1.T)

        # If the projected point is closer to w·u1 than to w·u0, classify it as class 1
        return (np.abs(project - w_u1) < np.abs(project - w_u0)).astype(int)


data = pd.read_csv('../3.3/watermelon3_0_Ch.csv').values
X = data[:, 7:9].astype(float)
y = data[:, 9]
y[y == '是'] = 1  # '是' (yes) -> positive class
y[y == '否'] = 0  # '否' (no) -> negative class
y = y.astype(int)
lda = LDA()
lda.fit(X, y)
y_predict = lda.predict(X).flatten()

clf = LinearDiscriminantAnalysis()
clf.fit(X, y)
y_clf_predict = clf.predict(X)
plt.figure(figsize=(15, 10))
plt.plot(np.arange(X.shape[0]), y, label='True')
plt.plot(np.arange(X.shape[0]), y_predict, label='Predict')
plt.plot(np.arange(X.shape[0]), y_clf_predict, label='Sklearn_Predict')
plt.legend()
plt.savefig('predict.png')
plt.show()
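
# Optional quick check (sketch, using the arrays defined above): compare the two
# models numerically in addition to the plot.
from sklearn.metrics import accuracy_score
print('custom LDA accuracy :', accuracy_score(y, y_predict))
print('sklearn LDA accuracy:', accuracy_score(y, y_clf_predict))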
Exemple #51
0
    plt.show()

# Zadanie 2:

# fetch_mldata was removed from scikit-learn; the OpenML copy of MNIST is the usual replacement
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
train, test, train_targets, test_targets = train_test_split(mnist.data,
                                                            mnist.target,
                                                            test_size=0.50,
                                                            random_state=42)

# Zadanie 3:
max_value = 0
max_number = 0
for i in range(1, 6):
    lda = LDA(n_components=i)
    lda_train = lda.fit(train, train_targets).transform(train)
    lda_test = lda.transform(test)  # transform the test set with the LDA fitted on training data only
    knn = KNeighborsClassifier(round(math.sqrt(mnist.data.shape[0])),
                               metric='euclidean',
                               weights='uniform')
    knn.fit(lda_train, train_targets)
    score = knn.score(lda_test, test_targets)
    print("Score for ", i, " components: ", score)
    if max_value < score:
        max_value = score
        max_number = i
print("Max for: ", max_number, " is: ", max_value)

# Zadanie 4:
max_value = 0
max_number = 0
for i in range(1, 6):
Exemple #52
0
                                     scoring='accuracy')

        results.append(cv_results)
        names.append(name)

        print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

    # Compare Algorithms
    # pyplot.boxplot(results, labels=names)
    # pyplot.title('Algorithm Comparison')
    # pyplot.show()

    # print([data['o'][len(data['c'])-2], data['l'][len(data['c'])-2], data['h'][len(data['c'])-2], data['c'][len(data['c'])-2]])
    #
    LDA = LinearDiscriminantAnalysis()
    LDA.fit(X, y)
    summ = 0
    print(co)
    i = 0
    print(
        LDA.predict([[
            data['l'][len(data['c']) - 9 + i],
            data['o'][len(data['c']) - 9 + i],
            data['c'][len(data['c']) - 9 + i],
            data['h'][len(data['c']) - 9 + i],
            data['l'][len(data['c']) - 8 + i],
            data['o'][len(data['c']) - 8 + i],
            data['c'][len(data['c']) - 8 + i],
            data['h'][len(data['c']) - 8 + i],
            data['l'][len(data['c']) - 7 + i],
            data['o'][len(data['c']) - 7 + i],
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    parser.add_option("-s", "--subset", help="One of 'COVID', 'NONCOVID'")
    parser.add_option("-l", "--log", action="store_true", help="Take log1")
    parser.add_option("-o", "--output_prefix", help="Output file")
    (options, args) = parser.parse_args()

    expr_f = args[0]
    meta_f = args[1]
    prefix = options.output_prefix

    expr_df = pd.read_csv(expr_f, sep='\t', index_col=0)
    meta_df = pd.read_csv(meta_f, sep='\t')
    meta_df = meta_df.set_index('Albany_sampleID')

    # Remove patients for whom the 28 days have not yet elapsed
    meta_df = meta_df.loc[meta_df['Hospital_free_days'].notnull()]
    no_expression_data = set(meta_df.index) - set(expr_df.columns)
    meta_df = meta_df.drop(no_expression_data)

    # Remove non-COVID patients if only the COVID subset was requested
    if options.subset == 'COVID':
        meta_df = meta_df.loc[meta_df['COVID'] == 1]

    # Get metadata
    hospital_free = np.array(meta_df['Hospital_free_days'])
    icu_status = []
    for is_icu in meta_df['ICU_1']:
        if is_icu == 1:
            icu_status.append('True')
        else:
            icu_status.append('False')
    covid_status = []
    for is_covid in meta_df['COVID']:
        if is_covid == 1:
            covid_status.append('True')
        else:
            covid_status.append('False')

    # Filter expression matrix according to metadata
    expr_df = expr_df[meta_df.index]
    X = np.array(expr_df)
    X = X.T
    if options.log:
        X = np.log(X + 1)

    mod = PCA(n_components=2)
    X_pca = mod.fit_transform(X)
    _plot_scatter(X_pca, 'PCA', hospital_free, 'Hospital Free Days',
                  '{}.PCA_hospital_free.pdf'.format(prefix))
    _plot_scatter_discrete(X_pca, 'PCA', icu_status, 'ICU Status',
                           '{}.PCA_icu_status.pdf'.format(prefix))
    if options.subset is None:
        _plot_scatter_discrete(X_pca, 'PCA', covid_status, 'COVID-19\nStatus',
                               '{}.PCA_covid_status.pdf'.format(prefix))

    for perp in [5, 6, 7, 8, 9, 10]:
        mod_tsne = TSNE(n_components=2, perplexity=perp)
        mod_pca_100 = PCA(n_components=min([100, len(X)]))
        X_pca_100 = mod_pca_100.fit_transform(X)
        print(X_pca_100.shape)
        print('Fitting t-SNE...')
        X_tsne = mod_tsne.fit_transform(X_pca_100)
        print('done.')
        _plot_scatter(X_tsne, 't-SNE', hospital_free, 'Hospital Free Days',
                      '{}.tSNE_perp_{}_hospital_free.pdf'.format(prefix, perp))
        _plot_scatter_discrete(
            X_tsne, 't-SNE', icu_status, 'ICU Status',
            '{}.tSNE_perp_{}_icu_status.pdf'.format(prefix, perp))
        if options.subset is None:
            _plot_scatter_discrete(
                X_tsne, 't-SNE', covid_status, 'COVID-19\nStatus',
                '{}.tSNE_perp_{}_covid_status.pdf'.format(prefix, perp))

    # Linear discriminant analysis for hospital-free days
    s_hospital_free = sorted(set(hospital_free))
    one_third = int(len(s_hospital_free) / 3)
    first_thresh = s_hospital_free[one_third]
    print(first_thresh)
    second_thresh = s_hospital_free[2 * one_third]
    discrete_y = []
    for hf in hospital_free:
        if hf < first_thresh:
            discrete_y.append(1)
        elif hf >= first_thresh and hf < second_thresh:
            discrete_y.append(2)
        elif hf >= second_thresh:
            discrete_y.append(3)
    print(discrete_y)
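    # (Sketch) The tertile discretization above could equivalently be written with
    # np.digitize, which maps values below, between, and above the two thresholds
    # to 0, 1 and 2 respectively:
    #   discrete_y = list(np.digitize(hospital_free, [first_thresh, second_thresh]) + 1)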
    lda = LinearDiscriminantAnalysis(n_components=2)
    X_lda = lda.fit(X, discrete_y).transform(X)
    _plot_scatter(X_lda, 'LDA', hospital_free, 'Hospital Free Days',
                  '{}.LDA_hospital_free.pdf'.format(prefix))
        % (kernel, correct_prediction * 100. / len(predictions),
           len(predictions)))

#####################################################################################
#####################################################################################
## kNNs + LDA

print("\n### LDA")

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

NB_COMPONENTS = 2

lda = LinearDiscriminantAnalysis(n_components=NB_COMPONENTS)
#lda = LinearDiscriminantAnalysis(n_components=NB_COMPONENTS,solver="eigen",shrinkage="auto")
lda.fit(training, categories)

training_lda = lda.transform(training)
testing_lda = lda.transform(testing)

plot_embedding(training_lda,
               categories,
               "Linear Discriminant projection (training)",
               xlabel="1st dimension",
               ylabel="2nd dimension")

plot_embedding(testing_lda,
               testing_categories,
               "Linear Discriminant projection (testing)",
               xlabel="1st dimension",
               ylabel="2nd dimension")
	elif mode==0:
		inputDataClass = InputReader(['Medical_data.csv', 'test_medical.csv'],0)
	elif mode==2:
		inputDataClass = InputReader('railwayBookingList.csv',2)
	elif mode==3:
		inputDataClass = InputReader('river_data.csv',3)

	X = inputDataClass.Train
	x_test = inputDataClass.Test

	if mode==1:
		# PCA for F-MNIST
		pca = PCA(n_components=80)
		X_new = pca.fit_transform(X[:,:-1])
		X = np.column_stack([X_new, X[:,-1]])
		x_test_new = pca.transform(x_test[:,:-1])
		x_test = np.column_stack([x_test_new, x_test[:,-1]])

	print(f'LDA ---> projection on {max_dim} dimensions. . .')
	Y_pred, acc, precision, recall, f1score, confMat = LDA(X, x_test, mode = 1, max_dim=max_dim)

	print("SKLEARN. . .")
	model = LinearDiscriminantAnalysis(solver='eigen', shrinkage=None, priors=None, n_components=None, store_covariance=False, tol=0.0001)
	model.fit(X[:,:-1], X[:,-1])
	Y_pred = model.predict(x_test[:,:-1])
	# analyse performance
	Y_true = x_test[:,-1]  # the last column of the test matrix holds the true labels
	acc = performanceAnalyser.calcAccuracyTotal(Y_pred,Y_true)
	precision, recall, f1score = performanceAnalyser.goodness(Y_true, Y_pred)
	confMat = performanceAnalyser.getConfusionMatrix(Y_true,Y_pred)
	print(f'Accuracy:{acc}\n Precision:{precision}\n Recall:{recall}\n F1score:{f1score}\n Confusion Matrix:{confMat}\n')
Exemple #56
0
def lda_project(spike_times,
                spike_clusters,
                event_times,
                event_groups,
                pre_time=0,
                post_time=0.5,
                cross_validation='kfold',
                num_splits=5,
                prob_left=None,
                custom_validation=None):
    """
    Use linear discriminant analysis to project population vectors to the line that best separates
    the two groups. When cross-validation is used, the LDA projection is fitted on the training
    data after which the test data is projected to this projection.

    Parameters
    ----------
    spike_times : 1D array
        spike times (in seconds)
    spike_clusters : 1D array
        cluster ids corresponding to each spike in `spike_times`
    event_times : 1D array
        times (in seconds) of the events from the two groups
    event_groups : 1D array
        group identities of the events, can be any number of groups, accepts integers and strings
    pre_time : float
        time (in seconds) before each event to include in the population vector
    post_time : float
        time (in seconds) after each event to include in the population vector
    cross_validation : string
        which cross-validation method to use, options are:
            'none'              No cross-validation
            'kfold'             K-fold cross-validation
            'leave-one-out'     Leave out the trial that is being decoded
            'block'             Leave out the block the to-be-decoded trial is in
            'custom'            Any custom cross-validation provided by the user
    num_splits : integer
        ** only for 'kfold' cross-validation **
        Number of splits to use for k-fold cross validation, a value of 5 means that the decoder
        will be trained on 4/5th of the data and used to predict the remaining 1/5th. This process
        is repeated five times so that all data has been used as both training and test set.
    prob_left : 1D array
        ** only for 'block' cross-validation **
        the probability of the stimulus appearing on the left for each trial in event_times
    custom_validation : generator
        ** only for 'custom' cross-validation **
        a generator object with the splits to be used for cross validation using this format:
            (
                (split1_train_idxs, split1_test_idxs),
                (split2_train_idxs, split2_test_idxs),
                (split3_train_idxs, split3_test_idxs),
             ...)

    Returns
    -------
    lda_projection : 1D array
        the position along the LDA projection axis for the population vector of each trial

    """

    # Check input
    assert cross_validation in [
        'none', 'kfold', 'leave-one-out', 'block', 'custom'
    ]
    assert event_times.shape[0] == event_groups.shape[0]
    if cross_validation == 'block':
        assert event_times.shape[0] == prob_left.shape[0]
    if cross_validation == 'custom':
        assert isinstance(custom_validation, types.GeneratorType)

    # Get matrix of all neuronal responses
    times = np.column_stack(
        ((event_times - pre_time), (event_times + post_time)))
    pop_vector, cluster_ids = get_spike_counts_in_bins(spike_times,
                                                       spike_clusters, times)
    pop_vector = pop_vector.T

    # Initialize
    lda = LinearDiscriminantAnalysis()
    lda_projection = np.zeros(event_groups.shape)

    if cross_validation == 'none':
        # Find the best LDA projection on all data and transform those data
        lda_projection = lda.fit_transform(pop_vector, event_groups)

    else:
        # Perform cross-validation
        if cross_validation == 'leave-one-out':
            cv = LeaveOneOut().split(pop_vector)
        elif cross_validation == 'kfold':
            cv = KFold(n_splits=num_splits).split(pop_vector)
        elif cross_validation == 'block':
            block_lengths = [sum(1 for i in g) for k, g in groupby(prob_left)]
            blocks = np.repeat(np.arange(len(block_lengths)), block_lengths)
            cv = LeaveOneGroupOut().split(pop_vector, groups=blocks)
        elif cross_validation == 'custom':
            cv = custom_validation

        # Loop over the splits into train and test
        for train_index, test_index in cv:

            # Find LDA projection on the training data
            lda.fit(pop_vector[train_index],
                    [event_groups[j] for j in train_index])

            # Project the held-out test data to projection
            lda_projection[test_index] = lda.transform(
                pop_vector[test_index]).T[0]

    return lda_projection
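
# Minimal usage sketch (synthetic data, purely illustrative; assumes numpy and the
# helper imports of the example above are available): project 90 events from two
# groups onto the LDA axis with k-fold cross-validation.
rng = np.random.RandomState(0)
spike_times = np.sort(rng.uniform(0, 100, size=5000))        # 5000 spike times over 100 s
spike_clusters = rng.randint(0, 20, size=5000)                # 20 hypothetical units
event_times = np.arange(1.0, 91.0)                            # 90 event onsets
event_groups = rng.randint(0, 2, size=event_times.shape[0])   # two groups of events
projection = lda_project(spike_times, spike_clusters, event_times, event_groups,
                         pre_time=0, post_time=0.5,
                         cross_validation='kfold', num_splits=5)
print(projection.shape)  # one projected value per event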
from performance import Portfolio, MarketIntradayPortfolio
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from pylab import *
from datetime import datetime

HS300 = getStock_C('000300')
SP500 = getStock_A('^GSPC')
HS300 = addFeatures(HS300)
SP500 = addFeatures(SP500)
HS300 = HS300.drop('ADOSC', axis=1)  # drop() returns a new frame, so keep the result
X_train, y_train, X_test, y_test = Prep(HS300)
Classify(X_train, y_train, X_test, y_test, 'RF')
CV(X_train, y_train, 9, 'RF')
clf = LDA()
y_pred = clf.fit(X_train, y_train).predict(X_test)
symbol = 'CSI300'
start_test = datetime(2014,1,1)
end_period = datetime(2015,9,29)
bars = HS300[['Open','AdjClose']]

bars = bars[start_test:end_period]

signals = pd.DataFrame(index=bars.index)
signals['signal'] = 0.0
signals['signal'] = y_pred
#Short the stock when the classifier predicts 0
signals.loc[signals['signal'] == 0, 'signal'] = -1

# positions change
signals['positions'] = signals['signal'].diff()
Exemple #58
0
# Load libraries
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Load Iris flower dataset:
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Create and run an LDA, then use it to transform the features
lda = LinearDiscriminantAnalysis(n_components=1)
features_lda = lda.fit(features, target).transform(features)

# Print the number of features
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_lda.shape[1])

# In a plain script the bare attribute access shows nothing, so print it
print(lda.explained_variance_ratio_)

# Create and run LDA
lda = LinearDiscriminantAnalysis(n_components=None)
features_lda = lda.fit(features, target)

# Create array of explained variance ratios
lda_var_ratios = lda.explained_variance_ratio_


# Create function
def select_n_components(var_ratio, goal_var: float) -> int:

    # Set initial variance explained so far
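    # (The listing is truncated here; the following is a plausible completion that
    #  accumulates explained variance until the requested goal is reached.)
    total_variance = 0.0
    n_components = 0
    for explained_variance in var_ratio:
        total_variance += explained_variance  # running total of explained variance
        n_components += 1                     # count this component
        if total_variance >= goal_var:        # stop once the goal is met
            break
    return n_components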
Exemple #59
0
        apical_words = ["SZ", "SZW"]

        training_mask = md['phone'].isin(training_list)
        training_mask = training_mask.values  # .as_matrix()
        training_md = md[training_mask].copy()
        training_data = pca_out[training_mask].copy()

        test_mask = md['phone'].isin(test_list)
        test_mask = test_mask.values  # .as_matrix()
        test_md = md[test_mask].copy()
        test_data = pca_out[test_mask].copy()

        # train LDA on training data
        labs = np.array(training_md.phone)  # expand dims?
        train_lda = LDA(n_components=int(n_lds))
        train_lda.fit(training_data, labs)  # train the model on the data
        train_lda_out = train_lda.transform(training_data)

        # score and/or categorize test data according to trained LDA model
        test_lda_out = train_lda.transform(test_data)

        # LDA data for csv: training on top of test
        ld = pd.DataFrame(np.vstack([train_lda_out, test_lda_out]))
        ld = ld.rename(columns={0: 'LD1', 1: 'LD2'})

        # a subject column for csv
        subject_lab = [subject] * ld.shape[0]
        subject_column = pd.DataFrame(subject_lab)
        subject_column = subject_column.rename(columns={0: 'subj'})

        # TODO get pandas to shut up about these two lines
Exemple #60
0
def lda_1():
    x_data, y_label = load_datasets()
    model = LinearDiscriminantAnalysis(solver='eigen')
    model.fit(x_data, y_label)