def LDA佮SVM模型(self, 問題, 答案):
		# Constant per-sample weights, kept explicit so they are easy to reweight later.
		sample_weight_constant = np.ones(len(問題))
		clf = svm.SVC(C=1)
		lda = LDA()
# 		clf = svm.NuSVC()
		print('Training LDA')
		lda.fit(問題, 答案)
		print('Training SVM')
		clf.fit(lda.transform(問題), 答案, sample_weight=sample_weight_constant)
		print('Training done')
		return lambda 問: clf.predict(lda.transform(問))
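A minimal usage sketch for the method above, hedged: model is a placeholder for an instance of whatever class defines the method, and the inputs are assumed to be dense NumPy arrays, since sklearn's LDA and SVC both require dense input.

# Usage sketch -- `model` is hypothetical, standing in for an instance of the class above.
X_train = np.random.rand(100, 20)             # placeholder feature matrix
y_train = np.random.randint(0, 3, size=100)   # placeholder labels
predict = model.LDA佮SVM模型(X_train, y_train)  # returns a closure over lda + clf
y_pred = predict(X_train[:5])                  # project with LDA, then classify with the SVM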
def plotLDA3D(X, y, names=[]):

    plt.cla()
    lda = LDA(n_components=3)
    lda.fit(X, y)
    X = lda.transform(X)

    fig = plt.figure(1, figsize=(4, 3))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134)

    classes = np.unique(y)
    colors_ = list(six.iteritems(colors.cnames))
    hex_ = [color[1] for color in colors_]
    rgb = [colors.hex2color(color) for color in hex_]
    colors_ = []

    class_label = []
    for i in range(0, len(classes)):
        colors_.append(rgb[i])

        if len(names) == 0:
            class_label.append((str(i), i))
        else:
            class_label.append((names[i], i))

    for name, label in class_label:
        ax.text3D(
            X[y == label, 0].mean(),
            X[y == label, 1].mean() + 1.5,
            X[y == label, 2].mean(),
            name,
            horizontalalignment="center",
            bbox=dict(alpha=0.5, edgecolor="w", facecolor="w"),
        )
    # Reorder the labels to have colors matching the cluster results
    y = y.astype(int)
    # y = np.choose(y, class_label)
    ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.hot)

    # Note: the surface coordinates and scaled coefficient vectors below are
    # computed but never used by this plot.
    x_surf = [X[:, 0].min(), X[:, 0].max(), X[:, 0].min(), X[:, 0].max()]
    y_surf = [X[:, 0].max(), X[:, 0].max(), X[:, 0].min(), X[:, 0].min()]
    x_surf = np.array(x_surf)
    y_surf = np.array(y_surf)
    v0 = lda.transform(lda.coef_[[0]])
    v0 /= v0[-1]
    v1 = lda.transform(lda.coef_[[1]])
    v1 /= v1[-1]

    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
    ax.w_zaxis.set_ticklabels([])

    plt.show()
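A call sketch for plotLDA3D, hedged: it assumes the snippet's module-level imports (numpy as np, matplotlib.pyplot as plt, matplotlib.colors as colors, mpl_toolkits.mplot3d.Axes3D, six) are in scope. The digits dataset is used because LDA produces at most n_classes - 1 components, so the three plotted axes need at least four classes.

from sklearn.datasets import load_digits

digits = load_digits()  # 10 classes, so LDA can supply the 3 axes the plot indexes
plotLDA3D(digits.data, digits.target, names=[str(d) for d in range(10)])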
Example #3
def lda(X_train, X_val, y_train):
    print("Performing dimensionality reduction using LDA...")
    lda = LDA()
    try:
        lda.fit(X_train, y_train)
    except TypeError:
        X_train = X_train.toarray()
        X_val = X_val.toarray()
        lda.fit(X_train, y_train)
    X_train = lda.transform(X_train)
    X_val = lda.transform(X_val)
    return X_train, X_val
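The try/except exists because sklearn's LDA rejects scipy sparse input with a TypeError, which is exactly what TF-IDF vectorizers produce; the fallback densifies. A minimal sketch, assuming the data is small enough to densify:

from sklearn.feature_extraction.text import TfidfVectorizer

docs_train = ["spam spam ham", "ham eggs", "spam offer now", "eggs and ham"]
docs_val = ["spam now", "ham eggs ham"]
y_train = [1, 0, 1, 0]

vec = TfidfVectorizer()
X_tr, X_va = lda(vec.fit_transform(docs_train), vec.transform(docs_val), y_train)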
def naive_bayes_with_lda():
    train, train_target, test, test_target = load_polluted_spambase()

    print "Train data: %s, Train Label: %s" % (train.shape, train_target.shape)
    print "Test data: %s, Test Label: %s" % (test.shape, test_target.shape)

    start = timeit.default_timer()

    lda = LDA(n_components=100)
    train = lda.fit_transform(train, train_target)
    test = lda.transform(test)

    print lda
    print "Train data: %s, Train Label: %s" % (train.shape, train_target.shape)
    print "Test data: %s, Test Label: %s" % (test.shape, test_target.shape)

    cf = GaussianNaiveBayes()
    cf.fit(train, train_target)
    raw_predicts = cf.predict(test)
    predict_class = cf.predict_class(raw_predicts)

    cm = confusion_matrix(test_target, predict_class)
    print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
    er, acc, fpr, tpr = confusion_matrix_analysis(cm)
    print "Error rate: %f, accuracy: %f, FPR: %f, TPR: %f" % (er, acc, fpr, tpr)

    stop = timeit.default_timer()
    print "Total Run Time: %s secs" % (stop - start)
Example #7
def myLDA(X,y):
    t1 = clock()
    clf = LDA()
    clf.fit(X, y)
    newRep = clf.transform(X)
    t2 = clock()
    return t2-t1
Example #8
def lda_scikit():
    df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)
    df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash', 
    'Alcalinity of ash', 'Magnesium', 'Total phenols', 
    'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 
    'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']
    X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
    X_train, X_test, y_train, y_test = \
          train_test_split(X, y, test_size=0.3, random_state=0)
    sc = StandardScaler()
    X_train_std = sc.fit_transform(X_train)
    X_test_std = sc.transform(X_test)
    
    # pdb.set_trace()  # debugging breakpoint; enable only when stepping through
    lda = LDA(n_components=2)  # wine has 3 classes, so LDA yields at most 2 components
    X_train_lda = lda.fit_transform(X_train_std, y_train)
    lr = LogisticRegression()
    lr = lr.fit(X_train_lda, y_train)
    
    plot_decision_regions(X_train_lda, y_train, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(PL5 + 'lda_scikit.png', dpi=300)
    plt.close()
    
    X_test_lda = lda.transform(X_test_std)
    
    plot_decision_regions(X_test_lda, y_test, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(PL5 + 'lda_scikit_test.png', dpi=300)
Example #9
def test_classification():
    from read import read
    import numpy, tfidf
    from sklearn.decomposition import TruncatedSVD
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import Normalizer

    m, files = read("training.json")
    y_map = [str(file["topic"]) for file in files]
    topics = []
    for topic in y_map:
        if topic not in topics:
            topics.append(topic)
    y = numpy.array([topics.index(topic) for topic in y_map])

    print("Construindo TF-IDF...")
    X, vectorizer = tfidf.vectorizeTFIDF(files)
    print(X.shape)

    print("Performing dimensionality reduction using LDA...")

    lda = LDA(n_components=9)
    X = X.toarray()
    lda.fit(X, y)
    X = lda.transform(X)

    mlp = MLPClassifier()
    mlp.fit(X, y)
    training_score = mlp.score(X, y)
    print("training accuracy: %f" % training_score)
Example #11
class FldaLite(FLDA):
    def fit(self, X, y):
        self.scaler_ = StandardScaler()
        self.pca_ = PCA(n_components=self.pca_n_components)
        XX = self.pca_.fit_transform(self.scaler_.fit_transform(X))

        self.knn_ = KNeighborsClassifier(n_neighbors=self.knn_n_neighs)
        self.knn_.fit(XX, y)

        yy = map(lambda nn: y[nn], self.knn_.kneighbors(XX)[1])
        self.cv_ = CountVectorizer(input='content', tokenizer=lambda x: x, lowercase=False)
        XXX = self.cv_.fit_transform(array(yy))
        self.tfidf_transformer_ = TfidfTransformer()
        XXX = self.tfidf_transformer_.fit_transform(XXX)

        self.clusterer_ = SpectralClustering(n_clusters=self.n_scented_clusters)
        yyy = self.clusterer_.fit_predict(XXX)

        self.lda_ = LDA(**self.lda_params)
        self.lda_.fit(XX, yyy)

        return self

    def transform(self, X):
        # Use the already-fitted scaler and PCA; calling fit_transform here
        # would refit them on the new data and change the projection.
        return self.lda_.transform(self.pca_.transform(self.scaler_.transform(X)))
Example #13
class RecognizerLDA(RecognizerCommon):
	n_components = 30
	def fit(self):
		self.model = LDA(
			n_components=self.n_components).fit(self.data.X, self.data.y)

	def predict(self, X):
		# Note: despite its name, this returns the LDA projection, not class labels.
		return self.model.transform(X)
Example #14
def LDA10Fold(X, y):
    acc = []
    kf = KFold(X.shape[0], n_folds=10, shuffle=True)
    i = 0
    for train_index, test_index in kf:
        yTest = y[test_index]
        yTrain = y[train_index]
        clf = LDA()
        clf.fit(X[train_index], yTrain)
        newRepTrain = clf.transform(X[train_index])
        newRepTest = clf.transform(X[test_index])
        nclf = neighbors.KNeighborsClassifier(n_neighbors=2)
        nclf.fit(newRepTrain, yTrain)
        XPred = nclf.predict(newRepTest)
        acc.append(np.sum(XPred == yTest) * 1.0 / yTest.shape[0])
        # print i,":",acc[i]
        i += 1
    return np.mean(acc), np.std(acc)
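KFold(X.shape[0], n_folds=10, shuffle=True) is the pre-0.18 cross-validation API. Below is a sketch of the same loop against the current sklearn.model_selection interface, assuming the snippet's np, neighbors, and LDA names are already in scope:

from sklearn.model_selection import KFold as KFoldNew

def LDA10Fold_modern(X, y):
    acc = []
    for train_index, test_index in KFoldNew(n_splits=10, shuffle=True).split(X):
        clf = LDA().fit(X[train_index], y[train_index])
        nclf = neighbors.KNeighborsClassifier(n_neighbors=2)
        nclf.fit(clf.transform(X[train_index]), y[train_index])
        pred = nclf.predict(clf.transform(X[test_index]))
        acc.append(np.mean(pred == y[test_index]))
    return np.mean(acc), np.std(acc)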
def lda(ds, n):
    '''
        Outputs the projection of the data onto the most discriminant axes.
        LDA yields at most n_classes - 1 components, so for our binary case
        that is a single dimension (larger values of n were silently ignored
        by older versions of sklearn).
    '''
    selector = LDA(n_components=n)
    selector.fit(ds.data, ds.target)
    new_data = selector.transform(ds.data)
    return Dataset(new_data, ds.target)
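A tiny check of that cap, reusing the Dataset and lda names above (assumption: two label values, so the projection is one-dimensional regardless of n; n=1 is used so the call also works on current scikit-learn, which raises for larger requests):

import numpy as np

ds = Dataset(np.random.rand(40, 5), np.array([0, 1] * 20))
reduced = lda(ds, 1)
print(reduced.data.shape)  # (40, 1): a single discriminant axis for two classes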
Example #18
    def execute(self,i,j):
        # dim_red = LDA()
        # dim_red.fit_transform(self.x_train, self.y_train)
        # with open('dumped_dim_red_'+str(i)+'.pkl', 'wb') as fid:
        #     cPickle.dump(dim_red, fid)

        # x_train = dim_red.transform(self.x_train)
        # x_test = dim_red.transform(self.y_train)    
        # stat_obj = self.stat_class()
        # stat_obj.train(x_train, x_test)
        # print len(x_train)
        # with open('dumped_'+str(j)+'_'+str(i)+'.pkl', 'wb') as fid:
        #     cPickle.dump(stat_obj, fid)

        kf = KFold(len(self.x_train), n_folds=self.k_cross)
        own_kappa = []
        for train_idx, test_idx in kf:
            # print train_idx, test_idx
            # exit(0)
            x_train, x_test = self.x_train[train_idx], self.x_train[test_idx]
            y_train, y_test = self.y_train[train_idx], self.y_train[test_idx]
            dim_red = LDA()
            x_train = dim_red.fit_transform(x_train, y_train)

            # with open('dumped_dim_red_'+str(i)+'.pkl', 'wb') as fid:
            #     cPickle.dump(dim_red, fid)

            # with open('dumped_dim_red_'+str(i)+'.pkl', 'rb') as fid:
                # dim_red=cPickle.load(fid)
            x_test = dim_red.transform(x_test)
                
            # with open('dumped_'+str(j)+'_'+str(i)+'.pkl', 'rb') as fid:
            #     stat_obj=cPickle.load(fid)
            # x_train = dim_red.transform(x_train)
            # x_test = dim_red.transform(x_test)

            stat_obj = self.stat_class()  # instantiate the configured estimator class
            stat_obj.train(x_train,y_train)
            # with open('dumped_'+str(j)+'_'+str(i)+'.pkl', 'wb') as fid:
                # cPickle.dump(stat_obj, fid)
            # with open('dumped_'+str(j)+'_'+str(i)+'.pkl', 'rb') as fid:
                # stat_obj=cPickle.load(fid)
            y_pred = [0] * len(y_test)
            for i in range(len(x_test)):
                # print len(x_test[i])
                val = int(np.round(stat_obj.predict(x_test[i])))
                if val > self.range_max: val = self.range_max
                if val < self.range_min: val = self.range_min
                y_pred[i] = [val]
            y_pred = np.matrix(y_pred)
            cohen_kappa_rating = own_wp.quadratic_weighted_kappa(y_test,y_pred,self.range_min,self.range_max)
            self.values.append(cohen_kappa_rating)
        return str(sum(self.values)/self.k_cross)
Example #19
def plot_lda_projection(marker, flname):
	lda = LDA()
	lda.fit(marker["individuals"], marker["population_labels"])
	print lda.score(marker["individuals"], marker["population_labels"])
	proj = lda.transform(marker["individuals"])
	n_samples, n_components = proj.shape

	plt.scatter(proj, marker["population_labels"])
	plt.xlabel("Component 0", fontsize=18)
	plt.ylabel("Population Labels", fontsize=18)

	plt.savefig(flname, dpi=200)
def LDA_reduction(posture, trainblock, component):
    currentdirectory = os.getcwd()  # get the directory.
    parentdirectory = os.path.abspath(currentdirectory + "/../..")  # Get the parent directory(2 levels up)
    path = parentdirectory + '\Output Files\E5-Dimensionality Reduction/posture-'+str(posture)+'/TrainBlock-'+str(trainblock)+''
    if not os.path.exists(path):
        os.makedirs(path)
    i_user = 1
    block = 1
    AUC = []
    while i_user <= 31:
        while block <= 6:
            train_data = np.genfromtxt("../../Output Files/E3-Genuine Impostor data per user per posture/posture-"+str(posture)+"/User-"+str(i_user)+"/1-"+str(i_user)+"-"+str(posture)+"-"+str(trainblock)+"-GI.csv", dtype=float, delimiter=",")
            test_data = np.genfromtxt("../../Output Files/E3-Genuine Impostor data per user per posture/posture-"+str(posture)+"/User-"+str(i_user)+"/1-"+str(i_user)+"-"+str(posture)+"-"+str(block)+"-GI.csv", dtype=float, delimiter=",")

            target_train = np.ones(len(train_data))
            row = 0
            while row < len(train_data):
                if np.any(train_data[row, 0:3] != [1, i_user, posture]):
                    target_train[row] = 0
                row += 1

            row = 0
            target_test = np.ones(len(test_data))
            while row < len(test_data):
                if np.any(test_data[row, 0:3] != [1, i_user, posture]):
                    target_test[row] = 0
                row += 1

            sample_train = train_data[:, [3,4,5,6,7,9,11,12,13,14,15,16,17]]
            sample_test = test_data[:, [3,4,5,6,7,9,11,12,13,14,15,16,17]]
            scaler = preprocessing.MinMaxScaler().fit(sample_train)
            sample_train_scaled = scaler.transform(sample_train)
            sample_test_scaled = scaler.transform(sample_test)

            lda = LDA(n_components=component)
            sample_train_lda = lda.fit(sample_train_scaled, target_train).transform(sample_train_scaled)
            sample_test_lda = lda.transform(sample_test_scaled)

            clf = ExtraTreesClassifier(n_estimators=100)
            clf.fit(sample_train_lda, target_train)

            prediction = clf.predict(sample_test_lda)
            auc = metrics.roc_auc_score(target_test, prediction)
            AUC.append(auc)

            block += 1

        block = 1
        i_user += 1
    print(AUC)
    AUC = np.array(AUC)
    AUC = AUC.reshape(31, 6)
    np.savetxt("../../Output Files/E5-Dimensionality Reduction/posture-"+str(posture)+"/TrainBlock-"+str(trainblock)+"/LDA-"+str(component)+"-Component.csv", AUC, delimiter=",")
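One caveat on the evaluation above: roc_auc_score over hard predict() outputs measures discrimination at a single operating point; ranking by the class-1 probability gives the conventional AUC. A sketch with the same names, hedged:

# Threshold-free variant (assumption: same clf / sample_test_lda / target_test
# names as in the loop above; ExtraTreesClassifier exposes predict_proba).
scores = clf.predict_proba(sample_test_lda)[:, 1]
auc = metrics.roc_auc_score(target_test, scores)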
Example #21
def get_LDA_performance(test_df, X_std, y):
    X_test = test_df.ix[:, 'x.1':'x.10'].values
    X_std_test = StandardScaler().fit_transform(X_test)
    y_test = test_df.ix[:, 'y'].values

    lda_scores_training = []
    lda_scores_test = []

    qda_scores_training = []
    qda_scores_test = []

    knn_scores_training = []
    knn_scores_test = []

    for d in range(1, 11):
        lda = LDA(n_components=d)
        Xred_lda_training = lda.fit_transform(X_std, y)
        Xred_lda_test = lda.transform(X_std_test)

        lda_model = LDA()
        lda_model.fit(Xred_lda_training, y)

        qda_model = QDA()
        qda_model.fit(Xred_lda_training, y)

        knn_model = KNeighborsClassifier(n_neighbors=10)
        knn_model.fit(Xred_lda_training, y)

        lda_scores_training.append(1 - lda_model.score(Xred_lda_training, y))
        lda_scores_test.append(1 - lda_model.score(Xred_lda_test, y_test))

        qda_scores_training.append(1 - qda_model.score(Xred_lda_training, y))
        qda_scores_test.append(1 - qda_model.score(Xred_lda_test, y_test))

        knn_scores_training.append(1 - knn_model.score(Xred_lda_training, y))
        knn_scores_test.append(1 - knn_model.score(Xred_lda_test, y_test))

    plt.plot(range(10), lda_scores_training, 'r--', label="Train data")
    plt.plot(range(10), lda_scores_test, 'b--', label="Test data")
    plt.title("LDA vs LDA")
    plt.xlabel('k')
    plt.ylabel('Score')
    plt.show()

    plt.plot(range(10), qda_scores_training, 'r--', label="Train data")
    plt.plot(range(10), qda_scores_test, 'b--', label="Test data")
    plt.title("QDA vs LDA")
    plt.show()

    plt.plot(range(10), knn_scores_training, 'r--', label="Train data")
    plt.plot(range(10), knn_scores_test, 'b--', label="Test data")
    plt.title("KNN vs LDA")
    plt.show()
Example #22
def lda_test(img_kind):
	import pylab as pl
	

	subdir = "data/"

	classes = []
	data = []

	the_ones = glob.glob(subdir + "f_" + img_kind + "*.jpg")
	all_of_them = glob.glob(subdir + "f_*_*.jpg")
	the_others = []

	for x in all_of_them:
		if the_ones.count(x) < 1:
			the_others.append(x)
	
	for x in the_ones:
		classes.append(1)
		data.append(get_image_features(cv.LoadImageM(x)))
	
	for x in the_others:
		classes.append(-1)
		data.append(get_image_features(cv.LoadImageM(x)))
	
	lda = LDA(n_components=2)
	print 'fitting'
	lda.fit(data, classes)
	print 'transforming'
	X_r = lda.transform(data)
	print '----'

	print X_r.shape

	# With only two classes, LDA yields a single discriminant component,
	# so plot that component against the sample index.
	x0 = [x[0] for x in X_r]

	pl.figure()
	for i in xrange(0, len(x0)):
		if classes[i] == 1:
			pl.scatter(i, x0[i], c='r')
		else:
			pl.scatter(i, x0[i], c='b')
	

	
	# for c, i, target_name in zip("rg", [1, -1], target_names):
	#     pl.scatter(X_r[classes == i, 0], X_r[classes == i, 1], c=c, label=target_name)
	pl.legend()
	pl.title('LDA of dataset')

	pl.show()
def feat_extraction(X,y,D):

    # unsupervised feature extraction: Principal Component Analysis
    pca = decomposition.PCA(n_components=D)
    pca.fit(X)
    X_pca = pca.transform(X)

    # supervised feature extraction: Linear Discriminant Analysis
    lda = LDA(n_components=D)
    lda.fit(X,y)
    X_lda = lda.transform(X)

    return (X_pca,X_lda)
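A quick comparison sketch (assumption: the decomposition and LDA imports the function relies on are in scope). PCA returns exactly D columns regardless of labels; LDA is capped at n_classes - 1, so D=2 is the most it can deliver for the 3-class iris data:

from sklearn.datasets import load_iris

iris = load_iris()
X_pca, X_lda = feat_extraction(iris.data, iris.target, 2)
print(X_pca.shape, X_lda.shape)  # (150, 2) (150, 2)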
Example #25
class LDA(AbstractProjection):
    def __init__(self, **kw):
        super(LDA, self).__init__()
        self.lda = ScikitLDA(**kw)

    def train(self, features, labels):
        red_feats = self.lda.fit_transform(features, labels)
        self.V = np.std(red_feats, axis=0)

    def project(self, feats, whiten=True):
        lda_feats = self.lda.transform(feats)
        if whiten:
            lda_feats /= self.V
        return lda_feats
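A round-trip sketch for this wrapper (assumptions: numpy as np plus the ScikitLDA and AbstractProjection imports the class depends on are in scope). On the training data itself, the whitened projection has roughly unit variance per component:

import numpy as np

X = np.random.rand(200, 8)
y = np.random.randint(0, 4, size=200)   # 4 classes -> up to 3 LDA components

proj = LDA(n_components=3)
proj.train(X, y)
Z = proj.project(X)          # divided by the per-component training std
print(np.std(Z, axis=0))     # ~[1. 1. 1.] on the training set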
Example #26
def lda(arr0, target, n_components):
    from sklearn.lda import LDA
    matrix = np.array(arr0)
    target = np.array(target)
    temp = LDA(n_components=n_components).fit(matrix, target)
    coef = temp.coef_
    # covariance = temp.covariance_
    mean = temp.means_
    priors = temp.priors_
    scalings = temp.scalings_
    xbar = temp.xbar_
    # label = data_utility.retrieve_nan_index(temp.transform(matrix).tolist(), index)
    label = temp.transform(matrix).tolist()
    return (label, coef.tolist(), mean.tolist(), priors.tolist(),
            scalings.tolist(), xbar.tolist())
Example #27
class FLDA(object):
    def __init__(self, pca_n_components=3, knn_n_neighs=10, n_scented_clusters=5, **kwargs):
        self.pca_n_components = pca_n_components
        self.knn_n_neighs = knn_n_neighs
        self.n_scented_clusters = n_scented_clusters
        self.lda_params = kwargs

    def fit(self, X, y):
        self.y_ = y
        self.scaler_ = StandardScaler()
        self.pca_ = PCA(n_components=self.pca_n_components)
        XX = self.pca_.fit_transform(self.scaler_.fit_transform(X))

        self.knn_ = KNeighborsClassifier(n_neighbors=self.knn_n_neighs)
        self.knn_.fit(XX, self.y_)

        yy = map(lambda nn: y[nn], self.knn_.kneighbors(XX)[1])
        self.cv_ = CountVectorizer(input='content', tokenizer=lambda x: x, lowercase=False)
        XXX = self.cv_.fit_transform(array(yy))
        self.tfidf_transformer_ = TfidfTransformer()
        XXX = self.tfidf_transformer_.fit_transform(XXX)

        self.clusterer_ = SpectralClustering(n_clusters=self.n_scented_clusters)
        yyy = self.clusterer_.fit_predict(XXX)

        self.lda_ = LDA(**self.lda_params)

        self.lda_.fit(XXX.todense(), yyy)

        return self

    def transform(self, X):
        #         return self.lda_.transform(self.pca_.fit_transform(self.scaler_.fit_transform(X)))
        X = self.pca_.transform(self.scaler_.transform(X))
        yy = map(lambda nn: self.y_[nn], self.knn_.kneighbors(X)[1])
        X = self.cv_.transform(array(yy))
        X = self.tfidf_transformer_.transform(X)
        return self.lda_.transform(X.todense())

    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X)

    def set_params(self, **kwargs):
        for k, v in kwargs.iteritems():
            setattr(self, k, v)
Example #28
def with_lda(X_train_std, y_train, X_test_std, y_test):
    from sklearn.lda import LDA
    lda = LDA(n_components=2)
    X_train_lda = lda.fit_transform(X_train_std, y_train)
    lr = LogisticRegression()
    lr = lr.fit(X_train_lda, y_train)
    plot_decision_regions(X_train_lda, y_train, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.show()

    X_test_lda = lda.transform(X_test_std)
    plot_decision_regions(X_test_lda, y_test, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.show()
def lda(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans lda")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    #lda=LDA(n_components=2)
    lda=LDA()
    lda.fit(X,y)
    X_LDA = lda.transform(X)
    y_pred = lda.predict(X)
    print "#########################################################################################################\n"
    print "Linear Discriminant Analysis Accuracy "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"LDA_metrics.txt"
    file = open(results, "w")
    file.write("Linear Discriminant Analaysis estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "LDA"
    save = Output + "LDA_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], c=y)
    fig.colorbar(im);
    save_lda = Output + "LDA_plot.png"
    plt.savefig(save_lda)
    plt.close()
    lvltrace.lvltrace("LVLSortie dans lda")
def lda(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans lda split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    lda=LDA(n_components=2)
    lda.fit(X_train,y_train)
    X_LDA = lda.transform(X_train)
    print "shape of result:", X_LDA.shape
    y_pred = lda.predict(X_test)
    print "Linear Discriminant Analysis Accuracy "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    #LVLprint "\n"
    results = Output+"LDA_metrics_test.txt"
    file = open(results, "w")
    file.write("Linear Discriminant Analaysis estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "LDA %f"%test_size
    save = Output + "LDA_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], c=y_train)
    fig.colorbar(im);
    save_lda = Output + "LDA_plot_test"+"_%s.png"%test_size
    plt.savefig(save_lda)
    plt.close()
    lvltrace.lvltrace("LVLSortie dans lda split_test")
Example #31
class LDAFeatures:
    def __init__(self, n_comp=3):
        self.lda = None
        self.n_comp = n_comp

    def features(self, pixels, gt=None):
        # grab feature stack
        fullFeatures = naive_features(pixels)
        print fullFeatures.shape

        # if the LDA from ground truth exists already, transform new features
        if gt == None and self.lda != None:
            print self.lda
            return self.lda.transform(fullFeatures)
        assert gt != None

        # otherwise, train LDA
        self.lda = LDA(n_components=self.n_comp).fit(fullFeatures, gt)
        print self.lda
        return self.lda.transform(fullFeatures)
Example #33
	def fusion(self, ids, training, matrix, colnames, method='lda'):
		from sklearn.lda import LDA
		nbrows, nbcols = matrix.shape
		m=matrix[:,:] # copy matrix
		self.impute(m) # impute missing values
		classes = map(lambda x: 1 if x in training else 2, ids)
		clf = LDA()
		clf.fit(m, classes)
		weights = {}
		for w in xrange(len(colnames)):
			weights[ colnames[w] ] = clf.scalings_[w][0]
		fusion = clf.transform(m)
		# build result structure
		res = []
		for i in xrange(nbrows):
			r = { 'id': ids[i], 'fusion': fusion[i][0] }
			for j in xrange(nbcols):
				r[ colnames[j] ] = matrix[i,j]
			res.append(r)
		res = { 'genes': sorted(res, key=lambda x: x['fusion'], reverse=True), 'weights': weights }
		return res
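A minimal call sketch (assumptions: obj is an instance of the class defining fusion() and impute(); matrix is a NumPy array with one row per id, and training holds the ids labelled as class 1):

import numpy as np

ids = ['g1', 'g2', 'g3', 'g4']
training = {'g1', 'g3'}
matrix = np.random.rand(4, 2)
res = obj.fusion(ids, training, matrix, ['scoreA', 'scoreB'])
print(res['weights'])          # per-column LDA scaling weights
print(res['genes'][0]['id'])   # id with the highest fused score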
Example #34
    def drawLDA(X_true,X_false,X_test,suffix=""):
        X=X_true+X_false
        Y=[1]*len(X_true)+[0]*len(X_false)
        plc=0
        lda = LDA(solver="eigen",n_components=2)
        canfit=False
        hred = False
        try:
            lda.fit(X,Y)
            canfit=True
        except :
            try:
                print("fit error")
                X = np.array(X)
                X = X[:,:140]
                lda.fit(X,Y)
                canfit=True
                hred=True
            except:
                print("cannot visualize")
        if(not canfit):
            return
        if(hred):
            Xlda_true = lda.transform(np.array(X_true)[:,:140])
            Xlda_false = lda.transform(np.array(X_false)[:,:140])
        else:
            Xlda_true = lda.transform(X_true)
            Xlda_false = lda.transform(X_false)
        plt.scatter(Xlda_true[:,0],Xlda_true[:,1],color=plp[plc][0],marker=plp[plc][1],label="thbgm")
        plc+=1
        plt.scatter(Xlda_false[:,0],Xlda_false[:,1],color=plp[plc][0],marker=plp[plc][1],label="not thbgm")
        plc+=1
        if(len(X_test)>0):
            if(hred):
                Xlda_test = lda.transform(np.array(X_test)[:,:140])
            else:
                Xlda_test = lda.transform(np.array(X_test))
            plt.scatter(Xlda_test[:,0],Xlda_test[:,1],color=plp[plc][0],marker=plp[plc][1],label="test")
            plc+=1

        print(lda.coef_.shape)

        plt.xlabel("feature1")
        plt.ylabel("feature2")
        plt.title("Classification with "+useFeature)
        plt.legend()
        plt.savefig("./learn/visualize/lda_"+useFeature+suffix+".png")
        plt.clf()
Example #35
 def execute(self):
     kf = KFold(len(self.x_train), n_folds=self.k_cross)
     own_kappa = []
     for train_idx, test_idx in kf:
         x_train, x_test = self.x_train[train_idx], self.x_train[test_idx]
         y_train, y_test = self.y_train[train_idx], self.y_train[test_idx]
         dim_red = LDA()
         x_train = dim_red.fit_transform(x_train, y_train)
         x_test = dim_red.transform(x_test)
         stat_obj = self.stat_class()  # instantiate the configured estimator class
         stat_obj.train(x_train,y_train)
         y_pred = [0] * len(y_test)
         for i in range(len(x_test)):
             val = int(np.round(stat_obj.predict(x_test[i])))
             if val > self.range_max: val = self.range_max
             if val < self.range_min: val = self.range_min
             y_pred[i] = [val]
         y_pred = np.matrix(y_pred)
         cohen_kappa_rating = own_wp.quadratic_weighted_kappa(y_test,y_pred,self.range_min,self.range_max)
         self.values.append(cohen_kappa_rating)
     return str(sum(self.values)/self.k_cross)
Example #36
    def execute(self,i,j):
        x_train= self.x_train
        y_train= self.y_train
        dim_red = LDA()
        x_train = dim_red.fit_transform(x_train, y_train)
        with open('dumped_dim_red_'+str(i)+'.pkl', 'wb') as fid:
            cPickle.dump(dim_red, fid)

        stat_obj = self.stat_class()  # instantiate the configured estimator class
        stat_obj.train(x_train,y_train)
        with open('dumped_'+str(j)+'_'+str(i)+'.pkl', 'wb') as fid:
            cPickle.dump(stat_obj, fid)
        
        kf = KFold(len(self.x_train), n_folds=self.k_cross)
        own_kappa = []
        for train_idx, test_idx in kf:
            # print train_idx, test_idx
            # exit(0)
            x_train, x_test = self.x_train[train_idx], self.x_train[test_idx]
            y_train, y_test = self.y_train[train_idx], self.y_train[test_idx]
            dim_red = LDA()
            x_train = dim_red.fit_transform(x_train, y_train)
            x_test = dim_red.transform(x_test)

            stat_obj = self.stat_class()  # instantiate the configured estimator class
            stat_obj.train(x_train,y_train)

            y_pred = [0] * len(y_test)
            for i in range(len(x_test)):
                val = int(np.round(stat_obj.predict(x_test[i])))
                if val > self.range_max: val = self.range_max
                if val < self.range_min: val = self.range_min
                y_pred[i] = [val]
            y_pred = np.matrix(y_pred)
            cohen_kappa_rating = own_wp.quadratic_weighted_kappa(y_test,y_pred,self.range_min,self.range_max)
            self.values.append(cohen_kappa_rating)
        return sum(self.values)/self.k_cross
def reduce_features(f_red, data_and_sizes):
    train_data, train_sizes, test_data, test_sizes = data_and_sizes
    train_data = [np.array(strat_data) for strat_data in train_data]
    test_data = [np.array(strat_data) for strat_data in test_data]
    print f_red
    if f_red == 'non':
        return train_data, test_data
    X1 = np.vstack(train_data)
    X2 = np.vstack(test_data)
    print 'X1 X2 shape', X1.shape, X2.shape
    if f_red == 'pca':
        pca = PCA(n_components=0.95)
        pca.fit(X1)
        pca_X1 = pca.transform(X1)
        pca_X2 = pca.transform(X2)
        print 'pca X1 X2 shape', pca_X1.shape, pca_X2.shape
        print 'pca train min max', pca_X1.min(), pca_X1.max()
        print 'pca test min max', pca_X2.min(), pca_X2.max()
        pca_train_data = split_stack(pca_X1, train_sizes)
        pca_test_data = split_stack(pca_X2, test_sizes)
        print 'pca len', len(pca_train_data), len(pca_test_data)
        return pca_train_data, pca_test_data
    elif f_red == 'lda':
        targets = make_targets(train_data)
        lda = LDA()
        lda.fit(X1, targets)
        lda_X1 = lda.transform(X1)
        lda_X2 = lda.transform(X2)
        print 'lda X1 X2 shape', lda_X1.shape, lda_X2.shape
        print 'lda train min max', lda_X1.min(), lda_X1.max()
        print 'lda test min max', lda_X2.min(), lda_X2.max()
        lda_train_data = split_stack(lda_X1, train_sizes)
        lda_test_data = split_stack(lda_X2, test_sizes)
        print 'lda len', len(lda_train_data), len(lda_test_data)
        # Normalize data?
        return lda_train_data, lda_test_data
Example #38
class DotProduct(DP1):
    name = 'LDA'
    LDA_components = 2

    def __init__(self, X, Y, room, bin_size):
        assert (room[0][1] - room[0][0]) % bin_size == 0
        assert (room[1][1] - room[1][0]) % bin_size == 0
        self.bin_size = bin_size
        self.room = room
        self.xblen = (room[0][1] - room[0][0]) / bin_size
        self.yblen = (room[1][1] - room[1][0]) / bin_size
        self.bins = self.xblen * self.yblen
        self.labels = np.unique(Y)

        newX = np.zeros([X.shape[0], self.LDA_components + self.bins])
        newX[:, -self.bins:] = X[:, -self.bins:]

        self.lda = LDA(n_components=self.LDA_components)
        tmp = self.lda.fit_transform(X[:, :-self.bins], Y)
        # import pdb; pdb.set_trace()  # debugging breakpoint; disabled
        newX[:, :self.LDA_components] = tmp

        # This is if X = [cell1, cell2, ..., celln, binfrac1,...,binfrac k^2]
        self.train(newX, Y, room, bin_size)

    def classify(self, X):
        bin_frac = X[-self.bins:].reshape([self.xblen, self.yblen])
        X = X[:-self.bins]

        X = np.squeeze(self.lda.transform(X))

        #self.base[cell id, lbl, xbin, ybin] = rate
        cntxt0 = np.einsum('cxy,c,xy', self.base[:, 0, :, :], X, bin_frac)
        cntxt1 = np.einsum('cxy,c,xy', self.base[:, 1, :, :], X, bin_frac)

        if logging.getLogger().level <= 5:
            tmp0 = 0
            for cell in range(len(X)):
                tmp0 += np.sum(X[cell] * bin_frac * self.base[cell, 0, :, :])

            tmp1 = 0
            for cell in range(len(X)):
                tmp1 += np.sum(X[cell] * bin_frac * self.base[cell, 1, :, :])

            assert np.allclose(tmp0, cntxt0)
            assert np.allclose(tmp1, cntxt1)

        #import pdb; pdb.set_trace()

        if cntxt0 > cntxt1:
            return {self.labels[0]: 1, self.labels[1]: 0}
        else:
            return {self.labels[0]: 0, self.labels[1]: 1}
        # Unreachable: both branches above already return.
        '''
        # Normalize
        if cntxt0 != 0 or cntxt1 != 0:
            mag = cntxt0+cntxt1
        else:
            mag = 1
        
        cntxt0 /= mag
        cntxt1 /= mag
        
        assert (round(cntxt0 + cntxt1,5) in [0,1])'''

        return {self.labels[0]: cntxt0, self.labels[1]: cntxt1}
    print "n1:", n1

    p_value = np.zeros(B)
    cm = []
    BF_10 = []
    for i in range(B):
        print i,
        stdout.flush()
        X_train = two_sample(mu0, mu1, cov, m0, m1)
        X_test = two_sample(mu0, mu1, cov, n0, n1)
        y_train = np.array([0] * m0 + [1] * m1)
        y_test = np.array([0] * n0 + [1] * n1)

        clf = LDA()
        clf.fit(X_train, y_train)
        delta_x = clf.transform(X_test)  # distances from the classification surface.
        delta_x0 = delta_x[y_test == 0]
        delta_x1 = delta_x[y_test == 1]
        t, p_value[i] = ttest_ind(delta_x0, delta_x1)
        # y_pred = clf.predict(X_test)
        # cm.append(confusion_matrix(y_test, y_pred))
        # BF_10.append(compute_BF_10(cm[-1]))

        # print p_value[i], p_value[1]<p_threshold

    print
    power = (p_value <= p_threshold).mean()
    print "LDA-Student Power =", power
    # BF_10 = np.vstack(BF_10)
    # power_BF = (BF_10.min(1) >= BF_threshold).mean()
    # print "BF Power =", power_BF
Example #40
# @File    : lda.py
# @Software: PyCharm Community Edition

from sklearn.lda import LDA
import pandas
from pandas import Series, DataFrame

df = pandas.read_csv(r'E:\8-22_Ubi_data\data_analyze\kills_deaths\for_28model_mean_kills_deaths.csv', header=None)  # raw string avoids backslash escapes
#help(pandas.read_csv)
df=df.fillna(0)
# print df
# print df[0]
y=df[0]
X=df.drop([0],axis=1)
print y
print X

#X = iris.data[:-5]
#pre_x = iris.data[-5:]
#y = iris.target[:-5]
#print ('first 10 raw samples:', X[:10])
clf = LDA()
clf.fit(X, y)
X_r = clf.transform(X)
X_r=DataFrame(X_r)
#pre_y = clf.predict(pre_x)
# dimensionality-reduction result
X_r.to_csv(r'E:\8-22_Ubi_data\data_analyze\kills_deaths\for_28model_mean_kills_deaths_lda.csv')
#print ('first 10 transformed samples:', X_r[:10])
# predicted class labels
#print ('predict value:', pre_y)
print("P1 Samples: " + str(np.sum(P1_mask)) + " P2 Samples: " + str(np.sum(P2_mask)))
print("P2 Test Samples: " + str(np.sum(P1_test_mask)) + " P2 Test Samples: " + str(np.sum(P2_test_mask)))

print("Doing LDA Reduction...")
reduce_to = 9


print(data_full.shape)
print(class_data.shape)
print(np.argmax(class_data,axis=1))

lda = LDA(n_components=9)
#lda = LDA(n_components=9,shrinkage='auto',solver='eigen')
#data_reduced = lda.fit_transform(data_full,np.argmax(class_data,axis=1))
lda = lda.fit(data_full,np.argmax(class_data,axis=1))
data_reduced = lda.transform(data_full)
test_data_reduced = lda.transform(test_data_full)

print(data_reduced.shape)

#pca reduce
#(pca_transform,data_means) = pca_reduce(data_full,class_data)
#data_reduced = np.dot(data_full,pca_transform[:,0:reduce_to])
#test_data_reduced = np.dot(test_data_full,pca_transform[:,0:reduce_to])

print("Normalizing...")
#we should normalize the pca reduced data
if p.get('skip_pca'):
    print("Skipping PCA Reduction...")
    data_reduced = data_full
    test_data_reduced = test_data_full
Example #42
def speakerDiarization(fileName, numOfSpeakers, mtSize = 2.0, mtStep=0.2, stWin=0.05, LDAdim = 35, PLOT = False):
	'''
	ARGUMENTS:
		- fileName:		the name of the WAV file to be analyzed
		- numOfSpeakers	the number of speakers (clusters) in the recording (<=0 for unknown)
		- mtSize (opt)	mid-term window size
		- mtStep (opt)	mid-term window step
		- stWin  (opt)	short-term window size
		- LDAdim (opt)	LDA dimension (0 for no LDA)
		- PLOT	 (opt)	0 for not plotting the results, 1 for plotting
	'''
	[Fs, x] = audioBasicIO.readAudioFile(fileName)
	x = audioBasicIO.stereo2mono(x);
	Duration = len(x) / Fs

	[Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel("data/knnSpeakerAll")
	[Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel("data/knnSpeakerFemaleMale")

	[MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs*stWin), round(Fs*stWin*0.5));

	MidTermFeatures2 = numpy.zeros( (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1] ) )

	for i in range(MidTermFeatures.shape[1]):
		curF1 = (MidTermFeatures[:,i] - MEAN1)  / STD1
		curF2 = (MidTermFeatures[:,i] - MEAN2)  / STD2
		[Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
		[Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
		MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
		MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0]+len(classNames1), i] = P1 + 0.0001;
		MidTermFeatures2[MidTermFeatures.shape[0]+len(classNames1)::, i] = P2 + 0.0001;
	
	MidTermFeatures = MidTermFeatures2	# TODO	
	# SELECT FEATURES:
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20]; 																											# SET 0A
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100]; 																									# SET 0B
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 0C
	
	iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53]; 																	# SET 1A
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; 															# SET 1B
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 1C
	
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53]; 			# SET 2A		
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; 	# SET 2B
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 2C
	
	#iFeaturesSelect = range(100);																									# SET 3	
	#MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010  
	
	MidTermFeatures = MidTermFeatures[iFeaturesSelect,:]		
	
	(MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
	MidTermFeaturesNorm = MidTermFeaturesNorm[0].T	
	numOfWindows = MidTermFeatures.shape[1]

	# remove outliers:
	DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
	MDistancesAll = numpy.mean(DistancesAll)
	iNonOutLiers = numpy.nonzero(DistancesAll < 1.2*MDistancesAll)[0]
	
	# TODO: Combine energy threshold for outlier removal:
	#EnergyMin = numpy.min(MidTermFeatures[1,:])
	#EnergyMean = numpy.mean(MidTermFeatures[1,:])
	#Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
	#iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
	#print iNonOutLiers

	perOutLier = (100.0*(numOfWindows-iNonOutLiers.shape[0])) / numOfWindows	
	MidTermFeaturesNormOr = MidTermFeaturesNorm
	MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]
	
	# LDA dimensionality reduction:
	if LDAdim > 0:
		#[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));		
		# extract mid-term features with minimum step:
		mtWinRatio  = int(round(mtSize  / stWin));
		mtStepRatio = int(round(stWin / stWin));
		mtFeaturesToReduce = []			
		numOfFeatures = len(ShortTermFeatures)
		numOfStatistics = 2;			
		#for i in range(numOfStatistics * numOfFeatures + 1):
		for i in range(numOfStatistics * numOfFeatures):
			mtFeaturesToReduce.append([])

		for i in range(numOfFeatures):		# for each of the short-term features:
			curPos = 0
			N = len(ShortTermFeatures[i])
			while (curPos<N):
				N1 = curPos
				N2 = curPos + mtWinRatio
				if N2 > N:
					N2 = N
				curStFeatures = ShortTermFeatures[i][N1:N2]
				mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
				mtFeaturesToReduce[i+numOfFeatures].append(numpy.std(curStFeatures))				
				curPos += mtStepRatio		
		mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
				
		mtFeaturesToReduce2 = numpy.zeros( (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1] ) )
		for i in range(mtFeaturesToReduce.shape[1]):
			curF1 = (mtFeaturesToReduce[:,i] - MEAN1)  / STD1
			curF2 = (mtFeaturesToReduce[:,i] - MEAN2)  / STD2
			[Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
			[Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
			mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0]+len(classNames1), i] = P1 + 0.0001;
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]+len(classNames1)::, i] = P2 + 0.0001;
		mtFeaturesToReduce = mtFeaturesToReduce2		
		mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect,:]		
		#mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
		(mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])	
		mtFeaturesToReduce = mtFeaturesToReduce[0].T
		#DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
		#MDistancesAll = numpy.mean(DistancesAll)
		#iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
		#mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
		Labels = numpy.zeros((mtFeaturesToReduce.shape[1],));
		LDAstep = 1.0
		LDAstepRatio = LDAstep / stWin
		#print LDAstep, LDAstepRatio
		for i in range(Labels.shape[0]):
			Labels[i] = int(i*stWin/LDAstepRatio);		
		clf = LDA(n_components=LDAdim)
		clf.fit(mtFeaturesToReduce.T, Labels, tol=0.000001)
		MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

	if numOfSpeakers<=0:
		sRange = range(2,10)
	else:
		sRange = [numOfSpeakers]
	clsAll = []; silAll = []; centersAll = []
	
	for iSpeakers in sRange:
		cls, means, steps = mlpy.kmeans(MidTermFeaturesNorm.T, k=iSpeakers, plus=True)		# perform k-means clustering
		
		#YDist =   distance.pdist(MidTermFeaturesNorm.T, metric='euclidean')
		#print distance.squareform(YDist).shape
		#hc = mlpy.HCluster()
		#hc.linkage(YDist)
		#cls = hc.cut(14.5)
		#print cls

		# Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
		clsAll.append(cls)
		centersAll.append(means)
		silA = []; silB = []
		for c in range(iSpeakers):								# for each speaker (i.e. for each extracted cluster)
			clusterPerCent = numpy.nonzero(cls==c)[0].shape[0] / float(len(cls))
			if clusterPerCent < 0.020:
				silA.append(0.0)
				silB.append(0.0)
			else:
				MidTermFeaturesNormTemp = MidTermFeaturesNorm[:,cls==c]			# get subset of feature vectors
				Yt = distance.pdist(MidTermFeaturesNormTemp.T)				# compute average distance between samples that belong to the cluster (a values)
				silA.append(numpy.mean(Yt)*clusterPerCent)
				silBs = []
				for c2 in range(iSpeakers):						# compute distances from samples of other clusters
					if c2!=c:
						clusterPerCent2 = numpy.nonzero(cls==c2)[0].shape[0] / float(len(cls))
						MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:,cls==c2]
						Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
						silBs.append(numpy.mean(Yt)*(clusterPerCent+clusterPerCent2)/2.0)
				silBs = numpy.array(silBs)							
				silB.append(min(silBs))							# ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
		silA = numpy.array(silA); 
		silB = numpy.array(silB); 
		sil = []
		for c in range(iSpeakers):								# for each cluster (speaker)
			sil.append( ( silB[c] - silA[c]) / (max(silB[c],  silA[c])+0.00001)  )		# compute silhouette

		silAll.append(numpy.mean(sil))								# keep the AVERAGE SILHOUETTE

	#silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
	imax = numpy.argmax(silAll)									# position of the maximum silhouette value
	nSpeakersFinal = sRange[imax]									# optimal number of clusters

	# generate the final set of cluster labels
	# (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window)
	cls = numpy.zeros((numOfWindows,))
	for i in range(numOfWindows):
		j = numpy.argmin(numpy.abs(i-iNonOutLiers))		
		cls[i] = clsAll[imax][j]
		
	# Post-process method 1: hmm smoothing
	for i in range(1):
		startprob, transmat, means, cov = trainHMM_computeStatistics(MidTermFeaturesNormOr, cls)
		hmm = sklearn.hmm.GaussianHMM(startprob.shape[0], "diag", startprob, transmat)			# hmm training
		hmm.means_ = means; hmm.covars_ = cov
		cls = hmm.predict(MidTermFeaturesNormOr.T)					
	
	# Post-process method 2: median filtering:
	cls = scipy.signal.medfilt(cls, 13)
	cls = scipy.signal.medfilt(cls, 11)

	sil = silAll[imax]										# final silhouette
	classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)];


	# load ground-truth if available
	gtFile = fileName.replace('.wav', '.segments');							# open the annotation file
	if os.path.isfile(gtFile):									# if ground truth exists
		[segStart, segEnd, segLabels] = readSegmentGT(gtFile)					# read GT data
		flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)			# convert to flags

	if PLOT:
		fig = plt.figure()	
		if numOfSpeakers>0:
			ax1 = fig.add_subplot(111)
		else:
			ax1 = fig.add_subplot(211)
		ax1.set_yticks(numpy.array(range(len(classNames))))
		ax1.axis((0, Duration, -1, len(classNames)))
		ax1.set_yticklabels(classNames)
		ax1.plot(numpy.array(range(len(cls)))*mtStep+mtStep/2.0, cls)

	if os.path.isfile(gtFile):
		if PLOT:
			ax1.plot(numpy.array(range(len(flagsGT)))*mtStep+mtStep/2.0, flagsGT, 'r')
		purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(cls, flagsGT)
		print "{0:.1f}\t{1:.1f}".format(100*purityClusterMean, 100*puritySpeakerMean)
		if PLOT:
			plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100*purityClusterMean, 100*puritySpeakerMean) )
	if PLOT:
		plt.xlabel("time (seconds)")
		#print sRange, silAll	
		if numOfSpeakers<=0:
			plt.subplot(212)
			plt.plot(sRange, silAll)
			plt.xlabel("number of clusters");
			plt.ylabel("average clustering's sillouette");
		plt.show()
Example #43
    s.send("OK")
    buf = buffer(msg)
    A = numpy.frombuffer(buf, dtype=md['dtype'])
    return A.reshape(md['shape'])


ctx = zmq.Context()
s = ctx.socket(zmq.REP)
s.bind("ipc:///tmp/zmq-test")
pref = s.recv()
s.send("OK")
X = recv_array(s)
Y = recv_array(s)
lda = LDA(n_components=X.shape[1])
lda.fit(X,Y)
XNew = lda.transform(X)
##
XDiff = numpy.sum(numpy.square(XNew[:,3:]), 1)
Y = XDiff
norm = None
cmap = cm.get_cmap(name="hot")
##
density = graphdensity.Density(X)
##

test = euclidean_distances(X, X)
test = numpy.reshape(test, (-1))
indices = numpy.argsort(test)
numItems = X.shape[0]
def convert(index, x, y):
    return (int(math.floor(index / x)), index % y)
Example #44
    def drawLDA3Class(X_train,X_test,suffix=""):
        X=X_train[0]+X_train[1]+X_train[2]
        Y=[0]*len(X_train[0])+[1]*len(X_train[1])+[2]*len(X_train[2])
        Yt=[0]*len(X_test[0])+[1]*len(X_test[1])+[2]*len(X_test[2])
        featCount = len(X_train[0][0])
        X = np.array(np.array(X))
        lda = LDA(n_components=2)
        canfit = False
        fitFeat = featCount
        while not canfit:
            try:
                X = X[:, :fitFeat]
                lda.fit(X, Y)
                canfit = True
            except Exception:
                fitFeat = fitFeat // 2
                if fitFeat == 0:
                    raise  # no features left; avoid looping forever
        Xlda = []
        Xldat = []
        for ind in range(3):
            trainFit = np.array(lda.transform(np.array(X_train[ind])[:,:fitFeat]))
            testFit = np.array(lda.transform(np.array(X_test[ind])[:,:fitFeat]))
            Xlda.append(trainFit)
            Xldat.append(testFit)
        Xlda = np.array(Xlda)
        Xldat = np.array(Xldat)
        plc=0

        Xs = np.vstack(Xlda)
        Xst = np.vstack(Xldat)
        clf = SVC(C=0.1,kernel="linear")
        clf.fit(Xs,Y)
        Yp = clf.predict(Xs)
        Ytp = clf.predict(Xst)
        print(accuracy_score(Y,Yp),accuracy_score(Yt,Ytp))


        Xsa = np.vstack([Xs,Xst])
        x_min, x_max = Xsa[:, 0].min() - 1, Xsa[:, 0].max() + 1
        y_min, y_max = Xsa[:, 1].min() - 1, Xsa[:, 1].max() + 1
        h = (x_max-x_min)/100.0
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.contour(xx, yy, Z)
        gen = monochrome_style_generator()
        for ind in range(3):
            plt.scatter(Xlda[ind][:,0],Xlda[ind][:,1],color=plp[plc][0],marker=plp[plc][1],label="Class"+str(ind))
            plc+=1
        plt.xlabel("feature1")
        plt.ylabel("feature2")
        plt.title("Classification with {0} / Accuracy {1:.5f}".format(useFeature,accuracy_score(Y,Yp)))
        plt.legend()
        plt.savefig("./learn/visualize/lda3c_"+useFeature+suffix+"_trainData.png")
        plt.clf()

        plc=0
        Xsa = np.vstack([Xs,Xst])
        x_min, x_max = Xsa[:, 0].min() - 1, Xsa[:, 0].max() + 1
        y_min, y_max = Xsa[:, 1].min() - 1, Xsa[:, 1].max() + 1
        h = (x_max-x_min)/100.0
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.contour(xx, yy, Z)
        for ind in range(3):
            plt.scatter(Xldat[ind][:,0],Xldat[ind][:,1],color=plp[plc][0],marker=plp[plc][1],label="Class"+str(ind))
            plc+=1
        plt.xlabel("feature1")
        plt.ylabel("feature2")
        plt.title("Classification with {0} / Accuracy {1:.5f}".format(useFeature,accuracy_score(Yt,Ytp)))
        plt.legend()
        plt.savefig("./learn/visualize/lda3c_"+useFeature+suffix+"_testData.png")
        plt.clf()
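# A hypothetical usage sketch for drawLDA3Class (not from the source): the
# helper reads plp (a list of (color, marker) pairs) and useFeature from its
# enclosing scope, so both are assumed here. X_train / X_test are lists of
# per-class feature-row lists.
# plp = [("r", "o"), ("g", "s"), ("b", "^")]
# useFeature = "mfcc"
# drawLDA3Class(X_train, X_test, suffix="_fold0")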
Beispiel #45
    def execute(self, i, j):
        global save1
        global save2
        jk = i  # keep a copy of the argument: the prediction loop below reuses the name i
        kf = KFold(len(self.x_train), n_folds=self.k_cross)
        for train_idx, test_idx in kf:
            x_train, x_test = self.x_train[train_idx], self.x_train[test_idx]
            y_train, y_test = self.y_train[train_idx], self.y_train[test_idx]
            # reduce dimensionality with an LDA fitted on the training fold only
            dim_red = LDA()
            x_train = dim_red.fit_transform(x_train, y_train)
            x_test = dim_red.transform(x_test)

            stat_obj = self.stat_class()  # instantiate the configured model via reflection
            stat_obj.train(x_train, y_train)
            y_pred = [0 for _ in range(len(y_test))]
            if int(jk) == 1:
                save1 = stat_obj
                save2 = dim_red
            for i in range(len(x_test)):
                # round the raw prediction and clamp it to the valid rating range
                val = int(np.round(stat_obj.predict(x_test[i])))
                if val > self.range_max:
                    val = self.range_max
                if val < self.range_min:
                    val = self.range_min
                y_pred[i] = [val]
            y_pred = np.matrix(y_pred)
            cohen_kappa_rating = own_wp.quadratic_weighted_kappa(
                y_test, y_pred, self.range_min, self.range_max)
            self.values.append(cohen_kappa_rating)
        return str(sum(self.values) / self.k_cross)
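# For reference, a self-contained sketch of the quadratic weighted kappa
# metric that own_wp.quadratic_weighted_kappa computes above. This follows
# the standard definition (1 - weighted observed / weighted expected
# disagreement), not necessarily the source's exact implementation.
import numpy as np

def quadratic_weighted_kappa_sketch(y_true, y_pred, min_rating, max_rating):
    y_true = np.asarray(y_true, dtype=int).ravel()
    y_pred = np.asarray(y_pred, dtype=int).ravel()
    n = max_rating - min_rating + 1
    O = np.zeros((n, n))  # observed co-occurrence of (true, predicted) ratings
    for t, p in zip(y_true, y_pred):
        O[t - min_rating, p - min_rating] += 1
    E = np.outer(O.sum(axis=1), O.sum(axis=0)) / O.sum()  # expected by chance
    i, j = np.indices((n, n))
    W = (i - j) ** 2 / float((n - 1) ** 2)  # quadratic disagreement weights
    return 1.0 - (W * O).sum() / (W * E).sum()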
Beispiel #46
def LDA_reduction(posture, trainblock, component):
    currentdirectory = os.getcwd()  # get the directory.
    parentdirectory = os.path.abspath(
        currentdirectory + "/../..")  # Get the parent directory(2 levels up)
    path = os.path.join(parentdirectory, 'Output Files',
                        'E5-Dimensionality Reduction',
                        'posture-' + str(posture),
                        'TrainBlock-' + str(trainblock))
    if not os.path.exists(path):
        os.makedirs(path)
    i_user = 1
    block = 1
    AUC = []
    while i_user <= 31:
        while block <= 6:
            train_data = np.genfromtxt(
                "../../Output Files/E3-Genuine Impostor data per user per posture/posture-"
                + str(posture) + "/User-" + str(i_user) + "/1-" + str(i_user) +
                "-" + str(posture) + "-" + str(trainblock) + "-GI.csv",
                dtype=float,
                delimiter=",")
            test_data = np.genfromtxt(
                "../../Output Files/E3-Genuine Impostor data per user per posture/posture-"
                + str(posture) + "/User-" + str(i_user) + "/1-" + str(i_user) +
                "-" + str(posture) + "-" + str(block) + "-GI.csv",
                dtype=float,
                delimiter=",")

            target_train = np.ones(len(train_data))
            row = 0
            while row < len(train_data):
                if np.any(train_data[row, 0:3] != [1, i_user, posture]):
                    target_train[row] = 0
                row += 1

            row = 0
            target_test = np.ones(len(test_data))
            while row < len(test_data):
                if np.any(test_data[row, 0:3] != [1, i_user, posture]):
                    target_test[row] = 0
                row += 1

            sample_train = train_data[:, [
                3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 15, 16, 17
            ]]
            sample_test = test_data[:, [
                3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 15, 16, 17
            ]]
            scaler = preprocessing.MinMaxScaler().fit(sample_train)
            sample_train_scaled = scaler.transform(sample_train)
            sample_test_scaled = scaler.transform(sample_test)

            lda = LDA(n_components=component)
            sample_train_lda = lda.fit(
                sample_train_scaled,
                target_train).transform(sample_train_scaled)
            sample_test_lda = lda.transform(sample_test_scaled)

            clf = ExtraTreesClassifier(n_estimators=100)
            clf.fit(sample_train_lda, target_train)

            prediction = clf.predict(sample_test_lda)
            auc = metrics.roc_auc_score(target_test, prediction)
            AUC.append(auc)

            block += 1

        block = 1
        i_user += 1
    print(AUC)
    AUC = np.array(AUC)
    AUC = AUC.reshape(31, 6)
    np.savetxt("../../Output Files/E5-Dimensionality Reduction/posture-" +
               str(posture) + "/TrainBlock-" + str(trainblock) + "/LDA-" +
               str(componenet) + "-Component.csv",
               AUC,
               delimiter=",")
Beispiel #47
# (The head of this example is truncated in the source. Everything above
#  "whiten=True)" is reconstructed on the assumption of an IncrementalPCA
#  branch that mirrors the LDA branch below.)
if args.pca:
    print("PCA transforming...")
    ipca = IncrementalPCA(n_components=args.pl_dim,
                          whiten=True)
    Train = ipca.fit_transform(Train)
    Devel = ipca.transform(Devel)  # transform only; refitting on the devel set would leak
elif args.lda:
    print("LDA transforming...")
    lda = LDA(n_components=args.pl_dim)

    if args.arousal:
        labels = Train_L[:, 0]
    elif args.valence:
        labels = Train_L[:, 1]
    elif args.liking:
        labels = Train_L[:, 2]

    lda = lda.fit(Train, labels)  # learn the projection matrix
    Train = lda.transform(Train)
    Devel = lda.transform(Devel)

print("After feature transformation")
print("Train feature shape: ", Train.shape)
print("Train_L feature shape: ", Train_L.shape)

print("Devel feature shape: ", Devel.shape)
print("Devel_L feature shape: ", Devel_L.shape)

if args.path_save_train_feat:
    np.savetxt(args.path_save_train_feat,
               np.append(Train, Train_L, axis=1),
               delimiter=args.delim)
if args.path_save_devel_feat:
    np.savetxt(args.path_save_devel_feat,
               np.append(Devel, Devel_L, axis=1),
               delimiter=args.delim)
Beispiel #48
# newdata = normdata
# for i in range(5):
# 	print newdata[i]

print "data done"
print "logistic initialized"
# clf.fit(data[:,:-1], data[:,-1])
print "fitted data"
skf = StratifiedKFold(data[:,-1], n_folds=10, shuffle=True)
output =[]
finalscore = 0
counter = 0
for train, test in skf:
	counter = counter + 1
	newdata = prj.fit_transform([ normdata[i][:] for i in train ],[ data[i][-1] for i in train ])
	newtestdata = prj.transform([ normdata[i][:] for i in test ])
	clf = GradientBoostingClassifier(warm_start = True)
	clf = clf.fit(newdata, [ data[i][-1] for i in train ])
	prediction = clf.predict(newtestdata)
	# pred = []
	# for i in prediction:
	# 	if(i > 1.5):
	# 		pred.append(2)
	# 	else:
	# 		pred.append(1)
	finalscore = finalscore + score.get_score( prediction , [ data[i][-1] for i in test ])
	print "done"
# score = cross_val_score(clf, newdata[:,:], data[:,-1], cv = 5, scoring = 'get_score')
# print "in scores"
# for i in score:
# 	print i
Beispiel #49
                c=c,
                label=l,
                marker=m)

plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='upper right')
title = 'Projecting Feature Set onto New Feature Space'
plt.title(title)
plt.tight_layout()
ocr_utils.show_figures(plt, title)

###############################################################################
lda = LDA(n_components=2)
X_train_lda = lda.fit_transform(X_train_std, y_train)
X_test_lda = lda.transform(X_test_std)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr = lr.fit(X_train_lda, y_train)

title = 'Linear Discriminant Analysis Training Set'
ocr_utils.plot_decision_regions(X_train_lda,
                                y_train,
                                classifier=lr,
                                labels=['LD 1', 'LD 2'],
                                title=title)

title = 'Linear Discriminant Analysis Test Set'

ocr_utils.plot_decision_regions(X_test_lda,
Beispiel #50
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
import matplotlib.pyplot as plt
import sklearn.linear_model as LM
import numpy as np
from sklearn.lda import LDA  # assumed import: LDA is used below but not imported in the excerpt
from sklearn.metrics import precision_recall_fscore_support

fname = "./3_que_data/train.csv"
train_X = np.genfromtxt(fname, delimiter=",")
train_Y = np.genfromtxt("./3_que_data/train_labels.csv", delimiter=",")

test_X = np.genfromtxt("./3_que_data/test.csv", delimiter=",")
test_Y = np.genfromtxt("./3_que_data/test_labels.csv", delimiter=",")

clf = LDA()
clf.fit(train_X, train_Y)

train_X_transformed = clf.transform(train_X)
train_X_transformed = train_X_transformed.flatten()
print(train_X_transformed.shape)
print(clf.coef_)

plt.plot(train_X_transformed[:1000], [10] * 1000, "ro", label="Class 1")
plt.plot(train_X_transformed[1000:], [10] * 1000, "bo", label="Class 2")
plt.plot([0] * 21, range(21), "g", label="Decision Boundary")
plt.axis([-6, 6, 0, 20])
plt.xlabel("X-axis")
plt.ylabel("Y-axis")
plt.legend()
plt.show()
print(precision_recall_fscore_support(test_Y, clf.predict(test_X), labels=[1, 2]))
Beispiel #51


# (2) Linear Discriminant Analysis (LDA) - linearly separable
# --- Evaluate Importance of LDA
from sklearn.linear_model import LogisticRegression
from sklearn.lda import LDA
lda = LDA(n_components=None)
X_train_lda = lda.fit_transform(X_train_std, Y_train)  # fit with the train set; (X, y) supervised
eigen_vals = lda.explained_variance_ratio_  # explained-variance ratio per discriminant (importance)


# --- Fitting Model with LDA
lda = LDA(n_components=2)
X_train_lda = lda.fit_transform(X_train_std, Y_train)  # fit with the train set; (X, y) supervised
X_test_lda = lda.transform(X_test_std)  # only transform the test set
lr = LogisticRegression()  # assumption: lr is not defined earlier in this excerpt
lr.fit(X_train_lda, Y_train)


# (3) Kernel Principal Component Analysis (K-PCA) - non-linearly separable
from sklearn.decomposition import KernelPCA
scikit_kpca = KernelPCA(n_components=2, kernel='rbf', gamma=15) # can choose other kernel methods, 2 PCAs = features
X_skernpca = scikit_kpca.fit_transform(X_train_std)

# - Explore Visually (Separatible?)
# > Normal PCA
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
scikit_pca = PCA(n_components=2) # only first 2 PCAs
X_spca = scikit_pca.fit_transform(X_train_std) 
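# A minimal continuation sketch (assumed, not from the source): scatter the
# first two principal components per class to judge linear separability.
# Colours and markers are illustrative; np, X_spca and Y_train are assumed
# to be in scope from the lines above.
for label, c, m in zip(np.unique(Y_train), 'rb', 'ox'):
    plt.scatter(X_spca[Y_train == label, 0],
                X_spca[Y_train == label, 1],
                c=c, marker=m, label=label)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='best')
plt.show()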
Beispiel #52
features = np.vstack(
    (x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16,
     x17, x18, x19, x20, x21, x22, x23, x24, x25))
print(features.shape)

### PCA to reduce the dimensions of data
pca = PCA(n_components=5400)
pca.fit(features)
#print(pca.explained_variance_ratio_)
reduced_features = pca.transform(features)
print(reduced_features.shape)

### LDA to project data into most discriminative (n-1) directions where n is the number of classes
lda = LDA()
lda.fit(reduced_features, labels)
new_features = lda.transform(reduced_features)
print(new_features.shape)

### Classification

# partition the data into training and testing splits, using 95%
# of the data for training and the remaining 5% for testing
# (note: this splits the raw `features`; `new_features` above holds the
#  LDA-projected version)
(trainFeat, testFeat, trainLabels,
 testLabels) = train_test_split(features,
                                labels,
                                test_size=0.05,
                                random_state=42)

### KNN Classifier with 20% accuracy score
# train and evaluate a k-NN classifer on the histogram
# representations
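# The k-NN step itself is truncated in the source; a minimal sketch of what
# the comment above describes (KNeighborsClassifier and n_neighbors=5 are
# assumptions):
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(trainFeat, trainLabels)
print("k-NN accuracy: {:.2f}%".format(100.0 * knn.score(testFeat, testLabels)))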
Beispiel #53
lda = LDA(n_components=2)
X_train_lda = lda.fit_transform(X_train_std, y_train)

lr = LogisticRegression()
lr = lr.fit(X_train_lda, y_train)

plot_decision_regions(X_train_lda, y_train, classifier=lr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower left')
# plt.tight_layout()
# plt.savefig('./images/lda3.png', dpi=300)
plt.show()

X_test_lda = lda.transform(X_test_std)

plot_decision_regions(X_test_lda, y_test, classifier=lr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower left')
# plt.tight_layout()
# plt.savefig('./images/lda4.png', dpi=300)
plt.show()


#############################################################################
print(50 * '=')
print('Section: Implementing a kernel principal component analysis in Python')
print(50 * '-')
Beispiel #54
def visualizeFeaturesFolder(folder, dimReductionMethod, priorKnowledge="none"):
    '''
    This function generates a chordial visualization for the recordings of the provided path.
    ARGUMENTS:
        - folder:		path of the folder that contains the WAV files to be processed
        - dimReductionMethod:	method used to reduce the dimension of the initial feature space before computing the similarity ("pca", otherwise LDA is used)
        - priorKnowledge:	if this is set equal to "artist", recordings that share an artist name are merged into a single LDA class
    '''
    if dimReductionMethod == "pca":
        allMtFeatures, wavFilesList = aF.dirWavFeatureExtraction(folder, 30.0, 30.0, 0.050, 0.050, computeBEAT=True)
        if allMtFeatures.shape[0] == 0:
            print("Error: No data found! Check input folder")
            return

        namesCategoryToVisualize = [ntpath.basename(w).replace('.wav', '').split(" --- ")[0] for w in wavFilesList]
        namesToVisualize = [ntpath.basename(w).replace('.wav', '') for w in wavFilesList]

        (F, MEAN, STD) = aT.normalizeFeatures([allMtFeatures])
        F = np.concatenate(F)
        pca = mlpy.PCA(method='cov')  # pca (eigenvalue decomposition)
        pca.learn(F)
        coeff = pca.coeff()

        # check that the new PCA dimension is at most equal to the number of samples
        K1 = 2
        K2 = 10
        if K1 > F.shape[0]:
            K1 = F.shape[0]
        if K2 > F.shape[0]:
            K2 = F.shape[0]

        finalDims = pca.transform(F, k=K1)
        finalDims2 = pca.transform(F, k=K2)
    else:
        allMtFeatures, Ys, wavFilesList = aF.dirWavFeatureExtractionNoAveraging(folder, 20.0, 5.0, 0.040,
                                                                                0.040)  # long-term statistics cannot be applied in this context (LDA needs mid-term features)
        if allMtFeatures.shape[0] == 0:
            print("Error: No data found! Check input folder")
            return

        namesCategoryToVisualize = [ntpath.basename(w).replace('.wav', '').split(" --- ")[0] for w in wavFilesList]
        namesToVisualize = [ntpath.basename(w).replace('.wav', '') for w in wavFilesList]

        ldaLabels = Ys
        if priorKnowledge == "artist":
            uNamesCategoryToVisualize = list(set(namesCategoryToVisualize))
            YsNew = np.zeros(Ys.shape)
            for i, uname in enumerate(uNamesCategoryToVisualize):  # for each unique artist name:
                indicesUCategories = [j for j, x in enumerate(namesCategoryToVisualize) if x == uname]
                for j in indicesUCategories:
                    indices = np.nonzero(Ys == j)
                    YsNew[indices] = i
            ldaLabels = YsNew

        (F, MEAN, STD) = aT.normalizeFeatures([allMtFeatures])
        F = np.array(F[0])

        clf = LDA(n_components=10)
        clf.fit(F, ldaLabels)
        reducedDims = clf.transform(F)

        pca = mlpy.PCA(method='cov')  # pca (eigenvalue decomposition)
        pca.learn(reducedDims)
        coeff = pca.coeff()
        reducedDims = pca.transform(reducedDims, k=2)

        # TODO: check this... should LDA be used only in the semi-supervised case?

        uLabels = np.sort(np.unique((Ys)))  # uLabels must have as many labels as the number of wavFilesList elements
        reducedDimsAvg = np.zeros((uLabels.shape[0], reducedDims.shape[1]))
        finalDims = np.zeros((uLabels.shape[0], 2))
        for i, u in enumerate(uLabels):
            indices = [j for j, x in enumerate(Ys) if x == u]
            f = reducedDims[indices, :]
            finalDims[i, :] = f.mean(axis=0)
        finalDims2 = reducedDims

    for i in range(finalDims.shape[0]):
        plt.text(finalDims[i, 0], finalDims[i, 1], ntpath.basename(wavFilesList[i].replace('.wav', '')),
                 horizontalalignment='center', verticalalignment='center', fontsize=10)
        plt.plot(finalDims[i, 0], finalDims[i, 1], '*r')
    plt.xlim([1.2 * finalDims[:, 0].min(), 1.2 * finalDims[:, 0].max()])
    plt.ylim([1.2 * finalDims[:, 1].min(), 1.2 * finalDims[:, 1].max()])
    plt.show()

    SM = 1.0 - distance.squareform(distance.pdist(finalDims2, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0

    chordialDiagram("visualization", SM, 0.50, namesToVisualize, namesCategoryToVisualize)

    SM = 1.0 - distance.squareform(distance.pdist(F, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0
    chordialDiagram("visualizationInitial", SM, 0.50, namesToVisualize, namesCategoryToVisualize)

    # plot super-categories (i.e. artist names)
    uNamesCategoryToVisualize = sorted(list(set(namesCategoryToVisualize)))
    finalDimsGroup = np.zeros((len(uNamesCategoryToVisualize), finalDims2.shape[1]))
    for i, uname in enumerate(uNamesCategoryToVisualize):
        indices = [j for j, x in enumerate(namesCategoryToVisualize) if x == uname]
        f = finalDims2[indices, :]
        finalDimsGroup[i, :] = f.mean(axis=0)

    SMgroup = 1.0 - distance.squareform(distance.pdist(finalDimsGroup, 'cosine'))
    for i in range(SMgroup.shape[0]):
        SMgroup[i, i] = 0.0
    chordialDiagram("visualizationGroup", SMgroup, 0.50, uNamesCategoryToVisualize, uNamesCategoryToVisualize)
Beispiel #55
######## Question (i) ############################################################

lda_train = []
lda_test = []
qda_train = []
qda_test = []
knn_train = []
knn_test = []

lda_model = LDA()
qda_model = QDA()
knn_model = KNeighborsClassifier(n_neighbors=7)
for i in range(1, 11):
	sklearn_lda = LDA(n_components=i)
	Xred_pca = sklearn_lda.fit_transform(X_std, y)  # LDA projection (the "_pca" in the name is a misnomer)
	Xred_pca_test = sklearn_lda.transform(X_std_test)
	lda_model.fit(Xred_pca,y)
	qda_model.fit(Xred_pca,y)
	knn_model.fit(Xred_pca,y)

	yhat_train = lda_model.predict(Xred_pca)
	lda_train.append(zero_one_loss(y, yhat_train)) 
	yhat_test = lda_model.predict(Xred_pca_test)
	lda_test.append(zero_one_loss(ytest, yhat_test)) 

	yhat_train = qda_model.predict(Xred_pca)
	qda_train.append(zero_one_loss(y, yhat_train)) 
	yhat_test = qda_model.predict(Xred_pca_test)
	qda_test.append(zero_one_loss(ytest, yhat_test)) 

	yhat_train = knn_model.predict(Xred_pca)