Beispiel #1
1
def tryLinearDiscriminantAnalysis(goFast):
  """Grid-search RandomizedPCA hyper-parameters and report the validation
  accuracy of an LDA classifier trained on each projection (Python 2 syntax).

  :param goFast: when truthy, load the smaller "dt1_1500" SVMlight files
      instead of the full "dt1" files (same 253659-dimensional feature space).
  """
  from sklearn.datasets import dump_svmlight_file, load_svmlight_file
  if goFast:
    training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
  else:
    training_data, training_labels = load_svmlight_file("dt1.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1.tst.svm", n_features=253659, zero_based=True)

  from sklearn.lda import LDA
  from sklearn.metrics import accuracy_score
  from sklearn.grid_search import ParameterGrid
  from sklearn.decomposition import RandomizedPCA

  # Candidate PCA settings; whitening is always on.
  rpcaDataGrid = [{"n_components": [10,45,70,100],
                    "iterated_power": [2, 3, 4],
                    "whiten": [True]}]

  for rpca_parameter_set in ParameterGrid(rpcaDataGrid):
    rpcaOperator = RandomizedPCA(**rpca_parameter_set)
    rpcaOperator.fit(training_data,training_labels)
    # NOTE(review): transform() presumably ignores the label argument passed
    # here -- confirm against the sklearn version in use.
    new_training_data = rpcaOperator.transform(training_data,training_labels)
    new_validation_data = rpcaOperator.transform(validation_data,validation_labels)
    ldaOperator = LDA()
    ldaOperator.fit(new_training_data,training_labels)
    print "Score = " + str(accuracy_score(validation_labels,ldaOperator.predict(new_validation_data)))
Beispiel #2
1
class Ensemble:

	def __init__(self, data):
		"""Build four heterogeneous base classifiers and immediately run the
		ensemble prediction on `data`."""
		self.rf = RandomForestClassifier(n_estimators=80, n_jobs=-1, min_samples_split=45, criterion='entropy')
		self.lda = LDA()
		self.dec = DecisionTreeClassifier(criterion='entropy')
		self.ada = AdaBoostClassifier(n_estimators=500, learning_rate=0.25)

		self.make_prediction(data)


	def make_prediction(self, data):
		'''
		Fit every base model on the training split and store the
		majority-vote test-set predictions in self.pred.
		'''
		for model in (self.rf, self.lda, self.dec, self.ada):
			model.fit(data.features_train, data.labels_train)

		rf_votes = self.rf.predict(data.features_test)
		lda_votes = self.lda.predict(data.features_test)
		dec_votes = self.dec.predict(data.features_test)
		ada_votes = self.ada.predict(data.features_test)

		self.pred = []
		for votes in zip(rf_votes, lda_votes, dec_votes, ada_votes):
			votes = list(votes)
			# Stable sort by vote count: on a tie, the earlier model's
			# vote (rf, then lda, dec, ada) wins, as in the original.
			self.pred.append(sorted(votes, key=votes.count, reverse=True)[0])
    def test_twomethods(self):
        """Check that an EPAC Methods(LDA, SVC) workflow produces the same
        predictions as fitting the two scikit-learn estimators directly,
        both via the run() results and via reduce()."""
        key_y_pred = 'y' + conf.SEP + conf.PREDICTION
        X, y = datasets.make_classification(n_samples=20, n_features=5,
                                            n_informative=2)
        # = With EPAC
        wf = Methods(LDA(), SVC(kernel="linear"))
        r_epac = wf.run(X=X, y=y)

        # = With SKLEARN
        lda = LDA()
        svm = SVC(kernel="linear")
        lda.fit(X, y)
        svm.fit(X, y)
        r_sklearn = [lda.predict(X), svm.predict(X)]

        # Comparison
        for i_cls in range(2):
            comp = np.all(np.asarray(r_epac[i_cls][key_y_pred]) ==
                                    np.asarray(r_sklearn[i_cls]))
            self.assertTrue(comp, u'Diff Methods')

        # test reduce -- call reduce() once instead of twice: the original
        # rebuilt the whole result set a second time for the second index.
        reduced = wf.reduce().values()
        r_epac_reduce = [reduced[0][key_y_pred], reduced[1][key_y_pred]]
        comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn))
        self.assertTrue(comp, u'Diff Perm / CV: EPAC reduce')
def LDAmeanScore(X, Y, n_folds, dim_reduction=0):
    """Mean K-fold cross-validation score of an LDA classifier, with optional
    PCA dimensionality reduction first (Python 2 syntax).

    :param X: classifier input matrix, n_samples * n_parameters
        (n_parameters >= 2, n_samples > 0), assumed suitable for LDA.
    :param Y: label vector of length n_samples.
    :param n_folds: number of KFold splits, > 1.
    :param dim_reduction: 0 -> no reduction; -1 -> pick the best dimension
        automatically; any positive value -> PCA-reduce to that many dims.
    :return: mean cross-validation score in percent (also printed), or -1
        when the sample count is not greater than n_folds.
    """
    if dim_reduction > 0 and X.shape[1] > dim_reduction:
        X = dim_reduction_PCA(X, dim_reduction)
    if dim_reduction == -1:
        dim_reduction = best_dimension(X)
        print "Best dimension : " + str(dim_reduction)
        X = dim_reduction_PCA(X, dim_reduction)

    if X.shape[0] > n_folds:
        # Cross-validation to estimate the performance of an LDA classifier.
        kf = KFold(n=len(Y), n_folds=n_folds, shuffle=True, random_state=None)
        scores = []
        for train_index, test_index in kf:
            X_train, X_test = X[train_index, :], X[test_index, :]
            Y_train, Y_test = Y[train_index], Y[test_index]
            cl = LDA()
            cl.fit(X_train, Y_train)
            scores.append(cl.score(X_test, Y_test))

        print "Score moyen : ", np.mean(np.array(scores))
        return 100.0 * np.mean(np.array(scores))
    else:
        return -1
	def LDA模型(self, 問題, 答案):
		"""Fit an LDA classifier on (問題, 答案) and return a prediction closure."""
		model = LDA()
		print('訓練LDA')
		model.fit(問題, 答案)
		print('訓練了')
		return lambda samples: model.predict(samples)
Beispiel #6
0
    def test_all_methods(self):
        """Fit logistic regression, LDA, QDA and 1-NN on the 1990-2008 weekly
        data using Lag2 only, tabulating each model's predictions on the
        post-2008 test rows against the actual direction."""
        x_cols = ["Lag2"]
        formula = "Direction~Lag2"
        year = self.df["Year"]
        train_data = self.df.ix[(year >= 1990) & (year <= 2008), :]
        test_data = self.df.ix[year > 2008, :]

        """ (d) logistic"""
        glm_fit = smf.glm(formula, data=train_data, family=sm.families.Binomial()).fit()
        probs = Series(glm_fit.predict(sm.add_constant(test_data[["Lag2"]])))
        pred_values = probs.map(lambda p: "Down" if p > 0.5 else "Up")
        tp.output_table(pred_values.values, test_data[self.y_col].values)

        train_X = train_data[x_cols].values
        train_y = train_data[self.y_col].values
        test_X = test_data[x_cols].values
        test_y = test_data[self.y_col].values
        """ (e) LDA """
        tp.output_table(LDA().fit(train_X, train_y).predict(test_X), test_y)
        """ (f) QDA """
        tp.output_table(QDA().fit(train_X, train_y).predict(test_X), test_y)
        """ (g) KNN """
        knn = neighbors.KNeighborsClassifier(1, weights="uniform")
        knn.fit(train_X, train_y)
        tp.output_table(knn.predict(test_X), test_y)
        """ (h) logistic and LDA """
        """ (i) Is the purpose of the last question going through all methods with no direction?"""
def score(train_X, train_y):
    """Hold out 1% of the data, fit LDA on the remainder and return the
    log-loss of its predicted probabilities on the hold-out split."""
    split = train_test_split(train_X, train_y, test_size=0.01, random_state=10)
    X_fit, X_holdout, y_fit, y_holdout = split
    model = LDA()
    model.fit(X_fit, y_fit)
    return log_loss(y_holdout, model.predict_proba(X_holdout))
Beispiel #8
0
def test_classification():
    """Vectorise the training corpus with TF-IDF, reduce it to 9 LDA
    components, then fit an MLP and print its training accuracy
    (Python 2 syntax)."""
    from read import read
    import numpy, tfidf
    from sklearn.decomposition import TruncatedSVD
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import Normalizer

    m, files = read("training.json")
    # Build an ordered list of unique topic names...
    y_map = [str(file["topic"]) for file in files]
    map = []
    for i in range(len(y_map)):
        if(len(map) == 0 or not map.__contains__(y_map[i])):
            map.append(y_map[i])
    # ...and encode each document's topic as its index in that list.
    y = numpy.array([map.index(y_map[i]) for i in range(len(y_map))])

    print("Construindo TF-IDF...")
    X, vectorizer = tfidf.vectorizeTFIDF(files)
    print X.shape

    print("Performing dimensionality reduction using LDA...")

    lda = LDA(n_components=9)
    # LDA needs a dense matrix; the TF-IDF output is sparse.
    X = X.toarray()
    lda.fit(X, y)
    X = lda.transform(X)

    mlp = MLPClassifier()
    mlp.fit(X, y)
    # NOTE(review): accuracy is measured on the training data itself.
    training_score = mlp.score(X, y)
    print("training accuracy: %f" % training_score)
Beispiel #9
0
def main():
    """For survey questions 3..17, train four classifiers on that question's
    train split and print each one's percent accuracy on the test split."""
    # (label format string, estimator factory) pairs, printed in this order.
    model_specs = [
        ("Nearest Neighbor: %.2f", lambda: KNeighborsClassifier(n_neighbors=5)),
        ("Support Vector Machines: %.2f", lambda: svm.SVC()),
        ("Random Forrest:  %.2f", lambda: RandomForestClassifier(n_estimators=100)),
        ("Linear Discriminant Analysis Classifier: %.2f", lambda: LDA(solver='lsqr')),
    ]

    for question in range(3, 18):
        print("Question ", question, " Percent Accuracy")

        train_X, train_y, test_X, test_y = loadTrainingAndTestData(question)

        for label_fmt, make_model in model_specs:
            model = make_model()
            model.fit(train_X, train_y)
            predictions = model.predict(test_X)
            print(label_fmt % (100 * accuracy_score(test_y, predictions)), "%")
Beispiel #10
0
def main():
    """Train an LDA face classifier on every stored face vector, then run an
    interactive menu loop offering live / single-image / general testing."""
    logging.basicConfig(format='[%(asctime)s] %(levelname)7s: %(message)s', level=logging.DEBUG)

    image_numbers = generate_all_image_numbers(no_of_persons, samples_person)
    classes = image_numbers[:, 0]
    face_vectors = load_face_vectors_from_disk(image_numbers, image_size)

    classifier = LDA()
    logging.debug("Training..")
    classifier.fit(face_vectors, classes)

    menu = (
        "0)Exit\n"
        "1)Live test\n"
        "2)Test image \"test.JPG\"\n"
        "3)General test\n"
        "\n"
        "Choose function:"
    )
    while True:
        choice = input(menu)
        if choice == "1":
            test_live(classifier, face_vectors)
        elif choice == "2":
            test_one_image(classifier, face_vectors)
        elif choice == "3":
            test(face_vectors, classes)
        elif choice == "0":
            return
Beispiel #11
0
def runLDA(all_kmer_vectors_array,labels):
    """Project the k-mer count vectors onto 4 LDA components; print and
    return the transformed matrix."""
    features = np.array(all_kmer_vectors_array)
    targets = np.array(labels)
    projected = LDA(n_components=4).fit_transform(features, targets)
    print(projected)
    return projected
Beispiel #12
0
def LDAClassify_Proba(enrollment_id, trainData, trainLabel, testData):
    """Fit a least-squares LDA on the training set, save the positive-class
    probabilities for testData to CSV, and return them."""
    model = LDA(solver='lsqr')
    model.fit(trainData, ravel(trainLabel))
    positive_proba = model.predict_proba(testData)[:, 1]
    saveResult(enrollment_id, positive_proba, 'Proba_sklearn_LDA.csv')
    return positive_proba
def naive_bayes_with_lda():
    """Reduce the polluted spambase features via LDA, then train and evaluate
    a Gaussian naive Bayes classifier, printing a confusion-matrix summary
    and the total run time (Python 2 syntax)."""
    train, train_target, test, test_target = load_polluted_spambase()

    print "Train data: %s, Train Label: %s" % (train.shape, train_target.shape)
    print "Test data: %s, Test Label: %s" % (test.shape, test_target.shape)

    start = timeit.default_timer()

    # NOTE(review): LDA yields at most n_classes-1 discriminant components,
    # so for binary labels the effective output dimension is 1 regardless of
    # n_components=100 -- confirm against the sklearn version in use.
    lda = LDA(n_components=100)
    train = lda.fit_transform(train, train_target)
    test = lda.transform(test)

    print lda
    print "Train data: %s, Train Label: %s" % (train.shape, train_target.shape)
    print "Test data: %s, Test Label: %s" % (test.shape, test_target.shape)

    cf = GaussianNaiveBayes()
    cf.fit(train, train_target)
    raw_predicts = cf.predict(test)
    predict_class = cf.predict_class(raw_predicts)

    cm = confusion_matrix(test_target, predict_class)
    print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
    er, acc, fpr, tpr = confusion_matrix_analysis(cm)
    print "Error rate: %f, accuracy: %f, FPR: %f, TPR: %f" % (er, acc, fpr, tpr)

    stop = timeit.default_timer()
    print "Total Run Time: %s secs" % (stop - start)
def pca_lda(X_train,X_test,y_train,y_test):
    """Project X_train onto 500 principal-component loadings (raw dot product
    with the components -- note: no mean-centering, unlike pca.transform),
    fit LDA on the projection and return its training accuracy.

    X_test / y_test are accepted but unused.
    """
    pca = PCA(n_components=500)
    pca.fit(X_train)
    projected = np.dot(X_train, pca.components_.T)
    lda = LDA()
    lda.fit(projected, y_train)
    return lda.score(projected, y_train, sample_weight=None)
Beispiel #15
0
def train_lda():
    """Fit an LDA classifier on the full dataset, keeping the class
    covariance estimate, and return the fitted model."""
    from sklearn.lda import LDA

    data, classes = get_data_and_classes()
    model = LDA()
    model.fit(data, classes, store_covariance=True)
    return model
Beispiel #16
0
def read_subpop_data(one_hot=True, fake_data=False, test_size=0.2, undersample=False):
    """Load the labeled/unlabeled RL datasets, print a baseline LDA accuracy,
    and package the splits into a DataSets container.

    :param one_hot: one-hot encode the labels when True.
    :param fake_data: unused; kept for interface compatibility.
    :param test_size: fraction of the labeled data held out for testing.
    :param undersample: rebalance the training split with UnderSampler.
    :return: DataSets instance with .test and .train datasets attached.
    """
    labeled_dic = convert_txt_to_npy(LABELED_RL_PATH)
    unlabeled_dic = convert_txt_to_npy(UNLABELED_RL_PATH, labeled=False)
    X_train, X_test, y_train, y_test = split_train_test(labeled_dic, test_size=test_size)

    class DataSets(object):
        pass

    if undersample:
        from unbalanced_dataset import UnderSampler
        US = UnderSampler(verbose=True)
        X_train, y_train = US.fit_transform(X_train, y_train)

    # Baseline: how well plain LDA separates the classes on this split.
    lda = LDA()
    lda.fit(X_train, y_train)
    score = metrics.accuracy_score(lda.predict(X_test), y_test)
    print("Baseline LDA: %f " % score)

    if one_hot:
        y_train = convert_to_one_hot(y_train)
        y_test = convert_to_one_hot(y_test)

    # Fix: the container was previously instantiated twice; the first
    # instance was dead code and has been removed.
    data_sets = DataSets()
    data_sets.test = DataSet(X_test, y_test)
    data_sets.train = SemiDataSet(unlabeled_dic['data'], X_train, y_train)

    return data_sets
def ldapredict(trainData,testData,trainOuts,testOuts):
	"""Fit LDA on the training pairs, print the fitted model and its test
	predictions, then print the resulting accuracy (1 - error rate)."""
	model = LDA()
	print(model.fit(trainData, trainOuts))
	guesses = model.predict(testData)
	print(guesses)
	misses, error = sup.crunchTestResults(guesses, testOuts, .5)
	print(1 - error)
Beispiel #18
0
def do_lda(x, y, folds):
    """K-fold cross-validate an LDA classifier and return the per-fold
    correct-prediction counts (Python 2 syntax).

    :param x: sequence of feature vectors.
    :param y: sequence of labels aligned with x.
    :param folds: number of cross-validation folds.
    :return: list of correct-classification counts, one per fold.
    """
    # Shuffle x and y with the same permutation.
    indexes = list(range(len(x)))
    shuffle(indexes)
    x = list(x[i] for i in indexes)
    y = list(y[i] for i in indexes)
    # NOTE(review): relies on Python 2 integer division; under Python 3 both
    # fold_size and fold_index would be floats and the fold test would break.
    fold_size = len(x) / folds
    corrects = []
    for fold in range(folds):
        test_x = []
        train_x = []
        test_y = []
        train_y = []
        # Assign each example to the test set of exactly one fold.
        for i in range(len(x)):
            fold_index = i / fold_size
            if fold == fold_index:
                test_x.append(x[i])
                test_y.append(y[i])
            else:
                train_x.append(x[i])
                train_y.append(y[i])
        print 'Partitioned data into fold'
        test_x, train_x = remove_redundant_dimensions(test_x, train_x)
        print 'Removed redundant dimensions'
        lda = LDA()
        lda.fit(train_x, train_y)
        print 'Fit lda'
        predictions = lda.predict(test_x)
        correct = sum(1 for i in range(len(predictions)) if predictions[i] == test_y[i])
        print 'Did fold, correct:', correct
        corrects.append(correct)
    return corrects
Beispiel #19
0
 def lda(self, reducedArray=None):
     """Fit a 2-component LDA on either the supplied reduced feature array or
     the raw signal array against self.targetVals, storing the result in
     self.ldaMat.

     :param reducedArray: optional pre-reduced features; falls back to
         self.signalArray when omitted or empty. (Was a mutable default [],
         a Python anti-pattern; None preserves the original behavior.)
     """
     # Two components: enough to distinguish the target / non-target classes.
     lda = LDA(n_components=2)
     if reducedArray is not None and len(reducedArray) > 0:
         self.ldaMat = lda.fit(np.resize(reducedArray,(len(reducedArray),len(reducedArray[0]))), self.targetVals)
     else:
         self.ldaMat = lda.fit(np.resize(self.signalArray,(len(self.signalArray),len(self.signalArray[0]))), self.targetVals)
 def DLDA(self, trainLabel, featureData, testData):
     """Train an LDA classifier on (featureData, trainLabel) and return its
     predicted labels for testData."""
     model = LDA()
     model.fit(featureData, trainLabel)
     return model.predict(testData)
Beispiel #21
0
def main_lda():
	"""Fit an LDA classifier on the full dataset and return the plot of its
	decision boundary and predictions."""
	X, y = fh_lda()

	lda = LDA()
	# Fit once and reuse: the original refit the identical model inside the
	# plot call (lda.fit(X, y).predict(X)), doubling the work for the same
	# result.
	lda.fit(X, y)

	splot = plot_LDA(lda, X, y, lda.predict(X))
	return splot
Beispiel #22
0
def siLDA(X, y):
    """Scatter-plot the 2-component LDA projection of X, coloured by class."""
    projection = LDA(n_components=2).fit(X, y).transform(X)
    plt.figure()
    for colour, klass in zip('rgb', [0, 1]):
        plt.scatter(projection[y == klass, 0], projection[y == klass, 1], c=colour)
    plt.title('LDA')
    plt.show()
def lda(X,y):
    """Project X onto 3 LDA components and scatter-plot the first two,
    coloured by the three neuron classes."""
    reduced = LDA(n_components=3).fit(X, y).transform(X)

    plt.figure()
    specs = zip("gbr", [0, 1, 2], ['others', 'inhibitory', 'excitatory'])
    for colour, klass, target_name in specs:
        plt.scatter(reduced[y == klass, 0], reduced[y == klass, 1], c=colour, label=target_name)
        # Kept inside the loop (as in the original): the final calls win.
        plt.legend()
        plt.title('LDA')
def reduceDimensionLDA(mat, k):
	"""Reduce mat (class labels in its last column) to k LDA components and
	re-append the labels (Python 2 syntax).

	:param mat: 2-D array whose final column holds the class labels.
	:param k: requested number of discriminant components.
	:return: reduced data with the label column re-attached via addLabels.
	"""
	print mat.shape
	# Split the label column off the feature columns.
	labels = mat[:, -1]
	mat = mat[:, :-1]
	lda = LDA(n_components = k)
	data = lda.fit_transform(mat, labels)
	data = addLabels(data, labels)
	print data
	return data
 def classifyLDA(self) :
     """Fit a 2-component LDA on the training set and scatter-plot the
     projected training values per class along y=0 (Python 2 syntax)."""
     print self.train_dataset
     clf = LDA(n_components=2)
     vr_train = clf.fit(self.train_dataset, self.train_label).transform(self.train_dataset)
     print vr_train
     plt.figure()
     # One colour per class; all points drawn on the x-axis (y fixed at 0).
     for c, i in zip("br", [0, 1]):
         plt.scatter(vr_train[self.train_label == i], [0]*len(vr_train[self.train_label == i]), c=c)
     plt.show()
def curve_per_subject(subject, data_path, test_labels):
    """Fit per-minute LDA seizure-prediction probabilities for one subject,
    pick the ROC-optimal threshold on the training data, report test-set
    sensitivity/specificity at that threshold, and sweep thresholds
    0.00..0.99 (Python 2 syntax).

    :param subject: subject identifier used to load the data files.
    :param data_path: directory containing the train/test data.
    :param test_labels: ground-truth labels for the 10-minute test segments.
    :return: (threshold list, sensitivities, specificities) for the sweep.
    """
    d = load_train_data(data_path, subject)
    x, y_10m = d['x'], d['y']
    n_train_examples = x.shape[0]
    n_timesteps = x.shape[-1]
    print 'n_preictal', np.sum(y_10m)
    print 'n_inetrictal', np.sum(y_10m - 1)

    # reshape_data presumably flattens the 10-minute segments into
    # per-minute examples -- confirm against its definition.
    x, y = reshape_data(x, y_10m)
    data_scaler = StandardScaler()
    x = data_scaler.fit_transform(x)

    lda = LDA()
    lda.fit(x, y)

    # Average per-minute probabilities back into one score per segment.
    pred_1m = lda.predict_proba(x)[:, 1]
    pred_10m = np.reshape(pred_1m, (n_train_examples, n_timesteps))
    pred_10m = np.mean(pred_10m, axis=1)
    fpr, tpr, threshold = roc_curve(y_10m, pred_10m)
    # Threshold closest to the ROC ideal point (tpr=1, fpr=0).
    c = np.sqrt((1 - tpr) ** 2 + fpr ** 2)
    opt_threshold = threshold[np.where(c == np.min(c))[0]][-1]
    print opt_threshold

    # ------- TEST ---------------

    d = load_test_data(data_path, subject)
    x_test, id = d['x'], d['id']
    n_test_examples = x_test.shape[0]
    n_timesteps = x_test.shape[3]
    x_test = reshape_data(x_test)
    x_test = data_scaler.transform(x_test)

    pred_1m = lda.predict_proba(x_test)[:, 1]
    pred_10m = np.reshape(pred_1m, (n_test_examples, n_timesteps))
    pred_10m = np.mean(pred_10m, axis=1)

    # Sensitivity / specificity at the training-derived threshold.
    y_pred = np.zeros_like(test_labels)
    y_pred[np.where(pred_10m >= opt_threshold)] = 1
    cm = confusion_matrix(test_labels, y_pred)
    print print_cm(cm, labels=['interictal', 'preictal'])
    sn = 1.0 * cm[1, 1] / (cm[1, 1] + cm[1, 0])
    sp = 1.0 * cm[0, 0] / (cm[0, 0] + cm[0, 1])
    print sn, sp

    # Sweep thresholds to trace the full sensitivity/specificity curve.
    sn, sp = [], []
    t_list = np.arange(0.0, 1.0, 0.01)
    for t in t_list:
        y_pred = np.zeros_like(test_labels)
        y_pred[np.where(pred_10m >= t)] = 1
        cm = confusion_matrix(test_labels, y_pred)
        sn_t = 1.0 * cm[1, 1] / (cm[1, 1] + cm[1, 0])
        sp_t = 1.0 * cm[0, 0] / (cm[0, 0] + cm[0, 1])
        sn.append(sn_t)
        sp.append(sp_t)

    return t_list, sn, sp
def lda(data,labels,n,v_type):
	"""Split data/labels with split_data, fit LDA on the training half and
	return (accuracy, report, predictions, test labels, test data, model,
	confusion matrix, "LDA"). The `n` argument is accepted but unused."""
	train_data, train_labels, test_data, test_labels = split_data(data, labels, v_type)

	model = LDA()
	model.fit(np.array(train_data, dtype=np.float64), np.array(train_labels, dtype=np.float64))
	y_pred = model.predict(test_data)
	hits = sum(1 for i in range(len(y_pred)) if y_pred[i] == test_labels[i])
	pure_accuracy_rate = hits / float(len(test_labels))
	report = classification_report(y_pred, test_labels, target_names=rock_names)
	cm = confusion_matrix(test_labels, y_pred)
	return pure_accuracy_rate, report, y_pred, test_labels, test_data, model, cm, "LDA"
def lda(ds, n):
    '''
    Project ds.data onto its best discriminant dimensions.

    At most 2 components are meaningful in our binary case; sklearn
    silently ignores larger values of n.
    '''
    reducer = LDA(n_components=n)
    reducer.fit(ds.data, ds.target)
    return Dataset(reducer.transform(ds.data), ds.target)
Beispiel #29
0
def lda(df, samples, sample_labels, plot_name='lda_plot.png'):
    """Min-max normalise the chosen samples (transposed so samples are rows),
    fit a 2-component LDA against the encoded labels and plot the projection.

    `plot_name` is accepted for interface compatibility but not used here.
    """
    subset = df.copy().transpose().ix[samples]
    X = normalize_min_max(subset).values
    label_dict, y = encode_labels(sample_labels)
    X_lda = LDA(n_components=2).fit_transform(X, y)
    plot_scikit_lda(X_lda, y, label_dict, samples)
Beispiel #30
0
 def __call__(self, x, y, inputs, labels):
     """Score label y at point x in [-1, 1]: 2*P(y|x) - 1 from an LDA fit
     on (inputs, labels), or +/-1 when only one class is present."""
     classes = numpy.unique(labels)
     if len(classes) == 1:
         # Degenerate case: LDA cannot be fit on a single class.
         return 1 if y == classes[0] else -1
     lda = LDA().fit(inputs, labels)
     prob = lda.predict_proba([x])[0][lda.classes_.tolist().index(y)]
     return 2 * prob - 1
Beispiel #31
0
 def new_clf(self, classifier="Decision Tree"):
     """Return a freshly constructed estimator selected by display name.

     Raises KeyError for an unknown classifier name (as before).
     """
     factory = {
         "Decision Tree": DecisionTreeClassifier(max_depth=5),
         "Random Forest": RandomForestClassifier(max_depth=5,
                                                 n_estimators=10,
                                                 max_features=1),
         "AdaBoost": AdaBoostClassifier(),
         "Gaussian Naive Bayes": GaussianNB(),
         "Multinomial Naive Bayes": MultinomialNB(),
         "Bernoulli Naive Bayes": BernoulliNB(),
         "LDA": LDA(),
     }
     return factory[classifier]
Beispiel #32
0
def test_all_estimators():
    """Smoke-test every sklearn estimator: default construction (or wrapping
    an LDA for meta-estimators), cloning, and repr."""
    base_clf = LDA()

    for name, Estimator in all_estimators():
        # Some estimators cannot be sensibly default-constructed.
        if Estimator in dont_test:
            continue
        # Silence deprecation warnings while constructing.
        with warnings.catch_warnings(record=True) as w:
            if Estimator in meta_estimators:
                est = Estimator(base_clf)
            else:
                est = Estimator()
            # Must survive cloning...
            clone(est)
            # ...and produce a repr without raising.
            repr(est)
def checkeachClassfier(train_x, train_y, test_x, test_y):
    """Fit a fixed battery of classifiers and print MCC plus per-class and
    overall accuracies on both the training and test splits
    (Python 2 syntax).

    Failures for one model are caught so the remaining models still run.
    """
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(class_weight='auto'),
        SVC(gamma=2, C=1),
        DecisionTreeClassifier(max_depth=5),
        DecisionTreeClassifier(class_weight='auto'),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        RandomForestClassifier(class_weight='auto'),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA()
    ]

    # Display names aligned index-for-index with `classifiers`.
    classtitle = [
        "KNeighborsClassifier", "SVC", "SVC weighted", "SVC(gamma=2, C=1)",
        "DecisionTreeClassifier", "DecisionTreeClassifier weighted",
        "RandomForestClassifier", "RandomForestClassifier weighted",
        "AdaBoostClassifier", "GaussianNB", "LDA", "QDA"
    ]

    for i in range(len(classtitle)):
        try:
            ctitle = classtitle[i]
            clf = classifiers[i]
            clf.fit(train_x, train_y)
            train_pdt = clf.predict(train_x)
            MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
            print ctitle + ":"
            print "MCC, Acc_p , Acc_n, Acc_all(train): "
            print "%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n),
                                   str(Acc_all))
            test_pdt = clf.predict(test_x)
            MCC, Acc_p, Acc_n, Acc_all = get_Accs(test_y, test_pdt)
            print "MCC, Acc_p , Acc_n, Acc_all(test): "
            print "%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n),
                                   str(Acc_all))
        except:
            # NOTE(review): bare except hides the real error; at minimum the
            # exception should be logged before moving on.
            print ctitle + ": error"
        print
Beispiel #34
0
def get_classification_r2(ticker_data):
	"""Train several classifiers to predict the next close (coerced to a
	string category) from ticker features and return the best-scoring
	(name, score) pair on a chronological 80/20 split (Python 2 syntax).

	:param ticker_data: DataFrame with a 'close' column plus feature columns.
	:return: (model_name, accuracy) of the best model.
	"""

	data_len = len(ticker_data)
	split_line = int(data_len * 0.8)
	# Features exclude the close; the target is the NEXT row's close,
	# cast to a fixed-width byte string so this becomes classification.
	X = ticker_data.drop('close',1)[:-1]
	y = Series(ticker_data['close'].shift(-1).dropna(),dtype='|S6')

	X_train = X.ix[:split_line]
	X_test = X.ix[split_line:]
	y_train = y.ix[:split_line]
	y_test = y.ix[split_line:]


 	models = [("LR", LogisticRegression()),
		("LDA", LDA()),
		# ("QDA", QDA()),
		("LSVC", LinearSVC()),
		("RSVM", SVC(
			C=1000000.0, cache_size=200, class_weight=None,
		coef0=0.0, degree=3, gamma=0.0001, kernel='rbf',
		max_iter=-1, probability=False, random_state=None,
		shrinking=True, tol=0.001, verbose=False)
		),
		("RF", RandomForestClassifier(
			n_estimators=1000, criterion='gini',
		max_depth=None, min_samples_split=2,
		min_samples_leaf=1, max_features='auto',
		bootstrap=True, oob_score=False, n_jobs=1,
		random_state=None, verbose=0)
	)]

	# Keep whichever (name, score) scores highest on the test split.
	best = (0,0)
	for m in models:
		m[1].fit(X_train, y_train)
		pred = m[1].predict(X_test)
		name = m[0]
		score = m[1].score(X_test, y_test)
		if score > best[1]:
			best = (name,score)
	print 'the best cluster is:' , best
	return best
def main():
    """Compare several classifiers on the MAGIC dataset: print each model's
    test accuracy and plot every ROC curve on one figure (Python 2 syntax:
    print statements and dict.iteritems)."""

    (X, Y, Ynames) = load_magic_data()
    X = StandardScaler().fit_transform(X)
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.33,
                                                    random_state=None)
    # Shared regularisation strength for the two logistic regressions.
    C = 5.0

    classifiers = {
        'L1 logistic': LogisticRegression(C=C, penalty='l1'),
        'L2 logistic': LogisticRegression(C=C, penalty='l2'),
        'KNN': KNeighborsClassifier(n_neighbors=11),
        'NB': GaussianNB(),
        'RF5': RandomForestClassifier(n_estimators=5),
        'RF50': RandomForestClassifier(n_estimators=50),
        'AdaBoost': AdaBoostClassifier(),
        'LDA': LDA(),
        'QDA': QDA()
    }

    plt.figure(figsize=(8, 8))

    n_classifiers = len(classifiers)
    for index, (name, clf) in enumerate(classifiers.iteritems()):
        clf.fit(Xtrain, Ytrain)
        # ROC from the positive-class probability column.
        probs = clf.predict_proba(Xtest)
        fpr, tpr, thresholds = roc_curve(Ytest, probs[:, 1])
        roc_auc = auc(fpr, tpr)
        print 'For model', name, 'accuracy =', clf.score(Xtest, Ytest)

        plt.plot(fpr, tpr, label='%s (area = %0.2f)' % (name, roc_auc))

    # Chance diagonal for reference.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()
Beispiel #36
0
def train(args):
    """Train a face classifier on precomputed OpenFace embeddings and pickle
    the (LabelEncoder, classifier) pair to <workDir>/classifier.pkl
    (Python 2 syntax).

    :param args: namespace with workDir, classifier name, and ldaDim.
    """
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(args.workDir)
    labels = pd.read_csv(fname, header=None).as_matrix()[:, 1]
    labels = map(itemgetter(1), map(os.path.split,
                                    map(os.path.dirname,
                                        labels)))  # Get the directory.
    fname = "{}/reps.csv".format(args.workDir)
    embeddings = pd.read_csv(fname, header=None).as_matrix()
    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)
    nClasses = len(le.classes_)
    print("Training for {} classes.".format(nClasses))

    # NOTE(review): if args.classifier matches none of these names, `clf`
    # is never bound and the fit below raises NameError.
    if args.classifier == 'LinearSvm':
        clf = SVC(C=1, kernel='linear', probability=True)
    elif args.classifier == 'GMM':  #Doesn't work best
        clf = GMM(n_components=nClasses)

    #ref: http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
    elif args.classifier == 'RadialSvm':  #Radial Basis Function kernel
        clf = SVC(C=1000, kernel='rbf', probability=True,
                  gamma=0.05)  #works better with C = 1 and gamma = 2
    elif args.classifier == 'DecisionTree':  #Doesn't work best
        clf = DecisionTreeClassifier(max_depth=20)

    # Optionally prepend an LDA dimensionality-reduction step.
    if args.ldaDim > 0:
        clf_final = clf
        clf = Pipeline([('lda', LDA(n_components=args.ldaDim)),
                        ('clf', clf_final)])

    print "Embeddings: "
    print embeddings.shape
    print "\nlabelsNum: "
    print labelsNum[-1:][0] + 1

    clf.fit(embeddings, labelsNum)

    fName = "{}/classifier.pkl".format(args.workDir)
    print("Saving classifier to '{}'".format(fName))
    with open(fName, 'w') as f:
        pickle.dump((le, clf), f)
Beispiel #37
0
class LDAFeatures:
    """Wraps an LDA projection of naive pixel features: trained once from
    ground truth, then reused to transform later feature stacks
    (Python 2 syntax)."""

    def __init__(self, n_comp=3):
        # Fitted LDA; stays None until features() is called with ground truth.
        self.lda = None
        self.n_comp = n_comp

    def features(self, pixels, gt=None):
        """Return the LDA-projected feature stack for `pixels`, fitting the
        LDA from `gt` on first use.

        NOTE(review): the `gt == None` / `!= None` tests should be `is None`
        checks -- with array-valued gt, `==` compares element-wise.
        """
        #grab feature stack
        fullFeatures = naive_features(pixels)
        print fullFeatures.shape

        #if the LDA from ground truth exists already, transform new features
        if gt == None and self.lda != None:
            print self.lda
            return self.lda.transform(fullFeatures)
        assert gt != None

        #otherwise, train LDA
        self.lda = LDA(n_components=self.n_comp).fit(fullFeatures, gt)
        print self.lda
        return self.lda.transform(fullFeatures)
Beispiel #38
0
	def __init__(self):
		"""Set up the default LDA model, Myo streaming modes, feature
		extraction settings, and file-path placeholders."""

		# Classifier used for gesture recognition.
		self.model = LDA()
		# Raw EMG and IMU streams from the Myo armband.
		self.emgMode = myo.EmgMode.RAW
		self.imuMode = myo.ImuMode.RAW
		# Feature names plus window length (LW) and increment (LI).
		self.features = {'Names': ['MAV', 'RMS','ZC'],'LW': 150,'LI': 1}
		# Train / validation / test split fractions.
		self.trainPercent = [0.7, 0.2, 0.1]
		self.dataMode = ["emg" , "imu"]

		# Paths are filled in later; None until data is recorded/loaded.
		self.emgDataFilePath 	= None
		self.accDataFilePath 	= None
		self.gyroDataFilePath 	= None
		self.quatDataFilePath 	= None
		self.modelFilePath = None

		self.fileDict = {}
		self.modelFileDict = {}
		self.filePathName = "wfyFilePath"
		self.modelName = "wfyModel"

		# Number of completed runs so far.
		self.runCount = 0
def main():
    """Evaluate five candidate classifiers with test_method, in order:
    SVM pipeline, random forest, LDA, KNN, gradient boosting."""
    candidates = [
        (Pipeline([('scaler', StandardScaler()), ('svm', SVC(C=4.0))]), 'SVM'),
        (RandomForestClassifier(n_estimators=60, n_jobs=-1), 'randforest'),
        (LDA(), 'LDA'),
        (neighbors.KNeighborsClassifier(n_neighbors=15), 'KNN'),
        (GradientBoostingClassifier(n_estimators=100), 'gradboosting'),
    ]

    for clf, name in candidates:
        test_method(clf, name)
def evaluate(data, targets):
    """Benchmark a fixed set of classifiers on (data, targets) with the
    ModelComparison harness: 10 folds, 3 CV repeats (Python 2 syntax)."""
    print "Creating models..."
    models = []
    models.append(LinearSVC())
    models.append(SVC(kernel='rbf'))
    models.append(GaussianNB())
    models.append(LDA())
    models.append(QDA())
    models.append(LogisticRegression())
    models.append(KNeighborsRegressor())
    models.append(
        RandomForestClassifier(n_estimators=100,
                               criterion="entropy",
                               random_state=1234,
                               n_jobs=-1))

    # Several of these models require dense input.
    if sparse.issparse(data):
        data = data.toarray()

    mc = ModelComparison(data, targets, folds=10, numCV=3, models=models)
    mc.evaluate()
Beispiel #41
0
def modelSelection(X, y, KFold, test_fraction):
    """Cross-validate a fixed battery of classifiers and print each model's
    name followed by its cross_validate() result (Python 2 syntax).

    :param X: feature matrix.
    :param y: labels.
    :param KFold: fold count forwarded to cross_validate.
    :param test_fraction: test-split fraction forwarded to cross_validate.
    """
    model_arr = [
        LDA(),
        DecisionTreeClassifier(max_depth=5),
        KNeighborsClassifier(3),
        SVC(gamma=2, C=1),
        SVC(kernel="linear", C=0.025),
        GaussianNB(),
        LogisticRegression()
    ]

    # Printable names aligned index-for-index with model_arr.
    model_names = [
        "LDA()", "DecisionTreeClassifier(max_depth=5)",
        "KNeighborsClassifier(3)", "SVC(gamma=2, C=1)",
        "SVC(kernel=linear, C=0.025)", "GaussianNB()", "LogisticRegression()"
    ]

    for i, m in enumerate(model_arr):
        result = cross_validate(X, y, m, KFold, test_fraction)
        print model_names[i]
        print result
def classification_learning_curves(X, y, title=''):
    """Compute and plot learning curves for four classifiers on (X, y).

    :param X: feature matrix.
    :param y: target labels.
    :param title: suffix appended to the plot title.
    """
    # Candidate estimators keyed by legend label, in plotting order.
    # (Replaces the previous eval()-based name dispatch, which was fragile
    # and an unsafe idiom.)
    estimators = [
        ('svc', SVC()),                                            # Support Vector classification
        ('lda', LDA()),                                            # Linear Discriminant Analysis
        ('rdgc', RidgeClassifierCV(alphas=np.logspace(-3, 3, 7))), # Ridge classification
        ('logit', LogisticRegression(penalty='l2', random_state=42)),
    ]

    # train size fractions
    train_size = np.linspace(.2, .9, 8)

    # Compute learning curves: mean accuracy +/- std over 4 CV folds.
    for name, estimator in estimators:
        ts, _, scores = learning_curve(estimator, X, y,
                                       train_sizes=train_size, cv=4)
        bl = plt.plot(train_size, np.mean(scores, axis=1))
        plt.fill_between(train_size,
                         np.mean(scores, axis=1) - np.std(scores, axis=1),
                         np.mean(scores, axis=1) + np.std(scores, axis=1),
                         facecolor=bl[0].get_c(),
                         alpha=0.1)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.legend([name for name, _ in estimators], loc='best')
    plt.xlabel('Train size', fontsize=16)
    plt.ylabel('Accuracy', fontsize=16)
    plt.ylim([.3, .9])
    plt.grid()
    plt.title('Classification ' + title, fontsize=16)
Beispiel #43
0
def evaluate(data, targets):
    prior = numpy.bincount(y.astype(int)) / float(len(targets))
    models = [
        LDA(priors=prior),
        SVC(probability=True, class_weight="auto", kernel="linear"),
        LogisticRegression(class_weight="auto"),
        GaussianNB(),
        KNeighborsClassifier(),
        QDA(priors=prior),
        RandomForestClassifier(n_estimators=100,
                               criterion="entropy",
                               n_jobs=-1,
                               random_state=123456),
        SVC(probability=True, class_weight="auto")
    ]

    model_names = [
        "LDA", "Linear SVM", "Logistic Regression", "Naive Bayes", "k-NN",
        "QDA", "Random Forest", "SVM w/ RBF"
    ]

    # evaluate using ModelEvaluation class
    mevaluator = model_evaluation.TenFoldCrossValidation(
        data=data,
        targets=targets,
        models=models,
        model_names=model_names,
        scale=True)

    start = time.time()
    caa_eval = mevaluator.evaluate(metrics.class_averaged_accuracy_score)
    for key, value in caa_eval.iteritems():
        model_str = key.split("(")[0]
        print model_str, (str(numpy.around(numpy.mean(value), decimals=3)) +
                          " (" +
                          str(numpy.around(numpy.std(value), decimals=3)) +
                          ")")
    mevaluator.evaluate_roc()
    print "Overall running time:", (time.time() - start)
def gridSearch(X_train, y_train, angle):
    """Grid-search LDA dimensionality reduction + KNN classification.

    Fits a Pipeline of LDA (eigen solver) followed by KNeighborsClassifier,
    searching over the number of LDA components and KNN neighbours, pickles
    the fitted search to ``model<angle>.p`` and returns it.
    """
    component_grid = [5, 10, 20, 50, 75, 100]
    neighbor_grid = [2, 3, 4, 5, 6, 7, 9, 11, 15, 20, 25, 30, 40]

    estimators = [('reduce_dim', LDA(solver='eigen')),
                  ('knn', KNeighborsClassifier())]
    clf = Pipeline(estimators)

    # BUG FIX: the two grids were swapped in the original — n_components was
    # searched over the neighbour values and n_neighbors over the component
    # values.
    params = {
        'reduce_dim__n_components': component_grid,
        'knn__n_neighbors': neighbor_grid
    }

    grid_search = GridSearchCV(clf, param_grid=params)
    grid_search.fit(X_train, y_train)

    # Context manager ensures the file handle is closed (the original leaked it).
    with open("model" + str(angle) + ".p", "wb") as model_file:
        pickle.dump(grid_search, model_file)
    return grid_search
Beispiel #45
0
 def __init__(self, training_path, testing_path):
     """Record dataset locations, initialise result holders and build the
     bank of candidate classifiers keyed by short name."""
     self.training_path = training_path
     self.testing_path = testing_path
     # Feature matrices are computed later.
     self.training_features = None
     self.testing_features = None
     self.training_image_list = []
     self.testing_image_list = []
     self.training_labels = []
     self.testing_labels = []
     self.predicted_testing_labels = []
     self.class_map = {}
     # One class per sub-directory of ./data/training.
     self.n_classes = len(os.listdir(os.path.join('.', 'data', 'training')))
     self.classifiers = {
         'knn': KNeighborsClassifier(3),
         'svm_linear': SVC(kernel="linear", C=0.025),
         'svm': SVC(gamma=2, C=1),
         'tree': DecisionTreeClassifier(max_depth=5),
         'rf': RandomForestClassifier(max_depth=5,
                                      n_estimators=10,
                                      max_features=1),
         'adb': AdaBoostClassifier(),
         'gauss': GaussianNB(),
         'lda': LDA(),
         'qda': QDA(),
         'ann': neuralNetwork(self.n_classes),
     }
     self.get_training_image_list()
     self.get_testing_image_list()
Beispiel #46
0
def random_methods(data_train1, target_train1):
    """Fit a battery of classifiers on the given training data and print
    each one's score and classification report on the test set.

    NOTE(review): ``data_test`` and ``target_test`` are not parameters —
    this function relies on them existing at module level; confirm against
    the caller.
    """
    rng = np.random.RandomState(96235)
    names = ["SGD", "Nearest Neighbors", "ensembel","Decision Tree","Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
    classifiers = [
        SGDClassifier(loss='hinge', penalty='l2', alpha=0.0005, n_iter=200, random_state=42, n_jobs=-1, average=True),
        KNeighborsClassifier(10),
        AdaBoostRegressor(DecisionTreeRegressor(max_depth=25),n_estimators=300, random_state=rng),
        DecisionTreeClassifier(max_depth=11),
        RandomForestClassifier(max_depth=21, n_estimators=21, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA()
    ]
    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        print("Fitting " + name + "...")
        clf.fit(data_train1, target_train1)
        print("Predicting...")
        score = clf.score(data_test, target_test)
        print(score)
        # FIX: reuse the already-fitted model; the original refit the
        # classifier a second time just to predict.
        predicted_test = clf.predict(data_test)
        print(metrics.classification_report(target_test, predicted_test))
Beispiel #47
0
def LDA10Fold(X, y):
    """10-fold CV of an LDA projection followed by a 2-NN classifier.

    Returns (mean accuracy, std accuracy) across the folds.
    """
    accuracies = []
    folds = KFold(X.shape[0], n_folds=10, shuffle=True)
    for train_idx, test_idx in folds:
        y_train = y[train_idx]
        y_test = y[test_idx]
        # Project both splits into the LDA discriminant space.
        projector = LDA()
        projector.fit(X[train_idx], y_train)
        train_proj = projector.transform(X[train_idx])
        test_proj = projector.transform(X[test_idx])
        # Classify in the reduced space with 2 nearest neighbours.
        knn = neighbors.KNeighborsClassifier(n_neighbors=2)
        knn.fit(train_proj, y_train)
        predictions = knn.predict(test_proj)
        accuracies.append(np.sum(predictions == y_test) * 1.0 / y_test.shape[0])
    return np.mean(accuracies), np.std(accuracies)
Beispiel #48
0
def create_confidence_matrix_one_vs_one(user_matix, file_number=0):
    """Build a CSP+LDA pipeline, cross-validate it per subject pair and fit
    a one-vs-one classifier over all subjects' data for ``file_number``.

    Returns (score_matrix, fitted one-vs-one classifier).
    """
    from sklearn.pipeline import Pipeline
    import OneVsOneImproved
    lda = LDA()
    csp = CSP(n_components=2, transform_into='csp_space')
    clf = Pipeline([('CSP', csp), ("LDA", lda)])  #TODO NOTE THIS CHANGE
    classifier = OneVsOneImproved.OneVsOneClassifier(clf)
    labels = []
    data = []
    for subject_id, subject in enumerate(user_matix):
        # Skip subjects that have no recording with this file number.
        if len(subject) <= file_number:
            continue
        subject_labels = [subject_id for _ in range(len(subject[file_number]))]
        if not len(labels):
            # First subject seen: start the accumulators.
            labels = subject_labels
            data = subject[file_number]
        else:
            labels = np.concatenate((labels, np.asarray(subject_labels)))
            data = np.concatenate((data, np.asarray(subject[file_number])))
    score_matrix = fit_classifier_cross_val_score(data, labels, clf)
    classifier.fit(np.asarray(data), np.asarray(labels))
    return score_matrix, classifier
Beispiel #49
0
    def execute(self, i, j):
        """Fit dimensionality reduction + model on the whole training set,
        pickle both, then run k-fold CV and return the mean quadratic-
        weighted kappa.

        Parameters: ``i`` and ``j`` are only used to build the pickle file
        names. Requires ``self.x_train``, ``self.y_train``,
        ``self.stat_class`` (a model class with train/predict),
        ``self.k_cross``, ``self.range_min``/``self.range_max`` and
        ``self.values`` to be set by the enclosing class.

        NOTE(review): ``self.values`` is appended to but never reset here,
        so repeated calls accumulate across invocations — confirm intended.
        """
        x_train = self.x_train
        y_train = self.y_train
        dim_red = LDA()
        x_train = dim_red.fit_transform(x_train, y_train)
        with open('dumped_dim_red_' + str(i) + '.pkl', 'wb') as fid:
            cPickle.dump(dim_red, fid)

        stat_obj = self.stat_class()  # instantiate the configured model class
        stat_obj.train(x_train, y_train)
        with open('dumped_' + str(j) + '_' + str(i) + '.pkl', 'wb') as fid:
            cPickle.dump(stat_obj, fid)

        kf = KFold(len(self.x_train), n_folds=self.k_cross)
        own_kappa = []
        for train_idx, test_idx in kf:
            # Fresh LDA + model per fold, fitted only on that fold's train split.
            x_train, x_test = self.x_train[train_idx], self.x_train[test_idx]
            y_train, y_test = self.y_train[train_idx], self.y_train[test_idx]
            dim_red = LDA()
            x_train = dim_red.fit_transform(x_train, y_train)
            x_test = dim_red.transform(x_test)

            stat_obj = self.stat_class()  # instantiate the configured model class
            stat_obj.train(x_train, y_train)

            # Round each prediction and clamp into [range_min, range_max].
            # NOTE: the loop variable ``i`` below shadows the ``i`` parameter.
            y_pred = [0 for i in xrange(len(y_test))]
            for i in range(len(x_test)):
                val = int(np.round(stat_obj.predict(x_test[i])))
                if val > self.range_max: val = self.range_max
                if val < self.range_min: val = self.range_min
                y_pred[i] = [val]
            y_pred = np.matrix(y_pred)
            cohen_kappa_rating = own_wp.quadratic_weighted_kappa(
                y_test, y_pred, self.range_min, self.range_max)
            self.values.append(cohen_kappa_rating)
        return sum(self.values) / self.k_cross
Beispiel #50
0
def acc_image(training_data, tarining_label, test_data, test_label):
    """Compare shrinkage LDA against plain LDA as the feature count grows,
    averaging test accuracy over repeated fits, then plot both curves."""
    n_train = training_data.shape[0]  # samples for training
    n_test = test_data.shape[0]  # samples for testing
    n_averages = 50  # repetitions per feature count
    n_features_max = 5  # largest feature subset tried
    step = 1  # increment of the feature count

    acc_clf1 = []
    acc_clf2 = []
    n_features_range = range(1, n_features_max + 1, step)
    for n_features in n_features_range:
        total_shrunk = 0
        total_plain = 0
        for _ in range(n_averages):
            # Fit both variants on the first n_features columns.
            X, y = training_data[:, 0:n_features], tarining_label
            clf1 = LDA(solver='lsqr', shrinkage='auto').fit(X, y)
            clf2 = LDA(solver='lsqr', shrinkage=None).fit(X, y)

            # Score on the matching slice of the test data.
            X, y = test_data[:, 0:n_features], test_label
            total_shrunk += clf1.score(X, y)
            total_plain += clf2.score(X, y)

        acc_clf1.append(total_shrunk / n_averages)
        acc_clf2.append(total_plain / n_averages)

    features_samples_ratio = np.array(n_features_range) / n_train

    plt.plot(features_samples_ratio,
             acc_clf1,
             linewidth=2,
             label="LDA with shrinkage",
             color='r')
    plt.plot(features_samples_ratio,
             acc_clf2,
             linewidth=2,
             label="LDA",
             color='g')

    plt.xlabel('n_features / n_samples')
    plt.ylabel('Classification accuracy')

    plt.legend(loc=1, prop={'size': 12})
    plt.suptitle('LDA vs. shrinkage LDA (1 discriminative feature)')
    plt.show()
def lda(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans lda")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    #lda=LDA(n_components=2)
    lda=LDA()
    lda.fit(X,y)
    X_LDA = lda.transform(X)
    y_pred = lda.predict(X)
    print "#########################################################################################################\n"
    print "Linear Discriminant Analysis Accuracy "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"LDA_metrics.txt"
    file = open(results, "w")
    file.write("Linear Discriminant Analaysis estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "LDA"
    save = Output + "LDA_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], c=y)
    fig.colorbar(im);
    save_lda = Output + "LDA_plot.png"
    plt.savefig(save_lda)
    plt.close()
    lvltrace.lvltrace("LVLSortie dans lda")
Beispiel #52
0
def test():
    class1 = np.mat([
        (2.9500, 6.6300),
        (2.5300, 7.7900),
        (3.5700, 5.6500),
        (3.1600, 5.4700),
    ])
    class2 = np.mat([
        (2.5800, 4.4600),
        (2.1600, 6.2200),
        (3.2700, 3.5200),
    ])
    test = (2.81, 5.46)
    lda = myLDA(class1, class2)
    print lda.predict(test)

    lda = LDA()
    lda.fit(np.concatenate((class1, class2)),
            np.concatenate((np.zeros((3, 1)), np.ones((4, 1))), axis=0),
            store_covariance=True)
    print lda.predict(test)
        temp = []

#PLS Dimension Reduction: project features onto n_components latent variables
pls2 = PLSRegression(n_components=n_components)
pls2.fit(features, MA_label)
XScore = pls2.transform(features)
# XScore = features

#LDA Classification: 5-fold CV accuracy of an LDA on the PLS scores
kf = KFold(n_splits=5)
kf.get_n_splits(XScore)
mean_acc = 0
for train_index, test_index in kf.split(XScore):
    X_train, X_test = XScore[train_index], XScore[test_index]
    y_train, y_test = MA_label[train_index], MA_label[test_index]
    clf = LDA()
    clf.fit(X_train, y_train)
    Y_predict = clf.predict(X_test)
    for i in range(len(Y_predict)):
        print("Y_Predict {} - Y_Test {}".format(Y_predict[i], y_test[i]))
    acc = accuracy_score(Y_predict, y_test)
    print("Accuracy = {}".format(acc))
    mean_acc = mean_acc + acc

# Average over the 5 folds, expressed as a percentage.
mean_acc = (mean_acc / 5) * 100
print("Accuracy is {}".format(mean_acc))

# Append (feature count, accuracy) to the results CSV.
# BUG FIX: the original ended with ``csvfile.close()`` — a NameError (the
# variable is ``csvFile``) and redundant since the ``with`` block closes it.
with open("Results/MLL.csv", 'a') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow([numFeatures, mean_acc])
def classifier_comparison(X, y):
    """Compare a suite of classifiers on the same data.

    Args:
        X: training samples, size=[n_samples, n_features]
        y: class labels, size=[n_samples, 1]
    Returns:
        None
    """
    from sklearn import grid_search
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.lda import LDA
    from sklearn.qda import QDA
    from sklearn.linear_model import LogisticRegression
    import scipy

    # RBF SVM tuned by exhaustive grid search
    exhaustive_parameters = {
        'kernel': ['rbf'],
        'C': [1, 10, 100, 1000],
        'gamma': [1e-3, 1e-4]
    }
    clf_SVC_exhaustive = grid_search.GridSearchCV(SVC(), exhaustive_parameters)

    # RBF SVM tuned by randomized parameter search
    randomized_parameter = {
        'kernel': ['rbf'],
        'C': scipy.stats.expon(scale=100),
        'gamma': scipy.stats.expon(scale=.1)
    }
    clf_SVC_randomized = grid_search.RandomizedSearchCV(
        SVC(), randomized_parameter)

    # Each classifier paired with the label logged for it.
    labelled_models = [
        ("Linear SVM", SVC(kernel="linear", C=0.025)),
        ("RBF SVM", SVC(gamma=2, C=1)),
        ("RBF SVM with Grid Search", clf_SVC_exhaustive),
        ("RBF SVM with Random Grid Search", clf_SVC_randomized),
        ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
        ("Random Forest",
         RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
        ("AdaBoost", AdaBoostClassifier()),
        ("Naive Bayes", GaussianNB()),
        ("LDA", LDA()),
        ("QDA", QDA()),
    ]

    for label, model in labelled_models:
        logger.info('Use %s:' % (label))
        train_classifier(model, X, y)

    # Logistic regression with L1 and L2 penalties over a range of C values
    for C in [0.01, 0.1, 1, 10, 100, 1000, 10000]:
        logger.info('Use LR with l1 penalty, C=%s:' % (C))
        clf = LogisticRegression(C=C, penalty='l1', tol=0.01)
        clf = train_classifier(clf, X, y)
        logger.debug('coef matrix: %s' % (clf.coef_))

        logger.info('Use LR with l2 penalty, C=%s:' % (C))
        clf = LogisticRegression(C=C, penalty='l2', tol=0.01)
        clf = train_classifier(clf, X, y)
        logger.debug('coef matrix: %s' % (clf.coef_))
Beispiel #55
0
########################### Instantiate Classifiers ############################


# Bank of candidate classifiers keyed by short name; entries are selected
# from this dict further down (e.g. by the job list).
classifiers = {
    "Logistic":LogisticRegression(),
    "NearestNeighbors":KNeighborsClassifier(100),
    "LinearSVM":SVC(kernel="linear", C=0.025),
    "RBFSVM":SVC(gamma=2, C=1),
    "DecisionTree":DecisionTreeClassifier(max_depth=32),
    "RandomForest":RandomForestClassifier(max_depth=None, n_estimators=200, max_features="auto",random_state=0,n_jobs=4),
    "RandomForest2":RandomForestClassifier(max_depth=8, n_estimators=200, max_features="auto",random_state=0,n_jobs=4),
    "AdaBoost":AdaBoostClassifier(n_estimators=500,random_state=0),
    "GradientBoost":GradientBoostingClassifier(n_estimators=500, learning_rate=1.0,max_depth=None, random_state=0),
    "NaiveBayes":GaussianNB(),
    "LDA":LDA(),
    "QDA":QDA()
    }

joblist=[
        (classifiers["RandomForest"],'RandomForest_signal','model_var_list_signal.csv'), # suffix and varlist
        #(classifiers["RandomForest"],'RandomForest_tmxpayer','model_var_list_tmxpayer.csv'),
        #(classifiers["RandomForest"],'RandomForest_tmxpayee','model_var_list_tmxpayee.csv'),
        #(classifiers["RandomForest"],'RandomForest_signal_tmxpayer','model_var_list_signal_tmxpayer.csv'),
        #(classifiers["RandomForest"],'RandomForest_signal_tmxpayee','model_var_list_signal_tmxpayee.csv'),
        #(classifiers["RandomForest"],'RandomForest_tmxpayer_tmxpayee','model_var_list_tmxpayer_tmxpayee.csv'),
        #(classifiers["RandomForest"],'RandomForest_tmxpayerpayee_comp','model_var_list_tmxpayerpayee_comp.csv'),
        #(classifiers["RandomForest"],'RandomForest_signal_tmxboth','model_var_list_signal_tmxboth.csv'),
        #(classifiers["RandomForest"],'RandomForest_signal_tmxboth_120','model_var_list_signal_tmxboth_120.csv'),
        #(classifiers["RandomForest"],'RandomForest_signal_tmxboth_800','model_var_list_signal_tmxboth_800.csv'),
        #(classifiers["RandomForest2"],'RandomForest_signal_tmxboth_RF2','model_var_list_signal_tmxboth.csv'),
Beispiel #56
0
def train(args):
    """Train a face-embedding classifier and pickle it with its label encoder.

    Reads ``labels.csv`` / ``reps.csv`` from ``args.workDir``, picks the
    classifier named by ``args.classifier``, optionally prepends an LDA
    dimensionality-reduction stage (``args.ldaDim > 0``) and writes
    ``(LabelEncoder, classifier)`` to ``<workDir>/classifier.pkl``.
    """
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(args.workDir)
    labels = pd.read_csv(fname, header=None).as_matrix()[:, 1]
    labels = map(itemgetter(1), map(os.path.split,
                                    map(os.path.dirname,
                                        labels)))  # Get the directory.
    fname = "{}/reps.csv".format(args.workDir)
    embeddings = pd.read_csv(fname, header=None).as_matrix()
    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)
    nClasses = len(le.classes_)
    print("Training for {} classes.".format(nClasses))

    if args.classifier == 'LinearSvm':
        clf = SVC(C=1, kernel='linear', probability=True)
    elif args.classifier == 'GridSearchSvm':
        print("""
        Warning: In our experiences, using a grid search over SVM hyper-parameters only
        gives marginally better performance than a linear SVM with C=1 and
        is not worth the extra computations of performing a grid search.
        """)
        param_grid = [{
            'C': [1, 10, 100, 1000],
            'kernel': ['linear']
        }, {
            'C': [1, 10, 100, 1000],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf']
        }]
        clf = GridSearchCV(SVC(C=1, probability=True), param_grid, cv=5)
    elif args.classifier == 'GMM':  # Doesn't work best
        clf = GMM(n_components=nClasses)

    # ref:
    # http://scikit-learn.org/stable/auto_examples/classification/plot_args.classifier_comparison.html#example-classification-plot-args.classifier-comparison-py
    elif args.classifier == 'RadialSvm':  # Radial Basis Function kernel
        # works better with C = 1 and gamma = 2
        clf = SVC(C=1, kernel='rbf', probability=True, gamma=2)
    elif args.classifier == 'DecisionTree':  # Doesn't work best
        clf = DecisionTreeClassifier(max_depth=20)
    elif args.classifier == 'GaussianNB':
        clf = GaussianNB()

    # ref: https://jessesw.com/Deep-Learning/
    elif args.classifier == 'DBN':
        from nolearn.dbn import DBN
        clf = DBN(
            [embeddings.shape[1], 500, labelsNum[-1:][0] + 1
             ],  # i/p nodes, hidden nodes, o/p nodes
            learn_rates=0.3,
            # Smaller steps mean a possibly more accurate result, but the
            # training will take longer
            learn_rate_decays=0.9,
            # a factor the initial learning rate will be multiplied by
            # after each iteration of the training
            epochs=300,  # no of iternation
            # dropouts = 0.25, # Express the percentage of nodes that
            # will be randomly dropped as a decimal.
            verbose=1)
    else:
        # FIX: previously an unknown name fell through and hit a NameError
        # on ``clf`` below; fail with an explicit message instead.
        raise ValueError("Unknown classifier: {}".format(args.classifier))

    if args.ldaDim > 0:
        clf_final = clf
        # BUG FIX: the original referenced the bare name ``ldaDim`` here
        # (NameError); the value lives on ``args``.
        clf = Pipeline([('lda', LDA(n_components=args.ldaDim)),
                        ('clf', clf_final)])

    clf.fit(embeddings, labelsNum)

    fName = "{}/classifier.pkl".format(args.workDir)
    print("Saving classifier to '{}'".format(fName))
    with open(fName, 'w') as f:
        pickle.dump((le, clf), f)
Beispiel #57
0
def get_gridsearch_classifier(clf_name):
    """Return ``(clf, param_grid, is_sparse)`` for the named method.

    Parameters
    ----------
    clf_name : str
        One of the method names handled by the elif chain below.

    Returns
    -------
    clf : estimator or Pipeline
        Unfitted classifier configured for grid search.
    param_grid : dict
        Hyper-parameter grid to search over for this method.
    is_sparse : bool
        True when the method produces sparse weights, so nnz can be read
        after fitting.

    NOTE(review): an unrecognized ``clf_name`` falls through the chain and
    raises NameError on ``param_grid`` at the return — confirm callers only
    pass known names.
    """
    #%% "is_sparse" flag
    """note: i included this so method like Lasso, so I can obtain nnz after
             model fit.  for feature selection methods like ttest, i set this
             as False since here I know nnz prehand."""

    is_sparse = False  # <- set this to True if method is sparse
    #%% ***START HUGE ELIF STATEMENT ****
    if clf_name == 'sklLogregL1':
        """ L1 logistic regression """
        np.random.seed(
            0)  # <- needed to ensure replicability in LogReg fit model
        from sklearn.linear_model import LogisticRegression
        clf = LogisticRegression(penalty='l1', random_state=0)
        param_grid = {'C': 2.**np.arange(-8, 18, 2)}
        is_sparse = True
    elif clf_name == 'sklLinSvm':
        """ Linear SVM (hinge loss) """
        from sklearn.svm import LinearSVC
        clf = LinearSVC(loss='hinge')
        param_grid = {'C': 2.**np.arange(-18, 2, 2)}
#        param_grid = {'C':2.**np.arange(-18,-2,1)}
#        param_grid = {'C':2.**np.arange(-1,0,1)}
    elif clf_name == 'fistaLogregElasticnet':
        from tak.core import get_incmat_conn86
        from tak.machine_learning.fista import LogRegElasticNetFista
        clf = LogRegElasticNetFista(tol=1e-3)
        param_grid = {
            'alpha': 10.**np.arange(-8, 5, 1),
            'l1_ratio': np.arange(0.1, 1.1, 0.1)
        }
    elif clf_name == 'fistaLogregGraphnet':
        """ GraphNet Fista (logistic loss) """
        from tak.core import get_incmat_conn86
        from tak.machine_learning.fista import LogRegGraphNetFista
        C, _ = get_incmat_conn86(radius=50)
        clf = LogRegGraphNetFista(tol=1e-3, C=C)
        param_grid = {
            'alpha': 10.**np.arange(-8, 5, 1),
            'l1_ratio': np.arange(0.1, 1.1, 0.1)
        }
    elif clf_name == 'fistaLogregGraphnet80':
        """ GraphNet Fista (logistic loss)with radius of 80 """
        from tak.core import get_incmat_conn86
        from tak.machine_learning.fista import LogRegGraphNetFista
        C, _ = get_incmat_conn86(radius=80)
        clf = LogRegGraphNetFista(tol=1e-3, C=C)
        param_grid = {
            'alpha': 10.**np.arange(-8, 5, 1),
            'l1_ratio': np.arange(0.1, 1.1, 0.1)
        }
    elif clf_name == 'rbfSvm':
        """ RBF Kernel SVM """
        from tak.ml import PrecomputedRBFSVM
        clf = PrecomputedRBFSVM()
        param_grid = {
            'C': 10.**np.arange(-1, 10, 2),
            'gamma': 10.**np.arange(-12, 1, 1)
        }
    elif clf_name == 'ttestRbfSvm':
        # ttest + RBF Kernel SVM using Pipeline (3 parameters)
        from tak.ml import ttest_for_fs, PrecomputedRBFSVM
        from sklearn.feature_selection import SelectKBest
        from sklearn.pipeline import Pipeline
        ttest_fs = SelectKBest(score_func=ttest_for_fs)

        # setup pipeline of ttest_filter + RBF_SVM
        clf = Pipeline([('ttest', ttest_fs), ('svm', PrecomputedRBFSVM())])

        # estimator parameters in a pipeline accessed as: <estimator>__<estimator>
        param_grid = {
            'ttest__k': (2**np.arange(4, 11, 1)).astype(int),
            'svm__C':
            10.**np.arange(-8, 11,
                           2),  #^^^^^must be int, or scikit will complain
            'svm__gamma': 10.**np.arange(-16, -5, 2)
        }

    elif clf_name == 'ttestLinSvm':
        # ttest + liblinear Pipeline (2 parameters)
        from tak.ml import ttest_for_fs
        from sklearn.svm import LinearSVC
        from sklearn.feature_selection import SelectKBest
        from sklearn.pipeline import Pipeline
        ttest_fs = SelectKBest(score_func=ttest_for_fs)
        clf = Pipeline([
            ('ttest', ttest_fs),
            ('liblin', LinearSVC(loss='hinge')),
        ])
        param_grid = {
            'ttest__k': (2**np.arange(
                4, 11.5,
                0.5)).astype(int),  # must be int, or scikit will complain
            'liblin__C': 2.**np.arange(-18, 1, 1),
        }
    elif clf_name == 'enetLogRegSpams':
        # Elastic-net Logistic Regression using my wrapper on SpamsToolbox (2 parameters)
        from tak.ml import SpamFistaFlatWrapper
        clf = SpamFistaFlatWrapper(loss='logistic',
                                   regul='elastic-net',
                                   max_it=400,
                                   tol=1e-3)
        param_grid = {
            'lambda1': 2.**np.arange(-16, 1,
                                     2),  # L1 penalty (lambda1 in SPAMS)
            'lambda2': 2.**np.arange(-16, 11, 3),
        }  # L2 penalty (lambda2 in SPAMS)
        is_sparse = True
    elif clf_name == 'enetLogRegGlmNet':
        # Elastic-net Logistic Regression via the GlmNet wrapper (2 parameters)
        from tak.ml import LogisticGlmNet
        clf = LogisticGlmNet()
        param_grid = {
            'alpha': np.arange(0.1, 1.1, 0.1),
            'lambdas': 2.**np.arange(1, -14, -1)
        }
        is_sparse = True
    #%% === PCA stuffs...no interpretability, but see if accuracy improves ====
    elif clf_name == 'PcaLda':
        """ PCA + LDA (1 parameter) """
        from sklearn.lda import LDA
        from sklearn.decomposition import PCA
        from sklearn.pipeline import Pipeline

        clf = Pipeline([
            ('PCA', PCA()),
            ('LDA', LDA(solver='lsqr', shrinkage='auto')),
        ])
        param_grid = {'PCA__n_components': np.array([2, 5, 10, 20, 40])}
    #=== PCA + LINSVM ===
    elif clf_name == 'PcaLinSvm':
        from sklearn.svm import LinearSVC
        from sklearn.decomposition import PCA
        from sklearn.pipeline import Pipeline

        clf = Pipeline([
            ('PCA', PCA()),
            ('SVM', LinearSVC(loss='hinge')),
        ])
        param_grid = {
            'PCA__n_components': np.array([5, 10, 20, 40, 100]),
            'SVM__C': 2.**np.arange(-14, 3, 2)
        }
    #%% PCA + RBFSVM
    elif clf_name == 'PcaRbfSvm':
        from tak.ml import PrecomputedRBFSVM
        from sklearn.decomposition import PCA
        from sklearn.pipeline import Pipeline

        clf = Pipeline([
            ('PCA', PCA()),
            ('SVM', PrecomputedRBFSVM()),
        ])
        param_grid = {
            'PCA__n_components': np.array([5, 10, 20, 40, 100]),
            'SVM__C':
            10.**np.arange(-1, 10,
                           2),  #^^^^^must be int, or scikit will complain
            'SVM__gamma': 2.**np.arange(-18, -8, 2)
        }
    #%% ttest + LDA (for interpretability, I guess)
    elif clf_name == 'ttestLDA':
        from tak.ml import ttest_for_fs
        from sklearn.lda import LDA
        from sklearn.pipeline import Pipeline
        from sklearn.feature_selection import SelectKBest

        ttest_fs = SelectKBest(score_func=ttest_for_fs)

        clf = Pipeline([
            ('ttest', ttest_fs),
            ('LDA', LDA(solver='lsqr', shrinkage='auto')),
        ])
        param_grid = {'ttest__k': (2**np.arange(4, 9.5, 0.5)).astype(int)}

    #%%______huge elif above is complete.  return ______
    return clf, param_grid, is_sparse
Beispiel #58
0
n_samples = len(digits)

# Last column holds the target digit; everything else is a feature.
data = digits[:, :-1]
target = digits[:, -1]

param_grid = {
    'pca1__n_components': [16],
    'poly__degree': [2],
    'pca2__n_components': [0.8],
    'lda__n_components': [9],
    'lr__penalty': ['l2'],
    'lr__C': [0.1, 1]
}

# PCA -> polynomial expansion -> PCA -> LDA -> logistic regression.
steps = [('pca1', PCA()), ('poly', PolynomialFeatures()), ('pca2', PCA()),
         ('lda', LDA()), ('lr', LogisticRegression())]

pipeline = Pipeline(steps)

grid_search = GridSearchCV(pipeline, param_grid, n_jobs=-1, verbose=1, cv=2)

# Two thirds for training (integer division under Python 2).
n_trains = n_samples / 3 * 2

# Fit the grid search on the first two thirds of the digits.
grid_search.fit(data[:n_trains], target[:n_trains])

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
print best_parameters
Beispiel #59
0
def lda(X_train, y_train, X_test, y_test):
    # Linear Discriminant Analysis (LDA) additionally maximizes the spread between classes
    # NOTE(review): this snippet appears truncated — the estimator is created
    # but never fitted or evaluated, and none of the parameters are used.
    lda = LDA()
Beispiel #60
0
def speakerDiarization(fileName, numOfSpeakers, mtSize = 2.0, mtStep=0.2, stWin=0.05, LDAdim = 35, PLOT = False):
	'''
	ARGUMENTS:
		- fileName:		the name of the WAV file to be analyzed
		- numOfSpeakers	the number of speakers (clusters) in the recording (<=0 for unknown)
		- mtSize (opt)	mid-term window size
		- mtStep (opt)	mid-term window step
		- stWin  (opt)	short-term window size
		- LDAdim (opt)	LDA dimension (0 for no LDA)
		- PLOT	 (opt)	0 for not plotting the results 1 for plottingy
	'''
	[Fs, x] = audioBasicIO.readAudioFile(fileName)
	x = audioBasicIO.stereo2mono(x);
	Duration = len(x) / Fs

	[Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel("data/knnSpeakerAll")
	[Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel("data/knnSpeakerFemaleMale")

	[MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs*stWin), round(Fs*stWin*0.5));

	MidTermFeatures2 = numpy.zeros( (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1] ) )

	for i in range(MidTermFeatures.shape[1]):
		curF1 = (MidTermFeatures[:,i] - MEAN1)  / STD1
		curF2 = (MidTermFeatures[:,i] - MEAN2)  / STD2
		[Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
		[Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
		MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
		MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0]+len(classNames1), i] = P1 + 0.0001;
		MidTermFeatures2[MidTermFeatures.shape[0]+len(classNames1)::, i] = P2 + 0.0001;
	
	MidTermFeatures = MidTermFeatures2	# TODO	
	# SELECT FEATURES:
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20]; 																											# SET 0A
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100]; 																									# SET 0B
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 0C
	
	iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53]; 																	# SET 1A
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; 															# SET 1B
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 1C
	
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53]; 			# SET 2A		
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; 	# SET 2B
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 2C
	
	#iFeaturesSelect = range(100);																									# SET 3	
	#MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010  
	
	MidTermFeatures = MidTermFeatures[iFeaturesSelect,:]		
	
	(MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
	MidTermFeaturesNorm = MidTermFeaturesNorm[0].T	
	numOfWindows = MidTermFeatures.shape[1]

	# remove outliers:
	DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
	MDistancesAll = numpy.mean(DistancesAll)
	iNonOutLiers = numpy.nonzero(DistancesAll < 1.2*MDistancesAll)[0]
	
	# TODO: Combine energy threshold for outlier removal:
	#EnergyMin = numpy.min(MidTermFeatures[1,:])
	#EnergyMean = numpy.mean(MidTermFeatures[1,:])
	#Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
	#iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
	#print iNonOutLiers

	perOutLier = (100.0*(numOfWindows-iNonOutLiers.shape[0])) / numOfWindows	
	MidTermFeaturesNormOr = MidTermFeaturesNorm
	MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]
	
	# LDA dimensionality reduction:
	if LDAdim > 0:
		#[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));		
		# extract mid-term features with minimum step:
		mtWinRatio  = int(round(mtSize  / stWin));
		mtStepRatio = int(round(stWin / stWin));
		mtFeaturesToReduce = []			
		numOfFeatures = len(ShortTermFeatures)
		numOfStatistics = 2;			
		#for i in range(numOfStatistics * numOfFeatures + 1):
		for i in range(numOfStatistics * numOfFeatures):
			mtFeaturesToReduce.append([])

		for i in range(numOfFeatures):		# for each of the short-term features:
			curPos = 0
			N = len(ShortTermFeatures[i])
			while (curPos<N):
				N1 = curPos
				N2 = curPos + mtWinRatio
				if N2 > N:
					N2 = N
				curStFeatures = ShortTermFeatures[i][N1:N2]
				mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
				mtFeaturesToReduce[i+numOfFeatures].append(numpy.std(curStFeatures))				
				curPos += mtStepRatio		
		mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
				
		mtFeaturesToReduce2 = numpy.zeros( (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1] ) )
		for i in range(mtFeaturesToReduce.shape[1]):
			curF1 = (mtFeaturesToReduce[:,i] - MEAN1)  / STD1
			curF2 = (mtFeaturesToReduce[:,i] - MEAN2)  / STD2
			[Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
			[Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
			mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0]+len(classNames1), i] = P1 + 0.0001;
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]+len(classNames1)::, i] = P2 + 0.0001;
		mtFeaturesToReduce = mtFeaturesToReduce2		
		mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect,:]		
		#mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
		(mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])	
		mtFeaturesToReduce = mtFeaturesToReduce[0].T
		#DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
		#MDistancesAll = numpy.mean(DistancesAll)
		#iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
		#mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
		Labels = numpy.zeros((mtFeaturesToReduce.shape[1],));
		LDAstep = 1.0
		LDAstepRatio = LDAstep / stWin
		#print LDAstep, LDAstepRatio
		for i in range(Labels.shape[0]):
			Labels[i] = int(i*stWin/LDAstepRatio);		
		clf = LDA(n_components=LDAdim)
		clf.fit(mtFeaturesToReduce.T, Labels, tol=0.000001)
		MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

	if numOfSpeakers<=0:
		sRange = range(2,10)
	else:
		sRange = [numOfSpeakers]
	clsAll = []; silAll = []; centersAll = []
	
	for iSpeakers in sRange:
		cls, means, steps = mlpy.kmeans(MidTermFeaturesNorm.T, k=iSpeakers, plus=True)		# perform k-means clustering
		
		#YDist =   distance.pdist(MidTermFeaturesNorm.T, metric='euclidean')
		#print distance.squareform(YDist).shape
		#hc = mlpy.HCluster()
		#hc.linkage(YDist)
		#cls = hc.cut(14.5)
		#print cls

		# Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
		clsAll.append(cls)
		centersAll.append(means)
		silA = []; silB = []
		for c in range(iSpeakers):								# for each speaker (i.e. for each extracted cluster)
			clusterPerCent = numpy.nonzero(cls==c)[0].shape[0] / float(len(cls))
			if clusterPerCent < 0.020:
				silA.append(0.0)
				silB.append(0.0)
			else:
				MidTermFeaturesNormTemp = MidTermFeaturesNorm[:,cls==c]			# get subset of feature vectors
				Yt = distance.pdist(MidTermFeaturesNormTemp.T)				# compute average distance between samples that belong to the cluster (a values)
				silA.append(numpy.mean(Yt)*clusterPerCent)
				silBs = []
				for c2 in range(iSpeakers):						# compute distances from samples of other clusters
					if c2!=c:
						clusterPerCent2 = numpy.nonzero(cls==c2)[0].shape[0] / float(len(cls))
						MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:,cls==c2]
						Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
						silBs.append(numpy.mean(Yt)*(clusterPerCent+clusterPerCent2)/2.0)
				silBs = numpy.array(silBs)							
				silB.append(min(silBs))							# ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
		silA = numpy.array(silA); 
		silB = numpy.array(silB); 
		sil = []
		for c in range(iSpeakers):								# for each cluster (speaker)
			sil.append( ( silB[c] - silA[c]) / (max(silB[c],  silA[c])+0.00001)  )		# compute silhouette

		silAll.append(numpy.mean(sil))								# keep the AVERAGE SILLOUETTE

	#silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
	imax = numpy.argmax(silAll)									# position of the maximum sillouette value
	nSpeakersFinal = sRange[imax]									# optimal number of clusters

	# generate the final set of cluster labels
	# (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window)
	cls = numpy.zeros((numOfWindows,))
	for i in range(numOfWindows):
		j = numpy.argmin(numpy.abs(i-iNonOutLiers))		
		cls[i] = clsAll[imax][j]
		
	# Post-process method 1: hmm smoothing
	for i in range(1):
		startprob, transmat, means, cov = trainHMM_computeStatistics(MidTermFeaturesNormOr, cls)
		hmm = sklearn.hmm.GaussianHMM(startprob.shape[0], "diag", startprob, transmat)			# hmm training
		hmm.means_ = means; hmm.covars_ = cov
		cls = hmm.predict(MidTermFeaturesNormOr.T)					
	
	# Post-process method 2: median filtering:
	cls = scipy.signal.medfilt(cls, 13)
	cls = scipy.signal.medfilt(cls, 11)

	sil = silAll[imax]										# final sillouette
	classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)];


	# load ground-truth if available
	gtFile = fileName.replace('.wav', '.segments');							# open for annotated file
	if os.path.isfile(gtFile):									# if groundturh exists
		[segStart, segEnd, segLabels] = readSegmentGT(gtFile)					# read GT data
		flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)			# convert to flags

	if PLOT:
		fig = plt.figure()	
		if numOfSpeakers>0:
			ax1 = fig.add_subplot(111)
		else:
			ax1 = fig.add_subplot(211)
		ax1.set_yticks(numpy.array(range(len(classNames))))
		ax1.axis((0, Duration, -1, len(classNames)))
		ax1.set_yticklabels(classNames)
		ax1.plot(numpy.array(range(len(cls)))*mtStep+mtStep/2.0, cls)

	if os.path.isfile(gtFile):
		if PLOT:
			ax1.plot(numpy.array(range(len(flagsGT)))*mtStep+mtStep/2.0, flagsGT, 'r')
		purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(cls, flagsGT)
		print "{0:.1f}\t{1:.1f}".format(100*purityClusterMean, 100*puritySpeakerMean)
		if PLOT:
			plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100*purityClusterMean, 100*puritySpeakerMean) )
	if PLOT:
		plt.xlabel("time (seconds)")
		#print sRange, silAll	
		if numOfSpeakers<=0:
			plt.subplot(212)
			plt.plot(sRange, silAll)
			plt.xlabel("number of clusters");
			plt.ylabel("average clustering's sillouette");
		plt.show()