Example no. 1
def run_test(**kwargs):
    b = fetch_sw_orl()
    tic = time.time()

    # split the data into training and test sets
    X_train, X_test, y_train, y_true = train_test_split(b.data,
                                                        b.target,
                                                        test_size=0.2,
                                                        stratify=b.target)

    hog_train = []
    for img_array in X_train:
        fd, _ = hog(img_array.reshape(b.shape),
                    orientations=8,
                    pixels_per_cell=(PPC, PPC),
                    cells_per_block=(1, 1),
                    visualize=True,
                    multichannel=False)
        hog_train.append(fd)

    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2)
    clf.fit(hog_train, y_train)
    tok = time.time()

    hog_test = []
    for img_array in X_test:
        fd, _ = hog(img_array.reshape(b.shape),
                    orientations=8,
                    pixels_per_cell=(PPC, PPC),
                    cells_per_block=(1, 1),
                    visualize=True,
                    multichannel=False)
        hog_test.append(fd)
    y_pred = clf.predict(hog_test)
    return tok - tic, accuracy_score(y_true, y_pred)
Example no. 3
def test_ecoc_fit_predict():
    # A classifier which implements decision_function.
    ecoc = OutputCodeClassifier(LinearSVC(), code_size=2)
    ecoc.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(ecoc.estimators_), n_classes * 2)

    # A classifier which implements predict_proba.
    ecoc = OutputCodeClassifier(MultinomialNB(), code_size=2)
    ecoc.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(ecoc.estimators_), n_classes * 2)
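A quick aside from the editor (not part of the original test): both assertions rely on OutputCodeClassifier training int(n_classes * code_size) binary estimators. A minimal self-contained sketch, assuming only scikit-learn and its bundled iris data:

from sklearn.datasets import load_iris
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC

iris = load_iris()
n_classes = len(set(iris.target))  # 3 classes
ecoc = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=0)
ecoc.fit(iris.data, iris.target)
assert len(ecoc.estimators_) == n_classes * 2  # int(3 * 2) == 6 binary estimators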
Example no. 4
 def train(corpus):
     time = datetime.datetime.now()
     logging.info('Static Embedding Oracle')
     Y, X_dic = EmbeddingOracle.parseCorpus(corpus.trainingSents,
                                            EmbeddingOracle)
     vec = DictVectorizer()
     X = vec.fit_transform(X_dic)
     clf = OutputCodeClassifier(LinearSVC(random_state=0),
                                code_size=2,
                                random_state=0)
     clf.fit(X, Y)
     logging.info('Training Time: ' +
                  str(int((datetime.datetime.now() - time).seconds / 60.)))
     return clf, vec
Example no. 5
def test_ecoc_float_y():
    # Test that the OCC errors on float targets
    X = iris.data
    y = iris.data[:, 0]

    ovo = OutputCodeClassifier(LinearSVC())
    msg = "Unknown label type"
    with pytest.raises(ValueError, match=msg):
        ovo.fit(X, y)

    ovo = OutputCodeClassifier(LinearSVC(), code_size=-1)
    msg = "code_size should be greater than 0, got -1"
    with pytest.raises(ValueError, match=msg):
        ovo.fit(X, y)
Example no. 6
def train_svm(labels,array, num_folds, num_jobs, params = 2):
	#obtain the best parameter settings for an svm outputcode classifier
	bestParameters = dict()
	if len(labels) > 2:
		print("outputcodeclassifier")
		#param_grid = {'estimator__C': [0.001, 0.005, 0.01,0.1, 0.5, 1,2.5, 5, 10,15,25, 50,75, 100, 500, 1000],
		#	'estimator__kernel': ['linear','rbf','poly'], 
		#	'estimator__gamma': [0.0005,0.001, 0.002, 0.008,0.016, 0.032,0.064, 0.128,0.256, 0.512, 1.024, 2.048],
		#	'estimator__degree': [1,2,3,4]}
		param_grid = {'estimator__C': [0.001, 0.005],
			'estimator__kernel': ['linear','rbf'], 
			'estimator__gamma': [0.0005,0.001],
			'estimator__degree': [1]}
		model = OutputCodeClassifier(svm.SVC(probability=True))
	else:
		print("svc model")
		param_grid = {'C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
			'kernel': ['linear','rbf','poly'], 
			'gamma': [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048],
			'degree': [1,2,3,4]}
		model = svm.SVC(probability=True)
	
	paramsearch = RandomizedSearchCV(model, param_grid, cv=num_folds, verbose=2,n_iter = params,n_jobs=num_jobs) 
	print("Grid search...")
	paramsearch.fit(array,numpy.asarray(labels))
	print("Prediction...")
	parameters = paramsearch.best_params_
	
	for parameter in parameters.keys():
		print(parameter + ": " + str(parameters[parameter]) + "\n")
	print("best score: " + str(paramsearch.best_score_) + "\n\n")
	
	#for score in paramsearch.grid_scores_:
	#	print 'mean score:',score.mean_validation_score
	#	print 'list scores:',score.cv_validation_scores
	#train an svm outputcode classifier using the best parameters
	
	if len(labels) > 2:
		test = svm.SVC(probability=True, C=parameters['estimator__C'],
			kernel=parameters['estimator__kernel'],gamma=parameters['estimator__gamma'],
			degree=parameters['estimator__degree'])
		out_test = OutputCodeClassifier(test,n_jobs=1)
		out_test.fit(array,labels)
		return out_test
	else:
		test = svm.SVC(probability=True, C=parameters['C'],
			kernel=parameters['kernel'],gamma=parameters['gamma'],
			degree=parameters['degree'])
		test.fit(array,labels)
		return test
Example no. 7
def ECOC():

    print('Applying multiclass method ERROR CORRECTING OUTPUT CODES')
    for indice in lista_datasets:

        print('Dataset: ' + str(indice))
        dataset = arff.loadarff('./datasets/' + str(indice))
        df = pd.DataFrame(dataset[0])
        input = df.iloc[:, df.columns != 'class']
        output = pd.factorize(df['class'])[0]
        X_train, X_test, Y_train, Y_test = train_test_split(input, output, test_size=0.25)

        clf = OutputCodeClassifier(KNeighborsClassifier(n_neighbors=5), code_size=2, random_state=0)
        clf.fit(X_train, Y_train)

        print('Percentage correctly classified, ERROR CORRECTING OUTPUT CODES')
        print(clf.score(X_test, Y_test))
    print('--------------------------')
Example no. 8
class SVMClf:
    def __init__(self, labels, data, load=False, save=False):
        if load:
            with open(clfData, 'rb') as f:
                self.classifier = pickle.load(f)
            with open(vecData, 'rb') as f:
                self.vectorizer = pickle.load(f)
            return
        self.vectorizer = DictVectorizer()
        featureVec = self.vectorizer.fit_transform(data)
        self.classifier = OutputCodeClassifier(LinearSVC(random_state=0),
                                               code_size=2,
                                               random_state=0)
        # self.classifier = LogisticRegression( solver='sag')
        self.classifier.fit(featureVec, labels)
        if save:
            with open(clfData, 'wb') as output:
                pickle.dump(self.classifier, output, pickle.HIGHEST_PROTOCOL)
            with open(vecData, 'wb') as output:
                pickle.dump(self.vectorizer, output, pickle.HIGHEST_PROTOCOL)
Example no. 9
 def evaluateOutputCode(X, Y, printReport=False):
     time = datetime.datetime.now()
     X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                         Y,
                                                         test_size=0.2,
                                                         random_state=42)
     clf = OutputCodeClassifier(LinearSVC(random_state=0),
                                code_size=2,
                                random_state=0)
     clf.fit(X_train, Y_train)
     if printReport:
         print('Training time: ' + str(datetime.datetime.now() - time))
         print('Evaluation result: OutputCode: ' +
               str(clf.score(X_test, Y_test)))
     Y_pred = clf.predict(X_test)
     if printReport:
         print('0: ' + str((Y_pred == 0).sum()))
         print('1: ' + str((Y_pred == 1).sum()))
         print('2: ' + str((Y_pred == 2).sum()))
     return [clf.score(X_test, Y_test), (Y_pred == 1).sum(), clf]
Example no. 10
def clasificar_ECOC(X, y, df, trainInputs, trainOutputs, testInputs, testOutputs, graphname):
	print("\n[" + str(graphname) + "]")
	kernelRBF = 1.0 * RBF(1.0)  # defined but not used below
	clf = OutputCodeClassifier(estimator=DecisionTreeClassifier())
	clf = clf.fit(trainInputs, trainOutputs)
	precisionTrain = clf.score(trainInputs, trainOutputs)
	precisionTest = clf.score(testInputs, testOutputs)
	print("\tCCR train = %.2f%% | CCR test = %.2f%%" % (precisionTrain*100, precisionTest*100))
	prediccion_test = clf.predict(testInputs)
	print(prediccion_test)
	print(testOutputs)
	return precisionTest
Example no. 11
def test_ecoc_delegate_sparse_base_estimator():
    # Non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/17218
    X, y = iris.data, iris.target
    X_sp = sp.csc_matrix(X)

    # create an estimator that does not support sparse input
    base_estimator = CheckingClassifier(
        check_X=check_array,
        check_X_params={
            "ensure_2d": True,
            "accept_sparse": False
        },
    )
    ecoc = OutputCodeClassifier(base_estimator, random_state=0)

    with pytest.raises(TypeError, match="A sparse matrix was passed"):
        ecoc.fit(X_sp, y)

    ecoc.fit(X, y)
    with pytest.raises(TypeError, match="A sparse matrix was passed"):
        ecoc.predict(X_sp)

    # smoke test to check when sparse input should be supported
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0))
    ecoc.fit(X_sp, y).predict(X_sp)
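    # iris has 3 classes and the default code_size is 1.5, so int(3 * 1.5) == 4 estimators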
    assert len(ecoc.estimators_) == 4
Example no. 12
def OutputCodeClassifier(data, label, pred_data, pred_last):
    '''
    0.76473194506
    Number of mislabeled points out of a total 841 points : 211
    0.749108204518
    The input data needs to be normalized
    '''
    data = np.array(data)
    pred_data = np.array(pred_data)
    label = np.array(label)
    pred_last = np.array(pred_last)
    from sklearn.multiclass import OutputCodeClassifier
    from sklearn.svm import LinearSVC
    clf = OutputCodeClassifier(LinearSVC(random_state=0),
                               code_size=2,
                               random_state=0)
    clf.fit(data, label)

    print(clf.score(data, label))
    pred_result = clf.predict(pred_data)
    print("Number of mislabeled points out of a total %d points : %d" %
          (pred_data.shape[0], (pred_last != pred_result).sum()))
    print(clf.score(pred_data, pred_last))
    return pred_result
Example no. 13
# -*- coding: utf-8 -*-
"""
Created on Fri May 24 20:38:46 2019

@author: pathouli
"""

import pandas as pd
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC

the_path = 'C:/Users/pathouli/myStuff/academia/torhea/projects/groupC/'

allstate_data = pd.read_csv(the_path + 'train.csv', sep=",")
clf = OutputCodeClassifier(LinearSVC(random_state=0),
                           code_size=2,
                           random_state=0)

label_cols = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
X_cols = allstate_data.columns.difference(label_cols)
X = allstate_data[X_cols][1:10000]
y = allstate_data[label_cols][1:10000]  #small sample to test
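# editor's note: OutputCodeClassifier expects a 1-D class vector; fitting the
# 7-column y below raises an error, so the labels would first need to be
# combined into a single class column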

clf.fit(X, y).predict(X)

# https://www.kaggle.com/c/allstate-purchase-prediction-challenge/data
Example no. 14
"""

from sklearn import datasets
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# load the data
iris = datasets.load_iris()
x, y = iris.data, iris.target
print('number of samples: %d, number of features: %d' % x.shape)

# create the model object
# code_size sets how many sub-models to use; the actual sub-model count = code_size * number_of_classes
clf = OutputCodeClassifier(LinearSVC(random_state=0),
                           code_size=30,
                           random_state=0)
# fit the model
clf.fit(x, y)

# print the predictions
print(clf.predict(x))
print('accuracy %.3f' % accuracy_score(y, clf.predict(x)))

# print model attributes
k = 1
for item in clf.estimators_:
    print('model #%d' % k)
    print(item)
    k += 1
print(clf.classes_)
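One editorial aside on the code_size comment above (my sketch, not part of the original): the fitted code book makes the arithmetic visible, with one codeword row per class and one column per sub-model:

# continuing from the clf fitted above (3 iris classes, code_size=30)
print(clf.code_book_.shape)   # (3, 90): one row per class, one column per estimator
print(len(clf.estimators_))   # 90 == int(3 * 30)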
Example no. 15
def main():

    filenameLB = 'mfcc_lb.csv'
    allsongcat = pickle.load(open('mfcc_fv.p', 'rb'))
    hcdf = pickle.load(open('hcdf_fv.p', 'rb'))

    with open('mfcc_lb.csv') as f:
        reader = csv.reader(f)
        for row in reader:
            labels = row

    # select training and test sets
    '''
    TEidx = np.array(random.sample(range(0,1000), 100))
    
    training = []
    test = []
    
    trainingLB = []
    testLB = []

    # make numpy arrays
    for i in range(1000):
        if i in TEidx:
            test.append(featureDict[i])
            testLB.append(int(labels[i]))
        else:
            training.append(featureDict[i])
            trainingLB.append(int(labels[i]))
        
    # fit with classifier and predict
    X = np.array(training)
    Y = np.array(trainingLB)

    '''
    l = [allsongcat, hcdf]
    all_feats = combineFeatures(l)
    feats_shuf = []
    labels_shuf = []
    index_shuf = list(range(len(labels)))
    shuffle(index_shuf)
    for i in index_shuf:
        feats_shuf.append(all_feats[i])
        labels_shuf.append(labels[i])

    X = np.array(feats_shuf)
    Y = np.array(labels_shuf)

    kf = KFold(1000, n_folds=10)
    #rf = RandomForestClassifier(n_estimators=50, max_features = 'log2')
    sgd = SGDClassifier(loss="hinge", penalty="l2")
    #svc = svm.SVC(kernel='linear')
    dtree = DecisionTreeClassifier(max_depth=3)
    lsvc = LinearSVC(random_state=0)
    cla = OutputCodeClassifier(sgd, code_size=128, random_state=0)

    cm_all = np.zeros((10, 10), dtype=int)

    cb = np.zeros((10, 20))
    losses = []

    with open('ECOC_sgd_error.csv', 'w') as f1:
        wrtest = csv.writer(f1,
                            quoting=csv.QUOTE_NONNUMERIC,
                            lineterminator='\n')
        scores = 0.0
        for train, test in kf:
            X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[
                test]
            cla.fit(X_train, y_train)
            predictions = cla.predict(X_test)
            loss = zero_one_loss(predictions, y_test)
            losses.append(loss)
            scores += loss
            # print y_test
            # print predictions

            cb = cla.code_book_

            np.savetxt('codebook.csv', cb, delimiter=',')

            # Compute confusion matrix
            cm = confusion_matrix(
                y_test,
                predictions,
                labels=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
            np.set_printoptions(precision=2)
            #print(cm_all)
            cm_all = np.add(cm_all, cm)

        # make ECOC coding matrix 0-1 binary
        cb[cb <= 0] = 0
        wrtest.writerow(losses)
    print(cb)

    print(scores / 10)
Example no. 16
        row = []
        for (top_left, bottom_right) in rectangles:
            row += get_haar_features(im, top_left, bottom_right)

        train_ecoc_table[ind] = row

    test_ecoc_table = np.zeros(shape=(np.shape(test_images)[0], 200))
    for ind, im in enumerate(test_images):
        row = []
        for (top_left, bottom_right) in rectangles:
            row += get_haar_features(im, top_left, bottom_right)

        test_ecoc_table[ind] = row

    clf = OutputCodeClassifier(AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200), code_size=5, random_state=0)
    clf.fit(train_ecoc_table, labels)

    train_pred = np.array(clf.predict(train_ecoc_table))
    print "Digits Training Accuracy: %f" % (np.sum(train_pred == np.array(labels)).astype(np.float)/np.shape(train_pred)[0])

    test_pred = np.array(clf.predict(test_ecoc_table))
    print "Digits Testing Accuracy: %f" % (np.sum(test_pred == np.array(test_labels)).astype(np.float)/np.shape(test_pred)[0])

    # ecoc_table = []
    # for im in images:
    #
    #     im_preprocess = np.matrix([[np.sum(im[:i,:j]) for i in range(1, 29)] for j in range(1, 29)])
    #
    #     def get_black_rectangle(top_left, bottom_right):
    #         x1, y1 = top_left
    #         x2, y2 = bottom_right
Example no. 17
"""
@author: 凯风
"""

from sklearn.datasets import load_iris
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

iris_data = load_iris()
X, Y = iris_data.data, iris_data.target
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.3)
'''
    Error-correcting output codes
        An approach that differs from one-vs-one and one-vs-rest:
        each class is represented as a point in Euclidean space.
        For a textbook explanation, see Zhou Zhihua's "Machine Learning".
'''

clf = LinearSVC(random_state=0)
ovrc = OutputCodeClassifier(clf, code_size=1.5, random_state=None, n_jobs=1)
ovrc.fit(trainX, trainY)
ovrc.predict(testX)
ovrc.code_book_
'''
    estimator           the base estimator
    code_size           multiplier on the number of classes that sets the code length
    random_state        random seed used to generate the code book
    n_jobs              number of CPU jobs
'''
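To make the "Euclidean space" remark in the docstring concrete, here is a short decoding sketch (an editor's addition that mirrors, as far as I know, what scikit-learn's predict does internally): each fitted sub-estimator emits one continuous vote per sample, and the predicted class is the code_book_ row nearest in Euclidean distance:

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

# one continuous vote per sub-estimator, shape (n_samples, n_estimators)
votes = np.array([est.decision_function(testX) for est in ovrc.estimators_]).T
# the nearest codeword row wins; this should agree with ovrc.predict(testX)
manual = ovrc.classes_[euclidean_distances(votes, ovrc.code_book_).argmin(axis=1)]
print((manual == ovrc.predict(testX)).all())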
Example no. 18
x_train, x_test, y_train, y_test = train_test_split(breast.data,
                                                    breast.target,
                                                    test_size=0.2)

# creating a classification
clf_1 = MLPClassifier(solver='lbfgs',
                      alpha=1e-5,
                      hidden_layer_sizes=(5, 2),
                      random_state=42)
clf_2 = OutputCodeClassifier(LinearSVC(random_state=0),
                             code_size=2,
                             random_state=42)

# train the classifier with training data
clf_1.fit(x_train, y_train)
clf_2.fit(x_train, y_train)

# find y_pred prediction best on x_test data
y_pred_1 = clf_1.predict(x_test)
y_pred_2 = clf_2.predict(x_test)

# calculate accuracy of y_pred using y_test
print(f'MLPClassifier accuracy {accuracy_score(y_test, y_pred_1)}')
print(f'OutputCodeClassifier accuracy {accuracy_score(y_test, y_pred_2)}')

# use classification_report function to print more information
print(
    f'\n\nClassification report for MLPClassifier is\n {classification_report(y_test, y_pred_1)}'
)
print(
    f'\n\nClassification report for OutputCodeClassifier is\n {classification_report(y_test, y_pred_2)}'
)
Example no. 19
X_train, X_test, y_train, y_test = train_test_split(features_minmax,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=42)
samples_num = y_test.shape[0]

predictions_one_vs_rest = OneVsRestClassifier(LinearSVC(random_state=0)).fit(
    X_train, y_train).predict(X_test)

predictions_one_vs_one = OneVsOneClassifier(LinearSVC(random_state=0)).fit(
    X_train, y_train).predict(X_test)

clf = OutputCodeClassifier(LinearSVC(random_state=0),
                           code_size=2,
                           random_state=0)
prediction_outputCode = clf.fit(X_train, y_train).predict(X_test)

correct_onevsone = 0
correct_onevsrest = 0
correct_output = 0
y_test = np.array(y_test)
for i in range(samples_num):
    if predictions_one_vs_rest[i] == y_test[i]:
        correct_onevsrest = correct_onevsrest + 1
    if predictions_one_vs_one[i] == y_test[i]:
        correct_onevsone = correct_onevsone + 1
    if prediction_outputCode[i] == y_test[i]:
        correct_output = correct_output + 1

print("Accuracy for one vs one classifier")
acc_oneVsone = float(correct_onevsone) / samples_num
Example no. 20
# apply HoG to all the images in b.data
hog_train = []
for img_array in X_train:
    img = img_array.reshape(b.shape)
    fd, _ = hog(img,
                orientations=8,
                pixels_per_cell=(PPC, PPC),
                cells_per_block=(1, 1),
                visualize=True,
                multichannel=False)
    hog_train.append(fd)

clf = OutputCodeClassifier(LinearSVC(random_state=0),
                           code_size=2,
                           random_state=42)
clf.fit(hog_train, y_train)
tok = time.time()

if control[1]:
    # create the HOG features for X_test
    hog_test = []
    for img_array in X_test:
        fd, _ = hog(img_array.reshape(b.shape),
                    orientations=8,
                    pixels_per_cell=(PPC, PPC),
                    cells_per_block=(1, 1),
                    visualize=True,
                    multichannel=False)
        hog_test.append(fd)
    y_pred = clf.predict(hog_test)
Example no. 21
def ml_models(train, test, lab, labt):
    #Random Forest
    forest = RandomForestClassifier(n_estimators=200,
                                    max_leaf_nodes=50,
                                    criterion="entropy")
    forest = forest.fit(train, lab)
    output_rf = forest.predict(test).astype(int)
    suc_rf = 0
    totals_rf = [0 for m in range(num)]
    preds_rf = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_rf[labt[i]] += 1
        if output_rf[i] == labt[i]:
            suc_rf = suc_rf + 1
            preds_rf[labt[i]] += 1

    accuracy_rf = suc_rf / len(labt)
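    # equivalent to sklearn.metrics.accuracy_score(labt, output_rf)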

    #KNearest Neighbour

    neigh = KNeighborsClassifier(n_neighbors=7)
    neigh.fit(train, lab)
    output_kn = neigh.predict(test)
    suc_kn = 0
    totals_kn = [0 for m in range(num)]
    preds_kn = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_kn[labt[i]] += 1
        if output_kn[i] == labt[i]:
            suc_kn = suc_kn + 1
            preds_kn[labt[i]] += 1

    accuracy_kn = suc_kn / len(labt)

    # Logistic Regression

    model = LogisticRegression()
    model.fit(train, lab)
    output_lr = model.predict(test)
    suc_lr = 0
    totals_lr = [0 for m in range(num)]
    preds_lr = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_lr[labt[i]] += 1
        if output_lr[i] == labt[i]:
            suc_lr = suc_lr + 1
            preds_lr[labt[i]] += 1

    accuracy_lr = suc_lr / len(labt)

    # Naive Bayes

    model = GaussianNB()
    model.fit(train, lab)
    # print(model)
    # make predictions
    # expected = y
    output_nb = model.predict(test)

    suc_nb = 0
    totals_nb = [0 for m in range(num)]
    preds_nb = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_nb[labt[i]] += 1
        if output_nb[i] == labt[i]:
            suc_nb = suc_nb + 1
            preds_nb[labt[i]] += 1

    accuracy_nb = suc_nb / len(labt)

    # Decision Tree Classifier

    model = DecisionTreeClassifier()
    model.fit(train, lab)
    output_dt = model.predict(test)

    suc_dt = 0
    totals_dt = [0 for m in range(num)]
    preds_dt = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_dt[labt[i]] += 1
        if output_dt[i] == labt[i]:
            suc_dt = suc_dt + 1
            preds_dt[labt[i]] += 1

    accuracy_dt = suc_dt / len(labt)

    # Support Vector Machine (wrapped in an OutputCodeClassifier)

    clf = OutputCodeClassifier(LinearSVC(random_state=0),
                               code_size=2,
                               random_state=0)
    clf.fit(train, lab)
    output_sv = clf.predict(test)

    suc_sv = 0
    totals_sv = [0 for m in range(num)]
    preds_sv = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_sv[labt[i]] += 1
        if output_sv[i] == labt[i]:
            suc_sv = suc_sv + 1
            preds_sv[labt[i]] += 1

    accuracy_sv = suc_sv / len(labt)

    # Majority voting

    def Most_Common(lst):
        data = Counter(lst)
        return data.most_common(1)[0][0]

    output_mv = []
    for i in range(0, len(labt)):
        c = [output_dt[i], output_rf[i], output_lr[i]]
        output_mv.append(Most_Common(c))

    suc_mv = 0
    totals_mv = [0 for m in range(num)]
    preds_mv = [0 for m in range(num)]
    for i in range(0, len(labt)):
        totals_mv[labt[i]] += 1
        if output_mv[i] == labt[i]:
            suc_mv = suc_mv + 1
            preds_mv[labt[i]] += 1

    accuracy_mv = suc_mv / len(labt)

    return accuracy_rf, accuracy_kn, accuracy_lr, accuracy_nb, accuracy_dt, accuracy_sv, accuracy_mv, \
           preds_rf, preds_kn, preds_lr, preds_nb, preds_dt, preds_sv, preds_mv, \
           totals_rf, totals_kn, totals_lr, totals_nb, totals_dt, totals_sv, totals_mv
Example no. 22
x= np.array(np.zeros(15050), ndmin=1)    #label 0 for benign
y= np.array(np.ones(15050), ndmin=1)     #label 1 for malignant
y_train=np.concatenate((x,y), axis=0)


#labeling y_test
x1= np.array(np.zeros(50), ndmin=1)  #label 0 for benign
y1= np.array(np.ones(50), ndmin=1)   #label 1 for malignant
y_test=np.concatenate((x1,y1), axis=0)

################Using LinearSVC
param_grid = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000,5000, 10000]}
clf = OutputCodeClassifier(LinearSVC(random_state=0, verbose=5),
                           code_size=3, random_state=0)
clf.fit(X_train, y_train)
predictions=clf.predict(X_test)

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

###########Using GridSearchCV
param_grid = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000,5000, 10000], 'gamma':[100,10,1,0.1,0.01,0.001,0.0001]}
model_grid = GridSearchCV(SVC(), param_grid, verbose=5,cv=10)


ecoc = OutputCodeClassifier(LinearSVC(random_state=0), random_state=0)
Cs = [0.0001,0.001, 0.01,0.5, 0.8, 0.1, 1, 10, 100, 1000, 5000, 10000]
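# note the estimator__ prefix: it routes C to the LinearSVC nested inside the OutputCodeClassifier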
cv = GridSearchCV(ecoc, {'estimator__C': Cs}, verbose=5, cv=10)

cv.fit(X_train,y_train)
Example no. 23
def train_by_OutputCodeClassifier(X, y):
    clf = OutputCodeClassifier(LinearSVC(random_state=0),
                               code_size=2,
                               random_state=0)
    return clf.fit(X, y)
Example no. 24
    threshold_train]

# Test
threshold_test = np.where((y_test == 0) | (y_test == 1) | (y_test == 7)
                          | (y_test == 8))
y_test_thres, x_test_thres = y_test[threshold_test], x_test[threshold_test]

###################################################################################################
################################# Training a classifier (4  numbers) ##############################

num_iter = 5

start_time_OCC = time.time()

OCC = OutputCodeClassifier(Perceptron(max_iter=num_iter, random_state=0))
OCC.fit(x_train_thres, y_train_thres)
predictionsOCC = OCC.predict(x_test_thres)
scoreOCC = OCC.score(x_test_thres, y_test_thres)

cmOCC = metrics.confusion_matrix(y_test_thres, predictionsOCC)
plt.figure(figsize=(9, 9))
sns.heatmap(cmOCC,
            annot=True,
            fmt=".3f",
            linewidths=.5,
            square=True,
            cmap='Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'OCC - Accuracy Score: {0}'.format(scoreOCC)
plt.title(all_sample_title, size=15)
Example no. 25
print(welfare_word)
welfare_data = tfidf_vectorizer.fit_transform(welfare_word).toarray()

print(welfare_data)
# split the dataset
welfare_data_train,welfare_data_test,welfare_target_train,welfare_target_test = \
train_test_split(welfare_data,welfare_target,test_size=0.2,random_state=666)

# data standardization (optional)
# stdScaler = StandardScaler().fit(welfare_data_train)
# welfare_data_train_std = stdScaler.transform(welfare_data_train)
# welfare_data_test_std = stdScaler.transform(welfare_data_test)

# build the SVM model with a linear kernel
model = OutputCodeClassifier(LinearSVC())
model = model.fit(welfare_data_train,welfare_target_train)
# save the model
joblib.dump(model, 'welfare_predict.pkl')

welfare_target_predict = model.predict(welfare_data_test)
print('first 20 predicted results:\n', welfare_target_predict[:20])

print('SVM prediction accuracy:',
      accuracy_score(welfare_target_test,welfare_target_predict))
print('SVM prediction precision:',
      precision_score(welfare_target_test,welfare_target_predict,average='micro'))
print('SVM prediction recall:',
      recall_score(welfare_target_test,welfare_target_predict,average='micro'))
print('SVM prediction F1 score:',
      f1_score(welfare_target_test,welfare_target_predict,average='micro'))
print("SVM prediction Cohen's Kappa coefficient:",
Example no. 26
      knn.fit(train_ft, train_label).score(test_ft, test_label))
print('LogisticRegression score: %f' %
      logistic.fit(train_ft, train_label).score(test_ft, test_label))

# SVM
list_of_acc = list()

accur = 0
# for c in np.logspace(-2, 10, 5):
c = 1000
# for c in np.logspace(-2, 10, 5):
#     for c in np.logspace(-2, 10, 5):
for c in [100, 1000, 10000, 100000]:
    for g in np.logspace(-9, 3, 13):

        clf = OutputCodeClassifier(svm.SVC(random_state=0, gamma=g, C=c),
                                   code_size=10,
                                   random_state=0)

        accur_temp = clf.fit(svmtrain,
                             svmtrainlabel).score(svmtest, svmtestlabel)

        if accur < accur_temp:
            accur = accur_temp
            gamma = g

        print(c, g, accur)

list_of_acc.append(accur)
print(np.mean(list_of_acc))
Example no. 27
     y_train = labels[100:172,i]
     X_test = sample2
     y_test = labels[272:,i]
 else:
     X_train = training
     y_train = labels[:172,i]
     X_test = sampletest
     y_test = labels[172:,i]
 
 box = np.zeros([6,6])
 accuracy = np.zeros(100)
 for m in range(0,100):
     posterior = np.empty([100,72,6])
     gbc = GradientBoostingClassifier(n_estimators=60, max_depth=3)
     occ = OutputCodeClassifier(gbc)
     y_pred = occ.fit(X_train, y_train).predict(X_test)
     
     n=0
     for i in range(0,len(y_pred)):
         if y_pred[i] == y_test[i]:
             #print i, y_pred[i], y_test[i]
             n = n+1
             accuracy[m] = accuracy[m]+1
         box[y_test[i]-1,y_pred[i]-1] = box[y_test[i]-1,y_pred[i]-1] + 1
             #posterior[m] =  knc.predict_proba(X_test)
 print(np.mean(accuracy)/0.72, np.std(accuracy)/0.72)
 #print sum(accuracy[0:8])/8.0, sum(accuracy[8:18])/10.0, sum(accuracy[18:30])/12.0, sum(accuracy[56:72])/16.0, sum(accuracy[30:43])/13.0, sum(accuracy[43:56])/13.0, sum(accuracy)/72.0
 '''
 means = np.empty([72,6])
 stds = np.empty([72,6])
 grid = np.empty([6,6])
Example no. 28
# -- coding: utf-8 --
# Problem 8, Python code
# 1530200066 赵一勤
# SVM classification code
import h5py
import numpy as np
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OutputCodeClassifier

# load the data
f = h5py.File('./pca_data.mat', 'r')
data = {}
for k in f.keys():
    data[k] = f[k][:]

test_data = data['test_data'].transpose()
test_label = np.ravel(data['test_label'].transpose())
train_data = data['train_data'].transpose()
train_label = np.ravel(data['train_label'].transpose())

# start training
clf = OutputCodeClassifier(SVC(kernel='rbf'))
model = clf.fit(train_data, train_label)
train_acc = model.score(train_data, train_label)
test_acc = model.score(test_data, test_label)

# print training and test accuracy
print('[Train accuracy]: %s, [Test accuracy]: %s' %(train_acc, test_acc))
Example no. 29
# random_search = RandomizedSearchCV(estimator=svc,
#                                    param_distributions=random_grid,
#                                    n_iter=10,
#                                    scoring='accuracy',
#                                    cv=3, 
#                                    verbose=1, 
#                                    random_state=12)
##----------------------End of Uncomment block for applying random search grid to find best parameters

##----------------------Uncomment block for using multiclass learning using output-codes
occ = OutputCodeClassifier(svc,code_size=2, random_state=8)
##----------------------End of Uncomment block for using multiclass learning using output-codes

## Fit your chosen model by changing the variable before the period to either - svc, random_search, grid_search or occ
occ.fit(features_train, labels_train)

##----------------------Uncomment required block for finding out best parameters if using the random search or grid search for best accuracy
# print("The best hyperparameters from Random Search are:")
# print(random_search.best_params_)
# print("")
# print("The mean accuracy of a model with these hyperparameters is:")
# print(random_search.best_score_)
##----------------------End of Uncomment block for finding out best parameters if using the random search for best accuracy

def get_key(val):
    identifiedKey = [k for k,v in category_codes.items() if v == val]
    if  len(identifiedKey) == 0:
        return "No value"
    return identifiedKey[0]
Example no. 30
    train_ingredients.append(' '.join(ings))

#construct test_ingredients
for entry in test_set:
    ings = [WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', w)) for w in entry['ingredients']]

    test_ingredients.append(' '.join(ings))

#used to encode labels as numbers for use with RandomForestClassifier
le = LabelEncoder()

#encode cuisines as numbers
train_cuisines = le.fit_transform(train_cuisines)

#used to create bag of ingredients vocabulary and create features for each entry
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_ingredients).toarray()
test_features = vectorizer.transform(test_ingredients).toarray()

clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=2)
result = clf.fit(train_features, train_cuisines).predict(test_features)

output = pd.DataFrame(data={'id':test_ids, 'cuisine':le.inverse_transform(result)})

#force explicit ordering of columns
output = output[['id', 'cuisine']]
output.to_csv('ecoc.csv', index=False)


Example no. 31
print()

data = loadmat('ex3data1.mat')
X = data['X']
y = data['y']

y = y.T
y = y[0]

# n_classes = 10
# code_size = np.log2(n_classes) / n_classes
# yields log2(10)/10 ~= 0.332, just enough bits to give each class a unique codeword
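# i.e. int(10 * 0.332) == 3 estimators, versus int(10 * 2) == 20 with code_size=2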

clf = OutputCodeClassifier(LinearSVC(random_state=0),
                            code_size=2, random_state=0)
ind2 = clf.fit(X, y).predict(X)
error = []
count = 0
for i in range(0, len(y)):
    if y[i] == ind2[i]:
        count += 1                # Good - increment count
    else:  
        error.append(i)           # Record index of bad read

print('The number predicted correctly = ', count)
print('The percentage accuracy is ','{:.2%}'.format(count/len(y)))
print()

# Display a selection of the mis-classified
m = 0
# Display size
Example no. 32
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC
from sklearn.datasets import load_svmlight_file
import numpy as np
import sklearn

TEST_SPLIT = .2

X, Y = load_svmlight_file("ablated_features.txt")

num_instances = len(Y)
num_train = int((1 - TEST_SPLIT) * num_instances)
indices = np.arange(num_instances)
np.random.shuffle(indices)

X = X[indices]
Y = Y[indices]

X_train = X[:num_train]
Y_train = Y[:num_train]
X_test = X[num_train:]
Y_test = Y[num_train:]

# print X_train.shape[0], X_test.shape[0]
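# editor's note: code_size=20 trains 20 * n_classes estimators; longer codewords
# add error-correcting redundancy at the cost of training time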

clf = OutputCodeClassifier(LinearSVC(random_state=0),
                           code_size=20,
                           random_state=0)
preds = clf.fit(X_train, Y_train).predict(X_test)
print(sklearn.metrics.accuracy_score(Y_test, preds))
Example no. 33
def oc_classify(X,Y):
	size = np.count_nonzero(np.unique(Y))
	clf = OutputCodeClassifier(LinearSVC(),code_size=size)
	clf.fit(X,Y)
	return clf
Example no. 34
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OutputCodeClassifier

iris = datasets.load_iris()

print(iris)
X, y = iris.data, iris.target
clf = OutputCodeClassifier(LinearSVC(random_state=0),
                           code_size=2,
                           random_state=0)

print(X)
print(y)
print(clf.fit(X, y).predict(X))
Example no. 35
class Classifier():

    def __init__(self,trainlist,testlist,scaling = "binary",jobs=16,directory=False,
            features = False, feature_info = False):
        self.training = trainlist
        self.test = testlist #self.test should be a list with multiple lists for each testset
        self.scaling = scaling
        self.jobs = jobs
        self.directory = directory
        self.feature_status = {}
        self.outstring = False
        self.features = features
        self.feature_info = feature_info

    def count_feature_frequency(self):
        
        def ff(instances,queue):
            feature_frequency = defaultdict(int)
            for i,instance in enumerate(instances):
                for feature in instance["ngrams"]:
                    feature_frequency[feature] += 1
            queue.put(feature_frequency)
        
        print(len(self.training))

        q = multiprocessing.Queue()
        chunks = gen_functions.make_chunks(self.training,self.jobs)
        for chunk in chunks:
            p = multiprocessing.Process(target=ff,args=[chunk,q])
            p.start()

        ds = []
        while True:
            l = q.get()
            ds.append(l)
            if len(ds) == len(chunks):
                break
        
        self.feature_frequency = defaultdict(int)
        for d in ds:
            for k in d:
                self.feature_frequency[k] += d[k]
        self.features = sorted(self.feature_frequency, key=self.feature_frequency.get, 
            reverse=True)

    def make_feature_labellist(self):
        feature_labellist = defaultdict(list)
        for instance in self.training:
            try:
                label = int(instance["label"])       
                for feature in instance["ngrams"]:
                    feature_labellist[feature].append(label)
            except:
                continue
        self.feature_labellist = feature_labellist

    def prune_features(self):
        for instance in self.training:
            new_features = []
            #print feature_status
            for f in instance["ngrams"]:
                try:
                    if self.feature_status[f]:
                        new_features.append(f)
                except:
                    continue
            instance["ngrams"] = new_features
            # queue.put(instance)

    def convert_features(self,convert_list):
        for instance in self.training:
            new_features = []
            #print feature_status
            #print instance["features"]
            for i,f in enumerate(instance["ngrams"]):
                if f in convert_list.keys():
                     instance["ngrams"][i] = convert_list[f]
            #print instance["features"]

    def filter_stdev(self,threshold,prop):
        self.make_feature_labellist()
        feature_convert = {}
        new_features = []
        for feature in self.feature_labellist.keys():
            if re.search(r"^" + prop,feature):
                if gen_functions.return_standard_deviation(self.feature_labellist[feature]) > threshold or len(self.feature_labellist[feature]) <= 2:
                    self.feature_status[feature] = False
                else:
                    new_feature = str(abs(int(numpy.median(self.feature_labellist[feature])))) + "_days"
                    feature_convert[feature] = new_feature
                    new_features.append(new_feature)
                    self.feature_status[new_feature] = True
            else:
                self.feature_status[feature] = True
                new_features.append(feature)
        self.convert_features(feature_convert)
        self.prune_features()
        self.features = list(set(new_features))

    def prune_features_topfrequency(self,n):
        #generate feature_frequency dict
        for f in self.features[:n]:
            self.feature_status[f] = True 
        for f in self.features[n:]:
            self.feature_status[f] = False
        self.features = self.features[:n]
        self.prune_features()

    def balance_data(self):
        label_instances = defaultdict(list)
        new_training = []
        for instance in self.training:     
            label = instance["label"]
            label_instances[label].append(instance)
        if len(label_instances.keys()) > 2:
            median = int(numpy.median(numpy.array([len(label_instances[x]) for \
                x in label_instances.keys()])))
            for label in label_instances.keys():
                if len(label_instances[label]) == median:
                    new_training.extend(label_instances[label])
                else:
                    instances = lineconverter.Lineconverter(label_instances[label])
                    if len(instances.lines) < median:
                        instances.sample(median-len(instances.lines),sample_type="up")
                    else:
                        instances.sample(len(instances.lines)-median)
                    new_training.extend(instances.lines)
            self.training = new_training

    def index_features(self,ind = 0):
        feature_frequency=defaultdict(int)
        self.feature_info={}
        #print self.features      
        for i,feature in enumerate(self.features):
            self.feature_info[feature]=i+ind
        
        def sparsify(instances,writelist):
            for instance in instances:
                sparse_features = defaultdict(int)
                for feature in instance["ngrams"]:
                    try:
                        sparse_features[self.feature_info[feature]] += 1
                    except:
                        continue
                instance["sparse"] = sparse_features
                writelist.append(instance)         
        new_instances = []
        sparsify(self.training,new_instances)
        self.training = new_instances

        for tset in self.test:
            for instance in tset["instances"]:
                sparse_features = defaultdict(int)
                for feature in instance["ngrams"]:
                    try:
                        sparse_features[self.feature_info[feature]] += 1
                    except:
                        continue
                instance["sparse"] = sparse_features

    def vectorize(self,instances):
        zerolist = [float(0)] * len(self.feature_info.keys())
        matrix = []
        for instance in instances:
            featurev = zerolist[:]
            for feature in instance["sparse"].keys():
                if self.scaling == "binary":
                    featurev[feature] = float(1)
                elif self.scaling == "log": 
                    featurev[feature] = math.log(instance["sparse"][feature],10)
                elif self.scaling == "tfidf":
                    featurev[feature] = instance["sparse"][feature] * self.idf[feature]
            for feat in instance["features"]:
                featurev.append(feat)
            matrix.append(featurev)
        return matrix

    def model_necessities(self):
        #generate scipy libsvm input
        self.trainlabels_raw = [x["label"] for x in self.training]
        self.labels = set(self.trainlabels_raw)
        labeldict = dict(zip(self.labels,range(len(self.labels))))
        self.labeldict_back = dict(zip(range(len(self.labels)),self.labels))
        if self.scaling == "tfidf":
            self.idf = weight_features.return_idf(self.training)
        self.trainingvectors = self.vectorize(self.training)
        self.training_csr = csr_matrix(self.trainingvectors)
        self.trainlabels = [labeldict[x["label"]] for x in self.training]

    def predict(self,ts):
        testvectors = self.vectorize(ts)
        predictions = []
        for i,t in enumerate(testvectors):
            classification = self.clf.predict(t)
            proba = self.clf.predict_proba(t)
            classification_label = self.labeldict_back[classification[0]]
            if len(ts[0]["meta"]) == 6:
                predictions.append([ts[i]["meta"][5], ts[i]["label"] + " " + classification_label, \
                    " ".join([str(round(x,2)) for x in proba.tolist()[0]])])
            else:
                predictions.append([" ".join([x for x in ts[i]["ngrams"] if not re.search("_",x)]), ts[i]["label"] + " " + classification_label, \
                    " ".join([str(round(x,2)) for x in proba.tolist()[0]])])
        return predictions

    def train_svm(self,params = 10):
        #obtain the best parameter settings for an svm outputcode classifier
        if len(self.labels) > 2:
            print("outputcodeclassifier")
            param_grid = {'estimator__C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
                'estimator__kernel': ['linear','rbf','poly'], 
                'estimator__gamma': [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048],
                'estimator__degree': [1,2,3,4]}
            model = OutputCodeClassifier(svm.SVC(probability=True))
        else:
            print("svc model")
            param_grid = {'C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
                'kernel': ['linear','rbf','poly'], 
                'gamma': [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048],
                'degree': [1,2,3,4]}
            model = svm.SVC(probability=True)
        paramsearch = RandomizedSearchCV(model, param_grid, cv=5, verbose=2,n_iter = params,n_jobs=self.jobs) 
        print("Grid search...")
        paramsearch.fit(self.training_csr,numpy.asarray(self.trainlabels))
        print("Prediction...")
        #print the best parameters to the file
        parameters = paramsearch.best_params_
        self.outstring = "best parameter settings:\n"
        for parameter in parameters.keys():
            self.outstring += (parameter + ": " + str(parameters[parameter]) + "\n")
        self.outstring += ("best score: " + str(paramsearch.best_score_) + "\n\n")
        #train an svm outputcode classifier using the best parameters
        if len(self.labels) > 2:
            clf = svm.SVC(probability=True, C=parameters['estimator__C'],
                kernel=parameters['estimator__kernel'],gamma=parameters['estimator__gamma'],
                degree=parameters['estimator__degree'])
            self.clf = OutputCodeClassifier(clf,n_jobs=self.jobs)
            self.clf.fit(self.training_csr,self.trainlabels)
        else:
            self.clf = svm.SVC(probability=True, C=parameters['C'],
                kernel=parameters['kernel'],gamma=parameters['gamma'],
                degree=parameters['degree'])
            self.clf.fit(self.training_csr,self.trainlabels)

    def train_nb(self):
        self.clf = naive_bayes.MultinomialNB()
        self.clf.fit(self.training_csr,self.trainlabels)

    def train_decisiontree(self):
        self.clf = tree.DecisionTreeClassifier()
        self.clf.fit(self.training_csr.toarray(),self.trainlabels)

    def tenfold_train(self,voting,classifiers = [],p = 10):
        kf = cross_validation.KFold(len(self.training), n_folds=10)
        training = deepcopy(self.training)
        feat = deepcopy(self.features)
        fi = deepcopy(self.feature_info)
        if voting == "weighted":
            self.feature_info = {}
            self.features = []
            for instance in self.training:
                instance["sparse"] = defaultdict(int)
                instance["ngrams"] = []
        len_features = len(self.features)
        for i,fn in enumerate(classifiers):
            featurename = "___" + fn
            self.feature_info[featurename] = len_features + i
            self.features.append(featurename)
        for train_index, test_index in kf:
            train = deepcopy([training[x] for x in train_index])
            test = deepcopy([training[y] for y in test_index])
            cl = Classifier(train,test,features = feat,feature_info = fi)
            cl.model_necessities()
            if "svm" in classifiers:
                cl.train_svm(params = p)
                predictions = cl.predict(test)
                for i,j in enumerate(test_index):
                    prediction = int(float(predictions[i][1].split()[1]))
                    self.training[j]["sparse"][self.feature_info["___svm"]] = prediction
                    if prediction == 1:
                        self.training[j]["ngrams"].append("___svm")
            if "nb" in classifiers:
                cl.train_nb()
                predictions = cl.predict(test)
                for i,j in enumerate(test_index):
                    prediction = int(float(predictions[i][1].split()[1]))
                    self.training[j]["sparse"][self.feature_info["___nb"]] = prediction
                    if prediction == 1:
                        self.training[j]["ngrams"].append("___nb")
            if "dt" in classifiers:
                cl.train_decisiontree()
                predictions = cl.predict(test)
                for i,j in enumerate(test_index):
                    prediction = int(float(predictions[i][1].split()[1]))
                    self.training[j]["sparse"][self.feature_info["___dt"]] = prediction
                    if prediction == 1:
                        self.training[j]["ngrams"].append("___dt")               
            
    def return_classification_features(self):
        prediction_features_testset = []
        for tset in self.test:
            prediction_features = []
            predictions = self.predict(tset["instances"])
            for i,prediction in enumerate(predictions):
                prediction_features.append(int(float(predictions[i][1].split()[1])))
            prediction_features_testset.append(prediction_features)
        return prediction_features_testset    

    def add_classification_features(self,featuredict,featurenames,voter):
        if voter == "majority":
            self.feature_info = {}
            len_features = len(self.feature_info.keys())
            for i,fn in enumerate(featurenames):
                self.feature_info[fn] = len_features + i
                self.features.append(fn)
        for i,tset in enumerate(self.test):
            for j,instance in enumerate(tset["instances"]):
                if voter != "arbiter":
                    tset["instances"][j]["sparse"] = defaultdict(int)
                    tset["instances"][j]["ngrams"] = []
                for fn in featurenames:
                    tset["instances"][j]["sparse"][self.feature_info[fn]] = featuredict[i][j][fn]
                    tset["instances"][j]["ngrams"].append(fn)

    def append_classifier_labelings(self):
        len_features = len(self.feature_info.keys())
        self.feature_info["___append"] = len_features
        self.features.append("___append")
        for instance in self.training:
            instance["sparse"][self.feature_info["___append"]] = instance["append"]
            if instance["append"] == 1:
                instance["features"].append("___append")
        for tset in self.test:
            for instance in tset["instances"]:
                instance["sparse"][self.feature_info["___append"]] = instance["append"]
                if instance["append"] == 1:
                    instance["features"].append("___append")

    def output_data(self):
        if re.search(".txt",self.test[0]["out"]):
            outdir = self.test[0]["out"][:-4] + "_"
        else:
            outdir = self.test[0]["out"]
        #output features
        #featureout = codecs.open(outdir + "features.txt","w","utf-8")
        featureout = open(outdir + "features.txt", "w", encoding = "utf-8")
        for feature in sorted(self.feature_info, key=self.feature_info.get):
            featureout.write(feature + "\t" + str(self.feature_info[feature]) + "\n")
        featureout.close()
        #output trainfile
        #trainout = codecs.open(outdir + "train.txt","w","utf-8")
        trainout = open(outdir + "train.txt", "w", encoding = "utf-8")
        for instance in self.training:
            trainout.write(instance["label"] + " " + ",".join(instance["ngrams"]) + " " + 
                ",".join([str(x) for x in instance["sparse"].keys()]) + "\n")
        trainout.close()
        #output testfile
        #testout = codecs.open(outdir + "test.txt","w","utf-8")
        testout = open(outdir + "test.txt", "w", encoding = "utf-8")
        for i,tset in enumerate(self.test):
            #testout = codecs.open(outdir + "test" + str(i) + ".txt","w","utf-8")
            for instance in tset["instances"]:
                testout.write(instance["label"] + " " + ",".join(instance["ngrams"]) + " " + 
                    ",".join([str(x) for x in instance["sparse"].keys()]) + "\n")

    def test_model(self):
        for tset in self.test:
            testresults = self.predict(tset["instances"])
            #outfile = codecs.open(tset["out"] + "predictions.txt","w","utf-8")
            if re.search(".txt",tset["out"]):
                outstring = tset["out"][:-4] + "_predictions.txt"
            else:
                outstring = tset["out"] + "predictions.txt"
#            outfile = codecs.open(outstring,"w","utf-8")
            outfile = open(outstring, "w", encoding = "utf-8")
            if self.outstring:
                outfile.write(self.outstring)
            for instance in testresults:
                outfile.write("\t".join(instance) + "\n") 
            outfile.close()

    def save_model(self):
        for tset in self.test:
            outfile = tset["out"][:-4] + "_model.joblib.pkl"
            #with open(outfile, 'wb') as fid:
            #    cPickle.dump(self.clf, fid)    
            with open(outfile, 'wb') as fid:
                pickle.dump(self.clf, fid)    
            #_ = joblib.dump(, outfile, compress=9)
            #outvocabulary = codecs.open(tset["out"] + "vocabulary.txt","w","utf-8")
            outstring = tset["out"][:-4] + "_vocabulary.txt"
            #outvocabulary = codecs.open(outstring,"w","utf-8")
            outvocabulary = open(outstring, "w", encoding = "utf-8")
            for feature in self.features:
                outvocabulary.write(feature + "\n")
            outvocabulary.close() 
            #outidf = codecs.open(tset["out"][:-4] + "_idfs.txt","w","utf-8")
            outidf = open(tset["out"][:-4] + "_idfs.txt", "w", encoding = "utf-8")
            for key in self.idf.keys():
                outidf.write(str(key) + "\t" + str(self.idf[key]) + "\n")
            outidf.close()
Example no. 36
        WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', w))
        for w in entry['ingredients']
    ]

    test_ingredients.append(' '.join(ings))

#used to encode labels as numbers for use with RandomForestClassifier
le = LabelEncoder()

#encode cuisines as numbers
train_cuisines = le.fit_transform(train_cuisines)

#used to create bag of ingredients vocabulary and create features for each entry
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_ingredients).toarray()
test_features = vectorizer.transform(test_ingredients).toarray()

clf = OutputCodeClassifier(LinearSVC(random_state=0),
                           code_size=2,
                           random_state=2)
result = clf.fit(train_features, train_cuisines).predict(test_features)

output = pd.DataFrame(data={
    'id': test_ids,
    'cuisine': le.inverse_transform(result)
})

#force explicit ordering of columns
output = output[['id', 'cuisine']]
output.to_csv('ecoc.csv', index=False)
Example no. 37
        for w in words:
            for i, word in enumerate(vocab):
                if word == w:
                    bag_vector[i] += 1

        print("{0} \n{1}\n".format(sentence, numpy.array(bag_vector)))


allsentences = [
    "Joe waited`s for the train", "The train was late",
    "Mary and Samantha took the bus",
    "I looked for Mary and Samantha at the bus station",
    "Mary and Samantha arrived at the bus station early but waited until noon for the bus"
]

generate_bow(allsentences)

from sklearn import datasets
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC
iris = datasets.load_iris()
X, y = iris.data, iris.target
print(X)
clf = OutputCodeClassifier(LinearSVC(random_state=0),
                           code_size=2,
                           random_state=0)

clf.fit(X, y)
m = clf.predict(X)
print(m)