Ejemplo n.º 1
0
def test_ovr_partial_fit():
    """Check that two ``partial_fit`` calls agree with one ``fit`` on iris."""
    # Test if partial_fit is working as intended (shuffled data).
    X, y = shuffle(iris.data, iris.target, random_state=0)
    incremental = OneVsRestClassifier(MultinomialNB())
    incremental.partial_fit(X[:100], y[:100], np.unique(y))
    incremental.partial_fit(X[100:], y[100:])
    pred = incremental.predict(X)

    batch = OneVsRestClassifier(MultinomialNB())
    batch.fit(X, y)
    pred2 = batch.predict(X)

    assert_almost_equal(pred, pred2)
    assert_equal(len(incremental.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == pred), 0.65)

    # Test the case where mini-batches do not contain all classes
    # (the data is fed unshuffled this time).
    data, target = iris.data, iris.target
    incremental = OneVsRestClassifier(MultinomialNB())
    incremental.partial_fit(data[:60], target[:60], np.unique(target))
    incremental.partial_fit(data[60:], target[60:])
    pred = incremental.predict(data)

    batch = OneVsRestClassifier(MultinomialNB())
    batch.fit(data, target)
    pred2 = batch.predict(data)

    assert_almost_equal(pred, pred2)
    assert_equal(len(incremental.estimators_), len(np.unique(target)))
    assert_greater(np.mean(target == pred), 0.65)
Ejemplo n.º 2
0
def test_ovr_multilabel():
    """Fit one-vs-rest on a toy multilabel problem with several base estimators."""
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]])
    y = [["spam", "eggs"], ["spam"], ["ham", "eggs", "spam"],
         ["ham", "eggs"], ["ham"]]
    # Same labels as an indicator matrix; columns are (ham, spam, eggs).
    Y = np.array([[0, 1, 1],
                  [0, 1, 0],
                  [1, 1, 1],
                  [1, 0, 1],
                  [1, 0, 0]])

    classes = set("ham eggs spam".split())
    sample = [[0, 4, 4]]

    base_estimators = (MultinomialNB(), LinearSVC(random_state=0),
                       LinearRegression(), Ridge(),
                       ElasticNet(), Lasso(alpha=0.5))
    for base_clf in base_estimators:
        # Labels given as lists of label names.
        clf = OneVsRestClassifier(base_clf)
        clf.fit(X, y)
        assert_equal(set(clf.classes_), classes)
        y_pred = clf.predict(sample)[0]
        assert_equal(set(y_pred), set(["spam", "eggs"]))
        assert_true(clf.multilabel_)

        # Labels given as a label indicator matrix.
        clf = OneVsRestClassifier(base_clf)
        clf.fit(X, Y)
        y_pred = clf.predict(sample)[0]
        assert_array_equal(y_pred, [0, 1, 1])
        assert_true(clf.multilabel_)
Ejemplo n.º 3
0
def test_ovr_partial_fit():
    """Check ``partial_fit`` against ``fit`` and that it is only exposed when supported."""
    # Incremental fitting in two batches must match a single fit.
    X, y = shuffle(iris.data, iris.target, random_state=0)
    incremental = OneVsRestClassifier(MultinomialNB())
    incremental.partial_fit(X[:100], y[:100], np.unique(y))
    incremental.partial_fit(X[100:], y[100:])
    pred = incremental.predict(X)

    batch = OneVsRestClassifier(MultinomialNB())
    batch.fit(X, y)
    pred2 = batch.predict(X)

    assert_almost_equal(pred, pred2)
    assert_equal(len(incremental.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == pred), 0.65)

    # Mini-batches that do not contain all classes, with SGDClassifier
    # (the first seven labels below never include class 0).
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]

    def make_sgd_ovr():
        # Both classifiers under comparison use the same SGD configuration.
        return OneVsRestClassifier(SGDClassifier(max_iter=1, tol=None,
                                                 shuffle=False,
                                                 random_state=0))

    ovr = make_sgd_ovr()
    ovr.partial_fit(X[:7], y[:7], np.unique(y))
    ovr.partial_fit(X[7:], y[7:])
    pred = ovr.predict(X)

    ovr1 = make_sgd_ovr()
    ovr1.fit(X, y)
    pred1 = ovr1.predict(X)
    assert_equal(np.mean(pred == y), np.mean(pred1 == y))

    # partial_fit must only exist if the wrapped estimator has it.
    ovr = OneVsRestClassifier(SVC())
    assert_false(hasattr(ovr, "partial_fit"))
Ejemplo n.º 4
0
def AgeClassifier(data_feature_stack, data_age_stack, test_size=0.5):
    """Train a one-vs-rest linear SVM on age labels and count its hits.

    The inputs are split into train and test parts by ``preprocess``
    (defined elsewhere), a ``OneVsRestClassifier(LinearSVC(C=0.001))`` is
    fitted on the training part, and both parts are predicted.

    Args:
        data_feature_stack: feature data forwarded to ``preprocess``.
        data_age_stack: age labels forwarded to ``preprocess``.
        test_size: split parameter forwarded to ``preprocess``.

    Returns:
        Tuple ``(clf, Age_acc_test, Age_acc_train)``: the fitted classifier
        and the *number* (not the fraction) of correctly predicted test and
        training samples.
    """
    AgeX_train, AgeX_test, AgeY_train, AgeY_test = preprocess(
        data_feature_stack, data_age_stack, test_size)

    print("fitting Age Clssfifer...")
    clf = OneVsRestClassifier(LinearSVC(C=0.001)).fit(AgeX_train, AgeY_train)

    print("predicting Age...")
    Age_test_result = clf.predict(AgeX_test)
    Age_train_result = clf.predict(AgeX_train)

    # Counts of exact matches; clf.score(...) would give the fraction instead.
    Age_acc_test = np.sum(Age_test_result == AgeY_test)
    Age_acc_train = np.sum(Age_train_result == AgeY_train)

    # The leftover pdb.set_trace() breakpoint and the unused error statistics
    # that were only inspected from the debugger have been removed.
    return clf, Age_acc_test, Age_acc_train
Ejemplo n.º 5
0
def test_ovr_multiclass():
    """Fit one-vs-rest on a toy multiclass problem with several base estimators."""
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]])
    y = ["eggs", "spam", "ham", "eggs", "ham"]
    # The same labels as a label indicator matrix.
    Y = np.array([[0, 0, 1],
                  [0, 1, 0],
                  [1, 0, 0],
                  [0, 0, 1],
                  [1, 0, 0]])

    classes = set("ham eggs spam".split())

    base_estimators = (MultinomialNB(), LinearSVC(random_state=0),
                       LinearRegression(), Ridge(),
                       ElasticNet())
    for base_clf in base_estimators:
        # Labels given as class names.
        clf = OneVsRestClassifier(base_clf)
        clf.fit(X, y)
        assert_equal(set(clf.classes_), classes)
        y_pred = clf.predict(np.array([[0, 0, 4]]))[0]
        # Both sides are compared as sets of characters.
        assert_equal(set(y_pred), set("eggs"))

        # Labels given as a label indicator matrix.
        clf = OneVsRestClassifier(base_clf)
        clf.fit(X, Y)
        y_pred = clf.predict([[0, 0, 4]])[0]
        assert_array_equal(y_pred, [0, 0, 1])
Ejemplo n.º 6
0
def test_ovr_fit_predict_sparse():
    """Check one-vs-rest fitted on sparse label matrices against dense labels."""
    sparse_formats = [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix,
                      sp.dok_matrix, sp.lil_matrix]
    for to_sparse in sparse_formats:
        base_clf = MultinomialNB(alpha=1)

        X, Y = datasets.make_multilabel_classification(
            n_samples=100,
            n_features=20,
            n_classes=5,
            n_labels=3,
            length=50,
            allow_unlabeled=True,
            random_state=0,
        )

        X_train, Y_train = X[:80], Y[:80]
        X_test = X[80:]

        # Reference: dense label matrix.
        clf = OneVsRestClassifier(base_clf)
        clf.fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        # Same problem with the labels converted to the sparse format.
        clf_sprs = OneVsRestClassifier(base_clf)
        clf_sprs.fit(X_train, to_sparse(Y_train))
        Y_pred_sprs = clf_sprs.predict(X_test)

        assert_true(clf.multilabel_)
        assert_true(sp.issparse(Y_pred_sprs))
        assert_array_equal(Y_pred_sprs.toarray(), Y_pred)

        # Test predict_proba: predict assigns a label if the probability
        # that the sample has the label is greater than 0.5.
        Y_proba = clf_sprs.predict_proba(X_test)
        assert_array_equal(Y_proba > 0.5, Y_pred_sprs.toarray())

        # Test decision_function: a positive score means the label is set.
        clf_sprs = OneVsRestClassifier(svm.SVC())
        clf_sprs.fit(X_train, to_sparse(Y_train))
        scores = clf_sprs.decision_function(X_test)
        dec_pred = (scores > 0).astype(int)
        assert_array_equal(dec_pred, clf_sprs.predict(X_test).toarray())
Ejemplo n.º 7
0
class SVMSentiment:
    """One-vs-rest RBF SVM sentiment model with a pickled on-disk cache."""

    def __init__(self):
        self.max_length = 500
        self.batch_size = 50
        self.model = OneVsRestClassifier(
            svm.SVC(kernel='rbf', gamma=1, C=1, tol=0.0001, cache_size=5000))

    def configureSVMModel(self, TrainX, TrainY, validX, validY):
        """Fit the model, or load it from ``SVMScores.pkl`` if that file exists.

        The cache file is looked up in (and written to) the current working
        directory. ``validX`` and ``validY`` are accepted but not used.
        """
        print('Configuring the SVM Model')
        currPath = os.getcwd()
        currFiles = os.listdir(currPath)
        print('################### Test #####################')
        print(currFiles.count('SVMScores.pkl'))
        if 'SVMScores.pkl' not in currFiles:
            self.model.fit(TrainX, TrainY)
            # Saving model scores
            joblib.dump(self.model, currPath + '/SVMScores.pkl')
        else:
            print('Loading already existing Model')
            # NOTE(review): joblib.load unpickles the file; only load caches
            # written by a trusted run.
            self.model = joblib.load(currPath + '/SVMScores.pkl')

    def evaluateSVMModel(self, TestX, TestY):
        """Print the model score, then every (prediction, expected) pair."""
        print(self.model.score(TestX, TestY))

        # Each sample is predicted on its own, reshaped to a single row.
        predicted_data = [
            [self.model.predict(TestX[i].reshape(1, -1)), TestY[i]]
            for i in range(len(TestX))
        ]

        print("Predicted Data")
        print(predicted_data)

    def predictSentiment(self, dataX, dataY):
        """Predict every sample in ``dataX`` and return the LAST prediction.

        NOTE(review): only the final sample's prediction is printed and
        returned, and an empty ``dataX`` raises NameError because nothing is
        assigned -- confirm whether that is intended before changing it.
        """
        print('@@@@@@@@@@@@@@@@ Length of test data : ',len(dataX))
        for i in range(len(dataX)):
            predicted_data = self.model.predict(dataX[i].reshape(1, -1))
            expected_out = dataY[i]

        print('############### Predicted data :',predicted_data,' ; ; ',expected_out)
        return predicted_data

    def getTrainTestData(self):
        """Load ``((trainX, trainY), (testX, testY))`` from the two pickle files.

        The files ``trainingdata.pkl`` and ``testingdata.pkl`` are read from
        the current working directory.
        """
        print('Loading Training and Test data')
        # Context managers make sure both handles are closed; the previous
        # code rebound ``f`` and never closed either file.
        # NOTE(review): unpickling executes arbitrary code -- these files
        # must come from a trusted source.
        with open('trainingdata.pkl', 'rb') as f:
            (trainX, trainY) = cPickle.load(f)
        with open('testingdata.pkl', 'rb') as f:
            (testX, testY) = cPickle.load(f)

        return ((trainX, trainY), (testX, testY))

    def getValidationData(self, dataX, dataY):
        """Return the first ``batch_size`` rows of both 2-D inputs."""
        return dataX[0:self.batch_size, :], dataY[0:self.batch_size, :]
Ejemplo n.º 8
0
class ScikitSVM:
    """One-vs-rest linear SVM tagger built on top of ``ScikitFeature``."""

    def __init__(self, train_file, tags_file, tag_start, tag_end):
        """Extract features from the training files and fit the classifier."""
        self.sf = ScikitFeature(train_file, tags_file, tag_start, tag_end, max_features=10000)
        print("done getting features")
        self.classifier = OneVsRestClassifier(LinearSVC(C=32, random_state=0), n_jobs=1)
        self.classifier.fit(self.sf.training_text, self.sf.training_labels_tuple)
        print("done fitting")

    def predict(self, text):
        """Return the labels predicted for a single piece of text."""
        text_vector = self.sf.get_text_vector(text)
        labels = self.classifier.predict(text_vector)
        return self.sf.get_labels(labels)

    def test(self, test_file):
        """Print precision, recall and mean per-question F1 on ``test_file``."""
        test_matrix = self.sf.get_file_text(test_file)
        predicted_labels = self.classifier.predict(test_matrix)
        true_labels = self.sf.get_file_labels(test_file)

        N_true_tags = 0.0
        N_predict_tags = 0.0
        N_correct = 0.0
        F1 = []
        for i, predicted in enumerate(predicted_labels):
            truth = true_labels[i]
            N_true_tags += len(truth)
            N_predict_tags += len(predicted)
            this_correct = float(sum(1 for label_id in predicted
                                     if label_id in truth))
            N_correct += this_correct
            if this_correct == 0:
                F1.append(0)
            else:
                # this_correct > 0 implies both label lists are non-empty.
                p = this_correct / len(predicted)
                r = this_correct / len(truth)
                F1.append(2 * p * r / (p + r))

        print("%s %s %s" % (N_correct, N_predict_tags, N_true_tags))
        # Guard the totals: with no predicted (or no true) tags the ratios
        # are reported as 0 instead of raising ZeroDivisionError.
        p = N_correct / N_predict_tags if N_predict_tags else 0.0
        r = N_correct / N_true_tags if N_true_tags else 0.0
        mean_f1 = np.average(F1) if F1 else 0.0
        print("Precision: %f %%" % (p * 100))
        print("Recall: %f %%" % (r * 100))
        print("Mean F1: %f" % (mean_f1))

    def get_tags(self, test_file, output_file):
        """Write one ``id, space-joined tag names`` CSV row per test entry."""
        print("Getting tags for " + test_file)
        test_matrix = self.sf.get_file_text(test_file)
        predicted_labels = self.classifier.predict(test_matrix)
        predicted_label_names = [self.sf.get_labels_from_id(label_ids)
                                 for label_ids in predicted_labels]
        ids = self.sf.get_file_ids(test_file)
        # The context manager closes the file even if a row fails to write.
        with open(output_file, 'w') as new_csv:
            writer = csv.writer(new_csv, delimiter=',', quotechar='"')
            for i, row_id in enumerate(ids):
                tags = " ".join(predicted_label_names[i])
                writer.writerow([row_id, tags])
        print("Done.")
Ejemplo n.º 9
0
def test_ovr_pipeline():
    """Check that a one-step Pipeline base estimator predicts like the bare one."""
    # Pipeline is a bit unusual with respect to duck-typing predict_proba and
    # decision_function, hence this dedicated length-one case.
    features, labels = iris.data, iris.target

    piped = OneVsRestClassifier(Pipeline([("tree", DecisionTreeClassifier())]))
    piped.fit(features, labels)

    plain = OneVsRestClassifier(DecisionTreeClassifier())
    plain.fit(features, labels)

    assert_array_equal(plain.predict(features), piped.predict(features))
Ejemplo n.º 10
0
def test_ovr_pipeline():
    """Compare one-vs-rest over a length-one Pipeline with the plain estimator."""
    # This test is needed because the multiclass estimators may fail to
    # detect the presence of predict_proba or decision_function.
    single_step = Pipeline([("tree", DecisionTreeClassifier())])
    wrapped = OneVsRestClassifier(single_step)
    wrapped.fit(iris.data, iris.target)

    reference = OneVsRestClassifier(DecisionTreeClassifier())
    reference.fit(iris.data, iris.target)

    expected = reference.predict(iris.data)
    actual = wrapped.predict(iris.data)
    assert_array_equal(expected, actual)
Ejemplo n.º 11
0
def apply_Model(temp_data, selectModel):
    """Train and evaluate a floor and a ceil classifier of the chosen kind.

    Args:
        temp_data: mapping with the keys ``X_train_*``, ``X_test_*``,
            ``y_train_*``, ``y_test_*`` and ``ind_*`` for both ``ceil`` and
            ``floor``.
        selectModel: 1 = one-vs-rest LinearSVC, 2 = decision tree,
            3 = nearest centroid, 4 = SGD classifier (hinge loss, l2).

    Returns:
        Tuple ``(data, acc_ceil, acc_floor, recall_ceil, recall_floor,
        result_ceil, result_floor, conf_mat_floor, conf_mat_ceil)`` where
        ``data`` is a copy of the inputs extended with ``acc_ceil_train`` and
        ``acc_floor_train``, the ``acc_*`` values are percentages and the
        remaining matrices are confusion matrices.

    Raises:
        ValueError: if ``selectModel`` is not one of 1, 2, 3, 4 (this used to
            surface later as an unbound-variable error).
    """
    copied_keys = (
        'X_train_ceil', 'X_test_ceil', 'y_train_ceil', 'y_test_ceil',
        'ind_ceil',
        'X_train_floor', 'X_test_floor', 'y_train_floor', 'y_test_floor',
        'ind_floor',
    )
    data = {key: temp_data[key] for key in copied_keys}

    if selectModel == 1:
        print("OneVsRest")
        classifier_floor = OneVsRestClassifier(LinearSVC(random_state=0)).fit(data['X_train_floor'], data['y_train_floor'])
        classifier_ceil = OneVsRestClassifier(LinearSVC(random_state=0)).fit(data['X_train_ceil'], data['y_train_ceil'])
    elif selectModel == 2:
        print("Decision Tree")
        classifier_floor = tree.DecisionTreeClassifier().fit(data['X_train_floor'], data['y_train_floor'])
        classifier_ceil = tree.DecisionTreeClassifier().fit(data['X_train_ceil'], data['y_train_ceil'])
    elif selectModel == 3:
        print("Nearest Centroid")
        classifier_floor = NearestCentroid().fit(data['X_train_floor'], np.ravel(data['y_train_floor']))
        classifier_ceil = NearestCentroid().fit(data['X_train_ceil'], np.ravel(data['y_train_ceil']))
    elif selectModel == 4:
        print("SGD Classifier")
        classifier_floor = SGDClassifier(loss="hinge", penalty="l2").fit(data['X_train_floor'], np.ravel(data['y_train_floor']))
        classifier_ceil = SGDClassifier(loss="hinge", penalty="l2").fit(data['X_train_ceil'], np.ravel(data['y_train_ceil']))
    else:
        raise ValueError("selectModel must be 1, 2, 3 or 4, got %r" % (selectModel,))

    # NOTE(review): predictions are passed as the first confusion_matrix
    # argument throughout; the argument order is kept because the returned
    # matrices' orientation is part of what callers receive. Trace and sum
    # (and therefore the accuracies) do not depend on it.
    train_predict_floor = classifier_floor.predict(data['X_train_floor'])
    conf_mat_floor = confusion_matrix(train_predict_floor, data['y_train_floor'])
    train_predict_ceil = classifier_ceil.predict(data['X_train_ceil'])
    conf_mat_ceil = confusion_matrix(train_predict_ceil, data['y_train_ceil'])

    y_predict_ceil = classifier_ceil.predict(data['X_test_ceil'])
    result_ceil = confusion_matrix(y_predict_ceil, data['y_test_ceil'])

    y_predict_floor = classifier_floor.predict(data['X_test_floor'])
    result_floor = confusion_matrix(y_predict_floor, data['y_test_floor'])

    precision_floor, recall_floor, _, _ = precision_recall_fscore_support(data['y_test_floor'], y_predict_floor)
    precision_ceil, recall_ceil, _, _ = precision_recall_fscore_support(data['y_test_ceil'], y_predict_ceil)

    # Accuracy in percent: correctly classified (diagonal) over all samples.
    acc_ceil = result_ceil.trace()*100/result_ceil.sum()
    acc_ceil_train = conf_mat_ceil.trace()*100/conf_mat_ceil.sum()
    acc_floor = result_floor.trace()*100/result_floor.sum()
    acc_floor_train = conf_mat_floor.trace()*100/conf_mat_floor.sum()

    data['acc_ceil_train'] = acc_ceil_train
    data['acc_floor_train'] = acc_floor_train

    return data, acc_ceil, acc_floor, recall_ceil, recall_floor, result_ceil, result_floor, conf_mat_floor, conf_mat_ceil
Ejemplo n.º 12
0
def runDigits(n, skclf, myclf):
    """Compare ``myclf`` (wrapped one-vs-rest) with ``skclf`` on MNIST features.

    Loads the feature set of size ``n``, splits it into 10 folds, trains on
    the split returned for fold 0 and prints both classifiers' predictions
    and accuracies on the held-out part.

    Args:
        n: size argument passed to ``hw6u.load_mnist_features``.
        skclf: reference classifier, fitted directly.
        myclf: classifier under test, wrapped in ``OneVsRestClassifier``.
    """
    df = hw6u.load_mnist_features(n)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)

    # ``float`` replaces the ``np.float`` alias, which was only ever a
    # synonym for the builtin and no longer exists in current NumPy.
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y, dtype=float), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=float)

    print('my fit')
    clf = OneVsRestClassifier(myclf).fit(X, y)
    print('scikit fit')
    skclf = skclf.fit(X, y)
    print('my predict')
    y_pred = clf.predict(X_test)
    myacc = accuracy_score(y_test, y_pred)
    print('({})'.format(myacc))
    print('scikit predict')
    sk_pred = skclf.predict(X_test)
    print(sk_pred)
    print(y_test)
    print(y_pred)
    print('SciKit Accuracy: {}  My Accuracy: {}'.format(accuracy_score(y_test, sk_pred), myacc))
Ejemplo n.º 13
0
class ACMClassificator(BaseACMClassificator):
    """Multilabel tagger: bag-of-words counts fed to one-vs-rest extra trees."""

    def __init__(self):
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        # Parameters of the per-label base estimator, spelled out explicitly.
        tree_params = dict(
            criterion="gini",
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.,
            max_features="auto",
            max_leaf_nodes=None,
            class_weight=None,
        )
        self.classificator = OneVsRestClassifier(
            ExtraTreeClassifier(**tree_params), n_jobs=-1)

    def _prepare_problems(self, problems):
        """Vectorize the ``statement`` attribute of every problem."""
        statements = [problem.statement for problem in problems]
        return self.vectorizer.transform(statements)

    def fit(self, problems, tags):
        """Fit the vectorizer, the label binarizer and the classifier."""
        nltk.download('punkt', quiet=True)
        statements = [problem.statement for problem in problems]
        self.vectorizer.fit(statements)
        features = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        # The classifier is trained on a dense array and binarized tags.
        targets = self.mlb.transform(tags)
        self.classificator.fit(features.toarray(), targets)

    def predict(self, problems):
        """Return the predicted tags for each problem, decoded by the binarizer."""
        features = self._prepare_problems(problems)
        indicator = self.classificator.predict(features.toarray())
        return self.mlb.inverse_transform(indicator)
Ejemplo n.º 14
0
def run_classifier(sentences, labels, test_docs):
    """Train a multilabel sentence classifier and write predictions to CSV.

    A TF-IDF matrix is built from ``sentences`` via ``tf_idf_fit_transform``
    (defined elsewhere), ``labels`` are binarized, and a one-vs-rest
    ``LinearSVC`` is fitted. The sentences obtained from ``test_docs`` via
    ``doc2sentences`` are then classified and written to ``classified.csv``
    as ``(sentence, list of predicted classes)`` rows.

    Args:
        sentences: training sentences.
        labels: one collection of labels per training sentence.
        test_docs: documents to split into sentences and classify.
    """
    import csv

    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.preprocessing import MultiLabelBinarizer
    # The class is spelled ``LinearSVC``; importing ``linearSVC`` raised
    # ImportError before anything could be trained.
    from sklearn.svm import LinearSVC

    train_matrix, tfidf = tf_idf_fit_transform(sentences)

    test_sentences = doc2sentences(test_docs)
    sentence_matrix = tfidf.transform(test_sentences)
    print("Shape of sentence matrix : ", sentence_matrix.shape)

    mlb = MultiLabelBinarizer()
    label_matrix = mlb.fit_transform(labels)

    classifier = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
    classifier.fit(train_matrix, label_matrix)
    predictions = classifier.predict(sentence_matrix)

    with open("classified.csv", "w") as fl:
        writer = csv.writer(fl)
        for i, sentence in enumerate(test_sentences):
            # Column x of the indicator row corresponds to mlb.classes_[x].
            curr_pred = [mlb.classes_[x]
                         for x in range(predictions.shape[1])
                         if predictions[i][x] == 1]
            writer.writerow((sentence, curr_pred))
Ejemplo n.º 15
0
def main():
    """Classify the images under ``images/`` with PCA + one-vs-rest LinearSVC.

    The class of an image is the part of its file name before the first
    underscore. About 70% of the images are randomly used for training; a
    crosstab of actual versus predicted classes on the rest is printed.
    """
    img_dir = 'images/'
    images = [img_dir + name for name in os.listdir(img_dir)]
    labels = [path.split('/')[-1].split('_')[0] for path in images]

    # Distinct labels, ordered by their first appearance in ``labels``.
    ordered_labels = sorted(set(labels), key=labels.index)
    label2ids = {label: idx for idx, label in enumerate(ordered_labels)}
    y = np.array([label2ids[label] for label in labels])

    data = np.array([flatten_image(img_to_matrix(image_file))
                     for image_file in images])

    # training samples
    is_train = np.random.uniform(0, 1, len(data)) <= 0.7
    is_test = ~is_train
    train_X, train_y = data[is_train], y[is_train]

    # training a classifier
    pca = RandomizedPCA(n_components=5)
    train_X = pca.fit_transform(train_X)
    multi_svm = OneVsRestClassifier(LinearSVC())
    multi_svm.fit(train_X, train_y)

    # evaluating the model
    test_X, test_y = data[is_test], y[is_test]
    test_X = pca.transform(test_X)
    predicted = multi_svm.predict(test_X)
    print(pd.crosstab(test_y, predicted,
                      rownames=['Actual'], colnames=['Predicted']))
Ejemplo n.º 16
0
    def _calculate(self, X, y, categorical):
        """Return the mean 10-fold cross-validated accuracy of an LDA model.

        Targets that are one-dimensional (or have a single column) use
        stratified folds and a plain ``LinearDiscriminantAnalysis``; other
        targets use plain folds and LDA wrapped in ``OneVsRestClassifier``.

        Args:
            X: feature array, indexable by the fold index arrays.
            y: target array with a ``shape`` attribute.
            categorical: not used by this method.

        Returns:
            The accuracy averaged over the folds, or NaN if LDA raised
            ``LinAlgError`` or ``ValueError`` (a warning is logged).
        """
        import sklearn.discriminant_analysis

        n_splits = 10
        single_output = len(y.shape) == 1 or y.shape[1] == 1
        if single_output:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=n_splits)
        else:
            kf = sklearn.model_selection.KFold(n_splits=n_splits)

        accuracy = 0.
        try:
            for train, test in kf.split(X, y):
                lda = sklearn.discriminant_analysis.LinearDiscriminantAnalysis()
                if not single_output:
                    lda = OneVsRestClassifier(lda)
                lda.fit(X[train], y[train])

                predictions = lda.predict(X[test])
                accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
            # Divide by the fold count instead of a second hard-coded 10.
            return accuracy / n_splits
        except (scipy.linalg.LinAlgError, ValueError) as e:
            # The message used to claim 0 was returned; NaN is what callers
            # get. ``np.nan`` is the spelling available in every NumPy version.
            self.logger.warning("LDA failed: %s Returned NaN instead!" % e)
            return np.nan
Ejemplo n.º 17
0
def experienceSVMTrain(trainData, testData, testCounts, classifierNumber = 0):
    """Train the selected classifier on scaled data and print test statistics.

    ``classifierNumber`` selects: 0 one-vs-rest SVC, 1 SVC, 2 random forest,
    anything else k-nearest neighbours. ``trainData`` and ``testData`` are
    mappings with ``'X'`` and ``'Y'`` entries; ``testCounts`` maps each genre
    in the module-level ``genreSet`` to its number of test samples.
    """
    # Builders are lazy so that only the chosen classifier is constructed.
    builders = {
        0: (lambda: OneVsRestClassifier(svm.SVC()), 'OneVsRestClassifier'),
        1: (lambda: svm.SVC(), 'SupportVectorClassifier'),
        2: (lambda: RandomForestClassifier(n_estimators= 1000, n_jobs = 4),
            'RandomForestClassifier'),
    }
    fallback = (lambda: KNeighborsClassifier(n_neighbors=3),
                'KNeighborsClassifier')
    build, algorithmName = builders.get(classifierNumber, fallback)
    classifier = build()

    print_(algorithmName, 'has been started to train the data by', nowStr())
    classifier.fit(preprocessing.scale(trainData['X']), trainData['Y'])
    print_(algorithmName, 'has been started to predict the test data by', nowStr())
    predictions = classifier.predict(preprocessing.scale(testData['X']))

    # Count correct predictions overall and per genre.
    truePositives = 0
    truePositiveCounts = {genre: 0 for genre in genreSet}
    predictionCount = len(predictions)
    for i, predicted in enumerate(predictions):
        expected = testData['Y'][i]
        if predicted == expected:
            truePositives += 1
            truePositiveCounts[genreSet[expected]] += 1

    print_(algorithmName, 'Experiment has been finished by', nowStr())
    print_('\nGeneral Test Accuracy = %.3f' % (truePositives / float(predictionCount)))
    print('\nTotal Number of predictions:', predictionCount)
    print('Number of true predictions:  ', truePositives)
    print('Number of false predictions: ', predictionCount-truePositives)
    print_('\nTesting distribution:            ', {genre: testCounts[genre] for genre in genreSet})
    print_('Distribution of true predictions: ', truePositiveCounts)
    falseNegativeCounts = {genre: testCounts[genre]-truePositiveCounts[genre] for genre in genreSet}
    print_('Distribution of false predictions:', falseNegativeCounts, '\n')
Ejemplo n.º 18
0
def runDigitsDensity(n, _i, j):
    """Evaluate the density variant of ``hw7u.MyKNN`` on MNIST features.

    Args:
        n: size argument passed to ``hw6u.load_mnist_features``.
        _i: unused.
        j: index into ``['minkowski', 'cosine', 'gaussian', 'poly2']``
            selecting the metric.
    """
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    # NOTE(review): ``ma`` is only needed by the disabled scikit comparison;
    # the lookup is kept in case constructing the kernel validates the metric.
    ma = hw7u.Kernel(ktype=metric[j]+'_sci').compute
    myclf = hw7u.MyKNN(metric=metric[j], density=True)

    df = hw6u.load_mnist_features(n)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)

    # ``float`` replaces the ``np.float`` alias, which was only ever a
    # synonym for the builtin and no longer exists in current NumPy.
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y, dtype=float), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=float)

    print('my fit')
    clf = OneVsRestClassifier(myclf).fit(X, y)
    print('scikit fit')
    print('my predict')
    y_pred = clf.predict(X_test)
    myacc = accuracy_score(y_test, y_pred)
    print('({})'.format(myacc))
    print(y_test)
    print(y_pred)
    print('My Accuracy: {}'.format(myacc))
Ejemplo n.º 19
0
def test_ovr_always_present():
    """Test that ovr works with classes that are always present or absent."""
    # Note: this tests the case where _ConstantPredictor is utilised.
    X = np.ones((10, 2))
    X[:5, :] = 0

    # y has one variable label (column 0) and two always-present labels.
    y = np.zeros((10, 3))
    y[5:, 0] = 1
    y[:, 1] = 1
    y[:, 2] = 1

    ovr = OneVsRestClassifier(LogisticRegression())
    assert_warns(UserWarning, ovr.fit, X, y)
    y_pred = ovr.predict(X)
    assert_array_equal(np.array(y_pred), np.array(y))
    y_pred = ovr.decision_function(X)
    assert_equal(np.unique(y_pred[:, -2:]), 1)
    y_pred = ovr.predict_proba(X)
    assert_array_equal(y_pred[:, -1], np.ones(X.shape[0]))

    # y has a constantly absent label
    y = np.zeros((10, 2))
    y[5:, 0] = 1  # variable label
    ovr = OneVsRestClassifier(LogisticRegression())
    assert_warns(UserWarning, ovr.fit, X, y)
    y_pred = ovr.predict_proba(X)
    assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0]))
Ejemplo n.º 20
0
Archivo: svm.py Proyecto: lkprof/sema
def svm():
    """Train a one-vs-rest SVC on ``12trainset`` and predict ``12testdata``.

    Both files are read in svmlight format from the working directory. The
    predictions are written to ``result.txt``, one per line, and feature
    selection diagnostics plus the elapsed time are printed.
    """
    # load data
    # (the former ``x.todense()`` calls discarded their result and only
    # allocated dense copies, so they are gone)
    x_train, y_train = load_svmlight_file("12trainset")
    x_test, _y_test = load_svmlight_file("12testdata")

    # ``k`` is passed by keyword: recent scikit-learn rejects it positionally.
    sk = SelectKBest(f_classif, k=9).fit(x_train, y_train)
    x_new = sk.transform(x_train)
    print(sk.scores_)
    print(x_new.shape)
    print(sk.get_support())

    # classifier
    # NOTE(review): the classifier is trained on the full feature matrix, not
    # on the selected features ``x_new`` -- confirm whether that is intended.
    clf = SVC(C=2, gamma=2)
    ovrclf = OneVsRestClassifier(clf, n_jobs=-1)
    ovrclf.fit(x_train, y_train)
    y_pred = ovrclf.predict(x_test)

    # write result
    with open("result.txt", "w") as fw:
        for st in y_pred.tolist():
            fw.write(str(st)+'\n')
    print(np.array(y_pred).shape)

    # ``start_time`` is not defined in this function; presumably a
    # module-level timestamp -- TODO confirm.
    print(time.time()-start_time)
Ejemplo n.º 21
0
def test_ovr_multilabel_predict_proba():
    """Check ``predict_proba`` of one-vs-rest on generated multilabel data."""
    base_clf = MultinomialNB(alpha=1)
    for allow_unlabeled in (False, True):
        X, Y = datasets.make_multilabel_classification(
            n_samples=100,
            n_features=20,
            n_classes=5,
            n_labels=3,
            length=50,
            allow_unlabeled=allow_unlabeled,
            return_indicator=True,
            random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test = X[80:]

        clf = OneVsRestClassifier(base_clf)
        clf.fit(X_train, Y_train)

        # Decision-function-only estimator: predict_proba must be refused.
        decision_only = OneVsRestClassifier(svm.SVR())
        decision_only.fit(X_train, Y_train)
        assert_raises(AttributeError, decision_only.predict_proba, X_test)

        # Estimator whose predict_proba is disabled through its parameters.
        decision_only = OneVsRestClassifier(svm.SVC(probability=False))
        decision_only.fit(X_train, Y_train)
        assert_raises(AttributeError, decision_only.predict_proba, X_test)

        Y_pred = clf.predict(X_test)
        Y_proba = clf.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        assert_array_equal(Y_proba > .5, Y_pred)
Ejemplo n.º 22
0
def run_classifier(sentences, labels, test_doc_list, output_file_path_list):
    """Train a multilabel sentence classifier and write one XML file per document.

    A TF-IDF matrix is built from ``sentences`` via ``tf_idf_fit_transform``
    and a one-vs-rest ``LinearSVC`` is fitted on the binarized ``labels``.
    Each document in ``test_doc_list`` is split into sentences, classified,
    and written to the matching path in ``output_file_path_list`` as a
    ``<doc>`` element with one ``<Sent classes="...">`` child per sentence.
    """
    import numpy as np

    train_matrix, tfidf = tf_idf_fit_transform(sentences)

    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    label_matrix = mlb.fit_transform(labels)

    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC
    classifier = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
    classifier.fit(train_matrix, label_matrix)

    for test_doc, output_file_path in zip(test_doc_list, output_file_path_list):
        test_sentences = doc2sentences([test_doc])
        sentence_matrix = tfidf.transform(test_sentences)
        print("Shape of sentence matrix : ", sentence_matrix.shape)
        predictions = classifier.predict(sentence_matrix)

        from lxml import etree
        document = etree.Element('doc')
        doc_tree = etree.ElementTree(document)
        n_classes = predictions.shape[1]
        for i in range(len(test_sentences)):
            # Column x of the indicator row corresponds to mlb.classes_[x].
            curr_pred = [mlb.classes_[x] for x in range(n_classes)
                         if predictions[i][x] == 1]
            node = etree.SubElement(document, "Sent",
                                    classes=", ".join(curr_pred))
            node.text = test_sentences[i]
        doc_tree.write(output_file_path)
Ejemplo n.º 23
0
def run(data_path):
    """Train and evaluate a one-vs-rest LinearSVC on a small MNIST subset.

    Args:
        data_path: only echoed in the first message; the data itself is
            fetched with ``fetch_mldata('MNIST original')``.
    """
    print("Reading the dataset: %s" % (data_path,))
    mnist = fetch_mldata('MNIST original')
    mnist.data, mnist.target = shuffle(mnist.data, mnist.target)

    # Trunk the data
    n_train = 600
    n_test = 400

    # Define training and testing sets.
    # One sample without replacement is drawn and split, so the test set can
    # no longer contain training samples (the two sets used to be sampled
    # independently). ``range`` also keeps random.sample working on Python
    # versions that reject non-sequence populations such as NumPy arrays.
    random.seed(0)
    chosen = random.sample(range(len(mnist.data)), n_train + n_test)
    train_idx, test_idx = chosen[:n_train], chosen[n_train:]
    X_train, y_train = mnist.data[train_idx], mnist.target[train_idx]
    X_test, y_test = mnist.data[test_idx], mnist.target[test_idx]

    # Apply a learning algorithm
    print("Applying a learning algorithm...")
    clf = OneVsRestClassifier(LinearSVC()).fit(X_train, y_train)

    # Make a prediction
    print("Making predictions...")
    y_pred = clf.predict(X_test)

    print(y_pred)

    # Evaluate the prediction
    # NOTE(review): the target has more than two classes, so the averaging
    # mode is stated explicitly; 'weighted' is assumed to be the default the
    # original call relied on -- confirm against the scikit-learn version.
    print("Evaluating results...")
    print("Precision: \t%s" % (metrics.precision_score(y_test, y_pred, average='weighted'),))
    print("Recall: \t%s" % (metrics.recall_score(y_test, y_pred, average='weighted'),))
    print("F1 score: \t%s" % (metrics.f1_score(y_test, y_pred, average='weighted'),))
    print("Mean accuracy: \t%s" % (clf.score(X_test, y_test),))
Ejemplo n.º 24
0
def test_classifier_chain_vs_independent_models():
    """Check that a classifier chain beats independent models on yeast."""
    # Verify that an ensemble of classifier chains (each of length
    # N) can achieve a higher Jaccard similarity score than N independent
    # models
    yeast = fetch_mldata('yeast')
    X = yeast['data']
    Y = yeast['target'].transpose().toarray()

    # The first 2000 rows are used for training, the rest for testing.
    n_train = 2000
    X_train, X_test = X[:n_train, :], X[n_train:, :]
    Y_train, Y_test = Y[:n_train, :], Y[n_train:, :]

    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)

    chain_order = np.array([0, 2, 4, 6, 8, 10,
                            12, 1, 3, 5, 7, 9,
                            11, 13])
    chain = ClassifierChain(LogisticRegression(), order=chain_order)
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    chain_score = jaccard_similarity_score(Y_test, Y_pred_chain)
    ovr_score = jaccard_similarity_score(Y_test, Y_pred_ovr)
    assert_greater(chain_score, ovr_score)
Ejemplo n.º 25
0
    def setUp(self):
        """Fit one-vs-rest SVCs on binarized iris through ModelFrame and directly."""
        import sklearn.svm as svm
        import sklearn.preprocessing as pp
        from sklearn.multiclass import OneVsRestClassifier

        def make_estimator():
            # Both estimators share the same probability-enabled SVC setup.
            svc = svm.SVC(probability=True, random_state=self.random_state)
            return OneVsRestClassifier(svc)

        # iris with its target binarized by LabelBinarizer
        iris = datasets.load_iris()
        self.data = iris.data
        self.target = pp.LabelBinarizer().fit_transform(iris.target)
        self.df = pdml.ModelFrame(self.data, target=self.target)
        self.assertEqual(self.df.shape, (150, 7))

        # Estimator driven through the ModelFrame API.
        estimator1 = make_estimator()
        self.df.fit(estimator1)
        self.df.predict(estimator1)
        self.assertTrue(isinstance(self.df.predicted, pdml.ModelFrame))

        # Reference estimator fitted directly on the arrays.
        estimator2 = make_estimator()
        estimator2.fit(self.data, self.target)
        self.pred = estimator2.predict(self.data)
        self.proba = estimator2.predict_proba(self.data)
        self.decision = estimator2.decision_function(self.data)

        # argument for classification reports
        self.labels = np.array([2, 1, 0])
def main():
    """Train and evaluate a one-vs-rest LinearSVC on tweet word vectors.

    Reads GloVe vectors and the SemEval tweet file from hard-coded relative
    paths, fits the classifier on the first 70% of the samples, then prints
    the score on the remaining 30% followed by one "predicted true" line per
    test sample.
    """
    word_vec_dict = readGloveData("./glove.twitter.27B/glove.twitter.27B.25d.txt")
    tweets = readTweets("./dataset_raw/semeval2016-task6-trainingdata.txt")

    # Every entry except the last is vectorized; the last entry holds the
    # per-tweet class names used for the targets below.
    tweetVectors = getTweetVectors(tweets[0 : len(tweets) - 1], word_vec_dict)
    print(tweets[0])
    print(getSumVectors(tweets[0], word_vec_dict))

    # NOTE(review): "favor" and "against" both map to 1, which makes this a
    # binary "has a stance" target -- confirm that "against" should not be 2.
    mapping = {"favor": 1, "none": 0, "against": 1}

    tweetClasses = np.asarray([mapping[x] for x in tweets[-1]])
    tweetData = np.asarray(tweetVectors)
    print(tweetClasses.shape)
    print(tweetData.shape)
    X = tweetData
    Y = tweetClasses
    clf = OneVsRestClassifier(LinearSVC())
    # clf = SVC(kernel='rbf', gamma=1.5, random_state=34543)

    # Ordered 70/30 split (no shuffling).
    x_split = int(0.7 * len(X))
    y_split = int(0.7 * len(Y))
    X_train, X_test = X[0:x_split], X[x_split : len(X)]
    y_train, y_test = Y[0:y_split], Y[y_split : len(Y)]

    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    y_pred = clf.predict(X_test)
    for predicted, actual in zip(y_pred, y_test):
        print(str(predicted) + " " + str(actual))
Ejemplo n.º 27
0
    def conduct_test(base_clf, test_predict_proba=False):
        """Check a one-vs-rest wrapper around ``base_clf``.

        Uses ``X``, ``y``, ``Y`` and ``classes`` from the enclosing scope.
        When ``test_predict_proba`` is true, the probability output is
        checked against ``predict`` as well.
        """
        query = np.array([[0, 0, 4]])
        ovr = OneVsRestClassifier(base_clf).fit(X, y)
        assert_equal(set(ovr.classes_), classes)
        first_label = ovr.predict(query)[0]
        # set("eggs") is the set of characters of the string "eggs".
        assert_equal(set(first_label), set("eggs"))

        if test_predict_proba:
            X_test = np.array([[0, 0, 4]])
            proba = ovr.predict_proba(X_test)
            assert_equal(2, len(proba[0]))
            most_likely = np.argmax(proba, axis=1)
            assert_equal(ovr.classes_[most_likely], ovr.predict(X_test))

        # test input as label indicator matrix
        indicator_ovr = OneVsRestClassifier(base_clf).fit(X, Y)
        first_label = indicator_ovr.predict([[3, 0, 0]])[0]
        assert_equal(first_label, 1)
Ejemplo n.º 28
0
def test_decision_function_shape_two_class():
    """One-vs-rest wrapped SVC/NuSVC predict one label per sample.

    Runs for 2 and 3 blob centers with ``decision_function_shape="ovr"``
    set on the wrapped estimator.
    """
    svm_classes = (svm.SVC, svm.NuSVC)
    for n_classes in (2, 3):
        X, y = make_blobs(centers=n_classes, random_state=0)
        for svm_class in svm_classes:
            base = svm_class(decision_function_shape="ovr")
            clf = OneVsRestClassifier(base).fit(X, y)
            n_predictions = len(clf.predict(X))
            assert_equal(n_predictions, len(y))
Ejemplo n.º 29
0
def ml_train(datasetFilePath, falsePredictionsFilePath, unknownPredictionsFilePath, confusionMatricesDir, classifierFilePath):
    """Train, evaluate and export a one-vs-rest linear SVC.

    Loads the data set at ``datasetFilePath``, splits it, fits the
    classifier on the training part and predicts the test part. The
    predictions are validated, plotted as three confusion matrices into
    ``confusionMatricesDir``, reported through ``produce_output`` and the
    fitted classifier is dumped to ``classifierFilePath``.
    """
    def class_names():
        # Fresh list on every call: fitted classes followed by "unknown".
        return list(classifier.classes_) + ["unknown"]

    logger.info("start of training and testing phase")

    base_svc = SVC(kernel='linear', probability=True)
    classifier = OneVsRestClassifier(base_svc, n_jobs=NUMBER_OF_CPUS_TO_USE)

    logger.info("loading data set")
    dataset, features_names = load_dataset(datasetFilePath)

    # The full data set is used; limit_dataset(dataset) is not applied.
    limited_dataset = dataset

    ml_dataset = split_dataset(limited_dataset, len(features_names))

    train_shapes = (ml_dataset.X_train.shape, ml_dataset.y_train.shape)
    logger.info("fitting training set X_train - %s, y_train - %s" % train_shapes)
    classifier.fit(ml_dataset.X_train, ml_dataset.y_train)

    test_shapes = (ml_dataset.X_test.shape, ml_dataset.y_test.shape)
    logger.info("predicting test set X_test - %s, y_test - %s" % test_shapes)
    y_pred = classifier.predict(ml_dataset.X_test)

    y_pred_probabilities = classifier.predict_proba(ml_dataset.X_test)

    processed = process_prediction_vector(ml_dataset.y_test, y_pred, y_pred_probabilities)
    y_pred_with_unknown_cls, y_pred_fictive, max_y_pred_probs = processed

    validation(ml_dataset.y_test, y_pred, y_pred_with_unknown_cls, y_pred_fictive, class_names())

    # One confusion matrix per prediction variant, tagged "1".."3".
    variants = (
        (y_pred, "1"),
        (y_pred_with_unknown_cls, "2"),
        (y_pred_fictive, "3"),
    )
    for predictions, tag in variants:
        plot_confusion_matrices(ml_dataset.y_test, predictions, class_names(), confusionMatricesDir, tag)

    produce_output(ml_dataset.y_test, y_pred, max_y_pred_probs, ml_dataset.test_terms_name, falsePredictionsFilePath, unknownPredictionsFilePath)

    logger.info("exporting classifier model")
    joblib.dump(classifier, classifierFilePath)

    logger.info("end of training and testing phase")
Ejemplo n.º 30
0
def main(root):
    """Train and evaluate a bag-of-visual-words action classifier.

    Every sub-directory of ``root`` is wrapped in an ``Action``. The training
    videos' features are clustered with k-means into a codebook, each video
    is turned into a histogram over the codebook, and a one-vs-rest SVM with
    a chi-squared kernel is trained on those histograms. The model and the
    cluster centers are pickled to ``model.p`` / ``centers.p`` in the current
    directory, and the accuracy on the test videos is printed.

    Args:
        root: directory containing one sub-directory per action.
    """
    # Lists (not lazy map/filter objects): the names are used twice and the
    # actions are iterated once for training and once for testing.
    actionNames = [p for p in os.listdir(root)
                   if os.path.isdir(root + os.path.sep + p)]
    actionDirs = [root + os.path.sep + p for p in actionNames]
    actions = [Action(d, n) for d, n in zip(actionDirs, actionNames)]

    print('Collecting cluster features')
    # collect all features for generating codebook.
    clusterFeatures = np.empty([0, 426], dtype=np.float32)
    tags = []
    for action in actions:
        for video in action.traindata:
            print(video.src)
            start = clusterFeatures.shape[0]
            clusterFeatures = np.vstack((clusterFeatures, video.features))
            # Row count after appending. The original read shape[1] (the
            # constant feature width), so every video got the same end index.
            end = clusterFeatures.shape[0]
            tags.append((start, end, action.id))

    # performing k means clustering for creating dictionary of visual words
    k = 4000
    attempts = 10
    print('Generating ' + str(k) + ' clusters')
    compactness, labels, centers = cv2.kmeans(clusterFeatures, k, criteria=(cv2.TERM_CRITERIA_EPS+cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0), attempts=attempts, flags=cv2.KMEANS_RANDOM_CENTERS)

    # generating data and labels for svm training
    print('Generating bag-of-words for each video')
    trainData = np.empty([0, k], dtype=np.float32)
    trainLabels = []
    for start, end, action_id in tags:
        # Fixed range so that bin i always counts cluster id i; without it
        # the bin edges depend on the ids that happen to occur in the video.
        hist, bin_edges = np.histogram(labels[start:end], bins=k, range=(0, k))
        trainData = np.vstack((trainData, hist))
        trainLabels.append(action_id)
    trainLabels = np.array(trainLabels)

    # using one v/s all svm classifier
    print('Training SVM model with chi-squared kernel')
    # NOTE(review): class_weight='auto' is rejected by recent scikit-learn
    # releases ('balanced' is the replacement) -- confirm the target version.
    model = OneVsRestClassifier(SVC(kernel=chi2_kernel, random_state=0, class_weight='auto')).fit(trainData, trainLabels)
    # Pickle is a binary format: write in 'wb' mode and close the files.
    with open('model.p', 'wb') as model_file:
        pickle.dump(model, model_file)
    with open('centers.p', 'wb') as centers_file:
        pickle.dump(centers, centers_file)

    #
    #   Testing:
    #       Generate dense trajectory features for every input test video and then get bag of words.
    #       use trained svm for predicting the output
    testData = np.empty([0, k], dtype=np.float32)
    testLabels = []
    for action in actions:
        for video in action.testdata:
            hist = video.generateBOW(centers)
            testData = np.vstack((testData, hist))
            testLabels.append(action.id)
    testLabels = np.array(testLabels)

    # predicted labels compared with the true labels to get the classification accuracy
    predictedLabels = model.predict(testData)

    correct = float(np.sum(np.array(testLabels) == np.array(predictedLabels)))
    print("accuracy: " + str(correct / predictedLabels.shape[0]))
# test_size=.7 holds out 70% of the samples for testing.
# NOTE(review): confirm that training on only 30% is intended.
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=.7,random_state=24)

#NOTE: change classifier here
clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=100, max_features=15, n_jobs=4, max_depth=5))

#training
# Single-argument print(...) calls behave the same on Python 2 and 3; the
# former print statements were a syntax error on Python 3.
st = time.time()
print("training started")
clf.fit( x_train, y_train )
print("training ended")
et = time.time()
tt = et - st
print("Training Time = " + str(tt) + "\n")

#predictions
pred = clf.predict( x_test )
#NOTE: change to decision_function or predict_proba depending on the classifier
y_score = clf.predict_proba(x_test)
#y_score = clf.decision_function(x_test)
# NOTE(review): this handle is not closed anywhere in the visible code --
# make sure the code that writes the results also closes it.
out = open('../results/rf_ALL_ova.txt','w')

#################################################################################
#PrecisionRecall-plot
# Per-class curves, keyed by class index; column i of y_test / y_score is
# treated as the binary problem for class i.
precision = dict()
recall = dict()
PR_area = dict()
average_precision = dict()
for i in range(n_classes):
    precision[i], recall[i], thresholds = precision_recall_curve(y_test[:,i],y_score[:,i])
    PR_area[i] = auc(recall[i], precision[i])
    average_precision[i] = average_precision_score(y_test[:,i], y_score[:, i])
Ejemplo n.º 32
0
# Train a one-vs-rest ensemble of bagged multinomial naive Bayes models and
# time the training.
start_time = time.time()
#clf_ovr_bagged = OneVsRestClassifier(BaggingClassifier(SVC(kernel='linear', probability=True, class_weight='balanced'), max_samples=max_samples, n_estimators=n_estimators, n_jobs=-1))
clf_nb_bagged = OneVsRestClassifier(BaggingClassifier(MultinomialNB(), max_samples=max_samples, n_estimators=n_estimators, n_jobs=-1))
clf_nb_bagged.fit(X_train_mc_resampled_tfidf, y_train_mc_resampled)
total_time = time.time() - start_time
print("Tempo para a criação do modelo Bagging Naive Bayes Multinomial", str(timedelta(seconds=total_time)))

# Persist the fitted model; the context manager closes the file handle.
filename = path_base_exeperimento + 'Modelos/clf_nb_bagged_trt21_todosassuntosfiltrados.sav'
import pickle
with open(filename, 'wb') as model_file:
    pickle.dump(clf_nb_bagged, model_file)

# clf_ovr_bagged = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test_mc_tfidf, y_test_mc)
# print(result)

y_pred_bagged = clf_nb_bagged.predict(X_test_mc_tfidf)
# Scored once (the original called score() twice and discarded one result).
accuracy = clf_nb_bagged.score(X_test_mc_tfidf, y_test_mc)

# Each summary is computed once and reused for printing and for the report.
# NOTE(review): the "micro_*" lines are computed with average='weighted',
# not 'micro' -- confirm which one is intended before changing either.
macro_score = 'macro_precision %s \nmacro_recall    %s \nmacro_fscore    %s' % score(y_test_mc, y_pred_bagged, average='macro')[:3]
micro_score = 'micro_precision %s \nmicro_recall    %s \nmicro_fscore    %s' % score(y_test_mc, y_pred_bagged, average='weighted')[:3]
print(macro_score)
print(micro_score)
conf_mat = multilabel_confusion_matrix(y_true=y_test_mc, y_pred=y_pred_bagged)
print('Confusion matrix:\n', conf_mat)

imprime_resultado_classificador(path_base_exeperimento, nome_experimento, nome_classificador, timedelta(seconds=total_time), accuracy, macro_score, micro_score)

y_pred_bagged_proba = clf_nb_bagged.predict_proba(X_test_mc_tfidf)

#%%
Ejemplo n.º 33
0
# Compare a one-vs-rest logistic regression with a majority-class baseline.
data_vect = df.values


training_data, test_data, train_target, test_target = train_test_split(data_vect, targets, train_size=0.8)
# Report the number of samples in each split. The original printed
# len(split[0]), i.e. the feature count of the first row, which is the same
# for both splits.
print('training_data size = ', len(training_data))
print('test_data size = ', len(test_data))


model = LogisticRegression()
ovr = OneVsRestClassifier(model).fit(training_data, train_target)

# strategy is passed by keyword; positional use is not accepted by recent
# scikit-learn releases.
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(training_data, train_target)

#predicting & comparing
predictedOvr = ovr.predict(test_data)
predictedDummy = dummy.predict(test_data)


print('Real data')
print(np.asarray(test_target))


print('Majority classifier result')
print(predictedDummy)
print('Majority classifier accuracy:', accuracy_score(test_target,predictedDummy))

print('OVR predictor result')
print(predictedOvr)
print('OVR accuracy:', accuracy_score(test_target,predictedOvr))
Ejemplo n.º 34
0
# Box plot of the cross-validation accuracies of both strategies.
plt.figure(figsize=(4, 3))
plt.boxplot([cv_scores_ova, cv_scores_ovo])
plt.xticks([1, 2], ['One vs All', 'One vs One'])
plt.title('Prediction: accuracy score')

##############################################################################
# Plot a confusion matrix
# ------------------------
# We fit on the first 10 sessions and plot a confusion matrix on the
# last 2 sessions
from sklearn.metrics import confusion_matrix
from nilearn.plotting import plot_matrix

train_mask = session < 10
test_mask = session >= 10

svc_ovo.fit(X[train_mask], y[train_mask])
y_pred_ovo = svc_ovo.predict(X[test_mask])

# confusion_matrix expects (y_true, y_pred): rows are the true conditions and
# columns the predicted ones. The original passed them swapped, which plots
# the transposed matrix.
plot_matrix(confusion_matrix(y[test_mask], y_pred_ovo),
            labels=unique_conditions,
            title='Confusion matrix: One vs One',
            cmap='hot_r')

svc_ova.fit(X[train_mask], y[train_mask])
y_pred_ova = svc_ova.predict(X[test_mask])

plot_matrix(confusion_matrix(y[test_mask], y_pred_ova),
            labels=unique_conditions,
            title='Confusion matrix: One vs All',
            cmap='hot_r')

plt.show()
Ejemplo n.º 35
0
    for x, y in zip(truth_building, truth_building_rearrange):
        assert x == y

    pred_file_name = os.path.join(task_dir,
                                  '{}_city_pred_2048.npy'.format(model_name))
    if not os.path.exists(pred_file_name):
        kf = KFold(n_splits=5, shuffle=True)
        clf = OneVsRestClassifier(svm.SVC(probability=True))
        pred_city = []
        truth_city_rearrange = []
        for cnt, (train_idx, test_idx) in enumerate(kf.split(feature)):
            print('Training on fold {}'.format(cnt))
            X_train, X_test = feature[train_idx, :], feature[test_idx, :]
            y_train, y_test = truth_city[train_idx], truth_city[test_idx]
            clf.fit(X_train, y_train)
            pred_city.append(clf.predict(X_test))
            truth_city_rearrange.append(y_test)
        pred_city = np.concatenate(pred_city)
        truth_city_rearrange = np.concatenate(truth_city_rearrange)
        np.save(pred_file_name, [pred_city, truth_city_rearrange])
    else:
        pred_city, truth_city_rearrange = np.load(pred_file_name)

    plt.figure(fig_num)
    fpr_rf, tpr_rf, _ = roc_curve(truth_building_rearrange, pred_building)
    plt.plot(fpr_rf,
             tpr_rf,
             label='{} AUC = {:.2f}'.format(model_name, auc(fpr_rf, tpr_rf)))
    plt.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
from sklearn.linear_model import LogisticRegression

print(__doc__)

# Load a multi-label dataset from https://www.openml.org/d/40597
X, Y = fetch_openml('yeast', version=4, return_X_y=True)
# Turn the 'TRUE' entries of the fetched targets into booleans.
Y = Y == 'TRUE'
# Hold out 20% of the samples for testing, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2,
                                                    random_state=0)

# Fit an independent logistic regression model for each class using the
# OneVsRestClassifier wrapper, and score its test predictions with the
# sample-averaged Jaccard score.
base_lr = LogisticRegression()
ovr = OneVsRestClassifier(base_lr)
ovr.fit(X_train, Y_train)
Y_pred_ovr = ovr.predict(X_test)
ovr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average='samples')

# Fit an ensemble of ten logistic regression classifier chains, each with a
# random label order seeded by its index.
chains = [ClassifierChain(base_lr, order='random', random_state=i)
          for i in range(10)]
for chain in chains:
    chain.fit(X_train, Y_train)

# Stack the test predictions of all chains, then score every chain on its
# own after thresholding its predictions at 0.5.
Y_pred_chains = np.array([chain.predict(X_test) for chain in
                          chains])
chain_jaccard_scores = [jaccard_score(Y_test, Y_pred_chain >= .5,
                                      average='samples')
                        for Y_pred_chain in Y_pred_chains]
Ejemplo n.º 37
0
def predict():
    """Classify a posted comment against six toxicity labels.

    Expects a POST request whose JSON body carries ``text`` (the comment) and
    ``model`` (``'MultinomialNB'`` or ``'XGBoost'``). On every request a
    random sample of ``train.csv`` is cleaned, vectorized with TF-IDF and
    used to train the selected model before the comment is classified.

    Returns:
        A JSON list holding one JSON-encoded string that maps each label
        name to its predicted value (as a string). Unsupported request
        methods yield a JSON error with status 405 and unknown model names
        one with status 400; the original fell through without a response
        in both cases.
    """
    import string

    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB

    # Validate the request before doing any expensive work.
    if request.method != 'POST':
        return jsonify({'error': 'only POST requests are supported'}), 405
    payload = json.loads(request.get_data().decode('utf-8'))
    message = payload['text']
    model = payload['model']
    print(model)
    if model not in ('MultinomialNB', 'XGBoost'):
        return jsonify({'error': 'unknown model: %s' % model}), 400

    # Read a random sample of the csv file (read once; the original also
    # loaded the whole file first and discarded it).
    n = 159571  # number of records in file
    s = 25000  # desired sample size
    filename = "train.csv"
    # Line 0 is the header, so only data lines 1..n are candidates for
    # skipping; sampling from range(n) could drop the header and lose a row.
    skip = sorted(random.sample(range(1, n + 1), n - s))
    df = pd.read_csv(filename, skiprows=skip)
    df.columns = [
        "id", "message", "toxic", "severe_toxic", "obscene", "threat",
        "insult", "identity_hate"
    ]
    df = df.reindex(np.random.permutation(df.index))

    # .values replaces DataFrame.as_matrix(), which pandas has removed.
    comment = df['message'].values
    label = df[[
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]].values

    # Keep only comments of at most 400 characters, with their label rows.
    comments = []
    labels = []
    for text, label_row in zip(comment, label):
        if len(text) <= 400:
            comments.append(text)
            labels.append(label_row)
    labels = np.asarray(labels)

    # Map punctuation (except the apostrophe) and digits to spaces.
    print(string.punctuation)
    punctuation_edit = string.punctuation.replace('\'', '') + "0123456789"
    print(punctuation_edit)
    # Built from the source length so both maketrans arguments always match.
    outtab = " " * len(punctuation_edit)
    trantab = str.maketrans(punctuation_edit, outtab)

    nltk.download('stopwords')
    stop_words = stopwords.words("english")
    # Also drop the single letters 'b'..'z' as stop words.
    stop_words.extend(chr(x) for x in range(ord('b'), ord('z') + 1))

    # create objects for stemmer and lemmatizer
    lemmatiser = WordNetLemmatizer()
    stemmer = PorterStemmer()
    # download words from wordnet library
    nltk.download('wordnet')

    def normalize(text):
        # Lower-case, strip punctuation/digits, then lemmatize (as verb) and
        # stem every word.
        words = text.lower().translate(trantab).split()
        return " ".join(
            stemmer.stem(lemmatiser.lemmatize(word, pos="v")) for word in words)

    comments = [normalize(text) for text in comments]

    # create object supplying our custom stop words
    count_vector = TfidfVectorizer(stop_words=stop_words)
    # fitting it to converts comments into bag of words format
    tfidf = count_vector.fit_transform(comments)
    print(tfidf.shape)

    # The first third of the rows was reserved as a test split by the
    # original code but never used; train on the remaining rows.
    test_rows = int(tfidf.shape[0] / 3)
    X_train = tfidf[test_rows:, :]
    Y_train = labels[test_rows:, :]

    vect = count_vector.transform([message])

    if model == 'MultinomialNB':
        # One independent classifier per label column.
        my_prediction = []
        for ix in range(6):
            label_clf = MultinomialNB()
            label_clf.fit(X_train, Y_train[:, ix])
            my_prediction.append(label_clf.predict(vect)[0])
        print(my_prediction)
    else:  # model == 'XGBoost'
        from sklearn.multiclass import OneVsRestClassifier
        from xgboost import XGBClassifier

        clf = OneVsRestClassifier(XGBClassifier(n_jobs=-1, max_depth=4))
        clf.fit(X_train, Y_train)
        my_prediction = clf.predict(vect)[0]
        print(my_prediction.shape)
        print(my_prediction)

    label_names = [
        'Toxic', 'Severely Toxic', 'Obscene', 'Threat', 'Insult',
        'Identity Hate'
    ]
    result_dict = {
        name: str(my_prediction[ix]) for ix, name in enumerate(label_names)
    }

    # The response stays a list with one JSON-encoded string, as before.
    results = [json.dumps(result_dict)]
    return jsonify(results)
Ejemplo n.º 38
0
len(acc_list)
"""# **SVM**"""

# Per-split metric accumulators for the SVM run.
acc_list, rec_list, pre_list, total_acc_list = [], [], [], []

# Support Vector Classifier
# one vs rest

from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

# One iteration per (train, test) split taken from the parallel lists.
for x_train, decoded_y_train, x_test, y_test in zip(x_train_list,
                                                    decoded_y_train_list,
                                                    x_test_list, y_test_list):
    svm = OneVsRestClassifier(SVC()).fit(x_train, decoded_y_train)
    y_pred_svc = svm.predict(x_test)

    # print(y_pred_svc)

    # encode y_pred_svc to one-hot vector
    # NOTE(review): the encoder is refitted on the predictions here, so a
    # class that is never predicted in a split would shift or drop one-hot
    # columns relative to y_test -- confirm that every class always occurs.
    Y = encoder.fit_transform(y_pred_svc)
    y_pred_svc = pd.get_dummies(Y).values.astype(np.float32)

    # result() is defined elsewhere; its four return values are collected
    # per split.
    acc, rec, pre, total_acc = result(y_pred_svc, y_test)
    acc_list.append(acc)
    rec_list.append(rec)
    pre_list.append(pre)
    total_acc_list.append(total_acc)

# Mean of the per-split total accuracies.
tf.print("Total Accuracy = {:.3f}".format(
    K.sum(total_acc_list) / len(total_acc_list)))
Ejemplo n.º 39
0
    #                 train_scores_mean + train_scores_std, alpha=0.1,
    #                 color="r")
    # plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
    #                 test_scores_mean + test_scores_std, alpha=0.1, color="g")
    # plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
    #         label="Training score")
    # plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
    #         label="Cross-validation score")

    # plt.legend(loc="best")
    # end of learning curve

    # everything is ready, let's fit
    clf3.fit(X_train_sc_100, y_train_wout_100)
    score = clf3.score(X_train_sc_100, y_train_wout_100)
    Predicted_test3 = clf3.predict(X_test_sc_100)
    Predicted_train3 = clf3.predict(X_train_sc_100)

    ## now let's add the nans and reshape back
    test_predicted = np.empty(test_mask100.shape)
    test_predicted[test_mask100] = Predicted_test3
    test_predicted[np.logical_not(test_mask100)] = np.nan
    test_predicted_reshaped = np.reshape(test_predicted, y_test.shape)

    train_predicted = np.empty(mask100.shape)
    train_predicted[mask100] = Predicted_train3
    train_predicted[np.logical_not(mask100)] = np.nan
    train_predicted_reshaped = np.reshape(train_predicted, y_train.shape)

    GT = np.empty(test_mask100.shape)
    GT[test_mask100] = y_test_wout100
    SGDClassifier(alpha=0.000001, penalty="l2"),
    'NaiveBayes':
    MultinomialNB(),
    'KNN':
    KNeighborsClassifier(),
    'Random Forests: ':
    RandomForestClassifier(n_estimators=200,
                           max_features='auto',
                           n_jobs=core_count)
}
# Hamming loss of every candidate classifier wrapped one-vs-rest.
# dict.items()/values()/keys() work on both Python 2 and 3; the former
# iteritems()/itervalues()/iterkeys() exist only on Python 2.
performance = {}
for name, classifier in classifiers.items():

    clf = OneVsRestClassifier(classifier)
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)
    performance[name] = hamming_loss(Y_test, Y_pred)

# In[334]:

# One bar per classifier, labelled with the classifier's name.
fig = plt.figure()
plt.title("Hamming Loss - OneVsRestClassifier")
plt.bar(range(len(performance)),
        list(performance.values()),
        color="r",
        align="center")
plt.xticks(range(len(performance)), list(performance.keys()), rotation=45)
plt.xlim([-1, len(performance)])
plt.show()

# # Tuning Hyper Parameters
Ejemplo n.º 41
0
# Load Fashion-MNIST as (images, labels) pairs for the train and test splits.
(train_x_data, train_y_data), (test_x_data,
                               test_y_data) = fashion_mnist.load_data()

# Flatten every training image into a single feature row.
item_train_data = []
for item_train in train_x_data:
    item_train_data.append(item_train.flatten())
item_train_data = np.array(item_train_data)

# Same for the test images.
item_test_data = []
for item_test in test_x_data:
    item_test_data.append(item_test.flatten())
item_test_data = np.array(item_test_data)

# Shows the last test image visited by the loop above.
print(item_test)

# Model 1: one-vs-rest logistic regression.
item_mnist_classifier = OneVsRestClassifier(
    LogisticRegression(verbose=1, max_iter=10))
item_mnist_classifier.fit(item_train_data, train_y_data)
ovr_predictions = item_mnist_classifier.predict(item_test_data)
conf_matrix = confusion_matrix(test_y_data, ovr_predictions)
print("Confusion_matrix:")
print(conf_matrix)
sns.heatmap(conf_matrix)
ovr_score = item_mnist_classifier.score(item_test_data, test_y_data)
print('The output score is: %s' % ovr_score)
plt.show()

# Model 2: a single multinomial logistic regression with the "sag" solver.
item_mnist_classifier = LogisticRegression(
    verbose=1, max_iter=6, multi_class="multinomial", solver="sag")
item_mnist_classifier.fit(item_train_data, train_y_data)
multinomial_predictions = item_mnist_classifier.predict(item_test_data)
conf_matrix = confusion_matrix(test_y_data, multinomial_predictions)
sns.heatmap(conf_matrix)
# The score is evaluated but not printed or stored.
item_mnist_classifier.score(item_test_data, test_y_data)
plt.show()
Ejemplo n.º 42
0
# In[9]:

#Split the training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
X_train

# In[11]:

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#MultiClassOnevsRestClassifier

clf5 = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train, y_train)
# Predict once and reuse the result; the original ran the same prediction
# three times on the same fitted model and test data.
lrTest = clf5.predict(X_test)
print("MultiClassOnevsRestClassifier prediction :", lrTest)
print("MultiClassOnevsRestClassifier score :", accuracy_score(y_test, lrTest))

# In[12]:

# res is kept as a separate name for the same predictions.
res = lrTest
# Computed once and reused for printing and plotting.
cm = confusion_matrix(y_test, res)
print(cm)
print(classification_report(y_test, res))

# In[13]:

pl.matshow(cm)
pl.title('Confusion matrix of the classifier')
Ejemplo n.º 43
0
        set_true = set(np.where(y_true[i])[0])
        set_pred = set(np.where(y_pred[i])[0])
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            # tmp_a = len(set_true.union(set_pred))
            tmp_a = len(set_true.intersection(set_pred))/float(len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    # print(acc_list)
    return np.mean(acc_list)

def print_score(y_pred, clf):
    """Print the class name of ``clf`` and the Hamming score of ``y_pred``.

    The score is computed against the module-level ``y_test_tfidf`` targets.
    """
    clf_name = clf.__class__.__name__
    print("Clf: ", clf_name)
    score_value = hamming_score(y_test_tfidf, y_pred)
    print("Hamming score: {}".format(score_value))
    print("---")

# Base estimators to compare (an SGDClassifier with hinge loss was another
# candidate and is currently not used).
lr = LogisticRegression()
mn = MultinomialNB()
svm = LinearSVC()

# Wrap each estimator one-vs-rest, fit it on the TF-IDF training data and
# print a classification report for the test predictions.
for classifier in (lr, svm, mn):
    clf = OneVsRestClassifier(classifier)
    clf.fit(x_train_tfidf, y_train_tfidf)
    y_pred = clf.predict(x_test_tfidf)
    # print_score(y_pred, classifier) is an alternative summary.
    report = classification_report(y_test_tfidf, y_pred)
    print(report)
Ejemplo n.º 44
0
# display accuracies
print(acc_count_nb, acc_tfidf_nb)

# Code ends here

# --------------
import warnings
warnings.filterwarnings('ignore')

# One-vs-rest logistic regression trained on the count-vectorizer features.
logreg_1 = OneVsRestClassifier(LogisticRegression(random_state=10))
logreg_1.fit(X_train_count, Y_train)

# Identically configured model trained on the tf-idf features.
logreg_2 = OneVsRestClassifier(LogisticRegression(random_state=10))
logreg_2.fit(X_train_tfidf, Y_train)

# accuracy with count vectorizer
pred_count_logreg = logreg_1.predict(X_test_count)
acc_count_logreg = accuracy_score(pred_count_logreg, Y_test)

# accuracy with tfidf vectorizer
pred_tfidf_logreg = logreg_2.predict(X_test_tfidf)
acc_tfidf_logreg = accuracy_score(pred_tfidf_logreg, Y_test)

# display accuracies
print(acc_count_logreg, acc_tfidf_logreg)

# Code ends here
# Persist the logistic regression model; the context manager closes the
# file handle, which the original left open.
with open("lr_fasttext.sav", 'wb') as lr_file:
    pickle.dump(lr_model, lr_file)
pred_lr = lr_model.predict(np.asarray(q_test))
lr_evaluation_scores, lr_cm = evaluation.multilabel_evaluation_multilabelbinarizer(
    d_test, label_encoder.inverse_transform(pred_lr), "Logistic Regression")
#evaluate the model, for abstracts use multilabel_evaluation_multilabelbinarizer() for citations
#use multilabel_evaluation()
#lr_evaluation_scores, lr_cm = evaluation.multilabel_evaluation(
#    d_test, label_encoder.inverse_transform(pred_lr), "Logistic Regression")
documentation_file_modelopt.write(lr_evaluation_scores)

#build Gaussian Naive Bayes model and evaluate the model
print("Gaussian Naive Bayes model evaluation")
gnb_model = OneVsRestClassifier(GaussianNB()).fit(np.asarray(q_train),
                                                  np.asarray(d_train_encoded))
# Same pattern as above: write the model and close the file.
with open("gnb_fasttext.sav", 'wb') as gnb_file:
    pickle.dump(gnb_model, gnb_file)
pred_gnb = gnb_model.predict(np.asarray(q_test))
#evaluate the model, for abstracts use multilabel_evaluation_multilabelbinarizer() for citations
#use multilabel_evaluation()
gnb_evaluation_scores, gnb_cm = evaluation.multilabel_evaluation_multilabelbinarizer(
    d_test, label_encoder.inverse_transform(pred_gnb), "Gaussian Naive Bayes")
#gnb_evaluation_scores, gnb_cm = evaluation.multilabel_evaluation(
#    d_test, label_encoder.inverse_transform(pred_gnb), "Gaussian Naive Bayes")
documentation_file_modelopt.write(gnb_evaluation_scores)

#split data in training and test data
d_train_single, d_test_single, q_train_single, q_test_single = train_test_split(
    datasets_single, q_fasttext, test_size=0.2)

#prepare queries and datasets for Neural Network application
label_binarizer = LabelBinarizer()
label_binarizer.fit(datasets_single)
Ejemplo n.º 46
0
# Last column of the two-class training subset holds the labels.
train_label_12 = train_set_12[:, -1]

# Build the matching two-class test subset: rows labelled 1 and rows
# labelled 3, stacked in that order.
# NOTE(review): the "_2" subset selects label 3 -- confirm this mirrors how
# train_set_12 was built.
test_set_1 = test_set[np.where(test_label == 1)]
test_set_2 = test_set[np.where(test_label == 3)]
test_set_12 = np.concatenate((test_set_1, test_set_2), axis=0)
# Features are all columns but the last; the last column is the label.
test_data_12 = test_set_12[:, :-1]
test_label_12 = test_set_12[:, -1]

# Fit the binary MSE model on the two-class subset.
binary_model = MSE_binary()
binary_model.fit(train_data_12, train_label_12)
res = binary_model.predict(test_data_12)

# Reuse the same estimator object as the base of a one-vs-rest classifier.
mc_model = OneVsRestClassifier(binary_model)

# All features: accuracy is the fraction of predictions equal to the label.
mc_model.fit(train_data, train_label)
pred_MSE_train = mc_model.predict(train_data)
acc_MSE_train = np.sum(
    (pred_MSE_train == train_label).astype(float)) / np.shape(train_label)[0]
print(
    'Classification accuracy on training set with all features with MSE, unnormalized is',
    acc_MSE_train)
pred_MSE_test = mc_model.predict(test_data)
acc_MSE_test = np.sum(
    (pred_MSE_test == test_label).astype(float)) / np.shape(test_label)[0]
print(
    'Classification accuracy on test set with all features with MSE, unnormalized is',
    acc_MSE_test)
print()
# Refit using only the first two feature columns.
mc_model.fit(train_data[:, :2], train_label)
pred_MSE_train = mc_model.predict(train_data[:, :2])
acc_MSE_train = np.sum(
Ejemplo n.º 47
0
classifier4 = OneVsRestClassifier(GaussianNB())
classifier5 = OneVsRestClassifier(
    DecisionTreeClassifier(criterion='entropy', random_state=0))
classifier6 = OneVsRestClassifier(
    RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0))

# Fit all six classifiers on the same training data, in numeric order.
for _model in (classifier1, classifier2, classifier3,
               classifier4, classifier5, classifier6):
    _model.fit(X_train, y_train)

# Counters for the three patterns of the first predicted row:
# m counts (0, 1), l counts (1, 0) and d counts (0, 0).
m = 0
l = 0
d = 0

# Tally the first two classifiers; the three patterns are mutually
# exclusive, so at most one counter is incremented per classifier.
for _voter in (classifier1, classifier2):
    y_pred = _voter.predict(X_test)
    if y_pred[0][0] == 0 and y_pred[0][1] == 1:
        m += 1
    elif y_pred[0][0] == 1 and y_pred[0][1] == 0:
        l += 1
    elif y_pred[0][0] == 0 and y_pred[0][1] == 0:
        d += 1
    
Ejemplo n.º 48
0
class IncrementalClassifier:
    '''Classifier for image patch classification
       -----------------------------------------
       This class is an interface for sklearn classifiers,
       adapted for classification of image patches in a labelled
       image. The user can accumulate training instances and
       visualize interactively the predictions on new images.
       Misclassified instances can be added as new training instances
       and the classifier can be re-trained and predictions re-run.

       Attributes
       ----------
       imgx : ImgX object
            Labelled image instance with the intensity image and
            segmentation (bounding boxes of ROIs)
       newlabels : array-like
           Array or list of new training instances
       Xtrain : np.array
           Train data with observations (e.g. cells) in rows
           and morphological features in columns
       ytrain : np.array
           Binarized (one column per class) training labels
       clf : sklearn classifier
           Classifier model. Default is RandomForestClassifier in
           one-vs-rest mode. For details see the documentation:
           (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)
       classes : array-like
           List of class names, e.g. ['cell A', 'cell B', 'cell C']

       Methods
       -------
       set_param(**kwargs)
           Change any of the class attributes that have been set
       set_classifier(clf=None)
           Initializes a classifier object

       add_instances(newlabels)
           Add new training instances

       train_classifier()
           Train a classifier with the current training data

       train_error()
           Print a classification report on the training data

       generate_predictions()
           Generate predictions after the classifier has been
           trained.

       plot_predictions()
           Plot predictions using plotly interactive
           visualization

       h5_write(fname, group)
           Write the train set as HDF5 file

    '''
    def __init__(self):
        '''Create an empty classifier wrapper.

        The constructor takes no arguments. Every attribute listed in
        the class docstring (`imgx`, `newlabels`, `Xtrain`, `ytrain`,
        `clf`, `classes`) starts as `None` and is filled in later via
        `set_param`, `set_classifier` and `add_instances`.
        '''
        # initialize with 'None' something to be loaded later
        self.imgx = None

        self.newlabels = None
        # training data
        self.Xtrain = None
        self.ytrain = None
        # initialize classifier as 'None'
        self.clf = None
        self.classes = None

    def __setattr__(self, name, value):
        '''Store the attribute; loading a new image also resets `newlabels`.'''
        self.__dict__[name] = value
        # if a new ImgX object is passed, compute its features and
        # forget the labels collected for the previous image
        if name == 'imgx':
            self._compute_imgx_data()
            self.newlabels = None

    # function for setting individual class parameters
    def set_param(self, **kwargs):
        '''Change class attribute values
           -----------------------------
           Using this function we can change the current
           labelled image loaded (`imgx` attribute)

           Parameters
           ----------
           **kwargs
               Attribute names and the values to assign to them
        '''
        # setattr goes through __setattr__, so the 'imgx' hook still fires
        for name, value in kwargs.items():
            setattr(self, name, value)

    # internal function checks if the embedded
    # imgx object has the features computed
    def _compute_imgx_data(self):
        '''Call `imgx.compute_props()` when `imgx.data` is still empty.'''
        if self.imgx is not None and len(self.imgx.data) == 0:
            self.imgx.compute_props()

    def _push_traindata(self, newlabels):
        '''Append the rows referenced by `newlabels` to the training set.

           Parameters
           ----------
           newlabels : array
               2D array; column 0 is the positional row index into
               `imgx.data`, column 1 the class (as integer)
        '''
        ids = newlabels[:, 0]
        features = self.imgx.data.iloc[ids, :]
        # NOTE(review): label_binarize yields a single column when there are
        # exactly two classes, while other methods use argmax(axis=1) on
        # ytrain -- confirm that the two-class case is handled as intended.
        targets = label_binarize(newlabels[:, 1],
                                 classes=range(len(self.classes)))
        if self.Xtrain is None:
            self.Xtrain = features
            self.ytrain = targets
        else:
            self.Xtrain = pd.concat([self.Xtrain, features], axis=0)
            self.ytrain = np.append(self.ytrain, targets, axis=0)

    def set_classifier(self, clf=None):
        '''Initialize classifier
           ---------------------

           Parameters
           ----------
           clf : sklearn classifier (optional)
               By default a RandomForestClassifier with 500 estimators
               is initialized in one-vs-rest mode (for multi-class settings).
               Users can initialize any of the sklearn classifiers
               externally and provide as the `clf` argument.

           Returns
           -------
           self
        '''
        self.clf = clf
        # if 'None' then some reasonable default
        if clf is None:
            self.clf = OneVsRestClassifier(
                RandomForestClassifier(bootstrap=True,
                                       class_weight="balanced",
                                       n_estimators=500,
                                       random_state=123,
                                       n_jobs=-1))
        return self

    def add_instances(self, newlabels):
        '''Add new training instances
           --------------------------
           The function accepts a 2D array that for each new
           instance provides the numeric index in `imgx` instance
           and user defined class (as integer), e.g.
           np.array([[12,0], [36,1]])

           Parameters
           ----------
           newlabels : array
               A 2D numpy.array is expected with an index of
               the labelled region and the class (as integer)

           Returns
           -------
           self
        '''
        # drop duplicated rows within the new batch
        newlabels = np.unique(newlabels, axis=0)
        if self.newlabels is None:
            self.newlabels = newlabels
        else:
            # view every row as one structured element so that setdiff1d
            # compares whole (index, class) rows instead of single values
            a1_rows = newlabels.view([('', newlabels.dtype)] *
                                     newlabels.shape[1])
            a2_rows = self.newlabels.view([('', self.newlabels.dtype)] *
                                          self.newlabels.shape[1])

            # keep only rows that were not recorded before
            newlabels = (np.setdiff1d(a1_rows,
                                      a2_rows).view(newlabels.dtype).reshape(
                                          -1, newlabels.shape[1]))
            self.newlabels = np.append(self.newlabels, newlabels, axis=0)
        # if 'newlabels' array is not empty
        if len(newlabels) > 0:
            self._push_traindata(newlabels=newlabels)
        return self

    def train_classifier(self):
        '''Fit a supervised model to the current training
           data. The function runs sklearn.clf.fit() on
           Xtrain and ytrain that the user provided

           Returns
           -------
           self
        '''
        self.clf.fit(self.Xtrain, self.ytrain)
        return self

    # print the classification report on the existing training set
    def train_error(self):
        '''Print a classification report for predictions on `Xtrain`.'''
        ypred = self.clf.predict(self.Xtrain)
        print(
            classification_report(self.ytrain.argmax(axis=1),
                                  ypred.argmax(axis=1),
                                  target_names=self.classes))

    # generate predictions and pass them to self.imgx.y
    def generate_predictions(self, prob=False):
        '''Generate predictions after the model was fit
           The function runs sklearn.clf.predict() on
           the currently loaded `imgx` instance and stores the
           predicted class index of every row in `imgx.y`

           Parameters
           ----------
           prob : bool
               Currently unused; kept for interface compatibility
        '''
        Xtest = self.imgx.data
        ypred = self.clf.predict(Xtest)
        # set labels to these
        self.imgx.y = ypred.argmax(axis=1)

    # plot predictions overlaid with the original image
    # plot is a 'void' function (returns 'None')
    def plot_predictions(self):
        '''Plot predictions over the original image
           The function generates an interactive visualization
           with plotly. The user can hover over a labelled region
           (e.g. a cell) and the class of the region will be shown
           (if .generate_predictions() has been run prior to the
           function call)
        '''
        if self.imgx.y is not None:
            layout, feats = plotly_predictions(img=self.imgx.img,
                                               bb=self.imgx.bbox,
                                               ypred=self.imgx.y,
                                               labels=self.classes)
        else:
            layout, feats = plotly_viz(img=self.imgx.img, bb=self.imgx.bbox)
        iplot(dict(data=feats, layout=layout))

    def h5_write(self, fname, group):
        '''Write current train set as HDF5 file

           Three datasets are created under `group`: `Xtrain`, `ytrain`
           (class indices, i.e. argmax of the binarized labels) and
           `columns` (feature names as byte strings).

           Parameters
           ----------
               fname : string
                   Path and file name (e.g. trainset.h5)
               group : string
                   Dataset / group name under which the
                   train data will be saved. For more details
                   see the documentation on HDF5 groups:
                   (http://docs.h5py.org/en/stable/high/group.html)
        '''
        # the context manager closes the file even if a dataset write fails
        with h5py.File(fname, 'w') as hf:
            hf.create_dataset(group + '/Xtrain', data=self.Xtrain)
            hf.create_dataset(group + '/ytrain',
                              data=self.ytrain.argmax(axis=1))
            hf.create_dataset(
                group + '/columns',
                data=self.Xtrain.columns.values.astype('S').tolist())
Ejemplo n.º 49
0
# Then interpolate all per-class ROC curves at these points (all_fpr)
# and accumulate the interpolated true-positive rates
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])

# Finally average over the classes and compute the AUC
mean_tpr /= n_classes

# Store the averaged curve under the "macro" key next to the per-class ones
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
# NOTE(review): the legend label says 'OvO' although the curve plotted is the
# macro average stored above -- confirm which model this curve belongs to.
plt.plot(fpr["macro"], tpr["macro"],
         label='OvO ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='red', linestyle='--', linewidth=2)
lw = 2
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Comparison')
plt.legend(loc="lower right")
# Save the figure before showing it
plt.savefig('balanced_soft.png')
plt.show()

# Accuracy of the three fitted models on X_test, scored against `test`
acc1 = accuracy_score(test,softmax.predict(X_test))
acc2 = accuracy_score(test,logi_ovo.predict(X_test))
acc3 = accuracy_score(test,logi_ovr.predict(X_test))
print(acc1,acc2,acc3)
Ejemplo n.º 50
0
class BongC(TextC):
    """Text classifier that vectorizes documents with a BongVectorizer.

    The documents are converted by ``BongVectorizer`` and then fed to a
    scikit-learn estimator selected by the ``classifier`` parameter
    ('lr', 'rf', anything else selects a linear SVM), optionally wrapped
    in a one-vs-one or one-vs-rest meta-estimator (``multi_class``).
    """
    PARAMS = {
            'classifier': 'svm',
            'vectorizer': 'tfidf',
            'multi_class': 'ovr',
            'C': 1.0,
            'class_weight': 'balanced',
            'n_jobs': -1,
            'num_mix': 1.0,
            'max_iter': 20000,
            'random_state': None,
            'dual': True,       # for SVMs
            'n_estimators': 50, # for RF
            'adapt_thresh': 0.0,
    }

    def __init__(self, **kwargs):
        """Create the vectorizer, register its parameters and apply kwargs.

        Parameters
        ----------
        **kwargs
            Parameter overrides, forwarded to :meth:`set_params`.
        """
        self.v = BongVectorizer()
        # Expose the vectorizer's parameters through PARAMS as well.
        # NOTE(review): this writes into the class-level PARAMS dict, so the
        # entries are shared by every instance -- confirm this is intended.
        for k, v in self.v.get_params().items():
            self.PARAMS[k] = v
        # NOTE(review): `self` is passed explicitly in addition to the bound
        # call; kept as in the original because TextC.__init__ is not visible.
        super().__init__(self)
        self._trained = False
        self.model = None
        self._training_data = None
        self.set_params(**kwargs)

    def set_params(self, **kwargs):
        """Update known parameters and mark the model stale when one changes.

        Unknown keys are ignored with a warning. Keys that belong to the
        vectorizer are forwarded to it as well.
        """
        for k, v in kwargs.items():
            if k not in self.PARAMS:
                warning("Ignoring unknown parameter {}.".format(k))
            else:
                if k in self.v.get_params():
                    self.v.set_params(**{k: v})
                old_v = getattr(self, k)
                if v != old_v:
                    debug('Setting {}, old = {}, new = {}'.format(k, old_v,v ))
                    setattr(self, k, v)
                    self._trained = False

    def fit(self, train, val=None):
        """Vectorize `train` and fit the configured model on it.

        Training is skipped when the model is already trained on the same
        data with unchanged parameters.

        Parameters
        ----------
        train
            Training data; must expose ``labels`` and ``num_features``.
        val
            Unused by this method.
        """
        # Decide whether to skip before vectorizing: re-fitting the
        # vectorizer on unchanged data with unchanged parameters is wasted work.
        if self._training_data == train and self._trained:
            info("Skipping training the model, parameters and data did not change.")
            return

        info("Converting documents to BoNG vectors")
        docs = self.v.fit_transform(train)
        info("Number of features: {}".format(
                            len(self.v.v.vocabulary_)))

        # Pick the estimator class and the subset of our parameters it takes.
        if self.classifier == 'lr':
            from sklearn.linear_model import LogisticRegression
            clf = LogisticRegression
            accepted = {'C', 'multi_class', 'dual', 'class_weight',
                        'random_state', 'max_iter', 'solver'}
        elif self.classifier == 'rf':
            from sklearn.ensemble import RandomForestClassifier
            clf = RandomForestClassifier
            accepted = {'class_weight', 'n_estimators', 'n_jobs', 'random_state'}
        else:
            from sklearn.svm import LinearSVC
            clf = LinearSVC
            accepted = {'C', 'multi_class', 'dual', 'class_weight',
                        'random_state', 'max_iter'}

        clf_params = {k: v for k, v in self.get_params().items()
                      if k in accepted}
        # 'ovo' only selects the OneVsOneClassifier wrapper below. It is not
        # a valid `multi_class` value for LinearSVC or LogisticRegression,
        # so forwarding it would make the wrapped estimators fail at fit time.
        if clf_params.get('multi_class') == 'ovo':
            del clf_params['multi_class']
        self.model = clf(**clf_params)

        if self.multi_class:
            if self.multi_class == 'ovo':
                from sklearn.multiclass import OneVsOneClassifier
                self.model = OneVsOneClassifier(self.model, n_jobs=self.n_jobs)
            elif self.multi_class == 'ovr':
                from sklearn.multiclass import OneVsRestClassifier
                self.model = OneVsRestClassifier(self.model, n_jobs=self.n_jobs)

        # Append the (scaled) extra numeric features, when the data has any.
        if train.num_features is not None:
            docs = hstack((docs, self.num_mix * train.num_features), format="csr")
        info("Fitting the model {}".format(docs.shape))
        self.model.fit(docs, np.array(train.labels))
        self._training_data = train
        self._trained = True

    def _predict(self, test, train=None, decision_func=False):
        """Predict labels for `test`, optionally (re)fitting on `train` first.

        Parameters
        ----------
        test
            Data to classify; must expose ``num_features``.
        train
            When truthy, :meth:`fit` is called with it before predicting.
        decision_func : bool
            When true, also return ``model.decision_function`` values.

        Returns
        -------
        predictions, or the tuple ``(predictions, decision_val)`` when
        `decision_func` is true.
        """
        if train: self.fit(train)
        x = self.v.transform(test)
        # Extra numeric features are appended only when both the training
        # data and the test data provide them.
        if self._training_data.num_features is not None\
                and test.num_features is not None:
            x = hstack((x, self.num_mix * test.num_features), format="csr")
        predictions = self.model.predict(x)
        if decision_func:
            decision_val = self.model.decision_function(x)
            return predictions, decision_val
        return predictions
# Convert the label list to an array.
# NOTE(review): the `pd.np` alias is deprecated in pandas and is believed to
# be removed in recent releases -- confirm the pandas version this targets.
varietal_list = pd.np.array(varietal_list)

# Vectorize the descriptions, tokenizing with word_tokenize.
# NOTE(review): despite its name, `count_vect` is a TfidfVectorizer, so the
# TfidfTransformer below is applied to values that are already tf-idf
# weighted -- confirm that this double weighting is intended.
count_vect = TfidfVectorizer(lowercase=True,
                             tokenizer=lambda text: word_tokenize(text))
x_train_counts = count_vect.fit_transform(description_list)

tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

# Hold out 20% of the samples for testing.
train_x, test_x, train_y, test_y = train_test_split(x_train_tfidf,
                                                    varietal_list,
                                                    test_size=0.2)

# One-vs-rest linear SVC, fitted on the training split.
clf = OneVsRestClassifier(SVC(kernel='linear',
                              gamma=0.5)).fit(train_x, train_y)
# `y_score` holds predicted labels (the output of predict).
y_score = clf.predict(test_x)

# Count the predictions that match the true label.
n_right = 0
for i in range(len(y_score)):
    if y_score[i] == test_y[i]:
        n_right += 1

print("Accuracy: %.2f%%" % ((n_right / float(len(test_y)) * 100)))
print(classification_report(test_y, y_score))

test = clf.predict(count_vect.transform(["Ur a f****n idiot!"]))
if test == 0:
    print("No")
else:
    for token in "Ur a f****n idiot!":
        if token in wordBully:
def oneVRest(x, y, x_test):
    """Fit a one-vs-rest LinearSVC on (x, y) and return predictions for x_test."""
    model = OneVsRestClassifier(LinearSVC())
    return model.fit(x, y).predict(x_test)
Ejemplo n.º 53
0
class KOMD(BaseEstimator, ClassifierMixin):
    """KOMD.

    KOMD is a kernel method for classification and ranking.

    Read more in http://www.math.unipd.it/~dasan/papers/km-omd.icann08.pdf
    by F. Aiolli, G. Da San Martino, and A. Sperduti.

    For details on the precise mathematical formulation of the provided
    kernel functions and how `gamma`, `coef0` and `degree` affect each
    other, see the corresponding section in the narrative documentation:
    :ref:`svm_kernels`.

    Parameters
    ----------
    lam : float, (default=0.1)
        Specifies the lambda value, between 0.0 and 1.0.

    kernel : optional (default='rbf')
        Specifies the kernel function used by the algorithm.
        It must be one of 'linear', 'poly', 'rbf', 'precomputed' or a
        callable. If None is given, 'rbf' will be used. If a callable is
        given it is called as ``kernel(X, Y)`` to compute the kernel matrix
        from data matrices. With 'precomputed', the matrix passed as ``X``
        is used directly as the kernel matrix.

    rbf_gamma : float, optional (default=0.1)
        Coefficient for 'rbf' and 'poly' kernels.
        Ignored by all other kernels.

    degree : float, optional (default=2.0)
        Specifies the degree of the 'poly' kernel.
        Ignored by all other kernels.

    coef0 : float, optional (default=0.0)
        Specifies the coeff0 in a polynomial kernel.
        Ignored by all other kernels.

    max_iter : int, optional (default=100)
        Hard limit on iterations within solver, it can't be negative.

    verbose : bool, (default=False)
        Enable verbose output during fit.

    multiclass_strategy : string, optional (default='ova')
        Specifies the strategy used in case of multiclass.
        'ova' for one_vs_all pattern (also called one_vs_rest),
        'ovo' for one_vs_one pattern.
        With other unexpected string, 'ova' pattern is used.

    Attributes
    ----------
    gamma : array-like, shape = [n_samples]
        probability-like vector that define the distance vector
        over the two class.

    classes_ : array-like, shape = [n_classes]
        Vector that contains all possible labels

    multiclass_ : boolean,
        True if the number of classes > 2

    Examples
    --------
    >>> import numpy as np
    >>> from ??.komd import KOMD
    >>> X = np.array([[1,2,i] for i in range(5)])
    >>> Y = np.array([1,1,1,-1,-1])
    >>> cls = KOMD()
    >>> cls = cls.fit(X,Y)
    >>> pred = cls.predict([[1,1,5]])

    References
    ----------
    `A Kernel Method for the Optimization of the Margin Distribution
    <http://www.math.unipd.it/~dasan/papers/km-omd.icann08.pdf>`__
    """
    def __init__(self,
                 lam=0.1,
                 kernel='rbf',
                 rbf_gamma=0.1,
                 degree=2.0,
                 coef0=0.0,
                 max_iter=100,
                 verbose=False,
                 multiclass_strategy='ova'):
        self.lam = lam
        self.gamma = None
        self.bias = None
        self.X = None
        self.Y = None
        self.is_fitted = False
        self.rbf_gamma = rbf_gamma
        self.degree = degree
        self.coef0 = coef0
        self.max_iter = max_iter
        self.verbose = verbose
        self.kernel = kernel
        self.multiclass_strategy = multiclass_strategy
        self.multiclass_ = None
        self.classes_ = None
        self._pairwise = self.kernel == 'precomputed'

    def __kernel_definition__(self):
        """Select the kernel function

        Returns
        -------
        kernel : a callable relative to selected kernel

        Raises
        ------
        ValueError
            If `kernel` is neither a callable nor a supported name.
        """
        if callable(self.kernel):
            return self.kernel
        if self.kernel is None or self.kernel == 'rbf':
            return lambda X, Y: rbf_kernel(X, Y, self.rbf_gamma)
        if self.kernel == 'poly':
            return lambda X, Y: polynomial_kernel(X,
                                                  Y,
                                                  degree=self.degree,
                                                  gamma=self.rbf_gamma,
                                                  coef0=self.coef0)
        if self.kernel == 'linear':
            return lambda X, Y: linear_kernel(X, Y)
        if self.kernel == 'precomputed':
            # X already is the kernel matrix
            return lambda X, Y: X
        # previously fell through and returned None, which only failed later
        # with an unrelated "'NoneType' object is not callable" error
        raise ValueError("Unsupported kernel: %r" % (self.kernel,))

    def fit(self, X, Y):
        """Fit the model according to the given training data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Matrix of the examples, where
            n_samples is the number of samples and
            n_feature is the number of features

        Y : array-like, shape = [n_samples]
            array of the labels relative to X

        Returns
        -------
        self : object
            Returns self

        Raises
        ------
        ValueError
            If fewer than two distinct classes are present in Y.
        """
        X, Y = validation.check_X_y(X,
                                    Y,
                                    dtype=np.float64,
                                    order='C',
                                    accept_sparse='csr')
        check_classification_targets(Y)

        self.classes_ = np.unique(Y)
        if len(self.classes_) < 2:
            raise ValueError(
                "The number of classes has to be at least 2; got %d" %
                len(self.classes_))

        if len(self.classes_) == 2:
            self.multiclass_ = False
            return self._fit(X, Y)

        # more than two classes: delegate to a meta-estimator
        self.multiclass_ = True
        if self.multiclass_strategy == 'ovo':
            return self._one_vs_one(X, Y)
        return self._one_vs_rest(X, Y)

    def _one_vs_one(self, X, Y):
        """Fit a OneVsOneClassifier built on a KOMD with the same parameters."""
        self.cls = OneVsOneClassifier(KOMD(**self.get_params())).fit(X, Y)
        self.is_fitted = True
        return self

    def _one_vs_rest(self, X, Y):
        """Fit a OneVsRestClassifier built on a KOMD with the same parameters."""
        self.cls = OneVsRestClassifier(KOMD(**self.get_params())).fit(X, Y)
        self.is_fitted = True
        return self

    def _fit(self, X, Y):
        """Solve the binary problem as a quadratic program.

        Stores the training data, the solver solution (`gamma`), the bias
        and the training kernel matrix on the instance.

        Returns
        -------
        self : object
        """
        self.X = X
        values = np.unique(Y)
        # map the larger label value to +1 and the other one to -1
        Y = [1 if label == values[1] else -1 for label in Y]
        self.Y = Y
        npos = Y.count(1)
        nneg = Y.count(-1)
        YY = matrix(np.diag(list(matrix(Y))))

        Kf = self.__kernel_definition__()
        ker_matrix = matrix(Kf(X, X).astype(np.double))
        KLL = (1.0 - self.lam) * YY * ker_matrix * YY
        LID = matrix(
            np.diag([self.lam * (npos * nneg / (npos + nneg))] * len(Y)))
        # quadratic program handed to the solver:
        #   objective  Q = 2 * (KLL + LID), p = 0
        #   inequality -I * x <= 0 (every weight is non-negative)
        #   equality   the weights of each class sum to 1
        Q = 2 * (KLL + LID)
        p = matrix([0.0] * len(Y))
        G = -matrix(np.diag([1.0] * len(Y)))
        h = matrix([0.0] * len(Y), (len(Y), 1))
        A = matrix([[1.0 if lab == +1 else 0 for lab in Y],
                    [1.0 if lab2 == -1 else 0 for lab2 in Y]]).T
        b = matrix([[1.0], [1.0]], (2, 1))

        # NOTE: these are module-level solver options
        solvers.options['show_progress'] = False
        solvers.options['maxiters'] = self.max_iter
        sol = solvers.qp(Q, p, G, h, A, b)
        self.gamma = sol['x']
        if self.verbose:
            print('[KOMD]')
            print('optimization finished, #iter = %d' % sol['iterations'])
            print('status of the solution: %s' % sol['status'])
            print('objval: %.5f' % sol['primal objective'])

        bias = 0.5 * self.gamma.T * ker_matrix * YY * self.gamma
        self.bias = bias
        self.is_fitted = True
        self.ker_matrix = ker_matrix
        return self

    def predict(self, X):
        """Perform classification on samples in X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Matrix containing new samples

        Returns
        -------
        y_pred : array, shape = [n_samples]
            The value of prediction for each sample

        Raises
        ------
        NotFittedError
            If the estimator has not been fitted.
        """

        if not self.is_fitted:
            raise NotFittedError(
                "This KOMD instance is not fitted yet. Call 'fit' with appropriate arguments before using this method."
            )
        X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C")
        if self.multiclass_:
            return self.cls.predict(X)

        # binary case: a non-negative score selects the second class
        return np.array([
            self.classes_[1] if p >= 0 else self.classes_[0]
            for p in self.decision_function(X)
        ])

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (`deep` is ignored)."""
        return {
            "lam": self.lam,
            "kernel": self.kernel,
            "rbf_gamma": self.rbf_gamma,
            "degree": self.degree,
            "coef0": self.coef0,
            "max_iter": self.max_iter,
            "verbose": self.verbose,
            "multiclass_strategy": self.multiclass_strategy
        }

    def set_params(self, **parameters):
        """Set the given attributes and return self."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        # keep the pairwise flag in sync with a possibly changed kernel
        self._pairwise = self.kernel == 'precomputed'
        return self

    def decision_function(self, X):
        """Distance of the samples in X to the separating hyperplane.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        Z : array-like, shape = [n_samples, 1]
            Returns the decision function of the samples.

        Raises
        ------
        NotFittedError
            If the estimator has not been fitted.
        """

        if not self.is_fitted:
            raise NotFittedError(
                "This KOMD instance is not fitted yet. Call 'fit' with appropriate arguments before using this method."
            )
        X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C")

        if self.multiclass_:
            return self.cls.decision_function(X)

        Kf = self.__kernel_definition__()
        YY = matrix(np.diag(list(matrix(self.Y))))
        # kernel between the new samples and the stored training samples
        ker_matrix = matrix(Kf(X, self.X).astype(np.double))
        z = ker_matrix * YY * self.gamma
        z = z - self.bias
        return np.array(list(z))
                                      random_state=45,
                                      max_features=None,
                                      warm_start=True,
                                      presort='auto',
                                      init=None)

# Fit the model on the training split and score it on the test split.
# NOTE(review): the score is not stored or printed; in a plain script the
# value is discarded.
gr.fit(x_train, y_train)
gr.score(x_test, y_test)

# Final validation: refit each model on the sample data and compute the
# micro-averaged F1 score of its predictions on `test`.
from sklearn.metrics import f1_score

# 1. One-vs-rest classifier with random forest

tuned_clf.fit(x_sample, y_sample)
prediction = tuned_clf.predict(test)
f1_score(test_y_cat, prediction, average='micro')

# 2. Gradient boosting classifier nested in a one-vs-rest classifier

gr.fit(x_sample, y_sample)
prediction = gr.predict(test)
f1_score(test_y_cat, prediction, average='micro')

# 3. Extra trees classifier

tuned_tree.fit(x_sample, y_sample)
prediction = tuned_tree.predict(test)
f1_score(test_y_cat, prediction, average='micro')

###################################################################################################################################################
Ejemplo n.º 55
0
def main():
    """Run the aspect-detection and per-aspect sentiment pipeline.

    Steps, in order:
      1. load and pre-process the training and testing XML data;
      2. write the most common aspects and the feature tables to ./data;
      3. train one-vs-rest Naive Bayes, random forest and linear SVM
         classifiers for aspect detection on bag-of-words features;
      4. train positive / negative / neutral sentiment classifiers per
         aspect via ``classify_sentiment``.

    Nothing is returned; the trained classifiers and predictions are local.
    """
    ROOT_PATH = "/home/vng/Documents/KD-DM/Project3/sentiment-analysis"
    data_directory = os.path.join(ROOT_PATH, "data")

    # load data for pre-processing
    train_texts_list, train_opinions_list = load_data(data_directory,
                                                      "Training.xml")
    test_texts_list, test_opinions_list = load_data(data_directory,
                                                    "Testing.xml")

    # pre-process training and testing data
    train_tokens_list = tokenize_data(train_texts_list)
    train_texts_list = clean_data(train_tokens_list)

    test_tokens_list = tokenize_data(test_texts_list)
    test_texts_list = clean_data(test_tokens_list)

    # get the most common aspects and save them, one per line
    most_common_aspects = get_most_common_aspects(train_opinions_list)
    # the context manager closes the file even if a write fails
    with open("./data/common-aspects.txt", "w") as wr:
        for aspect in most_common_aspects:
            wr.write(aspect + "\n")

    # 1. Aspect Detection
    # convert data format to be corresponding with fit_transform() method
    df_train = get_data_frame(train_texts_list, train_opinions_list,
                              most_common_aspects)
    df_train.to_csv("./data/training-features.csv")
    df = normalize_aspect_data_frame(
        df_train, most_common_aspects)  # aspects mentioned are assigned to 1
    # re-arrange columns in sorted order; `reindex(columns=...)` replaces
    # `reindex_axis`, which is no longer available in current pandas
    df_train_aspect = df.reindex(columns=sorted(df.columns))
    X_train = df_train_aspect.Review  # split Review column of df to X_train set
    # the rest of df is y_train; `axis` is passed by keyword because the
    # positional form is deprecated in pandas
    y_train = df_train_aspect.drop('Review', axis=1)
    y_train = np.asarray(y_train, dtype=np.int64)

    df_test = get_data_frame(test_texts_list, test_opinions_list,
                             most_common_aspects)
    df_test.to_csv("./data/testing-features.csv")
    df = normalize_aspect_data_frame(df_test, most_common_aspects)
    df_test_aspect = df.reindex(columns=sorted(df.columns))
    X_test = df_test_aspect.Review
    y_test = df_test_aspect.drop('Review', axis=1)
    y_test = np.asarray(y_test, dtype=np.int64)

    # bag-of-words features: vocabulary is fitted on the training reviews only
    vect = CountVectorizer(lowercase=False,
                           max_df=1.0,
                           stop_words='english',
                           max_features=2000)
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)

    # NOTE(review): the three prediction arrays below are computed but not
    # used afterwards in this function.
    nb_classifier = OneVsRestClassifier(MultinomialNB()).fit(
        X_train_dtm, y_train)
    y_predicted_nb = nb_classifier.predict(X_test_dtm)

    rf_classifier = OneVsRestClassifier(RandomForestClassifier()).fit(
        X_train_dtm, y_train)
    y_predicted_rf = rf_classifier.predict(X_test_dtm)

    svm_classifier = OneVsRestClassifier(svm.SVC(C=1.0, kernel='linear')).fit(
        X_train_dtm, y_train)
    y_predicted_svm = svm_classifier.predict(X_test_dtm)

    # 2. Sentiment Detection for each Aspect
    # construct extra data
    dict_of_aspects = create_dict_of_aspect(y_train, most_common_aspects)
    aspect_vectorizer = DictVectorizer()
    X_train_aspect_dtm = aspect_vectorizer.fit_transform(dict_of_aspects)

    dict_of_aspects = create_dict_of_aspect(y_test, most_common_aspects)
    X_test_aspect_dtm = aspect_vectorizer.transform(dict_of_aspects)

    # The data frames are rebuilt before each get_*_data_frame call;
    # presumably those helpers modify their input in place -- TODO confirm.
    # construct original train data
    df_train = get_data_frame(train_texts_list, train_opinions_list,
                              most_common_aspects)
    df_test = get_data_frame(test_texts_list, test_opinions_list,
                             most_common_aspects)
    # for positive aspect detection
    df_train_pos = get_pos_data_frame(df_train, most_common_aspects)
    df_test_pos = get_pos_data_frame(df_test, most_common_aspects)
    # for negative aspect detection
    df_train = get_data_frame(train_texts_list, train_opinions_list,
                              most_common_aspects)
    df_test = get_data_frame(test_texts_list, test_opinions_list,
                             most_common_aspects)
    df_train_neg = get_neg_data_frame(df_train, most_common_aspects)
    df_test_neg = get_neg_data_frame(df_test, most_common_aspects)
    # for neutral or conflict aspect detection
    df_train = get_data_frame(train_texts_list, train_opinions_list,
                              most_common_aspects)
    df_test = get_data_frame(test_texts_list, test_opinions_list,
                             most_common_aspects)
    df_train_neu = get_neu_data_frame(df_train, most_common_aspects)
    df_test_neu = get_neu_data_frame(df_test, most_common_aspects)

    nb_pos_aspect_classifier, rf_pos_aspect_classifier, svm_pos_aspect_classifier = classify_sentiment(
        df_train_pos, X_train_aspect_dtm, df_test_pos, X_test_aspect_dtm)
    nb_neg_aspect_classifier, rf_neg_aspect_classifier, svm_neg_aspect_classifier = classify_sentiment(
        df_train_neg, X_train_aspect_dtm, df_test_neg, X_test_aspect_dtm)
    nb_neu_aspect_classifier, rf_neu_aspect_classifier, svm_neu_aspect_classifier = classify_sentiment(
        df_train_neu, X_train_aspect_dtm, df_test_neu, X_test_aspect_dtm)
Ejemplo n.º 56
0
def main():
    """Train, evaluate and persist a one-vs-rest linear SVM speech-act model.

    Steps, in the order they are executed below:

    1. Load the corpus through ``Classifier('swa', '../Data/swda/')`` and
       split it with ``getTrainAndTestData()``.
    2. Populate the bag-of-words space from the training data and write it,
       comma separated, to ``../Analysis/space.txt``.
    3. Featurize the training data twice (bag-of-words and the classifier's
       own hand-crafted features) and combine both feature sets.
    4. Normalize the training labels and fit
       ``OneVsRestClassifier(SVC(kernel='linear'))``.
    5. Featurize the test data the same way, predict, and print the number
       of correct / wrong predictions, the accuracy, the classification
       report and the rounded accuracy score.
    6. Pickle ``classifier`` and the fitted model to ``classifier.p`` and
       ``clf.p`` in the current working directory.

    The print calls use a single pre-formatted argument so that the output is
    the same under Python 2 (where ``print`` is a statement) and Python 3.

    :return: None
    """
    # Bag of Words
    classifier = Classifier('swa', '../Data/swda/')
    bagofwords = BagOfWords()
    dataStartTime = time()
    classifier.getData()
    dataEndTime = time()
    print("Data loaded in %s sec" % (dataEndTime - dataStartTime,))

    # get test and train data
    classifier.getTrainAndTestData()

    populateSpaceStartTime = time()
    # populate space
    print("classifier.trainData")
    print(classifier.trainData)
    bagofwords.populateSpace(classifier.trainData)
    populateSpaceEndTime = time()
    print("Space populated extracted in %s sec"
          % (populateSpaceEndTime - populateSpaceStartTime,))
    print("Space length: %s" % (len(bagofwords.space),))

    # The context manager guarantees the file is flushed and closed even if
    # the join/write raises.
    with open('../Analysis/space.txt', 'w') as space_file:
        space_file.write(','.join(bagofwords.space))

    featureStartTime = time()
    # bag-of-words features for the training data
    feature_vectors_bow, speech_acts, utter_text = bagofwords.featurize(
        classifier.trainData)
    featureEndTime = time()
    print("Feature extracted in %s sec" % (featureEndTime - featureStartTime,))
    print("feature_vectors_bow %s" % (len(feature_vectors_bow),))

    featureStartTime = time()
    # hand-crafted features for the training data
    feature_vectors_cust, speech_acts, utter_text = classifier.featurize(
        classifier.trainData)
    featureEndTime = time()
    print("Feature extracted in %s sec" % (featureEndTime - featureStartTime,))
    print("feature_vectors_cust %s" % (len(feature_vectors_cust),))
    feature_vectors = classifier.combineFeatureVectors(feature_vectors_bow,
                                                       feature_vectors_cust)
    print(len(feature_vectors))

    # normalize speech acts into classes
    classifier.normalizeSpeechAct(speech_acts)
    classifier.findmajorityclass(speech_acts)

    # train
    trainStartTime = time()
    clf = OneVsRestClassifier(
        SVC(C=1, kernel='linear', gamma=1, verbose=False, probability=False))
    clf.fit(feature_vectors, speech_acts)
    trainEndTime = time()
    print("Model trained in %s sec" % (trainEndTime - trainStartTime,))

    # The gold test labels come from the bag-of-words featurizer; the labels
    # returned by the second featurize() call are not used afterwards.
    feature_vectors_bow, labelled_speech_acts, utter_text = bagofwords.featurize(
        classifier.testData)
    print("len(feature_vectors_bow[0]) %s" % (len(feature_vectors_bow[0]),))
    feature_vectors_cust, speech_acts, utter_text = classifier.featurize(
        classifier.testData)
    print("len(feature_vectors_cust[0]) %s" % (len(feature_vectors_cust[0]),))

    feature_vectors = classifier.combineFeatureVectors(feature_vectors_bow,
                                                       feature_vectors_cust)

    # normalize speech act for test data
    classifier.normalizeSpeechActTest(labelled_speech_acts)

    predictionStartTime = time()
    # predict speech act for test
    predicted_speech_act = clf.predict(feature_vectors)
    predictionEndTime = time()
    print("Prediction time %s" % (predictionEndTime - predictionStartTime,))

    # NOTE(review): presumably rewrites the predictions/labels in place so the
    # two label sets are comparable -- confirm against Classifier.
    classifier.normalizePrediction(predicted_speech_act, labelled_speech_acts)
    print("%s %s" % (set(predicted_speech_act), set(labelled_speech_acts)))

    total_correct = sum(
        1 for predicted, actual in zip(predicted_speech_act,
                                       labelled_speech_acts)
        if predicted == actual)
    total_wrong = len(predicted_speech_act) - total_correct

    print("total_correct %s" % (total_correct,))
    print("total wrong %s" % (total_wrong,))
    # 100.0 forces true division; with two ints Python 2 floors the ratio,
    # which reported an accuracy of 0 for anything short of a perfect score.
    print("accuracy %s"
          % (100.0 * total_correct / len(predicted_speech_act),))

    print("Classification_report:\n%s" % (classification_report(
        labelled_speech_acts,
        predicted_speech_act),))
    print("accuracy_score: %s" % (round(
        accuracy_score(labelled_speech_acts, predicted_speech_act), 2),))

    # Close the pickle files deterministically instead of relying on garbage
    # collection of the anonymous file objects.
    with open('classifier.p', 'wb') as classifier_file:
        pickle.dump(classifier, classifier_file)
    with open('clf.p', 'wb') as model_file:
        pickle.dump(clf, model_file)
    print("saved")
# Labels are taken from the last column of each fingerprint frame.
# `.values` is used instead of `as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0.
yTrain = train_fingerprint.iloc[:, -1]
yTrain = yTrain.values

# Every column but the last is a feature; a leading column of ones is
# prepended to the test features.
xTest = test_fingerprint.iloc[:, :-1]
xTest = np.c_[np.ones((xTest.shape[0])), xTest]
yTest = test_fingerprint.iloc[:, -1]
yTest = yTest.values

# Normalize the data set: scale by 255, then subtract the global mean.
xTrain = xTrain / 255
row, column = xTrain.shape[0], xTrain.shape[1]
div = xTrain.sum(axis=1).sum() / (row * column)
xTrain = xTrain - div

# NOTE(review): the test set is centred with its own mean rather than the
# training mean computed above -- confirm this is intended.
xTest = xTest / 255
row, column = xTest.shape[0], xTest.shape[1]
div = xTest.sum(axis=1).sum() / (row * column)
xTest = xTest - div

clf = OneVsRestClassifier(
    SVC(kernel='rbf', tol=0.03, C=1 / 0.2, gamma=0.03, probability=True))
clf = clf.fit(xTrain, yTrain)

accuracy = clf.score(xTest, yTest)
print('Accuracy of data: ', accuracy * 100)

predLabel = clf.predict(xTest)
correct = np.sum(predLabel == yTest)
print("%d out of %d predictions correct" % (correct, len(predLabel)))
# Report each prediction on its own line: formatting the whole array with
# "%f" only works when the test set holds exactly one sample.
for label in np.atleast_1d(predLabel):
    print("The fingerprint belongs to person %f" % label)
def _save_confusion_matrix_plot(matrix, class_labels, title, output_path):
    """Draw *matrix* as a heat map labelled with *class_labels* and save it."""
    plt.figure(figsize=(10, 10), dpi=100)

    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(class_labels))
    plt.xticks(tick_marks, class_labels, rotation=90)
    plt.yticks(tick_marks, class_labels)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    plt.savefig(output_path)


def accuracyCalc():
    """Run 10-fold stratified cross-validation and report/plot the results.

    Uses the module-level ``X``, ``y`` and ``vec``. For every fold a
    ``OneVsRestClassifier(LinearSVC(random_state=0))`` is fitted, the
    accuracy and micro/macro precision/recall/F1 are printed, and the fold's
    true and predicted labels are accumulated. After the last fold the
    pooled metrics and the raw and row-normalized confusion matrices are
    printed, and both matrices are saved as PNG files.

    The tick labels of the plots come from ``classes_`` of the classifier
    fitted on the last fold.
    """
    final_true = []
    final_pred = []
    kf_total = StratifiedKFold(y, n_folds=10, shuffle=True)
    for train, test in kf_total:
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        # One fit per fold is enough; the estimator is deterministic
        # (random_state=0), so fitting it a second time changed nothing.
        classifier = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        most_informative_feature_for_class(vec, classifier, "FAQ")
        most_informative_feature_for_class(vec, classifier, "Prodotto")
        most_informative_feature_for_class(vec, classifier, "OrdiniAccountPersonali")

        # TODO(review): filtering of the (coef, feat) pairs returned for
        # "FAQ" was started here but never written; add it once the
        # selection criterion is defined.

        print("accuracy:", accuracy_score(y_test, y_pred))
        print("P/R/F1 micro:", precision_recall_fscore_support(y_test, y_pred, average="micro"))
        print("P/R/F1 macro:", precision_recall_fscore_support(y_test, y_pred, average="macro"))

        final_true.extend(y_test)
        final_pred.extend(y_pred)

    print("Final accuracy:", accuracy_score(final_true, final_pred))
    # Each label now matches the averaging mode actually passed.
    print("P/R/F1 Final micro:", precision_recall_fscore_support(final_true, final_pred, average="micro"))
    print("P/R/F1 Final macro:", precision_recall_fscore_support(final_true, final_pred, average="macro"))

    cm = confusion_matrix(final_true, final_pred)
    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization')
    print(cm)
    _save_confusion_matrix_plot(
        cm, classifier.classes_, "Confusion Matrix",
        "/Users/giulia/Desktop/Awhy_Classifier_Refiner/twoStepsClassifier/Plot/RandomForestClassifier10plot1_ALL.png")

    # Divide every row by its total so each row sums to 1.
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print('Normalized confusion matrix')
    print(cm_normalized)
    _save_confusion_matrix_plot(
        cm_normalized, classifier.classes_, "Normalized Confusion Matrix",
        "/Users/giulia/Desktop/Awhy_Classifier_Refiner/twoStepsClassifier/Plot/RandomForestClassifier10plot2_ALL.png")
Ejemplo n.º 59
0
def PR_ovr_classifier(X, Y):
    """Fit a one-vs-rest LinearSVC and plot its precision-recall curves.

    ``Y`` is binarized against the classes 0, 1 and 2, the data is split
    80/20 with a fixed seed, and ``OneVsRestClassifier(LinearSVC(C=3))`` is
    fitted on the training part. The hold-out accuracy is printed, then two
    figures are shown: the micro-averaged precision-recall curve and one
    precision-recall curve per class.

    NOTE(review): the printed message mentions "10 fold cross validation",
    but the value comes from the single hold-out split made here.
    """
    print("Shape of X", X.shape)

    n_classes = 3
    Y = label_binarize(Y, classes=[0, 1, 2])

    split = train_test_split(X, Y, test_size=0.2, random_state=1234)
    X_train, X_test, Y_train, Y_test = split

    model = OneVsRestClassifier(LinearSVC(C=3, max_iter=10000))
    model.fit(X_train, Y_train)
    holdout_accuracy = accuracy_score(Y_test, model.predict(X_test))
    print('For value of C = 3, best 10 fold cross validation accuracy = %f' %
          holdout_accuracy)

    scores = model.decision_function(X_test)

    # Per-class curves, keyed by class index.
    precision, recall, average_precision = {}, {}, {}
    for cls in range(n_classes):
        truth_column = Y_test[:, cls]
        score_column = scores[:, cls]
        curve = precision_recall_curve(truth_column, score_column)
        precision[cls], recall[cls] = curve[0], curve[1]
        average_precision[cls] = average_precision_score(truth_column,
                                                         score_column)

    # Micro average: every (sample, class) decision is pooled into one curve.
    micro_curve = precision_recall_curve(Y_test.ravel(), scores.ravel())
    precision["micro"], recall["micro"] = micro_curve[0], micro_curve[1]
    average_precision["micro"] = average_precision_score(
        Y_test, scores, average="micro")

    # Figure 1: micro-averaged curve.
    plt.figure()
    plt.step(recall['micro'], precision['micro'], where='post')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    micro_title = 'Average precision score, over all classes: AP={0:0.2f}'
    plt.title(micro_title.format(average_precision["micro"]))
    plt.show()

    # Figure 2: one curve per class, each in its own colour.
    palette = cycle(['red', 'yellow', 'green'])
    label_template = 'Precision-recall for class {0} (area = {1:0.2f})'
    lines = []
    labels = []
    for cls, colour in zip(range(n_classes), palette):
        handle, = plt.plot(recall[cls], precision[cls], color=colour, lw=2)
        lines.append(handle)
        labels.append(label_template.format(cls, average_precision[cls]))

    # Leave room under the axes for the legend placed below the plot.
    current_figure = plt.gcf()
    current_figure.subplots_adjust(bottom=0.25)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Extension of Precision-Recall curve to multi-class')
    plt.legend(lines, labels, loc=(0, -.38), prop={'size': 14})

    plt.show()
Ejemplo n.º 60
0
# Encode the targets with the label binarizer `lb`; the same binarizer is
# used further down to map predictions back to labels.
y = lb.fit_transform(target)

# Model Training
print ("Train the model ... ")
classifier = SVC(
    C=100,                # penalty parameter, set to a larger value
    kernel='rbf',         # kernel type, rbf working fine here
    degree=3,             # default value, not tuned yet
    gamma=1,              # kernel coefficient, not tuned yet
    coef0=1,              # changed to 1 from the default value of 0.0
    shrinking=True,
    tol=0.001,            # stopping criterion tolerance
    probability=False,    # no need to enable probability estimates
    cache_size=200,       # 200 MB cache size
    class_weight=None,    # all classes are treated equally
    verbose=False,        # do not print solver logs
    max_iter=-1,          # no limit, let it run
    # One-vs-rest is applied explicitly by the wrapper below. `None` is no
    # longer an accepted value for this parameter in current scikit-learn,
    # so the equivalent explicit setting is passed instead.
    decision_function_shape='ovr',
    random_state=None,
)
model = OneVsRestClassifier(classifier, n_jobs=4)
model.fit(X, y)

# Predictions
print ("Predicting on test data ... ")
# Despite its name, `y_test` holds the model's encoded predictions for X_test.
y_test = model.predict(X_test)
y_pred = lb.inverse_transform(y_test)

# Submission
print ("Generating Submission File ... ")
test_id = [doc['id'] for doc in test]
sub = pd.DataFrame({'id': test_id, 'cuisine': y_pred}, columns=['id', 'cuisine'])
sub.to_csv('svm_output.csv', index=False)