Example #1
 def expert_training(self):
     history_context, history_action = self.data_simulation()
     logreg = OneVsRestClassifier(LogisticRegression())
     mnb = OneVsRestClassifier(MultinomialNB())
     logreg.fit(history_context, history_action)
     mnb.fit(history_context, history_action)
     return [logreg, mnb]
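
A minimal usage sketch for the experts returned above, assuming data_simulation yields a non-negative context matrix and one integer action per row (the toy data below is invented for illustration):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

# stand-in for data_simulation(): 100 contexts, 3 possible actions
rng = np.random.RandomState(0)
history_context = rng.randint(0, 5, size=(100, 8))  # non-negative counts, as MultinomialNB requires
history_action = rng.randint(0, 3, size=100)

experts = [OneVsRestClassifier(LogisticRegression()),
           OneVsRestClassifier(MultinomialNB())]
for expert in experts:
    expert.fit(history_context, history_action)

# each expert recommends an action for a fresh context
new_context = rng.randint(0, 5, size=(1, 8))
print([int(expert.predict(new_context)[0]) for expert in experts])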
Example #2
    def _calculate(self, X, y, categorical):
        import numpy as np
        import scipy.linalg
        import sklearn.discriminant_analysis
        import sklearn.metrics
        import sklearn.model_selection
        if len(y.shape) == 1 or y.shape[1] == 1:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
        else:
            kf = sklearn.model_selection.KFold(n_splits=10)

        accuracy = 0.
        try:
            for train, test in kf.split(X, y):
                lda = sklearn.discriminant_analysis.LinearDiscriminantAnalysis()

                if len(y.shape) == 1 or y.shape[1] == 1:
                    lda.fit(X[train], y[train])
                else:
                    lda = OneVsRestClassifier(lda)
                    lda.fit(X[train], y[train])

                predictions = lda.predict(X[test])
                accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
            return accuracy / 10
        except (scipy.linalg.LinAlgError, ValueError) as e:
            self.logger.warning("LDA failed: %s Returned NaN instead!" % e)
            return np.NaN
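
The method above computes an LDA "landmarking" meta-feature: the 10-fold CV accuracy of a plain LDA model. A self-contained sketch of the same idea, with iris standing in for the snippet's X and y:

import sklearn.datasets
import sklearn.discriminant_analysis
import sklearn.metrics
import sklearn.model_selection

X, y = sklearn.datasets.load_iris(return_X_y=True)
kf = sklearn.model_selection.StratifiedKFold(n_splits=10)
accuracy = 0.
for train, test in kf.split(X, y):
    lda = sklearn.discriminant_analysis.LinearDiscriminantAnalysis()
    lda.fit(X[train], y[train])
    accuracy += sklearn.metrics.accuracy_score(y[test], lda.predict(X[test]))
print(accuracy / 10)  # the meta-feature: mean LDA accuracy across folds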
Example #3
def experienceSVMTrain(trainData, testData, testCounts, classifierNumber=0):
    if classifierNumber == 0:
        classifier = OneVsRestClassifier(svm.SVC())
        algorithmName = 'OneVsRestClassifier'
    elif classifierNumber == 1:
        classifier = svm.SVC()
        algorithmName = 'SupportVectorClassifier'
    elif classifierNumber == 2:
        classifier = RandomForestClassifier(n_estimators=1000, n_jobs=4)
        algorithmName = 'RandomForestClassifier'
    else:
        classifier = KNeighborsClassifier(n_neighbors=3)
        algorithmName = 'KNeighborsClassifier'
    print_(algorithmName, 'started training the data at', nowStr())
    classifier.fit(preprocessing.scale(trainData['X']), trainData['Y'])
    print_(algorithmName, 'started predicting the test data at', nowStr())
    predictions = classifier.predict(preprocessing.scale(testData['X']))
    truePositives = 0
    truePositiveCounts = {genre: 0 for genre in genreSet}
    predictionCount = len(predictions)
    for i in range(predictionCount):
        if predictions[i] == testData['Y'][i]:
            truePositives += 1
            truePositiveCounts[genreSet[testData['Y'][i]]] += 1
    print_(algorithmName, 'experiment finished at', nowStr())
    print_('\nGeneral Test Accuracy = %.3f' % (truePositives / float(predictionCount)))
    print('\nTotal Number of predictions:', predictionCount)
    print('Number of true predictions:  ', truePositives)
    print('Number of false predictions: ', predictionCount-truePositives)
    print_('\nTesting distribution:            ', {genre: testCounts[genre] for genre in genreSet})
    print_('Distribution of true predictions: ', truePositiveCounts)
    falseNegativeCounts = {genre: testCounts[genre]-truePositiveCounts[genre] for genre in genreSet}
    print_('Distribution of false negatives:  ', falseNegativeCounts, '\n')
Example #4
def main():
    img_dir = 'images/'
    images = [img_dir + f for f in os.listdir(img_dir)]
    labels = [f.split('/')[-1].split('_')[0] for f in images]
    label2ids = {v: i for i, v in enumerate(sorted(set(labels),
                                                   key=labels.index))}
    y = np.array([label2ids[l] for l in labels])

    data = []
    for image_file in images:
        img = img_to_matrix(image_file)
        img = flatten_image(img)
        data.append(img)
    data = np.array(data)

    # training samples
    is_train = np.random.uniform(0, 1, len(data)) <= 0.7
    train_X, train_y = data[is_train], y[is_train]

    # training a classifier
    # RandomizedPCA was removed from scikit-learn; PCA with the randomized
    # solver is the documented replacement
    pca = PCA(n_components=5, svd_solver='randomized')
    train_X = pca.fit_transform(train_X)
    multi_svm = OneVsRestClassifier(LinearSVC())
    multi_svm.fit(train_X, train_y)

    # evaluating the model
    test_X, test_y = data[~is_train], y[~is_train]
    test_X = pca.transform(test_X)
    print(pd.crosstab(test_y, multi_svm.predict(test_X),
                      rownames=['Actual'], colnames=['Predicted']))
Example #5
def test_ovr_multilabel_predict_proba():
    base_clf = MultinomialNB(alpha=1)
    for au in (False, True):
        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=3,
                                                       length=50,
                                                       allow_unlabeled=au,
                                                       return_indicator=True,
                                                       random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test, Y_test = X[80:], Y[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

        # decision function only estimator. Fails in current implementation.
        decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
        assert_raises(AttributeError, decision_only.predict_proba, X_test)

        # Estimator with predict_proba disabled, depending on parameters.
        decision_only = OneVsRestClassifier(svm.SVC(probability=False))
        decision_only.fit(X_train, Y_train)
        assert_raises(AttributeError, decision_only.predict_proba, X_test)

        Y_pred = clf.predict(X_test)
        Y_proba = clf.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        pred = Y_proba > .5
        assert_array_equal(pred, Y_pred)
Example #6
class ACMClassificator(BaseACMClassificator):
    def __init__(self):
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        self.classificator = OneVsRestClassifier(ExtraTreeClassifier(criterion="gini",
                                                                     max_depth=None,
                                                                     min_samples_split=2,
                                                                     min_samples_leaf=1,
                                                                     min_weight_fraction_leaf=0.,
                                                                     max_features="auto",
                                                                     max_leaf_nodes=None,
                                                                     class_weight=None),
                                                 n_jobs=-1
                                                 )

    def _prepare_problems(self, problems):
        return self.vectorizer.transform([p.statement for p in problems])

    def fit(self, problems, tags):
        nltk.download('punkt', quiet=True)
        self.vectorizer.fit([p.statement for p in problems])
        mat = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        self.classificator.fit(mat.toarray(), self.mlb.transform(tags))

    def predict(self, problems):
        mat = self._prepare_problems(problems)
        predicted = self.classificator.predict(mat.toarray())
        return self.mlb.inverse_transform(predicted)
Example #7
def main():

    dataTuples = getDataInFormat()
    print("Length of dataTuples is: ", len(dataTuples))
    shuffle(dataTuples)
    trainTuples=dataTuples
    del dataTuples
    ids, labels, vectors = getLabelsAndVectors(trainTuples)
    del trainTuples
    followerCountsList = loadFollowerCountsFromFile()
    space = getSpace(vectors)
    reducedSpace = getReducedSpace(vectors, space)
    spaceWithMetaFeatures = augmentSpace(reducedSpace, emotionFeatures)

    print "Total # of features in your space is: ", len(space)
    print "Total # of features in your reducedSpace is: ", len(reducedSpace)
    oneHotVectors = getOneHotVectors(ids, labels, vectors, spaceWithMetaFeatures, followerCountsList)
    trainVectors, trainLabels = getOneHotVectorsAndLabels(oneHotVectors)
    del oneHotVectors
    clf = OneVsRestClassifier(SVC(C=1, kernel='linear', gamma=0.1, verbose=False, probability=False))
    clf.fit(trainVectors, trainLabels)
    
    print "\nDone fitting classifier on training data...\n"
    print "\nDone fitting classifier on training data...\n"
    print "="*50, "\n"
    print "Results with 10-fold cross validation:\n"
    print "="*50, "\n"
    predicted = cross_validation.cross_val_predict(clf, trainVectors, trainLabels, cv=10)
    print "*"*20
    print "\t accuracy_score\t", metrics.accuracy_score(trainLabels, predicted)
    print "*"*20
    print "precision_score\t", metrics.precision_score(trainLabels, predicted)
    print "recall_score\t", metrics.recall_score(trainLabels, predicted)
    print "\nclassification_report:\n\n", metrics.classification_report(trainLabels, predicted)
    print "\nconfusion_matrix:\n\n", metrics.confusion_matrix(trainLabels, predicted)
Example #8
def test_ovr_partial_fit():
    # Test if partial_fit is working as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)

    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == pred), 0.65)

    # Test when mini-batches don't contain all classes
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovr.partial_fit(iris.data[60:], iris.target[60:])
    pred = ovr.predict(iris.data)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(iris.data, iris.target).predict(iris.data)
    
    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == pred), 0.65)
Example #9
def test_ovr_partial_fit():
    # Test if partial_fit is working as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)

    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == pred), 0.65)

    # Test when mini-batches don't contain all classes,
    # with SGDClassifier
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]

    ovr = OneVsRestClassifier(SGDClassifier(max_iter=1, tol=None,
                                            shuffle=False, random_state=0))
    ovr.partial_fit(X[:7], y[:7], np.unique(y))
    ovr.partial_fit(X[7:], y[7:])
    pred = ovr.predict(X)
    ovr1 = OneVsRestClassifier(SGDClassifier(max_iter=1, tol=None,
                                             shuffle=False, random_state=0))
    pred1 = ovr1.fit(X, y).predict(X)
    assert_equal(np.mean(pred == y), np.mean(pred1 == y))

    # test partial_fit only exists if estimator has it:
    ovr = OneVsRestClassifier(SVC())
    assert_false(hasattr(ovr, "partial_fit"))
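
The pattern both tests exercise, reduced to a minimal out-of-core sketch: declare the full class list on the first partial_fit call, then stream batches.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

X, y = load_iris(return_X_y=True)
ovr = OneVsRestClassifier(MultinomialNB())
classes = np.unique(y)              # must be known up front
for start in range(0, len(X), 30):  # feed the data in batches of 30
    batch = slice(start, start + 30)
    if start == 0:
        ovr.partial_fit(X[batch], y[batch], classes)
    else:
        ovr.partial_fit(X[batch], y[batch])
print(ovr.score(X, y))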
Example #10
def main():
    word_vec_dict = readGloveData("./glove.twitter.27B/glove.twitter.27B.25d.txt")
    tweets = readTweets("./dataset_raw/semeval2016-task6-trainingdata.txt")

    tweetVectors = getTweetVectors(tweets[0 : len(tweets) - 1], word_vec_dict)
    print(tweets[0])
    print(getSumVectors(tweets[0], word_vec_dict))
    tweetClasses = set(tweets[-1])

    mapping = {"favor": 1, "none": 0, "against": 1}

    tweetClasses = np.asarray([mapping[x] for x in tweets[-1]])
    tweetData = np.asarray(tweetVectors)
    print(tweetClasses.shape)
    print(tweetData.shape)
    X = tweetData
    Y = tweetClasses
    clf = OneVsRestClassifier(LinearSVC())
    # clf = SVC(kernel='rbf', gamma=1.5, random_state=34543)
    X_train = X[0 : int(0.7 * len(X))]
    y_train = Y[0 : int(0.7 * len(Y))]
    X_test = X[int(0.7 * len(X)) : len(X)]
    y_test = Y[int(0.7 * len(Y)) : len(Y)]
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    y_pred = clf.predict(X_test)
    for indexMax in range(len(y_test)):
        print(str(y_pred[indexMax]) + " " + str(y_test[indexMax]))
Example #11
File: svm.py Project: lkprof/sema
def svm():
    # load data
    x_train, y_train = load_svmlight_file("12trainset")
    x_train.todense()  # note: the densified copy is discarded; x_train stays sparse
    x_test, y_test = load_svmlight_file("12testdata")
    x_test.todense()   # same here
    sk = SelectKBest(f_classif, k=9).fit(x_train, y_train)
    x_new = sk.transform(x_train)
    x_newtest = sk.transform(x_test)
    print(sk.scores_)
    print(x_new.shape)
    print(sk.get_support())
    # classifier
    clf = SVC(C=2, gamma=2)
    ovrclf = OneVsRestClassifier(clf, n_jobs=-1)  # n_jobs is keyword-only in newer scikit-learn
    ovrclf.fit(x_train, y_train)
    y_pred = ovrclf.predict(x_test)
    # write result
    with open("result.txt","w") as fw:
        for st in y_pred.tolist():
            fw.write(str(st)+'\n')
    print(np.array(y_pred).shape)

    target_names=['0','1','2','3']
    #result
    #sum_y = np.sum((np.array(y_pred)-np.array(y_test))**2)
    #print(classification_report(y_test,y_pred,target_names=target_names))
    #print("sougouVal: ",float(sum_y)/y_pred.shape[0])
    print(time.time()-start_time)
Example #12
def train_svm(X, y):
    """
    Create and train the Support Vector Machine.
    """
    svm = OneVsRestClassifier(SVC(C=1000000.0, gamma='auto', kernel='rbf'))
    svm.fit(X, y)
    return svm
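
A usage sketch for the helper above; make_classification is used here purely for illustration:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=10, n_informative=5,
                           n_classes=3, random_state=0)
model = train_svm(X[:150], y[:150])   # the helper defined above
print(model.score(X[150:], y[150:]))  # held-out accuracy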
Example #13
def test_classifier_chain_vs_independent_models():
    # Verify that an ensemble of classifier chains (each of length
    # N) can achieve a higher Jaccard similarity score than N independent
    # models
    # note: fetch_mldata and jaccard_similarity_score were later removed from
    # scikit-learn (fetch_openml and jaccard_score are the replacements)
    yeast = fetch_mldata('yeast')
    X = yeast['data']
    Y = yeast['target'].transpose().toarray()
    X_train = X[:2000, :]
    X_test = X[2000:, :]
    Y_train = Y[:2000, :]
    Y_test = Y[2000:, :]

    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)

    chain = ClassifierChain(LogisticRegression(),
                            order=np.array([0, 2, 4, 6, 8, 10,
                                            12, 1, 3, 5, 7, 9,
                                            11, 13]))
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    assert_greater(jaccard_similarity_score(Y_test, Y_pred_chain),
                   jaccard_similarity_score(Y_test, Y_pred_ovr))
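
Since fetch_mldata and jaccard_similarity_score are gone from current scikit-learn, an equivalent comparison can be sketched with synthetic multilabel data and jaccard_score (assuming a release that has ClassifierChain and jaccard_score):

from sklearn.datasets import make_multilabel_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import jaccard_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain

X, Y = make_multilabel_classification(n_samples=600, n_classes=5, random_state=0)
X_train, X_test, Y_train, Y_test = X[:400], X[400:], Y[:400], Y[400:]

ovr = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X_train, Y_train)
chain = ClassifierChain(LogisticRegression(max_iter=1000), order='random',
                        random_state=0).fit(X_train, Y_train)

print('OvR:  ', jaccard_score(Y_test, ovr.predict(X_test), average='samples'))
print('chain:', jaccard_score(Y_test, chain.predict(X_test), average='samples'))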
Example #14
  def train(self, trainfile_name):
    print >>sys.stderr, "Reading data.."
    train_data = [tuple(x.strip().split("\t")) for x in codecs.open(trainfile_name, "r", "utf-8")]
    shuffle(train_data)
    filter_feature = get_filter()
    train_labels, train_clauses = zip(*train_data)
    train_labels = [tl.lower() for tl in train_labels]
    print >>sys.stderr, "Indexing features.."
    self.fp.index_data(train_clauses, filter_feature)
    X = numpy.asarray([self.fp.featurize(clause, filter_feature) for clause in train_clauses])
    tagset = list(set(train_labels))
    tag_index = {l:i for (i, l) in enumerate(tagset)}
    Y = numpy.asarray([[tag_index[label]] for label in train_labels])

    classifier = OneVsRestClassifier(SVC(kernel='linear'))
    if self.cv:
      print("Starting Cross-validation for %d folds.." % (self.folds), file=sys.stderr)
      y = [l[0] for l in Y]
      scores = cross_validation.cross_val_score(classifier, X, y, cv=self.folds, scoring='f1_weighted')
      print("Scores:", scores, file=sys.stderr)
      print("Average: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2), file=sys.stderr)

    print("Starting training..", file=sys.stderr)
    classifier.fit(X, Y.ravel())  # flatten the (n, 1) label column so the classifier sees a 1-d multiclass target
    pickle.dump(classifier, open(self.trained_model_name, "wb"))
    pickle.dump(self.fp.feat_index, open(self.feat_index_name, "wb"))
    pickle.dump(tagset, open(self.stored_tagset, "wb"))

    print("Done", file=sys.stderr)
Example #15
def trainAndPredictLR(trainX, trainY, testX):
    """
    Logistic regression is used for predicting the target labels of the test data
    The probability of belonging to each of the labels is predicted for every test
    data and the labels with the top 10 probability values are extracted
    
    Input:
        1. trainX: ntrainingSamples * 2000 numpy matrix representing training data features
        2. trainY: ntrainingSamples * 185 numpy matrix representing the training data labels
        3. testX: ntestSamples * 2000 numpy matrix representing test data features
    
    Output:
        testY: ntestSamples * 10 numpy matrix representing the top-10 predicted labels for the test data
    
    """
    clf = OneVsRestClassifier(LogisticRegression(C=1.0))
    clf.fit(trainX, trainY)
    actY = clf.predict_proba(testX)
    testY = []
    # fetch the labels with max probability
    for prob in actY:
        y = []
        for i in range(10):
            index = np.argmax(prob, axis=0)
            classVal = classOrder[index]
            y.append(classVal)
            prob[index] = -1
        testY.append(y)
    return np.array(testY)
Example #16
def benchmark(clf_current):
    print('_' * 80)
    print("Test performance for: ")
    clf_descr = str(clf_current).split('(')[0]
    print(clf_descr)
    t0 = time()
    classif = OneVsRestClassifier(clf_current)
    classif.fit(X_train, Y_train.toarray())
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    t0 = time()
    if hasattr(clf_current, "decision_function"):
        dfmatrix = classif.decision_function(X_test)
        score = metrics.f1_score(Y_test.toarray(), df_to_preds(dfmatrix, k=5))
    else:
        probsmatrix = classif.predict_proba(X_test)
        score = metrics.f1_score(Y_test.toarray(), probs_to_preds(probsmatrix, k=5))

    test_time = time() - t0

    
    print("f1-score:   %0.7f" % score)
    print("test time:  %0.3fs" % test_time)

    print('_' * 80)
    return clf_descr, score, train_time, test_time
Example #17
def run_classifier(sentences, labels, test_doc_list, output_file_path_list):
	import numpy as np

	train_matrix, tfidf = tf_idf_fit_transform(sentences)

	from sklearn.preprocessing import MultiLabelBinarizer
	mlb = MultiLabelBinarizer()
	label_matrix = mlb.fit_transform(labels)

	from sklearn.multiclass import OneVsRestClassifier
	from sklearn.svm import LinearSVC
	estimator = LinearSVC()
	classifier = OneVsRestClassifier(estimator, n_jobs=-1)
	classifier.fit(train_matrix, label_matrix)

	for test_doc, output_file_path in zip(test_doc_list, output_file_path_list):
		test_sentences = doc2sentences([test_doc])
		sentence_matrix = tfidf.transform(test_sentences)
		print("Shape of sentence matrix : ", sentence_matrix.shape)
		predictions = classifier.predict(sentence_matrix)

		from lxml import etree
		document = etree.Element('doc')
		doc_tree = etree.ElementTree(document)
		for i in range(len(test_sentences)):
			curr_pred = [mlb.classes_[x] for x in range(predictions.shape[1]) if predictions[i][x]==1]
			etree.SubElement(document, "Sent", classes=", ".join(curr_pred)).text = test_sentences[i]
		doc_tree.write(output_file_path)
Example #18
class ClassDistanceMapper(TransformerMixin):
    """ Fit a OneVsRestClassifier for each sentiment class (against all others
        combined) and return the distances from the decision boundary for each
        class. Hence, this transformation can be seen as a dimensionality
        reduction from #words to #sentiment_classes (=5).

    """

    def __init__(self):
        """ Initialize a one-vs-rest multiclass classifier with a
            LogisticRegression base estimator. The choice of base classifier
            here is arbitrary; any estimator exposing decision_function
            would work as well.

        """
        self.clf = OneVsRestClassifier(LogisticRegression())

    def fit(self, X, y):
        """ Fit the multiclass classifier. """
        self.clf.fit(X, y)
        return self

    def transform(self, X):
        """ Return the distance of each sample from the decision boundary for
            each class.

        """
        return self.clf.decision_function(X)
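
A sketch of how this transformer might sit in a pipeline, reducing each sample to one decision-boundary distance per class before a downstream classifier; digits is a stand-in for the sentiment data, and the scikit-learn imports used by the class are assumed to be in scope:

from sklearn.datasets import load_digits
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

X, y = load_digits(return_X_y=True)
pipe = Pipeline([
    ('class_dist', ClassDistanceMapper()),  # 64 pixel features -> 10 class distances
    ('svm', SVC()),
])
pipe.fit(X[:1000], y[:1000])
print(pipe.score(X[1000:], y[1000:]))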
Example #19
    def fit(self, df_X, df_y):
        if not df_y.shape[0] == df_X.shape[0]:
            raise ValueError("number of regions is not equal")
        if df_y.shape[1] != 1:
            raise ValueError("y needs to have 1 label column")

        le = LabelEncoder()
        y = le.fit_transform(df_y.iloc[:,0].values)

        clf = RandomForestClassifier(n_estimators=100)
        
        # Multiclass
        if len(le.classes_) > 2:
            orc = OneVsRestClassifier(clf)
            orc.fit(df_X.values, y)

            importances = np.array([c.feature_importances_ for c in orc.estimators_]).T
        else: # Only two classes
            clf.fit(df_X.values, y)
            importances = np.array([
                clf.feature_importances_,
                clf.feature_importances_
                ]).T
        
        for i, c in enumerate(le.classes_):
            # y holds encoded labels, so compare against the encoded value i,
            # not the original class label c
            diff = df_X.loc[y == i].quantile(q=0.75) - df_X.loc[y != i].quantile(q=0.75)
            sign = (diff >= 0) * 2 - 1
            importances[:, i] *= sign

        # create output DataFrame
        self.act_ = pd.DataFrame(importances,
                columns=le.inverse_transform(range(len(le.classes_))),
                index=df_X.columns)
Example #20
    def setUp(self):
        import sklearn.svm as svm
        import sklearn.preprocessing as pp
        from sklearn.multiclass import OneVsRestClassifier

        # 2 class
        iris = datasets.load_iris()
        self.data = iris.data
        self.target = pp.LabelBinarizer().fit_transform(iris.target)
        self.df = pdml.ModelFrame(self.data, target=self.target)
        self.assertEqual(self.df.shape, (150, 7))

        svc1 = svm.SVC(probability=True, random_state=self.random_state)
        estimator1 = OneVsRestClassifier(svc1)
        self.df.fit(estimator1)
        self.df.predict(estimator1)
        self.assertTrue(isinstance(self.df.predicted, pdml.ModelFrame))

        svc2 = svm.SVC(probability=True, random_state=self.random_state)
        estimator2 = OneVsRestClassifier(svc2)
        estimator2.fit(self.data, self.target)
        self.pred = estimator2.predict(self.data)
        self.proba = estimator2.predict_proba(self.data)
        self.decision = estimator2.decision_function(self.data)

        # argument for classification reports
        self.labels = np.array([2, 1, 0])
Example #21
def one_vs_all(X, y, test_size=0.2, run_num = 100, svm_type='linear'):
    """Trains 15 1 vs all SVM classifiers of specified type"""
    # Python has a wonderful wrapper function that creates 1 vs all classifiers!
    if type == 'linear':
        estimator = LinearSVC()
    else:
        # This will automatically use RBF functions
        estimator = SVC()

    ovr = OneVsRestClassifier(estimator=estimator)

    acc_tr = []
    acc_tst = []

    for i in range(run_num):
        [X_train, X_test, y_train, y_test] = train_test_split(X, y,
                                                              test_size=test_size)
        # Train the classifier
        ovr.fit(X_train, y_train.ravel())

        # Work out the score on the training data. However there is nothing
        # to optimise for - we are just getting an idea of the accuracy for
        # training vs test data. box plot opportunity!
        tr_acc = ovr.score(X_train, y_train.ravel())
        tst_acc = ovr.score(X_test, y_test.ravel())

        acc_tr.append(tr_acc)
        acc_tst.append(tst_acc)

        # Not all of the data is used in each run, since using it all tends to overfit the classifier.

    return ovr, acc_tr, acc_tst
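
A usage sketch on iris, assuming the snippet's imports (LinearSVC, SVC, OneVsRestClassifier, train_test_split) are in scope; run_num is kept small here:

import numpy as np
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
ovr, acc_tr, acc_tst = one_vs_all(X, y, test_size=0.2, run_num=10)
print('mean train acc: %.3f  mean test acc: %.3f' % (np.mean(acc_tr), np.mean(acc_tst)))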
Example #22
def train_linear(X, Y, splits, model_config, results_dir, best_k=10, validation_score='f1',
                threshold_score='f1', threshold_criterion='zack', fn_prefix='', label_idx=None):
    label_idx = np.arange(Y.shape[1]) if label_idx is None else label_idx
    best_perf = None
    best_C = None
    best_model = None
    for C in np.logspace(-3,3, num=20):
        sys.stdout.write('Training Logistic Regression with C={0}...'.format(C))
        sys.stdout.flush()
        model = OneVsRestClassifier(LogisticRegression(C=C))
        try:
            model.fit(X[splits[0]], Y[splits[0]])
        except KeyboardInterrupt:
            sys.stdout.write('training interrupted...')
            break
        except:
            raise

        Yp = model.predict_proba(X[splits[1]])
        perf = compute_micro_evaluations(Y[splits[1]][:,label_idx], Yp[:,label_idx], k=best_k,
                                        threshold_score=threshold_score, criterion=threshold_criterion)
        sys.stdout.write(' {0}={1:.4f}'.format(validation_score, perf[validation_score]))
        sys.stdout.flush()
        if best_perf is None or perf[validation_score] > best_perf[validation_score]:
            best_perf = perf
            best_model = model
            best_C = C
            sys.stdout.write(' *BEST')
        sys.stdout.write('\n')

    model_config['C'] = best_C
    cPickle.dump(best_model, open(os.path.join(results_dir, fn_prefix + '-model.pkl'), 'wb'))

    return best_model, model_config
Example #23
def make_classifier():
    test_size = 0  # note: with test_size=0 the evaluation loop below has no samples to score
    X, y = make_X_Y()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    X_train = X_train.astype(int)
    X_test = X_test.astype(int)
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)
    clf = OneVsRestClassifier(SVC(kernel='linear', class_weight='balanced', probability=True))  # 'auto' was renamed to 'balanced'
    clf.fit(X_train, y_train)
    try:
        y_suggest = clf.predict_proba(X_test)
        nn = 0
        n = 0
        for y_s, y_t in zip(y_suggest, y_test):
            s1 = chords_Y[np.argmax(y_s)]
            y_s[np.argmax(y_s)]=0
            s2 = chords_Y[np.argmax(y_s)]
            t = chords_Y[np.argmax(y_t)]
            print('Suggest: ' + s1 + ' or ' + s2 + '  Real: ' + t)
            n = n + 1
            if s1 == t:
                nn = nn + 1
        if n > 0:
            print('Accuracy is ' + str(float(nn)/n))
    except ValueError:
        pass
    #print classification_report(clf.predict(X_test), y_test)
    pickle.dump(clf, open("classifier.bin", "wb"))   
Example #24
def main():
    word_vec_dict = readGloveData('../glove.twitter.27B/glove.twitter.27B.25d.txt')
    tweets = readTweets('../dataset_raw/semeval2016-task6-trainingdata.txt')

    tweetVectors = getTweetVectors(tweets[0:len(tweets) - 1], word_vec_dict)
    print(tweets[0])
    print(getSumVectors(tweets[0], word_vec_dict))
    tweetClasses = set(tweets[-1])

    mapping = {'favor': 1, 'none': 0, 'against': 1}

    tweetClasses = np.asarray([mapping[x] for x in tweets[-1]])
    tweetData = np.asarray(tweetVectors)
    print(tweetClasses.shape)
    print(tweetData.shape)
    X = tweetData
    Y = tweetClasses
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    # X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, test_size=0.3, random_state=0)
    X_train = X[0:int(0.7 * len(X))]
    y_train = Y[0:int(0.7 * len(Y))]
    X_test = X[int(0.7 * len(X)) : len(X)]
    y_test = Y[int(0.7 * len(Y)) : len(Y)]
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
Example #25
def train_data_SVC(X, y):
    """
    Create and train the Support Vector Machine.
    """
    classif = OneVsRestClassifier(LinearSVC())
    classif.fit(X,y)
    return classif
Example #26
def prepare_multiclass_clf(X, y):
    clf = GridSearchCV(LogisticRegression(penalty='l1', solver='liblinear'),  # l1 needs a compatible solver in newer scikit-learn
                       {'C': np.logspace(-4, 2, 10)},
                       scoring='accuracy', cv=5)
    multi_clf = OneVsRestClassifier(clf)
    multi_clf.fit(X, y)
    return multi_clf
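
Because GridSearchCV is wrapped inside OneVsRestClassifier, every binary one-vs-rest subproblem runs its own search over C. A quick sketch on digits (the dataset is just for illustration):

from sklearn.datasets import load_digits

X, y = load_digits(return_X_y=True)
clf = prepare_multiclass_clf(X[:1200], y[:1200])  # the helper defined above
print(clf.score(X[1200:], y[1200:]))
# each fitted estimator is a GridSearchCV carrying its own best C
print([est.best_params_['C'] for est in clf.estimators_])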
Example #27
def ml_train(datasetFilePath, falsePredictionsFilePath, unknownPredictionsFilePath, confusionMatricesDir, classifierFilePath):
    logger.info("start of training and testing phase")

    classifier = OneVsRestClassifier(SVC(kernel='linear', probability=True), n_jobs=NUMBER_OF_CPUS_TO_USE)

    logger.info("loading data set")
    dataset, features_names = load_dataset(datasetFilePath)

    #limited_dataset = limit_dataset(dataset)
    limited_dataset = dataset
    
    ml_dataset = split_dataset(limited_dataset, len(features_names))

    logger.info("fitting training set X_train - %s, y_train - %s" % (ml_dataset.X_train.shape, ml_dataset.y_train.shape))
    classifier.fit(ml_dataset.X_train, ml_dataset.y_train)

    logger.info("predicting test set X_test - %s, y_test - %s" % (ml_dataset.X_test.shape, ml_dataset.y_test.shape))
    y_pred = classifier.predict(ml_dataset.X_test)

    y_pred_probabilities = classifier.predict_proba(ml_dataset.X_test)

    y_pred_with_unknown_cls, y_pred_fictive, max_y_pred_probs = process_prediction_vector(ml_dataset.y_test, y_pred, y_pred_probabilities)

    validation(ml_dataset.y_test, y_pred, y_pred_with_unknown_cls, y_pred_fictive, list(classifier.classes_) + ["unknown"])
    plot_confusion_matrices(ml_dataset.y_test, y_pred, list(classifier.classes_) + ["unknown"], confusionMatricesDir, "1")
    plot_confusion_matrices(ml_dataset.y_test, y_pred_with_unknown_cls, list(classifier.classes_) + ["unknown"], confusionMatricesDir, "2")
    plot_confusion_matrices(ml_dataset.y_test, y_pred_fictive, list(classifier.classes_) + ["unknown"], confusionMatricesDir, "3")

    produce_output(ml_dataset.y_test, y_pred, max_y_pred_probs, ml_dataset.test_terms_name, falsePredictionsFilePath, unknownPredictionsFilePath)

    logger.info("exporting classifier model")
    joblib.dump(classifier, classifierFilePath)

    logger.info("end of training and testing phase")
Example #28
    def run_naive_bayes(cls, train, test, binarizer, labels, alpha):
        # logging
        logging = configure_log(__file__)

        logging.info("alpha = %s" % (str(alpha)))
        logging.info("Fitting Naive Bayes...")
        train_data, train_labels = train
        test_data, test_labels = test

        classifier = OneVsRestClassifier(MultinomialNB(alpha=alpha, fit_prior=True, class_prior=None))
        with warnings.catch_warnings():  # FIXME: split the data set in a way that the train set has every label
            warnings.simplefilter("ignore")
            classifier.fit(train_data, train_labels)

        possible_labels = set()
        for row in (label.nonzero()[0] for label in labels):
            possible_labels.update(row)  # Python 3: map() is lazy, so the old list-comprehension idiom was a no-op

        logging.info("Predicting test set...")
        test_predictions = cls.predict(
            classifier=classifier,
            data=test_data,
            labels=test_labels,
            possible_labels=possible_labels,
            binarizer=binarizer,
        )

        # logging.info('Predicting train set...')
        # train_predictions = cls.predict(classifier=classifier, data=train_data, labels=train_labels,
        #                                  possible_labels=possible_labels, binarizer=binarizer)

        test_precision = precision_score(y_true=test_labels, y_pred=test_predictions, average="samples")
        # train_precision = precision_score(y_true=train_labels, y_pred=train_predictions, average='samples')

        # return train_precision, test_precision
        return test_precision
Example #29
def run_classifier(sentences, labels, test_docs):
	import numpy as np

	train_matrix, tfidf = tf_idf_fit_transform(sentences)

	test_sentences = doc2sentences(test_docs)
	sentence_matrix = tfidf.transform(test_sentences)
	print("Shape of sentence matrix : ", sentence_matrix.shape)

	from sklearn.preprocessing import MultiLabelBinarizer
	mlb = MultiLabelBinarizer()
	label_matrix = mlb.fit_transform(labels)

	from sklearn.multiclass import OneVsRestClassifier
	from sklearn.svm import LinearSVC  # the class is LinearSVC, not linearSVC
	# estimator = SVC(kernel='linear')
	estimator = LinearSVC()
	classifier = OneVsRestClassifier(estimator, n_jobs=-1)
	classifier.fit(train_matrix, label_matrix)
	predictions = classifier.predict(sentence_matrix)

	import csv
	with open("classified.csv", "w") as fl:
		writer = csv.writer(fl)
		for i in range(len(test_sentences)):
			curr_pred = [mlb.classes_[x] for x in range(predictions.shape[1]) if predictions[i][x]==1]
			writer.writerow((test_sentences[i], curr_pred))
Example #30
def svm_fixed(train_X, train_Y):
    C = 1.
    kernel = 'linear'
    gamma = 0.01  # note: gamma has no effect with a linear kernel
    estimator = SVC(C=C, kernel=kernel, gamma=gamma)
    classifier = OneVsRestClassifier(estimator)
    classifier.fit(train_X, train_Y)
    return classifier
Example #31
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer releases
from sklearn.svm import SVC


x = [[1,2,3],[3,3,2],[8,8,7],[3,7,1],[4,5,6]]
y = [['bar','foo'],['bar'],['foo'],['foo','jump'],['bar','fox','jump']]

mlb = MultiLabelBinarizer()
y_enc = mlb.fit_transform(y)

train_x, test_x, train_y, test_y = train_test_split(x, y_enc, test_size=0.33)

clf = OneVsRestClassifier(SVC(probability=True))
clf.fit(train_x, train_y)
predictions = clf.predict(test_x)

my_metrics = metrics.classification_report( test_y, predictions)
print(my_metrics)
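
To turn the 0/1 indicator rows above back into readable tag sets, MultiLabelBinarizer.inverse_transform can be appended (a continuation sketch using the same variables):

for row, tags in zip(test_x, mlb.inverse_transform(predictions)):
    print(row, '->', tags)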
Example #32
def main():
    # Read JSON files into Pandas DataFrames
    print('Reading data into DataFrames...')
    omdb_filename = "./movies/data/omdb-data.json.gz"
    rotten_filename = "./movies/data/rotten-tomatoes.json.gz"
    wikidata_filename = "./movies/data/wikidata-movies.json.gz"
    genres_filename = "./movies/data/genres.json.gz"
    omdb = pd.read_json(omdb_filename, lines=True)
    rotten = pd.read_json(rotten_filename, lines=True)
    wikidata = pd.read_json(wikidata_filename, lines=True)
    genres = pd.read_json(genres_filename, lines=True)

    # Convert genres DataFrame to a dictionary of genre_code:genre_label pairs.
    genre_map = pd.Series(genres.genre_label.values,
                          index=genres.wikidata_id).to_dict()

    # Create DataFrame of plot summaries with corresponding imdb id
    plot_summaries = omdb[['imdb_id', 'omdb_plot']]
    plot_summaries = plot_summaries.sort_values(by=['imdb_id'])
    plot_summaries = plot_summaries.set_index('imdb_id')

    wikidata = wikidata.sort_values(by=['imdb_id'])
    wikidata = wikidata.set_index('imdb_id')
    wikidata = wikidata[[
        # 'publication_date',
        # 'wikidata_id',
        'genre'
    ]]

    # Clean data.
    print('Cleaning data...')
    movies_data = pd.merge(wikidata, plot_summaries, on='imdb_id')
    # Remove movies with no plot summary.
    movies_data = movies_data[movies_data['omdb_plot'] != 'N/A']
    # Convert plot summaries to lowercase.
    movies_data['omdb_plot'] = movies_data['omdb_plot'].str.lower()
    # Remove all punctuations in plot summaries.
    movies_data['omdb_plot'] = movies_data['omdb_plot'].apply(
        remove_punctuations)
    # Tokenize strings
    movies_data['omdb_plot'] = movies_data['omdb_plot'].apply(tokenize)
    # Remove stop words.
    stop_words = stopwords.words('english')
    stop_words.append('platform')
    stop_words.append('film')
    movies_data['omdb_plot'] = movies_data['omdb_plot'].apply(
        lambda x: [word for word in x if word not in stop_words])
    movies_data['clean_summary'] = movies_data['omdb_plot'].apply(
        lambda x: ' '.join(x))
    plot_summaries_words = movies_data['omdb_plot'].apply(
        lambda x: ' '.join(x))

    # Create and generate word cloud image.
    # generate_word_cloud(plot_summaries_words)

    # Get all the genres from the movies.
    genres_all = []
    for index, row in movies_data.iterrows():
        genres_all.append(row['genre'])

    # Flatten the genres list.
    genres = []
    for sublist in genres_all:
        for item in sublist:
            genres.append(item)

    # Get the distinct genres.
    genres = list(set(genres))

    # This DataFrame will separate all the distinct genres into separate
    # columns, and show which movie is associated with which individual genre.
    movies_data2 = movies_data.copy()  # copy so the original DataFrame is not mutated
    for genre_code in genres:
        movies_data2[genre_code] = 0

    for index, row in movies_data2.iterrows():
        for genre_code in row['genre']:
            movies_data2.loc[index,
                             genre_code] = movies_data2.loc[index,
                                                            genre_code] + 1

    # Get the number of counts for each genre.
    # genres_count = []
    # for col in movies_data2.columns[2:]:
    #     genres_count.append(movies_data2[col].sum())

    # Get the English label names for each genre.
    # genre_labels = []
    # for label in genres:
    #     genre_labels.append(genre_map.get(label))
    # print(genre_labels)

    multilabel_binarizer = MultiLabelBinarizer()
    multilabel_binarizer.fit(movies_data2['genre'])

    # Extract features from cleaned plot summaries by using tf-idf.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=500)

    # Split data into train and validation data sets.
    X = movies_data2['clean_summary']
    y = multilabel_binarizer.transform(movies_data2['genre'])

    X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=0.2,
                                                          random_state=9)
    # X_train, X_valid, y_train, y_valid = train_test_split(X, y)

    # Create tf-idf features.
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

    # Build the genre prediction model.
    print('Building prediction model...')
    lr = LogisticRegression()
    model = OneVsRestClassifier(lr)

    # Train the model.
    print('Training model...')
    model.fit(X_train_tfidf, y_train)

    # Predict on validation data set.
    print('Making predictions...')
    y_prediction = model.predict_proba(X_valid_tfidf)
    y_prediction = (y_prediction >= 0.25).astype(int)
    predictions = multilabel_binarizer.inverse_transform(y_prediction)

    print('\nPredicted genre codes: ')
    res = pd.Series(predictions)
    print(res)
    print('\nf1 score: {}\n'.format(
        f1_score(y_valid, y_prediction, average="micro")))

    # Show 10 genre predictions, and compare it with the actual genres.
    def make_predictions(data):
        data_tfidf = tfidf_vectorizer.transform([data])
        data_prediction = model.predict_proba(data_tfidf)
        data_prediction = (data_prediction >= 0.25).astype(int)
        return multilabel_binarizer.inverse_transform(data_prediction)

    for i in range(10):
        data = X_valid.sample(1).index[0]

        predicted_genre = make_predictions(X_valid[data])

        actual_genre = movies_data2['genre'][data]
        summary = movies_data2['clean_summary'][data]

        predicted_genre_labels = []
        for code_set in predicted_genre:
            for code in code_set:
                predicted_genre_labels.append(genre_map.get(code))

        actual_genre_labels = []
        for code in actual_genre:
            actual_genre_labels.append(genre_map.get(code))

        print('IMDB ID: {}'.format(data))
        # print('Plot summary: {}'.format(summary))
        print('\tReal genre(s): {}'.format(actual_genre_labels))
        print('\tPredicted genre(s): {}\n'.format(predicted_genre_labels))
Example #33
y = data[["Encodings"]]
X = data.drop(["Encodings", "Pictures"], axis=1)

clf = OneVsRestClassifier(SVC(gamma="auto", probability=False, C=400))
scaling = MinMaxScaler(feature_range=(-1, 1)).fit(X)
X_scaled = scaling.transform(X)
print("CV =", np.mean(cross_val_score(clf, X_scaled, y.values.ravel(), cv=5)))


def accuracy(pred, actual):
    # Calculate the accuracy percentage of the predicted values
    return sum(pred == actual) / len(actual)


clf = OneVsRestClassifier(SVC(gamma="auto", probability=False, C=400))
clf.fit(X_scaled, y.values.ravel())
X_test = pd.read_pickle("test_with_feature.pkl").drop(["Pictures"], axis=1)
X_test_scaled = scaling.transform(X_test)
pred = clf.predict(X_test_scaled)
test_acc = accuracy(pred, ground_truth)
print(test_acc)

# Random Forest
clf = RandomForestClassifier(n_estimators=1000, max_depth=18, random_state=42)
# print("CV =", np.mean(cross_val_score(clf, X, y.values.ravel(), cv=5)))
clf.fit(X, y.values.ravel())
pred = clf.predict(X_test)
test_acc = accuracy(pred, ground_truth)
print(test_acc)
Example #34
    def model1(self):
        """SVM model."""
        X, y = self._split_data()
        # Binarize the output
        y = label_binarize(y, classes=[0, 1, 2, 3])
        n_classes = y.shape[1]
        # shuffle and split training and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_seed)
        # Learn to predict each class against the other
        classifier = OneVsRestClassifier(
            svm.SVC(kernel='poly',
                    degree=2,
                    probability=True,
                    tol=1e-6,
                    random_state=self.random_seed))  # , gamma= 0.1))
        y_score = classifier.fit(X_train, y_train).decision_function(X_test)
        # Binarize the output and split again; with the same random_state this
        # reproduces the split above, so the block is redundant but harmless.
        y = label_binarize(y, classes=[0, 1, 2, 3])
        n_classes = y.shape[1]
        # shuffle and split training and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_seed)
        # Compute ROC curve and ROC area for each class
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])
        # Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(),
                                                  y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

        # First aggregate all false positive rates
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
        # Then interpolate all ROC curves at this points
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += interp(all_fpr, fpr[i], tpr[i])
        # Finally average it and compute AUC
        mean_tpr /= n_classes

        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

        # Plot all ROC curves
        plt.figure()
        plt.plot(fpr["micro"],
                 tpr["micro"],
                 label='micro-average ROC curve (area = {0:0.2f})'
                 ''.format(roc_auc["micro"]),
                 color='deeppink',
                 linestyle=':',
                 linewidth=4)

        plt.plot(fpr["macro"],
                 tpr["macro"],
                 label='macro-average ROC curve (area = {0:0.2f})'
                 ''.format(roc_auc["macro"]),
                 color='navy',
                 linestyle=':',
                 linewidth=4)

        colors = cycle(['cyan', 'magenta', 'yellow', 'cornflowerblue'])
        for i, color in zip(range(n_classes), colors):
            plt.plot(fpr[i],
                     tpr[i],
                     color=color,
                     label='ROC curve of class {0} (area = {1:0.2f})'
                     ''.format(i, roc_auc[i]))

        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc="lower right")
        plt.savefig(self.path_to_file +
                    f'Figures/Funk-OneVsRestClassifier-SVM-poly-ROC_.png')
        plt.show()

        y_prob = classifier.predict_proba(X_test)

        macro_roc_auc_ovo = roc_auc_score(y_test,
                                          y_prob,
                                          multi_class="ovo",
                                          average="macro")
        weighted_roc_auc_ovo = roc_auc_score(y_test,
                                             y_prob,
                                             multi_class="ovo",
                                             average="weighted")
        macro_roc_auc_ovr = roc_auc_score(y_test,
                                          y_prob,
                                          multi_class="ovr",
                                          average="macro")
        weighted_roc_auc_ovr = roc_auc_score(y_test,
                                             y_prob,
                                             multi_class="ovr",
                                             average="weighted")
        print("One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
              "(weighted by prevalence)".format(macro_roc_auc_ovo,
                                                weighted_roc_auc_ovo))
        print("One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
              "(weighted by prevalence)".format(macro_roc_auc_ovr,
                                                weighted_roc_auc_ovr))
Example #35
    print(titanic_df.head())

    if (cat == 1):
        X = titanic_df.drop("Survived", axis=1)
        Y = titanic_df["Survived"]

        X = X.to_numpy().astype(int)  # as_matrix() was removed from pandas; np.int from NumPy
        Y = Y.to_numpy().astype(int)

        return X, Y
    else:
        X = titanic_df

        X = X.to_numpy().astype(int)

        return X


train_location = "C:/Users/Oliver Crosbie Higgs/Documents/personal projects/train.csv"
test_location = "C:/Users/Oliver Crosbie Higgs/Documents/personal projects/test.csv"

X_train, Y_train = transform_data(train_location, 1)
X_test = transform_data(test_location, 0)

forest = RandomForestClassifier(n_estimators=250, random_state=0)
model1a = DTC(max_depth=10)

classifier = OneVsRestClassifier(model1a)
y_score = classifier.fit(X_train, Y_train).predict(X_test.astype(int))

np.savetxt("y_score.csv", y_score, delimiter=",")
Example #36
np.random.set_state(state)
np.random.shuffle(label)

train_num = 1500
test_num = 1500
data_train = data[0:train_num, ]
label_train = label[0:train_num, ]

data_test = data[train_num:train_num + test_num, ]
label_test = label[train_num:train_num + test_num, ]

## multi classification
model_0 =OneVsRestClassifier(SVC(kernel='linear', probability=True,gamma='scale'))

model_0.fit(data_train, label_train)
pre_0 = model_0.predict_proba(data_test)

max_ind=np.argmax(pre_0,axis=1)
# print(max_ind)
pre=np.zeros_like(pre_0)
for i in range(pre.shape[0]):
    pre[i,max_ind[i]]=1
# print(pre)
pre_train0=model_0.predict_proba(data_train)
max_ind_train=np.argmax(pre_train0,axis=1)
# print(max_ind)
pre_train=np.zeros_like(pre_0)
for i in range(max_ind_train.shape[0]):
    pre_train[i,max_ind_train[i]]=1
Example #37
# Use label_binarize to create multilabel-like settings
Y = label_binarize(y, classes=[0, 1, 2])
n_classes = Y.shape[1]

# Split into training and test
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=.5,
                                                    random_state=random_state)

# We use OneVsRestClassifier for multi-label prediction
from sklearn.multiclass import OneVsRestClassifier

# Run classifier
classifier = OneVsRestClassifier(svm.LinearSVC(random_state=random_state))
classifier.fit(X_train, Y_train)
y_score = classifier.decision_function(X_test)

###############################################################################
# The average precision score in multi-label settings
# ....................................................
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# For each class
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(
        Y_test[:, i], y_score[:, i])
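
The usual next step in this example is the micro-averaged curve over all classes; a continuation sketch using the same variables:

# A "micro-average": quantifying score on all classes jointly
precision["micro"], recall["micro"], _ = precision_recall_curve(
    Y_test.ravel(), y_score.ravel())
average_precision["micro"] = average_precision_score(Y_test, y_score,
                                                     average="micro")
print('Average precision score, micro-averaged over all classes: {0:0.2f}'
      .format(average_precision["micro"]))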
Example #38
x_all = np.hstack(( x_num_all, fac_x_cat_all ))

#train-test split
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=.75,random_state=24)

#NOTE: change classifier here
clf1 = OneVsRestClassifier(RandomForestClassifier(n_estimators=250, max_features='auto', n_jobs=4, max_depth=5))
clf2 = OneVsRestClassifier(AdaBoostClassifier(n_estimators=250, algorithm='SAMME'))
clf3 = OneVsRestClassifier(GaussianNB())
clf4 = OneVsRestClassifier(DecisionTreeClassifier())
#clf5 = OneVsRestClassifier(svm.SVC(gamma=2))

#training
st = time.time()
print "training started"
clf1.fit( x_train, y_train )
clf2.fit( x_train, y_train )
clf3.fit( x_train, y_train )
clf4.fit( x_train, y_train )
print "training ended"
et = time.time()
tt = et - st
print "Training Time = " + str(tt) + "\n"

#predictions
pred1 = clf1.predict(x_test)
pred2 = clf2.predict(x_test)
pred3 = clf3.predict(x_test)
pred4 = clf4.predict(x_test)
pred = pred2
#NOTE: change to decision_function or predict_proba depending on the classifier
Example #39
class DialectIdentifier(object):
    """A class for training, evaluating and running the dialect identification
    model described by Salameh et al. After initializing an instance, you must
    run the train method once before using it.

    Args:
        labels (:obj:`set` of :obj:`str`, optional): The set of dialect labels
            used in the training data in the main model.
            If None, the default labels are used.
            Defaults to None.
        labels_extra (:obj:`set` of :obj:`str`, optional): The set of dialect
            labels used in the training data in the extra features model.
            If None, the default labels are used.
            Defaults to None.
        char_lm_dir (:obj:`str`, optional): Path to the directory containing
            the character-based language models. If None, use the language
            models that come with this package. Defaults to None.
        word_lm_dir (:obj:`str`, optional): Path to the directory containing
            the word-based language models. If None, use the language models
            that come with this package. Defaults to None.
    """
    def __init__(self,
                 labels=None,
                 labels_extra=None,
                 char_lm_dir=None,
                 word_lm_dir=None):
        if labels is None:
            labels = _DEFAULT_LABELS
        if labels_extra is None:
            labels_extra = _DEFAULT_LABELS_EXTRA
        if char_lm_dir is None:
            char_lm_dir = _CHAR_LM_DIR
        if word_lm_dir is None:
            word_lm_dir = _WORD_LM_DIR

        self._labels = labels
        self._labels_extra = labels_extra
        self._labels_sorted = sorted(labels)
        self._labels_extra_sorted = sorted(labels_extra)

        self._char_lms = collections.defaultdict(kenlm.Model)
        self._word_lms = collections.defaultdict(kenlm.Model)
        self._load_lms(char_lm_dir, word_lm_dir)

        self._is_trained = False

    def _load_lms(self, char_lm_dir, word_lm_dir):
        config = kenlm.Config()
        config.show_progress = False
        config.arpa_complain = kenlm.ARPALoadComplain.NONE

        for label in self._labels:
            char_lm_path = Path(char_lm_dir, '{}.arpa'.format(label))
            word_lm_path = Path(word_lm_dir, '{}.arpa'.format(label))
            self._char_lms[label] = kenlm.Model(str(char_lm_path), config)
            self._word_lms[label] = kenlm.Model(str(word_lm_path), config)

    def _get_char_lm_scores(self, txt):
        chars = _word_to_char(txt)
        return np.array([
            self._char_lms[label].score(chars, bos=True, eos=True)
            for label in self._labels_sorted
        ])

    def _get_word_lm_scores(self, txt):
        return np.array([
            self._word_lms[label].score(txt, bos=True, eos=True)
            for label in self._labels_sorted
        ])

    def _get_lm_feats(self, txt):
        word_lm_scores = self._get_word_lm_scores(txt).reshape(1, -1)
        word_lm_scores = _normalize_lm_scores(word_lm_scores)
        char_lm_scores = self._get_char_lm_scores(txt).reshape(1, -1)
        char_lm_scores = _normalize_lm_scores(char_lm_scores)
        feats = np.concatenate((word_lm_scores, char_lm_scores), axis=1)
        return feats

    def _get_lm_feats_multi(self, sentences):
        feats_list = collections.deque()
        for sentence in sentences:
            feats_list.append(self._get_lm_feats(sentence))
        feats_matrix = np.array(feats_list)
        feats_matrix = feats_matrix.reshape((-1, 52))
        return feats_matrix

    def _prepare_sentences(self, sentences):
        tokenized = [
            ' '.join(simple_word_tokenize(dediac_ar(s))) for s in sentences
        ]
        sent_array = np.array(tokenized)
        x_trans = self._feat_union.transform(sent_array)
        x_trans_extra = self._feat_union_extra.transform(sent_array)
        x_predict_extra = self._classifier_extra.predict_proba(x_trans_extra)
        x_lm_feats = self._get_lm_feats_multi(sentences)
        x_final = sp.sparse.hstack((x_trans, x_lm_feats, x_predict_extra))
        return x_final

    def train(self,
              data_path=None,
              data_extra_path=None,
              char_ngram_range=(1, 3),
              word_ngram_range=(1, 1),
              n_jobs=None):
        """Trains the model on a given data set.

        Args:
            data_path (:obj:`str`, optional): Path to main training data.
                If None, use the provided training data.
                Defaults to None.
            data_extra_path (:obj:`str`, optional): Path to extra features
                training data. If None, use the provided training data.
                Defaults to None.
            char_ngram_range (:obj:`tuple`, optional): The n-gram ranges to
                consider in the character-based language models.
                Defaults to (1, 3).
            word_ngram_range (:obj:`tuple`, optional): The n-gram ranges to
                consider in the word-based language models.
                Defaults to (1, 1).
            n_jobs (:obj:`int`, optional): The number of parallel jobs to use
                for computation. If None, then only 1 job is used.
                If -1 then all processors are used. Defaults to None.
        """

        if data_path is None:
            data_path = _TRAIN_DATA_PATH
        if data_extra_path is None:
            data_extra_path = _TRAIN_DATA_EXTRA_PATH

        # Load training data and extract
        train_data = pd.read_csv(data_path, sep='\t', index_col=0)
        train_data_extra = pd.read_csv(data_extra_path, sep='\t', index_col=0)

        x = train_data['ar'].values
        y = train_data['dialect'].values
        x_extra = train_data_extra['ar'].values
        y_extra = train_data_extra['dialect'].values

        # Build and train extra classifier
        self._label_encoder_extra = LabelEncoder()
        self._label_encoder_extra.fit(y_extra)
        y_trans = self._label_encoder_extra.transform(y_extra)

        word_vectorizer = TfidfVectorizer(lowercase=False,
                                          ngram_range=word_ngram_range,
                                          analyzer='word',
                                          tokenizer=lambda x: x.split(' '))
        char_vectorizer = TfidfVectorizer(lowercase=False,
                                          ngram_range=char_ngram_range,
                                          analyzer='char',
                                          tokenizer=lambda x: x.split(' '))
        self._feat_union_extra = FeatureUnion([('wordgrams', word_vectorizer),
                                               ('chargrams', char_vectorizer)])
        x_trans = self._feat_union_extra.fit_transform(x_extra)

        self._classifier_extra = OneVsRestClassifier(MultinomialNB(),
                                                     n_jobs=n_jobs)
        self._classifier_extra.fit(x_trans, y_trans)

        # Build and train main classifier
        self._label_encoder = LabelEncoder()
        self._label_encoder.fit(y)
        y_trans = self._label_encoder.transform(y)

        word_vectorizer = TfidfVectorizer(lowercase=False,
                                          ngram_range=word_ngram_range,
                                          analyzer='word',
                                          tokenizer=lambda x: x.split(' '))
        char_vectorizer = TfidfVectorizer(lowercase=False,
                                          ngram_range=char_ngram_range,
                                          analyzer='char',
                                          tokenizer=lambda x: x.split(' '))
        self._feat_union = FeatureUnion([('wordgrams', word_vectorizer),
                                         ('chargrams', char_vectorizer)])
        self._feat_union.fit(x)

        x_prepared = self._prepare_sentences(x)

        self._classifier = OneVsRestClassifier(MultinomialNB(), n_jobs=n_jobs)
        self._classifier.fit(x_prepared, y_trans)

        self._is_trained = True

    def eval(self, data_path=None, data_set='VALIDATION'):
        """Evaluate the trained model on a given data set.

        Args:
            data_path (:obj:`str`, optional): Path to an evaluation data set.
                If None, use one of the provided data sets instead.
                Defaults to None.
            data_set (:obj:`str`, optional): Name of the provided data set to
                use. This is ignored if data_path is not None. Can be either
                'VALIDATION' or 'TEST'. Defaults to 'VALIDATION'.

        Returns:
            :obj:`dict`: A dictionary mapping an evaluation metric to its
            computed value. The metrics used are accuracy, f1_micro, f1_macro,
            recall_micro, recall_macro, precision_micro and precision_macro.
        """

        if not self._is_trained:
            raise UntrainedModelError('Can\'t evaluate an untrained model.')

        if data_path is None:
            if data_set == 'VALIDATION':
                data_path = _VAL_DATA_PATH
            elif data_set == 'TEST':
                data_path = _TEST_DATA_PATH
            else:
                raise InvalidDataSetError(data_set)

        # Load eval data
        eval_data = pd.read_csv(data_path, sep='\t', index_col=0)
        x = eval_data['ar'].values
        y_true = eval_data['dialect'].values

        # Generate predictions
        x_prepared = self._prepare_sentences(x)
        y_pred = self._classifier.predict(x_prepared)
        y_pred = self._label_encoder.inverse_transform(y_pred)

        # Get scores
        scores = {
            'accuracy': accuracy_score(y_true, y_pred),
            'f1_micro': f1_score(y_true, y_pred, average='micro'),
            'f1_macro': f1_score(y_true, y_pred, average='macro'),
            'recall_micro': recall_score(y_true, y_pred, average='micro'),
            'recall_macro': recall_score(y_true, y_pred, average='macro'),
            'precision_micro': precision_score(y_true, y_pred,
                                               average='micro'),
            'precision_macro': precision_score(y_true, y_pred, average='macro')
        }

        return scores

    def predict(self, sentences):
        """Predict the dialect probability scores for a given list of
        sentences.

        Args:
            sentences (:obj:`list` of :obj:`str`): The list of sentences.

        Returns:
            :obj:`list` of :obj:`DIDPred`: A list of prediction results,
            each corresponding to its respective sentence.
        """

        if not self._is_trained:
            raise UntrainedModelError(
                'Can\'t predict with an untrained model.')

        x_prepared = self._prepare_sentences(sentences)
        predicted_scores = self._classifier.predict_proba(x_prepared)

        result = collections.deque()
        for sentence, scores in zip(sentences, predicted_scores):
            score_tups = list(zip(self._labels_sorted, scores))
            predicted_dialect = _max_score(score_tups)
            dialect_scores = dict(score_tups)
            result.append(DIDPred(predicted_dialect, dialect_scores))

        return list(result)

    @staticmethod
    def pretrained():
        """Load the default pre-trained model provided with camel-tools.

        Raises:
            :obj:`PretrainedModelError`: When a pre-trained model compatible
                with the current Python version isn't available.

        Returns:
            :obj:`DialectIdentifier`: The loaded model.
        """

        suffix = '{}{}'.format(sys.version_info.major, sys.version_info.minor)
        model_file_name = 'did_pretrained_{}.dill'.format(suffix)
        model_path = Path(_DATA_DIR, model_file_name)

        if not model_path.is_file():
            raise PretrainedModelError(
                'No pretrained model for current Python version found.')

        with model_path.open('rb') as model_fp:
            model = dill.load(model_fp)

            # We need to reload LMs since they were set to None when
            # serialized.
            model._char_lms = collections.defaultdict(kenlm.Model)
            model._word_lms = collections.defaultdict(kenlm.Model)
            model._load_lms(_CHAR_LM_DIR, _WORD_LM_DIR)

            return model
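A hedged usage sketch for the DialectIdentifier class above (the method names come from the listing; treating the class as importable and printing DIDPred objects directly are assumptions):

# Minimal usage sketch, assuming DialectIdentifier and its bundled data are
# available as in the listing above.
did = DialectIdentifier.pretrained()
print(did.eval(data_set='VALIDATION'))   # accuracy / F1 / recall / precision
for pred in did.predict(['sentence one', 'sentence two']):
    print(pred)   # a DIDPred(predicted dialect, per-dialect score dict)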
Ejemplo n.º 40
0
def test_ovr_fit_predict_svc():
    ovr = OneVsRestClassifier(svm.SVC())
    ovr.fit(iris.data, iris.target)
    assert_equal(len(ovr.estimators_), 3)
    assert_greater(ovr.score(iris.data, iris.target), .9)
Ejemplo n.º 41
0
def temporal_holdout(X,
                     y,
                     indx,
                     bootstrap,
                     fname,
                     goterms=None,
                     go_fname=None):
    """Perform temporal holdout validation"""

    X_train = X[indx['train'].tolist()]
    X_test = X[indx['test'].tolist()]
    X_valid = X[indx['valid'].tolist()]
    # Labels as arrays so that .shape and column indexing below work
    y_train = np.array(y['train'].tolist())
    y_test = np.array(y['test'].tolist())
    y_valid = np.array(y['valid'].tolist())
    if goterms is not None:
        goterms = goterms['terms'].tolist()

    # range of hyperparameters
    C_range = 10.**np.arange(-1, 3)
    gamma_range = 10.**np.arange(-3, 1)

    # pre-generating kernels
    print("### Pregenerating kernels...")
    K_rbf_train = {}
    K_rbf_test = {}
    K_rbf_valid = {}
    for gamma in gamma_range:
        K_rbf_train[gamma] = rbf_kernel(X_train, gamma=gamma)
        K_rbf_test[gamma] = rbf_kernel(X_test, X_train, gamma=gamma)
        K_rbf_valid[gamma] = rbf_kernel(X_valid, X_train, gamma=gamma)
    print("### Done.")
    print("Train samples=%d; #Test samples=%d" %
          (y_train.shape[0], y_test.shape[0]))

    # parameter fitting
    C_opt = None
    gamma_opt = None
    max_aupr = 0
    for C in C_range:
        for gamma in gamma_range:
            # Multi-label classification
            clf = OneVsRestClassifier(svm.SVC(C=C,
                                              kernel='precomputed',
                                              probability=False),
                                      n_jobs=-1)
            clf.fit(K_rbf_train[gamma], y_train)
            y_score_valid = clf.decision_function(K_rbf_valid[gamma])
            y_pred_valid = clf.predict(K_rbf_valid[gamma])
            perf = evaluate_performance(y_valid, y_score_valid, y_pred_valid)
            micro_aupr = perf['m-aupr']
            print("### gamma = %0.3f, C = %0.3f, AUPR = %0.3f" %
                  (gamma, C, micro_aupr))
            if micro_aupr > max_aupr:
                C_opt = C
                gamma_opt = gamma
                max_aupr = micro_aupr
    print("### Optimal parameters: ")
    print("C_opt = %0.3f, gamma_opt = %0.3f" % (C_opt, gamma_opt))
    print("### Train dataset: AUPR = %0.3f" % (max_aupr))
    print("### Computing performance on test dataset...")
    clf = OneVsRestClassifier(svm.SVC(C=C_opt,
                                      kernel='precomputed',
                                      probability=False),
                              n_jobs=-1)
    clf.fit(K_rbf_train[gamma_opt], y_train)

    # Compute performance on test set
    y_score = clf.decision_function(K_rbf_test[gamma_opt])
    y_pred = clf.predict(K_rbf_test[gamma_opt])

    # performance measures for bootstrapping
    perf = dict()
    pr_micro = []
    pr_macro = []
    fmax = []
    acc = []

    # individual goterms (guard against the goterms=None default)
    pr_goterms = {}
    if goterms is not None:
        for i in range(0, len(goterms)):
            pr_goterms[goterms[i]] = []

    for ind in bootstrap:
        perf_ind = evaluate_performance(y_test[ind], y_score[ind], y_pred[ind])
        pr_micro.append(perf_ind['m-aupr'])
        pr_macro.append(perf_ind['M-aupr'])
        fmax.append(perf_ind['F1'])
        acc.append(perf_ind['acc'])
        if goterms is not None:
            for i in range(0, len(goterms)):
                pr_goterms[goterms[i]].append(perf_ind[i])

    perf['m-aupr_avg'] = np.mean(pr_micro)
    perf['m-aupr_std'] = np.std(pr_micro)
    perf['M-aupr_avg'] = np.mean(pr_macro)
    perf['M-aupr_std'] = np.std(pr_macro)
    perf['F1_avg'] = np.mean(fmax)
    perf['F1_std'] = np.std(fmax)
    perf['acc_avg'] = np.mean(acc)
    perf['acc_std'] = np.std(acc)

    # trials
    fout = open(fname, 'w')
    fout.write("aupr[micro], aupr[macro], F_max, accuracy\n")
    for it in range(0, len(bootstrap)):
        fout.write("%0.5f, %0.5f, %0.5f, %0.5f\n" %
                   (pr_micro[it], pr_macro[it], fmax[it], acc[it]))
    fout.close()

    # write performance on individual GO terms
    if go_fname is not None:
        fout = open(go_fname, 'wb')
        print >> fout, "GO_id, AUPRs"
        for i in range(0, len(goterms)):
            print >> fout, goterms[i], sum(y_train[:, i]) / float(
                y_train.shape[0]),
            for pr in pr_goterms[goterms[i]]:
                print >> fout, pr,
            print >> fout
        fout.close()

    return perf
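The `evaluate_performance` helper called throughout this example is not part of the listing. A minimal sketch of what it plausibly computes, assuming indicator-matrix labels and standard sklearn metrics (the function name and the per-term integer keys are inferred from how `perf` is consumed above):

# Hypothetical sketch of the external evaluate_performance helper.
# Assumes y_true is a binary indicator matrix and y_score holds real-valued
# decision scores; per-column entries perf[i] back the pr_goterms bookkeeping.
from sklearn.metrics import accuracy_score, average_precision_score, f1_score

def evaluate_performance_sketch(y_true, y_score, y_pred):
    perf = {
        'm-aupr': average_precision_score(y_true, y_score, average='micro'),
        'M-aupr': average_precision_score(y_true, y_score, average='macro'),
        'F1': f1_score(y_true, y_pred, average='micro'),
        'acc': accuracy_score(y_true, y_pred),
    }
    for i in range(y_true.shape[1]):  # per-GO-term AUPR, keyed by column
        perf[i] = average_precision_score(y_true[:, i], y_score[:, i])
    return perf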
Ejemplo n.º 42
0
def test_ovr_coef_():
    ovr = OneVsRestClassifier(LinearSVC())
    ovr.fit(iris.data, iris.target)
    shape = ovr.coef_.shape
    assert_equal(shape[0], n_classes)
    assert_equal(shape[1], iris.data.shape[1])
Ejemplo n.º 43
0
    percentage = np.arange(0.1, 1, 0.1)

    classif = OneVsRestClassifier(lr)
    for p in percentage:
        random.shuffle(lbl)
        train_ins = int(len(lbl) * p)
        test_ins = lbl[train_ins:]
        train_ins = lbl[0:train_ins]

        X = np.zeros((len(train_ins), fea.shape[1]))
        Y = np.zeros((len(train_ins)))

        X_test = np.zeros((len(test_ins), fea.shape[1]))
        Y_test = np.zeros((len(test_ins)))

        for idx, tup in enumerate(train_ins):
            X[idx, :] = fea[tup[0], :]
            Y[idx] = tup[1]

        for idx, tup in enumerate(test_ins):
            X_test[idx, :] = fea[tup[0], :]
            Y_test[idx] = tup[1]

        classif.fit(X, Y)
        Y_pred = classif.predict(X_test)
        f1_a = f1_score(Y_test, Y_pred, average='macro')
        f1_i = f1_score(Y_test, Y_pred, average='micro')
        print('Macro', f1_a)
        print('Micro', f1_i)
Ejemplo n.º 44
0
def cross_validation(X, y, n_trials=5, trial_splits=None, fname=None):
    """Perform model selection via 5-fold cross validation"""
    # filter samples with no annotations
    del_rid = np.where(y.sum(axis=1) == 0)[0]
    y = np.delete(y, del_rid, axis=0)
    X = np.delete(X, del_rid, axis=0)

    # range of hyperparameters
    C_range = 10.**np.arange(-1, 3)
    gamma_range = 10.**np.arange(-3, 1)

    # pre-generating kernels
    print("### Pregenerating kernels...")
    K_rbf = {}
    for gamma in gamma_range:
        K_rbf[gamma] = rbf_kernel(X, gamma=gamma)
    print("### Done.")

    # performance measures
    perf = dict()
    pr_micro = []
    pr_macro = []
    fmax = []
    acc = []

    if trial_splits is None:
        # shuffle and split training and test sets
        trials = ShuffleSplit(n_splits=n_trials,
                              test_size=0.2,
                              random_state=None)
        ss = trials.split(X)
        trial_splits = []
        for train_idx, test_idx in ss:
            trial_splits.append((train_idx, test_idx))

    it = 0
    for jj in range(0, n_trials):
        train_idx = trial_splits[jj][0]
        test_idx = trial_splits[jj][1]
        it += 1
        y_train = y[train_idx]
        y_test = y[test_idx]
        print("### [Trial %d] Perfom cross validation...." % (it))
        print("Train samples=%d; #Test samples=%d" %
              (y_train.shape[0], y_test.shape[0]))
        # setup for nested cross-validation
        splits = ml_split(y_train)

        # parameter fitting
        C_opt = None
        gamma_opt = None
        max_aupr = 0
        for C in C_range:
            for gamma in gamma_range:
                # Multi-label classification
                cv_results = []
                for train, valid in splits:
                    clf = OneVsRestClassifier(svm.SVC(C=C,
                                                      kernel='precomputed',
                                                      probability=False),
                                              n_jobs=-1)
                    K_train = K_rbf[gamma][
                        train_idx[train], :][:, train_idx[train]]
                    K_valid = K_rbf[gamma][
                        train_idx[valid], :][:, train_idx[train]]
                    y_train_t = y_train[train]
                    y_train_v = y_train[valid]
                    y_score_valid = np.zeros(y_train_v.shape, dtype=float)
                    y_pred_valid = np.zeros_like(y_train_v)
                    idx = np.where(y_train_t.sum(axis=0) > 0)[0]
                    clf.fit(K_train, y_train_t[:, idx])
                    y_score_valid[:, idx] = clf.decision_function(K_valid)
                    y_pred_valid[:, idx] = clf.predict(K_valid)
                    perf_cv = evaluate_performance(y_train_v, y_score_valid,
                                                   y_pred_valid)
                    cv_results.append(perf_cv['m-aupr'])
                cv_aupr = np.median(cv_results)
                print("### gamma = %0.3f, C = %0.3f, AUPR = %0.3f" %
                      (gamma, C, cv_aupr))
                if cv_aupr > max_aupr:
                    C_opt = C
                    gamma_opt = gamma
                    max_aupr = cv_aupr
        print("### Optimal parameters: ")
        print("C_opt = %0.3f, gamma_opt = %0.3f" % (C_opt, gamma_opt))
        print("### Train dataset: AUPR = %0.3f" % (max_aupr))
        print("### Using full training data...")
        clf = OneVsRestClassifier(svm.SVC(C=C_opt,
                                          kernel='precomputed',
                                          probability=False),
                                  n_jobs=-1)
        y_score = np.zeros(y_test.shape, dtype=float)
        y_pred = np.zeros_like(y_test)
        idx = np.where(y_train.sum(axis=0) > 0)[0]
        clf.fit(K_rbf[gamma_opt][train_idx, :][:, train_idx], y_train[:, idx])

        # Compute performance on test set
        y_score[:, idx] = clf.decision_function(
            K_rbf[gamma_opt][test_idx, :][:, train_idx])
        y_pred[:, idx] = clf.predict(K_rbf[gamma_opt][test_idx, :][:,
                                                                   train_idx])
        perf_trial = evaluate_performance(y_test, y_score, y_pred)
        pr_micro.append(perf_trial['m-aupr'])
        pr_macro.append(perf_trial['M-aupr'])
        fmax.append(perf_trial['F1'])
        acc.append(perf_trial['acc'])
        print(
            "### Test dataset: AUPR['micro'] = %0.3f, AUPR['macro'] = %0.3f, F1 = %0.3f, Acc = %0.3f"
            % (perf_trial['m-aupr'], perf_trial['M-aupr'], perf_trial['F1'],
               perf_trial['acc']))
    perf['m-aupr_avg'] = np.mean(pr_micro)
    perf['m-aupr_std'] = np.std(pr_micro)
    perf['M-aupr_avg'] = np.mean(pr_macro)
    perf['M-aupr_std'] = np.std(pr_macro)
    perf['F1_avg'] = np.mean(fmax)
    perf['F1_std'] = np.std(fmax)
    perf['acc_avg'] = np.mean(acc)
    perf['acc_std'] = np.std(acc)

    if fname is not None:
        fout = open(fname, 'w')
        fout.write("aupr[micro], aupr[macro], F_max, accuracy\n")
        for ii in range(0, n_trials):
            fout.write("%0.5f, %0.5f, %0.5f, %0.5f\n" %
                       (pr_micro[ii], pr_macro[ii], fmax[ii], acc[ii]))
        fout.close()

    return perf
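Similarly, `ml_split` above is external to the listing. A hedged stand-in, under the assumption that a plain shuffled K-fold over the training rows is acceptable (a true multi-label stratifier such as iterative stratification would be closer to the original intent):

# Hypothetical stand-in for the external ml_split helper: plain 5-fold
# splits over the rows of the label matrix y. Not the original code.
import numpy as np
from sklearn.model_selection import KFold

def ml_split_sketch(y, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    return list(kf.split(np.arange(y.shape[0])))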
Ejemplo n.º 45
0
/* With split data in hand, you're only a few lines away from training a model.

In this exercise, you will import the logistic regression and one versus rest classifiers in order to fit a multi-class logistic regression model to the NUMERIC_COLUMNS of your feature data.

Then you'll test and print the accuracy with the .score() method to see the results of training.

Before you train! Remember, we're ultimately going to be using logloss to score our model, so don't worry too much about the accuracy here. Keep in mind that you're throwing away all of the text data in the dataset - that's by far most of the data! So don't get your hopes up for a killer performance just yet. We're just interested in getting things up and running at the moment.

All data necessary to call multilabel_train_test_split() has been loaded into the workspace. */
# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Create the DataFrame: numeric_data_only
numeric_data_only = df[NUMERIC_COLUMNS].fillna(-1000)

# Get labels and convert to dummy variables: label_dummies
label_dummies = pd.get_dummies(df[LABELS])

# Create training and test sets
X_train, X_test, y_train, y_test = multilabel_train_test_split(numeric_data_only, label_dummies, size=0.2, seed=123)

# Instantiate the classifier: clf
clf = OneVsRestClassifier(LogisticRegression())

# Fit the classifier to the training data
clf.fit(X_train, y_train)

# Print the accuracy
print("Accuracy: {}".format(clf.score(X_test, y_test)))
Ejemplo n.º 46
0
from sklearn.multiclass import OneVsRestClassifier
# from sklearn.preprocessing import MultiLabelBinarizer

# mlb = MultiLabelBinarizer(classes=np.unique(y))
# y_train = mlb.fit_transform([[el] for el in y_train])
# y_test = mlb.fit_transform([[el] for el in y_test])

# pickle.dump(tag_classifier, open('mlb.pkl', 'wb'))

######################################
######### YOUR CODE HERE #############
######################################

tag_classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=5, random_state=0))
tag_classifier.fit(X_train_tfidf, y_train)

# print(mlb.classes_)
# print(mlb.inverse_transform(y_test_pred[:5, :]))

# Check test accuracy.
y_test_pred = tag_classifier.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy = {}'.format(test_accuracy))

"""Dump the classifier to use it in the running bot."""

pickle.dump(tag_classifier, open(RESOURCE_PATH['TAG_CLASSIFIER'], 'wb'))

"""## Part II. Ranking  questions with embeddings
Y = np.array(train['label'].values, dtype=np.int32)

test = pd.read_csv('ftest.csv')
X_test = test[[str(i) for i in range(4096)]].values
Y_test = np.array(test['label'].values, dtype=np.int32)
Y_test = label_binarize(Y_test, classes=[i for i in range(193)])

Y = label_binarize(Y, classes=[i for i in range(193)])
n_classes = Y.shape[1]

random_state = np.random.RandomState(0)
n_samples, n_features = X.shape

classifier = OneVsRestClassifier(svm.SVC(kernel='poly', probability=True,
                                         random_state=random_state))
y_score = classifier.fit(X, Y).decision_function(X_test)

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(Y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

lw = 2

all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
from src.utils.initialize import *
from sklearn.model_selection import train_test_split
import pickle

with open('data/processed/target_train.pkl', 'rb') as f:
    Y_train = pickle.load(f)
print(
    "Loaded the training target variable Y from data/processed/target_train.pkl."
)

with open('data/processed/raw_count_features_train.pkl', 'rb') as f:
    X_train = pickle.load(f)
print("Loaded X from data/processed/raw_count_features_train.pkl.\n")

print("Shape of X_train is {X_train}.\n".format(X_train=X_train.shape))

###### Naive Bayes ########
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

classifnb = OneVsRestClassifier(MultinomialNB())
classifnb.fit(X_train, Y_train)
print("Trained using Multinomial Naive Bayes.")

with open('models/classifier_nb.pkl', 'wb') as f:
    pickle.dump(classifnb, f)
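`classification_report` and `f1_score` are imported above but never used. A hedged evaluation sketch that would exercise them, assuming test-set pickles exist alongside the training ones (both file paths below are assumptions):

# Hedged evaluation sketch; the test pickle paths mirror the training paths
# above and are assumptions, not files confirmed to exist.
with open('data/processed/target_test.pkl', 'rb') as f:
    Y_test = pickle.load(f)
with open('data/processed/raw_count_features_test.pkl', 'rb') as f:
    X_test = pickle.load(f)

Y_pred = classifnb.predict(X_test)
print(classification_report(Y_test, Y_pred))
print("Micro F1: {:.4f}".format(f1_score(Y_test, Y_pred, average='micro')))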
        for j in range(7):
            block = x_luv[32 * i:32 * (i + 1), 32 * j:32 * (j + 1)]
            mean, var = np.mean(block,
                                axis=tuple(range(block.ndim - 1))), np.var(
                                    block, axis=tuple(range(block.ndim - 1)))
            l = np.concatenate((l, mean))
            l = np.concatenate((l, var))
    x_test.append(l)
x_train = np.asarray(x_train).astype(np.float32)
x_test = np.asarray(x_test).astype(np.float32)
y_test = np.asarray(y_test)
y_train = np.asarray(y_train)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

classifier = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=3))
classifier.fit(x_train, y_train)

with open('onevsrest-knn-3-luv.pkl', 'wb') as f:
    pickle.dump(classifier, f)
'''
with open('onevsrest-knn-3-luv.pkl', 'rb') as f:
    classifier = pickle.load(f)
'''
predictions = classifier.predict(x_test)
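# The three scores below measure multi-label agreement differently:
# 'all match' is exact-set (subset) accuracy; 'at least one match' counts
# samples whose predicted labels are a subset of the true ones, excluding
# all-zero predictions; 'binary' is element-wise accuracy over all 5 labels.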
print('all match:',
      np.sum(np.all(predictions == y_test, axis=1)) / len(y_test))
print('at least one match:',
      (np.sum(np.all(predictions - y_test <= 0, axis=1)) -
       np.sum(np.all(predictions == 0, axis=1))) / len(y_test))
print('binary :', np.sum(predictions == y_test) / (5 * len(y_test)))
Ejemplo n.º 50
0
# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                    random_state=0)

# Learn to predict each class against the other
#classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
#                                 random_state=random_state))
classifier = OneVsRestClassifier(knn)

#y_score = classifier.fit(X_train, y_train).decision_function(X_test)
y_score = classifier.fit(X_train, y_train).predict(X_test)
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Plot ROC
plt.figure()
lw = 2
Ejemplo n.º 51
0
class Model:
    def __init__(self, estimator_file=None):
        self.estimator = None
        self.estimator_file = estimator_file
        self.estimator_name = None
        self.threshold = 0.5
        self.binarizer = None
        self.vectorizer = None

    def load(self):
        if self.estimator_file is None:
            raise ValueError(
                'Specify an estimator_file for loading a pre-trained model.')

        # Load the pickled model; the context manager guarantees the file
        # handle is closed even if unpickling fails
        with open(self.estimator_file, "rb") as model_fp:
            model = pickle.load(model_fp)

        # Set variables from loaded model
        self.estimator = model.estimator
        self.estimator_name = model.estimator_name
        self.binarizer = model.binarizer
        self.vectorizer = model.vectorizer

    def fit(self, X, y, estimator='logistic', n_jobs=-1, **estimator_params):
        # Select estimator
        if estimator == 'logistic':
            estimator = LogisticRegressionCV(**estimator_params)
        elif callable(estimator):
            estimator = estimator(**estimator_params)
        else:
            raise NotImplementedError(
                f'Estimator "{estimator}" not yet implemented!')

        # Build into OneVsRestClassifier and fit
        self.estimator = OneVsRestClassifier(estimator=estimator,
                                             n_jobs=n_jobs)
        self.estimator.fit(X, y)
        self.estimator_name = type(estimator).__name__

    def predict(self, X=None, probas=None, t=None):
        # Parameter setting and error checking
        if X is None and probas is None:
            raise TypeError("Either X or probas must be provided.")
        if t is None:
            t = self.threshold

        # Get probabilities matrix
        if probas is None:
            probas = self.estimator.predict_proba(X)

        # Set to 1 if probability is >= threshold
        return (probas >= t).astype(int)

    def score(self, y, X=None, probas=None, t=None):
        preds = self.predict(X, probas, t)
        return f1_score(y, preds, average='micro')

    def set_best_threshold(self,
                           X,
                           y,
                           precision=0.01,
                           max_t=0.5,
                           min_t=None,
                           bias=0):
        # Parameter setting and error checking
        if min_t is None:
            min_t = precision
        if min_t > max_t:
            raise ValueError(
                "Minimum threshold needs to be less than maximum.")

        # Get probas and score for current threshold
        probas = self.estimator.predict_proba(X)
        best_score = self.score(y, probas=probas)

        # Loop to try to find a better threshold
        for t in np.arange(min_t, max_t, precision):
            score = self.score(y, probas=probas, t=t)
            if score >= best_score:
                best_score = score
                self.threshold = t + bias
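A minimal usage sketch for the Model class above on synthetic multi-label data (the toy dataset is an assumption for illustration; `fit` takes the LogisticRegressionCV path shown in the class):

# Hedged usage sketch for the Model class, on synthetic multi-label data.
from sklearn.datasets import make_multilabel_classification

X_demo, y_demo = make_multilabel_classification(n_samples=200, n_classes=4,
                                                random_state=0)
m = Model()
m.fit(X_demo, y_demo, estimator='logistic', n_jobs=1)
m.set_best_threshold(X_demo, y_demo)   # may lower the 0.5 default threshold
print("micro-F1:", m.score(y_demo, X=X_demo))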
Ejemplo n.º 52
0
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_multilabel_classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import train_test_split

data, target = make_multilabel_classification(n_samples=1000,
                                              n_classes=3,
                                              n_labels=3,
                                              allow_unlabeled=True,
                                              random_state=1)

train_data, test_data, train_target, test_target = train_test_split(
    data, target, test_size=0.2)

classif = OneVsRestClassifier(SVC(kernel='linear'))
classif.fit(train_data, train_target)

predicted = classif.predict(test_data)

print(metrics.classification_report(test_target, predicted,
                                    target_names=['class 0', 'class 1',
                                                  'class 2']))
Ejemplo n.º 53
0
cpu_count = 1
if (len(argv) == 2):
    script, cpu_count = argv
try:
    cpu_count = int(cpu_count)
except ValueError:
    print("Cpu count should be a number")
    exit()
dataset = numpy.genfromtxt(open('../Data/train.csv', 'r'),
                           delimiter=',',
                           dtype='f8')[1:]
target = [x[0] for x in dataset]
train = [x[1:] for x in dataset]
test = numpy.genfromtxt(open('../Data/test.csv', 'r'),
                        delimiter=',',
                        dtype='f8')[1:]
number_of_svms = 40
svm_bagging_classifier = OneVsRestClassifier(
    BaggingClassifier(svm.SVC(C=0.01, gamma=1e-8),
                      max_samples=1.0 / number_of_svms,
                      n_estimators=number_of_svms,
                      n_jobs=cpu_count))
svm_bagging_classifier.fit(train, target)
predictions = svm_bagging_classifier.predict(test)
numpy.savetxt('../Predictions/svm_predictions.csv',
              numpy.c_[range(1,
                             len(test) + 1), predictions],
              delimiter=',',
              header='ImageId,Label',
              comments='',
              fmt='%d')
Ejemplo n.º 54
0
def run_test(filename, results_dir, models, random_state, external_split,
             internal_split, optimization_iterations):
    global df_results
    print(filename)
    data_dict['Dataset Name'] = filename.replace('.csv', '')
    df = pd.read_csv(directory + '/' + filename)
    X, Y = fix_dataset(df)
    kf = StratifiedKFold(n_splits=external_split,
                         random_state=random_state,
                         shuffle=True)
    for fold_index, (train_index, test_index) in enumerate(kf.split(X, Y)):
        data_dict['Cross Validation[1-10]'] = fold_index
        print("fold index =", fold_index)
        x_train = X.iloc[train_index]
        y_train = Y.iloc[train_index]
        x_test = X.iloc[test_index]
        y_test = Y.iloc[test_index]
        for model_name, model_class, model, model_dict in models:
            print('Model:', model_name)
            data_dict['Algorithm Name'] = model_name
            # distributions = dict(C=uniform(loc=0, scale=4), penalty=['l2', 'l1'])
            distributions = model_dict
            start_training_time = time.time()
            randomSearcher = RandomizedSearchCV(
                model,
                distributions,
                random_state=random_state,
                cv=internal_split,
                n_iter=optimization_iterations,
                scoring=make_scorer(accuracy_score))
            randomSearcher.fit(x_train, y_train.values.ravel())

            if model_class is wprb:
                params = {
                    k.replace("estimator__", ""): v
                    for k, v in randomSearcher.best_params_.items()
                }
                best_model = OneVsRestClassifier(model_class(**params))
            else:
                params = randomSearcher.best_params_
                best_model = model_class(**params)
            data_dict['Hyper-Parameters Values'] = params
            best_model.fit(x_train, y_train.values.ravel())
            data_dict['Training Time'] = time.time() - start_training_time
            print("best params:", params)
            print(
                "train accuracy:",
                round(accuracy_score(y_train, best_model.predict(x_train)), 4))
            start_inference_time = time.time()
            test_pred = best_model.predict(x_test)
            test_pred_proba = best_model.predict_proba(x_test)
            data_dict['Inference Time'] = (
                time.time() - start_inference_time) / (len(x_test)) * 1000
            print("test accuracy:", round(accuracy_score(y_test, test_pred),
                                          4))
            print()
            data_dict['Accuracy'] = accuracy_score(y_test, test_pred)
            data_dict['Precision'] = precision_score(
                y_test,
                test_pred,
                average='macro',
                labels=np.unique(test_pred))
            unique_labels = np.unique(Y.values)
            if len(unique_labels) == 2:  # multiclass vs binary classification
                data_dict['AUC'] = roc_auc_score(y_true=y_test,
                                                 y_score=test_pred_proba[:, 1])
            else:
                # plaster = test_pred_proba[:, [np.where(np.unique(Y.values) == x)[0][0] for x in np.unique(y_test)]]
                # plaster2 = np.array([[x / sum(y) for x in y] for y in plaster])
                data_dict['AUC'] = roc_auc_score(y_true=y_test,
                                                 y_score=test_pred_proba,
                                                 multi_class='ovr',
                                                 labels=np.unique(y_test))
            all_TPR = []
            all_FPR = []
            all_PR_CURVE = []
            for index, class_label in enumerate(np.unique(y_test)):
                tn, fp, fn, tp = confusion_matrix(
                    y_test == class_label, test_pred == class_label).ravel()
                all_FPR.append(fp / (fp + tn))
                all_TPR.append(tp / (tp + fn))
                precision, recall, _ = precision_recall_curve(
                    y_test == class_label, test_pred_proba[:, index])
                all_PR_CURVE.append(auc(recall, precision))
            data_dict['FPR'] = np.mean(all_FPR)
            data_dict['TPR'] = np.mean(all_TPR)
            data_dict['PR Curve'] = np.mean(all_PR_CURVE)

            df_results = pd.concat([df_results, pd.DataFrame([data_dict])],
                                   ignore_index=True)
    df_results.to_csv(results_dir + '/' + filename, index=False)
    df_results = df_results.iloc[0:0]
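`run_test` expects `models` to be a list of (name, class, instance, distribution-dict) tuples. A hedged sketch of one such entry, with illustrative hyper-parameter distributions echoing the commented-out example in the function body:

# Hypothetical construction of one `models` entry for run_test; the
# parameter distributions are illustrative only.
from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression

lr_distributions = {'C': uniform(loc=0, scale=4), 'penalty': ['l2']}
models = [('LogisticRegression', LogisticRegression,
           LogisticRegression(max_iter=1000), lr_distributions)]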
Ejemplo n.º 55
0
classifier = SVC(
    C=100,
    kernel='rbf',  # kernel type
    degree=3,  # default value
    gamma=1,
    coef0=1,
    shrinking=True,
    tol=0.001,
    probability=False,
    cache_size=200,
    class_weight=None,
    verbose=False,
    max_iter=-1,
    decision_function_shape='ovr',
    random_state=None)
model = OneVsRestClassifier(classifier, n_jobs=4)

model.fit(X, y)

y_test = model.predict(X_test)
y_pred = lb.inverse_transform(y_test)

test_id = [doc['id'] for doc in test]
sub = pd.DataFrame({
    'id': test_id,
    'cuisine': y_pred
},
                   columns=['id', 'cuisine'])
sub.to_csv('svm_output.csv', index=False)
vencedor = resultados[maximo]
print('**************')
print("Winner: ")
print(vencedor)
print('**************')

## Train the final (winning) model
vencedor.fit(treino_dados, treino_marcacoes)

## Save the winning model
dump(vencedor, arquivo_modelo_salvo)
np.set_printoptions(precision=2)

tempo_final = time.time()
tempo_total = ((tempo_final - tempo_inicial) / 60)
print('-------------------------------------------------------------------------------------')
print("Total execution time in minutes: %.2f" % tempo_total)
print('-------------------------------------------------------------------------------------')

## Real test with validation data
teste_real(vencedor, validacao_dados, validacao_marcacoes)

modeloOneVsRest.fit(treino_dados, treino_marcacoes)
teste_real(modeloOneVsRest, validacao_dados, validacao_marcacoes)

modeloMultinomial.fit(treino_dados, treino_marcacoes)
teste_real(modeloMultinomial, validacao_dados, validacao_marcacoes)

modeloAdaBoost.fit(treino_dados, treino_marcacoes)
teste_real(modeloAdaBoost, validacao_dados, validacao_marcacoes)
def multiclass_classifier(X_train, X_test, y_train, y_test, model,
                          list_of_classes, class_labels):

    # Binarize the output
    y_train, y_test = label_binarize(y_train,
                                     classes=list_of_classes), label_binarize(
                                         y_test, classes=list_of_classes)
    n_classes = len(class_labels)

    # Learn to predict each class against the other
    classifier = OneVsRestClassifier(model)
    y_score = classifier.fit(X_train, y_train).predict_proba(X_test)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure(figsize=(12, 12))
    plt.plot(fpr["micro"],
             tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
             ''.format(roc_auc["micro"]),
             color='deeppink',
             linestyle=':',
             linewidth=4)

    plt.plot(fpr["macro"],
             tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
             ''.format(roc_auc["macro"]),
             color='navy',
             linestyle=':',
             linewidth=4)

    colors = cycle([
        'aqua', 'darkorange', 'cornflowerblue', 'green', 'purple', 'red',
        'blue'
    ])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i],
                 tpr[i],
                 color=color,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(i + 1, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(
        'Some extension of Receiver operating characteristic to multi-class')
    plt.legend(loc="lower right")
    plt.show()

    y_prob = classifier.predict_proba(X_test)

    # macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo",
    #                                   average="macro")
    # weighted_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo",
    #                                      average="weighted")
    macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, average="macro")
    weighted_roc_auc_ovr = roc_auc_score(y_test, y_prob, average="weighted")
    # print("One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
    #       "(weighted by prevalence)"
    #       .format(macro_roc_auc_ovo, weighted_roc_auc_ovo))

    y_pred = classifier.predict(X_test)

    mcm = multilabel_confusion_matrix(y_test, y_pred, labels=class_labels)

    print("One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
          "(weighted by prevalence)".format(
              macro_roc_auc_ovr,
              weighted_roc_auc_ovr)), print(figure), print(mcm)

    return classifier
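A hedged call sketch for `multiclass_classifier`, using iris and a probability-capable base model (integer class labels keep the confusion-matrix call consistent with the binarized targets; the dataset choice is an assumption):

# Hedged usage sketch for multiclass_classifier on the iris dataset.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

iris = load_iris()
X_tr, X_te, y_tr, y_te = train_test_split(iris.data, iris.target,
                                          test_size=0.3, random_state=0)
clf = multiclass_classifier(X_tr, X_te, y_tr, y_te,
                            model=LogisticRegression(max_iter=1000),
                            list_of_classes=[0, 1, 2],
                            class_labels=[0, 1, 2])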
    def before_trading_start(context, data):
        context.days_traded += 1

        if context.model == {} or context.model['refresh_date'] <= context.days_traded:
            context.model['refresh_date'] = context.days_traded + context.refresh_frequency
            clusters = {}

            ws.send(msg_placeholder % "Retraining the clustering ML model")

            for ret_window in context.ret_windows:
                clusters[ret_window] = {'windows': {}}

                for window_length in context.window_lengths:
                    cluster_data = create_kmeans_features(context, data, window_length, ret_window)

                    window_length_str = str(window_length)

                    ws.send(msg_placeholder % ("Feature set for k-means with a look back of %s days created"
                                               % window_length_str))

                    cluster_data.dropna(inplace=True)
                    X = cluster_data.drop('rets', axis=1)
                    y = cluster_data['rets']

                    kmeans = KMeans(n_clusters=context.n_clusters, n_init=100, max_iter=500,
                                    random_state=42)
                    kmeans.fit(X)

                    ws.send(msg_placeholder % ("K-means cluster for look back of %s days trained" % window_length_str))

                    clusters[ret_window]['windows'][window_length] = {
                        "kmeans": kmeans,
                        "regimes": kmeans.predict(X),
                        "rets": y
                    }

            ws.send(msg_placeholder % "Retraining the Random Forest ML model")

            panel = create_rand_forest_features(clusters)

            ws.send(msg_placeholder % "Feature set for Random Forest created")

            for ret_window, _ in clusters.items():
                df = panel[ret_window]
                ret = df['rets']

                X = df.drop('rets', axis=1)
                X_train = X.values

                if context.use_classifier:

                    global ret_buckets

                    try:
                        ret_buckets = context.ret_buckets[ret_window]
                    except KeyError:
                        ret_buckets = context.ret_buckets['gen']

                    clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=1000, random_state=42))

                    y = len(ret_buckets) * np.ones(len(ret)).astype(int)
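                    # Discretize returns into bucket labels: scanning bounds
                    # from highest to lowest, samples below ret_buckets[i]
                    # end up labeled i; anything above all bounds keeps the
                    # sentinel label len(ret_buckets).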

                    for i in range(len(ret_buckets) - 1, -1, -1):
                        I = ret.values < ret_buckets[i]

                        y[I] = i

                    y_train = y

                    clf.fit(X_train, y_train)
                    clusters[ret_window]['clf'] = clf

                    ws.send(msg_placeholder % "Random Forest Classifier trained")

                else:
                    rfr = RandomForestRegressor(n_estimators=1000, random_state=42)
                    rfr.fit(X_train, ret.values)
                    clusters[ret_window]['reg'] = rfr

                    ws.send(msg_placeholder % "Random Forest Regression trained")

            context.model['clusters'] = clusters
Ejemplo n.º 59
0
svm_clf = OneVsRestClassifier(SVC(C=10))
cross_val_score(svm_clf, X_train, y_train, cv=10).mean()

svm_clf = OneVsRestClassifier(SVC(C=100))
cross_val_score(svm_clf, X_train, y_train, cv=10).mean()


# In[93]:

svm_optimized = OneVsRestClassifier(SVC(C=10))


# In[94]:

svm_optimized.fit(X_train, y_train)
svm_optimized.score(X_test, y_test)


# In[95]:

plot_learning_curve(svm_optimized, title='SVM learning curve', X=X_train, y=y_train, cv=10)
plt.show()


# ### Artificial Neural Networks

# In[50]:

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
random_state = np.random.RandomState(0)
n_classes = 2
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.333,
                                                    random_state=0)

# Run classifier
classifier = OneVsRestClassifier(
    svm.SVC(kernel='linear', probability=True, random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')

plt.legend(loc="lower right")

plt.savefig(
    '/home//askrey/Dropbox/Project_step_by_step/3_create_database/SVM.png')