def train_classifier(clf,X_train,y_train,X_test,y_test):
	clf = OneVsOneClassifier(clf)
	t0 = time()
	clf.fit(X_train, y_train)
	train_time = time() - t0
	print("train time: %0.3fs" % train_time)
	return clf
Example #2
def gen_svc(train_model):
    '''Given a training model, generates the SVM (and DictVectorizer) for it

    Args:
        train_model: a training model object. Should have 2 attributes:
        feature_lists, a map from POS tag to the list of feature dicts
        (one per decision), and action_lists, a map from POS tag to the
        list of actions (Shift, Left, Right) chosen for those decisions
    Returns: dictionary mapping POS tag to a vectorizer, SVM tuple
    Raises: None
    '''
    models = {}
    for pos_tag in train_model.feature_lists:
        vec = DictVectorizer()
        feature_mat = vec.fit_transform(train_model.feature_lists[pos_tag])
        trained_svc = OneVsOneClassifier(LinearSVC())
        try:
            trained_svc.fit(feature_mat, np.array(train_model.action_lists[pos_tag]))
        except ValueError:
            # occasionally we get the same action for everything with a
            # particular POS, which raises an error. so in that case we just
            # use a custom class that always predicts the same action
            trained_svc = AlwaysPredict(train_model.action_lists[pos_tag][0])
        models[pos_tag] = (vec, trained_svc)
    return models
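# AlwaysPredict is referenced above but not defined in this snippet. A minimal
# sketch of such a fallback, assuming it only needs to expose the same
# predict(feature_matrix) interface as the fitted classifier and always return
# the single action seen during training:
class AlwaysPredict(object):
    def __init__(self, action):
        self.action = action

    def predict(self, feature_mat):
        # one constant prediction per input row
        return np.array([self.action] * feature_mat.shape[0])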
def svm_training(train_X,train_Y,kernel):
	if kernel == False:
		clf = OneVsOneClassifier(svm.LinearSVC(random_state=0))
	else:
		clf = OneVsOneClassifier(svm.SVC(kernel='rbf'))
	clf.fit(train_X,train_Y)
	return clf
def test_ovo_partial_fit_predict():
    X, y = shuffle(iris.data, iris.target)
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:100], y[:100], np.unique(y))
    ovo1.partial_fit(X[100:], y[100:])
    pred1 = ovo1.predict(X)

    ovo2 = OneVsOneClassifier(MultinomialNB())
    ovo2.fit(X, y)
    pred2 = ovo2.predict(X)
    assert_equal(len(ovo1.estimators_), n_classes * (n_classes - 1) / 2)
    assert_greater(np.mean(y == pred1), 0.65)
    assert_almost_equal(pred1, pred2)

    # Test when mini-batches don't have all target classes
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovo1.partial_fit(iris.data[60:], iris.target[60:])
    pred1 = ovo1.predict(iris.data)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(iris.data, iris.target).predict(iris.data)

    assert_almost_equal(pred1, pred2)
    assert_equal(len(ovo1.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == pred1), 0.65)
Example #5
def gen_svc(train_model):
    '''Given a training model, generates the SVM (and DictVectorizer) for it'''
    vec = DictVectorizer()
    feature_mat = vec.fit_transform(train_model.feature_list)
    # for some reason just SVC() seems to always suggest "Shift"
    trained_svc = OneVsOneClassifier(LinearSVC())
    trained_svc.fit(feature_mat, np.array(train_model.action_list))
    return vec, trained_svc
Example #6
def test_ovo_fit_on_list():
    # Test that OneVsOne fitting works with a list of targets and yields the
    # same output as predict from an array
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    prediction_from_array = ovo.fit(iris.data, iris.target).predict(iris.data)
    prediction_from_list = ovo.fit(iris.data,
                                   list(iris.target)).predict(iris.data)
    assert_array_equal(prediction_from_array, prediction_from_list)
Example #7
def test_ovo_string_y():
    # Test that the OvO doesn't mess up the encoding of string labels
    X = np.eye(4)
    y = np.array(['a', 'b', 'c', 'd'])

    ovo = OneVsOneClassifier(LinearSVC())
    ovo.fit(X, y)
    assert_array_equal(y, ovo.predict(X))
Example #8
def test_ovo_string_y():
    "Test that the OvO doesn't screw the encoding of string labels"
    X = np.eye(4)
    y = np.array(['a', 'b', 'c', 'd'])

    svc = LinearSVC()
    ovo = OneVsOneClassifier(svc)
    ovo.fit(X, y)
    assert_array_equal(y, ovo.predict(X))
def OneVsOne(inputs_train, inputs_valid, target_train, target_valid):
	name = "Multiclass One Vs One"
	clf = OneVsOneClassifier(LinearSVC(random_state=0))
	clf.fit(inputs_train, np.ravel(target_train))
	prediction = clf.predict(inputs_valid)
	correct = np.count_nonzero(np.ravel(target_valid) == prediction)
	total = target_valid.shape[0]
	correctRate = (float(correct)/total)*100

	return name, correctRate
Example #10
def test_ovo_fit_predict():
    # A classifier which implements decision_function.
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    ovo.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(ovo.estimators_), n_classes * (n_classes - 1) / 2)

    # A classifier which implements predict_proba.
    ovo = OneVsOneClassifier(MultinomialNB())
    ovo.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(ovo.estimators_), n_classes * (n_classes - 1) / 2)
def svm(X,Y):
    X_train = np.array([x for i, x in enumerate(X) if i % 7 != 0], dtype = np.uint8)
    y_train = np.array([z for i, z in enumerate(Y) if i % 7 != 0], dtype = np.uint8)
    X_test  = np.array([x for i, x in enumerate(X) if i % 10 == 0], dtype = np.uint8)
    y_test  = np.array([z for i, z in enumerate(Y) if i % 10 == 0], dtype = np.uint8)

    clf = OneVsOneClassifier(LinearSVC(random_state=0))
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)

    results = [prediction == truth for prediction, truth in zip(y_predicted, y_test)]
    accuracy = float(results.count(True)) / float(len(results))
    print accuracy
def test_pairwise_indices():
    clf_precomputed = svm.SVC(kernel="precomputed")
    X, y = iris.data, iris.target

    ovr_false = OneVsOneClassifier(clf_precomputed)
    linear_kernel = np.dot(X, X.T)
    ovr_false.fit(linear_kernel, y)

    n_estimators = len(ovr_false.estimators_)
    precomputed_indices = ovr_false.pairwise_indices_

    for idx in precomputed_indices:
        assert_equal(idx.shape[0] * n_estimators / (n_estimators - 1), linear_kernel.shape[0])
def test_ovo_partial_fit_predict():
    temp = datasets.load_iris()
    X, y = temp.data, temp.target
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:100], y[:100], np.unique(y))
    ovo1.partial_fit(X[100:], y[100:])
    pred1 = ovo1.predict(X)

    ovo2 = OneVsOneClassifier(MultinomialNB())
    ovo2.fit(X, y)
    pred2 = ovo2.predict(X)
    assert_equal(len(ovo1.estimators_), n_classes * (n_classes - 1) / 2)
    assert_greater(np.mean(y == pred1), 0.65)
    assert_almost_equal(pred1, pred2)

    # Test when mini-batches have binary target classes
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:60], y[:60], np.unique(y))
    ovo1.partial_fit(X[60:], y[60:])
    pred1 = ovo1.predict(X)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(X, y).predict(X)

    assert_almost_equal(pred1, pred2)
    assert_equal(len(ovo1.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == pred1), 0.65)

    ovo = OneVsOneClassifier(MultinomialNB())
    X = np.random.rand(14, 2)
    y = [1, 1, 2, 3, 3, 0, 0, 4, 4, 4, 4, 4, 2, 2]
    ovo.partial_fit(X[:7], y[:7], [0, 1, 2, 3, 4])
    ovo.partial_fit(X[7:], y[7:])
    pred = ovo.predict(X)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(X, y).predict(X)
    assert_almost_equal(pred, pred2)

    # raises error when mini-batch does not have classes from all_classes
    ovo = OneVsOneClassifier(MultinomialNB())
    error_y = [0, 1, 2, 3, 4, 5, 2]
    message_re = escape("Mini-batch contains {0} while "
                        "it must be subset of {1}".format(np.unique(error_y),
                                                          np.unique(y)))
    assert_raises_regexp(ValueError, message_re, ovo.partial_fit, X[:7],
                         error_y, np.unique(y))

    # test partial_fit only exists if estimator has it:
    ovr = OneVsOneClassifier(SVC())
    assert_false(hasattr(ovr, "partial_fit"))
Example #14
def test_ovo_ties():
    # test that ties are broken using the decision function, not defaulting to
    # the smallest label
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y = np.array([2, 0, 1, 2])
    multi_clf = OneVsOneClassifier(Perceptron())
    ovo_prediction = multi_clf.fit(X, y).predict(X)

    # recalculate votes to make sure we have a tie
    predictions = np.vstack([clf.predict(X) for clf in multi_clf.estimators_])
    scores = np.vstack([clf.decision_function(X)
                        for clf in multi_clf.estimators_])
    # classifiers are in order 0-1, 0-2, 1-2
    # aggregate votes:
    votes = np.zeros((4, 3))
    votes[np.arange(4), predictions[0]] += 1
    votes[np.arange(4), 2 * predictions[1]] += 1
    votes[np.arange(4), 1 + predictions[2]] += 1
    # for the first point, there is one vote per class
    assert_array_equal(votes[0, :], 1)
    # for the rest, there is no tie and the prediction is the argmax
    assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
    # for the tie, the prediction is the class with the highest score
    assert_equal(ovo_prediction[0], 0)
    # in the zero-one classifier, the score for 0 is greater than the score for
    # one.
    assert_greater(scores[0][0], scores[0][1])
    # score for one is greater than score for zero
    assert_greater(scores[2, 0] - scores[0, 0], scores[0, 0] + scores[1, 0])
    # score for one is greater than score for two
    assert_greater(scores[2, 0] - scores[0, 0], -scores[1, 0] - scores[2, 0])
Example #15
def svm_classification(genres, features_type):
	training_set_features = tf.read_features_from_files("../../music/training", genres, features_type)
	testing_set_features = tf.read_features_from_files("../../music/testing", genres, features_type)

	X = []
	y = []
	for feature in training_set_features:
		(mean, cov_mat, genre_name) = feature
		X.append(mean.tolist())
		y.append(tf.get_genre_ID(genre_name))

	training_data = np.array(X)
	training_class = np.array(y)

	X = []
	y = []
	for feature in testing_set_features:
		(mean, cov_mat, genre_name) = feature
		X.append(mean.tolist())
		y.append(tf.get_genre_ID(genre_name))

	testing_data = np.array(X)
	testing_class = np.array(y)


	clf = OneVsOneClassifier(SVC(kernel='linear'))
	result_class = np.array(clf.fit(training_data, training_class).predict(testing_data))

	rt.print_accuracy(list(testing_class), list(result_class), genres, features_type, "svm")
	rt.write_accuracy_to_file("../../music/", list(testing_class), list(result_class), genres, features_type, "svm")
def test_ovo_decision_function():
    n_samples = iris.data.shape[0]

    ovo_clf = OneVsOneClassifier(LinearSVC(random_state=0))
    # first binary
    ovo_clf.fit(iris.data, iris.target == 0)
    decisions = ovo_clf.decision_function(iris.data)
    assert_equal(decisions.shape, (n_samples,))

    # then multi-class
    ovo_clf.fit(iris.data, iris.target)
    decisions = ovo_clf.decision_function(iris.data)

    assert_equal(decisions.shape, (n_samples, n_classes))
    assert_array_equal(decisions.argmax(axis=1), ovo_clf.predict(iris.data))

    # Compute the votes
    votes = np.zeros((n_samples, n_classes))

    k = 0
    for i in range(n_classes):
        for j in range(i + 1, n_classes):
            pred = ovo_clf.estimators_[k].predict(iris.data)
            votes[pred == 0, i] += 1
            votes[pred == 1, j] += 1
            k += 1

    # Extract votes and verify
    assert_array_equal(votes, np.round(decisions))

    for class_idx in range(n_classes):
        # For each sample and each class, there are only 3 possible vote
        # levels, because there are only 3 distinct class pairs and thus
        # 3 distinct binary classifiers.
        # Therefore, sorting predictions based on votes would yield
        # mostly tied predictions:
        assert_true(set(votes[:, class_idx]).issubset(set([0., 1., 2.])))

        # The OVO decision function on the other hand is able to resolve
        # most of the ties on this data as it combines both the vote counts
        # and the aggregated confidence levels of the binary classifiers
        # to compute the aggregate decision function. The iris dataset
        # has 150 samples with a couple of duplicates. The OvO decisions
        # can resolve most of the ties:
        assert_greater(len(np.unique(decisions[:, class_idx])), 146)
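# A small interactive sketch of the property tested above (assuming the same
# iris data and imports as the tests): the rounded OvO decision values are the
# pairwise vote counts, and the argmax of the full decision values matches
# predict(), with the fractional confidences breaking vote ties.
ovo_demo = OneVsOneClassifier(LinearSVC(random_state=0)).fit(iris.data, iris.target)
demo_decisions = ovo_demo.decision_function(iris.data)
print(np.unique(np.round(demo_decisions)))           # vote levels 0., 1., 2.
print(np.array_equal(demo_decisions.argmax(axis=1),
                     ovo_demo.predict(iris.data)))   # True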
def trainOneVsOne2( histograms ):

    xAll = convertToSvmFormatFeature(histograms)
    scaleParam = computeScaleParameters(xAll)
    scaleFeatureData(xAll,scaleParam)

    xAll = np.array(xAll)

    yAll = [ x['label'] for x in histograms ]
    yAll = np.array(yAll)

    # svm = OneVsOneClassifier(LinearSVC(random_state=0,dual=svm_conf['dual'],C=svm_conf['C']))
    gammaBase = 1.0/kmeans_conf['K']
    # svm = OneVsOneClassifier(sklearn.svm.SVC(C=100, gamma=10*gammaBase,kernel='rbf'))
    svm = OneVsOneClassifier(sklearn.svm.SVC(C=1000, gamma=gammaBase,kernel='sigmoid'))
    svm.fit(xAll,yAll)

    out = {'scaleParam':scaleParam,'svm':svm}
    return out
Example #18
def test_ovo_ties2():
    # test that ties can not only be won by the first two labels
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y_ref = np.array([2, 0, 1, 2])

    # cycle through labels so that each label wins once
    for i in range(3):
        y = (y_ref + i) % 3
        multi_clf = OneVsOneClassifier(Perceptron())
        ovo_prediction = multi_clf.fit(X, y).predict(X)
        assert_equal(ovo_prediction[0], i % 3)
Example #19
    def test_multicluster(self):
        c = BinaryTiloClassifier(PinchRatioCutStrategy(),
                                 similarity.Gaussian())
        ##c = BinaryTiloClassifier(similarity.KNN())
        ##mcc = OneVsRestClassifier(c)
        mcc = OneVsOneClassifier(c)
        data = self.three_class_pts
        classes = self.three_class_labels

        perturbed_data = data + 0.01 * np.random.random(data.shape)
        fitted = mcc.fit(perturbed_data, classes)
        guesses = fitted.predict(perturbed_data)
        assert_array_equal(guesses, classes)
def learn(cat1,cat2,cat3):
    X = []
    Y = []
    IDF=get_IDF([cat1,cat2,cat3])
    for d in cat1:
        X.append(MapToEvalVS(d,IDF));
        Y.append(0)
    for d in cat2:
        X.append(MapToEvalVS(d,IDF));
        Y.append(1)
    for d in cat3:
        X.append(MapToEvalVS(d,IDF));
        Y.append(2)

    X=np.array(X)
    Y=np.array(Y)
    #clf = svm.SVC(verbose=True)
    #clf=svm.SVC()
    clf = OneVsOneClassifier(svm.SVC())
    #clf=KNeighborsClassifier(weights='distance')
    clf.fit(X, Y)
    return [clf,IDF]
    def analysis(self, testanalysis=True):
        if testanalysis:
            trainingdata, testdata = self.getTrainTestData()
        else:
            trainingdata, testdata = self.getRealData()

        aDict = {}
        for value in trainingdata:
            phrase = value.Phrase

            phrase = phrase.strip()

            aDict[phrase] = value.Sentiment

        _all_values = aDict.keys()
        _all_sentiments = aDict.values()

        # self.KFOLDTEST(np.asarray(_all_values), np.asarray(_all_sentiments))

        count_vectorizer = CountVectorizer(ngram_range=(1, 2), tokenizer=self.tokenize_data)
        count = count_vectorizer.fit_transform(_all_values)

        # self.countWordFreq(count_vectorizer, count)

        tfidf = TfidfTransformer(norm="l2", smooth_idf=False, use_idf=False)
        data = tfidf.fit_transform(count)

        classifier = OneVsOneClassifier(LinearSVC())
        classifier.fit(data, np.asarray(_all_sentiments))

        # To write the results to a CSV instead, comment out the test split
        # above so the entire training set is used as the real data, and call
        # the @getRealData method
        if testanalysis:
            self.normalexecution(testdata, count_vectorizer, tfidf, classifier)
        else:
            self.writeToFile(testdata, count_vectorizer, tfidf, classifier)
Example #22
def test_ovo_ties():
    # Test that ties are broken using the decision function,
    # not defaulting to the smallest label
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y = np.array([2, 0, 1, 2])
    multi_clf = OneVsOneClassifier(Perceptron())
    ovo_prediction = multi_clf.fit(X, y).predict(X)
    ovo_decision = multi_clf.decision_function(X)

    # Classifiers are in order 0-1, 0-2, 1-2
    # Use decision_function to compute the votes and the normalized
    # sum_of_confidences, which is used to disambiguate when there is a tie in
    # votes.
    votes = np.round(ovo_decision)
    normalized_confidences = ovo_decision - votes

    # For the first point, there is one vote per class
    assert_array_equal(votes[0, :], 1)
    # For the rest, there is no tie and the prediction is the argmax
    assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
    # For the tie, the prediction is the class with the highest score
    assert_equal(ovo_prediction[0], normalized_confidences[0].argmax())
Example #23
from sklearn import svm, datasets
from sklearn.multiclass import OneVsOneClassifier
# instantiate SVC()
clf = svm.SVC()
# load the iris dataset
iris = datasets.load_iris()
print(iris)

X, y = iris.data, iris.target
clf = svm.LinearSVC(random_state=0)

clf = OneVsOneClassifier(clf)  # build a multiclass classifier from the binary classifier
clf.fit(X, y)  # train the model
y_pred = clf.predict(X)  # predict on the samples
print('Correct predictions: %d, incorrect predictions: %d' % ((y == y_pred).sum(), (y != y_pred).sum()))
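# A quick check on the pairwise structure (using the clf fitted above): OvO
# trains one binary classifier per unordered class pair, i.e.
# n_classes * (n_classes - 1) / 2 = 3 estimators for the 3 iris classes.
print(len(clf.estimators_))  # 3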
Example #24
clf = OneVsOneClassifier(LinearSVC(C=1.0, random_state=0))
pred = cross_val_predict(clf, features, target, cv=30, n_jobs=-1)
print(
    classification_report(target,
                          pred,
                          target_names=train_val_data.label.unique()))

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(min_samples_split=5,
                             random_state=0,
                             n_estimators=100,
                             n_jobs=-1,
                             verbose=1,
                             class_weight="balanced")
clf.fit(features, target)

val_features = validation_data.drop(['label'], axis=1).values
val_target = validation_data.label.values
predicted = clf.predict(val_features)
print(
    classification_report(val_target,
                          predicted,
                          target_names=train_val_data.label.unique()))

test_features = test_data.drop(['label'], axis=1).values
test_target = test_data.label.values
predicted = clf.predict(test_features)
print(
    classification_report(test_target,
                          predicted,
                          target_names=train_val_data.label.unique()))
Example #25
class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
    """Gaussian process classification (GPC) based on Laplace approximation.

    The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
    Gaussian Processes for Machine Learning (GPML) by Rasmussen and
    Williams.

    Internally, the Laplace approximation is used for approximating the
    non-Gaussian posterior by a Gaussian.

    Currently, the implementation is restricted to using the logistic link
    function. For multi-class classification, several binary one-versus-rest
    classifiers are fitted. Note that this class thus does not implement
    a true multi-class Laplace approximation.

    Parameters
    ----------
    kernel : kernel object
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
        the kernel's hyperparameters are optimized during fitting.

    optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
        Can either be one of the internally supported optimizers for optimizing
        the kernel's parameters, specified by a string, or an externally
        defined optimizer passed as a callable. If a callable is passed, it
        must have the  signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func' is the objective function to be maximized, which
                #   takes the hyperparameters theta as parameter and an
                #   optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can be
                #   used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        Per default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are::

            'fmin_l_bfgs_b'

    n_restarts_optimizer : int, optional (default: 0)
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that n_restarts_optimizer=0 implies that one
        run is performed.

    max_iter_predict : int, optional (default: 100)
        The maximum number of iterations in Newton's method for approximating
        the posterior during predict. Smaller values will reduce computation
        time at the cost of worse results.

    warm_start : bool, optional (default: False)
        If warm-starts are enabled, the solution of the last Newton iteration
        on the Laplace approximation of the posterior mode is used as
        initialization for the next call of _posterior_mode(). This can speed
        up convergence when _posterior_mode is called several times on similar
        problems as in hyperparameter optimization.

    copy_X_train : bool, optional (default: True)
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    random_state : int, RandomState instance or None, optional (default: None)
        The generator used to initialize the centers.
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    multi_class : string, default : "one_vs_rest"
        Specifies how multi-class classification problems are handled.
        Supported are "one_vs_rest" and "one_vs_one". In "one_vs_rest",
        one binary Gaussian process classifier is fitted for each class, which
        is trained to separate this class from the rest. In "one_vs_one", one
        binary Gaussian process classifier is fitted for each pair of classes,
        which is trained to separate these two classes. The predictions of
        these binary predictors are combined into multi-class predictions.
        Note that "one_vs_one" does not support predicting probability
        estimates.

    n_jobs : int, optional, default: 1
        The number of jobs to use for the computation. If -1 all CPUs are used.
        If 1 is given, no parallel computing code is used at all, which is
        useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
        used. Thus for n_jobs = -2, all CPUs but one are used.

    Attributes
    ----------
    kernel_ : kernel object
        The kernel used for prediction. In case of binary classification,
        the structure of the kernel is the same as the one passed as parameter
        but with optimized hyperparameters. In case of multi-class
        classification, a CompoundKernel is returned which consists of the
        different kernels used in the one-versus-rest classifiers.

    log_marginal_likelihood_value_ : float
        The log-marginal-likelihood of ``self.kernel_.theta``

    classes_ : array-like, shape = (n_classes,)
        Unique class labels.

    n_classes_ : int
        The number of classes in the training data

    .. versionadded:: 0.18
    """
    def __init__(self,
                 kernel=None,
                 optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0,
                 max_iter_predict=100,
                 warm_start=False,
                 copy_X_train=True,
                 random_state=None,
                 multi_class="one_vs_rest",
                 n_jobs=1):
        self.kernel = kernel
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.max_iter_predict = max_iter_predict
        self.warm_start = warm_start
        self.copy_X_train = copy_X_train
        self.random_state = random_state
        self.multi_class = multi_class
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values, must be binary

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y, multi_output=False)

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            self.kernel, self.optimizer, self.n_restarts_optimizer,
            self.max_iter_predict, self.warm_start, self.copy_X_train,
            self.random_state)

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size
        if self.n_classes_ == 1:
            raise ValueError("GaussianProcessClassifier requires 2 or more "
                             "distinct classes; got %d class (only class %s "
                             "is present)" %
                             (self.n_classes_, self.classes_[0]))
        if self.n_classes_ > 2:
            if self.multi_class == "one_vs_rest":
                self.base_estimator_ = \
                    OneVsRestClassifier(self.base_estimator_,
                                        n_jobs=self.n_jobs)
            elif self.multi_class == "one_vs_one":
                self.base_estimator_ = \
                    OneVsOneClassifier(self.base_estimator_,
                                       n_jobs=self.n_jobs)
            else:
                raise ValueError("Unknown multi-class mode %s" %
                                 self.multi_class)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            self.log_marginal_likelihood_value_ = np.mean([
                estimator.log_marginal_likelihood()
                for estimator in self.base_estimator_.estimators_
            ])
        else:
            self.log_marginal_likelihood_value_ = \
                self.base_estimator_.log_marginal_likelihood()

        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        X = check_array(X)
        return self.base_estimator_.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array-like, shape = (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
            raise ValueError("one_vs_one multi-class mode does not support "
                             "predicting probability estimates. Use "
                             "one_vs_rest mode instead.")
        X = check_array(X)
        return self.base_estimator_.predict_proba(X)

    @property
    def kernel_(self):
        if self.n_classes_ == 2:
            return self.base_estimator_.kernel_
        else:
            return CompoundKernel([
                estimator.kernel_
                for estimator in self.base_estimator_.estimators_
            ])

    def log_marginal_likelihood(self, theta=None, eval_gradient=False):
        """Returns log-marginal likelihood of theta for training data.

        In the case of multi-class classification, the mean log-marginal
        likelihood of the one-versus-rest classifiers are returned.

        Parameters
        ----------
        theta : array-like, shape = (n_kernel_params,) or none
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. In the case of multi-class classification, theta may
            be the  hyperparameters of the compound kernel or of an individual
            kernel. In the latter case, all individual kernels get assigned the
            same theta values. If None, the precomputed log_marginal_likelihood
            of ``self.kernel_.theta`` is returned.

        eval_gradient : bool, default: False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. Note that gradient computation is not supported
            for non-binary classification. If True, theta must not be None.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : array, shape = (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the kernel
            hyperparameters at position theta.
            Only returned when eval_gradient is True.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])

        if theta is None:
            if eval_gradient:
                raise ValueError(
                    "Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        theta = np.asarray(theta)
        if self.n_classes_ == 2:
            return self.base_estimator_.log_marginal_likelihood(
                theta, eval_gradient)
        else:
            if eval_gradient:
                raise NotImplementedError(
                    "Gradient of log-marginal-likelihood not implemented for "
                    "multi-class GPC.")
            estimators = self.base_estimator_.estimators_
            n_dims = estimators[0].kernel_.n_dims
            if theta.shape[0] == n_dims:  # use same theta for all sub-kernels
                return np.mean([
                    estimator.log_marginal_likelihood(theta)
                    for i, estimator in enumerate(estimators)
                ])
            elif theta.shape[0] == n_dims * self.classes_.shape[0]:
                # theta for compound kernel
                return np.mean([
                    estimator.log_marginal_likelihood(theta[n_dims * i:n_dims *
                                                            (i + 1)])
                    for i, estimator in enumerate(estimators)
                ])
            else:
                raise ValueError(
                    "Shape of theta must be either %d or %d. "
                    "Obtained theta with shape %d." %
                    (n_dims, n_dims * self.classes_.shape[0], theta.shape[0]))
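# A minimal usage sketch of the multi-class behaviour documented above, using
# scikit-learn's public GaussianProcessClassifier (aliased here to avoid
# clashing with the class defined above); note that predict_proba is not
# supported in "one_vs_one" mode:
from sklearn.datasets import load_iris
from sklearn.gaussian_process import GaussianProcessClassifier as SKGaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

X_iris, y_iris = load_iris(return_X_y=True)
gpc_ovo = SKGaussianProcessClassifier(kernel=1.0 * RBF(1.0),
                                      multi_class="one_vs_one",
                                      random_state=0)
gpc_ovo.fit(X_iris, y_iris)
print(gpc_ovo.predict(X_iris[:5]))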
Example #26
#Regression Module Testing
def Build_Data_Set(features = ["citation2yrs","RDI","rcount","auth_prod_avg","authprod_max","auth_div_avg","auth_div_max","auth_hindex_avg","auth_hindex_max","auth_soc_avg","auth_soc_max","team"]):
    citation_data = pd.DataFrame.from_csv("./workspace/SVR.txt")
    citation_data = citation_data.loc[citation_data['paper_year'] <= 1995]   
    citation_data.iloc[np.random.permutation(len(citation_data))]
    
    X = np.array(citation_data[features].values)
    #X = preprocessing.scale(X)
    y = (citation_data["paper_cat"].values.tolist())
    return X,y

X, y = Build_Data_Set()

clf = OneVsOneClassifier(SVC(random_state=0, kernel='rbf'))
print("Learning started")
clf.fit(X,y)
#Saving the classifier
import pickle

save_classifier = open("./workspace/rbfsvr.pickle","wb")
pickle.dump(clf, save_classifier)
save_classifier.close()


#testing
def Build_Data_Set(features = ["citation2yrs","RDI","rcount","auth_prod_avg","authprod_max","auth_div_avg","auth_div_max","auth_hindex_avg","auth_hindex_max","auth_soc_avg","auth_soc_max","team"]):
    citation_data = pd.DataFrame.from_csv("./workspace/SVR.txt")
    citation_data = citation_data.loc[citation_data['paper_year'] >= 1996]   
    citation_data.iloc[np.random.permutation(len(citation_data))]
    
    X = np.array(citation_data[features].values)
    def fit(self, X, y):
        self._fit_y = y
        self.nbrs.fit(X, y)
        return OneVsOneClassifier.fit(self, X, y)
class EveryWordOneFeature(object):
    def __init__(self, slack=1, gamma=1, kernelType='linear', gram=1):
        self.gram = gram
        self.slack = slack
        self.gamma = gamma
        self.kernelType = kernelType
        self.data = np.ones((1000, 1000))
        self.cityClassifier = {}
        #TODO: Why use OneVsOne and not OneVsRest? Wouldn't OneVsRest be faster?
        self.countryClassifier = OneVsOneClassifier(
            svm.SVC(kernel=self.kernelType, C=self.slack, gamma=self.gamma, probability=False,
                    cache_size=1000))
        self.bag = None
        self.numberOfFeatures = 0
        #Features and labels
        self.fitting_data = None
        self.predict_data = None
        self.cityPrediction = {}
        self.countryPrediction = None
        self.numberOfCityFeatures = {}

    def fit_cities(self, trainingData, labels, countryCode):
        print "Start fitting cities for country "  + str(countryCode)
        #TODO: Why use OneVsOne and not OneVsRest? Wouldn't OneVsRest be faster?
        self.cityClassifier[countryCode] = OneVsOneClassifier(
            svm.SVC(kernel=self.kernelType, C=self.slack, gamma=self.gamma, probability=False))
        start = time.time()
        self.cityClassifier[countryCode].fit(trainingData[:, :self.get_number_of_city_features(countryCode)],
                                labels)
        end = time.time()
        print "Finished fitting cities in " + str((end - start)) + "s"

    def fit_countries(self):
        print "Start Fitting countries"
        start = time.time()
        self.countryClassifier.fit(self.fitting_data[:, :self.numberOfFeatures],
                                   self.fitting_data[:, (self.numberOfFeatures + 1)])
        end = time.time()
        print "Finished fitting countries in " + str((end - start)) + "s"


    def preprocess_training_data(self, data):
        startOfPreprocessing = time.time()
        print "Start Preprocessing"
        lengthOfTrainingData = self.data.shape[0]
        print "length of trainingData = " + str(lengthOfTrainingData)
        self.bag = BagOfWords(data)
        self.fitting_data = self.bag.get_features_and_labels()
        self.numberOfFeatures = self.fitting_data.shape[1] - 2
        startOfFittingCities = time.time()
        print "Finished Preprocessing in " + str((startOfFittingCities - startOfPreprocessing)) + "s"

    def fit(self, data):
        self.data = data
        self.preprocess_training_data(data)
        self.numberOfFeatures = self.fitting_data.shape[1] - 2

        self.fit_countries()



    def predict_cities(self, data, countryCode):
        print "Start predict cities"
        start = time.time()
        print data[:, :self.numberOfFeatures]
        print self.cityPrediction
        self.cityPrediction[countryCode] = self.cityClassifier[countryCode].predict(data[:, :self.get_number_of_city_features(countryCode)])
        end = time.time()
        print "Finished predicting cities in " + str((end - start)) + "s"


    def predict_countries(self):
        start = time.time()
        print "start predicting countries"
        print self.predict_data
        self.countryPrediction = self.countryClassifier.predict(self.predict_data[:, :self.numberOfFeatures])
        end = time.time()
        print "finished predicting countries in " + str((end - start)) + "s"


    def preprocess_predict_data(self, predict):
        self.predict_data = self.bag.get_get_validation_features(predict)

    def get_city_features(self, data, countryCode):
        return np.zeros((data.shape[0], 3))

    def predict(self, predict):
        self.preprocess_predict_data(predict)
        self.numberOfFeatures = self.predict_data.shape[1]
        # t1 = threading.Thread(target=self.predict_cities)
        self.predict_countries()
        joinedCityPredictions = np.zeros(predict.shape[0])

        countryCodes = np.unique(self.data[:, 2].astype(int))
        for countryCode in countryCodes:
            countryIndices = np.where(self.data[:, 2].astype(int) == countryCode)[0]

            self.fit_cities(self.get_city_features(self.data[countryIndices][:, 0],countryCode), self.data[countryIndices][:,1], countryCode)
        countryCodes = np.unique(self.countryPrediction)
        for countryCode in countryCodes:
            countryIndices = np.where(self.countryPrediction == countryCode)[0]
            self.predict_cities(self.get_city_features(predict[countryIndices], countryCode), countryCode)
            joinedCityPredictions[countryIndices] = self.cityPrediction[countryCode]

        prediction = np.vstack((joinedCityPredictions, self.countryPrediction)).T
        return prediction

    def get_number_of_city_features(self, cityCode):
        return 3
def main():
    #######################################
    # Saima Aman emotion blog data
    # replacing with our data
    global AddFeatures
    AddFeatures=["TEXTLEN","TITLELEN","STARS","VERIFIEDPURCHASE","BADGE","COMMENTS","FORMAT"]
    ourdatatuples = getmyxls()
    print "Length of ourdatatuples is: ",  len(ourdatatuples)
    #shuffle(saimaDataTuples)
    print "saimaDataTuples", ourdatatuples[0]
    trainTuples=ourdatatuples#[:1000]
    #testTuples=saimaDataTuples[1000:]

#     #######################################
    myData=getThreeColumnDataDict(ourdatatuples)
    #print(myData)
    print "lol: mydata "
    #print(myData)
    totalCount=sum([len(myData[k]) for k in myData])
    print totalCount
#     del trainLines
#     print"*"*50
    getDataStats(myData)
#     dataTuples=getLabeledDataTuples(myData)
#     ####################################
#     # Add first 1000 Saima tuples
#     #dataTuples=dataTuples+saimaDataTuples[:1000]
#     print dataTuples[0]
#     del myData
    ids, labels, vectors= getLabelsAndVectors(trainTuples)
    #print labels
    space=getSpace(vectors)
    print "Total # of features in your space is: ", len(space)
    # augment space with emotion features...

    space= augmentSpace(space, AddFeatures)
    #reducedSpace=getReducedSpace(vectors, space)
    print "Total # of features in your augmented space is: ", len(space)
    print "Predicted error"
    #print "Total # of features in your reducedSpace is: ", len(reducedSpace)
    oneHotVectors=getOneHotVectors(ids, labels, vectors, space)
    print(oneHotVectors[0])
    vectors, labels=getOneHotVectorsAndLabels(oneHotVectors)
    del oneHotVectors
    trainVectors = vectors
    trainLabels = labels
    #trainLabels.fit_transform([('H','NH')])
    #trainLabels = preprocessing.label_binarize(trainLabels,classes=[unicode("H"),unicode("NH")])

    #del vectors
    #del labels
    #C, gamma = getCAndGamma(trainVectors, trainLabels, kernel = 'rbf')
    # Train classifier

    clf = OneVsOneClassifier(SVC(kernel='linear', class_weight='auto', verbose= True, probability=True))
    #clf = OneVsRestClassifier(SVC(C=1, kernel = 'linear', gamma=1, verbose= False, probability=False))

    clf.fit(trainVectors, trainLabels)
    print "\nDone fitting classifier on training data...\n"
    #testVectors = vectors[200:250]
    #testLabels = labels[200:250]
    #predicted_testLabels = clf.predict(testVectors)
    #print "Done predicting on DEV data...\n"
    #print "classification_report:\n", classification_report(testLabels, predicted_testLabels)#, target_names=target_names)
    #print "accuracy_score:", round(accuracy_score(testLabels, predicted_testLabels), 2)

    #del trainVectors
    #del trainLabels
#     saimaDataTuples=getSAIMAThreeColumnFormat()
#     print "Length of saimaDataTuples is: ",  len(saimaDataTuples)
#     shuffle(saimaDataTuples)
#     print "saimaDataTuples", saimaDataTuples[0]
#     ids, labels, vectors= getLabelsAndVectors(testTuples)
#     oneHotVectors=getOneHotVectors(ids, labels, vectors, space)
#     vectors, labels=getOneHotVectorsAndLabels(oneHotVectors)
#     del oneHotVectors
#     testVectors = vectors
#     testLabels = labels
#     predicted_testLabels = clf.predict(testVectors)




    #------------------------------------------------------------------------------------------
    print "="*50, "\n"
    print "Results with 5-fold cross validation:\n"
    print "="*50, "\n"
    #------------------------------------------------------------------------------------------
    predicted = cross_validation.cross_val_predict(clf, trainVectors, trainLabels, cv=5)
    print "*"*20
    print "\t accuracy_score\t", metrics.accuracy_score(trainLabels, predicted)
    print "*"*20

    print "precision_score\t", metrics.precision_score(trainLabels, predicted,pos_label=unicode("H"),average='binary')
    print "recall_score\t", metrics.recall_score(trainLabels, predicted,pos_label=unicode("H"),average='binary')

    print "\nclassification_report:\n\n", metrics.classification_report(trainLabels, predicted)
    print "\nconfusion_matrix:\n\n", metrics.confusion_matrix(trainLabels, predicted)

    '''#"------------------------------------------------------------------------------------------
Example #30
label_encoder = []
x_encoded = np.empty(x.shape)
for i, item in enumerate(x[0]):
    if item.isdigit():
        x_encoded[:, i] = x[:, i]
    else:
        label_encoder.append(preprocessing.LabelEncoder())
        x_encoded[:, i] = label_encoder[-1].fit_transform(x[:, i])
x = x_encoded[:, :-1].astype(int)
y = x_encoded[:, -1].astype(int)

# Create SVM classifier
classifier = OneVsOneClassifier(LinearSVC(random_state=0))

# Train the classifier
classifier.fit(x, y)

# Cross validation
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=5)
classifier = OneVsOneClassifier(LinearSVC(random_state=0))
classifier.fit(x_train, y_train)
y_test_pred = classifier.predict(x_test)

# Compute the F1 score
f1 = cross_val_score(classifier, x, y, scoring='f1_weighted', cv=3)
print("F1 Score: " + str(round(100 * f1.mean(), 2)) + '%')

# Predict output for a test datapoint
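# The original example is truncated here; a sketch of how a single raw
# datapoint might be encoded and classified with the label_encoder list built
# above. raw_row is hypothetical: one string-valued feature row with the same
# column layout as the original (pre-encoding) data, label column excluded.
def encode_datapoint(raw_row):
    encoded = []
    count = 0
    for item in raw_row:
        if item.isdigit():
            encoded.append(int(item))
        else:
            encoded.append(int(label_encoder[count].transform([item])[0]))
            count += 1
    return encoded

# e.g. classifier.predict([encode_datapoint(raw_row)]); if the label column was
# string-valued, label_encoder[-1].inverse_transform() maps the prediction back.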
Example #31
def plot_classification():
    # modified http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html#sphx-glr-auto-examples-svm-plot-iris-py
    iris = datasets.load_iris()
    X = iris.data[:, :2]  # only want  to classify what type of iris it is
    y = iris.target
    C = 1.0  # SVM regularization parameter
    h = .02  # step size in the mesh

    svm_c = OneVsOneClassifier(SVC(kernel='linear', C=C))
    svm_l = OneVsRestClassifier(LinearSVC(C=C))

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    titles = ['One-vs-One', 'One-vs-All']

    plt.suptitle('Linear SVM')

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    plt.subplot(1, 2, 0 + 1)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)

    Z_c = svm_c.fit(X, y).predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z_c = Z_c.reshape(xx.shape)
    plt.contourf(xx, yy, Z_c, cmap=plt.cm.coolwarm, alpha=0.8)

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm)
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())
    plt.title(titles[0])

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    plt.subplot(1, 2, 1 + 1)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)

    Z_l = svm_l.fit(X, y).predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z_l = Z_l.reshape(xx.shape)
    plt.contourf(xx, yy, Z_l, cmap=plt.cm.coolwarm, alpha=0.8)

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm)
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())
    plt.title(titles[1])

    plt.savefig('images/svm_linear_1v1_1va.png')
    plt.close()
# # Prepare the cross-validation procedure
# from sklearn.model_selection import KFold
# from sklearn.model_selection import cross_val_score
# cv = KFold(n_splits=10, random_state=1, shuffle=True)
# scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
# print('KFold Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

# Prepare the repeated cross-validation procedure
# from sklearn.model_selection import RepeatedKFold
# from sklearn.model_selection import cross_val_score
# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
# print('Repeated KFold Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

# Fit the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_comp = pd.DataFrame(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

# Confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
fig, ax = plt.subplots(figsize=(7,4))
cm = confusion_matrix(y_test, y_pred)
xlabel = ['login', 'setting']
sns.heatmap(cm/np.sum(cm), annot=True, fmt='.0%', cmap='Blues', ax=ax, vmin=0, linewidths=1, xticklabels=xlabel, yticklabels=xlabel)
plt.savefig('test.pdf')
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average=None)
rec = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)
Example #33
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
plt.scatter(X[y == 2, 0], X[y == 2, 1])
plt.show()


# use the full dataset
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

# OvR
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
print(log_reg.score(X_test, y_test))

# multinomial (softmax) logistic regression, for comparison (not a true OvO scheme)
log_reg2 = LogisticRegression(multi_class="multinomial", solver="newton-cg")
log_reg2.fit(X_train, y_train)
print(log_reg2.score(X_test, y_test))


# sklearn's explicit OvR and OvO wrappers
ovr = OneVsRestClassifier(log_reg)
ovr.fit(X_train, y_train)
print(ovr.score(X_test, y_test))

ovo = OneVsOneClassifier(log_reg)
ovo.fit(X_train, y_train)
print(ovo.score(X_test, y_test))
np.argmax(some_digit_scores)


# In[25]:


SGD_C.classes_


# In[26]:


# one-versus-one and one-versus-all
from sklearn.multiclass import OneVsOneClassifier
OvO_C = OneVsOneClassifier(SGDClassifier(random_state=42))
OvO_C.fit(X_train, Y_train)
OvO_C.predict([some_digit])


# In[27]:


len(OvO_C.estimators_)


# In[28]:


forest_clf.fit(X_train, Y_train)
forest_clf.predict_proba([some_digit])
Example #35
    train_label = np.append(train_label,
                            np.load('macro-mapping/' + str(train_index[i]) +
                                    '.npy'),
                            axis=0)

for i in range(1, len(test_index)):
    temp = np.load('alpha_carbon/' + str(test_index[i]) + '.npy')
    test_data = np.append(test_data,
                          np.load('alpha_carbon/' + str(test_index[i]) +
                                  '.npy'),
                          axis=0)
    del temp
    test_label = np.append(test_label,
                           np.load('macro-mapping/' + str(test_index[i]) +
                                   '.npy'),
                           axis=0)

clf = OneVsOneClassifier(
    RandomForestClassifier(n_estimators=50, max_depth=depth, random_state=0))
clf.fit(train_data, train_label)
print('Fold: %d Depth %d Train Accu: %.3f Test Accu: %.3f' %
      (fold, depth,
       np.sum(clf.predict(train_data) == train_label) / len(train_label),
       np.sum(clf.predict(test_data) == test_label) / len(test_label)))

del train_data, test_data, train_label, test_label

## save model
from sklearn.externals import joblib
joblib.dump(clf, 'ovo-randomforest/' + str(fold) + "_" + str(depth) + ".pkl")
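# A sketch of loading a saved fold back for later evaluation, assuming the
# same path scheme used in joblib.dump above:
clf_loaded = joblib.load('ovo-randomforest/' + str(fold) + "_" + str(depth) + ".pkl")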
Example #36
def main():
    #######################################
    # Saima Aman emotion blog data
    # replacing with our data
    global AddFeatures
    AddFeatures = [
        "TEXTLEN", "TITLELEN", "STARS", "VERIFIEDPURCHASE", "BADGE",
        "COMMENTS", "FORMAT"
    ]
    ourdatatuples = getmyxls()
    print "Length of ourdatatuples is: ", len(ourdatatuples)
    #shuffle(saimaDataTuples)
    print "saimaDataTuples", ourdatatuples[0]
    trainTuples = ourdatatuples  #[:1000]
    #testTuples=saimaDataTuples[1000:]

    #     #######################################
    myData = getThreeColumnDataDict(ourdatatuples)
    #print(myData)
    print "lol: mydata "
    #print(myData)
    totalCount = sum([len(myData[k]) for k in myData])
    print totalCount
    #     del trainLines
    #     print"*"*50
    getDataStats(myData)
    #     dataTuples=getLabeledDataTuples(myData)
    #     ####################################
    #     # Add first 1000 Saima tuples
    #     #dataTuples=dataTuples+saimaDataTuples[:1000]
    #     print dataTuples[0]
    #     del myData
    ids, labels, vectors = getLabelsAndVectors(trainTuples)
    #print labels
    space = getSpace(vectors)
    print "Total # of features in your space is: ", len(space)
    # augment space with emotion features...

    space = augmentSpace(space, AddFeatures)
    #reducedSpace=getReducedSpace(vectors, space)
    print "Total # of features in your augmented space is: ", len(space)
    print "Predicted error"
    #print "Total # of features in your reducedSpace is: ", len(reducedSpace)
    oneHotVectors = getOneHotVectors(ids, labels, vectors, space)
    print(oneHotVectors[0])
    vectors, labels = getOneHotVectorsAndLabels(oneHotVectors)
    del oneHotVectors
    trainVectors = vectors
    trainLabels = labels
    #trainLabels.fit_transform([('H','NH')])
    #trainLabels = preprocessing.label_binarize(trainLabels,classes=[unicode("H"),unicode("NH")])

    #del vectors
    #del labels
    #C, gamma = getCAndGamma(trainVectors, trainLabels, kernel = 'rbf')
    # Train classifier

    clf = OneVsOneClassifier(
        SVC(kernel='linear',
            class_weight='auto',
            verbose=True,
            probability=True))
    #clf = OneVsRestClassifier(SVC(C=1, kernel = 'linear', gamma=1, verbose= False, probability=False))

    clf.fit(trainVectors, trainLabels)
    print "\nDone fitting classifier on training data...\n"
    #testVectors = vectors[200:250]
    #testLabels = labels[200:250]
    #predicted_testLabels = clf.predict(testVectors)
    #print "Done predicting on DEV data...\n"
    #print "classification_report:\n", classification_report(testLabels, predicted_testLabels)#, target_names=target_names)
    #print "accuracy_score:", round(accuracy_score(testLabels, predicted_testLabels), 2)

    #del trainVectors
    #del trainLabels
    #     saimaDataTuples=getSAIMAThreeColumnFormat()
    #     print "Length of saimaDataTuples is: ",  len(saimaDataTuples)
    #     shuffle(saimaDataTuples)
    #     print "saimaDataTuples", saimaDataTuples[0]
    #     ids, labels, vectors= getLabelsAndVectors(testTuples)
    #     oneHotVectors=getOneHotVectors(ids, labels, vectors, space)
    #     vectors, labels=getOneHotVectorsAndLabels(oneHotVectors)
    #     del oneHotVectors
    #     testVectors = vectors
    #     testLabels = labels
    #     predicted_testLabels = clf.predict(testVectors)

    #------------------------------------------------------------------------------------------
    print "=" * 50, "\n"
    print "Results with 5-fold cross validation:\n"
    print "=" * 50, "\n"
    #------------------------------------------------------------------------------------------
    predicted = cross_validation.cross_val_predict(clf,
                                                   trainVectors,
                                                   trainLabels,
                                                   cv=5)
    print "*" * 20
    print "\t accuracy_score\t", metrics.accuracy_score(trainLabels, predicted)
    print "*" * 20

    print "precision_score\t", metrics.precision_score(trainLabels,
                                                       predicted,
                                                       pos_label=unicode("H"),
                                                       average='binary')
    print "recall_score\t", metrics.recall_score(trainLabels,
                                                 predicted,
                                                 pos_label=unicode("H"),
                                                 average='binary')

    print "\nclassification_report:\n\n", metrics.classification_report(
        trainLabels, predicted)
    print "\nconfusion_matrix:\n\n", metrics.confusion_matrix(
        trainLabels, predicted)
    '''#"------------------------------------------------------------------------------------------
Example #37
def main():

    # import the data
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784')
    x, y = mnist["data"], mnist["target"]
    print(x.shape)
    print(y.shape)

    # show the image
    some_digit = x[36000]
    some_digit_image = some_digit.reshape(28, 28)
    plt.imshow(some_digit_image, cmap=matplotlib.cm.binary, interpolation="nearest")
    plt.axis("off")
    plt.show()

    # prepare the testing/training tests
    x_train, x_test, y_train, y_test = x[:60000], x[60000:], y[:60000], y[60000:]
    np.random.seed(3)
    shuffle_index = np.random.permutation(60000)
    x_train, y_train = x_train[shuffle_index], y_train[shuffle_index]

    # Binary Classifier
    y_train_5 = (y_train == '5')  # True for all 5s
    y_test_5 = (y_test == '5')  # make sure it's int not chars
    from sklearn.linear_model import SGDClassifier
    sgd_clf = SGDClassifier(random_state=42)
    sgd_clf.fit(x_train, y_train_5)  # enable the model
    print(sgd_clf.predict([some_digit]))

    # implement Cross-Validation
    from sklearn.model_selection import StratifiedKFold
    from sklearn.base import clone
    skfolds = StratifiedKFold(n_splits=3, random_state=42)
    for train_index, test_index in skfolds.split(x_train, y_train_5):
        clone_clf = clone(sgd_clf)  # train clone on training folds, then predict on test fold
        x_train_folds = x_train[train_index]
        y_train_folds = y_train_5[train_index]
        x_test_fold = x_train[test_index]
        y_test_fold = y_train_5[test_index]
        clone_clf.fit(x_train_folds, y_train_folds)
        y_pred = clone_clf.predict(x_test_fold)
        n_correct = sum(y_pred == y_test_fold)
        print(n_correct / len(y_pred))

    # evaluate the model with 'accuracy'
    from sklearn.model_selection import cross_val_score
    cv_scores = cross_val_score(sgd_clf, x_train, y_train_5, cv=3, scoring="accuracy")
    print(cv_scores)

    # see accuracy from a classifier that always predicts "not 5"
    from sklearn.base import BaseEstimator
    class Never5Classifier(BaseEstimator):
        def fit(self, x, y=None):
            pass
        def predict(self, x):
            return np.zeros((len(x), 1), dtype=bool)
    never_5_clf = Never5Classifier()
    never_5_clf_score = cross_val_score(never_5_clf, x_train, y_train_5, cv=3, scoring="accuracy")
    print(never_5_clf_score)

    # evaluate the model with 'confusion matrix'
    from sklearn.model_selection import cross_val_predict
    from sklearn.metrics import confusion_matrix
    y_train_pred = cross_val_predict(sgd_clf, x_train, y_train_5, cv=3)
    conf_mat_5 = confusion_matrix(y_train_5, y_train_pred)
    print(conf_mat_5)

    # precision and recall
    from sklearn.metrics import precision_score, recall_score, f1_score
    precision = precision_score(y_train_5, y_train_pred)
    recall = recall_score(y_train_5, y_train_pred)
    f1 = f1_score(y_train_5, y_train_pred)
    print(precision)
    print(recall)
    print(f1)  # the F1 score is the harmonic mean of precision and recall

    # precision vs recall trade-off
    from sklearn.metrics import precision_recall_curve

    def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
        plt.plot(thresholds, precisions[:-1], "b--", label="precision")  # function to plot precision vs threshold
        plt.plot(thresholds, recalls[:-1], "g-", label="recall")
        plt.xlabel("Threshold", fontsize=16)
        plt.legend(loc="upper left", fontsize=16)
        plt.ylim([0, 1])

    def plot_precision_vs_recall(precisions, recalls):
        plt.plot(recalls, precisions, "b-", linewidth=2)
        plt.xlabel("recall", fontsize=16)
        plt.ylabel("precision", fontsize=16)
        plt.axis([0, 1, 0, 1])
    y_scores = cross_val_predict(sgd_clf, x_train, y_train_5, cv=3, method="decision_function")  # return decision values
    if y_scores.ndim == 2:
        y_scores = y_scores[:, 1]  # to get around with the issue of "extra first dimension"
    precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
    plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    plot_precision_vs_recall(precisions, recalls)
    plt.show()

    # manually set the threshold
    y_train_pred_90 = (y_scores > 70000)  # predictions at a higher decision threshold
    precision_90 = precision_score(y_train_5, y_train_pred_90)
    recall_90 = recall_score(y_train_5, y_train_pred_90)
    print("precision_score=", precision_90)
    print("recall_score=", recall_90)

    # ROC curve
    from sklearn.metrics import roc_curve
    fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

    def plot_roc_curve(fpr, tpr, label=None):
        plt.plot(fpr, tpr, linewidth=2, label=label)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.axis([0, 1, 0, 1])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
    plot_roc_curve(fpr, tpr)
    plt.show()
    from sklearn.metrics import roc_auc_score
    from sklearn.ensemble import RandomForestClassifier
    forest_clf = RandomForestClassifier(random_state=42)
    y_probas_forest = cross_val_predict(forest_clf, x_train, y_train_5, cv=3, method="predict_proba")  # RandomForest has no decision_function
    y_scores_forest = y_probas_forest[:, 1]  # use the probability of the positive class as the score
    fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)
    plt.plot(fpr, tpr, "b:", label="SGD")
    plot_roc_curve(fpr_forest, tpr_forest, "random Forest")
    plt.legend(loc="lower right")
    plt.show()
    forest_auc = roc_auc_score(y_train_5, y_scores_forest)
    print(forest_auc)

    # Multiclass classification
    sgd_clf.fit(x_train, y_train)  # train the model on the full multiclass training set
    sgd_clf.predict([some_digit])
    some_digit_score = sgd_clf.decision_function([some_digit])  # obtain score for each class
    print(some_digit_score)

    # OvO classifier
    from sklearn.multiclass import OneVsOneClassifier
    ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
    ovo_clf.fit(x_train, y_train)
    print(ovo_clf.predict([some_digit]))
    forest_clf.fit(x_train, y_train)
    print(forest_clf.predict_proba([some_digit]))
    sgd_clf_score = cross_val_score(sgd_clf, x_train, y_train, cv=3, scoring="accuracy")
    print(sgd_clf_score)  # accuracy on the full multiclass task (y_train)
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train.astype(np.float64))
    sgd_clf_scaled_score = cross_val_score(sgd_clf, x_train_scaled, y_train, cv=3, scoring="accuracy")
    print(sgd_clf_scaled_score)  # scaling the inputs can improve the model's accuracy

    # error analysis
    y_train_pred = cross_val_predict(sgd_clf, x_train_scaled, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)  # row for actual, column for predicted
    print(conf_mx)
    plt.matshow(conf_mx, cmap=plt.cm.gray)  # show the matrix as an image
    plt.show()
    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums  # transform error number into error rate
    np.fill_diagonal(norm_conf_mx, 0)  # keep only the errors
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
    plt.show()

    # multilabel classification
    from sklearn.neighbors import KNeighborsClassifier
    y_train_large = (y_train.astype(np.uint8) >= 7)  # labels are strings, so cast to ints first
    y_train_odd = (y_train.astype(np.uint8) % 2 == 1)  # odd digits
    y_multilabel = np.c_[y_train_large, y_train_odd]
    knn_clf = KNeighborsClassifier()  # KNeighborsClassifier supports multilabel targets
    knn_clf.fit(x_train, y_multilabel)
    print(knn_clf.predict([some_digit]))

    # multioutput classification
    import numpy.random as rnd
    noise1 = rnd.randint(0, 100, (len(x_train), 784))  # add random noise to the training images
    noise2 = rnd.randint(0, 100, (len(x_test), 784))  # and to the test images; then try to clean it
    x_train_mod = x_train + noise1
    x_test_mod = x_test + noise2
    y_train_mod = x_train
    y_test_mod = x_test
    knn_clf.fit(x_train_mod, y_train_mod)
    clean_digit = knn_clf.predict([x_test_mod[1]])
    plot_digit(clean_digit)
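# plot_digit() is called above but not defined in this excerpt; a minimal sketch,
# assuming numpy/matplotlib are imported as np/plt/matplotlib (as the rest of the
# snippet does) and that it receives a flattened 28x28 MNIST pixel vector:
def plot_digit(data):
    image = np.asarray(data).reshape(28, 28)  # flat 784-pixel vector -> 28x28 image
    plt.imshow(image, cmap=matplotlib.cm.binary, interpolation="nearest")
    plt.axis("off")
    plt.show()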
Example #38
0
class perceptronMOM(BaseEstimator):

    '''Perceptron MOM classifier.
    Perceptron MOM risk minimization. The Perceptron minimizes the perceptron loss using SGD, without regularization.
    
    Parameters
    ----------

    w0 : array-like, length = n_features + 1, default ones(n_features + 1)
        initial coefficients (including the intercept) of the classifier.

    K : int, default 10
        number of blocks used for the MOM computation. A large value of K can handle more outliers, but small values of K give better performance when there are no outliers.
        
    eta0 : float, default 1
        step size parameter; the step size at the i-th iteration is 1/(1+eta0*i).

    epoch : int, default 100
        number of iterations before the end of the algorithm.

    mu : float between 0 and 1, default 0.95
        coefficient in the momentum.

    agg : int, default 1
        number of runs of the algorithm over which the results are aggregated. One might want to decrease this number if the computational cost is a problem.

    compter : boolean, default False
        used for outlier detection; if compter=True, the number of times each point is used in the algorithm is recorded in the attribute "counts".

    progress : boolean, default False
        display a progress bar to monitor the algorithm on each run (agg > 1 means several progress bars).

    verbose : boolean, default True
        display a message at the end of each run if agg > 1.

    multi : {'ovr','ovo'} , default 'ovr'
        method used to go from binary classification to multiclass classification. 'ovr' means "one vs the rest" and 'ovo' means "one vs one".
        
    Attributes
    ----------
    
    w0 : array like, length = n_features + 1
        w0 is updated during the algorithm and holds the final coefficients of the decision function.

    counts : array like, length = n_samples
        the i-th element records the number of times the i-th sample of the training dataset X has been used. Only available if compter=True.

    Methods
    -------

    fit(X,y) : fit the model
        X : numpy matrix size = (n_samples,n_features)
        y : array like, length = n_samples


    predict(X) : predict the class of the points in X
        X : numpy matrix size = (n_samples,n_features)
        returns array-like, length = n_samples.

    predict_proba(X) : predict the probability that each point belongs to each class.
        X : numpy matrix size = (n_samples,n_features)
        returns matrix, size = (n_samples,n_class)
        
    '''

    def __init__( self,w0=None,K=10,eta0=1,epoch=100,mu=0.95,agg=1,compter=False,progress=False, verbose = True, multi='ovr'):
        binary_clf=perceptronMOM_binary(w0,K,eta0,epoch,mu,agg,compter,progress,verbose)
        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        values.pop("self")
        for arg, val in values.items():
            setattr(self, arg, val)
        if multi=="ovr":
            self.clf=OneVsRestClassifier(binary_clf)
        elif multi=="ovo":
            self.clf=OneVsOneClassifier(binary_clf)
        else:
            raise NameError('Multiclass meta-algorithm not known')
    def fit(self,X,y):
        self.clf.fit(X,y)
        return self
    def predict(self,X):
        return self.clf.predict(X)
    def predict_proba(self,X):
        return self.clf.predict_proba(X)
    def score(self,X,y):
        return np.mean(self.predict(X)==y)
    def set_params(self,**params):
        self.__init__(**params)
        return self
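# perceptronMOM_binary is not shown in this excerpt; the ovr/ovo dispatch performed
# in __init__ above can be sketched with any scikit-learn binary estimator instead:
from sklearn.datasets import load_iris
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

def make_multiclass(binary_clf, multi='ovr'):
    # Wrap a binary classifier in the requested multiclass meta-estimator.
    if multi == 'ovr':
        return OneVsRestClassifier(binary_clf)
    elif multi == 'ovo':
        return OneVsOneClassifier(binary_clf)
    raise NameError('Multiclass meta-algorithm not known')

X_iris, y_iris = load_iris(return_X_y=True)
ovo_perceptron = make_multiclass(SGDClassifier(loss='perceptron', random_state=0), multi='ovo')
print(ovo_perceptron.fit(X_iris, y_iris).score(X_iris, y_iris))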
precisions_forest, recalls_forest, threshold_forest = precision_recall_curve(y_train_5,y_scores_forest) # problem with output
#plot_pr_curve(recalls_forest,precision_forest)




# SGD OvA
sgd_clf.fit(X_train,y_train)  # use the OvA strategy
sgd_clf.predict([some_digit])
some_digit_scores = sgd_clf.decision_function([some_digit])  # gives 10 scores, one for each class
np.argmax(some_digit_scores)  # the highest score corresponds to class 5

# SGD OvO
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train,y_train)
ovo_clf.predict([some_digit])
len(ovo_clf.estimators_)

# Random Forest
forest_clf.fit(X_train,y_train)
forest_clf.predict([some_digit])
forest_clf.predict_proba([some_digit])

# evaluate with cross_val
cross_val_score(sgd_clf,X_train,y_train, cv=3, scoring='accuracy') # 84% accuracy. A random one would have 10% (1/10 chances)
# Improve accuracy with scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))  # compare with chapter 2
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring='accuracy')
 
def hybrid_classification_for_fold(x_train, x_test, y_train, y_test, estimator, scoring):
    '''
    Runs the hybrid classification algorithm for each fold.
    '''

    scaler = MinMaxScaler(feature_range=(0, 1))
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    num_features = x_train.shape[1]
    fs = SelectKBest(scoring, k=2*num_features/3)
    #fs = SelectPercentile(scoring, percentile=50)
    x_train = fs.fit_transform(x_train, y_train)
    x_test = fs.transform(x_test)

    #############################
    # PHASE 1
    #############################
    ovr = OneVsRestClassifier(estimator, n_jobs=-1)

    ovr.fit(x_train, y_train)

    ovr_estimators = ovr.estimators_

    y_predict_ovr = get_ovr_estimators_prediction(ovr_estimators, x_test)
    #print y_predict_ovr # dimensions: no. of estimators X no. of samples. each row is the output of a particular estimator for
                         # all the samples we sent in

    sample_predictions_per_ovr_estimator = np.transpose(y_predict_ovr) # dimensions: no. samples X no. ovr_estimators.
                                                                       # each row has the prediction of all ovr_estimators for a given sample.
                                                                       # remember that this is an OVR classification so each estimator fits one class only.
                                                                       # for that sample. e.g.
                                                                       # [[0 0 0 0 0 0 0 0] <- none of the ovr_estimators thought this sample belonged to their class
                                                                       #  [0 0 0 1 0 0 0 0] <- ovr_estimator 3 thinks this sample belongs to its class
                                                                       #  [0 0 0 1 0 0 0 1]] <- ovr_estimator 3 and 7 both think this sample belongs to their class
    #print sample_predictions_per_ovr_estimator

    test_indices_unclassified_in_phase1 = []
    y_test_predict = np.ones(len(y_test))*-1 # -1 is an invalid value. Denotes an unclassified sample.

    for index, sample_prediction in enumerate(sample_predictions_per_ovr_estimator):
        if(np.sum(sample_prediction)==1): # only one estimator's decision_function is +ve
            y_test_predict[index] = ovr.classes_[np.nonzero(sample_prediction)[0][0]]
        else:
            test_indices_unclassified_in_phase1.append(index)

    #print 'Phase {phase} Correctly classified: {0:2.3f}'.format(float(np.sum(y_test_predict==y_test))/len(y_test), phase=1)
    #print 'Phase {phase} Unclassified: {0:2.3f}'.format(float(np.sum(y_test_predict==-1))/len(y_test), phase=1)
    correct_after_phase1 = float(np.sum(y_test_predict==y_test))/len(y_test)
    incorrect_after_phase1 = float(len(filter(lambda x: x != -1, y_test_predict[y_test_predict != y_test])))/len(y_test)
    unclassified_after_phase1 = float(np.sum(y_test_predict==-1))/len(y_test)

    #############################
    # PHASE 2
    #############################
    ovo = OneVsOneClassifier(estimator, n_jobs=-1)

    ovo.fit(x_train, y_train)
    ovo_estimators = ovo.estimators_

    for index in test_indices_unclassified_in_phase1:
        y_predict_ovo = get_ovo_estimators_prediction(ovo_estimators, ovo.classes_, np.reshape(x_test[index], (1, len(x_test[index]))))
        if y_predict_ovo != -1:
            y_test_predict[index] = y_predict_ovo

    #print 'Phase {phase} Correctly classified: {0:2.3f}'.format(float(np.sum(y_test_predict==y_test))/len(y_test), phase=2)
    #print 'Phase {phase} Unclassified: {0:2.3f}'.format(float(np.sum(y_test_predict==-1))/len(y_test), phase=2)
    correct_after_phase2 = float(np.sum(y_test_predict==y_test))/len(y_test)
    incorrect_after_phase2 = float(len(filter(lambda x: x != -1, y_test_predict[y_test_predict != y_test])))/len(y_test)
    unclassified_after_phase2 = float(np.sum(y_test_predict==-1))/len(y_test)

    accuracy_score = metrics.accuracy_score(y_test_predict, y_test)

    return np.array([accuracy_score, correct_after_phase1, correct_after_phase2, incorrect_after_phase1,\
                     incorrect_after_phase2, unclassified_after_phase1, unclassified_after_phase2])
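# get_ovr_estimators_prediction() and get_ovo_estimators_prediction() are not shown
# in this excerpt; a minimal sketch of the OvR helper, assuming each fitted OvR
# estimator returns 0/1 for "its" class via predict():
import numpy as np

def get_ovr_estimators_prediction(ovr_estimators, x_test):
    # One row per estimator, one column per sample, matching the matrix that is
    # transposed and inspected in PHASE 1 above.
    return np.array([estimator.predict(x_test) for estimator in ovr_estimators])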
Example #41
0
from sklearn import datasets
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC

iris = datasets.load_iris()
data = [[1, 1, 1, 1], [2, 2, 2, 1], [3, 3, 3, 1], [4, 4, 4, 1], [1, 1, 1, 2],
        [1, 1, 1, 3], [1, 1, 1, 4]]
classes = [1, 1, 1, 1, 2, 3, 4]
X, y = iris.data, iris.target
classifier = OneVsOneClassifier(LinearSVC(random_state=0))
print X
print y
classifier.fit(X, y)
while True:
    to_predict = raw_input('Enter 4 numbers to predict: ')
    lst = to_predict.split()
    print lst
    new_lst = []
    for num in lst:
        new_lst.append(float(num))
    print classifier.predict([new_lst])
Example #42
0
def main():
    # load pickle
    arxiv_11 = pickle.load(open("2011_big_pop.p", "rb"))
    arxiv_12 = pickle.load(open("2012_big_pop.p", "rb"))
    topiclists = pickle.load(open("minor_subjects.p", "rb"))

    print "loaded pickles"

    # build doc set
    doc_set = arxiv_11['astro'] + arxiv_11['cond'] + \
            arxiv_11['cs'] + arxiv_11['hep'] + \
            arxiv_11['math'] + arxiv_11['physics'] + \
            arxiv_11['quant'] + arxiv_11['stat']
    label_set = [1]*len(arxiv_11['astro']) + [2]*len(arxiv_11['cond']) + \
              [3]*len(arxiv_11['cs']) + [4]*len(arxiv_11['hep']) + \
              [5]*len(arxiv_11['math']) + [6]*len(arxiv_11['physics']) + \
              [7]*len(arxiv_11['quant']) + [8]*len(arxiv_11['stat'])

    doc_texts = tokenize(doc_set)

    # build indiv training sets
    topic_superset = []
    topic_superset.append(arxiv_11['astro'])
    topic_superset.append(arxiv_11['cond'])
    topic_superset.append(arxiv_11['cs'])
    topic_superset.append(arxiv_11['hep'])
    topic_superset.append(arxiv_11['math'])
    topic_superset.append(arxiv_11['physics'])
    topic_superset.append(arxiv_11['quant'])
    topic_superset.append(arxiv_11['stat'])

    # build individual lda
    lda_superset = []
    num_topics_list = []
    dictionary_set = []

    for topic_set in topic_superset:
        topic_texts = tokenize(topic_set)

        # turn our tokenized documents into a id - term dictionary
        dictionary = corpora.Dictionary(topic_texts)
        dictionary_set.append(dictionary)

        # convert tokenized documents into a document-term matrix
        corpus = [dictionary.doc2bow(text) for text in topic_texts]

        # generate LDA model
        num_topics = int(math.floor(len(topic_set) / 100))
        num_topics_list.append(num_topics)
        ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                                   num_topics=num_topics,
                                                   id2word=dictionary,
                                                   passes=20)
        lda_superset.append(ldamodel)

    print "all LDA built"

    # build training matrix
    prop_array_superset = []
    for i in range(len(num_topics_list)):
        num_topics = num_topics_list[i]
        topicPropArray = np.zeros((len(doc_texts), num_topics))
        for j in range(len(doc_texts)):
            text = doc_texts[j]
            textProp = lda_superset[i][dictionary_set[i].doc2bow(text)]
            for pair in textProp:
                topicIdx = pair[0]
                weight = pair[1]
                topicPropArray[j, topicIdx] = weight
        prop_array_superset.append(topicPropArray)

    # concat full feature array
    trainingArray = prop_array_superset[0]
    for i in range(len(prop_array_superset)):
        if i != 0:
            trainingArray = np.concatenate(
                (trainingArray, prop_array_superset[i]), axis=1)

    print "training matrix built"
    print "------------------"
    print "testing"

    # test on new data
    test_set = arxiv_12['astro'][0:100] + arxiv_12['cond'][0:100] + \
                arxiv_12['cs'][0:100] + arxiv_12['hep'][0:100] + \
                arxiv_12['math'][0:100] + arxiv_12['physics'][0:100] + \
                arxiv_12['quant'][0:100] + arxiv_12['stat'][0:100]
    test_label = [1]*100 + [2]*100 + [3]*100 + [4]*100 + [5]*100 + \
                   [6]*100 + [7]*100 + [8]*100

    test_texts = tokenize(test_set)

    # build indiv test prop array
    test_prop_array_superset = []
    for i in range(len(num_topics_list)):
        num_topics = num_topics_list[i]
        testPropArray = np.zeros((800, num_topics))
        for j in range(len(test_texts)):
            test = test_texts[j]
            testProp = lda_superset[i][dictionary_set[i].doc2bow(test)]
            for pair in testProp:
                topicIdx = pair[0]
                weight = pair[1]
                testPropArray[j, topicIdx] = weight
        test_prop_array_superset.append(testPropArray)

    # concat full test array
    testArray = test_prop_array_superset[0]
    for i in range(len(test_prop_array_superset)):
        if i != 0:
            testArray = np.concatenate(
                (testArray, test_prop_array_superset[i]), axis=1)

    cla = svm.SVC(kernel='linear')
    X_train, X_test, y_train, y_test = trainingArray, testArray, label_set, test_label

    print "training_array length: " + str(len(topicPropArray))
    print "test_array length: " + str(len(testPropArray))
    print "training_label length: " + str(len(label_set))
    print "test_label length: " + str(len(test_label))
    print '--------------------------------'

    # ovo
    # gnb
    gnb = GaussianNB()
    cla = OneVsOneClassifier(gnb)
    cla.fit(X_train, y_train)
    predictions = cla.predict(X_test)
    np.savetxt('ecocovosub_pred.csv',
               predictions.astype(int),
               fmt='%i',
               delimiter=",")
    # print predictions
    print 'ecoc gnb'
    print zero_one_loss(predictions, y_test)
    print '--------------------------------'

    svmlin = svm.SVC(kernel='linear')
    cla = OneVsOneClassifier(svmlin)
    cla.fit(X_train, y_train)
    predictions = cla.predict(X_test)
    np.savetxt('ecocovosubsvm_pred.csv',
               predictions.astype(int),
               fmt='%i',
               delimiter=",")
    # print predictions
    print 'ecoc svm'
    print zero_one_loss(predictions, y_test)
    print '--------------------------------'
Example #43
0
def training(train_data, dev_data, param):
    """
    Train the model on train_data and generate prediction on train_data, dev_data
    :param train_data: dictionary containing data, encoded label and binary label
    :param dev_data: dictionary containing data, encoded label and binary label
    :param param: parameter for training
    :return:
    train_prediction: prediction of training data
    dev_prediction: prediction of development data
    train_vec.shape: shape of training vector (sample size, feature size)
    dev_vec.shape: shape of development vector (sample size, feature size)
    model: trained classifier
    word_vec_map: learned tfidf/count vectorizer
    """
    text_to_vec = TextToVec(**param)

    # Fit with both train and dev data
    text_to_vec.fit(train_data['data'] + dev_data['data'])
    word_vec_map = text_to_vec.vectorizer.get_feature_names()
    train_vec = text_to_vec.transform(train_data['data'])
    dev_vec = text_to_vec.transform(dev_data['data'])
    logger.info(
        f"train vec size:{train_vec.shape}, dev vec size:{dev_vec.shape}")

    # # apply weights on tfidf based on whether the word appear in multiple classes
    # tt_occ = Counter(train_data['encoded_label'])
    # weight_list = []
    # for i in range(train_vec.shape[1]):  # For every feature
    #     occ = Counter(train_data['encoded_label'][train_vec[:, i] > 0.0])
    #     for key, value in occ.items():
    #         occ[key] = value/tt_occ[key]
    #     weight_list.append(np.std(list(occ.values()))/0.35)
    # weight = np.array(weight_list).reshape(1, -1)
    # weight = weight/np.max(weight)
    # train_vec = np.multiply(train_vec, weight)

    # Perform oversampling on training data
    if param['balanced'] not in ['Bootstrap', 'Handsample']:
        logger.info(
            f"class info before resampling: {sorted(Counter(train_data['encoded_label']).items())}"
        )
        train_vec, train_data['encoded_label'] = resample(
            X_train=train_vec,
            y_train=train_data['encoded_label'],
            balance=param['balanced'])
        logger.info(
            f"class info after resampling:{sorted(Counter(train_data['encoded_label']).items())}"
        )

    # Fit model
    if param['classifier'] == 'MultinomialNB':
        clf = MultinomialNB()
    elif param['classifier'] == 'LDA':
        clf = LinearDiscriminantAnalysis()
    else:
        clf = svm.LinearSVC()

    if param['multiclass'] == 'OnevsOne':
        model = OneVsOneClassifier(clf)
    else:
        model = OneVsRestClassifier(clf)

    if param['classifier'] == 'LinearSVM' or param['multiclass'] == 'OnevsOne':
        logger.info(f'Fitting model: {param}')
        model = model.fit(train_vec, train_data['encoded_label'])
        train_prediction = model.predict(train_vec)
        dev_prediction = model.predict(dev_vec)
    else:
        logger.info(f'Fitting model: {param}')
        model = model.fit(train_vec, train_data['binary_label'])
        train_prediction = np.argmax(model.predict(train_vec), axis=1)
        dev_prediction = np.argmax(model.predict(dev_vec), axis=1)

    return train_prediction, dev_prediction, train_vec.shape, dev_vec.shape, model, word_vec_map
Example #44
0
y_pred_2_threshold = (y_score > 70000)

# Multi-class classification
# By default, when given a multi-class problem most binary classifiers
# use the OvA strategy, except a few like SVMs which use OvO
# because it is faster for them. We can also explicitly specify to Sklearn
# which strategy to use.
multi_model = SGDClassifier()
multi_model.fit(X_train, y_train)  # this will use OvA
# In reality, this trains 10 different binary classifiers
multi_scores = multi_model.decision_function([some_digit])
# This stores an array of 10 scores in multi_scores
# The class with the highest score is the one the predict() method selects
# To explicitly use a specified strategy
OvO_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
OvO_clf.fit(X_train, y_train)

# Random-Forest Classifier
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
rf_clf.predict_proba([some_digit])
y_pred_multi = cross_val_predict(multi_model, X_train, y_train, cv=3)
conf_mat = confusion_matrix(y_train, y_pred_multi)
plt.matshow(conf_mat, cmap=plt.cm.gray)
# plt.show()
# The row represents the actual classes while the col
# represents the predicted classes.

# Multi-Label Classification -- classifying multiple
# binary labels on one input
# Creating a new label array
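# A minimal, self-contained sketch of the multi-label setup described above, using
# scikit-learn's small digits dataset in place of MNIST:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.neighbors import KNeighborsClassifier

digits_X, digits_y = load_digits(return_X_y=True)
y_large = (digits_y >= 7)            # first binary label: digit is 7, 8 or 9
y_odd = (digits_y % 2 == 1)          # second binary label: digit is odd
y_multilabel = np.c_[y_large, y_odd]

knn_clf = KNeighborsClassifier()
knn_clf.fit(digits_X, y_multilabel)  # one classifier, two binary labels per sample
print(knn_clf.predict(digits_X[:1])) # e.g. [[False False]] for the first digit (a 0)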
Example #45
0
result2 = np.append(c, d.astype(float), axis=1)
#print(result)
#np.savetxt("reslut.csv",np.array(clf.fit(X,Y).predict(test)).astype(int),delimiter=",")
with open("logloss.csv", "wb") as f:
    f.write(
        b'Sample_id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9,Class_10\n'
    )
    np.savetxt(f, result2, delimiter=",")

# Split the data into a training set and a test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

# Multiclass learning using OvO and LinearSVC
OvOclf = OneVsOneClassifier(LinearSVC(random_state=0))
a = np.reshape(np.asarray(list(range(1, len(test) + 1))), (6544, 1))
b = np.reshape(np.asarray(OvOclf.fit(X_train, Y_train).predict(test)),
               (6544, 1))
OvOclf.fit(X, Y).predict(X)
y_pred = OvOclf.fit(X_train, Y_train).predict(X_test)


def plot_confusion_matrix(cm,
                          classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
Example #46
0
class RobustWeightedClassifier(BaseEstimator, ClassifierMixin):
    """Algorithm for robust classification using reweighting algorithm.

    This model uses iterative reweighting of samples to make a regression or
    classification estimator robust.

    The principle of the algorithm is to use an empirical risk minimization
    principle where the risk is estimated using a robust estimator (for example
    Huber estimator or median-of-means estimator)[1], [3]. The idea behind this
    algorithm was mentioned before in [2].
    This idea translates into an iterative algorithm where the sample_weight
    values are changed at each iteration and depend on the sample. Informally,
    the outliers should have a small weight while the inliers should have a big
    weight, where outliers are samples with a large loss.

    This algorithm enjoys a non-zero breakdown-point (it can handle arbitrarily
    bad outliers). When the "mom" weighting scheme is used, k outliers can be
    tolerated. When the "Huber" weighting scheme is used, asymptotically the
    number of outliers has to be less than half the sample size.

    Read more in the :ref:`User Guide <robust>`.

    Parameters
    ----------

    weighting : string, default="huber"
        Weighting scheme used to make the estimator robust.
        Can be 'huber' for huber-type weights or  'mom' for median-of-means
        type weights.

    max_iter : int, default=100
        Maximum number of iterations.
        For more information, see the optimization scheme of base_estimator
        and the eta0 and burn_in parameter.

    burn_in : int, default=10
        Number of steps used without changing the learning rate.
        Can be useful to make the weight estimation better at the beginning.

    eta0 : float, default=0.01
        Constant step-size used during the burn_in period. Used only if
        burn_in>0. Can have a big effect on efficiency.

    c : float>0 or None, default=None
        Parameter used for the Huber weighting procedure, used only if weighting
        is 'huber'. Measures the robustness of the weighting procedure. A small
        value of c means a more robust estimator.
        Can have a big effect on efficiency.
        If None, c is estimated at each step using half the Inter-quartile
        range, this tends to be conservative (robust).

    k : int < sample_size/2, default=0
        Parameter used for the mom weighting procedure, used only if weighting
        is 'mom'. 2k+1 is the number of blocks used for median-of-means
        estimation, higher value of k means a more robust estimator.
        Can have a big effect on efficiency.
        If None, k is estimated using the number of points distant from the
        median of means of more than 2 times a robust estimate of the scale
        (using the inter-quartile range), this tends to be conservative
        (robust).

    loss : string, None or callable, default="log"
        Name of the loss used, must be the same loss as the one optimized in
        base_estimator.
        Classification losses supported : 'log', 'hinge'.
        If 'log', then the base_estimator must support predict_proba.
        Regression losses supported : 'squared_loss'.

    sgd_args : dict, default={}
        arguments of the SGDClassifier base estimator.

    multi_class : string, default="ovr"
        multi-class scheme. Can be either "ovo" for OneVsOneClassifier or "ovr"
        for OneVsRestClassifier or "binary" for binary classification.

    n_jobs : int, default=1
        number of jobs used in the multi-class meta-algorithm computation.

    tol : float or None, (default = 1e-3)
        The stopping criterion. If it is not None, training will stop when
        (loss > best_loss - tol) for n_iter_no_change consecutive epochs.

    n_iter_no_change : int, default=10
        Number of iterations with no improvement to wait before early stopping.

    random_state : int, RandomState instance or None, optional (default=None)
        The seed of the pseudo random number generator to use when shuffling
        the data. If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by np.random.



    Attributes
    ----------

    classes_ : ndarray of shape (n_classes, )
        A list of class labels known to the classifier.

    coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)
        Coefficient of the features in the decision function. Only available if
        multi_class = "binary"

    intercept_ : ndarray of shape (1,) or (n_classes,)
        Intercept (a.k.a. bias) added to the decision function.
        Only available if multi_class = "binary"

    n_iter_ : ndarray of shape (n_classes,) or (1, )
        Actual number of iterations for all classes. If binary or multinomial,
        it returns only 1 element. For liblinear solver, only the maximum
        number of iteration across all classes is given.

    base_estimator_ : object,
        The fitted base estimator SGDCLassifier.

    weights_ : array like, length = n_samples.
        Weight of each sample at the end of the algorithm. Can be used as a
        measure of how much of an outlier a sample is. Only available if
        multi_class = "binary"


    Notes
    -----

    Often, there is a need to use RobustScaler as preprocessing.

    Examples
    --------

    >>> from sklearn_extra.robust import RobustWeightedClassifier
    >>> from sklearn.datasets import make_blobs
    >>> import numpy as np
    >>> rng = np.random.RandomState(42)
    >>> X,y = make_blobs(n_samples=100, centers=np.array([[-1, -1], [1, 1]]),
    ...                  random_state=rng)
    >>> clf=RobustWeightedClassifier()
    >>> _ = clf.fit(X, y)
    >>> score = np.mean(clf.predict(X)==y)

    References
    ----------

    [1] Guillaume Lecué, Matthieu Lerasle and Timothée Mathieu.
        "Robust classification via MOM minimization", Mach Learn 109, (2020).
        https://doi.org/10.1007/s10994-019-05863-6 (2018).
        arXiv:1808.03106

    [2] Christian Brownlees, Emilien Joly and Gábor Lugosi.
        "Empirical risk minimization for heavy-tailed losses", Ann. Statist.
        Volume 43, Number 6 (2015), 2507-2536.

    [3] Stanislav Minsker and Timothée Mathieu.
        "Excess risk bounds in robust empirical risk minimization"
        arXiv preprint (2019). arXiv:1910.07485.

    """

    def __init__(
        self,
        weighting="huber",
        max_iter=100,
        burn_in=10,
        eta0=0.01,
        c=None,
        k=0,
        loss="log",
        sgd_args=None,
        multi_class="ovr",
        n_jobs=1,
        tol=1e-3,
        n_iter_no_change=10,
        random_state=None,
    ):
        self.weighting = weighting
        self.max_iter = max_iter
        self.burn_in = burn_in
        self.eta0 = eta0
        self.c = c
        self.k = k
        self.loss = loss
        self.sgd_args = sgd_args
        self.multi_class = multi_class
        self.n_jobs = n_jobs
        self.tol = tol
        self.n_iter_no_change = n_iter_no_change
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the model to data matrix X and target(s) y.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            The input data.

        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers in
            regression).

        Returns
        -------
        self : returns an estimator trained with RobustWeightedClassifier.
        """

        if self.sgd_args is None:
            sgd_args = {}
        else:
            sgd_args = self.sgd_args

        # Define the base estimator
        base_robust_estimator_ = _RobustWeightedEstimator(
            SGDClassifier(**sgd_args, loss=self.loss),
            weighting=self.weighting,
            loss=self.loss,
            burn_in=self.burn_in,
            c=self.c,
            k=self.k,
            eta0=self.eta0,
            max_iter=self.max_iter,
            tol=self.tol,
            n_iter_no_change=self.n_iter_no_change,
            random_state=self.random_state,
        )

        if self.multi_class == "ovr":
            self.base_estimator_ = OneVsRestClassifier(
                base_robust_estimator_, n_jobs=self.n_jobs
            )
        elif self.multi_class == "binary":
            self.base_estimator_ = base_robust_estimator_
        elif self.multi_class == "ovo":
            self.base_estimator_ = OneVsOneClassifier(
                base_robust_estimator_, n_jobs=self.n_jobs
            )
        else:
            raise ValueError("No such multiclass method implemented.")

        self.base_estimator_.fit(X, y)
        if self.multi_class == "binary":
            self.weights_ = self.base_estimator_.weights_
            self.coef_ = self.base_estimator_.coef_
            self.intercept_ = self.base_estimator_.intercept_
        self.n_iter_ = self.max_iter * len(X)
        self.classes_ = self.base_estimator_.classes_
        return self

    def predict(self, X):
        """Predict using the estimator trained with RobustWeightedClassifier.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        Returns
        -------
        y : array-like, shape (n_samples, n_outputs)
            The predicted values.
        """

        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.predict(X)

    def _check_proba(self):
        if self.loss != "log":
            raise AttributeError(
                "Probability estimates are not available for"
                " loss=%r" % self.loss
            )

    @property
    def predict_proba(self):
        """
        Probability estimates when binary classification.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Vector to be scored, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        Returns
        -------
        T : array-like of shape (n_samples, n_classes)
            Returns the probability of the sample for each class in the model,
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        self._check_proba()
        return self._predict_proba

    def _predict_proba(self, X):
        return self.base_estimator_.predict_proba(X)

    @property
    def _estimator_type(self):
        return self.base_estimator_._estimator_type

    def score(self, X, y=None):
        """Returns the score on the given data, using
        ``base_estimator_.score``.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like of shape (n_samples, n_output) or (n_samples,), optional
            Target relative to X for classification or regression;
            None for unsupervised learning.
        Returns
        -------
        score : float
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.score(X, y)

    def decision_function(self, X):
        """Predict using the linear model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)

        Returns
        -------
        array, shape (n_samples,)
           Predicted target values per element in X.
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.decision_function(X)
Example #47
0
class LSVMDetector:
    # just the training() function changes; the rest stays the same.

    def __init__(self, subjects, data, attacker_data):
        self.data = data
        self.attacker = attacker_data
        self.u_scores = []
        self.i_scores = []
        self.mean_vector = []
        self.subjects = subjects
        self.fp = []

    def training(self):
        self.clf = OneVsOneClassifier(SVC(kernel='rbf', gamma='auto'))
        labels = [0] * len(self.train) + [1] * len(self.train_imposter)
        self.clf.fit(pandas.concat([self.train, self.train_imposter]), labels)

    def testing(self):
        self.u_scores = self.clf.decision_function(self.test_genuine)
        self.i_scores = self.clf.decision_function(self.test_imposter)
        self.u_scores = list(self.u_scores)
        self.i_scores = list(self.i_scores)

    def evaluate(self):
        eers = []
        fpr = []

        if isinstance(self.subjects, list):
            for idx, subject in enumerate(self.subjects):
                genuine_user_data = self.data.loc[self.data.user_id == subject, \
                                                  ["stroke duration", 'start $x$', 'start $y$', 'stop $x$', 'stop $y$',
                                                   'length of trajectory', 'mid-stroke pressure',
                                                   'mid-stroke area covered',
                                                   '20\%-perc. pairwise velocity', '50\%-perc. pairwise velocity',
                                                   '20\%-perc. dev. from end-to-end line',
                                                   '50\%-perc. dev. from end-to-end line',
                                                   '80\%-perc. dev. from end-to-end line']]
                imposter_data = self.data.loc[self.data.user_id != subject, :]
                # generated_data = attacker_data
                genuine_user_data = normalize_df(genuine_user_data[:400])

                self.train = genuine_user_data[:200]
                self.test_genuine = genuine_user_data[200:400]

                # self.test_imposter = normalize_np(self.attacker[idx])
                # self.test_imposter = normalize_df(imposter_data.groupby("user_id"). \
                #                                    head(10).loc[:,
                #                                    ["stroke duration", 'start $x$', 'start $y$', 'stop $x$', 'stop $y$',
                #                                     'length of trajectory', 'mid-stroke pressure',
                #                                     'mid-stroke area covered',
                #                                     '20\%-perc. pairwise velocity', '50\%-perc. pairwise velocity',
                #                                     '20\%-perc. dev. from end-to-end line',
                #                                     '50\%-perc. dev. from end-to-end line',
                #                                     '80\%-perc. dev. from end-to-end line']])
                self.train_imposter = normalize_df(imposter_data.groupby("user_id"). \
                                         tail(10).loc[:, ["stroke duration", 'start $x$', 'start $y$', 'stop $x$', 'stop $y$',
                                                         'length of trajectory', 'mid-stroke pressure', 'mid-stroke area covered',
                                                         '20\%-perc. pairwise velocity', '50\%-perc. pairwise velocity',
                                                         '20\%-perc. dev. from end-to-end line', '50\%-perc. dev. from end-to-end line',
                                                         '80\%-perc. dev. from end-to-end line']])

                self.test_imposter = self.attacker[idx]

                self.training()
                self.testing()
                # eers.append(evaluateEER(self.u_scores, \
                #                         self.i_scores))
                fpr.append(evaluateFAR(self.u_scores, self.i_scores))
                # print(evaluateFAR(self.u_scores, self.i_scores))

        else:
            genuine_user_data = self.data.loc[self.data.user_id == self.subjects, \
                                              ["stroke duration", 'start $x$', 'start $y$', 'stop $x$',
                                               'stop $y$',
                                               'length of trajectory', 'mid-stroke pressure',
                                               'mid-stroke area covered',
                                               '20\%-perc. pairwise velocity', '50\%-perc. pairwise velocity',
                                               '20\%-perc. dev. from end-to-end line',
                                               '50\%-perc. dev. from end-to-end line',
                                               '80\%-perc. dev. from end-to-end line']]
            imposter_data = self.data.loc[
                self.data.user_id != self.subjects, :]
            # generated_data = attacker_data
            genuine_user_data = normalize_df(genuine_user_data[:400])

            self.train = genuine_user_data[:200]
            self.test_genuine = genuine_user_data[200:400]
            # self.test_imposter = imposter_data.groupby("subject"). \
            #                          tail(6).loc[:, "H.period":"H.Return"]
            # self.test_imposter = normalize_np(self.attacker)
            self.train_imposter = normalize_df(imposter_data.groupby("user_id"). \
                                         tail(10).loc[:, ["stroke duration", 'start $x$', 'start $y$', 'stop $x$', 'stop $y$',
                                                         'length of trajectory', 'mid-stroke pressure', 'mid-stroke area covered',
                                                         '20\%-perc. pairwise velocity', '50\%-perc. pairwise velocity',
                                                         '20\%-perc. dev. from end-to-end line', '50\%-perc. dev. from end-to-end line',
                                                         '80\%-perc. dev. from end-to-end line']])
            self.test_imposter = self.attacker

            self.training()
            self.testing()
            # eers.append(evaluateEER(self.u_scores, \
            #                        self.i_scores))
            fpr.append(evaluateFAR(self.u_scores, self.i_scores))

        return np.mean(fpr)
Example #48
0
class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
    """Gaussian process classification (GPC) based on Laplace approximation.

    The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
    Gaussian Processes for Machine Learning (GPML) by Rasmussen and
    Williams.

    Internally, the Laplace approximation is used for approximating the
    non-Gaussian posterior by a Gaussian.

    Currently, the implementation is restricted to using the logistic link
    function. For multi-class classification, several binary one-versus rest
    classifiers are fitted. Note that this class thus does not implement
    a true multi-class Laplace approximation.

    Parameters
    ----------
    kernel : kernel object
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
        the kernel's hyperparameters are optimized during fitting.

    optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
        Can either be one of the internally supported optimizers for optimizing
        the kernel's parameters, specified by a string, or an externally
        defined optimizer passed as a callable. If a callable is passed, it
        must have the  signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func' is the objective function to be maximized, which
                #   takes the hyperparameters theta as parameter and an
                #   optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can be
                #   used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        Per default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are::

            'fmin_l_bfgs_b'

    n_restarts_optimizer : int, optional (default: 0)
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that n_restarts_optimizer=0 implies that one
        run is performed.

    max_iter_predict : int, optional (default: 100)
        The maximum number of iterations in Newton's method for approximating
        the posterior during predict. Smaller values will reduce computation
        time at the cost of worse results.

    warm_start : bool, optional (default: False)
        If warm-starts are enabled, the solution of the last Newton iteration
        on the Laplace approximation of the posterior mode is used as
        initialization for the next call of _posterior_mode(). This can speed
        up convergence when _posterior_mode is called several times on similar
        problems as in hyperparameter optimization.

    copy_X_train : bool, optional (default: True)
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    multi_class: string, default : "one_vs_rest"
        Specifies how multi-class classification problems are handled.
        Supported are "one_vs_rest" and "one_vs_one". In "one_vs_rest",
        one binary Gaussian process classifier is fitted for each class, which
        is trained to separate this class from the rest. In "one_vs_one", one
        binary Gaussian process classifier is fitted for each pair of classes,
        which is trained to separate these two classes. The predictions of
        these binary predictors are combined into multi-class predictions.
        Note that "one_vs_one" does not support predicting probability
        estimates.

    n_jobs : int, optional, default: 1
        The number of jobs to use for the computation. If -1 all CPUs are used.
        If 1 is given, no parallel computing code is used at all, which is
        useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
        used. Thus for n_jobs = -2, all CPUs but one are used.

    Attributes
    ----------
    kernel_ : kernel object
        The kernel used for prediction. In case of binary classification,
        the structure of the kernel is the same as the one passed as parameter
        but with optimized hyperparameters. In case of multi-class
        classification, a CompoundKernel is returned which consists of the
        different kernels used in the one-versus-rest classifiers.

    log_marginal_likelihood_value_ : float
        The log-marginal-likelihood of ``self.kernel_.theta``

    classes_ : array-like, shape = (n_classes,)
        Unique class labels.

    n_classes_ : int
        The number of classes in the training data

    .. versionadded:: 0.18
    """
    def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0, max_iter_predict=100,
                 warm_start=False, copy_X_train=True, random_state=None,
                 multi_class="one_vs_rest", n_jobs=1):
        self.kernel = kernel
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.max_iter_predict = max_iter_predict
        self.warm_start = warm_start
        self.copy_X_train = copy_X_train
        self.random_state = random_state
        self.multi_class = multi_class
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values, must be binary

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y, multi_output=False)

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            self.kernel, self.optimizer, self.n_restarts_optimizer,
            self.max_iter_predict, self.warm_start, self.copy_X_train,
            self.random_state)

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size
        if self.n_classes_ == 1:
            raise ValueError("GaussianProcessClassifier requires 2 or more "
                             "distinct classes. Only class %s present."
                             % self.classes_[0])
        if self.n_classes_ > 2:
            if self.multi_class == "one_vs_rest":
                self.base_estimator_ = \
                    OneVsRestClassifier(self.base_estimator_,
                                        n_jobs=self.n_jobs)
            elif self.multi_class == "one_vs_one":
                self.base_estimator_ = \
                    OneVsOneClassifier(self.base_estimator_,
                                       n_jobs=self.n_jobs)
            else:
                raise ValueError("Unknown multi-class mode %s"
                                 % self.multi_class)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            self.log_marginal_likelihood_value_ = np.mean(
                [estimator.log_marginal_likelihood()
                 for estimator in self.base_estimator_.estimators_])
        else:
            self.log_marginal_likelihood_value_ = \
                self.base_estimator_.log_marginal_likelihood()

        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        X = check_array(X)
        return self.base_estimator_.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array-like, shape = (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
            raise ValueError("one_vs_one multi-class mode does not support "
                             "predicting probability estimates. Use "
                             "one_vs_rest mode instead.")
        X = check_array(X)
        return self.base_estimator_.predict_proba(X)

    @property
    def kernel_(self):
        if self.n_classes_ == 2:
            return self.base_estimator_.kernel_
        else:
            return CompoundKernel(
                [estimator.kernel_
                 for estimator in self.base_estimator_.estimators_])

    def log_marginal_likelihood(self, theta=None, eval_gradient=False):
        """Returns log-marginal likelihood of theta for training data.

        In the case of multi-class classification, the mean log-marginal
        likelihood of the one-versus-rest classifiers are returned.

        Parameters
        ----------
        theta : array-like, shape = (n_kernel_params,) or none
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. In the case of multi-class classification, theta may
            be the  hyperparameters of the compound kernel or of an individual
            kernel. In the latter case, all individual kernels get assigned the
            same theta values. If None, the precomputed log_marginal_likelihood
            of ``self.kernel_.theta`` is returned.

        eval_gradient : bool, default: False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. Note that gradient computation is not supported
            for non-binary classification. If True, theta must not be None.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : array, shape = (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the kernel
            hyperparameters at position theta.
            Only returned when eval_gradient is True.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])

        if theta is None:
            if eval_gradient:
                raise ValueError(
                    "Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        theta = np.asarray(theta)
        if self.n_classes_ == 2:
            return self.base_estimator_.log_marginal_likelihood(
                theta, eval_gradient)
        else:
            if eval_gradient:
                raise NotImplementedError(
                    "Gradient of log-marginal-likelihood not implemented for "
                    "multi-class GPC.")
            estimators = self.base_estimator_.estimators_
            n_dims = estimators[0].kernel_.n_dims
            if theta.shape[0] == n_dims:  # use same theta for all sub-kernels
                return np.mean(
                    [estimator.log_marginal_likelihood(theta)
                     for i, estimator in enumerate(estimators)])
            elif theta.shape[0] == n_dims * self.classes_.shape[0]:
                # theta for compound kernel
                return np.mean(
                    [estimator.log_marginal_likelihood(
                        theta[n_dims * i:n_dims * (i + 1)])
                     for i, estimator in enumerate(estimators)])
            else:
                raise ValueError("Shape of theta must be either %d or %d. "
                                 "Obtained theta with shape %d."
                                 % (n_dims, n_dims * self.classes_.shape[0],
                                    theta.shape[0]))
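# A standalone usage sketch (separate from the class fragment above), illustrating the
# behaviour its docstrings describe for scikit-learn's GaussianProcessClassifier:
# "one_vs_rest" exposes predict_proba, while "one_vs_one" supports predict() only and
# raises ValueError for probability estimates, as implemented in predict_proba above.
from sklearn.datasets import load_iris
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

X_iris, y_iris = load_iris(return_X_y=True)

gpc_ovr = GaussianProcessClassifier(kernel=1.0 * RBF(1.0), multi_class="one_vs_rest")
gpc_ovr.fit(X_iris, y_iris)
print(gpc_ovr.predict_proba(X_iris[:2]))   # per-class probabilities are available

gpc_ovo = GaussianProcessClassifier(kernel=1.0 * RBF(1.0), multi_class="one_vs_one")
gpc_ovo.fit(X_iris, y_iris)
print(gpc_ovo.predict(X_iris[:2]))         # predictions only; predict_proba would raise ValueError
print(gpc_ovo.log_marginal_likelihood())   # mean log-marginal likelihood over the pairwise estimators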
import numpy as np
from sklearn import metrics
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC

# load the space-separated train/test files into numpy ndarrays
tr = np.genfromtxt(tr_path, delimiter=' ')
ts = np.genfromtxt(ts_path, delimiter=' ')

tr_feat = tr[:, 1:]
ts_feat = ts[:, 1:]
tr_label = tr[:, 0]
ts_label = ts[:, 0]

# use sklearn C-Support Vector Classification
## == one-vs-one == ##
# The multiclass support is handled in a one-vs-one scheme
# train
ovo_clf = OneVsOneClassifier(LinearSVC())
ovo_clf.fit(tr_feat, tr_label)

# predict
ovo_pred = ovo_clf.predict(ts_feat)
ovo_err = 1 - ovo_clf.score(ts_feat, ts_label)

# confusion matrix
#
#array([[159,   7],
#       [  5, 161]])
ovo_cmat = metrics.confusion_matrix(ts_label, ovo_pred)
# per-class sample counts (rows of the confusion matrix correspond to the true classes)
class_total = np.sum(ovo_cmat, axis=1)
ovo_mis = 1 - np.diag(ovo_cmat).astype(float) / class_total
print("one vs. one svm - classification err: %s \n" % (ovo_err))
print("confusion matrix: \n %s" % (ovo_cmat))
print("class misclassification rate : \n %s" % (ovo_mis))
Example #50
0
print(79 * "_")
print('OvO', cv_scores_ovo.mean())
print('OvA', cv_scores_ova.mean())

plt.figure(figsize=(4, 3))
plt.boxplot([cv_scores_ova, cv_scores_ovo])
plt.xticks([1, 2], ['One vs All', 'One vs One'])
plt.title('Prediction: accuracy score')

### Plot a confusion matrix ###################################################
# Fit on the first 10 sessions and plot a confusion matrix on the
# last 2 sessions
from sklearn.metrics import confusion_matrix

svc_ovo.fit(X[session < 10], y[session < 10])
y_pred_ovo = svc_ovo.predict(X[session >= 10])

plt.matshow(confusion_matrix(y_pred_ovo, y[session >= 10]))
plt.title('Confusion matrix: One vs One')
plt.xticks(np.arange(len(unique_conditions)), unique_conditions)
plt.yticks(np.arange(len(unique_conditions)), unique_conditions)

svc_ova.fit(X[session < 10], y[session < 10])
y_pred_ova = svc_ova.predict(X[session >= 10])

plt.matshow(confusion_matrix(y_pred_ova, y[session >= 10]))
plt.title('Confusion matrix: One vs All')
plt.xticks(np.arange(len(unique_conditions)), unique_conditions)
plt.yticks(np.arange(len(unique_conditions)), unique_conditions)
Example #51
0
    async def do_run_async(self):
        training_set = super().load_train_images()

        # Training set needs to be reshaped from 3D (60000, 28, 28) to 2D (60000, 784) so that
        # the classifier can use it in the training phase
        training_set_tr = training_set.reshape((60000, 784))
        training_labels = super().load_train_labels()

        # Scikit-learn is smart enough to detect when a binary classification algorithm such as SGD
        # is used on a multiclass classification task (when the labels are not binary) and automatically
        # runs the OvA strategy (trains N binary classifiers, one per class), except for SVMs, for which
        # it runs OvO (trains N x (N-1)/2 binary classifiers: one between 0 and 1, one between 1 and 2, etc.)

        sgd_classifier = SGDClassifier(random_state=77)
        sgd_classifier.fit(training_set_tr, training_labels)

        seven = super().get_random_digit(training_set_tr, training_labels, 7)
        print(f"The digit is:{sgd_classifier.predict([seven])}")

        # Get the classifier to return the decision scores for each class rather than a prediction
        # The class with the highest score is the one that gets predicted
        scores = sgd_classifier.decision_function([seven])
        print(f"The decision scores for the digit are:{scores}")

        # Can also force Scikit-Learn to use the SGDClassifier with OvO strategy
        ovo = OneVsOneClassifier(sgd_classifier)
        ovo.fit(training_set_tr, training_labels)
        print("OvO: The digit is:", ovo.predict([seven]))

        # The Random Forest algorithm can also be used for classification (besides regression - RandomForestRegressor)
        # and is natively multiclass, so there is no need for OvA or OvO strategies
        rnd_forest = RandomForestClassifier()
        rnd_forest.fit(training_set_tr, training_labels)
        print(f"Random Forest: The digit is:{rnd_forest.predict([seven])}")
        print(
            f"Random Forest: Probabilities:{rnd_forest.predict_proba([seven])}"
        )

        # Evaluate SGD Classifier vs Random Forest based on confusion matrix
        sgd_predictions = cross_val_predict(sgd_classifier,
                                            training_set_tr,
                                            training_labels,
                                            cv=3)
        rnd_forest_predictions = cross_val_predict(rnd_forest,
                                                   training_set_tr,
                                                   training_labels,
                                                   cv=3)

        print("SGD Classifier Confusion Matrix:")
        print(confusion_matrix(training_labels, sgd_predictions))
        print("Random Forest Classifier Confusion Matrix:")
        print(confusion_matrix(training_labels, rnd_forest_predictions))

        # Random Forest generally seems better - higher values on the main diagonal
        # Test persisting a trained classifier
        rnd_forest = RandomForestClassifier()
        rnd_forest.fit(training_set_tr, training_labels)

        with open("D:\\rnd_forest.dat", "wb") as file:
            pickle.dump(rnd_forest, file)

        with open("D:\\rnd_forest.dat", "rb") as file2:
            rnd_forest_2 = pickle.load(file2)
        print("Random Forest Persisted: The digit is:",
              rnd_forest_2.predict([seven]))
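# A small standalone sketch (on scikit-learn's bundled digits dataset rather than MNIST, to
# keep it quick) backing up the comment above: forcing OvO wraps the binary SGDClassifier in
# N * (N - 1) / 2 pairwise estimators, i.e. 45 of them for the 10 digit classes.
from sklearn.datasets import load_digits
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsOneClassifier

X_dig, y_dig = load_digits(return_X_y=True)
ovo_sgd = OneVsOneClassifier(SGDClassifier(random_state=77))
ovo_sgd.fit(X_dig, y_dig)
print(len(ovo_sgd.estimators_))   # 45 pairwise classifiers for 10 classes
print(ovo_sgd.predict(X_dig[:1]))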
Example #52
0
def evaluation():
    iris = datasets.load_iris()
    class_names = iris.target_names
    C = 1.0  # SVM regularization parameter
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        test_size=0.4,
                                                        random_state=0)

    svm_c = OneVsOneClassifier(SVC(kernel='linear', C=C))
    y_pred = svm_c.fit(X_train, y_train).predict(X_test)

    with open('output_files/svm_1v1_classification_report.tex', 'w') as out:
        out.write('\\begin{table}\n')
        out.write(
            report_to_latex_table(
                classification_report(y_test,
                                      y_pred,
                                      target_names=iris.target_names)))
        out.write('\n\\end{table}\n')

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test, y_pred)
    np.set_printoptions(precision=2)

    # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          normalize=True,
                          title='SVM One-vs-One Confusion Matrix')
    plt.tight_layout()
    plt.savefig('images/svm_linear_1v1_cm.png')
    plt.close()

    svm_l = OneVsRestClassifier(LinearSVC(C=C))
    y_pred = svm_l.fit(X_train, y_train).predict(X_test)
    with open('output_files/svm_1va_classification_report.tex', 'w') as out:
        out.write('\\begin{table}\n')
        out.write(
            report_to_latex_table(
                classification_report(y_test,
                                      y_pred,
                                      target_names=iris.target_names)))
        out.write('\n\\end{table}\n')

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test, y_pred)
    np.set_printoptions(precision=2)

    # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          normalize=True,
                          title='SVM One-vs-All Confusion Matrix',
                          showC=False)

    plt.tight_layout()
    plt.savefig('images/svm_linear_1va_cm.png')
    plt.close()
Example #53
0
import numpy as np

from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import label_binarize
from sklearn.svm import LinearSVC
from sklearn import metrics

data_tr_r = np.loadtxt('multitest_out.csv', delimiter = ',')
data_ts_r = np.loadtxt('multitrain_out.csv', delimiter = ',')

data_tr = data_tr_r[:, :-1]
data_ts = data_ts_r[:, :-1]
label_tr = data_tr_r[:,-1]
label_ts = data_ts_r[:,-1]

# Learn to predict each class against every other class (pairwise one-vs-one)
clf = OneVsOneClassifier(LinearSVC(random_state = 0))
OvsO = clf.fit(data_tr, label_tr)

result = clf.predict(data_ts)
# accuracy = clf.score(data_ts, label_ts)
accuracy = metrics.accuracy_score(result, label_ts)
error_vector = result - label_ts
error = 0

# clf is already fitted above, so the pairwise decision scores can be taken directly
p_data = clf.decision_function(data_ts)
p_data = p_data[:, 1]

conf_mat = metrics.confusion_matrix(label_ts, result)
precision = metrics.precision_score(label_ts, result, average=None)
recall = metrics.recall_score(label_ts, result, average=None)
print("confusion_matrix:")
print(conf_mat)
# Splitting the dataset into test and train
s_train, s_test, t_train, t_test = train_test_split(s,
                                                    t,
                                                    test_size=0.33,
                                                    random_state=4)

#Printing shapes
print(s_train.shape)
print(s_test.shape)
print(t_train.shape)
print(t_test.shape)

# One-vs-One SVM Classifier Prediction
smodel = OneVsOneClassifier(SVC()).fit(s_train, t_train)
sprediction = smodel.predict(s_test)
print(sprediction)

# One-vs-Rest SVM Classifier Prediction
clf = OneVsRestClassifier(SVC()).fit(s_train, t_train)
spredict = clf.predict(s_test)
print(spredict)

# Actual values which should have been predicted based on testing dataset
print(t_test)
"""<h1>Evaluating the classifiers</h1>"""

# Accuracy for One-vs-One Classifier
accuracy = metrics.accuracy_score(t_test, sprediction)
print(accuracy)
Example #55
0
scores = sgd.decision_function([some_digit])
#array([[-836368.4535247 , -461981.66956632, -660256.15197058,
#       -148855.65250873, -137458.04986937, -154654.76568534,
#       -864502.26667054, -245167.9063152 , -149510.01775103,
#       -233700.77221455]])

# argmax gives the index of the highest score, i.e. the winning class
np.argmax(scores)

sgd.classes_
#array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], dtype='<U1')
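# A short sketch of what the argmax over the decision scores means here: for the OvR
# decision function above, the predicted label is simply the class at the winning index.
best = np.argmax(scores)
print(sgd.classes_[best])          # same label that sgd.predict([some_digit]) returns
print(sgd.predict([some_digit]))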

from sklearn.multiclass import OneVsOneClassifier

ovo = OneVsOneClassifier(SGDClassifier(random_state=100))
ovo.fit(data_train, target_train)
ovo.predict([some_digit])
len(ovo.estimators_)
ovo.decision_function([some_digit])
#array([[ 1.5       ,  4.01086892,  0.50210079,  5.22484016,  8.31545536,
#         5.11411311, -0.43998285,  5.13308383,  7.3219439 ,  8.3175768 ]])

cross_val_score(sgd, data_train, target_train, cv=3, scoring='accuracy')
#array([0.86552689, 0.86179309, 0.86117918])

import pandas as pd

predict_m = cross_val_predict(sgd, data_train, target_train, cv=3)

ps = precision_score(target_train, predict_m, average=None)
import math
import operator

import numpy
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import normalize
from sklearn.svm import SVC


# Hybrid KNN-SVM: classify each test point by its K nearest neighbours and fall back to a
# local one-vs-one SVM whenever those neighbours do not all agree on a single class.
def my_HKNNSVM(X_train, X_test, Y_train, K_Neighbors, Kernel_SVM):
    train = X_train
    train = normalize(train)
    test = X_test
    test = normalize(test)
    kelas = Y_train
    k = K_Neighbors
    kernel = Kernel_SVM
    hasilkelas = []
    Y_pred = []

    for z in range(0, len(test)):
        distance = []

        train = numpy.array(train)
        test = numpy.array(test)
        index_train = numpy.arange(len(train))
        index_train = index_train.tolist()
        length = len(train)
        for i in range(0, length):
            distance.append(
                (math.sqrt(sum([(a - b)**2 for a, b in zip(train[i], test[z])
                                ])), kelas[i], tuple(train[i]), kelas[i]))

        distance.sort(key=operator.itemgetter(0))

        neighbor = []
        ttg = []
        kelasttg = []
        jarak = []
        for j in range(k):
            neighbor.append(distance[j])
            ttg.append(neighbor[j][2])
            kelasttg.append(neighbor[j][1])
            jarak.append(distance[0])

        ttg = list(ttg)
        classVotes = {}
        for a in range(len(neighbor)):
            response = neighbor[a][1]
            if response in classVotes:
                classVotes[response] += 1
            else:
                classVotes[response] = 1
        #sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)

        items = []
        items = list(classVotes.items())
        #print(classVotes)
        '''svm'''
        if len(items) > 1:
            clf = OneVsOneClassifier(SVC(kernel=kernel))
            clf.fit(list(ttg), list(kelasttg))
            ley = [list(test[z])]
            hasilkelas = clf.predict(ley)
            #print(hasilkelas)
        else:
            hasilkelas = max(classVotes.items(), key=operator.itemgetter(1))[0]
            #print(hasilkelas)
            hasilkelas = numpy.reshape(hasilkelas, (1, ))
            hasilkelas = numpy.array(hasilkelas)
        '''svm'''

        Y_pred.append(hasilkelas)

    return Y_pred
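# A tiny, hypothetical usage sketch for my_HKNNSVM above (the data below is made up):
# three well-separated classes, 3 nearest neighbours, and an RBF-kernel SVM as the local
# tie-breaker whenever the neighbours disagree.
X_train_demo = [[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9], [-1.0, 0.0], [-0.9, -0.1]]
Y_train_demo = [0, 0, 1, 1, 2, 2]
X_test_demo = [[0.95, 0.05], [0.05, 0.95]]
print(my_HKNNSVM(X_train_demo, X_test_demo, Y_train_demo, K_Neighbors=3, Kernel_SVM="rbf"))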
Example #57
0
class SeCoEstimator(BaseEstimator, ClassifierMixin):
    """A classifier using rules learned with the *Separate-and-Conquer* (SeCo)
    algorithm, also known as *Covering* algorithm.

    Wraps `_BaseSeCoEstimator` to handle multi-class problems, selecting a
    multi-class strategy and making sure that `_BaseSeCoEstimator` always sees
    an integer range [0..n_classes_) of class labels, where 0 is the intended
    fallback class; i.e. the biggest class in multi-class problems, or the
    negative class when learning a binary concept.

    The concrete SeCo variant to run is defined by `algorithm_config`.

    Fields
    -----
    algorithm_config : subclass of SeCoAlgorithmConfiguration
        Defines the concrete SeCo algorithm to run, see
        :class:`SeCoAlgorithmConfiguration`.

    Parameters
    -----
    multi_class : callable or str or None
        Which strategy to use for non-binary problems. Possible values:

        - None: auto-select; use 'direct' if possible
          (`algorithm_config.direct_multiclass_support()` returns True),
          'one_vs_rest' otherwise.
        - A callable: Construct
          `self.base_estimator_ = multi_class(_BaseSeCoEstimator())` and
          delegate to that estimator. Useful if you want to roll a different
          binarization strategy, e.g.

          >>> import sklearn.multiclass, functools
          >>> multi_class=functools.partial(
          ...     sklearn.multiclass.OutputCodeClassifier,
          ...     code_size=0.7, random_state=42)

          If you use this, make sure to pass `_BaseSeCoEstimator` class labels `y`
          from an integer range [0..n_classes_), e.g. using `LabelEncoder`.
          Also be aware of the influence of class order on tie-breaking.
        - 'direct': Directly learn a theory of rules with different heads
          (target classes). Uses :class:`BySizeLabelEncoder` internally.
        - 'one_vs_rest': Use `sklearn.multiclass.OneVsRestClassifier` for class
          binarization and learn binary theories.
        - 'one_vs_one': Use `sklearn.multiclass.OneVsOneClassifier` for class
          binarization and learn binary theories.
        - TODO: multi_class strategy of ripper: OneVsRest, remove C_i after learning rules for it

    random_state : None | int | instance of np.random.RandomState
        RNG, may be used by the algorithm. Value passed through
        `sklearn.utils.check_random_state`.

    n_jobs : int, optional
        Passed to `OneVsRestClassifier` or `OneVsOneClassifier` if these are
        used.

    Attributes
    -----
    base_estimator_ : estimator instance
        The estimator object that all tasks are delegated to. One of
        `sklearn.multiclass.OneVsRestClassifier`,
        `sklearn.multiclass.OneVsOneClassifier` or
        `sklearn_seco.util.TargetTransformingMetaEstimator` if demanded by the
        `multi_class_` strategy, a `_BaseSeCoEstimator` otherwise.

    multi_class_ : callable or str
        The actual strategy used on a non-binary problem. Relevant if
        `multi_class=None` demanded auto-selection.

    classes_ : np.ndarray
        `np.unique(y)`

    See Also
    -----
    `_BaseSeCoEstimator`
    """

    algorithm_config: Type[SeCoAlgorithmConfiguration]

    # TODO: _BaseSeCoEstimator.export_text equivalent inverting binarization & target transformation for display

    def _more_tags(self):
        # tell sklearn >= 0.21 that we can handle categorical data
        return {'X_types': ['2darray', 'categorical'], 'allow_nan': True}

    def __init__(self, multi_class=None, random_state=1, n_jobs=1):
        self.multi_class = multi_class
        self.random_state = random_state
        self.n_jobs = n_jobs

    def fit(self, X, y, **kwargs):
        """Learn SeCo theory/theories on training data `X, y`.

        For possible parameters (`**kwargs`), refer to
        :class:`_BaseSeCoEstimator`.
        """
        X, y = check_X_y(X, y, force_all_finite='allow-nan')
        self.multi_class_ = self.multi_class
        self.base_estimator_ = _BaseSeCoEstimator(
            self.algorithm_config, random_state=self.random_state, **kwargs)

        # NOTE: if using multiprocessing (e.g. through OvO or OvR), all
        #   sub-estimators share the same random seed/state.
        #   I think this should not harm.

        def wrapper_ordering_classes_by_size(estimator):
            # BySizeLabelEncoder ensures:  first class = default = biggest
            # and that classes form an integer range [0..n_classes_)
            return TargetTransformingMetaEstimator(BySizeLabelEncoder(),
                                                   estimator)

        self.classes_ = np.unique(y)
        n_classes_ = self.classes_.size
        if n_classes_ == 1:
            raise ValueError("SeCoEstimator requires 2 or more distinct "
                             "classes. Only 1 class (%s) present." %
                             self.classes_[0])
        elif n_classes_ == 2:
            self.base_estimator_ = wrapper_ordering_classes_by_size(
                self.base_estimator_)
        else:  # n_classes_ > 2
            if self.multi_class_ is None:
                # default / auto-selection
                if self.algorithm_config.direct_multiclass_support():
                    self.multi_class_ = "direct"
                else:
                    self.multi_class_ = "one_vs_rest"

            if callable(self.multi_class_):
                self.base_estimator_ = self.multi_class_(self.base_estimator_)
            elif self.multi_class_ == "one_vs_rest":
                self.base_estimator_ = OneVsRestClassifier(
                    self.base_estimator_, n_jobs=self.n_jobs)
            elif self.multi_class_ == "one_vs_one":
                self.base_estimator_ = OneVsOneClassifier(self.base_estimator_,
                                                          n_jobs=self.n_jobs)
            elif self.multi_class_ == "direct":
                # TODO: if self.multi_class=='direct' (not `None` auto-detect), only assertion prevents binary-only learner to silently learn on multiclass training data
                self.base_estimator_ = wrapper_ordering_classes_by_size(
                    self.base_estimator_)
            else:
                raise ValueError("Unknown multi-class mode %s" %
                                 self.multi_class_)

        # NOTE: param categorical_features is data dependent, but OvR/OvO don't
        #   pass extra parameters through fit(), so it has to be in
        #   `_BaseSeCoEstimator.__init__`.
        self.base_estimator_.fit(X, y)
        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self, ["classes_"])
        X = check_array(X, force_all_finite='allow-nan')
        return self.base_estimator_.predict(X)

    @if_delegate_has_method('base_estimator_')
    def predict_proba(self, X):
        # noinspection PyUnresolvedReferences
        return self.base_estimator_.predict_proba(X)

    @if_delegate_has_method('base_estimator_')
    def decision_function(self, X):
        # noinspection PyUnresolvedReferences
        return self.base_estimator_.decision_function(X)

    def get_seco_estimators(self) -> Sequence[_BaseSeCoEstimator]:
        """
        :return: The `_BaseSeCoEstimator` instances that were trained.
            Depending on the multi-class strategy, the class labels they use
            differ in order and value.
            Cannot be used when self.multi_class_ is a callable.
        """
        check_is_fitted(self, 'base_estimator_')
        is_binary = len(self.classes_) == 2
        if is_binary or self.multi_class_ == "direct":
            assert isinstance(self.base_estimator_,
                              TargetTransformingMetaEstimator)
            return [self.base_estimator_.estimator]
        elif self.multi_class_ == "one_vs_rest":
            assert isinstance(self.base_estimator_, OneVsRestClassifier)
            return self.base_estimator_.estimators_
        elif self.multi_class_ == "one_vs_one":
            assert isinstance(self.base_estimator_, OneVsOneClassifier)
            return self.base_estimator_.estimators_
        else:
            assert False, "invalid state: unknown type of base_estimator_ " \
                f"({str(self.base_estimator_)})"
X = np.array(X)

# Convert string attributes to numbers: columns that are already numeric (judged from the
# first row) are copied through, the rest are label-encoded
label_encoder = []
X_encoded = np.empty(X.shape)
for i, item in enumerate(X[0]):
    if item.isdigit():
        X_encoded[:, i] = X[:, i]
    else:
        label_encoder.append(preprocessing.LabelEncoder())
        X_encoded[:, i] = label_encoder[-1].fit_transform(X[:, i])

X = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)

classifier = OneVsOneClassifier(LinearSVC(random_state=0))
classifier.fit(X, y)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=5)
classifier = OneVsOneClassifier(LinearSVC(random_state=0))
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)

f1 = cross_val_score(classifier, X, y, scoring='f1_weighted', cv=3)
print('F1 score : ' + str(round(100 * f1.mean(), 2)) + '%')

input_data = [
    '37', 'Private', '215646', 'HS-grad', '9', 'Never-married',
    'Handlers-cleaners', 'Not-in-family', 'White', 'Male', '0', '0', '40',