def train_classifier(clf, X_train, y_train, X_test, y_test):
	clf = OneVsOneClassifier(clf)
	t0 = time()
	clf.fit(X_train, y_train)
	train_time = time() - t0
	print("train time: %0.3fs" % train_time)
	return clf
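# A minimal usage sketch for the helper above (assumptions: scikit-learn is
# available and iris stands in for the real data; the unused X_test/y_test
# arguments are kept only to match the original signature):
from time import time
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
fitted = train_classifier(LinearSVC(), X_train, y_train, X_test, y_test)
print("test accuracy: %0.3f" % fitted.score(X_test, y_test))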
Example #2
def svm_classification(genres, features_type):
	training_set_features = tf.read_features_from_files("../../music/training", genres, features_type)
	testing_set_features = tf.read_features_from_files("../../music/testing", genres, features_type)

	X = []
	y = []
	for feature in training_set_features:
		(mean, cov_mat, genre_name) = feature
		X.append(mean.tolist())
		y.append(tf.get_genre_ID(genre_name))

	training_data = np.array(X)
	training_class = np.array(y)

	X = []
	y = []
	for feature in testing_set_features:
		(mean, cov_mat, genre_name) = feature
		X.append(mean.tolist())
		y.append(tf.get_genre_ID(genre_name))

	testing_data = np.array(X)
	testing_class = np.array(y)


	clf = OneVsOneClassifier(SVC(kernel='linear'))
	result_class = np.array(clf.fit(training_data, training_class).predict(testing_data))

	rt.print_accuracy(list(testing_class), list(result_class), genres, features_type, "svm")
	rt.write_accuracy_to_file("../../music/", list(testing_class), list(result_class), genres, features_type, "svm")
Example #3
def svm_training(train_X, train_Y, kernel):
	if kernel == False:
		clf = OneVsOneClassifier(svm.LinearSVC(random_state=0))
	else:
		clf = OneVsOneClassifier(svm.SVC(kernel='rbf'))
	clf.fit(train_X, train_Y)
	return clf
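# A usage sketch for the flag above (assumption: `kernel` is a boolean switch
# between a linear and an RBF model; iris stands in for the real data):
from sklearn import svm
from sklearn.datasets import load_iris
from sklearn.multiclass import OneVsOneClassifier

X, y = load_iris(return_X_y=True)
linear_clf = svm_training(X, y, kernel=False)   # OvO over LinearSVC
rbf_clf = svm_training(X, y, kernel=True)       # OvO over SVC(kernel='rbf')
print(linear_clf.score(X, y), rbf_clf.score(X, y))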
Example #4
def test_ovo_ties():
    # test that ties are broken using the decision function, not defaulting to
    # the smallest label
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y = np.array([2, 0, 1, 2])
    multi_clf = OneVsOneClassifier(Perceptron())
    ovo_prediction = multi_clf.fit(X, y).predict(X)

    # recalculate votes to make sure we have a tie
    predictions = np.vstack([clf.predict(X) for clf in multi_clf.estimators_])
    scores = np.vstack([clf.decision_function(X)
                        for clf in multi_clf.estimators_])
    # classifiers are in order 0-1, 0-2, 1-2
    # aggregate votes:
    votes = np.zeros((4, 3))
    votes[np.arange(4), predictions[0]] += 1
    votes[np.arange(4), 2 * predictions[1]] += 1
    votes[np.arange(4), 1 + predictions[2]] += 1
    # for the first point, there is one vote per class
    assert_array_equal(votes[0, :], 1)
    # for the rest, there is no tie and the prediction is the argmax
    assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
    # for the tie, the prediction is the class with the highest score
    assert_equal(ovo_prediction[0], 0)
    # in the zero-one classifier, the score for 0 is greater than the score for
    # one.
    assert_greater(scores[0][0], scores[0][1])
    # score for one is greater than score for zero
    assert_greater(scores[2, 0] - scores[0, 0], scores[0, 0] + scores[1, 0])
    # score for one is greater than score for two
    assert_greater(scores[2, 0] - scores[0, 0], -scores[1, 0] - scores[2, 0])
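# A generic version of the vote bookkeeping above -- a sketch, assuming (as
# the comment notes) that estimators_ iterates over class pairs (i, j) with
# i < j and that each binary estimator predicts 0 for class i and 1 for j:
import numpy as np
from itertools import combinations

def ovo_votes(ovo, X):
    n_classes = len(ovo.classes_)
    votes = np.zeros((X.shape[0], n_classes))
    for est, (i, j) in zip(ovo.estimators_, combinations(range(n_classes), 2)):
        pred = est.predict(X)
        votes[pred == 0, i] += 1  # a 0-prediction is a vote for class i
        votes[pred == 1, j] += 1  # a 1-prediction is a vote for class j
    return votes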
Example #5
def gen_svc(train_model):
    '''Given a training model, generates the SVM (and DictVectorizer) for it

    Args: 
        train_model: a training model object. should have 2 attributes:
        feature_lists, a map from POS tag to a dictionary of features
        (the ones used in the ith decision), and action_lists, a map from
        POS tag to the action (Shift, Left, Right) chosen for the ith decision
    Returns: dictionary mapping POS tag to a vectorizer, SVM tuple
    Raises: None
    '''
    models = {}
    for pos_tag in train_model.feature_lists:
        vec = DictVectorizer()
        feature_mat = vec.fit_transform(train_model.feature_lists[pos_tag])
        trained_svc = OneVsOneClassifier(LinearSVC())
        try:
            trained_svc.fit(feature_mat, np.array(train_model.action_lists[pos_tag]))
        except ValueError:
            # occasionally we get the same action for everything with a
            # particular POS, which raises an error. so in that case we just
            # use a custom class that always predicts the same action
            trained_svc = AlwaysPredict(train_model.feature_lists[pos_tag][0])
        models[pos_tag] = (vec, trained_svc)
    return models
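# `AlwaysPredict` is defined elsewhere in the source project. A minimal,
# hypothetical stand-in consistent with the comment ("always predicts the
# same action") could look like the sketch below; note the real class is
# constructed from the first feature dict, so its actual logic likely differs:
class AlwaysPredict:
    def __init__(self, action):
        self.action = action  # the single action to return for every input

    def predict(self, feature_mat):
        # one constant prediction per input row
        return [self.action] * feature_mat.shape[0]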
Example #6
def gen_svc(train_model):
    '''Given a training model, generates the SVM (and DictVectorizer) for it'''
    vec = DictVectorizer()
    feature_mat = vec.fit_transform(train_model.feature_list)
    # for some reason just SVC() seems to always suggest "Shift"
    trained_svc = OneVsOneClassifier(LinearSVC())
    trained_svc.fit(feature_mat, np.array(train_model.action_list))
    return vec, trained_svc
Example #7
def test_ovo_fit_on_list():
    # Test that OneVsOne fitting works with a list of targets and yields the
    # same output as predict from an array
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    prediction_from_array = ovo.fit(iris.data, iris.target).predict(iris.data)
    prediction_from_list = ovo.fit(iris.data,
                                   list(iris.target)).predict(iris.data)
    assert_array_equal(prediction_from_array, prediction_from_list)
Example #8
def test_ovo_string_y():
    # Test that the OvO doesn't mess up the encoding of string labels
    X = np.eye(4)
    y = np.array(['a', 'b', 'c', 'd'])

    ovo = OneVsOneClassifier(LinearSVC())
    ovo.fit(X, y)
    assert_array_equal(y, ovo.predict(X))
Example #9
def test_ovo_string_y():
    "Test that the OvO doesn't screw the encoding of string labels"
    X = np.eye(4)
    y = np.array(['a', 'b', 'c', 'd'])

    svc = LinearSVC()
    ovo = OneVsOneClassifier(svc)
    ovo.fit(X, y)
    assert_array_equal(y, ovo.predict(X))
Example #10
def OneVsOne(inputs_train, inputs_valid, target_train, target_valid):
	name = "Multiclass One Vs One"
	clf = OneVsOneClassifier(LinearSVC(random_state=0))
	clf.fit(inputs_train, np.ravel(target_train))
	prediction = clf.predict(inputs_valid)
	correct = np.count_nonzero(np.ravel(target_valid) == prediction)
	total = target_valid.shape[0]
	correctRate = (float(correct)/total)*100

	return name, correctRate
Example #11
def test_ovo_fit_predict():
    # A classifier which implements decision_function.
    ovo = OneVsOneClassifier(LinearSVC())
    pred = ovo.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(ovo.estimators_), n_classes * (n_classes - 1) / 2)

    # A classifier which implements predict_proba.
    ovo = OneVsOneClassifier(MultinomialNB())
    pred = ovo.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(ovo.estimators_), n_classes * (n_classes - 1) / 2)
Example #12
 def __init__(self, estimator, n_jobs=-1, n_neighbors=18, radius=1.0,
              algorithm='auto', leaf_size=30, metric='minkowski',
              p=2, threshold=0.2, metric_params=None):
     OneVsOneClassifier.__init__(self, estimator, n_jobs)
     self.nbrs = NearestNeighbors(n_neighbors=n_neighbors, radius=radius, algorithm=algorithm,
                                  leaf_size=leaf_size, metric=metric, p=p,
                                  metric_params=metric_params, n_jobs=n_jobs)
     self.n_neighbors = n_neighbors
     self.threshold = threshold
     self._fit_y = None
Example #13
def test_ovo_ties2():
    # test that ties can not only be won by the first two labels
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y_ref = np.array([2, 0, 1, 2])

    # cycle through labels so that each label wins once
    for i in range(3):
        y = (y_ref + i) % 3
        multi_clf = OneVsOneClassifier(Perceptron())
        ovo_prediction = multi_clf.fit(X, y).predict(X)
        assert_equal(ovo_prediction[0], i % 3)
Example #14
    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y, multi_output=False)

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            self.kernel, self.optimizer, self.n_restarts_optimizer,
            self.max_iter_predict, self.warm_start, self.copy_X_train,
            self.random_state)

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size
        if self.n_classes_ == 1:
            raise ValueError("GaussianProcessClassifier requires 2 or more "
                             "distinct classes; got %d class (only class %s "
                             "is present)"
                             % (self.n_classes_, self.classes_[0]))
        if self.n_classes_ > 2:
            if self.multi_class == "one_vs_rest":
                self.base_estimator_ = \
                    OneVsRestClassifier(self.base_estimator_,
                                        n_jobs=self.n_jobs)
            elif self.multi_class == "one_vs_one":
                self.base_estimator_ = \
                    OneVsOneClassifier(self.base_estimator_,
                                       n_jobs=self.n_jobs)
            else:
                raise ValueError("Unknown multi-class mode %s"
                                 % self.multi_class)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            self.log_marginal_likelihood_value_ = np.mean(
                [estimator.log_marginal_likelihood()
                 for estimator in self.base_estimator_.estimators_])
        else:
            self.log_marginal_likelihood_value_ = \
                self.base_estimator_.log_marginal_likelihood()

        return self
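# A usage sketch for the multi-class branches above (scikit-learn's public
# GaussianProcessClassifier exposes the same `multi_class` switch; iris is a
# stand-in dataset):
from sklearn.datasets import load_iris
from sklearn.gaussian_process import GaussianProcessClassifier

X, y = load_iris(return_X_y=True)
gpc = GaussianProcessClassifier(multi_class="one_vs_one").fit(X, y)
print(gpc.classes_, gpc.log_marginal_likelihood_value_)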
Example #15
    def test_multicluster(self):
        c = BinaryTiloClassifier(PinchRatioCutStrategy(),
                                 similarity.Gaussian())
        ##c = BinaryTiloClassifier(similarity.KNN())
        ##mcc = OneVsRestClassifier(c)
        mcc = OneVsOneClassifier(c)
        data = self.three_class_pts
        classes = self.three_class_labels

        perturbed_data = data + 0.01 * np.random.random(data.shape)
        fitted = mcc.fit(perturbed_data, classes)
        guesses = fitted.predict(perturbed_data)
        assert_array_equal(guesses, classes)
Example #16
def svm(X, Y):
    X_train = np.array([x for i, x in enumerate(X) if i % 7 != 0], dtype = np.uint8)
    y_train = np.array([z for i, z in enumerate(Y) if i % 7 != 0], dtype = np.uint8)
    X_test  = np.array([x for i, x in enumerate(X) if i % 10 == 0], dtype = np.uint8)
    y_test  = np.array([z for i, z in enumerate(Y) if i % 10 == 0], dtype = np.uint8)

    clf = OneVsOneClassifier(LinearSVC(random_state=0))
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)

    results = [prediction == truth for prediction, truth in zip(y_predicted, y_test)]
    accuracy = float(results.count(True)) / float(len(results))
    print accuracy
Example #17
def test_pairwise_indices():
    clf_precomputed = svm.SVC(kernel="precomputed")
    X, y = iris.data, iris.target

    ovr_false = OneVsOneClassifier(clf_precomputed)
    linear_kernel = np.dot(X, X.T)
    ovr_false.fit(linear_kernel, y)

    n_estimators = len(ovr_false.estimators_)
    precomputed_indices = ovr_false.pairwise_indices_

    for idx in precomputed_indices:
        assert_equal(idx.shape[0] * n_estimators / (n_estimators - 1), linear_kernel.shape[0])
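# A sketch of how pairwise_indices_ would be used at prediction time (an
# assumption drawn from the test above): with a precomputed kernel, each
# binary estimator was fitted only on its two classes' samples, so a
# test-vs-train kernel has to be sliced to those columns first.
import numpy as np
from sklearn import svm
from sklearn.datasets import load_iris
from sklearn.multiclass import OneVsOneClassifier

X, y = load_iris(return_X_y=True)
ovo = OneVsOneClassifier(svm.SVC(kernel="precomputed"))
ovo.fit(np.dot(X, X.T), y)

K_test = np.dot(X, X.T)  # here the "test" points are just the training set
for est, idx in zip(ovo.estimators_, ovo.pairwise_indices_):
    sub_pred = est.predict(K_test[:, idx])  # 0/1 votes for that class pair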
Example #18
class ClassifierOvOAsFeatures:
    """
    A transformation that essentially implements a form of dimensionality
    reduction.
    This class uses a fast SGDClassifier configured like a linear SVM to produce
    a vector of decision functions separating target classes in a
    one-versus-one fashion.
    It's useful for reducing a high-dimensional bag-of-words feature set into
    features that are richer in information.
    """
    def fit(self, X, y):
        """
        `X` is expected to be an array-like or a sparse matrix.
        `y` is expected to be an array-like containing the classes to learn.
        """
        self.classifier = OneVsOneClassifier(SGDClassifier(),n_jobs=-1).fit(X,numpy.array(y))
        return self

    def transform(self, X, y=None):
        """
        `X` is expected to be an array-like or a sparse matrix.
        It returns a dense matrix of shape (n_samples, m_features) where
            m_features = (n_classes * (n_classes - 1)) / 2
        """
        return self.classifier.decision_function(X)
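# A usage sketch for the class above (assumptions: dense iris features as a
# stand-in for bag-of-words; note that with n_classes = 3 the number of
# output columns is 3 whichever way decision_function aggregates):
import numpy
from sklearn.datasets import load_iris
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsOneClassifier

X, y = load_iris(return_X_y=True)
ovo_features = ClassifierOvOAsFeatures().fit(X, y)
print(ovo_features.transform(X).shape)  # (150, 3)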
Example #19
 def fit(self, X, y):
     """
     `X` is expected to be an array-like or a sparse matrix.
     `y` is expected to be an array-like containing the classes to learn.
     """
     self.classifier = OneVsOneClassifier(SGDClassifier(),n_jobs=-1).fit(X,numpy.array(y))
     return self
Example #20
def trainOneVsOne2(histograms):

    xAll = convertToSvmFormatFeature(histograms)
    scaleParam = computeScaleParameters(xAll)
    scaleFeatureData(xAll,scaleParam)

    xAll = np.array(xAll)

    yAll = [ x['label'] for x in histograms ]
    yAll = np.array(yAll)

    # svm = OneVsOneClassifier(LinearSVC(random_state=0,dual=svm_conf['dual'],C=svm_conf['C']))
    gammaBase = 1.0/kmeans_conf['K']
    # svm = OneVsOneClassifier(sklearn.svm.SVC(C=100, gamma=10*gammaBase,kernel='rbf'))
    svm = OneVsOneClassifier(sklearn.svm.SVC(C=1000, gamma=gammaBase,kernel='sigmoid'))
    svm.fit(xAll,yAll)

    out = {'scaleParam':scaleParam,'svm':svm}
    return out
Example #21
def multiclassSVC(classifier, sz=2000):

    mnsize = sz
    df = hw6u.load_mnist_features(mnsize)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test)
    print 'Beginning analysis: {}'.format(X.shape)
    #clf = OneVsRestClassifier(classifier, n_jobs=4).fit(X, y)
    clf = OneVsOneClassifier(classifier).fit(X, y)
    #clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=10, random_state=0).fit(np.asarray(X), y)
    y_pred = clf.predict(X)
    print 'train acc: {} test acc: {}'.format(accuracy_score(fix_y(y_pred), fix_y(y)), accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test))))
    print 'train acc: {} test acc: {}'.format(accuracy_score(fix_y(clf.predict(X)), fix_y(y)), accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test))))
Example #22
def test_ovo_ties():
    # Test that ties are broken using the decision function,
    # not defaulting to the smallest label
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y = np.array([2, 0, 1, 2])
    multi_clf = OneVsOneClassifier(Perceptron())
    ovo_prediction = multi_clf.fit(X, y).predict(X)
    ovo_decision = multi_clf.decision_function(X)

    # Classifiers are in order 0-1, 0-2, 1-2
    # Use decision_function to compute the votes and the normalized
    # sum_of_confidences, which is used to disambiguate when there is a tie in
    # votes.
    votes = np.round(ovo_decision)
    normalized_confidences = ovo_decision - votes

    # For the first point, there is one vote per class
    assert_array_equal(votes[0, :], 1)
    # For the rest, there is no tie and the prediction is the argmax
    assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
    # For the tie, the prediction is the class with the highest score
    assert_equal(ovo_prediction[0], normalized_confidences[0].argmax())
Example #23
def learn(cat1, cat2, cat3):
    X = []
    Y = []
    IDF = get_IDF([cat1, cat2, cat3])
    for d in cat1:
        X.append(MapToEvalVS(d, IDF))
        Y.append(0)
    for d in cat2:
        X.append(MapToEvalVS(d, IDF))
        Y.append(1)
    for d in cat3:
        X.append(MapToEvalVS(d, IDF))
        Y.append(2)

    X = np.array(X)
    Y = np.array(Y)
    #clf = svm.SVC(verbose=True)
    #clf=svm.SVC()
    clf = OneVsOneClassifier(svm.SVC())
    #clf=KNeighborsClassifier(weights='distance')
    clf.fit(X, Y)
    return [clf,IDF]
Example #24
def test_ovo_decision_function():
    n_samples = iris.data.shape[0]

    ovo_clf = OneVsOneClassifier(LinearSVC(random_state=0))
    ovo_clf.fit(iris.data, iris.target)
    decisions = ovo_clf.decision_function(iris.data)

    assert_equal(decisions.shape, (n_samples, n_classes))
    assert_array_equal(decisions.argmax(axis=1), ovo_clf.predict(iris.data))

    # Compute the votes
    votes = np.zeros((n_samples, n_classes))

    k = 0
    for i in range(n_classes):
        for j in range(i + 1, n_classes):
            pred = ovo_clf.estimators_[k].predict(iris.data)
            votes[pred == 0, i] += 1
            votes[pred == 1, j] += 1
            k += 1

    # Extract votes and verify
    assert_array_equal(votes, np.round(decisions))

    for class_idx in range(n_classes):
        # For each sample and each class, there are only 3 possible vote
        # levels, because there are only 3 distinct class pairs and thus 3
        # distinct binary classifiers.
        # Therefore, sorting predictions based on votes would yield
        # mostly tied predictions:
        assert_true(set(votes[:, class_idx]).issubset(set([0., 1., 2.])))

        # The OVO decision function on the other hand is able to resolve
        # most of the ties on this data as it combines both the vote counts
        # and the aggregated confidence levels of the binary classifiers
        # to compute the aggregate decision function. The iris dataset
        # has 150 samples with a couple of duplicates. The OvO decisions
        # can resolve most of the ties:
        assert_greater(len(np.unique(decisions[:, class_idx])), 146)
Example #25
def multiclass_SVC(X, y):

    from sklearn.svm import LinearSVC

    from sklearn import cross_validation

    # first move: split data
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.35)

    # one-vs-rest implementation
    from sklearn.multiclass import OneVsRestClassifier

    ovr = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train, y_train)

    # one-vs-one implementation
    from sklearn.multiclass import OneVsOneClassifier

    ovo = OneVsOneClassifier(LinearSVC(random_state=0)).fit(X_train, y_train)

    one_vs_rest = ovr.score(X_test, y_test)
    one_vs_one = ovo.score(X_test, y_test)

    return one_vs_rest, one_vs_one
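# Note: `sklearn.cross_validation` was removed in scikit-learn 0.20; on newer
# versions the equivalent import for train_test_split is:
from sklearn.model_selection import train_test_split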
Example #26
    def analysis(self, testanalysis=True):
        if testanalysis:
            trainingdata, testdata = self.getTrainTestData()
        else:
            trainingdata, testdata = self.getRealData()

        aDict = {}
        for value in trainingdata:
            phrase = value.Phrase

            phrase = phrase.strip()

            aDict[phrase] = value.Sentiment

        _all_values = aDict.keys()
        _all_sentiments = aDict.values()

        # self.KFOLDTEST(np.asarray(_all_values), np.asarray(_all_sentiments))

        count_vectorizer = CountVectorizer(ngram_range=(1, 2), tokenizer=self.tokenize_data)
        count = count_vectorizer.fit_transform(_all_values)

        # self.countWordFreq(count_vectorizer, count)

        tfidf = TfidfTransformer(norm="l2", smooth_idf=False, use_idf=False)
        data = tfidf.fit_transform(count)

        classifier = OneVsOneClassifier(LinearSVC())
        classifier.fit(data, np.asarray(_all_sentiments))

        # To write the results to a CSV instead, train on the entire training
        # set as the real data (via getRealData) and take the else branch below.
        if testanalysis:
            self.normalexecution(testdata, count_vectorizer, tfidf, classifier)
        else:
            self.writeToFile(testdata, count_vectorizer, tfidf, classifier)
Example #27
    def fit(self, X, y):
        """Check target values and fit model."""
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        if n_classes < 2:
            raise ValueError("Need 2 or more classes.")
        elif n_classes == 2:
            self.t = np.zeros(y.shape)
            self.t[y == self.classes_[1]] = 1
            return super(RVC, self).fit(X, self.t)
        else:
            self.multi_ = None
            self.multi_ = OneVsOneClassifier(self)
            self.multi_.fit(X, y)
            return self
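# The "wrap self" trick above is reusable: an estimator that only handles
# binary targets can delegate multiclass input to OneVsOneClassifier(self),
# because the meta-estimator clones `self` per class pair so every clone only
# ever sees two classes. A minimal self-contained sketch (assumed names;
# LogisticRegression stands in for the binary model):
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier

class BinaryOnlyClassifier(BaseEstimator, ClassifierMixin):
    def fit(self, X, y):
        self.classes_ = np.unique(y)
        if len(self.classes_) > 2:
            # each OvO clone is fitted on a binary subproblem
            self.multi_ = OneVsOneClassifier(self).fit(X, y)
            return self
        self.multi_ = None
        self.inner_ = LogisticRegression().fit(X, y)
        return self

    def decision_function(self, X):
        return self.inner_.decision_function(X)

    def predict(self, X):
        if self.multi_ is not None:
            return self.multi_.predict(X)
        return self.inner_.predict(X)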
Example #28
 def __init__(self, slack=1, gamma=1, kernelType='linear', gram=1):
     self.gram = gram
     self.slack = slack
     self.gamma = gamma
     self.kernelType = kernelType
     self.data = np.ones((1000, 1000))
     self.cityClassifier = {}
     #TODO: Why use OneVsOne and not OneVsRest? Wouldn't OneVsRest be faster?
     self.countryClassifier = OneVsOneClassifier(
         svm.SVC(kernel=self.kernelType, C=self.slack, gamma=self.gamma, probability=False,
                 cache_size=1000))
     self.bag = None
     self.numberOfFeatures = 0
     #Features and labels
     self.fitting_data = None
     self.predict_data = None
     self.cityPrediction = {}
     self.countryPrediction = None
     self.numberOfCityFeatures = {}
Example #29
class LSVMDetector:
    # just the training() function changes, rest all remains same.

    def __init__(self, subjects, data, attacker_data):
        self.data = data
        self.attacker = attacker_data
        self.u_scores = []
        self.i_scores = []
        self.mean_vector = []
        self.subjects = subjects
        self.fp = []

    def training(self):
        self.clf = OneVsOneClassifier(SVC(kernel='rbf', gamma='auto'))
        labels = [0] * len(self.train) + [1] * len(self.train_imposter)
        self.clf.fit(pandas.concat([self.train, self.train_imposter]), labels)

    def testing(self):
        self.u_scores = self.clf.decision_function(self.test_genuine)
        self.i_scores = self.clf.decision_function(self.test_imposter)
        self.u_scores = list(self.u_scores)
        self.i_scores = list(self.i_scores)

    def evaluate(self):
        eers = []
        fpr = []

        if isinstance(self.subjects, list):
            for idx, subject in enumerate(self.subjects):
                genuine_user_data = self.data.loc[self.data.user_id == subject, \
                                                  ["stroke duration", 'start $x$', 'start $y$', 'stop $x$', 'stop $y$',
                                                   'length of trajectory', 'mid-stroke pressure',
                                                   'mid-stroke area covered',
                                                   '20\%-perc. pairwise velocity', '50\%-perc. pairwise velocity',
                                                   '20\%-perc. dev. from end-to-end line',
                                                   '50\%-perc. dev. from end-to-end line',
                                                   '80\%-perc. dev. from end-to-end line']]
                imposter_data = self.data.loc[self.data.user_id != subject, :]
                # generated_data = attacker_data
                genuine_user_data = normalize_df(genuine_user_data[:400])

                self.train = genuine_user_data[:200]
                self.test_genuine = genuine_user_data[200:400]

                # self.test_imposter = normalize_np(self.attacker[idx])
                # self.test_imposter = normalize_df(imposter_data.groupby("user_id"). \
                #                                    head(10).loc[:,
                #                                    ["stroke duration", 'start $x$', 'start $y$', 'stop $x$', 'stop $y$',
                #                                     'length of trajectory', 'mid-stroke pressure',
                #                                     'mid-stroke area covered',
                #                                     '20\%-perc. pairwise velocity', '50\%-perc. pairwise velocity',
                #                                     '20\%-perc. dev. from end-to-end line',
                #                                     '50\%-perc. dev. from end-to-end line',
                #                                     '80\%-perc. dev. from end-to-end line']])
                self.train_imposter = normalize_df(imposter_data.groupby("user_id"). \
                                         tail(10).loc[:, ["stroke duration", 'start $x$', 'start $y$', 'stop $x$', 'stop $y$',
                                                         'length of trajectory', 'mid-stroke pressure', 'mid-stroke area covered',
                                                         '20\%-perc. pairwise velocity', '50\%-perc. pairwise velocity',
                                                         '20\%-perc. dev. from end-to-end line', '50\%-perc. dev. from end-to-end line',
                                                         '80\%-perc. dev. from end-to-end line']])

                self.test_imposter = self.attacker[idx]

                self.training()
                self.testing()
                # eers.append(evaluateEER(self.u_scores, \
                #                         self.i_scores))
                fpr.append(evaluateFAR(self.u_scores, self.i_scores))
                # print(evaluateFAR(self.u_scores, self.i_scores))

        else:
            genuine_user_data = self.data.loc[self.data.user_id == self.subjects, \
                                              ["stroke duration", 'start $x$', 'start $y$', 'stop $x$',
                                               'stop $y$',
                                               'length of trajectory', 'mid-stroke pressure',
                                               'mid-stroke area covered',
                                               '20\%-perc. pairwise velocity', '50\%-perc. pairwise velocity',
                                               '20\%-perc. dev. from end-to-end line',
                                               '50\%-perc. dev. from end-to-end line',
                                               '80\%-perc. dev. from end-to-end line']]
            imposter_data = self.data.loc[
                self.data.user_id != self.subjects, :]
            # generated_data = attacker_data
            genuine_user_data = normalize_df(genuine_user_data[:400])

            self.train = genuine_user_data[:200]
            self.test_genuine = genuine_user_data[200:400]
            # self.test_imposter = imposter_data.groupby("subject"). \
            #                          tail(6).loc[:, "H.period":"H.Return"]
            # self.test_imposter = normalize_np(self.attacker)
            self.train_imposter = normalize_df(imposter_data.groupby("user_id"). \
                                         tail(10).loc[:, ["stroke duration", 'start $x$', 'start $y$', 'stop $x$', 'stop $y$',
                                                         'length of trajectory', 'mid-stroke pressure', 'mid-stroke area covered',
                                                         '20\%-perc. pairwise velocity', '50\%-perc. pairwise velocity',
                                                         '20\%-perc. dev. from end-to-end line', '50\%-perc. dev. from end-to-end line',
                                                         '80\%-perc. dev. from end-to-end line']])
            self.test_imposter = self.attacker

            self.training()
            self.testing()
            # eers.append(evaluateEER(self.u_scores, \
            #                        self.i_scores))
            fpr.append(evaluateFAR(self.u_scores, self.i_scores))

        return np.mean(fpr)
Example #30
 def training(self):
     self.clf = OneVsOneClassifier(SVC(kernel='rbf', gamma='auto'))
     labels = [0] * len(self.train) + [1] * len(self.train_imposter)
     self.clf.fit(pandas.concat([self.train, self.train_imposter]), labels)
Example #31
train_label = np.fromfile("../../mnist/mnist_train/mnist_train_label",
                          dtype=np.uint8)
test_label = np.fromfile("../../mnist/mnist_test/mnist_test_label",
                         dtype=np.uint8)

accuList = []
runtimeList = []
# trying different dimension numbers from 5 to 250
for i in xrange(5, 251, 5):
    # Dimensionality reduction to target dimension number
    pca = PCA(n_components=i)
    train_data_reduce = pca.fit_transform(train_data)
    test_data_reduce = pca.transform(test_data)
    print 'dim:{}'.format(i),
    # Using LinearSVC on i-dimensional data
    clf = OneVsOneClassifier(LinearSVC(), n_jobs=20)
    train_time, test_time, accu = run_model(clf, train_data_reduce,
                                            train_label, test_data_reduce,
                                            test_label)
    # print the performance
    print 'train time:{} test time:{} accuracy:{}'.format(
        train_time, test_time, accu)
    runtimeList.append(train_time + test_time)
    accuList.append(accu)

# Plot the accuracy figure.
plt.figure()
plt.plot(np.arange(5, 251, 5), accuList)
plt.xlabel('dimension')
plt.ylabel('accuracy')
plt.savefig('pca-accu.pdf')
Example #32
class RobustWeightedClassifier(BaseEstimator, ClassifierMixin):
    """Algorithm for robust classification using reweighting algorithm.

    This model uses iterative reweighting of samples to make a regression or
    classification estimator robust.

    The principle of the algorithm is empirical risk minimization where the
    risk is estimated using a robust estimator (for example the Huber
    estimator or the median-of-means estimator) [1], [3]. The idea behind this
    algorithm was mentioned before in [2].
    It translates into an iterative algorithm where the sample weights are
    updated at each iteration and depend on the sample. Informally, outliers
    should get a small weight while inliers get a large weight, where outliers
    are samples with a large loss.

    This algorithm enjoys a non-zero breakdown point (it can handle
    arbitrarily bad outliers). When the "mom" weighting scheme is used, k
    outliers can be tolerated. When the "huber" weighting scheme is used,
    asymptotically the number of outliers has to be less than half the
    sample size.

    Read more in the :ref:`User Guide <robust>`.

    Parameters
    ----------

    weighting : string, default="huber"
        Weighting scheme used to make the estimator robust.
        Can be 'huber' for huber-type weights or  'mom' for median-of-means
        type weights.

    max_iter : int, default=100
        Maximum number of iterations.
        For more information, see the optimization scheme of base_estimator
        and the eta0 and burn_in parameter.

    burn_in : int, default=10
        Number of steps used without changing the learning rate.
        Can be useful to make the weight estimation better at the beginning.

    eta0 : float, default=0.01
        Constant step-size used during the burn_in period. Used only if
        burn_in>0. Can have a big effect on efficiency.

    c : float>0 or None, default=None
        Parameter used for the Huber weighting procedure, used only if
        weighting is 'huber'. It measures the robustness of the weighting
        procedure: a smaller value of c means a more robust estimator.
        Can have a big effect on efficiency.
        If None, c is estimated at each step using half the inter-quartile
        range; this tends to be conservative (robust).

    k : int < sample_size/2, default=1
        Parameter used for the median-of-means weighting procedure, used only
        if weighting is 'mom'. 2k+1 is the number of blocks used for
        median-of-means estimation; a higher value of k means a more robust
        estimator. Can have a big effect on efficiency.
        If None, k is estimated using the number of points farther from the
        median of means than 2 times a robust estimate of the scale
        (using the inter-quartile range); this tends to be conservative
        (robust).

    loss : string, None or callable, default="log"
        Name of the loss used; must be the same loss as the one optimized in
        base_estimator.
        Classification losses supported: 'log', 'hinge'.
        If 'log', then the base_estimator must support predict_proba.
        Regression losses supported: 'squared_loss'.

    sgd_args : dict, default={}
        arguments of the SGDClassifier base estimator.

    multi_class : string, default="ovr"
        multi-class scheme. Can be either "ovo" for OneVsOneClassifier or "ovr"
        for OneVsRestClassifier or "binary" for binary classification.

    n_jobs : int, default=1
        number of jobs used in the multi-class meta-algorithm computation.

    tol : float or None, (default = 1e-3)
        The stopping criterion. If it is not None, training will stop when
        (loss > best_loss - tol) for n_iter_no_change consecutive epochs.

    n_iter_no_change : int, default=10
        Number of iterations with no improvement to wait before early stopping.

    random_state : int, RandomState instance or None, optional (default=None)
        The seed of the pseudo random number generator to use when shuffling
        the data. If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by np.random.



    Attributes
    ----------

    classes_ : ndarray of shape (n_classes, )
        A list of class labels known to the classifier.

    coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)
        Coefficient of the features in the decision function. Only available if
        multi_class = "binary"

    intercept_ : ndarray of shape (1,) or (n_classes,)
        Intercept (a.k.a. bias) added to the decision function.
        Only available if multi_class = "binary"

    n_iter_ : ndarray of shape (n_classes,) or (1, )
        Actual number of iterations for all classes. If binary or multinomial,
        it returns only 1 element. For liblinear solver, only the maximum
        number of iteration across all classes is given.

    base_estimator_ : object,
        The fitted base estimator SGDClassifier.

    weights_ : array-like, length = n_samples.
        Weight of each sample at the end of the algorithm. Can be used as a
        measure of how much of an outlier a sample is. Only available if
        multi_class = "binary"


    Notes
    -----

    Often, there is a need to use RobustScaler as preprocessing.

    Examples
    --------

    >>> from sklearn_extra.robust import RobustWeightedClassifier
    >>> from sklearn.datasets import make_blobs
    >>> import numpy as np
    >>> rng = np.random.RandomState(42)
    >>> X,y = make_blobs(n_samples=100, centers=np.array([[-1, -1], [1, 1]]),
    ...                  random_state=rng)
    >>> clf=RobustWeightedClassifier()
    >>> _ = clf.fit(X, y)
    >>> score = np.mean(clf.predict(X)==y)

    References
    ----------

    [1] Guillaume Lecué, Matthieu Lerasle and Timothée Mathieu.
        "Robust classification via MOM minimization", Mach Learn 109, (2020).
        https://doi.org/10.1007/s10994-019-05863-6 (2018).
        arXiv:1808.03106

    [2] Christian Brownlees, Emilien Joly and Gábor Lugosi.
        "Empirical risk minimization for heavy-tailed losses", Ann. Statist.
        Volume 43, Number 6 (2015), 2507-2536.

    [3] Stanislav Minsker and Timothée Mathieu.
        "Excess risk bounds in robust empirical risk minimization"
        arXiv preprint (2019). arXiv:1910.07485.

    """

    def __init__(
        self,
        weighting="huber",
        max_iter=100,
        burn_in=10,
        eta0=0.01,
        c=None,
        k=0,
        loss="log",
        sgd_args=None,
        multi_class="ovr",
        n_jobs=1,
        tol=1e-3,
        n_iter_no_change=10,
        random_state=None,
    ):
        self.weighting = weighting
        self.max_iter = max_iter
        self.burn_in = burn_in
        self.eta0 = eta0
        self.c = c
        self.k = k
        self.loss = loss
        self.sgd_args = sgd_args
        self.multi_class = multi_class
        self.n_jobs = n_jobs
        self.tol = tol
        self.n_iter_no_change = n_iter_no_change
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the model to data matrix X and target(s) y.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            The input data.

        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers in
            regression).

        Returns
        -------
        self : returns an estimator trained with RobustWeightedClassifier.
        """

        if self.sgd_args is None:
            sgd_args = {}
        else:
            sgd_args = self.sgd_args

        # Define the base estimator
        base_robust_estimator_ = _RobustWeightedEstimator(
            SGDClassifier(**sgd_args, loss=self.loss),
            weighting=self.weighting,
            loss=self.loss,
            burn_in=self.burn_in,
            c=self.c,
            k=self.k,
            eta0=self.eta0,
            max_iter=self.max_iter,
            tol=self.tol,
            n_iter_no_change=self.n_iter_no_change,
            random_state=self.random_state,
        )

        if self.multi_class == "ovr":
            self.base_estimator_ = OneVsRestClassifier(
                base_robust_estimator_, n_jobs=self.n_jobs
            )
        elif self.multi_class == "binary":
            self.base_estimator_ = base_robust_estimator_
        elif self.multi_class == "ovo":
            self.base_estimator_ = OneVsOneClassifier(
                base_robust_estimator_, n_jobs=self.n_jobs
            )
        else:
            raise ValueError("No such multiclass method implemented.")

        self.base_estimator_.fit(X, y)
        if self.multi_class == "binary":
            self.weights_ = self.base_estimator_.weights_
            self.coef_ = self.base_estimator_.coef_
            self.intercept_ = self.base_estimator_.intercept_
        self.n_iter_ = self.max_iter * len(X)
        self.classes_ = self.base_estimator_.classes_
        return self

    def predict(self, X):
        """Predict using the estimator trained with RobustWeightedClassifier.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        Returns
        -------
        y : array-like, shape (n_samples, n_outputs)
            The predicted values.
        """

        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.predict(X)

    def _check_proba(self):
        if self.loss != "log":
            raise AttributeError(
                "Probability estimates are not available for"
                " loss=%r" % self.loss
            )

    @property
    def predict_proba(self):
        """
        Probability estimates when binary classification.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Vector to be scored, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        Returns
        -------
        T : array-like of shape (n_samples, n_classes)
            Returns the probability of the sample for each class in the model,
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        self._check_proba()
        return self._predict_proba

    def _predict_proba(self, X):
        return self.base_estimator_.predict_proba(X)

    @property
    def _estimator_type(self):
        return self.base_estimator._estimator_type

    def score(self, X, y=None):
        """Returns the score on the given data, using
        ``base_estimator_.score``.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like of shape (n_samples, n_output) or (n_samples,), optional
            Target relative to X for classification or regression;
            None for unsupervised learning.
        Returns
        -------
        score : float
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.score(X, y)

    def decision_function(self, X):
        """Predict using the linear model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)

        Returns
        -------
        array, shape (n_samples,)
           Predicted target values per element in X.
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.decision_function(X)
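# A usage sketch for the "ovo" branch above (it mirrors the docstring example,
# but with three blobs so the OneVsOneClassifier path is actually taken):
import numpy as np
from sklearn.datasets import make_blobs
from sklearn_extra.robust import RobustWeightedClassifier

X, y = make_blobs(n_samples=150, centers=3, random_state=42)
clf = RobustWeightedClassifier(multi_class="ovo").fit(X, y)
print(np.mean(clf.predict(X) == y))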
Example #33
def test_ovo_partial_fit_predict():
    X, y = shuffle(iris.data, iris.target)
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:100], y[:100], np.unique(y))
    ovo1.partial_fit(X[100:], y[100:])
    pred1 = ovo1.predict(X)

    ovo2 = OneVsOneClassifier(MultinomialNB())
    ovo2.fit(X, y)
    pred2 = ovo2.predict(X)
    assert_equal(len(ovo1.estimators_), n_classes * (n_classes - 1) / 2)
    assert_greater(np.mean(y == pred1), 0.65)
    assert_almost_equal(pred1, pred2)

    # Test when mini-batches don't have all target classes
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovo1.partial_fit(iris.data[60:], iris.target[60:])
    pred1 = ovo1.predict(iris.data)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(iris.data, iris.target).predict(iris.data)

    assert_almost_equal(pred1, pred2)
    assert_equal(len(ovo1.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == pred1), 0.65)
Example #34
def all_classifier_models():
    models = []
    metrix = []
    c_report = []
    train_accuracy = []
    test_accuracy = []
    
    models.append(('LogisticRegression', LogisticRegression(solver='liblinear', multi_class='ovr')))
    models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('RandomForestClassifier', RandomForestClassifier(n_estimators=100)))
    models.append(('SVM', SVC(gamma='auto')))
    models.append(('Linear_SVM', LinearSVC()))
    models.append(('XGB', XGBClassifier()))
    models.append(('SGD', SGDClassifier()))
    models.append(('Perceptron', Perceptron()))
    models.append(('ExtraTreeClassifier', ExtraTreeClassifier()))
    models.append(('OneClassSVM', OneClassSVM(gamma = 'auto')))
    models.append(('NuSVC', NuSVC()))
    models.append(('MLPClassifier', MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)))
    models.append(('RadiusNeighborsClassifier', RadiusNeighborsClassifier(radius=2.0)))
    models.append(('OutputCodeClassifier', OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),random_state=0)))
    models.append(('OneVsOneClassifier', OneVsOneClassifier(estimator = RandomForestClassifier(random_state=1))))
    models.append(('OneVsRestClassifier', OneVsRestClassifier(estimator = RandomForestClassifier(random_state=1))))
    models.append(('LogisticRegressionCV', LogisticRegressionCV()))
    models.append(('RidgeClassifierCV', RidgeClassifierCV()))
    models.append(('RidgeClassifier', RidgeClassifier()))
    models.append(('PassiveAggressiveClassifier', PassiveAggressiveClassifier()))
    models.append(('GaussianProcessClassifier', GaussianProcessClassifier()))
    models.append(('HistGradientBoostingClassifier', HistGradientBoostingClassifier()))
    estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),('svr', make_pipeline(StandardScaler(),LinearSVC(random_state=42)))]
    models.append(('StackingClassifier', StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())))
    clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf3 = GaussianNB()
    models.append(('VotingClassifier', VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')))
    models.append(('AdaBoostClassifier', AdaBoostClassifier()))
    models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    models.append(('BaggingClassifier', BaggingClassifier()))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier()))
    models.append(('CategoricalNB', CategoricalNB()))
    models.append(('ComplementNB', ComplementNB()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('MultinomialNB', MultinomialNB()))
    models.append(('CalibratedClassifierCV', CalibratedClassifierCV()))
    models.append(('LabelPropagation', LabelPropagation()))
    models.append(('LabelSpreading', LabelSpreading()))
    models.append(('NearestCentroid', NearestCentroid()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('GaussianMixture', GaussianMixture()))
    models.append(('BayesianGaussianMixture', BayesianGaussianMixture()))
    
    test_accuracy= []
    names = []
    for name, model in models:
        try:
            m = model
            m.fit(X_train, y_train)
            y_pred = m.predict(X_test)
            train_acc = round(m.score(X_train, y_train) * 100, 2)
            test_acc = metrics.accuracy_score(y_test,y_pred) *100
            c_report.append(classification_report(y_test, y_pred))
            test_accuracy.append(test_acc)
            names.append(name)
            metrix.append([name, train_acc, test_acc])
        except Exception:
            print("Exception occurred:", name)
    return metrix,test_accuracy,names
Example #35
def test_ovo_exceptions():
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    assert_raises(ValueError, ovo.predict, [])
Example #36
#One versus the Rest/All
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
#0 => 0 and 1 => 1,2: LinearSVC will say it is class 0 or the rest (38%, rest 62%)
#0 => 0,2 and 1 => 1: LinearSVC will say it is class 1 or the rest (44%, rest 56%)
#0 => 0,1 and 2 => 2: LinearSVC will say it is class 0 or the rest (20%, rest 80%)
modeloOneVsRest = OneVsRestClassifier(LinearSVC(random_state=0))
resultadoOneVsRest = fit_and_predict(modeloOneVsRest, treino_dados,
                                     treino_marcacoes, teste_dados,
                                     teste_marcacoes, "One Vs Rest Classifier")
resultados[resultadoOneVsRest] = modeloOneVsRest

#OneVsOne
from sklearn.multiclass import OneVsOneClassifier
modeloOneVsOne = OneVsOneClassifier(LinearSVC(random_state=0))
resultadoOneVsOne = fit_and_predict(modeloOneVsOne, treino_dados,
                                    treino_marcacoes, teste_dados,
                                    teste_marcacoes, "One Vs One")
resultados[resultadoOneVsOne] = modeloOneVsOne

#import the MultinomialNB classification algorithm
from sklearn.naive_bayes import MultinomialNB
#assign the MultinomialNB algorithm to a variable called modelo
modeloMultinomial = MultinomialNB()
resultadoMultinomial = fit_and_predict(modeloMultinomial, treino_dados,
                                       treino_marcacoes, teste_dados,
                                       teste_marcacoes, "MultinomialNB")
resultados[resultadoMultinomial] = modeloMultinomial

#Algorithm that finds the best possible combination of the training data features
Example #37
    # Separating training and validation
    training_percentage = 0.8

    len_training = int(training_percentage * len(Y))
    len_validation = len(Y) - len_training

    training_data = X[0:len_training]
    training_marker = Y[0:len_training]

    validation_data = X[len_training:]
    validation_marker = Y[len_training:]

    results = {}
    # Applying to OneVsRestClassifier
    one_vs_rest_model = OneVsRestClassifier(LinearSVC(random_state=0))
    one_vs_one_model = OneVsOneClassifier(LinearSVC(random_state=0))
    multinomial_model = MultinomialNB()
    ada_boost_model = AdaBoostClassifier()

    results['OneVsRestClassifier'] = fit_and_predict("OneVsRestClassifier",
                                                     one_vs_rest_model,
                                                     training_data,
                                                     training_marker, 10)
    results['OneVsOneClassifier'] = fit_and_predict("OneVsOneClassifier",
                                                    one_vs_one_model,
                                                    training_data,
                                                    training_marker, 10)
    results['MultinomialNB'] = fit_and_predict("MultinomialNB",
                                               multinomial_model,
                                               training_data, training_marker,
                                               10)
Example #38
def main():

    # Checks for correct number of arguments
    if len(sys.argv) != 3:
        print(
            'usage: ./troll_identifier.py [TRAIN DATASET] [TEST/DEV DATASET]')
        sys.exit()

    # set up dataset
    data_train = pd.read_csv(sys.argv[1])
    data_test = pd.read_csv(sys.argv[2])

    print('train: {}'.format(sys.argv[1]))
    print('test: {}'.format(sys.argv[2]))

    x_train = data_train.drop(
        [data_train.columns[0], data_train.columns[1], data_train.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    y_train = pd.Series(data_train.iloc[:, -1])
    x_test = data_test.drop(
        [data_test.columns[0], data_test.columns[1], data_test.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    y_test = pd.Series(data_test.iloc[:, -1])

    type = int(input('type: [1: supervised, 2: semi-supervised, 3: unsupervised] '))
    if type == 1:
        method = int(input('method: [1: classification, 2: regression] '))
        if method == 1:
            classifier = int(input(
                'classifier: [1: decision tree, 2: extra tree, 3: extra trees, 4: k nearest neighbor, 5: naive bayes, 6: radius neighbors, 7: random forest, 8: support vector machine, 9: gradient boosting, 10: gaussian process, 11: stochastic gradient descent, 12: passive aggressive, 13: nearest centroid, 14: perceptron, 15: multi-layer perceptron, 16: ada boost] '
            ))
            if classifier == 1:
                criterion = int(input('criterion: [1: gini, 2: entropy] '))
                if criterion == 1:
                    print(type, method, classifier, criterion)
                    model = DecisionTreeClassifier(criterion='gini')
                elif criterion == 2:
                    print(type, method, classifier, criterion)
                    model = DecisionTreeClassifier(criterion='entropy')
                else:
                    print('no criterion chosen')
                    exit()
            elif classifier == 2:
                print(type, method, classifier)
                model = ExtraTreeClassifier()
            elif classifier == 3:
                print(type, method, classifier)
                model = ExtraTreesClassifier()
            elif classifier == 4:
                n = int(input('n: [1: 1, 2: 3, 3: 5] '))
                if n == 1:
                    print(type, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=1)
                elif n == 2:
                    print(type, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=3)
                elif n == 3:
                    print(type, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=5)
                else:
                    print('no n chosen')
                    exit()
            elif classifier == 5:
                version = int(input(
                    'version: [1: gaussian, 2: bernoulli, 3: multinomial, 4: complement] '
                ))
                if version == 1:
                    print(type, method, classifier, version)
                    model = GaussianNB()
                elif version == 2:
                    print(type, method, classifier, version)
                    model = BernoulliNB()
                elif version == 3:
                    print(type, method, classifier, version)
                    model = MultinomialNB()
                elif version == 4:
                    print(type, method, classifier, version)
                    model = ComplementNB()
                else:
                    print('no version chosen')
                    exit()
            elif classifier == 6:
                print(type, method, classifier)
                model = RadiusNeighborsClassifier(radius=1.0)
            elif classifier == 7:
                print(type, method, classifier)
                model = RandomForestClassifier(n_estimators=50, random_state=1)
            elif classifier == 8:
                print(type, method, classifier)
                model = LinearSVC(
                    multi_class='crammer_singer')  #multi_class='ovr'
            elif classifier == 9:
                print(type, method, classifier)
                model = GradientBoostingClassifier()
            elif classifier == 10:
                print(type, method, classifier)
                model = GaussianProcessClassifier(multi_class='one_vs_one')
                # model = GaussianProcessClassifier(multi_class='one_vs_rest')
            elif classifier == 11:
                print(type, method, classifier)
                model = SGDClassifier()
            elif classifier == 12:
                print(type, method, classifier)
                model = PassiveAggressiveClassifier()
            elif classifier == 13:
                print(type, method, classifier)
                model = NearestCentroid()
            elif classifier == 14:
                print(type, method, classifier)
                model = Perceptron(tol=1e-3, random_state=0)
            elif classifier == 15:
                print(type, method, classifier)
                model = MLPClassifier()
            elif classifier == 16:
                print(type, method, classifier)
                model = AdaBoostClassifier(n_estimators=100)
            else:
                print('no classifier chosen')
                exit()
            # train the model using the training sets and check score
            model.fit(x_train, y_train)
            model.score(x_train, y_train)

            # predict output
            predictions = pd.Series(model.predict(x_test))

            filename = '{},{},{}.txt'.format(type, method, classifier)
            with open(filename, 'w') as output:
                output.write('{:10}\t{:10}\t{:10}\n'.format(
                    'actual', 'predict', 'match?'))
                for i in range(len(predictions)):
                    match = (y_test[i] == predictions[i])
                    output.write('{:10}\t{:10}\t{:10}\n'.format(
                        y_test[i], predictions[i], str(match)))
                output.write('accuracy: {:7.2f}%\n'.format(
                    100 * accuracy_score(y_test, predictions)))

            print('accuracy: {:7.2f}%'.format(
                100 * accuracy_score(y_test, predictions)))
            print(
                classification_report(
                    y_test,
                    predictions,
                    target_names=['RightTroll', 'LeftTroll', 'Other']))
            print(
                confusion_matrix(y_test,
                                 predictions,
                                 labels=["RightTroll", "LeftTroll", "Other"]))
        elif method == 2:
            # transform into binary classification problem
            # y_train = y_train.apply(lambda x: 0 if x == 'Other' else 1)
            # y_test = y_test.apply(lambda x: 0 if x == 'Other' else 1)

            # transform string labels into integers
            # le = LabelEncoder()
            # le.fit(y_train) # print(le.transform(['LeftTroll', 'Other', 'Other', 'RightTroll'])), print(le.inverse_transform([0, 1, 2, 1]))
            # print(le.classes_)
            #
            # y_train = le.transform(y_train)
            # y_test = le.transform(y_test)

            regressor = int(input(
                'regressor: [1: linear discriminant analysis, 2: logistic regression, 3: ridge regression, 4: quadratic discriminant analysis, 5: linear regression, 6: decision tree regression, 7: pls regression, 8: pls canonical, 9: canonical correlation analysis, 10: lasso, 11: multi-task lasso, 12: elastic net, 13: multi-task elastic net, 14: least angle regression, 15: least angle regression lasso, 16: orthogonal matching pursuit, 17: bayesian ridge, 18: automatic relevance determination, 19: theil sen regression, 20: huber regressor, 21: random sample consensus] '
            ))
            if regressor == 1:
                print(type, method, regressor)
                model = LinearDiscriminantAnalysis()
            elif regressor == 2:
                print(type, method, regressor)
                model = LogisticRegression(
                    solver='lbfgs', multi_class='multinomial')  #'newton-cg'
            elif regressor == 3:
                print(type, method, regressor)
                model = RidgeClassifier()
            elif regressor == 4:
                print(type, method, regressor)
                model = QuadraticDiscriminantAnalysis()
            elif regressor == 5:
                strategy = int(input('strategy: [1: one vs rest, 2: one vs one] '))
                if strategy == 1:
                    print(type, method, strategy, regressor)
                    # note: the OvR/OvO wrappers expect classifiers; a plain
                    # LinearRegression lacks predict_proba/decision_function
                    model = OneVsRestClassifier(LinearRegression())
                elif strategy == 2:
                    print(type, method, strategy, regressor)
                    model = OneVsOneClassifier(LinearRegression())
                else:
                    print('no strategy selected')
                    exit()
            elif regressor == 6:
                strategy = int(input('strategy: [1: one vs rest, 2: one vs one] '))
                if strategy == 1:
                    print(type, method, strategy, regressor)
                    model = OneVsRestClassifier(DecisionTreeRegressor())
                elif strategy == 2:
                    print(type, method, strategy, regressor)
                    model = OneVsOneClassifier(DecisionTreeRegressor())
                else:
                    print('no strategy selected')
                    exit()
            elif regressor == 7:
                print(type, method, regressor)
                model = PLSRegression(n_components=2)
            elif regressor == 8:
                print(type, method, regressor)
                model = PLSCanonical(n_components=2)
            elif regressor == 9:
                print(type, method, regressor)
                model = CCA(n_components=1)
            elif regressor == 10:
                print(type, method, regressor)
                model = Lasso(alpha=0.1)
            elif regressor == 11:
                print(type, method, regressor)
                model = MultiTaskLasso(alpha=0.1)
            elif regressor == 12:
                print(type, method, regressor)
                model = ElasticNet(random_state=0)
            elif regressor == 13:
                print(type, method, regressor)
                model = MultiTaskElasticNet(random_state=0)
            elif regressor == 14:
                print(type, method, regressor)
                model = Lars(n_nonzero_coefs=1)
            elif regressor == 15:
                print(type, method, regressor)
                model = LassoLars(alpha=.1)
            elif regressor == 16:
                print(type, method, regressor)
                model = OrthogonalMatchingPursuit()
            elif regressor == 17:
                print(type, method, regressor)
                model = BayesianRidge()
            elif regressor == 18:
                print(type, method, regressor)
                model = ARDRegression()
            elif regressor == 19:
                print(type, method, regressor)
                model = TheilSenRegressor(random_state=0)
            elif regressor == 20:
                print(type, method, regressor)
                model = HuberRegressor()
            elif regressor == 21:
                print(type, method, regressor)
                model = RANSACRegressor(random_state=0)
            else:
                print('no regressor chosen')
                exit()

            # train the model using the training set and report its score
            model.fit(x_train, y_train)
            print('training score:', model.score(x_train, y_train))

            # print('coefficient:', model.coef_)
            # print('intercept:', model.intercept_)

            # predict output
            predictions = pd.Series(model.predict(x_test))
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))

            # calculate accuracy
            numerator = 0.0
            denominator = float(len(predictions))
            for i in range(len(predictions)):
                match = (y_test[i] == predictions[i])
                numerator += 1 if match else 0
                print('{:10}\t{:10}\t{:10}'.format(y_test[i], predictions[i],
                                                   match))
            print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))

        else:
            print('no method chosen')
            exit()
    elif type == 2:
        classifier = int(input(
            'classifier: [1: label propagation, 2: label spreading] '))
        if classifier == 1:
            print(type, classifier)
            model = LabelPropagation()
        elif classifier == 2:
            print(type, classifier)
            model = LabelSpreading()
        else:
            print('no classifier chosen')
            exit()
        # train the model using the training set and report its score
        model.fit(x_train, y_train)
        print('training score:', model.score(x_train, y_train))

        # predict output
        predictions = pd.Series(model.predict(x_test))
        print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))

        # calculate accuracy
        numerator = 0.0
        denominator = float(len(predictions))
        for i in range(len(predictions)):
            match = (y_test[i] == predictions[i])
            numerator += 1 if match else 0
            print('{:10}\t{:10}\t{:10}'.format(y_test[i], predictions[i],
                                               match))
        print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))
    elif type == 3:
        method = int(input(
            'method: [1: clustering, 2: random trees embedding, 3: nearest neighbors] '
        ))
        if method == 1:
            clusterer = int(input('clusterer: [1: k means] '))
            if clusterer == 1:
                clusters = int(input('clusters: [1: 1, 2: 2, 3: 3] '))
                if clusters == 1:
                    print(type, method, clusters)
                    model = KMeans(n_clusters=1, random_state=0)
                elif clusters == 2:
                    print(type, method, clusters)
                    model = KMeans(n_clusters=2, random_state=0)
                elif clusters == 3:
                    print(type, method, clusters)
                    model = KMeans(n_clusters=3, random_state=0)
                else:
                    print('no clusters chosen')
                    exit()
            else:
                print('no clusterer chosen')
                exit()
            # train the model using the training sets and check score
            model.fit(x_train)

            # predict output
            predictions = model.predict(x_test)
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))

            # check details
            # note: k-means labels are arbitrary cluster ids, so the accuracy
            # check below is only meaningful after mapping clusters to classes
            print('centroids:', model.cluster_centers_)
            # print('labels:', model.labels_)
        elif method == 2:
            model = RandomTreesEmbedding()
            # train the model using the training sets and check score
            model.fit(x_train)

            # predict output (leaf indices, not class labels)
            predictions = model.apply(x_test)
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))
        elif method == 3:
            model = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')
            # train the model using the training sets and check score
            model.fit(x_train)
            # note: kneighbors returns neighbors, not class predictions, so
            # the shared accuracy block below does not apply to this branch
            distances, indices = model.kneighbors(x_test)

        else:
            print('no method chosen')
            exit()

        # calculate accuracy
        numerator = 0.0
        denominator = float(len(predictions))
        for i in range(len(predictions)):
            match = (y_test[i] == predictions[i])
            numerator += 1 if match else 0
            print('{:10}\t{:10}\t{:10}'.format(y_test[i], predictions[i],
                                               match))
        print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))
    else:
        print('no type chosen')
        exit()
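The if/elif ladders above can be collapsed into a dispatch dictionary; a minimal sketch of the same idea, assuming the same estimator choices as in the menu above:

from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import (PassiveAggressiveClassifier, Perceptron,
                                  SGDClassifier)
from sklearn.neighbors import NearestCentroid
from sklearn.neural_network import MLPClassifier

# map each menu number to a factory so estimators are only built on demand
CLASSIFIERS = {
    11: lambda: SGDClassifier(),
    12: lambda: PassiveAggressiveClassifier(),
    13: lambda: NearestCentroid(),
    14: lambda: Perceptron(tol=1e-3, random_state=0),
    15: lambda: MLPClassifier(),
    16: lambda: AdaBoostClassifier(n_estimators=100),
}

choice = int(input('classifier: '))
try:
    model = CLASSIFIERS[choice]()
except KeyError:
    raise SystemExit('no classifier chosen')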
Example #39
0
from sklearn import svm, datasets
from sklearn.multiclass import OneVsOneClassifier
# create an SVC
clf = svm.SVC()
# load the iris dataset
iris = datasets.load_iris()
print(iris)

X, y = iris.data, iris.target
clf = svm.LinearSVC(random_state=0)

clf = OneVsOneClassifier(clf)  # build a multi-class classifier from the binary classifier
clf.fit(X, y)  # train the model
y_pred = clf.predict(X)  # predict on the samples
print('correct predictions: %d, incorrect predictions: %d' % ((y == y_pred).sum(), (y != y_pred).sum()))
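One binary LinearSVC is trained per pair of classes, so for the 3 iris classes the wrapper holds 3 pairwise estimators; a quick check, continuing the snippet above:

# k classes -> k * (k - 1) / 2 pairwise binary models; 3 for iris
print(len(clf.estimators_))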
Example #40
0
df = pd.read_csv("foods.csv")

Y = df["clss"].values
X = df.drop(["brands", "countries", "product_name", "clss"], axis = 1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 0, test_size = 0.25)

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# One-vs-all: sklearn automatically detects that we want to predict
#             several classes here, so by default the one-vs-all
#             method (OneVsRestClassifier) is used.

model = LogisticRegression(solver='lbfgs', multi_class='auto')  # see the docs for solver/multi_class
model.fit(X_train, Y_train)

print(model.score(X_test, Y_test))

# One-vs-one

model = OneVsOneClassifier(LogisticRegression(solver='lbfgs', multi_class='auto'))
model.fit(X_train, Y_train)

print(model.score(X_test, Y_test))

Example #41
0
class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
    """Gaussian process classification (GPC) based on Laplace approximation.

    The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
    ``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and
    Williams.

    Internally, the Laplace approximation is used for approximating the
    non-Gaussian posterior by a Gaussian.

    Currently, the implementation is restricted to using the logistic link
    function. For multi-class classification, several binary one-versus-rest
    classifiers are fitted. Note that this class thus does not implement
    a true multi-class Laplace approximation.

    Parameters
    ----------
    kernel : kernel object
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
        the kernel's hyperparameters are optimized during fitting.

    optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
        Can either be one of the internally supported optimizers for optimizing
        the kernel's parameters, specified by a string, or an externally
        defined optimizer passed as a callable. If a callable is passed, it
        must have the  signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func' is the objective function to be maximized, which
                #   takes the hyperparameters theta as parameter and an
                #   optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can be
                #   used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        Per default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are::

            'fmin_l_bfgs_b'

    n_restarts_optimizer : int, optional (default: 0)
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that n_restarts_optimizer=0 implies that one
        run is performed.

    max_iter_predict : int, optional (default: 100)
        The maximum number of iterations in Newton's method for approximating
        the posterior during predict. Smaller values will reduce computation
        time at the cost of worse results.

    warm_start : bool, optional (default: False)
        If warm-starts are enabled, the solution of the last Newton iteration
        on the Laplace approximation of the posterior mode is used as
        initialization for the next call of _posterior_mode(). This can speed
        up convergence when _posterior_mode is called several times on similar
        problems as in hyperparameter optimization.

    copy_X_train : bool, optional (default: True)
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    multi_class : string, default: "one_vs_rest"
        Specifies how multi-class classification problems are handled.
        Supported are "one_vs_rest" and "one_vs_one". In "one_vs_rest",
        one binary Gaussian process classifier is fitted for each class, which
        is trained to separate this class from the rest. In "one_vs_one", one
        binary Gaussian process classifier is fitted for each pair of classes,
        which is trained to separate these two classes. The predictions of
        these binary predictors are combined into multi-class predictions.
        Note that "one_vs_one" does not support predicting probability
        estimates.

    n_jobs : int, optional, default: 1
        The number of jobs to use for the computation. If -1 all CPUs are used.
        If 1 is given, no parallel computing code is used at all, which is
        useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
        used. Thus for n_jobs = -2, all CPUs but one are used.

    Attributes
    ----------
    kernel_ : kernel object
        The kernel used for prediction. In case of binary classification,
        the structure of the kernel is the same as the one passed as parameter
        but with optimized hyperparameters. In case of multi-class
        classification, a CompoundKernel is returned which consists of the
        different kernels used in the one-versus-rest classifiers.

    log_marginal_likelihood_value_ : float
        The log-marginal-likelihood of self.kernel_.theta

    classes_ : array-like, shape = (n_classes,)
        Unique class labels.

    n_classes_ : int
        The number of classes in the training data
    """
    def __init__(self,
                 kernel=None,
                 optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0,
                 max_iter_predict=100,
                 warm_start=False,
                 copy_X_train=True,
                 random_state=None,
                 multi_class="one_vs_rest",
                 n_jobs=1):
        self.kernel = kernel
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.max_iter_predict = max_iter_predict
        self.warm_start = warm_start
        self.copy_X_train = copy_X_train
        self.random_state = random_state
        self.multi_class = multi_class
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values (binary or multi-class, handled as configured by
            the multi_class parameter)

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y, multi_output=False)

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            self.kernel, self.optimizer, self.n_restarts_optimizer,
            self.max_iter_predict, self.warm_start, self.copy_X_train,
            self.random_state)

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size
        if self.n_classes_ == 1:
            raise ValueError("GaussianProcessClassifier requires 2 or more "
                             "distinct classes. Only class %s present." %
                             self.classes_[0])
        if self.n_classes_ > 2:
            if self.multi_class == "one_vs_rest":
                self.base_estimator_ = \
                    OneVsRestClassifier(self.base_estimator_,
                                        n_jobs=self.n_jobs)
            elif self.multi_class == "one_vs_one":
                self.base_estimator_ = \
                    OneVsOneClassifier(self.base_estimator_,
                                       n_jobs=self.n_jobs)
            else:
                raise ValueError("Unknown multi-class mode %s" %
                                 self.multi_class)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            self.log_marginal_likelihood_value_ = np.mean([
                estimator.log_marginal_likelihood()
                for estimator in self.base_estimator_.estimators_
            ])
        else:
            self.log_marginal_likelihood_value_ = \
                self.base_estimator_.log_marginal_likelihood()

        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from classes_
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        X = check_array(X)
        return self.base_estimator_.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array-like, shape = (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
            raise ValueError("one_vs_one multi-class mode does not support "
                             "predicting probability estimates. Use "
                             "one_vs_rest mode instead.")
        X = check_array(X)
        return self.base_estimator_.predict_proba(X)

    @property
    def kernel_(self):
        if self.n_classes_ == 2:
            return self.base_estimator_.kernel_
        else:
            return CompoundKernel([
                estimator.kernel_
                for estimator in self.base_estimator_.estimators_
            ])

    def log_marginal_likelihood(self, theta=None, eval_gradient=False):
        """Returns log-marginal likelihood of theta for training data.

        In the case of multi-class classification, the mean log-marginal
        likelihood of the one-versus-rest classifiers are returned.

        Parameters
        ----------
        theta : array-like, shape = (n_kernel_params,) or none
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. In the case of multi-class classification, theta may
            be the  hyperparameters of the compound kernel or of an individual
            kernel. In the latter case, all individual kernels get assigned the
            same theta values. If None, the precomputed log_marginal_likelihood
            of self.kernel_.theta is returned.

        eval_gradient : bool, default: False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. Note that gradient computation is not supported
            for non-binary classification. If True, theta must not be None.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : array, shape = (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the kernel
            hyperparameters at position theta.
            Only returned when eval_gradient is True.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])

        if theta is None:
            if eval_gradient:
                raise ValueError(
                    "Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        theta = np.asarray(theta)
        if self.n_classes_ == 2:
            return self.base_estimator_.log_marginal_likelihood(
                theta, eval_gradient)
        else:
            if eval_gradient:
                raise NotImplementedError(
                    "Gradient of log-marginal-likelhood not implemented for "
                    "multi-class GPC.")
            estimators = self.base_estimator_.estimators_
            n_dims = estimators[0].kernel_.n_dims
            if theta.shape[0] == n_dims:  # use same theta for all sub-kernels
                return np.mean([
                    estimator.log_marginal_likelihood(theta)
                    for i, estimator in enumerate(estimators)
                ])
            elif theta.shape[0] == n_dims * self.classes_.shape[0]:
                # theta for compound kernel
                return np.mean([
                    estimator.log_marginal_likelihood(theta[n_dims * i:n_dims *
                                                            (i + 1)])
                    for i, estimator in enumerate(estimators)
                ])
            else:
                raise ValueError(
                    "Shape of theta must be either %d or %d. "
                    "Obtained theta with shape %d." %
                    (n_dims, n_dims * self.classes_.shape[0], theta.shape[0]))
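A short usage sketch of the class above (iris data assumed as stand-in input); note that predict_proba is only available in the default one_vs_rest mode:

from sklearn.datasets import load_iris
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

X, y = load_iris(return_X_y=True)
gpc = GaussianProcessClassifier(kernel=1.0 * RBF(1.0), multi_class='one_vs_one')
gpc.fit(X, y)
print(gpc.predict(X[:3]))
# gpc.predict_proba(X[:3]) would raise: one_vs_one mode cannot predict probabilities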
Example #42
0
def get_estimator(self):
    return OneVsOneClassifier(LogisticRegression())
Example #43
0
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from matplotlib import pyplot
from collections import Counter

train_data = pd.read_csv("train_data_clean.csv", header=0)
validation_data = pd.read_csv("validation_data_clean.csv", header=0)
test_data = pd.read_csv("test_data_clean.csv", header=0)
pred_data = pd.read_csv("ElectionsData_Pred_Features.csv", header=0)

train_val_list = [train_data, validation_data]
train_val_data = pd.concat(train_val_list)
features = train_val_data.drop(['label'], axis=1).values
target = train_val_data.label.values

clf = OneVsOneClassifier(LinearSVC(C=1.0, random_state=0))
pred = cross_val_predict(clf, features, target, cv=30, n_jobs=-1)
print(
    classification_report(target,
                          pred,
                          target_names=train_val_data.label.unique()))

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(min_samples_split=5,
                             random_state=0,
                             n_estimators=100,
                             n_jobs=-1,
                             verbose=1,
                             class_weight="balanced")
clf.fit(features, target)
def test_ovo_partial_fit_predict():
    X, y = shuffle(iris.data, iris.target)
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:100], y[:100], np.unique(y))
    ovo1.partial_fit(X[100:], y[100:])
    pred1 = ovo1.predict(X)

    ovo2 = OneVsOneClassifier(MultinomialNB())
    ovo2.fit(X, y)
    pred2 = ovo2.predict(X)
    assert_equal(len(ovo1.estimators_), n_classes * (n_classes - 1) / 2)
    assert_greater(np.mean(y == pred1), 0.65)
    assert_almost_equal(pred1, pred2)

    # Test when mini-batches don't have all target classes
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovo1.partial_fit(iris.data[60:], iris.target[60:])
    pred1 = ovo1.predict(iris.data)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(iris.data, iris.target).predict(iris.data)

    assert_almost_equal(pred1, pred2)
    assert_equal(len(ovo1.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == pred1), 0.65)
Example #45
0
def main():

    # import the data
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', as_frame=False)  # as_frame=False keeps numpy arrays so the indexing below works
    x, y = mnist["data"], mnist["target"]
    print(x.shape)
    print(y.shape)

    # show the image
    some_digit = x[36000]
    some_digit_image = some_digit.reshape(28, 28)
    plt.imshow(some_digit_image, cmap=matplotlib.cm.binary, interpolation="nearest")
    plt.axis("off")
    plt.show()

    # prepare the testing/training tests
    x_train, x_test, y_train, y_test = x[:60000], x[60000:], y[:60000], y[60000:]
    np.random.seed(3)
    shuffle_index = np.random.permutation(60000)
    x_train, y_train = x_train[shuffle_index], y_train[shuffle_index]

    # Binary Classifier
    y_train_5 = (y_train == '5')  # True for all 5s
    y_test_5 = (y_test == '5')  # labels are strings, so compare against '5'
    from sklearn.linear_model import SGDClassifier
    sgd_clf = SGDClassifier(random_state=42)
    sgd_clf.fit(x_train, y_train_5)  # enable the model
    print(sgd_clf.predict([some_digit]))

    # implement Cross-Validation
    from sklearn.model_selection import StratifiedKFold
    from sklearn.base import clone
    skfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)  # random_state requires shuffle=True
    for train_index, test_index in skfolds.split(x_train, y_train_5):
        clone_clf = clone(sgd_clf)  # train clone on training folds, then predict on test fold
        x_train_folds = x_train[train_index]
        y_train_folds = y_train_5[train_index]
        x_test_fold = x_train[test_index]
        y_test_fold = y_train_5[test_index]
        clone_clf.fit(x_train_folds, y_train_folds)
        y_pred = clone_clf.predict(x_test_fold)
        n_correct = sum(y_pred == y_test_fold)
        print(n_correct / len(y_pred))

    # evaluate the model with 'accuracy'
    from sklearn.model_selection import cross_val_score
    cv_accuracy = cross_val_score(sgd_clf, x_train, y_train_5, cv=3, scoring="accuracy")  # don't shadow cross_val_score; it is reused below
    print(cv_accuracy)

    # see accuracy from a non5classifier
    from sklearn.base import BaseEstimator
    class Never5Classifier(BaseEstimator):
        def fit(self, x, y=None):
            return self
        def predict(self, x):
            return np.zeros((len(x), 1), dtype=bool)
    never_5_clf = Never5Classifier()
    never_5_clf_score = cross_val_score(never_5_clf, x_train, y_train_5, cv=3, scoring="accuracy")
    print(never_5_clf_score)

    # evaluate the model with 'confusion matrix'
    from sklearn.model_selection import cross_val_predict
    from sklearn.metrics import confusion_matrix
    y_train_pred = cross_val_predict(sgd_clf, x_train, y_train_5, cv=3)
    conf_mat = confusion_matrix(y_train_5, y_train_pred)  # keep the function name unshadowed; it is reused below
    print(conf_mat)

    # precision and recall
    from sklearn.metrics import precision_score, recall_score, f1_score
    precision = precision_score(y_train_5, y_train_pred)  # avoid shadowing the metric functions, which are reused below
    recall = recall_score(y_train_5, y_train_pred)
    f1 = f1_score(y_train_5, y_train_pred)
    print(precision)
    print(recall)
    print(f1)  # f1 score is the harmonic mean of precision and recall

    # precision vs recall trade-off
    from sklearn.metrics import precision_recall_curve

    def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
        plt.plot(thresholds, precisions[:-1], "b--", label="precision")  # function to plot precision vs threshold
        plt.plot(thresholds, recalls[:-1], "g-", label="recall")
        plt.xlabel("Threshold", fontsize=16)
        plt.legend(loc="upper left", fontsize=16)
        plt.ylim([0, 1])

    def plot_precision_vs_recall(precisions, recalls):
        plt.plot(recalls, precisions, "b-", linewidth=2)
        plt.xlabel("recall", fontsize=16)
        plt.ylabel("precision", fontsize=16)
        plt.axis([0, 1, 0, 1])
    y_scores = cross_val_predict(sgd_clf, x_train, y_train_5, cv=3, method="decision_function")  # decision values for the binary 5-vs-rest task
    if y_scores.ndim == 2:
        y_scores = y_scores[:, 1]  # guard against an extra dimension returned by some versions
    precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
    plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    plot_precision_vs_recall(precisions, recalls)
    plt.show()

    # manually set the threshold
    y_train_pred_90 = (y_scores > 70000)  # new predictions at a high-precision threshold
    precision = precision_score(y_train_5, y_train_pred_90)
    recall = recall_score(y_train_5, y_train_pred_90)
    print("precision_score=", precision)
    print("recall_score=", recall)

    # ROC curve
    from sklearn.metrics import roc_curve
    fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

    def plot_roc_curve(fpr, tpr, label=None):
        plt.plot(fpr, tpr, linewidth=2, label=label)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.axis([0, 1, 0, 1])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
    plot_roc_curve(fpr, tpr)
    plt.show()
    from sklearn.metrics import roc_auc_score
    from sklearn.ensemble import RandomForestClassifier
    forest_clf = RandomForestClassifier(random_state=42)
    y_probas_forest = cross_val_predict(forest_clf, x_train, y_train_5, cv=3, method="predict_proba")  # have no decision_function
    y_scores_forest = y_probas_forest[:, 1]  # extract the score from probability metrics
    fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)
    plt.plot(fpr, tpr, "b:", label="SGD")
    plot_roc_curve(fpr_forest, tpr_forest, "random Forest")
    plt.legend(loc="lower right")
    plt.show()
    forest_auc = roc_auc_score(y_train_5, y_scores_forest)  # avoid shadowing roc_auc_score
    print(forest_auc)

    # Multiclass classification
    sgd_clf.fit(x_train, y_train)  # train the model to the all set.
    sgd_clf.predict([some_digit])
    some_digit_score = sgd_clf.decision_function([some_digit])  # obtain score for each class
    print(some_digit_score)

    # OvO classifier
    from sklearn.multiclass import OneVsOneClassifier
    ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
    ovo_clf.fit(x_train, y_train)
    print(ovo_clf.predict([some_digit]))
    forest_clf.fit(x_train, y_train)
    print(forest_clf.predict_proba([some_digit]))
    sgd_clf_score = cross_val_score(sgd_clf, x_train, y_train, cv=3, scoring="accuracy")
    print(sgd_clf_score)  # here the score is for multiclass classification as for y_train
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train.astype(np.float64))
    sgd_clf_score = cross_val_score(sgd_clf, x_train_scaled, y_train, cv=3, scoring="accuracy")
    print(sgd_clf_score)  # scaling can improve the model's accuracy

    # error analysis
    y_train_pred = cross_val_predict(sgd_clf, x_train_scaled, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)  # row for actual, column for predicted
    print(conf_mx)
    plt.matshow(conf_mx, cmap=plt.cm.gray)  # showing the matrix with a image
    plt.show()
    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums  # transform error number into error rate
    np.fill_diagonal(norm_conf_mx, 0)  # keep only the errors
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
    plt.show()

    # multilabel classification
    from sklearn.neighbors import KNeighborsClassifier
    y_train_large = (y_train.astype(np.uint8) >= 7)  # labels are strings, so cast before comparing
    y_train_odd = (y_train.astype(np.uint8) % 2 == 1)  # odd digits
    y_multilabel = np.c_[y_train_large, y_train_odd]
    knn_clf = KNeighborsClassifier()  # KNeighborClassifier for multilabel
    knn_clf.fit(x_train, y_multilabel)
    print(knn_clf.predict([some_digit]))

    # multioutput classification
    import numpy.random as rnd
    noise1 = rnd.randint(0, 100, (len(x_train), 784))
    noise2 = rnd.randint(0, 100, (len(x_test), 784))  # add noise, then try to recover the clean digits
    x_train_mod = x_train + noise1
    x_test_mod = x_test + noise2
    y_train_mod = x_train
    y_test_mod = x_test
    knn_clf.fit(x_train_mod, y_train_mod)
    clean_digit = knn_clf.predict([x_test_mod[1]])
    plot_digit(clean_digit)
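plot_digit is never defined in the snippet above; a minimal sketch of such a helper, with its name and behavior assumed from the call site:

import matplotlib
import matplotlib.pyplot as plt
import numpy as np

def plot_digit(data):
    # reshape a flat 784-pixel vector back into a 28x28 image and display it
    image = np.asarray(data).reshape(28, 28)
    plt.imshow(image, cmap=matplotlib.cm.binary, interpolation="nearest")
    plt.axis("off")
    plt.show()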
Example #46
0
                           sessions=session, smoothing_fwhm=4,
                           memory="nilearn_cache", memory_level=1)
X = nifti_masker.fit_transform(dataset_files.func)
X = X[non_rest]

### Predictor #################################################################

### Define the prediction function to be used.
# Here we use a Support Vector Classification, with a linear kernel
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.pipeline import Pipeline

svc_ovo = OneVsOneClassifier(Pipeline([
                ('anova', SelectKBest(f_classif, k=500)),
                ('svc', SVC(kernel='linear'))
                ]))

svc_ova = OneVsRestClassifier(Pipeline([
                ('anova', SelectKBest(f_classif, k=500)),
                ('svc', SVC(kernel='linear'))
                ]))

### Cross-validation scores ###################################################
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in modern releases

cv_scores_ovo = cross_val_score(svc_ovo, X, y, cv=5, verbose=True)

cv_scores_ova = cross_val_score(svc_ova, X, y, cv=5, verbose=True)

print 79 * "_"
# Convert string data to numerical data
label_encoder = []
X_encoded = np.empty(X.shape)
for i, item in enumerate(X[0]):
    if item.isdigit():
        X_encoded[:, i] = X[:, i]
    else:
        label_encoder.append(preprocessing.LabelEncoder())
        X_encoded[:, i] = label_encoder[-1].fit_transform(X[:, i])

X = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)

# Create SVM classifier
classifier = OneVsOneClassifier(LinearSVC(random_state=0))

# Train the classifier
classifier.fit(X, y)

# Cross validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=5)
classifier = OneVsOneClassifier(LinearSVC(random_state=0))
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)

# Compute the F1 score of the SVM classifier
f1 = cross_validation.cross_val_score(classifier,
                                      X,
                                      y,
                                      scoring='f1_weighted',
                                      cv=3)
s = StandardScaler()
s.fit(X_train)

X_train = s.transform(X_train)
X_test = s.transform(X_test)

## #################################
## Part 3:  One-vs-All (automatic)
## #################################
""" One-vs-All: sklearn automatically detects that we want to predict
                several classes here, so by default the one-vs-all
                method is used. """

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', multi_class='auto')
model.fit(X_train, y_train)

print(model.score(X_test, y_test))

## ###################
## Part 4:  One-vs-One
## ###################

from sklearn.multiclass import OneVsOneClassifier

model = OneVsOneClassifier(LogisticRegression(solver='lbfgs'))
model.fit(X_train, y_train)

print(model.score(X_test, y_test))
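Beyond comparing scores, the OvO wrapper also exposes an aggregated per-class decision function; a small sketch reusing the fitted model above:

# shape (n_samples, n_classes): pairwise votes plus normalized confidences
print(model.decision_function(X_test[:3]).shape)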
Example #49
0
def test_ovo_partial_fit_predict():
    temp = datasets.load_iris()
    X, y = temp.data, temp.target
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:100], y[:100], np.unique(y))
    ovo1.partial_fit(X[100:], y[100:])
    pred1 = ovo1.predict(X)

    ovo2 = OneVsOneClassifier(MultinomialNB())
    ovo2.fit(X, y)
    pred2 = ovo2.predict(X)
    assert len(ovo1.estimators_) == n_classes * (n_classes - 1) / 2
    assert np.mean(y == pred1) > 0.65
    assert_almost_equal(pred1, pred2)

    # Test when mini-batches have binary target classes
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:60], y[:60], np.unique(y))
    ovo1.partial_fit(X[60:], y[60:])
    pred1 = ovo1.predict(X)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(X, y).predict(X)

    assert_almost_equal(pred1, pred2)
    assert len(ovo1.estimators_) == len(np.unique(y))
    assert np.mean(y == pred1) > 0.65

    ovo = OneVsOneClassifier(MultinomialNB())
    X = np.random.rand(14, 2)
    y = [1, 1, 2, 3, 3, 0, 0, 4, 4, 4, 4, 4, 2, 2]
    ovo.partial_fit(X[:7], y[:7], [0, 1, 2, 3, 4])
    ovo.partial_fit(X[7:], y[7:])
    pred = ovo.predict(X)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(X, y).predict(X)
    assert_almost_equal(pred, pred2)

    # raises error when mini-batch does not have classes from all_classes
    ovo = OneVsOneClassifier(MultinomialNB())
    error_y = [0, 1, 2, 3, 4, 5, 2]
    message_re = escape("Mini-batch contains {0} while "
                        "it must be subset of {1}".format(
                            np.unique(error_y), np.unique(y)))
    assert_raises_regexp(ValueError, message_re, ovo.partial_fit, X[:7],
                         error_y, np.unique(y))

    # test partial_fit only exists if estimator has it:
    ovr = OneVsOneClassifier(SVC())
    assert not hasattr(ovr, "partial_fit")
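Outside the test suite, the same incremental pattern looks like this; a minimal sketch, with the iris dataset assumed as stand-in data:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.multiclass import OneVsOneClassifier
from sklearn.naive_bayes import MultinomialNB

X, y = load_iris(return_X_y=True)
ovo = OneVsOneClassifier(MultinomialNB())
# the first partial_fit call must announce every class that will ever appear
ovo.partial_fit(X[:75], y[:75], classes=np.unique(y))
ovo.partial_fit(X[75:], y[75:])
print(ovo.predict(X[:5]))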
def return_data(request):
    clf = sklearn.svm.LinearSVC()

    training_files = sklearn.datasets.load_files(
        "/../../../SVM/dataset_training")
    f = open("/../../../SVM/dataset_prediction/test/lol.txt", 'w')
    fil = open("/../../../SVM/data_print", 'a')
    fil.write("\n")
    text = request.POST.get('text')

    fil.close()
    f.write(text)
    f.close()
    #print "Text ",text

    #print training_files.data

    predict_files = sklearn.datasets.load_files(
        "/../../../SVM/dataset_prediction")

    #print "Predict",predict_files.data

    vectorizer = TfidfVectorizer(encoding='utf-8')
    X_t = vectorizer.fit_transform(
        (open(f).read() for f in training_files.filenames))
    #print("n_samples: %d, n_features: %d" % X_t.shape)
    assert sp.issparse(X_t)

    X_p = vectorizer.transform(
        (open(f).read() for f in predict_files.filenames))

    #print X_p
    clf.fit(X_t, training_files.target)
    y_predicted = ""
    y_predicted = clf.predict(X_p)
    #print "OUT",y_predicted
    if y_predicted[0] == 0:

        f1 = open("/../../../SVM_Multi/dataset_prediction/test/lol.txt", 'w')
        f1.write(text)
        f1.close()
        cn = 0
        with open("/../../../SVM/pande.txt") as f:
            #print "HOLA1",text
            for line in f:
                #print "HOLA2", line

                if (text == line.strip("\n")):
                    #print "HOLA3"
                    #print line
                    cn = 1
        if (cn == 0):
            num = random.randint(0, 100000000)
            fl = open(
                "/../../../SVM/dataset_training/bully/" + str(num) + ".txt",
                'w')
            fl.write(text)
            fl.close()
            f3 = open("/../../../SVM/pande.txt", 'a')
            f3.write("\n" + text)
            f3.close()

            fl = open("/../../../SVM_Multi/dataset_prediction/lol.txt", 'w')
            fl.write(text)
            fl.close()

            clf = sklearn.svm.LinearSVC()

            training_files = sklearn.datasets.load_files(
                "/../../../SVM_Multi/dataset_training")

            #print training_files.data

            predict_files = sklearn.datasets.load_files(
                "/../../../SVM/dataset_prediction")

            vectorizer = TfidfVectorizer(encoding='utf-8')
            X_t = vectorizer.fit_transform(
                (open(f).read() for f in training_files.filenames))
            assert sp.issparse(X_t)

            X_p = vectorizer.transform(
                (open(f).read() for f in predict_files.filenames))

            y = OneVsOneClassifier(LinearSVC(random_state=0)).fit(
                X_t, training_files.target).predict(X_p)
            if (y[0] == 0):
                num = random.randint(0, 100000000)
                fl = open(
                    "/../../../SVM_Multi/dataset_training/1/" + str(num) +
                    ".txt", 'w')
                fl.write(text)
                fl.close()

            elif (y[0] == 1):
                num = random.randint(0, 100000000)
                fl = open(
                    "/../../../SVM_Multi/dataset_training/2/" + str(num) +
                    ".txt", 'w')
                fl.write(text)
                fl.close()
            elif (y[0] == 2):
                num = random.randint(0, 100000000)
                fl = open(
                    "/../../../SVM_Multi/dataset_training/3/" + str(num) +
                    ".txt", 'w')
                fl.write(text)
                fl.close()

        os.system("rm /../../../SVM_Multi/dataset_prediction/test/lol.txt~")
        clf = sklearn.svm.LinearSVC()

        training_files = sklearn.datasets.load_files(
            "/../../../SVM_Multi/dataset_training")

        predict_files = sklearn.datasets.load_files(
            "/../../../SVM_Multi/dataset_prediction")

        vectorizer = TfidfVectorizer(encoding='utf-8')
        X_t = vectorizer.fit_transform(
            (open(f).read() for f in training_files.filenames))
        assert sp.issparse(X_t)
        X_p = vectorizer.transform(
            (open(f).read() for f in predict_files.filenames))
        y1 = OneVsOneClassifier(LinearSVC(random_state=0)).fit(
            X_t, training_files.target).predict(X_p)
        if y1 == 0:
            fil = open("/home/ubuntu/Desktop/SVM/optional_data_print", 'a')
            fil.write(text)
            fil.write("\n")
            fil.close()

            return render(
                request, 'output.html', {
                    'pred':
                    "100 friends will view this post. Our system has detected harmful content which might hurt the users sentiments.Are you sure you want to post this ?",
                    'val': True,
                    'text': text,
                    'l': False
                })
        elif y1 == 1:
            return render(
                request, 'output.html', {
                    'pred':
                    "You have been temporarily banned till the moderator checks this post.Our system has detected harmful content which might hurt the users sentiments. You cannot post another message until then. You can still continue to surf. You will be redirected to depression chat room for online help.  ",
                    'val': False,
                    'l': True
                })
        elif y1 == 2:
            return render(
                request, 'output.html', {
                    'pred':
                    "Our system has detected some very harmful content in your post which might hurt the users sentiments. Keeping this in mind your posting privileges have been suspended for a week . You cannot post another message until then. You can still continue to surf. Repeated posting of such highly offensive content will lead to a report being generated and sent to the concerned authorities. You will be redirected to depression chat room for online help.",
                    'val': False,
                    'l': True
                })
    else:
        tweets = tweet_dict("/../../../SVM/dataset_prediction/test/lol.txt")
        sentiment = sentiment_dict("/../../../SentiNet/AFINN-111.txt")

        for index in range(len(tweets)):

            tweet_word = tweets[index].split()
            sent_score = 0  # sentiment score of the sentence

            for word in tweet_word:
                word = word.rstrip('?:!.,;"!@')
                word = word.replace("\n", "")

                if word != "":
                    if word in sentiment:
                        sent_score = sent_score + float(sentiment[word])

        if (sent_score < 0):
            f1 = open("/../../../SVM_Multi/dataset_prediction/test/lol.txt",
                      'w')
            f1.write(text)
            f1.close()
            cn = 0
            with open("/../../../SVM/pande.txt") as f:
                for line in f:
                    if (text == line.strip("\n")):
                        cn = 1
            if (cn == 0):
                num = random.randint(0, 100000000)
                fl = open(
                    "/../../../SVM/dataset_training/bully/" + str(num) +
                    ".txt", 'w')
                fl.write(text)
                fl.close()
                f3 = open("/../../../SVM/pande.txt", 'a')
                f3.write("\n" + text)
                f3.close()

                fl = open("/../../../SVM_Multi/dataset_prediction/lol.txt",
                          'w')
                fl.write(text)
                fl.close()

                clf = sklearn.svm.LinearSVC()

                training_files = sklearn.datasets.load_files(
                    "/../../../SVM_Multi/dataset_training")
                predict_files = sklearn.datasets.load_files(
                    "/../../../SVM/dataset_prediction")
                vectorizer = TfidfVectorizer(encoding='utf-8')
                X_t = vectorizer.fit_transform(
                    (open(f).read() for f in training_files.filenames))
                assert sp.issparse(X_t)
                X_p = vectorizer.transform(
                    (open(f).read() for f in predict_files.filenames))
                y = OneVsOneClassifier(LinearSVC(random_state=0)).fit(
                    X_t, training_files.target).predict(X_p)
                if (y[0] == 0):
                    num = random.randint(0, 100000000)
                    fl = open(
                        "/../../../SVM_Multi/dataset_training/1/" + str(num) +
                        ".txt", 'w')
                    fl.write(text)
                    fl.close()

                elif (y[0] == 1):
                    num = random.randint(0, 100000000)
                    fl = open(
                        "/../../../SVM_Multi/dataset_training/2/" + str(num) +
                        ".txt", 'w')
                    fl.write(text)
                    fl.close()
                elif (y[0] == 2):
                    num = random.randint(0, 100000000)
                    fl = open(
                        "/../../../SVM_Multi/dataset_training/3/" + str(num) +
                        ".txt", 'w')
                    fl.write(text)
                    fl.close()

            os.system(
                "rm /../../../SVM_Multi/dataset_prediction/test/lol.txt~")
            clf = sklearn.svm.LinearSVC()
            training_files = sklearn.datasets.load_files(
                "/../../../SVM_Multi/dataset_training")
            predict_files = sklearn.datasets.load_files(
                "/../../../SVM_Multi/dataset_prediction")
            vectorizer = TfidfVectorizer(encoding='utf-8')
            X_t = vectorizer.fit_transform(
                (open(f).read() for f in training_files.filenames))
            assert sp.issparse(X_t)
            X_p = vectorizer.transform(
                (open(f).read() for f in predict_files.filenames))
            y1 = OneVsOneClassifier(LinearSVC(random_state=0)).fit(
                X_t, training_files.target).predict(X_p)
            if y1 == 0:
                fil = open("/../../../SVM/optional_data_print", 'a')
                fil.write(text)
                fil.write("\n")
                fil.close()

                return render(
                    request, 'output.html', {
                        'pred':
                        "100 friends will view this post. Our system has detected harmful content which might hurt the users sentiments.Are you sure you want to post this ?",
                        'val': True,
                        'text': text,
                        'l': False
                    })
            elif y1 == 1:
                return render(
                    request, 'output.html', {
                        'pred':
                        "You have been temporarily banned till the moderator checks this post.Our system has detected harmful content which might hurt the users sentiments. You cannot post another message until then. You can still continue to surf. You will be redirected to depression chat room for online help.",
                        'val': False,
                        'l': True
                    })
            elif y1 == 2:
                return render(
                    request, 'output.html', {
                        'pred':
                        "Our system has detected some very harmful content in your post which might hurt the users sentiments. Keeping this in mind your posting privileges have been suspended for a week . You cannot post another message until then. You can still continue to surf. Repeated posting of such highly offensive content will lead to a report being generated and sent to the concerned authorities. You will be redirected to depression chat room for online help.",
                        'val': False,
                        'l': True
                    })
        else:
            fil = open("/../../../SVM/data_print", 'a')
            fil.write(text)
            fil.close()
        return HttpResponseRedirect("http://127.0.0.1:8000/home/form/")
Example #51
0
                           memory="nilearn_cache",
                           memory_level=1)
X = nifti_masker.fit_transform(dataset_files.func)
X = X[non_rest]

### Predictor #################################################################

### Define the prediction function to be used.
# Here we use a Support Vector Classification, with a linear kernel
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.pipeline import Pipeline

svc_ovo = OneVsOneClassifier(
    Pipeline([('anova', SelectKBest(f_classif, k=500)),
              ('svc', SVC(kernel='linear'))]))

svc_ova = OneVsRestClassifier(
    Pipeline([('anova', SelectKBest(f_classif, k=500)),
              ('svc', SVC(kernel='linear'))]))

### Cross-validation scores ###################################################
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in modern releases

cv_scores_ovo = cross_val_score(svc_ovo, X, y, cv=5, verbose=True)

cv_scores_ova = cross_val_score(svc_ova, X, y, cv=5, verbose=True)

print 79 * "_"
print 'OvO', cv_scores_ovo.mean()
Example #52
0
avg / total       0.77      0.77      0.76      3313

[[ 296    0    6   78    2    0   14]
 [   5   17   18   19    0    0    9]
 [  13    1  314   34    9    0   87]
 [  45    5   49 1281   23    2   86]
 [   8    3   22   57   87    2   27]
 [   0    1   10   15    2    7   28]
 [   6    0    4   72    3    1  545]]
'''
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
Model = build_and_evaluate(X=data['text'],
                           y=data['class'],
                           classifier=OneVsOneClassifier(LinearSVC()),
                           ngram_range=(1, 3),
                           test_size=0.4)

#%%
'''
Building for evaluation
Classification Report:
precision    recall  f1-score   support

     course       0.80      0.66      0.73       384
 department       0.55      0.15      0.24        73
    faculty       0.71      0.63      0.67       430
      other       0.80      0.86      0.82      1511
    project       0.69      0.42      0.52       205
      staff       0.44      0.07      0.12        55
Example #53
0
                                            'misc.forsale',
                                            'soc.religion.christian'
                                        ]),
    print('==================================================================\n')

    print('Confusion Matrix:')
    print('===================')
    print(metrics.confusion_matrix(target, predicted))
    print('===================\n')

    print('Total Accuracy: ')
    print(np.mean(target == predicted))


clf_list = [
    OneVsOneClassifier(GaussianNB()),
    OneVsOneClassifier(svm.LinearSVC()),
    OneVsRestClassifier(GaussianNB()),
    OneVsRestClassifier(svm.LinearSVC())
]
clf_name = [
    'One vs One Classifier - Naive Bayes', 'One vs One Classifier - SVM',
    'One vs Rest Classifier - Naive Bayes', 'One vs Rest Classifier - SVM'
]

# perform classification
for clf, clf_n in zip(clf_list, clf_name):
    pound_sign = ''
    spaces = ''
    for i in range(len(clf_n) + 2):
        pound_sign += '#'
Example #54
0
                         y_pred_gnb,
                         name="Multiclass Gaussian Naive Bayes",
                         average='weighted')
gnb_cm = confusion_matrix(
    y_test, y_pred_gnb)  # Multiclass Gaussian Naive Bayes confusion matrix
plt.figure()
plot_confusion_matrix(gnb_cm,
                      classes=class_names,
                      title='Multiclass Gaussian Naive Bayes Confusion Matrix'
                      )  # plot the Multiclass Gaussian Naive Bayes confusion matrix

params = {
    'estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}  # values of C to try for the wrapped LinearSVC

svm_one = OneVsOneClassifier(
    LinearSVC(random_state=42))  # One v One SVM classifier
clf_one = GridSearchCV(svm_one, params, cv=5,
                       scoring='accuracy')  # grid search to find the best C
y_pred_one = clf_one.fit(X_train_LSI,
                         y_train).best_estimator_.predict(X_test_LSI)
print(clf_one.best_estimator_)
print_classifier_metrics(y_test,
                         y_pred_one,
                         name="1v1 SVM",
                         average='weighted')
one_cm = confusion_matrix(y_test, y_pred_one)  # One v One SVM confusion matrix
plt.figure()
plot_confusion_matrix(one_cm,
                      classes=class_names,
                      title='1v1 SVM Confusion Matrix')
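After the search finishes, the selected regularization strength can be read back; a short follow-up sketch using the names from the snippet above:

# best cross-validated C for the wrapped LinearSVC, and its CV accuracy
print(clf_one.best_params_)
print(clf_one.best_score_)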
Example #55
0
# Convert string data to numerical data
label_encoder = []
X_encoded = np.empty(X.shape)
for i, item in enumerate(X[0]):
    if item.isdigit():
        X_encoded[:, i] = X[:, i]
    else:
        label_encoder.append(preprocessing.LabelEncoder())
        X_encoded[:, i] = label_encoder[-1].fit_transform(X[:, i])

X = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=5)

# Create SVM classifier
classifier = OneVsOneClassifier(LinearSVC(random_state=0))

# Train the classifier
# Cross validation

classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)

# Compute the F1 score of the SVM classifier
f1 = cross_val_score(classifier, X, y, scoring='f1_weighted', cv=3)
print("F1 score: " + str(round(100 * f1.mean(), 2)) + "%")