def train_data_SVC(X, y):
    """
    Create and train the Support Vector Machine.
    """
    classif = OneVsRestClassifier(LinearSVC())
    classif.fit(X,y)
    return classif
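A minimal usage sketch for train_data_SVC; the toy data, the MultiLabelBinarizer encoding, and the print are illustrative assumptions, not part of the original:

# Hypothetical usage: binarize the multilabel targets before calling train_data_SVC.
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

X = np.array([[0, 1], [1, 0], [1, 1]])
y = MultiLabelBinarizer().fit_transform([["a"], ["b"], ["a", "b"]])
clf = train_data_SVC(X, y)               # returns a fitted OneVsRestClassifier
print(clf.predict(np.array([[1, 1]])))   # label-indicator rows, e.g. [[1 1]]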
Example #2
def test_ovr_multilabel():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]])
    y = [["spam", "eggs"], ["spam"], ["ham", "eggs", "spam"],
         ["ham", "eggs"], ["ham"]]
    #y = [[1, 2], [1], [0, 1, 2], [0, 2], [0]]
    Y = np.array([[0, 1, 1],
                  [0, 1, 0],
                  [1, 1, 1],
                  [1, 0, 1],
                  [1, 0, 0]])

    classes = set("ham eggs spam".split())

    for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
                     LinearRegression(), Ridge(),
                     ElasticNet(), Lasso(alpha=0.5)):
        # test input as lists of tuples
        clf = assert_warns(DeprecationWarning,
                           OneVsRestClassifier(base_clf).fit,
                           X, y)
        assert_equal(set(clf.classes_), classes)
        y_pred = clf.predict([[0, 4, 4]])[0]
        assert_equal(set(y_pred), set(["spam", "eggs"]))
        assert_true(clf.multilabel_)

        # test input as label indicator matrix
        clf = OneVsRestClassifier(base_clf).fit(X, Y)
        y_pred = clf.predict([[0, 4, 4]])[0]
        assert_array_equal(y_pred, [0, 1, 1])
        assert_true(clf.multilabel_)
Example #3
def test_decision_function_shape_two_class():
    for n_classes in [2, 3]:
        X, y = make_blobs(centers=n_classes, random_state=0)
        for estimator in [svm.SVC, svm.NuSVC]:
            clf = OneVsRestClassifier(estimator(
                decision_function_shape="ovr")).fit(X, y)
            assert_equal(len(clf.predict(X)), len(y))
def roc(features_trunc, labels, categories, classifier):
	"""
	compute and plot the roc curve for the given classifier
		features_trunc - features matrix truncated to the k best features
		labels - the classes of the data
		categories - different possible categories (66 for subcategories or 14 for categories)
		classifier - MultinomialNB or lda
	"""
	# divide the data into training and test set
	features_train, features_test, categoryids_train, categoryids_test = train_test_split(features_trunc, labels, test_size=.1,random_state=0)
	# define the OneVsRestClassifier with the given classifier (LDA or Naive Bayes)
	clf = OneVsRestClassifier(classifier)
	# train the classifier and compute the probabilities for the test data labels
	clf_fit = clf.fit(features_train, categoryids_train)
	labels_score = clf_fit.predict_proba(features_test)
	# binarize the labels (necessary for the roc curve)
	categoryids_test = label_binarize(categoryids_test, classes=categories)
	# compute the false positive rate, true positive rate and the thresholds
	fpr, tpr, thresholds = metrics.roc_curve(categoryids_test.ravel(), labels_score.ravel())
	# compute the area under the curve
	roc_auc = metrics.auc(fpr, tpr)
	# plot the roc curve
	pl.clf()
	pl.plot(fpr, tpr, 'r', label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc), linewidth=2)
	pl.plot([0, 1], [0, 1], 'k--', linewidth=2)
	pl.xlim([0.0, 1.0])
	pl.ylim([0.0, 1.05])
	pl.xlabel('false positive rate')
	pl.ylabel('true positive rate')
	pl.title('Receiver operating characteristic for micro-averaged classification scores')
	pl.legend(loc="lower right")
	pl.show()
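A hedged example of calling roc(); the prepared feature matrix, the label vector, and the choice of MultinomialNB are assumptions for illustration:

# Illustrative call, assuming features_trunc and labels are already built and
# the 14 categories are encoded as the integers 0..13.
from sklearn.naive_bayes import MultinomialNB
roc(features_trunc, labels, categories=list(range(14)), classifier=MultinomialNB())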
Example #5
  def train(self, trainfile_name):
    print >>sys.stderr, "Reading data.."
    train_data = [tuple(x.strip().split("\t")) for x in codecs.open(trainfile_name, "r", "utf-8")]
    shuffle(train_data)
    filter_feature = get_filter()
    train_labels, train_clauses = zip(*train_data)
    train_labels = [tl.lower() for tl in train_labels]
    print >>sys.stderr, "Indexing features.."
    self.fp.index_data(train_clauses, filter_feature)
    X = numpy.asarray([self.fp.featurize(clause, filter_feature) for clause in train_clauses])
    tagset = list(set(train_labels))
    tag_index = {l:i for (i, l) in enumerate(tagset)}
    Y = numpy.asarray([[tag_index[label]] for label in train_labels])

    classifier = OneVsRestClassifier(SVC(kernel='linear'))
    if self.cv:
      print >>sys.stderr, "Starting Cross-validation for %d folds.."%(self.folds)
      y = [l[0] for l in Y]
      scores = cross_validation.cross_val_score(classifier, X, y, cv=self.folds, scoring='f1_weighted')
      print >>sys.stderr, "Scores:", scores
      print >>sys.stderr, "Average: %0.4f (+/- %0.4f)"%(scores.mean(), scores.std() * 2)

    print >>sys.stderr, "Starting training.."
    classifier.fit(X, Y)
    pickle.dump(classifier, open(self.trained_model_name, "wb"))
    pickle.dump(self.fp.feat_index, open(self.feat_index_name, "wb"))
    pickle.dump(tagset, open(self.stored_tagset, "wb"))

    print >>sys.stderr, "Done"
def test_ovr_multilabel_predict_proba():
    base_clf = MultinomialNB(alpha=1)
    for au in (False, True):
        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=3,
                                                       length=50,
                                                       allow_unlabeled=au,
                                                       random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test, Y_test = X[80:], Y[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

        # decision function only estimator. Fails in current implementation.
        decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
        assert_raises(AttributeError, decision_only.predict_proba, X_test)

        Y_pred = clf.predict(X_test)
        Y_proba = clf.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        pred = [tuple(l.nonzero()[0]) for l in (Y_proba > 0.5)]
        assert_equal(pred, Y_pred)
def test_ovr_multiclass():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]])
    y = ["eggs", "spam", "ham", "eggs", "ham"]
    Y = np.array([[0, 0, 1],
                  [0, 1, 0],
                  [1, 0, 0],
                  [0, 0, 1],
                  [1, 0, 0]])

    classes = set("ham eggs spam".split())

    for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
                     LinearRegression(), Ridge(),
                     ElasticNet()):

        clf = OneVsRestClassifier(base_clf).fit(X, y)
        assert_equal(set(clf.classes_), classes)
        y_pred = clf.predict(np.array([[0, 0, 4]]))[0]
        assert_equal(set(y_pred), set("eggs"))

        # test input as label indicator matrix
        clf = OneVsRestClassifier(base_clf).fit(X, Y)
        y_pred = clf.predict([[0, 0, 4]])[0]
        assert_array_equal(y_pred, [0, 0, 1])
Example #8
def svm():
    # load data
    x_train, y_train = load_svmlight_file("12trainset")
    x_train = x_train.todense()
    x_test, y_test = load_svmlight_file("12testdata")
    x_test = x_test.todense()
    sk=SelectKBest(f_classif,9).fit(x_train,y_train)
    x_new=sk.transform(x_train)
    x_newtest=sk.transform(x_test)
    print(sk.scores_)
    print(x_new.shape)
    print(sk.get_support())
    # classifier
    clf = SVC(C=2, gamma=2)
    ovrclf = OneVsRestClassifier(clf, n_jobs=-1)
    ovrclf.fit(x_train,y_train)
    y_pred=ovrclf.predict(x_test)
    # write result
    with open("result.txt","w") as fw:
        for st in y_pred.tolist():
            fw.write(str(st)+'\n')
    print(np.array(y_pred).shape)

    target_names=['0','1','2','3']
    #result
    #sum_y = np.sum((np.array(y_pred)-np.array(y_test))**2)
    #print(classification_report(y_test,y_pred,target_names=target_names))
    #print("sougouVal: ",float(sum_y)/y_pred.shape[0])
    print(time.time()-start_time)
def main():

    dataTuples=getDataInFormat()
    print "Length of dataTuples is: ",  len(dataTuples)
    shuffle(dataTuples)
    trainTuples=dataTuples
    del dataTuples
    ids, labels, vectors= getLabelsAndVectors(trainTuples)
    del trainTuples
    followerCountsList = loadFollowerCountsFromFile()
    space=getSpace(vectors)
    reducedSpace=getReducedSpace(vectors, space)
    spaceWithMetaFeatures= augmentSpace(reducedSpace, emotionFeatures)

    print "Total # of features in your space is: ", len(space)
    print "Total # of features in your reducedSpace is: ", len(reducedSpace)
    oneHotVectors=getOneHotVectors(ids, labels, vectors,spaceWithMetaFeatures , followerCountsList)
    trainVectors, trainLabels=getOneHotVectorsAndLabels(oneHotVectors)
    del oneHotVectors
    clf = OneVsRestClassifier(SVC(C=1, kernel = 'linear',gamma=0.1, verbose= False, probability=False))
    clf.fit(trainVectors, trainLabels)
    
    print "\nDone fitting classifier on training data...\n"
    print "\nDone fitting classifier on training data...\n"
    print "="*50, "\n"
    print "Results with 10-fold cross validation:\n"
    print "="*50, "\n"
    predicted = cross_validation.cross_val_predict(clf, trainVectors, trainLabels, cv=10)
    print "*"*20
    print "\t accuracy_score\t", metrics.accuracy_score(trainLabels, predicted)
    print "*"*20
    print "precision_score\t", metrics.precision_score(trainLabels, predicted)
    print "recall_score\t", metrics.recall_score(trainLabels, predicted)
    print "\nclassification_report:\n\n", metrics.classification_report(trainLabels, predicted)
    print "\nconfusion_matrix:\n\n", metrics.confusion_matrix(trainLabels, predicted)
Example #10
def train_svm(X, y):
    """
    Create and train the Support Vector Machine.
    """
    svm = OneVsRestClassifier(SVC(C=1000000.0, gamma='auto', kernel='rbf'))
    svm.fit(X, y)
    return svm
def AgeClassifier(data_feature_stack,data_age_stack,test_size = 0.5):
	Age_range = np.unique(data_age_stack)
	# 923,  1529,   856,   1617,    13836,      6260,     1198

	AgeX_train,AgeX_test,AgeY_train,AgeY_test = preprocess(data_feature_stack,data_age_stack,test_size)
	print "fitting Age Clssfifer..."
	# parameters = (C=1.0, class_weight=None, dual=True, fit_intercept=True,\
	# intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',\
 #     random_state=0, tol=0.0001, verbose=0)

	clf = OneVsRestClassifier(LinearSVC(C = 0.001)).fit(AgeX_train, AgeY_train)


	print "predicting Age..."
	Age_test_result  = clf.predict(AgeX_test)
	Age_train_result = clf.predict(AgeX_train)	

	# Age_acc_test  = clf.score(AgeX_test, AgeY_test)
	# Age_acc_train = clf.score(AgeX_train, AgeY_train)
	Age_acc_test  = np.mean(Age_test_result == AgeY_test)
	Age_acc_train = np.mean(Age_train_result == AgeY_train)

	temp   = Age_test_result-AgeY_test
	error  = np.sqrt(temp**2)
	rmse   = np.mean(error)
	error2 = np.sqrt(temp[temp!=0]**2)
	rmse2  = np.mean(error2)


	pdb.set_trace()
	return clf, Age_acc_test,Age_acc_train
Example #12
def trainAndPredictLR(trainX, trainY, testX):
    """
    Logistic regression is used for predicting the target labels of the test data
    The probability of belonging to each of the labels is predicted for every test
    data and the labels with the top 10 probability values are extracted
    
    Input:
        1. trainX: ntrainingSamples * 2000 numpy matrix representing training data features
        2. trainY: ntrainingSamples * 185 numpy matrix representing the training data labels
        3. testX: ntestSamples * 2000 numpy matrix representing test data features
    
    Output:
        testY: ntestSamples * 10 numpy matrix representing the labels for the test data
    
    """
    clf = OneVsRestClassifier(LogisticRegression(C = 1.0))
    clf.fit(trainX, trainY)
    actY = clf.predict_proba(testX)
    testY = []
    # fetch the labels with max probability
    for prob in actY:
        y = []
        for i in range(10):
            index = np.argmax(prob, axis=0)
            classVal = classOrder[index]
            y.append(classVal)
            prob[index] = -1
        testY.append(y)
    return np.array(testY)
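The extraction loop above finds the top 10 labels by repeatedly zeroing out the argmax; a sketch of an equivalent vectorized alternative using np.argsort (classOrder is the same external label array the function already assumes):

# Equivalent top-10 extraction: argsort each row descending, then map indices to labels.
top10 = np.argsort(actY, axis=1)[:, ::-1][:, :10]
testY = np.array([[classOrder[i] for i in row] for row in top10])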
def run(data_path):
    print "Reading the dataset:", data_path
    mnist = fetch_mldata('MNIST original')
    mnist.data, mnist.target = shuffle(mnist.data, mnist.target)

    # Truncate the data
    n_train = 600
    n_test = 400

    # Define training and testing sets
    indices = arange(len(mnist.data))
    random.seed(0)
    train_idx = random.sample(indices, n_train)
    test_idx = random.sample(indices, n_test)
    X_train, y_train = mnist.data[train_idx], mnist.target[train_idx]
    X_test, y_test = mnist.data[test_idx], mnist.target[test_idx]

    # Apply a learning algorithm
    print "Applying a learning algorithm..."
    clf = OneVsRestClassifier(LinearSVC()).fit(X_train, y_train)

    # Make a prediction
    print "Making predictions..."
    y_pred = clf.predict(X_test)

    print y_pred

    # Evaluate the prediction
    print "Evaluating results..."
    print "Precision: \t", metrics.precision_score(y_test, y_pred)
    print "Recall: \t", metrics.recall_score(y_test, y_pred)
    print "F1 score: \t", metrics.f1_score(y_test, y_pred)
    print "Mean accuracy: \t", clf.score(X_test, y_test)
def run_classifier(sentences, labels, test_doc_list, output_file_path_list):
	import numpy as np

	train_matrix, tfidf = tf_idf_fit_transform(sentences)

	from sklearn.preprocessing import MultiLabelBinarizer
	mlb = MultiLabelBinarizer()
	label_matrix = mlb.fit_transform(labels)

	from sklearn.multiclass import OneVsRestClassifier
	from sklearn.svm import LinearSVC
	estimator = LinearSVC()
	classifier = OneVsRestClassifier(estimator, n_jobs=-1)
	classifier.fit(train_matrix, label_matrix)

	for test_doc, output_file_path in zip(test_doc_list, output_file_path_list):
		test_sentences = doc2sentences([test_doc])
		sentence_matrix = tfidf.transform(test_sentences)
		print("Shape of sentence matrix : ", sentence_matrix.shape)
		predictions = classifier.predict(sentence_matrix)

		from lxml import etree
		document = etree.Element('doc')
		doc_tree = etree.ElementTree(document)
		for i in range(len(test_sentences)):
			curr_pred = [mlb.classes_[x] for x in range(predictions.shape[1]) if predictions[i][x]==1]
			etree.SubElement(document, "Sent", classes=", ".join(curr_pred)).text = test_sentences[i]
		doc_tree.write(output_file_path)
def test_classifier_chain_vs_independent_models():
    # Verify that an ensemble of classifier chains (each of length
    # N) can achieve a higher Jaccard similarity score than N independent
    # models
    yeast = fetch_mldata('yeast')
    X = yeast['data']
    Y = yeast['target'].transpose().toarray()
    X_train = X[:2000, :]
    X_test = X[2000:, :]
    Y_train = Y[:2000, :]
    Y_test = Y[2000:, :]

    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)

    chain = ClassifierChain(LogisticRegression(),
                            order=np.array([0, 2, 4, 6, 8, 10,
                                            12, 1, 3, 5, 7, 9,
                                            11, 13]))
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    assert_greater(jaccard_similarity_score(Y_test, Y_pred_chain),
                   jaccard_similarity_score(Y_test, Y_pred_ovr))
Example #16
    def fit(self, df_X, df_y):
        if not df_y.shape[0] == df_X.shape[0]:
            raise ValueError("number of regions is not equal")
        if df_y.shape[1] != 1:
            raise ValueError("y needs to have 1 label column")

        le = LabelEncoder()
        y = le.fit_transform(df_y.iloc[:,0].values)

        clf = RandomForestClassifier(n_estimators=100)
        
        # Multiclass
        if len(le.classes_) > 2:
            orc = OneVsRestClassifier(clf)
            orc.fit(df_X.values, y)

            importances = np.array([c.feature_importances_ for c in orc.estimators_]).T
        else: # Only two classes
            clf.fit(df_X.values, y)
            importances = np.array([
                clf.feature_importances_,
                clf.feature_importances_
                ]).T
        
        for i,c in enumerate(le.classes_):
            
            diff = df_X.loc[y == i].quantile(q=0.75) - df_X.loc[y != i].quantile(q=0.75)
            sign = (diff >= 0) * 2 - 1
            importances[:,i] *= sign
        
        
        # create output DataFrame
        self.act_ = pd.DataFrame(importances,
                columns=le.inverse_transform(range(len(le.classes_))),
                index=df_X.columns)
def benchmark(clf_current):
    print('_' * 80)
    print("Test performance for: ")
    clf_descr = str(clf_current).split('(')[0]
    print(clf_descr)
    t0 = time()
    classif = OneVsRestClassifier(clf_current)
    classif.fit(X_train, Y_train.toarray())
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    t0 = time()
    if hasattr(clf_current,"decision_function"):
        dfmatrix = classif.decision_function(X_test)
        score = metrics.f1_score(Y_test.toarray(), df_to_preds(dfmatrix, k = 5))
    else:
        probsmatrix = classif.predict_proba(X_test)
        score = metrics.f1_score(Y_test.toarray(), probs_to_preds(probsmatrix, k = 5))
        
    test_time = time() - t0

    
    print("f1-score:   %0.7f" % score)
    print("test time:  %0.3fs" % test_time)

    print('_' * 80)
    return clf_descr, score, train_time, test_time
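df_to_preds and probs_to_preds are not shown in this snippet; purely as an assumption, here is a plausible top-k probs_to_preds that marks the k highest-scoring labels per row, matching the shape contract benchmark() relies on:

import numpy as np

def probs_to_preds(probsmatrix, k=5):
    # Hypothetical helper: set the k largest entries of each row to 1, the rest to 0.
    preds = np.zeros_like(probsmatrix, dtype=int)
    topk = np.argsort(probsmatrix, axis=1)[:, -k:]
    preds[np.arange(probsmatrix.shape[0])[:, None], topk] = 1
    return preds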
Example #18
def one_vs_all(X, y, test_size=0.2, run_num = 100, svm_type='linear'):
    """Trains 15 1 vs all SVM classifiers of specified type"""
    # Python has a wonderful wrapper function that creates 1 vs all classifiers!
    if svm_type == 'linear':
        estimator = LinearSVC()
    else:
        # This will automatically use RBF functions
        estimator = SVC()

    ovr = OneVsRestClassifier(estimator = estimator)

    acc_tr = []
    acc_tst = []

    for i in range(run_num):
        [X_train, X_test, y_train, y_test] = train_test_split(X, y,
                                                              test_size=test_size)
        # Train the classifier
        ovr.fit(X_train, y_train.ravel())

        # Work out the score on the training data. However there is nothing
        # to optimise for - we are just getting an idea of the accuracy for
        # training vs test data. box plot opportunity!
        tr_acc = ovr.score(X_train, y_train.ravel())
        tst_acc = ovr.score(X_test, y_test.ravel())

        acc_tr.append(tr_acc)
        acc_tst.append(tst_acc)

        # Not all of the data is used here, since using all of it tends to overfit the classifier.

    return ovr, acc_tr, acc_tst
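As the in-code comment hints, the per-run accuracies are a natural fit for a box plot; a minimal sketch, assuming matplotlib is available and X, y are already in scope:

import matplotlib.pyplot as plt

ovr, acc_tr, acc_tst = one_vs_all(X, y, run_num=50)
plt.boxplot([acc_tr, acc_tst], labels=['train', 'test'])
plt.ylabel('accuracy')
plt.show()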
class ClassDistanceMapper(TransformerMixin):
    """ Fit a OneVsRestClassifier for each sentiment class (against all others
        combined) and return the distances from the decision boundary for each
        class. Hence, this transformation can be seen as a dimensionality
        reduction from #words to #sentiment_classes (=5).

    """

    def __init__(self):
        """ Initialize a one-vs-rest multiclass classifer with a
            SGDClassifier. The choice of the SGDclassifier here is arbitrary,
            any other classifier might work as well.

        """
        self.clf = OneVsRestClassifier(LogisticRegression())

    def fit(self, X, y):
        """ Fit the multiclass classifier. """
        self.clf.fit(X, y)
        return self

    def transform(self, X):
        """ Return the distance of each sample from the decision boundary for
            each class.

        """
        return self.clf.decision_function(X)
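A sketch of plugging ClassDistanceMapper into a scikit-learn Pipeline, with illustrative variable names; the mapper shrinks the feature space from #words columns to one decision-function distance per class:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([
    ('bow', CountVectorizer()),
    ('dist', ClassDistanceMapper()),   # one column per sentiment class
    ('clf', LogisticRegression()),
])
pipe.fit(train_texts, train_sentiments)   # train_texts / train_sentiments are assumed inputs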
Example #20
def fit_multiclass_svm1(documents, idfs):
	model = gensim.models.doc2vec.Doc2Vec.load("train_doc2vec.model")
	X = np.zeros([4000, 300])
	X_test = np.zeros([490, 300])
	y = np.zeros(4000)
	y_test = np.zeros(490)
	i = 0
	for doc in documents[:4000]:
		x = np.zeros(300)
		count = 0
		for sent in doc["summary"]:
			for word in sent.split():
				if word in model:
					x = x + (idfs[word] * model[word])
					count += 1
		X[i, :] = x/count
		y[i] = doc["topic_id"]
		i += 1
	svm_model = OneVsRestClassifier(svm.SVC(kernel='poly', gamma=2)).fit(X, y)
	
	
	i = 0
	for doc in documents[4000:4490]:
		x = np.zeros(300)
		count = 0
		for sent in doc["summary"]:
			for word in sent.split():
				if word in model:
					x = x + (idfs[word] * model[word])
					count += 1
		X_test[i, :] = x/count
		y_test[i] = doc["topic_id"]
		i += 1
	print svm_model.score(X_test, y_test)
Example #21
def make_classifier():
    test_size=0
    X, y = make_X_Y()
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=test_size)
    X_train = X_train.astype(int)
    X_test = X_test.astype(int)
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)
    clf = OneVsRestClassifier(SVC(kernel='linear', class_weight='auto', probability=True))
    clf.fit(X_train, y_train)
    try:
        y_suggest = clf.predict_proba(X_test)
        nn = 0
        n = 0
        for y_s, y_t in zip(y_suggest, y_test):
            s1 = chords_Y[np.argmax(y_s)]
            y_s[np.argmax(y_s)]=0
            s2 = chords_Y[np.argmax(y_s)]
            t = chords_Y[np.argmax(y_t)]        
            print 'Suggest: ' + s1 + ' or ' + s2 + '  Real: ' + t
            n = n+1
            if s1==t:
                nn = nn+1
        if n>0:
            print 'Accuracy is ' + str(float(nn)/n)
    except ValueError:
        pass
    #print classification_report(clf.predict(X_test), y_test)
    pickle.dump(clf, open("classifier.bin", "wb"))   
Example #22
def train_linear(X, Y, splits, model_config, results_dir, best_k=10, validation_score='f1',
                threshold_score='f1', threshold_criterion='zack', fn_prefix='', label_idx=None):
    label_idx = np.arange(Y.shape[1]) if label_idx is None else label_idx
    best_perf = None
    best_C = None
    best_model = None
    for C in np.logspace(-3,3, num=20):
        sys.stdout.write('Training Logistic Regression with C={0}...'.format(C))
        sys.stdout.flush()
        model = OneVsRestClassifier(LogisticRegression(C=C))
        try:
            model.fit(X[splits[0]], Y[splits[0]])
        except KeyboardInterrupt:
            sys.stdout.write('training interrupted...')
            break
        except:
            raise

        Yp = model.predict_proba(X[splits[1]])
        perf = compute_micro_evaluations(Y[splits[1]][:,label_idx], Yp[:,label_idx], k=best_k,
                                        threshold_score=threshold_score, criterion=threshold_criterion)
        sys.stdout.write(' {0}={1:.4f}'.format(validation_score, perf[validation_score]))
        sys.stdout.flush()
        if best_perf is None or perf[validation_score] > best_perf[validation_score]:
            best_perf = perf
            best_model = model
            best_C = C
            sys.stdout.write(' *BEST')
        sys.stdout.write('\n')

    model_config['C'] = best_C
    cPickle.dump(best_model, open(os.path.join(results_dir, fn_prefix + '-model.pkl'), 'wb'))

    return best_model, model_config
Example #23
def ml_train(datasetFilePath, falsePredictionsFilePath, unknownPredictionsFilePath, confusionMatricesDir, classifierFilePath):
    logger.info("start of training and testing phase")

    classifier = OneVsRestClassifier(SVC(kernel='linear', probability=True), n_jobs=NUMBER_OF_CPUS_TO_USE)

    logger.info("loading data set")
    dataset, features_names = load_dataset(datasetFilePath)

    #limited_dataset = limit_dataset(dataset)
    limited_dataset = dataset
    
    ml_dataset = split_dataset(limited_dataset, len(features_names))

    logger.info("fitting training set X_train - %s, y_train - %s" % (ml_dataset.X_train.shape, ml_dataset.y_train.shape))
    classifier.fit(ml_dataset.X_train, ml_dataset.y_train)

    logger.info("predicting test set X_test - %s, y_test - %s" % (ml_dataset.X_test.shape, ml_dataset.y_test.shape))
    y_pred = classifier.predict(ml_dataset.X_test)

    y_pred_probabilities = classifier.predict_proba(ml_dataset.X_test)

    y_pred_with_unknown_cls, y_pred_fictive, max_y_pred_probs = process_prediction_vector(ml_dataset.y_test, y_pred, y_pred_probabilities)

    validation(ml_dataset.y_test, y_pred, y_pred_with_unknown_cls, y_pred_fictive, list(classifier.classes_) + ["unknown"])
    plot_confusion_matrices(ml_dataset.y_test, y_pred, list(classifier.classes_) + ["unknown"], confusionMatricesDir, "1")
    plot_confusion_matrices(ml_dataset.y_test, y_pred_with_unknown_cls, list(classifier.classes_) + ["unknown"], confusionMatricesDir, "2")
    plot_confusion_matrices(ml_dataset.y_test, y_pred_fictive, list(classifier.classes_) + ["unknown"], confusionMatricesDir, "3")

    produce_output(ml_dataset.y_test, y_pred, max_y_pred_probs, ml_dataset.test_terms_name, falsePredictionsFilePath, unknownPredictionsFilePath)

    logger.info("exporting classifier model")
    joblib.dump(classifier, classifierFilePath)

    logger.info("end of training and testing phase")
def main():
    word_vec_dict = readGloveData('../glove.twitter.27B/glove.twitter.27B.25d.txt')
    tweets = readTweets('../dataset_raw/semeval2016-task6-trainingdata.txt')

    tweetVectors = getTweetVectors(tweets[0:len(tweets) - 1], word_vec_dict)
    print tweets[0]
    print getSumVectors(tweets[0], word_vec_dict)
    tweetClasses = set(tweets[-1])

    mapping = {'favor': 1, 'none': 0, 'against': 1}

    tweetClasses = np.asarray([mapping[x] for x in tweets[-1]])
    tweetData = np.asarray(tweetVectors)
    print tweetClasses.shape
    print tweetData.shape
    X = tweetData
    Y = tweetClasses
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    # X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, test_size=0.3, random_state=0)
    X_train = X[0:int(0.7 * len(X))]
    y_train = Y[0:int(0.7 * len(Y))]
    X_test = X[int(0.7 * len(X)) : len(X)]
    y_test = Y[int(0.7 * len(Y)) : len(Y)]
    clf.fit(X_train, y_train)
    print clf.score(X_test, y_test)
Example #25
def compute_ranking(learnFullModel=False):
    path='/home/arya/PubMed/GEO/Datasets/'
    modelpath=path+'libsvm/model/'
    if not os.path.exists(modelpath):
        os.makedirs(modelpath)
    outpath='{}libsvm/out/'.format(path)
    sys.stdout=open('{}SVM.log'.format('/home/arya/PubMed/GEO/Log/'),'w')
    sys.stderr=open('{}SVM.err'.format('/home/arya/PubMed/GEO/Log/'),'w')
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    X, Y = load_svmlight_file(path+'Corpus.libsvm',multilabel=True)
    Y=np.array(Y)
    if learnFullModel:
        model=OneVsRestClassifier(LinearSVC(random_state=0)).fit(X, Y)
        joblib.dump(model, modelpath+'Model.libsvm')
        print 'The Full Model is Saved!'
    Folds=pd.read_pickle(path+'Folds.df')
    for fold in range(Folds.shape[1]):
        start=time()
        Xtr,Ytr=X[Folds[fold].values,:],Y[Folds[fold].values]
        print 'learning on fold...', Xtr.shape, fold
        sys.stdout.flush()
        model=OneVsRestClassifier(LinearSVC(random_state=0)).fit(Xtr, Ytr)
        Xte=X[~Folds[fold].values,:]
        labels=model.classes_
#         Yte=remove_unknown_classes(Yte, labels)
#         idx=np.array(map(lambda x: len(x)>0,Yte))
#         Yte=np.array(Yte)[idx]
#         Xte=Xte[idx]
        print 'predicting...', Xte.shape
        sys.stdout.flush()
        pd.DataFrame(columns=labels,data=model.decision_function(Xte)).to_pickle('{}deci.{}.df'.format(outpath,fold))
#         (pd.DataFrame(columns=labels,data=MultiLabelBinarizer().fit_transform(list(Yte)+[labels]))).iloc[:-1].to_pickle('{}labels.{}.df'.format(outpath,fold))
#         ranking.to_pickle('{}ranking.{}.df'.format(outpath,fold))
        print 'Done in {:.0f} minutes'.format((time()-start)/60.0)
def runDigitsDensity(n,_i, j):
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    ma = hw7u.Kernel(ktype=metric[j]+'_sci').compute
    #skclf = KernelDensity(metric=ma)
    myclf = hw7u.MyKNN(metric=metric[j], density=True)
    mnsize = n
    df = hw6u.load_mnist_features(mnsize)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y, dtype=np.float), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=np.float)
    print 'my fit'
    clf = OneVsRestClassifier(myclf).fit(X, y)
    print 'scikit fit'
    #skclf = skclf.fit(X, y)
    print 'my predict'
    y_pred = clf.predict(X_test)
    myacc = accuracy_score(y_test, y_pred)
    print '({})'.format(myacc)
    #print 'scikit predict'
    #sk_pred = skclf.predict(X_test)
    #print sk_pred
    print y_test
    print y_pred
    #print 'SciKit Accuracy: {}  My Accuracy: {}'.format(accuracy_score(y_test, sk_pred), myacc)
    print 'My Accuracy: {}'.format(myacc)
def runDigits(n, skclf, myclf):
    mnsize = n
    df = hw6u.load_mnist_features(mnsize)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y, dtype=np.float), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=np.float)
    print 'my fit'
    clf = OneVsRestClassifier(myclf).fit(X, y)
    print 'scikit fit'
    skclf = skclf.fit(X, y)
    print 'my predict'
    y_pred = clf.predict(X_test)
    myacc = accuracy_score(y_test, y_pred)
    print '({})'.format(myacc)
    print 'scikit predict'
    sk_pred = skclf.predict(X_test)
    print sk_pred
    print y_test
    print y_pred
    print 'SciKit Accuracy: {}  My Accuracy: {}'.format(accuracy_score(y_test, sk_pred), myacc)
Example #28
def multiclass_AUC(clf, X, Y):
    # Binarize the output
    X, Y = np.array(X), np.array(Y)
    Y = label_binarize(Y, classes=list(set(Y)))
    n_classes = Y.shape[1]

    # shuffle and split training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5,
                                                        random_state=0)
    # Learn to predict each class against the other
    classifier = OneVsRestClassifier(clf)
    Y_score = classifier.fit(X_train, Y_train).predict(X_test)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(Y_test[:, i], Y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(Y_test.ravel(), Y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    print "AUC for multiclass {}: {}".format(clf.__class__.__name__, roc_auc["micro"])
Example #29
def prepare_multiclass_clf(X, y):
    clf = GridSearchCV(LogisticRegression(penalty='l1'),
                       {'C': np.logspace(-4, 2, 10)},
                       scoring='accuracy', cv=5)
    multi_clf = OneVsRestClassifier(clf)
    multi_clf.fit(X, y)
    return multi_clf
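Note that nesting GridSearchCV inside OneVsRestClassifier tunes C separately for each one-vs-rest subproblem. A hedged usage sketch on synthetic data (assuming the older scikit-learn defaults this snippet targets, where LogisticRegression(penalty='l1') resolves to the liblinear solver):

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=20, n_informative=6,
                           n_classes=3, random_state=0)
clf = prepare_multiclass_clf(X, y)
print(clf.predict(X[:5]))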
Example #30
def fit_multiclass_svm(documents, idfs):
	model = gensim.models.Word2Vec.load("train_word2vec.model")
	dim = 50
	X = np.zeros([4000, dim])
	X_test = np.zeros([490, dim])
	y = np.zeros(4000)
	y_test = np.zeros(490)
	i = 0
	for doc in documents[:4000]:
		x = np.zeros(dim)
		count = 0
		for sent in doc["summary"]:
			for word in sent.split():
				if word in model:
					x = x + (idfs[word] * model[word])
					count += 1
		X[i, :] = x/count
		y[i] = doc["topic_id"]
		i += 1
	svm_model = OneVsRestClassifier(LinearSVC(random_state=0, C = 1)).fit(X, y)
	
	
	i = 0
	for doc in documents[4000:4490]:
		x = np.zeros(dim)
		count = 0
		for sent in doc["summary"]:
			for word in sent.split():
				if word in model:
					x = x + (idfs[word] * model[word])
					count += 1
		X_test[i, :] = x/count
		y_test[i] = doc["topic_id"]
		i += 1
	print svm_model.score(X_test, y_test)
def model(model, OvsR=False):
    if OvsR:
        model = OneVsRestClassifier(model, n_jobs=-1)

    return model
# --------------
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score


clf = OneVsRestClassifier(LogisticRegression())
clf1 = OneVsRestClassifier(LogisticRegression())
model_fit_all_features = clf1.fit(X_train, Y_train)

predictions_all_features = model_fit_all_features.predict(X_test)

score_all_features = accuracy_score(Y_test, predictions_all_features)

print(score_all_features)

model_fit_top_features = clf.fit(scaled_features_train_df[top_k_predictors],
                                 Y_train)

predictions_top_features = model_fit_top_features.predict(
    scaled_features_train_df[top_k_predictors])
Example #33
y = data['grp']
X = data.iloc[:, 1:31]

# Binarize the output
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]

# split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.3,
                                                    random_state=0)

# Learn to predict each class against the other
classifier = OneVsRestClassifier(
    svm.SVC(kernel='linear', probability=True, random_state=0))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

ova = classifier.fit(X_train, y_train)
y_pred = ova.predict(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area (note: review what these metrics mean)
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
Example #34
    k = 10
    scores = cross_val_score(modelo, treino_dados, treino_marcacoes, cv=k)
    taxa_de_acerto = np.mean(scores)

    msg = "Taxa de acerto do {0}: {1}".format(nome, taxa_de_acerto)
    print(msg)
    return taxa_de_acerto


resultados = {}

from sklearn.multiclass import OneVsRestClassifier
#Algorithm that will run under the OneVsRestClassifier
from sklearn.svm import LinearSVC
#random_state makes the run fixed rather than random
modeloOneVsRest = OneVsRestClassifier(LinearSVC(random_state=0))
resultadoOneVsRest = fit_and_predict("OneVsRestClassifier", modeloOneVsRest,
                                     treino_dados, treino_marcacoes)
#Add to the dictionary
resultados[resultadoOneVsRest] = modeloOneVsRest

from sklearn.multiclass import OneVsOneClassifier

modeloOneVsOne = OneVsOneClassifier(LinearSVC(random_state=0))
resultadoOneVsOne = fit_and_predict("OneVsOne", modeloOneVsOne, treino_dados,
                                    treino_marcacoes)
resultados[resultadoOneVsOne] = modeloOneVsOne

from sklearn.naive_bayes import MultinomialNB

modeloMultinomial = MultinomialNB()
            elif m == 'MCB':
                pool_classifiers = RandomForestClassifier(n_estimators=10)
                pool_classifiers.fit(Feature_train, Label_train.ravel())
                mcb = MCB(pool_classifiers)
                mcb.fit(Feature_train, Label_train.ravel())
                Label_predict = mcb.predict(Feature_test)
            elif m == 'DES-MI':
                pool_classifiers = RandomForestClassifier(n_estimators=10)
                pool_classifiers.fit(Feature_train, Label_train.ravel())
                dmi = DESMI(pool_classifiers)
                dmi.fit(Feature_train, Label_train.ravel())
                Label_predict = dmi.predict(Feature_test)
            elif m == 'One_vs_Rest-SMOTE-XGBoost':
                sm = SMOTE()
                Feature_train_o, Label_train_o = sm.fit_sample(Feature_train, Label_train.ravel())
                clf = OneVsRestClassifier(xgboost.XGBClassifier(**BayesOp_Parameters))
                clf.fit(Feature_train_o, Label_train_o)
                Label_predict = clf.predict(Feature_test)
            elif m == 'One_vs_Rest-XGBoost':
                clf = OneVsRestClassifier(xgboost.XGBClassifier(**BayesOp_Parameters))
                clf.fit(Feature_train, Label_train.ravel())
                Label_predict = clf.predict(Feature_test)

            ml_record.measure(i, Label_test, Label_predict, 'weighted')
            i += 1

        file_wirte = "Result_One_vs_All_BayOp_XGBoost_G_mean_GA_99_pop.txt"
        ml_record.output(file_wirte, m, Dir)


    def run_data(self, flag, model_name):
        if flag == 'orig':
            print(
                '\n(¯`·._.·(¯`·._.· Evaluation on original data ·._.·´¯)·._.·´¯)\n'
            )
            y = self.y
            y_pca = y
        elif flag == 'full_anno':
            print(
                '\n(¯`·._.·(¯`·._.· Evaluation on full annotations ·._.·´¯)·._.·´¯)\n'
            )
            print(self.filt_txt)
            y = self.y_all_anno_max
            y_pca = np.ravel(self.oe.inverse_transform(y))
        elif flag == 'filt':
            print(
                '\n(¯`·._.·(¯`·._.· Evaluation on filtered data ·._.·´¯)·._.·´¯)\n'
            )
            print(self.filt_txt)
            y = self.y_filt_anno_max
            y_pca = np.ravel(self.oe.inverse_transform(y))
        if self.pca_plot:
            fig = plt.figure()
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(self.X[:, 0], self.X[:, 1], self.X[:, 2], c=y_pca)
            plt.show()

        if model_name == 'svm':
            # # support vector classifier
            if self.multi_label is False:
                if flag != 'orig':
                    y = np.ravel(self.oe.inverse_transform(y))
                kfold = model_selection.StratifiedKFold(n_splits=5,
                                                        shuffle=True,
                                                        random_state=self.seed)
                clf_cv = svm.SVC(C=self.C,
                                 gamma=self.gamma,
                                 random_state=self.seed)
            # if multilabel
            else:
                if flag != 'orig':
                    kfold = IterativeStratification(n_splits=5,
                                                    order=1,
                                                    random_state=self.seed)
                else:
                    # kfold = model_selection.KFold(n_splits=5, random_state=self.seed)
                    kfold = model_selection.StratifiedKFold(
                        n_splits=5, random_state=self.seed)

                clf_cv = OneVsRestClassifier(
                    svm.SVC(C=self.C, gamma=self.gamma,
                            random_state=self.seed))
                # clf_cv = svm.SVC(C=self.C, gamma=self.gamma, random_state=1987)

            # best_params = self.svc_param_selection(clf_cv, self.X, self.y, kfold)
            # print('Best params:', best_params)
            pre = model_selection.cross_val_score(clf_cv,
                                                  self.X,
                                                  y,
                                                  cv=kfold,
                                                  scoring='precision_macro')
            rec = model_selection.cross_val_score(clf_cv,
                                                  self.X,
                                                  y,
                                                  cv=kfold,
                                                  scoring='recall_macro')
            fsc = model_selection.cross_val_score(clf_cv,
                                                  self.X,
                                                  y,
                                                  cv=kfold,
                                                  scoring='f1_macro')
            # score, perm_sc, pvalue = model_selection.permutation_test_score(clf_cv, self.X, self.y, cv=kfold, scoring='f1_macro', n_permutations=100, n_jobs=-1)

            print('5-Fold CV Precision: {} , STD: {}'.format(
                pre.mean(), pre.std()))
            print('5-Fold CV Recall: {} , STD: {}'.format(
                rec.mean(), rec.std()))
            print('5-Fold CV F1-Score: {} , STD: {}'.format(
                fsc.mean(), fsc.std()))

            # print('Classification score {} (pvalue : {})'.format(score, pvalue))
            # X_train, X_test, y_train, y_test = model_selection.train_test_split(self.X, y, test_size=0.5, random_state=1987)

            # clf_cv.fit(X_train, y_train)
            # y_pred = clf_cv.predict(X_test)

            # report = classification_report(y_test, y_pred)
            # print(report)

            return pre, rec, fsc

        elif model_name == 'gmm':
            # gaussian mixture model
            clf_cv = mixture.GaussianMixture(n_components=len(np.unique(y)),
                                             covariance_type='full',
                                             n_init=10,
                                             random_state=1987)
            # clf_cv = mixture.BayesianGaussianMixture(n_components=len(np.unique(self.y)), covariance_type='full', n_init=10, random_state=1987)
            clf_cv.fit(self.X)
            y_pred = clf_cv.predict(self.X)
            # pdb.set_trace()
            return y, y_pred
Example #37
    class SKModel(object):
        '''
        This class facilitates training, testing, storage, and deployment of
        scikit-learn models.
        '''
        def __init__(self, estimator=None, encoder=None):
            self.estimator = estimator
            self.id = None
            self.deployed = False
            self.call_count = 0
            self.last_call = None
            self.recommendation_threshold = 0.0
            self.train_results = None
            self.test_results = None
            self.dependent = None
            self.independent = None
            self.model = None
            self.encoder = encoder
            self.tpr = None
            self.fpr = None
            self.roc_auc = None
            self.model_path = None
            self.encoder_path = None
            self.encoder_type = None
            self.train_timestamp = None
            self.train_time = None
            self.train_data_balance = None
            self.test_timestamp = None
            self.test_time = None

        def train(self, data):

            if self.estimator is None:
                logging.warning(
                    'Model estimator not yet specified. Please define or load an estimator.')

            self.model = OneVsRestClassifier(self.estimator).fit(
                data.X_train, data.y_train)
            self.dependent = data.dependent
            independent_vars = []
            for i in data.independent:
                independent_vars.append({"name": i})
            self.independent = independent_vars

            train_results, timestamp, train_time, train_data_balance = Models(
            )._train(self.model,
                     data.X_train,
                     data.y_train,
                     balance=data.balance,
                     encoder=self.encoder)

            self.train_results = train_results
            self.train_timestamp = timestamp
            self.train_time = train_time
            self.train_data_balance = train_data_balance

        def test(self, data):

            if self.model is None:
                logging.warning(
                    'Model not yet specified. Please train or load a model.')

            test_results, timestamp, test_time = Models()._test(
                self.model, data.X_test, data.y_test)

            self.test_results = test_results
            self.test_timestamp = timestamp
            self.test_time = test_time

        def predict(self, X):

            if self.model is None:
                logging.warning(
                    'Model not yet specified. Please train or load a model.')

            y_pred = self.model.predict(X)

            return y_pred

        def predict_proba(self, X):

            if self.model is None:
                logging.warning(
                    'Model not yet specified. Please train or load a model.')

            Y_pred_proba = self.model.predict_proba(X)

            return Y_pred_proba

        def store(self,
                  model_path,
                  server_config,
                  encoder_path=None,
                  encoder=None,
                  override=False):

            Models(server_config=server_config)._store(
                model=self,
                model_path=model_path,
                encoder_path=encoder_path,
                encoder=encoder,
                override=override)
            logging.info('Model stored successfully.')

        def load_model(self, model_id, server_config):
            models_connection = Models(server_config=server_config)
            model_info = models_connection._get_info(model_id)

            self.id = model_info['models']['id']
            self.deployed = model_info['models']['deployed']
            self.call_count = model_info['models']['callCount']
            self.last_call = model_info['models']['lastCall']
            self.recommendation_threshold = model_info['models'][
                'recommendationThreshold']
            self.train_results = {
                'accuracy': model_info['models']['trainAccuracy'],
                'recall': model_info['models']['trainRecall'],
                'precision': model_info['models']['trainPrecision'],
                'f1': model_info['models']['trainF1']
            }
            self.test_results = {
                'accuracy': model_info['models']['testAccuracy'],
                'recall': model_info['models']['testRecall'],
                'precision': model_info['models']['testPrecision'],
                'f1': model_info['models']['testF1']
            }
            self.dependent = model_info['models']['dependent']
            self.independent = model_info['models']['independent']
            self.model_path = model_info['models']['modelPath']
            self.encoder_path = model_info['models']['encoderPath']
            self.encoder_type = model_info['models']['encoderType']
            self.train_timestamp = model_info['models']['lastTrainedDate']
            self.train_time = model_info['models']['trainTime']
            self.train_data_balance = model_info['models']['trainDataBalance']
            self.test_timestamp = model_info['models']['lastTestedDate']
            self.test_time = model_info['models']['testTime']

            model = models_connection._load_from_bucket(
                model_info['models']['modelPath'])
            self.model = model
            self.estimator = self.model.estimator
            if self.encoder_path is not None:
                self.encoder = models_connection._load_from_bucket(
                    self.encoder_path)

            return self

        @staticmethod
        def load_generators(model_id, server_config):
            models_connection = Models(server_config=server_config)
            model_info = models_connection._get_info(model_id)
            generators = []
            for i in model_info['models']['independent']:
                if isinstance(i, dict):
                    if 'generator_path' in i.keys():
                        func = models_connection._load_from_bucket(
                            i['generator_path'])
                        generators.append(func)

            return generators

        def delete_model(self, model_id, server_config):
            models_connection = Models(server_config=server_config)
            model_info = models_connection._get_info(model_id)
            # delete the modelPath and encoderPath objects in S3
            models_connection._delete_from_bucket(
                model_info['models']['modelPath'])
            models_connection._delete_from_bucket(
                model_info['models']['encoderPath'])
            # delete generators
            self.delete_generators(model_id, server_config)
            # then delete from Elasticsearch
            models_connection._delete_from_index(model_id)

        @staticmethod
        def delete_generators(model_id, server_config):
            models_connection = Models(server_config=server_config)
            model_info = models_connection._get_info(model_id)
            all_models = models_connection.index.get()[0]

            generator_paths = []
            to_delete = []
            for i in model_info['models']['independent']:
                if isinstance(i, dict):
                    if 'generator_path' in i.keys():
                        generator_paths.append(i['generator_path'])
                        to_delete.append(i['generator_path'])
            for m in all_models['models']:
                if m['_id'] != model_info['models']['id']:
                    for i in m['_source']['independent']:
                        if isinstance(i, dict):
                            if 'generator_path' in i.keys():
                                if i['generator_path'] in generator_paths and i[
                                        'generator_path'] in to_delete:
                                    to_delete.remove(i['generator_path'])
                                    logging.info(
                                        i['generator_path'] +
                                        ' shared with another model, skipping delete.'
                                    )
            for i in to_delete:
                models_connection._delete_from_bucket(i)

        def deploy(self,
                   server_config,
                   deploy=False,
                   recommendation_threshold=0.0):
            if not self.model_path or not self.encoder_path:
                logging.warning(
                    'Must store model and encoder prior to deployment.')
            else:
                self.deployed = deploy
                self.recommendation_threshold = recommendation_threshold
                Models(server_config=server_config)._deploy(model=self)
                logging.info(
                    'Model deployed successfully and will be available after the next server restart.'
                )

        def tag_generator(self,
                          func,
                          output_var,
                          input_vars,
                          generator_path=None):
            if generator_path:
                generator_path = generator_path
            else:
                generator_path = func.__name__ + '.pickle'
            if not isinstance(self.independent, list):
                logging.warning('Independent variables not defined as a list.')
                sys.exit()
            for i in self.independent:
                if i['name'] == output_var:
                    i['inputs'] = input_vars
                    i['generator_path'] = generator_path
            # create tmp directory if not present
            if not os.path.exists('tmp/'):
                os.makedirs('tmp/')
                logging.info('Created directory tmp to tag generator.')
            with open('tmp/' + generator_path, 'wb') as g:
                dill.dump(func, g, protocol=dill.HIGHEST_PROTOCOL)
            logging.info('Generator tagged successfully.')
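A hedged sketch of the intended train/test flow for SKModel; the data object is assumed to expose X_train, y_train, X_test, y_test, dependent, independent, and balance as the methods above require, and Models/server_config remain the same external services used throughout the class:

from sklearn.linear_model import LogisticRegression

model = SKModel(estimator=LogisticRegression())
model.train(data)                  # fits the OneVsRestClassifier and fills train_results
model.test(data)                   # fills test_results
preds = model.predict(data.X_test)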
Example #38
    accuracy = accuracy_score(y_val, predicted)
    f1_score_macro = f1_score(y_val, predicted, average='macro')
    f1_score_micro = f1_score(y_val, predicted, average='micro')
    f1_score_weighted = f1_score(y_val, predicted, average='weighted')
    print("accuracy:", accuracy)
    print("f1_score_macro:", f1_score_macro)
    print("f1_score_micro:", f1_score_micro)
    print("f1_score_weighted:", f1_score_weighted)
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

# TF-IDF + Naive Bayes model ---------------------------------------------------------------------------------------------------
NB_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2), token_pattern=r'(\S+)')),
    ('clf', OneVsRestClassifier(MultinomialNB())),
])

NB_pipeline.fit(X_train, y_train)
prob=NB_pipeline.predict_proba(X_val)
predicted = NB_pipeline.predict(X_val)
print_evaluation_scores(y_val, predicted)


#TF-IDF + logistic regression ---------------------------------------------------------------------------------------------------------
LogReg_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2), token_pattern=r'(\S+)')),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='lbfgs',max_iter=10000), n_jobs=1)),
])

LogReg_pipeline.fit(X_train, y_train)
    validation_result = algorithm.predict(X_validation)

    matches = validation_result == Y_validation

    total_matches = sum(matches)
    total_elements_test = len(Y_validation)

    print "Total de elementos de validação", total_elements_test

    print "Taxa de acertos do melhor algoritmo com elementos de validação: {0} %".format(
        round(100.0 * total_matches / total_elements_test, 2))


multinomialModel = MultinomialNB()
adaBoostModel = AdaBoostClassifier()
oneVsRestModel = OneVsRestClassifier(LinearSVC(random_state=0))
oneVsOneModel = OneVsOneClassifier(LinearSVC(random_state=0))

total_matches_multinomial = fit_and_predict(multinomialModel, "MultinomialNB",
                                            X_trainning, Y_trainning)

total_matches_adaboost = fit_and_predict(adaBoostModel, "AdaBoost",
                                         X_trainning, Y_trainning)

total_matches_one_vs_rest = fit_and_predict(oneVsRestModel, "OneVsRest",
                                            X_trainning, Y_trainning)

total_matches_one_vs_one = fit_and_predict(oneVsOneModel, "OneVsOne",
                                           X_trainning, Y_trainning)

print "Total de elementos analisados no teste: {0}".format(len(Y_trainning))
# accuracy with tfidf vectorizer
acc_tfidf_nb = accuracy_score(nb_2.predict(X_test_tfidf), Y_test)

# display accuracies
print(acc_count_nb)
print(acc_tfidf_nb)

# Code ends here


# --------------
import warnings
warnings.filterwarnings('ignore')

# initialize logistic regression
logreg_1 = OneVsRestClassifier(LogisticRegression(random_state=10))
logreg_2 = OneVsRestClassifier(LogisticRegression(random_state=10))
# fit on count vectorizer training data
logreg_1.fit(X_train_count, Y_train)
# fit on tfidf vectorizer training data
logreg_2.fit(X_train_tfidf, Y_train)

# accuracy with count vectorizer
acc_count_logreg = accuracy_score(logreg_1.predict(X_test_count), Y_test)

# accuracy with tfidf vectorizer
acc_tfidf_logreg = accuracy_score(logreg_2.predict(X_test_tfidf), Y_test)

# display accuracies

print(acc_count_logreg)
Example #41
def score(emb, startfrom0=False, topk=False):

    # 0. Files
    #embeddings_file = "blogcatalog.embeddings"
    list_of_files = glob.glob('../emb/kaggle/*.emb')
    matfile = mat_file
    embeddings_file = emb_file

    # 2. Load labels
    mat = loadmat(matfile)
    A = mat['network']
    graph = sparse2graph(A)
    labels_matrix = mat['group']

    if startfrom0:
        index_align = 0
    else:
        index_align = 1

    features_matrix_array = []
    dw_features_matrix_array = {}
    cf_features_matrix_array = {}
    cfi_features_matrix_array = {}

    if all_file:
        for f in list_of_files:
            embed = numpy.loadtxt(f, skiprows=1)
            features_matrix = numpy.asarray([
                embed[numpy.where(embed[:, 0] == node + index_align), 1:][0, 0]
                for node in range(len(graph))
            ])
            features_matrix = numpy.reshape(
                features_matrix,
                [features_matrix.shape[0], features_matrix.shape[-1]])
            if os.path.basename(
                    os.path.splitext(f)[0]).split('_')[-1] == 'cfi':
                cfi_features_matrix_array['cfi'] = features_matrix
            elif os.path.basename(
                    os.path.splitext(f)[0]).split('_')[-1] == 'cf':
                cf_features_matrix_array['cf'] = features_matrix
            else:
                nw = int(
                    os.path.basename(os.path.splitext(f)[0]).split('_')[-1])
                dw_features_matrix_array[nw] = features_matrix
        features_matrix_array.append(dw_features_matrix_array)
        features_matrix_array.append(cf_features_matrix_array)
        features_matrix_array.append(cfi_features_matrix_array)

    else:
        if emb is None:
            # 1. Load Embeddings
            embed = numpy.loadtxt(embeddings_file, skiprows=1)
            features_matrix = numpy.asarray([
                embed[numpy.where(embed[:, 0] == node + index_align), 1:][0, 0]
                for node in range(len(graph))
            ])
            features_matrix = numpy.reshape(
                features_matrix,
                [features_matrix.shape[0], features_matrix.shape[-1]])
        else:
            features_matrix = emb
        # wrap in a dict so the scoring loop below can iterate over keys
        features_matrix_array.append({'emb': features_matrix})

    res = []

    training_percents = [0.3, 0.5, 0.9]
    # uncomment for all training percents
    #training_percents = numpy.asarray(range(1,10))*.1
    for emb in features_matrix_array:
        score_array = {}
        for key in emb.keys():
            emb_buf = emb[key]
            # 3. to score each train/test group
            all_results = defaultdict(list)

            # 2. Shuffle, to create train/test groups
            shuffles = []
            number_shuffles = 2
            for x in range(number_shuffles):
                shuffles.append(skshuffle(emb_buf, labels_matrix))

            for train_percent in training_percents:
                for shuf in shuffles:

                    X, y = shuf

                    training_size = int(train_percent * X.shape[0])

                    X_train = X[:training_size, :]
                    y_train_ = y[:training_size]

                    y_train = [[] for x in xrange(y_train_.shape[0])]

                    cy = y_train_.tocoo()
                    for i, j in izip(cy.row, cy.col):
                        y_train[i].append(j)

                    #mlb = MultiLabelBinarizer()
                    #y_train_onehot = mlb.fit_transform(y_train)
                    y_train_onehot = label2onehot(
                        y_train,
                        labels_matrix.toarray().shape[1])

                    #assert sum(len(l) for l in y_train) == y_train_.nnz

                    X_test = X[training_size:, :]
                    y_test_ = y[training_size:]

                    y_test = [[] for x in xrange(y_test_.shape[0])]

                    cy = y_test_.tocoo()
                    for i, j in izip(cy.row, cy.col):
                        y_test[i].append(j)

                    #y_test_onehot = mlb.fit_transform(y_test)
                    y_test_onehot = label2onehot(
                        y_test,
                        labels_matrix.toarray().shape[1])

                    if topk:
                        clf = TopKRanker(LogisticRegression(max_iter=500))
                    else:
                        clf = OneVsRestClassifier(
                            LogisticRegression(max_iter=500))

                    clf.fit(X_train, y_train_onehot)

                    if topk:
                        # find out how many labels should be predicted
                        top_k_list = [len(l) for l in y_test]
                        preds = clf.predict(X_test, top_k_list)
                        preds = label2onehot(preds,
                                             labels_matrix.toarray().shape[1])
                    else:
                        preds = clf.predict(X_test)

                    results = {}
                    averages = ["micro", "macro", "samples", "weighted"]
                    for average in averages:
                        results[average] = f1_score(y_test_onehot,
                                                    preds,
                                                    average=average)

                    all_results[train_percent].append(results)

            print 'Results, using embeddings of dimensionality', X.shape[1]
            print '-------------------'
            for train_percent in sorted(all_results.keys()):
                print 'Train percent:', train_percent
                for x in all_results[train_percent]:
                    print x
                print '-------------------'

            score_array[key] = all_results
        res.append(score_array)

    # NOTE: this unpacking assumes the all_file branch ran, so that res holds
    # the deepwalk, cf and cfi score dicts in that order.
    dw_res, cf_res, cfi_res = res[0], res[1], res[2]

    averages = ["micro", "macro", "samples", "weighted"]
    percent = [0.3, 0.5, 0.9]
    for average in averages:
        for p in percent:
            plt.figure()
            y_value_dw = [
                dw_res[k][p][0][average] for k in sorted(dw_res.keys())
            ]
            # cf and cfi hold a single embedding each; repeat their score so
            # the flat baseline spans the same x-axis as the deepwalk curve
            y_value_cf = [
                cf_res['cf'][p][0][average] for _ in sorted(dw_res.keys())
            ]
            y_value_cfi = [
                cfi_res['cfi'][p][0][average] for _ in sorted(dw_res.keys())
            ]
            plt.plot(y_value_dw, 'bo-')
            plt.plot(y_value_cf, 'ro-')
            plt.plot(y_value_cfi, 'go-')
            plt.grid(True)
            plt.xlabel('number of walks at 10, 20, 50, 100')
            plt.ylabel('score')
            plt.title('percentage: %f, metric: %s' % (p, average))
            plt.savefig("p%.1f_%s.png" % (p, average))
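

# label2onehot is used by score() above but not defined in this fragment; a
# minimal sketch (an assumption, not the original) that turns per-sample
# label-index lists into a binary indicator matrix:
def label2onehot(label_lists, num_labels):
    onehot = numpy.zeros((len(label_lists), num_labels), dtype=int)
    for row, labels in enumerate(label_lists):
        for label in labels:
            onehot[row, label] = 1
    return onehot
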
def train_classifiers(dataset,
                      dataset_fp,
                      subset_size,
                      n_grams,
                      seeds,
                      test_size):

    overall_start_time = time.time()

    if dataset == 'bnc_rb':
        # Read raw data
        raw_data = pd.read_csv('data/bnc/bnc_subset_19_29_vs_50_plus_nfiles_0_rand_balanced.csv')

        # preprocess data
        data = preprocess_df(df=raw_data, data='bnc_rb')

        # change column names so everything works later
        data.rename(columns={"clean_text": "clean_data",
                             "age_cat": "labels"}, inplace=True)
    elif dataset == 'bnc':
        raw_data = pd.read_csv('data/bnc/bnc_subset_19_29_vs_50_plus_nfiles_0.csv')

        # preprocess data
        data = preprocess_df(df=raw_data, data='bnc')

        # change column names so everything works later
        data.rename(columns={"clean_text": "clean_data",
                             "age_cat": "labels"}, inplace=True)
    elif dataset == 'blog':

        raw_data = pd.read_csv('data/blogs_kaggle/blogtext.csv')

        # preprocess data
        data = preprocess_df(df=raw_data, data='blog')

        # change column names so everything works later
        data.rename(columns={"clean_text": "clean_data",
                             "age_cat": "labels"}, inplace=True)

        # preproc_file = Path("./data/blogs_kaggle/blogger_preprocessed_data_FAKE.csv")

        # # Pre-process raw data if pre-processed data doesn't exist
        # try:
        #     preproc_abs_path = preproc_file.resolve(strict=True)
        # except FileNotFoundError:
        #     # doesn't exist
        #
        #     # Read and load dataset
        #     print("Reading raw data...")
        #     data = pd.read_csv("./data/blogs_kaggle/blogtext.csv")
        #     print("Done reading raw data.")
        #
        #
        #     # Subsetting data
        #     # perc_df = 0.00020 # fraction of dataset to take
        #     # sub_sample = math.ceil(perc_df * data.shape[0])
        #
        #     if subset_size != -1:
        #         # Chosen to train and test model(s) on subset of size subset_size
        #
        #         #shuffle data set before subsampling
        #         data = data.sample(frac=1).reset_index(drop=True)
        #         data = data[:subset_size]
        #
        #     print(f"Dataset size before preprocessing: {data.shape[0]}")
        #
        #     print("Preprocessing data...")
        #     # Removing all unwanted text/characters from data['text'] column
        #     # Remove all non-alphabetical characters
        #     data['clean_data'] = data['text'].apply(lambda x: re.sub(r'[^A-Za-z]+',' ', x))
        #
        #     # Make all letters lower case
        #     data['clean_data'] = data['clean_data'].apply(lambda x: x.lower())
        #
        #     # Remove white space from beginning and end of string
        #     data['clean_data'] = data['clean_data'].apply(lambda x: x.strip())
        #
        #     # Remove instances empty strings
        #     before_rm_empty = len(data)
        #     data.drop(data[data.clean_data == ''].index, inplace = True)
        #
        #     print(f'{before_rm_empty - len(data)} empty string instances removed.')
        #
        #     # Remove texts that are probably not English by filtering blogs that dont contain at least one of the top 50 most used English words
        #     # create dict with most common English words
        #     top_en_words = {}
        #     with open('./data/wordlists/top1000english.txt') as f:
        #         count = 1
        #         for line in f:
        #             key = line.split()[0].lower()
        #             top_en_words[key] = count
        #             count += 1
        #
        #             # Stop at top 50 words. Idea taken from DialoGPT paper.
        #             if count > 50:
        #                 break
        #
        #
        #     data['top_50_en'] = data['clean_data'].apply(lambda x : True if not set(x.split()).isdisjoint(top_en_words) else False)
        #
        #     def top_lang_detect(text):
        #
        #         detected_langs = detect_langs(text)
        #
        #         return detected_langs[0].lang
        #
        #
        #     def top_prob_detect(text):
        #
        #         detected_langs = detect_langs(text)
        #
        #         return detected_langs[0].prob
        #
        #     start_time = time.time()
        #     data['top_lang'] = data['clean_data'].apply(top_lang_detect)
        #     print(f"Top lang detection took {time.time() - start_time} seconds")
        #     start_time = time.time()
        #     data['top_prob'] = data['clean_data'].apply(top_prob_detect)
        #     print(f"Top lang prob lang detection took {time.time() - start_time} seconds")
        #
        #     # Remove rows without one of top50 most common english words
        #     before_top50_removal = len(data)
        #     data.drop(data[data['top_50_en'] == False].index, inplace = True)
        #     print(f"{before_top50_removal - len(data)} instances dropped")
        #
        #     before_top_lang = len(data)
        #     data.drop(data[data['top_lang'] != 'en'].index, inplace = True)
        #     print(f'{before_top_lang - len(data)} instances dropped.')
        #
        #     before_top_prob = len(data)
        #     data.drop(data[data['top_prob'] < 0.9].index, inplace = True)
        #     print(f'{before_top_prob - len(data)} instances dropped.')
        #
        #     # Remove stop words
        #     stopwords = set(nltk.corpus.stopwords.words('english')) # use set (hash table) data structure for faster lookup
        #
        #     # also add urllink and nbsp to set of words to remove
        #     stopwords.update(['urllink', 'nbsp'])
        #
        #     data['clean_data'] = data['clean_data'].apply(lambda x: ' '.join([words for words in x.split() if words not in stopwords]))
        #
        #     print("Done preprocessing data.")
        #
        #     print("Saving preprocessed dataframe to csv...")
        #     # save pre-processed dataframe to csv
        #     data.to_csv("./data/blogs_kaggle/blogger_preprocessed_data.csv")
        #
        # else:
        #     # exists
        #     # Read and load dataset
        #     print("Reading preprocessed data...")
        #     data = pd.read_csv("./data/blogs_kaggle/blogger_preprocessed_data.csv")
        #     print("Done reading preprocessed data.")
        #     # data = data[['clean_data', 'labels']]
        #
        # print(f"Dataset size after preprocessing: {data.shape[0]}")
        #
        # # Drop columns that are uninformative for writing style (i.e., ID and date)
        # data.drop(['id', 'date'], axis = 1, inplace = True)
        #
        # # Add labels for age categories
        # def age_to_cat(age):
        #     '''Returns age category label for given age number.'''
        #
        #     if 13 <= int(age) <= 17:
        #         return '13-17'
        #     elif 23 <= int(age) <= 27:
        #         return '23-27'
        #     elif 33 <= int(age):
        #         return '33-47'
        #     else:
        #         print(int(age))
        #         raise ValueError("Given age not in one of pre-defined age groups.")
        #
        #
        # data['age_cat'] = data['age'].apply(age_to_cat)
        #
        # # Merge all possibly interesting labels into one column
        # data['labels'] = data.apply(lambda col: [col['gender'], str(col['age']), col['topic'], col['sign']], axis = 1)
        #
        # # Only keep age as label
        # # data['labels'] = data.apply(lambda col: [str(col['age'])], axis = 1) # TODO: Why keep age as string?
        # # data['labels'] = data.apply(lambda col: [col['age']], axis = 1)
        # data['labels'] = data.apply(lambda col: [col['age_cat']], axis = 1)
        #
        # # Reduce dataframe to only contain cleaned blogs and list of labels
        # data = data[['clean_data', 'labels']]

    # results dict
    accs_all = {}
    if dataset == 'blog':
        class_labels_list = ['13-17', '23-27', '33-47']
    elif dataset == 'bnc' or dataset == 'bnc_rb':
        class_labels_list = ['19_29', '50_plus']

    # Evaluate performance
    def print_evaluation_scores(labels, preds):
        print(f"Accuracy: {accuracy_score(labels, preds)}")
        print(f"F1 score: {f1_score(labels, preds, average = None)}") # outputs F1 per class
        print(f"Average precision: {average_precision_score(labels, preds, average = 'micro')}")
        print(f"Average recall: {recall_score(labels, preds, average = 'micro')}")
        print(classification_report(labels, preds, digits=5, zero_division=0))
        # print(f"Confusion Matrix: {confusion_matrix(labels.argmax(axis=1), preds.argmax(axis=1))}")


    # def print_top_n(vectorizer, clf, class_labels, n_feat = 10):
    #     """Prints features with the highest coefficient values, per class"""
    #     feature_names = vectorizer.get_feature_names()
    #     for i, class_label in enumerate(class_labels):
    #         topn = np.argsort(clf.estimators_[i].coef_)[0][-n_feat:]
    #         print("%s: %s" % (class_label,
    #               " ".join(feature_names[j] for j in topn)))

    # spacy english tokenizer
    # spacy_eng = spacy.load("en_core_web_sm")

    # def tokenizer_eng(text):
    #     text = str(text)
    #     return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
    #
    # token_counter = Counter()
    # for sentence in data.clean_data:
    #     for word in tokenizer_eng(sentence):
    #         token_counter.update([word])
    #
    # min_thresh = 3000
    # trunc_counter = {x: count for x, count in token_counter.items() if count >= min_thresh}

    # TODO: FIX REVERSE ORDERING BUG. SEE NOTEBOOK FOR RPA
    # def print_top_n_thresh(vectorizer, clf, class_labels, n_feat = 100,
    #                        counter = trunc_counter):
    #     """Prints features with the highest coefficient values, per class"""
    #     feature_names = vectorizer.get_feature_names()
    #     for i, class_label in enumerate(class_labels):
    #         topn = np.argsort(clf.estimators_[i].coef_)[0][-n_feat:]
    #         print("%s: %s" % (class_label,
    #               " ".join(feature_names[j] for j in topn if feature_names[j] in counter)))

    # def print_top_n_thresh(vectorizer, clf, class_labels, n_feat = 100,
    #                    counter = trunc_counter):
    #     """Prints features with the highest coefficient values, per class"""
    #     feature_names = vectorizer.get_feature_names()
    #     for i, class_label in enumerate(class_labels):
    #         topn = np.argsort(clf.estimators_[i].coef_)[0][-n_feat:]
    #         topn = topn[::-1]  # Reverse order of arg s.t. features with high coefficients appear first
    #         print("%s: %s" % (class_label,
    #               " ".join(feature_names[j] for j in topn if feature_names[j] in counter)))
    #
    # def most_informative_feature_for_class(vectorizer, classifier, class_labels, n=10):
    #     #labelid = list(classifier.classes_).index(classlabel)
    #     feature_names = vectorizer.get_feature_names()
    #     for i, class_label in enumerate(class_labels):
    #         topn = sorted(zip(classifier.estimators_[i].coef_[0], feature_names))[-n:]
    #
    #         for coef, feat in topn:
    #             print(class_label, feat, coef)

    test_accs = {}
    test_f1s = {}

    for n_gram in n_grams:
        test_accs[n_gram] = {}
        test_f1s[n_gram] = {}
        # for class_label in class_labels_list:
        #     test_f1s[n_gram][class_label] = {}



    print("Starting training and testing loops...")
    for seed in tqdm(seeds, desc = "Seed loop."):

        # set seed for reproducibility
        np.random.seed(seed)

        # shuffle dataframe
        data = data.sample(frac=1).reset_index(drop=True)


        for n in tqdm(n_grams, desc = "n gram loop."):

            # Split data into features/ X and labels / Y
            X = data['clean_data']
            Y = data['labels']

            # n-gram model
            vectorizer = CountVectorizer(binary = True, ngram_range = (1, n))

            # fit model
            X = vectorizer.fit_transform(X)

            # # check out a sample of the uni- and bigrams
            # print(vectorizer.get_feature_names()[:10])

            # Get label counts
            label_counts = {}
            if dataset == 'blog':
                # for labels in data.labels.values:
                #     for label in labels:
                #         if label in label_counts:
                #             label_counts[label] += 1
                #         else:
                #             label_counts[label] = 1
                for label in data.labels.values:
                    if label in label_counts:
                        label_counts[label] += 1
                    else:
                        label_counts[label] = 1
            elif dataset == 'bnc_rb' or dataset == 'bnc':
                for label in data.labels.values:
                    if label in label_counts:
                        label_counts[label] += 1
                    else:
                        label_counts[label] = 1

            # Binarize the labels for prediction
            if dataset == 'blog':
                # binarizer = MultiLabelBinarizer(classes = sorted(label_counts.keys()))
                binarizer = LabelBinarizer()
            elif dataset == 'bnc_rb' or dataset == 'bnc':
                binarizer = LabelBinarizer()

            Y = binarizer.fit_transform(data.labels)


            # Split data into train and test sets
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size)

            # if n == 1:
            #     # save splits and vectorizer
            #     save_file_splits_vzer = f"splits_vzer_{n}_gram_seed_{seed}"
            #     pickle.dump((vectorizer, X_train, X_test, Y_train, Y_test),
            #                 open(save_file_splits_vzer, 'wb'))


            # Fit logistic regression model
            start_time = time.time()
            model = LogisticRegression(solver = 'lbfgs', multi_class='ovr', max_iter = 1000000)
            model = OneVsRestClassifier(model)
            # model = MultiOutputClassifier(model)
            model.fit(X_train, Y_train)
            print(f"Fitting model took {time.time() - start_time} seconds.")

            # save the classifier
            # save_file_name = f"logit_{n}_gram_seed_{seed}"
            # pickle.dump(model, open(save_file_name, 'wb'))

            # make predictions on test set
            Y_pred = model.predict(X_test)

            Y_pred_inversed = binarizer.inverse_transform(Y_pred)
            Y_test_inversed = binarizer.inverse_transform(Y_test)

            print("=" * 81)

            print(f"n = {n}")
            print(f"seed = {seed}")
            print_evaluation_scores(Y_test, Y_pred)

            test_accs[n][seed] = accuracy_score(Y_test, Y_pred)
            test_f1s[n][seed] = f1_score(Y_test, Y_pred, average=None)

            # for label_idx in range(len(class_labels_list)):
            #     test_f1s[n][class_labels_list[label_idx]][seed] = f1_score(Y_test, Y_pred, average=None)[label_idx]

            if n in accs_all:
                accs_all[n].append(accuracy_score(Y_test, Y_pred))
            else:
                accs_all[n] = [accuracy_score(Y_test, Y_pred)]

            # Print most informative features
            # if n == 1:
            #     print("Most informative features per age-group.")
            #     print_top_n_thresh(vectorizer = vectorizer, clf = model,
            #                 class_labels = class_labels_list, n_feat = 20)

            print("-" * 81)
    #         print("Some failure cases.")
    # #         predictions = model.predict(inputs)
    #         for i, (x, pred, label) in enumerate(zip(X_test, Y_pred, Y_test)):
    #             if (pred != label).any():
    #                 print(f"pred: {pred}")
    #                 print(f"label: {label}")
    #                 pred_cat = binarizer.classes_[np.where(pred == 1)[0][0]]
    #                 label_cat = binarizer.classes_[np.where(label == 1)[0][0]]
    #                 print(data['clean_data'][i], 'has been classified as ', pred_cat, 'and should be ', label_cat)

            print("=" * 81)

            # UNCOMMENT FOLLOWING LINES FOR CM PLOTS
            # int_labels = [label for label in range(len(class_labels_list))]
            # cm = confusion_matrix(Y_test, Y_pred, labels=int_labels)
            # make_confusion_matrix(cf=cm, categories=class_labels_list, title=f'Confusion Matrix for {dataset} on Test set',
            #                       num_labels=int_labels, y_true=Y_test, y_pred=Y_pred, figsize=FIGSIZE)
            # cur_datetime = datetime.now().strftime('%d_%b_%Y_%H_%M_%S')
            # plt.savefig(f"{FIGDIR}{dataset}/cm_{n}_gram_{dataset}_dt_{cur_datetime}.png",
            #             bbox_inches='tight')

    #         most_informative_feature_for_class(vectorizer = vectorizer, classifier = model, class_labels = class_labels_list, n=10)

    # def plot_accuracies(accs, show = False):
    #
    #     means = [np.mean(accs[n]) for n in range(1, len(accs) + 1)]
    #     # print(np.mean(means))
    #     stds = [np.std(accs[n]) for n in range(1, len(accs) + 1)]
    #
    #     x_pos = np.arange(len(accs))
    #     x_labels = list(accs.keys())
    #
    #     # Build the plot
    #     fig, ax = plt.subplots()
    #     ax.bar(x_pos, means, yerr=stds, align='center', alpha=0.5, ecolor='black', capsize=10)
    #     ax.set_ylabel('Mean classification accuracy.')
    #     ax.set_xlabel("$n$")
    #     ax.set_xticks(x_pos)
    #     ax.set_xticklabels(x_labels)
    #     ax.set_title('Age group prediction accuracy for various n-gram models.')
    #     ax.yaxis.grid(True)
    #
    #     # Save the figure and show
    #     plt.tight_layout()
    #     plt.savefig('figures/bar_plot_with_error_bars_10000.png')
    #
    #     if show:
    #         plt.show()

    # plot_accuracies(accs_all)

    # print average metrics
    print(89*'-')
    print(89 * '-')
    print("PRINTING AVERAGE METRICS")
    for n_gram in n_grams:
        n_gram_accs = []
        n_gram_f1s = []
        for seed in seeds:
            n_gram_accs.append(test_accs[n_gram][seed])
            n_gram_f1s.append(test_f1s[n_gram][seed])

        print(f"| n = {n_gram} | Average accuracy = {np.mean(n_gram_accs)} | Acc std = {np.std(n_gram_accs)} "
              f"| Average f1s = {np.mean(n_gram_f1s, axis=0)} | F1s std = {np.std(n_gram_f1s, axis=0)} |")

    overall_end_time = time.time()


    print(f"Done with everything. Took {overall_end_time - overall_start_time} seconds.")
class ClassifierLinearSVM:
    def __init__(self, task, cv=3):
        self.cv = cv
        self.model = None
        self.calibrated_model = None
        # name of the property
        self.task = task
        self.config = helpers.load_yaml("src/config.yml")

    def train(self, X_train, y_train):
        if self.task.label_task == "single-label":
            self.model = LinearSVC(dual=True, max_iter=3000)
        elif self.task.label_task == "multi-label":
            self.model = OneVsRestClassifier(
                LinearSVC(dual=True, max_iter=3000))

        self.model.fit(X_train, y_train)
        self.calibrated_model = CalibratedClassifierCV(
            base_estimator=self.model, cv="prefit")
        self.calibrated_model.fit(X_train, y_train)
        return self.calibrated_model

    @staticmethod
    def _linear_scale_confidence(confidences):
        """
        return the ratio of prob according to the sum of top n probabilities for the predicted intents.
        if probs = [p1, p2, p3] then the return probabilities will be scaled as
        [p1/sum(p1,p2,p3), p2/sum(p1,p2,p3), p3/sum(p1,p2,p3)]
        Args:
            confidences: probabilities of intents
        Returns:
            numpy array: the scaled confidences
        """
        s = np.sum(confidences)
        return confidences / s
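
    # Worked example (illustrative numbers): confidences [0.2, 0.1, 0.1]
    # scale to [0.5, 0.25, 0.25], since 0.2 / 0.4 = 0.5 and 0.1 / 0.4 = 0.25.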

    def predict_utt_top_n(self, featurized_utt, n=3):
        """
        predict the topn predictions along with the confidence probability for each one.
        Note that model.classes_ contains the trained labels in alphabetical order. Here, we sort the
        confidences together with the labels, and return the top3 from this sorted order
        Args:
            featurized_utt (str): featurized and tokenized single utterance
        Returns:
            One list of strings and one list of floats
        """
        raw_confidences = self.calibrated_model.predict_proba(
            featurized_utt)[0]
        # indices of sorted confidences from high to low confidence
        sorted_conf_idx = np.argsort(raw_confidences)[::-1][:n]
        labels = np.take(self.calibrated_model.classes_, sorted_conf_idx)
        confidences = np.take(raw_confidences, sorted_conf_idx)
        # scaled_confidences = self._linear_scale_confidence(confidences)
        scaled_confidences = confidences

        return labels, scaled_confidences

    def predict_batch_top_n(self, X_test, topn=5):
        """
        predict the topn predictions for the whole batch.
        Returns a list of tuples, where each tuple is a list
        """
        return [
            self.predict_utt_top_n(test.reshape(1, -1), n=topn)
            for test in X_test
        ]

    def get_pred_and_accuracy(self, X_test, y_test, topn=5):
        """
        Returns predictions and accuracy for the test set
        """
        predictions = [
            self.predict_utt_top_n(test.reshape(1, -1), n=topn)
            for test in X_test
        ]
        num_correct = 0
        for test, pred in zip(y_test, predictions):
            topn_list = pred[0]
            if test in topn_list:
                num_correct += 1
        return predictions, num_correct / len(y_test)

    def load(self):
        self.calibrated_model = helpers.load_model_from_dir(
            self.config["models_dir"], self.task.classifier_name)

    def export(self):
        helpers.save_model_to_dir(self.config["models_dir"],
                                  self.task.classifier_name,
                                  self.calibrated_model)
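
# A self-contained sketch (not part of the class above) of the calibration
# pattern it uses: fit a LinearSVC, then wrap it with cv="prefit" to obtain
# predict_proba. Synthetic data; base_estimator is the older sklearn
# parameter name used above (newer releases call it `estimator`).
import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X_demo, y_demo = make_classification(n_samples=200, n_features=20, random_state=0)
svc_demo = LinearSVC(dual=True, max_iter=3000).fit(X_demo, y_demo)
calibrated_demo = CalibratedClassifierCV(base_estimator=svc_demo, cv="prefit").fit(X_demo, y_demo)
print(calibrated_demo.predict_proba(X_demo[:3]))  # per-class probabilities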
Exemple #44
0
for filename in elsa_filenames:
    #read the images
    image = imread(filename)
    # resize to a fixed size
    image = resize(image, (200,200))
    # extract HOG features
    hog_features = hog(image, orientations=12, pixels_per_cell=(16, 16), cells_per_block=(1, 1))
    data.append(hog_features)
    labels.append(1)
print('Finished adding Elsa samples to dataset')

for filename in eric_filenames:
    #read the images
    image = imread(filename)
    # resize to a fixed size
    image = resize(image, (200,200))
    # extract HOG features
    hog_features = hog(image, orientations=12, pixels_per_cell=(16, 16), cells_per_block=(1, 1))
    data.append(hog_features)
    labels.append(2)
print('Finished adding Eric samples to dataset')


print('Training the SVM')
#create the SVC
clf = OneVsRestClassifier(SVC(kernel="linear", probability=True))
# train the SVM
clf.fit(data, labels)
# pickle it - save the trained detector to a file
pickle.dump(clf, open("signature.detector", "wb"))
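
# A usage sketch for the saved detector ("new_sample.png" is a hypothetical
# path; imread/resize/hog are assumed to come from scikit-image, and the
# image is read as grayscale, mirroring the training loop above):
import pickle
from skimage.io import imread
from skimage.transform import resize
from skimage.feature import hog

detector = pickle.load(open("signature.detector", "rb"))
img = resize(imread("new_sample.png", as_gray=True), (200, 200))
feat = hog(img, orientations=12, pixels_per_cell=(16, 16), cells_per_block=(1, 1))
print(detector.predict([feat]))        # 1 = Elsa, 2 = Eric
print(detector.predict_proba([feat]))  # available because probability=True above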
vectorizer = TfidfVectorizer(max_features=200000,
                             smooth_idf=True,
                             norm="l2",
                             tokenizer=lambda x: x.split(),
                             sublinear_tf=False,
                             ngram_range=(1, 3))
x_train_multilabel = vectorizer.fit_transform(x_train['question'])
x_test_multilabel = vectorizer.transform(x_test['question'])

print("Dimensions of train data X: ", x_train_multilabel.shape, "Y: ",
      y_train.shape)
print("Dimensions of test data X: ", x_test_multilabel.shape, "Y: ",
      y_test.shape)

classifier = OneVsRestClassifier(SGDClassifier(loss='log',
                                               alpha=0.00001,
                                               penalty='l1'),
                                 n_jobs=-1)
classifier.fit(x_train_multilabel, y_train)
predictions = classifier.predict(x_test_multilabel)

print("Accuracy:", metrics.accuracy_score(y_test, predictions))
print("Macro f1 score:", metrics.f1_score(y_test, predictions,
                                          average='macro'))
print("Micro f1 score:", metrics.f1_score(y_test, predictions,
                                          average='micro'))
print("Hamming loss:", metrics.hamming_loss(y_test, predictions))
print("Precision recall report: \n",
      metrics.classification_report(y_test, predictions))

# Dumping model
joblib.dump(classifier, './model/equal_weight_model.pkl')
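
# The dumped model can later be reloaded for inference (a sketch):
# classifier = joblib.load('./model/equal_weight_model.pkl')
# predictions = classifier.predict(x_test_multilabel)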
Exemple #46
0
def oneVrest(x, y, test):
    predict = OneVsRestClassifier(SVC(kernel='linear')).fit(x, y).predict(test)
    return predict
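
# A minimal usage sketch for oneVrest with synthetic data (illustrative
# names; assumes the OneVsRestClassifier/SVC imports used by the function):
from sklearn.datasets import make_blobs
X_demo, y_demo = make_blobs(centers=3, random_state=0)
print(oneVrest(X_demo[:80], y_demo[:80], X_demo[80:]))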
def test_pairwise_n_features_in():
    """Check the n_features_in_ attributes of the meta and base estimators

    When the training data is a regular design matrix, everything is intuitive.
    However, when the training data is a precomputed kernel matrix, the
    multiclass strategy can resample the kernel matrix of the underlying base
    estimator both row-wise and column-wise and this has a non-trivial impact
    on the expected value for the n_features_in_ of both the meta and the base
    estimators.
    """
    X, y = iris.data, iris.target

    # Remove the last sample to make the classes not exactly balanced and make
    # the test more interesting.
    assert y[-1] == 0
    X = X[:-1]
    y = y[:-1]

    # Fitting directly on the design matrix:
    assert X.shape == (149, 4)

    clf_notprecomputed = svm.SVC(kernel="linear").fit(X, y)
    assert clf_notprecomputed.n_features_in_ == 4

    ovr_notprecomputed = OneVsRestClassifier(clf_notprecomputed).fit(X, y)
    assert ovr_notprecomputed.n_features_in_ == 4
    for est in ovr_notprecomputed.estimators_:
        assert est.n_features_in_ == 4

    ovo_notprecomputed = OneVsOneClassifier(clf_notprecomputed).fit(X, y)
    assert ovo_notprecomputed.n_features_in_ == 4
    assert ovo_notprecomputed.n_classes_ == 3
    assert len(ovo_notprecomputed.estimators_) == 3
    for est in ovo_notprecomputed.estimators_:
        assert est.n_features_in_ == 4

    # When working with precomputed kernels we have one "feature" per training
    # sample:
    K = X @ X.T
    assert K.shape == (149, 149)

    clf_precomputed = svm.SVC(kernel="precomputed").fit(K, y)
    assert clf_precomputed.n_features_in_ == 149

    ovr_precomputed = OneVsRestClassifier(clf_precomputed).fit(K, y)
    assert ovr_precomputed.n_features_in_ == 149
    assert ovr_precomputed.n_classes_ == 3
    assert len(ovr_precomputed.estimators_) == 3
    for est in ovr_precomputed.estimators_:
        assert est.n_features_in_ == 149

    # This becomes really interesting with OvO and precomputed kernel together:
    # internally, OvO will drop the samples of the classes not part of the pair
    # of classes under consideration for a given binary classifier. Since we
    # use a precomputed kernel, it will also drop the matching columns of the
    # kernel matrix, and therefore we have fewer "features" as result.
    #
    # Since class 0 has 49 samples, and class 1 and 2 have 50 samples each, a
    # single OvO binary classifier works with a sub-kernel matrix of shape
    # either (99, 99) or (100, 100).
    ovo_precomputed = OneVsOneClassifier(clf_precomputed).fit(K, y)
    assert ovo_precomputed.n_features_in_ == 149
    assert ovo_precomputed.n_classes_ == 3
    assert len(ovo_precomputed.estimators_) == 3
    assert ovo_precomputed.estimators_[
        0].n_features_in_ == 99  # class 0 vs class 1
    assert ovo_precomputed.estimators_[
        1].n_features_in_ == 99  # class 0 vs class 2
    assert ovo_precomputed.estimators_[
        2].n_features_in_ == 100  # class 1 vs class 2
Exemple #48
0
def generate_tags(article_id):
    try:
        import pandas as pd
        import numpy as np
        import itertools

        from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
        from sklearn.preprocessing import MultiLabelBinarizer
        from sklearn.multiclass import OneVsRestClassifier
        from sklearn.svm import SVC

        from website.models import Article
        a = Article.objects.get(id=article_id)

        # import article's comments into dataframe
        df = pd.DataFrame(list(a.comment_set.all().values(
            'id', 'article', 'disqus_id',
            'text', 'summary',
            'tags', 'suggested_tags')))
        # merge all text (comments+summaries) into a new column
        df['train_text'] = df[['text', 'summary']].apply(lambda x: ' '.join(x), axis=1)
        print(df['train_text'])

        # define  classifier
        clf = OneVsRestClassifier(SVC(kernel='linear'))

        # train data: use only comments with tags
        tagged = df.loc[df['tags'].notnull()]

        # train data: preproccess and vectorize (TfIdf) text data
        count_vect = CountVectorizer(
            stop_words='english',
            min_df=3, max_df=0.30,
            #lowercase=True,
            ngram_range=(1, 2),
        )
        X_train_counts = count_vect.fit_transform(list(tagged.train_text))
        tfidf_transformer = TfidfTransformer().fit(X_train_counts)
        X_train_tfidf = tfidf_transformer.transform(X_train_counts)
        # train classifier
        clf = clf.fit(X_train_tfidf, tagged.tags)

        # suggest tags for ALL instances in df
        test_df = df.drop_duplicates(subset=['disqus_id'])
        X_test_counts = count_vect.transform(list(test_df.train_text))
        X_test_tfidf = tfidf_transformer.transform(X_test_counts)
        suggested = clf.predict(X_test_tfidf)
        # save suggested tags to the dataframe
        test_df.suggested_tags = suggested

        # add suggested tags to the database
        sorted_df = test_df.sort_values('disqus_id')
        comments = a.comment_set.all().order_by('disqus_id')

        for comment in comments:
            comment.suggested_tags.clear()

        for row_item, comment in zip(sorted_df.iterrows(), comments):
            index, row = row_item
            if row['suggested_tags']:
                if not comment.tags.filter(id=row['suggested_tags']).exists():
                    comment.suggested_tags.add(row['suggested_tags'])

    except Exception as e:
        print(e)
def test_ovr_fit_predict_svc():
    ovr = OneVsRestClassifier(svm.SVC())
    ovr.fit(iris.data, iris.target)
    assert len(ovr.estimators_) == 3
    assert ovr.score(iris.data, iris.target) > 0.9
def test_ovr_partial_fit():
    # Test if partial_fit is working as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)

    assert_almost_equal(pred, pred2)
    assert len(ovr.estimators_) == len(np.unique(y))
    assert np.mean(y == pred) > 0.65

    # Test when mini batches doesn't have all classes
    # with SGDClassifier
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]

    ovr = OneVsRestClassifier(
        SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0))
    ovr.partial_fit(X[:7], y[:7], np.unique(y))
    ovr.partial_fit(X[7:], y[7:])
    pred = ovr.predict(X)
    ovr1 = OneVsRestClassifier(
        SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0))
    pred1 = ovr1.fit(X, y).predict(X)
    assert np.mean(pred == y) == np.mean(pred1 == y)

    # test partial_fit only exists if estimator has it:
    ovr = OneVsRestClassifier(SVC())
    assert not hasattr(ovr, "partial_fit")
def test_ovr_fit_predict_sparse():
    for sparse in [
            sp.csr_matrix,
            sp.csc_matrix,
            sp.coo_matrix,
            sp.dok_matrix,
            sp.lil_matrix,
    ]:
        base_clf = MultinomialNB(alpha=1)

        X, Y = datasets.make_multilabel_classification(
            n_samples=100,
            n_features=20,
            n_classes=5,
            n_labels=3,
            length=50,
            allow_unlabeled=True,
            random_state=0,
        )

        X_train, Y_train = X[:80], Y[:80]
        X_test = X[80:]

        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        clf_sprs = OneVsRestClassifier(base_clf).fit(X_train, sparse(Y_train))
        Y_pred_sprs = clf_sprs.predict(X_test)

        assert clf.multilabel_
        assert sp.issparse(Y_pred_sprs)
        assert_array_equal(Y_pred_sprs.toarray(), Y_pred)

        # Test predict_proba
        Y_proba = clf_sprs.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        pred = Y_proba > 0.5
        assert_array_equal(pred, Y_pred_sprs.toarray())

        # Test decision_function
        clf = svm.SVC()
        clf_sprs = OneVsRestClassifier(clf).fit(X_train, sparse(Y_train))
        dec_pred = (clf_sprs.decision_function(X_test) > 0).astype(int)
        assert_array_equal(dec_pred, clf_sprs.predict(X_test).toarray())
def test_ovr_multilabel_predict_proba():
    base_clf = MultinomialNB(alpha=1)
    for au in (False, True):
        X, Y = datasets.make_multilabel_classification(
            n_samples=100,
            n_features=20,
            n_classes=5,
            n_labels=3,
            length=50,
            allow_unlabeled=au,
            random_state=0,
        )
        X_train, Y_train = X[:80], Y[:80]
        X_test = X[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

        # Decision function only estimator.
        decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
        assert not hasattr(decision_only, "predict_proba")

        # Estimator with predict_proba disabled, depending on parameters.
        decision_only = OneVsRestClassifier(svm.SVC(probability=False))
        assert not hasattr(decision_only, "predict_proba")
        decision_only.fit(X_train, Y_train)
        assert not hasattr(decision_only, "predict_proba")
        assert hasattr(decision_only, "decision_function")

        # Estimator which can get predict_proba enabled after fitting
        gs = GridSearchCV(svm.SVC(probability=False),
                          param_grid={"probability": [True]})
        proba_after_fit = OneVsRestClassifier(gs)
        assert not hasattr(proba_after_fit, "predict_proba")
        proba_after_fit.fit(X_train, Y_train)
        assert hasattr(proba_after_fit, "predict_proba")

        Y_pred = clf.predict(X_test)
        Y_proba = clf.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        pred = Y_proba > 0.5
        assert_array_equal(pred, Y_pred)
Exemple #53
0
             'verb_count', 'adjective_count', 'tot_pos_words_count', 
             'tot_neg_words_count', 'tot_neu_words_count', 'user_avg_stars', 'user_yelping_since', 'user_review_count']
)

df = df[df.review_stars != 3]

X = df.values[:, 1:]
y = df.values[:, 0]

# Binarize the output
y = label_binarize(y, classes=[1, 2, 4, 5])
n_classes = y.shape[1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

classifier = OneVsRestClassifier(GradientBoostingClassifier(loss='deviance', max_features='auto', n_estimators=110, random_state=3))

y_score = classifier.fit(X_train, y_train).predict_proba(X_test) #.decision_function(X_test)

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
colors = ['blue', 'red', 'green', 'yellow']
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--')
def test_ovr_always_present():
    # Test that ovr works with classes that are always present or absent.
    # Note: this tests the case where _ConstantPredictor is utilised
    X = np.ones((10, 2))
    X[:5, :] = 0

    # Build an indicator matrix where two features are always on.
    # As list of lists, it would be: [[int(i >= 5), 2, 3] for i in range(10)]
    y = np.zeros((10, 3))
    y[5:, 0] = 1
    y[:, 1] = 1
    y[:, 2] = 1

    ovr = OneVsRestClassifier(LogisticRegression())
    msg = r"Label .+ is present in all training examples"
    with pytest.warns(UserWarning, match=msg):
        ovr.fit(X, y)
    y_pred = ovr.predict(X)
    assert_array_equal(np.array(y_pred), np.array(y))
    y_pred = ovr.decision_function(X)
    assert np.unique(y_pred[:, -2:]) == 1
    y_pred = ovr.predict_proba(X)
    assert_array_equal(y_pred[:, -1], np.ones(X.shape[0]))

    # y has a constantly absent label
    y = np.zeros((10, 2))
    y[5:, 0] = 1  # variable label
    ovr = OneVsRestClassifier(LogisticRegression())

    msg = r"Label not 1 is present in all training examples"
    with pytest.warns(UserWarning, match=msg):
        ovr.fit(X, y)
    y_pred = ovr.predict_proba(X)
    assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0]))
Exemple #55
0
    X[count, :] = vec_one
    count += 1

lb = preprocessing.LabelBinarizer()
labels_get = list(labels.label)
labels_get = [[x] for x in labels_get]
y = MultiLabelBinarizer().fit_transform(labels_get)
list_mico = []
list_maco = []
items = [p / 10.0 for p in range(1, 10)]

for item in items:
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=item,
                                                        random_state=51)
    clf = LogisticRegression()  # e.g. C=1, penalty="l2", tol=0.01
    y_score = OneVsRestClassifier(clf).fit(X_train, y_train).predict(X_test)

    item_predict = []
    for row in y_score:
        if row.any():
            item_predict.append(row)
    all_zeros = not np.any(y_score)
    micro_f1 = f1_score(y_test, y_score, average='micro')
    macro_f2 = f1_score(y_test, y_score, average='macro')
    print micro_f1
    print macro_f2
    list_mico.append(micro_f1)
    list_maco.append(macro_f2)
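
# (a sketch, not in the original) the collected scores can be plotted
# against the held-out fraction:
# import matplotlib.pyplot as plt
# plt.plot(items, list_mico, label='micro F1')
# plt.plot(items, list_maco, label='macro F1')
# plt.xlabel('test_size'); plt.legend(); plt.show()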
print "time to train cosine k nearest neighbors: %.2f seconds\n" % (end - start)

'''
-----------------------------------------
'''
print "\n-----------------------------------------\n\n"

# Load the digits dataset
digits = load_digits()
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

#naive bayes on digits
start = time.clock()
clf = OneVsRestClassifier(MultinomialNB())
clf.fit(X_train,y_train)
accuracy = clf.score(X_test, y_test) * 100.0
end = time.clock()
print "naive bayes accuracy on digits (small dataset): %.2f%%" % accuracy
print "time to train naive bayes: %.2f seconds\n" % (end - start)

#logistic regression on digits
start = time.clock()
clf = LogisticRegression(penalty='l1', C=0.1)
clf.fit(X_train,y_train)
accuracy = clf.score(X_test, y_test) * 100.0
end = time.clock()
print "logistic regression accuracy on digits (small dataset): %.2f%%" % accuracy
print "time to train logistic regression: %.2f seconds\n" % (end - start)
class NodeTransformerLogit(Transformer):
    """
    we will get a list of blocks belonging to N classes.
    we train a logit classifier for those classes, as well as a multilabel classifier for the neighor of those classes

    the built feature vector is 2*N long
    """
    dGridSearch_LR_conf = dGridSearch_CONF

    def __init__(self,
                 nbClass=None,
                 n_feat_node=1000,
                 t_ngrams_node=(2, 4),
                 b_node_lc=False,
                 n_jobs=1):
        """
        input: 
        - number of classes
        - number of ngram
        - ngram min/max size
        - lowercase or not
        - njobs when fitting the logit using grid search
        if n_feat_node is negative, or 0, or None, we use all possible ngrams
        """
        Transformer.__init__(self)

        self.nbClass = nbClass
        self.n_feat_node, self.t_ngrams_node, self.b_node_lc = n_feat_node, t_ngrams_node, b_node_lc
        self.n_jobs = n_jobs

        self.text_pipeline = None  # feature extractor
        self.mdl_main = None  # the main model predicting among the nbClass classes
        self.mdl_neighbor = None  # the neighborhood model predicting zero to many of the classes

    def fit(self, X, y=None):
        """
        This transformer needs the graphs to be fitted properly - see fitByGraph
        """
        return self

    def fitByGraph(self, lGraph, lAllNode=None):
        """
        we need to train 2 Logit: one to predict the node class, another to predict the class of the neighborhood
        """
        self.text_pipeline = Pipeline([
            ('selector', NodeTransformerTextEnclosed()),
            (
                'tf',
                TfidfVectorizer(
                    lowercase=self.b_node_lc
                    #, max_features=10000
                    ,
                    analyzer='char',
                    ngram_range=self.t_ngrams_node)
            )  # we can use it separately from the pipeline once fitted
            #                                        , ('word_selector'  , SelectKBest(chi2, k=self.n_feat_node))
        ])
        # build the y vector
        if lAllNode is None:
            lAllNode = [nd for g in lGraph for nd in g.lNode]
        y = np.array([nd.cls for nd in lAllNode], dtype=int)
        if self.nbClass != len(np.unique(y)):
            traceln("Classes seen are: %s" % np.unique(y).tolist())
            traceln(self.nbClass)
            raise ValueError(
                "ERROR: some class is not represented in the training set")

        #fitting the textual feature extractor
        self.text_pipeline.fit(lAllNode, y)

        #extracting textual features
        x = self.text_pipeline.transform(lAllNode)

        #creating and training the main logit model
        lr = LogisticRegression(class_weight='balanced')
        self.mdl_main = GridSearchCV(lr,
                                     self.dGridSearch_LR_conf,
                                     refit=True,
                                     n_jobs=self.n_jobs)
        self.mdl_main.fit(x, y)
        del y
        if DEBUG: print(self.mdl_main)

        #now fit a multiclass multilabel logit to predict if a node is neighbor with at least one node of a certain class, for each class
        #Shape = (nb_tot_nodes x nb_tot_labels)
        y = np.vstack([g.getNeighborClassMask()
                       for g in lGraph])  #we get this from the graph object.
        assert y.shape[0] == len(lAllNode)

        lr = LogisticRegression(class_weight='balanced')
        gslr = GridSearchCV(lr,
                            self.dGridSearch_LR_conf,
                            refit=True,
                            n_jobs=self.n_jobs)
        self.mdl_neighbor = OneVsRestClassifier(gslr, n_jobs=self.n_jobs)
        self.mdl_neighbor.fit(x, y)

        del x, y
        if DEBUG: print(self.mdl_neighbor)

        return self

    def transform(self, lNode):
        """
        return the 2 logit scores
        """
        a = np.zeros(
            (len(lNode), 3 * self.nbClass), dtype=np.float64
        )  # for each class: is_of_class? is_neighbor_of_class on same page or across pages?

        x = self.text_pipeline.transform(lNode)

        a[..., 0:self.nbClass] = self.mdl_main.predict_proba(x)
        a[..., self.nbClass:3 * self.nbClass] = self.mdl_neighbor.predict_proba(x)
        #         for i, nd in enumerate(lNode):
        #             print i, nd, a[i]
        if DEBUG: print(a)
        return a
Exemple #58
0
     "kernel; c/gamma; tol; jakosc zbioru trenujacego (1vs1); jakosc zbiory testowego (1vs1); jakosc zbioru trenujacego (1vsR); jakosc zbiory testowego (1vsR)\n"
 )
 X_train, y_train = getData(sciezka + train[i])
 X_test, y_test = getData(sciezka + test[i])
 for j in range(0, 4):
     gram = get_kernel(j, X_train, X_train, 10)
     gram_test = rbf_kernel(X_test, X_train, 10)
     print('Test gram calculated')
     #print('Test-set classification accuracy: ', clf.score(gram_test, y_test))
     #cpredicted = clf.predict(gram_test)
     ovo = OneVsOneClassifier(
         SVC(C=params[j], kernel='precomputed',
             tol=tols[j])).fit(gram, y_train)
     ovr = OneVsRestClassifier(
         SVC(C=params[j], kernel='precomputed',
             tol=tols[j])).fit(gram, y_train)
     print('Training-set classification accuracy: ',
           ovo.score(gram, y_train))
     print('Test-set classification accuracy: ',
           ovo.score(gram_test, y_test))
     f.write(kernels[j])
     f.write(';')
     f.write(str(params[j]))
     f.write(';')
     f.write(str(tols[j]))
     f.write(';')
     f.write(str(ovo.score(gram, y_train)))
     f.write(';')
     f.write(str(ovo.score(gram_test, y_test)))
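
# get_kernel is not shown in this fragment; a plausible dispatcher (an
# assumption) matching the call get_kernel(j, X, Y, gamma) and the four
# kernels iterated over:
from sklearn.metrics.pairwise import (linear_kernel, polynomial_kernel,
                                      rbf_kernel, sigmoid_kernel)

def get_kernel(j, X, Y, gamma):
    if j == 0:
        return linear_kernel(X, Y)
    elif j == 1:
        return polynomial_kernel(X, Y, gamma=gamma)
    elif j == 2:
        return rbf_kernel(X, Y, gamma=gamma)
    return sigmoid_kernel(X, Y, gamma=gamma)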
Exemple #59
0
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.externals import joblib
import numpy as np

if __name__ == '__main__':
    rechtspraak_train_text = np.load("train_data.npy")
    rechtspraak_train_labels = np.load("train_label.npy")
    rechtspraak_test_text = np.load("test_data.npy")
    rechtspraak_test_labels = np.load("test_label.npy")

    print("Training classifier")
    classif = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=11),
                                  n_jobs=-1)
    classif.fit(rechtspraak_train_text, rechtspraak_train_labels)
    score_acc = classif.score(rechtspraak_test_text, rechtspraak_test_labels)
    joblib.dump(classif, 'model_knn.pkl')
    print("Score: " + str(score_acc))
print("Accuracy of XGB =", accuracy_score(y_test,xgb_pred),"\n")
print("Classification of XGB\n\n",classification_report(y_test,xgb_pred),"\n")
print("Confusion matrix of XGB\n\n\n",confusion_matrix(y_test,xgb_pred))


# In[58]:


from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import  AdaBoostClassifier


# In[59]:


ada_model = OneVsRestClassifier(AdaBoostClassifier())


# In[60]:


ada_model.fit(x_train,y_train)


# In[61]:


ada_pred = ada_model.predict(x_test)


# In[64]: