Example No. 1
def compute_ranking(learnFullModel=False):
    path='/home/arya/PubMed/GEO/Datasets/'
    modelpath=path+'libsvm/model/'
    if not os.path.exists(modelpath):
        os.makedirs(modelpath)
    outpath='{}libsvm/out/'.format(path)
    sys.stdout=open('{}SVM.log'.format('/home/arya/PubMed/GEO/Log/'),'w')
    sys.stderr=open('{}SVM.err'.format('/home/arya/PubMed/GEO/Log/'),'w')
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    X, Y = load_svmlight_file(path+'Corpus.libsvm',multilabel=True)
    Y=np.array(Y)
    if learnFullModel:
        model=OneVsRestClassifier(LinearSVC(random_state=0)).fit(X, Y)
        joblib.dump(model, modelpath+'Model.libsvm')
        print('The Full Model is Saved!')
    Folds=pd.read_pickle(path+'Folds.df')
    for fold in range(Folds.shape[1]):
        start=time()
        Xtr,Ytr=X[Folds[fold].values,:],Y[Folds[fold].values]
        print('learning on fold...', Xtr.shape, fold)
        sys.stdout.flush()
        model=OneVsRestClassifier(LinearSVC(random_state=0)).fit(Xtr, Ytr)
        Xte=X[~Folds[fold].values,:]
        labels=model.classes_
#         Yte=remove_unknown_classes(Yte, labels)
#         idx=np.array(map(lambda x: len(x)>0,Yte))
#         Yte=np.array(Yte)[idx]
#         Xte=Xte[idx]
        print('predicting...', Xte.shape)
        sys.stdout.flush()
        pd.DataFrame(columns=labels,data=model.decision_function(Xte)).to_pickle('{}deci.{}.df'.format(outpath,fold))
#         (pd.DataFrame(columns=labels,data=MultiLabelBinarizer().fit_transform(list(Yte)+[labels]))).iloc[:-1].to_pickle('{}labels.{}.df'.format(outpath,fold))
#         ranking.to_pickle('{}ranking.{}.df'.format(outpath,fold))
        print('Done in {:.0f} minutes'.format((time()-start)/60.0))
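The commented-out remove_unknown_classes helper above is not shown in this example; a minimal sketch, assuming it simply drops test labels the model never saw during training (the exact behaviour is an assumption):

def remove_unknown_classes(Y, labels):
    # Keep only the labels that appear among the trained model's classes.
    known = set(labels)
    return [tuple(l for l in y if l in known) for y in Y]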
Example No. 2
    def setUp(self):
        import sklearn.svm as svm
        import sklearn.preprocessing as pp
        from sklearn.multiclass import OneVsRestClassifier

        # 3 classes (iris target, binarized into 3 columns)
        iris = datasets.load_iris()
        self.data = iris.data
        self.target = pp.LabelBinarizer().fit_transform(iris.target)
        self.df = pdml.ModelFrame(self.data, target=self.target)
        self.assertEqual(self.df.shape, (150, 7))

        svc1 = svm.SVC(probability=True, random_state=self.random_state)
        estimator1 = OneVsRestClassifier(svc1)
        self.df.fit(estimator1)
        self.df.predict(estimator1)
        self.assertTrue(isinstance(self.df.predicted, pdml.ModelFrame))

        svc2 = svm.SVC(probability=True, random_state=self.random_state)
        estimator2 = OneVsRestClassifier(svc2)
        estimator2.fit(self.data, self.target)
        self.pred = estimator2.predict(self.data)
        self.proba = estimator2.predict_proba(self.data)
        self.decision = estimator2.decision_function(self.data)

        # argument for classification reports
        self.labels = np.array([2, 1, 0])
Example No. 3
class ClassDistanceMapper(TransformerMixin):
    """ Fit a OneVsRestClassifier for each sentiment class (against all others
        combined) and return the distances from the decision boundary for each
        class. Hence, this transformation can be seen as a dimensionality
        reduction from #words to #sentiment_classes (=5).

    """

    def __init__(self):
        """ Initialize a one-vs-rest multiclass classifer with a
            SGDClassifier. The choice of the SGDclassifier here is arbitrary,
            any other classifier might work as well.

        """
        self.clf = OneVsRestClassifier(LogisticRegression())

    def fit(self, X, y):
        """ Fit the multiclass classifier. """
        self.clf.fit(X, y)
        return self

    def transform(self, X):
        """ Return the distance of each sample from the decision boundary for
            each class.

        """
        return self.clf.decision_function(X)
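Because the mapper exposes fit/transform, it can be dropped into a scikit-learn Pipeline; a minimal usage sketch (the CountVectorizer step and the final LogisticRegression are illustrative assumptions, not part of the example):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vectorize', CountVectorizer()),      # raw text -> #words features
    ('distances', ClassDistanceMapper()),  # #words -> #sentiment_classes
    ('classify', LogisticRegression()),    # final classifier on the distances
])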
Example No. 4
def benchmark(clf_current):
    print('_' * 80)
    print("Test performance for: ")
    clf_descr = str(clf_current).split('(')[0]
    print(clf_descr)
    t0 = time()
    classif = OneVsRestClassifier(clf_current)
    classif.fit(X_train, Y_train.toarray())
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    t0 = time()
    if hasattr(clf_current,"decision_function"):
        dfmatrix = classif.decision_function(X_test)
        score = metrics.f1_score(Y_test.toarray(), df_to_preds(dfmatrix, k = 5))
    else:
        probsmatrix = classif.predict_proba(X_test)
        score = metrics.f1_score(Y_test.toarray(), probs_to_preds(probsmatrix, k = 5))
        
    test_time = time() - t0

    
    print("f1-score:   %0.7f" % score)
    print("test time:  %0.3fs" % test_time)

    print('_' * 80)
    return clf_descr, score, train_time, test_time
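The helpers df_to_preds and probs_to_preds are not shown in this example; a minimal sketch, assuming each marks the k highest-scoring labels per row as positive (the top-k behaviour is an assumption based on the k=5 argument):

import numpy as np

def df_to_preds(score_matrix, k=5):
    # Mark the k labels with the highest scores in each row as predicted.
    preds = np.zeros_like(score_matrix, dtype=int)
    top_k = np.argsort(score_matrix, axis=1)[:, -k:]
    for row, cols in enumerate(top_k):
        preds[row, cols] = 1
    return preds

probs_to_preds = df_to_preds  # the same top-k rule applied to probabilities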
Example No. 5
def test_ovr_always_present():
    """Test that ovr works with classes that are always present or absent
    """
    # Note: tests is the case where _ConstantPredictor is utilised
    X = np.ones((10, 2))
    X[:5, :] = 0
    y = np.zeros((10, 3))
    y[5:, 0] = 1
    y[:, 1] = 1
    y[:, 2] = 1

    ovr = OneVsRestClassifier(LogisticRegression())
    assert_warns(UserWarning, ovr.fit, X, y)
    y_pred = ovr.predict(X)
    assert_array_equal(np.array(y_pred), np.array(y))
    y_pred = ovr.decision_function(X)
    assert_equal(np.unique(y_pred[:, -2:]), 1)
    y_pred = ovr.predict_proba(X)
    assert_array_equal(y_pred[:, -1], np.ones(X.shape[0]))

    # y has a constantly absent label
    y = np.zeros((10, 2))
    y[5:, 0] = 1  # variable label
    ovr = OneVsRestClassifier(LogisticRegression())
    assert_warns(UserWarning, ovr.fit, X, y)
    y_pred = ovr.predict_proba(X)
    assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0]))
Example No. 6
def test_ovr_fit_predict_sparse():
    for sparse in [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix, sp.lil_matrix]:
        base_clf = MultinomialNB(alpha=1)

        X, Y = datasets.make_multilabel_classification(
            n_samples=100, n_features=20, n_classes=5, n_labels=3, length=50, allow_unlabeled=True, random_state=0
        )

        X_train, Y_train = X[:80], Y[:80]
        X_test = X[80:]

        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        clf_sprs = OneVsRestClassifier(base_clf).fit(X_train, sparse(Y_train))
        Y_pred_sprs = clf_sprs.predict(X_test)

        assert_true(clf.multilabel_)
        assert_true(sp.issparse(Y_pred_sprs))
        assert_array_equal(Y_pred_sprs.toarray(), Y_pred)

        # Test predict_proba
        Y_proba = clf_sprs.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        pred = Y_proba > 0.5
        assert_array_equal(pred, Y_pred_sprs.toarray())

        # Test decision_function
        clf_sprs = OneVsRestClassifier(svm.SVC()).fit(X_train, sparse(Y_train))
        dec_pred = (clf_sprs.decision_function(X_test) > 0).astype(int)
        assert_array_equal(dec_pred, clf_sprs.predict(X_test).toarray())
Example No. 7
def test_ovr_multilabel_decision_function():
    X, Y = datasets.make_multilabel_classification(
        n_samples=100, n_features=20, n_classes=5, n_labels=3, length=50, allow_unlabeled=True, random_state=0
    )
    X_train, Y_train = X[:80], Y[:80]
    X_test = X[80:]
    clf = OneVsRestClassifier(svm.SVC()).fit(X_train, Y_train)
    assert_array_equal((clf.decision_function(X_test) > 0).astype(int), clf.predict(X_test))
Example No. 8
def test_ovr_single_label_decision_function():
    X, Y = datasets.make_classification(n_samples=100,
                                        n_features=20,
                                        random_state=0)
    X_train, Y_train = X[:80], Y[:80]
    X_test, Y_test = X[80:], Y[80:]
    clf = OneVsRestClassifier(svm.SVC()).fit(X_train, Y_train)
    assert_array_equal(clf.decision_function(X_test).ravel() > 0,
                       clf.predict(X_test))
Example No. 9
def main():
    #sets = select_by_trait(10,2,tags=["Comedy","Human","Sad","Dark"])
    sets = select_sets_by_tag(20,4,tag_names)
    #sets = random_select_sets(30,6)
    train_tags = fetch_tags(sets["train"])
    train_texts = id_to_filename(sets["train"])#txt_to_list(sets["train"])
    #vectorize
    count_vect = CountVectorizer(stop_words='english', encoding="utf-16", input="filename")
    X_train_counts = count_vect.fit_transform(train_texts)

    #tf-idf transformation
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    #process tags
    mlb = MultiLabelBinarizer()
    processed_train_tags = mlb.fit_transform(train_tags)
    #print(processed_train_tags)
    #classifier
    #clf = OneVsRestClassifier(MultinomialNB())
    clf = OneVsRestClassifier(LinearSVC())
    clf.fit(X_train_tfidf,processed_train_tags)
    print("classes:{}".format(clf.classes_))
    #process test set

    test_texts = id_to_filename(sets["test"])#txt_to_list(sets["test"])
    X_test_counts = count_vect.transform(test_texts)
    #print("X_test_counts inverse transformed: {}".format(count_vect.inverse_transform(X_test_counts)))
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    predicted_tags = clf.predict(X_test_tfidf)
    predicted_tags_readable = mlb.inverse_transform(predicted_tags)
    test_tags_actual = fetch_tags(sets["test"])
    predicted_probs = clf.decision_function(X_test_tfidf)
    #predicted_probs = clf.get_params(X_test_tfidf)
    class_list = mlb.classes_
    report = metrics.classification_report(mlb.transform(test_tags_actual),predicted_tags,target_names=class_list)
    print(report)
    #retrieve top 30% for each class
    top_percentage = 30
    threshold_index = int( len(sets["test"]) *(top_percentage/100.0) )
    threshold_vals_dic = {}
    threshold_vals = []
    num_classes = len(class_list)
    for i in range(num_classes):
        z = [ predicted_probs[j,i] for j in range(len(sets["test"]))]
        z.sort(reverse=True)
        threshold_vals_dic[class_list[i]]= z[threshold_index]
        threshold_vals.append(z[threshold_index])
    print(threshold_vals_dic)


    print_predictions(sets["test"],predicted_tags_readable,class_list, class_probablities=predicted_probs,threshold_vals=threshold_vals)
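Example No. 10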
def gensim_classifier():
  logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
  label_list = get_labels()
  tweet_list = get_labelled_tweets()

  # split all sentences to list of words
  sentences = []
  for tweet in tweet_list:
    temp_doc = tweet.split()
    sentences.append(temp_doc)

  # parameters for model
  num_features = 100
  min_word_count = 1
  num_workers = 4
  context = 2
  downsampling = 1e-3

  # Initialize and train the model
  w2v_model = Word2Vec(sentences, workers=num_workers, \
              size=num_features, min_count = min_word_count, \
              window = context, sample = downsampling, seed=1)

  index_value, train_set, test_set = train_test_split(0.80, sentences)
  train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
  test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)
  train_vector = Imputer().fit_transform(train_vector)
  test_vector = Imputer().fit_transform(test_vector)

  # train model and predict
  model = LinearSVC()
  classifier_fitted = OneVsRestClassifier(model).fit(train_vector, label_list[:index_value])
  result = classifier_fitted.predict(test_vector)

  # output result to csv
  create_directory('data')
  result.tofile("data/w2v_linsvc.csv", sep=',')

  # store the model to mmap-able files
  create_directory('model')
  joblib.dump(model, 'model/%s.pkl' % 'w2v_linsvc')

  # evaluation
  label_score = classifier_fitted.decision_function(test_vector)
  binarise_result = label_binarize(result, classes=class_list)
  binarise_labels = label_binarize(label_list, classes=class_list)

  evaluate(binarise_result, binarise_labels[index_value:], label_score, 'w2v_linsvc')
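Example No. 11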
def PR_multi_class(data_train, data_test, data_test_vectors):
    # Binarize the output
    y_train_label = label_binarize(data_train.target, classes=[0, 1, 2])
    n_classes = y_train_label.shape[1]
    
    random_state = np.random.RandomState(0)
    
    # shuffle and split training and test sets
    X_train, X_test, y_train, y_test = train_test_split(data_train_vectors, y_train_label, test_size=.5,
                                                        random_state=random_state)
    
    # Learn to predict each class against the other
    classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=random_state))
    classifier.fit(X_train, y_train)
    y_pred_score = classifier.decision_function(data_test_vectors)
    
    y_test_label = label_binarize(data_test.target, classes=[0, 1, 2])
    
    # Compute Precision-Recall and plot curve
    precision = dict()
    recall = dict()
    average_precision = dict()
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_test_label[:, i], y_pred_score[:, i])
        average_precision[i] = average_precision_score(y_test_label[:, i], y_pred_score[:, i])
    
    # Compute micro-average ROC curve and ROC area
    precision["micro"], recall["micro"], _ = precision_recall_curve(y_test_label.ravel(), y_pred_score.ravel())
    average_precision["micro"] = average_precision_score(y_test_label, y_pred_score, average="micro")
    
    # Plot Precision-Recall curve for each class
    plt.clf()
#    plt.plot(recall["micro"], precision["micro"],
#             label='micro-average PR curve (area = {0:0.2f})'
#                   ''.format(average_precision["micro"]))
    for i in range(n_classes):
        plt.plot(recall[i], precision[i],
                 label='PR curve of class {0} (area = {1:0.2f})'
                       ''.format(i, average_precision[i]))
    
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall curve of multi-class')
    plt.legend(loc="lower right")
    plt.show()
    return 0
Example No. 12
    def conduct_test(base_clf, test_predict_proba=False):
        clf = OneVsRestClassifier(base_clf).fit(X, y)
        assert_equal(set(clf.classes_), classes)
        y_pred = clf.predict(np.array([[0, 0, 4]]))[0]
        assert_equal(set(y_pred), set("eggs"))
        if hasattr(base_clf, 'decision_function'):
            dec = clf.decision_function(X)
            assert_equal(dec.shape, (5,))

        if test_predict_proba:
            X_test = np.array([[0, 0, 4]])
            probabilities = clf.predict_proba(X_test)
            assert_equal(2, len(probabilities[0]))
            assert_equal(clf.classes_[np.argmax(probabilities, axis=1)],
                         clf.predict(X_test))

        # test input as label indicator matrix
        clf = OneVsRestClassifier(base_clf).fit(X, Y)
        y_pred = clf.predict([[3, 0, 0]])[0]
        assert_equal(y_pred, 1)
Example No. 13
def lin_svc():
  label_list = get_labels()
  tweet_list = get_labelled_tweets()
  # vectorise using tf-idf
  vectoriser = TfidfVectorizer(min_df=3,
                               max_features=None,
                               strip_accents='unicode',
                               analyzer='word',
                               token_pattern=r'\w{1,}',
                               ngram_range=(1, 2),
                               use_idf=1,
                               smooth_idf=1,
                               sublinear_tf=1,)

  ## do transformation into vector
  fitted_vectoriser = vectoriser.fit(tweet_list)
  vectorised_tweet_list = fitted_vectoriser.transform(tweet_list)
  train_vector, test_vector, train_labels, test_labels = train_test_split(vectorised_tweet_list,
                                                                          label_list,
                                                                          test_size=0.8,
                                                                          random_state=42)

  # train model and predict
  model = LinearSVC()
  ovr_classifier = OneVsRestClassifier(model).fit(train_vector, train_labels)
  result = ovr_classifier.predict(test_vector)

  # output result to csv
  create_directory('data')
  save_to_csv("data/testset_labels.csv", test_labels)
  result.tofile("data/tfidf_linsvc.csv", sep=',')

  save_model(ovr_classifier, 'tfidf_linsvc')
  save_vectoriser(fitted_vectoriser, 'tfidf_vectoriser')

  # evaluation
  label_score = ovr_classifier.decision_function(test_vector)
  binarise_result = label_binarize(result, classes=class_list)
  binarise_labels = label_binarize(test_labels, classes=class_list)

  evaluate(binarise_result, binarise_labels, label_score, 'tfidf_linsvc')
Example No. 14
class SVM(ContinuousModel):

    """C-Support Vector Machine Classifier

    When decision_function_shape == 'ovr', we use OneVsRestClassifier(SVC) from
    sklearn.multiclass instead of the output of SVC directly, since the latter
    is not exactly a one-vs-rest implementation.

    References
    ----------
    http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
    """

    def __init__(self, *args, **kwargs):
        self.model = sklearn.svm.SVC(*args, **kwargs)
        if self.model.decision_function_shape == 'ovr':
            self.decision_function_shape = 'ovr'
            # sklearn's ovr isn't real ovr
            self.model = OneVsRestClassifier(self.model)

    def train(self, dataset, *args, **kwargs):
        return self.model.fit(*(dataset.format_sklearn() + args), **kwargs)

    def predict(self, feature, *args, **kwargs):
        return self.model.predict(feature, *args, **kwargs)

    def score(self, testing_dataset, *args, **kwargs):
        return self.model.score(*(testing_dataset.format_sklearn() + args),
                                **kwargs)

    def predict_real(self, feature, *args, **kwargs):
        dvalue = self.model.decision_function(feature, *args, **kwargs)
        if len(np.shape(dvalue)) == 1:  # n_classes == 2
            return np.vstack((-dvalue, dvalue)).T
        else:
            if self.decision_function_shape != 'ovr':
                LOGGER.warn("SVM model supports only 'ovr' for multiclass "
                            "predict_real.")
            return dvalue
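A minimal sketch of the two-class branch of predict_real above, using plain scikit-learn (no libact Dataset): with two classes, decision_function returns one score per sample, which predict_real stacks into per-class values (-d, d):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=50, n_features=4, random_state=0)
clf = SVC(kernel='linear').fit(X, y)
d = clf.decision_function(X)        # shape (n_samples,) when n_classes == 2
per_class = np.vstack((-d, d)).T    # shape (n_samples, 2), as in predict_real
assert per_class.shape == (50, 2)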
Example No. 15
def svm_training_1(combined_data):
    """
    Function to perform SVM training:
    1. benign vs pca training
    2. pca grading training on the pca group
    params:
        combined_data: dictionary of numpy arrays with 'data' and 'label1'
            entries for benign and pca patients
    TODO: balance the data sets so each grade has a similar number of voxels;
          optimize kernels to see which one fits our data the best
    """
    print('svm_training_1')
    start = time.time()
    training_data, test_data, training_target, test_target = ms.train_test_split(
        combined_data.get('data'), combined_data.get('label1'), test_size=0.2)
    svm_plain_classifier = OneVsRestClassifier(
        svm.SVC(C=1000.0,
                cache_size=200,
                class_weight='balanced',
                decision_function_shape=None,
                gamma=0.1,
                kernel='rbf',
                max_iter=-1,
                probability=False,
                random_state=None,
                shrinking=True,
                tol=0.001,
                verbose=False))
    svm_plain_classifier.fit(training_data, training_target)
    score = svm_plain_classifier.decision_function(test_data)
    false_pos = dict()
    true_pos = dict()
    roc_auc = dict()
    svm_predict = svm_plain_classifier.predict(test_data)
    accuracy = svm_plain_classifier.score(test_data, test_target)
    end = time.time()
    runtime = end - start
    print('runtime: ' + str(runtime))
    print('score: ')
    print(score)
    print("benign vs pca accuracy: " + str(accuracy))
    print(mt.confusion_matrix(test_target, svm_predict))
    classes = np.unique(combined_data.get('label1'))
    print(
        mt.classification_report(test_target,
                                 svm_predict,
                                 target_names=map(str, classes)))
    # benign_v_pca_results = [test_data, test_target, svm_predict]
    # binarize the targets so per-class ROC curves can be computed
    from sklearn.preprocessing import label_binarize
    test_target_bin = label_binarize(test_target, classes=classes)
    for i in range(len(classes)):
        false_pos[i], true_pos[i], _ = mt.roc_curve(test_target_bin[:, i],
                                                    score[:, i])
        roc_auc[i] = mt.auc(false_pos[i], true_pos[i])

    false_pos["micro"], true_pos["micro"], _ = mt.roc_curve(
        test_target_bin.ravel(), score.ravel())
    roc_auc["micro"] = mt.auc(false_pos["micro"], true_pos["micro"])

    plt.figure()
    lw = 2
    plt.plot(false_pos[2],
             true_pos[2],
             color='darkorange',
             lw=lw,
             label='ROC curve(area = %0.2f)' % roc_auc[2])
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic Example')
    plt.legend(loc='lower right')
    plt.show()
    return [[test_data, test_target, svm_predict]]
Example No. 16
# In[46]:


pca = PCA(n_components=n_components,whiten=True)
pca.fit(X_train_multiclass)
X_train_multiclass_pca = pca.transform(X_train_multiclass)
X_test_multiclass_pca = pca.transform(X_test_multiclass)


# In[48]:


oneRestClassifier=OneVsRestClassifier(lr)

oneRestClassifier.fit(X_train_multiclass_pca, y_train_multiclass)
y_score=oneRestClassifier.decision_function(X_test_multiclass_pca)


# In[49]:


# for each class
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
    precision[i], recall[i], _ = metrics.precision_recall_curve(y_test_multiclass[:, i], y_score[:, i])

# micro-average over all classes, computed once outside the per-class loop
precision['micro'], recall['micro'], _ = metrics.precision_recall_curve(y_test_multiclass.ravel(), y_score.ravel())
average_precision['micro'] = metrics.average_precision_score(y_test_multiclass, y_score, average='micro')
print('Average precision score, micro-averaged over all classes: {0:0.2f}'
      .format(average_precision['micro']))
Example No. 17
if args.load_n_classifier is None:
    n_estimator = OneVsRestClassifier(LinearSVC(random_state=0, C=100, loss='l1', penalty='l2'))
    n_estimator.fit(X_n_train_PCA, Y_n_train)
    if args.save_n_classifier is not None:
        pickle.dump(n_estimator, open(args.save_n_classifier, 'wb'))
else:
    n_estimator = pickle.load(open(args.load_n_classifier, 'rb'))
if args.load_s_classifier is None:
    s_estimator = OneVsRestClassifier(LinearSVC(random_state=0, C=100, loss='l1', penalty='l2'))
    s_estimator.fit(X_s_train_PCA, Y_s_train)
    if args.save_s_classifier is not None:
        pickle.dump(s_estimator, open(args.save_s_classifier, 'wb'))
else:
    s_estimator = pickle.load(open(args.load_s_classifier, 'rb'))
test_normal_scores = n_estimator.decision_function(X_n_test_PCA)
test_shuffled_scores = n_estimator.decision_function(X_s_test_PCA)
test_n_sm = [softmax(line) for line in test_normal_scores]
test_s_sm = [softmax(line) for line in test_shuffled_scores]

print('normal score:', test_normal_scores[0])
print('shuffled score:', test_shuffled_scores[0])

print('normal softmax:', test_n_sm[0])
print('shuffled softmax:', test_s_sm[0])

root_mse = [np.sqrt(mean_squared_error(test_n_sm[i], test_s_sm[i])) for i in range(len(test_n_sm))]
print(root_mse[:5])
print(np.mean(root_mse[:5]))
# dist = numpy.linalg.norm(a-b)
root_mse_mean = np.mean(root_mse)
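The softmax helper used above is not shown in this example; a minimal sketch, assuming it maps a row of decision scores to a probability-like vector:

import numpy as np

def softmax(scores):
    # Shift by the max for numerical stability before exponentiating.
    e = np.exp(np.asarray(scores) - np.max(scores))
    return e / e.sum()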
Example No. 18
                                  paperClassifier.y_train)

    # Find the best hyperparameters for the estimator
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(paperClassifier.parameters_grid.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    paperClassifier.text_clf = LogisticRegression(
        penalty='l2', tol=best_parameters['classifier__tol'])

    # Fit the model OneVsRestClassifier
    paper_clf = OneVsRestClassifier(paperClassifier.pipeline).fit(
        paperClassifier.x_train, paperClassifier.y_train)
    y_train_test_score = paper_clf.decision_function(
        paperClassifier.x_train_test)

    paperClassifier.plot_roc_curves(y_train_test_score)

    # Get test IDs too
    test_ids = list()
    with open('./data/test.csv', 'r') as f:
        next(f)
        for line in f:
            test_ids.append(line[:-2])

    y_pred = paper_clf.predict_proba(paperClassifier.x_test)

    # Write predictions to a file
    with open('sample_submission.csv', 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
Example No. 19
for i in range(T):
    labels = list(map(int, input().split(' ')))
    RawData.append(input())
    Labels.append(labels)

Queries = []
for i in range(E):
    Queries.append(input())

RawData.extend(Queries)
X = CVectorizer.fit_transform(RawData)
Xtf = TfIdfVectorizer.fit_transform(X)
del X

MLB = MultiLabelBinarizer()
Yt = MLB.fit_transform(Labels)
XtfTrain = Xtf[0:T]
XtfTest = Xtf[T:]
Clf = OneVsRestClassifier(LinearSVC(loss='l1', class_weight={
    1: 100,
    0: 1
})).fit(XtfTrain, Yt)
Classes = list(MLB.classes_)

for xTest in XtfTest:
    y = Clf.decision_function(xTest)
    y1 = list(y[0])
    c1 = Classes
    lbls = [x for (y, x) in sorted(zip(y1, c1))][-10:]
    lbls.reverse()
    print(' '.join([str(i) for i in lbls]))
Example No. 20
# split into input (X) and output (Y) variables
X = dataset[:, 0:13]
Y = dataset[:,13]
XTest1 = dataset[0:91, 0:13]
YTest1 = dataset[0:91, 13]
XTest1 = numpy.concatenate([XTest1, dataset[181:271, 0:13]])
YTest1 = numpy.concatenate([YTest1, dataset[181:271, 13]])
XTest1Valid = dataset[91:181, 0:13]
YTest1Valid = dataset[91:181, 13]

XPredict = dataset[250:271, 0:13]
YPredict = dataset[250:271, 13]

ovr = OneVsRestClassifier(svm.SVC(kernel='linear', C=2))
ovr.fit(X, Y)
cross = cross_val_score(ovr, X, Y, cv=3)
print("Cross-Validation")
print(cross)
scores = ovr.score(XTest1, YTest1)
print("Evalutation: %0.2f%%" % (scores.mean()*100))
scores = ovr.score(XTest1Valid, YTest1Valid)
print("Validation: %0.2f%%" % (scores.mean()*100))

print("Vraies valeurs = ")
print(YPredict)
print("Prédictions = ")
print(ovr.predict(XPredict))


print(ovr.decision_function(XPredict))
Example No. 21
class PPB2(BaseEstimator, ClassifierMixin):
    """PPB2 model"""
    def __init__(self, model="morg2-nn+nb", n_proc=8, k=200):
        model = model.split("-")
        assert len(model) == 2
        self.fp = model[0]
        assert self.fp in {
            "rdk", "morg2", "morg3", "rdk_maccs", "circular", "maccs", "all"
        }
        self.model_name = model[1]
        assert self.model_name in {
            "dum", "nn", "nb", "nn+nb", "bag", "lr", "svc", "etc", "ridge",
            "ada", "gb", "lda", "xgc"
        }
        self.n_proc = n_proc
        self.k = k

        model_name = self.model_name
        if model_name == "dum":
            self.model = DummyClassifier(strategy="stratified")
        elif model_name == "nn":
            self.model = KNeighborsClassifier(n_neighbors=self.k,
                                              metric="jaccard",
                                              algorithm="brute",
                                              n_jobs=self.n_proc)
        elif model_name == "nb":
            self.model = BernoulliNB(alpha=1.)
        elif model_name == "nn+nb":
            self.model = None
        elif model_name == "svc":
            self.model = SVC(probability=True)
        elif model_name == "bag":
            self.model = BaggingClassifier(
                # n_jobs=self.n_proc,
                n_jobs=None,
                verbose=True)
        elif model_name == "lr":
            self.model = LogisticRegressionCV(
                max_iter=1000,
                # n_jobs=self.n_proc,
                n_jobs=None,
            )
        elif model_name == "ada":
            self.model = AdaBoostClassifier()
        elif model_name == "gb":
            self.model = GradientBoostingClassifier()
        elif model_name == "lda":
            self.model = LinearDiscriminantAnalysis()
        elif model_name == "etc":
            self.model = ExtraTreesClassifier(
                n_estimators=500,
                bootstrap=True,
                max_features="log2",
                min_samples_split=10,
                max_depth=5,
                min_samples_leaf=3,
                verbose=True,
                n_jobs=n_proc
            )  # capable of multilabel classification out of the box
        elif model_name == "ridge":
            self.model = RidgeClassifierCV()
        elif model_name == "xgc":
            self.model = XGBClassifier(
                # n_jobs=self.n_proc,
                n_jobs=None,
                num_parallel_tree=None,
                verbosity=1)
        else:
            raise Exception("unrecognised model name: {}".format(model_name))

    def fit(self, X, y):
        """
        """
        assert isinstance(X, pd.Series)
        assert X.shape[0] == y.shape[0]

        print("fitting PPB2 model", "({}-{})".format(self.fp, self.model_name),
              "to", X.shape[0], "SMILES")

        if len(y.shape) == 1:
            print("fitting in the single-target setting")
            self.multi_label = False
        else:
            print("fitting in the multi-target setting")
            print("number of targets:", y.shape[1])
            self.multi_label = True

        if self.multi_label and self.model_name not in support_multi_label.union(
            {"nn+nb"}):
            self.model = OneVsRestClassifier(  # wrap classifier in OneVsRestClassifier for multi-label case
                self.model,
                n_jobs=self.n_proc)

        # convert X to fingerprints
        # X = load_training_fingerprints(X, self.fp,)
        X = compute_fp(smiles=X, all_fp=self.fp, n_proc=self.n_proc)

        if self.model_name in dense_input:  # cannot handle sparse input
            X = X.A

        if self.model_name == "nn+nb":  # keep training data references for local NB fitting
            self.X = X
            self.y = y

        assert X.shape[0] == y.shape[0]

        if self.model is not None:
            print("fitting", self.model_name, "model to", X.shape[0], "'",
                  self.fp, "' fingerprints", "of shape", X.shape, "for",
                  y.shape[1], "targets", "using", self.n_proc, "core(s)")

            with parallel_backend('loky', n_jobs=self.n_proc):
                self.model.fit(X, y)

        return self

    # def _determine_k_closest_samples(self, X, chunksize=1000):
    #     if not isinstance(X, np.ndarray): # dense needed for jaccard distance
    #         X = X.A

    #     # training_samples = load_training_fingerprints(self.X, self.fp)
    #     training_samples = self.X
    #     if not isinstance(training_samples, np.ndarray):
    #         training_samples = training_samples.A
    #     training_labels = self.y
    #     if not isinstance(training_labels, np.ndarray):
    #         training_labels = training_labels.A

    #     print ("determining", self.k,
    #         "nearest compounds to each query")
    #     n_queries = X.shape[0]
    #     n_chunks = n_queries // chunksize + 1
    #     print ("chunking queries with chunksize", chunksize,)
    #     print ("number of chunks:", n_chunks)
    #     # idx = np.empty((n_queries, self.k))

    #     for chunk in range(n_chunks):

    #         chunk_queries = X[chunk*chunksize:(chunk+1)*chunksize]

    #         dists = pairwise_distances(
    #                 chunk_queries,
    #                 training_samples,
    #             metric="jaccard", n_jobs=self.n_proc, )
    #         # idx[chunk*chunksize:(chunk+1)*chunksize] = \
    #         idx =  dists.argsort(axis=-1)[:,:self.k] # smallest k distances

    #         k_nearest_samples = training_samples[idx] # return dense
    #         k_nearest_labels = training_labels[idx]

    #         yield (chunk_queries,
    #             k_nearest_samples, k_nearest_labels)

    #         print ("completed chunk", chunk+1)

    # print ("closest", self.k, "neighbours determined")

    # assert idx.shape[0] == X.shape[0]
    # assert idx.shape[1] == self.k

    # k_nearest_samples = training_samples[idx] # return dense
    # k_nearest_labels = training_labels[idx]

    # return k_nearest_samples, k_nearest_labels

    def _fit_local_nb(self, query, mode="predict", alpha=1.):

        if len(query.shape) == 1:
            query = query[None, :]

        X = self.X
        y = self.y

        assert isinstance(query, sp.csr_matrix)
        assert query.dtype == bool
        assert isinstance(X, sp.csr_matrix)
        assert X.dtype == bool

        # sparse jaccard distance
        assert query.shape[1] == X.shape[1]
        dists = pairwise_distances(query.A, X.A, metric="jaccard", n_jobs=1)
        idx = dists.argsort()[0, :self.k]

        assert query.shape[0] == 1

        X = X[idx]
        y = y[idx]

        n_targets = y.shape[-1]

        pred = np.zeros(n_targets)
        ones_idx = y.all(axis=0)
        zeros_idx = (1 - y).all(axis=0)

        # set prediction for classes where only positive class
        # is seen
        pred[ones_idx] = 1

        # only fit on targets with pos and neg examples
        idx = ~np.logical_or(ones_idx, zeros_idx)
        if idx.any():
            nb = BernoulliNB(alpha=alpha)
            if idx.sum() > 1:
                nb = OneVsRestClassifier(nb, n_jobs=1)
            y_ = y[:, idx]
            if idx.sum() == 1:
                y_ = y_.flatten()
            nb.fit(X, y_)
            pred_ = (nb.predict(query)[0]
                     if mode == "predict" else nb.predict_proba(query)[0])
            if idx.sum() == 1 and mode != "predict":
                assert pred_.shape[0] == 2
                assert nb.classes_.any()
                pred_ = pred_[nb.classes_ == 1]
            pred[idx] = pred_
        return pred

    def _local_nb_prediction(
            self,
            queries,
            # X, y,
            mode="predict"):
        print("fitting unique NB models for each query", "in mode", mode)

        n_queries = queries.shape[0]

        with mp.Pool(processes=self.n_proc) as p:
            predictions = p.map(
                functools.partial(self._fit_local_nb, mode=mode),
                (query for query in queries))

        predictions = np.array(predictions)
        assert predictions.shape[0] == n_queries

        if self.multi_label:
            assert predictions.shape[1] == self.y.shape[-1]

        return predictions

    def predict(self, X):
        print("predicting for", X.shape[0], "query molecules")
        X = compute_fp(X, self.fp, n_proc=self.n_proc)
        print("performing prediction", "using", self.n_proc, "processes")

        if self.model_name == "nn+nb":

            return self._local_nb_prediction(X, mode="predict")
        else:
            if self.model_name in dense_input \
                and not isinstance(X, np.ndarray):
                X = X.A
            assert hasattr(self.model, "predict")

            with parallel_backend('threading', n_jobs=self.n_proc):
                return self.model.predict(X)

    def predict_proba(self, X):
        print("predicting probabilities for", X.shape[0], "query molecules")
        X = compute_fp(X, self.fp, n_proc=self.n_proc)
        print("performing probability prediction", "using", self.n_proc,
              "processes")
        if self.model_name == "nn+nb":

            return self._local_nb_prediction(X, mode="predict_proba")

        if self.model_name in dense_input \
            and not isinstance(X, np.ndarray):
            X = X.A

        if self.model_name in support_multi_label:
            with parallel_backend('threading', n_jobs=self.n_proc):
                probs = self.model.predict_proba(
                    X)  # handle missing classes correctly
            classes = self.model.classes_
            return np.hstack([
                probs[:, idx] if idx.any() else 1 - probs
                for probs, idx in zip(probs, classes)
            ])  # check for existence of positive class

        else:
            assert isinstance(self.model, OneVsRestClassifier)
            if hasattr(self.model, "predict_proba"):
                with parallel_backend('threading', n_jobs=self.n_proc):
                    return self.model.predict_proba(X)
            elif hasattr(self.model, "decision_function"):
                print("predicting with decision function")
                with parallel_backend('threading', n_jobs=self.n_proc):
                    return self.model.decision_function(X)
            else:
                raise Exception

    def decision_function(self, X):
        print("predicting probabilities for", X.shape[0], "query molecules")
        X = compute_fp(X, self.fp, n_proc=self.n_proc)
        print("determining decision function", "using", self.n_proc,
              "processes")
        if self.model_name == "nn+nb":

            return self._local_nb_prediction(
                X,
                mode="predict_proba")  # NB does not have a decision function

        if self.model_name in dense_input \
            and not isinstance(X, np.ndarray):
            X = X.A

        if self.model_name in support_multi_label:  # k-neighbours has no decision function
            with parallel_backend('threading', n_jobs=self.n_proc):
                probs = self.model.predict_proba(
                    X)  # handle missing classes correctly
            classes = self.model.classes_
            return np.hstack([
                probs[:, idx] if idx.any() else 1 - probs
                for probs, idx in zip(probs, classes)
            ])  # check for existence of positive class

        else:
            assert isinstance(self.model, OneVsRestClassifier)

            if hasattr(self.model, "decision_function"):
                with parallel_backend('threading', n_jobs=self.n_proc):
                    return self.model.decision_function(X)
            elif hasattr(self.model, "predict_proba"):
                print("predicting using probability")
                with parallel_backend('threading', n_jobs=self.n_proc):
                    return self.model.predict_proba(X)
            else:
                raise Exception

    def check_is_fitted(self):
        if self.model is None:
            return True
        try:
            check_is_fitted(self.model)
            return True
        except NotFittedError:
            return False

    def __str__(self):
        return "PPB2({}-{})".format(self.fp, self.model_name)

    def set_n_proc(self, n_proc):
        self.n_proc = n_proc
        if self.model is not None:
            self.model.n_jobs = n_proc

    def set_k(self, k):
        self.k = k
        if isinstance(self.model, KNeighborsClassifier):
            self.model.n_neighbors = k
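Example No. 22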
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.5,
                                                    random_state=0)

# Learn to predict each class against the other
classifier = OneVsRestClassifier(
    svm.SVC(kernel='linear', probability=True, random_state=random_state))
classifier = classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
y_score = classifier.decision_function(X_test)

feature_list = range(4)
target_names = ['setosa', 'versicolor', 'virginica']

# Create a trained model instance
ce = ClassifierEvaluator(classifier,
                         y_test,
                         y_pred,
                         y_score,
                         feature_list,
                         target_names,
                         estimator_name='super awesome SVC')

template = '''
           # Report
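Example No. 23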
class MySVM:
    #_tasks = ['sede1', 'sede2', 'sede12', 'morfo1', 'morfo2', 'morfo12']
    _tasks = ['sede1', 'sede12', 'morfo1', 'morfo2']
    #_tasks = ['sede1']
    _filesFolder = "./filesFolds-SVMbigramsPROOFRAND"
    _memmapFolder = "./memmapFolds-SVMbigramsPROOFRAND"
    _corpusFolder = "./corpusLSTM_ICDO3"

    _fileLb = {
        'sede1': _memmapFolder + "/binarizers/lbSede1.p",
        'sede2': _memmapFolder + "/binarizers/lbSede2.p",
        'sede12': _memmapFolder + "/binarizers/lbSede12.p",
        'morfo1': _memmapFolder + "/binarizers/lbMorfo1.p",
        'morfo2': _memmapFolder + "/binarizers/lbMorfo2.p",
        'morfo12': _memmapFolder + "/binarizers/lbMorfo12.p"
    }
    _fileEvaluation = _filesFolder + "/outputSVM/evaluation.txt"
    _fileModel = {
        'sede1': _filesFolder + "/modelsSVM/modelCatSede1.h5",
        'sede2': _filesFolder + "/modelsSVM/modelCatSede2.h5",
        'sede12': _filesFolder + "/modelsSVM/modelCatSede12.h5",
        'morfo1': _filesFolder + "/modelsSVM/modelCatMorfo1.h5",
        'morfo2': _filesFolder + "/modelsSVM/modelCatMorfo2.h5",
        'morfo12': _filesFolder + "/modelsSVM/modelCatMorfo12.h5",
    }

    _textFile = _corpusFolder + "/text.txt"
    _fileSedeClean = _corpusFolder + "/sedeClean.txt"
    _fileMorfoClean = _corpusFolder + "/morfoClean.txt"
    _fileVectors = _corpusFolder + "/vectors.txt"

    _fileMemmapX = "./tmp/X.dat"
    _fileMemmapYUn = {
        'sede1': "./tmp/yUnSede1.dat",
        'sede2': "./tmp/yUnSede2.dat",
        'sede12': "./tmp/yUnSede12.dat",
        'morfo1': "./tmp/yUnMorfo1.dat",
        'morfo2': "./tmp/yUnMorfo2.dat",
        'morfo12': "./tmp/yUnMorfo12.dat"
    }
    _fileMemmapY = {
        'sede1': "./tmp/ySede1.dat",
        'sede2': "./tmp/ySede2.dat",
        'sede12': "./tmp/ySede12.dat",
        'morfo1': "./tmp/yMorfo1.dat",
        'morfo2': "./tmp/yMorfo2.dat",
        'morfo12': "./tmp/yMorfo12.dat"
    }

    _fileShapes = _memmapFolder + "/shapes.p"
    _fileIndexes = _memmapFolder + "/indexes.p"

    _fileMemmapXTrain = _memmapFolder + "/XTrain.dat"
    _fileMemmapYTrain = {
        'sede1': _memmapFolder + "/ySede1Train.dat",
        'sede2': _memmapFolder + "/ySede2Train.dat",
        'sede12': _memmapFolder + "/ySede12Train.dat",
        'morfo1': _memmapFolder + "/yMorfo1Train.dat",
        'morfo2': _memmapFolder + "/yMorfo2Train.dat",
        'morfo12': _memmapFolder + "/yMorfo12Train.dat"
    }

    _fileMemmapXTest = _memmapFolder + "/XTest.dat"
    _fileMemmapYTest = {
        'sede1': _memmapFolder + "/ySede1Test.dat",
        'sede2': _memmapFolder + "/ySede2Test.dat",
        'sede12': _memmapFolder + "/ySede12Test.dat",
        'morfo1': _memmapFolder + "/yMorfo1Test.dat",
        'morfo2': _memmapFolder + "/yMorfo2Test.dat",
        'morfo12': _memmapFolder + "/yMorfo12Test.dat"
    }

    def extractData(self):
        self._phraseLen = 100
        self.stratifications = 10

        with open(self._textFile) as fid:
            text = fid.readlines()

        with open(self._fileSedeClean) as fid:
            sedeClean = fid.readlines()

        with open(self._fileMorfoClean) as fid:
            morfoClean = fid.readlines()

        vectorizer = TfidfVectorizer(min_df=3,
                                     max_df=0.3,
                                     strip_accents='unicode',
                                     ngram_range=(1, 2))
        #vectorizer = TfidfVectorizer(min_df=3, max_df=0.5, strip_accents='unicode', ngram_range=(1,2))
        vectorizer.fit(text)
        self._vecLen = len(vectorizer.get_feature_names())
        #X = np.memmap(self._fileMemmapX, mode='w+', shape=(len(text), self._vecLen), dtype=np.float)
        #X[:] = vectorizer.transform(text).toarray()
        self.X = vectorizer.transform(text)

        del text

        yUn = {}

        yUn['sede1'] = np.memmap(self._fileMemmapYUn['sede1'],
                                 mode='w+',
                                 shape=(len(sedeClean)),
                                 dtype=np.int)
        yUn['sede2'] = np.memmap(self._fileMemmapYUn['sede2'],
                                 mode='w+',
                                 shape=(len(sedeClean)),
                                 dtype=np.int)
        yUn['sede12'] = np.memmap(self._fileMemmapYUn['sede12'],
                                  mode='w+',
                                  shape=(len(sedeClean)),
                                  dtype=np.int)
        for i, c in enumerate(sedeClean):
            yUn['sede1'][i], yUn['sede2'][i] = c.split()
            yUn['sede12'][i] = yUn['sede1'][i] * 10 + yUn['sede2'][i]

        yUn['morfo1'] = np.memmap(self._fileMemmapYUn['morfo1'],
                                  mode='w+',
                                  shape=(len(morfoClean)),
                                  dtype=np.int)
        yUn['morfo2'] = np.memmap(self._fileMemmapYUn['morfo2'],
                                  mode='w+',
                                  shape=(len(morfoClean)),
                                  dtype=np.int)
        yUn['morfo12'] = np.memmap(self._fileMemmapYUn['morfo12'],
                                   mode='w+',
                                   shape=(len(morfoClean)),
                                   dtype=np.int)
        for i, c in enumerate(morfoClean):
            yUn['morfo1'][i], yUn['morfo2'][i] = c.split()
            yUn['morfo12'][i] = yUn['morfo1'][i] * 10 + yUn['morfo2'][i]

        self.lb = LabelBinarizer()
        self.lb.fit(yUn['sede12'])

        self.y = np.memmap(self._fileMemmapY['sede12'],
                           mode='w+',
                           shape=(len(sedeClean), len(self.lb.classes_)),
                           dtype=np.int)
        self.y[:] = self.lb.transform(yUn['sede12'])

        #del yUn[task]

        print("Splitting data")
        skf = StratifiedKFold(n_splits=self.stratifications)

        self.trainIndexes = []
        self.testIndexes = []

        for train, test in skf.split(np.zeros(len(yUn['sede12'])),
                                     yUn['sede12']):
            self.trainIndexes.append(train)
            self.testIndexes.append(test)

        #self.fold = random.randint(0,9)
        self.fold = 1

        self.XTrain = self.X[self.trainIndexes[self.fold]]
        self.XTest = self.X[self.testIndexes[self.fold]]

        self.yTrain = np.memmap(self._fileMemmapYTrain['sede12'],
                                mode='w+',
                                shape=(len(self.trainIndexes[self.fold]),
                                       len(self.lb.classes_)),
                                dtype=np.int)
        self.yTest = np.memmap(self._fileMemmapYTest['sede12'],
                               mode='w+',
                               shape=(len(self.testIndexes[self.fold]),
                                      len(self.lb.classes_)),
                               dtype=np.int)

        self.yTrain[:] = self.y[self.trainIndexes[self.fold]]
        self.yTest[:] = self.y[self.testIndexes[self.fold]]

        self.yTrain.flush()
        self.yTest.flush()

    def createModels(self):
        print("Creating models")

        self.model = OneVsRestClassifier(LinearSVC())
        self.model.fit(self.XTrain, self.yTrain)

    def evaluate(self):
        print("Evaluating Test")
        self._evaluate(self.XTest, self.yTest)
        #print("Evaluating Train")
        #self._evaluate(self.XTrain, self.yTrain)

    def _evaluate(self, X, y):

        metrics = {}

        table = [[
            "task", "average", "MAPs", "MAPc", "accur.", "kappa", "prec.",
            "recall", "f1score"
        ]]
        na = ' '

        table.append([" ", " ", " ", " ", " ", " ", " ", " "])
        yp = self.model.decision_function(X)
        yt = y
        ytn = self.lb.inverse_transform(yt)
        yc = np.zeros(yt.shape, np.int)
        for i, p in enumerate(yp):
            yc[i][np.argmax(p)] = 1
        ycn = self.lb.inverse_transform(yc)

        metrics = {}
        metrics['MAPs'] = MAPScorer().samplesScore(yt, yp)
        metrics['MAPc'] = MAPScorer().classesScore(yt, yp)
        metrics['accuracy'] = accuracy_score(yt, yc)
        metrics['kappa'] = cohen_kappa_score(ytn, ycn)

        metrics['precision'] = {}
        metrics['recall'] = {}
        metrics['f1score'] = {}

        table.append([
            'sede12', na, "{:.3f}".format(metrics['MAPs']),
            "{:.3f}".format(metrics['MAPc']),
            "{:.3f}".format(metrics['accuracy']),
            "{:.3f}".format(metrics['kappa']), na, na, na
        ])
        for avg in ['micro', 'macro', 'weighted']:
            metrics['precision'][avg], metrics['recall'][avg], metrics[
                'f1score'][avg], _ = precision_recall_fscore_support(
                    yt, yc, average=avg)
            table.append([
                'sede12', avg, na, na, na, na,
                "{:.3f}".format(metrics['precision'][avg]),
                "{:.3f}".format(metrics['recall'][avg]),
                "{:.3f}".format(metrics['f1score'][avg])
            ])

        #metrics['pr-curve'] = {}
        #metrics['pr-curve']['x'], metrics['pr-curve']['y'], metrics['pr-curve']['auc'] = self._calculateMicroMacroCurve(lambda y,s: (lambda t: (t[1],t[0]))(precision_recall_curve(y,s)), yt, yp)

        #metrics['roc-curve'] = {}
        #metrics['roc-curve']['x'], metrics['roc-curve']['y'], metrics['roc-curve']['auc'] = self._calculateMicroMacroCurve(lambda y,s: (lambda t: (t[0],t[1]))(roc_curve(y,s)), yt, yp)

        print(tabulate(table))
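The MAPScorer used in _evaluate above is not shown in this example; a minimal sketch, assuming samplesScore is average precision averaged over samples and classesScore is average precision averaged over classes (names and semantics are assumptions):

from sklearn.metrics import average_precision_score

class MAPScorer:
    def samplesScore(self, y_true, y_score):
        # Mean average precision, averaged over samples (rows).
        return average_precision_score(y_true, y_score, average='samples')

    def classesScore(self, y_true, y_score):
        # Mean average precision, averaged over classes (columns).
        return average_precision_score(y_true, y_score, average='macro')

Example No. 24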
def run_prototype(snow_tweets_folder,
                  prototype_output_folder,
                  restart_probability,
                  number_of_threads):
    """
    This is a sample execution of the User Network Profile Classifier Prototype.

    Specifically:
           - Reads a set of tweets from a local folder.
           - Forms graphs and text-based vector representation for the users involved.
           - Fetches Twitter lists for influential users.
           - Extracts keywords from Twitter lists and thus annotates these users as experts in these topics.
           - Extracts graph-based features using the ARCTE algorithm.
           - Performs user classification for the rest of the users.
    """
    if number_of_threads is None:
        number_of_threads = get_threads_number()

    ####################################################################################################################
    # Read data.
    ####################################################################################################################
    # Read graphs.
    edge_list_path = os.path.normpath(snow_tweets_folder + "/graph.tsv")
    adjacency_matrix = read_adjacency_matrix(file_path=edge_list_path,
                                             separator='\t')
    number_of_nodes = adjacency_matrix.shape[0]

    # Read labels.
    node_label_list_path = os.path.normpath(snow_tweets_folder + "/user_label_matrix.tsv")
    user_label_matrix, number_of_categories, labelled_node_indices = read_node_label_matrix(node_label_list_path,
                                                                                            '\t')

    ####################################################################################################################
    # Extract features.
    ####################################################################################################################
    features = arcte(adjacency_matrix,
                     restart_probability,
                     0.00001,
                     number_of_threads=number_of_threads)

    features = normalize_columns(features)

    percentages = np.arange(1, 11, dtype=np.int)
    trial_num = 10

    ####################################################################################################################
    # Perform user classification.
    ####################################################################################################################
    mean_macro_precision = np.zeros(percentages.size, dtype=np.float)
    std_macro_precision = np.zeros(percentages.size, dtype=np.float)
    mean_micro_precision = np.zeros(percentages.size, dtype=np.float)
    std_micro_precision = np.zeros(percentages.size, dtype=np.float)
    mean_macro_recall = np.zeros(percentages.size, dtype=np.float)
    std_macro_recall = np.zeros(percentages.size, dtype=np.float)
    mean_micro_recall = np.zeros(percentages.size, dtype=np.float)
    std_micro_recall = np.zeros(percentages.size, dtype=np.float)
    mean_macro_F1 = np.zeros(percentages.size, dtype=np.float)
    std_macro_F1 = np.zeros(percentages.size, dtype=np.float)
    mean_micro_F1 = np.zeros(percentages.size, dtype=np.float)
    std_micro_F1 = np.zeros(percentages.size, dtype=np.float)
    F1 = np.zeros((percentages.size, number_of_categories), dtype=np.float)
    for p in np.arange(percentages.size):
        percentage = percentages[p]
        # Initialize the metric storage arrays to zero
        macro_precision = np.zeros(trial_num, dtype=np.float)
        micro_precision = np.zeros(trial_num, dtype=np.float)
        macro_recall = np.zeros(trial_num, dtype=np.float)
        micro_recall = np.zeros(trial_num, dtype=np.float)
        macro_F1 = np.zeros(trial_num, dtype=np.float)
        micro_F1 = np.zeros(trial_num, dtype=np.float)
        trial_F1 = np.zeros((trial_num, number_of_categories), dtype=np.float)

        folds = generate_folds(user_label_matrix,
                               labelled_node_indices,
                               number_of_categories,
                               percentage,
                               trial_num)
        for trial in np.arange(trial_num):
            train, test = next(folds)
            ########################################################################################################
            # Separate train and test sets
            ########################################################################################################
            X_train, X_test, y_train, y_test = features[train, :],\
                                                features[test, :],\
                                                user_label_matrix[train, :],\
                                                user_label_matrix[test, :]

            contingency_matrix = chi2_contingency_matrix(X_train, y_train)
            community_weights = peak_snr_weight_aggregation(contingency_matrix)
            X_train, X_test = community_weighting(X_train, X_test, community_weights)

            ####################################################################################################
            # Train model
            ####################################################################################################
            # Train classifier
            model = OneVsRestClassifier(svm.LinearSVC(C=1,
                                                      random_state=None,
                                                      dual=False,
                                                      fit_intercept=True),
                                        n_jobs=number_of_threads)

            model.fit(X_train, y_train)
            ####################################################################################################
            # Make predictions
            ####################################################################################################
            y_pred = model.decision_function(X_test)

            y_pred = form_node_label_prediction_matrix(y_pred, y_test)

            ########################################################################################################
            # Calculate measures
            ########################################################################################################
            measures = evaluation.calculate_measures(y_pred, y_test)

            macro_recall[trial] = measures[0]
            micro_recall[trial] = measures[1]

            macro_precision[trial] = measures[2]
            micro_precision[trial] = measures[3]

            macro_F1[trial] = measures[4]
            micro_F1[trial] = measures[5]

            trial_F1[trial, :] = measures[6]

        mean_macro_precision[p] = np.mean(macro_precision)
        std_macro_precision[p] = np.std(macro_precision)
        mean_micro_precision[p] = np.mean(micro_precision)
        std_micro_precision[p] = np.std(micro_precision)
        mean_macro_recall[p] = np.mean(macro_recall)
        std_macro_recall[p] = np.std(macro_recall)
        mean_micro_recall[p] = np.mean(micro_recall)
        std_micro_recall[p] = np.std(micro_recall)
        mean_macro_F1[p] = np.mean(macro_F1)
        std_macro_F1[p] = np.std(macro_F1)
        mean_micro_F1[p] = np.mean(micro_F1)
        std_micro_F1[p] = np.std(micro_F1)
        F1[p, :] = np.mean(trial_F1, axis=0)

    measure_list = [(mean_macro_precision, std_macro_precision),
                    (mean_micro_precision, std_micro_precision),
                    (mean_macro_recall, std_macro_recall),
                    (mean_micro_recall, std_micro_recall),
                    (mean_macro_F1, std_macro_F1),
                    (mean_micro_F1, std_micro_F1),
                    F1]

    write_results(measure_list,
                  os.path.normpath(prototype_output_folder + "/F1_average_scores.txt"))


def ROC_multi_class(data_train, data_train_vectors, data_test, data_test_vectors):
    
    # Binarize the output
    y_train_label = label_binarize(data_train.target, classes=[0, 1, 2])
    n_classes = y_train_label.shape[1]
    
    
    random_state = np.random.RandomState(1)
    
    # shuffle and split training and test sets
    X_train, X_test, y_train, y_test = train_test_split(data_train_vectors, y_train_label, test_size=.5,
                                                        random_state=0)
    
    # Learn to predict each class against the other
    classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=random_state))
    classifier.fit(X_train, y_train)
    y_pred_score = classifier.decision_function(data_test_vectors)
    
    y_test_label = label_binarize(data_test.target, classes=[0, 1, 2])
    
    
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_label[:, i], y_pred_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test_label.ravel(), y_pred_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    
    
    # Plot ROC curves for the multiclass problem
    # Compute macro-average ROC curve and ROC area
    
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    
    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    
    # Finally average it and compute AUC
    mean_tpr /= n_classes
    
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    
    # Plot all ROC curves
    plt.figure()
#    plt.plot(fpr["micro"], tpr["micro"],
#             label='micro-average ROC curve (area = {0:0.2f})'
#                   ''.format(roc_auc["micro"]),
#             linewidth=2)
#    
#    plt.plot(fpr["macro"], tpr["macro"],
#             label='macro-average ROC curve (area = {0:0.2f})'
#                   ''.format(roc_auc["macro"]),
#             linewidth=2)
    
    for i in range(n_classes):
        plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
                                       ''.format(i, roc_auc[i]))
    
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Multi-class receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
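    # Quick cross-check of the hand-rolled curves above (a sketch; sklearn's
    # macro average is the mean of per-class AUCs, so it can differ slightly
    # from the interpolation-based macro curve computed earlier).
    from sklearn.metrics import roc_auc_score
    print('micro AUC check: {0:0.4f}'.format(
        roc_auc_score(y_test_label, y_pred_score, average='micro')))
    print('macro AUC check: {0:0.4f}'.format(
        roc_auc_score(y_test_label, y_pred_score, average='macro')))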
    
    return 0
Ejemplo n.º 26
0
                               solver='adam',
                               verbose=0,
                               random_state=21)
#                     warm_start=warm_start_set)

#sys.exit(0)  # early debug exit disabled so that the training below actually runs
print("Start training...")
classifier.fit(X_train, y_train)
#sys.exit(0)

#---------------------use X_test for evaluation------------------

if (classifier_type == 'svm'):
    #note that in svm predict_proba can be inconsistent with the predict function
    #use decision_function --> consistent with predict
    y_pred_proba = classifier.decision_function(
        X_test)  #returns signed distances to the separating hyperplanes

if (classifier_type == 'mlp'):
    y_pred_proba = classifier.predict_proba(X_test)

all_labels = classifier.classes_

#----------------------get top-k results-------------------------

#print("Training finished(test on original dataset):\ncomponent type: {} \nemddeing: {} \nclassifier: {}\n" \
#      .format(cur_exp_param,cur_sent_embd_type,classifier_type))

y_top_K = []
# --pick out the max probability labels(by sorting predict_proba or decision_function)
#--note this may be different in rnn
if (classifier_type == 'mlp' or classifier_type == 'svm'):
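    # Sketch of the elided top-K step, not the original code (assumes numpy as
    # np): rank the labels of each sample by score and keep the K best; K = 5
    # is an illustrative value only.
    K = 5
    for row in y_pred_proba:
        top_idx = np.argsort(row)[::-1][:K]
        y_top_K.append([all_labels[j] for j in top_idx])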
Ejemplo n.º 27
0
Archivo: komd.py Proyecto: sspeng/MKLpy
class KOMD(BaseEstimator, ClassifierMixin):
    """KOMD.
    
    KOMD is a kernel method for classification and ranking.
    
    Read more in http://www.math.unipd.it/~dasan/papers/km-omd.icann08.pdf
    by F. Aiolli, G. Da San Martino, and A. Sperduti.
    
    For details on the precise mathematical formulation of the provided
    kernel functions and how `gamma`, `coef0` and `degree` affect each
    other, see the corresponding section in the narrative documentation:
    :ref:`svm_kernels`.

    Parameters
    ----------
    lam : float, (default=0.1)
        Specifies the lambda value, between 0.0 and 1.0.
    
    kernel : optional (default='rbf')
        Specifies the kernel function used by the algorithm.
        It must be one of 'linear', 'poly', 'rbf', 'precomputed', a callable
        or a Gram matrix. If none is given, 'rbf' will be used. If a callable
        is given, it is used to pre-compute the kernel matrix from data
        matrices; that matrix should be an array of shape ``(n_samples, n_samples)``.
    
    rbf_gamma : float, optional (default=0.1)
        Coefficient for 'rbf' and 'poly' kernels.
        Ignored by all other kernels.
    
    degree : float, optional (default=2.0)
        Specifies the degree of the 'poly' kernel.
        Ignored by all other kernels.
    
    coef0 : float, optional (default=0.0)
        Specifies the coef0 term of the 'poly' kernel.
        Ignored by all other kernels.
    
    max_iter : int, optional (default=100)
        Hard limit on the number of iterations within the solver; it cannot be negative.
    
    verbose : bool, (default=False)
        Enable verbose output during fit.
    
    multiclass_strategy : string, optional (default='ova')
        Specifies the strategy used in the multiclass case.
        'ova' for the one-vs-all pattern (also called one-vs-rest),
        'ovo' for the one-vs-one pattern.
        Any other string falls back to 'ova'.
    
    Attributes
    ----------
    gamma : array-like, shape = [n_samples]
        Probability-like vector that defines the distance vector
        over the two classes.
    
    classes_ : array-like, shape = [n_classes]
        Vector that contains all possible labels
    
    multiclass_ : boolean,
        True if the number of classes > 2
    
    Examples
    --------
    >>> import numpy as np
    >>> from ??.komd import KOMD
    >>> X = np.array([[1,2,i] for i in range(5)])
    >>> Y = np.array([1,1,1,-1,-1])
    >>> cls = KOMD()
    >>> cls = cls.fit(X,Y)
    >>> print(cls.predict([[1,1,5]]))
    [1]
    
    References
    ----------
    `A Kernel Method for the Optimization of the Margin Distribution
    <http://www.math.unipd.it/~dasan/papers/km-omd.icann08.pdf>`__
    """
    
    def __init__(self, lam = 0.1, kernel = 'rbf', rbf_gamma = 0.1, degree = 2.0, coef0 = 0.0, max_iter = 100, verbose = False, multiclass_strategy = 'ova'):
        self.lam = lam
        self.gamma = None
        self.bias = None
        self.X = None
        self.Y = None
        self.is_fitted = False
        self.rbf_gamma = rbf_gamma
        self.degree = degree
        self.coef0 = coef0
        self.max_iter = max_iter
        self.verbose = verbose
        self.kernel = kernel
        self.multiclass_strategy = multiclass_strategy
        self.multiclass_ = None
        self.classes_ = None
        self._pairwise = self.kernel=='precomputed'

    def __kernel_definition__(self):
        """Select the kernel function
        
        Returns
        -------
        kernel : a callable relative to selected kernel
        """
        if hasattr(self.kernel, '__call__'):
            return self.kernel
        if self.kernel == 'rbf' or self.kernel is None:
            return lambda X,Y : rbf_kernel(X,Y,self.rbf_gamma)
        if self.kernel == 'poly':
            return lambda X,Y : polynomial_kernel(X, Y, degree=self.degree, gamma=self.rbf_gamma, coef0=self.coef0)
        if self.kernel == 'linear':
            return lambda X,Y : linear_kernel(X,Y)
        if self.kernel == 'precomputed':
            return lambda X,Y : X
    
    def fit(self, X, Y):
        
        """Fit the model according to the given training data
        
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Matrix of the examples, where
            n_samples is the number of samples and
            n_feature is the number of features
        
        Y : array-like, shape = [n_samples]
            array of the labels relative to X
        
        Returns
        -------
        self : object
            Returns self
        """
        X,Y = validation.check_X_y(X, Y, dtype=np.float64, order='C', accept_sparse='csr')
        #check_consistent_length(X,Y)
        check_classification_targets(Y)
        
        self.classes_ = np.unique(Y)
        if len(self.classes_) < 2:
            raise ValueError("The number of classes has to be almost 2; got ", len(self.classes_))
        
        if len(self.classes_) == 2:
            self.multiclass_ = False
            return self._fit(X,Y)
        else :
            self.multiclass_ = True
            if self.multiclass_strategy == 'ovo':
                return self._one_vs_one(X,Y)
            else :
                return self._one_vs_rest(X,Y)
        raise ValueError('This is a very bad exception...')
    
    def _one_vs_one(self,X,Y):
        self.cls = OneVsOneClassifier(KOMD(**self.get_params())).fit(X,Y)
        self.is_fitted = True
        return self
    
    def _one_vs_rest(self,X,Y):
        self.cls = OneVsRestClassifier(KOMD(**self.get_params())).fit(X,Y)
        self.is_fitted = True
        return self
        
    def _fit(self,X,Y):    
        self.X = X
        values = np.unique(Y)
        Y = [1 if l==values[1] else -1 for l in Y]
        self.Y = Y
        npos = len([1.0 for l in Y if l == 1])
        nneg = len([1.0 for l in Y if l == -1])
        gamma_unif = matrix([1.0/npos if l == 1 else 1.0/nneg for l in Y])
        YY = matrix(np.diag(list(matrix(Y))))

        Kf = self.__kernel_definition__()
        ker_matrix = matrix(Kf(X,X).astype(np.double))
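        # The QP below minimizes gamma' * ((1-lam)*Y*K*Y + lam*D) * gamma over
        # gamma >= 0, with the entries of gamma belonging to the positive class
        # and to the negative class each summing to one (rows of A, entries of b).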
        #KLL = (1.0 / (gamma_unif.T * YY * ker_matrix * YY * gamma_unif)[0])*(1.0-self.lam)*YY*ker_matrix*YY
        KLL = (1.0-self.lam)*YY*ker_matrix*YY
        LID = matrix(np.diag([self.lam * (npos * nneg / (npos+nneg))]*len(Y)))
        Q = 2*(KLL+LID)
        p = matrix([0.0]*len(Y))
        G = -matrix(np.diag([1.0]*len(Y)))
        h = matrix([0.0]*len(Y),(len(Y),1))
        A = matrix([[1.0 if lab==+1 else 0 for lab in Y],[1.0 if lab2==-1 else 0 for lab2 in Y]]).T
        b = matrix([[1.0],[1.0]],(2,1))
        
        solvers.options['show_progress'] = False#True
        solvers.options['maxiters'] = self.max_iter
        sol = solvers.qp(Q,p,G,h,A,b)
        self.gamma = sol['x']
        if self.verbose:
            print ('[KOMD]')
            print ('optimization finished, #iter = %d' % sol['iterations'])
            print ('status of the solution: %s' % sol['status'])
            print ('objval: %.5f' % sol['primal objective'])
            
        bias = 0.5 * self.gamma.T * ker_matrix * YY * self.gamma
        self.bias = bias
        self.is_fitted = True
        self.ker_matrix = ker_matrix
        return self
        
    def predict(self, X):
        """Perform classification on samples in X.
        
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Matrix containing new samples
        
        Returns
        -------
        y_pred : array, shape = [n_samples]
            The value of prediction for each sample
        """
        
        if self.is_fitted == False:
            raise NotFittedError("This KOMD instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.")
        X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C")
        if self.multiclass_ == True:
            return self.cls.predict(X)
        
        return np.array([self.classes_[1] if p >=0 else self.classes_[0] for p in self.decision_function(X)])

    def get_params(self, deep=True):
        # this estimator has parameters:
        return {"lam": self.lam, "kernel": self.kernel, "rbf_gamma":self.rbf_gamma,
                "degree":self.degree, "coef0":self.coef0, "max_iter":self.max_iter,
                "verbose":self.verbose, "multiclass_strategy":self.multiclass_strategy}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self,parameter,value)
        return self


    def decision_function(self, X):
        """Distance of the samples in X to the separating hyperplane.
        
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        
        Returns
        -------
        Z : array-like, shape = [n_samples, 1]
            Returns the decision function of the samples.
        """
        
        if self.is_fitted == False:
            raise NotFittedError("This KOMD instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.")
        X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C")
        
        if self.multiclass_ == True:
            return self.cls.decision_function(X)
        
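        # Binary score of a sample x: f(x) = sum_i Y_i * gamma_i * K(x, x_i) - bias,
        # i.e. a label-signed, gamma-weighted kernel expansion over the training set.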
        Kf = self.__kernel_definition__()
        YY = matrix(np.diag(list(matrix(self.Y))))
        ker_matrix = matrix(Kf(X,self.X).astype(np.double))
        z = ker_matrix*YY*self.gamma
        z = z-self.bias
        return np.array(list(z))
Ejemplo n.º 28
0
    def _predict(self, train_index, test_index):
        """
        :param train_index: list with the indices of the model data used to train the SVM.
        :param test_index:  list with the indices to predict.
        :return:    tuple (label_prediction, label_score, word_prediction, word_score, bin_predictions) for test_index, obtained by:
                        - predicting the label (hyper/hypo/normal) with an SVM, using all train_index rows of the same meal type
                        - predicting the word with an SVM trained only on the hyper/hypo/normal train_index rows matching the predicted label
        """
        # models to predict
        data_pred = self.loader.get_models(test_index,
                                           {'weight': self.weights})
        data_pred = data_pred.iloc[:, 1:-1]

        # Get models with words
        wdata = self.loader.get_models(train_index, {'weight': self.weights})

        bin_labels = []
        score_label = []

        # Get Possible meal labels
        meal_types = list(wdata.iloc[:, 3].unique())
        for d in data_pred.iloc[:, 2].unique():
            if d not in meal_types:
                meal_types.append(d)

        # split models by type of meal
        for etiqueta_apat in meal_types:
            mdata_pred = data_pred.loc[data_pred.iloc[:, 2] == etiqueta_apat]
            mwdata = wdata.loc[wdata.iloc[:, 3] == etiqueta_apat]

            # Get models hipo/hiper/norm labels
            ldata_labels = self.loader.get_labels_of_words(
                mwdata.iloc[:, -1].tolist())

            # Predict label
            svm = OneVsRestClassifier(
                SVC(kernel=self._kernel, C=self._C, gamma=self._gamma))
            y = label_binarize(ldata_labels, classes=[-1, 0, 1])
            svm.fit(mwdata.iloc[:, 1:-1], y)
            mbin_labels = svm.predict(mdata_pred)
            mscore_label = svm.decision_function(mdata_pred)
            if len(bin_labels):
                bin_labels = np.concatenate((bin_labels, mbin_labels), axis=0)
                score_label = np.concatenate((score_label, mscore_label),
                                             axis=0)
            else:
                bin_labels = mbin_labels
                score_label = mscore_label

        ldata_labels = self.loader.get_labels_of_words(wdata.iloc[:,
                                                                  -1].tolist())
        # Predict word using only vocabulary of label predicted
        res_word = []
        res_label = []
        score_word = []
        predicters = {}
        for i in range(len(bin_labels)):
            label = None
            for l in range(-1, 2):
                if bin_labels[i][l + 1] == 1:
                    label = l
                    break
            if label is None:  # 0 is a valid label, so test explicitly for None
                maxscore = -1000
                for s in range(-1, 2):
                    score = score_label[i][s + 1]
                    if score >= maxscore:
                        label = s
                        maxscore = score
            res_label.append(label)
            # Work only with sessions of the label
            # sessions = [wdata.iloc[z, 0] for z in range(len(wdata)) if ldata_labels[z] == label]
            #
            # models = wdata[wdata.id.isin(sessions)]
            # models_labels = models.iloc[:, -1]
            # models = models.iloc[:, 1:-1]
            #
            # if not predicters.get(str(label), False):
            #     svm = SVC(kernel=self._kernel, C=self._C, gamma=self._gamma)
            #     svm.fit(models, models_labels)
            #     predicters[str(label)] = svm
            #
            # svm = predicters.get(str(label))
            # res_w = svm.predict(data_pred)
            # score_w = svm.decision_function(data_pred)
            #
            # res_word.append(res_w)
            # score_word.append(score_w)

        return (res_label, score_label, res_word, score_word, bin_labels)
Ejemplo n.º 29
0
n_classes = Y.shape[1]

# Split into training and test
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    doc_vec, Y, test_size=.3, random_state=10)

# We use OneVsRestClassifier for multi-label prediction
from sklearn.multiclass import OneVsRestClassifier

# Run classifier
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', random_state=12))
#classifier = OneVsRestClassifier(RandomForestClassifier(n_estimators=25, random_state=1))

classifier.fit(X_train, Y_train)
#y_score = classifier.predict_proba(X_test)
y_score = classifier.decision_function(X_test)

# For each class
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(
        Y_test[:, i], y_score[:, i])
    average_precision[i] = average_precision_score(Y_test[:, i], y_score[:, i])

# A "micro-average": quantifying score on all classes jointly
precision["micro"], recall["micro"], _ = precision_recall_curve(
    Y_test.ravel(), y_score.ravel())
average_precision["micro"] = average_precision_score(Y_test,
                                                     y_score,
Ejemplo n.º 30
0
# In[41]:


print(classification_report(y_test, mn_y_pred))
print(classification_report(y_test, svc_y_pred))


# In[38]:


from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

clf = OneVsRestClassifier(svc)
clf.fit(X_train, y_train)
y_score = clf.decision_function(X_test)
# For each class
precision = dict()
recall = dict()
average_precision = dict()
n_classes = y_bin.shape[1]
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(y_test[:, i],
                                                        y_score[:, i])
    average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i])
    
precision["micro"], recall["micro"], _ = precision_recall_curve(y_test.ravel(), y_score.ravel())
average_precision["micro"] = average_precision_score(y_test, y_score, average="micro")
print('Average precision score, micro-averaged over all classes: {0:0.2f}'.format(average_precision["micro"]))
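# A minimal follow-up sketch (assumes matplotlib.pyplot is imported as plt, as
# in the other examples): plot the micro-averaged precision-recall curve.
plt.step(recall["micro"], precision["micro"], where="post")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Micro-averaged PR curve (AP = {0:0.2f})".format(average_precision["micro"]))
plt.show()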

Ejemplo n.º 31
0
def cross_validation(X, y, n_trials=5, trial_splits=None, fname=None):
    """Perform model selection via 5-fold cross validation"""
    # filter samples with no annotations
    del_rid = np.where(y.sum(axis=1) == 0)[0]
    y = np.delete(y, del_rid, axis=0)
    X = np.delete(X, del_rid, axis=0)

    # range of hyperparameters
    C_range = 10.**np.arange(-1, 3)
    gamma_range = 10.**np.arange(-3, 1)

    # pre-generating kernels
    print("### Pregenerating kernels...")
    K_rbf = {}
    for gamma in gamma_range:
        K_rbf[gamma] = rbf_kernel(X, gamma=gamma)
    print("### Done.")

    # performance measures
    perf = dict()
    pr_micro = []
    pr_macro = []
    fmax = []
    acc = []

    if trial_splits is None:
        # shuffle and split training and test sets
        trials = ShuffleSplit(n_splits=n_trials,
                              test_size=0.2,
                              random_state=None)
        ss = trials.split(X)
        trial_splits = []
        for train_idx, test_idx in ss:
            trial_splits.append((train_idx, test_idx))

    it = 0
    for jj in range(0, n_trials):
        train_idx = trial_splits[jj][0]
        test_idx = trial_splits[jj][1]
        it += 1
        y_train = y[train_idx]
        y_test = y[test_idx]
        print("### [Trial %d] Perfom cross validation...." % (it))
        print("Train samples=%d; #Test samples=%d" %
              (y_train.shape[0], y_test.shape[0]))
        # setup for neasted cross-validation
        splits = ml_split(y_train)

        # parameter fitting
        C_opt = None
        gamma_opt = None
        max_aupr = 0
        for C in C_range:
            for gamma in gamma_range:
                # Multi-label classification
                cv_results = []
                for train, valid in splits:
                    clf = OneVsRestClassifier(svm.SVC(C=C,
                                                      kernel='precomputed',
                                                      probability=False),
                                              n_jobs=-1)
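                    # With kernel='precomputed', fit() consumes the
                    # train-vs-train Gram matrix while decision_function() and
                    # predict() consume eval-vs-train blocks, hence the
                    # row/column slicing below.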
                    K_train = K_rbf[gamma][
                        train_idx[train], :][:, train_idx[train]]
                    K_valid = K_rbf[gamma][
                        train_idx[valid], :][:, train_idx[train]]
                    y_train_t = y_train[train]
                    y_train_v = y_train[valid]
                    y_score_valid = np.zeros(y_train_v.shape, dtype=float)
                    y_pred_valid = np.zeros_like(y_train_v)
                    idx = np.where(y_train_t.sum(axis=0) > 0)[0]
                    clf.fit(K_train, y_train_t[:, idx])
                    y_score_valid[:, idx] = clf.decision_function(K_valid)
                    y_pred_valid[:, idx] = clf.predict(K_valid)
                    perf_cv = evaluate_performance(y_train_v, y_score_valid,
                                                   y_pred_valid)
                    cv_results.append(perf_cv['m-aupr'])
                cv_aupr = np.median(cv_results)
                print("### gamma = %0.3f, C = %0.3f, AUPR = %0.3f" %
                      (gamma, C, cv_aupr))
                if cv_aupr > max_aupr:
                    C_opt = C
                    gamma_opt = gamma
                    max_aupr = cv_aupr
        print("### Optimal parameters: ")
        print("C_opt = %0.3f, gamma_opt = %0.3f" % (C_opt, gamma_opt))
        print("### Train dataset: AUPR = %0.3f" % (max_aupr))
        print("### Using full training data...")
        clf = OneVsRestClassifier(svm.SVC(C=C_opt,
                                          kernel='precomputed',
                                          probability=False),
                                  n_jobs=-1)
        y_score = np.zeros(y_test.shape, dtype=float)
        y_pred = np.zeros_like(y_test)
        idx = np.where(y_train.sum(axis=0) > 0)[0]
        clf.fit(K_rbf[gamma_opt][train_idx, :][:, train_idx], y_train[:, idx])

        # Compute performance on test set
        y_score[:, idx] = clf.decision_function(
            K_rbf[gamma_opt][test_idx, :][:, train_idx])
        y_pred[:, idx] = clf.predict(K_rbf[gamma_opt][test_idx, :][:,
                                                                   train_idx])
        perf_trial = evaluate_performance(y_test, y_score, y_pred)
        pr_micro.append(perf_trial['m-aupr'])
        pr_macro.append(perf_trial['M-aupr'])
        fmax.append(perf_trial['F1'])
        acc.append(perf_trial['acc'])
        print(
            "### Test dataset: AUPR['micro'] = %0.3f, AUPR['macro'] = %0.3f, F1 = %0.3f, Acc = %0.3f"
            % (perf_trial['m-aupr'], perf_trial['M-aupr'], perf_trial['F1'],
               perf_trial['acc']))
    perf['m-aupr_avg'] = np.mean(pr_micro)
    perf['m-aupr_std'] = np.std(pr_micro)
    perf['M-aupr_avg'] = np.mean(pr_macro)
    perf['M-aupr_std'] = np.std(pr_macro)
    perf['F1_avg'] = np.mean(fmax)
    perf['F1_std'] = np.std(fmax)
    perf['acc_avg'] = np.mean(acc)
    perf['acc_std'] = np.std(acc)

    if fname is not None:
        fout = open(fname, 'w')
        fout.write("aupr[micro], aupr[macro], F_max, accuracy\n")
        for ii in range(0, n_trials):
            fout.write("%0.5f, %0.5f, %0.5f, %0.5f\n" %
                       (pr_micro[ii], pr_macro[ii], fmax[ii], acc[ii]))
        fout.close()

    return perf
Ejemplo n.º 32
0
total_y_test = []
total_y_pred = []
test11 = []
test22 = []
for train_index, test_index in kf.split(X):
    # print(train_index,test_index)
    # print("_")
    train_X = fromIndexToFeatures(X, train_index)
    train_y = fromIndexToLabels(y, train_index)
    test_X = fromIndexToFeatures(X, test_index)
    test_y = fromIndexToLabels(y, test_index)

    test11.extend(test_y)

    clf.fit(train_X, train_y)

    score = clf.decision_function(test_X)
    for i in score:
        test22.append(i)

    y_pred = clf.predict(test_X)
    total_y_test.extend(test_y)
    total_y_pred.extend(y_pred)
    print('done')
    # print(train_X)
    # print()
test11 = np.asarray(test11)
test22 = np.asarray(test22)
test11 = label_binarize(test11, classes=[0, 1, 2, 3, 4, 5, 6, 7])  # binarize the fold-order labels so rows align with test22

print(confusion_matrix(total_y_test, total_y_pred))
print(classification_report(total_y_test, total_y_pred))
Ejemplo n.º 33
0
def temporal_holdout(X,
                     y,
                     indx,
                     bootstrap,
                     fname,
                     goterms=None,
                     go_fname=None):
    """Perform temporal holdout validation"""

    X_train = X[indx['train'].tolist()]
    X_test = X[indx['test'].tolist()]
    X_valid = X[indx['valid'].tolist()]
    # keep these as arrays: .shape and column slicing are used below
    y_train = np.asarray(y['train'].tolist())
    y_test = np.asarray(y['test'].tolist())
    y_valid = np.asarray(y['valid'].tolist())
    if goterms is not None:
        goterms = goterms['terms'].tolist()

    # range of hyperparameters
    C_range = 10.**np.arange(-1, 3)
    gamma_range = 10.**np.arange(-3, 1)

    # pre-generating kernels
    print("### Pregenerating kernels...")
    K_rbf_train = {}
    K_rbf_test = {}
    K_rbf_valid = {}
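    # Precomputed kernels: the train block is square (n_train x n_train), while
    # the test and valid blocks are rectangular (n_eval x n_train), which is
    # the layout SVC(kernel='precomputed') expects at fit and predict time.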
    for gamma in gamma_range:
        K_rbf_train[gamma] = rbf_kernel(X_train, gamma=gamma)
        K_rbf_test[gamma] = rbf_kernel(X_test, X_train, gamma=gamma)
        K_rbf_valid[gamma] = rbf_kernel(X_valid, X_train, gamma=gamma)
    print("### Done.")
    print("Train samples=%d; #Test samples=%d" %
          (y_train.shape[0], y_test.shape[0]))

    # parameter fitting
    C_opt = None
    gamma_opt = None
    max_aupr = 0
    for C in C_range:
        for gamma in gamma_range:
            # Multi-label classification
            clf = OneVsRestClassifier(svm.SVC(C=C,
                                              kernel='precomputed',
                                              probability=False),
                                      n_jobs=-1)
            clf.fit(K_rbf_train[gamma], y_train)
            y_score_valid = clf.decision_function(K_rbf_valid[gamma])
            y_pred_valid = clf.predict(K_rbf_valid[gamma])
            perf = evaluate_performance(y_valid, y_score_valid, y_pred_valid)
            micro_aupr = perf['m-aupr']
            print("### gamma = %0.3f, C = %0.3f, AUPR = %0.3f" %
                  (gamma, C, micro_aupr))
            if micro_aupr > max_aupr:
                C_opt = C
                gamma_opt = gamma
                max_aupr = micro_aupr
    print("### Optimal parameters: ")
    print("C_opt = %0.3f, gamma_opt = %0.3f" % (C_opt, gamma_opt))
    print("### Train dataset: AUPR = %0.3f" % (max_aupr))
    print("### Computing performance on test dataset...")
    clf = OneVsRestClassifier(svm.SVC(C=C_opt,
                                      kernel='precomputed',
                                      probability=False),
                              n_jobs=-1)
    clf.fit(K_rbf_train[gamma_opt], y_train)

    # Compute performance on test set
    y_score = clf.decision_function(K_rbf_test[gamma_opt])
    y_pred = clf.predict(K_rbf_test[gamma_opt])

    # performance measures for bootstrapping
    perf = dict()
    pr_micro = []
    pr_macro = []
    fmax = []
    acc = []

    # individual goterms (skipped when no GO terms were supplied)
    pr_goterms = {}
    if goterms is not None:
        for i in range(0, len(goterms)):
            pr_goterms[goterms[i]] = []

    for ind in bootstrap:
        perf_ind = evaluate_performance(y_test[ind], y_score[ind], y_pred[ind])
        pr_micro.append(perf_ind['m-aupr'])
        pr_macro.append(perf_ind['M-aupr'])
        fmax.append(perf_ind['F1'])
        acc.append(perf_ind['acc'])
        if goterms is not None:
            for i in range(0, len(goterms)):
                pr_goterms[goterms[i]].append(perf_ind[i])

    perf['m-aupr_avg'] = np.mean(pr_micro)
    perf['m-aupr_std'] = np.std(pr_micro)
    perf['M-aupr_avg'] = np.mean(pr_macro)
    perf['M-aupr_std'] = np.std(pr_macro)
    perf['F1_avg'] = np.mean(fmax)
    perf['F1_std'] = np.std(fmax)
    perf['acc_avg'] = np.mean(acc)
    perf['acc_std'] = np.std(acc)

    # trials
    fout = open(fname, 'w')
    fout.write("aupr[micro], aupr[macro], F_max, accuracy\n")
    for it in range(0, len(bootstrap)):
        fout.write("%0.5f, %0.5f, %0.5f, %0.5f\n" %
                   (pr_micro[it], pr_macro[it], fmax[it], acc[it]))
    fout.close()

    # write performance on individual GO terms
    if go_fname is not None and goterms is not None:
        fout = open(go_fname, 'w')
        fout.write("GO_id, AUPRs\n")
        for i in range(0, len(goterms)):
            fout.write("%s %0.5f" % (goterms[i],
                                     sum(y_train[:, i]) / float(y_train.shape[0])))
            for pr in pr_goterms[goterms[i]]:
                fout.write(" %0.5f" % pr)
            fout.write("\n")
        fout.close()

    return perf
Ejemplo n.º 34
0
class TextClassifier:
    def __init__(self):
        self.vectorizer = None
        self.clf = None

        self.doc_ids = None
        self.label2id = None
        self.id2label = None

        self.platt_a = None
        self.platt_b = None
        self.dist_max = None
        self.dist_min = None

    def _get_label_dicts(self, labels):
        """
        Create dictionaries mapping labels to integers 0 to n, in which n is the
        number of unique labels encountered in the given list of labels.
        :param labels: (list)
        """

        sorted_labels = sorted(set([l.strip() for ls in labels for l in ls]))
        self.label2id = {l.strip(): i for i, l in enumerate(sorted_labels)}
        self.id2label = {i: l.strip() for l, i in self.label2id.items()}

    def _file_save(self, path, filename, platt_a, platt_b, dist_max, dist_min):
        """
        :param path: (string)
        :param filename: (str) 
        :param platt_a: (float)
        :param platt_b: (float)
        :param dist_max: (float)
        :param dist_min: (float)
        """

        with open(path + '{0}_vec.pkl'.format(filename), 'wb') as f:
            dill.dump(self.vectorizer, f)
        with open(path + '{0}_clf.pkl'.format(filename), 'wb') as f:
            dill.dump(self.clf, f)

        with open(path + '{0}.json'.format(filename), 'w') as f:
            d = {
                'classifier_name': '{0}_clf.pkl'.format(filename),
                'vectorizer_name': '{0}_vec.pkl'.format(filename),
                'save_datetime': str(datetime.now()),
                'parameters': {
                    'PlattA': str(platt_a),
                    'PlattB': str(platt_b),
                    'DistMaximum': str(dist_max.tostring()),
                    'DistMinimum': str(dist_min.tostring()),
                    'DocumentIDs': self.doc_ids,
                    'Labels2IDs': self.label2id
                }
            }
            json.dump(json.dumps(d), f, indent=4)

    def _load_from_file(self, path, filename):
        """
        :param path: (string)
        :param filename: (string) 
        """

        with open(path + '{0}.json'.format(filename), 'r') as f:
            metadata = json.loads(json.load(f))

        with open(path + '{0}'.format(metadata['classifier_name']), 'rb') as f:
            self.clf = dill.load(f)
        with open(path + '{0}'.format(metadata['vectorizer_name']), 'rb') as f:
            self.vectorizer = dill.load(f)

        self.platt_a = float(metadata['parameters']['PlattA'])
        self.platt_b = float(metadata['parameters']['PlattB'])
        self.dist_max = np.fromstring(
            eval(metadata['parameters']['DistMaximum']))
        self.dist_min = np.fromstring(
            eval(metadata['parameters']['DistMinimum']))
        self.doc_ids = metadata['parameters']['DocumentIDs']
        self.label2id = metadata['parameters']['Labels2IDs']
        self.id2label = {i: l.strip() for l, i in self.label2id.items()}

    def _predict_multi(self, documents, output_positive_score=False):
        """
        Returns label guesses (with probability of accuracy) for each document.
        :param documents: (list)
        :param output_positive_score: (bool, False by default) 
        :return: list of tuples (str, float) of label predictions and associated probabilities
        """

        doc_vectors = self.vectorizer.transform(documents)
        decisions = self.clf.decision_function(doc_vectors)

        a = self.platt_a if self.platt_a is not None else -5.
        b = self.platt_b if self.platt_b is not None else 1.
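        # Platt scaling: squash raw SVM margins into [0, 1] pseudo-probabilities
        # with the sigmoid 1 / (1 + exp(a*d + b)); a and b fall back to -5 and 1
        # when no fitted parameters are available.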

        pdf = 1. / (1. + np.exp(a * decisions + b))
        assert isinstance(pdf, np.ndarray)

        classes = [str(self.id2label[i]) for i in range(pdf.shape[1])]
        predictions = []
        for ps in pdf:
            if output_positive_score:
                zp = zip(
                    np.array(classes)[decisions[0] > 0].tolist(),
                    [float(x) for x in ps[decisions[0] > 0]])
            else:
                zp = zip(classes, map(lambda x: float(x), ps))
            predictions.append(sorted(zp, reverse=True, key=lambda x: x[1]))
        return predictions

    def label_vectorizer(self, labels):
        """
        Turn a list of labels into an equivalent binarized array of labels.
        :param labels: (list)
        :return: (ndarray)
        """

        label_ids = [[self.label2id[l.strip()] for l in ls] for ls in labels]
        return MultiLabelBinarizer(
            classes=range(len(self.label2id))).fit_transform(label_ids)

    def train(self, documents, labels, identifiers):
        """
        Fits vectorizer and classifier
        :param documents: (list)
        :param labels: (list)
        :param identifiers: (list)
        """

        self.vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                                          min_df=1,
                                          tokenizer=lemma_tokenizer)
        self.clf = OneVsRestClassifier(LinearSVC(random_state=0))

        self._get_label_dicts(labels)
        self.doc_ids = identifiers

        x = self.vectorizer.fit_transform(documents)
        y = self.label_vectorizer(labels)
        self.clf.fit(x, y)

    def predict(self, documents):
        """
        Returns an array of predictions for documents.
        :param documents: (list)
        :return: (ndarray)
        """

        prediction = self._predict_multi(documents)
        return np.array(prediction)[:, 0]

    def grid_predict(self, documents, platt_a, platt_b, low_memory=False):
        """
        Returns label guesses (with probability of accuracy) for each document. This function is only 
        executed when grid searches for the parameters of Platt's posterior probability bootstrapping
        algorithm are being performed.  Otherwise predict_multi is run.
        :param documents: (list)
        :param platt_a: Platt parameter A (float) 
        :param platt_b: Platt parameter B (float)
        :param low_memory: (bool)
        :return: ndarray if low_memory is True, list if low_memory is False
        """

        decisions = self.decision_function(documents)

        if low_memory:
            pdf = np.exp(platt_a * decisions + platt_b).astype(np.float16)
            pdf += 1.
            return 1. / pdf

        pdf = 1. / (1. + np.exp(platt_a * decisions + platt_b))
        assert isinstance(pdf, np.ndarray)

        classes = [self.id2label[i] for i in range(pdf.shape[1])]
        predictions_bulk = []
        for ps in pdf:
            prediction = zip(classes, ps)
            prediction = sorted(prediction, reverse=True, key=lambda s: s[1])
            predictions_bulk.append(prediction)
        return predictions_bulk

    def decision_function(self, documents):
        """
        Returns the decision function values
        :param documents: (list)
        :return: (ndarray)
        """

        doc_vectors = self.vectorizer.transform(documents)
        return self.clf.decision_function(doc_vectors)

    def save(self,
             path,
             name,
             platt_a,
             platt_b,
             dist_max,
             dist_min,
             in_db=False):
        """
        :param path: (string)
        :param name: (str) 
        :param platt_a: (float)
        :param platt_b: (float)
        :param dist_max: (float)
        :param dist_min: (float)
        :param in_db: (bool)
        """

        file_name = '{0}_{1}'.format(name, uuid4())
        if not in_db:
            self._file_save(path, file_name, platt_a, platt_b, dist_max,
                            dist_min)
            return file_name
        else:
            raise NotImplementedError

    def load(self, path, name, in_db=False):
        """
        :param path: (string)
        :param name: (str) 
        :param in_db: (bool)
        """

        if not in_db:
            self._load_from_file(path, name)
        else:
            raise NotImplementedError
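# A minimal usage sketch for the TextClassifier above, on hypothetical toy
# data; it assumes the module-level dependencies (TfidfVectorizer, LinearSVC,
# MultiLabelBinarizer and a lemma_tokenizer callable) are importable.
tc = TextClassifier()
tc.train(documents=['cheap pills now', 'meeting at noon tomorrow'],
         labels=[['spam'], ['ham']],
         identifiers=['d1', 'd2'])
print(tc.predict(['cheap pills at noon']))  # top (label, probability) pair per document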
Ejemplo n.º 35
0
modelsvm.fit(X_train, y_train)
#The best hyper parameters set
print("Best Hyper Parameters:\n", modelsvm.best_params_)
y_pred = modelsvm.predict(X_test)
y_pred_train = modelsvm.predict(X_train)

y_train1 = label_binarize(y_train, classes=[0, 1, 2, 3, 4])
y_pred_train1 = label_binarize(y_pred_train, classes=[0, 1, 2, 3, 4])
y_pred1 = label_binarize(y_pred, classes=[0, 1, 2, 3, 4])
y_test1 = label_binarize(y_test, classes=[0, 1, 2, 3, 4])

auc_r2_rmse(y_train1, y_pred_train1, y_test1, y_pred1, "svm")

classifier_svm = OneVsRestClassifier(modelsvm.best_estimator_)
y_score = classifier_svm.fit(X_train, y_train1).decision_function(X_test)
y_score_train = classifier_svm.decision_function(X_train)
plot_roc_auc(y_score, y_test1, 'svm_auc_roc.png', 'SVM (test)')
plot_roc_auc(y_score_train, y_train1, 'svm_train_auc_roc.png', 'SVM (train)')

#############################################################################################
#############################################################################################

#############################################################################################
#############################################################################################
############# NN - 1 hidden layer ####################################################################
#############################################################################################
#############################################################################################
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
Ejemplo n.º 36
0
T, E = map(int, input().split(' '))
RawData = []
Labels = []
for i in range(T) :
    labels = list(map(int, input().split(' ')))  # materialize; a lazy map object would break MultiLabelBinarizer
    RawData.append(input())
    Labels.append(labels)

Queries = []
for i in range(E) :
    Queries.append(input())

RawData.extend(Queries)
X = CVectorizer.fit_transform(RawData)
Xtf = TfIdfVectorizer.fit_transform(X)
del X

MLB = MultiLabelBinarizer()
Yt = MLB.fit_transform(Labels)
XtfTrain = Xtf[0:T]
XtfTest = Xtf[T:]
Clf = OneVsRestClassifier(LinearSVC(loss='hinge', class_weight={1:100,0:1})).fit(XtfTrain, Yt)  # 'hinge' replaces the legacy loss='l1' spelling
Classes = list(MLB.classes_)

for xTest in XtfTest:
    y = Clf.decision_function(xTest)
    y1 = list(y[0])
    c1 = Classes
    lbls = [x for (y,x) in sorted(zip(y1,c1))][-10:]
    list.reverse(lbls)
    print (' '.join([str(i) for i in lbls]))
Ejemplo n.º 37
0
        print '\rFitting %d/%d ' % (i, TIMES),
        sys.stdout.flush()

        # resampling
        classifier = OneVsRestClassifier(
            svm.SVC(kernel=multichannel_wrapper(2, chi_square_kernel),
                    probability=True))
        X_train, X_test, y_train, y_test = train_test_split(x, y, tag)
        y_score = classifier.fit(X_train, y_train).decision_function(X_test)

        l.append(
            float((y_test.argmax(1) == y_score.argmax(1)).sum()) /
            y_score.shape[0] * 100)
    print map(lambda x: '%.3f%%' % x, l), '=', np.mean(l)

    y_score = classifier.decision_function(x)
    print 'Test all = %.3f%%' % (float(
        (y.argmax(1) == y_score.argmax(1)).sum()) / y_score.shape[0] * 100)

    if True:
        import matplotlib.pyplot as plt
        from sklearn.metrics import precision_recall_curve
        from sklearn.metrics import average_precision_score

        # Compute Precision-Recall and plot curve
        precision = dict()
        recall = dict()
        average_precision = dict()
        for i in range(n_classes):
            precision[i], recall[i], _ = precision_recall_curve(
                y[:, i], y_score[:, i])
Ejemplo n.º 38
0
    X1 = X_train.toarray()
    X2 = X_test.toarray()

#    X1 = X_train
#    X2 = X_test

#    clf = GaussianNB()
#   clf=SGDClassifier()
    clf=LinearSVC(random_state=0)
#    clf=RandomForestClassifier(n_estimators = 100)
  #  clf=MultinomialNB()


    classif = OneVsRestClassifier(clf).fit(X1, Y1)
    class_set=classif.classes_
    scores=classif.decision_function(X2)
    Y3 = []
    #    predict=classif.predict(X2)

    if len(scores.shape) == 1:
        indices = (scores > 0).astype(int)
    else:
        for score in scores:
            buf = []
            for i in range(len(class_set)):
                if score[i] > 0:
                    buf.append(class_set[i])
            if not buf:
                # no label scored positive: fall back to the single best class
                indices = np.argmax(score)
                buf.append(class_set[indices])
Ejemplo n.º 39
0
import pandas as pd
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)
plt.rcParams['axes.unicode_minus']= False

iris = load_iris()
# Compared to OneVsOneClassifier, this is faster but slightly less accurate.
model_ovr = OneVsRestClassifier(LogisticRegression(solver='lbfgs')).fit(iris.data, iris.target)
ax1 = plt.subplot(211)
pd.DataFrame(model_ovr.decision_function(iris.data)).plot(ax=ax1, legend=True)
plt.title("Decision function")
ax2 = plt.subplot(212)
pd.DataFrame(model_ovr.predict(iris.data), columns=["prediction"]).plot(marker='o', ls='', ax=ax2)
plt.title('Class prediction')
plt.tight_layout()
plt.show()
Ejemplo n.º 40
0
    if FOLD_CV:
        print "Performing 5-fold cv"
        scores = cv.cross_val_score(
            clf, X, y, cv=5, scoring="roc_auc"
        )
        print "%d-fold cv, average auRoc %f" % (len(scores), scores.mean())

    if PLOT_RESULTS:
        X_train, X_test, y_train, y_test = cv.train_test_split(
            X, y, test_size=0.3, random_state=0
        )
        clf.fit(X_train, y_train)

        print "Plotting results"
        y_scores = clf.decision_function(X_test)

        tname = "-".join(tissues)
        is_extra = brain_feats is not None or limb_feats is not None or heart_feats is not None

        plot_roc(
            y_test,
            y_scores,
            "ROC Tissue",
            out="figures/roc-curve-tis-%s%s.png" % (tname, is_extra)
        )
        # plot_precision_recall(y_true, y_scores)
        # plot_2d_results(X_test, y_test, clf.predict(X_test))
        print "Done plotting"

    end = time.clock()
Ejemplo n.º 41
0
class KOMD(BaseEstimator, ClassifierMixin):
    """KOMD.
    
    KOMD is a kernel method for classification and ranking.
    
    Read more in http://www.math.unipd.it/~dasan/papers/km-omd.icann08.pdf
    by F. Aiolli, G. Da San Martino, and A. Sperduti.
    
    For details on the precise mathematical formulation of the provided
    kernel functions and how `gamma`, `coef0` and `degree` affect each
    other, see the corresponding section in the narrative documentation:
    :ref:`svm_kernels`.

    Parameters
    ----------
    lam : float, (default=0.1)
        Specifies the lambda value, between 0.0 and 1.0.
    
    kernel : optional (default='rbf')
        Specifies the kernel function used by the algorithm.
        It must be one of 'linear', 'poly', 'rbf', 'precomputed', a callable
        or a Gram matrix. If none is given, 'rbf' will be used. If a callable
        is given, it is used to pre-compute the kernel matrix from data
        matrices; that matrix should be an array of shape ``(n_samples, n_samples)``.
    
    rbf_gamma : float, optional (default=0.1)
        Coefficient for 'rbf' and 'poly' kernels.
        Ignored by all other kernels.
    
    degree : float, optional (default=2.0)
        Specifies the degree of the 'poly' kernel.
        Ignored by all other kernels.
    
    coef0 : float, optional (default=0.0)
        Specifies the coef0 term of the 'poly' kernel.
        Ignored by all other kernels.
    
    max_iter : int, optional (default=100)
        Hard limit on the number of iterations within the solver; it cannot be negative.
    
    verbose : bool, (default=False)
        Enable verbose output during fit.
    
    multiclass_strategy : string, optional (default='ova')
        Specifies the strategy used in the multiclass case.
        'ova' for the one-vs-all pattern (also called one-vs-rest),
        'ovo' for the one-vs-one pattern.
        Any other string falls back to 'ova'.
    
    Attributes
    ----------
    gamma : array-like, shape = [n_samples]
        Probability-like vector that defines the distance vector
        over the two classes.
    
    classes_ : array-like, shape = [n_classes]
        Vector that contains all possible labels
    
    multiclass_ : boolean,
        True if the number of classes > 2
    
    Examples
    --------
    >>> import numpy as np
    >>> from ??.komd import KOMD
    >>> X = np.array([[1,2,i] for i in range(5)])
    >>> Y = np.array([1,1,1,-1,-1])
    >>> cls = KOMD()
    >>> cls = cls.fit(X,Y)
    >>> pred = cls.predict([[1,1,5]])
    
    References
    ----------
    `A Kernel Method for the Optimization of the Margin Distribution
    <http://www.math.unipd.it/~dasan/papers/km-omd.icann08.pdf>`__
    """
    
    def __init__(self, lam = 0.1, kernel = 'rbf', rbf_gamma = 0.1, degree = 2.0, coef0 = 0.0, max_iter = 100, verbose = False, multiclass_strategy = 'ova'):
        self.lam = lam
        self.gamma = None
        self.bias = None
        self.X = None
        self.Y = None
        self.is_fitted = False
        self.rbf_gamma = rbf_gamma
        self.degree = degree
        self.coef0 = coef0
        self.max_iter = max_iter
        self.verbose = verbose
        self.kernel = kernel
        self.multiclass_strategy = multiclass_strategy
        self.multiclass_ = None
        self.classes_ = None
        self._pairwise = self.kernel=='precomputed'

    def __kernel_definition__(self):
        """Select the kernel function
        
        Returns
        -------
        kernel : a callable relative to selected kernel
        """
        if hasattr(self.kernel, '__call__'):
            return self.kernel
        if self.kernel == 'rbf' or self.kernel is None:
            return lambda X,Y : rbf_kernel(X,Y,self.rbf_gamma)
        if self.kernel == 'poly':
            return lambda X,Y : polynomial_kernel(X, Y, degree=self.degree, gamma=self.rbf_gamma, coef0=self.coef0)
        if self.kernel == 'linear':
            return lambda X,Y : linear_kernel(X,Y)
        if self.kernel == 'precomputed':
            return lambda X,Y : X
    
    def fit(self, X, Y):
        
        """Fit the model according to the given training data
        
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Matrix of the examples, where
            n_samples is the number of samples and
            n_feature is the number of features
        
        Y : array-like, shape = [n_samples]
            array of the labels relative to X
        
        Returns
        -------
        self : object
            Returns self
        """
        X,Y = validation.check_X_y(X, Y, dtype=np.float64, order='C', accept_sparse='csr')
        #check_consistent_length(X,Y)
        check_classification_targets(Y)
        
        self.classes_ = np.unique(Y)
        if len(self.classes_) < 2:
            raise ValueError("The number of classes has to be almost 2; got ", len(self.classes_))
        
        if len(self.classes_) == 2:
            self.multiclass_ = False
            return self._fit(X,Y)
        else :
            self.multiclass_ = True
            if self.multiclass_strategy == 'ovo':
                return self._one_vs_one(X,Y)
            else :
                return self._one_vs_rest(X,Y)
        raise ValueError('This is a very bad exception...')
    
    def _one_vs_one(self,X,Y):
        self.cls = OneVsOneClassifier(KOMD(**self.get_params())).fit(X,Y)
        self.is_fitted = True
        return self
    
    def _one_vs_rest(self,X,Y):
        self.cls = OneVsRestClassifier(KOMD(**self.get_params())).fit(X,Y)
        self.is_fitted = True
        return self
        
    def _fit(self,X,Y):    
        self.X = X
        values = np.unique(Y)
        Y = [1 if l==values[1] else -1 for l in Y]
        self.Y = Y
        npos = len([1.0 for l in Y if l == 1])
        nneg = len([1.0 for l in Y if l == -1])
        gamma_unif = matrix([1.0/npos if l == 1 else 1.0/nneg for l in Y])
        YY = matrix(np.diag(list(matrix(Y))))

        Kf = self.__kernel_definition__()
        ker_matrix = matrix(Kf(X,X).astype(np.double))
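        # The QP below minimizes gamma' * ((1-lam)*Y*K*Y + lam*D) * gamma over
        # gamma >= 0, with the entries of gamma belonging to the positive class
        # and to the negative class each summing to one (rows of A, entries of b).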
        #KLL = (1.0 / (gamma_unif.T * YY * ker_matrix * YY * gamma_unif)[0])*(1.0-self.lam)*YY*ker_matrix*YY
        KLL = (1.0-self.lam)*YY*ker_matrix*YY
        LID = matrix(np.diag([self.lam * (npos * nneg / (npos+nneg))]*len(Y)))
        Q = 2*(KLL+LID)
        p = matrix([0.0]*len(Y))
        G = -matrix(np.diag([1.0]*len(Y)))
        h = matrix([0.0]*len(Y),(len(Y),1))
        A = matrix([[1.0 if lab==+1 else 0 for lab in Y],[1.0 if lab2==-1 else 0 for lab2 in Y]]).T
        b = matrix([[1.0],[1.0]],(2,1))
        
        solvers.options['show_progress'] = False#True
        solvers.options['maxiters'] = self.max_iter
        sol = solvers.qp(Q,p,G,h,A,b)
        self.gamma = sol['x']
        if self.verbose:
            print ('[KOMD]')
            print ('optimization finished, #iter = %d' % sol['iterations'])
            print ('status of the solution: %s' % sol['status'])
            print ('objval: %.5f' % sol['primal objective'])
            
        bias = 0.5 * self.gamma.T * ker_matrix * YY * self.gamma
        self.bias = bias
        self.is_fitted = True
        self.ker_matrix = ker_matrix
        return self
        
    def predict(self, X):
        """Perform classification on samples in X.
        
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Matrix containing new samples
        
        Returns
        -------
        y_pred : array, shape = [n_samples]
            The value of prediction for each sample
        """
        
        if self.is_fitted == False:
            raise NotFittedError("This KOMD instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.")
        X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C")
        if self.multiclass_ == True:
            return self.cls.predict(X)
        
        return np.array([self.classes_[1] if p >=0 else self.classes_[0] for p in self.decision_function(X)])

    def get_params(self, deep=True):
        # this estimator has parameters:
        return {"lam": self.lam, "kernel": self.kernel, "rbf_gamma":self.rbf_gamma,
                "degree":self.degree, "coef0":self.coef0, "max_iter":self.max_iter,
                "verbose":self.verbose, "multiclass_strategy":self.multiclass_strategy}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self,parameter,value)
        return self


    def decision_function(self, X):
        """Distance of the samples in X to the separating hyperplane.
        
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        
        Returns
        -------
        Z : array-like, shape = [n_samples, 1]
            Returns the decision function of the samples.
        """
        
        if self.is_fitted == False:
            raise NotFittedError("This KOMD instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.")
        X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C")
        
        if self.multiclass_ == True:
            return self.cls.decision_function(X)
        
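        # Binary score of a sample x: f(x) = sum_i Y_i * gamma_i * K(x, x_i) - bias,
        # i.e. a label-signed, gamma-weighted kernel expansion over the training set.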
        Kf = self.__kernel_definition__()
        YY = matrix(np.diag(list(matrix(self.Y))))
        ker_matrix = matrix(Kf(X,self.X).astype(np.double))
        z = ker_matrix*YY*self.gamma
        z = z-self.bias
        return np.array(list(z))
Ejemplo n.º 42
0
def run_experiment(dataset_name,
                   dataset_folder,
                   feature_extraction_method_name,
                   percentages,
                   trial_num,
                   thread_num,
                   feature_extraction_parameters,
                   classifier_parameters):
    if dataset_name == "snow2014":
        adjacency_matrix,\
        node_label_matrix,\
        labelled_node_indices,\
        number_of_categories = read_snow2014graph_data(dataset_folder)
    elif dataset_name == "flickr":
        adjacency_matrix,\
        node_label_matrix,\
        labelled_node_indices,\
        number_of_categories = read_asu_data(dataset_folder)
    elif dataset_name == "youtube":
        adjacency_matrix,\
        node_label_matrix,\
        labelled_node_indices,\
        number_of_categories = read_asu_data(dataset_folder)
    elif dataset_name == "politicsuk":
        adjacency_matrix,\
        node_label_matrix,\
        labelled_node_indices,\
        number_of_categories = read_insight_data(dataset_folder)
    else:
        print("Invalid dataset name.")
        raise RuntimeError
    print("Graphs and labels read.")

    feature_matrix,\
    feature_extraction_elapsed_time = feature_extraction(adjacency_matrix,
                                                         feature_extraction_method_name,
                                                         thread_num,
                                                         feature_extraction_parameters)
    print("Feature extraction elapsed time: ", feature_extraction_elapsed_time)
    if feature_extraction_parameters["community_weighting"] is None:
        pass
    elif feature_extraction_parameters["community_weighting"] == "chi2":
        feature_matrix = normalize_columns(feature_matrix)
    elif feature_extraction_parameters["community_weighting"] == "ivf":
        feature_matrix = normalize_columns(feature_matrix)
    else:
        raise RuntimeError("Invalid community weighting selection.")

    C = classifier_parameters["C"]
    fit_intercept = classifier_parameters["fit_intercept"]

    for p in np.arange(percentages.size):
        percentage = percentages[p]

        # Initialize the metric storage arrays to zero
        macro_F1 = np.zeros(trial_num, dtype=np.float64)
        micro_F1 = np.zeros(trial_num, dtype=np.float64)

        folds = generate_folds(node_label_matrix,
                               labelled_node_indices,
                               number_of_categories,
                               percentage,
                               trial_num)

        for trial in np.arange(trial_num):
            train, test = next(folds)
            ########################################################################################################
            # Separate train and test sets
            ########################################################################################################
            X_train, X_test, y_train, y_test = feature_matrix[train, :],\
                                               feature_matrix[test, :],\
                                               node_label_matrix[train, :],\
                                               node_label_matrix[test, :]

            if issparse(feature_matrix):
                if feature_extraction_parameters["community_weighting"] == "chi2":
                    contingency_matrix = chi2_contingency_matrix(X_train, y_train)
                    community_weights = peak_snr_weight_aggregation(contingency_matrix)

                    X_train, X_test = community_weighting(X_train, X_test, community_weights)
                else:
                    X_train = normalize(X_train, norm="l2")
                    X_test = normalize(X_test, norm="l2")

            ############################################################################################################
            # Train model
            ############################################################################################################
            # Train classifier.
            start_time = time.time()
            model = OneVsRestClassifier(svm.LinearSVC(C=C,
                                                      random_state=None,
                                                      dual=False,
                                                      fit_intercept=fit_intercept),
                                        n_jobs=thread_num)

            model.fit(X_train, y_train)
            hypothesis_training_time = time.time() - start_time
            print('Model fitting time: ', hypothesis_training_time)

            ############################################################################################################
            # Make predictions
            ############################################################################################################
            start_time = time.time()
            y_pred = model.decision_function(X_test)
            prediction_time = time.time() - start_time
            print('Prediction time: ', prediction_time)

            ############################################################################################################
            # Calculate measures
            ############################################################################################################
            y_pred = evaluation.form_node_label_prediction_matrix(y_pred, y_test)

            measures = evaluation.calculate_measures(y_pred, y_test)

            macro_F1[trial] = measures[4]
            micro_F1[trial] = measures[5]

            # print('Trial ', trial+1, ':')
            # print(' Macro-F1:        ', macro_F1[trial])
            # print(' Micro-F1:        ', micro_F1[trial])
            # print('\n')

        ################################################################################################################
        # Experiment results
        ################################################################################################################
        print(percentage)
        print('\n')
        print('Macro F1        average: ', np.mean(macro_F1))
        print('Micro F1        average: ', np.mean(micro_F1))
        print('Macro F1            std: ', np.std(macro_F1))
        print('Micro F1            std: ', np.std(micro_F1))
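
For reference, macro-F1 averages the per-label F1 scores while micro-F1 pools true/false positives and negatives across all labels; a small self-contained sketch with scikit-learn (the repository's own evaluation.calculate_measures is assumed to report the same two quantities at indices 4 and 5):

import numpy as np
from sklearn.metrics import f1_score

y_true = np.array([[1, 0, 1], [0, 1, 0]])
y_hat = np.array([[1, 0, 0], [0, 1, 0]])
print(f1_score(y_true, y_hat, average="macro"))  # mean of per-label F1 scores: ~0.67
print(f1_score(y_true, y_hat, average="micro"))  # F1 on pooled counts: 0.8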
Ejemplo n.º 43
0
    def _predict(self, train_index, test_index):
        """
        :param train_index: list with the index of the models data used in the algorithm of SVM.
        :param test_index:  list with the index to predict.
        :return:    tuple with (label_prediction, label_score, word_prediction, word_score, bin_predictions) of test_index made by: 
                        - predict Label (hiper/hipo/normal) with SVM (using all trainindex)
                        - predict word with KNN using only hiper/hipo/normal trainindex depending on the label predicted
        """
        # models to predict
        data_pred = self.loader.get_models(test_index,
                                           {'weight': self.weights})
        data_pred = data_pred.iloc[:, 1:-1]

        # Get models with words
        wdata = self.loader.get_models(train_index, {'weight': self.weights})

        # Get models hipo/hiper/norm labels
        ldata_labels = self.loader.get_labels_of_words(wdata.iloc[:,
                                                                  -1].tolist())

        # Predict label
        svm = OneVsRestClassifier(
            SVC(kernel=self._kernel, C=self._C, gamma=self._gamma))
        y = label_binarize(ldata_labels, classes=[-1, 0, 1])
        svm.fit(wdata.iloc[:, 1:-1], y)
        bin_labels = svm.predict(data_pred)
        score_label = svm.decision_function(data_pred)

        # Predict word using only vocabulary of label predicted
        res_word = []
        res_label = []
        score_word = []
        predicters = {}
        for i in range(len(bin_labels)):
            label = None
            for l in range(-1, 2):
                if bin_labels[i][l + 1] == 1:
                    label = l
                    break
            if label is None:
                maxscore = -1000
                for s in range(-1, 2):
                    score = score_label[i][s + 1]
                    if score >= maxscore:
                        label = s
                        maxscore = score
            res_label.append(label)
            # Work only with sessions of the label
            sessions = [
                wdata.iloc[z, 0] for z in range(len(wdata))
                if ldata_labels[z] == label
            ]

            models = wdata[wdata.id.isin(sessions)]
            models_labels = models.iloc[:, -1]
            models = models.iloc[:, 1:-1]

            if not predicters.get(str(label), False):
                knn = KNN(n_neighbors=5)
                knn.fit(models, models_labels)
                predicters[str(label)] = knn

            knn = predicters.get(str(label))
            res_w = knn.predict(data_pred)
            score_w = knn.predict_proba(data_pred)

            res_word.append(res_w)
            score_word.append(score_w)

        return (res_label, score_label, res_word, score_word, bin_labels)
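
The per-sample fallback loop above can also be written in vectorized form (an equivalent sketch, not the original code; indices are shifted by 1 because of the classes=[-1, 0, 1] binarization):

import numpy as np

fired = bin_labels.argmax(axis=1) - 1           # first positive column, if any
fallback = np.argmax(score_label, axis=1) - 1   # best-scoring label otherwise
res_label_vec = np.where(bin_labels.sum(axis=1) > 0, fired, fallback)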
Ejemplo n.º 44
0
# Use label_binarize to be multi-label like settings
Y = label_binarize(y, classes=[0, 1, 2])
n_classes = Y.shape[1]

# Split into training and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5,
                                                    random_state=random_state)

# We use OneVsRestClassifier for multi-label prediction
from sklearn.multiclass import OneVsRestClassifier

# Run classifier
classifier = OneVsRestClassifier(svm.LinearSVC(random_state=random_state))
classifier.fit(X_train, Y_train)
y_score = classifier.decision_function(X_test)


###############################################################################
# The average precision score in multi-label settings
# ....................................................
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# For each class
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(Y_test[:, i],
                                                        y_score[:, i])
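
The excerpt stops before average_precision is filled in; following the scikit-learn precision-recall example this code is based on, the per-class and micro-averaged scores would be completed along these lines (a sketch using the names already in scope):

for i in range(n_classes):
    average_precision[i] = average_precision_score(Y_test[:, i], y_score[:, i])

# A "micro-average": quantifying score on all classes jointly
precision["micro"], recall["micro"], _ = precision_recall_curve(
    Y_test.ravel(), y_score.ravel())
average_precision["micro"] = average_precision_score(Y_test, y_score,
                                                     average="micro")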
Ejemplo n.º 45
0
# Save the above data into an npz file as temporary storage, so we don't have
# to re-run the entire parsing over and over again.
outfile = TemporaryFile()
np.savez(outfile, X=X, Y=Y, X_t=X_t, Y_t=Y_t)
outfile.seek(0)

npzfile = np.load(outfile)
X = npzfile['X']
Y = npzfile['Y']
X_t = npzfile['X_t']
Y_t = npzfile['Y_t']

#model = OneVsRestClassifier(svm.SVC(kernel='linear',probability=True,random_state=0)).fit(X,Y)
model2 = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X,Y)
Y_score = model2.decision_function(X_t)
#Y_pred = model2.predict(X_t)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(Y_t[:, i], Y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
#plot ROC
plt.figure()
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
                                   ''.format(i, roc_auc[i]))
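
A sketch of how the plot is typically finished in the scikit-learn ROC example this follows (micro-average curve, chance line, axis labels; the names reuse those in scope above):

fpr["micro"], tpr["micro"], _ = roc_curve(Y_t.ravel(), Y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC (area = {0:0.2f})'.format(roc_auc["micro"]))
plt.plot([0, 1], [0, 1], 'k--')  # chance line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()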
Ejemplo n.º 46
0
y_val = np.array(encoded_labels_df_val)

# Define model
linsvm = LinearSVC(loss='hinge')
# options tried previously: multi_class='ovr', verbose=True, max_iter=1000
model = OneVsRestClassifier(linsvm, n_jobs=-1)

start = time.process_time()
model.fit(X_train, Y_train)
elapsed_fit = time.process_time() - start

print("Time to fit model (min):", elapsed_fit / 60)

start_predict = time.process_time()
### change
y_pred = model.decision_function(x_val)
elapsed_predict = time.process_time() - start_predict

print("Time to predict (min):", elapsed_predict / 60)

# Evaluate
### change
y_true = y_val
LRAP = label_ranking_average_precision_score(y_true, y_pred)

print("LRAP:", LRAP)

print(y_pred[0:3])
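
For intuition, label ranking average precision (LRAP) on a tiny example; the values are from the scikit-learn documentation:

import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

y_true = np.array([[1, 0, 0], [0, 0, 1]])
y_score = np.array([[0.75, 0.5, 1.0], [1.0, 0.2, 0.1]])
# sample 1: its true label ranks 2nd -> precision 1/2; sample 2: ranks 3rd -> 1/3
print(label_ranking_average_precision_score(y_true, y_score))  # ~0.416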
Ejemplo n.º 47
0
Y_train_bin = lb.fit_transform(test_features['Label'])
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
numSplits = sss.get_n_splits(X_train,Y_train_bin)
precisionList = dict()
recallList = dict()
APList = dict()
classifyReports = dict()
preRecFSupports = dict()
for j, (train_index, test_index) in enumerate(sss.split(X_train,Y_train_bin)):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = Y_train_bin[train_index], Y_train_bin[test_index]
    svc.fit(X_train_fold,y_train_fold)
    for num in range(3):
        print(f'Number of support vectors for class {num}: {svc.estimators_[num].n_support_}')
    y_predict = svc.predict(X_test_fold)
    y_score = svc.decision_function(X_test_fold)
    #print(y_test_fold)
    # average_precision_score does not support multiclass targets directly,
    # so it is computed per class (and micro-averaged) below.
    #average_precision = average_precision_score(y_test_fold, y_score)
    #print('Average precision-recall score: {0:0.2f}'.format(average_precision))
    # For each class
    precision = dict()
    recall = dict()
    average_precision = dict()
    # we can get U,D,S information from here.
    for i in range(3):
        precision[i], recall[i], _ = precision_recall_curve(y_test_fold[:, i],y_score[:, i])
        average_precision[i] = average_precision_score(y_test_fold[:, i], y_score[:, i])

    # A "micro-average": quantifying score on all classes jointly
    precision["micro"], recall["micro"], _ = precision_recall_curve(y_test_fold.ravel(),y_score.ravel())
Ejemplo n.º 48
0
    print("Training SVM")
    TIMES = 10
    l = []
    for i in range(TIMES):
        print '\rFitting %d/%d ' % (i, TIMES),
        sys.stdout.flush()

        # resampling
        classifier = OneVsRestClassifier(svm.SVC(kernel=multichannel_wrapper(2, chi_square_kernel), probability=True))
        # assumption: tag was meant as the stratification target; passing it
        # positionally would return six arrays and break the 4-way unpacking
        X_train, X_test, y_train, y_test = train_test_split(x, y, stratify=tag)
        y_score = classifier.fit(X_train, y_train).decision_function(X_test)

        l.append(float((y_test.argmax(1) == y_score.argmax(1)).sum())/y_score.shape[0]*100)
    print map(lambda x: '%.3f%%' % x, l), '=', np.mean(l)

    y_score = classifier.decision_function(x)
    print 'Test all = %.3f%%' % (float((y.argmax(1) == y_score.argmax(1)).sum())/y_score.shape[0]*100 )

    if True:
        import matplotlib.pyplot as plt
        from sklearn.metrics import precision_recall_curve
        from sklearn.metrics import average_precision_score

        # Compute Precision-Recall and plot curve
        precision = dict()
        recall = dict()
        average_precision = dict()
        for i in range(n_classes):
            precision[i], recall[i], _ = precision_recall_curve(y[:, i], y_score[:, i])
            average_precision[i] = average_precision_score(y[:, i], y_score[:, i])
Ejemplo n.º 49
0
    alpha_vector[
        0, support_index[0]:support_index[1]] = clf.estimators_[0].dual_coef_
    supports = clf.estimators_[0].support_vectors_

    # create the alpha vector and support vector list by iterating through the estimators
    for i in range(1, classes):
        alpha_vector[
            i,
            support_index[i]:support_index[i +
                                           1]] = clf.estimators_[i].dual_coef_
        supports = np.concatenate(
            (supports, clf.estimators_[i].support_vectors_))

    num_classifiers = classes

    # this is the raw votes, test for equality here
    decision1 = clf.decision_function(X_test)

# one vs one classification creates pairwise combinations of all classes as classifier
# we need to create a classifier map for these pairwise combinations
# we will also arrange the alphas accordingly, since the SVC function from sklearn is too optimized...
elif class_type == 'ovo':
    # extract parameters from the SVC function
    alphas = clf.dual_coef_
    supports = clf.support_vectors_
    intercept = clf.intercept_

    # create the indices for the alpha vector expansion
    support_index = np.concatenate(([0], np.cumsum(clf.n_support_)))

    # generate the combination maps for the alphas
    combo_map = np.zeros((classes, classes - 1, 2))
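
The classifier map the comments describe can be sketched with itertools.combinations: for k classes, scikit-learn's one-vs-one SVC trains k*(k-1)/2 binary estimators, ordered (0,1), (0,2), ..., (k-2,k-1):

from itertools import combinations

classes = 4
pairs = list(combinations(range(classes), 2))
print(pairs)       # [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
print(len(pairs))  # classes * (classes - 1) // 2 == 6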
Ejemplo n.º 50
0
def svm(i):
    train_x = pd.read_csv(
        f'./CV_Features_631/ClassificationFeatures/Train_CV_{i}.csv').iloc[:,
                                                                           9:]
    train_y = pd.read_csv(
        f'./CV_Features_631/ClassificationFeatures/Train_CV_{i}.csv').iloc[:,
                                                                           4]
    validation_x = pd.read_csv(
        f'./CV_Features_631/ClassificationFeatures/Validation_CV_{i}.csv'
    ).iloc[:, 9:]
    validation_y = pd.read_csv(
        f'./CV_Features_631/ClassificationFeatures/Validation_CV_{i}.csv'
    ).iloc[:, 4]
    test_x = pd.read_csv(
        f'./CV_Features_631/ClassificationFeatures/Test_CV_{i}.csv').iloc[:,
                                                                          9:]
    test_y = pd.read_csv(
        f'./CV_Features_631/ClassificationFeatures/Test_CV_{i}.csv').iloc[:, 4]

    encoder = LabelEncoder().fit(
        train_y)  # fit the LabelEncoder, mapping the classes in train_y to 0..5
    y = encoder.transform(train_y)
    y_train = pd.DataFrame(
        encoder.transform(train_y))  # encode the source data with the fitted LabelEncoder
    y_valid = pd.DataFrame(encoder.transform(validation_y))
    y_test = pd.DataFrame(encoder.transform(test_y))

    # flatten the label columns to 1-D
    y_train = y_train.iloc[:, 0].ravel()
    y_valid = y_valid.iloc[:, 0].ravel()
    y_test = y_test.iloc[:, 0].ravel()

    # standardize X (fit the scaler on the training split only, then reuse its statistics)
    scaler = StandardScaler()
    x_train_std = scaler.fit_transform(train_x)
    x_valid_std = scaler.transform(validation_x)
    x_test_std = scaler.transform(test_x)

    # ------------
    # Gamma
    # ------------
    accuracy_list_valid, f1_list_valid, auc_list_valid = [], [], []
    gamma_range = np.logspace(-10, 1, 10, base=2)
    logger.info(gamma_range)
    for idx, gamma in enumerate(tqdm(gamma_range)):
        # ------------
        # Training
        # ------------
        time0 = time()
        logger.info(
            f">>>>>>> CV = {i}/10, Start Training {idx + 1}/{len(gamma_range)}>>>>>>>"
        )
        print(
            f">>>>>>> CV = {i}/10, Start Training {idx + 1}/{len(gamma_range)}>>>>>>>"
        )
        clf = OneVsRestClassifier(
            SVC(
                kernel='rbf',  #
                gamma=gamma,
                C=1,  # default
                degree=1,
                cache_size=5000,
                probability=True,
                class_weight='balanced'))
        clf.fit(x_train_std, y_train)
        # ------------
        # Validation: Fine-tuning on Validation dataset
        # ------------
        y_prediction_valid = clf.predict(x_valid_std)
        accuracy_valid = accuracy_score(y_valid, y_prediction_valid)
        accuracy_list_valid.append(accuracy_valid)
        f1_valid = f1_score(y_valid, y_prediction_valid, average="weighted")
        f1_list_valid.append(f1_valid)
        y_binary_valid = label_binarize(y_valid, classes=list(range(6)))
        result_valid = clf.decision_function(x_valid_std)
        auc_valid = roc_auc_score(y_binary_valid,
                                  result_valid,
                                  average='micro')
        auc_list_valid.append(auc_valid)
        # Logger
        logger.info(
            f"Validation Gamma >>> Acc. = {accuracy_valid}, F1-Score = {f1_valid}, AUC = {auc_valid}"
        )
        print(
            f"Validation Gamma >>> Acc. = {accuracy_valid}, F1-Score = {f1_valid}, AUC = {auc_valid}"
        )
        print(
            datetime.datetime.fromtimestamp(time() -
                                            time0).strftime("%M:%S:%f"))

    best_gamma = gamma_range[accuracy_list_valid.index(
        max(accuracy_list_valid))]
    best_acc = max(accuracy_list_valid)
    best_f1 = f1_list_valid[accuracy_list_valid.index(
        max(accuracy_list_valid))]
    best_auc = auc_list_valid[accuracy_list_valid.index(
        max(accuracy_list_valid))]
    print(
        f"Validation >>> Best gamma = {best_gamma}, Acc. ={best_acc}, F1-Score = {best_f1}, AUC = {best_auc}\n"
    )
    logger.info(
        f"Validation >>> Best gamma = {best_gamma}, Acc. ={best_acc}, F1-Score = {best_f1}, AUC = {best_auc}"
    )

    # ------------
    # C
    # ------------
    best_gamma = gamma_range[accuracy_list_valid.index(
        max(accuracy_list_valid))]
    C = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
    accuracy_list_C_valid, f1_list_C_valid, auc_list_C_valid = [], [], []
    for idx, c in enumerate(tqdm(C)):
        time0 = time()
        logger.info(
            f">>>>>>> CV = {i}/10, Fine-Tuning C, Start Training {idx + 1}/{len(C)}>>>>>>>"
        )
        print(
            f">>>>>>> CV = {i}/10, Fine-Tuning C, Start Training {idx + 1}/{len(C)}>>>>>>>"
        )
        clf = OneVsRestClassifier(
            SVC(
                kernel='rbf',  #
                gamma=best_gamma,
                C=c,  # default
                degree=1,
                cache_size=5000,
                probability=True,
                class_weight='balanced'))
        clf.fit(x_train_std, y_train)
        # ------------
        # Validation: Fine-tuning on Validation dataset
        # ------------
        y_prediction_valid = clf.predict(x_valid_std)
        accuracy_valid = accuracy_score(y_valid, y_prediction_valid)
        accuracy_list_C_valid.append(accuracy_valid)
        f1_valid = f1_score(y_valid, y_prediction_valid, average="weighted")
        y_binary_valid = label_binarize(y_valid, classes=list(range(6)))
        result_valid = clf.decision_function(x_valid_std)
        auc_valid = roc_auc_score(y_binary_valid,
                                  result_valid,
                                  average='micro')
        auc_list_C_valid.append(auc_valid)
        # Logger
        logger.info(
            f"Validation C >>> Acc. = {accuracy_valid}, F1-Score = {f1_valid}, AUC = {auc_valid}"
        )
        print(
            f"Validation C >>> Acc. = {accuracy_valid}, F1-Score = {f1_valid}, AUC = {auc_valid}"
        )
        print(
            datetime.datetime.fromtimestamp(time() -
                                            time0).strftime("%M:%S:%f"))
    best_c = C[accuracy_list_C_valid.index(max(accuracy_list_C_valid))]

    # logger
    best_acc = max(accuracy_list_C_valid)
    best_f1 = f1_list_C_valid[accuracy_list_C_valid.index(
        max(accuracy_list_C_valid))]
    best_auc = auc_list_C_valid[accuracy_list_C_valid.index(
        max(accuracy_list_C_valid))]
    print(
        f"Validation >>> Best gamma = {best_gamma}, Best C = {best_c}, Acc. ={best_acc}, F1-Score = {best_f1}, AUC = {best_auc}\n"
    )
    logger.info(
        f"Validation >>> Best gamma = {best_gamma}, Best C = {best_c}, Acc. ={best_acc}, F1-Score = {best_f1}, AUC = {best_auc}"
    )

    # ------------
    # Test: Test on Test dataset with best gamma
    # ------------
    clf_best_test = OneVsRestClassifier(
        SVC(
            kernel='rbf',  #
            gamma=best_gamma,
            C=best_c,  # default
            degree=1,
            cache_size=5000,
            probability=True,
            class_weight='balanced'))
    clf_best_test.fit(x_train_std, y_train)
    # accuracy & F1 & AUC on Test dataset
    y_test_prediction = clf_best_test.predict(x_test_std)
    test_accuracy = round(accuracy_score(y_test, y_test_prediction), 4)
    test_f1 = round(f1_score(y_test, y_test_prediction, average="weighted"), 4)
    y_test_binary = label_binarize(y_test,
                                   classes=list(range(6)))  # convert to one-hot
    result_test = clf_best_test.decision_function(x_test_std)
    test_auc = round(
        roc_auc_score(y_test_binary, result_test, average='micro'), 4)
    print(
        f"CV = {i}, Test >>> gamma = {best_gamma}, Acc. ={test_accuracy}, F1-Score = {test_f1}, AUC = {test_auc}"
    )
    logger.info(
        f"CV = {i}, Test >>> gamma = {best_gamma}, Acc. ={test_accuracy}, F1-Score = {test_f1}, AUC = {test_auc}"
    )

    # save
    result_test = clf_best_test.predict_proba(x_test_std)
    df = pd.DataFrame(result_test)
    df.to_csv(
        f"./Prediction_202106_Ratio631/categorical_vggish_6pnn_20210621_prediction_CV{i}_Gamma_{round(best_gamma,4)}_C_{round(best_c)}_ACC_{test_accuracy}_F1_{test_f1}_AUC_{test_auc}.csv"
    )
    df2 = pd.DataFrame(y_test)
    df2.to_csv(
        f"./Prediction_202106_Ratio631/categorical_vggish_6pnn_20210324_GT_CV{i}.csv"
    )
    print(f">>>>>>> CV = {i}/10, Over Training >>>>>>>\n")
    logger.info(f">>>>>>> CV = {i}/10,Over Training >>>>>>>")
    return [test_accuracy, test_f1, test_auc]
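
The two-stage search above (gamma first, then C at the best gamma) is a greedy approximation; a joint search over both parameters with scikit-learn's GridSearchCV is a common alternative. A sketch, assuming the same standardized training data and encoded labels as above:

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

param_grid = {
    "estimator__gamma": np.logspace(-10, 1, 10, base=2),
    "estimator__C": [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
}
search = GridSearchCV(
    OneVsRestClassifier(SVC(kernel="rbf", class_weight="balanced")),
    param_grid, scoring="accuracy", cv=3, n_jobs=-1)
search.fit(x_train_std, y_train)
print(search.best_params_)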
Ejemplo n.º 51
0
def runsvm(set):
    print("DataSet: " + set)
    print("parameters for SVM: " + json.dumps(request.json))

    _kernel = request.json['kernel']
    _gamma = request.json['gamma']
    _C = request.json['penalty']
    _degree = request.json['degree']

    #_kernel = request.form['kernel']
    #_C	= request.form['c']
    #_gamma = request.form['gamma']
    nClasses = 10
    trainData = pd.read_csv(os.path.join(root_dir(), 'data/mnist_train.csv'),
                            sep=',',
                            header=None)
    trainLabel = pd.read_csv(os.path.join(root_dir(),
                                          'data/mnist_train_label.csv'),
                             sep=',',
                             header=None)
    trainLabelBinary = label_binarize(trainLabel,
                                      classes=np.array(range(nClasses)))
    testData = pd.read_csv(os.path.join(root_dir(), 'data/mnist_test.csv'),
                           sep=',',
                           header=None)
    testLabel = pd.read_csv(os.path.join(root_dir(),
                                         'data/mnist_test_label.csv'),
                            sep=',',
                            header=None)
    testLabelBinary = label_binarize(testLabel,
                                     classes=np.array(range(nClasses)))
    #random_state = np.random.RandomState(0)
    clt = svm.SVC(kernel=_kernel, C=_C, degree=_degree)
    classifier = OneVsRestClassifier(clt)
    clt.fit(trainData, trainLabel.values.ravel())  # flatten the single-column DataFrame to a 1-D label vector
    classifier.fit(trainData, trainLabelBinary)
    precision = dict()
    recall = dict()
    precisionJson = ""
    recallJson = ""
    if set == "test":
        pred = clt.predict(testData)
        f1Score = f1_score(testLabel, pred, average=None)
        cm = confusion_matrix(testLabel, pred)
        accuracy = accuracy_score(testLabel, pred)
        precisionScore = precision_score(testLabel, pred, average=None)
        recallScore = recall_score(testLabel, pred, average=None)
        #hingeLoss = hinge_loss(testLabel, pred)
        dec = classifier.decision_function(testData)
        for i in range(nClasses):
            precision[i], recall[i], _ = precision_recall_curve(
                testLabelBinary[:, i], dec[:, i])
            precisionJson += json.dumps(np.round(precision[i], 3).tolist())
            recallJson += json.dumps(np.round(recall[i], 3).tolist())
            if i != nClasses - 1:
                precisionJson += ", "
                recallJson += ", "
    else:
        pred = clt.predict(trainData)
        f1Score = f1_score(trainLabel, pred, average=None)
        cm = confusion_matrix(trainLabel, pred)
        accuracy = accuracy_score(trainLabel, pred)
        precisionScore = precision_score(trainLabel, pred, average=None)
        recallScore = recall_score(trainLabel, pred, average=None)
        #hingeLoss = hinge_loss(trainLabel, pred)
        dec = classifier.decision_function(trainData)
        for i in range(nClasses):
            precision[i], recall[i], _ = precision_recall_curve(
                trainLabelBinary[:, i], dec[:, i])
            precisionJson += json.dumps(np.round(precision[i], 3).tolist())
            recallJson += json.dumps(np.round(recall[i], 3).tolist())
            if i != nClasses - 1:
                precisionJson += ", "
                recallJson += ", "

    return "{\"result\": " + json.dumps(pred.tolist()) + ",\"f1_score\": " + json.dumps(np.round(f1Score,3).tolist()) \
     + ",\"confusion\": " + json.dumps(cm.tolist()) + ",\"accuracy_score\": " + json.dumps(np.round(accuracy,3).tolist()) \
     + ", \"precision_score\": " + json.dumps(np.round(precisionScore,3).tolist()) \
     + ", \"recall_score\": " + json.dumps(np.round(recallScore,3).tolist()) \
     + ", \"precision_curve\": ["+ precisionJson + "],\"recall_curve\": [" + recallJson \
     + "]" + "}"
Ejemplo n.º 52
0

n_estimators = 3

print("Developing SVM models....")
model3 = OneVsRestClassifier(
    BaggingClassifier(LinearSVC(class_weight='balanced', max_iter=100000),
                      max_samples=1.0 / n_estimators,
                      n_estimators=n_estimators))
print("Fitting SVM models....")
model3.fit(X_train, y_train)
dump(model3, "svm_model.joblib")
print("SVM - Saved!")
print()

# predict probabilities
nb_probs = model1.predict_proba(X_test)
rf_probs = model2.predict_proba(X_test)
svm_probs = model3.decision_function(X_test)

# keep probabilities for the positive outcome only
nb_probs = nb_probs[:, 1]
rf_probs = rf_probs[:, 1]


# calculate scores
ns_auc = roc_auc_score(y_test, ns_probs)
nb_auc = roc_auc_score(y_test, nb_probs)
rf_auc = roc_auc_score(y_test, rf_probs)
svm_auc = roc_auc_score(y_test, svm_probs)


print("NB - Accuracy: %f" % accuracy_score(y_test, model1.predict(X_test))) 
print("NB - AUC score: %f" % nb_auc)
Ejemplo n.º 53
0
    def getTrainAndTest(self):
        #df = pd.read_csv('H:\pc programming\Django(Prac)\ML\Classification\Classification\Review_Testing_Format.txt')
        df = pd.read_csv('Review_Testing_Format.txt')
        df.replace('?', -99999, inplace=True)
        df.drop(['id'], axis=1, inplace=True)

        X = np.array(df.drop(['class'], axis=1))
        y = np.array(df['class'])

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.10)

        #  Built-In Decision Tree
        self.clf_DTree = tree.DecisionTreeClassifier()
        self.clf_DTree.fit(X_train, y_train)
        accuracy = self.clf_DTree.score(X_test, y_test)
        print("Accuracy in Decision Tree: %s" % accuracy)

        #  Built-In K-Nearest Neighbour
        # (was a copy-pasted DecisionTreeClassifier; assumes scikit-learn's neighbors module is available)
        from sklearn.neighbors import KNeighborsClassifier
        self.clf_KNN = KNeighborsClassifier()
        self.clf_KNN.fit(X_train, y_train)
        accuracy = self.clf_KNN.score(X_test, y_test)
        print("Accuracy in KNN: %s" % accuracy)

        #  Built-In Support Vector Machine
        # (was a copy-pasted DecisionTreeClassifier)
        self.clf_SVM = svm.SVC()
        self.clf_SVM.fit(X_train, y_train)
        accuracy = self.clf_SVM.score(X_test, y_test)
        print("Accuracy in SVM: %s" % accuracy)

        Y = label_binarize(y, classes=['A', 'B', 'C'])
        n_classes = Y.shape[1]

        X_train, X_test, Y_train, Y_test = train_test_split(
            X,
            Y,
            test_size=.5,
        )
        classifier = OneVsRestClassifier(svm.LinearSVC(random_state=None))
        classifier.fit(X_train, Y_train)
        y_score = classifier.decision_function(X_test)
        # For each class
        precision = dict()
        recall = dict()
        average_precision = dict()
        '''
        for i in range(n_classes):
            average_precision[i] = average_precision_score(Y_test[:, i], y_score[:, i])

        average_precision["micro"] = average_precision_score(Y_test, y_score,
                                                             average="micro")
        average_precision["macro"] = average_precision_score(Y_test, y_score,
                                                             average="macro")
        average_precision["weighted"] = average_precision_score(Y_test, y_score,
                                                             average="weighted")
        print('Average precision score, micro-averaged over all classes: {0:0.2f}'
              .format(average_precision["micro"]))

        recall["micro"] = recall_score(Y_test, y_score,average="micro")
        print('Recall score, micro over all classes: {0:0.2f}'
              .format(recall["micro"]))
              '''
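
One note on the commented-out block above: recall_score expects discrete label predictions, not continuous decision scores, so it would need something like the following sketch (assuming recall_score is imported as in the commented code) before the micro-averaged recall can be computed:

Y_pred = classifier.predict(X_test)
recall["micro"] = recall_score(Y_test, Y_pred, average="micro")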
Ejemplo n.º 54
0
    '''
    # TEST_FEATURES = numpy.array(TEST_FEATURES)
    TEST_FEATURES = TRAIN_FEATURES

    TRAIN_LABELS = numpy.array(TRAIN_LABELS)
    # TEST_LABELS = numpy.array(TEST_LABELS)
    TEST_LABELS = TRAIN_LABELS

    TRAIN_ATTRIBUTE = numpy.array(TRAIN_ATTRIBUTE)
    # TEST_ATTRIBUTE = numpy.array(TEST_ATTRIBUTE)
    TEST_ATTRIBUTE = TRAIN_ATTRIBUTE

    pc=0
    nc=0
    classifier = OneVsRestClassifier(LinearSVC(C=2.0,random_state=0))
    classifier.fit(TRAIN_FEATURES,TRAIN_ATTRIBUTE)
    decision = classifier.decision_function(TEST_FEATURES)
    prediction = classifier.predict(TEST_FEATURES)
    for i in range(0,len(TEST_ATTRIBUTE)):
        for j in range(22):
            if prediction[i][j]==TEST_ATTRIBUTE[i][j]:
                pc+=1
            else:
                nc+=1
        # print prediction[i],TEST_ATTRIBUTE[i],TEST_LABELS[i], decision[i]
    print pc,nc
    print classifier.score(TEST_FEATURES,TEST_ATTRIBUTE)

    TRAIN_LABELS = []
    TRAIN_FEATURES = []
    TRAIN_ATTRIBUTE = []
Ejemplo n.º 55
0
    def oneVsAll(self, clf, idindiv=0, nbrot=5, test_size=0.33):
        data = self.countMat[idindiv * self.kmByIndiv:(idindiv + 1) *
                             self.kmByIndiv, :].T
        data = normalize(data, axis=1, copy=False)

        from sklearn.preprocessing import label_binarize
        Y = label_binarize(self.classname, classes=np.unique(self.classname))
        uniqClasname = np.unique(self.classname)
        n_classes = Y.shape[1]

        result = np.array([])
        for rot in range(nbrot):
            X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
                data, Y, test_size=test_size)  # ,random_state=seed)
            classifier = OneVsRestClassifier(clf)
            classifier.fit(X_train, Y_train)
            y_score = classifier.decision_function(X_test)

            # For each class
            precision = dict()
            recall = dict()
            average_precision = dict()

            for i in range(n_classes):
                if n_classes > 1:
                    tmpscore = y_score[:, i]
                else:
                    tmpscore = y_score
                precision[i], recall[i], _ = precision_recall_curve(
                    Y_test[:, i], tmpscore)
                average_precision[i] = average_precision_score(
                    Y_test[:, i], tmpscore)

            # A "micro-average": quantifying score on all classes jointly
            precision["micro"], recall["micro"], _ = precision_recall_curve(
                Y_test.ravel(), y_score.ravel())
            average_precision["micro"] = average_precision_score(
                Y_test, y_score, average="micro")
            print(
                'Average precision score, micro-averaged over all classes: {0:0.2f}'
                .format(average_precision["micro"]))
            result = np.append(result, average_precision["micro"])

            plt.figure()
            plt.step(recall['micro'],
                     precision['micro'],
                     color='b',
                     alpha=0.2,
                     where='post')
            #plt.fill_between(recall["micro"], precision["micro"], alpha=0.2, color='b', **step_kwargs)

            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.ylim([0.0, 1.05])
            plt.xlim([0.0, 1.0])
            plt.title(
                'Average precision score, micro-averaged over all classes: AP={0:0.2f}'
                .format(average_precision["micro"]))
            plt.savefig(
                os.path.join(self.savepath, "average_precision_score.png"))
            plt.close()

            # Plot the Precision-Recall curve for each class, and iso-f1 curves
            from itertools import cycle
            # setup plot details
            colors = cycle(
                ['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'])

            plt.figure(figsize=(7, 8))
            f_scores = np.linspace(0.2, 0.8, num=4)
            lines = []
            labels = []
            for f_score in f_scores:
                x = np.linspace(0.01, 1)
                y = f_score * x / (2 * x - f_score)
                l, = plt.plot(x[y >= 0], y[y >= 0], color='gray', alpha=0.2)
                plt.annotate('f1={0:0.1f}'.format(f_score),
                             xy=(0.9, y[45] + 0.02))

            lines.append(l)
            labels.append('iso-f1 curves')
            l, = plt.plot(recall["micro"],
                          precision["micro"],
                          color='gold',
                          lw=2)
            lines.append(l)
            labels.append('micro-average Precision-recall (area = {0:0.2f})'
                          ''.format(average_precision["micro"]))

            for i, color in zip(range(n_classes), colors):
                l, = plt.plot(recall[i], precision[i], color=color, lw=2)
                lines.append(l)
                labels.append(
                    'Precision-recall for class {0} (area = {1:0.2f})'
                    ''.format(uniqClasname[i], average_precision[i]))

            fig = plt.gcf()
            fig.subplots_adjust(bottom=0.25)
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.title('Extension of Precision-Recall curve to multi-class')
            plt.legend(lines, labels, loc=(0, -.38), prop=dict(size=14))
            plt.savefig(
                os.path.join(self.savepath, "Precision-Recall_curve.png"))
            plt.close()

        print(
            "oneVsAll cross-validation average precision score, micro-averaged over all classes:"
        )
        # print(result)
        print("mean = ", np.mean(result), ", std = ", np.std(result))