Ejemplo n.º 1
0
def comparativeSpreadsheet():
    """Train a one-vs-rest LinearSVC per genre and build a comparison sheet.

    For every genre a tf-idf + OneVsRest(LinearSVC) pipeline is fit on a
    train split, per-genre test accuracy is printed, and predictions over
    the whole corpus are collected.  Finally a DataFrame pairing each book's
    human-assigned genres with the AI-assigned genres is built and returned.

    Relies on module-level globals assumed defined elsewhere in the file:
    df, genres, stop_words, Human_Assigned_Genre — TODO confirm.
    """
    genrePredictions = []

    # 'p' prefix = this column was predicted by the ML model
    listOfGenres = [
        'pAlphabet',
        'pAnimals_and_Nature',
        'pArt_and_Music',
        'pBiographies',
        'pFairy_and_Folk_Tales',
        'pFiction',
        'pFoods',
        'pHealth',
        'pHistory',
        'pHolidays',
        'pMath_and_Science',
        'pNursery_Rhymes',
        'pPeople_and_Places',
        'pPoetry',
        'pRecreation_and_Leisure',
        'pSports'
    ]

    train, test = train_test_split(df, test_size=0.33, shuffle=True)
    X_train = train.Book_Text
    X_test = test.Book_Text

    totalAcc = 0

    SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])
    for genre in genres:
        print('... Processing {}'.format(genre))
        # train the model using X_dtm & y
        SVC_pipeline.fit(X_train, train[genre])
        # compute the testing accuracy once and reuse it
        # (original computed accuracy_score twice per genre)
        prediction = SVC_pipeline.predict(X_test)
        acc = accuracy_score(test[genre], prediction)
        print('Test accuracy is {}'.format(acc))

        # keep full-corpus predictions for the comparison sheet
        genrePredictions.append(SVC_pipeline.predict(df.Book_Text))
        totalAcc += acc

    # average over however many genres were processed (was hard-coded to 16)
    if len(genres):
        totalAcc = totalAcc / len(genres)
        print("\n")
        print('Overall average test accuracy is {}'.format(totalAcc))

    Book_Text = df.Book_Text
    # one (initially empty) genre list per book
    AI_Assigned_Genre = [[] for _ in range(len(Book_Text))]

    for genPointer, row in enumerate(genrePredictions):
        print(listOfGenres[genPointer])
        for i, category in enumerate(row):
            if category == 1:
                AI_Assigned_Genre[i].append(listOfGenres[genPointer])

    # build the comparison sheet once, after all predictions are collected
    # (original rebuilt the DataFrame on every loop iteration)
    answers = pd.DataFrame(
            {'Book_Text': Book_Text,
             'Human_Assigned_Genre': Human_Assigned_Genre,
             'AI_Assigned_Genre': AI_Assigned_Genre
             })
    return answers
Ejemplo n.º 2
0
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
# NOTE(review): `grid_search` was removed from sklearn long ago
# (use model_selection); `cPickle` is Python 2 only.
from sklearn import decomposition, pipeline, metrics, grid_search
import cPickle as pickle
import scipy.sparse
from sklearn.multiclass import OneVsRestClassifier

# Load the pre-built sparse training feature matrix from disk.
X=[]
with open('train_newFeat_sparse_mat.dat', 'rb') as infile:
	X = pickle.load(infile)

# Labels come from a headerless CSV; only the first column is used.
# NOTE(review): `pd` is used here but pandas is not imported above — confirm
# it is imported elsewhere in the original script.
y = pd.read_csv('labels.csv',index_col=False,header=None)
y = np.array(y).astype('int')
y_temp = y[:,0]
y=y_temp

# Linear-kernel SVC wrapped one-vs-rest; probability=True enables
# predict_proba at the cost of slower training.
model = OneVsRestClassifier(SVC(C=100.0, gamma = 0.1, probability=True, verbose=1,kernel='linear'),n_jobs=-1)

model.fit(X, y)


# Load the test feature matrix (loaded but not used in this fragment).
X_test=[]
with open('test_newFeat_sparse_mat.dat', 'rb') as infile:
        X_test = pickle.load(infile)

# Persist the fitted model.  NOTE(review): sklearn.externals.joblib is
# deprecated; modern code imports joblib directly.
from sklearn.externals import joblib
joblib.dump(model, 'svmModelUbuntu.pkl') 
Ejemplo n.º 3
0
## Cross Validation

# Single stratified shuffle split holding out 10% for testing.
# NOTE(review): this labels-first call signature is the old (pre-0.18)
# sklearn.cross_validation API — confirm which sklearn version is in use.
cross_val = StratifiedShuffleSplit(churnTarget,
                                   1,
                                   test_size=0.1,
                                   random_state=seed)

# (display name, estimator) pairs to be compared under identical splits.
classifiers = [
    ('Decision Tree Classifier', DecisionTreeClassifier()),
    ('Navie Bayes', GaussianNB()),
    ('Scochastic Gradient Descent',
     SGDClassifier(loss='modified_huber', shuffle=True)),
    ('Support Vector Classifier', SVC(kernel="linear", C=0.025)),
    ('K Nearest Neighbors', KNeighborsClassifier()),
    ('One Vs Rest Classifier', OneVsRestClassifier(LinearSVC())),
    ('Random Forest Classifier',
     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=10)),
    ('Ada Boost Classifier', AdaBoostClassifier()),
]

# Fit and score each classifier on every CV fold; `score` accumulates the
# per-fold accuracies (with a single split it is just one fold's accuracy).
for clf in classifiers:
    score = 0
    for trainIndex, testIndex in cross_val:
        trainDataCV, testDataCV = churnMatrix[trainIndex], churnMatrix[
            testIndex]
        trainLabelCV, testLabelCV = churnTarget[trainIndex], churnTarget[
            testIndex]
        clf[1].fit(trainDataCV, trainLabelCV)
        score = score + clf[1].score(testDataCV, testLabelCV)
    print('\n', clf[0], ' after Cross Validation :', score)
Ejemplo n.º 4
0
                # Cache the manifold and its sample indices for this
                # (dropout, reg, epoch) combination.
                manifolds[dropout][reg][epoch] = (manifold, indices)

                # Get training data for feed dict
                x_data_train, y_data_train = \
                    cifar10.train.images[:train_size_lm], cifar10.train.labels[:train_size_lm]

                # Get train set features at flatten, fc1 and fc2 layers
                # NOTE(review): `fc2` here is a bare name while the other two
                # are model attributes — confirm it refers to model.fc2.
                flatten_features_train, fc1_features_train, fc2_features_train = \
                    sess.run([model.flatten, model.fc1, fc2],
                             {x : x_data_train, y : y_data_train, is_training : False})

                # (layer name, train features, test features) triples to probe.
                features_list = [
                    ['flat', flatten_features_train, flatten_features_test],
                    ['fc1', fc1_features_train, fc1_features_test],
                    ['fc2', fc2_features_train, fc2_features_test]
                ]
                print('dropout:', dropout, 'reg:', reg)
                now = time.time()
                # Linear-probe each feature layer with a one-vs-rest SVM.
                for (name, features_train, features_test) in features_list:
                    classif = OneVsRestClassifier(SVC(kernel='linear'))
                    classif.fit(features_train, y_data_train)
                    lm_test_predictions = classif.predict(features_test)
                    # NOTE(review): argmax over axis 1 assumes both labels and
                    # predictions are 2-D one-hot arrays — confirm, since
                    # predict() on 1-D integer labels would return a 1-D array.
                    acc = np.mean(
                        np.equal(np.argmax(y_data_test, 1),
                                 np.argmax(lm_test_predictions, 1)))
                    print(name, 'accuracy =', np.round(acc * 100, 2), '%')
                    results_df.loc[(epoch, dropout, '%.e' % reg)][name] = acc

                # Checkpoint all results after each configuration.
                print(results_df)
                cPickle.dump((manifolds, results_df),
                             open('manifoldses.dump', 'wb'))
Ejemplo n.º 5
0
def separate_data(events, total):
    """Split `events` into train/test halves, fit a multi-label text
    classifier, print predictions vs. actual labels, and return the fitted
    classifier plus the label binarizer.

    Python 2 code (print statements, integer division in `total / 2`).
    Each event contributes one text item built from its title, description,
    tags and performers, joined with '.'.
    """
    x_train = []
    y_train = []
    cats_seen = []

    # First half of the events becomes the training set.
    ll = total / 2
    for e in events[:ll]:
        item = e.title + '.' + e.description
        for t in e.tags:
            item += '.' + t
        for p in e.performers:
            item += '.' + p
        x_train.append(item)
        y_train.append(list(e.categories))
        cats_seen += list(e.categories)

    print cats_seen

    print x_train
    print y_train

    X_train = np.array(x_train)
    print X_train

    # Encode category lists as a binary indicator matrix.
    lb = MultiLabelBinarizer()
    Y = lb.fit_transform(y_train)
    print Y

    # Second half is the test set; events whose categories were never seen
    # during training are dropped so `lb.transform` cannot fail.
    x_test = []
    y_test = []
    for e in events[ll:total]:
        if all(cat in cats_seen for cat in e.categories):
            item = e.title + '.' + e.description
            for t in e.tags:
                item += '.' + t
            for p in e.performers:
                item += '.' + p
            x_test.append(item)
            y_test.append(list(e.categories))

    print len(x_test)
    X_test = np.array(x_test)

    # 1-3 gram counts -> tf-idf -> one-vs-rest linear SVM.
    classifier = Pipeline([('vectorizer',
                            CountVectorizer(ngram_range=(1, 3),
                                            min_df=1,
                                            stop_words='english',
                                            strip_accents='unicode')),
                           ('tfidf',
                            TfidfTransformer(norm='l2', sublinear_tf=True)),
                           ('clf', OneVsRestClassifier(LinearSVC()))])

    classifier.fit(X_train, Y)
    predicted = classifier.predict(X_test)

    # Decode the indicator matrix back to category tuples for display.
    all_labels = lb.inverse_transform(predicted)

    for item, labels, true_labels in zip(X_test, all_labels, y_test):
        print item.partition('.')[0]
        print 'predicted:', labels, '|  actual:', true_labels

    Y_test = lb.transform(y_test)
    print '\n'
    print 'Accuracy:', classifier.score(X_test, Y_test) * 100, '%'
    print '\n'
    return classifier, lb
},
             inplace=True)
# Join the feature matrix with the label table on photo id; the last 4096
# columns are the image features, columns '00'..'88' are the labels.
alles = pd.merge(newXX, all_data, on='photo_id')
x_data = alles.iloc[:, -4096:]
y_data = alles.loc[:, '00':'88']
x_data.to_csv('x_data_PerImage.csv', index=True)
y_data.to_csv('y_data_PerImage.csv', index=True)

# Load the x_data and y_data
# NOTE(review): x_dat/y_dat are loaded but the classifier below is fit on
# the in-memory x_data/y_data — confirm which was intended.
x_dat = pd.read_csv('x_data_PerImage.csv', sep=',')
y_dat = pd.read_csv('y_data_PerImage.csv', sep=',')

# Train SVM (Python 2 print statements): polynomial-kernel SVC, one-vs-rest.
print 'Training SVM....'
ti = time.time()
S = OneVsRestClassifier(SVC(kernel='poly')).fit(x_data, y_data)
print time.time() - ti
print 'seconds needed'

# Save classifier
with open('svm.pkl', 'wb') as f:
    pickle.dump(S, f)

# Open classifier
#with open('filename.pkl', 'rb') as f:
#    clf = pickle.load(f)

# Load test data
t = time.time()
#loads featurevectors
features1 = pd.read_csv('ML/Features_data/caffe_features_test.csv',
Ejemplo n.º 7
0
    # Fit the base classifier (fragment of a loop over named classifiers;
    # the enclosing function/loop header is outside this view).
    print name
    clf.fit(x_train, y_train)
    # # scores = clf.score(x_test, y_test)
    # predict = clf.predict(x_test)
    # confusion_mat = metrics.confusion_matrix(y_test, predict)

    # con_mat = np.zeros(confusion_mat.shape, dtype=np.float)
    # size = len(confusion_mat[:,0])
    # for i in xrange(size):
    #     row = confusion_mat[i,:]
    #     con_mat[i,:] = confusion_mat[i,:]/float(row.sum())
    # print con_mat
    # print metrics.classification_report(y_test, predict)


    # Wrap the same base estimator one-vs-rest to get per-class scores.
    # NOTE(review): predict() is called twice (predict / y_labels) with
    # identical inputs — the second call is redundant.
    classifier = OneVsRestClassifier(clf)
    classifier.fit(x_train, y_train)
    predict = classifier.predict(x_test)
    y_score = classifier.predict_proba(x_test)
    y_labels = classifier.predict(x_test)

    print metrics.classification_report(y_test, predict)
    # print metrics.confusion_matrix(y_test, y_labels)
    # print y_score, y_labels


    # Per-class ROC curves; range(3) hard-codes three classes — presumably
    # y_test is a 3-column indicator matrix (TODO confirm).
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(3):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
Ejemplo n.º 8
0
def TrainModelWithOnevsRest(algoritm, trainData, testData, trainLabel):
    """Fit *algoritm* one-vs-rest on the training data and return its
    predictions for *testData*."""
    ovr_model = OneVsRestClassifier(algoritm)
    ovr_model.fit(trainData, trainLabel)
    return ovr_model.predict(testData)
Ejemplo n.º 9
0
    # Per-image feature extraction (fragment: the loop header over `images`
    # is outside this view).  Python 2 print statements throughout.
    print "processing image %i of %i" % (i+1, len(images)) 
    # Label is the filename minus its trailing 'NN.jpg' suffix.
    labels.append(images[i][:-len('00.jpg')])
    im = mh.imread(images[i])
    imgrey = mh.colors.rgb2gray(im, dtype=np.uint8)
    # Haralick texture + LBP + color statistics as one flat feature vector.
    features.append(np.concatenate([mh.features.haralick(im).ravel(), mh.features.lbp(imgrey, 30, 10).ravel(), colors(im)]))
    surfim = mh.imread(images[i], as_grey=True)
    surfim = surfim.astype(np.uint8)
    alldescriptors.append(surf.dense(surfim, spacing=16))

# Bag-of-visual-words: cluster all SURF descriptors into 15 centroids.
concatenated = np.concatenate(alldescriptors)
print "fitting k mean clusters for surf descriptors"
km = KMeans(15)
km.fit(concatenated)
print "creating surf features"
# Per image: histogram of how many descriptors fall into each cluster.
sfeatures = []
for d in alldescriptors:
    c = km.predict(d)
    sfeatures.append(np.array([np.sum(c == ci) for ci in range(15)]))

features = np.array(features) 
sfeatures = np.array(sfeatures, dtype=float)
features = np.concatenate((features, sfeatures), axis=1)
labels = np.array(labels)

# Stratified 90/10 split, scale, then one-vs-rest SVM.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, random_state=42, stratify=labels)
clf = Pipeline([('scaler', StandardScaler()),('classifier', OneVsRestClassifier(SVC()))])

print "building model"
clf.fit(X_train,y_train)
score = clf.score(X_test,y_test)
print 'Accuracy of model: %.2f%%' % (score*100.)
Ejemplo n.º 10
0
# Use label_binarize to be multi-label like settings
Y = label_binarize(y, classes=[0, 1, 2])
n_classes = Y.shape[1]

# Split into training and test
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=.5,
                                                    random_state=random_state)

# We use OneVsRestClassifier for multi-label prediction
from sklearn.multiclass import OneVsRestClassifier

# Run classifier; decision_function gives the per-class confidence scores
# needed for precision-recall curves (LinearSVC has no predict_proba).
classifier = OneVsRestClassifier(svm.LinearSVC(random_state=random_state))
classifier.fit(X_train, Y_train)
y_score = classifier.decision_function(X_test)

###############################################################################
# The average precision score in multi-label settings
# ....................................................
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# For each class, collect precision/recall curve points and AP.
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(
import pandas as pd

# Import data frame
sample_df = pd.read_csv('../data/DataCamp/exercise/sample.csv', index_col=0)
sample_df['text'].fillna("", inplace=True)  # pipeline can't fit text with NaNs


##################################
# (1) Split and select text data #
##################################

# Split out only the text data; labels are one-hot encoded via get_dummies
# so the OneVsRest classifier sees one binary column per class.
X_train, X_test, y_train, y_test = train_test_split(sample_df['text'],
                                                    pd.get_dummies(sample_df['label']), 
                                                    random_state=456)

# Instantiate Pipeline object: pl
# Bag-of-words counts -> one logistic regression per label column.
pl = Pipeline([
        ('vec', CountVectorizer()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

# Fit to the training data
pl.fit(X_train, y_train)

# Compute and print accuracy (subset accuracy: all labels must match)
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on sample data - just text data: ", accuracy)

Ejemplo n.º 12
0
# Aggregate row-level predictions `p` into one feature vector per key in
# `yk` (fragment: p, xm, yorig, key_old are defined outside this view).
b = 0
for i, key in enumerate(yk):
    # Flush the accumulated rows [b:i) whenever the key changes, or at the
    # final element.  Per group: mean, max and sum of predictions, plus
    # log-summed raw features.
    if (key_old != key and key_old is not None) or i == len(yk) - 1:
        porig.append(
            np.hstack(
                (np.mean(p[b:i, :], axis=0), np.amax(p[b:i, :], axis=0),
                 np.sum(p[b:i, :],
                        axis=0), np.log(np.sum(xm[b:i, :], axis=0) + 1))))
        b = i
    key_old = key

ym2 = np.array(yorig)
xm2 = np.array(porig)
# Second-stage linear SVC over the grouped features, evaluated with
# 5-fold cross-validated probabilities (Python 2 print statements).
clf = OneVsRestClassifier(SVC(C=.005,
                              kernel='linear',
                              random_state=4,
                              probability=True),
                          n_jobs=9)
print 'Train SVC - CV'
p2 = utils.cross_val_proba(clf, xm2, ym2, 5, 0, n_jobs=1)
print f1_score(ym2, np.array(p2 >= 0.5, dtype='l'), average='samples')

key_old = None
pt_orig, idt_orig = [], []
b = 0
for i, key in enumerate(idt):
    if (key_old != key and key_old is not None) or i == len(idt) - 1:
        #pt_orig.append(np.hstack((np.mean(pt[b:i, :], axis=0), np.amax(pt[b:i, :], axis=0), np.sum(pt[b:i, :], axis=0),
        #                          np.log(np.sum(xt[b:i, :], axis=0) + 1))))
        pt_orig.append(
            np.hstack(np.sum(pt[b:i, :], axis=0),
Ejemplo n.º 13
0
            # Number of base models in the ensemble (fragment: enclosing
            # loop/function headers are outside this view).
            p = len(ensemble)

            if save:
                save_method_results(cv_path,
                                    X_train=X_train,
                                    Y_train=Y_train,
                                    X_test=X_test,
                                    Y_test=Y_test,
                                    X_val=X_val,
                                    Y_val=Y_val,
                                    Y_val_meta=Y_val_meta,
                                    Y_test_meta=Y_test_meta)

            # META-LEARNING STEP: one logistic regression per base model,
            # trained on the validation split's meta-labels.
            ml_clf = OneVsRestClassifier(estimator=LogisticRegression())
            ml_clf.fit(X_val, Y_val_meta)
            # NOTE(review): `proba` is computed but unused below — confirm
            # whether it should feed into Y_pred_meta.
            proba = ml_clf.predict_proba(X_test)

            # ENSEMBLE: all-ones meta matrix means every base model votes.
            Y_pred_meta = np.ones((X_test.shape[0], p))
            Y_pred = predict_with_meta_dataset(ensemble, Y_pred_meta, X_test)
            if save:
                save_method_results(cv_path,
                                    method_name='ensemble',
                                    Y_pred=Y_pred)
            if verbose:
                print("For ENSEMBLE: {0}".format(accuracy_score(
                    Y_test, Y_pred)))

            # PRECISION MINIMIZER
Ejemplo n.º 14
0
def newPipelines (s): #This function made possible by Susan Li's article on TwoardDataScience.com
    """Train one of three tf-idf text-classification pipelines per genre and
    return the average test accuracy.

    s -- 'SVC', 'NB' or 'LogReg'; selects the classifier wrapped by
         OneVsRestClassifier.  Any other value returns None.

    Relies on module-level globals: df, genres, stop_words (assumed defined
    elsewhere in the file).
    """
    train, test = train_test_split(df, test_size=0.33, shuffle=True)
    X_train = train.Book_Text
    X_test = test.Book_Text

    def _evaluate(pipeline):
        # Fit/score the given pipeline once per genre; return mean accuracy.
        totalAcc = 0
        for genre in genres:
            print('... Processing {}'.format(genre))
            # train the model using X_dtm & y
            pipeline.fit(X_train, train[genre])
            # compute the testing accuracy
            prediction = pipeline.predict(X_test)
            # BUG FIX: the original added an undefined `acc` (NameError);
            # the accuracy must be computed before accumulating it.
            acc = accuracy_score(test[genre], prediction)
            print('Test accuracy is {}'.format(acc))
            totalAcc += acc
        # Average over however many genres exist (was hard-coded to 16).
        if len(genres):
            totalAcc = totalAcc / len(genres)
            print("\n")
            print('Overall average test accuracy is {}'.format(totalAcc))
        return totalAcc

    if s == 'SVC':
        return _evaluate(Pipeline([
                    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
                ]))

    if s == 'NB':
        return _evaluate(Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ]))

    if s == 'LogReg':
        return _evaluate(Pipeline([
                        ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                        ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
                    ]))
    # Scale the feature matrix.
    # NOTE(review): the result of fit_transform is discarded, so X is NOT
    # actually scaled before the split below — likely a bug; confirm.
    scaler = StandardScaler()
    scaler.fit_transform(X)

    # Split into train and test sets (75/25).
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=42,
                                                        shuffle=True)

    # 3-D plot of the reduced-dimensionality data (disabled).
    # dim_reduction_plot_tsne(X_test, y_test)

    # Linear SVM wrapped one-vs-rest (the commented block is an alternative
    # RBF-kernel configuration).
    clf_SVC = OneVsRestClassifier(LinearSVC())
    # clf_SVC = SVC(C=0.1, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True,
    #               probability=False, tol=0.001, cache_size=1000, class_weight=None,
    #               verbose=0, max_iter=-1, decision_function_shape="ovr", random_state=0)
    clf_SVC.fit(X_train, y_train)

    print('Accuracy of SVC on training set: {:.2f}'.format(
        clf_SVC.score(X_train, y_train) * 100))
    print('Accuracy of SVC on test set: {:.2f}'.format(
        clf_SVC.score(X_test, y_test) * 100))

    # Report elapsed wall-clock time since the (externally defined) start_time.
    total_time = round((time.time() - start_time))
    print("Time elapsed: %s minutes %s seconds" %
          ((total_time // 60), round(total_time % 60)))
Ejemplo n.º 16
0
    return stem

#Defining X and y for training:
# Each row's first column holds the tweet text.
X = []
for i in range(tweets_df.shape[0]):
    X.append(tweets_df.iloc[i][0])
y = np.array(tweets_df['polarity_bin'])

#defining Train and test:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=5)

#Defining Model Pipeline for Logistic Regression:
model_Logistic_Regression = Pipeline([
    #Def stopword for model and Tf-idf  for the Tweets
    ('tfidf', TfidfVectorizer(stop_words=spa_stop)),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag')))
])

#Defining Parameters:
# 'clf__estimator__*' keys reach the LogisticRegression nested inside the
# OneVsRestClassifier pipeline step.
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__C": [0.01, 0.1, 1],
    "clf__estimator__class_weight": ['balanced', None],
}

# NOTE(review): the grid search is fit on the FULL X/y rather than the
# train split created above, so X_test/y_test are unused here — confirm.
GridSearch_Log_Reg = GridSearchCV(model_Logistic_Regression, parameters, n_jobs=-1)
GridSearch_Log_Reg = GridSearch_Log_Reg.fit(X, y)
print('Training the Model please be patient.')
print('\n')
print('GridSearch Logistic Regression best score: \n',GridSearch_Log_Reg.best_score_)
# NOTE(review): sklearn.cross_validation was removed in modern sklearn
# (use model_selection) — this snippet targets an old version.
from sklearn import metrics, cross_validation
from sklearn import datasets
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
# Personality/values feature columns predict the 'Profession' target.
df = pd.read_csv('dataSample1.csv')
samples = df.loc[:, [
    'Openness', 'Conscientousness', 'Extraversion', 'Agreeableness',
    'Emotional_Range', 'Conversation', 'Openness to Change', 'Hedonism',
    'Self-enhancement', 'Self-transcendence'
]]
target = df.loc[:, 'Profession']

# NOTE(review): cv_folds is created but never used below.
cv_folds = cross_validation.StratifiedKFold(target,
                                            n_folds=5,
                                            shuffle=False,
                                            random_state=0)

# Single 80/20 holdout evaluation of one-vs-rest logistic regression.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    samples, target, test_size=0.2, random_state=0)
model = OneVsRestClassifier(LogisticRegression())
model.fit(X_train, y_train)
predicted = model.predict(X_test)
report = classification_report(y_test, predicted)
print(report)
Ejemplo n.º 18
0
# Sanity-check the split shapes (fragment: the split itself is outside view).
X_train.shape
y_train.shape
X_valid.shape
y_valid.shape

# Build label <-> index lookup tables from the training-label CSV.
df_label = pd.read_csv('./data/train_result.csv',
                       engine='python',
                       encoding='utf-8')
label_list = sorted(df_label.label.unique())
label2num_dict = {k: v for v, k in enumerate(label_list)}
num2label_dict = {k: v for k, v in enumerate(label_list)}

# =============================================================================
# Model training
# =============================================================================
ovr = OneVsRestClassifier(GradientBoostingClassifier(n_estimators=100))
ovr.fit(X_train, y_train)  # fit the model

y_pred = ovr.predict(X_valid)

# Validation metrics.
# NOTE(review): Accuracy is printed with a '%' label but, unlike Recall,
# is not multiplied by 100 — confirm intended units.
Accuracy = accuracy_score(y_valid, y_pred, normalize=True)
Recall = recall_score(y_valid, y_pred, average='macro')
F1 = f1_score(y_valid, y_pred, average='weighted')
#F1_score = 2 * accuracy_score * recall_acc / (accuracy_score + recall_score)
F1_2 = F1**2
print('准确率 = {0} %'.format(round(Accuracy, 2)))
print('召回率 = {0} %'.format(round(Recall * 100, 2)))
print('F1 = {0}'.format(F1))
print('(F1)2 = {0}'.format(F1_2))

# Read the test set and evaluate the model
Ejemplo n.º 19
0
                        negative=5,
                        workers=2,
                        epochs=20)
model.build_vocab(docs)
model.train(docs, epochs=model.iter, total_examples=model.corpus_count)
# Build doc2vec vectors
# Build doc2vec vectors: the first n_train document vectors are training
# samples, the next n_test are test samples.
x_train = []
x_test = []
for i in range(n_train):
    x_train.append(model.docvecs[i])
for i in range(n_test):
    x_test.append(model.docvecs[n_train + i])

# classification: one-vs-rest SVC with probability outputs.
pipeline = Pipeline([
    ('clf', OneVsRestClassifier(SVC(probability=True), n_jobs=1)),
])
# BUG FIX: grid keys must address the SVC nested inside the OneVsRest
# wrapper inside the pipeline step 'clf' ('clf__estimator__<param>');
# the bare 'kernel'/'C' keys made GridSearchCV raise ValueError at fit.
parameters = {
    'clf__estimator__kernel': ('linear', 'rbf'),
    'clf__estimator__C': [0.1, 1, 10],
}

clf = GridSearchCV(pipeline, parameters, cv=2, verbose=1)
clf.fit(x_train, y_train)

print(clf.best_score_)
print(clf.best_params_)

y_pred = clf.predict_proba(x_test)

# Write predictions to a file
with open('sample_submission.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = clf.classes_.tolist()
Ejemplo n.º 20
0
def define_model_for_optimization(mt, ndp, mc):
    """Build a model of type *mt* with non-default params *ndp*; when *mc*
    (multiclass flag) is truthy, wrap it in a one-vs-rest classifier."""
    base_model = algorithm_setup(model_type=mt, nondef_params=ndp)
    return OneVsRestClassifier(base_model, n_jobs=1) if mc else base_model
Ejemplo n.º 21
0
# Multiple Binary Classifications - (One Vs Rest Classifier)

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
#from IPython.display import Markdown, display

# def printmd(string):
# display(Markdown(string))

# % time

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
])

# One binary classification problem per category: fit on the category's
# 0/1 column from `train`, predict on the held-out x_test.
# NOTE(review): x_train/train and categories come from outside this view —
# confirm x_train rows align with train[category].
arrs = []
for category in categories:
    # printmd('**Processing {} review...**'.format(category))

    # Training logistic regression model on train data
    # print("x_train")
    # print(x_train)
    LogReg_pipeline.fit(x_train, train[category])

    # calculating test accuracy
    # print("x_test")
    # print(x_test)
    prediction = LogReg_pipeline.predict(x_test)
                                                    random_state=4)

#Printing shapes
print(s_train.shape)
print(s_test.shape)
print(t_train.shape)
print(t_test.shape)

# One-vs-One SVM Classifier Prediction
# NOTE(review): fit is called twice on identical data — the second call
# just refits and is redundant.
smodel = OneVsOneClassifier(SVC()).fit(s_train, t_train)
smodel.fit(s_train, t_train)
sprediction = smodel.predict(s_test)
print(sprediction)

# One-vs-Rest SVM Classifier Prediction
clf = OneVsRestClassifier(SVC()).fit(s_train, t_train)
spredict = clf.predict(s_test)
print(spredict)

# Actual values which should have been predicted based on testing dataset
print(t_test)
"""<h1>Evaluating the classifiers</h1>"""

# Accuracy for One-vs-One Classifier
accuracy = metrics.accuracy_score(t_test, sprediction)
print(accuracy)

# Accuracy for One-vs-Rest Classifier
accuracy1 = metrics.accuracy_score(t_test, spredict)
print(accuracy1)
Ejemplo n.º 23
0
# Notebook-export cells comparing classifiers; column 0 is the label,
# remaining columns are features.  .ix indexing is long-deprecated pandas.
metrics.accuracy_score(test.ix[:,0], clf.predict(test.ix[:,1:])) 


# In[52]:

# NOTE(review): this constructs a new SVC but discards it — `clf` below is
# still whatever was fit previously, so the 'ovo' setting has no effect.
svm.SVC(decision_function_shape='ovo')
clf.fit(train.ix[:,1:], train.ix[:,0])
metrics.accuracy_score(test.ix[:,0], clf.predict(test.ix[:,1:])) 


# In[53]:

# One-vs-rest linear SVM baseline.
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
clf = OneVsRestClassifier(LinearSVC(random_state=0))
clf.fit(train.ix[:,1:], train.ix[:,0])
metrics.accuracy_score(test.ix[:,0], clf.predict(test.ix[:,1:])) 


# In[54]:

# Gaussian naive Bayes baseline.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(train.ix[:,1:], train.ix[:,0])
metrics.accuracy_score(test.ix[:,0], gnb.predict(test.ix[:,1:])) 


# In[55]:

from sklearn import linear_model
Ejemplo n.º 24
0
def draw_roc_auc(ax,clf,X,y,title):    
    """Fit *clf* one-vs-rest on half of (X, y) and draw per-class,
    micro-average and macro-average ROC curves on axes *ax*.

    y is expected to be a binary indicator matrix (n_samples, n_classes) —
    n_classes is read from y.shape[1] and columns are indexed per class.
    Relies on module-level ms (metrics), np, interp, cycle, plt, myfont.
    """
    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.5,
                                                        random_state=0)
    
    n_classes = y.shape[1]
    
    # One-vs-rest wrapper around the supplied classifier
    oneVsRestclassifier = OneVsRestClassifier(clf)
    y_score = oneVsRestclassifier.fit(X_train, y_train).predict_proba(X_test)
    
    # Per-class ROC curve and AUC
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = ms.roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = ms.auc(fpr[i], tpr[i])
    # Micro-average ROC curve and AUC (all classes' decisions pooled)
    fpr["micro"], tpr["micro"], _ = ms.roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = ms.auc(fpr["micro"], tpr["micro"])
    
    lw = 2
    
    # Macro-average ROC curve and AUC: compute per class, then average.
    # This method does not account for class imbalance.
    # First collect the union of all false-positive-rate points
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    # Interpolate every class's ROC curve onto those points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])
    
    # Average and compute the macro AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = ms.auc(fpr["macro"], tpr["macro"])
    
    # Plot the ROC curves
    # NOTE(review): per-class curves use plt.plot while the averages use
    # ax.plot — confirm they target the same axes when ax is not the
    # current axes.
    ax.plot(fpr["micro"], tpr["micro"],
        label='micro-average ROC曲线 (area = {0:0.2f})' 
        ''.format(roc_auc["micro"]),
        color='deeppink', linestyle=':', linewidth=4)
    ax.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC曲线 (area = {0:0.2f})'
             ''.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='类{0}的ROC曲线 (area = {1:0.2f})'
                 ''.format(i, roc_auc[i]))
    ax.plot([0, 1], [0, 1], 'k--', lw=lw)
    
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title,fontproperties=myfont)
    ax.legend(loc="best",prop=myfont)
    plt.show()
    return
def train(dataset, model_name='sgd'):
    """Train one (or several) classifiers and report validation metrics.

    Parameters
    ----------
    dataset : tuple
        ``(X_train, y_train, X_val, y_val, X_test)``; ``X_test`` is unpacked
        but not used here (prediction on it happens elsewhere).
    model_name : str, optional
        One of ``'svm'``, ``'pa'``, ``'sgd'``, ``'rf'``, ``'ovr'`` or
        ``'all'`` (case-insensitive).  With ``'all'`` every model is trained
        in turn and the last one fitted ('ovr') is the one returned, matching
        the original behavior.

    Returns
    -------
    The last fitted estimator.

    Raises
    ------
    ValueError
        If ``model_name`` selects no model.  (The original code fell through
        to ``return model`` with ``model`` unbound — a NameError.)
    """
    X_train, y_train, X_val, y_val, X_test = dataset
    name = model_name.lower()
    model = None

    def _fit_and_report(estimator, label):
        # Shared fit/time/evaluate/print logic that was duplicated five
        # times in the original.  Reports macro-F1 on train and validation
        # plus validation accuracy, in the original output format.
        train_start = time()
        estimator.fit(X_train, y_train)
        elapsed = timedelta(seconds=time() - train_start)
        train_f1 = metrics.f1_score(y_train, estimator.predict(X_train),
                                    average='macro')
        val_pred = estimator.predict(X_val)
        val_f1 = metrics.f1_score(y_val, val_pred, average='macro')
        val_acc = metrics.accuracy_score(y_val, val_pred)
        print(
            "{} Performance: train F1: {} | train time: {} | val F1: {} | val Acc: {}"
            .format(label, train_f1, elapsed, val_f1, val_acc))
        return estimator

    if name in ('svm', 'all'):
        model = _fit_and_report(
            SVC(C=1,
                kernel='rbf',
                degree=4,
                gamma='auto',
                coef0=0.0,
                shrinking=True,
                probability=False,
                tol=0.0001,
                cache_size=200,
                class_weight='balanced',
                verbose=False,
                max_iter=-1,
                decision_function_shape='ovr',
                random_state=None), "SVM")

    if name in ('pa', 'all'):
        # The deprecated `n_iter` parameter was removed from scikit-learn
        # (0.21), and `max_iter=None` is no longer accepted; use the
        # library's modern default of 1000 iterations instead.
        model = _fit_and_report(
            PassiveAggressiveClassifier(C=1.0,
                                        fit_intercept=True,
                                        max_iter=1000,
                                        tol=0.001,
                                        shuffle=True,
                                        verbose=0,
                                        loss='hinge',
                                        n_jobs=1,
                                        random_state=None,
                                        warm_start=False,
                                        class_weight='balanced',
                                        average=True), "PA")

    if name in ('sgd', 'all'):
        # Same `n_iter`/`max_iter` compatibility fix as above.
        model = _fit_and_report(
            SGDClassifier(loss='hinge',
                          penalty='l2',
                          alpha=0.0001,
                          l1_ratio=0.15,
                          fit_intercept=True,
                          max_iter=1000,
                          tol=0.001,
                          shuffle=True,
                          verbose=0,
                          epsilon=0.1,
                          n_jobs=1,
                          random_state=None,
                          learning_rate='optimal',
                          eta0=0.0,
                          power_t=0.5,
                          class_weight=None,
                          warm_start=False,
                          average=False), "SGD")

    if name in ('rf', 'all'):
        model = _fit_and_report(RandomForestClassifier(n_estimators=100),
                                "RF")

    if name in ('ovr', 'all'):
        # One-vs-rest wrapper around a fresh random forest.
        model = _fit_and_report(
            OneVsRestClassifier(
                estimator=RandomForestClassifier(n_estimators=100)), "OvR")

    if model is None:
        raise ValueError("unknown model_name: {!r}".format(model_name))
    return model
# Persist the prepared splits so later runs can skip preprocessing.
# (np.save appends the .npy extension; raw-string '\' paths are Windows-only.)
np.save(outpdir + r'\X_train', X_train)
np.save(outpdir + r'\X_validation', X_validation)
np.save(outpdir + r'\X_test', X_test)
np.save(outpdir + r'\y_train', y_train)
np.save(outpdir + r'\y_validation', y_validation)
np.save(outpdir + r'\y_test', y_test)

# Just a warm up: Use the logistics regression to complete this Chinese document classification task
# We first train the data without undersampling and see how it performs in the validation set
print('===============================Without Undersampling Starts===============================')

start = time.time()

# One-vs-rest logistic regression wrapped in a (single-step) pipeline.
classifier = LogisticRegression
pipeline = make_pipeline(classifier(C=40, random_state=0))
multiC = OneVsRestClassifier(estimator=pipeline)
# Fit ONCE and reuse below; the original refit the whole model a second
# time just to call .score(), doubling the training cost.
multiC.fit(X_train, y_train)
validation_result = multiC.predict(X_validation)
true_validation = np.array(y_validation)

end = time.time()

print('Total time - Without Undersampling: ', end - start, ' seconds\n')
print(metrics.classification_report(y_validation, validation_result))
print()
print('Without Undersampling -  Pipeline Score {}'.format(multiC.score(X_validation, y_validation)))
print()
print_results("Without Undersampling - Validation set: ", true_validation, validation_result)

print('===============================Without Undersampling Ends===============================\n')

print('================================With Undersampling Starts===============================\n')
Ejemplo n.º 27
0
# accuracy with tfidf vectorizer
# NOTE(review): accuracy_score's signature is (y_true, y_pred); the
# arguments here are swapped.  Harmless because accuracy is symmetric,
# but worth normalizing for consistency with other metrics.
acc_tfidf_nb = accuracy_score(nb_2.predict(X_test_tfidf), Y_test)

# display accuracies
print("Accuracy of Count = ", acc_count_nb)

print("Accuracy of Tfidf = ", acc_tfidf_nb)

# Code ends here

# --------------
# Silence sklearn convergence/deprecation warnings for the runs below.
import warnings
warnings.filterwarnings('ignore')

# initialize logistic regression
# Two identically-configured one-vs-rest logistic regressions, one per
# feature representation, so their accuracies can be compared directly.
logreg_1 = OneVsRestClassifier(LogisticRegression(random_state=10))
logreg_2 = OneVsRestClassifier(LogisticRegression(random_state=10))

# fit on count vectorizer training data
logreg_1.fit(X_train_count, Y_train)

# fit on tfidf vectorizer training data
logreg_2.fit(X_train_tfidf, Y_train)

# accuracy with count vectorizer
acc_count_logreg = accuracy_score(logreg_1.predict(X_test_count), Y_test)

# accuracy with tfidf vectorizer
acc_tfidf_logreg = accuracy_score(logreg_2.predict(X_test_tfidf), Y_test)

# display accuracies
Ejemplo n.º 28
0
                    file_name + '/' + file_name + '_' + name + '.csv',
                    delimiter=',')
    f2 = np.loadtxt('D:/Study/Bioinformatics/王浩/data and code/data/feature/' +
                    file_name + '/' + file_name + '_label.csv',
                    delimiter=',',
                    skiprows=1)
    X = f1
    y = f2

    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    parameters = {
        'estimator__gamma': np.logspace(5, -15, base=2, num=21),
        'estimator__n_neighbors': np.linspace(5, 15, num=3, dtype=int)
    }

    grid = GridSearchCV(OneVsRestClassifier(Class_KDVM_knn.KDVM(kernel='rbf'),
                                            n_jobs=-1),
                        parameters,
                        n_jobs=-1,
                        cv=cv,
                        verbose=2)

    grid.fit(X, y)
    gamma = grid.best_params_['estimator__gamma']
    n_neighbors = grid.best_params_['estimator__n_neighbors']

    # average_pre_score = average_precision_score(Y_test, pre_score_2, average='samples')
    # zero_one_loss_1 = metrics.zero_one_loss(Y_test, pre_y)
    # coverage_error_1 = coverage_error(Y_test, pre_score_2) - 1
    # label_ranking_loss_1 = label_ranking_loss(Y_test, pre_score_2)
    # ham_loss = metrics.hamming_loss(Y_test.T, pre_y.T)
    # acc_score = metrics.accuracy_score(Y_test, pre_y)
Ejemplo n.º 29
0
# Load the iris dataset and assemble a single labelled DataFrame.
iris = datasets.load_iris()

X = iris.data
Y = np.array(iris.target)
# Reshape labels to a column vector so they can be concatenated with X.
Y = Y.reshape(len(Y), 1)

XY = np.concatenate((X, Y), axis=1)

# Python-2 `print XY.shape` is a SyntaxError under Python 3; the call
# form prints identically for a single argument on both versions.
print(XY.shape)

XY = pd.DataFrame(XY,
                  columns=[
                      'sepal_length', 'sepal_width', 'petal_length',
                      'petal_width', 'target'
                  ])

print(XY)
#classif = SVC(kernel='linear')
# Estimator handed to RankingTool below (modeltype='tree').
classif = RandomForestRegressor()

# One-vs-rest SVM; probability=True enables predict_proba on the wrapper.
# Labels are flattened back to 1-D for fit().
classifier = OneVsRestClassifier(SVC(kernel='linear', probability=True))
classifier.fit(X, Y.ravel())

result = RankingTool.GetLearningResult(
    XY,
    classif, ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
    'target',
    modeltype='tree',
    use_cv=5)

print(result)
Ejemplo n.º 30
0
                             tokenizer=lambda x: x.split(),
                             sublinear_tf=False,
                             ngram_range=(1, 3))
# Fit the TF-IDF vectorizer on training questions only, then transform the
# test set with the same vocabulary (no fitting on test data).
x_train_multilabel = vectorizer.fit_transform(x_train['question'])
x_test_multilabel = vectorizer.transform(x_test['question'])
print("Time taken to run this cell :", datetime.now() - start)

print("Dimensions of train data X:", x_train_multilabel.shape, "Y :",
      y_train.shape)
print("Dimensions of test data X:", x_test_multilabel.shape, "Y:",
      y_test.shape)

# This function is compute heavy and takes 6-7 hours to run.
# NOTE(review): loss='log' was renamed to 'log_loss' in scikit-learn 1.1+;
# left as-is for compatibility with the sklearn version this was written for.
classifier = OneVsRestClassifier(SGDClassifier(loss='log',
                                               alpha=0.00001,
                                               penalty='l1',
                                               n_jobs=-1),
                                 n_jobs=-1)
classifier.fit(x_train_multilabel, y_train)
predictions = classifier.predict(x_test_multilabel)

# Multi-label evaluation: exact-match accuracy, macro/micro F1, hamming
# loss, and the per-label precision/recall report.
print("accuracy :", metrics.accuracy_score(y_test, predictions))
print("macro f1 score :", metrics.f1_score(y_test,
                                           predictions,
                                           average='macro'))
# Typo fixed in the printed label ("scoore" -> "score").
print("micro f1 score :",
      metrics.f1_score(y_test, predictions, average='micro'))
print("hamming loss :", metrics.hamming_loss(y_test, predictions))
print("Precision recall report :\n",
      metrics.classification_report(y_test, predictions))