Esempio n. 1
0
from sklearn import svm
import itertools
from sklearn.ensemble import RandomForestClassifier as RFC
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import all_parsing_codes

# Paths of the two data files read by this script.
TEMPFILE = '../data/train_test_sets/randomized109_proteins.3line.txt'
TEMPFILE2 = '../data/testing_sets/dataset_of_50.txt'

# Window length passed to both parser calls below.
win_len = 7

# Display names for the eight classes
# (presumably the DSSP 8-state secondary-structure codes -- TODO confirm).
class_names = ['G', 'I', 'H', 'E', 'B', 'T', 'S', 'C']

# Training features/labels are parsed from TEMPFILE. The parser returns four
# values; the last two (`e`, `o`) are not used in the visible part of the script.
X_train, y_train, e, o = all_parsing_codes.parse_with_train_test(TEMPFILE,
                                                                 win_len)

# Evaluation features/labels are parsed from the separate TEMPFILE2 file.
X_test, y_test = all_parsing_codes.parse_with_all_codes(TEMPFILE2, win_len)

# Alternative loader left disabled by the original author:
#X_train, y_train, X_test, y_test = all_parsing_codes.protein_w_pssm_train(TEMPFILE,win_len)

# Random forest with 350 trees, using all available cores (n_jobs=-1).
classifier_model = RFC(n_estimators=350, min_samples_split=2, n_jobs=-1)

# Fit on the training data, then predict labels for the evaluation features.
classifier_model.fit(X_train, y_train)
y_pred = classifier_model.predict(X_test)


def plot_confusion_matrix(cm,
                          classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
import all_parsing_codes
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier as RFC
import numpy as np
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import all_parsing_codes

# Paths of the data files referenced by this script.
tempfile = '../data/train_test_sets/randomized109_proteins.3line.txt'
TEMPFILE2 = '../data/testing_sets/dataset_of_50.txt'

# Results file, opened for writing (truncates any existing file).
# NOTE(review): no write to or close of this handle is visible in this part of
# the script -- confirm it is closed further down.
OUTPUT = open("../results/testing_results/RFC_metrics_again.txt", mode='w')

# Parse TEMPFILE into train/test features and labels; 11 is the second
# argument handed to the parser (presumably the window length -- TODO confirm).
X_TRAIN, Y_TRAIN, X_TEST, Y_TEST = all_parsing_codes.parse_with_train_test(
    tempfile, 11)

# Random forest with 350 trees, using all available cores (n_jobs=-1).
MODEL = RFC(n_estimators=350, min_samples_split=3, n_jobs=-1)
MODEL.fit(X_TRAIN, Y_TRAIN)

# Predicted labels for the test partition.
PREDICTION = MODEL.predict(X_TEST)

# Per-class report over the integer labels 1..8, shown under the
# corresponding letter names.
REPORT = classification_report(
    y_true=Y_TEST,
    y_pred=PREDICTION,
    labels=list(range(1, 9)),
    target_names=['G', 'I', 'H', 'E', 'B', 'T', 'S', 'C'],
)

# Confusion matrix with rows/columns ordered by the same labels 1..8.
CONFUSION = confusion_matrix(y_true=Y_TEST,
                             y_pred=PREDICTION,
                             labels=list(range(1, 9)))
#import my functions from the all_parsing_codes file#
#####################################################
import all_parsing_codes
import numpy as np
from sklearn import svm
from sklearn.model_selection import cross_val_score

# Path of the data file used for this run.
tempfile = '../data/train_test_sets/34_proteins.3line.txt'

# Ask the parser for the train/test partitions, passing 11 as its second
# argument. The original author's note describes this as a 70% training /
# 30% test split of the data set.
X_train, Y_train, X_test, Y_test = all_parsing_codes.parse_with_train_test(
    tempfile, 11)

# Build a linear-kernel SVC (kernel cache of 3000 MB) and fit it on the
# training partition; fit() returns the estimator itself, so the fitted
# model is bound to `clf`.
clf = svm.SVC(kernel='linear', cache_size=3000).fit(X_train, Y_train)

# Predict labels for the test partition's features.
prediction = clf.predict(X_test)

#############################################################################
Esempio n. 4
0
import numpy as np
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import all_parsing_codes

# Path of the data file parsed in the loop below.
TEMPFILE = '../data/train_test_sets/randomized109_proteins.3line.txt'

# Earlier output target, left disabled by the original author:
#OUTPUT = open("../results/testing_results/linearSVM_metrics.txt", 'w')

# Results file for the cross-validation scores, opened for writing
# (truncates any existing file).
# NOTE(review): no close of this handle is visible in this part of the
# script -- confirm it is closed after the loop.
OUTPUT2 = open(
    "../results/testing_results/linearSVM_crossvalidation_new.txt", mode='w')

for c_score in (0.1, 1, 10, 100):
    for win_len in range(19, 26, 2):
        X_TRAIN, Y_TRAIN, X_TEST, Y_TEST = all_parsing_codes.parse_with_train_test(
            TEMPFILE, win_len)
        MODEL = svm.LinearSVC(C=c_score)
        SCORE = cross_val_score(MODEL,
                                X_TRAIN,
                                Y_TRAIN,
                                cv=3,
                                verbose=True,
                                n_jobs=-1)
        SCORE_AVERAGE = np.average(SCORE)
        SCORE_DEVIATION = np.std(SCORE)
        OUTPUT2.write("C-score: " + str(c_score) + '\n' + " window size: " +
                      str(win_len) + '\n' + " cross-validation score: " +
                      str(SCORE_AVERAGE) + '\n' + " standard deviation: " +
                      str(SCORE_DEVIATION) + '\n')
        print("C-score: " + str(c_score), "window size: " + str(win_len),
              "cross-validation score: " + str(SCORE_AVERAGE),