# Random-forest experiment: train on one parsed file, predict on a second,
# separately parsed file, then (further down) plot a confusion matrix.
#
# NOTE(review): `svm` is imported twice, and `svm`, `itertools`, `np`,
# `datasets`, `train_test_split` and `confusion_matrix` are not referenced in
# the visible part of this script; they may be used beyond this view.
from sklearn import svm
import itertools
from sklearn.ensemble import RandomForestClassifier as RFC
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import all_parsing_codes

# TEMPFILE is parsed for the training data, TEMPFILE2 for the test data.
TEMPFILE = '../data/train_test_sets/randomized109_proteins.3line.txt'
TEMPFILE2 = '../data/testing_sets/dataset_of_50.txt'
# Window length passed to both parsing helpers below.
win_len = 7
# Class display names.
# presumably the eight DSSP secondary-structure states -- TODO confirm
class_names = ['G', 'I', 'H', 'E', 'B', 'T', 'S', 'C']

# parse_with_train_test returns four values; only the first two are kept as
# the training set. `e` and `o` are not referenced again in the visible code.
X_train, y_train, e, o = all_parsing_codes.parse_with_train_test(
    TEMPFILE, win_len)
# The test set is parsed from the separate TEMPFILE2 file with the same
# window length.
X_test, y_test = all_parsing_codes.parse_with_all_codes(TEMPFILE2, win_len)
# Disabled alternative data source (kept from the original):
#X_train, y_train, X_test, y_test = all_parsing_codes.protein_w_pssm_train(TEMPFILE,win_len)

# n_jobs=-1 asks scikit-learn to use all available processors.
classifier_model = RFC(n_estimators=350, min_samples_split=2, n_jobs=-1)
# Fit on the training set and predict the test set in one chained call.
y_pred = classifier_model.fit(X_train, y_train).predict(X_test)


# NOTE(review): this definition is cut off in the visible source -- the
# docstring below is never closed and the function body is not shown. It is
# reproduced exactly as far as it is visible.
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
# Random-forest evaluation script: parse the data set, fit the model, predict
# the held-out part and build the text report and confusion matrix.

# Third-party imports.
import numpy as np
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    matthews_corrcoef,
)
from sklearn.model_selection import cross_val_score

# Project-local parsing helpers.
import all_parsing_codes

# Input data and the results file.
tempfile = '../data/train_test_sets/randomized109_proteins.3line.txt'
TEMPFILE2 = '../data/testing_sets/dataset_of_50.txt'
# NOTE(review): OUTPUT is opened here but neither written nor closed in the
# visible part of the script -- confirm it is closed further down.
OUTPUT = open("../results/testing_results/RFC_metrics_again.txt", 'w')

# Integer class labels and their display names, shared by the report and the
# confusion matrix so both use the same ordering.
_LABEL_IDS = [1, 2, 3, 4, 5, 6, 7, 8]
_LABEL_NAMES = ['G', 'I', 'H', 'E', 'B', 'T', 'S', 'C']

# Parse the file into train/test features and labels (second argument: 11).
X_TRAIN, Y_TRAIN, X_TEST, Y_TEST = all_parsing_codes.parse_with_train_test(
    tempfile, 11)

# fit() returns the fitted estimator itself, so construction and training
# can be chained.
MODEL = RFC(
    n_estimators=350,
    min_samples_split=3,
    n_jobs=-1,
).fit(X_TRAIN, Y_TRAIN)

PREDICTION = MODEL.predict(X_TEST)

# Per-class text report and the raw confusion matrix over the same labels.
REPORT = classification_report(
    Y_TEST,
    PREDICTION,
    labels=_LABEL_IDS,
    target_names=_LABEL_NAMES,
)
CONFUSION = confusion_matrix(Y_TEST, PREDICTION, labels=_LABEL_IDS)
# Linear-kernel SVM baseline: parse the data set, fit on the training part
# and predict labels for the test part.

import numpy as np
from sklearn import svm
from sklearn.model_selection import cross_val_score

# Parsing helpers come from the project's own all_parsing_codes module.
import all_parsing_codes

tempfile = '../data/train_test_sets/34_proteins.3line.txt'

# Split the data set into a training set and a test set.
# NOTE(review): the original comment describes this as a 70% / 30% split; the
# ratio is decided inside parse_with_train_test and is not visible here.
X_train, Y_train, X_test, Y_test = all_parsing_codes.parse_with_train_test(
    tempfile,
    11,
)

# Fit the model with the training set. fit() returns the estimator itself,
# so `clf` is the trained classifier.
clf = svm.SVC(kernel='linear', cache_size=3000).fit(X_train, Y_train)

# Use the test-set features to check how well the labels are predicted.
prediction = clf.predict(X_test)
# Grid search for a linear SVM: for every combination of C value and window
# size, run 3-fold cross-validation on the training split and log the mean
# score and its standard deviation to a results file and to stdout.
#
# NOTE(review): matthews_corrcoef, confusion_matrix and classification_report
# are not referenced in the visible part of this script.
import numpy as np
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import all_parsing_codes

TEMPFILE = '../data/train_test_sets/randomized109_proteins.3line.txt'
# Disabled earlier output target (kept from the original):
#OUTPUT = open("../results/testing_results/linearSVM_metrics.txt", 'w')
# NOTE(review): OUTPUT2 is not closed anywhere in the visible code -- confirm
# it is closed after the loops.
OUTPUT2 = open("../results/testing_results/linearSVM_crossvalidation_new.txt",
               'w')

# Outer loop: regularisation strength C. Inner loop: window sizes 19, 21, 23
# and 25.
for c_score in (0.1, 1, 10, 100):
    for win_len in range(19, 26, 2):
        # The data is re-parsed for every (C, window) pair; X_TEST and Y_TEST
        # are not used in the visible part of the loop body.
        X_TRAIN, Y_TRAIN, X_TEST, Y_TEST = all_parsing_codes.parse_with_train_test(
            TEMPFILE, win_len)
        MODEL = svm.LinearSVC(C=c_score)
        # cv=3 -> one score per fold; n_jobs=-1 runs the folds in parallel on
        # all available processors.
        SCORE = cross_val_score(MODEL, X_TRAIN, Y_TRAIN, cv=3, verbose=True,
                                n_jobs=-1)
        # Summarise the per-fold scores.
        SCORE_AVERAGE = np.average(SCORE)
        SCORE_DEVIATION = np.std(SCORE)
        # One four-line record per parameter combination.
        OUTPUT2.write("C-score: " + str(c_score) + '\n' + " window size: " +
                      str(win_len) + '\n' + " cross-validation score: " +
                      str(SCORE_AVERAGE) + '\n' + " standard deviation: " +
                      str(SCORE_DEVIATION) + '\n')
        # NOTE(review): the print call below is cut off in the visible source;
        # its remaining arguments and closing parenthesis are not shown.
        print("C-score: " + str(c_score), "window size: " + str(win_len),
              "cross-validation score: " + str(SCORE_AVERAGE),