# Visible part of one parameter-sweep iteration (the enclosing loops and the
# `if` that owns the leading `continue` are not visible here): bumps
# param_count, prints and records the current assay / alphabet / max_ngram /
# mhc_class in d, loads an n-gram dataset via iedb.load_tcell_ngrams, then
# scores a BalancedEnsembleClassifier with 3-fold CV accuracy (mean stored in
# d['cv_acc']) and 5-fold CV ROC AUC.
# NOTE(review): d['mhc'] records mhc_class, but load_tcell_ngrams is called
# with the literal mhc_class = 1 -- if mhc_class varies across the sweep, the
# recorded value will not describe the data actually loaded; confirm intent.
# NOTE(review): accuracy uses cv = 3 while AUC uses cv = 5 -- confirm that the
# different fold counts are deliberate.
continue else: param_count += 1 param_str = \ "%d: Assay = '%s', ngram %s, alphabet %s, mhc_class %s" % \ (param_count, assay, max_ngram, alphabet, mhc_class) print param_str d['assay'].append(assay) d['alphabet'].append(alphabet) d['ngram'].append(max_ngram) d['mhc'].append(mhc_class) X, Y, vectorizer = iedb.load_tcell_ngrams( assay_group = assay, human = True, mhc_class = 1, max_ngram = max_ngram, reduced_alphabet = alphabet_dict, min_count = None, return_transformer = True) print "Data shape", X.shape, "n_true", np.sum(Y) ensemble = BalancedEnsembleClassifier() accs = sklearn.cross_validation.cross_val_score( ensemble, X, Y, cv = 3) acc = np.mean(accs) print "CV accuracy %0.4f (std %0.4f)" % \ (acc, np.std(accs)) d['cv_acc'].append(acc) aucs = sklearn.cross_validation.cross_val_score( ensemble, X, Y, cv = 5, scoring='roc_auc')
# Tail of a helper whose header is not visible here (presumably the `run`
# called further down): transforms cancer_peptides and self_peptides with f,
# stacks the two results, labels the first len(x_test_true) rows True and the
# remaining rows False, and passes x, y plus this test split to
# eval_dataset.eval_split.
# The script part then loads aromatic2 reduced-alphabet unigrams for the
# 'cytotoxicity' assay group (majority-vote labels, human, MHC class 1),
# cross-validates them with eval_dataset.eval_cv and calls run(X, Y, f) under
# the "Tumor-specific antigens" heading. The "aromatic bigram" load that
# follows continues past the end of this line.
x_test_true = f.transform(cancer_peptides) x_test_false = f.transform(self_peptides) x_test = np.vstack([x_test_true, x_test_false]) y_test = np.ones(x_test.shape[0], dtype='bool') y_test[len(x_test_true):] = 0 eval_dataset.eval_split(x,y,x_test,y_test) ASSAY = 'cytotoxicity' print print "---" print "aromatic unigram" X, Y, f = iedb.load_tcell_ngrams( noisy_labels = 'majority', assay_group = ASSAY, subsample_bigger_class = True, human = True, mhc_class = 1, max_ngram = 1, reduced_alphabet= reduced_alphabet.aromatic2, return_transformer = True) eval_dataset.eval_cv(X, Y) print "Tumor-specific antigens" run(X,Y,f) print print "---" print "aromatic bigram" X, Y, f = iedb.load_tcell_ngrams( noisy_labels = 'majority', assay_group = ASSAY, subsample_bigger_class = True, human = True, mhc_class = 1,
import sklearn.linear_model from epitopes import iedb import eval_dataset """ Instead of dropping or keeping the noisy labels, started trying to just the majority vote. This is saner and became the default """ print print "---" print "Human MHC1" X_human_mhc1_filter, Y_human_mhc1_filter = iedb.load_tcell_ngrams( noisy_labels = 'majority', human = True, mhc_class = 1) eval_dataset.eval_cv(X_human_mhc1_filter, Y_human_mhc1_filter) print print "---" print "No HLA-A2" X_no_hla_a2, Y_no_hla_a2 = iedb.load_tcell_ngrams( noisy_labels = 'majority', human = True, mhc_class = 1, exclude_hla_type = 'HLA-A2$|A-\*02') eval_dataset.eval_cv(X_no_hla_a2, Y_no_hla_a2)
from epitopes import iedb, amino_acid, features, reduced_alphabet import eval_dataset """ Do results from a restrict HLA sample (only A2) generalize to all the other HLA types? (repeated for AA bigrams) """ A2 = "A2$|A\*02" print print "---" print "Human MHC1 (keep)" X_human_mhc1, Y_human_mhc1 = iedb.load_tcell_ngrams(noisy_labels="keep", human=True, max_ngram=2, mhc_class=1) eval_dataset.eval_cv(X_human_mhc1, Y_human_mhc1) print print "---" print "Human MHC1 (drop)" X_human_mhc1_filter, Y_human_mhc1_filter = iedb.load_tcell_ngrams( noisy_labels="drop", human=True, max_ngram=2, mhc_class=1 ) eval_dataset.eval_cv(X_human_mhc1_filter, Y_human_mhc1_filter) print print "---" print "Human MHC1 noisy = positive"
# Tail of a helper whose header is not visible here (presumably the
# run_classifiers called below): fits lr on X, Y and prints its coefficients,
# then builds a RandomForestClassifier with n_classifiers = 200, prints its
# mean 10-fold cross-validation score, refits it on all of X, Y and prints the
# feature importances.
# The script part then loads 'cytotoxicity' assay n-grams under two reduced
# alphabets (reduced_alphabet.gbmr4, printed as "4 letter alphabet", and
# reduced_alphabet.sdm12, printed as "12 letter alphabet") and passes each
# dataset to run_classifiers.
lr.fit(X,Y) print "LR coefs", lr.coef_ n_classifiers = 200 rf = sklearn.ensemble.RandomForestClassifier(n_classifiers) print "RF Accuracy", np.mean(sklearn.cross_validation.cross_val_score(rf, X, Y, cv = 10)) rf.fit(X,Y) print "RF Features", rf.feature_importances_ print "4 letter alphabet:" X4,Y4 = iedb.load_tcell_ngrams( assay_group = 'cytotoxicity', reduced_alphabet = reduced_alphabet.gbmr4, ) run_classifiers(X4, Y4) print "---" print print "12 letter alphabet:" X12,Y12 = iedb.load_tcell_ngrams( assay_group = 'cytotoxicity', reduced_alphabet = reduced_alphabet.sdm12, ) run_classifiers(X12, Y12) print "---" print
import sklearn.linear_model from epitopes import iedb, amino_acid, features, reduced_alphabet import eval_dataset """ Do results from a restrict HLA sample (only A2) generalize to all the other HLA types? """ A2 = 'A2$|A\*02' print print "---" print "Human MHC1 (keep)" X_human_mhc1, Y_human_mhc1 = iedb.load_tcell_ngrams( noisy_labels = 'keep', human = True, mhc_class = 1) eval_dataset.eval_cv(X_human_mhc1, Y_human_mhc1) print print "---" print "Human MHC1 (drop)" X_human_mhc1_filter, Y_human_mhc1_filter = iedb.load_tcell_ngrams( noisy_labels = 'drop', human = True, mhc_class = 1) eval_dataset.eval_cv(X_human_mhc1_filter, Y_human_mhc1_filter) print