def test_tcell_reduced_alphabet():
    """
    IEDB T-cell: collapsing the 20-letter amino-acid alphabet down to a
    binary (hp2) alphabet should shrink the dataset, since some strings
    that are distinct over 20 letters collide once rewritten in 2 letters.
    """
    full_imm, full_non = iedb.load_tcell_classes(nrows=100)
    reduced_imm, reduced_non = iedb.load_tcell_classes(
        nrows=100,
        reduced_alphabet=reduced_alphabet.hp2)
    n_full = len(full_imm) + len(full_non)
    n_reduced = len(reduced_imm) + len(reduced_non)
    assert n_full > n_reduced
import sklearn import sklearn.cross_validation import sklearn.ensemble import sklearn.linear_model from epitopes import iedb, amino_acid, features """ Compare IEDB classification AUC on: Logistic Regression vs. Random Forest 9mer vs. n-gram and: LR weights vs. RF feature importances """ imm, non = iedb.load_tcell_classes(peptide_length = 9) X, Y = features.make_kmer_dataset(imm, non) X_1gram, Y_1gram = features.make_ngram_dataset(imm, non, max_ngram = 1) X_2gram, Y_2gram = features.make_ngram_dataset(imm, non, max_ngram = 2) lr = sklearn.linear_model.LogisticRegression() print "Amino acid 9mers w/ Logistic Regression" print "LR Accuracy", np.mean(sklearn.cross_validation.cross_val_score(lr, X, Y, cv = 10)) lr.fit(X, Y) print "LR coefs", lr.coef_ print "Amino acid unigrams w/ Logistic Regression" print "LR Accuracy", np.mean(sklearn.cross_validation.cross_val_score(lr, X_1gram, Y_1gram, cv = 10)) lr.fit(X_1gram,Y_1gram) print "LR coefs", lr.coef_
import scipy.sparse
import numpy as np
import sklearn.metrics
import sklearn.metrics.pairwise
import sklearn.utils
import sklearn.utils.graph_shortest_path

from epitopes import iedb, amino_acid
from epitopes.amino_acid import peptide_to_indices

# Experiment configuration.
CUTOFF = 3
SPARSE = False
ASSAY = None  # 'cytotoxicity' — set to None to use all assay groups
LENGTH = 9

# Fixed-length immunogenic / non-immunogenic peptides from IEDB T-cell data.
imm, non = iedb.load_tcell_classes(peptide_length = LENGTH, assay_group = ASSAY)
imm = list(imm)
non = list(non)

# Pool both classes; labels mark immunogenic (True) vs. non (False).
peptides = imm + non
labels = [True] * len(imm) + [False] * len(non)

# Encode each peptide as a vector of amino-acid indices.
X = np.array([peptide_to_indices(p) for p in peptides])
Y = np.array(labels)
n = len(labels)

# Hamming distance returns the FRACTION of mismatched positions;
# multiplying by LENGTH and rounding converts it back to a mismatch count.
D = sklearn.metrics.pairwise.pairwise_distances(X, metric='hamming')
D = np.round(D * LENGTH).astype('int')

print "Distances"
# NOTE(review): fragment — these statements are the interior of an enclosing
# parameter-sweep loop. `max_ngram`, `n_letters`, `param_count`, `assay`,
# `alphabet`, `alphabet_dict`, and `mhc_class` are all bound outside this
# view; the `continue` below targets that enclosing loop.

# Count total n-gram features: n_letters^1 + n_letters^2 + ... + n_letters^max_ngram
n_features = 0
for i in xrange(max_ngram):
    n_features += n_letters ** (i+1)

# Skip parameter combinations whose feature space is too large.
if n_features > 500:
    continue
else:
    param_count += 1

param_str = \
    "%d: Assay = '%s', ngram %s, alphabet %s, mhc_class %s" % \
    (param_count, assay, max_ngram, alphabet, mhc_class)
print param_str

# Peptides that are T-cell immunogenic vs. not, for this assay/MHC setting.
imm_pos, imm_neg = iedb.load_tcell_classes(
    assay_group = assay,
    human = True,
    min_count = None,
    mhc_class = mhc_class)

# Peptides with positive MHC binding results.
mhc_pos, _ = iedb.load_mhc_classes(
    human = True,
    min_count = None,
    mhc_class = mhc_class)

# Restrict both classes to MHC binders so the T-cell labels are comparable.
imm = list(mhc_pos.intersection(imm_pos))
non = list(mhc_pos.intersection(imm_neg))

vectorizer = PeptideVectorizer(
    max_ngram = max_ngram,
    reduced_alphabet = alphabet_dict)
X = vectorizer.fit_transform(imm + non)
# NOTE(review): fragment — the first two lines close a results dict `d`
# whose opening literal sits above this view; the final loop body also
# continues past this view.
    'acc' : [],
}

# Track the best configuration found during the sweep.
best_model = None
best_vectorizer = None
best_params = None
param_count = 0

# Grid search over assay group, MHC class, minimum publication count,
# and reduced amino-acid alphabet.
for assay in ('cytotoxicity', None, ):
    for mhc_class in (1, None):
        for min_count in (3, 5, None):
            imm, non = iedb.load_tcell_classes(
                assay_group = assay,
                human = True,
                mhc_class = mhc_class,
                min_count = min_count)
            for alphabet in \
                    ('hp2', 'gbmr4', 'hp_vs_aromatic', 'sdm12', 'hsdm17'):
                transformer = reduced_alphabet.make_alphabet_transformer(alphabet)
                param_str = \
                    "%d: Assay = '%s', min_count %s, alphabet %s, mhc_class %s" % \
                    (param_count, assay, min_count, alphabet, mhc_class)
                print param_str
                # Record this configuration's parameters in the results dict.
                d['assay'].append(assay)
                d['alphabet'].append(alphabet)
                d['mhc'].append(mhc_class)
import numpy as np
import sklearn
import sklearn.cross_validation
import sklearn.ensemble
import sklearn.linear_model

from epitopes import iedb, amino_acid, features

"""
Better performance when filtering the assay group?
cytotoxicity looks cleanest
"""

# Restrict to the cytotoxicity assay group only.
imm, non = iedb.load_tcell_classes(assay_group = 'cytotoxicity')

# Unigram and bigram featurizations of the filtered peptides.
X_1gram, Y_1gram = features.make_ngram_dataset(imm, non, max_ngram = 1)
X_2gram, Y_2gram = features.make_ngram_dataset(imm, non, max_ngram = 2)

lr = sklearn.linear_model.LogisticRegression()

# 10-fold cross-validated accuracy on unigrams, then refit on all data.
print "Amino acid unigrams w/ Logistic Regression"
print "LR Accuracy", np.mean(sklearn.cross_validation.cross_val_score(lr, X_1gram, Y_1gram, cv = 10))
lr.fit(X_1gram,Y_1gram)
#print "LR coefs", lr.coef_

# Same evaluation on bigrams.
print "Amino acid bigrams w/ Logistic Regression"
print "LR Accuracy", np.mean(sklearn.cross_validation.cross_val_score(lr, X_2gram, Y_2gram, cv = 10))
lr.fit(X_2gram,Y_2gram)
#print "LR coefs", lr.coef_

# Ensemble size — presumably consumed by a Random Forest / ensemble
# section below this view; TODO confirm.
n_classifiers = 200