# NOTE(review): this fragment starts inside an if/elif chain on `alphabet`
# whose opening branches are not visible here. The visible tail selects
# `reduced_alphabet.aromatic2` for 'aromatic2' and otherwise asserts that
# `alphabet` is None and uses no reduced alphabet (alphabet_dict = None).
#
# The statements after the chain:
#   * load X, Y and the fitted vectorizer via iedb.load_tcell_ngrams (the call
#     hard-codes human=True and mhc_class=1),
#   * print the data shape and np.sum(Y),
#   * run cross_val_score twice with cv=5 -- once with the default scoring
#     (printed as "CV accuracy") and once with scoring='roc_auc' -- appending
#     the mean of each to d['acc'] and d['auc'],
#   * finally fit the ensemble on the full X, Y.
alphabet_dict = reduced_alphabet.hp2 elif alphabet == 'aromatic2': alphabet_dict = reduced_alphabet.aromatic2 else: assert alphabet is None, alphabet alphabet_dict = None X, Y, vectorizer = iedb.load_tcell_ngrams( assay_group = assay, human = True, mhc_class = 1, max_ngram = max_ngram, reduced_alphabet = alphabet_dict, min_count = None, return_transformer = True) print "Data shape", X.shape, "n_true", np.sum(Y) ensemble = BalancedEnsembleClassifier() accs = sklearn.cross_validation.cross_val_score( ensemble, X, Y, cv = 5) print "CV accuracy %0.4f (std %0.4f)" % \ (np.mean(accs), np.std(accs)) d['acc'].append(np.mean(accs)) aucs = sklearn.cross_validation.cross_val_score( ensemble, X, Y, cv = 5, scoring='roc_auc') print "CV AUC %0.4f (std %0.4f)" % \ (np.mean(aucs), np.std(aucs)) d['auc'].append(np.mean(aucs)) ensemble.fit(X, Y)
# Evaluate one parameter combination: log it, record its settings in the
# results table `d`, load the matching dataset and append the cross-validated
# accuracy and ROC AUC.
print(param_str)

# One column per setting; every list in `d` grows by one entry per run.
d['assay'].append(assay)
d['alphabet'].append(alphabet)
d['ngram'].append(max_ngram)
d['mhc'].append(mhc_class)

# Load the data for the MHC class that was just recorded in d['mhc'].
# The call previously hard-coded `mhc_class = 1`, so the recorded value did
# not describe the data that was actually evaluated.
X, Y, vectorizer = iedb.load_tcell_ngrams(
    assay_group=assay,
    human=True,
    mhc_class=mhc_class,
    max_ngram=max_ngram,
    reduced_alphabet=alphabet_dict,
    min_count=None,
    return_transformer=True)
print("Data shape %s n_true %s" % (X.shape, np.sum(Y)))

ensemble = BalancedEnsembleClassifier()

# NOTE(review): accuracy is estimated with 3 folds while AUC below uses 5 --
# confirm the mismatch is intentional before comparing the two columns.
accs = sklearn.cross_validation.cross_val_score(ensemble, X, Y, cv=3)
acc = np.mean(accs)
print("CV accuracy %0.4f (std %0.4f)" % (acc, np.std(accs)))
d['cv_acc'].append(acc)

aucs = sklearn.cross_validation.cross_val_score(
    ensemble, X, Y, cv=5, scoring='roc_auc')
auc = np.mean(aucs)
print("CV AUC %0.4f (std %0.4f)" % (auc, np.std(aucs)))
d['cv_auc'].append(auc)
def strings_to_array(strings):
    """Pack a sequence of equal-length strings into a 2-D uint8 array.

    The strings are concatenated, reinterpreted byte-by-byte as uint8 and
    reshaped into rows of ``kmer_length`` columns (``kmer_length`` is read
    from the enclosing scope).  ``ord('0')`` is then subtracted from every
    entry, so the character '0' maps to 0, '1' to 1, and so on.

    Raises whatever ``reshape`` raises when the total number of characters
    is not a multiple of ``kmer_length``.
    """
    all_strings = ''.join(strings)
    X = np.fromstring(all_strings, dtype='uint8')
    # Floor division keeps the row count an int even when true division is
    # in effect (Python 3 or `from __future__ import division`); plain `/`
    # would hand reshape a float there.
    m = len(X) // kmer_length
    X = X.reshape((m, kmer_length))
    X -= ord('0')
    return X


X = strings_to_array(X_combined)
Y = np.array(Y_combined)
W = np.array(W_combined)

print("# imm = %d, # non = %d" % (len(imm), len(non)))
print("Data shape %s n_true %s" % (X.shape, np.sum(Y)))

rf = BalancedEnsembleClassifier(n_estimators=200)

# Cross-validation is currently disabled:
#aucs = sklearn.cross_validation.cross_val_score(
#    rf, X, Y, cv = 10, scoring='roc_auc')
#print "CV AUC %0.4f (std %0.4f)" % (np.mean(aucs), np.std(aucs))
#d['cv_auc'].append(np.mean(aucs))
#rf = RandomForestClassifier(n_estimators = 100)

# W is passed as the third positional argument of fit().
# NOTE(review): presumably per-sample weights -- confirm against
# BalancedEnsembleClassifier.fit.
rf.fit(X, Y, W)


# NOTE(review): the body of predict() continues past the end of the visible
# chunk; the statements below are carried over unchanged.
def predict(peptides):
    Y_pred = np.zeros(len(peptides), dtype=float)
    counts = np.zeros(len(peptides), dtype=int)
    X_test, _, Indices = expand(peptides)
    X_test = strings_to_array(X_test)
    #Y_pred_raw = rf.predict(X_test)
    Y_pred_prob = rf.predict_proba(X_test)[:, 1]
    # Map probabilities from [0, 1] onto [-1, 1].
    Y_pred_rescaled = (2 * (Y_pred_prob - 0.5))