# NOTE(review): these statements were fused onto one physical line; they are
# laid out here one per line at the column the line started on. If they belong
# inside an enclosing loop or function, re-indent accordingly -- confirm.

# Cross-validated accuracy of the ensemble over 3 folds; the fold mean is
# reported together with the fold standard deviation and recorded in d.
accs = sklearn.cross_validation.cross_val_score(ensemble, X, Y, cv=3)
acc = np.mean(accs)
print("CV accuracy %0.4f (std %0.4f)" % (acc, np.std(accs)))
d['cv_acc'].append(acc)

# Same procedure scored by ROC AUC, this time over 5 folds.
aucs = sklearn.cross_validation.cross_val_score(
    ensemble, X, Y, cv=5, scoring='roc_auc')
auc = np.mean(aucs)
print("CV AUC %0.4f (std %0.4f)" % (auc, np.std(aucs)))
d['cv_auc'].append(auc)

# Refit on the full X, Y before scoring the two held-out peptide sets.
ensemble.fit(X, Y)

# Tumor antigens: the reported "accuracy" is the mean of the predictions,
# i.e. the fraction predicted positive -- assumes predict() returns 0/1
# labels and that every cancer peptide counts as a true positive (confirm).
X_pos_test = vectorizer.transform(cancer_peptides)
Y_pos_pred = ensemble.predict(X_pos_test)
pos_acc = np.mean(Y_pos_pred)
print("Tumor antigen accuracy %0.4f" % (pos_acc,))
d['pos_acc'].append(pos_acc)

# Non-immunogenic HIV peptides: one minus the mean prediction, i.e. the
# fraction predicted negative under the same 0/1 assumption.
X_neg_test = vectorizer.transform(non_immunogenic_hiv_peptides)
Y_neg_pred = ensemble.predict(X_neg_test)
neg_acc = 1.0 - np.mean(Y_neg_pred)
print("Non-immunogenic accuracy %0.4f" % (neg_acc,))
d['neg_acc'].append(neg_acc)

# Sum of the tumor-antigen predictions (count of positives for 0/1 labels).
n_pos_pred = np.sum(Y_pos_pred)
# NOTE(review): the statements below arrived fused onto one physical line, so
# where each `#` comment ends and live code resumes is inferred, not certain.
# Reading the tokens in order:
#   * `X -= ord('0')` / `return X` subtract ord('0') from X in place and
#     return it; the `return` means they close a function whose header is not
#     shown here (presumably strings_to_array, which is called next --
#     confirm).
#   * X is built by strings_to_array(X_combined); Y and W are np.array
#     conversions of Y_combined and W_combined. Two prints report len(imm),
#     len(non), X.shape and np.sum(Y).
#   * rf is a BalancedEnsembleClassifier(n_estimators = 200). The `#aucs`,
#     `#print`, `#d[...]` and `#rf = RandomForestClassifier(...)` fragments
#     look like disabled code; `rf.fit(X, Y, W)` after them is presumably live
#     (W is passed as the third positional argument; its meaning to fit() is
#     not visible here -- TODO confirm it is a per-sample weight).
#   * predict(peptides) allocates a float accumulator Y_pred and an int array
#     counts, one slot per peptide; expands the peptides via expand() (three
#     return values, the second discarded); encodes the expanded strings with
#     strings_to_array; takes column 1 of rf.predict_proba; maps p to
#     2 * (p - 0.5) and squares it while keeping the sign; then adds each
#     weighted score into Y_pred at the index paired with it in Indices.
#     counts is not updated in the statements shown and predict() has no
#     return yet, so the function presumably continues past this line.
X -= ord('0') return X X = strings_to_array(X_combined) Y = np.array(Y_combined) W = np.array(W_combined) print "# imm = %d, # non = %d" % (len(imm), len(non)) print "Data shape", X.shape, "n_true", np.sum(Y) rf = BalancedEnsembleClassifier(n_estimators = 200) #aucs = sklearn.cross_validation.cross_val_score( # rf, X, Y, cv = 10, scoring='roc_auc') #print "CV AUC %0.4f (std %0.4f)" % (np.mean(aucs), np.std(aucs)) #d['cv_auc'].append(np.mean(aucs)) #rf = RandomForestClassifier(n_estimators = 100) rf.fit(X, Y, W) def predict(peptides): Y_pred = np.zeros(len(peptides), dtype=float) counts = np.zeros(len(peptides), dtype=int) X_test, _, Indices = expand(peptides) X_test = strings_to_array(X_test) #Y_pred_raw = rf.predict(X_test) Y_pred_prob = rf.predict_proba(X_test)[:, 1] Y_pred_rescaled = (2 * (Y_pred_prob - 0.5)) Y_pred_weight = np.sign(Y_pred_rescaled) * Y_pred_rescaled ** 2 # group outputs by the sample they came from, # at the end we'll have the majority vote #Y_pred = rf.predict(X_test) for (y,i) in zip(Y_pred_weight, Indices): Y_pred[i] += y