alphabet_dict =  reduced_alphabet.hp2
            elif alphabet == 'aromatic2':
                alphabet_dict = reduced_alphabet.aromatic2
            else:
                assert alphabet is None, alphabet
                alphabet_dict = None
            # Load the feature matrix X, the labels Y and the vectorizer
            # (requested via return_transformer = True) for the current assay
            # group and n-gram size. alphabet_dict is None when no reduced
            # alphabet was selected by the branch above.
            X, Y, vectorizer = iedb.load_tcell_ngrams(
                assay_group = assay,
                human = True,
                mhc_class = 1,
                max_ngram = max_ngram,
                reduced_alphabet = alphabet_dict,
                min_count = None,
                return_transformer = True)
            # np.sum(Y) is reported as "n_true" -- presumably Y holds 0/1
            # labels, so this is the positive count; confirm against the loader.
            print "Data shape", X.shape, "n_true", np.sum(Y)
            ensemble = BalancedEnsembleClassifier()

            # 5-fold cross-validation with the estimator's default scorer
            # (reported here as accuracy); the mean goes into d['acc'].
            accs = sklearn.cross_validation.cross_val_score(
                ensemble, X, Y, cv = 5)
            print "CV accuracy %0.4f (std %0.4f)" % \
                (np.mean(accs), np.std(accs))
            d['acc'].append(np.mean(accs))

            # Same 5 folds setting, scored by ROC AUC; the mean goes into
            # d['auc'].
            aucs = sklearn.cross_validation.cross_val_score(
                ensemble, X, Y, cv = 5, scoring='roc_auc')
            print "CV AUC %0.4f (std %0.4f)" % \
                (np.mean(aucs), np.std(aucs))
            d['auc'].append(np.mean(aucs))

            # Fit on the full data set. NOTE(review): cross_val_score is
            # documented to work on clones of the estimator, so `ensemble`
            # should still be unfitted at this point -- confirm for the
            # sklearn version in use.
            ensemble.fit(X, Y)
                print param_str
                d['assay'].append(assay)
                d['alphabet'].append(alphabet)
                d['ngram'].append(max_ngram)
                d['mhc'].append(mhc_class)

                X, Y, vectorizer = iedb.load_tcell_ngrams(
                    assay_group = assay,
                    human = True,
                    mhc_class = 1,
                    max_ngram = max_ngram,
                    reduced_alphabet = alphabet_dict,
                    min_count = None,
                    return_transformer = True)
                print "Data shape", X.shape, "n_true", np.sum(Y)
                ensemble = BalancedEnsembleClassifier()

                accs = sklearn.cross_validation.cross_val_score(
                    ensemble, X, Y, cv = 3)
                acc = np.mean(accs)
                print "CV accuracy %0.4f (std %0.4f)" % \
                    (acc, np.std(accs))
                d['cv_acc'].append(acc)

                aucs = sklearn.cross_validation.cross_val_score(
                    ensemble, X, Y, cv = 5, scoring='roc_auc')
                auc = np.mean(aucs)
                print "CV AUC %0.4f (std %0.4f)" % \
                    (auc, np.std(aucs))
                d['cv_auc'].append(auc)
                def strings_to_array(strings):
                    """Turn digit strings into a 2-D uint8 array of digit values.

                    The strings are concatenated and the resulting characters
                    are laid out in rows of `kmer_length` (taken from the
                    enclosing scope). Each character is replaced by its
                    offset from '0', so '0' -> 0, '1' -> 1, and so on.

                    Raises ValueError (from reshape) when the total number of
                    characters is not a multiple of `kmer_length`.
                    """
                    all_strings = ''.join(strings)
                    # np.frombuffer needs a bytes-like object; a Python 2 str
                    # already is one, anything else is encoded first.
                    if not isinstance(all_strings, bytes):
                        all_strings = all_strings.encode('ascii')
                    # frombuffer replaces the deprecated binary mode of
                    # np.fromstring. It returns a read-only view, so copy
                    # before the in-place subtraction below.
                    X = np.frombuffer(all_strings, dtype='uint8').copy()
                    # Floor division keeps the row count an integer even when
                    # true division is in effect.
                    m = len(X) // kmer_length
                    X = X.reshape((m, kmer_length))
                    X -= ord('0')
                    return X

                # Encode the combined k-mer strings as a 2-D array of digit
                # values (one row per kmer_length characters) and convert the
                # parallel label and weight lists to arrays.
                X = strings_to_array(X_combined)
                Y = np.array(Y_combined)
                W = np.array(W_combined)
                print "# imm = %d, # non = %d" % (len(imm), len(non))
                print "Data shape", X.shape, "n_true", np.sum(Y)

                rf = BalancedEnsembleClassifier(n_estimators = 200)
                # Disabled: 10-fold CV AUC for this model, and an alternative
                # plain random forest.
                #aucs = sklearn.cross_validation.cross_val_score(
                #  rf, X, Y, cv = 10, scoring='roc_auc')
                #print "CV AUC %0.4f (std %0.4f)" % (np.mean(aucs), np.std(aucs))
                #d['cv_auc'].append(np.mean(aucs))
                #rf = RandomForestClassifier(n_estimators = 100)
                # W is passed as the third positional argument to fit --
                # presumably per-sample weights; confirm against
                # BalancedEnsembleClassifier.fit.
                rf.fit(X, Y, W)
                def predict(peptides):
                    Y_pred = np.zeros(len(peptides), dtype=float)
                    counts = np.zeros(len(peptides), dtype=int)
                    X_test, _, Indices = expand(peptides)
                    X_test = strings_to_array(X_test)
                    #Y_pred_raw = rf.predict(X_test)

                    Y_pred_prob = rf.predict_proba(X_test)[:, 1]
                    Y_pred_rescaled = (2 * (Y_pred_prob - 0.5))