Beispiel #1
0
def test_tcell_reduced_alphabet():
    """
    IEDB T-cell:
    Loading with the binary `hp2` amino acid alphabet should give fewer
    samples in total than loading the same number of rows without a reduced
    alphabet, because some distinct 20-letter strings collide once they are
    rewritten as 2-letter strings.
    """
    n_rows = 100
    full_pos, full_neg = iedb.load_tcell_classes(nrows = n_rows)
    reduced_pos, reduced_neg = iedb.load_tcell_classes(
        nrows = n_rows,
        reduced_alphabet = reduced_alphabet.hp2)
    # Compare the combined size of both classes before and after reduction.
    n_full = len(full_pos) + len(full_neg)
    n_reduced = len(reduced_pos) + len(reduced_neg)
    assert n_full > n_reduced
Beispiel #2
0
import numpy as np
import sklearn
import sklearn.cross_validation
import sklearn.ensemble
import sklearn.linear_model

from epitopes import iedb, amino_acid, features

"""
Compare IEDB classification AUC on:
  Logistic Regression vs. Random Forest
  9mer vs. n-gram
and:
  LR weights vs. RF feature importances
"""

imm, non = iedb.load_tcell_classes(peptide_length = 9)
X, Y = features.make_kmer_dataset(imm, non)
X_1gram, Y_1gram = features.make_ngram_dataset(imm, non, max_ngram = 1)
X_2gram, Y_2gram = features.make_ngram_dataset(imm, non, max_ngram = 2)

lr = sklearn.linear_model.LogisticRegression()

print "Amino acid 9mers w/ Logistic Regression"
print "LR Accuracy", np.mean(sklearn.cross_validation.cross_val_score(lr, X, Y, cv = 10))
lr.fit(X, Y)
print "LR coefs", lr.coef_

print "Amino acid unigrams w/ Logistic Regression"
print "LR Accuracy", np.mean(sklearn.cross_validation.cross_val_score(lr, X_1gram, Y_1gram, cv = 10))
lr.fit(X_1gram,Y_1gram)
print "LR coefs", lr.coef_
import scipy.sparse
import numpy as np
import sklearn.metrics
import sklearn.metrics.pairwise
import sklearn.utils
import sklearn.utils.graph_shortest_path

from epitopes import iedb, amino_acid
from epitopes.amino_acid import peptide_to_indices

CUTOFF = 3
SPARSE = False
ASSAY = None #'cytotoxicity'
LENGTH = 9

imm, non = iedb.load_tcell_classes(peptide_length = LENGTH, assay_group = ASSAY)
imm = list(imm)
non = list(non)

peptides = imm + non
labels = [True] * len(imm) + [False] * len(non)

X = np.array([peptide_to_indices(p) for p in peptides])
Y = np.array(labels)

n = len(labels)

D = sklearn.metrics.pairwise.pairwise_distances(X, metric='hamming')
D = np.round(D*LENGTH).astype('int')

print "Distances"
                # Body of a parameter sweep; the enclosing loop headers are
                # outside this excerpt.
                # n_features = n_letters**1 + ... + n_letters**max_ngram,
                # i.e. the number of distinct strings of each length up to
                # max_ngram over an alphabet of n_letters symbols.
                n_features = 0
                for i in xrange(max_ngram):
                    n_features += n_letters ** (i+1)

                # Skip parameter combinations with more than 500 features;
                # only combinations that are kept advance param_count.
                if n_features > 500:
                    continue
                else:
                    param_count += 1
                param_str =  \
                    "%d: Assay = '%s', ngram %s, alphabet %s, mhc_class %s" % \
                    (param_count, assay, max_ngram, alphabet, mhc_class)
                print param_str

                # T-cell classes for this assay group / MHC class.
                imm_pos, imm_neg = iedb.load_tcell_classes(
                    assay_group = assay,
                    human = True,
                    min_count = None, 
                    mhc_class = mhc_class)

                # Only the first value returned by load_mhc_classes is used.
                mhc_pos, _ = iedb.load_mhc_classes(
                    human = True, 
                    min_count = None, 
                    mhc_class = mhc_class)

                # Keep only peptides that also appear in mhc_pos
                # (presumably these are sets -- `.intersection` is called
                # on mhc_pos; verify against the iedb module).
                imm = list(mhc_pos.intersection(imm_pos))
                non = list(mhc_pos.intersection(imm_neg))

                # NOTE(review): alphabet_dict is defined outside this
                # excerpt; presumably it corresponds to `alphabet` -- confirm.
                vectorizer = PeptideVectorizer(
                    max_ngram = max_ngram,
                    reduced_alphabet = alphabet_dict)
                # Rows of X follow the order imm first, then non.
                X = vectorizer.fit_transform(imm + non)
    'acc' : [],
}


# Placeholders for the best configuration; they are not reassigned in the
# visible part of this excerpt.
best_model = None
best_vectorizer = None
best_params = None

# NOTE(review): param_count is printed in param_str below but is never
# incremented in the visible part of the loop -- confirm it is updated in
# the code past the end of this excerpt.
param_count = 0
# Grid over assay group x MHC class x min_count x reduced alphabet name.
for assay in ('cytotoxicity', None, ):
    for mhc_class in (1, None):
        for min_count in (3, 5,  None):

            # The data only depends on the three outer parameters, so it is
            # loaded once here and shared by every alphabet below.
            imm, non = iedb.load_tcell_classes(
                assay_group = assay,
                human = True,
                mhc_class = mhc_class,
                min_count = min_count)

            for alphabet in \
                    ('hp2', 'gbmr4', 'hp_vs_aromatic', 'sdm12', 'hsdm17'):
                
                transformer = reduced_alphabet.make_alphabet_transformer(alphabet)
                param_str =  \
                    "%d: Assay = '%s', min_count %s, alphabet %s, mhc_class %s" % \
                    (param_count, assay, min_count, alphabet, mhc_class)
                print param_str
                
                # Record this parameter combination in `d`, a dict of lists
                # defined above this excerpt.
                d['assay'].append(assay)
                d['alphabet'].append(alphabet)
                d['mhc'].append(mhc_class)
Beispiel #6
0
import numpy as np
import sklearn
import sklearn.cross_validation
import sklearn.ensemble
import sklearn.linear_model

from epitopes import iedb, amino_acid, features

"""
Better performance when filtering the assay group? cytotoxicity looks cleanest
"""

imm, non = iedb.load_tcell_classes(assay_group = 'cytotoxicity')
X_1gram, Y_1gram = features.make_ngram_dataset(imm, non, max_ngram = 1)
X_2gram, Y_2gram = features.make_ngram_dataset(imm, non, max_ngram = 2)

lr = sklearn.linear_model.LogisticRegression()


print "Amino acid unigrams w/ Logistic Regression"
print "LR Accuracy", np.mean(sklearn.cross_validation.cross_val_score(lr, X_1gram, Y_1gram, cv = 10))
lr.fit(X_1gram,Y_1gram)
#print "LR coefs", lr.coef_

print "Amino acid bigrams w/ Logistic Regression"
print "LR Accuracy", np.mean(sklearn.cross_validation.cross_val_score(lr, X_2gram, Y_2gram, cv = 10))
lr.fit(X_2gram,Y_2gram)
#print "LR coefs", lr.coef_


n_classifiers = 200