コード例 #1
0
ファイル: test_speed.py プロジェクト: minghao2016/mhcflurry
def setup():
    """Initialize the module-level predictors used by the tests."""
    global ALLELE_SPECIFIC_PREDICTOR, PAN_ALLELE_PREDICTOR
    startup()
    allele_specific_path = get_path("models_class1", "models")
    pan_allele_path = get_path("models_class1_pan", "models.with_mass_spec")
    ALLELE_SPECIFIC_PREDICTOR = Class1AffinityPredictor.load(allele_specific_path)
    PAN_ALLELE_PREDICTOR = Class1AffinityPredictor.load(pan_allele_path)
コード例 #2
0
def setup():
    """Populate the global PREDICTORS mapping with loaded models."""
    global PREDICTORS
    startup()
    model_paths = {
        'allele-specific': get_path("models_class1", "models"),
        'pan-allele': get_path("models_class1_pan", "models.combined"),
    }
    PREDICTORS = {
        label: Class1AffinityPredictor.load(path)
        for (label, path) in model_paths.items()
    }
コード例 #3
0
def setup():
    """Load the combined pan-allele predictor for the tests."""
    global PAN_ALLELE_PREDICTOR
    startup()
    models_path = get_path("models_class1_pan", "models.combined")
    # optimization_level=0: load without load-time optimizations.
    PAN_ALLELE_PREDICTOR = Class1AffinityPredictor.load(
        models_path, optimization_level=0)
コード例 #4
0
def run_and_check(n_jobs=0, delete=True, additional_args=None):
    """Run mhcflurry-calibrate-percentile-ranks on a copy of the pan-allele
    models stripped of calibration data, and verify the calibration lands.

    Parameters
    ----------
    n_jobs : int
        Value passed to --num-jobs.
    delete : bool
        Remove the temporary models directory when done.
    additional_args : list of str, optional
        Extra command-line arguments appended to the invocation.
    """
    # BUG FIX: a mutable default ([]) is shared across calls; use None.
    if additional_args is None:
        additional_args = []
    source_models_dir = get_path("models_class1_pan", "models.combined")
    dest_models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")

    # Save a new predictor that has no percent rank calibration data.
    original_predictor = Class1AffinityPredictor.load(source_models_dir)
    print("Loaded predictor", source_models_dir)
    new_predictor = Class1AffinityPredictor(
        class1_pan_allele_models=original_predictor.class1_pan_allele_models,
        allele_to_sequence=original_predictor.allele_to_sequence,
    )
    new_predictor.save(dest_models_dir)
    print("Saved predictor to", dest_models_dir)

    # Sanity check: the fresh copy carries no calibration transforms.
    new_predictor = Class1AffinityPredictor.load(dest_models_dir)
    assert_equal(len(new_predictor.allele_to_percent_rank_transform), 0)

    args = [
        "mhcflurry-calibrate-percentile-ranks",
        "--models-dir",
        dest_models_dir,
        "--match-amino-acid-distribution-data",
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"),
        "--motif-summary",
        "--num-peptides-per-length",
        "1000",
        "--allele",
        "HLA-A*02:01",
        "HLA-B*07:02",
        "--verbosity",
        "1",
        "--num-jobs",
        str(n_jobs),
    ] + additional_args
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    # Calibration should now cover exactly the two requested alleles.
    new_predictor = Class1AffinityPredictor.load(dest_models_dir)
    assert_equal(len(new_predictor.allele_to_percent_rank_transform), 2)

    if delete:
        print("Deleting: %s" % dest_models_dir)
        shutil.rmtree(dest_models_dir)
    else:
        print("Not deleting: %s" % dest_models_dir)
コード例 #5
0
def initialise_prediction():
    """Load the default MHCflurry model into the module-level `predictor`."""
    # Silence TensorFlow C++ logging before mhcflurry pulls in TF/Keras.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    from mhcflurry import Class1AffinityPredictor
    global predictor
    predictor = Class1AffinityPredictor.load()
コード例 #6
0
ファイル: mhcflurry.py プロジェクト: shah-newaz/mhctools
    def __init__(self,
                 alleles,
                 default_peptide_lengths=None,
                 predictor=None,
                 models_path=None):
        """
        Parameters
        -----------
        alleles : list of str

        default_peptide_lengths : list of int
            Defaults to [9] when not given.

        predictor : mhcflurry.Class1AffinityPredictor (optional)
            MHCflurry predictor to use

        models_path : string
            Models dir to use if predictor argument is None

        """
        # BUG FIX: avoid a mutable default argument; [9] is the historical
        # default and is materialized per-call here.
        if default_peptide_lengths is None:
            default_peptide_lengths = [9]
        # moving import here since the mhcflurry package imports
        # Keras and its backend (either Theano or TF) which end up
        # slowing down responsive for any CLI application using MHCtools
        from mhcflurry import Class1AffinityPredictor
        BasePredictor.__init__(self,
                               alleles=alleles,
                               default_peptide_lengths=default_peptide_lengths,
                               min_peptide_length=8,
                               max_peptide_length=15)
        if predictor:
            self.predictor = predictor
        elif models_path:
            logging.info("Loading MHCflurry models from %s" % models_path)
            self.predictor = Class1AffinityPredictor.load(models_path)
        else:
            self.predictor = Class1AffinityPredictor.load()

        # relying on BasePredictor and MHCflurry to both normalize
        # allele names the same way using mhcnames
        for allele in self.alleles:
            if allele not in self.predictor.supported_alleles:
                raise UnsupportedAllele(allele)
コード例 #7
0
ファイル: ANN.py プロジェクト: e-dorigatti/Fred2
        def predict(self, peptides, alleles=None, binary=False, **kwargs):
            """Predict binding affinities with MHCflurry.

            Returns an EpitopePredictionResult (multi-indexed DataFrame,
            (Seq, Method) rows, alleles as columns). With `binary`,
            affinities <= 500 nM map to 1.0 and everything else to 0.0.
            """
            # test whether one peptide or a list
            if not isinstance(peptides, list):
                peptides = [peptides]

            # if no alleles are specified do predictions for all supported alleles
            if alleles is None:
                alleles = self.supportedAlleles
            else:
                # filter for supported alleles
                alleles = filter(lambda a: a in self.supportedAlleles, alleles)

            alleles = self.convert_alleles(alleles)

            # test mhcflurry models are available => download if not
            proc = subprocess.Popen(
                ['mhcflurry-downloads', 'path', 'models_class1'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
            proc.communicate()
            # BUG FIX: the original wrote "if p is not 0:", which compares the
            # Popen object itself to the integer 0 and is always true. Check
            # the process exit status instead.
            if proc.returncode != 0:
                subprocess.call(
                    ['mhcflurry-downloads', 'fetch', 'models_class1'])

            # load model
            predictor = Class1AffinityPredictor.load()

            # predict and assign binding affinities
            result = {}
            for a in alleles:
                allele_repr = self.revert_allele_repr(a)
                result[allele_repr] = {}
                for p in peptides:
                    seq = p.__str__()
                    binding_affinity = predictor.predict(allele=a,
                                                         peptides=[seq])[0]
                    if binary:
                        if binding_affinity <= 500:
                            result[allele_repr][p] = 1.0
                        else:
                            result[allele_repr][p] = 0.0
                    else:
                        result[allele_repr][p] = binding_affinity

            # create EpitopePredictionResult object. This is a multi-indexed DataFrame
            # with Peptide and Method as multi-index and alleles as columns
            df_result = EpitopePredictionResult.from_dict(result)
            df_result.index = pandas.MultiIndex.from_tuples(
                [tuple((i, self.name)) for i in df_result.index],
                names=['Seq', 'Method'])
            return df_result
コード例 #8
0
ファイル: base.py プロジェクト: fenggen2016/epitopepredict
    def predict(self, sequence=None, peptides=None, length=11, overlap=1,
                      allele='HLA-A0101', name='', **kwargs):
        """Uses mhcflurry python classes for prediction.

        If `peptides` is not given, fragments of `length` (step `overlap`)
        are generated from `sequence`. Stores the prepared result frame on
        self.data and returns it.
        """
        self.sequence = sequence
        from mhcflurry import Class1AffinityPredictor
        predictor = Class1AffinityPredictor.load()
        # BUG FIX: use an identity test; "peptides == None" can broadcast or
        # raise for array-like inputs.
        if peptides is None:
            peptides, s = peptutils.create_fragments(seq=sequence,
                                                    length=length, overlap=overlap)
        df = predictor.predict_to_dataframe(peptides=peptides, allele=allele)
        df = self.prepareData(df, name, allele)
        self.data = df
        return df
コード例 #9
0
def run_and_check(n_jobs=0):
    """Train a small allele-specific model set via the CLI, calibrate
    percentile ranks, then load the result and sanity-check predictions."""
    out_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    hyperparameters_path = os.path.join(out_dir, "hyperparameters.yaml")
    with open(hyperparameters_path, "w") as handle:
        json.dump(HYPERPARAMETERS, handle)

    train_args = [
        "mhcflurry-class1-train-allele-specific-models",
        "--data",
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"),
        "--hyperparameters",
        hyperparameters_path,
        "--allele",
        "HLA-A*02:01",
        "HLA-A*03:01",
        "--out-models-dir",
        out_dir,
        "--num-jobs",
        str(n_jobs),
    ]
    print("Running with args: %s" % train_args)
    subprocess.check_call(train_args)

    # Calibrate percentile ranks
    calibrate_args = [
        "mhcflurry-calibrate-percentile-ranks",
        "--models-dir",
        out_dir,
        "--num-peptides-per-length",
        "10000",
        "--num-jobs",
        str(n_jobs),
    ]
    print("Running with args: %s" % calibrate_args)
    subprocess.check_call(calibrate_args)

    # Loading must succeed and produce a finite, percentile-annotated result.
    trained = Class1AffinityPredictor.load(out_dir)
    predictions = trained.predict(
        peptides=["SLYNTVATL"], alleles=["HLA-A*02:01"])
    assert_equal(predictions.shape, (1, ))
    assert_array_less(predictions, 1000)
    frame = trained.predict_to_dataframe(
        peptides=["SLYNTVATL"], alleles=["HLA-A*02:01"])
    print(frame)
    assert "prediction_percentile" in frame.columns

    print("Deleting: %s" % out_dir)
    shutil.rmtree(out_dir)
コード例 #10
0
ファイル: run_predictors.py プロジェクト: ramadhita/mhcflurry
def do_predictions_mhcflurry(work_item_dicts, constant_data=None):
    """
    Each dict of work items should have keys: work_item_num, peptides, alleles

    Returns a list of (work_item_num, result_dict) pairs, where each result
    maps "<allele> affinity" to the predicted affinities array.
    """

    # This may run on the cluster in a way that misses all top level imports,
    # so we have to re-import everything here.
    import time
    from mhcflurry.encodable_sequences import EncodableSequences
    from mhcflurry import Class1AffinityPredictor

    if constant_data is None:
        constant_data = GLOBAL_DATA

    args = constant_data['args']

    assert args.predictor == "mhcflurry"
    assert constant_data['cols'] == ["affinity"]

    predictor = Class1AffinityPredictor.load(args.mhcflurry_models_dir)

    results = []
    for (item_num, d) in enumerate(work_item_dicts):
        work_item_num = d['work_item_num']
        peptides = d['peptides']
        alleles = d['alleles']

        print("Processing work item", item_num + 1, "of", len(work_item_dicts))
        result = {}
        results.append((work_item_num, result))
        start = time.time()
        peptides = EncodableSequences.create(peptides)
        # BUG FIX: the inner loop reused index name `i`, shadowing the outer
        # work-item index; use a distinct name.
        for (allele_num, allele) in enumerate(alleles):
            print("Processing allele %d / %d: %0.2f sec elapsed" %
                  (allele_num + 1, len(alleles), time.time() - start))
            for col in ["affinity"]:
                result["%s %s" % (allele, col)] = predictor.predict(
                    peptides=peptides,
                    allele=allele,
                    throw=False,
                    model_kwargs={
                        'batch_size': args.mhcflurry_batch_size,
                    }).astype(constant_data['args'].result_dtype)
        print("Done predicting in", time.time() - start, "sec")
    return results
コード例 #11
0
def test_mhcflurry():
    """Cross-check MHCflurry wrapper predictions against the raw predictor."""
    wrapper = MHCflurry(alleles=[DEFAULT_ALLELE])
    binding_predictions = wrapper.predict_subsequences(protein_sequence_dict,
                                                       peptide_lengths=[9])
    eq_(4, len(binding_predictions),
        "Expected 4 binding predictions from %s" % (binding_predictions, ))

    prediction_scores = {(bp.peptide, bp.allele): bp.affinity
                         for bp in binding_predictions}

    raw_predictor = Class1AffinityPredictor.load()
    # test one prediction at a time to make sure there's no peptide/allele mixup
    for (peptide, allele), affinity in prediction_scores.items():
        prediction = raw_predictor.predict([peptide], allele=allele)
        assert len(prediction) == 1
        # we've seen results differ a bit so doing an approximate check, not an error condition
        testing.assert_almost_equal(prediction[0], affinity, decimal=0)
コード例 #12
0
def setup():
    """Load the affinity, processing, and presentation predictors used by
    the tests, each limited to a single model for speed."""
    global AFFINITY_PREDICTOR
    global CLEAVAGE_PREDICTOR
    global CLEAVAGE_PREDICTOR_NO_FLANKING
    global PRESENTATION_PREDICTOR
    startup()
    affinity_path = get_path("models_class1_pan", "models.combined")
    with_flanks_path = get_path(
        "models_class1_processing", "models.selected.with_flanks")
    no_flank_path = get_path(
        "models_class1_processing", "models.selected.no_flank")
    AFFINITY_PREDICTOR = Class1AffinityPredictor.load(
        affinity_path, optimization_level=0, max_models=1)
    CLEAVAGE_PREDICTOR = Class1ProcessingPredictor.load(
        with_flanks_path, max_models=1)
    CLEAVAGE_PREDICTOR_NO_FLANKING = Class1ProcessingPredictor.load(
        no_flank_path, max_models=1)
    PRESENTATION_PREDICTOR = Class1PresentationPredictor.load()
コード例 #13
0
def test_merge():
    """Merging the pan-allele ensemble into one network should give
    predictions close to the full ensemble's."""
    assert len(PAN_ALLELE_PREDICTOR.class1_pan_allele_models) > 1

    # Shuffled mix of 9- and 10-mers with random alleles.
    peptide_list = random_peptides(100, length=9)
    peptide_list.extend(random_peptides(100, length=10))
    peptides = pandas.Series(peptide_list).sample(frac=1.0)
    alleles = pandas.Series(
        ["HLA-A*03:01", "HLA-B*57:01", "HLA-C*02:01"]
    ).sample(n=len(peptides), replace=True)

    ensemble_predictions = PAN_ALLELE_PREDICTOR.predict(
        peptides=peptides, alleles=alleles)

    merged_network = Class1NeuralNetwork.merge(
        PAN_ALLELE_PREDICTOR.class1_pan_allele_models)
    merged_predictor = Class1AffinityPredictor(
        allele_to_sequence=PAN_ALLELE_PREDICTOR.allele_to_sequence,
        class1_pan_allele_models=[merged_network],
    )
    merged_predictions = merged_predictor.predict(
        peptides=peptides, alleles=alleles)
    numpy.testing.assert_allclose(
        ensemble_predictions, merged_predictions, atol=0.1)
コード例 #14
0
 def predict(self, input_file, allele, epitope_length, iedb_executable_path,
             iedb_retries):
     """Predict epitope affinities for FASTA-like numbered sequences.

     `input_file` yields ">N" header lines followed by sequence lines.
     Returns (results DataFrame, 'pandas').
     """
     predictor = Class1AffinityPredictor.load()
     frames = []
     # Guard against input that starts with a sequence line before any
     # ">N" header (seq_num would otherwise be unbound).
     seq_num = None
     for line in input_file:
         match = re.search('^>([0-9]+)$', line)
         if match:
             seq_num = match.group(1)
         else:
             epitopes = self.determine_neoepitopes(line.rstrip(),
                                                   epitope_length)
             df = predictor.predict_to_dataframe(allele=allele,
                                                 peptides=epitopes)
             df['seq_num'] = seq_num
             df['start'] = df.index + 1
             df.rename(columns={
                 'prediction': 'ic50',
                 'prediction_percentile': 'percentile'
             },
                       inplace=True)
             frames.append(df)
     # BUG FIX: DataFrame.append was deprecated and removed in pandas 2.x;
     # collect frames and concatenate once (also avoids quadratic copying).
     results = pd.concat(frames) if frames else pd.DataFrame()
     return (results, 'pandas')
コード例 #15
0
def test_class1_affinity_predictor_a0205_memorize_training_data():
    """Train a tiny ensemble that memorizes quantitative kim2014 A*02:05
    9-mers, then exercise prediction, percentile ranks, and error paths."""
    hyperparameters = dict(
        activation="tanh",
        layer_sizes=[64],
        max_epochs=100,
        early_stopping=False,
        validation_split=0.0,
        locally_connected_layers=[],
        dense_layer_l1_regularization=0.0,
        dropout_probability=0.0)

    allele = "HLA-A*02:05"

    df = pandas.read_csv(
        get_path(
            "data_curated", "curated_training_data.affinity.csv.bz2"))
    # Single combined mask instead of chained .loc filters.
    mask = (
        (df.allele == allele) &
        (df.peptide.str.len() == 9) &
        (df.measurement_type == "quantitative") &
        (df.measurement_source == "kim2014"))
    df = df.loc[mask]

    predictor = Class1AffinityPredictor()
    predictor.fit_allele_specific_predictors(
        n_models=2,
        architecture_hyperparameters_list=[hyperparameters],
        allele=allele,
        peptides=df.peptide.values,
        affinities=df.measurement_value.values,
        verbose=0,
    )
    predictor.calibrate_percentile_ranks(num_peptides_per_length=1000)

    # Predictions should roughly reproduce the memorized training data.
    ic50_pred = predictor.predict(df.peptide.values, allele=allele)
    ic50_true = df.measurement_value.values
    eq_(len(ic50_pred), len(ic50_true))
    testing.assert_allclose(
        numpy.log(ic50_pred),
        numpy.log(ic50_true),
        rtol=0.2,
        atol=0.2)

    pred_frame = predictor.predict_to_dataframe(
        df.peptide.values, allele=allele)
    print(pred_frame)
    assert 'prediction_percentile' in pred_frame.columns
    assert pred_frame.prediction_percentile.isnull().sum() == 0

    pred_frame_individual = predictor.predict_to_dataframe(
        df.peptide.values,
        allele=allele,
        include_individual_model_predictions=True)
    print(pred_frame_individual)

    # Test an unknown allele
    print("Starting unknown allele check")
    eq_(predictor.supported_alleles, [allele])
    ic50_pred = predictor.predict(
        df.peptide.values,
        allele="HLA-A*02:01",
        throw=False)
    assert numpy.isnan(ic50_pred).all()

    assert_raises(
        ValueError,
        predictor.predict,
        df.peptide.values,
        allele="HLA-A*02:01")

    # Out-of-range peptide lengths raise unless throw=False.
    eq_(predictor.supported_alleles, [allele])
    assert_raises(
        ValueError,
        predictor.predict,
        ["AAAAA"],  # too short
        allele=allele)
    assert_raises(
        ValueError,
        predictor.predict,
        ["AAAAAAAAAAAAAAAAAAAA"],  # too long
        allele=allele)
    ic50_pred = predictor.predict(
        ["AAAAA", "AAAAAAAAA", "AAAAAAAAAAAAAAAAAAAA"],
        allele=allele,
        throw=False)
    assert numpy.isnan(ic50_pred[0])
    assert not numpy.isnan(ic50_pred[1])
    assert numpy.isnan(ic50_pred[2])
コード例 #16
0
def setup():
    """(Re)load the default downloaded predictor and enable debug logging."""
    global DOWNLOADED_PREDICTOR
    startup()
    logging.basicConfig(level=logging.DEBUG)
    DOWNLOADED_PREDICTOR = Class1AffinityPredictor.load()
コード例 #17
0
    def check_allele_valid(self, allele):
        """Exit with an explanatory message if `allele` is unsupported."""
        if allele in self.valid_allele_names():
            return
        sys.exit(
            "Allele %s not valid for method %s. Run `pvacseq valid_alleles %s` for a list of valid allele names."
            % (allele, self.__class__.__name__, self.__class__.__name__))


class MHCI(PredictionClass, metaclass=ABCMeta):
    # Abstract base for MHC class I prediction methods.
    @property
    def needs_epitope_length(self):
        # Class I methods report that an epitope length is required.
        return True


# Loaded once at module import; MHCflurry.valid_allele_names reads from it.
mhcflurry_predictor = Class1AffinityPredictor.load()


class MHCflurry(MHCI):
    def valid_allele_names(self):
        # Delegate to the module-level predictor loaded at import time.
        return mhcflurry_predictor.supported_alleles

    def check_length_valid_for_allele(self, length, allele):
        # All lengths are accepted for every allele here; length limits are
        # expressed via valid_lengths_for_allele instead.
        return True

    def valid_lengths_for_allele(self, allele):
        # Fixed peptide-length range, independent of the allele.
        return [8, 9, 10, 11, 12, 13, 14]

    def determine_neoepitopes(self, sequence, length):
        epitopes = []
        for i in range(0, len(sequence) - length + 1):
コード例 #18
0
import traceback
import sys

import numpy
import pandas
numpy.random.seed(0)

from mhcflurry import Class1AffinityPredictor

from nose.tools import eq_, assert_raises
from numpy import testing

from mhcflurry.downloads import get_path
from mhcflurry.testing_utils import cleanup, startup

# Eagerly load the default predictor at import; setup() reloads it per run.
DOWNLOADED_PREDICTOR = Class1AffinityPredictor.load()


def setup():
    """Reload the downloaded predictor and enable verbose logging."""
    # BUG FIX: `logging` is not imported at the top of this file, so the
    # basicConfig call below would raise NameError; import it locally.
    import logging
    global DOWNLOADED_PREDICTOR
    startup()
    DOWNLOADED_PREDICTOR = Class1AffinityPredictor.load()
    logging.basicConfig(level=logging.DEBUG)


def teardown():
    """Drop the global predictor reference and run mhcflurry test cleanup."""
    global DOWNLOADED_PREDICTOR
    # Release the predictor so cleanup() can reclaim backend resources.
    DOWNLOADED_PREDICTOR = None
    cleanup()

コード例 #19
0
This script includes MHCflurry model

"""
from mhcflurry import Class1AffinityPredictor
from mhcflurry.downloads import get_path
import pandas as pd
import numpy as np

# Load curated training data and keep 8-11mer peptides.
data_path = get_path('data_curated', 'curated_training_data.no_mass_spec.csv.bz2')
# BUG FIX: this file imports "pandas as pd", so the bare name `pandas`
# used originally would raise NameError.
df = pd.read_csv(data_path)
df = df.loc[(df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 11)]

models = {}
for hla in HLAs:
    new_predictor = Class1AffinityPredictor()
    if df.loc[df.allele == hla].shape[0] > 0:
        single_allele_train_data = df.loc[df.allele == hla].sample(21, replace=True)
    else:
        # No training data for this allele: record an empty placeholder.
        models[hla] = ''
        continue
    # NOTE(review): allele is hard-coded to HLA-B*57:01 even though the
    # training data is selected per `hla` — confirm this is intentional.
    model = new_predictor.fit_allele_specific_predictors(
        n_models=1,
        architecture_hyperparameters_list=[{
            "layer_sizes": [16],
            "max_epochs": 5,
            "random_negative_constant": 5,
        }],
        peptides=single_allele_train_data.peptide.values,
        affinities=single_allele_train_data.measurement_value.values,
        allele="HLA-B*57:01")
    models[hla] = model
    
binding_affinity = []
for i in range(len(test_peptide)):
    ba = float('inf')
    for hla in test_onehot[i]:
        if (hla not in models.keys()):
            if(df.loc[df.allele == hla].shape[0]>0):
                single_allele_train_data = df.loc[df.allele == hla].sample(21, replace = True)
コード例 #20
0
def run_and_check(n_jobs=0, delete=True, additional_args=None):
    """Train pan-allele models via the CLI, run model selection, and check
    the selected predictor produces a sane prediction.

    Parameters
    ----------
    n_jobs : int
        Value passed to --num-jobs.
    delete : bool
        Remove the temporary model directories on success.
    additional_args : list of str, optional
        Extra command-line arguments appended to both invocations.
    """
    # BUG FIX: a mutable default ([]) is shared across calls; use None.
    if additional_args is None:
        additional_args = []
    models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    hyperparameters_filename = os.path.join(models_dir, "hyperparameters.yaml")
    with open(hyperparameters_filename, "w") as fd:
        json.dump(HYPERPARAMETERS_LIST, fd)

    # Restrict training data to HLA-A alleles to keep the test small.
    data_df = pandas.read_csv(
        get_path("data_curated", "curated_training_data.no_mass_spec.csv.bz2"))
    selected_data_df = data_df.loc[data_df.allele.str.startswith("HLA-A")]
    selected_data_df.to_csv(os.path.join(models_dir, "_train_data.csv"),
                            index=False)

    args = [
        "mhcflurry-class1-train-pan-allele-models",
        "--data",
        os.path.join(models_dir, "_train_data.csv"),
        "--allele-sequences",
        get_path("allele_sequences", "allele_sequences.csv"),
        "--hyperparameters",
        hyperparameters_filename,
        "--out-models-dir",
        models_dir,
        "--num-jobs",
        str(n_jobs),
        "--num-folds",
        "2",
        "--verbosity",
        "1",
    ] + additional_args
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    # Run model selection
    models_dir_selected = tempfile.mkdtemp(
        prefix="mhcflurry-test-models-selected")
    args = [
        "mhcflurry-class1-select-pan-allele-models",
        "--data",
        os.path.join(models_dir, "train_data.csv.bz2"),
        "--models-dir",
        models_dir,
        "--out-models-dir",
        models_dir_selected,
        "--max-models",
        "1",
        "--num-jobs",
        str(n_jobs),
    ] + additional_args
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    result = Class1AffinityPredictor.load(models_dir_selected,
                                          optimization_level=0)
    assert_equal(len(result.neural_networks), 2)
    predictions = result.predict(peptides=["SLYNTVATL"],
                                 alleles=["HLA-A*02:01"])
    assert_equal(predictions.shape, (1, ))
    assert_array_less(predictions, 1000)

    if delete:
        print("Deleting: %s" % models_dir)
        shutil.rmtree(models_dir)
        shutil.rmtree(models_dir_selected)
コード例 #21
0
ファイル: prediction_class.py プロジェクト: tmooney/pVACtools
 def valid_allele_names(self):
     """Return the alleles supported by the default downloaded predictor."""
     return Class1AffinityPredictor.load().supported_alleles
コード例 #22
0
def setup():
    """Reload the default downloaded predictor for the test module."""
    global DOWNLOADED_PREDICTOR
    startup()
    DOWNLOADED_PREDICTOR = Class1AffinityPredictor.load()
コード例 #23
0
import numpy as np
import pandas as pd
from pathlib import Path
from mhcflurry import Class1AffinityPredictor
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

# Load data and models - change path as necessary
DATA = Path("./data")
ms_file = DATA / "abelin_peptides.mhcflurry_no_mass_spec.csv"
# NOTE(review): user-specific absolute path — adjust for your machine.
MODELS = Path("/Users/haemin/Library/Application Support/mhcflurry/4/1.4.0")

no_ms_model = MODELS / "models_class1_selected_no_mass_spec/models"
no_ms_predictor = Class1AffinityPredictor.load(no_ms_model)

# Read data file and initial cleanup
ms = pd.read_csv(ms_file)
ms = ms.rename(columns={"mhcflurry": "mhcflurry2"})
# -1 acts as a placeholder for not-yet-computed mhcflurry4 predictions.
ms["mhcflurry4"] = np.full_like(ms.mhcflurry2, -1)
# Keep only rows whose allele the loaded predictor supports.
ms = ms.loc[ms.allele.isin(no_ms_predictor.supported_alleles), :]
ms["peptide_len"] = ms.peptide.str.len()
# Bucket all peptides longer than 12 into a single length class (13).
ms.loc[ms["peptide_len"] > 12, "peptide_len"] = 13
'''
    Generate Figure A
'''
# Compute PPV values
models = ["netmhc", "netmhcpan", "mhcflurry2", "mhcflurry4"]
alleles = ms.allele[ms.allele.isin(no_ms_predictor.supported_alleles)].unique()

# Compute predictions for mhcflurry4
コード例 #24
0
ファイル: ANN.py プロジェクト: lkuchenb/fred
        def predict(self, peptides, alleles=None, binary=False, **kwargs):
            """Predict binding affinities with MHCflurry, grouped by length.

            Returns an EpitopePredictionResult (multi-indexed DataFrame,
            (Seq, Method) rows, alleles as columns). With `binary`,
            affinities <= 500 nM map to 1.0 and everything else to 0.0.
            Raises ValueError when no prediction could be made.
            """
            # test whether one peptide or a list
            if not isinstance(peptides, list):
                peptides = [peptides]

            # if no alleles are specified do predictions for all supported alleles
            if alleles is None:
                alleles = self.supportedAlleles
            else:
                # filter for supported alleles
                alleles = filter(lambda a: a in self.supportedAlleles, alleles)
            alleles = self.convert_alleles(alleles)

            # test mhcflurry models are available => download if not
            proc = subprocess.Popen(
                ['mhcflurry-downloads', 'path', 'models_class1'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
            proc.communicate()
            # BUG FIX: the original wrote "if p is not 0:", comparing the
            # Popen object itself to 0 (always true). Check the exit status.
            if proc.returncode != 0:
                subprocess.call(
                    ['mhcflurry-downloads', 'fetch', 'models_class1'])

            # load model
            predictor = Class1AffinityPredictor.load()

            # prepare results dictionary
            result = defaultdict(defaultdict)

            # group peptides by length
            peptides.sort(key=len)
            for length, peps in itertools.groupby(peptides, key=len):
                if length not in self.supportedLength:
                    # logging.warn is a deprecated alias of logging.warning.
                    logging.warning(
                        "Peptide length must be at least %i or at most %i for %s but is %i"
                        % (min(self.supportedLength), max(
                            self.supportedLength), self.name, length))
                    continue
                peps = list(peps)

                # predict and assign binding affinities
                for a in alleles:
                    allele_repr = self.revert_allele_repr(a)
                    for p in peps:
                        binding_affinity = predictor.predict(allele=a,
                                                             peptides=[str(p)
                                                                       ])[0]
                        if binary:
                            if binding_affinity <= 500:
                                result[allele_repr][p] = 1.0
                            else:
                                result[allele_repr][p] = 0.0
                        else:
                            result[allele_repr][p] = binding_affinity

            if not result:
                raise ValueError(
                    "No predictions could be made with " + self.name +
                    " for given input. Check your epitope length and HLA allele combination."
                )

            # create EpitopePredictionResult object. This is a multi-indexed DataFrame
            # with Peptide and Method as multi-index and alleles as columns
            df_result = EpitopePredictionResult.from_dict(result)
            df_result.index = pandas.MultiIndex.from_tuples(
                [tuple((i, self.name)) for i in df_result.index],
                names=['Seq', 'Method'])
            return df_result
コード例 #25
0
def test_a1_known_epitopes_in_newly_trained_model():
    """Train a fresh HLA-A*01:01 predictor and verify a known epitope,
    including after save/load round-trips and model subsetting."""
    allele = "HLA-A*01:01"
    df = pandas.read_csv(
        get_path(
            "data_curated", "curated_training_data.affinity.csv.bz2"))
    # Single combined mask rather than a chained filter.
    mask = (
        (df.allele == allele) &
        (df.peptide.str.len() >= 8) &
        (df.peptide.str.len() <= 15))
    df = df.loc[mask]

    hyperparameters = {
        "max_epochs": 100,
        "patience": 10,
        "early_stopping": True,
        "validation_split": 0.2,

        "random_negative_rate": 0.0,
        "random_negative_constant": 25,

        "peptide_amino_acid_encoding": "BLOSUM62",
        "use_embedding": False,
        "kmer_size": 15,
        "batch_normalization": False,
        "locally_connected_layers": [
            {
                "filters": 8,
                "activation": "tanh",
                "kernel_size": 3
            }
        ],
        "activation": "relu",
        "output_activation": "sigmoid",
        "layer_sizes": [
            32
        ],
        "random_negative_affinity_min": 20000.0,
        "random_negative_affinity_max": 50000.0,
        "dense_layer_l1_regularization": 0.001,
        "dropout_probability": 0.0
    }

    predictor = Class1AffinityPredictor()
    predictor.fit_allele_specific_predictors(
        n_models=2,
        architecture_hyperparameters_list=[hyperparameters],
        allele=allele,
        peptides=df.peptide.values,
        affinities=df.measurement_value.values,
        verbose=0,
    )

    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor)

    # Round-trip through save/load and re-check.
    save_dir = tempfile.mkdtemp("_models")
    print(save_dir)
    predictor.save(save_dir)
    reloaded = Class1AffinityPredictor.load(save_dir)
    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=reloaded)
    shutil.rmtree(save_dir)

    # A predictor built from a single member model should still work.
    single_model = Class1AffinityPredictor(
        allele_to_allele_specific_models={
            allele: [predictor.allele_to_allele_specific_models[allele][0]]
        })
    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=single_model)
    save_dir = tempfile.mkdtemp("_models")
    print(save_dir)
    single_model.save(save_dir)
    reloaded_single = Class1AffinityPredictor.load(save_dir)
    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=reloaded_single)
    shutil.rmtree(save_dir)
# PSM rows: column 1 holds the peptide sequence. Keep only sequences not
# already accepted at the smaller q-value threshold.
seqs = [
    l.split()[1] for l in mztab_read if l.startswith("PSM")
    if l.split()[1] not in seqs_new_smaller_qval
]
seqs_new_greater_qval = list(set(seqs))
# Keep 8-12mers and drop sequences with ambiguous/non-standard residues.
seqs_new_greater_qval = [
    s for s in seqs_new_greater_qval if 7 < len(s) < 13 and not 'U' in s
    and not 'X' in s and not 'Z' in s and not 'J' in s and not 'B' in s
]

#call mhcflurry
#subprocess.call("mhcflurry-predict --peptides {p} --alleles {a} --out {o}".format(p=" ".join(seqs_new), a=" ".join(alleles), o=sys.argv[-1]))
seqs_filtered = []
# PERF FIX: load the predictor once; it is invariant across alleles
# (the original reloaded the full model set on every loop iteration).
predictor = Class1AffinityPredictor.load()
for allele in alleles:
    print(allele)
    df_pred = predictor.predict_to_dataframe(allele=allele,
                                             peptides=seqs_new_greater_qval)
    # Keep peptides predicted at or below the affinity threshold (argv[-5]).
    seqs_filtered += df_pred[df_pred['prediction'] <= float(
        sys.argv[-5])]['peptide'].values.tolist()

#merge sequence lists and append decoys
seqs_new_all = list(set(seqs_new_smaller_qval + seqs_filtered))
# Decoys are reversed copies of every accepted sequence.
seqs_new_all = seqs_new_all + [s[::-1] for s in seqs_new_all]

#write idXML for filtering
op = open(sys.argv[-1], 'w')
op.write(
    '<PeptideIdentification score_type="q-value" higher_score_better="false">'
    + '\n')
for pep in seqs_new_all:
コード例 #27
0
def run_and_check_with_model_selection(n_jobs=1):
    models_dir1 = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    hyperparameters_filename = os.path.join(models_dir1,
                                            "hyperparameters.yaml")

    # Include one architecture that has max_epochs = 0. We check that it never
    # gets selected in model selection.
    hyperparameters = [
        deepcopy(HYPERPARAMETERS[0]),
        deepcopy(HYPERPARAMETERS[0]),
    ]
    hyperparameters[-1]["max_epochs"] = 0
    with open(hyperparameters_filename, "w") as fd:
        json.dump(hyperparameters, fd)

    args = [
        "mhcflurry-class1-train-allele-specific-models",
        "--data",
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"),
        "--hyperparameters",
        hyperparameters_filename,
        "--allele",
        "HLA-A*02:01",
        "HLA-A*03:01",
        "--out-models-dir",
        models_dir1,
        "--num-jobs",
        str(n_jobs),
        "--held-out-fraction-reciprocal",
        "10",
        "--n-models",
        "1",
    ]
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    result = Class1AffinityPredictor.load(models_dir1)
    assert_equal(len(result.neural_networks), 4)

    models_dir2 = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    args = [
        "mhcflurry-class1-select-allele-specific-models",
        "--data",
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"),
        "--exclude-data",
        models_dir1 + "/train_data.csv.bz2",
        "--out-models-dir",
        models_dir2,
        "--models-dir",
        models_dir1,
        "--num-jobs",
        str(n_jobs),
        "--mse-max-models",
        "1",
        "--unselected-accuracy-scorer",
        "combined:mass-spec,mse",
        "--unselected-accuracy-percentile-threshold",
        "95",
    ]
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    result = Class1AffinityPredictor.load(models_dir2)
    assert_equal(len(result.neural_networks), 2)
    assert_equal(len(result.allele_to_allele_specific_models["HLA-A*02:01"]),
                 1)
    assert_equal(len(result.allele_to_allele_specific_models["HLA-A*03:01"]),
                 1)
    assert_equal(
        result.allele_to_allele_specific_models["HLA-A*02:01"]
        [0].hyperparameters["max_epochs"], 500)
    assert_equal(
        result.allele_to_allele_specific_models["HLA-A*03:01"]
        [0].hyperparameters["max_epochs"], 500)

    print("Deleting: %s" % models_dir1)
    print("Deleting: %s" % models_dir2)
    shutil.rmtree(models_dir1)