Example #1
def setup():
    global ALLELE_SPECIFIC_PREDICTOR, PAN_ALLELE_PREDICTOR
    startup()
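    # Load the downloaded allele-specific and pan-allele model ensembles once,
    # so every test in the module can share them.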
    ALLELE_SPECIFIC_PREDICTOR = Class1AffinityPredictor.load(
        get_path("models_class1", "models"))

    PAN_ALLELE_PREDICTOR = Class1AffinityPredictor.load(
        get_path("models_class1_pan", "models.with_mass_spec"))
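
A minimal sketch of how the predictors loaded above might be queried; the
peptide and allele are illustrative, not part of the original example:

def example_query():
    # Assumes setup() has run; both predictors expose the same prediction API.
    df = PAN_ALLELE_PREDICTOR.predict_to_dataframe(
        peptides=["SIINFEKL"], alleles=["HLA-A*02:01"])
    print(df)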
Example #2
def setup():
    global PREDICTORS
    startup()
    PREDICTORS = {
        'allele-specific':
        Class1AffinityPredictor.load(get_path("models_class1", "models")),
        'pan-allele':
        Class1AffinityPredictor.load(
            get_path("models_class1_pan", "models.combined"))
    }
Example #3
def test_cross_validation_with_imputation():
    imputer = fancyimpute.MICE(
        n_imputations=2, n_burn_in=1, n_nearest_columns=25)
    train_data = (
        mhcflurry.dataset.Dataset.from_csv(
            get_path("data_kim2014" , "bdata.2009.mhci.public.1.txt"))
        .get_alleles(["HLA-A0201", "HLA-A0202", "HLA-A0301"]))

    folds = cross_validation_folds(
        train_data,
        n_folds=3,
        imputer=imputer,
        drop_similar_peptides=True,
        alleles=["HLA-A0201", "HLA-A0202"])

    eq_(set(x.allele for x in folds), {"HLA-A0201", "HLA-A0202"})
    eq_(len(folds), 6)

    for fold in folds:
        eq_(fold.train.unique_alleles(), set([fold.allele]))
        eq_(fold.imputed_train.unique_alleles(), set([fold.allele]))
        eq_(fold.test.unique_alleles(), set([fold.allele]))

    models = HYPERPARAMETER_DEFAULTS.models_grid(
        activation=["tanh", "relu"],
        layer_sizes=[[4]],
        embedding_output_dim=[8],
        n_training_epochs=[3])
    print(models)

    df = train_across_models_and_folds(folds, models)
    print(df)
    assert df.test_auc.mean() > 0.6
Example #4
def test_cross_validation_with_imputation():
    imputer = fancyimpute.MICE(n_imputations=2,
                               n_burn_in=1,
                               n_nearest_columns=25)
    train_data = (mhcflurry.dataset.Dataset.from_csv(
        get_path("data_kim2014", "bdata.2009.mhci.public.1.txt")).get_alleles(
            ["HLA-A0201", "HLA-A0202", "HLA-A0301"]))

    folds = cross_validation_folds(train_data,
                                   n_folds=3,
                                   imputer=imputer,
                                   drop_similar_peptides=True,
                                   alleles=["HLA-A0201", "HLA-A0202"])

    eq_(set(x.allele for x in folds), {"HLA-A0201", "HLA-A0202"})
    eq_(len(folds), 6)

    for fold in folds:
        eq_(fold.train.unique_alleles(), set([fold.allele]))
        eq_(fold.imputed_train.unique_alleles(), set([fold.allele]))
        eq_(fold.test.unique_alleles(), set([fold.allele]))

    models = HYPERPARAMETER_DEFAULTS.models_grid(activation=["tanh", "relu"],
                                                 layer_sizes=[[4]],
                                                 embedding_output_dim=[8],
                                                 n_training_epochs=[3])
    print(models)

    df = train_across_models_and_folds(folds, models)
    print(df)
    assert df.test_auc.mean() > 0.6
Example #5
def test_class1_affinity_predictor_a0205_training_accuracy():
    # Memorize the dataset.
    hyperparameters = dict(activation="tanh",
                           layer_sizes=[16],
                           max_epochs=500,
                           early_stopping=False,
                           validation_split=0.0,
                           locally_connected_layers=[],
                           dense_layer_l1_regularization=0.0,
                           dropout_probability=0.0)

    # First test a Class1NeuralNetwork, then a Class1AffinityPredictor.
    allele = "HLA-A*02:05"

    df = pandas.read_csv(
        get_path("data_curated", "curated_training_data.csv.bz2"))
    df = df.loc[df.allele == allele]
    df = df.loc[df.peptide.str.len() == 9]
    df = df.loc[df.measurement_type == "quantitative"]
    df = df.loc[df.measurement_source == "kim2014"]

    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(df.peptide.values, df.measurement_value.values)
    ic50_pred = predictor.predict(df.peptide.values)
    ic50_true = df.measurement_value.values
    eq_(len(ic50_pred), len(ic50_true))
    testing.assert_allclose(numpy.log(ic50_pred),
                            numpy.log(ic50_true),
                            rtol=0.2,
                            atol=0.2)
Example #6
def test_run():
    # Create the temp dir before the try block so the finally clause can
    # always clean it up.
    models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    try:
        hyperparameters_filename = os.path.join(models_dir,
                                                "hyperparameters.json")
        with open(hyperparameters_filename, "w") as fd:
            json.dump(HYPERPARAMETERS, fd)

        args = [
            "--data",
            get_path("data_curated", "curated_training_data.csv.bz2"),
            "--hyperparameters",
            hyperparameters_filename,
            "--min-measurements-per-allele",
            "9000",
            "--out-models-dir",
            models_dir,
        ]
        print("Running with args: %s" % args)
        train_allele_specific_models_command.run(args)

        result = Class1AffinityPredictor.load(models_dir)
        predictions = result.predict(peptides=["SLYNTVATL"],
                                     alleles=["HLA-A*02:01"])
        assert_equal(predictions.shape, (1, ))
        assert_array_less(predictions, 500)

    finally:
        print("Deleting: %s" % models_dir)
        shutil.rmtree(models_dir)
Example #7
def test_class1_binding_predictor_A0205_training_accuracy():
    dataset = Dataset.from_csv(get_path(
        "data_combined_iedb_kim2014", "combined_human_class1_dataset.csv"))
    dataset_a0205_all_lengths = dataset.get_allele("HLA-A0205")
    dataset_a0205 = Dataset(
        dataset_a0205_all_lengths._df.loc[
            dataset_a0205_all_lengths._df.peptide.str.len() == 9])

    predictor = Class1BindingPredictor(
        name="A0205",
        embedding_output_dim=32,
        activation="tanh",
        layer_sizes=[64],
        optimizer="adam",
        dropout_probability=0.0)
    predictor.fit_dataset(dataset_a0205, n_training_epochs=1000)
    peptides = dataset_a0205.peptides
    ic50_pred = predictor.predict(peptides)
    ic50_true = dataset_a0205.affinities
    eq_(len(ic50_pred), len(ic50_true))
    testing.assert_allclose(
        np.log(ic50_pred),
        np.log(ic50_true),
        rtol=0.2,
        atol=0.2)
Example #8
def setup():
    global PAN_ALLELE_PREDICTOR
    startup()
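    # optimization_level=0 disables post-load optimizations (e.g. fusing the
    # pan-allele ensemble into a single network), so individual models remain
    # directly accessible.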
    PAN_ALLELE_PREDICTOR = Class1AffinityPredictor.load(
        get_path("models_class1_pan", "models.combined"),
        optimization_level=0,
    )
Example #9
def setup():
    global AFFINITY_PREDICTOR
    global CLEAVAGE_PREDICTOR
    global CLEAVAGE_PREDICTOR_NO_FLANKING
    global PRESENTATION_PREDICTOR
    startup()
    AFFINITY_PREDICTOR = Class1AffinityPredictor.load(get_path(
        "models_class1_pan", "models.combined"),
                                                      optimization_level=0,
                                                      max_models=1)
    CLEAVAGE_PREDICTOR = Class1ProcessingPredictor.load(get_path(
        "models_class1_processing", "models.selected.with_flanks"),
                                                        max_models=1)
    CLEAVAGE_PREDICTOR_NO_FLANKING = Class1ProcessingPredictor.load(
        get_path("models_class1_processing", "models.selected.no_flank"),
        max_models=1)
    PRESENTATION_PREDICTOR = Class1PresentationPredictor.load()
Example #10
def run_and_check(n_jobs=0, delete=True, additional_args=[]):
    source_models_dir = get_path("models_class1_pan", "models.combined")
    dest_models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")

    # Save a new predictor that has no percent rank calibration data.
    original_predictor = Class1AffinityPredictor.load(source_models_dir)
    print("Loaded predictor", source_models_dir)
    new_predictor = Class1AffinityPredictor(
        class1_pan_allele_models=original_predictor.class1_pan_allele_models,
        allele_to_sequence=original_predictor.allele_to_sequence,
    )
    new_predictor.save(dest_models_dir)
    print("Saved predictor to", dest_models_dir)

    new_predictor = Class1AffinityPredictor.load(dest_models_dir)
    assert_equal(len(new_predictor.allele_to_percent_rank_transform), 0)

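    # Invoke the mhcflurry-calibrate-percentile-ranks CLI against the predictor
    # that currently has no percent-rank calibration data.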
    args = [
        "mhcflurry-calibrate-percentile-ranks",
        "--models-dir",
        dest_models_dir,
        "--match-amino-acid-distribution-data",
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"),
        "--motif-summary",
        "--num-peptides-per-length",
        "1000",
        "--allele",
        "HLA-A*02:01",
        "HLA-B*07:02",
        "--verbosity",
        "1",
        "--num-jobs",
        str(n_jobs),
    ] + additional_args
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    new_predictor = Class1AffinityPredictor.load(dest_models_dir)
    assert_equal(len(new_predictor.allele_to_percent_rank_transform), 2)

    if delete:
        print("Deleting: %s" % dest_models_dir)
        shutil.rmtree(dest_models_dir)
    else:
        print("Not deleting: %s" % dest_models_dir)
Example #11
def test_class1_neural_network_a0205_training_accuracy():
    # Memorize the dataset.
    hyperparameters = dict(activation="tanh",
                           layer_sizes=[16],
                           max_epochs=500,
                           early_stopping=False,
                           validation_split=0.0,
                           locally_connected_layers=[{
                               "filters": 8,
                               "activation": "tanh",
                               "kernel_size": 3
                           }],
                           dense_layer_l1_regularization=0.0,
                           dropout_probability=0.0)

    # First test a Class1NeuralNetwork, then a Class1AffinityPredictor.
    allele = "HLA-A*02:05"

    df = pandas.read_csv(
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"))
    df = df.loc[df.allele == allele]
    df = df.loc[df.peptide.str.len() == 9]
    df = df.loc[df.measurement_type == "quantitative"]
    df = df.loc[df.measurement_source == "kim2014"]

    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(df.peptide.values, df.measurement_value.values)
    ic50_pred = predictor.predict(df.peptide.values)
    ic50_true = df.measurement_value.values
    eq_(len(ic50_pred), len(ic50_true))
    testing.assert_allclose(numpy.log(ic50_pred),
                            numpy.log(ic50_true),
                            rtol=0.2,
                            atol=0.2)

    # Test that a second predictor has the same architecture json.
    # This is important for an optimization we use to re-use predictors of the
    # same architecture at prediction time.
    hyperparameters2 = dict(activation="tanh",
                            layer_sizes=[16],
                            max_epochs=1,
                            early_stopping=False,
                            validation_split=0.0,
                            locally_connected_layers=[{
                                "filters": 8,
                                "activation": "tanh",
                                "kernel_size": 3
                            }],
                            dense_layer_l1_regularization=0.0,
                            dropout_probability=0.0)
    predictor2 = Class1NeuralNetwork(**hyperparameters2)
    predictor2.fit(df.peptide.values, df.measurement_value.values, verbose=0)
    eq_(predictor.network().to_json(), predictor2.network().to_json())
Example #12
def run_and_check(n_jobs=0):
    models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    hyperparameters_filename = os.path.join(models_dir, "hyperparameters.yaml")
    with open(hyperparameters_filename, "w") as fd:
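        # json.dump output is also valid YAML, so the .yaml filename is fine.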
        json.dump(HYPERPARAMETERS, fd)

    args = [
        "mhcflurry-class1-train-allele-specific-models",
        "--data",
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"),
        "--hyperparameters",
        hyperparameters_filename,
        "--allele",
        "HLA-A*02:01",
        "HLA-A*03:01",
        "--out-models-dir",
        models_dir,
        "--num-jobs",
        str(n_jobs),
    ]
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    # Calibrate percentile ranks
    args = [
        "mhcflurry-calibrate-percentile-ranks",
        "--models-dir",
        models_dir,
        "--num-peptides-per-length",
        "10000",
        "--num-jobs",
        str(n_jobs),
    ]
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    result = Class1AffinityPredictor.load(models_dir)
    predictions = result.predict(peptides=["SLYNTVATL"],
                                 alleles=["HLA-A*02:01"])
    assert_equal(predictions.shape, (1, ))
    assert_array_less(predictions, 1000)
    df = result.predict_to_dataframe(peptides=["SLYNTVATL"],
                                     alleles=["HLA-A*02:01"])
    print(df)
    assert "prediction_percentile" in df.columns

    print("Deleting: %s" % models_dir)
    shutil.rmtree(models_dir)
Example #13
def test_imputation():
    imputer = fancyimpute.MICE(n_imputations=2,
                               n_burn_in=1,
                               n_nearest_columns=25)
    train_data = (mhcflurry.dataset.Dataset.from_csv(
        get_path("data_kim2014", "bdata.2009.mhci.public.1.txt")).get_alleles(
            ["HLA-A0201", "HLA-A0202", "HLA-A0301"]))

    folds = cross_validation_folds(train_data,
                                   n_folds=3,
                                   imputer=imputer,
                                   drop_similar_peptides=True,
                                   alleles=["HLA-A0201", "HLA-A0202"])

    eq_(set(x.allele for x in folds), {"HLA-A0201", "HLA-A0202"})
    eq_(len(folds), 6)

    for fold in folds:
        eq_(fold.train.unique_alleles(), set([fold.allele]))
        eq_(fold.imputed_train.unique_alleles(), set([fold.allele]))
        eq_(fold.test.unique_alleles(), set([fold.allele]))
Example #14
def test_imputation():
    imputer = fancyimpute.MICE(
        n_imputations=2, n_burn_in=1, n_nearest_columns=25)
    train_data = (
        mhcflurry.dataset.Dataset.from_csv(
            get_path("data_kim2014", "bdata.2009.mhci.public.1.txt"))
        .get_alleles(["HLA-A0201", "HLA-A0202", "HLA-A0301"]))

    folds = cross_validation_folds(
        train_data,
        n_folds=3,
        imputer=imputer,
        drop_similar_peptides=True,
        alleles=["HLA-A0201", "HLA-A0202"])

    eq_(set(x.allele for x in folds), {"HLA-A0201", "HLA-A0202"})
    eq_(len(folds), 6)

    for fold in folds:
        eq_(fold.train.unique_alleles(), set([fold.allele]))
        eq_(fold.imputed_train.unique_alleles(), set([fold.allele]))
        eq_(fold.test.unique_alleles(), set([fold.allele]))
Example #15
def test_class1_binding_predictor_A0205_training_accuracy():
    dataset = Dataset.from_csv(
        get_path("data_combined_iedb_kim2014",
                 "combined_human_class1_dataset.csv"))
    dataset_a0205_all_lengths = dataset.get_allele("HLA-A0205")
    dataset_a0205 = Dataset(dataset_a0205_all_lengths._df.loc[
        dataset_a0205_all_lengths._df.peptide.str.len() == 9])

    predictor = Class1BindingPredictor(name="A0205",
                                       embedding_output_dim=32,
                                       activation="tanh",
                                       layer_sizes=[64],
                                       optimizer="adam",
                                       dropout_probability=0.0)
    predictor.fit_dataset(dataset_a0205, n_training_epochs=1000)
    peptides = dataset_a0205.peptides
    ic50_pred = predictor.predict(peptides)
    ic50_true = dataset_a0205.affinities
    eq_(len(ic50_pred), len(ic50_true))
    testing.assert_allclose(np.log(ic50_pred),
                            np.log(ic50_true),
                            rtol=0.2,
                            atol=0.2)
Example #16
def run():
    from mhcflurry.amino_acid import COMMON_AMINO_ACIDS

    args = parser.parse_args(sys.argv[1:])

    configure_logging()

    serial_run = not args.cluster_parallelism and args.num_jobs == 0
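    # num_jobs == 0 with no cluster flag means all work runs in the current process.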

    if not args.affinity_predictor:
        args.affinity_predictor = get_path(
            "models_class1_pan", "models.combined")
        print("Using downloaded affinity predictor: ", args.affinity_predictor)

    if not args.frequency_matrices:
        args.frequency_matrices = os.path.join(
            args.affinity_predictor, "frequency_matrices.csv.bz2")

    if not args.length_distributions:
        args.length_distributions = os.path.join(
            args.affinity_predictor, "length_distributions.csv.bz2")

    if not args.train_data:
        args.train_data = os.path.join(
            args.affinity_predictor, "train_data.csv.bz2")

    frequency_matrices_df = pandas.read_csv(args.frequency_matrices)
    length_distributions = pandas.read_csv(args.length_distributions)
    train_data = pandas.read_csv(args.train_data)

    alleles = args.alleles
    if alleles:
        print("Using specified alleles, ", *alleles)
    else:
        alleles = frequency_matrices_df.allele.unique()

    if args.max_alleles:
        alleles = alleles[:args.max_alleles]

    print("Using %d alleles" % len(alleles), alleles)

    amino_acids = sorted(COMMON_AMINO_ACIDS)

    distribution = frequency_matrices_df.loc[
        (frequency_matrices_df.cutoff_fraction == 1.0), amino_acids
    ].mean(0)

    normalized_frequency_matrices = frequency_matrices_df.copy()
    normalized_frequency_matrices.loc[:, amino_acids] = (
            normalized_frequency_matrices[amino_acids] / distribution)

    GLOBAL_DATA["args"] = args
    GLOBAL_DATA["normalized_frequency_matrices"] = normalized_frequency_matrices
    GLOBAL_DATA["length_distributions"] = length_distributions
    GLOBAL_DATA["train_data"] = train_data

    artifacts_out = os.path.join(args.out, "artifacts")

    if not os.path.exists(args.out):
        os.mkdir(args.out)

    if not os.path.exists(artifacts_out):
        os.mkdir(artifacts_out)

    tasks = [
        {
            "task_num": i,
            "allele": allele,
            "out_dir": artifacts_out,
        }
        for (i, allele) in enumerate(alleles)
    ]

    jobs = []
    for task in tasks:
        if not jobs or len(jobs[-1]['tasks']) >= args.chunk_size:
            jobs.append({'tasks': []})
        jobs[-1]['tasks'].append(task)

    print("Generated %d tasks, packed into %d jobs" % (len(tasks), len(jobs)))

    worker_pool = None
    start = time.time()

    if serial_run:
        # Serial run
        print("Running in serial.")
        results = (
            do_job(**job) for job in jobs)
    elif args.cluster_parallelism:
        # Run using separate processes on an HPC cluster.
        print("Running on cluster.")
        results = cluster_results_from_args(
            args,
            work_function=do_job,
            work_items=jobs,
            constant_data=GLOBAL_DATA,
            input_serialization_method="dill",
            result_serialization_method="pickle",
            clear_constant_data=False)
    else:
        worker_pool = worker_pool_with_gpu_assignments_from_args(args)
        print("Worker pool", worker_pool)
        assert worker_pool is not None

        for task in tasks:
            task['constant_data'] = GLOBAL_DATA

        results = worker_pool.imap_unordered(
            partial(call_wrapped_kwargs, do_job),
            jobs,
            chunksize=1)

    print("Reading results")

    task_results = {}

    for job_result in tqdm.tqdm(results, total=len(jobs)):
        for task_result in job_result:
            task_results[task_result['task_num']] = task_result

    print("Received all results in %0.2f sec" % (time.time() - start))

    artifacts_df = pandas.DataFrame(task_results).T.set_index("task_num")

    length_distributions_out = os.path.join(
        args.out, "length_distributions.csv")
    length_distributions.to_csv(length_distributions_out, index=False)
    print("Wrote: ", length_distributions_out)

    artifacts_summary_out = os.path.join(args.out, "artifacts.csv")
    artifacts_df.to_csv(artifacts_summary_out)
    print("Wrote: ", artifacts_summary_out)

    if worker_pool:
        worker_pool.close()
        worker_pool.join()
Example #17
def test_small_run():
    base_temp_dir = tempfile.mkdtemp()
    temp_dir = join(base_temp_dir, "models_class1_allele_specific_single")
    mkdir(temp_dir)

    def write_json(payload, filename):
        path = join(temp_dir, filename)
        with open(path, 'w') as fd:
            json.dump(payload, fd)
        return path

    models = HYPERPARAMETER_DEFAULTS.models_grid(impute=[False, True],
                                                 activation=["tanh"],
                                                 layer_sizes=[[4], [8]],
                                                 embedding_output_dim=[16],
                                                 dropout_probability=[.25],
                                                 n_training_epochs=[20])
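    # This grid expands to 4 model specifications: 2 layer sizes x impute on/off.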

    imputer_args = {
        "imputation_method_name": "mice",
        "n_burn_in": 2,
        "n_imputations": 10,
        "n_nearest_columns": 10,
        "min_observations_per_peptide": 5,
        "min_observations_per_allele": 1000,  # limit the number of alleles
    }

    bdata2009 = downloads.get_path("data_kim2014",
                                   "bdata.2009.mhci.public.1.txt")
    bdata_blind = downloads.get_path("data_kim2014",
                                     "bdata.2013.mhci.public.blind.1.txt")

    mkdir(join(temp_dir, "models"))

    args = [
        "--model-architectures",
        write_json(models, "models.json"),
        "--imputer-description",
        write_json(imputer_args, "imputer.json"),
        "--train-data",
        bdata2009,
        "--test-data",
        bdata_blind,
        "--out-cv-results",
        join(temp_dir, "cv.csv"),
        "--out-production-results",
        join(temp_dir, "production.csv"),
        "--out-models",
        join(temp_dir, "models"),
        "--cv-num-folds",
        "2",
        "--alleles",
        "HLA-A0201",
        "HLA-A0301",
        "--verbose",
        "--num-local-threads",
        "1",
    ]
    print("Running cv_and_train_command with args: %s " % str(args))

    cv_and_train_command.run(args)
    verify_trained_models(base_temp_dir)
Example #18
        'vector_encoding_name': 'BLOSUM62',
    },
    'random_negative_affinity_max': 50000.0,
    'random_negative_affinity_min': 20000.0,
    'random_negative_constant': 25,
    'random_negative_distribution_smoothing': 0.0,
    'random_negative_match_distribution': True,
    'random_negative_rate': 0.2,
    'random_negative_method': 'by_allele',
    'train_data': {},
    'validation_split': 0.1,
}


ALLELE_TO_SEQUENCE = pandas.read_csv(
    get_path(
        "allele_sequences", "allele_sequences.csv"),
    index_col=0).sequence.to_dict()


TRAIN_DF = pandas.read_csv(
    get_path(
        "data_curated", "curated_training_data.no_mass_spec.csv.bz2"))

TRAIN_DF = TRAIN_DF.loc[TRAIN_DF.allele.isin(ALLELE_TO_SEQUENCE)]
TRAIN_DF = TRAIN_DF.loc[TRAIN_DF.peptide.str.len() >= 8]
TRAIN_DF = TRAIN_DF.loc[TRAIN_DF.peptide.str.len() <= 15]

TRAIN_DF = TRAIN_DF.loc[
    TRAIN_DF.allele.isin(TRAIN_DF.allele.value_counts().iloc[:3].index)
]
Example #19
import argparse
from os.path import join

import pypandoc
import pandas
from keras.utils.vis_utils import plot_model
from tabulate import tabulate

from mhcflurry import __version__
from mhcflurry.downloads import get_path
from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor

parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
    "--cv-summary-csv",
    metavar="FILE.csv",
    default=get_path(
        "cross_validation_class1", "summary.all.csv", test_exists=False),
    help="Cross validation scores summary. Default: %(default)s",
)
parser.add_argument(
    "--class1-models-dir",
    metavar="DIR",
    default=get_path(
        "models_class1", "models", test_exists=False),
    help="Class1 models. Default: %(default)s",
)
parser.add_argument(
    "--class1-unselected-models-dir",
    metavar="DIR",
    default=get_path(
        "models_class1_unselected", "models", test_exists=False),
    help="Class1 unselected models. Default: %(default)s",
Example #20
def run_and_check(n_jobs=0, delete=True, additional_args=[]):
    models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    hyperparameters_filename = os.path.join(models_dir, "hyperparameters.yaml")
    with open(hyperparameters_filename, "w") as fd:
        json.dump(HYPERPARAMETERS_LIST, fd)

    data_df = pandas.read_csv(
        get_path("data_curated", "curated_training_data.no_mass_spec.csv.bz2"))
    selected_data_df = data_df.loc[data_df.allele.str.startswith("HLA-A")]
    selected_data_df.to_csv(os.path.join(models_dir, "_train_data.csv"),
                            index=False)

    args = [
        "mhcflurry-class1-train-pan-allele-models",
        "--data",
        os.path.join(models_dir, "_train_data.csv"),
        "--allele-sequences",
        get_path("allele_sequences", "allele_sequences.csv"),
        "--hyperparameters",
        hyperparameters_filename,
        "--out-models-dir",
        models_dir,
        "--num-jobs",
        str(n_jobs),
        "--num-folds",
        "2",
        "--verbosity",
        "1",
    ] + additional_args
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    # Run model selection
    models_dir_selected = tempfile.mkdtemp(
        prefix="mhcflurry-test-models-selected")
    args = [
        "mhcflurry-class1-select-pan-allele-models",
        "--data",
        os.path.join(models_dir, "train_data.csv.bz2"),
        "--models-dir",
        models_dir,
        "--out-models-dir",
        models_dir_selected,
        "--max-models",
        "1",
        "--num-jobs",
        str(n_jobs),
    ] + additional_args
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    result = Class1AffinityPredictor.load(models_dir_selected,
                                          optimization_level=0)
    assert_equal(len(result.neural_networks), 2)
    predictions = result.predict(peptides=["SLYNTVATL"],
                                 alleles=["HLA-A*02:01"])
    assert_equal(predictions.shape, (1, ))
    assert_array_less(predictions, 1000)

    if delete:
        print("Deleting: %s" % models_dir)
        shutil.rmtree(models_dir)
        shutil.rmtree(models_dir_selected)
Example #21
def test_performance_improves_for_A0205_with_pretraining():
    # Test that pretraining on imputed data improves predictive accuracy
    # after a small number of training epochs.
    dataset = Dataset.from_csv(
        get_path("data_combined_iedb_kim2014", "combined_human_class1_dataset.csv"))
    print("Full dataset: %d pMHC entries" % len(dataset))

    limited_alleles = ["HLA-A0205", "HLA-A0201", "HLA-A0101", "HLA-B0702"]

    # restrict to just four alleles
    dataset = dataset.get_alleles(limited_alleles)
    print("After filtering to %s, # entries: %d" % (
        limited_alleles, len(dataset)))

    a0205_data_without_imputation = dataset.get_allele("HLA-A0205")

    print("Dataset with only A0205, # entries: %d" % (
        len(a0205_data_without_imputation)))

    predictor_without_imputation = Class1BindingPredictor(
        name="A0205-no-impute")

    X_index, ic50_true, sample_weights, _ = (
        a0205_data_without_imputation.kmer_index_encoding())

    assert sample_weights.min() >= 0, sample_weights.min()
    assert sample_weights.max() <= 1, sample_weights.max()
    assert ic50_true.min() >= 0, ic50_true.min()

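    # Baseline: train on the measured A0205 data only.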
    predictor_without_imputation.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        n_training_epochs=10)

    ic50_pred_without_imputation = (
        predictor_without_imputation
        .predict_ic50_for_kmer_encoded_array(X_index))
    diff_squared = (ic50_true - ic50_pred_without_imputation) ** 2

    ic50_true_label = ic50_true <= 500
    ic50_pred_label_without_imputation = ic50_pred_without_imputation <= 500
    ic50_label_same_without_imputation = (
        ic50_true_label == ic50_pred_label_without_imputation)
    mse_without_imputation = (
        (diff_squared * sample_weights).sum() / sample_weights.sum())
    accuracy_without_imputation = ((
        ic50_label_same_without_imputation * sample_weights).sum() /
        sample_weights.sum())
    imputed_dataset = dataset.impute_missing_values(MICE(n_imputations=25))
    print("After imputation, dataset for %s has %d entries" % (
        limited_alleles, len(imputed_dataset)))
    a0205_data_with_imputation = imputed_dataset.get_allele("HLA-A0205")
    print("Limited to just A0205, # entries: %d" % (
        len(a0205_data_with_imputation)))

    X_index_imputed, ic50_imputed, sample_weights_imputed, _ = \
        a0205_data_with_imputation.kmer_index_encoding()
    assert sample_weights_imputed.min() >= 0, sample_weights_imputed.min()
    assert sample_weights_imputed.max() <= 1, sample_weights_imputed.max()
    assert ic50_imputed.min() >= 0, ic50_imputed.min()

    predictor_with_imputation = Class1BindingPredictor(name="A0205-impute")

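    # Pretrain on the MICE-imputed measurements, then fine-tune on the real data.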
    predictor_with_imputation.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        X_pretrain=X_index_imputed,
        ic50_pretrain=ic50_imputed,
        sample_weights_pretrain=sample_weights_imputed,
        n_training_epochs=10)

    ic50_pred_with_imputation = \
        predictor_with_imputation.predict_ic50_for_kmer_encoded_array(X_index)
    diff_squared = (ic50_true - ic50_pred_with_imputation) ** 2
    mse_with_imputation = (
        (diff_squared * sample_weights).sum() / sample_weights.sum())

    ic50_pred_label_with_imputation = ic50_pred_with_imputation <= 500
    ic50_label_same_with_imputation = (
        ic50_true_label == ic50_pred_label_with_imputation)
    accuracy_with_imputation = ((
        ic50_label_same_with_imputation * sample_weights).sum() /
        sample_weights.sum())
    print("RMS w/out imputation: %f" % (np.sqrt(mse_without_imputation),))
    print("RMS w/ imputation: %f" % (np.sqrt(mse_with_imputation),))

    assert mse_with_imputation < mse_without_imputation, \
        "Expected MSE with imputation (%f) to be < w/o imputation (%f)" % (
            mse_with_imputation, mse_without_imputation)

    print("IC50 <= 500nM accuracy w/out imputation: %f" % (
        accuracy_without_imputation,))
    print("IC50 <= 500nM accuracy w/ imputation: %f" % (
        accuracy_with_imputation,))
    assert accuracy_with_imputation > accuracy_without_imputation
Example #22
import argparse
import sys

import pandas

from mhcflurry.downloads import get_path

parser = argparse.ArgumentParser(usage=__doc__)

parser.add_argument(
    "data",
    metavar="CSV",
    help="Model selection data")
parser.add_argument(
    "--proteome-peptides",
    metavar="CSV",
    required=True,
    help="Proteome peptides")
parser.add_argument(
    "--protein-data",
    metavar="CSV",
    default=get_path("data_references", "uniprot_proteins.csv.bz2", test_exists=False),
    help="Proteome data. Default: %(default)s.")
parser.add_argument(
    "--out",
    metavar="CSV",
    required=True,
    help="File to write")


def run():
    args = parser.parse_args(sys.argv[1:])

    data_df = pandas.read_csv(args.data)
    print("Read", args.data, len(data_df))
    print(data_df)
Example #23
# -*- coding: utf-8 -*-
"""
Created on Fri Jan  3 07:26:12 2020

Author: Ruby Li

This script trains allele-specific MHCflurry models.

"""
from mhcflurry import Class1AffinityPredictor
from mhcflurry.downloads import get_path
import pandas as pd
import numpy as np

data_path = get_path('data_curated', 'curated_training_data.no_mass_spec.csv.bz2')
df = pd.read_csv(data_path)
df = df.loc[(df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 11)]

models = {}
# HLAs (a list of allele names) and test_peptide are assumed to be defined
# earlier in this (truncated) script.
for hla in HLAs:
    new_predictor = Class1AffinityPredictor()
    if df.loc[df.allele == hla].shape[0] > 0:
        single_allele_train_data = df.loc[df.allele == hla].sample(
            21, replace=True)
    else:
        models[hla] = ''
        continue
    model = new_predictor.fit_allele_specific_predictors(
        n_models=1,
        architecture_hyperparameters_list=[{
            "layer_sizes": [16],
            "max_epochs": 5,
            "random_negative_constant": 5,
        }],
        peptides=single_allele_train_data.peptide.values,
        affinities=single_allele_train_data.measurement_value.values,
        allele=hla)  # train against the current allele, not a hardcoded one
    models[hla] = model

binding_affinity = []
for i in range(len(test_peptide)):
Example #24
def run():
    args = parser.parse_args(sys.argv[1:])
    df = pandas.read_csv(args.benchmark)

    peptides = df.peptide.unique()
    alleles = set()
    for some in df.hla.unique():
        alleles.update(some.split())

    precomputed_dfs = {}

    if 'netmhcpan4.ba' in args.predictors:
        precomputed_dfs['netmhcpan4.ba'] = load_results(
            get_path("data_mass_spec_benchmark",
                     "predictions/all.netmhcpan4.ba"),
            result_df=pandas.DataFrame(
                dtype=numpy.float32,
                index=peptides,
                columns=[
                    "%s affinity" % a for a in alleles
                ])).rename(columns=lambda s: s.replace("affinity", "").strip())
        precomputed_dfs['netmhcpan4.ba'] *= -1  # flip since it's affinities

    if 'netmhcpan4.el' in args.predictors:
        precomputed_dfs['netmhcpan4.el'] = load_results(
            get_path("data_mass_spec_benchmark",
                     "predictions/all.netmhcpan4.el"),
            result_df=pandas.DataFrame(
                dtype=numpy.float32,
                index=peptides,
                columns=[
                    "%s score" % a for a in alleles
                ])).rename(columns=lambda s: s.replace("score", "").strip())

    if 'mixmhcpred' in args.predictors:
        precomputed_dfs['mixmhcpred'] = load_results(
            get_path("data_mass_spec_benchmark", "predictions/all.mixmhcpred"),
            result_df=pandas.DataFrame(
                dtype=numpy.float32,
                index=peptides,
                columns=[
                    "%s score" % a for a in alleles
                ])).rename(columns=lambda s: s.replace("score", "").strip())

    skip_experiments = set()

    for hla_text, sub_df in tqdm.tqdm(df.groupby("hla"),
                                      total=df.hla.nunique()):
        hla = hla_text.split()
        for (name, precomputed_df) in precomputed_dfs.items():
            df.loc[sub_df.index, name] = numpy.nan
            prediction_df = pandas.DataFrame(index=sub_df.peptide, dtype=float)
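            # Fill one column per allele; with skipna=False below, a missing
            # allele makes the max NaN rather than being silently dropped.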
            for allele in hla:
                if allele not in precomputed_df.columns or precomputed_df[
                        allele].isnull().all():
                    print(sub_df.sample_id.unique(), hla)
                    skip_experiments.update(sub_df.sample_id.unique())
                prediction_df[allele] = precomputed_df.loc[prediction_df.index,
                                                           allele]
            df.loc[sub_df.index, name] = prediction_df.max(1,
                                                           skipna=False).values
            df.loc[sub_df.index, name + "_best_allele"] = prediction_df.idxmax(
                1, skipna=False).values

    if 'netmhcpan4.ba' in args.predictors:
        # unflip the values
        df['netmhcpan4.ba'] *= -1

    print("Skip experiments", skip_experiments)
    print("results")
    print(df)

    df.to_csv(args.out, index=False)
    print("Wrote", args.out)
Example #25
def run_and_check_with_model_selection(n_jobs=1):
    models_dir1 = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    hyperparameters_filename = os.path.join(models_dir1,
                                            "hyperparameters.yaml")

    # Include one architecture that has max_epochs = 0. We check that it never
    # gets selected in model selection.
    hyperparameters = [
        deepcopy(HYPERPARAMETERS[0]),
        deepcopy(HYPERPARAMETERS[0]),
    ]
    hyperparameters[-1]["max_epochs"] = 0
    with open(hyperparameters_filename, "w") as fd:
        json.dump(hyperparameters, fd)

    args = [
        "mhcflurry-class1-train-allele-specific-models",
        "--data",
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"),
        "--hyperparameters",
        hyperparameters_filename,
        "--allele",
        "HLA-A*02:01",
        "HLA-A*03:01",
        "--out-models-dir",
        models_dir1,
        "--num-jobs",
        str(n_jobs),
        "--held-out-fraction-reciprocal",
        "10",
        "--n-models",
        "1",
    ]
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    result = Class1AffinityPredictor.load(models_dir1)
    assert_equal(len(result.neural_networks), 4)

    models_dir2 = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    args = [
        "mhcflurry-class1-select-allele-specific-models",
        "--data",
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"),
        "--exclude-data",
        models_dir1 + "/train_data.csv.bz2",
        "--out-models-dir",
        models_dir2,
        "--models-dir",
        models_dir1,
        "--num-jobs",
        str(n_jobs),
        "--mse-max-models",
        "1",
        "--unselected-accuracy-scorer",
        "combined:mass-spec,mse",
        "--unselected-accuracy-percentile-threshold",
        "95",
    ]
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    result = Class1AffinityPredictor.load(models_dir2)
    assert_equal(len(result.neural_networks), 2)
    assert_equal(len(result.allele_to_allele_specific_models["HLA-A*02:01"]),
                 1)
    assert_equal(len(result.allele_to_allele_specific_models["HLA-A*03:01"]),
                 1)
    assert_equal(
        result.allele_to_allele_specific_models["HLA-A*02:01"]
        [0].hyperparameters["max_epochs"], 500)
    assert_equal(
        result.allele_to_allele_specific_models["HLA-A*03:01"]
        [0].hyperparameters["max_epochs"], 500)

    print("Deleting: %s" % models_dir1)
    print("Deleting: %s" % models_dir2)
    shutil.rmtree(models_dir1)
Example #26
# Expensive test - not run by nose.

from mhcflurry import train_pan_allele_models_command
from mhcflurry.downloads import get_path
from mhcflurry.allele_encoding import AlleleEncoding

import pandas
import numpy

PRETRAIN_DATA_PATH = get_path("random_peptide_predictions",
                              "predictions.csv.bz2")

FULL_TRAIN_DF = pandas.read_csv(
    get_path("data_curated", "curated_training_data.no_mass_spec.csv.bz2"))
TRAIN_DF = FULL_TRAIN_DF.loc[(FULL_TRAIN_DF.peptide.str.len() >= 8)
                             & (FULL_TRAIN_DF.peptide.str.len() <= 15)]
ALLELE_SEQUENCES = pandas.read_csv(get_path("allele_sequences",
                                            "allele_sequences.csv"),
                                   index_col=0).sequence
ALLELE_SEQUENCES = ALLELE_SEQUENCES.loc[ALLELE_SEQUENCES.index.isin(
    TRAIN_DF.allele)]
TRAIN_DF = TRAIN_DF.loc[TRAIN_DF.allele.isin(ALLELE_SEQUENCES.index)]
FOLDS_DF = pandas.DataFrame(index=TRAIN_DF.index)
FOLDS_DF["fold_0"] = True

HYPERPARAMETERS = {
    'activation': 'tanh',
    'allele_dense_layer_sizes': [],
    'batch_normalization': False,
    'dense_layer_l1_regularization': 0.0,
    'dense_layer_l2_regularization': 0.0,
Example #27
def test_performance_improves_for_A0205_with_pretraining():
    # Test that pretraining on imputed data improves predictive accuracy
    # after a small number of training epochs.
    dataset = Dataset.from_csv(
        get_path("data_combined_iedb_kim2014",
                 "combined_human_class1_dataset.csv"))
    print("Full dataset: %d pMHC entries" % len(dataset))

    limited_alleles = ["HLA-A0205", "HLA-A0201", "HLA-A0101", "HLA-B0702"]

    # restrict to just four alleles
    dataset = dataset.get_alleles(limited_alleles)
    print("After filtering to %s, # entries: %d" %
          (limited_alleles, len(dataset)))

    a0205_data_without_imputation = dataset.get_allele("HLA-A0205")

    print("Dataset with only A0205, # entries: %d" %
          (len(a0205_data_without_imputation)))

    predictor_without_imputation = Class1BindingPredictor(
        name="A0205-no-impute")

    X_index, ic50_true, sample_weights, _ = (
        a0205_data_without_imputation.kmer_index_encoding())

    assert sample_weights.min() >= 0, sample_weights.min()
    assert sample_weights.max() <= 1, sample_weights.max()
    assert ic50_true.min() >= 0, ic50_true.min()

    predictor_without_imputation.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        n_training_epochs=10)

    ic50_pred_without_imputation = (
        predictor_without_imputation.predict_ic50_for_kmer_encoded_array(
            X_index))
    diff_squared = (ic50_true - ic50_pred_without_imputation)**2

    ic50_true_label = ic50_true <= 500
    ic50_pred_label_without_imputation = ic50_pred_without_imputation <= 500
    ic50_label_same_without_imputation = (
        ic50_true_label == ic50_pred_label_without_imputation)
    mse_without_imputation = ((diff_squared * sample_weights).sum() /
                              sample_weights.sum())
    accuracy_without_imputation = (
        (ic50_label_same_without_imputation * sample_weights).sum() /
        sample_weights.sum())
    imputed_dataset = dataset.impute_missing_values(MICE(n_imputations=25))
    print("After imputation, dataset for %s has %d entries" %
          (limited_alleles, len(imputed_dataset)))
    a0205_data_with_imputation = imputed_dataset.get_allele("HLA-A0205")
    print("Limited to just A0205, # entries: %d" %
          (len(a0205_data_with_imputation)))

    X_index_imputed, ic50_imputed, sample_weights_imputed, _ = \
        a0205_data_with_imputation.kmer_index_encoding()
    assert sample_weights_imputed.min() >= 0, sample_weights_imputed.min()
    assert sample_weights_imputed.max() <= 1, sample_weights_imputed.max()
    assert ic50_imputed.min() >= 0, ic50_imputed.min()

    predictor_with_imputation = Class1BindingPredictor(name="A0205-impute")

    predictor_with_imputation.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        X_pretrain=X_index_imputed,
        ic50_pretrain=ic50_imputed,
        sample_weights_pretrain=sample_weights_imputed,
        n_training_epochs=10)

    ic50_pred_with_imputation = \
        predictor_with_imputation.predict_ic50_for_kmer_encoded_array(X_index)
    diff_squared = (ic50_true - ic50_pred_with_imputation)**2
    mse_with_imputation = ((diff_squared * sample_weights).sum() /
                           sample_weights.sum())

    ic50_pred_label_with_imputation = ic50_pred_with_imputation <= 500
    ic50_label_same_with_imputation = (
        ic50_true_label == ic50_pred_label_with_imputation)
    accuracy_with_imputation = (
        (ic50_label_same_with_imputation * sample_weights).sum() /
        sample_weights.sum())
    print("RMS w/out imputation: %f" % (np.sqrt(mse_without_imputation), ))
    print("RMS w/ imputation: %f" % (np.sqrt(mse_with_imputation), ))

    assert mse_with_imputation < mse_without_imputation, \
        "Expected MSE with imputation (%f) to be < w/o imputation (%f)" % (
            mse_with_imputation, mse_without_imputation)

    print("IC50 <= 500nM accuracy w/out imputation: %f" %
          (accuracy_without_imputation, ))
    print("IC50 <= 500nM accuracy w/ imputation: %f" %
          (accuracy_with_imputation, ))
    assert accuracy_with_imputation > accuracy_without_imputation
Example #28
import time
import pandas

from mhcflurry.allele_encoding import AlleleEncoding
from mhcflurry.amino_acid import BLOSUM62_MATRIX
from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor
from mhcflurry.downloads import get_path

from numpy.testing import assert_equal

from mhcflurry.testing_utils import cleanup, startup
teardown = cleanup
setup = startup

ALLELE_TO_SEQUENCE = pandas.read_csv(
    get_path(
        "allele_sequences", "allele_sequences.csv"),
    index_col=0).sequence.to_dict()

HYPERPARAMETERS = {
    'activation': 'tanh',
    'allele_dense_layer_sizes': [],
    'batch_normalization': False,
    'dense_layer_l1_regularization': 0.0,
    'dense_layer_l2_regularization': 0.0,
    'dropout_probability': 0.5,
    'early_stopping': True,
    'init': 'glorot_uniform',
    'layer_sizes': [4],
    'learning_rate': None,
    'locally_connected_layers': [],
    'loss': 'custom:mse_with_inequalities',
Example #29
def test_class1_affinity_predictor_a0205_memorize_training_data():
    # Memorize the dataset.
    hyperparameters = dict(
        activation="tanh",
        layer_sizes=[64],
        max_epochs=100,
        early_stopping=False,
        validation_split=0.0,
        locally_connected_layers=[],
        dense_layer_l1_regularization=0.0,
        dropout_probability=0.0)

    allele = "HLA-A*02:05"

    df = pandas.read_csv(
        get_path(
            "data_curated", "curated_training_data.affinity.csv.bz2"))
    df = df.loc[
        df.allele == allele
    ]
    df = df.loc[
        df.peptide.str.len() == 9
    ]
    df = df.loc[
        df.measurement_type == "quantitative"
    ]
    df = df.loc[
        df.measurement_source == "kim2014"
    ]

    predictor = Class1AffinityPredictor()
    predictor.fit_allele_specific_predictors(
        n_models=2,
        architecture_hyperparameters_list=[hyperparameters],
        allele=allele,
        peptides=df.peptide.values,
        affinities=df.measurement_value.values,
        verbose=0,
    )
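    # Percentile-rank calibration must run before predict_to_dataframe can
    # populate the prediction_percentile column checked below.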
    predictor.calibrate_percentile_ranks(num_peptides_per_length=1000)
    ic50_pred = predictor.predict(df.peptide.values, allele=allele)
    ic50_true = df.measurement_value.values
    eq_(len(ic50_pred), len(ic50_true))
    testing.assert_allclose(
        numpy.log(ic50_pred),
        numpy.log(ic50_true),
        rtol=0.2,
        atol=0.2)

    ic50_pred_df = predictor.predict_to_dataframe(
        df.peptide.values, allele=allele)
    print(ic50_pred_df)
    assert 'prediction_percentile' in ic50_pred_df.columns
    assert ic50_pred_df.prediction_percentile.isnull().sum() == 0

    ic50_pred_df2 = predictor.predict_to_dataframe(
        df.peptide.values,
        allele=allele,
        include_individual_model_predictions=True)
    print(ic50_pred_df2)

    # Test an unknown allele
    print("Starting unknown allele check")
    eq_(predictor.supported_alleles, [allele])
    ic50_pred = predictor.predict(
        df.peptide.values,
        allele="HLA-A*02:01",
        throw=False)
    assert numpy.isnan(ic50_pred).all()

    assert_raises(
        ValueError,
        predictor.predict,
        df.peptide.values,
        allele="HLA-A*02:01")


    eq_(predictor.supported_alleles, [allele])
    assert_raises(
        ValueError,
        predictor.predict,
        ["AAAAA"],  # too short
        allele=allele)
    assert_raises(
        ValueError,
        predictor.predict,
        ["AAAAAAAAAAAAAAAAAAAA"],  # too long
        allele=allele)
    ic50_pred = predictor.predict(
        ["AAAAA", "AAAAAAAAA", "AAAAAAAAAAAAAAAAAAAA"],
        allele=allele,
        throw=False)
    assert numpy.isnan(ic50_pred[0])
    assert not numpy.isnan(ic50_pred[1])
    assert numpy.isnan(ic50_pred[2])
Example #30
def test_a1_known_epitopes_in_newly_trained_model():
    allele = "HLA-A*01:01"
    df = pandas.read_csv(
        get_path(
            "data_curated", "curated_training_data.affinity.csv.bz2"))
    df = df.loc[
        (df.allele == allele) &
        (df.peptide.str.len() >= 8) &
        (df.peptide.str.len() <= 15)
    ]

    hyperparameters = {
        "max_epochs": 100,
        "patience": 10,
        "early_stopping": True,
        "validation_split": 0.2,

        "random_negative_rate": 0.0,
        "random_negative_constant": 25,

        "peptide_amino_acid_encoding": "BLOSUM62",
        "use_embedding": False,
        "kmer_size": 15,
        "batch_normalization": False,
        "locally_connected_layers": [
            {
                "filters": 8,
                "activation": "tanh",
                "kernel_size": 3
            }
        ],
        "activation": "relu",
        "output_activation": "sigmoid",
        "layer_sizes": [
            32
        ],
        "random_negative_affinity_min": 20000.0,
        "random_negative_affinity_max": 50000.0,
        "dense_layer_l1_regularization": 0.001,
        "dropout_probability": 0.0
    }

    predictor = Class1AffinityPredictor()
    predictor.fit_allele_specific_predictors(
        n_models=2,
        architecture_hyperparameters_list=[hyperparameters],
        allele=allele,
        peptides=df.peptide.values,
        affinities=df.measurement_value.values,
        verbose=0,
    )

    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor)

    models_dir = tempfile.mkdtemp("_models")
    print(models_dir)
    predictor.save(models_dir)
    predictor2 = Class1AffinityPredictor.load(models_dir)
    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor2)
    shutil.rmtree(models_dir)

    predictor3 = Class1AffinityPredictor(
        allele_to_allele_specific_models={
            allele: [predictor.allele_to_allele_specific_models[allele][0]]
        })
    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor3)
    models_dir = tempfile.mkdtemp("_models")
    print(models_dir)
    predictor3.save(models_dir)
    predictor4 = Class1AffinityPredictor.load(models_dir)
    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor4)
    shutil.rmtree(models_dir)
Example #31
import argparse

import pandas
import logomaker

from matplotlib import pyplot

from mhcflurry.downloads import get_path
from mhcflurry.amino_acid import COMMON_AMINO_ACIDS

AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS)

parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
    "--class1-models-dir-with-ms",
    metavar="DIR",
    default=get_path(
        "models_class1_pan", "models.with_mass_spec", test_exists=False),
    help="Class1 models. Default: %(default)s",
)
parser.add_argument(
    "--class1-models-dir-no-ms",
    metavar="DIR",
    default=get_path(
        "models_class1_pan", "models.no_mass_spec", test_exists=False),
    help="Class1 models. Default: %(default)s",
)
parser.add_argument(
    "--logo-cutoff",
    default=0.01,
    type=float,
    help="Fraction of top to use for motifs",
)