def setup():
    global ALLELE_SPECIFIC_PREDICTOR, PAN_ALLELE_PREDICTOR
    startup()
    ALLELE_SPECIFIC_PREDICTOR = Class1AffinityPredictor.load(
        get_path("models_class1", "models"))
    PAN_ALLELE_PREDICTOR = Class1AffinityPredictor.load(
        get_path("models_class1_pan", "models.with_mass_spec"))
def setup():
    global PREDICTORS
    startup()
    PREDICTORS = {
        'allele-specific': Class1AffinityPredictor.load(
            get_path("models_class1", "models")),
        'pan-allele': Class1AffinityPredictor.load(
            get_path("models_class1_pan", "models.combined"))
    }
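# Hedged usage sketch (not part of the original test modules): once setup()
# has populated PREDICTORS, either predictor can be queried through the
# Class1AffinityPredictor.predict API used elsewhere in these tests. The
# peptide and allele below are illustrative values taken from other tests in
# this collection.
def example_prediction_with_loaded_predictors():
    predictor = PREDICTORS['pan-allele']
    ic50s = predictor.predict(
        peptides=["SLYNTVATL"], alleles=["HLA-A*02:01"])
    print(ic50s)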
def test_cross_validation_with_imputation():
    imputer = fancyimpute.MICE(
        n_imputations=2, n_burn_in=1, n_nearest_columns=25)
    train_data = (
        mhcflurry.dataset.Dataset.from_csv(
            get_path("data_kim2014", "bdata.2009.mhci.public.1.txt"))
        .get_alleles(["HLA-A0201", "HLA-A0202", "HLA-A0301"]))

    folds = cross_validation_folds(
        train_data,
        n_folds=3,
        imputer=imputer,
        drop_similar_peptides=True,
        alleles=["HLA-A0201", "HLA-A0202"])

    eq_(set(x.allele for x in folds), {"HLA-A0201", "HLA-A0202"})
    eq_(len(folds), 6)

    for fold in folds:
        eq_(fold.train.unique_alleles(), set([fold.allele]))
        eq_(fold.imputed_train.unique_alleles(), set([fold.allele]))
        eq_(fold.test.unique_alleles(), set([fold.allele]))

    models = HYPERPARAMETER_DEFAULTS.models_grid(
        activation=["tanh", "relu"],
        layer_sizes=[[4]],
        embedding_output_dim=[8],
        n_training_epochs=[3])
    print(models)

    df = train_across_models_and_folds(folds, models)
    print(df)
    assert df.test_auc.mean() > 0.6
def test_class1_affinity_predictor_a0205_training_accuracy():
    # Memorize the dataset.
    hyperparameters = dict(
        activation="tanh",
        layer_sizes=[16],
        max_epochs=500,
        early_stopping=False,
        validation_split=0.0,
        locally_connected_layers=[],
        dense_layer_l1_regularization=0.0,
        dropout_probability=0.0)

    # First test a Class1NeuralNetwork, then a Class1AffinityPredictor.
    allele = "HLA-A*02:05"

    df = pandas.read_csv(
        get_path("data_curated", "curated_training_data.csv.bz2"))
    df = df.loc[df.allele == allele]
    df = df.loc[df.peptide.str.len() == 9]
    df = df.loc[df.measurement_type == "quantitative"]
    df = df.loc[df.measurement_source == "kim2014"]

    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(df.peptide.values, df.measurement_value.values)
    ic50_pred = predictor.predict(df.peptide.values)
    ic50_true = df.measurement_value.values
    eq_(len(ic50_pred), len(ic50_true))
    testing.assert_allclose(
        numpy.log(ic50_pred),
        numpy.log(ic50_true),
        rtol=0.2,
        atol=0.2)
def test_run():
    try:
        models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
        hyperparameters_filename = os.path.join(
            models_dir, "hyperparameters.json")
        with open(hyperparameters_filename, "w") as fd:
            json.dump(HYPERPARAMETERS, fd)

        args = [
            "--data",
            get_path("data_curated", "curated_training_data.csv.bz2"),
            "--hyperparameters", hyperparameters_filename,
            "--min-measurements-per-allele", "9000",
            "--out-models-dir", models_dir,
        ]
        print("Running with args: %s" % args)
        train_allele_specific_models_command.run(args)

        result = Class1AffinityPredictor.load(models_dir)
        predictions = result.predict(
            peptides=["SLYNTVATL"], alleles=["HLA-A*02:01"])
        assert_equal(predictions.shape, (1,))
        assert_array_less(predictions, 500)
    finally:
        print("Deleting: %s" % models_dir)
        shutil.rmtree(models_dir)
def test_class1_binding_predictor_A0205_training_accuracy():
    dataset = Dataset.from_csv(get_path(
        "data_combined_iedb_kim2014", "combined_human_class1_dataset.csv"))
    dataset_a0205_all_lengths = dataset.get_allele("HLA-A0205")
    dataset_a0205 = Dataset(
        dataset_a0205_all_lengths._df.loc[
            dataset_a0205_all_lengths._df.peptide.str.len() == 9])

    predictor = Class1BindingPredictor(
        name="A0205",
        embedding_output_dim=32,
        activation="tanh",
        layer_sizes=[64],
        optimizer="adam",
        dropout_probability=0.0)
    predictor.fit_dataset(dataset_a0205, n_training_epochs=1000)

    peptides = dataset_a0205.peptides
    ic50_pred = predictor.predict(peptides)
    ic50_true = dataset_a0205.affinities
    eq_(len(ic50_pred), len(ic50_true))
    testing.assert_allclose(
        np.log(ic50_pred),
        np.log(ic50_true),
        rtol=0.2,
        atol=0.2)
def setup():
    global PAN_ALLELE_PREDICTOR
    startup()
    PAN_ALLELE_PREDICTOR = Class1AffinityPredictor.load(
        get_path("models_class1_pan", "models.combined"),
        optimization_level=0,
    )
def setup():
    global AFFINITY_PREDICTOR
    global CLEAVAGE_PREDICTOR
    global CLEAVAGE_PREDICTOR_NO_FLANKING
    global PRESENTATION_PREDICTOR
    startup()
    AFFINITY_PREDICTOR = Class1AffinityPredictor.load(
        get_path("models_class1_pan", "models.combined"),
        optimization_level=0,
        max_models=1)
    CLEAVAGE_PREDICTOR = Class1ProcessingPredictor.load(
        get_path("models_class1_processing", "models.selected.with_flanks"),
        max_models=1)
    CLEAVAGE_PREDICTOR_NO_FLANKING = Class1ProcessingPredictor.load(
        get_path("models_class1_processing", "models.selected.no_flank"),
        max_models=1)
    PRESENTATION_PREDICTOR = Class1PresentationPredictor.load()
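# Hedged usage sketch (not part of the original setup module): with the
# predictors loaded above, presentation scores can be obtained from
# PRESENTATION_PREDICTOR. This assumes the Class1PresentationPredictor.predict
# API that takes peptide and allele lists; the peptide and allele shown are
# illustrative only.
def example_presentation_scores():
    result_df = PRESENTATION_PREDICTOR.predict(
        peptides=["SLYNTVATL"], alleles=["HLA-A*02:01"])
    print(result_df)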
def run_and_check(n_jobs=0, delete=True, additional_args=[]):
    source_models_dir = get_path("models_class1_pan", "models.combined")
    dest_models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")

    # Save a new predictor that has no percent rank calibration data.
    original_predictor = Class1AffinityPredictor.load(source_models_dir)
    print("Loaded predictor", source_models_dir)
    new_predictor = Class1AffinityPredictor(
        class1_pan_allele_models=original_predictor.class1_pan_allele_models,
        allele_to_sequence=original_predictor.allele_to_sequence,
    )
    new_predictor.save(dest_models_dir)
    print("Saved predictor to", dest_models_dir)

    new_predictor = Class1AffinityPredictor.load(dest_models_dir)
    assert_equal(len(new_predictor.allele_to_percent_rank_transform), 0)

    args = [
        "mhcflurry-calibrate-percentile-ranks",
        "--models-dir", dest_models_dir,
        "--match-amino-acid-distribution-data",
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"),
        "--motif-summary",
        "--num-peptides-per-length", "1000",
        "--allele", "HLA-A*02:01", "HLA-B*07:02",
        "--verbosity", "1",
        "--num-jobs", str(n_jobs),
    ] + additional_args
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    new_predictor = Class1AffinityPredictor.load(dest_models_dir)
    assert_equal(len(new_predictor.allele_to_percent_rank_transform), 2)

    if delete:
        print("Deleting: %s" % dest_models_dir)
        shutil.rmtree(dest_models_dir)
    else:
        print("Not deleting: %s" % dest_models_dir)
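# Hedged follow-up sketch (not part of the original test): after calibration,
# the reloaded predictor exposes percentile ranks through
# predict_to_dataframe, which other tests in this collection also check via
# the prediction_percentile column. Peptide and allele are illustrative only.
def example_check_percentile_ranks(models_dir):
    predictor = Class1AffinityPredictor.load(models_dir)
    df = predictor.predict_to_dataframe(
        peptides=["SLYNTVATL"], alleles=["HLA-A*02:01"])
    assert "prediction_percentile" in df.columns
    print(df)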
def test_class1_neural_network_a0205_training_accuracy():
    # Memorize the dataset.
    hyperparameters = dict(
        activation="tanh",
        layer_sizes=[16],
        max_epochs=500,
        early_stopping=False,
        validation_split=0.0,
        locally_connected_layers=[{
            "filters": 8,
            "activation": "tanh",
            "kernel_size": 3
        }],
        dense_layer_l1_regularization=0.0,
        dropout_probability=0.0)

    # First test a Class1NeuralNetwork, then a Class1AffinityPredictor.
    allele = "HLA-A*02:05"

    df = pandas.read_csv(
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"))
    df = df.loc[df.allele == allele]
    df = df.loc[df.peptide.str.len() == 9]
    df = df.loc[df.measurement_type == "quantitative"]
    df = df.loc[df.measurement_source == "kim2014"]

    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(df.peptide.values, df.measurement_value.values)
    ic50_pred = predictor.predict(df.peptide.values)
    ic50_true = df.measurement_value.values
    eq_(len(ic50_pred), len(ic50_true))
    testing.assert_allclose(
        numpy.log(ic50_pred),
        numpy.log(ic50_true),
        rtol=0.2,
        atol=0.2)

    # Test that a second predictor has the same architecture json.
    # This is important for an optimization we use to re-use predictors of the
    # same architecture at prediction time.
    hyperparameters2 = dict(
        activation="tanh",
        layer_sizes=[16],
        max_epochs=1,
        early_stopping=False,
        validation_split=0.0,
        locally_connected_layers=[{
            "filters": 8,
            "activation": "tanh",
            "kernel_size": 3
        }],
        dense_layer_l1_regularization=0.0,
        dropout_probability=0.0)
    predictor2 = Class1NeuralNetwork(**hyperparameters2)
    predictor2.fit(df.peptide.values, df.measurement_value.values, verbose=0)
    eq_(predictor.network().to_json(), predictor2.network().to_json())
def run_and_check(n_jobs=0):
    models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    hyperparameters_filename = os.path.join(models_dir, "hyperparameters.yaml")
    with open(hyperparameters_filename, "w") as fd:
        # JSON is a subset of YAML, so json.dump produces a valid .yaml file.
        json.dump(HYPERPARAMETERS, fd)

    args = [
        "mhcflurry-class1-train-allele-specific-models",
        "--data",
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"),
        "--hyperparameters", hyperparameters_filename,
        "--allele", "HLA-A*02:01", "HLA-A*03:01",
        "--out-models-dir", models_dir,
        "--num-jobs", str(n_jobs),
    ]
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    # Calibrate percentile ranks
    args = [
        "mhcflurry-calibrate-percentile-ranks",
        "--models-dir", models_dir,
        "--num-peptides-per-length", "10000",
        "--num-jobs", str(n_jobs),
    ]
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    result = Class1AffinityPredictor.load(models_dir)

    predictions = result.predict(
        peptides=["SLYNTVATL"], alleles=["HLA-A*02:01"])
    assert_equal(predictions.shape, (1,))
    assert_array_less(predictions, 1000)

    df = result.predict_to_dataframe(
        peptides=["SLYNTVATL"], alleles=["HLA-A*02:01"])
    print(df)
    assert "prediction_percentile" in df.columns

    print("Deleting: %s" % models_dir)
    shutil.rmtree(models_dir)
def test_imputation():
    imputer = fancyimpute.MICE(
        n_imputations=2, n_burn_in=1, n_nearest_columns=25)
    train_data = (
        mhcflurry.dataset.Dataset.from_csv(
            get_path("data_kim2014", "bdata.2009.mhci.public.1.txt"))
        .get_alleles(["HLA-A0201", "HLA-A0202", "HLA-A0301"]))

    folds = cross_validation_folds(
        train_data,
        n_folds=3,
        imputer=imputer,
        drop_similar_peptides=True,
        alleles=["HLA-A0201", "HLA-A0202"])

    eq_(set(x.allele for x in folds), {"HLA-A0201", "HLA-A0202"})
    eq_(len(folds), 6)

    for fold in folds:
        eq_(fold.train.unique_alleles(), set([fold.allele]))
        eq_(fold.imputed_train.unique_alleles(), set([fold.allele]))
        eq_(fold.test.unique_alleles(), set([fold.allele]))
def run():
    from mhcflurry.amino_acid import COMMON_AMINO_ACIDS

    args = parser.parse_args(sys.argv[1:])
    configure_logging()
    serial_run = not args.cluster_parallelism and args.num_jobs == 0

    if not args.affinity_predictor:
        args.affinity_predictor = get_path(
            "models_class1_pan", "models.combined")
        print("Using downloaded affinity predictor: ", args.affinity_predictor)

    if not args.frequency_matrices:
        args.frequency_matrices = os.path.join(
            args.affinity_predictor, "frequency_matrices.csv.bz2")
    if not args.length_distributions:
        args.length_distributions = os.path.join(
            args.affinity_predictor, "length_distributions.csv.bz2")
    if not args.train_data:
        args.train_data = os.path.join(
            args.affinity_predictor, "train_data.csv.bz2")

    frequency_matrices_df = pandas.read_csv(args.frequency_matrices)
    length_distributions = pandas.read_csv(args.length_distributions)
    train_data = pandas.read_csv(args.train_data)

    alleles = args.alleles
    if alleles:
        print("Using specified alleles, ", *alleles)
    else:
        alleles = frequency_matrices_df.allele.unique()
        if args.max_alleles:
            alleles = alleles[:args.max_alleles]
        print("Using %d alleles" % len(alleles), alleles)

    amino_acids = sorted(COMMON_AMINO_ACIDS)
    distribution = frequency_matrices_df.loc[
        (frequency_matrices_df.cutoff_fraction == 1.0), amino_acids
    ].mean(0)

    normalized_frequency_matrices = frequency_matrices_df.copy()
    normalized_frequency_matrices.loc[:, amino_acids] = (
        normalized_frequency_matrices[amino_acids] / distribution)

    GLOBAL_DATA["args"] = args
    GLOBAL_DATA["normalized_frequency_matrices"] = normalized_frequency_matrices
    GLOBAL_DATA["length_distributions"] = length_distributions
    GLOBAL_DATA["train_data"] = train_data

    artifacts_out = os.path.join(args.out, "artifacts")
    if not os.path.exists(args.out):
        os.mkdir(args.out)
    if not os.path.exists(artifacts_out):
        os.mkdir(artifacts_out)

    tasks = [
        {
            "task_num": i,
            "allele": allele,
            "out_dir": artifacts_out,
        }
        for (i, allele) in enumerate(alleles)
    ]

    jobs = []
    for task in tasks:
        if not jobs or len(jobs[-1]['tasks']) >= args.chunk_size:
            jobs.append({'tasks': []})
        jobs[-1]['tasks'].append(task)

    print("Generated %d tasks, packed into %d jobs" % (len(tasks), len(jobs)))

    worker_pool = None
    start = time.time()

    if serial_run:
        # Serial run.
        print("Running in serial.")
        results = (do_job(**job) for job in jobs)
    elif args.cluster_parallelism:
        # Run using separate processes on an HPC cluster.
        print("Running on cluster.")
        results = cluster_results_from_args(
            args,
            work_function=do_job,
            work_items=jobs,
            constant_data=GLOBAL_DATA,
            input_serialization_method="dill",
            result_serialization_method="pickle",
            clear_constant_data=False)
    else:
        worker_pool = worker_pool_with_gpu_assignments_from_args(args)
        print("Worker pool", worker_pool)
        assert worker_pool is not None

        for task in tasks:
            task['constant_data'] = GLOBAL_DATA

        results = worker_pool.imap_unordered(
            partial(call_wrapped_kwargs, do_job),
            jobs,
            chunksize=1)

    print("Reading results")

    task_results = {}
    for job_result in tqdm.tqdm(results, total=len(jobs)):
        for task_result in job_result:
            task_results[task_result['task_num']] = task_result

    print("Received all results in %0.2f sec" % (time.time() - start))

    artifacts_df = pandas.DataFrame(task_results).T.set_index("task_num")

    length_distributions_out = os.path.join(
        args.out, "length_distributions.csv")
    length_distributions.to_csv(length_distributions_out, index=False)
    print("Wrote: ", length_distributions_out)

    artifacts_summary_out = os.path.join(args.out, "artifacts.csv")
    artifacts_df.to_csv(artifacts_summary_out)
    print("Wrote: ", artifacts_summary_out)

    if worker_pool:
        worker_pool.close()
        worker_pool.join()
def test_small_run():
    base_temp_dir = tempfile.mkdtemp()
    temp_dir = join(base_temp_dir, "models_class1_allele_specific_single")
    mkdir(temp_dir)

    def write_json(payload, filename):
        path = join(temp_dir, filename)
        with open(path, 'w') as fd:
            json.dump(payload, fd)
        return path

    models = HYPERPARAMETER_DEFAULTS.models_grid(
        impute=[False, True],
        activation=["tanh"],
        layer_sizes=[[4], [8]],
        embedding_output_dim=[16],
        dropout_probability=[.25],
        n_training_epochs=[20])

    imputer_args = {
        "imputation_method_name": "mice",
        "n_burn_in": 2,
        "n_imputations": 10,
        "n_nearest_columns": 10,
        "min_observations_per_peptide": 5,
        "min_observations_per_allele": 1000,  # limit the number of alleles
    }

    bdata2009 = downloads.get_path(
        "data_kim2014", "bdata.2009.mhci.public.1.txt")
    bdata_blind = downloads.get_path(
        "data_kim2014", "bdata.2013.mhci.public.blind.1.txt")

    mkdir(join(temp_dir, "models"))

    args = [
        "--model-architectures", write_json(models, "models.json"),
        "--imputer-description", write_json(imputer_args, "imputer.json"),
        "--train-data", bdata2009,
        "--test-data", bdata_blind,
        "--out-cv-results", join(temp_dir, "cv.csv"),
        "--out-production-results", join(temp_dir, "production.csv"),
        "--out-models", join(temp_dir, "models"),
        "--cv-num-folds", "2",
        "--alleles", "HLA-A0201", "HLA-A0301",
        "--verbose",
        "--num-local-threads", "1",
    ]
    print("Running cv_and_train_command with args: %s " % str(args))
    cv_and_train_command.run(args)

    verify_trained_models(base_temp_dir)
        'vector_encoding_name': 'BLOSUM62',
    },
    'random_negative_affinity_max': 50000.0,
    'random_negative_affinity_min': 20000.0,
    'random_negative_constant': 25,
    'random_negative_distribution_smoothing': 0.0,
    'random_negative_match_distribution': True,
    'random_negative_rate': 0.2,
    'random_negative_method': 'by_allele',
    'train_data': {},
    'validation_split': 0.1,
}

ALLELE_TO_SEQUENCE = pandas.read_csv(
    get_path(
        "allele_sequences", "allele_sequences.csv"),
    index_col=0).sequence.to_dict()

TRAIN_DF = pandas.read_csv(
    get_path(
        "data_curated", "curated_training_data.no_mass_spec.csv.bz2"))
TRAIN_DF = TRAIN_DF.loc[TRAIN_DF.allele.isin(ALLELE_TO_SEQUENCE)]
TRAIN_DF = TRAIN_DF.loc[TRAIN_DF.peptide.str.len() >= 8]
TRAIN_DF = TRAIN_DF.loc[TRAIN_DF.peptide.str.len() <= 15]
TRAIN_DF = TRAIN_DF.loc[
    TRAIN_DF.allele.isin(TRAIN_DF.allele.value_counts().iloc[:3].index)
]
from os.path import join

import pypandoc
import pandas
from keras.utils.vis_utils import plot_model
from tabulate import tabulate

from mhcflurry import __version__
from mhcflurry.downloads import get_path
from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor

parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
    "--cv-summary-csv",
    metavar="FILE.csv",
    default=get_path(
        "cross_validation_class1", "summary.all.csv", test_exists=False),
    help="Cross validation scores summary. Default: %(default)s",
)
parser.add_argument(
    "--class1-models-dir",
    metavar="DIR",
    default=get_path(
        "models_class1", "models", test_exists=False),
    help="Class1 models. Default: %(default)s",
)
parser.add_argument(
    "--class1-unselected-models-dir",
    metavar="DIR",
    default=get_path(
        "models_class1_unselected", "models", test_exists=False),
    help="Class1 unselected models. Default: %(default)s",
def run_and_check(n_jobs=0, delete=True, additional_args=[]):
    models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    hyperparameters_filename = os.path.join(models_dir, "hyperparameters.yaml")
    with open(hyperparameters_filename, "w") as fd:
        json.dump(HYPERPARAMETERS_LIST, fd)

    data_df = pandas.read_csv(
        get_path("data_curated", "curated_training_data.no_mass_spec.csv.bz2"))
    selected_data_df = data_df.loc[data_df.allele.str.startswith("HLA-A")]
    selected_data_df.to_csv(
        os.path.join(models_dir, "_train_data.csv"), index=False)

    args = [
        "mhcflurry-class1-train-pan-allele-models",
        "--data", os.path.join(models_dir, "_train_data.csv"),
        "--allele-sequences",
        get_path("allele_sequences", "allele_sequences.csv"),
        "--hyperparameters", hyperparameters_filename,
        "--out-models-dir", models_dir,
        "--num-jobs", str(n_jobs),
        "--num-folds", "2",
        "--verbosity", "1",
    ] + additional_args
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    # Run model selection
    models_dir_selected = tempfile.mkdtemp(
        prefix="mhcflurry-test-models-selected")
    args = [
        "mhcflurry-class1-select-pan-allele-models",
        "--data", os.path.join(models_dir, "train_data.csv.bz2"),
        "--models-dir", models_dir,
        "--out-models-dir", models_dir_selected,
        "--max-models", "1",
        "--num-jobs", str(n_jobs),
    ] + additional_args
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    result = Class1AffinityPredictor.load(
        models_dir_selected, optimization_level=0)
    assert_equal(len(result.neural_networks), 2)
    predictions = result.predict(
        peptides=["SLYNTVATL"], alleles=["HLA-A*02:01"])
    assert_equal(predictions.shape, (1,))
    assert_array_less(predictions, 1000)

    if delete:
        print("Deleting: %s" % models_dir)
        shutil.rmtree(models_dir)
        shutil.rmtree(models_dir_selected)
def test_performance_improves_for_A0205_with_pretraining():
    # Test that pretraining on imputed data improves predictive accuracy
    # after a small number of training epochs.
    dataset = Dataset.from_csv(
        get_path("data_combined_iedb_kim2014",
                 "combined_human_class1_dataset.csv"))
    print("Full dataset: %d pMHC entries" % len(dataset))

    limited_alleles = ["HLA-A0205", "HLA-A0201", "HLA-A0101", "HLA-B0702"]

    # Restrict to just these four alleles.
    dataset = dataset.get_alleles(limited_alleles)
    print("After filtering to %s, # entries: %d" % (
        limited_alleles, len(dataset)))

    a0205_data_without_imputation = dataset.get_allele("HLA-A0205")
    print("Dataset with only A0205, # entries: %d" % (
        len(a0205_data_without_imputation)))

    predictor_without_imputation = Class1BindingPredictor(
        name="A0205-no-impute")

    X_index, ic50_true, sample_weights, _ = (
        a0205_data_without_imputation.kmer_index_encoding())
    assert sample_weights.min() >= 0, sample_weights.min()
    assert sample_weights.max() <= 1, sample_weights.max()
    assert ic50_true.min() >= 0, ic50_true.min()

    predictor_without_imputation.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        n_training_epochs=10)

    ic50_pred_without_imputation = (
        predictor_without_imputation
        .predict_ic50_for_kmer_encoded_array(X_index))
    diff_squared = (ic50_true - ic50_pred_without_imputation) ** 2

    ic50_true_label = ic50_true <= 500
    ic50_pred_label_without_imputation = ic50_pred_without_imputation <= 500
    ic50_label_same_without_imputation = (
        ic50_true_label == ic50_pred_label_without_imputation)

    mse_without_imputation = (
        (diff_squared * sample_weights).sum() / sample_weights.sum())
    accuracy_without_imputation = ((
        ic50_label_same_without_imputation * sample_weights).sum() /
        sample_weights.sum())

    imputed_dataset = dataset.impute_missing_values(MICE(n_imputations=25))
    print("After imputation, dataset for %s has %d entries" % (
        limited_alleles, len(imputed_dataset)))

    a0205_data_with_imputation = imputed_dataset.get_allele("HLA-A0205")
    print("Limited to just A0205, # entries: %d" % (
        len(a0205_data_with_imputation)))

    X_index_imputed, ic50_imputed, sample_weights_imputed, _ = \
        a0205_data_with_imputation.kmer_index_encoding()
    assert sample_weights_imputed.min() >= 0, sample_weights_imputed.min()
    assert sample_weights_imputed.max() <= 1, sample_weights_imputed.max()
    assert ic50_imputed.min() >= 0, ic50_imputed.min()

    predictor_with_imputation = Class1BindingPredictor(name="A0205-impute")
    predictor_with_imputation.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        X_pretrain=X_index_imputed,
        ic50_pretrain=ic50_imputed,
        sample_weights_pretrain=sample_weights_imputed,
        n_training_epochs=10)

    ic50_pred_with_imputation = \
        predictor_with_imputation.predict_ic50_for_kmer_encoded_array(X_index)
    diff_squared = (ic50_true - ic50_pred_with_imputation) ** 2
    mse_with_imputation = (
        (diff_squared * sample_weights).sum() / sample_weights.sum())

    ic50_pred_label_with_imputation = ic50_pred_with_imputation <= 500
    ic50_label_same_with_imputation = (
        ic50_true_label == ic50_pred_label_with_imputation)
    accuracy_with_imputation = ((
        ic50_label_same_with_imputation * sample_weights).sum() /
        sample_weights.sum())

    print("RMS w/out imputation: %f" % (np.sqrt(mse_without_imputation),))
    print("RMS w/ imputation: %f" % (np.sqrt(mse_with_imputation),))
    assert mse_with_imputation < mse_without_imputation, \
        "Expected MSE with imputation (%f) to be < w/o imputation (%f)" % (
            mse_with_imputation, mse_without_imputation)

    print("IC50 <= 500nM accuracy w/out imputation: %f" % (
        accuracy_without_imputation,))
    print("IC50 <= 500nM accuracy w/ imputation: %f" % (
        accuracy_with_imputation,))
    assert accuracy_with_imputation > accuracy_without_imputation
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
    "data",
    metavar="CSV",
    help="Model selection data")
parser.add_argument(
    "--proteome-peptides",
    metavar="CSV",
    required=True,
    help="Proteome peptides")
parser.add_argument(
    "--protein-data",
    metavar="CSV",
    default=get_path(
        "data_references", "uniprot_proteins.csv.bz2", test_exists=False),
    help="Proteome data. Default: %(default)s.")
parser.add_argument(
    "--out",
    metavar="CSV",
    required=True,
    help="File to write")


def run():
    args = parser.parse_args(sys.argv[1:])

    data_df = pandas.read_csv(args.data)
    print("Read", args.data, len(data_df))
    print(data_df)
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 3 07:26:12 2020

Author: Ruby Li

This script includes the MHCflurry model.
"""
from mhcflurry import Class1AffinityPredictor
from mhcflurry.downloads import get_path
import pandas as pd
import numpy as np

data_path = get_path(
    'data_curated', 'curated_training_data.no_mass_spec.csv.bz2')
df = pd.read_csv(data_path)
df = df.loc[(df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 11)]

# HLAs and test_peptide are assumed to be defined earlier in the original
# script; they are not shown in this excerpt.
models = {}
for hla in HLAs:
    new_predictor = Class1AffinityPredictor()
    if df.loc[df.allele == hla].shape[0] > 0:
        single_allele_train_data = df.loc[df.allele == hla].sample(
            21, replace=True)
    else:
        models[hla] = ''
        continue
    model = new_predictor.fit_allele_specific_predictors(
        n_models=1,
        architecture_hyperparameters_list=[{
            "layer_sizes": [16],
            "max_epochs": 5,
            "random_negative_constant": 5,
        }],
        peptides=single_allele_train_data.peptide.values,
        affinities=single_allele_train_data.measurement_value.values,
        allele=hla)
    models[hla] = model

binding_affinity = []
for i in range(len(test_peptide)):
def run():
    args = parser.parse_args(sys.argv[1:])

    df = pandas.read_csv(args.benchmark)
    peptides = df.peptide.unique()
    alleles = set()
    for some in df.hla.unique():
        alleles.update(some.split())

    precomputed_dfs = {}
    if 'netmhcpan4.ba' in args.predictors:
        precomputed_dfs['netmhcpan4.ba'] = load_results(
            get_path("data_mass_spec_benchmark",
                     "predictions/all.netmhcpan4.ba"),
            result_df=pandas.DataFrame(
                dtype=numpy.float32,
                index=peptides,
                columns=["%s affinity" % a for a in alleles])).rename(
                    columns=lambda s: s.replace("affinity", "").strip())
        precomputed_dfs['netmhcpan4.ba'] *= -1  # flip since it's affinities
    if 'netmhcpan4.el' in args.predictors:
        precomputed_dfs['netmhcpan4.el'] = load_results(
            get_path("data_mass_spec_benchmark",
                     "predictions/all.netmhcpan4.el"),
            result_df=pandas.DataFrame(
                dtype=numpy.float32,
                index=peptides,
                columns=["%s score" % a for a in alleles])).rename(
                    columns=lambda s: s.replace("score", "").strip())
    if 'mixmhcpred' in args.predictors:
        precomputed_dfs['mixmhcpred'] = load_results(
            get_path("data_mass_spec_benchmark", "predictions/all.mixmhcpred"),
            result_df=pandas.DataFrame(
                dtype=numpy.float32,
                index=peptides,
                columns=["%s score" % a for a in alleles])).rename(
                    columns=lambda s: s.replace("score", "").strip())

    skip_experiments = set()
    for hla_text, sub_df in tqdm.tqdm(
            df.groupby("hla"), total=df.hla.nunique()):
        hla = hla_text.split()
        for (name, precomputed_df) in precomputed_dfs.items():
            df.loc[sub_df.index, name] = numpy.nan
            prediction_df = pandas.DataFrame(index=sub_df.peptide, dtype=float)
            for allele in hla:
                if allele not in precomputed_df.columns or precomputed_df[
                        allele].isnull().all():
                    print(sub_df.sample_id.unique(), hla)
                    skip_experiments.update(sub_df.sample_id.unique())
                prediction_df[allele] = precomputed_df.loc[
                    prediction_df.index, allele]
            df.loc[sub_df.index, name] = prediction_df.max(
                1, skipna=False).values
            df.loc[sub_df.index, name + "_best_allele"] = prediction_df.idxmax(
                1, skipna=False).values

    if 'netmhcpan4.ba' in args.predictors:
        # unflip the values
        df['netmhcpan4.ba'] *= -1

    print("Skip experiments", skip_experiments)
    print("results")
    print(df)

    df.to_csv(args.out, index=False)
    print("Wrote", args.out)
def run_and_check_with_model_selection(n_jobs=1):
    models_dir1 = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    hyperparameters_filename = os.path.join(
        models_dir1, "hyperparameters.yaml")

    # Include one architecture that has max_epochs = 0. We check that it never
    # gets selected in model selection.
    hyperparameters = [
        deepcopy(HYPERPARAMETERS[0]),
        deepcopy(HYPERPARAMETERS[0]),
    ]
    hyperparameters[-1]["max_epochs"] = 0
    with open(hyperparameters_filename, "w") as fd:
        json.dump(hyperparameters, fd)

    args = [
        "mhcflurry-class1-train-allele-specific-models",
        "--data",
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"),
        "--hyperparameters", hyperparameters_filename,
        "--allele", "HLA-A*02:01", "HLA-A*03:01",
        "--out-models-dir", models_dir1,
        "--num-jobs", str(n_jobs),
        "--held-out-fraction-reciprocal", "10",
        "--n-models", "1",
    ]
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    result = Class1AffinityPredictor.load(models_dir1)
    assert_equal(len(result.neural_networks), 4)

    models_dir2 = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    args = [
        "mhcflurry-class1-select-allele-specific-models",
        "--data",
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"),
        "--exclude-data", models_dir1 + "/train_data.csv.bz2",
        "--out-models-dir", models_dir2,
        "--models-dir", models_dir1,
        "--num-jobs", str(n_jobs),
        "--mse-max-models", "1",
        "--unselected-accuracy-scorer", "combined:mass-spec,mse",
        "--unselected-accuracy-percentile-threshold", "95",
    ]
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    result = Class1AffinityPredictor.load(models_dir2)
    assert_equal(len(result.neural_networks), 2)
    assert_equal(
        len(result.allele_to_allele_specific_models["HLA-A*02:01"]), 1)
    assert_equal(
        len(result.allele_to_allele_specific_models["HLA-A*03:01"]), 1)
    assert_equal(
        result.allele_to_allele_specific_models["HLA-A*02:01"][0]
        .hyperparameters["max_epochs"], 500)
    assert_equal(
        result.allele_to_allele_specific_models["HLA-A*03:01"][0]
        .hyperparameters["max_epochs"], 500)

    print("Deleting: %s" % models_dir1)
    shutil.rmtree(models_dir1)
    print("Deleting: %s" % models_dir2)
    shutil.rmtree(models_dir2)
# Expensive test - not run by nose.

from mhcflurry import train_pan_allele_models_command
from mhcflurry.downloads import get_path
from mhcflurry.allele_encoding import AlleleEncoding

import pandas
import numpy

PRETRAIN_DATA_PATH = get_path(
    "random_peptide_predictions", "predictions.csv.bz2")

FULL_TRAIN_DF = pandas.read_csv(
    get_path("data_curated", "curated_training_data.no_mass_spec.csv.bz2"))
TRAIN_DF = FULL_TRAIN_DF.loc[
    (FULL_TRAIN_DF.peptide.str.len() >= 8) &
    (FULL_TRAIN_DF.peptide.str.len() <= 15)]

ALLELE_SEQUENCES = pandas.read_csv(
    get_path("allele_sequences", "allele_sequences.csv"),
    index_col=0).sequence
ALLELE_SEQUENCES = ALLELE_SEQUENCES.loc[
    ALLELE_SEQUENCES.index.isin(TRAIN_DF.allele)]
TRAIN_DF = TRAIN_DF.loc[TRAIN_DF.allele.isin(ALLELE_SEQUENCES.index)]

FOLDS_DF = pandas.DataFrame(index=TRAIN_DF.index)
FOLDS_DF["fold_0"] = True

HYPERPARAMTERS = {
    'activation': 'tanh',
    'allele_dense_layer_sizes': [],
    'batch_normalization': False,
    'dense_layer_l1_regularization': 0.0,
    'dense_layer_l2_regularization': 0.0,
import time

import pandas

from mhcflurry.allele_encoding import AlleleEncoding
from mhcflurry.amino_acid import BLOSUM62_MATRIX
from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor
from mhcflurry.downloads import get_path

from numpy.testing import assert_equal

from mhcflurry.testing_utils import cleanup, startup
teardown = cleanup
setup = startup

ALLELE_TO_SEQUENCE = pandas.read_csv(
    get_path(
        "allele_sequences", "allele_sequences.csv"),
    index_col=0).sequence.to_dict()

HYPERPARAMETERS = {
    'activation': 'tanh',
    'allele_dense_layer_sizes': [],
    'batch_normalization': False,
    'dense_layer_l1_regularization': 0.0,
    'dense_layer_l2_regularization': 0.0,
    'dropout_probability': 0.5,
    'early_stopping': True,
    'init': 'glorot_uniform',
    'layer_sizes': [4],
    'learning_rate': None,
    'locally_connected_layers': [],
    'loss': 'custom:mse_with_inequalities',
def test_class1_affinity_predictor_a0205_memorize_training_data():
    # Memorize the dataset.
    hyperparameters = dict(
        activation="tanh",
        layer_sizes=[64],
        max_epochs=100,
        early_stopping=False,
        validation_split=0.0,
        locally_connected_layers=[],
        dense_layer_l1_regularization=0.0,
        dropout_probability=0.0)

    allele = "HLA-A*02:05"

    df = pandas.read_csv(
        get_path(
            "data_curated", "curated_training_data.affinity.csv.bz2"))
    df = df.loc[df.allele == allele]
    df = df.loc[df.peptide.str.len() == 9]
    df = df.loc[df.measurement_type == "quantitative"]
    df = df.loc[df.measurement_source == "kim2014"]

    predictor = Class1AffinityPredictor()
    predictor.fit_allele_specific_predictors(
        n_models=2,
        architecture_hyperparameters_list=[hyperparameters],
        allele=allele,
        peptides=df.peptide.values,
        affinities=df.measurement_value.values,
        verbose=0,
    )
    predictor.calibrate_percentile_ranks(num_peptides_per_length=1000)
    ic50_pred = predictor.predict(df.peptide.values, allele=allele)
    ic50_true = df.measurement_value.values
    eq_(len(ic50_pred), len(ic50_true))
    testing.assert_allclose(
        numpy.log(ic50_pred),
        numpy.log(ic50_true),
        rtol=0.2,
        atol=0.2)

    ic50_pred_df = predictor.predict_to_dataframe(
        df.peptide.values, allele=allele)
    print(ic50_pred_df)
    assert 'prediction_percentile' in ic50_pred_df.columns
    assert ic50_pred_df.prediction_percentile.isnull().sum() == 0

    ic50_pred_df2 = predictor.predict_to_dataframe(
        df.peptide.values,
        allele=allele,
        include_individual_model_predictions=True)
    print(ic50_pred_df2)

    # Test an unknown allele
    print("Starting unknown allele check")
    eq_(predictor.supported_alleles, [allele])

    ic50_pred = predictor.predict(
        df.peptide.values,
        allele="HLA-A*02:01",
        throw=False)
    assert numpy.isnan(ic50_pred).all()

    assert_raises(
        ValueError,
        predictor.predict,
        df.peptide.values,
        allele="HLA-A*02:01")

    eq_(predictor.supported_alleles, [allele])
    assert_raises(
        ValueError,
        predictor.predict,
        ["AAAAA"],  # too short
        allele=allele)
    assert_raises(
        ValueError,
        predictor.predict,
        ["AAAAAAAAAAAAAAAAAAAA"],  # too long
        allele=allele)

    ic50_pred = predictor.predict(
        ["AAAAA", "AAAAAAAAA", "AAAAAAAAAAAAAAAAAAAA"],
        allele=allele,
        throw=False)
    assert numpy.isnan(ic50_pred[0])
    assert not numpy.isnan(ic50_pred[1])
    assert numpy.isnan(ic50_pred[2])
def test_a1_known_epitopes_in_newly_trained_model():
    allele = "HLA-A*01:01"
    df = pandas.read_csv(
        get_path(
            "data_curated", "curated_training_data.affinity.csv.bz2"))
    df = df.loc[
        (df.allele == allele) &
        (df.peptide.str.len() >= 8) &
        (df.peptide.str.len() <= 15)
    ]

    hyperparameters = {
        "max_epochs": 100,
        "patience": 10,
        "early_stopping": True,
        "validation_split": 0.2,
        "random_negative_rate": 0.0,
        "random_negative_constant": 25,
        "peptide_amino_acid_encoding": "BLOSUM62",
        "use_embedding": False,
        "kmer_size": 15,
        "batch_normalization": False,
        "locally_connected_layers": [
            {
                "filters": 8,
                "activation": "tanh",
                "kernel_size": 3
            }
        ],
        "activation": "relu",
        "output_activation": "sigmoid",
        "layer_sizes": [32],
        "random_negative_affinity_min": 20000.0,
        "random_negative_affinity_max": 50000.0,
        "dense_layer_l1_regularization": 0.001,
        "dropout_probability": 0.0,
    }

    predictor = Class1AffinityPredictor()
    predictor.fit_allele_specific_predictors(
        n_models=2,
        architecture_hyperparameters_list=[hyperparameters],
        allele=allele,
        peptides=df.peptide.values,
        affinities=df.measurement_value.values,
        verbose=0,
    )

    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor)

    models_dir = tempfile.mkdtemp("_models")
    print(models_dir)
    predictor.save(models_dir)
    predictor2 = Class1AffinityPredictor.load(models_dir)
    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor2)
    shutil.rmtree(models_dir)

    predictor3 = Class1AffinityPredictor(
        allele_to_allele_specific_models={
            allele: [predictor.allele_to_allele_specific_models[allele][0]]
        })
    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor3)

    models_dir = tempfile.mkdtemp("_models")
    print(models_dir)
    predictor3.save(models_dir)
    predictor4 = Class1AffinityPredictor.load(models_dir)
    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor4)
    shutil.rmtree(models_dir)
import pandas
import logomaker

from matplotlib import pyplot

from mhcflurry.downloads import get_path
from mhcflurry.amino_acid import COMMON_AMINO_ACIDS

AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS)

parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
    "--class1-models-dir-with-ms",
    metavar="DIR",
    default=get_path(
        "models_class1_pan", "models.with_mass_spec", test_exists=False),
    help="Class1 models. Default: %(default)s",
)
parser.add_argument(
    "--class1-models-dir-no-ms",
    metavar="DIR",
    default=get_path(
        "models_class1_pan", "models.no_mass_spec", test_exists=False),
    help="Class1 models. Default: %(default)s",
)
parser.add_argument(
    "--logo-cutoff",
    default=0.01,
    type=float,
    help="Fraction of top to use for motifs",
)
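# Hedged illustration (not part of the original script): given a per-position
# amino acid frequency matrix (rows assumed to be peptide positions, columns
# the 20 amino acids in AMINO_ACIDS), a sequence logo can be drawn with
# logomaker roughly as follows. The function and variable names here are
# illustrative, not from the original code.
def example_plot_logo(frequency_matrix_df, title="motif"):
    fig, ax = pyplot.subplots(figsize=(8, 3))
    logomaker.Logo(frequency_matrix_df[AMINO_ACIDS], ax=ax)
    ax.set_title(title)
    return fig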