def setup():
    """Initialize the module-level predictors from the downloaded model releases."""
    global ALLELE_SPECIFIC_PREDICTOR, PAN_ALLELE_PREDICTOR
    startup()
    allele_specific_dir = get_path("models_class1", "models")
    pan_allele_dir = get_path("models_class1_pan", "models.with_mass_spec")
    ALLELE_SPECIFIC_PREDICTOR = Class1AffinityPredictor.load(allele_specific_dir)
    PAN_ALLELE_PREDICTOR = Class1AffinityPredictor.load(pan_allele_dir)
def setup():
    """Populate the PREDICTORS registry with both predictor flavors."""
    global PREDICTORS
    startup()
    PREDICTORS = {}
    PREDICTORS['allele-specific'] = Class1AffinityPredictor.load(
        get_path("models_class1", "models"))
    PREDICTORS['pan-allele'] = Class1AffinityPredictor.load(
        get_path("models_class1_pan", "models.combined"))
def setup():
    """Load the combined pan-allele predictor into the module-level global."""
    global PAN_ALLELE_PREDICTOR
    startup()
    models_dir = get_path("models_class1_pan", "models.combined")
    # optimization_level=0 presumably skips load-time model optimization
    # (e.g. merging) — confirm against the mhcflurry docs if relied upon.
    PAN_ALLELE_PREDICTOR = Class1AffinityPredictor.load(
        models_dir, optimization_level=0)
def run_and_check(n_jobs=0, delete=True, additional_args=None):
    """
    End-to-end check of the mhcflurry-calibrate-percentile-ranks CLI.

    Copies the downloaded pan-allele predictor into a temp directory with
    its percent-rank calibration stripped, runs the calibration command for
    two alleles, and verifies calibration data appears for exactly those two.

    Parameters
    ----------
    n_jobs : int
        Value passed to --num-jobs.
    delete : bool
        Whether to remove the temporary models directory afterwards.
    additional_args : sequence of str, optional
        Extra CLI arguments appended to the command. Defaults to none.
        (Replaces the original mutable-list default, which is shared
        across calls in Python.)
    """
    additional_args = [] if additional_args is None else list(additional_args)
    source_models_dir = get_path("models_class1_pan", "models.combined")
    dest_models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")

    # Save a new predictor that has no percent rank calibration data.
    original_predictor = Class1AffinityPredictor.load(source_models_dir)
    print("Loaded predictor", source_models_dir)
    new_predictor = Class1AffinityPredictor(
        class1_pan_allele_models=original_predictor.class1_pan_allele_models,
        allele_to_sequence=original_predictor.allele_to_sequence,
    )
    new_predictor.save(dest_models_dir)
    print("Saved predictor to", dest_models_dir)
    new_predictor = Class1AffinityPredictor.load(dest_models_dir)
    assert_equal(len(new_predictor.allele_to_percent_rank_transform), 0)

    args = [
        "mhcflurry-calibrate-percentile-ranks",
        "--models-dir", dest_models_dir,
        "--match-amino-acid-distribution-data", get_path(
            "data_curated", "curated_training_data.affinity.csv.bz2"),
        "--motif-summary",
        "--num-peptides-per-length", "1000",
        "--allele", "HLA-A*02:01", "HLA-B*07:02",
        "--verbosity", "1",
        "--num-jobs", str(n_jobs),
    ] + additional_args
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    # Calibration must now exist for exactly the two requested alleles.
    new_predictor = Class1AffinityPredictor.load(dest_models_dir)
    assert_equal(len(new_predictor.allele_to_percent_rank_transform), 2)

    if delete:
        print("Deleting: %s" % dest_models_dir)
        shutil.rmtree(dest_models_dir)
    else:
        print("Not deleting: %s" % dest_models_dir)
def initialise_prediction():
    """
    Enable mhcflurry prediction by loading the default predictor into the
    module-level ``predictor``.

    :return: None
    """
    global predictor
    # Prevents warnings. No, I'm not building from source.
    # Must be set before mhcflurry (and hence TensorFlow) is imported.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    from mhcflurry import Class1AffinityPredictor
    predictor = Class1AffinityPredictor.load()
def __init__(self, alleles, default_peptide_lengths=None,
             predictor=None, models_path=None):
    """
    Parameters
    ----------
    alleles : list of str
    default_peptide_lengths : list of int, optional
        Defaults to [9]. A None default is used instead of a literal
        list to avoid the shared-mutable-default-argument pitfall.
    predictor : mhcflurry.Class1AffinityPredictor (optional)
        MHCflurry predictor to use
    models_path : string
        Models dir to use if predictor argument is None

    Raises
    ------
    UnsupportedAllele
        If any requested allele is not in the predictor's supported set.
    """
    if default_peptide_lengths is None:
        default_peptide_lengths = [9]
    # moving import here since the mhcflurry package imports
    # Keras and its backend (either Theano or TF) which end up
    # slowing down responsive for any CLI application using MHCtools
    from mhcflurry import Class1AffinityPredictor
    BasePredictor.__init__(
        self,
        alleles=alleles,
        default_peptide_lengths=default_peptide_lengths,
        min_peptide_length=8,
        max_peptide_length=15)
    if predictor:
        self.predictor = predictor
    elif models_path:
        logging.info("Loading MHCflurry models from %s" % models_path)
        self.predictor = Class1AffinityPredictor.load(models_path)
    else:
        self.predictor = Class1AffinityPredictor.load()

    # relying on BasePredictor and MHCflurry to both normalize
    # allele names the same way using mhcnames
    for allele in self.alleles:
        if allele not in self.predictor.supported_alleles:
            raise UnsupportedAllele(allele)
def predict(self, peptides, alleles=None, binary=False, **kwargs):
    """
    Predict MHC binding affinities for the given peptides and alleles.

    Parameters
    ----------
    peptides : peptide or list of peptides
    alleles : list of str, optional
        Alleles to predict for; defaults to all supported alleles.
        Unsupported alleles are silently dropped.
    binary : bool
        If True, report 1.0 for binders (affinity <= 500) and 0.0
        otherwise; if False, report the raw predicted affinity.

    Returns
    -------
    EpitopePredictionResult
        Multi-indexed DataFrame ((Seq, Method) index, alleles as columns).
    """
    # test whether one peptide or a list
    if not isinstance(peptides, list):
        peptides = [peptides]

    # if no alleles are specified do predictions for all supported alleles
    if alleles is None:
        alleles = self.supportedAlleles
    else:
        # filter for supported alleles; materialize to a list so the
        # result can be iterated safely (py3 filter() is a one-shot
        # iterator).
        alleles = [a for a in alleles if a in self.supportedAlleles]
    alleles = self.convert_alleles(alleles)

    # test mhcflurry models are available => download if not
    # BUG FIX: the original tested "p is not 0", which compares the Popen
    # object itself to 0 and is always true. Wait for the process and
    # check its real exit status instead.
    proc = subprocess.Popen(
        ['mhcflurry-downloads', 'path', 'models_class1'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    if proc.wait() != 0:
        subprocess.call(['mhcflurry-downloads', 'fetch', 'models_class1'])

    # load model
    predictor = Class1AffinityPredictor.load()

    # predict and assign binding affinities
    result = {}
    for a in alleles:
        allele_repr = self.revert_allele_repr(a)
        result[allele_repr] = {}
        for p in peptides:
            seq = str(p)
            binding_affinity = predictor.predict(
                allele=a, peptides=[seq])[0]
            if binary:
                # 500 nM is the conventional binder threshold.
                result[allele_repr][p] = (
                    1.0 if binding_affinity <= 500 else 0.0)
            else:
                result[allele_repr][p] = binding_affinity

    # create EpitopePredictionResult object. This is a multi-indexed
    # DataFrame with Peptide and Method as multi-index and alleles as
    # columns.
    df_result = EpitopePredictionResult.from_dict(result)
    df_result.index = pandas.MultiIndex.from_tuples(
        [tuple((i, self.name)) for i in df_result.index],
        names=['Seq', 'Method'])
    return df_result
def predict(self, sequence=None, peptides=None, length=11, overlap=1,
            allele='HLA-A0101', name='', **kwargs):
    """Uses mhcflurry python classes for prediction.

    Parameters
    ----------
    sequence : str, optional
        Protein sequence to fragment into peptides when ``peptides`` is
        not provided.
    peptides : list of str, optional
        Peptides to score directly; skips fragment generation.
    length : int
        Fragment length used when deriving peptides from ``sequence``.
    overlap : int
        Fragment overlap used when deriving peptides from ``sequence``.
    allele : str
        Allele name passed to the predictor.
    name : str
        Label forwarded to prepareData.

    Returns
    -------
    pandas.DataFrame
        Prediction results (also stored on ``self.data``).
    """
    self.sequence = sequence
    from mhcflurry import Class1AffinityPredictor
    predictor = Class1AffinityPredictor.load()
    # BUG FIX: "peptides == None" replaced with the correct identity test
    # "peptides is None" (== can be overloaded and is unidiomatic here).
    if peptides is None:
        peptides, s = peptutils.create_fragments(
            seq=sequence, length=length, overlap=overlap)
    df = predictor.predict_to_dataframe(peptides=peptides, allele=allele)
    df = self.prepareData(df, name, allele)
    self.data = df
    return df
def run_and_check(n_jobs=0):
    """
    Train a tiny allele-specific model via the CLI, calibrate percentile
    ranks, and sanity-check the resulting predictor.
    """
    out_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    hyperparameters_path = os.path.join(out_dir, "hyperparameters.yaml")
    with open(hyperparameters_path, "w") as handle:
        json.dump(HYPERPARAMETERS, handle)

    train_args = [
        "mhcflurry-class1-train-allele-specific-models",
        "--data",
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"),
        "--hyperparameters", hyperparameters_path,
        "--allele", "HLA-A*02:01", "HLA-A*03:01",
        "--out-models-dir", out_dir,
        "--num-jobs", str(n_jobs),
    ]
    print("Running with args: %s" % train_args)
    subprocess.check_call(train_args)

    # Calibrate percentile ranks
    calibrate_args = [
        "mhcflurry-calibrate-percentile-ranks",
        "--models-dir", out_dir,
        "--num-peptides-per-length", "10000",
        "--num-jobs", str(n_jobs),
    ]
    print("Running with args: %s" % calibrate_args)
    subprocess.check_call(calibrate_args)

    # The trained predictor should produce a single strong-binder score
    # for a known HLA-A*02:01 epitope, with percentile ranks attached.
    result = Class1AffinityPredictor.load(out_dir)
    predictions = result.predict(
        peptides=["SLYNTVATL"], alleles=["HLA-A*02:01"])
    assert_equal(predictions.shape, (1,))
    assert_array_less(predictions, 1000)
    df = result.predict_to_dataframe(
        peptides=["SLYNTVATL"], alleles=["HLA-A*02:01"])
    print(df)
    assert "prediction_percentile" in df.columns

    print("Deleting: %s" % out_dir)
    shutil.rmtree(out_dir)
def do_predictions_mhcflurry(work_item_dicts, constant_data=None): """ Each dict of work items should have keys: work_item_num, peptides, alleles """ # This may run on the cluster in a way that misses all top level imports, # so we have to re-import everything here. import time from mhcflurry.encodable_sequences import EncodableSequences from mhcflurry import Class1AffinityPredictor if constant_data is None: constant_data = GLOBAL_DATA args = constant_data['args'] assert args.predictor == "mhcflurry" assert constant_data['cols'] == ["affinity"] predictor = Class1AffinityPredictor.load(args.mhcflurry_models_dir) results = [] for (i, d) in enumerate(work_item_dicts): work_item_num = d['work_item_num'] peptides = d['peptides'] alleles = d['alleles'] print("Processing work item", i + 1, "of", len(work_item_dicts)) result = {} results.append((work_item_num, result)) start = time.time() peptides = EncodableSequences.create(peptides) for (i, allele) in enumerate(alleles): print("Processing allele %d / %d: %0.2f sec elapsed" % (i + 1, len(alleles), time.time() - start)) for col in ["affinity"]: result["%s %s" % (allele, col)] = predictor.predict( peptides=peptides, allele=allele, throw=False, model_kwargs={ 'batch_size': args.mhcflurry_batch_size, }).astype(constant_data['args'].result_dtype) print("Done predicting in", time.time() - start, "sec") return results
def test_mhcflurry():
    """Check the MHCtools wrapper agrees with direct mhcflurry predictions."""
    wrapper = MHCflurry(alleles=[DEFAULT_ALLELE])
    binding_predictions = wrapper.predict_subsequences(
        protein_sequence_dict, peptide_lengths=[9])
    eq_(4, len(binding_predictions),
        "Expected 4 binding predictions from %s" % (binding_predictions,))

    prediction_scores = {
        (bp.peptide, bp.allele): bp.affinity for bp in binding_predictions
    }

    predictor = Class1AffinityPredictor.load()
    # test one prediction at a time to make sure there's no peptide/allele mixup
    for (peptide, allele), affinity in prediction_scores.items():
        prediction = predictor.predict([peptide], allele=allele)
        assert len(prediction) == 1
        # we've seen results differ a bit so doing an approximate check,
        # not an error condition
        testing.assert_almost_equal(prediction[0], affinity, decimal=0)
def setup():
    """Load affinity, cleavage (with/without flanks) and presentation predictors."""
    global AFFINITY_PREDICTOR
    global CLEAVAGE_PREDICTOR
    global CLEAVAGE_PREDICTOR_NO_FLANKING
    global PRESENTATION_PREDICTOR
    startup()
    # max_models=1 / optimization_level=0 limit what gets loaded —
    # presumably to keep these tests fast; confirm if behavior matters.
    AFFINITY_PREDICTOR = Class1AffinityPredictor.load(
        get_path("models_class1_pan", "models.combined"),
        optimization_level=0,
        max_models=1)
    CLEAVAGE_PREDICTOR = Class1ProcessingPredictor.load(
        get_path("models_class1_processing", "models.selected.with_flanks"),
        max_models=1)
    CLEAVAGE_PREDICTOR_NO_FLANKING = Class1ProcessingPredictor.load(
        get_path("models_class1_processing", "models.selected.no_flank"),
        max_models=1)
    PRESENTATION_PREDICTOR = Class1PresentationPredictor.load()
def test_merge():
    """Merging all pan-allele networks into one should preserve predictions."""
    assert len(PAN_ALLELE_PREDICTOR.class1_pan_allele_models) > 1

    # Random 9- and 10-mers, shuffled, with alleles sampled to match.
    peptides = random_peptides(100, length=9)
    peptides.extend(random_peptides(100, length=10))
    peptides = pandas.Series(peptides).sample(frac=1.0)
    alleles = pandas.Series(
        ["HLA-A*03:01", "HLA-B*57:01", "HLA-C*02:01"]
    ).sample(n=len(peptides), replace=True)

    before = PAN_ALLELE_PREDICTOR.predict(peptides=peptides, alleles=alleles)

    merged_network = Class1NeuralNetwork.merge(
        PAN_ALLELE_PREDICTOR.class1_pan_allele_models)
    merged_predictor = Class1AffinityPredictor(
        allele_to_sequence=PAN_ALLELE_PREDICTOR.allele_to_sequence,
        class1_pan_allele_models=[merged_network],
    )
    after = merged_predictor.predict(peptides=peptides, alleles=alleles)

    numpy.testing.assert_allclose(before, after, atol=0.1)
def predict(self, input_file, allele, epitope_length,
            iedb_executable_path, iedb_retries):
    """
    Run MHCflurry over a numbered-FASTA style input file.

    Header lines of the form ">NN" set the current sequence number; every
    other line is fragmented into epitopes and scored for ``allele``.
    (iedb_executable_path and iedb_retries are accepted for interface
    compatibility and unused here.)

    Returns
    -------
    tuple (pandas.DataFrame, str)
        The concatenated prediction rows and the literal tag 'pandas'.
    """
    predictor = Class1AffinityPredictor.load()
    # Collect per-sequence frames and concatenate once at the end:
    # DataFrame.append was deprecated (removed in pandas 2.x) and calling
    # it in a loop is quadratic.
    frames = []
    # Initialize so a malformed file (data before any ">NN" header) yields
    # a None seq_num column instead of a NameError.
    seq_num = None
    for line in input_file:
        match = re.search('^>([0-9]+)$', line)
        if match:
            seq_num = match.group(1)
        else:
            epitopes = self.determine_neoepitopes(
                line.rstrip(), epitope_length)
            df = predictor.predict_to_dataframe(
                allele=allele, peptides=epitopes)
            df['seq_num'] = seq_num
            df['start'] = df.index + 1
            df.rename(columns={
                'prediction': 'ic50',
                'prediction_percentile': 'percentile'
            }, inplace=True)
            frames.append(df)
    results = pd.concat(frames) if frames else pd.DataFrame()
    return (results, 'pandas')
def test_class1_affinity_predictor_a0205_memorize_training_data():
    """
    Train a small allele-specific predictor that memorizes its training
    data, then check prediction accuracy, percentile ranks, unknown-allele
    handling and peptide-length validation.
    """
    # Memorize the dataset.
    # No early stopping / validation split and 100 epochs: the network is
    # intended to fit the training data closely.
    hyperparameters = dict(
        activation="tanh",
        layer_sizes=[64],
        max_epochs=100,
        early_stopping=False,
        validation_split=0.0,
        locally_connected_layers=[],
        dense_layer_l1_regularization=0.0,
        dropout_probability=0.0)

    allele = "HLA-A*02:05"

    # Restrict to quantitative kim2014 9-mers for this allele.
    df = pandas.read_csv(
        get_path(
            "data_curated", "curated_training_data.affinity.csv.bz2"))
    df = df.loc[
        df.allele == allele
    ]
    df = df.loc[
        df.peptide.str.len() == 9
    ]
    df = df.loc[
        df.measurement_type == "quantitative"
    ]
    df = df.loc[
        df.measurement_source == "kim2014"
    ]

    predictor = Class1AffinityPredictor()
    predictor.fit_allele_specific_predictors(
        n_models=2,
        architecture_hyperparameters_list=[hyperparameters],
        allele=allele,
        peptides=df.peptide.values,
        affinities=df.measurement_value.values,
        verbose=0,
    )
    predictor.calibrate_percentile_ranks(num_peptides_per_length=1000)
    ic50_pred = predictor.predict(df.peptide.values, allele=allele)
    ic50_true = df.measurement_value.values
    eq_(len(ic50_pred), len(ic50_true))
    # Compare in log space since affinities span orders of magnitude.
    testing.assert_allclose(
        numpy.log(ic50_pred),
        numpy.log(ic50_true),
        rtol=0.2,
        atol=0.2)

    # Percentile ranks must be present and fully populated.
    ic50_pred_df = predictor.predict_to_dataframe(
        df.peptide.values, allele=allele)
    print(ic50_pred_df)
    assert 'prediction_percentile' in ic50_pred_df.columns
    assert ic50_pred_df.prediction_percentile.isnull().sum() == 0

    ic50_pred_df2 = predictor.predict_to_dataframe(
        df.peptide.values,
        allele=allele,
        include_individual_model_predictions=True)
    print(ic50_pred_df2)

    # Test an unknown allele
    print("Starting unknown allele check")
    eq_(predictor.supported_alleles, [allele])
    # With throw=False an unsupported allele yields all-NaN predictions.
    ic50_pred = predictor.predict(
        df.peptide.values,
        allele="HLA-A*02:01",
        throw=False)
    assert numpy.isnan(ic50_pred).all()

    # Without throw=False the same call must raise.
    assert_raises(
        ValueError,
        predictor.predict,
        df.peptide.values,
        allele="HLA-A*02:01")

    eq_(predictor.supported_alleles, [allele])
    # Out-of-range peptide lengths raise by default...
    assert_raises(
        ValueError,
        predictor.predict,
        ["AAAAA"],  # too short
        allele=allele)
    assert_raises(
        ValueError,
        predictor.predict,
        ["AAAAAAAAAAAAAAAAAAAA"],  # too long
        allele=allele)
    # ...and become NaN when throw=False, leaving valid peptides scored.
    ic50_pred = predictor.predict(
        ["AAAAA", "AAAAAAAAA", "AAAAAAAAAAAAAAAAAAAA"],
        allele=allele,
        throw=False)
    assert numpy.isnan(ic50_pred[0])
    assert not numpy.isnan(ic50_pred[1])
    assert numpy.isnan(ic50_pred[2])
def setup():
    """Reload the default downloaded predictor and turn on debug logging."""
    global DOWNLOADED_PREDICTOR
    startup()
    predictor = Class1AffinityPredictor.load()
    DOWNLOADED_PREDICTOR = predictor
    logging.basicConfig(level=logging.DEBUG)
def check_allele_valid(self, allele): valid_alleles = self.valid_allele_names() if allele not in valid_alleles: sys.exit( "Allele %s not valid for method %s. Run `pvacseq valid_alleles %s` for a list of valid allele names." % (allele, self.__class__.__name__, self.__class__.__name__)) class MHCI(PredictionClass, metaclass=ABCMeta): @property def needs_epitope_length(self): return True mhcflurry_predictor = Class1AffinityPredictor.load() class MHCflurry(MHCI): def valid_allele_names(self): return mhcflurry_predictor.supported_alleles def check_length_valid_for_allele(self, length, allele): return True def valid_lengths_for_allele(self, allele): return [8, 9, 10, 11, 12, 13, 14] def determine_neoepitopes(self, sequence, length): epitopes = [] for i in range(0, len(sequence) - length + 1):
# Test fixtures: a module-level predictor loaded from the default
# downloaded models, refreshed in setup() and released in teardown().
# BUG FIX: `logging` is used in setup() below but was never imported,
# which made setup() raise NameError at runtime.
import logging
import sys
import traceback

import numpy
import pandas

# Seed before importing mhcflurry so any import-time randomness is fixed.
numpy.random.seed(0)

from mhcflurry import Class1AffinityPredictor
from mhcflurry.downloads import get_path
from mhcflurry.testing_utils import cleanup, startup

from nose.tools import eq_, assert_raises
from numpy import testing

DOWNLOADED_PREDICTOR = Class1AffinityPredictor.load()


def setup():
    """Reload the downloaded predictor and enable debug logging."""
    global DOWNLOADED_PREDICTOR
    startup()
    DOWNLOADED_PREDICTOR = Class1AffinityPredictor.load()
    logging.basicConfig(level=logging.DEBUG)


def teardown():
    """Drop the predictor reference and clean up backend state."""
    global DOWNLOADED_PREDICTOR
    DOWNLOADED_PREDICTOR = None
    cleanup()
This script includes MHCflurry model """ from mhcflurry import Class1AffinityPredictor from mhcflurry.downloads import get_path import pandas as pd import numpy as np data_path = get_path('data_curated', 'curated_training_data.no_mass_spec.csv.bz2') df = pandas.read_csv(data_path) df = df.loc[(df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 11)] models = {} for hla in HLAs: new_predictor = Class1AffinityPredictor() if(df.loc[df.allele == hla].shape[0]>0): single_allele_train_data = df.loc[df.allele == hla].sample(21, replace = True) else: models[hla] = '' continue model = new_predictor.fit_allele_specific_predictors(n_models=1,architecture_hyperparameters_list=[{"layer_sizes": [16],"max_epochs": 5,"random_negative_constant": 5,}],peptides=single_allele_train_data.peptide.values,affinities=single_allele_train_data.measurement_value.values,allele="HLA-B*57:01") models[hla] = model binding_affinity = [] for i in range(len(test_peptide)): ba = float('inf') for hla in test_onehot[i]: if (hla not in models.keys()): if(df.loc[df.allele == hla].shape[0]>0): single_allele_train_data = df.loc[df.allele == hla].sample(21, replace = True)
def run_and_check(n_jobs=0, delete=True, additional_args=None):
    """
    End-to-end test: train pan-allele models via the CLI, then run model
    selection and sanity-check the selected predictor.

    Parameters
    ----------
    n_jobs : int
        Value passed to --num-jobs for both commands.
    delete : bool
        Whether to remove the temporary model directories afterwards.
    additional_args : sequence of str, optional
        Extra CLI arguments appended to both commands. Defaults to none.
        (Replaces the original mutable-list default.)
    """
    additional_args = [] if additional_args is None else list(additional_args)
    models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    hyperparameters_filename = os.path.join(
        models_dir, "hyperparameters.yaml")
    with open(hyperparameters_filename, "w") as fd:
        json.dump(HYPERPARAMETERS_LIST, fd)

    # Train only on HLA-A data to keep the run small.
    data_df = pandas.read_csv(
        get_path("data_curated", "curated_training_data.no_mass_spec.csv.bz2"))
    selected_data_df = data_df.loc[data_df.allele.str.startswith("HLA-A")]
    selected_data_df.to_csv(
        os.path.join(models_dir, "_train_data.csv"), index=False)

    args = [
        "mhcflurry-class1-train-pan-allele-models",
        "--data", os.path.join(models_dir, "_train_data.csv"),
        "--allele-sequences",
        get_path("allele_sequences", "allele_sequences.csv"),
        "--hyperparameters", hyperparameters_filename,
        "--out-models-dir", models_dir,
        "--num-jobs", str(n_jobs),
        "--num-folds", "2",
        "--verbosity", "1",
    ] + additional_args
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    # Run model selection
    models_dir_selected = tempfile.mkdtemp(
        prefix="mhcflurry-test-models-selected")
    args = [
        "mhcflurry-class1-select-pan-allele-models",
        "--data", os.path.join(models_dir, "train_data.csv.bz2"),
        "--models-dir", models_dir,
        "--out-models-dir", models_dir_selected,
        "--max-models", "1",
        "--num-jobs", str(n_jobs),
    ] + additional_args
    print("Running with args: %s" % args)
    subprocess.check_call(args)

    result = Class1AffinityPredictor.load(
        models_dir_selected, optimization_level=0)
    assert_equal(len(result.neural_networks), 2)
    predictions = result.predict(
        peptides=["SLYNTVATL"], alleles=["HLA-A*02:01"])
    assert_equal(predictions.shape, (1,))
    assert_array_less(predictions, 1000)

    if delete:
        print("Deleting: %s" % models_dir)
        shutil.rmtree(models_dir)
        shutil.rmtree(models_dir_selected)
def valid_allele_names(self):
    """Return the allele names supported by the default MHCflurry predictor."""
    return Class1AffinityPredictor.load().supported_alleles
def setup():
    """(Re)load the default downloaded class I affinity predictor."""
    global DOWNLOADED_PREDICTOR
    startup()
    predictor = Class1AffinityPredictor.load()
    DOWNLOADED_PREDICTOR = predictor
# Analysis script preamble: load the Abelin mass-spec benchmark data and a
# no-mass-spec MHCflurry predictor, then prepare the frame for PPV plotting.
import numpy as np
import pandas as pd
from pathlib import Path
from mhcflurry import Class1AffinityPredictor
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

# Load data and models - change path as necessary
DATA = Path("./data")
ms_file = DATA / "abelin_peptides.mhcflurry_no_mass_spec.csv"
# Hard-coded local mhcflurry downloads dir (version 1.4.0).
MODELS = Path("/Users/haemin/Library/Application Support/mhcflurry/4/1.4.0")
no_ms_model = MODELS / "models_class1_selected_no_mass_spec/models"
no_ms_predictor = Class1AffinityPredictor.load(no_ms_model)

# Read data file and initial cleanup
ms = pd.read_csv(ms_file)
# Existing "mhcflurry" column holds the older predictions; keep it as
# mhcflurry2 and add a placeholder (-1) column to be filled with new
# mhcflurry4 predictions below.
ms = ms.rename(columns={"mhcflurry": "mhcflurry2"})
ms["mhcflurry4"] = np.full_like(ms.mhcflurry2, -1)
# Keep only rows whose allele the predictor supports.
ms = ms.loc[ms.allele.isin(no_ms_predictor.supported_alleles), :]
# Bucket all peptides longer than 12 residues into a single "13" bin.
ms["peptide_len"] = ms.peptide.str.len()
ms.loc[ms["peptide_len"] > 12, "peptide_len"] = 13
'''
Generate Figure A
'''
# Compute PPV values
models = ["netmhc", "netmhcpan", "mhcflurry2", "mhcflurry4"]
alleles = ms.allele[ms.allele.isin(no_ms_predictor.supported_alleles)].unique()
# Compute predictions for mhcflurry4
def predict(self, peptides, alleles=None, binary=False, **kwargs):
    """
    Predict MHC binding affinities, grouping peptides by length and
    skipping unsupported lengths.

    Parameters
    ----------
    peptides : peptide or list of peptides
        NOTE: the input list is sorted in place by length.
    alleles : list of str, optional
        Alleles to predict for; defaults to all supported alleles.
        Unsupported alleles are silently dropped.
    binary : bool
        If True, report 1.0 for binders (affinity <= 500) and 0.0
        otherwise; if False, report the raw predicted affinity.

    Returns
    -------
    EpitopePredictionResult
        Multi-indexed DataFrame ((Seq, Method) index, alleles as columns).

    Raises
    ------
    ValueError
        If no predictions could be made for the given input.
    """
    # test whether one peptide or a list
    if not isinstance(peptides, list):
        peptides = [peptides]

    # if no alleles are specified do predictions for all supported alleles
    if alleles is None:
        alleles = self.supportedAlleles
    else:
        # filter for supported alleles.
        # BUG FIX: filter() returns a one-shot iterator in Python 3; it
        # would be exhausted after the first peptide-length group below.
        # Materialize to a list instead.
        alleles = [a for a in alleles if a in self.supportedAlleles]
    alleles = self.convert_alleles(alleles)

    # test mhcflurry models are available => download if not
    # BUG FIX: the original tested "p is not 0", which compares the Popen
    # object itself to 0 and is always true. Wait for the process and
    # check its real exit status instead.
    proc = subprocess.Popen(
        ['mhcflurry-downloads', 'path', 'models_class1'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    if proc.wait() != 0:
        subprocess.call(['mhcflurry-downloads', 'fetch', 'models_class1'])

    # load model
    predictor = Class1AffinityPredictor.load()

    # prepare results dictionary
    result = defaultdict(defaultdict)

    # group peptides by length
    peptides.sort(key=len)
    for length, peps in itertools.groupby(peptides, key=len):
        if length not in self.supportedLength:
            logging.warn(
                "Peptide length must be at least %i or at most %i for %s but is %i"
                % (min(self.supportedLength), max(self.supportedLength),
                   self.name, length))
            continue
        peps = list(peps)

        # predict and assign binding affinities
        for a in alleles:
            allele_repr = self.revert_allele_repr(a)
            for p in peps:
                binding_affinity = predictor.predict(
                    allele=a, peptides=[str(p)])[0]
                if binary:
                    # 500 nM is the conventional binder threshold.
                    if binding_affinity <= 500:
                        result[allele_repr][p] = 1.0
                    else:
                        result[allele_repr][p] = 0.0
                else:
                    result[allele_repr][p] = binding_affinity

    if not result:
        raise ValueError(
            "No predictions could be made with " + self.name +
            " for given input. Check your epitope length and HLA allele combination."
        )

    # create EpitopePredictionResult object. This is a multi-indexed DataFrame
    # with Peptide and Method as multi-index and alleles as columns
    df_result = EpitopePredictionResult.from_dict(result)
    df_result.index = pandas.MultiIndex.from_tuples(
        [tuple((i, self.name)) for i in df_result.index],
        names=['Seq', 'Method'])
    return df_result
def test_a1_known_epitopes_in_newly_trained_model():
    """
    Train an HLA-A*01:01 allele-specific predictor from curated data and
    verify a known epitope is predicted correctly — directly, after a
    save/load round trip, after rebuilding from a single component model,
    and after a second save/load round trip.
    """
    allele = "HLA-A*01:01"
    # Training data: 8-15mers measured for this allele.
    df = pandas.read_csv(
        get_path(
            "data_curated", "curated_training_data.affinity.csv.bz2"))
    df = df.loc[
        (df.allele == allele) &
        (df.peptide.str.len() >= 8) &
        (df.peptide.str.len() <= 15)
    ]

    hyperparameters = {
        "max_epochs": 100,
        "patience": 10,
        "early_stopping": True,
        "validation_split": 0.2,
        "random_negative_rate": 0.0,
        "random_negative_constant": 25,
        "peptide_amino_acid_encoding": "BLOSUM62",
        "use_embedding": False,
        "kmer_size": 15,
        "batch_normalization": False,
        "locally_connected_layers": [
            {
                "filters": 8,
                "activation": "tanh",
                "kernel_size": 3
            }
        ],
        "activation": "relu",
        "output_activation": "sigmoid",
        "layer_sizes": [
            32
        ],
        "random_negative_affinity_min": 20000.0,
        "random_negative_affinity_max": 50000.0,
        "dense_layer_l1_regularization": 0.001,
        "dropout_probability": 0.0
    }

    predictor = Class1AffinityPredictor()
    predictor.fit_allele_specific_predictors(
        n_models=2,
        architecture_hyperparameters_list=[hyperparameters],
        allele=allele,
        peptides=df.peptide.values,
        affinities=df.measurement_value.values,
        verbose=0,
    )

    # Check the freshly trained predictor.
    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor)

    # Round trip: save and reload, then re-check.
    models_dir = tempfile.mkdtemp("_models")
    print(models_dir)
    predictor.save(models_dir)
    predictor2 = Class1AffinityPredictor.load(models_dir)
    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor2)
    shutil.rmtree(models_dir)

    # Rebuild a predictor from just the first component model and re-check.
    predictor3 = Class1AffinityPredictor(
        allele_to_allele_specific_models={
            allele: [predictor.allele_to_allele_specific_models[allele][0]]
        })
    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor3)

    # Round trip the single-model predictor as well.
    models_dir = tempfile.mkdtemp("_models")
    print(models_dir)
    predictor3.save(models_dir)
    predictor4 = Class1AffinityPredictor.load(models_dir)
    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor4)
    shutil.rmtree(models_dir)
seqs = [ l.split()[1] for l in mztab_read if l.startswith("PSM") if l.split()[1] not in seqs_new_smaller_qval ] seqs_new_greater_qval = list(set(seqs)) seqs_new_greater_qval = [ s for s in seqs_new_greater_qval if 7 < len(s) < 13 and not 'U' in s and not 'X' in s and not 'Z' in s and not 'J' in s and not 'B' in s ] #call mhcflurry #subprocess.call("mhcflurry-predict --peptides {p} --alleles {a} --out {o}".format(p=" ".join(seqs_new), a=" ".join(alleles), o=sys.argv[-1])) seqs_filtered = [] for allele in alleles: print(allele) predictor = Class1AffinityPredictor.load() df_pred = predictor.predict_to_dataframe(allele=allele, peptides=seqs_new_greater_qval) seqs_filtered += df_pred[df_pred['prediction'] <= float( sys.argv[-5])]['peptide'].values.tolist() #merge sequence lists and append decoys seqs_new_all = list(set(seqs_new_smaller_qval + seqs_filtered)) seqs_new_all = seqs_new_all + [s[::-1] for s in seqs_new_all] #write idXML for filtering op = open(sys.argv[-1], 'w') op.write( '<PeptideIdentification score_type="q-value" higher_score_better="false">' + '\n') for pep in seqs_new_all:
def run_and_check_with_model_selection(n_jobs=1):
    """
    End-to-end allele-specific training followed by model selection.

    Trains two architectures per allele — one deliberately crippled with
    max_epochs=0 — then runs model selection and verifies the crippled
    architecture is never selected (all survivors have max_epochs == 500).
    """
    models_dir1 = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    hyperparameters_filename = os.path.join(
        models_dir1, "hyperparameters.yaml")

    # Include one architecture that has max_epochs = 0. We check that it never
    # gets selected in model selection.
    hyperparameters = [
        deepcopy(HYPERPARAMETERS[0]),
        deepcopy(HYPERPARAMETERS[0]),
    ]
    hyperparameters[-1]["max_epochs"] = 0
    with open(hyperparameters_filename, "w") as fd:
        json.dump(hyperparameters, fd)

    args = [
        "mhcflurry-class1-train-allele-specific-models",
        "--data",
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"),
        "--hyperparameters", hyperparameters_filename,
        "--allele", "HLA-A*02:01", "HLA-A*03:01",
        "--out-models-dir", models_dir1,
        "--num-jobs", str(n_jobs),
        "--held-out-fraction-reciprocal", "10",
        "--n-models", "1",
    ]
    print("Running with args: %s" % args)
    subprocess.check_call(args)
    # 2 alleles x 2 architectures = 4 trained networks.
    result = Class1AffinityPredictor.load(models_dir1)
    assert_equal(len(result.neural_networks), 4)

    models_dir2 = tempfile.mkdtemp(prefix="mhcflurry-test-models")
    args = [
        "mhcflurry-class1-select-allele-specific-models",
        "--data",
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"),
        "--exclude-data", models_dir1 + "/train_data.csv.bz2",
        "--out-models-dir", models_dir2,
        "--models-dir", models_dir1,
        "--num-jobs", str(n_jobs),
        "--mse-max-models", "1",
        "--unselected-accuracy-scorer", "combined:mass-spec,mse",
        "--unselected-accuracy-percentile-threshold", "95",
    ]
    print("Running with args: %s" % args)
    subprocess.check_call(args)
    result = Class1AffinityPredictor.load(models_dir2)
    assert_equal(len(result.neural_networks), 2)
    assert_equal(
        len(result.allele_to_allele_specific_models["HLA-A*02:01"]), 1)
    assert_equal(
        len(result.allele_to_allele_specific_models["HLA-A*03:01"]), 1)
    # The selected model for each allele must be the real (500-epoch)
    # architecture, not the max_epochs=0 one.
    assert_equal(
        result.allele_to_allele_specific_models["HLA-A*02:01"]
        [0].hyperparameters["max_epochs"], 500)
    assert_equal(
        result.allele_to_allele_specific_models["HLA-A*03:01"]
        [0].hyperparameters["max_epochs"], 500)

    print("Deleting: %s" % models_dir1)
    print("Deleting: %s" % models_dir2)
    shutil.rmtree(models_dir1)
    # BUG FIX: models_dir2 was announced as deleted above but never
    # removed, leaking a temp directory on every run.
    shutil.rmtree(models_dir2)