def test_load_repertoire_with_stop_codon(self):
    """Import an IGoR repertoire dataset with `import_with_stop_codon` enabled and check its content."""
    path = EnvironmentSettings.root_path + "test/tmp/io_igor_load/"
    PathBuilder.build(path)
    self.write_dummy_files(path, True)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "igor")
    overrides = (
        ("is_repertoire", True),
        ("result_path", path),
        ("path", path),
        ("import_with_stop_codon", True),
        ("metadata_file", path + "metadata.csv"),
    )
    for key, value in overrides:
        params[key] = value

    dataset_stop_codons = IGoRImport.import_dataset(params, "igor_dataset_stop")

    # two repertoires with two sequences each are expected
    self.assertEqual(2, dataset_stop_codons.get_example_count())
    for repertoire_index in (0, 1):
        self.assertEqual(len(dataset_stop_codons.repertoires[repertoire_index].sequences), 2)

    # the expected amino acid sequence contains a stop codon marker ("*")
    first_sequence = dataset_stop_codons.repertoires[0].sequences[0]
    self.assertEqual(first_sequence.amino_acid_sequence, "ARVNRHIVVVTAIMTG*NWFDP")

    shutil.rmtree(path)
def standard_scale(scaler_file: str, design_matrix, with_mean: bool = True):
    """
    Scale the design matrix to zero mean and unit variance on feature level.

    If `scaler_file` already exists, the pickled scaler stored there is loaded and only applied
    (transform); otherwise a new StandardScaler is fitted on `design_matrix`, applied, and pickled
    to `scaler_file` (the parent directory is created through PathBuilder).

    :param scaler_file: path to scaler file fitted on train set or where the resulting scaler file will be stored
    :param design_matrix: rows -> examples, columns -> features
    :param with_mean: whether to scale to zero mean or not (could lose sparsity if scaled)
    :return: scaled design matrix
    """
    if with_mean and hasattr(design_matrix, "todense"):
        # Centering requires a dense matrix. Prefer toarray() (plain ndarray) when the object
        # offers it: todense() on scipy sparse matrices returns np.matrix, which recent
        # scikit-learn releases refuse as estimator input.
        densify = getattr(design_matrix, "toarray", None) or design_matrix.todense
        scaled_design_matrix = densify()
    else:
        scaled_design_matrix = design_matrix

    if os.path.isfile(scaler_file):
        # NOTE: pickle.load can execute arbitrary code; scaler_file must come from a trusted location.
        with open(scaler_file, 'rb') as file:
            scaler = pickle.load(file)
        scaled_design_matrix = scaler.transform(scaled_design_matrix)
    else:
        scaler = StandardScaler(with_mean=with_mean)
        scaled_design_matrix = scaler.fit_transform(scaled_design_matrix)

        # persist the fitted scaler so that later calls reuse it instead of refitting
        PathBuilder.build(os.path.dirname(scaler_file))
        with open(scaler_file, 'wb') as file:
            pickle.dump(scaler, file)

    return scaled_design_matrix
def __init__(self, specification_path: str, result_path: str):
    """
    Store the specification path and prepare the result directory.

    :param specification_path: path to the specification, stored unchanged
    :param result_path: result directory; it is converted to a path relative to the current
        working directory, suffixed with "/" and created through PathBuilder
    """
    self._specification_path = specification_path

    relative_result_path = os.path.relpath(result_path)
    self._result_path = f"{relative_result_path}/"
    PathBuilder.build(self._result_path)

    # the cache directory path is only derived here, it is not created
    self._cache_path = self._result_path + "cache/"
def build_path(self, path: str = None):
    """
    Resolve the output directory and return it.

    :param path: directory to use; when None, "quickstart/" under EnvironmentSettings.root_path is used
    :return: the resolved path
    """
    if path is None:
        path = EnvironmentSettings.root_path + "quickstart/"
        # NOTE(review): block nesting could not be recovered from the source layout; this reading
        # wipes and rebuilds only the default directory and never touches a caller-supplied
        # path - confirm against the original file.
        if os.path.isdir(path):
            shutil.rmtree(path)
        PathBuilder.build(path)
    return path
def _generate(self) -> ReportResult:
    """
    Reshape the dataset to long format, write the results table and attempt to plot it.

    :return: ReportResult holding the report name, the list of figures (None when the plotting
        step returned None) and a one-element list with the results table
    """
    PathBuilder.build(self.result_path)

    long_format_data = DataReshaper.reshape(self.dataset)
    results_table = self._write_results_table(long_format_data)
    figure = self._safe_plot(data_long_format=long_format_data)

    if figure is None:
        figures = None
    else:
        figures = [figure]

    return ReportResult(self.name, figures, [results_table])
def test_make_subset(self):
    """Pickle 100 sequences into 10 batch files and check that make_subset keeps only the chosen indices."""
    sequences = [ReceptorSequence(amino_acid_sequence="AAA", identifier=str(index)) for index in range(100)]

    path = EnvironmentSettings.tmp_test_path + "element_generator_subset/"
    PathBuilder.build(path)

    # ten files with ten consecutive sequences each
    batch_files = [f"{path}batch{batch_index}.pkl" for batch_index in range(10)]
    for batch_index, batch_file in enumerate(batch_files):
        first = batch_index * 10
        with open(batch_file, "wb") as file:
            pickle.dump(sequences[first:first + 10], file)

    d = SequenceDataset(filenames=batch_files, file_size=10)

    indices = [1, 20, 21, 22, 23, 24, 25, 50, 52, 60, 70, 77, 78, 90, 92]
    d2 = d.make_subset(indices, path, SequenceDataset.TRAIN)

    # identifiers were assigned as str(index), so they map back to the selected indices
    for batch in d2.get_batch(1000):
        for sequence in batch:
            self.assertTrue(int(sequence.identifier) in indices)

    self.assertEqual(15, d2.get_example_count())

    shutil.rmtree(path)
def import_repertoire_dataset(import_class, params: DatasetImportParams, dataset_name: str) -> RepertoireDataset:
    """
    Function to create a dataset from the metadata and a list of repertoire files and exports dataset pickle file

    Arguments:
        import_class: class to use for import
        params: instance of DatasetImportParams class which includes information on path, columns, result path etc.
        dataset_name: user-defined name of the dataset

    Returns:
        RepertoireDataset object that was created
    """
    # `sep` must be given by keyword: since pandas 2.0 every read_csv argument after the
    # file path is keyword-only, so the former positional "," raises a TypeError there.
    metadata = pd.read_csv(params.metadata_file, sep=",")

    ParameterValidator.assert_keys_present(metadata.columns.tolist(), ["filename"], ImportHelper.__name__,
                                           f'{dataset_name}: params: metadata_file')

    PathBuilder.build(params.result_path + "repertoires/")

    # one task per metadata row; each repertoire is loaded in a worker process
    arguments = [(import_class, row, params) for index, row in metadata.iterrows()]
    with Pool(params.number_of_processes) as pool:
        repertoires = pool.starmap(ImportHelper.load_repertoire_as_object, arguments)

    new_metadata_file = ImportHelper.make_new_metadata_file(repertoires, metadata, params.result_path, dataset_name)

    # every metadata column except "filename" is treated as a potential label,
    # and its distinct values are recorded in the dataset params
    potential_labels = list(set(metadata.columns.tolist()) - {"filename"})
    dataset = RepertoireDataset(params={key: list(set(metadata[key].values.tolist())) for key in potential_labels},
                                repertoires=repertoires, metadata_file=new_metadata_file, name=dataset_name)

    PickleExporter.export(dataset, params.result_path)

    return dataset
def create_dataset(self, path, dataset_size: int = 50):
    """
    Build a SequenceDataset backed by one pickle file with alternating sequences.

    Even positions get "AAACCC" with custom param l1=1, odd positions get "ACACAC" with l1=2.

    :param path: directory where sequences.pkl is written (created through PathBuilder)
    :param dataset_size: number of sequences to generate
    :return: SequenceDataset with identifier "d1" pointing to the pickled sequences
    """
    sequences = [
        ReceptorSequence(amino_acid_sequence="AAACCC" if index % 2 == 0 else "ACACAC",
                         identifier=str(index),
                         metadata=SequenceMetadata(custom_params={"l1": 1 if index % 2 == 0 else 2}))
        for index in range(dataset_size)
    ]

    PathBuilder.build(path)
    filename = f"{path}sequences.pkl"
    with open(filename, "wb") as file:
        pickle.dump(sequences, file)

    # label configuration is built but not used further in this method
    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])

    return SequenceDataset(params={"l1": [1, 2]}, filenames=[filename], identifier="d1")
def test_encode(self):
    """Encode four repertoires with SequenceAbundanceEncoder at two p-value thresholds and compare the examples."""
    path = EnvironmentSettings.tmp_test_path + "abundance_encoder/"
    PathBuilder.build(path)

    repertoire_sequences = [["GGG", "III", "LLL", "MMM"],
                            ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
                            ["CCC", "FFF", "MMM"],
                            ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]]
    repertoires, metadata = RepertoireBuilder.build(repertoire_sequences,
                                                    labels={"l1": [True, True, False, False]},
                                                    path=path)

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

    encoder = SequenceAbundanceEncoder.build_object(dataset,
                                                    comparison_attributes=["sequence_aas"],
                                                    p_value_threshold=0.4,
                                                    sequence_batch_size=4,
                                                    repertoire_batch_size=8)

    label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

    # first pass with the threshold the encoder was built with (0.4)
    encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))
    expected_examples = np.array([[1, 4], [1, 6], [0, 3], [0, 6]])
    self.assertTrue(np.array_equal(expected_examples, encoded_dataset.encoded_data.examples))

    # second pass after lowering the threshold on the same encoder
    encoder.p_value_threshold = 0.05
    encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))
    expected_examples = np.array([[0, 4], [0, 6], [0, 3], [0, 6]])
    self.assertTrue(np.array_equal(expected_examples, encoded_dataset.encoded_data.examples))

    shutil.rmtree(path)
def test_import_sequences(self):
    """Write an IRIS-style file with 15 data rows and check that import_items returns 15 ReceptorSequence objects."""
    path = EnvironmentSettings.tmp_test_path + "importseqsiris/sequences.csv"
    PathBuilder.build(os.path.dirname(path))

    # NOTE(review): the fixture text below is reproduced as it appears in this view, with single
    # spaces between fields; confirm the field separators against the original file.
    with open(path, "w") as file:
        file.write(
            "Cell type Clonotype ID Chain: TRA (1) TRA - V gene (1) TRA - D gene (1) TRA - J gene (1) Chain: TRA (2) TRA - V gene (2) \
TRA - D gene (2) TRA - J gene (2) Chain: TRB (1) TRB - V gene (1) TRB - D gene (1) TRB - J gene (1) Chain: TRB (2) TRB - V \
gene (2) TRB - D gene (2) TRB - J gene (2)\n\
TCR_AB 181 LVGG TRAV4*01 null TRAJ4*01 null null null null null null null null null null null null\n\
TCR_AB 591 AL TRAV9-2*01 null TRAJ21*01 null null null null null null null null null null null null\n\
TCR_AB 1051 VVNII TRAV12-1*01 null TRAJ3*01 null null null null null null null null null null null null\n\
TCR_AB 1341 LNKLT TRAV2*01 null TRAJ10*01 null null null null null null null null null null null null\n\
TCR_AB 1411 AVLY TRAV8-3*01 null TRAJ18*01 null null null null null null null null null null null null\n\
TCR_AB 1421 AT TRAV12-3*01 null TRAJ17*01 null null null null null null null null null null null null\n\
TCR_AB 1671 AVLI TRAV12*01 null TRAJ33*01 null null null null null null null null null null null null\n\
TCR_AB 1901 LVGKLI TRAV4*01 null TRAJ4*01 null null null null null null null null null null null null\n\
TCR_AB 2021 YSSASKII TRAV2-1*01 null TRAJ3*01 null null null null null null null null null null null null\n\
TCR_AB 2251 ARLY TRAV4/DV5*01 null TRAJ18*01 null null null null null null null null null null null null\n\
TCR_AB 2791 IEFN TRAV26-1*01 null TRAJ20*01 null null null null null null null null null null null null\n\
TCR_AB 3031 TLGRLY TRAV8-3*01 null TRAJ18*01 null null null null null null null null null null null null\n\
TCR_AB 3241 AVGLY TRAV8-3*01 null TRAJ18*01 null null null null null null null null null null null null\n\
TCR_AB 3511 KII TRAV12-1*01 null TRAJ3*01 null null null null null null null null null null null null\n\
TCR_AB 3821 LVGD TRAV8*01 null TRAJ4*01 null null null null null null null null null null null null\n"
        )

    sequences = IRISSequenceImport.import_items(path)

    # one imported item per data row, all of them single-chain sequences
    self.assertEqual(15, len(sequences))
    self.assertTrue(
        all(
            isinstance(sequence, ReceptorSequence)
            for sequence in sequences))
    self.assertEqual("LVGG", sequences[0].get_sequence())

    # remove the temporary directory that holds the written file
    shutil.rmtree(os.path.dirname(path))
def test_import_paired_sequences(self):
    """Write an IRIS-style file with 5 alpha/beta rows and check that import_items(paired=True) returns TCABReceptor objects."""
    path = EnvironmentSettings.tmp_test_path + "importseqsiris/sequences.csv"
    PathBuilder.build(os.path.dirname(path))

    # NOTE(review): the fixture text below is reproduced as it appears in this view, with single
    # spaces between fields; confirm the field separators against the original file.
    with open(path, "w") as file:
        file.write(
            "Cell type Clonotype ID Chain: TRA (1) TRA - V gene (1) TRA - D gene (1) TRA - J gene (1) Chain: TRA (2) TRA - V gene (2) \
TRA - D gene (2) TRA - J gene (2) Chain: TRB (1) TRB - V gene (1) TRB - D gene (1) TRB - J gene (1) Chain: TRB (2) TRB - V \
gene (2) TRB - D gene (2) TRB - J gene (2)\n\
TCR_AB 540891 ATDIWSNFGNEKLT TRAV17*01 TRAJ48*01 null null null null SARVRNYQETQY TRBV20-1*01 TRBD1*01 TRBJ2-5*01 null null null null\n\
TCR_AB 540892 AASAGDDKII TRAV29/DV5*01 TRAJ30*01 null null null null ASRPTGTVDYEQY TRBV5-1*01 TRBD1*01 TRBJ2-7*01 null null null null\n\
TCR_AB 540893 AAYTSGTYKYI TRAV8-1*01 TRAJ40*01 null null null null ASSLTGMNTEAF TRBV11-1*01 TRBD2*01 TRBJ1-1*01 null null null null\n\
TCR_AB 54084 ALLSRSGGYQKVT TRAV12-2*01 TRAJ13*02 null null null null SARDNQETQY TRBV20-1*01 TRBD1*01 TRBJ2-5*01 null null null null\n\
TCR_AB 540895 AYRSRIQGAQKLV TRAV38-2/DV8*01 TRAJ54*01 null null null null ASSHGTSGSGEQY TRBV7-9*01 TRBD2*02 TRBJ2-7*01 null null null null\n"
        )

    paired_sequences = IRISSequenceImport.import_items(path, paired=True)

    # one receptor per data row, each exposing an alpha and a beta chain
    self.assertEqual(5, len(paired_sequences))
    self.assertTrue(
        all(
            isinstance(sequence, TCABReceptor)
            for sequence in paired_sequences))
    self.assertEqual("ATDIWSNFGNEKLT",
                     paired_sequences[0].alpha.get_sequence())
    self.assertEqual("SARVRNYQETQY",
                     paired_sequences[0].beta.get_sequence())

    # remove the temporary directory that holds the written file
    shutil.rmtree(os.path.dirname(path))
def test_sequence_flattened(self):
    """One-hot encode a sequence dataset with flatten=True and check the flat examples and feature names."""
    path = EnvironmentSettings.root_path + "test/tmp/onehot_seq_flat/"
    PathBuilder.build(path)

    dataset = self.construct_test_flatten_dataset(path)

    encoder = OneHotEncoder.build_object(dataset, use_positional_info=False, distance_to_seq_middle=None, flatten=True)

    label_config = LabelConfiguration([Label(name="l1", values=[1, 0], positive_class="1")])
    encoder_params = EncoderParams(result_path=path,
                                   label_config=label_config,
                                   pool_size=1,
                                   learn_model=True,
                                   model={},
                                   filename="dataset.pkl")
    encoded_data = encoder.encode(dataset, encoder_params)

    self.assertTrue(isinstance(encoded_data, SequenceDataset))

    # 20-element one-hot vectors: first slot set for "a", slot 16 set for "t"
    onehot_a = [1.0] + [0.0] * 19
    onehot_t = [0.0] * 16 + [1.0] + [0] * 3

    examples = encoded_data.encoded_data.examples
    self.assertListEqual(list(examples[0]), onehot_a + onehot_a + onehot_a + onehot_t + onehot_t + onehot_t)
    self.assertListEqual(list(examples[1]), onehot_a + onehot_t + onehot_a + onehot_t + onehot_a + onehot_t)

    # feature names are "<position>_<character>" for six positions over the sequence alphabet
    expected_feature_names = [f"{pos}_{char}" for pos in range(6) for char in EnvironmentSettings.get_sequence_alphabet()]
    self.assertListEqual(list(encoded_data.encoded_data.feature_names), expected_feature_names)

    shutil.rmtree(path)
def test(self):
    """One-hot encode a sequence dataset with flatten=False and check examples, example ids and labels."""
    path = EnvironmentSettings.tmp_test_path + "onehot_sequence/"
    PathBuilder.build(path)

    dataset, lc = self._construct_test_dataset(path)

    encoder = OneHotEncoder.build_object(dataset, use_positional_info=False, distance_to_seq_middle=None, flatten=False)

    encoder_params = EncoderParams(result_path=f"{path}encoded/",
                                   label_config=lc,
                                   learn_model=True,
                                   model={},
                                   filename="dataset.pkl")
    encoded_data = encoder.encode(dataset, encoder_params)

    self.assertTrue(isinstance(encoded_data, SequenceDataset))

    # 20-element one-hot rows; the all-zero row is expected at the last position of examples 1 and 2
    onehot_a = [1] + [0] * 19
    onehot_t = [0] * 16 + [1] + [0] * 3
    onehot_empty = [0] * 20

    examples = encoded_data.encoded_data.examples
    self.assertListEqual([list(item) for item in examples[0]], [onehot_a] * 4)
    self.assertListEqual([list(item) for item in examples[1]], [onehot_a, onehot_t, onehot_a, onehot_empty])
    self.assertListEqual([list(item) for item in examples[2]], [onehot_a, onehot_t, onehot_t, onehot_empty])

    self.assertListEqual(encoded_data.encoded_data.example_ids,
                         [receptor.identifier for receptor in dataset.get_data()])

    expected_labels = {label: [receptor_seq.get_attribute(label) for receptor_seq in dataset.get_data()]
                       for label in ("l1", "l2")}
    self.assertDictEqual(encoded_data.encoded_data.labels, expected_labels)

    shutil.rmtree(path)
def test_load_sequence_dataset(self):
    """Import the dummy IGoR files as an unpaired sequence dataset and check the four nucleotide sequences."""
    path = EnvironmentSettings.root_path + "test/tmp/io_igor_load/"
    PathBuilder.build(path)
    self.write_dummy_files(path, False)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "igor")
    overrides = (
        ("is_repertoire", False),
        ("paired", False),
        ("result_path", path),
        ("path", path),
        ("import_with_stop_codon", True),
    )
    for key, value in overrides:
        params[key] = value

    dataset = IGoRImport.import_dataset(params, "igor_seq_dataset")
    seqs = list(dataset.get_data())

    self.assertEqual(4, dataset.get_example_count())

    expected_nucleotide_sequences = [
        "GCGAGACGTGTCTAGGGAGGATATTGTAGTAGTACCAGCTGCTATGACGGGCGGTCCGGTAGTACTACTTTGACTAC",
        "GCGAGAGGCTTCCATGGAACTACAGTAACTACGTTTGTAGGCTGTAGTACTACATGGACGTC",
        "GCGAGAGTTAATCGGCATATTGTGGTGGTGACTGCTATTATGACCGGGTAAAACTGGTTCGACCCC",
        "GCGAGAGATAGGTGGTCAACCCCAGTATTACGATATTTTGACTGGTGGACCCCGCCCTACTACTACTACATGGACGTC",
    ]
    # compare position by position so that the order of the imported sequences is checked too
    for position, expected in enumerate(expected_nucleotide_sequences):
        self.assertEqual(expected, seqs[position].nucleotide_sequence)

    shutil.rmtree(path)
def test_import_repertoire_dataset(self):
    """Import the dummy 10x Genomics files as a repertoire dataset and check sizes, chains and counts."""
    path = EnvironmentSettings.root_path + "test/tmp/io_10xGenomics/"
    PathBuilder.build(path)
    self.create_dumy_dataset(path, add_metadata=True)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "tenx_genomics")
    overrides = (
        ("is_repertoire", True),
        ("result_path", path),
        ("path", path),
        ("metadata_file", path + "metadata.csv"),
    )
    for key, value in overrides:
        params[key] = value

    dataset = TenxGenomicsImport.import_dataset(params, "tenx_dataset_repertoire")

    self.assertEqual(2, dataset.get_example_count())

    # expected number of sequences per repertoire, in repertoire order
    for repertoire_index, expected_size in enumerate((2, 4)):
        self.assertEqual(len(dataset.repertoires[repertoire_index].sequences), expected_size)

    self.assertEqual(dataset.repertoires[0].sequences[0].amino_acid_sequence, "ALSGTGGYKVV")
    self.assertListEqual([Chain.ALPHA, Chain.BETA], list(dataset.repertoires[0].get_chains()))
    self.assertListEqual([2, 4], list(dataset.repertoires[0].get_counts()))

    shutil.rmtree(path)
def test_repertoire_export(self):
    """Export a one-repertoire dataset with AIRRExporter and check the columns of the written TSV file."""
    path = EnvironmentSettings.tmp_test_path + "airr_exporter_repertoire/"
    PathBuilder.build(path)

    repertoire, metadata_path = self.create_dummy_repertoire(path)
    dataset = RepertoireDataset(repertoires=[repertoire], metadata_file=metadata_path)

    path_exported = f"{path}exported/"
    AIRRExporter.export(dataset, path_exported)

    resulting_data = pd.read_csv(path_exported + f"repertoires/{repertoire.identifier}.tsv", sep="\t")

    # expected content per exported column, checked in this order
    expected_columns = {
        "sequence_id": ["receptor_1", "receptor_2"],
        "cdr3": ["GCTGCTGCT", "GGTGGTGGT"],
        "cdr3_aa": ["AAA", "GGG"],
        "v_call": ["TRBV1", "TRAV2*01"],
        "j_call": ["TRBJ1", "TRAJ2"],
        "d_call": ["TRBD1", "TRAD2"],
        "locus": ["TRB", "TRA"],
        "duplicate_count": [5, 15],
        "custom_test": ["cust1", "cust2"],
        "productive": ['T', nan],
        "stop_codon": ['F', nan],
    }
    for column, expected_values in expected_columns.items():
        self.assertListEqual(list(resulting_data[column]), expected_values)

    shutil.rmtree(path)
def test_import_receptor_dataset(self):
    """Import the dummy 10x Genomics files as a paired receptor dataset and check the alpha/beta sequences."""
    path = EnvironmentSettings.root_path + "test/tmp/io_10xGenomics/"
    PathBuilder.build(path)
    self.create_dumy_dataset(path, add_metadata=False)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "tenx_genomics")
    overrides = (
        ("is_repertoire", False),
        ("paired", True),
        ("result_path", path),
        ("path", path),
        ("sequence_file_size", 1),
        ("receptor_chains", "TRA_TRB"),
    )
    for key, value in overrides:
        params[key] = value

    dataset = TenxGenomicsImport.import_dataset(params, "tenx_dataset_receptor")

    # with sequence_file_size set to 1, two receptors are expected in two files
    self.assertEqual(2, dataset.get_example_count())
    self.assertEqual(2, len(dataset.get_filenames()))

    expected_alpha = ["ALSGTGGYKVV", "AIVGNTGKLI"]
    expected_beta = ["ASSLYGGPEVF", "ASSFATNSDYT"]
    for receptor in dataset.get_data(1):
        self.assertTrue(receptor.alpha.amino_acid_sequence in expected_alpha)
        self.assertTrue(receptor.beta.amino_acid_sequence in expected_beta)

    shutil.rmtree(path)
def test_load_repertoire_dataset_minimal(self):
    """Import IRIS repertoires with dual chains and all gene combinations switched off."""
    # loading with minimal data (no dual genes, no duplicate V/J segments)
    number_of_repertoires = 5

    path = EnvironmentSettings.tmp_test_path + "importseqsiris_mini/"
    PathBuilder.build(path)
    self._create_dummy_data(path, number_of_repertoires=number_of_repertoires, add_metadata=True)

    # case: minimal dataset (all dual chains and all genes = False)
    import_params = {
        "is_repertoire": True,
        "result_path": path,
        "metadata_file": path + "metadata.csv",
        "path": path,
        "import_dual_chains": False,
        "import_all_gene_combinations": False,
        "separator": "\t",
        "extra_columns_to_load": ["extra_col"],
        "receptor_chains": "TRA_TRB",
    }
    dataset = IRISImport.import_dataset(import_params, "iris_dataset")

    self.assertEqual(number_of_repertoires, dataset.get_example_count())
    self.assertEqual(number_of_repertoires, len(dataset.get_data()))

    expected_chains = [Chain.ALPHA] * 6 + [Chain.BETA]
    for repertoire in dataset.get_data(2):
        self.assertTrue(repertoire.metadata["label1"] in {0, 1})
        self.assertEqual(7, len(repertoire.sequences))  # 6 alpha + 1 beta
        self.assertEqual(1, len(repertoire.receptors))  # 1 alpha/beta pair (dual chain (1))
        self.assertListEqual(expected_chains, list(repertoire.get_chains()))
        self.assertEqual(None, repertoire.get_counts())

    shutil.rmtree(path)
def import_sequence_dataset(import_class, params, dataset_name: str):
    """
    Import all sequence files of a dataset, store the items in batch files and export the dataset.

    Items from consecutive input files are pooled; whenever more than params.sequence_file_size
    items are pending (or the last input file has been read and items remain), a new
    "batch_<n>.pickle" file name is registered and the pending items are handed to
    ImportHelper.store_sequence_items.

    :param import_class: class passed on to ImportHelper.import_items
    :param params: import parameters (path, result_path, sequence_file_size, paired are read here)
    :param dataset_name: name given to the resulting dataset
    :return: ReceptorDataset if params.paired is truthy, otherwise SequenceDataset
    """
    PathBuilder.build(params.result_path)

    filenames = ImportHelper.get_sequence_filenames(params.path, dataset_name)

    batch_index = 0
    batch_filenames = []
    dataset_params = {}
    pending_items = None

    for position, filename in enumerate(filenames):
        imported_items = ImportHelper.import_items(import_class, filename, params)
        if pending_items is None:
            pending_items = imported_items
        else:
            pending_items = np.append(pending_items, imported_items)

        dataset_params = ImportHelper.extract_sequence_dataset_params(pending_items, params)

        # after the last input file, the remaining items are flushed even if the batch is not full
        is_last_file = position == len(filenames) - 1
        while len(pending_items) > params.sequence_file_size or (is_last_file and len(pending_items) > 0):
            batch_filenames.append(params.result_path + f"batch_{batch_index}.pickle")
            ImportHelper.store_sequence_items(batch_filenames, pending_items, params.sequence_file_size)
            pending_items = pending_items[params.sequence_file_size:]
            batch_index += 1

    init_kwargs = {"filenames": batch_filenames, "file_size": params.sequence_file_size,
                   "name": dataset_name, "params": dataset_params}

    if params.paired:
        dataset = ReceptorDataset(**init_kwargs)
    else:
        dataset = SequenceDataset(**init_kwargs)

    PickleExporter.export(dataset, params.result_path)

    return dataset
def _write_paired_matches(self, paired_matches_path) -> List[ReportOutput]:
    """
    Write one CSV file with paired matches per encoded example.

    File names have the form "example_<id>_<label>_<value>[_<label>_<value>...].csv"; the writer
    is chosen from the name of the encoding stored in the encoded data.

    :param paired_matches_path: directory for the CSV files (created through PathBuilder)
    :return: list with one ReportOutput per example
    """
    PathBuilder.build(paired_matches_path)

    encoded_data = self.dataset.encoded_data
    report_outputs = []

    for index, example_id in enumerate(encoded_data.example_ids):  # todo don't mention subject in the name twice
        label_part = "_".join(f"{label}_{values[index]}" for label, values in encoded_data.labels.items())
        filename = os.path.join(paired_matches_path, f"example_{example_id}_" + label_part + ".csv")

        encoding = encoded_data.encoding
        if encoding == "MatchedReceptorsEncoder":
            self._write_paired_receptor_matches_for_repertoire(encoded_data.examples[index], filename)
        elif encoding == "MatchedRegexEncoder":
            self._write_paired_regex_matches_for_repertoire(encoded_data.examples[index], filename)

        # an output entry is registered for every example, whichever writer ran
        report_outputs.append(ReportOutput(filename, f"example {example_id} paired matches"))

    return report_outputs
def make_dummy_dataset(self, path, add_metadata):
    """
    Write the fixture file rep1.tsv into `path` and, if requested, a matching metadata.csv.

    :param path: target directory (created through PathBuilder); used as a string prefix for the file names
    :param add_metadata: when truthy, metadata.csv referencing rep1.tsv is written as well
    """
    # NOTE(review): the literals below are reproduced exactly as they appear in this view;
    # row and column separators could not be verified - confirm against the original file.
    rep1text = """Clone ID Senior Author TRAJ Gene TRAV Gene CDR3A AA Sequence TRBV Gene TRBD Gene TRBJ Gene CDR3B AA Sequence Antigen Protein Antigen Gene Antigen Species Antigen Peptide AA # Epitope Peptide MHC Class HLA Restriction Counts 1E6 Sewell TRAJ12 TRAV12-3 CAMRGDSSYKLIF TRBV12-4 TRBD2 TRBJ2-4 CASSLWEKLAKNIQYF PPI INS Human 12-24 ALWGPDPAAA MHC I A*02:01 1 4.13 Nepom TRAJ44 TRAV19 CALSENRGGTASKLTF TRBV5-1 TRBD1 TRBJ1-1 CASSLVGGPSSEAFF GAD Human 555-567 MHC II DRB1*04:01 3 5 Roep TRAJ6 TRAV21 CAVKRTGGSYIPTF TRBV11-2 TRBD1 TRBJ2-2 CASSSFWGSDTGELFF Insulin B Human 9-23 MHC II DQ8 7 D222D 2 Mallone TRAJ36*01 TRAV17*01 CAVTGANNLFF TRBV19*01 TRBD1*01 TRBJ2-2*01 CASSIEGPTGELFF Zinc Transporter 8 ZnT8 Human 185-194 AVAANIVLTV MHC I A*02:01 2 GSE.20D11 Nakayama TRAJ4 TRAV12-3 CAILSGGYNKLIF TRBV2 TRBD2 TRBJ2-5 CASSAETQYF Insulin B Human 9-23 MHC II DQ8 10 GSE.6H9 Nakayama TRAJ40 TRAV26-1 CIVRVDSGTYKYIF TRBV7-2 TRBD2 TRBJ2-1 CASSLTAGLASTYNEQFF Insulin B Human 9-23 MHC II DQ8/DQ8 nan iGRP 32 DiLorenzo TRAJ48 TRAV12-1 CVVNILSNFGNEKLTF TRBV20/OR9-2 TRBD1 TRBJ2-1 CSASRQGWVNEQFF IGRP Human 265-273 MHC I A*02:01 1 MART-1 TBD TRAJ23 TRAV12-2 CAVNFGGGKLIF TRBV6-4 TRBD2 TRBJ1-1 CASSLSFGTEAFF Melan A Human 27-35 ELAGIGILTV MHC I A2 3 MHB10.3 TBD TRAJ27 TRAV4 CLVGDSLNTNAGKSTF TRBV29-1 TRBD2 TRBJ2-2 CSVEDRNTGELFF Insulin B Human 11-30 MHC II DRB1*03:01 NA PM1#11 TBD TRAJ54 TRAV35 CAGHSIIQGAQKLVF TRBV5-1 TRBD2 TRBJ2-1 CASGRSSYNEQFF GAD Human 339-352 MHC II DRB1*03:01 2 R164 Nepom TRAJ56 TRAV19 CALSEEGGGANSKLTF TRBV5-1 TRBD2 TRBJ1-6 CASSLAGGANSPLHF GAD Human 555-567 MHC II DRB1*04:01 1 SD32.5 Boehm TRAJ23 TRAV26-1 CIVRVSSAYYNQGGKLIF TRBV27 TRBD2 TRBJ2-3 CASSPRANTDTQYF Insulin A Human 5-21 MHC II DRB1*04:01 1 SD52.c1 Boehm TRAJ27 TRAV4 CLVGDSLNTNAGKSTF TRBV27 TRBD1 TRBJ1-5 CASSWSSIGNQPQHF PPI INS Human C18-A1 MHC II DRB1*04:01 1 T1D#10 C8 TBD TRAJ26 TRAV12-3 CATAYGQNFVF TRBV4-1 TRBD2 TRBJ2-2 CASSRGGGNTGELFF Insulin B Human 9-23 MHC II 
DQ8 1 T1D#3 C8 TBD TRAJ23 TRAV17 CATDAGYNQGGKLIF TRBV5-1 TRBD2 TRBJ1-3 CASSAGNTIYF Insulin B Human 9-23 MHC II DQ8 1"""
    PathBuilder.build(path)

    # writelines() receives a str here and therefore writes it character by character;
    # the resulting file content equals the string
    with open(path + "rep1.tsv", "w") as file:
        file.writelines(rep1text)

    if add_metadata:
        with open(path + "metadata.csv", "w") as file:
            file.writelines(
                """filename,chain,subject_id,coeliac status (yes/no) rep1.tsv,TRA,1234e,no"""
            )
def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
    """
    Merge repertoires that share the same "subject_id" metadata value.

    :param dataset: dataset to process; it is cloned and the clone is modified and returned
    :param params: dict from which "result_path" is read (directory for the stored repertoires
        and the new metadata file)
    :return: the cloned dataset with its repertoires and metadata_file replaced
    """
    rep_map = {}  # subject_id -> repertoire still waiting for a partner with the same subject
    repertoires = []
    indices_to_keep = []

    processed_dataset = dataset.clone()
    PathBuilder.build(params["result_path"])

    for index, repertoire in enumerate(processed_dataset.get_data()):
        if repertoire.metadata["subject_id"] in rep_map.keys():
            # second repertoire of this subject: concatenate both sequence sets and store the result
            sequences = np.append(
                repertoire.sequences,
                rep_map[repertoire.metadata["subject_id"]].sequences)
            # the entry is removed, so a further repertoire of the same subject starts a new pair
            del rep_map[repertoire.metadata["subject_id"]]
            repertoires.append(
                SubjectRepertoireCollector.store_repertoire(
                    params["result_path"], repertoire, sequences))
        else:
            rep_map[repertoire.metadata["subject_id"]] = repertoire
            # NOTE(review): nesting could not be recovered from the source layout; this reading
            # keeps the index of the first repertoire seen for a subject only - confirm.
            indices_to_keep.append(index)

    # repertoires that never found a partner are stored with their own sequences
    for key in rep_map.keys():
        repertoires.append(
            SubjectRepertoireCollector.store_repertoire(
                params["result_path"], rep_map[key], rep_map[key].sequences))

    processed_dataset.repertoires = repertoires
    # the new metadata is built from the original dataset, restricted to the kept indices
    processed_dataset.metadata_file = SubjectRepertoireCollector.build_new_metadata(
        dataset, indices_to_keep, params["result_path"])

    return processed_dataset
def create_dummy_dataset(self, path, add_metadata):
    """
    Write the fixture file rep1.tsv into `path` and, if requested, a matching metadata.csv.

    :param path: target directory (created through PathBuilder); used as a string prefix for the file names
    :param add_metadata: when truthy, metadata.csv referencing rep1.tsv is written as well
    """
    # NOTE(review): the literals below are reproduced exactly as they appear in this view;
    # row and column separators could not be verified - confirm against the original file.
    rep1text = """nucleotide aminoAcid count (templates/reads) frequencyCount (%) cdr3Length vMaxResolved vFamilyName vGeneName vGeneAllele vFamilyTies vGeneNameTies vGeneAlleleTies dMaxResolved dFamilyName dGeneName dGeneAllele dFamilyTies dGeneNameTies dGeneAlleleTies jMaxResolved jFamilyName jGeneName jGeneAllele jFamilyTies jGeneNameTies jGeneAlleleTies vDeletion n1Insertion d5Deletion d3Deletion n2Insertion jDeletion vIndex n1Index dIndex n2Index jIndex estimatedNumberGenomes sequenceStatus cloneResolved vOrphon dOrphon jOrphon vFunction dFunction jFunction fractionNucleated vAlignLength vAlignSubstitutionCount vAlignSubstitutionIndexes vAlignSubstitutionGeneThreePrimeIndexes vSeqWithMutations GCCATCCCCAACCAGACAGCTCTTTACTTCTGTGCCACCAGTGATCAACTTAACCGTTGGGGGACCGGGGAGCTGTTTTTTGGAGAA CATSDQLNRWGTGELFF 38 0.0017525250196006087 51 TCRBV24 TCRBV24 TCRBV24-01,TCRBV24-or09_02 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 TCRBJ02-02*01 TCRBJ02 TCRBJ02-02 01 3 0 6 1 13 5 30 45 58 -1 63 38 In VDJ GGGTTGGAGTCGGCTGCTCCCTCCCAAACATCTGTGTACTTCTGTGCCAGCAAGGACGGCGACACCGGGGAGCTGTTTTTTGGAGAA CASKDGDTGELFF 48 0.002213715814232348 39 TCRBV06 TCRBV06 TCRBV06-02,TCRBV06-03 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 TCRBJ02-02*01 TCRBJ02 TCRBJ02-02 01 7 4 1 7 1 3 42 52 53 57 61 48 In VDJ AGGCCCTCACATACCTCTCAGTACCTCTGTGCCAGCAGTGGGGAGGGACAGGGGGTATTTGGTGGCACTGAAGCTTTCTTTGGACAA CASSGEGQGVFGGTEAFF 37 0.001706405940137435 54 TCRBV25-01*01 TCRBV25 TCRBV25-01 01 TCRBD01-01*01 TCRBD01 TCRBD01-01 01 TCRBJ01-01*01 TCRBJ01 TCRBJ01-01 01 4 10 0 1 4 4 27 40 44 55 65 37 In VDJ GAGTCGGCTGCTCCCTCCCAGACATCTGTGTACTTCTGTGCCAGCAGTGAGGAGGTAGGGGGCAATCAGCCCCAGCATTTTGGTGAT CASSEEVGGNQPQHF 53 0.0024443112115482175 45 TCRBV06-01*01 TCRBV06 TCRBV06-01 01 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 TCRBJ01-05*01 TCRBJ01 TCRBJ01-05 01 3 0 5 2 6 2 36 50 56 -1 61 53 In VDJ GAGTCGGCTGCTCCCTCCCAGACATCTGTGTACTTCTGTGCCAGCAGTGAATTACAGGAAGGTTATGAGACCCAGTACTTCGGGCCA CASSELQEGYETQYF 28 
0.0012913342249688696 45 TCRBV06-01*01 TCRBV06 TCRBV06-01 01 TCRBD01-01*01 TCRBD01 TCRBD01-01 01 TCRBJ02-05*01 TCRBJ02 TCRBJ02-05 01 2 8 3 4 2 5 36 51 53 58 66 28 In VDJ TTGGAGTCGGCTGCTCCCTCCCAAACATCTGTGTACTTCTGTGCCAGCAGTTTCCTAGCGGACCCCGGAGAGCAGTTCTTCGGGCCA CASSFLADPGEQFF 16 7.379052714107826E-4 42 TCRBV06 TCRBV06 TCRBV06-02,TCRBV06-03 TCRBD02-01 TCRBD02 TCRBD02-01 01,02 TCRBJ02-01*01 TCRBJ02 TCRBJ02-01 01 4 8 4 5 2 10 39 52 54 61 69 16 In VDJ CAGCGCACAGAGCAGGGGGACTCGGCCATGTATCTCTGTGCCAGCAGCTCACTTTGGGGTCGGAGGTATGGCTACACCTTCGGTTCG CASSSLWGRRYGYTF 72 0.003320573721348522 45 TCRBV07-09 TCRBV07 TCRBV07-09 01,03 TCRBD02-01*02 TCRBD02 TCRBD02-01 02 TCRBJ01-02*01 TCRBJ01 TCRBJ01-02 01 4 0 10 1 12 5 36 49 61 -1 66 72 In VDJ AGCAACATGAGCCCTGAAGACAGCAGCATATATCTCTGCAGCGTTTTGGACCTCCCGACCCAAACAGATACGCAGTATTTTGGCCCA CSVLDLPTQTDTQYF 14 6.456671124844348E-4 45 TCRBV29-01*01 TCRBV29 TCRBV29-01 01 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 TCRBJ02-03*01 TCRBJ02 TCRBJ02-03 01 5 12 1 7 2 3 36 45 47 51 63 14 In VDJ CAGCGCACACAGCAGGAGGACTCGGCCGTGTATCTCTGTGCCAGCAGCTTAAGGCTAGCGGGAGTGGAGACCCAGTACTTCGGGCCA CASSLRLAGVETQYF 26 0.0011990960660425217 45 TCRBV07-02*01 TCRBV07 TCRBV07-02 01 TCRBD02-01*02 TCRBD02 TCRBD02-01 02 TCRBJ02-05*01 TCRBJ02 TCRBJ02-05 01 2 2 4 2 3 5 36 51 54 64 66 26 In VDJ CTGGAGTCGGCTGCTCCCTCCCAGACATCTGTGTACTTCTGTGCCAGCAGCAGCGGTCCAGGGATGGAGACCCAGTACTTCGGGCCA CASSSGPGMETQYF 13 5.995480330212608E-4 42 TCRBV06-01*01 TCRBV06 TCRBV06-01 01 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 TCRBJ02-05*01 TCRBJ02 TCRBJ02-05 01 6 3 4 3 8 5 39 50 58 63 66 13 In VDJ TCTAAGAAGCTCCTTCTCAGTGACTCTGGCTTCTATCTCTGTGCCTGGAGTGCTATAGCGGATTACAATGAGCAGTTCTTCGGGCCA CAWSAIADYNEQFF 8 3.689526357053913E-4 42 TCRBV30-01*01 TCRBV30 TCRBV30-01 01 TCRBD02-01 TCRBD02 TCRBD02-01 01,02 TCRBJ02-01*01 TCRBJ02 TCRBJ02-01 01 1 2 5 5 3 4 39 52 55 61 63 8 In VDJ TCCCTGATTCTGGAGTCCGCCAGCACCAACCAGACATCTATGTACCTCTGTGCCAGCAGTTTAATAGATACGCAGTATTTTGGCCCA CASSLIDTQYF 16 7.379052714107826E-4 33 TCRBV28-01*01 TCRBV28 TCRBV28-01 01 
TCRBJ02-03*01 TCRBJ02 TCRBJ02-03 01 2 2 0 0 0 5 48 -1 -1 63 65 16 In VJ ATCCGGTCCACAAAGCTGGAGGACTCAGCCATGTACTTCTGTGCCAGCAGATCGGGACAGGGATGGGATGAGCAGTTCTTCGGGCCA CASRSGQGWDEQFF 8 3.689526357053913E-4 42 TCRBV02-01*01 TCRBV02 TCRBV02-01 01 TCRBD01-01*01 TCRBD01 TCRBD01-01 01 TCRBJ02-01*01 TCRBJ02 TCRBJ02-01 01 6 5 0 3 3 8 39 50 53 62 67 8 In VDJ ATCAATTCCCTGGAGCTTGGTGACTCTGCTGTGTATTTCTGTGCCAGCAGCCCTAGCGGAGACACCGGGGAGCTGTTTTTTGGAGAA CASSPSGDTGELFF 28 0.0012913342249688696 42 TCRBV03 TCRBV03 TCRBV03-01,TCRBV03-02 TCRBD02-01 TCRBD02 TCRBD02-01 01,02 TCRBJ02-02*01 TCRBJ02 TCRBJ02-02 01 4 2 4 5 0 3 39 -1 52 59 61 28 In VDJ GGTCCACAAAGCTGGAGGACTCAGCCATGTACTTCTGTGCCAGCAGTCCCGGGGGACGGGGCTTCATACGAGCAGTACTTCGGGCCG 8 3.689526357053913E-4 46 TCRBV02-01*01 TCRBV02 TCRBV02-01 01 TCRBD02-01*01 TCRBD02 TCRBD02-01 01 TCRBJ02-07*01 TCRBJ02 TCRBJ02-07 01 5 11 8 2 2 4 35 47 49 55 66 8 Out VDJ GAGTCGGCTGCTCCCTCCCAAACATCTGTGTACTTCTGTGCCAGCAGTTCCGACAGCGGTCCCTACAATGAGCAGTTCTTCGGGCCA CASSSDSGPYNEQFF 7 3.228335562422174E-4 45 TCRBV06 TCRBV06 TCRBV06-02,TCRBV06-03 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 TCRBJ02-01*01 TCRBJ02 TCRBJ02-01 01 4 5 2 5 2 2 36 49 51 56 61 7 In VDJ GGGTTGGAGTCGGCTGCTCCCTCCCAAACATCTGTGTACTTCTGTGCCAGCAGTCCAGGGGACACCGGGGAGCTGTTTTTTGGAGAA CASSPGDTGELFF 1 4.611907946317391E-5 39 TCRBV06 TCRBV06 TCRBV06-02,TCRBV06-03 TCRBD01-01*01 TCRBD01 TCRBD01-01 01 TCRBJ02-02*01 TCRBJ02 TCRBJ02-02 01 5 0 4 2 1 3 42 54 55 -1 61 1 In VDJ CTGAACATGAGCTCCTTGGAGCTGGGGGACTCAGCCCTGTACTTCTGTGCCAGCAGCTTACGCACAGATACGCAGTATTTTGGCCCA CASSLRTDTQYF 9 4.1507171516856525E-4 36 TCRBV13-01*01 TCRBV13 TCRBV13-01 01 TCRBJ02-03*01 TCRBJ02 TCRBJ02-03 01 2 1 0 0 0 1 45 -1 -1 60 61 9 In VJ AAGAAGCTCCTTCTCAGTGACTCTGGCTTCTATCTCTGTGCCTGGAGTGTACGTCCGGGCGCAGGGTACGAGCAGTACTTCGGGCCG CAWSVRPGAGYEQYF 1 4.611907946317391E-5 45 TCRBV30-01*01 TCRBV30 TCRBV30-01 01 TCRBD01-01*01 TCRBD01 TCRBD01-01 01 TCRBJ02-07*01 TCRBJ02 TCRBJ02-07 01 0 0 4 3 11 4 36 50 61 -1 66 1 In VDJ """
    PathBuilder.build(path)

    # writelines() receives a str here and therefore writes it character by character;
    # the resulting file content equals the string
    with open(path + "rep1.tsv", "w") as file:
        file.writelines(rep1text)

    if add_metadata:
        with open(path + "metadata.csv", "w") as file:
            file.writelines(
                """filename,chain,subject_id,coeliac status (yes/no) rep1.tsv,TRA,1234a,no""")
def test_generate(self):
    """Run the motif seed recovery report and check the reported paths and the written CSV content."""
    path = EnvironmentSettings.root_path + "test/tmp/motifseedrecovery/"
    PathBuilder.build(path)

    report = self._create_report(path)

    # Running the report
    result = report.generate_report()

    csv_path = path + "motif_seed_recovery.csv"
    html_path = path + "motif_seed_recovery.html"

    self.assertIsInstance(result, ReportResult)
    self.assertEqual(result.output_tables[0].path, csv_path)
    self.assertEqual(result.output_figures[0].path, html_path)

    # Actual tests
    for output_file in (csv_path, html_path):
        self.assertTrue(os.path.isfile(output_file))

    written_data = pd.read_csv(csv_path)

    self.assertListEqual(list(written_data.columns), ["features", "max_seed_overlap", "coefficients"])

    # expected content per column, checked in this order
    expected_columns = (
        ("coefficients", list(range(5))),
        ("features", ["AAA", "AAC", "CKJ", "KSA", "AKJ"]),
        ("max_seed_overlap", [3, 2, 0, 1, 1]),
    )
    for column, expected_values in expected_columns:
        self.assertListEqual(list(written_data[column]), expected_values)

    shutil.rmtree(path)
def test_parse_receptor_dataset(self):
    """Check that ImportParser.parse builds a ReceptorDataset from a paired VDJdb file.

    A four-row VDJdb fixture (complex ids 3050 and 15760, each with one TRB and
    one TRA row) is written to ``receptor_data/receptors.tsv``; the parsed
    dataset ``d1`` must be a ``ReceptorDataset`` with two examples.
    """
    # NOTE(review): in this view the fixture's column separators and row breaks
    # show up as single spaces. The file is written as receptors.tsv, so tabs and
    # newlines are presumably expected -- confirm the literal still contains them.
    file_content = """complex.id Gene CDR3 V J Species MHC A MHC B MHC class Epitope Epitope gene Epitope species Reference Method Meta CDR3fix Score 3050 TRB CASSPPRVYSNGAGLAGVGWRNEQFF TRBV5-4*01 TRBJ2-1*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0 15760 TRB CASSWTWDAATLWGQGALGGANVLTF TRBV5-5*01 TRBJ2-6*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, 
"vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"} 0 3050 TRA CAAIYESRGSTLGRLYF TRAV13-1*01 TRAJ18*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"} 0 15760 TRA CALRLNNQGGKLIF TRAV9-2*01 TRAJ23*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"} 0 """
    # `path` is handed to ImportParser.parse as its third argument; `data_path`
    # is a sub-directory of it that holds the raw input file.
    path = EnvironmentSettings.root_path + "test/tmp/dslimportparservdj/"
    data_path = EnvironmentSettings.root_path + "test/tmp/dslimportparservdj/receptor_data/"
    PathBuilder.build(data_path)
    with open(data_path + "receptors.tsv", "w") as file:
        file.writelines(file_content)
    # Only the returned symbol table is inspected; `desc` is not used by this test.
    st, desc = ImportParser.parse(
        {
            "datasets": {
                "d1": {
                    "format": "VDJdb",
                    "params": {
                        "is_repertoire": False,
                        "paired": True,
                        "receptor_chains": "TRA_TRB",
                        "path": data_path
                    }
                }
            }
        }, SymbolTable(), path)
    dataset = st.get("d1")
    self.assertTrue(isinstance(dataset, ReceptorDataset))
    # Four fixture rows, two examples: presumably one paired receptor per
    # complex.id (3050 and 15760) -- confirm against the VDJdb importer.
    self.assertEqual(2, dataset.get_example_count())
    # Removing `path` also removes the nested receptor_data/ directory.
    shutil.rmtree(path)
def test_process(self):
    """Exercise ClonesPerRepertoireFilter.process with lower and upper limits.

    The dataset holds three repertoires with 3, 2 and 4 sequences. A lower limit
    of 3 must keep two of them, an upper limit of 2 must keep one, and a lower
    limit of 10 must raise an AssertionError.
    """
    path = EnvironmentSettings.root_path + "test/tmp/clones_per_repertoire_filter/"
    PathBuilder.build(path)

    sequences_per_repertoire = [["ACF", "ACF", "ACF"],
                                ["ACF", "ACF"],
                                ["ACF", "ACF", "ACF", "ACF"]]
    built = RepertoireBuilder.build(sequences_per_repertoire, path)
    dataset = RepertoireDataset(repertoires=built[0])

    lower_limit_params = {"lower_limit": 3, "result_path": path}
    kept_by_lower = ClonesPerRepertoireFilter.process(dataset, lower_limit_params)
    self.assertEqual(2, kept_by_lower.get_example_count())

    upper_limit_params = {"upper_limit": 2, "result_path": path}
    kept_by_upper = ClonesPerRepertoireFilter.process(dataset, upper_limit_params)
    self.assertEqual(1, kept_by_upper.get_example_count())

    # No repertoire reaches ten clones, so the filter is expected to assert.
    with self.assertRaises(AssertionError):
        ClonesPerRepertoireFilter.process(dataset, {"lower_limit": 10, "result_path": path})

    shutil.rmtree(path)
def test_create_model(self):
    """Check that KmerPairModelCreator.create_model returns a usable Word2Vec model.

    Two repertoires are built from the sequences "CASSVFA" and "CASSCCC"; the
    model is created with k=2, vector_size=16 and batch_size=1 and stored under
    the scratch directory. The result must be a ``Word2Vec`` instance whose
    vocabulary contains "CA" and has 400 entries.
    """
    test_path = EnvironmentSettings.root_path + "test/tmp/w2v_test_tmp/"
    PathBuilder.build(test_path)
    # Registered up front so the scratch directory is removed even when an
    # assertion fails (a trailing rmtree would be skipped in that case).
    self.addCleanup(shutil.rmtree, test_path)

    sequence1 = ReceptorSequence("CASSVFA")
    sequence2 = ReceptorSequence("CASSCCC")

    metadata1 = {"T1D": "T1D", "subject_id": "1"}
    rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2], test_path, metadata1)

    metadata2 = {"T1D": "CTL", "subject_id": "2"}
    rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path, metadata2)

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    model_creator = KmerPairModelCreator()
    model = model_creator.create_model(dataset=dataset,
                                       k=2,
                                       vector_size=16,
                                       batch_size=1,
                                       model_path=test_path + "model.model")

    # NOTE(review): `wv.vocab` is the gensim < 4 API; gensim 4 replaced it with
    # `key_to_index` -- confirm the pinned gensim version before upgrading.
    vocabulary = model.wv.vocab

    # assertIsInstance / assertIn report the offending value on failure,
    # unlike assertTrue(...) which only reports "False is not true".
    self.assertIsInstance(model, Word2Vec)
    self.assertIn("CA", vocabulary)
    # 400 is presumably 20 ** 2, i.e. every 2-mer over a 20-letter amino acid
    # alphabet -- TODO confirm against KmerPairModelCreator.
    self.assertEqual(400, len(vocabulary))
def run(self):
    """Run the benchmark: one ImmuneML run per dataset, then reports and HTML.

    The specification is split per dataset via ``self._split_specs_file()``, each
    part is executed with ``ImmuneMLApp`` into ``<result_path>/<dataset name>/``,
    and the first element returned by each ``app.run()`` is collected. The
    collected states are passed to ``self._run_reports`` and the report results
    to ``MultiDatasetBenchmarkHTMLBuilder.build``. Progress is printed to stdout.
    """
    print("Starting MultiDatasetBenchmarkTool...", flush=True)
    PathBuilder.build(self.result_path)

    specs = self._split_specs_file()
    self._extract_reports()

    dataset_names = list(specs.keys())
    dataset_total = len(dataset_names)

    instruction_states = {}
    dataset_result_paths = {}

    for position, specs_name in enumerate(dataset_names, start=1):
        progress = f"{specs_name} ({position}/{dataset_total}).."
        dataset_result_path = f"{self.result_path}/{specs_name}/"
        dataset_result_paths[specs_name] = dataset_result_path

        print(f"Running nested cross-validation on dataset {progress}", flush=True)

        app = ImmuneMLApp(specification_path=specs[specs_name], result_path=dataset_result_path)
        # Only the first element of the run output is kept for the reports.
        instruction_states[specs_name] = app.run()[0]

        print(f"Finished nested cross-validation on dataset {progress}", flush=True)

    print("Running reports on the results of nested cross-validation on all datasets...", flush=True)
    report_results = self._run_reports(instruction_states)

    print("Finished reports, now generating HTML output...", flush=True)
    MultiDatasetBenchmarkHTMLBuilder.build(report_results, self.result_path, dataset_result_paths)

    print("MultiDatasetBenchmarkTool finished.", flush=True)
def test_implant_in_repertoire(self):
    """Implant the motif "CCC" into a two-sequence repertoire and check the result.

    ``HealthySequenceImplanting.implant_in_repertoire`` is called with 0.5 as its
    second argument. Afterwards at least one of the original sequences must still
    be present unchanged and at least one sequence must contain "CCC".
    """
    path = EnvironmentSettings.tmp_test_path + "healthysequenceimplanting/"
    PathBuilder.build(path)

    source_sequences = [
        ReceptorSequence(amino_acid_sequence="ACDFQ", identifier="1"),
        ReceptorSequence(amino_acid_sequence="TGCDF", identifier="2"),
    ]
    repertoire = Repertoire.build_from_sequence_objects(source_sequences,
                                                        path=path,
                                                        metadata={"subject_id": "1"})

    implanting = HealthySequenceImplanting(GappedMotifImplanting(),
                                           implanting_computation=ImplantingComputation.ROUND)
    motif = Motif("m1", GappedKmerInstantiation(), "CCC")
    signal = Signal("1", [motif], implanting)

    implanted_repertoire = implanting.implant_in_repertoire(repertoire, 0.5, signal, path)
    new_sequences = [item.get_sequence() for item in implanted_repertoire.sequences]

    original_survived = any(original in new_sequences for original in ("ACDFQ", "TGCDF"))
    motif_implanted = any("CCC" in item for item in new_sequences)

    self.assertTrue(original_survived)
    self.assertTrue(motif_implanted)

    shutil.rmtree(path)
def test_load_repertoire(self):
    """Import an IGoR repertoire dataset with default parameters and check its content.

    Dummy input files are written with ``self.write_dummy_files(path, True)`` and
    imported via ``IGoRImport.import_dataset`` using the default "igor" parameters
    (``import_with_stop_codon`` is not overridden here). The dataset must contain
    two repertoires with one sequence each, the expected amino acid sequence and
    count for the first repertoire, and no chain information.
    """
    path = EnvironmentSettings.root_path + "test/tmp/io_igor_load/"
    PathBuilder.build(path)
    # Registered before any assertion: the same scratch path is used by
    # test_load_repertoire_with_stop_codon, so files left behind by a failed run
    # must not leak into the next test.
    self.addCleanup(shutil.rmtree, path)

    self.write_dummy_files(path, True)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "igor")
    params["is_repertoire"] = True
    params["result_path"] = path
    params["path"] = path
    params["metadata_file"] = path + "metadata.csv"

    dataset = IGoRImport.import_dataset(params, "igor_repertoire_dataset")
    first_repertoire = dataset.repertoires[0]

    self.assertEqual(2, dataset.get_example_count())
    self.assertEqual(len(first_repertoire.sequences), 1)
    self.assertEqual(len(dataset.repertoires[1].sequences), 1)

    self.assertEqual(first_repertoire.sequences[0].amino_acid_sequence,
                     "ARDRWSTPVLRYFDWWTPPYYYYMDV")

    self.assertListEqual(list(first_repertoire.get_counts()), [1])
    # assertIsNone checks identity and prints the actual value on failure.
    self.assertIsNone(first_repertoire.get_chains())