def _generate(self) -> ReportResult:
    """Cluster receptors with TCRdist and run motif discovery on each large-enough cluster.

    Creates ``self.result_path``, computes TCRdist distances for the dataset
    returned by ``_extract_positive_example_dataset``, clusters the clones on
    the sum of the alpha and beta distance matrices, and calls
    ``_discover_motif_in_cluster`` for every cluster with at least
    ``self.min_cluster_size`` neighbors. A cluster summary is written to
    ``tcrdist_summary.csv`` and appended as the last output table.

    Returns:
        ReportResult holding the collected figures and tables.
    """
    # Imports stay local to the method, as in the original implementation.
    from immuneML.util.TCRdistHelper import TCRdistHelper
    from tcrdist.rep_diff import hcluster_diff
    from tcrdist.summarize import member_summ

    PathBuilder.build(self.result_path)

    positive_dataset = self._extract_positive_example_dataset()
    reference_sequences = self._extract_reference_sequences()

    tcr_rep = TCRdistHelper.compute_tcr_dist(positive_dataset, [self.label.name], self.cores)
    tcr_rep.hcluster_df, tcr_rep.Z = hcluster_diff(clone_df=tcr_rep.clone_df,
                                                   pwmat=tcr_rep.pw_alpha + tcr_rep.pw_beta,
                                                   x_cols=["epitope"],
                                                   count_col='count')

    figures = []
    tables = []

    logging.info(f'{TCRdistMotifDiscovery.__name__}: created {tcr_rep.hcluster_df.shape[0]} clusters, now discovering motifs in clusters.')

    for index, row in tcr_rep.hcluster_df.iterrows():
        # clusters below the size threshold are skipped entirely
        if len(row['neighbors_i']) < self.min_cluster_size:
            continue
        cluster_figures, cluster_tables = self._discover_motif_in_cluster(tcr_rep, index, row, reference_sequences)
        figures.extend(cluster_figures)
        tables.extend(cluster_tables)

    summary_path = self.result_path / "tcrdist_summary.csv"
    summary = member_summ(res_df=tcr_rep.hcluster_df, clone_df=tcr_rep.clone_df, addl_cols=['epitope'])
    summary.to_csv(summary_path)
    tables.append(ReportOutput(path=summary_path, name="TCRdist summary (csv)"))

    return ReportResult(name=self.name, info="TCRdist motif discovery", output_figures=figures, output_tables=tables)
def create_dataset(self, path, dataset_size: int = 50):
    """Create a pickled ``SequenceDataset`` whose sequences alternate between two classes.

    Sequences at even positions are "AAACCC" with custom parameter ``l1 = 1``;
    sequences at odd positions are "ACACAC" with ``l1 = 2``. All sequences are
    pickled into ``path / "sequences.pkl"``.

    Args:
        path: directory (created if needed) where the pickle file is stored.
        dataset_size: number of sequences to generate.

    Returns:
        SequenceDataset with label ``l1`` pointing at the pickled file.
    """
    sequences = [
        ReceptorSequence(amino_acid_sequence="AAACCC" if i % 2 == 0 else "ACACAC",
                         identifier=str(i),
                         metadata=SequenceMetadata(custom_params={"l1": 1 if i % 2 == 0 else 2}))
        for i in range(dataset_size)
    ]

    PathBuilder.build(path)
    filename = path / "sequences.pkl"
    with open(filename, "wb") as file:
        pickle.dump(sequences, file)

    # The original also built a LabelConfiguration that was never used or
    # returned; that dead local has been removed.
    return SequenceDataset(labels={"l1": [1, 2]}, filenames=[filename], identifier="d1")
def _create_dummy_data(self, path, dataset_type):
    """Build a small five-sequence repertoire and wrap it in the requested dataset type.

    Args:
        path: directory (created if needed) used for the repertoire and any pickle file.
        dataset_type: "receptor" to get a ReceptorDataset backed by ``receptors.pkl``,
            "repertoire" to get a RepertoireDataset.

    Returns:
        The dataset, or None when ``dataset_type`` is neither of the two values.
    """
    PathBuilder.build(path)

    sequence_count = 5
    test_repertoire = Repertoire.build(
        sequence_aas=["DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"],
        v_genes=["V1-1"] * sequence_count,
        j_genes=["J1-1"] * sequence_count,
        chains=[Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA],
        custom_lists={
            "custom_1": [f"CUST-{i}" for i in range(sequence_count)],
            "custom_2": ["CUST-A"] * 3 + ["CUST-B"] * 2,
        },
        cell_ids=[1, 1, 1, 2, 2],
        path=path)

    if dataset_type == "receptor":
        receptors_file = path / "receptors.pkl"
        with open(receptors_file, "wb") as file:
            pickle.dump(test_repertoire.receptors, file)
        return ReceptorDataset(filenames=[receptors_file], identifier="receptor_dataset")

    if dataset_type == "repertoire":
        test_repertoire.identifier = "repertoire_dataset"
        return RepertoireDataset(repertoires=[test_repertoire])

    return None
def _generate(self) -> ReportResult:
    """Run ``compute_contributions`` for the trained model on the encoded test dataset.

    Converts the test metadata file to HDF5 through the ML method, builds an
    evaluation data loader over all test examples, fetches the model for
    ``self.label`` and passes everything to ``compute_contributions``, which
    is given ``self.result_path`` and the two output filenames.

    Returns:
        ReportResult listing ``self.filename_inputs`` and ``self.filename_kernels`` as figures.
    """
    PathBuilder.build(self.result_path)

    metadata_filepath = self.test_dataset.encoded_data.info['metadata_filepath']
    hdf5_filepath = self.method._metadata_to_hdf5(metadata_filepath, [self.label])

    example_count = len(self.test_dataset.encoded_data.example_ids)
    all_indices = np.array(range(example_count))

    test_loader = self.method.make_data_loader(hdf5_filepath,
                                               pre_loaded_hdf5_file=None,
                                               indices=all_indices,
                                               label=self.label,
                                               eval_only=True,
                                               is_train=False)

    deeprc_model = self.method.get_model(self.label)[self.label]

    compute_contributions(intgrds_set_loader=test_loader,
                          deeprc_model=deeprc_model,
                          n_steps=self.n_steps,
                          threshold=self.threshold,
                          resdir=self.result_path,
                          filename_inputs=self.filename_inputs,
                          filename_kernels=self.filename_kernels)

    figures = [ReportOutput(self.filename_inputs), ReportOutput(self.filename_kernels)]
    return ReportResult(self.name, output_figures=figures)
def test_generate(self):
    """RelevantSequenceExporter writes one table whose columns are all among v_call, j_call and cdr3_aa."""
    path = EnvironmentSettings.tmp_test_path / "relevant_sequence_exporter/"
    PathBuilder.build(path)

    sequences_file = path / 'sequences.csv'
    relevant_sequences = pd.DataFrame({
        "v_genes": ["TRBV1-1", "TRBV1-1"],
        'j_genes': ["TRBJ1-1", "TRBJ1-2"],
        "sequence_aas": ['ACCF', "EEFG"],
    })
    relevant_sequences.to_csv(sequences_file, index=False)

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(2, {2: 1}, {4: 1}, {}, path / "data")
    dataset.encoded_data = EncodedData(examples=None,
                                       info={'relevant_sequence_path': sequences_file},
                                       encoding="SequenceAbundanceEncoder")

    report_result = RelevantSequenceExporter(dataset, path / "result", 'somename').generate_report()

    self.assertEqual(1, len(report_result.output_tables))

    exported_path = report_result.output_tables[0].path
    self.assertTrue(os.path.isfile(exported_path))

    allowed_columns = ["v_call", "j_call", "cdr3_aa"]
    exported_columns = pd.read_csv(exported_path).columns
    self.assertTrue(all(col in allowed_columns for col in exported_columns))

    shutil.rmtree(path)
def test_load_sequence_dataset(self):
    """Import an unpaired, non-repertoire IGoR dataset with stop codons kept and check its four nucleotide sequences."""
    path = EnvironmentSettings.root_path / "test/tmp/io_igor_load/"
    PathBuilder.build(path)
    self.write_dummy_files(path, False)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "igor")
    params.update({
        "is_repertoire": False,
        "paired": False,
        "result_path": path,
        "path": path,
        "import_with_stop_codon": True,
    })

    dataset = IGoRImport.import_dataset(params, "igor_seq_dataset")
    imported_sequences = list(dataset.get_data())

    self.assertEqual(4, dataset.get_example_count())

    expected_nucleotides = [
        "GCGAGACGTGTCTAGGGAGGATATTGTAGTAGTACCAGCTGCTATGACGGGCGGTCCGGTAGTACTACTTTGACTAC",
        "GCGAGAGGCTTCCATGGAACTACAGTAACTACGTTTGTAGGCTGTAGTACTACATGGACGTC",
        "GCGAGAGTTAATCGGCATATTGTGGTGGTGACTGCTATTATGACCGGGTAAAACTGGTTCGACCCC",
        "GCGAGAGATAGGTGGTCAACCCCAGTATTACGATATTTTGACTGGTGGACCCCGCCCTACTACTACTACATGGACGTC",
    ]
    # order of the imported sequences is not part of the check
    self.assertListEqual(sorted(expected_nucleotides),
                         sorted([seq.nucleotide_sequence for seq in imported_sequences]))

    shutil.rmtree(path)
def export(hp_item: HPItem, path: Path) -> Path:
    """Store the preprocessing sequence, encoder, ML method and an ``ml_config.yaml`` under ``path``.

    Args:
        hp_item: trained hyperparameter item providing the method, its
            hp_setting (encoder, preprocessing) and the training dataset.
        path: output directory; created if needed.

    Returns:
        The same ``path`` that was passed in.
    """
    PathBuilder.build(path)

    hp_setting = hp_item.hp_setting
    method = hp_item.method

    preproc_filename = MLExporter._store_preprocessing_sequence(hp_setting.preproc_sequence, path).name
    encoder_filename = MLExporter._store_encoder(hp_setting.encoder, path).name

    method.store(path, method.get_feature_names())

    labels_with_values = {method.get_label(): method.get_classes()}

    method_config = MLMethodConfiguration(
        labels_with_values=labels_with_values,
        software_used=method.get_package_info(),
        encoding_name=hp_setting.encoder_name,
        encoding_parameters=hp_setting.encoder_params,
        encoding_file=encoder_filename,
        encoding_class=type(hp_setting.encoder).__name__,
        ml_method=type(method).__name__,
        ml_method_name=method.name,
        train_dataset_id=hp_item.train_dataset.identifier,
        train_dataset_name=hp_item.train_dataset.name,
        preprocessing_sequence_name=hp_setting.preproc_sequence_name,
        preprocessing_file=os.path.basename(preproc_filename),
        # one entry per preprocessing step, keyed by the step's class name
        preprocessing_parameters={type(step).__name__: vars(step) for step in hp_setting.preproc_sequence})

    method_config.store(path / 'ml_config.yaml')

    return path
def make_dummy_dataset(self, path, add_metadata):
    """Write the dummy receptor table ``rep1.tsv`` (and optionally ``metadata.csv``) into ``path``.

    Args:
        path: target directory; created if needed.
        add_metadata: when truthy, also write a ``metadata.csv`` that refers to ``rep1.tsv``.
    """
    rep1text = """Clone ID Senior Author TRAJ Gene TRAV Gene CDR3A AA Sequence TRBV Gene TRBD Gene TRBJ Gene CDR3B AA Sequence Antigen Protein Antigen Gene Antigen Species Antigen Peptide AA # Epitope Peptide MHC Class HLA Restriction Counts 1E6 Sewell TRAJ12 TRAV12-3 CAMRGDSSYKLIF TRBV12-4 TRBD2 TRBJ2-4 CASSLWEKLAKNIQYF PPI INS Human 12-24 ALWGPDPAAA MHC I A*02:01 1 4.13 Nepom TRAJ44 TRAV19 CALSENRGGTASKLTF TRBV5-1 TRBD1 TRBJ1-1 CASSLVGGPSSEAFF GAD Human 555-567 MHC II DRB1*04:01 3 5 Roep TRAJ6 TRAV21 CAVKRTGGSYIPTF TRBV11-2 TRBD1 TRBJ2-2 CASSSFWGSDTGELFF Insulin B Human 9-23 MHC II DQ8 7 D222D 2 Mallone TRAJ36*01 TRAV17*01 CAVTGANNLFF TRBV19*01 TRBD1*01 TRBJ2-2*01 CASSIEGPTGELFF Zinc Transporter 8 ZnT8 Human 185-194 AVAANIVLTV MHC I A*02:01 2 GSE.20D11 Nakayama TRAJ4 TRAV12-3 CAILSGGYNKLIF TRBV2 TRBD2 TRBJ2-5 CASSAETQYF Insulin B Human 9-23 MHC II DQ8 10 GSE.6H9 Nakayama TRAJ40 TRAV26-1 CIVRVDSGTYKYIF TRBV7-2 TRBD2 TRBJ2-1 CASSLTAGLASTYNEQFF Insulin B Human 9-23 MHC II DQ8/DQ8 nan iGRP 32 DiLorenzo TRAJ48 TRAV12-1 CVVNILSNFGNEKLTF TRBV20/OR9-2 TRBD1 TRBJ2-1 CSASRQGWVNEQFF IGRP Human 265-273 MHC I A*02:01 1 MART-1 TBD TRAJ23 TRAV12-2 CAVNFGGGKLIF TRBV6-4 TRBD2 TRBJ1-1 CASSLSFGTEAFF Melan A Human 27-35 ELAGIGILTV MHC I A2 3 MHB10.3 TBD TRAJ27 TRAV4 CLVGDSLNTNAGKSTF TRBV29-1 TRBD2 TRBJ2-2 CSVEDRNTGELFF Insulin B Human 11-30 MHC II DRB1*03:01 NA PM1#11 TBD TRAJ54 TRAV35 CAGHSIIQGAQKLVF TRBV5-1 TRBD2 TRBJ2-1 CASGRSSYNEQFF GAD Human 339-352 MHC II DRB1*03:01 2 R164 Nepom TRAJ56 TRAV19 CALSEEGGGANSKLTF TRBV5-1 TRBD2 TRBJ1-6 CASSLAGGANSPLHF GAD Human 555-567 MHC II DRB1*04:01 1 SD32.5 Boehm TRAJ23 TRAV26-1 CIVRVSSAYYNQGGKLIF TRBV27 TRBD2 TRBJ2-3 CASSPRANTDTQYF Insulin A Human 5-21 MHC II DRB1*04:01 1 SD52.c1 Boehm TRAJ27 TRAV4 CLVGDSLNTNAGKSTF TRBV27 TRBD1 TRBJ1-5 CASSWSSIGNQPQHF PPI INS Human C18-A1 MHC II DRB1*04:01 1 T1D#10 C8 TBD TRAJ26 TRAV12-3 CATAYGQNFVF TRBV4-1 TRBD2 TRBJ2-2 CASSRGGGNTGELFF Insulin B Human 9-23 MHC II 
DQ8 1 T1D#3 C8 TBD TRAJ23 TRAV17 CATDAGYNQGGKLIF TRBV5-1 TRBD2 TRBJ1-3 CASSAGNTIYF Insulin B Human 9-23 MHC II DQ8 1"""

    metadata_text = """filename,chain,subject_id,coeliac status (yes/no) rep1.tsv,TRA,1234e,no"""

    PathBuilder.build(path)

    with open(path / "rep1.tsv", "w") as rep_file:
        rep_file.write(rep1text)

    if not add_metadata:
        return

    with open(path / "metadata.csv", "w") as metadata_file:
        metadata_file.write(metadata_text)
def standard_scale(scaler_file: Path, design_matrix, with_mean: bool = True):
    """
    scale to zero mean and unit variance on feature level

    If ``scaler_file`` exists, the pickled scaler in it is loaded and applied;
    otherwise a new ``StandardScaler`` is fitted on ``design_matrix``, applied,
    and pickled to ``scaler_file`` (its parent directory is created if needed).

    :param scaler_file: path to scaler file fitted on train set or where the
           resulting scaler file will be stored
    :param design_matrix: rows -> examples, columns -> features
    :param with_mean: whether to scale to zero mean or not (could lose sparsity if scaled)
    :return: scaled design matrix
    """
    if with_mean and hasattr(design_matrix, "todense"):
        # Centering needs dense input. Prefer toarray(), which gives a plain
        # ndarray: scipy's todense() returns np.matrix, which recent
        # scikit-learn input validation refuses. Objects that only offer
        # todense() keep the previous behaviour.
        densify = getattr(design_matrix, "toarray", design_matrix.todense)
        scaled_design_matrix = densify()
    else:
        scaled_design_matrix = design_matrix

    if scaler_file.is_file():
        # NOTE(review): pickle.load runs arbitrary code from the file; the
        # scaler file must come from a trusted source (e.g. an earlier run).
        with scaler_file.open('rb') as file:
            scaler = pickle.load(file)
        scaled_design_matrix = scaler.transform(scaled_design_matrix)
    else:
        scaler = StandardScaler(with_mean=with_mean)
        scaled_design_matrix = scaler.fit_transform(scaled_design_matrix)

        PathBuilder.build(scaler_file.parent)
        with scaler_file.open('wb') as file:
            pickle.dump(scaler, file)

    return scaled_design_matrix
def create_dummy_receptordataset(self, path):
    """Build a ReceptorDataset of two alpha/beta receptors stored under ``path / "receptors"``.

    Receptor "1" carries the custom parameter ``custom1`` on both chains,
    receptor "2" carries ``custom2`` instead.
    """

    def build_chain(sequence_aa, sequence_id, chain, v_gene, j_gene, d_call, custom_key):
        # every chain is in-frame and has the same custom value "cust1"
        sequence_metadata = SequenceMetadata(v_gene=v_gene, j_gene=j_gene, chain=chain, frame_type="IN",
                                             custom_params={"d_call": d_call, custom_key: "cust1"})
        return ReceptorSequence(amino_acid_sequence=sequence_aa, identifier=sequence_id, metadata=sequence_metadata)

    first_receptor = TCABReceptor(
        identifier="1",
        alpha=build_chain("AAATTT", "1a", Chain.ALPHA, "TRAV1", "TRAJ1", "TRAD1", "custom1"),
        beta=build_chain("ATATAT", "1b", Chain.BETA, "TRBV1", "TRBJ1", "TRBD1", "custom1"))

    second_receptor = TCABReceptor(
        identifier="2",
        alpha=build_chain("AAAAAA", "2a", Chain.ALPHA, "TRAV1", "TRAJ1", "TRAD1", "custom2"),
        beta=build_chain("AAAAAA", "2b", Chain.BETA, "TRBV1", "TRBJ1", "TRBD1", "custom2"))

    receptors_path = path / "receptors"
    PathBuilder.build(receptors_path)

    return ReceptorDataset.build_from_objects([first_receptor, second_receptor], 2, receptors_path)
def create_dataset(self):
    """Create and export a 30-repertoire dataset named "d1" with labels CD and CMV.

    Returns:
        Path of the exported ``d1.iml_dataset`` file inside the
        ``test/tmp/immunemlapp/initial_dataset`` directory (relative path).
    """
    path = Path(os.path.relpath(EnvironmentSettings.root_path / "test/tmp/immunemlapp/initial_dataset"))
    PathBuilder.build(path)

    repertoire_count = 30

    sequences = [["AA", "AAAA", "AAAA", "AAA"] for _ in range(repertoire_count)]
    label_values = {
        "CD": ['yes' if i % 2 == 0 else 'no' for i in range(repertoire_count)],
        "CMV": [i % 2 == 1 for i in range(repertoire_count)],
    }
    # one metadata dict per sequence: alternating chain and a random count in [2, 5]
    sequence_metadata = [
        [{"chain": "A" if i % 2 == 0 else "B", "count": random.randint(2, 5)} for i in range(4)]
        for _ in range(repertoire_count)
    ]

    repertoires, metadata = RepertoireBuilder.build(sequences, path, label_values, sequence_metadata)

    dataset = RepertoireDataset(repertoires=repertoires,
                                metadata_file=metadata,
                                labels={"CD": [True, False], "CMV": [True, False]},
                                name="d1")
    PickleExporter.export(dataset, path)

    return path / "d1.iml_dataset"
def test_repertoire_export(self):
    """Export a one-repertoire dataset with AIRRExporter and check every column of the written TSV."""
    path = EnvironmentSettings.tmp_test_path / "airr_exporter_repertoire/"
    PathBuilder.build(path)

    repertoire, metadata_path = self.create_dummy_repertoire(path)
    dataset = RepertoireDataset(repertoires=[repertoire], metadata_file=metadata_path)

    path_exported = path / "exported"
    AIRRExporter.export(dataset, path_exported)

    resulting_data = pd.read_csv(path_exported / f"repertoires/{repertoire.identifier}.tsv", sep="\t")

    # expected content per exported column, checked in this order
    expected_columns = {
        "sequence_id": ["receptor_1", "receptor_2"],
        "cdr3": ["GCTGCTGCT", "GGTGGTGGT"],
        "cdr3_aa": ["AAA", "GGG"],
        "v_call": ["TRBV1", "TRAV2*01"],
        "j_call": ["TRBJ1", "TRAJ2"],
        "d_call": ["TRBD1", "TRAD2"],
        "locus": ["TRB", "TRA"],
        "duplicate_count": [5, 15],
        "custom_test": ["cust1", "cust2"],
        "productive": ['T', 'F'],
        "stop_codon": ['F', 'F'],
    }
    for column, expected_values in expected_columns.items():
        self.assertListEqual(list(resulting_data[column]), expected_values)

    shutil.rmtree(path)
def test_sequence_export(self):
    """Export a sequence dataset with AIRRExporter and check the columns of ``batch1.tsv`` and ``batch2.tsv``."""
    path = EnvironmentSettings.tmp_test_path / "airr_exporter_receptor/"
    PathBuilder.build(path)

    dataset = self.create_dummy_sequencedataset(path)

    path_exported = path / "exported_sequences"
    AIRRExporter.export(dataset, path_exported)

    # expected content per batch file and column, checked in this order
    expected_by_batch = {
        "batch1.tsv": {
            "sequence_id": ["1a", "1b"],
            "cdr3_aa": ["AAATTT", "ATATAT"],
            "v_call": ["TRAV1", "TRBV1"],
            "j_call": ["TRAJ1", "TRBJ1"],
            "d_call": ["TRAD1", "TRBD1"],
            "locus": ["TRA", "TRB"],
            "custom1": ["cust1", nan],
            "custom2": [nan, "cust1"],
            "productive": ['T', 'T'],
            "stop_codon": ['F', 'F'],
        },
        "batch2.tsv": {
            "sequence_id": ["2b"],
            "cdr3_aa": ["ATATAT"],
            "v_call": ["TRBV1"],
            "j_call": ["TRBJ1"],
            "d_call": ["TRBD1"],
            "locus": ["TRB"],
            "custom2": ["cust1"],
            "productive": ['T'],
            "stop_codon": ['F'],
        },
    }

    for batch_file, expected_columns in expected_by_batch.items():
        resulting_data = pd.read_csv(path_exported / batch_file, sep="\t")
        for column, expected_values in expected_columns.items():
            self.assertListEqual(list(resulting_data[column]), expected_values)

    shutil.rmtree(path)
def test_make_subset(self):
    """``SequenceDataset.make_subset`` keeps exactly the requested examples.

    One hundred sequences are stored in ten ``.npy`` batch files of ten
    records each; a 15-index subset is created and every sequence it yields
    must have an identifier among the requested indices.
    """
    sequences = [ReceptorSequence(amino_acid_sequence="AAA", identifier=str(i)) for i in range(100)]

    path = EnvironmentSettings.tmp_test_path / "element_generator_subset/"
    PathBuilder.build(path)

    batch_files = [path / f"batch{i}.npy" for i in range(10)]
    for i, filepath in enumerate(batch_files):
        batch_sequences = sequences[i * 10:(i + 1) * 10]
        # np.rec is the public location of fromrecords; the former
        # np.core.records path is private and deprecated since NumPy 2.0.
        sequence_matrix = np.rec.fromrecords([seq.get_record() for seq in batch_sequences],
                                             names=ReceptorSequence.get_record_names())
        np.save(str(filepath), sequence_matrix, allow_pickle=False)

    d = SequenceDataset(filenames=batch_files, file_size=10)

    indices = [1, 20, 21, 22, 23, 24, 25, 50, 52, 60, 70, 77, 78, 90, 92]

    d2 = d.make_subset(indices, path, SequenceDataset.TRAIN)

    for batch in d2.get_batch(1000):
        for sequence in batch:
            self.assertIn(int(sequence.identifier), indices)

    self.assertEqual(15, d2.get_example_count())

    shutil.rmtree(path)
def test_load_repertoire(self):
    """Import the dummy IGoR files as a repertoire dataset and check counts, sequence, and chains."""
    path = EnvironmentSettings.root_path / "test/tmp/io_igor_load/"
    PathBuilder.build(path)
    self.write_dummy_files(path, True)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "igor")
    params.update({
        "is_repertoire": True,
        "result_path": path,
        "path": path,
        "metadata_file": path / "metadata.csv",
    })

    dataset = IGoRImport.import_dataset(params, "igor_repertoire_dataset")

    self.assertEqual(2, dataset.get_example_count())

    first_repertoire, second_repertoire = dataset.repertoires[0], dataset.repertoires[1]
    self.assertEqual(len(first_repertoire.sequences), 1)
    self.assertEqual(len(second_repertoire.sequences), 1)

    self.assertEqual(dataset.repertoires[0].sequences[0].amino_acid_sequence, "ARDRWSTPVLRYFDWWTPPYYYYMDV")

    self.assertListEqual(list(dataset.repertoires[0].get_counts()), [1])
    self.assertEqual(dataset.repertoires[0].get_chains(), None)

    shutil.rmtree(path)
def _generate(self) -> ReportResult:
    """Create one ROC figure per assessment split for every usable label.

    A label is skipped with a warning when it does not have exactly two
    values or when its ``positive_class`` is None.

    Returns:
        ReportResult whose ``output_figures`` holds the created figures.
    """
    report_result = ReportResult(
        name=self.name,
        info="Plots ROC curves for all trained ML settings ([preprocessing], encoding, ML model) in the outer loop of cross-validation in the TrainMLModel instruction")

    PathBuilder.build(self.result_path)

    for label in self.state.label_configuration.get_label_objects():
        if len(label.values) != 2:
            logging.warning(f"{ROCCurveSummary.__name__}: report {self.name} is skipping label {label.name} as it has {len(label.values)} "
                            f"classes, while this report expects 2 classes.")
            continue

        if label.positive_class is None:
            logging.warning(f"{ROCCurveSummary.__name__}: report {self.name} is skipping label {label.name} because 'positive_class' parameter "
                            f"is not set.")
            continue

        for split_index in range(self.state.assessment.split_count):
            report_result.output_figures.append(self._create_figure_for_assessment_split(split_index, label))

    return report_result
def test_load_repertoire_with_stop_codon(self):
    """With ``import_with_stop_codon`` enabled, each repertoire has two sequences, including one containing ``*``."""
    path = EnvironmentSettings.root_path / "test/tmp/io_igor_load/"
    PathBuilder.build(path)
    self.write_dummy_files(path, True)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "igor")
    params.update({
        "is_repertoire": True,
        "result_path": path,
        "path": path,
        "import_with_stop_codon": True,
        "metadata_file": path / "metadata.csv",
    })

    dataset_stop_codons = IGoRImport.import_dataset(params, "igor_dataset_stop")

    self.assertEqual(2, dataset_stop_codons.get_example_count())

    for repertoire_index in (0, 1):
        self.assertEqual(len(dataset_stop_codons.repertoires[repertoire_index].sequences), 2)

    first_sequence = dataset_stop_codons.repertoires[0].sequences[0]
    self.assertEqual(first_sequence.amino_acid_sequence, "ARVNRHIVVVTAIMTG*NWFDP")

    shutil.rmtree(path)
def encode_dataset(dataset, hp_setting: HPSetting, path: Path, learn_model: bool, context: dict, number_of_processes: int,
                   label_configuration: LabelConfiguration, encode_labels: bool = True, store_encoded_data: bool = False):
    """Encode ``dataset`` with the encoder of ``hp_setting`` via ``DataEncoder.run``.

    Args:
        dataset: dataset to encode.
        hp_setting: provides the encoder and its parameters.
        path: result directory for the encoding; created if needed.
        learn_model: passed to the encoder; also selects the output filename
            ("train_dataset.pkl" when True, "test_dataset.pkl" otherwise).
        context: accepted but not used by this function.
        number_of_processes: passed on as the encoder's ``pool_size``.
        label_configuration: passed on as ``label_config``.
        encode_labels: passed on to the encoder parameters.
        store_encoded_data: passed on to ``DataEncoderParams``.

    Returns:
        The encoded dataset returned by ``DataEncoder.run``.
    """
    PathBuilder.build(path)

    if learn_model:
        filename = "train_dataset.pkl"
    else:
        filename = "test_dataset.pkl"

    encoder_params = EncoderParams(model=hp_setting.encoder_params,
                                   result_path=path,
                                   pool_size=number_of_processes,
                                   label_config=label_configuration,
                                   learn_model=learn_model,
                                   filename=filename,
                                   encode_labels=encode_labels)

    run_params = DataEncoderParams(dataset=dataset,
                                   encoder=hp_setting.encoder,
                                   encoder_params=encoder_params,
                                   store_encoded_data=store_encoded_data)

    return DataEncoder.run(run_params)
def _generate(self) -> ReportResult:
    """Compute model-vs-reference sequence overlaps for the optimal model and for each assessment split.

    An overlap is only computed for encoders accepted by
    ``ReferenceSequenceOverlap._check_encoder_class``.

    Returns:
        ReportResult with one figure and one table per computed overlap.
    """
    figures = []
    tables = []

    PathBuilder.build(self.result_path)

    optimal_encoder = self.state.optimal_hp_items[self.label].encoder
    if ReferenceSequenceOverlap._check_encoder_class(optimal_encoder):
        figure, data = self._compute_optimal_model_overlap()
        figures.append(figure)
        tables.append(data)

    for assessment_state in self.state.assessment_states:
        encoder = assessment_state.label_states[self.label].optimal_assessment_item.encoder
        if not ReferenceSequenceOverlap._check_encoder_class(encoder):
            continue

        # splits are numbered from 1 in file names and descriptions
        split_number = assessment_state.split_index + 1
        figure_filename = self.result_path / f"assessment_split_{split_number}_model_vs_reference_overlap_{self.label}.pdf"
        df_filename = self.result_path / f"assessment_split_{split_number}_overlap_sequences_{self.label}"
        description = (f"overlap sequences between the model for assessment split "
                       f"{split_number} and reference list")

        figure, data = self._compute_model_overlap(figure_filename, df_filename, encoder, description)
        figures.append(figure)
        tables.append(data)

    return ReportResult(self.name, output_figures=figures, output_tables=tables)
def _generate(self) -> ReportResult:
    """Reshape the dataset to long format, write it as a table and plot it.

    Returns:
        ReportResult with the results table and, when ``_safe_plot`` returned
        a figure, that figure; otherwise the figures argument is None.
    """
    PathBuilder.build(self.result_path)

    long_format = DataReshaper.reshape(self.dataset)
    table_result = self._write_results_table(long_format)
    figure_result = self._safe_plot(data_long_format=long_format)

    if figure_result is None:
        output_figures = None
    else:
        output_figures = [figure_result]

    return ReportResult(self.name, output_figures, [table_result])
def _generate(self) -> ReportResult:
    """Plot FP and FN values per metadata label and write the matching tables.

    For every entry of ``self.metadata_labels`` one subplot row is drawn with
    two bar charts (columns "FP" and "FN"), using data from ``self._metrics``.
    The figure is saved as ``plots.html`` in ``self.result_path``.

    Returns:
        ReportResult with the HTML figure and the first path returned by
        ``_write_results_table``.
    """
    PathBuilder.build(self.result_path)

    # predictions of the ML method for the label under analysis
    predictions = self.method.predict(self.test_dataset.encoded_data, self.label)[self.label]
    true_labels = self.test_dataset.get_metadata(self.metadata_labels + [self.label])

    metrics = ["FP", "FN"]
    plot = make_subplots(rows=len(self.metadata_labels), cols=2)
    tables_per_meta_label = []

    for row_number, meta_label in enumerate(self.metadata_labels, start=1):
        table_columns = {}

        for col_number, metric in enumerate(metrics, start=1):
            plotting_data = self._metrics(metric=metric, label=self.label, meta_label=meta_label,
                                          predictions=predictions, true_labels=true_labels)

            table_columns[f"{metric}"] = plotting_data[f"{metric}"]

            plot.add_trace(go.Bar(x=plotting_data[meta_label], y=plotting_data[metric]),
                           row=row_number, col=col_number)
            plot.update_xaxes(title_text=f"{meta_label}", row=row_number, col=col_number, type='category')
            plot.update_yaxes(title_text=f"{metric}", row=row_number, col=col_number,
                              rangemode="nonnegative", tick0=0, dtick=1)

        # the meta-label column is taken from the data of the last metric
        table_columns[f"{meta_label}"] = plotting_data[f"{meta_label}"]
        tables_per_meta_label.append(pd.DataFrame(table_columns))

    plot.update_traces(marker_color=px.colors.sequential.Teal[3], showlegend=False)

    filename = self.result_path / "plots.html"
    plot.write_html(str(filename))
    output_figures = [ReportOutput(filename)]

    result_table_path = self._write_results_table(tables_per_meta_label, self.metadata_labels)

    return ReportResult(name=self.name, output_figures=output_figures,
                        output_tables=[ReportOutput(result_table_path[0])])
def test_run(self):
    """Run the DatasetGenerationTool through ``run_immuneML`` and check its outputs.

    Expects the metadata file, the ``.iml_dataset`` file and exactly 200
    files in the ``repertoires`` folder under ``results/result``.
    """
    path = EnvironmentSettings.tmp_test_path / "galaxy_api_dataset_generation/"
    # the original called PathBuilder.build(path) twice in a row; once is enough
    PathBuilder.build(path)

    yaml_path = path / "specs.yaml"
    result_path = path / "results/"

    self.prepare_specs(yaml_path)

    run_immuneML(Namespace(**{"specification_path": yaml_path, "result_path": result_path, 'tool': "DatasetGenerationTool"}))

    self.assertTrue(os.path.isfile(result_path / "result/dataset_metadata.csv"))
    self.assertTrue(os.path.isfile(result_path / "result/dataset.iml_dataset"))

    repertoire_dir = result_path / "result/repertoires/"
    repertoire_files = [name for name in os.listdir(repertoire_dir)
                        if os.path.isfile(os.path.join(repertoire_dir, name))]
    self.assertEqual(200, len(repertoire_files))

    shutil.rmtree(path)
def test_generate(self):
    """FeatureComparison produces the expected figure and table for encoded dummy data."""
    path = EnvironmentSettings.root_path / "test/tmp/featuredistribution/"
    PathBuilder.build(path)

    dataset = self._create_dummy_encoded_data(path)

    report = FeatureComparison.build_object(dataset=dataset, result_path=path, comparison_label="patient")
    self.assertTrue(report.check_prerequisites())

    result = report.generate_report()
    self.assertIsInstance(result, ReportResult)

    figure_path = path / "feature_comparison.html"
    table_path = path / "feature_values.csv"
    self.assertEqual(result.output_figures[0].path, figure_path)
    self.assertEqual(result.output_tables[0].path, table_path)

    content = pd.read_csv(path / "feature_values.csv")
    self.assertListEqual(list(content.columns), ["patient", "example_id", "sequence", "feature", "value"])

    # report should succeed to build but check_prerequisites should be false when data is not encoded
    # NOTE(review): this check builds a FeatureDistribution although the rest
    # of the test targets FeatureComparison - confirm this is intended.
    report = FeatureDistribution.build_object(dataset=RepertoireDataset(), result_path=path)
    self.assertFalse(report.check_prerequisites())

    shutil.rmtree(path)
def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
    """Merge repertoires that share a ``subject_id`` and return the resulting dataset.

    The first repertoire seen for a subject is held back; when another
    repertoire with the same ``subject_id`` appears, the two sequence arrays
    are concatenated and stored through ``store_repertoire``. Repertoires
    still held back at the end are stored unchanged.

    Args:
        dataset: repertoire dataset to process (a clone is modified and returned).
        params: must contain ``"result_path"``, the output directory.

    Returns:
        The cloned dataset with the new repertoires and a rebuilt metadata file.
    """
    SubjectRepertoireCollector.check_dataset_type(dataset, [RepertoireDataset], "SubjectRepertoireCollector")

    pending_by_subject = {}
    collected_repertoires = []
    indices_to_keep = []

    processed_dataset = dataset.clone()
    PathBuilder.build(params["result_path"])

    for index, repertoire in enumerate(processed_dataset.get_data()):
        subject_id = repertoire.metadata["subject_id"]

        if subject_id not in pending_by_subject:
            # first repertoire of this subject: hold it back and keep its metadata row
            pending_by_subject[subject_id] = repertoire
            indices_to_keep.append(index)
            continue

        earlier_repertoire = pending_by_subject.pop(subject_id)
        sequences = np.append(repertoire.sequences, earlier_repertoire.sequences)
        collected_repertoires.append(
            SubjectRepertoireCollector.store_repertoire(params["result_path"], repertoire, sequences))

    for unmatched_repertoire in pending_by_subject.values():
        collected_repertoires.append(
            SubjectRepertoireCollector.store_repertoire(params["result_path"], unmatched_repertoire,
                                                        unmatched_repertoire.sequences))

    processed_dataset.repertoires = collected_repertoires
    processed_dataset.metadata_file = SubjectRepertoireCollector.build_new_metadata(dataset, indices_to_keep,
                                                                                    params["result_path"])

    return processed_dataset
def test_run(self):
    """MLApplicationInstruction writes ``predictions.csv`` with one row per repertoire (50)."""
    path = EnvironmentSettings.tmp_test_path / "mlapplicationtest/"
    PathBuilder.build(path)

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(50, {5: 1}, {5: 1}, {"l1": {1: 0.5, 2: 0.5}}, path / 'dataset/')

    ml_method = LogisticRegression()
    encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY, ReadsType.UNIQUE,
                                        SequenceEncodingType.CONTINUOUS_KMER, 3,
                                        scale_to_zero_mean=True, scale_to_unit_variance=True)
    label_config = LabelConfiguration([Label("l1", [1, 2])])

    encoder_params = EncoderParams(result_path=path, label_config=label_config,
                                   filename="tmp_enc_dataset.pickle", pool_size=4)
    enc_dataset = encoder.encode(dataset, encoder_params)
    ml_method.fit(enc_dataset.encoded_data, 'l1')

    encoder_spec = {"normalization_type": "relative_frequency", "reads": "unique",
                    "sequence_encoding": "continuous_kmer", "k": 3,
                    "scale_to_zero_mean": True, "scale_to_unit_variance": True}
    hp_setting = HPSetting(encoder, encoder_spec, ml_method, {}, [], 'enc1', 'ml1')

    # copy the files written during encoding into the instruction's result folder
    PathBuilder.build(path / 'result/instr1/')
    for encoder_file in ('dict_vectorizer.pickle', 'scaler.pickle'):
        shutil.copy(path / encoder_file, path / f'result/instr1/{encoder_file}')

    ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
    ml_app.run(path / 'result/')

    predictions_path = path / "result/instr1/predictions.csv"
    self.assertTrue(os.path.isfile(predictions_path))

    predictions = pd.read_csv(predictions_path)
    self.assertEqual(50, predictions.shape[0])

    shutil.rmtree(path)
def store(self, path: Path, feature_names=None, details_path=None):
    """Pickle the model content and write a YAML description of it.

    The dict returned by ``_convert_object_to_dict`` is pickled to
    ``path / <class name>.pickle`` (name from ``FilenameHandler``). A YAML
    description with the content, feature names and classes is written to
    ``details_path``, or to the matching ``.yaml`` file in ``path`` when
    ``details_path`` is None.

    Args:
        path: directory for the pickle (and default YAML) file; created if needed.
        feature_names: stored under ``"feature_names"`` in the YAML description.
        details_path: optional explicit path for the YAML description.
    """
    content = self._convert_object_to_dict()
    PathBuilder.build(path)

    class_name = self.__class__.__name__

    file_path = path / FilenameHandler.get_filename(class_name, "pickle")
    with file_path.open("wb") as file:
        pickle.dump(content, file)

    if details_path is None:
        params_path = path / FilenameHandler.get_filename(class_name, "yaml")
    else:
        params_path = details_path

    # The original read self.label.name before its own `self.label is not None`
    # check, so a missing label raised AttributeError. Without a label the
    # details are now stored under a None key (YAML `null`).
    label_name = self.label.name if self.label is not None else None
    desc = {
        label_name: {
            **content,
            "feature_names": feature_names,
            "classes": list(self.class_mapping.values())
        }
    }
    if self.label is not None:
        desc["label"] = vars(self.label)

    with params_path.open("w") as file:
        yaml.dump(desc, file)
def test_repertoire_dataset(self):
    """CytoscapeNetworkExporter on a repertoire dataset writes the expected SIF and metadata files."""
    path = EnvironmentSettings.root_path / "test/tmp/cytoscape_export/"
    PathBuilder.build(path)

    repertoire_dataset = self._create_dummy_data(path / "data", dataset_type="repertoire")

    cne = CytoscapeNetworkExporter(repertoire_dataset, path, chains=("alpha", "beta"), drop_duplicates=True,
                                   additional_node_attributes=["custom_1"],
                                   additional_edge_attributes=["custom_2"])

    result = cne._generate()

    self.assertIsInstance(result, ReportResult)
    for table_index in range(4):
        self.assertTrue(os.path.isfile(result.output_tables[table_index].path))

    export_dir = path / "repertoire_dataset"

    with open(export_dir / "all_chains.sif") as file:
        self.assertListEqual(file.readlines(), [
            '*tra*s=DUPDUP*v=V1-1*j=J1-1\tpair\t*trb*s=AILUDGYF*v=V1-1*j=J1-1\n',
            '*tra*s=DUPDUP*v=V1-1*j=J1-1\tpair\t*trb*s=DFJKHJ*v=V1-1*j=J1-1\n',
            '*tra*s=DIUYUAG*v=V1-1*j=J1-1\tpair\t*trb*s=CTGTCGH*v=V1-1*j=J1-1\n'
        ])

    with open(export_dir / "shared_chains.sif") as file:
        self.assertListEqual(file.readlines(), [
            '*tra*s=DUPDUP*v=V1-1*j=J1-1\tpair\t*trb*s=AILUDGYF*v=V1-1*j=J1-1\n',
            '*tra*s=DUPDUP*v=V1-1*j=J1-1\tpair\t*trb*s=DFJKHJ*v=V1-1*j=J1-1\n'
        ])

    with open(export_dir / "node_metadata.tsv") as file:
        header_line = file.readline()
        node_lines = file.readlines()

    self.assertEqual(header_line,
                     'shared_name\tchain\tsequence\tv_subgroup\tv_gene\tj_subgroup\tj_gene\tcustom_1\tn_duplicates\n')
    # node rows are compared irrespective of their order in the file
    self.assertListEqual(sorted(node_lines), sorted([
        '*tra*s=DUPDUP*v=V1-1*j=J1-1\talpha\tDUPDUP\tTRAV1\tTRAV1-1\tTRAJ1\tTRAJ1-1\tCUST-0\t2\n',
        '*trb*s=AILUDGYF*v=V1-1*j=J1-1\tbeta\tAILUDGYF\tTRBV1\tTRBV1-1\tTRBJ1\tTRBJ1-1\tCUST-1\t1\n',
        '*trb*s=DFJKHJ*v=V1-1*j=J1-1\tbeta\tDFJKHJ\tTRBV1\tTRBV1-1\tTRBJ1\tTRBJ1-1\tCUST-2\t1\n',
        '*tra*s=DIUYUAG*v=V1-1*j=J1-1\talpha\tDIUYUAG\tTRAV1\tTRAV1-1\tTRAJ1\tTRAJ1-1\tCUST-3\t1\n',
        '*trb*s=CTGTCGH*v=V1-1*j=J1-1\tbeta\tCTGTCGH\tTRBV1\tTRBV1-1\tTRBJ1\tTRBJ1-1\tCUST-4\t1\n'
    ]))

    with open(export_dir / "edge_metadata.tsv") as file:
        self.assertListEqual(file.readlines(), [
            'shared_name\tcustom_2\n',
            '*tra*s=DUPDUP*v=V1-1*j=J1-1 (pair) *trb*s=AILUDGYF*v=V1-1*j=J1-1\tCUST-A\n',
            '*tra*s=DUPDUP*v=V1-1*j=J1-1 (pair) *trb*s=DFJKHJ*v=V1-1*j=J1-1\tCUST-A\n',
            '*tra*s=DIUYUAG*v=V1-1*j=J1-1 (pair) *trb*s=CTGTCGH*v=V1-1*j=J1-1\tCUST-B\n'
        ])

    shutil.rmtree(path)
def test_generate(self):
    """The motif seed recovery report writes the expected CSV and HTML outputs."""
    path = EnvironmentSettings.root_path / "test/tmp/motifseedrecovery/"
    PathBuilder.build(path)

    report = self._create_report(path)

    # Running the report
    result = report.generate_report()

    table_path = path / "motif_seed_recovery.csv"
    figure_path = path / "motif_seed_recovery.html"

    self.assertIsInstance(result, ReportResult)
    self.assertEqual(result.output_tables[0].path, table_path)
    self.assertEqual(result.output_figures[0].path, figure_path)

    # Actual tests
    self.assertTrue(os.path.isfile(table_path))
    self.assertTrue(os.path.isfile(figure_path))

    written_data = pd.read_csv(table_path)

    self.assertListEqual(list(written_data.columns), ["features", "max_seed_overlap", "coefficients"])
    self.assertListEqual(list(written_data["coefficients"]), list(range(5)))
    self.assertListEqual(list(written_data["features"]), ["AAA", "AAC", "CKJ", "KSA", "AKJ"])
    self.assertListEqual(list(written_data["max_seed_overlap"]), [3, 2, 0, 1, 1])

    shutil.rmtree(path)
def test_create_model(self):
    """KmerPairModelCreator returns a Word2Vec model with 400 vocabulary entries for k=2."""
    test_path = EnvironmentSettings.root_path / "test/tmp/w2v_test_tmp/"
    PathBuilder.build(test_path)

    sequence1 = ReceptorSequence("CASSVFA")
    sequence2 = ReceptorSequence("CASSCCC")

    rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2], test_path,
                                                  {"T1D": "T1D", "subject_id": "1"})
    rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                  {"T1D": "CTL", "subject_id": "2"})

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    model = KmerPairModelCreator().create_model(dataset=dataset, k=2, vector_size=16, batch_size=1,
                                                model_path=test_path / "model.model")

    self.assertIsInstance(model, Word2Vec)
    self.assertIn("CA", model.wv.vocab)
    self.assertEqual(400, len(model.wv.vocab))

    shutil.rmtree(test_path)
def test_generate(self):
    """The coefficients report writes one CSV table and four HTML figures with the expected content."""
    path = EnvironmentSettings.root_path / "test/tmp/logregcoefsreport/"
    PathBuilder.build(path)

    report = self._create_report(path)

    # Running the report
    result = report.generate_report()

    table_name = "coefficients.csv"
    figure_names = ["all_coefficients.html", "nonzero_coefficients.html",
                    "cutoff_10_coefficients.html", "largest_5_coefficients.html"]

    self.assertIsInstance(result, ReportResult)
    self.assertEqual(result.output_tables[0].path, path / table_name)
    for figure_index, figure_name in enumerate(figure_names):
        self.assertEqual(result.output_figures[figure_index].path, path / figure_name)

    # Actual tests
    for file_name in [table_name] + figure_names:
        self.assertTrue(os.path.isfile(path / file_name))

    written_data = pd.read_csv(path / table_name)

    descending = list(range(19, -1, -1))
    self.assertListEqual(list(written_data.columns), ["features", "coefficients"])
    self.assertListEqual(list(written_data["coefficients"]), descending)
    self.assertListEqual(list(written_data["features"]), [f"feature{i}" for i in descending])

    shutil.rmtree(path)