def annotate_features(dataset: RepertoireDataset, criteria: dict, name: str = "annotation"):
    """
    Takes an encoded dataset and adds a new column to the feature_annotations with boolean
    values showing whether a feature matched the specified criteria or not.
    """
    dataset = copy.deepcopy(dataset)
    feature_annotations = dataset.encoded_data.feature_annotations
    matcher = CriteriaMatcher()
    results = matcher.match(criteria=criteria, data=feature_annotations)
    feature_annotations[name] = results

    encoded = EncodedData(examples=dataset.encoded_data.examples,
                          labels=dataset.encoded_data.labels,
                          example_ids=dataset.encoded_data.example_ids,
                          feature_names=dataset.encoded_data.feature_names,
                          feature_annotations=feature_annotations)

    result = RepertoireDataset(params=dataset.params,
                               encoded_data=encoded,
                               repertoires=dataset.get_data(),
                               identifier=dataset.identifier,
                               metadata_file=dataset.metadata_file)

    return result
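# A hypothetical usage sketch for annotate_features (not part of the original source):
# `encoded_dataset` is assumed to be an already-encoded RepertoireDataset, and the
# criteria dict is assumed to follow CriteriaMatcher conventions; the "p_value" column
# name and the threshold are purely illustrative.
annotated_dataset = annotate_features(dataset=encoded_dataset,
                                      criteria={"type": "less_than",
                                                "value": {"type": "column", "name": "p_value"},
                                                "threshold": 0.05},
                                      name="significant")
print(annotated_dataset.encoded_data.feature_annotations["significant"].head())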
def test_run(self):
    method = LogisticRegression()
    dataset = RepertoireDataset()
    dataset.encoded_data = EncodedData(examples=np.array([[1, 2, 3], [2, 3, 4], [1, 2, 3],
                                                          [2, 3, 4], [1, 2, 3], [2, 3, 4]]),
                                       labels={"l1": [1, 0, 1, 0, 1, 0],
                                               "l2": [0, 1, 0, 1, 0, 1]},
                                       feature_names=["f1", "f2", "f3"])

    path = EnvironmentSettings.root_path + "test/tmp/mlmethodtrainer/"

    method = MLMethodTrainer.run(MLMethodTrainerParams(
        result_path=path,
        dataset=dataset,
        label="l1",
        method=method,
        model_selection_n_folds=2,
        model_selection_cv=True,
        cores_for_training=1,
        train_predictions_path=f"{path}predictions.csv",
        ml_details_path=f"{path}details.yaml",
        optimization_metric="balanced_accuracy"))

    method.predict(EncodedData(np.array([1, 2, 3]).reshape(1, -1)), "l1")
    self.assertTrue(os.path.isfile(f"{path}predictions.csv"))
    self.assertTrue(os.path.isfile(f"{path}details.yaml"))

    shutil.rmtree(path)
def process_dataset(self, dataset: RepertoireDataset):
    extract_fn = self.build_matching_fn()
    repertoire_count = dataset.get_example_count()

    for index, repertoire in enumerate(dataset.get_data()):
        self.process_repertoire(repertoire, str(repertoire.identifier), extract_fn)
        logging.info("Repertoire {} ({}/{}) processed.".format(repertoire.identifier,
                                                               index + 1, repertoire_count))
        logging.info(f"Currently, there are {self.item_count} items in the comparison data matrix.")

    self.merge_tmp_batches_to_matrix()
def test_get_metadata_fields(self):
    path = EnvironmentSettings.tmp_test_path + "repertoire_dataset/"
    PathBuilder.build(path)

    repertoires, metadata = RepertoireBuilder.build([["AA"], ["BB"]], path,
                                                    {"l1": [1, 2], "hla": ["A", "B"]},
                                                    subject_ids=["d1", "d2"])
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

    self.assertTrue("l1" in dataset.get_metadata_fields())
    self.assertTrue("hla" in dataset.get_metadata_fields())
    self.assertTrue("subject_id" in dataset.get_metadata_fields())

    shutil.rmtree(path)
def create_datasets(self, path: str):
    repertoires, metadata = RepertoireBuilder.build([["A", "B"], ["B", "C"], ["D"], ["E", "F"]],
                                                    path,
                                                    {"l1": [1, 0, 1, 0], "l2": [2, 3, 2, 3]})

    main_dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
    sub_dataset = main_dataset.make_subset([0, 1], path=path, dataset_type="subset")

    return main_dataset, sub_dataset
def build_labels(self, dataset: RepertoireDataset, params: EncoderParams) -> dict:
    lbl = ["repertoire_identifier"]
    lbl.extend(params.label_config.get_labels_by_name())

    tmp_labels = dataset.get_metadata(lbl, return_df=True)
    # reorder the metadata rows so that they match the order of repertoire ids in the dataset
    tmp_labels = tmp_labels.iloc[pd.Index(tmp_labels['repertoire_identifier'])
                                 .get_indexer(dataset.get_repertoire_ids())]
    tmp_labels = tmp_labels.to_dict("list")
    del tmp_labels["repertoire_identifier"]

    return tmp_labels
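# A minimal usage sketch for build_labels (assumed, not from the original source):
# `encoder` is any encoder exposing build_labels, and `params.label_config` is assumed
# to name the labels "l1" and "l2".
labels = encoder.build_labels(dataset, params)
# the values are ordered to match dataset.get_repertoire_ids(), e.g.:
# {"l1": [1, 0, 1, 0], "l2": [2, 3, 2, 3]}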
def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
    processed_dataset = dataset.clone()
    repertoires = []
    indices = []

    for index, repertoire in enumerate(dataset.get_data()):
        # keep the repertoire if it satisfies the lower and/or upper clone count limit,
        # whichever of the two limits are present in params
        if "lower_limit" in params.keys() and len(repertoire.sequences) >= params["lower_limit"] or \
                "upper_limit" in params.keys() and len(repertoire.sequences) <= params["upper_limit"]:
            repertoires.append(dataset.repertoires[index])
            indices.append(index)

    processed_dataset.repertoires = repertoires
    processed_dataset.metadata_file = ClonesPerRepertoireFilter.build_new_metadata(dataset, indices,
                                                                                   params["result_path"])

    Filter.check_dataset_not_empty(processed_dataset, "ClonesPerRepertoireFilter")

    return processed_dataset
def _encode_data(self, dataset: RepertoireDataset, params: EncoderParams):
    labels = params.label_config.get_labels_by_name()

    assert len(labels) == 1, "SequenceAbundanceEncoder: this encoding works only for single label."

    examples = self._calculate_sequence_abundance(dataset, self.comparison_data, labels[0], params)

    encoded_data = EncodedData(examples,
                               dataset.get_metadata([labels[0]]) if params.encode_labels else None,
                               dataset.get_repertoire_ids(),
                               [SequenceAbundanceEncoder.RELEVANT_SEQUENCE_ABUNDANCE,
                                SequenceAbundanceEncoder.TOTAL_SEQUENCE_ABUNDANCE],
                               encoding=SequenceAbundanceEncoder.__name__,
                               info={'relevant_sequence_path': self.relevant_sequence_csv_path})

    encoded_dataset = RepertoireDataset(params=dataset.params, encoded_data=encoded_data,
                                        repertoires=dataset.repertoires)

    return encoded_dataset
def _encode_repertoires(self, dataset: RepertoireDataset, params):
    # rows = repertoires, columns = reference sequences
    encoded_repertoires = np.zeros((dataset.get_example_count(), len(self.reference_sequences)),
                                   dtype=int)
    labels = {label: [] for label in params.label_config.get_labels_by_name()} if params.encode_labels else None

    for i, repertoire in enumerate(dataset.get_data()):
        encoded_repertoires[i] = self._match_repertoire_to_reference(repertoire)

        # only collect label values when label encoding was requested
        if labels is not None:
            for label in params.label_config.get_labels_by_name():
                labels[label].append(repertoire.metadata[label])

    return encoded_repertoires, labels
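# Illustration of the return value of _encode_repertoires (assumed, not from the
# original source): for 2 repertoires matched against 3 reference sequences with the
# label "l1" configured, the result could look like
# (array([[1, 0, 2],
#         [0, 0, 1]]), {"l1": [True, False]})
# where entry [i, j] counts matches of repertoire i against reference sequence j.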
def test_repertoire_export(self):
    path = EnvironmentSettings.tmp_test_path + "airr_exporter_repertoire/"
    PathBuilder.build(path)

    repertoire, metadata_path = self.create_dummy_repertoire(path)
    dataset = RepertoireDataset(repertoires=[repertoire], metadata_file=metadata_path)

    path_exported = f"{path}exported/"
    AIRRExporter.export(dataset, path_exported)

    resulting_data = pd.read_csv(path_exported + f"repertoires/{repertoire.identifier}.tsv", sep="\t")

    self.assertListEqual(list(resulting_data["sequence_id"]), ["receptor_1", "receptor_2"])
    self.assertListEqual(list(resulting_data["cdr3"]), ["GCTGCTGCT", "GGTGGTGGT"])
    self.assertListEqual(list(resulting_data["cdr3_aa"]), ["AAA", "GGG"])
    self.assertListEqual(list(resulting_data["v_call"]), ["TRBV1", "TRAV2*01"])
    self.assertListEqual(list(resulting_data["j_call"]), ["TRBJ1", "TRAJ2"])
    self.assertListEqual(list(resulting_data["d_call"]), ["TRBD1", "TRAD2"])
    self.assertListEqual(list(resulting_data["locus"]), ["TRB", "TRA"])
    self.assertListEqual(list(resulting_data["duplicate_count"]), [5, 15])
    self.assertListEqual(list(resulting_data["custom_test"]), ["cust1", "cust2"])
    self.assertListEqual(list(resulting_data["productive"]), ['T', nan])
    self.assertListEqual(list(resulting_data["stop_codon"]), ['F', nan])

    shutil.rmtree(path)
def test_encode(self):
    path = EnvironmentSettings.tmp_test_path + "abundance_encoder/"
    PathBuilder.build(path)

    repertoires, metadata = RepertoireBuilder.build([["GGG", "III", "LLL", "MMM"],
                                                     ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
                                                     ["CCC", "FFF", "MMM"],
                                                     ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]],
                                                    labels={"l1": [True, True, False, False]}, path=path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

    encoder = SequenceAbundanceEncoder.build_object(dataset, **{"comparison_attributes": ["sequence_aas"],
                                                                "p_value_threshold": 0.4,
                                                                "sequence_batch_size": 4,
                                                                "repertoire_batch_size": 8})

    label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

    encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))

    self.assertTrue(np.array_equal(np.array([[1, 4], [1, 6], [0, 3], [0, 6]]),
                                   encoded_dataset.encoded_data.examples))

    encoder.p_value_threshold = 0.05

    encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))

    self.assertTrue(np.array_equal(np.array([[0, 4], [0, 6], [0, 3], [0, 6]]),
                                   encoded_dataset.encoded_data.examples))

    shutil.rmtree(path)
def _create_dummy_data(self, path, dataset_type):
    PathBuilder.build(path)
    dataset = None

    test_repertoire = Repertoire.build(
        sequence_aas=["DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"],
        v_genes=["V1-1" for _ in range(5)],
        j_genes=["J1-1" for _ in range(5)],
        chains=[Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA],
        custom_lists={"custom_1": [f"CUST-{i}" for i in range(5)],
                      "custom_2": ["CUST-A" for _ in range(3)] + ["CUST-B" for _ in range(2)]},
        cell_ids=[1, 1, 1, 2, 2],
        path=path)

    if dataset_type == "receptor":
        receptordataset_filename = f"{path}/receptors.pkl"
        with open(receptordataset_filename, "wb") as file:
            pickle.dump(test_repertoire.receptors, file)

        dataset = ReceptorDataset(filenames=[receptordataset_filename], identifier="receptor_dataset")
    elif dataset_type == "repertoire":
        test_repertoire.identifier = "repertoire_dataset"
        dataset = RepertoireDataset(repertoires=[test_repertoire])

    return dataset
def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
    rep_map = {}
    repertoires = []
    indices_to_keep = []
    processed_dataset = dataset.clone()
    PathBuilder.build(params["result_path"])

    for index, repertoire in enumerate(processed_dataset.get_data()):
        if repertoire.metadata["subject_id"] in rep_map.keys():
            # a repertoire for this subject was already seen: merge the sequences of
            # both repertoires and store them as a single repertoire
            sequences = np.append(repertoire.sequences,
                                  rep_map[repertoire.metadata["subject_id"]].sequences)
            del rep_map[repertoire.metadata["subject_id"]]
            repertoires.append(SubjectRepertoireCollector.store_repertoire(params["result_path"],
                                                                           repertoire, sequences))
        else:
            rep_map[repertoire.metadata["subject_id"]] = repertoire
            indices_to_keep.append(index)

    # store the repertoires of subjects that appeared only once
    for key in rep_map.keys():
        repertoires.append(SubjectRepertoireCollector.store_repertoire(params["result_path"],
                                                                       rep_map[key], rep_map[key].sequences))

    processed_dataset.repertoires = repertoires
    processed_dataset.metadata_file = SubjectRepertoireCollector.build_new_metadata(dataset, indices_to_keep,
                                                                                    params["result_path"])

    return processed_dataset
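# A hypothetical usage sketch for SubjectRepertoireCollector.process (not from the
# original source): repertoires sharing a "subject_id" metadata value are merged into
# one repertoire; `dataset` and `result_path` are assumed to exist.
collected = SubjectRepertoireCollector.process(dataset, {"result_path": result_path})
# e.g. with two repertoires for subject "1" and one for subject "2",
# the collected dataset holds 2 repertoires
print(collected.get_example_count())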
def test_create_model(self):
    test_path = EnvironmentSettings.root_path + "test/tmp/w2v_test_tmp/"
    PathBuilder.build(test_path)

    sequence1 = ReceptorSequence("CASSVFA")
    sequence2 = ReceptorSequence("CASSCCC")

    metadata1 = {"T1D": "T1D", "subject_id": "1"}
    rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2], test_path, metadata1)

    metadata2 = {"T1D": "CTL", "subject_id": "2"}
    rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path, metadata2)

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    model_creator = KmerPairModelCreator()
    model = model_creator.create_model(dataset=dataset, k=2, vector_size=16, batch_size=1,
                                       model_path=test_path + "model.model")

    self.assertTrue(isinstance(model, Word2Vec))
    self.assertTrue("CA" in model.wv.vocab)
    self.assertEqual(400, len(model.wv.vocab))

    shutil.rmtree(test_path)
def create_dummy_dataset(self, path):
    repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path,
                                                    labels={"label1": ["val1", "val2"],
                                                            "label2": ["val1", "val2"]})

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
    dataset.name = "my_dataset"

    PickleExporter.export(dataset, path)

    return f"{dataset.name}.iml_dataset"
def import_repertoire_dataset(import_class, params: DatasetImportParams, dataset_name: str) -> RepertoireDataset:
    """
    Creates a dataset from the metadata file and a list of repertoire files, and exports
    the dataset as a pickle file.

    Arguments:
        import_class: class to use for import
        params: instance of DatasetImportParams class which includes information on path, columns, result path etc.
        dataset_name: user-defined name of the dataset

    Returns:
        RepertoireDataset object that was created
    """
    metadata = pd.read_csv(params.metadata_file, sep=",")

    ParameterValidator.assert_keys_present(metadata.columns.tolist(), ["filename"], ImportHelper.__name__,
                                           f'{dataset_name}: params: metadata_file')

    PathBuilder.build(params.result_path + "repertoires/")

    arguments = [(import_class, row, params) for index, row in metadata.iterrows()]
    with Pool(params.number_of_processes) as pool:
        repertoires = pool.starmap(ImportHelper.load_repertoire_as_object, arguments)

    new_metadata_file = ImportHelper.make_new_metadata_file(repertoires, metadata, params.result_path, dataset_name)

    potential_labels = list(set(metadata.columns.tolist()) - {"filename"})
    dataset = RepertoireDataset(params={key: list(set(metadata[key].values.tolist())) for key in potential_labels},
                                repertoires=repertoires, metadata_file=new_metadata_file, name=dataset_name)

    PickleExporter.export(dataset, params.result_path)

    return dataset
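# A hypothetical usage sketch for import_repertoire_dataset (not from the original
# source): the import class and the DatasetImportParams field names shown here are
# illustrative assumptions. The metadata CSV must contain at least a "filename"
# column; every other column is treated as a potential label.
params = DatasetImportParams(metadata_file="metadata.csv", result_path="imported/",
                             number_of_processes=4)  # illustrative construction
dataset = import_repertoire_dataset(AIRRImport, params, dataset_name="my_dataset")
print(dataset.get_example_count())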
def test_process(self):
    path = EnvironmentSettings.root_path + "test/tmp/clones_per_repertoire_filter/"
    PathBuilder.build(path)

    dataset = RepertoireDataset(repertoires=RepertoireBuilder.build([["ACF", "ACF", "ACF"],
                                                                     ["ACF", "ACF"],
                                                                     ["ACF", "ACF", "ACF", "ACF"]],
                                                                    path)[0])

    dataset1 = ClonesPerRepertoireFilter.process(dataset, {"lower_limit": 3, "result_path": path})
    self.assertEqual(2, dataset1.get_example_count())

    dataset2 = ClonesPerRepertoireFilter.process(dataset, {"upper_limit": 2, "result_path": path})
    self.assertEqual(1, dataset2.get_example_count())

    self.assertRaises(AssertionError, ClonesPerRepertoireFilter.process, dataset,
                      {"lower_limit": 10, "result_path": path})

    shutil.rmtree(path)
def _create_dummy_encoded_data(self, path):
    n_subjects = 8
    n_features = 300
    n_timepoints = 2
    n_examples = n_subjects * n_timepoints

    diseased_subjects = range(0, 4)

    subjects = [subject for subject in range(n_subjects) for timepoint in range(n_timepoints)]
    timepoints = [timepoint for subject in range(n_subjects) for timepoint in range(n_timepoints)]
    disease_statuses = [subject in diseased_subjects for subject in subjects]

    kmers = [''.join(random.choices(string.ascii_uppercase, k=3)) for i in range(n_features)]

    encoded_data = {
        'examples': sparse.csr_matrix(np.random.normal(50, 10, n_examples * n_features)
                                      .reshape((n_examples, n_features))),
        'example_ids': [i for i in range(n_examples)],
        'labels': {
            "subject_id": np.array([f"subject {i}" for i in subjects]),
            "disease_status": np.array([f"disease: {i}" for i in disease_statuses]),
            "timepoint": np.array([f"timepoint {i}" for i in timepoints])
        },
        'feature_names': kmers,
        'feature_annotations': pd.DataFrame({"sequence": kmers}),
        'encoding': "random"
    }

    dataset = RepertoireDataset(encoded_data=EncodedData(**encoded_data))

    return dataset
def annotate_repertoires(dataset: RepertoireDataset, criteria: dict, name: str = "annotation"):
    """
    Takes an encoded dataset and adds a new label to the encoded data with boolean values
    showing whether a repertoire matched the specified criteria or not.
    """
    dataset = copy.deepcopy(dataset)
    data = pd.DataFrame(dataset.encoded_data.labels)
    matcher = CriteriaMatcher()
    results = matcher.match(criteria=criteria, data=data)

    labels = dataset.encoded_data.labels
    labels[name] = np.array(results)

    encoded = EncodedData(examples=dataset.encoded_data.examples,
                          labels=labels,
                          example_ids=dataset.encoded_data.example_ids,
                          feature_names=dataset.encoded_data.feature_names,
                          feature_annotations=dataset.encoded_data.feature_annotations)

    result = RepertoireDataset(params=dataset.params,
                               encoded_data=encoded,
                               repertoires=dataset.repertoires,
                               identifier=dataset.identifier,
                               metadata_file=dataset.metadata_file)

    return result
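# A hypothetical usage sketch for annotate_repertoires (not from the original source):
# marks repertoires whose "l1" label value satisfies the criteria; the criteria dict
# is assumed to follow CriteriaMatcher conventions and is illustrative only.
annotated_dataset = annotate_repertoires(dataset=encoded_dataset,
                                         criteria={"type": "greater_than",
                                                   "value": {"type": "column", "name": "l1"},
                                                   "threshold": 0},
                                         name="is_l1_positive")
print(annotated_dataset.encoded_data.labels["is_l1_positive"])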
def filter_features(dataset: RepertoireDataset, criteria: dict):
    """
    Takes an encoded dataset and filters features based on a given set of criteria.
    Only features meeting these criteria will be retained in the new dataset object.
    """
    dataset = copy.deepcopy(dataset)
    feature_annotations = dataset.encoded_data.feature_annotations
    matcher = CriteriaMatcher()
    results = matcher.match(criteria=criteria, data=feature_annotations)

    indices = np.where(np.array(results))[0]
    feature_annotations = feature_annotations.iloc[indices, :]
    examples = dataset.encoded_data.examples[:, indices]
    feature_names = [dataset.encoded_data.feature_names[i] for i in indices]

    encoded = EncodedData(examples=examples,
                          labels=dataset.encoded_data.labels,
                          example_ids=dataset.encoded_data.example_ids,
                          feature_names=feature_names,
                          feature_annotations=feature_annotations)

    result = RepertoireDataset(params=dataset.params,
                               encoded_data=encoded,
                               repertoires=dataset.repertoires,
                               identifier=dataset.identifier,
                               metadata_file=dataset.metadata_file)

    return result
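# A hypothetical usage sketch for filter_features (not from the original source):
# keeps only features whose assumed "count" annotation column exceeds an illustrative
# threshold; examples, feature_names and feature_annotations shrink consistently.
filtered_dataset = filter_features(dataset=encoded_dataset,
                                   criteria={"type": "greater_than",
                                             "value": {"type": "column", "name": "count"},
                                             "threshold": 10})
assert filtered_dataset.encoded_data.examples.shape[1] == len(filtered_dataset.encoded_data.feature_names)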
def test_process(self):
    path = EnvironmentSettings.root_path + "test/tmp/chain_filter/"
    PathBuilder.build(path)

    rep1 = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("AAA", metadata=SequenceMetadata(chain="A"), identifier="1")],
        path=path, metadata={})
    rep2 = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("AAC", metadata=SequenceMetadata(chain="B"), identifier="2")],
        path=path, metadata={})

    metadata = pd.DataFrame({"CD": [1, 0]})
    metadata.to_csv(path + "metadata.csv")

    dataset = RepertoireDataset(repertoires=[rep1, rep2], metadata_file=path + "metadata.csv")

    dataset2 = ChainRepertoireFilter.process(dataset, {"keep_chain": "ALPHA",
                                                       "result_path": path + "results/"})

    self.assertEqual(1, len(dataset2.get_data()))
    self.assertEqual(2, len(dataset.get_data()))

    metadata_dict = dataset2.get_metadata(["CD"])
    self.assertEqual(1, len(metadata_dict["CD"]))
    self.assertEqual(1, metadata_dict["CD"][0])

    for rep in dataset2.get_data():
        self.assertEqual("AAA", rep.sequences[0].get_sequence())

    self.assertRaises(AssertionError, ChainRepertoireFilter.process, dataset,
                      {"keep_chain": "GAMMA", "result_path": path + "results/"})

    shutil.rmtree(path)
def _encode_sequence_count(self, dataset: RepertoireDataset, comparison_data: ComparisonData, label: str,
                           params: EncoderParams) -> EncodedData:
    sequence_p_values_indices, indices_path, relevant_sequences_path = \
        SequenceFilterHelper.get_relevant_sequences(dataset, params, comparison_data, label,
                                                    self.p_value_threshold, self.comparison_attributes,
                                                    self.relevant_indices_path)

    if self.relevant_indices_path is None:
        self.relevant_indices_path = indices_path
    if self.relevant_sequence_csv_path is None:
        self.relevant_sequence_csv_path = relevant_sequences_path

    count_matrix = self._build_count_matrix(comparison_data, dataset.get_repertoire_ids(),
                                            sequence_p_values_indices)
    feature_names = comparison_data.get_item_names()[sequence_p_values_indices]

    encoded_data = EncodedData(count_matrix,
                               dataset.get_metadata([label]) if params.encode_labels else None,
                               dataset.get_repertoire_ids(), feature_names,
                               encoding=SequenceCountEncoder.__name__,
                               info={'relevant_sequence_path': self.relevant_sequence_csv_path})

    return encoded_data
def _encode_examples(self, dataset: RepertoireDataset, params: EncoderParams) -> Tuple[list, set, dict]:
    keys = set()
    example_count = dataset.get_example_count()

    arguments = [(repertoire, index, example_count) for index, repertoire in enumerate(dataset.repertoires)]

    with Pool(params.pool_size) as pool:
        chunksize = math.floor(dataset.get_example_count() / params.pool_size) + 1
        examples = pool.starmap(self._process_repertoire_cached, arguments, chunksize=chunksize)

    for example in examples:
        keys.update(list(example.keys()))

    labels = dataset.get_metadata(params.label_config.get_labels_by_name()) if params.encode_labels else None

    return examples, keys, labels
def _encode_new_dataset(self, dataset, params: EncoderParams):
    encoded_dataset = RepertoireDataset(repertoires=dataset.repertoires, params=dataset.params,
                                        metadata_file=dataset.metadata_file)

    encoded_repertoires, labels = self._encode_repertoires(dataset, params)
    feature_annotations = self._get_feature_info()

    encoded_dataset.add_encoded_data(EncodedData(
        examples=encoded_repertoires,
        labels=labels,
        feature_names=list(feature_annotations["sequence_id"]),
        feature_annotations=feature_annotations,
        example_ids=[repertoire.identifier for repertoire in dataset.get_data()],
        encoding=MatchedSequencesEncoder.__name__))

    return encoded_dataset
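# A brief sketch of what the encoded dataset holds after _encode_new_dataset (assumed,
# not from the original source); this internal method is normally reached through the
# encoder's public encode() call. `encoder`, `dataset` and `params` are assumed to exist.
encoded = encoder.encode(dataset, params)
print(encoded.encoded_data.examples.shape)     # (n_repertoires, n_reference_sequences)
print(encoded.encoded_data.feature_names[:3])  # sequence ids taken from _get_feature_info()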
def build_comparison_data(dataset: RepertoireDataset, params: EncoderParams, comparison_attributes,
                          sequence_batch_size):
    comp_data = ComparisonData(dataset.get_repertoire_ids(), comparison_attributes,
                               sequence_batch_size, params.result_path)
    comp_data.process_dataset(dataset)

    return comp_data
def _implant_signals_in_repertoires(simulation_state: SimulationState = None) -> Dataset:
    PathBuilder.build(simulation_state.result_path + "repertoires/")

    processed_repertoires = SignalImplanter._implant_signals(simulation_state, SignalImplanter._process_repertoire)

    processed_dataset = RepertoireDataset(
        repertoires=processed_repertoires,
        params={**(simulation_state.dataset.params if simulation_state.dataset.params is not None else {}),
                **{signal.id: [True, False] for signal in simulation_state.signals}},
        name=simulation_state.dataset.name,
        metadata_file=SignalImplanter._create_metadata_file(processed_repertoires, simulation_state))

    return processed_dataset
def create_comparison_data(self, dataset: RepertoireDataset) -> ComparisonData:
    comparison_data = ComparisonData(dataset.get_repertoire_ids(), self.matching_columns,
                                     self.sequence_batch_size, self.path)
    comparison_data.process_dataset(dataset)

    return comparison_data
def test_match(self):
    path = EnvironmentSettings.root_path + "test/tmp/seqmatch/"
    PathBuilder.build(path)

    repertoire = Repertoire.build_from_sequence_objects(
        sequence_objects=[
            ReceptorSequence(amino_acid_sequence="AAAAAA",
                             metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J2"), identifier="3"),
            ReceptorSequence(amino_acid_sequence="CCCCCC",
                             metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J2"), identifier="4"),
            ReceptorSequence(amino_acid_sequence="AAAACC",
                             metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J2"), identifier="5"),
            ReceptorSequence(amino_acid_sequence="TADQVF",
                             metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J3"), identifier="6")
        ],
        metadata={"CD": True}, path=path)

    dataset = RepertoireDataset(repertoires=[repertoire])

    sequences = [
        ReceptorSequence("AAAACA", metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J2"),
                         identifier="1"),
        ReceptorSequence("TADQV", metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J3"),
                         identifier="2")
    ]

    matcher = SequenceMatcher()
    result = matcher.match(dataset, sequences, 2, SequenceMatchingSummaryType.PERCENTAGE)

    self.assertTrue("repertoires" in result)
    self.assertEqual(1, len(result["repertoires"][0]["sequences"][3]["matching_sequences"]))
    self.assertTrue(result["repertoires"][0]["metadata"]["CD"])
    self.assertEqual(1, len(result["repertoires"]))

    shutil.rmtree(path)
def match(self, dataset: RepertoireDataset, reference_sequences: list, max_distance: int,
          summary_type: SequenceMatchingSummaryType) -> dict:
    matched = {"repertoires": []}

    for index, repertoire in enumerate(dataset.get_data()):
        matched["repertoires"].append(self.match_repertoire(repertoire, index, reference_sequences,
                                                            max_distance, summary_type))

    return matched
def create_dataset(self, path: str) -> RepertoireDataset:
    repertoires, metadata = RepertoireBuilder.build([["A", "B"], ["B", "C"], ["D"], ["E", "F"],
                                                     ["A", "B"], ["B", "C"], ["D"], ["E", "F"]],
                                                    path,
                                                    {"l1": [1, 0, 1, 0, 1, 0, 1, 0],
                                                     "l2": [2, 3, 2, 3, 2, 3, 3, 3]})

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

    return dataset