def test_get_normalized_sequence_lengths(self):
    path = EnvironmentSettings.root_path / "test/tmp/datareports/"
    PathBuilder.build(path)

    rep1 = Repertoire.build_from_sequence_objects(
        sequence_objects=[ReceptorSequence(amino_acid_sequence="AAA", identifier="1"),
                          ReceptorSequence(amino_acid_sequence="AAAA", identifier="2"),
                          ReceptorSequence(amino_acid_sequence="AAAAA", identifier="3"),
                          ReceptorSequence(amino_acid_sequence="AAA", identifier="4")],
        path=path, metadata={})

    rep2 = Repertoire.build_from_sequence_objects(
        sequence_objects=[ReceptorSequence(amino_acid_sequence="AAA", identifier="5"),
                          ReceptorSequence(amino_acid_sequence="AAAA", identifier="6"),
                          ReceptorSequence(amino_acid_sequence="AAAA", identifier="7"),
                          ReceptorSequence(amino_acid_sequence="AAA", identifier="8")],
        path=path, metadata={})

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    sld = SequenceLengthDistribution(dataset, 1, path)

    result = sld.generate_report()
    self.assertTrue(os.path.isfile(result.output_figures[0].path))

    shutil.rmtree(path)
def _construct_test_repertoiredataset(self, path, positional):
    receptors1 = ReceptorSequenceList()
    receptors2 = ReceptorSequenceList()

    if positional:
        # note: identifiers within a repertoire are made unique here ("1", "2")
        for seq in [ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1"),
                    ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="2")]:
            receptors1.append(seq)
        receptors2.append(ReceptorSequence("TTTTTTTTTTTTT", identifier="1"))
    else:
        for seq in [ReceptorSequence("AAAA", identifier="1"),
                    ReceptorSequence("ATA", identifier="2"),
                    ReceptorSequence("ATA", identifier="3")]:
            receptors1.append(seq)
        for seq in [ReceptorSequence("ATA", identifier="1"),
                    ReceptorSequence("TAA", identifier="2")]:
            receptors2.append(seq)

    rep1 = Repertoire.build_from_sequence_objects(receptors1, metadata={"l1": 1, "l2": 2, "subject_id": "1"}, path=path)
    rep2 = Repertoire.build_from_sequence_objects(receptors2, metadata={"l1": 0, "l2": 3, "subject_id": "2"}, path=path)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])
    lc.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])
    return dataset, lc
def test_process(self):
    path = EnvironmentSettings.root_path / "test/tmp/subject_rep_collector"
    PathBuilder.build(path)

    reps = [Repertoire.build_from_sequence_objects([ReceptorSequence("AAA", identifier="1")],
                                                   path=path, metadata={"subject_id": "patient1"}),
            Repertoire.build_from_sequence_objects([ReceptorSequence("AAC", identifier="2")],
                                                   path=path, metadata={"subject_id": "patient1"}),
            Repertoire.build_from_sequence_objects([ReceptorSequence("AAC", identifier="3")],
                                                   path=path, metadata={"subject_id": "patient3"})]

    dataset = RepertoireDataset(repertoires=reps)

    dataset2 = SubjectRepertoireCollector.process(dataset, {"result_path": path / "result"})

    self.assertEqual(2, len(dataset2.get_data()))
    self.assertEqual(3, len(dataset.get_data()))

    values = [2, 1]
    for index, rep in enumerate(dataset2.get_data()):
        self.assertEqual(values[index], len(rep.sequences))

    shutil.rmtree(path)
def process_repertoire(repertoire: Repertoire, params: dict) -> Repertoire:
    counts = repertoire.get_counts()
    counts = counts if counts is not None else np.full(repertoire.get_element_count(), None)

    not_none_indices = counts != None  # intentional element-wise comparison over a potentially object-typed array
    counts[not_none_indices] = counts[not_none_indices].astype(int)  # np.int was removed in numpy >= 1.24
    indices_to_keep = np.full(repertoire.get_element_count(), False)

    if params["remove_without_count"] and params["low_count_limit"] is not None:
        np.greater_equal(counts, params["low_count_limit"], out=indices_to_keep, where=not_none_indices)
    elif params["remove_without_count"]:
        indices_to_keep = not_none_indices
    elif params["low_count_limit"] is not None:
        # sequences without a count are kept; the rest must reach the low count limit
        indices_to_keep[np.logical_not(not_none_indices)] = True
        np.greater_equal(counts, params["low_count_limit"], out=indices_to_keep, where=not_none_indices)

    processed_repertoire = Repertoire.build_like(repertoire, indices_to_keep, params["result_path"],
                                                 filename_base=f"{repertoire.data_filename.stem}_filtered")
    return processed_repertoire
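# A standalone sketch (illustration only, not part of the filter) of the masked
# comparison pattern used above: with `where=`, np.greater_equal only writes results
# at positions where the mask is True, so entries with a missing (None) count keep
# whatever value `out` already holds.
import numpy as np

counts = np.array([10, None, 3, None, 7], dtype=object)
not_none = counts != None  # element-wise mask: [True, False, True, False, True]
counts[not_none] = counts[not_none].astype(int)

keep = np.full(counts.shape[0], False)
np.greater_equal(counts, 5, out=keep, where=not_none)
print(keep)  # expected: [ True False False False  True]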
def process_repertoire(repertoire: Repertoire, params: dict) -> Repertoire:
    data = pd.DataFrame(repertoire.load_data())

    groupby_fields = DuplicateSequenceFilter._prepare_group_by_field(params, data.columns)
    custom_lists = list(set(data.columns) - set(Repertoire.FIELDS))
    agg_dict = DuplicateSequenceFilter._prepare_agg_dict(params, data.columns, custom_lists)

    # Chain objects cannot be aggregated, convert to strings;
    # if the column is missing, create it so downstream code can rely on its presence
    if "chains" in data.columns:
        data["chains"] = [chain.value if isinstance(chain, Chain) else chain for chain in data["chains"]]
    else:
        data["chains"] = None

    no_duplicates = data.groupby(groupby_fields).agg(agg_dict).reset_index()

    processed_repertoire = Repertoire.build(
        sequence_aas=list(no_duplicates["sequence_aas"]) if "sequence_aas" in no_duplicates.columns else None,
        sequences=list(no_duplicates["sequences"]) if "sequences" in no_duplicates.columns else None,
        v_genes=list(no_duplicates["v_genes"]) if "v_genes" in no_duplicates.columns else None,
        j_genes=list(no_duplicates["j_genes"]) if "j_genes" in no_duplicates.columns else None,
        chains=[Chain(key) for key in list(no_duplicates["chains"])] if "chains" in no_duplicates.columns else None,
        counts=list(no_duplicates["counts"]) if "counts" in no_duplicates.columns else None,
        region_types=list(no_duplicates["region_types"]) if "region_types" in no_duplicates.columns else None,
        custom_lists={key: list(no_duplicates[key]) for key in custom_lists},
        sequence_identifiers=list(no_duplicates["sequence_identifiers"]),
        metadata=copy.deepcopy(repertoire.metadata),
        path=params["result_path"],
        filename_base=f"{repertoire.data_filename.stem}_filtered")

    return processed_repertoire
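# A pandas-only sketch (illustration, not the filter itself) of the groupby/agg
# collapse above: rows sharing a sequence are merged, counts are aggregated with
# the chosen function, and "first" keeps one representative value per column.
import pandas as pd

data = pd.DataFrame({"sequence_aas": ["AAA", "AAA", "CCC"],
                     "sequence_identifiers": [1, 2, 3],
                     "counts": [10, 20, 30]})
agg_dict = {"sequence_identifiers": "first", "counts": "sum"}

no_duplicates = data.groupby(["sequence_aas"]).agg(agg_dict).reset_index()
print(no_duplicates.to_dict("list"))
# {'sequence_aas': ['AAA', 'CCC'], 'sequence_identifiers': [1, 3], 'counts': [30, 30]}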
def test_create_model(self):
    test_path = EnvironmentSettings.root_path / "test/tmp/w2v_test_tmp/"
    PathBuilder.build(test_path)

    sequence1 = ReceptorSequence("CASSVFA")
    sequence2 = ReceptorSequence("CASSCCC")

    metadata1 = {"T1D": "T1D", "subject_id": "1"}
    rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2], test_path, metadata1)

    metadata2 = {"T1D": "CTL", "subject_id": "2"}
    rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path, metadata2)

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    model_creator = KmerPairModelCreator()
    model = model_creator.create_model(dataset=dataset, k=2, vector_size=16, batch_size=1,
                                       model_path=test_path / "model.model")

    self.assertTrue(isinstance(model, Word2Vec))
    self.assertTrue("CA" in model.wv.vocab)
    self.assertEqual(400, len(model.wv.vocab))

    shutil.rmtree(test_path)
def _create_dummy_data(self, path, dataset_type):
    PathBuilder.build(path)
    dataset = None

    test_repertoire = Repertoire.build(
        sequence_aas=["DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"],
        v_genes=["V1-1" for i in range(5)],
        j_genes=["J1-1" for i in range(5)],
        chains=[Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA],
        custom_lists={"custom_1": [f"CUST-{i}" for i in range(5)],
                      "custom_2": ["CUST-A" for i in range(3)] + ["CUST-B" for i in range(2)]},
        cell_ids=["1", "1", "1", "2", "2"],
        path=path)

    if dataset_type == "receptor":
        dataset = ReceptorDataset.build_from_objects(test_repertoire.receptors, 100, path, name="receptor_dataset")
        dataset.identifier = "receptor_dataset"
    elif dataset_type == "repertoire":
        test_repertoire.identifier = "repertoire_dataset"
        dataset = RepertoireDataset(repertoires=[test_repertoire])

    return dataset
def build(cls, **kwargs):
    ParameterValidator.assert_keys_present(list(kwargs.keys()),
                                           ['metadata_file', 'name', 'repertoire_ids', 'metadata_fields'],
                                           RepertoireDataset.__name__, "repertoire dataset")

    repertoires = []
    metadata_df = pd.read_csv(kwargs['metadata_file'], comment=Constants.COMMENT_SIGN)

    for index, row in metadata_df.iterrows():
        filename = Path(kwargs['metadata_file']).parent / row['filename']
        if not filename.is_file() and 'repertoires' in str(filename):
            filename = filename.parent.parent / Path(row['filename']).name
        repertoire = Repertoire(data_filename=filename,
                                metadata_filename=filename.parent / f'{filename.stem}_metadata.yaml',
                                identifier=row['identifier'])
        repertoires.append(repertoire)

    if "repertoire_ids" in kwargs.keys() and "repertoires" not in kwargs.keys() and kwargs['repertoire_ids'] is not None:
        assert all(rep.identifier == kwargs['repertoire_ids'][i] for i, rep in enumerate(repertoires)), \
            f"{RepertoireDataset.__name__}: repertoire ids from the iml_dataset file and metadata file don't match for the dataset " \
            f"{kwargs['name']} with identifier {kwargs['identifier']}."

    return RepertoireDataset(**{**kwargs, **{"repertoires": repertoires}})
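# A hypothetical metadata.csv that build() above can read; only 'filename' and
# 'identifier' are accessed in this method, the remaining schema is an assumption
# for illustration. Paths resolve relative to the metadata file, and each
# repertoire's metadata YAML is expected next to its data file as <stem>_metadata.yaml:
#
#   filename,identifier,subject_id
#   repertoires/rep_0.npy,id_0,subject_1
#   repertoires/rep_1.npy,id_1,subject_2
import pandas as pd

pd.DataFrame({"filename": ["repertoires/rep_0.npy", "repertoires/rep_1.npy"],
              "identifier": ["id_0", "id_1"],
              "subject_id": ["subject_1", "subject_2"]}).to_csv("metadata.csv", index=False)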
def create_dummy_repertoire(self, path): sequence_objects = [ReceptorSequence(amino_acid_sequence="AAA", nucleotide_sequence="GCTGCTGCT", identifier="receptor_1", metadata=SequenceMetadata(v_gene="TRBV1", j_gene="TRBJ1", chain=Chain.BETA, count=5, region_type="IMGT_CDR3", frame_type="IN", custom_params={"d_call": "TRBD1", "custom_test": "cust1"})), ReceptorSequence(amino_acid_sequence="GGG", nucleotide_sequence="GGTGGTGGT", identifier="receptor_2", metadata=SequenceMetadata(v_gene="TRAV2", v_allele="TRAV2*01", j_gene="TRAJ2", chain=Chain.ALPHA, count=15, frame_type=None, region_type="IMGT_CDR3", custom_params={"d_call": "TRAD2", "custom_test": "cust2"}))] repertoire = Repertoire.build_from_sequence_objects(sequence_objects=sequence_objects, path=path, metadata={"subject_id": "REP1"}) df = pd.DataFrame({"filename": [f"{repertoire.identifier}_data.npy"], "subject_id": ["1"], "repertoire_identifier": [repertoire.identifier]}) df.to_csv(path / "metadata.csv", index=False) return repertoire, path / "metadata.csv"
def _create_dummy_data(self, path, dataset_type):
    PathBuilder.build(path)
    dataset = None

    test_repertoire = Repertoire.build(
        sequence_aas=["DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"],
        v_genes=["V1-1" for i in range(5)],
        j_genes=["J1-1" for i in range(5)],
        chains=[Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA],
        custom_lists={"custom_1": [f"CUST-{i}" for i in range(5)],
                      "custom_2": ["CUST-A" for i in range(3)] + ["CUST-B" for i in range(2)]},
        cell_ids=[1, 1, 1, 2, 2],
        path=path)

    if dataset_type == "receptor":
        receptordataset_filename = path / "receptors.pkl"
        with open(receptordataset_filename, "wb") as file:
            pickle.dump(test_repertoire.receptors, file)
        dataset = ReceptorDataset(filenames=[receptordataset_filename], identifier="receptor_dataset")
    elif dataset_type == "repertoire":
        test_repertoire.identifier = "repertoire_dataset"
        dataset = RepertoireDataset(repertoires=[test_repertoire])

    return dataset
def test_process(self):
    path = EnvironmentSettings.root_path / "test/tmp/chain_filter/"
    PathBuilder.build(path)

    rep1 = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("AAA", metadata=SequenceMetadata(chain="A"), identifier="1")],
        path=path, metadata={})
    rep2 = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("AAC", metadata=SequenceMetadata(chain="B"), identifier="2")],
        path=path, metadata={})

    metadata = pd.DataFrame({"CD": [1, 0]})
    metadata.to_csv(path / "metadata.csv")

    dataset = RepertoireDataset(repertoires=[rep1, rep2], metadata_file=path / "metadata.csv")

    dataset2 = ChainRepertoireFilter.process(dataset, {"keep_chain": "ALPHA", "result_path": path / "results"})

    self.assertEqual(1, len(dataset2.get_data()))
    self.assertEqual(2, len(dataset.get_data()))

    metadata_dict = dataset2.get_metadata(["CD"])
    self.assertEqual(1, len(metadata_dict["CD"]))
    self.assertEqual(1, metadata_dict["CD"][0])

    for rep in dataset2.get_data():
        self.assertEqual("AAA", rep.sequences[0].get_sequence())

    self.assertRaises(AssertionError, ChainRepertoireFilter.process, dataset,
                      {"keep_chain": "GAMMA", "result_path": path / "results"})

    shutil.rmtree(path)
def test_run(self):
    path = EnvironmentSettings.root_path / "test/tmp/dataencoder/"
    PathBuilder.build(path)

    rep1 = Repertoire.build_from_sequence_objects([ReceptorSequence("AAA", identifier="1")],
                                                  metadata={"l1": 1, "l2": 2}, path=path)
    rep2 = Repertoire.build_from_sequence_objects([ReceptorSequence("ATA", identifier="2")],
                                                  metadata={"l1": 0, "l2": 3}, path=path)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])
    lc.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    encoder = Word2VecEncoder.build_object(dataset, **{"k": 3,
                                                       "model_type": ModelType.SEQUENCE.name,
                                                       "vector_size": 6})

    res = DataEncoder.run(DataEncoderParams(dataset=dataset, encoder=encoder,
                                            encoder_params=EncoderParams(model={}, pool_size=2, label_config=lc,
                                                                         result_path=path, filename="dataset.csv"),
                                            store_encoded_data=False))

    self.assertTrue(isinstance(res, RepertoireDataset))
    self.assertTrue(res.encoded_data.examples.shape[0] == 2)

    shutil.rmtree(path)
def test_encode(self):
    test_path = EnvironmentSettings.root_path / "test/tmp/w2v/"
    PathBuilder.build(test_path)

    sequence1 = ReceptorSequence("CASSVFA", identifier="1")
    sequence2 = ReceptorSequence("CASSCCC", identifier="2")

    metadata1 = {"T1D": "T1D", "subject_id": "1"}
    rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2], test_path, metadata1)

    metadata2 = {"T1D": "CTL", "subject_id": "2"}
    rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path, metadata2)

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    label_configuration = LabelConfiguration()
    label_configuration.add_label("T1D", ["T1D", "CTL"])

    config_params = EncoderParams(model={}, learn_model=True, result_path=test_path,
                                  label_config=label_configuration, filename="dataset.pkl")

    encoder = Word2VecEncoder.build_object(dataset, **{"k": 3, "model_type": "sequence", "vector_size": 16})
    encoded_dataset = encoder.encode(dataset=dataset, params=config_params)

    self.assertIsNotNone(encoded_dataset.encoded_data)
    self.assertTrue(encoded_dataset.encoded_data.examples.shape[0] == 2)
    self.assertTrue(encoded_dataset.encoded_data.examples.shape[1] == 16)
    self.assertTrue(len(encoded_dataset.encoded_data.labels["T1D"]) == 2)
    self.assertTrue(encoded_dataset.encoded_data.labels["T1D"][0] == "T1D")
    self.assertTrue(isinstance(encoder, W2VRepertoireEncoder))

    shutil.rmtree(test_path)
def test_match_repertoire(self):
    path = EnvironmentSettings.root_path / "test/tmp/seqmatchrep/"
    PathBuilder.build(path)

    repertoire = Repertoire.build_from_sequence_objects(
        sequence_objects=[ReceptorSequence(amino_acid_sequence="AAAAAA", identifier="1",
                                           metadata=SequenceMetadata(chain="A", count=3)),
                          ReceptorSequence(amino_acid_sequence="CCCCCC", identifier="2",
                                           metadata=SequenceMetadata(chain="A", count=2)),
                          ReceptorSequence(amino_acid_sequence="AAAACC", identifier="3",
                                           metadata=SequenceMetadata(chain="A", count=1)),
                          ReceptorSequence(amino_acid_sequence="TADQVF", identifier="4",
                                           metadata=SequenceMetadata(chain="A", count=4))],
        metadata={"CD": True}, path=path)

    sequences = [ReceptorSequence("AAAACA", metadata=SequenceMetadata(chain="A")),
                 ReceptorSequence("TADQV", metadata=SequenceMetadata(chain="A"))]

    matcher = SequenceMatcher()
    result = matcher.match_repertoire(repertoire, 0, sequences, 2, SequenceMatchingSummaryType.COUNT)

    self.assertTrue("sequences" in result)
    self.assertTrue("repertoire" in result)
    self.assertTrue("repertoire_index" in result)

    self.assertEqual(4, len(result["sequences"]))
    self.assertEqual(1, len(result["sequences"][0]["matching_sequences"]))
    self.assertEqual(0, len(result["sequences"][1]["matching_sequences"]))
    self.assertEqual(1, len(result["sequences"][2]["matching_sequences"]))
    self.assertEqual(1, len(result["sequences"][3]["matching_sequences"]))
    self.assertEqual(3, len([r for r in result["sequences"] if len(r["matching_sequences"]) > 0]))
    self.assertTrue(result["metadata"]["CD"])

    result = matcher.match_repertoire(repertoire, 0, sequences, 2, SequenceMatchingSummaryType.CLONAL_PERCENTAGE)
    self.assertEqual(0.8, result["clonal_percentage"])

    shutil.rmtree(path)
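# Illustrative sketch of matching within max_distance=2 as the test above exercises.
# SequenceMatcher's actual distance metric is not shown here; a plain Levenshtein
# (edit) distance is assumed for illustration, which is consistent with "TADQVF"
# matching the shorter query "TADQV".
def levenshtein(a: str, b: str) -> int:
    previous = list(range(len(b) + 1))
    for i, char_a in enumerate(a, 1):
        current = [i]
        for j, char_b in enumerate(b, 1):
            current.append(min(previous[j] + 1,                             # deletion
                               current[j - 1] + 1,                          # insertion
                               previous[j - 1] + (char_a != char_b)))       # substitution
        previous = current
    return previous[-1]

assert levenshtein("AAAAAA", "AAAACA") <= 2  # matches, as asserted above
assert levenshtein("CCCCCC", "AAAACA") > 2   # no match, as asserted above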
def test_match(self):
    path = EnvironmentSettings.root_path / "test/tmp/seqmatch/"
    PathBuilder.build(path)

    repertoire = Repertoire.build_from_sequence_objects(
        sequence_objects=[ReceptorSequence(amino_acid_sequence="AAAAAA",
                                           metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J2"),
                                           identifier="3"),
                          ReceptorSequence(amino_acid_sequence="CCCCCC",
                                           metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J2"),
                                           identifier="4"),
                          ReceptorSequence(amino_acid_sequence="AAAACC",
                                           metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J2"),
                                           identifier="5"),
                          ReceptorSequence(amino_acid_sequence="TADQVF",
                                           metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J3"),
                                           identifier="6")],
        metadata={"CD": True}, path=path)

    dataset = RepertoireDataset(repertoires=[repertoire])

    sequences = [ReceptorSequence("AAAACA", metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J2"),
                                  identifier="1"),
                 ReceptorSequence("TADQV", metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J3"),
                                  identifier="2")]

    matcher = SequenceMatcher()
    result = matcher.match(dataset, sequences, 2, SequenceMatchingSummaryType.PERCENTAGE)

    self.assertTrue("repertoires" in result)
    self.assertEqual(1, len(result["repertoires"][0]["sequences"][3]["matching_sequences"]))
    self.assertTrue(result["repertoires"][0]["metadata"]["CD"])
    self.assertEqual(1, len(result["repertoires"]))

    shutil.rmtree(path)
def _build_new_repertoire(self, sequences, repertoire_metadata, signal, path: Path) -> Repertoire: if repertoire_metadata is not None: metadata = copy.deepcopy(repertoire_metadata) else: metadata = {} # when adding implant to a repertoire, only signal id is stored: # more detailed information is available in each receptor_sequence # (specific motif and motif instance) metadata[signal.id] = True repertoire = Repertoire.build_from_sequence_objects(sequences, path, metadata) return repertoire
def test_implant_in_repertoire(self): path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "full_seq_implanting/") signal = Signal("sig1", [Motif("motif1", GappedKmerInstantiation(max_gap=0), "AAAA")], FullSequenceImplanting()) repertoire = Repertoire.build(["CCCC", "CCCC", "CCCC"], path=path) new_repertoire = signal.implant_to_repertoire(repertoire, 0.33, path) self.assertEqual(len(repertoire.sequences), len(new_repertoire.sequences)) self.assertEqual(1, len([seq for seq in new_repertoire.sequences if seq.amino_acid_sequence == "AAAA"])) self.assertEqual(2, len([seq for seq in new_repertoire.sequences if seq.amino_acid_sequence == "CCCC"])) shutil.rmtree(path)
def test_create_sentences_from_repertoire(self): path = EnvironmentSettings.tmp_test_path / "kmer/" PathBuilder.build(path) rep = Repertoire.build_from_sequence_objects([ReceptorSequence(amino_acid_sequence="AACT"), ReceptorSequence(amino_acid_sequence="ACCT"), ReceptorSequence(amino_acid_sequence="AACT")], path, {}) sentences = KmerHelper.create_sentences_from_repertoire(rep, 3, sequence_type=SequenceType.AMINO_ACID) self.assertEqual(3, len(sentences)) self.assertTrue(len(sentences[0]) == 2 and "AAC" in sentences[0] and "ACT" in sentences[0]) shutil.rmtree(path)
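# A minimal sketch of how overlapping k-mers (the "words" of a sentence) are derived
# from a sequence; for k=3 on "AACT" it yields the two k-mers the assertions above
# expect (KmerHelper's actual implementation may differ):
def kmers(sequence: str, k: int) -> list:
    return [sequence[i:i + k] for i in range(len(sequence) - k + 1)]

print(kmers("AACT", 3))  # ['AAC', 'ACT'] -> one sentence of length 2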
def build(sequences: list, path: Path, labels: dict = None, seq_metadata: list = None, subject_ids: list = None): if subject_ids is not None: assert len(subject_ids) == len(sequences) if seq_metadata is not None: assert len(sequences) == len(seq_metadata) for index, sequence_list in enumerate(sequences): assert len(sequence_list) == len(seq_metadata[index]) PathBuilder.build(path) rep_path = PathBuilder.build(path / "repertoires") repertoires = [] if subject_ids is None: subject_ids = [] for rep_index, sequence_list in enumerate(sequences): rep_sequences = ReceptorSequenceList() if len(subject_ids) < len(sequences): subject_ids.append("rep_" + str(rep_index)) for seq_index, sequence in enumerate(sequence_list): if seq_metadata is None: m = SequenceMetadata(v_subgroup="TRBV1", v_gene="TRBV1-1", v_allele="TRBV1-1*01", j_subgroup="TRBJ1", j_gene="TRBJ1-1", j_allele="TRBJ1-1*01", count=1, chain="TRB", region_type="IMGT_CDR3") else: m = SequenceMetadata(**seq_metadata[rep_index][seq_index]) s = ReceptorSequence(amino_acid_sequence=sequence, metadata=m, identifier=str(seq_index)) rep_sequences.append(s) if labels is not None: metadata = {key: labels[key][rep_index] for key in labels.keys()} else: metadata = {} metadata = {**metadata, **{"subject_id": subject_ids[rep_index]}} repertoire = Repertoire.build_from_sequence_objects(rep_sequences, rep_path, metadata, filename_base=f"rep_{rep_index}") repertoires.append(repertoire) df = pd.DataFrame({**{"filename": [repertoire.data_filename for repertoire in repertoires], "subject_id": subject_ids, "repertoire_identifier": [repertoire.identifier for repertoire in repertoires]}, **(labels if labels is not None else {})}) df.to_csv(path / "metadata.csv", index=False) return repertoires, path / "metadata.csv"
def _process_repertoire(index, repertoire, current_implanting, simulation_state) -> Repertoire: if current_implanting is not None: return SignalImplanter._implant_in_repertoire( index, repertoire, current_implanting, simulation_state) else: new_repertoire = Repertoire.build_from_sequence_objects( repertoire.sequences, simulation_state.result_path / "repertoires", repertoire.metadata) for signal in simulation_state.signals: new_repertoire.metadata[f"signal_{signal.id}"] = False return new_repertoire
def implant_in_repertoire(self, repertoire: Repertoire, repertoire_implanting_rate: float, signal, path: Path): assert all("/" not in motif.seed for motif in signal.motifs), \ f'FullSequenceImplanting: motifs cannot include gaps. Check motifs {[motif.identifier for motif in signal.motifs]}.' sequences = repertoire.sequences new_sequence_count = math.ceil( len(sequences) * repertoire_implanting_rate) assert new_sequence_count > 0, \ f"FullSequenceImplanting: there are too few sequences ({len(sequences)}) in the repertoire with identifier {repertoire.identifier} " \ f"to have the given repertoire implanting rate ({repertoire_implanting_rate}). Please consider increasing the repertoire implanting rate." new_sequences = self._create_new_sequences(sequences, new_sequence_count, signal) metadata = copy.deepcopy(repertoire.metadata) metadata[f"signal_{signal.id}"] = True return Repertoire.build_from_sequence_objects(new_sequences, path, metadata)
def load_repertoire_as_object(import_class, metadata_row, params: DatasetImportParams): try: alternative_load_func = getattr(import_class, "alternative_load_func", None) filename = params.path / f"{metadata_row['filename']}" dataframe = ImportHelper.load_sequence_dataframe(filename, params, alternative_load_func) dataframe = import_class.preprocess_dataframe(dataframe, params) sequence_lists = {field: dataframe[field].values.tolist() for field in Repertoire.FIELDS if field in dataframe.columns} sequence_lists["custom_lists"] = {field: dataframe[field].values.tolist() for field in list(set(dataframe.columns) - set(Repertoire.FIELDS))} repertoire_inputs = {**{"metadata": metadata_row.to_dict(), "path": params.result_path / "repertoires/", "filename_base": filename.stem}, **sequence_lists} repertoire = Repertoire.build(**repertoire_inputs) return repertoire except Exception as exception: raise RuntimeError(f"{ImportHelper.__name__}: error when importing file {metadata_row['filename']}.") from exception
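# A standalone sketch (Repertoire.FIELDS replaced by a stand-in set) of how the
# dataframe columns are split above into known repertoire fields and custom lists:
import pandas as pd

FIELDS = {"sequence_aas", "v_genes", "counts"}  # stand-in for Repertoire.FIELDS
df = pd.DataFrame({"sequence_aas": ["AAA"], "v_genes": ["V1"], "my_flag": ["yes"]})

known_fields = {field: df[field].values.tolist() for field in FIELDS if field in df.columns}
custom_lists = {field: df[field].values.tolist() for field in set(df.columns) - FIELDS}
print(known_fields)  # e.g. {'sequence_aas': ['AAA'], 'v_genes': ['V1']} (key order may vary)
print(custom_lists)  # {'my_flag': ['yes']}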
def _repertoire_to_dataframe(repertoire: Repertoire, region_type): # get all fields (including custom fields) df = pd.DataFrame(repertoire.load_data()) for column in ['v_alleles', 'j_alleles', 'v_genes', 'j_genes']: if column not in df.columns: df.loc[:, column] = '' AIRRExporter.update_gene_columns(df, 'alleles', 'genes') # rename mandatory fields for airr-compliance mapper = { "sequence_identifiers": "sequence_id", "v_alleles": "v_call", "j_alleles": "j_call", "chains": "locus", "counts": "duplicate_count", "sequences": AIRRExporter.get_sequence_field(region_type), "sequence_aas": AIRRExporter.get_sequence_aa_field(region_type) } df = df.rename(mapper=mapper, axis="columns") return df
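# A minimal sketch of the immuneML -> AIRR column renaming performed above:
import pandas as pd

df = pd.DataFrame({"sequence_identifiers": ["1"], "v_alleles": ["TRBV1*01"], "counts": [5]})
mapper = {"sequence_identifiers": "sequence_id", "v_alleles": "v_call", "counts": "duplicate_count"}
print(df.rename(mapper=mapper, axis="columns").columns.tolist())
# ['sequence_id', 'v_call', 'duplicate_count']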
def _process_repertoire(index, repertoire, current_implanting, simulation_state, output_path: Path = None) -> Repertoire: if current_implanting is not None: new_repertoire = SignalImplanter._implant_in_repertoire( index, repertoire, current_implanting, simulation_state) else: new_metadata = { **repertoire.metadata, **{ f"{signal.id}": False for signal in simulation_state.signals } } new_repertoire = Repertoire.build_from_sequence_objects( repertoire.sequences, simulation_state.result_path / "repertoires", metadata=new_metadata) return new_repertoire
def test_encode(self):
    path = EnvironmentSettings.root_path / "test/tmp/kmerfreqenc/"
    PathBuilder.build(path)

    rep1 = Repertoire.build_from_sequence_objects([ReceptorSequence("AAA", identifier="1"),
                                                   ReceptorSequence("ATA", identifier="2"),
                                                   ReceptorSequence("ATA", identifier="3")],
                                                  metadata={"l1": 1, "l2": 2, "subject_id": "1"}, path=path)
    rep2 = Repertoire.build_from_sequence_objects([ReceptorSequence("ATA", identifier="1"),
                                                   ReceptorSequence("TAA", identifier="2"),
                                                   ReceptorSequence("AAC", identifier="3")],
                                                  metadata={"l1": 0, "l2": 3, "subject_id": "2"}, path=path)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])
    lc.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    encoder = KmerFrequencyEncoder.build_object(dataset, **{"normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
                                                            "reads": ReadsType.UNIQUE.name,
                                                            "sequence_encoding": SequenceEncodingType.IDENTITY.name,
                                                            "k": 3})

    d1 = encoder.encode(dataset, EncoderParams(result_path=path / "1/", label_config=lc, learn_model=True,
                                               model={}, filename="dataset.pkl"))

    encoder = KmerFrequencyEncoder.build_object(dataset, **{"normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
                                                            "reads": ReadsType.UNIQUE.name,
                                                            "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
                                                            "k": 3})

    d2 = encoder.encode(dataset, EncoderParams(result_path=path / "2/", label_config=lc, pool_size=2,
                                               learn_model=True, model={}, filename="dataset.csv"))

    encoder3 = KmerFrequencyEncoder.build_object(dataset, **{"normalization_type": NormalizationType.BINARY.name,
                                                             "reads": ReadsType.UNIQUE.name,
                                                             "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
                                                             "k": 3})

    d3 = encoder3.encode(dataset, EncoderParams(result_path=path / "3/", label_config=lc, learn_model=True,
                                                model={}, filename="dataset.pkl"))

    shutil.rmtree(path)

    self.assertTrue(isinstance(d1, RepertoireDataset))
    self.assertTrue(isinstance(d2, RepertoireDataset))
    self.assertEqual(0.67, np.round(d2.encoded_data.examples[0, 2], 2))
    self.assertEqual(0.0, np.round(d3.encoded_data.examples[0, 1], 2))
    self.assertTrue(isinstance(encoder, KmerFrequencyEncoder))
def _store_repertoire(self, repertoire, sequences): new_repertoire = Repertoire.build_from_sequence_objects( sequence_objects=sequences, path=self.result_path, metadata=repertoire.metadata) return new_repertoire
def test_run(self):
    r = []
    path = EnvironmentSettings.tmp_test_path / "signalImplanter/"

    if not os.path.isdir(path):
        os.makedirs(path)

    sequences = [ReceptorSequence("ACDEFG", identifier="1"), ReceptorSequence("ACDEFG", identifier="2"),
                 ReceptorSequence("ACDEFG", identifier="3"), ReceptorSequence("ACDEFG", identifier="4")]

    for i in range(10):
        rep = Repertoire.build_from_sequence_objects(sequence_objects=sequences, path=path, metadata={})
        r.append(rep)

    dataset = RepertoireDataset(repertoires=r)

    m1 = Motif(identifier="m1", instantiation=GappedKmerInstantiation(), seed="CAS")
    m2 = Motif(identifier="m2", instantiation=GappedKmerInstantiation(), seed="CCC")
    s1 = Signal(identifier="s1", motifs=[m1],
                implanting_strategy=HealthySequenceImplanting(GappedMotifImplanting(),
                                                              implanting_computation=ImplantingComputation.ROUND))
    s2 = Signal(identifier="s2", motifs=[m1, m2],
                implanting_strategy=HealthySequenceImplanting(GappedMotifImplanting(),
                                                              implanting_computation=ImplantingComputation.ROUND))

    simulation = Simulation([Implanting(dataset_implanting_rate=0.2, repertoire_implanting_rate=0.5,
                                        signals=[s1, s2], name="i1"),
                             Implanting(dataset_implanting_rate=0.2, repertoire_implanting_rate=0.5,
                                        signals=[s2], name="i2")])

    input_params = SimulationState(dataset=dataset, result_path=path, simulation=simulation,
                                   signals=[s1, s2], formats=["ImmuneML"])

    new_dataset = SignalImplanter.run(input_params)

    reps_with_s2 = sum([rep.metadata[s2.id] is True for rep in new_dataset.get_data(batch_size=10)])
    reps_with_s1 = sum([rep.metadata[s1.id] is True for rep in new_dataset.get_data(batch_size=10)])

    self.assertEqual(10, len(new_dataset.get_example_ids()))
    self.assertTrue(all([s1.id in rep.metadata.keys() for rep in new_dataset.get_data(batch_size=10)]))
    self.assertTrue(all([s2.id in rep.metadata.keys() for rep in new_dataset.get_data(batch_size=10)]))
    self.assertTrue(reps_with_s2 == 4)
    self.assertTrue(reps_with_s1 == 2)
    self.assertEqual(10, len(new_dataset.get_example_ids()))

    metadata_filenames = [filename.name for filename in new_dataset.get_filenames()]
    self.assertTrue(all([repertoire.data_filename.name in metadata_filenames
                         for repertoire in new_dataset.repertoires]))

    shutil.rmtree(path)
def store_repertoire(path, repertoire, sequences): new_repertoire = Repertoire.build_from_sequence_objects(sequences, path, repertoire.metadata) return new_repertoire
def test_process(self):
    path = EnvironmentSettings.root_path / "test/tmp/duplicatesequencefilter/"
    PathBuilder.build(path)

    dataset = RepertoireDataset(repertoires=[Repertoire.build(
        sequence_aas=["AAA", "AAA", "CCC", "AAA", "CCC", "CCC", "CCC"],
        sequences=["ntAAA", "ntBBB", "ntCCC", "ntAAA", "ntCCC", "ntCCC", "ntDDD"],
        v_genes=["v1", "v1", "v1", "v1", "v1", "v1", "v1"],
        j_genes=["j1", "j1", "j1", "j1", "j1", "j1", "j1"],
        chains=[Chain.ALPHA, Chain.ALPHA, Chain.ALPHA, Chain.ALPHA, Chain.ALPHA, Chain.ALPHA, Chain.BETA],
        counts=[10, 20, 30, 5, 20, None, 40],
        region_types=["IMGT_CDR3", "IMGT_CDR3", "IMGT_CDR3", "IMGT_CDR3", "IMGT_CDR3", "IMGT_CDR3", "IMGT_CDR3"],
        custom_lists={"custom1": ["yes", "yes", "yes", "no", "no", "no", "no"],
                      "custom2": ["yes", "yes", "yes", "no", "no", "no", "no"]},
        sequence_identifiers=[1, 2, 3, 4, 5, 6, 7],
        path=path)])

    # collapse by amino acids & use sum counts
    dupfilter = DuplicateSequenceFilter(filter_sequence_type=SequenceType.AMINO_ACID,
                                        count_agg=CountAggregationFunction.SUM, batch_size=1)

    reduced_repertoire = dupfilter.process_dataset(dataset=dataset, result_path=path).repertoires[0]

    attr = reduced_repertoire.get_attributes(["sequence_identifiers", "sequence_aas", "sequences", "counts", "chains"])

    self.assertEqual(3, len(reduced_repertoire.get_sequence_identifiers()))
    self.assertListEqual(["AAA", "CCC", "CCC"], list(attr["sequence_aas"]))
    self.assertListEqual(["ntAAA", "ntCCC", "ntDDD"], list(attr["sequences"]))
    self.assertListEqual([35, 50, 40], list(attr["counts"]))
    self.assertListEqual([1, 3, 7], list(attr["sequence_identifiers"]))
    self.assertListEqual(["ALPHA", "ALPHA", "BETA"], list(attr["chains"]))

    # collapse by nucleotides & use min counts
    dupfilter = DuplicateSequenceFilter(filter_sequence_type=SequenceType.NUCLEOTIDE,
                                        count_agg=CountAggregationFunction.MIN, batch_size=4)

    reduced_repertoire = dupfilter.process_dataset(dataset=dataset, result_path=path).repertoires[0]

    attr = reduced_repertoire.get_attributes(["sequence_identifiers", "sequence_aas", "sequences", "counts"])

    self.assertEqual(4, len(reduced_repertoire.get_sequence_identifiers()))
    self.assertListEqual([1, 2, 3, 7], list(attr["sequence_identifiers"]))
    self.assertListEqual(["AAA", "AAA", "CCC", "CCC"], list(attr["sequence_aas"]))
    self.assertListEqual(["ntAAA", "ntBBB", "ntCCC", "ntDDD"], list(attr["sequences"]))
    self.assertListEqual([5, 20, 20, 40], list(attr["counts"]))

    shutil.rmtree(path)
def test_run(self):
    identifiers = ["0", "8", "1", "9", "2", "10", "3", "11", "4", "12", "5", "13", "6", "14", "7"]
    dataset = RepertoireDataset(repertoires=[Repertoire(Path("0.npy"), None, identifier)
                                             for identifier in identifiers])

    paths = [EnvironmentSettings.root_path / "test/tmp/datasplitter/split_{}".format(i) for i in range(5)]
    for path in paths:
        PathBuilder.build(path)

    df = pd.DataFrame(data={"key1": [0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1],
                            "filename": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]})
    df.to_csv(EnvironmentSettings.root_path / "test/tmp/datasplitter/metadata.csv")
    dataset.metadata_file = EnvironmentSettings.root_path / "test/tmp/datasplitter/metadata.csv"

    training_percentage = 0.7

    trains, tests = DataSplitter.run(DataSplitterParams(dataset=dataset,
                                                        training_percentage=training_percentage,
                                                        split_strategy=SplitType.RANDOM,
                                                        split_count=5,
                                                        paths=paths))

    self.assertTrue(isinstance(trains[0], RepertoireDataset))
    self.assertTrue(isinstance(tests[0], RepertoireDataset))
    self.assertEqual(10, len(trains[0].get_data()))
    self.assertEqual(5, len(tests[0].get_data()))
    self.assertEqual(5, len(trains))
    self.assertEqual(5, len(tests))
    self.assertEqual(10, len(trains[0].repertoires))

    trains2, tests2 = DataSplitter.run(DataSplitterParams(dataset=dataset,
                                                          training_percentage=training_percentage,
                                                          split_strategy=SplitType.RANDOM,
                                                          split_count=5,
                                                          paths=paths))

    self.assertEqual(trains[0].get_repertoire_ids(), trains2[0].get_repertoire_ids())

    paths = [EnvironmentSettings.root_path / "test/tmp/datasplitter/split_{}".format(i)
             for i in range(dataset.get_example_count())]
    for path in paths:
        PathBuilder.build(path)

    trains, tests = DataSplitter.run(DataSplitterParams(dataset=dataset,
                                                        split_strategy=SplitType.LOOCV,
                                                        split_count=-1,
                                                        training_percentage=-1,
                                                        paths=paths))

    self.assertTrue(isinstance(trains[0], RepertoireDataset))
    self.assertTrue(isinstance(tests[0], RepertoireDataset))
    self.assertEqual(14, len(trains[0].get_data()))
    self.assertEqual(1, len(tests[0].get_data()))
    self.assertEqual(15, len(trains))
    self.assertEqual(15, len(tests))

    paths = [EnvironmentSettings.root_path / "test/tmp/datasplitter/split_{}".format(i) for i in range(5)]
    for path in paths:
        PathBuilder.build(path)

    trains, tests = DataSplitter.run(DataSplitterParams(dataset=dataset,
                                                        split_strategy=SplitType.K_FOLD,
                                                        split_count=5,
                                                        training_percentage=-1,
                                                        paths=paths))

    self.assertTrue(isinstance(trains[0], RepertoireDataset))
    self.assertTrue(isinstance(tests[0], RepertoireDataset))
    self.assertEqual(len(trains[0].get_data()), 12)
    self.assertEqual(len(tests[0].get_data()), 3)
    self.assertEqual(5, len(trains))
    self.assertEqual(5, len(tests))

    trains, tests = DataSplitter.run(DataSplitterParams(dataset=dataset,
                                                        split_strategy=SplitType.STRATIFIED_K_FOLD,
                                                        split_count=3,
                                                        training_percentage=-1,
                                                        paths=paths,
                                                        label_config=LabelConfiguration([Label("key1", [0, 1, 2])])))

    self.assertEqual(len(trains[0].get_data()), 10)
    self.assertEqual(len(tests[0].get_data()), 5)
    self.assertEqual(3, len(trains))
    self.assertEqual(3, len(tests))

    for train in trains:
        self.assertTrue(all(cls in train.get_metadata(["key1"])["key1"] for cls in [0, 1, 2]))

    for test in tests:
        self.assertTrue(all(cls in test.get_metadata(["key1"])["key1"] for cls in [0, 1, 2]))

    shutil.rmtree(EnvironmentSettings.root_path / "test/tmp/datasplitter/")