def test_create_model(self):
    """Train a k-mer pair Word2Vec model on two small repertoires and check its vocabulary."""
    test_path = EnvironmentSettings.root_path + "test/tmp/w2v_test_tmp/"
    PathBuilder.build(test_path)

    first_sequence = ReceptorSequence("CASSVFA")
    second_sequence = ReceptorSequence("CASSCCC")

    # Both repertoires are written below the same temporary directory.
    repertoires = [
        Repertoire.build_from_sequence_objects(
            [first_sequence, second_sequence], test_path,
            {"T1D": "T1D", "subject_id": "1"}),
        Repertoire.build_from_sequence_objects(
            [first_sequence], test_path,
            {"T1D": "CTL", "subject_id": "2"}),
    ]
    dataset = RepertoireDataset(repertoires=repertoires)

    model = KmerPairModelCreator().create_model(
        dataset=dataset,
        k=2,
        vector_size=16,
        batch_size=1,
        model_path=test_path + "model.model")

    self.assertIsInstance(model, Word2Vec)
    self.assertIn("CA", model.wv.vocab)
    self.assertEqual(400, len(model.wv.vocab))

    shutil.rmtree(test_path)
def _construct_test_repertoiredataset(self, path, positional):
    """Build a two-repertoire dataset and a matching label configuration.

    Args:
        path: directory handed to ``Repertoire.build_from_sequence_objects``.
        positional: when truthy, the repertoires hold long homopolymer
            sequences ("A" * 17 twice, "T" * 13 once); otherwise they hold
            short 3-4 letter sequences.

    Returns:
        A ``(dataset, label_configuration)`` tuple.
    """
    receptors1 = ReceptorSequenceList()
    receptors2 = ReceptorSequenceList()

    # Plain loops instead of list comprehensions: the appends are side
    # effects and the resulting list of ``None`` values was never used.
    if positional:
        for sequence in (ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1"),
                         ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1")):
            receptors1.append(sequence)
        for sequence in (ReceptorSequence("TTTTTTTTTTTTT", identifier="1"),):
            receptors2.append(sequence)
    else:
        for sequence in (ReceptorSequence("AAAA", identifier="1"),
                         ReceptorSequence("ATA", identifier="2"),
                         ReceptorSequence("ATA", identifier='3')):
            receptors1.append(sequence)
        for sequence in (ReceptorSequence("ATA", identifier="1"),
                         ReceptorSequence("TAA", identifier="2")):
            receptors2.append(sequence)

    rep1 = Repertoire.build_from_sequence_objects(
        receptors1,
        metadata={"l1": 1, "l2": 2, "subject_id": "1"},
        path=path)
    rep2 = Repertoire.build_from_sequence_objects(
        receptors2,
        metadata={"l1": 0, "l2": 3, "subject_id": "2"},
        path=path)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])
    lc.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])
    return dataset, lc
def _create_dummy_data(self, path, dataset_type):
    """Create a small dataset of the requested type below ``path``.

    A five-sequence repertoire is always built first. For ``"receptor"`` its
    receptors are pickled to ``receptors.pkl`` and wrapped in a
    ``ReceptorDataset``; for ``"repertoire"`` the repertoire itself is wrapped
    in a ``RepertoireDataset``. Any other ``dataset_type`` yields ``None``.
    """
    PathBuilder.build(path)

    test_repertoire = Repertoire.build(
        sequence_aas=["DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"],
        v_genes=["V1-1"] * 5,
        j_genes=["J1-1"] * 5,
        chains=[Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA],
        custom_lists={
            "custom_1": [f"CUST-{index}" for index in range(5)],
            "custom_2": ["CUST-A"] * 3 + ["CUST-B"] * 2,
        },
        cell_ids=[1, 1, 1, 2, 2],
        path=path)

    if dataset_type == "receptor":
        receptordataset_filename = f"{path}/receptors.pkl"
        with open(receptordataset_filename, "wb") as file:
            pickle.dump(test_repertoire.receptors, file)
        return ReceptorDataset(filenames=[receptordataset_filename],
                               identifier="receptor_dataset")

    if dataset_type == "repertoire":
        test_repertoire.identifier = "repertoire_dataset"
        return RepertoireDataset(repertoires=[test_repertoire])

    return None
def test_implant_in_repertoire(self):
    """Implant the motif "CCC" at rate 0.5 into a two-sequence repertoire.

    Afterwards at least one of the original sequences must still be present
    and at least one sequence must contain the motif.
    """
    path = EnvironmentSettings.tmp_test_path + "healthysequenceimplanting/"
    PathBuilder.build(path)

    source_sequences = [
        ReceptorSequence(amino_acid_sequence="ACDFQ", identifier="1"),
        ReceptorSequence(amino_acid_sequence="TGCDF", identifier="2"),
    ]
    repertoire = Repertoire.build_from_sequence_objects(
        source_sequences, path=path, metadata={"subject_id": "1"})

    implanting = HealthySequenceImplanting(
        GappedMotifImplanting(),
        implanting_computation=ImplantingComputation.ROUND)
    motif = Motif("m1", GappedKmerInstantiation(), "CCC")
    signal = Signal("1", [motif], implanting)

    repertoire2 = implanting.implant_in_repertoire(repertoire, 0.5, signal, path)
    new_sequences = [item.get_sequence() for item in repertoire2.sequences]

    self.assertTrue(any(original in new_sequences
                        for original in ("ACDFQ", "TGCDF")))
    self.assertTrue(any("CCC" in item for item in new_sequences))

    shutil.rmtree(path)
def test_process(self):
    """Filter a two-repertoire dataset by chain and check what is kept.

    Keeping "ALPHA" must leave only the repertoire holding "AAA" (with its
    metadata row) and must not change the input dataset; keeping "GAMMA" is
    expected to raise ``AssertionError``.
    """
    path = EnvironmentSettings.root_path + "test/tmp/chain_filter/"
    PathBuilder.build(path)

    fixture = (("AAA", "A", "1"), ("AAC", "B", "2"))
    repertoires = [
        Repertoire.build_from_sequence_objects(
            [ReceptorSequence(amino_acids,
                              metadata=SequenceMetadata(chain=chain),
                              identifier=identifier)],
            path=path,
            metadata={})
        for amino_acids, chain, identifier in fixture
    ]

    metadata_file = path + "metadata.csv"
    pd.DataFrame({"CD": [1, 0]}).to_csv(metadata_file)

    dataset = RepertoireDataset(repertoires=repertoires,
                                metadata_file=metadata_file)
    result_path = path + "results/"

    dataset2 = ChainRepertoireFilter.process(
        dataset, {"keep_chain": "ALPHA", "result_path": result_path})

    self.assertEqual(1, len(dataset2.get_data()))
    self.assertEqual(2, len(dataset.get_data()))

    metadata_dict = dataset2.get_metadata(["CD"])
    self.assertEqual(1, len(metadata_dict["CD"]))
    self.assertEqual(1, metadata_dict["CD"][0])

    for rep in dataset2.get_data():
        self.assertEqual("AAA", rep.sequences[0].get_sequence())

    with self.assertRaises(AssertionError):
        ChainRepertoireFilter.process(
            dataset, {"keep_chain": "GAMMA", "result_path": result_path})

    shutil.rmtree(path)
def test_run(self):
    """Run ``DataEncoder`` with a Word2Vec encoder on two single-sequence repertoires."""
    path = EnvironmentSettings.root_path + "test/tmp/dataencoder/"
    PathBuilder.build(path)

    rep1 = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("AAA", identifier="1")],
        metadata={"l1": 1, "l2": 2},
        path=path)
    rep2 = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("ATA", identifier="2")],
        metadata={"l1": 0, "l2": 3},
        path=path)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])
    lc.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])
    encoder = Word2VecEncoder.build_object(
        dataset,
        k=3,
        model_type=ModelType.SEQUENCE.name,
        vector_size=6)

    encoder_params = EncoderParams(
        model={},
        pool_size=2,
        label_config=lc,
        result_path=path,
        filename="dataset.csv")
    res = DataEncoder.run(
        DataEncoderParams(dataset=dataset,
                          encoder=encoder,
                          encoder_params=encoder_params,
                          store_encoded_data=False))

    self.assertIsInstance(res, RepertoireDataset)
    # One encoded row per repertoire.
    self.assertEqual(2, res.encoded_data.examples.shape[0])

    shutil.rmtree(path)
def test_encode(self):
    """Encode two repertoires with a sequence-level Word2Vec encoder.

    Checks the shape of the encoded matrix (2 examples x 16 dimensions), the
    "T1D" labels, and the concrete encoder class returned by ``build_object``.
    """
    test_path = EnvironmentSettings.root_path + "test/tmp/w2v/"
    PathBuilder.build(test_path)

    sequence1 = ReceptorSequence("CASSVFA", identifier="1")
    sequence2 = ReceptorSequence("CASSCCC", identifier="2")

    rep1 = Repertoire.build_from_sequence_objects(
        [sequence1, sequence2], test_path, {"T1D": "T1D", "subject_id": "1"})
    rep2 = Repertoire.build_from_sequence_objects(
        [sequence1], test_path, {"T1D": "CTL", "subject_id": "2"})
    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    label_configuration = LabelConfiguration()
    label_configuration.add_label("T1D", ["T1D", "CTL"])

    config_params = EncoderParams(
        model={},
        learn_model=True,
        result_path=test_path,
        label_config=label_configuration,
        filename="dataset.pkl")

    encoder = Word2VecEncoder.build_object(
        dataset, k=3, model_type="sequence", vector_size=16)
    encoded_dataset = encoder.encode(dataset=dataset, params=config_params)

    encoded = encoded_dataset.encoded_data
    self.assertIsNotNone(encoded)

    encoded = encoded_dataset.encoded_data
    self.assertEqual(2, encoded.examples.shape[0])
    self.assertEqual(16, encoded.examples.shape[1])
    self.assertEqual(2, len(encoded.labels["T1D"]))
    self.assertEqual("T1D", encoded.labels["T1D"][0])
    self.assertIsInstance(encoder, W2VRepertoireEncoder)

    shutil.rmtree(test_path)
def process_repertoire(repertoire: Repertoire, params: dict) -> Repertoire:
    """Collapse duplicate rows of a repertoire and build a new repertoire from them.

    The repertoire data is loaded into a DataFrame, grouped by the fields
    returned by ``DuplicateSequenceFilter._prepare_group_by_field`` and
    aggregated with the mapping returned by
    ``DuplicateSequenceFilter._prepare_agg_dict``.

    Args:
        repertoire: repertoire whose data and metadata are read.
        params: forwarded to both helper calls; ``params["result_path"]`` is
            the path passed to ``Repertoire.build``.

    Returns:
        The repertoire built from the aggregated rows, carrying a deep copy
        of the input repertoire's metadata.
    """
    data = pd.DataFrame(repertoire.load_data())

    # Grouping and aggregation are derived from the columns as loaded,
    # i.e. before the "chains" column is rewritten or added below.
    groupby_fields = DuplicateSequenceFilter._prepare_group_by_field(
        params, data.columns)
    custom_lists = list(set(data.columns) - set(Repertoire.FIELDS))
    agg_dict = DuplicateSequenceFilter._prepare_agg_dict(
        params, data.columns, custom_lists)

    # Chain objects can not be aggregated, convert to strings
    if "chains" not in data.columns:
        data["chains"] = None
    else:
        data["chains"] = [
            chain.value if isinstance(chain, Chain) else chain
            for chain in data["chains"]
        ]

    grouped = data.groupby(groupby_fields)
    no_duplicates = grouped.agg(agg_dict).reset_index()

    def column_or_none(name):
        """Return the aggregated column as a list, or None when it is absent."""
        if name not in no_duplicates.columns:
            return None
        return list(no_duplicates[name])

    processed_repertoire = Repertoire.build(
        sequence_aas=column_or_none("sequence_aas"),
        sequences=column_or_none("sequences"),
        v_genes=column_or_none("v_genes"),
        j_genes=column_or_none("j_genes"),
        # String values are turned back into Chain objects.
        chains=[Chain(key) for key in list(no_duplicates["chains"])]
        if "chains" in no_duplicates.columns else None,
        counts=column_or_none("counts"),
        region_types=column_or_none("region_types"),
        custom_lists={key: list(no_duplicates[key]) for key in custom_lists},
        # Unlike the fields above, sequence identifiers are read unconditionally.
        sequence_identifiers=list(no_duplicates["sequence_identifiers"]),
        metadata=copy.deepcopy(repertoire.metadata),
        path=params["result_path"])

    return processed_repertoire
def test_match(self):
    """Match two reference sequences against a one-repertoire dataset.

    ``SequenceMatcher.match`` is called with the value 2 and the PERCENTAGE
    summary type; the result must describe exactly one repertoire and report
    one match for the repertoire's fourth sequence.
    """
    path = EnvironmentSettings.root_path + "test/tmp/seqmatch/"
    PathBuilder.build(path)

    # (amino acid sequence, j gene, identifier); chain and v gene are shared.
    repertoire_fixture = (
        ("AAAAAA", "J2", "3"),
        ("CCCCCC", "J2", "4"),
        ("AAAACC", "J2", "5"),
        ("TADQVF", "J3", "6"),
    )
    repertoire = Repertoire.build_from_sequence_objects(
        sequence_objects=[
            ReceptorSequence(
                amino_acid_sequence=amino_acids,
                metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene=j_gene),
                identifier=identifier)
            for amino_acids, j_gene, identifier in repertoire_fixture
        ],
        metadata={"CD": True},
        path=path)

    dataset = RepertoireDataset(repertoires=[repertoire])

    sequences = [
        ReceptorSequence(
            "AAAACA",
            metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J2"),
            identifier="1"),
        ReceptorSequence(
            "TADQV",
            metadata=SequenceMetadata(chain="A", v_gene="V1", j_gene="J3"),
            identifier="2"),
    ]

    result = SequenceMatcher().match(
        dataset, sequences, 2, SequenceMatchingSummaryType.PERCENTAGE)

    self.assertIn("repertoires", result)
    fourth_sequence = result["repertoires"][0]["sequences"][3]
    self.assertEqual(1, len(fourth_sequence["matching_sequences"]))
    self.assertTrue(result["repertoires"][0]["metadata"]["CD"])
    self.assertEqual(1, len(result["repertoires"]))

    shutil.rmtree(path)
def test_match_repertoire(self):
    """Match reference sequences against a single repertoire.

    With the COUNT summary the per-sequence match counts are checked; with the
    CLONAL_PERCENTAGE summary the reported clonal percentage must be 0.8.
    """
    path = EnvironmentSettings.root_path + "test/tmp/seqmatchrep/"
    PathBuilder.build(path)

    # (amino acid sequence, identifier, count)
    repertoire_fixture = (
        ("AAAAAA", "1", 3),
        ("CCCCCC", "2", 2),
        ("AAAACC", "3", 1),
        ("TADQVF", "4", 4),
    )
    repertoire = Repertoire.build_from_sequence_objects(
        sequence_objects=[
            ReceptorSequence(amino_acid_sequence=amino_acids,
                             identifier=identifier,
                             metadata=SequenceMetadata(chain="A", count=count))
            for amino_acids, identifier, count in repertoire_fixture
        ],
        metadata={"CD": True},
        path=path)

    sequences = [
        ReceptorSequence(amino_acids, metadata=SequenceMetadata(chain="A"))
        for amino_acids in ("AAAACA", "TADQV")
    ]

    matcher = SequenceMatcher()
    result = matcher.match_repertoire(
        repertoire, 0, sequences, 2, SequenceMatchingSummaryType.COUNT)

    for key in ("sequences", "repertoire", "repertoire_index"):
        self.assertIn(key, result)

    self.assertEqual(4, len(result["sequences"]))
    for position, expected_matches in enumerate((1, 0, 1, 1)):
        self.assertEqual(
            expected_matches,
            len(result["sequences"][position]["matching_sequences"]))

    matched_count = sum(
        1 for entry in result["sequences"]
        if len(entry["matching_sequences"]) > 0)
    self.assertEqual(3, matched_count)
    self.assertTrue(result["metadata"]["CD"])

    result = matcher.match_repertoire(
        repertoire, 0, sequences, 2,
        SequenceMatchingSummaryType.CLONAL_PERCENTAGE)
    self.assertEqual(0.8, result["clonal_percentage"])

    shutil.rmtree(path)
def _process_repertoire(index, repertoire, current_implanting, simulation_state) -> Repertoire:
    """Return the repertoire to use in the simulated dataset.

    When an implanting is given, the work is delegated to
    ``SignalImplanter._implant_in_repertoire``. Otherwise a new repertoire is
    built from the same sequences under ``<result_path>repertoires/`` with
    every ``signal_<id>`` metadata flag set to ``False``.

    Args:
        index: forwarded unchanged to ``_implant_in_repertoire``.
        repertoire: source repertoire; its metadata is not modified.
        current_implanting: implanting to apply, or ``None`` for no implanting.
        simulation_state: provides ``signals`` and ``result_path``.

    Returns:
        The implanted repertoire, or the flagged copy described above.
    """
    # Local import keeps this block self-contained.
    import copy

    if current_implanting is not None:
        return SignalImplanter._implant_in_repertoire(
            index, repertoire, current_implanting, simulation_state)

    # Work on a private copy so that adding the flags can never write into
    # the source repertoire's metadata, and treat missing metadata as empty.
    if repertoire.metadata is not None:
        metadata = copy.deepcopy(repertoire.metadata)
    else:
        metadata = {}

    # Flags are added before building so the builder receives the complete
    # metadata instead of having it patched afterwards.
    for signal in simulation_state.signals:
        metadata[f"signal_{signal.id}"] = False

    return Repertoire.build_from_sequence_objects(
        repertoire.sequences,
        simulation_state.result_path + "repertoires/",
        metadata)
def test_implant_in_repertoire(self):
    """Implant a full-sequence signal at rate 0.33 into three identical sequences.

    The repertoire size must be unchanged, with exactly one sequence equal to
    the motif "AAAA" and two still equal to "CCCC".
    """
    path = PathBuilder.build(f"{EnvironmentSettings.tmp_test_path}full_seq_implanting/")

    motif = Motif("motif1", GappedKmerInstantiation(max_gap=0), "AAAA")
    signal = Signal("sig1", [motif], FullSequenceImplanting())

    repertoire = Repertoire.build(["CCCC", "CCCC", "CCCC"], path=path)
    new_repertoire = signal.implant_to_repertoire(repertoire, 0.33, path)

    self.assertEqual(len(repertoire.sequences), len(new_repertoire.sequences))

    implanted = [seq for seq in new_repertoire.sequences
                 if seq.amino_acid_sequence == "AAAA"]
    self.assertEqual(1, len(implanted))

    untouched = [seq for seq in new_repertoire.sequences
                 if seq.amino_acid_sequence == "CCCC"]
    self.assertEqual(2, len(untouched))

    shutil.rmtree(path)
def implant_in_repertoire(self, repertoire: Repertoire, repertoire_implanting_rate: float, signal, path):
    """Build a new repertoire in which some sequences are replaced via the signal.

    Args:
        repertoire: source repertoire; its sequences and metadata are read.
        repertoire_implanting_rate: fraction of the sequences to create anew;
            the count is rounded up with ``math.ceil``.
        signal: signal whose motifs are used; no motif seed may contain "/".
        path: location passed to ``Repertoire.build_from_sequence_objects``.

    Returns:
        The new repertoire, with ``signal_<id>`` set to ``True`` in a deep
        copy of the original metadata.

    Raises:
        AssertionError: if a motif seed contains "/" or if the computed
            number of new sequences is not positive.
    """
    assert not any("/" in motif.seed for motif in signal.motifs), \
        f'FullSequenceImplanting: motifs cannot include gaps. Check motifs {[motif.identifier for motif in signal.motifs]}.'

    sequences = repertoire.sequences
    new_sequence_count = math.ceil(len(sequences) * repertoire_implanting_rate)

    assert new_sequence_count > 0, (
        f"FullSequenceImplanting: there are too few sequences ({len(sequences)}) in the repertoire with identifier {repertoire.identifier} "
        f"to have the given repertoire implanting rate ({repertoire_implanting_rate}). Please consider increasing the repertoire implanting rate."
    )

    implanted_sequences = self._create_new_sequences(sequences, new_sequence_count, signal)

    # Copy first so the source repertoire's metadata stays untouched.
    flagged_metadata = copy.deepcopy(repertoire.metadata)
    flagged_metadata[f"signal_{signal.id}"] = True

    return Repertoire.build_from_sequence_objects(implanted_sequences, path, flagged_metadata)
def test_create_sentences_from_repertoire(self):
    """Split a three-sequence repertoire into 3-mer sentences.

    One sentence per sequence is expected; the first ("AACT") must consist of
    the two k-mers "AAC" and "ACT".
    """
    path = EnvironmentSettings.tmp_test_path + "kmer/"
    PathBuilder.build(path)

    sequence_objects = [
        ReceptorSequence(amino_acid_sequence=amino_acids)
        for amino_acids in ("AACT", "ACCT", "AACT")
    ]
    rep = Repertoire.build_from_sequence_objects(sequence_objects, path, {})

    sentences = KmerHelper.create_sentences_from_repertoire(rep, 3)

    self.assertEqual(3, len(sentences))
    first_sentence = sentences[0]
    self.assertEqual(2, len(first_sentence))
    self.assertIn("AAC", first_sentence)
    self.assertIn("ACT", first_sentence)

    shutil.rmtree(path)
def _build_new_repertoire(self, sequences, repertoire_metadata, signal, path) -> Repertoire:
    """Build a repertoire from ``sequences`` whose metadata marks the signal as present.

    Args:
        sequences: sequence objects for the new repertoire.
        repertoire_metadata: metadata to start from; deep-copied, or replaced
            by an empty dict when ``None``.
        signal: its ``id`` forms the ``signal_<id>`` metadata key.
        path: location passed to ``Repertoire.build_from_sequence_objects``.

    Returns:
        The newly built repertoire.
    """
    metadata = {} if repertoire_metadata is None else copy.deepcopy(repertoire_metadata)

    # when adding implant to a repertoire, only signal id is stored:
    # more detailed information is available in each receptor_sequence
    # (specific motif and motif instance)
    metadata[f"signal_{signal.id}"] = True

    return Repertoire.build_from_sequence_objects(sequences, path, metadata)
def _repertoire_to_dataframe(repertoire: Repertoire, region_type):
    """Load a repertoire into a DataFrame with AIRR-style column names.

    Missing allele/gene columns are added and filled with ``None`` before
    ``AIRRExporter.update_gene_columns`` is called; the mandatory fields are
    then renamed. The two sequence column names depend on ``region_type``.
    """
    # get all fields (including custom fields)
    df = pd.DataFrame(repertoire.load_data())

    gene_columns = ('v_alleles', 'j_alleles', 'v_genes', 'j_genes')
    absent_columns = [name for name in gene_columns if name not in df.columns]
    for name in absent_columns:
        df.loc[:, name] = None

    AIRRExporter.update_gene_columns(df, 'alleles', 'genes')

    # rename mandatory fields for airr-compliance
    airr_names = {
        "sequence_identifiers": "sequence_id",
        "v_alleles": "v_call",
        "j_alleles": "j_call",
        "chains": "locus",
        "counts": "duplicate_count",
        "sequences": AIRRExporter.get_sequence_field(region_type),
        "sequence_aas": AIRRExporter.get_sequence_aa_field(region_type),
    }
    return df.rename(mapper=airr_names, axis="columns")
def load_repertoire_as_object(import_class, metadata_row, params: DatasetImportParams):
    """Import the file named in one metadata row and build a ``Repertoire`` from it.

    Args:
        import_class: supplies ``preprocess_dataframe`` and, optionally, an
            ``alternative_load_func`` attribute.
        metadata_row: row providing ``'filename'``; converted with ``to_dict()``
            to become the repertoire metadata.
        params: import parameters; ``path`` prefixes the filename and
            ``result_path`` is the parent of the ``repertoires/`` output folder.

    Returns:
        The repertoire built from the preprocessed data.

    Raises:
        RuntimeError: wrapping any exception raised while importing.
    """
    try:
        alternative_load_func = getattr(import_class, "alternative_load_func", None)
        source_file = f"{params.path}{metadata_row['filename']}"

        dataframe = ImportHelper.load_sequence_dataframe(source_file, params, alternative_load_func)
        dataframe = import_class.preprocess_dataframe(dataframe, params)

        # Columns that are standard repertoire fields become keyword arguments.
        sequence_lists = {}
        for field in Repertoire.FIELDS:
            if field in dataframe.columns:
                sequence_lists[field] = dataframe[field].values.tolist()

        # Every remaining column is passed along as a custom list.
        custom_fields = list(set(dataframe.columns) - set(Repertoire.FIELDS))
        sequence_lists["custom_lists"] = {
            field: dataframe[field].values.tolist() for field in custom_fields
        }

        repertoire_inputs = {
            "metadata": metadata_row.to_dict(),
            "path": params.result_path + "repertoires/",
        }
        repertoire_inputs.update(sequence_lists)

        return Repertoire.build(**repertoire_inputs)
    except Exception as exception:
        raise RuntimeError(f"{ImportHelper.__name__}: error when importing file {metadata_row['filename']}.") from exception
def build(sequences: list, path: str, labels: dict = None, seq_metadata: list = None, subject_ids: list = None):
    """Build one repertoire per list of sequences and write a metadata CSV.

    Args:
        sequences: list of lists of amino acid sequences, one inner list per
            repertoire.
        path: output directory; repertoires go to ``path + "repertoires/"``
            and the metadata file to ``path + "metadata.csv"``.
        labels: optional mapping from label name to per-repertoire values.
        seq_metadata: optional per-repertoire lists of keyword dicts for
            ``SequenceMetadata``; fixed defaults are used when omitted.
        subject_ids: optional per-repertoire subject ids; ``"rep_<index>"``
            is generated when omitted.

    Returns:
        ``(repertoires, metadata_file_path)``.

    Raises:
        AssertionError: if the lengths of the given lists do not match.
    """
    if subject_ids is not None:
        assert len(subject_ids) == len(sequences)

    if seq_metadata is not None:
        assert len(sequences) == len(seq_metadata)
        for index, sequence_list in enumerate(sequences):
            assert len(sequence_list) == len(seq_metadata[index])

    PathBuilder.build(path)
    rep_path = PathBuilder.build(path + "repertoires/")

    # Used for every sequence when no explicit sequence metadata is given.
    default_sequence_metadata = dict(
        v_subgroup="TRBV1", v_gene="TRBV1-1", v_allele="TRBV1-1*01",
        j_subgroup="TRBJ1", j_gene="TRBJ1-1", j_allele="TRBJ1-1*01",
        count=1, chain="TRB", region_type="IMGT_CDR3")

    if subject_ids is None:
        subject_ids = []

    repertoires = []
    for rep_index, sequence_list in enumerate(sequences):
        rep_sequences = ReceptorSequenceList()

        # Generate an id for this repertoire while the list is still short.
        if len(subject_ids) < len(sequences):
            subject_ids.append("rep_" + str(rep_index))

        for seq_index, sequence in enumerate(sequence_list):
            if seq_metadata is not None:
                sequence_metadata = SequenceMetadata(**seq_metadata[rep_index][seq_index])
            else:
                sequence_metadata = SequenceMetadata(**default_sequence_metadata)

            rep_sequences.append(
                ReceptorSequence(amino_acid_sequence=sequence,
                                 metadata=sequence_metadata,
                                 identifier=str(seq_index)))

        label_values = {} if labels is None else {key: labels[key][rep_index] for key in labels.keys()}
        metadata = {**label_values, "subject_id": subject_ids[rep_index]}

        repertoires.append(
            Repertoire.build_from_sequence_objects(rep_sequences, rep_path, metadata))

    columns = {
        "filename": [f"{repertoire.identifier}_data.npy" for repertoire in repertoires],
        "subject_id": subject_ids,
        "repertoire_identifier": [repertoire.identifier for repertoire in repertoires],
    }
    if labels is not None:
        columns.update(labels)

    metadata_file = path + "metadata.csv"
    pd.DataFrame(columns).to_csv(metadata_file, index=False)

    return repertoires, metadata_file
def create_dummy_repertoire(self, path):
    """Create a two-sequence repertoire and a one-row metadata CSV under ``path``.

    Returns:
        ``(repertoire, metadata_file_path)``.
    """
    beta_sequence = ReceptorSequence(
        amino_acid_sequence="AAA",
        nucleotide_sequence="GCTGCTGCT",
        identifier="receptor_1",
        metadata=SequenceMetadata(
            v_gene="TRBV1",
            j_gene="TRBJ1",
            chain=Chain.BETA,
            count=5,
            region_type="IMGT_CDR3",
            frame_type="IN",
            custom_params={"d_call": "TRBD1", "custom_test": "cust1"}))

    alpha_sequence = ReceptorSequence(
        amino_acid_sequence="GGG",
        nucleotide_sequence="GGTGGTGGT",
        identifier="receptor_2",
        metadata=SequenceMetadata(
            v_gene="TRAV2",
            v_allele="TRAV2*01",
            j_gene="TRAJ2",
            chain=Chain.ALPHA,
            count=15,
            frame_type=None,
            region_type="IMGT_CDR3",
            custom_params={"d_call": "TRAD2", "custom_test": "cust2"}))

    repertoire = Repertoire.build_from_sequence_objects(
        sequence_objects=[beta_sequence, alpha_sequence],
        path=path,
        metadata={"subject_id": "REP1"})

    metadata_file = path + "metadata.csv"
    metadata_table = pd.DataFrame({
        "filename": [f"{repertoire.identifier}_data.npy"],
        "subject_id": ["1"],
        "repertoire_identifier": [repertoire.identifier],
    })
    metadata_table.to_csv(metadata_file, index=False)

    return repertoire, metadata_file
def test_run(self):
    """Run ``SignalImplanter`` on ten identical repertoires with two implantings.

    Implanting "i1" carries signals s1 and s2, "i2" carries only s2, each with
    a dataset implanting rate of 0.2. The test expects all ten repertoires to
    come back with both signal flags present, s2 set in four of them and s1
    in two, and every repertoire's data file listed in the dataset metadata.
    """
    path = EnvironmentSettings.root_path + "test/tmp/signalImplanter/"
    # exist_ok avoids the separate "does it exist" check.
    os.makedirs(path, exist_ok=True)

    sequences = [ReceptorSequence("ACDEFG", identifier=str(number))
                 for number in range(1, 5)]
    repertoires = [
        Repertoire.build_from_sequence_objects(
            sequence_objects=sequences, path=path, metadata={})
        for _ in range(10)
    ]
    dataset = RepertoireDataset(repertoires=repertoires)

    m1 = Motif(identifier="m1", instantiation=GappedKmerInstantiation(), seed="CAS")
    m2 = Motif(identifier="m2", instantiation=GappedKmerInstantiation(), seed="CCC")

    s1 = Signal(identifier="s1", motifs=[m1],
                implanting_strategy=HealthySequenceImplanting(
                    GappedMotifImplanting(),
                    implanting_computation=ImplantingComputation.ROUND))
    s2 = Signal(identifier="s2", motifs=[m1, m2],
                implanting_strategy=HealthySequenceImplanting(
                    GappedMotifImplanting(),
                    implanting_computation=ImplantingComputation.ROUND))

    simulation = Simulation([
        Implanting(dataset_implanting_rate=0.2, repertoire_implanting_rate=0.5,
                   signals=[s1, s2], name="i1"),
        Implanting(dataset_implanting_rate=0.2, repertoire_implanting_rate=0.5,
                   signals=[s2], name="i2"),
    ])
    input_params = SimulationState(dataset=dataset, result_path=path,
                                   simulation=simulation, signals=[s1, s2])

    new_dataset = SignalImplanter.run(input_params)

    s1_key = f"signal_{s1.id}"
    s2_key = f"signal_{s2.id}"
    new_repertoires = list(new_dataset.get_data(batch_size=10))

    self.assertEqual(10, len(new_dataset.get_example_ids()))

    # Check that the flags exist before reading them, so a missing flag is
    # reported as an assertion failure rather than a KeyError.
    for rep in new_repertoires:
        self.assertIn(s1_key, rep.metadata)
        self.assertIn(s2_key, rep.metadata)

    reps_with_s2 = sum(rep.metadata[s2_key] is True for rep in new_repertoires)
    reps_with_s1 = sum(rep.metadata[s1_key] is True for rep in new_repertoires)
    self.assertEqual(4, reps_with_s2)
    self.assertEqual(2, reps_with_s1)

    metadata_filenames = new_dataset.get_metadata(["filename"])["filename"]
    for repertoire in new_dataset.repertoires:
        self.assertIn(repertoire.data_filename, metadata_filenames)

    shutil.rmtree(path)
def test_find_label_associated_sequence_p_values(self):
    """Check the p-values computed for nine sequences across four repertoires.

    Two repertoires are labelled ``True`` and two ``False`` for label "l1".
    The comparison data is filled by hand with five batches of presence
    matrices (one row per sequence, one column per repertoire) and the
    resulting p-values are compared with the expected list.
    """
    path = EnvironmentSettings.tmp_test_path + "comparison_data_find_label_assocseqpvalues/"
    PathBuilder.build(path)

    label_values = [True, True, False, False]
    subject_ids = ["rep_0", "rep_1", "rep_2", "rep_3"]
    repertoires = [
        Repertoire.build_from_sequence_objects(
            [ReceptorSequence()], path, {"l1": val, "subject_id": subject_id})
        for val, subject_id in zip(label_values, subject_ids)
    ]

    # Maps each repertoire identifier to its column in the batch matrices.
    col_name_index = {repertoire.identifier: index
                      for index, repertoire in enumerate(repertoires)}

    comparison_data = ComparisonData(
        repertoire_ids=[repertoire.identifier for repertoire in repertoires],
        comparison_attributes=["sequence_aas"],
        sequence_batch_size=4,
        path=path)

    # (matrix rows, items) per batch; the batch identifier is its position.
    batch_contents = [
        ([[1., 0., 0., 0.], [1., 1., 0., 0.]], [('GGG',), ('III',)]),
        ([[1., 1., 0., 1.], [1., 1., 1., 1.]], [('LLL',), ('MMM',)]),
        ([[0., 1., 0., 0.], [0., 1., 0., 1.]], [('DDD',), ('EEE',)]),
        ([[0., 1., 1., 1.], [0., 0., 1., 1.]], [('FFF',), ('CCC',)]),
        ([[0., 0., 0., 1.]], [('AAA',)]),
    ]
    comparison_data.batches = [
        ComparisonDataBatch(matrix=np.array(rows),
                            items=items,
                            repertoire_index_mapping=col_name_index,
                            path=path,
                            identifier=identifier)
        for identifier, (rows, items) in enumerate(batch_contents)
    ]

    p_values = SequenceFilterHelper.find_label_associated_sequence_p_values(
        comparison_data, repertoires,
        Label('l1', [True, False], positive_class=True))

    expected_p_values = [
        SequenceFilterHelper.INVALID_P_VALUE,
        0.1666666666666667,
        0.5000000000000001,
        1.,
        SequenceFilterHelper.INVALID_P_VALUE,
        0.8333333333333331,
        1.,
        1.,
        2,
    ]
    self.assertTrue(np.allclose(expected_p_values, p_values, equal_nan=True))

    shutil.rmtree(path)
def test_receptor(self):
    """Build a repertoire from seven chain-annotated sequences in two cells.

    The repertoire is expected to expose six receptors and two cells.
    """
    path = EnvironmentSettings.tmp_test_path + "receptortestingpathrepertoire/"
    PathBuilder.build(path)

    # (amino acids, identifier, gene keyword arguments, cell id, chain, cmv, coeliac)
    fixture = (
        ("AAA", "1", {"v_gene": "V1"}, "1", Chain.ALPHA, "no", False),
        ("CCC", "2", {"j_gene": "J1"}, "1", Chain.BETA, "yes", True),
        ("FFF", "3", {"v_gene": "V1"}, "1", Chain.ALPHA, "no", False),
        ("EEE", "4", {"j_gene": "J1"}, "1", Chain.BETA, "yes", True),
        ("FFF", "5", {"v_gene": "V1"}, "2", Chain.GAMMA, "no", False),
        ("EEE", "6", {"j_gene": "J1"}, "2", Chain.DELTA, "yes", True),
        ("EEE", "7", {"j_gene": "J2"}, "2", Chain.DELTA, "yes", True),
    )

    sequences = []
    for amino_acids, identifier, gene, cell_id, chain, cmv, coeliac in fixture:
        sequence_metadata = SequenceMetadata(
            cell_id=cell_id,
            chain=chain,
            custom_params={"cmv": cmv, "coeliac": coeliac},
            **gene)
        sequences.append(
            ReceptorSequence(amino_acid_sequence=amino_acids,
                             identifier=identifier,
                             metadata=sequence_metadata))

    obj = Repertoire.build_from_sequence_objects(
        sequences, path, {"cmv": "yes", 'subject_id': "1"})

    receptors = obj.receptors
    self.assertEqual(6, len(receptors))

    cells = obj.cells
    self.assertEqual(2, len(cells))

    shutil.rmtree(path)
class TestDataSummarizer(TestCase):
    """Tests for filtering and annotating encoded data with ``DataSummarizer``.

    The class-level datasets are created once and shared by all tests.
    """

    # 5 features, 3 repertoires. Each repertoire has 3 labels. Each feature has 2 annotations.
    encoded_data_1 = {
        'examples': sparse.csr_matrix(np.array([
            [1, 2, 3, 4, 5],
            [0, 0, 0, 1, 1],
            [1, 1, 0, 0, 0],
        ])),
        'example_ids': ['rep1', 'rep2', 'rep3'],
        'labels': {
            "diabetes": ['diabetes pos', 'diabetes neg', 'diabetes neg'],
            "celiac": ['celiac pos', 'celiac pos', 'celiac pos'],
            "cmv": ['cmv pos', 'cmv neg', 'cmv pos'],
        },
        'feature_names': ['a', 'b', 'c', 'd', 'e'],
        'feature_annotations': pd.DataFrame({
            "specificity": ["cmv", "ebv", "cmv", "gluten", "gluten"],
            "p_val": [0.01, 0.00001, 0.1, 0, 0.0000001],
        }),
    }

    dataset_1 = RepertoireDataset(
        encoded_data=EncodedData(**encoded_data_1),
        repertoires=[
            Repertoire("1.npy", None, "1"),
            Repertoire("2.npy", None, "2"),
            Repertoire("3.npy", None, "3"),
        ])

    # 5 features, 5 repertoires; each feature has 3 annotations.
    encoded_data_2 = {
        'examples': sparse.csr_matrix(np.array([
            [1, 2, 3, 4, 5],
            [0, 0, 0, 1, 1],
            [1, 1, 0, 0, 0],
            [90, 10, 1, 3, 4],
            [0, 1, 1, 100, 200],
        ])),
        'example_ids': ['rep1', 'rep2', 'rep3', 'rep4', 'rep5'],
        'labels': {
            "diabetes": ['diabetes pos', 'diabetes neg', 'diabetes neg',
                         'diabetes pos', 'diabetes pos'],
            "celiac": ['celiac pos', 'celiac pos', 'celiac pos',
                       'celiac neg', 'celiac pos'],
            "cmv": ['cmv pos', 'cmv neg', 'cmv pos', 'cmv pos', 'cmv neg'],
        },
        'feature_names': ['a', 'b', 'c', 'd', 'e'],
        'feature_annotations': pd.DataFrame({
            "specificity": ["cmv", "ebv", "cmv", "gluten", "gluten"],
            "something": ["a", "b", "b", "a", "a"],
            "p_val": [0.01, 0.00001, 0.1, 0, 0.0000001],
        }),
    }

    dataset_2 = RepertoireDataset(encoded_data=EncodedData(**encoded_data_2))

    def setUp(self) -> None:
        os.environ[Constants.CACHE_TYPE] = CacheType.TEST.name

    @staticmethod
    def _column_in(column_name, allowed_values):
        """Criterion: the value of ``column_name`` is one of ``allowed_values``."""
        return {
            "type": OperationType.IN,
            "allowed_values": allowed_values,
            "value": {"type": DataType.COLUMN, "name": column_name},
        }

    @staticmethod
    def _column_less_than(column_name, threshold):
        """Criterion: the value of ``column_name`` is below ``threshold``."""
        return {
            "type": OperationType.LESS_THAN,
            "threshold": threshold,
            "value": {"type": DataType.COLUMN, "name": column_name},
        }

    @staticmethod
    def _celiac_and_cmv_positive():
        """Repertoire criteria: celiac positive AND cmv positive."""
        return {
            "type": BooleanType.AND,
            "operands": [
                TestDataSummarizer._column_in("celiac", ["celiac pos"]),
                TestDataSummarizer._column_in("cmv", ["cmv pos"]),
            ],
        }

    @staticmethod
    def _gluten_or_small_p_value():
        """Feature criteria: specificity is gluten OR p_val below 0.0001."""
        return {
            "type": BooleanType.OR,
            "operands": [
                TestDataSummarizer._column_in("specificity", ["gluten"]),
                TestDataSummarizer._column_less_than("p_val", 0.0001),
            ],
        }

    def test_filter_repertoires(self):
        dataset = TestDataSummarizer.dataset_1
        criteria = self._celiac_and_cmv_positive()

        filtered = DataSummarizer.filter_repertoires(dataset, criteria)

        self.assertEqual(2, filtered.get_example_count())
        self.assertEqual(2, filtered.encoded_data.examples.shape[0])
        self.assertEqual(5, filtered.encoded_data.examples.shape[1])

    def test_filter_features(self):
        dataset = TestDataSummarizer.dataset_1
        criteria = self._gluten_or_small_p_value()

        filtered = DataSummarizer.filter_features(dataset, criteria)

        self.assertEqual(3, filtered.get_example_count())
        self.assertEqual(3, filtered.encoded_data.examples.shape[0])
        self.assertEqual(3, filtered.encoded_data.examples.shape[1])

    def test_annotate_repertoires(self):
        dataset = TestDataSummarizer.dataset_1
        criteria = self._celiac_and_cmv_positive()

        annotated = DataSummarizer.annotate_repertoires(dataset, criteria, "annotate")

        # Annotating must not change the shape of the encoded matrix.
        self.assertEqual(3, annotated.encoded_data.examples.shape[0])
        self.assertEqual(5, annotated.encoded_data.examples.shape[1])

    def test_annotate_features(self):
        dataset = TestDataSummarizer.dataset_1
        criteria = self._gluten_or_small_p_value()

        annotated = DataSummarizer.annotate_features(dataset, criteria, "annotate")

        self.assertEqual(3, annotated.encoded_data.examples.shape[0])
        self.assertEqual(5, annotated.encoded_data.examples.shape[1])

    def test_annotate_features_2(self):
        dataset = TestDataSummarizer.dataset_1
        # A single operation without a surrounding boolean node.
        criteria = self._column_in("specificity", ["gluten"])

        annotated = DataSummarizer.annotate_features(dataset, criteria, "annotate")

        self.assertEqual(3, annotated.encoded_data.examples.shape[0])
        self.assertEqual(5, annotated.encoded_data.examples.shape[1])
def test_encode(self):
    """Encode two repertoires with three k-mer frequency encoder configurations.

    The temporary directory is removed before the assertions run; they only
    use the returned in-memory datasets.
    """
    path = EnvironmentSettings.root_path + "test/tmp/kmerfreqenc/"
    PathBuilder.build(path)

    rep1 = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("AAA", identifier="1"),
         ReceptorSequence("ATA", identifier="2"),
         ReceptorSequence("ATA", identifier='3')],
        metadata={"l1": 1, "l2": 2, "subject_id": "1"},
        path=path)
    rep2 = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("ATA", identifier="1"),
         ReceptorSequence("TAA", identifier="2"),
         ReceptorSequence("AAC", identifier="3")],
        metadata={"l1": 0, "l2": 3, "subject_id": "2"},
        path=path)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])
    lc.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    def build_encoder(normalization, sequence_encoding):
        """Build a k=3, unique-reads encoder with the given settings."""
        return KmerFrequencyEncoder.build_object(dataset, **{
            "normalization_type": normalization.name,
            "reads": ReadsType.UNIQUE.name,
            "sequence_encoding": sequence_encoding.name,
            "k": 3
        })

    identity_encoder = build_encoder(NormalizationType.RELATIVE_FREQUENCY,
                                     SequenceEncodingType.IDENTITY)
    d1 = identity_encoder.encode(dataset, EncoderParams(
        result_path=path + "1/",
        label_config=lc,
        learn_model=True,
        model={},
        filename="dataset.pkl"
    ))

    encoder = build_encoder(NormalizationType.RELATIVE_FREQUENCY,
                            SequenceEncodingType.CONTINUOUS_KMER)
    d2 = encoder.encode(dataset, EncoderParams(
        result_path=path + "2/",
        label_config=lc,
        pool_size=2,
        learn_model=True,
        model={},
        filename="dataset.csv"
    ))

    encoder3 = build_encoder(NormalizationType.BINARY,
                             SequenceEncodingType.CONTINUOUS_KMER)
    d3 = encoder3.encode(dataset, EncoderParams(
        result_path=path + "3/",
        label_config=lc,
        learn_model=True,
        model={},
        filename="dataset.pkl"
    ))

    shutil.rmtree(path)

    self.assertIsInstance(d1, RepertoireDataset)
    self.assertIsInstance(d2, RepertoireDataset)
    self.assertEqual(0.67, np.round(d2.encoded_data.examples[0, 2], 2))
    self.assertEqual(0.0, np.round(d3.encoded_data.examples[0, 1], 2))
    self.assertIsInstance(encoder, KmerFrequencyEncoder)
def test_encode(self):
    """Encode two repertoires with the evenness profile encoder.

    The first repertoire mixes counts of 10, 100 and 1 (1000 sequences each);
    the second has 1000 sequences that all have count 10. The first two
    profile values of each repertoire are checked on the 51-dimension
    encoding; the 11-dimension encoding with ``pool_size=2`` is only run.
    """
    path = EnvironmentSettings.root_path + "test/tmp/evennessenc/"
    PathBuilder.build(path)

    mixed_count_sequences = [
        ReceptorSequence("AAA", metadata=SequenceMetadata(count=count))
        for count in (10, 100, 1)
        for _ in range(1000)
    ]
    rep1 = Repertoire.build_from_sequence_objects(
        sequence_objects=mixed_count_sequences,
        metadata={"l1": "test_1", "l2": 2},
        path=path)

    uniform_count_sequences = [
        ReceptorSequence("AAA", metadata=SequenceMetadata(count=10))
        for _ in range(1000)
    ]
    rep2 = Repertoire.build_from_sequence_objects(
        sequence_objects=uniform_count_sequences,
        metadata={"l1": "test_2", "l2": 3},
        path=path)

    lc = LabelConfiguration()
    lc.add_label("l1", ["test_1", "test_2"])
    lc.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    encoder = EvennessProfileEncoder.build_object(
        dataset, min_alpha=0, max_alpha=10, dimension=51)
    d1 = encoder.encode(
        dataset,
        EncoderParams(result_path=path + "1/", label_config=lc))

    encoder = EvennessProfileEncoder.build_object(
        dataset, min_alpha=0, max_alpha=10, dimension=11)
    d2 = encoder.encode(
        dataset,
        EncoderParams(result_path=path, label_config=lc, pool_size=2))

    examples = d1.encoded_data.examples
    self.assertAlmostEqual(examples[0, 0], 1)
    self.assertAlmostEqual(examples[0, 1], 0.786444)
    self.assertAlmostEqual(examples[1, 0], 1)
    self.assertAlmostEqual(examples[1, 1], 1)

    shutil.rmtree(path)
def test_process(self):
    """Check DuplicateSequenceFilter on one repertoire with duplicated sequences.

    The filter is run twice on the same dataset: once collapsing by amino
    acid sequence with summed counts, once collapsing by nucleotide sequence
    with the minimum count. The attributes of the reduced repertoire are
    compared against the expected values in both cases.
    """
    path = EnvironmentSettings.root_path + "test/tmp/duplicatesequencefilter/"
    PathBuilder.build(path)
    # Registered cleanup also runs when an assertion fails, so a failing run
    # does not leave the temporary directory behind.
    self.addCleanup(shutil.rmtree, path)

    dataset = RepertoireDataset(repertoires=[
        Repertoire.build(
            sequence_aas=["AAA", "AAA", "CCC", "AAA", "CCC", "CCC", "CCC"],
            sequences=["ntAAA", "ntBBB", "ntCCC", "ntAAA", "ntCCC", "ntCCC", "ntDDD"],
            v_genes=["v1"] * 7,
            j_genes=["j1"] * 7,
            chains=[Chain.ALPHA] * 6 + [Chain.BETA],
            counts=[10, 20, 30, 5, 20, None, 40],
            region_types=["IMGT_CDR3"] * 7,
            custom_lists={
                "custom1": ["yes", "yes", "yes", "no", "no", "no", "no"],
                "custom2": ["yes", "yes", "yes", "no", "no", "no", "no"],
            },
            sequence_identifiers=[1, 2, 3, 4, 5, 6, 7],
            path=path,
        )
    ])

    # collapse by amino acids & use sum counts
    dupfilter = DuplicateSequenceFilter(
        filter_sequence_type=SequenceType.AMINO_ACID,
        count_agg=CountAggregationFunction.SUM,
        batch_size=4,
    )
    reduced_repertoire = dupfilter.process_dataset(
        dataset=dataset, result_path=path
    ).repertoires[0]
    attr = reduced_repertoire.get_attributes(
        ["sequence_identifiers", "sequence_aas", "sequences", "counts", "chains"]
    )

    self.assertEqual(3, len(reduced_repertoire.get_sequence_identifiers()))
    self.assertListEqual(["AAA", "CCC", "CCC"], list(attr["sequence_aas"]))
    self.assertListEqual(["ntAAA", "ntCCC", "ntDDD"], list(attr["sequences"]))
    self.assertListEqual([35, 50, 40], list(attr["counts"]))
    self.assertListEqual([1, 3, 7], list(attr["sequence_identifiers"]))
    self.assertListEqual(
        [Chain.get_chain("A"), Chain.get_chain("A"), Chain.get_chain("B")],
        list(attr["chains"]),
    )

    # collapse by nucleotides & use min counts
    dupfilter = DuplicateSequenceFilter(
        filter_sequence_type=SequenceType.NUCLEOTIDE,
        count_agg=CountAggregationFunction.MIN,
        batch_size=4,
    )
    reduced_repertoire = dupfilter.process_dataset(
        dataset=dataset, result_path=path
    ).repertoires[0]
    attr = reduced_repertoire.get_attributes(
        ["sequence_identifiers", "sequence_aas", "sequences", "counts"]
    )

    self.assertEqual(4, len(reduced_repertoire.get_sequence_identifiers()))
    self.assertListEqual([1, 2, 3, 7], list(attr["sequence_identifiers"]))
    self.assertListEqual(["AAA", "AAA", "CCC", "CCC"], list(attr["sequence_aas"]))
    self.assertListEqual(
        ["ntAAA", "ntBBB", "ntCCC", "ntDDD"], list(attr["sequences"])
    )
    self.assertListEqual([5, 20, 20, 40], list(attr["counts"]))
def test_run(self):
    """Run DataSplitter with RANDOM, LOOCV and K_FOLD on eight repertoires.

    For each strategy the test checks the type of the returned datasets, the
    number of splits and the train/test sizes of the first split. For the
    RANDOM strategy it also checks that a second run with identical
    parameters yields the same training repertoire ids.
    """
    root = EnvironmentSettings.root_path + "test/tmp/datasplitter/"

    def build_split_paths(count):
        # One output directory per split, created before the splitter runs.
        split_paths = [f"{root}split_{i}" for i in range(count)]
        for split_path in split_paths:
            PathBuilder.build(split_path)
        return split_paths

    dataset = RepertoireDataset(
        repertoires=[Repertoire("0.npy", "", str(i)) for i in range(8)]
    )

    paths = build_split_paths(5)
    # Registered once the directory tree exists; the cleanup also runs when
    # an assertion fails, so a failing run does not leave files behind.
    self.addCleanup(shutil.rmtree, root)

    metadata_file = root + "metadata.csv"
    pd.DataFrame(data={
        "key1": [0, 0, 1, 1, 1, 2, 2, 0],
        "filename": [0, 1, 2, 3, 4, 5, 6, 7],
    }).to_csv(metadata_file)
    dataset.metadata_file = metadata_file

    training_percentage = 0.7

    # --- random split ---
    trains, tests = DataSplitter.run(
        DataSplitterParams(
            dataset=dataset,
            training_percentage=training_percentage,
            split_strategy=SplitType.RANDOM,
            split_count=5,
            paths=paths,
        )
    )

    self.assertIsInstance(trains[0], RepertoireDataset)
    self.assertIsInstance(tests[0], RepertoireDataset)
    self.assertEqual(len(trains[0].get_data()), 5)
    self.assertEqual(len(tests[0].get_data()), 3)
    self.assertEqual(5, len(trains))
    self.assertEqual(5, len(tests))
    self.assertEqual(5, len(trains[0].repertoires))

    # Repeating the random split with the same parameters is expected to
    # reproduce the same training set.
    trains2, _ = DataSplitter.run(
        DataSplitterParams(
            dataset=dataset,
            training_percentage=training_percentage,
            split_strategy=SplitType.RANDOM,
            split_count=5,
            paths=paths,
        )
    )

    self.assertEqual(
        trains[0].get_repertoire_ids(), trains2[0].get_repertoire_ids()
    )

    # --- leave-one-out: one split per example ---
    paths = build_split_paths(dataset.get_example_count())

    trains, tests = DataSplitter.run(
        DataSplitterParams(
            dataset=dataset,
            split_strategy=SplitType.LOOCV,
            split_count=-1,
            training_percentage=-1,
            paths=paths,
        )
    )

    self.assertIsInstance(trains[0], RepertoireDataset)
    self.assertIsInstance(tests[0], RepertoireDataset)
    self.assertEqual(len(trains[0].get_data()), 7)
    self.assertEqual(len(tests[0].get_data()), 1)
    self.assertEqual(8, len(trains))
    self.assertEqual(8, len(tests))

    # --- 5-fold cross-validation ---
    paths = build_split_paths(5)

    trains, tests = DataSplitter.run(
        DataSplitterParams(
            dataset=dataset,
            split_strategy=SplitType.K_FOLD,
            split_count=5,
            training_percentage=-1,
            paths=paths,
        )
    )

    self.assertIsInstance(trains[0], RepertoireDataset)
    self.assertIsInstance(tests[0], RepertoireDataset)
    self.assertEqual(len(trains[0].get_data()), 6)
    self.assertEqual(len(tests[0].get_data()), 2)
    self.assertEqual(5, len(trains))
    self.assertEqual(5, len(tests))
def test_repertoire(self):
    """Build a Repertoire from two sequence objects and check what it exposes.

    Covers the stored data file, the per-field getters, custom attributes,
    repertoire metadata, the rebuilt ReceptorSequence objects and the state
    of ``obj.data`` after ``free_memory()``.
    """
    path = EnvironmentSettings.tmp_test_path + "sequencerepertoire/"
    PathBuilder.build(path)
    # Registered cleanup also runs when an assertion fails, so a failing run
    # does not leave the temporary directory behind.
    self.addCleanup(shutil.rmtree, path)

    sequences = [
        ReceptorSequence(
            amino_acid_sequence="AAA",
            identifier="1",
            metadata=SequenceMetadata(
                v_gene="V1",
                cell_id="1",
                chain=Chain.ALPHA,
                custom_params={"cmv": "no", "coeliac": False},
            ),
        ),
        ReceptorSequence(
            amino_acid_sequence="CCC",
            identifier="2",
            metadata=SequenceMetadata(
                j_gene="J1",
                cell_id="1",
                chain=Chain.BETA,
                custom_params={"cmv": "yes", "coeliac": True},
            ),
        ),
    ]

    obj = Repertoire.build_from_sequence_objects(
        sequences, path, {"cmv": "yes", "subject_id": "1"}
    )

    self.assertTrue(os.path.isfile(obj.data_filename))
    self.assertIsInstance(obj, Repertoire)

    self.assertTrue(
        np.array_equal(np.array(["1", "2"]), obj.get_sequence_identifiers()))
    self.assertTrue(
        np.array_equal(np.array(["AAA", "CCC"]), obj.get_sequence_aas()))
    self.assertTrue(
        np.array_equal(np.array(["V1", None]), obj.get_v_genes()))
    self.assertTrue(
        np.array_equal(np.array([None, "J1"]), obj.get_j_genes()))
    self.assertTrue(
        np.array_equal(np.array(["no", "yes"]), obj.get_attribute("cmv")))
    self.assertTrue(
        np.array_equal(np.array([False, True]), obj.get_attribute("coeliac")))

    self.assertEqual("yes", obj.metadata["cmv"])
    self.assertEqual("1", obj.metadata["subject_id"])

    rebuilt_sequences = obj.sequences

    self.assertTrue(
        all(isinstance(seq, ReceptorSequence) for seq in rebuilt_sequences))
    self.assertEqual(2, len(rebuilt_sequences))
    self.assertEqual("1", rebuilt_sequences[0].identifier)
    self.assertEqual("2", rebuilt_sequences[1].identifier)
    self.assertEqual("AAA", rebuilt_sequences[0].amino_acid_sequence)
    self.assertEqual("yes", rebuilt_sequences[1].metadata.custom_params["cmv"])

    obj.free_memory()

    # all() is required here: a bare generator expression is always truthy,
    # so passing it straight to assertTrue never checked anything.
    # NOTE(review): these two checks were previously vacuous. If
    # free_memory() leaves obj.data in a different shape than a mapping of
    # every field to None, update the expectation rather than reverting.
    self.assertTrue(all(key in obj.data for key in Repertoire.FIELDS))
    self.assertTrue(all(obj.data[key] is None for key in Repertoire.FIELDS))
def store_repertoire(path, repertoire, sequences):
    """Build and return a new Repertoire from ``sequences`` under ``path``.

    The new repertoire is given the metadata of the ``repertoire`` argument.
    """
    return Repertoire.build_from_sequence_objects(
        sequence_objects=sequences,
        path=path,
        metadata=repertoire.metadata,
    )