def test_process(self):
    """Run SubjectRepertoireCollector.process on three repertoires with two subject ids.

    The input dataset must still hold three repertoires afterwards, while the
    processed dataset is expected to hold two repertoires with 2 and 1
    sequences respectively.
    """
    tmp_dir = EnvironmentSettings.root_path / "test/tmp/subject_rep_collector"
    PathBuilder.build(tmp_dir)

    # (sequence, sequence identifier, subject id) for each input repertoire
    repertoire_specs = [
        ("AAA", "1", "patient1"),
        ("AAC", "2", "patient1"),
        ("AAC", "3", "patient3"),
    ]
    repertoires = [
        Repertoire.build_from_sequence_objects(
            [ReceptorSequence(sequence, identifier=sequence_id)],
            path=tmp_dir,
            metadata={"subject_id": subject_id},
        )
        for sequence, sequence_id, subject_id in repertoire_specs
    ]
    original = RepertoireDataset(repertoires=repertoires)

    collected = SubjectRepertoireCollector.process(
        original, {"result_path": tmp_dir / "result"})

    self.assertEqual(2, len(collected.get_data()))
    self.assertEqual(3, len(original.get_data()))

    expected_sequence_counts = [2, 1]
    for position, repertoire in enumerate(collected.get_data()):
        self.assertEqual(expected_sequence_counts[position],
                         len(repertoire.sequences))

    shutil.rmtree(tmp_dir)
def _construct_test_repertoiredataset(self, path, positional):
    """Build a two-repertoire dataset plus a label configuration for tests.

    Args:
        path: directory handed to Repertoire.build_from_sequence_objects.
        positional: when truthy, the long poly-A / poly-T sequences are used;
            otherwise the short 3-4 letter sequences are used.

    Returns:
        A ``(dataset, label_configuration)`` tuple.
    """
    # (sequence, identifier) pairs for the first and the second repertoire
    if positional:
        first_specs = [("AAAAAAAAAAAAAAAAA", "1"), ("AAAAAAAAAAAAAAAAA", "1")]
        second_specs = [("TTTTTTTTTTTTT", "1")]
    else:
        first_specs = [("AAAA", "1"), ("ATA", "2"), ("ATA", '3')]
        second_specs = [("ATA", "1"), ("TAA", "2")]

    first_sequences = ReceptorSequenceList()
    second_sequences = ReceptorSequenceList()
    for target, specs in ((first_sequences, first_specs), (second_sequences, second_specs)):
        for sequence, sequence_id in specs:
            target.append(ReceptorSequence(sequence, identifier=sequence_id))

    first_repertoire = Repertoire.build_from_sequence_objects(
        first_sequences,
        metadata={"l1": 1, "l2": 2, "subject_id": "1"},
        path=path)
    second_repertoire = Repertoire.build_from_sequence_objects(
        second_sequences,
        metadata={"l1": 0, "l2": 3, "subject_id": "2"},
        path=path)

    label_config = LabelConfiguration()
    for label_name, label_values in (("l1", [1, 2]), ("l2", [0, 3])):
        label_config.add_label(label_name, label_values)

    return RepertoireDataset(repertoires=[first_repertoire, second_repertoire]), label_config
def test_get_normalized_sequence_lengths(self):
    """Generate a SequenceLengthDistribution report and check its first figure exists on disk."""
    out_dir = EnvironmentSettings.root_path / "test/tmp/datareports/"
    PathBuilder.build(out_dir)

    def make_repertoire(specs):
        # specs: (amino acid sequence, identifier) pairs for one repertoire
        return Repertoire.build_from_sequence_objects(
            sequence_objects=[
                ReceptorSequence(amino_acid_sequence=amino_acids, identifier=sequence_id)
                for amino_acids, sequence_id in specs
            ],
            path=out_dir,
            metadata={})

    first = make_repertoire([("AAA", "1"), ("AAAA", "2"), ("AAAAA", "3"), ("AAA", "4")])
    second = make_repertoire([("AAA", "5"), ("AAAA", "6"), ("AAAA", "7"), ("AAA", "8")])

    report = SequenceLengthDistribution(
        RepertoireDataset(repertoires=[first, second]), 1, out_dir)
    result = report.generate_report()

    figure_path = result.output_figures[0].path
    self.assertTrue(os.path.isfile(figure_path))

    shutil.rmtree(out_dir)
def test_create_model(self):
    """Create a k-mer pair model (k=2) and check its type and vocabulary."""
    tmp_dir = EnvironmentSettings.root_path / "test/tmp/w2v_test_tmp/"
    PathBuilder.build(tmp_dir)

    shared_sequence = ReceptorSequence("CASSVFA")
    extra_sequence = ReceptorSequence("CASSCCC")

    first = Repertoire.build_from_sequence_objects(
        [shared_sequence, extra_sequence],
        path=tmp_dir,
        metadata={"T1D": "T1D", "subject_id": "1"})
    second = Repertoire.build_from_sequence_objects(
        [shared_sequence],
        path=tmp_dir,
        metadata={"T1D": "CTL", "subject_id": "2"})

    model = KmerPairModelCreator().create_model(
        dataset=RepertoireDataset(repertoires=[first, second]),
        k=2,
        vector_size=16,
        batch_size=1,
        model_path=tmp_dir / "model.model")

    self.assertTrue(isinstance(model, Word2Vec))
    vocabulary = model.wv.vocab
    self.assertTrue("CA" in vocabulary)
    self.assertEqual(400, len(vocabulary))

    shutil.rmtree(tmp_dir)
def create_dummy_repertoire(self, path):
    """Build a two-sequence repertoire under ``path`` and write a matching metadata.csv.

    Args:
        path: directory used for the repertoire files and for metadata.csv.

    Returns:
        A ``(repertoire, metadata_csv_path)`` tuple.
    """
    beta_metadata = SequenceMetadata(
        v_gene="TRBV1", j_gene="TRBJ1", chain=Chain.BETA, count=5,
        region_type="IMGT_CDR3", frame_type="IN",
        custom_params={"d_call": "TRBD1", "custom_test": "cust1"})
    beta_sequence = ReceptorSequence(
        amino_acid_sequence="AAA", nucleotide_sequence="GCTGCTGCT",
        identifier="receptor_1", metadata=beta_metadata)

    alpha_metadata = SequenceMetadata(
        v_gene="TRAV2", v_allele="TRAV2*01", j_gene="TRAJ2", chain=Chain.ALPHA,
        count=15, frame_type=None, region_type="IMGT_CDR3",
        custom_params={"d_call": "TRAD2", "custom_test": "cust2"})
    alpha_sequence = ReceptorSequence(
        amino_acid_sequence="GGG", nucleotide_sequence="GGTGGTGGT",
        identifier="receptor_2", metadata=alpha_metadata)

    repertoire = Repertoire.build_from_sequence_objects(
        sequence_objects=[beta_sequence, alpha_sequence],
        path=path,
        metadata={"subject_id": "REP1"})

    # Single-row metadata table describing the repertoire built above.
    metadata_file = path / "metadata.csv"
    metadata_table = pd.DataFrame({
        "filename": [f"{repertoire.identifier}_data.npy"],
        "subject_id": ["1"],
        "repertoire_identifier": [repertoire.identifier],
    })
    metadata_table.to_csv(metadata_file, index=False)

    return repertoire, path / "metadata.csv"
def test_run(self):
    """Encode a two-repertoire dataset through DataEncoder.run with a Word2Vec encoder."""
    tmp_dir = EnvironmentSettings.root_path / "test/tmp/dataencoder/"
    PathBuilder.build(tmp_dir)

    first = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("AAA", identifier="1")],
        metadata={"l1": 1, "l2": 2},
        path=tmp_dir)
    second = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("ATA", identifier="2")],
        metadata={"l1": 0, "l2": 3},
        path=tmp_dir)

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 2])
    label_config.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[first, second])
    encoder = Word2VecEncoder.build_object(
        dataset, k=3, model_type=ModelType.SEQUENCE.name, vector_size=6)

    encoder_params = EncoderParams(
        model={},
        pool_size=2,
        label_config=label_config,
        result_path=tmp_dir,
        filename="dataset.csv")
    run_params = DataEncoderParams(
        dataset=dataset,
        encoder=encoder,
        encoder_params=encoder_params,
        store_encoded_data=False)
    encoded = DataEncoder.run(run_params)

    self.assertTrue(isinstance(encoded, RepertoireDataset))
    self.assertTrue(encoded.encoded_data.examples.shape[0] == 2)

    shutil.rmtree(tmp_dir)
def test_process(self):
    """Filter a dataset with ChainRepertoireFilter and check what is kept.

    With ``keep_chain`` set to "ALPHA" one repertoire (sequence "AAA", CD == 1)
    is expected to remain; with "GAMMA" the call is expected to raise
    AssertionError.
    """
    tmp_dir = EnvironmentSettings.root_path / "test/tmp/chain_filter/"
    PathBuilder.build(tmp_dir)

    def single_sequence_repertoire(amino_acids, chain, sequence_id):
        sequence = ReceptorSequence(
            amino_acids, metadata=SequenceMetadata(chain=chain), identifier=sequence_id)
        return Repertoire.build_from_sequence_objects([sequence], path=tmp_dir, metadata={})

    chain_a = single_sequence_repertoire("AAA", "A", "1")
    chain_b = single_sequence_repertoire("AAC", "B", "2")

    metadata_file = tmp_dir / "metadata.csv"
    pd.DataFrame({"CD": [1, 0]}).to_csv(metadata_file)

    dataset = RepertoireDataset(repertoires=[chain_a, chain_b],
                                metadata_file=tmp_dir / "metadata.csv")

    filtered = ChainRepertoireFilter.process(
        dataset, {"keep_chain": "ALPHA", "result_path": tmp_dir / "results"})

    self.assertEqual(1, len(filtered.get_data()))
    self.assertEqual(2, len(dataset.get_data()))

    cd_values = filtered.get_metadata(["CD"])
    self.assertEqual(1, len(cd_values["CD"]))
    self.assertEqual(1, cd_values["CD"][0])

    for repertoire in filtered.get_data():
        self.assertEqual("AAA", repertoire.sequences[0].get_sequence())

    with self.assertRaises(AssertionError):
        ChainRepertoireFilter.process(
            dataset, {"keep_chain": "GAMMA", "result_path": tmp_dir / "results"})

    shutil.rmtree(tmp_dir)
def test_encode(self):
    """Encode two repertoires with a Word2Vec encoder (k=3, vector_size=16) and check the output."""
    tmp_dir = EnvironmentSettings.root_path / "test/tmp/w2v/"
    PathBuilder.build(tmp_dir)

    shared_sequence = ReceptorSequence("CASSVFA", identifier="1")
    extra_sequence = ReceptorSequence("CASSCCC", identifier="2")

    first = Repertoire.build_from_sequence_objects(
        [shared_sequence, extra_sequence],
        path=tmp_dir,
        metadata={"T1D": "T1D", "subject_id": "1"})
    second = Repertoire.build_from_sequence_objects(
        [shared_sequence],
        path=tmp_dir,
        metadata={"T1D": "CTL", "subject_id": "2"})
    dataset = RepertoireDataset(repertoires=[first, second])

    label_config = LabelConfiguration()
    label_config.add_label("T1D", ["T1D", "CTL"])

    params = EncoderParams(
        model={},
        learn_model=True,
        result_path=tmp_dir,
        label_config=label_config,
        filename="dataset.pkl")

    encoder = Word2VecEncoder.build_object(
        dataset, k=3, model_type="sequence", vector_size=16)
    encoded_dataset = encoder.encode(dataset=dataset, params=params)

    encoded = encoded_dataset.encoded_data
    self.assertIsNotNone(encoded)
    self.assertTrue(encoded.examples.shape[0] == 2)
    self.assertTrue(encoded.examples.shape[1] == 16)

    t1d_labels = encoded.labels["T1D"]
    self.assertTrue(len(t1d_labels) == 2)
    self.assertTrue(t1d_labels[0] == "T1D")
    self.assertTrue(isinstance(encoder, W2VRepertoireEncoder))

    shutil.rmtree(tmp_dir)
def test_match_repertoire(self):
    """Match one repertoire against two reference sequences and check both summary types."""
    tmp_dir = EnvironmentSettings.root_path / "test/tmp/seqmatchrep/"
    PathBuilder.build(tmp_dir)

    # (amino acid sequence, identifier, count) of the repertoire's sequences
    repertoire_specs = [
        ("AAAAAA", "1", 3),
        ("CCCCCC", "2", 2),
        ("AAAACC", "3", 1),
        ("TADQVF", "4", 4),
    ]
    repertoire = Repertoire.build_from_sequence_objects(
        sequence_objects=[
            ReceptorSequence(amino_acid_sequence=amino_acids,
                             identifier=sequence_id,
                             metadata=SequenceMetadata(chain="A", count=count))
            for amino_acids, sequence_id, count in repertoire_specs
        ],
        metadata={"CD": True},
        path=tmp_dir)

    reference_sequences = [
        ReceptorSequence(amino_acids, metadata=SequenceMetadata(chain="A"))
        for amino_acids in ("AAAACA", "TADQV")
    ]

    matcher = SequenceMatcher()
    result = matcher.match_repertoire(
        repertoire, 0, reference_sequences, 2, SequenceMatchingSummaryType.COUNT)

    for key in ("sequences", "repertoire", "repertoire_index"):
        self.assertTrue(key in result)

    matched = result["sequences"]
    self.assertEqual(4, len(matched))

    # expected number of matching reference sequences per repertoire sequence
    for position, expected_matches in enumerate([1, 0, 1, 1]):
        self.assertEqual(expected_matches,
                         len(matched[position]["matching_sequences"]))

    with_any_match = sum(
        1 for entry in result["sequences"] if len(entry["matching_sequences"]) > 0)
    self.assertEqual(3, with_any_match)

    self.assertTrue(result["metadata"]["CD"])

    result = matcher.match_repertoire(
        repertoire, 0, reference_sequences, 2,
        SequenceMatchingSummaryType.CLONAL_PERCENTAGE)
    self.assertEqual(0.8, result["clonal_percentage"])

    shutil.rmtree(tmp_dir)
def test_match(self):
    """Match a one-repertoire dataset against two reference sequences and inspect the result dict."""
    tmp_dir = EnvironmentSettings.root_path / "test/tmp/seqmatch/"
    PathBuilder.build(tmp_dir)

    def chain_a_metadata(j_gene):
        # every sequence in this test shares chain "A" and v_gene "V1"
        return SequenceMetadata(chain="A", v_gene="V1", j_gene=j_gene)

    # (amino acid sequence, j_gene, identifier)
    repertoire_specs = [
        ("AAAAAA", "J2", "3"),
        ("CCCCCC", "J2", "4"),
        ("AAAACC", "J2", "5"),
        ("TADQVF", "J3", "6"),
    ]
    repertoire = Repertoire.build_from_sequence_objects(
        sequence_objects=[
            ReceptorSequence(amino_acid_sequence=amino_acids,
                             metadata=chain_a_metadata(j_gene),
                             identifier=sequence_id)
            for amino_acids, j_gene, sequence_id in repertoire_specs
        ],
        metadata={"CD": True},
        path=tmp_dir)
    dataset = RepertoireDataset(repertoires=[repertoire])

    reference_sequences = [
        ReceptorSequence("AAAACA", metadata=chain_a_metadata("J2"), identifier="1"),
        ReceptorSequence("TADQV", metadata=chain_a_metadata("J3"), identifier="2"),
    ]

    result = SequenceMatcher().match(
        dataset, reference_sequences, 2, SequenceMatchingSummaryType.PERCENTAGE)

    self.assertTrue("repertoires" in result)

    first_repertoire = result["repertoires"][0]
    self.assertEqual(1, len(first_repertoire["sequences"][3]["matching_sequences"]))
    self.assertTrue(result["repertoires"][0]["metadata"]["CD"])
    self.assertEqual(1, len(result["repertoires"]))

    shutil.rmtree(tmp_dir)
def _build_new_repertoire(self, sequences, repertoire_metadata, signal, path: Path) -> Repertoire:
    """Build a repertoire from ``sequences`` whose metadata marks ``signal`` as present.

    Args:
        sequences: sequence objects handed to Repertoire.build_from_sequence_objects.
        repertoire_metadata: metadata to start from; deep-copied so the
            caller's object is not modified. ``None`` means an empty dict.
        signal: object whose ``id`` becomes a metadata key set to True.
        path: location handed to Repertoire.build_from_sequence_objects.

    Returns:
        The newly built repertoire.
    """
    metadata = {} if repertoire_metadata is None else copy.deepcopy(repertoire_metadata)

    # when adding implant to a repertoire, only signal id is stored:
    # more detailed information is available in each receptor_sequence
    # (specific motif and motif instance)
    metadata[signal.id] = True

    return Repertoire.build_from_sequence_objects(sequences, path, metadata)
def test_create_sentences_from_repertoire(self):
    """Create 3-mer sentences from a three-sequence repertoire and check the first one."""
    tmp_dir = EnvironmentSettings.tmp_test_path / "kmer/"
    PathBuilder.build(tmp_dir)

    sequence_objects = [
        ReceptorSequence(amino_acid_sequence=amino_acids)
        for amino_acids in ("AACT", "ACCT", "AACT")
    ]
    repertoire = Repertoire.build_from_sequence_objects(sequence_objects, tmp_dir, {})

    sentences = KmerHelper.create_sentences_from_repertoire(
        repertoire, 3, sequence_type=SequenceType.AMINO_ACID)

    self.assertEqual(3, len(sentences))

    first_sentence = sentences[0]
    self.assertTrue(len(first_sentence) == 2
                    and "AAC" in first_sentence
                    and "ACT" in first_sentence)

    shutil.rmtree(tmp_dir)
def build(sequences: list, path: Path, labels: dict = None, seq_metadata: list = None, subject_ids: list = None):
    """Build one repertoire per entry of ``sequences`` and write a metadata.csv next to them.

    Args:
        sequences: list of lists of amino acid sequences, one inner list per repertoire.
        path: output directory; repertoires are built under ``path / "repertoires"``.
        labels: optional dict mapping a label name to one value per repertoire.
        seq_metadata: optional list (per repertoire) of lists (per sequence) of
            keyword dicts for SequenceMetadata; fixed default values are used when omitted.
        subject_ids: optional list with one subject id per repertoire; when
            omitted, ids of the form ``rep_<index>`` are generated.

    Returns:
        A ``(repertoires, metadata_csv_path)`` tuple.

    Raises:
        AssertionError: if the lengths of ``subject_ids`` or ``seq_metadata``
            do not line up with ``sequences``.
    """
    if subject_ids is not None:
        assert len(subject_ids) == len(sequences)

    if seq_metadata is not None:
        assert len(sequences) == len(seq_metadata)
        assert all(len(sequence_list) == len(seq_metadata[index])
                   for index, sequence_list in enumerate(sequences))

    PathBuilder.build(path)
    rep_path = PathBuilder.build(path / "repertoires")

    # Sequence metadata applied to every sequence when the caller supplies none.
    default_fields = dict(v_subgroup="TRBV1", v_gene="TRBV1-1", v_allele="TRBV1-1*01",
                          j_subgroup="TRBJ1", j_gene="TRBJ1-1", j_allele="TRBJ1-1*01",
                          count=1, chain="TRB", region_type="IMGT_CDR3")

    if subject_ids is None:
        subject_ids = []
    repertoires = []

    for rep_index, sequence_list in enumerate(sequences):
        rep_sequences = ReceptorSequenceList()

        # Top up the subject ids with a generated one while the list is still short.
        if len(subject_ids) < len(sequences):
            subject_ids.append("rep_" + str(rep_index))

        for seq_index, sequence in enumerate(sequence_list):
            if seq_metadata is None:
                fields = default_fields
            else:
                fields = seq_metadata[rep_index][seq_index]
            rep_sequences.append(ReceptorSequence(amino_acid_sequence=sequence,
                                                  metadata=SequenceMetadata(**fields),
                                                  identifier=str(seq_index)))

        if labels is None:
            metadata = {}
        else:
            metadata = {name: values[rep_index] for name, values in labels.items()}
        metadata["subject_id"] = subject_ids[rep_index]

        repertoires.append(Repertoire.build_from_sequence_objects(
            rep_sequences, rep_path, metadata, filename_base=f"rep_{rep_index}"))

    columns = {
        "filename": [repertoire.data_filename for repertoire in repertoires],
        "subject_id": subject_ids,
        "repertoire_identifier": [repertoire.identifier for repertoire in repertoires],
    }
    if labels is not None:
        columns.update(labels)

    metadata_file = path / "metadata.csv"
    pd.DataFrame(columns).to_csv(metadata_file, index=False)

    return repertoires, path / "metadata.csv"
def _process_repertoire(index, repertoire, current_implanting, simulation_state) -> Repertoire:
    """Return the repertoire for one simulation step, with or without implanted signals.

    Args:
        index: forwarded to SignalImplanter._implant_in_repertoire.
        repertoire: source repertoire; its ``sequences`` and ``metadata`` are read.
        current_implanting: implanting to apply, or ``None`` to leave the
            repertoire without any signal.
        simulation_state: provides ``result_path`` and ``signals``.

    Returns:
        The implanted repertoire when ``current_implanting`` is given; otherwise
        a new repertoire built under ``result_path / "repertoires"`` whose
        metadata has ``signal_<id>`` set to False for every signal.
    """
    if current_implanting is not None:
        return SignalImplanter._implant_in_repertoire(
            index, repertoire, current_implanting, simulation_state)

    # Assemble the complete metadata *before* building the repertoire: the
    # flags are then part of whatever build_from_sequence_objects does with the
    # metadata, and the source repertoire's metadata object is never mutated.
    metadata = dict(repertoire.metadata or {})
    for signal in simulation_state.signals:
        metadata[f"signal_{signal.id}"] = False

    return Repertoire.build_from_sequence_objects(
        repertoire.sequences,
        simulation_state.result_path / "repertoires",
        metadata)
def implant_in_repertoire(self, repertoire: Repertoire, repertoire_implanting_rate: float, signal, path: Path):
    """Build a new repertoire from sequences created for ``signal``.

    Args:
        repertoire: source repertoire; its sequences, identifier and metadata are read.
        repertoire_implanting_rate: fraction of the repertoire size that
            determines how many new sequences are created (rounded up).
        signal: signal whose motifs must not contain "/" in their seed.
        path: location handed to Repertoire.build_from_sequence_objects.

    Returns:
        A repertoire built from the new sequences, with a deep copy of the
        source metadata in which ``signal_<id>`` is set to True.

    Raises:
        AssertionError: if a motif seed contains "/" or if the computed
            number of new sequences is not positive.
    """
    assert not any("/" in motif.seed for motif in signal.motifs), \
        f'FullSequenceImplanting: motifs cannot include gaps. Check motifs {[motif.identifier for motif in signal.motifs]}.'

    source_sequences = repertoire.sequences
    implant_count = math.ceil(len(source_sequences) * repertoire_implanting_rate)

    assert implant_count > 0, \
        f"FullSequenceImplanting: there are too few sequences ({len(source_sequences)}) in the repertoire with identifier {repertoire.identifier} " \
        f"to have the given repertoire implanting rate ({repertoire_implanting_rate}). Please consider increasing the repertoire implanting rate."

    implanted_sequences = self._create_new_sequences(source_sequences, implant_count, signal)

    # Work on a copy so the source repertoire's metadata stays untouched.
    implanted_metadata = copy.deepcopy(repertoire.metadata)
    implanted_metadata[f"signal_{signal.id}"] = True

    return Repertoire.build_from_sequence_objects(implanted_sequences, path, implanted_metadata)
def _process_repertoire(index, repertoire, current_implanting, simulation_state, output_path: Path = None) -> Repertoire:
    """Return the repertoire for one simulation step, with or without implanted signals.

    Args:
        index: forwarded to SignalImplanter._implant_in_repertoire.
        repertoire: source repertoire; its ``sequences`` and ``metadata`` are read.
        current_implanting: implanting to apply, or ``None`` for no signal.
        simulation_state: provides ``result_path`` and ``signals``.
        output_path: accepted but not used by this function.

    Returns:
        The implanted repertoire, or a new repertoire built under
        ``result_path / "repertoires"`` whose metadata maps every signal id to False.
    """
    if current_implanting is not None:
        return SignalImplanter._implant_in_repertoire(
            index, repertoire, current_implanting, simulation_state)

    # Copy the source metadata and flag every signal as absent.
    flagged_metadata = dict(repertoire.metadata)
    for signal in simulation_state.signals:
        flagged_metadata[f"{signal.id}"] = False

    return Repertoire.build_from_sequence_objects(
        repertoire.sequences,
        simulation_state.result_path / "repertoires",
        metadata=flagged_metadata)
def test_encode(self):
    """Encode one dataset with three KmerFrequencyEncoder configurations and spot-check values."""
    tmp_dir = EnvironmentSettings.root_path / "test/tmp/kmerfreqenc/"
    PathBuilder.build(tmp_dir)

    def make_repertoire(specs, metadata):
        # specs: (sequence, identifier) pairs for one repertoire
        return Repertoire.build_from_sequence_objects(
            [ReceptorSequence(sequence, identifier=sequence_id)
             for sequence, sequence_id in specs],
            metadata=metadata,
            path=tmp_dir)

    first = make_repertoire([("AAA", "1"), ("ATA", "2"), ("ATA", '3')],
                            {"l1": 1, "l2": 2, "subject_id": "1"})
    second = make_repertoire([("ATA", "1"), ("TAA", "2"), ("AAC", "3")],
                             {"l1": 0, "l2": 3, "subject_id": "2"})

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 2])
    label_config.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[first, second])

    def make_encoder(normalization, sequence_encoding):
        # all three configurations use unique reads and k=3
        return KmerFrequencyEncoder.build_object(
            dataset,
            normalization_type=normalization.name,
            reads=ReadsType.UNIQUE.name,
            sequence_encoding=sequence_encoding.name,
            k=3)

    encoder = make_encoder(NormalizationType.RELATIVE_FREQUENCY,
                           SequenceEncodingType.IDENTITY)
    d1 = encoder.encode(dataset, EncoderParams(
        result_path=tmp_dir / "1/", label_config=label_config,
        learn_model=True, model={}, filename="dataset.pkl"))

    encoder = make_encoder(NormalizationType.RELATIVE_FREQUENCY,
                           SequenceEncodingType.CONTINUOUS_KMER)
    d2 = encoder.encode(dataset, EncoderParams(
        result_path=tmp_dir / "2/", label_config=label_config, pool_size=2,
        learn_model=True, model={}, filename="dataset.csv"))

    encoder3 = make_encoder(NormalizationType.BINARY,
                            SequenceEncodingType.CONTINUOUS_KMER)
    d3 = encoder3.encode(dataset, EncoderParams(
        result_path=tmp_dir / "3/", label_config=label_config,
        learn_model=True, model={}, filename="dataset.pkl"))

    # The temporary files are removed before the assertions run.
    shutil.rmtree(tmp_dir)

    self.assertTrue(isinstance(d1, RepertoireDataset))
    self.assertTrue(isinstance(d2, RepertoireDataset))
    self.assertEqual(0.67, np.round(d2.encoded_data.examples[0, 2], 2))
    self.assertEqual(0.0, np.round(d3.encoded_data.examples[0, 1], 2))
    self.assertTrue(isinstance(encoder, KmerFrequencyEncoder))
def _store_repertoire(self, repertoire, sequences):
    """Build a repertoire from ``sequences`` under ``self.result_path``, reusing ``repertoire.metadata``."""
    return Repertoire.build_from_sequence_objects(
        sequences, self.result_path, repertoire.metadata)
def test_run(self):
    """Run SignalImplanter on ten identical repertoires and check the implanted signal counts.

    Two implantings are simulated: ``i1`` (signals s1 and s2) and ``i2``
    (signal s2 only). The test expects 10 output repertoires, every one
    carrying both signal ids in its metadata, with s2 set in 4 of them and s1
    in 2, and every repertoire data file listed among the dataset filenames.
    """
    path = EnvironmentSettings.tmp_test_path / "signalImplanter/"
    os.makedirs(path, exist_ok=True)

    sequences = [ReceptorSequence("ACDEFG", identifier=sequence_id)
                 for sequence_id in ("1", "2", "3", "4")]
    repertoires = [
        Repertoire.build_from_sequence_objects(
            sequence_objects=sequences, path=path, metadata={})
        for _ in range(10)
    ]
    dataset = RepertoireDataset(repertoires=repertoires)

    m1 = Motif(identifier="m1", instantiation=GappedKmerInstantiation(), seed="CAS")
    m2 = Motif(identifier="m2", instantiation=GappedKmerInstantiation(), seed="CCC")

    s1 = Signal(identifier="s1", motifs=[m1],
                implanting_strategy=HealthySequenceImplanting(
                    GappedMotifImplanting(),
                    implanting_computation=ImplantingComputation.ROUND))
    s2 = Signal(identifier="s2", motifs=[m1, m2],
                implanting_strategy=HealthySequenceImplanting(
                    GappedMotifImplanting(),
                    implanting_computation=ImplantingComputation.ROUND))

    simulation = Simulation([
        Implanting(dataset_implanting_rate=0.2, repertoire_implanting_rate=0.5,
                   signals=[s1, s2], name="i1"),
        Implanting(dataset_implanting_rate=0.2, repertoire_implanting_rate=0.5,
                   signals=[s2], name="i2"),
    ])

    input_params = SimulationState(dataset=dataset, result_path=path,
                                   simulation=simulation, signals=[s1, s2],
                                   formats=["ImmuneML"])

    new_dataset = SignalImplanter.run(input_params)

    # Read the resulting repertoires once and reuse them for every check.
    implanted = list(new_dataset.get_data(batch_size=10))

    self.assertEqual(10, len(new_dataset.get_example_ids()))

    # Check that the keys exist before indexing, so a missing key is reported
    # as an assertion failure rather than a KeyError.
    self.assertTrue(all(s1.id in rep.metadata.keys() for rep in implanted))
    self.assertTrue(all(s2.id in rep.metadata.keys() for rep in implanted))

    reps_with_s2 = sum(rep.metadata[s2.id] is True for rep in implanted)
    reps_with_s1 = sum(rep.metadata[s1.id] is True for rep in implanted)

    # assertEqual shows the actual count when the expectation is not met.
    self.assertEqual(4, reps_with_s2)
    self.assertEqual(2, reps_with_s1)

    metadata_filenames = [filename.name for filename in new_dataset.get_filenames()]
    self.assertTrue(all(repertoire.data_filename.name in metadata_filenames
                        for repertoire in new_dataset.repertoires))

    shutil.rmtree(path)
def store_repertoire(path, repertoire, sequences):
    """Build a repertoire from ``sequences`` under ``path``, reusing ``repertoire.metadata``."""
    return Repertoire.build_from_sequence_objects(
        sequence_objects=sequences, path=path, metadata=repertoire.metadata)
def test_find_label_associated_sequence_p_values(self):
    """Compute label-associated p-values over five hand-written comparison batches.

    Four repertoires are used (two with l1 == True, two with l1 == False);
    each batch matrix has one row per sequence item and one column per repertoire.
    """
    path = EnvironmentSettings.tmp_test_path / "comparison_data_find_label_assocseqpvalues/"
    PathBuilder.build(path)

    label_values = [True, True, False, False]
    subject_ids = ["rep_0", "rep_1", "rep_2", "rep_3"]
    repertoires = [
        Repertoire.build_from_sequence_objects(
            [ReceptorSequence()], path, {"l1": val, "subject_id": subject_id})
        for val, subject_id in zip(label_values, subject_ids)
    ]

    # repertoire identifier -> column index in the batch matrices
    col_name_index = {repertoire.identifier: index
                      for index, repertoire in enumerate(repertoires)}

    comparison_data = ComparisonData(
        repertoire_ids=[repertoire.identifier for repertoire in repertoires],
        comparison_attributes=["sequence_aas"],
        sequence_batch_size=4,
        path=path)

    # (presence matrix, sequence items) per batch, in batch-identifier order
    batch_specs = [
        (np.array([[1., 0., 0., 0.], [1., 1., 0., 0.]]), [('GGG', ), ('III', )]),
        (np.array([[1., 1., 0., 1.], [1., 1., 1., 1.]]), [('LLL', ), ('MMM', )]),
        (np.array([[0., 1., 0., 0.], [0., 1., 0., 1.]]), [('DDD', ), ('EEE', )]),
        (np.array([[0., 1., 1., 1.], [0., 0., 1., 1.]]), [('FFF', ), ('CCC', )]),
        (np.array([[0., 0., 0., 1.]]), [('AAA', )]),
    ]
    comparison_data.batches = [
        ComparisonDataBatch(matrix=matrix,
                            items=items,
                            repertoire_index_mapping=col_name_index,
                            path=path,
                            identifier=identifier)
        for identifier, (matrix, items) in enumerate(batch_specs)
    ]

    p_values = SequenceFilterHelper.find_label_associated_sequence_p_values(
        comparison_data, repertoires,
        Label('l1', [True, False], positive_class=True))

    # NOTE(review): the last expected entry is the literal 2 rather than
    # SequenceFilterHelper.INVALID_P_VALUE - confirm this is intentional.
    expected_p_values = [
        SequenceFilterHelper.INVALID_P_VALUE, 0.1666666666666667,
        0.5000000000000001, 1., SequenceFilterHelper.INVALID_P_VALUE,
        0.8333333333333331, 1., 1., 2
    ]
    self.assertTrue(np.allclose(expected_p_values, p_values, equal_nan=True))

    shutil.rmtree(path)
def test_encode(self):
    """Encode two repertoires with EvennessProfileEncoder and check the first two profile values."""
    tmp_dir = EnvironmentSettings.root_path / "test/tmp/evennessenc/"
    PathBuilder.build(tmp_dir)

    def thousand_sequences(count):
        # 1000 "AAA" sequences that all carry the same count
        return [ReceptorSequence("AAA", metadata=SequenceMetadata(count=count))
                for _ in range(1000)]

    mixed_counts = Repertoire.build_from_sequence_objects(
        sequence_objects=thousand_sequences(10) + thousand_sequences(100) + thousand_sequences(1),
        metadata={"l1": "test_1", "l2": 2},
        path=tmp_dir)
    uniform_counts = Repertoire.build_from_sequence_objects(
        sequence_objects=thousand_sequences(10),
        metadata={"l1": "test_2", "l2": 3},
        path=tmp_dir)

    label_config = LabelConfiguration()
    label_config.add_label("l1", ["test_1", "test_2"])
    label_config.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[mixed_counts, uniform_counts])

    encoder = EvennessProfileEncoder.build_object(
        dataset, min_alpha=0, max_alpha=10, dimension=51)
    d1 = encoder.encode(
        dataset, EncoderParams(result_path=tmp_dir / "1/", label_config=label_config))

    # Second run with a smaller dimension and pool_size=2; its result is not inspected.
    encoder = EvennessProfileEncoder.build_object(
        dataset, min_alpha=0, max_alpha=10, dimension=11)
    d2 = encoder.encode(
        dataset, EncoderParams(result_path=tmp_dir, label_config=label_config, pool_size=2))

    examples = d1.encoded_data.examples
    self.assertAlmostEqual(examples[0, 0], 1)
    self.assertAlmostEqual(d1.encoded_data.examples[0, 1], 0.786444)
    self.assertAlmostEqual(d1.encoded_data.examples[1, 0], 1)
    self.assertAlmostEqual(d1.encoded_data.examples[1, 1], 1)

    shutil.rmtree(tmp_dir)