def test_encode_sequence(self):
    """Check that IdentitySequenceEncoder encodes sequence "AAA" as ["AAA"].

    The same expectation is asserted for each of the frame types
    "OUT", "STOP" and "IN".
    """
    for frame_type in ("OUT", "STOP", "IN"):
        # subTest reports every failing frame type instead of stopping at the first one
        with self.subTest(frame_type=frame_type):
            sequence = ReceptorSequence(amino_acid_sequence="AAA",
                                        metadata=SequenceMetadata(frame_type=frame_type))
            enc = IdentitySequenceEncoder()
            params = EncoderParams(model={}, label_config=LabelConfiguration(), result_path="")

            self.assertEqual(["AAA"], enc.encode_sequence(sequence, params))
def test_run(self):
    """Fit a logistic regression on k-mer frequencies and apply it via MLApplicationInstruction.

    The instruction is run on the same random dataset (generated with 50 as the
    first argument) that was used for fitting, and the resulting predictions.csv
    must contain 50 rows.
    """
    path = EnvironmentSettings.tmp_test_path / "mlapplicationtest/"
    PathBuilder.build(path)

    label_spec = {"l1": {1: 0.5, 2: 0.5}}
    dataset = RandomDatasetGenerator.generate_repertoire_dataset(50, {5: 1}, {5: 1}, label_spec,
                                                                 path / 'dataset/')
    ml_method = LogisticRegression()
    encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY, ReadsType.UNIQUE,
                                        SequenceEncodingType.CONTINUOUS_KMER, 3,
                                        scale_to_zero_mean=True, scale_to_unit_variance=True)
    label_config = LabelConfiguration([Label("l1", [1, 2])])

    encoding_params = EncoderParams(result_path=path, label_config=label_config,
                                    filename="tmp_enc_dataset.pickle", pool_size=4)
    enc_dataset = encoder.encode(dataset, encoding_params)
    ml_method.fit(enc_dataset.encoded_data, 'l1')

    encoder_spec = {"normalization_type": "relative_frequency", "reads": "unique",
                    "sequence_encoding": "continuous_kmer", "k": 3,
                    "scale_to_zero_mean": True, "scale_to_unit_variance": True}
    hp_setting = HPSetting(encoder, encoder_spec, ml_method, {}, [], 'enc1', 'ml1')

    # copy the vectorizer and scaler pickles into the instruction's result directory
    # (presumably so the instruction can reuse the fitted encoder state - TODO confirm)
    PathBuilder.build(path / 'result/instr1/')
    shutil.copy(path / 'dict_vectorizer.pickle', path / 'result/instr1/dict_vectorizer.pickle')
    shutil.copy(path / 'scaler.pickle', path / 'result/instr1/scaler.pickle')

    ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
    ml_app.run(path / 'result/')

    predictions_path = path / "result/instr1/predictions.csv"
    self.assertTrue(os.path.isfile(predictions_path))

    predictions = pd.read_csv(predictions_path)
    self.assertEqual(50, predictions.shape[0])

    shutil.rmtree(path)
def _run_test(self, compairr_path):
    """Encode the test dataset with CompAIRRDistanceEncoder and verify shape, zeros and labels.

    Arguments:
        compairr_path: value forwarded to the encoder as its "compairr_path" argument.
    """
    path = EnvironmentSettings.tmp_test_path / "compairr_distance_encoder/"
    PathBuilder.build(path)

    dataset = self.create_dataset(path)

    encoder_args = {"compairr_path": compairr_path, "keep_compairr_input": True, "differences": 0,
                    "indels": False, "ignore_counts": False, "threads": 8, "ignore_genes": False}
    enc = CompAIRRDistanceEncoder.build_object(dataset, **encoder_args)
    enc.set_context({"dataset": dataset})

    label_config = LabelConfiguration([Label("l1", [0, 1]), Label("l2", [2, 3])])
    encoded = enc.encode(dataset, EncoderParams(result_path=path, label_config=label_config,
                                                pool_size=4, filename="dataset.pkl"))

    # the encoded matrix is expected to be 8 x 8
    for axis in (0, 1):
        self.assertEqual(8, encoded.encoded_data.examples.shape[axis])

    # entries expected to be exactly zero
    for row, column in ((0, 0), (1, 1), (0, 4)):
        self.assertEqual(0, encoded.encoded_data.examples[row, column])

    self.assertTrue(np.array_equal([1, 0, 1, 0, 1, 0, 1, 0], encoded.encoded_data.labels["l1"]))
    self.assertTrue(np.array_equal([2, 3, 2, 3, 2, 3, 3, 3], encoded.encoded_data.labels["l2"]))

    shutil.rmtree(path)
def test_repertoire_flattened(self):
    """Check the flattened one-hot encoding of a repertoire dataset.

    Each example is compared against a concatenation of 20-element one-hot
    blocks, and feature names must follow the "{seq}_{pos}_{char}" pattern.
    """
    path = EnvironmentSettings.root_path / "test/tmp/onehot_recep_flat/"
    PathBuilder.build(path)

    dataset, lc = self._construct_test_repertoiredataset(path, positional=False)

    encoder_args = {"use_positional_info": False, "distance_to_seq_middle": None, "flatten": True}
    encoder = OneHotEncoder.build_object(dataset, **encoder_args)

    params = EncoderParams(result_path=path, label_config=lc, pool_size=1, learn_model=True,
                           model={}, filename="dataset.pkl")
    encoded_data = encoder.encode(dataset, params)

    self.assertTrue(isinstance(encoded_data, RepertoireDataset))

    onehot_a = [1.0] + [0.0] * 19
    onehot_t = [0.0] * 16 + [1.0] + [0] * 3
    onehot_empty = [0] * 20

    # expected examples written as sequences of one-hot blocks, flattened below
    expected_blocks = [
        [onehot_a, onehot_a, onehot_a, onehot_a,
         onehot_a, onehot_t, onehot_a, onehot_empty,
         onehot_a, onehot_t, onehot_a, onehot_empty],
        [onehot_a, onehot_t, onehot_a, onehot_empty,
         onehot_t, onehot_a, onehot_a, onehot_empty,
         onehot_empty, onehot_empty, onehot_empty, onehot_empty],
    ]
    for example_index, blocks in enumerate(expected_blocks):
        flattened = [value for block in blocks for value in block]
        self.assertListEqual(list(encoded_data.encoded_data.examples[example_index]), flattened)

    expected_names = [f"{seq}_{pos}_{char}"
                      for seq in range(3)
                      for pos in range(4)
                      for char in EnvironmentSettings.get_sequence_alphabet()]
    self.assertListEqual(list(encoded_data.encoded_data.feature_names), expected_names)

    shutil.rmtree(path)
def test__encode_new_dataset(self):
    """Encode the dummy dataset with MatchedReceptorsEncoder and verify all outputs.

    Checked are the example matrix, the labels, the feature names and each
    column of the feature annotations.
    """
    path = EnvironmentSettings.root_path / "test/tmp/matched_receptors_encoder/"

    dataset, label_config, reference_receptors, labels = self.create_dummy_data(path)

    encoder = MatchedReceptorsEncoder.build_object(dataset, **{"reference": reference_receptors,
                                                               "max_edit_distances": 0})

    params = EncoderParams(result_path=path, label_config=label_config, filename="dataset.csv")
    encoded = encoder.encode(dataset, params)

    expected_outcome = [[10, 0, 0, 0], [0, 10, 0, 0], [5, 0, 5, 0], [0, 5, 0, 5], [1, 1, 2, 2]]
    for index, expected_row in enumerate(expected_outcome):
        self.assertListEqual(list(encoded.encoded_data.examples[index]), expected_row)

    expected_labels = {"label": ["yes", "yes", "no", "no", "no"],
                       "subject_id": ["subject_1", "subject_1", "subject_2", "subject_2", "subject_3"]}
    self.assertDictEqual(encoded.encoded_data.labels, expected_labels)

    self.assertListEqual(encoded.encoded_data.feature_names,
                         ["100-A0-B0.alpha", "100-A0-B0.beta", "200-A0-B0.alpha", "200-A0-B0.beta"])

    self.assertListEqual(list(encoded.encoded_data.feature_annotations.receptor_id),
                         ["100-A0-B0", "100-A0-B0", "200-A0-B0", "200-A0-B0"])
    self.assertListEqual(list(encoded.encoded_data.feature_annotations.clonotype_id),
                         [100, 100, 200, 200])
    self.assertListEqual(list(encoded.encoded_data.feature_annotations.chain),
                         ["alpha", "beta", "alpha", "beta"])
    self.assertListEqual(list(encoded.encoded_data.feature_annotations.sequence),
                         ["AAAA", "SSSS", "CCCC", "TTTT"])
    self.assertListEqual(list(encoded.encoded_data.feature_annotations.v_gene), ["V1"] * 4)
    self.assertListEqual(list(encoded.encoded_data.feature_annotations.j_gene), ["J1"] * 4)

    shutil.rmtree(path)
def test_encode(self):
    """Encode a random repertoire dataset with AtchleyKmerEncoder and check the output.

    The encoded examples must have shape (3, 11, 3) and the value at
    index [0, -1, 0] must be 0.
    """
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "atchley_kmer_encoding/")

    label_spec = {"l1": {True: 0.4, False: 0.6}}
    dataset = RandomDatasetGenerator.generate_repertoire_dataset(3, {1: 1}, {4: 1}, label_spec,
                                                                 path / "dataset")

    encoder_args = {
        "k": 2,
        "skip_first_n_aa": 1,
        "skip_last_n_aa": 1,
        "abundance": "RELATIVE_ABUNDANCE",
        "normalize_all_features": False,
    }
    encoder = AtchleyKmerEncoder.build_object(dataset, **encoder_args)

    params = EncoderParams(path / "result", LabelConfiguration(labels=[Label("l1")]))
    encoded_dataset = encoder.encode(dataset, params)

    self.assertEqual((3, 11, 3), encoded_dataset.encoded_data.examples.shape)
    self.assertEqual(0., encoded_dataset.encoded_data.examples[0, -1, 0])

    shutil.rmtree(path)
def get_encoded_repertoire(self, repertoire, params: EncoderParams):
    """Return the encoding of `repertoire`, going through CacheHandler.memo_by_params.

    Note that `params.model` is overwritten with `vars(self)` before the
    cache key is built, so the caller's `params` object is modified.

    Arguments:
        repertoire: object exposing `identifier`; passed on to `self.encode_repertoire`.
        params: encoder parameters; `label_config` is read and `model` is overwritten.

    Returns:
        whatever CacheHandler.memo_by_params returns for this key.
    """
    params.model = vars(self)

    cache_key = (
        ("encoding_model", params.model),
        ("type", "kmer_encoding"),
        ("labels", params.label_config.get_labels_by_name()),
        ("repertoire_id", repertoire.identifier),
    )

    def compute():
        return self.encode_repertoire(repertoire, params)

    return CacheHandler.memo_by_params(cache_key, compute, CacheObjectType.ENCODING_STEP)
def encode_dataset(dataset, hp_setting: HPSetting, path: Path, learn_model: bool, context: dict, number_of_processes: int, label_configuration: LabelConfiguration, encode_labels: bool = True, store_encoded_data: bool = False):
    """Encode `dataset` with the encoder from `hp_setting` via DataEncoder.run.

    Arguments:
        dataset: dataset to encode.
        hp_setting: supplies the `encoder` and its `encoder_params`.
        path: result directory; built with PathBuilder before encoding.
        learn_model: forwarded to EncoderParams; also selects the output filename
            ("train_dataset.pkl" when true, "test_dataset.pkl" otherwise).
        context: not used inside this function.
        number_of_processes: forwarded to EncoderParams as `pool_size`.
        label_configuration: forwarded to EncoderParams as `label_config`.
        encode_labels: forwarded to EncoderParams.
        store_encoded_data: forwarded to DataEncoderParams.

    Returns:
        the result of DataEncoder.run.
    """
    PathBuilder.build(path)

    output_filename = "train_dataset.pkl" if learn_model else "test_dataset.pkl"

    encoder_params = EncoderParams(model=hp_setting.encoder_params,
                                   result_path=path,
                                   pool_size=number_of_processes,
                                   label_config=label_configuration,
                                   learn_model=learn_model,
                                   filename=output_filename,
                                   encode_labels=encode_labels)

    run_params = DataEncoderParams(dataset=dataset,
                                   encoder=hp_setting.encoder,
                                   encoder_params=encoder_params,
                                   store_encoded_data=store_encoded_data)

    return DataEncoder.run(run_params)
def test_encode_no_v_no_count(self):
    """Encode with MatchedRegexEncoder with "match_v_genes" and "sum_counts" both False.

    Verifies the example matrix, the feature names and the example ids.
    """
    path = EnvironmentSettings.root_path / "test/tmp/regex_matches_encoder/"

    dataset, label_config, motif_filepath, labels = self.create_dummy_data(path)

    encoder_args = {"motif_filepath": motif_filepath, "match_v_genes": False, "sum_counts": False}
    encoder = MatchedRegexEncoder.build_object(dataset, **encoder_args)

    params = EncoderParams(result_path=path, label_config=label_config, filename="dataset.csv")
    encoded = encoder.encode(dataset, params)

    expected_outcome = [[2, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
    for index, expected_row in enumerate(expected_outcome):
        self.assertListEqual(list(encoded.encoded_data.examples[index]), expected_row)

    self.assertListEqual(["1_IGL", "1_IGH", "2_IGH", "3_IGL"], encoded.encoded_data.feature_names)
    self.assertListEqual(["subject_1", "subject_2", "subject_3"], encoded.encoded_data.example_ids)

    shutil.rmtree(path)
def test_generate(self):
    """Import a paired-chain dataset, encode it with TCRdistEncoder and run TCRdistMotifDiscovery.

    The test makes no assertions on the report content; it passes when
    `_generate()` completes without raising.
    """
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "tcrdist_motif_discovery/")
    dataset_path = self._create_dataset(path)

    import_params = {
        "path": dataset_path,
        "result_path": path / "dataset/",
        "separator": ",",
        "columns_to_load": ["subject", "epitope", "count", "v_a_gene", "j_a_gene", "cdr3_a_aa",
                            "v_b_gene", "j_b_gene", "cdr3_b_aa", "clone_id", "cdr3_a_nucseq",
                            "cdr3_b_nucseq"],
        "column_mapping": {
            "cdr3_a_aa": "alpha_amino_acid_sequence",
            "cdr3_b_aa": "beta_amino_acid_sequence",
            "cdr3_a_nucseq": "alpha_nucleotide_sequence",
            "cdr3_b_nucseq": "beta_nucleotide_sequence",
            "v_a_gene": "alpha_v_gene",
            "v_b_gene": "beta_v_gene",
            "j_a_gene": "alpha_j_gene",
            "j_b_gene": "beta_j_gene",
            "clone_id": "identifier",
        },
        "receptor_chains": "TRA_TRB",
        "region_type": "IMGT_CDR3",
        "sequence_file_size": 50000,
        "organism": "mouse",
    }
    dataset = SingleLineReceptorImport.import_dataset(import_params, 'd1')

    encoder_params = EncoderParams(path / "result", LabelConfiguration([Label("epitope")]))
    dataset = TCRdistEncoder(8).encode(dataset, encoder_params)

    # the encoded dataset serves as both train and test dataset
    report = TCRdistMotifDiscovery(train_dataset=dataset, test_dataset=dataset,
                                   result_path=path / "report", name="report name", cores=8,
                                   positive_class_name="PA", min_cluster_size=3)
    report._generate()

    shutil.rmtree(path)
def _get_encoded_repertoire(self, repertoire, params: EncoderParams):
    """Return the encoding of `repertoire`, going through CacheHandler.memo_by_params.

    The cache key includes a SHA-256 digest of the repertoire attribute selected
    by `self.sequence_type.value`, in addition to the model, the labels and the
    repertoire identifier. `params.model` is overwritten with `vars(self)`, so
    the caller's `params` object is modified.

    Arguments:
        repertoire: object exposing `identifier` and `get_attribute`.
        params: encoder parameters; `label_config` is read and `model` is overwritten.

    Returns:
        whatever CacheHandler.memo_by_params returns for this key.
    """
    params.model = vars(self)

    labels = params.label_config.get_labels_by_name()
    repertoire_id = repertoire.identifier

    # hash the raw attribute values so that the key reflects the repertoire content
    raw_sequences = np.ascontiguousarray(repertoire.get_attribute(self.sequence_type.value))
    content_digest = hashlib.sha256(raw_sequences).hexdigest()

    cache_key = (
        ("encoding_model", params.model),
        ("labels", labels),
        ("repertoire_id", repertoire_id),
        ("repertoire_data", content_digest),
    )

    def compute():
        return self._encode_repertoire(repertoire, params)

    return CacheHandler.memo_by_params(cache_key, compute, CacheObjectType.ENCODING)
def test(self):
    """Encode four TCAB receptors with KmerFreqReceptorEncoder (k=3) and verify the result.

    Receptors "1" and "3" carry identical chains, so their encoded rows must be equal.
    """
    # (alpha sequence, beta sequence, identifier)
    chain_specs = [
        ("AAACCC", "AAACCC", "1"),
        ("AAA", "CCC", "2"),
        ("AAACCC", "AAACCC", "3"),
        ("AAA", "CCC", "4"),
    ]
    receptors = [TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence=alpha_aa),
                              beta=ReceptorSequence(amino_acid_sequence=beta_aa),
                              identifier=receptor_id)
                 for alpha_aa, beta_aa, receptor_id in chain_specs]

    path = EnvironmentSettings.tmp_test_path / "kmer_receptor_frequency/"
    PathBuilder.build(path / 'data')
    dataset = ReceptorDataset.build_from_objects(receptors, path=path, file_size=10)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])

    encoder_args = {
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "sequence_type": SequenceType.AMINO_ACID.name,
        "k": 3,
    }
    encoder = KmerFreqReceptorEncoder.build_object(dataset, **encoder_args)

    params = EncoderParams(result_path=path / "2/", label_config=lc, pool_size=2, learn_model=True,
                           model={}, filename="dataset.csv", encode_labels=False)
    encoded_dataset = encoder.encode(dataset, params)

    self.assertEqual(4, encoded_dataset.encoded_data.examples.shape[0])
    self.assertTrue(
        all(identifier in encoded_dataset.encoded_data.example_ids
            for identifier in ['1', '2', '3', '4']))
    self.assertTrue(
        numpy.array_equal(encoded_dataset.encoded_data.examples[0].A,
                          encoded_dataset.encoded_data.examples[2].A))
    self.assertTrue(
        all(feature_name in encoded_dataset.encoded_data.feature_names
            for feature_name in ["alpha_AAA", "alpha_AAC", "beta_CCC"]))

    shutil.rmtree(path)
def test_encode(self):
    """Encode four repertoires with SequenceAbundanceEncoder at two p-value thresholds.

    With threshold 0.4 the first two repertoires get a first-column value of 1;
    after lowering the threshold to 0.05 the first column is 0 for all of them.
    """
    path = EnvironmentSettings.tmp_test_path / "abundance_encoder/"
    PathBuilder.build(path)

    sequences_per_repertoire = [
        ["GGG", "III", "LLL", "MMM"],
        ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
        ["CCC", "FFF", "MMM"],
        ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"],
    ]
    repertoires, metadata = RepertoireBuilder.build(sequences_per_repertoire,
                                                    labels={"l1": [True, True, False, False]},
                                                    path=path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

    encoder_args = {
        "comparison_attributes": ["sequence_aas"],
        "p_value_threshold": 0.4,
        "sequence_batch_size": 4,
        "repertoire_batch_size": 8,
    }
    encoder = SequenceAbundanceEncoder.build_object(dataset, **encoder_args)

    label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

    encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))
    expected_relaxed = np.array([[1, 4], [1, 6], [0, 3], [0, 6]])
    self.assertTrue(np.array_equal(expected_relaxed, encoded_dataset.encoded_data.examples))

    # re-encode with a stricter threshold on the same encoder instance
    encoder.p_value_threshold = 0.05

    encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))
    expected_strict = np.array([[0, 4], [0, 6], [0, 3], [0, 6]])
    self.assertTrue(np.array_equal(expected_strict, encoded_dataset.encoded_data.examples))

    shutil.rmtree(path)
def test_encode_sequence(self):
    """Check IMGTKmerSequenceEncoder.encode_sequence for a long and a short sequence.

    For k=3 the result must match the expected set of "kmer///position" strings
    and contain len(sequence) - 3 + 1 entries; for k=25 on the 5-residue
    sequence the result must equal None.
    """
    long_expected = {
        'CAS///105', 'ASS///106', 'SSP///107', 'SPR///108', 'PRE///109', 'RER///110',
        'ERA///111', 'RAT///111.001', 'ATY///111.002', 'TYE///111.003', 'YEQ///111.004',
        'EQC///111.005', 'QCA///111.006', 'CAS///111.007', 'ASS///111.008',
        'SSP///111.009', 'SPR///111.01', 'PRE///111.011', 'RER///111.012',
        'ERA///111.013', 'RAT///112.013', 'ATY///112.012', 'TYE///112.011',
        'YEQ///112.01', 'EQC///112.009', 'QCA///112.008', 'CAS///112.007',
        'ASS///112.006', 'SSP///112.005', 'SPR///112.004', 'PRE///112.003',
        'RER///112.002', 'ERA///112.001', 'RAT///112', 'ATY///113', 'TYE///114',
        'YEQ///115'
    }
    short_expected = {'AHC///105', 'HCD///106', 'CDE///107'}

    sequence = ReceptorSequence("CASSPRERATYEQCASSPRERATYEQCASSPRERATYEQ", None, None)
    params = EncoderParams(model={"k": 3}, label_config=LabelConfiguration(), result_path="")
    result = IMGTKmerSequenceEncoder.encode_sequence(sequence, params)

    self.assertEqual(long_expected, set(result))
    self.assertEqual(len(result), len(sequence.get_sequence()) - 3 + 1)

    sequence = ReceptorSequence("AHCDE", None, None)
    params = EncoderParams(model={"k": 3}, label_config=LabelConfiguration(), result_path="")
    result = IMGTKmerSequenceEncoder.encode_sequence(sequence, params)

    self.assertEqual(short_expected, set(result))
    self.assertEqual(len(result), len(sequence.get_sequence()) - 3 + 1)

    # k larger than the sequence length
    params = EncoderParams(model={"k": 25}, label_config=LabelConfiguration(), result_path="")
    too_long_result = IMGTKmerSequenceEncoder.encode_sequence(sequence, params)
    self.assertEqual(too_long_result, None)
def _encode_sequence(self, sequence: ReceptorSequence, params: EncoderParams, sequence_encoder, counts):
    """Add the features of one sequence to the running `counts` and return it.

    `params.model` is overwritten with `vars(self)` before the sequence encoder
    is called. Each feature is incremented by 1 when `self.reads` is
    ReadsType.UNIQUE and by `sequence.metadata.count` when it is ReadsType.ALL;
    for any other value of `self.reads` nothing is added.

    Arguments:
        sequence: the receptor sequence to encode.
        params: encoder parameters handed to `sequence_encoder.encode_sequence`.
        sequence_encoder: object providing `encode_sequence(sequence, params)`.
        counts: mapping from feature to count, updated in place with `+=`.

    Returns:
        the same `counts` object.
    """
    params.model = vars(self)
    features = sequence_encoder.encode_sequence(sequence, params)

    # the sequence encoder may return None, in which case there is nothing to count
    if features is None:
        return counts

    for feature in features:
        if self.reads == ReadsType.UNIQUE:
            counts[feature] += 1
        elif self.reads == ReadsType.ALL:
            counts[feature] += sequence.metadata.count

    return counts
def _test_encode(self, compairr_path):
    """Encode with CompAIRRSequenceAbundanceEncoder for both `ignore_genes` settings.

    For each setting the dataset is encoded at p-value threshold 0.4 and then
    again at 0.05 on the same encoder instance, and the example matrices are
    compared to fixed expectations.

    Arguments:
        compairr_path: value forwarded to the encoder as its "compairr_path" argument.
    """
    path = EnvironmentSettings.tmp_test_path / "compairr_abundance_encoder/"
    PathBuilder.build(path)

    dataset = self._build_test_dataset(path)

    for ignore_genes in [True, False]:
        result_path = path / f"ignore_genes={ignore_genes}"

        encoder_args = {
            "p_value_threshold": 0.4,
            "compairr_path": compairr_path,
            "sequence_batch_size": 2,
            "ignore_genes": ignore_genes,
            "threads": 8,
        }
        encoder = CompAIRRSequenceAbundanceEncoder.build_object(dataset, **encoder_args)

        label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

        encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=result_path,
                                                                label_config=label_config))
        expected_relaxed = np.array([[1, 4], [1, 6], [0, 3], [0, 6]])
        self.assertTrue(np.array_equal(expected_relaxed, encoded_dataset.encoded_data.examples))

        # re-encode with a stricter threshold on the same encoder instance
        encoder.p_value_threshold = 0.05

        encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=result_path,
                                                                label_config=label_config))
        expected_strict = np.array([[0, 4], [0, 6], [0, 3], [0, 6]])
        self.assertTrue(np.array_equal(expected_strict, encoded_dataset.encoded_data.examples))

    shutil.rmtree(path)
def test(self):
    """Check the non-flattened, non-positional one-hot encoding of a sequence dataset.

    Verifies the dataset type, the one-hot rows of the first three examples,
    the example ids and the labels "l1" and "l2".
    """
    path = EnvironmentSettings.tmp_test_path / "onehot_sequence/"
    PathBuilder.build(path)

    dataset, lc = self._construct_test_dataset(path)

    encoder_args = {
        "use_positional_info": False,
        'sequence_type': 'amino_acid',
        "distance_to_seq_middle": None,
        "flatten": False,
    }
    encoder = OneHotEncoder.build_object(dataset, **encoder_args)

    params = EncoderParams(result_path=path / "encoded/", label_config=lc, learn_model=True,
                           model={}, filename="dataset.pkl")
    encoded_data = encoder.encode(dataset, params)

    self.assertTrue(isinstance(encoded_data, SequenceDataset))

    onehot_a = [1] + [0] * 19
    onehot_t = [0] * 16 + [1] + [0] * 3
    onehot_empty = [0] * 20

    expected_examples = [
        [onehot_a] * 4,
        [onehot_a, onehot_t, onehot_a, onehot_empty],
        [onehot_a, onehot_t, onehot_t, onehot_empty],
    ]
    for example_index, expected_rows in enumerate(expected_examples):
        actual_rows = [list(item) for item in encoded_data.encoded_data.examples[example_index]]
        self.assertListEqual(actual_rows, expected_rows)

    self.assertListEqual(encoded_data.encoded_data.example_ids,
                         [receptor.identifier for receptor in dataset.get_data()])

    expected_labels = {
        label_name: [receptor_seq.get_attribute(label_name) for receptor_seq in dataset.get_data()]
        for label_name in ("l1", "l2")
    }
    self.assertDictEqual(encoded_data.encoded_data.labels, expected_labels)

    shutil.rmtree(path)
def encode_dataset_by_kmer_freq(path_to_dataset_directory: str, result_path: str, metadata_path: str = None):
    """
    Encode a MiXCR repertoire dataset with KmerFrequencyEncoder and export it as csv.

    Arguments:
        path_to_dataset_directory (str): path to the directory containing all repertoire files
            with .tsv extension in MiXCR format
        result_path (str): where to store the results
        metadata_path (str): csv metadata file; it must include filename and subject_id columns
            and an arbitrary disease column. If None, a metadata file is created by
            generate_random_metadata (columns "filename", "subject_id", "disease" per the
            original description).

    Returns:
        encoded dataset with encoded data in encoded_dataset.encoded_data.examples
    """
    dataset_dir = Path(path_to_dataset_directory)
    output_dir = Path(result_path)

    metadata_file = generate_random_metadata(dataset_dir, output_dir) if metadata_path is None \
        else Path(metadata_path)

    import_params = {
        "is_repertoire": True,
        "path": dataset_dir,
        "metadata_file": metadata_file,
        "region_type": "IMGT_CDR3",  # import only the CDR3 region
        "number_of_processes": 4,  # number of parallel processes for loading the data
        "result_path": output_dir,
        "separator": "\t",
        "columns_to_load": ["cloneCount", "allVHitsWithScore", "allJHitsWithScore", "aaSeqCDR3", "nSeqCDR3"],
        "column_mapping": {
            "cloneCount": "counts",
            "allVHitsWithScore": "v_alleles",
            "allJHitsWithScore": "j_alleles"
        },
    }
    dataset = MiXCRImport().import_dataset(import_params, "mixcr_dataset")

    # the first label of the dataset is used for ML prediction - by default: "disease" with values True/False
    label_name = list(dataset.labels.keys())[0]

    encoder_args = {
        "normalization_type": "relative_frequency",  # relative frequency of k-mers in the repertoire
        "reads": "unique",  # count each sequence only once, do not use clonal count
        "k": 2,  # k-mer length
        "sequence_type": "amino_acid",
        "sequence_encoding": "continuous_kmer"  # split each sequence into overlapping k-mers
    }
    encoder = KmerFrequencyEncoder.build_object(dataset, **encoder_args)

    label_config = LabelConfiguration([Label(label_name, dataset.labels[label_name])])
    encoder_params = EncoderParams(result_path=output_dir, label_config=label_config)
    encoded_dataset = DataEncoder.run(DataEncoderParams(dataset, encoder, encoder_params))

    dataset_exporter = DesignMatrixExporter(dataset=encoded_dataset,
                                            result_path=output_dir / "csv_exported",
                                            file_format='csv')
    dataset_exporter.generate_report()

    return encoded_dataset
def encode(self, unit: ExploratoryAnalysisUnit, result_path: Path) -> Dataset:
    """Encode the dataset of an analysis unit, if the unit defines an encoder.

    Arguments:
        unit: analysis unit providing `dataset`, `encoder`, `label_config`
            and `number_of_processes`.
        result_path: directory handed to the encoder as its result path.

    Returns:
        the dataset returned by DataEncoder.run, or `unit.dataset` unchanged
        when `unit.encoder` is None.
    """
    # without an encoder there is nothing to do
    if unit.encoder is None:
        return unit.dataset

    encoder_params = EncoderParams(result_path=result_path,
                                   label_config=unit.label_config,
                                   filename="encoded_dataset.pkl",
                                   pool_size=unit.number_of_processes,
                                   learn_model=True,
                                   # labels are encoded only when a label configuration exists
                                   encode_labels=unit.label_config is not None)

    run_params = DataEncoderParams(dataset=unit.dataset,
                                   encoder=unit.encoder,
                                   encoder_params=encoder_params,
                                   store_encoded_data=True)

    return DataEncoder.run(run_params)
def test_encode_sequence(self):
    """Check KmerSequenceEncoder.encode_sequence with k=3.

    The 9-residue sequence "CASSVFRTY" must yield exactly its seven
    overlapping 3-mers, and the 2-residue sequence "AC" must yield None.
    """
    seq = ReceptorSequence(amino_acid_sequence="CASSVFRTY")
    result = KmerSequenceEncoder.encode_sequence(
        seq, EncoderParams(model={"k": 3}, label_config=LabelConfiguration(), result_path="", pool_size=4))

    # assertIn reports the missing k-mer and the actual result on failure
    for kmer in ("CAS", "ASS", "SSV", "SVF", "VFR", "FRT", "RTY"):
        self.assertIn(kmer, result)

    self.assertEqual(7, len(result))

    # sequence shorter than k
    too_short_result = KmerSequenceEncoder.encode_sequence(
        ReceptorSequence(amino_acid_sequence="AC"),
        EncoderParams(model={"k": 3}, label_config=LabelConfiguration(), result_path="", pool_size=4))
    self.assertIsNone(too_short_result)
def test_encode_sequence(self):
    """Check IMGTGappedKmerEncoder.encode_sequence with k_left=1 and max_gap=1.

    Two sequences are encoded and the resulting "kmer///position" strings are
    compared, as sets, with the expected values.
    """
    cases = [
        ("AHCDE", {
            'AH///105', 'HC///106', 'CD///107', 'DE///116', 'A.C///105', 'H.D///106', 'C.E///107'
        }),
        ("CASSPRERATYEQCAY", {
            'CA///105', 'AS///106', 'SS///107', 'SP///108', 'PR///109', 'RE///110', 'ER///111',
            'RA///111.001', 'AT///112.002', 'TY///112.001', 'YE///112', 'EQ///113', 'QC///114',
            'CA///115', 'AY///116', 'C.S///105', 'A.S///106', 'S.P///107', 'S.R///108',
            'P.E///109', 'R.R///110', 'E.A///111', 'R.T///111.001', 'A.Y///112.002',
            'T.E///112.001', 'Y.Q///112', 'E.C///113', 'Q.A///114', 'C.Y///115'
        }),
    ]

    for amino_acids, expected_kmers in cases:
        sequence = ReceptorSequence(amino_acids, None, None)
        params = EncoderParams(model={"k_left": 1, "max_gap": 1},
                               label_config=LabelConfiguration(), result_path="")
        kmers = IMGTGappedKmerEncoder.encode_sequence(sequence, params)

        self.assertEqual(expected_kmers, set(kmers))
def test_receptor_flattened(self):
    """Check the flattened one-hot encoding of a receptor dataset.

    Each example is compared against a concatenation of twelve 20-element
    one-hot blocks, and feature names must follow "{chain}_{pos}_{char}".
    """
    path = EnvironmentSettings.root_path / "test/tmp/onehot_recep_flat/"
    PathBuilder.build(path)

    dataset = self.construct_test_flatten_dataset(path)

    encoder_args = {
        "use_positional_info": False,
        "distance_to_seq_middle": None,
        'sequence_type': 'amino_acid',
        "flatten": True,
    }
    encoder = OneHotEncoder.build_object(dataset, **encoder_args)

    label_config = LabelConfiguration([Label(name="l1", values=[1, 0], positive_class="1")])
    params = EncoderParams(result_path=path, label_config=label_config, pool_size=1,
                           learn_model=True, model={}, filename="dataset.pkl")
    encoded_data = encoder.encode(dataset, params)

    self.assertTrue(isinstance(encoded_data, ReceptorDataset))

    onehot_a = [1.0] + [0.0] * 19
    onehot_t = [0.0] * 16 + [1.0] + [0] * 3

    # first example written as a sequence of one-hot blocks and flattened
    first_example_blocks = [onehot_a, onehot_a, onehot_a, onehot_t, onehot_t, onehot_t,
                            onehot_a, onehot_t, onehot_a, onehot_t, onehot_a, onehot_t]
    first_example = [value for block in first_example_blocks for value in block]
    self.assertListEqual(list(encoded_data.encoded_data.examples[0]), first_example)

    # the other two examples consist of twelve "A" blocks
    for example_index in (1, 2):
        self.assertListEqual(list(encoded_data.encoded_data.examples[example_index]), onehot_a * 12)

    expected_names = [f"{chain}_{pos}_{char}"
                      for chain in ("alpha", "beta")
                      for pos in range(6)
                      for char in EnvironmentSettings.get_sequence_alphabet()]
    self.assertListEqual(list(encoded_data.encoded_data.feature_names), expected_names)

    shutil.rmtree(path)
def test_generate(self):
    """Fit a ReceptorCNN on a random receptor dataset and generate a KernelSequenceLogo report.

    The test checks that the report writes the expected png/csv files for the
    alpha and beta kernels and the fully connected layer weights (csv and html).
    """
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "kernel_sequence_logo/")

    dataset = RandomDatasetGenerator.generate_receptor_dataset(
        receptor_count=500,
        chain_1_length_probabilities={4: 1},
        chain_2_length_probabilities={4: 1},
        labels={"CMV": {True: 0.5, False: 0.5}},
        path=path / "dataset")

    encoder_params = EncoderParams(path / "result", LabelConfiguration([Label("CMV", [True, False])]))
    enc_dataset = OneHotReceptorEncoder(True, 1, False, "enc1").encode(dataset, encoder_params)

    cnn = ReceptorCNN(kernel_count=2, kernel_size=[3], positional_channels=3,
                      sequence_type="amino_acid", device="cpu", number_of_threads=4,
                      random_seed=1, learning_rate=0.01, iteration_count=10,
                      l1_weight_decay=0.1, evaluate_at=5, batch_size=100,
                      training_percentage=0.8, l2_weight_decay=0.0)
    cnn.fit(enc_dataset.encoded_data, "CMV")

    report = KernelSequenceLogo(method=cnn, result_path=path / "logos/")
    report.generate_report()

    expected_files = (
        "logos/alpha_kernel_3_1.png",
        "logos/alpha_kernel_3_2.png",
        "logos/beta_kernel_3_1.png",
        "logos/beta_kernel_3_2.png",
        "logos/alpha_kernel_3_1.csv",
        "logos/alpha_kernel_3_2.csv",
        "logos/beta_kernel_3_1.csv",
        "logos/beta_kernel_3_2.csv",
        "logos/fully_connected_layer_weights.csv",
        "logos/fully_connected_layer_weights.html",
    )
    for relative_file in expected_files:
        self.assertTrue(os.path.isfile(path / relative_file))

    shutil.rmtree(path)
def test_encode_sequence(self):
    """Check IMGTGappedKmerEncoder.encode_sequence with k_left=1 and max_gap=1.

    Two sequences are encoded and the resulting "kmer-position" strings are
    compared, as sets, with the expected values.
    """
    cases = [
        ("AHCDE", {
            'AH-105', 'HC-106', 'CD-107', 'DE-116', 'A.C-105', 'H.D-106', 'C.E-107'
        }),
        ("CASSPRERATYEQCAY", {
            'CA-105', 'AS-106', 'SS-107', 'SP-108', 'PR-109', 'RE-110', 'ER-111',
            'RA-111.001', 'AT-112.002', 'TY-112.001', 'YE-112', 'EQ-113', 'QC-114',
            'CA-115', 'AY-116', 'C.S-105', 'A.S-106', 'S.P-107', 'S.R-108', 'P.E-109',
            'R.R-110', 'E.A-111', 'R.T-111.001', 'A.Y-112.002', 'T.E-112.001',
            'Y.Q-112', 'E.C-113', 'Q.A-114', 'C.Y-115'
        }),
    ]

    for amino_acids, expected_kmers in cases:
        sequence = ReceptorSequence(amino_acids, None, None)
        params = EncoderParams(model={"k_left": 1, "max_gap": 1},
                               label_config=LabelConfiguration(), result_path="")
        kmers = IMGTGappedKmerEncoder.encode_sequence(sequence, params)

        self.assertEqual(expected_kmers, set(kmers))
def _encode_dataset(self, encoder, dataset, path, learn_model: bool = True):
    """Encode `dataset` with the given encoder using a "disease" label (True/False).

    Arguments:
        encoder: object providing `encode(dataset, params)`.
        dataset: dataset to encode.
        path: base directory; results go to `path / "encoded"`.
        learn_model: forwarded to EncoderParams.

    Returns:
        the dataset returned by `encoder.encode`.
    """
    label_config = LabelConfiguration()
    label_config.add_label("disease", [True, False])

    params = EncoderParams(result_path=path / "encoded",
                           label_config=label_config,
                           learn_model=learn_model,
                           model={})

    return encoder.encode(dataset, params)
def test_run(self):
    """Run DataEncoder with a Word2VecEncoder on two single-sequence repertoires.

    The result must be a RepertoireDataset whose encoded examples have two rows.
    """
    path = EnvironmentSettings.root_path / "test/tmp/dataencoder/"
    PathBuilder.build(path)

    rep1 = Repertoire.build_from_sequence_objects([ReceptorSequence("AAA", identifier="1")],
                                                  metadata={"l1": 1, "l2": 2}, path=path)
    rep2 = Repertoire.build_from_sequence_objects([ReceptorSequence("ATA", identifier="2")],
                                                  metadata={"l1": 0, "l2": 3}, path=path)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])
    lc.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])
    encoder = Word2VecEncoder.build_object(dataset, **{
        "k": 3,
        "model_type": ModelType.SEQUENCE.name,
        "vector_size": 6
    })

    encoder_params = EncoderParams(model={}, pool_size=2, label_config=lc, result_path=path,
                                   filename="dataset.csv")
    res = DataEncoder.run(DataEncoderParams(dataset=dataset, encoder=encoder,
                                            encoder_params=encoder_params,
                                            store_encoded_data=False))

    # dedicated assertions show the actual type / value when the check fails
    self.assertIsInstance(res, RepertoireDataset)
    self.assertEqual(2, res.encoded_data.examples.shape[0])

    shutil.rmtree(path)
def _encode_dataset(self, dataset, path, learn_model: bool = True):
    """Encode `dataset` by relative frequency of continuous amino acid 3-mers.

    A KmerFrequencyEncoder is built for the dataset and applied with a
    "disease" label (True/False).

    Arguments:
        dataset: dataset to encode.
        path: base directory; results go to `path / "encoded"`.
        learn_model: forwarded to EncoderParams.

    Returns:
        the dataset returned by the encoder.
    """
    encoder_args = {
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "k": 3,
        'sequence_type': SequenceType.AMINO_ACID.name,
    }
    encoder = KmerFrequencyEncoder.build_object(dataset, **encoder_args)

    label_config = LabelConfiguration()
    label_config.add_label("disease", [True, False])

    params = EncoderParams(result_path=path / "encoded",
                           label_config=label_config,
                           learn_model=learn_model,
                           model={})

    return encoder.encode(dataset, params)
def test_encode(self):
    """Encode two repertoires with Word2VecEncoder (k=3, vector_size=16) and verify the output.

    Checks that encoded data exists, that the example matrix is 2 x 16, that
    the "T1D" labels are present, and that build_object returned a
    W2VRepertoireEncoder.
    """
    test_path = EnvironmentSettings.root_path / "test/tmp/w2v/"
    PathBuilder.build(test_path)

    sequence1 = ReceptorSequence("CASSVFA", identifier="1")
    sequence2 = ReceptorSequence("CASSCCC", identifier="2")

    metadata1 = {"T1D": "T1D", "subject_id": "1"}
    rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2], test_path, metadata1)

    metadata2 = {"T1D": "CTL", "subject_id": "2"}
    rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path, metadata2)

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    label_configuration = LabelConfiguration()
    label_configuration.add_label("T1D", ["T1D", "CTL"])

    config_params = EncoderParams(model={}, learn_model=True, result_path=test_path,
                                  label_config=label_configuration, filename="dataset.pkl")

    encoder = Word2VecEncoder.build_object(dataset, **{
        "k": 3,
        "model_type": "sequence",
        "vector_size": 16
    })

    encoded_dataset = encoder.encode(dataset=dataset, params=config_params)

    # dedicated assertions show the actual values / types when a check fails
    self.assertIsNotNone(encoded_dataset.encoded_data)
    self.assertEqual(2, encoded_dataset.encoded_data.examples.shape[0])
    self.assertEqual(16, encoded_dataset.encoded_data.examples.shape[1])

    self.assertEqual(2, len(encoded_dataset.encoded_data.labels["T1D"]))
    self.assertEqual("T1D", encoded_dataset.encoded_data.labels["T1D"][0])
    self.assertIsInstance(encoder, W2VRepertoireEncoder)

    shutil.rmtree(test_path)
def test_not_positional(self):
    """Check the non-flattened, non-positional one-hot encoding of a repertoire dataset.

    Verifies the dataset type, the one-hot rows of three sequences in each of
    the two repertoires, the example ids and the labels "l1" and "l2".
    """
    path = EnvironmentSettings.root_path / "test/tmp/onehot_vanilla/"
    PathBuilder.build(path)

    dataset, lc = self._construct_test_repertoiredataset(path, positional=False)

    encoder_args = {"use_positional_info": False, "distance_to_seq_middle": 6, "flatten": False}
    encoder = OneHotEncoder.build_object(dataset, **encoder_args)

    params = EncoderParams(result_path=path, label_config=lc, learn_model=True, model={},
                           filename="dataset.pkl")
    encoded_data = encoder.encode(dataset, params)

    self.assertTrue(isinstance(encoded_data, RepertoireDataset))

    onehot_a = [1] + [0] * 19
    onehot_t = [0] * 16 + [1] + [0] * 3
    onehot_empty = [0] * 20

    # (repertoire index, sequence index) -> expected one-hot rows
    expected_rows = {
        (0, 0): [onehot_a] * 4,
        (0, 1): [onehot_a, onehot_t, onehot_a, onehot_empty],
        (0, 2): [onehot_a, onehot_t, onehot_a, onehot_empty],
        (1, 0): [onehot_a, onehot_t, onehot_a, onehot_empty],
        (1, 1): [onehot_t, onehot_a, onehot_a, onehot_empty],
        (1, 2): [onehot_empty] * 4,
    }
    for (repertoire_index, sequence_index), rows in expected_rows.items():
        actual = [list(item)
                  for item in encoded_data.encoded_data.examples[repertoire_index][sequence_index]]
        self.assertListEqual(actual, rows)

    self.assertListEqual(list(encoded_data.encoded_data.example_ids),
                         [repertoire.identifier for repertoire in dataset.get_data()])

    expected_labels = {
        label_name: [repertoire.metadata[label_name] for repertoire in dataset.get_data()]
        for label_name in ("l1", "l2")
    }
    self.assertDictEqual(encoded_data.encoded_data.labels, expected_labels)

    shutil.rmtree(path)
def test(self):
    """Check the non-flattened, non-positional one-hot encoding of a receptor dataset.

    Verifies the dataset type and the one-hot rows at indices [0, 0], [0, 1],
    [1, 0] and [1, 1] of the encoded examples.
    """
    path = EnvironmentSettings.tmp_test_path / "onehot_sequence_1/"
    PathBuilder.build(path)

    dataset, lc = self._construct_test_dataset(path)

    encoder_args = {
        "use_positional_info": False,
        'sequence_type': 'amino_acid',
        "distance_to_seq_middle": 6,
        "flatten": False,
    }
    encoder = OneHotEncoder.build_object(dataset, **encoder_args)

    params = EncoderParams(result_path=path / "encoded", label_config=lc, learn_model=True,
                           model={}, filename="dataset.pkl")
    encoded_data = encoder.encode(dataset, params)

    self.assertTrue(isinstance(encoded_data, ReceptorDataset))

    onehot_a = [1] + [0] * 19
    onehot_t = [0] * 16 + [1] + [0] * 3
    onehot_empty = [0] * 20

    # index into the examples array -> expected one-hot rows
    expected_rows = {
        (0, 0): [onehot_a] * 4,
        (0, 1): [onehot_a, onehot_t, onehot_a, onehot_empty],
        (1, 0): [onehot_a, onehot_t, onehot_a, onehot_empty],
        (1, 1): [onehot_a, onehot_t, onehot_t, onehot_empty],
    }
    for index_pair, rows in expected_rows.items():
        actual = [list(item) for item in encoded_data.encoded_data.examples[index_pair]]
        self.assertListEqual(actual, rows)

    shutil.rmtree(path)