def test(self):
    """Run TrainMLModelInstruction end to end on a random sequence dataset.

    Generates a random sequence dataset, trains logistic regression on a
    3-mer frequency encoding through a single-setting grid search, and
    removes the working directory afterwards. No assertions are made on
    the outcome; the test only checks that the run completes.
    """
    work_dir = EnvironmentSettings.tmp_test_path / "integration_sequence_classification/"

    dataset = RandomDatasetGenerator.generate_sequence_dataset(
        50, {4: 1}, {'l1': {1: 0.5, 2: 0.5}}, work_dir / 'data')

    os.environ["cache_type"] = "test"

    kmer_params = {
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "sequence_type": SequenceType.AMINO_ACID.name,
        "k": 3,
    }
    kmer_encoder = KmerFrequencyEncoder.build_object(dataset, **kmer_params)
    setting = HPSetting(
        encoder=kmer_encoder,
        encoder_params=kmer_params,
        ml_method=LogisticRegression(),
        ml_params={"model_selection_cv": False, "model_selection_n_folds": -1},
        preproc_sequence=[],
    )

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 2])

    # The same single setting serves as both the search space and the
    # explicit list of settings handed to the instruction.
    search_strategy = GridSearch([setting])
    assessment_split = SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig())
    selection_split = SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig())

    instruction = TrainMLModelInstruction(
        dataset,
        search_strategy,
        [setting],
        assessment_split,
        selection_split,
        {Metric.BALANCED_ACCURACY},
        Metric.BALANCED_ACCURACY,
        label_config,
        work_dir,
    )
    instruction.run(result_path=work_dir)

    shutil.rmtree(work_dir)
def _create_report(self, path):
    """Build a ConfounderAnalysis report wired to encoded train/test data.

    Arguments:
        path: base directory; used as the report's result path and as the
            parent of the "train" and "test" dataset folders.

    Returns:
        the report with ml_details_path, label, result_path, train_dataset,
        test_dataset and method set.
    """
    analysis = ConfounderAnalysis.build_object(metadata_labels=["age", "HLA"],
                                               name='test')
    analysis.ml_details_path = path / "ml_details.yaml"
    analysis.label = Label("disease")
    analysis.result_path = path

    kmer_settings = {
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "k": 3,
        'sequence_type': SequenceType.AMINO_ACID.name,
    }
    kmer_encoder = KmerFrequencyEncoder.build_object(RepertoireDataset(),
                                                     **kmer_settings)

    train_raw = self._make_dataset(path / "train", size=100)
    analysis.train_dataset = self._encode_dataset(kmer_encoder, train_raw, path)

    # The test split is encoded with learn_model=False.
    test_raw = self._make_dataset(path / "test", size=40)
    analysis.test_dataset = self._encode_dataset(kmer_encoder, test_raw, path,
                                                 learn_model=False)

    analysis.method = self._create_dummy_lr_model(
        path, analysis.train_dataset.encoded_data, Label("disease"))

    return analysis
def encode_dataset_by_kmer_freq(path_to_dataset_directory: str, result_path: str, metadata_path: str = None):
    """
    Imports a MiXCR repertoire dataset, encodes it with KmerFrequencyEncoder
    and exports the resulting design matrix as csv.

    Arguments:
        path_to_dataset_directory (str): path to directory containing all
            repertoire files with .tsv extension in MiXCR format
        result_path (str): where to store the results; the csv export is
            written to the "csv_exported" subfolder
        metadata_path (str): metadata csv file; it must include filename and
            subject_id columns and an arbitrary disease column. If None, a
            metadata file is produced by generate_random_metadata instead.

    Returns:
        encoded dataset with encoded data in encoded_dataset.encoded_data.examples
    """
    dataset_dir = Path(path_to_dataset_directory)
    output_dir = Path(result_path)

    if metadata_path is None:
        metadata_file = generate_random_metadata(dataset_dir, output_dir)
    else:
        metadata_file = Path(metadata_path)

    import_params = {
        "is_repertoire": True,
        "path": dataset_dir,
        "metadata_file": metadata_file,
        "region_type": "IMGT_CDR3",  # import only the CDR3 region
        "number_of_processes": 4,  # number of parallel processes for loading the data
        "result_path": output_dir,
        "separator": "\t",
        "columns_to_load": ["cloneCount", "allVHitsWithScore", "allJHitsWithScore", "aaSeqCDR3", "nSeqCDR3"],
        "column_mapping": {
            "cloneCount": "counts",
            "allVHitsWithScore": "v_alleles",
            "allJHitsWithScore": "j_alleles"
        },
    }
    dataset = MiXCRImport().import_dataset(import_params, "mixcr_dataset")

    # First label of the dataset is the one used for ML prediction -
    # by default: "disease" with values True/False.
    label_name = list(dataset.labels.keys())[0]

    kmer_encoder = KmerFrequencyEncoder.build_object(dataset, **{
        "normalization_type": "relative_frequency",  # relative frequency of k-mers in the repertoire
        "reads": "unique",  # count each sequence only once, do not use clonal count
        "k": 2,  # k-mer length
        "sequence_type": "amino_acid",
        "sequence_encoding": "continuous_kmer"  # split each sequence into overlapping k-mers
    })
    label_config = LabelConfiguration([Label(label_name, dataset.labels[label_name])])
    encoding_params = EncoderParams(result_path=output_dir, label_config=label_config)

    encoded_dataset = DataEncoder.run(DataEncoderParams(dataset, kmer_encoder, encoding_params))

    exporter = DesignMatrixExporter(dataset=encoded_dataset,
                                    result_path=output_dir / "csv_exported",
                                    file_format='csv')
    exporter.generate_report()

    return encoded_dataset
def _encode_dataset(self, dataset, path, learn_model: bool = True):
    """Encode ``dataset`` by the relative frequency of amino-acid 3-mers.

    Arguments:
        dataset: dataset to encode.
        path: base directory; encoding output goes to ``path / "encoded"``.
        learn_model: forwarded unchanged to EncoderParams.

    Returns:
        the value returned by the encoder's ``encode`` call.
    """
    kmer_settings = {
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "k": 3,
        'sequence_type': SequenceType.AMINO_ACID.name,
    }
    kmer_encoder = KmerFrequencyEncoder.build_object(dataset, **kmer_settings)

    label_config = LabelConfiguration()
    label_config.add_label("disease", [True, False])

    encoding_params = EncoderParams(
        result_path=path / "encoded",
        label_config=label_config,
        learn_model=learn_model,
        model={},
    )
    return kmer_encoder.encode(dataset, encoding_params)
def test(self):
    """Run TrainMLModelInstruction on a receptor dataset and expect a perfect score.

    Trains logistic regression on a 3-mer frequency encoding through a
    single-setting grid search and asserts that the optimal assessment
    item for label "l1" reaches 1.0 on the optimization metric.
    """
    work_dir = EnvironmentSettings.tmp_test_path / "integration_receptor_classification/"
    dataset = self.create_dataset(work_dir)

    os.environ["cache_type"] = "test"

    kmer_params = {
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "sequence_type": SequenceType.AMINO_ACID.name,
        "k": 3,
    }
    kmer_encoder = KmerFrequencyEncoder.build_object(dataset, **kmer_params)
    setting = HPSetting(
        encoder=kmer_encoder,
        encoder_params=kmer_params,
        ml_method=LogisticRegression(),
        ml_params={"model_selection_cv": False, "model_selection_n_folds": -1},
        preproc_sequence=[],
    )

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 2])

    search_strategy = GridSearch([setting])
    assessment_split = SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig())
    selection_split = SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig())

    instruction = TrainMLModelInstruction(
        dataset,
        search_strategy,
        [setting],
        assessment_split,
        selection_split,
        {Metric.BALANCED_ACCURACY},
        Metric.BALANCED_ACCURACY,
        label_config,
        work_dir,
    )
    state = instruction.run(result_path=work_dir)

    print(vars(state))

    # Performance is keyed by the lower-cased name of the optimization metric.
    best_item = state.assessment_states[0].label_states["l1"].optimal_assessment_item
    metric_key = state.optimization_metric.name.lower()
    self.assertEqual(1.0, best_item.performance[metric_key])

    shutil.rmtree(work_dir)
def test_encode(self):
    """Encode a two-repertoire dataset three ways and spot-check the results.

    Uses identity encoding with relative frequency, continuous 3-mers with
    relative frequency, and continuous 3-mers with binary normalization,
    then checks the returned dataset types and two encoded values.
    """
    work_dir = EnvironmentSettings.root_path / "test/tmp/kmerfreqenc/"
    PathBuilder.build(work_dir)

    first_repertoire = Repertoire.build_from_sequence_objects(
        [
            ReceptorSequence("AAA", identifier="1"),
            ReceptorSequence("ATA", identifier="2"),
            ReceptorSequence("ATA", identifier='3'),
        ],
        metadata={"l1": 1, "l2": 2, "subject_id": "1"},
        path=work_dir)
    second_repertoire = Repertoire.build_from_sequence_objects(
        [
            ReceptorSequence("ATA", identifier="1"),
            ReceptorSequence("TAA", identifier="2"),
            ReceptorSequence("AAC", identifier="3"),
        ],
        metadata={"l1": 0, "l2": 3, "subject_id": "2"},
        path=work_dir)

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 2])
    label_config.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[first_repertoire, second_repertoire])

    def make_encoder(normalization, sequence_encoding):
        # All three encoders share unique reads and k=3; only the
        # normalization and the sequence encoding vary.
        return KmerFrequencyEncoder.build_object(dataset, **{
            "normalization_type": normalization.name,
            "reads": ReadsType.UNIQUE.name,
            "sequence_encoding": sequence_encoding.name,
            "k": 3,
        })

    identity_encoder = make_encoder(NormalizationType.RELATIVE_FREQUENCY,
                                    SequenceEncodingType.IDENTITY)
    identity_encoded = identity_encoder.encode(
        dataset,
        EncoderParams(result_path=work_dir / "1/", label_config=label_config,
                      learn_model=True, model={}, filename="dataset.pkl"))

    kmer_encoder = make_encoder(NormalizationType.RELATIVE_FREQUENCY,
                                SequenceEncodingType.CONTINUOUS_KMER)
    kmer_encoded = kmer_encoder.encode(
        dataset,
        EncoderParams(result_path=work_dir / "2/", label_config=label_config,
                      pool_size=2, learn_model=True, model={},
                      filename="dataset.csv"))

    binary_encoder = make_encoder(NormalizationType.BINARY,
                                  SequenceEncodingType.CONTINUOUS_KMER)
    binary_encoded = binary_encoder.encode(
        dataset,
        EncoderParams(result_path=work_dir / "3/", label_config=label_config,
                      learn_model=True, model={}, filename="dataset.pkl"))

    # The encoded datasets are already held in memory, so the temporary
    # directory is removed before the assertions run.
    shutil.rmtree(work_dir)

    self.assertIsInstance(identity_encoded, RepertoireDataset)
    self.assertIsInstance(kmer_encoded, RepertoireDataset)

    self.assertEqual(0.67, np.round(kmer_encoded.encoded_data.examples[0, 2], 2))
    self.assertEqual(0.0, np.round(binary_encoded.encoded_data.examples[0, 1], 2))

    self.assertIsInstance(kmer_encoder, KmerFrequencyEncoder)