def create_dataset(self, path, dataset_size: int = 50):
    """Pickle `dataset_size` ReceptorSequence objects under `path` and wrap them in a SequenceDataset.

    Even indices get label l1=1 (sequence "AAACCC"), odd indices l1=2 (sequence "ACACAC").

    Arguments:
        path: directory for the pickle file (created if missing)
        dataset_size: number of sequences to generate

    Returns:
        SequenceDataset backed by the pickled sequence file, with labels {"l1": [1, 2]}
    """
    sequences = []
    for i in range(dataset_size):
        if i % 2 == 0:
            sequences.append(ReceptorSequence(amino_acid_sequence="AAACCC", identifier=str(i),
                                              metadata=SequenceMetadata(custom_params={"l1": 1})))
        else:
            sequences.append(ReceptorSequence(amino_acid_sequence="ACACAC", identifier=str(i),
                                              metadata=SequenceMetadata(custom_params={"l1": 2})))

    PathBuilder.build(path)
    filename = path / "sequences.pkl"
    with open(filename, "wb") as file:
        pickle.dump(sequences, file)

    # a LabelConfiguration was previously constructed here but never used; removed as dead code
    dataset = SequenceDataset(labels={"l1": [1, 2]}, filenames=[filename], identifier="d1")
    return dataset
def test_run(self):
    """ExploratoryAnalysisInstruction runs two sequence-length reports (one with preprocessing)
    and writes the expected HTML outputs; also checks number_of_processes is preserved."""
    path = EnvironmentSettings.tmp_test_path / "explanalysisproc/"
    PathBuilder.build(path)

    dataset = self.create_dataset(path)

    # NOTE(review): tab separators restored (.tsv / VDJdb is tab-separated); the file is not
    # read by the units below, so the exact separator does not affect this test's outcome.
    # The previously built-but-unused LabelConfiguration and refs dict were removed as dead code.
    file_content = ("complex.id\tGene\tCDR3\tV\tJ\tSpecies\tMHC A\tMHC B\tMHC class\tEpitope\t"
                    "Epitope gene\tEpitope species\tReference\tMethod\tMeta\tCDR3fix\tScore\n"
                    "100a\tTRA\tAAAC\tTRAV12\tTRAJ1\tHomoSapiens\tHLA-A*11:01\tB2M\tMHCI\t"
                    "AVFDRKSDAK\tEBNA4\tEBV\n")
    with open(path / "refs.tsv", "w") as file:
        file.writelines(file_content)

    preproc_sequence = [SubjectRepertoireCollector()]

    units = {"named_analysis_1": ExploratoryAnalysisUnit(dataset=dataset, report=SequenceLengthDistribution(),
                                                         number_of_processes=16),
             "named_analysis_2": ExploratoryAnalysisUnit(dataset=dataset, report=SequenceLengthDistribution(),
                                                         preprocessing_sequence=preproc_sequence)}

    process = ExploratoryAnalysisInstruction(units, name="exp")
    process.run(path / "results/")

    self.assertTrue(units["named_analysis_1"].number_of_processes == 16)
    self.assertTrue(os.path.isfile(path / "results/exp/analysis_named_analysis_1/report/sequence_length_distribution.html"))
    self.assertTrue(os.path.isfile(path / "results/exp/analysis_named_analysis_2/report/sequence_length_distribution.html"))

    shutil.rmtree(path)
def test_encode_sequence(self):
    """IdentitySequenceEncoder returns the raw amino acid sequence for every frame type."""
    for frame_type in ("OUT", "STOP", "IN"):
        receptor_seq = ReceptorSequence(amino_acid_sequence="AAA",
                                        metadata=SequenceMetadata(frame_type=frame_type))
        encoder = IdentitySequenceEncoder()
        params = EncoderParams(model={}, label_config=LabelConfiguration(), result_path="")
        self.assertEqual(["AAA"], encoder.encode_sequence(receptor_seq, params))
def create_dataset(self, path, dataset_size: int = 50):
    """Pickle `dataset_size` paired-chain TCABReceptor objects and wrap them in a ReceptorDataset.

    Even indices get label l1=1 (both chains "ACACAC"), odd indices l1=2 (both chains "DDDEEE").

    Returns:
        ReceptorDataset backed by the pickled receptor file, with labels {"l1": [1, 2]}
    """
    seq_even = ReceptorSequence(amino_acid_sequence="ACACAC")
    seq_odd = ReceptorSequence(amino_acid_sequence="DDDEEE")

    receptors = []
    for i in range(dataset_size):
        if i % 2 == 0:
            receptors.append(TCABReceptor(alpha=seq_even, beta=seq_even, metadata={"l1": 1}, identifier=str(i)))
        else:
            receptors.append(TCABReceptor(alpha=seq_odd, beta=seq_odd, metadata={"l1": 2}, identifier=str(i)))

    PathBuilder.build(path)
    filename = path / "receptors.pkl"
    with open(filename, "wb") as file:
        pickle.dump(receptors, file)

    # a LabelConfiguration was previously constructed here but never used; removed as dead code
    dataset = ReceptorDataset(labels={"l1": [1, 2]}, filenames=[filename], identifier="d1")
    return dataset
def _construct_test_repertoiredataset(self, path, positional):
    """Build a two-repertoire RepertoireDataset and a matching LabelConfiguration.

    `positional` toggles between long homopolymer sequences and short mixed ones.
    Returns (dataset, label_configuration).
    """
    receptors1 = ReceptorSequenceList()
    receptors2 = ReceptorSequenceList()

    if positional:
        first_group = [ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1"),
                       ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1")]
        second_group = [ReceptorSequence("TTTTTTTTTTTTT", identifier="1")]
    else:
        first_group = [ReceptorSequence("AAAA", identifier="1"),
                       ReceptorSequence("ATA", identifier="2"),
                       ReceptorSequence("ATA", identifier="3")]
        second_group = [ReceptorSequence("ATA", identifier="1"),
                        ReceptorSequence("TAA", identifier="2")]

    for seq in first_group:
        receptors1.append(seq)
    for seq in second_group:
        receptors2.append(seq)

    rep1 = Repertoire.build_from_sequence_objects(receptors1,
                                                  metadata={"l1": 1, "l2": 2, "subject_id": "1"}, path=path)
    rep2 = Repertoire.build_from_sequence_objects(receptors2,
                                                  metadata={"l1": 0, "l2": 3, "subject_id": "2"}, path=path)

    label_configuration = LabelConfiguration()
    label_configuration.add_label("l1", [1, 2])
    label_configuration.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])
    return dataset, label_configuration
def _construct_test_dataset(self, path):
    """Build a three-sequence SequenceDataset with labels l1 and l2.

    Returns (dataset, label_configuration).
    """
    # (amino acids, identifier, l1 value, l2 value)
    specs = [("AAAA", "1", 1, 1),
             ("ATA", "2", 2, 1),
             ("ATT", "3", 1, 2)]
    sequences = [ReceptorSequence(amino_acid_sequence=aa, identifier=seq_id,
                                  metadata=SequenceMetadata(custom_params={"l1": l1, "l2": l2}))
                 for aa, seq_id, l1, l2 in specs]

    label_configuration = LabelConfiguration()
    label_configuration.add_label("l1", [1, 2])
    label_configuration.add_label("l2", [1, 2])

    dataset = SequenceDataset.build(sequences=sequences, file_size=10, path=path)
    return dataset, label_configuration
def test(self):
    """End-to-end: train logistic regression on a k-mer-frequency-encoded random sequence dataset."""
    path = EnvironmentSettings.tmp_test_path / "integration_sequence_classification/"
    dataset = RandomDatasetGenerator.generate_sequence_dataset(50, {4: 1}, {'l1': {1: 0.5, 2: 0.5}},
                                                               path / 'data')

    os.environ["cache_type"] = "test"

    encoder_params = {"normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
                      "reads": ReadsType.UNIQUE.name,
                      "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
                      "sequence_type": SequenceType.AMINO_ACID.name,
                      "k": 3}

    setting = HPSetting(encoder=KmerFrequencyEncoder.build_object(dataset, **encoder_params),
                        encoder_params=encoder_params,
                        ml_method=LogisticRegression(),
                        ml_params={"model_selection_cv": False, "model_selection_n_folds": -1},
                        preproc_sequence=[])

    label_configuration = LabelConfiguration()
    label_configuration.add_label("l1", [1, 2])

    instruction = TrainMLModelInstruction(dataset, GridSearch([setting]), [setting],
                                          SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
                                          SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                          label_configuration, path)
    result = instruction.run(result_path=path)

    shutil.rmtree(path)
def _create_label_config(self, instruction: dict, dataset: Dataset, instruction_key: str) -> LabelConfiguration:
    """Build a LabelConfiguration from the instruction's `labels` entry.

    Label values come from dataset.labels when available, otherwise from dataset
    metadata; if neither is available an empty list is used and a warning is issued.
    """
    labels = instruction["labels"]
    self._check_label_format(labels, instruction_key)

    label_config = LabelConfiguration()
    for label in labels:
        # each entry is either a plain name or a one-key dict with extra options
        if isinstance(label, str):
            label_name, positive_class = label, None
        else:
            label_name = list(label.keys())[0]
            positive_class = label[label_name]['positive_class']

        if dataset.labels is not None and label_name in dataset.labels:
            label_values = dataset.labels[label_name]
        elif hasattr(dataset, "get_metadata"):
            label_values = list(set(dataset.get_metadata([label_name])[label_name]))
        else:
            label_values = []
            warnings.warn(
                f"{TrainMLModelParser.__name__}: for instruction {instruction_key}, label values could not be recovered for label "
                f"{label}, using empty list instead. This could cause problems with some encodings. "
                f"If that might be the case, check if the dataset {dataset.name} has been properly loaded.")

        label_config.add_label(label_name, label_values, positive_class=positive_class)

    return label_config
def create_label_config(labels_dict: dict, dataset: Dataset, instruction_name: str,
                        yaml_location: str) -> LabelConfiguration:
    """Build a LabelConfiguration from a YAML labels specification.

    Label values are taken from dataset.labels when present, otherwise from dataset
    metadata; failing both, an empty list is used and a warning is issued.
    """
    LabelHelper.check_label_format(labels_dict, instruction_name, yaml_location)

    label_config = LabelConfiguration()
    for label in labels_dict:
        # each entry is either a plain name or a one-key dict with extra options
        if isinstance(label, str):
            label_name, positive_class = label, None
        else:
            label_name = list(label.keys())[0]
            positive_class = label[label_name]['positive_class']

        if dataset.labels is not None and label_name in dataset.labels:
            label_values = dataset.labels[label_name]
        elif hasattr(dataset, "get_metadata"):
            label_values = list(set(dataset.get_metadata([label_name])[label_name]))
        else:
            label_values = []
            warnings.warn(
                f"{instruction_name}: for {yaml_location}, label values could not be recovered for label "
                f"{label}, using empty list instead. This issue may occur due to improper loading of dataset {dataset.name},"
                f"and could cause problems with some encodings.")

        label_config.add_label(label_name, label_values, positive_class=positive_class)

    return label_config
def test(self):
    """KmerFreqReceptorEncoder: receptors with identical chains get identical encodings,
    and per-chain k-mer feature names are produced."""
    receptors = [TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAACCC"),
                              beta=ReceptorSequence(amino_acid_sequence="AAACCC"), identifier="1"),
                 TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAA"),
                              beta=ReceptorSequence(amino_acid_sequence="CCC"), identifier="2"),
                 TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAACCC"),
                              beta=ReceptorSequence(amino_acid_sequence="AAACCC"), identifier="3"),
                 TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAA"),
                              beta=ReceptorSequence(amino_acid_sequence="CCC"), identifier="4")]

    path = EnvironmentSettings.tmp_test_path / "kmer_receptor_frequency/"
    PathBuilder.build(path / 'data')
    dataset = ReceptorDataset.build_from_objects(receptors, path=path, file_size=10)

    label_configuration = LabelConfiguration()
    label_configuration.add_label("l1", [1, 2])

    encoder = KmerFreqReceptorEncoder.build_object(dataset, **{
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "sequence_type": SequenceType.AMINO_ACID.name,
        "k": 3})

    encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path / "2/",
                                                            label_config=label_configuration,
                                                            pool_size=2, learn_model=True, model={},
                                                            filename="dataset.csv", encode_labels=False))

    self.assertEqual(4, encoded_dataset.encoded_data.examples.shape[0])
    self.assertTrue(all(identifier in encoded_dataset.encoded_data.example_ids
                        for identifier in ['1', '2', '3', '4']))
    # receptors 1 and 3 are identical, so their alpha-chain encodings must match
    self.assertTrue(numpy.array_equal(encoded_dataset.encoded_data.examples[0].A,
                                      encoded_dataset.encoded_data.examples[2].A))
    self.assertTrue(all(feature_name in encoded_dataset.encoded_data.feature_names
                        for feature_name in ["alpha_AAA", "alpha_AAC", "beta_CCC"]))

    shutil.rmtree(path)
def test_run(self):
    """SemanticModel executes a TrainMLModel instruction over a word2vec-encoded repertoire dataset."""
    path = EnvironmentSettings.root_path / "test/tmp/smmodel/"
    PathBuilder.build(path)

    # 32 repertoires alternating between ["AAA", "CCC"] and ["TTTT"], with alternating labels
    repertoire_content = [["AAA", "CCC"], ["TTTT"]] * 16
    repertoires, metadata = RepertoireBuilder.build(repertoire_content, path,
                                                    {"default": [1, 2] * 16})
    dataset = RepertoireDataset(repertoires=repertoires, labels={"default": [1, 2]},
                                metadata_file=metadata)

    label_config = LabelConfiguration()
    label_config.add_label("default", [1, 2])

    w2v_params = {"vector_size": 8, "model_type": ModelType.SEQUENCE.name, "k": 3}
    hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **w2v_params),
                             w2v_params,
                             LogisticRegression(),
                             {"model_selection_cv": False, "model_selection_n_folds": -1},
                             [])]

    split_config_assessment = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())
    split_config_selection = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())

    instruction = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                          split_config_assessment, split_config_selection,
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                          label_config, path)

    semantic_model = SemanticModel([instruction], path)
    semantic_model.run()

    shutil.rmtree(path)
def test_run(self):
    """MLMethodAssessment.run on a fitted logistic regression: checks the returned metrics
    dict and the ml_score / predictions CSV files it writes."""
    path = EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/"
    PathBuilder.build(path)

    dataset = RepertoireDataset(repertoires=RepertoireBuilder.build([["AA"], ["CC"]] * 6, path)[0])
    dataset.encoded_data = EncodedData(examples=np.array([[1, 1], [1, 1], [3, 3]] * 4),
                                       labels={"l1": [1, 1, 3] * 4,
                                               "l2": [1, 2, 3] * 4})

    # NOTE(review): Label values [1, 2] do not match the encoded l1 classes {1, 3};
    # kept as-is to preserve the test's behavior — confirm whether [1, 3] was intended.
    label = Label(name='l1', values=[1, 2])
    method1 = LogisticRegression()
    method1.fit(dataset.encoded_data, label=label)

    # a LabelConfiguration was previously constructed here but never used; removed as dead code
    res = MLMethodAssessment.run(MLMethodAssessmentParams(
        dataset=dataset,
        method=method1,
        metrics={Metric.ACCURACY, Metric.BALANCED_ACCURACY, Metric.F1_MACRO},
        optimization_metric=Metric.LOG_LOSS,
        predictions_path=path / "predictions.csv",
        label=label,
        ml_score_path=path / "ml_score.csv",
        split_index=1,
        path=path))

    self.assertTrue(isinstance(res, dict))
    self.assertTrue(res[Metric.LOG_LOSS.name.lower()] <= 0.1)

    self.assertTrue(os.path.isfile(path / "ml_score.csv"))
    df = pd.read_csv(path / "ml_score.csv")
    self.assertTrue(df.shape[0] == 1)

    df = pd.read_csv(path / "predictions.csv")
    self.assertEqual(12, df.shape[0])

    shutil.rmtree(path)
def __init__(self, split_index: int, train_val_dataset, test_dataset, path: Path,
             label_configuration: LabelConfiguration):
    """Hold per-assessment-split state: the datasets, report collections, and one
    HPLabelState per configured label.

    Arguments:
        split_index: index of this assessment split
        train_val_dataset: data used for training/validation in this split
        test_dataset: held-out data for this split
        path: working directory for this split
        label_configuration: labels for which HPLabelState objects are created
    """
    self.split_index = split_index
    self.train_val_dataset = train_val_dataset
    self.test_dataset = test_dataset
    self.path = path
    self.train_val_data_reports = []
    self.test_data_reports = []

    # computed: one HPLabelState per label, seeded with that label's auxiliary labels
    self.label_states = {}
    for label in label_configuration.get_labels_by_name():
        self.label_states[label] = HPLabelState(label, label_configuration.get_auxiliary_labels(label))
def test_run(self):
    """ExploratoryAnalysisInstruction with a MatchedSequences encoder and a
    DesignMatrixExporter report writes the expected design matrix CSV."""
    path = EnvironmentSettings.tmp_test_path / "explanalysisprocintegration/"
    PathBuilder.build(path)
    os.environ["cache_type"] = "test"

    dataset = self.create_dataset(path)

    label_config = LabelConfiguration()
    label_config.add_label("l1", [0, 1])
    label_config.add_label("l2", [2, 3])

    # NOTE(review): tab separators restored — VDJdb reference files are tab-separated and
    # this file IS parsed below by the MatchedSequences encoder; the mangled source had
    # lost the tabs, which would break the import.
    file_content = ("complex.id\tGene\tCDR3\tV\tJ\tSpecies\tMHC A\tMHC B\tMHC class\tEpitope\t"
                    "Epitope gene\tEpitope species\tReference\tMethod\tMeta\tCDR3fix\tScore\n"
                    "100a\tTRA\tAAAC\tTRAV12\tTRAJ1\tHomoSapiens\tHLA-A*11:01\tB2M\tMHCI\t"
                    "AVFDRKSDAK\tEBNA4\tEBV\n")
    with open(path / "refs.tsv", "w") as file:
        file.writelines(file_content)

    refs = {"params": {"path": path / "refs.tsv", "region_type": "FULL_SEQUENCE"},
            "format": "VDJdb"}

    units = {"named_analysis_4": ExploratoryAnalysisUnit(
        dataset=dataset,
        report=DesignMatrixExporter(name='report', file_format='csv'),
        label_config=label_config,
        encoder=MatchedSequencesRepertoireEncoder.build_object(
            dataset, **{"max_edit_distance": 1, "reference": refs}))}

    process = ExploratoryAnalysisInstruction(units, name="exp")
    process.run(path / "results/")

    self.assertTrue(os.path.isfile(
        path / "results/exp/analysis_named_analysis_4/report/design_matrix.csv"))

    shutil.rmtree(path)
def _encode_dataset(self, encoder, dataset, path, learn_model: bool = True):
    """Encode `dataset` with the given encoder under a binary "disease" label.

    Arguments:
        encoder: an already-built encoder instance
        dataset: the dataset to encode
        path: base output directory (results go under path / "encoded")
        learn_model: whether the encoder should fit its model or reuse a learned one
    """
    label_configuration = LabelConfiguration()
    label_configuration.add_label("disease", [True, False])

    params = EncoderParams(result_path=path / "encoded",
                           label_config=label_configuration,
                           learn_model=learn_model,
                           model={})
    return encoder.encode(dataset, params)
def _run_test(self, compairr_path):
    """CompAIRRDistanceEncoder produces an 8x8 distance matrix with zero self-distances
    and carries the dataset labels into the encoded data."""
    path = EnvironmentSettings.tmp_test_path / "compairr_distance_encoder/"
    PathBuilder.build(path)

    dataset = self.create_dataset(path)

    encoder = CompAIRRDistanceEncoder.build_object(dataset, **{"compairr_path": compairr_path,
                                                               "keep_compairr_input": True,
                                                               "differences": 0,
                                                               "indels": False,
                                                               "ignore_counts": False,
                                                               "threads": 8,
                                                               "ignore_genes": False})
    encoder.set_context({"dataset": dataset})

    label_config = LabelConfiguration([Label("l1", [0, 1]), Label("l2", [2, 3])])
    encoded = encoder.encode(dataset, EncoderParams(result_path=path,
                                                    label_config=label_config,
                                                    pool_size=4,
                                                    filename="dataset.pkl"))

    # square matrix over the 8 repertoires, zeros on (and off-) diagonal where identical
    self.assertEqual(8, encoded.encoded_data.examples.shape[0])
    self.assertEqual(8, encoded.encoded_data.examples.shape[1])
    self.assertEqual(0, encoded.encoded_data.examples[0, 0])
    self.assertEqual(0, encoded.encoded_data.examples[1, 1])
    self.assertEqual(0, encoded.encoded_data.examples[0, 4])

    self.assertTrue(np.array_equal([1, 0, 1, 0, 1, 0, 1, 0], encoded.encoded_data.labels["l1"]))
    self.assertTrue(np.array_equal([2, 3, 2, 3, 2, 3, 3, 3], encoded.encoded_data.labels["l2"]))

    shutil.rmtree(path)
def test_generate(self):
    """TCRdistMotifDiscovery report runs over a TCRdist-encoded paired-receptor dataset."""
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "tcrdist_motif_discovery/")
    dataset_path = self._create_dataset(path)

    import_params = {"path": dataset_path,
                     "result_path": path / "dataset/",
                     "separator": ",",
                     "columns_to_load": ["subject", "epitope", "count", "v_a_gene", "j_a_gene",
                                         "cdr3_a_aa", "v_b_gene", "j_b_gene", "cdr3_b_aa",
                                         "clone_id", "cdr3_a_nucseq", "cdr3_b_nucseq"],
                     "column_mapping": {"cdr3_a_aa": "alpha_amino_acid_sequence",
                                        "cdr3_b_aa": "beta_amino_acid_sequence",
                                        "cdr3_a_nucseq": "alpha_nucleotide_sequence",
                                        "cdr3_b_nucseq": "beta_nucleotide_sequence",
                                        "v_a_gene": "alpha_v_gene",
                                        "v_b_gene": "beta_v_gene",
                                        "j_a_gene": "alpha_j_gene",
                                        "j_b_gene": "beta_j_gene",
                                        "clone_id": "identifier"},
                     "receptor_chains": "TRA_TRB",
                     "region_type": "IMGT_CDR3",
                     "sequence_file_size": 50000,
                     "organism": "mouse"}
    dataset = SingleLineReceptorImport.import_dataset(import_params, 'd1')

    dataset = TCRdistEncoder(8).encode(dataset,
                                       EncoderParams(path / "result",
                                                     LabelConfiguration([Label("epitope")])))

    report = TCRdistMotifDiscovery(train_dataset=dataset, test_dataset=dataset,
                                   result_path=path / "report", name="report name", cores=8,
                                   positive_class_name="PA", min_cluster_size=3)
    report._generate()

    shutil.rmtree(path)
def _prepare_optional_params(self, analysis: dict, symbol_table: SymbolTable,
                             yaml_location: str) -> dict:
    """Collect the optional pieces of an exploratory analysis spec (encoder,
    label configuration, preprocessing sequence) into a params dict."""
    params = {}
    dataset = symbol_table.get(analysis["dataset"])

    if "encoding" in analysis:
        encoder_entry = symbol_table.get(analysis["encoding"])
        encoder_config = symbol_table.get_config(analysis["encoding"])
        params["encoder"] = encoder_entry.build_object(dataset, **encoder_config["encoder_params"])

    if "labels" in analysis:
        params["label_config"] = LabelHelper.create_label_config(
            analysis["labels"], dataset, ExploratoryAnalysisParser.__name__, yaml_location)
    else:
        # no labels specified: fall back to an empty configuration
        params["label_config"] = LabelConfiguration()

    if "preprocessing_sequence" in analysis:
        params["preprocessing_sequence"] = symbol_table.get(analysis["preprocessing_sequence"])

    return params
def _parse_split_config(self, instruction_key, instruction: dict, split_key: str, symbol_table: SymbolTable,
                        settings_count: int, label_config: LabelConfiguration) -> SplitConfig:
    """Parse one split configuration (assessment or selection) from the instruction spec.

    Merges user-specified parameters over the defaults, validates the strategy/label
    combination, and builds a SplitConfig.

    Raises:
        ValueError: when all data is assigned to training with multiple settings, or when
            stratified k-fold is requested with a number of labels other than one
        KeyError: when a required parameter is missing under `split_key`
    """
    try:
        default_params = DefaultParamsLoader.load("instructions/", SplitConfig.__name__)
        report_config_input = self._prepare_report_config(instruction_key, instruction, split_key, symbol_table)
        instruction[split_key] = {**default_params, **instruction[split_key]}

        split_strategy = SplitType[instruction[split_key]["split_strategy"].upper()]
        # training percentage only applies to random splits
        training_percentage = float(instruction[split_key]["training_percentage"]) if split_strategy == SplitType.RANDOM else -1

        if split_strategy == SplitType.RANDOM and training_percentage == 1 and settings_count > 1:
            raise ValueError(f"{TrainMLModelParser.__name__}: all data under {instruction_key}/{split_key} was specified to be used for "
                             f"training, but {settings_count} settings were specified for evaluation. Please define a test/validation set by "
                             f"reducing the training percentage (e.g., to 0.7) or use only one hyperparameter setting to run the analysis.")

        if split_strategy == SplitType.STRATIFIED_K_FOLD and len(label_config.get_labels_by_name()) != 1:
            # bug fix: the message string was broken across source lines and ungrammatical
            raise ValueError(f"{TrainMLModelParser.__name__}: Stratified k-fold cross-validation cannot be used when "
                             f"{len(label_config.get_labels_by_name())} labels are specified. "
                             f"It supports only one label (and multiple classes).")

        return SplitConfig(split_strategy=split_strategy,
                           split_count=int(instruction[split_key]["split_count"]),
                           training_percentage=training_percentage,
                           reports=ReportConfig(**report_config_input),
                           manual_config=ManualSplitConfig(**instruction[split_key]["manual_config"])
                           if "manual_config" in instruction[split_key] else None,
                           leave_one_out_config=LeaveOneOutConfig(**instruction[split_key]["leave_one_out_config"])
                           if "leave_one_out_config" in instruction[split_key] else None)
    except KeyError as key_error:
        # chain the original KeyError so the missing-parameter trail is preserved
        raise KeyError(f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under {split_key}.") from key_error
def reeval_on_assessment_split(state, train_val_dataset: Dataset, test_dataset: Dataset,
                               hp_setting: HPSetting, path: Path, label: Label,
                               split_index: int) -> MLMethod:
    """retrain model for specific label, assessment split and hp_setting"""
    ml_process = MLProcess(train_dataset=train_val_dataset,
                           test_dataset=test_dataset,
                           label=label,
                           metrics=state.metrics,
                           optimization_metric=state.optimization_metric,
                           path=path,
                           hp_setting=hp_setting,
                           report_context=state.context,
                           ml_reports=state.assessment.reports.model_reports.values(),
                           number_of_processes=state.number_of_processes,
                           encoding_reports=state.assessment.reports.encoding_reports.values(),
                           label_config=LabelConfiguration([label]))
    assessment_item = ml_process.run(split_index)

    # store the retrained result under this split / label / setting
    label_state = state.assessment_states[split_index].label_states[label.name]
    label_state.assessment_items[str(hp_setting)] = assessment_item

    return state
def test_run(self):
    """MLApplicationInstruction applies a pre-trained model + encoder to a dataset and
    writes one prediction per repertoire."""
    path = EnvironmentSettings.tmp_test_path / "mlapplicationtest/"
    PathBuilder.build(path)

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(50, {5: 1}, {5: 1},
                                                                 {"l1": {1: 0.5, 2: 0.5}},
                                                                 path / 'dataset/')
    ml_method = LogisticRegression()
    encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY, ReadsType.UNIQUE,
                                        SequenceEncodingType.CONTINUOUS_KMER, 3,
                                        scale_to_zero_mean=True, scale_to_unit_variance=True)
    label_config = LabelConfiguration([Label("l1", [1, 2])])

    enc_dataset = encoder.encode(dataset, EncoderParams(result_path=path,
                                                        label_config=label_config,
                                                        filename="tmp_enc_dataset.pickle",
                                                        pool_size=4))
    ml_method.fit(enc_dataset.encoded_data, 'l1')

    hp_setting = HPSetting(encoder,
                           {"normalization_type": "relative_frequency", "reads": "unique",
                            "sequence_encoding": "continuous_kmer", "k": 3,
                            "scale_to_zero_mean": True, "scale_to_unit_variance": True},
                           ml_method, {}, [], 'enc1', 'ml1')

    # the instruction expects the fitted vectorizer/scaler in its own result directory
    PathBuilder.build(path / 'result/instr1/')
    shutil.copy(path / 'dict_vectorizer.pickle', path / 'result/instr1/dict_vectorizer.pickle')
    shutil.copy(path / 'scaler.pickle', path / 'result/instr1/scaler.pickle')

    ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
    ml_app.run(path / 'result/')

    predictions_path = path / "result/instr1/predictions.csv"
    self.assertTrue(os.path.isfile(predictions_path))

    df = pd.read_csv(predictions_path)
    self.assertEqual(50, df.shape[0])

    shutil.rmtree(path)
def test_encode(self):
    """AtchleyKmerEncoder yields a (repertoires, kmers, features) tensor with a zero
    in the final feature slot of the first example."""
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "atchley_kmer_encoding/")
    dataset = RandomDatasetGenerator.generate_repertoire_dataset(3, {1: 1}, {4: 1},
                                                                 {"l1": {True: 0.4, False: 0.6}},
                                                                 path / "dataset")

    encoder = AtchleyKmerEncoder.build_object(dataset, **{"k": 2,
                                                          "skip_first_n_aa": 1,
                                                          "skip_last_n_aa": 1,
                                                          "abundance": "RELATIVE_ABUNDANCE",
                                                          "normalize_all_features": False})
    encoded_dataset = encoder.encode(dataset,
                                     EncoderParams(path / "result",
                                                   LabelConfiguration(labels=[Label("l1")])))

    self.assertEqual((3, 11, 3), encoded_dataset.encoded_data.examples.shape)
    self.assertEqual(0., encoded_dataset.encoded_data.examples[0, -1, 0])

    shutil.rmtree(path)
def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: Path) -> MLApplicationInstruction:
    """Validate an MLApplication instruction spec and build the instruction object."""
    location = MLApplicationParser.__name__

    ParameterValidator.assert_keys(instruction.keys(),
                                   ['type', 'dataset', 'number_of_processes', 'config_path',
                                    'store_encoded_data'],
                                   location, key)
    ParameterValidator.assert_in_valid_list(instruction['dataset'],
                                            symbol_table.get_keys_by_type(SymbolType.DATASET),
                                            location, f"{key}: dataset")
    ParameterValidator.assert_type_and_value(instruction['number_of_processes'], int, location,
                                             f"{key}: number_of_processes", min_inclusive=1)
    ParameterValidator.assert_type_and_value(instruction['config_path'], str, location,
                                             f'{key}: config_path')
    ParameterValidator.assert_type_and_value(instruction['store_encoded_data'], bool, location,
                                             f'{key}: store_encoded_data')

    hp_setting, label = self._parse_hp_setting(instruction, path, key)

    # use a distinct name for the result rather than rebinding the input dict
    parsed_instruction = MLApplicationInstruction(
        dataset=symbol_table.get(instruction['dataset']),
        name=key,
        number_of_processes=instruction['number_of_processes'],
        label_configuration=LabelConfiguration([label]),
        hp_setting=hp_setting,
        store_encoded_data=instruction['store_encoded_data'])

    return parsed_instruction
def test_run(self):
    """DataEncoder.run with a Word2Vec encoder returns the repertoire dataset with one
    encoded example per repertoire."""
    path = EnvironmentSettings.root_path / "test/tmp/dataencoder/"
    PathBuilder.build(path)

    rep1 = Repertoire.build_from_sequence_objects([ReceptorSequence("AAA", identifier="1")],
                                                  metadata={"l1": 1, "l2": 2}, path=path)
    rep2 = Repertoire.build_from_sequence_objects([ReceptorSequence("ATA", identifier="2")],
                                                  metadata={"l1": 0, "l2": 3}, path=path)

    label_configuration = LabelConfiguration()
    label_configuration.add_label("l1", [1, 2])
    label_configuration.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])
    encoder = Word2VecEncoder.build_object(dataset, **{"k": 3,
                                                       "model_type": ModelType.SEQUENCE.name,
                                                       "vector_size": 6})

    encoder_params = EncoderParams(model={}, pool_size=2, label_config=label_configuration,
                                   result_path=path, filename="dataset.csv")
    res = DataEncoder.run(DataEncoderParams(dataset=dataset,
                                            encoder=encoder,
                                            encoder_params=encoder_params,
                                            store_encoded_data=False))

    self.assertTrue(isinstance(res, RepertoireDataset))
    self.assertTrue(res.encoded_data.examples.shape[0] == 2)

    shutil.rmtree(path)
def _encode_dataset(self, dataset, path, learn_model: bool = True):
    """Encode the repertoire dataset by the frequency of 3-mers under a binary
    "disease" label, writing results to path / "encoded"."""
    encoder = KmerFrequencyEncoder.build_object(dataset, **{
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "k": 3,
        'sequence_type': SequenceType.AMINO_ACID.name})

    label_configuration = LabelConfiguration()
    label_configuration.add_label("disease", [True, False])

    params = EncoderParams(result_path=path / "encoded",
                           label_config=label_configuration,
                           learn_model=learn_model,
                           model={})
    return encoder.encode(dataset, params)
def test_encode(self):
    """Word2VecEncoder produces one vector per repertoire (size 16) and carries the
    T1D label; the built object is a W2VRepertoireEncoder."""
    test_path = EnvironmentSettings.root_path / "test/tmp/w2v/"
    PathBuilder.build(test_path)

    sequence1 = ReceptorSequence("CASSVFA", identifier="1")
    sequence2 = ReceptorSequence("CASSCCC", identifier="2")

    rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2], test_path,
                                                  {"T1D": "T1D", "subject_id": "1"})
    rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                  {"T1D": "CTL", "subject_id": "2"})

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    label_configuration = LabelConfiguration()
    label_configuration.add_label("T1D", ["T1D", "CTL"])

    config_params = EncoderParams(model={}, learn_model=True, result_path=test_path,
                                  label_config=label_configuration, filename="dataset.pkl")

    encoder = Word2VecEncoder.build_object(dataset, **{"k": 3,
                                                       "model_type": "sequence",
                                                       "vector_size": 16})
    encoded_dataset = encoder.encode(dataset=dataset, params=config_params)

    self.assertIsNotNone(encoded_dataset.encoded_data)
    self.assertTrue(encoded_dataset.encoded_data.examples.shape[0] == 2)
    self.assertTrue(encoded_dataset.encoded_data.examples.shape[1] == 16)
    self.assertTrue(len(encoded_dataset.encoded_data.labels["T1D"]) == 2)
    self.assertTrue(encoded_dataset.encoded_data.labels["T1D"][0] == "T1D")
    self.assertTrue(isinstance(encoder, W2VRepertoireEncoder))

    shutil.rmtree(test_path)
def _construct_test_dataset(self, path, dataset_size: int = 50):
    """Build a fixed two-receptor ReceptorDataset plus a LabelConfiguration for label l1.

    NOTE(review): `dataset_size` is accepted but not used — exactly two receptors are
    always created; kept in the signature for caller compatibility.

    Returns (dataset, label_configuration).
    """
    receptors = [TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAAA"),
                              beta=ReceptorSequence(amino_acid_sequence="ATA"),
                              metadata={"l1": 1}, identifier=str("1")),
                 TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="ATA"),
                              beta=ReceptorSequence(amino_acid_sequence="ATT"),
                              metadata={"l1": 2}, identifier=str("2"))]

    PathBuilder.build(path)

    label_configuration = LabelConfiguration()
    label_configuration.add_label("l1", [1, 2])

    dataset = ReceptorDataset.build(receptors, 2, path)
    return dataset, label_configuration
def test(self):
    """End-to-end receptor classification: the optimal assessment item reaches perfect
    performance on this separable synthetic dataset."""
    path = EnvironmentSettings.tmp_test_path / "integration_receptor_classification/"
    dataset = self.create_dataset(path)

    os.environ["cache_type"] = "test"

    encoder_params = {"normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
                      "reads": ReadsType.UNIQUE.name,
                      "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
                      "sequence_type": SequenceType.AMINO_ACID.name,
                      "k": 3}

    hp_setting = HPSetting(encoder=KmerFrequencyEncoder.build_object(dataset, **encoder_params),
                           encoder_params=encoder_params,
                           ml_method=LogisticRegression(),
                           ml_params={"model_selection_cv": False, "model_selection_n_folds": -1},
                           preproc_sequence=[])

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])

    instruction = TrainMLModelInstruction(dataset, GridSearch([hp_setting]), [hp_setting],
                                          SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
                                          SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                          lc, path)
    state = instruction.run(result_path=path)

    # leftover debug print(vars(state)) removed
    self.assertEqual(1.0,
                     state.assessment_states[0].label_states["l1"]
                     .optimal_assessment_item.performance[state.optimization_metric.name.lower()])

    shutil.rmtree(path)
def test_encode_sequence(self):
    """IMGTKmerSequenceEncoder emits one k-mer per position with IMGT position suffixes,
    and returns None when k exceeds the sequence length."""
    def encode(seq, k):
        return IMGTKmerSequenceEncoder.encode_sequence(
            seq, EncoderParams(model={"k": k}, label_config=LabelConfiguration(), result_path=""))

    long_seq = ReceptorSequence("CASSPRERATYEQCASSPRERATYEQCASSPRERATYEQ", None, None)
    result = encode(long_seq, 3)

    expected_kmers = {
        'CAS///105', 'ASS///106', 'SSP///107', 'SPR///108', 'PRE///109', 'RER///110',
        'ERA///111', 'RAT///111.001', 'ATY///111.002', 'TYE///111.003', 'YEQ///111.004',
        'EQC///111.005', 'QCA///111.006', 'CAS///111.007', 'ASS///111.008', 'SSP///111.009',
        'SPR///111.01', 'PRE///111.011', 'RER///111.012', 'ERA///111.013', 'RAT///112.013',
        'ATY///112.012', 'TYE///112.011', 'YEQ///112.01', 'EQC///112.009', 'QCA///112.008',
        'CAS///112.007', 'ASS///112.006', 'SSP///112.005', 'SPR///112.004', 'PRE///112.003',
        'RER///112.002', 'ERA///112.001', 'RAT///112', 'ATY///113', 'TYE///114', 'YEQ///115'}
    self.assertEqual(expected_kmers, set(result))
    self.assertEqual(len(result), len(long_seq.get_sequence()) - 3 + 1)

    short_seq = ReceptorSequence("AHCDE", None, None)
    result = encode(short_seq, 3)
    self.assertEqual({'AHC///105', 'HCD///106', 'CDE///107'}, set(result))
    self.assertEqual(len(result), len(short_seq.get_sequence()) - 3 + 1)

    # k longer than the sequence yields no encoding
    self.assertEqual(encode(short_seq, 25), None)
def encode_dataset_by_kmer_freq(path_to_dataset_directory: str, result_path: str, metadata_path: str = None):
    """
    encodes the repertoire dataset using KmerFrequencyEncoder

    Arguments:
        path_to_dataset_directory (str): path to directory containing all repertoire files with
            .tsv extension in MiXCR format
        result_path (str): where to store the results
        metadata_path (str): csv file with columns "filename", "subject_id", "disease" which is
            filled by default if value of argument is None, otherwise any metadata csv file passed
            to the function, must include filename and subject_id columns, and an arbitrary
            disease column

    Returns:
        encoded dataset with encoded data in encoded_dataset.encoded_data.examples
    """
    path_to_dataset_directory = Path(path_to_dataset_directory)
    result_path = Path(result_path)

    if metadata_path is None:
        metadata_path = generate_random_metadata(path_to_dataset_directory, result_path)
    else:
        metadata_path = Path(metadata_path)

    loader = MiXCRImport()
    dataset = loader.import_dataset({
        "is_repertoire": True,
        "path": path_to_dataset_directory,
        "metadata_file": metadata_path,
        "region_type": "IMGT_CDR3",  # import_dataset in only cdr3
        "number_of_processes": 4,  # number of parallel processes for loading the data
        "result_path": result_path,
        "separator": "\t",
        "columns_to_load": ["cloneCount", "allVHitsWithScore", "allJHitsWithScore", "aaSeqCDR3", "nSeqCDR3"],
        "column_mapping": {
            "cloneCount": "counts",
            "allVHitsWithScore": "v_alleles",
            "allJHitsWithScore": "j_alleles"
        },
    }, "mixcr_dataset")

    # label that can be used for ML prediction - by default: "disease" with values True/False
    label_name = list(dataset.labels.keys())[0]

    encoder = KmerFrequencyEncoder.build_object(dataset, **{
        "normalization_type": "relative_frequency",  # encode repertoire by the relative frequency of k-mers in repertoire
        "reads": "unique",  # count each sequence only once, do not use clonal count
        "k": 2,  # k-mer length
        "sequence_type": "amino_acid",
        "sequence_encoding": "continuous_kmer"  # split each sequence in repertoire to overlapping k-mers
    })
    encoder_params = EncoderParams(
        result_path=result_path,
        label_config=LabelConfiguration([Label(label_name, dataset.labels[label_name])]))
    encoded_dataset = DataEncoder.run(DataEncoderParams(dataset, encoder, encoder_params))

    dataset_exporter = DesignMatrixExporter(dataset=encoded_dataset,
                                            result_path=result_path / "csv_exported",
                                            file_format='csv')
    dataset_exporter.generate_report()

    return encoded_dataset