    def create_dataset(self, path, dataset_size: int = 50):

        sequences = []

        for i in range(dataset_size):
            if i % 2 == 0:
                sequences.append(
                    ReceptorSequence(
                        amino_acid_sequence="AAACCC",
                        identifier=str(i),
                        metadata=SequenceMetadata(custom_params={"l1": 1})))
            else:
                sequences.append(
                    ReceptorSequence(
                        amino_acid_sequence="ACACAC",
                        identifier=str(i),
                        metadata=SequenceMetadata(custom_params={"l1": 2})))

        PathBuilder.build(path)
        filename = path / "sequences.pkl"
        with open(filename, "wb") as file:
            pickle.dump(sequences, file)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        dataset = SequenceDataset(labels={"l1": [1, 2]},
                                  filenames=[filename],
                                  identifier="d1")
        return dataset
    def test_run(self):
        path = EnvironmentSettings.tmp_test_path / "explanalysisproc/"
        PathBuilder.build(path)

        dataset = self.create_dataset(path)

        label_config = LabelConfiguration()
        label_config.add_label("l1", [0, 1])
        label_config.add_label("l2", [2, 3])

        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
        100a	TRA	AAAC	TRAV12	TRAJ1	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	                    
        """

        with open(path / "refs.tsv", "w") as file:
            file.write(file_content)

        refs_dict = {"path": path / "refs.tsv", "format": "VDJdb"}

        preproc_sequence = [SubjectRepertoireCollector()]

        units = {"named_analysis_1": ExploratoryAnalysisUnit(dataset=dataset, report=SequenceLengthDistribution(), number_of_processes=16),
                 "named_analysis_2": ExploratoryAnalysisUnit(dataset=dataset, report=SequenceLengthDistribution(),
                                                             preprocessing_sequence=preproc_sequence)}

        process = ExploratoryAnalysisInstruction(units, name="exp")
        process.run(path / "results/")

        self.assertTrue(units["named_analysis_1"].number_of_processes == 16)
        self.assertTrue(os.path.isfile(path / "results/exp/analysis_named_analysis_1/report/sequence_length_distribution.html"))
        self.assertTrue(os.path.isfile(path / "results/exp/analysis_named_analysis_2/report/sequence_length_distribution.html"))

        shutil.rmtree(path)
Example #3
    def test_encode_sequence(self):
        sequence = ReceptorSequence(
            amino_acid_sequence="AAA",
            metadata=SequenceMetadata(frame_type="OUT"))
        enc = IdentitySequenceEncoder()
        self.assertEqual(
            enc.encode_sequence(
                sequence,
                EncoderParams(model={},
                              label_config=LabelConfiguration(),
                              result_path="")), ["AAA"])

        sequence = ReceptorSequence(
            amino_acid_sequence="AAA",
            metadata=SequenceMetadata(frame_type="STOP"))
        enc = IdentitySequenceEncoder()
        self.assertEqual(
            enc.encode_sequence(
                sequence,
                EncoderParams(model={},
                              label_config=LabelConfiguration(),
                              result_path="")), ["AAA"])

        sequence = ReceptorSequence(amino_acid_sequence="AAA",
                                    metadata=SequenceMetadata(frame_type="IN"))
        enc = IdentitySequenceEncoder()
        self.assertEqual(["AAA"],
                         enc.encode_sequence(
                             sequence,
                             EncoderParams(model={},
                                           label_config=LabelConfiguration(),
                                           result_path="")))
    def create_dataset(self, path, dataset_size: int = 50):

        receptors = []

        seq1 = ReceptorSequence(amino_acid_sequence="ACACAC")
        seq2 = ReceptorSequence(amino_acid_sequence="DDDEEE")

        for i in range(dataset_size):
            if i % 2 == 0:
                receptors.append(
                    TCABReceptor(alpha=seq1,
                                 beta=seq1,
                                 metadata={"l1": 1},
                                 identifier=str(i)))
            else:
                receptors.append(
                    TCABReceptor(alpha=seq2,
                                 beta=seq2,
                                 metadata={"l1": 2},
                                 identifier=str(i)))

        PathBuilder.build(path)
        filename = path / "receptors.pkl"
        with open(filename, "wb") as file:
            pickle.dump(receptors, file)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        dataset = ReceptorDataset(labels={"l1": [1, 2]},
                                  filenames=[filename],
                                  identifier="d1")
        return dataset
Example #5
    def _construct_test_repertoiredataset(self, path, positional):
        receptors1 = ReceptorSequenceList()
        receptors2 = ReceptorSequenceList()

        if positional:
            for seq in [ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1"), ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1")]:
                receptors1.append(seq)
            receptors2.append(ReceptorSequence("TTTTTTTTTTTTT", identifier="1"))
        else:
            for seq in [ReceptorSequence("AAAA", identifier="1"), ReceptorSequence("ATA", identifier="2"), ReceptorSequence("ATA", identifier="3")]:
                receptors1.append(seq)
            for seq in [ReceptorSequence("ATA", identifier="1"), ReceptorSequence("TAA", identifier="2")]:
                receptors2.append(seq)

        rep1 = Repertoire.build_from_sequence_objects(receptors1,
                                                      metadata={"l1": 1, "l2": 2, "subject_id": "1"}, path=path)

        rep2 = Repertoire.build_from_sequence_objects(receptors2,
                                                      metadata={"l1": 0, "l2": 3, "subject_id": "2"}, path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        return dataset, lc
Example #6
    def _construct_test_dataset(self, path):
        sequences = [
            ReceptorSequence(amino_acid_sequence="AAAA",
                             identifier="1",
                             metadata=SequenceMetadata(custom_params={
                                 "l1": 1,
                                 "l2": 1
                             })),
            ReceptorSequence(amino_acid_sequence="ATA",
                             identifier="2",
                             metadata=SequenceMetadata(custom_params={
                                 "l1": 2,
                                 "l2": 1
                             })),
            ReceptorSequence(amino_acid_sequence="ATT",
                             identifier="3",
                             metadata=SequenceMetadata(custom_params={
                                 "l1": 1,
                                 "l2": 2
                             }))
        ]

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [1, 2])

        dataset = SequenceDataset.build(sequences=sequences,
                                        file_size=10,
                                        path=path)

        return dataset, lc
Example #7
    def test(self):

        path = EnvironmentSettings.tmp_test_path / "integration_sequence_classification/"
        dataset = RandomDatasetGenerator.generate_sequence_dataset(50, {4: 1}, {'l1': {1: 0.5, 2: 0.5}}, path / 'data')

        os.environ["cache_type"] = "test"
        encoder_params = {
            "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
            "reads": ReadsType.UNIQUE.name,
            "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
            "sequence_type": SequenceType.AMINO_ACID.name,
            "k": 3
        }

        hp_setting = HPSetting(encoder=KmerFrequencyEncoder.build_object(dataset, **encoder_params), encoder_params=encoder_params,
                               ml_method=LogisticRegression(), ml_params={"model_selection_cv": False, "model_selection_n_folds": -1},
                               preproc_sequence=[])

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        instruction = TrainMLModelInstruction(dataset, GridSearch([hp_setting]), [hp_setting],
                                              SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
                                              SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
                                              {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, lc, path)

        result = instruction.run(result_path=path)

        shutil.rmtree(path)
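
For intuition, the CONTINUOUS_KMER setting with k=3 used above decomposes each sequence into overlapping 3-mers before their frequencies are computed. A minimal sketch of that decomposition (plain Python, not immuneML's implementation):

def continuous_kmers(sequence: str, k: int = 3) -> list:
    # overlapping k-mers: a sequence of length n yields n - k + 1 of them
    return [sequence[i:i + k] for i in range(len(sequence) - k + 1)]

# continuous_kmers("AAACCC") -> ['AAA', 'AAC', 'ACC', 'CCC']
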
Example #8
    def _create_label_config(self, instruction: dict, dataset: Dataset,
                             instruction_key: str) -> LabelConfiguration:
        labels = instruction["labels"]

        self._check_label_format(labels, instruction_key)

        label_config = LabelConfiguration()
        for label in labels:
            label_name = label if isinstance(label, str) else list(
                label.keys())[0]
            positive_class = label[label_name]['positive_class'] if isinstance(
                label, dict) else None
            if dataset.labels is not None and label_name in dataset.labels:
                label_values = dataset.labels[label_name]
            elif hasattr(dataset, "get_metadata"):
                label_values = list(
                    set(dataset.get_metadata([label_name])[label_name]))
            else:
                label_values = []
                warnings.warn(
                    f"{TrainMLModelParser.__name__}: for instruction {instruction_key}, label values could not be recovered for label "
                    f"{label}, using empty list instead.  This could cause problems with some encodings. "
                    f"If that might be the case, check if the dataset {dataset.name} has been properly loaded."
                )

            label_config.add_label(label_name,
                                   label_values,
                                   positive_class=positive_class)
        return label_config
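
The labels entry consumed above accepts either plain label names or one-key dicts that add a positive class. A sketch of both accepted shapes (values hypothetical, for illustration only):

labels = [
    "l1",                                   # name only: values are recovered from the dataset
    {"l2": {"positive_class": "diseased"}}  # name plus an explicit positive class
]
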
Example #9
    def create_label_config(labels_dict: dict, dataset: Dataset,
                            instruction_name: str,
                            yaml_location: str) -> LabelConfiguration:
        LabelHelper.check_label_format(labels_dict, instruction_name,
                                       yaml_location)

        label_config = LabelConfiguration()
        for label in labels_dict:
            label_name = label if isinstance(label, str) else list(
                label.keys())[0]
            positive_class = label[label_name]['positive_class'] if isinstance(
                label, dict) else None
            if dataset.labels is not None and label_name in dataset.labels:
                label_values = dataset.labels[label_name]
            elif hasattr(dataset, "get_metadata"):
                label_values = list(
                    set(dataset.get_metadata([label_name])[label_name]))
            else:
                label_values = []
                warnings.warn(
                    f"{instruction_name}: for {yaml_location}, label values could not be recovered for label "
                    f"{label}, using empty list instead. This issue may occur due to improper loading of dataset {dataset.name},"
                    f"and could cause problems with some encodings.")

            label_config.add_label(label_name,
                                   label_values,
                                   positive_class=positive_class)
        return label_config
    def test(self):

        receptors = [
            TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAACCC"),
                         beta=ReceptorSequence(amino_acid_sequence="AAACCC"),
                         identifier="1"),
            TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAA"),
                         beta=ReceptorSequence(amino_acid_sequence="CCC"),
                         identifier="2"),
            TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAACCC"),
                         beta=ReceptorSequence(amino_acid_sequence="AAACCC"),
                         identifier="3"),
            TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAA"),
                         beta=ReceptorSequence(amino_acid_sequence="CCC"),
                         identifier="4")
        ]

        path = EnvironmentSettings.tmp_test_path / "kmer_receptor_frequency/"
        PathBuilder.build(path / 'data')
        dataset = ReceptorDataset.build_from_objects(receptors,
                                                     path=path,
                                                     file_size=10)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        encoder = KmerFreqReceptorEncoder.build_object(
            dataset, **{
                "normalization_type":
                NormalizationType.RELATIVE_FREQUENCY.name,
                "reads": ReadsType.UNIQUE.name,
                "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
                "sequence_type": SequenceType.AMINO_ACID.name,
                "k": 3
            })

        encoded_dataset = encoder.encode(
            dataset,
            EncoderParams(result_path=path / "2/",
                          label_config=lc,
                          pool_size=2,
                          learn_model=True,
                          model={},
                          filename="dataset.csv",
                          encode_labels=False))

        self.assertEqual(4, encoded_dataset.encoded_data.examples.shape[0])
        self.assertTrue(
            all(identifier in encoded_dataset.encoded_data.example_ids
                for identifier in ['1', '2', '3', '4']))
        self.assertTrue(
            numpy.array_equal(encoded_dataset.encoded_data.examples[0].A,
                              encoded_dataset.encoded_data.examples[2].A))
        self.assertTrue(
            all(feature_name in encoded_dataset.encoded_data.feature_names
                for feature_name in ["alpha_AAA", "alpha_AAC", "beta_CCC"]))

        shutil.rmtree(path)
Example #11
    def test_run(self):

        path = EnvironmentSettings.root_path / "test/tmp/smmodel/"
        PathBuilder.build(path)
        repertoires, metadata = RepertoireBuilder.build(
            [["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"],
             ["TTTT"], ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
             ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"],
             ["TTTT"], ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
             ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"],
             ["TTTT"], ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
             ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"],
             ["AAA", "CCC"], ["TTTT"], ["AAA", "CCC"], ["TTTT"]], path, {
                 "default": [
                     1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
                     2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2
                 ]
             })
        dataset = RepertoireDataset(repertoires=repertoires,
                                    labels={"default": [1, 2]},
                                    metadata_file=metadata)

        label_config = LabelConfiguration()
        label_config.add_label("default", [1, 2])

        hp_settings = [
            HPSetting(
                Word2VecEncoder.build_object(
                    dataset, **{
                        "vector_size": 8,
                        "model_type": ModelType.SEQUENCE.name,
                        "k": 3
                    }), {
                        "vector_size": 8,
                        "model_type": ModelType.SEQUENCE.name,
                        "k": 3
                    }, LogisticRegression(), {
                        "model_selection_cv": False,
                        "model_selection_n_folds": -1
                    }, [])
        ]

        split_config_assessment = SplitConfig(SplitType.RANDOM, 1, 0.5,
                                              ReportConfig())
        split_config_selection = SplitConfig(SplitType.RANDOM, 1, 0.5,
                                             ReportConfig())

        instruction = TrainMLModelInstruction(
            dataset, GridSearch(hp_settings), hp_settings,
            split_config_assessment, split_config_selection,
            {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, label_config,
            path)
        semantic_model = SemanticModel([instruction], path)

        semantic_model.run()

        shutil.rmtree(path)
    def test_run(self):
        path = EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
            [["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"],
             ["AA"], ["CC"], ["AA"], ["CC"]], path)[0])
        dataset.encoded_data = EncodedData(
            examples=np.array([[1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3],
                               [1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3]]),
            labels={
                "l1": [1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3],
                "l2": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
            })

        label_config = LabelConfiguration()
        label_config.add_label("l1", [1, 3])

        label = Label(name='l1', values=[1, 3])

        method1 = LogisticRegression()
        method1.fit(dataset.encoded_data, label=label)

        res = MLMethodAssessment.run(
            MLMethodAssessmentParams(
                dataset=dataset,
                method=method1,
                metrics={
                    Metric.ACCURACY, Metric.BALANCED_ACCURACY, Metric.F1_MACRO
                },
                optimization_metric=Metric.LOG_LOSS,
                predictions_path=EnvironmentSettings.root_path /
                "test/tmp/mlmethodassessment/predictions.csv",
                label=label,
                ml_score_path=EnvironmentSettings.root_path /
                "test/tmp/mlmethodassessment/ml_score.csv",
                split_index=1,
                path=EnvironmentSettings.root_path /
                "test/tmp/mlmethodassessment/"))

        self.assertTrue(isinstance(res, dict))
        self.assertTrue(res[Metric.LOG_LOSS.name.lower()] <= 0.1)

        self.assertTrue(
            os.path.isfile(EnvironmentSettings.root_path /
                           "test/tmp/mlmethodassessment/ml_score.csv"))

        df = pd.read_csv(EnvironmentSettings.root_path /
                         "test/tmp/mlmethodassessment/ml_score.csv")
        self.assertTrue(df.shape[0] == 1)

        df = pd.read_csv(EnvironmentSettings.root_path /
                         "test/tmp/mlmethodassessment/predictions.csv")
        self.assertEqual(12, df.shape[0])

        shutil.rmtree(EnvironmentSettings.root_path /
                      "test/tmp/mlmethodassessment/")
Example #13
    def __init__(self, split_index: int, train_val_dataset, test_dataset, path: Path, label_configuration: LabelConfiguration):
        self.split_index = split_index
        self.train_val_dataset = train_val_dataset
        self.test_dataset = test_dataset
        self.path = path
        self.train_val_data_reports = []
        self.test_data_reports = []

        # computed
        self.label_states = {label: HPLabelState(label, label_configuration.get_auxiliary_labels(label))
                             for label in label_configuration.get_labels_by_name()}
Example #14
    def test_run(self):
        path = EnvironmentSettings.tmp_test_path / "explanalysisprocintegration/"
        PathBuilder.build(path)
        os.environ["cache_type"] = "test"

        dataset = self.create_dataset(path)

        label_config = LabelConfiguration()
        label_config.add_label("l1", [0, 1])
        label_config.add_label("l2", [2, 3])

        file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
        100a	TRA	AAAC	TRAV12	TRAJ1	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV
        """

        with open(path / "refs.tsv", "w") as file:
            file.write(file_content)

        refs = {
            "params": {
                "path": path / "refs.tsv",
                "region_type": "FULL_SEQUENCE"
            },
            "format": "VDJdb"
        }

        units = {
            "named_analysis_4":
            ExploratoryAnalysisUnit(
                dataset=dataset,
                report=DesignMatrixExporter(name='report', file_format='csv'),
                label_config=label_config,
                encoder=MatchedSequencesRepertoireEncoder.build_object(
                    dataset, **{
                        "max_edit_distance": 1,
                        "reference": refs
                    }))
        }

        process = ExploratoryAnalysisInstruction(units, name="exp")
        process.run(path / "results/")

        self.assertTrue(
            os.path.isfile(
                path /
                "results/exp/analysis_named_analysis_4/report/design_matrix.csv"
            ))

        shutil.rmtree(path)
Example #15
    def _encode_dataset(self, encoder, dataset, path, learn_model: bool = True):
        # encodes the repertoire by frequency of 3-mers
        lc = LabelConfiguration()
        lc.add_label("disease", [True, False])
        encoded_dataset = encoder.encode(
            dataset,
            EncoderParams(result_path=path / "encoded",
                          label_config=lc,
                          learn_model=learn_model,
                          model={}))
        return encoded_dataset
    def _run_test(self, compairr_path):

        path = EnvironmentSettings.tmp_test_path / "compairr_distance_encoder/"

        PathBuilder.build(path)

        dataset = self.create_dataset(path)

        enc = CompAIRRDistanceEncoder.build_object(dataset, **{"compairr_path": compairr_path,
                                                               "keep_compairr_input": True,
                                                               "differences": 0,
                                                               "indels": False,
                                                               "ignore_counts": False,
                                                               "threads": 8,
                                                               "ignore_genes": False})

        enc.set_context({"dataset": dataset})
        encoded = enc.encode(dataset, EncoderParams(result_path=path,
                                                    label_config=LabelConfiguration([Label("l1", [0, 1]), Label("l2", [2, 3])]),
                                                    pool_size=4, filename="dataset.pkl"))

        self.assertEqual(8, encoded.encoded_data.examples.shape[0])
        self.assertEqual(8, encoded.encoded_data.examples.shape[1])

        self.assertEqual(0, encoded.encoded_data.examples[0, 0])
        self.assertEqual(0, encoded.encoded_data.examples[1, 1])
        self.assertEqual(0, encoded.encoded_data.examples[0, 4])

        self.assertTrue(np.array_equal([1, 0, 1, 0, 1, 0, 1, 0], encoded.encoded_data.labels["l1"]))
        self.assertTrue(np.array_equal([2, 3, 2, 3, 2, 3, 3, 3], encoded.encoded_data.labels["l2"]))

        shutil.rmtree(path)
    def test_generate(self):
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "tcrdist_motif_discovery/")
        dataset_path = self._create_dataset(path)

        dataset = SingleLineReceptorImport.import_dataset({"path": dataset_path,
                                                           "result_path": path / "dataset/",
                                                           "separator": ",",
                                                           "columns_to_load": ["subject", "epitope", "count", "v_a_gene", "j_a_gene", "cdr3_a_aa",
                                                                               "v_b_gene", "j_b_gene", "cdr3_b_aa", "clone_id", "cdr3_a_nucseq",
                                                                               "cdr3_b_nucseq"],
                                                           "column_mapping": {
                                                               "cdr3_a_aa": "alpha_amino_acid_sequence",
                                                               "cdr3_b_aa": "beta_amino_acid_sequence",
                                                               "cdr3_a_nucseq": "alpha_nucleotide_sequence",
                                                               "cdr3_b_nucseq": "beta_nucleotide_sequence",
                                                               "v_a_gene": "alpha_v_gene",
                                                               "v_b_gene": "beta_v_gene",
                                                               "j_a_gene": "alpha_j_gene",
                                                               "j_b_gene": "beta_j_gene",
                                                               "clone_id": "identifier"
                                                           },
                                                           "receptor_chains": "TRA_TRB",
                                                           "region_type": "IMGT_CDR3",
                                                           "sequence_file_size": 50000,
                                                           "organism": "mouse"}, 'd1')

        dataset = TCRdistEncoder(8).encode(dataset, EncoderParams(path / "result", LabelConfiguration([Label("epitope")])))

        report = TCRdistMotifDiscovery(train_dataset=dataset, test_dataset=dataset, result_path=path / "report", name="report name", cores=8,
                                       positive_class_name="PA", min_cluster_size=3)
        report._generate()

        shutil.rmtree(path)
    def _prepare_optional_params(self, analysis: dict,
                                 symbol_table: SymbolTable,
                                 yaml_location: str) -> dict:

        params = {}
        dataset = symbol_table.get(analysis["dataset"])

        if "encoding" in analysis:
            params["encoder"] = symbol_table.get(
                analysis["encoding"]).build_object(
                    dataset,
                    **symbol_table.get_config(
                        analysis["encoding"])["encoder_params"])

            if "labels" in analysis:
                params["label_config"] = LabelHelper.create_label_config(
                    analysis["labels"], dataset,
                    ExploratoryAnalysisParser.__name__, yaml_location)
            else:
                params["label_config"] = LabelConfiguration()

        if "preprocessing_sequence" in analysis:
            params["preprocessing_sequence"] = symbol_table.get(
                analysis["preprocessing_sequence"])

        return params
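
The analysis dict consumed above comes from the parsed YAML specification. A sketch of its expected shape, with keys inferred from the accesses in the method and values hypothetical:

analysis = {
    "dataset": "d1",                        # symbol table key of the dataset to analyze
    "encoding": "my_kmer_encoding",         # optional; triggers construction of the encoder
    "labels": ["l1"],                       # optional; only read when "encoding" is present
    "preprocessing_sequence": "my_preproc"  # optional
}
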
Example #19
    def _parse_split_config(self, instruction_key, instruction: dict, split_key: str, symbol_table: SymbolTable, settings_count: int,
                            label_config: LabelConfiguration) -> SplitConfig:

        try:

            default_params = DefaultParamsLoader.load("instructions/", SplitConfig.__name__)
            report_config_input = self._prepare_report_config(instruction_key, instruction, split_key, symbol_table)
            instruction[split_key] = {**default_params, **instruction[split_key]}

            split_strategy = SplitType[instruction[split_key]["split_strategy"].upper()]
            training_percentage = float(instruction[split_key]["training_percentage"]) if split_strategy == SplitType.RANDOM else -1

            if split_strategy == SplitType.RANDOM and training_percentage == 1 and settings_count > 1:
                raise ValueError(f"{TrainMLModelParser.__name__}: all data under {instruction_key}/{split_key} was specified to be used for "
                                 f"training, but {settings_count} settings were specified for evaluation. Please define a test/validation set by "
                                 f"reducing the training percentage (e.g., to 0.7) or use only one hyperparameter setting to run the analysis.")

            if split_strategy == SplitType.STRATIFIED_K_FOLD and len(label_config.get_labels_by_name()) != 1:
                raise ValueError(f"{TrainMLModelParser.__name__}: Stratified k-fold cross-validation cannot be used when "
                                 f"{len(label_config.get_labels_by_name())} labels are specified. It support only one label (and multiple classes).")

            return SplitConfig(split_strategy=split_strategy,
                               split_count=int(instruction[split_key]["split_count"]),
                               training_percentage=training_percentage,
                               reports=ReportConfig(**report_config_input),
                               manual_config=ManualSplitConfig(**instruction[split_key]["manual_config"]) if "manual_config" in instruction[split_key] else None,
                               leave_one_out_config=LeaveOneOutConfig(**instruction[split_key]["leave_one_out_config"])
                               if "leave_one_out_config" in instruction[split_key] else None)

        except KeyError as key_error:
            raise KeyError(f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under {split_key}.")
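
A sketch of the split configuration block this parser consumes, merged over the defaults loaded for SplitConfig (key names taken from the accesses above; the "assessment" split key and all values are hypothetical):

instruction["assessment"] = {
    "split_strategy": "random",   # parsed into SplitType.RANDOM
    "split_count": 1,
    "training_percentage": 0.7,   # only used when split_strategy is random
    # optional "manual_config" and "leave_one_out_config" sub-blocks are parsed when present
}
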
Example #20
    def reeval_on_assessment_split(state, train_val_dataset: Dataset,
                                   test_dataset: Dataset,
                                   hp_setting: HPSetting, path: Path,
                                   label: Label, split_index: int):
        """retrain model for specific label, assessment split and hp_setting"""

        assessment_item = MLProcess(
            train_dataset=train_val_dataset,
            test_dataset=test_dataset,
            label=label,
            metrics=state.metrics,
            optimization_metric=state.optimization_metric,
            path=path,
            hp_setting=hp_setting,
            report_context=state.context,
            ml_reports=state.assessment.reports.model_reports.values(),
            number_of_processes=state.number_of_processes,
            encoding_reports=state.assessment.reports.encoding_reports.values(
            ),
            label_config=LabelConfiguration([label])).run(split_index)

        state.assessment_states[split_index].label_states[
            label.name].assessment_items[str(hp_setting)] = assessment_item

        return state
    def test_run(self):

        path = EnvironmentSettings.tmp_test_path / "mlapplicationtest/"
        PathBuilder.build(path)

        dataset = RandomDatasetGenerator.generate_repertoire_dataset(50, {5: 1}, {5: 1}, {"l1": {1: 0.5, 2: 0.5}}, path / 'dataset/')
        ml_method = LogisticRegression()
        encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY, ReadsType.UNIQUE, SequenceEncodingType.CONTINUOUS_KMER, 3,
                                            scale_to_zero_mean=True, scale_to_unit_variance=True)
        label_config = LabelConfiguration([Label("l1", [1, 2])])

        enc_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config, filename="tmp_enc_dataset.pickle", pool_size=4))
        ml_method.fit(enc_dataset.encoded_data, 'l1')

        hp_setting = HPSetting(encoder, {"normalization_type": "relative_frequency", "reads": "unique", "sequence_encoding": "continuous_kmer",
                                         "k": 3, "scale_to_zero_mean": True, "scale_to_unit_variance": True}, ml_method, {}, [], 'enc1', 'ml1')

        PathBuilder.build(path / 'result/instr1/')
        shutil.copy(path / 'dict_vectorizer.pickle', path / 'result/instr1/dict_vectorizer.pickle')
        shutil.copy(path / 'scaler.pickle', path / 'result/instr1/scaler.pickle')

        ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
        ml_app.run(path / 'result/')

        predictions_path = path / "result/instr1/predictions.csv"
        self.assertTrue(os.path.isfile(predictions_path))

        df = pd.read_csv(predictions_path)
        self.assertEqual(50, df.shape[0])

        shutil.rmtree(path)
    def test_encode(self):
        path = PathBuilder.build(EnvironmentSettings.tmp_test_path /
                                 "atchley_kmer_encoding/")
        dataset = RandomDatasetGenerator.generate_repertoire_dataset(
            3, {1: 1}, {4: 1}, {"l1": {
                True: 0.4,
                False: 0.6
            }}, path / "dataset")

        encoder = AtchleyKmerEncoder.build_object(
            dataset, **{
                "k": 2,
                "skip_first_n_aa": 1,
                "skip_last_n_aa": 1,
                "abundance": "RELATIVE_ABUNDANCE",
                "normalize_all_features": False
            })
        encoded_dataset = encoder.encode(
            dataset,
            EncoderParams(path / "result",
                          LabelConfiguration(labels=[Label("l1")])))

        self.assertEqual((3, 11, 3),
                         encoded_dataset.encoded_data.examples.shape)
        self.assertEqual(0., encoded_dataset.encoded_data.examples[0, -1, 0])

        shutil.rmtree(path)
Example #23
    def parse(self, key: str, instruction: dict, symbol_table: SymbolTable,
              path: Path) -> MLApplicationInstruction:
        location = MLApplicationParser.__name__
        ParameterValidator.assert_keys(instruction.keys(), [
            'type', 'dataset', 'number_of_processes', 'config_path',
            'store_encoded_data'
        ], location, key)
        ParameterValidator.assert_in_valid_list(
            instruction['dataset'],
            symbol_table.get_keys_by_type(SymbolType.DATASET), location,
            f"{key}: dataset")
        ParameterValidator.assert_type_and_value(
            instruction['number_of_processes'],
            int,
            location,
            f"{key}: number_of_processes",
            min_inclusive=1)
        ParameterValidator.assert_type_and_value(instruction['config_path'],
                                                 str, location,
                                                 f'{key}: config_path')
        ParameterValidator.assert_type_and_value(
            instruction['store_encoded_data'], bool, location,
            f'{key}: store_encoded_data')

        hp_setting, label = self._parse_hp_setting(instruction, path, key)

        instruction = MLApplicationInstruction(
            dataset=symbol_table.get(instruction['dataset']),
            name=key,
            number_of_processes=instruction['number_of_processes'],
            label_configuration=LabelConfiguration([label]),
            hp_setting=hp_setting,
            store_encoded_data=instruction['store_encoded_data'])

        return instruction
Example #24
    def test_run(self):
        path = EnvironmentSettings.root_path / "test/tmp/dataencoder/"
        PathBuilder.build(path)

        rep1 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAA", identifier="1")],
            metadata={
                "l1": 1,
                "l2": 2
            },
            path=path)

        rep2 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("ATA", identifier="2")],
            metadata={
                "l1": 0,
                "l2": 3
            },
            path=path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])
        lc.add_label("l2", [0, 3])

        dataset = RepertoireDataset(repertoires=[rep1, rep2])
        encoder = Word2VecEncoder.build_object(
            dataset, **{
                "k": 3,
                "model_type": ModelType.SEQUENCE.name,
                "vector_size": 6
            })

        res = DataEncoder.run(
            DataEncoderParams(dataset=dataset,
                              encoder=encoder,
                              encoder_params=EncoderParams(
                                  model={},
                                  pool_size=2,
                                  label_config=lc,
                                  result_path=path,
                                  filename="dataset.csv"),
                              store_encoded_data=False))

        self.assertTrue(isinstance(res, RepertoireDataset))
        self.assertTrue(res.encoded_data.examples.shape[0] == 2)

        shutil.rmtree(path)
    def _encode_dataset(self, dataset, path, learn_model: bool = True):
        encoder = KmerFrequencyEncoder.build_object(dataset, **{
            "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
            "reads": ReadsType.UNIQUE.name,
            "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
            "k": 3,
            "sequence_type": SequenceType.AMINO_ACID.name
        })  # encodes the repertoire by frequency of 3-mers
        lc = LabelConfiguration()
        lc.add_label("disease", [True, False])
        encoded_dataset = encoder.encode(dataset, EncoderParams(
            result_path=path / "encoded",
            label_config=lc,
            learn_model=learn_model,
            model={}
        ))
        return encoded_dataset
Example #26
    def test_encode(self):

        test_path = EnvironmentSettings.root_path / "test/tmp/w2v/"

        PathBuilder.build(test_path)

        sequence1 = ReceptorSequence("CASSVFA", identifier="1")
        sequence2 = ReceptorSequence("CASSCCC", identifier="2")

        metadata1 = {"T1D": "T1D", "subject_id": "1"}
        rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2],
                                                      test_path, metadata1)

        metadata2 = {"T1D": "CTL", "subject_id": "2"}
        rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path,
                                                      metadata2)

        dataset = RepertoireDataset(repertoires=[rep1, rep2])

        label_configuration = LabelConfiguration()
        label_configuration.add_label("T1D", ["T1D", "CTL"])

        config_params = EncoderParams(model={},
                                      learn_model=True,
                                      result_path=test_path,
                                      label_config=label_configuration,
                                      filename="dataset.pkl")

        encoder = Word2VecEncoder.build_object(
            dataset, **{
                "k": 3,
                "model_type": "sequence",
                "vector_size": 16
            })

        encoded_dataset = encoder.encode(dataset=dataset, params=config_params)

        self.assertIsNotNone(encoded_dataset.encoded_data)
        self.assertTrue(encoded_dataset.encoded_data.examples.shape[0] == 2)
        self.assertTrue(encoded_dataset.encoded_data.examples.shape[1] == 16)
        self.assertTrue(len(encoded_dataset.encoded_data.labels["T1D"]) == 2)
        self.assertTrue(encoded_dataset.encoded_data.labels["T1D"][0] == "T1D")
        self.assertTrue(isinstance(encoder, W2VRepertoireEncoder))

        shutil.rmtree(test_path)
    def _construct_test_dataset(self, path, dataset_size: int = 50):
        receptors = [
            TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAAA"),
                         beta=ReceptorSequence(amino_acid_sequence="ATA"),
                         metadata={"l1": 1},
                         identifier=str("1")),
            TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="ATA"),
                         beta=ReceptorSequence(amino_acid_sequence="ATT"),
                         metadata={"l1": 2},
                         identifier=str("2"))
        ]

        PathBuilder.build(path)

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        dataset = ReceptorDataset.build(receptors, 2, path)
        return dataset, lc
    def test(self):

        path = EnvironmentSettings.tmp_test_path / "integration_receptor_classification/"
        dataset = self.create_dataset(path)

        os.environ["cache_type"] = "test"

        encoder_params = {
            "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
            "reads": ReadsType.UNIQUE.name,
            "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
            "sequence_type": SequenceType.AMINO_ACID.name,
            "k": 3
        }

        hp_setting = HPSetting(encoder=KmerFrequencyEncoder.build_object(
            dataset, **encoder_params),
                               encoder_params=encoder_params,
                               ml_method=LogisticRegression(),
                               ml_params={
                                   "model_selection_cv": False,
                                   "model_selection_n_folds": -1
                               },
                               preproc_sequence=[])

        lc = LabelConfiguration()
        lc.add_label("l1", [1, 2])

        instruction = TrainMLModelInstruction(
            dataset, GridSearch([hp_setting]), [hp_setting],
            SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
            SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
            {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY, lc, path)

        state = instruction.run(result_path=path)
        print(vars(state))

        self.assertEqual(
            1.0, state.assessment_states[0].label_states["l1"].
            optimal_assessment_item.performance[
                state.optimization_metric.name.lower()])

        shutil.rmtree(path)
    def test_encode_sequence(self):
        sequence = ReceptorSequence("CASSPRERATYEQCASSPRERATYEQCASSPRERATYEQ",
                                    None, None)
        result = IMGTKmerSequenceEncoder.encode_sequence(
            sequence,
            EncoderParams(model={"k": 3},
                          label_config=LabelConfiguration(),
                          result_path=""))

        self.assertEqual(
            {
                'CAS///105', 'ASS///106', 'SSP///107', 'SPR///108',
                'PRE///109', 'RER///110', 'ERA///111', 'RAT///111.001',
                'ATY///111.002', 'TYE///111.003', 'YEQ///111.004',
                'EQC///111.005', 'QCA///111.006', 'CAS///111.007',
                'ASS///111.008', 'SSP///111.009', 'SPR///111.01',
                'PRE///111.011', 'RER///111.012', 'ERA///111.013',
                'RAT///112.013', 'ATY///112.012', 'TYE///112.011',
                'YEQ///112.01', 'EQC///112.009', 'QCA///112.008',
                'CAS///112.007', 'ASS///112.006', 'SSP///112.005',
                'SPR///112.004', 'PRE///112.003', 'RER///112.002',
                'ERA///112.001', 'RAT///112', 'ATY///113', 'TYE///114',
                'YEQ///115'
            }, set(result))

        self.assertEqual(len(result), len(sequence.get_sequence()) - 3 + 1)

        sequence = ReceptorSequence("AHCDE", None, None)
        result = IMGTKmerSequenceEncoder.encode_sequence(
            sequence,
            EncoderParams(model={"k": 3},
                          label_config=LabelConfiguration(),
                          result_path=""))

        self.assertEqual({'AHC///105', 'HCD///106', 'CDE///107'}, set(result))

        self.assertEqual(len(result), len(sequence.get_sequence()) - 3 + 1)
        self.assertEqual(
            IMGTKmerSequenceEncoder.encode_sequence(
                sequence,
                EncoderParams(model={"k": 25},
                              label_config=LabelConfiguration(),
                              result_path="")), None)
Example #30
def encode_dataset_by_kmer_freq(path_to_dataset_directory: str, result_path: str, metadata_path: str = None):
    """
    encodes the repertoire dataset using KmerFrequencyEncoder

    Arguments:
        path_to_dataset_directory (str): path to directory containing all repertoire files with .tsv extension in MiXCR format
        result_path (str): where to store the results
        metadata_path(str): csv file with columns "filename", "subject_id", "disease" which is filled by default if value of argument is None,
            otherwise any metadata csv file passed to the function, must include filename and subject_id columns, and an arbitrary disease column
    Returns:
         encoded dataset with encoded data in encoded_dataset.encoded_data.examples
    """
    path_to_dataset_directory = Path(path_to_dataset_directory)
    result_path = Path(result_path)

    if metadata_path is None:
        metadata_path = generate_random_metadata(path_to_dataset_directory, result_path)
    else:
        metadata_path = Path(metadata_path)

    loader = MiXCRImport()
    dataset = loader.import_dataset({
        "is_repertoire": True,
        "path": path_to_dataset_directory,
        "metadata_file": metadata_path,
        "region_type": "IMGT_CDR3",  # import_dataset in only cdr3
        "number_of_processes": 4,  # number of parallel processes for loading the data
        "result_path": result_path,
        "separator": "\t",
        "columns_to_load": ["cloneCount", "allVHitsWithScore", "allJHitsWithScore", "aaSeqCDR3", "nSeqCDR3"],
        "column_mapping": {
            "cloneCount": "counts",
            "allVHitsWithScore": "v_alleles",
            "allJHitsWithScore": "j_alleles"
        },
    }, "mixcr_dataset")

    label_name = list(dataset.labels.keys())[0]  # label that can be used for ML prediction - by default: "disease" with values True/False

    encoded_dataset = DataEncoder.run(DataEncoderParams(dataset, KmerFrequencyEncoder.build_object(dataset, **{
        "normalization_type": "relative_frequency",  # encode repertoire by the relative frequency of k-mers in repertoire
        "reads": "unique",  # count each sequence only once, do not use clonal count
        "k": 2,  # k-mer length
        "sequence_type": "amino_acid",
        "sequence_encoding": "continuous_kmer"  # split each sequence in repertoire to overlapping k-mers
    }), EncoderParams(result_path=result_path,
                      label_config=LabelConfiguration([Label(label_name, dataset.labels[label_name])]))))

    dataset_exporter = DesignMatrixExporter(dataset=encoded_dataset,
                                            result_path=result_path / "csv_exported", file_format='csv')
    dataset_exporter.generate_report()

    return encoded_dataset
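
A usage sketch for the function above, assuming a directory of MiXCR .tsv exports (both paths hypothetical):

encoded_dataset = encode_dataset_by_kmer_freq(path_to_dataset_directory="./mixcr_exports/",
                                              result_path="./encoding_results/",
                                              metadata_path=None)  # metadata generated with default values when None
print(encoded_dataset.encoded_data.examples.shape)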