Example #1
    def preprocess_dataset(dataset: Dataset,
                           preproc_sequence: list,
                           path: Path,
                           context: dict = None,
                           hp_setting: HPSetting = None) -> Dataset:
        if dataset is not None:
            if isinstance(preproc_sequence, list) and len(preproc_sequence) > 0:
                PathBuilder.build(path)
                tmp_dataset = dataset.clone() if context is None or "dataset" not in context else context["dataset"]

                for preprocessing in preproc_sequence:
                    tmp_dataset = preprocessing.process_dataset(tmp_dataset, path)

                if context is not None and "dataset" in context:
                    context["preprocessed_dataset"] = {str(hp_setting): tmp_dataset}
                    indices = [i for i in range(context["dataset"].get_example_count())
                               if context["dataset"].repertoires[i].identifier in dataset.get_example_ids()]
                    preprocessed_dataset = tmp_dataset.make_subset(indices, path, Dataset.PREPROCESSED)
                else:
                    preprocessed_dataset = tmp_dataset

                return preprocessed_dataset
            else:
                return dataset
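A minimal usage sketch of the function above, assuming hypothetical stub classes that implement only the duck-typed interface it relies on (clone() on the dataset, process_dataset(dataset, path) on each preprocessing step). With context=None, the function clones the dataset and applies each step in order:

    from pathlib import Path

    class StubDataset:
        # hypothetical stand-in exposing only what preprocess_dataset touches
        def __init__(self, sequences):
            self.sequences = sequences

        def clone(self):
            return StubDataset(list(self.sequences))

    class UppercaseStep:
        # hypothetical preprocessing honoring the process_dataset(dataset, path) contract
        def process_dataset(self, dataset, path):
            dataset.sequences = [s.upper() for s in dataset.sequences]
            return dataset

    result = preprocess_dataset(StubDataset(["casslq", "cassf"]), [UppercaseStep()], Path("./preproc"))
    print(result.sequences)  # ['CASSLQ', 'CASSF']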
Example #2
    def _update_split_configs(self, assessment: SplitConfig, selection: SplitConfig, dataset: Dataset) -> Tuple[SplitConfig, SplitConfig]:

        if assessment.split_strategy == SplitType.LOOCV:
            assessment.split_count = dataset.get_example_count()
            train_val_example_count = assessment.split_count - 1
        elif assessment.split_strategy == SplitType.K_FOLD:
            train_val_example_count = int(dataset.get_example_count() * (assessment.split_count - 1) / assessment.split_count)
        else:
            train_val_example_count = int(dataset.get_example_count() * assessment.training_percentage)

        if selection.split_strategy == SplitType.LOOCV:
            selection.split_count = train_val_example_count

        return assessment, selection
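The split arithmetic above is easiest to verify with concrete numbers; a short worked example (values chosen arbitrarily):

    example_count = 100

    # K_FOLD with split_count=5: training+validation gets (k - 1) / k of the examples
    train_val_k_fold = int(example_count * (5 - 1) / 5)   # 80

    # LOOCV: split_count becomes the example count, so training+validation is all but one
    train_val_loocv = example_count - 1                   # 99

    # any other strategy: a straight percentage split, e.g. training_percentage=0.7
    train_val_other = int(example_count * 0.7)            # 70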
Example #3
    def reshape(dataset: Dataset, labels=None):
        """
        Takes a 2D matrix of values from the encoded data and reshapes it to long format,
        retaining the column and row annotations. This is for ease of use in plotting the data.
        It is suggested that some sort of filtering is done first, otherwise the memory usage may explode, as
        the resulting data frame is of shape
        (matrix.shape[0] * matrix.shape[1], labels.shape[0] + feature_annotations.shape[1] + 1)
        """
        if labels is None:
            row_annotations = pd.DataFrame(dataset.encoded_data.labels)
        else:
            row_annotations = pd.DataFrame(dataset.get_metadata(labels, return_df=True))

        row_annotations["example_id"] = dataset.encoded_data.example_ids

        column_annotations = dataset.encoded_data.feature_annotations
        column_annotations["feature"] = dataset.encoded_data.feature_names

        matrix = dataset.encoded_data.examples
        matrix_1d = matrix.A.ravel()

        column_annotations = pd.concat([column_annotations]*matrix.shape[0], ignore_index=True)
        row_annotations = pd.DataFrame(row_annotations.values.repeat(matrix.shape[1], axis=0), columns=row_annotations.columns)
        data = pd.concat([row_annotations.reset_index(drop=True), column_annotations.reset_index(drop=True), pd.DataFrame({"value": matrix_1d})], axis=1)

        for column in data.columns:
            data[column] = pd.to_numeric(data[column], errors="ignore")

        return data
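The repeat/tile/ravel combination above is easier to follow on a tiny dense matrix; a self-contained sketch with made-up annotation names:

    import numpy as np
    import pandas as pd

    matrix = np.arange(6).reshape(2, 3)               # 2 examples x 3 features
    rows = pd.DataFrame({"example_id": ["e1", "e2"]})
    cols = pd.DataFrame({"feature": ["f1", "f2", "f3"]})

    long_rows = pd.DataFrame(rows.values.repeat(matrix.shape[1], axis=0), columns=rows.columns)
    long_cols = pd.concat([cols] * matrix.shape[0], ignore_index=True)
    long_df = pd.concat([long_rows, long_cols, pd.DataFrame({"value": matrix.ravel()})], axis=1)

    print(long_df.shape)  # (6, 3): one row per (example, feature) pair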
Example #4
    def create_label_config(labels_dict: dict, dataset: Dataset,
                            instruction_name: str,
                            yaml_location: str) -> LabelConfiguration:
        LabelHelper.check_label_format(labels_dict, instruction_name,
                                       yaml_location)

        label_config = LabelConfiguration()
        for label in labels_dict:
            label_name = label if isinstance(label, str) else list(label.keys())[0]
            positive_class = label[label_name]['positive_class'] if isinstance(label, dict) else None
            if dataset.labels is not None and label_name in dataset.labels:
                label_values = dataset.labels[label_name]
            elif hasattr(dataset, "get_metadata"):
                label_values = list(set(dataset.get_metadata([label_name])[label_name]))
            else:
                label_values = []
                warnings.warn(
                    f"{instruction_name}: for {yaml_location}, label values could not be recovered for label "
                    f"{label}, using an empty list instead. This issue may occur due to improper loading of dataset "
                    f"{dataset.name}, and could cause problems with some encodings.")

            label_config.add_label(label_name, label_values, positive_class=positive_class)
        return label_config
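Although the parameter is annotated as dict, the loop above handles entries that are either plain strings or single-key dicts, matching how a YAML labels list parses. A hypothetical specification exercising both branches:

    labels_dict = [
        "age",                                    # plain name: positive_class stays None
        {"disease": {"positive_class": "sick"}},  # dict form pinning the positive class
    ]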
Example #5
    def _create_label_config(self, instruction: dict, dataset: Dataset,
                             instruction_key: str) -> LabelConfiguration:
        labels = instruction["labels"]

        self._check_label_format(labels, instruction_key)

        label_config = LabelConfiguration()
        for label in labels:
            label_name = label if isinstance(label, str) else list(label.keys())[0]
            positive_class = label[label_name]['positive_class'] if isinstance(label, dict) else None
            if dataset.labels is not None and label_name in dataset.labels:
                label_values = dataset.labels[label_name]
            elif hasattr(dataset, "get_metadata"):
                label_values = list(set(dataset.get_metadata([label_name])[label_name]))
            else:
                label_values = []
                warnings.warn(
                    f"{TrainMLModelParser.__name__}: for instruction {instruction_key}, label values could not be "
                    f"recovered for label {label}, using an empty list instead. This could cause problems with some "
                    f"encodings. If that might be the case, check if the dataset {dataset.name} has been properly loaded.")

            label_config.add_label(label_name, label_values, positive_class=positive_class)
        return label_config
Example #6
 def preprocess_dataset(dataset: Dataset, preproc_sequence: list, path: Path) -> Dataset:
     if dataset is not None:
         tmp_dataset = dataset.clone()
         if len(preproc_sequence) > 0:
             PathBuilder.build(path)
             for preprocessing in preproc_sequence:
                 tmp_dataset = preprocessing.process_dataset(tmp_dataset, path)
         return tmp_dataset
Example #7
    def _create_report(self, path):
        report = TrainingPerformance.build_object(name='testcase')

        report.train_dataset = Dataset()
        report.method, report.train_dataset.encoded_data = self._create_dummy_lr_model()
        report.label = Label("l1", values=[0, 1])
        report.result_path = path

        return report
Example #8
 def prepare_training_ids(dataset: Dataset, params: EncoderParams):
     PathBuilder.build(params.result_path)
     training_ids_path = params.result_path / "training_ids.pickle"
     if params.learn_model:
         training_ids = dataset.get_example_ids()
         with training_ids_path.open("wb") as file:
             pickle.dump(training_ids, file)
     else:
         with training_ids_path.open("rb") as file:
             training_ids = pickle.load(file)
     return training_ids
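A sketch of the two phases above, using SimpleNamespace as a hypothetical stand-in for EncoderParams (assuming it carries result_path and learn_model, as the code implies): with learn_model=True the ids are written to training_ids.pickle, and with learn_model=False the same ids are read back.

    from pathlib import Path
    from types import SimpleNamespace

    class StubDataset:
        # hypothetical dataset exposing only get_example_ids()
        def get_example_ids(self):
            return ["id_1", "id_2", "id_3"]

    params = SimpleNamespace(result_path=Path("./encoder_out"), learn_model=True)
    train_ids = prepare_training_ids(StubDataset(), params)  # writes training_ids.pickle

    params.learn_model = False
    assert prepare_training_ids(StubDataset(), params) == train_ids  # reads it back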
Example #9
    def _create_report(self, path):
        report = ROCCurve.build_object(name='testcase')

        report.method = self._create_dummy_lr_model()
        report.label = Label("l1")
        report.result_path = path
        report.test_dataset = Dataset()
        encoded_te = EncodedData(np.random.rand(100, 20),
                                 {"l1": [i % 2 for i in range(0, 100)]})

        report.test_dataset.encoded_data = encoded_te

        return report
Example #10
    def _create_report(self, path):
        report = MotifSeedRecovery.build_object(**{"implanted_motifs_per_label": {
            "l1": {"seeds": ["AAA", "A/AA"],
                   "hamming_distance": False,
                   "gap_sizes": [1]}}})

        report.method = self._create_dummy_lr_model(path)
        report.label = Label("l1")
        report.result_path = path
        report.train_dataset = Dataset()
        report.train_dataset.encoded_data = EncodedData(examples=np.zeros((1, 5)), labels={"l1": [1]}, encoding="KmerFrequencyEncoder",
                                                        feature_names=["AAA", "AAC", "CKJ", "KSA", "AKJ"])

        return report
Example #11
    def _create_report(self, path):
        report = Coefficients.build_object(**{"coefs_to_plot": [CoefficientPlottingSetting.ALL.name,
                                                                CoefficientPlottingSetting.NONZERO.name,
                                                                CoefficientPlottingSetting.CUTOFF.name,
                                                                CoefficientPlottingSetting.N_LARGEST.name],
                                              "cutoff": [10],
                                              "n_largest": [5]})

        report.method = self._create_dummy_lr_model(path)
        report.ml_details_path = path / "ml_details.yaml"
        report.label = "l1"
        report.result_path = path
        report.train_dataset = Dataset()
        report.train_dataset.encoded_data = EncodedData(examples=np.zeros((1, 20)), labels={"A": [1]},
                                                        feature_names=[f"feature{i}" for i in range(20)])

        return report
Example #12
    def _build_encoded_dataset(self, dataset: Dataset, scaled_examples, labels, params: EncoderParams):

        encoded_dataset = dataset.clone()

        label_names = params.label_config.get_labels_by_name()
        feature_names = [str(i) for i in range(scaled_examples.shape[1])]
        feature_annotations = pd.DataFrame({"feature": feature_names})

        encoded_data = EncodedData(examples=scaled_examples,
                                   labels={label: labels[i] for i, label in enumerate(label_names)} if labels is not None else None,
                                   example_ids=[example.identifier for example in encoded_dataset.get_data()],
                                   feature_names=feature_names,
                                   feature_annotations=feature_annotations,
                                   encoding=Word2VecEncoder.__name__)

        encoded_dataset.add_encoded_data(encoded_data)
        return encoded_dataset
Example #13
    def export(dataset: Dataset, path: Path):
        PathBuilder.build(path)
        exported_dataset = dataset.clone(keep_identifier=True)
        dataset_name = exported_dataset.name
        dataset_filename = f"{dataset_name}.iml_dataset"

        if isinstance(dataset, RepertoireDataset):
            repertoires_path = PathBuilder.build(path / "repertoires")
            exported_dataset.repertoires = ImmuneMLExporter._export_repertoires(dataset.repertoires, repertoires_path)
            exported_dataset.metadata_file = ImmuneMLExporter._export_metadata(dataset, path, dataset_filename, repertoires_path)
        elif isinstance(dataset, (SequenceDataset, ReceptorDataset)):
            exported_dataset.set_filenames(ImmuneMLExporter._export_receptors(exported_dataset.get_filenames(), path))

        file_path = path / dataset_filename
        with file_path.open("w") as file:
            yaml_dict = {
                **{key: ImmuneMLExporter._parse_val_for_export(val)
                   for key, val in vars(exported_dataset).items()
                   if key not in ['repertoires', 'element_generator', 'encoded_data']},
                'dataset_class': type(exported_dataset).__name__
            }
            yaml.dump(yaml_dict, file)

        version_path = path / "info.txt"
        with version_path.open("w") as file:
            file.writelines(f"immuneML_version: {Constants.VERSION}\n"
                            f"Python_version: {platform.python_version()}\n")

        return exported_dataset
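Reading off the code above, exporting a RepertoireDataset named d1 to path yields roughly this layout (the metadata file name comes from _export_metadata and is not visible in this snippet):

    path/
        d1.iml_dataset    # YAML: dataset attributes plus dataset_class
        info.txt          # immuneML_version and Python_version
        repertoires/      # files written by _export_repertoires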
Example #14
 def check_dataset_not_empty(self,
                             processed_dataset: Dataset,
                             location="Filter"):
     assert processed_dataset.get_example_count() > 0, f"{location}: {type(processed_dataset).__name__} ended up empty after filtering. " \
                                                       f"Please adjust filter settings."
Example #15
 def make_dataset(dataset: Dataset, indices,
                  input_params: DataSplitterParams, i: int,
                  dataset_type: str):
     path = Util.prepare_path(input_params, i)
     new_dataset = dataset.make_subset(indices, path, dataset_type)
     return new_dataset