    def process_dataset(self,
                        dataset: RepertoireDataset,
                        result_path: Path = None):
        self.check_dataset_type(dataset, [RepertoireDataset],
                                "ClonesPerRepertoireFilter")
        self.result_path = result_path if result_path is not None else self.result_path

        processed_dataset = dataset.clone()
        repertoires, indices = [], []

        for index, repertoire in enumerate(dataset.get_data()):
            # skip repertoires whose clone count falls outside the configured
            # range; a limit of -1 disables that bound
            if self.lower_limit != -1 and len(repertoire.sequences) < self.lower_limit:
                continue
            if self.upper_limit != -1 and len(repertoire.sequences) > self.upper_limit:
                continue
            repertoires.append(dataset.repertoires[index])
            indices.append(index)

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = self._build_new_metadata(
            dataset, indices)

        self.check_dataset_not_empty(processed_dataset,
                                     "ClonesPerRepertoireFilter")

        return processed_dataset
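For reference, the keep/skip rule above can be run stand-alone. A minimal sketch, assuming a simple stand-in for immuneML's Repertoire (FakeRepertoire and filter_by_clone_count are illustrative names, not library API):

from dataclasses import dataclass

@dataclass
class FakeRepertoire:  # illustrative stand-in for immuneML's Repertoire
    sequences: list

def filter_by_clone_count(repertoires, lower_limit=-1, upper_limit=-1):
    # mirror of the filter above: -1 disables a bound, and a repertoire is
    # kept only if its clone count satisfies every enabled bound
    kept = []
    for rep in repertoires:
        if lower_limit != -1 and len(rep.sequences) < lower_limit:
            continue
        if upper_limit != -1 and len(rep.sequences) > upper_limit:
            continue
        kept.append(rep)
    return kept

reps = [FakeRepertoire(["seq"] * n) for n in (5, 50, 500)]
print(len(filter_by_clone_count(reps, lower_limit=10, upper_limit=100)))  # prints 1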
Example 2
    def _merge_repertoires(self, dataset: RepertoireDataset):
        # collect the first repertoire seen per subject_id; when a second
        # repertoire for the same subject arrives, merge the two sequence sets
        rep_map = {}
        repertoires, indices_to_keep = [], []
        processed_dataset = dataset.clone()

        for index, repertoire in enumerate(processed_dataset.get_data()):
            subject_id = repertoire.metadata["subject_id"]
            if subject_id in rep_map:
                sequences = np.append(repertoire.sequences,
                                      rep_map[subject_id].sequences)
                del rep_map[subject_id]
                repertoires.append(self._store_repertoire(repertoire, sequences))
            else:
                rep_map[subject_id] = repertoire
                indices_to_keep.append(index)

        # subjects seen exactly once keep their original sequences
        for repertoire in rep_map.values():
            repertoires.append(self._store_repertoire(repertoire, repertoire.sequences))

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = self._build_new_metadata(
            dataset, indices_to_keep)

        return processed_dataset
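The pairing strategy above can be demonstrated without immuneML. A self-contained sketch (FakeRepertoire and merge_by_subject are illustrative stand-ins, not the library's classes):

import numpy as np

class FakeRepertoire:  # illustrative stand-in for immuneML's Repertoire
    def __init__(self, subject_id, sequences):
        self.metadata = {"subject_id": subject_id}
        self.sequences = np.array(sequences)

def merge_by_subject(repertoires):
    # same strategy as _merge_repertoires above: the first repertoire per
    # subject waits in rep_map; a second one triggers a merge of the two
    rep_map, merged = {}, []
    for rep in repertoires:
        subject_id = rep.metadata["subject_id"]
        if subject_id in rep_map:
            sequences = np.append(rep.sequences, rep_map.pop(subject_id).sequences)
            merged.append(FakeRepertoire(subject_id, sequences))
        else:
            rep_map[subject_id] = rep
    merged.extend(rep_map.values())  # subjects that occurred only once
    return merged

reps = [FakeRepertoire("s1", ["A"]), FakeRepertoire("s2", ["B"]), FakeRepertoire("s1", ["C"])]
print([len(r.sequences) for r in merge_by_subject(reps)])  # prints [2, 1]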

    @staticmethod
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        SubjectRepertoireCollector.check_dataset_type(dataset, [RepertoireDataset], "SubjectRepertoireCollector")

        rep_map = {}
        repertoires = []
        indices_to_keep = []

        processed_dataset = dataset.clone()
        PathBuilder.build(params["result_path"])

        for index, repertoire in enumerate(processed_dataset.get_data()):
            subject_id = repertoire.metadata["subject_id"]
            if subject_id in rep_map:
                # second repertoire for this subject: merge the sequence sets
                sequences = np.append(repertoire.sequences, rep_map[subject_id].sequences)
                del rep_map[subject_id]
                repertoires.append(SubjectRepertoireCollector.store_repertoire(
                    params["result_path"], repertoire, sequences))
            else:
                rep_map[subject_id] = repertoire
                indices_to_keep.append(index)

        for repertoire in rep_map.values():
            repertoires.append(SubjectRepertoireCollector.store_repertoire(
                params["result_path"], repertoire, repertoire.sequences))

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = SubjectRepertoireCollector.build_new_metadata(dataset, indices_to_keep, params["result_path"])

        return processed_dataset
Example 4
    @staticmethod
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        processed_dataset = dataset.clone()
        original_repertoires = processed_dataset.get_data()
        indices = MetadataRepertoireFilter.get_matching_indices(
            processed_dataset, params["criteria"])
        processed_dataset.repertoires = [
            original_repertoires[i] for i in indices
        ]
        processed_dataset.metadata_file = MetadataRepertoireFilter.build_new_metadata(
            dataset, indices, params["result_path"])

        Filter.check_dataset_not_empty(processed_dataset,
                                       "MetadataRepertoireFilter")

        return processed_dataset
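The index-matching step above works over the metadata table. A minimal sketch of the idea with a simplified, illustrative criteria format (immuneML's actual criteria use its CriteriaMatcher, so the dict shape here is an assumption):

def get_matching_indices(metadata_rows, criteria):
    # keep the indices of metadata rows whose column value equals the target
    return [i for i, row in enumerate(metadata_rows)
            if row.get(criteria["key"]) == criteria["value"]]

rows = [{"disease": "CMV"}, {"disease": "healthy"}, {"disease": "CMV"}]
print(get_matching_indices(rows, {"key": "disease", "value": "CMV"}))  # prints [0, 2]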

    @staticmethod
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        Preprocessor.check_dataset_type(dataset, [RepertoireDataset], "ClonesPerRepertoireFilter")
        processed_dataset = dataset.clone()
        repertoires = []
        indices = []
        for index, repertoire in enumerate(dataset.get_data()):
            # a repertoire is kept only when it satisfies every configured limit;
            # the original condition joined the two checks with `or`, which also
            # kept repertoires that met just one of two configured bounds
            if "lower_limit" in params and len(repertoire.sequences) < params["lower_limit"]:
                continue
            if "upper_limit" in params and len(repertoire.sequences) > params["upper_limit"]:
                continue
            repertoires.append(dataset.repertoires[index])
            indices.append(index)
        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = ClonesPerRepertoireFilter.build_new_metadata(dataset, indices, params["result_path"])

        Filter.check_dataset_not_empty(processed_dataset, "ClonesPerRepertoireFilter")

        return processed_dataset

    def process_dataset(self, dataset: RepertoireDataset, result_path: Path = None):
        self.check_dataset_type(dataset, [RepertoireDataset],
                                "MetadataRepertoireFilter")
        self.result_path = result_path if result_path is not None else self.result_path

        processed_dataset = dataset.clone()
        original_repertoires = processed_dataset.get_data()
        indices = self._get_matching_indices(processed_dataset)
        processed_dataset.repertoires = [
            original_repertoires[i] for i in indices
        ]
        processed_dataset.metadata_file = self._build_new_metadata(
            dataset, indices)

        self.check_dataset_not_empty(processed_dataset,
                                     "MetadataRepertoireFilter")

        return processed_dataset

    def encode(self, dataset: RepertoireDataset, params: EncoderParams) -> RepertoireDataset:
        train_repertoire_ids = EncoderHelper.prepare_training_ids(
            dataset, params)
        labels = self.build_labels(dataset,
                                   params) if params.encode_labels else None

        # pairwise distance matrix: one row per repertoire in the dataset,
        # one column per training repertoire
        distance_matrix = self.build_distance_matrix(dataset, params,
                                                     train_repertoire_ids)

        encoded_dataset = dataset.clone()
        encoded_dataset.encoded_data = EncodedData(
            examples=distance_matrix.to_numpy(),
            labels=labels,
            feature_names=distance_matrix.columns.values,
            example_ids=distance_matrix.index.values,
            encoding=CompAIRRDistanceEncoder.__name__)
        return encoded_dataset
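The final step packs the distance matrix into an encoded-data container. A minimal sketch of that construction (FakeEncodedData is an illustrative stand-in, not immuneML's EncodedData class):

import numpy as np

class FakeEncodedData:  # illustrative stand-in for immuneML's EncodedData
    def __init__(self, examples, feature_names, example_ids, encoding):
        self.examples = examples          # one row of distances per repertoire
        self.feature_names = feature_names
        self.example_ids = example_ids
        self.encoding = encoding

ids = ["rep1", "rep2", "rep3"]
distances = np.array([[0.0, 0.4, 0.9],
                      [0.4, 0.0, 0.7],
                      [0.9, 0.7, 0.0]])
encoded = FakeEncodedData(examples=distances, feature_names=ids,
                          example_ids=ids, encoding="CompAIRRDistanceEncoder")
print(encoded.examples.shape)  # prints (3, 3)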
Example 8
    @staticmethod
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        processed_dataset = dataset.clone()
        PathBuilder.build(params["result_path"])
        repertoires = []
        indices = []
        keep_chain = Chain.get_chain(params["keep_chain"])  # resolve once, outside the loop
        for index, repertoire in enumerate(dataset.get_data()):
            # keep only repertoires in which every sequence lies on the requested chain
            if all(sequence.metadata.chain == keep_chain
                   for sequence in repertoire.sequences):
                repertoires.append(repertoire)
                indices.append(index)

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = ChainRepertoireFilter.build_new_metadata(
            processed_dataset, indices, params["result_path"])

        Filter.check_dataset_not_empty(processed_dataset,
                                       "ChainRepertoireFilter")

        return processed_dataset
Example 9
    def process_dataset(self, dataset: RepertoireDataset, result_path: Path = None):
        self.check_dataset_type(dataset, [RepertoireDataset],
                                "ChainRepertoireFilter")
        processed_dataset = dataset.clone()
        self.result_path = result_path if result_path is not None else self.result_path

        repertoires = []
        indices = []
        for index, repertoire in enumerate(dataset.get_data()):
            if all(sequence.metadata.chain == self.keep_chain
                   for sequence in repertoire.sequences):
                repertoires.append(repertoire)
                indices.append(index)

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = self._build_new_metadata(
            processed_dataset, indices)

        self.check_dataset_not_empty(processed_dataset,
                                     "ChainRepertoireFilter")

        return processed_dataset
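The all-sequences-on-one-chain rule used by both ChainRepertoireFilter variants can be run stand-alone. A self-contained sketch (the enum and the Fake* classes are illustrative stand-ins for the immuneML types):

from dataclasses import dataclass
from enum import Enum

class Chain(Enum):  # illustrative stand-in for immuneML's Chain
    ALPHA = "alpha"
    BETA = "beta"

@dataclass
class FakeSequenceMetadata:
    chain: Chain

@dataclass
class FakeSequence:
    metadata: FakeSequenceMetadata

@dataclass
class FakeRepertoire:
    sequences: list

def filter_by_chain(repertoires, keep_chain):
    # a repertoire survives only if every one of its sequences is on keep_chain
    return [rep for rep in repertoires
            if all(seq.metadata.chain == keep_chain for seq in rep.sequences)]

beta_only = FakeRepertoire([FakeSequence(FakeSequenceMetadata(Chain.BETA))])
mixed = FakeRepertoire([FakeSequence(FakeSequenceMetadata(Chain.BETA)),
                        FakeSequence(FakeSequenceMetadata(Chain.ALPHA))])
print(len(filter_by_chain([beta_only, mixed], Chain.BETA)))  # prints 1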