Beispiel #1
0
 def __init__(self,
              labels: dict = None,
              encoded_data: EncodedData = None,
              filenames: list = None,
              identifier: str = None,
              file_size: int = 50000,
              name: str = None):
     """Initialize the dataset: store labels/encoded data, pick an identifier, and set up the element generator over the sorted batch files."""
     super().__init__()
     self.labels = labels
     self.encoded_data = encoded_data
     # fall back to a fresh random hex id when the caller supplies none
     self.identifier = uuid4().hex if identifier is None else identifier
     # keep filenames sorted so iteration order is deterministic
     self._filenames = [] if filenames is None else sorted(filenames)
     self.element_generator = ElementGenerator(self._filenames, file_size)
     self.file_size = file_size
     self.element_ids = None  # computed lazily elsewhere
     self.name = name
Beispiel #2
0
    def test_build_batch_generator(self):
        """The batch generator over .npy record files must yield all 307 receptors in identifier order, and must be rebuildable for a second full pass."""
        path = EnvironmentSettings.tmp_test_path / "element_batch_generator/"
        PathBuilder.build(path)
        receptors = [BCReceptor(identifier=str(i), heavy=ReceptorSequence('A'), light=ReceptorSequence('C')) for i in range(307)]
        file_list = [path / f"batch{i}.npy" for i in range(4)]

        # store receptors in batches of 100 (the last file holds the remaining 7)
        for i in range(4):
            # np.rec.fromrecords is the public API; np.core.records is private and was removed in NumPy 2.0
            matrix = np.rec.fromrecords([r.get_record() for r in receptors[i * 100: (i + 1) * 100]], names=BCReceptor.get_record_names())
            np.save(str(file_list[i]), matrix, allow_pickle=False)

        receptor_generator = ElementGenerator(file_list, element_class_name=BCReceptor.__name__)

        def check_all_yielded():
            # consume a freshly built generator, asserting element order and type
            counter = 0
            for batch in receptor_generator.build_batch_generator():
                for receptor in batch:
                    self.assertEqual(counter, int(receptor.identifier))
                    self.assertTrue(isinstance(receptor, BCReceptor))
                    counter += 1
            self.assertEqual(307, counter)

        check_all_yielded()
        # second pass: rebuilding the generator must re-read the files from the start
        check_all_yielded()

        shutil.rmtree(path)
    def test_build_batch_generator(self):
        """Round-trip 307 receptors through four pickled batch files and verify the batch generator yields every one, in identifier order, on two consecutive builds."""
        path = EnvironmentSettings.tmp_test_path / "element_batch_generator/"
        PathBuilder.build(path)
        receptors = [BCReceptor(identifier=str(i)) for i in range(307)]
        file_list = [path / f"batch{i}.pkl" for i in range(4)]

        # write the receptors out in slices of 100 (last file gets the remaining 7)
        for index, batch_file in enumerate(file_list):
            with batch_file.open("wb") as file:
                pickle.dump(receptors[index * 100:(index + 1) * 100], file)

        receptor_generator = ElementGenerator(file_list)

        # the generator must be consumable twice: build, drain, build again, drain again
        for _ in range(2):
            expected_id = 0
            for batch in receptor_generator.build_batch_generator():
                for receptor in batch:
                    self.assertEqual(expected_id, int(receptor.identifier))
                    self.assertTrue(isinstance(receptor, BCReceptor))
                    expected_id += 1
            self.assertEqual(307, expected_id)

        shutil.rmtree(path)
Beispiel #4
0
class ElementDataset(Dataset):
    """
    Base class for ReceptorDataset and SequenceDataset: all shared functionality lives here. The two
    subclasses differ only in whether paired (receptor) or single-chain (sequence) data is stored.
    """

    def __init__(self,
                 labels: dict = None,
                 encoded_data: EncodedData = None,
                 filenames: list = None,
                 identifier: str = None,
                 file_size: int = 50000,
                 name: str = None):
        """Store labels/encoded data, pick an identifier, and set up the element generator over the sorted batch files."""
        super().__init__()
        self.labels = labels
        self.encoded_data = encoded_data
        # fall back to a fresh random hex id when the caller supplies none
        self.identifier = uuid4().hex if identifier is None else identifier
        # keep filenames sorted so iteration order is deterministic
        self._filenames = [] if filenames is None else sorted(filenames)
        self.element_generator = ElementGenerator(self._filenames, file_size)
        self.file_size = file_size
        self.element_ids = None  # filled lazily by get_example_ids()
        self.name = name

    def get_data(self, batch_size: int = 10000):
        """Return a generator over individual elements; batch files are always visited in sorted order."""
        # NOTE(review): batch_size is accepted but never forwarded — presumably file_size from the
        # constructor controls batching; confirm against ElementGenerator.
        self._filenames.sort()
        self.element_generator.file_list = self._filenames
        return self.element_generator.build_element_generator()

    def get_batch(self, batch_size: int = 10000):
        """Return a generator over batches of elements; batch files are always visited in sorted order."""
        # NOTE(review): batch_size is accepted but never forwarded — see get_data().
        self._filenames.sort()
        self.element_generator.file_list = self._filenames
        return self.element_generator.build_batch_generator()

    def get_filenames(self):
        return self._filenames

    def set_filenames(self, filenames):
        self._filenames = filenames

    def get_example_count(self):
        """Number of elements in the dataset (requires a full pass on first call)."""
        return len(self.get_example_ids())

    def get_example_ids(self):
        """Return the identifiers of all elements, caching the result after the first full pass."""
        cached = self.element_ids
        # recompute when never computed (None) or when a previous attempt left an empty list
        if cached is not None and not (isinstance(cached, list) and len(cached) == 0):
            return cached
        self.element_ids = [element.identifier for element in self.get_data()]
        return self.element_ids

    def make_subset(self, example_indices, path, dataset_type: str):
        """
        Create a new dataset of the same class containing only the elements selected by index.

        Args:
            example_indices (list): indices of the examples (receptors or receptor sequences) to keep
            path (Path): where to store the newly created dataset
            dataset_type (str): dataset-type constant (see :py:obj:`~immuneML.data_model.dataset.Dataset.Dataset`); also used in the new dataset's name

        Returns:

            a new ReceptorDataset or SequenceDataset (same class as self) restricted to example_indices

        """
        new_dataset = self.__class__(labels=self.labels, file_size=self.file_size)
        # the generator writes the subset's batch files under the new dataset's identifier
        batch_filenames = self.element_generator.make_subset(example_indices, path, dataset_type,
                                                             new_dataset.identifier)
        new_dataset.set_filenames(batch_filenames)
        new_dataset.name = f"{self.name}_split_{dataset_type.lower()}"
        return new_dataset

    def get_label_names(self):
        """Returns the list of metadata fields which can be used as labels"""
        excluded = ['region_type', 'receptor_chains', 'organism']
        return [label for label in self.labels if label not in excluded]

    def clone(self):
        # subclasses are responsible for providing a concrete clone implementation
        raise NotImplementedError