    def _encode_repertoires(self, dataset: RepertoireDataset, params: EncoderParams):
        # Rows = repertoires, Columns = regex matches (one chain per column)
        encoded_repertoires = np.zeros((dataset.get_example_count(),
                                        self.feature_count),
                                       dtype=int)
        labels = {label: [] for label in params.label_config.get_labels_by_name()} if params.encode_labels else None

        n_repertoires = dataset.get_example_count()

        for i, repertoire in enumerate(dataset.get_data()):
            print(f"{datetime.datetime.now()}: Encoding repertoire {i+1}/{n_repertoires}")
            encoded_repertoires[i] = self._match_repertoire_to_regexes(repertoire)

            if labels is not None:
                for label_name in params.label_config.get_labels_by_name():
                    labels[label_name].append(repertoire.metadata[label_name])

        return encoded_repertoires, labels
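
The method above produces one row per repertoire and one column per regex feature. Below is a minimal, self-contained sketch of that encoding pattern with plain numpy; count_matches and the toy sequences are illustrative stand-ins, not immuneML API:

import re
import numpy as np

def count_matches(sequences, patterns):
    # Hypothetical matcher: how many sequences contain each pattern
    return [sum(bool(re.search(p, s)) for s in sequences) for p in patterns]

repertoires = [["CASSLGFF", "CASSIRSSYEQYF"], ["CASRPGQGF"]]
patterns = ["CASS", "GQG"]

encoded = np.zeros((len(repertoires), len(patterns)), dtype=int)
for i, sequences in enumerate(repertoires):
    encoded[i] = count_matches(sequences, patterns)

print(encoded)  # [[2 0], [0 1]] -> shape (n_repertoires, n_patterns)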
Example #2
    def process_dataset(self, dataset: RepertoireDataset):
        extract_fn = self.build_matching_fn()
        repertoire_count = dataset.get_example_count()
        for index, repertoire in enumerate(dataset.get_data()):
            self.process_repertoire(repertoire, str(repertoire.identifier), extract_fn)
            logging.info("Repertoire {} ({}/{}) processed.".format(
                repertoire.identifier, index + 1, repertoire_count))
            logging.info(f"Currently, there are {self.item_count} items in the comparison data matrix.")
        self.merge_tmp_batches_to_matrix()

Example #3
    def _encode_examples(self, dataset: RepertoireDataset,
                         params: EncoderParams) -> Tuple[list, set, dict]:

        keys = set()
        example_count = dataset.get_example_count()

        arguments = [(repertoire, index, example_count)
                     for index, repertoire in enumerate(dataset.repertoires)]

        with Pool(params.pool_size) as pool:
            # spread the repertoires roughly evenly across the worker pool
            chunksize = math.floor(example_count / params.pool_size) + 1
            examples = pool.starmap(self._process_repertoire_cached,
                                    arguments,
                                    chunksize=chunksize)

        for example in examples:
            keys.update(example.keys())

        labels = dataset.get_metadata(params.label_config.get_labels_by_name()
                                      ) if params.encode_labels else None

        return examples, keys, labels
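
This example distributes per-repertoire encoding across a multiprocessing.Pool via starmap, with a chunksize derived from the pool size. A minimal sketch of the same pattern, where encode_one is a stand-in for _process_repertoire_cached:

import math
from multiprocessing import Pool

def encode_one(item, index, total):
    # Stand-in worker: returns a dict of features for one item
    return {f"feature_{item}": index / total}

if __name__ == "__main__":
    items = list(range(10))
    arguments = [(item, i, len(items)) for i, item in enumerate(items)]
    pool_size = 4
    with Pool(pool_size) as pool:
        # same chunksize heuristic as above
        chunksize = math.floor(len(items) / pool_size) + 1
        examples = pool.starmap(encode_one, arguments, chunksize=chunksize)
    keys = set()
    for example in examples:
        keys.update(example.keys())
    print(sorted(keys))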
Example #4
    def _encode_repertoires(self, dataset: RepertoireDataset, params: EncoderParams):
        # Rows = repertoires, Columns = reference chains (two per receptor, one per chain)
        encoded_repertoires = np.zeros((dataset.get_example_count(),
                                        len(self.reference_receptors) * 2),
                                       dtype=int)
        labels = {label: [] for label in params.label_config.get_labels_by_name()} if params.encode_labels else None

        for i, repertoire in enumerate(dataset.get_data()):
            encoded_repertoires[i] = self._match_repertoire_to_receptors(repertoire)

            if labels is not None:
                for label_name in params.label_config.get_labels_by_name():
                    labels[label_name].append(repertoire.metadata[label_name])

        return encoded_repertoires, labels, dataset.get_repertoire_ids()
Example #5
    def _encode_repertoires(self, dataset: RepertoireDataset, params):
        # Rows = repertoires, Columns = reference sequences
        encoded_repertoires = np.zeros(
            (dataset.get_example_count(), len(self.reference_sequences)),
            dtype=int)

        labels = {label: [] for label in params.label_config.get_labels_by_name()} if params.encode_labels else None

        for i, repertoire in enumerate(dataset.get_data()):
            encoded_repertoires[i] = self._match_repertoire_to_reference(repertoire)

            # labels is None when encode_labels is False; guard before appending
            if labels is not None:
                for label_name in params.label_config.get_labels_by_name():
                    labels[label_name].append(repertoire.metadata[label_name])

        return encoded_repertoires, labels
Example #6
    def compare_repertoires(self, dataset: RepertoireDataset, comparison_fn):
        self.comparison_data = self.memo_by_params(dataset)
        repertoire_count = dataset.get_example_count()
        comparison_result = np.zeros([repertoire_count, repertoire_count])
        repertoire_identifiers = dataset.get_repertoire_ids()

        for index1 in range(repertoire_count):
            repertoire_vector_1 = self.comparison_data.get_repertoire_vector(
                repertoire_identifiers[index1])
            for index2 in range(index1, repertoire_count):  # upper triangle only
                repertoire_vector_2 = self.comparison_data.get_repertoire_vector(
                    repertoire_identifiers[index2])
                comparison_result[index1, index2] = comparison_fn(
                    repertoire_vector_1, repertoire_vector_2)
                # mirror across the diagonal: comparison_fn is assumed symmetric
                comparison_result[index2, index1] = comparison_result[index1, index2]

        comparison_df = pd.DataFrame(comparison_result,
                                     columns=repertoire_identifiers,
                                     index=repertoire_identifiers)

        return comparison_df
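
Because comparison_fn is assumed symmetric, the loop above fills only the upper triangle and mirrors it across the diagonal. The same pattern in a self-contained sketch, with cosine similarity as an arbitrary stand-in for comparison_fn:

import numpy as np
import pandas as pd

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

ids = ["r1", "r2", "r3"]
vectors = {"r1": np.array([1., 0.]), "r2": np.array([1., 1.]), "r3": np.array([0., 1.])}

n = len(ids)
result = np.zeros((n, n))
for i in range(n):
    for j in range(i, n):  # upper triangle only
        result[i, j] = cosine(vectors[ids[i]], vectors[ids[j]])
        result[j, i] = result[i, j]  # mirror: cosine is symmetric

print(pd.DataFrame(result, columns=ids, index=ids))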
Example #7
    def test_run(self):
        dataset = RepertoireDataset(repertoires=[
            Repertoire(Path("0.npy"), None, "0"),
            Repertoire(Path("0.npy"), None, "8"),
            Repertoire(Path("0.npy"), None, "1"),
            Repertoire(Path("0.npy"), None, "9"),
            Repertoire(Path("0.npy"), None, "2"),
            Repertoire(Path("0.npy"), None, "10"),
            Repertoire(Path("0.npy"), None, "3"),
            Repertoire(Path("0.npy"), None, "11"),
            Repertoire(Path("0.npy"), None, "4"),
            Repertoire(Path("0.npy"), None, "12"),
            Repertoire(Path("0.npy"), None, "5"),
            Repertoire(Path("0.npy"), None, "13"),
            Repertoire(Path("0.npy"), None, "6"),
            Repertoire(Path("0.npy"), None, "14"),
            Repertoire(Path("0.npy"), None, "7")
        ])

        paths = [
            EnvironmentSettings.root_path /
            "test/tmp/datasplitter/split_{}".format(i) for i in range(5)
        ]
        for path in paths:
            PathBuilder.build(path)

        df = pd.DataFrame(
            data={
                "key1": [0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1],
                "filename": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
            })
        df.to_csv(EnvironmentSettings.root_path /
                  "test/tmp/datasplitter/metadata.csv")

        dataset.metadata_file = EnvironmentSettings.root_path / "test/tmp/datasplitter/metadata.csv"

        training_percentage = 0.7

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               training_percentage=training_percentage,
                               split_strategy=SplitType.RANDOM,
                               split_count=5,
                               paths=paths))

        self.assertTrue(isinstance(trains[0], RepertoireDataset))
        self.assertTrue(isinstance(tests[0], RepertoireDataset))
        self.assertEqual(10, len(trains[0].get_data()))
        self.assertEqual(5, len(tests[0].get_data()))
        self.assertEqual(5, len(trains))
        self.assertEqual(5, len(tests))
        self.assertEqual(10, len(trains[0].repertoires))

        trains2, tests2 = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               training_percentage=training_percentage,
                               split_strategy=SplitType.RANDOM,
                               split_count=5,
                               paths=paths))

        self.assertEqual(trains[0].get_repertoire_ids(),
                         trains2[0].get_repertoire_ids())

        paths = [
            EnvironmentSettings.root_path /
            "test/tmp/datasplitter/split_{}".format(i)
            for i in range(dataset.get_example_count())
        ]
        for path in paths:
            PathBuilder.build(path)

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               split_strategy=SplitType.LOOCV,
                               split_count=-1,
                               training_percentage=-1,
                               paths=paths))

        self.assertTrue(isinstance(trains[0], RepertoireDataset))
        self.assertTrue(isinstance(tests[0], RepertoireDataset))
        self.assertEqual(14, len(trains[0].get_data()))
        self.assertEqual(1, len(tests[0].get_data()))
        self.assertEqual(15, len(trains))
        self.assertEqual(15, len(tests))

        paths = [
            EnvironmentSettings.root_path /
            "test/tmp/datasplitter/split_{}".format(i) for i in range(5)
        ]
        for path in paths:
            PathBuilder.build(path)

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               split_strategy=SplitType.K_FOLD,
                               split_count=5,
                               training_percentage=-1,
                               paths=paths))

        self.assertTrue(isinstance(trains[0], RepertoireDataset))
        self.assertTrue(isinstance(tests[0], RepertoireDataset))
        self.assertEqual(len(trains[0].get_data()), 12)
        self.assertEqual(len(tests[0].get_data()), 3)
        self.assertEqual(5, len(trains))
        self.assertEqual(5, len(tests))

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               split_strategy=SplitType.STRATIFIED_K_FOLD,
                               split_count=3,
                               training_percentage=-1,
                               paths=paths,
                               label_config=LabelConfiguration(
                                   [Label("key1", [0, 1, 2])])))

        self.assertEqual(len(trains[0].get_data()), 10)
        self.assertEqual(len(tests[0].get_data()), 5)
        self.assertEqual(3, len(trains))
        self.assertEqual(3, len(tests))
        for train in trains:
            self.assertTrue(
                all(cls in train.get_metadata(["key1"])["key1"]
                    for cls in [0, 1, 2]))
        for test in tests:
            self.assertTrue(
                all(cls in test.get_metadata(["key1"])["key1"]
                    for cls in [0, 1, 2]))

        shutil.rmtree(EnvironmentSettings.root_path / "test/tmp/datasplitter/")
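
The assertion counts above follow from the split arithmetic: 15 repertoires at training_percentage=0.7 give 10/5 splits, LOOCV gives 15 folds of 14/1, and 5-fold CV gives 12/3. A sketch of the same arithmetic with scikit-learn, as an analogy rather than the project's DataSplitter:

from sklearn.model_selection import KFold, train_test_split

ids = [str(i) for i in range(15)]

train, test = train_test_split(ids, train_size=0.7, random_state=0)
print(len(train), len(test))  # 10 5

for train_idx, test_idx in KFold(n_splits=5).split(ids):
    print(len(train_idx), len(test_idx))  # 12 3 for each fold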
Example #8
    def test_run(self):
        dataset = RepertoireDataset(repertoires=[
            Repertoire("0.npy", None, "0"),
            Repertoire("0.npy", None, "1"),
            Repertoire("0.npy", None, "2"),
            Repertoire("0.npy", None, "3"),
            Repertoire("0.npy", None, "4"),
            Repertoire("0.npy", None, "5"),
            Repertoire("0.npy", None, "6"),
            Repertoire("0.npy", None, "7")
        ])

        paths = [
            EnvironmentSettings.root_path /
            "test/tmp/datasplitter/split_{}".format(i) for i in range(5)
        ]
        for path in paths:
            PathBuilder.build(path)

        df = pd.DataFrame(data={
            "key1": [0, 0, 1, 1, 1, 2, 2, 0],
            "filename": [0, 1, 2, 3, 4, 5, 6, 7]
        })
        df.to_csv(EnvironmentSettings.root_path /
                  "test/tmp/datasplitter/metadata.csv")

        dataset.metadata_file = EnvironmentSettings.root_path / "test/tmp/datasplitter/metadata.csv"

        training_percentage = 0.7

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               training_percentage=training_percentage,
                               split_strategy=SplitType.RANDOM,
                               split_count=5,
                               paths=paths))

        self.assertTrue(isinstance(trains[0], RepertoireDataset))
        self.assertTrue(isinstance(tests[0], RepertoireDataset))
        self.assertEqual(len(trains[0].get_data()), 5)
        self.assertEqual(len(tests[0].get_data()), 3)
        self.assertEqual(5, len(trains))
        self.assertEqual(5, len(tests))
        self.assertEqual(5, len(trains[0].repertoires))

        trains2, tests2 = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               training_percentage=training_percentage,
                               split_strategy=SplitType.RANDOM,
                               split_count=5,
                               paths=paths))

        self.assertEqual(trains[0].get_repertoire_ids(),
                         trains2[0].get_repertoire_ids())

        paths = [
            EnvironmentSettings.root_path /
            "test/tmp/datasplitter/split_{}".format(i)
            for i in range(dataset.get_example_count())
        ]
        for path in paths:
            PathBuilder.build(path)

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               split_strategy=SplitType.LOOCV,
                               split_count=-1,
                               training_percentage=-1,
                               paths=paths))

        self.assertTrue(isinstance(trains[0], RepertoireDataset))
        self.assertTrue(isinstance(tests[0], RepertoireDataset))
        self.assertEqual(len(trains[0].get_data()), 7)
        self.assertEqual(len(tests[0].get_data()), 1)
        self.assertEqual(8, len(trains))
        self.assertEqual(8, len(tests))

        paths = [
            EnvironmentSettings.root_path /
            "test/tmp/datasplitter/split_{}".format(i) for i in range(5)
        ]
        for path in paths:
            PathBuilder.build(path)

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               split_strategy=SplitType.K_FOLD,
                               split_count=5,
                               training_percentage=-1,
                               paths=paths))

        self.assertTrue(isinstance(trains[0], RepertoireDataset))
        self.assertTrue(isinstance(tests[0], RepertoireDataset))
        self.assertEqual(len(trains[0].get_data()), 6)
        self.assertEqual(len(tests[0].get_data()), 2)
        self.assertEqual(5, len(trains))
        self.assertEqual(5, len(tests))

        shutil.rmtree(EnvironmentSettings.root_path / "test/tmp/datasplitter/")