コード例 #1
0
    def _encode_examples(self, dataset: RepertoireDataset, params: EncoderParams) -> Tuple[list, set, dict]:
        """Encode every repertoire of the dataset in parallel.

        Each repertoire is processed by `self._process_repertoire_cached` in a
        worker pool; the union of all per-repertoire feature keys is collected
        so callers can build a consistent feature space.

        Args:
            dataset: dataset whose `repertoires` are encoded.
            params: carries `pool_size`, `encode_labels` and `label_config`.

        Returns:
            Tuple of (per-repertoire encodings, set of all feature keys,
            label metadata dict or None when `params.encode_labels` is False).
        """
        keys = set()
        example_count = dataset.get_example_count()

        arguments = [(repertoire, index, example_count) for index, repertoire in enumerate(dataset.repertoires)]

        with Pool(params.pool_size) as pool:
            # reuse the already-fetched example_count instead of querying the dataset again
            chunksize = math.floor(example_count / params.pool_size) + 1
            examples = pool.starmap(self._process_repertoire_cached, arguments, chunksize=chunksize)

        for example in examples:
            # set.update accepts the keys view directly; no intermediate list needed
            keys.update(example.keys())

        labels = dataset.get_metadata(params.label_config.get_labels_by_name()) if params.encode_labels else None

        return examples, keys, labels
コード例 #2
0
 def process_dataset(self, dataset: RepertoireDataset):
     """Process every repertoire of the dataset and merge the results.

     Builds a matching function once, runs it over each repertoire, and
     finally merges the temporary batches into the comparison data matrix.

     Args:
         dataset: dataset whose repertoires are fed to `process_repertoire`.
     """
     extract_fn = self.build_matching_fn()
     repertoire_count = dataset.get_example_count()
     for index, repertoire in enumerate(dataset.get_data()):
         self.process_repertoire(repertoire, str(repertoire.identifier),
                                 extract_fn)
         # lazy %-style args: the message is only formatted if INFO is enabled
         logging.info("Repertoire %s (%s/%s) processed.",
                      repertoire.identifier, index + 1, repertoire_count)
         logging.info("Currently, there are %s items in the comparison data matrix.",
                      self.item_count)
     self.merge_tmp_batches_to_matrix()
コード例 #3
0
    def _encode_repertoires(self, dataset: RepertoireDataset,
                            params: EncoderParams):
        """Encode each repertoire against the configured regexes.

        Args:
            dataset: dataset whose repertoires are matched.
            params: carries `encode_labels` and `label_config`.

        Returns:
            Tuple of (int matrix with one row per repertoire and one column
            per regex match, labels dict or None when labels are not encoded).
        """
        # fetch once and reuse: previously queried twice (matrix shape + progress log)
        n_repertoires = dataset.get_example_count()

        # Rows = repertoires, Columns = regex matches (one chain per column)
        encoded_repertoires = np.zeros((n_repertoires, self.feature_count),
                                       dtype=int)

        # hoist the label-name lookup out of the per-repertoire loop
        label_names = params.label_config.get_labels_by_name()
        labels = {label: [] for label in label_names} if params.encode_labels else None

        for i, repertoire in enumerate(dataset.get_data()):
            print(
                f"{datetime.datetime.now()}: Encoding repertoire {i+1}/{n_repertoires}"
            )
            encoded_repertoires[i] = self._match_repertoire_to_regexes(
                repertoire)

            if labels is not None:
                for label in label_names:
                    labels[label].append(repertoire.metadata[label])

        return encoded_repertoires, labels
コード例 #4
0
    def _encode_repertoires(self, dataset: RepertoireDataset, params):
        """Encode each repertoire against the reference sequences.

        Args:
            dataset: dataset whose repertoires are matched.
            params: carries `encode_labels` and `label_config`.

        Returns:
            Tuple of (int matrix with one row per repertoire and one column
            per reference sequence, labels dict or None when labels are not
            encoded).
        """
        # Rows = repertoires, Columns = reference sequences
        # (also fixes the 'repertories' typo of the original local name)
        encoded_repertoires = np.zeros((dataset.get_example_count(),
                                        len(self.reference_sequences)),
                                       dtype=int)

        label_names = params.label_config.get_labels_by_name()
        labels = {label: [] for label in label_names} if params.encode_labels else None

        for i, repertoire in enumerate(dataset.get_data()):
            encoded_repertoires[i] = self._match_repertoire_to_reference(repertoire)

            # BUG FIX: the original appended unconditionally, raising
            # TypeError when params.encode_labels was False (labels is None).
            if labels is not None:
                for label in label_names:
                    labels[label].append(repertoire.metadata[label])

        return encoded_repertoires, labels
コード例 #5
0
    def compare_repertoires(self, dataset: RepertoireDataset, comparison_fn):
        """Compute a symmetric pairwise comparison matrix over repertoires.

        Stores the memoized comparison data on `self`, applies
        `comparison_fn` to every unordered pair of repertoire vectors, and
        mirrors each value across the diagonal.

        Args:
            dataset: dataset providing repertoire ids and count.
            comparison_fn: callable taking two repertoire vectors and
                returning a scalar.

        Returns:
            pd.DataFrame indexed and columned by repertoire identifiers.
        """
        self.comparison_data = self.memo_by_params(dataset)
        count = dataset.get_example_count()
        identifiers = dataset.get_repertoire_ids()
        matrix = np.zeros([count, count])

        # bind once: avoids repeated attribute lookups in the inner loop
        get_vector = self.comparison_data.get_repertoire_vector

        for row in range(count):
            row_vector = get_vector(identifiers[row])
            for col in range(row, count):
                col_vector = get_vector(identifiers[col])
                value = comparison_fn(row_vector, col_vector)
                # fill both halves — the matrix is symmetric by construction
                matrix[row, col] = value
                matrix[col, row] = value

        return pd.DataFrame(matrix,
                            columns=identifiers,
                            index=identifiers)
コード例 #6
0
ファイル: test_dataSplitter.py プロジェクト: rofrank/immuneML
    def test_run(self):
        """Exercise DataSplitter.run with RANDOM, LOOCV and K_FOLD strategies."""

        def make_split_dirs(count):
            # one working directory per split, created on disk
            split_paths = [
                EnvironmentSettings.root_path +
                "test/tmp/datasplitter/split_{}".format(i)
                for i in range(count)
            ]
            for split_path in split_paths:
                PathBuilder.build(split_path)
            return split_paths

        dataset = RepertoireDataset(
            repertoires=[Repertoire("0.npy", "", str(i)) for i in range(8)])

        paths = make_split_dirs(5)

        metadata_path = EnvironmentSettings.root_path + "test/tmp/datasplitter/metadata.csv"
        pd.DataFrame(data={
            "key1": [0, 0, 1, 1, 1, 2, 2, 0],
            "filename": list(range(8))
        }).to_csv(metadata_path)
        dataset.metadata_file = metadata_path

        training_percentage = 0.7

        # RANDOM split: 70/30 over 8 repertoires, 5 independent splits
        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               training_percentage=training_percentage,
                               split_strategy=SplitType.RANDOM,
                               split_count=5,
                               paths=paths))

        self.assertTrue(isinstance(trains[0], RepertoireDataset))
        self.assertTrue(isinstance(tests[0], RepertoireDataset))
        self.assertEqual(len(trains[0].get_data()), 5)
        self.assertEqual(len(tests[0].get_data()), 3)
        self.assertEqual(5, len(trains))
        self.assertEqual(5, len(tests))
        self.assertEqual(5, len(trains[0].repertoires))

        # a second RANDOM run must reproduce the same split (deterministic)
        trains2, tests2 = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               training_percentage=training_percentage,
                               split_strategy=SplitType.RANDOM,
                               split_count=5,
                               paths=paths))

        self.assertEqual(trains[0].get_repertoire_ids(),
                         trains2[0].get_repertoire_ids())

        # LOOCV: one split per repertoire, single-example test sets
        paths = make_split_dirs(dataset.get_example_count())

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               split_strategy=SplitType.LOOCV,
                               split_count=-1,
                               training_percentage=-1,
                               paths=paths))

        self.assertTrue(isinstance(trains[0], RepertoireDataset))
        self.assertTrue(isinstance(tests[0], RepertoireDataset))
        self.assertEqual(len(trains[0].get_data()), 7)
        self.assertEqual(len(tests[0].get_data()), 1)
        self.assertEqual(8, len(trains))
        self.assertEqual(8, len(tests))

        # K_FOLD: 5 folds over 8 repertoires -> 6 train / 2 test per fold
        paths = make_split_dirs(5)

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               split_strategy=SplitType.K_FOLD,
                               split_count=5,
                               training_percentage=-1,
                               paths=paths))

        self.assertTrue(isinstance(trains[0], RepertoireDataset))
        self.assertTrue(isinstance(tests[0], RepertoireDataset))
        self.assertEqual(len(trains[0].get_data()), 6)
        self.assertEqual(len(tests[0].get_data()), 2)
        self.assertEqual(5, len(trains))
        self.assertEqual(5, len(tests))

        shutil.rmtree(EnvironmentSettings.root_path + "test/tmp/datasplitter/")