    # Method from a unittest.TestCase; imports (EnvironmentSettings, PathBuilder,
    # RepertoireDataset, RepertoireBuilder, MetadataRepertoireFilter, OperationType,
    # DataType, pandas as pd, shutil) are omitted in this snippet.
    def test_process(self):
        path = EnvironmentSettings.root_path + "test/tmp/metadata_filter/"
        PathBuilder.build(path)
        dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
            [["ACF", "ACF", "ACF"], ["ACF", "ACF"],
             ["ACF", "ACF", "ACF", "ACF"]], path)[0])

        df = pd.DataFrame(data={"key1": [0, 1, 2], "key2": [0, 1, 2]})
        df.to_csv(path + "metadata.csv")

        dataset.metadata_file = path + "metadata.csv"

        dataset1 = MetadataRepertoireFilter.process(
            dataset, {
                "criteria": {
                    "type": OperationType.GREATER_THAN,
                    "value": {
                        "type": DataType.COLUMN,
                        "name": "key2"
                    },
                    "threshold": 1
                },
                "result_path": path
            })

        self.assertEqual(1, dataset1.get_example_count())

        self.assertRaises(
            AssertionError, MetadataRepertoireFilter.process, dataset, {
                "criteria": {
                    "type": OperationType.GREATER_THAN,
                    "value": {
                        "type": DataType.COLUMN,
                        "name": "key2"
                    },
                    "threshold": 10
                },
                "result_path": path
            })

        shutil.rmtree(path)
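
For reference, the GREATER_THAN criterion used above keeps only the repertoires whose metadata column value exceeds the threshold. Below is a minimal pandas-only sketch of that selection logic; it is an illustration, not the MetadataRepertoireFilter implementation.

# Minimal sketch (plain pandas, not the immuneML MetadataRepertoireFilter API)
# of the GREATER_THAN criterion above: keep rows where "key2" exceeds the threshold 1.
import pandas as pd

metadata = pd.DataFrame({"key1": [0, 1, 2], "key2": [0, 1, 2]})
kept = metadata[metadata["key2"] > 1]
assert len(kept) == 1  # consistent with dataset1.get_example_count() == 1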
Example #2
    # Exercises three split strategies: RANDOM (70/30, 5 repetitions), LOOCV, and 5-fold K_FOLD.
    # Imports are again omitted in this snippet.
    def test_run(self):
        dataset = RepertoireDataset(repertoires=[
            Repertoire("0.npy", "", "0"),
            Repertoire("0.npy", "", "1"),
            Repertoire("0.npy", "", "2"),
            Repertoire("0.npy", "", "3"),
            Repertoire("0.npy", "", "4"),
            Repertoire("0.npy", "", "5"),
            Repertoire("0.npy", "", "6"),
            Repertoire("0.npy", "", "7")
        ])

        paths = [
            EnvironmentSettings.root_path +
            "test/tmp/datasplitter/split_{}".format(i) for i in range(5)
        ]
        for path in paths:
            PathBuilder.build(path)

        df = pd.DataFrame(data={
            "key1": [0, 0, 1, 1, 1, 2, 2, 0],
            "filename": [0, 1, 2, 3, 4, 5, 6, 7]
        })
        df.to_csv(EnvironmentSettings.root_path +
                  "test/tmp/datasplitter/metadata.csv")

        dataset.metadata_file = EnvironmentSettings.root_path + "test/tmp/datasplitter/metadata.csv"

        training_percentage = 0.7

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               training_percentage=training_percentage,
                               split_strategy=SplitType.RANDOM,
                               split_count=5,
                               paths=paths))

        self.assertTrue(isinstance(trains[0], RepertoireDataset))
        self.assertTrue(isinstance(tests[0], RepertoireDataset))
        self.assertEqual(len(trains[0].get_data()), 5)
        self.assertEqual(len(tests[0].get_data()), 3)
        self.assertEqual(5, len(trains))
        self.assertEqual(5, len(tests))
        self.assertEqual(5, len(trains[0].repertoires))

        trains2, tests2 = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               training_percentage=training_percentage,
                               split_strategy=SplitType.RANDOM,
                               split_count=5,
                               paths=paths))

        self.assertEqual(trains[0].get_repertoire_ids(),
                         trains2[0].get_repertoire_ids())

        paths = [
            EnvironmentSettings.root_path +
            "test/tmp/datasplitter/split_{}".format(i)
            for i in range(dataset.get_example_count())
        ]
        for path in paths:
            PathBuilder.build(path)

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               split_strategy=SplitType.LOOCV,
                               split_count=-1,
                               training_percentage=-1,
                               paths=paths))

        self.assertTrue(isinstance(trains[0], RepertoireDataset))
        self.assertTrue(isinstance(tests[0], RepertoireDataset))
        self.assertEqual(len(trains[0].get_data()), 7)
        self.assertEqual(len(tests[0].get_data()), 1)
        self.assertEqual(8, len(trains))
        self.assertEqual(8, len(tests))

        paths = [
            EnvironmentSettings.root_path +
            "test/tmp/datasplitter/split_{}".format(i) for i in range(5)
        ]
        for path in paths:
            PathBuilder.build(path)

        trains, tests = DataSplitter.run(
            DataSplitterParams(dataset=dataset,
                               split_strategy=SplitType.K_FOLD,
                               split_count=5,
                               training_percentage=-1,
                               paths=paths))

        self.assertTrue(isinstance(trains[0], RepertoireDataset))
        self.assertTrue(isinstance(tests[0], RepertoireDataset))
        self.assertEqual(len(trains[0].get_data()), 6)
        self.assertEqual(len(tests[0].get_data()), 2)
        self.assertEqual(5, len(trains))
        self.assertEqual(5, len(tests))

        shutil.rmtree(EnvironmentSettings.root_path + "test/tmp/datasplitter/")
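
For context, the three strategies exercised above (RANDOM with training_percentage=0.7, LOOCV, and 5-fold K_FOLD) mirror standard resampling schemes. The sketch below reproduces the split sizes asserted in test_run using scikit-learn; this is an illustrative assumption, independent of how the immuneML DataSplitter is implemented.

# Minimal sketch (assumption: scikit-learn, not the immuneML DataSplitter API)
# reproducing the split sizes asserted in test_run above.
from sklearn.model_selection import KFold, LeaveOneOut, ShuffleSplit

indices = list(range(8))  # stand-ins for the eight repertoires

# RANDOM: 70% train, repeated 5 times -> 5 training / 3 test examples per split
for train, test in ShuffleSplit(n_splits=5, train_size=0.7, random_state=0).split(indices):
    assert len(train) == 5 and len(test) == 3

# LOOCV: one repertoire held out per split -> 8 splits of 7 training / 1 test
assert sum(1 for _ in LeaveOneOut().split(indices)) == 8

# K_FOLD with k=5 over 8 examples -> test folds of size 2 or 1
for train, test in KFold(n_splits=5).split(indices):
    assert len(test) in (1, 2)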