Code example #1
0
    def runTest(self):
        """Run the many-iterators experiment and check the resulting
        model/run table structure: 6 models (corpus sizes 4 and 8 crossed
        with max_iterations 2/4/6) and 18 runs (3 prior_adjust values each).
        """

        experimentDir = os.path.join(self.testContext["TMPDIR"], "many_iterators")
        os.makedirs(experimentDir)

        corpusDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                 "ne", "resources", "data", "json")
        xmlFile = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                               "ne", "test", "exp", "exp_many_iterators.xml")
        from MAT.CarafeTrain import ExperimentEngine, fromXML
        engine = ExperimentEngine(
            **fromXML(xmlFile, corpusPrefix=corpusDir, dir=experimentDir))
        engine.run()
        # Now, we examine the result. 6 models, 18 runs. There's no particular
        # order for the models - corpus iterations may or may not come before
        # build iterations. Actually, if I'm going to respect "innermost",
        # build iterations have to come last. So the order of the file sizes
        # with max_iterations from 2 to 6 by 2 and corpus from 4 to 8 by 4
        # should be: 4 4 4 8 8 8.
        modelInstances = engine.getModel("test").allInstances
        self.assertEqual(len(modelInstances), 6)
        self.assertEqual(
            [len(m.trainingSet.getFiles()) for m in modelInstances],
            [4, 4, 4, 8, 8, 8])
        fileSets = [set(m.trainingSet.getFiles()) for m in modelInstances]
        # Each 4-file corpus must be a proper subset of the corresponding
        # 8-file corpus (same max_iterations, offset by 3 in the ordering).
        for smallIdx in range(3):
            self.assertTrue(fileSets[smallIdx] < fileSets[smallIdx + 3])
        # The three small corpora are one and the same file set.
        self.assertTrue(fileSets[0] == fileSets[1] == fileSets[2])
        self.assertEqual(
            [m.engineSettings["max_iterations"] for m in modelInstances],
            [2, 4, 6, 2, 4, 6])
        # The runs have a similar structure.
        runInstances = engine.runTable["test"].allInstances
        self.assertEqual(len(runInstances), 18)
        self.assertEqual([r.engineOptions["prior_adjust"] for r in runInstances],
                         [-1.0, 0.0, 1.0] * 6)
Code example #2
0
    def runTest(self):
        """Verify that a restarted experiment restores the recorded state of
        not-yet-done iterations instead of re-drawing training sets.

        Strategy: build models with a corpus-size iterator (6, 7, 8 files),
        record the training set of the next-to-last iteration, mark the last
        two iterations not done, rerun in the same directory with a fresh
        engine, and check the rerun reproduced the recorded training set.
        The chances of the same elements being chosen in the same order for
        multiple independent runs are very, very slim, so a match means the
        on-disk state was actually reused.
        """

        expDir = os.path.join(self.testContext["TMPDIR"],
                              "restarted_size_iterators")
        os.makedirs(expDir)

        patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                  "ne", "resources", "data", "json")
        from MAT.CarafeTrain import ExperimentEngine, CorpusSizeIterator, PreparedCorpus, TrainingRun, _unmarkDone

        def _newEngine():
            # Both runs must be configured identically; previously this
            # construction was duplicated verbatim. Build the engine from
            # scratch each time so no in-memory state carries over between
            # the first run and the restart.
            return ExperimentEngine(
                dir=expDir,
                task=self.task,
                corpora=[
                    PreparedCorpus("test",
                                   partitions=[("train", 4), ("test", 1)],
                                   filePats=["*.json"],
                                   prefix=patternDir)
                ],
                models=[
                    TrainingRun(
                        "test",
                        trainingCorpora=[("test", "train")],
                        iterators=[CorpusSizeIterator(startVal=6, increment=1)])
                ])

        e = _newEngine()
        e.run()
        # Now, let's retrieve the training set files. They won't be
        # in order, because of the shuffling, so I'm going to need to
        # look specifically at the next-to-last iteration.
        allInstances = e.getModel("test").allInstances
        self.assertEqual(len(allInstances), 3)
        self.assertEqual([len(m.trainingSet.getFiles()) for m in allInstances],
                         [6, 7, 8])
        # Copy the list: the rerun below may mutate the underlying training
        # set in place. (The originals of iterations 6 and 8 were also
        # captured here, but never asserted against — dropped as dead code.)
        secondSet = allInstances[1].trainingSet.getFiles()[:]
        # Now, mark 7 and 8 as not done, so the restart has work to do.
        _unmarkDone(allInstances[1].modelDir)
        _unmarkDone(allInstances[2].modelDir)
        # Get a new experiment object, and rerun.
        e = _newEngine()
        e.run()
        allInstances = e.getModel("test").allInstances
        # The rerun of the 7-file iteration must have restored exactly the
        # training set recorded before the restart.
        self.assertEqual(set(secondSet),
                         set(allInstances[1].trainingSet.getFiles()))