    def testPrep(self):

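        # Run a remote experiment whose corpus gets preprocessed, and check
        # that the corpus afterward reports the preprocessed copies of the
        # documents rather than the originals under patternDir.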
        expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
        os.makedirs(expDir)
        fp = open(os.path.join(expDir, "exp.xml"), "w")
        fp.write(self.REMOTE_CORPUS_XML_WITH_PREP)
        fp.close()

        patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                  "ne", "resources", "data", "json")

        from MAT.CarafeTrain import ExperimentEngine, fromXML
        e1 = ExperimentEngine(**fromXML(os.path.join(expDir, "exp.xml"),
                                        corpusPrefix=patternDir,
                                        dir=expDir))
        e1.run()

        # Now the preprocessed output directory under expDir should be the
        # prefix for every document in the corpus.

        remoteCorpus = e1.corporaTable["test"]
        prepPath = os.path.join(e1.dir, "corpora", "test", "preprocessed",
                                "out")
        for f in remoteCorpus.getFiles():
            self.assertTrue(f.startswith(prepPath))

    def testPrepWithLocalSplit(self):

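        # Same preprocessing setup as testPrep, but a second experiment then
        # inherits the corpus and applies its own split; the files it sees
        # should still be the preprocessed ones.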
        expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
        os.makedirs(expDir)
        fp = open(os.path.join(expDir, "exp.xml"), "w")
        fp.write(self.REMOTE_CORPUS_XML_WITH_PREP)
        fp.close()

        patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                  "ne", "resources", "data", "json")

        from MAT.CarafeTrain import ExperimentEngine, fromXML
        e1 = ExperimentEngine(**fromXML(os.path.join(expDir, "exp.xml"),
                                        corpusPrefix=patternDir,
                                        dir=expDir))
        e1.run()

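        # Now, let's build a new directory for the local experiment.
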
        expLocalDir = os.path.join(self.testContext["TMPDIR"],
                                   "inherited_split_local")
        os.makedirs(expLocalDir)
        fp = open(os.path.join(expLocalDir, "exp.xml"), "w")
        fp.write(self.LOCAL_CORPUS_XML_WITH_SPLIT)
        fp.close()

        e2 = ExperimentEngine(
            **fromXML(os.path.join(expLocalDir, "exp.xml"), dir=expLocalDir))
        e2.run()

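        # The inherited corpus should still point at the files preprocessed
        # under the FIRST experiment's directory (e1.dir), not e2's.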
        localCorpus = e2.corporaTable["local_test"]
        prepPath = os.path.join(e1.dir, "corpora", "test", "preprocessed",
                                "out")
        for f in localCorpus.getFiles():
            self.assertTrue(f.startswith(prepPath))

    def testRemoteTruncate(self):

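        # The remote corpus is truncated; a local experiment that inherits it
        # and asks for a (larger) limit of its own should still see only the
        # truncated file set.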
        expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
        os.makedirs(expDir)
        fp = open(os.path.join(expDir, "exp.xml"), "w")
        fp.write(self.REMOTE_CORPUS_XML_WITH_TRUNCATION)
        fp.close()

        patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                  "ne", "resources", "data", "json")

        from MAT.CarafeTrain import ExperimentEngine, fromXML
        e1 = ExperimentEngine(**fromXML(os.path.join(expDir, "exp.xml"),
                                        corpusPrefix=patternDir,
                                        dir=expDir))
        e1.run()

        # Now, let's build a new directory.

        expLocalDir = os.path.join(self.testContext["TMPDIR"],
                                   "inherited_split_local")
        os.makedirs(expLocalDir)
        fp = open(os.path.join(expLocalDir, "exp.xml"), "w")
        fp.write(self.LOCAL_CORPUS_XML_WITH_BIG_LIMIT)
        fp.close()

        e2 = ExperimentEngine(
            **fromXML(os.path.join(expLocalDir, "exp.xml"), dir=expLocalDir))
        e2.run()

        localCorpus = e2.corporaTable["local_test"]
        # Only 5, even though the local corpus asked for ten, because the
        # remote corpus is already truncated.
        self.assertEqual(len(localCorpus.getFiles()), 5)

    def testInheritedSplitWithVar(self):

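        # The local corpus XML refers to the remote corpus via a variable
        # (LOCAL_CORPUS_XML_WITH_VAR); the inherited partitions should match
        # the remote ones exactly, file for file.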
        expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
        os.makedirs(expDir)
        fp = open(os.path.join(expDir, "exp.xml"), "w")
        fp.write(self.REMOTE_CORPUS_XML)
        fp.close()

        patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                  "ne", "resources", "data", "json")

        from MAT.CarafeTrain import ExperimentEngine, fromXML
        e1 = ExperimentEngine(**fromXML(os.path.join(expDir, "exp.xml"),
                                        corpusPrefix=patternDir,
                                        dir=expDir))
        e1.run()

        # Now, let's build a new directory.

        expLocalDir = os.path.join(self.testContext["TMPDIR"],
                                   "inherited_split_local")
        os.makedirs(expLocalDir)
        fp = open(os.path.join(expLocalDir, "exp.xml"), "w")
        fp.write(self.LOCAL_CORPUS_XML_WITH_VAR)
        fp.close()

        e2 = ExperimentEngine(
            **fromXML(os.path.join(expLocalDir, "exp.xml"), dir=expLocalDir))
        e2.run()

        remoteCorpus = e1.corporaTable["test"]
        localCorpus = e2.corporaTable["local_test"]
        for k in remoteCorpus.partitionDict.keys():
            self.assertEqual(set(remoteCorpus.getFiles(partition=k)),
                             set(localCorpus.getFiles(partition=k)))

    def runTest(self):

        # I'm going to run a simple experiment, constructed from objects.
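        # One prepared corpus split 4:1 into train/test, one model, one run;
        # afterward, both the aggregate and per-run score spreadsheets should
        # contain the MV_HEADERS columns.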
        expDir = os.path.join(self.testContext["TMPDIR"],
                              "sample_ne_exp_in_code")
        os.makedirs(expDir)
        patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                  "ne", "resources", "data", "json")
        from MAT.CarafeTrain import ExperimentEngine, PreparedCorpus, TrainingRun, TestRun
        e = ExperimentEngine(
            dir=expDir,
            task=self.task,
            corpora=[
                PreparedCorpus("test",
                               partitions=[("train", 4), ("test", 1)],
                               filePats=["*.json"],
                               prefix=patternDir)
            ],
            models=[TrainingRun("test", trainingCorpora=[("test", "train")])],
            runs=[
                TestRun("test",
                        model="test",
                        testCorpora=[("test", "test")],
                        engineOptions={
                            "steps": "zone,tokenize,tag",
                            "workflow": "Demo"
                        })
            ])
        e.run()

        # Now, check the scores to ensure that they have variance columns in them.
        import csv
        fp = open(os.path.join(expDir, "allbytoken_excel.csv"), "r")
        row = next(csv.reader(fp))
        fp.close()
        for header in MV_HEADERS:
            self.assertTrue(header in row)
        fp = open(
            os.path.join(expDir, "runs", "test", "test", "bytoken_excel.csv"),
            "r")
        row = next(csv.reader(fp))
        fp.close()
        for header in MV_HEADERS:
            self.assertTrue(header in row)

    def runTest(self):

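        # exp_many_iterators.xml exercises several iterators at once:
        # corpus size and max_iterations for the models, and prior_adjust
        # for the runs.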
        expDir = os.path.join(self.testContext["TMPDIR"], "many_iterators")
        os.makedirs(expDir)

        patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                  "ne", "resources", "data", "json")
        expFile = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                               "ne", "test", "exp", "exp_many_iterators.xml")
        from MAT.CarafeTrain import ExperimentEngine, fromXML
        e = ExperimentEngine(
            **fromXML(expFile, corpusPrefix=patternDir, dir=expDir))
        e.run()
        # Now, examine the result: 6 models, 18 runs. The models have no
        # inherent order - corpus iterations may or may not come before build
        # iterations - but since the build iterator is "innermost", its
        # iterations have to come last. So with max_iterations going from 2
        # to 6 by 2 and corpus size from 4 to 8 by 4, the training set sizes
        # should be: 4 4 4 8 8 8.
        allInstances = e.getModel("test").allInstances
        self.assertEqual(len(allInstances), 6)
        self.assertEqual([len(m.trainingSet.getFiles()) for m in allInstances],
                         [4, 4, 4, 8, 8, 8])
        self.assertTrue(
            set(allInstances[0].trainingSet.getFiles()) < set(
                allInstances[3].trainingSet.getFiles()))
        self.assertTrue(
            set(allInstances[1].trainingSet.getFiles()) < set(
                allInstances[4].trainingSet.getFiles()))
        self.assertTrue(
            set(allInstances[2].trainingSet.getFiles()) < set(
                allInstances[5].trainingSet.getFiles()))
        self.assertTrue(
            set(allInstances[0].trainingSet.getFiles()) == set(
                allInstances[1].trainingSet.getFiles()) == set(
                    allInstances[2].trainingSet.getFiles()))
        self.assertEqual(
            [m.engineSettings["max_iterations"] for m in allInstances],
            [2, 4, 6, 2, 4, 6])
        # The runs have a similar structure.
        allRuns = e.runTable["test"].allInstances
        self.assertEqual(len(allRuns), 18)
        self.assertEqual([r.engineOptions["prior_adjust"] for r in allRuns],
                         [-1.0, 0.0, 1.0] * 6)

    def testInheritedSplitWithLimit(self):

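        # The remote corpus has a size limit, so each partition is a proper
        # subset of the full split and only 5 files survive in total. A local
        # experiment that inherits the corpus must see exactly the same
        # partition contents.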
        expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
        os.makedirs(expDir)
        fp = open(os.path.join(expDir, "exp.xml"), "w")
        fp.write(self.REMOTE_CORPUS_XML_WITH_LIMIT)
        fp.close()

        patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                  "ne", "resources", "data", "json")

        from MAT.CarafeTrain import ExperimentEngine, fromXML
        e1 = ExperimentEngine(**fromXML(os.path.join(expDir, "exp.xml"),
                                        corpusPrefix=patternDir,
                                        dir=expDir))
        e1.run()

        remoteCorpus = e1.corporaTable["test"]
        for k in remoteCorpus.partitionDict.keys():
            self.assertTrue(
                set(remoteCorpus.getFiles(
                    partition=k)) < set(remoteCorpus.partitionDict[k]))
        self.assertEqual(len(remoteCorpus.getFiles()), 5)
        # And the union of the truncated partitions must equal the truncated
        # file list.
        allFiles = []
        for k in remoteCorpus.partitionDict.keys():
            allFiles += remoteCorpus.getFiles(partition=k)
        self.assertEqual(len(allFiles), len(remoteCorpus.getFiles()))
        self.assertEqual(set(allFiles), set(remoteCorpus.getFiles()))

        # Now, let's build a new directory.

        expLocalDir = os.path.join(self.testContext["TMPDIR"],
                                   "inherited_split_local")
        os.makedirs(expLocalDir)
        fp = open(os.path.join(expLocalDir, "exp.xml"), "w")
        fp.write(self.LOCAL_CORPUS_XML)
        fp.close()

        e2 = ExperimentEngine(
            **fromXML(os.path.join(expLocalDir, "exp.xml"), dir=expLocalDir))
        e2.run()

        localCorpus = e2.corporaTable["local_test"]
        for k in remoteCorpus.partitionDict.keys():
            # But the files shouldn't have been changed.
            self.assertEqual(set(remoteCorpus.partitionDict[k]),
                             set(localCorpus.partitionDict[k]))

    def testSplitOverride(self):

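        # The local experiment overrides the inherited split with partitions
        # of its own (sp4, sp5); the 5-file limit still applies, and every
        # file must land in one of the new partitions.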
        expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
        os.makedirs(expDir)
        fp = open(os.path.join(expDir, "exp.xml"), "w")
        fp.write(self.REMOTE_CORPUS_XML_WITH_LIMIT)
        fp.close()

        patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                  "ne", "resources", "data", "json")

        from MAT.CarafeTrain import ExperimentEngine, fromXML
        e1 = ExperimentEngine(**fromXML(os.path.join(expDir, "exp.xml"),
                                        corpusPrefix=patternDir,
                                        dir=expDir))
        e1.run()

        # Now, let's build a new directory.

        expLocalDir = os.path.join(self.testContext["TMPDIR"],
                                   "inherited_split_local")
        os.makedirs(expLocalDir)
        fp = open(os.path.join(expLocalDir, "exp.xml"), "w")
        fp.write(self.LOCAL_CORPUS_XML_WITH_SPLIT)
        fp.close()

        e2 = ExperimentEngine(
            **fromXML(os.path.join(expLocalDir, "exp.xml"), dir=expLocalDir))
        e2.run()

        localCorpus = e2.corporaTable["local_test"]
        self.assertEqual(set(localCorpus.partitionDict.keys()),
                         set(["sp4", "sp5"]))
        self.assertEqual(len(localCorpus.getFiles()), 5)
        allFiles = []
        for k in localCorpus.partitionDict.keys():
            allFiles += localCorpus.getFiles(partition=k)
        self.assertEqual(set(localCorpus.getFiles()), set(allFiles))

    def runTest(self):
        # What does the test look like? I need to create a corpus, build a
        # model, and then restart in the same directory. The chances of the
        # same elements being chosen in the same order across multiple runs
        # are very, very slim.

        expDir = os.path.join(self.testContext["TMPDIR"],
                              "restarted_size_iterators")
        os.makedirs(expDir)

        patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                  "ne", "resources", "data", "json")
        expFile = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                               "ne", "test", "exp", "exp_iterative.xml")
        from MAT.CarafeTrain import ExperimentEngine, CorpusSizeIterator, PreparedCorpus, TrainingRun, _unmarkDone
        e = ExperimentEngine(
            dir=expDir,
            task=self.task,
            corpora=[
                PreparedCorpus("test",
                               partitions=[("train", 4), ("test", 1)],
                               filePats=["*.json"],
                               prefix=patternDir)
            ],
            models=[
                TrainingRun(
                    "test",
                    trainingCorpora=[("test", "train")],
                    iterators=[CorpusSizeIterator(startVal=6, increment=1)])
            ])
        e.run()
        # Now, let's retrieve the training set files. They won't be in any
        # predictable order, because of the shuffling, so the check below
        # compares sets and looks specifically at the next-to-last iteration.
        allInstances = e.getModel("test").allInstances
        self.assertEqual(len(allInstances), 3)
        self.assertEqual([len(m.trainingSet.getFiles()) for m in allInstances],
                         [6, 7, 8])
        firstSet = allInstances[0].trainingSet.getFiles()
        secondSet = allInstances[1].trainingSet.getFiles()[:]
        thirdSet = allInstances[2].trainingSet.getFiles()[:]
        # Now, mark the size-7 and size-8 model instances as not done.
        _unmarkDone(allInstances[1].modelDir)
        _unmarkDone(allInstances[2].modelDir)
        # Get a new experiment object, and rerun.
        e = ExperimentEngine(
            dir=expDir,
            task=self.task,
            corpora=[
                PreparedCorpus("test",
                               partitions=[("train", 4), ("test", 1)],
                               filePats=["*.json"],
                               prefix=patternDir)
            ],
            models=[
                TrainingRun(
                    "test",
                    trainingCorpora=[("test", "train")],
                    iterators=[CorpusSizeIterator(startVal=6, increment=1)])
            ])
        e.run()
        allInstances = e.getModel("test").allInstances
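        # If the rerun had reshuffled the corpus, the odds of drawing the
        # same 7 files again would be very slim; equality here shows the
        # saved split was reused on restart.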
        self.assertEqual(set(secondSet),
                         set(allInstances[1].trainingSet.getFiles()))