def testPrepWithLocalSplit(self):
        """Check that a local corpus reuses a remote corpus's prepped files.

        Runs one experiment from REMOTE_CORPUS_XML_WITH_PREP, then a second
        one from LOCAL_CORPUS_XML_WITH_SPLIT in a sibling directory, and
        asserts that every file of the second experiment's "local_test"
        corpus lives under the first experiment's
        corpora/test/preprocessed/out directory.
        """

        expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
        os.makedirs(expDir)
        # Use a context manager so the handle is closed even if write() fails.
        with open(os.path.join(expDir, "exp.xml"), "w") as fp:
            fp.write(self.REMOTE_CORPUS_XML_WITH_PREP)

        patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                  "ne", "resources", "data", "json")

        from MAT.CarafeTrain import ExperimentEngine, fromXML
        e1 = ExperimentEngine(**fromXML(os.path.join(expDir, "exp.xml"),
                                        corpusPrefix=patternDir,
                                        dir=expDir))
        e1.run()

        # Now build the second experiment in its own directory.
        # NOTE(review): how the local XML refers back to the first
        # experiment is defined in the XML constant, not visible here.
        expLocalDir = os.path.join(self.testContext["TMPDIR"],
                                   "inherited_split_local")
        os.makedirs(expLocalDir)
        with open(os.path.join(expLocalDir, "exp.xml"), "w") as fp:
            fp.write(self.LOCAL_CORPUS_XML_WITH_SPLIT)

        e2 = ExperimentEngine(
            **fromXML(os.path.join(expLocalDir, "exp.xml"), dir=expLocalDir))
        e2.run()

        localCorpus = e2.corporaTable["local_test"]
        # The expected prefix is rooted in the FIRST experiment's directory.
        prepPath = os.path.join(e1.dir, "corpora", "test", "preprocessed",
                                "out")
        for f in localCorpus.getFiles():
            self.assertTrue(f.startswith(prepPath),
                            "%s is not under %s" % (f, prepPath))
    def testRemoteTruncate(self):
        """Check that a local corpus cannot exceed a truncated remote corpus.

        Runs one experiment from REMOTE_CORPUS_XML_WITH_TRUNCATION, then a
        second one from LOCAL_CORPUS_XML_WITH_BIG_LIMIT in a sibling
        directory, and asserts that the second experiment's "local_test"
        corpus contains exactly 5 files.
        """

        expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
        os.makedirs(expDir)
        # Use a context manager so the handle is closed even if write() fails.
        with open(os.path.join(expDir, "exp.xml"), "w") as fp:
            fp.write(self.REMOTE_CORPUS_XML_WITH_TRUNCATION)

        patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                  "ne", "resources", "data", "json")

        from MAT.CarafeTrain import ExperimentEngine, fromXML
        e1 = ExperimentEngine(**fromXML(os.path.join(expDir, "exp.xml"),
                                        corpusPrefix=patternDir,
                                        dir=expDir))
        e1.run()

        # Now, let's build a new directory.

        expLocalDir = os.path.join(self.testContext["TMPDIR"],
                                   "inherited_split_local")
        os.makedirs(expLocalDir)
        with open(os.path.join(expLocalDir, "exp.xml"), "w") as fp:
            fp.write(self.LOCAL_CORPUS_XML_WITH_BIG_LIMIT)

        e2 = ExperimentEngine(
            **fromXML(os.path.join(expLocalDir, "exp.xml"), dir=expLocalDir))
        e2.run()

        localCorpus = e2.corporaTable["local_test"]
        # 5, even though the local experiment asked for ten, because the
        # remote corpus is already truncated.
        self.assertEqual(len(localCorpus.getFiles()), 5)
    def testInheritedSplitWithVar(self):
        """Check that a local corpus inherits the remote corpus's partitions.

        Runs one experiment from REMOTE_CORPUS_XML, then a second one from
        LOCAL_CORPUS_XML_WITH_VAR in a sibling directory, and asserts that
        for every partition of the first experiment's "test" corpus, the
        second experiment's "local_test" corpus reports the same set of
        files.
        """

        expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
        os.makedirs(expDir)
        # Use a context manager so the handle is closed even if write() fails.
        with open(os.path.join(expDir, "exp.xml"), "w") as fp:
            fp.write(self.REMOTE_CORPUS_XML)

        patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                  "ne", "resources", "data", "json")

        from MAT.CarafeTrain import ExperimentEngine, fromXML
        e1 = ExperimentEngine(**fromXML(os.path.join(expDir, "exp.xml"),
                                        corpusPrefix=patternDir,
                                        dir=expDir))
        e1.run()

        # Now, let's build a new directory.

        expLocalDir = os.path.join(self.testContext["TMPDIR"],
                                   "inherited_split_local")
        os.makedirs(expLocalDir)
        with open(os.path.join(expLocalDir, "exp.xml"), "w") as fp:
            fp.write(self.LOCAL_CORPUS_XML_WITH_VAR)

        e2 = ExperimentEngine(
            **fromXML(os.path.join(expLocalDir, "exp.xml"), dir=expLocalDir))
        e2.run()

        remoteCorpus = e1.corporaTable["test"]
        localCorpus = e2.corporaTable["local_test"]
        # Compare as sets: only membership per partition matters, not order.
        for k in remoteCorpus.partitionDict.keys():
            self.assertEqual(set(remoteCorpus.getFiles(partition=k)),
                             set(localCorpus.getFiles(partition=k)))
    def testInheritedSplitWithLimit(self):
        """Check truncation of a limited remote corpus and its inheritance.

        Runs one experiment from REMOTE_CORPUS_XML_WITH_LIMIT and asserts
        that, for its "test" corpus, each partition's reported files are a
        strict subset of that partition's partitionDict entry, that 5 files
        are reported overall, and that the per-partition files together
        make up exactly the overall file list. Then runs a second
        experiment from LOCAL_CORPUS_XML and asserts that its "local_test"
        corpus has the same partitionDict contents as the remote corpus.
        """

        expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
        os.makedirs(expDir)
        # Use a context manager so the handle is closed even if write() fails.
        with open(os.path.join(expDir, "exp.xml"), "w") as fp:
            fp.write(self.REMOTE_CORPUS_XML_WITH_LIMIT)

        patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                  "ne", "resources", "data", "json")

        from MAT.CarafeTrain import ExperimentEngine, fromXML
        e1 = ExperimentEngine(**fromXML(os.path.join(expDir, "exp.xml"),
                                        corpusPrefix=patternDir,
                                        dir=expDir))
        e1.run()

        remoteCorpus = e1.corporaTable["test"]
        for k in remoteCorpus.partitionDict.keys():
            # "<" on sets is a strict-subset test: the reported files must
            # be fewer than the full partition contents.
            self.assertTrue(
                set(remoteCorpus.getFiles(
                    partition=k)) < set(remoteCorpus.partitionDict[k]))
        self.assertEqual(len(remoteCorpus.getFiles()), 5)
        # And the truncated partitions must equal the truncate file list.
        allFiles = []
        for k in remoteCorpus.partitionDict.keys():
            allFiles += remoteCorpus.getFiles(partition=k)
        # Equal length plus equal sets means no file shows up in two
        # partitions and none is missing.
        self.assertEqual(len(allFiles), len(remoteCorpus.getFiles()))
        self.assertEqual(set(allFiles), set(remoteCorpus.getFiles()))

        # Now, let's build a new directory.

        expLocalDir = os.path.join(self.testContext["TMPDIR"],
                                   "inherited_split_local")
        os.makedirs(expLocalDir)
        with open(os.path.join(expLocalDir, "exp.xml"), "w") as fp:
            fp.write(self.LOCAL_CORPUS_XML)

        e2 = ExperimentEngine(
            **fromXML(os.path.join(expLocalDir, "exp.xml"), dir=expLocalDir))
        e2.run()

        localCorpus = e2.corporaTable["local_test"]
        for k in remoteCorpus.partitionDict.keys():
            # But the files shouldn't have been changed.
            self.assertEqual(set(remoteCorpus.partitionDict[k]),
                             set(localCorpus.partitionDict[k]))
    def testPrep(self):
        """Check that a prepped corpus reports files from its prep output.

        Runs one experiment from REMOTE_CORPUS_XML_WITH_PREP and asserts
        that every file of its "test" corpus lives under the experiment's
        corpora/test/preprocessed/out directory.
        """

        expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
        os.makedirs(expDir)
        # Use a context manager so the handle is closed even if write() fails.
        with open(os.path.join(expDir, "exp.xml"), "w") as fp:
            fp.write(self.REMOTE_CORPUS_XML_WITH_PREP)

        patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                  "ne", "resources", "data", "json")

        from MAT.CarafeTrain import ExperimentEngine, fromXML
        e1 = ExperimentEngine(**fromXML(os.path.join(expDir, "exp.xml"),
                                        corpusPrefix=patternDir,
                                        dir=expDir))
        e1.run()

        # So now, we should have the preprocessing output directory inside
        # the experiment directory as the prefix for all the documents in
        # the corpus.

        remoteCorpus = e1.corporaTable["test"]
        prepPath = os.path.join(e1.dir, "corpora", "test", "preprocessed",
                                "out")
        for f in remoteCorpus.getFiles():
            self.assertTrue(f.startswith(prepPath),
                            "%s is not under %s" % (f, prepPath))
    def testSplitOverride(self):
        """Check that a local corpus can define its own split.

        Runs one experiment from REMOTE_CORPUS_XML_WITH_LIMIT, then a second
        one from LOCAL_CORPUS_XML_WITH_SPLIT in a sibling directory, and
        asserts that the second experiment's "local_test" corpus has exactly
        the partitions "sp4" and "sp5", reports 5 files overall, and that
        the per-partition files together cover the same set of files.
        """

        expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
        os.makedirs(expDir)
        # Use a context manager so the handle is closed even if write() fails.
        with open(os.path.join(expDir, "exp.xml"), "w") as fp:
            fp.write(self.REMOTE_CORPUS_XML_WITH_LIMIT)

        patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                                  "ne", "resources", "data", "json")

        from MAT.CarafeTrain import ExperimentEngine, fromXML
        e1 = ExperimentEngine(**fromXML(os.path.join(expDir, "exp.xml"),
                                        corpusPrefix=patternDir,
                                        dir=expDir))
        e1.run()

        # Now, let's build a new directory.

        expLocalDir = os.path.join(self.testContext["TMPDIR"],
                                   "inherited_split_local")
        os.makedirs(expLocalDir)
        with open(os.path.join(expLocalDir, "exp.xml"), "w") as fp:
            fp.write(self.LOCAL_CORPUS_XML_WITH_SPLIT)

        e2 = ExperimentEngine(
            **fromXML(os.path.join(expLocalDir, "exp.xml"), dir=expLocalDir))
        e2.run()

        localCorpus = e2.corporaTable["local_test"]
        # The local corpus must expose its own partition names.
        self.assertEqual(set(localCorpus.partitionDict.keys()),
                         set(["sp4", "sp5"]))
        self.assertEqual(len(localCorpus.getFiles()), 5)
        # The union of the per-partition files must match the whole corpus.
        allFiles = []
        for k in localCorpus.partitionDict.keys():
            allFiles += localCorpus.getFiles(partition=k)
        self.assertEqual(set(localCorpus.getFiles()), set(allFiles))
    def runTest(self):
        """Run the exp_many_iterators.xml sample experiment and verify the
        model and run instances it produces.

        Expects 6 instances of the "test" model whose training sets have
        sizes 4, 4, 4, 8, 8, 8 and whose max_iterations cycle 2, 4, 6, and
        18 instances of the "test" run whose prior_adjust cycles
        -1.0, 0.0, 1.0.
        """

        ctx = self.testContext
        workDir = os.path.join(ctx["TMPDIR"], "many_iterators")
        os.makedirs(workDir)

        sampleRoot = os.path.join(ctx["MAT_PKG_HOME"], "sample", "ne")
        corpusDir = os.path.join(sampleRoot, "resources", "data", "json")
        xmlPath = os.path.join(sampleRoot, "test", "exp",
                               "exp_many_iterators.xml")

        from MAT.CarafeTrain import ExperimentEngine, fromXML
        engineArgs = fromXML(xmlPath, corpusPrefix=corpusDir, dir=workDir)
        engine = ExperimentEngine(**engineArgs)
        engine.run()

        # Now, we examine the result. 6 models, 18 runs. There's no
        # particular order for the models - corpus iterations may or may
        # not come before build iterations. Actually, to respect
        # "innermost", build iterations have to come last. So the order of
        # the file sizes with max_iterations from 2 to 6 by 2 and corpus
        # from 4 to 8 by 4 should be: 4 4 4 8 8 8.
        models = engine.getModel("test").allInstances
        self.assertEqual(len(models), 6)

        trainingSizes = []
        for model in models:
            trainingSizes.append(len(model.trainingSet.getFiles()))
        self.assertEqual(trainingSizes, [4, 4, 4, 8, 8, 8])

        def filesOf(idx):
            # Training-set files of the idx-th model instance, as a set.
            return set(models[idx].trainingSet.getFiles())

        # Each small training set is a strict subset of the large training
        # set three positions later.
        for idx in range(3):
            self.assertTrue(filesOf(idx) < filesOf(idx + 3))

        # The first three instances all train on the same files.
        self.assertTrue(filesOf(0) == filesOf(1) == filesOf(2))

        iterationSettings = []
        for model in models:
            iterationSettings.append(model.engineSettings["max_iterations"])
        self.assertEqual(iterationSettings, [2, 4, 6] * 2)

        # The runs have a similar structure.
        runs = engine.runTable["test"].allInstances
        self.assertEqual(len(runs), 18)
        priorAdjusts = [run.engineOptions["prior_adjust"] for run in runs]
        self.assertEqual(priorAdjusts, [-1.0, 0.0, 1.0] * 6)