def testPrepWithLocalSplit(self):
    """Check the files of a second experiment's corpus against the first's prep output.

    Runs one experiment from REMOTE_CORPUS_XML_WITH_PREP, then a second
    one from LOCAL_CORPUS_XML_WITH_SPLIT in a separate directory, and
    asserts that every file reported by the second experiment's
    "local_test" corpus starts with the first experiment's
    corpora/test/preprocessed/out path.
    """
    expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
    os.makedirs(expDir)
    expXml = os.path.join(expDir, "exp.xml")
    # The context manager closes the file even if the write raises.
    with open(expXml, "w") as fp:
        fp.write(self.REMOTE_CORPUS_XML_WITH_PREP)
    patternDir = os.path.join(self.testContext["MAT_PKG_HOME"],
                              "sample", "ne", "resources", "data", "json")
    from MAT.CarafeTrain import ExperimentEngine, fromXML
    e1 = ExperimentEngine(
        **fromXML(expXml, corpusPrefix=patternDir, dir=expDir))
    e1.run()

    # Second experiment, in its own directory.
    expLocalDir = os.path.join(self.testContext["TMPDIR"],
                               "inherited_split_local")
    os.makedirs(expLocalDir)
    expLocalXml = os.path.join(expLocalDir, "exp.xml")
    with open(expLocalXml, "w") as fp:
        fp.write(self.LOCAL_CORPUS_XML_WITH_SPLIT)
    e2 = ExperimentEngine(**fromXML(expLocalXml, dir=expLocalDir))
    e2.run()

    localCorpus = e2.corporaTable["local_test"]
    # Note that the expected prefix lives under the FIRST experiment's dir.
    prepPath = os.path.join(e1.dir, "corpora", "test",
                            "preprocessed", "out")
    for f in localCorpus.getFiles():
        self.assertTrue(f.startswith(prepPath))
def testRemoteTruncate(self):
    """Check that the second experiment's corpus ends up with 5 files.

    Runs one experiment from REMOTE_CORPUS_XML_WITH_TRUNCATION, then a
    second one from LOCAL_CORPUS_XML_WITH_BIG_LIMIT in a separate
    directory, and asserts that the second experiment's "local_test"
    corpus reports exactly 5 files.
    """
    expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
    os.makedirs(expDir)
    expXml = os.path.join(expDir, "exp.xml")
    # The context manager closes the file even if the write raises.
    with open(expXml, "w") as fp:
        fp.write(self.REMOTE_CORPUS_XML_WITH_TRUNCATION)
    patternDir = os.path.join(self.testContext["MAT_PKG_HOME"],
                              "sample", "ne", "resources", "data", "json")
    from MAT.CarafeTrain import ExperimentEngine, fromXML
    e1 = ExperimentEngine(
        **fromXML(expXml, corpusPrefix=patternDir, dir=expDir))
    e1.run()

    # Now, let's build a new directory.
    expLocalDir = os.path.join(self.testContext["TMPDIR"],
                               "inherited_split_local")
    os.makedirs(expLocalDir)
    expLocalXml = os.path.join(expLocalDir, "exp.xml")
    with open(expLocalXml, "w") as fp:
        fp.write(self.LOCAL_CORPUS_XML_WITH_BIG_LIMIT)
    e2 = ExperimentEngine(**fromXML(expLocalXml, dir=expLocalDir))
    e2.run()

    localCorpus = e2.corporaTable["local_test"]
    # 5, even though I asked for ten, because the remote corpus
    # is already truncated.
    self.assertEqual(len(localCorpus.getFiles()), 5)
def testInheritedSplitWithVar(self):
    """Check that both experiments' corpora agree partition by partition.

    Runs one experiment from REMOTE_CORPUS_XML, then a second one from
    LOCAL_CORPUS_XML_WITH_VAR in a separate directory, and asserts that
    for every partition key of the first experiment's "test" corpus,
    the second experiment's "local_test" corpus reports the same set of
    files.
    """
    expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
    os.makedirs(expDir)
    expXml = os.path.join(expDir, "exp.xml")
    # The context manager closes the file even if the write raises.
    with open(expXml, "w") as fp:
        fp.write(self.REMOTE_CORPUS_XML)
    patternDir = os.path.join(self.testContext["MAT_PKG_HOME"],
                              "sample", "ne", "resources", "data", "json")
    from MAT.CarafeTrain import ExperimentEngine, fromXML
    e1 = ExperimentEngine(
        **fromXML(expXml, corpusPrefix=patternDir, dir=expDir))
    e1.run()

    # Now, let's build a new directory.
    expLocalDir = os.path.join(self.testContext["TMPDIR"],
                               "inherited_split_local")
    os.makedirs(expLocalDir)
    expLocalXml = os.path.join(expLocalDir, "exp.xml")
    with open(expLocalXml, "w") as fp:
        fp.write(self.LOCAL_CORPUS_XML_WITH_VAR)
    e2 = ExperimentEngine(**fromXML(expLocalXml, dir=expLocalDir))
    e2.run()

    remoteCorpus = e1.corporaTable["test"]
    localCorpus = e2.corporaTable["local_test"]
    # Compare as sets: file order within a partition is not checked.
    for k in remoteCorpus.partitionDict.keys():
        self.assertEqual(set(remoteCorpus.getFiles(partition=k)),
                         set(localCorpus.getFiles(partition=k)))
def testInheritedSplitWithLimit(self):
    """Check a limited corpus and the partitions seen by a second experiment.

    Runs one experiment from REMOTE_CORPUS_XML_WITH_LIMIT and asserts,
    on its "test" corpus, that:

    - each partition's getFiles() result is a proper subset of that
      partition's partitionDict entry;
    - getFiles() reports 5 files overall;
    - the per-partition getFiles() results, taken together, have the
      same length and the same members as the overall getFiles().

    It then runs a second experiment from LOCAL_CORPUS_XML in a separate
    directory and asserts that its "local_test" corpus has the same
    partitionDict contents, key by key, as the first experiment's corpus.
    """
    expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
    os.makedirs(expDir)
    expXml = os.path.join(expDir, "exp.xml")
    # The context manager closes the file even if the write raises.
    with open(expXml, "w") as fp:
        fp.write(self.REMOTE_CORPUS_XML_WITH_LIMIT)
    patternDir = os.path.join(self.testContext["MAT_PKG_HOME"],
                              "sample", "ne", "resources", "data", "json")
    from MAT.CarafeTrain import ExperimentEngine, fromXML
    e1 = ExperimentEngine(
        **fromXML(expXml, corpusPrefix=patternDir, dir=expDir))
    e1.run()

    remoteCorpus = e1.corporaTable["test"]
    for k in remoteCorpus.partitionDict.keys():
        # Strict subset: getFiles() must return fewer files than the
        # partition's partitionDict entry holds.
        self.assertTrue(
            set(remoteCorpus.getFiles(partition=k))
            < set(remoteCorpus.partitionDict[k]))
    self.assertEqual(len(remoteCorpus.getFiles()), 5)

    # And the truncated partitions must equal the truncate file list.
    allFiles = []
    for k in remoteCorpus.partitionDict.keys():
        allFiles += remoteCorpus.getFiles(partition=k)
    # Equal length plus equal sets means no file is in two partitions.
    self.assertEqual(len(allFiles), len(remoteCorpus.getFiles()))
    self.assertEqual(set(allFiles), set(remoteCorpus.getFiles()))

    # Now, let's build a new directory.
    expLocalDir = os.path.join(self.testContext["TMPDIR"],
                               "inherited_split_local")
    os.makedirs(expLocalDir)
    expLocalXml = os.path.join(expLocalDir, "exp.xml")
    with open(expLocalXml, "w") as fp:
        fp.write(self.LOCAL_CORPUS_XML)
    e2 = ExperimentEngine(**fromXML(expLocalXml, dir=expLocalDir))
    e2.run()

    localCorpus = e2.corporaTable["local_test"]
    for k in remoteCorpus.partitionDict.keys():
        # But the files shouldn't have been changed.
        self.assertEqual(set(remoteCorpus.partitionDict[k]),
                         set(localCorpus.partitionDict[k]))
def testPrep(self):
    """Check that the corpus files live under the experiment's prep output.

    Runs one experiment from REMOTE_CORPUS_XML_WITH_PREP and asserts that
    every file reported by its "test" corpus starts with the experiment's
    corpora/test/preprocessed/out path.
    """
    expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
    os.makedirs(expDir)
    expXml = os.path.join(expDir, "exp.xml")
    # The context manager closes the file even if the write raises.
    with open(expXml, "w") as fp:
        fp.write(self.REMOTE_CORPUS_XML_WITH_PREP)
    patternDir = os.path.join(self.testContext["MAT_PKG_HOME"],
                              "sample", "ne", "resources", "data", "json")
    from MAT.CarafeTrain import ExperimentEngine, fromXML
    e1 = ExperimentEngine(
        **fromXML(expXml, corpusPrefix=patternDir, dir=expDir))
    e1.run()

    # So now, we should have the expDir directory as the prefix for all
    # the documents in the corpus.
    remoteCorpus = e1.corporaTable["test"]
    prepPath = os.path.join(e1.dir, "corpora", "test",
                            "preprocessed", "out")
    for f in remoteCorpus.getFiles():
        self.assertTrue(f.startswith(prepPath))
def testSplitOverride(self):
    """Check the partitions and file count of the second experiment's corpus.

    Runs one experiment from REMOTE_CORPUS_XML_WITH_LIMIT, then a second
    one from LOCAL_CORPUS_XML_WITH_SPLIT in a separate directory, and
    asserts on the second experiment's "local_test" corpus that:

    - its partition keys are exactly "sp4" and "sp5";
    - getFiles() reports 5 files;
    - the per-partition getFiles() results together contain the same
      members as the overall getFiles().
    """
    expDir = os.path.join(self.testContext["TMPDIR"], "inherited_split")
    os.makedirs(expDir)
    expXml = os.path.join(expDir, "exp.xml")
    # The context manager closes the file even if the write raises.
    with open(expXml, "w") as fp:
        fp.write(self.REMOTE_CORPUS_XML_WITH_LIMIT)
    patternDir = os.path.join(self.testContext["MAT_PKG_HOME"],
                              "sample", "ne", "resources", "data", "json")
    from MAT.CarafeTrain import ExperimentEngine, fromXML
    e1 = ExperimentEngine(
        **fromXML(expXml, corpusPrefix=patternDir, dir=expDir))
    e1.run()

    # Now, let's build a new directory.
    expLocalDir = os.path.join(self.testContext["TMPDIR"],
                               "inherited_split_local")
    os.makedirs(expLocalDir)
    expLocalXml = os.path.join(expLocalDir, "exp.xml")
    with open(expLocalXml, "w") as fp:
        fp.write(self.LOCAL_CORPUS_XML_WITH_SPLIT)
    e2 = ExperimentEngine(**fromXML(expLocalXml, dir=expLocalDir))
    e2.run()

    localCorpus = e2.corporaTable["local_test"]
    self.assertEqual(set(localCorpus.partitionDict.keys()),
                     set(["sp4", "sp5"]))
    self.assertEqual(len(localCorpus.getFiles()), 5)
    allFiles = []
    for k in localCorpus.partitionDict.keys():
        allFiles += localCorpus.getFiles(partition=k)
    # Set comparison only: duplicates across partitions are not detected.
    self.assertEqual(set(localCorpus.getFiles()), set(allFiles))
def runTest(self):
    """Run the exp_many_iterators.xml sample experiment and inspect it.

    Asserts on the "test" model's instances: there are 6, their training
    set sizes are [4, 4, 4, 8, 8, 8], each of the first three training
    sets is a proper subset of the one three positions later, the first
    three training sets are equal, and max_iterations runs
    [2, 4, 6, 2, 4, 6]. Asserts on the "test" runs: there are 18, and
    prior_adjust cycles through [-1.0, 0.0, 1.0] six times.
    """
    workDir = os.path.join(self.testContext["TMPDIR"], "many_iterators")
    os.makedirs(workDir)
    pkgHome = self.testContext["MAT_PKG_HOME"]
    corpusDir = os.path.join(pkgHome, "sample", "ne", "resources",
                             "data", "json")
    xmlPath = os.path.join(pkgHome, "sample", "ne", "test", "exp",
                           "exp_many_iterators.xml")
    from MAT.CarafeTrain import ExperimentEngine, fromXML
    settings = fromXML(xmlPath, corpusPrefix=corpusDir, dir=workDir)
    engine = ExperimentEngine(**settings)
    engine.run()

    # Now, we examine the result. 6 models, 18 runs. There's no particular
    # order for the models - corpus iterations may or may not come before
    # build iterations. Actually, if I'm going to respect "innermost",
    # build iterations have to come last. So the order of the file sizes
    # with max_iterations from 2 to 6 by 2 and corpus from 4 to 8 by 4
    # should be: 4 4 4 8 8 8.
    models = engine.getModel("test").allInstances
    self.assertEqual(len(models), 6)
    trainingSizes = [len(model.trainingSet.getFiles()) for model in models]
    self.assertEqual(trainingSizes, [4, 4, 4, 8, 8, 8])

    # Each 4-file training set must be strictly contained in the 8-file
    # training set three positions later.
    for idx in range(3):
        self.assertTrue(
            set(models[idx].trainingSet.getFiles())
            < set(models[idx + 3].trainingSet.getFiles()))

    # The first three models must share one and the same training set.
    self.assertTrue(
        set(models[0].trainingSet.getFiles())
        == set(models[1].trainingSet.getFiles())
        == set(models[2].trainingSet.getFiles()))

    iterationSettings = [model.engineSettings["max_iterations"]
                         for model in models]
    self.assertEqual(iterationSettings, [2, 4, 6, 2, 4, 6])

    # The runs have a similar structure.
    runs = engine.runTable["test"].allInstances
    self.assertEqual(len(runs), 18)
    priorAdjusts = [run.engineOptions["prior_adjust"] for run in runs]
    self.assertEqual(priorAdjusts, [-1.0, 0.0, 1.0] * 6)