def testCorpusSizeSimple(self):
     # Train one model template with CorpusSizeIterator(2) over the "train"
     # partition. The assertions below expect four model instances trained
     # on 2, 4, 6 and 8 files, each training set a strict subset of the next.
     patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                               "ne", "resources", "data", "json")
     from MAT.Bootstrap import Bootstrapper, DocumentSet, TrainingRun, CorpusSizeIterator
     corpus = DocumentSet("test",
                          partitions=[("train", 4), ("test", 1)],
                          filePats=["*.json"],
                          prefix=patternDir)
     model = TrainingRun("test",
                         trainingCorpora=[("test", "train")],
                         iterators=[CorpusSizeIterator(2)])
     e = Bootstrapper(dir=self.expDir,
                      task=self.task,
                      corpora=[corpus],
                      models=[model])
     e.run()
     allInstances = e.getModel("test").allInstances
     self.assertEqual(len(allInstances), 4)
     # Fetch each instance's file list once and reuse it for both checks.
     fileLists = [m.trainingSet.getFiles() for m in allInstances]
     self.assertEqual([len(files) for files in fileLists], [2, 4, 6, 8])
     # "<" on sets is the strict-subset test: every training set must be
     # fully contained in, and smaller than, the one that follows it.
     fileSets = [set(files) for files in fileLists]
     for smaller, larger in zip(fileSets, fileSets[1:]):
         self.assertTrue(smaller < larger)
 def testIncrementLastStep(self):
     # Iterate max_iterations from 4 to 7 in steps of 2 with forceLast=True.
     # The step does not land on the end value, so forceLast should make a
     # difference: three model instances are expected (presumably 4, 6 and
     # the forced final 7 -- only the count is checked here).
     patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                               "ne", "resources", "data", "json")
     from MAT.Bootstrap import Bootstrapper, DocumentSet, TrainingRun, IncrementIterator
     corpus = DocumentSet("test",
                          partitions=[("train", 4), ("test", 1)],
                          filePats=["*.json"],
                          prefix=patternDir)
     model = TrainingRun("test",
                         trainingCorpora=[("test", "train")],
                         iterators=[
                             IncrementIterator("engineSettings",
                                               "max_iterations",
                                               4,
                                               7,
                                               2,
                                               forceLast=True)
                         ])
     e = Bootstrapper(dir=self.expDir,
                      task=self.task,
                      corpora=[corpus],
                      models=[model])
     e.run()
     # assertEqual (rather than assertTrue on a comparison) reports the
     # actual instance count when the expectation is not met.
     self.assertEqual(len(e.getModel("test").allInstances), 3)
 def testDoubleModelIncrement(self):
     # Two iterators on one model template: max_iterations (4..8 step 2)
     # crossed with l1_c (0.1..0.3 step 0.1). The expected result is nine
     # instances, with max_iterations varying slowest.
     # Forcing last on l1_c because of float rounding.
     patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                               "ne", "resources", "data", "json")
     from MAT.Bootstrap import Bootstrapper, DocumentSet, TrainingRun, IncrementIterator
     corpus = DocumentSet("test",
                          partitions=[("train", 4), ("test", 1)],
                          filePats=["*.json"],
                          prefix=patternDir)
     model = TrainingRun("test",
                         trainingCorpora=[("test", "train")],
                         engineSettings={"l1": True},
                         iterators=[
                             IncrementIterator("engineSettings",
                                               "max_iterations", 4, 8, 2),
                             IncrementIterator("engineSettings",
                                               "l1_c",
                                               0.1,
                                               0.3,
                                               .1,
                                               forceLast=True)
                         ])
     e = Bootstrapper(dir=self.expDir,
                      task=self.task,
                      corpora=[corpus],
                      models=[model])
     e.run()
     instances = e.getModel("test").allInstances
     self.assertEqual(len(instances), 9)
     self.assertEqual(
         [m.engineSettings["max_iterations"] for m in instances],
         [4, 4, 4, 6, 6, 6, 8, 8, 8])
     # Compare l1_c through str() so accumulated float error in the
     # increments does not break the equality check.
     self.assertEqual(
         [str(m.engineSettings["l1_c"]) for m in instances],
         ['0.1', '0.2', '0.3', '0.1', '0.2', '0.3', '0.1', '0.2', '0.3'])
     # The fixed engine setting must be carried into every instance.
     self.assertEqual([m.engineSettings["l1"] for m in instances],
                      [True] * 9)
     self.assertEqual([m.modelSubdir for m in instances], [
         "test_max_iterations_4_l1_c_0_1", "test_max_iterations_4_l1_c_0_2",
         "test_max_iterations_4_l1_c_0_3", "test_max_iterations_6_l1_c_0_1",
         "test_max_iterations_6_l1_c_0_2", "test_max_iterations_6_l1_c_0_3",
         "test_max_iterations_8_l1_c_0_1", "test_max_iterations_8_l1_c_0_2",
         "test_max_iterations_8_l1_c_0_3"
     ])
 def testModelPlusRunIncrement(self):
     # An iterator on the model (max_iterations 4..8 step 2) combined with
     # an iterator on the run (prior_adjust -1..1 step 1). Expected: three
     # model instances and nine run instances.
     patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                               "ne", "resources", "data", "json")
     from MAT.Bootstrap import Bootstrapper, DocumentSet, TrainingRun, TestRun, IncrementIterator
     corpus = DocumentSet("test",
                          partitions=[("train", 4), ("test", 1)],
                          filePats=["*.json"],
                          prefix=patternDir)
     model = TrainingRun("test",
                         trainingCorpora=[("test", "train")],
                         iterators=[
                             IncrementIterator("engineSettings",
                                               "max_iterations", 4, 8, 2)
                         ])
     run = TestRun("test",
                   model="test",
                   testCorpora=[("test", "test")],
                   engineOptions={
                       "steps": "zone,tokenize,tag",
                       "workflow": "Demo"
                   },
                   iterators=[
                       IncrementIterator("engineOptions", "prior_adjust",
                                         -1, 1, 1)
                   ])
     e = Bootstrapper(dir=self.expDir,
                      task=self.task,
                      corpora=[corpus],
                      models=[model],
                      runs=[run])
     e.run()
     models = e.getModel("test").allInstances
     self.assertEqual(len(models), 3)
     self.assertEqual([m.engineSettings["max_iterations"] for m in models],
                      [4, 6, 8])
     self.assertEqual([m.modelSubdir for m in models], [
         "test_max_iterations_4", "test_max_iterations_6",
         "test_max_iterations_8"
     ])
     runs = e.runTable["test"].allInstances
     self.assertEqual(len(runs), 9)
     # Each run directory is <run subdir>/<model subdir>; compare the last
     # two path components of every run.
     # Interleaved, so model dominant.
     runDirParts = [(os.path.basename(os.path.dirname(r.runDir)),
                     os.path.basename(r.runDir)) for r in runs]
     self.assertEqual(runDirParts,
                      [("test_prior_adjust__1", "test_max_iterations_4"),
                       ("test_prior_adjust_0", "test_max_iterations_4"),
                       ("test_prior_adjust_1", "test_max_iterations_4"),
                       ("test_prior_adjust__1", "test_max_iterations_6"),
                       ("test_prior_adjust_0", "test_max_iterations_6"),
                       ("test_prior_adjust_1", "test_max_iterations_6"),
                       ("test_prior_adjust__1", "test_max_iterations_8"),
                       ("test_prior_adjust_0", "test_max_iterations_8"),
                       ("test_prior_adjust_1", "test_max_iterations_8")])
 def testSimple(self):
     # A simple bootstrap, constructed from objects: one corpus with a fixed
     # partition (3 training files, the remainder for testing), one model
     # and one run against that model.
     patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                               "ne", "resources", "data", "json")
     from MAT.Bootstrap import Bootstrapper, DocumentSet, TrainingRun, TestRun
     corpus = DocumentSet("test",
                          partitions=[
                              ("train", 3),
                              ("test", DocumentSet.FIXED_PARTITION_REMAINDER)
                          ],
                          partitionIsFixed=True,
                          filePats=["*.json"],
                          prefix=patternDir)
     run = TestRun("test",
                   model="test",
                   testCorpora=[("test", "test")],
                   engineOptions={
                       "steps": "zone,tokenize,tag",
                       "workflow": "Demo"
                   })
     e = Bootstrapper(
         dir=self.expDir,
         task=self.task,
         corpora=[corpus],
         models=[TrainingRun("test", trainingCorpora=[("test", "train")])],
         runs=[run])
     e.run()
     # There must be exactly one model instance with a model file on disk.
     models = e.getModel("test").allInstances
     self.assertEqual(len(models), 1)
     m = models[0]
     self.assertTrue(os.path.exists(os.path.join(m.modelDir, "model")))
     # The fixed partition gives 3 training files; the remaining test
     # partition is expected to hold 7.
     files = e.corporaTable["test"].getFiles(partition="test")
     self.assertEqual(len(files), 7)
     self.assertEqual(len(m.trainingSet.getFiles()), 3)
     runs = e.runTable["test"].allInstances
     self.assertEqual(len(runs), 1)
     runDir = runs[0].runDir
     # Every test file must show up, under its own basename, in both the
     # run's "hyp" and "run_input" directories.
     for path in files:
         base = os.path.basename(path)
         self.assertTrue(
             os.path.exists(
                 os.path.join(runDir, "hyp", base) + ".prepped.tag.json"))
         self.assertTrue(
             os.path.exists(
                 os.path.join(runDir, "run_input", base) + ".prepped"))
 def testFiveWay(self):
     # Five-way cross-validation, constructed from objects: the corpus is
     # split into five equal partitions, one model is trained on each
     # combination of four, and each run tests the held-out partition
     # against the model that did not see it.
     patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                               "ne", "resources", "data", "json")
     from MAT.Bootstrap import Bootstrapper, DocumentSet, TrainingRun, TestRun
     partitions = ["s1", "s2", "s3", "s4", "s5"]

     def modelName(heldOut):
         # E.g. holding out "s5" names the model "s1234".
         return "s" + "".join([p[1:] for p in partitions if p != heldOut])

     # Models are declared in the order s1234, s1235, s1245, s1345, s2345,
     # i.e. holding out s5 first and s1 last.
     models = [
         TrainingRun(modelName(heldOut),
                     trainingCorpora=[("test", p) for p in partitions
                                      if p != heldOut])
         for heldOut in reversed(partitions)
     ]
     # Each run gets its own engineOptions dict; nothing is shared.
     runs = [
         TestRun(p,
                 model=modelName(p),
                 testCorpora=[("test", p)],
                 engineOptions={
                     "steps": "zone,tokenize,tag",
                     "workflow": "Demo"
                 })
         for p in partitions
     ]
     e = Bootstrapper(dir=self.expDir,
                      task=self.task,
                      corpora=[
                          DocumentSet("test",
                                      partitions=[(p, 1) for p in partitions],
                                      filePats=["*.json"],
                                      prefix=patternDir)
                      ],
                      models=models,
                      runs=runs)
     e.run()
     # Every model template must have exactly one instance, with a model
     # file on disk, trained on 8 files (four partitions of 2).
     for _, mTemplate in e.modelSetTable.items():
         self.assertEqual(len(mTemplate.allInstances), 1)
         instance = mTemplate.allInstances[0]
         self.assertTrue(
             os.path.exists(os.path.join(e.getModelDir(instance), "model")))
         self.assertEqual(len(instance.trainingSet.getFiles()), 8)
     # Every run must have exactly one instance, and both files of its
     # partition must show up, under their own basenames, in the run's
     # "hyp" and "run_input" directories.
     for p in partitions:
         files = e.corporaTable["test"].getFiles(partition=p)
         self.assertEqual(len(files), 2)
         r = e.runTable[p]
         self.assertEqual(len(r.allInstances), 1)
         runDir = r.allInstances[0].runDir
         for path in files:
             base = os.path.basename(path)
             self.assertTrue(
                 os.path.exists(
                     os.path.join(runDir, "hyp", base) +
                     ".prepped.tag.json"))
             self.assertTrue(
                 os.path.exists(
                     os.path.join(runDir, "run_input", base) + ".prepped"))
 def testIncrement(self):
     # One iterator on the model (max_iterations 4..8 step 2) and a plain
     # run against it. Expected: three model instances and, because the run
     # follows the model, three run instances named after the models.
     patternDir = os.path.join(self.testContext["MAT_PKG_HOME"], "sample",
                               "ne", "resources", "data", "json")
     from MAT.Bootstrap import Bootstrapper, DocumentSet, TrainingRun, TestRun, IncrementIterator
     corpus = DocumentSet("test",
                          partitions=[("train", 4), ("test", 1)],
                          filePats=["*.json"],
                          prefix=patternDir)
     model = TrainingRun("test",
                         trainingCorpora=[("test", "train")],
                         iterators=[
                             IncrementIterator("engineSettings",
                                               "max_iterations", 4, 8, 2)
                         ])
     run = TestRun("test",
                   model="test",
                   testCorpora=[("test", "test")],
                   engineOptions={
                       "steps": "zone,tokenize,tag",
                       "workflow": "Demo"
                   })
     e = Bootstrapper(dir=self.expDir,
                      task=self.task,
                      corpora=[corpus],
                      models=[model],
                      runs=[run])
     e.run()
     # Each model instance must have a model file on disk and must have
     # been trained on the same 8 files.
     models = e.getModel("test").allInstances
     self.assertEqual(len(models), 3)
     for m in models:
         self.assertTrue(os.path.exists(os.path.join(m.modelDir, "model")))
         self.assertEqual(len(m.trainingSet.getFiles()), 8)
     self.assertEqual([m.engineSettings["max_iterations"] for m in models],
                      [4, 6, 8])
     expectedSubdirs = [
         "test_max_iterations_4", "test_max_iterations_6",
         "test_max_iterations_8"
     ]
     self.assertEqual([m.modelSubdir for m in models], expectedSubdirs)
     files = e.corporaTable["test"].getFiles(partition="test")
     self.assertEqual(len(files), 2)
     runs = e.runTable["test"].allInstances
     self.assertEqual(len(runs), 3)
     # Every test file must show up, under its own basename, in the "hyp"
     # and "run_input" directories of every run instance.
     for r in runs:
         runDir = r.runDir
         for path in files:
             base = os.path.basename(path)
             self.assertTrue(
                 os.path.exists(
                     os.path.join(runDir, "hyp", base) +
                     ".prepped.tag.json"))
             self.assertTrue(
                 os.path.exists(
                     os.path.join(runDir, "run_input", base) + ".prepped"))
     # The run directories carry the same names as the model subdirs.
     self.assertEqual([os.path.basename(r.runDir) for r in runs],
                      expectedSubdirs)