class TestSplitter(TestCase):
    """Build a Model from the sample corpus and exercise Splitter on it."""

    def setUp(self):
        super(TestSplitter, self).setUp()
        self.testModel = Model(depth=3, skip=1)
        testcases = resource_filename("sandhisplitter.tests",
                                      "resources/samples.txt")
        self.entries = open(testcases, "r", encoding='utf-8')
        # Fix: the sample file was never closed; ensure it is closed even
        # when a test fails.
        self.addCleanup(self.entries.close)

    def test_load(self):
        """Train on every sample line, then check Splitter reproduces the
        split locations recorded for the first line."""
        count = 0
        firstline = None
        for line in self.entries:
            count += 1
            if count == 1:
                firstline = line
            (word, splits, locs) = extract(line)
            self.testModel.add_entry(word, splits, locs)
        m = self.testModel.serialize()
        self.testModel.load(m)
        self.splitter = Splitter(m)
        # Test probable splits
        (word, splits, locs) = extract(firstline)
        locs = list(locs)
        sps = self.splitter.splits(word)
        self.assertEqual(sps, locs)
class TestModel(TestCase):
    """Tests for Model training, serialization and round-trip loading."""

    def setUp(self):
        super(TestModel, self).setUp()
        self.testModel = Model(depth=3, skip=1)
        testcases = resource_filename("sandhisplitter.tests",
                                      "resources/samples.txt")
        self.entries = open(testcases, "r", encoding='utf-8')
        # Fix: the sample file was never closed; ensure it is closed even
        # when a test fails.
        self.addCleanup(self.entries.close)

    def test_load(self):
        """Serialize a trained model, reload it, and verify the parameters
        and the probable splits for the first sample line survive."""
        count = 0
        firstline = None
        for line in self.entries:
            count += 1
            if count == 1:
                firstline = line
            (word, splits, locs) = extract(line)
            self.testModel.add_entry(word, splits, locs)
        m = self.testModel.serialize()
        self.testModel.load(m)
        self.assertEqual(self.testModel.k, 3)
        self.assertEqual(self.testModel.initial_skip, 1)
        self.assertEqual(self.testModel.k, m["k"])
        self.assertEqual(self.testModel.initial_skip, m["initial_skip"])
        # Test probable splits
        (word, splits, locs) = extract(firstline)
        locs = list(locs)
        sps = self.testModel.probable_splits(word)
        self.assertEqual(sps, locs)

    def test_error(self):
        """Model must reject a non-numeric depth argument."""
        self.assertRaises(ValueError, Model, "what")
class TestSandhisplitter(TestCase):
    """End-to-end tests for the Sandhisplitter facade."""

    def setUp(self):
        super(TestSandhisplitter, self).setUp()
        self.model = Model(depth=3, skip=1)
        self.SS = Sandhisplitter()
        testcases = resource_filename("sandhisplitter.tests",
                                      "resources/samples.txt")
        self.entries = open(testcases, "r", encoding='utf-8')
        # Fix: the sample file was never closed; ensure it is closed even
        # when a test fails.
        self.addCleanup(self.entries.close)

    def test_splits(self):
        """Train on every sample line, then verify split() reproduces the
        recorded splits and locations for each line.

        BUG FIX: the original built ``entries`` with ``map``, which in
        Python 3 is a one-shot iterator.  The training loop exhausted it,
        so the second loop — the one containing all the assertions — never
        ran.  Materializing the stripped lines as a list restores the
        intended checks.
        """
        entries = [line.strip() for line in self.entries]
        for line in entries:
            (word, splits, locs) = extract(line)
            self.model.add_entry(word, splits, locs)
        m = self.model.serialize()
        self.SS.set_model(m)
        for line in entries:
            (word, splits, locs) = extract(line)
            obtained, pos = self.SS.split(word)
            self.assertEqual(locs, pos)
            self.assertEqual(splits, obtained)

    def test_details(self):
        """Module name and info strings are fixed identifiers."""
        self.assertEqual(self.SS.get_module_name(), "Sandhi-Splitter")
        self.assertEqual(self.SS.get_info(), "Sandhi-splitter for malayalam")

    def test_instance(self):
        """getInstance() returns a Sandhisplitter object."""
        self.assertEqual(isinstance(getInstance(), Sandhisplitter), True)
def main():  # pragma: no cover
    """Train a sandhi-splitter model from a file and write it as JSON.

    Command line: -k/--depth (trie depth), -s/--skip (initial skip),
    -i/--trainfile (training corpus path), -o/--outputfile (model path).
    All four options are required.

    Raises whatever ``extract``/``add_entry`` raised on a malformed input
    line, after printing the offending line number.
    """
    parser = argparse.ArgumentParser(description="Train a model")
    arguments = [
        ["-k", "--depth", "depth of the trie", int, "depth"],
        ["-s", "--skip", "initial skip", int, "skip"],
        ["-i", "--trainfile", "path to training file", str, "trainfile"],
        ["-o", "--outputfile", "path to store model", str, "modelfile"],
    ]
    # Add options
    for unix, gnu, desc, typename, dest in arguments:
        parser.add_argument(unix, gnu, help=desc, type=typename,
                            required=True, dest=dest)
    args = parser.parse_args()

    # Load training file and add entries to model.  `with` guarantees the
    # handle is closed (the original leaked it).
    line_number = 0
    model = Model(depth=args.depth, skip=args.skip)
    with open(args.trainfile, "r", encoding="utf-8") as data:
        try:
            for line in data:
                line = line.strip()
                line_number += 1
                word, splits, locs = extract(line)
                model.add_entry(word, splits, locs)
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are not intercepted; the original error is still re-raised.
            print("Input file syntax error in line %d" % (line_number))
            raise

    # Serialize the model and export to file.  `with` ensures the JSON is
    # flushed and the file closed (the original never closed it, risking
    # a truncated model file).
    exported = model.serialize()
    with open(args.modelfile, "w", encoding="utf-8") as output_file:
        output_file.write(json.dumps(exported, ensure_ascii=False))