Beispiel #1
0
 def test_splits(self):
     """Train the model on every entry, then verify SS.split reproduces
     the reference splits and split locations for each word.

     BUG FIX: the original bound ``entries`` to a ``map`` object, a
     one-shot iterator.  The first loop exhausted it, so the second
     loop -- the one containing all the assertions -- never ran and the
     test passed vacuously.  Materializing into a list fixes that.
     """
     entries = [line.strip() for line in self.entries.readlines()]
     count = 0
     for line in entries:
         count += 1
         (word, splits, locs) = extract(line)
         self.model.add_entry(word, splits, locs)
     m = self.model.serialize()
     self.SS.set_model(m)
     # Second pass actually executes now that entries is a list.
     for line in entries:
         (word, splits, locs) = extract(line)
         obtained, pos = self.SS.split(word)
         self.assertEqual(locs, pos)
         self.assertEqual(splits, obtained)
 def test_load(self):
     """Round-trip the model through serialize()/load() and check that a
     Splitter built from the serialized form reproduces the probable
     split locations of the first training entry."""
     firstline = None
     for lineno, line in enumerate(self.entries, start=1):
         if lineno == 1:
             firstline = line
         (word, splits, locs) = extract(line)
         self.testModel.add_entry(word, splits, locs)
     serialized = self.testModel.serialize()
     self.testModel.load(serialized)
     self.splitter = Splitter(serialized)
     # Test probable splits against the first entry seen during training.
     (word, splits, locs) = extract(firstline)
     expected = list(locs)
     generated = self.splitter.splits(word)
     self.assertEqual(generated, expected)
Beispiel #3
0
 def test_load(self):
     """Serialize and reload the model, then verify its parameters and
     the probable splits of the first training entry survive the trip."""
     firstline = None
     for lineno, line in enumerate(self.entries, start=1):
         if lineno == 1:
             firstline = line
         (word, splits, locs) = extract(line)
         self.testModel.add_entry(word, splits, locs)
     serialized = self.testModel.serialize()
     self.testModel.load(serialized)
     # Parameters must match both the known fixture values and the
     # values recorded in the exported dict.
     self.assertEqual(self.testModel.k, 3)
     self.assertEqual(self.testModel.initial_skip, 1)
     self.assertEqual(self.testModel.k, serialized["k"])
     self.assertEqual(self.testModel.initial_skip, serialized["initial_skip"])
     # Test probable splits for the first entry.
     (word, splits, locs) = extract(firstline)
     expected = list(locs)
     generated = self.testModel.probable_splits(word)
     self.assertEqual(generated, expected)
def main():
    """Evaluate a trained split model against a test file.

    Required command-line options: -m/--modelfile (serialized model JSON),
    -t/--testfile (annotated test entries), -o/--output (result file).
    Prints aggregate split-point identification statistics to stdout.
    """
    parser = argparse.ArgumentParser(description="Test a model")
    arguments = [
        ["-m", "--modelfile", "path to model file",
            str, "modelfile"],
        ["-t", "--testfile", "path to test file",
            str, "testfile"],
        ["-o", "--output", "file to store output",
            str, "output"],
    ]
    for unix, gnu, desc, typename, dest in arguments:
        parser.add_argument(unix, gnu, help=desc, type=typename,
                            required=True, dest=dest)
    args = parser.parse_args()

    # Load data into the model (close the file promptly via `with`).
    with open(args.modelfile, "r", encoding="utf-8") as modelfile:
        model = json.load(modelfile)
    splitter = Splitter(model)
    postprocessor = PostProcessor()

    stats = (0, 0, 0, 0)
    linenumber = 0
    with open(args.testfile, "r", encoding="utf-8") as testfile, \
            open(args.output, "w", encoding="utf-8") as output:
        for line in testfile:
            linenumber += 1
            line = line.strip()
            try:
                word, desired_splits, desired_locs = extract(line)
            except ValueError:
                # BUG FIX: the original fell through after the error and
                # reused the previous iteration's word/desired_locs (or
                # raised NameError on the very first line).  Skip the
                # malformed line instead.
                print("Error in line %d" % linenumber)
                continue
            sps = splitter.splits(word)
            splits = postprocessor.split(word, sps)
            outstring = compress(word, splits, sps) + '\n'
            # Check what matches and not matches.
            location_metrics = location_error(desired_locs, sps, len(word))
            # Materialize each update: the original kept a lazily nested
            # chain of map iterators, deferring all the work to measures().
            stats = tuple(map(add, stats, location_metrics))
            output.write(outstring)

    # Get the aggregate measures.
    results = measures(stats)
    print("Split point identification stats:")
    for key in sorted(results.keys()):
        print('  ', key, ':', results[key])
Beispiel #5
0
def main():
    """Train a split model from an annotated training file.

    Required command-line options: -k/--depth (trie depth), -s/--skip
    (initial skip), -i/--trainfile (training data), -o/--outputfile
    (destination for the serialized JSON model).
    """
    parser = argparse.ArgumentParser(description="Train a model")
    arguments = [
        ["-k", "--depth", "depth of the trie", int, "depth"],
        ["-s", "--skip", "initial skip", int, "skip"],
        ["-i", "--trainfile", "path to training file", str, "trainfile"],
        ["-o", "--outputfile", "path to store model", str, "modelfile"],
    ]

    # Add options
    for unix, gnu, desc, typename, dest in arguments:
        parser.add_argument(unix,
                            gnu,
                            help=desc,
                            type=typename,
                            required=True,
                            dest=dest)

    args = parser.parse_args()

    # Load training file and add entries to model.  `with` guarantees the
    # file handle is released even when a line fails to parse.
    line_number = 0
    model = Model(depth=args.depth, skip=args.skip)
    try:
        with open(args.trainfile, "r", encoding="utf-8") as data:
            for line in data:
                line = line.strip()
                line_number += 1
                word, splits, locs = extract(line)
                model.add_entry(word, splits, locs)
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
        # propagate untouched; the original error is re-raised either way.
        print("Input file syntax error in line %d" % (line_number))
        raise

    # Serialize the model and export to file.
    exported = model.serialize()
    result = json.dumps(exported, ensure_ascii=False)
    with open(args.modelfile, "w", encoding="utf-8") as output_file:
        output_file.write(result)
Beispiel #6
0
def main():
    """Train a split model from an annotated training file.

    Required command-line options: -k/--depth (trie depth), -s/--skip
    (initial skip), -i/--trainfile (training data), -o/--outputfile
    (destination for the serialized JSON model).
    """
    parser = argparse.ArgumentParser(description="Train a model")
    arguments = [
        ["-k", "--depth", "depth of the trie", int, "depth"],
        ["-s", "--skip", "initial skip", int, "skip"],
        ["-i", "--trainfile", "path to training file",
            str, "trainfile"],
        ["-o", "--outputfile", "path to store model",
            str, "modelfile"],
    ]

    # Add options
    for unix, gnu, desc, typename, dest in arguments:
        parser.add_argument(unix, gnu, help=desc, type=typename,
                            required=True, dest=dest)

    args = parser.parse_args()

    # Load training file and add entries to model.  `with` guarantees the
    # file handle is released even when a line fails to parse.
    line_number = 0
    model = Model(depth=args.depth, skip=args.skip)
    try:
        with open(args.trainfile, "r", encoding="utf-8") as data:
            for line in data:
                line = line.strip()
                line_number += 1
                word, splits, locs = extract(line)
                model.add_entry(word, splits, locs)
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
        # propagate untouched; the original error is re-raised either way.
        print("Input file syntax error in line %d" % (line_number))
        raise

    # Serialize the model and export to file.
    exported = model.serialize()
    result = json.dumps(exported, ensure_ascii=False)
    with open(args.modelfile, "w", encoding="utf-8") as output_file:
        output_file.write(result)
 def test_extract_compress(self):
     """compress() must be the exact inverse of extract() for each line."""
     for raw in (entry.strip() for entry in self.entries.readlines()):
         word, splits, locs = extract(raw)
         rebuilt = compress(word, splits, locs)
         self.assertEqual(raw, rebuilt)
 def test_splits(self):
     """PostProcessor.split must reproduce the reference splits for
     every annotated entry."""
     for entry in self.entries:
         (word, expected_splits, locs) = extract(entry)
         generated = self.PP.split(word, locs)
         self.assertEqual(expected_splits, generated)