def main(count=None, start=0):
    """Score lexical splits over a window of the input file and print metrics.

    Reads ``input_file`` (a name defined outside this function), selects
    ``count`` lines starting at index ``start``, copies the same window of
    ``ref_file`` into ``ref_output_file``, feeds every selected line to two
    ``LexicalSplitMetrics`` collectors ("inria" and "combined") and finally
    prints their metrics under a ``Name | BLEU | CHRF`` header.

    Args:
        count: Number of input lines to evaluate. A falsy value (``None`` or
            ``0``) means "everything from ``start`` to the end of the file".
        start: Zero-based index of the first line to evaluate.

    Raises:
        ValueError: Propagated from ``itertools.islice`` when ``start`` or
            ``start + count`` is negative.
    """
    # Collect inputs
    with open(input_file) as fp:
        inputs = fp.readlines()
    num_inputs = len(inputs)
    stop = start + count if count else None
    inputs = itertools.islice(inputs, start, stop)
    # Size the progress bar by the number of lines that will really be
    # iterated: never more than what is left after ``start``, never negative.
    remaining = max(num_inputs - start, 0)
    max_value = min(count, remaining) if count else remaining

    # Write the reference outputs for the same [start, stop) window as the
    # inputs, so that line N of the reference file matches input N.
    with open(ref_file) as fp, open(ref_output_file, "w") as out:
        out.writelines(itertools.islice(fp, start, stop))

    # Create the objects for scoring
    inria = LexicalSplitMetrics("inria", 10)
    combined = LexicalSplitMetrics("combined", 10)

    bar = progressbar.ProgressBar(max_value=max_value)
    with outputctx(strict_io=False):
        for line in bar(inputs):
            s = SanskritObject(line.strip(),
                               encoding=SLP1,
                               replace_ending_visarga=None)
            logger.debug("Input in SLP1 = %s", s.canonical())
            # Compute splits
            inria.update(s)
            combined.update(s)
        print("{:20s} | {:30s} | {:5s}".format("Name", "BLEU", "CHRF"))
        print("-" * 70)
        inria.print_metrics()
        combined.print_metrics()
def test_file_splits(lexan, splittext_refs):
    """Check that the reference split is among the splits found for a text.

    Args:
        lexan: Object providing ``getSandhiSplits`` (presumably a lexical
            analyzer fixture -- confirm against the test setup).
        splittext_refs: Indexable pair; item 0 is the SLP1 text to split and
            item 1 is the expected split as a list of strings.
    """
    full_text = splittext_refs[0]
    expected_split = splittext_refs[1]
    with outputctx(False):
        sentence = SanskritObject(full_text,
                                  encoding=SLP1,
                                  strict_io=True,
                                  replace_ending_visarga=None)
        graph = lexan.getSandhiSplits(sentence)
        assert graph is not None
        splits = graph.find_all_paths(max_paths=300, sort=False)
    # Stringify each path outside the output context, exactly where the
    # comparison happens.
    candidates = [[str(word) for word in path] for path in splits]
    assert expected_split in candidates
Ejemplo n.º 3
0
    def main():
        """Command-line driver: split or join words using ``Sandhi``.

        Parses arguments with ``getArgs()``, optionally configures file
        logging to ``sandhi.log``, then:

        * with ``args.split``: prints either all splits of ``args.word``
          (``args.all``) or the splits at position ``int(args.word_or_pos)``;
        * with ``args.join``: prints the joins of ``args.word`` and
          ``args.word_or_pos``;
        * with neither option: announces a demo and performs a join.

        Raises:
            ValueError: If ``args.loglevel`` is not the name of a logging
                level, or ``args.word_or_pos`` is not an integer when a
                single-position split is requested.
        """
        args = getArgs()
        if args.input_encoding is None:
            ie = None
        else:
            ie = SCHEMES[args.input_encoding]

        # Setup logging
        if args.loglevel:
            numeric_level = getattr(logging, args.loglevel.upper(), None)
            if not isinstance(numeric_level, int):
                raise ValueError('Invalid log level: %s' % args.loglevel)
            # Text mode: the logging handler writes str records, which a
            # binary-mode ("wb") stream rejects on Python 3.
            logging.basicConfig(filename="sandhi.log",
                                filemode="w",
                                level=numeric_level)

        logging.info("---------------------------------------------------")
        logging.info("Started processing at %s", datetime.datetime.now())

        sandhi = Sandhi()
        # if neither split nor join is chosen, just demo both
        if not args.split and not args.join:
            print(
                "Neither split nor join option chosen. Here's a demo of joining"
            )
            args.join = True
        with outputctx(args.strict_io):
            if args.split:
                word_in = SanskritNormalizedString(args.word,
                                                   encoding=ie,
                                                   strict_io=args.strict_io)
                if args.all:
                    print("All possible splits for {}".format(args.word))
                    splits = sandhi.split_all(word_in)
                else:
                    # In split mode the second positional argument is the
                    # split position rather than a second word.
                    pos = int(args.word_or_pos)
                    print("Splitting {0} at {1}".format(args.word, pos))
                    splits = sandhi.split_at(word_in, pos)
                print(splits)
            if args.join:
                print("Joining {0} {1}".format(args.word, args.word_or_pos))
                first_in = SanskritNormalizedString(args.word,
                                                    encoding=ie,
                                                    strict_io=args.strict_io)
                second_in = SanskritNormalizedString(args.word_or_pos,
                                                     encoding=ie,
                                                     strict_io=args.strict_io)
                joins = sandhi.join(first_in, second_in)
                print(joins)

        logging.info("Finished processing at %s", datetime.datetime.now())
        logging.info("---------------------------------------------------")
        logging.shutdown()
    def main():
        """Command-line driver for ``SanskritMorphologicalAnalyzer``.

        Parses arguments, stores ``args.need_lakara`` in the module-level
        ``need_lakara``, builds the sandhi split graph for ``args.data`` and,
        for every path found, prints the split together with the morphologies
        returned by ``constrainPath``. Timing figures for graph generation and
        path finding are printed at the end.
        """
        global need_lakara
        import time

        args = getArgs()
        print("Input String:", args.data)
        need_lakara = args.need_lakara

        if args.debug:
            logging.basicConfig(filename='SanskritMorphologicalAnalyzer.log',
                                filemode='w',
                                level=logging.DEBUG)
        analyzer = SanskritMorphologicalAnalyzer(args.lexical_lookup)
        encoding = (None if args.input_encoding is None
                    else SanskritBase.SCHEMES[args.input_encoding])
        text = SanskritBase.SanskritObject(args.data,
                                           encoding=encoding,
                                           strict_io=args.strict_io,
                                           replace_ending_visarga=None)
        print("Input String in SLP1:", text.canonical())
        print("Start Split")
        split_began = time.time()
        graph = analyzer.getSandhiSplits(text, tag=True)
        split_ended = time.time()
        print("End DAG generation")
        with SanskritBase.outputctx(args.strict_io):
            # Nothing to analyse when no split graph could be built.
            if not graph:
                print("No Valid Splits Found")
                return

            path_began = time.time()
            paths = graph.findAllPaths(max_paths=args.max_paths)
            path_ended = time.time()
            print("End pathfinding")
            print("Splits:")
            for path in paths:
                print("Lexical Split:", path)
                morphologies = analyzer.constrainPath(path)
                if not morphologies:
                    print("No valid morphologies for this split")
                    continue
                print("Valid Morphologies")
                for morphology in morphologies:
                    # Pair every word of the split with its entry in this
                    # morphology, looked up by the word's string form.
                    print([(word, morphology[str(word)]) for word in path])
            print("End Morphological Analysis")
            print("-----------")
            print("Performance")
            split_elapsed = split_ended - split_began
            path_elapsed = path_ended - path_began
            print("Time taken for split: {0:0.6f}s".format(split_elapsed))
            print("Time taken for path: {0:0.6f}s".format(path_elapsed))
def test_file_splits(lexan, kosh_entry):
    """Check that a kosh entry's reference split is among the splits found.

    Args:
        lexan: Object providing ``getSandhiSplits`` (presumably a lexical
            analyzer fixture -- confirm against the test setup).
        kosh_entry: Mapping with a ``'Word'`` key (Devanagari text to split)
            and a ``'Split'`` key (reference split; parts are separated by
            ``'+'`` and/or spaces).
    """
    word = kosh_entry['Word']
    reference = kosh_entry['Split']
    with outputctx(False):
        i = SanskritObject(word, encoding=DEVANAGARI, strict_io=True,
                           replace_ending_visarga=None)
        # Normalise the reference: trim it, treat spaces as '+' separators,
        # then compare on the canonical form of every part.
        expected = [
            SanskritObject(part, encoding=DEVANAGARI, strict_io=True,
                           replace_ending_visarga=None).canonical()
            for part in reference.strip().replace(" ", "+").split('+')
        ]
        graph = lexan.getSandhiSplits(i)
        assert graph is not None
        splits = graph.find_all_paths(max_paths=300, sort=False)
        assert expected in [list(map(str, ss)) for ss in splits]
Ejemplo n.º 6
0
def tags(argv=None):
    """Command-line driver that prints the morphological tags of a word.

    Looks the input up with a trailing visarga replaced by ``'s'``; when that
    yields nothing and ``strict_io`` is off, retries with ``'r'``. If a tag
    set and/or base is supplied, additionally prints the result of
    ``hasTag`` for the most recently built input string.

    Args:
        argv: Argument list forwarded to ``getTagsArgs``.
    """
    args = getTagsArgs(argv)
    if args.strict_io:
        print("Interpreting input strictly")
    else:
        print("Interpreting input loosely (strict_io set to false)")
    logger.info("Input String: %s", args.data)
    if args.input_encoding is None:
        ie = None
    else:
        ie = SCHEMES[args.input_encoding]
    s = LexicalSandhiAnalyzer(args.lexical_lookup)
    with outputctx(args.strict_io):
        i = SanskritNormalizedString(args.data,
                                     encoding=ie,
                                     strict_io=args.strict_io,
                                     replace_ending_visarga='s')
        print("Input String in SLP1:", i.canonical())
        ts = s.getMorphologicalTags(i, tmap=args.map_tags)
        print("Morphological tags:")
        if ts is not None:
            for t in ts:
                print(t)
        # Possible rakaranta
        # Try by replacing end visarga with 'r' instead
        elif not args.strict_io:
            i = SanskritNormalizedString(args.data,
                                         encoding=ie,
                                         strict_io=args.strict_io,
                                         replace_ending_visarga='r')
            # Honour the user's tag-mapping choice on the retry as well, so
            # both lookups report tags in the same form.
            ts = s.getMorphologicalTags(i, tmap=args.map_tags)
            if ts is not None:
                print("Input String in SLP1:", i.canonical())
                for t in ts:
                    print(t)
        if args.tag_set or args.base:
            # Either filter may be missing; hasTag receives None for it.
            g = set(args.tag_set) if args.tag_set is not None else None
            if args.base is not None:
                b = SanskritNormalizedString(args.base)
            else:
                b = None
            print(s.hasTag(i, b, g))
Ejemplo n.º 7
0
def sort_file_splits(kosh_entry):
    """Classify a kosh entry by whether its reference split is reproduced.

    Uses the module-level analyzer ``le``. A one-letter progress marker is
    printed (without newline) for every outcome except bad input.

    Args:
        kosh_entry: Mapping with a ``'Word'`` key (Devanagari text to split)
            and a ``'Split'`` key (reference split; parts are separated by
            ``'+'`` and/or spaces).

    Returns:
        ``"Bad_Input"`` when the reference split is not a string,
        ``"Split_Fail"`` (marker ``S``) when no split graph is produced,
        ``"Pass"`` (marker ``P``) when the reference is among the splits and
        ``"Fail"`` (marker ``F``) otherwise.
    """
    global le
    word = kosh_entry['Word']
    reference = kosh_entry['Split']
    with outputctx(False):
        i = SanskritObject(word,
                           encoding=DEVANAGARI,
                           strict_io=True,
                           replace_ending_visarga=None)
        if not isinstance(reference, str):
            # Bad input
            return "Bad_Input"
        # Normalise the reference: trim it, treat spaces as '+' separators,
        # then compare on the canonical form of every part.
        expected = [
            SanskritObject(part,
                           encoding=DEVANAGARI,
                           strict_io=True,
                           replace_ending_visarga=None).canonical()
            for part in reference.strip().replace(" ", "+").split('+')
        ]
        graph = le.getSandhiSplits(i)
        if graph is None:
            print("S", end='', flush=True)
            return "Split_Fail"
        splits = graph.find_all_paths(max_paths=300, sort=False)
        if expected in [list(map(str, ss)) for ss in splits]:
            # Pass
            print("P", end='', flush=True)
            return "Pass"
        print("F", end='', flush=True)
        return "Fail"
def main(argv=None):
    """Command-line driver for ``LexicalSandhiAnalyzer``.

    Without ``args.split`` it prints the morphological tags of the input
    (trying a trailing visarga as ``'s'`` first and, when that finds nothing
    and ``strict_io`` is off, as ``'r'``) and optionally a ``hasTag`` check.
    With ``args.split`` it builds the sandhi split graph, prints the paths
    found and reports timing.

    Args:
        argv: Argument list forwarded to ``getArgs``.

    Returns:
        The split graph produced in split mode (whatever ``getSandhiSplits``
        returned), or ``None`` in tag mode.
    """
    graph = None
    args = getArgs(argv)
    if args.strict_io:
        print("Interpreting input strictly")
    else:
        print("Interpreting input loosely (strict_io set to false)")
    print("Input String:", args.data)

    s = LexicalSandhiAnalyzer(args.lexical_lookup)
    if args.input_encoding is None:
        ie = None
    else:
        ie = SanskritBase.SCHEMES[args.input_encoding]
    with SanskritBase.outputctx(args.strict_io):
        if not args.split:
            i = SanskritBase.SanskritNormalizedString(
                args.data,
                encoding=ie,
                strict_io=args.strict_io,
                replace_ending_visarga='s')
            print("Input String in SLP1:", i.canonical())
            ts = s.getMorphologicalTags(i, tmap=args.map_tags)
            print("Morphological tags:")
            if ts is not None:
                for t in ts:
                    print(t)
            # Possible rakaranta
            # Try by replacing end visarga with 'r' instead
            elif not args.strict_io:
                i = SanskritBase.SanskritNormalizedString(
                    args.data,
                    encoding=ie,
                    strict_io=args.strict_io,
                    replace_ending_visarga='r')
                # Honour the user's tag-mapping choice on the retry as well,
                # so both lookups report tags in the same form.
                ts = s.getMorphologicalTags(i, tmap=args.map_tags)
                if ts is not None:
                    print("Input String in SLP1:", i.canonical())
                    for t in ts:
                        print(t)
            if args.tag_set or args.base:
                # Either filter may be missing; hasTag receives None for it.
                g = set(args.tag_set) if args.tag_set is not None else None
                if args.base is not None:
                    b = SanskritBase.SanskritNormalizedString(args.base)
                else:
                    b = None
                print(s.hasTag(i, b, g))
        else:
            import time
            i = SanskritBase.SanskritNormalizedString(
                args.data,
                encoding=ie,
                strict_io=args.strict_io,
                replace_ending_visarga=None)
            print("Input String in SLP1:", i.canonical())
            print("Start Split")
            start_split = time.time()
            graph = s.getSandhiSplits(i)
            end_graph = time.time()
            print("End DAG generation")
            if graph:
                logger.debug("Graph has %d nodes and %d edges" %
                             (len(graph.G.nodes()), len(graph.G.edges())))
                splits = graph.find_all_paths(max_paths=args.max_paths,
                                              score=args.score)
                print("End pathfinding", time.time())
                print("Splits:")
                if splits:
                    for split in splits:
                        print(split)
                else:
                    print("None")
            else:
                print("No Valid Splits Found")
            # Taken after the paths are printed, so the total below includes
            # path finding and output time.
            end_split = time.time()
            print("-----------")
            print("Performance")
            print("Time for graph generation = {0:0.6f}s".format(end_graph -
                                                                 start_split))
            print("Total time for graph generation + find paths = {0:0.6f}s".
                  format(end_split - start_split))
        return graph
Ejemplo n.º 9
0
    def main():
        """Command-line driver for ``SanskritLexicalAnalyzer``.

        Always logs to ``SanskritLexicalAnalyzer.log`` (DEBUG level with
        ``args.debug``, INFO otherwise). Without ``args.split`` it prints the
        lexical tags of the input with a trailing visarga read as ``'s'``
        and, when ``strict_io`` is off, also as ``'r'``; it then optionally
        prints a ``hasTag`` check. With ``args.split`` it builds the sandhi
        split graph, prints the paths found and reports timing.
        """
        args = getArgs()
        if args.strict_io:
            print("Interpreting input strictly")
        else:
            print("Interpreting input loosely (strict_io set to false)")
        print("Input String:", args.data)

        if args.debug:
            logging.basicConfig(filename='SanskritLexicalAnalyzer.log',
                                filemode='w',
                                level=logging.DEBUG)
        else:
            logging.basicConfig(filename='SanskritLexicalAnalyzer.log',
                                filemode='w',
                                level=logging.INFO)

        s = SanskritLexicalAnalyzer(args.lexical_lookup)
        if args.input_encoding is None:
            ie = None
        else:
            ie = SanskritBase.SCHEMES[args.input_encoding]
        with SanskritBase.outputctx(args.strict_io):
            if not args.split:
                i = SanskritBase.SanskritObject(args.data,
                                                encoding=ie,
                                                strict_io=args.strict_io,
                                                replace_ending_visarga='s')
                print("Input String in SLP1:", i.canonical())
                ts = s.getLexicalTags(i)
                print(ts)
                # Possible rakaranta
                # Try by replacing end visarga with 'r' instead
                if not args.strict_io:
                    i = SanskritBase.SanskritObject(args.data,
                                                    encoding=ie,
                                                    strict_io=args.strict_io,
                                                    replace_ending_visarga='r')
                    ts = s.getLexicalTags(i)
                    if ts is not None:
                        print("Input String in SLP1:", i.canonical())
                        print(ts)
                if args.tag_set or args.base:
                    # Only one of the two filters may be given: always bind
                    # both names and pass None for the missing one instead of
                    # leaving ``g`` undefined or wrapping a None base.
                    g = set(args.tag_set) if args.tag_set else None
                    if args.base is not None:
                        b = SanskritBase.SanskritObject(args.base)
                    else:
                        b = None
                    print(s.hasTag(i, b, g))
            else:
                import time
                i = SanskritBase.SanskritObject(args.data,
                                                encoding=ie,
                                                strict_io=args.strict_io,
                                                replace_ending_visarga=None)
                print("Input String in SLP1:", i.canonical())
                print("Start Split")
                start_split = time.time()
                graph = s.getSandhiSplits(i)
                end_graph = time.time()
                print("End DAG generation")
                if graph:
                    logger.debug("Graph has %d nodes and %d edges" %
                                 (len(graph.G.nodes()), len(graph.G.edges())))
                    splits = graph.findAllPaths(max_paths=args.max_paths,
                                                score=args.score)
                    print("End pathfinding", time.time())
                    print("Splits:")
                    if splits:
                        for split in splits:
                            print(split)
                    else:
                        print("None")
                else:
                    print("No Valid Splits Found")
                # Taken after the paths are printed, so the total below
                # includes path finding and output time.
                end_split = time.time()
                print("-----------")
                print("Performance")
                print("Time for graph generation = {0:0.6f}s".format(
                    end_graph - start_split))
                print(
                    "Total time for graph generation + find paths = {0:0.6f}s".
                    format(end_split - start_split))