Example #1
# Assumed imports for this snippet (not shown in the original excerpt);
# paths follow the sanskrit_parser package layout.
import logging
from sanskrit_parser.base.sanskrit_base import SanskritObject, SLP1

logger = logging.getLogger(__name__)


def test_splits(lexan, uohd_refs):
    # Check if s is in splits
    def _in_splits(s, splits):
        return s in [list(map(str, ss)) for ss in splits]

    f = uohd_refs[0]
    s = uohd_refs[1]
    i = SanskritObject(f, encoding=SLP1)
    try:
        # for sss in s:
        #    if not lexan.forms.valid(sss):
        #        return "Skip"
        graph = lexan.getSandhiSplits(i)
        if graph is None:
            logger.error("FAIL: Empty split for {}".format(
                i.canonical().encode('utf-8')))
            return False
        # Limit the search to at most 100 paths to bound runtime
        splits = graph.findAllPaths(max_paths=100, sort=False)
        if splits is None:
            logger.error("FAIL: no paths found for {}".format(
                i.canonical().encode('utf-8')))
            return False
        r = _in_splits(s, splits)
        if not r:
            logger.error("FAIL: {} not in {}".format(s, splits))
        return r
    except Exception:
        logger.warning("Split Exception: {}".format(
            i.canonical().encode('utf-8')))
        return "Error"
Example #2
# Assumed imports for this snippet (not shown in the original excerpt);
# getArgs() is defined alongside main() in the original module.
import logging
from sanskrit_parser.base.sanskrit_base import SanskritObject, SCHEMES
from sanskrit_parser.util.DhatuWrapper import DhatuWrapper


def main():
    args = getArgs()
    print("Input Dhatu:", args.dhatu)
    level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(filename='DhatuWrapper.log', filemode='w', level=level)
    logger = logging.getLogger(__name__)
    ie = None if args.input_encoding is None else SCHEMES[args.input_encoding]
    i = SanskritObject(args.dhatu, encoding=ie)
    it = i.canonical()
    print("Input String in SLP1:", it)
    logger.info("Input String in SLP1: {}".format(it))
    w = DhatuWrapper(logger=logger)
    if args.tags == "all":
        res = w._get_dhatus(it)
    else:
        # Materialize the mapping so it prints as a list under Python 3
        res = [x[args.tags] for x in w._get_dhatus(it)]
    print(res)
    print("Is {} sakarmaka?: {}".format(it, w.is_sakarmaka(it)))
    logger.info("Reported {}".format(res))
Example #3
# input_file, ref_file and ref_output_file are module-level path constants
# in the original evaluation script.
def main(count=None, start=0):
    # Collect inputs
    with open(input_file) as fp:
        inputs = fp.readlines()
    num_inputs = len(inputs)
    stop = start + count if count else None
    inputs = itertools.islice(inputs, start, stop)
    max_value = count or num_inputs - start

    # Write the reference outputs
    with open(ref_file) as fp, open(ref_output_file, "w") as out:
        refs = fp.readlines()
        # Slice the references to the same [start, stop) window as the inputs
        refs = itertools.islice(refs, start, stop)
        out.write("".join(refs))

    # Create the objects for scoring
    inria = LexicalSplitMetrics("inria", 10)
    combined = LexicalSplitMetrics("combined", 10)

    bar = progressbar.ProgressBar(max_value=max_value)
    with outputctx(strict_io=False):
        for line in bar(inputs):
            s = SanskritObject(line.strip(),
                               encoding=SLP1,
                               replace_ending_visarga=None)
            logger.debug("Input in SLP1 = %s", s.canonical())
            # Compute splits
            inria.update(s)
            combined.update(s)
        print("{:20s} | {:30s} | {:5s}".format("Name", "BLEU", "CHRF"))
        print("-" * 70)
        inria.print_metrics()
        combined.print_metrics()
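
The start/count windowing above relies on itertools.islice; a small self-contained illustration of the same arithmetic:

import itertools

lines = ["s0", "s1", "s2", "s3", "s4"]
start, count = 1, 2
stop = start + count if count else None   # None means "to the end"
print(list(itertools.islice(lines, start, stop)))  # ['s1', 's2']
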
Example #4
# Method of a web API resource class in the original server; assumed
# imports: request is flask.request, Parser comes from sanskrit_parser.
def get(self, v):
    """ Parse a presegmented sentence """
    strict_p = True
    if request.args.get("strict") == "false":
        strict_p = False
    vobj = SanskritObject(v,
                          strict_io=strict_p,
                          replace_ending_visarga=None)
    parser = Parser(input_encoding="SLP1",
                    output_encoding="Devanagari",
                    replace_ending_visarga='s')
    mres = []
    sdot = None
    pdots = []
    print(v)
    # pre_segmented input is expected to yield a single split; if several
    # are returned, the last one's parses win.
    for split in parser.split(vobj.canonical(),
                              limit=10,
                              pre_segmented=True):
        parses = list(split.parse(limit=10))
        sdot = split.to_dot()
        mres = [x.serializable() for x in parses]
        pdots = [x.to_dot() for x in parses]
    r = {
        "input": v,
        "devanagari": vobj.devanagari(),
        "analysis": mres,
        "split_dot": sdot,
        "parse_dots": pdots
    }
    return r
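
Outside the web layer, the same split-and-parse flow can be reproduced directly; a minimal sketch using the Parser calls from the handler above (the sentence is an arbitrary SLP1 sample):

from sanskrit_parser import Parser

parser = Parser(input_encoding="SLP1", output_encoding="Devanagari")
# Presegmented input: words are already separated by spaces
for split in parser.split("devadattaH grAmam gacCati",
                          limit=1, pre_segmented=True):
    for parse in split.parse(limit=2):
        print(parse)
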
Example #5
# Companion API method to the previous example: split only, no parse.
def get(self, v):
    """ Presegmented Split """
    vobj = SanskritObject(v, strict_io=True, replace_ending_visarga=None)
    parser = Parser(input_encoding="SLP1",
                    output_encoding="Devanagari",
                    replace_ending_visarga='s')
    splits = parser.split(vobj.canonical(), limit=10, pre_segmented=True)
    r = {
        "input": v,
        "devanagari": vobj.devanagari(),
        "splits": [x.serializable()['split'] for x in splits]
    }
    return r
Example #6
# Assumed imports for this snippet (not shown in the original excerpt);
# WordLevelMetrics is defined in the same evaluation script.
from sanskrit_parser.base.sanskrit_base import SanskritObject, IAST


def process(sentences, tag_mapper, ws):
    inria_metrics = WordLevelMetrics("inria")
    sdata_metrics = WordLevelMetrics("sanskrit_data")
    comb_metrics = WordLevelMetrics("combined")

    stats = {}
    missing = []
    for sent in sentences:
        if sent is None:
            continue
        text_obj = SanskritObject(sent.text, encoding=IAST, strict_io=False)
        words = text_obj.canonical().strip().split(" ")
        if len(words) != len(sent.dcsAnalysisDecomposition):
            continue
        for w, analysis in zip(words, sent.dcsAnalysisDecomposition):
            if len(analysis) != 1:
                continue
            word_analysis = analysis[0]
            if word_analysis.dcsGrammarHint == []:
                continue
            word_slp = SanskritObject(w, encoding=IAST,
                                      strict_io=False).canonical()
            tags = tag_mapper(word_analysis.dcsGrammarHint)
            root = SanskritObject(word_analysis.root,
                                  encoding=IAST,
                                  strict_io=False).canonical()
            i_valid = inria_metrics.update(word_slp, root, tags)
            s_valid = sdata_metrics.update(word_slp, root, tags)
            comb_metrics.update(word_slp, root, tags)
            if not i_valid or not s_valid:
                # Record the word along with flags for which lexicon(s)
                # failed to validate it.
                missing.append([
                    word_slp,
                    word_analysis.root,
                    word_analysis.dcsGrammarHint,
                    i_valid,
                    s_valid,
                    (not i_valid) and (not s_valid),  # absent from both
                    i_valid and (not s_valid),        # inria only
                    (not i_valid) and s_valid,        # sanskrit_data only
                ])

    stats['inria'] = inria_metrics.metrics()
    stats['sdata'] = sdata_metrics.metrics()
    stats['combo'] = comb_metrics.metrics()
    stats['missing'] = missing
    return stats
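
The IAST-to-SLP1 canonicalization used throughout process() can be checked in isolation; a minimal sketch, assuming the sanskrit_base import path used above:

from sanskrit_parser.base.sanskrit_base import SanskritObject, IAST

word = SanskritObject("dharmakṣetre", encoding=IAST, strict_io=False)
print(word.canonical())  # SLP1: "Darmakzetre"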