Example #1
def _parse(toklist):
    """ Parse a token list as a query """

    # Parse with the nonterminal 'QueryRoot' as the grammar root
    with Fast_Parser(verbose=False, root=_QUERY_ROOT) as bp:

        sent_begin = 0
        num_sent = 0
        num_parsed_sent = 0
        rdc = Reducer(bp.grammar)
        trees = dict()
        sent = []

        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if not slen:
                    continue
                num_sent += 1
                # Parse the accumulated sentence
                num = 0
                try:
                    # Parse the sentence
                    forest = bp.go(sent)
                    if forest is not None:
                        num = Fast_Parser.num_combinations(forest)
                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                except ParseError:
                    forest = None
                    num = 0  # Don't count an errored sentence as parsed
                if num > 0:
                    num_parsed_sent += 1
                    # Obtain a text representation of the parse tree
                    trees[num_sent] = ParseForestDumper.dump_forest(forest)
                    # ParseForestPrinter.print_forest(forest)

            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(num_sent=num_sent, num_parsed_sent=num_parsed_sent)
    return result, trees
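
A minimal usage sketch for _parse: the driver below, including the tokenizer import path and the sample query, is an assumption based on the surrounding Greynir/Reynir codebase, not part of the snippet.

# Hypothetical driver; the tokenizer module path and the sample
# query are assumptions, not confirmed by this snippet.
from tokenizer import tokenize

toklist = list(tokenize("Hver er forseti Íslands?"))
stats, trees = _parse(toklist)
print("Parsed {0} of {1} sentences".format(
    stats["num_parsed_sent"], stats["num_sent"]))
for sent_num, tree_text in trees.items():
    print("Sentence {0}:".format(sent_num))
    print(tree_text)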
Example #2
def parse(toklist, single, use_reducer, dump_forest = False, keep_trees = False):
    """ Parse the given token list and return a result dict """

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0

    # Accumulate parsed sentences in a text dump format
    trees = OrderedDict()

    with Fast_Parser(verbose = False) as bp: # Don't emit diagnostic messages

        version = bp.version
        rdc = Reducer(bp.grammar)

        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0 # Number of tree combinations in forest
                    score = 0 # Reducer score of the best parse tree

                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)

                            if single and dump_forest:
                                # Dump the parse tree to parse.txt
                                with open("parse.txt", mode = "w", encoding= "utf-8") as f:
                                    print("Reynir parse tree for sentence '{0}'".format(" ".join(sent)), file = f)
                                    print("{0} combinations\n".format(num), file = f)
                                    if num < 10000:
                                        ParseForestPrinter.print_forest(forest, file = f)
                                    else:
                                        print("Too many combinations to dump", file = f)

                        if use_reducer and num > 1:
                            # Reduce the resulting forest
                            forest, score = rdc.go_with_score(forest)
                            assert Fast_Parser.num_combinations(forest) == 1

                            if Settings.DEBUG:
                                print(ParseForestDumper.dump_forest(forest))

                            num = 1

                    except ParseError as e:
                        forest = None
                        num = 0 # Don't count an errored sentence as parsed
                        # Obtain the index of the offending token
                        err_index = e.token_index

                    if Settings.DEBUG:
                        print("Parsed sentence of length {0} with {1} combinations, score {2}{3}"
                            .format(slen, num, score,
                                "\n" + (" ".join(s[1] for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor': the mean number of
                        # parse tree choices per token, num ** (1 / slen)
                        ambig_factor = num ** (1 / slen)
                        # Accumulate an average weighted by sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                        if keep_trees:
                            # We want to keep the trees for further processing down the line:
                            # reduce and dump the best tree to text
                            if num > 1:
                                # Reduce the resulting forest before dumping it to text format
                                forest = rdc.go(forest)
                            trees[num_sent] = ParseForestDumper.dump_forest(forest)

                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(
        version = version,
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )

    # noinspection PyRedundantParentheses
    return (result, trees)
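
For orientation, avg_ambig_factor is a token-weighted mean of num ** (1 / slen), the per-token parse-choice factor of each parsed sentence: a 16-token sentence with 256 tree combinations contributes a factor of 256 ** (1 / 16) ≈ 1.414 at weight 16. Below is a usage sketch; the flag values and the input text are illustrative assumptions, not prescribed by the source.

# Hypothetical call site; tokenize() is assumed to come from the
# tokenizer module used elsewhere in the codebase.
toklist = list(tokenize("Hún sá sólina koma upp."))
result, trees = parse(
    toklist,
    single=True,        # single-sentence mode; may dump parse.txt
    use_reducer=True,   # reduce each forest to its best-scoring tree
    keep_trees=True     # keep dumped trees, keyed by sentence number
)
print("{0} of {1} sentences parsed, avg ambiguity {2:.3f}".format(
    result["num_parsed_sent"], result["num_sent"],
    result["avg_ambig_factor"]))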
Example #3
def parse_tokens(toklist, mim_tags, fast_p):
    """ Parse the given token list and return a result dict """

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    tag_ix = 0
    ntags = len(mim_tags)

    rdc = Reducer(fast_p.grammar)

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if slen:
                # Parse the accumulated sentence
                err_index = None
                num = 0 # Number of tree combinations in forest
                score = 0 # Reducer score of the best parse tree

                try:
                    # Progress indicator: sentence count
                    print("{}".format(num_sent), end="\r")
                    # Parse the sentence
                    forest = fast_p.go(sent)
                    if forest:
                        num = Fast_Parser.num_combinations(forest)

                    if num > 1:
                        # Reduce the resulting forest
                        forest = rdc.go(forest)

                except ParseError as e:
                    forest = None
                    num = 0
                    # Obtain the index of the offending token
                    err_index = e.token_index

                if num > 0:
                    num_parsed_sent += 1

                    # Extract the POS tags for the terminals in the forest
                    pos_tags = find_pos_tags(forest)

                    # Calculate the 'ambiguity factor'
                    ambig_factor = num ** (1 / slen)
                    # Accumulate an average weighted by sentence length
                    total_ambig += ambig_factor * slen
                    total_tokens += slen
                # Mark the sentence beginning with the number of parses
                # and the index of the offending token, if an error occurred
                toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
            # Check whether the token streams are in sync
            if tag_ix < ntags and t[1] != mim_tags[tag_ix][1]:
                #print("Warning: mismatch between MIM token '{0}' and Greynir token '{1}'".format(mim_tags[tag_ix][1], t[1]))
                # Attempt to sync again by finding the Greynir token in the MIM tag stream
                gap = 1
                MAX_LOOKAHEAD = 4
                while gap < MAX_LOOKAHEAD and (tag_ix + gap) < ntags and mim_tags[tag_ix + gap][1] != t[1]:
                    gap += 1
                if gap < MAX_LOOKAHEAD:
                    # Found the Greynir token ahead
                    #print("Re-synced by skipping ahead by {0} tokens".format(gap))
                    tag_ix += gap
            if tag_ix < ntags:
                tag_ix += 1

    return dict(
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
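
A sketch of how parse_tokens might be driven. The shape of mim_tags (entries carrying their token text at index 1) is inferred from the comparisons above; corpus_text and mim_tags themselves are assumed inputs.

# Hypothetical harness; names and input shapes are inferred, not documented.
with Fast_Parser(verbose=False) as fast_p:
    toklist = list(tokenize(corpus_text))  # corpus_text: assumed input
    stats = parse_tokens(toklist, mim_tags, fast_p)
print("{0}/{1} sentences parsed, avg ambiguity {2:.3f}".format(
    stats["num_parsed_sent"], stats["num_sent"],
    stats["avg_ambig_factor"]))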
Example #4
def parse_tokens(toklist, mim_tags, fast_p):
    """ Parse the given token list and return a result dict """

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    tag_ix = 0
    ntags = len(mim_tags)

    rdc = Reducer(fast_p.grammar)

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if slen:
                # Parse the accumulated sentence
                err_index = None
                num = 0 # Number of tree combinations in forest
                score = 0 # Reducer score of the best parse tree

                try:
                    # Progress indicator: sentence count
                    print("{}".format(num_sent), end="\r")
                    # Parse the sentence
                    forest = fast_p.go(sent)
                    if forest:
                        num = Fast_Parser.num_combinations(forest)

                    if num > 1:
                        # Reduce the resulting forest
                        forest = rdc.go(forest)

                except ParseError as e:
                    forest = None
                    num = 0 # Don't count an errored sentence as parsed
                    # Obtain the index of the offending token
                    err_index = e.token_index

                if num > 0:
                    num_parsed_sent += 1

                    # Extract the POS tags for the terminals in the forest
                    pos_tags = find_pos_tags(forest)

                    # Calculate the 'ambiguity factor'
                    ambig_factor = num ** (1 / slen)
                    # Accumulate an average weighted by sentence length
                    total_ambig += ambig_factor * slen
                    total_tokens += slen
                # Mark the sentence beginning with the number of parses
                # and the index of the offending token, if an error occurred
                toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
            # Check whether the token streams are in sync
            if tag_ix < ntags and t[1] != mim_tags[tag_ix][1]:
                print("Warning: mismatch between MIM token '{0}' and Reynir token '{1}'".format(mim_tags[tag_ix][1], t[1]))
                # Attempt to sync again by finding the Reynir token in the MIM tag stream
                gap = 1
                MAX_LOOKAHEAD = 3
                while gap < MAX_LOOKAHEAD and (tag_ix + gap) < ntags and mim_tags[tag_ix + gap][1] != t[1]:
                    gap += 1
                if gap < MAX_LOOKAHEAD:
                    # Found the Reynir token ahead
                    print("Re-synced by skipping ahead by {0} tokens".format(gap))
                    tag_ix += gap
            if tag_ix < ntags:
                tag_ix += 1

    return dict(
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
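
The re-sync loop above scans at most MAX_LOOKAHEAD - 1 positions ahead in the MIM stream for a matching token text. The same windowed alignment as a standalone helper, a hypothetical refactoring rather than code from the source; note the added bounds check, since the original loop can also leave gap < MAX_LOOKAHEAD simply by running off the end of the tag stream.

def resync(mim_tags, tag_ix, token_text, max_lookahead=3):
    """ Return a tag index re-aligned to token_text, or tag_ix unchanged """
    # Token tuples are assumed to carry their text at index 1,
    # as in the snippets above
    gap = 1
    while (gap < max_lookahead and (tag_ix + gap) < len(mim_tags)
           and mim_tags[tag_ix + gap][1] != token_text):
        gap += 1
    if gap < max_lookahead and (tag_ix + gap) < len(mim_tags):
        # Found the token within the lookahead window
        return tag_ix + gap
    # No match within the window: stay put
    return tag_ix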
Example #5
def analyze():
    """ Find word categories in the submitted text """

    txt = request.form.get("txt", "").strip()

    # Tokenize the text entered as-is and return the token list
    toklist = list(tokenize(txt))

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0

    with Fast_Parser(verbose=False) as bp:  # Don't emit diagnostic messages

        rdc = Reducer(bp.grammar)

        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0  # Number of tree combinations in forest
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)

                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                            assert Fast_Parser.num_combinations(forest) == 1

                        # Mark the token list with the identified word categories
                        mark_categories(forest, toklist, sent_begin + 1)

                    except ParseError as e:
                        num = 0  # Don't count an errored sentence as parsed
                        # Obtain the index of the offending token
                        err_index = e.token_index
                    print(
                        "Parsed sentence of length {0} with {1} combinations{2}"
                        .format(
                            slen, num,
                            "\n" + (" ".join(s[1] for s in sent)
                                    if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num ** (1 / slen)
                        # Accumulate an average weighted by sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(
                        num_parses=num, err_index=err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(tokens=toklist,
                  tok_num=len(toklist),
                  num_sent=num_sent,
                  num_parsed_sent=num_parsed_sent,
                  avg_ambig_factor=(total_ambig /
                                    total_tokens) if total_tokens > 0 else 1.0)

    # Return the tokens as a JSON structure to the client
    return jsonify(result=result)
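
analyze() reads "txt" from a POSTed form and returns the annotated token list as JSON. A sketch of the Flask wiring and a client call; the route path and port are assumptions.

# Hypothetical route registration; the URL path is an assumption.
from flask import Flask

app = Flask(__name__)
app.add_url_rule("/analyze", "analyze", analyze, methods=["POST"])

# Client side, e.g. with the requests package:
#   resp = requests.post("http://localhost:5000/analyze",
#                        data={"txt": "Hún sá sólina koma upp."})
#   print(resp.json()["result"]["num_parsed_sent"])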
Example #6
def analyze():
    """ Find word categories in the submitted text """

    txt = request.form.get("txt", "").strip()

    # Tokenize the text entered as-is and return the token list
    toklist = list(tokenize(txt))

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0

    with Fast_Parser(verbose = False) as bp: # Don't emit diagnostic messages

        rdc = Reducer(bp.grammar)

        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0 # Number of tree combinations in forest
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)

                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                            assert Fast_Parser.num_combinations(forest) == 1

                        # Mark the token list with the identified word categories
                        mark_categories(forest, toklist, sent_begin + 1)

                    except ParseError as e:
                        num = 0 # Don't count an errored sentence as parsed
                        # Obtain the index of the offending token
                        err_index = e.token_index
                    print("Parsed sentence of length {0} with {1} combinations{2}".format(slen, num,
                        "\n" + (" ".join(s[1] for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num ** (1 / slen)
                        # Accumulate an average weighted by sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )

    # Return the tokens as a JSON structure to the client
    return jsonify(result = result)