Example #1
    def _parse(toklist):
        """ Parse a token list as a query """

        # Parse with the nonterminal 'QueryRoot' as the grammar root
        with Fast_Parser(verbose=False, root=_QUERY_ROOT) as bp:

            sent_begin = 0
            num_sent = 0
            num_parsed_sent = 0
            rdc = Reducer(bp.grammar)
            trees = dict()
            sent = []

            for ix, t in enumerate(toklist):
                if t[0] == TOK.S_BEGIN:
                    sent = []
                    sent_begin = ix
                elif t[0] == TOK.S_END:
                    slen = len(sent)
                    if not slen:
                        continue
                    num_sent += 1
                    # Parse the accumulated sentence
                    num = 0
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest is not None:
                            num = Fast_Parser.num_combinations(forest)
                            if num > 1:
                                # Reduce the resulting forest
                                forest = rdc.go(forest)
                    except ParseError:
                        forest = None
                    if num > 0:
                        num_parsed_sent += 1
                        # Obtain a text representation of the parse tree
                        trees[num_sent] = ParseForestDumper.dump_forest(forest)
                        #ParseForestPrinter.print_forest(forest)

                elif t[0] == TOK.P_BEGIN:
                    pass
                elif t[0] == TOK.P_END:
                    pass
                else:
                    sent.append(t)

        result = dict(num_sent=num_sent, num_parsed_sent=num_parsed_sent)
        return result, trees
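A minimal sketch of how _parse() might be driven, assuming the tokenize() function and the module context used in these examples (TOK, Fast_Parser, _QUERY_ROOT) are available; the query text is purely illustrative:

# Hypothetical driver for _parse()
toklist = list(tokenize("Hver er forseti Íslands?"))
result, trees = _parse(toklist)
print(result["num_sent"], result["num_parsed_sent"])
for sent_index, tree_text in trees.items():
    # trees maps sentence numbers to dumped parse trees
    print(sent_index, tree_text)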
Example #2
def parse(self):
    """ Parse the sentence """
    num = 0
    try:
        forest = self._ip._parser.go(self._s)
        if forest is not None:
            num = Fast_Parser.num_combinations(forest)
            if num > 1:
                forest = self._ip._reducer.go(forest)
    except ParseError as e:
        forest = None
        self._err_index = e.token_index
    self._tree = forest
    self._ip._add_sentence(self, num)
    return num > 0
Example #3
    def tag_text(session, text):
        """ Parse plain text and return the parsed paragraphs as lists of sentences
            where each sentence is a list of tagged tokens """

        t0 = time.time()
        # Demarcate paragraphs in the input
        text = Fetcher.mark_paragraphs(text)
        # Tokenize the result
        toklist = list(tokenize(text, enclosing_session=session))
        # Paragraph list, containing sentences, containing tokens
        pgs = []
        t1 = time.time()

        with Fast_Parser(verbose=False) as bp:  # Don't emit diagnostic messages

            ip = IncrementalParser(bp, toklist, verbose=True)

            for p in ip.paragraphs():
                pgs.append([])
                for sent in p.sentences():
                    if sent.parse():
                        # Parsed successfully
                        pgs[-1].append(
                            Article._dump_tokens(sent.tokens, sent.tree, None))
                    else:
                        # Error in parse
                        pgs[-1].append(
                            Article._dump_tokens(sent.tokens, None, None,
                                                 sent.err_index))

            t2 = time.time()
            stats = dict(num_tokens=ip.num_tokens,
                         num_sentences=ip.num_sentences,
                         num_parsed=ip.num_parsed,
                         ambiguity=ip.ambiguity,
                         tok_time=t1 - t0,
                         parse_time=t2 - t1,
                         total_time=t2 - t0)

        # Add a name register to the result
        register = create_name_register(toklist, session)

        return (pgs, stats, register)
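A hedged sketch of a caller for tag_text(), assuming it can be invoked directly (in the source project it may be a static method of a class) and that SessionContext provides the database session it expects:

# Hypothetical consumer; the input text is illustrative
with SessionContext(commit=True) as session:
    pgs, stats, register = tag_text(session, "Halldór Laxness skrifaði bókina.")
print("Parsed {0} of {1} sentences in {2:.2f} seconds".format(
    stats["num_parsed"], stats["num_sentences"], stats["total_time"]))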
Example #4
def analyze():
    """ Analyze text manually entered by the user, i.e. not coming from an article """

    text = request.form.get("text", "").strip()[0:_MAX_TEXT_LENGTH]

    with SessionContext(commit=True) as session:

        # Demarcate paragraphs in the input
        text = Fetcher.mark_paragraphs(text)
        # Tokenize the result
        toklist = list(tokenize(text, enclosing_session=session))
        # Paragraph list, containing sentences, containing tokens
        pgs = []

        with Fast_Parser(verbose=False) as bp:  # Don't emit diagnostic messages

            ip = IncrementalParser(bp, toklist, verbose=True)

            for p in ip.paragraphs():
                pgs.append([])
                for sent in p.sentences():
                    if sent.parse():
                        # Parsed successfully
                        pgs[-1].append(
                            ArticleProxy._dump_tokens(sent.tokens, sent.tree,
                                                      None))
                    else:
                        # Error in parse
                        pgs[-1].append(
                            ArticleProxy._dump_tokens(sent.tokens, None, None,
                                                      sent.err_index))

            stats = dict(num_tokens=ip.num_tokens,
                         num_sentences=ip.num_sentences,
                         num_parsed=ip.num_parsed,
                         ambiguity=ip.ambiguity)
            # Add a name register to the result
            register = create_name_register(toklist, session)

    # Return the tokens as a JSON structure to the client
    return jsonify(result=pgs, stats=stats, register=register)
Example #5
    pr.dump_stats(filename)
    return result


def speed_test(uuid):
    try:
        print("Starting speed test")
        t0 = time.time()
        with SessionContext(commit=True) as session:
            # Load the article
            a = Article.load_from_uuid(uuid, session)
            if a is not None:
                # Parse it and store the updated version
                a.parse(session, verbose=True)
        t1 = time.time()
        print("Parsing finished in {0:.2f} seconds".format(t1 - t0))
    finally:
        Article.cleanup()


print("Welcome to speedtest")

Settings.read(os.path.join(basepath, "config/Greynir.conf"))
with Fast_Parser() as fp:
    pass

#speed_test("dbc585e4-736c-11e6-a2bb-04014c605401")
profile(speed_test, "dbc585e4-736c-11e6-a2bb-04014c605401")

print("speedtest done")
Example #6
def _init_class(cls):
    """ Initialize class attributes """
    if cls._parser is None:
        cls._parser = Fast_Parser(verbose=False)  # Don't emit diagnostic messages
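The check above caches a single Fast_Parser instance as a class attribute, so the expensive grammar load happens only once per process. A standalone illustration of the same lazy-initialization pattern (not code from the source project):

class ParserHolder:
    """ Minimal sketch of the lazy class-attribute pattern used above """
    _parser = None

    @classmethod
    def get_parser(cls):
        # Construct the parser only on first use, then reuse it
        if cls._parser is None:
            cls._parser = Fast_Parser(verbose=False)
        return cls._parser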
Example #7
def parse_grid():
    """ Show the parse grid for a particular parse tree of a sentence """

    MAX_LEVEL = 32 # Maximum level of option depth we can handle
    txt = request.form.get('txt', "")
    parse_path = request.form.get('option', "")
    use_reducer = "noreduce" not in request.form

    # Tokenize the text
    tokens = list(tokenize(txt))

    # Parse the text
    with Fast_Parser(verbose = False) as bp: # Don't emit diagnostic messages
        err = dict()
        grammar = bp.grammar
        try:
            forest = bp.go(tokens)
        except ParseError as e:
            err["msg"] = str(e)
            # Relay information about the parser state at the time of the error
            err["info"] = None # e.info
            forest = None

    # Find the number of parse combinations
    combinations = 0 if forest is None else Fast_Parser.num_combinations(forest)
    score = 0

    if Settings.DEBUG:
        # Dump the parse tree to parse.txt
        with open("parse.txt", mode = "w", encoding= "utf-8") as f:
            if forest is not None:
                print("Reynir parse tree for sentence '{0}'".format(txt), file = f)
                print("{0} combinations\n".format(combinations), file = f)
                if combinations < 10000:
                    ParseForestPrinter.print_forest(forest, file = f)
                else:
                    print("Too many combinations to dump", file = f)
            else:
                print("No parse available for sentence '{0}'".format(txt), file = f)

    if forest is not None and use_reducer:
        # Reduce the parse forest
        forest, score = Reducer(grammar).go_with_score(forest)
        if Settings.DEBUG:
            print(ParseForestDumper.dump_forest(forest))

    # Make the parse grid with all options
    grid, ncols = make_grid(forest) if forest else ([], 0)
    # The grid is columnar; convert it to row-major
    # form for convenient translation into HTML
    # There will be as many columns as there are tokens
    nrows = len(grid)
    tbl = [ [] for _ in range(nrows) ]
    # Info about previous row spans
    rs = [ [] for _ in range(nrows) ]

    # The particular option path we are displaying
    if not parse_path:
        # Not specified: display the all-zero path
        path = [(0,) * i for i in range(1, MAX_LEVEL)]
    else:
        # Disassemble the passed-in path

        def toint(s):
            """ Safe conversion of string to int """
            try:
                n = int(s)
            except ValueError:
                n = 0
            return n if n >= 0 else 0

        p = [ toint(s) for s in parse_path.split("_") ]
        path = [tuple(p[0 : i + 1]) for i in range(len(p))]

    # This set will contain all option path choices
    choices = set()
    NULL_TUPLE = tuple()

    for gix, gcol in enumerate(grid):
        # gcol is a dictionary of options
        # Accumulate the options that we want to display
        # according to chosen path
        cols = gcol[NULL_TUPLE] if NULL_TUPLE in gcol else [] # Default content
        # Add the options we're displaying
        for p in path:
            if p in gcol:
                cols.extend(gcol[p])
        # Accumulate all possible path choices
        choices |= gcol.keys()
        # Sort the columns that will be displayed
        cols.sort(key = lambda x: x[0])
        col = 0
        for startcol, endcol, info in cols:
            assert isinstance(info, Nonterminal) or isinstance(info, tuple)
            if col < startcol:
                gap = startcol - col
                gap -= sum(1 for c in rs[gix] if c < startcol)
                if gap > 0:
                    tbl[gix].append((gap, 1, "", ""))
            rowspan = 1
            if isinstance(info, tuple):
                cls = { "terminal" }
                rowspan = nrows - gix
                for i in range(gix + 1, nrows):
                    # Note the rowspan's effect on subsequent rows
                    rs[i].append(startcol)
            else:
                cls = { "nonterminal" }
                # Get the 'pure' name of the nonterminal in question
                assert isinstance(info, Nonterminal)
                info = info.name
            if endcol - startcol == 1:
                cls |= { "vertical" }
            tbl[gix].append((endcol-startcol, rowspan, info, cls))
            col = endcol
        ncols_adj = ncols - len(rs[gix])
        if col < ncols_adj:
            tbl[gix].append((ncols_adj - col, 1, "", ""))
    # Calculate the unique path choices available for this parse grid
    choices -= { NULL_TUPLE } # Default choice: don't need it in the set
    unique_choices = choices.copy()
    for c in choices:
        # Remove all shorter prefixes of c from the unique_choices set
        unique_choices -= { c[0:i] for i in range(1, len(c)) }
    # Create a nice string representation of the unique path choices
    uc_list = [ "_".join(str(c) for c in choice) for choice in unique_choices ]
    if not parse_path:
        # We are displaying the longest possible all-zero choice: find it
        i = 0
        while (0,) * (i + 1) in unique_choices:
            i += 1
        parse_path = "_".join(["0"] * i)

    #debug()

    return render_template("parsegrid.html", txt = txt, err = err, tbl = tbl,
        combinations = combinations, score = score,
        choice_list = uc_list, parse_path = parse_path)
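The option-path handling above turns a string such as "0_1_0" into the list of all its tuple prefixes; each prefix then selects which options to display at the corresponding depth of the grid. The decomposition step in isolation:

# Standalone illustration of the path decomposition used above
p = [int(s) for s in "0_1_0".split("_")]
path = [tuple(p[0:i + 1]) for i in range(len(p))]
# path == [(0,), (0, 1), (0, 1, 0)]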
Example #8
def parse(toklist, single, use_reducer, dump_forest = False, keep_trees = False):
    """ Parse the given token list and return a result dict """

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0

    # Accumulate parsed sentences in a text dump format
    trees = OrderedDict()

    with Fast_Parser(verbose = False) as bp: # Don't emit diagnostic messages

        version = bp.version
        rdc = Reducer(bp.grammar)

        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0 # Number of tree combinations in forest
                    score = 0 # Reducer score of the best parse tree

                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)

                            if single and dump_forest:
                                # Dump the parse tree to parse.txt
                                with open("parse.txt", mode = "w", encoding= "utf-8") as f:
                                    print("Reynir parse tree for sentence '{0}'".format(" ".join(sent)), file = f)
                                    print("{0} combinations\n".format(num), file = f)
                                    if num < 10000:
                                        ParseForestPrinter.print_forest(forest, file = f)
                                    else:
                                        print("Too many combinations to dump", file = f)

                        if use_reducer and num > 1:
                            # Reduce the resulting forest
                            forest, score = rdc.go_with_score(forest)
                            assert Fast_Parser.num_combinations(forest) == 1

                            if Settings.DEBUG:
                                print(ParseForestDumper.dump_forest(forest))

                            num = 1

                    except ParseError as e:
                        forest = None
                        # Obtain the index of the offending token
                        err_index = e.token_index

                    if Settings.DEBUG:
                        print("Parsed sentence of length {0} with {1} combinations, score {2}{3}"
                            .format(slen, num, score,
                                "\n" + (" ".join(s[1] for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num ** (1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                        if keep_trees:
                            # We want to keep the trees for further processing down the line:
                            # reduce and dump the best tree to text
                            if num > 1:
                                # Reduce the resulting forest before dumping it to text format
                                forest = rdc.go(forest)
                            trees[num_sent] = ParseForestDumper.dump_forest(forest)

                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(
        version = version,
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )

    # noinspection PyRedundantParentheses
    return (result, trees)
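The 'ambiguity factor' num ** (1 / slen) is the per-token geometric mean of the number of tree combinations, and avg_ambig_factor averages it over sentences, weighted by sentence length. A worked example of that bookkeeping:

# Sentence A: 8 tokens, 256 combinations -> 256 ** (1/8) == 2.0
# Sentence B: 4 tokens, 1 combination    -> 1 ** (1/4) == 1.0
total_ambig = 2.0 * 8 + 1.0 * 4  # 20.0, weighted by sentence length
total_tokens = 8 + 4             # 12
avg_ambig_factor = total_ambig / total_tokens  # 20.0 / 12, approx. 1.67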
Example #9
def test3():

    print("\n\n------ Test 3 ---------")

    # p = BIN_Parser(verbose = False) # Don't emit diagnostic messages

    with Fast_Parser(verbose=False) as fast_p:

        g = fast_p.grammar

        print(
            "Greynir.grammar has {0} nonterminals, {1} terminals, {2} productions"
            .format(g.num_nonterminals, g.num_terminals, g.num_productions))

        # g.follow_set(g.root)
        # return

        # Dump the grammar
        # print("\n" + str(g))

        def create_sentence_table():
            """ Only used to create a test fresh sentence table if one doesn't exist """
            with closing(Test_DB.open_db()) as db:

                try:
                    db.create_sentence_table()

                    TEXTS = [
                        "Páll fór út með stóran kött og Jón keypti heitan graut.",
                        "Unga fallega konan frá Garðabæ elti ljóta og feita karlinn rösklega og fumlaust í svörtu myrkrinu",
                        "Kötturinn sem strákurinn átti veiddi feitu músina",
                        "Gamla bláa kommóðan var máluð fjólublá með olíumálningu",
                        "Landsframleiðslan hefur aukist frá því í fyrra",
                        "Guðmundur og Guðrún kusu Framsóknarflokkinn",
                        "Þú skalt fara til Danmerkur.",
                        "Ég og þú fórum til Frakklands í utanlandsferð",
                        "Stóru bláu könnunni mun hafa verið fleygt í ruslið",
                        "Már Guðmundsson segir margskonar misskilnings gæta hjá Hannesi Hólmsteini",
                        "Már Guðmundsson seðlabankastjóri Íslands segir þetta við Morgunblaðið í dag.",
                        "Það er náttúrlega einungis í samfélögum sem eiga við býsna stór vandamál að stríða " + \
                            "að ný stjórnmálaöfl geta snögglega sveiflast upp í þriðjungs fylgi.",
                        "Áætlaður kostnaður verkefnisins var tíu milljónir króna og áætluð verklok eru í byrjun september næstkomandi.",
                        "Pakkinn snerist um að ábyrgjast innlán og skuldabréfaútgáfu danskra fjármálafyrirtækja.",
                        "Kynningarfundurinn sem ég hélt í dag fjallaði um lausnina á þessum vanda.",
                        "Kynningarfundurinn sem haldinn var í dag fjallaði um lausnina á þessum vanda.",
                        "Það sakamál sé til meðferðar við Héraðsdóm Suðurlands."
                    ]

                    for t in TEXTS:
                        db.add_sentence(t)

                    slist = db.sentences()
                    for s in slist:
                        print("{0}".format(s))

                except Exception as e:
                    print("{0}".format(e))

        for test in run_test(fast_p):

            print("\n'{0}'\n{1} parse trees found in {2:.3f} seconds\n".format(
                test["sentence"], test["numtrees"], test["parse_time"]))

            if test["numtrees"] > 0:
                # ParseForestPrinter.print_forest(test["forest"])
                # print("{0}".format(Parser.make_schema(test["forest"])))
                pass
            elif test["err"]:
                print("Error: {0}".format(test["err"]))
Example #10
def run_test(fast_p):
    """ Run a test parse on all sentences in the test table """

    with closing(Test_DB.open_db()) as db:

        slist = db.sentences()

        for s in slist:

            txt = s["sentence"]
            target = s["target"]  # The ideal number of parse trees (1 or 0)

            tokens = tokenize(txt)

            tlist = list(tokens)
            err = ""

            # Run the all-Python parser
            #try:
            #    t0 = time.time()
            #    forest = p.go(tlist)
            #except ParseError as e:
            #    err = "{0}".format(e)
            #    forest = None
            #finally:
            #    t1 = time.time()

            # ParseForestPrinter.print_forest(p.grammar, forest, detailed = True)

            # Run the C++ parser
            try:
                tf0 = time.time()
                forest2 = fast_p.go(tlist)
            except ParseError as e:
                err = "{0}".format(e)
                forest2 = None
            finally:
                tf1 = time.time()

            # num = 0 if forest is None else Parser.num_combinations(forest)
            num2 = 0 if forest2 is None else Fast_Parser.num_combinations(
                forest2)

            if Settings.DEBUG:
                #print("Python: Parsed in {0:.4f} seconds, {1} combinations".format(t1 - t0, num))
                print("C++:    Parsed in {0:.4f} seconds, {1} combinations".
                      format(tf1 - tf0, num2))

            best = s["best"]
            if best <= 0 or abs(target - num2) < abs(target - best):
                # We are closer to the ideal number of parse trees (target) than
                # the best parse so far: change the best one
                best = num2

            db.update_sentence(s["identity"], s["sentence"], num2, best,
                               target)

            yield dict(
                identity=s["identity"],
                sentence=txt,
                numtrees=num2,
                best=best,
                target=target,
                parse_time=tf1 - tf0,
                err="" if target == 0 else
                err,  # Don't bother showing errors that are expected
                forest=forest2)
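The best-so-far update keeps whichever combination count lies closest to the target (typically 1 for sentences that should parse, 0 for sentences that should not). The rule in isolation:

def update_best(best, target, num2):
    """ Same closest-to-target rule as in run_test() above """
    if best <= 0 or abs(target - num2) < abs(target - best):
        best = num2
    return best

assert update_best(best=5, target=1, num2=3) == 3  # 3 is closer to the target
assert update_best(best=3, target=1, num2=7) == 3  # 7 is farther away; keep 3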
Example #11
def parse_tokens(toklist, mim_tags, fast_p):
    """ Parse the given token list and return a result dict """

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    tag_ix = 0
    ntags = len(mim_tags)

    rdc = Reducer(fast_p.grammar)

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if slen:
                # Parse the accumulated sentence
                err_index = None
                num = 0 # Number of tree combinations in forest
                score = 0 # Reducer score of the best parse tree

                try:
                    # Progress indicator: sentence count
                    print("{}".format(num_sent), end="\r")
                    # Parse the sentence
                    forest = fast_p.go(sent)
                    if forest:
                        num = Fast_Parser.num_combinations(forest)

                    if num > 1:
                        # Reduce the resulting forest
                        forest = rdc.go(forest)

                except ParseError as e:
                    forest = None
                    num = 0
                    # Obtain the index of the offending token
                    err_index = e.token_index

                if num > 0:
                    num_parsed_sent += 1

                    # Extract the POS tags for the terminals in the forest
                    pos_tags = find_pos_tags(forest)

                    # Calculate the 'ambiguity factor'
                    ambig_factor = num ** (1 / slen)
                    # Do a weighted average on sentence length
                    total_ambig += ambig_factor * slen
                    total_tokens += slen
                # Mark the sentence beginning with the number of parses
                # and the index of the offending token, if an error occurred
                toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
            # Check whether the token streams are in sync
            if tag_ix < ntags and t[1] != mim_tags[tag_ix][1]:
                #print("Warning: mismatch between MIM token '{0}' and Greynir token '{1}'".format(mim_tags[tag_ix][1], t[1]))
                # Attempt to sync again by finding the Greynir token in the MIM tag stream
                gap = 1
                MAX_LOOKAHEAD = 4
                while gap < MAX_LOOKAHEAD and (tag_ix + gap) < ntags and mim_tags[tag_ix + gap][1] != t[1]:
                    gap += 1
                if gap < MAX_LOOKAHEAD:
                    # Found the Greynir token ahead
                    #print("Re-synced by skipping ahead by {0} tokens".format(gap))
                    tag_ix += gap
            if tag_ix < ntags:
                tag_ix += 1

    return dict(
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
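The resynchronization logic above scans up to MAX_LOOKAHEAD positions ahead in the MIM tag stream for the current token text. The same loop on plain lists of token strings, as a self-contained sketch:

def resync(tag_ix, mim_texts, tok_text, max_lookahead=4):
    """ Mirror of the lookahead loop above; returns the adjusted index """
    gap = 1
    while (gap < max_lookahead and tag_ix + gap < len(mim_texts)
           and mim_texts[tag_ix + gap] != tok_text):
        gap += 1
    if gap < max_lookahead:
        return tag_ix + gap  # Found the token ahead: skip to it
    return tag_ix  # Not found within the window: stay put

assert resync(0, ["a", "x", "b", "c"], "b") == 2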
Example #12
        dpath = join(rootpath, d)
        if isdir(dpath):
            parse_directory(dpath, fast_p)


if __name__ == "__main__":

    # Initialize the parsing module

    try:
        # Read configuration file
        Settings.read("config/Greynir.conf")
    except ConfigError as e:
        print("Configuration error: {0}".format(e))
        quit()

    print("Running Greynir with debug={0}, host={1}, db_hostname={2}"
        .format(Settings.DEBUG, Settings.HOST, Settings.DB_HOSTNAME))

    with Fast_Parser(verbose = False) as fast_p:

        g = fast_p.grammar

        print("Greynir.grammar has {0} nonterminals, {1} terminals, {2} productions"
            .format(g.num_nonterminals, g.num_terminals, g.num_productions))

        # Attempt to parse all XML files in subdirectories within mim/

        parse_directories("mim", fast_p)

Example #13
def parse_tokens(toklist, mim_tags, fast_p):
    """ Parse the given token list and return a result dict """

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    tag_ix = 0
    ntags = len(mim_tags)

    rdc = Reducer(fast_p.grammar)

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if slen:
                # Parse the accumulated sentence
                err_index = None
                num = 0 # Number of tree combinations in forest
                score = 0 # Reducer score of the best parse tree

                try:
                    # Progress indicator: sentence count
                    print("{}".format(num_sent), end="\r")
                    # Parse the sentence
                    forest = fast_p.go(sent)
                    if forest:
                        num = Fast_Parser.num_combinations(forest)

                    if num > 1:
                        # Reduce the resulting forest
                        forest = rdc.go(forest)

                except ParseError as e:
                    forest = None
                    # Obtain the index of the offending token
                    err_index = e.token_index

                if num > 0:
                    num_parsed_sent += 1

                    # Extract the POS tags for the terminals in the forest
                    pos_tags = find_pos_tags(forest)

                    # Calculate the 'ambiguity factor'
                    ambig_factor = num ** (1 / slen)
                    # Do a weighted average on sentence length
                    total_ambig += ambig_factor * slen
                    total_tokens += slen
                # Mark the sentence beginning with the number of parses
                # and the index of the offending token, if an error occurred
                toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
            # Check whether the token streams are in sync
            if tag_ix < ntags and t[1] != mim_tags[tag_ix][1]:
                print("Warning: mismatch between MIM token '{0}' and Reynir token '{1}'".format(mim_tags[tag_ix][1], t[1]))
                # Attempt to sync again by finding the Reynir token in the MIM tag stream
                gap = 1
                MAX_LOOKAHEAD = 3
                while gap < MAX_LOOKAHEAD and (tag_ix + gap) < ntags and mim_tags[tag_ix + gap][1] != t[1]:
                    gap += 1
                if gap < MAX_LOOKAHEAD:
                    # Found the Reynir token ahead
                    print("Re-synced by skipping ahead by {0} tokens".format(gap))
                    tag_ix += gap
            if tag_ix < ntags:
                tag_ix += 1

    return dict(
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
Example #14
def analyze():
    """ Find word categories in the submitted text """

    txt = request.form.get("txt", "").strip()

    # Tokenize the text entered as-is and return the token list
    toklist = list(tokenize(txt))

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0

    with Fast_Parser(verbose=False) as bp:  # Don't emit diagnostic messages

        rdc = Reducer(bp.grammar)

        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0  # Number of tree combinations in forest
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)

                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                            assert Fast_Parser.num_combinations(forest) == 1

                        # Mark the token list with the identified word categories
                        mark_categories(forest, toklist, sent_begin + 1)

                    except ParseError as e:
                        # Obtain the index of the offending token
                        err_index = e.token_index
                    print(
                        "Parsed sentence of length {0} with {1} combinations{2}"
                        .format(
                            slen, num, "\n" +
                            (" ".join(s[1]
                                      for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num**(1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(
                        num_parses=num, err_index=err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(tokens=toklist,
                  tok_num=len(toklist),
                  num_sent=num_sent,
                  num_parsed_sent=num_parsed_sent,
                  avg_ambig_factor=(total_ambig /
                                    total_tokens) if total_tokens > 0 else 1.0)

    # Return the tokens as a JSON structure to the client
    return jsonify(result=result)
Example #15
def parse_grid():
    """ Show the parse grid for a particular parse tree of a sentence """

    MAX_LEVEL = 32  # Maximum level of option depth we can handle
    txt = request.form.get('txt', "")
    parse_path = request.form.get('option', "")
    debug_mode = get_json_bool(request, 'debug')
    use_reducer = "noreduce" not in request.form

    # Tokenize the text
    tokens = list(tokenize(txt))

    # Parse the text
    with Fast_Parser(verbose=False) as bp:  # Don't emit diagnostic messages
        err = dict()
        grammar = bp.grammar
        try:
            forest = bp.go(tokens)
        except ParseError as e:
            err["msg"] = str(e)
            # Relay information about the parser state at the time of the error
            err["info"] = None  # e.info
            forest = None

    # Find the number of parse combinations
    combinations = 0 if forest is None else Fast_Parser.num_combinations(
        forest)
    score = 0

    if Settings.DEBUG:
        # Dump the parse tree to parse.txt
        with open("parse.txt", mode="w", encoding="utf-8") as f:
            if forest is not None:
                print("Reynir parse forest for sentence '{0}'".format(txt),
                      file=f)
                print("{0} combinations\n".format(combinations), file=f)
                if combinations < 10000:
                    ParseForestPrinter.print_forest(forest, file=f)
                else:
                    print("Too many combinations to dump", file=f)
            else:
                print("No parse available for sentence '{0}'".format(txt),
                      file=f)

    if forest is not None and use_reducer:
        # Reduce the parse forest
        forest, score = Reducer(grammar).go_with_score(forest)
        if Settings.DEBUG:
            # Dump the reduced tree along with node scores
            with open("reduce.txt", mode="w", encoding="utf-8") as f:
                print("Reynir parse tree for sentence '{0}' after reduction".
                      format(txt),
                      file=f)
                ParseForestPrinter.print_forest(forest, file=f)

    # Make the parse grid with all options
    grid, ncols = make_grid(forest) if forest else ([], 0)
    # The grid is columnar; convert it to row-major
    # form for convenient translation into HTML
    # There will be as many columns as there are tokens
    nrows = len(grid)
    tbl = [[] for _ in range(nrows)]
    # Info about previous row spans
    rs = [[] for _ in range(nrows)]

    # The particular option path we are displaying
    if not parse_path:
        # Not specified: display the all-zero path
        path = [(0, ) * i for i in range(1, MAX_LEVEL)]
    else:
        # Disassemble the passed-in path

        def toint(s):
            """ Safe conversion of string to int """
            try:
                n = int(s)
            except ValueError:
                n = 0
            return n if n >= 0 else 0

        p = [toint(s) for s in parse_path.split("_")]
        path = [tuple(p[0:i + 1]) for i in range(len(p))]

    # This set will contain all option path choices
    choices = set()
    NULL_TUPLE = tuple()

    for gix, gcol in enumerate(grid):
        # gcol is a dictionary of options
        # Accumulate the options that we want to display
        # according to chosen path
        cols = gcol[NULL_TUPLE] if NULL_TUPLE in gcol else []  # Default content
        # Add the options we're displaying
        for p in path:
            if p in gcol:
                cols.extend(gcol[p])
        # Accumulate all possible path choices
        choices |= gcol.keys()
        # Sort the columns that will be displayed
        cols.sort(key=lambda x: x[0])
        col = 0
        for startcol, endcol, info in cols:
            #assert isinstance(info, Nonterminal) or isinstance(info, tuple)
            if col < startcol:
                gap = startcol - col
                gap -= sum(1 for c in rs[gix] if c < startcol)
                if gap > 0:
                    tbl[gix].append((gap, 1, "", ""))
            rowspan = 1
            if isinstance(info, tuple):
                cls = {"terminal"}
                rowspan = nrows - gix
                for i in range(gix + 1, nrows):
                    # Note the rowspan's effect on subsequent rows
                    rs[i].append(startcol)
            else:
                cls = {"nonterminal"}
                # Get the 'pure' name of the nonterminal in question
                #assert isinstance(info, Nonterminal)
                info = info.name
            if endcol - startcol == 1:
                cls |= {"vertical"}
            tbl[gix].append((endcol - startcol, rowspan, info, cls))
            col = endcol
        ncols_adj = ncols - len(rs[gix])
        if col < ncols_adj:
            tbl[gix].append((ncols_adj - col, 1, "", ""))
    # Calculate the unique path choices available for this parse grid
    choices -= {NULL_TUPLE}  # Default choice: don't need it in the set
    unique_choices = choices.copy()
    for c in choices:
        # Remove all shorter prefixes of c from the unique_choices set
        unique_choices -= {c[0:i] for i in range(1, len(c))}
    # Create a nice string representation of the unique path choices
    uc_list = ["_".join(str(c) for c in choice) for choice in unique_choices]
    if not parse_path:
        # We are displaying the longest possible all-zero choice: find it
        i = 0
        while (0, ) * (i + 1) in unique_choices:
            i += 1
        parse_path = "_".join(["0"] * i)

    return render_template("parsegrid.html",
                           txt=txt,
                           err=err,
                           tbl=tbl,
                           combinations=combinations,
                           score=score,
                           debug_mode=debug_mode,
                           choice_list=uc_list,
                           parse_path=parse_path)
Example #16
def analyze():
    """ Find word categories in the submitted text """

    txt = request.form.get("txt", "").strip()

    # Tokenize the text entered as-is and return the token list
    toklist = list(tokenize(txt))

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0

    with Fast_Parser(verbose = False) as bp: # Don't emit diagnostic messages

        rdc = Reducer(bp.grammar)

        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0 # Number of tree combinations in forest
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)

                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                            assert Fast_Parser.num_combinations(forest) == 1

                        # Mark the token list with the identified word categories
                        mark_categories(forest, toklist, sent_begin + 1)

                    except ParseError as e:
                        # Obtain the index of the offending token
                        err_index = e.token_index
                    print("Parsed sentence of length {0} with {1} combinations{2}".format(slen, num,
                        "\n" + (" ".join(s[1] for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num ** (1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )

    # Return the tokens as a JSON structure to the client
    return jsonify(result = result)