Example 1
def test3():

    print("\n\n------ Test 3 ---------")

    p = BIN_Parser()
    g = p.grammar()

    print("Reynir.grammar has {0} nonterminals, {1} terminals, {2} productions"
        .format(g.num_nonterminals(), g.num_terminals(), g.num_productions()))

    def create_sentence_table():
        """ Only used to create a test fresh sentence table if one doesn't exist """
        with closing(Test_DB.open_db()) as db:

            try:
                db.create_sentence_table()

                TEXTS = [
                    "Páll fór út með stóran kött og Jón keypti heitan graut.",
                    "Unga fallega konan frá Garðabæ elti ljóta og feita karlinn rösklega og fumlaust í svörtu myrkrinu",
                    "Kötturinn sem strákurinn átti veiddi feitu músina",
                    "Gamla bláa kommóðan var máluð fjólublá með olíumálningu",
                    "Landsframleiðslan hefur aukist frá því í fyrra",
                    "Guðmundur og Guðrún kusu Framsóknarflokkinn",
                    "Þú skalt fara til Danmerkur.",
                    "Ég og þú fórum til Frakklands í utanlandsferð",
                    "Stóru bláu könnunni mun hafa verið fleygt í ruslið",
                    "Már Guðmundsson segir margskonar misskilnings gæta hjá Hannesi Hólmsteini",
                    "Már Guðmundsson seðlabankastjóri Íslands segir þetta við Morgunblaðið í dag.",
                    "Það er náttúrlega einungis í samfélögum sem eiga við býsna stór vandamál að stríða að ný stjórnmálaöfl geta snögglega sveiflast upp í þriðjungs fylgi.",
                    "Áætlaður kostnaður verkefnisins var tíu milljónir króna og áætluð verklok eru í byrjun september næstkomandi.",
                    "Pakkinn snerist um að ábyrgjast innlán og skuldabréfaútgáfu danskra fjármálafyrirtækja.",
                    "Kynningarfundurinn sem ég hélt í dag fjallaði um lausnina á þessum vanda.",
                    "Kynningarfundurinn sem haldinn var í dag fjallaði um lausnina á þessum vanda.",
                    "Það sakamál sé til meðferðar við Héraðsdóm Suðurlands."
                ]

                for t in TEXTS:
                    db.add_sentence(t)

                slist = db.sentences()
                for s in slist:
                    print("{0}".format(s))

            except Exception as e:
                print("{0}".format(e))

    for test in run_test(p):

        print("\n'{0}'\n{1} parse trees found in {2:.3f} seconds\n"
            .format(test["sentence"], test["numtrees"], test["parse_time"]))

        if test["numtrees"] > 0:
            Parser.print_parse_forest(test["forest"])
            # print("{0}".format(Parser.make_schema(test["forest"])))
        elif test["err"]:
            print("Error: {0}".format(test["err"]))
Example 2
def test():
    """ Handler for a page of sentences for testing """

    # Run test and show the result
    bp = BIN_Parser()

    return render_template("test.html", result=run_test(bp))
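
These handlers read like Flask view functions with their route decorators stripped. A minimal sketch of the assumed wiring (the app object and route path are guesses, not the original project's code):

from flask import Flask, render_template

app = Flask(__name__)

@app.route("/test")
def test():
    """ Handler for a page of sentences for testing """
    bp = BIN_Parser()
    return render_template("test.html", result=run_test(bp))

# parse_grid() and analyze() read request.form, so their routes would
# presumably be registered with methods=["POST"]
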
Example 3
def parse_grid():
    """ Show the parse grid for a particular parse tree of a sentence """

    MAX_LEVEL = 32 # Maximum level of option depth we can handle
    txt = request.form.get('txt', "")
    parse_path = request.form.get('option', "")

    # Tokenize the text
    tokens = list(tokenize(txt))
    # Parse the text
    bp = BIN_Parser()
    err = dict()

    try:
        forest = bp.go(tokens)
    except ParseError as e:
        err["msg"] = str(e)
        # Relay information about the parser state at the time of the error
        err["info"] = e.info()
        forest = None

    # Find the number of parse combinations
    combinations = Parser.num_combinations(forest) if forest else 0
    # Make the parse grid with all options
    grid, ncols = Parser.make_grid(forest) if forest else ([], 0)
    # The grid is columnar; convert it to row-major
    # form for convenient translation into HTML
    # There will be as many columns as there are tokens
    nrows = len(grid)
    tbl = [ [] for _ in range(nrows) ]
    # Info about previous row spans
    rs = [ [] for _ in range(nrows) ]

    # The particular option path we are displaying
    if not parse_path:
        # Not specified: display the all-zero path
        path = [(0,) * i for i in range(1, MAX_LEVEL)]
    else:
        # Disassemble the passed-in path

        def toint(s):
            """ Safe conversion of string to int """
            try:
                n = int(s)
            except ValueError:
                n = 0
            return n if n >= 0 else 0

        p = [ toint(s) for s in parse_path.split("_") ]
        path = [tuple(p[0 : i + 1]) for i in range(len(p))]

    # This set will contain all option path choices
    choices = set()
    NULL_TUPLE = tuple()

    for gix, gcol in enumerate(grid):
        # gcol is a dictionary of options
        # Accumulate the options that we want to display
        # according to the chosen path
        cols = list(gcol.get(NULL_TUPLE, [])) # Default content, copied so that extend() below doesn't mutate the grid
        # Add the options we're displaying
        for p in path:
            if p in gcol:
                cols.extend(gcol[p])
        # Accumulate all possible path choices
        choices |= gcol.keys()
        # Sort the columns that will be displayed
        cols.sort(key = lambda x: x[0])
        col = 0
        for startcol, endcol, info in cols:
            assert isinstance(info, Nonterminal) or isinstance(info, tuple)
            if col < startcol:
                gap = startcol - col
                gap -= sum(1 for c in rs[gix] if c < startcol)
                if gap > 0:
                    tbl[gix].append((gap, 1, "", ""))
            rowspan = 1
            if isinstance(info, tuple):
                cls = { "terminal" }
                rowspan = nrows - gix
                for i in range(gix + 1, nrows):
                    # Note the rowspan's effect on subsequent rows
                    rs[i].append(startcol)
            else:
                cls = { "nonterminal" }
                # Get the 'pure' name of the nonterminal in question
                assert isinstance(info, Nonterminal)
                info = info.name()
            if endcol - startcol == 1:
                cls |= { "vertical" }
            tbl[gix].append((endcol-startcol, rowspan, info, cls))
            col = endcol
        ncols_adj = ncols - len(rs[gix])
        if col < ncols_adj:
            tbl[gix].append((ncols_adj - col, 1, "", ""))
    # Calculate the unique path choices available for this parse grid
    choices -= { NULL_TUPLE } # Default choice: don't need it in the set
    unique_choices = choices.copy()
    for c in choices:
        # Remove all shorter prefixes of c from the unique_choices set
        unique_choices -= { c[0:i] for i in range(1, len(c)) }
    # Create a nice string representation of the unique path choices
    uc_list = [ "_".join(str(c) for c in choice) for choice in unique_choices ]
    if not parse_path:
        # We are displaying the longest possible all-zero choice: find it
        i = 0
        while (0,) * (i + 1) in unique_choices:
            i += 1
        parse_path = "_".join(["0"] * i)

    #debug()

    return render_template("parsegrid.html", txt = txt, err = err, tbl = tbl,
        combinations = combinations, choice_list = uc_list,
        parse_path = parse_path)
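
The option-path handling above splits a client-supplied string such as "0_1_0" into all of its prefix tuples, so that every level of the parse grid can look up its own choice. A standalone demonstration of that decomposition (plain Python, no parser required):

parse_path = "0_1_0"
p = [int(s) for s in parse_path.split("_")]
path = [tuple(p[0 : i + 1]) for i in range(len(p))]
assert path == [(0,), (0, 1), (0, 1, 0)]
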
Example 4
def analyze():
    """ Analyze text from a given URL """

    url = request.form.get("url", "").strip()
    t0 = time.time()

    if url.startswith("http:") or url.startswith("https:"):
        # Scrape the URL, tokenize the text content and return the token list
        toklist = list(process_url(url))
    else:
        # Tokenize the text entered as-is and return the token list
        toklist = list(tokenize(url))

    tok_time = time.time() - t0

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0

    sent_begin = 0
    bp = BIN_Parser()

    t0 = time.time()

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            # Parse the accumulated sentence
            err_index = None
            try:
                forest = bp.go(sent)
            except ParseError as e:
                forest = None
                # Obtain the index of the offending token
                err_index = e.token_index()
            num = 0 if forest is None else Parser.num_combinations(forest)
            print("Parsed sentence of length {0} with {1} combinations{2}".format(slen, num,
                "\n" + " ".join(s[1] for s in sent) if num >= 100 else ""))
            if num > 0:
                num_parsed_sent += 1
                # Calculate the 'ambiguity factor'
                ambig_factor = num ** (1 / slen)
                # Do a weighted average on sentence length
                total_ambig += ambig_factor * slen
                total_tokens += slen
            # Mark the sentence beginning with the number of parses
            # and the index of the offending token, if an error occurred
            toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)

    parse_time = time.time() - t0

    result = dict(
        tokens = toklist,
        tok_time = tok_time,
        tok_num = len(toklist),
        parse_time = parse_time,
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )

    # Dump the tokens to a text file for inspection
    # dump_tokens_to_file("txt", toklist)

    # Return the tokens as a JSON structure to the client
    return jsonify(result = result)
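
The 'ambiguity factor' num ** (1 / slen) computed above is the geometric mean of the number of parse combinations per token, and analyze() averages it weighted by sentence length. A worked standalone example with hypothetical data:

# (sentence length, parse combinations): both sentences here average
# exactly 2 choices per token, since 1024 == 2**10 and 32 == 2**5
sentences = [(10, 1024), (5, 32)]
total_ambig = sum((num ** (1 / slen)) * slen for slen, num in sentences)
total_tokens = sum(slen for slen, _ in sentences)
avg_ambig_factor = total_ambig / total_tokens  # == 2.0
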
Example 5
def parse_grid():
    """ Show the parse grid for a particular parse tree of a sentence """

    MAX_LEVEL = 32  # Maximum level of option depth we can handle
    txt = request.form.get('txt', "")
    parse_path = request.form.get('option', "")

    # Tokenize the text
    tokens = list(tokenize(txt))
    # Parse the text
    bp = BIN_Parser()
    err = dict()

    try:
        forest = bp.go(tokens)
    except ParseError as e:
        err["msg"] = str(e)
        # Relay information about the parser state at the time of the error
        err["info"] = e.info()
        forest = None

    # Find the number of parse combinations
    combinations = Parser.num_combinations(forest) if forest else 0
    # Make the parse grid with all options
    grid, ncols = Parser.make_grid(forest) if forest else ([], 0)
    # The grid is columnar; convert it to row-major
    # form for convenient translation into HTML
    # There will be as many columns as there are tokens
    nrows = len(grid)
    tbl = [[] for _ in range(nrows)]
    # Info about previous row spans
    rs = [[] for _ in range(nrows)]

    # The particular option path we are displaying
    if not parse_path:
        # Not specified: display the all-zero path
        path = [(0, ) * i for i in range(1, MAX_LEVEL)]
    else:
        # Disassemble the passed-in path

        def toint(s):
            """ Safe conversion of string to int """
            try:
                n = int(s)
            except ValueError:
                n = 0
            return n if n >= 0 else 0

        p = [toint(s) for s in parse_path.split("_")]
        path = [tuple(p[0:i + 1]) for i in range(len(p))]

    # This set will contain all option path choices
    choices = set()
    NULL_TUPLE = tuple()

    for gix, gcol in enumerate(grid):
        # gcol is a dictionary of options
        # Accumulate the options that we want to display
        # according to the chosen path
        cols = list(gcol.get(NULL_TUPLE, []))  # Default content, copied so that extend() below doesn't mutate the grid
        # Add the options we're displaying
        for p in path:
            if p in gcol:
                cols.extend(gcol[p])
        # Accumulate all possible path choices
        choices |= gcol.keys()
        # Sort the columns that will be displayed
        cols.sort(key=lambda x: x[0])
        col = 0
        for startcol, endcol, info in cols:
            assert isinstance(info, Nonterminal) or isinstance(info, tuple)
            if col < startcol:
                gap = startcol - col
                gap -= sum(1 for c in rs[gix] if c < startcol)
                if gap > 0:
                    tbl[gix].append((gap, 1, "", ""))
            rowspan = 1
            if isinstance(info, tuple):
                cls = {"terminal"}
                rowspan = nrows - gix
                for i in range(gix + 1, nrows):
                    # Note the rowspan's effect on subsequent rows
                    rs[i].append(startcol)
            else:
                cls = {"nonterminal"}
                # Get the 'pure' name of the nonterminal in question
                assert isinstance(info, Nonterminal)
                info = info.name()
            if endcol - startcol == 1:
                cls |= {"vertical"}
            tbl[gix].append((endcol - startcol, rowspan, info, cls))
            col = endcol
        ncols_adj = ncols - len(rs[gix])
        if col < ncols_adj:
            tbl[gix].append((ncols_adj - col, 1, "", ""))
    # Calculate the unique path choices available for this parse grid
    choices -= {NULL_TUPLE}  # Default choice: don't need it in the set
    unique_choices = choices.copy()
    for c in choices:
        # Remove all shorter prefixes of c from the unique_choices set
        unique_choices -= {c[0:i] for i in range(1, len(c))}
    # Create a nice string representation of the unique path choices
    uc_list = ["_".join(str(c) for c in choice) for choice in unique_choices]
    if not parse_path:
        # We are displaying the longest possible all-zero choice: find it
        i = 0
        while (0, ) * (i + 1) in unique_choices:
            i += 1
        parse_path = "_".join(["0"] * i)

    #debug()

    return render_template("parsegrid.html",
                           txt=txt,
                           err=err,
                           tbl=tbl,
                           combinations=combinations,
                           choice_list=uc_list,
                           parse_path=parse_path)
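
The prefix pruning at the end of parse_grid() keeps only maximal option paths: any choice that is a strict prefix of a longer choice is dropped from unique_choices. A standalone demonstration:

choices = {(0,), (0, 0), (0, 0, 1), (1,)}
unique_choices = choices.copy()
for c in choices:
    unique_choices -= {c[0:i] for i in range(1, len(c))}
assert unique_choices == {(0, 0, 1), (1,)}
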
Example 6
def analyze():
    """ Analyze text from a given URL """

    url = request.form.get("url", "").strip()
    t0 = time.time()

    if url.startswith("http:") or url.startswith("https:"):
        # Scrape the URL, tokenize the text content and return the token list
        toklist = list(process_url(url))
    else:
        # Tokenize the text entered as-is and return the token list
        toklist = list(tokenize(url))

    tok_time = time.time() - t0

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0

    sent_begin = 0
    bp = BIN_Parser()

    t0 = time.time()

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            # Parse the accumulated sentence
            err_index = None
            try:
                forest = bp.go(sent)
            except ParseError as e:
                forest = None
                # Obtain the index of the offending token
                err_index = e.token_index()
            num = 0 if forest is None else Parser.num_combinations(forest)
            print("Parsed sentence of length {0} with {1} combinations{2}".
                  format(
                      slen, num,
                      "\n" + " ".join(s[1]
                                      for s in sent) if num >= 100 else ""))
            if num > 0:
                num_parsed_sent += 1
                # Calculate the 'ambiguity factor'
                ambig_factor = num**(1 / slen)
                # Do a weighted average on sentence length
                total_ambig += ambig_factor * slen
                total_tokens += slen
            # Mark the sentence beginning with the number of parses
            # and the index of the offending token, if an error occurred
            toklist[sent_begin] = TOK.Begin_Sentence(num_parses=num,
                                                     err_index=err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)

    parse_time = time.time() - t0

    result = dict(tokens=toklist,
                  tok_time=tok_time,
                  tok_num=len(toklist),
                  parse_time=parse_time,
                  num_sent=num_sent,
                  num_parsed_sent=num_parsed_sent,
                  avg_ambig_factor=(total_ambig / total_tokens) if total_tokens > 0 else 1.0)

    # Dump the tokens to a text file for inspection
    # dump_tokens_to_file("txt", toklist)

    # Return the tokens as a JSON structure to the client
    return jsonify(result=result)
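
The loop in analyze() assumes a flat token stream in which each sentence is bracketed by TOK.S_BEGIN and TOK.S_END markers and each token is a tuple whose first element is its kind and whose second is its text. A minimal illustration of that shape (TOK.WORD and the two-field layout are assumptions inferred from how t[0] and s[1] are used above):

stream = [
    (TOK.S_BEGIN, None),
    (TOK.WORD, "Páll"),
    (TOK.WORD, "fór"),
    (TOK.S_END, None),
]
sent = [t for t in stream if t[0] not in (TOK.S_BEGIN, TOK.S_END)]
print(" ".join(t[1] for t in sent))  # -> "Páll fór"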