def _parse(toklist):
    """ Parse a token list as a query """
    # Parse with the nonterminal 'QueryRoot' as the grammar root
    with Fast_Parser(verbose=False, root=_QUERY_ROOT) as bp:
        sent_begin = 0
        num_sent = 0
        num_parsed_sent = 0
        rdc = Reducer(bp.grammar)
        trees = dict()
        sent = []
        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if not slen:
                    continue
                num_sent += 1
                # Parse the accumulated sentence
                num = 0
                try:
                    # Parse the sentence
                    forest = bp.go(sent)
                    if forest is not None:
                        num = Fast_Parser.num_combinations(forest)
                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                except ParseError as e:
                    forest = None
                if num > 0:
                    num_parsed_sent += 1
                    # Obtain a text representation of the parse tree
                    trees[num_sent] = ParseForestDumper.dump_forest(forest)
                    #ParseForestPrinter.print_forest(forest)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(num_sent=num_sent, num_parsed_sent=num_parsed_sent)
    return result, trees
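# A minimal usage sketch for _parse() above. It assumes that tokenize() is the
# tokenizer used elsewhere in this module and that _QUERY_ROOT is already
# defined; the query string is purely illustrative.

def _example_parse_query():
    """ Hypothetical helper showing how _parse() might be driven """
    toklist = list(tokenize("Hver er forseti Íslands?"))
    result, trees = _parse(toklist)
    # result holds sentence counts; trees maps sentence index -> dumped parse tree
    print("Parsed {0} of {1} sentence(s)".format(result["num_parsed_sent"], result["num_sent"]))
    for ix, tree_text in trees.items():
        print("Sentence {0}:\n{1}".format(ix, tree_text))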
def parse(self):
    """ Parse the sentence """
    num = 0
    try:
        forest = self._ip._parser.go(self._s)
        if forest is not None:
            num = Fast_Parser.num_combinations(forest)
            if num > 1:
                forest = self._ip._reducer.go(forest)
    except ParseError as e:
        forest = None
        self._err_index = e.token_index
    self._tree = forest
    self._ip._add_sentence(self, num)
    return num > 0
def parse_grid():
    """ Show the parse grid for a particular parse tree of a sentence """
    MAX_LEVEL = 32 # Maximum level of option depth we can handle
    txt = request.form.get('txt', "")
    parse_path = request.form.get('option', "")
    use_reducer = not ("noreduce" in request.form)
    # Tokenize the text
    tokens = list(tokenize(txt))
    # Parse the text
    with Fast_Parser(verbose = False) as bp: # Don't emit diagnostic messages
        err = dict()
        grammar = bp.grammar
        try:
            forest = bp.go(tokens)
        except ParseError as e:
            err["msg"] = str(e)
            # Relay information about the parser state at the time of the error
            err["info"] = None # e.info
            forest = None

    # Find the number of parse combinations
    combinations = 0 if forest is None else Fast_Parser.num_combinations(forest)
    score = 0

    if Settings.DEBUG:
        # Dump the parse tree to parse.txt
        with open("parse.txt", mode = "w", encoding = "utf-8") as f:
            if forest is not None:
                print("Reynir parse tree for sentence '{0}'".format(txt), file = f)
                print("{0} combinations\n".format(combinations), file = f)
                if combinations < 10000:
                    ParseForestPrinter.print_forest(forest, file = f)
                else:
                    print("Too many combinations to dump", file = f)
            else:
                print("No parse available for sentence '{0}'".format(txt), file = f)

    if forest is not None and use_reducer:
        # Reduce the parse forest
        forest, score = Reducer(grammar).go_with_score(forest)
        if Settings.DEBUG:
            print(ParseForestDumper.dump_forest(forest))

    # Make the parse grid with all options
    grid, ncols = make_grid(forest) if forest else ([], 0)
    # The grid is columnar; convert it to row-major
    # form for convenient translation into HTML
    # There will be as many columns as there are tokens
    nrows = len(grid)
    tbl = [ [] for _ in range(nrows) ]
    # Info about previous row spans
    rs = [ [] for _ in range(nrows) ]
    # The particular option path we are displaying
    if not parse_path:
        # Not specified: display the all-zero path
        path = [(0,) * i for i in range(1, MAX_LEVEL)]
    else:
        # Disassemble the passed-in path
        def toint(s):
            """ Safe conversion of string to int """
            try:
                n = int(s)
            except ValueError:
                n = 0
            return n if n >= 0 else 0
        p = [ toint(s) for s in parse_path.split("_") ]
        path = [tuple(p[0 : i + 1]) for i in range(len(p))]
    # This set will contain all option path choices
    choices = set()
    NULL_TUPLE = tuple()

    for gix, gcol in enumerate(grid):
        # gcol is a dictionary of options
        # Accumulate the options that we want to display
        # according to chosen path
        cols = gcol[NULL_TUPLE] if NULL_TUPLE in gcol else [] # Default content
        # Add the options we're displaying
        for p in path:
            if p in gcol:
                cols.extend(gcol[p])
        # Accumulate all possible path choices
        choices |= gcol.keys()
        # Sort the columns that will be displayed
        cols.sort(key = lambda x: x[0])
        col = 0
        for startcol, endcol, info in cols:
            assert isinstance(info, Nonterminal) or isinstance(info, tuple)
            if col < startcol:
                gap = startcol - col
                gap -= sum(1 for c in rs[gix] if c < startcol)
                if gap > 0:
                    tbl[gix].append((gap, 1, "", ""))
            rowspan = 1
            if isinstance(info, tuple):
                cls = { "terminal" }
                rowspan = nrows - gix
                for i in range(gix + 1, nrows):
                    # Note the rowspan's effect on subsequent rows
                    rs[i].append(startcol)
            else:
                cls = { "nonterminal" }
                # Get the 'pure' name of the nonterminal in question
                assert isinstance(info, Nonterminal)
                info = info.name
            if endcol - startcol == 1:
                cls |= { "vertical" }
            tbl[gix].append((endcol - startcol, rowspan, info, cls))
            col = endcol
        ncols_adj = ncols - len(rs[gix])
        if col < ncols_adj:
            tbl[gix].append((ncols_adj - col, 1, "", ""))

    # Calculate the unique path choices available for this parse grid
    choices -= { NULL_TUPLE } # Default choice: don't need it in the set
    unique_choices = choices.copy()
    for c in choices:
        # Remove all shorter prefixes of c from the unique_choices set
        unique_choices -= { c[0:i] for i in range(1, len(c)) }
    # Create a nice string representation of the unique path choices
    uc_list = [ "_".join(str(c) for c in choice) for choice in unique_choices ]
    if not parse_path:
        # We are displaying the longest possible all-zero choice: find it
        i = 0
        while (0,) * (i + 1) in unique_choices:
            i += 1
        parse_path = "_".join(["0"] * i)
    #debug()
    return render_template("parsegrid.html", txt = txt, err = err, tbl = tbl,
        combinations = combinations, score = score,
        choice_list = uc_list, parse_path = parse_path)
def parse(toklist, single, use_reducer, dump_forest = False, keep_trees = False):
    """ Parse the given token list and return a result dict """

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    # Accumulate parsed sentences in a text dump format
    trees = OrderedDict()

    with Fast_Parser(verbose = False) as bp: # Don't emit diagnostic messages
        version = bp.version
        rdc = Reducer(bp.grammar)

        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0 # Number of tree combinations in forest
                    score = 0 # Reducer score of the best parse tree
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)
                            if single and dump_forest:
                                # Dump the parse tree to parse.txt
                                with open("parse.txt", mode = "w", encoding = "utf-8") as f:
                                    print("Reynir parse tree for sentence '{0}'"
                                        .format(" ".join(s[1] for s in sent)), file = f)
                                    print("{0} combinations\n".format(num), file = f)
                                    if num < 10000:
                                        ParseForestPrinter.print_forest(forest, file = f)
                                    else:
                                        print("Too many combinations to dump", file = f)
                        if use_reducer and num > 1:
                            # Reduce the resulting forest
                            forest, score = rdc.go_with_score(forest)
                            assert Fast_Parser.num_combinations(forest) == 1
                            if Settings.DEBUG:
                                print(ParseForestDumper.dump_forest(forest))
                            num = 1
                    except ParseError as e:
                        forest = None
                        # Obtain the index of the offending token
                        err_index = e.token_index

                    if Settings.DEBUG:
                        print("Parsed sentence of length {0} with {1} combinations, score {2}{3}"
                            .format(slen, num, score,
                                "\n" + (" ".join(s[1] for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num ** (1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                        if keep_trees:
                            # We want to keep the trees for further processing down the line:
                            # reduce and dump the best tree to text
                            if num > 1:
                                # Reduce the resulting forest before dumping it to text format
                                forest = rdc.go(forest)
                            trees[num_sent] = ParseForestDumper.dump_forest(forest)
                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(
        version = version,
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
    # noinspection PyRedundantParentheses
    return (result, trees)
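# A minimal sketch of how parse() above might be called. It assumes tokenize()
# is the same tokenizer used by the rest of this module; the sample text and
# flag values are illustrative only.

def _example_parse_text():
    """ Hypothetical driver for parse() """
    toklist = list(tokenize("Hún las bókina."))
    result, trees = parse(toklist, single = True, use_reducer = True, keep_trees = True)
    print("Parser version {0}".format(result["version"]))
    print("{0}/{1} sentences parsed, average ambiguity {2:.2f}"
        .format(result["num_parsed_sent"], result["num_sent"], result["avg_ambig_factor"]))
    # With keep_trees = True, trees contains one dumped tree per parsed sentence
    for ix, tree_text in trees.items():
        print("Sentence {0}:\n{1}".format(ix, tree_text))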
def run_test(fast_p):
    """ Run a test parse on all sentences in the test table """
    with closing(Test_DB.open_db()) as db:
        slist = db.sentences()
        for s in slist:
            txt = s["sentence"]
            target = s["target"] # The ideal number of parse trees (1 or 0)

            tokens = tokenize(txt)
            tlist = list(tokens)
            err = ""

            # Run the all-Python parser
            # try:
            #     t0 = time.time()
            #     forest = p.go(tlist)
            # except ParseError as e:
            #     err = "{0}".format(e)
            #     forest = None
            # finally:
            #     t1 = time.time()
            # ParseForestPrinter.print_forest(p.grammar, forest, detailed = True)

            # Run the C++ parser
            try:
                tf0 = time.time()
                forest2 = fast_p.go(tlist)
            except ParseError as e:
                err = "{0}".format(e)
                forest2 = None
            finally:
                tf1 = time.time()

            # num = 0 if forest is None else Parser.num_combinations(forest)
            num2 = 0 if forest2 is None else Fast_Parser.num_combinations(forest2)

            if Settings.DEBUG:
                #print("Python: Parsed in {0:.4f} seconds, {1} combinations".format(t1 - t0, num))
                print("C++: Parsed in {0:.4f} seconds, {1} combinations".format(tf1 - tf0, num2))

            best = s["best"]
            if best <= 0 or abs(target - num2) < abs(target - best):
                # We are closer to the ideal number of parse trees (target) than
                # the best parse so far: change the best one
                best = num2

            db.update_sentence(s["identity"], s["sentence"], num2, best, target)

            yield dict(
                identity=s["identity"],
                sentence=txt,
                numtrees=num2,
                best=best,
                target=target,
                parse_time=tf1 - tf0,
                err="" if target == 0 else err, # Don't bother showing errors that are expected
                forest=forest2)
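# A minimal sketch of how run_test() above might be consumed. It assumes that
# Fast_Parser can be instantiated as it is elsewhere in this module; the report
# formatting is illustrative only.

def _example_run_test():
    """ Hypothetical driver that prints a one-line report per test sentence """
    with Fast_Parser(verbose=False) as fast_p:
        for r in run_test(fast_p):
            status = "OK" if r["numtrees"] == r["target"] else "DIFF"
            print("{0} [{1}] {2} trees (target {3}, best {4}) in {5:.3f}s"
                .format(status, r["identity"], r["numtrees"], r["target"], r["best"], r["parse_time"]))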
def parse_tokens(toklist, mim_tags, fast_p):
    """ Parse the given token list and return a result dict """

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    tag_ix = 0
    ntags = len(mim_tags)

    rdc = Reducer(fast_p.grammar)

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if slen:
                # Parse the accumulated sentence
                err_index = None
                num = 0 # Number of tree combinations in forest
                score = 0 # Reducer score of the best parse tree
                try:
                    # Progress indicator: sentence count
                    print("{}".format(num_sent), end="\r")
                    # Parse the sentence
                    forest = fast_p.go(sent)
                    if forest:
                        num = Fast_Parser.num_combinations(forest)
                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                except ParseError as e:
                    forest = None
                    num = 0
                    # Obtain the index of the offending token
                    err_index = e.token_index

                if num > 0:
                    num_parsed_sent += 1
                    # Extract the POS tags for the terminals in the forest
                    pos_tags = find_pos_tags(forest)
                    # Calculate the 'ambiguity factor'
                    ambig_factor = num ** (1 / slen)
                    # Do a weighted average on sentence length
                    total_ambig += ambig_factor * slen
                    total_tokens += slen
                # Mark the sentence beginning with the number of parses
                # and the index of the offending token, if an error occurred
                toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
            # Check whether the token streams are in sync
            if tag_ix < ntags and t[1] != mim_tags[tag_ix][1]:
                #print("Warning: mismatch between MIM token '{0}' and Greynir token '{1}'".format(mim_tags[tag_ix][1], t[1]))
                # Attempt to sync again by finding the Greynir token in the MIM tag stream
                gap = 1
                MAX_LOOKAHEAD = 4
                while gap < MAX_LOOKAHEAD and (tag_ix + gap) < ntags and mim_tags[tag_ix + gap][1] != t[1]:
                    gap += 1
                if gap < MAX_LOOKAHEAD:
                    # Found the Greynir token ahead
                    #print("Re-synced by skipping ahead by {0} tokens".format(gap))
                    tag_ix += gap
            if tag_ix < ntags:
                tag_ix += 1

    return dict(
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
def parse_tokens(toklist, mim_tags, fast_p):
    """ Parse the given token list and return a result dict """

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    tag_ix = 0
    ntags = len(mim_tags)

    rdc = Reducer(fast_p.grammar)

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if slen:
                # Parse the accumulated sentence
                err_index = None
                num = 0 # Number of tree combinations in forest
                score = 0 # Reducer score of the best parse tree
                try:
                    # Progress indicator: sentence count
                    print("{}".format(num_sent), end="\r")
                    # Parse the sentence
                    forest = fast_p.go(sent)
                    if forest:
                        num = Fast_Parser.num_combinations(forest)
                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                except ParseError as e:
                    forest = None
                    # Obtain the index of the offending token
                    err_index = e.token_index

                if num > 0:
                    num_parsed_sent += 1
                    # Extract the POS tags for the terminals in the forest
                    pos_tags = find_pos_tags(forest)
                    # Calculate the 'ambiguity factor'
                    ambig_factor = num ** (1 / slen)
                    # Do a weighted average on sentence length
                    total_ambig += ambig_factor * slen
                    total_tokens += slen
                # Mark the sentence beginning with the number of parses
                # and the index of the offending token, if an error occurred
                toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
            # Check whether the token streams are in sync
            if tag_ix < ntags and t[1] != mim_tags[tag_ix][1]:
                print("Warning: mismatch between MIM token '{0}' and Reynir token '{1}'".format(mim_tags[tag_ix][1], t[1]))
                # Attempt to sync again by finding the Reynir token in the MIM tag stream
                gap = 1
                MAX_LOOKAHEAD = 3
                while gap < MAX_LOOKAHEAD and (tag_ix + gap) < ntags and mim_tags[tag_ix + gap][1] != t[1]:
                    gap += 1
                if gap < MAX_LOOKAHEAD:
                    # Found the Reynir token ahead
                    print("Re-synced by skipping ahead by {0} tokens".format(gap))
                    tag_ix += gap
            if tag_ix < ntags:
                tag_ix += 1

    return dict(
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
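# A minimal sketch of how parse_tokens() above might be invoked, assuming the
# text has already been tokenized and the MIM tag stream loaded elsewhere.
# load_mim_tags() is a hypothetical placeholder, not part of this module.

def _example_parse_tokens(txt):
    """ Hypothetical driver for parse_tokens() """
    toklist = list(tokenize(txt))
    mim_tags = load_mim_tags(txt) # Placeholder: supply the matching MIM tag stream here
    with Fast_Parser(verbose=False) as fast_p:
        stats = parse_tokens(toklist, mim_tags, fast_p)
    print("{0}/{1} sentences parsed".format(stats["num_parsed_sent"], stats["num_sent"]))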
def analyze():
    """ Find word categories in the submitted text """
    txt = request.form.get("txt", "").strip()
    # Tokenize the text entered as-is and return the token list
    toklist = list(tokenize(txt))
    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0

    with Fast_Parser(verbose=False) as bp: # Don't emit diagnostic messages
        rdc = Reducer(bp.grammar)
        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0 # Number of tree combinations in forest
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)
                            if num > 1:
                                # Reduce the resulting forest
                                forest = rdc.go(forest)
                                assert Fast_Parser.num_combinations(forest) == 1
                            # Mark the token list with the identified word categories
                            mark_categories(forest, toklist, sent_begin + 1)
                    except ParseError as e:
                        # Obtain the index of the offending token
                        err_index = e.token_index

                    print("Parsed sentence of length {0} with {1} combinations{2}".format(slen, num,
                        "\n" + (" ".join(s[1] for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num ** (1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(num_parses=num, err_index=err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(
        tokens=toklist,
        tok_num=len(toklist),
        num_sent=num_sent,
        num_parsed_sent=num_parsed_sent,
        avg_ambig_factor=(total_ambig / total_tokens) if total_tokens > 0 else 1.0)
    # Return the tokens as a JSON structure to the client
    return jsonify(result=result)
def parse_grid():
    """ Show the parse grid for a particular parse tree of a sentence """
    MAX_LEVEL = 32 # Maximum level of option depth we can handle
    txt = request.form.get('txt', "")
    parse_path = request.form.get('option', "")
    debug_mode = get_json_bool(request, 'debug')
    use_reducer = not ("noreduce" in request.form)
    # Tokenize the text
    tokens = list(tokenize(txt))
    # Parse the text
    with Fast_Parser(verbose=False) as bp: # Don't emit diagnostic messages
        err = dict()
        grammar = bp.grammar
        try:
            forest = bp.go(tokens)
        except ParseError as e:
            err["msg"] = str(e)
            # Relay information about the parser state at the time of the error
            err["info"] = None # e.info
            forest = None

    # Find the number of parse combinations
    combinations = 0 if forest is None else Fast_Parser.num_combinations(forest)
    score = 0

    if Settings.DEBUG:
        # Dump the parse tree to parse.txt
        with open("parse.txt", mode="w", encoding="utf-8") as f:
            if forest is not None:
                print("Reynir parse forest for sentence '{0}'".format(txt), file=f)
                print("{0} combinations\n".format(combinations), file=f)
                if combinations < 10000:
                    ParseForestPrinter.print_forest(forest, file=f)
                else:
                    print("Too many combinations to dump", file=f)
            else:
                print("No parse available for sentence '{0}'".format(txt), file=f)

    if forest is not None and use_reducer:
        # Reduce the parse forest
        forest, score = Reducer(grammar).go_with_score(forest)
        if Settings.DEBUG:
            # Dump the reduced tree along with node scores
            with open("reduce.txt", mode="w", encoding="utf-8") as f:
                print("Reynir parse tree for sentence '{0}' after reduction".format(txt), file=f)
                ParseForestPrinter.print_forest(forest, file=f)

    # Make the parse grid with all options
    grid, ncols = make_grid(forest) if forest else ([], 0)
    # The grid is columnar; convert it to row-major
    # form for convenient translation into HTML
    # There will be as many columns as there are tokens
    nrows = len(grid)
    tbl = [[] for _ in range(nrows)]
    # Info about previous row spans
    rs = [[] for _ in range(nrows)]
    # The particular option path we are displaying
    if not parse_path:
        # Not specified: display the all-zero path
        path = [(0,) * i for i in range(1, MAX_LEVEL)]
    else:
        # Disassemble the passed-in path
        def toint(s):
            """ Safe conversion of string to int """
            try:
                n = int(s)
            except ValueError:
                n = 0
            return n if n >= 0 else 0
        p = [toint(s) for s in parse_path.split("_")]
        path = [tuple(p[0:i + 1]) for i in range(len(p))]
    # This set will contain all option path choices
    choices = set()
    NULL_TUPLE = tuple()

    for gix, gcol in enumerate(grid):
        # gcol is a dictionary of options
        # Accumulate the options that we want to display
        # according to chosen path
        cols = gcol[NULL_TUPLE] if NULL_TUPLE in gcol else [] # Default content
        # Add the options we're displaying
        for p in path:
            if p in gcol:
                cols.extend(gcol[p])
        # Accumulate all possible path choices
        choices |= gcol.keys()
        # Sort the columns that will be displayed
        cols.sort(key=lambda x: x[0])
        col = 0
        for startcol, endcol, info in cols:
            #assert isinstance(info, Nonterminal) or isinstance(info, tuple)
            if col < startcol:
                gap = startcol - col
                gap -= sum(1 for c in rs[gix] if c < startcol)
                if gap > 0:
                    tbl[gix].append((gap, 1, "", ""))
            rowspan = 1
            if isinstance(info, tuple):
                cls = {"terminal"}
                rowspan = nrows - gix
                for i in range(gix + 1, nrows):
                    # Note the rowspan's effect on subsequent rows
                    rs[i].append(startcol)
            else:
                cls = {"nonterminal"}
                # Get the 'pure' name of the nonterminal in question
                #assert isinstance(info, Nonterminal)
                info = info.name
            if endcol - startcol == 1:
                cls |= {"vertical"}
            tbl[gix].append((endcol - startcol, rowspan, info, cls))
            col = endcol
        ncols_adj = ncols - len(rs[gix])
        if col < ncols_adj:
            tbl[gix].append((ncols_adj - col, 1, "", ""))

    # Calculate the unique path choices available for this parse grid
    choices -= {NULL_TUPLE} # Default choice: don't need it in the set
    unique_choices = choices.copy()
    for c in choices:
        # Remove all shorter prefixes of c from the unique_choices set
        unique_choices -= {c[0:i] for i in range(1, len(c))}
    # Create a nice string representation of the unique path choices
    uc_list = ["_".join(str(c) for c in choice) for choice in unique_choices]
    if not parse_path:
        # We are displaying the longest possible all-zero choice: find it
        i = 0
        while (0,) * (i + 1) in unique_choices:
            i += 1
        parse_path = "_".join(["0"] * i)

    return render_template("parsegrid.html", txt=txt, err=err, tbl=tbl,
        combinations=combinations, score=score, debug_mode=debug_mode,
        choice_list=uc_list, parse_path=parse_path)
def analyze():
    """ Find word categories in the submitted text """
    txt = request.form.get("txt", "").strip()
    # Tokenize the text entered as-is and return the token list
    toklist = list(tokenize(txt))
    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0

    with Fast_Parser(verbose = False) as bp: # Don't emit diagnostic messages
        rdc = Reducer(bp.grammar)
        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0 # Number of tree combinations in forest
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)
                            if num > 1:
                                # Reduce the resulting forest
                                forest = rdc.go(forest)
                                assert Fast_Parser.num_combinations(forest) == 1
                            # Mark the token list with the identified word categories
                            mark_categories(forest, toklist, sent_begin + 1)
                    except ParseError as e:
                        # Obtain the index of the offending token
                        err_index = e.token_index

                    print("Parsed sentence of length {0} with {1} combinations{2}".format(slen, num,
                        "\n" + (" ".join(s[1] for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num ** (1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
    # Return the tokens as a JSON structure to the client
    return jsonify(result = result)
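# A minimal sketch of exercising the analyze() view above through Flask's test
# client. The application object 'app' and the '/analyze' URL rule are
# assumptions made for illustration; substitute the actual app and route.

def _example_analyze_request():
    """ Hypothetical test-client call against the analyze() view """
    with app.test_client() as client:
        resp = client.post("/analyze", data={"txt": "Jón keypti bílinn."})
        payload = resp.get_json()["result"]
        print("{0} tokens, {1}/{2} sentences parsed"
            .format(payload["tok_num"], payload["num_parsed_sent"], payload["num_sent"]))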