# Module-level imports assumed from the Greynir (Reynir) codebase; the
# exact module paths may differ in your checkout. _QUERY_ROOT,
# find_pos_tags() and mark_categories() are expected to be defined
# elsewhere in this module.
from collections import OrderedDict

from flask import request, jsonify

from tokenizer import TOK, tokenize
from fastparser import Fast_Parser, ParseError, ParseForestDumper, ParseForestPrinter
from reducer import Reducer
from settings import Settings


def _parse(toklist):
    """ Parse a token list as a query """
    # Parse with the nonterminal 'QueryRoot' as the grammar root
    with Fast_Parser(verbose=False, root=_QUERY_ROOT) as bp:
        sent_begin = 0
        num_sent = 0
        num_parsed_sent = 0
        rdc = Reducer(bp.grammar)
        trees = dict()
        sent = []
        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if not slen:
                    continue
                num_sent += 1
                num = 0
                try:
                    # Parse the accumulated sentence
                    forest = bp.go(sent)
                    if forest is not None:
                        num = Fast_Parser.num_combinations(forest)
                        if num > 1:
                            # Reduce the resulting forest to the single best tree
                            forest = rdc.go(forest)
                except ParseError:
                    forest = None
                if num > 0:
                    num_parsed_sent += 1
                    # Obtain a text representation of the parse tree
                    trees[num_sent] = ParseForestDumper.dump_forest(forest)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(num_sent=num_sent, num_parsed_sent=num_parsed_sent)
    return result, trees
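
# Hypothetical usage sketch for _parse(); not part of the original module.
# It assumes that tokenize() accepts a plain string and that _QUERY_ROOT
# names the 'QueryRoot' nonterminal referred to in the comment above.
toklist = list(tokenize("Hver er forseti Íslands?"))
result, trees = _parse(toklist)
print("Parsed {0} of {1} sentence(s)".format(result["num_parsed_sent"], result["num_sent"]))
for ix, tree in trees.items():
    # Each value is the ParseForestDumper text dump of the best parse tree
    print("Query sentence {0}:".format(ix))
    print(tree)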
def parse(toklist, single, use_reducer, dump_forest=False, keep_trees=False):
    """ Parse the given token list and return a result dict """
    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    # Accumulate parsed sentences in a text dump format
    trees = OrderedDict()

    with Fast_Parser(verbose=False) as bp:  # Don't emit diagnostic messages
        version = bp.version
        rdc = Reducer(bp.grammar)

        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0  # Number of tree combinations in forest
                    score = 0  # Reducer score of the best parse tree
                    try:
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)
                            if single and dump_forest:
                                # Dump the parse forest to parse.txt
                                with open("parse.txt", mode="w", encoding="utf-8") as f:
                                    # Note: the original joined the raw token tuples here,
                                    # which raises TypeError; join the token texts instead
                                    print("Reynir parse tree for sentence '{0}'"
                                        .format(" ".join(s[1] for s in sent)), file=f)
                                    print("{0} combinations\n".format(num), file=f)
                                    if num < 10000:
                                        ParseForestPrinter.print_forest(forest, file=f)
                                    else:
                                        print("Too many combinations to dump", file=f)
                        if use_reducer and num > 1:
                            # Reduce the resulting forest to the single highest-scoring tree
                            forest, score = rdc.go_with_score(forest)
                            assert Fast_Parser.num_combinations(forest) == 1
                            if Settings.DEBUG:
                                print(ParseForestDumper.dump_forest(forest))
                            num = 1
                    except ParseError as e:
                        forest = None
                        # Obtain the index of the offending token
                        err_index = e.token_index
                    if Settings.DEBUG:
                        print("Parsed sentence of length {0} with {1} combinations, score {2}{3}"
                            .format(slen, num, score,
                                "\n" + (" ".join(s[1] for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor':
                        # the geometric mean of combinations per token
                        ambig_factor = num ** (1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                        if keep_trees:
                            # We want to keep the trees for further processing
                            # down the line: reduce and dump the best tree to text
                            if num > 1:
                                # Reduce the resulting forest before dumping it
                                forest = rdc.go(forest)
                            trees[num_sent] = ParseForestDumper.dump_forest(forest)
                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(num_parses=num, err_index=err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(
        version=version,
        tokens=toklist,
        tok_num=len(toklist),
        num_sent=num_sent,
        num_parsed_sent=num_parsed_sent,
        avg_ambig_factor=(total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
    return result, trees
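
# Hypothetical driver for parse(), illustrating the keyword flags; the
# sample text is an assumption, not original code.
toklist = list(tokenize("Hér er stutt setning. Og önnur setning."))
result, trees = parse(toklist, single=False, use_reducer=True, keep_trees=True)
print("Grammar version:", result["version"])
print("Average ambiguity factor:", result["avg_ambig_factor"])
for sent_ix, tree_text in trees.items():
    # With keep_trees=True, each value is the text dump of the reduced tree
    print("Best tree for sentence {0}:".format(sent_ix))
    print(tree_text)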
def parse_tokens(toklist, mim_tags, fast_p):
    """ Parse the given token list and return a result dict """
    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    tag_ix = 0
    ntags = len(mim_tags)

    rdc = Reducer(fast_p.grammar)

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if slen:
                # Parse the accumulated sentence
                err_index = None
                num = 0  # Number of tree combinations in forest
                score = 0  # Reducer score of the best parse tree
                try:
                    # Progress indicator: sentence count
                    print("{}".format(num_sent), end="\r")
                    # Parse the sentence
                    forest = fast_p.go(sent)
                    if forest:
                        num = Fast_Parser.num_combinations(forest)
                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                except ParseError as e:
                    forest = None
                    num = 0
                    # Obtain the index of the offending token
                    err_index = e.token_index
                if num > 0:
                    num_parsed_sent += 1
                    # Extract the POS tags for the terminals in the forest
                    pos_tags = find_pos_tags(forest)
                    # Calculate the 'ambiguity factor'
                    ambig_factor = num ** (1 / slen)
                    # Do a weighted average on sentence length
                    total_ambig += ambig_factor * slen
                    total_tokens += slen
                # Mark the sentence beginning with the number of parses
                # and the index of the offending token, if an error occurred
                toklist[sent_begin] = TOK.Begin_Sentence(num_parses=num, err_index=err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
            # Check whether the token streams are in sync
            if tag_ix < ntags and t[1] != mim_tags[tag_ix][1]:
                # Mismatch between the MIM token and the Greynir token:
                # attempt to re-sync by finding the Greynir token within
                # the next few tokens of the MIM tag stream
                gap = 1
                MAX_LOOKAHEAD = 4
                while (gap < MAX_LOOKAHEAD and (tag_ix + gap) < ntags
                        and mim_tags[tag_ix + gap][1] != t[1]):
                    gap += 1
                if gap < MAX_LOOKAHEAD:
                    # Found the Greynir token ahead: skip the gap
                    tag_ix += gap
            if tag_ix < ntags:
                tag_ix += 1

    return dict(
        tokens=toklist,
        tok_num=len(toklist),
        num_sent=num_sent,
        num_parsed_sent=num_parsed_sent,
        avg_ambig_factor=(total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
def parse_tokens(toklist, mim_tags, fast_p):
    """ Parse the given token list and return a result dict """
    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    tag_ix = 0
    ntags = len(mim_tags)

    rdc = Reducer(fast_p.grammar)

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if slen:
                # Parse the accumulated sentence
                err_index = None
                num = 0  # Number of tree combinations in forest
                score = 0  # Reducer score of the best parse tree
                try:
                    # Progress indicator: sentence count
                    print("{}".format(num_sent), end="\r")
                    # Parse the sentence
                    forest = fast_p.go(sent)
                    if forest:
                        num = Fast_Parser.num_combinations(forest)
                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                except ParseError as e:
                    forest = None
                    num = 0  # Reset, so a partial count is not treated as a parse
                    # Obtain the index of the offending token
                    err_index = e.token_index
                if num > 0:
                    num_parsed_sent += 1
                    # Extract the POS tags for the terminals in the forest
                    pos_tags = find_pos_tags(forest)
                    # Calculate the 'ambiguity factor'
                    ambig_factor = num ** (1 / slen)
                    # Do a weighted average on sentence length
                    total_ambig += ambig_factor * slen
                    total_tokens += slen
                # Mark the sentence beginning with the number of parses
                # and the index of the offending token, if an error occurred
                toklist[sent_begin] = TOK.Begin_Sentence(num_parses=num, err_index=err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
            # Check whether the token streams are in sync
            if tag_ix < ntags and t[1] != mim_tags[tag_ix][1]:
                print("Warning: mismatch between MIM token '{0}' and Reynir token '{1}'"
                    .format(mim_tags[tag_ix][1], t[1]))
                # Attempt to re-sync by finding the Reynir token in the MIM tag stream
                gap = 1
                MAX_LOOKAHEAD = 3
                while (gap < MAX_LOOKAHEAD and (tag_ix + gap) < ntags
                        and mim_tags[tag_ix + gap][1] != t[1]):
                    gap += 1
                if gap < MAX_LOOKAHEAD:
                    # Found the Reynir token ahead
                    print("Re-synced by skipping ahead by {0} tokens".format(gap))
                    tag_ix += gap
            if tag_ix < ntags:
                tag_ix += 1

    return dict(
        tokens=toklist,
        tok_num=len(toklist),
        num_sent=num_sent,
        num_parsed_sent=num_parsed_sent,
        avg_ambig_factor=(total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
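
# The token-stream re-sync heuristic used by both parse_tokens() variants
# above is easier to follow in isolation. Below is a self-contained sketch
# of the same bounded-lookahead idea; the function name, sample data and
# default lookahead are illustrative only, not part of the original code.
def resync(token_text, mim_tags, tag_ix, max_lookahead=4):
    """ Given the text of the current Greynir token and an index into the
        MIM (tag, text) stream, look ahead by up to max_lookahead - 1 MIM
        tokens for a matching text and return the updated index. """
    ntags = len(mim_tags)
    if tag_ix < ntags and mim_tags[tag_ix][1] != token_text:
        gap = 1
        while (gap < max_lookahead and (tag_ix + gap) < ntags
                and mim_tags[tag_ix + gap][1] != token_text):
            gap += 1
        if gap < max_lookahead:
            # Found the Greynir token ahead in the MIM stream: skip the gap
            tag_ix += gap
    return tag_ix

# Illustrative data: the MIM stream contains one extra token ('-')
# that the Greynir tokenizer merged away
mim = [("tag", "Hann"), ("tag", "-"), ("tag", "kom")]
ix = resync("kom", mim, 1)  # ix becomes 2, re-synced on 'kom'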
def analyze():
    """ Find word categories in the submitted text """
    txt = request.form.get("txt", "").strip()
    # Tokenize the text entered as-is and return the token list
    toklist = list(tokenize(txt))
    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0

    with Fast_Parser(verbose=False) as bp:  # Don't emit diagnostic messages
        rdc = Reducer(bp.grammar)

        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0  # Number of tree combinations in forest
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)
                            if num > 1:
                                # Reduce the resulting forest
                                forest = rdc.go(forest)
                                assert Fast_Parser.num_combinations(forest) == 1
                            # Mark the token list with the identified word categories
                            mark_categories(forest, toklist, sent_begin + 1)
                    except ParseError as e:
                        # Obtain the index of the offending token
                        err_index = e.token_index
                    print("Parsed sentence of length {0} with {1} combinations{2}"
                        .format(slen, num,
                            "\n" + (" ".join(s[1] for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num ** (1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(num_parses=num, err_index=err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(
        tokens=toklist,
        tok_num=len(toklist),
        num_sent=num_sent,
        num_parsed_sent=num_parsed_sent,
        avg_ambig_factor=(total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
    # Return the tokens as a JSON structure to the client
    return jsonify(result=result)
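
# Hypothetical exercise of analyze() through the Flask test client.
# This assumes the application object is named `app` and that analyze()
# is registered at POST /analyze; neither appears in the code above.
with app.test_client() as client:
    resp = client.post("/analyze", data={"txt": "Hún las bókina."})
    payload = resp.get_json()
    print(payload["result"]["num_sent"], payload["result"]["num_parsed_sent"])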