def _parse(toklist):
    """ Parse a token list as a query """
    # Parse with the nonterminal 'QueryRoot' as the grammar root
    with Fast_Parser(verbose=False, root=_QUERY_ROOT) as bp:
        sent_begin = 0
        num_sent = 0
        num_parsed_sent = 0
        rdc = Reducer(bp.grammar)
        trees = dict()
        sent = []
        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if not slen:
                    continue
                num_sent += 1
                # Parse the accumulated sentence
                num = 0
                try:
                    # Parse the sentence
                    forest = bp.go(sent)
                    if forest is not None:
                        num = Fast_Parser.num_combinations(forest)
                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                except ParseError:
                    # Reset num as well as forest, so that we don't
                    # attempt to dump a forest that is None
                    forest = None
                    num = 0
                if num > 0:
                    num_parsed_sent += 1
                    # Obtain a text representation of the parse tree
                    trees[num_sent] = ParseForestDumper.dump_forest(forest)
                    # ParseForestPrinter.print_forest(forest)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)
        result = dict(num_sent=num_sent, num_parsed_sent=num_parsed_sent)
        return result, trees

def make_tree(text: str) -> Tree:
    """ Tokenize and parse text, returning a Tree object loaded from
        a string dump of all the resulting parse trees """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse: " + sent.text
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
            )
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())
    tree = Tree()
    tree.load(tree_string)
    return tree

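# A minimal usage sketch for make_tree above (hypothetical driver; the sample
# sentence is illustrative input only). The string loaded into the Tree is a
# concatenation of per-sentence blocks of the form
#   S<sentence index>
#   C<tree score>
#   L<token count>
#   <dumped parse forest>
# exactly as assembled by the function.
def _demo_make_tree() -> Tree:
    return make_tree("Páll fór út með hundinn.")
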
def parse_text_with_full_tree(session, text, all_names=False):
    """ Parse plain text, assumed to contain one sentence only,
        and return its simplified form as well as its full form. """

    full_tree = None

    def xform(tokens, tree, err_index):
        """ Transformation function that yields a simplified parse tree
            with POS-tagged, normalized terminal leaves for the sentence """
        if err_index is not None:
            return TreeUtility.dump_tokens(tokens, tree, error_index=err_index)
        # Successfully parsed: return a simplified tree for the sentence
        nonlocal full_tree
        # We are assuming that there is only one parsed sentence
        if full_tree is None:
            # Note the full tree of the first parsed paragraph
            full_tree = tree
        return TreeUtility._simplify_tree(tokens, tree)

    with Fast_Parser(verbose=False) as parser:
        pgs, stats, _ = TreeUtility._process_text(parser, session, text, all_names, xform)

    if (
        not pgs
        or stats["num_parsed"] == 0
        or not pgs[0]
        or any("err" in t for t in pgs[0][0])
    ):
        # The first sentence didn't parse: let's not beat around the bush with that fact
        return (None, None, stats)

    # Return the simplified tree, full tree and stats
    assert full_tree is not None
    return (pgs[0][0], full_tree, stats)

def tag_text(
    session: Session, text: str, all_names: bool = False
) -> Tuple[PgsList, StatsDict, Optional["RegisterType"]]:
    """ Parse plain text and return the parsed paragraphs as lists of
        sentences where each sentence is a list of tagged tokens """
    # Don't emit diagnostic messages
    with Fast_Parser(verbose=False) as parser:
        return TreeUtility.raw_tag_text(parser, session, text, all_names=all_names)

def parse_text_to_bracket_form(session: Session, text: str):
    """ Parse plain text and return the parsed paragraphs as bracketed strings """

    def xform(
        tokens: List[Tok], tree: Optional[Node], err_index: Optional[int]
    ) -> str:
        """ Transformation function that yields a simplified parse tree
            with POS-tagged, normalized terminal leaves for the sentence """
        if err_index is not None:
            # Return an empty string for sentences that don't parse
            return ""
        # Successfully parsed: obtain a simplified tree for the sentence
        result = []

        def push(node: Optional[CanonicalTokenDict]) -> None:
            """ Append information about a node to the result list """
            if node is None:
                return
            nonlocal result
            if node.get("k") == "NONTERMINAL":
                node = cast(SimpleTreeNode, node)
                result.append("(" + node.get("i", ""))
                # Recursively add the children of this nonterminal
                for child in node.get("p", []):
                    result.append(" ")
                    push(child)
                result.append(")")
            elif node.get("k") == "PUNCTUATION":
                pass  # Include punctuation?
                # If so, do something like:
                # result.append("(PUNCT |" + node["x"] + "|)")
            else:
                # Terminal: append the text
                result.append(node.get("x", "").replace(" ", "_"))

        # This uses a custom simplification scheme
        simple_tree = TreeUtility._simplify_tree(
            tokens,
            tree,
            nt_map=_TEST_NT_MAP,
            id_map=_TEST_ID_MAP,
            terminal_map=_TEST_TERMINAL_MAP,
        )
        push(simple_tree)
        return "".join(result)

    with Fast_Parser(verbose=False) as parser:
        # The cast(XformFunc, xform) type annotation is a hack
        pgs, stats, _ = TreeUtility._process_text(
            parser, session, text, all_names=False, xform=cast(XformFunc, xform)
        )
    # pgs is a list of paragraphs, each being a list of sentences
    # To access the first parsed sentence, use pgs[0][0]
    return (pgs, stats)

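# Self-contained sketch of the bracketing scheme implemented by push() above,
# expressed as a pure function over the "k"/"i"/"p"/"x" node keys that push()
# reads. The example node dict at the bottom is hypothetical data.
def _bracket(node) -> str:
    if node is None:
        return ""
    if node.get("k") == "NONTERMINAL":
        # Open a group named after the nonterminal, then add the children,
        # each preceded by a space
        children = "".join(" " + _bracket(child) for child in node.get("p", []))
        return "(" + node.get("i", "") + children + ")"
    if node.get("k") == "PUNCTUATION":
        # Punctuation is skipped, as in push()
        return ""
    # Terminal: return its text with spaces replaced by underscores
    return node.get("x", "").replace(" ", "_")

# _bracket({"k": "NONTERMINAL", "i": "S", "p": [{"k": "WORD", "x": "Páll"}]})
# yields "(S Páll)"
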
def raw_tag_toklist(session, toklist, root=None):
    """ Parse plain text and return the parsed paragraphs as lists of
        sentences where each sentence is a list of tagged tokens. The
        result does not include a name register. """

    def xform(tokens, tree, err_index):
        """ Transformation function that simply returns a list of
            POS-tagged, normalized tokens for the sentence """
        return TreeUtility.dump_tokens(tokens, tree, error_index=err_index)

    with Fast_Parser(verbose=False, root=root) as parser:
        return TreeUtility._process_toklist(parser, session, toklist, xform)

def _parse(toklist: Iterable[Tok]) -> Tuple[ResponseDict, Dict[int, str]]:
    """ Parse a token list as a query """
    bp = Query._parser
    assert bp is not None
    num_sent = 0
    num_parsed_sent = 0
    rdc = Reducer(bp.grammar)
    trees: Dict[int, str] = dict()
    sent: List[Tok] = []
    for t in toklist:
        if t[0] == TOK.S_BEGIN:
            if num_sent > 0:
                # A second sentence is beginning: this is not valid for a query
                raise ParseError("A query cannot contain more than one sentence")
            sent = []
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if not slen:
                continue
            num_sent += 1
            # Parse the accumulated sentence
            num = 0
            try:
                # Parse the sentence
                forest = bp.go(sent)
                if forest is not None:
                    num = Fast_Parser.num_combinations(forest)
                    if num > 1:
                        # Reduce the resulting forest
                        forest = rdc.go(forest)
            except ParseError:
                forest = None
                num = 0
            if num > 0:
                num_parsed_sent += 1
                # Obtain a text representation of the parse tree
                assert forest is not None
                trees[num_sent] = ParseForestDumper.dump_forest(forest)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
    result: ResponseDict = dict(num_sent=num_sent, num_parsed_sent=num_parsed_sent)
    return result, trees

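# Sketch of how a caller might consume _parse() (hypothetical helper, not part
# of the original code; it assumes _parse is a @staticmethod on the Query
# class). Since a query may contain at most one sentence, the tree of interest
# for a parsed query is trees.get(1).
def _first_query_tree(toklist: Iterable[Tok]) -> Optional[str]:
    result, trees = Query._parse(toklist)
    if result["num_parsed_sent"] == 0:
        return None
    return trees.get(1)
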
def parse_text(session, text, all_names=False):
    """ Parse plain text and return the parsed paragraphs as simplified trees """

    def xform(tokens, tree, err_index):
        """ Transformation function that yields a simplified parse tree
            with POS-tagged, normalized terminal leaves for the sentence """
        if err_index is not None:
            return TreeUtility.dump_tokens(tokens, tree, None, err_index)
        # Successfully parsed: return a simplified tree for the sentence
        return TreeUtility._simplify_tree(tokens, tree)

    with Fast_Parser(verbose=False) as parser:  # Don't emit diagnostic messages
        return TreeUtility._process_text(parser, session, text, all_names, xform)

def tag_toklist(session, toklist, all_names=False):
    """ Parse plain text and return the parsed paragraphs as lists of
        sentences where each sentence is a list of tagged tokens """

    def xform(tokens, tree, err_index):
        """ Transformation function that simply returns a list of
            POS-tagged, normalized tokens for the sentence """
        return TreeUtility.dump_tokens(tokens, tree, error_index=err_index)

    with Fast_Parser(verbose=False) as parser:  # Don't emit diagnostic messages
        pgs, stats = TreeUtility._process_toklist(parser, session, toklist, xform)
    # Imported locally, presumably to avoid a circular import at module level
    from queries.builtin import create_name_register
    register = create_name_register(toklist, session, all_names=all_names)
    return pgs, stats, register

def raw_tag_toklist(
    toklist: Iterable[Tok], root: Optional[str] = None
) -> Tuple[PgsList, StatsDict]:
    """ Parse plain text and return the parsed paragraphs as lists of
        sentences where each sentence is a list of tagged tokens. The
        result does not include a name register. """

    def xform(
        tokens: List[Tok], tree: Optional[Node], err_index: Optional[int]
    ) -> List[TokenDict]:
        """ Transformation function that simply returns a list of
            POS-tagged, normalized tokens for the sentence """
        return TreeUtility.dump_tokens(tokens, tree, error_index=err_index)

    with Fast_Parser(verbose=False, root=root) as parser:
        return TreeUtility._process_toklist(parser, toklist, xform)

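# Hypothetical usage of raw_tag_toklist: tag an already-tokenized query,
# parsing with the 'QueryRoot' grammar root (the root name is taken from the
# _QUERY_ROOT comment earlier in this file; the query text is illustrative).
def _demo_raw_tag() -> Tuple[PgsList, StatsDict]:
    return raw_tag_toklist(tokenize("Hver er forseti Íslands?"), root="QueryRoot")
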
def parse(self):
    """ Parse the sentence """
    num = 0
    score = 0
    try:
        forest = self._ip._parser.go(self._s)
        if forest is not None:
            num = Fast_Parser.num_combinations(forest)
            if num > 1:
                forest, score = self._ip._reducer.go_with_score(forest)
    except ParseError as e:
        # Reset num along with the forest, so that we don't report
        # a successful parse with a None tree
        forest = None
        num = 0
        self._err_index = e.token_index
    self._tree = forest
    self._score = score
    self._ip._add_sentence(self, num)
    return num > 0

def _parse(toklist):
    """ Parse a token list as a query """
    bp = Query._parser
    num_sent = 0
    num_parsed_sent = 0
    rdc = Reducer(bp.grammar)
    trees = dict()
    sent = []
    for t in toklist:
        if t[0] == TOK.S_BEGIN:
            sent = []
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if not slen:
                continue
            num_sent += 1
            # Parse the accumulated sentence
            num = 0
            try:
                # Parse the sentence
                forest = bp.go(sent)
                if forest is not None:
                    num = Fast_Parser.num_combinations(forest)
                    if num > 1:
                        # Reduce the resulting forest
                        forest = rdc.go(forest)
            except ParseError:
                forest = None
                num = 0
            if num > 0:
                num_parsed_sent += 1
                # Obtain a text representation of the parse tree
                trees[num_sent] = ParseForestDumper.dump_forest(forest)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
    result = dict(num_sent=num_sent, num_parsed_sent=num_parsed_sent)
    return result, trees

def parse_text(
    session: Session, text: str, all_names: bool = False
) -> Tuple[PgsList, StatsDict, Optional["RegisterType"]]:
    """ Parse plain text and return the parsed paragraphs as simplified trees """

    def xform(
        tokens: List[Tok], tree: Optional[Node], err_index: Optional[int]
    ) -> Union[None, List[TokenDict], CanonicalTokenDict]:
        """ Transformation function that yields a simplified parse tree
            with POS-tagged, normalized terminal leaves for the sentence """
        if err_index is not None:
            return TreeUtility.dump_tokens(tokens, tree, error_index=err_index)
        # Successfully parsed: return a simplified tree for the sentence
        return TreeUtility._simplify_tree(tokens, tree)

    with Fast_Parser(verbose=False) as parser:  # Don't emit diagnostic messages
        # The type annotation cast(XformFunc, xform) is a hack
        return TreeUtility._process_text(
            parser, session, text, all_names, cast(XformFunc, xform)
        )

def parse_text_with_full_tree(
    session: Session, text: str, all_names: bool = False
) -> Tuple[Optional[List[TokenDict]], Optional[Node], StatsDict]:
    """ Parse plain text, assumed to contain one sentence only,
        and return its simplified form as well as its full form. """

    full_tree: Optional[Node] = None

    def xform(
        tokens: List[Tok], tree: Optional[Node], err_index: Optional[int]
    ) -> Union[None, List[TokenDict], CanonicalTokenDict]:
        """ Transformation function that yields a simplified parse tree
            with POS-tagged, normalized terminal leaves for the sentence """
        if err_index is not None:
            return TreeUtility.dump_tokens(tokens, tree, error_index=err_index)
        # Successfully parsed: return a simplified tree for the sentence
        nonlocal full_tree
        # We are assuming that there is only one parsed sentence
        if full_tree is None:
            # Note the full tree of the first parsed paragraph
            full_tree = tree
        return TreeUtility._simplify_tree(tokens, tree)

    with Fast_Parser(verbose=False) as parser:
        # The cast(XformFunc, xform) type annotation is a hack
        pgs, stats, _ = TreeUtility._process_text(
            parser, session, text, all_names, cast(XformFunc, xform)
        )

    if (
        not pgs
        or stats["num_parsed"] == 0
        or not pgs[0]
        or any("err" in t for t in pgs[0][0])
    ):
        # The first sentence didn't parse: let's not beat around the bush with that fact
        return (None, None, stats)

    # Return the simplified tree, full tree and stats
    assert full_tree is not None
    return (pgs[0][0], full_tree, stats)

def _make_tree(text: str) -> Tuple[Tree, str]:
    """ Tokenize and parse text, create tree representation string
        from all the parse trees, return Tree object and token JSON. """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    pgs = []
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        pgs.append([])
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse: " + sent.text
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            assert sent.tree is not None
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
            )
            pgs[-1].append(token_dicts)
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())
    tokens_json = json.dumps(pgs, separators=(",", ":"), ensure_ascii=False)
    tree = Tree()
    tree.load(tree_string)
    return tree, tokens_json

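# Usage sketch for _make_tree above (hypothetical driver): the function
# returns both the loaded Tree and a compact JSON dump of the tagged tokens,
# grouped by paragraph and sentence. The sample text is illustrative only.
def _demo_make_tree_with_tokens() -> None:
    tree, tokens_json = _make_tree("Páll fór út með hundinn.")
    assert tokens_json.startswith("[")  # a JSON list of paragraphs
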
        else:
            raise
    finally:
        ArticleProxy.cleanup()
        BIN_Db.cleanup()

else:
    app.config["PRODUCTION"] = True

    # Suppress information log messages from Werkzeug
    werkzeug_log = logging.getLogger("werkzeug")
    if werkzeug_log:
        werkzeug_log.setLevel(logging.WARNING)

    # Log our startup
    log_str = (
        "Greynir instance starting with "
        "host={0}:{1}, db_host={2}:{3} on Python {4}".format(
            Settings.HOST,
            Settings.PORT,
            Settings.DB_HOSTNAME,
            Settings.DB_PORT,
            sys.version.replace("\n", " "),
        )
    )
    logging.info(log_str)
    print(log_str)
    sys.stdout.flush()

    # Running as a server module: pre-load the grammar into memory
    with Fast_Parser() as fp:
        pass

def test_entities():
    text = """
       Ég skipti við flugfélagið AirBerlin áður en það varð gjaldþrota.
       Danska byggingavörukeðjan Bygma hefur keypt íslenska verslunarfyrirtækið Húsasmiðjuna.
       Bandarísku fjárfestingarsjóðirnir Attestor Capital og Goldman Sachs eru hluthafar í Arion banka.
       Fosshótel, stór hótelkeðja, var rekin með tapi í fyrra.
       Lax, stór fiskur af ætt laxfiska, er veiddur í íslenskum ám.
       Silfraður lax, fiskur af ætt laxfiska, er veiddur í íslenskum ám.
       Ég ræddi við fulltrúa Norðuráls (álverksmiðjunnar í Hvalfirði) í gær.
       Ég ræddi við fulltrúa Norðuráls (í Hvalfirði) í gær.
       Primera Air var íslenskt flugfélag.
       Ef veðrið er gott þá fullyrði ég að Primera Air sé danskt flugfélag.
       Villeneuve-Loubet er franskt þorp.
       Það er hægt að fá bragðgóðan ís í ísbúðinni Valdísi úti á Granda.
       Í miðbæ Reykjavíkur er herrafataverslunin Geysir.
       Mér er sagt að Geysir sé hættur að gjósa.
       Geysir er hættur að gjósa.
       Geysir er gamall goshver.
       Fyrirtækið Apple-búðin selur Apple Mac tölvur.
       Fyrirtækið Origo selur IBM tölvur.
       Íslendingar stofnuðu skipafélagið Eimskipafélag Íslands hf.
    """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse"
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
            )
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())

    tree = Tree()
    tree.load(tree_string)

    session = SessionShim()
    tree.process(session, entities)

    session.check(("Bygma", "er", "dönsk byggingavörukeðja"))
    session.check(("Húsasmiðjan", "er", "íslenskt verslunarfyrirtæki"))
    session.check(("Goldman Sachs", "er", "bandarískur fjárfestingarsjóður"))
    session.check(("Attestor Capital", "er", "bandarískur fjárfestingarsjóður"))
    session.check(("Primera Air", "var", "íslenskt flugfélag"))
    session.check(("Villeneuve-Loubet", "er", "franskt þorp"))
    session.check(("Valdís", "er", "ísbúð"))
    session.check(("Fosshótel", "var", "rekin með tapi"))
    session.check(("Fosshótel", "er", "stór hótelkeðja"))
    session.check(("Norðurál", "er", "álverksmiðjan í Hvalfirði"))
    session.check(("Lax", "er", "stór fiskur af ætt laxfiska"))
    session.check(("Geysir", "er", "gamall goshver"))
    session.check(("Eimskipafélag Íslands hf", "er", "skipafélag"))
    session.check(("Origo", "er", "fyrirtæki"))
    session.check(("AirBerlin", "er", "flugfélag"))
    assert session.is_empty()

def _init_class(cls):
    """ Initialize class attributes """
    if cls._parser is None:
        # Don't emit diagnostic messages
        cls._parser = Fast_Parser(verbose=False)

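# Rationale note: constructing a Fast_Parser loads the grammar into memory
# (compare the server startup code above, which pre-loads it for the same
# reason), so the instance is created once and cached as a class attribute
# rather than per query. A hypothetical call site:
#
#   Query._init_class()
#   assert Query._parser is not None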