def postag_api(version=1):
    """ API to parse text and return POS tagged tokens in a verbose JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")

    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")

    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.tag_text(session, text, all_names=True)
        # Amalgamate the result into a single list of sentences
        if pgs:
            # Only process the first paragraph, if there are many of them
            if len(pgs) == 1:
                pgs = pgs[0]
            else:
                # More than one paragraph: gotta concatenate 'em all
                pa = []
                for pg in pgs:
                    pa.extend(pg)
                pgs = pa
        for sent in pgs:
            # Transform the token representation into a
            # nice canonical form for outside consumption
            # err = any("err" in t for t in sent)
            for t in sent:
                canonicalize_token(t)

    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True, result=pgs, stats=stats, register=register)
def parse_api(version=1):
    """ API to parse text and return POS tagged tokens in JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")

    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")

    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.parse_text(session, text, all_names=True)
        # In this case, we should always get a single paragraph back
        if pgs:
            # Only process the first paragraph, if there are many of them
            if len(pgs) == 1:
                pgs = pgs[0]
            else:
                # More than one paragraph: gotta concatenate 'em all
                pa = []
                for pg in pgs:
                    pa.extend(pg)
                pgs = pa

    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True, result=pgs, stats=stats, register=register)
def postag_api(version=1):
    """ API to parse text and return POS tagged tokens in a verbose JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")

    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")

    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.tag_text(session, text, all_names=True)
        # Amalgamate the result into a single list of sentences
        pa: List[List[TokenDict]] = []
        if pgs:
            # Only process the first paragraph, if there are many of them
            if len(pgs) == 1:
                pa = pgs[0]
            else:
                # More than one paragraph: gotta concatenate 'em all
                for pg in pgs:
                    pa.extend(pg)
        for sent in pa:
            # Transform the token representation into a
            # nice canonical form for outside consumption
            # err = any("err" in t for t in sent)
            for t in sent:
                canonicalize_token(t)

    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True, result=pa, stats=stats, register=register)
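# Usage sketch for the API endpoints above, assuming they are registered as
# Flask routes (the route path and form field name here are illustrative
# assumptions, not confirmed by this code):
#
#   client = app.test_client()
#   resp = client.post("/postag.api/v1", data={"text": "Hér er texti."})
#   j = resp.get_json()
#   assert j["valid"] and "result" in j and "stats" in j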
def make_tree(text: str) -> Tree:
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse: " + sent.text
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
            )
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())
    tree = Tree()
    tree.load(tree_string)
    return tree
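# Usage sketch for make_tree() (a hedged example; it assumes the parser
# grammar and settings have already been initialized, and that the text
# parses cleanly):
#
#   tree = make_tree("Hér er prófun á þáttun texta.")
#   # tree is a Tree instance loaded from a string of the form
#   # "S1\nC<score>\nL<num_tokens>\n<parse tree dump>\n" per sentence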
def analyze_api(version=1):
    """ Analyze text manually entered by the user, i.e. not coming
        from an article. This is a lower level API used by the
        Greynir web front-end. """
    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")
    # try:
    text = text_from_request(request)
    # except:
    #     return better_jsonify(valid=False, reason="Invalid request")
    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.tag_text(session, text, all_names=True)
    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True, result=pgs, stats=stats, register=register)
def gen_tokens():
    """ Generate a Greynir token sequence from a tagging result """
    ix = 0
    for t in toklist:
        if not t.txt:
            continue
        # The code below should correspond to TreeUtility._describe_token()
        d = dict(x=t.txt)
        if t.kind == TOK.WORD:
            # Set d["m"] to the meaning
            pass
        else:
            d["k"] = t.kind
        if t.val is not None and t.kind not in {
            TOK.WORD,
            TOK.ENTITY,
            TOK.PUNCTUATION,
        }:
            # For tokens except words, entities and punctuation,
            # include the val field
            if t.kind == TOK.PERSON:
                d["v"], d["g"] = TreeUtility.choose_full_name(
                    t.val, case=None, gender=None
                )
            else:
                d["v"] = t.val
        if t.kind in {
            TOK.WORD,
            TOK.ENTITY,
            TOK.PERSON,
            TOK.NUMBER,
            TOK.YEAR,
            TOK.ORDINAL,
            TOK.PERCENT,
        }:
            d["i"] = tags[ix]
            ix += 1
        if t.kind == TOK.WORD and " " in d["x"]:
            # Some kind of phrase: split it
            xlist = d["x"].split()
            for x in xlist:
                d["x"] = x
                if x == "og":
                    # Probably an intermediate word: fjármála- og efnahagsráðherra
                    yield dict(x="og", i="c")
                else:
                    yield d.copy()
        elif t.kind == TOK.PERSON:
            # Split person tokens into subtokens for each name component
            xlist = d["x"].split()  # Name as it originally appeared
            slist = d["v"].split()  # Stem (nominal) form of name
            # xlist may be shorter than slist, but that is OK
            for x, s in zip(xlist, slist):
                d["x"] = x
                d["v"] = s
                yield d.copy()
        elif t.kind == TOK.ENTITY:
            # Split entity tokens into subtokens for each name component
            xlist = d["x"].split()  # Name as it originally appeared
            for x in xlist:
                d["x"] = x
                yield d.copy()
        # !!! TBD: Tokens such as dates, amounts and currencies
        # !!! should be split here into multiple subtokens
        else:
            yield d
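# Illustrative example of the phrase-splitting logic above (a sketch; the
# actual "i" tag values depend on the tagger output in `tags`): a single
# WORD token "fjármála- og efnahagsráðherra" yields three subtokens, with
# the connective "og" emitted as a standalone conjunction and the other
# parts sharing the tag assigned to the full token:
#
#   {"x": "fjármála-", "i": <tag of the full token>}
#   {"x": "og", "i": "c"}
#   {"x": "efnahagsráðherra", "i": <tag of the full token>}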
def launch_search(query, session, qkey):
    """ Launch a search with the given search terms """
    pgs, _ = TreeUtility.raw_tag_toklist(
        session,
        query.token_list,
        # root=_QUERY_ROOT
    )

    # Collect the list of search terms
    terms = []
    tweights = []
    fixups = []
    for pg in pgs:
        for sent in pg:
            for t in sent:
                # Obtain search stems for the tokens.
                d = dict(x=t["x"], w=0.0)
                tweights.append(d)
                # The terms are represented as (stem, category) tuples.
                stems = stems_of_token(t)
                if stems:
                    terms.extend(stems)
                    fixups.append((d, len(stems)))

    assert sum(n for _, n in fixups) == len(terms)

    if Settings.DEBUG:
        print("Terms are:\n   {0}".format(terms))

    # Launch the search and return the answers, as well as the
    # search terms augmented with information about
    # whether and how they were used
    result = Search.list_similar_to_terms(session, terms, _MAXLEN_SEARCH)

    if "weights" not in result or not result["weights"]:
        # Probably unable to connect to the similarity server
        raise RuntimeError("Unable to connect to the similarity server")

    weights = result["weights"]
    assert len(weights) == len(terms)

    # Insert the weights at the proper places in the
    # token weight list
    index = 0
    for d, n in fixups:
        d["w"] = sum(weights[index : index + n]) / n
        index += n

    return dict(answers=result["articles"], weights=tweights)
def launch_search(query, session, qkey):
    """ Launch a search with the given search terms """
    pgs, stats = TreeUtility.raw_tag_toklist(
        session, query.token_list(), root=_QUERY_ROOT
    )

    # Collect the list of search terms
    terms = []
    tweights = []
    fixups = []
    for pg in pgs:
        for sent in pg:
            for t in sent:
                # Obtain search stems for the tokens.
                d = dict(x=t["x"], w=0.0)
                tweights.append(d)
                # The terms are represented as (stem, category) tuples.
                stems = stems_of_token(t)
                if stems:
                    terms.extend(stems)
                    fixups.append((d, len(stems)))

    assert sum(n for _, n in fixups) == len(terms)

    if Settings.DEBUG:
        print("Terms are:\n   {0}".format(terms))

    # Launch the search and return the answers, as well as the
    # search terms augmented with information about
    # whether and how they were used
    result = Search.list_similar_to_terms(session, terms, _MAXLEN_SEARCH)

    weights = result["weights"]
    assert len(weights) == len(terms)

    # Insert the weights at the proper places in the
    # token weight list
    index = 0
    for d, n in fixups:
        d["w"] = sum(weights[index : index + n]) / n
        index += n

    return dict(answers=result["articles"], weights=tweights)
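# Minimal, self-contained sketch of the weight-redistribution step above
# (illustrative only; the real weights come from Search.list_similar_to_terms).
# Each token contributed `n` stems to the search; its displayed weight is
# the mean of the weights assigned to those stems.

def _distribute_weights(fixups, weights):
    """ Assign each token dict the mean weight of its stems """
    index = 0
    for d, n in fixups:
        d["w"] = sum(weights[index : index + n]) / n
        index += n

# Example: two tokens contributing 1 and 2 stems, respectively
_d1, _d2 = dict(x="hestur", w=0.0), dict(x="rauður", w=0.0)
_distribute_weights([(_d1, 1), (_d2, 2)], [0.9, 0.2, 0.4])
assert _d1["w"] == 0.9 and abs(_d2["w"] - 0.3) < 1e-9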
def fá_þáttun(self, setningar):
    þáttun = []
    for setning in setningar:
        with SessionContext(read_only=True) as session:
            pgs, stats = tu.parse_text_to_bracket_form(session, setning)
            if len(pgs[0]) > 1:
                # Parsed as multiple sentences; we want to merge them
                allar = ""
                for pg in pgs:
                    for þáttuð_setning in pg:
                        allar = allar + þáttuð_setning
                hrein_þáttun = self.forvinnsla(allar)
                þáttun.append(hrein_þáttun)
                continue
            for pg in pgs:
                if not pg[0]:
                    # Empty sentence
                    # Default basic analysis of a sentence --
                    # changed so that Evalb can handle it!
                    þáttun.append("(M (S x))")
                    continue
                for þáttuð_setning in pg:
                    # Clean up the sentence
                    hrein_þáttun = self.forvinnsla(þáttuð_setning)
                    þáttun.append(hrein_þáttun)
    return þáttun
def _make_tree(text: str) -> Tuple[Tree, str]:
    """ Tokenize and parse text, create a tree representation string
        from all the parse trees, and return a Tree object and token JSON. """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    pgs = []
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        pgs.append([])
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse: " + sent.text
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            assert sent.tree is not None
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
            )
            pgs[-1].append(token_dicts)
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())
    tokens_json = json.dumps(pgs, separators=(",", ":"), ensure_ascii=False)
    tree = Tree()
    tree.load(tree_string)
    return tree, tokens_json
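# Usage sketch (hedged; assumes the same initialization as make_tree above):
#
#   tree, tokens_json = _make_tree("Ég bý í Baugatanga 6.")
#   pgs = json.loads(tokens_json)  # paragraphs -> sentences -> token dicts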
def fá_þáttun(self, setningar):
    þáttun = []
    for setning in setningar:
        with SessionContext(read_only=True) as session:
            pgs, stats = tu.parse_text_to_bracket_form(session, setning)
            if len(pgs[0]) > 1:
                # Parsed as multiple sentences; we want to merge them
                allar = ""
                for pg in pgs:
                    for þáttuð_setning in pg:
                        allar = allar + þáttuð_setning
                hrein_þáttun = self.forvinnsla(allar)
                þáttun.append(hrein_þáttun)
                continue
            for pg in pgs:
                if not pg[0]:
                    # Empty sentence
                    # Default basic analysis of a sentence --
                    # changed so that Evalb can handle it!
                    þáttun.append("(M (S x))")
                    continue
                for þáttuð_setning in pg:
                    # Clean up the sentence
                    hrein_þáttun = self.forvinnsla(þáttuð_setning)
                    þáttun.append(hrein_þáttun)
    return þáttun
#!/usr/bin/env python

import os
import sys

# Hack to make this Python program executable from the utils subdirectory
basepath, _ = os.path.split(os.path.realpath(__file__))
_UTILS = os.sep + "utils"
if basepath.endswith(_UTILS):
    basepath = basepath[0:-len(_UTILS)]
    sys.path.append(basepath)

from settings import Settings
from db import SessionContext
from treeutil import TreeUtility as tu

Settings.read(os.path.join(basepath, "config", "Reynir.conf"))
Settings.DEBUG = False

TEXT = "Ég bý í Baugatanga 6. Hér er prófun á þáttun texta."

with SessionContext(read_only=True) as session:
    pgs, stats = tu.parse_text_to_bracket_form(session, TEXT)
    for pg in pgs:
        for sent in pg:
            print(sent)
def tree_grid():
    """ Show a simplified parse tree for a single sentence """
    txt = request.args.get("txt", "")
    with SessionContext(commit=True) as session:
        # Obtain simplified tree, full tree and stats
        tree, full_tree, stats = TreeUtility.parse_text_with_full_tree(session, txt)
    if full_tree is not None:
        # Create a more manageable, flatter tree from the binarized raw parse tree
        full_tree = ParseForestFlattener.flatten(full_tree)

    # Preprocess the trees for display, projecting them to a 2d table structure
    def _wrap_build_tbl(tbl, root, is_nt_func, children_func, nt_info_func, t_info_func):
        def _build_tbl(level, offset, nodelist):
            """ Add the tree node data to be displayed at a particular
                level (row) in the result table """
            while len(tbl) <= level:
                tbl.append([])
            tlevel = tbl[level]
            left = sum(t[0] for t in tlevel)
            while left < offset:
                # Insert a left margin if required
                # (necessary if we've already inserted a terminal at a
                # level above this one)
                tlevel.append((1, None))
                left += 1
            index = offset
            if nodelist:
                for n in nodelist:
                    if is_nt_func(n):
                        # Nonterminal: display the child nodes in deeper levels
                        # and add a header on top of them, spanning their total width
                        cnt = _build_tbl(level + 1, index, children_func(n))
                        tlevel.append((cnt, nt_info_func(n)))
                        index += cnt
                    else:
                        # Terminal: display it in a single column
                        tlevel.append((1, t_info_func(n)))
                        index += 1
            return index - offset

        return _build_tbl(0, 0, [root])

    def _normalize_tbl(tbl, width):
        """ Fill out the table with blanks so that it is square """
        for row in tbl:
            rw = sum(t[0] for t in row)
            # Right-pad as required
            while rw < width:
                row.append((1, None))
                rw += 1

    tbl = []
    full_tbl = []
    if tree is None:
        full_tree = None
        width = 0
        full_width = 0
        height = 0  # Height of simplified table
        full_height = 0  # Height of full table
    else:
        # Build a table structure for a simplified tree
        width = _wrap_build_tbl(
            tbl,
            tree,
            is_nt_func=lambda n: n["k"] == "NONTERMINAL",
            children_func=lambda n: n["p"],
            nt_info_func=lambda n: dict(n=n["n"], error=False),
            t_info_func=lambda n: n,
        )
        height = len(tbl)
        if width and height:
            _normalize_tbl(tbl, width)
        # Build a table structure for a full tree
        full_width = _wrap_build_tbl(
            full_tbl,
            full_tree,
            is_nt_func=lambda n: n.is_nonterminal,
            children_func=lambda n: n.children,
            nt_info_func=lambda n: dict(n=n.p.name, sc=n.score, error=n.p.has_tag("error")),
            t_info_func=lambda n: dict(t=n.p[0].name, sc=n.score, x=n.p[1].t1),
        )
        assert full_width == width
        full_height = len(full_tbl)
        if full_width and full_height:
            _normalize_tbl(full_tbl, full_width)

    return render_template(
        "treegrid.html",
        txt=txt,
        tree=tree,
        stats=stats,
        tbl=tbl,
        height=height,
        full_tbl=full_tbl,
        full_height=full_height,
    )
def test_entities():
    text = """
       Ég skipti við flugfélagið AirBerlin áður en það varð gjaldþrota.
       Danska byggingavörukeðjan Bygma hefur keypt íslenska verslunarfyrirtækið Húsasmiðjuna.
       Bandarísku fjárfestingarsjóðirnir Attestor Capital og Goldman Sachs eru hluthafar í Arion banka.
       Fosshótel, stór hótelkeðja, var rekin með tapi í fyrra.
       Lax, stór fiskur af ætt laxfiska, er veiddur í íslenskum ám.
       Silfraður lax, fiskur af ætt laxfiska, er veiddur í íslenskum ám.
       Ég ræddi við fulltrúa Norðuráls (álverksmiðjunnar í Hvalfirði) í gær.
       Ég ræddi við fulltrúa Norðuráls (í Hvalfirði) í gær.
       Primera Air var íslenskt flugfélag.
       Ef veðrið er gott þá fullyrði ég að Primera Air sé danskt flugfélag.
       Villeneuve-Loubet er franskt þorp.
       Það er hægt að fá bragðgóðan ís í ísbúðinni Valdísi úti á Granda.
       Í miðbæ Reykjavíkur er herrafataverslunin Geysir.
       Mér er sagt að Geysir sé hættur að gjósa.
       Geysir er hættur að gjósa.
       Geysir er gamall goshver.
       Fyrirtækið Apple-búðin selur Apple Mac tölvur.
       Fyrirtækið Origo selur IBM tölvur.
       Íslendingar stofnuðu skipafélagið Eimskipafélag Íslands hf.
    """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse"
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
            )
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())
    tree = Tree()
    tree.load(tree_string)
    session = SessionShim()
    tree.process(session, entities)
    session.check(("Bygma", "er", "dönsk byggingavörukeðja"))
    session.check(("Húsasmiðjan", "er", "íslenskt verslunarfyrirtæki"))
    session.check(("Goldman Sachs", "er", "bandarískur fjárfestingarsjóður"))
    session.check(("Attestor Capital", "er", "bandarískur fjárfestingarsjóður"))
    session.check(("Primera Air", "var", "íslenskt flugfélag"))
    session.check(("Villeneuve-Loubet", "er", "franskt þorp"))
    session.check(("Valdís", "er", "ísbúð"))
    session.check(("Fosshótel", "var", "rekin með tapi"))
    session.check(("Fosshótel", "er", "stór hótelkeðja"))
    session.check(("Norðurál", "er", "álverksmiðjan í Hvalfirði"))
    session.check(("Lax", "er", "stór fiskur af ætt laxfiska"))
    session.check(("Geysir", "er", "gamall goshver"))
    session.check(("Eimskipafélag Íslands hf", "er", "skipafélag"))
    session.check(("Origo", "er", "fyrirtæki"))
    session.check(("AirBerlin", "er", "flugfélag"))
    assert session.is_empty()
def tree_grid():
    """ Show a simplified parse tree for a single sentence """
    txt = request.args.get("txt", "")
    with SessionContext(commit=True) as session:
        # Obtain simplified tree, full tree and stats
        tree, full_tree, stats = TreeUtility.parse_text_with_full_tree(session, txt)
    if full_tree is not None:
        # Create a more manageable, flatter tree from the binarized raw parse tree
        full_tree = ParseForestFlattener.flatten(full_tree)

    # Preprocess the trees for display, projecting them to a 2d table structure
    def _wrap_build_tbl(
        tbl, root, is_nt_func, children_func, nt_info_func, t_info_func
    ):
        def _build_tbl(level, offset, nodelist):
            """ Add the tree node data to be displayed at a particular
                level (row) in the result table """
            while len(tbl) <= level:
                tbl.append([])
            tlevel = tbl[level]
            left = sum(t[0] for t in tlevel)
            while left < offset:
                # Insert a left margin if required
                # (necessary if we've already inserted a terminal at a
                # level above this one)
                tlevel.append((1, None))
                left += 1
            index = offset
            if nodelist is not None:
                for n in nodelist:
                    if is_nt_func(n):
                        # Nonterminal: display the child nodes in deeper levels
                        # and add a header on top of them, spanning their total width
                        cnt = _build_tbl(level + 1, index, children_func(n))
                        tlevel.append((cnt, nt_info_func(n)))
                        index += cnt
                    else:
                        # Terminal: display it in a single column
                        tlevel.append((1, t_info_func(n)))
                        index += 1
            return index - offset

        return _build_tbl(0, 0, [root])

    def _normalize_tbl(tbl, width):
        """ Fill out the table with blanks so that it is square """
        for row in tbl:
            rw = sum(t[0] for t in row)
            # Right-pad as required
            while rw < width:
                row.append((1, None))
                rw += 1

    tbl = []
    full_tbl = []
    if tree is None:
        full_tree = None
        width = 0
        full_width = 0
        height = 0  # Height of simplified table
        full_height = 0  # Height of full table
    else:
        # Build a table structure for a simplified tree
        width = _wrap_build_tbl(
            tbl,
            tree,
            is_nt_func=lambda n: n["k"] == "NONTERMINAL",
            children_func=lambda n: n["p"],
            nt_info_func=lambda n: dict(n=n["n"], error=False),
            t_info_func=lambda n: n,
        )
        height = len(tbl)
        if width and height:
            _normalize_tbl(tbl, width)
        # Build a table structure for a full tree
        full_width = _wrap_build_tbl(
            full_tbl,
            full_tree,
            is_nt_func=lambda n: n.is_nonterminal,
            children_func=lambda n: n.children,
            nt_info_func=lambda n: dict(n=n.p.name, error=n.p.has_tag("error")),
            t_info_func=lambda n: dict(t=n.p[0].name, x=n.p[1].t1),
        )
        assert full_width == width
        full_height = len(full_tbl)
        if full_width and full_height:
            _normalize_tbl(full_tbl, full_width)

    return render_template(
        "treegrid.html",
        txt=txt,
        tree=tree,
        stats=stats,
        tbl=tbl,
        height=height,
        full_tbl=full_tbl,
        full_height=full_height,
    )
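# Illustrative sketch of the 2D table that _build_tbl produces: each row is a
# list of (span, info) cells, where span is a column count and info is None
# for padding. For a tree S -> (NP, VP) whose nonterminals each cover one
# terminal, the result would be (cell field contents are assumptions shown
# for clarity only):
#
#   tbl == [
#       [(2, {"n": "S", "error": False})],               # S spans both columns
#       [(1, {"n": "NP", ...}), (1, {"n": "VP", ...})],  # one header per child
#       [(1, <token dict>), (1, <token dict>)],          # terminal row
#   ]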
#!/usr/bin/env python

import os
import sys

# Hack to make this Python program executable from the utils subdirectory
basepath, _ = os.path.split(os.path.realpath(__file__))
_UTILS = os.sep + "utils"
if basepath.endswith(_UTILS):
    basepath = basepath[0:-len(_UTILS)]
    sys.path.append(basepath)

from settings import Settings
from db import SessionContext
from treeutil import TreeUtility as tu

Settings.read(os.path.join(basepath, "config", "Greynir.conf"))
Settings.DEBUG = False

TEXT = "Ég bý í Baugatanga 6. Hér er prófun á þáttun texta."

with SessionContext(read_only=True) as session:
    pgs, stats = tu.parse_text_to_bracket_form(session, TEXT)
    for pg in pgs:
        for sent in pg:
            print(sent)
def _parse(self, enclosing_session=None, verbose=False):
    """ Parse the article content to yield parse trees and an annotated token list """
    with SessionContext(enclosing_session) as session:
        # Convert the content soup to a token iterable (generator)
        toklist = Fetcher.tokenize_html(self._url, self._html, session)
        bp = self.get_parser()
        ip = IncrementalParser(bp, toklist, verbose=verbose)
        # List of paragraphs containing a list of sentences containing
        # token lists for sentences in string dump format
        # (1-based paragraph and sentence indices)
        pgs = []
        # Dict of parse trees in string dump format,
        # stored by sentence index (1-based)
        trees = OrderedDict()
        # Word stem dictionary, indexed by (stem, cat)
        words = defaultdict(int)
        num_sent = 0
        for p in ip.paragraphs():
            pgs.append([])
            for sent in p.sentences():
                num_sent += 1
                num_tokens = len(sent)
                # We don't attempt to parse very long sentences (>100 tokens)
                # since they are memory intensive (>16 GB) and may take
                # minutes to process
                if num_tokens <= MAX_SENTENCE_TOKENS and sent.parse():
                    # Obtain a text representation of the parse tree
                    token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree, words)
                    # Create a verbose text representation of
                    # the highest scoring parse tree
                    tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
                    # Add information about the sentence tree's score
                    # and the number of tokens
                    trees[num_sent] = "\n".join(
                        ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
                    )
                else:
                    # Error, sentence too long or no parse:
                    # add an error index entry for this sentence
                    if num_tokens > MAX_SENTENCE_TOKENS:
                        # Set the error index at the first
                        # token outside the maximum limit
                        eix = MAX_SENTENCE_TOKENS
                    else:
                        eix = sent.err_index
                    token_dicts = TreeUtility.dump_tokens(sent.tokens, None, None, eix)
                    trees[num_sent] = "E{0}".format(eix)
                pgs[-1].append(token_dicts)
        # parse_time = ip.parse_time
        self._parsed = datetime.utcnow()
        self._parser_version = bp.version
        self._num_tokens = ip.num_tokens
        self._num_sentences = ip.num_sentences
        self._num_parsed = ip.num_parsed
        self._ambiguity = ip.ambiguity
        # Make one big JSON string for the paragraphs, sentences and tokens
        self._raw_tokens = pgs
        self._tokens = json.dumps(pgs, separators=(",", ":"), ensure_ascii=False)
        # Keep the bag of words (stem, category, count for each word)
        self._words = words
        # Create a tree representation string out of all the accumulated parse trees
        self._tree = "".join(
            "S{0}\n{1}\n".format(key, val) for key, val in trees.items()
        )
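# Illustrative sketch of the accumulated tree string format assembled above:
# one "S<index>" block per sentence, each holding either a scored parse
# ("C<score>", "L<token count>", then the tree dump) or "E<index>" marking
# the error position of a failed parse:
#
#   S1
#   C12
#   L7
#   <parse tree dump for sentence 1>
#   S2
#   E3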