def make_tree(text: str) -> Tree:
    """Tokenize and parse ``text`` and return a ``Tree`` loaded with every sentence.

    Every sentence is required to parse; if one does not, an
    AssertionError carrying the sentence text is raised.
    """
    token_stream = tokenize(text)
    parser = Fast_Parser(verbose=False)
    job = IncrementalParser(parser, token_stream, verbose=False)

    # Sentences are numbered consecutively (1-based) across all paragraphs
    all_sentences = (s for para in job.paragraphs() for s in para.sentences())

    # One "S<index>" record per sentence, in document order
    records = []
    for index, sent in enumerate(all_sentences, start=1):
        token_count = len(sent)
        assert sent.parse(), "Sentence does not parse: " + sent.text
        # Token dump for the sentence, fed into the tree dump below
        dumped_tokens = TreeUtility.dump_tokens(sent.tokens, sent.tree)
        # Verbose text form of the sentence's parse tree
        forest_text = ParseForestDumper.dump_forest(
            sent.tree, token_dicts=dumped_tokens
        )
        # Prefix the tree dump with the score (C) and token count (L) lines
        body = "\n".join(
            ("C{0}".format(sent.score), "L{0}".format(token_count), forest_text)
        )
        records.append("S{0}\n{1}\n".format(index, body))

    result = Tree()
    result.load("".join(records))
    return result
def _make_tree(text: str) -> "tuple[Tree, str]":
    """Tokenize and parse text, create a tree representation string from
    all the parse trees, and return a ``(tree, tokens_json)`` tuple.

    ``tree`` is a Tree object loaded from the accumulated tree string.
    ``tokens_json`` is a compact JSON string holding, per paragraph, a list
    of per-sentence token dumps as produced by TreeUtility.dump_tokens().

    Raises AssertionError if any sentence fails to parse.
    """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    # List of paragraphs, each a list of per-sentence token dumps
    pgs = []
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        # Every paragraph gets an entry, even if it yields no sentences
        pgs.append([])
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse: " + sent.text
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            assert sent.tree is not None
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
            )
            pgs[-1].append(token_dicts)
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())
    tokens_json = json.dumps(pgs, separators=(",", ":"), ensure_ascii=False)
    tree = Tree()
    tree.load(tree_string)
    return tree, tokens_json
def test_entities():
    """Parse a fixed Icelandic text, run the ``entities`` processor over the
    resulting tree and verify the triples collected in the session shim."""
    text = """ Ég skipti við flugfélagið AirBerlin áður en það varð gjaldþrota. Danska byggingavörukeðjan Bygma hefur keypt íslenska verslunarfyrirtækið Húsasmiðjuna. Bandarísku fjárfestingarsjóðirnir Attestor Capital og Goldman Sachs eru hluthafar í Arion banka. Fosshótel, stór hótelkeðja, var rekin með tapi í fyrra. Lax, stór fiskur af ætt laxfiska, er veiddur í íslenskum ám. Silfraður lax, fiskur af ætt laxfiska, er veiddur í íslenskum ám. Ég ræddi við fulltrúa Norðuráls (álverksmiðjunnar í Hvalfirði) í gær. Ég ræddi við fulltrúa Norðuráls (í Hvalfirði) í gær. Primera Air var íslenskt flugfélag. Ef veðrið er gott þá fullyrði ég að Primera Air sé danskt flugfélag. Villeneuve-Loubet er franskt þorp. Það er hægt að fá bragðgóðan ís í ísbúðinni Valdísi úti á Granda. Í miðbæ Reykjavíkur er herrafataverslunin Geysir. Mér er sagt að Geysir sé hættur að gjósa. Geysir er hættur að gjósa. Geysir er gamall goshver. Fyrirtækið Apple-búðin selur Apple Mac tölvur. Fyrirtækið Origo selur IBM tölvur. Íslendingar stofnuðu skipafélagið Eimskipafélag Íslands hf. 
"""
    token_stream = tokenize(text)
    parser = Fast_Parser(verbose=False)
    job = IncrementalParser(parser, token_stream, verbose=False)

    # Sentence dumps keyed by consecutive 1-based sentence index
    dumps = OrderedDict()
    all_sentences = (s for para in job.paragraphs() for s in para.sentences())
    for index, sent in enumerate(all_sentences, start=1):
        token_count = len(sent)
        assert sent.parse(), "Sentence does not parse"
        # Token dump for the sentence, fed into the tree dump below
        dumped_tokens = TreeUtility.dump_tokens(sent.tokens, sent.tree)
        # Verbose text form of the sentence's parse tree
        forest_text = ParseForestDumper.dump_forest(
            sent.tree, token_dicts=dumped_tokens
        )
        # Score (C) and token count (L) lines precede the tree dump
        header = ["C{0}".format(sent.score), "L{0}".format(token_count)]
        dumps[index] = "\n".join(header + [forest_text])

    # Concatenate the per-sentence dumps into one loadable string
    pieces = ["S{0}\n{1}\n".format(index, dump) for index, dump in dumps.items()]
    tree = Tree()
    tree.load("".join(pieces))

    session = SessionShim()
    tree.process(session, entities)

    # Triples handed to session.check(), in this exact order
    expected = (
        ("Bygma", "er", "dönsk byggingavörukeðja"),
        ("Húsasmiðjan", "er", "íslenskt verslunarfyrirtæki"),
        ("Goldman Sachs", "er", "bandarískur fjárfestingarsjóður"),
        ("Attestor Capital", "er", "bandarískur fjárfestingarsjóður"),
        ("Primera Air", "var", "íslenskt flugfélag"),
        ("Villeneuve-Loubet", "er", "franskt þorp"),
        ("Valdís", "er", "ísbúð"),
        ("Fosshótel", "var", "rekin með tapi"),
        ("Fosshótel", "er", "stór hótelkeðja"),
        ("Norðurál", "er", "álverksmiðjan í Hvalfirði"),
        ("Lax", "er", "stór fiskur af ætt laxfiska"),
        ("Geysir", "er", "gamall goshver"),
        ("Eimskipafélag Íslands hf", "er", "skipafélag"),
        ("Origo", "er", "fyrirtæki"),
        ("AirBerlin", "er", "flugfélag"),
    )
    for triple in expected:
        session.check(triple)

    # After all checks the session must report itself empty
    assert session.is_empty()
def _parse(self, enclosing_session=None, verbose=False):
    """Parse the article content and store the resulting parse trees,
    token lists, word counts and parser statistics on the instance."""
    with SessionContext(enclosing_session) as session:
        # Tokenize the article HTML; the parser consumes this token iterable
        token_stream = Fetcher.tokenize_html(self._url, self._html, session)
        parser = self.get_parser()
        job = IncrementalParser(parser, token_stream, verbose=verbose)

        # Paragraphs -> sentences -> token dumps
        paragraphs = []
        # Per-sentence tree dump (or error marker), keyed by 1-based sentence index
        sentence_dumps = OrderedDict()
        # Word dictionary passed to dump_tokens() for parsed sentences
        word_counts = defaultdict(int)

        sentence_no = 0
        for para in job.paragraphs():
            current = []
            paragraphs.append(current)
            for sent in para.sentences():
                sentence_no += 1
                token_count = len(sent)
                # Sentences above the token limit are never handed to the
                # parser, since parsing them is memory and time intensive
                too_long = token_count > MAX_SENTENCE_TOKENS
                if not too_long and sent.parse():
                    # Successful parse: dump tokens and the parse tree
                    dumped = TreeUtility.dump_tokens(
                        sent.tokens, sent.tree, word_counts
                    )
                    forest_text = ParseForestDumper.dump_forest(
                        sent.tree, token_dicts=dumped
                    )
                    # Score (C) and token count (L) lines precede the tree dump
                    sentence_dumps[sentence_no] = "\n".join(
                        (
                            "C{0}".format(sent.score),
                            "L{0}".format(token_count),
                            forest_text,
                        )
                    )
                else:
                    # Too long or no parse: record an error index instead.
                    # For over-long sentences it points at the first token
                    # beyond the limit; otherwise the sentence supplies it.
                    error_index = (
                        MAX_SENTENCE_TOKENS if too_long else sent.err_index
                    )
                    dumped = TreeUtility.dump_tokens(
                        sent.tokens, None, None, error_index
                    )
                    sentence_dumps[sentence_no] = "E{0}".format(error_index)
                current.append(dumped)

        # Record when and with which parser version the article was parsed,
        # along with the statistics gathered by the incremental parser
        self._parsed = datetime.utcnow()
        self._parser_version = parser.version
        self._num_tokens = job.num_tokens
        self._num_sentences = job.num_sentences
        self._num_parsed = job.num_parsed
        self._ambiguity = job.ambiguity

        # Keep the token structure both raw and as one compact JSON string
        self._raw_tokens = paragraphs
        self._tokens = json.dumps(
            paragraphs, separators=(",", ":"), ensure_ascii=False
        )
        # Keep the word dictionary filled in by dump_tokens()
        self._words = word_counts
        # Concatenate all sentence dumps into a single tree string
        self._tree = "".join(
            "S{0}\n{1}\n".format(number, dump)
            for number, dump in sentence_dumps.items()
        )