def read_treefile(hyptreefile, reftreefile):
    """Score every hypothesis tree against every reference tree.

    The reference file holds one bracketed tree per non-blank line.  The
    hypothesis file holds blank-line-separated blocks; the stripped lines of
    a block are joined with single spaces and parsed as one tree.

    Args:
        hyptreefile: Path of the UTF-8 hypothesis tree file.
        reftreefile: Path of the UTF-8 reference tree file.

    Returns:
        A dict mapping the space-joined hypothesis words returned by
        ``score_similarity`` to the score.  A key produced more than once is
        overwritten, so the score against the last reference wins.
    """
    # Both files are closed even when parsing fails part-way through.
    with codecs.open(hyptreefile, "r", encoding='utf-8') as hfile, \
            codecs.open(reftreefile, "r", encoding='utf-8') as rfile:
        # There may be more than one reference: one tree per non-blank line.
        rtreel = []
        for line in rfile:
            stripped = line.strip()
            if stripped:
                rtreel.append(Tree.fromstring(stripped))

        htreel = []
        senl = []
        for line in hfile:
            stripped = line.strip()
            if stripped:
                senl.append(stripped)
            elif senl:
                # A blank line closes the current block.  Runs of blank
                # lines no longer try to parse an empty string.
                htreel.append(Tree.fromstring(" ".join(senl)))
                senl = []
        # Keep the last block when the file does not end with a blank line.
        if senl:
            htreel.append(Tree.fromstring(" ".join(senl)))

    scoredic = {}
    for r in rtreel:
        for h in htreel:
            score, hword, _rword = score_similarity(h, r)
            scoredic[" ".join(hword)] = score
    return scoredic
def _muc_read_text(s, top_node):
    """Build a tree of paragraphs ('P') and sentences ('S') from MUC text.

    Returns None when ``s`` is empty.  MUC6 input is split with the MUC6
    paragraph/sentence patterns; MUC7 input is split into paragraphs and then
    sentence-tokenized.  Text matching neither pattern yields a tree with no
    children.
    """
    def _rejoin_split_coref(sents):
        # The tokenizer sometimes splits within coref tags: glue following
        # sentences onto any sentence whose COREF tags are unbalanced.
        for index in range(len(sents)):
            offset = 1
            while sents[index].count('<COREF') != sents[index].count('</COREF>'):
                sents[index] += ' '
                sents[index] += sents[index + offset]
                sents[index + offset] = ''
                offset += 1
        # Drop the sentences that were emptied by the merge above.
        return filter(None, sents)

    if not s:
        return None

    tree = Tree(top_node, [])
    if _MUC6_PARA_RE.match(s):
        for para in _MUC6_PARA_RE.findall(s):
            if not (para and para[0] and para[0].strip()):
                continue
            tree.append(Tree('P', []))
            for sent in _MUC6_SENT_RE.findall(para[0]):
                words = _MUC6_SENT_RE.match(sent[0]).group('sent').strip()
                # There are empty sentences <s></s> in the MUC6 corpus.
                if words:
                    tree[-1].append(_muc_read_words(words, 'S'))
    elif _MUC7_PARA_RE.match(s):
        for para in _MUC7_PARA_SPLIT_RE.split(s):
            if not (para and para.strip()):
                continue
            tree.append(Tree('P', []))
            for sent in _rejoin_split_coref(_SENT_TOKENIZER.tokenize(para)):
                tree[-1].append(_muc_read_words(sent, 'S'))
    return tree
def test_pcfg(self):
    """Check production counts, LHS counts and scores of ``pcfg.PCFG``."""
    model = pcfg.PCFG()
    first = Tree('S', (Tree('NP', ('foo',)), Tree('VP', ('bar',))))

    # After one update every production and every LHS has count 1.
    model.update_counts(first)
    expected_productions = {(prod, 1) for prod in first.productions()}
    self.assertSetEqual(expected_productions,
                        set(model.production_counts.items()))
    expected_lhs = {(prod.lhs(), 1) for prod in first.productions()}
    self.assertSetEqual(expected_lhs, set(model.lhs_counts.items()))

    # Count the first tree once more, then a second tree twice.
    model.update_counts(first)
    second = Tree('S', (Tree('VP', ('foo',)), Tree('NP', ('bar',))))
    for _ in range(2):
        model.update_counts(second)

    self.assertEqual(6, len(model.production_counts))
    for seen in model.production_counts.values():
        self.assertEqual(2, seen)
    self.assertEqual(3, len(model.lhs_counts))
    for seen in model.lhs_counts.values():
        self.assertEqual(4, seen)

    model.compute_scores()
    for production, score in model.scored_productions.items():
        self.assertAlmostEqual(-0.69314718055, score, msg='%s' % production)
def match(self, tree):
    """Match a ``ROOT > SBARQ`` "when" question of shape WRB + SQ(VBD NP VP).

    Returns a one-element list holding a ``Pattern.Part`` whose ``object`` is
    the NP and whose ``property`` is a VP built from the WRB, VBD and VP
    subtrees.  Returns an empty list when the shape does not match or an
    index is out of range.
    """
    try:
        if tree.label() != 'ROOT':
            return []
        clause = tree[0]
        if clause.label() != 'SBARQ':
            return []
        wh_word = clause[0][0]
        if wh_word.label() != 'WRB':
            return []
        if wh_word[0].lower() != 'when':
            return []
        question = clause[1]
        if question.label() != 'SQ':
            return []
        # Children are checked strictly left to right, as before.
        verb = question[0]
        if verb.label() != 'VBD':
            return []
        subject = question[1]
        if subject.label() != 'NP':
            return []
        predicate = question[2]
        if predicate.label() != 'VP':
            return []

        part = Pattern.Part()
        part.object = ParentedTree.fromstring(str(subject))
        combined = Tree('VP', [
            Tree.fromstring(str(wh_word)),
            Tree.fromstring(str(verb)),
            Tree.fromstring(str(predicate)),
        ])
        part.property = ParentedTree.fromstring(str(combined))
        return [part]
    except IndexError:
        return []
def parser_output_to_parse_deriv_trees(output):
    """Split alternating parser output lines into parse and derivation trees.

    After stripping ``output``, even-indexed lines (0, 2, ...) are derivation
    trees and odd-indexed lines are parse trees.  Empty lines are skipped.
    In parse-tree lines the ``\\x06`` character is replaced by ``epsilon_``
    before parsing.

    Returns:
        A ``(parse_trees, deriv_trees)`` tuple of lists.
    """
    lines = output.strip().split("\n")

    parse_trees = []
    for line in lines[1::2]:
        if line != '':
            parse_trees.append(Tree.fromstring(line.replace('\x06', 'epsilon_')))

    deriv_trees = []
    for line in lines[::2]:
        if line != '':
            deriv_trees.append(Tree.fromstring(line))

    return parse_trees, deriv_trees
def test_evalb_correctly_calculates_bracketing_metrics_over_multiple_trees(self):
    """Scoring [tree1, tree2] against [tree2, tree2] must give 0.875 everywhere."""
    mislabeled = "(S (VP (D the) (NP dog)) (VP (V chased) (NP (D the) (N cat))))"
    gold = "(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))"
    tree1 = Tree.fromstring(mislabeled)
    tree2 = Tree.fromstring(gold)

    scorer = EvalbBracketingScorer()
    scorer([tree1, tree2], [tree2, tree2])
    metrics = scorer.get_metric()

    for key in ("evalb_recall", "evalb_precision", "evalb_f1_measure"):
        assert metrics[key] == 0.875
def add_top_to_tree(treebank_file):
    """Print every tree of a treebank file wrapped in an extra ``TOP`` node.

    Each line of ``treebank_file`` is parsed as one bracketed tree (keeping
    any empty top bracketing), placed under a new ``TOP`` node and printed
    through ``NewTree.flat_print``.

    Args:
        treebank_file: Path of the file holding one bracketed tree per line.
    """
    # The context manager closes the file even if a line fails to parse.
    with open(treebank_file, "r") as f:
        for sentence in f:
            t = Tree.fromstring(sentence, remove_empty_top_bracketing=False)
            top_node = Tree("TOP", [])
            top_node.append(t)
            # Single-argument parenthesized print behaves the same on
            # Python 2 and Python 3.
            print(NewTree.flat_print(top_node))
def test_evalb_correctly_scores_identical_trees(self):
    """Two identical trees must score 1.0 on recall, precision and F1."""
    bracketed = "(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))"
    tree1 = Tree.fromstring(bracketed)
    tree2 = Tree.fromstring(bracketed)

    scorer = EvalbBracketingScorer()
    scorer([tree1], [tree2])
    metrics = scorer.get_metric()

    for key in ("evalb_recall", "evalb_precision", "evalb_f1_measure"):
        assert metrics[key] == 1.0
def _construct_node_from_actions(self,
                                 current_node: Tree,
                                 remaining_actions: List[List[str]],
                                 add_var_function: bool) -> List[List[str]]:
    """
    Fill in the children of ``current_node`` from the front of
    ``remaining_actions`` and return the actions that are left over.

    Each action is a ``(left_side, right_side)`` pair.  The first action must
    have ``current_node``'s label as its left side.  A right side starting
    with ``[`` lists several child types separated by ``', '``; a child is
    added for each one and every non-terminal child is expanded recursively,
    so the action sequence has to be depth-first.  A terminal right side adds
    a single terminal child, wrapped as ``(var x)`` when ``add_var_function``
    is set and ``x`` is a known lambda variable.

    Raises ``ParsingError`` when the actions run out, when the next action
    does not belong to ``current_node``, when ``var`` is already present
    although ``add_var_function`` is set, or on a unary non-terminal rule.
    """
    if not remaining_actions:
        logger.error("No actions left to construct current node: %s", current_node)
        raise ParsingError("Incomplete action sequence")

    lhs, rhs = remaining_actions.pop(0)
    if lhs != current_node.label():
        logger.error("Current node: %s", current_node)
        logger.error("Next action: %s -> %s", lhs, rhs)
        logger.error("Remaining actions were: %s", remaining_actions)
        raise ParsingError("Current node does not match next action")

    if rhs[0] == '[':
        # Non-terminal expansion with more than one child node.
        for child_label in rhs[1:-1].split(', '):
            if child_label.startswith("'lambda"):
                # Lambdas appear quoted in the action sequence, e.g.
                # `'lambda x'`; drop the surrounding single quotes.
                child_label = child_label[1:-1]
            child = Tree(child_label, [])
            current_node.append(child)  # nltk.Tree grows via `append`
            if self.is_terminal(child_label):
                continue
            remaining_actions = self._construct_node_from_actions(child,
                                                                  remaining_actions,
                                                                  add_var_function)
        return remaining_actions

    if not self.is_terminal(rhs):
        # Only a unary non-terminal production can get here.  That is almost
        # certainly not wanted with this kind of grammar, so we crash.
        raise ParsingError(f"Found a unary production rule: {lhs} -> {rhs}. "
                           "Are you sure you want a unary production rule in your grammar?")

    # Pre-terminal node: add one terminal child, possibly wrapped in (var _).
    if add_var_function and rhs in self._lambda_variables:
        rhs = f"(var {rhs})"
    if add_var_function and rhs == 'var':
        raise ParsingError('add_var_function was true, but action sequence already had var')
    current_node.append(Tree(rhs, []))
    return remaining_actions
def drawTrees(chart):
    """Draw the parse trees found in the last chart column, one at a time.

    For every state of ``chart[-1]`` that is a parse of the module-level
    ``grammar``, the tree is drawn and the user is asked whether to go on.
    Answering ``n`` stops immediately; otherwise a closing message is printed
    once all states have been visited.
    """
    for state in chart[-1]:
        if not state.isParse(grammar):
            continue
        tree = Tree(buildTreeString(state, ''))
        print('Showing parse tree. Close window to continue.')
        tree.draw()
        if raw_input('Do you want to see another parse tree?(y/n): ') == 'n':
            return
    print('No more valid parses')
def extract_itg(alignments_file_name, parses_file_name, inv_extension):
    """Extract a inversion transduction grammar (ITG) from the given files.

    Keyword arguments:
    alignments_file_name -- name of file containing alignments between
        sentences in l1_file_name and l2_file_name
    parses_file_name -- name of file containing parse trees of the sentences
        in l1_file_name
    inv_extension -- extension denoting whether a node is inverted

    Returns a Counter of binary ITG rules and unary rules. Each ITG rule is
    represented as the tuple (lhs, rhs), where rhs is a tuple of nodes.

    Any error while reading or parsing a sentence is appended to
    ``error.log`` and then re-raised."""
    binary_itg = Counter()
    unary_itg = Counter()
    num_lines = number_of_lines(parses_file_name)
    # Report progress about once per percent.  The step is clamped to 1 so
    # that files with fewer than 100 lines no longer divide by zero.
    progress_step = max(1, num_lines // 100)

    # Both input files are closed even when an error is re-raised below.
    with open(alignments_file_name) as alignments_file, \
            open(parses_file_name) as parses_file:
        for i, l1_parse in enumerate(parses_file):
            if i % progress_step == 0:
                sys.stdout.write('\r%d%%' % (i * 100 // num_lines,))
                sys.stdout.flush()
            # Pre-bind so the error report cannot fail with a NameError when
            # reading the alignment itself is what went wrong.
            reordered_indexes = None
            try:  # TODO remove try/catch
                reordered_indexes = str_to_reordered_indexes(next(alignments_file))
                # remove outer brackets from Berkeley parse
                l1_parse = l1_parse.strip()
                l1_parse = l1_parse[1:len(l1_parse)-1]
                l1_parse = l1_parse.strip()
                parse_tree = Tree(l1_parse)
                parse_forest = generate_forest(parse_tree,
                                               reordered_indexes, inv_extension)
            except Exception:
                with open('error.log', 'a') as error_log:
                    error_log.write('%s -- in extract_itg/3\n' % time.asctime())
                    error_log.write('line: %s\n' % i)
                    error_log.write('%s\n' % l1_parse.strip())
                    error_log.write('%s\n' % reordered_indexes)
                    error_log.write('\n')
                print('Error in extract_itg/3. See error.log')
                raise

            binary_rules, unary_rules = extract_rules(parse_forest,
                                                      parse_tree.leaves())
            for rule in binary_rules:
                binary_itg[rule] += 1
            for rule in unary_rules:
                unary_itg[rule] += 1

    return binary_itg, unary_itg
def test_evalb_correctly_scores_imperfect_trees(self):
    """One wrong constituent label out of four must score 0.75 everywhere."""
    # Changing a constituency label (VP ... ) should affect the scores, but
    # changing a POS tag (NP dog) should have no effect.
    predicted = Tree.fromstring(
        "(S (VP (D the) (NP dog)) (VP (V chased) (NP (D the) (N cat))))")
    gold = Tree.fromstring(
        "(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")

    scorer = EvalbBracketingScorer()
    scorer([predicted], [gold])
    metrics = scorer.get_metric()

    for key in ("evalb_recall", "evalb_precision", "evalb_f1_measure"):
        assert metrics[key] == 0.75
def test_evalb_with_terrible_trees_handles_nan_f1(self):
    """A prediction sharing nothing with the gold tree must score 0.0, not nan."""
    # If precision and recall are zero, evalb returns nan f1.
    # This checks that we handle the zero division.
    predicted = Tree.fromstring("(PP (VROOT (PP That) (VROOT (PP could) "
                                "(VROOT (PP cost) (VROOT (PP him))))) (PP .))")
    gold = Tree.fromstring(
        "(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")

    scorer = EvalbBracketingScorer()
    scorer([predicted], [gold])
    metrics = scorer.get_metric()

    for key in ("evalb_recall", "evalb_precision", "evalb_f1_measure"):
        assert metrics[key] == 0.0
def get_sentence_and_indexes(parsed_sentence):
    """Return the reordered sentence and index list for a bracketed parse.

    The parse is turned into a tree; when its root is the module-level
    ``bitpar_top`` symbol, that root is dropped.  Two lists with one slot per
    leaf are then filled in by ``get_sentence_and_indexes_rec_helper``.

    Returns:
        ``(reordered_sentence, rlist)`` where the sentence is the
        space-joined word list and ``rlist`` is the filled index list.
    """
    tree = Tree(parsed_sentence)
    if tree.node == bitpar_top:
        # remove designated TOP-symbol
        tree = tree[0]

    leaf_count = len(tree.leaves())
    rlist = [0] * leaf_count
    slist = [""] * leaf_count
    get_sentence_and_indexes_rec_helper(tree, rlist, slist)
    return " ".join(slist), rlist
def _build_hierplane_tree(self, tree: Tree, index: int, is_root: bool) -> JsonDict:
    """
    Recursively convert an NLTK ``Tree`` into the JSON structure rendered by
    the `Hierplane library<https://allenai.github.io/hierplane/>`.

    Parameters
    ----------
    tree : ``Tree``, required.
        The tree to convert into Hierplane JSON.
    index : int, required.
        The character index into the tree.  It is advanced by the length of
        each leaf and handed to the recursive calls.
    is_root : bool
        When true, the node is wrapped in the outer Hierplane JSON that is
        required for rendering.

    Returns
    -------
    A JSON dictionary render-able by Hierplane for the given tree.
    """
    child_nodes = []
    for subtree in tree:
        if not isinstance(subtree, Tree):
            # NLTK leaves are plain strings: just move the character index
            # past the word.
            index += len(subtree)
            continue
        child_nodes.append(self._build_hierplane_tree(subtree, index, is_root=False))

    node_type = tree.label()
    text = " ".join(tree.leaves())
    node = {
            "word": text,
            "nodeType": node_type,
            "attributes": [node_type],
            "link": node_type
    }
    if child_nodes:
        node["children"] = child_nodes
    # TODO(Mark): Figure out how to span highlighting to the leaves.
    if not is_root:
        return node
    return {
            "linkNameToLabel": LINK_TO_LABEL,
            "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
            "text": text,
            "root": node
    }
def evaluate(fragments, sumfunc, condition, normalization, verbose=True, perbook=False, topfragments=False, breakdown=True, conftable=False): green = "\033[32m"; red = "\033[31m"; gray = "\033[0m" # ANSI codes names = set(map(getauthor, fragments.values()[0])) results = {} # heading if verbose and not perbook: print "\n &", 21 * " ", print "&".join(a.rjust(16) for a in sorted(names)), print "&\tguess &\t\t\tconfidence\\\\" prev = "foo.bar" # loop over texts to be classified for text in sorted(fragments): if perbook and getauthor(text) != getauthor(prev): print "\n &", 21 * " ", print " &".join("\\rotatebox{45}{%s}" % a.split(" - ")[-1].split(".")[0].replace("&","\\&") for a in sorted(fragments[text])), "\\\\" if verbose: print text.split(" - ")[-1].split(".")[0][:25].replace("&","\\&").ljust(25), inter = {} # loop over possible authors for author in sorted(fragments[text]): inter[author] = sum(map(sumfunc, filter(condition, fragments[text][author].items()))) / normalization(text, author) if verbose: for author in sorted(inter): if inter[author] == max(inter.values()): l,r = "\\textbf{","}" else: l, r = "".ljust(8), " " if isinstance(inter[author], float): print ("& %s%.2f%s" % (l,inter[author],r)).rjust(16), elif isinstance(inter[author], int): print ("& %s%d%s" % (l,inter[author],r)).rjust(16), else: print "& %s%s" % (l,repr(inter[author]).rjust(8),r), actualauthor = getauthor(text) guess = max(inter, key=inter.get) results.setdefault(actualauthor, []).append(guess) if verbose and not perbook: print "&", print green+"correct:" if getauthor(guess) == actualauthor else red+"wrong: ", print getauthor(guess).ljust(10), gray, try: confidence = (100 * (max(inter.values()) - sorted(inter.values())[-2]) / float(max(inter.values()))) except ZeroDivisionError: confidence = 0.0 except IndexError: confidence = 0.0 print "& %s%5.2f%s " % ((red if confidence < 50 else green), confidence, gray) elif verbose: print "\\\\" prev = text if verbose: print if topfragments: print "top 
fragments" for name in sorted(names) if topfragments else (): for text in sorted(fragments): if not getauthor(text) == name: continue print text for label in ("(ROOT", "(S ", "(NP ", "(VP ", "(PP "): guess = max(fragments[text], key=lambda x: sum(sumfunc(a) for a in fragments[text][x].items() if condition(a)) / norm(x)) try: frag = max((a[0] for a in fragments[text][guess].iteritems() if condition(a) and a[0].startswith(label)), key=lambda x: (sumfunc((x,fragments[text][guess][x])), fragments[text][guess][x])) except ValueError: pass else: f1 = Tree(frag) f2 = Tree(frag) print "%2d" % fragments[text][guess][frag], " ".join(a.replace(" ", "_")[:-1] for a in re.findall(r" \)|[^ )]+\)", frag)), try: f2.un_chomsky_normal_form() except: print f1.pprint(margin=9999, parens=("[", " ]")) else: print f2.pprint(margin=9999, parens=("[", " ]"))
def reduce_nps(sentence):
    """
    Copy the children of ``sentence`` into a new 'S' tree, replacing every
    child tree that has exactly one child by that single child.
    """
    res = Tree('S', [])
    for child in sentence:
        single_child_tree = isinstance(child, Tree) and len(child) == 1
        res.append(child[0] if single_child_tree else child)
    return res
def calc(param):
    """Resolve a pronoun against the trees stored in a file.

    ``param[1]`` is the path of a file with one bracketed tree per line and
    ``param[2]`` is the pronoun, which is looked up in the last tree.
    Personal pronouns go through ``hobbs`` and reflexives through
    ``resolve_reflexive``.

    Returns:
        ``(tree, tree[pos])`` for the proposed antecedent, or None when the
        pronoun is in neither word list.
    """
    personal = ("He", "he", "Him", "him", "She", "she", "Her", "her",
                "It", "it", "They", "they")
    reflexive = ("Himself", "himself", "Herself", "herself",
                 "Itself", "itself", "Themselves", "themselves")
    fname, pro = param[1], param[2]

    with open(fname) as f:
        sents = f.readlines()
    trees = [Tree.fromstring(s) for s in sents]

    # Drop the last step of the pronoun's tree position.
    pos = get_pos(trees[-1], pro)[:-1]

    if pro in personal:
        resolver = hobbs
    elif pro in reflexive:
        resolver = resolve_reflexive
    else:
        return None

    tree, pos = resolver(trees, pos)
    return tree, tree[pos]
def parse_ccgbank_tree(s):
    """Parse a CCGbank bracketing and strip its empty nodes.

    ``s`` is parsed with the CCGbank-specific node/leaf parsers and patterns,
    and the result is passed through ``excise_empty_nodes``.
    """
    parsed = Tree.parse(
        s,
        parse_node=parse_ccgbank_node,
        parse_leaf=parse_ccgbank_leaf,
        node_pattern=ccgbank_node_pattern,
        leaf_pattern=ccgbank_leaf_pattern,
    )
    return excise_empty_nodes(parsed)
def test_simple_tags(self):
    """Tagging ``self.text`` with a single ANIMAL rule wraps both animals."""
    parser = ruleparser.RuleParser("ANIMAL : {<ANIMAL>}")
    # The expected tree is built before tagging, as in the original order.
    expected = Tree.parse(
        "(S el/DT (ANIMAL perro/NN/ANIMAL) ladra/VB al/DT (ANIMAL gato/NN/ANIMAL))",
        parse_leaf=parser.from_string_token_to_tuple)
    actual = parser.tag(self.text)
    self.assertEqual(actual, expected)
def testConvert(self):
    """The converter must emit the exact DOT source for a small parse tree.

    The second NP node must be disambiguated as ``NP_1``.
    """
    parse = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))")
    dot_source = DotLanguageConverter().convert(parse)
    expected_tree_string = (
        "digraph parse_tree {\n"
        "\t\"S\" [label=\"S\"];\n"
        "\t\"NP\" [label=\"NP\"];\n"
        "\t\"S\"-> \"NP\";\n"
        "\t\"I\" [label=\"I\"];\n"
        "\t\"NP\"-> \"I\";\n"
        "\t\"VP\" [label=\"VP\"];\n"
        "\t\"S\"-> \"VP\";\n"
        "\t\"V\" [label=\"V\"];\n"
        "\t\"VP\"-> \"V\";\n"
        "\t\"saw\" [label=\"saw\"];\n"
        "\t\"V\"-> \"saw\";\n"
        "\t\"NP_1\" [label=\"NP\"];\n"
        "\t\"VP\"-> \"NP_1\";\n"
        "\t\"him\" [label=\"him\"];\n"
        "\t\"NP_1\"-> \"him\";\n"
        "}")
    self.assertEqual(dot_source, expected_tree_string)
def __render_tree(self):
    """Draw the bracketed tree currently held in the output text area.

    The whole widget content is read, newlines are removed, and the result
    is parsed and drawn.
    """
    bracketed = self.output_text_area.get("1.0", END).replace("\n", "")
    Tree.fromstring(bracketed).draw()
def extract_trees(filename="./out/toy_pcfg2.gen"):
    """Return the list of trees parsed from ``filename``, one tree per line."""
    with open(filename) as fh:
        return [Tree.fromstring(line) for line in fh]
def pprint(self, **kwargs):
    """Return a pretty-printed form of the tree, e.g. for the LaTeX qtree
    package.

    The tree's ``ptb()`` string is parsed into an NLTK tree and ``kwargs``
    are forwarded to its ``pprint``.  Requires the nltk module; see
    http://www.nltk.org/_modules/nltk/tree.html."""
    from nltk import Tree as NLTKTree
    return NLTKTree.fromstring(self.ptb()).pprint(**kwargs)
def tag(self, input_tree):
    """
    Tag an input tree using the rules in parsed grammars.

    The cleaned tree is flattened to text.  For every rule, each match of
    the rule's pattern has its named group replaced by ``<RULE_NAME>`` in
    the text and is pushed onto ``self.stack``.  The rewritten text is then
    turned back into an ``S`` tree.
    """
    text = self.from_tree_to_text(self.clean(input_tree))

    for rule in self.rules:
        # Each rule is a one-entry mapping {name: pattern}.
        rule_name = rule.keys()[0]
        pattern = rule.values()[0]
        for match in re.finditer(pattern, text, re.I):
            # Strip leading and trailing spaces from the matched text so
            # that each <NAME> subtree stays cleanly delimited in the
            # resulting text (optional spaces are not swallowed).
            matched = match.group(rule_name).strip()
            text = string.replace(text, matched, "<"+rule_name+">")
            self.stack.append(matched)

    output_tree_str = "(S "+self.from_text_to_tree_str(text)+" )"
    return Tree.parse(output_tree_str, parse_leaf=self.from_string_token_to_tuple)
def main(argv): if len(sys.argv) == 2 and argv[1] == "demo": demo() else: if len(sys.argv) > 3 or len(sys.argv) < 2: print "Enter the file and the pronoun to resolve." elif len(sys.argv) == 3: p = ["He", "he", "Him", "him", "She", "she", "Her", "her", "It", "it", "They", "they"] r = ["Himself", "himself", "Herself", "herself", "Itself", "itself", "Themselves", "themselves"] fname = sys.argv[1] pro = sys.argv[2] with open(fname) as f: sents = f.readlines() trees = [Tree.fromstring(s) for s in sents] pos = get_pos(trees[-1], pro) pos = pos[:-1] if pro in p: tree, pos = hobbs(trees, pos) for t in trees: print t, '\n' print "Proposed antecedent for '"+pro+"':", tree[pos] elif pro in r: tree, pos = resolve_reflexive(trees, pos) for t in trees: print t, '\n' print "Proposed antecedent for '"+pro+"':", tree[pos]
def test_construct_tree_from_spans_handles_nested_labels(self):
    """A span labelled "S-NP" must be expanded into nested (S (NP ...))."""
    spans_to_labels = dict([((0, 1), 'D'), ((1, 2), 'N'), ((0, 2), 'S-NP')])
    words = ["the", "dog"]
    built = self.model.construct_tree_from_spans(spans_to_labels, words)
    assert built == Tree.fromstring("(S (NP (D the) (N dog)))")
def syntactic_parse_features(paragraph, parse):
    """
    Return counts of S and SBAR subtrees in the given parses, plus
    statistics (mean, median, max, min, spread) of the tree heights.

    ``parse`` is an iterable of bracketed tree strings.  ``paragraph`` is
    accepted but not used here.  The result is a ``Counter``.
    """
    KEPT_FEATURES = ['S', 'SBAR']

    # Count the label of every subtree and record each tree's height.
    label_counts = Counter()
    heights = []
    for bracketed in parse:
        parsed = Tree.fromstring(bracketed)
        label_counts.update(sub.label() for sub in parsed.subtrees())
        heights.append(parsed.height())

    # Keep only the labels listed in KEPT_FEATURES.
    features = Counter()
    for label in label_counts:
        if label in KEPT_FEATURES:
            features["syntactic_head_" + label] = label_counts[label]

    # Add the features related to tree height.
    features["tree_height_mean"] = np.mean(heights)
    features["tree_height_median"] = np.median(heights)
    tallest = np.max(heights)
    shortest = np.min(heights)
    features["tree_height_max"] = tallest
    features["tree_height_min"] = shortest
    features["tree_height_spread"] = tallest - shortest
    return Counter(features)
def treebank_bracket_parse(t):
    """Parse a bracketed tree string, tolerating the raw treebank format.

    The string is first parsed directly, dropping an empty top bracketing.
    If that raises IndexError, the first and last characters of the stripped
    string are removed and the rest is parsed with ``tree.bracket_parse``.
    """
    try:
        parsed = Tree.fromstring(t, remove_empty_top_bracketing=True)
    except IndexError:
        # in case it's the real treebank format,
        # strip first and last brackets before parsing
        parsed = tree.bracket_parse(t.strip()[1:-1])
    return parsed
def negra_tree_iter(corpus_root):
    """Yield the trees of a corpus file whose blocks are separated by '%' lines.

    Lines are collected until a line starting with ``%`` is met; the
    collected text is then stripped and, when non-empty, parsed and yielded.
    Text after the last ``%`` line is handled the same way.

    Args:
        corpus_root: Path of the corpus file to read.

    Yields:
        One ``Tree`` per non-empty block.
    """
    pieces = []
    # The context manager closes the file when the generator is exhausted or
    # closed; the former inline ``open`` was never closed explicitly.
    with open(corpus_root) as corpus:
        for line in corpus:
            if line.startswith('%'):
                s = ''.join(pieces).strip()
                if s:
                    yield Tree.parse(s)
                pieces = []
            else:
                pieces.append(line)
    # Trailing whitespace-only content is skipped instead of being handed to
    # the parser as an empty string.
    s = ''.join(pieces).strip()
    if s:
        yield Tree.parse(s)
else: nonterm.append((lhs, rhses[0], rhses[1], math.log(prob))) for line in open('../../data/wiki-en-short.tok'): print(line.strip('\n')) best_score = defaultdict(lambda: -1000000) best_edge = dict() words = line.strip('\n').split() for i in range(len(words)): for lhs, log_prob in preterm[words[i]]: best_score['{} {} {}'.format(lhs, i, i + 1)] = log_prob for j in range(2, len(words) + 1): for i in reversed(range(j - 1)): for k in range(i + 1, j): for sym, lsym, rsym, logprob in nonterm: if '{} {} {}'.format( lsym, i, k) in best_score and '{} {} {}'.format( rsym, k, j) in best_score: my_lp = best_score['{} {} {}'.format( lsym, i, k)] + best_score['{} {} {}'.format( rsym, k, j)] + logprob if my_lp > best_score['{} {} {}'.format(sym, i, j)]: best_score['{} {} {}'.format(sym, i, j)] = my_lp best_edge['{} {} {}'.format( sym, i, j)] = ('{} {} {}'.format(lsym, i, k), '{} {} {}'.format(rsym, k, j)) tree = Tree.fromstring( print_tree('S 0 ' + str(len(words)), words, best_edge)) print(tree)
sentence = temp_string matches = re.findall(grammar, sentence) sentence = re.sub(grammar, "| ", sentence) sentence = "(S " + sentence + ")" for i in range(len(matches)): temp_sentence = "(" + chunk_name + " " for j in range(len(matches[i])): temp_sentence += matches[i][j] + " " temp_sentence += ")" sentence = sentence.replace("|", temp_sentence, 1) print(sentence) return sentence if __name__ == "__main__": parser = GrammarParse() # sentence = "The quick brown fox jumps over the lazy dog" sentence = "I was hugging an amazing spectacular dog" # result = parser.regExParse("NP", "(\w*/DT)? ?(\w*/JJ)* ?(\w*/NN)", sentence) result = parser.regExParse( "VP", "(\w*/VB\w*) ?(\w*/DT)? ?(\w*/JJ)* ?(\w*/NN) ?(\w*/RB\w?)?", sentence) tr = Tree.fromstring(result) print(tr) tr.draw()
def _construct_node_from_actions(
        self, current_node: Tree, remaining_actions: List[List[str]],
        add_var_function: bool) -> List[List[str]]:
    """
    Consume actions from the front of ``remaining_actions`` to build the
    subtree under ``current_node``; return whatever actions remain.

    An action is a ``[left_side, right_side]`` pair whose left side must
    equal the label of the node being expanded.  For example, a node of type
    ``c`` with next action ``c -> [<r,c>, r]`` receives two children, and the
    actions for the ``<r,c>`` subtree are consumed before those for ``r``
    (the sequence is assumed to be depth-first).  A terminal right side
    gives the node a single terminal child.

    Raises ``ParsingError`` for an exhausted or mismatching action sequence,
    for a pre-existing ``var`` when ``add_var_function`` is true, and for
    unary non-terminal productions.
    """
    if not remaining_actions:
        logger.error("No actions left to construct current node: %s",
                     current_node)
        raise ParsingError("Incomplete action sequence")

    production_lhs, production_rhs = remaining_actions.pop(0)
    if production_lhs != current_node.label():
        logger.error("Current node: %s", current_node)
        logger.error("Next action: %s -> %s", production_lhs, production_rhs)
        logger.error("Remaining actions were: %s", remaining_actions)
        raise ParsingError("Current node does not match next action")

    if production_rhs[0] == '[':
        # Non-terminal expansion with more than one child node.
        for listed_type in production_rhs[1:-1].split(', '):
            # Lambdas show up quoted in the action sequence (`'lambda x'`),
            # so the single quotes around them are stripped off.
            is_lambda = listed_type.startswith("'lambda")
            child_type = listed_type[1:-1] if is_lambda else listed_type
            child_node = Tree(child_type, [])
            # you add a child to an nltk.Tree with `append`
            current_node.append(child_node)
            if not self.is_terminal(child_type):
                remaining_actions = self._construct_node_from_actions(
                    child_node, remaining_actions, add_var_function)
    elif self.is_terminal(production_rhs):
        # Pre-terminal node: add a single terminal child, after checking
        # whether it has to be wrapped in (var _).
        terminal = production_rhs
        if add_var_function and terminal in self._lambda_variables:
            terminal = f"(var {terminal})"
        if add_var_function and terminal == 'var':
            raise ParsingError(
                'add_var_function was true, but action sequence already had var'
            )
        current_node.append(Tree(terminal, []))
    else:
        # Only a unary non-terminal production rule can end up here.  That is
        # almost certainly not wanted with this kind of grammar, so we crash.
        # If you really do want this, open a PR with a valid use case.
        raise ParsingError(
            f"Found a unary production rule: {production_lhs} -> {production_rhs}. "
            "Are you sure you want a unary production rule in your grammar?"
        )
    return remaining_actions
def add_sibling(self, next_token):
    """Append a childless tree labelled ``next_token`` to the node on top of the stack."""
    parent = self.stack[-1]
    parent.append(Tree(next_token, []))
def induce_CYK(self, sentence, show=True, beam_size=20):
    """Parse ``sentence`` with a probabilistic CYK chart.

    The sentence is first rewritten by ``self.oov.beam_search_decoder`` and
    the chart is filled from the rewritten tokens; the final tree is rebuilt
    by ``self.get_tree`` using the tokens of the original sentence.

    Args:
        sentence: The input sentence as a single string.
        show: Forwarded to the beam-search decoder.
        beam_size: Forwarded to the beam-search decoder.

    Returns:
        A ``(text, flag)`` tuple.  When a parse is found, ``text`` is the
        un-binarized tree on one line and ``flag`` is 1.  Otherwise ``text``
        is the result of ``self.OOG`` on the original tokens and ``flag``
        is 0.
    """
    # Reverse index of the grammar: right-hand side -> set of left-hand
    # sides that can produce it.
    binaries = {}
    for lhs in self.proba_cfg.keys():
        for rhs in self.proba_cfg[lhs]:
            if not rhs in binaries.keys():
                binaries[rhs] = set()
            binaries[rhs].add(lhs)
    # Symbols seen as first / second element of a right-hand side, and the
    # set of all right-hand sides.
    lb = set([B[0] for B in binaries.keys()])
    rb = set([B[1] for B in binaries.keys()])
    bi = set(binaries.keys())
    axioms = self.axiomes  # not used further in this method
    #import pdb; pdb.set_trace()
    init_sentence = sentence
    sentence = self.oov.beam_search_decoder(sentence=init_sentence,
                                            show=show,
                                            beam_size=beam_size).split()
    n = len(sentence)
    # (n+1) x (n+1) charts indexed [start][end]:
    #   hist_scores: symbol -> best score found for the span
    #   hist_track:  symbol -> (split, B, C) that produced that score
    #   l_pos/r_pos: symbols on the span usable as first/second child
    hist_scores = [[dict() for i in range(n + 1)] for k in range(n + 1)]
    hist_track = [[dict() for i in range(n + 1)] for k in range(n + 1)]
    r_pos = [[set() for i in range(n + 1)] for k in range(n + 1)]
    l_pos = [[set() for i in range(n + 1)] for k in range(n + 1)]
    # Width-1 spans: seed the chart from the lexicon.
    for i, word in enumerate(sentence):
        #word = word.lower()
        for A, words in self.proba_lexicons.items():
            if word in words.keys():
                hist_scores[i][i + 1][A] = words[word]
                if A in lb:
                    l_pos[i][i + 1].add(A)
                if A in rb:
                    r_pos[i][i + 1].add(A)
    # Wider spans, from width 2 up to the whole sentence.
    for window in range(2, n + 1):
        for start in range(n + 1 - window):
            end = start + window
            for split in range(start + 1, end):
                left, right = hist_scores[start][split], hist_scores[split][end]
                l_int, r_int = l_pos[start][split] & lb, r_pos[split][end] & rb
                # Only (B, C) pairs that are an actual right-hand side are
                # worth combining.
                final_int = set(product(l_int, r_int)) & bi
                for (B, C) in final_int:
                    for A in binaries[(B, C)]:
                        prob = left[B] * right[C] * self.proba_cfg[A][(B, C)]
                        # Keep only a strictly better score; 0.0 stands for
                        # "A not yet derived on this span".
                        if prob > hist_scores[start][end].get(A, 0.0):
                            hist_scores[start][end][A] = prob
                            hist_track[start][end][A] = (split, B, C)
                            if A in lb:
                                l_pos[start][end].add(A)
                            if A in rb:
                                r_pos[start][end].add(A)
    # Rebuild the tree for the whole span rooted at 'SENT', using the
    # original (not the decoded) tokens.
    tree = self.get_tree(hist_track, hist_scores, 0, n,
                         init_sentence.split(), n, 'SENT')
    if tree == 'NOT IN GRAMMAR':
        ret = self.OOG(init_sentence.split()), 0
    else:
        tree = Tree.fromstring(tree)
        tree.un_chomsky_normal_form()
        # Collapse the pretty-printed tree onto a single line.
        ret = ' '.join(tree.pformat().split()), 1
    return ret
def deep_copy_tree(tree):
    """Return a recursive copy of ``tree``.

    Only objects whose type is exactly ``Tree`` are rebuilt; anything else
    (leaves, and instances of other types) is returned as is.
    """
    if type(tree) != Tree:
        return tree
    copied_children = [deep_copy_tree(child) for child in tree]
    return Tree(tree.label(), copied_children)
def to_nltk_tree(node):
    """Convert ``node`` and its descendants into an NLTK tree.

    A node without children becomes its ``tok_format`` string; any other
    node becomes a ``Tree`` labelled with its ``tok_format`` string.
    """
    if len(node.children) == 0:
        return tok_format(node)
    label = tok_format(node)
    subtrees = [to_nltk_tree(child) for child in node.children]
    return Tree(label, subtrees)
def convert_leaf_str_to_tree(_tree):
    """Return a copy of ``_tree`` in which every string leaf is a childless ``Tree``."""
    if isinstance(_tree, str):
        return Tree(_tree, [])
    label = _tree.label()
    converted = [convert_leaf_str_to_tree(child) for child in _tree]
    return Tree(label, converted)
def simplify_tree(t:Tree):
    """Simplify a logical-form tree, mutating ``t`` in place, and return the result.

    ``call`` and ``string`` wrapper nodes are folded into their first child,
    the long SimpleWorld class prefix is shortened to ``SW:``, and a few
    ``SW:`` functions are rewritten depending on ``self.full_simplify``.
    The returned node may be ``t`` itself or one of its (simplified)
    children.

    Relies on ``self`` and ``afterstring`` from the enclosing scope.
    """
    if t.label() == "call":     # remove call, make first arg of it the parent of the other args
        assert(len(t[0]) == 0)
        # if not t[0].label().startswith("SW."):
        #     print(t)
        # assert(t[0].label().startswith("SW."))
        t.set_label(t[0].label())
        del t[0]
    elif t.label() == "string":     # remove, annotate
        # Record the labels seen under "string" nodes in the enclosing
        # scope's collection.
        afterstring.update(set([tc.label() for tc in t]))
        assert(len(t) == 1)
        assert(len(t[0]) == 0)
        t.set_label(f"arg:{t[0].label()}")
        del t[0]
    # Shorten the fully qualified SimpleWorld prefix to "SW:".
    if t.label().startswith("edu.stanford.nlp.sempre.overnight.SimpleWorld."):
        t.set_label("SW:" + t.label()[len("edu.stanford.nlp.sempre.overnight.SimpleWorld."):])
    if t.label() == "SW:getProperty":
        assert(len(t) == 2)
        if self.full_simplify:
            # The second argument becomes the parent and the first argument
            # is appended as its last child.
            ret = simplify_tree(t[1])
            ret.append(simplify_tree(t[0]))
        else:
            children = [simplify_tree(te) for te in t]
            ret = t
            ret[:] = children
        return ret
    elif t.label() == "SW:singleton":
        assert(len(t) == 1)
        assert(len(t[0]) == 0)
        # The wrapper is dropped; without full simplification the child is
        # tagged with a "singleton:" prefix instead.
        if not self.full_simplify:
            t[0].set_label(f"singleton:{t[0].label()}")
        return simplify_tree(t[0])
    elif t.label() == "SW:ensureNumericProperty":
        assert(len(t) == 1)
        # assert(len(t[0]) == 1)
        # t[0][0].set_label(f"numeric:{t[0][0].label()}")
        if self.full_simplify:
            ret = simplify_tree(t[0])
        else:
            ret = t
            ret[:] = [simplify_tree(te) for te in ret]
        return ret
    elif t.label() == "SW:ensureNumericEntity":
        assert(len(t) == 1)
        if self.full_simplify:
            ret = simplify_tree(t[0])
        else:
            ret = t
            ret[:] = [simplify_tree(te) for te in ret]
        return ret
    elif t.label() == "SW:aggregate":
        assert(len(t) == 2)
        # The aggregator argument (avg/sum) becomes the parent, relabelled
        # with an "agg:" prefix, of the simplified second argument.
        ret = simplify_tree(t[0])
        assert(ret.label() in ["arg:avg", "arg:sum"])
        assert(len(ret) == 0)
        ret.set_label(f"agg:{ret.label()}")
        ret.append(simplify_tree(t[1]))
        return ret
    else:
        # Any other node keeps its label; only its children are simplified.
        t[:] = [simplify_tree(tc) for tc in t]
        return t
def __call__(self, x:Tree):
    """Return a copy of ``x`` in which subtrees are randomly replaced by a mask node.

    One random number is drawn per visited node: when ``self.p`` exceeds it,
    the whole subtree becomes a childless ``self.mask_symbol`` node;
    otherwise the node is kept and its children are processed in order.
    """
    masked = self.p > self.rng.random_sample()
    if masked:
        return Tree(self.mask_symbol, [])
    kept_children = [self(child) for child in x]
    return Tree(x.label(), kept_children)
""" Recursively turn a tree into a binary tree. """ if isinstance(tree, str): return tree elif len(tree) == 1: return binarize(tree[0]) else: label = tree.label() return reduce(lambda x, y: Tree(label, (binarize(x), binarize(y))), tree) #return reduce(lambda x, y: (binarize(x), binarize(y)), tree) t = Tree.fromstring( '(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (NP (DT an) (NN example)) (PP (IN of) (NP (NN tokenziation)))))))' ) bt = binarize(t) import re import string from stanfordcorenlp import StanfordCoreNLP from nltk import Tree from functools import reduce ''' regex = re.compile('[%s]' % re.escape(string.punctuation)) def parse_sentence(sentence): nlp = StanfordCoreNLP(r'./stanford-corenlp-full-2018-02-27') sentence = regex.sub('', sentence)
def test_process(self):
    """Check ``NLTK.process`` under a sequence of preprocessing settings.

    One ``NLTK`` instance is reconfigured step by step, so every assertion
    depends on the flags left behind by the previous steps; the order of
    the statements matters.
    """
    #Test for only stop words removal
    nltka = NLTK(stopwords_removal=True, url_tagging=True)
    nltka.set_lang("")
    self.assertEqual(
        nltka.process(
            "The striped bats are hanging on their feet for the best"),
        ["striped", "bats", "hanging", "feet", "best"])

    #Test for only stemming
    nltka.stemming = True
    nltka.stopwords_removal = False
    self.assertEqual(
        nltka.process(
            "My name is Francesco and I am a student at the University of the city of Bari"
        ), [
            "my", "name", "is", "francesco", "and", "i", "am", "a", "student",
            "at", "the", "univers", "of", "the", "citi", "of", "bari"
        ])
    nltka.stemming = False

    #Test for only lemmatization
    nltka.lemmatization = True
    self.assertEqual(
        nltka.process(
            "The striped bats are hanging on their feet for best"), [
                "The", "strip", "bat", "be", "hang", "on", "their", "foot",
                "for", "best"
            ])

    #Test for lemmatization with multiple whitespaces removal
    # NOTE(review): this input is identical to the previous case and holds
    # no repeated whitespace; confirm whether runs of spaces were intended.
    nltka.strip_multiple_whitespaces = True
    self.assertEqual(
        nltka.process(
            "The striped bats are hanging on their feet for best"
        ), [
            "The", "strip", "bat", "be", "hang", "on", "their", "foot", "for",
            "best"
        ])

    #Test for lemmatization with multiple whitespaces removal and URL tagging
    nltka.url_tagging = True
    self.assertEqual(
        nltka.process(
            "The striped http://facebook.com bats https://github.com are http://facebook.com hanging on their feet for best http://twitter.it"
        ), [
            "The", "strip", "<URL>", "bat", "<URL>", "be", "<URL>", "hang",
            "on", "their", "foot", "for", "best", "<URL>"
        ])

    # Test for lemmatization, multiple whitespaces removal, URL tagging and stemming
    # (the expected tokens, including the URL placeholder, are lower-case here)
    nltka.stemming = True
    self.assertEqual(
        nltka.process(
            "The striped http://facebook.com bats https://github.com are http://facebook.com hanging on their feet for best http://twitter.it"
        ), [
            "the", "strip", "<url>", "bat", "<url>", "be", "<url>", "hang",
            "on", "their", "foot", "for", "best", "<url>"
        ])

    # Test for lemmatization, multiple whitespaces removal, URL tagging, stemming, stop words removal
    nltka.stopwords_removal = True
    self.assertEqual(
        nltka.process(
            "The striped http://facebook.com bats https://github.com are http://facebook.com hanging on their feet for best http://twitter.it"
        ), [
            "strip", "<url>", "bat", "<url>", "<url>", "hang", "foot", "best",
            "<url>"
        ])

    # Test for named entity recognition alone: the other token-level
    # transformations are switched off and the result is compared to a Tree.
    nltka.named_entity_recognition = True
    nltka.stopwords_removal = False
    nltka.stemming = False
    nltka.lemmatization = False
    result = nltka.process(
        "Facebook was fined by Hewlett Packard for spending 100€ to buy Cristiano Ronaldo from Juventus"
    )

    self.assertEqual(
        result,
        Tree('S', [
            Tree('PERSON', [('Facebook', 'NNP')]), ('was', 'VBD'),
            ('fined', 'VBN'), ('by', 'IN'),
            Tree('PERSON', [('Hewlett', 'NNP'), ('Packard', 'NNP')]),
            ('for', 'IN'), ('spending', 'VBG'), ('100€', 'CD'), ('to', 'TO'),
            ('buy', 'VB'),
            Tree('PERSON', [('Cristiano', 'NNP'), ('Ronaldo', 'NNP')]),
            ('from', 'IN'),
            Tree('GPE', [('Juventus', 'NNP')])
        ]))
type=int, default=1, help= "Number of processors to use, -1 means use all processors, in Windows, multiprocessing doesn't work, go for n_jobs = 1" ) args = parser.parse_args() print('Reading the training corpus :') filename = args.data_file corpus = read_corpus_pcfg(filename) print('Binarizing the trees :') trees = [Tree.fromstring(sentence) for sentence in corpus] for tree in trees: tree.chomsky_normal_form(horzMarkov=2) tree.collapse_unary(True, True) train_frac = args.train_frac print('Training on %.2f %% of the data: ' % (100 * train_frac)) size = len(corpus) train_size = int(train_frac * size) train, test = corpus[:train_size], corpus[train_size:] train_t, test_t = trees[:train_size], trees[train_size:] if not args.test_mode: entername = 'frac_data_sentences.txt'
def parse(self, doc):
    """Run the model on ``doc`` and return its first predicted tree.

    The document is converted to a batch on ``self.device`` and passed to
    ``self.forward``; the first entry under the output's ``'tree'`` key is
    parsed from bracketed notation with ``Tree.fromstring``.
    """
    model_output = self.forward(doc.to_batch(self.device))
    bracketed = model_output['tree'][0]
    return Tree.fromstring(bracketed)
def testing():
    """Round-trip a two-child tree through ``TreesConverter`` and print it.

    Prints the original tree, its converted form and the result of
    converting back, in that order, for visual comparison.
    """
    converter = TreesConverter()
    original = Tree('?', [Tree('a', []), Tree('b', [])])
    forward = converter.convert_tree(original)
    restored = converter.reverse_convert_tree(forward)
    print(original, forward, restored)
def to_nltk_tree(node):
    """Convert ``node`` and everything below it into an nltk ``Tree``.

    A node without left or right dependents is returned as its bare
    ``orth_`` text; any other node becomes a ``Tree`` labelled with its
    ``orth_`` whose children are the converted ``node.children``.
    """
    has_dependents = node.n_lefts + node.n_rights > 0
    if not has_dependents:
        return node.orth_
    subtrees = list(map(to_nltk_tree, node.children))
    return Tree(node.orth_, subtrees)
from nltk import Tree
from nltk import induce_pcfg
import pickle
from nltk import Nonterminal

terminal_dict = {}
non_terminal_dict = {}

# Read the training trees, turning square brackets into round ones before
# the lines are handed to Tree.fromstring.
with open("TrainingTree.txt", 'r') as f:
    lines = [raw.replace('[', '(').replace(']', ')') for raw in f]

# Collect the productions of every tree into one flat list.
rules = []
for line in lines:
    t = Tree.fromstring(line)
    rules.extend(t.productions())

# Induce a PCFG with start symbol S from the collected productions.
S = Nonterminal('S')
grammar = induce_pcfg(S, rules)
print(grammar)

# Persist the grammar for later use.
with open("grammar.pkl", 'wb') as pickle_file:
    pickle.dump(grammar, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
# Demo: chunk one POS-tagged sentence twice - first through a RegexpParser
# grammar, then by applying the same chunk/chink rules by hand.
from nltk import Tree, RegexpChunkParser
from nltk.chunk import RegexpParser
from nltk.chunk.regexp import ChunkString, ChunkRule, ChinkRule

# The sentence as (word, POS tag) pairs.
s = [('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'), ('many', 'JJ'),
     ('chapters', 'NNS')]

# forth: let RegexpParser apply the grammar (a chunk rule in {...} followed
# by a chink rule in }...{) and print the resulting tree.
# NOTE(review): the stage name and both rules sit on one line inside this
# literal; confirm the rule lines are meant to be newline-separated.
chunker = RegexpParser(r''' NP: {<DT><NN.*><.*>*<NN.*>} }<VB.*>{''')
print(chunker.parse(s))

# back: wrap the same sentence in a flat tree and apply the two rules one
# at a time, printing the ChunkString after each step.
t = Tree('S', s)
cs = ChunkString(t)
print(cs)
# Chunk rule; the second argument is its human-readable description.
ur = ChunkRule('<DT><NN.*><.*>*<NN.*>', 'chunk determiners and nouns')
ur.apply(cs)
print(cs)
# Chink rule applied to the verb tags.
ir = ChinkRule('<VB.*>', 'chink verbs')
ir.apply(cs)
print(cs)
# Turn the annotated ChunkString back into a tree structure.
print(cs.to_chunkstruct())
# cs.to_chunkstruct().draw()

# Build a parser directly from the two rule objects (not used further in
# this snippet).
chunker = RegexpChunkParser([ur, ir])
def get_labels_from_tree(x: Tree):
    """Return the set of labels of ``x`` and of every node below it.

    Every child of ``x`` is visited recursively, so each child must itself
    provide ``label()`` and be iterable.
    """
    child_label_sets = [get_labels_from_tree(subtree) for subtree in x]
    return {x.label()}.union(*child_label_sets)
def is_proper_name(noun_phrase: nltk.Tree):
    """Tell whether every leaf of ``noun_phrase`` is a title-cased word.

    Leaves are unpacked as ``(word, tag)`` pairs. A word that is not
    title-cased is only tolerated when its tag is in the exemption list,
    which is currently empty, so any such word makes the result ``False``.
    An empty phrase yields ``True``.
    """
    exempt_tags = []
    return all(
        word.istitle() or tag in exempt_tags
        for word, tag in noun_phrase.leaves()
    )
else: ret.append(t) prev_t = t return ret with gzip.open(sys.argv[1]) as f: while 1: line = f.readline() if not line: break line = line.strip() if MODE == HEAD: hl = remove_digits(fix_paren(line)) headline = Tree.fromstring(hl).leaves() MODE = NEXT if MODE == TEXT: article_parse.append(remove_digits(fix_paren(line))) if MODE == SENT and re.match(r'<token id=\"[\d]+\">', line): words.append(f.readline().strip().replace("<word>", "").replace( "</word>", "")) lemmas.append(f.readline().strip().replace("<lemma>", "").replace( "</lemma>", "")) for _ in range(3): f.readline() ners.append(f.readline().strip().replace("<NER>", "").replace("</NER>", ""))
def to_nltk_tree(node):
    """Convert ``node`` and its descendants into an nltk ``Tree``.

    Each node is rendered with ``tok_format``. A node without left or right
    dependents is returned as that rendered value alone; otherwise the
    rendered value labels a ``Tree`` holding the converted children.
    """
    rendered = tok_format(node)
    if node.n_lefts + node.n_rights > 0:
        return Tree(rendered, list(map(to_nltk_tree, node.children)))
    return rendered
"??", "").replace("- ", "").replace(">>", "") + " " sentences = all.replace("-", "").replace(";", ".").replace( "?", ".").replace("!", ".").lower().split(".") for sentence in sentences: sentence = unicode(sentence, errors='ignore') total += 1 output = nlp_stan.annotate( sentence, properties={ 'annotators': 'tokenize,ssplit,pos,depparse,parse', 'outputFormat': 'json' }) t = Tree.fromstring(output['sentences'][0]['parse']) height = t.height() - 1 total_comp += height doc = nlp(sentence) sub_toks = [ tok for tok in doc if (tok.dep_ in SUBJECTS) ] obj_toks = [ tok for tok in doc if (tok.dep_ in OBJECTS) ] for sub in sub_toks: sub = str(sub) zir_sub = sub.split(" ") for zir in zir_sub: if zir in female_cat:
def to_nltk_tree(node):
    """Build an nltk ``Tree`` mirroring the structure below ``node``.

    Nodes with no left or right dependents collapse to their ``orth_`` text;
    every other node yields a ``Tree`` labelled with ``orth_`` over its
    recursively converted children.
    """
    if not (node.n_lefts + node.n_rights > 0):
        # Leaf: just the token text.
        return node.orth_
    return Tree(node.orth_, [to_nltk_tree(kid) for kid in node.children])
def _rename_tags(self, tree):
    """
    Rename the tags and restructure the tree to cnf with Roark factorization.
    Find duplicates in leaves and make them unique.
    Add information about heads (only when ``self._lexicalized`` is True).

    Every terminal visited also records ``(word, tag)`` - and
    ``(lemma, tag)`` when the lemma differs - in ``self._tagged_words``.

    :param tree: tree to convert; when lexicalized, its labels are modified
        in place before the conversion.
    :return: a new ``Tree`` parsed from the rewritten bracketed string.
    """
    def _clean(text):
        """Apply the ``pattern`` / ``replacing`` substitution to ``text``."""
        return pattern.sub(lambda m: replacing[re.escape(m.group(0))], text)

    def _projects_head(phrase_label, pos):
        """Tell whether a child tagged ``pos`` heads a ``phrase_label`` phrase."""
        return (PHRASES.get(phrase_label) and pos
                and pos[0] in PHRASES[phrase_label])

    def _put_head_on_subtree(tree, leaf_nodes):
        """
        Put the head information on the correct subtree labels.

        :param tree: subtree whose labels are annotated in place.
        :param leaf_nodes: leaves of the whole tree, used to recognise
            subtrees that directly dominate a word.
        :return: ``(head, head_pos)`` of this subtree, or ``(None, None)``.
        """
        oldlabel = tree.label()
        # Only the part of the label before the first "-" selects the rule.
        simplified_label = re.split("-", oldlabel)[0]
        head, head_pos = None, None
        for subtree in tree:
            #subtree is a terminal
            if subtree[0] in leaf_nodes:
                #create head and make head lowercase
                temp_head, temp_head_pos = subtree[0].lower(), subtree.label()
                #check whether the head should project higher up
                if _projects_head(simplified_label, temp_head_pos):
                    head, head_pos = temp_head, temp_head_pos
                temp_newlabel = "{}|SPL{}#MID{}|SPL".format(
                    temp_head_pos, temp_head, temp_head_pos)
                subtree.set_label(temp_newlabel)
            #subtree is not a terminal
            else:
                temp_head, temp_head_pos = _put_head_on_subtree(
                    subtree, leaf_nodes)
                if _projects_head(simplified_label, temp_head_pos):
                    head, head_pos = temp_head, temp_head_pos
        #if we found a head in one of the subtrees
        if head:
            #create new label with head info
            newlabel = "{}|SPL{}#MID{}|SPL".format(oldlabel, head, head_pos)
            tree.set_label(newlabel)
        return head, head_pos

    def _recurse_tags(tree, parent, sibling, branches):
        """
        Return the bracketed string for ``tree`` after factorization.

        :param parent: label of the parent rule, or None at the root.
        :param sibling: label of the left sibling, or None on a left branch.
        :param branches: queue of 3rd+ children still to be nested.
        """
        # Labels are re-assigned to themselves, as in the original code.
        current_label = tree.label()
        tree.set_label(current_label)
        for subtree in tree:
            try:
                current_label = subtree.label()
            except AttributeError:
                pass
            else:
                subtree.set_label(current_label)

        # Check if the queue of branches that need to be processed
        # is filled. If this is the case, processing the branches
        # takes top priority.
        if branches:
            newparent = str(tree.label())
            newlabel = "{}^{}".format(parent, sibling)
            lefttree = branches.pop(0)
            # Construct both branch sides of the tree
            leftside = _recurse_tags(tree, newparent, None, [])
            rightside = _recurse_tags(lefttree, newlabel, newparent, branches)
            return "({} {} {})".format(newlabel, leftside, rightside)

        # Else if the current rule in the tree maps to more than
        # 2 children, put the branches in a queue.
        if len(tree) > 2:
            # branches on the queue are all branches that will be nested:
            # These are the 3rd+ branch in a tree. The new parent is the
            # current tree label.
            branches = [tree[i] for i in range(2, len(tree))]
            newparent = str(tree.label())
            newsibling = str(tree[0].label())
            # The rightmost branch will be written like a normal tree.
            # The left side will get nested and labels will be rewritten.
            leftside = _recurse_tags(tree[0], newparent, None, [])
            rightside = _recurse_tags(tree[1], newparent, newsibling, branches)
            # If the current node is on the left hand side or does
            # not have a parent, do not change the label. Else,
            # reformat the label to <parent>^<sibling>
            if parent is None or sibling is None:
                return "({} {} {})".format(
                    str(tree.label()), leftside, rightside)
            newlabel = "{}^{}".format(parent, sibling)
            return "({}({} {} {}))".format(
                newlabel, str(tree.label()), leftside, rightside)

        # If a rule is binary, check for whether we are on
        # the right or left branch of the tree.
        if len(tree) == 2:
            newparent = str(tree.label())
            newsibling = str(tree[0].label())
            # Recursion, change tags of the subtrees
            leftside = _recurse_tags(tree[0], newparent, None, [])
            rightside = _recurse_tags(tree[1], newparent, newsibling, [])
            # If on the left branch, the new label will be the
            # current label of the tree.
            if sibling is None:
                return "({} {} {})".format(newparent, leftside, rightside)
            # If on the right branch, the new label will be
            # reformatted to <parent>^<sibling>, which will
            # then be rewritten as the current label.
            newlabel = "{}^{}".format(parent, sibling)
            return "({} ({} {} {}))".format(
                newlabel, newparent, leftside, rightside)

        # If a rule is unary, first check whether the rule leads
        # to a nonterminal symbol. The leaves are only needed here, so
        # they are not computed for the other rule shapes.
        if tree[0] not in tree.leaves():
            subtree = _recurse_tags(tree[0], tree.label(), None, [])
            if sibling is None:
                return "({} {})".format(tree.label(), subtree)
            newlabel = "{}^{}".format(parent, sibling)
            return "({} ({} {}))".format(newlabel, tree.label(), subtree)

        # Else the unary rule leads to a terminal symbol.
        tag_label = tree.label()
        new_word = tree[0].lower()
        #split because terminal nodes should not store infor about lex heads
        word_label = re.split(r"\|SPL", tag_label)[0]
        surface = _clean(new_word)
        lemma = lemmatize(surface, word_label)
        self._tagged_words.update([(lemma, word_label)])
        if lemma != surface:
            self._tagged_words.update([(surface, word_label)])
        # Duplicated words get a running "#<n>" suffix to stay unique.
        if new_word in set_of_dupls:
            new_word = "".join([new_word, "#", str(unique_counts[0])])
            unique_counts[0] += 1
        if sibling is None:
            return "({} {})".format(_clean(tag_label), _clean(new_word))
        newlabel = "{}^{}".format(parent, sibling)
        return "({} ({} {}))".format(newlabel, tag_label, _clean(new_word))

    # Start recursion
    # The leaf bookkeeping is needed in both modes; binding leaf_nodes only
    # in the lexicalized branch raised a NameError otherwise.
    leaf_nodes = tree.leaves()
    if self._lexicalized == True:  # noqa: E712 - exact comparison kept
        _put_head_on_subtree(tree, leaf_nodes)
    temp_counter = Counter(x.lower() for x in leaf_nodes)
    set_of_dupls = {x for x in temp_counter if temp_counter[x] > 1}
    # One-element list so the nested function can update the counter.
    unique_counts = [0]
    string = _recurse_tags(tree, None, None, [])
    return Tree.fromstring(string)
if len(item) < 1: i = i + 1 sub_trees.append("") else: sub_trees[i] = sub_trees[i] + " " + item i = 0 for item in sub_trees: sub_trees[i] = ' '.join(item.split()) i = i + 1 sub_trees = [t for t in sub_trees if t != ''] return sub_trees target = "I know this has already been answered, but I wanted to share a potentially better looking way to call Popen via the use of from x import x and functions." sub_tree = getSentenceRelations("S", target) root = parser.raw_parse(target) tree_string = list(root)[0] tree_string = str(tree_string).replace("\n", "") tree_string = ' '.join(tree_string.split()) root = Tree.fromstring(tree_string) list(root)[0].pretty_print() for item in sub_tree: tree = Tree.fromstring(item) list(tree)[0].pretty_print()
from nltk import RegexpParser, Tree
from pos_tagged_oz import pos_tagged_oz

# Chunk grammar: an "AN" chunk is a JJ tag immediately followed by an NN tag.
chunk_grammar = "AN: {<JJ><NN>}"

# Parser built from that grammar.
chunk_parser = RegexpParser(chunk_grammar)

# Chunk the POS-tagged sentence stored at index 282 and show the result.
tagged_sentence = pos_tagged_oz[282]
scaredy_cat = chunk_parser.parse(tagged_sentence)
print(scaredy_cat)

# Re-read the chunked sentence from its string form and pretty-print it.
reparsed = Tree.fromstring(str(scaredy_cat))
reparsed.pretty_print()
def generate_parse_tree(node):
    """Return an nltk ``Tree`` (or a plain string) for ``node``.

    When the node has no left or right dependents its ``orth_`` text is
    returned directly. Otherwise a ``Tree`` labelled with ``orth_`` is built
    from the recursively converted ``node.children``.
    """
    dependents = node.n_lefts + node.n_rights
    if dependents > 0:
        converted = []
        for child in node.children:
            converted.append(generate_parse_tree(child))
        return Tree(node.orth_, converted)
    return node.orth_
def tree_to_ptree(tree: nltk.Tree):
    """Convert ``tree`` to a ``PTree`` by round-tripping through its string form."""
    return PTree.fromstring(tree.__str__())
def main():
    """Estimate a probabilistic grammar from bracketed trees.

    Each non-blank line of the input file is parsed with ``Tree.fromstring``
    and its productions are counted. One line is written per distinct rule,
    in order of first appearance::

        <rule with single quotes removed> # <probability>

    The probability is the rule's count divided by the total count of rules
    sharing its left-hand side (the rule's first token), rounded to 7
    decimals.

    Note: despite the help texts, the input file is read, not ignored.
    """
    parser = argparse.ArgumentParser(
        description=
        "ignore input; make a demo grammar that is compliant in form",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    addonoffarg(parser, 'debug', help="debug mode", default=False)
    parser.add_argument("--infile",
                        "-i",
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="input file (ignored)")
    parser.add_argument("--outfile",
                        "-o",
                        nargs='?',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="output file (grammar)")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    infile = prepfile(args.infile, 'r')
    outfile = prepfile(args.outfile, 'w')

    # Local imports, as in the original: the rest of the file is not known
    # to import these names at module level.
    from collections import Counter
    from nltk import Tree

    rule_counts = Counter()  # occurrences of each distinct rule
    head_counts = Counter()  # occurrences of each left-hand side

    for line in infile:
        # A blank line is not a tree; Tree.fromstring would raise on it.
        if not line.strip():
            continue
        # Compute the productions once per tree, not once per production.
        for production in Tree.fromstring(line).productions():
            rule = str(production)
            rule_counts[rule] += 1
            head_counts[rule.split()[0]] += 1

    for rule, count in rule_counts.items():
        probability = round(float(count) / head_counts[rule.split()[0]], 7)
        outfile.write(rule.replace('\'', '') + " # " + str(probability))
        outfile.write('\n')