def lexical_rules(tree, cutoff=0):
    """Count lexical production rules occurring in a parse tree.

    Walks every subtree whose hyphen-stripped tag is valid, has at least
    one child, and is not one of the excluded tags, then records one
    (parent_tag, child_tags) rule per qualifying subtree.

    Args:
        tree: an NLTK-style tree whose nodes expose a `.node` tag string
            and support `.subtrees(filter)`.
        cutoff: unused; retained for interface compatibility with callers.

    Returns:
        dict mapping (parent_tag, (child_tag, ...)) to occurrence counts.
    """
    bad_tags = ('X', 'FRAG', 'ROOT')
    rules = dict()
    # Filter guarantees each visited subtree has a valid, non-excluded
    # simplified tag and at least one child.
    for subtree in tree.subtrees(
            lambda x: is_valid_tag(x.node)
            and len(x) > 0
            and x.node.split('-')[0] not in bad_tags):
        # Fix: the original comprehension also tested `len(subtree) > 0`
        # per child — loop-invariant and always true given the filter
        # above, so it has been removed.
        productions = [n.node.split('-')[0] for n in subtree
                       if not isinstance(n, str) and is_valid_tag(n.node)]
        # Skip empty production lists and any rule producing an excluded tag.
        if productions and not set(bad_tags).intersection(productions):
            prod = (subtree.node.split('-')[0], tuple(productions))
            rules[prod] = rules.get(prod, 0) + 1
    return rules
def lexical_rules(tree, cutoff=0):
    """Tally (parent, children) tag productions found in `tree`.

    Returns a dict keyed by (simplified_parent_tag, tuple_of_child_tags)
    with occurrence counts as values. Subtrees tagged X/FRAG/ROOT are
    skipped, as is any rule that produces one of those tags.
    """
    excluded = ("X", "FRAG", "ROOT")

    def keep(candidate):
        # Valid tag, at least one child, simplified tag not excluded.
        return (is_valid_tag(candidate.node)
                and len(candidate) > 0
                and candidate.node.split("-")[0] not in excluded)

    counts = {}
    for sub in tree.subtrees(keep):
        child_tags = []
        for child in sub:
            if isinstance(child, str):
                continue
            if is_valid_tag(child.node) and len(sub) > 0:
                child_tags.append(child.node.split("-")[0])
        if len(child_tags) > 0 and not set(excluded).intersection(child_tags):
            key = (sub.node.split("-")[0], tuple(child_tags))
            counts[key] = counts.get(key, 0) + 1
    return counts
def get_leaf_transitions():
    """Return leaf-tag transition counts for the Penn Treebank corpus,
    cached on disk under cache/penn_leaf_transition_counts.data.

    Fast path unpickles and returns the cached data. On a cache miss
    (IOError/EOFError) it walks the treebank parses to rebuild counts.

    NOTE(review): the rebuild path looks broken — see inline comments.
    """
    file_name = 'penn_leaf_transition_counts.data'
    try:
        # Fast path: return previously pickled counts from the cache file.
        f = open(os.path.join('cache', file_name), 'rb')
        data = pickle.load(f)
        f.close()
        return data
    except (IOError, EOFError):
        from tag_utils import is_valid_tag
        cmd_utils.log("Building leaf counts from Penn Treebank corpus", 1)
        f = open(os.path.join('cache', file_name), 'wb')
        for sentence in nltk.corpus.treebank.parsed_sents():
            # Leaf-level subtrees: first child is a raw token string.
            leaves = list(
                sentence.subtrees(
                    lambda x: len(x) > 0 and isinstance(x[0], basestring)))
            # NOTE(review): `n.node not in is_valid_tag(n[0].node)` does a
            # membership test against is_valid_tag's result — presumably a
            # boolean — and `n[0]` is a string here, so `n[0].node` would
            # raise AttributeError; both look like bugs — verify intent.
            leaves = [
                n[0].node.split("-")[0] for n in leaves
                if n.node not in is_valid_tag(n[0].node)
            ]
            # NOTE(review): `leaves` is rebuilt each iteration and then
            # discarded — nothing from this loop feeds the pickled counts.
            leaves = ['START'] + leaves
        cmd_utils.log("Finished building tag counts", 1)
        # NOTE(review): store_transitions._counts is never updated in this
        # function; the value pickled/returned here comes entirely from
        # module state elsewhere — confirm this is intended.
        pickle.dump(store_transitions._counts, f)
        f.close()
        return store_transitions._counts
def transitions_in_tree(tree):
    """Collect simplified child-tag sequences from every subtree of `tree`.

    Args:
        tree: an NLTK-style tree; subtrees expose a `.node` tag string.

    Returns:
        list of simplified tag sequences (as returned by `simplify_tags`),
        one per subtree, keeping only sequences longer than one element —
        a single tag carries no transition information.
    """
    transitions = []
    for subtree in tree.subtrees():
        children = []
        # Fix: iterate children directly instead of indexing with
        # range(len(...)); use isinstance rather than the fragile
        # `node.__class__ == str` exact-class comparison.
        for node in subtree:
            if isinstance(node, str):
                # Raw token leaf — only tagged child trees contribute.
                continue
            simple_node = simple_tag(node.node)
            if is_valid_tag(simple_node):
                children.append(simple_node)
        simplified_transitions = simplify_tags(children)
        if len(simplified_transitions) > 1:
            transitions.append(simplified_transitions)
    return transitions
def get_leaf_transitions():
    """Load leaf-tag transition counts for the Penn Treebank corpus from an
    on-disk cache, attempting a rebuild on a cache miss.

    NOTE(review): the cache-miss rebuild appears non-functional — the
    per-sentence work is discarded and foreign module state is pickled;
    see the inline notes below.
    """
    file_name = 'penn_leaf_transition_counts.data'
    try:
        # Cache hit: unpickle and return the stored counts.
        f = open(os.path.join('cache', file_name), 'rb')
        data = pickle.load(f)
        f.close()
        return data
    except (IOError, EOFError):
        from tag_utils import is_valid_tag
        cmd_utils.log("Building leaf counts from Penn Treebank corpus", 1)
        f = open(os.path.join('cache', file_name), 'wb')
        for sentence in nltk.corpus.treebank.parsed_sents():
            # Subtrees whose first child is a raw token (leaf-level tags).
            leaves = list(sentence.subtrees(lambda x: len(x) > 0 and isinstance(x[0], basestring)))
            # NOTE(review): membership test (`not in`) against what
            # is_valid_tag presumably returns (a boolean) is almost
            # certainly wrong, and `n[0]` is a string so `.node` on it
            # would raise — confirm the intended filter.
            leaves = [n[0].node.split("-")[0] for n in leaves if n.node not in is_valid_tag(n[0].node)]
            # NOTE(review): result is overwritten next iteration and never
            # accumulated into anything that gets pickled below.
            leaves = ['START'] + leaves
        cmd_utils.log("Finished building tag counts", 1)
        # NOTE(review): this function never writes store_transitions._counts;
        # it pickles and returns whatever state that module holds.
        pickle.dump(store_transitions._counts, f)
        f.close()
        return store_transitions._counts