def get_example(self,  # type: ignore
                tree: ParentedTree,
                ancestor: str):
    """
    Given a ParentedTree, extract the labels of the parents, grandparents,
    or greatgrandparents.

    Parameters
    ----------
    tree: ParentedTree
        ParentedTree to extract the example from.
    ancestor: str
        Whether the labels should be the parent, grandparent, or
        great-grandparent of each leaf. The value is forwarded unchanged to
        ``self._get_label`` and to every recursive call.

    Returns
    -------
    tokens, labels
        ``tokens`` is ``tree.leaves()``. ``labels`` holds the result of
        ``self._get_label`` for every ParentedTree descendant that contains
        no further subtrees, gathered depth-first in child order.
    """
    tokens = tree.leaves()
    labels: List[str] = []
    for child in tree:
        # Children that are not ParentedTree instances contribute no label
        # of their own.
        if not isinstance(child, ParentedTree):
            continue
        if len(list(child.subtrees())) > 1:
            # The child still has nested subtrees: descend and keep only the
            # labels; the tokens were already taken from the root above.
            # Honour the caller-supplied ``ancestor`` rather than silently
            # substituting ``self._ancestor``.
            labels.extend(self.get_example(child, ancestor)[1])
        else:
            labels.append(self._get_label(child, ancestor))
    return tokens, labels
def traverse_and_store(self, tree: ParentedTree, parse_tree_stored: List[Dict]):
    """Record the phrase spanned by ``tree`` and, recursively, by its subtrees.

    Each leaf is parsed as ``<word>_<index>``: the text before the last
    underscore is the word and the text after it is converted with ``int``.

    Parameters
    ----------
    tree: ParentedTree
        Tree (or subtree) to inspect.
    parse_tree_stored: List[Dict]
        Accumulator that is appended to in place. An entry with the keys
        ``phrase_label``, ``phrase``, ``ngram`` and ``indices`` is added when
        ``tree.height() > self.TREE_HEIGHT`` and the number of leaves is
        below ``self.NGRAM_LIMIT``.

    Returns
    -------
    The same ``parse_tree_stored`` list that was passed in.

    Raises
    ------
    ValueError
        If the part of a leaf after its last underscore is not an integer.
    """
    label = tree.label()

    words = []
    indices = []
    for leaf in tree.leaves():
        # Split on the LAST underscore only, so that a word which itself
        # contains '_' is kept whole instead of being cut at its first '_'.
        parts = leaf.rsplit('_', 1)
        words.append(parts[0])
        indices.append(int(parts[-1]))

    ngram_info = len(words)
    if tree.height() > self.TREE_HEIGHT and ngram_info < self.NGRAM_LIMIT:
        parse_tree_stored.append({
            'phrase_label': label,
            'phrase': " ".join(words),
            'ngram': ngram_info,
            'indices': indices,
        })

    for subtree in tree:
        # Only ParentedTree children are descended into.
        if isinstance(subtree, ParentedTree):
            self.traverse_and_store(tree=subtree, parse_tree_stored=parse_tree_stored)
    return parse_tree_stored
import sys simpfi = open(sys.argv[2]).readlines() compfi = open(sys.argv[1]).readlines() #print Tree(simpfi[0]).leaves() #for chunk in common_chunks(Tree(simpfi[0]).leaves(), Tree(compfi[0]).leaves()): # print chunk # print [Tree(simpfi[0]).leaves()[tup[0]] for tup in chunk] #print longest_common_substring(Tree(simpfi[0]).leaves(), Tree(compfi[0]).leaves()) for i in xrange(0, len(simpfi)): simptree = ParentedTree(simpfi[i].lower()) comptree = ParentedTree(compfi[i].lower()) chunk_list = get_substrings(comptree.leaves(), simptree.leaves(), ([''], (0, 0), (0, 0)), []) #print chunk_list #print comptree alignlist = [] for chunk in chunk_list: #print chunk comprange = chunk[1] simprange = chunk[2] simpidx = simprange[0] for j in xrange(comprange[0], comprange[1]): alignlist.append(str(simpidx) + '-' + str(j)) simpidx += 1 try: compposition = comptree.treeposition_spanning_leaves( comprange[0], comprange[1])
def get_terminals(ptree: ParentedTree) -> list:
    """Return the subtrees of ``ptree`` whose ``subtrees()`` yields one item.

    A node is kept when iterating ``node.subtrees()`` produces exactly one
    element (presumably the node itself, i.e. it has no nested subtree --
    confirm against the NLTK ``Tree.subtrees`` documentation).

    The result is sanity-checked with an ``assert``: the number of nodes
    kept must equal ``len(ptree.leaves())``.
    """

    def _yields_single_subtree(node) -> bool:
        return len(list(node.subtrees())) == 1

    terminals = list(ptree.subtrees(filter=_yields_single_subtree))
    leaf_count = len(ptree.leaves())
    assert leaf_count == len(terminals)  # Pull out to unit test?
    return terminals
align_c2s = {} align_s2c = {} for x in alignfi[i].split(): c_num = int(x.split('-')[1]) s_num = int(x.split('-')[0]) align_c2s[c_num] = align_c2s.setdefault(c_num, []) + [s_num] align_s2c[s_num] = align_s2c.setdefault(s_num, []) + [c_num] comptree = ParentedTree(compfi[i]) simptree = ParentedTree(simpfi[i]) if DEBUG: print '######################' print 'comptree:', comptree print 'simptree:', simptree print 'c2s align:', align_c2s complength = len(comptree.leaves()) simplength = len(simptree.leaves()) if complength > simplength: maxlength = complength longdict = align_c2s chunk_list = [] mychunk = [] for j in xrange(0, maxlength): if j in longdict: mychunk.append((j, longdict[j][0])) else: continue try: if longdict[j + 1] != [longdict[j][0] + 1]: chunk_list.append(mychunk) mychunk = []