def compare_trees(fG, fH):
    """Compare gold and hypothesized trees line by line.

    Lines are paired positionally (iteration stops at the shorter input).
    Each pair is parsed with ``tree.Tree().read`` and contributes two items
    to the result: a ``'TREE <n>:\\n'`` header (1-based) and the comparison
    string followed by a newline.

    Args:
        fG: iterable of gold tree lines.
        fH: iterable of hypothesized tree lines.

    Returns:
        List of output strings, two per compared pair.
    """
    report = []

    def compare(tG, tH):
        # Two nodes "agree" when label, word count and child count match and,
        # for nodes with two or more children, the first child spans the same
        # number of words in both trees.
        gold_kids = tG.ch
        hyp_kids = tH.ch
        agree = (
            tG.c == tH.c
            and len(tG.words()) == len(tH.words())
            and len(gold_kids) == len(hyp_kids)
            and (len(hyp_kids) < 2
                 or len(gold_kids[0].words()) == len(hyp_kids[0].words()))
        )
        if not agree:
            # Dump both subtrees in full at the point of disagreement.
            return (' <***GOLD***> ' + str(tG) + ' <**HYPOTH**> ' + str(tH)
                    + ' <**********>')
        pieces = [(' (' if len(gold_kids) > 0 else ' ') + tG.c]
        for k, hyp_child in enumerate(hyp_kids):
            gold_child = gold_kids[k] if k < len(gold_kids) else tree.Tree()
            pieces.append(compare(gold_child, hyp_child))
        if len(gold_kids) > 0:
            pieces.append(')')
        return ''.join(pieces)

    for number, (lineH, lineG) in enumerate(zip(fH, fG), start=1):
        hyp = tree.Tree()
        hyp.read(lineH)
        gold = tree.Tree()
        gold.read(lineG)
        report.append('TREE ' + str(number) + ':\n')
        report.append(compare(gold, hyp) + '\n')
    return report
def wrap_terms(t):
    """Give the terminals under *t* a unary parent labelled with a POS tag.

    Relies on names from the enclosing scope: ``pos`` (a list consumed from
    the front), ``term`` (predicate applied to each child) and ``tree``.

    * If *t* has more than one child, every child for which ``term`` is true
      is replaced by a new tree labelled with the next entry of ``pos`` whose
      only child is a fresh tree carrying the old child's label; every other
      child is processed recursively.
    * If *t* has exactly one child and that child has no children, *t* itself
      is relabelled with the next entry of ``pos``.
    * Otherwise *t* is left untouched.

    Returns:
        The same object *t*, modified in place.
    """
    kids = t.ch
    if len(kids) > 1:
        for slot, child in enumerate(kids):
            if term(child):
                t.ch[slot] = tree.Tree(pos.pop(0), [tree.Tree(child.c, [])])
            else:
                wrap_terms(child)
    elif len(kids) == 1 and len(kids[0].ch) == 0:
        t.c = pos.pop(0)
    return t
def trees2deps(trees_buffer, model_buffer, debug=False):
    """Convert bracketed trees into dependency lines using a head model.

    A ``pcfg_model.CondModel('R')`` is filled from *model_buffer*. For every
    tree, the head of each non-preterminal node is the child that maximises
    ``head_model[node.c][child]``; every other child depends on the index
    returned for that head. Preterminals are numbered from 1 in
    left-to-right order and index 0 stands for an artificial ``ROOT`` entry.

    Args:
        trees_buffer: iterable of tree lines, each parsed with ``Tree.read``.
        model_buffer: iterable of model lines fed to ``head_model.read``.
        debug: when true, node labels are annotated in place with head
            information and the annotated tree is emitted before its
            dependency lines.

    Returns:
        List of strings: one ``X(<head>-<headix>, <word>-<ix>)\\n`` entry per
        preterminal, followed by a ``'\\n'`` entry after each tree.
    """
    out = []
    # Rebound for every tree in the main loop; the nested helpers always see
    # the current binding through the closure.
    heads = {}
    deps = {}

    def is_preterminal(node):
        # Exactly one child, and that child has no children of its own.
        return len(node.ch) == 1 and node.ch[0].ch == []

    def collect_preterminals(node):
        if is_preterminal(node):
            return [node]
        found = []
        for child in node.ch:
            found += collect_preterminals(child)
        return found

    def assign_deps(node, governor, words):
        # Returns the 1-based position of the lexical head under `node`.
        if is_preterminal(node):
            deps[node] = governor
            return words.index(node) + 1
        heads[node] = max(node.ch, key=lambda cand: head_model[node.c][cand])
        if debug:
            heads[node].c = 'HEAD:' + heads[node].c + '->' + str(governor)
        others = node.ch[:]
        head_child = others.pop(others.index(heads[node]))
        head_ix = assign_deps(head_child, governor, words)
        for other in others:
            assign_deps(other, head_ix, words)
            if debug:
                other.c += '->' + str(head_ix)
        return head_ix

    head_model = pcfg_model.CondModel('R')
    for model_line in model_buffer:
        head_model.read(model_line)

    t = tree.Tree()
    for tree_line in trees_buffer:
        heads = {}
        deps = {}
        t.read(tree_line)
        preterminals = collect_preterminals(t)
        assign_deps(t, 0, preterminals)
        # Slot 0 is the artificial root so that dependency index 0 resolves.
        preterminals.insert(0, tree.Tree('X', [tree.Tree('ROOT', [])]))
        if debug:
            out.append(str(t) + '\n')
        for i in range(1, len(preterminals)):
            dependent = preterminals[i]
            governor_ix = deps[dependent]
            out.append(''.join([
                'X(', preterminals[governor_ix].ch[0].c, '-', str(governor_ix),
                ', ', str(dependent.ch[0].c), '-', str(i), ')\n',
            ]))
        out.append('\n')
    return out
def out(inputs):
    """Return the words of each tree line as a space-separated string.

    Lines that are empty after stripping, or whose first character is
    ``'%'``, are skipped. Every remaining line is read into a single reused
    ``tree.Tree`` and the result of its ``words()`` is joined with spaces.

    Args:
        inputs: iterable of tree lines.

    Returns:
        List of newline-terminated word strings, one per processed line.
    """
    reader = tree.Tree()
    results = []
    for raw in inputs:
        text = raw.strip()
        if text == '' or text[0] == '%':
            continue
        reader.read(text)
        results.append(' '.join(reader.words()) + '\n')
    return results
def out(inputs):
    """Re-emit each tree line after calling ``upper()`` on the parsed tree.

    Lines that are empty after stripping, or whose first character is
    ``'%'``, are skipped. A single ``tree.Tree`` instance is reused for all
    lines.

    Args:
        inputs: iterable of tree lines.

    Returns:
        List of newline-terminated ``str(tree)`` renderings.
    """
    reader = tree.Tree()
    results = []
    for raw in inputs:
        text = raw.strip()
        if text == '' or text[0] == '%':
            continue
        reader.read(text)
        reader.upper()
        results.append(str(reader) + '\n')
    return results
def out(inputs, n):
    """Keep only the tree lines whose trees have at most *n* words.

    Lines that are empty after stripping, or whose first character is
    ``'%'``, are skipped. A kept line is emitted in its stripped form with a
    newline appended (the tree is parsed only to count ``words()``).

    Args:
        inputs: iterable of tree lines.
        n: maximum number of words allowed (inclusive).

    Returns:
        List of the retained lines.
    """
    reader = tree.Tree()
    kept = []
    for raw in inputs:
        text = raw.strip()
        if text == '' or text[0] == '%':
            continue
        reader.read(text)
        if len(reader.words()) <= n:
            kept.append(text + '\n')
    return kept
def plug_leaves(trees, words):
    """Pair each tree line with a line of words and plug the words in.

    Tree lines and word lines are paired positionally (iteration stops at
    the shorter input). Each tree line is read into a single reused
    ``tree.Tree``, then ``plug_words`` is called with that tree and the
    whitespace-split words.

    Args:
        trees: iterable of tree lines.
        words: iterable of lines of whitespace-separated words.

    Returns:
        List of newline-terminated renderings of the tree after each
        ``plug_words`` call.
    """
    reader = tree.Tree()
    rendered = []
    for tree_line, word_line in zip(trees, words):
        reader.read(tree_line)
        tokens = word_line.split()
        plug_words(reader, tokens)
        rendered.append('%s\n' % reader)
    return rendered
def out(inputs):
    """Re-emit each tree line after pruning with a ``'$'``-label predicate.

    Lines that are empty after stripping, or whose first character is
    ``'%'``, are skipped. Each remaining line is read into a single reused
    ``tree.Tree`` and ``prune`` is called with a predicate that is true for
    nodes whose label ``c`` equals ``'$'``.

    Args:
        inputs: iterable of tree lines.

    Returns:
        List of newline-terminated ``str(tree)`` renderings.
    """
    def has_dollar_label(node):
        return node.c == '$'

    reader = tree.Tree()
    results = []
    for raw in inputs:
        text = raw.strip()
        if text == '' or text[0] == '%':
            continue
        reader.read(text)
        reader.prune(has_dollar_label)
        results.append(str(reader) + '\n')
    return results
def compare(tG, tH):
    """Render a gold tree against a hypothesized tree, flagging differences.

    Two nodes agree when their labels, word counts and child counts match
    and, for nodes with two or more children, the first child covers the
    same number of words in both trees. Agreeing nodes are rendered as
    ``' (label ...)'`` (or ``' label'`` when childless) and their children
    compared recursively; at the first disagreement both subtrees are dumped
    between ``<***GOLD***>`` / ``<**HYPOTH**>`` markers.

    Args:
        tG: gold tree node (needs ``c``, ``ch`` and ``words()``).
        tH: hypothesized tree node with the same interface.

    Returns:
        The comparison string for this pair of nodes.
    """
    gold_kids = tG.ch
    hyp_kids = tH.ch
    agree = (
        tG.c == tH.c
        and len(tG.words()) == len(tH.words())
        and len(gold_kids) == len(hyp_kids)
        and (len(hyp_kids) < 2
             or len(gold_kids[0].words()) == len(hyp_kids[0].words()))
    )
    if not agree:
        return (' <***GOLD***> ' + str(tG) + ' <**HYPOTH**> ' + str(tH)
                + ' <**********>')

    pieces = [(' (' if len(gold_kids) > 0 else ' ') + tG.c]
    for k, hyp_child in enumerate(hyp_kids):
        gold_child = gold_kids[k] if k < len(gold_kids) else tree.Tree()
        pieces.append(compare(gold_child, hyp_child))
    if len(gold_kids) > 0:
        pieces.append(')')
    return ''.join(pieces)
def out(inputs):
    """Re-emit each tree line with bracket labels mapped to PTB-style tokens.

    Lines that are empty after stripping, or whose first character is
    ``'%'``, are skipped. Each remaining line is read into a single reused
    ``tree.Tree`` and ``mapLabels`` is called with a function that turns
    ``'('`` into ``'-LRB-'`` and ``')'`` into ``'-RRB-'``, returning any
    other value unchanged.

    Args:
        inputs: iterable of tree lines.

    Returns:
        List of newline-terminated ``str(tree)`` renderings.
    """
    replacements = {'(': '-LRB-', ')': '-RRB-'}

    def relabel(label):
        # Fall back to the label itself when no replacement is defined.
        return replacements.get(label, label)

    reader = tree.Tree()
    results = []
    for raw in inputs:
        text = raw.strip()
        if text == '' or text[0] == '%':
            continue
        reader.read(text)
        reader.mapLabels(relabel)
        results.append(str(reader) + '\n')
    return results
def out(inputs):
    """Build a right-branching bracketing for each line of words.

    Every whitespace-separated word becomes ``'(1 word) '``. Working from
    the last word backwards, each earlier word is combined with the
    bracketing built so far under a new ``'(1 ... ) '`` node. The resulting
    string is read into a single reused ``tree.Tree`` and rendered with
    ``str``. Lines that are empty after stripping are skipped.

    Args:
        inputs: iterable of lines of whitespace-separated words.

    Returns:
        List of newline-terminated ``str(tree)`` renderings.
    """
    opener = '(1 '
    closer = ') '
    reader = tree.Tree()
    results = []
    for raw in inputs:
        text = raw.strip()
        if text == '':
            continue
        bracketed = ''
        for word in reversed(text.split()):
            leaf = opener + word + closer
            if bracketed:
                bracketed = opener + leaf + bracketed + closer
            else:
                bracketed = leaf
        reader.read(bracketed)
        results.append(str(reader) + '\n')
    return results
+ str(getPOS(T)) + ' ' + str(depdirSyn) + ' ' + str(depdirSem) + ' ' + str(depdirSynM) + ' ' + str(depdirSemM) + ' ' \ + str(int(isPhrasePunc(T.ch[0])))) else: if len(T.ch[0].c) > 1 and T.ch[0].c.endswith('lC') and not '-c' in T.c: coords.append(T) ends.append(last(T)) for t in T.ch: printToks(t) print('word dltdc dltdcv ' \ + 'dlt dltc dltcv dltv ' \ + 'dltm dltcm dltcvm dltvm ' \ + 'pos depdirSyn depdirSem depdirSynM depdirSemM' + ' ' \ + 'punc') for line in sys.stdin: if (line.strip() != '') and (line.strip()[0] != '%'): terms = [] DLTcosts = [] DLTcostsV = [] coords = [] ends = [] complete = [] cCosts = [] cvCosts = [] post = [0, 0, 0, 0, 0, 0, 0, 0] postpost = [0, 0, 0, 0, 0, 0, 0, 0] T = tree.Tree() T.read(line) printToks(T)
def deps2trees(buffer, format='stanford', debug=False):
    """Convert dependency-annotated sentences into bracketed trees.

    Each token is on its own line and sentences are separated by blank
    lines. For every sentence, a preterminal ``X -> word`` is created per
    token and attached to the tree of its head: dependents that precede
    their head are inserted before the head word, dependents that follow it
    are appended. The token whose head index is 0 becomes the root. Finally
    every terminal is given a unary parent labelled with its POS tag (the
    fourth column for the CoNLL formats, ``'X'`` for stanford).

    A blank line that is not preceded by token lines still yields one output
    entry, built from a tree with no tokens.

    Args:
        buffer: iterable (or iterator) of input lines.
        format: ``'conll'`` (head in column 8), ``'conll-x'`` (head in
            column 7) or ``'stanford'`` (``rel(head-ix, word-ix)`` lines);
            matched case-insensitively.
        debug: when true, the parsed token dict of every line is also
            appended to the output.

    Returns:
        List of newline-terminated strings, one rendered tree per sentence
        (plus the debug entries, if requested).

    Raises:
        ValueError: if *format* is not supported, or a line does not match
            the stanford dependency pattern.
    """
    out = []
    # Regexp for extracting dependency information from a stanford
    # dependencies file. Raw string: '\(' is not a valid escape sequence in
    # an ordinary string literal.
    stan_dep = re.compile(r' *[^ ]*\([^ ]+-([0-9]+) *, *([^ ]+)-([0-9]+)\)')
    fmt = format.lower()

    # Reports whether a tree is terminal
    def term(t):
        return t.ch == []

    # Ensures that each terminal in t has a unary pre-terminal parent.
    # POS tags are consumed from the front of `pos` in traversal order.
    def wrap_terms(t):
        if len(t.ch) > 1:
            for i, child in enumerate(t.ch):
                if term(child):
                    t.ch[i] = tree.Tree(pos.pop(0), [tree.Tree(child.c, [])])
                else:
                    wrap_terms(child)
        elif len(t.ch) == 1 and len(t.ch[0].ch) == 0:
            t.c = pos.pop(0)
        return t

    # Accept any iterable, and use a default with next() so that running out
    # of input (empty input, or no blank line after the last sentence) ends
    # the loops instead of raising StopIteration.
    lines = iter(buffer)
    line = next(lines, None)
    while line:
        # Dependency tokens and POS tags of the current sentence
        deps = []
        pos = []

        # Read token lines until the end of the sentence (or of the input)
        while line and line.strip() != '':
            # Each token must have 'word', 'dep', and 'ix' fields.
            if fmt == 'conll':
                fields = line.split()
                tok = {'word': fields[1],
                       'dep': int(fields[7]),
                       'ix': int(fields[0])}
                pos.append(str(fields[3]))
            elif fmt == 'conll-x':
                fields = line.split()
                tok = {'word': fields[1],
                       'dep': int(fields[6]),
                       'ix': int(fields[0])}
                pos.append(str(fields[3]))
            elif fmt == 'stanford':
                match = stan_dep.match(line)
                if match is None:
                    raise ValueError(
                        'Malformed stanford dependency line: %r' % line)
                pos.append('X')
                tok = {'word': match.group(2),
                       'dep': int(match.group(1)),
                       'ix': int(match.group(3))}
            else:
                raise ValueError('Unsupported format %s' % format)
            deps.append(tok)
            if debug:
                out.append('%s\n' % tok)
            line = next(lines, None)

        # Dictionary of trees indexed by head sentpos
        trees = {0: tree.Tree()}

        # Add a preterminal to trees for each token in the sentence
        for tok in deps:
            trees[tok['ix']] = tree.Tree('X', [tree.Tree(tok['word'], [])])

        # Combine trees based on their dependencies
        for tok in deps:
            if tok['dep'] == 0:
                # Dep to 0, this is the main head
                trees[0] = trees[tok['ix']]
            elif tok['ix'] < tok['dep']:
                # Dep to following head: insert before the head word, which
                # is still the last child while preceding tokens are handled
                trees[tok['dep']].ch.insert(-1, trees[tok['ix']])
            else:
                # Dep to preceding head: add as following sibling of head
                trees[tok['dep']].ch.append(trees[tok['ix']])

        # Make sure all terminals have unary pre-terminal parents
        trees[0] = wrap_terms(trees[0])

        # Emit the main tree
        out.append('%s\n' % trees[0])

        # Start reading the next sentence
        line = next(lines, None)
    return out