def track(node):
    # Consumes one (left, right, label) span from the pre-order iterator.
    # `leaves` is a closure variable: the list of preterminal subtrees supplied
    # by the enclosing scope (see build_tree below, where this helper is nested).
    i, j, label = next(node)
    if j == i + 1:
        children = [leaves[i]]
    else:
        children = track(node) + track(node)
    if label.endswith('|<>'):
        # Dummy label introduced by binarization: splice children upwards.
        return children
    labels = label.split('+')
    tree = Tree(labels[-1], children)
    for label in reversed(labels[:-1]):
        # Re-expand collapsed unary chains such as S+VP.
        tree = Tree(label, [tree])
    return [tree]
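
# A minimal driver for track(), not from the source: the tokens and spans here
# are made up for illustration, and `leaves` is defined at module level so the
# excerpted closure can resolve it. Assumes an nltk-compatible Tree.
from nltk import Tree

tokens = ['She', 'sleeps']
leaves = [Tree('_', [t]) for t in tokens]  # the closure variable track() expects
sequence = iter([(0, 2, 'S'), (0, 1, 'NP'), (1, 2, 'VP')])
print(Tree('TOP', track(sequence)))
# (TOP (S (NP (_ She)) (VP (_ sleeps))))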
def load_file(self, filepath: str):
    with open(filepath) as src:
        for line in src:
            line = line.strip()
            if not line:
                continue
            yield {'constituency': Tree.fromstring(line)}
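
# Usage sketch (illustrative, not from the source): streams gold trees from a
# file with one bracketed tree per line. `loader` and the path are placeholders.
for sample in loader.load_file('dev.txt'):  # hypothetical path
    tree = sample['constituency']
    print(' '.join(tree.leaves()))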
def list_to_tree(L):
    if isinstance(L, str):
        return L
    # Unwrap singleton wrappers such as [['S', [...]]] down to the labeled node.
    while len(L) == 1:
        L = L[0]
        if isinstance(L, str):
            return L
    return Tree(L[0], [list_to_tree(child) for child in L[1]])
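
# A made-up example of the expected input shape, assuming each node is a
# [label, [children...]] pair and leaves are plain strings:
from nltk import Tree

nested = [['S', [['NP', ['She']], ['VP', ['sleeps']]]]]  # note the singleton wrapper
print(list_to_tree(nested))
# (S (NP She) (VP sleeps))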
def load_bracketed_trees(chtbs) -> List[Tree]:
    trees = []
    for f in chtbs:
        with open(f, encoding='utf-8') as src:
            content = src.read()
        # Parse each blank-line-separated block into a tree. A separate name is
        # required here: the original reused `trees` for the per-file string
        # list, clobbering the accumulator and then appending to the very list
        # being iterated.
        for block in content.split('\n\n'):
            if block.strip():
                trees.append(Tree.fromstring(block))
    return trees
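
# Usage sketch (illustrative): load CTB-style bracketed files from disk.
# The glob pattern is a placeholder, not a path from the source.
import glob

trees = load_bracketed_trees(sorted(glob.glob('ctb/bracketed/*.txt')))
print(len(trees), trees[0].label() if trees else None)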
def make_ctb_tasks(chtbs, out_root, part):
    for task in ['cws', 'pos', 'par', 'dep']:
        os.makedirs(join(out_root, task), exist_ok=True)
    timer = CountdownTimer(len(chtbs))
    par_path = join(out_root, 'par', f'{part}.txt')
    with open(join(out_root, 'cws', f'{part}.txt'), 'w', encoding='utf-8') as cws, \
            open(join(out_root, 'pos', f'{part}.tsv'), 'w', encoding='utf-8') as pos, \
            open(par_path, 'w', encoding='utf-8') as par:
        for f in chtbs:
            with open(f, encoding='utf-8') as src:
                content = src.read()
            trees = split_str_to_trees(content)
            for tree in trees:
                try:
                    tree = Tree.fromstring(tree)
                except ValueError:
                    print(tree)
                    exit(1)
                words = []
                for word, tag in tree.pos():
                    if tag == '-NONE-' or not tag:
                        continue
                    tag = tag.split('-')[0]
                    if tag == 'X':
                        # 铜_NN 30_CD x_X 25_CD x_X 14_CD cm_NT 1999_NT
                        tag = 'FW'
                    pos.write('{}\t{}\n'.format(word, tag))
                    words.append(word)
                cws.write(' '.join(words))
                par.write(tree.pformat(margin=sys.maxsize))
                for fp in cws, pos, par:
                    fp.write('\n')
            timer.log(
                f'Preprocessing the [blue]{part}[/blue] set of CTB [blink][yellow]...[/yellow][/blink]',
                erase=False)
    remove_all_ec(par_path)
    dep_path = join(out_root, 'dep', f'{part}.conllx')
    convert_to_stanford_dependency_330(par_path, dep_path)
    sents = list(read_conll(dep_path))
    with open(dep_path, 'w') as out:
        for sent in sents:
            for i, cells in enumerate(sent):
                tag = cells[3]
                tag = tag.split('-')[0]  # NT-SHORT ---> NT
                if tag == 'X':
                    # 铜_NN 30_CD x_X 25_CD x_X 14_CD cm_NT 1999_NT
                    tag = 'FW'
                cells[3] = cells[4] = tag
                out.write('\t'.join(str(x) for x in cells))
                out.write('\n')
            out.write('\n')
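
# Hypothetical invocation, assuming per-split file lists (train_files etc.)
# have been prepared elsewhere; the output root is a placeholder path.
make_ctb_tasks(train_files, 'data/ctb', 'train')
make_ctb_tasks(dev_files, 'data/ctb', 'dev')
make_ctb_tasks(test_files, 'data/ctb', 'test')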
def build_tree(tokens: List[str], sequence):
    r"""
    Builds a constituency tree from the sequence. The sequence is generated in
    pre-order. While building the tree, the sequence is de-binarized to the
    original format (i.e., the suffixes ``|<>`` are ignored, the collapsed
    labels are recovered).

    Args:
        tokens: All tokens in a sentence.
        sequence (list[tuple]): A list of tuples used for generating a tree.
            Each tuple consists of the indices of the left/right span
            boundaries and the label of the span.

    Returns:
        The resulting constituency tree.

    Examples:
        >>> tokens = ['She', 'enjoys', 'playing', 'tennis', '.']
        >>> sequence = [(0, 5, 'S'), (0, 4, 'S|<>'), (0, 1, 'NP'), (1, 4, 'VP'), (1, 2, 'VP|<>'),
                        (2, 4, 'S+VP'), (2, 3, 'VP|<>'), (3, 4, 'NP'), (4, 5, 'S|<>')]
        >>> print(Tree.build_tree(tokens, sequence))
        (TOP (S (NP (_ She)) (VP (_ enjoys) (S (VP (_ playing) (NP (_ tennis))))) (_ .)))
    """
    if not tokens:
        # User passed in [], which is the tokenized result of ''
        return Tree('TOP', [])
    tree = Tree('TOP', [Tree('_', [t]) for t in tokens])
    root = tree.label()
    leaves = [
        subtree for subtree in tree.subtrees()
        if not isinstance(subtree[0], Tree)
    ]

    def track(node):
        i, j, label = next(node)
        if j == i + 1:
            children = [leaves[i]]
        else:
            children = track(node) + track(node)
        if label.endswith('|<>'):
            return children
        labels = label.split('+')
        tree = Tree(labels[-1], children)
        for label in reversed(labels[:-1]):
            tree = Tree(label, [tree])
        return [tree]

    return Tree(root, track(iter(sequence)))
def dfs_linearize_constituency(sample: dict,
                               tokenizer: PENMANBartTokenizer,
                               remove_space=False) -> dict:
    amr = sample.get('amr', None)
    if amr:
        l, e = tokenizer.linearize(amr)
        sample['graph_tokens'] = e['linearized_graphs']
        sample['graph_token_ids'] = l
        tree = Tree.from_list(json.loads(sample['amr'].metadata['con_list']))
        for each in tree.subtrees(lambda x: x.height() == 2):
            if each[0] == '(':
                each[0] = '<LBR>'
            elif each[0] == ')':
                each[0] = '<RBR>'
        text = tree.pformat(margin=10e7)
        tokens = []
        buffer = []
        for c in text:
            if c == '(' or c == ')':
                tokens.append(''.join(buffer))
                tokens.append(c)
                buffer.clear()
                continue
            buffer.append(c)
        if buffer:
            tokens.append(''.join(buffer))
        tokens = [x.strip() for x in tokens]
        tokens = [x for x in tokens if x]
        restore_bracket = {'<LBR>': '(', '<RBR>': ')'}
        tokens = [restore_bracket.get(x, x) for x in tokens]
        ids = []
        for each in tokens:
            pairs = each.split(' ', 1)
            if len(pairs) == 2:
                con, token = pairs
                ids.append(tokenizer.convert_tokens_to_ids(tokenizer.INIT + con))
                ids.extend(tokenizer.encode(token, add_special_tokens=False))
            else:
                ids.append(tokenizer.convert_tokens_to_ids(tokenizer.INIT + each))
        if remove_space:
            text = ''.join(text.split())
        sample['text'] = text
        sample['text_token_ids'] = [tokenizer.bos_token_id] + ids + [tokenizer.eos_token_id]
    return sample
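
# Hypothetical call, assuming sample['amr'] holds a penman graph whose metadata
# carries the constituency tree as a JSON list under 'con_list', and tokenizer
# is a PENMANBartTokenizer instance constructed elsewhere.
sample = dfs_linearize_constituency(sample, tokenizer)
print(sample['text'])            # bracketed tree rendered as plain text
print(sample['text_token_ids'])  # BOS + constituent/token ids + EOS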
def binarize(tree: Tree):
    r"""
    Conducts binarization over the tree.

    First, the tree is transformed to satisfy `Chomsky Normal Form (CNF)`_.
    Here we call :meth:`~tree.Tree.chomsky_normal_form` to conduct
    left-binarization. Second, all unary productions in the tree are collapsed.

    Args:
        tree (tree.Tree): The tree to be binarized.

    Returns:
        The binarized tree.

    Examples:
        >>> tree = Tree.fromstring('''
        ...     (TOP (S (NP (_ She)) (VP (_ enjoys) (S (VP (_ playing) (NP (_ tennis))))) (_ .)))
        ... ''')
        >>> print(Tree.binarize(tree))
        (TOP (S (S|<> (NP (_ She)) (VP (VP|<> (_ enjoys)) (S+VP (VP|<> (_ playing)) (NP (_ tennis))))) (S|<> (_ .))))

    .. _Chomsky Normal Form (CNF):
        https://en.wikipedia.org/wiki/Chomsky_normal_form
    """
    tree: Tree = tree.copy(True)
    nodes = [tree]
    while nodes:
        node = nodes.pop()
        if isinstance(node, Tree):
            nodes.extend([child for child in node])
            if len(node) > 1:
                for i, child in enumerate(node):
                    if not isinstance(child[0], Tree):
                        node[i] = Tree(f"{node.label()}|<>", [child])
    tree.chomsky_normal_form('left', 0, 0)
    tree.collapse_unary()
    return tree
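
# binarize and build_tree are inverses up to the |<> and '+' markers, so a
# round-trip check can tie the two listings together. factorize below is a
# hypothetical helper written for this sketch (not part of the listings): it
# emits the pre-order (left, right, label) spans that build_tree consumes.
from nltk import Tree

def factorize(tree, i=0):
    # Pre-order spans of every non-terminal, skipping preterminals like (_ She).
    if len(tree) == 1 and not isinstance(tree[0], Tree):
        return i + 1, []
    j, spans = i, []
    for child in tree:
        j, s = factorize(child, j)
        spans.extend(s)
    return j, [(i, j, tree.label())] + spans

original = Tree.fromstring(
    '(TOP (S (NP (_ She)) (VP (_ enjoys) (S (VP (_ playing) (NP (_ tennis))))) (_ .)))')
_, sequence = factorize(binarize(original)[0])  # spans of the S under TOP
assert build_tree(original.leaves(), sequence) == original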
def list_to_tree(L):
    if isinstance(L, str):
        return L
    return Tree(L[0], [list_to_tree(child) for child in L[1]])
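
# Unlike the earlier variant, this one does not unwrap singleton wrappers, so
# the input must start directly at a labeled [label, [children...]] node.
# A made-up example:
from nltk import Tree

print(list_to_tree(['NP', [['DT', ['the']], ['NN', ['cat']]]]))
# (NP (DT the) (NN cat))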
def _conll_rows_to_sentence(self, conll_rows: List[str]) -> OntonotesSentence:
    document_id: str = None
    sentence_id: int = None
    # The words in the sentence.
    sentence: List[str] = []
    # The pos tags of the words in the sentence.
    pos_tags: List[str] = []
    # The pieces of the parse tree.
    parse_pieces: List[str] = []
    # The lemmatised form of the words in the sentence which
    # have SRL or word sense information.
    predicate_lemmas: List[str] = []
    # The FrameNet ID of the predicate.
    predicate_framenet_ids: List[str] = []
    # The sense of the word, if available.
    word_senses: List[float] = []
    # The current speaker, if available.
    speakers: List[str] = []

    verbal_predicates: List[str] = []
    span_labels: List[List[str]] = []
    current_span_labels: List[str] = []

    # Cluster id -> List of (start_index, end_index) spans.
    clusters: DefaultDict[int, List[Tuple[int, int]]] = defaultdict(list)
    # Cluster id -> List of start_indices which are open for this id.
    coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

    for index, row in enumerate(conll_rows):
        conll_components = row.split()

        document_id = conll_components[0]
        sentence_id = int(conll_components[1])
        word = conll_components[3]
        pos_tag = conll_components[4]
        parse_piece = conll_components[5]

        # Replace brackets in text and pos tags
        # with a different token for parse trees.
        if pos_tag != "XX" and word != "XX":
            if word == "(":
                parse_word = "-LRB-"
            elif word == ")":
                parse_word = "-RRB-"
            else:
                parse_word = word
            if pos_tag == "(":
                pos_tag = "-LRB-"
            if pos_tag == ")":
                pos_tag = "-RRB-"
            (left_brackets, right_hand_side) = parse_piece.split("*")
            # only keep ')' if there are nested brackets with nothing in them.
            right_brackets = right_hand_side.count(")") * ")"
            parse_piece = f"{left_brackets} ({pos_tag} {parse_word}) {right_brackets}"
        else:
            # There are some bad annotations in the CONLL data.
            # They contain no information, so to make this explicit,
            # we just set the parse piece to be None which will result
            # in the overall parse tree being None.
            parse_piece = None

        lemmatised_word = conll_components[6]
        framenet_id = conll_components[7]
        word_sense = conll_components[8]
        speaker = conll_components[9]

        if not span_labels:
            # If this is the first word in the sentence, create
            # empty lists to collect the NER and SRL BIO labels.
            # We can't do this upfront, because we don't know how many
            # components we are collecting, as a sentence can have
            # variable numbers of SRL frames.
            span_labels = [[] for _ in conll_components[10:-1]]
            # Create variables representing the current label for each label
            # sequence we are collecting.
            current_span_labels = [None for _ in conll_components[10:-1]]
        self._process_span_annotations_for_word(conll_components[10:-1],
                                                span_labels,
                                                current_span_labels)

        # If any annotation marks this word as a verb predicate,
        # we need to record its index. This also has the side effect
        # of ordering the verbal predicates by their location in the
        # sentence, automatically aligning them with the annotations.
        word_is_verbal_predicate = any("(V" in x for x in conll_components[11:-1])
        if word_is_verbal_predicate:
            verbal_predicates.append(word)

        self._process_coref_span_annotations_for_word(conll_components[-1],
                                                      index,
                                                      clusters,
                                                      coref_stacks)

        sentence.append(word)
        pos_tags.append(pos_tag)
        parse_pieces.append(parse_piece)
        predicate_lemmas.append(lemmatised_word if lemmatised_word != "-" else None)
        predicate_framenet_ids.append(framenet_id if framenet_id != "-" else None)
        word_senses.append(float(word_sense) if word_sense != "-" else None)
        speakers.append(speaker if speaker != "-" else None)

    named_entities = span_labels[0]
    srl_frames = [
        (predicate, labels)
        for predicate, labels in zip(verbal_predicates, span_labels[1:])
    ]

    if all(parse_pieces):
        parse_tree = Tree.fromstring("".join(parse_pieces))
    else:
        parse_tree = None
    coref_span_tuples: Set[TypedSpan] = {
        (cluster_id, span)
        for cluster_id, span_list in clusters.items()
        for span in span_list
    }
    return OntonotesSentence(
        document_id,
        sentence_id,
        sentence,
        pos_tags,
        parse_tree,
        predicate_lemmas,
        predicate_framenet_ids,
        word_senses,
        speakers,
        named_entities,
        srl_frames,
        coref_span_tuples,
    )
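
# Usage sketch (illustrative): feed the word rows of a single sentence block
# from a CoNLL-2012 *gold_conll file. `reader` is assumed to be the containing
# Ontonotes-style dataset reader; the path is a placeholder.
with open('example.gold_conll') as f:
    rows = [line.strip() for line in f
            if line.strip() and not line.startswith('#')]
sentence = reader._conll_rows_to_sentence(rows)
print(sentence.parse_tree)
print(sentence.srl_frames)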