def add_pos_ne_encoding(tokens, doc, vectors, pos=True, ne=True):
    '''
    Takes the RoBERTa subword tokens for a clause, the spaCy doc carrying the
    POS and named-entity tags, and the subword vectors, and concatenates a
    one-hot POS encoding and a one-hot NE encoding onto each vector.
    @param tokens: RoBERTa subword tokens for the clause
    @param doc: spaCy Doc with POS and named-entity annotations
    @param vectors: subword embeddings (batch of one) aligned with `tokens`
    @param pos: whether to append the POS encoding
    @param ne: whether to append the named-entity encoding
    @return: tensor of vectors with the tag encodings appended
    '''
    # For each subword token, find the aligned spaCy token, get its tag
    # encodings, and concatenate them to the subword's vector.
    # If a subword doesn't align to any spaCy token, pad with 0s instead.
    if (not pos) and (not ne):
        return vectors

    spacy_tokens = [token.text for token in doc]
    roberta_tokens = tokens
    a2b, b2a = tokenizations.get_alignments(spacy_tokens, roberta_tokens)

    new_vectors = []
    for index, alignment in enumerate(b2a):
        if alignment:
            # Get the tags from the aligned spaCy token and concatenate.
            named_entity_tags = doc[alignment[0]].ent_type_
            pos_tags = doc[alignment[0]].pos_
            new_vectors.append(tf.concat([vectors[0][index],
                                          get_one_hot_pos(pos_tags),
                                          get_one_hot_ne(named_entity_tags)], axis=0))
        else:
            # No aligned spaCy token: concatenate zeros.
            new_vectors.append(tf.concat([vectors[0][index], tf.zeros([37])], 0))
    new_vectors = tf.stack(new_vectors)
    return new_vectors
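# Illustrative sketch (not from the original source): shows the alignment that
# add_pos_ne_encoding relies on. b2a maps each RoBERTa subword back to the
# spaCy token it came from; subwords with no counterpart get an empty list,
# which is why the function pads those positions with zeros. The token lists
# below are hypothetical.
import tokenizations

spacy_toks = ["New", "York", "City"]
roberta_toks = ["New", "York", "Ci", "ty"]
a2b, b2a = tokenizations.get_alignments(spacy_toks, roberta_toks)
# a2b == [[0], [1], [2, 3]]   each spaCy token -> one or more subwords
# b2a == [[0], [1], [2], [2]] each subword -> its source spaCy token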
def build_bert_emb(sents: List[List[str]], tokenizer, model, device: str):
    bert_embs = list()
    for i, sent in enumerate(sents):
        joint_sent = ' '.join(sent)
        bert_tokens = tokenizer.tokenize(joint_sent)

        input_ids = torch.tensor([tokenizer.encode(joint_sent, add_special_tokens=True)],
                                 device=device)
        # calculate BERT last layer embeddings
        with torch.no_grad():
            last_hidden_states = model(input_ids)[0].squeeze(0).to('cpu')
            # drop the embeddings of the special tokens [CLS] and [SEP]
            trunc_hidden_states = last_hidden_states[1:-1, :]

        ori2bert, bert2ori = get_alignments(sent, bert_tokens)

        # average the wordpiece embeddings that belong to each original token
        emb_list = list()
        for idx in ori2bert:
            emb = trunc_hidden_states[idx, :]
            emb_list.append(emb.mean(dim=0))

        # TODO: using the embedding of [CLS] may not be the best idea
        # It does not matter since that embedding is not used in the training
        emb_list = [last_hidden_states[0, :]] + emb_list
        bert_emb = torch.stack(emb_list)
        bert_embs.append(bert_emb)
    return bert_embs
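# Hedged usage sketch (assumes the transformers library and a bert-base-cased
# checkpoint; not part of the original source). Each element of `sents` is a
# list of word-level tokens; build_bert_emb returns one tensor per sentence
# whose rows are mean-pooled wordpiece embeddings, preceded by the [CLS] row.
import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = BertModel.from_pretrained("bert-base-cased")
sents = [["Tokenization", "is", "tricky", "."]]
embs = build_bert_emb(sents, tokenizer, model, device="cpu")
# embs[0].shape == (len(sents[0]) + 1, hidden_size)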
def align_tokens(old_tokens, new_tokens, old_labels):
    # Project word-level labels onto a new tokenization: every new token that
    # aligns to an old token inherits that token's label; unaligned tokens stay None.
    a2b, b2a = tokenizations.get_alignments(old_tokens, new_tokens)
    new_labels = [None] * len(new_tokens)
    for i in range(len(a2b)):
        label = old_labels[i]
        for j in a2b[i]:
            new_labels[j] = label
    return new_labels
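# Illustrative sketch (hypothetical tokens and labels, not from the original
# source): align_tokens repeats a word-level label for every subword piece
# that the word aligns to.
old_tokens = ["New", "York", "City"]
new_tokens = ["New", "York", "Ci", "ty"]
old_labels = ["B-LOC", "I-LOC", "I-LOC"]
print(align_tokens(old_tokens, new_tokens, old_labels))
# ['B-LOC', 'I-LOC', 'I-LOC', 'I-LOC']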
def get_alignment(spans: List[Span], wordpieces: List[List[str]]) -> Ragged:
    """Compute a ragged alignment array that records, for each unique token in
    `spans`, the corresponding indices in the flattened `wordpieces` array.
    For instance, imagine you have two overlapping spans:

        [[I, like, walking], [walking, outdoors]]

    And their wordpieces are:

        [[I, like, walk, ing], [walk, ing, out, doors]]

    We want to align "walking" against [walk, ing, walk, ing], which have
    indices [2, 3, 4, 5] once the nested wordpieces list is flattened.
    The nested alignment list would be:

        [[0], [1], [2, 3, 4, 5], [6, 7]]
          I   like    walking    outdoors

    Which gets flattened into the ragged array:

        data:    [0, 1, 2, 3, 4, 5, 6, 7]
        lengths: [1, 1, 4, 2]

    The ragged format allows the aligned data to be computed via:

        tokens = Ragged(wp_tensor[align.data], align.lengths)

    This produces a ragged format, indicating which tokens need to be
    collapsed to make the aligned array. The reduction is deferred for a
    later step, so the user can configure it. The indexing is especially
    efficient in trivial cases like this where the indexing array is
    completely continuous.
    """
    if len(spans) != len(wordpieces):
        raise ValueError("Cannot align batches of different sizes.")
    # Tokens can occur more than once, and we need the alignment of each token
    # to its place in the concatenated wordpieces array.
    token_positions = get_token_positions(spans)
    alignment: List[Set[int]] = [set() for _ in range(len(token_positions))]
    wp_start = 0
    for i, (span, wp_toks) in enumerate(zip(spans, wordpieces)):
        sp_toks = [token.text for token in span]
        span2wp, wp2span = tokenizations.get_alignments(sp_toks, wp_toks)
        for token, wp_js in zip(span, span2wp):
            position = token_positions[token]
            alignment[position].update(wp_start + j for j in wp_js)
        wp_start += len(wp_toks)
    lengths: List[int] = []
    flat: List[int] = []
    for a in alignment:
        lengths.append(len(a))
        flat.extend(sorted(a))
    align = Ragged(numpy.array(flat, dtype="i"), numpy.array(lengths, dtype="i"))
    return align
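# Minimal sketch mirroring the docstring example (assumes spaCy and the
# surrounding module's helpers `get_token_positions` and `Ragged` are
# importable; the spans and wordpieces below are hypothetical). The two spans
# overlap on "walking", so that token collects the wordpiece indices from
# both segments once the nested list is flattened.
import spacy

nlp = spacy.blank("en")
doc = nlp("I like walking outdoors")
spans = [doc[0:3], doc[2:4]]
wordpieces = [["I", "like", "walk", "ing"], ["walk", "ing", "out", "doors"]]
align = get_alignment(spans, wordpieces)
# align.data    -> [0, 1, 2, 3, 4, 5, 6, 7]
# align.lengths -> [1, 1, 4, 2]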
def read_text(text_path):
    sents = []
    sent = []
    end_of_multiword = 0
    multiword_combined = ""
    multiword_separate = []
    multiword_sp_after = False
    with open(text_path) as f:
        for line in f:
            if not line.strip() or line.startswith("#"):
                if sent:
                    sents.append(([w for w, sp in sent], [sp for w, sp in sent]))
                    sent = []
                    assert end_of_multiword == 0
                continue
            fields = line.split("\t", 2)
            num_or_range = fields[0]
            w = fields[1]

            if "-" in num_or_range:
                end_of_multiword = int(num_or_range.split("-")[1])
                multiword_combined = w
                multiword_separate = []
                multiword_sp_after = "SpaceAfter=No" not in fields[-1]
                continue
            elif int(num_or_range) <= end_of_multiword:
                multiword_separate.append(w)
                if int(num_or_range) == end_of_multiword:
                    _, separate_to_combined = tokenizations.get_alignments(
                        multiword_combined, multiword_separate
                    )
                    have_up_to = 0
                    for i, char_idxs in enumerate(separate_to_combined):
                        if i == len(multiword_separate) - 1:
                            word = multiword_combined[have_up_to:]
                            sent.append((word, multiword_sp_after))
                        elif char_idxs:
                            word = multiword_combined[have_up_to : max(char_idxs) + 1]
                            sent.append((word, False))
                            have_up_to = max(char_idxs) + 1
                        else:
                            sent.append(("", False))
                    assert int(num_or_range) == len(sent)
                    end_of_multiword = 0
                    multiword_combined = ""
                    multiword_separate = []
                    multiword_sp_after = False
                continue
            else:
                assert int(num_or_range) == len(sent) + 1
                sp = "SpaceAfter=No" not in fields[-1]
                sent.append((w, sp))
    return sents
def _align(self, segment, wp_tokens, *, offset=0):
    spacy_tokens = [w.text for w in segment]
    a2b, b2a = get_alignments(spacy_tokens, wp_tokens)
    # a2b must cover the boundary of `segment` (the first and last wordpiece
    # index), so insert those indices when they are missing.
    if a2b and b2a:
        if len(b2a[0]) == 0:
            a2b[0].insert(0, 0)
        if len(b2a[-1]) == 0:
            a2b[-1].append(len(b2a) - 1)
    a2b = [[i + offset for i in a] for a in a2b]
    return wp_tokens, a2b
def convert_to_revised_tokenization(orig_trees, revised_trees):
    for orig_tree, revised_tree in zip(orig_trees, revised_trees):
        orig_words = [standardize_form(word) for word in orig_tree.leaves()]
        revised_words = [standardize_form(word) for word in revised_tree.leaves()]
        o2r, r2o = tokenizations.get_alignments(orig_words, revised_words)
        assert all(len(x) >= 1 for x in o2r)

        converted_tree = orig_tree.copy(deep=True)
        for j in range(len(revised_words)):
            if len(r2o[j]) > 1:
                for i in r2o[j][1:]:
                    orig_treeposition = orig_tree.leaf_treeposition(i)
                    if len(orig_treeposition) > 1 and len(
                            orig_tree[orig_treeposition[:-1]]) == 1:
                        converted_tree[orig_treeposition[:-1]] = nltk.Tree(
                            DUMMY_LABEL, [DUMMY_WORD])
                    else:
                        converted_tree[orig_treeposition] = DUMMY_LABEL

        for i in range(len(orig_words)):
            if converted_tree[orig_tree.leaf_treeposition(i)] == DUMMY_LABEL:
                continue
            elif len(o2r[i]) == 1:
                j = o2r[i][0]
                converted_tree[orig_tree.leaf_treeposition(i)] = revised_tree[
                    revised_tree.leaf_treeposition(j)]
            else:
                orig_treeposition = orig_tree.leaf_treeposition(i)
                if len(orig_treeposition) > 1 and len(
                        orig_tree[orig_treeposition[:-1]]) == 1:
                    orig_treeposition = orig_treeposition[:-1]
                    revised_leaves = [
                        revised_tree[revised_tree.leaf_treeposition(j)[:-1]]
                        for j in o2r[i]
                    ]
                    assert all(len(x) == 1 for x in revised_leaves)
                    converted_tree[orig_treeposition] = nltk.Tree(
                        DUMMY_LABEL, revised_leaves)
                else:
                    converted_tree[orig_treeposition] = nltk.Tree(
                        DUMMY_LABEL, [
                            revised_tree[revised_tree.leaf_treeposition(j)]
                            for j in o2r[i]
                        ])
        yield converted_tree
def respan(src_tokens: List[str], tgt_tokens: List[str], src_span: List[tuple]):
    """
    Transfer spans defined over the source tokens to spans over the target tokens.
    :param src_tokens: source tokens
    :param tgt_tokens: target tokens
    :param src_span: a list of span tuples. The first element in each tuple
        should be the start index and the second should be the end index.
    :return: a list of transferred span tuples.
    """
    s2t, _ = get_alignments(src_tokens, tgt_tokens)
    tgt_spans = list()
    for span in src_span:
        start = s2t[span[0]][0]
        if span[1] < len(s2t):
            end = s2t[span[1]][-1]
        else:
            end = s2t[-1][-1]
        if end == start:
            end += 1
        tgt_spans.append((start, end))
    return tgt_spans
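# Hedged usage sketch (hypothetical tokens and spans, not from the original
# source): maps a span over word-level tokens onto a wordpiece tokenization.
src_tokens = ["New", "York", "City", "is", "large"]
tgt_tokens = ["New", "York", "Ci", "ty", "is", "large"]
# span (0, 2): start index 0, end index 2 over the source tokens
print(respan(src_tokens, tgt_tokens, [(0, 2)]))
# [(0, 3)] -- "City" expands to two wordpieces, so the end index shifts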
def _get_transformers_align(doc: Doc) -> List[List[int]]:
    """Get the alignment from spaCy tokens to transformers tokens."""
    trf_tokens = doc._.get(ATTRS.cleaned_tokens)
    return get_alignments([token.text for token in doc], trf_tokens)[0]
def test_random(a, b):
    tokenizations.get_alignments(a, b)
def test_get_alignments(input_, expected):
    output = tokenizations.get_alignments(*input_)
    assert output == expected
def test_equality(a):
    a2b, b2a = tokenizations.get_alignments(a, a)
    assert a2b == b2a
    assert a2b == [[i] if len(aa) else [] for i, aa in enumerate(a)]
def process_single_core(self, proc_id: int, row_inds: np.ndarray, df: DataFrame,
                        embedder: Embedder, text_col: str, tree: bool) -> Tuple[dict, Counter, Counter]:
    num_dependencies = len(self.dependencies)
    adjacency_dict = {}
    dep_list = []
    pos_tag_list = []
    for curr_ind, row_ind in enumerate(row_inds):
        if curr_ind % 100 == 0:
            print(f'Core: {proc_id}, {curr_ind} from {len(row_inds)} rows processed.')
        row = df.iloc[row_ind].to_dict()
        id_ = row['id']
        text = row[text_col]
        # NOTE: Temporary fix for RoBERTa models from huggingface transformers.
        text = text.replace(').', ') .')
        document = self.nlp(text)
        adjacency_dict[id_] = {}
        current_dict = adjacency_dict[id_]
        # NOTE: Could be modified to other columns.
        token_dict = embedder(text)
        input_ids = token_dict['input_ids']
        spacy_tokens = [token.text for token in document]
        transformer_tokens = embedder.tokenizer.convert_ids_to_tokens(input_ids[0])
        alignment, _ = get_alignments(spacy_tokens, transformer_tokens)
        current_dict['alignment'] = deepcopy(alignment)
        # Review length after tokenization (-2 for removing special tokens).
        # NOTE: Review length is aligned with the BERT embeddings, which may contain
        # word-piece tokenization that is not consistent with the tokenization of spaCy.
        review_len = len(spacy_tokens)
        embedding_len = input_ids.shape[1] - 2
        # Note down the start and end offsets for the review for later use.
        current_dict['start_offset'] = 1
        current_dict['end_offset'] = embedding_len + 1
        # adjacency_tensor = np.zeros(
        #     shape=(num_dependencies + 1, review_len, review_len), dtype=np.float32)
        adjacency_list = [None for _ in np.arange(num_dependencies + 1)]
        row = [[] for _ in np.arange(num_dependencies)]
        col = [[] for _ in np.arange(num_dependencies)]
        pos_tags = []
        if tree:
            for token in document:
                dep = token.dep_
                dep_list.append(dep)
                dep_ind = self.dependency2ind[dep]
                # NOTE: Self-loop may be spared for our implementation.
                # Non-existent syntactic relations do not have adjacency weights.
                # NOTE 2: Self-loop can be indicated as a separate relation.
                # if token.i < review_len:
                #     # adjacency_tensor[dep_ind, :, :] = np.eye(review_len, dtype=np.float32)
                #     adjacency_tensor[dep_ind, token.head.i, token.i] = 1
                row[dep_ind].append(token.head.i)
                col[dep_ind].append(token.i)
                pos_tag_list.append(token.pos_)
                pos_tags.append(token.pos_)
            # adjacency_tensor[-1, :, :] = np.eye(review_len, dtype=np.float32)
            # current_dict['adjacency_tensor'] = adjacency_tensor
            for dep_ind in np.arange(num_dependencies):
                adjacency_list[dep_ind] = coo_matrix(
                    ([1 for _ in np.arange(len(row[dep_ind]))],
                     (row[dep_ind], col[dep_ind])),
                    shape=(review_len, review_len), dtype=np.float32
                )
            adjacency_list[-1] = coo_matrix(np.eye(review_len, dtype=np.float32))
            current_dict['adjacency_list'] = deepcopy(adjacency_list)
            current_dict['pos_tags'] = deepcopy(pos_tags)
        else:
            for token in document:
                dep = token.dep_
                dep_list.append(dep)
                dep_ind = self.dependency2ind[dep]
                # NOTE: Self-loop may be spared for our implementation.
                # Non-existent syntactic relations do not have adjacency weights.
                # NOTE 2: Self-loop can be indicated as a separate relation.
                # if token.i < review_len:
                #     # adjacency_tensor[dep_ind, :, :] = np.eye(review_len, dtype=np.float32)
                #     adjacency_tensor[dep_ind, token.head.i, token.i] = 1
                #     adjacency_tensor[dep_ind, token.i, token.head.i] = 1
                row[dep_ind].append(token.head.i)
                col[dep_ind].append(token.i)
                row[dep_ind].append(token.i)
                col[dep_ind].append(token.head.i)
                pos_tag_list.append(token.pos_)
                pos_tags.append(token.pos_)
            # adjacency_tensor[-1, :, :] = np.eye(review_len, dtype=np.float32)
            # current_dict['adjacency_tensor'] = adjacency_tensor
            for dep_ind in np.arange(num_dependencies):
                adjacency_list[dep_ind] = coo_matrix(
                    ([1 for _ in np.arange(len(row[dep_ind]))],
                     (row[dep_ind], col[dep_ind])),
                    shape=(review_len, review_len), dtype=np.float32
                )
            adjacency_list[-1] = coo_matrix(np.eye(review_len, dtype=np.float32))
            current_dict['adjacency_list'] = deepcopy(adjacency_list)
            current_dict['pos_tags'] = deepcopy(pos_tags)
    print(f'Core: {proc_id}, all {len(row_inds)} rows processed.')
    return adjacency_dict, Counter(dep_list), Counter(pos_tag_list)
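# Simplified sketch of the per-document step above (assumes an installed
# en_core_web_sm pipeline; the sentence is hypothetical and this is not part
# of the original source): build one sparse head -> child adjacency matrix
# from a spaCy dependency parse, as process_single_core does per relation type.
import numpy as np
import spacy
from scipy.sparse import coo_matrix

nlp = spacy.load("en_core_web_sm")
doc = nlp("The cat sat on the mat.")
n = len(doc)
rows = [token.head.i for token in doc]
cols = [token.i for token in doc]
adjacency = coo_matrix((np.ones(len(rows), dtype=np.float32), (rows, cols)), shape=(n, n))
# Each nonzero entry (head, child) marks one dependency arc; the real code keeps
# one matrix per relation type plus an identity matrix for self-loops.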
def get_raw_text_for_trees(treebank_root, splits, tree_files):
    lines = []
    for fname in glob_raw_files(treebank_root, splits):
        with open(fname, 'r', encoding="windows-1252") as f:
            for line in f:
                if line.strip() and not line.startswith('.START'):
                    # Delete invalid characters caused by encoding issues
                    line = line.replace("Õ", "").replace("å", "")
                    lines.append(line)

    reader = BracketParseCorpusReader('.', tree_files)
    target_sents = reader.sents()

    line_iter = iter(lines)
    line = ""
    pairs = []
    for target_sent in target_sents:
        if not line.strip():
            line = next(line_iter)

        # Handle PTB-style escaping mismatches
        target_sent = [standardize_form(word) for word in target_sent]

        # Handle transpositions: sometimes the raw text transposes punctuation,
        # while the parsed version cleans up this transposition
        if 'U.S..' in ''.join(target_sent):
            target_sent = [x.replace('U.S.', 'U.S') for x in target_sent]
        if 'Co.,' in ''.join(target_sent) and 'Co,.' in line:
            target_sent = [x.replace('Co.', 'Co') for x in target_sent]
        if "But that 's" in ' '.join(target_sent) and "But's that" in line:
            target_sent = [x.replace("that", "tha") for x in target_sent]
            target_sent = [x.replace("'s", "t") for x in target_sent]
        if ('-- Freshman football player' in line
                or '-- Sophomore football player' in line
                or '-- Junior football player' in line
                or '-- Senior football player' in line
                or '-- Graduate-student football player' in line
                or '-- Football player' in line
                or '-- Freshman basketball player' in line
                or '-- Sophomore basketball player' in line
                or '-- Junior basketball player' in line
                or '-- Senior basketball player' in line
                or '-- Basketball player' in line) and (
                    '" .' in ' '.join(target_sent) and target_sent[-1] == '.'):
            target_sent = target_sent[:-1]

        # Attempt to align raw and parsed text
        r2p, p2r = tokenizations.get_alignments(line.replace("`", "'"), target_sent)

        # Handle skips: some lines in the raw data are not parsed
        while not all(p2r):
            go_next = False
            if line.startswith('(See') and '-- WSJ' in line:
                go_next = True
            elif line == 'San Diego ':
                go_next = True
            elif line == '" ':
                go_next = True
            if go_next:
                line = next(line_iter)
                r2p, p2r = tokenizations.get_alignments(
                    line.replace("`", "'"), target_sent)
            else:
                break

        # Handle line breaks in raw format that come in the middle of the sentence
        # (such as mid-sentence line breaks in poems)
        for _ in range(12):  # Loop limit is to aid in debugging
            if not all(p2r):
                line = line + next(line_iter)
                r2p, p2r = tokenizations.get_alignments(
                    line.replace("`", "'"), target_sent)

        assert all(p2r)
        end = max([max(x) for x in p2r]) + 1

        # Trim excess raw text at the start
        line_to_save = line[:end]
        r2p, p2r = tokenizations.get_alignments(line_to_save.replace("`", "'"),
                                                target_sent)
        while True:
            _, alt_p2r = tokenizations.get_alignments(
                '\n'.join(line_to_save.replace("`", "'").splitlines()[1:]), target_sent)
            if sum([len(x) for x in p2r]) == sum([len(x) for x in alt_p2r]):
                line_to_save = '\n'.join(line_to_save.splitlines()[1:])
            else:
                break

        pairs.append((line_to_save, target_sent))
        line = line[end:]

    assert len(pairs) == len(target_sents)
    return [line for (line, target_sent) in pairs]
def get_words_and_whitespace(treebank_root, splits, tree_files):
    reader = BracketParseCorpusReader('.', tree_files)
    target_sents = reader.sents()
    raw_sents = get_raw_text_for_trees(treebank_root, splits, tree_files)

    pairs = []
    for line, target_sent in zip(raw_sents, target_sents):
        # Fix some errors in the raw text that are also fixed in the parsed trees
        if "But's that just" in line:
            line = line.replace("But's that just", "But that's just")
        if 'Co,.' in line:
            line = line.replace('Co,.', 'Co.,')
        if 'U.S..' in ''.join(target_sent):
            # Address cases where the underlying "U.S." got tokenized as "U.S." "."
            # This is expected in the sentence-final position, but it seems to
            # occur in other places, too.
            line = line.replace('U.S.', 'U.S..').replace(
                'U.S.. market', 'U.S. market').replace(
                'U.S.. agenda', 'U.S. agenda').replace(
                'U.S.. even', 'U.S. even').replace(
                'U.S.. counterpart', 'U.S. counterpart').replace(
                'U.S.. unit', 'U.S. unit').replace('U.S..,', 'U.S.,')
        words = target_sent[:]
        target_sent = [
            standardize_form(word).replace("``", '"') for word in target_sent
        ]
        r2p, p2r = tokenizations.get_alignments(line.replace("`", "'"), target_sent)

        last_char_for_parsed = [max(x) if x else None for x in p2r]
        have_space_after = [None] * len(words)
        for i, word in enumerate(target_sent):
            if last_char_for_parsed[i] is None:
                continue
            char_after_word = line[last_char_for_parsed[i] + 1:last_char_for_parsed[i] + 2]
            have_space_after[i] = (char_after_word != char_after_word.lstrip())

            # Fix the few cases where the word form in the parsed data is incorrect
            if word == "'T-" and target_sent[i + 1] == 'is':
                target_sent[i] = "'T"
            if word == "16" and target_sent[i + 1:i + 5] == [
                    '64', '-', 'inch', 'opening']:
                # This error occurs in the test set, and moreover would affect
                # tokenization by introducing an extra '/', so we don't fix it.
                # target_sent[i] = "16/"
                have_space_after[i] = True
            if word == "Gaming" and target_sent[i - 1:i + 2] == [
                    'and', 'Gaming', 'company']:
                target_sent[i] = "gaming"
        pairs.append((target_sent, have_space_after))

        # For each token in the treebank, we have now queried the raw string to
        # determine if the token should have whitespace following it. The lines
        # below are a sanity check that the reconstructed text matches the raw
        # version as closely as possible.
        to_delete = set()
        for indices in p2r:
            if not indices:
                continue
            to_delete |= set(range(min(indices), max(indices) + 1)) - set(indices)
        raw = list(line)
        for i in sorted(to_delete, reverse=True):
            del raw[i]
        raw = "".join(raw)
        raw = " ".join(x.strip() for x in raw.split())
        guess = "".join([
            w + (" " if sp else "")
            for (w, sp) in zip(target_sent, have_space_after)
        ])
        if "filings policy-making" in guess:
            # The parsed version of this sentence drops an entire span from the raw
            # text. Maybe we shouldn't be training on this bad example, but for now
            # we'll just skip validating it.
            continue
        # Fix some issues with the raw text that are corrected in the parsed version
        raw = raw.replace("`", "'")
        raw = raw.replace("and <Tourism", "and Tourism")
        raw = raw.replace("staf reporter", "staff reporter")
        if " S$" in raw and " S$" not in guess:
            raw = raw.replace(" S$", " US$")
        raw = raw.replace("16/ 64-inch opening", "16 64-inch opening")
        if raw != guess and raw.replace('."', '".') == guess:
            raw = raw.replace('."', '".')
        # assert raw == guess
        if raw != guess:
            print(raw)
            print(guess)
            print()
    return pairs