def add_pos_ne_encoding(tokens, doc, vectors, pos=True, ne=True):
  '''
  Concatenate one-hot POS-tag and named-entity encodings onto the word vectors of a clause.
  @param tokens: RoBERTa tokens for the clause
  @param doc: spaCy doc carrying the POS and named-entity tags
  @param vectors: word vectors for the clause (batch dimension first)
  @param pos: whether to append the POS-tag encodings
  @param ne: whether to append the named-entity encodings
  @return: tensor of word vectors with the tag encodings appended
  '''
  # For each token, look up its aligned spaCy token, fetch the tag encodings,
  # and concatenate them to the token's vector; unaligned tokens are padded with zeros.

  if (not pos) and (not ne):
    return vectors

  spacy_tokens = [token.text for token in doc]
  roberta_tokens = tokens 
  a2b, b2a = tokenizations.get_alignments(spacy_tokens, roberta_tokens)

  new_vectors = []
  for index, alignment in enumerate(b2a):
    if alignment:
      # get the tags from that spacy token and concat
      named_entity_tags = doc[alignment[0]].ent_type_
      pos_tags = doc[alignment[0]].pos_
      new_vectors.append(tf.concat([vectors[0][index], get_one_hot_pos(pos_tags), get_one_hot_ne(named_entity_tags)], axis=0)) 
    else:
      # No aligned spaCy token: pad with zeros matching the combined POS + NE encoding size.
      new_vectors.append(tf.concat([vectors[0][index], tf.zeros([37])], 0))

  new_vectors = tf.stack(new_vectors)
  return new_vectors
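A minimal sketch of the alignment step this function relies on; the token lists below are illustrative, not taken from the original data.

import tokenizations

spacy_tokens = ["I", "like", "walking"]
roberta_tokens = ["I", "like", "walk", "ing"]
a2b, b2a = tokenizations.get_alignments(spacy_tokens, roberta_tokens)
# a2b == [[0], [1], [2, 3]]    each spaCy token -> its subword indices
# b2a == [[0], [1], [2], [2]]  each subword -> the spaCy token it came from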
Example #2
def build_bert_emb(sents: List[List[str]],
                   tokenizer,
                   model,
                   device: str):
    """Build one BERT embedding per original word by mean-pooling the wordpiece
    vectors aligned to it; the [CLS] vector is kept at index 0 of each output."""
    bert_embs = list()
    for i, sent in enumerate(sents):

        joint_sent = ' '.join(sent)
        bert_tokens = tokenizer.tokenize(joint_sent)

        input_ids = torch.tensor([tokenizer.encode(joint_sent, add_special_tokens=True)], device=device)
        # calculate BERT last layer embeddings
        with torch.no_grad():
            last_hidden_states = model(input_ids)[0].squeeze(0).to('cpu')
            trunc_hidden_states = last_hidden_states[1:-1, :]

        ori2bert, bert2ori = get_alignments(sent, bert_tokens)

        emb_list = list()
        for idx in ori2bert:
            emb = trunc_hidden_states[idx, :]
            emb_list.append(emb.mean(dim=0))

        # TODO: using the embedding of [CLS] may not be the best idea
        # It does not matter since that embedding is not used in the training
        emb_list = [last_hidden_states[0, :]] + emb_list
        bert_emb = torch.stack(emb_list)
        bert_embs.append(bert_emb)
    return bert_embs
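A usage sketch, assuming the Hugging Face transformers package and a standard BERT checkpoint; the model name and sentences are illustrative.

from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
model.eval()

sents = [["The", "troops", "withdrew"], ["Prices", "rose"]]
embs = build_bert_emb(sents, tokenizer, model, device="cpu")
# One row per original word plus the [CLS] row at index 0.
print(embs[0].shape)  # torch.Size([4, 768])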
Example #3
def align_tokens(old_tokens, new_tokens, old_labels):
    """Project per-token labels from `old_tokens` onto `new_tokens`; new tokens
    without an alignment keep the default `None` label."""
    a2b, b2a = tokenizations.get_alignments(old_tokens, new_tokens)
    l2 = [None] * len(new_tokens)
    for i in range(len(a2b)):
        l = old_labels[i]
        for j in a2b[i]:
            l2[j] = l
    return l2
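A small worked example of the label projection; the tokens and labels are illustrative.

old_tokens = ["New", "York", "City"]
new_tokens = ["New", "Yo", "rk", "City"]
old_labels = ["B-LOC", "I-LOC", "I-LOC"]
print(align_tokens(old_tokens, new_tokens, old_labels))
# ['B-LOC', 'I-LOC', 'I-LOC', 'I-LOC']  (unaligned new tokens would stay None)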
Example #4
def get_alignment(spans: List[Span], wordpieces: List[List[str]]) -> Ragged:
    """Compute a ragged alignment array that records, for each unique token in
    `spans`, the corresponding indices in the flattened `wordpieces` array.
    For instance, imagine you have two overlapping spans:
    
        [[I, like, walking], [walking, outdoors]]

    And their wordpieces are:

        [[I, like, walk, ing], [walk, ing, out, doors]]

    We want to align "walking" against [walk, ing, walk, ing], which have
    indices [2, 3, 4, 5] once the nested wordpieces list is flattened.

    The nested alignment list would be:

    [[0], [1], [2, 3, 4, 5], [6, 7]]
      I   like    walking    outdoors

    Which gets flattened into the ragged array:

    [0, 1, 2, 3, 4, 5, 6, 7]
    [1, 1, 4, 2]

    The ragged format allows the aligned data to be computed via:

    tokens = Ragged(wp_tensor[align.data], align.lengths)

    This produces a ragged format, indicating which tokens need to be collapsed
    to make the aligned array. The reduction is deferred for a later step, so
    the user can configure it. The indexing is especially efficient in trivial
    cases like this where the indexing array is completely contiguous.
    """
    if len(spans) != len(wordpieces):
        raise ValueError("Cannot align batches of different sizes.")
    # Tokens can occur more than once, and we need the alignment of each token
    # to its place in the concatenated wordpieces array.
    token_positions = get_token_positions(spans)
    alignment: List[Set[int]] = [set() for _ in range(len(token_positions))]
    wp_start = 0
    for i, (span, wp_toks) in enumerate(zip(spans, wordpieces)):
        sp_toks = [token.text for token in span]
        span2wp, wp2span = tokenizations.get_alignments(sp_toks, wp_toks)
        for token, wp_js in zip(span, span2wp):
            position = token_positions[token]
            alignment[position].update(wp_start + j for j in wp_js)
        wp_start += len(wp_toks)
    lengths: List[int] = []
    flat: List[int] = []
    for a in alignment:
        lengths.append(len(a))
        flat.extend(sorted(a))
    align = Ragged(numpy.array(flat, dtype="i"),
                   numpy.array(lengths, dtype="i"))
    return align
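A minimal numpy sketch of the deferred reduction the docstring describes (this is not spacy-transformers' own pooling code): gather wordpiece rows with the flat indices, then pool each token's slice using the lengths.

import numpy

wp_tensor = numpy.arange(8 * 4, dtype="f").reshape(8, 4)  # 8 wordpieces, dim 4
data = numpy.array([0, 1, 2, 3, 4, 5, 6, 7], dtype="i")   # flattened alignment
lengths = numpy.array([1, 1, 4, 2], dtype="i")            # wordpieces per token

gathered = wp_tensor[data]
token_vectors = [seg.mean(axis=0)
                 for seg in numpy.split(gathered, numpy.cumsum(lengths)[:-1])]
# token_vectors[2] pools rows 2..5, i.e. the wordpieces of both occurrences of "walking".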
Example #5
def read_text(text_path):
    """Read a CoNLL-U-style file and return, for each sentence, its word forms
    together with their space-after flags, splitting multiword-token ranges
    back into their component words."""
    sents = []
    sent = []
    end_of_multiword = 0
    multiword_combined = ""
    multiword_separate = []
    multiword_sp_after = False
    with open(text_path) as f:
        for line in f:
            if not line.strip() or line.startswith("#"):
                if sent:
                    sents.append(([w for w, sp in sent], [sp for w, sp in sent]))
                    sent = []
                    assert end_of_multiword == 0
                continue
            fields = line.split("\t", 2)
            num_or_range = fields[0]
            w = fields[1]

            if "-" in num_or_range:
                end_of_multiword = int(num_or_range.split("-")[1])
                multiword_combined = w
                multiword_separate = []
                multiword_sp_after = "SpaceAfter=No" not in fields[-1]
                continue
            elif int(num_or_range) <= end_of_multiword:
                multiword_separate.append(w)
                if int(num_or_range) == end_of_multiword:
                    _, separate_to_combined = tokenizations.get_alignments(
                        multiword_combined, multiword_separate
                    )
                    have_up_to = 0
                    for i, char_idxs in enumerate(separate_to_combined):
                        if i == len(multiword_separate) - 1:
                            word = multiword_combined[have_up_to:]
                            sent.append((word, multiword_sp_after))
                        elif char_idxs:
                            word = multiword_combined[have_up_to : max(char_idxs) + 1]
                            sent.append((word, False))
                            have_up_to = max(char_idxs) + 1
                        else:
                            sent.append(("", False))
                    assert int(num_or_range) == len(sent)
                    end_of_multiword = 0
                    multiword_combined = ""
                    multiword_separate = []
                    multiword_sp_after = False
                continue
            else:
                assert int(num_or_range) == len(sent) + 1
                sp = "SpaceAfter=No" not in fields[-1]
                sent.append((w, sp))
    return sents
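A minimal input/output sketch, assuming a CoNLL-U-style fragment with a multiword-token range; note that a sentence is only flushed when a blank or comment line follows it, so the trailing blank line matters. The file path is illustrative.

import pathlib
import tempfile

sample = (
    "# sent_id = 1\n"
    "1-2\tDon't\t_\n"
    "1\tDo\t_\n"
    "2\tn't\t_\n"
    "3\tpanic\t_\tSpaceAfter=No\n"
    "4\t!\t_\n"
    "\n"
)
path = pathlib.Path(tempfile.mkdtemp()) / "sample.conllu"
path.write_text(sample)
print(read_text(path))
# [(['Do', "n't", 'panic', '!'], [False, True, False, True])]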
Example #6
    def _align(self, segment, wp_tokens, *, offset=0):
        spacy_tokens = [w.text for w in segment]
        a2b, b2a = get_alignments(spacy_tokens, wp_tokens)

        # a2b must cover the boundaries of `segment` (its first and last token
        # indices), so insert them when they are missing.
        if a2b and b2a:
            if len(b2a[0]) == 0:
                a2b[0].insert(0, 0)
            if len(b2a[-1]) == 0:
                a2b[-1].append(len(b2a) - 1)
        a2b = [[i + offset for i in a] for a in a2b]
        return wp_tokens, a2b
Example #7
def convert_to_revised_tokenization(orig_trees, revised_trees):
    """Yield copies of the original trees with their leaves remapped onto the
    revised tokenization, substituting dummy labels/words where the two
    tokenizations do not correspond one-to-one."""
    for orig_tree, revised_tree in zip(orig_trees, revised_trees):
        orig_words = [standardize_form(word) for word in orig_tree.leaves()]
        revised_words = [
            standardize_form(word) for word in revised_tree.leaves()
        ]
        o2r, r2o = tokenizations.get_alignments(orig_words, revised_words)
        assert all(len(x) >= 1 for x in o2r)

        converted_tree = orig_tree.copy(deep=True)
        for j in range(len(revised_words)):
            if len(r2o[j]) > 1:
                for i in r2o[j][1:]:
                    orig_treeposition = orig_tree.leaf_treeposition(i)
                    if len(orig_treeposition) > 1 and len(
                            orig_tree[orig_treeposition[:-1]]) == 1:
                        converted_tree[orig_treeposition[:-1]] = nltk.Tree(
                            DUMMY_LABEL, [DUMMY_WORD])
                    else:
                        converted_tree[orig_treeposition] = DUMMY_LABEL

        for i in range(len(orig_words)):
            if converted_tree[orig_tree.leaf_treeposition(i)] == DUMMY_LABEL:
                continue
            elif len(o2r[i]) == 1:
                j = o2r[i][0]
                converted_tree[orig_tree.leaf_treeposition(i)] = revised_tree[
                    revised_tree.leaf_treeposition(j)]
            else:
                orig_treeposition = orig_tree.leaf_treeposition(i)
                if len(orig_treeposition) > 1 and len(
                        orig_tree[orig_treeposition[:-1]]) == 1:
                    orig_treeposition = orig_treeposition[:-1]
                    revised_leaves = [
                        revised_tree[revised_tree.leaf_treeposition(j)[:-1]]
                        for j in o2r[i]
                    ]
                    assert all(len(x) == 1 for x in revised_leaves)
                    converted_tree[orig_treeposition] = nltk.Tree(
                        DUMMY_LABEL, revised_leaves)
                else:
                    converted_tree[orig_treeposition] = nltk.Tree(
                        DUMMY_LABEL, [
                            revised_tree[revised_tree.leaf_treeposition(j)]
                            for j in o2r[i]
                        ])

        yield converted_tree
Example #8
def respan(src_tokens: List[str], tgt_tokens: List[str],
           src_span: List[tuple]):
    """
    transfer original spans to target spans
    :param src_tokens: source tokens
    :param tgt_tokens: target tokens
    :param src_span: a list of span tuples. The first element in the tuple
    should be the start index and the second should be the end index
    :return: a list of transferred span tuples.
    """
    s2t, _ = get_alignments(src_tokens, tgt_tokens)
    tgt_spans = list()
    for spans in src_span:
        start = s2t[spans[0]][0]
        if spans[1] < len(s2t):
            end = s2t[spans[1]][-1]
        else:
            end = s2t[-1][-1]
        if end == start:
            end += 1
        tgt_spans.append((start, end))

    return tgt_spans
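A small worked example; treating the second element of each span as an exclusive end index is an assumption here, not something the docstring states.

src_tokens = ["New", "York", "is", "big"]
tgt_tokens = ["New", "Yo", "rk", "is", "big"]
print(respan(src_tokens, tgt_tokens, [(0, 2), (3, 4)]))
# [(0, 3), (4, 5)]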
Example #9
def _get_transformers_align(doc: Doc) -> List[List[int]]:
    """Get tokens alignment from spacy tokens to transformers tokens"""
    trf_tokens = doc._.get(ATTRS.cleaned_tokens)
    return get_alignments([token.text for token in doc], trf_tokens)[0]
Example #10
def test_random(a, b):
    tokenizations.get_alignments(a, b)
Example #11
def test_get_alignments(input_, expected):
    output = tokenizations.get_alignments(*input_)
    assert output == expected
Example #12
def test_equality(a):
    a2b, b2a = tokenizations.get_alignments(a, a)
    assert a2b == b2a
    assert a2b == [[i] if len(aa) else [] for i, aa in enumerate(a)]
Example #13
    def process_single_core(self, proc_id: int, row_inds: np.ndarray, df: DataFrame, embedder: Embedder, text_col: str, tree: bool) -> Tuple[dict, Counter, Counter]:
        num_dependencies = len(self.dependencies)
        adjacency_dict = {}
        dep_list = []
        pos_tag_list = []
        for curr_ind, row_ind in enumerate(row_inds):
            if curr_ind % 100 == 0:
                print(f'Core: {proc_id}, {curr_ind} from {len(row_inds)} rows processed.')
            row = df.iloc[row_ind].to_dict()
            id_ = row['id']
            text = row[text_col]
            # NOTE: Temporary fix for RoBERTa models from huggingface transformers.
            text = text.replace(').', ') .')
            document = self.nlp(text)
            adjacency_dict[id_] = {}
            current_dict = adjacency_dict[id_]
            # NOTE: Could be modified to other columns.
            token_dict = embedder(text)
            input_ids = token_dict['input_ids']
            spacy_tokens = [token.text for token in document]
            transformer_tokens = embedder.tokenizer.convert_ids_to_tokens(
                input_ids[0])
            alignment, _ = get_alignments(spacy_tokens, transformer_tokens)
            current_dict['alignment'] = deepcopy(alignment)
            # Review length counted in spaCy tokens; the embedding length is the
            # wordpiece count minus 2 for the special tokens. The two tokenizations
            # may disagree, which is why the alignment is stored above.
            review_len = len(spacy_tokens)
            embedding_len = input_ids.shape[1] - 2
            # Note down the start and end offsets for the review for later use.
            current_dict['start_offset'] = 1
            current_dict['end_offset'] = embedding_len + 1
            # adjacency_tensor = np.zeros(
            #     shape=(num_dependencies + 1, review_len, review_len), dtype=np.float32)
            adjacency_list = [None for _ in np.arange(num_dependencies + 1)]
            row = [[] for _ in np.arange(num_dependencies)]
            col = [[] for _ in np.arange(num_dependencies)]
            pos_tags = []
            if tree:
                for token in document:
                    dep = token.dep_
                    dep_list.append(dep)
                    dep_ind = self.dependency2ind[dep]
                    # NOTE: Self-loop may be spared for our implementation.
                    # Non-existent syntactic relations do not have adjacency weights.
                    # NOTE 2: Self-loop can be indicated as a separate relation.
                    # if token.i < review_len:
                    #     # adjacency_tensor[dep_ind, :, :] = np.eye(review_len, dtype=np.float32)
                    #     adjacency_tensor[dep_ind, token.head.i, token.i] = 1

                    row[dep_ind].append(token.head.i)
                    col[dep_ind].append(token.i)

                    pos_tag_list.append(token.pos_)
                    pos_tags.append(token.pos_)
                # adjacency_tensor[-1, :, :] = np.eye(review_len, dtype=np.float32)
                # current_dict['adjacency_tensor'] = adjacency_tensor

                for dep_ind in np.arange(num_dependencies):
                    adjacency_list[dep_ind] = coo_matrix(
                        ([1 for _ in np.arange(len(row[dep_ind]))],
                         (row[dep_ind], col[dep_ind])),
                        shape=(review_len, review_len),
                        dtype=np.float32
                    )
                adjacency_list[-1] = coo_matrix(
                    np.eye(review_len, dtype=np.float32))
                current_dict['adjacency_list'] = deepcopy(adjacency_list)
                current_dict['pos_tags'] = deepcopy(pos_tags)
            else:
                for token in document:
                    dep = token.dep_
                    dep_list.append(dep)
                    dep_ind = self.dependency2ind[dep]
                    # NOTE: Self-loop may be spared for our implementation.
                    # Non-existent syntactic relations do not have adjacency weights.
                    # NOTE 2: Self-loop can be indicated as a separate relation.
                    # if token.i < review_len:
                    #     # adjacency_tensor[dep_ind, :, :] = np.eye(review_len, dtype=np.float32)
                    #     adjacency_tensor[dep_ind, token.head.i, token.i] = 1
                    #     adjacency_tensor[dep_ind, token.i, token.head.i] = 1

                    row[dep_ind].append(token.head.i)
                    col[dep_ind].append(token.i)
                    row[dep_ind].append(token.i)
                    col[dep_ind].append(token.head.i)

                    pos_tag_list.append(token.pos_)
                    pos_tags.append(token.pos_)
                # adjacency_tensor[-1, :, :] = np.eye(review_len, dtype=np.float32)
                # current_dict['adjacency_tensor'] = adjacency_tensor

                for dep_ind in np.arange(num_dependencies):
                    adjacency_list[dep_ind] = coo_matrix(
                        ([1 for _ in np.arange(len(row[dep_ind]))],
                         (row[dep_ind], col[dep_ind])),
                        shape=(review_len, review_len),
                        dtype=np.float32
                    )
                adjacency_list[-1] = coo_matrix(
                    np.eye(review_len, dtype=np.float32))
                current_dict['adjacency_list'] = deepcopy(adjacency_list)
                current_dict['pos_tags'] = deepcopy(pos_tags)
        print(f'Core: {proc_id}, all {len(row_inds)} rows processed.')
        return adjacency_dict, Counter(dep_list), Counter(pos_tag_list)
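A condensed sketch of the adjacency construction used above, assuming an English spaCy pipeline is installed; the model name and sentence are illustrative, and only a single relation-agnostic matrix is built rather than one matrix per dependency type.

import numpy as np
import spacy
from scipy.sparse import coo_matrix

nlp = spacy.load("en_core_web_sm")
doc = nlp("The troops withdrew quickly.")
n = len(doc)

rows = [token.head.i for token in doc]   # head -> dependent edges
cols = [token.i for token in doc]
adjacency = coo_matrix((np.ones(len(rows), dtype=np.float32), (rows, cols)),
                       shape=(n, n))
# The full method keeps one such matrix per dependency relation, plus an
# identity matrix appended for self-loops.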
Example #14
def get_raw_text_for_trees(treebank_root, splits, tree_files):
    """Recover, for each parsed sentence in the treebank, the span of raw text
    it was derived from."""
    lines = []
    for fname in glob_raw_files(treebank_root, splits):
        with open(fname, 'r', encoding="windows-1252") as f:
            for line in f:
                if line.strip() and not line.startswith('.START'):
                    # Delete invalid characters caused by encoding issues
                    line = line.replace("Õ", "").replace("å", "")
                    lines.append(line)

    reader = BracketParseCorpusReader('.', tree_files)
    target_sents = reader.sents()

    line_iter = iter(lines)
    line = ""
    pairs = []
    for target_sent in target_sents:
        if not line.strip():
            line = next(line_iter)

        # Handle PTB-style escaping mismatches
        target_sent = [standardize_form(word) for word in target_sent]

        # Handle transpositions: sometimes the raw text transposes punctuation,
        # while the parsed version cleans up this transposition
        if 'U.S..' in ''.join(target_sent):
            target_sent = [x.replace('U.S.', 'U.S') for x in target_sent]
        if 'Co.,' in ''.join(target_sent) and 'Co,.' in line:
            target_sent = [x.replace('Co.', 'Co') for x in target_sent]
        if "But that 's" in ' '.join(target_sent) and "But's that" in line:
            target_sent = [x.replace("that", "tha") for x in target_sent]
            target_sent = [x.replace("'s", "t") for x in target_sent]
        if ('-- Freshman football player' in line
                or '-- Sophomore football player' in line
                or '-- Junior football player' in line
                or '-- Senior football player' in line
                or '-- Graduate-student football player' in line
                or '-- Football player' in line
                or '-- Freshman basketball player' in line
                or '-- Sophomore basketball player' in line
                or '-- Junior basketball player' in line
                or '-- Senior basketball player' in line
                or '-- Basketball player' in line) and (
                    '" .' in ' '.join(target_sent) and target_sent[-1] == '.'):
            target_sent = target_sent[:-1]

        # Attempt to align raw and parsed text
        r2p, p2r = tokenizations.get_alignments(line.replace("`", "'"),
                                                target_sent)

        # Handle skips: some lines in the raw data are not parsed
        while not all(p2r):
            go_next = False
            if line.startswith('(See') and '-- WSJ' in line:
                go_next = True
            elif line == 'San Diego ':
                go_next = True
            elif line == '" ':
                go_next = True
            if go_next:
                line = next(line_iter)
                r2p, p2r = tokenizations.get_alignments(
                    line.replace("`", "'"), target_sent)
            else:
                break

        # Handle line breaks in raw format that come in the middle of the sentence
        # (such as mid-sentence line breaks in poems)
        for _ in range(12):  # Loop limit is to aid in debugging
            if not all(p2r):
                line = line + next(line_iter)
                r2p, p2r = tokenizations.get_alignments(
                    line.replace("`", "'"), target_sent)

        assert all(p2r)
        end = max([max(x) for x in p2r]) + 1

        # Trim excess raw text at the start
        line_to_save = line[:end]
        r2p, p2r = tokenizations.get_alignments(line_to_save.replace("`", "'"),
                                                target_sent)
        while True:
            _, alt_p2r = tokenizations.get_alignments(
                '\n'.join(line_to_save.replace("`", "'").splitlines()[1:]),
                target_sent)
            if sum([len(x) for x in p2r]) == sum([len(x) for x in alt_p2r]):
                line_to_save = '\n'.join(line_to_save.splitlines()[1:])
            else:
                break

        pairs.append((line_to_save, target_sent))
        line = line[end:]

    assert len(pairs) == len(target_sents)
    return [line for (line, target_sent) in pairs]
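A minimal sketch of the core alignment step: align raw characters against parsed tokens, then trim the raw line at the last character any token covers. The example strings are illustrative.

import tokenizations

line = "Prices rose sharply. Volume was light."
target_sent = ["Prices", "rose", "sharply", "."]
r2p, p2r = tokenizations.get_alignments(line, target_sent)

assert all(p2r)                       # every parsed token was found in the raw text
end = max(max(x) for x in p2r) + 1    # last raw character covered by the parse
print(line[:end])                     # Prices rose sharply.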
Example #15
def get_words_and_whitespace(treebank_root, splits, tree_files):
    """For each parsed sentence, return its word forms along with a flag per word
    indicating whether it is followed by whitespace in the raw text."""
    reader = BracketParseCorpusReader('.', tree_files)
    target_sents = reader.sents()
    raw_sents = get_raw_text_for_trees(treebank_root, splits, tree_files)

    pairs = []
    for line, target_sent in zip(raw_sents, target_sents):
        # Fix some errors in the raw text that are also fixed in the parsed trees
        if "But's that just" in line:
            line = line.replace("But's that just", "But that's just")
        if 'Co,.' in line:
            line = line.replace('Co,.', 'Co.,')
        if 'U.S..' in ''.join(target_sent):
            # Address cases where the underlying "U.S." got tokenized as "U.S." "."
            # This is expected in the sentence-final position, but it seems to
            # occur in other places, too.
            line = line.replace('U.S.', 'U.S..').replace(
                'U.S.. market',
                'U.S. market').replace('U.S.. agenda', 'U.S. agenda').replace(
                    'U.S.. even', 'U.S. even').replace(
                        'U.S.. counterpart', 'U.S. counterpart').replace(
                            'U.S.. unit',
                            'U.S. unit').replace('U.S..,', 'U.S.,')
        words = target_sent[:]
        target_sent = [
            standardize_form(word).replace("``", '"') for word in target_sent
        ]

        r2p, p2r = tokenizations.get_alignments(line.replace("`", "'"),
                                                target_sent)

        last_char_for_parsed = [max(x) if x else None for x in p2r]
        have_space_after = [None] * len(words)
        for i, word in enumerate(target_sent):
            if last_char_for_parsed[i] is None:
                continue
            char_after_word = line[last_char_for_parsed[i] +
                                   1:last_char_for_parsed[i] + 2]
            have_space_after[i] = (char_after_word != char_after_word.lstrip())

            # Fix the few cases where the word form in the parsed data is incorrect
            if word == "'T-" and target_sent[i + 1] == 'is':
                target_sent[i] = "'T"
            if word == "16" and target_sent[i + 1:i + 5] == [
                    '64', '-', 'inch', 'opening'
            ]:
                # This error occurs in the test set, and moreover would affect
                # tokenization by introducing an extra '/', so we don't fix it.
                # target_sent[i] = "16/"
                have_space_after[i] = True
            if word == "Gaming" and target_sent[i - 1:i + 2] == [
                    'and', 'Gaming', 'company'
            ]:
                target_sent[i] = "gaming"
        pairs.append((target_sent, have_space_after))

        # For each token in the treebank, we have now queried the raw string to
        # determine if the token should have whitespace following it. The lines
        # below are a sanity check that the reconstructed text matches the raw
        # version as closely as possible.
        to_delete = set()
        for indices in p2r:
            if not indices:
                continue
            to_delete |= set(range(min(indices),
                                   max(indices) + 1)) - set(indices)
        raw = list(line)
        for i in sorted(to_delete, reverse=True):
            del raw[i]
        raw = "".join(raw)
        raw = " ".join(x.strip() for x in raw.split())

        guess = "".join([
            w + (" " if sp else "")
            for (w, sp) in zip(target_sent, have_space_after)
        ])

        if "filings policy-making" in guess:
            # The parsed version of this sentence drops an entire span from the raw
            # text. Maybe we shouldn't be training on this bad example, but for now
            # we'll just skip validating it.
            continue

        # Fix some issues with the raw text that are corrected in the parsed version
        raw = raw.replace("`", "'")
        raw = raw.replace("and <Tourism", "and Tourism")
        raw = raw.replace("staf reporter", "staff reporter")
        if " S$" in raw and " S$" not in guess:
            raw = raw.replace(" S$", " US$")
        raw = raw.replace("16/ 64-inch opening", "16 64-inch opening")
        if raw != guess and raw.replace('."', '".') == guess:
            raw = raw.replace('."', '".')

        # assert raw == guess
        if raw != guess:
            print(raw)
            print(guess)
            print()

    return pairs
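A small sketch of the whitespace-recovery step on its own: align parsed tokens against the raw characters, then check whether the character after each token's last aligned character is whitespace. The strings are illustrative.

import tokenizations

line = "Hello, world!"
target_sent = ["Hello", ",", "world", "!"]
_, p2r = tokenizations.get_alignments(line, target_sent)

last_char = [max(x) for x in p2r]
have_space_after = [line[i + 1:i + 2] != line[i + 1:i + 2].lstrip()
                    for i in last_char]
print(have_space_after)  # [False, True, False, False]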