# Example #1
def retokenize_record(record, tokenizer_name):
    """Re-tokenize an edge probing record with the named tokenizer.

    The record is updated in place and also returned:

    * ``record['text']`` is replaced by the new tokens joined with single
      spaces.
    * For every entry of ``record['targets']``, the ``span1`` and ``span2``
      values (when present) are projected onto the new tokenization via the
      aligner's ``project_span`` and stored as lists of ``int``.
    """
    source_text = record['text']
    tokenize_and_align = retokenize.get_aligner_fn(tokenizer_name)
    aligner, tokens = tokenize_and_align(source_text)
    record['text'] = " ".join(tokens)

    for target in record['targets']:
        for span_key in ('span1', 'span2'):
            if span_key not in target:
                continue
            projected = aligner.project_span(*target[span_key])
            # Cast to plain ints, as the original did.
            target[span_key] = [int(index) for index in projected]
    return record
# Example #2
def get_tags(text, current_tags, tokenizer_name, tag_dict):
    """Expand per-word tags to match the tokenizer's sub-token output.

    Each word in ``text`` is tokenized on its own. The first resulting
    sub-token receives ``tag_dict[tag]`` for the word's tag; every additional
    sub-token receives an extra "introduced by the tokenizer" tag whose value
    is ``len(tag_dict)``.

    Args:
        text: sequence of space-tokenized words.
        current_tags: sequence of tags, one per word in ``text``.
        tokenizer_name: name passed to ``retokenize.get_aligner_fn``.
        tag_dict: mapping from each tag in ``current_tags`` to the value
            written for it (presumably integer ids, since ``len(tag_dict)``
            is used as the id of the extra tag -- confirm against callers).

    Returns:
        The expanded tags, converted with ``str`` and joined by single spaces.

    Raises:
        AssertionError: if ``text`` and ``current_tags`` differ in length, or
            if tokenizing word by word yields a different number of tokens
            than tokenizing the whole space-joined text.
        KeyError: if a tag is missing from ``tag_dict``.
    """
    aligner_fn = retokenize.get_aligner_fn(tokenizer_name)
    assert len(text) == len(current_tags)
    # One id past the existing tags marks sub-tokens created by the tokenizer.
    introduced_tokenizer_tag = len(tag_dict)
    res_tags = []
    for token, tag in zip(text, current_tags):
        _, new_toks = aligner_fn(token)
        # based on BERT-paper for wordpiece, we only keep the tag
        # for the first part of the word.
        res_tags.append(tag_dict[tag])
        # Multiplying by a non-positive count yields an empty list, so words
        # that stay a single token add nothing here.
        res_tags.extend([introduced_tokenizer_tag] * (len(new_toks) - 1))
    # Sanity check: the expanded tags must line up with the tokenization of
    # the full text.
    _, aligned_text = aligner_fn(" ".join(text))
    assert len(aligned_text) == len(res_tags)
    return " ".join(str(tag) for tag in res_tags)
def realign_spans(record, tokenizer_name):
    """
    Builds the indices alignment while also tokenizing the input
    piece by piece.

    Only the first entry of ``record["targets"]`` is read and updated.

    Args:
        record: dict with the below fields
            text: str
            targets: list of dictionaries
                label: bool
                span1_index: int, start index of first span
                span1_text: str, text of first span
                span2_index: int, start index of second span
                span2_text: str, text of second span
        tokenizer_name: str, passed to ``retokenize.get_aligner_fn``

    Returns:
        record: the same dict, modified in place, with the below fields:
            text: str in tokenized form
            targets: list whose first dictionary has the below fields
                -label: bool
                -span1: [int, int] of token indices
                -span1_text: str, the string
                -span2: [int, int] of token indices
                -span2_text: str, the string
    """
    # find span indices and text
    text = record["text"].split()
    target = record["targets"][0]
    span1_start = target["span1_index"]
    span2_start = target["span2_index"]

    # construct end spans given span text space-tokenized length
    span1 = [span1_start, span1_start + len(target["span1_text"].strip().split())]
    span2 = [span2_start, span2_start + len(target["span2_text"].strip().split())]
    sorted_indices = sorted([span1, span2], key=lambda span: span[0])

    aligner_fn = retokenize.get_aligner_fn(tokenizer_name)

    # Walk the text left to right, tokenizing the gap before each span and
    # then the span itself. The running length of current_tokenization gives
    # each span's start and end in the new tokenization.
    current_tokenization = []
    span_mapping = {}
    cursor = 0  # position in the space-tokenized text consumed so far
    for start, end in sorted_indices:
        _, gap_tokens = aligner_fn(" ".join(text[cursor:start]))
        current_tokenization.extend(gap_tokens)
        new_start = len(current_tokenization)
        _, span_tokens = aligner_fn(" ".join(text[start:end]))
        current_tokenization.extend(span_tokens)
        # NOTE(review): keyed by original start index, so two spans that
        # start at the same word share one entry -- confirm this cannot occur.
        span_mapping[start] = [new_start, len(current_tokenization)]
        cursor = end

    # save back into record
    _, all_text = aligner_fn(" ".join(text))
    target["span1"] = span_mapping[span1_start]
    target["span2"] = span_mapping[span2_start]
    record["text"] = " ".join(all_text)
    return record