Example No. 1
def convert_documents_to_features(examples, tokenizer, max_seq_length,
                                  doc_stride):
    """Loads a data file into a list of `InputBatch`s."""

    unique_id = 1000000000

    features = []
    for (example_index, example) in enumerate(tqdm(examples,
                                                   desc='converting')):

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        # The -2 accounts for [CLS] and [SEP]
        max_tokens_for_doc = max_seq_length - 2

        # We can have documents that are longer than the maximum sequence length.
        # To deal with this we use a sliding window approach, where we take chunks
        # of up to our max length with a stride of `doc_stride`.
        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, doc_stride)

        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            token_to_orig_map = {}
            token_is_max_context = {}
            tokens.append("[CLS]")

            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(
                    tokens)] = tok_to_orig_index[split_token_index]

                is_max_context = _check_is_max_context(doc_spans,
                                                       doc_span_index,
                                                       split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
            tokens.append("[SEP]")

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length

            if example_index < 20:
                logger.info("*** Example ***")
                logger.info("unique_id: %s" % (unique_id))
                logger.info("example_index: %s" % (example_index))
                logger.info("doc_span_index: %s" % (doc_span_index))
                logger.info(
                    "tokens: %s" %
                    " ".join([tokenization.printable_text(x) for x in tokens]))
                logger.info("token_to_orig_map: %s" % " ".join([
                    "%d:%d" % (x, y)
                    for (x, y) in six.iteritems(token_to_orig_map)
                ]))
                logger.info("token_is_max_context: %s" % " ".join([
                    "%d:%s" % (x, y)
                    for (x, y) in six.iteritems(token_is_max_context)
                ]))
                logger.info("input_ids: %s" %
                            " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s" %
                            " ".join([str(x) for x in input_mask]))

            features.append(
                ContextFeatures(unique_id=unique_id,
                                example_index=example_index,
                                doc_span_index=doc_span_index,
                                tokens=tokens,
                                token_to_orig_map=token_to_orig_map,
                                token_is_max_context=token_is_max_context,
                                input_ids=input_ids,
                                input_mask=input_mask))
            unique_id += 1

    return features
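
The core of this function (and of the variants below) is the sliding-window split over all_doc_tokens. The following self-contained sketch isolates just that loop so its behaviour is easy to inspect; the helper name compute_doc_spans and the toy numbers are illustrative and not part of the original code.

import collections

_DocSpan = collections.namedtuple("DocSpan", ["start", "length"])


def compute_doc_spans(num_tokens, max_tokens_for_doc, doc_stride):
    """Reproduces the sliding-window loop above for a document of num_tokens subtokens."""
    doc_spans = []
    start_offset = 0
    while start_offset < num_tokens:
        length = min(num_tokens - start_offset, max_tokens_for_doc)
        doc_spans.append(_DocSpan(start=start_offset, length=length))
        if start_offset + length == num_tokens:
            break
        start_offset += min(length, doc_stride)
    return doc_spans


# A 10-subtoken document with windows of up to 6 tokens and stride 3:
print(compute_doc_spans(10, 6, 3))
# [DocSpan(start=0, length=6), DocSpan(start=3, length=6), DocSpan(start=6, length=4)]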
Example No. 2
def convert_documents_to_features(examples, tokenizer, max_seq_length,
                                  doc_stride):
    """Loads a data file into a list of `InputBatch`s."""

    unique_id = 1000000000
    features = []

    for (example_index,
         example) in enumerate(tqdm(examples, desc='Converting documents')):

        # Creating a map between word <=> (sub)token
        tok_to_word_index = []
        word_to_tok_index = []  # word to (start of) subtokens
        all_doc_tokens = []
        for (i, word) in enumerate(example.doc_words):
            word_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(word)
            for sub_token in sub_tokens:
                tok_to_word_index.append(i)
                all_doc_tokens.append(sub_token)

        # The -2 accounts for [CLS], [SEP]
        max_tokens_for_doc = max_seq_length - 2

        # Split the sequence into chunks of up to max_tokens_for_doc with stride doc_stride;
        # _DocSpan indexes all_doc_tokens, i.e. it excludes [CLS] and [SEP].
        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_tok_offset = 0  # From all_doc_tokens

        # Get doc_spans with stride and offset
        while start_tok_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_tok_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_tok_offset, length=length))
            if start_tok_offset + length == len(all_doc_tokens):
                break
            start_tok_offset += min(
                length, doc_stride)  # in practice this is always doc_stride (see assert below)
            assert doc_stride < length, "doc_stride must be smaller than the span length: {}".format(
                doc_spans)

        # Iterate over each doc_span and build out_tokens
        for (doc_span_index, doc_span) in enumerate(doc_spans):
            out_tokens = []  # doc
            out_tokens.append("[CLS]")
            # Unlike tok_to_word_index, the keys here are positions in out_tokens,
            # which include the special tokens.
            token_to_word_map = {}
            token_is_max_context = {}

            # For each doc token, create token_to_word_map and is_max_context, and add to out_tokens
            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_word_map[len(
                    out_tokens)] = tok_to_word_index[split_token_index]
                is_max_context = _check_is_max_context(doc_spans,
                                                       doc_span_index,
                                                       split_token_index)
                token_is_max_context[len(out_tokens)] = is_max_context
                out_tokens.append(all_doc_tokens[split_token_index])
            out_tokens.append("[SEP]")

            # Convert to ids and masks
            input_ids = tokenizer.convert_tokens_to_ids(out_tokens)
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length

            # Printing for debug
            if example_index < 1 and doc_span_index < 1:
                logger.info("*** Example ***")
                logger.info("unique_id: %s" % (unique_id))
                logger.info("example_index: %s" % (example_index))
                logger.info("doc_span_index: %s" % (doc_span_index))
                logger.info("tokens: %s" % " ".join(
                    [tokenization.printable_text(x) for x in out_tokens]))
                logger.info("token_to_word_map: %s" % " ".join([
                    "%d:%d" % (x, y)
                    for (x, y) in six.iteritems(token_to_word_map)
                ]))
                logger.info("token_is_max_context: %s" % " ".join([
                    "%d:%s" % (x, y)
                    for (x, y) in six.iteritems(token_is_max_context)
                ]))
                logger.info("input_ids: %s" %
                            " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s" %
                            " ".join([str(x) for x in input_mask]))

            # Append feature
            features.append(
                ContextFeatures(unique_id=unique_id,
                                example_index=example_index,
                                doc_span_index=doc_span_index,
                                tokens=out_tokens,
                                token_to_word_map=token_to_word_map,
                                token_is_max_context=token_is_max_context,
                                input_ids=input_ids,
                                input_mask=input_mask))
            unique_id += 1

    return features
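
_check_is_max_context is called in every snippet but never shown. In the reference BERT SQuAD preprocessing (run_squad.py in google-research/bert) it decides, for a token that falls into several overlapping spans, which span gives that token the most surrounding context: the minimum of the left and right context, plus a small bonus for longer spans. A sketch of that reference helper, which the version used here presumably follows:

def _check_is_max_context(doc_spans, cur_span_index, position):
    """Returns True if the span at cur_span_index gives `position` its maximal context."""
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span.start + doc_span.length - 1
        if position < doc_span.start or position > end:
            continue
        num_left_context = position - doc_span.start
        num_right_context = end - position
        # Prefer the span where the token is most centred; break ties toward longer spans.
        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index
    return cur_span_index == best_span_index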
Example No. 3
def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 doc_stride, max_query_length, is_training):
    """Loads a data file into a list of `InputBatch`s."""

    unique_id = 1000000000

    features = []
    question_features = []
    for (example_index, example) in enumerate(tqdm(examples,
                                                   desc='converting')):

        query_tokens = tokenizer.tokenize(example.question_text)
        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        tok_start_position = None
        tok_end_position = None
        if is_training:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                tok_end_position = orig_to_tok_index[example.end_position +
                                                     1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1
            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position,
                tokenizer, example.orig_answer_text)

        # The -2 accounts for [CLS] and [SEP]
        max_tokens_for_doc = max_seq_length - 2

        # We can have documents that are longer than the maximum sequence length.
        # To deal with this we use a sliding window approach, where we take chunks
        # of up to our max length with a stride of `doc_stride`.
        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, doc_stride)

        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            tokens_ = []
            token_to_orig_map = {}
            token_is_max_context = {}
            tokens.append("[CLS]")
            tokens_.append("[CLS]")
            for token in query_tokens:
                tokens_.append(token)
            tokens_.append("[SEP]")

            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(
                    tokens)] = tok_to_orig_index[split_token_index]

                is_max_context = _check_is_max_context(doc_spans,
                                                       doc_span_index,
                                                       split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
            tokens.append("[SEP]")

            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_ids_ = tokenizer.convert_tokens_to_ids(tokens_)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)
            input_mask_ = [1] * len(input_ids_)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length

            while len(input_ids_) < max_query_length + 2:
                input_ids_.append(0)
                input_mask_.append(0)

            assert len(input_ids_) == max_query_length + 2
            assert len(input_mask_) == max_query_length + 2

            start_position = None
            end_position = None
            if example.start_position is not None and example.start_position < 0:
                start_position, end_position = -1, -1
            elif is_training:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                doc_start = doc_span.start
                doc_end = doc_span.start + doc_span.length - 1
                if (example.start_position < doc_start
                        or example.end_position < doc_start
                        or example.start_position > doc_end
                        or example.end_position > doc_end):
                    continue

                doc_offset = 1  # For [CLS]
                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

            if example_index < 20:
                logger.info("*** Example ***")
                logger.info("unique_id: %s" % (unique_id))
                logger.info("example_index: %s" % (example_index))
                logger.info("doc_span_index: %s" % (doc_span_index))
                logger.info(
                    "tokens: %s" %
                    " ".join([tokenization.printable_text(x) for x in tokens]))
                logger.info("token_to_orig_map: %s" % " ".join([
                    "%d:%d" % (x, y)
                    for (x, y) in six.iteritems(token_to_orig_map)
                ]))
                logger.info("token_is_max_context: %s" % " ".join([
                    "%d:%s" % (x, y)
                    for (x, y) in six.iteritems(token_is_max_context)
                ]))
                logger.info("input_ids: %s" %
                            " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s" %
                            " ".join([str(x) for x in input_mask]))
                if is_training:
                    answer_text = " ".join(
                        tokens[start_position:(end_position + 1)])
                    logger.info("start_position: %d" % (start_position))
                    logger.info("end_position: %d" % (end_position))
                    logger.info("answer: %s" %
                                (tokenization.printable_text(answer_text)))

            features.append(
                ContextFeatures(unique_id=unique_id,
                                example_index=example_index,
                                doc_span_index=doc_span_index,
                                tokens=tokens,
                                token_to_orig_map=token_to_orig_map,
                                token_is_max_context=token_is_max_context,
                                input_ids=input_ids,
                                input_mask=input_mask,
                                start_position=start_position,
                                end_position=end_position))
            question_features.append(
                QuestionFeatures(unique_id=unique_id,
                                 example_index=example_index,
                                 input_ids=input_ids_,
                                 input_mask=input_mask_,
                                 tokens=tokens_))
            unique_id += 1

    return features, question_features
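
_improve_answer_span is likewise used without being shown. In the reference BERT SQuAD code it tightens the word-aligned token span so that it exactly matches the re-tokenized orig_answer_text (for instance, recovering the subtokens of "1895" from those of "(1895-1943)"). A sketch of that reference helper; the project's own version may differ in details:

def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                         orig_answer_text):
    """Shrinks (input_start, input_end) to the smallest subtoken span matching the answer."""
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)
    # Fall back to the original span if no exact match is found.
    return (input_start, input_end)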
Example No. 4
def convert_examples_to_features(examples,
                                 tokenizer,
                                 max_seq_length,
                                 doc_stride,
                                 max_query_length,
                                 return_answers,
                                 skip_no_answer,
                                 verbose=False,
                                 save_with_prob=False,
                                 msg="Converting examples"):
    """Loads a data file into a list of `InputBatch`s."""

    unique_id = 1000000000
    features = []
    question_features = []

    for (example_index, example) in enumerate(tqdm(examples, desc=msg)):

        # Tokenize query into (sub)tokens
        query_tokens = tokenizer.tokenize(example.question_text)
        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]

        # Creating a map between word <=> (sub)token
        tok_to_word_index = []
        word_to_tok_index = []  # word to (start of) subtokens
        all_doc_tokens = []
        for (i, word) in enumerate(example.doc_words):
            word_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(word)
            for sub_token in sub_tokens:
                tok_to_word_index.append(i)
                all_doc_tokens.append(sub_token)

        # The -2 accounts for [CLS], [SEP]
        max_tokens_for_doc = max_seq_length - 2

        # Split the sequence into chunks of up to max_tokens_for_doc with stride doc_stride;
        # _DocSpan indexes all_doc_tokens, i.e. it excludes [CLS] and [SEP].
        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_tok_offset = 0  # From all_doc_tokens

        # Get doc_spans with stride and offset
        while start_tok_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_tok_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_tok_offset, length=length))
            if start_tok_offset + length == len(all_doc_tokens):
                break
            start_tok_offset += min(
                length, doc_stride)  # in practice this is always doc_stride (see assert below)
            assert doc_stride < length, "doc_stride must be smaller than the span length: {}".format(
                doc_spans)

        # Iterate over each doc_span and build out_tokens
        for (doc_span_index, doc_span) in enumerate(doc_spans):

            # Find answer position based on new out_tokens
            start_position = None
            end_position = None

            # No-answer examples are marked with (-1, -1); map them to NO_ANS
            if example.start_position is not None and example.start_position < 0:
                assert example.start_position == -1 and example.end_position == -1
                start_position, end_position = NO_ANS, NO_ANS

            # Otherwise, locate the answer span within this doc_span (if present)
            elif return_answers:

                # Get token-level start/end position
                tok_start_position = word_to_tok_index[example.start_position]
                if example.end_position < len(example.doc_words) - 1:
                    # Step back from the first subtoken of the next word
                    tok_end_position = word_to_tok_index[example.end_position + 1] - 1
                else:
                    assert example.end_position == len(example.doc_words) - 1
                    tok_end_position = len(all_doc_tokens) - 1

                # Refine the answer span at the subword level
                (tok_start_position, tok_end_position) = _improve_answer_span(
                    all_doc_tokens, tok_start_position, tok_end_position,
                    tokenizer, example.orig_answer_text)

                # Throw away training samples without answers (due to doc_span split)
                doc_start = doc_span.start
                doc_end = doc_span.start + doc_span.length - 1
                if (tok_start_position < doc_start
                        or tok_end_position < doc_start
                        or tok_start_position > doc_end
                        or tok_end_position > doc_end):
                    if skip_no_answer:
                        continue
                    else:
                        # For NQ, keep only ~2% of no-answer chunks (50x downsampling)
                        if save_with_prob:
                            if np.random.randint(100) < 2:
                                start_position, end_position = NO_ANS, NO_ANS
                            else:
                                continue
                        else:
                            start_position, end_position = NO_ANS, NO_ANS

                # Training samples with answers
                else:
                    doc_offset = 1  # For [CLS]
                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset
                    assert start_position >= 0 and end_position >= 0, (
                        start_position, end_position)

            out_tokens = []  # doc
            out_tokens_ = []  # query
            out_tokens.append("[CLS]")
            out_tokens_.append("[CLS]")
            # Unlike tok_to_word_index, the keys here are positions in out_tokens,
            # which include the special tokens.
            token_to_word_map = {}
            token_is_max_context = {}

            # For query tokens, just copy and add [SEP]
            for token in query_tokens:
                out_tokens_.append(token)
            out_tokens_.append("[SEP]")

            # For each doc token, create token_to_word_map and is_max_context, and add to out_tokens
            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_word_map[len(
                    out_tokens)] = tok_to_word_index[split_token_index]
                is_max_context = _check_is_max_context(doc_spans,
                                                       doc_span_index,
                                                       split_token_index)
                token_is_max_context[len(out_tokens)] = is_max_context
                out_tokens.append(all_doc_tokens[split_token_index])
            out_tokens.append("[SEP]")

            # Convert to ids and masks
            input_ids = tokenizer.convert_tokens_to_ids(out_tokens)
            input_ids_ = tokenizer.convert_tokens_to_ids(out_tokens_)
            input_mask = [1] * len(input_ids)
            input_mask_ = [1] * len(input_ids_)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            while len(
                    input_ids_) < max_query_length + 2:  # +2 for [CLS], [SEP]
                input_ids_.append(0)
                input_mask_.append(0)
            assert len(input_ids_) == max_query_length + 2
            assert len(input_mask_) == max_query_length + 2

            # Printing for debug
            if example_index < 1 and verbose:
                logger.info("*** Example ***")
                logger.info("unique_id: %s" % (unique_id))
                logger.info("example_index: %s" % (example_index))
                logger.info("doc_span_index: %s" % (doc_span_index))
                logger.info("tokens: %s" % " ".join(
                    [tokenization.printable_text(x) for x in out_tokens]))
                logger.info("q tokens: %s" % " ".join(
                    [tokenization.printable_text(x) for x in out_tokens_]))
                logger.info("token_to_word_map: %s" % " ".join([
                    "%d:%d" % (x, y)
                    for (x, y) in six.iteritems(token_to_word_map)
                ]))
                logger.info("token_is_max_context: %s" % " ".join([
                    "%d:%s" % (x, y)
                    for (x, y) in six.iteritems(token_is_max_context)
                ]))
                logger.info("input_ids: %s" %
                            " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s" %
                            " ".join([str(x) for x in input_mask]))
                if return_answers:
                    answer_text = " ".join(
                        out_tokens[start_position:(end_position + 1)])
                    logger.info("start_position: %d" % (start_position))
                    logger.info("end_position: %d" % (end_position))
                    logger.info("answer: %s" %
                                (tokenization.printable_text(answer_text)))

            # Append feature
            features.append(
                ContextFeatures(unique_id=unique_id,
                                example_index=example_index,
                                doc_span_index=doc_span_index,
                                tokens=out_tokens,
                                token_to_word_map=token_to_word_map,
                                token_is_max_context=token_is_max_context,
                                input_ids=input_ids,
                                input_mask=input_mask,
                                start_position=start_position,
                                end_position=end_position))
            question_features.append(
                QuestionFeatures(unique_id=unique_id,
                                 example_index=example_index,
                                 tokens_=out_tokens_,
                                 input_ids=input_ids_,
                                 input_mask=input_mask_))

            # Check validity of answer
            if return_answers:
                if start_position <= NO_ANS:
                    assert start_position == NO_ANS and end_position == NO_ANS, (
                        start_position, end_position)
                else:
                    assert out_tokens[start_position:end_position+1] == \
                            all_doc_tokens[tok_start_position:tok_end_position+1]
                    orig_text, start_pos, end_pos = get_final_text_(
                        example, features[-1], start_position, end_position,
                        True, False)
                    phrase = orig_text[start_pos:end_pos]
                    try:
                        assert phrase == example.orig_answer_text
                    except AssertionError:
                        # Tolerate mismatches between the reconstructed phrase and the original answer text.
                        # print('diff ans [%s]/[%s]' % (phrase, example.orig_answer_text))
                        pass
            unique_id += 1

    return features, question_features
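
Finally, all of these snippets lean on module-level scaffolding that is not shown: imports, a logger, the NO_ANS sentinel, the ContextFeatures/QuestionFeatures containers, and project-specific helpers such as get_final_text_ (which maps token positions back to character offsets in the original text). A minimal sketch of what that context could look like; the field names are inferred from the keyword arguments above and vary slightly between the snippets (token_to_orig_map vs token_to_word_map, tokens vs tokens_), so the original project most likely defines these as plain classes rather than the namedtuples used here.

import collections
import logging

import numpy as np
import six
from tqdm import tqdm

import tokenization  # BERT-style tokenization module providing printable_text()

logger = logging.getLogger(__name__)

NO_ANS = -1  # sentinel start/end position for chunks without an answer

# Field names follow Examples No. 2 and 4; start/end positions are optional
# because Examples No. 1 and 2 never pass them.
ContextFeatures = collections.namedtuple(
    "ContextFeatures",
    ["unique_id", "example_index", "doc_span_index", "tokens",
     "token_to_word_map", "token_is_max_context", "input_ids", "input_mask",
     "start_position", "end_position"],
    defaults=(None, None))

QuestionFeatures = collections.namedtuple(
    "QuestionFeatures",
    ["unique_id", "example_index", "tokens_", "input_ids", "input_mask"])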