Example #1
def tokenize_data(data, token_to_id, char_to_id, limit=None):
    """
    Tokenize a data set, with mapping of tokens to index in origin.
    Also create and update the vocabularies.

    :param: data: a flat, organize view of the data, as a list of qid, passage,
            query and answer indexes.
    :param: vocab: a dict of token to id; updated.
    :param: c_vocab: a dict of char to id; update.

    :return: a tokenized view of the data, as a list of qid, passage, query,
    answer indexes, and token to char indexes mapping.
    Passage and queries are tokenized into a tuple (token, chars).
    Answer indexes are start:stop range of tokens.
    """
    tokenized = []
    for qid, passage, query, (start, stop) in data:
        q_tokens, q_chars, _, _, _ = \
            rich_tokenize(query, token_to_id, char_to_id, update=True)
        p_tokens, p_chars, _, _, mapping = \
            rich_tokenize(passage['passage_text'],
                          token_to_id, char_to_id, update=True)

        if start == 0 and stop == 0:
            pass  # No answer; nop, since 0 == 0
        elif start == 0 and stop == len(passage['passage_text']):
            stop = len(p_tokens)  # Now point to just after last token.
        else:
            t_start = None
            t_end = len(p_tokens)
            for t_ind, (_start, _end) in enumerate(mapping):
                if start < _end:
                    t_start = t_ind
                    break
            assert t_start is not None
            for t_ind, (_start, _end) in \
                    enumerate(mapping[t_start:], t_start):
                if stop < _start:
                    t_end = t_ind
                    break
            start = t_start  # Now point to first token in answer.
            stop = t_end  # Now point to after the last token in answer.

        # Keep or not based on length of passage.
        if limit is not None and len(p_tokens) > limit:
            if stop <= limit:
                # Passage is too long, but it can be trimmed.
                p_tokens = p_tokens[:limit]
            else:
                # Passage is too long, but it cannot be trimmed.
                continue

        tokenized.append(
            (qid,
             (p_tokens, p_chars),
             (q_tokens, q_chars),
             (start, stop),
             mapping))

    return tokenized
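For intuition, here is a small, self-contained sketch of the same char-offset to token-index conversion performed in the else branch above, with a toy mapping (one (char_start, char_end) pair per token) written by hand instead of coming from rich_tokenize:

def char_span_to_token_span(mapping, start, stop):
    """Convert a character range [start, stop) into a token range [t_start, t_end)."""
    t_start = None
    t_end = len(mapping)
    # First token whose character span ends after the answer start.
    for t_ind, (_start, _end) in enumerate(mapping):
        if start < _end:
            t_start = t_ind
            break
    assert t_start is not None
    # First token at or after t_start that begins after the answer stop.
    for t_ind, (_start, _end) in enumerate(mapping[t_start:], t_start):
        if stop < _start:
            t_end = t_ind
            break
    return t_start, t_end

# Toy passage "the cat sat": tokens cover chars [0:3], [4:7] and [8:11].
mapping = [(0, 3), (4, 7), (8, 11)]
print(char_span_to_token_span(mapping, 4, 7))   # (1, 2) -> the token "cat"
print(char_span_to_token_span(mapping, 0, 11))  # (0, 3) -> all three tokens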
Example #2
def tokenize_data(data, token_to_id, char_to_id, limit=None):
    """
    Tokenize a data set, with mapping of tokens to index in origin.
    Also create and update the vocabularies.

    :param: data: a flat, organize view of the data, as a list of qid, passage,
            query and answer indexes.
    :param: vocab: a dict of token to id; updated.
    :param: c_vocab: a dict of char to id; update.

    :return: a tokenized view of the data, as a list of qid, passage, query,
    answer indexes, and token to char indexes mapping.
    Passage and queries are tokenized into a tuple (token, chars).
    Answer indexes are start:stop range of tokens.
    """
    tokenized = []
    for qid, passage, query, (start, stop) in data:
        q_tokens, q_chars, _, _, _ = \
            rich_tokenize(query, token_to_id, char_to_id, update=True)
        p_tokens, p_chars, _, _, mapping = \
            rich_tokenize(passage['passage_text'],
                          token_to_id, char_to_id, update=True)

        if start == 0 and stop == 0:
            pass  # No answer; nop, since 0 == 0
        elif start == 0 and stop == len(passage['passage_text']):
            stop = len(p_tokens)  # Now point to just after last token.
        else:
            t_start = None
            t_end = len(p_tokens)
            for t_ind, (_start, _end) in enumerate(mapping):
                if start < _end:
                    t_start = t_ind
                    break
            assert t_start is not None
            for t_ind, (_start, _end) in \
                    enumerate(mapping[t_start:], t_start):
                if stop < _start:
                    t_end = t_ind
                    break
            start = t_start  # Now point to first token in answer.
            stop = t_end  # Now point to after the last token in answer.

        # Keep or not based on length of passage.
        if limit is not None and len(p_tokens) > limit:
            if stop <= limit:
                # Passage is too long, but it can be trimmed.
                p_tokens = p_tokens[:limit]
            else:
                # Passage is too long, but it cannot be trimmed.
                continue

        tokenized.append((qid, (p_tokens, p_chars), (q_tokens, q_chars),
                          (start, stop), mapping))

    return tokenized
Example #3
    def _to_batch(self, texts):
        """Tokenize a list of texts and pad them into a single batch of
        token and char id arrays."""
        mappings = []
        lengths = []
        c_lengths = []
        tokens = []
        chars = []
        for text in texts:
            _tokens, _chars, length, _c_lengths, mapping = \
                rich_tokenize(text,
                              self.vocab,
                              self.c_vocab, {}, update=False)
            mappings.append(mapping)
            lengths.append(length)
            c_lengths.append(_c_lengths)
            tokens.append(_tokens)
            chars.append(_chars)

        lengths = np.array(lengths)
        p_length = lengths.max()  # longest text in the batch, in tokens
        p_c_length = max(max(_c_lengths)
                         for _c_lengths in c_lengths)  # longest token, in chars

        b_tokens = []
        b_chars = []
        for _tokens, _chars in zip(tokens, chars):
            _tokens, _chars = pad_to_size(_tokens, _chars, p_length,
                                          max(5, p_c_length))
            b_tokens.append(_tokens)
            b_chars.append(_chars)

        b_tokens = np.concatenate(b_tokens)
        b_chars = np.concatenate(b_chars)

        return b_tokens, b_chars, lengths, mappings
Example #4
    def _to_batch(self, texts):
        """Tokenize a list of texts and pad them into a single batch of
        token and char id arrays."""
        mappings = []
        lengths = []
        c_lengths = []
        tokens = []
        chars = []
        for text in texts:
            _tokens, _chars, length, _c_lengths, mapping = \
                rich_tokenize(text,
                              self.vocab,
                              self.c_vocab, update=False)
            mappings.append(mapping)
            lengths.append(length)
            c_lengths.append(_c_lengths)
            tokens.append(_tokens)
            chars.append(_chars)

        lengths = np.array(lengths)
        p_length = lengths.max()
        p_c_length = max(max(_c_lengths) for _c_lengths in c_lengths)

        b_tokens = []
        b_chars = []
        for _tokens, _chars in zip(tokens, chars):
            _tokens, _chars = pad_to_size(_tokens, _chars,
                                          p_length,
                                          max(5, p_c_length))
            b_tokens.append(_tokens)
            b_chars.append(_chars)

        b_tokens = np.concatenate(b_tokens)
        b_chars = np.concatenate(b_chars)

        return b_tokens, b_chars, lengths, mappings
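pad_to_size itself is not shown in these examples; the sketch below is a hypothetical stand-in, assuming it right-pads each token id sequence to a common token length and each per-token char id sequence to a common char width, returning arrays with a leading batch axis of 1 so they can be concatenated as above:

import numpy as np

def pad_to_size(tokens, chars, t_length, c_length):
    """Hypothetical padding helper (an assumption, not the original implementation).

    tokens: list of token ids; chars: list of per-token lists of char ids.
    Returns zero-padded arrays of shape (1, t_length) and (1, t_length, c_length).
    """
    t_out = np.zeros((1, t_length), dtype=np.int64)
    c_out = np.zeros((1, t_length, c_length), dtype=np.int64)
    t_out[0, :len(tokens)] = tokens
    for i, cs in enumerate(chars):
        width = min(len(cs), c_length)
        c_out[0, i, :width] = cs[:width]
    return t_out, c_out

# Two toy texts already mapped to ids.
tokens = [[3, 7, 1], [5, 2]]
chars = [[[1, 2], [3], [4, 5, 6]], [[7], [8, 9]]]
padded = [pad_to_size(t, c, 3, 5) for t, c in zip(tokens, chars)]
b_tokens = np.concatenate([p[0] for p in padded])  # shape (2, 3)
b_chars = np.concatenate([p[1] for p in padded])   # shape (2, 3, 5)
print(b_tokens.shape, b_chars.shape)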
Example #5
def tokenize_data(data, token_to_id, char_to_id, limit=None):
    """
    Tokenize a data set, with mapping of tokens to index in origin.
    Also create and update the vocabularies.

    :param: data: a flat, organize view of the data, as a list of qid, passage,
            query and answer indexes.
    :param: vocab: a dict of token to id; updated.
    :param: c_vocab: a dict of char to id; update.

    :return: a tokenized view of the data, as a list of qid, passage, query,
    answer indexes, and token to char indexes mapping.
    Passage and queries are tokenized into a tuple (token, chars).
    Answer indexes are start:stop range of tokens.
    """
    tokenized = []
    for qid, passage, query, (start, stop) in data:
        q_tokens, q_chars, _, _, _ = \
            rich_tokenize(query, token_to_id, char_to_id, update=True)
        p_tokens, p_chars, _, _, mapping = \
            rich_tokenize(passage['passage_text'],
                          token_to_id, char_to_id, update=True)
        """Convert char position to token position."""
        if start == 0 and stop == 0:
            pass  # No answer; nop, since 0 == 0
        elif start == 0 and stop == len(passage['passage_text']):
            stop = len(p_tokens)  # Now point to just after last token.
        else:
            t_start = None  # token idx
            t_end = len(p_tokens)
            for t_ind, (_start, _end) in enumerate(mapping):
                if start < _end:  # char idx
                    t_start = t_ind
                    break
            assert t_start is not None
            """
            >>> for idx,(i,j) in enumerate(np.array([[1,2],[3,4]]), 6):
            ...     print(idx, i, j)
            ... 
            6 1 2
            7 3 4
            """
            # Start enumerating at t_start so that t_end cannot precede t_start.
            for t_ind, (_start, _end) in \
                    enumerate(mapping[t_start:], t_start):
                if stop < _start:
                    t_end = t_ind
                    break
            start = t_start  # Now point to first token in answer.
            stop = t_end  # Now point to after the last token in answer.

        # Keep or not based on length of passage.
        if limit is not None and len(p_tokens) > limit:  # limit is None by default
            if stop <= limit:
                # Passage is too long, but it can be trimmed, because the
                # answer ends within the limit.
                p_tokens = p_tokens[:limit]
            else:
                # Passage is too long, but it cannot be trimmed.
                continue

        tokenized.append((
            qid,  # query id
            (p_tokens, p_chars),
            (q_tokens, q_chars),
            (start, stop),
            mapping))

    return tokenized
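Going the other way, the mapping returned with each entry can be used to recover the original character span of a predicted token range. Here is a minimal sketch of that inverse lookup, assuming as above that mapping[i] holds the (char_start, char_end) offsets of token i:

def token_span_to_char_span(mapping, start, stop):
    """Map a token range [start, stop) back to character offsets in the passage."""
    if start == stop:  # empty span, e.g. the "no answer" case (0, 0)
        return 0, 0
    return mapping[start][0], mapping[stop - 1][1]

# Toy passage "the cat sat": tokens cover chars [0:3], [4:7] and [8:11].
mapping = [(0, 3), (4, 7), (8, 11)]
print(token_span_to_char_span(mapping, 1, 2))  # (4, 7)  -> "cat"
print(token_span_to_char_span(mapping, 0, 3))  # (0, 11) -> the whole passage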