Example #1
    def _convert_y(self, y):
        target_ids = []
        sos_id = self.tokenizer.convert_tokens_to_ids(['<s>'])[0]
        eos_id = self.tokenizer.convert_tokens_to_ids(['</s>'])[0]

        for _y in y:
            if isinstance(_y, str):
                _target_ids = self.tokenizer.convert_tokens_to_ids(
                    self.tokenizer.tokenize(_y))
            elif isinstance(_y, list):
                assert isinstance(
                    _y[0], str), ('Machine translation module only supports '
                                  'single sentence inputs.')
                _target_ids = self.tokenizer.convert_tokens_to_ids(_y)

            utils.truncate_segments([_target_ids],
                                    self.target_max_seq_length - 2,
                                    truncate_method=self.truncate_method)
            _target_ids = [sos_id] + _target_ids + [eos_id]

            if len(_target_ids) < self.target_max_seq_length:
                _target_ids.extend([
                    0 for _ in range(self.target_max_seq_length -
                                     len(_target_ids))
                ])
            target_ids.append(_target_ids)

        return target_ids
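A minimal standalone sketch of the target-side preparation above, with made-up ids in place of the real tokenizer; `pad_target` is a hypothetical helper and the plain slice stands in for `utils.truncate_segments`:

def pad_target(target_ids, sos_id, eos_id, max_len):
    """Truncate, wrap with <s>/</s>, and right-pad with zeros."""
    ids = target_ids[:max_len - 2]            # leave room for <s> and </s>
    ids = [sos_id] + ids + [eos_id]           # sentence boundaries
    return ids + [0] * (max_len - len(ids))   # zero padding up to max_len

# toy vocabulary where <s> = 1 and </s> = 2
print(pad_target([17, 42, 99], sos_id=1, eos_id=2, max_len=8))
# [1, 17, 42, 99, 2, 0, 0, 0]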
Example #2
    def _convert_X(self, X_target, tokenized):

        # tokenize input texts
        segment_input_tokens = []
        for ex_id, example in enumerate(X_target):
            try:
                segment_input_tokens.append(self._convert_x(
                    example, tokenized))
            except Exception:
                tf.logging.warning('Wrong input format (line %d): \'%s\'. ' %
                                   (ex_id, example))

        # If `max_seq_length` is not manually assigned,
        # the value will be set to the maximum length of
        # `input_ids`.
        if not self.max_seq_length:
            max_seq_length = 0
            for segments in segment_input_tokens:
                # one `[SEP]` per segment plus `[CLS]`
                seq_length = sum([len(seg) + 1 for seg in segments]) + 1
                max_seq_length = max(max_seq_length, seq_length)
            self.max_seq_length = max_seq_length
            tf.logging.info('Adaptive max_seq_length: %d' %
                            self.max_seq_length)

        input_ids = []
        input_mask = []
        segment_ids = []
        for ex_id, segments in enumerate(segment_input_tokens):
            _input_ids = []
            _input_mask = []
            _segment_ids = []

            utils.truncate_segments(segments,
                                    self.max_seq_length - len(segments) - 1,
                                    truncate_method=self.truncate_method)

            for s_id, segment in enumerate(segments):
                _segment_id = min(s_id, 1)
                _input_ids.extend(
                    self.tokenizer.convert_tokens_to_ids(segment) + [SEP_ID])
                _input_mask.extend([1] * (len(segment) + 1))
                _segment_ids.extend([_segment_id] * (len(segment) + 1))

            _input_ids.append(CLS_ID)
            _input_mask.append(1)
            _segment_ids.append(SEG_ID_CLS)

            # padding
            if len(_input_ids) < self.max_seq_length:
                delta_len = self.max_seq_length - len(_input_ids)
                _input_ids = [0] * delta_len + _input_ids
                _input_mask = [1] * delta_len + _input_mask  # deliberately 1, not a bug
                _segment_ids = [SEG_ID_PAD] * delta_len + _segment_ids

            input_ids.append(_input_ids)
            input_mask.append(_input_mask)
            segment_ids.append(_segment_ids)

        return input_ids, input_mask, segment_ids
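Two pieces of the routine above are worth isolating: the adaptive `max_seq_length` (one separator per segment plus the classification token) and the left-side padding placed before the real tokens. A minimal sketch with toy data; both helpers are illustrative, not part of the library:

def adaptive_max_seq_length(segmented_examples):
    """Longest example, counting one separator per segment plus one classification token."""
    return max(sum(len(seg) + 1 for seg in segments) + 1
               for segments in segmented_examples)

def left_pad(ids, pad_id, max_len):
    """Prepend padding so real tokens end up flush with the right edge."""
    return [pad_id] * (max_len - len(ids)) + ids

examples = [[['a', 'b'], ['c']], [['d']]]
print(adaptive_max_seq_length(examples))         # 6 = (2 + 1) + (1 + 1) + 1
print(left_pad([7, 8, 9], pad_id=0, max_len=6))  # [0, 0, 0, 7, 8, 9]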
Example #3
    def _convert_X(self, X_target, tokenized):
        input_ids = []
        input_mask = []
        segment_ids = []

        # tokenize input texts
        for ex_id, example in enumerate(X_target):
            _input_tokens = self._convert_x(example, tokenized)

            utils.truncate_segments([_input_tokens],
                                    self.max_seq_length,
                                    truncate_method=self.truncate_method)

            _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)
            _input_mask = [0 for _ in range(len(_input_tokens))]
            _segment_ids = [0 for _ in range(len(_input_tokens))]

            # padding
            for _ in range(self.max_seq_length - len(_input_ids)):
                _input_ids.insert(0, 0)
                _input_mask.insert(0, 1)  # deliberately 1, not a bug
                _segment_ids.insert(0, SEG_ID_PAD)

            input_ids.append(_input_ids)
            input_mask.append(_input_mask)
            segment_ids.append(_segment_ids)

        return input_ids, input_mask, segment_ids
Example #4
    def _convert_X(self, X_target, tokenized, is_training):
        input_ids = []
        input_mask = []
        label_ids = []

        dupe_factor = self._dupe_factor if is_training else 1
        for _ in range(dupe_factor):

            for ex_id, example in enumerate(X_target):
                try:
                    _input_tokens = self._convert_x(example, tokenized)
                except Exception:
                    tf.logging.warning(
                        'Wrong input format (line %d): \'%s\'. ' %
                        (ex_id, example))
                    continue  # skip the malformed example
                _input_tokens = ['[CLS]'] + _input_tokens
                _input_ids = self.tokenizer.convert_tokens_to_ids(
                    _input_tokens)

                utils.truncate_segments([_input_ids],
                                        self.max_seq_length,
                                        truncate_method=self.truncate_method)
                nonpad_seq_length = len(_input_ids)
                _input_mask = [1] * nonpad_seq_length

                if nonpad_seq_length < self.max_seq_length:
                    _input_ids.extend(
                        [0] * (self.max_seq_length - nonpad_seq_length))
                    _input_mask.extend(
                        [0] * (self.max_seq_length - nonpad_seq_length))

                _dilated_ids = []
                _dilated_mask = []
                _label_ids = []
                for i, _input_id in enumerate(_input_ids):
                    _dilated_ids.extend([_input_id, 0])
                    _dilated_mask.extend([_input_mask[i], _input_mask[i]])
                    _label_ids.extend([_input_id, 0])

                # replace/add/subtract
                if is_training:
                    max_replace = int(nonpad_seq_length * self._replace_prob)
                    max_add = int(nonpad_seq_length * self._add_prob)
                    max_subtract = int(nonpad_seq_length * self._subtract_prob)

                    sample_wrong_tokens(_dilated_ids,
                                        _dilated_mask,
                                        _label_ids,
                                        max_replace,
                                        max_add,
                                        max_subtract,
                                        nonpad_seq_length=nonpad_seq_length,
                                        vocab_size=len(self.tokenizer.vocab))

                input_ids.append(_dilated_ids)
                input_mask.append(_dilated_mask)
                label_ids.append(_label_ids)

        return input_ids, input_mask, label_ids
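The "dilated" sequence above interleaves every input id with an empty slot so the later replace/add/subtract corruption has room to insert tokens. A small illustrative sketch of just that interleaving step (`dilate` is a hypothetical helper; `sample_wrong_tokens` itself is not reproduced):

def dilate(input_ids, input_mask):
    """Interleave each id with a zero slot; the mask value covers both slots."""
    dilated_ids, dilated_mask = [], []
    for token_id, mask in zip(input_ids, input_mask):
        dilated_ids.extend([token_id, 0])
        dilated_mask.extend([mask, mask])
    return dilated_ids, dilated_mask

print(dilate([101, 7, 8], [1, 1, 0]))
# ([101, 0, 7, 0, 8, 0], [1, 1, 1, 1, 0, 0])

In the original, `_label_ids` starts out identical to the dilated ids and only diverges once `sample_wrong_tokens` injects noise during training.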
Example #5
    def _convert_X(self, X_target, tokenized):

        # tokenize input texts
        segment_input_tokens = []
        for ex_id, example in enumerate(X_target):
            try:
                segment_input_tokens.append(self._convert_x(
                    example, tokenized))
            except Exception:
                raise ValueError('Wrong input format (line %d): \'%s\'. ' %
                                 (ex_id, example))

        input_ids = []
        input_mask = []
        segment_ids = []
        for ex_id, segments in enumerate(segment_input_tokens):
            _input_ids = []
            _input_mask = []
            _segment_ids = []

            utils.truncate_segments(segments,
                                    self.max_seq_length - len(segments) - 1,
                                    truncate_method=self.truncate_method)

            for s_id, segment in enumerate(segments):
                _segment_id = min(s_id, 1)
                _input_ids.extend(
                    self.tokenizer.convert_tokens_to_ids(segment) + [SEP_ID])
                _input_mask.extend([1] * (len(segment) + 1))
                _segment_ids.extend([_segment_id] * (len(segment) + 1))

            _input_ids.append(CLS_ID)
            _input_mask.append(1)
            _segment_ids.append(SEG_ID_CLS)

            # padding
            if len(_input_ids) < self.max_seq_length:
                delta_len = self.max_seq_length - len(_input_ids)
                _input_ids = [0] * delta_len + _input_ids
                _input_mask = [1] * delta_len + _input_mask  # deliberately 1, not a bug
                _segment_ids = [SEG_ID_PAD] * delta_len + _segment_ids

            input_ids.append(_input_ids)
            input_mask.append(_input_mask)
            segment_ids.append(_segment_ids)

        return input_ids, input_mask, segment_ids
Example #6
    def _convert_X(self, X_target, tokenized):

        # tokenize input texts
        segment_input_tokens = []
        for ex_id, example in enumerate(X_target):
            try:
                segment_input_tokens.append(self._convert_x(
                    example, tokenized))
            except Exception:
                raise ValueError('Wrong input format (line %d): \'%s\'. ' %
                                 (ex_id, example))

        input_ids = []
        input_mask = []
        segment_ids = []
        for ex_id, segments in enumerate(segment_input_tokens):
            _input_tokens = []
            _input_ids = []
            _input_mask = []
            _segment_ids = []

            utils.truncate_segments(segments,
                                    self.max_seq_length - len(segments),
                                    truncate_method=self.truncate_method)
            for s_id, segment in enumerate(segments):
                _segment_id = min(s_id, 1)
                _input_tokens.extend(segment + ['[SEP]'])
                _input_mask.extend([1] * (len(segment) + 1))
                _segment_ids.extend([_segment_id] * (len(segment) + 1))

            _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

            # padding
            for _ in range(self.max_seq_length - len(_input_ids)):
                _input_ids.append(0)
                _input_mask.append(0)
                _segment_ids.append(0)

            input_ids.append(_input_ids)
            input_mask.append(_input_mask)
            segment_ids.append(_segment_ids)

        return input_ids, input_mask, segment_ids
Example #7
    def _convert_X(self, X_target, tokenized):
        input_ids = []

        for ex_id, example in enumerate(X_target):
            try:
                _input_tokens = self._convert_x(example, tokenized)
            except Exception:
                tf.logging.warning('Wrong input format (line %d): \'%s\'. ' %
                                   (ex_id, example))
                continue  # skip the malformed example
            _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

            utils.truncate_segments([_input_ids],
                                    self.max_seq_length - 1,
                                    truncate_method=self.truncate_method)
            _input_ids.append(self._eos_id)

            if len(_input_ids) < self.max_seq_length:
                _input_ids.extend(
                    [0 for _ in range(self.max_seq_length - len(_input_ids))])
            input_ids.append(_input_ids)

        return input_ids
Example #8
    def _convert_X(self, X_target, tokenized):

        # tokenize input texts
        segment_input_values = []
        for ex_id, example in enumerate(X_target):
            try:
                segment_input_values.append(self._convert_x(example))
            except Exception:
                raise ValueError(
                    'Wrong input format (line %d): \'%s\'. An example: '
                    '`X_tokenized = [[[0.0023, -0.0001, 0.0015, ...], ...], '
                    '...]`' % (ex_id, example))

        input_values = []
        input_mask = []
        for ex_id, segments in enumerate(segment_input_values):
            _input_values = []
            _input_mask = []

            utils.truncate_segments([segments],
                                    self.max_seq_length - 1,
                                    truncate_method=self.truncate_method)
            for s_id, segment in enumerate(segments):
                assert len(segment) == self.max_unit_length, (
                    '`max_unit_length` must be equal to the input length of '
                    'each time spot.')
                _input_values.append(segment)
                _input_mask.append(1)

            # padding
            _input_mask.append(1)
            for _ in range(self.max_seq_length - 1 - len(_input_values)):
                _input_values.append([0] * self.max_unit_length)
                _input_mask.append(0)

            input_values.append(_input_values)
            input_mask.append(_input_mask)

        return input_values, input_mask
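This converter consumes fixed-width real-valued frames rather than tokens. A minimal sketch of the frame padding, mirroring the one extra mask position reserved before the zero frames above (`pad_frames` is an illustrative name, not a library function):

def pad_frames(frames, max_seq_length, unit_length):
    """Right-pad fixed-width feature frames with zero vectors."""
    mask = [1] * len(frames) + [1]          # one extra reserved mask position
    padded = list(frames)
    while len(padded) < max_seq_length - 1:
        padded.append([0.0] * unit_length)
        mask.append(0)
    return padded, mask

values, mask = pad_frames([[0.1, 0.2], [0.3, 0.4]], max_seq_length=5, unit_length=2)
print(values)  # [[0.1, 0.2], [0.3, 0.4], [0.0, 0.0], [0.0, 0.0]]
print(mask)    # [1, 1, 1, 0, 0]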
Example #9
    def _convert_X(self, X_target, tokenized):
        source_ids = []

        for ex_id, example in enumerate(X_target):
            try:
                _source_tokens = self._convert_x(example, tokenized)
            except Exception:
                raise ValueError('Wrong input format (line %d): \'%s\'. ' %
                                 (ex_id, example))
            _source_ids = self.tokenizer.convert_tokens_to_ids(_source_tokens)

            utils.truncate_segments([_source_ids],
                                    self.source_max_seq_length,
                                    truncate_method=self.truncate_method)

            if len(_source_ids) < self.source_max_seq_length:
                _source_ids.extend([
                    0 for _ in range(self.source_max_seq_length -
                                     len(_source_ids))
                ])
            source_ids.append(_source_ids)

        return source_ids
Example #10
    def _convert_X(self, X_target, tokenized):

        # tokenize input texts
        segment_input_tokens = []
        for ex_id, example in enumerate(X_target):
            try:
                segment_input_tokens.append(self._convert_x(
                    example, tokenized))
            except Exception:
                raise ValueError(
                    'Wrong input format (line %d): \'%s\'. '
                    'An untokenized example: '
                    '`X = [{\'doc\': \'...\', \'question\': \'...\', ...}, '
                    '...]`' % (ex_id, example))

        # backup for answer mapping
        if self._on_predict:
            self._input_tokens = []

        input_ids = []
        input_mask = []
        sa_mask = []
        segment_ids = []
        doc_ids = []
        doc_text = []
        doc_start = []
        for ex_id, segments in enumerate(segment_input_tokens):
            _input_tokens = ['[CLS]']
            _input_ids = []
            _input_mask = [1]
            _segment_ids = [0]
            _sa_mask = np.zeros((self.max_seq_length, self.max_seq_length),
                                dtype=np.int32)
            _sa_mask[0, 0] = 1

            _doc_sent_tokens = segments.pop('doc')
            _doc_sent_len = len(_doc_sent_tokens)
            segments = list(segments.values()) + _doc_sent_tokens
            utils.truncate_segments(segments,
                                    self.max_seq_length - len(segments) -
                                    _doc_sent_len - 2,
                                    truncate_method=self.truncate_method)
            _doc_sent_tokens = segments[-_doc_sent_len:]

            segments = segments[:-_doc_sent_len]
            for s_id, segment in enumerate(segments):
                _segment_len = len(segment) + 1
                _start_pos = len(_input_tokens)
                _end_pos = _start_pos + len(segment)
                _sa_mask[_start_pos:_end_pos, _start_pos:_end_pos] = 1
                _sa_mask[_end_pos, _end_pos] = 1
                _input_tokens.extend(segment + ['[SEP]'])
                _input_mask.extend([1] * (len(segment) + 1))
                _segment_ids.extend([min(s_id, 1)] * (len(segment) + 1))
            _doc_start = len(_input_tokens)
            if not tokenized:
                _split_tokens = self.tokenizer.tokenize(self.split_sign)
            else:
                _split_tokens = []
            for s_id, segment in enumerate(_doc_sent_tokens):
                _segment_len = len(segment) + len(_split_tokens)
                _start_pos = len(_input_tokens)
                _end_pos = _start_pos + _segment_len
                _sa_mask[_start_pos:_end_pos, _start_pos:_end_pos] = 1
                _input_tokens.extend(segment + _split_tokens)
                _input_mask.extend([1] * _segment_len)
                _segment_ids.extend([1] * _segment_len)
            _input_tokens.append('[SEP]')
            _input_mask.append(1)
            _segment_ids.append(1)

            # backup for answer mapping
            if self._on_predict:
                self._input_tokens.append(_input_tokens)
            _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)
            _doc_ids = _input_ids[_doc_start:-1]

            # padding
            for _ in range(self.max_seq_length - len(_input_ids)):
                _input_ids.append(0)
                _input_mask.append(0)
                _segment_ids.append(0)

            input_ids.append(_input_ids)
            input_mask.append(_input_mask)
            sa_mask.append(np.reshape(_sa_mask, [-1]).tolist())
            segment_ids.append(_segment_ids)
            doc_ids.append(_doc_ids)
            doc_text.append(X_target[ex_id]['doc'])
            doc_start.append(_doc_start)

        return (input_ids, input_mask, sa_mask, segment_ids, doc_ids, doc_text,
                doc_start)
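The `sa_mask` built above is block-diagonal: positions inside the same question segment or the same document sentence may attend to one another, while everything else stays zero. A NumPy sketch with toy block lengths (the real code also folds separator tokens into the blocks):

import numpy as np

def block_diagonal_mask(block_lengths, max_seq_length):
    """1 where query and key positions fall inside the same block, 0 elsewhere."""
    mask = np.zeros((max_seq_length, max_seq_length), dtype=np.int32)
    start = 0
    for length in block_lengths:
        end = start + length
        mask[start:end, start:end] = 1
        start = end
    return mask

# [CLS] alone, a 3-token question + [SEP], then two short document sentences
print(block_diagonal_mask([1, 4, 2, 3], max_seq_length=10))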
Example #11
    def _convert_X(self, X_target, is_training, tokenized):

        # tokenize input texts
        segment_input_tokens = []
        for ex_id, example in enumerate(X_target):
            try:
                segment_input_tokens.append(self._convert_x(
                    example, tokenized))
            except Exception:
                tf.logging.warning('Wrong input format (line %d): \'%s\'. ' %
                                   (ex_id, example))

        input_ids = []
        input_mask = []
        segment_ids = []
        masked_lm_positions = []
        masked_lm_ids = []
        masked_lm_weights = []

        # duplicate raw inputs
        if is_training and self.dupe_factor > 1:
            new_segment_input_tokens = []
            for _ in range(self.dupe_factor):
                new_segment_input_tokens.extend(
                    copy.deepcopy(segment_input_tokens))
            segment_input_tokens = new_segment_input_tokens

        for ex_id, segments in enumerate(segment_input_tokens):
            _input_tokens = ['[CLS]']
            _input_ids = []
            _input_mask = [1]
            _segment_ids = [0]
            _masked_lm_positions = []
            _masked_lm_ids = []
            _masked_lm_weights = []

            utils.truncate_segments(segments,
                                    self.max_seq_length - len(segments) - 1,
                                    truncate_method=self.truncate_method)

            for s_id, segment in enumerate(segments):
                _segment_id = min(s_id, 1)
                _input_tokens.extend(segment + ['[SEP]'])
                _input_mask.extend([1] * (len(segment) + 1))
                _segment_ids.extend([_segment_id] * (len(segment) + 1))

            # random sampling of masked tokens
            if is_training:
                (_input_tokens, _masked_lm_positions, _masked_lm_labels) = \
                    create_masked_lm_predictions(
                        tokens=_input_tokens,
                        masked_lm_prob=self.masked_lm_prob,
                        max_predictions_per_seq=self._max_predictions_per_seq,
                        vocab_words=list(self.tokenizer.vocab.keys()),
                        do_whole_word_mask=self.do_whole_word_mask)
                _masked_lm_ids = \
                    self.tokenizer.convert_tokens_to_ids(_masked_lm_labels)
                _masked_lm_weights = [1.0] * len(_masked_lm_positions)

                # padding
                for _ in range(self._max_predictions_per_seq -
                               len(_masked_lm_positions)):
                    _masked_lm_positions.append(0)
                    _masked_lm_ids.append(0)
                    _masked_lm_weights.append(0.0)
            else:
                # `masked_lm_positions` is required for both training
                # and inference of BERT language modeling.
                for i in range(len(_input_tokens)):
                    if _input_tokens[i] == '[MASK]':
                        _masked_lm_positions.append(i)

                # padding
                for _ in range(self._max_predictions_per_seq -
                               len(_masked_lm_positions)):
                    _masked_lm_positions.append(0)
                for _ in range(self._max_predictions_per_seq):
                    _masked_lm_ids.append(0)

            _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

            # padding
            for _ in range(self.max_seq_length - len(_input_ids)):
                _input_ids.append(0)
                _input_mask.append(0)
                _segment_ids.append(0)

            input_ids.append(_input_ids)
            input_mask.append(_input_mask)
            segment_ids.append(_segment_ids)
            masked_lm_positions.append(_masked_lm_positions)
            masked_lm_ids.append(_masked_lm_ids)
            masked_lm_weights.append(_masked_lm_weights)

        return (input_ids, input_mask, segment_ids, masked_lm_positions,
                masked_lm_ids, masked_lm_weights)
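Whether mask positions are sampled (training) or read off literal '[MASK]' tokens (inference), the three masked-LM fields are padded to `_max_predictions_per_seq` the same way. A minimal sketch of that padding (the helper name is illustrative):

def pad_mlm_fields(positions, label_ids, max_predictions):
    """Pad masked-LM positions/ids/weights to a fixed length; pads get weight 0."""
    weights = [1.0] * len(positions)
    n_pad = max_predictions - len(positions)
    return (positions + [0] * n_pad,
            label_ids + [0] * n_pad,
            weights + [0.0] * n_pad)

tokens = ['[CLS]', 'i', '[MASK]', 'you', '[SEP]']
mask_positions = [i for i, t in enumerate(tokens) if t == '[MASK]']  # inference path
print(mask_positions)                                   # [2]
print(pad_mlm_fields([3, 7], [2048, 1999], max_predictions=4))
# ([3, 7, 0, 0], [2048, 1999, 0, 0], [1.0, 1.0, 0.0, 0.0])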
Example #12
    def _convert_X(self, X_target, tokenized):

        # tokenize input texts
        segment_input_tokens = []
        for ex_id, example in enumerate(X_target):
            try:
                segment_input_tokens.append(self._convert_x(
                    example, tokenized))
            except Exception:
                raise ValueError(
                    'Wrong input format (line %d): \'%s\'. '
                    'An untokenized example: '
                    '`X = [{\'doc\': \'...\', \'question\': \'...\', ...}, '
                    '...]`' % (ex_id, example))

        # backup for answer mapping
        if self._on_predict:
            self._input_tokens = []

        input_ids = []
        input_mask = []
        query_mask = []
        segment_ids = []
        doc_ids = []
        doc_text = []
        doc_start = []
        for ex_id, segments in enumerate(segment_input_tokens):
            _input_tokens = ['[CLS]']
            _input_ids = []
            _input_mask = [1]
            _query_mask = [1]
            _segment_ids = [0]

            _doc_tokens = segments.pop('doc')
            segments = list(segments.values()) + [_doc_tokens]
            utils.truncate_segments(segments,
                                    self.max_seq_length - len(segments) - 1,
                                    truncate_method=self.truncate_method)
            _doc_tokens = segments[-1]

            for s_id, segment in enumerate(segments):
                _segment_id = min(s_id, 1)
                _input_tokens.extend(segment + ['[SEP]'])
                _input_mask.extend([1] * (len(segment) + 1))
                if s_id == 0:
                    _query_mask.extend([1] * (len(segment) + 1))
                _segment_ids.extend([_segment_id] * (len(segment) + 1))
            _doc_start = len(_input_tokens) - len(_doc_tokens) - 1

            # backup for answer mapping
            if self._on_predict:
                self._input_tokens.append(_input_tokens)
            _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)
            _doc_ids = _input_ids[_doc_start:-1]

            # padding
            for _ in range(self.max_seq_length - len(_input_ids)):
                _input_ids.append(0)
                _input_mask.append(0)
                _segment_ids.append(0)
            for _ in range(self.max_seq_length - len(_query_mask)):
                _query_mask.append(0)

            input_ids.append(_input_ids)
            input_mask.append(_input_mask)
            query_mask.append(_query_mask)
            segment_ids.append(_segment_ids)
            doc_ids.append(_doc_ids)
            doc_text.append(X_target[ex_id]['doc'])
            doc_start.append(_doc_start)

        return (input_ids, input_mask, query_mask, segment_ids, doc_ids,
                doc_text, doc_start)
Example #13
    def _convert_X(self, X_target, tokenized):

        # tokenize input texts
        segment_inputs = []
        for ex_id, example in enumerate(X_target):
            try:
                segment_inputs.append(
                    {'Wide': example['Wide'],
                     'Deep': self._convert_x(example['Deep'], tokenized)})
            except Exception:
                raise ValueError(
                    'Wrong input format (line %d): \'%s\'. An untokenized '
                    'example: X = [{\'Wide\': [1, 5, \'positive\'], '
                    '\'Deep\': \'I bet she will win.\'}, ...]'
                    % (ex_id, example))

        if self.wide_features is None:
            self.wide_features = set()
            for segments in segment_inputs:
                for feature in segments['Wide']:
                    self.wide_features.add(feature)
            self.wide_features = list(self.wide_features)
        elif not isinstance(self.wide_features, list):
            raise ValueError(
                '`wide_features` should be a list of possible values '
                '(integer or string). '
                'E.g. [1, \'Positive\', \'Subjective\'].')
        wide_features_map = {
            self.wide_features[i]: i + 1
            for i in range(len(self.wide_features))}

        input_ids = []
        input_mask = []
        segment_ids = []
        n_wide_features = []
        wide_features = []
        for ex_id, segments in enumerate(segment_inputs):
            _input_tokens = ['[CLS]']
            _input_ids = []
            _input_mask = [1]
            _segment_ids = [0]
            _wide_features = []
            for feature in segments['Wide']:
                try:
                    _wide_features.append(wide_features_map[feature])
                except KeyError:
                    tf.logging.warning(
                        'Unregistered wide feature: %s. Ignored.' % feature)
                    continue
            _n_wide_features = len(_wide_features)

            segments = segments['Deep']
            utils.truncate_segments(
                segments, self.max_seq_length - len(segments) - 1,
                truncate_method=self.truncate_method)
            for s_id, segment in enumerate(segments):
                _segment_id = min(s_id, 1)
                _input_tokens.extend(segment + ['[SEP]'])
                _input_mask.extend([1] * (len(segment) + 1))
                _segment_ids.extend([_segment_id] * (len(segment) + 1))

            _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

            # padding
            for _ in range(self.max_seq_length - len(_input_ids)):
                _input_ids.append(0)
                _input_mask.append(0)
                _segment_ids.append(0)
            for _ in range(len(self.wide_features) - _n_wide_features):
                _wide_features.append(0)

            input_ids.append(_input_ids)
            input_mask.append(_input_mask)
            segment_ids.append(_segment_ids)
            n_wide_features.append(_n_wide_features)
            wide_features.append(_wide_features)

        return (input_ids, input_mask, segment_ids,
                n_wide_features, wide_features)
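The wide branch assigns every distinct 'Wide' value a positive id (0 is kept for padding) and pads each example's feature vector to the size of the feature vocabulary. A minimal sketch; unlike the original, the map below is built from a sorted set so the assigned ids are deterministic:

def build_wide_feature_map(wide_examples):
    """Assign ids starting at 1 to every distinct wide-feature value."""
    features = sorted({f for example in wide_examples for f in example}, key=str)
    return {feature: i + 1 for i, feature in enumerate(features)}

def encode_wide(example, feature_map, width):
    """Map known features to ids and zero-pad to a fixed width."""
    ids = [feature_map[f] for f in example if f in feature_map]
    return len(ids), ids + [0] * (width - len(ids))

wide_map = build_wide_feature_map([[1, 5, 'positive'], [2, 'negative']])
print(encode_wide([1, 'positive'], wide_map, width=len(wide_map)))
# (2, [1, 5, 0, 0, 0])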
Example #14
    def _convert_X(self, X_target, is_training, tokenized):

        # tokenize input texts
        segment_input_tokens = []
        for ex_id, example in enumerate(X_target):
            if self.mode in ('l2r', 'r2l'):
                info = '`l2r` or `r2l` only supports single sentence inputs.'
                if not tokenized:
                    assert isinstance(example, str), info
                else:
                    assert isinstance(example[0], str), info
            elif self.mode == 's2s':
                info = '`s2s` only supports 2-sentence inputs.'
                assert len(example) == 2, info
            try:
                segment_input_tokens.append(
                    self._convert_x(example, tokenized))
            except Exception:
                raise ValueError(
                    'Wrong input format (line %d): \'%s\'. '
                    % (ex_id, example))

        input_ids = []
        input_mask = []
        segment_ids = []
        masked_lm_positions = []
        masked_lm_ids = []
        masked_lm_weights = []
        next_sentence_labels = []

        # random sampling of next sentence
        if is_training and self.mode == 'bi' and self.do_sample_next_sentence:
            new_segment_input_tokens = []
            for ex_id in range(len(segment_input_tokens)):
                instances = create_instances_from_document(
                    all_documents=segment_input_tokens,
                    document_index=ex_id,
                    max_seq_length=self.max_seq_length - 3,
                    masked_lm_prob=self.masked_lm_prob,
                    max_predictions_per_seq=self._max_predictions_per_seq,
                    short_seq_prob=self.short_seq_prob,
                    vocab_words=list(self.tokenizer.vocab.keys()))
                for (segments, is_random_next) in instances:
                    new_segment_input_tokens.append(segments)
                    next_sentence_labels.append(is_random_next)
            segment_input_tokens = new_segment_input_tokens
        else:
            next_sentence_labels = [1] * len(segment_input_tokens)

        for ex_id, segments in enumerate(segment_input_tokens):
            _input_tokens = ['[CLS]']
            _input_ids = []
            _input_mask = [1]
            _segment_ids = [0]
            _masked_lm_positions = []
            _masked_lm_ids = []
            _masked_lm_weights = []

            utils.truncate_segments(
                segments, self.max_seq_length - len(segments) - 1,
                truncate_method=self.truncate_method)

            for s_id, segment in enumerate(segments):
                _segment_id = min(s_id, 1)
                _input_tokens.extend(segment + ['[SEP]'])
                _input_mask.extend([1] * (len(segment) + 1))
                _segment_ids.extend([_segment_id] * (len(segment) + 1))

            # special values for `_input_tokens` and `input_mask`
            if self.mode == 's2s':
                _input_tokens.pop()
                _input_tokens.append('[EOS]')
                _input_mask = [len(_input_ids)] * (len(segments[0]) + 2)
                for i in range(len(segments[1]) + 1):
                    _input_mask.append(_input_mask[0] + i + 1)

            # random sampling of masked tokens
            if is_training:
                if (ex_id + 1) % 10000 == 0:
                    tf.logging.info(
                        'Sampling masks of input %d' % (ex_id + 1))
                (_input_tokens, _masked_lm_positions, _masked_lm_labels) = \
                    create_masked_lm_predictions(
                        tokens=_input_tokens,
                        masked_lm_prob=self.masked_lm_prob,
                        max_predictions_per_seq=self._max_predictions_per_seq,
                        vocab_words=list(self.tokenizer.vocab.keys()),
                        do_whole_word_mask=self.do_whole_word_mask)
                _masked_lm_ids = \
                    self.tokenizer.convert_tokens_to_ids(_masked_lm_labels)
                _masked_lm_weights = [1.0] * len(_masked_lm_positions)

                # padding
                for _ in range(self._max_predictions_per_seq -
                               len(_masked_lm_positions)):
                    _masked_lm_positions.append(0)
                    _masked_lm_ids.append(0)
                    _masked_lm_weights.append(0.0)
            else:
                # `masked_lm_positions` is required for both training
                # and inference of BERT language modeling.
                for i in range(len(_input_tokens)):
                    if _input_tokens[i] == '[MASK]':
                        _masked_lm_positions.append(i)

                # padding
                for _ in range(self._max_predictions_per_seq -
                               len(_masked_lm_positions)):
                    _masked_lm_positions.append(0)

            _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

            # padding
            for _ in range(self.max_seq_length - len(_input_ids)):
                _input_ids.append(0)
                _input_mask.append(0)
                _segment_ids.append(0)

            input_ids.append(_input_ids)
            input_mask.append(_input_mask)
            segment_ids.append(_segment_ids)
            masked_lm_positions.append(_masked_lm_positions)
            masked_lm_ids.append(_masked_lm_ids)
            masked_lm_weights.append(_masked_lm_weights)

        return (input_ids, input_mask, segment_ids,
                masked_lm_positions, masked_lm_ids, masked_lm_weights,
                next_sentence_labels)
Example #15
    def _convert_X_reimp(self, X_target, y, tokenized):

        # tokenize input texts
        sup_ori_input_tokens = []
        aug_input_tokens = []
        is_supervised = []
        for ex_id, example in enumerate(X_target):
            try:
                label = y[ex_id]

                if label is None:
                    assert len(example) == 2
                    sup_ori_input_tokens.append(
                        self._convert_x(example[0], tokenized))
                    aug_input_tokens.append(
                        self._convert_x(example[1], tokenized))
                    is_supervised.append(0)
                else:
                    sup_ori_input_tokens.append(
                        self._convert_x(example, tokenized))
                    aug_input_tokens.append([])
                    is_supervised.append(1)
            except AssertionError:
                raise AssertionError(
                    'Must have exactly two inputs for an '
                    'unsupervised example, respectively original '
                    'and augmented.')
            except Exception:
                raise ValueError('Wrong input format (line %d): \'%s\'. ' %
                                 (ex_id, example))

        input_ids = []
        input_mask = []
        segment_ids = []
        for ex_id, segments in enumerate(sup_ori_input_tokens):
            _input_tokens = ['[CLS]']
            _input_ids = []
            _input_mask = [1]
            _segment_ids = [0]

            utils.truncate_segments(segments,
                                    self.max_seq_length - len(segments) - 1,
                                    truncate_method=self.truncate_method)
            for s_id, segment in enumerate(segments):
                _segment_id = min(s_id, 1)
                _input_tokens.extend(segment + ['[SEP]'])
                _input_mask.extend([1] * (len(segment) + 1))
                _segment_ids.extend([_segment_id] * (len(segment) + 1))

            _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

            # padding
            for _ in range(self.max_seq_length - len(_input_ids)):
                _input_ids.append(0)
                _input_mask.append(0)
                _segment_ids.append(0)

            input_ids.append(_input_ids)
            input_mask.append(_input_mask)
            segment_ids.append(_segment_ids)

        aug_input_ids = []
        aug_input_mask = []
        aug_segment_ids = []
        for ex_id, segments in enumerate(aug_input_tokens):
            _input_tokens = ['[CLS]']
            _input_ids = []
            _input_mask = [1]
            _segment_ids = [0]

            utils.truncate_segments(segments,
                                    self.max_seq_length - len(segments) - 1,
                                    truncate_method=self.truncate_method)
            for s_id, segment in enumerate(segments):
                _segment_id = min(s_id, 1)
                _input_tokens.extend(segment + ['[SEP]'])
                _input_mask.extend([1] * (len(segment) + 1))
                _segment_ids.extend([_segment_id] * (len(segment) + 1))

            _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

            # padding
            for _ in range(self.max_seq_length - len(_input_ids)):
                _input_ids.append(0)
                _input_mask.append(0)
                _segment_ids.append(0)

            aug_input_ids.append(_input_ids)
            aug_input_mask.append(_input_mask)
            aug_segment_ids.append(_segment_ids)

        return (input_ids, input_mask, segment_ids, aug_input_ids,
                aug_input_mask, aug_segment_ids, is_supervised)
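UDA-style training mixes labeled examples with (original, augmented) pairs whose label is None; the `is_supervised` flags produced above distinguish the two. A minimal sketch of that split under the same input convention (names are illustrative):

def split_uda_examples(X, y):
    """Separate supervised texts from (original, augmented) unsupervised pairs."""
    originals, augmented, is_supervised = [], [], []
    for example, label in zip(X, y):
        if label is None:               # unsupervised: (original, augmented) pair
            originals.append(example[0])
            augmented.append(example[1])
            is_supervised.append(0)
        else:                           # supervised: a single text with a label
            originals.append(example)
            augmented.append(None)
            is_supervised.append(1)
    return originals, augmented, is_supervised

X = [('good movie', 'great film'), 'terrible plot']
y = [None, 0]
print(split_uda_examples(X, y)[2])  # [0, 1]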
Example #16
    def _convert_X(self, X_target, is_training, tokenized):

        # tokenize input texts
        segment_input_tokens = []
        for ex_id, example in enumerate(X_target):
            try:
                segment_input_tokens.append(self._convert_x(
                    example, tokenized))
            except Exception:
                raise ValueError('Wrong input format (line %d): \'%s\'. ' %
                                 (ex_id, example))

        input_ids = []
        input_mask = []
        segment_ids = []
        masked_lm_positions = []
        masked_lm_ids = []
        masked_lm_weights = []

        # random sampling of next sentence
        if is_training:
            new_segment_input_tokens = []
            for ex_id in range(len(segment_input_tokens)):
                instances = create_instances_from_document(
                    all_documents=segment_input_tokens,
                    document_index=ex_id,
                    max_seq_length=self.max_seq_length - 2,
                    masked_lm_prob=self.masked_lm_prob,
                    max_predictions_per_seq=self._max_predictions_per_seq,
                    vocab_words=list(self.tokenizer.vocab.keys()))
                for segments in instances:
                    new_segment_input_tokens.append(segments)
            segment_input_tokens = new_segment_input_tokens

        for ex_id, segments in enumerate(segment_input_tokens):
            _input_tokens = ['[CLS]']
            _input_ids = []
            _input_mask = [1]
            _segment_ids = [0]
            _masked_lm_positions = []
            _masked_lm_ids = []
            _masked_lm_weights = []

            utils.truncate_segments(segments,
                                    self.max_seq_length - len(segments) - 1,
                                    truncate_method=self.truncate_method)

            for s_id, segment in enumerate(segments):
                _segment_id = min(s_id, 1)
                _input_tokens.extend(segment + ['[SEP]'])
                _input_mask.extend([1] * (len(segment) + 1))
                _segment_ids.extend([_segment_id] * (len(segment) + 1))

            # random sampling of masked tokens
            if is_training:
                if (ex_id + 1) % 10000 == 0:
                    tf.logging.info('Sampling masks of input %d' % (ex_id + 1))
                (_input_tokens, _masked_lm_positions, _masked_lm_labels) = \
                    create_masked_lm_predictions(
                        tokens=_input_tokens,
                        masked_lm_prob=self.masked_lm_prob,
                        max_predictions_per_seq=self._max_predictions_per_seq,
                        vocab_words=list(self.tokenizer.vocab.keys()),
                        do_whole_word_mask=self.do_whole_word_mask)
                _masked_lm_ids = \
                    self.tokenizer.convert_tokens_to_ids(_masked_lm_labels)
                _masked_lm_weights = [1.0] * len(_masked_lm_positions)

                # padding
                for _ in range(self._max_predictions_per_seq -
                               len(_masked_lm_positions)):
                    _masked_lm_positions.append(0)
                    _masked_lm_ids.append(0)
                    _masked_lm_weights.append(0.0)
            else:
                # `masked_lm_positions` is required for both training
                # and inference of BERT language modeling.
                for i in range(len(_input_tokens)):
                    if _input_tokens[i] == '[MASK]':
                        _masked_lm_positions.append(i)

                # padding
                for _ in range(self._max_predictions_per_seq -
                               len(_masked_lm_positions)):
                    _masked_lm_positions.append(0)

            _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

            # padding
            for _ in range(self.max_seq_length - len(_input_ids)):
                _input_ids.append(0)
                _input_mask.append(0)
                _segment_ids.append(0)

            input_ids.append(_input_ids)
            input_mask.append(_input_mask)
            segment_ids.append(_segment_ids)
            masked_lm_positions.append(_masked_lm_positions)
            masked_lm_ids.append(_masked_lm_ids)
            masked_lm_weights.append(_masked_lm_weights)

        return (input_ids, input_mask, segment_ids, masked_lm_positions,
                masked_lm_ids, masked_lm_weights)
Example #17
    def _convert_X(self, X_target, tokenized, is_training):

        # backup for answer mapping
        if self._on_predict:
            self._input_tokens = []

        # tokenize input texts and scan over corpus
        tokenized_input_ids = []
        vocab_size = len(self.tokenizer.vocab)
        vocab_ind = list(range(vocab_size))
        vocab_p = [0] * vocab_size
        for ex_id, sample in enumerate(X_target):
            _input_tokens = self._convert_x(sample, tokenized)

            # skip noise training data
            if is_training:
                if len(_input_tokens) == 0 or \
                        len(_input_tokens) > self.max_seq_length:
                    continue
            else:
                utils.truncate_segments([_input_tokens],
                                        self.max_seq_length,
                                        truncate_method=self.truncate_method)

            # backup for answer mapping
            if self._on_predict:
                self._input_tokens.append(_input_tokens)

            # count token frequencies for the noise-sampling distribution
            _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)
            if is_training:
                for _input_id in _input_ids:
                    vocab_p[_input_id] += 1

            tokenized_input_ids.append(_input_ids)
        if is_training:
            vocab_p_sum = sum(vocab_p)
            vocab_p = [n / vocab_p_sum for n in vocab_p]

        input_ids = []
        add_label_ids = []
        del_label_ids = []
        for ex_id in range(len(tokenized_input_ids)):
            _input_ids = tokenized_input_ids[ex_id]

            nonpad_seq_length = len(_input_ids)
            for _ in range(self.max_seq_length - nonpad_seq_length):
                _input_ids.append(0)

            _add_label_ids = []
            _del_label_ids = []

            # add/del
            if is_training:
                if (ex_id + 1) % 10000 == 0:
                    tf.logging.info('Sampling wrong tokens of input %d' %
                                    (ex_id + 1))

                _add_label_ids = [0] * self.max_seq_length
                _del_label_ids = [0] * self.max_seq_length

                max_add = np.sum(
                    np.random.random(nonpad_seq_length) < self._add_prob)
                max_del = np.sum(
                    np.random.random(nonpad_seq_length) < self._del_prob)

                sample_wrong_tokens(_input_ids,
                                    _add_label_ids,
                                    _del_label_ids,
                                    max_add=max_add,
                                    max_del=max_del,
                                    nonpad_seq_length=nonpad_seq_length,
                                    vocab_size=vocab_size,
                                    vocab_ind=vocab_ind,
                                    vocab_p=vocab_p)

            input_ids.append(_input_ids)
            add_label_ids.append(_add_label_ids)
            del_label_ids.append(_del_label_ids)

        return input_ids, add_label_ids, del_label_ids
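During training, this converter also builds a unigram distribution over the vocabulary from the corpus itself so that `sample_wrong_tokens` can draw plausible noise tokens. A minimal sketch of the frequency count and normalization (the sampling helper itself is not reproduced):

import numpy as np

def unigram_distribution(id_sequences, vocab_size):
    """Relative frequency of every vocabulary id across the corpus."""
    counts = np.zeros(vocab_size, dtype=np.float64)
    for ids in id_sequences:
        for token_id in ids:
            counts[token_id] += 1
    return counts / counts.sum()

p = unigram_distribution([[3, 3, 7], [7, 1]], vocab_size=10)
print(p[1], p[3], p[7])               # 0.2 0.4 0.4
print(np.random.choice(len(p), p=p))  # one id drawn from the corpus distribution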
Example #18
    def _convert_X(self, X_target, is_training, tokenized):

        # tokenize input texts
        segment_input_tokens = []
        for ex_id, example in enumerate(X_target):
            try:
                segment_input_tokens.append(self._convert_x(
                    example, tokenized))
            except Exception:
                tf.logging.warning('Wrong input format (line %d): \'%s\'. ' %
                                   (ex_id, example))

        # If `max_seq_length` is not manually assigned,
        # the value will be set to the maximum length of
        # `input_ids`.
        if not self.max_seq_length:
            max_seq_length = 0
            for segments in segment_input_tokens:
                # one `[SEP]` per segment plus `[CLS]`
                seq_length = sum([len(seg) + 1 for seg in segments]) + 1
                max_seq_length = max(max_seq_length, seq_length)
            self.max_seq_length = max_seq_length
            tf.logging.info('Adaptive max_seq_length: %d' %
                            self.max_seq_length)

        input_ids = []
        input_mask = []
        segment_ids = []
        masked_lm_positions = []
        masked_lm_ids = []
        masked_lm_weights = []
        sentence_order_labels = []

        # duplicate raw inputs
        if is_training and self._dupe_factor > 1:
            new_segment_input_tokens = []
            for _ in range(self._dupe_factor):
                new_segment_input_tokens.extend(
                    copy.deepcopy(segment_input_tokens))
            segment_input_tokens = new_segment_input_tokens

        # random sampling of next sentence
        if is_training and self._do_sample_sentence:
            new_segment_input_tokens = []
            for ex_id in range(len(segment_input_tokens)):
                instances = create_instances_from_document(
                    all_documents=segment_input_tokens,
                    document_index=ex_id,
                    max_seq_length=self.max_seq_length - 3,
                    masked_lm_prob=self._masked_lm_prob,
                    max_predictions_per_seq=self._max_predictions_per_seq,
                    short_seq_prob=self._short_seq_prob,
                    vocab_words=list(self.tokenizer.vocab.keys()))
                for (segments, is_random_next) in instances:
                    new_segment_input_tokens.append(segments)
                    sentence_order_labels.append(is_random_next)
            segment_input_tokens = new_segment_input_tokens

        for ex_id, segments in enumerate(segment_input_tokens):
            _input_tokens = ['[CLS]']
            _input_ids = []
            _input_mask = [1]
            _segment_ids = [0]
            _masked_lm_positions = []
            _masked_lm_ids = []
            _masked_lm_weights = []

            utils.truncate_segments(segments,
                                    self.max_seq_length - len(segments) - 1,
                                    truncate_method=self.truncate_method)

            for s_id, segment in enumerate(segments):
                _segment_id = min(s_id, 1)
                _input_tokens.extend(segment + ['[SEP]'])
                _input_mask.extend([1] * (len(segment) + 1))
                _segment_ids.extend([_segment_id] * (len(segment) + 1))

            # random sampling of masked tokens
            if is_training:
                (_input_tokens, _masked_lm_positions, _masked_lm_labels) = \
                    create_masked_lm_predictions(
                        tokens=_input_tokens,
                        masked_lm_prob=self._masked_lm_prob,
                        max_predictions_per_seq=self._max_predictions_per_seq,
                        vocab_words=list(self.tokenizer.vocab.keys()),
                        ngram=self._ngram,
                        favor_shorter_ngram=self._favor_shorter_ngram,
                        do_permutation=self._do_permutation,
                        do_whole_word_mask=self._do_whole_word_mask)
                _masked_lm_ids = \
                    self.tokenizer.convert_tokens_to_ids(_masked_lm_labels)
                _masked_lm_weights = [1.0] * len(_masked_lm_positions)

                # padding
                for _ in range(self._max_predictions_per_seq *
                               (1 + self._do_permutation) -
                               len(_masked_lm_positions)):
                    _masked_lm_positions.append(0)
                    _masked_lm_ids.append(0)
                    _masked_lm_weights.append(0.0)
            else:
                # `masked_lm_positions` is required for both training
                # and inference of BERT language modeling.
                for i in range(len(_input_tokens)):
                    if _input_tokens[i] == '[MASK]':
                        _masked_lm_positions.append(i)

                # padding
                for _ in range(self._max_predictions_per_seq -
                               len(_masked_lm_positions)):
                    _masked_lm_positions.append(0)

            _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

            # padding
            for _ in range(self.max_seq_length - len(_input_ids)):
                _input_ids.append(0)
                _input_mask.append(0)
                _segment_ids.append(0)

            input_ids.append(_input_ids)
            input_mask.append(_input_mask)
            segment_ids.append(_segment_ids)
            masked_lm_positions.append(_masked_lm_positions)
            masked_lm_ids.append(_masked_lm_ids)
            masked_lm_weights.append(_masked_lm_weights)

        return (input_ids, input_mask, segment_ids, masked_lm_positions,
                masked_lm_ids, masked_lm_weights, sentence_order_labels)
Example #19
    def _convert_X(self, X_target, tokenized):

        # tokenize input texts
        segment_inputs = []
        for ex_id, example in enumerate(X_target):
            try:
                assert len(example['Text']) == len(example['Sem'])
                if isinstance(example['Text'][0], list):
                    for i in range(len(example['Text'])):
                        assert len(example['Text'][i]) == len(example['Sem'][i])
                sem = copy.deepcopy(example['Sem'])
                if not isinstance(sem[0], list):
                    sem = [sem]
                segment_inputs.append(
                    {'Sem': sem,
                     'Text': self._convert_x(example['Text'], tokenized)})
            except Exception:
                raise ValueError(
                    'Wrong input format (line %d): %s. An example: '
                    'X_tokenized = [{\'Sem\': [\'n\', \'v\', \'n\'], '
                    '\'Text\': [\'I\', \'love\', \'you\']}, ...]'
                    % (ex_id, example))

        if self.sem_features is None:
            self.sem_features = set()
            for segments in segment_inputs:
                for segment in segments['Sem']:
                    for feature in segment:
                        self.sem_features.add(feature)
            self.sem_features = list(self.sem_features)
        elif not isinstance(self.sem_features, list):
            raise ValueError(
                '`sem_features` should be a list of possible values '
                '(integer or string). E.g. [\'n\', \'v\', \'adj\'].')
        sem_features_map = {
            self.sem_features[i]: i + 3
            for i in range(len(self.sem_features))}

        input_ids = []
        input_mask = []
        segment_ids = []
        sem_features = []
        for ex_id, segments in enumerate(segment_inputs):
            _input_tokens = ['[CLS]']
            _input_ids = []
            _input_mask = [1]
            _segment_ids = [0]
            _sem_features = [1]  # same as [CLS]

            utils.truncate_segments(
                segments['Text'], self.max_seq_length - len(segments['Text']) - 1,
                truncate_method=self.truncate_method)
            for s_id, segment in enumerate(segments['Text']):
                _segment_id = min(s_id, 1)
                _input_tokens.extend(segment + ['[SEP]'])
                _input_mask.extend([1] * (len(segment) + 1))
                _segment_ids.extend([_segment_id] * (len(segment) + 1))

            _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens)

            for i in range(len(segments['Sem'])):
                segment = segments['Sem'][i]
                n = len(segments['Text'][i])
                for feature in segment[:n]:
                    try:
                        _sem_features.append(sem_features_map[feature])
                    except KeyError:
                        tf.logging.warning(
                            'Unregistered semantic feature: %s. Ignored.'
                            % feature)
                        continue
                _sem_features.append(2)  # same as [SEP]

            # padding
            for _ in range(self.max_seq_length - len(_input_ids)):
                _input_ids.append(0)
                _input_mask.append(0)
                _segment_ids.append(0)
                _sem_features.append(0)

            input_ids.append(_input_ids)
            input_mask.append(_input_mask)
            segment_ids.append(_segment_ids)
            sem_features.append(_sem_features)

        return (input_ids, input_mask, segment_ids, sem_features)
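The semantic branch aligns one tag per text token, reserving ids 1 and 2 for the positions of '[CLS]' and '[SEP]' and 3+ for registered tags. A minimal sketch of that alignment; unlike the original, unknown tags are mapped to 0 here instead of being skipped with a warning:

def encode_sem_features(sem_tags, tag_map):
    """Map per-token semantic tags to ids; 1 and 2 mirror [CLS] and [SEP]."""
    ids = [1]                                # position of [CLS]
    for sentence_tags in sem_tags:
        ids.extend(tag_map.get(tag, 0) for tag in sentence_tags)
        ids.append(2)                        # position of [SEP]
    return ids

tag_map = {'n': 3, 'v': 4, 'adj': 5}
print(encode_sem_features([['n', 'v', 'n']], tag_map))  # [1, 3, 4, 3, 2]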