from keras.preprocessing.sequence import pad_sequences as keras_pad_sequences

def _pad_token_sequences(sequences, max_tokens, padding, truncating, value):
    # Thin wrapper around Keras' pad_sequences; all arguments are required here.
    # TODO: better variable names (see below)
    return keras_pad_sequences(sequences,
                               maxlen=max_tokens,
                               padding=padding,
                               truncating=truncating,
                               value=value)
Example #2
from keras.preprocessing.sequence import pad_sequences as keras_pad_sequences

def _pad_token_sequences(sequences,
                         max_tokens=None,
                         padding='pre',
                         truncating='pre',
                         value=0.):
    # Same wrapper with sensible defaults: max_tokens=None pads every
    # sequence to the length of the longest one, and 'pre' pads/truncates
    # at the front, matching the Keras defaults.
    return keras_pad_sequences(sequences,
                               maxlen=max_tokens,
                               padding=padding,
                               truncating=truncating,
                               value=value)
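
A quick usage sketch of the wrapper above (the input sequences are invented for illustration):

padded = _pad_token_sequences([[1, 2, 3], [4]], max_tokens=4)
# With the 'pre' defaults, shorter sequences are left-padded with zeros:
# array([[0, 1, 2, 3],
#        [0, 0, 0, 4]], dtype=int32)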
Example #3
    def _transform(self, X, y=None, **kwargs):
        if 'maxlen' in self.pad_sequences_args:
            raise ValueError(
                'The `maxlen` argument should not be set in `pad_sequences_args`. Set it in `pad_sequences` instead.'
            )

        analyzer = self.count_vectorizer_.build_analyzer()
        V = self.vocabulary_

        X_transformed = []
        for seq in X:
            indexes = []
            for tok in analyzer(seq):
                index = V.get(tok)

                if not getattr(self, 'skip_unknown', False):
                    # Reserve index 0 for unknown tokens and shift known
                    # vocabulary indexes up by one so they never collide.
                    indexes.append(0 if index is None else (index + 1))
                elif index is not None:
                    # skip_unknown=True: drop out-of-vocabulary tokens.
                    indexes.append(index)
            #end for

            X_transformed.append(indexes)
        #end for

        if self.pad_sequences is not None:
            from keras.preprocessing.sequence import pad_sequences as keras_pad_sequences

            # Resolve maxlen: reuse the length learned on a previous call if
            # one exists; otherwise pad_sequences=True means "pad to the
            # longest sequence in this batch" (maxlen=None), while an integer
            # pad_sequences is used as a fixed length.
            maxlen = getattr(
                self, 'pad_sequences_maxlen_',
                None if self.pad_sequences is True else self.pad_sequences)
            X_transformed = keras_pad_sequences(X_transformed,
                                                maxlen=maxlen,
                                                **self.pad_sequences_args)
            if self.pad_sequences is True or maxlen is not None:
                logger.debug(
                    'TokensToIndexTransformer transformed sequences have max length {}.'
                    .format(X_transformed.shape[1]))

            # Remember the padded length so later batches are padded consistently.
            self.pad_sequences_maxlen_ = X_transformed.shape[1]
        #end if

        return X_transformed
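
For context, a minimal standalone sketch of the indexing scheme used above; the vocabulary and inputs here are invented, whereas the real transformer takes `vocabulary_` from a fitted CountVectorizer:

# Standalone sketch of the indexing scheme; vocabulary and input invented.
V = {'cat': 0, 'sat': 1, 'mat': 2}

def to_indexes(tokens, skip_unknown=False):
    out = []
    for tok in tokens:
        index = V.get(tok)
        if not skip_unknown:
            out.append(0 if index is None else index + 1)  # 0 = unknown
        elif index is not None:
            out.append(index)
    return out

print(to_indexes(['the', 'cat', 'sat']))                     # [0, 1, 2]
print(to_indexes(['the', 'cat', 'sat'], skip_unknown=True))  # [0, 1]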
Example #4
def test_is_reverse_operation():
    from keras.preprocessing.sequence import pad_sequences as keras_pad_sequences

    # unpad_sequences comes from the library under test (import not shown
    # in this snippet); it should undo post-padding exactly.
    test_seq = [[0, 1, 2, 3], [4], [5, 6]]
    padded = keras_pad_sequences(test_seq, padding='post')
    unpadded = unpad_sequences(padded, [4, 1, 2])
    assert [list(item) for item in unpadded] == test_seq
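
The `unpad_sequences` helper itself does not appear in these examples; a plausible sketch of such a helper, assuming post-padded input and known original lengths (an assumption on my part, not the library's actual code):

def unpad_sequences(padded, lengths):
    # Hypothetical sketch, not the library's implementation: slice each
    # post-padded row back down to its recorded original length.
    return [row[:n] for row, n in zip(padded, lengths)]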