import logging

from keras.preprocessing.sequence import pad_sequences as keras_pad_sequences

logger = logging.getLogger(__name__)


def _pad_token_sequences(sequences, max_tokens=None, padding='pre', truncating='pre', value=0.):
    # Thin wrapper around Keras' pad_sequences that renames `maxlen` to
    # `max_tokens` for consistency with the rest of this module.
    return keras_pad_sequences(sequences, maxlen=max_tokens, padding=padding, truncating=truncating, value=value)
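# Quick sanity check of the wrapper (illustrative values, not repo fixtures;
# the expected arrays assume Keras' default int32 dtype):
#
#   >>> _pad_token_sequences([[1, 2, 3], [4]], max_tokens=4)
#   array([[0, 1, 2, 3],
#          [0, 0, 0, 4]], dtype=int32)
#   >>> _pad_token_sequences([[1, 2, 3], [4]], max_tokens=2, padding='post', truncating='post')
#   array([[1, 2],
#          [4, 0]], dtype=int32)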
def _transform(self, X, y=None, **kwargs):
    if 'maxlen' in self.pad_sequences_args:
        raise ValueError('The `maxlen` argument should not be set in `pad_sequences_args`. Set it in `pad_sequences` instead.')

    analyzer = self.count_vectorizer_.build_analyzer()
    V = self.vocabulary_

    X_transformed = []
    for seq in X:
        indexes = []
        for tok in analyzer(seq):
            index = V.get(tok)
            if not getattr(self, 'skip_unknown', False):
                # Reserve index 0 for unknown tokens; shift known vocabulary indexes up by one.
                indexes.append(0 if index is None else (index + 1))
            elif index is not None:
                indexes.append(index)
        #end for

        X_transformed.append(indexes)
    #end for

    if self.pad_sequences is not None:
        # `pad_sequences` is either True (infer maxlen from the data on the first
        # call and cache it in `pad_sequences_maxlen_`) or an explicit integer maxlen.
        maxlen = getattr(self, 'pad_sequences_maxlen_', None if self.pad_sequences is True else self.pad_sequences)
        X_transformed = keras_pad_sequences(X_transformed, maxlen=maxlen, **self.pad_sequences_args)

        if self.pad_sequences is True or maxlen is not None:
            logger.debug('TokensToIndexTransformer transformed sequences have max length {}.'.format(X_transformed.shape[1]))
            self.pad_sequences_maxlen_ = X_transformed.shape[1]
    #end if

    return X_transformed
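# A rough usage sketch of `_transform` (which, per the log message, belongs to a
# TokensToIndexTransformer). The stand-in object below and its attributes
# (`pad_sequences`, `pad_sequences_args`, `skip_unknown`, `count_vectorizer_`,
# `vocabulary_`) are assumptions inferred from the code above, not a confirmed
# public API.
from sklearn.feature_extraction.text import CountVectorizer

class _FakeTransformer:
    pad_sequences = True            # infer maxlen from the data on the first call
    pad_sequences_args = {'padding': 'post'}
    skip_unknown = False            # unknown tokens map to index 0

fake = _FakeTransformer()
fake.count_vectorizer_ = CountVectorizer().fit(['the cat sat', 'the dog'])
fake.vocabulary_ = fake.count_vectorizer_.vocabulary_

out = _transform(fake, ['the cat sat on the mat', 'the dog'])
print(out)           # unknown tokens ('on', 'mat') become 0; known ids are shifted by +1
print(out.shape[1])  # 6: maxlen inferred from the longest sequence and cached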
def test_is_reverse_operation():
    test_seq = [[0, 1, 2, 3], [4], [5, 6]]
    padded = keras_pad_sequences(test_seq, padding='post')
    unpadded = unpad_sequences(padded, [4, 1, 2])
    assert [list(item) for item in unpadded] == test_seq
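# `unpad_sequences` is not shown in this excerpt; a minimal sketch of what such
# a helper could look like, assuming it takes the padded array plus the original
# lengths and defaults to 'post' padding (matching the test above):
def unpad_sequences(padded, lengths, padding='post'):
    if padding == 'post':
        # Keep the first n values of each row; the rest is padding.
        return [row[:n] for row, n in zip(padded, lengths)]
    # 'pre' padding: keep the last n values of each row.
    return [row[-n:] if n else row[:0] for row, n in zip(padded, lengths)]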