Example #1
class WordLanguageModelBuilder:
    # Constructor excerpt; the full class appears in Example #5.
    def __init__(self, args):
        self.learning_rate = args.learningrate
        self.dropout_rate = args.dropoutrate
        self.reg_factor = args.regfactor
        self.n_a = args.hiddensize
        self.step = args.step
        self.seqlen = args.seqlength
        #self.tokenizer = nltk.RegexpTokenizer("\S+|\n+")
        # self.tokenizer = nltk.RegexpTokenizer("\,|\.|&gt|\¯\\\_\(\ツ\)\_\/\¯|\<\@\w+\>|\:\w+\:|\/gif|_|\"| |\w+\'\w+|\w+|\n")
        self.tokenizer = SlidingWindowTokenizer(args)

class SlackTextLineDataset():
    # Constructor excerpt; the full class appears in Example #3.
    def __init__(self, args, data):
        self.freq_threshold = args.freqthreshold
        self.len_threshold = args.lenthreshold
        self.step = args.step
        self.seqlen = args.seqlength
        self.tokenizer = SlidingWindowTokenizer(args)
        # datadir = os.path.join(args.volumedir, args.datadir)
        # file_pattern = datadir + "*.csv"
        # files = tf.data.Dataset.list_files(file_pattern)
        # self.ds = tf.data.TextLineDataset(files)
        self.preprocess(data)
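
Both constructors simply copy hyperparameters off a parsed `args` namespace. A minimal argparse sketch that would satisfy the attribute reads above is shown below; the flag names follow the `args.<name>` accesses in the snippets, while the types and defaults are illustrative guesses, not the repo's actual values.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--learningrate", type=float, default=1e-3)
parser.add_argument("--dropoutrate", type=float, default=0.2)
parser.add_argument("--regfactor", type=float, default=0.01)
parser.add_argument("--hiddensize", type=int, default=128)
parser.add_argument("--step", type=int, default=3)
parser.add_argument("--seqlength", type=int, default=40)
parser.add_argument("--freqthreshold", type=int, default=5)
parser.add_argument("--lenthreshold", type=int, default=3)
args = parser.parse_args([])   # empty argv: fall back to the illustrative defaults
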
Example #3
class AttentionModelBuilder:
    # Constructor excerpt; the full class appears in Example #6.
    def __init__(self, args):
        self.learning_rate = args.learningrate
        self.dropout_rate = args.dropoutrate
        self.reg_factor = args.regfactor
        self.n_a = args.hiddensize
        self.step = args.step
        self.seqlen = args.seqlength
        self.minlen = args.minlength
        self.maxlen = args.maxlength
        self.embedding = args.embedding
        self.embeddingsize = args.embeddingsize
        self.ffdim = args.ffdim
        self.transformer_layers = args.transformer_layers
        self.attention_heads = args.attention_heads

        self.tokenizer = SlidingWindowTokenizer(args)
class SlackTextLineDataset():
    def __init__(self, args, data):
        self.freq_threshold=args.freqthreshold
        self.len_threshold=args.lenthreshold
        self.step = args.step
        self.seqlen = args.seqlength
        self.tokenizer = SlidingWindowTokenizer(args)
        # datadir = os.path.join(args.volumedir, args.datadir)
        # file_pattern = datadir + "*.csv"
        # files = tf.data.Dataset.list_files(file_pattern)
        # self.ds = tf.data.TextLineDataset(files)
        self.preprocess(data)

    def preprocess(self, data):
        token_counts = {}
        self.tokens = []
        for msg in data:
            tokens = self.tokenizer.word_tokenize(msg)
            if len(tokens) < self.len_threshold:
                continue
            token_counts = count_tokens(token_counts, tokens)
            self.tokens.append(tokens)

        freq_filtered = filter(lambda elem: elem[1] >= self.freq_threshold, token_counts.items())
        self.vocab = sorted([elem[0] for elem in list(freq_filtered)])
        self.vocab += ["<UNK>", "<START>", "<PAD>"]
        self.reverse_token_map = {t: i for i, t in enumerate(self.vocab)}

        # self.ds = self.ds.map(lambda x: tf.py_function(func=self.tokenize, inp=[x], Tout=tf.string))
        # token_counts = self.ds.reduce({}, lambda state, x: tf.py_function(count_tokens,[state,x], tf.)
        # self.ds = tf.data.Dataset.from_tensor_slices([vect for seq in seqs for vect in self.tokens_to_sequences(seq)])
        self.ds = None
        self.Xseqs = []
        self.Yseqs = []
        for seq in self.tokens:
            Xs, Ys = self.tokens_to_sequences(seq)
            self.Xseqs.extend(Xs)
            self.Yseqs.extend(Ys)
        self.Xseqs = np.array(self.Xseqs)
        self.Yseqs = np.array(self.Yseqs)
        #     ds = self.tokens_to_sequences(seq)
        #     if self.ds is None:
        #         self.ds = ds
        #     else:
        #         self.ds = self.ds.concatenate(ds)
        # print("SEQUENCES BUILT")
        # return self.ds
        # return selfXseqs, Yseqs

    def tokens_to_sequences(self, tokens):
        if len(tokens) < self.seqlen:
            tokens = char_padded(tokens, "<PAD>", self.seqlen)
        Xseqs = []
        Yseqs = []
        pad_masks = []
        for i in range(0,len(tokens)-self.seqlen+1, self.step):
            x0 = "<START>" if i == 0 else tokens[i - 1]
            Yseq = [get_ix_from_token(self.reverse_token_map, token) for token in tokens[i:i+self.seqlen]]
            Xseq = [get_ix_from_token(self.reverse_token_map, x0)] + Yseq[:-1]
            Yseq = np.array(Yseq)
            Xseq = np.array(Xseq)
            # pad_mask = (Yseq != get_ix_from_token(self.reverse_token_map, "<PAD>")).astype(np.int64)
            # pad_masks.append(pad_mask)
            Yseqs.append(Yseq)
            Xseqs.append(Xseq)
        # Yseqs = tf.data.Dataset.from_tensor_slices(Yseqs)
        # Xseqs = tf.data.Dataset.from_tensor_slices(Xseqs, Yseqs)
        # seqs = tf.data.Dataset.from_tensor_slices((Xseqs,Yseqs))
        return Xseqs, Yseqs
        # return tf.data.Dataset.from_tensor_slices((Xseqs,Yseqs))

    def get_dataset(self):
        return self.Xseqs, self.Yseqs, self.vocab, self.tokens
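
To make the windowing in tokens_to_sequences concrete, here is a self-contained toy run of the same loop: each window of seqlen target indices is paired with the same window shifted right by one position, with <START> (or the token just before the window) in the first slot. The vocabulary and message are invented for illustration.

import numpy as np

vocab = ["<PAD>", "<START>", "<UNK>", "build", "green", "is", "the"]
reverse_token_map = {t: i for i, t in enumerate(vocab)}
tokens = ["the", "build", "is", "green"]
seqlen, step = 3, 1

Xseqs, Yseqs = [], []
for i in range(0, len(tokens) - seqlen + 1, step):
    x0 = "<START>" if i == 0 else tokens[i - 1]
    Yseq = [reverse_token_map[t] for t in tokens[i:i + seqlen]]
    Xseq = [reverse_token_map[x0]] + Yseq[:-1]
    Xseqs.append(Xseq)
    Yseqs.append(Yseq)

print(np.array(Xseqs))   # [[<START>, the, build], [the, build, is]]   as indices
print(np.array(Yseqs))   # [[the, build, is],      [build, is, green]] as indices
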
Example #5
class WordLanguageModelBuilder:
    def __init__(self, args):
        self.learning_rate = args.learningrate
        self.dropout_rate = args.dropoutrate
        self.reg_factor = args.regfactor
        self.n_a = args.hiddensize
        self.step = args.step
        self.seqlen = args.seqlength
        #self.tokenizer = nltk.RegexpTokenizer("\S+|\n+")
        # self.tokenizer = nltk.RegexpTokenizer("\,|\.|&gt|\¯\\\_\(\ツ\)\_\/\¯|\<\@\w+\>|\:\w+\:|\/gif|_|\"| |\w+\'\w+|\w+|\n")
        self.tokenizer = SlidingWindowTokenizer(args)

        # self.tokenizer = TFVectTokenizer(self.seqlen, self.step, args.freqthreshold)

    def tokenize(self, data, freq_threshold=None):
        return self.tokenizer.tokenize(data)

    # def tokenize(self, data, freq_threshold=5):
    #     #tokens = " ".join(data).split(" ")
    #     tokens = self.tokenizer.tokenize(" ".join(data))
    #     token_counts = {}
    #     for t in tokens:
    #         if t not in token_counts.keys():
    #             token_counts[t] = 1
    #         else:
    #             token_counts[t] += 1
    #     freq_filtered = filter(lambda elem: elem[1] >= freq_threshold, token_counts.items())
    #     vocab = sorted([elem[0] for elem in list(freq_filtered)])
    #     #vocab = sorted(list(set(tokens)))
    #     vocab += ["<UNK>"]
    #     reverse_token_map = {t: i for i, t in enumerate(vocab)}
    #     return tokens, vocab, reverse_token_map

    def create_model(self, vocab):
        vocab_size = len(vocab)
        reg = regularizers.l2(self.reg_factor)
        # tf.keras.backend.set_floatx('float64')
        x = Input(shape=(self.seqlen, vocab_size), name="input")
        out = LSTM(self.n_a, return_sequences=True, kernel_regularizer=reg, recurrent_regularizer=reg)(x)
        out = Dropout(self.dropout_rate)(out)
        out = LSTM(self.n_a, return_sequences=True, kernel_regularizer=reg, recurrent_regularizer=reg)(out)
        out = Dropout(self.dropout_rate)(out)
        out = Dense(self.n_a, activation='relu', kernel_regularizer=reg)(out)
        out = Dense(vocab_size, activation='softmax', kernel_regularizer=reg)(out)
        model = keras.Model(inputs=x, outputs=out)
        # opt = RMSprop(learning_rate=self.learning_rate, clipvalue=3)
        # model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=["accuracy"])
        # model.summary(print_fn=logger.info)
        return model

    def sample(self, model, tokens, vocab, reverse_token_map):
        seqlen = self.seqlen
        vocab_size = len(vocab)
        token_ix = -1
        i = random.randint(0, len(tokens) - seqlen - 1)
        inpt = tokens[i:i + seqlen]
        output = ""
        for t in inpt:
            output += t
        output += "->"
        mintokens = 15
        maxtokens = 100
        i = 0
        while i < maxtokens and (i < mintokens or token_ix != reverse_token_map['\n']):
            x = np.zeros((1, seqlen, vocab_size))
            x[0] = [token_to_oh(get_ix_from_token(reverse_token_map, token), vocab_size) for token in inpt]
            preds = model.predict(x, verbose=0)[0]
            preds = preds[min(i, self.seqlen - 1)]
            token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
            new_token = vocab[token_ix]
            output += new_token
            inpt = inpt[1:] + [new_token]
            i+=1
        logger.info("\n" + output)
        return output

    # def get_input_sequences(self, tokens, reverse_token_map):
    #     nummsgs = math.floor((len(tokens) - self.seqlen) / self.step) + 1
    #     j = 0
    #     seqs = []
    #     for i in range(0, len(tokens) - self.seqlen, self.step):
    #         last_ix = min(i + self.seqlen, len(tokens)-1)
    #         Xseq = [get_ix_from_token(reverse_token_map, token) for token in tokens[i:last_ix]]
    #         Yix = get_ix_from_token(reverse_token_map, tokens[last_ix])
    #         seqs.append((Xseq, Yix))
    #         j += 1
    #     return seqs

    def get_input_sequences(self, tokens, reverse_token_map):
        return self.tokenizer.get_input_sequences(tokens,reverse_token_map)

    # def build_input_vectors(self, seqs, vocab, reverse_token_map):
    #     X = np.zeros((len(seqs), self.seqlen))
    #     Y = np.zeros((len(seqs), self.seqlen))
    #
    #     for i, (Xseq, Yseq) in enumerate(seqs):
    #         X[i, :] = Xseq
    #         Y[i, :] = Yseq
    #     return X,Y, None

    def build_input_vectors(self, seqs, vocab, reverse_token_map):
        X = np.zeros((len(seqs), self.seqlen, len(vocab)))
        Y = np.zeros((len(seqs), len(vocab)))
        j = 0
        for Xseq, Yix in seqs:
            X[j, :, :] = [token_to_oh(ix, len(vocab)) for ix in Xseq]
            Y[j, :] = token_to_oh(Yix, len(vocab))
            j+=1
        return X, Y
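
The shapes build_input_vectors produces are easy to lose track of, so here is a self-contained toy version of the same transformation: each (Xseq, Yix) pair from get_input_sequences becomes a (seqlen, vocab) one-hot matrix and a (vocab,) one-hot target. The one_hot helper below is a stand-in for the repo's token_to_oh, and the vocabulary and indices are invented.

import numpy as np

vocab = ["<UNK>", "a", "b", "c"]
seqlen = 3
seqs = [([1, 2, 3], 1), ([2, 3, 1], 2)]        # toy (Xseq indices, Y index) pairs

def one_hot(ix, n):                             # stand-in for token_to_oh
    v = np.zeros(n)
    v[ix] = 1.0
    return v

X = np.zeros((len(seqs), seqlen, len(vocab)))
Y = np.zeros((len(seqs), len(vocab)))
for j, (Xseq, Yix) in enumerate(seqs):
    X[j, :, :] = [one_hot(ix, len(vocab)) for ix in Xseq]
    Y[j, :] = one_hot(Yix, len(vocab))

print(X.shape, Y.shape)                         # (2, 3, 4) (2, 4)
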
Example #6
class AttentionModelBuilder:
    def __init__(self, args):
        self.learning_rate = args.learningrate
        self.dropout_rate = args.dropoutrate
        self.reg_factor = args.regfactor
        self.n_a = args.hiddensize
        self.step = args.step
        self.seqlen = args.seqlength
        self.minlen = args.minlength
        self.maxlen = args.maxlength
        self.embedding = args.embedding
        self.embeddingsize = args.embeddingsize
        self.ffdim = args.ffdim
        self.transformer_layers = args.transformer_layers
        self.attention_heads = args.attention_heads

        self.tokenizer = SlidingWindowTokenizer(args)
        # self.tokenizer = TFVectTokenizer(self.seqlen, self.step, args.freqthreshold)

    def tokenize(self, data, freq_threshold=None):
        return self.tokenizer.tokenize(data)

    def get_masked_datasets(self,
                            dataset,
                            mask_token_ix,
                            vocab_size,
                            pad_mask=None):
        #     dataset = dataset.to_numpy().reshape((dataset.shape[0],1))

        # 15% BERT masking
        if pad_mask is None:
            pad_mask = np.zeros(dataset.shape)
        inp_mask = np.logical_and(
            np.random.rand(*dataset.shape) < 0.15, pad_mask == False)

        labels = -1 * np.ones(dataset.shape)
        labels[inp_mask] = 0

        masked_dataset = np.copy(dataset)

        # 90% of mask indices get set to mask token
        inp_mask = np.logical_and(inp_mask,
                                  np.random.rand(*dataset.shape) < 0.9)
        masked_dataset[inp_mask] = mask_token_ix

        # 10% of mask indices get set to a random token (and 10% remain unchanged)
        inp_mask = np.logical_and(inp_mask,
                                  np.random.rand(*dataset.shape) < 1 / 9)
        masked_dataset[inp_mask] = np.random.randint(0, mask_token_ix,
                                                     inp_mask.sum())

        # To be used to scale loss function to only count masked tokens
        loss_mask = np.ones(dataset.shape, dtype=int)
        loss_mask[labels == -1] = 0

        # The Y labels are just the original dataset
        y_labels = np.copy(dataset)

        return masked_dataset, y_labels, loss_mask

    def get_input_sequences(self, tokens, reverse_token_map):
        return self.tokenizer.get_input_sequences(tokens, reverse_token_map)

    def build_masked_input_vectors(self, seqs, vocab, reverse_token_map):
        mask_token_ix = reverse_token_map["<MASK>"]
        seqs = np.asarray(seqs)
        masked_ds, masked_y, sample_weights = self.get_masked_datasets(
            seqs, mask_token_ix, len(vocab))
        return masked_ds, masked_y, sample_weights

    def build_input_vectors(self, seqs, vocab, reverse_token_map):
        X = np.zeros((len(seqs), self.seqlen))
        Y = np.zeros((len(seqs), self.seqlen))

        for i, (Xseq, Yseq) in enumerate(seqs):
            X[i, :] = Xseq
            Y[i, :] = Yseq
        return X, Y, None

    def transformer_encoder(self, x, i, reg, mask=None):
        # Embedding, self-attention, dropout, residual layerNorm, ffn, residual layerNorm

        # attn_layer = keras.layers.MultiHeadAttention(self.attention_heads, self.n_a//self.attention_heads)
        # attn_out = attn_layer(x,x,x, attention_mask=mask)
        attn_out = multihead_attention(i,
                                       x,
                                       x,
                                       x,
                                       self.attention_heads,
                                       self.n_a,
                                       reg,
                                       self.dropout_rate,
                                       self.seqlen,
                                       mask=mask)

        x = keras.layers.add([attn_out, x])
        x = keras.layers.LayerNormalization(
            epsilon=1e-6, name="encoder_{}/attn_norm".format(i))(x)

        # Feed-forward layer
        ffn = keras.Sequential(
            [
                keras.layers.Dense(
                    self.ffdim, kernel_regularizer=reg, activation="relu"),
                keras.layers.Dense(self.n_a, kernel_regularizer=reg),
            ],
            name="encoder_{}/ffn".format(i),
        )

        ffn_out = ffn(x)
        ffn_out = keras.layers.Dropout(
            self.dropout_rate,
            name="encoder_{}/ffn_dropout".format(i))(ffn_out)

        x = keras.layers.add([ffn_out, x])
        x = keras.layers.LayerNormalization(
            epsilon=1e-6, name="encoder_{}/ffn_norm".format(i))(x)
        return x

    def subsequent_mask(self, shape):
        "Mask out subsequent positions."
        subsequent_mask = np.triu(np.ones(shape), k=1).astype('uint8')
        return subsequent_mask == 0
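        # For example, subsequent_mask(4) yields the boolean causal mask below,
        # so position i can attend to positions j <= i only:
        #   [[ True, False, False, False],
        #    [ True,  True, False, False],
        #    [ True,  True,  True, False],
        #    [ True,  True,  True,  True]]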

    def transformer_decoder(self, encoder_out, targets, reg, mask=None):
        # Embedding, self attention, encoder attention, dropout, residual layerNorm, ffn, dropout, res norm, dense softmax
        m = tf.shape(targets)[0]  # batch size; only used by the commented-out attention call below

        attn_layer_1 = keras.layers.MultiHeadAttention(4, self.n_a // 4)
        attn_out_1 = attn_layer_1(targets,
                                  targets,
                                  targets,
                                  attention_mask=mask)
        # attn_out = multihead_attention(x, x, x, 4, self.n_a, m, reg, self.dropout_rate, mask=mask)
        #     attn_out = tf.reshape(out, (m,seqlen*n_a))

        attn_out_1 = keras.layers.LayerNormalization(
            epsilon=1e-6, name="decoder/attn_norm_1")(targets + attn_out_1)

        attn_layer_2 = keras.layers.MultiHeadAttention(4, self.n_a // 4)
        # Cross-attention: queries come from the decoder stream, keys/values from the encoder.
        attn_out_2 = attn_layer_2(attn_out_1,
                                  encoder_out,
                                  encoder_out,
                                  attention_mask=mask)
        # attn_out = multihead_attention(x, x, x, 4, self.n_a, m, reg, self.dropout_rate, mask=mask)
        #     attn_out = tf.reshape(out, (m,seqlen*n_a))

        attn_out_2 = keras.layers.LayerNormalization(
            epsilon=1e-6, name="decoder/attn_norm_2")(attn_out_1 + attn_out_2)

        # Feed-forward layer
        ffn = keras.Sequential(
            [
                keras.layers.Dense(
                    self.n_a, kernel_regularizer=reg, activation="relu"),
                keras.layers.Dense(self.n_a, kernel_regularizer=reg),
            ],
            name="decoder/ffn",
        )

        ffn_out = ffn(attn_out_2)
        ffn_out = keras.layers.Dropout(self.dropout_rate,
                                       name="decoder/ffn_dropout")(ffn_out)

        x = keras.layers.LayerNormalization(
            epsilon=1e-6, name="decoder/ffn_norm")(attn_out_2 + ffn_out)
        return x

    def create_model(self, vocab, mask=None):
        vocab_size = len(vocab)
        reg = keras.regularizers.l2(self.reg_factor)

        inpt = keras.layers.Input(shape=(self.seqlen,), name="input")
        out = keras.layers.Embedding(vocab_size,
                                     self.n_a,
                                     input_length=self.seqlen)(inpt)
        pos_enc = positional_encoding(self.seqlen, self.n_a)
        pos_emb = keras.layers.Embedding(
            input_dim=self.seqlen,
            output_dim=self.n_a,
            weights=[pos_enc],
            name="position_embedding",
        )(tf.range(start=0, limit=self.seqlen, delta=1))
        # encoder_out = out + pos_emb
        encoder_out = tf.math.add(out, pos_emb)
        mask = self.subsequent_mask(self.seqlen)
        for i in range(self.transformer_layers):
            encoder_out = self.transformer_encoder(encoder_out, i, reg, mask)
        # decoder_out = self.transformer_decoder(encoder_out, target_emb, reg, mask)
        out = keras.layers.Dense(self.ffdim,
                                 activation="relu",
                                 kernel_regularizer=reg,
                                 name="penult_dense")(encoder_out)
        out = keras.layers.Dense(vocab_size,
                                 activation="softmax",
                                 kernel_regularizer=reg,
                                 name="final_dense")(out)

        # masked_model = MaskedLanguageModel(inputs=inpt, outputs=masked_out)
        # return masked_model

        # model = keras.Model(inputs=inpt, outputs=out)
        reverse_token_map = {t: i for i, t in enumerate(vocab)}
        model = CustomMetricsModel(reverse_token_map, inputs=inpt, outputs=out)
        return model

    def logtop5(self, p, vocab):
        top5 = tf.math.top_k(p, k=5)
        top5 = [(vocab[tix], top5.values[ix].numpy())
                for (ix, tix) in enumerate(top5.indices)]
        logger.info(top5)

    def sample(self, model, tokens, vocab, reverse_token_map, temp=1):
        seqlen = self.seqlen
        vocab_size = len(vocab)
        token_ix = -1
        # start = np.random.randint(0, len(tokens) - self.seqlen)
        # inpt = tokens[start:start+self.seqlen]
        inpt = [" " for i in range(self.seqlen)]
        inpt[0] = "<START>"
        output = ""
        mintokens = 15
        maxtokens = 100
        i = 0
        while i < maxtokens and (i < mintokens
                                 or token_ix != reverse_token_map['<START>']):
            # x = np.zeros((1, seqlen))
            # logger.info(inpt)
            x = [get_ix_from_token(reverse_token_map, token) for token in inpt]
            x = np.asarray(x)
            x = x.reshape((1, seqlen))
            preds = model.predict(x, verbose=0)[0]
            preds = preds[min(i, self.seqlen - 1)]
            # topk = tf.math.top_k(preds, k=50)
            # topk_preds = keras.layers.Softmax()(topk.values/temp)
            # token_ix = np.random.choice(topk.indices, p=topk_preds)
            token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
            retries = 0
            while retries < 10 and token_ix == reverse_token_map["<UNK>"]:
                token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
                retries += 1
            new_token = vocab[token_ix]
            # logger.info(new_token)
            output += new_token
            output += " "
            if (i + 1 < len(inpt)):
                inpt[i + 1] = new_token
            else:
                inpt = inpt[1:] + [new_token]
            i += 1
        logger.info(output)
        return output

    def masked_sample(self, model, tokens, vocab, reverse_token_map):
        seqlen = self.seqlen
        vocab_size = len(vocab)
        token_ix = -1
        inpt = ["<MASK>" for i in range(self.seqlen)]
        inpt[0] = "<START>"
        output = ""
        mintokens = 15
        maxtokens = 100
        i = 1
        while i < maxtokens and (i < mintokens
                                 or token_ix != reverse_token_map['<START>']):
            maskix = min(i, self.seqlen - 1)
            # x = np.zeros((1, seqlen))
            x = [get_ix_from_token(reverse_token_map, token) for token in inpt]
            x = np.asarray(x)
            x = x.reshape((1, seqlen))
            preds = model.predict(x, verbose=0)[0][min(i, self.seqlen - 1)]
            token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
            while token_ix == reverse_token_map["<UNK>"]:
                token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
            new_token = vocab[token_ix]
            output += new_token
            inpt[maskix] = new_token
            if maskix == self.seqlen - 1:
                inpt = inpt[1:] + ["<MASK>"]
            i += 1
        return output
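
get_masked_datasets follows the standard BERT-style masking recipe. The self-contained numpy sketch below reproduces the same proportions on a toy batch (the token ids, shapes, and <MASK> index are invented): about 15% of non-pad positions become prediction targets; roughly 90% of those are replaced with the mask token, about a ninth of that 90% is further swapped for a random token, and the remaining targets are left unchanged, with loss_mask marking the targeted positions.

import numpy as np

rng = np.random.default_rng(0)
mask_token_ix = 99                                  # hypothetical <MASK> index
dataset = rng.integers(0, 50, size=(4, 10))         # toy batch of token-index sequences

inp_mask = rng.random(dataset.shape) < 0.15         # ~15% of positions are candidates
masked = np.copy(dataset)

to_mask = np.logical_and(inp_mask, rng.random(dataset.shape) < 0.9)
masked[to_mask] = mask_token_ix                     # ~90% of candidates -> mask token

to_random = np.logical_and(to_mask, rng.random(dataset.shape) < 1 / 9)
masked[to_random] = rng.integers(0, mask_token_ix, size=to_random.sum())  # ~10% -> random token

loss_mask = inp_mask.astype(int)                    # scale the loss to targeted positions only
y_labels = dataset                                  # labels are simply the original tokens
print(masked)
print(loss_mask)
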
Example #7
class WordLanguageModelEmbeddingsBuilder:
    def __init__(self, args):
        self.learning_rate = args.learningrate
        self.dropout_rate = args.dropoutrate
        self.reg_factor = args.regfactor
        self.n_a = args.hiddensize
        self.step = args.step
        self.seqlen = args.seqlength
        #self.tokenizer = nltk.RegexpTokenizer("\S+|\n+")
        # self.tokenizer = nltk.RegexpTokenizer("\,|\.|&gt|\¯\\\_\(\ツ\)\_\/\¯|\<\@\w+\>|\:\w+\:|\/gif|_|\"| |\w+\'\w+|\w+|\n")
        self.tokenizer = SlidingWindowTokenizer(args)

    def tokenize(self, data, freq_threshold=None):
        return self.tokenizer.tokenize(data)

    # def tokenize(self, data, freq_threshold=5):
    #     #tokens = " ".join(data).split(" ")
    #     tokens = self.tokenizer.tokenize("START ".join(data))
    #     token_counts = {}
    #     for t in tokens:
    #         if t not in token_counts.keys():
    #             token_counts[t] = 1
    #         else:
    #             token_counts[t] += 1
    #     freq_filtered = filter(lambda elem: elem[1] >= freq_threshold, token_counts.items())
    #     vocab = sorted([elem[0] for elem in list(freq_filtered)])
    #     #vocab = sorted(list(set(tokens)))
    #     vocab += ["<UNK>"]
    #     reverse_token_map = {t: i for i, t in enumerate(vocab)}
    #     return tokens, vocab, reverse_token_map

    def create_model(self, vocab):
        vocab_size = len(vocab)
        reg = regularizers.l2(self.reg_factor)
        # tf.keras.backend.set_floatx('float64')
        x = Input(shape=(self.seqlen,), name="input")
        out = Embedding(vocab_size, 100, input_length=self.seqlen)(x)
        out = LSTM(self.n_a, return_sequences=True, kernel_regularizer=reg, recurrent_regularizer=reg)(out)
        out = Dropout(self.dropout_rate)(out)
        out = LSTM(self.n_a, return_sequences=True, kernel_regularizer=reg, recurrent_regularizer=reg)(out)
        out = Dropout(self.dropout_rate)(out)
        out = Dense(self.n_a, activation='relu', kernel_regularizer=reg)(out)
        out = Dense(vocab_size, activation='softmax', kernel_regularizer=reg)(out)
        # model = keras.Model(inputs=x, outputs=out)
        reverse_token_map = {t: i for i, t in enumerate(vocab)}
        model = CustomMetricsModel(reverse_token_map, inputs=x, outputs=out)
        # opt = RMSprop(learning_rate=self.learning_rate, clipvalue=3)
        # model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=["accuracy"])
        return model

    def sample(self, model, tokens, vocab, reverse_token_map, temp=1):
        seqlen = self.seqlen
        vocab_size = len(vocab)
        token_ix = -1
        inpt = [" " for i in range(self.seqlen)]
        inpt[0] = "<START>"
        output = ""
        mintokens = 15
        maxtokens = 100
        i = 0
        while i < maxtokens and (i < mintokens or token_ix != reverse_token_map['<START>']):
            x = [get_ix_from_token(reverse_token_map, token) for token in inpt]
            x = np.asarray(x)
            x = x.reshape((1,seqlen))
            preds = model.predict(x, verbose=0)[0]
            preds = preds[min(i, self.seqlen - 1)]
            token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
            retries = 0
            while retries < 10 and token_ix == reverse_token_map["<UNK>"]:
                token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
                retries += 1
            new_token = vocab[token_ix]
            output += new_token
            output += " "
            if (i + 1 < len(inpt)):
                inpt[i + 1] = new_token
            else:
                inpt = inpt[1:] + [new_token]
            i += 1
        logger.info(output)
        return output

    # def sample(self, model, tokens, vocab, reverse_token_map):
    #     seqlen = self.seqlen
    #     vocab_size = len(vocab)
    #     token_ix = -1
    #     i = random.randint(0, len(tokens) - seqlen - 1)
    #     inpt = tokens[i:i + seqlen]
    #     output = ""
    #     for t in inpt:
    #         output += t
    #     output += "->"
    #     mintokens = 15
    #     maxtokens = 100
    #     i = 0
    #     while i < maxtokens and (i < mintokens or token_ix != reverse_token_map['\n']):
    #         x = np.zeros((1, seqlen))
    #         x[0] = [get_ix_from_token(reverse_token_map, token) for token in inpt]
    #         preds = model.predict(x, verbose=0)[0]
    #         token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
    #         new_token = vocab[token_ix]
    #         output += new_token
    #         inpt = inpt[1:] + [new_token]
    #         i+=1
    #     logger.info("\n" + output)
    #     return output

    # def get_input_sequences(self, tokens, reverse_token_map):
    #     seqs = []
    #     for i in range(0, len(tokens) - self.seqlen, self.step):
    #         last_ix = min(i + self.seqlen, len(tokens)-1)
    #         Xseq = [get_ix_from_token(reverse_token_map, token) for token in tokens[i:last_ix]]
    #         Yix = get_ix_from_token(reverse_token_map, tokens[last_ix])
    #         seqs.append((Xseq, Yix))
    #     return seqs

    def get_input_sequences(self, tokens, reverse_token_map):
        return self.tokenizer.get_input_sequences(tokens,reverse_token_map)

    def build_input_vectors(self, seqs, vocab, reverse_token_map):
        X = np.zeros((len(seqs), self.seqlen))
        Y = np.zeros((len(seqs), self.seqlen))

        for i, (Xseq, Yseq) in enumerate(seqs):
            X[i, :] = Xseq
            Y[i, :] = Yseq
        return X, Y, None
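
The commented-out lines in AttentionModelBuilder.sample hint at a top-k, temperature-scaled alternative to sampling from the full softmax row with np.random.choice. A self-contained numpy sketch of that idea is shown below; it mirrors the commented code's softmax over probabilities divided by the temperature, and the probability values, k, and temperature are toy numbers.

import numpy as np

preds = np.array([0.02, 0.50, 0.30, 0.10, 0.05, 0.03])   # one softmax row from the model
temp, k = 0.8, 3

top_ix = np.argsort(preds)[-k:]           # indices of the k most likely tokens
scaled = preds[top_ix] / temp             # temperature applied as in the commented code
topk_probs = np.exp(scaled - scaled.max())
topk_probs /= topk_probs.sum()            # softmax over the scaled top-k scores
token_ix = int(np.random.choice(top_ix, p=topk_probs))
print(token_ix)
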