def sample(self, model, tokens, vocab, reverse_token_map):
    seqlen = self.seqlen
    vocab_size = len(vocab)
    token_ix = -1
    # Start from a window of START tokens and fill it in as we generate.
    inpt = ["START" for i in range(self.seqlen)]
    output = ""
    mintokens = 15
    maxtokens = 100
    i = 0
    # Generate until the model emits START again, but at least mintokens
    # and at most maxtokens.
    while i < maxtokens and (i < mintokens or token_ix != reverse_token_map['START']):
        if self.embedding:
            # Embedding models take raw token indices.
            x = np.zeros((1, seqlen))
            x[0] = [get_ix_from_token(reverse_token_map, token) for token in inpt]
        else:
            # Otherwise one-hot encode each token.
            x = np.zeros((1, seqlen, vocab_size))
            x[0] = [
                token_to_oh(get_ix_from_token(reverse_token_map, token), vocab_size)
                for token in inpt
            ]
        # Sample from the distribution at the last filled position.
        preds = model.predict(x, verbose=0)[0][min(i, self.seqlen - 1)]
        token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
        # Resample until we get something other than <UNK>.
        while token_ix == reverse_token_map["<UNK>"]:
            token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
        new_token = vocab[token_ix]
        output += new_token
        # Fill the next slot while the window is warming up, then slide left.
        if i + 1 < len(inpt):
            inpt[i + 1] = new_token
        else:
            inpt = inpt[1:] + [new_token]
        i += 1
    return output
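# The helpers used throughout this file (get_ix_from_token, token_to_oh,
# char_padded) are defined elsewhere in the repo. A minimal sketch of the
# behavior the call sites appear to assume, for reference only:
import numpy as np

def get_ix_from_token(reverse_token_map, token):
    # Assumed behavior: map a token to its vocab index, falling back to <UNK>.
    return reverse_token_map.get(token, reverse_token_map["<UNK>"])

def token_to_oh(ix, vocab_size):
    # Assumed behavior: one-hot encode a vocab index.
    oh = np.zeros(vocab_size)
    oh[ix] = 1.0
    return oh

def char_padded(seq, pad_token, length):
    # Assumed behavior: truncate or right-pad a sequence to exactly `length` tokens.
    return list(seq[:length]) + [pad_token] * max(0, length - len(seq))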
def get_input_sequences(self, tokens, reverse_token_map, full=True, sliding_window=True):
    # Build (encoder_input, decoder_input, decoder_output) triples for
    # seq2seq training. `tokens` is a list of (context, target) pairs.
    seqs = []
    for context, target in tokens:
        # Decoder output: the target message, truncated and padded to seqlen.
        padded_sequence = char_padded(target[:self.seqlen], " ", self.seqlen)
        decoder_output = [
            get_ix_from_token(reverse_token_map, token) for token in padded_sequence
        ]
        # Decoder input: the same sequence shifted right by one, led by START
        # (teacher forcing).
        decoder_input = [get_ix_from_token(reverse_token_map, "START")] + decoder_output[:-1]
        # Encoder input: the context, truncated and padded to context_len.
        encoder_input = char_padded(context[:self.context_len], " ", self.context_len)
        encoder_input = [
            get_ix_from_token(reverse_token_map, token) for token in encoder_input
        ]
        seqs.append((encoder_input, decoder_input, decoder_output))
    return seqs
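# Hypothetical helper (not in the repo) showing how the triples above would
# be stacked for a teacher-forced Keras encoder-decoder, i.e. something like
# model.fit([enc, dec_in], dec_out, ...); the name is illustrative.
def seq2seq_arrays(seqs):
    enc = np.array([e for e, _, _ in seqs])
    dec_in = np.array([d for _, d, _ in seqs])
    dec_out = np.array([d for _, _, d in seqs])
    return enc, dec_in, dec_out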
def get_input_sequences(self, tokens, reverse_token_map):
    seqs = []
    for i in range(0, len(tokens) - self.seqlen, self.step):
        # x0 = "<START>" if i == 0 else tokens[i - 1]
        # last_ix = min(i + self.seqlen, len(tokens) - 1)
        # padded_sequence = char_padded(tokens[i:last_ix], " ", self.seqlen)
        # X is the window at i; Y is the same window shifted one token ahead.
        X = self.slice_padded_sequence(tokens, i)
        Y = self.slice_padded_sequence(tokens, i + 1)
        Xseq = [get_ix_from_token(reverse_token_map, token) for token in X]
        Yseq = [get_ix_from_token(reverse_token_map, token) for token in Y]
        # Yseq = [get_ix_from_token(reverse_token_map, token) for token in padded_sequence]
        # Xseq = [get_ix_from_token(reverse_token_map, x0)] + Yseq[:-1]
        seqs.append((Xseq, Yseq))
    return seqs
def sliding_window_input_sequences(self, tokens, reverse_token_map):
    seqs = []
    for i in range(0, len(tokens) - self.seqlen, self.step):
        # The input is the target shifted right by one token: the first
        # window is led by START, later windows by the preceding token.
        x0 = "START" if i == 0 else tokens[i - 1]
        last_ix = min(i + self.seqlen, len(tokens) - 1)
        padded_sequence = char_padded(tokens[i:last_ix], " ", self.seqlen)
        Yseq = [
            get_ix_from_token(reverse_token_map, token) for token in padded_sequence
        ]
        Xseq = [get_ix_from_token(reverse_token_map, x0)] + Yseq[:-1]
        seqs.append((Xseq, Yseq))
    return seqs
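# Hypothetical helper (not in the repo): stack the (Xseq, Yseq) pairs from
# the sliding-window builders above into arrays for training, e.g.
# model.fit(X, Y, ...) with sparse categorical targets.
def seqs_to_arrays(seqs):
    X = np.array([x for x, _ in seqs])
    Y = np.array([y for _, y in seqs])
    return X, Y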
def sample(self, model, tokens, vocab, reverse_token_map):
    seqlen = self.seqlen
    vocab_size = len(vocab)
    token_ix = -1
    # Seed the generator with a random window of real tokens from the corpus.
    i = random.randint(0, len(tokens) - seqlen - 1)
    inpt = tokens[i:i + seqlen]
    output = "".join(inpt) + "->"
    mintokens = 15
    maxtokens = 100
    i = 0
    while i < maxtokens and (i < mintokens or token_ix != reverse_token_map['\n']):
        x = np.zeros((1, seqlen, vocab_size))
        x[0] = [
            token_to_oh(get_ix_from_token(reverse_token_map, token), vocab_size)
            for token in inpt
        ]
        preds = model.predict(x, verbose=0)[0]
        # The seed window is always full here, so the next token is always
        # predicted at the final position.
        preds = preds[seqlen - 1]
        token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
        new_token = vocab[token_ix]
        output += new_token
        inpt = inpt[1:] + [new_token]
        i += 1
    logger.info("\n" + output)
    return output
def masked_sample(self, model, tokens, vocab, reverse_token_map):
    seqlen = self.seqlen
    vocab_size = len(vocab)
    token_ix = -1
    # Start from an all-<MASK> window with <START> in the first slot.
    inpt = ["<MASK>" for i in range(self.seqlen)]
    inpt[0] = "<START>"
    output = ""
    mintokens = 15
    maxtokens = 100
    i = 1
    while i < maxtokens and (i < mintokens or token_ix != reverse_token_map['<START>']):
        maskix = min(i, self.seqlen - 1)
        x = [get_ix_from_token(reverse_token_map, token) for token in inpt]
        x = np.asarray(x)
        x = x.reshape((1, seqlen))
        preds = model.predict(x, verbose=0)[0][maskix]
        token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
        # Resample until we get something other than <UNK>.
        while token_ix == reverse_token_map["<UNK>"]:
            token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
        new_token = vocab[token_ix]
        output += new_token
        # Replace the mask we just predicted; once the window is full,
        # slide it left and mask the final slot again.
        inpt[maskix] = new_token
        if maskix == self.seqlen - 1:
            inpt = inpt[1:] + ["<MASK>"]
        i += 1
    return output
def sample(self, model, tokens, vocab, reverse_token_map, temp=1):
    seqlen = self.seqlen
    vocab_size = len(vocab)
    token_ix = -1
    inpt = [" " for i in range(self.seqlen)]
    inpt[0] = "<START>"
    output = ""
    mintokens = 15
    maxtokens = 100
    i = 0
    while i < maxtokens and (i < mintokens or token_ix != reverse_token_map['<START>']):
        x = [get_ix_from_token(reverse_token_map, token) for token in inpt]
        x = np.asarray(x)
        x = x.reshape((1, seqlen))
        preds = model.predict(x, verbose=0)[0]
        preds = preds[min(i, self.seqlen - 1)]
        token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
        # Resample <UNK> at most 10 times before accepting whatever we get.
        retries = 0
        while retries < 10 and token_ix == reverse_token_map["<UNK>"]:
            token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
            retries += 1
        new_token = vocab[token_ix]
        output += new_token
        output += " "
        if i + 1 < len(inpt):
            inpt[i + 1] = new_token
        else:
            inpt = inpt[1:] + [new_token]
        i += 1
    logger.info(output)
    return output
def tokens_to_sequences(self, tokens):
    if len(tokens) < self.seqlen:
        tokens = char_padded(tokens, "<PAD>", self.seqlen)
    Xseqs = []
    Yseqs = []
    pad_masks = []
    for i in range(0, len(tokens) - self.seqlen + 1, self.step):
        x0 = "<START>" if i == 0 else tokens[i - 1]
        Yseq = [
            get_ix_from_token(self.reverse_token_map, token)
            for token in tokens[i:i + self.seqlen]
        ]
        Xseq = [get_ix_from_token(self.reverse_token_map, x0)] + Yseq[:-1]
        Yseq = np.array(Yseq)
        Xseq = np.array(Xseq)
        # pad_mask = (Yseq != get_ix_from_token(self.reverse_token_map, "<PAD>")).astype(np.int64)
        # pad_masks.append(pad_mask)
        Yseqs.append(Yseq)
        Xseqs.append(Xseq)
    # Yseqs = tf.data.Dataset.from_tensor_slices(Yseqs)
    # Xseqs = tf.data.Dataset.from_tensor_slices(Xseqs, Yseqs)
    # seqs = tf.data.Dataset.from_tensor_slices((Xseqs, Yseqs))
    return Xseqs, Yseqs
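# A sketch of the tf.data pipeline the commented-out lines above gesture at;
# this wrapper is hypothetical, not part of the original class, and assumes
# the module-level numpy/tensorflow imports.
def sequences_to_dataset(self, tokens, batch_size=64):
    Xseqs, Yseqs = self.tokens_to_sequences(tokens)
    ds = tf.data.Dataset.from_tensor_slices((np.array(Xseqs), np.array(Yseqs)))
    return ds.shuffle(len(Xseqs)).batch(batch_size)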
def get_input_sequences(self, tokens, reverse_token_map, full=True, sliding_window=True):
    if full:
        return self.get_full_input_sequences(tokens, reverse_token_map)
    if sliding_window:
        return self.sliding_window_input_sequences(tokens, reverse_token_map)
    # Fallback: treat each line as an independent padded sequence, with the
    # input shifted right by one token.
    seqs = []
    for line in tokens:
        padded_sequence = char_padded(line[:self.seqlen], " ", self.seqlen)
        Yseq = [
            get_ix_from_token(reverse_token_map, token) for token in padded_sequence
        ]
        Xseq = [get_ix_from_token(reverse_token_map, " ")] + Yseq[:-1]
        seqs.append((Xseq, Yseq))
    return seqs
def get_full_input_sequences(self, tokens, reverse_token_map):
    # Pack messages into full-length training sequences: sort by length,
    # then greedily combine long messages (taken from the right end) with
    # short ones (taken from the left) so each sequence wastes as little
    # padding as possible. Each packed message contributes its own
    # START-shifted input.
    tokens = sorted(tokens, key=lambda a: len(a))
    left = 0
    right = len(tokens) - 1
    seqs = []
    Xseq = []
    Yseq = []
    while left < right:
        if len(Yseq) + len(tokens[right][:self.seqlen]) <= self.seqlen:
            newSeq = tokens[right][:self.seqlen]
            Yseq += newSeq
            Xseq += ["START"] + newSeq[:-1]
            right -= 1
        if len(Yseq) + len(tokens[left][:self.seqlen]) <= self.seqlen:
            newSeq = tokens[left][:self.seqlen]
            Yseq += newSeq
            Xseq += ["START"] + newSeq[:-1]
            left += 1
        else:
            # Nothing else fits: pad out the current sequence and start fresh.
            paddedX = [
                get_ix_from_token(reverse_token_map, token)
                for token in char_padded(Xseq, " ", self.seqlen)
            ]
            paddedY = [
                get_ix_from_token(reverse_token_map, token)
                for token in char_padded(Yseq, " ", self.seqlen)
            ]
            seqs.append((paddedX, paddedY))
            Yseq = []
            Xseq = []
    # Flush the final, partially filled sequence.
    paddedX = [
        get_ix_from_token(reverse_token_map, token)
        for token in char_padded(Xseq, " ", self.seqlen)
    ]
    paddedY = [
        get_ix_from_token(reverse_token_map, token)
        for token in char_padded(Yseq, " ", self.seqlen)
    ]
    seqs.append((paddedX, paddedY))
    return seqs
def sample(self, model, tokens, vocab, reverse_token_map, temp=1):
    seqlen = self.seqlen
    vocab_size = len(vocab)
    token_ix = -1
    # start = np.random.randint(0, len(tokens) - self.seqlen)
    # inpt = tokens[start:start+self.seqlen]
    inpt = [" " for i in range(self.seqlen)]
    inpt[0] = "<START>"
    output = ""
    mintokens = 15
    maxtokens = 100
    i = 0
    while i < maxtokens and (i < mintokens or token_ix != reverse_token_map['<START>']):
        x = [get_ix_from_token(reverse_token_map, token) for token in inpt]
        x = np.asarray(x)
        x = x.reshape((1, seqlen))
        preds = model.predict(x, verbose=0)[0]
        preds = preds[min(i, self.seqlen - 1)]
        # topk = tf.math.top_k(preds, k=50)
        # topk_preds = keras.layers.Softmax()(topk.values/temp)
        # token_ix = np.random.choice(topk.indices, p=topk_preds)
        token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
        retries = 0
        while retries < 10 and token_ix == reverse_token_map["<UNK>"]:
            token_ix = np.random.choice(range(vocab_size), p=preds.ravel())
            retries += 1
        new_token = vocab[token_ix]
        output += new_token
        output += " "
        if i + 1 < len(inpt):
            inpt[i + 1] = new_token
        else:
            inpt = inpt[1:] + [new_token]
        i += 1
    logger.info(output)
    return output
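# A plain-numpy sketch of the top-k / temperature sampling the commented-out
# lines in sample() above gesture at; illustrative, not the repo's code.
def sample_top_k(preds, k=50, temp=1.0):
    preds = np.asarray(preds, dtype=np.float64).ravel()
    top_ixs = np.argsort(preds)[-k:]                 # k most likely token indices
    logits = np.log(preds[top_ixs] + 1e-12) / temp   # temperature-scaled log-probs
    probs = np.exp(logits - logits.max())            # numerically stable softmax
    probs /= probs.sum()
    return np.random.choice(top_ixs, p=probs)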
def sample(self, model, tokens, vocab, reverse_token_map):
    seqlen = self.seqlen
    vocab_size = len(vocab)
    tf.keras.backend.set_floatx('float64')
    # Build the separate encoder/decoder inference models on first use.
    if not hasattr(self, "encoder_model") or not hasattr(self, "decoder_model"):
        self.build_sample_model(model)
    # Condition on a randomly chosen (context, message) pair.
    i = random.randint(0, len(tokens) - 1)
    context = tokens[i][0]
    actual_message = tokens[i][1]
    encoder_input = [
        get_ix_from_token(reverse_token_map, token)
        for token in context[:self.context_len]
    ]
    encoder_input = [token_to_oh(ix, len(vocab)) for ix in encoder_input]
    encoder_input = np.array([encoder_input])
    encoder_state = self.encoder_model.predict(encoder_input)
    inpt = ["START" for i in range(self.seqlen)]
    output = ""
    token_ix = -1
    mintokens = 15
    maxtokens = 100
    i = 0
    while i < maxtokens and (i < mintokens or token_ix != reverse_token_map['\n']):
        if self.embedding:
            x = np.zeros((1, seqlen))
            x[0] = [get_ix_from_token(reverse_token_map, token) for token in inpt]
        else:
            x = np.zeros((1, seqlen, vocab_size))
            x[0] = [
                token_to_oh(get_ix_from_token(reverse_token_map, token), vocab_size)
                for token in inpt
            ]
        preds = self.decoder_model.predict([x] + encoder_state, verbose=0)[0]
        preds = preds[0][min(i, self.seqlen - 1)]
        probs = preds.ravel()
        token_ix = np.random.choice(range(vocab_size), p=probs)
        # Resample on <UNK> (up to 10 tries) or on a space that would
        # immediately follow another space.
        retries = 0
        while (retries < 10 and token_ix == reverse_token_map["<UNK>"]) or (
                token_ix == reverse_token_map[" "] and output.endswith(" ")):
            token_ix = np.random.choice(range(vocab_size), p=probs)
            retries += 1
        new_token = vocab[token_ix]
        output += new_token
        if i + 1 < len(inpt):
            inpt[i + 1] = new_token
        else:
            inpt = inpt[1:] + [new_token]
        i += 1
    print(context)
    print(output)
    print(actual_message)
    print(len(output))
    return output