import numpy as np
import tensorflow as tf
from nltk.tokenize import word_tokenize
# letters2vec(token, vocab) is the project's character-level word encoder
# (a 7 * len(vocab)-dimensional vector); its import path is project-specific.


def sample(self, sess, vocab, prime=' '):
    tokens = word_tokenize(prime)
    targets = np.zeros((len(tokens), self.args.w2v_size))  # TODO: remove punctuation?
    word = np.zeros((len(tokens), self.args.letter_size))
    seq_l = self.args.seq_length
    for i, token in enumerate(tokens):
        x = letters2vec(token, vocab)
        word[i] = x
        # Run the model once a full window of seq_length tokens is available,
        # and again at the final token.
        if (((i % (seq_l - 1) == 0) and (i != 0)) or
                (i == (len(tokens) - 1))) and (i > seq_l - 2):
            fix_words = word[-seq_l:].reshape((1, seq_l, self.args.letter_size))
            feed = {self.input_data: fix_words}
            [target] = sess.run([self.target], feed)
            targets[i - (seq_l - 1):i + 1] = np.squeeze(target)
        # Short prime (fewer tokens than seq_length): pad with zero
        # letter-vectors and return the single window's vectors directly.
        if (i == (len(tokens) - 1)) and (len(tokens) < seq_l):
            word = np.append(
                word, np.zeros((seq_l - len(tokens), self.args.letter_size)))
            fix_words = word.reshape((1, seq_l, self.args.letter_size))
            feed = {self.input_data: fix_words}
            [target] = sess.run([self.target], feed)
            return np.squeeze(target)
    return targets
def sample(self, sess, vocab, prime_batch=' ', batch_size=1, pad=128):
    self.initial_state = tf.convert_to_tensor(
        self.cell.zero_state(batch_size, tf.float32))
    max_seq = pad
    # 7 * len(vocab) is the letters2vec encoding size
    data = np.zeros((batch_size, max_seq, 7 * len(vocab)))
    for i, _sent in enumerate(prime_batch):
        sent = word_tokenize(_sent)
        if len(sent) > max_seq:
            sent = sent[:max_seq]
        sent_vecs = []
        for t in sent:
            # flatten to (7 * len(vocab),) so the row assignment below broadcasts
            x = letters2vec(t, vocab).reshape(-1)
            sent_vecs.append(x)
        data[i, :len(sent_vecs)] = sent_vecs
    # (batch, time, features) -> (time, batch, features) to step token by token
    data = data.transpose([1, 0, 2])
    state_fw = self.initial_state.eval()
    target_vectors = []
    for word_batch in data:
        feed = {
            self.input_data: np.expand_dims(word_batch, 1),
            self.initial_state: state_fw,
            self.change: np.zeros((batch_size,))
        }
        [last_state, word_vec] = sess.run([self.final_state, self.target], feed)
        state_fw = last_state
        target_vectors.append(word_vec)
    # back to (batch, time, vector_dim)
    target_vectors = np.array(target_vectors).transpose([1, 0, 2])
    return target_vectors
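
# Usage sketch for the batched sampler above. This is a hedged illustration:
# `model`, `vocab`, and the checkpoint path are hypothetical names, not part of
# the code in this file; only sample()'s own signature comes from the function
# above.
#
# with tf.Session() as sess:
#     tf.train.Saver().restore(sess, "path/to/checkpoint")  # hypothetical path
#     sentences = ["the quick brown fox", "hello world"]
#     vectors = model.sample(sess, vocab, prime_batch=sentences,
#                            batch_size=len(sentences), pad=128)
#     # vectors is (batch_size, pad, vector_dim): one robust word vector per
#     # token position, zero-padded past the end of each sentence.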
def valid_run(self, sess, vocab, prime):
    tokens = word_tokenize(prime)
    valids = []  # np.zeros((len(tokens), self.args.w2v_size))
    word = np.zeros((len(tokens), self.args.letter_size))
    seq_l = self.args.seq_length
    for i, token in enumerate(tokens):
        x = letters2vec(token, vocab)
        word[i] = x
        # Same windowing as sample(), but fed through the validation graph.
        if (((i % (seq_l - 1) == 0) and (i != 0)) or
                (i == (len(tokens) - 1))) and (i > seq_l - 2):
            fix_words = word[-seq_l:].reshape((1, seq_l, self.args.letter_size))
            feed = {self.valid_input_data: fix_words}
            [target] = sess.run([self.valid_target], feed)
            valids.append(np.squeeze(target))
        # Short prime: pad with zero letter-vectors up to seq_length.
        if (i == (len(tokens) - 1)) and (len(tokens) < seq_l):
            word = np.append(
                word, np.zeros((seq_l - len(tokens), self.args.letter_size)))
            fix_words = word.reshape((1, seq_l, self.args.letter_size))
            feed = {self.valid_input_data: fix_words}
            [target] = sess.run([self.valid_target], feed)
            return np.squeeze(target)
    return valids
def sample(self, sess, vocab, prime=' '):
    initial_state_fw = []
    initial_state_bw = []
    for (cell_fw, cell_bw) in zip(self.cells_fw, self.cells_bw):
        initial_state_fw.append(tf.convert_to_tensor(cell_fw.zero_state(1, tf.float32)))
        initial_state_bw.append(tf.convert_to_tensor(cell_bw.zero_state(1, tf.float32)))
    self.initial_state_bw = initial_state_bw
    self.initial_state_fw = initial_state_fw
    state_fw = np.array([initial_state_fw[0].eval(), initial_state_fw[1].eval()])
    state_bw = np.array([initial_state_bw[0].eval(), initial_state_bw[1].eval()])
    tokens = word_tokenize(prime)
    targets = []
    for token in tokens:
        x = letters2vec(token, vocab).reshape((1, 1, -1))
        feed = {self.input_data: x,
                self.initial_state_fw[0]: state_fw[0],
                self.initial_state_fw[1]: state_fw[1],
                self.initial_state_bw[0]: state_bw[0],
                self.initial_state_bw[1]: state_bw[1],
                self.change: np.zeros((1,))}
        [last_state, target] = sess.run([self.final_state, self.target], feed)
        state_fw = last_state[0]
        state_bw = last_state[1]
        targets.append(np.squeeze(target))
    return targets
def valid_run(self, sess, vocab, prime):
    valid_initial_state_fw = []
    valid_initial_state_bw = []
    for (cell_fw, cell_bw) in zip(self.cells_fw, self.cells_bw):
        valid_initial_state_fw.append(tf.convert_to_tensor(cell_fw.zero_state(1, tf.float32)))
        valid_initial_state_bw.append(tf.convert_to_tensor(cell_bw.zero_state(1, tf.float32)))
    self.valid_initial_state_bw = valid_initial_state_bw
    self.valid_initial_state_fw = valid_initial_state_fw
    state_fw = np.array([valid_initial_state_fw[0].eval(), valid_initial_state_fw[1].eval()])
    state_bw = np.array([valid_initial_state_bw[0].eval(), valid_initial_state_bw[1].eval()])
    tokens = word_tokenize(prime)
    targets = []
    for token in tokens:
        x = letters2vec(token, vocab).reshape((1, 1, -1))
        feed = {self.valid_data: x,
                self.valid_initial_state_fw[0]: state_fw[0],
                self.valid_initial_state_fw[1]: state_fw[1],
                self.valid_initial_state_bw[0]: state_bw[0],
                self.valid_initial_state_bw[1]: state_bw[1]}
        [last_state, target] = sess.run([self.valid_state, self.valid_vector], feed)
        state_fw = last_state[0]
        state_bw = last_state[1]
        targets.append(np.squeeze(target))
    return targets
def sample(self, sess, vocab, prime_batch, batch_size=1, pad=128):
    """
    :param sess: tf session
    :param vocab: char vocabulary
    :param prime_batch: list of strings
    :return: sequence of robust word vectors
    """
    self.initial_state_fw = tf.convert_to_tensor(self.cell_fw.zero_state(batch_size, tf.float32))
    self.initial_state_bw = tf.convert_to_tensor(self.cell_bw.zero_state(batch_size, tf.float32))
    max_seq = pad
    # 7 * len(vocab) is the letters2vec encoding size
    data = np.zeros((batch_size, max_seq, 7 * len(vocab)))
    for i, _sent in enumerate(prime_batch):
        sent = word_tokenize(_sent)
        if len(sent) > max_seq:
            sent = sent[:max_seq]
        sent_vecs = []
        for t in sent:
            # flatten to (7 * len(vocab),) so the row assignment below broadcasts
            x = letters2vec(t, vocab).reshape(-1)
            sent_vecs.append(x)
        data[i, :len(sent_vecs)] = sent_vecs
    feed = {
        self.input_data: data,
        self.initial_state_fw: self.initial_state_fw.eval(),
        self.initial_state_bw: self.initial_state_bw.eval()
    }
    target_vectors = sess.run(self.target, feed)
    return target_vectors
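
# Usage sketch for the bidirectional batched sampler above (hedged: `model` and
# `vocab` are assumed to exist already and are illustrative names). Unlike the
# step-by-step samplers, the whole padded batch is pushed through the graph in
# a single sess.run call.
#
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())  # or restore a checkpoint
#     vectors = model.sample(sess, vocab,
#                            prime_batch=["a small test sentence ."],
#                            batch_size=1, pad=128)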
def _preprocess(self, text):
    tokens_vecs = np.zeros((self.max_text_length, 7 * len(self.vocab)))
    tokens = word_tokenize(text)
    if len(tokens) > self.max_text_length:
        tokens = tokens[:self.max_text_length]
    for i, t in enumerate(tokens):
        x = letters2vec(t, self.vocab).reshape((1, 1, -1))
        tokens_vecs[i] = x
    return tokens_vecs
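
# Minimal sketch of how _preprocess is typically called (hedged: `encoder`
# stands in for whatever object defines max_text_length and vocab; it is not
# defined in this file).
#
# matrix = encoder._preprocess("noisy user input text")
# # matrix.shape == (encoder.max_text_length, 7 * len(encoder.vocab));
# # rows past the last token remain zero (padding).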
def valid_run(self, sess, vocab, prime):
    state = self.cell.zero_state(1, tf.float32).eval()
    tokens = word_tokenize(prime)
    targets = []
    for token in tokens:
        x = letters2vec(token, vocab).reshape((1, 1, -1))
        feed = {
            self.valid_data: x,
            self.valid_initial_state: state,
        }
        [state, target] = sess.run([self.valid_state, self.valid_vector], feed)
        targets.append(np.squeeze(target))
    return targets
def valid_run(self, sess, vocab, prime):
    self.valid_initial_state_fw = tf.convert_to_tensor(self.cell_fw.zero_state(1, tf.float32))
    self.valid_initial_state_bw = tf.convert_to_tensor(self.cell_bw.zero_state(1, tf.float32))
    state_fw = self.valid_initial_state_fw.eval()
    state_bw = self.valid_initial_state_bw.eval()
    tokens = word_tokenize(prime)
    targets = []
    for token in tokens:
        x = letters2vec(token, vocab).reshape((1, 1, -1))
        feed = {self.valid_data: x,
                self.valid_initial_state_fw: state_fw,
                self.valid_initial_state_bw: state_bw}
        [last_state, target] = sess.run([self.valid_state, self.valid_vector], feed)
        state_fw = last_state[0]
        state_bw = last_state[1]
        targets.append(np.squeeze(target))
    return targets