Example 1
    def sample(self, sess, vocab, prime=' '):
        tokens = word_tokenize(prime)
        targets = np.zeros(
            (len(tokens), self.args.w2v_size))  # TODO: remove punctuation?
        word = np.zeros((len(tokens), self.args.letter_size))
        seq_l = self.args.seq_length
        for i, token in enumerate(tokens):
            x = letters2vec(token, vocab)
            word[i] = x

            # Once a full seq_length-token window is available (or the last
            # token is reached), embed the most recent seq_length tokens.
            if (((i % (seq_l - 1) == 0) and (i != 0)) or
                (i == (len(tokens) - 1))) and (i > seq_l - 2):
                fix_words = word[-seq_l:].reshape(
                    (1, seq_l, self.args.letter_size))

                feed = {
                    self.input_data: fix_words,
                }
                [target] = sess.run([self.target], feed)
                targets[i - (seq_l - 1):i + 1] = np.squeeze(target)
            # Prompts shorter than seq_length are zero-padded to one full window;
            # np.append flattens, and the reshape below restores the window shape.
            if (i == (len(tokens) - 1)) and (len(tokens) < seq_l):
                word = np.append(
                    word, np.zeros(
                        (seq_l - len(tokens), self.args.letter_size)))
                fix_words = word.reshape((1, seq_l, self.args.letter_size))
                feed = {
                    self.input_data: fix_words,
                }
                [target] = sess.run([self.target], feed)
                return np.squeeze(target)

        return targets
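The trigger condition in the loop above is terse: it fires every seq_length - 1 tokens once a full window is available, and again at the final token, each time writing a seq_length-long slice of `targets`. A self-contained sketch of just that index arithmetic (plain Python, no model or session involved):

seq_l = 4
n_tokens = 10

for i in range(n_tokens):
    if (((i % (seq_l - 1) == 0) and (i != 0)) or
            (i == (n_tokens - 1))) and (i > seq_l - 2):
        # rows of `targets` that this step would fill
        print(i, "->", list(range(i - (seq_l - 1), i + 1)))

# Output:
# 3 -> [0, 1, 2, 3]
# 6 -> [3, 4, 5, 6]
# 9 -> [6, 7, 8, 9]

Adjacent windows overlap by one row, so boundary positions are simply overwritten by the next window.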
Example 2
    def sample(self, sess, vocab, prime_batch=' ', batch_size=1, pad=128):
        self.initial_state = tf.convert_to_tensor(
            self.cell.zero_state(batch_size, tf.float32))
        max_seq = pad
        data = np.zeros(
            (batch_size, max_seq,
             7 * len(vocab)))  # 7 * len(vocab) is the letters2vec encoding size

        for i, _sent in enumerate(prime_batch):
            sent = word_tokenize(_sent)
            if len(sent) > max_seq:
                sent = sent[:max_seq]
            sent_vecs = []
            for t in sent:
                x = letters2vec(t, vocab).reshape((1, 1, -1))
                sent_vecs.append(x)

            data[i, :len(sent_vecs)] = sent_vecs

        # Reorder to time-major layout: (max_seq, batch_size, features).
        data = data.transpose([1, 0, 2])
        state_fw = self.initial_state.eval()
        target_vectors = []

        for word_batch in data:
            feed = {
                self.input_data: np.expand_dims(word_batch, 1),
                self.initial_state: state_fw,
                self.change: np.zeros((batch_size, ))
            }
            [last_state, word_vec] = sess.run([self.final_state, self.target],
                                              feed)
            state_fw = last_state
            target_vectors.append(word_vec)
        # Rearrange the per-step outputs back to batch-major layout.
        target_vectors = np.array(target_vectors).transpose([1, 0, 2])
        return target_vectors
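The `transpose([1, 0, 2])` above switches the padded batch from batch-major `(batch_size, max_seq, features)` to time-major `(max_seq, batch_size, features)`, so the loop can feed one word position for the whole batch per step while threading the recurrent state forward by hand. A minimal illustration of just that reordering:

import numpy as np

batch_size, max_seq, feat = 2, 3, 4
data = np.arange(batch_size * max_seq * feat, dtype=float).reshape(
    (batch_size, max_seq, feat))

time_major = data.transpose([1, 0, 2])   # shape (max_seq, batch_size, feat)
for word_batch in time_major:            # one word position across the whole batch
    assert word_batch.shape == (batch_size, feat)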
Example 3
    def valid_run(self, sess, vocab, prime):
        tokens = word_tokenize(prime)
        valids = []  # np.zeros((len(tokens), self.args.w2v_size))
        word = np.zeros((len(tokens), self.args.letter_size))
        seq_l = self.args.seq_length
        for i, token in enumerate(tokens):
            x = letters2vec(token, vocab)
            word[i] = x

            if (((i % (seq_l - 1) == 0) and (i != 0)) or
                (i == (len(tokens) - 1))) and (i > seq_l - 2):
                fix_words = word[-seq_l:].reshape(
                    (1, seq_l, self.args.letter_size))

                feed = {
                    self.valid_input_data: fix_words,
                }
                [target] = sess.run([self.valid_target], feed)
                valids.append(np.squeeze(target))
            if (i == (len(tokens) - 1)) and (len(tokens) < seq_l):
                word = np.append(
                    word, np.zeros(
                        (seq_l - len(tokens), self.args.letter_size)))
                fix_words = word.reshape((1, seq_l, self.args.letter_size))
                feed = {
                    self.valid_input_data: fix_words,
                }
                [target] = sess.run([self.valid_target], feed)
                return np.squeeze(target)
        return valids
Example 4
    def sample(self, sess, vocab, prime=' '):
        initial_state_fw = []
        initial_state_bw = []

        # Zero initial states for every layer of the forward and backward stacks.
        for (cell_fw, cell_bw) in zip(self.cells_fw, self.cells_bw):
            initial_state_fw.append(tf.convert_to_tensor(cell_fw.zero_state(1, tf.float32)))
            initial_state_bw.append(tf.convert_to_tensor(cell_bw.zero_state(1, tf.float32)))

        self.initial_state_bw = initial_state_bw
        self.initial_state_fw = initial_state_fw
        # Evaluated zero states; the indexing assumes exactly two stacked layers.
        state_fw = np.array([initial_state_fw[0].eval(), initial_state_fw[1].eval()])
        state_bw = np.array([initial_state_bw[0].eval(), initial_state_bw[1].eval()])

        tokens = word_tokenize(prime)
        targets = []
        for token in tokens:
            x = letters2vec(token, vocab).reshape((1, 1, -1))
            feed = {self.input_data: x,
                    self.initial_state_fw[0]: state_fw[0],
                    self.initial_state_fw[1]: state_fw[1],
                    self.initial_state_bw[0]: state_bw[0],
                    self.initial_state_bw[1]: state_bw[1],
                    self.change: np.zeros((1,))
                    }

            [last_state, target] = sess.run([self.final_state, self.target], feed)
            state_fw = last_state[0]
            state_bw = last_state[1]
            targets.append(np.squeeze(target))
        return targets
Example 5
    def valid_run(self, sess, vocab, prime):
        valid_initial_state_fw = []
        valid_initial_state_bw = []

        for (cell_fw, cell_bw) in zip(self.cells_fw, self.cells_bw):
            valid_initial_state_fw.append(tf.convert_to_tensor(cell_fw.zero_state(1, tf.float32)))
            valid_initial_state_bw.append(tf.convert_to_tensor(cell_bw.zero_state(1, tf.float32)))

        self.valid_initial_state_bw = valid_initial_state_bw
        self.valid_initial_state_fw = valid_initial_state_fw
        state_fw = np.array([valid_initial_state_fw[0].eval(), valid_initial_state_fw[1].eval()])
        state_bw = np.array([valid_initial_state_bw[0].eval(), valid_initial_state_bw[1].eval()])

        tokens = word_tokenize(prime)
        targets = []
        for token in tokens:
            x = letters2vec(token, vocab).reshape((1, 1, -1))
            feed = {self.valid_data: x,
                    self.valid_initial_state_fw[0]: state_fw[0],
                    self.valid_initial_state_fw[1]: state_fw[1],
                    self.valid_initial_state_bw[0]: state_bw[0],
                    self.valid_initial_state_bw[1]: state_bw[1],
                    }
            [last_state, target] = sess.run([self.valid_state, self.valid_vector], feed)
            state_fw = last_state[0]
            state_bw = last_state[1]
            targets.append(np.squeeze(target))
        return targets
Example 6
    def sample(self, sess, vocab, prime_batch, batch_size=1, pad=128):
        """
        :param sess: tf session
        :param vocab: char vocabulary
        :param prime_batch: list of strings

        :return: sequence of robust word vectors
        """
        self.initial_state_fw = tf.convert_to_tensor(self.cell_fw.zero_state(batch_size, tf.float32))
        self.initial_state_bw = tf.convert_to_tensor(self.cell_bw.zero_state(batch_size, tf.float32))

        max_seq = pad
        data = np.zeros((batch_size, max_seq, 7 * len(vocab)))
        for i, _sent in enumerate(prime_batch):
            sent = word_tokenize(_sent)
            if len(sent) > max_seq:
                sent = sent[:max_seq]
            sent_vecs = []
            for t in sent:
                x = letters2vec(t, vocab).reshape((1, 1, -1))
                sent_vecs.append(x)

            data[i, :len(sent_vecs)] = sent_vecs

        feed = {
            self.input_data: data,
            self.initial_state_fw: self.initial_state_fw.eval(),
            self.initial_state_bw: self.initial_state_bw.eval()
        }
        target_vectors = sess.run(self.target, feed)
        return target_vectors
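This batched bidirectional variant runs the whole padded batch in a single sess.run call, so the returned array still contains rows for the zero-padded positions. Assuming the output is batch-major, (batch_size, pad, w2v_size) (an assumption, not stated in the source), a small hypothetical helper can trim each sentence back to its true token count:

from nltk.tokenize import word_tokenize

def unpad(target_vectors, prime_batch, pad=128):
    # Hypothetical post-processing, not part of the source: drop padded rows,
    # returning one (num_tokens, w2v_size) array per input sentence.
    lengths = [min(len(word_tokenize(s)), pad) for s in prime_batch]
    return [target_vectors[i, :n] for i, n in enumerate(lengths)]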
Example 7
    def _preprocess(self, text):
        tokens_vecs = np.zeros((self.max_text_length, 7 * len(self.vocab)))
        tokens = word_tokenize(text)
        if len(tokens) > self.max_text_length:
            tokens = tokens[:self.max_text_length]
        for i, t in enumerate(tokens):
            x = letters2vec(t, self.vocab).reshape((1, 1, -1))
            tokens_vecs[i] = x

        return tokens_vecs
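All of these snippets assume letters2vec returns a fixed 7 * len(vocab)-dimensional character-level encoding per token; its actual definition lives elsewhere in the repository. Purely as an illustration of how such a size could arise (an assumption, not the source's construction), one plausible scheme concatenates seven len(vocab)-sized blocks, e.g. one-hots for the outermost characters plus bag-of-character counts over parts of the word:

import numpy as np

def letters2vec_sketch(word, vocab):
    # Illustrative stand-in only; the project's real letters2vec may differ.
    index = {c: i for i, c in enumerate(vocab)}
    size = len(vocab)

    def one_hot(ch):
        v = np.zeros(size)
        if ch in index:
            v[index[ch]] = 1.0
        return v

    def bag(chars):
        v = np.zeros(size)
        for ch in chars:
            if ch in index:
                v[index[ch]] += 1.0
        return v

    third = max(len(word) // 3, 1)
    blocks = [
        one_hot(word[0]),
        one_hot(word[1] if len(word) > 1 else word[0]),
        one_hot(word[-1]),
        one_hot(word[-2] if len(word) > 1 else word[-1]),
        bag(word[:third]),
        bag(word[third:-third] or word),
        bag(word[-third:]),
    ]
    return np.concatenate(blocks)   # shape: (7 * len(vocab),)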
Example 8
    def valid_run(self, sess, vocab, prime):
        state = self.cell.zero_state(1, tf.float32).eval()
        tokens = word_tokenize(prime)
        targets = []
        for token in tokens:
            x = letters2vec(token, vocab).reshape((1, 1, -1))
            feed = {
                self.valid_data: x,
                self.valid_initial_state: state,
            }
            [state, target] = sess.run([self.valid_state, self.valid_vector],
                                       feed)
            targets.append(np.squeeze(target))
        return targets
Example 9
    def valid_run(self, sess, vocab, prime):
        self.valid_initial_state_fw = tf.convert_to_tensor(self.cell_fw.zero_state(1, tf.float32))
        self.valid_initial_state_bw = tf.convert_to_tensor(self.cell_bw.zero_state(1, tf.float32))
        state_fw = self.valid_initial_state_fw.eval()
        state_bw = self.valid_initial_state_bw.eval()
        tokens = word_tokenize(prime)
        targets = []

        for token in tokens:
            x = letters2vec(token, vocab).reshape((1, 1, -1))
            feed = {self.valid_data: x,
                    self.valid_initial_state_fw: state_fw,
                    self.valid_initial_state_bw: state_bw,
                    }
            [last_state, target] = sess.run([self.valid_state, self.valid_vector], feed)
            state_fw = last_state[0]
            state_bw = last_state[1]
            targets.append(np.squeeze(target))
        return targets
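Whatever variant produces them, the per-token outputs returned by these sample() / valid_run() methods are plain numpy vectors, so downstream use is straightforward. A trivial, self-contained sketch (random stand-ins instead of real model output) of comparing two token vectors by cosine similarity:

import numpy as np

# Stand-in for the list returned by valid_run(); each entry is one word vector.
targets = [np.random.rand(64) for _ in range(5)]

vectors = np.vstack(targets)                 # (num_tokens, w2v_size)
a, b = vectors[0], vectors[1]
cosine = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))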