def func(save_dir=r'../outputs',
         vocab=r'../outputs/vocabulary.txt',
         train=r'../outputs/train-data.npz',
         valid=r'../outputs/valid-data.npz',
         bidirectional=True,
         batch_size=32,
         num_epochs=1,
         learning_rate=0.001,
         dropout_keep=1.0,
         interval=1000,
         lstm_units=500,
         embedding_size=100,
         in_embeddings=None,
         in_train_embeddings=False):
    """Train a seq2seq model on the given data and save it to save_dir."""
    logging.basicConfig(level=logging.INFO)
    sess = tf.Session()

    wd = utils.WordDictionary(vocab)
    embeddings = load_or_create_embeddings(in_embeddings, wd.vocabulary_size,
                                           embedding_size)

    logging.info('Reading training data')
    train_data = utils.load_binary_data(train)
    logging.info('Reading validation data')
    valid_data = utils.load_binary_data(valid)

    logging.info('Creating model')
    # if no pretrained embeddings were given, the random ones must be trained;
    # otherwise, respect the caller's choice
    train_embeddings = in_train_embeddings if in_embeddings else True
    model = seq.seq2seqModel(lstm_units, embeddings, wd.eos_index,
                             train_embeddings=train_embeddings,
                             bidirectional=bidirectional,
                             condition=True)

    sess.run(tf.global_variables_initializer())
    show_parameter_count(model.get_trainable_variables())
    logging.info('Initialized the model and all variables. Starting training.')
    model.train(sess, save_dir, train_data, valid_data, batch_size,
                num_epochs, learning_rate, dropout_keep, 5.0,
                report_interval=interval)
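# `load_or_create_embeddings` is called above but not defined in this section.
# A minimal sketch of what it presumably does, based only on the CLI help text
# elsewhere in this section ("Numpy embeddings file. If not supplied, random
# embeddings are generated"); the uniform initialization range is an
# assumption, not the project's actual choice.
import numpy as np

def load_or_create_embeddings(path, vocab_size, embedding_size):
    """Load embeddings from a numpy file, or create random ones (sketch)."""
    if path is not None:
        return np.load(path)
    return np.random.uniform(-0.1, 0.1, (vocab_size, embedding_size))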
def func(model_dir=r'../outputs',
         vocabulary=r'../outputs/vocabulary.txt',
         lower=False):
    """Interactively run a saved seq2seq model on user-typed sentences."""
    logging.basicConfig(level=logging.INFO)
    logging.info('Reading model')
    sess = tf.InteractiveSession()
    model = seq.seq2seqModel.load(model_dir, sess)
    word_dict = utils.WordDictionary(vocabulary)
    index_dict = word_dict.inverse_dictionary()

    while True:
        string = input('Type tokenized sentence: ')
        sent = SentenceWrapper(string, word_dict, lower)
        answer = model.run(sess, [sent.indices], [len(sent)])
        answer_words = [index_dict[i] for i in answer]
        answer_str = ' '.join(answer_words)
        print('Model output:', answer_str)
""" Run the encoder part of the autoencoder in a corpus to generate the memory cell representation for them. """ if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('model', help='Directory with saved model') parser.add_argument('input', help='File with one sentence per line') parser.add_argument('vocabulary', help='File with autoencoder vocabulary') parser.add_argument('output', help='Numpy file to write output') args = parser.parse_args() wd = utils.WordDictionary(args.vocabulary) sess = tf.InteractiveSession() model = TextAutoencoder.load(args.model, sess) excelfile = args.input para = pd.read_excel(excelfile) all_states = [] for i in range(len(para)): sents = nltk.tokenize.sent_tokenize(para['paragraph'][i]) sentences, sizes = utils.load_text_data_from_list(sents, wd) state = model.encode(sess, sentences, sizes) state = state.mean(axis=0) all_states.append(state) print(len(all_states))
                    type=int, default=1000)
parser.add_argument('-g', help='number of GPUs', dest='num_gpus',
                    type=int, default=2)
parser.add_argument('--embeddings',
                    help='Numpy embeddings file. If not supplied, '
                         'random embeddings are generated.')
parser.add_argument('data', help='data directory name')
args = parser.parse_args()

logging.basicConfig(level=logging.INFO)

path = args.data + '/vocabulary.txt'
wd = utils.WordDictionary(path)
embeddings = load_or_create_embeddings(args.embeddings, wd.vocabulary_size,
                                       args.embedding_size)

logging.info('Reading training data')
path = args.data + '/train-data.npz'
train_data = utils.load_binary_data(path)
logging.info('Reading validation data')
path = args.data + '/valid-data.npz'
valid_data = utils.load_binary_data(path)

logging.info('Creating model')
model = autoencoder.TextAutoencoder(args.lstm_units, embeddings,
                                    wd.eos_index, args.num_gpus)

config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
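# The fragment above stops right after building the session config. A minimal
# sketch of the presumable next step (creating the session with that config
# and initializing the variables); the actual training call is not shown here
# because the truncated parser above does not reveal which hyperparameter
# flags exist:
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())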
        self.indices = np.array([word_dict[token] for token in self.tokens])

    def __len__(self):
        return len(self.tokens)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('model', help='Directory with saved model files')
    parser.add_argument('vocabulary', help='Vocabulary file')
    parser.add_argument('-l', dest='lower', action='store_true',
                        help='Convert text to lowercase')
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logging.info('Reading model')
    sess = tf.InteractiveSession()
    model = autoencoder.TextAutoencoder.load(args.model, sess)
    word_dict = utils.WordDictionary(args.vocabulary)
    index_dict = word_dict.inverse_dictionary()

    while True:
        string = input('Type tokenized sentence: ')
        sent = SentenceWrapper(string, word_dict, args.lower)
        answer = model.run(sess, [sent.indices], [len(sent)])
        answer_words = [index_dict[i] for i in answer]
        answer_str = ' '.join(answer_words)
        print('Model output:', answer_str)
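# The top of the SentenceWrapper class is missing from the fragment above
# (only the tail of its constructor and __len__ survive). A minimal sketch of
# a constructor consistent with the surviving code and the call sites; the
# whitespace tokenization is an assumption, and the class name is changed to
# make clear this is not the original implementation:
import numpy as np

class SentenceWrapperSketch(object):
    def __init__(self, sentence, word_dict, lower):
        if lower:
            sentence = sentence.lower()
        # the prompt asks for an already tokenized sentence, so whitespace
        # splitting is assumed to be sufficient
        self.tokens = sentence.split()
        self.indices = np.array([word_dict[token] for token in self.tokens])

    def __len__(self):
        return len(self.tokens)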
def __init__(self, lstm_units, embeddings, go, train=True,
             train_embeddings=False, bidirectional=True):
    """
    Initialize the encoder/decoder and create the Tensor objects.

    :param lstm_units: number of LSTM units
    :param embeddings: numpy array with initial embeddings
    :param go: index of the GO symbol in the embedding matrix
    :param train: whether to create the training tensors
    :param train_embeddings: whether to adjust embeddings during training
    :param bidirectional: whether to create a bidirectional autoencoder
        (if False, a simple linear LSTM is used)
    """
    # EOS and GO share the same symbol. Only GO needs to be embedded, and
    # only EOS exists as a possible network output
    self.go = go
    self.eos = go

    self.word_dict = utils.WordDictionary('../hri_data/vocabulary.txt')
    self.index_dict = self.word_dict.inverse_dictionary()

    self.bidirectional = bidirectional
    self.vocab_size = embeddings.shape[0]
    self.embedding_size = embeddings.shape[1]
    self.global_step = tf.Variable(0, name='global_step', trainable=False)

    # the sentence is the object to be memorized
    self.sentence = tf.placeholder(tf.int32, [None, None], 'sentence')
    self.sentence_size = tf.placeholder(tf.int32, [None], 'sentence_size')
    self.l2_constant = tf.placeholder(tf.float32, name='l2_constant')
    self.clip_value = tf.placeholder(tf.float32, name='clip')
    self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    self.dropout_keep = tf.placeholder(tf.float32, name='dropout_keep')

    self.decoder_step_input = tf.placeholder(tf.int32, [None],
                                             'prediction_step')

    name = 'decoder_fw_step_state_c'
    self.decoder_fw_step_c = tf.placeholder(tf.float32,
                                            [None, lstm_units], name)
    name = 'decoder_fw_step_state_h'
    self.decoder_fw_step_h = tf.placeholder(tf.float32,
                                            [None, lstm_units], name)
    self.decoder_bw_step_c = tf.placeholder(tf.float32, [None, lstm_units],
                                            'decoder_bw_step_state_c')
    self.decoder_bw_step_h = tf.placeholder(tf.float32, [None, lstm_units],
                                            'decoder_bw_step_state_h')

    with tf.variable_scope('autoencoder') as self.scope:
        self.embeddings = tf.Variable(embeddings, name='embeddings',
                                      trainable=train_embeddings)

        initializer = tf.glorot_normal_initializer()
        self.lstm_fw = tf.nn.rnn_cell.LSTMCell(lstm_units,
                                               initializer=initializer)
        self.lstm_bw = tf.nn.rnn_cell.LSTMCell(lstm_units,
                                               initializer=initializer)

        embedded = tf.nn.embedding_lookup(self.embeddings, self.sentence)
        embedded = tf.nn.dropout(embedded, self.dropout_keep)

        # encoding step
        if bidirectional:
            bdr = tf.nn.bidirectional_dynamic_rnn
            ret = bdr(self.lstm_fw, self.lstm_bw, embedded,
                      dtype=tf.float32,
                      sequence_length=self.sentence_size,
                      scope=self.scope)
        else:
            ret = tf.nn.dynamic_rnn(self.lstm_fw, embedded,
                                    dtype=tf.float32,
                                    sequence_length=self.sentence_size,
                                    scope=self.scope)
        _, self.encoded_state = ret

        if bidirectional:
            encoded_state_fw, encoded_state_bw = self.encoded_state

            # set the scope name used inside the decoder.
            # maybe there's a more elegant way to do it?
            fw_scope_name = self.scope.name + '/fw'
            bw_scope_name = self.scope.name + '/bw'
        else:
            encoded_state_fw = self.encoded_state
            fw_scope_name = self.scope

        self.scope.reuse_variables()

        # generate a batch of embedded GO
        # sentence_size has the batch dimension
        go_batch = self._generate_batch_go(self.sentence_size)
        embedded_eos = tf.nn.embedding_lookup(self.embeddings, go_batch)
        embedded_eos = tf.reshape(embedded_eos,
                                  [-1, 1, self.embedding_size])
        decoder_input = tf.concat([embedded_eos, embedded], axis=1)

        # decoding step
        # We give the same inputs to the forward and backward LSTMs,
        # but each one has its own hidden state.
        # Their outputs are concatenated and fed to the softmax layer.
        if bidirectional:
            outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                self.lstm_fw, self.lstm_bw, decoder_input,
                self.sentence_size, encoded_state_fw, encoded_state_bw)

            # concat fw and bw outputs
            outputs = tf.concat(outputs, -1)
        else:
            outputs, _ = tf.nn.dynamic_rnn(
                self.lstm_fw, decoder_input, self.sentence_size,
                encoded_state_fw)

        self.decoder_outputs = outputs

    # now project the outputs to the vocabulary
    with tf.variable_scope('projection') as self.projection_scope:
        # the logits have shape (batch, max_sentence_size, vocab_size)
        self.logits = tf.layers.dense(outputs, self.vocab_size)

    # tensors for running a model
    embedded_step = tf.nn.embedding_lookup(self.embeddings,
                                           self.decoder_step_input)
    state_fw = tf.nn.rnn_cell.LSTMStateTuple(self.decoder_fw_step_c,
                                             self.decoder_fw_step_h)
    state_bw = tf.nn.rnn_cell.LSTMStateTuple(self.decoder_bw_step_c,
                                             self.decoder_bw_step_h)

    with tf.variable_scope(fw_scope_name, reuse=True):
        ret_fw = self.lstm_fw(embedded_step, state_fw)
    step_output_fw, self.decoder_fw_step_state = ret_fw

    if bidirectional:
        with tf.variable_scope(bw_scope_name, reuse=True):
            ret_bw = self.lstm_bw(embedded_step, state_bw)
        step_output_bw, self.decoder_bw_step_state = ret_bw
        step_output = tf.concat(axis=1, values=[step_output_fw,
                                                step_output_bw])
    else:
        step_output = step_output_fw

    with tf.variable_scope(self.projection_scope, reuse=True):
        self.projected_step_output = tf.layers.dense(step_output,
                                                     self.vocab_size)

    if train:
        self._create_training_tensors()
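# A minimal usage sketch for the constructor above, assuming it belongs to the
# TextAutoencoder class referenced elsewhere in this section and that the same
# `utils` helpers (and the `encode` method used in the encoding script) are
# available; the hyperparameter values, paths, and example sentences are
# placeholders, not project defaults:
import numpy as np
import tensorflow as tf

import utils
from autoencoder import TextAutoencoder

wd = utils.WordDictionary('../outputs/vocabulary.txt')
embeddings = np.random.uniform(-0.1, 0.1,
                               (wd.vocabulary_size, 100)).astype(np.float32)

sess = tf.Session()
model = TextAutoencoder(lstm_units=500, embeddings=embeddings,
                        go=wd.eos_index, train=False)
sess.run(tf.global_variables_initializer())

# encode two already tokenized sentences into their memory-cell states
sentences, sizes = utils.load_text_data_from_list(
    ['this is a test .', 'another short sentence .'], wd)
states = model.encode(sess, sentences, sizes)
print(states.shape)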