Beispiel #1
0
def func(save_dir=r'../outputs',
         vocab=r'../outputs/vocabulary.txt',
         train=r'../outputs/train-data.npz',
         valid=r'../outputs/valid-data.npz',
         bidirectional=1,
         batch_size=32,
         num_epochs=1,
         learning_rate=0.001,
         dropout_keep=1.0,
         interval=1000,
         lstm_units=500,
         embedding_size=100,
         in_embeddings=None,
         in_train_embeddings=False):
    """
    Build a seq2seq model and train it on pre-processed binary data.

    The TensorFlow session is closed when training finishes or fails.

    :param save_dir: passed to ``model.train`` as its second argument
        (presumably the directory where the model is saved -- confirm)
    :param vocab: vocabulary file loaded with ``utils.WordDictionary``
    :param train: training data loaded with ``utils.load_binary_data``
    :param valid: validation data loaded with ``utils.load_binary_data``
    :param bidirectional: forwarded to ``seq.seq2seqModel``
    :param batch_size: forwarded to ``model.train``
    :param num_epochs: forwarded to ``model.train``
    :param learning_rate: forwarded to ``model.train``
    :param dropout_keep: forwarded to ``model.train``
    :param interval: forwarded to ``model.train`` as ``report_interval``
    :param lstm_units: first argument of ``seq.seq2seqModel``
    :param embedding_size: forwarded to ``load_or_create_embeddings``
    :param in_embeddings: forwarded to ``load_or_create_embeddings``; when
        it is falsy the embeddings are always trained
    :param in_train_embeddings: whether to train the embeddings; only
        honoured when ``in_embeddings`` is truthy
    """
    logging.basicConfig(level=logging.INFO)

    # The context manager guarantees the session (and the resources it
    # holds) is released even when loading or training raises.
    with tf.Session() as sess:
        wd = utils.WordDictionary(vocab)
        embeddings = load_or_create_embeddings(in_embeddings,
                                               wd.vocabulary_size,
                                               embedding_size)

        logging.info('Reading training data')
        train_data = utils.load_binary_data(train)
        logging.info('Reading validation data')
        valid_data = utils.load_binary_data(valid)
        logging.info('Creating model')

        # Without an embeddings file there is nothing pre-trained to keep
        # frozen, so the embeddings are trained regardless of the flag.
        train_embeddings = in_train_embeddings if in_embeddings else True
        model = seq.seq2seqModel(lstm_units,
                                 embeddings,
                                 wd.eos_index,
                                 train_embeddings=train_embeddings,
                                 bidirectional=bidirectional,
                                 condition=True)

        sess.run(tf.global_variables_initializer())
        show_parameter_count(model.get_trainable_variables())
        logging.info(
            'Initialized the model and all variables. Starting training.')

        # NOTE(review): 5.0 is passed positionally right after dropout_keep;
        # presumably the gradient clipping value -- confirm against
        # seq2seqModel.train.
        model.train(sess,
                    save_dir,
                    train_data,
                    valid_data,
                    batch_size,
                    num_epochs,
                    learning_rate,
                    dropout_keep,
                    5.0,
                    report_interval=interval)
Beispiel #2
0
def func(model=r'../outputs', vocabulary=r'../outputs/vocabulary.txt',
         lower=-1):
    """
    Load a saved seq2seq model and run it interactively on typed sentences.

    Sentences are read from standard input until end-of-file or Ctrl-C, at
    which point the session is closed and the function returns.

    :param model: directory passed to ``seq.seq2seqModel.load``
    :param vocabulary: vocabulary file loaded with ``utils.WordDictionary``
    :param lower: forwarded to ``SentenceWrapper`` as its third argument.
        NOTE(review): the default ``-1`` is truthy; presumably lowercasing
        is therefore enabled by default -- confirm against SentenceWrapper.
    """
    logging.basicConfig(level=logging.INFO)
    logging.info('Reading model')
    sess = tf.InteractiveSession()
    try:
        # Use a separate name so the ``model`` argument (a path) is not
        # rebound to an object of a different type.
        loaded_model = seq.seq2seqModel.load(model, sess)
        word_dict = utils.WordDictionary(vocabulary)
        index_dict = word_dict.inverse_dictionary()

        while True:
            try:
                string = input('Type tokenized sentence: ')
            except (EOFError, KeyboardInterrupt):
                # End of input or Ctrl-C: finish the prompt line and leave
                # the loop instead of dying with a traceback.
                print()
                break
            sent = SentenceWrapper(string, word_dict, lower)
            answer = loaded_model.run(sess, [sent.indices], [len(sent)])
            answer_words = [index_dict[i] for i in answer]
            answer_str = ' '.join(answer_words)
            print('Model output:', answer_str)
    finally:
        sess.close()
"""
Run the encoder part of the autoencoder in a corpus to generate
the memory cell representation for them.
"""

# Script entry point: encode every paragraph of a spreadsheet with a saved
# autoencoder and collect one averaged state vector per paragraph.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('model', help='Directory with saved model')
    # NOTE(review): the help text says "one sentence per line", but the file
    # is read below with pd.read_excel and must contain a 'paragraph' column.
    parser.add_argument('input', help='File with one sentence per line')
    parser.add_argument('vocabulary', help='File with autoencoder vocabulary')
    # NOTE(review): args.output is not used in the lines visible here.
    parser.add_argument('output', help='Numpy file to write output')

    args = parser.parse_args()

    wd = utils.WordDictionary(args.vocabulary)
    sess = tf.InteractiveSession()
    model = TextAutoencoder.load(args.model, sess)

    excelfile = args.input
    para = pd.read_excel(excelfile)
    # one entry per row of the spreadsheet
    all_states = []
    for i in range(len(para)):
        # split the paragraph into sentences and encode them together
        sents = nltk.tokenize.sent_tokenize(para['paragraph'][i])
        sentences, sizes = utils.load_text_data_from_list(sents, wd)
        state = model.encode(sess, sentences, sizes)
        # average over axis 0 (presumably the sentence axis -- TODO confirm)
        # so each paragraph is represented by a single vector
        state = state.mean(axis=0)
        all_states.append(state)
        # progress indicator: number of paragraphs processed so far
        print(len(all_states))

    
                        type=int,
                        default=1000)
    # -g is stored as args.num_gpus and later handed to the model constructor
    parser.add_argument('-g',
                        help='gpu number',
                        dest='num_gpus',
                        type=int,
                        default=2)
    parser.add_argument('--embeddings',
                        help='Numpy embeddings file. If not supplied, '
                        'random embeddings are generated.')
    parser.add_argument('data', help='data directory name')
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    # The data directory is expected to contain vocabulary.txt,
    # train-data.npz and valid-data.npz.
    path = args.data + '/vocabulary.txt'
    wd = utils.WordDictionary(path)
    embeddings = load_or_create_embeddings(args.embeddings, wd.vocabulary_size,
                                           args.embedding_size)

    logging.info('Reading training data')
    path = args.data + '/train-data.npz'
    train_data = utils.load_binary_data(path)
    logging.info('Reading validation data')
    path = args.data + '/valid-data.npz'
    valid_data = utils.load_binary_data(path)
    logging.info('Creating model')

    model = autoencoder.TextAutoencoder(args.lstm_units, embeddings,
                                        wd.eos_index, args.num_gpus)
    # Session configuration: soft placement enabled and GPU memory growth
    # turned on (see the tf.ConfigProto documentation for exact semantics).
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
        self.indices = np.array([word_dict[token] for token in self.tokens])

    def __len__(self):
        """Return the number of items held in ``self.tokens``."""
        token_count = len(self.tokens)
        return token_count


# Script entry point: load a saved autoencoder and run it interactively on
# sentences typed by the user until end-of-file or Ctrl-C.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('model', help='Directory with saved model files')
    parser.add_argument('vocabulary', help='Vocabulary file')
    parser.add_argument('-l',
                        dest='lower',
                        action='store_true',
                        help='Convert text to lowercase')
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logging.info('Reading model')
    sess = tf.InteractiveSession()
    try:
        model = autoencoder.TextAutoencoder.load(args.model, sess)
        word_dict = utils.WordDictionary(args.vocabulary)
        # maps the indices produced by the model back to words
        index_dict = word_dict.inverse_dictionary()

        while True:
            try:
                string = input('Type tokenized sentence: ')
            except (EOFError, KeyboardInterrupt):
                # End of input or Ctrl-C: finish the prompt line and stop
                # cleanly instead of dying with a traceback.
                print()
                break
            sent = SentenceWrapper(string, word_dict, args.lower)
            answer = model.run(sess, [sent.indices], [len(sent)])
            answer_words = [index_dict[i] for i in answer]
            answer_str = ' '.join(answer_words)
            print('Model output:', answer_str)
    finally:
        # release the session no matter how the loop ended
        sess.close()
Beispiel #6
0
    def __init__(self, lstm_units, embeddings, go, train=True,
                 train_embeddings=False, bidirectional=True,
                 vocabulary_path='../hri_data/vocabulary.txt'):
        """
        Initialize the encoder/decoder and creates Tensor objects

        :param lstm_units: number of LSTM units
        :param embeddings: numpy array with initial embeddings
        :param go: index of the GO symbol in the embedding matrix
        :param train: whether to also create the training tensors
            (calls ``self._create_training_tensors()`` when true)
        :param train_embeddings: whether to adjust embeddings during training
        :param bidirectional: whether to create a bidirectional autoencoder
            (if False, a simple linear LSTM is used)
        :param vocabulary_path: vocabulary file loaded into
            ``self.word_dict``; defaults to the previously hard-coded path
        """
        # EOS and GO share the same symbol. Only GO needs to be embedded, and
        # only EOS exists as a possible network output
        self.go = go
        self.eos = go

        # word <-> index mappings kept on the model
        self.word_dict = utils.WordDictionary(vocabulary_path)
        self.index_dict = self.word_dict.inverse_dictionary()

        self.bidirectional = bidirectional
        self.vocab_size = embeddings.shape[0]
        self.embedding_size = embeddings.shape[1]
        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        # the sentence is the object to be memorized
        self.sentence = tf.placeholder(tf.int32, [None, None], 'sentence')
        self.sentence_size = tf.placeholder(tf.int32, [None],
                                            'sentence_size')
        self.l2_constant = tf.placeholder(tf.float32, name='l2_constant')
        self.clip_value = tf.placeholder(tf.float32, name='clip')
        self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')
        self.dropout_keep = tf.placeholder(tf.float32, name='dropout_keep')

        # placeholders used to run the decoder one step at a time:
        # the input symbol plus the (c, h) state of each direction
        self.decoder_step_input = tf.placeholder(tf.int32,
                                                 [None],
                                                 'prediction_step')

        name = 'decoder_fw_step_state_c'
        self.decoder_fw_step_c = tf.placeholder(tf.float32,
                                                [None, lstm_units], name)
        name = 'decoder_fw_step_state_h'
        self.decoder_fw_step_h = tf.placeholder(tf.float32,
                                                [None, lstm_units], name)
        self.decoder_bw_step_c = tf.placeholder(tf.float32,
                                                [None, lstm_units],
                                                'decoder_bw_step_state_c')
        self.decoder_bw_step_h = tf.placeholder(tf.float32,
                                                [None, lstm_units],
                                                'decoder_bw_step_state_h')

        with tf.variable_scope('autoencoder') as self.scope:
            self.embeddings = tf.Variable(embeddings, name='embeddings',
                                          trainable=train_embeddings)

            initializer = tf.glorot_normal_initializer()
            self.lstm_fw = tf.nn.rnn_cell.LSTMCell(lstm_units,
                                                   initializer=initializer)
            self.lstm_bw = tf.nn.rnn_cell.LSTMCell(lstm_units,
                                                   initializer=initializer)

            embedded = tf.nn.embedding_lookup(self.embeddings, self.sentence)
            embedded = tf.nn.dropout(embedded, self.dropout_keep)

            # encoding step
            if bidirectional:
                bdr = tf.nn.bidirectional_dynamic_rnn
                ret = bdr(self.lstm_fw, self.lstm_bw,
                          embedded, dtype=tf.float32,
                          sequence_length=self.sentence_size,
                          scope=self.scope)
            else:
                ret = tf.nn.dynamic_rnn(self.lstm_fw, embedded,
                                        dtype=tf.float32,
                                        sequence_length=self.sentence_size,
                                        scope=self.scope)
            # only the final state is kept; the encoder outputs are discarded
            _, self.encoded_state = ret
            if bidirectional:
                encoded_state_fw, encoded_state_bw = self.encoded_state

                # set the scope name used inside the decoder.
                # maybe there's a more elegant way to do it?
                fw_scope_name = self.scope.name + '/fw'
                bw_scope_name = self.scope.name + '/bw'
            else:
                encoded_state_fw = self.encoded_state
                # tf.variable_scope accepts a scope object as well as a
                # name, so the scope itself is stored here
                fw_scope_name = self.scope

            # the decoder below shares the LSTM weights with the encoder
            self.scope.reuse_variables()

            # generate a batch of embedded GO
            # sentence_size has the batch dimension
            go_batch = self._generate_batch_go(self.sentence_size)
            embedded_eos = tf.nn.embedding_lookup(self.embeddings,
                                                  go_batch)
            embedded_eos = tf.reshape(embedded_eos,
                                      [-1, 1, self.embedding_size])
            # prepend the GO embedding to every embedded sentence
            decoder_input = tf.concat([embedded_eos, embedded], axis=1)

            # decoding step

            # We give the same inputs to the forward and backward LSTMs,
            # but each one has its own hidden state
            # their outputs are concatenated and fed to the softmax layer
            if bidirectional:
                outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                    self.lstm_fw, self.lstm_bw, decoder_input,
                    self.sentence_size, encoded_state_fw, encoded_state_bw)

                # concat fw and bw outputs
                outputs = tf.concat(outputs, -1)
            else:
                outputs, _ = tf.nn.dynamic_rnn(
                    self.lstm_fw, decoder_input, self.sentence_size,
                    encoded_state_fw)

            self.decoder_outputs = outputs

        # now project the outputs to the vocabulary
        with tf.variable_scope('projection') as self.projection_scope:
            # the dense layer maps the last axis of the decoder outputs to
            # vocab_size, giving one logit per vocabulary entry
            self.logits = tf.layers.dense(outputs, self.vocab_size)

        # tensors for running a model
        embedded_step = tf.nn.embedding_lookup(self.embeddings,
                                               self.decoder_step_input)
        state_fw = tf.nn.rnn_cell.LSTMStateTuple(self.decoder_fw_step_c,
                                                 self.decoder_fw_step_h)
        state_bw = tf.nn.rnn_cell.LSTMStateTuple(self.decoder_bw_step_c,
                                                 self.decoder_bw_step_h)
        with tf.variable_scope(fw_scope_name, reuse=True):
            ret_fw = self.lstm_fw(embedded_step, state_fw)
        step_output_fw, self.decoder_fw_step_state = ret_fw

        if bidirectional:
            # bw_scope_name is only defined in the bidirectional case
            with tf.variable_scope(bw_scope_name, reuse=True):
                ret_bw = self.lstm_bw(embedded_step, state_bw)
                step_output_bw, self.decoder_bw_step_state = ret_bw
                step_output = tf.concat(axis=1, values=[step_output_fw,
                                                        step_output_bw])
        else:
            step_output = step_output_fw

        # reuse the projection weights created above for the single step
        with tf.variable_scope(self.projection_scope, reuse=True):
            self.projected_step_output = tf.layers.dense(step_output,
                                                         self.vocab_size)

        if train:
            self._create_training_tensors()