Example #1
 def load_model(self):
     tf.compat.v1.disable_eager_execution()
     # placeholders
     self.x = tf.compat.v1.placeholder(tf.int32, shape=[self.batch_size, None])
     self.y = tf.compat.v1.placeholder(tf.int32, shape=[self.batch_size, None])
     self.mems_i = [tf.compat.v1.placeholder(tf.float32, [self.mem_len, self.batch_size, self.d_model]) for _ in range(self.n_layer)]
     # model
     self.global_step = tf.compat.v1.train.get_or_create_global_step()
     initializer = tf.compat.v1.initializers.random_normal(stddev=0.02, seed=None)
     proj_initializer = tf.compat.v1.initializers.random_normal(stddev=0.01, seed=None)
     with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()):
         xx = tf.transpose(self.x, [1, 0])
         yy = tf.transpose(self.y, [1, 0])
         loss, self.logits, self.new_mem = modules.transformer(
             dec_inp=xx,
             target=yy,
             mems=self.mems_i,
             n_token=self.n_token,
             n_layer=self.n_layer,
             d_model=self.d_model,
             d_embed=self.d_embed,
             n_head=self.n_head,
             d_head=self.d_head,
             d_inner=self.d_ff,
             dropout=self.dropout,
             dropatt=self.dropout,
             initializer=initializer,
             proj_initializer=proj_initializer,
             is_training=self.is_training,
             mem_len=self.mem_len,
             cutoffs=[],
             div_val=-1,
             tie_projs=[],
             same_length=False,
             clamp_len=-1,
             input_perms=None,
             target_perms=None,
             head_target=None,
             untie_r=False,
             proj_same_dim=True)
     self.avg_loss = tf.reduce_mean(loss)
     # vars
     all_vars = tf.compat.v1.trainable_variables()
     grads = tf.gradients(self.avg_loss, all_vars)
     grads_and_vars = list(zip(grads, all_vars))
     all_trainable_vars = tf.reduce_sum([tf.reduce_prod(v.shape) for v in tf.compat.v1.trainable_variables()])
     # optimizer
     decay_lr = tf.compat.v1.train.cosine_decay(
         self.learning_rate,
         global_step=self.global_step,
         decay_steps=400000,
         alpha=0.004)
     optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=decay_lr)
     self.train_op = optimizer.apply_gradients(grads_and_vars, self.global_step)
     # saver
     self.saver = tf.compat.v1.train.Saver()
     config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
     config.gpu_options.allow_growth = True
     self.sess = tf.compat.v1.Session(config=config)
     self.saver.restore(self.sess, self.checkpoint_path)
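This training variant keeps the Transformer-XL recurrence state in the mems_i placeholders, so the caller has to feed memories in and carry the returned new_mem forward. A minimal sketch of one training step, assuming zero-initialized memories of shape [mem_len, batch_size, d_model]; the names model, batch_x, and batch_y are hypothetical driver-side names, not part of the example above:

    import numpy as np

    # hypothetical driver code: `model` is an instance whose load_model() ran above,
    # batch_x / batch_y are int arrays of shape [batch_size, seq_len]
    batch_mems = [np.zeros((model.mem_len, model.batch_size, model.d_model), dtype=np.float32)
                  for _ in range(model.n_layer)]
    feed = {model.x: batch_x, model.y: batch_y}
    for placeholder, mem in zip(model.mems_i, batch_mems):
        feed[placeholder] = mem
    # run one optimizer step and carry the updated memories into the next call
    _, loss_value, batch_mems = model.sess.run(
        [model.train_op, model.avg_loss, model.new_mem], feed_dict=feed)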
Example #2
 def load_model(self):
     # placeholders
     self.x = tf.compat.v1.placeholder(tf.int32,
                                       shape=[self.batch_size, None])
     self.y = tf.compat.v1.placeholder(tf.int32,
                                       shape=[self.batch_size, None])
     self.mems_i = [
         tf.compat.v1.placeholder(
             tf.float32, [self.mem_len, self.batch_size, self.d_model])
         for _ in range(self.n_layer)
     ]
     # model
     initializer = tf.compat.v1.initializers.random_normal(stddev=0.02,
                                                           seed=None)
     proj_initializer = tf.compat.v1.initializers.random_normal(stddev=0.01,
                                                                seed=None)
     with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()):
         xx = tf.transpose(self.x, [1, 0])
         yy = tf.transpose(self.y, [1, 0])
         loss, self.logits, self.new_mem = modules.transformer(
             dec_inp=xx,
             target=yy,
             mems=self.mems_i,
             n_token=self.n_token,
             n_layer=self.n_layer,
             d_model=self.d_model,
             d_embed=self.d_embed,
             n_head=self.n_head,
             d_head=self.d_head,
             d_inner=self.d_ff,
             dropout=0.0,
             dropatt=0.0,
             initializer=initializer,
             proj_initializer=proj_initializer,
             is_training=False,
             mem_len=self.mem_len,
             cutoffs=[],
             div_val=-1,
             tie_projs=[],
             same_length=False,
             clamp_len=-1,
             input_perms=None,
             target_perms=None,
             head_target=None,
             untie_r=False,
             proj_same_dim=True)
     # restore
     self.saver = tf.compat.v1.train.Saver()
     config = tf.compat.v1.ConfigProto()
     config.gpu_options.allow_growth = True
     self.sess = tf.compat.v1.Session(config=config)
     self.saver.restore(self.sess, self.checkpoint_path)
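Example #2 builds the same graph with dropout disabled and is_training=False, so it is only suitable for inference; the memory placeholders still have to be fed and threaded between calls. A short sketch of one forward pass, reusing the hypothetical model, batch_x, batch_y, and batch_mems names from the sketch above:

    feed = {model.x: batch_x, model.y: batch_y}
    for placeholder, mem in zip(model.mems_i, batch_mems):
        feed[placeholder] = mem
    # logits follow the time-major layout of the transposed inputs;
    # check modules.transformer for the exact output shape
    logits, batch_mems = model.sess.run([model.logits, model.new_mem], feed_dict=feed)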
Example #3
 def test_transformer(self):
     in_seq = torch.LongTensor([[1, 2, 3, 4], [8, 7, 6, 5]])
     out_seq = torch.LongTensor([[0, 1, 2], [3, 9, 3]])
     transformer = Transformer(10, 6, 6, 8, 3, 2, 2, 4)
     probs = transformer(in_seq, out_seq)
     self.assertEqual(probs.shape, torch.Size([2, 3, 10]))
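The test only pins the output shape to [batch, target_len, vocab]; the Transformer class it exercises is not shown here. Below is a minimal stand-in (not the class under test) built on torch.nn.Transformer that satisfies the same shape contract; the class name ToyTransformer and all hyperparameter values are arbitrary choices for illustration:

    import torch
    import torch.nn as nn

    class ToyTransformer(nn.Module):
        """Hypothetical stand-in with the same (batch, tgt_len, vocab) output contract."""

        def __init__(self, vocab_size, d_model=8, nhead=2, num_layers=2, dim_ff=16):
            super().__init__()
            self.src_emb = nn.Embedding(vocab_size, d_model)
            self.tgt_emb = nn.Embedding(vocab_size, d_model)
            self.core = nn.Transformer(d_model=d_model, nhead=nhead,
                                       num_encoder_layers=num_layers,
                                       num_decoder_layers=num_layers,
                                       dim_feedforward=dim_ff,
                                       batch_first=True)
            self.out = nn.Linear(d_model, vocab_size)

        def forward(self, src, tgt):
            # embed both sequences, run the encoder-decoder, project to the vocabulary
            h = self.core(self.src_emb(src), self.tgt_emb(tgt))
            return torch.softmax(self.out(h), dim=-1)

    probs = ToyTransformer(vocab_size=10)(torch.LongTensor([[1, 2, 3, 4], [8, 7, 6, 5]]),
                                          torch.LongTensor([[0, 1, 2], [3, 9, 3]]))
    assert probs.shape == torch.Size([2, 3, 10])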
Example #4
    def load_model(self):
        # placeholders
        self.x = tf.compat.v1.placeholder(tf.int32, shape=[self.batch_size, None])
        self.y = tf.compat.v1.placeholder(tf.int32, shape=[self.batch_size, None])
        self.mems_i = [tf.compat.v1.placeholder(tf.float32, [self.mem_len, self.batch_size, self.d_model]) for _ in
                       range(self.n_layer)]
        # model
        self.global_step = tf.compat.v1.train.get_or_create_global_step()

        initializer = tf.compat.v1.keras.initializers.glorot_normal()
        proj_initializer = tf.compat.v1.keras.initializers.glorot_normal()

        with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()):
            xx = tf.transpose(self.x, [1, 0])
            yy = tf.transpose(self.y, [1, 0])
            loss, self.logits, self.new_mem = modules.transformer(
                dec_inp=xx,
                target=yy,
                mems=self.mems_i,
                n_token=self.n_token,
                n_layer=self.n_layer,
                d_model=self.d_model,
                d_embed=self.d_embed,
                n_head=self.n_head,
                d_head=self.d_head,
                d_inner=self.d_ff,
                dropout=self.dropout,
                dropatt=self.dropout,
                initializer=initializer,
                proj_initializer=proj_initializer,
                is_training=self.is_training,
                mem_len=self.mem_len,
                rezero=self.rezero,
                cutoffs=[],
                div_val=-1,
                tie_projs=[],
                same_length=False,
                clamp_len=-1,
                input_perms=None,
                target_perms=None,
                head_target=None,
                untie_r=False,
                proj_same_dim=True)
            variables = tf.compat.v1.trainable_variables()
        # loss and gradients (reduce the loss before differentiating it)
        self.avg_loss = tf.reduce_mean(loss)
        grads = tf.gradients(self.avg_loss, variables)
        grads_and_vars = list(zip(grads, variables))

        # optimizer
        decay_lr = tf.compat.v1.train.cosine_decay(
            self.learning_rate,
            global_step=self.global_step,
            decay_steps=400000,
            alpha=0.004)

        optimizer = RAdamOptimizer(decay_lr)
        optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
        self.train_op = optimizer.apply_gradients(grads_and_vars, self.global_step)

        # saver
        self.saver = tf.compat.v1.train.Saver()
        config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
        self.sess = tf.compat.v1.Session(config=config)
        self.saver.restore(self.sess, self.checkpoint_path)