def __init__(self, env, args):
    # TODO: Create a suitable model.
    #
    # Apart from the model defined in `reinforce`, define also another
    # model for computing baseline (with one output, using a dense layer
    # without activation).
    #
    # Using Adam optimizer with given `args.learning_rate` for both models
    # is a good default.
    inputs = tf.keras.Input(shape=env.observation_space.shape)
    baseline_inputs = tf.keras.Input(shape=env.observation_space.shape)

    # Convolutional feature extractor for the policy network.
    x = tf.keras.layers.Conv2D(filters=args.cnn_filters, kernel_size=8, strides=2)(inputs)
    x = tf.keras.layers.ReLU()(x)
    # x = tf.keras.layers.Conv2D(filters=args.cnn_filters * 2, kernel_size=4, strides=2)(x)
    # x = tf.keras.layers.ReLU()(x)
    policy_features = tf.keras.layers.Flatten()(x)

    # A separate convolutional feature extractor for the baseline network.
    x = tf.keras.layers.Conv2D(filters=args.cnn_filters, kernel_size=8, strides=2)(baseline_inputs)
    x = tf.keras.layers.ReLU()(x)
    # x = tf.keras.layers.Conv2D(filters=args.cnn_filters * 2, kernel_size=4, strides=2)(x)
    # x = tf.keras.layers.ReLU()(x)
    baseline_features = tf.keras.layers.Flatten()(x)

    hidden = policy_features
    hidden_b = baseline_features
    for _ in range(args.hidden_layers):
        hidden = tf.keras.layers.Dense(args.hidden_layer_size, activation=args.activation,
                                       kernel_regularizer='l2')(hidden)
        hidden = tf.keras.layers.Dropout(args.dropout)(hidden)
        hidden_b = tf.keras.layers.Dense(args.hidden_layer_size, activation=args.activation,
                                         kernel_regularizer='l2')(hidden_b)
        hidden_b = tf.keras.layers.Dropout(args.dropout)(hidden_b)

    # Policy head: distribution over actions.
    out = tf.keras.layers.Dense(env.action_space.n, activation='softmax')(hidden)
    # Baseline head: a single linear output.
    out_b = tf.keras.layers.Dense(1)(hidden_b)
    out_b = tf.keras.layers.Flatten()(out_b)

    self._model = tf.keras.Model(inputs, out)
    # self._model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    #                     optimizer=tf.keras.optimizers.Adam(args.learning_rate, clipnorm=args.grad_clipping))
    self._model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                        optimizer=RAdamOptimizer(args.learning_rate))

    self._baseline_model = tf.keras.Model(baseline_inputs, out_b)
    loss = tf.keras.losses.Huber()
    # self._baseline_model.compile(loss=loss,
    #                              optimizer=tf.keras.optimizers.Adam(args.learning_rate, clipnorm=args.grad_clipping))
    self._baseline_model.compile(loss=loss, optimizer=RAdamOptimizer(args.learning_rate))
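# --- Illustrative sketch, not part of the original source ---
# A minimal REINFORCE-with-baseline training step using the two models above,
# assuming `states`, `actions` and `returns` are NumPy arrays collected from
# episode rollouts. The baseline prediction is subtracted from the returns
# to reduce the variance of the policy gradient.
def _train_sketch(self, states, actions, returns):
    baseline = self._baseline_model.predict(states, verbose=0).ravel()
    # Cross-entropy of the taken actions, weighted by the advantage.
    self._model.train_on_batch(states, actions, sample_weight=returns - baseline)
    # Regress the baseline towards the observed returns.
    self._baseline_model.train_on_batch(states, returns)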
def __init__(self, env, args):
    # TODO: Analogously to paac, your model should contain two components:
    # - actor, which predicts distribution over the actions
    # - critic, which predicts the value function
    #
    # The given states are tile encoded, so they are integral indices of
    # tiles intersecting the state. Therefore, you should convert them
    # to dense encoding (one-hot-like, with `args.tiles` ones).
    # (Or you can even use embeddings for better efficiency.)
    #
    # The actor computes `mus` and `sds`, each of shape [batch_size, actions].
    # Compute each independently using states as input, adding a fully connected
    # layer with `args.hidden_layer_size` units and ReLU activation. Then:
    # - For `mus`, add a fully connected layer with `actions` outputs.
    #   To avoid `mus` moving from the required range, you should apply
    #   properly scaled `tf.tanh` activation.
    # - For `sds`, add a fully connected layer with `actions` outputs
    #   and `tf.nn.softplus` activation.
    #
    # The critic should be a usual one, passing states through one hidden
    # layer with `args.hidden_layer_size` ReLU units and then predicting
    # the value function.

    # Actor: embed the tile indices, average them, and predict mus and sds.
    policy_in = tf.keras.Input(shape=(args.tiles,))
    x = tf.keras.layers.Embedding(env.observation_space.nvec[-1],
                                  args.hidden_layer_size,
                                  input_length=args.tiles)(policy_in)
    x = tf.keras.layers.GlobalAveragePooling1D(data_format="channels_last")(x)
    x = tf.keras.layers.Dense(args.hidden_layer_size, activation='relu')(x)
    # Scaled tanh keeps mus within the environment's action range.
    self.mu = tf.keras.layers.Dense(
        1, activation=lambda x: tf.constant(2.0) * tf.tanh(x))(x)
    self.sd = tf.keras.layers.Dense(
        1, activation=tf.keras.activations.softplus)(x)
    policy_out = tf.keras.layers.Concatenate()([self.mu, self.sd])
    self.actor = tf.keras.Model(policy_in, policy_out)
    self.policy_optimizer = RAdamOptimizer(args.learning_rate)

    # Critic: the same embedding trunk, predicting a single state value.
    value_in = tf.keras.Input(shape=(args.tiles,))
    x = tf.keras.layers.Embedding(env.observation_space.nvec[-1],
                                  args.hidden_layer_size,
                                  input_length=args.tiles)(value_in)
    x = tf.keras.layers.GlobalAveragePooling1D(data_format="channels_last")(x)
    x = tf.keras.layers.Dense(args.hidden_layer_size, activation='relu')(x)
    value_out = tf.keras.layers.Dense(1)(x)
    self.critic = tf.keras.Model(value_in, value_out)
    self.critic.compile(optimizer=RAdamOptimizer(args.learning_rate),
                        loss=tf.keras.losses.MeanSquaredError())
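# --- Illustrative sketch, not part of the original source ---
# Sampling one continuous action from the actor output for a single
# tile-encoded `state` (a NumPy array of `args.tiles` tile indices); the
# concatenated output holds [mu, sd] per example. Assumes numpy is imported
# as `np`.
def _sample_action_sketch(network, state):
    mus_sds = network.actor(np.expand_dims(state, 0)).numpy()
    mu, sd = mus_sds[0, 0], mus_sds[0, 1]
    return np.random.normal(loc=mu, scale=sd)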
def __init__(self, env, args):
    # Policy network: one hidden ReLU layer, softmax over actions.
    policy_in = tf.keras.Input(shape=env.observation_space.shape)
    x = tf.keras.layers.Dense(args.hidden_layer_size, activation='relu')(policy_in)
    policy_out = tf.keras.layers.Dense(env.action_space.n, activation='softmax')(x)
    self.policy = tf.keras.Model(policy_in, policy_out)
    self.policy.compile(optimizer=RAdamOptimizer(args.learning_rate),
                        loss=tf.keras.losses.SparseCategoricalCrossentropy())

    # Value network: one hidden ReLU layer, a single linear output.
    value_in = tf.keras.Input(shape=env.observation_space.shape)
    x = tf.keras.layers.Dense(args.hidden_layer_size, activation='relu')(value_in)
    value_out = tf.keras.layers.Dense(1)(x)
    self.value = tf.keras.Model(value_in, value_out)
    self.value.compile(optimizer=RAdamOptimizer(args.learning_rate),
                       loss=tf.keras.losses.MeanSquaredError())
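# --- Illustrative sketch, not part of the original source ---
# Sampling discrete actions from the categorical policy for a batch of states.
# `network` and `env` are assumed to be in scope, and numpy imported as `np`.
def _sample_actions_sketch(network, env, states):
    probs = network.policy(np.asarray(states, np.float32)).numpy()
    return np.array([np.random.choice(env.action_space.n, p=p) for p in probs])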
def fit(self,
        dishsize=[250, 150, 50, 20],
        misdishsize=[200, 100, 50, 20],
        glr=3e-6,
        dlr=4e-6):
    tf.reset_default_graph()
    self.X = tf.placeholder(tf.float32, [None, self.n_dim], name="X")
    self.missX = tf.placeholder(tf.float32, [None, self.raw_dim], name="missX")
    self.Z = tf.placeholder(tf.float32, [None, self.g_dim], name="Z")
    self.Conditions = tf.placeholder(tf.float32, [None, self.n_control], name="Condition")
    self.batch_size = tf.placeholder(tf.int32, None, name="BatchSize")
    self.is_training = tf.placeholder(tf.bool)

    ## Generate data.
    values = generator(self.Z, self.n_dim, self.Conditions, bn=True,
                       is_training=self.is_training,
                       onehot_key_store=self.onehot_key_store)
    out, G_sample, G_x, G_arg_x = values

    ## Predict the missingness pattern for the generated data.
    G_sample_stop = tf.stop_gradient(G_sample)
    G_miss = MissGenerator(G_sample_stop, self.raw_dim, self.Conditions, reuse=False)
    delta = tf.constant(0.5)

    ## In the real data, encode missing values as 1.5.
    imputedX = tf.where(tf.math.is_nan(self.X), tf.ones_like(self.X) * 1.5, self.X)

    ## Build a missing indicator by thresholding the generated missingness probabilities.
    miss_indicator = tf.where(G_miss > delta, tf.ones_like(G_miss), tf.zeros_like(G_miss))
    self.miss_indicator2, NumMissGenerator = MissGeneratorByVar(
        miss_indicator, self.overall_where)

    ## Apply the missingness to G_sample (probabilities) and G_x (one-hot).
    miss_G_sample = G_sample * (1 - self.miss_indicator2) \
        + tf.constant([1.5]) * NumMissGenerator
    self.miss_G_sample_eval = G_x * (1 - self.miss_indicator2) \
        + tf.constant([1.5]) * NumMissGenerator

    _, real_logit = discriminator(imputedX, self.Conditions, gpu_n=1,
                                  hsize=dishsize, reuse=False)
    _, fake_logit = discriminator(miss_G_sample, self.Conditions, gpu_n=1,
                                  hsize=dishsize, reuse=True)
    miss_real_logit = MissDiscriminator(self.missX, self.Conditions, gpu_n=0,
                                        hsize=misdishsize, reuse=False)
    miss_fake_logit = MissDiscriminator(G_miss, self.Conditions, gpu_n=0,
                                        hsize=misdishsize, reuse=True)

    _ = [tf.summary.histogram(i.name, i)
         for i in tf.get_collection("weight_variables")]

    ###########################################
    # Gradient penalty on random interpolates between real and generated samples.
    e = tf.random_uniform([self.batch_size, 1], 0, 1)
    x_hat = e * imputedX + (1 - e) * miss_G_sample
    grad = tf.gradients(
        discriminator(x_hat, self.Conditions, hsize=dishsize, reuse=True, gpu_n=1),
        x_hat)[0]
    slopes = tf.sqrt(1e-8 + tf.reduce_sum(tf.square(grad), axis=[1]))
    gradient_penalty = 5 * tf.reduce_mean((slopes - 1.) ** 2)

    loss_func = "gan-gp"  ## lsgan | agan | gan | gan-gp | dragan | hinge
    with tf.variable_scope("Discriminator_Loss"):
        with tf.variable_scope("Original_Loss"):
            self.disc_loss = discriminator_loss(Ra=True, loss_func=loss_func,
                                                real=real_logit, fake=fake_logit)
            self.disc_loss += gradient_penalty
        with tf.variable_scope("Indicator_Loss"):
            self.miss_disc_loss = discriminator_loss(Ra=True, loss_func=loss_func,
                                                     real=miss_real_logit,
                                                     fake=miss_fake_logit)
    # if loss_func in ["wgan-gp", "gan-gp"]:
    with tf.variable_scope("Generator_Loss"):
        with tf.variable_scope("Original_Loss"):
            self.gen_loss = generator_loss(Ra=True, loss_func=loss_func,
                                           real=real_logit, fake=fake_logit)
        with tf.variable_scope("Indicator_Loss"):
            self.miss_gen_loss = generator_loss(Ra=True, loss_func=loss_func,
                                                real=miss_real_logit,
                                                fake=miss_fake_logit)

    ######################################################################
    tf.summary.scalar("gradient_penalty_loss", gradient_penalty)
    tf.summary.scalar("disc_loss", self.disc_loss)
    tf.summary.scalar("miss_disc_loss", self.miss_disc_loss)
    tf.summary.scalar("generate_loss", self.gen_loss)
    tf.summary.scalar("miss_generate_loss", self.miss_gen_loss)

    t_vars = tf.trainable_variables()
    self.global_step = tf.get_variable('global_step', [],
                                       initializer=tf.constant_initializer(0),
                                       trainable=False)
    gen_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="GAN/Generator")
    disc_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="GAN/Discriminator")
    miss_gen_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="GAN/MissGenerator")
    miss_disc_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="GAN/MissDiscriminator")

    # self.gen_loss = gen_loss + miss_gen_loss
    # tf.train.RMSPropOptimizer
    glearning_rate = tf.train.exponential_decay(glr, self.global_step,
                                                decay_steps=100, decay_rate=0.999,
                                                staircase=False)
    dlearning_rate = tf.train.exponential_decay(dlr, self.global_step,
                                                decay_steps=100, decay_rate=0.999,
                                                staircase=False)
    with tf.variable_scope("Optimizer"):
        # Generator train steps (+ miss_gen_vars).
        self.gen_step = RAdamOptimizer(learning_rate=glearning_rate).minimize(
            self.gen_loss, var_list=gen_vars)
        self.miss_gen_step = RAdamOptimizer(learning_rate=glearning_rate).minimize(
            self.miss_gen_loss, var_list=miss_gen_vars)
        # Discriminator train steps (+ miss_disc_vars).
        self.disc_step = tf.train.RMSPropOptimizer(learning_rate=dlearning_rate).minimize(
            self.disc_loss, var_list=disc_vars)
        # miss_gen_step = RAdamOptimizer(learning_rate=learning_rate).minimize(
        #     miss_gen_loss, var_list=gen_vars)  # G Train step
        self.miss_disc_step = RAdamOptimizer(learning_rate=dlearning_rate).minimize(
            self.miss_disc_loss, var_list=miss_disc_vars)
    print("fitting!!")
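# --- Illustrative sketch, not part of the original source ---
# One possible training loop for the graph built above. `sample_Z` and
# `next_batch` are hypothetical helpers: `sample_Z(n, dim)` draws generator
# noise and `next_batch(n)` yields (X, missX, conditions) arrays matching
# the placeholders.
def train_sketch(self, n_steps, mb_size):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(n_steps):
            X_mb, missX_mb, cond_mb = next_batch(mb_size)
            feed = {self.X: X_mb, self.missX: missX_mb,
                    self.Conditions: cond_mb,
                    self.Z: sample_Z(mb_size, self.g_dim),
                    self.batch_size: mb_size, self.is_training: True}
            # Alternate discriminator and generator updates.
            sess.run([self.disc_step, self.miss_disc_step], feed_dict=feed)
            sess.run([self.gen_step, self.miss_gen_step], feed_dict=feed)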
def fit(self):
    self.update_dict_(self.env)
    self.update_dict_(self.__dict__)
    select_w_init = np.random.randint(0, 2, size=1)[0]
    seed_n = np.random.randint(1, 1000, size=1)[0]
    # self.patience = patience
    # self.cut_off = cutoff
    # self.ck_max_norm = max_norm
    # self.ck_SN = SN
    # self.Gact = Gact
    # self.Dact = Dact
    # self.epoch = epoch + 1
    # trainX, trainM = TrainSet
    # testX, testM = ValidSet
    # self.p_hint = hint
    # self.mb_size = mb_size
    # self.alpha = alpha
    self.relu_w_init = [
        tf.keras.initializers.he_uniform(seed=seed_n),
        tf.keras.initializers.he_normal(seed=seed_n)
    ][select_w_init]
    self.tanh_w_init = [
        tf.keras.initializers.glorot_normal(seed=seed_n),
        tf.keras.initializers.glorot_uniform(seed=seed_n)
    ][select_w_init]
    self.s_elu_w_init = [
        tf.keras.initializers.lecun_normal(seed=seed_n),
        tf.keras.initializers.lecun_uniform(seed=seed_n)
    ][select_w_init]
    self.nomal_w_init = tf.keras.initializers.truncated_normal(seed=seed_n)
    self.ck_max_norm = self.max_norm
    self.ck_SN = self.SN
    self.p_hint = self.hint
    weight_regularizer = self.weight_regularizer
    lr = self.lr
    trainX, trainM = self.TrainSet
    testX, testM = self.ValidSet
    self.trainX = typecheck(trainX)
    # The mask is (1 - missing), so 1 marks the observed (non-missing) entries.
    self.trainM = typecheck(1 - 1 * trainM)
    self.testX = typecheck(testX)
    self.testM = typecheck(1 - 1 * testM)
    self.total_X = np.concatenate((trainX, testX))
    self.total_M = np.concatenate((self.trainM, self.testM))
    self.Train_No, self.Dim = self.trainX.shape
    self.total = self.Dim

    ## Modeling.
    tf.reset_default_graph()
    self.define()

    ## M marks the non-missing entries: keep the real values where observed
    ## and use the generated values for the missing part.
    result = self.generator(self.New_X)
    Logit, G_sample, OnehotResult, ArgResult = result
    if self.fac_var == []:
        for v, col in enumerate(self.in_var):
            value = tf.slice(G_sample, [0, v], [-1, 1])
            # tf.summary.histogram("Input_" + col.replace(" ", "_"), value)
    else:
        self.ArgResult = tf.identity(ArgResult, name="Arg_G")
        for v, col in enumerate(self.in_var):
            value = tf.slice(ArgResult, [0, v], [-1, 1])
            # tf.summary.histogram("Input_" + col.replace(" ", "_"), value)

    Hat_New_X = self.M * self.New_X + (1 - self.M) * G_sample
    # imputed = self.New_X * (1 - self.M) + G_sample * self.M
    self.Hat_New_X = tf.identity(Hat_New_X, name="imputed")
    self.G_sample = tf.identity(G_sample, name="generated")

    # Discriminator
    D_prob = self.discriminator(Hat_New_X, self.H)
    t_vars = tf.trainable_variables()
    if weight_regularizer > 0:
        G_L2 = []
        D_L2 = []
        for v in t_vars:
            if re.search('Weight', v.name):
                if re.search("Generator", v.name):
                    print("G : ", v.name)
                    G_L2.append(tf.nn.l2_loss(v))
                elif re.search("Discriminator", v.name):
                    print("D : ", v.name)
                    D_L2.append(tf.nn.l2_loss(v))
        self.Generator_W_l2 = tf.add_n(G_L2) * weight_regularizer
        self.Discriminator_W_l2 = tf.add_n(D_L2) * weight_regularizer
    else:
        self.Generator_W_l2 = tf.constant(0.0)
        self.Discriminator_W_l2 = tf.constant(0.0)
    for var in t_vars:
        tf.summary.histogram(var.op.name, var)

    # Discriminator loss: cross-entropy on observed (M) vs. missing (1 - M) entries.
    self.D_1 = -self.M * tf.log(D_prob + 1e-8)
    self.D_2 = -(1 - self.M) * tf.log(1. - D_prob + 1e-8)
    self.D_3 = tf.reduce_mean(self.D_1 + self.D_2)
    self.D_loss = self.D_3 + self.Discriminator_W_l2
    ## Generator adversarial loss on the missing (1 - M) part.
    self.G_loss1 = -tf.reduce_mean((1 - self.M) * tf.log(D_prob + 1e-8))
    if self.fac_var == []:
        Logit = self.G_final_Act(Logit)
    else:
        pass
    self.MSE_train_loss = self.CatNumEmb_Loss(Logit, self.X, self.M, self.cond,
                                              self.key_cond, self.weight_info)
    # self.MSE_train_loss_2 = self.CatNumEmb_Loss(Logit, self.X, self.M, seg="Test")
    self.G_loss = self.G_loss1 + self.alpha * self.MSE_train_loss + self.Generator_W_l2
    # self.MSE_test_loss = tf.reduce_mean(
    #     tf.square((1 - self.M) * self.X - (1 - self.M) * G_sample))
    with tf.variable_scope("Original/Loss"):
        tf.summary.scalar("Total_G_loss", self.G_loss)
        tf.summary.scalar("Not_Missing_Loss", self.MSE_train_loss)
        tf.summary.scalar("D_Loss", self.D_loss)
    self.clip_all_weights = tf.get_collection("max_norm")

    gen_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="GAN/Generator")
    disc_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="GAN/Discriminator")
    self.global_step = tf.get_variable('global_step', [],
                                       initializer=tf.constant_initializer(0),
                                       trainable=False)
    self.lr = tf.train.cosine_decay_restarts(lr, self.global_step,
                                             first_decay_steps=100,
                                             t_mul=1.2, m_mul=0.95, alpha=0.5)
    self.D_solver = RAdamOptimizer(learning_rate=self.lr, beta1=0.5, beta2=0.5,
                                   weight_decay=0.0).minimize(self.D_loss,
                                                              var_list=disc_vars)
    self.G_solver = tf.train.RMSPropOptimizer(learning_rate=self.lr).minimize(
        self.G_loss, var_list=gen_vars)
    comment = "{}\n{}{}{}\n{}".format("=" * 56, " " * 24, "Model fitting",
                                      " " * 24, "=" * 56)
    print(comment)
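# --- Illustrative sketch, not part of the original source ---
# One alternating D/G update for the solvers defined above. `feed` is assumed
# to fill the placeholders created in `self.define()` (e.g. self.X, self.M,
# self.H, self.New_X).
def _train_step_sketch(self, sess, feed):
    _, d_loss = sess.run([self.D_solver, self.D_loss], feed_dict=feed)
    _, g_loss = sess.run([self.G_solver, self.G_loss], feed_dict=feed)
    sess.run(self.clip_all_weights)  # apply the collected max-norm clip ops
    return d_loss, g_loss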
class Network:
    def __init__(self, env, args):
        # TODO: Analogously to paac, your model should contain two components:
        # - actor, which predicts distribution over the actions
        # - critic, which predicts the value function
        #
        # The given states are tile encoded, so they are integral indices of
        # tiles intersecting the state. Therefore, you should convert them
        # to dense encoding (one-hot-like, with `args.tiles` ones).
        # (Or you can even use embeddings for better efficiency.)
        #
        # The actor computes `mus` and `sds`, each of shape [batch_size, actions].
        # Compute each independently using states as input, adding a fully connected
        # layer with `args.hidden_layer_size` units and ReLU activation. Then:
        # - For `mus`, add a fully connected layer with `actions` outputs.
        #   To avoid `mus` moving from the required range, you should apply
        #   properly scaled `tf.tanh` activation.
        # - For `sds`, add a fully connected layer with `actions` outputs
        #   and `tf.nn.softplus` activation.
        #
        # The critic should be a usual one, passing states through one hidden
        # layer with `args.hidden_layer_size` ReLU units and then predicting
        # the value function.

        # Actor: embed the tile indices, average them, and predict mus and sds.
        policy_in = tf.keras.Input(shape=(args.tiles,))
        x = tf.keras.layers.Embedding(env.observation_space.nvec[-1],
                                      args.hidden_layer_size,
                                      input_length=args.tiles)(policy_in)
        x = tf.keras.layers.GlobalAveragePooling1D(data_format="channels_last")(x)
        x = tf.keras.layers.Dense(args.hidden_layer_size, activation='relu')(x)
        # Scaled tanh keeps mus within the environment's action range.
        self.mu = tf.keras.layers.Dense(
            1, activation=lambda x: tf.constant(2.0) * tf.tanh(x))(x)
        self.sd = tf.keras.layers.Dense(
            1, activation=tf.keras.activations.softplus)(x)
        policy_out = tf.keras.layers.Concatenate()([self.mu, self.sd])
        self.actor = tf.keras.Model(policy_in, policy_out)
        self.policy_optimizer = RAdamOptimizer(args.learning_rate)

        # Critic: the same embedding trunk, predicting a single state value.
        value_in = tf.keras.Input(shape=(args.tiles,))
        x = tf.keras.layers.Embedding(env.observation_space.nvec[-1],
                                      args.hidden_layer_size,
                                      input_length=args.tiles)(value_in)
        x = tf.keras.layers.GlobalAveragePooling1D(data_format="channels_last")(x)
        x = tf.keras.layers.Dense(args.hidden_layer_size, activation='relu')(x)
        value_out = tf.keras.layers.Dense(1)(x)
        self.critic = tf.keras.Model(value_in, value_out)
        self.critic.compile(optimizer=RAdamOptimizer(args.learning_rate),
                            loss=tf.keras.losses.MeanSquaredError())

    @wrappers.typed_np_function(np.float32, np.float32, np.float32)
    @tf.function
    def train(self, states, actions, returns):
        # TODO: Run the model on given `states` and compute
        # sds, mus and predicted values. Then create `action_distribution` using
        # `tfp.distributions.Normal` class and computed mus and sds.
        # In PyTorch, the corresponding class is `torch.distributions.normal.Normal`.
        #
        # TODO: Compute total loss as a sum of three losses:
        # - negative log likelihood of the `actions` in the `action_distribution`
        #   (using the `log_prob` method). You then need to sum the log probabilities
        #   of actions in a single batch example (using `tf.math.reduce_sum` with `axis=1`).
        #   Finally multiply the resulting vector by (returns - predicted values)
        #   and compute its mean. Note that the gradient must not flow through
        #   the predicted values (you can use `tf.stop_gradient` if necessary).
        # - negative value of the distribution entropy (use `entropy` method of
        #   the `action_distribution`) weighted by `args.entropy_regularization`.
        # - mean square error of the `returns` and predicted values.
        with tf.GradientTape() as critic_tape:
            pred_values = self.critic(states)
            critic_loss = self.critic.loss(returns, pred_values)
        critic_grads = critic_tape.gradient(critic_loss,
                                            self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(
            zip(critic_grads, self.critic.trainable_variables))

        with tf.GradientTape() as policy_tape:
            pred_actions = self.actor(states)
            mus = pred_actions[:, 0]
            sds = pred_actions[:, 1]
            # mus = tf.clip_by_value(mus, clip_value_min=-1, clip_value_max=1)
            # sds = tf.clip_by_value(sds, clip_value_min=0, clip_value_max=1)
            action_distribution = tfp.distributions.Normal(mus, sds)
            # The advantage acts as a constant here: `pred_values` does not
            # depend on the actor's variables, so no gradient flows through it.
            advantage = returns - pred_values[:, 0]
            nll = -action_distribution.log_prob(actions[:, 0])
            loss = nll * advantage
            policy_loss = tf.math.reduce_mean(loss)
            # Entropy penalization (entropy of a Normal is log(sd) up to a constant).
            entropy = tf.math.reduce_mean(tf.math.log(sds))
            # policy_loss -= args.beta * entropy
        policy_grad = policy_tape.gradient(policy_loss,
                                           self.actor.trainable_variables)
        self.policy_optimizer.apply_gradients(
            zip(policy_grad, self.actor.trainable_variables))

    @wrappers.typed_np_function(np.float32)
    @tf.function
    def predict_actions(self, states):
        # TODO: Return predicted action distributions (mus and sds).
        mus_sds = tf.transpose(self.actor(states), (1, 0))
        # return tf.clip_by_value(mus_sds[0], -1, 1), tf.clip_by_value(mus_sds[1], 0, 1)
        return mus_sds

    @wrappers.typed_np_function(np.float32)
    @tf.function
    def predict_values(self, states):
        # TODO: Return predicted state-action values.
        return self.critic(states)[:, 0]
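# --- Illustrative sketch, not part of the original source ---
# The three-term loss described in the TODO of `train`, written out explicitly
# (assumes an `args.entropy_regularization` argument; variable names follow
# the body of `train`):
#
#     policy_loss = tf.reduce_mean(nll * tf.stop_gradient(returns - pred_values[:, 0])) \
#         - args.entropy_regularization * tf.reduce_mean(action_distribution.entropy())
#
# The third term (MSE of `returns` vs. predicted values) is already covered by
# the separate critic update in `train` above.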
def create_graph(self):
    """Creates graph for training"""
    RSE_network.is_training = True
    self.base_cost = 0.0
    self.accuracy = 0
    num_sizes = len(self.bins)
    self.cost_list = []
    sum_weight = 0
    self.bin_losses = []
    saturation_loss = []
    total_mean_loss = 0

    # Create all bins and calculate losses for them
    with vs.variable_scope("var_lengths"):
        for seqLength, itemCount, ind in zip(self.bins, self.count_list,
                                             range(num_sizes)):
            x_in = tf.compat.v1.placeholder(cnf.input_type, [itemCount, seqLength])
            y_in = tf.compat.v1.placeholder("int64", [itemCount, seqLength])
            self.x_input.append(x_in)
            self.y_input.append(y_in)
            RSE_network.saturation_costs = []
            RSE_network.gate_mem = []
            RSE_network.reset_mem = []
            RSE_network.candidate_mem = []
            RSE_network.prev_mem_list = []
            RSE_network.residual_list = []
            RSE_network.info_alpha = []

            if self.use_two_gpus:
                device = "/device:GPU:" + ("0" if seqLength >= self.bins[-1] else "1")
                with tf.device(device):
                    c, a, mem1, logits, per_item_cost, _, _ = self.create_loss(
                        x_in, y_in, seqLength)
            else:
                c, a, mem1, logits, per_item_cost, _, _ = self.create_loss(
                    x_in, y_in, seqLength)

            weight = 1.0
            sat_cost = (tf.add_n(RSE_network.saturation_costs)
                        / (seqLength * len(RSE_network.saturation_costs) * itemCount)
                        if len(RSE_network.saturation_costs) > 0 else 0)
            saturation_loss.append(sat_cost * weight)
            self.bin_losses.append(per_item_cost)
            self.base_cost += c * weight
            sum_weight += weight
            self.accuracy += a
            self.cost_list.append(c)
            mean_loss = tf.reduce_mean(input_tensor=tf.square(mem1))
            total_mean_loss += mean_loss
            tf.compat.v1.get_variable_scope().reuse_variables()

    # calculate the total loss
    self.base_cost /= sum_weight
    self.accuracy /= num_sizes
    total_mean_loss /= num_sizes
    tf.compat.v1.summary.scalar("base/loss", self.base_cost)
    tf.compat.v1.summary.scalar("base/error", 1 - self.accuracy)
    tf.compat.v1.summary.scalar("base/error_longest", 1 - a)
    tf.compat.v1.summary.histogram("logits", logits)

    if cnf.task != "musicnet":
        if RSE_network.gate_mem:
            gate_img = tf.stack(RSE_network.gate_mem)
            gate_img = gate_img[:, 0:1, :, :]
            gate_img = tf.cast(gate_img * 255, dtype=tf.uint8)
            tf.compat.v1.summary.image(
                "gate", tf.transpose(a=gate_img, perm=[3, 0, 2, 1]), max_outputs=16)
        if RSE_network.reset_mem:
            reset_img = tf.stack(RSE_network.reset_mem)
            reset_img = tf.clip_by_value(reset_img, -2, 2)
            tf.compat.v1.summary.histogram("reset", reset_img)
            reset_img = reset_img[:, 0:1, :, :]
            tf.compat.v1.summary.image(
                "reset", tf.transpose(a=reset_img, perm=[3, 0, 2, 1]), max_outputs=16)
        if RSE_network.prev_mem_list:
            prev_img = tf.stack(RSE_network.prev_mem_list)
            prev_img = prev_img[:, 0:1, :, :]
            prev_img = tf.cast(prev_img * 255, dtype=tf.uint8)
            tf.compat.v1.summary.image(
                "prev_mem", tf.transpose(a=prev_img, perm=[3, 0, 2, 1]), max_outputs=16)
        if RSE_network.residual_list:
            prev_img = tf.stack(RSE_network.residual_list)
            prev_img = prev_img[:, 0:1, :, :]
            prev_img = tf.cast(prev_img * 255, dtype=tf.uint8)
            tf.compat.v1.summary.image(
                "residual_mem", tf.transpose(a=prev_img, perm=[3, 0, 2, 1]),
                max_outputs=16)
        if RSE_network.info_alpha:
            prev_img = tf.stack(RSE_network.info_alpha)
            prev_img = prev_img[:, 0:1, :, :]
            tf.compat.v1.summary.image(
                "info_alpha", tf.transpose(a=prev_img, perm=[3, 0, 2, 1]),
                max_outputs=16)
        candidate_img = tf.stack(RSE_network.candidate_mem)
        candidate_img = candidate_img[:, 0:1, :, :]
        candidate_img = tf.cast((candidate_img + 1.0) * 127.5, dtype=tf.uint8)
        tf.compat.v1.summary.image(
            "candidate", tf.transpose(a=candidate_img, perm=[3, 0, 2, 1]),
            max_outputs=16)
        mem1 = mem1[:, 0:1, :, :]
        tf.compat.v1.summary.image(
            "mem", tf.transpose(a=mem1, perm=[3, 0, 2, 1]), max_outputs=16)

    saturation = tf.reduce_sum(input_tensor=tf.stack(saturation_loss)) / sum_weight
    tf.compat.v1.summary.scalar("base/activation_mean", tf.sqrt(total_mean_loss))

    self.sat_loss = saturation * self.saturation_weight
    cost = self.base_cost + self.sat_loss

    tvars = [v for v in tf.compat.v1.trainable_variables()]
    for var in tvars:
        name = var.name.replace("var_lengths", "")
        tf.compat.v1.summary.histogram(name + "/histogram", var)

    # Decoupled weight decay is applied only to the convolution kernels ("CvK").
    regvars = [var for var in tvars if "CvK" in var.name]
    print(regvars)
    reg_costlist = [tf.reduce_sum(input_tensor=tf.square(var)) for var in regvars]
    reg_cost = tf.add_n(reg_costlist)
    tf.compat.v1.summary.scalar("base/regularize_loss", reg_cost)

    # optimizer
    self.local_lr = self.learning_rate
    optimizer = RAdamOptimizer(
        self.local_lr,
        epsilon=1e-5,
        L2_decay=0.01,
        L1_decay=0.00,
        decay_vars=regvars,
        total_steps=cnf.training_iters,
        warmup_proportion=cnf.num_warmup_steps / cnf.training_iters,
        clip_gradients=True,
    )
    self.optimizer = optimizer.minimize(cost, global_step=self.global_step)

    # some values for printout
    max_vals = []
    for var in tvars:
        var_v = optimizer.get_slot(var, "v")
        max_vals.append(tf.sqrt(var_v))
    self.gnorm = tf.linalg.global_norm(max_vals)
    tf.compat.v1.summary.scalar("base/gnorm", self.gnorm)
    self.cost_list = tf.stack(self.cost_list)
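# --- Illustrative sketch, not part of the original source ---
# Running one optimization step: feed one batch per bin, aligned with the
# placeholder lists built in create_graph. `batches` is a hypothetical list
# of (x, y) NumPy array pairs, one per bin.
def train_step_sketch(self, sess, batches):
    feed = {}
    for x_ph, y_ph, (x, y) in zip(self.x_input, self.y_input, batches):
        feed[x_ph] = x
        feed[y_ph] = y
    _, cost = sess.run([self.optimizer, self.base_cost], feed_dict=feed)
    return cost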
def load_model(self):
    # placeholders
    self.x = tf.compat.v1.placeholder(tf.int32, shape=[self.batch_size, None])
    self.y = tf.compat.v1.placeholder(tf.int32, shape=[self.batch_size, None])
    self.mems_i = [
        tf.compat.v1.placeholder(tf.float32,
                                 [self.mem_len, self.batch_size, self.d_model])
        for _ in range(self.n_layer)
    ]
    # model
    self.global_step = tf.compat.v1.train.get_or_create_global_step()
    initializer = tf.compat.v1.keras.initializers.glorot_normal()
    proj_initializer = tf.compat.v1.keras.initializers.glorot_normal()
    with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()):
        xx = tf.transpose(self.x, [1, 0])
        yy = tf.transpose(self.y, [1, 0])
        loss, self.logits, self.new_mem = modules.transformer(
            dec_inp=xx,
            target=yy,
            mems=self.mems_i,
            n_token=self.n_token,
            n_layer=self.n_layer,
            d_model=self.d_model,
            d_embed=self.d_embed,
            n_head=self.n_head,
            d_head=self.d_head,
            d_inner=self.d_ff,
            dropout=self.dropout,
            dropatt=self.dropout,
            initializer=initializer,
            proj_initializer=proj_initializer,
            is_training=self.is_training,
            mem_len=self.mem_len,
            rezero=self.rezero,
            cutoffs=[],
            div_val=-1,
            tie_projs=[],
            same_length=False,
            clamp_len=-1,
            input_perms=None,
            target_perms=None,
            head_target=None,
            untie_r=False,
            proj_same_dim=True)

    # The averaged loss drives the gradients (`loss` is per-token).
    self.avg_loss = tf.reduce_mean(loss)
    variables = tf.compat.v1.trainable_variables()
    grads = tf.gradients(self.avg_loss, variables)
    grads_and_vars = list(zip(grads, variables))

    # vars
    decay_lr = tf.compat.v1.train.cosine_decay(
        self.learning_rate,
        global_step=self.global_step,
        decay_steps=400000,
        alpha=0.004)
    optimizer = RAdamOptimizer(decay_lr)
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
    self.train_op = optimizer.apply_gradients(grads_and_vars, self.global_step)

    # saver
    self.saver = tf.compat.v1.train.Saver()
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    config.graph_options.optimizer_options.global_jit_level = \
        tf.compat.v1.OptimizerOptions.ON_1
    self.sess = tf.compat.v1.Session(config=config)
    self.saver.restore(self.sess, self.checkpoint_path)
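# --- Illustrative sketch, not part of the original source ---
# One forward step through the restored model, carrying the Transformer-XL
# recurrence memory between calls. `tokens` is a hypothetical
# [batch_size, seq_len] int array; targets are fed with the same tokens only
# to satisfy the placeholder. Assumes numpy is imported as `np`.
def _forward_sketch(self, tokens, mems=None):
    if mems is None:
        mems = [np.zeros((self.mem_len, self.batch_size, self.d_model), np.float32)
                for _ in range(self.n_layer)]
    feed = {self.x: tokens, self.y: tokens}
    for ph, m in zip(self.mems_i, mems):
        feed[ph] = m
    logits, new_mem = self.sess.run([self.logits, self.new_mem], feed_dict=feed)
    return logits, new_mem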
def create_graph(self):
    """Creates graph for training"""
    self.cost = 0.0
    self.accuracy = 0
    num_sizes = len(self.bins)
    self.cost_list = []
    self.bin_losses = []

    # Create all bins and calculate losses for them
    with vs.variable_scope("var_lengths"):
        for seqLength, itemCount, ind in zip(self.bins, self.count_list,
                                             range(num_sizes)):
            x_in = tf.placeholder("int64", [itemCount, seqLength])
            y_in = tf.placeholder("int64", [itemCount, seqLength])
            self.x_input.append(x_in)
            self.y_input.append(y_in)
            network.saturation_costs = []
            network.gate_mem = []
            network.reset_mem = []
            network.candidate_mem = []
            network.prev_mem_list = []

            if self.use_two_gpus:
                device = "/device:GPU:" + ("0" if seqLength >= self.bins[-1] else "1")
                with tf.device(device):
                    c, a, mem1, _, perItemCost, _ = self.create_loss(
                        x_in, y_in, seqLength)
            else:
                c, a, mem1, _, perItemCost, _ = self.create_loss(
                    x_in, y_in, seqLength)
            # /seqLength
            self.bin_losses.append(perItemCost)
            self.cost += c
            self.accuracy += a
            self.cost_list.append(c)
            tf.get_variable_scope().reuse_variables()

    # calculate the total loss
    self.cost /= num_sizes
    self.accuracy /= num_sizes

    # tensorboard output
    tf.summary.scalar("base/loss", self.cost)
    tf.summary.scalar("base/accuracy", self.accuracy)
    tf.summary.scalar("base/accuracy_longest", a)

    gate_img = tf.stack(network.gate_mem)
    gate_img = gate_img[:, 0:1, :, :]
    gate_img = tf.cast(gate_img * 255, dtype=tf.uint8)
    tf.summary.image("gate", tf.transpose(gate_img, [3, 0, 2, 1]), max_outputs=16)

    reset_img = tf.stack(network.reset_mem)
    reset_img = reset_img[:, 0:1, :, :]
    reset_img = tf.cast(reset_img * 255, dtype=tf.uint8)
    tf.summary.image("reset", tf.transpose(reset_img, [3, 0, 2, 1]), max_outputs=16)

    if network.prev_mem_list:
        prev_img = tf.stack(network.prev_mem_list)
        prev_img = prev_img[:, 0:1, :, :]
        prev_img = tf.cast(prev_img * 255, dtype=tf.uint8)
        tf.summary.image("prev_mem", tf.transpose(prev_img, [3, 0, 2, 1]),
                         max_outputs=16)

    candidate_img = tf.stack(network.candidate_mem)
    candidate_img = candidate_img[:, 0:1, :, :]
    candidate_img = tf.cast((candidate_img + 1.0) * 127.5, dtype=tf.uint8)
    tf.summary.image("candidate", tf.transpose(candidate_img, [3, 0, 2, 1]),
                     max_outputs=16)

    mem1 = mem1[:, 0:1, :, :]
    tf.summary.image("mem", tf.transpose(mem1, [3, 0, 2, 1]), max_outputs=16)

    tvars = tf.trainable_variables()
    for var in tvars:
        name = var.name.replace("var_lengths", "")
        tf.summary.histogram(name + '/histogram', var)

    # we use a small L2 regularization, although it is questionable if it helps
    regularizable_vars = [var for var in tvars if "CvK" in var.name]
    reg_costlist = [tf.reduce_sum(tf.square(var)) for var in regularizable_vars]
    reg_cost = tf.add_n(reg_costlist)
    tf.summary.scalar("base/regularize_loss", reg_cost)

    # Adam optimizer works as well
    optimizer = RAdamOptimizer(self.learning_rate,
                               epsilon=1e-5,
                               L2_decay=0.01,
                               decay_vars=regularizable_vars,
                               total_steps=cnf.training_iters,
                               warmup_proportion=0.0)
    self.optimizer = optimizer.minimize(self.cost, global_step=self.global_step)

    # some values for printout
    max_vals = []
    for var in tvars:
        varV = optimizer.get_slot(var, "v")
        max_vals.append(varV)
    self.gnorm = tf.global_norm(max_vals)
    self.cost_list = tf.stack(self.cost_list)
folder_best_model = args.model_path
name_best_model = os.path.join(folder_best_model, 'best')
dataset_path = args.dataset
loader = Loader.Loader(dataFolderPath=dataset_path, n_classes=n_classes,
                       problemType='segmentation', width=width, height=height,
                       channels=channels_image, channels_events=channels_events)
if not os.path.exists(folder_best_model):
    os.makedirs(folder_best_model)

# build model and optimizer
model = Segception.Segception_small(num_classes=n_classes, weights=None,
                                    input_shape=(None, None, channels))

# optimizer
learning_rate = tfe.Variable(lr)
# optimizer = tf.train.AdamOptimizer(learning_rate)
optimizer = RAdamOptimizer(learning_rate)

# Init models (optional, just for get_params function)
init_model(model, input_shape=(batch_size, width, height, channels))

variables_to_restore = model.variables  # [x for x in model.variables if 'block1_conv1' not in x.name]
variables_to_save = model.variables
variables_to_optimize = model.variables

# Init saver. Can also use:
# ckpt = tfe.Checkpoint(model=model, optimizer=optimizer, learning_rate=learning_rate, global_step=global_step)
saver_model = tfe.Saver(var_list=variables_to_save)
restore_model = tfe.Saver(var_list=variables_to_restore)

# restore if model saved and show number of params
# restore_state(restore_model, name_best_model)
get_params(model)
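# --- Illustrative sketch, not part of the original source ---
# One eager-mode training step with the RAdam optimizer above. `x`, `y` and
# `loss_fn` are hypothetical (e.g. a batch from `loader` and a softmax
# cross-entropy loss).
def train_step_sketch(x, y):
    with tf.GradientTape() as tape:
        outputs = model(x, training=True)
        loss = loss_fn(y, outputs)
    grads = tape.gradient(loss, variables_to_optimize)
    optimizer.apply_gradients(zip(grads, variables_to_optimize))
    return loss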