def actor_optimizer(self):
    action = K.placeholder(shape=[None, self.action_size])
    advantages = K.placeholder(shape=[None, ])
    policy = self.actor.output

    good_prob = K.sum(action * policy, axis=1)
    eligibility = K.log(good_prob + 1e-10) * advantages
    actor_loss = -K.mean(eligibility)

    entropy = K.mean(policy * K.log(policy + 1e-10), axis=1)
    entropy = K.mean(entropy)

    loss = actor_loss + 0.01 * entropy
    optimizer = RMSprop(lr=self.actor_lr, rho=0.99, epsilon=0.00001,
                        decay=0.99, clipnorm=0.5)
    # optimizer = Adam(lr=self.actor_lr)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
    train = K.function([self.actor.input, action, advantages], [loss],
                       updates=updates)
    return train
def actor_optimizer(self): """ This method updates actor network (policy network) """ action = K.placeholder(shape=[None, self.action_size]) advantages = K.placeholder(shape=[ None, ]) policy = self.actor.output action_prob = K.sum(action * policy, axis=1) # Cross entropy loss function about policy cross_entropy = K.log(action_prob + 1e-10) * advantages cross_entropy = -K.sum(cross_entropy) # Remained actor network agents should continuously interact environment # This is entropy loss function for continuous exploration entropy = K.sum(policy * K.log(policy + 1e-10), axis=1) entropy = K.sum(entropy) # You create final loss function by adding two loss functions loss = cross_entropy + 0.01 * entropy optimizer = RMSprop(lr=self.actor_lr, rho=0.99, epsilon=0.01) updates = optimizer.get_updates(self.actor.trainable_weights, [], loss) train = K.function([self.actor.input, action, advantages], [loss], updates=updates) return train
def optimizer(self):
    a = K.placeholder(shape=(None, ), dtype='int32')
    y = K.placeholder(shape=(None, ), dtype='float32')

    # the output tensor for the state-action pairs
    py_x = self.q_duelling_part.output

    a_one_hot = K.one_hot(a, 3)
    q_value = K.sum(py_x * a_one_hot, axis=1)

    error = K.abs(y - q_value)
    quadratic_part = K.clip(error, 0.0, 1.0)
    linear_part = error - quadratic_part
    loss = K.mean(0.5 * K.square(quadratic_part) + linear_part)

    optimizer = RMSprop(lr=0.00025, epsilon=0.01)
    updates = optimizer.get_updates(self.q_duelling_part.trainable_weights, [], loss)
    train = K.function([self.q_duelling_part.input, a, y], [loss], updates=updates)
    return train
def actor_optimizer(self):
    action = K.placeholder(shape=[None, self.action_size])
    advantages = K.placeholder(shape=[None, ])
    policy = self.actor.output

    # actor loss
    good_prob = K.sum(action * policy, axis=1)
    eligibility = K.log(good_prob + 1e-10) * advantages
    actor_loss = -K.sum(eligibility)

    # entropy loss to encourage exploration
    entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)  # shape (None,)
    entropy = K.sum(entropy)  # scalar

    # total loss
    loss = actor_loss + 0.01 * entropy

    optimizer = RMSprop(lr=self.actor_lr, rho=0.99, epsilon=0.01)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
    train = K.function([self.actor.input, action, advantages], [loss],
                       updates=updates)
    return train
def optimizer(self):
    a = K.placeholder(shape=(None, ), dtype='int32')
    y = K.placeholder(shape=(None, ), dtype='float32')

    prediction = self.model.output

    a_one_hot = K.one_hot(a, self.action_size)
    q_value = K.sum(prediction * a_one_hot, axis=1)

    error = K.abs(y - q_value)
    quadratic_part = K.clip(error, 0.0, 1.0)
    linear_part = error - quadratic_part
    loss = K.mean(0.5 * K.square(quadratic_part) + linear_part)

    optimizer = RMSprop(lr=0.00025, epsilon=0.01)
    updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
    train = K.function([self.model.input, a, y], [loss], updates=updates)
    return train
def actor_optimizer(self):
    action = K.placeholder(shape=[None, self.action_size])
    advantages = K.placeholder(shape=[None, ])
    policy = self.actor.output

    # Policy cross-entropy loss
    action_prob = K.sum(action * policy, axis=1)
    cross_entropy = K.log(action_prob + 1e-10) * advantages
    cross_entropy = -K.sum(cross_entropy)

    # Entropy loss for steady exploration
    entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)
    entropy = K.sum(entropy)

    # Final loss: the sum of the two terms
    loss = cross_entropy + 0.01 * entropy

    optimizer = RMSprop(lr=self.actor_lr, rho=0.99, epsilon=0.01)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
    train = K.function([self.actor.input, action, advantages], [loss],
                       updates=updates)
    return train
def actor_optimizer(self):
    action = K.placeholder(shape=[None, self.action_size])
    advantages = K.placeholder(shape=[None, ])
    policy = self.actor.output

    # Policy cross-entropy loss
    action_prob = K.sum(action * policy, axis=1)
    cross_entropy = K.log(action_prob + 1e-10) * advantages
    cross_entropy = -K.sum(cross_entropy)

    # Entropy loss for continued exploration
    entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)
    entropy = K.sum(entropy)

    # Final loss combining both terms
    loss = cross_entropy + 0.01 * entropy

    optimizer = RMSprop(lr=self.actor_lr, rho=0.99, epsilon=0.01, clipnorm=40)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
    train = K.function([self.actor.input, action, advantages], [loss],
                       updates=updates)
    return train
def actor_optimizer(self):
    action = K.placeholder(shape=[None, self.action_size])
    advantages = K.placeholder(shape=[None, ])
    policy = self.actor.output

    # Policy cross-entropy loss
    action_prob = K.sum(action * policy, axis=1)
    cross_entropy = K.log(action_prob + 1e-10) * advantages
    cross_entropy = -K.sum(cross_entropy)

    # Entropy term to keep exploring
    entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)
    entropy = K.sum(entropy)

    # Add the two terms to form the final loss
    loss = cross_entropy + 0.01 * entropy

    optimizer = RMSprop(lr=self.actor_lr, rho=0.99, epsilon=0.01)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
    train = K.function([self.actor.input, action, advantages], [loss],
                       updates=updates)
    return train
def optimizer(self):
    a = K.placeholder(shape=(None, ), dtype='int32')
    y = K.placeholder(shape=(None, ), dtype='float32')

    py_x = self.model.output  # the model outputs one Q value per action

    a_one_hot = K.one_hot(a, self.action_size)
    q_value = K.sum(py_x * a_one_hot, axis=1)  # select the Q value of the taken action

    # Huber-style loss: quadratic for errors below 1.0, linear above
    error = K.abs(y - q_value)
    quadratic_part = K.clip(error, 0.0, 1.0)
    linear_part = error - quadratic_part
    loss = K.mean(0.5 * K.square(quadratic_part) + linear_part)

    optimizer = RMSprop(lr=0.00025, epsilon=0.01)
    # get_updates(params, constraints, loss) returns the list of update ops
    updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
    # K.function(inputs, outputs, updates): each call applies one gradient step
    # and returns the loss so it can be monitored
    train = K.function([self.model.input, a, y], [loss], updates=updates)
    return train
def wgan_loss(inputshape, noiseshape, generator, discriminator, K):
    opt = RMSprop(lr=5e-5, clipvalue=0.01)

    realimg = Input(shape=inputshape)
    noise = Input(shape=noiseshape)
    fakeimg = generator(noise)

    d_real = discriminator(realimg)
    d_fake = discriminator(fakeimg)

    # Critic loss: maximize D(real) - D(fake), i.e. minimize its negation
    d_loss1 = K.mean(d_real, axis=-1)
    d_loss2 = K.mean(d_fake, axis=-1)
    d_loss = -d_loss1 + d_loss2
    d_training_updates = opt.get_updates(discriminator.trainable_weights, [], d_loss)
    d_train = K.function([realimg, noise], [d_loss], d_training_updates)

    # Generator loss: maximize D(fake)
    g_loss = -K.mean(d_fake, axis=-1)
    g_training_updates = opt.get_updates(generator.trainable_weights, [], g_loss)
    g_train = K.function([noise], [g_loss], g_training_updates)

    return d_train, g_train
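# Hedged usage sketch (not part of the original snippet): a minimal WGAN training
# step built on the `d_train` / `g_train` functions returned above. `sample_real`,
# `batch_size`, `noiseshape`, and `n_critic` are assumed names; the critic is
# usually updated several times per generator update.
import numpy as np

def wgan_train_step(d_train, g_train, sample_real, batch_size, noiseshape, n_critic=5):
    for _ in range(n_critic):
        real_batch = sample_real(batch_size)
        noise = np.random.normal(0.0, 1.0, size=(batch_size,) + tuple(noiseshape))
        d_loss, = d_train([real_batch, noise])
    noise = np.random.normal(0.0, 1.0, size=(batch_size,) + tuple(noiseshape))
    g_loss, = g_train([noise])
    return d_loss, g_loss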
def _build_model(graph, state_size, skip_frames, action_size, learning_rate):
    __keras_imports()
    INPUT_SHAPE = (state_size, )  # input image size to model
    ACTION_SIZE = action_size

    # With the functional API we need to define the inputs.
    LInput = layers.Input(INPUT_SHAPE, name='inputs')

    # "The final hidden layer is fully-connected and consists of 256 rectifier units."
    h1 = layers.Dense(64, activation='relu')(LInput)
    h2 = layers.Dense(128, activation='relu')(h1)

    rms_opt = RMSprop(lr=learning_rate, epsilon=0.1, rho=0.99)

    x1 = layers.Dense(128, activation='relu')(h2)
    x2 = layers.Dense(128, activation='relu')(h2)

    # "The output layer is a fully-connected linear layer with a single output
    # for each valid action."
    output_actions = layers.Dense(ACTION_SIZE, activation='softmax', name='out1')(x1)
    output_value = layers.Dense(1, activation='linear', name='out2')(x2)

    pmodel = Model(inputs=[LInput], outputs=[output_actions])
    vmodel = Model(inputs=[LInput], outputs=[output_value])

    action_pl = K.placeholder(shape=(None, action_size))
    advantages_pl = K.placeholder(shape=(None, ))
    discounted_r = K.placeholder(shape=(None, ))

    weighted_actions = K.sum(action_pl * pmodel.output, axis=1)
    eligibility = K.log(weighted_actions + 1e-10) * K.stop_gradient(advantages_pl)
    entropy = K.sum(pmodel.output * K.log(pmodel.output + 1e-10), axis=1)
    ploss = 0.001 * entropy - K.sum(eligibility)
    updates = rms_opt.get_updates(pmodel.trainable_weights, [], ploss)
    optimizer = K.function([pmodel.input, action_pl, advantages_pl], [], updates=updates)

    closs = K.mean(K.square(discounted_r - vmodel.output))
    updates2 = rms_opt.get_updates(vmodel.trainable_weights, [], closs)
    optimizer2 = K.function([vmodel.input, discounted_r], [], updates=updates2)

    return (pmodel, vmodel, optimizer, optimizer2)
def _compile_learning(self):
    # Tensor Variables
    s = K.placeholder(shape=tuple([None] + [self.history_len] + self.state_shape))
    a = K.placeholder(ndim=1, dtype='int32')
    r = K.placeholder(ndim=1, dtype='float32')
    s2 = K.placeholder(shape=tuple([None] + [self.history_len] + self.state_shape))
    t = K.placeholder(ndim=1, dtype='float32')

    # Q(s, a)
    q = self.network(s / self.normalize)
    preds = slice_tensor_tensor(q, a)

    # r + (1 - t) * gamma * max_a(Q'(s'))
    q2 = self.target_network(s2 / self.normalize)
    if self.ddqn:
        q2_net = K.stop_gradient(self.network(s2 / self.normalize))
        a_max = K.argmax(q2_net, axis=1)
        q2_max = slice_tensor_tensor(q2, a_max)
    else:
        q2_max = K.max(q2, axis=1)

    # over-estimation correction
    if len(self.bootstrap_corr) > 0:
        q2_max -= (q2_max - np.float32(self.bootstrap_corr[1])) * (q2_max > self.bootstrap_corr[1])
        q2_max -= (q2_max - np.float32(self.bootstrap_corr[0])) * (q2_max < self.bootstrap_corr[0])

    targets = r + (np.float32(1) - t) * self.gamma * q2_max

    # Loss and Updates
    cost = clipped_sum_error(y_true=targets, y_pred=preds)
    optimizer = RMSprop(lr=self.learning_rate, rho=.95, epsilon=1e-7)
    updates = optimizer.get_updates(params=self.network.trainable_weights, loss=cost, constraints={})

    # Update Target Network
    target_updates = []
    for target_weight, network_weight in zip(self.target_network.trainable_weights,
                                             self.network.trainable_weights):
        target_updates.append(K.update(target_weight, network_weight))

    # Compiled Functions
    self._train_on_batch = K.function(inputs=[s, a, r, s2, t], outputs=[cost], updates=updates)
    self.predict_network = K.function(inputs=[s], outputs=[q])
    self.predict_target = K.function(inputs=[s2], outputs=[q2])
    self.update_weights = K.function(inputs=[], outputs=[], updates=target_updates)
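# Hedged usage sketch (not part of the original snippet), assuming `agent` is an
# instance of the class above after `_compile_learning()` has run, and that
# `replay.sample(batch_size)` (hypothetical) returns numpy arrays (s, a, r, s2, t)
# matching the compiled placeholders.
def dqn_training_step(agent, replay, batch_size, step, target_update_freq=10000):
    s, a, r, s2, t = replay.sample(batch_size)
    # One gradient step on the clipped TD error
    cost, = agent._train_on_batch([s, a, r, s2, t])
    # Periodically copy the online weights into the target network
    if step % target_update_freq == 0:
        agent.update_weights([])
    return cost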
def __update_critic__(self):
    discounted_rewards = K.placeholder(shape=(None, ))
    value = self.critic_.output
    loss = K.mean(K.square(discounted_rewards - value))

    gradient = RMSprop()
    updates = gradient.get_updates(self.critic_.trainable_weights, [], loss)
    train = K.function([self.critic_.input, discounted_rewards],
                       [self.critic_.output], updates=updates)
    return train
def critic_optimizer(self):
    discounted_reward = K.placeholder(shape=(None, ))
    value = self.critic.output
    loss = K.mean(K.square(discounted_reward - value))

    optimizer = RMSprop(lr=self.critic_lr, rho=0.99, epsilon=0.01)
    updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
    train = K.function([self.critic.input, discounted_reward], [loss],
                       updates=updates)
    return train
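# Hedged usage sketch (not part of the original snippet): one common way to build
# the `discounted_reward` targets that the compiled critic function above expects.
# `critic_updater = agent.critic_optimizer()` and `gamma` are assumed names.
import numpy as np

def discounted_returns(rewards, gamma):
    # Backward accumulation: G_t = r_t + gamma * G_{t+1}
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for i in reversed(range(len(rewards))):
        running = rewards[i] + gamma * running
        returns[i] = running
    return returns

# Example call:
# loss, = critic_updater([states, discounted_returns(rewards, gamma)])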
def _compile_learning(self):
    s = K.placeholder(shape=tuple([None] + [self.history_len] + self.state_shape))
    a = K.placeholder(ndim=1, dtype='int32')
    r = K.placeholder(ndim=2, dtype='float32')
    s2 = K.placeholder(shape=tuple([None] + [self.history_len] + self.state_shape))
    t = K.placeholder(ndim=1, dtype='float32')

    updates = []
    costs = 0
    qs = []
    q2s = []
    for i in range(len(self.networks)):
        local_s = s
        local_s2 = s2
        if self.remove_features:
            local_s = self._remove_features(local_s, i)
            local_s2 = self._remove_features(local_s2, i)
        qs.append(self.networks[i](local_s))
        q2s.append(self.target_networks[i](local_s2))
        if self.use_hra:
            cost = self._compute_cost(qs[-1], a, r[:, i], t, q2s[-1])
            optimizer = RMSprop(lr=self.learning_rate, rho=.95, epsilon=1e-7)
            updates += optimizer.get_updates(params=self.networks[i].trainable_weights,
                                             loss=cost, constraints={})
            costs += cost

    if not self.use_hra:
        q = sum(qs)
        q2 = sum(q2s)
        summed_reward = K.sum(r, axis=-1)
        cost = self._compute_cost(q, a, summed_reward, t, q2)
        optimizer = RMSprop(lr=self.learning_rate, rho=.95, epsilon=1e-7)
        updates += optimizer.get_updates(params=self.all_params, loss=cost, constraints={})
        costs += cost

    target_updates = []
    for network, target_network in zip(self.networks, self.target_networks):
        for target_weight, network_weight in zip(target_network.trainable_weights,
                                                 network.trainable_weights):
            target_updates.append(K.update(target_weight, network_weight))

    self._train_on_batch = K.function(inputs=[s, a, r, s2, t], outputs=[costs], updates=updates)
    self.predict_network = K.function(inputs=[s], outputs=qs)
    self.update_weights = K.function(inputs=[], outputs=[], updates=target_updates)
def _critic_optimizer(self):
    discount_prediction = K.placeholder(shape=(None,))
    value = self.cric.output

    # Use the squared difference between the return and the value as the loss.
    loss = K.mean(K.square(discount_prediction - value))

    optimizer = RMSprop(lr=2.5e-4, rho=0.99, epsilon=0.01)
    updates = optimizer.get_updates(self.cric.trainable_weights, [], loss)
    train = K.function([self.cric.input, discount_prediction], [loss],
                       updates=updates)
    return train
def critic_optimize(self):
    target = K.placeholder(shape=(None, ))
    loss = K.mean(K.square(target - self.critic.output))

    optimizer = RMSprop(lr=self.critic_learning_rate, rho=0.99, epsilon=0.01)
    update = optimizer.get_updates(self.critic.trainable_weights, [], loss)
    train = K.function([self.critic.input, target], [loss], updates=update)
    return train
def critic_optimizer():
    R = K.placeholder(shape=(None,))

    critic = model_critic.output
    critic = K.print_tensor(critic, message='critic: ')

    Lv = K.mean(K.square(R - critic))
    Lv = K.sum(Lv)
    Lv = K.print_tensor(Lv, message='Lv: ')

    optimizer = RMSprop(lr=2.5e-4, rho=0.99, epsilon=0.01)
    # optimizer = my_optimizer_critic
    updates = optimizer.get_updates(model_critic.trainable_weights, [], Lv)
    train = K.function([model_critic.input, R], [Lv], updates=updates)
    return train
def critic_optimizer2(self):
    target = K.placeholder(shape=[None, ])
    loss = K.mean(K.square(target - self.critic.output))

    optimizer = RMSprop(lr=self.critic_lr * 50)
    updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
    train = K.function([self.critic.input, target], [], updates=updates)
    return train
def optimizer(self): """ The critic loss: mean squared error over discounted rewards """ #Placeholders discounted_returns_placeholder = K.placeholder(name='discounted_return',shape=(None,)) critic_loss = K.mean(K.square(discounted_returns_placeholder - self.model.output)) #Define optimizer adam_critic = RMSprop(lr = self.lr, epsilon = 0.1, rho = 0.99) #arbitray pars = self.model.trainable_weights updates = adam_critic.get_updates(params=pars,loss=critic_loss) return K.function([self.model.input, discounted_returns_placeholder], [], updates=updates)
def actor_optimizer():
    a_t = K.placeholder(shape=[None, ACTION_COUNT])
    A = K.placeholder(shape=(None, ))

    policy = model_actor.output
    Lpi = -K.sum(K.log(K.sum(policy * a_t, axis=1) + 1e-10) * A)
    LH = K.sum(K.sum(policy * K.log(policy + 1e-10), axis=1))
    L = Lpi + 0.01 * LH

    optimizer = RMSprop(lr=2.5e-4, rho=0.99, epsilon=0.01)
    # optimizer = my_optimizer
    updates = optimizer.get_updates(model_actor.trainable_weights, [], L)
    train = K.function([model_actor.input, a_t, A], [L], updates=updates)
    return train
def __update_actor__(self):
    action = K.placeholder(shape=(None, self.output_shape_))
    advantages = K.placeholder(shape=(None, ))
    policy = self.actor_.output

    good_prob = K.sum(action * policy, axis=1)
    eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages)
    loss = -K.sum(eligibility)

    gradient = RMSprop()
    updates = gradient.get_updates(self.actor_.trainable_weights, [], loss)
    train = K.function([self.actor_.input, action, advantages],
                       [self.actor_.output], updates=updates)
    return train
def actor_optimizer(self):
    action = K.placeholder(shape=[None, self.action_size])
    advantage = K.placeholder(shape=[None, ])

    action_prob = K.sum(action * self.actor.output, axis=1)
    # small constant added to avoid log(0)
    cross_entropy = K.log(action_prob + 1e-10) * advantage
    loss = -K.sum(cross_entropy)

    optimizer = RMSprop(lr=self.actor_lr)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
    train = K.function([self.actor.input, action, advantage], [], updates=updates)
    return train
def critic_optimizer(self): """ This method updates critic network (value network) """ discounted_prediction = K.placeholder(shape=(None, )) value = self.critic.output # You use loss function as mean squre error loss = K.mean(K.square(discounted_prediction - value)) optimizer = RMSprop(lr=self.critic_lr, rho=0.99, epsilon=0.01) updates = optimizer.get_updates(self.critic.trainable_weights, [], loss) train = K.function([self.critic.input, discounted_prediction], [loss], updates=updates) return train
def __mse(self):
    '''
    Mean squared error loss
    :return: Keras function
    '''
    q_values = self._online_model.output

    # trace of taken actions
    target = K.placeholder(shape=(None, ), name='target_value')
    a_1_hot = K.placeholder(shape=(None, self._action_dim), name='chosen_actions')

    q_value = K.sum(q_values * a_1_hot, axis=1)
    squared_error = K.square(target - q_value)
    mse = K.mean(squared_error)

    optimizer = RMSprop(lr=self._lr)
    updates = optimizer.get_updates(loss=mse, params=self._online_model.trainable_weights)

    return K.function(inputs=[self._online_model.input, target, a_1_hot],
                      outputs=[], updates=updates)
def __build_train_fn(self):
    action_Q_placeholder = self.policy_network.output
    action_onehot_placeholder = K.placeholder(shape=(None, self.n_action))
    target_Q_placeholder = K.placeholder(shape=(None, ))

    action_Qvalue = K.sum(action_Q_placeholder * action_onehot_placeholder, axis=1)
    loss = K.max(K.square(action_Qvalue - target_Q_placeholder))

    adam = Adam(lr=0.0001)
    rmsprop = RMSprop(lr=0.0001, rho=0.99)
    updates = rmsprop.get_updates(params=self.policy_network.trainable_weights, loss=loss)

    self.train_fn = K.function(inputs=[self.policy_network.input,
                                       action_onehot_placeholder,
                                       target_Q_placeholder],
                               outputs=[], updates=updates)
def build_functions(self):
    S = Input(shape=self.state_size)
    NS = Input(shape=self.state_size)
    A = Input(shape=(1, ), dtype='int32')
    R = Input(shape=(1, ), dtype='float32')
    T = Input(shape=(1, ), dtype='int32')
    self.build_model()
    self.value_fn = K.function([S], self.model(S))

    VS = self.model(S)
    VNS = disconnected_grad(self.model(NS))
    future_value = (1 - T) * VNS.max(axis=1, keepdims=True)
    discounted_future_value = self.discount * future_value
    target = R + discounted_future_value
    cost = ((VS[:, A] - target) ** 2).mean()

    opt = RMSprop(0.0001)
    params = self.model.trainable_weights
    updates = opt.get_updates(params, [], cost)
    self.train_fn = K.function([S, NS, A, R, T], cost, updates=updates)
def actor_optimizer(self):
    action = K.placeholder(shape=(None, self.output_pa))
    advantages = K.placeholder(shape=(None, ))
    policy = self.actor.output

    good_prob = K.sum(action * policy, axis=1)
    eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages)
    loss = -K.sum(eligibility)

    entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)
    actor_loss = loss + 0.01 * entropy

    # optimizer = Adam(lr=0.01)
    optimizer = RMSprop(lr=2.5e-4, rho=0.99, epsilon=0.0001)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss)
    train = K.function([self.actor.input, action, advantages], [], updates=updates)
    return train
def build_functions(self):
    S = Input(shape=self.state_size)
    NS = Input(shape=self.state_size)
    A = Input(shape=(1,), dtype='int32')
    R = Input(shape=(1,), dtype='float32')
    T = Input(shape=(1,), dtype='int32')
    self.build_model()
    self.value_fn = K.function([S], self.model(S))

    VS = self.model(S)
    VNS = disconnected_grad(self.model(NS))
    future_value = (1 - T) * VNS.max(axis=1, keepdims=True)
    discounted_future_value = self.discount * future_value
    target = R + discounted_future_value
    cost = ((VS[:, A] - target) ** 2).mean()

    opt = RMSprop(0.0001)
    params = self.model.trainable_weights
    updates = opt.get_updates(params, [], cost)
    self.train_fn = K.function([S, NS, A, R, T], cost, updates=updates)
def actor_optimizer(self):
    action = K.placeholder(shape=[None, self.action_size])
    advantages = K.placeholder(shape=[None, ])
    policy = self.actor.output

    good_prob = K.sum(action * policy, axis=1)
    eligibility = K.log(good_prob + 1e-10) * advantages
    actor_loss = -K.sum(eligibility)

    entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)
    entropy = K.sum(entropy)

    loss = actor_loss + 0.01 * entropy
    optimizer = RMSprop(lr=self.actor_lr, rho=0.99, epsilon=0.01)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
    train = K.function([self.actor.input, action, advantages], [loss],
                       updates=updates)
    return train
def optimizer(self):
    a = K.placeholder(shape=(None, ), dtype='int32')
    y = K.placeholder(shape=(None, ), dtype='float32')

    py_x = self.model.output

    a_one_hot = K.one_hot(a, self.action_size)
    q_value = K.sum(py_x * a_one_hot, axis=1)

    error = K.abs(y - q_value)
    quadratic_part = K.clip(error, 0.0, 1.0)
    linear_part = error - quadratic_part
    loss = K.mean(0.5 * K.square(quadratic_part) + linear_part)

    optimizer = RMSprop(lr=0.00025, epsilon=0.01)
    updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
    train = K.function([self.model.input, a, y], [loss], updates=updates)
    return train
def ActorOptimizer(self):
    action = K.placeholder(shape=[None, self.actionSpace])
    advantages = K.placeholder(shape=[None, ])
    policy = self.actor.output

    good_prob = K.sum(action * policy, axis=1)
    eligibility = K.log(good_prob + 1e-10) * advantages
    actor_loss = -K.sum(eligibility)

    entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)
    entropy = K.sum(entropy)

    loss = actor_loss + 0.01 * entropy
    optimizer = RMSprop(lr=self.actorLearningrate, rho=0.99, epsilon=0.01)
    updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
    train = K.function([self.actor.input, action, advantages], [loss],
                       updates=updates)
    return train
def optimizer(self):
    a = K.placeholder(shape=(None,), dtype='int32')
    y = K.placeholder(shape=(None,), dtype='float32')

    # Q values the model predicts for this history
    prediction = self.model.output

    # One-hot encode the action taken for this history,
    # e.g. action 0 becomes [1, 0, 0]
    a_one_hot = K.one_hot(a, self.action_size)

    # [1, 0, 0] * [0.32113, 0.1123, 0.00123] = [0.32113, 0, 0],
    # and summing over axis 1 gives 0.32113
    q_value = K.sum(prediction * a_one_hot, axis=1)

    # y is the target, reward + discount_factor * np.max(model.predict(next_history)).
    # The error is the gap between the target y and q_value; if the current q_value
    # is badly off, the error is large. q_value above is our prediction (q hat).
    error = K.abs(y - q_value)
    quadratic_part = K.clip(error, 0.0, 1.0)
    linear_part = error - quadratic_part
    loss = K.mean(0.5 * K.square(quadratic_part) + linear_part)

    optimizer = RMSprop(lr=0.00025, epsilon=0.01)
    # updates is a list of update ops
    updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
    train = K.function([self.model.input, a, y], [loss], updates=updates)
    """Instantiates a Keras function.
    # Arguments
        inputs: List of placeholder tensors.
        outputs: List of output tensors.
        updates: List of update ops.
        **kwargs: Passed to `tf.Session.run`.
    # Returns
        Output values as Numpy arrays.
    # Raises
        ValueError: if invalid kwargs are passed in.
    """
    return train
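# Hedged usage sketch (not part of the original snippet): building the target `y`
# described in the comments above and calling the compiled function. The wiring
# (`agent.train_fn = agent.optimizer()`, `agent.discount_factor`) and the use of the
# online network for the bootstrap (no separate target network) are assumptions.
import numpy as np

def dqn_train_on_minibatch(agent, history, actions, rewards, next_history, dones):
    # y = r + gamma * max_a' Q(next_history, a'), with no bootstrap at terminal states
    next_q = agent.model.predict(next_history)
    y = rewards + (1.0 - dones) * agent.discount_factor * np.max(next_q, axis=1)
    # `actions` are integer indices; the graph above one-hot encodes them via K.one_hot
    loss, = agent.train_fn([history, actions, y])
    return loss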
def build_functions(self):
    self.build_model()
    S = Input(shape=(self.state_size,))
    NS = Input(shape=(self.state_size,))
    A = Input(shape=(1,), dtype='int32')
    R = Input(shape=(1,), dtype='float32')
    T = Input(shape=(1,), dtype='int32')

    self.value_fn = kb.function([S], [self.model(S)])

    values = self.model(S)
    next_values = self.model(NS)  # disconnected_grad(self.model(NS))
    future_value = kb.cast((1 - T), dtype='float32') * kb.max(next_values, axis=1, keepdims=True)
    discounted_future_value = self.discount * future_value
    target = R + discounted_future_value
    cost = kb.mean(kb.pow(values - target, 2))

    opt = RMSprop(0.0001)
    params = self.model.trainable_weights
    updates = opt.get_updates(params, [], cost)
    self.train_fn = kb.function([S, NS, A, R, T], [cost], updates=updates)
params = layer4.params + layer3.params + layer2.params + layer1.params + layer0.params
grads = T.grad(cost, params)

# learning_rate = numpy.float32(0.01)
# updates = [
#     (param_i, param_i - learning_rate * grad_i)
#     for param_i, grad_i in zip(params, grads)
# ]

constraints_list = []
for param in params:
    constraints_list.append(identity())

rms = RMSprop()
updates = rms.get_updates(params, constraints_list, cost)


def read_image(address):
    img = Image.open(open(address))
    img = numpy.asarray(img, dtype='float32') / 256.
    # put image in 4D tensor of shape (1, 3, height, width)
    img = img.transpose(2, 0, 1).reshape(1, 3, 128, 48)
    return img

# img1 = read_image('/home/austin/Documents/Datasets/VIPeR/cam_a/001_45.bmp')
# img2 = read_image('/home/austin/Documents/Datasets/VIPeR/cam_b/091_90.bmp')
# f = theano.function([X1, X2, Y], [cost, layer2.similarity])
# y = numpy.asarray([-1], dtype='int32')
# [tmp, sim] = f(img1, img2, y)
def __init__(self):
    rng = numpy.random.RandomState(23455)

    self.X1 = T.tensor4('X1', dtype='float32')
    self.X2 = T.tensor4('X2', dtype='float32')
    self.Y = T.ivector('Y')

    self.layer0 = Layer.ConvMaxPool2Layer(
        rng,
        input1=self.X1,
        input2=self.X2,
        filter_shape=[25, 3, 5, 5],
        poolsize=[2, 2]
    )
    self.layer1 = Layer.ConvMaxPool2Layer(
        rng,
        input1=self.layer0.output1,
        input2=self.layer0.output2,
        filter_shape=[25, 25, 3, 3],
        poolsize=[2, 2]
    )
    self.layer2 = Layer.SecretLayer(
        rng,
        input1=self.layer1.output1,
        input2=self.layer1.output2,
        filter_shape=[25, 25, 5, 5]
    )
    # self.layer3 = Layer.MultiConvMaxPoolLayer(
    #     rng,
    #     input=self.layer2.results,
    #     filter_shape=[25, 25, 3, 3],
    #     poolsize=(2, 2)
    # )
    self.layer3 = Layer.LocalCovLayerDropout(
        rng,
        input=self.layer2.results,
        n_in=18 * 9 * 25,
        n_out=200
    )
    self.layer4 = Layer.HiddenLayerDropout(
        rng,
        train_input=self.layer3.train_output,
        test_input=self.layer3.test_output,
        # n_in=25*24*3,
        n_in=800,
        n_out=200
    )
    # self.layer2 = Layer.ConvMaxPoolLayer(
    #     rng,
    #     input=T.abs_(self.layer1.output1 - self.layer1.output2),
    #     filter_shape=[25, 25, 3, 3],
    #     poolsize=[2, 2]
    # )
    #
    # self.layer3 = Layer.HiddenLayer(
    #     rng,
    #     input=self.layer2.output,
    #     n_in=25*18*5,
    #     n_out=500
    # )
    # self.layer5 = Layer.LogisticRegression(self.layer4.output, 500, 2)
    # self.cost = self.layer5.negative_log_likelihood(self.Y)
    self.layer5 = Layer.LogisticRegressionDropout(
        train_input=self.layer4.train_output,
        test_input=self.layer4.test_output,
        n_in=200,
        n_out=2
    )
    self.cost = self.layer5.negative_log_likelihood_train(self.Y)

    self.params = (self.layer5.params + self.layer4.params + self.layer3.params +
                   self.layer2.params + self.layer1.params + self.layer0.params)
    self.grads = T.grad(self.cost, self.params)

    # learning_rate = numpy.float32(0.01)
    # updates = [
    #     (param_i, param_i - learning_rate * grad_i)
    #     for param_i, grad_i in zip(params, grads)
    # ]
    constraints_list = []
    for param in self.params:
        constraints_list.append(identity())

    rms = RMSprop()
    self.updates = rms.get_updates(self.params, constraints_list, self.cost)
regularizers += _regularizers
constraints += _consts
updates += _updates

print('parameters:')
print(params)
print('regularizers:')
print(regularizers)
print('constraints:')
print(constraints)
print('updates:')
print(updates)

'''updates'''
optimizer = RMSprop()
_updates = optimizer.get_updates(params, constraints, train_loss)
updates += _updates
print('after RMSprop, updates:')
for update in updates:
    print(update)

train_ins = [X_train, y, weights]
test_ins = [X_test, y, weights]
predict_ins = [X_test]

'''Get functions'''
_train = K.function(train_ins, [train_loss], updates=updates)
_train_with_acc = K.function(train_ins, [train_loss, train_accuracy], updates=updates)
_predict = K.function(predict_ins, [y_test], updates=state_updates)
_test = K.function(test_ins, [test_loss])