def save_value(self, app_id, key, subkey, value):
    '''
    Saves a value in the database, creating the key and app references if they
    do not exist yet.

    @param app_id: string The identifier of the requesting app
    @param key: string The key that groups all given values
    @param subkey: string The subkey that identifies this specific value
    @param value: string Value to store
    @raise TypeProviderError: Raised when the given key is already bound to
        another ResourceType
    '''
    app = App.get_by(name=app_id)
    if app is None:
        app = App(name=app_id)
    key_db = Key.get_by(name=key, app_name=app_id)
    if key_db is None:
        key_db = Key(name=key, app=app, type_name=self.providerType)
    if key_db.type_name != self.providerType:
        raise TypeProviderError(self.providerType, key_db.type_name)
    value_db = Value()
    value_db.key = key_db
    value_db.subkey = subkey
    value_db.value = value
    session.commit()
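# Hypothetical call to save_value() above, for illustration only; the provider
# instance, app id, key, and subkey names are assumptions, not values from the
# original project.
provider.save_value(app_id='weather-app', key='temperature', subkey='berlin', value='21.5')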
def __init__(self, state_size, action_size, batch_size=128, gamma=0.99,
             mean_lambda=1e-3, std_lambda=1e-3, z_lambda=0.0):
    self.state_size = state_size
    self.action_size = action_size
    self.batch_size = batch_size
    self.gamma = gamma
    self.memory = ReplayBuffer(BUFFERSIZE, self.batch_size)

    self.mean_lambda = mean_lambda
    self.std_lambda = std_lambda
    self.z_lambda = z_lambda

    self.current_value = Value(state_size).to(device)
    self.target_value = Value(state_size).to(device)
    self.softQ = soft_Q(state_size, action_size)
    self.policy = Policy(state_size, action_size)

    self.value_optimizer = optim.Adam(self.current_value.parameters(), lr=3e-4)
    self.soft_q_optimizer = optim.Adam(self.softQ.parameters(), lr=3e-4)
    self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=3e-4)
def createValue(value, series, time, id=None):
    if id:
        val = Value(time=time, value=value, series=series, parent=series, key_name='k' + id)
    else:
        val = Value(time=time, value=value, series=series, parent=series)
    val.put()
    return val
def text_message(self, message):
    email = extract_email(message.sender)
    try:
        sender = users.User(email)
    except users.UserNotFoundError:
        message.reply("You don't seem to have an account that I can find.")
        return
    appuser = User.all().filter("info = ", sender).get()
    if not appuser:
        appuser = User(info=sender)
        appuser.put()
    try:
        datum = parser.parse(message.body)
    except parser.ParseException as e:
        message.reply("I couldn't understand you. (Message was: {msg})".format(msg=e.message))
        return
    variable = Variable.all().filter("name = ", datum["variable"]).get()
    if not variable:
        variable = Variable(name=datum["variable"], user=appuser)
        variable.put()
    value = Value(value=datum["value"], variable=variable)
    value.put()
    message.reply("I've logged variable {variable} as being {value}".format(
        variable=datum["variable"], value=datum["value"]))
def trpo(args):
    env = gym.make(args.env_name)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    policy_net = Policy(num_inputs, num_actions)
    value_net = Value(num_inputs)
    running_state = ZFilter((num_inputs,), clip=5)
    running_reward = ZFilter((1,), demean=False, clip=10)
    reward_record = []
    global_steps = 0

    for i_episode in range(args.num_episode):
        memory = Memory()

        # sample data: single path method
        num_steps = 0
        while num_steps < args.batch_size:
            state = env.reset()
            state = running_state(state)
            reward_sum = 0
            for t in range(args.max_step_per_episode):
                action = select_single_action(policy_net, state)
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward
                next_state = running_state(next_state)
                mask = 0 if done else 1
                memory.push(state, action, mask, next_state, reward)
                if done:
                    break
                state = next_state
            num_steps += (t + 1)
            global_steps += (t + 1)
            reward_record.append({'steps': global_steps, 'reward': reward_sum})

        batch = memory.sample()
        batch_size = len(memory)

        # update params
        rewards = Tensor(batch.reward)
        masks = Tensor(batch.mask)
        actions = Tensor(batch.action)
        states = Tensor(batch.state)
        values = value_net(states)

        returns = Tensor(batch_size)
        deltas = Tensor(batch_size)
        advantages = Tensor(batch_size)
        prev_return = 0
        prev_value = 0
        prev_advantage = 0
        for i in reversed(range(batch_size)):
            returns[i] = rewards[i] + args.gamma * prev_return * masks[i]
            deltas[i] = rewards[i] + args.gamma * prev_value * masks[i] - values[i]
            # ref: https://arxiv.org/pdf/1506.02438.pdf (generalized advantage estimation)
            # notation following the PPO paper
            advantages[i] = deltas[i] + args.gamma * args.lamda * prev_advantage * masks[i]
            prev_return = returns[i]
            prev_value = values[i]
            prev_advantage = advantages[i]
        advantages = (advantages - advantages.mean()) / (advantages.std() + EPS)

        # optimize value network
        loss_func_args = (value_net, states, returns)
        old_loss, _ = get_value_loss(value_net.get_flat_params(), *loss_func_args)
        flat_params, opt_loss, opt_info = sciopt.fmin_l_bfgs_b(
            get_value_loss, value_net.get_flat_params(),
            args=loss_func_args, maxiter=args.value_opt_max_iter)
        value_net.set_flat_params(flat_params)
        print('ValueNet optimization: old loss = {}, new loss = {}'.format(old_loss, opt_loss))

        # optimize policy network
        # 1. find the search direction for the policy parameters using conjugate gradient (CG)
        #    the direction can be found analytically: it is s = - A^{-1} g,
        #    where A is the Fisher information matrix (FIM) w.r.t. the action probability distribution
        #    and g is the gradient w.r.t. the loss function \dfrac{\pi_\theta (a|s)}{q(a|s)} Q(s, a)
        policy_net.set_old_loss(states, actions)
        loss = policy_net.get_loss(states, actions, advantages)
        g = torch.autograd.grad(loss, policy_net.parameters())
        flat_g = torch.cat([grad.view(-1) for grad in g]).data
        Av = lambda v: policy_net.kl_hessian_times_vector(states, v)
        step_dir = conjugate_gradient(Av, -flat_g, nsteps=args.cg_nsteps)

        # 2. find the maximum step size along the search direction
        #    the problem: min g * x  s.t.  1/2 * x^T * A * x <= delta
        #    can be solved analytically with x = beta * s,
        #    where beta = sqrt(2 delta / s^T A s)
        sAs = 0.5 * (step_dir * Av(step_dir)).sum(0)
        beta = torch.sqrt(2 * args.max_kl / sAs)
        full_step = (beta * step_dir).data.numpy()

        # 3. do a line search along the found direction, with maximum change = full_step
        #    the maximum change is restricted by the KL divergence constraint
        #    line search with backtracking method
        get_policy_loss = lambda x: policy_net.get_loss(states, actions, advantages)
        old_loss = get_policy_loss(None)
        success, new_params = line_search(policy_net, get_policy_loss, full_step, flat_g)
        policy_net.set_flat_params(new_params)
        new_loss = get_policy_loss(None)
        print('PolicyNet optimization: old loss = {}, new loss = {}'.format(old_loss, new_loss))

        if i_episode % args.log_num_episode == 0:
            print('Finished episode: {} Mean Reward: {}'.format(i_episode, reward_record[-1]))
            print('-----------------')

    policy_net.save_model_policy()
    value_net.save_model_value()

    return reward_record
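# The trpo() function above calls a conjugate_gradient() helper that is not shown in this
# section. Below is a minimal sketch of such a solver, assuming Av is a callable returning
# the Fisher-vector product A @ v as a 1-D torch tensor (as in the snippet above); the
# function name and the residual tolerance are illustrative assumptions, not the
# repository's actual implementation. Assumes `import torch`.
def conjugate_gradient_sketch(Av, b, nsteps, residual_tol=1e-10):
    x = torch.zeros_like(b)          # start from the zero vector
    r = b.clone()                    # residual r = b - A @ x; x is zero, so r = b
    p = b.clone()                    # initial search direction
    r_dot_r = torch.dot(r, r)
    for _ in range(nsteps):
        Ap = Av(p)
        alpha = r_dot_r / torch.dot(p, Ap)   # step size along p
        x += alpha * p
        r -= alpha * Ap
        new_r_dot_r = torch.dot(r, r)
        if new_r_dot_r < residual_tol:
            break
        p = r + (new_r_dot_r / r_dot_r) * p  # next conjugate direction
        r_dot_r = new_r_dot_r
    return x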
def __init__(self, v):
    assert isinstance(v, np.ndarray), 'Unsupported type: %s' % type(v)
    Value.__init__(self, v)
class Agent():

    def __init__(self, state_size, action_size, batch_size=128, gamma=0.99,
                 mean_lambda=1e-3, std_lambda=1e-3, z_lambda=0.0):
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.memory = ReplayBuffer(BUFFERSIZE, self.batch_size)

        self.mean_lambda = mean_lambda
        self.std_lambda = std_lambda
        self.z_lambda = z_lambda

        self.current_value = Value(state_size).to(device)
        self.target_value = Value(state_size).to(device)
        self.softQ = soft_Q(state_size, action_size)
        self.policy = Policy(state_size, action_size)

        self.value_optimizer = optim.Adam(self.current_value.parameters(), lr=3e-4)
        self.soft_q_optimizer = optim.Adam(self.softQ.parameters(), lr=3e-4)
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=3e-4)

    def act(self, state):
        #state = torch.from_numpy(np.asarray(state)).float().to(device)
        action = self.policy.act(state)
        if len(self.memory) > self.batch_size:
            self.update()
        return action

    def add_to_memory(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

    def update(self):
        state, action, reward, next_state, done = self.memory.sample()

        expected_soft_q_value = self.softQ.forward(state, action)
        expected_value = self.current_value.forward(state)
        new_action, log_prob, z, mean, log_std = self.policy.evaluate(state)

        target_value = self.target_value.forward(next_state)
        next_soft_q_value = reward + self.gamma * target_value * (1 - done)
        q_val_mse = F.mse_loss(expected_soft_q_value, next_soft_q_value.detach())

        expected_new_q_val = self.softQ.forward(state, new_action)
        next_value = expected_new_q_val - log_prob
        val_loss = F.mse_loss(expected_value, next_value.detach())

        log_prob_target = expected_new_q_val - expected_value
        policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()

        mean_loss = self.mean_lambda * mean.pow(2).mean()
        std_loss = self.std_lambda * log_std.pow(2).mean()
        z_loss = self.z_lambda * z.pow(2).sum(1).mean()
        policy_loss += mean_loss + std_loss + z_loss

        self.soft_q_optimizer.zero_grad()
        q_val_mse.backward()
        self.soft_q_optimizer.step()

        self.value_optimizer.zero_grad()
        val_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        self.soft_update(self.current_value, self.target_value, TAU)

    def soft_update(self, local_model, target_model, TRANSFER_RATE):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(TRANSFER_RATE * local_param.data
                                    + (1.0 - TRANSFER_RATE) * target_param.data)
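# A minimal usage sketch for the Agent class above, assuming a Gym-style environment;
# the driver function name, episode count, and step limit are illustrative assumptions,
# not values taken from the original code.
def run_agent_sketch(env, agent, num_episodes=200, max_steps=1000):
    for episode in range(num_episodes):
        state = env.reset()
        episode_reward = 0
        for _ in range(max_steps):
            # act() also triggers update() once the replay buffer holds more than a batch
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.add_to_memory(state, action, reward, next_state, done)
            episode_reward += reward
            state = next_state
            if done:
                break
        print('Episode {}: reward {}'.format(episode, episode_reward))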
def json(self):
    Value.get(Key(self.require('key'))).delete()
    return {}
def main(args):
    policy = Cvae()
    policy_optimizer = optim.Adam(policy.parameters(), lr=args.lr_cvae)
    value_network = Value()
    value_optimizer = optim.Adam(value_network.parameters(), lr=args.lr_value)
    mse_loss = nn.MSELoss()

    env = gym.make('Acrobot-v1')
    time_str, trained_model = cvae_policy_train(env, policy, value_network,
                                                policy_optimizer, value_optimizer,
                                                mse_loss, args)

    # Test the trained model using argmax.
    env = gym.make('Acrobot-v1')
    if args.record:
        # Store results from different runs of the model separately.
        results_directory = ''.join(['/tmp', args.directory_name, '/test/', time_str,
                                     '_discounting_', str(args.gamma),
                                     '_update_frequency_', str(args.update_frequency),
                                     '_value_update_times_', str(args.value_update_times)])
        env = gym.wrappers.Monitor(env, results_directory)

    if not args.cuda:
        plt.ion()

    test_returns = []
    for i in range(args.test_time):
        state_ = env.reset()
        done = False
        cumulative_return = 0
        for timestep in range(0, 500):
            if not done:
                if not args.cuda:
                    env.render()
                state_ = th.from_numpy(state_.reshape(1, -1))
                state = Variable(state_, requires_grad=False).type(Tensor)
                padding = Variable(th.zeros(1, 3), requires_grad=False).type(Tensor)
                state_padded = th.cat([state, padding], 1)
                _, _, p = trained_model.forward(state_padded)
                action = th.max(p, 1)[1].data[0]
                next_state_, reward_, done, info_ = env.step(action)
                cumulative_return += (args.gamma ** timestep) * reward_
                state_ = next_state_
        test_returns.append(cumulative_return)
        print('====> Cumulative return: {}'.format(cumulative_return))

        plt.clf()
        plt.figure(1)
        plt.xlabel('episodes')
        plt.ylabel('cumulative returns')
        plt.plot(test_returns)
        plt.show()
        plt.savefig(''.join(['cvae/test/', time_str,
                             '_discounting_', str(args.gamma),
                             '_update_frequency_', str(args.update_frequency),
                             '_value_update_times_', str(args.value_update_times)]) + '.png')

    if not args.cuda:
        plt.ioff()
        plt.close()
    env.close()
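# A sketch of the argument namespace that main() above expects. The attribute names are
# taken from the references in the code; the defaults and the flag spellings are
# assumptions, not the original project's CLI.
import argparse

def build_args_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument('--lr_cvae', type=float, default=1e-3)
    parser.add_argument('--lr_value', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--record', action='store_true')
    parser.add_argument('--directory_name', default='/cvae')
    parser.add_argument('--update_frequency', type=int, default=1)
    parser.add_argument('--value_update_times', type=int, default=1)
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--test_time', type=int, default=10)
    return parser.parse_args()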