Example #1
    def save_value(self, app_id, key, subkey, value):
        '''
        Saves a value in the database.
        Creates the app and key records if they do not already exist.

        @param app_id: string The identifier of the requesting app
        @param key: string The key that will identify all given values
        @param subkey: string The subkey that identifies this specific value
        @param value: string Value to store

        @raise TypeProviderError: Raised when the given key is already bound to another ResourceType
        '''
        app = App.get_by(name=app_id)
        if app is None:
            app = App(name=app_id)
        key_db = Key.get_by(name=key, app_name=app_id)
        if key_db is None:
            key_db = Key(name=key, app=app, type_name=self.providerType)
        if key_db.type_name != self.providerType:
            raise TypeProviderError(self.providerType, key_db.type_name)

        value_db = Value()
        value_db.key = key_db
        value_db.subkey = subkey
        value_db.value = value
        session.commit()
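
A minimal usage sketch, assuming `save_value` lives on a provider class (here called `StringProvider`, a hypothetical name) whose `providerType` matches the keys it writes, and that the ORM session is already configured:

provider = StringProvider()  # hypothetical provider class with providerType set
provider.save_value(app_id='demo-app', key='settings', subkey='theme', value='dark')
# Calling save_value for the same key through a provider of a different type raises TypeProviderError.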
Example #3
def createValue(value, series, time, id=None):
    # Store a Value entity under its series; use an explicit key name when an id is given.
    if id:
        val = Value(time=time, value=value, series=series, parent=series, key_name='k' + id)
    else:
        val = Value(time=time, value=value, series=series, parent=series)
    val.put()
    return val
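
A minimal usage sketch, assuming `my_series` is an existing datastore entity and that `id`, when given, is a string; the concrete values below are illustrative:

import datetime

val = createValue(value='42.0', series=my_series, time=datetime.datetime.utcnow(), id='1001')  # stored under key name 'k1001'
# Without an id, the datastore assigns the key automatically:
val = createValue(value='42.0', series=my_series, time=datetime.datetime.utcnow())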
Example #4
    def text_message(self, message):
        email = extract_email(message.sender)

        try:
            sender = users.User(email)
        except users.UserNotFoundError:
            message.reply("You don't seem to have an account that I can find.")
            return

        appuser = User.all().filter("info = ", sender).get()

        if not appuser:
            appuser = User(info=sender)
            appuser.put()

        try:
            datum = parser.parse(message.body)
        except parser.ParseException as e:
            message.reply("I couldn't understand you. (Message was: {msg})".format(msg=e.message))
            return

        variable = Variable.all().filter("name = ", datum["variable"]).get()
        if not variable:
            variable = Variable(name=datum["variable"], user=appuser)
            variable.put()

        value = Value(value=datum["value"], variable=variable)
        value.put()

        message.reply("I've logged variable {variable} as being {value}".format(variable=datum["variable"],
                                                                                value=datum["value"]))
Example #5
def trpo(args):
    env = gym.make(args.env_name)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    env.seed(args.seed)
    torch.manual_seed(args.seed)

    policy_net = Policy(num_inputs, num_actions)
    value_net = Value(num_inputs)
    
    running_state = ZFilter((num_inputs,), clip=5)
    running_reward = ZFilter((1,), demean=False, clip=10)
    
    reward_record = []
    global_steps = 0

    for i_episode in range(args.num_episode):
        memory = Memory()
        
        # sample data: single path method
        num_steps = 0
        while num_steps < args.batch_size:
            state = env.reset()
            state = running_state(state)
            
            reward_sum = 0
            for t in range(args.max_step_per_episode):
                action = select_single_action(policy_net, state)
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward

                next_state = running_state(next_state)
                
                mask = 0 if done else 1
                
                memory.push(state, action, mask, next_state, reward)
                
                if done:
                    break
                    
                state = next_state
                
            num_steps += (t + 1)
            global_steps += (t + 1)
            reward_record.append({'steps': global_steps, 'reward': reward_sum})

        batch = memory.sample()
        batch_size = len(memory)
        
        # update params
        rewards = Tensor(batch.reward)
        masks = Tensor(batch.mask)
        actions = Tensor(batch.action)
        states = Tensor(batch.state)
        values = value_net(states)
        
        returns = Tensor(batch_size)
        deltas = Tensor(batch_size)
        advantages = Tensor(batch_size)

        prev_return = 0
        prev_value = 0
        prev_advantage = 0
        for i in reversed(range(batch_size)):
            returns[i] = rewards[i] + args.gamma * prev_return * masks[i]
            deltas[i] = rewards[i] + args.gamma * prev_value * masks[i] - values[i]
            # ref: https://arxiv.org/pdf/1506.02438.pdf (generalized advantage estimation, GAE)
            # notation following the PPO paper
            advantages[i] = deltas[i] + args.gamma * args.lamda * prev_advantage * masks[i]

            prev_return = returns[i]
            prev_value = values[i]
            prev_advantage = advantages[i]
        advantages = (advantages - advantages.mean()) / (advantages.std() + EPS)
            
        # optimize value network
        loss_func_args = (value_net, states, returns)
        old_loss, _ = get_value_loss(value_net.get_flat_params(), *loss_func_args)
        flat_params, opt_loss, opt_info = sciopt.fmin_l_bfgs_b(get_value_loss, 
            value_net.get_flat_params(), args=loss_func_args, maxiter=args.value_opt_max_iter)
        value_net.set_flat_params(flat_params)
        print('ValueNet optimization: old loss = {}, new loss = {}'.format(old_loss, opt_loss))

        # optimize policy network
        # 1. find the search direction for policy parameter optimization using conjugate gradient (CG)
        #       the direction can be found analytically: s = -A^{-1} g,
        #       where A is the Fisher Information Matrix (FIM) w.r.t. the action probability distribution
        #       and g is the gradient of the surrogate loss \dfrac{\pi_\theta (a|s)}{q(a|s)} Q(s, a) w.r.t. the policy parameters
        policy_net.set_old_loss(states, actions)
        loss = policy_net.get_loss(states, actions, advantages)
        g = torch.autograd.grad(loss, policy_net.parameters())
        flat_g = torch.cat([grad.view(-1) for grad in g]).data
        Av = lambda v: policy_net.kl_hessian_times_vector(states, v)
        step_dir = conjugate_gradient(Av, - flat_g, nsteps=args.cg_nsteps)

        # 2. find maximum stepsize along the search direction
        #       the problem: min g * x  s.t. 1/2 * x^T * A * x <= delta
        #       can be solved analytically with x = beta * s
        #       where beta = sqrt(2 delta / s^T A s)
        sAs = 0.5 * (step_dir * Av(step_dir)).sum(0)
        # sAs already carries the 1/2 factor, so beta = sqrt(delta / sAs) = sqrt(2 delta / s^T A s)
        beta = torch.sqrt(args.max_kl / sAs)
        full_step = (beta * step_dir).data.numpy()

        # 3. do line search along the found direction, with maximum change = full_step
        #       the maximum change is restricted by the KL divergence constraint
        #       line search with backtracking method
        get_policy_loss = lambda x: policy_net.get_loss(states, actions, advantages)
        old_loss = get_policy_loss(None)
        success, new_params = line_search(policy_net, get_policy_loss, full_step, flat_g)
        policy_net.set_flat_params(new_params)
        new_loss = get_policy_loss(None)
        print('PolicyNet optimization: old loss = {}, new loss = {}'.format(old_loss, new_loss))

        if i_episode % args.log_num_episode == 0:
            print('Finished episode: {} Mean Reward: {}'.format(i_episode, reward_record[-1]))
            print('-----------------')
    
    policy_net.save_model_policy()
    value_net.save_model_value()
    return reward_record
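
A minimal driver sketch, assuming an argparse namespace that supplies every field `trpo` reads above; the environment name and all default values are illustrative, not taken from the source:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env_name', default='Hopper-v2')              # any continuous-control gym env (assumption)
parser.add_argument('--seed', type=int, default=1234)
parser.add_argument('--num_episode', type=int, default=1000)
parser.add_argument('--batch_size', type=int, default=5000)         # minimum steps sampled per iteration
parser.add_argument('--max_step_per_episode', type=int, default=1000)
parser.add_argument('--gamma', type=float, default=0.995)           # discount factor
parser.add_argument('--lamda', type=float, default=0.97)            # GAE lambda (spelling follows the code)
parser.add_argument('--max_kl', type=float, default=0.01)           # trust-region size delta
parser.add_argument('--cg_nsteps', type=int, default=10)            # conjugate-gradient iterations
parser.add_argument('--value_opt_max_iter', type=int, default=25)   # L-BFGS iterations for the value net
parser.add_argument('--log_num_episode', type=int, default=1)
args = parser.parse_args()

reward_record = trpo(args)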
Example #6
    def __init__(self, v):
        # Accept only NumPy arrays; delegate storage to the base Value class.
        assert isinstance(v, np.ndarray), 'Unsupported type: %s' % type(v)
        Value.__init__(self, v)
Example #7
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 batch_size=128,
                 gamma=0.99,
                 mean_lambda=1e-3,
                 std_lambda=1e-3,
                 z_lambda=0.0):

        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.memory = ReplayBuffer(BUFFERSIZE, self.batch_size)

        self.mean_lambda = mean_lambda
        self.std_lambda = std_lambda
        self.z_lambda = z_lambda

        self.current_value = Value(state_size).to(device)
        self.target_value = Value(state_size).to(device)

        self.softQ = soft_Q(state_size, action_size)
        self.policy = Policy(state_size, action_size)

        self.value_optimizer = optim.Adam(self.current_value.parameters(),
                                          lr=3e-4)
        self.soft_q_optimizer = optim.Adam(self.softQ.parameters(), lr=3e-4)
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=3e-4)

    def act(self, state):

        action = self.policy.act(state)

        if len(self.memory) > self.batch_size:
            self.update()

        return action

    def add_to_memory(self, state, action, reward, next_state, done):

        self.memory.add(state, action, reward, next_state, done)

    def update(self):

        state, action, reward, next_state, done = self.memory.sample()

        expected_soft_q_value = self.softQ.forward(state, action)
        expected_value = self.current_value.forward(state)

        new_action, log_prob, z, mean, log_std = self.policy.evaluate(state)

        target_value = self.target_value.forward(next_state)
        next_soft_q_value = reward + self.gamma * target_value * (1 - done)

        q_val_mse = F.mse_loss(expected_soft_q_value,
                               next_soft_q_value.detach())

        expected_new_q_val = self.softQ.forward(state, new_action)
        next_value = expected_new_q_val - log_prob
        val_loss = F.mse_loss(expected_value, next_value.detach())

        log_prob_target = expected_new_q_val - expected_value
        policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()

        mean_loss = self.mean_lambda * mean.pow(2).mean()
        std_loss = self.std_lambda * log_std.pow(2).mean()
        z_loss = self.z_lambda * z.pow(2).sum(1).mean()

        policy_loss += mean_loss + std_loss + z_loss

        self.soft_q_optimizer.zero_grad()
        q_val_mse.backward()
        self.soft_q_optimizer.step()

        self.value_optimizer.zero_grad()
        val_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        self.soft_update(self.current_value, self.target_value, TAU)

    def soft_update(self, local_model, target_model, TRANSFER_RATE):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(TRANSFER_RATE * local_param.data +
                                    (1.0 - TRANSFER_RATE) * target_param.data)
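
A minimal training-loop sketch, assuming a continuous-action gym environment and that the module-level constants `BUFFERSIZE`, `TAU`, and `device` used by the class are defined; the environment name and episode count are illustrative:

import gym

env = gym.make('Pendulum-v0')                      # any continuous-action env (assumption)
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0])

for episode in range(200):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state)                  # also triggers update() once the buffer holds enough samples
        next_state, reward, done, _ = env.step(action)
        agent.add_to_memory(state, action, reward, next_state, done)
        state = next_state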
Example #8
    def json(self):
        # Delete the Value entity addressed by the 'key' request parameter and return an empty JSON body.
        Value.get(Key(self.require('key'))).delete()
        return {}
Example #9
def main(args):
    policy = Cvae()
    policy_optimizer = optim.Adam(policy.parameters(), lr=args.lr_cvae)
    value_network = Value()
    value_optimizer = optim.Adam(value_network.parameters(), lr=args.lr_value)
    mse_loss = nn.MSELoss()

    env = gym.make('Acrobot-v1')

    time_str, trained_model = cvae_policy_train(env,
                                                policy,
                                                value_network,
                                                policy_optimizer,
                                                value_optimizer,
                                                mse_loss,
                                                args)

    # Test the trained model using argmax.
    env = gym.make('Acrobot-v1')
    if args.record:
        # Store results from different runs of the model separately.
        results_directory = ''.join(['/tmp', args.directory_name, '/test/', time_str, '_discounting_',
                                     str(args.gamma), '_update_frequency_', str(args.update_frequency),
                                     '_value_update_times_', str(args.value_update_times)])
        env = gym.wrappers.Monitor(env, results_directory)

    test_returns = []
    if not args.cuda:
        plt.ion()

    for i in range(args.test_time):
        state_ = env.reset()
        done = False
        cumulative_return = 0

        for timestep in range(0, 500):
            if not done:
                if not args.cuda:
                    env.render()
                state_ = th.from_numpy(state_.reshape(1, -1))
                state = Variable(state_, requires_grad=False).type(Tensor)
                padding = Variable(th.zeros(1, 3), requires_grad=False).type(Tensor)
                state_padded = th.cat([state, padding], 1)
                _, _, p = trained_model.forward(state_padded)
                action = th.max(p, 1)[1].data[0]
                next_state_, reward_, done, info_ = env.step(action)
                cumulative_return += (args.gamma ** timestep) * reward_
                state_ = next_state_

        test_returns.append(cumulative_return)

        print('====> Cumulative return: {}'.format(cumulative_return))

        plt.clf()
        plt.figure(1)
        plt.xlabel('episodes')
        plt.ylabel('cumulative returns')
        plt.plot(test_returns)
        plt.show()
        plt.savefig(''.join(['cvae/test/', time_str, '_discounting_',
                             str(args.gamma), '_update_frequency_', str(args.update_frequency),
                             '_value_update_times_', str(args.value_update_times)]) + '.png')

    if not args.cuda:
        plt.ioff()
        plt.close()

    env.close()
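
A minimal driver sketch for `main`, assuming an argparse namespace; only the fields read directly in `main` are shown, `cvae_policy_train` may consume additional ones not visible here, and all default values are illustrative:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--lr_cvae', type=float, default=1e-3)
parser.add_argument('--lr_value', type=float, default=1e-3)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--update_frequency', type=int, default=1)
parser.add_argument('--value_update_times', type=int, default=1)
parser.add_argument('--test_time', type=int, default=10)           # number of evaluation episodes
parser.add_argument('--directory_name', default='/cvae')           # appended to '/tmp' when recording
parser.add_argument('--record', action='store_true')
parser.add_argument('--cuda', action='store_true')
args = parser.parse_args()

main(args)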