Example #1
def main():
    # define arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--render",
                        action="store_true",
                        help="Render the state")
    parser.add_argument("--render_interval",
                        type=int,
                        default=10,
                        help="Number of rollouts to skip before rendering")
    parser.add_argument("--num_rollouts",
                        type=int,
                        default=1000,
                        help="Number of max rollouts")
    parser.add_argument("--logfile",
                        type=str,
                        help="Indicate where to save rollout data")
    parser.add_argument(
        "--load_params",
        type=str,
        help="Load previously learned parameters from [LOAD_PARAMS]")
    parser.add_argument("--save_params",
                        type=str,
                        help="Save learned parameters to [SAVE_PARAMS]")
    parser.add_argument("--gamma",
                        type=float,
                        default=0.99,
                        help="Discount factor")
    parser.add_argument("--test", action="store_true", help="Test the params")
    args = parser.parse_args()

    signal.signal(signal.SIGINT, stopsigCallback)
    global stopsig

    # create the basketball environment
    env = BasketballVelocityEnv(fps=60.0,
                                timeInterval=0.1,
                                goal=[0, 5, 0],
                                initialLengths=np.array([0, 0, 1, 1, 1, 0, 1]),
                                initialAngles=np.array(
                                    [0, 45, -20, -20, 0, -20, 0]))

    # create space
    stateSpace = ContinuousSpace(ranges=env.state_range())
    actionSpace = ContinuousSpace(ranges=env.action_range())

    # create the model and policy functions
    modelFn = PoWERDistribution(stateSpace.n,
                                actionSpace.n,
                                sigma=5.0 if not args.test else 0)
    if args.load_params:
        print("Loading params...")
        modelFn.load_params(args.load_params)

    replayBuffer = ReplayBuffer(1024)
    if args.logfile:
        log = open(args.logfile, "a")

    rollout = 0
    while args.num_rollouts == -1 or rollout < args.num_rollouts:
        print("Iteration:", rollout)
        state = env.reset()
        reward = 0
        done = False
        steps = 0
        while not done and steps < 5:
            if stopsig:
                break
            action, eps = modelFn.predict(
                state, replayBuffer.sample(gamma=args.gamma))
            if steps == 4:
                action[-1] = 1.0
            nextState, reward, done, info = env.step(action)
            replayBuffer.append(state,
                                action,
                                reward,
                                nextState=nextState,
                                info={"eps": eps})
            state = nextState
            steps += 1
            if args.render and rollout % args.render_interval == 0:
                env.render()
        if stopsig:
            break

        # No importance sampling yet; add it once we work with small datasets
        replayBuffer.reset()
        dataset = replayBuffer.sample(gamma=args.gamma)
        modelFn.fit(dataset)

        avgR = float(np.mean(dataset["rewards"]))
        avgQ = float(np.mean(dataset["values"]))
        print("Rollouts:", rollout, "Error:", modelFn.score(), "Average Q",
              avgQ, "Average R", avgR)
        if args.logfile:
            log.write("[" + str(rollout) + ", " + str(modelFn.score()) + ", " +
                      str(avgQ) + ", " + str(avgR) + "]\n")
        rollout += 1

    if args.logfile:
        log.close()
    if args.save_params:
        print("Saving params...")
        modelFn.save_params(args.save_params)
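
A minimal sketch of the SIGINT handling this script assumes (stopsig and stopsigCallback are referenced above but defined at module level, outside the excerpt):

# Assumed module-level flag and handler; the original script's version may differ.
import signal

stopsig = False

def stopsigCallback(signo, frame):
    # Flip the module-level flag so the rollout loop can exit cleanly on Ctrl+C.
    global stopsig
    stopsig = True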
Example #2
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):
        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.discrete = args.discrete

        net_config = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
        }

        # Actor and Critic initialization
        self.actor = Actor(self.nb_states, self.nb_actions, **net_config)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_config)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.actor_lr)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_config)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_config)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.critic_lr)

        hard_update(self.critic_target, self.critic)
        hard_update(self.actor_target, self.actor)

        # Replay Buffer and noise
        self.memory = ReplayBuffer(args.memory_size)
        self.noise = OrnsteinUhlenbeckProcess(mu=np.zeros(nb_actions),
                                              sigma=0.2 * np.ones(nb_actions))

        self.last_state = None
        self.last_action = None

        # Hyperparameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount

        # CUDA
        self.use_cuda = args.cuda
        if self.use_cuda:
            self.cuda()

    def cuda(self):
        self.actor.to(device)
        self.actor_target.to(device)
        self.critic.to(device)
        self.critic_target.to(device)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def reset(self, obs):
        self.last_state = obs
        self.noise.reset()

    def observe(self, reward, state, done):
        self.memory.append([self.last_state, self.last_action, reward, state, done])
        self.last_state = state

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.last_action = action
        return action.argmax() if self.discrete else action

    def select_action(self, state, apply_noise=False):
        self.eval()
        action = to_numpy(self.actor(to_tensor(np.array([state]), device=device))).squeeze(0)
        self.train()
        if apply_noise:
            action = action + self.noise.sample()
        action = np.clip(action, -1., 1.)
        self.last_action = action
        #print('action:', action, 'output:', action.argmax())
        return action.argmax() if self.discrete else action

    def update_policy(self):
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)
        state = to_tensor(np.array(state_batch), device=device)
        action = to_tensor(np.array(action_batch), device=device)
        next_state = to_tensor(np.array(next_state_batch), device=device)

        # compute target Q value
        next_q_value = self.critic_target([next_state, self.actor_target(next_state)])
        target_q_value = to_tensor(reward_batch, device=device) \
                         + self.discount * to_tensor(1.0 - terminal_batch.astype(np.float32), device=device) * next_q_value

        # Critic and Actor update
        self.critic.zero_grad()
        with torch.set_grad_enabled(True):
            q_values = self.critic([state, action])
            critic_loss = criterion(q_values, target_q_value.detach())  # criterion: module-level loss, assumed to be e.g. nn.MSELoss()
            critic_loss.backward()
            self.critic_optim.step()

        self.actor.zero_grad()
        with torch.set_grad_enabled(True):
            policy_loss = -self.critic([state.detach(), self.actor(state)]).mean()
            policy_loss.backward()
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return to_numpy(-policy_loss), to_numpy(critic_loss), to_numpy(q_values.mean())

    def save_model(self, output, num=1):
        if self.use_cuda:
            self.actor.to(torch.device("cpu"))
            self.critic.to(torch.device("cpu"))
        torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.actor.to(device)
            self.critic.to(device)

    def load_model(self, output, num=1):
        self.actor.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        if self.use_cuda:
            self.cuda()
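
The DDPG class above relies on a few module-level helpers that are not shown: hard_update/soft_update for target-network synchronization, to_tensor/to_numpy for device transfers, device, and criterion for the critic loss. A minimal sketch under the usual assumptions (Polyak averaging for soft_update, MSE for criterion); the original project's versions may differ:

# Assumed helper implementations, not part of the excerpt above.
import numpy as np
import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.MSELoss()

def to_tensor(ndarray, device='cpu'):
    # Convert a NumPy array to a float32 tensor on the requested device.
    return torch.as_tensor(ndarray, dtype=torch.float32, device=device)

def to_numpy(tensor):
    # Detach from the graph and move back to host memory.
    return tensor.detach().cpu().numpy()

def hard_update(target, source):
    # Copy source parameters into the target network verbatim.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_((1.0 - tau) * target_param.data + tau * param.data)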
Example #3
def main():
    # define arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--render",
                        action="store_true",
                        help="Render the state")
    parser.add_argument("--render_interval",
                        type=int,
                        default=10,
                        help="Number of rollouts to skip before rendering")
    parser.add_argument("--num_rollouts",
                        type=int,
                        default=-1,
                        help="Number of max rollouts")
    parser.add_argument("--logfile",
                        type=str,
                        help="Indicate where to save rollout data")
    parser.add_argument(
        "--load_params",
        type=str,
        help="Load previously learned parameters from [LOAD_PARAMS]")
    parser.add_argument("--save_params",
                        type=str,
                        help="Save learned parameters to [SAVE_PARAMS]")
    args = parser.parse_args()

    signal.signal(signal.SIGINT, stopsigCallback)
    global stopsig

    # create the basketball environment
    env = BasketballVelocityEnv(fps=60.0,
                                timeInterval=0.1,
                                goal=[0, 5, 0],
                                initialLengths=np.array([0, 0, 1, 1, 0, 0, 0]),
                                initialAngles=np.array([0, 45, 0, 0, 0, 0, 0]))

    # create space
    stateSpace = ContinuousSpace(ranges=env.state_range())
    actionRange = env.action_range()
    actionSpace = DiscreteSpace(
        intervals=[15 for i in range(2)] + [1],
        ranges=[actionRange[1], actionRange[2], actionRange[7]])
    processor = JointProcessor(actionSpace)

    # create the model and policy functions
    modelFn = MxFullyConnected(sizes=[stateSpace.n + actionSpace.n, 64, 32, 1],
                               alpha=0.001,
                               use_gpu=True)
    if args.load_params:
        print("loading params...")
        modelFn.load_params(args.load_params)

    def softmax(s):
        # Subtract the max before exponentiating for numerical stability.
        e = np.exp(s - np.max(s))
        return e / np.sum(e)
    policyFn = EpsilonGreedyPolicy(
        epsilon=0.5,
        getActionsFn=lambda state: actionSpace.sample(1024),
        distributionFn=lambda qstate: softmax(modelFn(qstate)))
    dataset = ReplayBuffer()
    if args.logfile:
        log = open(args.logfile, "a")

    rollout = 0
    while args.num_rollouts == -1 or rollout < args.num_rollouts:
        print("Iteration:", rollout)
        state = env.reset()
        reward = 0
        done = False
        steps = 0
        while not done:
            if stopsig:
                break
            action = policyFn(state)
            nextState, reward, done, info = env.step(
                createAction(processor.process_env_action(action)))
            dataset.append(state, action, reward, nextState)
            state = nextState
            steps += 1
            if args.render and rollout % args.render_interval == 0:
                env.render()
        if stopsig:
            break

        dataset.reset()  # push trajectory into the dataset buffer
        modelFn.fit(processor.process_Q(dataset.sample(1024)), num_epochs=10)
        print("Reward:", reward if (reward >= 0.00001) else 0, "with Error:",
              modelFn.score(), "with steps:", steps)
        if args.logfile:
            log.write("[" + str(rollout) + ", " + str(reward) + ", " +
                      str(modelFn.score()) + "]\n")

        rollout += 1
        if rollout % 100 == 0:
            policyFn.epsilon *= 0.95
            print("Epsilon is now:", policyFn.epsilon)

    if args.logfile:
        log.close()
    if args.save_params:
        print("saving params...")
        modelFn.save_params(args.save_params)
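
The --logfile output above is one bracketed [rollout, reward, error] triple per line. A small, hypothetical helper for reading such a log back (e.g. to plot learning curves); the filename below is only a placeholder:

# Hypothetical helper, not part of the original script.
import ast

def read_rollout_log(path):
    # Each line is a Python-style list literal such as "[12, 0.5, 0.034]".
    with open(path) as f:
        return [ast.literal_eval(line.strip()) for line in f if line.strip()]

# Usage: rollouts, rewards, errors = zip(*read_rollout_log("rollouts.log"))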
Example #4
def train(config_file_path: str, save_dir: str, use_vime: bool, random_policy: bool, device: str, visualize_interval: int):
    conf_d = toml.load(config_file_path)
    conf = namedtuple('Config', conf_d.keys())(*conf_d.values())

    # Check if saving directory is valid
    if "test" in save_dir and os.path.exists(save_dir):
        shutil.rmtree(save_dir)
    if os.path.exists(save_dir):
        raise ValueError("Directory {} already exists.".format(save_dir))
    # Create save dir
    os.makedirs(save_dir)
    ckpt_dir = os.path.join(save_dir, 'checkpoints')
    os.makedirs(ckpt_dir)
    log_dir = os.path.join(save_dir, 'logs')
    os.makedirs(log_dir)
    # Save config file
    shutil.copyfile(config_file_path, os.path.join(save_dir, os.path.basename(config_file_path)))

    # Set random seeds
    np.random.seed(int(time.time()))
    torch.manual_seed(int(time.time()))
    device = torch.device(device)
    if device.type == 'cuda':
        torch.cuda.manual_seed(int(time.time()))

    # Set up log metrics
    metrics = {
        'episode': [],
        'collected_samples': [],
        'reward': [], # cumulative reward
        'curiosity_reward': [], # cumulative reward including information gain
        'likelihood': [], # likelihood of the learned dynamics model
        'D_KL_median': [], 'D_KL_mean': [],
        'q1_loss': [], 'policy_loss': [], 'alpha_loss': [], 'alpha': [],
        'ELBO': [],
        'step': [], 'step_reward': [],
        'test_episode': [], 'test_reward': [],
    }

    # Set up environment
    print("----------------------------------------\nTrain in {}\n----------------------------------------".format(conf.environment))
    env = gym.make(conf.environment)

    if use_vime:
        print("Use VIME")
    if random_policy:
        print("Keep using random policy.")

    # Training set up
    agent = SAC(env.observation_space, env.action_space, device, **conf.agent)
    memory = ReplayBuffer(conf.replay_buffer_capacity, env.observation_space.shape, env.action_space.shape)
    vime = VIME(env.observation_space.shape[0], env.action_space.shape[0], device, **conf.vime) if use_vime else None
    # Load checkpoint if specified in config
    if conf.checkpoint != '':
        ckpt = torch.load(conf.checkpoint, map_location=device)
        metrics = ckpt['metrics']
        agent.load_state_dict(ckpt['agent'])
        memory.load_state_dict(ckpt['memory'])
        if use_vime:
            vime.load_state_dict(ckpt['vime'])

    def save_checkpoint():
        # Save checkpoint
        ckpt = {'metrics': metrics, 'agent': agent.state_dict(), 'memory': memory.state_dict()}
        if use_vime:
            ckpt['vime'] = vime.state_dict()
        path = os.path.join(ckpt_dir, 'checkpoint.pth')
        torch.save(ckpt, path)

        # Save agent model only
        model_ckpt = {'agent': agent.state_dict()}
        model_path = os.path.join(ckpt_dir, 'model.pth')
        torch.save(model_ckpt, model_path)

        # Save metrics only
        metrics_ckpt = {'metrics': metrics}
        metrics_path = os.path.join(ckpt_dir, 'metrics.pth')
        torch.save(metrics_ckpt, metrics_path)

    # Train agent
    init_episode = 0 if len(metrics['episode']) == 0 else metrics['episode'][-1] + 1
    pbar = tqdm.tqdm(range(init_episode, conf.episodes))
    reward_moving_avg = None
    moving_avg_coef = 0.1
    agent_update_count = 0
    total_steps = 0

    for episode in pbar:
        o = env.reset()
        rewards, curiosity_rewards = [], []
        info_gains = []
        log_likelihoods = []
        q1_losses, q2_losses, policy_losses, alpha_losses, alphas = [],[],[],[],[]

        for t in range(conf.horizon):
            if len(memory) < conf.random_sample_num or random_policy:
                a = env.action_space.sample()
            else:
                a = agent.select_action(o, eval=False)

            o_next, r, done, _ = env.step(a)
            total_steps += 1
            metrics['step'].append(total_steps)
            metrics['step_reward'].append(r)
            done = False if t == env._max_episode_steps - 1 else bool(done)  # done should be False if an episode is terminated forcefully
            rewards.append(r)

            if use_vime and len(memory) >= conf.random_sample_num:
                # Calculate curiosity reward in VIME
                info_gain, log_likelihood = vime.calc_info_gain(o, a, o_next)
                assert not np.isnan(info_gain).any() and not np.isinf(info_gain).any(), "invalid information gain, {}".format(info_gain)
                info_gains.append(info_gain)
                log_likelihoods.append(log_likelihood)
                vime.memorize_episodic_info_gains(info_gain)            
                r = vime.calc_curiosity_reward(r, info_gain)
            curiosity_rewards.append(r)

            memory.append(o, a, r, o_next, done)
            o = o_next

            # Update agent
            if len(memory) >= conf.random_sample_num and not random_policy:
                for _ in range(conf.agent_update_per_step):
                    batch_data = memory.sample(conf.agent_update_batch_size)
                    q1_loss, q2_loss, policy_loss, alpha_loss, alpha = agent.update_parameters(batch_data, agent_update_count)
                    q1_losses.append(q1_loss)
                    q2_losses.append(q2_loss)
                    policy_losses.append(policy_loss)
                    alpha_losses.append(alpha_loss)
                    alphas.append(alpha)
                    agent_update_count += 1

            if done:
                break

        if len(log_likelihoods) == 0:
            log_likelihoods.append(-np.inf)

        # Display performance
        episodic_reward = np.sum(rewards)
        reward_moving_avg = episodic_reward if reward_moving_avg is None else (1-moving_avg_coef) * reward_moving_avg + moving_avg_coef * episodic_reward
        if use_vime:
            pbar.set_description("EPISODE {}, TOTAL STEPS {}, SAMPLES {} --- Steps {}, Curiosity {:.1f}, Rwd {:.1f} (m.avg {:.1f}), Likelihood {:.2E}".format(
                episode, memory.step, len(memory), len(rewards), np.sum(curiosity_rewards), episodic_reward, reward_moving_avg, np.mean(np.exp(log_likelihoods))))
        else:
            pbar.set_description("EPISODE {}, TOTAL STEPS {}, SAMPLES {} --- Steps {}, Rwd {:.1f} (mov avg {:.1f})".format(
                episode, memory.step, len(memory), len(rewards), episodic_reward, reward_moving_avg))

        # Save episodic metrics
        metrics['episode'].append(episode)
        metrics['collected_samples'].append(total_steps)
        metrics['reward'].append(episodic_reward)
        metrics['curiosity_reward'].append(np.sum(curiosity_rewards))
        metrics['likelihood'].append(np.mean(np.exp(log_likelihoods)))
        if episode % visualize_interval == 0:
            lineplot(metrics['step'][-len(metrics['step_reward']):], metrics['step_reward'], 'stepwise_reward', log_dir, xaxis='total step')
            lineplot(metrics['episode'][-len(metrics['reward']):], metrics['reward'], 'reward', log_dir)
            lineplot(metrics['collected_samples'][-len(metrics['reward']):], metrics['reward'], 'sample-reward', log_dir, xaxis='total step')
            lineplot(metrics['episode'][-len(metrics['curiosity_reward']):], metrics['curiosity_reward'], 'curiosity_reward', log_dir)
            lineplot(metrics['episode'][-len(metrics['likelihood']):], metrics['likelihood'], 'likelihood', log_dir)
        # Agent update related metrics
        if len(policy_losses) > 0 and not random_policy:
            metrics['q1_loss'].append(np.mean(q1_losses))
            metrics['policy_loss'].append(np.mean(policy_losses))
            metrics['alpha_loss'].append(np.mean(alpha_losses))
            metrics['alpha'].append(np.mean(alphas))
            if episode % visualize_interval == 0:
                lineplot(metrics['episode'][-len(metrics['q1_loss']):], metrics['q1_loss'], 'q1_loss', log_dir)
                lineplot(metrics['episode'][-len(metrics['policy_loss']):], metrics['policy_loss'], 'policy_loss', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha_loss']):], metrics['alpha_loss'], 'alpha_loss', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha']):], metrics['alpha'], 'alpha', log_dir)

        # Update VIME
        if use_vime and len(memory) >= conf.random_sample_num:
            for _ in range(conf.vime_update_per_episode):
                batch_s, batch_a, _, batch_s_next, _ = memory.sample(conf.vime_update_batch_size)
                elbo = vime.update_posterior(batch_s, batch_a, batch_s_next)
            metrics['ELBO'].append(elbo)
            lineplot(metrics['episode'][-len(metrics['ELBO']):], metrics['ELBO'], 'ELBO', log_dir)
            if len(info_gains) > 0:
                metrics['D_KL_median'].append(np.median(info_gains))
                metrics['D_KL_mean'].append(np.mean(info_gains))
                multiple_lineplot(metrics['episode'][-len(metrics['D_KL_median']):], np.array([metrics['D_KL_median'], metrics['D_KL_mean']]).T, 'D_KL', ['median', 'mean'], log_dir)

        # Test current policy
        if episode % conf.test_interval == 0:
            rewards = []
            for _ in range(conf.test_times):
                o = env.reset()
                done = False
                episode_reward = 0
                while not done:
                    a = agent.select_action(o, eval=True)
                    o_next, r, done, _ = env.step(a)
                    episode_reward += r
                    o = o_next

                rewards.append(episode_reward)

            mean, std = np.mean(rewards), np.std(rewards)
            print("\nTEST AT EPISODE {} ({} episodes) --- Avg. Reward {:.2f} (+- {:.2f})".format(episode, conf.test_times, mean, std))

            metrics['test_episode'].append(episode)
            metrics['test_reward'].append(rewards)
            lineplot(metrics['test_episode'][-len(metrics['test_reward']):], metrics['test_reward'], 'test_reward', log_dir)
            

        # Save checkpoint
        if episode % conf.checkpoint_interval == 0:
            save_checkpoint()

    save_checkpoint()
    # Save the final model
    torch.save({'agent': agent.state_dict()}, os.path.join(ckpt_dir, 'final_model.pth'))
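
A minimal command-line entry point for the train() function above, assuming plain argparse (the original project may wire this up differently, e.g. with click):

# Hypothetical CLI wrapper, not part of the original module.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, required=True, help='Path to the TOML config file')
    parser.add_argument('--save-dir', type=str, required=True, help='Directory for checkpoints and logs')
    parser.add_argument('--use-vime', action='store_true', help='Add VIME curiosity rewards')
    parser.add_argument('--random-policy', action='store_true', help='Collect data with a random policy only')
    parser.add_argument('--device', type=str, default='cpu', help="Torch device, e.g. 'cpu' or 'cuda:0'")
    parser.add_argument('--visualize-interval', type=int, default=10, help='Episodes between metric plots')
    args = parser.parse_args()
    train(args.config, args.save_dir, args.use_vime, args.random_policy, args.device, args.visualize_interval)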