Example #1
    def __init__(self, config, dev):
        self.dev = dev
        self.num_env = config['num_envs']
        self.get_img_from_render = config['get_img_from_render']

        self.obs_shape = (self.num_env,) + config['obs_space'][1:]
        self.reward_shape = (self.num_env,) + config['reward_space'][1:]
        self.gamma_shape = (self.num_env,) + config['gamma_space'][1:]

        if self.num_env == 1:
            self.env = gym.make(config['game_name'])
        else:
            def make_env():
                def _thunk():
                    env = gym.make(config['game_name'])
                    return env
                return _thunk
            envs = [make_env() for i in range(self.num_env)]
            self.env = SubprocVecEnv(envs)
Example #2
    def __init__(self, num_env_workers, make_env_func, agent, batch_size,
                 rollout_length, num_recurrence_steps, state_shape,
                 action_shape, stats):
        '''- One agent is assigned to a collector.
           - A collector runs a number of envs in parallel to feed that agent.
           - You could run several collectors simultaneously,
             and then apply weight mixing across the agents separately
             (a sketch of such weight mixing follows this constructor).
        '''
        self.num_env_workers = num_env_workers
        self.envs = SubprocVecEnv(
            [make_env_func() for i in range(num_env_workers)])
        self.agent = agent
        self.batch_size = batch_size
        self.rollout_length = rollout_length
        self.num_recurrence_steps = num_recurrence_steps
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.stats = stats

        self.buffer_full = False
        self.GAE_calculated = False

        self.gamma = 0.8
        self.tau = 0.8

        self.rollout_indices = np.zeros(batch_size)
        self.buffer_width = self.rollout_length + self.num_recurrence_steps - 1
        self.states = torch.zeros(
            (batch_size, self.buffer_width + 1, *state_shape),
            dtype=torch.float32).to(self.agent.device)
        self.actions = torch.zeros(
            (batch_size, self.buffer_width + 1, *action_shape),
            dtype=torch.float32).to(self.agent.device)
        self.log_probs = torch.zeros(
            (batch_size, self.buffer_width + 1, *action_shape),
            dtype=torch.float32).to(self.agent.device)
        self.values = torch.zeros((batch_size, self.buffer_width + 1, 1),
                                  dtype=torch.float32).to(self.agent.device)
        self.rewards = torch.zeros((batch_size, self.buffer_width + 1, 1),
                                   dtype=torch.float32).to(self.agent.device)
        self.done_masks = torch.zeros(
            (batch_size, self.buffer_width + 1, 1),
            dtype=torch.float32).to(self.agent.device)
        self.advantages = torch.zeros(
            (batch_size, self.buffer_width + 1, 1),
            dtype=torch.float32).to(self.agent.device)
        self.returns = torch.zeros((batch_size, self.buffer_width + 1, 1),
                                   dtype=torch.float32).to(self.agent.device)

        self.state = self.envs.reset()
        self.hidden_state = torch.zeros(
            (1, self.num_env_workers,
             self.agent.hidden_state_size)).to(self.agent.device)
        self.cell_state = torch.zeros(
            (1, self.num_env_workers,
             self.agent.hidden_state_size)).to(self.agent.device)
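The docstring above mentions mixing weights across the agents of several collectors; a minimal sketch of that idea (the function name and the plain averaging scheme are assumptions, not code from this project):

# Sketch (assumption): average the parameters of several agents' networks,
# e.g. after each collector has trained its own agent on its own rollouts.
import torch

def mix_agent_weights(agents):
    state_dicts = [a.state_dict() for a in agents]   # assumes the agents are nn.Modules
    mixed = {k: torch.mean(torch.stack([sd[k].float() for sd in state_dicts]), dim=0)
             for k in state_dicts[0]}
    for a in agents:
        a.load_state_dict(mixed)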
Example #3
def gen_multi_envs(n_envs, policy):
    def make_env():
        def _thunk():
            env = gen_env(policy)
            return env

        return _thunk

    envs = [make_env() for i in range(n_envs)]
    envs = SubprocVecEnv(envs)
    return envs
Example #4
def main():

    pixels = (
        (0.0, 1.0, 1.0),
        (0.0, 1.0, 0.0),
        (0.0, 0.0, 1.0),
        (1.0, 1.0, 1.0),
        (1.0, 1.0, 0.0),
        (0.0, 0.0, 0.0),
        (1.0, 0.0, 0.0),
    )
    pixel_to_categorical = {pix: i for i, pix in enumerate(pixels)}
    num_pixels = len(pixels)

    #For each mode in MiniPacman there are different rewards
    mode_rewards = {
        "regular": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        "avoid": [0.1, -0.1, -5, -10, -20],
        "hunt": [0, 1, 10, -20],
        "ambush": [0, -0.1, 10, -20],
        "rush": [0, -0.1, 9.9]
    }
    reward_to_categorical = {
        mode: {reward: i
               for i, reward in enumerate(mode_rewards[mode])}
        for mode in mode_rewards.keys()
    }

    mode = "regular"
    num_envs = 16

    def make_env():
        def _thunk():
            env = MiniPacman(mode, 1000)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n

    env_model = EnvModel(envs.observation_space.shape, num_pixels,
                         len(mode_rewards["regular"]))
    actor_critic = ActorCritic(envs.observation_space.shape,
                               envs.action_space.n)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(env_model.parameters())
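Example #15 later in this collection calls helpers such as pix_to_target and rewards_to_target that are not shown in these excerpts; a minimal sketch of how they could be built from the pixel_to_categorical and reward_to_categorical dictionaries above (an assumption, not the project's actual code):

# Sketch (assumption): one class index per pixel / per reward, derived from the
# dictionaries defined above.
def pix_to_target(next_states):
    # next_states: (batch, 3, H, W) float array
    target = []
    for pixel in next_states.transpose(0, 2, 3, 1).reshape(-1, 3):
        target.append(pixel_to_categorical[tuple(round(float(p), 1) for p in pixel)])
    return target

def rewards_to_target(mode, rewards):
    return [reward_to_categorical[mode][reward] for reward in rewards]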
Example #5
def main():
    num_envs = 16
    env_name = "CartPole-v0"

    def make_env():
        def _thunk():
            env = gym.make(env_name)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    env = gym.make("CartPole-v0")

    STATE_SIZE = env.observation_space.shape[0]
    N_ACTIONS = env.action_space.n

    agent = Agent(STATE_SIZE, N_ACTIONS)

    trainer = Trainer(envs, agent, lr=3e-4)
    trainer.train(epochs=10000, max_steps=5, test_every=50)
Example #6
File: ppo.py Project: CAiM-lab/PPO
    def __init__(self, args):
        """"Constructor which allows the PPO class to initialize the attributes of the class"""
        self.args = args
        self.random_seed()
        # Check if GPU is available via CUDA driver
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        # Initialize the actor critic class
        self.actor_critic = ActorCritic(
            self.args.nb_states, self.args.nb_actions,
            self.args.hidden_layer_size).to(self.device)
        # Define the optimizer used for the optimization of the surrogate loss
        self.optimizer = self.args.optimizer(self.actor_critic.parameters(),
                                             self.args.lr)

        # For training multiple instances of the env are needed (Shoulder model)
        self.envs = [self.make_env() for i in range(self.args.num_envs)]
        self.envs = SubprocVecEnv(self.envs)
        # To validate the intermediate learning process one test env is needed
        self.env_test = self.args.env
        self.env_test.seed(self.args.seed)
        self.env_test.set_scaling(self.args.output_scaling)

        # Lists for TensorBoard to visualize the learning process during training
        self.test_rewards = []
        self.loss = []
        self.lr = []
        self.actor_grad_weight = []
        self.action_bang_bang = []

        self.lr.append(self.args.lr)

        # Dump bin files
        if self.args.play is False:
            self.output_path = "trained_models" + '/PPO_{}'.format(
                datetime.now().strftime('%Y%b%d_%H%M%S')) + "/"
            os.mkdir(self.output_path)
            self.writer = SummaryWriter(self.output_path)
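The constructor above calls self.make_env(), which is not part of this excerpt; a plausible sketch (purely an assumption about the project) returns a thunk so that SubprocVecEnv can build one fresh environment per worker process:

    # Hypothetical sketch of the missing helper; env_constructor is an assumed
    # callable on args that builds a fresh shoulder-model environment.
    def make_env(self):
        def _thunk():
            env = self.args.env_constructor()
            env.seed(self.args.seed)
            env.set_scaling(self.args.output_scaling)
            return env
        return _thunk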
Example #7
num_envs = 8
env_name = "CartPole-v0"


def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env

    return _thunk


plt.ion()
envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)  # 8 env

env = gym.make(env_name)  # a single env
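The make_env/_thunk pattern used throughout these examples exists because SubprocVecEnv expects a list of callables rather than env instances, so each worker process can construct its own environment after forking. A self-contained sketch of the pattern (the import path is an assumption; some projects import SubprocVecEnv from OpenAI baselines, others bundle their own copy):

import gym
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv  # import path is an assumption

def make_cartpole(env_id="CartPole-v0"):
    def _thunk():
        return gym.make(env_id)
    return _thunk

if __name__ == "__main__":
    vec_env = SubprocVecEnv([make_cartpole() for _ in range(4)])
    obs = vec_env.reset()                                  # stacked observations: (4, obs_dim)
    actions = [vec_env.action_space.sample() for _ in range(4)]
    obs, rewards, dones, infos = vec_env.step(actions)     # one transition per worker
    vec_env.close()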


class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()

        self.critic = nn.Sequential(nn.Linear(num_inputs, hidden_size),
                                    nn.ReLU(), nn.Linear(hidden_size, 1))

        self.actor = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
Example #8
def main():
    mode = "regular"
    num_envs = 16

    def make_env():
        def _thunk():
            env = MiniPacman(mode, 1000)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape

    #a2c hyperparams:
    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 5
    num_frames = int(10e3)

    #rmsprop hyperparams:
    lr = 7e-4
    eps = 1e-5
    alpha = 0.99

    #Init a2c and rmsprop
    actor_critic = ActorCritic(envs.observation_space.shape,
                               envs.action_space.n)
    optimizer = optim.RMSprop(actor_critic.parameters(),
                              lr,
                              eps=eps,
                              alpha=alpha)

    #if USE_CUDA:
    #    actor_critic = actor_critic.cuda()

    rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)
    #rollout.cuda()

    all_rewards = []
    all_losses = []

    state = envs.reset()
    state = torch.FloatTensor(np.float32(state))

    rollout.states[0].copy_(state)

    episode_rewards = torch.zeros(num_envs, 1)
    final_rewards = torch.zeros(num_envs, 1)

    for i_update in tqdm(range(num_frames)):

        for step in range(num_steps):
            action = actor_critic.act(autograd.Variable(state))

            next_state, reward, done, _ = envs.step(
                action.squeeze(1).cpu().data.numpy())

            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            #if USE_CUDA:
            #    masks = masks.cuda()

            state = torch.FloatTensor(np.float32(next_state))
            rollout.insert(step, state, action.data, reward, masks)

        _, next_value = actor_critic(
            autograd.Variable(rollout.states[-1], volatile=True))
        next_value = next_value.data

        returns = rollout.compute_returns(next_value, gamma)

        logit, action_log_probs, values, entropy = actor_critic.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        values = values.view(num_steps, num_envs, 1)
        action_log_probs = action_log_probs.view(num_steps, num_envs, 1)
        advantages = autograd.Variable(returns) - values

        value_loss = advantages.pow(2).mean()
        action_loss = -(autograd.Variable(advantages.data) *
                        action_log_probs).mean()

        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        loss.backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        if i_update % num_frames == 0:
            all_rewards.append(final_rewards.mean())
            all_losses.append(loss.item())

            #clear_output(True)
            plt.figure(figsize=(20, 5))
            plt.subplot(131)
            plt.title('epoch %s. reward: %s' %
                      (i_update, np.mean(all_rewards[-10:])))
            plt.plot(all_rewards)
            plt.subplot(132)
            plt.title('loss %s' % all_losses[-1])
            plt.plot(all_losses)
            plt.show()

        rollout.after_update()

    torch.save(actor_critic.state_dict(), "actor_critic_" + mode)

    import time

    def displayImage(image, step, reward):
        #clear_output(True)
        s = "step: " + str(step) + " reward: " + str(reward)
        plt.figure(figsize=(10, 3))
        plt.title(s)
        plt.imshow(image)
        plt.show()
        time.sleep(0.1)

    env = MiniPacman(mode, 1000)

    done = False
    state = env.reset()
    total_reward = 0
    step = 1

    while not done:
        current_state = torch.FloatTensor(state).unsqueeze(0)
        #if USE_CUDA:
        #    current_state = current_state.cuda()

        action = actor_critic.act(autograd.Variable(current_state))

        next_state, reward, done, _ = env.step(action.data[0, 0])
        total_reward += reward
        state = next_state

        image = torch.FloatTensor(state).permute(1, 2, 0).cpu().numpy()
        displayImage(image, step, total_reward)
        step += 1
Example #9
def create_envs(p, args, N):  #creates multiple environments for training
    urdf_path = os.path.join(BASE_DIR, os.pardir, "snake/snake.urdf")
    envs = [make_env(p, urdf_path, args=args) for i in range(N)]
    envs = SubprocVecEnv(envs)

    return envs
Example #10
def main():

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n
    num_rewards = len(task_rewards[mode])

    full_rollout = True

    env_model = EnvModel(envs.observation_space.shape, num_pixels, num_rewards)
    env_model.load_state_dict(torch.load("env_model_" + mode))

    distil_policy = ActorCritic(envs.observation_space.shape,
                                envs.action_space.n)
    distil_optimizer = optim.Adam(distil_policy.parameters())

    imagination = ImaginationCore(1,
                                  state_shape,
                                  num_actions,
                                  num_rewards,
                                  env_model,
                                  distil_policy,
                                  full_rollout=full_rollout)

    actor_critic = I2A(state_shape,
                       num_actions,
                       num_rewards,
                       256,
                       imagination,
                       full_rollout=full_rollout)
    #rmsprop hyperparams:
    lr = 7e-4
    eps = 1e-5
    alpha = 0.99
    optimizer = optim.RMSprop(actor_critic.parameters(),
                              lr,
                              eps=eps,
                              alpha=alpha)

    #if USE_CUDA:
    #    env_model     = env_model.cuda()
    #    distil_policy = distil_policy.cuda()
    #    actor_critic  = actor_critic.cuda()

    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 5
    num_frames = int(10e5)

    rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)
    #rollout.cuda()

    all_rewards = []
    all_losses = []

    state = envs.reset()
    current_state = torch.FloatTensor(np.float32(state))

    rollout.states[0].copy_(current_state)

    episode_rewards = torch.zeros(num_envs, 1)
    final_rewards = torch.zeros(num_envs, 1)

    for i_update in tqdm(range(num_frames)):

        for step in range(num_steps):
            #if USE_CUDA:
            #    current_state = current_state.cuda()
            action = actor_critic.act(autograd.Variable(current_state))

            next_state, reward, done, _ = envs.step(
                action.squeeze(1).cpu().data.numpy())

            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            #if USE_CUDA:
            #    masks = masks.cuda()

            current_state = torch.FloatTensor(np.float32(next_state))
            rollout.insert(step, current_state, action.data, reward, masks)

        _, next_value = actor_critic(
            autograd.Variable(rollout.states[-1], volatile=True))
        next_value = next_value.data

        returns = rollout.compute_returns(next_value, gamma)

        logit, action_log_probs, values, entropy = actor_critic.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        distil_logit, _, _, _ = distil_policy.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        distil_loss = 0.01 * (F.softmax(logit).detach() *
                              F.log_softmax(distil_logit)).sum(1).mean()

        values = values.view(num_steps, num_envs, 1)
        action_log_probs = action_log_probs.view(num_steps, num_envs, 1)
        advantages = autograd.Variable(returns) - values

        value_loss = advantages.pow(2).mean()
        action_loss = -(autograd.Variable(advantages.data) *
                        action_log_probs).mean()

        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        loss.backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        distil_optimizer.zero_grad()
        distil_loss.backward()
        distil_optimizer.step()

        if i_update % 100 == 0:
            all_rewards.append(final_rewards.mean())
            all_losses.append(loss.item())

            #clear_output(True)
            plt.figure(figsize=(20, 5))
            plt.subplot(131)
            plt.title('epoch %s. reward: %s' %
                      (i_update, np.mean(all_rewards[-10:])))
            plt.plot(all_rewards)
            plt.subplot(132)
            plt.title('loss %s' % all_losses[-1])
            plt.plot(all_losses)
            plt.show()

        rollout.after_update()

    torch.save(actor_critic.state_dict(), "i2a_" + mode)
Example #11
def main():
    current_time = time.ctime().replace(":", "_")
    log_dir = "logs/PPO/{}".format(current_time)
    # tensorboard
    writer = SummaryWriter(log_dir=log_dir)

    # csv
    logfile_name = "{}/train_log.csv".format(log_dir)
    with open(logfile_name, 'w+', newline='') as f:
        csv_writer = csv.writer(f, delimiter=";")
        csv_writer.writerow([
            'update', 'running_loss', 'Reward', 'loss', 'actor_loss',
            'critic_loss', 'entropy_loss', 'time'
        ])

    ############## Hyperparameters ##############
    # env_name = "CartPole-v0"
    # creating environment
    envs = SubprocVecEnv([
        lambda: rpg.Environment('gym', "Neo"),
        lambda: rpg.Environment('gym', "Morpheus"),
        lambda: rpg.Environment('gym', "Trinity"),
        lambda: rpg.Environment('gym', "Oracle"),
        lambda: rpg.Environment('gym', "Cypher"),
        lambda: rpg.Environment('gym', "Tank"),
        lambda: rpg.Environment('gym', "Agent_Smith"),
        lambda: rpg.Environment('gym', "Dozer")
    ])

    env = VecPyTorch(envs, device)

    state_dim = (3, 64, 64)
    action_dim = env.action_space.n
    save_freq = 10000
    print_freq = 10
    max_episodes = 500001  # max training episodes
    max_timesteps = 5  # max timesteps in one episode
    n_latent_var = 256  # number of variables in hidden layer
    update_timestep = 15  # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99  # discount factor
    K_epochs = 4  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    random_seed = 11
    actor_loss = 0
    critic_loss = 0
    entropy_loss = 0
    loss = 0
    #############################################

    if random_seed:
        os.environ['PYTHONHASHSEED'] = str(random_seed)
        random.seed(random_seed)
        numpy.random.seed(random_seed)
        torch.manual_seed(random_seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    state, minimap = env.reset()

    # training loop
    for i_episode in range(1, max_episodes + 1):
        # state, minimap = env.reset()
        for t in range(max_timesteps):
            timestep += 1

            # Running policy_old:
            dist, _ = ppo.policy_old(state, minimap)
            action = dist.sample()
            state, minimap, reward, done, _ = env.step(action.unsqueeze(1))
            memory.states.append(state)
            memory.maps.append(minimap)
            memory.actions.append(action)
            memory.logprobs.append(dist.log_prob(action))

            # Saving reward and is_terminal:
            memory.rewards.append(reward.to(device).squeeze())
            memory.is_terminals.append(done)

            # update if it's time
            if timestep % update_timestep == 0:
                loss, actor_loss, critic_loss, entropy_loss = ppo.update(
                    memory)
                memory.clear_memory()
                timestep = 0

            running_reward += reward.mean().item()

        # avg_length += t

        # logging
        if i_episode % print_freq == 0:
            print("********************************************************")
            print("episode: {0}".format(i_episode))
            print("mean/median reward: {:.1f}/{:.1f}".format(
                reward.mean(), reward.median()))
            print("min/max reward: {:.1f}/{:.1f}".format(
                reward.min(), reward.max()))
            print("actor loss: {:.5f}, critic loss: {:.5f}, entropy: {:.5f}".
                  format(actor_loss, critic_loss, entropy_loss))
            print("Loss: {0}".format(loss))
            print("********************************************************")

        # log data to TensorBoard
        writer.add_scalar('Loss/Loss', loss, i_episode)
        writer.add_scalar('Loss/Actor Loss', actor_loss, i_episode)
        writer.add_scalar('Loss/Critic Loss', critic_loss, i_episode)
        writer.add_scalar('Loss/Entropy', entropy_loss, i_episode)
        writer.add_scalar('Reward/Running Reward', running_reward, i_episode)

        writer.add_scalar('Reward/Min', reward.min(), i_episode)
        writer.add_scalar('Reward/Max', reward.max(), i_episode)
        writer.add_scalar('Reward/Mean', reward.mean(), i_episode)
        writer.add_scalar('Reward/Median', reward.median(), i_episode)
        writer.add_scalar('Reward/Sum', reward.sum(), i_episode)

        with open(logfile_name, 'a+', newline='') as f:
            csv_writer = csv.writer(f, delimiter=";")
            csv_writer.writerow([
                i_episode, running_reward,
                reward.mean(), loss, actor_loss, critic_loss, entropy_loss,
                time.ctime()
            ])

        if save_freq > 0 and i_episode % save_freq == 0:
            torch.save(ppo.policy.state_dict(), '{}/model.pth'.format(log_dir))
            torch.save(ppo.policy_old.state_dict(),
                       '{}/model_old.pth'.format(log_dir))
            print("saved")
Example #12
def main():
    num_envs = 16
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    env = gym.make("CartPole-v0")

    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n
    # Hyper params:
    hidden_size = 256
    lr = 3e-4
    num_steps = 5

    model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)

    optimizer = optim.Adam(model.parameters())

    max_frames = 20000
    frame_idx = 0
    test_rewards = []
    state = envs.reset()

    while frame_idx < max_frames:

        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0

        # Each worker env runs num_steps steps, i.e. n-step sampling
        for _ in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)
            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())
            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()

            # Record the per-env quantities for these num_steps steps
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

            state = next_state
            frame_idx += 1

            if frame_idx % 100 == 0:
                test_rewards.append(np.mean([test_env(model, env) for _ in range(10)]))
                plot(frame_idx, test_rewards)

        # Pass the workers' results back to the main network and update the parameters
        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks)

        # Concatenate the values across the 5 steps
        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values
        # Compute the mean of each loss term
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
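compute_returns is not shown in these excerpts; a standard n-step bootstrapped return consistent with how it is called above (the default gamma is an assumption) looks like:

# Sketch (assumption): discounted n-step returns, bootstrapped from next_value
# and cut off at episode boundaries by the 0/1 masks.
def compute_returns(next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns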
Example #13
File: train.py Project: ProxJ/play
def train(env, agent, flags):
    """"""

    # set random seeds (for reproducibility)
    torch.manual_seed(flags['seed'])
    torch.cuda.manual_seed_all(flags['seed'])
    envs = [make_env(flags['env'], flags['seed'], i) for i in range(flags['num_envs'])]
    envs = SubprocVecEnv(envs)

    # instantiate the policy and optimiser
    num_inputs  = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n
    model = agent.model  # assumption: the agent exposes its actor-critic network
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # learning_rate is assumed to be defined elsewhere in the project
    
    current_step_number = 0
    test_rewards = []
    state = envs.reset()

    
    while current_step_number < flags['max_steps']:
        
        log_probs = []
        values    = []
        rewards   = []
        masks     = []
        entropy = 0

        for _ in range(flags['num_step_td_update']):

            # sample an action from the policy distribution
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)  # assumption: the network returns (action distribution, state value)
            action = dist.sample()
            # take a step in the environment
            next_state, reward, done, _ = envs.step(action.cpu().numpy())

            # compute the log probability of the sampled action
            log_prob = dist.log_prob(action)
            # compute the entropy bonus
            entropy += dist.entropy().mean()

            # save the log probability, value and reward
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))



            # if done, save episode rewards

            state = next_state
            current_step_number += 1
            
            if current_step_number % 1000 == 0 and flags['plot_test']:
                test_rewards.append(np.mean([test_env(model) for _ in range(10)]))
                plot(current_step_number, test_rewards)

        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)
   
        # calculate the discounted return of the episode
        returns = compute_returns(next_value, rewards, masks)

        log_probs = torch.cat(log_probs)
        returns   = torch.cat(returns).detach()
        values    = torch.cat(values)

        advantage = returns - values

        actor_loss  = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        # loss function
        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    return rewards
Example #14
def dqn_algorithm(ENV_NAME,
                  NUM_ENV=8,
                  SEED=1,
                  TOTAL_TIMESTEPS=100000,
                  GAMMA=0.95,
                  MEMORY_SIZE=1000,
                  BATCH_SIZE=32,
                  EXPLORATION_MAX=1.0,
                  EXPLORATION_MIN=0.02,
                  EXPLORATION_FRACTION=0.7,
                  TRAINING_FREQUENCY=1000,
                  FILE_PATH='results/',
                  SAVE_MODEL=False,
                  MODEL_FILE_NAME='model',
                  LOG_FILE_NAME='log',
                  TIME_FILE_NAME='time',
                  PRINT_FREQ=100,
                  N_EP_AVG=100,
                  VERBOSE='False',
                  MLP_LAYERS=[64, 64],
                  MLP_ACTIVATIONS=['relu', 'relu'],
                  LEARNING_RATE=1e-3,
                  EPOCHS=1,
                  GRAD_CLIP=False,
                  DOUBLE_DQN=False,
                  USE_TARGET_NETWORK=True,
                  TARGET_UPDATE_FREQUENCY=5000,
                  LOAD_WEIGHTS=False,
                  LOAD_WEIGHTS_MODEL_PATH='results/model0.h5'):
    '''
    DQN Algorithm execution

    env_name : string for a gym environment
    num_env : no. of environments for vectorization (multiprocessing envs)
    seed : random seed for reproducibility
    total_timesteps : total number of timesteps
    training_frequency : frequency of training (experience replay)
    gamma : discount factor
    memory_size : replay buffer size
    batch_size : batch size for experience replay
    exploration_max : maximum exploration at the beginning
    exploration_min : minimum exploration at the end
    exploration_fraction : fraction of total timesteps over which the exploration decay takes place
    file_path : output filepath
    save_model : boolean to specify whether the model is to be saved
    model_file_name : name of file to save the model at the end of learning
    log_file_name : name of file to store DQN results
    time_file_name : name of file to store computation time
    print_freq : episodic frequency at which results are printed
    n_ep_avg : no. of episodes to consider while computing the average reward
    verbose : print episodic results
    mlp_layers : list of neurons in each hidden layer of the DQN network
    mlp_activations : list of activation functions for each hidden layer of the DQN network
    learning_rate : learning rate for the neural network
    epochs : no. of epochs in every experience replay
    grad_clip : boolean to specify whether to use gradient clipping in the optimizer (grad-clip value 10.0)
    double_dqn : boolean to specify whether to employ double DQN
    use_target_network : boolean to use a target neural network in DQN
    target_update_frequency : timestep frequency at which weights are copied from the online network to the target network
    load_weights : boolean to specify whether to use a prespecified model to initialize the weights of the neural network
    load_weights_model_path : path of the model to use for weight initialization
    '''

    before = time.time()
    num_envs = NUM_ENV
    env_name = ENV_NAME

    if TOTAL_TIMESTEPS % NUM_ENV:
        print('Error: total timesteps is not divisible by no. of envs')
        return

    def make_env():
        def _thunk():
            env = gym.make(env_name)
            env.seed(SEED)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    # for reproducibility
    set_seed(SEED)

    observation_space = envs.observation_space.shape[0]
    action_space = envs.action_space.n

    dqn_solver = DQNSolver(observation_space, action_space, MLP_LAYERS,
                           MLP_ACTIVATIONS, LEARNING_RATE, EPOCHS,
                           USE_TARGET_NETWORK, GRAD_CLIP, DOUBLE_DQN,
                           LOAD_WEIGHTS, LOAD_WEIGHTS_MODEL_PATH,
                           TOTAL_TIMESTEPS, MEMORY_SIZE, BATCH_SIZE, GAMMA,
                           EXPLORATION_MAX, EXPLORATION_MIN,
                           EXPLORATION_FRACTION)

    envs = ParallelEnvWrapper(envs)
    t = 0
    episode_rewards = [0.0] * num_envs
    explore_percent, episodes, mean100_rew, steps, NN_tr_loss = [],[],[],[],[]
    while True:
        state = envs.reset()
        # state = np.reshape(state, [1, observation_space])
        while True:
            t += num_envs
            dqn_solver.eps_timestep_decay(t)
            action = dqn_solver.act(state)
            state_next, reward, terminal, _ = envs.step(action)
            # print(terminal)
            # reward = reward if not terminal else -reward
            # state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            if t % TRAINING_FREQUENCY == 0:
                dqn_solver.experience_replay()
            state = state_next
            episode_rewards[-num_envs:] = [
                i + j for (i, j) in zip(episode_rewards[-num_envs:], reward)
            ]
            # num_episodes = len(episode_rewards)
            # print(terminal)
            if (t % PRINT_FREQ == 0):
                explore_percent.append(dqn_solver.exploration_rate * 100)
                episodes.append(len(episode_rewards))
                mean100_rew.append(
                    round(np.mean(episode_rewards[(-1 - N_EP_AVG):-1]), 1))
                steps.append(t)
                NN_tr_loss.append(dqn_solver.loss)
                if VERBOSE:
                    print('Exploration %: ' + str(int(explore_percent[-1])) +
                          ' ,Episodes: ' + str(episodes[-1]) +
                          ' ,Mean_reward: ' + str(mean100_rew[-1]) +
                          ' ,timestep: ' + str(t) + ' , tr_loss: ' +
                          str(round(NN_tr_loss[-1], 4)))

            if t > TOTAL_TIMESTEPS:
                output_table = np.stack((steps, mean100_rew, episodes,
                                         explore_percent, NN_tr_loss))
                if not os.path.exists(FILE_PATH):
                    os.makedirs(FILE_PATH)
                file_name = str(FILE_PATH) + LOG_FILE_NAME + '.csv'
                np.savetxt(
                    file_name,
                    np.transpose(output_table),
                    delimiter=',',
                    header=
                    'Timestep,Rewards,Episodes,Exploration %,Training Score')
                after = time.time()
                time_taken = after - before
                np.save(str(FILE_PATH) + TIME_FILE_NAME, time_taken)
                if SAVE_MODEL:
                    file_name = str(FILE_PATH) + MODEL_FILE_NAME + '.h5'
                    dqn_solver.model.save(file_name)
                return dqn_solver.model
            if USE_TARGET_NETWORK and t % TARGET_UPDATE_FREQUENCY == 0:
                dqn_solver.update_target_network()
            # print(t)
            if terminal.all():
                episode_rewards += [0.0] * num_envs
                break
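A minimal invocation sketch for dqn_algorithm (the environment name and hyperparameter values are illustrative assumptions; note that TOTAL_TIMESTEPS must be divisible by NUM_ENV):

# Hypothetical usage; values are placeholders, not the project's defaults.
if __name__ == '__main__':
    model = dqn_algorithm('CartPole-v0',
                          NUM_ENV=8,
                          TOTAL_TIMESTEPS=100000,
                          MLP_LAYERS=[64, 64],
                          MLP_ACTIVATIONS=['relu', 'relu'],
                          SAVE_MODEL=True,
                          FILE_PATH='results/',
                          VERBOSE=True)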
Example #15
def main():
    mode = "regular"
    num_envs = 16

    def make_env():
        def _thunk():
            env = MiniPacman(mode, 1000)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n

    env_model = EnvModel(envs.observation_space.shape, num_pixels,
                         len(mode_rewards["regular"]))
    actor_critic = ActorCritic(envs.observation_space.shape,
                               envs.action_space.n)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(env_model.parameters())

    actor_critic.load_state_dict(torch.load("actor_critic_" + mode))

    def get_action(state):
        if state.ndim == 4:
            state = torch.FloatTensor(np.float32(state))
        else:
            state = torch.FloatTensor(np.float32(state)).unsqueeze(0)

        action = actor_critic.act(autograd.Variable(state, volatile=True))
        action = action.data.cpu().squeeze(1).numpy()
        return action

    def play_games(envs, frames):
        states = envs.reset()

        for frame_idx in range(frames):
            actions = get_action(states)
            next_states, rewards, dones, _ = envs.step(actions)

            yield frame_idx, states, actions, rewards, next_states, dones

            states = next_states

    reward_coef = 0.1
    num_updates = 5000

    losses = []
    all_rewards = []

    for frame_idx, states, actions, rewards, next_states, dones in tqdm(
            play_games(envs, num_updates), total=num_updates):
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)

        batch_size = states.size(0)

        onehot_actions = torch.zeros(batch_size, num_actions, *state_shape[1:])
        onehot_actions[range(batch_size), actions] = 1
        inputs = autograd.Variable(torch.cat([states, onehot_actions], 1))

        #if USE_CUDA:
        #    inputs = inputs.cuda()

        imagined_state, imagined_reward = env_model(inputs)

        target_state = pix_to_target(next_states)
        target_state = autograd.Variable(torch.LongTensor(target_state))

        target_reward = rewards_to_target(mode, rewards)
        target_reward = autograd.Variable(torch.LongTensor(target_reward))

        optimizer.zero_grad()
        image_loss = criterion(imagined_state, target_state)
        reward_loss = criterion(imagined_reward, target_reward)
        loss = image_loss + reward_coef * reward_loss
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        all_rewards.append(np.mean(rewards))

        if frame_idx % num_updates == 0:
            plot(frame_idx, all_rewards, losses)

    torch.save(env_model.state_dict(), "env_model_" + mode)

    import time

    env = MiniPacman(mode, 1000)
    batch_size = 1

    done = False
    state = env.reset()
    iss = []
    ss = []

    steps = 0

    while not done:
        steps += 1
        actions = get_action(state)
        onehot_actions = torch.zeros(batch_size, num_actions, *state_shape[1:])
        onehot_actions[range(batch_size), actions] = 1
        state = torch.FloatTensor(state).unsqueeze(0)

        inputs = autograd.Variable(torch.cat([state, onehot_actions], 1))
        #if USE_CUDA:
        #    inputs = inputs.cuda()

        imagined_state, imagined_reward = env_model(inputs)
        imagined_state = F.softmax(imagined_state)
        iss.append(imagined_state)

        next_state, reward, done, _ = env.step(actions[0])
        ss.append(state)
        state = next_state

        imagined_image = target_to_pix(
            imagined_state.view(batch_size, -1,
                                len(pixels))[0].max(1)[1].data.cpu().numpy())
        imagined_image = imagined_image.reshape(15, 19, 3)
        state_image = torch.FloatTensor(next_state).permute(1, 2,
                                                            0).cpu().numpy()

        #clear_output()
        plt.figure(figsize=(10, 3))
        plt.subplot(131)
        plt.title("Imagined")
        plt.imshow(imagined_image)
        plt.subplot(132)
        plt.title("Actual")
        plt.imshow(state_image)
        plt.show()
        time.sleep(0.3)

        if steps > 30:
            break
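target_to_pix, used above to turn the env model's per-pixel class predictions back into an image, is not shown in these excerpts; a minimal sketch consistent with the pixels palette from Example #4 (an assumption, not the project's actual code):

import numpy as np

# Sketch (assumption): inverse of pix_to_target; maps class indices back to RGB values.
def target_to_pix(indices):
    palette = np.array(pixels)       # (num_pixels, 3) palette defined in Example #4
    return palette[indices]          # (N, 3); the caller reshapes to (H, W, 3)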
Example #16
def train(args):
	# hyper-params:
	frame_idx  		 = 0
	hidden_size      = args.hidden_size
	lr               = args.lr
	num_steps        = args.num_steps
	mini_batch_size  = args.mini_batch_size
	ppo_epochs       = args.ppo_epochs
	threshold_reward = args.threshold_reward
	max_frames 		 = args.max_frames
	# test_rewards 	 = []
	num_envs 		 = args.num_envs
	test_epochs		 = args.test_epochs
	resume_training	 = args.resume_training
	best_test_reward = 0.0
	urdf_path		 = os.path.join(BASE_DIR, os.pardir, "snake/snake.urdf")
	log_dir 		 = args.log_dir

	now = datetime.now()
	log_dir = log_dir + '_' + now.strftime('%d_%m_%Y_%H_%M_%S')

	# Check cuda availability.
	use_cuda = torch.cuda.is_available()
	device = torch.device("cuda" if use_cuda else "cpu")


	p.connect(p.DIRECT)
	writer = SummaryWriter(log_dir)

	# Create training log.
	textio = utils.IOStream(os.path.join(log_dir, 'train.log'), args=args)
	# textio.log_params(device, num_envs, lr, threshold_reward)	
	utils.logFiles(log_dir)

	# create multiple environments.
	envs = [utils.make_env(p, urdf_path, args=args) for i in range(num_envs)]
	envs = SubprocVecEnv(envs)

	# pdb.set_trace()	# Debug
	num_inputs = envs.observation_space.shape[0]
	num_outputs = envs.action_space.shape[0]

	# Create Policy/Network
	net = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
	optimizer = optim.Adam(net.parameters(), lr=lr)

	# If use pretrained policy.
	if resume_training:
		if os.path.exists(resume_training):
			checkpoint = torch.load(resume_training)
			frame_idx = checkpoint['frame_idx']
			net.load_state_dict(checkpoint['model'])
			best_test_reward = checkpoint['best_test_reward']

	# Initial Reset for Environment.
	state = envs.reset()
	early_stop = False

	# Create env for policy testing.
	robot = snake.Snake(p, urdf_path, args=args)
	env = SnakeGymEnv(robot, args=args)

	print_('\nTraining Begins ...', color='r', style='bold')
	textio.log('Training Begins ...')
	while frame_idx < max_frames and not early_stop:
		print_('\nTraining Policy!', color='r', style='bold')
		textio.log('\n############## Epoch: %0.5d ##############'%(int(frame_idx/20)))

		# Memory buffers
		log_probs = []
		values    = []
		states    = []
		actions   = []
		rewards   = []
		masks     = []
		entropy   = 0
		total_reward = 0.0

		for i in range(num_steps):
			print('Steps taken: {} & Epoch: {}\r'.format(i, int(frame_idx/20)), end="")
			state = torch.FloatTensor(state).to(device)

			# Find action using policy.
			dist, value = net(state)
			action = dist.sample()
			action = action #HACK

			# Take actions and find MDP.
			next_state, reward, done, _ = envs.step(action.cpu().numpy())
			total_reward += sum(reward)
			textio.log('Steps: {} and Reward: {}'.format(int(frame_idx%20), total_reward))

			# Calculate log(policy)
			log_prob = dist.log_prob(action)
			entropy += dist.entropy().mean()

			# Create Experiences
			log_probs.append(log_prob)
			values.append(value)
			rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
			masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
			states.append(state)
			actions.append(action)
			
			# Update state.
			state = next_state
			frame_idx += 1

			# Test Trained Policy.
			if frame_idx % 40 == 0:
				print_('\n\nEvaluate Policy!', color='bl', style='bold')
				test_reward = np.mean([utils.test_env(env, net, test_idx) for test_idx in range(test_epochs)])

				# test_rewards.append(test_reward)
				# utils.plot(frame_idx, test_rewards)	# not required due to tensorboardX.
				writer.add_scalar('test_reward', test_reward, frame_idx)
				
				print_('\nTest Reward: {}\n'.format(test_reward), color='bl', style='bold')
				textio.log('Test Reward: {}'.format(test_reward))

				# Save various factors of training.
				snap = {'frame_idx': frame_idx,
						'model': net.state_dict(),
						'best_test_reward': best_test_reward,
						'optimizer' : optimizer.state_dict()}

				if best_test_reward < test_reward:
					save_checkpoint(snap, os.path.join(log_dir, 'weights_bestPolicy.pth'))
					best_test_reward = test_reward
				save_checkpoint(snap, os.path.join(log_dir,'weights.pth'))
				if test_reward > threshold_reward: early_stop = True
			if frame_idx % 1000 == 0:
				if not os.path.exists(os.path.join(log_dir, 'models')): os.mkdir(os.path.join(log_dir, 'models'))
				save_checkpoint(snap, os.path.join(log_dir, 'models', 'weights_%0.5d.pth'%frame_idx))

				
		# Calculate Returns
		next_state = torch.FloatTensor(next_state).to(device)
		_, next_value = net(next_state)
		returns = compute_gae(next_value, rewards, masks, values)

		# Concatenate experiences for multiple environments.
		returns   = torch.cat(returns).detach()
		log_probs = torch.cat(log_probs).detach()
		values    = torch.cat(values).detach()
		states    = torch.cat(states)
		actions   = torch.cat(actions)
		advantage = returns - values
		
		writer.add_scalar('reward/episode', total_reward, frame_idx)
		textio.log('Total Training Reward: {}'.format(total_reward))

		# Update the Policy.
		ppo_update(net, optimizer, ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage, writer, frame_idx)
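compute_gae is not shown in the excerpt; a standard Generalized Advantage Estimation routine consistent with how it is called above (the gamma and tau defaults are assumptions) looks like:

# Sketch (assumption): GAE with 0/1 masks marking episode boundaries; returns
# the advantage-smoothed returns (advantage + value) used for the PPO update.
def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
	values = values + [next_value]
	gae = 0
	returns = []
	for step in reversed(range(len(rewards))):
		delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
		gae = delta + gamma * tau * masks[step] * gae
		returns.insert(0, gae + values[step])
	return returns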