def _enjoy():          
    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env) # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")
    policy.load(filename='ddpg', directory='reinforcement/pytorch/models/')

    obs = env.reset()
    done = False

    while True:
        while not done:
            action = policy.predict(np.array(obs))
            # Perform action
            obs, reward, done, _ = env.step(action)
            env.render()
        done = False
        obs = env.reset()        
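
The ImgWrapper referenced above only reorders the observation axes so the CNN policy can consume them. As a point of reference, here is a minimal sketch of such a wrapper, assuming the standard gym ObservationWrapper interface (the actual class in learning.utils.wrappers may differ in details):

import gym
import numpy as np

class ImgWrapper(gym.ObservationWrapper):
    # Reorder observations from HWC (e.g. 160x120x3) to CHW (3x160x120) for PyTorch
    def __init__(self, env):
        super().__init__(env)
        h, w, c = self.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=(c, h, w), dtype=np.float32)

    def observation(self, obs):
        return np.transpose(obs, (2, 0, 1))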
Example #2
def _enjoy(args):

    from learning.utils.env import launch_env
    from learning.utils.wrappers import NormalizeWrapper, ImgWrapper, \
        DtRewardWrapper, ActionWrapper, ResizeWrapper
    from learning.utils.teacher import PurePursuitExpert
    # model = Model(action_dim=2, max_action=1.)
    model = Generator(action_dim=2)

    try:
        # state_dict = torch.load('models/imitate.pt', map_location=device)
        state_dict = torch.load('models/G{}.pt'.format(args.enjoy_tag),
                                map_location=device)

        model.load_state_dict(state_dict)
    except Exception:
        print("Unexpected error:", sys.exc_info()[0])
        print("Failed to load model from models/G{}.pt".format(args.enjoy_tag))
        sys.exit(1)

    model.eval().to(device)

    env = launch_env()
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)

    obs = env.reset()

    # max_count = 0
    while True:
        obs = torch.from_numpy(obs).float().to(device).unsqueeze(0)

        action = model(obs)
        action = action.squeeze().data.cpu().numpy()
        print("\nAction taken:", action, "\n")
        obs, reward, done, info = env.step(action)
        env.render()

        # if max_count > 50:
        #     max_count = 0
        #     obs = env.reset()

        if done:
            if reward < 0:
                print('*** FAILED ***')
                time.sleep(0.7)
            # max_count += 1
            obs = env.reset()
            env.render()
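
This example expects an args object carrying an enjoy_tag, which selects the generator checkpoint models/G<tag>.pt. A hypothetical entry point (the argument name is the only thing the code above actually requires) could look like:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # enjoy_tag picks which generator checkpoint models/G<tag>.pt gets loaded
    parser.add_argument('--enjoy_tag', type=str, required=True)
    _enjoy(parser.parse_args())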
Example #3
def _main(args):
    ############## Hyperparameters ##############
    # env_name = "BipedalWalker-v2"
    env_name = 'Duckietown-loop_empty-v0'
    render = False

    lr = 0.0003  # parameters for Adam optimizer
    betas = (0.9, 0.999)

    random_seed = None

    print(args)
    #############################################

    # creating environment
    env = launch_env()
    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")
    # state_dim = env.observation_space.shape[0]
    state_dim = env.observation_space.shape
    state_dim = functools.reduce(operator.mul, state_dim, 1)
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    if random_seed:
        print("Random Seed: {}".format(random_seed))
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, args.action_std, lr, betas, args.gamma,
              args.K_epochs, args.eps_clip, max_action, args.batch_size)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    time_step = 0
    episode_reward = 0
    # stats = pd.DataFrame(columns = ["Episode", "Length", "Reward"])
    stats = []
    with open("PPO_stats.csv", 'w') as statsfile:
        statsfile.write("Episode, Length, Reward\n")
    # training loop
    for i_episode in range(1, args.max_episodes + 1):
        state = env.reset()
        for t in range(args.max_timesteps):
            time_step += 1
            # Running policy_old:
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)

            # Saving reward and is_terminals:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update if its time
            if time_step % args.update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0
            episode_reward += reward
            if render:
                env.render()
            if done:
                break

        avg_length += t
        # stats = stats.append({"Episode" : i_episode, "Length" : t, "Reward" : episode_reward}, ignore_index=True)
        stats.append((i_episode, t, episode_reward))
        running_reward += episode_reward
        episode_reward = 0

        if i_episode % args.store_interval == 0:
            torch.save(ppo.policy.state_dict(),
                       './PPO_continuous_{}.pth'.format(env_name))
            # stats.to_csv("PPO_stats.csv", index=False) #This line does not work on Google Colab!
            with open("PPO_stats.csv", 'a') as statsfile:
                for eps, ts, rwd in stats:
                    statsfile.write("%d, %d, %f\n" % (eps, ts, rwd))
            stats = []

        # logging
        if i_episode % args.log_interval == 0:
            avg_length = int(avg_length / args.log_interval)
            running_reward = int((running_reward / args.log_interval))

            print('Episode {} \t Avg length: {} \t Avg reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
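
The Memory object is a plain rollout buffer: select_action pushes states, actions, and log-probabilities into it, the loop above appends rewards and terminal flags, and ppo.update consumes and clears it. A minimal sketch consistent with that usage (the repo's own class may hold extra fields):

class Memory:
    # Rollout storage filled during interaction and cleared after each PPO update
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []

    def clear_memory(self):
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]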
Example #4
def test():
    ############## Hyperparameters ##############
    # env_name = "BipedalWalker-v2"
    env_name = 'Duckietown-loop_empty-v0'
    # env = gym.make(env_name)
    # creating environment
    env = launch_env()
    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")
    # state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape
    state_dim = functools.reduce(operator.mul, state_dim, 1)
    max_action = float(env.action_space.high[0])

    n_episodes = 3  # num of episodes to run
    max_timesteps = 500  # max timesteps in one episode
    render = True  # render the environment
    save_gif = False  # if True, per-step frames are saved as .jpg in the ./gif folder

    # filename and directory to load model from
    filename = "PPO_continuous_" + env_name + ".pth"
    directory = "./preTrained/"

    action_std = 0.05  # constant std for action distribution (Multivariate Normal)
    K_epochs = 80  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    gamma = 0.99  # discount factor

    lr = 0.0003  # parameters for Adam optimizer
    betas = (0.9, 0.999)
    #############################################

    memory = Memory()
    ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs,
              eps_clip, max_action, 32)
    ppo.policy_old.load_state_dict(
        torch.load(directory + filename, map_location=torch.device('cpu')))

    for ep in range(1, n_episodes + 1):
        ep_reward = 0
        state = env.reset()
        for t in range(max_timesteps):
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
            if save_gif:
                img = env.render(mode='rgb_array')
                img = Image.fromarray(img)
                img.save('./gif/{}.jpg'.format(t))
            if done:
                break

        print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        ep_reward = 0

    # Close the simulator only after all evaluation episodes are done
    env.close()
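
When save_gif is enabled, the loop only writes individual .jpg frames to ./gif/; stitching them into an animation is left to the caller. A small helper using imageio (an assumption, not part of the repo) could assemble them:

import glob
import os
import imageio

def frames_to_gif(frame_dir='./gif', out_path='./gif/episode.gif'):
    # Sort the saved frames numerically by their timestep-based filenames
    paths = sorted(glob.glob(os.path.join(frame_dir, '*.jpg')),
                   key=lambda p: int(os.path.basename(p).split('.')[0]))
    imageio.mimsave(out_path, [imageio.imread(p) for p in paths])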
Example #5
def _train(args):   
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)
        
    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env) # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")
    
    # Set seeds
    seed(args.seed)

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")
    replay_buffer = ReplayBuffer(args.replay_buffer_max_size)
    print("Initialized DDPG")
    
    # Evaluate untrained policy
    evaluations = [evaluate_policy(env, policy)]
   
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    episode_reward = None
    env_counter = 0
    reward = 0
    episode_timesteps = 0
    
    print("Starting training")
    while total_timesteps < args.max_timesteps:
        
        print("timestep: {} | reward: {}".format(total_timesteps, reward))
            
        if done:
            if total_timesteps != 0:
                print(("Total T: %d Episode Num: %d Episode T: %d Reward: %f") % (
                    total_timesteps, episode_num, episode_timesteps, episode_reward))
                policy.train(replay_buffer, episode_timesteps, args.batch_size, args.discount, args.tau)

                # Evaluate episode
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval %= args.eval_freq
                    evaluations.append(evaluate_policy(env, policy))
                    print("rewards at time {}: {}".format(total_timesteps, evaluations[-1]))

                    if args.save_models:
                        policy.save(filename='ddpg', directory=args.model_dir)
                    np.savez("./results/rewards.npz", evaluations)

            # Reset environment
            env_counter += 1
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.predict(np.array(obs))
            if args.expl_noise != 0:
                action = (action + np.random.normal(
                    0,
                    args.expl_noise,
                    size=env.action_space.shape[0])
                          ).clip(env.action_space.low, env.action_space.high)

        # Perform action
        new_obs, reward, done, _ = env.step(action)
        env.render()
        if episode_timesteps >= args.env_timesteps:
            done = True

        done_bool = 0 if episode_timesteps + 1 == args.env_timesteps else float(done)
        episode_reward += reward

        # Store data in replay buffer
        replay_buffer.add(obs, new_obs, action, reward, done_bool)

        obs = new_obs

        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1
    
    print("Training done, about to save..")
    policy.save(filename='ddpg', directory=args.model_dir)
    print("Finished saving..should return now!")
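
evaluate_policy is called above but not shown. Below is a minimal sketch in the spirit of the usual DDPG/TD3 evaluation loop, assuming policy.predict returns a deterministic action (the repo's helper may use a different episode count or time limit):

def evaluate_policy(env, policy, eval_episodes=10, max_timesteps=500):
    # Roll out the deterministic policy and report the mean episode return
    avg_reward = 0.0
    for _ in range(eval_episodes):
        obs = env.reset()
        done = False
        step = 0
        while not done and step < max_timesteps:
            action = policy.predict(np.array(obs))
            obs, reward, done, _ = env.step(action)
            avg_reward += reward
            step += 1
    return avg_reward / eval_episodes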