Example #1
def adv_training():
    ############## Hyperparameters ##############
    # creating environment
    env = TronEnv()
    state_dim = env.observation_space.shape[0]
    action_dim = 4
    render = True
    solved_reward = 230  # stop training if avg_reward > solved_reward
    log_interval = 20  # print avg reward in the interval
    max_episodes = 10  # max training episodes
    max_timesteps = 300  # max timesteps in one episode
    n_latent_var = 64  # number of variables in hidden layer
    update_timestep = 2000  # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99  # discount factor
    K_epochs = 4  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    random_seed = None
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    h_memory = Memory()
    a_memory = Memory()

    hero = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
               eps_clip)
    hero.load()

    adv = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)
    adv.load()

    timestep = 0

    # training loop
    for _ in range(1, max_episodes + 1):
        p_state, e_state = env.reset()
        for t in range(max_timesteps):
            timestep += 1

            # Running policy_old:
            p_action = hero.policy_old.act(np.array(p_state), h_memory)
            e_action = adv.policy_old.act(np.array(e_state), a_memory)

            p_state, _, _, h_done, _ = env.step(p_action, 0)
            e_state, _, _, a_done, _ = env.step(e_action, 1)

            # if render:
            #     env.render()
            if h_done or a_done:
                env.render()
                break
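
Every example in this listing constructs a Memory() rollout buffer and hands it to policy_old.act; the snippets themselves only show appends to rewards and is_terminals (Example 9 uses dones, Example 13 adds init_vals). For reference, a minimal sketch of the buffer these call sites appear to assume — the exact field list is an assumption, not code from any of the repositories:

class Memory:
    # Minimal rollout buffer sketch matching the call sites in these examples
    # (assumed layout): act() appends states/actions/logprobs, the training loop
    # appends rewards and terminal flags, clear_memory() empties everything
    # after each policy update.
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []

    def clear_memory(self):
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]
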
Example #2
def test():
    torch.set_default_tensor_type('torch.DoubleTensor')
    ############## Hyperparameters ##############
    env_name = "Snake Game"
    # creating environment
    # tell env to initialize game board too
    env = SnakeGameGym(initBoard=True)
    state_dim = env.observation_space.shape[0]
    action_dim = 4
    render = False
    max_timesteps = 500
    n_latent_var = 64  # number of variables in hidden layer
    lr = 0.0007
    betas = (0.9, 0.999)
    gamma = 1.00  # discount factor
    K_epochs = 4  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    #############################################

    n_episodes = 3
    max_timesteps = 300
    render = True
    save_gif = False

    filename = "PPO_{}.pth".format(env_name)
    directory = ""

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)

    ppo.policy.load_state_dict(torch.load(directory + filename))

    for ep in range(1, n_episodes + 1):
        ep_reward = 0
        state = env.reset()

        for t in range(max_timesteps):
            print(t)
            # Sleep briefly so the game runs at a normal snake-game pace
            time.sleep(0.05)
            action = ppo.policy.act(state, memory)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                print("Rendering")
                env.render()
            # if save_gif:
            #     img = env.render(mode='rgb_array')
            #     img = Image.fromarray(img)
            #     img.save('./gif/{}.jpg'.format(t))
            if done:
                print("Done")
                env.cleanup()
                break

        print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        ep_reward = 0
        env.close()
Example #3
def test():
    ############## Hyperparameters ##############
    env_name = "LunarLander-v2"
    # creating environment
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = 4
    render = False
    max_timesteps = 500
    n_latent_var = 64  # number of variables in hidden layer
    lr = 0.0007
    betas = (0.9, 0.999)
    gamma = 0.99  # discount factor
    K_epochs = 4  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    #############################################

    n_episodes = 3
    max_timesteps = 300
    render = True
    save_gif = False

    filename = "PPO_{}.pth".format(env_name)
    directory = "./preTrained/"

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)

    ppo.policy_old.load_state_dict(torch.load(directory + filename))

    for ep in range(1, n_episodes + 1):
        ep_reward = 0
        state = env.reset()
        for t in range(max_timesteps):
            action = ppo.policy_old.act(state, memory)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                img = env.render(mode='rgb_array')
                #print(screen.shape)
                img = Image.fromarray(img)
                #plt.ion()
                plt.imshow(img)
                #plt.show()
                ipythondisplay.clear_output(wait=True)
                ipythondisplay.display(plt.gcf())
            if save_gif:
                img = env.render(mode='rgb_array')
                img = Image.fromarray(img)
                img.save('./gif/{}.jpg'.format(t))
            if done:
                break

        print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        ep_reward = 0
        env.close()
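
Examples 2, 3 and 5 call ppo.policy_old.act(state, memory) and use the return value directly as a discrete action index for env.step. A hedged sketch of what such an act method typically looks like for a categorical policy; the network shape and the ActorSketch/action_layer names are assumptions, not the code of these repositories:

import torch
import torch.nn as nn
from torch.distributions import Categorical

class ActorSketch(nn.Module):
    # Sketch of a discrete actor compatible with the act(state, memory) calls
    # above: sample from a Categorical over action probabilities, record the
    # transition in the Memory buffer, and return a plain int for env.step().
    def __init__(self, state_dim, action_dim, n_latent_var):
        super().__init__()
        self.action_layer = nn.Sequential(
            nn.Linear(state_dim, n_latent_var), nn.Tanh(),
            nn.Linear(n_latent_var, n_latent_var), nn.Tanh(),
            nn.Linear(n_latent_var, action_dim), nn.Softmax(dim=-1),
        )

    def act(self, state, memory):
        state = torch.as_tensor(state, dtype=torch.float32)
        dist = Categorical(self.action_layer(state))
        action = dist.sample()

        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(dist.log_prob(action))
        return action.item()
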
Example #4
def test(env_name):
    ############## Hyperparameters ##############
    env = make_env(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    n_episodes = 3  # num of episodes to run
    max_timesteps = 1500  # max timesteps in one episode
    render = True  # render the environment
    save_gif = True  # png images are saved in gif folder

    # filename and directory to load model from
    filename = "PPO_continuous_" + env_name + ".pth"
    directory = "./preTrained/"

    action_std = 0.5  # constant std for action distribution (Multivariate Normal)
    K_epochs = 80  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    gamma = 0.99  # discount factor

    lr = 0.0003  # parameters for Adam optimizer
    betas = (0.9, 0.999)
    #############################################

    memory = Memory()
    ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs,
              eps_clip)
    ppo.policy_old.load_state_dict(torch.load(directory + filename))

    for ep in range(1, n_episodes + 1):
        ep_reward = 0
        state = env.reset()
        for t in range(max_timesteps):
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
            if save_gif:
                img = env.render(mode='rgb_array')
                img = Image.fromarray((img * 255).astype(np.uint8))
                img.save('./gif/' + env_name + '/{}.jpg'.format(t))
            if done:
                break

        print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        ep_reward = 0
        env.close()

    createGif(env_name)
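
Example 4 saves one JPEG per timestep and then calls createGif(env_name), whose implementation is not shown. A hypothetical helper along those lines, assuming the frame paths used above and Pillow's GIF writer; the function body is a guess, not the repository's code:

import glob
from PIL import Image

def createGif(env_name, frame_ms=60):
    # Hypothetical: stitch ./gif/<env_name>/<t>.jpg frames into a single GIF,
    # ordered by timestep index.
    paths = sorted(glob.glob('./gif/' + env_name + '/*.jpg'),
                   key=lambda p: int(p.rsplit('/', 1)[-1].split('.')[0]))
    frames = [Image.open(p) for p in paths]
    if frames:
        frames[0].save('./gif/' + env_name + '.gif', save_all=True,
                       append_images=frames[1:], duration=frame_ms, loop=0)
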
Example #5
def test():
    ############## Hyperparameters ##############
    # creating environment
    rm_ai = game()
    state_dim = rm_ai.state_num
    action_dim = rm_ai.action_num
    render = False
    max_timesteps = 500
    n_latent_var = 64  # number of variables in hidden layer
    lr = 0.0007
    betas = (0.9, 0.999)
    gamma = 0.99  # discount factor
    K_epochs = 4  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    #############################################

    n_episodes = 3
    max_timesteps = 300
    render = True

    filename = "PPO_{}.pth".format("robomaster")

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)

    ppo.policy_old.load_state_dict(torch.load(filename))

    for ep in range(1, n_episodes + 1):
        ep_reward = 0
        state = rm_ai.reset()
        for t in range(max_timesteps):
            action = ppo.policy_old.act(state, memory)
            print(action)
            state, reward, done, _ = rm_ai.step(action)
            ep_reward += reward
            if render:
                for event in pygame.event.get():
                    if event.type == pygame.QUIT:
                        exit()

            if done:
                break

        print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        ep_reward = 0
Example #6
def test():
    ############## Hyperparameters ##############

    # creating environment
    env = MyEnv()
    env_name = env.env_name
    action_dim = 5
    n_latent_var = 64           # number of variables in hidden layer
    lr = 0.0007
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    #############################################

    n_episodes = 100
    max_timesteps = 5000
    save_gif = False

    filename = "./preTrained/PPO_{}_train2.pth".format(env_name)
    
    memory = Memory()
    ppo = PPO(64*64*3, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    
    ppo.policy_old.load_state_dict(torch.load(filename))
    rewards = []
    for ep in range(1, n_episodes+1):
        ep_reward = 0
        state = env.reset()
        for t in range(max_timesteps):
            obs, compass = converter(state)
            action = ppo.policy_old.act(obs=obs, compass=compass, memory=memory)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            # if render:
            #     env.render()
            if save_gif:
                img = obs.data.numpy()
                img = Image.fromarray(img)
                img.save('./gif/{}.jpg'.format(t))
            if done:
                break
        rewards.append(ep_reward)
        logging.debug('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
    np.save('./PPO_ep_rewards_test_{}'.format(env_name), np.array(rewards))
Example #7
def test(env):
    ############## Hyperparameters ##############
    # creating environment

    state_dim = 11
    action_dim = 5

    n_latent_var = 128           # number of variables in hidden layer
    lr = 0.0007
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    #############################################

    n_episodes = 15
    max_timesteps = 75


    filename = "PPO_{}.pth".format('bitirmeindep')
    directory = "./"
    
    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    
    ppo.policy_old.load_state_dict(torch.load(directory+filename))
    
    for ep in range(1, n_episodes+1):
        env.ResetUnity()
        state, _, _ = env.GetState()
        ep_reward = 0
        for t in range(max_timesteps):
            action = ppo.policy_old.act(state, memory)
            env.PostAction(action)
            state, reward, done = env.GetState()
            ep_reward += reward
 
            if done:
                break
            
        print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        ep_reward = 0
Example #8
def test(args):

    env = gym.make('GridExplore-v0')
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    state_dim = env.observation_space[0].shape[0]

    action_dim = 5
    

    render = args.render
    max_timesteps = 500
    n_latent_var = 512           # number of variables in hidden layer
    lr = 0.001
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 2                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    #############################################

    
    filename = str(input("filename: "))
    
    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    
    print(ppo.policy_old.state_dict)
    ppo.policy_old.load_state_dict(torch.load(filename, map_location=torch.device('cpu')))
    
    avg = 0
    for i in range(10):

        s = env.reset()
        done_n = [False for _ in range(env.n_agents)]

        totalreward = 0
        t = 0
        while not all(done_n):
            t += 1

            env.render()
            if render:
                env.render_graphic()

            # Act on the latest observation, then step the environment.
            state = torch.from_numpy(np.array([s])).float().to(device)
            action = ppo.policy_old.act(state, memory)
            s, r, done_n, _ = env.step([action])

            totalreward += r
            time.sleep(0.01)
            if t > 500:
                break

        print("REWARDS: ", totalreward)
        avg += totalreward
    
    if render:
        env.render_graphic()        

    env.render()

    env.close()

    print("AVG REWARD: " , avg/10)
Example #9
def mptrain(args):
    ############## Hyperparameters ##############
    env_name = "GridExplore-v0"
    # creating environment
    env = gym.make(env_name)
    state_dim = env.observation_space[0].shape[0]

    action_dim = 5
    model = ConvNet(action_dim).to(device)

    render = False
    solved_reward = 200         # stop training if avg_reward > solved_reward
    log_interval = 20           # print avg reward in the interval
    max_episodes = 500        # max training episodes
    max_timesteps = 500         # max timesteps in one episode
    n_latent_var = 128           # number of variables in hidden layer
    update_timestep = 600      # update policy every n timesteps
    lr = 1e-4
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    random_seed = None
    mini_batch_size = 32
    #############################################
    
    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)
    
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    print(lr, betas)

    # buffer = {key:value for key,value in memory.__dict__.items() if not key.startswith('__') and not callable(key)}

    
    num_processes = 4
    multi_envs = [gym.make(env_name) for i in range(num_processes)] 
    multi_mem = []
    for i in range(num_processes):
        multi_mem.append(Memory())


    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    writer = SummaryWriter("logs/" + time.strftime("%Y%m%d-%H%M%S"))
    q = queue()

    # training loop
    for i_episode in range(1, max_episodes+1):
        
        states = [multi_envs[i].reset() for i in range(num_processes)]        

        for t in range(max_timesteps):
            timestep += 1

            for k in range(num_processes):
                state = np.array([states[k]])
                outputs = torch.from_numpy(state).float().to(device)

                # Running policy_old:
                action = ppo.policy_old.act(outputs, multi_mem[k])
                state, reward, done, _ = multi_envs[k].step([action])

                # Saving reward and is_terminal:
                multi_mem[k].rewards.append(reward)
                multi_mem[k].dones.append(done[0])

                running_reward += reward[0]

                if all(done):
                    states[k] = multi_envs[k].reset()
                    avg = q.push(running_reward)

            # update if it's time
            if timestep % update_timestep == 0:

                for k in range(num_processes):
                    memory = multi_mem[k]
                    # memory = multi_mem.flatten().tolist()
                    ppo.update(memory)
                    # for k in range(num_processes):
                    multi_mem[k].clear_memory()
                    timestep = 0
            
            
            if render:
                env.render()
            if all(done):
                break
                
        avg_length += t 
        
        running_reward /= num_processes
        avg = q.push(running_reward)

        
        # grid = torchvision.utils.make_grid(torch.tensor(env.grid))
        # writer.add_image('images', grid, max_timesteps)

        
        
        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval*solved_reward):
            print("########## Solved! ##########")
            torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
            torch.save(ppo.policy.state_dict(), './savedmodels/PPO_{}.pth'.format(time.strftime("%Y%m%d-%H%M%S")))
            break
            
        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length/log_interval)
            running_reward = int((running_reward/log_interval))
            writer.add_scalar('episode/average_reward', avg, i_episode)

            print('Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0

    writer.close()
    torch.save(ppo.policy.state_dict(), './PPO_NOTSOLVED_{}.pth'.format(env_name))
    torch.save(ppo.policy.state_dict(), './savedmodels/PPO_NOTSOLVED_{}.pth'.format(time.strftime("%Y%m%d-%H%M%S")))
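
Example 9 pushes episode rewards through a custom q = queue() whose push() returns a running average that is later logged to TensorBoard. The class itself is not shown; a minimal fixed-window averager matching that interface might look like the following (the window size is an assumption):

from collections import deque

class queue:
    # Hypothetical fixed-window reward averager matching the q.push(...) calls above.
    def __init__(self, maxlen=100):
        self.window = deque(maxlen=maxlen)

    def push(self, value):
        # Append the newest value and return the current windowed average.
        self.window.append(value)
        return sum(self.window) / len(self.window)
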
Example #10
    action_constrain = [10, np.pi / 20]
    # parameter = [0.1,0.0009]
    # parameter =  [0.0000001, 0.5]
    # pid = PID( parameter, env.width, env.height )

    ### for plotting
    Reward = []
    save_path = './PPO_out/'
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    """ start directly """
    evaluations = []
    if args.policy_name == 'state':
        from PPO import PPO
        from PPO import Memory
        memory = Memory()
        policy = PPO(state_dim, action_dim, args.action_std, args.lr,
                     args.betas, args.gamma, args.K_epochs, args.eps_clip)
    elif args.policy_name == 'rgb_array':
        from PPO_image import PPO
        from PPO_image import Memory
        memory = Memory()
        policy = PPO(img_stack, action_dim, args.action_std, args.lr,
                     args.betas, args.gamma, args.K_epochs, args.eps_clip)

    env.total_timesteps = 0
    timesteps_since_eval = 0
    pid_assist = 0
    time_step = 0
    done = True
Example #11
def main():
    # ############## Hyperparameters ##############
    # env_name = "LunarLander-v2"
    # # creating environment
    # env = gym.make(env_name)
    # state_dim = env.observation_space.shape[0]
    # action_dim = 4
    # render = 'render' in sys.argv
    # solved_reward = 230         # stop training if avg_reward > solved_reward
    # log_interval = 20           # print avg reward in the interval
    # max_episodes = 50000        # max training episodes
    # max_timesteps = 300         # max timesteps in one episode
    # n_latent_var = 64           # number of variables in hidden layer
    # update_timestep = 2000      # update policy every n timesteps
    # lr = 0.002
    # betas = (0.9, 0.999)
    # gamma = 0.99                # discount factor
    # K_epochs = 4                # update policy for K epochs
    # eps_clip = 0.2              # clip parameter for PPO
    # random_seed = None
    # #############################################

    ############## Hyperparameters ##############
    env_name = "SuperMarioBros-v3"
    # creating environment
    env = gym_super_mario_bros.make(env_name)
    state_dim = env.observation_space.shape[2]
    # print('state_dim:', state_dim)
    action_dim = 4
    render = 'render' in sys.argv
    solved_reward = 230  # stop training if avg_reward > solved_reward
    log_interval = 1  # print avg reward in the interval
    max_episodes = 20  # max training episodes
    max_timesteps = 50  # max timesteps in one episode
    n_latent_var = 64  # number of variables in hidden layer
    update_timestep = 256  # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99  # discount factor
    K_epochs = 4  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    random_seed = None
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes + 1):
        state = env.reset()
        for t in range(max_timesteps):
            timestep += 1

            # Running policy_old:
            action = ppo.policy_old.act(state.copy(), memory)
            state, reward, done, _ = env.step(action.cpu())

            # Saving reward and is_terminal:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update if it's time
            if timestep % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                timestep = 0

            running_reward += reward
            if render:
                env.render()
            if done:
                state = env.reset()

        avg_length += t

        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval * solved_reward):
            print("########## Solved! ##########")
            torch.save(ppo.policy.state_dict(),
                       './saved_models/PPO_{}.pth'.format(env_name))
            break

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int((running_reward / log_interval))

            print('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
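
The training loops in Examples 9, 11 and 14 call ppo.update(memory) every update_timestep steps, driven by gamma, K_epochs and eps_clip. A sketch of the clipped-surrogate update those hyperparameters usually feed, assuming the Memory layout sketched under Example 1 and a policy.evaluate(states, actions) method returning new log-probabilities, state values and entropy (both assumptions):

import torch

def ppo_update_sketch(policy, optimizer, memory, gamma, K_epochs, eps_clip):
    # Monte Carlo returns, computed backwards and reset at terminal states.
    returns, discounted = [], 0.0
    for reward, terminal in zip(reversed(memory.rewards),
                                reversed(memory.is_terminals)):
        if terminal:
            discounted = 0.0
        discounted = reward + gamma * discounted
        returns.insert(0, discounted)
    returns = torch.tensor(returns, dtype=torch.float32)
    returns = (returns - returns.mean()) / (returns.std() + 1e-7)

    old_states = torch.stack(memory.states).detach()
    old_actions = torch.stack(memory.actions).detach()
    old_logprobs = torch.stack(memory.logprobs).detach()

    for _ in range(K_epochs):
        logprobs, state_values, entropy = policy.evaluate(old_states, old_actions)
        ratios = torch.exp(logprobs - old_logprobs)
        advantages = returns - state_values.detach()

        # Clipped surrogate objective plus value loss and an entropy bonus.
        surr1 = ratios * advantages
        surr2 = torch.clamp(ratios, 1 - eps_clip, 1 + eps_clip) * advantages
        loss = (-torch.min(surr1, surr2)
                + 0.5 * (state_values - returns).pow(2)
                - 0.01 * entropy)

        optimizer.zero_grad()
        loss.mean().backward()
        optimizer.step()
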
Example #12
def test():
    ############## Hyperparameters ##############
    # env_name = "LunarLander-v2"
    # creating environment
    # env = gym.make(env_name)
    env = Scenario4()
    state_dim = env.observation_space.shape[0]
    action_dim = 2
    render = args.visualize
    max_timesteps = 500
    n_latent_var = 64  # number of variables in hidden layer
    lr = 0.0007
    betas = (0.9, 0.999)
    gamma = 0.99  # discount factor
    K_epochs = 4  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    #############################################

    n_episodes = 100

    save_gif = False

    # filename = "PPO_{}.pth".format(env_name)
    test_name = args.case.lower()
    filename = './checkpoints/rl_checkpoint_' + test_name
    # directory = "./preTrained/"

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)
    nn_model = NN(5, 1)
    # nn_model.load_weights('./policies/data01_new_all_CoIL')
    scenario_name = args.scenario.lower()
    nn_model.load_weights('./policies/' + scenario_name + '_all_CoIL')

    ppo.policy_old.load_state_dict(torch.load(filename))
    success_counter = 0
    time_counter = 0
    # env.T = 400 * env.dt - env.dt / 2.
    for ep in range(1, n_episodes + 1):
        ep_reward = 0
        state = env.reset()

        for t in range(max_timesteps):
            mode = ppo.policy_old.act(state, memory)
            state = np.array(state).reshape(1, -1)
            action = nn_model(state, mode).numpy().reshape(-1)
            state, reward, done, terminal_time = env.step(action)
            print("mode is ", mode)
            if render:
                env.render()
            if save_gif:
                img = env.render(mode='rgb_array')
                img = Image.fromarray(img)
                img.save('./gif/{}.jpg'.format(t))
            if done:
                ep_reward = reward
                if env.target_reached:
                    time_counter += terminal_time
                    success_counter += 1
                break
        print("timesteps", t)
        print("success number", success_counter)
        print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        # ep_reward = 0
        # env.close()
    print('Success Rate = ' + str(float(success_counter) / ep))
    print('Mean reach time = ' + str(float(time_counter / success_counter)))
Example #13
                                        seed=0)
schedule_generator = complex_schedule_generator()
env = RailEnv(width=env_width,
              height=env_height,
              rail_generator=rail_generator,
              schedule_generator=schedule_generator,
              obs_builder_object=GlobalObsForRailEnv(),
              number_of_agents=agent_num)
env_renderer = RenderTool(env)
render = True
render_sleep_time = 0.0
stuck_break_pont = 20
max_timesteps_in_episode = update_timestep
##################################

memory = Memory()
memory.init_vals(env.number_of_agents)
ppo = PPO.FlatlandPPO(action_space=action_space,
                      hidden_size=hidden_size,
                      in_channels=in_channels,
                      out_channels=out_channels,
                      kernel_size=kernel_size,
                      agent_num=agent_num,
                      num_epochs=num_epochs,
                      rnn_num_layers=rnn_num_layers,
                      gamma=gamma,
                      learning_rate=learning_rate,
                      clip_epsilon=clip_epsilon,
                      c1=c1,
                      c2=c2)
Example #14
def adv_training():
    ############## Hyperparameters ##############
    # creating environment
    env = TronEnv()
    state_dim = env.observation_space.shape[0]
    action_dim = 4
    render = False
    solved_reward = 230  # stop training if avg_reward > solved_reward
    log_interval = 20  # print avg reward in the interval
    max_episodes = 20000  # max training episodes
    max_timesteps = 300  # max timesteps in one episode
    n_latent_var = 64  # number of variables in hidden layer
    update_timestep = 2000  # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99  # discount factor
    K_epochs = 4  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    random_seed = None
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    h_memory = Memory()
    a_memory = Memory()
    hero = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
               eps_clip)
    adv = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes + 1):
        p_state, e_state = env.reset()
        for t in range(max_timesteps):
            timestep += 1

            # Running policy_old:
            p_action = hero.policy_old.act(np.array(p_state), h_memory)
            e_action = adv.policy_old.act(np.array(e_state), a_memory)

            p_state, h_reward1, a_reward1, h_done, _ = env.step(p_action, 0)
            e_state, h_reward2, a_reward2, a_done, _ = env.step(e_action, 1)
            # Saving reward and is_terminal:
            h_memory.rewards.append(h_reward1 + h_reward2)
            h_memory.is_terminals.append(h_done)

            # Saving reward and is_terminal:
            a_memory.rewards.append(a_reward1 + a_reward2)
            a_memory.is_terminals.append(a_done)

            # update if it's time
            if timestep % update_timestep == 0:
                hero.update(h_memory)
                h_memory.clear_memory()

                adv.update(a_memory)
                a_memory.clear_memory()

                timestep = 0

            running_reward += (h_reward1 + h_reward2)
            if render:
                env.render()
            if h_done or a_done:
                break

        avg_length += t

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int((running_reward / log_interval))
            env.render()
            print('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
    adv.save_all()
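
Example 14 ends with adv.save_all(), and Example 1 restores both agents with hero.load() / adv.load(); neither helper is shown. A hypothetical pair of checkpoint methods consistent with the torch.save / load_state_dict pattern used elsewhere in this listing (the class name and file path are assumptions):

import torch

class PPOCheckpointMixin:
    # Hypothetical persistence helpers matching adv.save_all() / hero.load() above.
    def save_all(self, path='./PPO_tron.pth'):
        torch.save(self.policy.state_dict(), path)

    def load(self, path='./PPO_tron.pth'):
        state = torch.load(path)
        self.policy.load_state_dict(state)
        self.policy_old.load_state_dict(state)
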