Example #1
File: main.py Project: wxw0/rl
def main():
    sess = tf.Session(config=cf.tf_config)

    agent = A2C(cf, sess)
    sess.run(tf.global_variables_initializer())

    if bool(args.e):
        agent.evaluate(load_model=True)
    else:
        agent.learn()

    sess.close()
Example #2
def main():
    get_env_version()
    cfg = A2CConfig(env="CartPole-v0", train_frames=400)
    get_env_information(cfg.env)
    env = gym.make(cfg.env)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = A2C(state_dim, action_dim, cfg)
    envs = get_envs(env_name=cfg.env)
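    # train() interacts with the parallel environments and returns the raw and smoothed reward curves.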
    rewards, smooth_rewards = train(cfg, envs, agent)
    os.makedirs(cfg.result_path, exist_ok=True)
    # Note: each step/frame corresponds to num_envs environment interactions
    plot_rewards(rewards,
                 smooth_rewards,
                 env=cfg.env,
                 algo=cfg.algo,
                 save=True,
                 path=cfg.result_path,
                 xlabel_name="Each 200 steps")
    envs.close()
    env.close()
Example #3
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    # if args.policy == "TD3":
    #     # Target policy smoothing is scaled wrt the action scale
    #     kwargs["policy_noise"] = args.policy_noise * max_action
    #     kwargs["noise_clip"] = args.noise_clip * max_action
    #     kwargs["policy_freq"] = args.policy_freq
    #     policy = TD3.TD3(**kwargs)
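    # On-policy methods (A2C, REINFORCE) are run on parallel environments via ParaEnv,
    # while DDPG interacts with the single env instance.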
    if args.policy == "A2C":
        envs = ParaEnv(args.env, args.n_processes, args.seed)
        policy = A2C.A2C(env.observation_space, env.action_space,
                         args.discount, args.tau, max_episode_timesteps)
        x, y = policy.run(envs, file_name, args)
        write_result(args.env + "_A2C.json", x, y)

    elif args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)
        x, y = policy.run(env, file_name, args)
        write_result(args.env + "_DDPG.json", x, y)

    elif args.policy == "REINFORCE":
        args.n_steps = 5
        args.n_processes = 16
        envs = ParaEnv(args.env, args.n_processes, args.seed)
        policy = REINFORCE.REINFORCE(env.observation_space, env.action_space,
                                     args.discount, args.tau, args.n_steps,
                                     args.n_processes, max_episode_timesteps)
Example #4
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # Maxime: commented this out because it very much changes the behavior
    # of the code for seemingly arbitrary reasons
    #if len(envs.observation_space.shape) == 1:
    #    envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    obs_numel = reduce(operator.mul, obs_shape, 1)
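    # Choose the policy network from the observation shape: a CNN for large image-like
    # observations, otherwise an (optionally recurrent) MLP on the flattened observation.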

    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    elif args.recurrent_policy:
        actor_critic = RecMLPPolicy(obs_numel, envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    # Maxime: log some info about the model and its size
    # call function PPO.modelsize() for this to happen
    '''
    modelSize = 0
    for p in actor_critic.parameters():
        pSize = reduce(operator.mul, p.size(), 1)
        modelSize += pSize
    '''

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
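    # action_shape is 1 (a single action index) for Discrete spaces, else the size of the continuous action vector.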

    if args.cuda:
        actor_critic.cuda()
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
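    # RolloutStorage buffers num_steps transitions from each parallel process for the on-policy update.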

    if args.algo == 'a2c':
        Agent = A2C(actor_critic, rollouts, args.lr, args.eps,
                    args.num_processes, obs_shape, args.use_gae, args.gamma,
                    args.tau, args.recurrent_policy, args.num_mini_batch,
                    args.cuda, args.log_interval, args.vis, args.env_name,
                    args.log_dir, args.entropy_coef, args.num_stack,
                    args.num_steps, args.ppo_epoch, args.clip_param,
                    args.max_grad_norm, args.alpha, args.save_dir,
                    args.vis_interval, args.save_interval, num_updates,
                    action_shape, args.value_loss_coef)

    elif args.algo == 'ppo':
        Agent = PPO(actor_critic, rollouts, args.lr, args.eps,
                    args.num_processes, obs_shape, args.use_gae, args.gamma,
                    args.tau, args.recurrent_policy, args.num_mini_batch,
                    args.cuda, args.log_interval, args.vis, args.env_name,
                    args.log_dir, args.entropy_coef, args.num_stack,
                    args.num_steps, args.ppo_epoch, args.clip_param,
                    args.max_grad_norm, args.save_dir, args.vis_interval,
                    args.save_interval, num_updates, action_shape,
                    args.value_loss_coef)

    elif args.algo == 'acktr':
        Agent = ACKTR(actor_critic, rollouts, args.lr, args.eps,
                      args.num_processes, obs_shape, args.use_gae, args.gamma,
                      args.tau, args.recurrent_policy, args.num_mini_batch,
                      args.cuda, args.log_interval, args.vis, args.env_name,
                      args.log_dir, args.entropy_coef, args.num_stack,
                      args.num_steps, args.ppo_epoch, args.clip_param,
                      args.max_grad_norm, args.alpha, args.save_dir,
                      args.vis_interval, args.save_interval, num_updates,
                      action_shape, args.value_loss_coef)
    print(str(actor_critic))
    print('Total model size: %d' % Agent.modelsize())

    obs = envs.reset()
    Agent.update_current_obs(obs, envs)
    Agent.rollouts.observations[0].copy_(Agent.current_obs)

    # These variables are used to compute average rewards for all processes.
    Agent.train(envs)
Example #5
    'gamma': 0.95,
    'learning_rate': 1e-3,
    'gae_lambda': 0.95,  # lambda for Generalized Advantage Estimation
    'rep_learning_rate': 1e-5,  # learning rate for learning next state representation
    'seed': seed
}
'''Parameters of Model'''
model_parameters = {'num_units': 64, 'seed': seed}
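# The coefficients below weight the value, entropy, and next-state-representation terms of the A2C loss (representation learning is disabled here).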
loss_coefficients = {'value': 0.5, 'entropy': 1e-2, 'representation': 0.0}
'''Write Parameters to log_file'''
if verbose:
    with open(log_file_name, "a") as f:
        f.write('Environment: {}, Frames: {} \n'.format(game, num_frames))
        f.write('Algorithm Parameters: {} \n'.format(algorithm_parameters))
        f.write('Model Parameters: {} \n'.format(model_parameters))
        f.write('Loss Coefficients: {} \n'.format(loss_coefficients))
        f.flush()
'''Initialize Environment & Model'''
env = Environment(game, seed)
num_actions = env.number_of_actions
state_space = env.state_space
model = Model(num_actions, state_space, model_parameters)
agent = A2C(model, num_actions, algorithm_parameters, loss_coefficients)
'''Train the Agent'''
reward_history, loss_history = agent.train(env, num_frames, logs, verbose)
'''Save Rewards and Losses'''
if verbose:
    np.save(reward_file_name, reward_history)
    np.save(loss_file_name, loss_history)
Example #6
def train(cfg):
    print('Start training!\n')
    envs = make_envs(num_envs=16, env_name="CartPole-v0")
    state_dim = envs.observation_space.shape[0]
    action_dim = envs.action_space.n
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = A2C(state_dim, action_dim, hidden_dim=256)
    # moving_average_rewards = []
    # ep_steps = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    state = envs.reset()
    for i_episode in range(1, cfg.train_eps + 1):
        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0
        for i_step in range(1, cfg.train_steps + 1):
            state = torch.FloatTensor(state).to(device)
            dist, value = agent.model(state)
            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())
            state = next_state
            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
        if i_episode % 20 == 0:
            print("reward", test_env(agent, device='cpu'))
        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = agent.model(next_state)
        returns = agent.compute_returns(next_value, rewards, masks)

        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)
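        # A2C loss: advantage-weighted policy gradient (actor), squared advantage (critic), minus an entropy bonus.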
        advantage = returns - values
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()
        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        agent.optimizer.zero_grad()
        loss.backward()
        agent.optimizer.step()
    for _ in range(100):
        print("test_reward", test_env(agent, device='cpu'))

        # print('Episode:', i_episode, ' Reward: %i' %
        #       int(ep_reward[0]), 'n_steps:', i_step)
        # ep_steps.append(i_step)
        # rewards.append(ep_reward)
        # if i_episode == 1:
        #     moving_average_rewards.append(ep_reward[0])
        # else:
        #     moving_average_rewards.append(
        #         0.9*moving_average_rewards[-1]+0.1*ep_reward[0])
        # writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
        # writer.add_scalar('steps_of_each_episode',
        #                   ep_steps[-1], i_episode)
    writer.close()
    print('Training complete!')
    ''' Save the model '''
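
The compute_returns helper called in Example #6 is not shown in the listing. A minimal sketch of the masked, bootstrapped discounted return such an A2C rollout typically uses is given below; the function name and arguments match the call site, but the body and the gamma=0.99 default are assumptions rather than code from the project.

def compute_returns(next_value, rewards, masks, gamma=0.99):
    # Walk the rollout backwards, starting from the critic's value of the state
    # that follows the last step; masks[t] is 0 where an episode ended, which
    # cuts the bootstrap across episode boundaries.
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns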