Code Example #1
File: run_mujoco.py  Project: zhengxiawu/deep_rl
def main():
    """Main."""
    # Initialize environment
    env = gym.make(args.env)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]

    print('---------------------------------------')
    print('Environment:', args.env)
    print('Algorithm:', args.algo)
    print('State dimension:', obs_dim)
    print('Action dimension:', act_dim)
    print('Action limit:', act_limit)
    print('---------------------------------------')

    # Set a random seed
    env.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Create an agent
    if args.algo == 'ddpg' or args.algo == 'td3':
        agent = Agent(env,
                      args,
                      device,
                      obs_dim,
                      act_dim,
                      act_limit,
                      expl_before=10000,
                      act_noise=0.1,
                      hidden_sizes=(256, 256),
                      buffer_size=int(1e6),
                      batch_size=256,
                      policy_lr=3e-4,
                      qf_lr=3e-4)
    elif args.algo == 'sac':
        agent = Agent(env,
                      args,
                      device,
                      obs_dim,
                      act_dim,
                      act_limit,
                      expl_before=10000,
                      # In HalfCheetah-v2 and Ant-v2, an entropy coefficient of 0.2
                      # performs best, while in Humanoid-v2, 0.05 performs best.
                      alpha=0.2,
                      hidden_sizes=(256, 256),
                      buffer_size=int(1e6),
                      batch_size=256,
                      policy_lr=3e-4,
                      qf_lr=3e-4)
    elif args.algo == 'asac':
        agent = Agent(env,
                      args,
                      device,
                      obs_dim,
                      act_dim,
                      act_limit,
                      expl_before=10000,
                      automatic_entropy_tuning=True,
                      hidden_sizes=(256, 256),
                      buffer_size=int(1e6),
                      batch_size=256,
                      policy_lr=3e-4,
                      qf_lr=3e-4)
    elif args.algo == 'tac':
        agent = Agent(env,
                      args,
                      device,
                      obs_dim,
                      act_dim,
                      act_limit,
                      expl_before=10000,
                      alpha=0.2,
                      log_type='log-q',
                      entropic_index=1.2,
                      hidden_sizes=(256, 256),
                      buffer_size=int(1e6),
                      batch_size=256,
                      policy_lr=3e-4,
                      qf_lr=3e-4)
    elif args.algo == 'atac':
        agent = Agent(env,
                      args,
                      device,
                      obs_dim,
                      act_dim,
                      act_limit,
                      expl_before=10000,
                      log_type='log-q',
                      entropic_index=1.2,
                      automatic_entropy_tuning=True,
                      hidden_sizes=(256, 256),
                      buffer_size=int(1e6),
                      batch_size=256,
                      policy_lr=3e-4,
                      qf_lr=3e-4)
    else:  # vpg, npg, trpo, ppo
        agent = Agent(env,
                      args,
                      device,
                      obs_dim,
                      act_dim,
                      act_limit,
                      sample_size=4096)

    # If we have a saved model, load it
    if args.load is not None:
        pretrained_model_path = os.path.join('./save_model', str(args.load))
        pretrained_model = torch.load(pretrained_model_path,
                                      map_location=device)
        agent.policy.load_state_dict(pretrained_model)

    # Create a SummaryWriter object by TensorBoard
    if args.tensorboard and args.load is None:
        dir_name = 'runs/' + args.env + '/' \
                           + args.algo \
                           + '_s_' + str(args.seed) \
                           + '_t_' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        writer = SummaryWriter(log_dir=dir_name)

    start_time = time.time()

    total_num_steps = 0
    train_sum_returns = 0.
    train_num_episodes = 0
    eval_sum_returns = 0.
    eval_num_episodes = 0

    # Main loop
    for i in range(args.iterations):
        # Perform the training phase, during which the agent learns
        if args.phase == 'train':
            train_step_count = 0
            while train_step_count <= args.steps_per_iter:
                agent.eval_mode = False

                # Run one episode
                train_step_length, train_episode_return = agent.run(
                    args.max_step)

                total_num_steps += train_step_length
                train_step_count += train_step_length
                train_sum_returns += train_episode_return
                train_num_episodes += 1

                train_average_return = train_sum_returns / train_num_episodes if train_num_episodes > 0 else 0.0

                # Log experiment result for training steps
                if args.tensorboard and args.load is None:
                    writer.add_scalar('Train/AverageReturns',
                                      train_average_return, total_num_steps)
                    writer.add_scalar('Train/EpisodeReturns',
                                      train_episode_return, total_num_steps)
                    if args.algo == 'asac' or args.algo == 'atac':
                        writer.add_scalar('Train/Alpha', agent.alpha,
                                          total_num_steps)

        # Perform the evaluation phase -- no learning
        agent.eval_mode = True

        for _ in range(10):
            # Run one episode
            eval_step_length, eval_episode_return = agent.run(args.max_step)

            eval_sum_returns += eval_episode_return
            eval_num_episodes += 1

        eval_average_return = eval_sum_returns / eval_num_episodes if eval_num_episodes > 0 else 0.0

        # Log experiment result for evaluation steps
        if args.tensorboard and args.load is None:
            writer.add_scalar('Eval/AverageReturns', eval_average_return,
                              total_num_steps)
            writer.add_scalar('Eval/EpisodeReturns', eval_episode_return,
                              total_num_steps)

        if args.phase == 'train':
            print('---------------------------------------')
            print('Iterations:', i + 1)
            print('Steps:', total_num_steps)
            print('Episodes:', train_num_episodes)
            print('EpisodeReturn:', round(train_episode_return, 2))
            print('AverageReturn:', round(train_average_return, 2))
            print('EvalEpisodes:', eval_num_episodes)
            print('EvalEpisodeReturn:', round(eval_episode_return, 2))
            print('EvalAverageReturn:', round(eval_average_return, 2))
            print('OtherLogs:', agent.logger)
            print('Time:', int(time.time() - start_time))
            print('---------------------------------------')

            # Save the trained model
            if (i + 1) >= 180 and (i + 1) % 20 == 0:
                if not os.path.exists('./save_model'):
                    os.mkdir('./save_model')

                ckpt_path = os.path.join('./save_model/' + args.env + '_' + args.algo \
                                                                    + '_s_' + str(args.seed) \
                                                                    + '_i_' + str(i + 1) \
                                                                    + '_tr_' + str(round(train_episode_return, 2)) \
                                                                    + '_er_' + str(round(eval_episode_return, 2)) + '.pt')

                torch.save(agent.policy.state_dict(), ckpt_path)
        elif args.phase == 'test':
            print('---------------------------------------')
            print('EvalEpisodes:', eval_num_episodes)
            print('EvalEpisodeReturn:', round(eval_episode_return, 2))
            print('EvalAverageReturn:', round(eval_average_return, 2))
            print('Time:', int(time.time() - start_time))
            print('---------------------------------------')
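
Note: the function above reads module-level names (args, device, Agent, SummaryWriter) that run_mujoco.py defines outside main(). A minimal sketch of that setup is shown below, assuming an argparse-based CLI; the flag names mirror the attributes accessed in the code, while the default values are illustrative guesses rather than the project's actual settings.

# Hypothetical setup sketch (not part of run_mujoco.py): flag names follow the
# attributes used in main(), but the defaults are illustrative only.
import argparse
import datetime
import os
import time

import gym
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

parser = argparse.ArgumentParser(description='Deep RL runner for MuJoCo tasks')
parser.add_argument('--env', type=str, default='HalfCheetah-v2')
parser.add_argument('--algo', type=str, default='sac')
parser.add_argument('--phase', type=str, default='train')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--load', type=str, default=None)
parser.add_argument('--tensorboard', action='store_true')
parser.add_argument('--iterations', type=int, default=200)
parser.add_argument('--steps_per_iter', type=int, default=5000)
parser.add_argument('--max_step', type=int, default=1000)
args = parser.parse_args()

# Select a GPU when one is available; main() passes device straight to the agent.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')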
Code Example #2
File: run_pendulum.py  Project: zheyuzhang/deep_rl
def main():
    """Main."""
    # Initialize environment
    env = gym.make(args.env)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]
    print('State dimension:', obs_dim)
    print('Action dimension:', act_dim)

    # Set a random seed
    env.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Create an agent
    if args.algo == 'ddpg' or args.algo == 'td3':
        agent = Agent(env, args, obs_dim, act_dim, act_limit, act_noise=0.1)
    elif args.algo == 'sac':
        agent = Agent(env, args, obs_dim, act_dim, act_limit, alpha=0.7)
    elif args.algo == 'asac':
        agent = Agent(env,
                      args,
                      obs_dim,
                      act_dim,
                      act_limit,
                      automatic_entropy_tuning=True)
    elif args.algo == 'tac':
        agent = Agent(env,
                      args,
                      obs_dim,
                      act_dim,
                      act_limit,
                      log_type='log-q',
                      entropic_index=1.5)
    elif args.algo == 'atac':
        agent = Agent(env,
                      args,
                      obs_dim,
                      act_dim,
                      act_limit,
                      log_type='log-q',
                      entropic_index=1.5,
                      automatic_entropy_tuning=True)
    else:  # vpg, npg, trpo, ppo
        agent = Agent(env, args, obs_dim, act_dim, act_limit)

    # Create a SummaryWriter object by TensorBoard
    dir_name = 'runs/' + args.env + '/' + args.algo + '/' + str(args.seed) \
                + '_' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    writer = SummaryWriter(log_dir=dir_name)

    start_time = time.time()

    train_num_steps = 0
    train_sum_returns = 0.
    train_num_episodes = 0

    # Runs a full experiment, spread over multiple training episodes
    for episode in range(1, args.training_eps + 1):
        # Perform the training phase, during which the agent learns
        agent.eval_mode = False

        # Run one episode
        train_step_length, train_episode_return = agent.run(args.max_step)

        train_num_steps += train_step_length
        train_sum_returns += train_episode_return
        train_num_episodes += 1

        train_average_return = train_sum_returns / train_num_episodes if train_num_episodes > 0 else 0.0

        # Log experiment result for training episodes
        writer.add_scalar('Train/AverageReturns', train_average_return,
                          episode)
        writer.add_scalar('Train/EpisodeReturns', train_episode_return,
                          episode)
        if args.algo == 'asac' or args.algo == 'atac':
            writer.add_scalar('Train/Alpha', agent.alpha, episode)

        # Perform the evaluation phase -- no learning
        if episode > 0 and episode % args.eval_per_train == 0:
            agent.eval_mode = True

            eval_sum_returns = 0.
            eval_num_episodes = 0

            for _ in range(args.evaluation_eps):
                # Run one episode
                eval_step_length, eval_episode_return = agent.run(
                    args.max_step)

                eval_sum_returns += eval_episode_return
                eval_num_episodes += 1

            eval_average_return = eval_sum_returns / eval_num_episodes if eval_num_episodes > 0 else 0.0

            # Log experiment result for evaluation episodes
            writer.add_scalar('Eval/AverageReturns', eval_average_return,
                              episode)
            writer.add_scalar('Eval/EpisodeReturns', eval_episode_return,
                              episode)

            print('---------------------------------------')
            print('Steps:', train_num_steps)
            print('Episodes:', train_num_episodes)
            print('AverageReturn:', round(train_average_return, 2))
            print('EvalEpisodes:', eval_num_episodes)
            print('EvalAverageReturn:', round(eval_average_return, 2))
            print('OtherLogs:', agent.logger)
            print('Time:', int(time.time() - start_time))
            print('---------------------------------------')

            # Save a training model
            if eval_average_return >= args.threshold_return:
                if not os.path.exists('./tests/save_model'):
                    os.makedirs('./tests/save_model')

                ckpt_path = os.path.join('./tests/save_model/' + args.env + '_' + args.algo \
                                                                                + '_ep_' + str(train_num_episodes) \
                                                                                + '_tr_' + str(round(train_average_return, 2)) \
                                                                                + '_er_' + str(round(eval_average_return, 2)) \
                                                                                + '_t_' + str(int(time.time() - start_time)) + '.pt')

                torch.save(agent.actor.state_dict(), ckpt_path)
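
Note: these scripts seed the environment with env.seed(args.seed), the classic Gym API. On Gym 0.26+ and Gymnasium that method was removed in favor of seeding through reset(). A small compatibility helper along the following lines is one way to keep the examples running on either API; the function name is my own, not part of the project.

# Hypothetical helper (not part of the project): seeds everything under either Gym API.
import numpy as np
import torch


def seed_everything(env, seed):
    """Seed NumPy, PyTorch, and the environment for both old and new Gym APIs."""
    np.random.seed(seed)
    torch.manual_seed(seed)
    try:
        env.seed(seed)               # classic Gym (< 0.26)
    except AttributeError:
        env.reset(seed=seed)         # Gym >= 0.26 / Gymnasium
        env.action_space.seed(seed)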
Code Example #3
def main():
    """Main."""
    # Initialize environment
    env = gym.make(args.env)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]
    print('State dimension:', obs_dim)
    print('Action dimension:', act_dim)

    # Set a random seed
    env.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Create an agent
    if args.algo == 'ddpg' or args.algo == 'td3':
        agent = Agent(env, args, obs_dim, act_dim, act_limit, act_noise=0.1,
                      hidden_size=(400, 300), buffer_size=int(1e6), batch_size=100)
    elif args.algo == 'sac':
        agent = Agent(env, args, obs_dim, act_dim, act_limit, alpha=0.2,
                      hidden_size=(400, 300), buffer_size=int(1e6), batch_size=100)
    elif args.algo == 'asac':
        agent = Agent(env, args, obs_dim, act_dim, act_limit,
                      automatic_entropy_tuning=True,
                      hidden_size=(400, 300), buffer_size=int(1e6), batch_size=100)
    elif args.algo == 'tac':
        agent = Agent(env, args, obs_dim, act_dim, act_limit, alpha=0.2,
                      log_type='log-q', entropic_index=1.5,
                      hidden_size=(400, 300), buffer_size=int(1e6), batch_size=100)
    elif args.algo == 'atac':
        agent = Agent(env, args, obs_dim, act_dim, act_limit,
                      log_type='log-q', entropic_index=1.5,
                      automatic_entropy_tuning=True,
                      hidden_size=(400, 300), buffer_size=int(1e6), batch_size=100)
    else:
        agent = Agent(env, args, obs_dim, act_dim, act_limit,
                      hidden_size=(400, 300), sample_size=4000)

    # Create a SummaryWriter object by TensorBoard
    dir_name = 'runs/' + args.env + '/' + args.algo + '/' + str(args.seed) + '_' + time.ctime()
    writer = SummaryWriter(log_dir=dir_name)

    start_time = time.time()

    total_num_steps = 0
    train_sum_returns = 0.
    train_num_episodes = 0

    # Main loop
    for i in range(args.iterations):
        train_step_count = 0
        while train_step_count <= args.steps_per_iter:
            # Perform the training phase, during which the agent learns
            agent.eval_mode = False
            
            # Run one episode
            train_step_length, train_episode_return = agent.run(args.max_step)
            
            total_num_steps += train_step_length
            train_step_count += train_step_length
            train_sum_returns += train_episode_return
            train_num_episodes += 1

            train_average_return = train_sum_returns / train_num_episodes if train_num_episodes > 0 else 0.0

            # Log experiment result for training episodes
            writer.add_scalar('Train/AverageReturns', train_average_return, train_num_episodes)
            writer.add_scalar('Train/EpisodeReturns', train_episode_return, train_num_episodes)
            if args.algo == 'asac' or args.algo == 'atac':
                writer.add_scalar('Train/Alpha', agent.alpha, train_num_episodes)

        # Perform the evaluation phase -- no learning
        agent.eval_mode = True
        
        eval_sum_returns = 0.
        eval_num_episodes = 0

        for _ in range(10):
            # Run one episode
            eval_step_length, eval_episode_return = agent.run(args.max_step)

            eval_sum_returns += eval_episode_return
            eval_num_episodes += 1

        eval_average_return = eval_sum_returns / eval_num_episodes if eval_num_episodes > 0 else 0.0

        # Log experiment result for evaluation episodes
        writer.add_scalar('Eval/AverageReturns', eval_average_return, train_num_episodes)
        writer.add_scalar('Eval/EpisodeReturns', eval_episode_return, train_num_episodes)

        print('---------------------------------------')
        print('Episodes:', train_num_episodes)
        print('Steps:', total_num_steps)
        print('AverageReturn:', round(train_average_return, 2))
        print('EvalEpisodes:', eval_num_episodes)
        print('EvalAverageReturn:', round(eval_average_return, 2))
        print('OtherLogs:', agent.logger)
        print('Time:', int(time.time() - start_time))
        print('---------------------------------------')

        # Save a training model
        ckpt_path = os.path.join('./tests/save_model/' + args.env + '/' + args.algo + '/' \
                                                                        + '_i_' + str(i) \
                                                                        + '_st_' + str(total_num_steps) \
                                                                        + '_ep_' + str(train_num_episodes) \
                                                                        + '_rt_' + str(round(train_average_return, 2)) \
                                                                        + '_t_' + str(int(time.time() - start_time)) + '.pt')

        # Create the nested save directory (env/algo) before writing the checkpoint
        os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)

        torch.save(agent.actor.state_dict(), ckpt_path)
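
Note: Example #3 saves only agent.actor.state_dict(); restoring it follows the same load_state_dict pattern that Examples #1 and #4 use for agent.policy. Below is a hedged loader sketch, assuming the agent exposes actor and eval_mode attributes as in the code above; the helper name and signature are hypothetical.

# Hypothetical loader sketch (not part of the project).
import torch


def load_actor(agent, ckpt_path, device='cpu'):
    """Restore a saved actor checkpoint and switch the agent to evaluation mode."""
    state_dict = torch.load(ckpt_path, map_location=device)
    agent.actor.load_state_dict(state_dict)
    agent.eval_mode = True   # evaluate the restored policy without further learning
    return agent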
Code Example #4
File: run_pendulum.py  Project: wwchung91/deep_rl
def main():
    """Main."""
    # Initialize environment
    env = gym.make(args.env)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]

    print('---------------------------------------')
    print('Environment:', args.env)
    print('Algorithm:', args.algo)
    print('State dimension:', obs_dim)
    print('Action dimension:', act_dim)
    print('Action limit:', act_limit)
    print('---------------------------------------')

    # Set a random seed
    env.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Create an agent
    if args.algo == 'ddpg' or args.algo == 'td3':
        agent = Agent(env, args, device, obs_dim, act_dim, act_limit)
    elif args.algo == 'sac':
        agent = Agent(env,
                      args,
                      device,
                      obs_dim,
                      act_dim,
                      act_limit,
                      alpha=0.5)
    elif args.algo == 'asac':
        agent = Agent(env,
                      args,
                      device,
                      obs_dim,
                      act_dim,
                      act_limit,
                      automatic_entropy_tuning=True)
    elif args.algo == 'tac':
        agent = Agent(env,
                      args,
                      device,
                      obs_dim,
                      act_dim,
                      act_limit,
                      alpha=0.5,
                      log_type='log-q',
                      entropic_index=1.2)
    elif args.algo == 'atac':
        agent = Agent(env,
                      args,
                      device,
                      obs_dim,
                      act_dim,
                      act_limit,
                      log_type='log-q',
                      entropic_index=1.2,
                      automatic_entropy_tuning=True)
    else:  # vpg, npg, trpo, ppo
        agent = Agent(env, args, device, obs_dim, act_dim, act_limit)

    # If we have a saved model, load it
    if args.load is not None:
        pretrained_model_path = os.path.join('./save_model', str(args.load))
        pretrained_model = torch.load(pretrained_model_path,
                                      map_location=device)
        agent.policy.load_state_dict(pretrained_model)

    # Create a SummaryWriter object by TensorBoard
    if args.tensorboard and args.load is None:
        dir_name = 'runs/' + args.env + '/' \
                           + args.algo \
                           + '_s_' + str(args.seed) \
                           + '_t_' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        writer = SummaryWriter(log_dir=dir_name)

    start_time = time.time()

    train_num_steps = 0
    train_sum_returns = 0.
    train_num_episodes = 0

    # Main loop
    for i in range(args.iterations):
        # Perform the training phase, during which the agent learns
        if args.phase == 'train':
            agent.eval_mode = False

            # Run one episode
            train_step_length, train_episode_return = agent.run(args.max_step)

            train_num_steps += train_step_length
            train_sum_returns += train_episode_return
            train_num_episodes += 1

            train_average_return = train_sum_returns / train_num_episodes if train_num_episodes > 0 else 0.0

            # Log experiment result for training episodes
            if args.tensorboard and args.load is None:
                writer.add_scalar('Train/AverageReturns', train_average_return,
                                  i)
                writer.add_scalar('Train/EpisodeReturns', train_episode_return,
                                  i)
                if args.algo == 'asac' or args.algo == 'atac':
                    writer.add_scalar('Train/Alpha', agent.alpha, i)

        # Perform the evaluation phase -- no learning
        if (i + 1) % args.eval_per_train == 0:
            eval_sum_returns = 0.
            eval_num_episodes = 0
            agent.eval_mode = True

            for _ in range(100):
                # Run one episode
                eval_step_length, eval_episode_return = agent.run(
                    args.max_step)

                eval_sum_returns += eval_episode_return
                eval_num_episodes += 1

            eval_average_return = eval_sum_returns / eval_num_episodes if eval_num_episodes > 0 else 0.0

            # Log experiment result for evaluation episodes
            if args.tensorboard and args.load is None:
                writer.add_scalar('Eval/AverageReturns', eval_average_return,
                                  i)
                writer.add_scalar('Eval/EpisodeReturns', eval_episode_return,
                                  i)

            if args.phase == 'train':
                print('---------------------------------------')
                print('Iterations:', i + 1)
                print('Steps:', train_num_steps)
                print('Episodes:', train_num_episodes)
                print('EpisodeReturn:', round(train_episode_return, 2))
                print('AverageReturn:', round(train_average_return, 2))
                print('EvalEpisodes:', eval_num_episodes)
                print('EvalEpisodeReturn:', round(eval_episode_return, 2))
                print('EvalAverageReturn:', round(eval_average_return, 2))
                print('OtherLogs:', agent.logger)
                print('Time:', int(time.time() - start_time))
                print('---------------------------------------')

                # Save the trained model
                if eval_average_return >= args.threshold_return:
                    if not os.path.exists('./save_model'):
                        os.mkdir('./save_model')

                    ckpt_path = os.path.join('./save_model/' + args.env + '_' + args.algo \
                                                                        + '_s_' + str(args.seed) \
                                                                        + '_i_' + str(i + 1) \
                                                                        + '_tr_' + str(round(train_episode_return, 2)) \
                                                                        + '_er_' + str(round(eval_episode_return, 2)) + '.pt')

                    torch.save(agent.policy.state_dict(), ckpt_path)
            elif args.phase == 'test':
                print('---------------------------------------')
                print('EvalEpisodes:', eval_num_episodes)
                print('EvalEpisodeReturn:', round(eval_episode_return, 2))
                print('EvalAverageReturn:', round(eval_average_return, 2))
                print('Time:', int(time.time() - start_time))
                print('---------------------------------------')
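
Note: all four examples assemble checkpoint filenames from long chains of string concatenation. A small helper in the same spirit keeps that format in one place and creates the target directory first; this is a hypothetical refactoring sketch, not code from either project.

# Hypothetical refactoring sketch (not part of either project).
import os


def make_ckpt_path(save_dir, env_name, algo, seed, iteration, train_return, eval_return):
    """Build a checkpoint path like '<env>_<algo>_s_<seed>_i_<n>_tr_<x>_er_<y>.pt'."""
    os.makedirs(save_dir, exist_ok=True)
    name = '{}_{}_s_{}_i_{}_tr_{}_er_{}.pt'.format(
        env_name, algo, seed, iteration,
        round(train_return, 2), round(eval_return, 2))
    return os.path.join(save_dir, name)


# Illustrative usage with made-up values:
# ckpt_path = make_ckpt_path('./save_model', 'Pendulum-v0', 'sac', 0, 100, -150.3, -140.8)
# torch.save(agent.policy.state_dict(), ckpt_path)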