Example 1
import os

import numpy as np
import torch

# EnvWrapper and DeterministicPolicy are project-local modules.


def main(env_name, seed, individual, args, eval_episodes=10):
    env = EnvWrapper(env_name)

    state_dim = sum(env.unwrapped().observation_space.shape)
    action_dim = sum(env.unwrapped().action_space.shape)
    hidden_sizes = args['hidden_sizes']
    activation = args['activation']
    layernorm = args['layernorm']

    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    policy = DeterministicPolicy(state_dim, action_dim, hidden_sizes, -1,
                                 activation, layernorm).eval()

    file_dir = os.path.abspath(os.path.dirname(__file__))
    save_dir = os.path.join(
        file_dir,
        'results',
        env_name,
        args['activation'] + ('_LayerNorm' if args['layernorm'] else ''),
        'seed' + str(seed),
    )
    model_path = os.path.join(save_dir, 'learned_model',
                              'individual' + str(individual) + '.pth')
    model_state_dict = torch.load(model_path)
    policy.load_state_dict(model_state_dict)

    # Re-seed so evaluation episodes differ from the training seed.
    env.seed(seed + 100)

    episode_rewards = []
    for _ in range(eval_episodes):
        state = env.reset()
        done = False
        sum_rewards = 0
        while not done:
            # env.render()
            action = policy.deterministic_action(
                torch.tensor(state.reshape(1, -1), dtype=torch.float))
            next_state, reward, done, _ = env.step(action)
            sum_rewards += reward
            state = next_state
        episode_rewards.append(sum_rewards)
        print(
            f'Episode: {len(episode_rewards)} Sum Rewards: {sum_rewards:.3f}')

    avg_reward = np.mean(episode_rewards)
    print('\n---------------------------------------')
    print(f'Evaluation over {eval_episodes} episodes: {avg_reward:.3f}')
    print('---------------------------------------')
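
A minimal invocation sketch for this entry point; the environment name and every value in args below are illustrative assumptions, not values from the original project:

if __name__ == '__main__':
    main('HalfCheetah-v2', seed=0, individual=0,
         args={'hidden_sizes': [256, 256],  # assumed
               'activation': 'relu',        # assumed
               'layernorm': False})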
Example 2
import time
from time import gmtime, strftime

# SummaryWriter may come from tensorboardX or torch.utils.tensorboard,
# depending on the original project; MADDPG and EnvWrapper are project-local.
from tensorboardX import SummaryWriter


def main(arglist):
    ACTORS = 1
    env = EnvWrapper(arglist.scenario, ACTORS, arglist.saved_episode)
    if arglist.eval:
        current_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
        writer = SummaryWriter(log_dir='./logs/' + current_time + '-' +
                               arglist.scenario)
    maddpg_wrapper = MADDPG(ACTORS)

    maddpg_wrapper.create_agents(env, arglist)

    j = 0
    for episode in range(arglist.max_episode):
        obs = env.reset()
        terminal = False
        maddpg_wrapper.reset()
        total_reward = [0 for _ in maddpg_wrapper.workers]
        step = 0

        while not terminal and step < 25:
            # Note: in this snippet rendering runs when arglist.eval is
            # False, while learning updates, checkpointing, and logging run
            # when it is True.
            if not arglist.eval:
                env.render(0)
                time.sleep(0.03)

            actions = maddpg_wrapper.take_actions(obs)
            obs2, reward, done = env.step(actions)

            for actor in range(ACTORS):
                for i, rew in enumerate(reward[actor]):
                    total_reward[i] += rew

            j += ACTORS
            #terminal = all(done)
            if arglist.eval:
                maddpg_wrapper.update(j, ACTORS, actions, reward, obs, obs2,
                                      done)

            obs = obs2
            step += 1

        if arglist.eval and episode % arglist.saved_episode == 0 and episode > 0:
            maddpg_wrapper.save(episode)

        if arglist.eval:
            for worker, ep_ave_max in zip(maddpg_wrapper.workers,
                                          maddpg_wrapper.ep_ave_max_q_value):
                print(worker.pos, ' => average_max_q: ',
                      ep_ave_max / float(step), ' Reward: ',
                      total_reward[worker.pos], ' Episode: ', episode)
                writer.add_scalar(
                    str(worker.pos) + '/Average_max_q',
                    ep_ave_max / float(step), episode)
                writer.add_scalar(
                    str(worker.pos) + '/Reward Agent',
                    total_reward[worker.pos], episode)

    env.close()
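
The parse_args feeding this main is not shown; a hypothetical argparse sketch exposing only the attributes the snippet reads (flag names and defaults are assumptions):

import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--scenario', type=str, default='simple_spread')
    parser.add_argument('--max_episode', type=int, default=10000)
    parser.add_argument('--saved_episode', type=int, default=100)
    parser.add_argument('--eval', action='store_true')
    return parser.parse_args()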
Example 3
import sys

import torch

# EnvWrapper is a project-local module.


def rollout_worker(index, task_pipe, result_pipe, model_bucket, env_name):
    env = EnvWrapper(env_name)
    env.seed(index)

    while True:
        identifier = task_pipe.recv()
        if identifier == 'TERMINATE':
            sys.exit(0)

        policy = model_bucket[identifier]

        fitness = 0.0
        num_frames = 0
        state = env.reset()
        done = False
        rollout_transition = []

        while not done:
            action = policy.deterministic_action(torch.tensor(state.reshape(1, -1), dtype=torch.float))
            next_state, reward, done, info = env.step(action)
            fitness += reward
            num_frames += 1

            # Treat an episode that ends only because it hit the time limit
            # as non-terminal in the stored transition, so the mask still
            # allows bootstrapping.
            done_buffer = done if num_frames < env.unwrapped()._max_episode_steps else False

            rollout_transition.append({
                'state': state,
                'next_state': next_state,
                'action': action,
                'reward': reward,
                'mask': float(not done_buffer)
            })
            state = next_state

        result_pipe.send([identifier, fitness, rollout_transition])
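
A hedged sketch of the parent side driving this worker, assuming the usual multiprocessing Pipe/Process pattern; apart from the worker's own protocol (an identifier in, [identifier, fitness, transitions] out, 'TERMINATE' to stop), the names and the environment are assumptions:

import multiprocessing as mp

# Assumes model_bucket (a list of candidate policies) exists and that the
# fork start method makes it visible to the child process.
task_master, task_worker = mp.Pipe()
result_worker, result_master = mp.Pipe()
proc = mp.Process(target=rollout_worker,
                  args=(0, task_worker, result_worker, model_bucket, 'Hopper-v2'))
proc.start()

task_master.send(0)  # request a rollout for the policy at index 0
identifier, fitness, transitions = result_master.recv()

task_master.send('TERMINATE')
proc.join()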
Example 4
import gym
import torch

# EnvWrapper, Config, DQN, and Agent are project-local classes.


# __init__ of the trainer class; the enclosing class definition is not part
# of the original excerpt.
def __init__(self, name, num_episodes=500):
    self.name = name
    self.num_episodes = num_episodes
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.env = gym.make(name).unwrapped
    self.env.reset()
    self.env_w = EnvWrapper(self.env, self.device)
    self.cfg = Config()
    self.cfg.n_actions = self.env.action_space.n
    self.cfg.policy_net = DQN(self.env_w.screen_height, self.env_w.screen_width,
                              self.cfg.n_actions).to(self.device)
    self.cfg.target_net = DQN(self.env_w.screen_height, self.env_w.screen_width,
                              self.cfg.n_actions).to(self.device)
    self.agent = Agent(self.env, self.env_w, self.device, self.cfg)
Example 5
from collections import deque

import numpy as np
import torch

# EnvWrapper, DDPGAgent, D4PGAgent, and the BATCH_SIZE/LR_* constants are
# project-local.


def train_agent(episodes=100, model='DDPG', print_every=10):
    if model.lower() == 'd4pg':
        agent = D4PGAgent()
        print('Use D4PG agent......\n')
    else:
        agent = DDPGAgent()
        print('Use default DDPG agent......\n')

    print('Batch size: ', BATCH_SIZE)
    print('Actor learning rate: ', LR_ACTOR)
    print('Critic learning rate: ', LR_CRITIC)
    print('\n')

    # Raw string so the Windows path backslash is never treated as an escape.
    env = EnvWrapper(file_name=r'Reacher_Windows_x86_64\Reacher.exe',
                     train_mode=True)

    scores = []
    scores_window = deque(maxlen=100)

    for ep in range(1, episodes + 1):
        agent.reset()
        agent.states = env.reset()

        for _ in range(agent.max_steps):
            agent.actions = agent.act(add_noise=True)
            agent.rewards, agent.next_states, agent.dones = env.step(
                agent.actions)
            agent.step()
            agent.states = agent.next_states

        scores.append(agent.scores.mean())
        scores_window.append(agent.scores.mean())

        if ep % print_every == 0:
            print('Episode %d, avg score: %.2f' % (ep, agent.scores.mean()))

        if np.mean(scores_window) >= 30:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(ep - 100, np.mean(scores_window)))
            torch.save(agent.actor.state_dict(),
                       'checkpoints/reacher_%s_actor_checkpoint.pth' % model)
            torch.save(agent.critic.state_dict(),
                       'checkpoints/reacher_%s_critic_checkpoint.pth' % model)
            break  # stop so the checkpoint is written only once

    env.close()

    return scores, agent
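
A minimal call sketch; the argument values are illustrative:

scores, agent = train_agent(episodes=150, model='d4pg', print_every=5)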
Example 6
import tensorflow as tf  # TF1-style API (tf.compat.v1 under TF2)

# EnvWrapper, DeepQN, and Tester are project-local classes.


def test(args):
    gpu_ops = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_ops)
    sess = tf.Session(config=config)
    env = EnvWrapper(args)
    qn = DeepQN(state_shape=env.state_shape,
                num_actions=env.num_actions,
                gamma=args.gamma,
                type=args.qn_version)
    qn.reset_sess(sess)
    qn.load(args.model_path)
    testor = Tester(qn,
                    env,
                    report_interval=args.tester_report_interval,
                    episodes=args.tester_episodes)
    _, rs = testor.run(qn, sess, render=args.render)
    with open(args.model_path + '_test.log', 'w') as f:
        f.write(str(rs))
    return
Example 7
import time

import numpy as np

# IMG_SIZE, EnvWrapper, and the helpers module are defined elsewhere in the
# original project.
FRAMES_IN_STATE_COUNT = 4
EPSILON = 0.05
GAME_ENV_NAME = 'BreakoutDeterministic-v4'
RENDER = False
PRINT_LATEX = True
MODEL_PATH_PREFIX = './drive/app/models/'
# list of models with iteration count as file names
STARTING_MODELS = [
    0, 200, 600, 800, 1000, 1200, 1400, 1550, 1800, 2000, 2200, 2400, 2500,
    2700, 3000, 3200, 3500, 3800, 4150, 4400, 4600, 4800, 5000, 5250, 5400,
    5600, 5800, 6000, 6200, 6400, 6800, 7000, 7200, 7400, 7600, 7750, 8000
]

GAMES_PER_MODEL = 5

env = EnvWrapper(GAME_ENV_NAME, IMG_SIZE, FRAMES_IN_STATE_COUNT, 1)
action_count = env.action_count
results = np.zeros((len(STARTING_MODELS), 2))
program_start_time = time.time()

for i, model_name in enumerate(STARTING_MODELS):
    model_path = MODEL_PATH_PREFIX + str(model_name)
    model = helpers.load_model(model_path)
    total_games_reward = 0
    start_time = time.time()
    for i_game in range(1, GAMES_PER_MODEL + 1):

        env.reset()
        if RENDER:
            env.render()
Example 8
import numpy as np
import tensorflow as tf  # TF1-style API

# EnvWrapper, MemoryReplayer, DeepQN, Tester, and get_eps are project-local.


def train():
    print(tf.__version__)

    gpu_ops = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_ops)
    sess = tf.Session(config=config)

    env1 = EnvWrapper('MountainCar-v0', mod_r=False)
    env2 = EnvWrapper('MountainCar-v0', mod_r=True)

    mr = MemoryReplayer(env1.state_shape, capacity=50000)
    qn = DeepQN(state_shape=env1.state_shape, num_actions=env1.num_actions, gamma=0.99)
    qn.reset_sess(sess)
    qn.set_train(0.008)

    init = tf.global_variables_initializer()
    sess.run(init)

    testor = Tester(qn, env1, report_interval=100, episodes=100)

    score = []

    for epi in range(1000000):
        s = env2.reset()
        done = False
        rc = 0

        while not done:
            a = qn.select_action_eps_greedy(get_eps(epi), s)
            a_ = a[0]
            s_, r, done, _ = env2.step(a_)
            mr.remember(s, s_, r, a_, done)
            s = s_
            rc += r

        score.append(rc)

        # replay
        s, s_, r, a, done = mr.replay(batch_size=64)
        qn.train(s, s_, r, a, done)

        if (epi + 1) % 200 == 0:
            avg_score = np.mean(score)
            print('avg score last 200 episodes ', avg_score)
            score = []

            testor.run(qn, sess, render=False)


    return
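
get_eps is referenced in this and the later training loops but never defined in the excerpts; a common linearly-decaying epsilon schedule, shown purely as an assumption:

def get_eps(episode, eps_start=1.0, eps_end=0.05, decay_episodes=10000):
    """Hypothetical sketch: linear decay from eps_start to eps_end."""
    frac = min(episode / decay_episodes, 1.0)
    return eps_start + frac * (eps_end - eps_start)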
Example 9
import tensorflow as tf  # TF1-style API

# EnvWrapper and DQN are project-local modules.
ENV_NAME = 'Seaquest-v0'
TOTAL_FRAMES = 20000000
MAX_TRAINING_STEPS = 20 * 60 * 60 // 3  # // keeps the step budget an int
TESTING_GAMES = 30
MAX_TESTING_STEPS = 5 * 60 * 60 // 3
TRAIN_AFTER_FRAMES = 50000
epoch_size = 50000
MAX_NOOP_START = 30
LOG_DIR = 'logs'
outdir = 'results'
test_mode = True
logger = tf.summary.FileWriter(LOG_DIR)
# Initialize the TensorFlow session
session = tf.InteractiveSession()

env = EnvWrapper(ENV_NAME, test_mode)
# print(dir(env.action_space))
agent = DQN(
    state_size=env.observation_space.shape,
    action_size=env.action_space.n,
    session=session,
    summary_writer=logger,
    exploration_period=1000000,
    minibatch_size=32,
    discount_factor=0.99,
    experience_replay_buffer=1000000,
    target_qnet_update_frequency=20000,
    initial_exploration_epsilon=1.0,
    final_exploration_epsilon=0.1,
    reward_clipping=1.0,
)
Example 10
    # This excerpt begins inside a CSV-logging helper whose definition is
    # not included; csv, num_good_agents, and file_name come from the
    # surrounding code.
    with open(file_name, "a", newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        for episode in range(len(goal_agents_list)):
            goal_agents = goal_agents_list[episode]

            episode_reward = episode_rewards[episode]
            row = [episode, episode_reward]
            row.extend([goal_agents.count(f"agent {agent}")
                        for agent in range(num_good_agents)])
            writer.writerow(row)


if __name__ == '__main__':
    arglist = parse_args()
    # Create environment
    env = EnvWrapper(arglist.scenario, arglist.benchmark,
                     agent_speeds=arglist.agent_speeds)
    with U.single_threaded_session():
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        # Initialize
        U.initialize()
        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if (arglist.rollout or arglist.shapley_M or arglist.true_shapley
                or arglist.restore_episode != 0 or arglist.benchmark):
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        if arglist.true_shapley:
Example 11
import tensorflow as tf  # TF1-style API

# EnvWrapper, MemoryReplayer, DeepQN, Plotter, Tester, get_eps, and record
# are project-local.


def main():
    print(tf.__version__)
    gpu_ops = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_ops, log_device_placement=False)
    sess = tf.Session(config=config)

    env = EnvWrapper('CartPole-v0')
    mr = MemoryReplayer(env.state_shape, capacity=100000, enabled=True)

    # set type='v1' for the linear model, 'v3' for the three-layer model
    # (two tanh activations), 'v5' for the dual variant
    qn = DeepQN(state_shape=env.state_shape,
                num_actions=env.num_actions,
                gamma=0.99,
                type='v1')
    qn.reset_sess(sess)
    qn.set_train(0.001)

    init = tf.global_variables_initializer()
    sess.run(init)

    plotter = Plotter()
    testor = Tester(qn, env, report_interval=100)

    print('Pretrain test:')
    testor.run(qn, sess)

    score = []
    reward_record = []
    cnt_iter = 0

    for epi in range(1000000):
        s = env.reset()
        done = False
        rc = 0

        while not done:
            a = qn.select_action_eps_greedy(get_eps(epi), s)
            a_ = a[0]
            s_, r, done, _ = env.step(a_)
            mr.remember(s, s_, r, a_, done)
            s = s_
            rc += r
            cnt_iter += 1
            if (cnt_iter + 1) % 10000 == 0:
                r_test = record(qn, sess, env)
                print("Iteration {}, avg reward is {}".format(
                    cnt_iter, r_test))
                reward_record.append(r_test)

        score.append(rc)

        # replay
        s, s_, r, a, done = mr.replay(batch_size=64)
        qn.train(s, s_, r, a, done)

        if cnt_iter > 1000000:
            break

        # if (epi + 1) % 200 == 0:
        #     avg_score = np.mean(score)
        #     plotter.plot(avg_score)
        #     print('avg score last 200 episodes ', avg_score)
        #     score = []
        #     if avg_score > 195:
        #         qn.save(path='./trained_model_linear_CartPole_w_mr.ckpt')
        #         break
    with open('CartPole-v0_q2_data.log', 'w') as f:
        f.write(str(reward_record))
    return
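
record is called in the loop above but not defined in the excerpt; a plausible sketch that averages reward over greedy evaluation episodes, assuming eps=0 makes select_action_eps_greedy act greedily:

def record(qn, sess, env, episodes=20):
    """Hypothetical sketch: mean reward over greedy evaluation episodes."""
    # sess is kept only to match the call site above.
    total = 0.0
    for _ in range(episodes):
        s = env.reset()
        done = False
        while not done:
            a = qn.select_action_eps_greedy(0.0, s)
            s, r, done, _ = env.step(a[0])
            total += r
    return total / episodes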
Example 12
import multiprocessing

import tensorflow as tf  # TF1-style API

from env_wrapper import EnvWrapper
from a2c_agent import A2CAgent
from state_generator import StateGenerator
from utils import preprocess_experiences

# Note: n_env imported here is overwritten in __main__ by
# multiprocessing.cpu_count().
from training_parameters import (
    clip_range, sample_size, epoch, n_env, n_steps, skip_frames, ent_coef,
    vf_coef, max_grad_norm, episodes_before_training, render, input_shape,
    lr, GAMMA, LAMBDA, load_model, frame_size, stack_size, max_steps)


def run_env(env):
    env.step(n_steps)


if __name__ == "__main__":
    n_env = multiprocessing.cpu_count()
    envs = [
        EnvWrapper(frame_size, skip_frames, stack_size) for _ in range(n_env)
    ]
    action_size = envs[0].get_action_size()

    tf.reset_default_graph()
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    train_model = A2CAgent("train_model", True, sess, input_shape, action_size,
                           lr, GAMMA, LAMBDA, max_grad_norm, ent_coef, vf_coef,
                           clip_range, load_model)

    old_model = A2CAgent("old_model", False, sess, input_shape, action_size,
                         lr, GAMMA, LAMBDA, max_grad_norm, ent_coef, vf_coef,
                         clip_range, False)

    sync_ops = old_model.create_sync_ops(train_model)
Example 13
import copy
import os
import pickle
from collections import deque

import numpy as np
import tensorflow as tf  # TF1-style API

# EnvWrapper, MemoryReplayer, DeepQN, Tester, utils, and get_eps are
# project-local.


def train(args=None):
    gpu_ops = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_ops, log_device_placement=False)
    sess = tf.Session(config=config)
    args_test = copy.copy(args)
    args_test.use_monitor = False
    env = EnvWrapper(args.env, mod_r=True)
    env_test = EnvWrapper(args.env, mod_r=False)

    if args.use_mr:
        print('Set experience replay ON')
    else:
        print('Set experience replay OFF')

    path = './tmp/burn_in_' + args.env + '-' + str(
        args.mr_capacity) + '.pickle'
    if os.path.exists(path):
        print('Found existing burn_in memory replayer, load...')
        with open(path, 'rb') as f:
            mr = pickle.load(file=f)
    else:
        mr = MemoryReplayer(env.state_shape,
                            capacity=args.mr_capacity,
                            enabled=args.use_mr)
        # burn_in
        mr = utils.burn_in(env, mr)

    # set type='v1' for the linear model, 'v3' for the three-layer model
    # (two tanh activations); type='v5' uses the dual variant

    print('Set Q-network version: ', args.qn_version)
    qn = DeepQN(state_shape=env.state_shape,
                num_actions=env.num_actions,
                gamma=args.gamma,
                type=args.qn_version)

    qn.reset_sess(sess)

    qn.set_train(args.lr)

    if not args.reuse_model:
        print('Set reuse model      OFF')
        init = tf.global_variables_initializer()
        sess.run(init)
    else:
        print('Set reuse model      ON')
        try:
            qn.load('./tmp/qn-' + args.qn_version + '-' + args.env +
                    '-keyinterrupt' + '.ckpt')
            optimizer_scope = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                "optimizer")
            init = tf.variables_initializer(optimizer_scope)
            sess.run(init)
            print('Found previous model')
        except tf.errors.NotFoundError:
            print('No previous model found, init new model')
            init = tf.global_variables_initializer()
            sess.run(init)

    # plotter = Plotter(save_path=args.performance_plot_path, interval=args.performance_plot_interval,
    #                   episodes=args.performance_plot_episodes)

    pretrain_test = Tester(qn, env_test, report_interval=100)
    print('Pretrain test:')
    pretrain_test.run(qn, sess)
    print('Pretrain test done.')

    tester_1 = Tester(qn,
                      env,
                      episodes=args.performance_plot_episodes,
                      report_interval=args.performance_plot_episodes,
                      title='test-r-mod')
    tester_2 = Tester(qn,
                      env_test,
                      episodes=args.performance_plot_episodes,
                      report_interval=args.performance_plot_episodes,
                      title='test-r-real')

    score = deque([], maxlen=args.performance_plot_episodes)
    reward_record = []

    try:
        for epi in range(args.max_episodes):
            s = env.reset()
            done = False
            rc = 0

            while not done:
                a = qn.select_action_eps_greedy(get_eps(epi), s)
                a_ = a[0]
                s_, r, done, _ = env.step(a_)
                mr.remember(s, s_, r, a_, done)
                s = s_
                rc += r
            score.append(rc)
            # replay
            s, s_, r, a, done = mr.replay(batch_size=args.batch_size)
            qn.train(s, s_, r, a, done)

            if (epi + 1) % args.performance_plot_interval == 0:
                print('train-r-mod reward avg: ', np.mean(score))
                tester_2.run(qn, sess)
                #r_avg, _ = tester_2.run(qn, sess)
                # reward_record.append(r_avg)
    except KeyboardInterrupt:
        qn.save('./tmp/qn-' + args.qn_version + '-' + args.env +
                '-keyinterrupt' + '.ckpt')
        # save mr

        with open(path, 'wb+') as f:
            pickle.dump(mr, f)
        exit(-1)

    qn.save(args.model_path)
    # Note: reward_record stays empty because the appends above are commented
    # out, so the log will contain an empty list.
    with open(args.log_name, 'w') as f:
        f.write(str(reward_record))
    return
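
utils.burn_in is not shown in the excerpt; a plausible sketch that seeds the replay memory with random-policy transitions, using only the env and MemoryReplayer interfaces visible above (the step budget is an assumption):

import numpy as np


def burn_in(env, mr, steps=10000):
    """Hypothetical sketch: fill the replay memory from a uniform random policy."""
    s = env.reset()
    for _ in range(steps):
        a = np.random.randint(env.num_actions)
        s_, r, done, _ = env.step(a)
        mr.remember(s, s_, r, a, done)
        s = env.reset() if done else s_
    return mr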
Example 14
import os

import psutil
from tensorflow.python.client import device_lib

# GAME_ENV_NAME, IMG_SIZE, STARTING_MODEL, LEARN, EnvWrapper, and the
# helpers/atari_model modules are defined elsewhere in the original project.
FRAMES_IN_STATE_COUNT = 4
BATCH_SIZE = 32
MEMORY_SIZE = 1000000
FREEZE_ITERATIONS = 10000
REPLAY_START_SIZE = 50000
LAST_EPSILON_DECREASE_ITERATION = 1000000
START_EPSILON = 1.0
END_EPSILON = 0.1

# -------- REPORT CONSTS --------
REPORT_ITERATIONS = 10000
SAVE_MODEL_ITERATIONS = 50000

print(device_lib.list_local_devices())

env = EnvWrapper(GAME_ENV_NAME, IMG_SIZE, FRAMES_IN_STATE_COUNT, MEMORY_SIZE)

action_count = env.action_count

if STARTING_MODEL is None:
    model = atari_model.model(action_count, IMG_SIZE, FRAMES_IN_STATE_COUNT)
else:
    model = helpers.load_model(STARTING_MODEL)
    print('Loaded model: ', STARTING_MODEL)

if LEARN:
    frozen_target_model = helpers.copy_model(model)

process = psutil.Process(os.getpid())
print('RAM :', helpers.convert_size(process.memory_info().rss))
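
helpers.copy_model is not shown; a hedged Keras-style sketch of the frozen-target copy it appears to perform, assuming model is a Keras model (as atari_model.model suggests):

from tensorflow.keras.models import clone_model


def copy_model(model):
    """Hypothetical sketch: duplicate architecture and weights so the
    target network can stay frozen between updates."""
    target = clone_model(model)
    target.set_weights(model.get_weights())
    return target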