Example #1
    def init_agent_env(self, proc_id, role, role_id):
        env = cartPole.CartPoleEnv()
        # env = flappyBird.FlappyBirdEnv()
        NUM_STATE_FEATURES = env.get_num_state_features()
        NUM_ACTIONS = env.get_num_actions()
        PRINT_EVERY_EPISODE = 20
        LEARNING_RATE = 0.003
        REWARD_DISCOUNT = 0.99
        COEF_VALUE = 1
        COEF_ENTROPY = 0
        agent = A2C.Agent((NUM_STATE_FEATURES, ), NUM_ACTIONS, REWARD_DISCOUNT, LEARNING_RATE, COEF_VALUE, COEF_ENTROPY)

        return agent, env
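This factory method builds one independent agent/environment pair per call. Below is a minimal usage sketch: the host class is not part of the excerpt, so a stand-in ParallelTrainer containing only this method is used, the hyperparameter values are copied from the constants above, and the same cartPole and A2C modules are assumed to be importable.

import cartPole
import A2C

class ParallelTrainer:
    # Stand-in host class; in the original project init_agent_env belongs to a
    # larger parallel-training class that the excerpt does not show.
    def init_agent_env(self, proc_id, role, role_id):
        env = cartPole.CartPoleEnv()
        agent = A2C.Agent((env.get_num_state_features(), ),
                          env.get_num_actions(), 0.99, 0.003, 1, 0)
        return agent, env

# One independent (agent, env) pair per worker slot.
trainer = ParallelTrainer()
pairs = [trainer.init_agent_env(i, 'worker', i) for i in range(4)]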
Example #2
def test_ddpg(args):
    if not args.actor_model:
        print('ERROR: Need trained model folder.')
        return

    env = gym.make(args.env)
    env.reset(args)
    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape
    action_lim = env.action_space.high
    other_cars = args.cars > 1

    agent = A2C(state_dim,
                action_dim,
                action_lim,
                update_type=args.update,
                batch_size=args.batch_size,
                other_cars=other_cars,
                ego_dim=args.ego_dim)
    agent.load_actor(args.actor_model)

    evaluate(agent, env, args, None, render_episode=True, log=False)
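test_ddpg (and train_ddpg in the next example) reads its configuration from an argparse namespace that is not part of the excerpt. The sketch below wires up the flags this function touches; the attribute names come from the code above, while the types, defaults, and help strings are assumptions.

import argparse

# Hypothetical CLI wiring for test_ddpg; only the attribute names (env,
# actor_model, cars, update, batch_size, ego_dim) are taken from the snippet.
parser = argparse.ArgumentParser()
parser.add_argument('--env', required=True, help='gym environment id')
parser.add_argument('--actor_model', default='',
                    help='folder with the trained actor weights')
parser.add_argument('--cars', type=int, default=1,
                    help='number of cars; values > 1 enable other_cars')
parser.add_argument('--update', default='regular',
                    help='forwarded to the agent as update_type')
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--ego_dim', type=int, default=4)

args = parser.parse_args()
test_ddpg(args)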
Example #3
def train_ddpg(args):
    timestr = time.strftime("%Y%m%d-%H%M%S")
    change = "-change" if args.change else "-follow"
    savedir = (args.logdir + 'julia-sim/' + args.env.lower() + change + '/' +
               timestr + '/')
    if not os.path.exists(savedir):
        os.makedirs(savedir)

    env = gym.make(args.env)
    env.reset(args)
    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape
    action_lim = env.action_space.high
    other_cars = args.cars > 1

    agent = A2C(state_dim,
                action_dim,
                action_lim,
                update_type=args.update,
                batch_size=args.batch_size,
                other_cars=other_cars,
                ego_dim=args.ego_dim)
    ep_start = 1
    if args.resume_train:
        if not args.actor_model:
            print('ERROR: Need trained model folder.')
            return
        agent.load_all(args.actor_model)
        # Recover the episode number encoded in the checkpoint filename.
        ep_start = int(args.actor_model.split('_')[-1].split('.')[0][2:])
    agent.train()

    if args.seed:
        print("Random Seed: {}".format(args.random_seed))
        env.seed(args.random_seed)
        torch.manual_seed(args.random_seed)
        np.random.seed(args.random_seed)

    avg_reward = 0.0
    logfile = open(savedir + 'log.txt', 'w+')
    for episode in range(ep_start, args.episodes + 1):
        ep_reward = 0.0
        state = env.reset()
        agent.reset_noise()
        for t in range(1, args.max_steps + 1):
            action = agent.select_action(np.array(state))
            action = np.clip(action, env.action_space.low,
                             env.action_space.high)

            next_state, reward, terminal, debug = env.step(action)
            if args.debug:
                s_f, t_f, phi_f, v_ego = debug[:4]
                print("(s, t, phi, v) = (%3.2f, %3.2f, %3.2f, %3.2f)" %
                      (s_f, t_f, phi_f, v_ego))
                logfile.write(
                    "(Episode, Step): (%d, %d) | (s, t, phi, v) = (%3.2f, %3.2f, %3.2f, %3.2f)\n"
                    % (episode, t, s_f, t_f, phi_f, v_ego))
                logfile.flush()

            agent.append(state, action, reward, next_state, float(terminal))
            state = next_state
            ep_reward += reward
            avg_reward += reward

            if args.update_always:
                # Seed both accumulators with the same random "junk" offset so
                # we can tell below whether any update batch actually ran.
                junk = np.random.normal(np.random.randint(-10, 10),
                                        np.random.random() + 5.0)
                tot_actor_loss = junk
                tot_critic_loss = junk
                for b in range(args.update_batches):
                    actor_loss, critic_loss = agent.update(target_noise=False)
                    if (actor_loss is not None) and (critic_loss is not None):
                        tot_actor_loss += actor_loss
                        tot_critic_loss += critic_loss
                if (tot_actor_loss != junk) and (tot_critic_loss != junk):
                    tot_actor_loss -= junk
                    tot_critic_loss -= junk
                    tot_actor_loss /= args.update_batches
                    tot_critic_loss /= args.update_batches
                    logfile.write('LOSS: %d,%f,%f\n' %
                                  (episode, tot_actor_loss, tot_critic_loss))
                    logfile.flush()

            if terminal or t == args.max_steps:
                # Same "junk" trick as above: only log averaged losses if at
                # least one of the final update batches returned a loss.
                junk = np.random.normal(np.random.randint(-10, 10),
                                        np.random.random() + 5.0)
                tot_actor_loss = junk
                tot_critic_loss = junk
                for b in range(args.update_batches):
                    actor_loss, critic_loss = agent.update(target_noise=False)
                    if (actor_loss is not None) and (critic_loss is not None):
                        tot_actor_loss += actor_loss
                        tot_critic_loss += critic_loss
                if (tot_actor_loss != junk) and (tot_critic_loss != junk):
                    tot_actor_loss -= junk
                    tot_critic_loss -= junk
                    tot_actor_loss /= args.update_batches
                    tot_critic_loss /= args.update_batches
                    logfile.write('LOSS: %d,%f,%f\n' %
                                  (episode, tot_actor_loss, tot_critic_loss))
                    logfile.flush()
                break

        logfile.write('%d,%f\n' % (episode, ep_reward))
        logfile.flush()

        if (episode % args.save_every) == 0:
            agent.save(savedir, episode, previous=episode - args.save_every)

        if (episode % args.eval_every) == 0:
            avg_reward /= args.eval_every
            ep_start = episode - args.eval_every + 1
            print('Episodes %d - %d | Average reward = %f' %
                  (ep_start, episode, avg_reward))
            avg_reward = 0.0

            # print("Evaluating!")
            evaluate(agent, env, args, logfile)
            agent.train()

    print("Testing!")
    evaluate(agent, env, args, logfile)
    logfile.close()
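train_ddpg reads a larger set of attributes from the same namespace. Extending the parser sketched after Example #2 (flag names come from the code above; types, defaults, and help strings are again assumptions):

# Additional hypothetical flags used by train_ddpg.
parser.add_argument('--logdir', default='logs/',
                    help='base directory; savedir is created under julia-sim/')
parser.add_argument('--change', action='store_true',
                    help='lane-change run ("-change") instead of "-follow"')
parser.add_argument('--resume_train', action='store_true')
parser.add_argument('--seed', action='store_true',
                    help='if set, seed env/torch/numpy with --random_seed')
parser.add_argument('--random_seed', type=int, default=0)
parser.add_argument('--episodes', type=int, default=1000)
parser.add_argument('--max_steps', type=int, default=200)
parser.add_argument('--debug', action='store_true')
parser.add_argument('--update_always', action='store_true',
                    help='run update batches after every step, not only at episode end')
parser.add_argument('--update_batches', type=int, default=10)
parser.add_argument('--save_every', type=int, default=100)
parser.add_argument('--eval_every', type=int, default=20)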
Example #4
env = cartPole.CartPoleEnv()
NUM_STATE_FEATURES = env.get_num_state_features()
NUM_ACTIONS = env.get_num_actions()
EPISODE_NUM = 2000
PRINT_EVERY_EPISODE = 20
LEARNING_RATE = 0.003
REWARD_DISCOUNT = 0.99

exp_stg = EPSG.EpsilonGreedy(0.2, NUM_ACTIONS)
# agent = Agent((NUM_STATE_FEATURES, ), NUM_ACTIONS, REWARD_DISCOUNT, LEARNING_RATE, exp_stg)

agent_params = ((NUM_STATE_FEATURES, ), NUM_ACTIONS, REWARD_DISCOUNT,
                LEARNING_RATE, exp_stg)

init_local_agent_funct = lambda: A2C.Agent((NUM_STATE_FEATURES, ), NUM_ACTIONS,
                                           REWARD_DISCOUNT, LEARNING_RATE,
                                           exp_stg)
init_local_env_funct = lambda: cartPole.CartPoleEnv()

master = Parallel.Master(EPISODE_NUM, init_local_agent_funct,
                         init_local_env_funct, -1)
master.start_workers()

state = env.reset()
accum_reward = 0
accum_loss = 0

# tqdm progress bar
bar = []
# Reward & LossHistory
r_his = []
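The excerpt is cut off before the training loop that fills r_his. Once it holds one accumulated reward per finished episode, a quick plot gives the learning curve; this sketch assumes matplotlib is available, which the original snippet does not import.

import matplotlib.pyplot as plt

# Visualise the reward history collected during training; assumes r_his holds
# one accumulated reward per finished episode.
plt.plot(r_his, label='episode reward')
plt.xlabel('episode')
plt.ylabel('reward')
plt.legend()
plt.savefig('reward_history.png')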
Example #5
    n_step = 10
    use_cuda = False
    is_render = True
    save_model = False
    ###########################################

    buffer_state = [[] for _ in range(num_worker)]
    buffer_action = [[] for _ in range(num_worker)]
    buffer_reward = [[] for _ in range(num_worker)]
    buffer_next_state = [[] for _ in range(num_worker)]

    model = A2C(s_dim,
                a_dim,
                num_worker,
                gamma=0.95,
                epsilon_start=1.0,
                epsilon_end=0.1,
                epsilon_length=100000,
                use_cuda=use_cuda,
                n_step=n_step,
                lr=0.001)
    model.load('0000800.pt')

    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        worker = MarioEnv(env_id, idx, child_conn, queue, n_step, is_render)
        worker.start()
        workers.append(worker)
        parent_conns.append(parent_conn)

    while model.g_episode < max_episode:
Example #6
# Run headless: SDL's dummy video driver keeps pygame from opening a window
os.environ['SDL_VIDEODRIVER'] = 'dummy'

# Test GPU and show the available logical & physical GPUs
Util.test_gpu()

env = FlappyBird.FlappyBirdEnv()
NUM_STATE_FEATURES = env.get_num_state_features()
NUM_ACTIONS = env.get_num_actions()
EPISODE_NUM = 10000
PRINT_EVERY_EPISODE = 50
LEARNING_RATE = 0.003
REWARD_DISCOUNT = 0.99

exp_stg = EPSG.EpsilonGreedy(0.2, NUM_ACTIONS)
agent = A2C.Agent((NUM_STATE_FEATURES, ), NUM_ACTIONS, REWARD_DISCOUNT,
                  LEARNING_RATE, exp_stg)

state = env.reset()
accum_reward = 0

# tqdm progress bar
bar = []
# Reward & LossHistory
r_his = []
avg_r_his = [0]
loss_his = []
episode_reward = 0

print("Episode 1")
for episode in range(1, EPISODE_NUM + 1):
    if episode % PRINT_EVERY_EPISODE == 1:
Example #7
# Test GPU and show the available logical & physical GPUs
Util.test_gpu()

env = cartPole.CartPoleEnv()
NUM_STATE_FEATURES = env.get_num_state_features()
NUM_ACTIONS = env.get_num_actions()
EPISODE_NUM = 200
PRINT_EVERY_EPISODE = 20
LEARNING_RATE = 0.03
REWARD_DISCOUNT = 0.99
COEF_VALUE = 1
COEF_ENTROPY = 0

exp_stg = EPSG.EpsilonGreedy(0.2, NUM_ACTIONS)
agent = A2C.Agent((NUM_STATE_FEATURES, ), NUM_ACTIONS, REWARD_DISCOUNT,
                  LEARNING_RATE, COEF_VALUE, COEF_ENTROPY)

# agent_params = ((NUM_STATE_FEATURES, ), NUM_ACTIONS, REWARD_DISCOUNT, LEARNING_RATE, exp_stg)
# init_local_agent_funct = lambda: Agent((NUM_STATE_FEATURES, ), NUM_ACTIONS, REWARD_DISCOUNT, LEARNING_RATE, exp_stg)
# init_local_env_funct = lambda: CartPoleEnv()

# master = Master(EPISODE_NUM, init_local_agent_funct, init_local_env_funct, 2)
# master.start_workers()

state = env.reset()
accum_reward = 0
accum_loss = 0

# tqdm progress bar
bar = []
# Reward & LossHistory
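Several of the examples above construct EPSG.EpsilonGreedy(0.2, NUM_ACTIONS) as the exploration strategy, but its implementation is not shown. Below is a generic sketch of what an epsilon-greedy selector of that shape typically looks like; the class and method names are illustrative only, not the EPSG API.

import numpy as np

class EpsilonGreedySketch:
    """Generic epsilon-greedy selector: with probability epsilon pick a
    uniformly random action, otherwise pick the greedy one."""

    def __init__(self, epsilon, num_actions):
        self.epsilon = epsilon
        self.num_actions = num_actions

    def select(self, action_values):
        if np.random.random() < self.epsilon:
            return np.random.randint(self.num_actions)
        return int(np.argmax(action_values))

# 20% random exploration, as with EpsilonGreedy(0.2, NUM_ACTIONS) above
# (two actions used here purely for illustration).
strategy = EpsilonGreedySketch(0.2, 2)
action = strategy.select([0.1, 0.4])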