Example #1
def main(args):
    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6

    def preprocess(obs):
        obs = obs[35:195]  # 160x160x3
        obs = obs[::2, ::2, 0]  # downsample (80x80)
        obs[obs == 144] = 0
        obs[obs == 109] = 0
        obs[obs != 0] = 1

        return obs.astype(np.float32).ravel()  # np.float was removed in NumPy 1.24

    # load agent
    agent = ActorCritic(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # load model or init a new
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())

    # load env
    env = gym.make('Pong-v0')

    # training loop
    for ep in range(args.ep):
        # reset env
        total_rewards = 0
        state = env.reset()

        while True:
            env.render()
            # preprocess
            state = preprocess(state)
            # sample actions
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            # state shift
            state = next_state
            if done:
                break

        print('Ep%s  Reward: %s ' % (ep + 1, total_rewards))
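None of these snippets show the ActorCritic class itself. Below is a minimal sketch of the interface they rely on (construct_model, sess, sample_action, store_rollout, update_model), assuming TensorFlow 1.x and the usual imports (gym, os, numpy, tensorflow). The single hidden layer, the gamma and lr defaults, and the Monte-Carlo return used in update_model are illustrative assumptions, not the original project's code; Example #1 only exercises construct_model, sess, and sample_action, while later examples also call store_rollout and update_model.

import numpy as np
import tensorflow as tf  # TensorFlow 1.x assumed


class ActorCritic:
    def __init__(self, input_dim, hidden_units, action_dim, gamma=0.99, lr=1e-4):
        self.input_dim = input_dim
        self.hidden_units = hidden_units
        self.action_dim = action_dim
        self.gamma = gamma
        self.lr = lr
        self.buffer = []  # (state, action, reward) tuples for the current episode

    def construct_model(self, gpu=-1):
        device = '/gpu:0' if gpu >= 0 else '/cpu:0'
        with tf.device(device):
            self.states = tf.placeholder(tf.float32, [None, self.input_dim])
            self.actions = tf.placeholder(tf.int32, [None])
            self.returns = tf.placeholder(tf.float32, [None])
            hidden = tf.layers.dense(self.states, self.hidden_units, tf.nn.relu)
            self.logits = tf.layers.dense(hidden, self.action_dim)
            self.values = tf.squeeze(tf.layers.dense(hidden, 1), axis=1)
            advantages = self.returns - self.values
            # log pi(a|s) weighted by the advantage (advantage treated as a constant)
            log_probs = -tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.actions, logits=self.logits)
            actor_loss = -tf.reduce_mean(log_probs * tf.stop_gradient(advantages))
            critic_loss = tf.reduce_mean(tf.square(advantages))
            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(
                actor_loss + critic_loss)
        self.sess = tf.Session(
            config=tf.ConfigProto(allow_soft_placement=True))

    def sample_action(self, state):
        # state has shape [1, input_dim]; sample from the softmax policy
        logits = self.sess.run(self.logits, {self.states: state})[0].astype(np.float64)
        probs = np.exp(logits - logits.max())
        probs /= probs.sum()
        return np.random.choice(self.action_dim, p=probs)

    def store_rollout(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward))

    def update_model(self):
        states, actions, rewards = map(np.asarray, zip(*self.buffer))
        # discounted returns over the finished episode
        returns = np.zeros(len(rewards), dtype=np.float32)
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + self.gamma * running
            returns[t] = running
        self.sess.run(self.train_op, {self.states: states,
                                      self.actions: actions,
                                      self.returns: returns})
        self.buffer = []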
Example #3
def main(args):
    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6

    # load agent
    agent = ActorCritic(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # load model or init a new
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())

    # load env
    env = gym.make("Pong-v0")

    # training loop
    for ep in range(args.ep):
        # reset env
        total_rewards = 0
        state = env.reset()

        while True:
            env.render()
            # preprocess
            state = preprocess(state)
            # sample actions
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            # state shift
            state = next_state
            if done:
                break

        print('Ep%s  Reward: %s ' % (ep + 1, total_rewards))
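Example #3 and the examples that follow call preprocess() without defining it in the snippet. A module-level version can be lifted straight from Example #1; the only change here is writing the dtype as np.float32 so it also runs on NumPy 1.24+, where np.float no longer exists.

def preprocess(obs):
    """Crop, downsample, and binarize a 210x160x3 Pong frame into a flat 80*80 vector."""
    obs = obs[35:195]        # crop to the 160x160x3 playing field
    obs = obs[::2, ::2, 0]   # downsample to 80x80, keep one color channel
    obs[obs == 144] = 0      # erase background color 1
    obs[obs == 109] = 0      # erase background color 2
    obs[obs != 0] = 1        # paddles and ball become 1
    return obs.astype(np.float32).ravel()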
Example #4
                   directory=monitor_folder,
                   video_callable=lambda count: count % 1 == 0,
                   resume=True)

    state_bounds = np.array([[0, 0], [15, 1]])
    clip_state = generate_clip_state_function(state_bounds=state_bounds)
    # Q-lambda does not work very well
    # agent = Q_Lambda_LFA(num_actions=2, state_bounds=state_bounds, n_basis=3,
    #                      learning_rate=0.005, discount_factor=1, lambda1=0.95)
    # train_result_file="monitor-2018-08-13-2106-good-fp-05-fn-05/train_results.h5py"
    # "monitor-2018-08-03-2245/train_results.h5py"

    # Actor-critic with eligibility traces now works well
    agent = ActorCritic(num_actions=2,
                        state_bounds=state_bounds,
                        n_basis=5,
                        learning_rate_w=0.002,
                        learning_rate_theta=0.002,
                        discount_factor=1,
                        lambda_w=0.95,
                        lambda_theta=0.95,
                        train_result_file=train_result_file)

    episode_stats = train_agent(env,
                                agent,
                                2,
                                clip_state,
                                enforce_safety=True,
                                min_ttc_for_safety=min_ttc_for_safety)

    # save train results
    train_results_file = os.path.join(monitor_folder, "train_results.h5py")
    save_train_results(train_results_file, agent, episode_stats, driver,
Example #5
def main(args):
    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6
    MAX_EPISODES = 10000

    # load agent
    agent = ActorCritic(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # load model or init a new
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        mean_rewards = 0.0

    # load env
    env = gym.make('Pong-v0')

    # training loop
    for ep in range(MAX_EPISODES):
        step = 0
        total_rewards = 0
        state = preprocess(env.reset())

        while True:
            # sample actions
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)

            next_state = preprocess(next_state)

            step += 1
            total_rewards += reward

            agent.store_rollout(state, action, reward, next_state, done)
            # state shift
            state = next_state

            if done:
                break

        mean_rewards = 0.99 * mean_rewards + 0.01 * total_rewards
        rounds = (21 - np.abs(total_rewards)) + 21
        average_steps = (step + 1) / rounds
        print('Ep%s: %d rounds' % (ep_base + ep + 1, rounds))
        print('Average_steps: %.2f Reward: %s Average_reward: %.4f' %
              (average_steps, total_rewards, mean_rewards))

        # update model per episode
        agent.update_model()

        # model saving
        if ep > 0 and ep % args.save_every == 0:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            save_name = str(round(mean_rewards, 2)) + '_' + str(ep_base + ep + 1)
            saver.save(agent.sess, args.save_path + save_name)
Example #6
    def create_agent(self):
        agent = ActorCritic(2, 128)
        optimizer = Adam(self.learning_rate)

        return agent, optimizer
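Example #6 only shows create_agent. The ActorCritic(2, 128) it builds and the Adam(self.learning_rate) call, with the optimizer constructed from a learning rate alone, look like a tf.keras setup rather than the TF1 session class sketched earlier; it could equally be another framework. A minimal sketch under that assumption (the argument order action_dim, hidden_units is a guess) could be:

import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


class ActorCritic(tf.keras.Model):
    """Two-headed network: policy logits and a state-value estimate."""

    def __init__(self, action_dim, hidden_units):
        super().__init__()
        self.shared = Dense(hidden_units, activation='relu')
        self.policy_logits = Dense(action_dim)
        self.value = Dense(1)

    def call(self, states):
        x = self.shared(states)
        return self.policy_logits(x), self.value(x)


# usage mirroring create_agent above
agent = ActorCritic(2, 128)
optimizer = Adam(0.001)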
Example #7
def main(args):
    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6
    MAX_EPISODES = 10000

    # load agent
    agent = ActorCritic(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # load model or init a new
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        mean_rewards = None

    # load env
    env = gym.make('Pong-v0')

    # training loop
    for ep in range(MAX_EPISODES):
        # reset env
        step = 0
        total_rewards = 0
        state = preprocess(env.reset())

        while True:
            # sample actions
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)

            next_state = preprocess(next_state)

            step += 1
            total_rewards += reward

            agent.store_rollout(state, action, reward, next_state, done)
            # state shift
            state = next_state

            if done:
                break

        if mean_rewards is None:
            mean_rewards = total_rewards
        else:
            mean_rewards = 0.99 * mean_rewards + 0.01 * total_rewards

        rounds = (21 - np.abs(total_rewards)) + 21
        average_steps = (step + 1) / rounds
        print('Ep%s: %d rounds' % (ep_base + ep + 1, rounds))
        print('Average_steps: %.2f Reward: %s Average_reward: %.4f' %
              (average_steps, total_rewards, mean_rewards))

        # update model per episode
        agent.update_model()
        # model saving
        if ep > 0 and ep % args.save_every == 0:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            save_name = str(round(mean_rewards, 2)) + '_' + str(ep_base + ep + 1)
            saver.save(agent.sess, args.save_path + save_name)
Example #8
File: main.py Project: JIElite/A3C
NUM_WORKERS = 8
MAX_STEPS = 30000

global_net = SeparateNetwork(N_FEATURES, N_ACTIONS)
global_net.share_memory()
init_weights(global_net)

optimizer = SharedAdam(global_net.parameters(), lr=LR)
optimizer.share_memory()

# Shared Data
eps_counter = mp.Value('i', 0)

# Hogwild! style update
worker_list = []
for i in range(NUM_WORKERS):
    agent = ActorCritic(
        wid=i,
        shared_model=global_net,
        model=SeparateNetwork(N_FEATURES, N_ACTIONS),
        optimizer=optimizer,
        n_steps=N_STEPS,
    )
    worker = mp.Process(target=run_loop,
                        args=(agent, "CartPole-v0", eps_counter, MAX_STEPS))
    worker.start()
    worker_list.append(worker)

for worker in worker_list:
    worker.join()
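Example #8 relies on module-level names the snippet does not show: mp, N_FEATURES, N_ACTIONS, N_STEPS, LR, plus SeparateNetwork, SharedAdam, init_weights, and run_loop from the JIElite/A3C project. The constants needed for CartPole-v0 are easy to pin down; the N_STEPS and LR values below are placeholders, not the project's settings, and mp is assumed to be torch.multiprocessing since the networks and optimizer are moved to shared memory.

import torch.multiprocessing as mp   # provides mp.Value and mp.Process as used above

N_FEATURES = 4    # CartPole-v0 observation dimension
N_ACTIONS = 2     # CartPole-v0 action count
N_STEPS = 20      # n-step rollout length (placeholder)
LR = 1e-4         # learning rate (placeholder)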
Example #9
def main(args):
    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6
    MAX_EPISODES = 10000

    # load agent
    agent = ActorCritic(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # load model or init a new
    saver = tf.train.Saver(max_to_keep=1)
    #if args.model_path is not None:
    if True:
        # reuse saved model
        saver.restore(
            agent.sess,
            "/home/sugon/Peixian/atari-pong/reinforce_py/algorithms/Actor-Critic/model/summary/1.13_9901"
        )
        #ep_base = int(args.save_path.split('_')[-1])
        #mean_rewards = float(args.save_path.split('/')[-1].split('_')[0])
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        mean_rewards = None

    summary_writer = tf.summary.FileWriter("./summary/", agent.sess.graph)
    summary_placeholders, update_ops, summary_op = setup_summary()

    # load env
    env = gym.make('Pong-v0')

    win = 0
    f = open("./ac_score.txt", "w")
    # training loop
    for ep in range(100):
        # reset env
        step = 0
        total_rewards = 0
        state = preprocess(env.reset())

        while True:
            # sample actions
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)

            next_state = preprocess(next_state)

            step += 1
            total_rewards += reward

            agent.store_rollout(state, action, reward, next_state, done)
            # state shift
            state = next_state

            if done:
                print(win, ":", str(total_rewards) + " " + str(step))
                f.write("score:" + str(total_rewards) + " " + str(step) + "\n")
                if total_rewards > 0:
                    win += 1
                break

        # if mean_rewards is None:
        #     mean_rewards = total_rewards
        # else:
        #     mean_rewards = 0.99 * mean_rewards + 0.01 * total_rewards

        # stats = [total_rewards, step, mean_rewards]
        # for i in range(len(stats)):
        #     agent.sess.run(update_ops[i], feed_dict={
        #         summary_placeholders[i]: float(stats[i])
        #     })
        # summary_str = agent.sess.run(summary_op)
        # summary_writer.add_summary(summary_str, ep + 1)

        # rounds = (21 - np.abs(total_rewards)) + 21
        # average_steps = (step + 1) / rounds
        # print('Ep%s: %d rounds' % (ep_base + ep + 1, rounds))
        # print('Average_steps: %.2f Reward: %s Average_reward: %.4f' %
        #       (average_steps, total_rewards, mean_rewards))

        # update model per episode
        # agent.update_model()

        # model saving
        # if ep > 0 and ep % args.save_every == 0:
        #     if not os.path.isdir(args.save_path):
        #         os.makedirs(args.save_path)
        #     save_name = str(round(mean_rewards, 2)) + '_' + str(ep_base + ep + 1)
        #     saver.save(agent.sess, args.save_path + save_name)
    print("win:", win)
    f.close()
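setup_summary() in Example #9 is also not shown. A hypothetical TF1-style helper with the same (summary_placeholders, update_ops, summary_op) return signature, tracking the three quantities from the commented-out stats = [total_rewards, step, mean_rewards] line, could look like this; the scalar names are placeholders.

def setup_summary():
    episode_reward = tf.Variable(0.)
    episode_steps = tf.Variable(0.)
    mean_reward = tf.Variable(0.)
    tf.summary.scalar('Reward/Episode', episode_reward)
    tf.summary.scalar('Steps/Episode', episode_steps)
    tf.summary.scalar('Mean_Reward', mean_reward)
    summary_vars = [episode_reward, episode_steps, mean_reward]
    summary_placeholders = [tf.placeholder(tf.float32) for _ in summary_vars]
    # assign ops let the training loop push Python floats into the summary variables
    update_ops = [v.assign(p) for v, p in zip(summary_vars, summary_placeholders)]
    summary_op = tf.summary.merge_all()
    return summary_placeholders, update_ops, summary_op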