import gym
import numpy as np
import tensorflow as tf  # TF 1.x API (tf.train.Saver, tf.Session)

# `ActorCritic` is assumed to be importable from the project's agent module (not shown here)


def main(args):
    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6

    def preprocess(obs):
        obs = obs[35:195]        # crop to the 160x160x3 playing field
        obs = obs[::2, ::2, 0]   # downsample by 2, keep one channel (80x80)
        obs[obs == 144] = 0      # erase background color 1
        obs[obs == 109] = 0      # erase background color 2
        obs[obs != 0] = 1        # paddles and ball become 1
        return obs.astype(np.float32).ravel()

    # load agent
    agent = ActorCritic(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # restore a saved model or initialize a new one
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())

    # load env
    env = gym.make('Pong-v0')

    # episode loop
    for ep in range(args.ep):
        # reset env
        total_rewards = 0
        state = env.reset()

        while True:
            env.render()
            # preprocess the raw frame
            state = preprocess(state)
            # sample an action from the policy
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            # state shift
            state = next_state
            if done:
                break

        print('Ep%s Reward: %s' % (ep + 1, total_rewards))
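These `main(args)` scripts only read a handful of attributes from `args` (`model_path`, `ep`, `gpu`, plus `save_path` and `save_every` in the training variants). The original entry point is not shown; a hypothetical argparse wrapper with those flags might look like the following, where the defaults and help strings are assumptions.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', default=None,
                        help='checkpoint to restore (a new model is built if omitted)')
    parser.add_argument('--save_path', default='./model/',
                        help='directory prefix for saved checkpoints')
    parser.add_argument('--save_every', default=100, type=int,
                        help='save a checkpoint every N episodes')
    parser.add_argument('--ep', default=100, type=int,
                        help='number of episodes to run')
    parser.add_argument('--gpu', default=-1, type=int,
                        help='GPU id passed to construct_model (-1 for CPU)')
    main(parser.parse_args())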
def main(args):
    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6

    # load agent
    agent = ActorCritic(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # restore a saved model or initialize a new one
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())

    # load env
    env = gym.make("Pong-v0")

    # episode loop
    for ep in range(args.ep):
        # reset env
        total_rewards = 0
        state = env.reset()

        while True:
            env.render()
            # preprocess (assumed to be defined at module level; see the sketch below)
            state = preprocess(state)
            # sample an action from the policy
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            # state shift
            state = next_state
            if done:
                break

        print('Ep%s Reward: %s' % (ep + 1, total_rewards))
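This variant, and the training and evaluation scripts further down, call `preprocess` without defining it. A minimal module-level sketch, reusing the same crop/downsample/binarize steps as the first snippet, could look like this:

import numpy as np

def preprocess(obs):
    """Turn a raw 210x160x3 Pong frame into a flat 6400-dim binary vector."""
    obs = obs[35:195]        # crop to the 160x160 playing field
    obs = obs[::2, ::2, 0]   # downsample by 2, keep one color channel -> 80x80
    obs[obs == 144] = 0      # erase background color 1
    obs[obs == 109] = 0      # erase background color 2
    obs[obs != 0] = 1        # paddles and ball become 1
    return obs.astype(np.float32).ravel()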
    directory=monitor_folder,
    video_callable=lambda count: count % 1 == 0,
    resume=True)

state_bounds = np.array([[0, 0], [15, 1]])
clip_state = generate_clip_state_function(state_bounds=state_bounds)

# Q(lambda) does not work very well:
# agent = Q_Lambda_LFA(num_actions=2, state_bounds=state_bounds, n_basis=3,
#                      learning_rate=0.005, discount_factor=1, lambda1=0.95)
#                      train_result_file="monitor-2018-08-13-2106-good-fp-05-fn-05/train_results.h5py")
#                      "monitor-2018-08-03-2245/train_results.h5py"

# Actor-critic with eligibility traces now works well
agent = ActorCritic(num_actions=2, state_bounds=state_bounds, n_basis=5,
                    learning_rate_w=0.002, learning_rate_theta=0.002,
                    discount_factor=1, lambda_w=0.95, lambda_theta=0.95,
                    train_result_file=train_result_file)

episode_stats = train_agent(env, agent, 2, clip_state, enforce_safety=True,
                            min_ttc_for_safety=min_ttc_for_safety)

# save train results
train_results_file = os.path.join(monitor_folder, "train_results.h5py")
save_train_results(train_results_file, agent, episode_stats, driver,
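This excerpt (which begins and ends mid-call) relies on a `generate_clip_state_function` helper that is not shown. Given how `state_bounds` is laid out above (first row lower bounds, second row upper bounds), a plausible sketch is the following; the exact behavior of the project's helper is an assumption.

import numpy as np

def generate_clip_state_function(state_bounds):
    """Return a function that clips each state dimension into its bounds.

    Assumes state_bounds has shape (2, n_dims): row 0 = lower bounds,
    row 1 = upper bounds, as in np.array([[0, 0], [15, 1]]).
    """
    lows, highs = np.asarray(state_bounds, dtype=float)

    def clip_state(state):
        return np.clip(np.asarray(state, dtype=float), lows, highs)

    return clip_state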
def main(args):
    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6
    MAX_EPISODES = 10000

    # load agent
    agent = ActorCritic(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # restore a saved model or initialize a new one
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model; the checkpoint name encodes mean reward and episode
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        mean_rewards = 0.0

    # load env
    env = gym.make('Pong-v0')

    # training loop
    for ep in range(MAX_EPISODES):
        # reset env
        step = 0
        total_rewards = 0
        state = preprocess(env.reset())

        while True:
            # sample an action from the policy
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)
            next_state = preprocess(next_state)
            step += 1
            total_rewards += reward
            agent.store_rollout(state, action, reward, next_state, done)
            # state shift
            state = next_state
            if done:
                break

        # exponential moving average of the episode reward
        mean_rewards = 0.99 * mean_rewards + 0.01 * total_rewards
        # total points played: the winner scores 21, the loser 21 - |score diff|
        rounds = (21 - np.abs(total_rewards)) + 21
        average_steps = (step + 1) / rounds
        print('Ep%s: %d rounds' % (ep_base + ep + 1, rounds))
        print('Average_steps: %.2f Reward: %s Average_reward: %.4f' %
              (average_steps, total_rewards, mean_rewards))

        # update the model once per episode
        agent.update_model()

        # model saving
        if ep > 0 and ep % args.save_every == 0:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            save_name = str(round(mean_rewards, 2)) + '_' + str(ep_base + ep + 1)
            saver.save(agent.sess, args.save_path + save_name)
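These Pong scripts assume an `ActorCritic` class exposing `construct_model(gpu)`, `sess`, `sample_action(state)`, `store_rollout(...)`, and `update_model()`. The original implementation is not shown here; the following is only a rough TF 1.x sketch of that interface, with a shared hidden layer, softmax policy and value heads, and Monte-Carlo returns as critic targets. The learning rate, loss weighting, and return handling are assumptions, not the project's actual choices.

import numpy as np
import tensorflow as tf  # TF 1.x


class ActorCritic(object):
    """Rough sketch of the interface used by the scripts above."""

    def __init__(self, input_dim, hidden_units, action_dim, lr=1e-3, gamma=0.99):
        self.input_dim = input_dim
        self.hidden_units = hidden_units
        self.action_dim = action_dim
        self.lr = lr
        self.gamma = gamma
        self.rollout = []  # (state, action, reward) tuples of the current episode

    def construct_model(self, gpu=-1):
        device = '/gpu:0' if gpu >= 0 else '/cpu:0'
        with tf.device(device):
            self.states = tf.placeholder(tf.float32, [None, self.input_dim])
            self.actions = tf.placeholder(tf.int32, [None])
            self.returns = tf.placeholder(tf.float32, [None])

            hidden = tf.layers.dense(self.states, self.hidden_units, tf.nn.relu)
            self.logits = tf.layers.dense(hidden, self.action_dim)
            self.probs = tf.nn.softmax(self.logits)                      # actor head
            self.value = tf.squeeze(tf.layers.dense(hidden, 1), axis=1)  # critic head

            advantage = self.returns - self.value
            log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.actions, logits=self.logits)
            actor_loss = -tf.reduce_mean(log_prob * tf.stop_gradient(advantage))
            critic_loss = tf.reduce_mean(tf.square(advantage))
            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(
                actor_loss + 0.5 * critic_loss)

        self.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

    def sample_action(self, state):
        probs = self.sess.run(self.probs, {self.states: state})[0]
        probs = probs / probs.sum()  # renormalize against float32 rounding
        return np.random.choice(self.action_dim, p=probs)

    def store_rollout(self, state, action, reward, next_state, done):
        self.rollout.append((state, action, reward))

    def update_model(self):
        states, actions, rewards = map(np.array, zip(*self.rollout))
        # discounted returns; reset at each scored point (Pong-specific)
        returns = np.zeros(len(rewards), dtype=np.float32)
        running = 0.0
        for t in reversed(range(len(rewards))):
            if rewards[t] != 0:
                running = 0.0
            running = rewards[t] + self.gamma * running
            returns[t] = running
        self.sess.run(self.train_op, {self.states: states,
                                      self.actions: actions,
                                      self.returns: returns})
        self.rollout = []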
def create_agent(self):
    agent = ActorCritic(2, 128)
    optimizer = Adam(self.learning_rate)
    return agent, optimizer
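Here `Adam(self.learning_rate)` taking only a learning rate as its first argument suggests `tf.keras.optimizers.Adam`, and `ActorCritic(2, 128)` plausibly means two actions and 128 hidden units, though both readings are assumptions. Under those assumptions, a compatible two-headed Keras model could be sketched as:

import tensorflow as tf
from tensorflow.keras import layers


class ActorCritic(tf.keras.Model):
    """Shared-body actor-critic: one hidden layer, a policy head, a value head."""

    def __init__(self, num_actions, hidden_units):
        super().__init__()
        self.hidden = layers.Dense(hidden_units, activation='relu')
        self.policy_logits = layers.Dense(num_actions)  # actor head (unnormalized)
        self.value = layers.Dense(1)                    # critic head

    def call(self, inputs):
        x = self.hidden(inputs)
        return self.policy_logits(x), self.value(x)

A caller would then do `agent, optimizer = self.create_agent()` and get `logits, value = agent(state_batch)` on each forward pass.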
def main(args):
    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6
    MAX_EPISODES = 10000

    # load agent
    agent = ActorCritic(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # restore a saved model or initialize a new one
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model; the checkpoint name encodes mean reward and episode
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        mean_rewards = None

    # load env
    env = gym.make('Pong-v0')

    # training loop
    for ep in range(MAX_EPISODES):
        # reset env
        step = 0
        total_rewards = 0
        state = preprocess(env.reset())

        while True:
            # sample an action from the policy
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)
            next_state = preprocess(next_state)
            step += 1
            total_rewards += reward
            agent.store_rollout(state, action, reward, next_state, done)
            # state shift
            state = next_state
            if done:
                break

        # exponential moving average of the episode reward,
        # seeded with the first episode's reward
        if mean_rewards is None:
            mean_rewards = total_rewards
        else:
            mean_rewards = 0.99 * mean_rewards + 0.01 * total_rewards

        # total points played: the winner scores 21, the loser 21 - |score diff|
        rounds = (21 - np.abs(total_rewards)) + 21
        average_steps = (step + 1) / rounds
        print('Ep%s: %d rounds' % (ep_base + ep + 1, rounds))
        print('Average_steps: %.2f Reward: %s Average_reward: %.4f' %
              (average_steps, total_rewards, mean_rewards))

        # update the model once per episode
        agent.update_model()

        # model saving
        if ep > 0 and ep % args.save_every == 0:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            save_name = str(round(mean_rewards, 2)) + '_' + str(ep_base + ep + 1)
            saver.save(agent.sess, args.save_path + save_name)
NUM_WORKERS = 8
MAX_STEPS = 30000

global_net = SeparateNetwork(N_FEATURES, N_ACTIONS)
global_net.share_memory()
init_weights(global_net)

optimizer = SharedAdam(global_net.parameters(), lr=LR)
optimizer.share_memory()

# shared data
eps_counter = mp.Value('i', 0)

# Hogwild!-style update: each worker holds a local copy of the network and
# applies its gradients to the shared global network asynchronously
worker_list = []
for i in range(NUM_WORKERS):
    agent = ActorCritic(
        wid=i,
        shared_model=global_net,
        model=SeparateNetwork(N_FEATURES, N_ACTIONS),
        optimizer=optimizer,
        n_steps=N_STEPS,
    )
    worker = mp.Process(target=run_loop,
                        args=(agent, "CartPole-v0", eps_counter, MAX_STEPS))
    worker.start()
    worker_list.append(worker)

for worker in worker_list:
    worker.join()
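The `SharedAdam` used above keeps its optimizer state in shared memory so the Hogwild!-style workers can update it concurrently. The project's own class is not shown; below is a sketch in the style of the widely used pytorch-a3c SharedAdam, with the per-parameter state created eagerly and an explicit `share_memory()` method. The hyperparameter defaults are assumptions.

import torch


class SharedAdam(torch.optim.Adam):
    """Adam whose per-parameter state lives in shared memory."""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)
        # create the state up front so it can be shared before workers fork
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        loss = closure() if closure is not None else None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                # standard Adam moment updates, applied to the shared state tensors
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_corr1 = 1 - beta1 ** state['step'].item()
                bias_corr2 = 1 - beta2 ** state['step'].item()
                step_size = group['lr'] * (bias_corr2 ** 0.5) / bias_corr1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss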
def main(args):
    INPUT_DIM = 80 * 80
    HIDDEN_UNITS = 200
    ACTION_DIM = 6
    MAX_EPISODES = 10000

    # load agent
    agent = ActorCritic(INPUT_DIM, HIDDEN_UNITS, ACTION_DIM)
    agent.construct_model(args.gpu)

    # load model or init a new one
    saver = tf.train.Saver(max_to_keep=1)
    # if args.model_path is not None:
    if True:
        # reuse saved model
        saver.restore(
            agent.sess,
            "/home/sugon/Peixian/atari-pong/reinforce_py/algorithms/Actor-Critic/model/summary/1.13_9901"
        )
        # ep_base = int(args.save_path.split('_')[-1])
        # mean_rewards = float(args.save_path.split('/')[-1].split('_')[0])
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        mean_rewards = None

    summary_writer = tf.summary.FileWriter("./summary/", agent.sess.graph)
    summary_placeholders, update_ops, summary_op = setup_summary()

    # load env
    env = gym.make('Pong-v0')
    win = 0
    f = open("./ac_score.txt", "w")

    # evaluation loop (the training-related code below is commented out)
    for ep in range(100):
        # reset env
        step = 0
        total_rewards = 0
        state = preprocess(env.reset())

        while True:
            # sample an action from the policy
            action = agent.sample_action(state[np.newaxis, :])
            # act!
            next_state, reward, done, _ = env.step(action)
            next_state = preprocess(next_state)
            step += 1
            total_rewards += reward
            agent.store_rollout(state, action, reward, next_state, done)
            # state shift
            state = next_state
            if done:
                print(win, ":", str(total_rewards) + " " + str(step))
                f.write("score:" + str(total_rewards) + " " + str(step) + "\n")
                if total_rewards > 0:
                    win += 1
                break

        # if mean_rewards is None:
        #     mean_rewards = total_rewards
        # else:
        #     mean_rewards = 0.99 * mean_rewards + 0.01 * total_rewards

        # stats = [total_rewards, step, mean_rewards]
        # for i in range(len(stats)):
        #     agent.sess.run(update_ops[i], feed_dict={
        #         summary_placeholders[i]: float(stats[i])
        #     })
        # summary_str = agent.sess.run(summary_op)
        # summary_writer.add_summary(summary_str, ep + 1)

        # rounds = (21 - np.abs(total_rewards)) + 21
        # average_steps = (step + 1) / rounds
        # print('Ep%s: %d rounds' % (ep_base + ep + 1, rounds))
        # print('Average_steps: %.2f Reward: %s Average_reward: %.4f' %
        #       (average_steps, total_rewards, mean_rewards))

        # update model per episode
        # agent.update_model()

        # model saving
        # if ep > 0 and ep % args.save_every == 0:
        #     if not os.path.isdir(args.save_path):
        #         os.makedirs(args.save_path)
        #     save_name = str(round(mean_rewards, 2)) + '_' + str(ep_base + ep + 1)
        # saver.save(agent.sess, args.save_path + save_name)

    print("win:", win)
    f.close()
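`setup_summary()` is not defined in this excerpt. Judging from the commented-out usage (three scalar stats fed through placeholders into update ops, then merged), it probably follows the common TF 1.x pattern sketched below; the variable names and summary tags here are guesses.

import tensorflow as tf  # TF 1.x


def setup_summary():
    """Scalar summaries updated via placeholder-fed assign ops."""
    total_reward = tf.Variable(0.)
    episode_steps = tf.Variable(0.)
    average_reward = tf.Variable(0.)

    tf.summary.scalar('Total_Reward/Episode', total_reward)
    tf.summary.scalar('Steps/Episode', episode_steps)
    tf.summary.scalar('Average_Reward', average_reward)

    summary_vars = [total_reward, episode_steps, average_reward]
    summary_placeholders = [tf.placeholder(tf.float32) for _ in summary_vars]
    update_ops = [var.assign(ph)
                  for var, ph in zip(summary_vars, summary_placeholders)]
    summary_op = tf.summary.merge_all()
    return summary_placeholders, update_ops, summary_op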