def main():
    import gym
    import numpy as np
    from gym.wrappers.monitor import Monitor
    import quanser_robots

    def evaluate(env, policy, num_evals=25):
        ep_returns = []
        for eval_num in range(num_evals):
            episode_return = 0
            done = False
            obs = env.reset()
            while not done:
                action = policy(obs)
                obs, reward, done, info = env.step(action)
                episode_return += reward
            ep_returns.append(episode_return)
        return ep_returns

    def render(env, policy):
        obs = env.reset()
        done = False
        while not done:
            env.render()
            act = policy(obs)
            obs, _, done, _ = env.step(act)

    def check(env, policy):
        render(env, policy)
        ret_all = evaluate(env, policy)
        print(np.mean(ret_all), np.std(ret_all))
        env.close()

    # DQN I: Check learned policy
    env = Monitor(gym.make('CartpoleSwingShort-v0'), 'dqn_eval')
    policy = load_dqn_policy()
    check(env, policy)

    # DQN II: Check learning procedure
    env = Monitor(gym.make('CartpoleSwingShort-v0'), 'dqn_train', video_callable=False)
    policy = train_dqn_policy(env)
    check(env, policy)

    # LSPI I: Check learned policy
    env = Monitor(gym.make('CartpoleStabShort-v0'), 'lspi_eval')
    policy = load_lspi_policy()
    check(env, policy)

    # LSPI II: Check learning procedure
    env = Monitor(gym.make('CartpoleStabShort-v0'), 'lspi_train', video_callable=False)
    policy = train_lspi_policy(env)
    check(env, policy)

def play_poison(self, n_step=10000, n_episode=1000, test_ep=None, render=False):
    print('play poison: ', self.poison)
    print('is_train: ', self.is_train)

    if test_ep is None:
        test_ep = self.ep_end

    test_history = History(self.config)

    if not self.display:
        gym_dir = '/tmp/%s-%s' % (self.env_name, get_time())
        # self.env.env.monitor.start(gym_dir)
        monitor = Monitor(self.env.env, directory=gym_dir)

    best_reward, best_idx = 0, 0
    total_reward = 0.
    for idx in range(n_episode):
        screen, reward, action, terminal = self.env.new_random_game()
        current_reward = 0

        for _ in range(self.history_length):
            test_history.add(screen)

        for t in tqdm(range(n_step), ncols=70):
            # 1. predict
            action = self.predict(test_history.get(), test_ep)
            # 2. act
            screen, reward, terminal = self.env.act(action, is_training=False)
            # 3. observe
            test_history.add(screen)
            # print('step: ', t, ' action: ', action, ' reward: ', reward)

            current_reward += reward
            if terminal:
                break

        if current_reward > best_reward:
            best_reward = current_reward
            best_idx = idx

        total_reward += current_reward

    print("=" * 30)
    print(" [%d] Best reward : %d" % (best_idx, best_reward))
    print("=" * 30)
    print('average reward is: ', total_reward / n_episode)

    if not self.display:
        monitor.close()

def reset(self, record):
    self.episode_step = 0
    if record:
        self.env = Monitor(gym.make('LunarLander-v2'), "recordings",
                           video_callable=lambda episode_id: True, force=True)
    else:
        self.env = gym.make('LunarLander-v2')
    return self.env.reset()

def make_env():
    env = gym.make(env_id)
    if record_video:
        video_path = os.path.join(output_dir, 'video')
        ensure_dir(video_path)
        env = Monitor(env, video_path,
                      video_callable=lambda episode_id: episode_id % record_video_freq == 0,
                      force=True)
    return env

def inizialize_wrapper(env, frame_skip: int, frame_width: int, frame_height: int, record_path: str):
    """Apply a set of wrappers for Atari games."""
    env = Monitor(env=env, directory=record_path, resume=True)
    env = MaxAndSkipEnv(env=env, skip=frame_skip)
    env = WarpFrame(env=env, width=frame_width, height=frame_height)
    env = ScaledFloatFrame(env=env)
    return env

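# A minimal usage sketch for inizialize_wrapper (not part of the original source):
# it assumes the MaxAndSkipEnv/WarpFrame/ScaledFloatFrame wrappers from the
# OpenAI Baselines atari_wrappers module are in scope, and 'PongNoFrameskip-v4'
# is only an illustrative environment id.
import gym

env = gym.make('PongNoFrameskip-v4')
env = inizialize_wrapper(env, frame_skip=4, frame_width=84, frame_height=84,
                         record_path='./records')
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
env.close()
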
def _thunk():
    env = make_atari(env_id)
    env.seed(seed + rank)
    if record_video:
        video_path = os.path.join(output_dir, 'video/env-%d' % rank)
        ensure_dir(video_path)
        env = Monitor(env, video_path,
                      video_callable=lambda episode_id: episode_id % record_video_freq == 0,
                      force=True)
    return wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False)

def make_atari_env(name, seed):
    from gym.wrappers.monitor import Monitor
    from gym.envs.atari.atari_env import AtariEnv
    env = AtariEnv(game=name, frameskip=4, obs_type='image')
    env = Monitor(env, 'videos/', force=True, video_callable=lambda e: False)
    env = wrappers.wrap_deepmind(env)
    env.seed(seed)
    return env

def make_atari_env(name, history_len):
    from gym.envs.atari.atari_env import AtariEnv
    from gym.wrappers.monitor import Monitor
    env = AtariEnv(game=name, frameskip=4, obs_type='image')
    env = Monitor(env, 'videos/', force=True, video_callable=lambda e: False)
    env = wrappers.wrap_deepmind(env)
    env = wrappers.HistoryWrapper(env, history_len)
    env.seed(utils.random_seed())
    return env

def make_env():
    env = gym.make(env_id)
    if record_video:
        print("RECORDING VIDEO")
        video_path = os.path.join(output_dir, 'video')
        ensure_dir(video_path)
        env = Monitor(env, video_path,
                      video_callable=lambda episode_id: episode_id % record_video_freq == 0,
                      force=True)
    # env.render()
    return env

def train_and_evaluate(args, monitor_path, checkpoint_step_filename,
                       checkpoint_weights_filename, weights_filename, log_filename):
    env = gym.make(args["env_name"])
    env = Monitor(env, monitor_path, resume=True, uid=args["run_id"],
                  video_callable=lambda episode_num: episode_num % args["record_video_every"] == 0)
    np.random.seed(args["random_seed"])
    env.seed(args["random_seed"])

    starting_step = 0
    if os.path.exists(checkpoint_step_filename):
        with open(checkpoint_step_filename, 'r') as f:
            starting_step = int(f.read())
    args["starting_step"] = starting_step

    dqn = make_deep_q_network(env, args)
    if args["starting_step"] > 0:
        dqn.load_weights(checkpoint_weights_filename)

    callbacks = [
        ReloadModelIntervalCheckpoint(checkpoint_weights_filename,
                                      step_path=checkpoint_step_filename,
                                      interval=args["checkpoint_frequency"],
                                      starting_step=starting_step),
        MyTrainLogger(args["checkpoint_frequency"], args["training_steps"],
                      starting_step, log_filename)
    ]

    if args["mode"] == "Train":
        dqn.fit(env, callbacks=callbacks, verbose=0,
                nb_steps=args["training_steps"] - starting_step,
                nb_max_start_steps=args["strarting_fire_steps"],
                start_step_policy=lambda obs: 1)  # 1 is fire action
        dqn.save_weights(weights_filename, overwrite=True)
    else:
        dqn.load_weights(weights_filename)
        env = gym.make(args["env_name"])
        env = Monitor(env, monitor_path, resume=True, uid=args["run_id"] + "_test")
        np.random.seed(args["random_seed"])
        env.seed(args["random_seed"])
        dqn.test(env, nb_episodes=1, visualize=False,
                 nb_max_start_steps=args["strarting_fire_steps"],
                 start_step_policy=lambda obs: 1)  # 1 is fire action

def check_pyglet():
    from pyglet.window import key
    a = np.array([0.0, 0.0, 0.0])

    def key_press(k, _mod):
        if k == key.LEFT: a[0] = +1.0
        if k == key.RIGHT: a[0] = -1.0
        if k == key.UP: a[1] = +1.0
        if k == key.DOWN: a[2] = +0.8  # set 1.0 for wheels to block to zero rotation

    def key_release(k, _mod):
        if k == key.LEFT and a[0] == +1.0: a[0] = 0
        if k == key.RIGHT and a[0] == -1.0: a[0] = 0
        if k == key.UP: a[1] = 0
        if k == key.DOWN: a[2] = 0

    env = CarRacingFix()
    env.render()
    env.viewer.window.on_key_press = key_press
    env.viewer.window.on_key_release = key_release
    record_video = False
    if record_video:
        from gym.wrappers.monitor import Monitor
        env = Monitor(env, '/tmp/video-test', force=True)
    if_open = True
    while if_open:
        env.reset()
        total_reward = 0.0
        steps = 0
        while True:
            s, r, done, info = env.step(a)
            total_reward += r
            if steps % 200 == 0 or done:
                print("\naction " + str(["{:+0.2f}".format(x) for x in a]))
                print("step {} total_reward {:+0.2f}".format(steps, total_reward))
                # import matplotlib.pyplot as plt
                # plt.imshow(s)
                # plt.savefig("test.jpeg")
            steps += 1
            if_open = env.render()
            if done or not if_open:
                break
    env.close()

def main(side_force):
    from pyglet.window import key
    a = np.array([0.0, 0.0, 0.0])

    def key_press(k, mod):
        global restart
        if k == 0xff0d: restart = True
        if k == key.LEFT: a[0] = -1.0
        if k == key.RIGHT: a[0] = +1.0
        if k == key.UP: a[1] = +1.0
        if k == key.DOWN: a[2] = +0.8  # set 1.0 for wheels to block to zero rotation

    def key_release(k, mod):
        if k == key.LEFT and a[0] == -1.0: a[0] = 0
        if k == key.RIGHT and a[0] == +1.0: a[0] = 0
        if k == key.UP: a[1] = 0
        if k == key.DOWN: a[2] = 0

    env = CarRacing(side_force=side_force)
    env.render()
    env.viewer.window.on_key_press = key_press
    env.viewer.window.on_key_release = key_release
    record_video = False
    if record_video:
        from gym.wrappers.monitor import Monitor
        env = Monitor(env, '/tmp/video-test', force=True)
    isopen = True
    while isopen:
        env.reset()
        total_reward = 0.0
        steps = 0
        restart = False
        while True:
            s, r, done, info = env.step(a)
            total_reward += r
            if steps % 200 == 0 or done:
                print("\naction " + str(["{:+0.2f}".format(x) for x in a]))
                print("step {} total_reward {:+0.2f}".format(steps, total_reward))
            steps += 1
            isopen = env.render()
            if done or restart or not isopen:
                break
    env.close()

def train_ddpg_official():
    env = LunarLanderContinuous()
    # env = LunarLander()
    env.render()
    record_video = False
    if record_video:
        from gym.wrappers.monitor import Monitor
        env = Monitor(env, config.video_folder / "ddpg/", force=True)

    num_states = env.observation_space.shape[0]
    print("Size of State Space -> {}".format(num_states))
    if env.continuous:
        num_actions = env.action_space.shape[0]
        print("Size of Action Space -> {}".format(num_actions))
        upper_bound = env.action_space.high[0]
        lower_bound = env.action_space.low[0]
        print("Max Value of Action -> {}".format(upper_bound))
        print("Min Value of Action -> {}".format(lower_bound))
    else:
        num_actions = env.action_space.n
        print("Size of Action Space -> {}".format(num_actions))
        upper_bound = num_actions
        lower_bound = 0
        print("Max Value of Action -> {}".format(upper_bound))
        print("Min Value of Action -> {}".format(lower_bound))

    ddpg = DDPG_OFF(num_states, num_actions, lower_bound, upper_bound)
    avg_reward_list = ddpg.train(env, 10)

    # Plotting graph: episodes versus average rewards
    plt.plot(avg_reward_list)
    plt.xlabel("Episode")
    plt.ylabel("Avg. Episodic Reward")
    plt.show()

def train_ddpg():
    env = LunarLander()
    env.render()
    record_video = False
    if record_video:
        from gym.wrappers.monitor import Monitor
        env = Monitor(env, config.video_folder / "ddpg/", force=True)

    agent = DDPG(env=env, num_actions=4,
                 input_shape=env.observation_space.shape[0],
                 continuous=False)  # env.action_space.shape[0]
    agent.load_model()

    n_games = 51
    figure_file = config.plots_folder / "ddpg/lunar_planer.png"
    load_checkpoint = True

    score_history = agent.train(env, n_games, load_checkpoint)

    if not load_checkpoint:
        x = [i + 1 for i in range(n_games)]
        plot_learning_curve(x, score_history, figure_file)

def render(env, agent, name="", record=False):
    if record:
        env = Monitor(env, './video-test/{}'.format(name), force=True, mode="evaluation")
    for i_episode in range(5):
        state = env.reset()
        total_reward = 0
        for step, _ in enumerate(range(STEPS), start=1):
            state = np.expand_dims(state, axis=0)
            env.render()
            action_index = agent.act(state)
            action = decode_action(action_index)
            next_state, reward, done, info = env.step(action)
            if done:
                break
            state = next_state
            total_reward += reward
        print("Episode achieves total reward {}".format(total_reward))

    if k==key.RIGHT: a[0] = +1.0
    if k==key.UP: a[1] = +1.0
    if k==key.DOWN: a[2] = +0.8  # set 1.0 for wheels to block to zero rotation

def key_release(k, mod):
    if k==key.LEFT and a[0]==-1.0: a[0] = 0
    if k==key.RIGHT and a[0]==+1.0: a[0] = 0
    if k==key.UP: a[1] = 0
    if k==key.DOWN: a[2] = 0

env = CarRacing()
env.render()
env.viewer.window.on_key_press = key_press
env.viewer.window.on_key_release = key_release
record_video = False
if record_video:
    from gym.wrappers.monitor import Monitor
    env = Monitor(env, '/tmp/video-test', force=True)
isopen = True
while isopen:
    env.reset()
    total_reward = 0.0
    steps = 0
    restart = False
    while True:
        s, r, done, info = env.step(a)
        total_reward += r
        if steps % 200 == 0 or done:
            print("\naction " + str(["{:+0.2f}".format(x) for x in a]))
            print("step {} total_reward {:+0.2f}".format(steps, total_reward))
            #import matplotlib.pyplot as plt
            #plt.imshow(s)
            #plt.savefig("test.jpeg")

            feed.update({rwd: rwd_queue[rand_indexs]})
            feed.update({next_obs: next_obs_queue[rand_indexs]})
            if not learning_finished:
                # If not solved, we train and get the step loss
                step_loss_value, _ = sess.run([loss, train_step], feed_dict=feed)
            else:
                # If solved, we just get the step loss
                step_loss_value = sess.run(loss, feed_dict=feed)
            # Use sum to calculate average loss of this episode
            sum_loss_value += step_loss_value

        print("====== Episode {} ended with score = {}, avg_loss = {} ======".format(
            i_episode + 1, score, sum_loss_value / score))
        score_queue.append(score)
        if len(score_queue) > MAX_SCORE_QUEUE_SIZE:
            score_queue.pop(0)
            if np.mean(score_queue) > 195:  # The threshold of being solved
                learning_finished = True
            else:
                learning_finished = False
        if learning_finished:
            print("Testing !!!")
        # save progress every 100 episodes
        if learning_finished and i_episode % 100 == 0:
            saver.save(sess, 'checkpoints-cartpole/' + GAME + '-dqn', global_step=global_step)


if __name__ == "__main__":
    env = gym.make(GAME)
    # wrap with Monitor (and keep the wrapped env) so episodes are actually recorded
    env = Monitor(env, './test/', force=True)
    train(env)
    env.close()

    if k == key.RIGHT and a[0] == +0.3: a[0] = 0
    if k == key.UP: a[1] = 0
    if k == key.DOWN: a[2] = 0

env = CarRacing()
env.render()
env.viewer.window.on_key_press = key_press
env.viewer.window.on_key_release = key_release
record_video = False
if record_video:
    from gym.wrappers.monitor import Monitor
    env = Monitor(env, "./tmp/video-test", force=True)
isopen = True
record_s, record_a, record_r = [], [], []
episode_num = 0
while isopen:
    env.reset()
    total_reward = 0.0
    steps = 0
    restart = False
    while True:
        s, r, done, info = env.step(a)
        if a[0] == -0.3:
            record_a.append(0)
        elif a[0] == 0.3:

def do_run(run, dirname, args):
    """
    global snapshot
    snapshot2 = tracemalloc.take_snapshot()
    print(('MEMORY', run, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
    if snapshot is not None:
        top_stats = snapshot2.compare_to(snapshot, 'lineno')
        print("[ Top 10 differences ]")
        for stat in top_stats[:10]:
            print(stat)
        print()
    snapshot = snapshot2
    """
    with tf.Graph().as_default():
        learner_assumptions = get_learner_assumption_kwargs(args)

        # Each run has a different random seed equal to the run id.
        np.random.seed(run)
        random.seed(run)

        is_gridworld = not 'lunar' in args.env_name.lower()

        # TODO: Reset test goal inside here? Or use environment instead?
        rollouts = [[]]

        # Initialize model with wrong transition model based on aristotle learner.
        rollouts[0] += make_rollouts(
            # policy=aristotle_pilot_policies[0],  # Was from a noisy policy.
            policy=policies.make_perfect_pilot_policy(
                goal=test_goal,
                act_labels=train_act_labels,
            ),
            env=test_env,
            n=args.n_initial_rollouts,
            task_idx=task_idx,
        )
        assert(len(rollouts[0]) == args.n_initial_rollouts)

        rollouts[0] += make_rollouts(
            # policy=aristotle_pilot_policies[0],  # Was from a noisy policy.
            policy=policies.make_perfect_pilot_policy(
                goal=test_goal,
                act_labels=train_act_labels,
            ),
            env=wrong_train_env,
            n=args.n_initial_wrong_rollouts,
            task_idx=task_idx,
        )

        model = None
        Q = None
        start_pos = None

        logs = []
        evals = []
        evals_unassisted = []
        learner_q_values = []

        with tf.Session() as sess:
            if needs_model:
                model = inverse_softq.InverseSoftQModel(
                    train_envs=[test_env]
                )

            # NOTE: Used to be inside episode loop!
            # TODO: Check if this broke anything!
            support_env = get_support_env(
                s=args.learner_support,
                model=model,
                sess=sess,
                goal=test_goal,
                test_act_labels=test_act_labels,
                n_act_dim=n_act_dim,
                threshold=args.bumper_threshold,
                q_bumper_boltzmann=args.q_bumper_boltzmann,
                q_bumper_version=args.q_bumper_version,
                q_bumper_target_r=args.q_bumper_target_r,
                q_bumper_length_normalized=args.q_bumper_length_normalized,
                q_bumper_logistic_upper_prob=args.q_bumper_logistic_upper_prob,
                q_bumper_alpha=args.q_bumper_alpha,
                q_threshold=args.q_threshold,
                test_env=test_env,
                env_name=args.env_name,
                start_pos=start_pos,
                trajectory_distance=args.trajectory_distance,
                dirname=dirname,
                p_override=args.p_override,
                undoing=args.undoing,
                p_suboptimal_override=args.p_suboptimal_override,
                override_next_best=args.override_next_best,
                optimal_agent_training_timesteps=args.optimal_agent_training_timesteps,
                optimal_agent_smoothing_timesteps=args.optimal_agent_smoothing_timesteps,
                gamma=args.gamma,
            )

            policy = get_learner_policy(
                s=args.learner_policy,
                # model=model,
                # sess=sess,
                # test_goal=test_goal,
                # train_act_labels=train_act_labels,
                # test_act_labels=test_act_labels,
                # n_act_dim=n_act_dim,
                # Q=Q,
                env=support_env,
                exploration_fraction=args.exploration_fraction,
                exploration_final_eps=args.exploration_final_eps,
                exploration_final_lr=args.exploration_final_lr,
                total_episodes=args.n_episodes,
                run=run,
            )

            for ep in range(args.n_episodes):
                # print('Rn: {} Ep: {}'.format(run, ep), flush=True)
                support_env_with_monitor = Monitor(
                    support_env,
                    directory=os.path.join(
                        dirname,
                        'assisted',
                        str(run).zfill(3),
                        str(ep).zfill(3),
                    ),
                    force=True,
                    video_callable=lambda e: True if is_gridworld or utils.IS_LOCAL else False,
                    # video_callable=(lambda e: True) if is_gridworld else None,
                )

                # Simulate human learning
                """
                if args.learner_policy == 'q':
                    assert(args.n_learn_rollouts > 0)
                    Q = policies.q_learning(
                        rollouts if ep == 0 else [rollouts[0][-args.n_learn_rollouts:]],
                        n_obs_dim=n_obs_dim,
                        n_act_dim=n_act_dim,
                        user_action=args.think_all_actions_own,
                        Q_init=Q,
                        learning_rate=args.q_learning_rate,
                    )
                """

                _logs = None
                if needs_model:
                    _logs = inverse_softq.run_learning(
                        model=model,
                        sess=sess,
                        # train_tasks=train_aristotle_envs[:1],
                        rollouts=rollouts,
                        test_goal=test_goal,
                        test_act_labels=test_act_labels,
                        train_act_labels=train_act_labels,
                        n_iters=args.n_softq_train_iters,
                        train_frac=0.9,  # TODO: Change to 1
                        **learner_assumptions
                    )

                # Test
                # episode_seed = [run, ep]
                perf = compute_assisted_perf(
                    model=model,
                    sess=sess,
                    # test_act_labels=test_act_labels,
                    # train_act_labels=train_act_labels,
                    test_env=support_env_with_monitor,
                    policy=policy,
                    goal=test_goal,
                    # seed=episode_seed,
                    n_eval_rollouts=args.n_eval_rollouts,
                    policy_explore=True,
                    policy_update=True,
                    **learner_assumptions
                )

                unassisted_perf = None
                if args.n_eval_unassisted_rollouts is not None:
                    unassisted_support_env = get_support_env(
                        s='unassisted',
                        goal=test_goal,
                        test_act_labels=test_act_labels,
                        n_act_dim=n_act_dim,
                        test_env=test_env,
                        env_name=args.env_name,
                        start_pos=start_pos,
                        trajectory_distance=args.trajectory_distance,
                        dirname=dirname,
                    )
                    unassisted_support_env_with_monitor = Monitor(
                        unassisted_support_env,
                        directory=os.path.join(
                            dirname,
                            'unassisted',
                            str(run).zfill(3),
                            str(ep).zfill(3),
                        ),
                        force=True,
                        video_callable=lambda e: True if is_gridworld or utils.IS_LOCAL else False,
                        # video_callable=(lambda e: True) if is_gridworld else None,
                    )
                    unassisted_perf = compute_assisted_perf(
                        model=model,
                        sess=sess,
                        # test_act_labels=test_act_labels,
                        # train_act_labels=train_act_labels,
                        test_env=unassisted_support_env_with_monitor,
                        policy=policy,
                        goal=test_goal,
                        # seed=episode_seed,
                        n_eval_rollouts=args.n_eval_unassisted_rollouts,
                        policy_explore=False,
                        policy_update=False,
                    )
                    unassisted_support_env_with_monitor.close()
                    unassisted_support_env.close()

                new_rollouts = perf['rollouts']
                rollouts[task_idx] += new_rollouts[:args.n_learn_rollouts]

                if _logs is not None:
                    logs.append(_logs)
                evals.append(perf)
                evals_unassisted.append(unassisted_perf)
                if args.learner_policy == 'q':
                    learner_q_values.append(copy(policy.Q))

                support_env_with_monitor.close()

            support_env.close()
            policy.close()

    out_d = {
        'logs': logs,
        'evals': evals,
        'evals_unassisted': (
            evals_unassisted
            if args.n_eval_unassisted_rollouts is not None
            else None
        ),
        'q_values': learner_q_values,
        'args': vars(args),
        'run': run,
        'support_details': support_env.get_support_details(),
    }
    with open(
        os.path.join(dirname, 'data{}.json'.format(str(run).zfill(3))),
        'w',
    ) as f:
        json.dump(out_d, f, cls=NumpyEncoder)

    if k == key.DOWN: a[2] = +0.8  # set 1.0 for wheels to block to zero rotation

def key_release(k, mod):
    if k == key.LEFT and a[0] == -1.0: a[0] = 0
    if k == key.RIGHT and a[0] == +1.0: a[0] = 0
    if k == key.UP: a[1] = 0
    if k == key.DOWN: a[2] = 0

env = CarRacingV1()
env.render()
env.viewer.window.on_key_press = key_press
env.viewer.window.on_key_release = key_release
record_video = False
if record_video:
    from gym.wrappers.monitor import Monitor
    env = Monitor(env, '/tmp/video-test', force=True)
isopen = True
while isopen:
    env.reset()
    total_reward = 0.0
    steps = 0
    restart = False
    while True:
        s, r, done, info = env.step(a)
        total_reward += r
        if steps % 200 == 0 or done:
            print("\naction " + str(["{:+0.2f}".format(x) for x in a]))
            print("step {} total_reward {:+0.2f}".format(steps, total_reward))
            #import matplotlib.pyplot as plt
            #plt.imshow(s)

        a[0] = 0
    if k == key.D: set_trace()
    if k == key.R: env.reset()
    if k == key.Z: env.change_zoom()
    if k == key.G: env.switch_intersection_groups()
    if k == key.I: env.switch_intersection_points()
    if k == key.X: env.switch_xt_intersections()
    if k == key.E: env.switch_end_of_track()
    if k == key.S: env.switch_start_of_track()
    if k == key.T: env.screenshot('./')
    if k == key.Q: sys.exit()

env.render()
if record_video:
    PATH = os.path.abspath('env/') + '/data_samples/core_environments/'
    env = Monitor(env, PATH + 'car_racing_advanced', force=True)
# env.key_press_fn = key_press
# env.key_release_fn = key_release
env.viewer.window.on_key_press = key_press
env.viewer.window.on_key_release = key_release
while True:
    env.reset()
    total_reward = 0.0
    steps = 0
    restart = False
    while True:
        if discretize is not None:
            a_tmp = a[0]
        else:

""" Use this file to check that your implementation complies with our evaluation interface. """ import gym from gym.wrappers.monitor import Monitor from challenge1 import get_model, get_policy # 1. Learn the model f: s, a -> s', r env = Monitor(gym.make('Pendulum-v0'), 'training', video_callable=False, force=True) env.seed(98251624) max_num_samples = 10000 model = get_model(env, max_num_samples) env.close() # Your model will be tested on the quality of prediction obs = env.reset() act = env.action_space.sample() nobs, rwd, _, _ = env.step(act) nobs_pred, rwd_pred = model(obs, act) print(f'truth = {nobs, rwd}\nmodel = {nobs_pred, rwd_pred}') env.close() # 2. Perform dynamic programming using the learned model env = Monitor(gym.make('Pendulum-v0'), 'evaluation', force=True) env.seed(31186490) policy = get_policy(model, env.observation_space, env.action_space)
        self.steps += 1
        if self.steps == self.max_length:
            done = True
        return ob, reward, done, info


if __name__ == '__main__':
    max_iterations = 80
    max_episodes = 100
    max_trajectory = 50

    task = MaxLength(WarpFrame(CollectEnv(
        goal_condition=lambda x: (x.colour == 'beige' and x.shape == 'square')
                                 or (x.colour == 'purple' and x.shape == 'circle'))),
        max_trajectory)
    env = Monitor(task, './experiment_weighted_or/', video_callable=False, force=True)

    dqn_purple_circle = load('./models/purple_circle/model.dqn', task)  # entropy regularised functions
    dqn_beige_crate = load('./models/beige_crate/model.dqn', task)  # entropy regularised functions

    weights = np.arange(1 / 3, 3.01, 0.05)
    tally = {i: [] for i in range(len(weights))}
    for iter in range(max_iterations):
        for i, weight in enumerate(weights):
            collected_count = [0, 0]
            weight = 1
            dqn_composed = ComposedDQN([dqn_beige_crate, dqn_purple_circle], [weight, 1])
            for episode in range(max_episodes):
                if episode % 1000 == 0:
                    print(episode)

import gym, time
import numpy as np
from getModel import getModelQube, getModelPendel
from gym.wrappers.monitor import Monitor
from sklearn.neural_network import MLPRegressor
from challenge1_template import get_model, get_policy
from scipy import spatial

env = Monitor(gym.make('Pendulum-v0'), 'training', video_callable=False, force=True)
env.seed(98251624)
max_num_samples = 10000

model = get_model(env, max_num_samples)

max_state = env.observation_space.high
min_state = env.observation_space.low
max_action = env.action_space.high
min_action = env.action_space.low

discret_states = 100
discrete_actions = 4
discount_factor = 0.99
theta = 1


def discreizeSpace(min_state, max_state, discret_num):
    discrete_space = []
    for i in range(0, len(max_state)):
        min = min_state[i]
        max = max_state[i]

def record_game(env):
    # TODO: Test this
    return Monitor(env, '/tmp/video-test', force=True)

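# A quick way to exercise record_game (a sketch, on the assumption that any Gym env
# supporting rgb_array rendering works here; 'CartPole-v0' is only an illustrative
# choice): wrap the env, run one random episode, and check that /tmp/video-test
# contains the recorded video and stats files afterwards.
import gym

env = record_game(gym.make('CartPole-v0'))
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
env.close()  # flushes the Monitor output to /tmp/video-test
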
from model import Model
import numpy as np
import tensorflow as tf
import gym
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from gym.wrappers.monitor import Monitor

# Test the model
env = gym.make("CartPole-v0")
env = Monitor(env, "videos", force=True)
model = Model(num_actions=env.action_space.n)
obs = env.reset()
action, value = model.action_value(obs[None, :])
print(action, value)


# Create the agent
class Agent():
    def __init__(self, model):
        self.params = {
            "value": 0.5,
            "entropy": 0.0001,
            "gamma": 0.99}
        self.model = model
        self.model.compile(
            optimizer=tf.keras.optimizers.Adam(lr=0.0005),
            loss=[self._logits_loss, self._value_loss]
        )

    def test(self, env, render=True):
        obs, done, ep_reward = env.reset(), False, 0

def main():
    toRender = {
        "line": 1,
        "circle": 1,
        "parabola": 0,
        "cycloid": 1,
        "random": 1,
        "rl": 0
    }

    if len(sys.argv) == 2:
        # read actions from file
        global env4list
        # toRender["rl"] = 1
        # fin = open(sys.argv[1], "r")
        # line = fin.readline()
        env4list = np.load(sys.argv[1])
        env4list = smooth(env4list)
        toRender["rl"] = 1
        # fin.close()

    global gViewer
    gViewer = rendering.Viewer(600, 600)
    saveVideo = True

    global env0, env0theta, env0done
    if toRender["random"]:
        env0 = bc.BrachistochroneEnv("random", gViewer, (0, 0, 0))
        if saveVideo:
            from gym.wrappers.monitor import Monitor
            env0 = Monitor(env0, './video-test', force=True)
        env0.reset()
        env0theta = 0
        env0done = False
        env0.score_label.x = gViewer.width - 150
        env0.score_label.y = gViewer.height - 10

    if toRender["line"]:
        global env1, env1theta, env1done
        env1 = bc.BrachistochroneEnv("line", gViewer, (1, 0, 0))
        if toRender["random"]:
            env1.setStartPosition(env0.start_position)
        env1done = False
        env1theta = math.atan(
            (env1.goal_position[1] - env1.start_position[1]) /
            (env1.goal_position[0] - env1.start_position[0])) / math.pi
        env1.reset()
        env1.score_label.x = gViewer.width - 150
        env1.score_label.y = gViewer.height - 25

    if toRender["circle"]:
        global env2, env2theta, env2done
        env2 = bc.BrachistochroneEnv("circle", gViewer, (0, 0, 1))
        if toRender["random"]:
            env2.setStartPosition(env0.start_position)
        env2done = False
        env2theta = 2 * math.atan(
            (env2.goal_position[1] - env2.start_position[1]) /
            (env2.goal_position[0] - env2.start_position[0])) / math.pi
        env2.reset()
        env2.score_label.x = gViewer.width - 150
        env2.score_label.y = gViewer.height - 40

    if toRender["cycloid"]:
        global env3, env3theta, env3done, R_cycloid, T_Cycloid
        env3 = bc.BrachistochroneEnv("cycloid", gViewer, (0, 0.75, 0.25))
        if toRender["random"]:
            env3.setStartPosition(env0.start_position)
        R_cycloid, T_Cycloid = solveCycloidInit(env3.start_position, env3.goal_position)
        env3theta = 2 * math.atan(
            (env3.goal_position[1] - env3.start_position[1]) /
            (env3.goal_position[0] - env3.start_position[0])) / math.pi
        env3done = False
        env3.reset()
        env3.score_label.x = gViewer.width - 150
        env3.score_label.y = gViewer.height - 55

    if toRender["rl"]:
        global env4, env4theta, env4done
        env4 = bc.BrachistochroneEnv("RL Agent", gViewer, (1, 0.5, 0))
        env4.reset()
        env4theta = 0
        env4done = False
        env4.score_label.x = gViewer.width - 150
        env4.score_label.y = gViewer.height - 70

    numsteps = 1000
    for i in range(numsteps):
        toRender["random"] and env0.render()
        toRender["line"] and env1.render()
        toRender["circle"] and env2.render()
        toRender["cycloid"] and env3.render()
        toRender["rl"] and env4.render()

        if toRender["random"] and not env0done:
            env0theta = env0.action_space.sample()
            _, _, env0done, _ = env0.step(np.float32(env0theta))
        if toRender["line"] and not env1done:
            _, _, env1done, _ = env1.step(np.float32([env1theta]))
        if toRender["circle"] and not env2done:
            _, _, env2done, _ = env2.step(np.float32([env2theta]))
            env2theta = 2 * math.atan(
                (env2.goal_position[1] - env2.state[1]) /
                (env2.goal_position[0] - env2.state[0])) / math.pi
        if toRender["cycloid"] and not env3done:
            _, _, env3done, _ = env3.step(np.float32([env3theta]))
            env3theta = solveCycloid(env3.start_position, [env3.state[0], env3.state[1]])
        """
        if toRender["rl"] and not env5done:
            line = fin.readline()
            if line:
                env0theta = [float(line)]
                _, _, env0done, _ = env5.step(np.float32([env5theta]))
            else:
                env0done = True
        """
        if toRender["rl"] and not env4done:
            if i >= len(env4list):
                continue
            env4theta = env4list[i]
            _, _, env4done, _ = env4.step(np.float32([env4theta]))

    toRender["random"] and env0.close()
    toRender["line"] and env1.close()
    toRender["circle"] and env2.close()
    toRender["cycloid"] and env3.close()
    if toRender["rl"]:
        pts = env4.path
        print(pts)
        coeffs = polyfit(pts)
        env4.close()
    return

class DQNAgent:
    def __init__(self, lr, momentum, alpha, gamma, target_update_frequency,
                 local_update_frequency, replay_start_size, queue_len, batch_size):
        gym.logger.set_level(40)
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.env = gym.make('LunarLander-v2')
        self.replay_buffer = ReplayBuffer(queue_len, self.device, alpha)
        self.local_qnetwork = DQNModel().to(self.device)
        self.target_qnetwork = DQNModel().to(self.device)
        self.target_qnetwork.load_state_dict(self.local_qnetwork.state_dict())
        self.optimizer = optim.RMSprop(self.local_qnetwork.parameters(),
                                       lr=lr, momentum=momentum)
        self.gamma = gamma
        self.target_update_frequency = target_update_frequency
        self.local_update_frequency = local_update_frequency
        self.replay_start_size = replay_start_size
        self.batch_size = batch_size
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.episode_step = 0

    def agent_step(self, state, eps, beta):
        next_state, reward, done = self.env_step(state, eps)
        if len(self.replay_buffer.queue) < self.replay_start_size:
            return next_state, reward, None, done

        # Update the local q network every local_update_frequency steps
        loss = None
        if self.episode_step % self.local_update_frequency == 0:
            loss = self.qnetwork_step(beta)

        # Update the target q network every target_update_frequency steps
        if self.episode_step % self.target_update_frequency == 0:
            self.target_qnetwork.load_state_dict(
                self.local_qnetwork.state_dict())

        self.episode_step += 1
        return next_state, reward, loss, done

    def env_step(self, state, eps):
        action = self.policy(state, eps)
        next_state, reward, done, _ = self.env.step(action)
        self.replay_buffer.put([state, action, reward, next_state, done])
        return next_state, reward, done

    def qnetwork_step(self, beta):
        states, actions, rewards, next_states, dones, indices, is_weights = self.replay_buffer.batch_get(
            self.batch_size, self.state_size, beta)

        # Double DQN
        next_target_actions = torch.argmax(self.local_qnetwork(next_states),
                                           dim=1).unsqueeze(1)
        next_target_rewards = self.target_qnetwork(next_states).gather(
            1, next_target_actions)
        target_rewards = rewards + self.gamma * next_target_rewards * (1 - dones)
        local_rewards = self.local_qnetwork(states).gather(1, actions.long())

        self.optimizer.zero_grad()
        td_error = (local_rewards - target_rewards.detach()) ** 2
        loss = torch.mean(is_weights.unsqueeze(1) * td_error)
        loss.backward()
        for param in self.local_qnetwork.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        self.replay_buffer.update_priorities(indices, td_error.data.cpu() + 0.0001)
        return loss.item()

    def policy(self, state, eps):
        if random.random() < eps:
            # Random action
            return self.env.action_space.sample()
        else:
            # Act according to local q network
            self.local_qnetwork.eval()
            with torch.no_grad():
                out = self.local_qnetwork(
                    torch.FloatTensor(state).to(
                        self.device).unsqueeze(0)).cpu()
            self.local_qnetwork.train()
            return torch.argmax(out).item()

    def reset(self, record):
        self.episode_step = 0
        if record:
            self.env = Monitor(gym.make('LunarLander-v2'), "recordings",
                               video_callable=lambda episode_id: True, force=True)
        else:
            self.env = gym.make('LunarLander-v2')
        return self.env.reset()

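# A rough driver loop for DQNAgent (a sketch, not part of the original code): the
# hyperparameter values and schedules are illustrative assumptions. Recording is
# switched on via reset(record=...) for every 100th episode, which wraps that
# episode's env in a Monitor writing to the "recordings" directory.
agent = DQNAgent(lr=5e-4, momentum=0.95, alpha=0.6, gamma=0.99,
                 target_update_frequency=1000, local_update_frequency=4,
                 replay_start_size=1000, queue_len=100000, batch_size=64)
eps, beta = 1.0, 0.4
for episode in range(500):
    state = agent.reset(record=(episode % 100 == 0))
    done = False
    while not done:
        state, reward, loss, done = agent.agent_step(state, eps, beta)
    eps = max(0.01, eps * 0.995)  # simple epsilon decay
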
def record_game(env):
    # TODO: Test this
    print('Recording')
    if not os.path.exists(VIDEO_PATH):
        os.mkdir(VIDEO_PATH)
    return Monitor(env, VIDEO_PATH, force=True)

import gym
from gym.wrappers.monitor import Monitor

from tf_rl.env.pybullet.env_list import ENVS

for key, env_name in ENVS.items():
    print(env_name)
    env = gym.make(env_name)
    env = Monitor(env=env, directory="./video/{}".format(key), force=True)
    state = env.reset()
    for t in range(100):
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        if done:
            break
    env.close()