def play_poison(self, n_step=10000, n_episode=1000, test_ep=None, render=False):
    print('play poison: ', self.poison)
    print('is_train: ', self.is_train)
    print('=' * 30)
    if test_ep is None:
        test_ep = self.ep_end
    test_history = History(self.config)

    if not self.display:
        gym_dir = '/tmp/%s-%s' % (self.env_name, get_time())
        #self.env.env.monitor.start(gym_dir)
        # NOTE: the wrapper is created here, but the loop below keeps stepping
        # the unwrapped self.env, so nothing is recorded (see sketch below).
        monitor = Monitor(self.env.env, directory=gym_dir)

    best_reward, best_idx = 0, 0
    total_reward = 0.
    for idx in range(n_episode):
        screen, reward, action, terminal = self.env.new_random_game()
        current_reward = 0

        for _ in range(self.history_length):
            test_history.add(screen)

        for t in tqdm(range(n_step), ncols=70):
            # 1. predict
            action = self.predict(test_history.get(), test_ep)
            # 2. act
            screen, reward, terminal = self.env.act(action, is_training=False)
            # 3. observe
            test_history.add(screen)
            # print('step: ', t, ' action: ', action, ' reward: ', reward)

            current_reward += reward
            if terminal:
                break

        if current_reward > best_reward:
            best_reward = current_reward
            best_idx = idx
        total_reward += current_reward

    print("=" * 30)
    print(" [%d] Best reward : %d" % (best_idx, best_reward))
    print("=" * 30)
    print('average reward is: ', total_reward / n_episode)

    if not self.display:
        monitor.close()
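# The Monitor above wraps self.env.env, but the loop keeps stepping the raw
# self.env, so no video is captured. A minimal sketch of the intended pattern
# with the old (pre-0.20) gym Monitor API -- env, gym_dir, and the sampled
# action are placeholders, not names from this codebase:
from gym.wrappers.monitor import Monitor

env = Monitor(env, directory=gym_dir, force=True)  # wrap once...
obs = env.reset()                                  # ...then step the wrapped env
obs, reward, done, info = env.step(env.action_space.sample())
env.close()                                        # flushes recordings to gym_dir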
def do_run(run, dirname, args):
    """
    global snapshot
    snapshot2 = tracemalloc.take_snapshot()
    print(('MEMORY', run, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
    if snapshot is not None:
        top_stats = snapshot2.compare_to(snapshot, 'lineno')
        print("[ Top 10 differences ]")
        for stat in top_stats[:10]:
            print(stat)
        print()
    snapshot = snapshot2
    """
    with tf.Graph().as_default():
        learner_assumptions = get_learner_assumption_kwargs(args)
        # Each run has a different random seed equal to the run id.
        np.random.seed(run)
        random.seed(run)
        is_gridworld = 'lunar' not in args.env_name.lower()
        # TODO: Reset test goal inside here? Or use environment instead?
        rollouts = [[]]
        # Initialize model with wrong transition model based on aristotle learner.
        rollouts[0] += make_rollouts(
            #policy=aristotle_pilot_policies[0],  # Was from a noisy policy.
            policy=policies.make_perfect_pilot_policy(
                goal=test_goal,
                act_labels=train_act_labels,
            ),
            env=test_env,
            n=args.n_initial_rollouts,
            task_idx=task_idx,
        )
        assert len(rollouts[0]) == args.n_initial_rollouts
        rollouts[0] += make_rollouts(
            #policy=aristotle_pilot_policies[0],  # Was from a noisy policy.
            policy=policies.make_perfect_pilot_policy(
                goal=test_goal,
                act_labels=train_act_labels,
            ),
            env=wrong_train_env,
            n=args.n_initial_wrong_rollouts,
            task_idx=task_idx,
        )

        model = None
        Q = None
        start_pos = None
        logs = []
        evals = []
        evals_unassisted = []
        learner_q_values = []

        with tf.Session() as sess:
            if needs_model:
                model = inverse_softq.InverseSoftQModel(
                    train_envs=[test_env]
                )

            # NOTE: Used to be inside episode loop!
            # TODO: Check if this broke anything!
            support_env = get_support_env(
                s=args.learner_support,
                model=model,
                sess=sess,
                goal=test_goal,
                test_act_labels=test_act_labels,
                n_act_dim=n_act_dim,
                threshold=args.bumper_threshold,
                q_bumper_boltzmann=args.q_bumper_boltzmann,
                q_bumper_version=args.q_bumper_version,
                q_bumper_target_r=args.q_bumper_target_r,
                q_bumper_length_normalized=args.q_bumper_length_normalized,
                q_bumper_logistic_upper_prob=args.q_bumper_logistic_upper_prob,
                q_bumper_alpha=args.q_bumper_alpha,
                q_threshold=args.q_threshold,
                test_env=test_env,
                env_name=args.env_name,
                start_pos=start_pos,
                trajectory_distance=args.trajectory_distance,
                dirname=dirname,
                p_override=args.p_override,
                undoing=args.undoing,
                p_suboptimal_override=args.p_suboptimal_override,
                override_next_best=args.override_next_best,
                optimal_agent_training_timesteps=args.optimal_agent_training_timesteps,
                optimal_agent_smoothing_timesteps=args.optimal_agent_smoothing_timesteps,
                gamma=args.gamma,
            )
            policy = get_learner_policy(
                s=args.learner_policy,
                #model=model,
                #sess=sess,
                #test_goal=test_goal,
                #train_act_labels=train_act_labels,
                #test_act_labels=test_act_labels,
                #n_act_dim=n_act_dim,
                #Q=Q,
                env=support_env,
                exploration_fraction=args.exploration_fraction,
                exploration_final_eps=args.exploration_final_eps,
                exploration_final_lr=args.exploration_final_lr,
                total_episodes=args.n_episodes,
                run=run,
            )

            for ep in range(args.n_episodes):
                #print('Rn: {} Ep: {}'.format(run, ep), flush=True)
                support_env_with_monitor = Monitor(
                    support_env,
                    directory=os.path.join(
                        dirname,
                        'assisted',
                        str(run).zfill(3),
                        str(ep).zfill(3),
                    ),
                    force=True,
                    video_callable=lambda e: is_gridworld or utils.IS_LOCAL,
                    #video_callable=(lambda e: True) if is_gridworld else None,
                )

                # Simulate human learning
                """
                if args.learner_policy == 'q':
                    assert args.n_learn_rollouts > 0
                    Q = policies.q_learning(
                        rollouts if ep == 0 else [rollouts[0][-args.n_learn_rollouts:]],
                        n_obs_dim=n_obs_dim,
                        n_act_dim=n_act_dim,
                        user_action=args.think_all_actions_own,
                        Q_init=Q,
                        learning_rate=args.q_learning_rate,
                    )
                """

                _logs = None
                if needs_model:
                    _logs = inverse_softq.run_learning(
                        model=model,
                        sess=sess,
                        # train_tasks=train_aristotle_envs[:1],
                        rollouts=rollouts,
                        test_goal=test_goal,
                        test_act_labels=test_act_labels,
                        train_act_labels=train_act_labels,
                        n_iters=args.n_softq_train_iters,
                        train_frac=0.9,  # TODO: Change to 1
                        **learner_assumptions
                    )

                # Test
                #episode_seed = [run, ep]
                perf = compute_assisted_perf(
                    model=model,
                    sess=sess,
                    #test_act_labels=test_act_labels,
                    #train_act_labels=train_act_labels,
                    test_env=support_env_with_monitor,
                    policy=policy,
                    goal=test_goal,
                    #seed=episode_seed,
                    n_eval_rollouts=args.n_eval_rollouts,
                    policy_explore=True,
                    policy_update=True,
                    **learner_assumptions
                )

                unassisted_perf = None
                if args.n_eval_unassisted_rollouts is not None:
                    unassisted_support_env = get_support_env(
                        s='unassisted',
                        goal=test_goal,
                        test_act_labels=test_act_labels,
                        n_act_dim=n_act_dim,
                        test_env=test_env,
                        env_name=args.env_name,
                        start_pos=start_pos,
                        trajectory_distance=args.trajectory_distance,
                        dirname=dirname,
                    )
                    unassisted_support_env_with_monitor = Monitor(
                        unassisted_support_env,
                        directory=os.path.join(
                            dirname,
                            'unassisted',
                            str(run).zfill(3),
                            str(ep).zfill(3),
                        ),
                        force=True,
                        video_callable=lambda e: is_gridworld or utils.IS_LOCAL,
                        #video_callable=(lambda e: True) if is_gridworld else None,
                    )
                    unassisted_perf = compute_assisted_perf(
                        model=model,
                        sess=sess,
                        #test_act_labels=test_act_labels,
                        #train_act_labels=train_act_labels,
                        test_env=unassisted_support_env_with_monitor,
                        policy=policy,
                        goal=test_goal,
                        #seed=episode_seed,
                        n_eval_rollouts=args.n_eval_unassisted_rollouts,
                        policy_explore=False,
                        policy_update=False,
                    )
                    unassisted_support_env_with_monitor.close()
                    unassisted_support_env.close()

                new_rollouts = perf['rollouts']
                rollouts[task_idx] += new_rollouts[:args.n_learn_rollouts]
                if _logs is not None:
                    logs.append(_logs)
                evals.append(perf)
                evals_unassisted.append(unassisted_perf)
                if args.learner_policy == 'q':
                    learner_q_values.append(copy(policy.Q))
                support_env_with_monitor.close()

            support_env.close()
            policy.close()

        out_d = {
            'logs': logs,
            'evals': evals,
            'evals_unassisted': (
                evals_unassisted
                if args.n_eval_unassisted_rollouts is not None
                else None
            ),
            'q_values': learner_q_values,
            'args': vars(args),
            'run': run,
            'support_details': support_env.get_support_details(),
        }
        with open(
            os.path.join(dirname, 'data{}.json'.format(str(run).zfill(3))),
            'w',
        ) as f:
            json.dump(out_d, f, cls=NumpyEncoder)
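# How do_run is driven is not shown in this excerpt. A hypothetical driver
# (args.n_runs and the sequential loop are assumptions, not from the source)
# would seed one run per id and write one data<run>.json file per run:
for run in range(args.n_runs):
    do_run(run, dirname, args)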
    # ...tail of the key_release handler; the start of the handler pair is
    # not included in this excerpt (see the sketch after this snippet).
    if k == key.DOWN:
        a[2] = 0

env = CarRacingV1()
env.render()
env.viewer.window.on_key_press = key_press
env.viewer.window.on_key_release = key_release

record_video = False
if record_video:
    from gym.wrappers.monitor import Monitor
    env = Monitor(env, '/tmp/video-test', force=True)

isopen = True
while isopen:
    env.reset()
    total_reward = 0.0
    steps = 0
    restart = False
    while True:
        s, r, done, info = env.step(a)
        total_reward += r
        if steps % 200 == 0 or done:
            print("\naction " + str(["{:+0.2f}".format(x) for x in a]))
            print("step {} total_reward {:+0.2f}".format(steps, total_reward))
            #import matplotlib.pyplot as plt
            #plt.imshow(s)
            #plt.savefig("test.jpeg")
        steps += 1
        isopen = env.render()
        if done or restart or not isopen:
            break
env.close()
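# For context, the keyboard handlers whose tail appears above follow the
# standard gym car_racing manual-control demo. This is a sketch of that
# pattern, not this repository's exact code; `a` is the shared action array:
import numpy as np
from pyglet.window import key

a = np.array([0.0, 0.0, 0.0])  # [steer, gas, brake]

def key_press(k, mod):
    global restart
    if k == key.LEFT:
        a[0] = -1.0
    if k == key.RIGHT:
        a[0] = +1.0
    if k == key.UP:
        a[1] = +1.0
    if k == key.DOWN:
        a[2] = +0.8  # mild brake so the wheels don't lock

def key_release(k, mod):
    if k == key.LEFT and a[0] == -1.0:
        a[0] = 0
    if k == key.RIGHT and a[0] == +1.0:
        a[0] = 0
    if k == key.UP:
        a[1] = 0
    if k == key.DOWN:
        a[2] = 0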
def main():
    toRender = {
        "line": 1,
        "circle": 1,
        "parabola": 0,
        "cycloid": 1,
        "random": 1,
        "rl": 0,
    }
    if len(sys.argv) == 2:
        # Read actions from file.
        global env4list
        #toRender["rl"] = 1
        #fin = open(sys.argv[1], "r")
        #line = fin.readline()
        env4list = np.load(sys.argv[1])
        env4list = smooth(env4list)
        toRender["rl"] = 1
        #fin.close()

    global gViewer
    gViewer = rendering.Viewer(600, 600)
    saveVideo = True

    global env0, env0theta, env0done
    if toRender["random"]:
        env0 = bc.BrachistochroneEnv("random", gViewer, (0, 0, 0))
        if saveVideo:
            from gym.wrappers.monitor import Monitor
            env0 = Monitor(env0, './video-test', force=True)
        env0.reset()
        env0theta = 0
        env0done = False
        env0.score_label.x = gViewer.width - 150
        env0.score_label.y = gViewer.height - 10

    if toRender["line"]:
        global env1, env1theta, env1done
        env1 = bc.BrachistochroneEnv("line", gViewer, (1, 0, 0))
        if toRender["random"]:
            env1.setStartPosition(env0.start_position)
        env1done = False
        env1theta = math.atan(
            (env1.goal_position[1] - env1.start_position[1]) /
            (env1.goal_position[0] - env1.start_position[0])) / math.pi
        env1.reset()
        env1.score_label.x = gViewer.width - 150
        env1.score_label.y = gViewer.height - 25

    if toRender["circle"]:
        global env2, env2theta, env2done
        env2 = bc.BrachistochroneEnv("circle", gViewer, (0, 0, 1))
        if toRender["random"]:
            env2.setStartPosition(env0.start_position)
        env2done = False
        env2theta = 2 * math.atan(
            (env2.goal_position[1] - env2.start_position[1]) /
            (env2.goal_position[0] - env2.start_position[0])) / math.pi
        env2.reset()
        env2.score_label.x = gViewer.width - 150
        env2.score_label.y = gViewer.height - 40

    if toRender["cycloid"]:
        global env3, env3theta, env3done, R_cycloid, T_Cycloid
        env3 = bc.BrachistochroneEnv("cycloid", gViewer, (0, 0.75, 0.25))
        if toRender["random"]:
            env3.setStartPosition(env0.start_position)
        R_cycloid, T_Cycloid = solveCycloidInit(env3.start_position,
                                                env3.goal_position)
        env3theta = 2 * math.atan(
            (env3.goal_position[1] - env3.start_position[1]) /
            (env3.goal_position[0] - env3.start_position[0])) / math.pi
        env3done = False
        env3.reset()
        env3.score_label.x = gViewer.width - 150
        env3.score_label.y = gViewer.height - 55

    if toRender["rl"]:
        global env4, env4theta, env4done
        env4 = bc.BrachistochroneEnv("RL Agent", gViewer, (1, 0.5, 0))
        env4.reset()
        env4theta = 0
        env4done = False
        env4.score_label.x = gViewer.width - 150
        env4.score_label.y = gViewer.height - 70

    numsteps = 1000
    for i in range(numsteps):
        toRender["random"] and env0.render()
        toRender["line"] and env1.render()
        toRender["circle"] and env2.render()
        toRender["cycloid"] and env3.render()
        toRender["rl"] and env4.render()

        if toRender["random"] and not env0done:
            env0theta = env0.action_space.sample()
            _, _, env0done, _ = env0.step(np.float32(env0theta))
        if toRender["line"] and not env1done:
            _, _, env1done, _ = env1.step(np.float32([env1theta]))
        if toRender["circle"] and not env2done:
            _, _, env2done, _ = env2.step(np.float32([env2theta]))
            env2theta = 2 * math.atan(
                (env2.goal_position[1] - env2.state[1]) /
                (env2.goal_position[0] - env2.state[0])) / math.pi
        if toRender["cycloid"] and not env3done:
            _, _, env3done, _ = env3.step(np.float32([env3theta]))
            env3theta = solveCycloid(env3.start_position,
                                     [env3.state[0], env3.state[1]])
        """
        if toRender["rl"] and not env5done:
            line = fin.readline()
            if line:
                env0theta = [float(line)]
                _, _, env0done, _ = env5.step(np.float32([env5theta]))
            else:
                env0done = True
        """
        if toRender["rl"] and not env4done:
            if i >= len(env4list):
                continue
            env4theta = env4list[i]
            _, _, env4done, _ = env4.step(np.float32([env4theta]))
toRender["random"] and env0.close() toRender["line"] and env1.close() toRender["circle"] and env2.close() toRender["cycloid"] and env3.close() if toRender["rl"]: pts = env4.path print(pts) coeffs = polyfit(pts) env4.close() return
def __call__(self, step_limit, solution=None, stamp=None, record=False):
    logger.info("Playing game %s with step_limit %d", self.game, step_limit)
    with torch.no_grad():
        controller = Controller(self.game, self.models_dir)
        if solution is not None:
            controller.load_solution(solution)
        else:
            controller.load_state(stamp)
        vae = VAE(self.game, self.models_dir)
        vae.load_state()
        mdn_rnn = MDN_RNN(self.game, self.models_dir)
        mdn_rnn.load_state()

        env = gym.make(self.game.key)
        if self.game.wrapper is not None:
            env = self.game.wrapper(env)
        if record:
            env = Monitor(env, "monitor", force=True)

        # Encode the first frame and prime the world model's hidden state.
        action = torch.zeros(self.game.action_vector_size)
        screen = env.reset()
        screen = transform(screen)
        screen.unsqueeze_(0)
        z, _, _ = vae.encoder(screen)
        _, _, _, h = mdn_rnn(z.unsqueeze(0), action.unsqueeze(0).unsqueeze(0))
        # h = torch.tensor([[[0] * 256]], dtype=torch.float32)

        overall_reward = 0
        steps = 0
        while True:
            env.render()
            # Controller maps latent state z and hidden state h to an action.
            action = controller(z.squeeze(0).squeeze(0), h.squeeze(0).squeeze(0))
            actual_action = self.game.transform_action(action.detach().numpy())
            screen, reward, done, _ = env.step(actual_action)
            overall_reward += reward
            screen = transform(screen)
            screen.unsqueeze_(0)
            z, _, _ = vae.encoder(screen)
            _, _, _, h = mdn_rnn(z.unsqueeze(0), action.unsqueeze(0).unsqueeze(0))
            if done or (step_limit and steps >= step_limit):
                if done:
                    logger.info("Game reached done")
                else:
                    logger.info("Step limit reached")
                break
            steps += 1
        env.close()
    # Transform reward to be useful to CMA-ES.
    overall_reward = self.game.transform_overall_reward(overall_reward)
    logger.info("Game %s finished with reward %d", self.game.key, overall_reward)
    return overall_reward
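# Hypothetical usage of the __call__ above. The enclosing class name and its
# constructor signature are not shown in this excerpt, so GamePlayer, game,
# and models_dir below are assumed names:
player = GamePlayer(game, models_dir)
reward = player(step_limit=1000, record=True)  # record=True wraps env in Monitor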
from Controller import Controller
from car_racing import CarRacing
from gym.wrappers.monitor import Monitor

C = Controller()
for weights in [BEST_CONTROLLER_WEIGHTS, OPTIMAL_CONTROLLER_WEIGHTS]:
    ENV = Monitor(CarRacing(), f'{weights[:-5]}_SIM', force=True)
    try:
        C.load_parameters(weights)
    except Exception:
        raise Exception('Train the Controller first.')
    done = False
    steps = 0
    observation = ENV.reset()
    reward_FULL = 0
    while not done and steps < MAX_STEPS:
        ENV.render()
        action = C.get_action(observation)
        observation, reward, done, _ = ENV.step(action)
        reward_FULL += reward
        steps += 1
    ENV.close()
    print(f'{weights} Reward: {reward_FULL}')
'''
Defining the simulation related constants
'''
NUM_EPISODES = 50000
MAX_T = np.prod(MAZE_SIZE, dtype=int) * 100
STREAK_TO_END = 100
SOLVED_T = np.prod(MAZE_SIZE, dtype=int)
DEBUG_MODE = 0
RENDER_MAZE = True
ENABLE_RECORDING = True

'''
Creating a Q-Table for each state-action pair
'''
q_table = np.zeros(NUM_BUCKETS + (NUM_ACTIONS,), dtype=float)

'''
Begin simulation
'''
recording_folder = "/tmp/maze_q_learning"

if ENABLE_RECORDING:
    monitor = Monitor(env, recording_folder, force=True)  # env.monitor.start(recording_folder, force=True)

simulate()

if ENABLE_RECORDING:
    monitor.close()  # env.monitor.close()
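# simulate() is not shown in this excerpt. A minimal tabular Q-learning loop
# consistent with the q_table shape above might look like the sketch below;
# state_to_bucket and the hyperparameter values are assumptions, not the
# source's code:
def simulate():
    learning_rate, discount, epsilon = 0.1, 0.99, 0.1  # assumed hyperparameters
    for episode in range(NUM_EPISODES):
        obs = env.reset()
        state = state_to_bucket(obs)  # assumed discretization helper -> tuple
        for t in range(MAX_T):
            # Epsilon-greedy action selection over the Q-table.
            if np.random.random() < epsilon:
                action = env.action_space.sample()
            else:
                action = int(np.argmax(q_table[state]))
            obs, reward, done, _ = env.step(action)
            next_state = state_to_bucket(obs)
            # One-step Bellman update.
            best_q = np.amax(q_table[next_state])
            q_table[state + (action,)] += learning_rate * (
                reward + discount * best_q - q_table[state + (action,)])
            state = next_state
            if done:
                break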