                           np.expand_dims(next_state, 2), axis=2)
    # Save the transition to the replay memory while initializing it
    replay_memory.append(Transition(state, action, reward, next_state, done))
    if done:
        state = env.reset()
        state = pre_proc(state)
        state = np.stack([state] * 4, axis=2)
    else:
        state = next_state

print('Initialize replay buffer: done!')

# Record videos
env = Monitor(env,
              directory=monitor_path,
              resume=True,
              video_callable=lambda count: count % record_video_every == 0)

total_t = 0
for i_episode in range(num_episodes):
    loss = None
    state = env.reset()
    state = pre_proc(state)
    state = np.stack([state] * 4, axis=2)

    # One step in the environment
    for t in itertools.count():
        # Choose an action epsilon-greedily (mostly random until learning starts)
        epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]
        action = select_epilson_greedy_action(q_estimator, state, epsilon)
        next_state, reward, done, _ = env.step(action)
        # clip rewards between -1 and 1?
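        # A possible answer to the clipping question above -- a minimal sketch,
        # assuming the usual DeepMind Atari convention of bounding per-step
        # rewards to [-1, 1] so TD targets share a common scale across games:
        reward = max(-1.0, min(float(reward), 1.0))  # equivalently: np.clip(reward, -1, 1)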
def record_sessions(env_id, agent, n_actions):
    env = Monitor(gym.make(env_id), directory='videos', force=True)
    for _ in range(100):
        generate_agent_session(env, agent, n_actions)
    env.close()
               config=vars(args), name=experiment_name,
               monitor_gym=True, save_code=True)
writer = SummaryWriter(f"/tmp/{experiment_name}")

# TRY NOT TO MODIFY: seeding
device = torch.device(
    'cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
env = gym.make(args.gym_id)
#env = wrap_atari(env)
env = ImgObsWrapper(env)
#env = gym.wrappers.RecordEpisodeStatistics(env)  # records episode reward in `info['episode']['r']`
if args.capture_video:
    env = Monitor(env, f'videos/{experiment_name}')
#env = wrap_deepmind(
#    env,
#    clip_rewards=True,
#    frame_stack=True,
#    scale=False,
#)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
env.seed(args.seed)
env.action_space.seed(args.seed)
env.observation_space.seed(args.seed)
# respect the default timelimit
def evaluate(self, n_games=1, save_path="./records", use_monitor=True,
             record_video=True, verbose=True, t_max=10000):
    """Plays an entire game start to end, records the logs (and possibly an mp4 video),
    and returns the rewards.

    :param save_path: where to save the report
    :param record_video: if True, records an mp4 video
    :return: list of total rewards, one per game
    """
    env = self.make_env()

    if not use_monitor and record_video:
        raise ValueError(
            "Cannot record video without the gym Monitor. "
            "If you still want video, set use_monitor to True")

    if record_video:
        env = Monitor(env, save_path, force=True)
    elif use_monitor:
        env = Monitor(env, save_path, video_callable=lambda i: False, force=True)

    game_rewards = []
    for _ in range(n_games):
        # initial observation
        observation = env.reset()
        # initial memory
        prev_memories = [
            np.zeros((1, ) + tuple(mem.output_shape[1:]),
                     dtype=get_layer_dtype(mem))
            for mem in self.agent.agent_states
        ]

        t = 0
        total_reward = 0
        while True:
            res = self.agent_step(
                self.preprocess_observation(observation)[None, ...],
                *prev_memories)
            action, new_memories = res[0], res[1:]

            observation, reward, done, info = env.step(action[0])
            total_reward += reward
            prev_memories = new_memories

            if done or t >= t_max:
                if verbose:
                    print("Episode finished after {} timesteps with reward={}"
                          .format(t + 1, total_reward))
                break
            t += 1

        game_rewards.append(total_reward)

    env.close()
    del env
    return game_rewards
model_path = utils.get_model_dir(args.model)

for test_mode in test_modes:
    # Generate environment
    if "_n" in args.env:
        env = gym.make(args.env, pairs_dict=pairs_dict,
                       test_instr_mode=test_mode, num_dists=args.num_dists)
    else:
        env = gym.make(args.env)

    demo_path = os.path.join(model_path, test_mode)
    env = Monitor(env, demo_path, _check_log_this, force=True)
    env.seed(args.seed)

    # Define agent
    agent = utils.load_agent(env=env, model_name=args.model, argmax=args.argmax,
                             env_name=args.env, instr_arch=args.instr_arch)
    utils.seed(args.seed)

    print('\n')
    print(f'=== EVALUATING MODE: {test_mode} ===')

    # Run the agent
    done = False
import gym
from gym.wrappers import Monitor
from pyvirtualdisplay import Display

display = Display(visible=0, size=(1400, 900))
display.start()

env = Monitor(gym.make('CartPole-v0'), './video', force=True)
env.reset()

done = False
while not done:
    env.render()
    action = env.action_space.sample()
    _, _, done, _ = env.step(action)

env.close()
display.stop()
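# Optional follow-up (an assumption, not part of the snippet above): once Monitor has
# written an mp4 under ./video, it can be played back inline in a Jupyter notebook.
import base64
import glob
import io
from IPython.display import HTML

mp4_path = sorted(glob.glob('./video/*.mp4'))[-1]  # most recent recording
encoded = base64.b64encode(io.open(mp4_path, 'rb').read()).decode('ascii')
HTML('<video width="400" controls>'
     '<source src="data:video/mp4;base64,{}" type="video/mp4">'
     '</video>'.format(encoded))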
def main_test(id): config(id) env = gym.make(id) env = env.unwrapped dqn = MyDQN(env) if id == 'CartPole-v0': T = 20000 else: T = 2000 count = 0 train_result = [] train_loss = [] for i in range(2000): observation = env.reset() for j in range(T): action = dqn.action(observation, i) new_observation, reward, done, info = env.step(action) if id == 'CartPole-v0': r1 = (env.x_threshold - abs(new_observation[0])) / env.x_threshold - 0.8 r2 = (env.theta_threshold_radians - abs( new_observation[2])) / env.theta_threshold_radians - 0.5 reward = r1 + r2 '''if j<2000: reward=-200''' elif done: reward = 100 dqn.perceive(observation, action, reward, new_observation, done) observation = new_observation if done == False and j != T - 1: continue train_result.append(j) if id == 'CartPole-v0': if done or j == T - 1: if j > 5000: count += 1 else: count = 0 print(i, j) break elif id == 'MountainCar-v0': print(i, j) if done and j < 300: count += 1 else: count = 0 break else: print(i, j) if done and j < 300: count += 1 else: count = 0 break train_loss.append(dqn.get_loss() / train_result[-1]) if id == 'CartPole-v0' and count >= 5: break if id != 'CartPole-v0' and count >= 200: break print(train_loss) print(train_result) plt.plot(train_loss) plt.xlabel("round") plt.ylabel("loss") plt.show() if id != 'CartPole-v0': train_result = -np.array(train_result) plt.plot(train_result) plt.xlabel("round") plt.ylabel("reward") plt.show() if RECORD: env = Monitor(env, './cartpole-experiment-0201', force=True) observation = env.reset() for j in range(T): #env.render() action = dqn.best_action(observation) observation, reward, done, info = env.step(action) env.close() result = [] for i in range(200): observation = env.reset() for j in range(T): #env.render() action = dqn.best_action(observation) observation, reward, done, info = env.step(action) if done or j == T - 1: print("test", j + 1) result.append(j + 1) break result = np.array(result) if id != 'CartPole-v0': result = -result plt.plot(result) plt.xlabel("round") plt.ylabel("reward") plt.show() print("mean", np.mean(result)) print("var", np.std(result)) print("len", len(result))
def main():
    global RENDER_DELAY

    assert len(sys.argv) > 1, 'python model.py gamename path_to_mode.json'

    gamename = sys.argv[1]

    if gamename.startswith("bullet"):
        RENDER_DELAY = True

    use_model = False

    game = config.games[gamename]

    if len(sys.argv) > 2:
        use_model = True
        filename = sys.argv[2]
        print("filename", filename)

    the_seed = 0
    if len(sys.argv) > 3:
        the_seed = int(sys.argv[3])
        print("seed", the_seed)

    model = make_model(game)
    print('model size', model.param_count)

    model.make_env(render_mode=render_mode)

    if use_model:
        model.load_model(filename)
    else:
        params = model.get_random_model_params(stdev=0.1)
        model.set_model_params(params)

    if final_mode:
        np.random.seed(the_seed)
        model.env.seed(the_seed)

        rewards = []
        for i in range(100):
            reward, steps_taken = simulate(model, train_mode=False,
                                           render_mode=False, num_episode=1)
            print(i, reward)
            rewards.append(reward[0])
        print("seed", the_seed, "average_reward", np.mean(rewards),
              "standard_deviation", np.std(rewards))
    else:
        if record_video:
            model.env = Monitor(model.env,
                                directory='/tmp/' + gamename,
                                video_callable=lambda episode_id: True,
                                write_upon_reset=True,
                                force=True)
        while True:
            reward, steps_taken = simulate(model, train_mode=False,
                                           render_mode=render_mode, num_episode=1)
            print("terminal reward", reward, "average steps taken",
                  np.mean(steps_taken) + 1)
    output = np.squeeze(output, axis=0)
    stochastic_action = output + noise_process.sample()
    # bound to the environment's action space
    bounded = np.clip(stochastic_action, action_space.low, action_space.high)
    return bounded


if __name__ == "__main__":
    tf.logging.info(
        "@@@ start ddpg training gym_bipedal_walker_v2 @@@ start time:{}".format(
            time.ctime()))

    # Generate the environment
    train_env = gym.make(id='BipedalWalker-v2')
    eval_monitor = Monitor(gym.make(id='BipedalWalker-v2'),
                           directory=DDPG_CFG.eval_monitor_dir,
                           video_callable=lambda x: False,
                           resume=True)

    mu = np.array([0.0, 0.0, 0.0, 0.0])
    # x0 = np.array([0, 0.5, -0.1])
    theta = np.array([0.15, 0.15, 0.15, 0.15])
    sigma = np.array([0.3, 0.3, 0.3, 0.3])
    # x0 = np.array([0.1, 0.3, 0.1])
    # TODO: greedy acceleration in the beginning
    x0 = np.array([-0.2, 0.2, 0.2, 0.2])
    noise_process = UO_Process(mu=mu, x0=x0, theta=theta, sigma=sigma, dt=1e-2)
def wrap_env(env):
    env = Monitor(env, './video', force=True)
    return env
def wrap_env(env):
    # wrapper for recording episodes to ./video
    env = Monitor(env, './video', force=True)
    return env
        indices = np.arange(self.batch_size)
        action_state_value[indices, actions] = next_action_state_value
        self.model.fit(states, action_state_value, epochs=1, verbose=0)


####################################################################################################
# Run
File_Epsilon = open(str(FILE_EPSILON), 'a+')
File_Rewards = open(str(FILE_REWARDS), 'a+')

env = gym.make('LunarLander-v2')
if RECORD:
    env = Monitor(env=env, directory=PATH_VIDEO, force=True)
env.seed(0)

action_space = env.action_space.n
state_space = env.observation_space.shape[0]
agent = Agent(action_space, state_space)
if path.exists(PATH_WEIGHTS):
    agent.model.load_weights(PATH_WEIGHTS)

rewards = []
for episode in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, (1, state_space))
    score = 0
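# Minimal numpy sketch (illustration only) of the fancy-indexing assignment used in
# the fit step above: one Q-value is overwritten per sample, at that sample's action.
import numpy as np

q = np.zeros((3, 4))                    # batch of 3 states, 4 actions each
rows = np.arange(3)                     # one row index per sample
acts = np.array([1, 0, 3])              # action taken in each sample
targets = np.array([0.5, -1.0, 2.0])    # TD targets for those actions
q[rows, acts] = targets                 # sets q[0, 1]=0.5, q[1, 0]=-1.0, q[2, 3]=2.0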
def main(argv=()): del argv # Unused. # Build an environment # Create and record episode - remove Monitor statement if recording not desired env = Monitor(gym.make('one-random-evader-v0'), './tmp/pursuit_evasion_infer_pursuer_vs_random_evader', force=True) #Reset state state = env.reset() #Initialize Agent Parameters #Get observed state space observed_state_space = env.get_observed_state_space() #Set initial state distribution initial_state_dist = [] initial_state = env.get_initial_state() for state in observed_state_space: if state == initial_state: initial_state_dist.append(1) else: initial_state_dist.append(0) #Get action space action_space = range(0, env.action_space.n) #Set action prior to uniform dist action_prior = [] for action in action_space: action_prior.append(1/len(action_space)) #Get reward function reward_function = env.get_reward_function() #Get transition function transition_function = env.get_transition_function() #Set max trajectory length max_trajectory_length = 11 #needs to be greater than shortest distance to evader for any meaningful inference #Create Agent agent = infer.DiceInferenceEngine(observed_state_space, action_space, initial_state_dist, action_prior, reward_function, transition_function, max_trajectory_length) print("\nAgent created.\n") #Set current observed state to initial state uncolored_obs = initial_state #Initialize actions list actions = [] print("\nInfering action " + str(0) + "\n") actions.append(dist.Categorical(torch.tensor(agent.next(uncolored_obs))).sample().item()) #Game Loop for t in range(0, 11): #Render env.render() #Delay to make video easier to watch #sleep(5) #Take action and get observations, rewards, termination from environment observation, reward, done, info = env.step(actions[t]) #If termination signal received, break out of loop if done: break #Pick next action based on agent's reasoning uncolored_obs = env.uncolor_board(observation) print("\nInfering action " + str(t + 1) + "\n") actions.append(dist.Categorical(torch.tensor(agent.next(uncolored_obs))).sample().item()) env.close()
def main(): global RENDER_DELAY parser = argparse.ArgumentParser( description=('Train policy on OpenAI Gym environment ' 'using pepg, ses, openes, ga, cma')) parser.add_argument('gamename', type=str, help='robo_pendulum, robo_ant, robo_humanoid, etc.') parser.add_argument('-f', '--filename', type=str, help='json filename', default='none') parser.add_argument('-e', '--eval_steps', type=int, default=100, help='evaluate this number of step if final_mode') parser.add_argument('-s', '--seed_start', type=int, default=0, help='initial seed') parser.add_argument('-w', '--single_weight', type=float, default=-100, help='single weight parameter') parser.add_argument('--stdev', type=float, default=2.0, help='standard deviation for weights') parser.add_argument( '--sweep', type=int, default=-1, help='sweep a set of weights from -2.0 to 2.0 sweep times.') parser.add_argument('--lo', type=float, default=-2.0, help='slow side of sweep.') parser.add_argument('--hi', type=float, default=2.0, help='high side of sweep.') args = parser.parse_args() assert len(sys.argv) > 1, 'python model.py gamename path_to_mode.json' gamename = args.gamename use_model = False game = config.games[gamename] filename = args.filename if filename != "none": use_model = True print("filename", filename) the_seed = args.seed_start model = make_model(game) print('model size', model.param_count) eval_steps = args.eval_steps single_weight = args.single_weight weight_stdev = args.stdev num_sweep = args.sweep sweep_lo = args.lo sweep_hi = args.hi model.make_env(render_mode=render_mode) if use_model: model.load_model(filename) else: if single_weight > -100: params = model.get_single_model_params( weight=single_weight - game.weight_bias) # REMEMBER TO UNBIAS print("single weight value set to", single_weight) else: params = model.get_uniform_random_model_params( stdev=weight_stdev) - game.weight_bias model.set_model_params(params) if final_mode: if num_sweep > 1: the_weights = np.arange( sweep_lo, sweep_hi + (sweep_hi - sweep_lo) / num_sweep, (sweep_hi - sweep_lo) / num_sweep) for i in range(len(the_weights)): the_weight = the_weights[i] params = model.get_single_model_params( weight=the_weight - game.weight_bias) # REMEMBER TO UNBIAS model.set_model_params(params) rewards = [] for i in range(eval_steps): reward, steps_taken = simulate(model, train_mode=False, render_mode=False, num_episode=1, seed=the_seed + i) rewards.append(reward[0]) print("weight", the_weight, "average_reward", np.mean(rewards), "standard_deviation", np.std(rewards)) else: rewards = [] for i in range(eval_steps): ''' random uniform params params = model.get_uniform_random_model_params(stdev=weight_stdev)-game.weight_bias model.set_model_params(params) ''' reward, steps_taken = simulate(model, train_mode=False, render_mode=False, num_episode=1, seed=the_seed + i) print(i, reward) rewards.append(reward[0]) print("seed", the_seed, "average_reward", np.mean(rewards), "standard_deviation", np.std(rewards)) else: if record_video: model.env = Monitor(model.env, directory='/tmp/' + gamename, video_callable=lambda episode_id: True, write_upon_reset=True, force=True) for i in range(1): reward, steps_taken = simulate(model, train_mode=False, render_mode=render_mode, num_episode=1, seed=the_seed + i) print("terminal reward", reward, "average steps taken", np.mean(steps_taken) + 1)
def deep_q_learning(sess, env, q_estimator, target_estimator, state_processor, num_episodes, experiment_dir, replay_memory_size=500000, replay_memory_init_size=50000, update_target_estimator_every=10000, discount_factor=0.99, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=500000, batch_size=32, record_video_every=50): Transition = namedtuple( "Transition", ["state", "action", "reward", "next_state", "done"]) # The replay memory replay_memory = [] # Make model copier object estimator_copy = ModelParametersCopier(q_estimator, target_estimator) # Keeps track of useful statistics stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) # For 'system/' summaries, usefull to check if currrent process looks healthy current_process = psutil.Process() # Create directories for checkpoints and summaries checkpoint_dir = os.path.join(experiment_dir, "checkpoints") checkpoint_path = os.path.join(checkpoint_dir, "model") monitor_path = os.path.join(experiment_dir, "monitor") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) if not os.path.exists(monitor_path): os.makedirs(monitor_path) saver = tf.train.Saver() # Load a previous checkpoint if we find one latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir) if latest_checkpoint: print("Loading model checkpoint {}...\n".format(latest_checkpoint)) saver.restore(sess, latest_checkpoint) # Get the current time step total_t = sess.run(tf.contrib.framework.get_global_step()) # The epsilon decay schedule epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps) # The policy we're following policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS)) # Populate the replay memory with initial experience print("Populating replay memory...") state = env.reset() state = state_processor.process(sess, state) state = np.stack([state] * 4, axis=2) for i in range(replay_memory_init_size): action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps - 1)]) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) next_state = state_processor.process(sess, next_state) next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2) replay_memory.append( Transition(state, action, reward, next_state, done)) if done: state = env.reset() state = state_processor.process(sess, state) state = np.stack([state] * 4, axis=2) else: state = next_state # Record videos # Add env Monitor wrapper env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % record_video_every == 0, resume=True) for i_episode in range(num_episodes): # Save the current checkpoint saver.save(tf.get_default_session(), checkpoint_path) # Reset the environment state = env.reset() state = state_processor.process(sess, state) state = np.stack([state] * 4, axis=2) loss = None # One step in the environment for t in itertools.count(): # Epsilon for this time step epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)] # Maybe update the target estimator if total_t % update_target_estimator_every == 0: estimator_copy.make(sess) print("\nCopied model parameters to target network.") # Print out which step we're on, useful for debugging. 
print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format( t, total_t, i_episode + 1, num_episodes, loss), end="") sys.stdout.flush() # Take a step action_probs = policy(sess, state, epsilon) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) next_state = state_processor.process(sess, next_state) next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2) # If our replay memory is full, pop the first element if len(replay_memory) == replay_memory_size: replay_memory.pop(0) # Save transition to replay memory replay_memory.append( Transition(state, action, reward, next_state, done)) # Update statistics stats.episode_rewards[i_episode] += reward stats.episode_lengths[i_episode] = t # Sample a minibatch from the replay memory samples = random.sample(replay_memory, batch_size) states_batch, action_batch, reward_batch, next_states_batch, done_batch = map( np.array, zip(*samples)) # Calculate q values and targets q_values_next = target_estimator.predict(sess, next_states_batch) targets_batch = reward_batch + np.invert(done_batch).astype( np.float32) * discount_factor * np.amax(q_values_next, axis=1) # Perform gradient descent update states_batch = np.array(states_batch) loss = q_estimator.update(sess, states_batch, action_batch, targets_batch) if done: break state = next_state total_t += 1 # Add summaries to tensorboard episode_summary = tf.Summary() episode_summary.value.add(simple_value=epsilon, tag="episode/epsilon") episode_summary.value.add( simple_value=stats.episode_rewards[i_episode], tag="episode/reward") episode_summary.value.add( simple_value=stats.episode_lengths[i_episode], tag="episode/length") episode_summary.value.add(simple_value=current_process.cpu_percent(), tag="system/cpu_usage_percent") episode_summary.value.add( simple_value=current_process.memory_percent(memtype="vms"), tag="system/v_memeory_usage_percent") q_estimator.summary_writer.add_summary(episode_summary, i_episode) q_estimator.summary_writer.flush() yield total_t, plotting.EpisodeStats( episode_lengths=stats.episode_lengths[:i_episode + 1], episode_rewards=stats.episode_rewards[:i_episode + 1]) return stats
def run(seed, episodes, evaluation_episodes, batch_size, gamma, inverting_gradients, initial_memory_threshold, replay_memory_size, epsilon_steps, epsilon_final, tau_actor, tau_actor_param, use_ornstein_noise, learning_rate_actor, learning_rate_actor_param, reward_scale, clip_grad, title, scale_actions, zero_index_gradients, split, layers, multipass, indexed, weighted, average, random_weighted, render_freq, action_input_layer, initialise_params, save_freq, save_dir, save_frames, visualise): env = gym.make('Goal-v0') env = GoalObservationWrapper(env) if save_freq > 0 and save_dir: save_dir = os.path.join(save_dir, title + "{}".format(str(seed))) os.makedirs(save_dir, exist_ok=True) assert not (save_frames and visualise) if visualise: assert render_freq > 0 if save_frames: assert render_freq > 0 vidir = os.path.join(save_dir, "frames") os.makedirs(vidir, exist_ok=True) if scale_actions: kickto_weights = np.array([[-0.375, 0.5, 0, 0.0625, 0], [0, 0, 0.8333333333333333333, 0, 0.111111111111111111111111]]) shoot_goal_left_weights = np.array([0.857346647646219686, 0]) shoot_goal_right_weights = np.array([-0.857346647646219686, 0]) else: xfear = 50.0 / PITCH_LENGTH yfear = 50.0 / PITCH_WIDTH caution = 5.0 / PITCH_WIDTH kickto_weights = np.array([[2.5, 1, 0, xfear, 0], [0, 0, 1 - caution, 0, yfear]]) shoot_goal_left_weights = np.array([GOAL_WIDTH / 2 - 1, 0]) shoot_goal_right_weights = np.array([-GOAL_WIDTH / 2 + 1, 0]) initial_weights = np.zeros((4, 17)) initial_weights[0, [10, 11, 14, 15]] = kickto_weights[0, 1:] initial_weights[1, [10, 11, 14, 15]] = kickto_weights[1, 1:] initial_weights[2, 16] = shoot_goal_left_weights[1] initial_weights[3, 16] = shoot_goal_right_weights[1] initial_bias = np.zeros((4,)) initial_bias[0] = kickto_weights[0, 0] initial_bias[1] = kickto_weights[1, 0] initial_bias[2] = shoot_goal_left_weights[0] initial_bias[3] = shoot_goal_right_weights[0] if not scale_actions: # rescale initial action-parameters for a scaled state space for a in range(env.action_space.spaces[0].n): mid = (env.observation_space.spaces[0].high + env.observation_space.spaces[0].low) / 2. 
initial_bias[a] += np.sum(initial_weights[a] * mid) initial_weights[a] = initial_weights[a]*env.observation_space.spaces[0].high - initial_weights[a] * mid env = GoalFlattenedActionWrapper(env) if scale_actions: env = ScaledParameterisedActionWrapper(env) env = ScaledStateWrapper(env) dir = os.path.join(save_dir, title) env = Monitor(env, directory=os.path.join(dir, str(seed)), video_callable=False, write_upon_reset=False, force=True) env.seed(seed) np.random.seed(seed) assert not (split and multipass) agent_class = PDQNAgent if split: agent_class = SplitPDQNAgent elif multipass: agent_class = MultiPassPDQNAgent agent = agent_class( observation_space=env.observation_space.spaces[0], action_space=env.action_space, batch_size=batch_size, learning_rate_actor=learning_rate_actor, # 0.0001 learning_rate_actor_param=learning_rate_actor_param, # 0.001 epsilon_steps=epsilon_steps, epsilon_final=epsilon_final, gamma=gamma, clip_grad=clip_grad, indexed=indexed, average=average, random_weighted=random_weighted, tau_actor=tau_actor, weighted=weighted, tau_actor_param=tau_actor_param, initial_memory_threshold=initial_memory_threshold, use_ornstein_noise=use_ornstein_noise, replay_memory_size=replay_memory_size, inverting_gradients=inverting_gradients, actor_kwargs={'hidden_layers': layers, 'output_layer_init_std': 1e-5, 'action_input_layer': action_input_layer,}, actor_param_kwargs={'hidden_layers': layers, 'output_layer_init_std': 1e-5, 'squashing_function': False}, zero_index_gradients=zero_index_gradients, seed=seed) if initialise_params: agent.set_action_parameter_passthrough_weights(initial_weights, initial_bias) print(agent) max_steps = 150 total_reward = 0. returns = [] start_time = time.time() video_index = 0 for i in range(episodes): if save_freq > 0 and save_dir and i % save_freq == 0: agent.save_models(os.path.join(save_dir, str(i))) state, _ = env.reset() state = np.array(state, dtype=np.float32, copy=False) act, act_param, all_action_parameters = agent.act(state) action = pad_action(act, act_param) if visualise and i % render_freq == 0: env.render() episode_reward = 0. agent.start_episode() for j in range(max_steps): ret = env.step(action) (next_state, steps), reward, terminal, _ = ret next_state = np.array(next_state, dtype=np.float32, copy=False) next_act, next_act_param, next_all_action_parameters = agent.act(next_state) next_action = pad_action(next_act, next_act_param) r = reward * reward_scale agent.step(state, (act, all_action_parameters), r, next_state, (next_act, next_all_action_parameters), terminal, steps) act, act_param, all_action_parameters = next_act, next_act_param, next_all_action_parameters action = next_action state = next_state episode_reward += reward if visualise and i % render_freq == 0: env.render() if terminal: break agent.end_episode() if save_frames: video_index = env.unwrapped.save_render_states(vidir, title, video_index) returns.append(episode_reward) total_reward += episode_reward if (i + 1) % 100 == 0: print('{0:5s} R:{1:.5f} P(S):{2:.4f}'.format(str(i + 1), total_reward / (i + 1), (np.array(returns) == 50.).sum() / len(returns))) end_time = time.time() print("Training took %.2f seconds" % (end_time - start_time)) env.close() if save_freq > 0 and save_dir: agent.save_models(os.path.join(save_dir, str(i))) returns = env.get_episode_rewards() np.save(os.path.join(dir, title + "{}".format(str(seed))), returns) if evaluation_episodes > 0: print("Evaluating agent over {} episodes".format(evaluation_episodes)) agent.epsilon_final = 0. agent.epsilon = 0. 
        agent.noise = None
        evaluation_returns = evaluate(env, agent, evaluation_episodes)
        print("Ave. evaluation return =",
              sum(evaluation_returns) / len(evaluation_returns))
        print("Ave. evaluation prob. =",
              sum(evaluation_returns == 50.) / len(evaluation_returns))
        np.save(os.path.join(dir, title + "{}e".format(str(seed))), evaluation_returns)
def deep_q_learning(sess, env, q_estimator, target_estimator, state_processor, num_episodes, experiment_dir, replay_memory_size=500000, replay_memory_init_size=50000, update_target_estimator_every=10000, discount_factor=0.99, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=500000, batch_size=32, record_video_every=50): """ Q-Learning algorithm for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: sess: Tensorflow Session object env: OpenAI environment q_estimator: Estimator object used for the q values target_estimator: Estimator object used for the targets state_processor: A StateProcessor object num_episodes: Number of episodes to run for experiment_dir: Directory to save Tensorflow summaries in replay_memory_size: Size of the replay memory replay_memory_init_size: Number of random experiences to sampel when initializing the reply memory. update_target_estimator_every: Copy parameters from the Q estimator to the target estimator every N steps discount_factor: Gamma discount factor epsilon_start: Chance to sample a random action when taking an action. Epsilon is decayed over time and this is the start value epsilon_end: The final minimum value of epsilon after decaying is done epsilon_decay_steps: Number of steps to decay epsilon over batch_size: Size of batches to sample from the replay memory record_video_every: Record a video every N episodes Returns: An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards. """ Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"]) # The replay memory replay_memory = [] # Keeps track of useful statistics stats = plotting.EpisodeStats( episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes)) # Create directories for checkpoints and summaries checkpoint_dir = os.path.join(experiment_dir, "checkpoints") checkpoint_path = os.path.join(checkpoint_dir, "model") monitor_path = os.path.join(experiment_dir, "monitor") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) if not os.path.exists(monitor_path): os.makedirs(monitor_path) saver = tf.train.Saver() # Load a previous checkpoint if we find one latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir) if latest_checkpoint: print("Loading model checkpoint {}...\n".format(latest_checkpoint)) saver.restore(sess, latest_checkpoint) # Get the current time step total_t = sess.run(tf.contrib.framework.get_global_step()) # The epsilon decay schedule epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps) # The policy we're following policy = make_epsilon_greedy_policy( q_estimator, len(VALID_ACTIONS)) # Populate the replay memory with initial experience print("Populating replay memory...") state = env.reset() state = state_processor.process(sess, state) state = np.stack([state] * 4, axis=2) for i in range(replay_memory_init_size): action_probs = policy(sess, state, epsilons[total_t]) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) next_state = state_processor.process(sess, next_state) next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2) replay_memory.append(Transition(state, action, reward, next_state, done)) if done: state = env.reset() state = state_processor.process(sess, state) state = np.stack([state] * 4, axis=2) else: state = next_state # # Record videos env = Monitor(env, monitor_path, 
resume=True, video_callable=lambda count: count % record_video_every == 0) for i_episode in range(num_episodes): # Save the current checkpoint saver.save(tf.get_default_session(), checkpoint_path) # Reset the environment state = env.reset() state = state_processor.process(sess, state) state = np.stack([state] * 4, axis=2) loss = None # One step in the environment for t in itertools.count(): # Epsilon for this time step epsilon = epsilons[min(total_t, epsilon_decay_steps-1)] # Add epsilon to Tensorboard episode_summary = tf.Summary() episode_summary.value.add(simple_value=epsilon, tag="epsilon") q_estimator.summary_writer.add_summary(episode_summary, total_t) # Maybe update the target estimator if total_t % update_target_estimator_every == 0: copy_model_parameters(sess, q_estimator, target_estimator) print("\nCopied model parameters to target network.") # Print out which step we're on, useful for debugging. print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format( t, total_t, i_episode + 1, num_episodes, loss), end="") sys.stdout.flush() # Take a step action_probs = policy(sess, state, epsilon) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) next_state, reward, done, _ = env.step(VALID_ACTIONS[action]) next_state = state_processor.process(sess, next_state) next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2) # If our replay memory is full, pop the first element if len(replay_memory) == replay_memory_size: replay_memory.pop(0) # Save transition to replay memory replay_memory.append(Transition(state, action, reward, next_state, done)) # Update statistics stats.episode_rewards[i_episode] += reward stats.episode_lengths[i_episode] = t # Sample a minibatch from the replay memory samples = random.sample(replay_memory, batch_size) states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples)) # Calculate q values and targets # This is where Double Q-Learning comes in! q_values_next = q_estimator.predict(sess, next_states_batch) best_actions = np.argmax(q_values_next, axis=1) q_values_next_target = target_estimator.predict(sess, next_states_batch) targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \ discount_factor * q_values_next_target[np.arange(batch_size), best_actions] # Perform gradient descent update states_batch = np.array(states_batch) loss = q_estimator.update(sess, states_batch, action_batch, targets_batch) if done: break state = next_state total_t += 1 # Add summaries to tensorboard episode_summary = tf.Summary() episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward") episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name="episode_length", tag="episode_length") q_estimator.summary_writer.add_summary(episode_summary, total_t) q_estimator.summary_writer.flush() yield total_t, plotting.EpisodeStats( episode_lengths=stats.episode_lengths[:i_episode+1], episode_rewards=stats.episode_rewards[:i_episode+1]) # env.monitor.close() return stats
def record_videos(env, path="videos"):
    return Monitor(env, path, force=True, video_callable=lambda episode: True)
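# Hypothetical usage of the helper above: because video_callable always returns True,
# every episode gets recorded. The environment name is only an example.
import gym

env = record_videos(gym.make("CartPole-v0"))
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
env.close()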
def wrap_env(self):
    self.env = Monitor(self.env, './video', force=True)
    return self.env
def main(_): with tf.Session() as sess: env = gym.make(ENV_NAME) # np.random.seed(RANDOM_SEED) tf.set_random_seed(RANDOM_SEED) env.seed(RANDOM_SEED) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] action_bound = env.action_space.high # Ensure action bound is symmetric #assert(env.action_space.high == -env.action_space.low) actor2 = ActorNetwork2(sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU) critic2 = CriticNetwork2(sess, state_dim, action_dim, action_bound, CRITIC_LEARNING_RATE, TAU, actor2.get_num_trainable_vars()) if GYM_MONITOR_EN: if not RENDER_ENV: env = Monitor(env, MONITOR_DIR, video_callable=False, force=True) else: env = Monitor(env, MONITOR_DIR, force=True) train(sess, env, actor2, critic2) # if UPLOAD_GYM_RESULTS: # gym.upload(MONITOR_DIR, api_key=GYM_API_KEY) # net_params = sess.run(actor.update_target_net_params) # f1 = open(ACTOR_DIR1,'w') # f2 = open(ACTOR_DIR2, 'w') # f3 = open(ACTOR_DIR3, 'w') # # #Network1 parameters storing # for i in range(400): # for j in range(4): # f1.write('%.8f \n' %net_params[0][j][i]) # f1.write('%.8f \n' %net_params[1][i]) # # # Network2 parameters storing # for i in range(300): # for j in range(400): # f2.write('%.8f \n' %net_params[2][j][i]) # f2.write('%.8f \n' %net_params[3][i]) # # # Network3 parameters storing # for i in range(1): # for j in range(300): # f3.write('%.8f \n' %net_params[4][j][i]) # f3.write('%.8f \n' %net_params[5][i]) plt.figure(1) plt.subplot(121) plt.title('Reward') plt.plot(REWARD) plt.subplot(122) plt.title('Qmax average') plt.plot(QMAX) plt.show()
def _play_randomly(env):
    env.reset()
    env.render(mode="human")
    done = False
    while not done:
        time.sleep(0.01)
        env.render(mode="human")
        obs, r, done, info = env.step(
            env.action_space.sample())  # take a random action
    env.close()


if __name__ == '__main__':
    args = parse_arguments()
    config = BreakoutConfiguration(
        brick_rows=args.rows,
        brick_cols=args.columns,
        fire_enabled=args.fire,
        ball_enabled=not args.disable_ball,
    )
    env = BreakoutDictSpace(config)
    if args.record:
        env = Monitor(env, args.output_dir)

    if args.random:
        _play_randomly(env)
    else:
        env.play()
import gym
from gym.wrappers import Monitor
from gym.scoreboard.scoring import score_from_local

from gym_numgrid.wrappers import *
from examples.agents import *

red = '\033[91m'
yellow = '\033[93m'
green = '\033[32m'
endc = '\033[0m'

numgrid = gym.make('NumGrid-v0')
numgrid = DirectionWrapper(numgrid)

experiment_path = '/tmp/numgrid-direction-random'
env = Monitor(numgrid, experiment_path, force=True)
agent = RandomAgent(env.action_space)

reward = 0
info = {}

for i_episode in range(env.spec.trials):
    print("\n********* EPISODE", i_episode, "**********\n")
    observation = env.reset()
    done = False
    while not done:
        env.render()
        action = agent.act(observation, reward, done, info)
        digit = numgrid.action(action)[0]
        color = ''
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    DQN algorithm with off-policy Temporal Difference control.

    Returns an EpisodeStats object with two numpy arrays, episode_lengths and
    episode_rewards.
    """

    Transition = namedtuple("Transition",
                            ["state", "action", "reward", "next_state", "done"])

    replay_memory = []

    # useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    # get the current time step
    total_t = sess.run(tf.train.get_global_step())

    # epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # the epsilon-greedy policy we are following
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # load initial experience into replay memory
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        if i % 1000 == 0:
            print("iteration " + str(i))
        # according to the policy, create an action probability array
        action_probs = policy(sess, state,
                              epsilons[min(total_t, epsilon_decay_steps - 1)])
        # randomly select an action according to the action probs from the policy
        action = np.random.choice(np.arange(len(VALID_ACTIONS)), p=action_probs)
        # take a step in the environment
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        # process the image data
        next_state = state_processor.process(sess, next_state)
        next_state = np.append(state[:, :, 1:],
                               np.expand_dims(next_state, 2), axis=2)
        # add the transition to replay memory
        replay_memory.append(Transition(state, action, reward, next_state, done))
        if done:
            # the episode ended, start over
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            # otherwise continue from the next state
            state = next_state

    # record videos
    # add the env Monitor wrapper
    env = Monitor(env,
                  directory=monitor_path,
                  video_callable=lambda count: count % record_video_every == 0,
                  resume=True)

    for i_episode in range(num_episodes):
        # save the current checkpoint
        if i_episode % 100 == 0:
            print("episode: " + str(i_episode))
        saver.save(tf.get_default_session(), checkpoint_path)

        # reset the openAI environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None

        # main loop over steps within the episode
        for t in itertools.count():
            # epsilon for this timestep
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # add epsilon to tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)
            # maybe update the target estimator
            # update means copying parameters from the q estimator -> target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                t, total_t, i_episode + 1, num_episodes, loss), end="")
            sys.stdout.flush()

            # take the next step in the environment
            # similar to earlier when loading replay memory with the first step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(VALID_ACTIONS)), p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:, :, 1:],
                                   np.expand_dims(next_state, 2), axis=2)

            # if replay memory is full, pop the oldest transition
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # save the transition to replay memory
            replay_memory.append(Transition(state, action, reward, next_state, done))

            # update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(
                np.array, zip(*samples))

            # calculate q-values and targets
            # Q-learning target: r + gamma * max_a' Q_target(s', a')
            q_values_next = target_estimator.predict(sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
                discount_factor * np.max(q_values_next, axis=1)

            # gradient descent step on the q estimator
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch, targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode],
                                  node_name="episode_reward",
                                  tag="episode_reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode],
                                  node_name="episode_length",
                                  tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])

    return stats
    display.display(plt.gcf())


def loop(context, i):
    env, agent = context
    control = agent(env.state)
    _, reward, _, _ = env.step(control)
    show_state(env, step=i)
    return (env, agent), reward


# ILQR
agent = ILQR()
agent.train(Acrobot(horizon=10), 10)

# for-loop version
T = 75
env = Acrobot()
env = Monitor(env, './video', video_callable=lambda episode_id: True, force=True)
print(env.reset())

reward = 0
for i in range(T):
    (env, agent), r = loop((env, agent), i)
    reward += r

reward_forloop = reward
print('reward_forloop = ' + str(reward_forloop))
env.close()
if __name__ == '__main__':
    # You can optionally set up the logger. Also fine to set the level
    # to logging.DEBUG or logging.WARN if you want to change the
    # amount of output.
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    env = gym.make('FlappyBird-v0' if len(sys.argv) < 2 else sys.argv[1])

    # You provide the directory to write to (can be an existing
    # directory, including one with existing data -- all monitor files
    # will be namespaced). You can also dump to a tempdir if you'd
    # like: tempfile.mkdtemp().
    outdir = 'random-agent-results'
    env = Monitor(env, directory=outdir, force=True)

    # This declaration must go *after* the monitor call, since the
    # monitor's seeding creates a new action_space instance with the
    # appropriate pseudorandom number generator.
    env.seed(0)
    agent = RandomAgent(env.action_space)

    episode_count = 100
    reward = 0
    done = False

    for i in range(episode_count):
        ob = env.reset()
        # while True:
import retro
from gym.wrappers import Monitor
from sklearn import preprocessing
from atari_wrappers import FrameStack, EpisodicLifeEnv
from rl_baseline.mario_util import make_env
# raw_env = retro.make("SuperMarioBros-Nes")
from small_evo.wrappers import AutoRenderer

monitor = None
action_repeat = True
episodic_life = True
render = 2

env = retro.make("SuperMarioBros-Nes")
if monitor is not None:
    env = Monitor(env, monitor)
if render is not None:
    env = AutoRenderer(env, auto_render_period=render)
if action_repeat:
    env = FrameStack(env, 4)
if episodic_life:
    env = EpisodicLifeEnv(env, [0] * 9)
raw_env = env.unwrapped

# env = AllowBacktracking(make_env(stack=False, scale_rew=False))
first_obs = env.reset()
order = ["coins", "levelHi", "levelLo", "lives", "score", "scrolling",
         "time", "xscrollHi", "xscrollLo"]
index_right = raw_env.buttons.index("RIGHT")
index_a = raw_env.buttons.index("A")
index_b = raw_env.buttons.index("B")
infos = []
import gym
import gym_traffic
from gym.wrappers import Monitor
import time

env = gym.make('Traffic-Simple-gui-v0')

from tqdm import tqdm

monitor = False
# env = gym.make('Traffic-Simple-cli-v0')

# TODO: Change simulation step size
# TODO: Add more traffic flows
# TODO: Scene image generation

if monitor:
    env = Monitor(env, "output/traffic/simple/random", force=True)

for i_episode in tqdm(range(500)):
    observation = env.reset()
    total_reward = 0
    for t in tqdm(range(1000)):
        # env.render()
        # print(observation)
        # print "\n Observation: {}".format(observation)
        env = env.unwrapped
        action = env.action_from_ttc()
        # print "\n Action: {}".format(action)
        # time.sleep(1)
        observation, reward, done, info = env.step(action)
        total_reward += reward
        # print (observation)
        # print "---------------- Observations ----------------"
def train(path, env):
    env = Monitor(env, path, video_callable=video_callable, force=True)
    agent = Agent(env)
    agent.train()
    return agent
        return out


generator = RoombaMazeGenerator()
maze = Maze(generator)
print(maze.to_value())

motion = Motion()
motion.add('north', [-1, 0])
motion.add('south', [1, 0])
motion.add('west', [0, -1])
motion.add('east', [0, 1])

env = RoombaEnv(maze, motion)
img = env.render('rgb_array')
plt.imshow(img)
plt.show()

from gym.wrappers import Monitor
from mazelab.solvers import dijkstra_solver

actions = dijkstra_solver(np.array(env.maze.to_impassable()), env.motion,
                          env.state.positions[0], env.goal.positions[0])

env = Monitor(env, directory='./', force=True)
env.reset()
for action in actions:
    env.step(action)
env.close()
        acts.append(action)
        rews.append(reward)

    return obs, acts, rews


def process_rewards(rews):
    """Rewards -> Advantages for one episode."""
    # total reward: length of episode
    return [len(rews)] * len(rews)


monitor_dir = '/tmp/cartpole_exp1'
monitor = Monitor(env, monitor_dir, force=True)

sess.run(tf.global_variables_initializer())

b_obs, b_acts, b_rews = [], [], []

# for _ in range(eparams['ep_per_batch']):
obs, acts, rews = policy_rollout(env)

print('Episode steps: {}'.format(len(obs)))

b_obs.extend(obs)
b_acts.extend(acts)

advantages_rew = process_rewards(rews)
b_rews.extend(advantages_rew)