def main(args):
    assert BATCH_SIZE <= TRAIN_START <= REPLAY_BUFFER_SIZE
    assert TARGET_UPDATE_EVERY % UPDATE_EVERY == 0
    assert 84 % SIDE_BOXES == 0
    assert STRATEGY in ['final', 'future']
    print(args)
    env = make_atari('{}NoFrameskip-v4'.format(args.env))
    set_seed(env, args.seed)
    env_train = wrap_deepmind(env, frame_stack=True, episode_life=True, clip_rewards=True)
    if args.weights:
        model = load_or_create_model(env_train, args.model)
        print_weights(model)
    elif args.debug:
        env, model, target_model, batch = load_for_debug()
        fit_batch(env, model, target_model, batch)
    elif args.play:
        env = wrap_deepmind(env)
        play(env)
    else:
        env_eval = wrap_deepmind(env, frame_stack=True)
        model = load_or_create_model(env_train, args.model)
        if args.view or args.images or args.eval:
            evaluate(env_eval, model, args.view, args.images)
        else:
            max_steps = 100 if args.test else MAX_STEPS
            train(env_train, env_eval, model, max_steps, args.name)
            if args.test:
                filename = save_model(model, EVAL_STEPS, logdir='.', name='test')
                load_or_create_model(env_train, filename)

def __init__(self, params):
    self.env = atari_wrappers.wrap_deepmind(
        atari_wrappers.make_atari('SeaquestNoFrameskip-v4'), frame_stack=True)

    # Hyperparameters, with defaults used when a key is absent from params
    self.replay_memory_size = params.get('replay_memory', 10000)
    self.replay_memory = deque([], maxlen=self.replay_memory_size)
    self.n_steps = params.get('n_steps', 100000)
    self.training_start = params.get('training_start', 1000)
    self.training_interval = params.get('training_interval', 3)
    self.save_steps = params.get('save_steps', 50)
    self.copy_steps = params.get('copy_steps', 25)
    self.discount_rate = params.get('discount_rate', 0.95)
    self.skip_start = params.get('skip_start', 90)
    self.batch_size = params.get('batch_size', 50)
    self.iteration = params.get('iteration', 0)
    self.n_outputs = params.get('n_outputs', self.env.action_space.n)
    self.learning_rate = params.get('learning_rate', 0.001)

    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.x = tf.placeholder(tf.float32, shape=[None, 84, 84, 4], name="input_placeholder")
    self.x_action = tf.placeholder(tf.int32, shape=[None], name="x_action")
    self.y = tf.placeholder(tf.float32, [None, 1])

    # Set up the actor/critic Q-networks, the copy op, and the optimizer
    self.actor_q_values, actor_vars = self.dqn_network("q_network/actor")
    critic_q_values, self.critic_vars = self.dqn_network("q_network/critic")
    self.q_value = tf.reduce_sum(
        critic_q_values * tf.one_hot(self.x_action, self.n_outputs),
        axis=1, keep_dims=True)
    copy_ops = [actor_var.assign(self.critic_vars[var_name])
                for var_name, actor_var in actor_vars.items()]
    self.copy_critic_to_actor = tf.group(*copy_ops)
    self.train_op = self.training_op()

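# The constructor above defers the loss and optimizer to self.training_op(). Below is a
# minimal sketch of what such a method typically looks like; it only assumes the
# attributes defined in __init__ (self.y, self.q_value, self.learning_rate,
# self.global_step) and a TF1-style `tf`. The squared-error loss and the choice of
# AdamOptimizer are assumptions, not the original implementation.
def training_op(self):
    # Regress the critic's Q(s, a) toward the target placeholder self.y
    error = self.y - self.q_value
    loss = tf.reduce_mean(tf.square(error))
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    return optimizer.minimize(loss, global_step=self.global_step)
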
def initialize_env():
    env = atari_wrappers.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrappers.wrap_deepmind(env, clip_rewards=False, frame_stack=True,
                                       pytorch_img=True)
    agent = Agent(in_channels=4, action_size=18, seed=0)

    # Load the pretrained networks
    agent.qnetwork_target.load_model(
        torch.load('./data/dqn_Riverraid_qnetwork_target_state_dict.pth'))
    agent.qnetwork_local.load_model(
        torch.load('./data/dqn_Riverraid_local_model_state_dict.pth'))

    # Initialize the replay buffer with random-policy transitions
    while len(agent.memory) < BUFFER_INI:
        observation = env.reset()
        done = False
        while not done:
            action = random.sample(range(env.action_space.n), 1)[0]
            next_observation, reward, done, info = env.step(action)
            agent.memory.add(observation, action, reward, next_observation, done)
            observation = next_observation
    print("Replay Buffer Initialized")
    return env, agent

def _thunk():
    if env_id.startswith("dm"):
        _, domain, task = env_id.split('.')
        env = dm_control2gym.make(domain_name=domain, task_name=task)
    else:
        env = gym.make(env_id)
    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)
    env.seed(seed + rank)

    obs_shape = env.observation_space.shape
    if add_timestep and len(obs_shape) == 1 and str(env).find('TimeLimit') > -1:
        env = AddTimestep(env)

    if log_dir is not None:
        env = bench.Monitor(env, os.path.join(log_dir, str(rank)),
                            allow_early_resets=allow_early_resets)

    if is_atari:
        if new_wrapper:
            env = wrap_carl_full(env)
        else:
            env = wrap_deepmind(env)

    # If the input has shape (W, H, 3), wrap for PyTorch convolutions
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
        env = TransposeImage(env)

    return env

def evaluate(step, policy_net, device, env, n_actions, eps=0.05, num_episode=5):
    env = wrap_deepmind(env, clip_rewards=True)
    sa = m.ActionSelector(eps, eps, policy_net, EPS_DECAY, n_actions, device)
    e_rewards = []
    q = deque(maxlen=5)
    for i in range(num_episode):
        env.reset()
        e_reward = 0
        for _ in range(10):  # no-op starts
            n_frame, _, done, _ = env.step(0)
            n_frame = m.fp(n_frame)
            q.append(n_frame)
        while not done:
            state = torch.cat(list(q))[1:].unsqueeze(0)
            # Evaluation mode, so no training-time exploration schedule
            action, eps = sa.select_action(state, False)
            n_frame, reward, done, info = env.step(action)
            n_frame = m.fp(n_frame)
            q.append(n_frame)
            e_reward += reward
        e_rewards.append(e_reward)

    with open("file.txt", 'a') as f:
        f.write("%f, %d, %d\n" % (float(sum(e_rewards)) / float(num_episode),
                                  step, num_episode))

def worker(env_name, pipe, atari=False):
    if atari:
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(env_name),
                                           frame_stack=True, scale=True)
    else:
        env = gym.make(env_name)
    s = env.reset()
    reward = 0
    done = False
    try:
        while True:
            pipe.send((s, reward, done))
            cmd, data = pipe.recv()
            if cmd == 'step':
                if isinstance(env.action_space, Box):
                    data = np.clip(data, env.action_space.low, env.action_space.high)
                s, reward, done, _ = env.step(data)
            else:
                break
            if done:
                s = env.reset()
    finally:
        pipe.close()
        env.close()

def evaluate(step, eva_net, env, num_episode=15):
    env = wrap_deepmind(env)
    e_rewards = []
    for i in range(num_episode):
        img = env.reset()
        sum_r = 0
        done = False
        state_buffer = []
        for _ in range(5):
            state_buffer.append(img)
        s = state_buffer[1:5]
        while not done:
            a = myDQN.choose_action(s, train=False)
            img, r, done, info = env.step(a)
            sum_r += r
            state_buffer.pop(0)
            state_buffer.append(img)
            s = state_buffer[1:5]
        e_rewards.append(sum_r)

    with open("file.txt", 'a') as f:
        f.write("%f, %d, %d\n" % (float(sum(e_rewards)) / float(num_episode),
                                  step, num_episode))

def setup(self):
    main_args = Singleton_arger()['main']
    Singleton_logger.setup(main_args['result_dir'],
                           multi_process=main_args['multi_process'])
    Singleton_evaluator.setup(main_args['env'],
                              logger=Singleton_logger,
                              num_episodes=10,
                              model_dir=main_args['result_dir'],
                              multi_process=main_args['multi_process'],
                              visualize=False,
                              rand_seed=main_args['rand_seed'])

    self.env = wrap_deepmind(make_atari(main_args['env']), frame_stack=True)
    if main_args['rand_seed'] >= 0:
        self.env.seed(main_args['rand_seed'])
    self.obs_shape = self.env.observation_space.shape
    self.nb_action = self.env.action_space.n

    self.agent = DQN()
    self.agent.setup(self.obs_shape, self.nb_action)
    self.result_dir = main_args['result_dir']
    self.reset()

def DEBUG_time():
    env = make_atari(GAME)
    env = wrap_deepmind(env, episode_life=EPISODE_LIFE, clip_rewards=CLIP_REWARDS,
                        frame_stack=FRAME_STACK, scale=SCALE)
    np.random.seed(1)
    env.seed(0)
    agent = Agent('cuda')
    env.reset()

    transition_list = [
        Transition(state=env.observation_space.sample(),
                   action=0,
                   state_=env.observation_space.sample(),
                   reward=0)
        for _ in range(32)
    ]
    batch = Transition(*zip(*transition_list))

    time_1 = time.time()
    print("len: {}".format(len(batch.state)))
    for _ in range(1000):
        agent._state2tensor(batch.state)
    print("time: {}".format(time.time() - time_1))

def __init__(self, agent, env_id, num_envs, timesteps):
    self.agent = agent
    self.num_actions = len(ACTIONS)
    self.num_envs = num_envs
    self.envs = []
    self.timesteps = timesteps

    self.states = np.zeros(shape=[num_envs, timesteps + 1, *INPUT_SHAPE], dtype=np.uint8)
    self.actions = np.zeros(shape=[num_envs, timesteps], dtype=np.uint8)
    self.action_log_probs = np.zeros(shape=[num_envs, timesteps], dtype=np.float32)
    self.rewards = np.zeros(shape=[num_envs, timesteps], dtype=np.float32)
    self.returns = np.zeros(shape=[num_envs, timesteps], dtype=np.float32)
    self.advantages = np.zeros(shape=[num_envs, timesteps], dtype=np.float32)
    self.values = np.zeros(shape=[num_envs, timesteps + 1], dtype=np.float32)
    self.news = np.zeros(shape=[num_envs, timesteps + 1], dtype=np.uint8)
    self.last_states = np.zeros([num_envs, *INPUT_SHAPE], dtype=np.uint8)
    self.last_states_new = np.zeros(num_envs, dtype=np.uint8)

    for n in range(num_envs):
        if env_id == "Haxball":
            env = HaxballEnvironment()
        else:
            env = make_atari(env_id)
            env = wrap_deepmind(env, frame_stack=True, scale=False)
        self.envs.append(env)
        state = env.reset()
        self.last_states[n] = to_pytorch(state)
    self.last_states_new[:] = 1

def main():
    env_id = get_args().env
    env = make_atari(env_id)
    env = wrap_deepmind(env, frame_stack=True, clip_rewards=False, episode_life=True)
    # Rewards will appear higher than during training since rewards are not clipped
    env = Monitor(env)
    agent = get_agent(env)

    # Load the saved model
    save_path = os.path.join('models', env_id + '.save')
    agent.load(save_path)

    obs = env.reset()
    while True:
        obs = np.expand_dims(obs.__array__(), axis=0)
        a, v = agent.step(obs)
        obs, reward, done, info = env.step(a)
        env.render()
        if done:
            print(info)
            obs = env.reset()

def main():
    env_name = 'BreakoutNoFrameskip-v4'
    env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(env_name),
                                       episode_life=True, clip_rewards=True,
                                       frame_stack=True, scale=True)
    output_size = env.action_space.n
    input_shape = env.observation_space.shape

    with tf.Session() as sess:
        with tf.variable_scope('Breakout_lr'):
            input_ph = tf.placeholder(tf.float32, [None, *input_shape])
            model = PPO(sess, input_ph, models.nature_cnn(input_ph), actiontype.Discrete,
                        output_size, learning_rate=lambda f: 2.5e-4 * (1 - f), epochs=4,
                        minibatch_size=4, gamma=0.99, beta2=0.01, name='Breakout_lr')
        train(sess, model, env_name, 1e7, 256, log_interval=5, num_envs=16, atari=True)
        # run_only(sess, model, env, render=True)
        env.close()

def _thunk():
    env = gym.make(env_id)
    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)
    env.seed(seed + rank)

    obs_shape = env.observation_space.shape
    if add_timestep and len(obs_shape) == 1 and str(env).find('TimeLimit') > -1:
        env = AddTimestep(env)

    if log_dir is not None:
        env = Monitor(env, os.path.join(log_dir, str(rank)),
                      allow_early_resets=allow_early_resets)

    if is_atari:
        if len(env.observation_space.shape) == 3:
            env = wrap_deepmind(env)

    # If the input has shape (W, H, 3), wrap for PyTorch convolutions
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
        env = TransposeImage(env)

    return env

def create_deepmind_env(flags):
    return atari_wrappers.wrap_pytorch(
        atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(flags.env),
            clip_rewards=False,
            frame_stack=True,
            scale=False,
        ))

def create_env(flags):
    return wrap_pytorch(
        wrap_deepmind(
            make_atari(flags.env),
            clip_rewards=False,
            frame_stack=True,
            scale=False,
        ))

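# A quick smoke test for the two env factories above, as a sketch. It assumes the
# baselines-style make_atari / wrap_deepmind and a wrap_pytorch that moves channels
# first; the 'PongNoFrameskip-v4' id and the SimpleNamespace flags object are
# placeholders, not from the original code.
from types import SimpleNamespace

flags = SimpleNamespace(env='PongNoFrameskip-v4')
env = create_env(flags)
obs = env.reset()
print(obs.shape)  # expected (4, 84, 84): four stacked, channel-first 84x84 frames
for _ in range(10):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()
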
def _thunk():
    env = make_atari(env_id, max_episode_steps=max_episode_steps)
    env.seed(seed + rank)
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  allow_early_resets=True)
    return wrap_deepmind(env, **wrapper_kwargs)

def _thunk():
    env = make_atari(get_args().env)
    env.seed(seed + rank)
    env = wrap_deepmind(env, frame_stack=True, clip_rewards=False, episode_life=False)
    env = Monitor(env, rank)
    return env

def _thunk():
    env = make_atari(env_id)
    env.seed(SEED + rank)
    gym.logger.setLevel(logging.WARN)
    env = wrap_deepmind(env)
    # Wrap once more so the monitor records the total episode reward
    env = Monitor(env, rank)
    return env

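# For context, thunks like the ones above are usually handed to a vectorized env.
# The sketch below assumes the baselines SubprocVecEnv; the make_env factory is
# hypothetical and simply closes over env_id / seed / rank the way these _thunk
# functions do.
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(env_id, seed, rank):
    def _thunk():
        env = make_atari(env_id)
        env.seed(seed + rank)
        return wrap_deepmind(env, frame_stack=True)
    return _thunk

vec_env = SubprocVecEnv([make_env('BreakoutNoFrameskip-v4', 0, rank) for rank in range(8)])
obs = vec_env.reset()  # one stacked observation per worker process
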
def get_env(name, seed):
    env = gym.make(name)
    set_global_seeds(seed)
    env.seed(seed)
    expt_dir = './tmp/'
    env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True)
    env = wrap_deepmind(env)
    return env

def get_env(env_id, seed):
    """Get a gym environment for the given id and seed."""
    env = gym.make(env_id)
    set_global_seeds(seed)
    env.seed(seed)
    expt_dir = './tmp/hw3_vid_dir2/'
    env = wrappers.Monitor(env, os.path.join(expt_dir, "gym"), force=True)
    env = wrap_deepmind(env)
    return env

def main():
    args = parser.parse_args()
    with tf.Session() as sess:
        # Initialize the Atari environment
        env = make_atari(args.env)
        env = wrap_deepmind(env, episode_life=False, clip_rewards=False,
                            frame_stack=True, scale=True)
        rank = MPI.COMM_WORLD.Get_rank()
        workerseed = args.seed + 10000 * rank
        set_global_seeds(workerseed)
        env.seed(workerseed)

        if args.inference:
            inference(
                env,
                sess,
                args.env,
                path_to_model=args.path_to_model,
                embedding_space_size=288,
                joint_training=args.joint_training,
                using_extrinsic_reward=args.using_extrinsic_reward,
            )
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
            cbf(
                rank,
                env,
                sess,
                args.env,
                args.seed,
                args.debug,
                args.tensorboard,
                args.idf,
                replay_size=1000,
                batch_size=128,
                n_timesteps=args.num_timesteps,
                len_rollouts=256,
                n_optimizations=4,
                embedding_space_size=288,
                learning_rate=1e-5,
                joint_training=args.joint_training,
                using_extrinsic_reward=args.using_extrinsic_reward,
            )

def get_env(game_name):
    """
    Wraps the environment in the preprocessing wrappers formulated by DeepMind
    and implemented by OpenAI.

    :param game_name: The game that will be played.
    :return: The wrapped environment.
    """
    env = gym.make(game_name)
    if game_name == PONG:
        env = wrap_deepmind(env, False, False, True, False)
    return env

def make_env(test):
    # Use different random seeds for the train and test envs
    env_seed = test_seed if test else train_seed
    env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env),
                                       episode_life=not test,
                                       clip_rewards=not test)
    env.seed(int(env_seed))
    if args.monitor:
        env = gym.wrappers.Monitor(
            env, args.outdir, mode='evaluation' if test else 'training')
    if args.render:
        misc.env_modifiers.make_rendered(env)
    return env

def evaluate(agent, env, sess, restore=False, eval_episodes=eval_episodes, play=False):
    if restore:
        saver = tf.compat.v1.train.Saver()
        latestSnapshot = tf.train.latest_checkpoint(modelDir)
        if not latestSnapshot:
            raise Exception('No saved model found in: ' + modelDir)
        saver.restore(sess, latestSnapshot)
        print("Restored saved model from latest snapshot")

    eval_env = wrap_deepmind(env, episode_life=False, clip_rewards=False,
                             frame_stack=True, no_op_reset=True)
    obs = eval_env.reset()
    eval_episode_rewards = [0.0]
    while True:
        action = sess.run(agent.choose_action,
                          feed_dict={agent.obs_input_ph: np.array(obs)[None, :],
                                     agent.epsilon_ph: evaluation_ep})
        next_obs, reward, done, info = eval_env.step(action)
        eval_episode_rewards[-1] += reward
        obs = next_obs
        eval_mean_reward = np.mean(eval_episode_rewards)
        if done:
            obs = eval_env.reset()
            no_of_episodes = len(eval_episode_rewards)
            if restore:
                print("Mean reward after {} episodes is {}".format(
                    no_of_episodes, round(eval_mean_reward, 2)))
            if play:
                break
            if no_of_episodes >= eval_episodes:
                break
            eval_episode_rewards.append(0.0)
    return round(eval_mean_reward, 2)

def test():
    """Test distillation and evaluation."""
    LEARNING_RATE = 0.0001
    GAME = 'BreakoutNoFrameskip-v4'
    BATCH_SIZE = 32
    EPSILON = 0.05
    ADD_MEM_NUM = 3000
    UPDATE_NUM = 200
    EPOCH = 1
    MEM_SIZE = 50000
    MODEL_PATH = './model/teacher/breakout-1.h5f'
    LOSS_FUC = 'mse'
    EVAL_ITERATION = 3000

    logger = LogWriter(ROOT_PATH, BATCH_SIZE)
    logger.save_setting(args)

    env = make_atari(GAME)
    env = wrap_deepmind(env, frame_stack=True, scale=True)
    teacher = Teacher(MODEL_PATH, env, EPSILON, MEM_SIZE, EVAL_ITERATION)
    student = SingleDtStudent(env, LEARNING_RATE, logger, BATCH_SIZE, EPSILON, teacher,
                              ADD_MEM_NUM, UPDATE_NUM, EPOCH, LOSS_FUC, TARGET_NET_SIZE)
    student.distill()
    logger.save_weights(student, 'student_{}'.format(LOSS_FUC))
    logger.log_total_time_cost()

    # Set up the evaluation output directory
    root = 'result_EVAL'
    if not os.path.exists(root):
        os.mkdir(root)
        print('*** Create folder: {} ***'.format(root))
    now_time = time.strftime('%y%m%d_%H%M%S', time.localtime())
    save_path = os.path.join(root, now_time).replace('\\', '/')
    if not os.path.exists(save_path):
        os.mkdir(save_path)
        print('*** Create folder: {} ***'.format(save_path))

    # Evaluate the teacher
    teacher.evaluate(save_path)
    # Evaluate each distilled student
    for log_path in glob.glob('./result_DT/*'):
        Evaluator_deprecate(env, log_path, save_path,
                            eval_iteration=EVAL_ITERATION).evaluate()

def main():
    args = parser.parse_args()
    env = make_atari(args.env)
    env = wrap_deepmind(env, episode_life=False, clip_rewards=False,
                        frame_stack=True, scale=True)
    set_global_seeds(args.seed)
    env.seed(args.seed)
    nA = env.action_space.n

    cur_time = datetime.datetime.today().strftime('%Y_%m_%d_%H_%M_%S')
    directory = 'results/' + cur_time + '_random'
    if not os.path.exists(directory):
        os.makedirs(directory)
    directory_m = 'model/' + cur_time + '_random'
    if not os.path.exists(directory_m):
        os.makedirs(directory_m)

    # For graphing
    best_reward = -float("inf")
    cur_reward = 0
    cur_ep_len = 0
    sum_rewards = 0
    num_episodes = 0
    graph_rewards = []
    graph_epi_lens = []
    graph_avg_rewards = []

    _ = env.reset()
    for t in range(args.num_timesteps):
        if t > 0 and t % int(1e3) == 0:
            print('# frame: %i. Best reward so far: %i.' % (t, best_reward))
            save_to_file(directory, args.env, graph_rewards, graph_epi_lens,
                         graph_avg_rewards)
        action = np.random.choice(nA)
        _, reward, done, _ = env.step(action)
        cur_reward += reward
        cur_ep_len += 1
        if done:
            graph_epi_lens.append((cur_ep_len, t))
            cur_ep_len = 0
            if cur_reward > best_reward:
                best_reward = cur_reward
                graph_rewards.append((best_reward, t))
            sum_rewards += cur_reward
            num_episodes += 1
            graph_avg_rewards.append((sum_rewards / num_episodes, t))
            cur_reward = 0
            _ = env.reset()

    save_to_file(directory, args.env, graph_rewards, graph_epi_lens, graph_avg_rewards)

def __init__(self, env_name, params=None, **kwargs):
    self.load_params(params if params is not None else {})
    envs = [gym.make(env_name) for _ in range(self.num_env)]
    envs[0] = gym.wrappers.Monitor(
        envs[0], kwargs.get('logdir', DEFAULT_LOGDIR), force=True,
        video_callable=lambda t: t % self.video_freq == 0)
    num_frames = self.input_shape[-1]
    envs = [wrap_deepmind(env, num_frames, self.end_of_life_penalty) for env in envs]
    super().__init__(envs, **kwargs)

def _thunk():
    env = gym.make(env_id)
    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)
    env.seed(seed + rank)
    if is_atari:
        env = wrap_deepmind(env)

    # If the input has shape (W, H, 3), wrap for PyTorch convolutions
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
        env = WrapPyTorch(env)

    return env

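# For reference, a minimal sketch of what a TransposeImage / WrapPyTorch-style wrapper
# typically does: move the channel axis from last to first so channel-first PyTorch
# convolutions get CHW input. This is an assumption about those wrappers' behavior,
# not their exact code; the class name below is hypothetical.
import gym
import numpy as np

class TransposeImageSketch(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        h, w, c = env.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(c, h, w), dtype=env.observation_space.dtype)

    def observation(self, obs):
        # HWC -> CHW
        return np.transpose(obs, (2, 0, 1))
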
def evaluate(agent, env, eval_episodes=eval_episodes, restore=False, play=False):
    if restore:
        ckpt = tf.train.Checkpoint(model=agent.online_model())
        latestSnapshot = tf.train.latest_checkpoint(modelDir)
        if not latestSnapshot:
            raise Exception('No saved model found in: ' + modelDir)
        ckpt.restore(latestSnapshot)
        print("Restored saved model from latest snapshot")

    eval_env = wrap_deepmind(env, episode_life=False, clip_rewards=False,
                             frame_stack=True, evaluate=True)
    obs = eval_env.reset()
    eval_episode_rewards = [0.0]
    while True:
        action = agent.choose_action(obs=np.array(obs)[None, :], epsilon=evaluation_ep)
        next_obs, reward, done, info = eval_env.step(action)
        eval_episode_rewards[-1] += reward
        obs = next_obs
        eval_mean_reward = np.mean(eval_episode_rewards)
        if done:
            obs = eval_env.reset()
            no_of_episodes = len(eval_episode_rewards)
            if restore:
                print("Mean reward after {} episodes is {}".format(
                    no_of_episodes, round(eval_mean_reward, 2)))
            if play:
                break
            if no_of_episodes >= eval_episodes:
                break
            eval_episode_rewards.append(0.0)
    return round(eval_mean_reward, 2)

def make_env(process_idx, test):
    # Use different random seeds for the train and test envs
    process_seed = process_seeds[process_idx]
    env_seed = 2 ** 31 - 1 - process_seed if test else process_seed
    env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env),
                                       episode_life=not test,
                                       clip_rewards=not test)
    env.seed(int(env_seed))
    if test:
        # Randomize actions like epsilon-greedy during evaluation as well
        env = chainerrl.wrappers.RandomizeAction(env, 0.05)
    if args.monitor:
        env = gym.wrappers.Monitor(
            env, args.outdir, mode='evaluation' if test else 'training')
    if args.render:
        misc.env_modifiers.make_rendered(env)
    return env

def get_env(task):
    env_id = task.env_id
    env = gym.make(env_id)
    env = wrap_deepmind(env)
    return env