def create_doom(env_id, client_id, envWrap=True, record=False, outdir=None,
                noLifeReward=False, acRepeat=0, **_):
    from ppaquette_gym_doom import wrappers
    if 'labyrinth' in env_id.lower():
        if 'single' in env_id.lower():
            env_id = 'ppaquette/LabyrinthSingle-v0'
        elif 'fix' in env_id.lower():
            env_id = 'ppaquette/LabyrinthManyFixed-v0'
        else:
            env_id = 'ppaquette/LabyrinthMany-v0'
    elif 'very' in env_id.lower():
        env_id = 'ppaquette/DoomMyWayHomeFixed15-v0'
    elif 'sparse' in env_id.lower():
        env_id = 'ppaquette/DoomMyWayHomeFixed-v0'
    elif 'fix' in env_id.lower():
        if '1' in env_id or '2' in env_id:
            env_id = 'ppaquette/DoomMyWayHomeFixed' + str(env_id[-2:]) + '-v0'
        elif 'new' in env_id.lower():
            env_id = 'ppaquette/DoomMyWayHomeFixedNew-v0'
        else:
            env_id = 'ppaquette/DoomMyWayHomeFixed-v0'
    else:
        env_id = 'ppaquette/DoomMyWayHome-v0'

    # VizDoom workaround: launching multiple VizDoom processes simultaneously makes the
    # program hang, so use the global lock in multi-threading/multi-processing; here the
    # per-worker start-up is also staggered.
    client_id = int(client_id)
    time.sleep(client_id * 10)

    env = gym.make(env_id)
    modewrapper = wrappers.SetPlayingMode('algo')
    obwrapper = wrappers.SetResolution('160x120')
    acwrapper = wrappers.ToDiscrete('minimal')
    env = modewrapper(obwrapper(acwrapper(env)))
    # env = env_wrapper.MakeEnvDynamic(env)  # to add stochasticity

    if record and outdir is not None:
        env = gym.wrappers.Monitor(env, outdir, force=True)

    if envWrap:
        fshape = (42, 42)
        frame_skip = acRepeat if acRepeat > 0 else 4
        env.seed(None)
        if noLifeReward:
            env = env_wrapper.NoNegativeRewardEnv(env)
        env = env_wrapper.BufferedObsEnv(env, skip=frame_skip, shape=fshape)
        env = env_wrapper.SkipEnv(env, skip=frame_skip)
    elif noLifeReward:
        env = env_wrapper.NoNegativeRewardEnv(env)

    env = Vectorize(env)
    env = DiagnosticsInfo(env)
    env = Unvectorize(env)
    return env

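# Usage sketch (illustrative, not part of the original file): create_doom() resolves
# loose keywords in env_id to the registered ppaquette Doom environments, e.g. any id
# containing 'sparse' becomes 'ppaquette/DoomMyWayHomeFixed-v0'; client_id is only
# used to stagger start-up so that concurrent VizDoom launches do not hang.
#
#   env = create_doom('doomSparse', client_id=0, envWrap=True, acRepeat=4)
#   obs = env.reset()
#   obs, reward, done, info = env.step(env.action_space.sample())
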
def create_mario(env_id, client_id, envWrap=True, record=False, outdir=None,
                 noLifeReward=False, acRepeat=0, **_):
    import ppaquette_gym_super_mario
    from ppaquette_gym_super_mario import wrappers
    if '-v' in env_id.lower():
        env_id = 'ppaquette/' + env_id
    else:
        env_id = 'ppaquette/SuperMarioBros-1-3-v0'  # shape: (224,256,3)=(h,w,c)
        # env_id = 'ppaquette/SuperMarioBros-4-2-Tiles-v0'

    # Mario workaround: launching multiple emulator processes simultaneously makes the
    # program hang, so use the global lock in multi-threading/multi-processing.
    # See: https://github.com/ppaquette/gym-super-mario/tree/master/ppaquette_gym_super_mario
    client_id = int(client_id)
    time.sleep(client_id * 50)

    env = gym.make(env_id)
    modewrapper = wrappers.SetPlayingMode('algo')
    acwrapper = wrappers.ToDiscrete()
    env = modewrapper(acwrapper(env))
    env = env_wrapper.MarioEnv(env, tilesEnv=False)
    # env = env_wrapper.MarioEnv(env, tilesEnv=True)

    if record and outdir is not None:
        env = gym.wrappers.Monitor(env, outdir, force=True)

    if envWrap:
        frame_skip = acRepeat if acRepeat > 0 else 4
        frame_skip = 6 if "1-1" in env_id else frame_skip
        fshape = (42, 42)
        env.seed(None)
        if noLifeReward:
            env = env_wrapper.NoNegativeRewardEnv(env)
        env = env_wrapper.BufferedObsEnv(env, skip=frame_skip, shape=fshape, maxFrames=False)
        if frame_skip > 1:
            env = env_wrapper.SkipEnv(env, skip=frame_skip)
    elif noLifeReward:
        env = env_wrapper.NoNegativeRewardEnv(env)

    env = Vectorize(env)
    env = DiagnosticsInfo(env)
    env = Unvectorize(env)
    # env.close()  # TODO: think about where to put env.close!
    return env

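# Usage sketch (illustrative, not part of the original file): with envWrap=True the
# returned Mario environment emits 42x42 observations buffered by BufferedObsEnv and
# repeats each action frame_skip times via SkipEnv.
#
#   env = create_mario('SuperMarioBros-1-3-v0', client_id=0, envWrap=True)
#   obs = env.reset()
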
def create_doom(record=False, outdir=None):
    from ppaquette_gym_doom import wrappers
    import env_wrapper
    env = gym.make('ppaquette/DoomMyWayHome-v0')
    modewrapper = wrappers.SetPlayingMode('algo')
    obwrapper = wrappers.SetResolution('160x120')
    acwrapper = wrappers.ToDiscrete('minimal')
    env = modewrapper(obwrapper(acwrapper(env)))
    if record:
        env = gym.wrappers.Monitor(env, outdir, force=True)
    fshape = (42, 42)
    env.seed(None)
    env = env_wrapper.NoNegativeRewardEnv(env)
    env = env_wrapper.BufferedObsEnv(env, skip=1, shape=fshape)
    return env

outputdir = './gray42/'
env_id = 'ppaquette/SuperMarioBros-1-1-v0'
env = gym.make(env_id)
modewrapper = wrappers.SetPlayingMode('algo')
acwrapper = wrappers.ToDiscrete()
env = modewrapper(acwrapper(env))
env = env_wrapper.MarioEnv(env)

freshape = fshape = (42, 42)
env.seed(None)
env = env_wrapper.NoNegativeRewardEnv(env)
env = env_wrapper.DQNObsEnv(env, shape=freshape)
env = env_wrapper.BufferedObsEnv(env, n=4, skip=1, shape=fshape, channel_last=True)
env = env_wrapper.EltwiseScaleObsEnv(env)

start = time.time()
episodes = 0
maxepisodes = 1
env.reset()
imCount = 1
utils.mkdir_p(outputdir + '/ep_%02d/' % (episodes + 1))
while True:
    obs, reward, done, info = env.step(env.action_space.sample())
    print(outputdir)
    Image.fromarray((255 * obs).astype('uint8')).save(outputdir + '/ep_%02d/%06d.jpg' %
def train():
    var = 2.
    pointer = 0
    # timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
    # if timestep_limit is None: timestep_limit = env.spec.timestep_limit
    J_D_loss = np.zeros((N_trials, len(N_vals)))
    J_G_loss = np.zeros((N_trials, len(N_vals)))

    for ep in range(MAX_EPISODES):
        ep_reward = 0
        ep_curious_reward = 0
        D_loss = 0
        G_loss = 0
        l_2_loss = 0

        # if M.pointer > MEMORY_CAPACITY:
        #     for t in range(ITER_D_Training):
        #         b_s, b_a, b_r, b_curious_r, b_s_ = M.sample(BATCH_SIZE)
        #         generator.learn(b_s, b_a)
        #         b_g = generator.predict_batch(b_s, b_a)
        #         discriminator.learn(b_g, b_s_, b_s, b_a)
        #         if t % 10 == 0:
        #             one_step_D_loss = discriminator.eval(b_g, b_s_, b_s, b_a)
        #             one_step_G_loss = generator.eval(b_s, b_a)
        #             print('Ep:', ep,
        #                   '|D Training Step:%i' % int(t),
        #                   '| D loss: %f' % float(one_step_D_loss),
        #                   '| G loss: %f' % float(one_step_G_loss),
        #                   )
        #
        #     for t in range(ITER_G_Training):
        #         b_s, b_a, b_r, b_curious_r, b_s_ = M.sample(BATCH_SIZE)
        #         if t % 10 == 0:
        #             one_step_G_loss = generator.eval(b_s, b_a)
        #             print('Ep:', ep,
        #                   '|G Training Step:%i' % int(t),
        #                   '| G loss: %f' % float(one_step_G_loss),
        #                   )

        lstm_state = LSTM_unit.get_initial_state()
        env = gym.make(env_id)
        env = env_wrapper.BufferedObsEnv(env, n=TIME_STEP, skip=frame_skip,
                                         shape=fshape, channel_last=False)
        s = env.reset()
        s = np.expand_dims(s, -1)

        for t in range(MAX_EP_STEPS):
            a = actor.choose_action(s, lstm_state, var)
            g = generator.predict(s, lstm_state, a)
            s_, r, done, info = env.step(a)
            s_ = np.expand_dims(s_, -1)
            curious_r = ITA * discriminator.determine(s, lstm_state, a, g)[0]
            new_lstm_state = LSTM_unit.get_state(s, lstm_state)
            one_step_l_2_loss = discriminator.observe_and_compare(s_, g)
            l_2_loss += one_step_l_2_loss
            M.store_transition(s, lstm_state, a, r, curious_r, s_, new_lstm_state)

            if M.pointer > MEMORY_CAPACITY:
                # for i in range(ITER_train_G):
                #     b_s, b_a, b_r, b_curious_r, b_s_ = M.sample(BATCH_SIZE)
                #     generator.learn(b_s, b_a)
                b_s, b_lstm_s, b_a, b_r, b_curious_r, b_s_, b_lstm_s_ = M.sample(BATCH_SIZE)
                generator.learn(b_s, b_lstm_s, b_a)
                b_g = generator.predict_batch(b_s, b_lstm_s, b_a)
                discriminator.learn(b_g, b_s_, b_s, b_lstm_s, b_a)  # learn on the minibatch
                # b_curious_r = discriminator.determine_batch(b_s, b_lstm_s, b_a, b_g)
                critic.learn(b_s, b_lstm_s, b_a, b_curious_r, b_s_, b_lstm_s_)
                actor.learn(b_s, b_lstm_s)
                one_step_D_loss = discriminator.eval(b_g, b_s_, b_s, b_lstm_s, b_a)
                D_loss += one_step_D_loss
                one_step_G_loss = generator.eval(b_s, b_lstm_s, b_a)
                G_loss += one_step_G_loss
                if t % 10 == 0:
                    print('Ep:', ep,
                          '|Step:%i' % int(t),
                          '| Curious_R: %f' % float(curious_r),
                          '| Prediction_error: %f' % float(one_step_l_2_loss),
                          '| D loss: %f' % float(one_step_D_loss),
                          '| G loss: %f' % float(one_step_G_loss),
                          )

            s = s_
            lstm_state = new_lstm_state
            ep_reward += r
            ep_curious_reward += curious_r

            if t == MAX_EP_STEPS - 1 or done or info['life'] == 0 or info['time'] <= 1:
                # if done:
                t = t + 1
                result = '| done' if done else '| ----'
                print('Ep:', ep,
                      result,
                      '| R: %i' % int(ep_reward),
                      '| Curious_R: %f' % float(ep_curious_reward),
                      '| D_loss: %f' % float(D_loss / t),
                      '| G_loss: %f' % float(G_loss / t),
                      '| Prediction_error: %f' % float(l_2_loss / t),
                      '| Explore: %.2f' % var,
                      )
                env.close()
                var = max([var * .9999, VAR_MIN])
                break

        if ep == N_vals[pointer]:
            # evaluate the minibatch
            J_D_loss[0, pointer] = D_loss / t
            J_G_loss[0, pointer] = G_loss / t
            J_Curious[0, pointer] = ep_curious_reward
            J_r[0, pointer] = ep_reward
            if pointer < len(N_vals) - 1:
                pointer += 1

    if os.path.isdir(path):
        shutil.rmtree(path)
    os.mkdir(path)
    ckpt_path = os.path.join('./' + MODE[n_model], 'Curious_GAN.ckpt')
    save_path = saver.save(sess, ckpt_path, write_meta_graph=False)
    print("\nSave Model %s\n" % save_path)

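# Restore sketch (illustrative, not part of the original file): the matching load path
# mirrors the save above and assumes the same `saver`, `sess`, LOAD, MODE and n_model
# are in scope.
#
#   if LOAD:
#       saver.restore(sess, os.path.join('./' + MODE[n_model], 'Curious_GAN.ckpt'))
#   else:
#       sess.run(tf.global_variables_initializer())
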
MEMORY_CAPACITY = 5000
BATCH_SIZE = 32
VAR_MIN = 1
RENDER = False
LOAD = False
MODE = ['easy', 'hard']
n_model = 1
ITA = 1  # curiosity reward coefficient

frame_skip = acRepeat if acRepeat > 0 else 4
lock = multiprocessing.Lock()
env = gym.make(env_id)
env.configure(lock=lock)
env = env_wrapper.BufferedObsEnv(env, n=TIME_STEP, skip=frame_skip,
                                 shape=fshape, channel_last=False)
STATE_DIM = env.observation_space.shape
ACTION_DIM = env.action_space.shape

sess = tf.Session()

with tf.name_scope("S"):
    S = tf.placeholder(tf.float32, shape=[None, TIME_STEP, *fshape, 1], name="s")
with tf.name_scope("single_S_"):
    single_S_ = tf.placeholder(tf.float32, shape=[None, 1, *fshape, 1], name="single_s_")

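# Shape sketch (illustrative, not part of the original file; assumes TIME_STEP = 4 and
# fshape = (42, 42), which are defined elsewhere). S holds a window of TIME_STEP
# buffered frames per sample; single_S_ holds the single next observation that the
# generator's prediction is later compared against in train().
#
#   import numpy as np
#   dummy_s = np.zeros((BATCH_SIZE, 4, 42, 42, 1), dtype=np.float32)
#   print(sess.run(tf.shape(S), feed_dict={S: dummy_s}))  # -> [32  4 42 42  1]
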
def create_ple_env(env_id, record=False, outdir=None, **_):
    env = gym.make(env_id)
    env = env_wrapper.BufferedObsEnv(env, skip=4, shape=(42, 42))
    env = env_wrapper.SkipEnv(env, skip=4)
    return env

def create_mario(env_id, client_id, envWrap=True, record=False, outdir=None,
                 noLifeReward=False, acRepeat=0, **_):
    import ppaquette_gym_super_mario
    from ppaquette_gym_super_mario import wrappers
    if '-v' in env_id.lower():
        env_id = 'ppaquette/' + env_id
    else:
        env_id = 'ppaquette/SuperMarioBros-1-1-v0'  # shape: (224,256,3)=(h,w,c)

    # Mario workaround: launching multiple emulator processes simultaneously makes the
    # program hang, so use the global lock in multi-threading/multi-processing.
    # See: https://github.com/ppaquette/gym-super-mario/tree/master/ppaquette_gym_super_mario
    client_id = int(client_id)
    time.sleep(client_id * 50)

    env = gym.make(env_id)
    modewrapper = wrappers.SetPlayingMode('algo')
    acwrapper = wrappers.ToDiscrete()
    env = modewrapper(acwrapper(env))
    env = env_wrapper.MarioEnv(env)

    if record and outdir is not None:
        env = gym.wrappers.Monitor(env, outdir, force=True)

    if envWrap:
        frame_skip = acRepeat if acRepeat > 0 else 6
        fshape = (42, 42)
        env.seed(None)
        if noLifeReward:
            env = env_wrapper.NoNegativeRewardEnv(env)
        env = env_wrapper.BufferedObsEnv(env, skip=frame_skip, shape=fshape, maxFrames=False)
        if frame_skip > 1:
            env = env_wrapper.SkipEnv(env, skip=frame_skip)
    elif noLifeReward:
        env = env_wrapper.NoNegativeRewardEnv(env)

    env = Vectorize(env)
    env = DiagnosticsInfo(env)
    env = Unvectorize(env)
    # env.close()  # TODO: think about where to put env.close!
    return env


# def DiagnosticsInfo(env, *args, **kwargs):
#     return vectorized.VectorizeFilter(env, DiagnosticsInfoI, *args, **kwargs)


# class DiagnosticsInfoI(vectorized.Filter):
#     def __init__(self, log_interval=503):
#         super(DiagnosticsInfoI, self).__init__()
#         self._episode_time = time.time()
#         self._last_time = time.time()
#         self._local_t = 0
#         self._log_interval = log_interval
#         self._episode_reward = 0
#         self._episode_length = 0
#         self._all_rewards = []
#         self._num_vnc_updates = 0
#         self._last_episode_id = -1

#     def _after_reset(self, observation):
#         logger.info('Resetting environment logs')
#         self._episode_reward = 0
#         self._episode_length = 0
#         self._all_rewards = []
#         return observation

#     def _after_step(self, observation, reward, done, info):
#         to_log = {}
#         if self._episode_length == 0:
#             self._episode_time = time.time()

#         self._local_t += 1
#         if info.get("stats.vnc.updates.n") is not None:
#             self._num_vnc_updates += info.get("stats.vnc.updates.n")

#         if self._local_t % self._log_interval == 0:
#             cur_time = time.time()
#             elapsed = cur_time - self._last_time
#             fps = self._log_interval / elapsed
#             self._last_time = cur_time
#             cur_episode_id = info.get('vectorized.episode_id', 0)
#             to_log["diagnostics/fps"] = fps
#             if self._last_episode_id == cur_episode_id:
#                 to_log["diagnostics/fps_within_episode"] = fps
#             self._last_episode_id = cur_episode_id
#             if info.get("stats.gauges.diagnostics.lag.action") is not None:
#                 to_log["diagnostics/action_lag_lb"] = info["stats.gauges.diagnostics.lag.action"][0]
#                 to_log["diagnostics/action_lag_ub"] = info["stats.gauges.diagnostics.lag.action"][1]
#             if info.get("reward.count") is not None:
#                 to_log["diagnostics/reward_count"] = info["reward.count"]
#             if info.get("stats.gauges.diagnostics.clock_skew") is not None:
#                 to_log["diagnostics/clock_skew_lb"] = info["stats.gauges.diagnostics.clock_skew"][0]
#                 to_log["diagnostics/clock_skew_ub"] = info["stats.gauges.diagnostics.clock_skew"][1]
#             if info.get("stats.gauges.diagnostics.lag.observation") is not None:
#                 to_log["diagnostics/observation_lag_lb"] = info["stats.gauges.diagnostics.lag.observation"][0]
#                 to_log["diagnostics/observation_lag_ub"] = info["stats.gauges.diagnostics.lag.observation"][1]
info["stats.gauges.diagnostics.lag.observation"][1] # if info.get("stats.vnc.updates.n") is not None: # to_log["diagnostics/vnc_updates_n"] = info["stats.vnc.updates.n"] # to_log["diagnostics/vnc_updates_n_ps"] = self._num_vnc_updates / elapsed # self._num_vnc_updates = 0 # if info.get("stats.vnc.updates.bytes") is not None: # to_log["diagnostics/vnc_updates_bytes"] = info["stats.vnc.updates.bytes"] # if info.get("stats.vnc.updates.pixels") is not None: # to_log["diagnostics/vnc_updates_pixels"] = info["stats.vnc.updates.pixels"] # if info.get("stats.vnc.updates.rectangles") is not None: # to_log["diagnostics/vnc_updates_rectangles"] = info["stats.vnc.updates.rectangles"] # if info.get("env_status.state_id") is not None: # to_log["diagnostics/env_state_id"] = info["env_status.state_id"] # if reward is not None: # self._episode_reward += reward # if observation is not None: # self._episode_length += 1 # self._all_rewards.append(reward) # if done: # logger.info('True Game terminating: env_episode_reward=%s episode_length=%s', self._episode_reward, self._episode_length) # total_time = time.time() - self._episode_time # to_log["global/episode_reward"] = self._episode_reward # to_log["global/episode_length"] = self._episode_length # to_log["global/episode_time"] = total_time # to_log["global/reward_per_time"] = self._episode_reward / total_time # self._episode_reward = 0 # self._episode_length = 0 # self._all_rewards = [] # if 'distance' in info: to_log['distance'] = info['distance'] # mario # if 'POSITION_X' in info: # doom # to_log['POSITION_X'] = info['POSITION_X'] # to_log['POSITION_Y'] = info['POSITION_Y'] # return observation, reward, done, to_log