def main():
    logger.configure('E:\\Project\\Toyota RL\\Toyata 2018\\Toyata RL 4th quarter\\log')  # 'F:\\GuanYang\\toyota2018_4\\log'
    parser = common_arg_parser()
    parser.add_argument('--load_model_path', default=None)
    parser.set_defaults(num_timesteps=int(2e7))
    args = parser.parse_args()
    env = environment.Env(N=6, pattern=[0, 2, 4, 8, 9, 10], height=30, width=30)

    if not args.play:
        # train the model
        train(env=env, num_timesteps=args.num_timesteps, load_model_path=args.load_model_path)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(env=env, num_timesteps=1)
        U.load_state(args.load_model_path)
        ob = env.manualSet(modelList=env.pattern)
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            # ob, _, done, _ = env.step(action)
            ob, rew, done, _ = env.updateEnv(action)
            env.showEnv()
            if done:
                ob = env.manualSet(modelList=env.pattern)
def policy_run(env, policy_fn, load_model_path, number_rollouts, stochastic_policy):
    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)
    U.initialize()
    # Prepare for rollouts
    # ----------------------------------------
    weight_file = tf.train.latest_checkpoint(load_model_path)
    if weight_file is None:
        print("error: no weight file")
        return
    U.load_state(weight_file)

    for _ in range(number_rollouts):
        ob = env.reset()
        done = False
        ep_rewards = []
        cur_ep_ret = 0
        while not done:
            env.render()
            time.sleep(0.1)
            ac, vpred = pi.act(stochastic_policy, ob)
            ob, rew, done, _ = env.step(ac)
            cur_ep_ret += rew
            ep_rewards.append(cur_ep_ret)
        ep_reward_mean = np.mean(ep_rewards)
        print("ep_reward_mean: {}".format(ep_reward_mean))
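A hedged usage sketch for `policy_run` above: the gym environment, the MLP policy constructor, and the checkpoint directory are illustrative assumptions (the `mlp_policy` pattern mirrors its use elsewhere in this section), not part of the original code.

# Hypothetical example, not from the original source.
import gym
from baselines.common import tf_util as U
from baselines.ppo1 import mlp_policy

def policy_fn(name, ob_space, ac_space):
    # Small MLP policy; sizes chosen only for illustration.
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=64, num_hid_layers=2)

U.make_session(num_cpu=1).__enter__()  # a TF session must be active before U.initialize()
policy_run(gym.make('Hopper-v2'), policy_fn, '/tmp/checkpoints',  # hypothetical checkpoint dir
           number_rollouts=5, stochastic_policy=False)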
def load_model(model_path):
    if ScoutExploreTaskRL.act is not None:
        return

    class FakeEnv(object):
        def __init__(self):
            low = np.zeros(6)
            high = np.ones(6)
            self.observation_space = Box(low, high)
            self.action_space = Discrete(8)

    def make_obs_ph(name):
        return ObservationInput(env.observation_space, name=name)

    env = FakeEnv()
    network = deepq.models.mlp([64, 32])
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': network,
        'num_actions': env.action_space.n,
    }
    act = deepq.build_act(**act_params)
    sess = tf.Session()
    sess.__enter__()
    print("load_model path=", model_path)
    load_state(model_path)
    ScoutExploreTaskRL.act = ActWrapper(act, act_params)
    print("load_model ok")
def eval(env, model_dir):
    from baselines.ppo1 import mlp_policy

    # Load variables
    U.make_session(num_cpu=1).__enter__()
    ob_space = env.observation_space
    ac_space = env.action_space

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy

    # Load variables
    U.load_state(osp.join(model_dir, "model"))

    ob = env.reset()
    while True:
        # print ("Obs: ", ob)
        # print (type(ob))
        ac, vpred = pi.act(True, ob)
        ob, rew, new, _ = env.step(ac)
        if new:
            ob = env.reset()
    env.close()
def submit_round2(walker_env, submit_env, policy_fn, load_model_path, stochastic, actions):
    ob_space = walker_env.observation_space
    ac_space = walker_env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    U.initialize()
    U.load_state(load_model_path)

    while True:
        obs = walker_env.reset()
        stepno = 0
        if isinstance(obs, bool) and obs == False:
            break
        done = False
        while not done:
            action, _ = pi.act(stochastic, obs, np.int32(stepno))
            obs, rew, done, info = walker_env.step(action)
            stepno += 1
            if done:
                break
    submit_env.submit()
def load(path, q_func, env, num_cpu=16):
    with open(path, "rb") as f:
        model_data = dill.load(f)

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }
    act = deepq.build_act(**act_params)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6666667)
    sess = U.make_session(num_cpu=num_cpu, gpu_opt=gpu_options)
    # sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)
        zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
        U.load_state(os.path.join(td, "model"))
    return ActWrapper(act, act_params)
def predict_action(img, load_model_path):
    ob_space1 = spaces.Box(low=-np.inf, high=np.inf, shape=(5,), dtype=np.float)
    ob_space2 = spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
    ob_space = (ob_space1, ob_space2)
    ac_space = spaces.Box(low=-np.array(np.ones(2)), high=np.array(np.ones(2)))
    pi = cnn_lstm_policy.CnnPhyLSTMPolicy("pi", ob_space, ac_space, hid_size=64, num_hid_layers=1)
    U.initialize()
    assert load_model_path is not None
    U.load_state(load_model_path)
    ob1 = np.array([np.cos(1), np.sin(1), 0, 0, 0])
    ob2 = process_img(img)
    ac = pi.act(True, (ob1, ob2), pi.get_initial_state())[0]
    print(pi.get_initial_state())
    return ac
def load_model(load_model_path, var_list=None):
    if os.path.isdir(load_model_path):
        ckpt_path = tf.train.latest_checkpoint(load_model_path)
    else:
        ckpt_path = load_model_path
    logger.info("Load checkpoint: %s", ckpt_path)
    U.load_state(ckpt_path, var_list)
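A minimal usage sketch for the `load_model` helper above; the paths are hypothetical, and the two call styles (directory vs. explicit checkpoint file) follow directly from the branch on `os.path.isdir`.

# Hypothetical paths for illustration only.
load_model('/tmp/run1/checkpoints')            # directory: resolves the latest checkpoint inside it
load_model('/tmp/run1/checkpoints/model-500')  # file: loads that specific checkpoint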
def evaluate(env, policy_func, load_model_path, timesteps_per_batch,
             number_trajs=10, stochastic_policy=False):
    from tqdm import tqdm

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=False)
    U.initialize()
    # Prepare for rollouts
    # ----------------------------------------
    ep_gen = traj_episode_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy)
    U.load_state(load_model_path)

    len_list = []
    ret_list = []
    for _ in tqdm(range(number_trajs)):
        traj = ep_gen.__next__()
        ep_len, ep_ret = traj['ep_len'], traj['ep_ret']
        len_list.append(ep_len)
        ret_list.append(ep_ret)
    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    print("Average length:", sum(len_list) / len(len_list))
    print("Average return:", sum(ret_list) / len(ret_list))
def main():
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default='checkpoints_best/Humanoid-v2-6914')
    parser.set_defaults(num_timesteps=int(2e8))
    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=123)
        ob = env.reset()
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            env.render()
            time.sleep(0.01)
            if done:
                ob = env.reset()
def main():
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'policy'))
    parser.set_defaults(num_timesteps=int(2e7))
    args = parser.parse_args()

    if not args.play:
        # train the model
        train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(args.env, num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env(args.env, seed=0)
        ob = env.reset()
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            print(ob, action)
            # env.render()
            if done:
                ob = env.reset()
def restore_act_and_value(env, path, num_cpu=4, scope="saved/deepq", reuse=None):
    # pdb.set_trace()
    qfunc_path = path + 'model.pkl'
    with open(qfunc_path, "rb") as f:
        q_func = dill.load(f)

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    act = build_act(make_obs_ph, q_func, env.action_space.n, scope, reuse)
    value = build_value_function(make_obs_ph, q_func, env.action_space.n, scope, True)
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()
    # for debugging
    # for var in tf.global_variables():
    #     print(var.name)
    U.load_state(tf.train.latest_checkpoint(path))
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n
    }
    return ActWrapper(act, act_params), value
def main(): """ Runs the test """ logger.configure() parser = mujoco_arg_parser() parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy')) parser.set_defaults(num_timesteps=int(2e7)) args = parser.parse_args() if not args.play: # train the model train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path) else: # construct the model object, load pre-trained model and render policy = train(num_timesteps=1, seed=args.seed) tf_util.load_state(args.model_path) env = make_mujoco_env('Humanoid-v2', seed=0) obs = env.reset() while True: action = policy.act(stochastic=False, obs=obs)[0] obs, _, done, _ = env.step(action) env.render() if done: obs = env.reset()
def evaluate_proximity_predictor(self, var_list):
    config = self._config
    if config.evaluate_all_ckpts:
        from glob import glob
        import pandas as pd
        from tqdm import tqdm

        files = glob(os.path.join(config.log_dir, "*.index"))
        files.sort()
        max_step = max([int(os.path.basename(f).split('.')[0]) for f in files])
        results = {}
        for proximity in self.proximity_predictors:
            results[proximity.env_name] = {'mean': [], 'std': [], 'step': []}
        for i in tqdm(range(0, max_step, 25)):
            logger.log('*** evaluate ckpt {}'.format(i))
            U.load_state(os.path.join(config.log_dir, '%.5d' % i), var_list)
            info = self._evaluate_proximity_predictor()
            for proximity_name, proximity_info in info.items():
                for key, value in proximity_info.items():
                    results[proximity_name][key].append(value)
                results[proximity_name]['step'].append(i)
        df = pd.DataFrame(results)
        df.to_pickle('proximity_predictor_evaluation.pkl')
    else:
        self._evaluate_proximity_predictor()
def load_model(load_model_path, var_list=None):
    if os.path.isdir(load_model_path):
        ckpt_path = tf.train.latest_checkpoint(load_model_path)
    else:
        ckpt_path = load_model_path
    if ckpt_path:
        U.load_state(ckpt_path, var_list)
    return ckpt_path
def load_model(self, dirname, iteration=None):
    if iteration is not None:
        dirname = os.path.join(dirname, 'iter_%d' % iteration)
    else:
        dirname = os.path.join(dirname, 'trained_model')
    print('Loading model from %s' % dirname)
    U.load_state(dirname)
    print('Loaded!')
def evaluate_ppo(num_eps, is_gui):
    sumoseed = 0
    randomseed = 0
    model_dir = '../tf_models/trial9'
    latest_checkpoint = tf.train.latest_checkpoint(model_dir)
    model_path = latest_checkpoint
    pi = train(max_iters=1, callback=None)
    U.load_state(model_path)
    env = LaneChangeEnv(is_train=False)

    ret_eval = 0
    ret_det_eval = 0  # not an integer, will be broadcasted
    danger_num = 0
    crash_num = 0
    level_1_danger = []
    level_2_danger = []
    collision_num = 0
    ep_len_list = []
    success_num = 0
    for i in range(num_eps):
        ep_eval = episode_generator(pi, env, is_gui=is_gui, sumoseed=sumoseed, randomseed=randomseed)
        ret_eval += ep_eval['ep_ret']
        ret_det_eval += ep_eval['ep_rets_detail']
        danger_num += ep_eval['ep_num_danger']
        crash_num += ep_eval['ep_num_crash']
        level_1_danger.append(1 if ep_eval['ep_num_danger'] > 0 else 0)
        level_2_danger.append(1 if ep_eval['ep_num_crash'] > 0 else 0)
        collision_num += ep_eval['ep_is_collision']
        success_num += int(ep_eval['ep_is_success'])
        if ep_eval['ep_is_success']:
            ep_len_list.append(ep_eval['ep_len'])
        sumoseed += 1
        randomseed += 1

    ret_eval /= float(num_eps)
    ret_det_eval /= float(num_eps)
    danger_rate = danger_num / num_eps
    crash_rate = crash_num / num_eps
    level_1_danger_rate = np.mean(level_1_danger)
    level_2_danger_rate = np.mean(level_2_danger)
    coll_rate = collision_num / num_eps
    success_rate = success_num / float(num_eps)
    success_len = np.mean(ep_len_list)
    print('reward_detail: ', ret_det_eval)
    print('reward: ', ret_eval,
          '\ndanger_rate: ', danger_rate,
          '\ncrash_rate: ', crash_rate,
          '\nlevel-1-danger_rate: ', level_1_danger_rate,
          '\nlevel-2-danger_rate: ', level_2_danger_rate,
          '\ncollision_rate: ', coll_rate,
          '\nsuccess_rate: ', success_rate,
          '\nsuccess_len: ', success_len)
    return ret_eval, danger_rate, crash_rate, level_1_danger_rate, level_2_danger_rate, coll_rate, success_rate, success_len
def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs,
           stochastic_policy, args, save=False, reuse=False):
    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=reuse)
    U.initialize()
    # Prepare for rollouts
    # ----------------------------------------
    U.load_state(load_model_path)

    obs_list = []
    acs_list = []
    len_list = []
    ret_list = []
    total_success = 0
    for _ in tqdm(range(number_trajs)):
        traj = traj_1_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy)
        obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj['ep_len'], traj['ep_ret']
        if traj['is_success']:
            total_success += 1
        obs_list.append(obs)
        acs_list.append(acs)
        len_list.append(ep_len)
        ret_list.append(ep_ret)
    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    if save:
        filename = load_model_path.split('/')[-1] + '.' + env.spec.id + "seed_{0}".format(args.seed)
        np.savez(filename, obs=np.array(obs_list), acs=np.array(acs_list),
                 lens=np.array(len_list), rets=np.array(ret_list))
    avg_len = sum(len_list) / len(len_list)
    avg_ret = sum(ret_list) / len(ret_list)
    print("Average length:", avg_len)
    print("Average return:", avg_ret)
    return avg_len, avg_ret
def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs,
           stochastic_policy, save=False, reuse=False):
    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=reuse)
    U.initialize()
    # Prepare for rollouts
    # ----------------------------------------
    U.load_state(load_model_path)

    obs_list = []
    acs_list = []
    len_list = []
    ret_list = []
    max_x_pos_list = []
    for _ in tqdm(range(number_trajs)):
        traj, max_x_pos = traj_1_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy)
        obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj['ep_len'], traj['ep_ret']
        obs_list.append(obs)
        acs_list.append(acs)
        len_list.append(ep_len)
        ret_list.append(ep_ret)
        max_x_pos_list.append(max_x_pos)
    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    if save:
        filename = load_model_path.split('/')[-1] + '.' + env.spec.id
        np.savez(filename, obs=np.array(obs_list), acs=np.array(acs_list),
                 lens=np.array(len_list), rets=np.array(ret_list))
    avg_len = sum(len_list) / len(len_list)
    avg_ret = sum(ret_list) / len(ret_list)
    avg_max_x_pos = np.mean(max_x_pos_list)
    print("Average length:", avg_len)
    print("Average return:", avg_ret)
    print("Average max_x_pos:", avg_max_x_pos)
    print("Std max_x_pos:", np.std(max_x_pos_list))
    return avg_len, avg_ret
def reload(path):
    with open(path, "rb") as f:
        model_data, act_params = dill.load(f)
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)
        zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
        U.load_state(os.path.join(td, "model"))
def main():
    set_global_seeds(1)
    args = parse_args()
    with U.make_session(4):  # noqa
        _, env = make_env(args.env)
        act = deepq.build_act(
            make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
            q_func=dueling_model if args.dueling else model,
            num_actions=env.action_space.n)
        U.load_state(os.path.join(args.model_dir, "saved"))
        wang2015_eval(args.env, act, stochastic=args.stochastic)
def load(path):
    with open(path, "rb") as f:
        model_data = cloudpickle.load(f)
    sess = U.get_session()
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)
        zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
        U.load_state(os.path.join(td, "model"))
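A hedged usage sketch for the `load` function above: it assumes `path` points at a pickled zip archive containing a `model` checkpoint, as the body implies, and that a TensorFlow session already exists for `U.get_session()` to return; the file name is hypothetical.

# Hypothetical example; 'pong_model.pkl' is an assumed file name.
load('/tmp/pong_model.pkl')  # restores the saved variables into the current session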
def main():
    set_global_seeds(1)
    args = parse_args()
    with U.make_session(4) as sess:  # noqa
        _, env = make_env(args.env)
        act = deepq.build_act(
            make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
            q_func=dueling_model if args.dueling else model,
            num_actions=env.action_space.n)
        U.load_state(os.path.join(args.model_dir, "saved"))
        wang2015_eval(args.env, act, stochastic=args.stochastic)
def evaluate(env, policy_func, load_model_path, video_prefix, record, render, *,
             timesteps_per_batch  # what to train on
             ):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    U.initialize()
    U.load_state(load_model_path)

    ep_gen = traj_episode_generator(pi, env, timesteps_per_batch, stochastic=False,
                                    record=record, render=render)
    ep_lens = []
    ep_rets = []
    visual_obs = []
    if record:
        record_dir = os.path.join(os.path.dirname(load_model_path), 'video')
        os.makedirs(record_dir, exist_ok=True)
    for _ in tqdm(range(10)):
        ep_traj = ep_gen.__next__()
        ep_lens.append(ep_traj["ep_len"])
        ep_rets.append(ep_traj["ep_ret"])

        # Video recording
        if _ % 2 == 0 and record:
            visual_obs = ep_traj["visual_obs"]
            if video_prefix is None:
                video_path = os.path.join(record_dir, '{}.mp4'.format(_))
            else:
                video_path = os.path.join(record_dir, '{}-{}.mp4'.format(video_prefix, _))
            fps = 15.

            def f(t):
                frame_length = len(visual_obs)
                new_fps = 1. / (1. / fps + 1. / frame_length)
                idx = min(int(t * new_fps), frame_length - 1)
                return visual_obs[idx]

            video = mpy.VideoClip(f, duration=len(visual_obs) / fps + 2)
            video.write_videofile(video_path, fps, verbose=False)
    print('Episode Length: {}'.format(sum(ep_lens) / 10.))
    print('Episode Rewards: {}'.format(sum(ep_rets) / 10.))
def main(): """ restore latest model from ckpt """ model_dir = '../tf_models/trial9' latest_checkpoint = tf.train.latest_checkpoint(model_dir) model_path = latest_checkpoint EP_MAX = 20 EP_LEN_MAX = 1000 # train flag check: train or animate trained results # animate trained results pi = train(max_iters=1, callback=None) U.load_state(model_path) env = LaneChangeEnv(gui=True, label='1', is_train=False) sumoseed = 45 #44 randomseed = 45 # 6 9 for ep in range(EP_MAX): # if env.is_collision: # print('sumoseed:', sumoseed, 'randomseed:', randomseed) # break sumoseed += 0 randomseed += 0 print('sumoseed:', sumoseed, 'randomseed:', randomseed) ob = env.reset(tlane=0, tfc=2, is_gui=True, sumoseed=sumoseed, randomseed=randomseed) # ob = env.reset(tlane=0, tfc=2, is_gui=True, sumoseed=None, randomseed=None) traci.vehicle.setColor(env.egoID, (255, 69, 0)) ob_np = np.asarray(ob).flatten() speed_list = [] lat_speed_list = [] for t in range(EP_LEN_MAX): ac = pi.act(stochastic=False, ob=ob_np)[0] ob, reward, done, info = env.step(ac) # need modification speed_list.append(env.ego.speed) lat_speed_list.append(env.ego.speed_lat) ob_np = np.asarray(ob).flatten() if done: break np_array = np.vstack([ np.linspace(0, len(speed_list) - 1, num=len(speed_list)), speed_list, lat_speed_list ]).T if ep == 1: np.savetxt('../data/final.csv', np_array, delimiter=",")
def maybe_load_model(savedir):
    """Load model if present at the specified path."""
    if savedir is None:
        return
    state_path = os.path.join(os.path.join(savedir, 'training_state.pkl.zip'))
    found_model = os.path.exists(state_path)
    if found_model:
        state = pickle_load(state_path, compression=True)
        model_dir = "model-{}".format(state["num_iters"])
        U.load_state(os.path.join(savedir, model_dir, "saved"))
        logger.log("Loaded models checkpoint at {} iterations".format(state["num_iters"]))
        return state
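A hedged sketch of how `maybe_load_model` might be used to resume training; the save directory is hypothetical, and reading `num_iters` from the returned state dict follows from the function body above.

# Hypothetical resume logic, not from the original source.
state = maybe_load_model('/tmp/experiment')  # hypothetical save directory
start_iter = state['num_iters'] if state is not None else 0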
def load_wrapper(load_path=None, checkpoint_path=None):
    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td
        if tf.train.latest_checkpoint(td) is not None:
            model_file = os.path.join(td, "model")
            load_state(model_file)
        elif load_path is not None:
            load_state(load_path)
        else:
            raise Warning("Baselines DQN: no model file found")
def main():
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path')
    parser.add_argument('--sim', default=False, action='store_true')
    parser.add_argument('--hessians', default=False, action='store_true')
    parser.add_argument('--logdir', type=str, default=None)
    args = parser.parse_args()
    logger.configure(args.logdir)

    if not args.model_path:
        raise ValueError('You have to provide a model path.')

    if not args.play:
        # train the model
        train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
              model_path=args.model_path, target1=args.target1, target2=args.target2,
              target3=args.target3, output_prefix=args.output_prefix,
              input_file=args.input_file, sim=args.sim, hessians=args.hessians)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(args.env, num_timesteps=1, seed=args.seed, target1=args.target1,
                   target2=args.target2, target3=args.target3,
                   output_prefix=args.output_prefix, input_file=args.input_file, sim=False)
        U.load_state('models/' + args.model_path)
        env = make_pareto_mujoco_env(args.env, seed=0, target1=args.target1,
                                     target2=args.target2, target3=args.target3)
        ob = env.reset()
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            env.render()
            if done:
                ob = env.reset()
def load(path):
    with open(path, "rb") as f:
        model_data, act_params = cloudpickle.load(f)
    act = deepqn.build_act(**act_params)
    sess = tf.Session()
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)
        zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
        U.load_state(os.path.join(td, "model"))
    return ActWrapper(act, act_params)
def main():
    parser = argparse.ArgumentParser()
    logger.configure()
    parser.add_argument('--env', type=str, help="The Gym environment ID",
                        default="AttFC_GyroErr-MotorVel_M4_Con-v0")
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--model-path',
                        default=os.path.join('/root/code/nti/gymfc/humanoid_policy', 'hum'))
    parser.add_argument('--play', action="store_true", default=False)
    parser.add_argument('--num-timesteps', type=int, default=2 * 1e6)
    current_dir = os.path.dirname(__file__)
    config_path = os.path.join(current_dir, "../configs/iris.config")
    print("Loading config from ", config_path)
    os.environ["GYMFC_CONFIG"] = config_path
    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps, seed=args.seed,
              model_path=args.model_path, env_id=args.env)
    else:
        print(" Making env=", args.env)
        # construct the model object, load pre-trained model and render
        pi = train(num_timesteps=1, seed=args.seed, env_id=args.env)
        U.load_state(args.model_path)
        env = gym.make(args.env)
        # env.render()
        ob = env.reset()
        actuals = []
        desireds = []
        while True:
            desired = env.omega_target  # [0., 0., 0.]
            actual = env.omega_actual  # -[0., 0., 0.]
            actuals.append(actual)
            desireds.append(desired)
            print("sp=", desired, " rate=", actual)
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            if done:
                break
        print(np.array(desireds))
        print(np.array(actuals))
        plot_step_response(np.array(desireds), np.array(actuals))
def load(path, act_params, num_cpu=16):
    with open(path, "rb") as f:
        model_data = dill.load(f)
    act = deepq.build_act(**act_params)
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)
        zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
        U.load_state(os.path.join(td, "model"))
    return ActWrapper(act)
def load(path):
    with open(path, "rb") as f:
        model_data, act_params = cloudpickle.load(f)
    act = deepq.build_act(**act_params)
    sess = tf.Session()
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)
        zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
        load_state(os.path.join(td, "model"))
    return ActWrapper(act, act_params)
def maybe_load_model(savedir, container):
    """Load model if present at the specified path."""
    if savedir is None:
        return
    state_path = os.path.join(os.path.join(savedir, 'training_state.pkl.zip'))
    if container is not None:
        logger.log("Attempting to download model from Azure")
        found_model = container.get(savedir, 'training_state.pkl.zip')
    else:
        found_model = os.path.exists(state_path)
    if found_model:
        state = pickle_load(state_path, compression=True)
        model_dir = "model-{}".format(state["num_iters"])
        if container is not None:
            container.get(savedir, model_dir)
        U.load_state(os.path.join(savedir, model_dir, "saved"))
        logger.log("Loaded models checkpoint at {} iterations".format(state["num_iters"]))
        return state
def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs,
           stochastic_policy, save=False, reuse=False):
    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=reuse)
    U.initialize()
    # Prepare for rollouts
    # ----------------------------------------
    U.load_state(load_model_path)

    obs_list = []
    acs_list = []
    len_list = []
    ret_list = []
    for _ in tqdm(range(number_trajs)):
        traj = traj_1_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy)
        obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj['ep_len'], traj['ep_ret']
        obs_list.append(obs)
        acs_list.append(acs)
        len_list.append(ep_len)
        ret_list.append(ep_ret)
    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    if save:
        filename = load_model_path.split('/')[-1] + '.' + env.spec.id
        np.savez(filename, obs=np.array(obs_list), acs=np.array(acs_list),
                 lens=np.array(len_list), rets=np.array(ret_list))
    avg_len = sum(len_list) / len(len_list)
    avg_ret = sum(ret_list) / len(ret_list)
    print("Average length:", avg_len)
    print("Average return:", avg_ret)
    return avg_len, avg_ret
def main():
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
    parser.set_defaults(num_timesteps=int(2e7))
    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=0)
        ob = env.reset()
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            env.render()
            if done:
                ob = env.reset()
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
        set to None to disable printing
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: True
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise
    )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) +
                                                       exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            reset = False
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act, act_params)
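A hedged usage sketch for the `learn` function above: the environment, network size, and hyperparameters are illustrative assumptions, and `deepq.models.mlp` is the same network constructor used elsewhere in this section.

# Hypothetical training run; values chosen only for illustration.
import gym
env = gym.make('CartPole-v0')
act = learn(env,
            q_func=deepq.models.mlp([64]),
            lr=1e-3,
            max_timesteps=100000,
            exploration_fraction=0.1,
            exploration_final_eps=0.02,
            print_freq=10)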
def load(self, load_path):
    tf_util.load_state(load_path, sess=self.sess)
def play(env, act, stochastic, video_path):
    num_episodes = 0
    video_recorder = VideoRecorder(env, video_path, enabled=video_path is not None)
    obs = env.reset()
    while True:
        env.unwrapped.render()
        video_recorder.capture_frame()
        action = act(np.array(obs)[None], stochastic=stochastic)[0]
        obs, rew, done, info = env.step(action)
        if done:
            obs = env.reset()
        if len(info["rewards"]) > num_episodes:
            if len(info["rewards"]) == 1 and video_recorder.enabled:
                # save video of first episode
                print("Saved video.")
                video_recorder.close()
                video_recorder.enabled = False
            print(info["rewards"][-1])
            num_episodes = len(info["rewards"])


if __name__ == '__main__':
    with U.make_session(4) as sess:
        args = parse_args()
        env = make_env(args.env)
        act = deepq.build_act(
            make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
            q_func=dueling_model if args.dueling else model,
            num_actions=env.action_space.n)
        U.load_state(os.path.join(args.model_dir, "saved"))
        play(env, act, args.stochastic, args.video)
def learn(env, policy_func, reward_giver, expert_dataset, rank,
          pretrained, pretrained_weight, *,
          g_step, d_step, entcoeff, save_per_iter,
          ckpt_dir, log_dir, timesteps_per_batch, task_name,
          gamma, lam,
          max_kl, cg_iters, cg_damping=1e-2,
          vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3,
          max_timesteps=0, max_episodes=0, max_iters=0,
          callback=None
          ):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")]
    vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
    assert len(var_list) == len(vf_var_list) + 1
    d_adam = MpiAdam(reward_giver.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start+sz], shape))
        start += sz
    gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                                                    for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    vfadam.sync()
    if rank == 0:
        print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(reward_giver.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if provide pretrained weight
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            fname = os.path.join(ckpt_dir, task_name)
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)

        logger.log("********** Iteration %i ************" % iters_so_far)

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p
        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
            vpredbefore = seg["vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new()  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5*stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                    assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                                                             include_final_partial_batch=False, batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

        g_losses = meanlosses
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, reward_giver.loss_name))
        ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob))
        batch_size = len(ob) // d_step
        d_losses = []  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch in dataset.iterbatches((ob, ac),
                                                      include_final_partial_batch=False,
                                                      batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            # update running mean/std for reward_giver
            if hasattr(reward_giver, "obs_rms"):
                reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
def learn(env,
          q_func,
          num_actions=4,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: pysc2.env.SC2Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
        set to None to disable printing
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: True
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput((32, 32), name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        scope="deepq")

    # act_y, train_y, update_target_y, debug_y = deepq.build_train(
    #     make_obs_ph=make_obs_ph,
    #     q_func=q_func,
    #     num_actions=num_actions,
    #     optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    #     gamma=gamma,
    #     grad_norm_clipping=10,
    #     scope="deepq_y"
    # )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        # replay_buffer_y = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
        # beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters,
        #                                  initial_p=prioritized_replay_beta0,
        #                                  final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        # replay_buffer_y = ReplayBuffer(buffer_size)
        beta_schedule = None
        # beta_schedule_y = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()
    # update_target_y()

    episode_rewards = [0.0]
    saved_mean_reward = None

    obs = env.reset()
    # Select all marines first
    obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

    screen = (player_relative == _PLAYER_NEUTRAL).astype(int)  # + path_memory

    player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
    player = [int(player_x.mean()), int(player_y.mean())]

    if (player[0] > 16):
        screen = shift(LEFT, player[0] - 16, screen)
    elif (player[0] < 16):
        screen = shift(RIGHT, 16 - player[0], screen)

    if (player[1] > 16):
        screen = shift(UP, player[1] - 16, screen)
    elif (player[1] < 16):
        screen = shift(DOWN, 16 - player[1], screen)

    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join("model/", "mineral_shards")
        print(model_file)

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                if param_noise_threshold >= 0.:
                    update_param_noise_threshold = param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - exploration.value(t) + exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            action = act(np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
            # action_y = act_y(np.array(screen)[None], update_eps=update_eps, **kwargs)[0]

            reset = False

            coord = [player[0], player[1]]
            rew = 0

            if (action == 0):  # UP
                if (player[1] >= 8):
                    coord = [player[0], player[1] - 8]
                    # path_memory_[player[1] - 16 : player[1], player[0]] = -1
                elif (player[1] > 0):
                    coord = [player[0], 0]
                    # path_memory_[0 : player[1], player[0]] = -1
                # else:
                #     rew -= 1

            elif (action == 1):  # DOWN
                if (player[1] <= 23):
                    coord = [player[0], player[1] + 8]
                    # path_memory_[player[1] : player[1] + 16, player[0]] = -1
                elif (player[1] > 23):
                    coord = [player[0], 31]
                    # path_memory_[player[1] : 63, player[0]] = -1
                # else:
                #     rew -= 1

            elif (action == 2):  # LEFT
                if (player[0] >= 8):
                    coord = [player[0] - 8, player[1]]
                    # path_memory_[player[1], player[0] - 16 : player[0]] = -1
                elif (player[0] < 8):
                    coord = [0, player[1]]
                    # path_memory_[player[1], 0 : player[0]] = -1
                # else:
                #     rew -= 1

            elif (action == 3):  # RIGHT
                if (player[0] <= 23):
                    coord = [player[0] + 8, player[1]]
                    # path_memory_[player[1], player[0] : player[0] + 16] = -1
                elif (player[0] > 23):
                    coord = [31, player[1]]
                    # path_memory_[player[1], player[0] : 63] = -1

            if _MOVE_SCREEN not in obs[0].observation["available_actions"]:
                obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

            new_action = [sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])]
            # else:
            #     new_action = [sc2_actions.FunctionCall(_NO_OP, [])]

            obs = env.step(actions=new_action)

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            new_screen = (player_relative == _PLAYER_NEUTRAL).astype(int)  # + path_memory

            player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
            player = [int(player_x.mean()), int(player_y.mean())]

            if (player[0] > 16):
                new_screen = shift(LEFT, player[0] - 16, new_screen)
            elif (player[0] < 16):
                new_screen = shift(RIGHT, 16 - player[0], new_screen)

            if (player[1] > 16):
                new_screen = shift(UP, player[1] - 16, new_screen)
            elif (player[1] < 16):
                new_screen = shift(DOWN, 16 - player[1], new_screen)

            rew = obs[0].reward
            done = obs[0].step_type == environment.StepType.LAST

            # Store transition in the replay buffer.
            replay_buffer.add(screen, action, rew, new_screen, float(done))
            # replay_buffer_y.add(screen, action_y, rew, new_screen, float(done))

            screen = new_screen

            episode_rewards[-1] += rew
            reward = episode_rewards[-1]

            if done:
                obs = env.reset()
                player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
                screen = (player_relative == _PLAYER_NEUTRAL).astype(int)  # + path_memory

                player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
                player = [int(player_x.mean()), int(player_y.mean())]

                # Select all marines first
                env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])
                episode_rewards.append(0.0)
                # episode_minerals.append(0.0)

                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                    # experience_y = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    # (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y, batch_idxes_y) = experience_y
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                    # obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample(batch_size)
                    # weights_y, batch_idxes_y = np.ones_like(rewards_y), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                # td_errors_y = train_x(obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    # new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)
                    # replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()
                # update_target_y()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("reward", reward)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act)