def __init__(self, env, gamma):
    """Build a REINFORCE agent around *env*.

    Seeds both the environment and torch (543) for reproducibility,
    creates the policy network and its Adam optimizer, and initializes
    bookkeeping lists for per-episode losses.
    """
    self.env = env
    # Seed before the model is built so weight init is reproducible too.
    self.env.seed(543)
    torch.manual_seed(543)
    self.policy_model = PolicyModel()
    self.optimizer = optim.Adam(self.policy_model.parameters(), lr=0.009)
    self.gamma = gamma
    # Tiny float32 epsilon — used to avoid division by zero when
    # normalizing discounted returns.
    self.eps = np.finfo(np.float32).eps.item()
    # Training history (losses and the episode numbers they belong to).
    self.loss_list = []
    self.ep_no_list = []
def __init__(self):
    """Parse CLI options and assemble the model / algorithm / agent stack."""
    self.parse_args = self._init_parser()
    # Mirror the frequently used CLI options as plain attributes.
    for opt in ("bl_decay", "log_dir", "early_stop", "data_path",
                "num_models", "batch_size", "chunk_size"):
        setattr(self, opt, getattr(self.parse_args, opt))
    self._init_dir_path()
    self.model = PolicyModel(self.parse_args)
    self.algorithm = ReinforcePolicyGradient(
        self.model, hyperparas={'lr': self.parse_args.learning_rate})
    self.autodl_agent = AutoDLAgent(self.algorithm, self.parse_args)
    self.total_reward = 0
# Environment
# ----------------------------
env = gym.make(env_id)
if args.unwrap:
    env = env.unwrapped

a_dim = env.action_space.shape[0]
a_low = env.action_space.low[0]
a_hight = env.action_space.high[0]
s_dim = env.observation_space.shape[0]

# Model
# ----------------------------
# Let TF grab GPU memory on demand instead of reserving it all.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, s_dim, a_dim, a_low, a_hight)

# Playing
# ----------------------------
sess.run(tf.global_variables_initializer())

# Restore the newest checkpoint from save_dir, if one exists.
saver = tf.train.Saver(max_to_keep=2)
ckpt = tf.train.get_checkpoint_state(save_dir)
if ckpt:
    print("Loading the model ... ", end="")
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Done.")

# Constant, very small action log-std (std = exp(-6)) for near-greedy play.
logstd = np.full((1, a_dim), -6.0, dtype=np.float32)
# Environments
# ----------------------------
# One worker environment per parallel rollout.
env = MultiEnv([make_env(i, env_id=env_id) for i in range(n_env)])
a_dim = env.ac_space.shape[0]
s_dim = env.ob_space.shape[0]
a_low = env.ac_space.low[0]
a_high = env.ac_space.high[0]
runner = MultiEnvRunner(env, s_dim, a_dim, n_step, gamma)

# Model
# ----------------------------
# Match TF's thread pools to the number of environments.
config = tf.ConfigProto(intra_op_parallelism_threads=n_env,
                        inter_op_parallelism_threads=n_env)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, s_dim, a_dim, a_low, a_high, name="policy")

# Placeholders
# ----------------------------
# action_ph:          (mb_size, a_dim)
# adv_ph:             (mb_size,)
# discount_return_ph: (mb_size,)
action_ph = tf.placeholder(tf.float32, [None, a_dim], name="action")
adv_ph = tf.placeholder(tf.float32, [None], name="advantage")
discount_return_ph = tf.placeholder(tf.float32, [None], name="discounted_return")
actor_lr_ph = tf.placeholder(tf.float32, [])
critic_lr_ph = tf.placeholder(tf.float32, [])

# Loss
# ----------------------------
# Standard Atari preprocessing: press FIRE on reset where required,
# warp frames, then stack the last n_stack frames.
if "FIRE" in env.unwrapped.get_action_meanings():
    env = env_wrapper.FireResetEnv(env)
env = env_wrapper.WarpFrame(env)
env = env_wrapper.FrameStack(env, n_stack)

a_dim = env.action_space.n
img_height, img_width, c_dim = env.observation_space.shape

# Model
# ----------------------------
# Grow GPU memory on demand rather than pre-allocating everything.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, img_height, img_width, c_dim, a_dim)

# Training
# ----------------------------
sess.run(tf.global_variables_initializer())

# Resume from the newest checkpoint when one is available.
if not os.path.exists(save_dir):
    os.mkdir(save_dir)
saver = tf.train.Saver(max_to_keep=2)
ckpt = tf.train.get_checkpoint_state(save_dir)
if ckpt:
    print("Loading the model ... ", end="")
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Done.")
# Environments
# ----------------------------
# One worker environment per parallel rollout.
env = MultiEnv([make_env(i, env_id=env_id) for i in range(n_env)])
a_dim = env.ac_space.n
img_height, img_width, c_dim = env.ob_space.shape
runner = MultiEnvRunner(env, img_height, img_width, c_dim, n_step, n_stack, gamma)

# Model
# ----------------------------
# Match TF's thread pools to the number of environments; the network
# sees n_stack frames concatenated along the channel axis.
config = tf.ConfigProto(intra_op_parallelism_threads=n_env,
                        inter_op_parallelism_threads=n_env)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, img_height, img_width, c_dim * n_stack, a_dim)

# Placeholders
# ----------------------------
action_ph = tf.placeholder(tf.int32, [None], name="action")
adv_ph = tf.placeholder(tf.float32, [None], name="advantage")
discount_return_ph = tf.placeholder(tf.float32, [None], name="discounted_return")
lr_ph = tf.placeholder(tf.float32, [])

# Loss
# ----------------------------
# Policy-gradient loss: advantage-weighted negative log-likelihood.
nll_loss = -policy.cat_dist.log_prob(action_ph)
pg_loss = tf.reduce_mean(adv_ph * nll_loss)
# Critic loss: half mean squared error against the discounted return.
value_loss = tf.reduce_mean(
    tf.squared_difference(tf.squeeze(policy.value), discount_return_ph) / 2.0)
# Evaluation script for a discrete-action policy: builds the (optionally
# unwrapped) Gym environment, creates the TF session/model with on-demand
# GPU memory growth, restores the newest checkpoint from ./save_<env_id>,
# then starts a 100-iteration play loop. NOTE(review): the `for it in
# range(100)` loop body appears to continue beyond this chunk — confirm
# against the rest of the file before editing it.
env_id = args.env save_dir = "./save_" + env_id #Create the environment #---------------------------- env = gym.make(env_id) if args.unwrap: env = env.unwrapped s_dim = env.observation_space.shape[0] a_dim = env.action_space.n #Create the model #---------------------------- config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) policy = PolicyModel(sess, s_dim, a_dim) #Start playing #---------------------------- sess.run(tf.global_variables_initializer()) #Load the model saver = tf.train.Saver(max_to_keep=2) ckpt = tf.train.get_checkpoint_state(save_dir) if ckpt: print("Loading the model ... ", end="") saver.restore(sess, ckpt.model_checkpoint_path) print("Done.") for it in range(100): ob = env.reset()