Example no. 1
    def __init__(self, env, gamma):
        self.env = env
        # Fix the random seeds for reproducibility.
        self.env.seed(543)
        torch.manual_seed(543)
        self.policy_model = PolicyModel()
        self.optimizer = optim.Adam(self.policy_model.parameters(), lr=0.009)
        self.gamma = gamma
        # Smallest representable float32 step, used to avoid division by zero.
        self.eps = np.finfo(np.float32).eps.item()
        self.loss_list = []
        self.ep_no_list = []
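
A minimal sketch of how these fields are typically used at the end of an episode in a REINFORCE-style update; the `finish_episode` method name, the saved log-probability buffer, and the reward list are hypothetical, not part of the snippet above:

    def finish_episode(self, saved_log_probs, rewards):
        # Compute discounted returns by walking the reward list backwards.
        returns, R = [], 0.0
        for r in reversed(rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        returns = torch.tensor(returns)
        # Normalize returns; self.eps guards against division by zero.
        returns = (returns - returns.mean()) / (returns.std() + self.eps)
        # Policy-gradient loss: -log pi(a|s) * normalized return.
        loss = torch.stack(
            [-log_p * R for log_p, R in zip(saved_log_probs, returns)]).sum()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()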
Example no. 2
    def __init__(self):
        """
            init
        """
        self.parse_args = self._init_parser()
        self.bl_decay = self.parse_args.bl_decay
        self.log_dir = self.parse_args.log_dir
        self.early_stop = self.parse_args.early_stop
        self.data_path = self.parse_args.data_path
        self.num_models = self.parse_args.num_models
        self.batch_size = self.parse_args.batch_size
        self.chunk_size = self.parse_args.chunk_size

        self._init_dir_path()
        self.model = PolicyModel(self.parse_args)
        algo_hyperparas = {'lr': self.parse_args.learning_rate}
        self.algorithm = ReinforcePolicyGradient(self.model,
                                                 hyperparas=algo_hyperparas)
        self.autodl_agent = AutoDLAgent(self.algorithm, self.parse_args)
        self.total_reward = 0
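
A rough sketch of what `_init_parser` might look like, inferred only from the attributes read in `__init__` above and assuming `argparse` is imported; the types and defaults are assumptions:

    def _init_parser(self):
        # Hypothetical parser covering the options accessed in __init__.
        parser = argparse.ArgumentParser()
        parser.add_argument('--bl_decay', type=float, default=0.9)
        parser.add_argument('--log_dir', type=str, default='./log')
        parser.add_argument('--early_stop', type=int, default=20)
        parser.add_argument('--data_path', type=str, default='./data')
        parser.add_argument('--num_models', type=int, default=1)
        parser.add_argument('--batch_size', type=int, default=32)
        parser.add_argument('--chunk_size', type=int, default=32)
        parser.add_argument('--learning_rate', type=float, default=1e-3)
        return parser.parse_args()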
Example no. 3
#Create the environment
#----------------------------
env = gym.make(env_id)
if args.unwrap:
    env = env.unwrapped
a_dim = env.action_space.shape[0]
a_low = env.action_space.low[0]
a_high = env.action_space.high[0]
s_dim = env.observation_space.shape[0]

#Create the model
#----------------------------
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, s_dim, a_dim, a_low, a_high)

#Start playing
#----------------------------
sess.run(tf.global_variables_initializer())

#Load the model
saver = tf.train.Saver(max_to_keep=2)
ckpt = tf.train.get_checkpoint_state(save_dir)
if ckpt:
    print("Loading the model ... ", end="")
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Done.")

logstd = np.zeros((1, a_dim), dtype=np.float32)
logstd.fill(-6.0)
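
Filling the log standard deviation with -6.0 makes the Gaussian policy effectively deterministic at play time. A hedged sketch of the play loop that could follow, where `policy.choose_action` is a hypothetical method of PolicyModel:

ob = env.reset()
total_reward, done = 0.0, False

while not done:
    env.render()
    # Hypothetical interface: near-deterministic action from the policy.
    action = policy.choose_action(ob[None, :], logstd)
    ob, reward, done, _ = env.step(np.clip(action, a_low, a_high))
    total_reward += reward

print("Total reward:", total_reward)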
Example no. 4
#Create multiple environments
#----------------------------
env = MultiEnv([make_env(i, env_id=env_id) for i in range(n_env)])
a_dim = env.ac_space.shape[0]
s_dim = env.ob_space.shape[0]
a_low = env.ac_space.low[0]
a_high = env.ac_space.high[0]
runner = MultiEnvRunner(env, s_dim, a_dim, n_step, gamma)

#Create the model
#----------------------------
config = tf.ConfigProto(intra_op_parallelism_threads=n_env,
                        inter_op_parallelism_threads=n_env)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, s_dim, a_dim, a_low, a_high, name="policy")

#Placeholders
#----------------------------
#action_ph: (mb_size, a_dim)
#adv_ph:    (mb_size)
#reward_ph: (mb_size)
action_ph = tf.placeholder(tf.float32, [None, a_dim], name="action")
adv_ph = tf.placeholder(tf.float32, [None], name="advantage")
discount_return_ph = tf.placeholder(tf.float32, [None],
                                    name="discounted_return")
actor_lr_ph = tf.placeholder(tf.float32, [])
critic_lr_ph = tf.placeholder(tf.float32, [])

#Loss
#----------------------------
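The loss itself is not shown in this snippet. A sketch of how it could be assembled from the placeholders above, by analogy with the discrete-action example below; `policy.norm_dist` (a Gaussian over actions) and `policy.value` are assumed attribute names:

nll_loss = -policy.norm_dist.log_prob(action_ph)   # -log pi(a|s), per dimension
pg_loss = tf.reduce_mean(adv_ph * tf.reduce_sum(nll_loss, axis=1))
value_loss = tf.reduce_mean(
    tf.squared_difference(tf.squeeze(policy.value), discount_return_ph) / 2.0)

#Separate optimizers for actor and critic, driven by the two lr placeholders
actor_train_op = tf.train.AdamOptimizer(actor_lr_ph).minimize(pg_loss)
critic_train_op = tf.train.AdamOptimizer(critic_lr_ph).minimize(value_loss)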
Example no. 5
if "FIRE" in env.unwrapped.get_action_meanings():
    env = env_wrapper.FireResetEnv(env)

env = env_wrapper.WarpFrame(env)
env = env_wrapper.FrameStack(env, n_stack)

a_dim = env.action_space.n
img_height, img_width, c_dim = env.observation_space.shape

#Create the model
#----------------------------
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, img_height, img_width, c_dim, a_dim)

#Start training
#----------------------------
sess.run(tf.global_variables_initializer())

#Load the model
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

saver = tf.train.Saver(max_to_keep=2)
ckpt = tf.train.get_checkpoint_state(save_dir)
if ckpt:
    print("Loading the model ... ", end="")
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Done.")
Example no. 6
#Create multiple environments
#----------------------------
env = MultiEnv([make_env(i, env_id=env_id) for i in range(n_env)])
a_dim = env.ac_space.n
img_height, img_width, c_dim = env.ob_space.shape
runner = MultiEnvRunner(env, img_height, img_width, c_dim, n_step, n_stack,
                        gamma)

#Create the model
#----------------------------
config = tf.ConfigProto(intra_op_parallelism_threads=n_env,
                        inter_op_parallelism_threads=n_env)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, img_height, img_width, c_dim * n_stack, a_dim)

#Placeholders
#----------------------------
action_ph = tf.placeholder(tf.int32, [None], name="action")
adv_ph = tf.placeholder(tf.float32, [None], name="advantage")
discount_return_ph = tf.placeholder(tf.float32, [None],
                                    name="discounted_return")
lr_ph = tf.placeholder(tf.float32, [])

#Loss
#----------------------------
nll_loss = -policy.cat_dist.log_prob(action_ph)
pg_loss = tf.reduce_mean(adv_ph * nll_loss)
value_loss = tf.reduce_mean(
    tf.squared_difference(tf.squeeze(policy.value), discount_return_ph) / 2.0)
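
A hedged sketch of how these two losses might be combined and minimized with the `lr_ph` placeholder; the 0.5 value-loss weight is an assumption:

total_loss = pg_loss + 0.5 * value_loss
train_op = tf.train.AdamOptimizer(lr_ph).minimize(total_loss)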
Example no. 7
env_id = args.env
save_dir = "./save_" + env_id

#Create the environment
#----------------------------
env = gym.make(env_id)
if args.unwrap:
    env = env.unwrapped
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.n

#Create the model
#----------------------------
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, s_dim, a_dim)

#Start playing
#----------------------------
sess.run(tf.global_variables_initializer())

#Load the model
saver = tf.train.Saver(max_to_keep=2)
ckpt = tf.train.get_checkpoint_state(save_dir)
if ckpt:
    print("Loading the model ... ", end="")
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Done.")

for it in range(100):
    ob = env.reset()
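
The snippet ends as the play loop begins. A minimal sketch of how each episode could continue, with `policy.choose_action` as a hypothetical method of PolicyModel:

    total_reward, done = 0.0, False
    while not done:
        env.render()
        # Hypothetical interface: action index from the discrete policy.
        action = policy.choose_action(ob[None, :])
        ob, reward, done, _ = env.step(action)
        total_reward += reward
    print("[Episode {}] total reward = {}".format(it, total_reward))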