def init_nets(env):
    global q, q_targ
    global pi, pi_targ
    global optim_q, optim_pi

    action_size = env.action_space.shape[0]
    obs_size = env.observation_space.shape[0]

    # initialize policy
    pi_args = {
        "input_size": obs_size,
        "hidden_size": args.hidden_size,
        "output_size": action_size,
        "num_hidden": args.num_hidden
    }
    pi = models.Policy(**pi_args).to(device)
    pi_targ = models.Policy(**pi_args).to(device)
    pi_targ.load_state_dict(pi.state_dict())

    # initialize q function
    q_args = {
        "input_size": obs_size + action_size,
        "hidden_size": args.hidden_size,
        "output_size": 1,
        "num_hidden": args.num_hidden
    }
    q = models.FeedForward(**q_args).to(device)
    q_targ = models.FeedForward(**q_args).to(device)
    q_targ.load_state_dict(q.state_dict())
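# Note: init_nets above declares optim_q and optim_pi as globals but never
# creates them. A minimal sketch of how they might be initialized follows,
# assuming Adam and hypothetical argument names args.q_lr and args.pi_lr
# (these names are assumptions, not taken from the snippet above).
def init_optims():
    global optim_q, optim_pi
    optim_q = torch.optim.Adam(q.parameters(), lr=args.q_lr)
    optim_pi = torch.optim.Adam(pi.parameters(), lr=args.pi_lr)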
def __init__(self, env, policy_lr, value_lr, tau, gamma, buffer_size, max_time_step,
             observate_time, batch_size, path, soft_update_step, use_cuda):
    self.env = env
    self.policy_lr = policy_lr
    self.value_lr = value_lr
    self.use_cuda = bool(use_cuda)
    self.tau = tau
    self.gamma = gamma
    self.buffer_size = buffer_size
    self.max_time_step = max_time_step
    self.observate_time = observate_time
    self.batch_size = batch_size
    self.global_time_step = 0
    self.path = path
    self.soft_update_step = soft_update_step
    print('IF USE CUDA: ' + str(self.use_cuda))
    num_inputs = self.env.observation_space.shape[0]
    self.num_actions = self.env.action_space.shape[0]
    # the scale of the action space...
    self.action_scale = self.env.action_space.high[0]
    # build up the networks, starting with the actor network...
    self.actor_net = models.Policy(num_inputs, self.num_actions)
    self.actor_target_net = models.Policy(num_inputs, self.num_actions)
    # build the critic network...
    self.critic_net = models.Critic(num_inputs, self.num_actions)
    self.critic_target_net = models.Critic(num_inputs, self.num_actions)
    # if use cuda...
    if self.use_cuda:
        self.actor_net.cuda()
        self.actor_target_net.cuda()
        self.critic_net.cuda()
        self.critic_target_net.cuda()
    # initialize the target networks with the same parameters...
    self.actor_target_net.load_state_dict(self.actor_net.state_dict())
    self.critic_target_net.load_state_dict(self.critic_net.state_dict())
    # define the optimizers... add L2 regularization to the critic optimizer here...
    self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(), lr=self.policy_lr)
    self.optimizer_critic = torch.optim.Adam(self.critic_net.parameters(), lr=self.value_lr,
                                             weight_decay=1e-2)
    # init the filter...
    self.running_state = ZFilter((num_inputs, ), clip=5)
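# The constructor above stores tau and builds DDPG-style target networks, but
# the target update itself is not shown. A minimal sketch of the standard soft
# (Polyak) update it implies, written as a stand-alone helper rather than the
# repository's own method:
def soft_update(target_net, source_net, tau):
    # target <- (1 - tau) * target + tau * source, applied parameter-wise
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_((1.0 - tau) * target_param.data + tau * param.data)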
def prepare_local_network(self):
    with tf.variable_scope("local"):
        self.local_network = models.Policy(
            self.args,
            self.env.observation_space.shape,
            self.env.action_space.n,
            data_format='channels_last')

    ##########################
    # Trajectory queue
    ##########################
    self.trajectory_placeholders = {
        name: tf.placeholder(
            tf.float32, dict(self.queue_shapes)[name],
            name=f"{name}_in")
        for name, shape in self.queue_shapes
    }
    self.trajectory_enqueues = self.trajectory_queue.enqueue(
        {name: self.trajectory_placeholders[name]
         for name, _ in self.queue_shapes})

    ###############################
    # Thread dealing with queues
    ###############################
    self.worker_thread = rl_utils.WorkerThread(
        self.env,
        self.local_network,
        self.trajectory_enqueues,
        self.trajectory_placeholders,
        self.trajectory_queue_size_op)

    # copy weights from the parameter server to the local model
    self.sync = ut.tf.get_sync_op(
        self.global_network.var_list,
        self.local_network.var_list)
def evaluate_model(args, episode_num, model=None, save_json=False):
    env = gym.make('PccNs-v0')
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    if model is None:
        model = models.Policy()
        model.load_state_dict(torch.load('ac_model_%s.pkl' % args.reward))

    state = env.reset(reward=args.reward, max_bw=args.bandwidth, test=True)
    ep_reward = 0
    for t in range(1, 10000):
        # select action from policy
        state = torch.from_numpy(state).float()
        # action_mean, action_log_var, _ = model(state)
        action_mean, _ = model(state)
        # statistics.append(action_log_var.item())
        action = action_mean.item()

        # take the action
        state, reward, done, _ = env.step(action)
        model.rewards.append(reward)
        ep_reward += reward
        if done:
            break

    # log results
    # print('Evaluation results: reward = %.3f' % (ep_reward))
    if save_json:
        env.dump_events_to_file('results/test_rl_%s_%.2f.json' % (args.reward, args.bandwidth))
    return ep_reward
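# A hypothetical call to evaluate_model above, assuming an argparse-style
# namespace with the seed, reward, and bandwidth fields the function reads;
# the concrete values are placeholders, not taken from the original code.
import argparse

eval_args = argparse.Namespace(seed=0, reward='throughput', bandwidth=2.0)
ep_reward = evaluate_model(eval_args, episode_num=0, save_json=False)
print('Evaluation reward: %.3f' % ep_reward)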
def create_policy(self, policy_values):
    print(policy_values)
    values = copy.deepcopy(policy_values)
    values['id2'] = uuidutils.generate_uuid()
    policy = models.Policy()
    params = policy.generate_param()
    for k, v in params.items():
        params[k] = values.get(k)
    print(params)
    policy.update(params)
    self.add(policy)
    return self.get_policy_id2(values['id2'])
def post(self):
    args = self.parser.parse_args()
    if md.Policy.query.filter_by(pname=args['pname']).count():
        r = msg.error_msg("policy name has been used")
    else:
        new_policy = md.Policy(pname=args['pname'],
                               description=args['description'])
        db.session.add(new_policy)
        db.session.commit()
        r = msg.success_msg
    return r
def prepare_local_network(self):
    self.local_network = models.Policy(
        self.args, self.env, "local",
        self.input_shape, self.action_sizes,
        data_format='channels_last')

    ##########################
    # Trajectory queue
    ##########################
    self.trajectory_placeholders = {
        name: tf.placeholder(
            tf.float32, dict(self.queue_shapes)[name],
            name="{}_in".format(name))
        for name, shape in self.queue_shapes
    }
    self.trajectory_enqueues = self.trajectory_queue.enqueue(
        {name: self.trajectory_placeholders[name]
         for name, _ in self.queue_shapes})

    ##########################
    # Replay queue
    ##########################
    if self.args.loss == 'gan':
        self.replay_placeholder = tf.placeholder(
            tf.float32, self.input_shape, name="replay_in")
        self.replay_enqueue = self.replay_queue.enqueue(
            self.replay_placeholder)
    else:
        self.replay_placeholder = None
        self.replay_enqueue = None

    ###############################
    # Thread dealing with queues
    ###############################
    self.worker_thread = rl_utils.WorkerThread(
        self.env,
        self.local_network,
        self.trajectory_enqueues,
        self.trajectory_placeholders,
        self.trajectory_queue_size_op,
        self.replay_enqueue,
        self.replay_placeholder,
        self.replay_queue_size_op)

    # copy weights from the parameter server to the local model
    self.policy_sync = ut.tf.get_sync_op(
        self.global_network.var_list,
        self.local_network.var_list)
def __init__(self, env_name, info=False, gamma=0.9, entropy_beta=0.01, global_update_step=20):
    self.env = gym.make(env_name)
    num_inputs = self.env.observation_space.shape[0]
    num_actions = self.env.action_space.shape[0]
    # init some other parameters...
    self.gamma = gamma
    self.global_update_step = global_update_step
    self.info = info
    # build up the personal network...
    self.value_network_local = models.Value(num_inputs)
    self.policy_network_local = models.Policy(num_inputs, num_actions)
def __init__(self, config, out_dir):
    super().__init__(config)

    def env_make_fn():
        return gym.make(config['env'])

    self.env = env_make_fn()
    self.device = config['device']
    self.storage = StorageWrapper.remote(storage.ReplayBuffer, [config['replay_buffer_size']], {})

    critic_kwargs = {
        'num_inputs': self.env.observation_space.shape[0],
        'actions_dim': self.env.action_space.shape[0]
    }
    policy_kwargs = critic_kwargs

    self.critic = models.Critic(**critic_kwargs).to(self.device)
    self.policy = models.Policy(**policy_kwargs).to(self.device)
    self.target_policy = copy.deepcopy(self.policy)
    self.target_critic = copy.deepcopy(self.critic)

    self.params_server = ParamServer.remote(utils.get_cpu_state_dict(self.policy))
    self.evaluator = workers.Evaluator.as_remote(num_gpus=config['gpu_per_runner'],
                                                 num_cpus=config['cpu_per_runner'])
    self.evaluator = self.evaluator.remote(models.Policy, policy_kwargs, env_make_fn,
                                           self.params_server, self.config)
    self.runners = [workers.Runner.as_remote(num_gpus=config['gpu_per_runner'],
                                             num_cpus=config['cpu_per_runner']).remote(
                        models.Policy, policy_kwargs, env_make_fn,
                        self.params_server, self.storage, self.config)
                    for _ in range(self.config['n_runners'])]

    self.critic.train()
    self.policy.train()
    self.target_policy.eval()
    self.target_critic.eval()

    self.opt_policy = torch.optim.Adam([{'params': self.policy.parameters(),
                                         'lr': self.config['policy_lr']}])
    self.opt_critic = torch.optim.Adam([{'params': self.critic.parameters(),
                                         'lr': self.config['critic_lr']}])

    self.critic_loss = None
    self.policy_loss = None
def __init__(self, args, env):
    # define the arguments and environments...
    self.args = args
    self.env = env
    # define the num of inputs and num of actions
    num_inputs = self.env.observation_space.shape[0]
    num_actions = self.env.action_space.shape[0]
    # define the model save dir...
    self.saved_path = self.args.save_dir + self.args.env_name + '/'
    # check the path
    if not os.path.exists(self.args.save_dir):
        os.mkdir(self.args.save_dir)
    if not os.path.exists(self.saved_path):
        os.mkdir(self.saved_path)
    # define the networks...
    self.policy_network = models.Policy(num_inputs, num_actions)
    self.value_network = models.Value(num_inputs)
    # define the optimizer
    self.optimizer_value = torch.optim.Adam(self.value_network.parameters(),
                                            lr=self.args.value_lr,
                                            weight_decay=self.args.l2_reg)
    # init the filter...
    self.running_state = ZFilter((num_inputs,), clip=5)
def __init__(self, env, args):
    # define the parameters...
    self.env = env
    # get the environment's input size and output size
    num_inputs = self.env.observation_space.shape[0]
    num_actions = self.env.action_space.shape[0]
    # get the parameters
    self.args = args
    self.saved_path = 'saved_models/' + str(self.args.env_name) + '/'
    # check the path
    if not os.path.exists(self.saved_path):
        os.mkdir(self.saved_path)
    # check if cuda is available...
    self.use_cuda = torch.cuda.is_available() and self.args.cuda
    print('The cuda is available: ' + str(torch.cuda.is_available()))
    print('If use the cuda: ' + str(self.args.cuda))
    # define the networks...
    self.policy_network = models.Policy(num_inputs, num_actions)
    self.value_network = models.Value(num_inputs)
    if self.use_cuda:
        self.policy_network.cuda()
        self.value_network.cuda()
    # define the optimizers
    self.optimizer_value = torch.optim.Adam(
        self.value_network.parameters(), lr=self.args.value_lr,
        weight_decay=self.args.l2_reg)
    self.optimizer_policy = torch.optim.Adam(
        self.policy_network.parameters(), lr=self.args.policy_lr,
        weight_decay=self.args.l2_reg)
    # init the filter...
    self.running_state = ZFilter((num_inputs, ), clip=5)
def prepare_master_network(self):
    self.global_network = pi = models.Policy(
        self.args, self.env, "global",
        self.input_shape, self.action_sizes,
        data_format='channels_first'
        if self.args.dynamic_channel
        else 'channels_last')

    self.acs, acs = {}, {}
    for idx, (name, action_size) in enumerate(self.action_sizes.items()):
        # [B, action_size]
        self.acs[name] = tf.placeholder(tf.int32, [None, None],
                                        name="{}_in".format(name))
        acs[name] = tf.one_hot(self.acs[name], np.prod(action_size))

    self.adv = adv = tf.placeholder(tf.float32, [None, self.env.episode_length],
                                    name="adv")
    self.r = r = tf.placeholder(tf.float32, [None, self.env.episode_length],
                                name="r")

    bsz = tf.to_float(tf.shape(pi.x)[0])

    ########################
    # Building optimizer
    ########################
    self.loss = 0
    self.pi_loss, self.vf_loss, self.entropy = 0, 0, 0

    for name in self.action_sizes:
        ac = acs[name]
        logit = pi.logits[name]

        log_prob_tf = tf.nn.log_softmax(logit)
        prob_tf = tf.nn.softmax(logit)

        pi_loss = -tf.reduce_sum(
            tf.reduce_sum(log_prob_tf * ac, [-1]) * adv)

        # loss of value function
        vf_loss = 0.5 * tf.reduce_sum(tf.square(pi.vf - r))
        entropy = -tf.reduce_sum(prob_tf * log_prob_tf)

        self.loss += pi_loss + 0.5 * vf_loss - \
            entropy * self.args.entropy_coeff

        self.pi_loss += pi_loss
        self.vf_loss += vf_loss
        self.entropy += entropy

    grads = tf.gradients(self.loss, pi.var_list)

    ##################
    # Summaries
    ##################
    # summarize only the last state
    last_state = self.env.denorm(pi.x[:, -1])
    last_state.set_shape([self.args.policy_batch_size]
                         + ut.tf.int_shape(last_state)[1:])

    summaries = [
        tf.summary.image("last_state", image_reshaper(last_state)),
        tf.summary.scalar("env/r", tf.reduce_mean(self.r[:, -1])),
        tf.summary.scalar("model/policy_loss", self.pi_loss / bsz),
        tf.summary.scalar("model/value_loss", self.vf_loss / bsz),
        tf.summary.scalar("model/entropy", self.entropy / bsz),
        tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads)),
        tf.summary.scalar("model/var_global_norm", tf.global_norm(pi.var_list)),
    ]

    if pi.c is not None:
        target = self.env.denorm(pi.c[:, -1])
        target.set_shape([self.args.policy_batch_size]
                         + ut.tf.int_shape(target)[1:])
        summaries.append(tf.summary.image("target", image_reshaper(target)))

        l2_loss = tf.reduce_sum((pi.x[:, -1] - pi.c[:, -1])**2, [1, 2, 3])
        summaries.append(
            tf.summary.scalar("model/l2_loss", tf.reduce_mean(l2_loss)))

    self.summary_op = tf.summary.merge(summaries)

    grads, _ = tf.clip_by_global_norm(grads, self.args.grad_clip)
    grads_and_vars = list(zip(grads, self.global_network.var_list))

    # each worker has a different set of adam optimizer parameters
    opt = tf.train.AdamOptimizer(self.args.policy_lr, name="policy_optim")
    self.train_op = opt.apply_gradients(grads_and_vars, self.policy_step)

    self.summary_writer = None
def prepare_master_network(self):
    with tf.variable_scope("global"):
        self.policy_step = tf.get_variable(
            "policy_step", [], tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.global_network = pi = models.Policy(
            self.args,
            self.env.observation_space.shape,
            self.env.action_space.n,
            data_format='channels_first'
            if self.args.dynamic_channel
            else 'channels_last')

    self.acs, acs = {}, {}
    for idx, (name, action_size) in enumerate(self.action_sizes.items()):
        # [B, action_size]
        self.acs[name] = tf.placeholder(tf.int32, [None, None], name=f"{name}_in")
        acs[name] = tf.one_hot(self.acs[name], np.prod(action_size))

    self.adv = adv = tf.placeholder(
        tf.float32, [None, self.env.episode_length], name="adv")
    self.r = r = tf.placeholder(
        tf.float32, [None, self.env.episode_length], name="r")

    self.loss = 0
    bsz = tf.to_float(tf.shape(pi.x)[0])

    for name in self.action_sizes:
        ac = acs[name]
        self.logit = pi.logits[name]

        log_prob_tf = tf.nn.log_softmax(self.logit)
        prob_tf = tf.nn.softmax(self.logit)

        pi_loss = tf.reduce_sum(
            tf.reduce_sum(log_prob_tf * ac, [-1]) * adv)

        # loss of value function
        self.vf_loss = 0.5 * tf.reduce_sum(tf.square(pi.vf - r))
        entropy = -tf.reduce_sum(prob_tf * log_prob_tf)

        self.loss += pi_loss + 0.5 * self.vf_loss \
            - entropy * self.args.entropy_coeff

    grads = tf.gradients(self.loss, pi.var_list)

    # summarize only the last state
    tf.summary.image("last_state", self.env.denorm(pi.x[:, -1]))
    tf.summary.scalar("env/r", tf.reduce_mean(self.r[:, -1]))
    tf.summary.scalar("model/policy_loss", pi_loss / bsz)
    tf.summary.scalar("model/value_loss", self.vf_loss / bsz)
    tf.summary.scalar("model/entropy", entropy / bsz)
    tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads))
    tf.summary.scalar("model/var_global_norm", tf.global_norm(pi.var_list))

    self.summary_op = tf.summary.merge_all()

    grads, _ = tf.clip_by_global_norm(grads, self.args.grad_clip)
    grads_and_vars = list(zip(grads, self.global_network.var_list))

    policy_inc_step = self.policy_step.assign_add(tf.shape(pi.x)[0])

    # each worker has a different set of adam optimizer parameters
    opt = tf.train.AdamOptimizer(self.args.policy_lr)
    self.train_op = tf.group(
        opt.apply_gradients(grads_and_vars), policy_inc_step)

    self.summary_writer = None
import gym
import ppo_agent
import models
import mujoco_py

env = gym.make('Humanoid-v1')
num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]
print('The number of states is ' + str(num_inputs))
print('The number of actions is ' + str(num_actions))

policy_network = models.Policy(num_inputs, num_actions)
value_network = models.Value(num_inputs)

ppo_man = ppo_agent.ppo_brain(env, policy_network, value_network, use_cuda=False)
ppo_man.test_network('saved_models/Humanoid-v1/policy_net_model_400.pt')
torch.set_default_tensor_type('torch.DoubleTensor')

if __name__ == '__main__':
    env_name = 'Pendulum-v0'
    save_path = 'saved_models/Pendulum-v0/'
    # the number of cpu...
    num_of_workers = multiprocessing.cpu_count()
    env = gym.make(env_name)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    # build up the center network....
    value_network_global = models.Value(num_inputs)
    policy_network_global = models.Policy(num_inputs, num_actions)
    value_network_global.share_memory()
    policy_network_global.share_memory()
    # build up the workers...
    workers = []
    processor = []
    # worker_test = A3C_Workers(env_name)
    # worker_test.test_the_network(path='saved_models/policy_model_3700.pt')
    for idx in range(num_of_workers):
        if idx == 0:
            workers.append(A3C_Workers(env_name, True))
        else:
            workers.append(A3C_Workers(env_name))