Example #1
def init_nets(env):
    global q, q_targ
    global pi, pi_targ
    global optim_q, optim_pi

    action_size = env.action_space.shape[0]
    obs_size = env.observation_space.shape[0]

    # initialize policy
    pi_args = {
        "input_size": obs_size,
        "hidden_size": args.hidden_size,
        "output_size": action_size,
        "num_hidden": args.num_hidden
    }
    pi = models.Policy(**pi_args).to(device)
    pi_targ = models.Policy(**pi_args).to(device)
    pi_targ.load_state_dict(pi.state_dict())

    # initialize the Q function
    q_args = {
        "input_size": obs_size + action_size,
        "hidden_size": args.hidden_size,
        "output_size": 1,
        "num_hidden": args.num_hidden
    }
    q = models.FeedForward(**q_args).to(device)
    q_targ = models.FeedForward(**q_args).to(device)
    q_targ.load_state_dict(q.state_dict())
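Note: `init_nets` declares `optim_q` and `optim_pi` as globals but never creates them. A minimal sketch of how they could be constructed at the end of the function, assuming `torch` is imported at module level and hypothetical `args.pi_lr` / `args.q_lr` fields (none of this appears in the original snippet):

    # hypothetical optimizer setup to go with the networks above
    optim_pi = torch.optim.Adam(pi.parameters(), lr=args.pi_lr)
    optim_q = torch.optim.Adam(q.parameters(), lr=args.q_lr)

    # target networks are updated by (soft) copying, never by gradient steps
    for p in list(pi_targ.parameters()) + list(q_targ.parameters()):
        p.requires_grad = False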
Example #2
    def __init__(self, env, policy_lr, value_lr, tau, gamma, buffer_size,
                 max_time_step, observate_time, batch_size, path,
                 soft_update_step, use_cuda):
        self.env = env
        self.policy_lr = policy_lr
        self.value_lr = value_lr
        self.use_cuda = bool(use_cuda)
        self.tau = tau
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.max_time_step = max_time_step
        self.observate_time = observate_time
        self.batch_size = batch_size
        self.global_time_step = 0
        self.path = path
        self.soft_update_step = soft_update_step

        print('IF USE CUDA: ' + str(self.use_cuda))

        num_inputs = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]

        # the scale of the action space....
        self.action_scale = self.env.action_space.high[0]

        # build up the network....
        # build the actor_network firstly...
        self.actor_net = models.Policy(num_inputs, self.num_actions)
        self.actor_target_net = models.Policy(num_inputs, self.num_actions)

        # build the critic_network....
        self.critic_net = models.Critic(num_inputs, self.num_actions)
        self.critic_target_net = models.Critic(num_inputs, self.num_actions)

        # if use cuda...
        if self.use_cuda:
            self.actor_net.cuda()
            self.actor_target_net.cuda()

            self.critic_net.cuda()
            self.critic_target_net.cuda()

        # init the same parameters....
        self.actor_target_net.load_state_dict(self.actor_net.state_dict())
        self.critic_target_net.load_state_dict(self.critic_net.state_dict())

        # define the optimizers... add L2 regularization to the critic optimizer here...
        self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(),
                                                lr=self.policy_lr)
        self.optimizer_critic = torch.optim.Adam(self.critic_net.parameters(),
                                                 lr=self.value_lr,
                                                 weight_decay=1e-2)

        # init the filter...
        self.running_state = ZFilter((num_inputs, ), clip=5)
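Note: the constructor stores `tau` and `soft_update_step`, but the snippet does not include the target-network update itself. A minimal sketch of the usual Polyak (soft) update that these fields suggest (an assumption, not code from the original repository):

def soft_update(target_net, source_net, tau):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)

# hypothetical use inside the training loop, every soft_update_step steps:
# soft_update(self.actor_target_net, self.actor_net, self.tau)
# soft_update(self.critic_target_net, self.critic_net, self.tau)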
Example #3
    def prepare_local_network(self):
        with tf.variable_scope("local"):
            self.local_network = models.Policy(
                    self.args,
                    self.env.observation_space.shape,
                    self.env.action_space.n,
                    data_format='channels_last')

            ##########################
            # Trajectory queue
            ##########################
            self.trajectory_placeholders = {
                    name:tf.placeholder(
                            tf.float32, dict(self.queue_shapes)[name],
                            name=f"{name}_in") \
                                    for name, shape in self.queue_shapes
            }
            self.trajectory_enqueues = self.trajectory_queue.enqueue(
                    { name:self.trajectory_placeholders[name] \
                            for name, _ in self.queue_shapes })

            ###############################
            # Thread dealing with queues
            ###############################
            self.worker_thread = rl_utils.WorkerThread(
                    self.env,
                    self.local_network,
                    self.trajectory_enqueues,
                    self.trajectory_placeholders,
                    self.trajectory_queue_size_op)

        # copy weights from the parameter server to the local model
        self.sync = ut.tf.get_sync_op(
                self.global_network.var_list,
                self.local_network.var_list)
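Note: `ut.tf.get_sync_op` is not shown in this example. A typical TF1 implementation of such a sync op (an assumed sketch of the pattern, not necessarily this repository's code) simply groups one `assign` per variable pair:

import tensorflow as tf

def get_sync_op(from_vars, to_vars):
    # copy every parameter-server variable into the corresponding local variable
    return tf.group(*[dst.assign(src) for src, dst in zip(from_vars, to_vars)])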
Example #4
def evaluate_model(args, episode_num, model=None, save_json=False):
    env = gym.make('PccNs-v0')
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    if model is None:
        model = models.Policy()
        model.load_state_dict(torch.load('ac_model_%s.pkl' % args.reward))

    state = env.reset(reward=args.reward, max_bw=args.bandwidth, test=True)
    ep_reward = 0
    for t in range(1, 10000):

        # select action from policy
        state = torch.from_numpy(state).float()
        # action_mean, action_log_var, _ = model(state)
        action_mean, _ = model(state)
        # statistics.append(action_log_var.item())
        action = action_mean.item()

        # take the action
        state, reward, done, _ = env.step(action)

        model.rewards.append(reward)
        ep_reward += reward
        if done:
            break

    # log results
    # print('Evaluation results: reward = %.3f' % (ep_reward))
    if save_json:
        env.dump_events_to_file('results/test_rl_%s_%.2f.json' %
                                (args.reward, args.bandwidth))

    return ep_reward
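Note: a hypothetical driver for `evaluate_model`, assuming an argparse namespace that provides the `seed`, `reward`, and `bandwidth` attributes the function reads (names and default values here are placeholders):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--reward', type=str, default='linear')    # placeholder value
    parser.add_argument('--bandwidth', type=float, default=2.0)    # placeholder value
    args = parser.parse_args()

    ep_reward = evaluate_model(args, episode_num=0, save_json=True)
    print('evaluation reward: %.3f' % ep_reward)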
Example #5
    def create_policy(self, policy_values):
        print(policy_values)
        values = copy.deepcopy(policy_values)
        values['id2'] = uuidutils.generate_uuid()
        policy = models.Policy()
        params = policy.generate_param()
        for k, v in params.items():
            params[k] = values.get(k)
        print(params)
        policy.update(params)
        self.add(policy)
        return self.get_policy_id2(values['id2'])
Example #6
    def post(self):
        args = self.parser.parse_args()

        if md.Policy.query.filter_by(pname=args['pname']).count():
            r = msg.error_msg("policy name has been used")
        else:
            new_policy = md.Policy(pname=args['pname'],
                                   description=args['description'])
            db.session.add(new_policy)
            db.session.commit()
            r = msg.success_msg

        return r
Example #7
    def prepare_local_network(self):
        self.local_network = models.Policy(
                self.args, self.env, "local",
                self.input_shape, self.action_sizes,
                data_format='channels_last')

        ##########################
        # Trajectory queue
        ##########################
        self.trajectory_placeholders = {
                name:tf.placeholder(
                        tf.float32, dict(self.queue_shapes)[name],
                        name="{}_in".format(name)) \
                                for name, shape in self.queue_shapes
        }
        self.trajectory_enqueues = self.trajectory_queue.enqueue(
                { name:self.trajectory_placeholders[name] \
                        for name, _ in self.queue_shapes })

        ##########################
        # Replay queue
        ##########################
        if self.args.loss == 'gan':
            self.replay_placeholder = tf.placeholder(
                    tf.float32, self.input_shape,
                    name="replay_in")
            self.replay_enqueue = self.replay_queue.enqueue(
                    self.replay_placeholder)
        else:
            self.replay_placeholder = None
            self.replay_enqueue = None

        ###############################
        # Thread dealing with queues
        ###############################
        self.worker_thread = rl_utils.WorkerThread(
                self.env,
                self.local_network,
                self.trajectory_enqueues,
                self.trajectory_placeholders,
                self.trajectory_queue_size_op,
                self.replay_enqueue,
                self.replay_placeholder,
                self.replay_queue_size_op)

        # copy weights from the parameter server to the local model
        self.policy_sync = ut.tf.get_sync_op(
                self.global_network.var_list,
                self.local_network.var_list)
Example #8
    def __init__(self,
                 env_name,
                 info=False,
                 gamma=0.9,
                 entropy_beta=0.01,
                 global_update_step=20):
        self.env = gym.make(env_name)
        num_inputs = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.shape[0]

        # init some other parameters....
        self.gamma = gamma
        self.global_update_step = global_update_step
        self.info = info

        # build up the personal network...
        self.value_network_local = models.Value(num_inputs)
        self.policy_network_local = models.Policy(num_inputs, num_actions)
Example #9
    def __init__(self, config, out_dir):
        super().__init__(config)

        def env_make_fn():
            return gym.make(config['env'])

        self.env = env_make_fn()
        self.device = config['device']
        self.storage = StorageWrapper.remote(storage.ReplayBuffer, [config['replay_buffer_size']], {})
        critic_kwargs = {
            'num_inputs': self.env.observation_space.shape[0],
            'actions_dim': self.env.action_space.shape[0]
        }
        policy_kwargs = critic_kwargs
        self.critic = models.Critic(**critic_kwargs).to(self.device)
        self.policy = models.Policy(**policy_kwargs).to(self.device)
        self.target_policy = copy.deepcopy(self.policy)
        self.target_critic = copy.deepcopy(self.critic)

        self.params_server = ParamServer.remote(utils.get_cpu_state_dict(self.policy))
        self.evaluator = workers.Evaluator.as_remote(num_gpus=config['gpu_per_runner'], num_cpus=config['cpu_per_runner'])
        self.evaluator = self.evaluator.remote(models.Policy,
                                               policy_kwargs,
                                               env_make_fn,
                                               self.params_server,
                                               self.config)

        self.runners = [workers.Runner.as_remote(num_gpus=config['gpu_per_runner'], 
                                                 num_cpus=config['cpu_per_runner']).remote(models.Policy,
                                                                                           policy_kwargs,
                                                                                           env_make_fn,
                                                                                           self.params_server,
                                                                                           self.storage,
                                                                                           self.config)
                        for _ in range(self.config['n_runners'])]

        self.critic.train()
        self.policy.train()
        self.target_policy.eval()
        self.target_critic.eval()
        self.opt_policy = torch.optim.Adam([{'params': self.policy.parameters(), 'lr': self.config['policy_lr']}])
        self.opt_critic = torch.optim.Adam([{'params': self.critic.parameters(), 'lr': self.config['critic_lr']}])
        self.critic_loss = None
        self.policy_loss = None
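Note: the constructor above only wires up the networks, target copies, and optimizers. A minimal sketch of the DDPG-style critic target those target networks are typically used for, assuming the critic takes `(state, action)` pairs, the policy maps states to actions, and a discount factor `gamma` (none of this is shown in the original snippet):

import torch

@torch.no_grad()
def compute_critic_target(target_policy, target_critic, reward, next_state, done, gamma):
    # y = r + gamma * (1 - done) * Q'(s', mu'(s'))
    next_action = target_policy(next_state)
    next_q = target_critic(next_state, next_action)
    return reward + gamma * (1.0 - done) * next_q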
Example #10
    def __init__(self, args, env):
        # define the arguments and environments...
        self.args = args
        self.env = env
        # define the num of inputs and num of actions
        num_inputs = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.shape[0]
        # define the model save dir...
        self.saved_path = self.args.save_dir + self.args.env_name + '/'
        # check the path
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        if not os.path.exists(self.saved_path):
            os.mkdir(self.saved_path)
        # define the networks...
        self.policy_network = models.Policy(num_inputs, num_actions)
        self.value_network = models.Value(num_inputs)
        # define the optimizer (only the value network is trained with Adam here)
        self.optimizer_value = torch.optim.Adam(self.value_network.parameters(), lr=self.args.value_lr, weight_decay=self.args.l2_reg)
        # init the filter...
        self.running_state = ZFilter((num_inputs,), clip=5)
Example #11
    def __init__(self, env, args):
        # define the parameters...
        self.env = env
        # get the environment's input size and output size
        num_inputs = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.shape[0]
        # get the parameters
        self.args = args
        self.saved_path = 'saved_models/' + str(self.args.env_name) + '/'
        # check the path
        if not os.path.exists(self.saved_path):
            os.mkdir(self.saved_path)

        # check if cuda is available...
        self.use_cuda = torch.cuda.is_available() and self.args.cuda
        print('cuda available: ' + str(torch.cuda.is_available()))
        print('use cuda: ' + str(self.args.cuda))

        # define the network...
        self.policy_network = models.Policy(num_inputs, num_actions)
        self.value_network = models.Value(num_inputs)

        if self.use_cuda:
            self.policy_network.cuda()
            self.value_network.cuda()

        # define the optimizer
        self.optimizer_value = torch.optim.Adam(
            self.value_network.parameters(),
            lr=self.args.value_lr,
            weight_decay=self.args.l2_reg)
        self.optimizer_policy = torch.optim.Adam(
            self.policy_network.parameters(),
            lr=self.args.policy_lr,
            weight_decay=self.args.l2_reg)

        # init the Filter...
        self.running_state = ZFilter((num_inputs, ), clip=5)
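Note: both agents above create a `ZFilter` for observation normalization, but the snippets stop before it is used. A sketch of how the filter is usually applied during a rollout (the `select_action` method name is hypothetical):

def collect_episode(agent, env, max_steps=1000):
    # normalize (and clip) every observation with the agent's running filter
    obs = agent.running_state(env.reset())
    total_reward = 0.0
    for _ in range(max_steps):
        action = agent.select_action(obs)        # hypothetical method name
        obs, reward, done, _ = env.step(action)
        obs = agent.running_state(obs)           # also updates the running mean/std
        total_reward += reward
        if done:
            break
    return total_reward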
Example #12
    def prepare_master_network(self):
        self.global_network = pi = models.Policy(
                self.args, self.env, "global",
                self.input_shape, self.action_sizes,
                data_format='channels_first' \
                        if self.args.dynamic_channel \
                        else 'channels_last')

        self.acs, acs = {}, {}
        for idx, (name, action_size) in enumerate(self.action_sizes.items()):
            # [B, action_size]
            self.acs[name] = tf.placeholder(tf.int32, [None, None],
                                            name="{}_in".format(name))
            acs[name] = tf.one_hot(self.acs[name], np.prod(action_size))

        self.adv = adv = tf.placeholder(tf.float32,
                                        [None, self.env.episode_length],
                                        name="adv")
        self.r = r = tf.placeholder(tf.float32,
                                    [None, self.env.episode_length],
                                    name="r")

        bsz = tf.to_float(tf.shape(pi.x)[0])

        ########################
        # Building optimizer
        ########################

        self.loss = 0
        self.pi_loss, self.vf_loss, self.entropy = 0, 0, 0

        for name in self.action_sizes:
            ac = acs[name]
            logit = pi.logits[name]

            log_prob_tf = tf.nn.log_softmax(logit)
            prob_tf = tf.nn.softmax(logit)

            pi_loss = -tf.reduce_sum(
                tf.reduce_sum(log_prob_tf * ac, [-1]) * adv)

            # loss of value function
            vf_loss = 0.5 * tf.reduce_sum(tf.square(pi.vf - r))
            entropy = -tf.reduce_sum(prob_tf * log_prob_tf)

            self.loss += pi_loss + 0.5 * vf_loss - \
                    entropy * self.args.entropy_coeff

            self.pi_loss += pi_loss
            self.vf_loss += vf_loss
            self.entropy += entropy

        grads = tf.gradients(self.loss, pi.var_list)

        ##################
        # Summaries
        ##################

        # summarize only the last state
        last_state = self.env.denorm(pi.x[:, -1])
        last_state.set_shape([self.args.policy_batch_size] +
                             ut.tf.int_shape(last_state)[1:])

        summaries = [
            tf.summary.image("last_state", image_reshaper(last_state)),
            tf.summary.scalar("env/r", tf.reduce_mean(self.r[:, -1])),
            tf.summary.scalar("model/policy_loss", self.pi_loss / bsz),
            tf.summary.scalar("model/value_loss", self.vf_loss / bsz),
            tf.summary.scalar("model/entropy", self.entropy / bsz),
            tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads)),
            tf.summary.scalar("model/var_global_norm",
                              tf.global_norm(pi.var_list)),
        ]

        if pi.c is not None:
            target = self.env.denorm(pi.c[:, -1])
            target.set_shape([self.args.policy_batch_size] +
                             ut.tf.int_shape(target)[1:])

            summaries.append(tf.summary.image("target",
                                              image_reshaper(target)))

            l2_loss = tf.reduce_sum((pi.x[:, -1] - pi.c[:, -1])**2, [1, 2, 3])
            summaries.append(
                tf.summary.scalar("model/l2_loss", tf.reduce_mean(l2_loss)))

        self.summary_op = tf.summary.merge(summaries)
        grads, _ = tf.clip_by_global_norm(grads, self.args.grad_clip)

        grads_and_vars = list(zip(grads, self.global_network.var_list))

        # each worker has a different set of adam optimizer parameters
        opt = tf.train.AdamOptimizer(self.args.policy_lr, name="policy_optim")

        self.train_op = opt.apply_gradients(grads_and_vars, self.policy_step)
        self.summary_writer = None
Example #13
    def prepare_master_network(self):
        with tf.variable_scope("global"):
            self.policy_step = tf.get_variable(
                    "policy_step", [], tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)

            self.global_network = pi = models.Policy(
                    self.args,
                    self.env.observation_space.shape,
                    self.env.action_space.n,
                    data_format='channels_first' \
                            if self.args.dynamic_channel \
                            else 'channels_last')

        self.acs, acs = {}, {}
        for idx, (name, action_size) in enumerate(
                self.action_sizes.items()):
            # [B, action_size]
            self.acs[name] = tf.placeholder(
                    tf.int32, [None, None], name=f"{name}_in")
            acs[name] = tf.one_hot(self.acs[name], np.prod(action_size))

        self.adv = adv = tf.placeholder(
                tf.float32, [None, self.env.episode_length], name="adv")
        self.r = r = tf.placeholder(
                tf.float32, [None, self.env.episode_length], name="r")

        self.loss = 0
        bsz = tf.to_float(tf.shape(pi.x)[0])

        for name in self.action_sizes:
            ac = acs[name]
            self.logit = pi.logits[name]

            log_prob_tf = tf.nn.log_softmax(self.logit)
            prob_tf = tf.nn.softmax(self.logit)

            # policy-gradient loss (note the minus sign: the total loss is minimized)
            pi_loss = -tf.reduce_sum(
                    tf.reduce_sum(log_prob_tf * ac, [-1]) * adv)

            # loss of value function
            self.vf_loss = 0.5 * tf.reduce_sum(tf.square(pi.vf - r))
            entropy = - tf.reduce_sum(prob_tf * log_prob_tf)

            self.loss += pi_loss + 0.5 * self.vf_loss \
                    - entropy * self.args.entropy_coeff

        grads = tf.gradients(self.loss, pi.var_list)

        # summarize only the last state
        tf.summary.image("last_state", self.env.denorm(pi.x[:,-1]))
        tf.summary.scalar("env/r", tf.reduce_mean(self.r[:,-1]))
        tf.summary.scalar("model/policy_loss", pi_loss / bsz)
        tf.summary.scalar("model/value_loss", self.vf_loss / bsz)
        tf.summary.scalar("model/entropy", entropy / bsz)
        tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads))
        tf.summary.scalar("model/var_global_norm", tf.global_norm(pi.var_list))

        self.summary_op = tf.summary.merge_all()
        grads, _ = tf.clip_by_global_norm(grads, self.args.grad_clip)

        grads_and_vars = list(zip(grads, self.global_network.var_list))
        policy_inc_step = self.policy_step.assign_add(tf.shape(pi.x)[0])

        # each worker has a different set of adam optimizer parameters
        opt = tf.train.AdamOptimizer(self.args.policy_lr)

        self.train_op = tf.group(
                opt.apply_gradients(grads_and_vars), policy_inc_step)
        self.summary_writer = None
Example #14
import gym
import ppo_agent
import models
import mujoco_py

env = gym.make('Humanoid-v1')

num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

print('The number of states is ' + str(num_inputs))
print('The number of actions is ' + str(num_actions))

policy_network = models.Policy(num_inputs, num_actions)
value_network = models.Value(num_inputs)

ppo_man = ppo_agent.ppo_brain(env, policy_network, value_network, use_cuda=False)
ppo_man.test_network('saved_models/Humanoid-v1/policy_net_model_400.pt')


Example #15
torch.set_default_tensor_type('torch.DoubleTensor')

if __name__ == '__main__':
    env_name = 'Pendulum-v0'
    save_path = 'saved_models/Pendulum-v0/'
    # the number of cpu...
    num_of_workers = multiprocessing.cpu_count()

    env = gym.make(env_name)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    # build up the center network....
    value_network_global = models.Value(num_inputs)
    policy_network_global = models.Policy(num_inputs, num_actions)

    value_network_global.share_memory()
    policy_network_global.share_memory()

    # build up the workers...
    workers = []
    processor = []

    #worker_test = A3C_Workers(env_name)
    #worker_test.test_the_network(path='saved_models/policy_model_3700.pt')
    for idx in range(num_of_workers):
        if idx == 0:
            workers.append(A3C_Workers(env_name, True))
        else:
            workers.append(A3C_Workers(env_name))