Example #1
    def __init__(
        self,
        log_interval=10,
        lr=1e-5,
        use_cuda=False,
        verbose=0,
        log_tensorboard=False,
        path="rnd_model/",
    ):
        self.predictor = predictor_generator()
        self.target = target_generator()
        for param in self.target.parameters():
            param.requires_grad = False
        self.target.eval()

        self.log_interval = log_interval
        self.optimizer = torch.optim.Adam(self.predictor.parameters(), lr=lr)
        self.loss_function = torch.nn.MSELoss(reduction='mean')

        self.device = torch.device('cuda' if use_cuda else 'cpu')
        self.target.to(self.device)
        self.predictor.to(self.device)

        self.running_stats = RunningMeanStd()

        self.verbose = verbose
        self.writer = SummaryWriter() if log_tensorboard else None
        self.n_iter = 0

        self.save_path = path
        Path(path).mkdir(parents=True, exist_ok=True)

        self.early_stopping = EarlyStopping(save_dir=self.save_path)
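
The constructor above wires up a frozen target, a trainable predictor, and running_stats, but the intrinsic-reward computation itself is not shown. A hedged sketch of the usual RND formulation (the function name and the 1e-8 term are assumptions, not code from the original project):

import numpy as np
import torch

def rnd_intrinsic_reward(predictor, target, running_stats, states, device="cpu"):
    # Intrinsic reward = prediction error against the frozen random target network.
    states = torch.as_tensor(states, dtype=torch.float32, device=device)
    with torch.no_grad():
        error = (predictor(states) - target(states)).pow(2).mean(dim=-1)
    reward = error.cpu().numpy()
    running_stats.update(reward)                      # keep the reward scale stable
    return reward / np.sqrt(running_stats.var + 1e-8)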
Example #2
    def __init__(self,
                 input_size=8,
                 learning_rate=1e-4,
                 verbose=1,
                 use_cuda=False,
                 tensorboard=False):
        self.target = torch.nn.Sequential(torch.nn.Linear(input_size, 64),
                                          torch.nn.Linear(64, 128),
                                          torch.nn.Linear(128, 64))

        self.predictor = torch.nn.Sequential(torch.nn.Linear(input_size, 64),
                                             torch.nn.Linear(64, 128),
                                             torch.nn.Linear(128, 128),
                                             torch.nn.Linear(128, 64))

        self.loss_function = torch.nn.MSELoss(reduction='mean')
        self.optimizer = torch.optim.Adam(self.predictor.parameters(),
                                          lr=learning_rate)
        for param in self.target.parameters():
            param.requires_grad = False
        self.verbose = verbose
        self.tensorboard = tensorboard
        if self.tensorboard:
            self.summary = SummaryWriter()
        self.iteration = 0

        self.device = torch.device('cuda' if use_cuda else 'cpu')
        self.target.to(self.device)
        self.predictor.to(self.device)

        self.running_stats = RunningMeanStd()
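
Every snippet in this listing imports RunningMeanStd from a local module that is not shown. For orientation, here is a minimal sketch in the spirit of the OpenAI Baselines helper; the epsilon/shape argument names and the std property are assumptions inferred from how the examples below use the class:

import numpy as np

class RunningMeanStd:
    """Tracks mean and variance of a data stream with a batched Welford/Chan update."""
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.update_from_moments(x.mean(axis=0), x.var(axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        self.mean, self.var, self.count = new_mean, m_2 / tot_count, tot_count

    @property
    def std(self):
        return np.sqrt(self.var)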
Example #3
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """
    def __init__(self,
                 venv,
                 ob=True,
                 ret=False,
                 clipob=5.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8,
                 use_tf=False):
        VecEnvWrapper.__init__(self, venv)
        if use_tf:
            from running_mean_std import TfRunningMeanStd
            self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape,
                                           scope='ob_rms') if ob else None
            self.ret_rms = TfRunningMeanStd(shape=(),
                                            scope='ret_rms') if ret else None
        else:
            from running_mean_std import RunningMeanStd
            self.ob_rms = RunningMeanStd(
                shape=self.observation_space.shape) if ob else None
            self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)
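
A hedged usage sketch for the wrapper above; it assumes a Baselines-style DummyVecEnv is available to vectorize a few gym environments (the import path and environment id are illustrative):

import gym
import numpy as np
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make("Pendulum-v0") for _ in range(4)])
venv = VecNormalize(venv, ob=True, ret=True)    # normalize obs and scale rewards

obs = venv.reset()                              # already filtered by _obfilt
actions = np.stack([venv.action_space.sample() for _ in range(venv.num_envs)])
obs, rews, dones, infos = venv.step(actions)    # rews divided by running return std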
Example #4
 def __init__(self, env_params, gamma, clip_obs=5, clip_rew=5, eps=1e-8):
     with tf.variable_scope('obs_rms'):
         self.obs_rms = RunningMeanStd(shape=(env_params['observation'], ))
     with tf.variable_scope('ret_rms'):
         self.ret_rms = RunningMeanStd(shape=(1, ))
     self.clip_obs = clip_obs
     self.clip_rew = clip_rew
     self.epsilon = eps
     self.disc_reward = np.array([0])
     self.gamma = gamma
Example #5
class Normalizer:
    """
    Normalizes states and rewards using running means and running stds. Based on OpenAI's Stable Baselines.
    """
    def __init__(self, env_params, gamma, clip_obs=5, clip_rew=5, eps=1e-8):
        with tf.variable_scope('obs_rms'):
            self.obs_rms = RunningMeanStd(shape=(env_params['observation'], ))
        with tf.variable_scope('ret_rms'):
            self.ret_rms = RunningMeanStd(shape=(1, ))
        self.clip_obs = clip_obs
        self.clip_rew = clip_rew
        self.epsilon = eps
        self.disc_reward = np.array([0])
        self.gamma = gamma

    def normalize_state(self, obs, training=True):

        observation = obs
        if training:
            self.obs_rms.update(np.array(observation))
        observation = np.clip((observation - self.obs_rms.mean) /
                              np.sqrt(self.obs_rms.var + self.epsilon),
                              -self.clip_obs, self.clip_obs)
        return observation

    def normalize_reward(self, reward, training=True):

        if training:
            self.disc_reward = self.disc_reward * self.gamma + reward
            self.ret_rms.update(self.disc_reward.flatten())
        r = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon),
                    -self.clip_rew, self.clip_rew)
        return r

    @staticmethod
    def load(load_path, venv):
        """
        Loads a saved Normalizer object.

        :param load_path: the path to load from.
        :param venv: unused here; kept for API compatibility.
        :return: (Normalizer)
        """
        with open(load_path, "rb") as file_handler:
            norm = pickle.load(file_handler)

        return norm

    def save(self, save_path):
        with open(save_path, "wb") as file_handler:
            pickle.dump(self, file_handler)
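
A brief usage sketch for the Normalizer above (hedged: the observation size and file name are illustrative, and load is assumed to be callable as a static method):

import numpy as np

normalizer = Normalizer({'observation': 8}, gamma=0.99)

obs_batch = np.random.randn(1, 8)                        # batch axis first
norm_obs = normalizer.normalize_state(obs_batch)         # updates obs_rms, clips to +/- clip_obs
norm_rew = normalizer.normalize_reward(np.array([0.5]))  # updates discounted-return stats

normalizer.save('normalizer.pkl')
restored = Normalizer.load('normalizer.pkl', venv=None)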
Example #6
    def __init__(self, env, test_env, env_name, n_iterations, agent, epochs,
                 mini_batch_size, epsilon, horizon):
        self.env = env
        self.env_name = env_name
        self.test_env = test_env
        self.agent = agent
        self.epsilon = epsilon
        self.horizon = horizon
        self.epochs = epochs
        self.mini_batch_size = mini_batch_size
        self.n_iterations = n_iterations

        self.start_time = 0
        self.state_rms = RunningMeanStd(shape=(self.agent.n_states, ))

        self.running_reward = 0
Example #7
    def __init__(self, memory, nb_status, nb_actions, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.),
                 actor_lr=1e-4, critic_lr=1e-3):
        self.nb_status = nb_status
        self.nb_actions = nb_actions
        self.action_range = action_range
        self.observation_range = observation_range
        self.normalize_observations = normalize_observations

        self.actor = Actor(self.nb_status, self.nb_actions)
        self.actor_target = Actor(self.nb_status, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic(self.nb_status, self.nb_actions)
        self.critic_target = Critic(self.nb_status, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        # Create replay buffer
        self.memory = memory  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.action_noise = action_noise

        # Hyper-parameters
        self.batch_size = batch_size
        self.tau = tau
        self.discount = gamma

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd()
        else:
            self.obs_rms = None
Example #8
    def __init__(self,
                 obs,
                 action_space,
                 hid_size,
                 num_hidden_layers,
                 num_sub_policies,
                 gaussian_fixed_var=True):

        super(PolicyNet, self).__init__()
        self.obs = obs
        self.action_space = action_space
        self.num_hidden_layers = num_hidden_layers
        self.num_sub_policies = num_sub_policies
        self.gaussian_fixed_var = gaussian_fixed_var
        self.hid_size = hid_size
        self.ob_rms = RunningMeanStd(shape=(self.obs.get_shape()[1], ))
        obz = np.clip((self.obs - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                      5.0)
        last_out = t.FloatTensor(obz)

        # Build the hidden stack: one Linear -> Tanh pair per hidden layer.
        n_in = int(self.obs.get_shape()[1])
        layers = []
        for _ in range(self.num_hidden_layers):
            layers.append(nn.Linear(n_in, self.hid_size))
            layers.append(nn.Tanh())
            n_in = self.hid_size
        self.hidden_layers = nn.Sequential(*layers)
Example #9
 def __init__(self,
              venv,
              ob=True,
              ret=False,
              clipob=5.,
              cliprew=10.,
              gamma=0.99,
              epsilon=1e-8,
              use_tf=False):
     VecEnvWrapper.__init__(self, venv)
     if use_tf:
         from running_mean_std import TfRunningMeanStd
         self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape,
                                        scope='ob_rms') if ob else None
         self.ret_rms = TfRunningMeanStd(shape=(),
                                         scope='ret_rms') if ret else None
     else:
         from running_mean_std import RunningMeanStd
         self.ob_rms = RunningMeanStd(
             shape=self.observation_space.shape) if ob else None
         self.ret_rms = RunningMeanStd(shape=()) if ret else None
     self.clipob = clipob
     self.cliprew = cliprew
     self.ret = np.zeros(self.num_envs)
     self.gamma = gamma
     self.epsilon = epsilon
Example #10
    def __init__(self,
                 memory,
                 nb_status,
                 nb_actions,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 actor_lr=1e-4,
                 critic_lr=1e-3):
        self.nb_status = nb_status
        self.nb_actions = nb_actions
        self.action_range = action_range
        self.observation_range = observation_range
        self.normalize_observations = normalize_observations

        self.actor = Actor(self.nb_status, self.nb_actions)
        self.actor_target = Actor(self.nb_status, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic(self.nb_status, self.nb_actions)
        self.critic_target = Critic(self.nb_status, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        # Create replay buffer
        self.memory = memory  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.action_noise = action_noise

        # Hyper-parameters
        self.batch_size = batch_size
        self.tau = tau
        self.discount = gamma

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd()
        else:
            self.obs_rms = None
Example #11
    def start_interaction(self, env_fns, dynamics, nlump=2):
        param_list = self.stochpol.param_list + self.dynamics.param_list + self.dynamics.auxiliary_task.param_list  # shared parameter references, not a deep copy
        self.optimizer = torch.optim.Adam(param_list, lr=self.lr)
        self.optimizer.zero_grad()

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)
        self.nlump = nlump
        self.lump_stride = nenvs // self.nlump
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
                   spaces=[self.ob_space, self.ac_space])
            for l in range(self.nlump)
        ]

        self.rollout = Rollout(ob_space=self.ob_space,
                               ac_space=self.ac_space,
                               nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env,
                               nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics)

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()
Example #12
 def __init__(self, obs_shape_list, sess=None, summary_writer=None):
     self.sess = sess
     _obs_shape_list = obs_shape_list
     self.summary_writer = summary_writer
     action_shape = (1, 8)
     self.BS = 1
     # self.full_stt_rms = RunningMeanStd(shape=obs_shape_list[1])
     self.s_t0_rms = RunningMeanStd(shape=_obs_shape_list[0])  # (100,100,3)
     self.s_t1_rms = RunningMeanStd(shape=_obs_shape_list[1])  # (7,) pos
     self.s_t2_rms = RunningMeanStd(shape=_obs_shape_list[2])  # (7,) vel
     self.s_t3_rms = RunningMeanStd(shape=_obs_shape_list[3])  # (7,) eff
     self.s_t4_rms = RunningMeanStd(shape=_obs_shape_list[4])  # (1,) grip
     self.s_t5_rms = RunningMeanStd(shape=_obs_shape_list[5])  # (7,) ee
     self.s_t6_rms = RunningMeanStd(shape=_obs_shape_list[6])  # (3,) aux
     self.a_t_rms = RunningMeanStd(shape=action_shape)  # (1, 8) action
Example #13
    def __init__(self,
                 name,
                 ob,
                 ac_space,
                 network='mlp',
                 gaussian_fixed_var=True,
                 nsteps=None,
                 nbatch=None,
                 nlstm=256,
                 states=None,
                 masks=None,
                 reuse=False):
        self.network = network

        shape = []
        for d in range(1, len(ob.shape)):
            shape.append(ob.shape[d])

        with tf.variable_scope(name, reuse=reuse):
            self.scope = tf.get_variable_scope().name

            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=shape)
            obs = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)

            if network == 'mlp':
                hid_size = 64
                num_hid_layers = 2
                self.hid_size = hid_size
                self.num_hid_layers = num_hid_layers
                self.gaussian_fixed_var = gaussian_fixed_var
                self._mlp(obs, hid_size, num_hid_layers, ac_space,
                          gaussian_fixed_var)
            elif network == 'cnn':
                self._cnn(obs, ac_space, gaussian_fixed_var)
            elif network == 'lstm':
                assert nsteps is not None and nbatch is not None
                assert states is not None and masks is not None
                assert isinstance(nsteps, int) and isinstance(nbatch, int)
                assert nsteps > 0 and nbatch > 0
                self._lstm(obs, states, masks, nlstm, ac_space, nbatch, nsteps)

        # sample actions
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        if network == 'mlp' or network == 'cnn':
            self._act = U.function([stochastic, ob], [ac, self.vpred])
        elif network == 'lstm':
            self._act = U.function([stochastic, ob, states, masks],
                                   [ac, self.vpred, self.snew])
Example #14
    def __init__(self,
                 name,
                 ob,
                 ac_space,
                 num_subpolicies,
                 network='mlp',
                 gaussian_fixed_var=True):
        self.num_subpolicies = num_subpolicies
        self.gaussian_fixed_var = gaussian_fixed_var
        shape = []
        for d in range(1, len(ob.shape)):
            shape.append(ob.shape[d])

        with tf.variable_scope("obfilter", reuse=tf.AUTO_REUSE):
            self.ob_rms = RunningMeanStd(shape=shape)
        obs = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)

        with tf.variable_scope(name):
            self.scope = tf.get_variable_scope().name

            if network == 'mlp':
                hid_size = 64
                num_hid_layers = 2
                self.hid_size = hid_size
                self.num_hid_layers = num_hid_layers
                self._mlp(obs, num_subpolicies, hid_size, num_hid_layers,
                          ac_space, gaussian_fixed_var)
            elif network == 'cnn':
                self._cnn(obs, num_subpolicies)

        # sample actions
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

        # debug
        self._debug = U.function([stochastic, ob], [ac, self.selector])
        self._act_forced = U.function([stochastic, ob, self.selector],
                                      [ac, self.vpred])
Example #15
def train(variant):
    set_global_seeds(variant['seed'])

    if variant['mode'] == 'local':
        import colored_traceback.always
    '''
    Set-up folder and files
    '''
    snapshot_dir = logger.get_snapshot_dir()
    working_dir = config.PROJECT_PATH
    param_path = os.path.join(working_dir, 'params/params.json')
    # copyfile(param_path, os.path.join(snapshot_dir,'params.json'))

    try:
        '''
        Save parameters
        '''
        if 'params' in variant:
            logger.log('Load params from variant.')
            params = variant['params']
        else:
            logger.log('Load params from file.')
            with open(param_path, 'r') as f:
                params = json.load(f)

        # Save to snapshot dir
        new_param_path = os.path.join(snapshot_dir, 'params.json')
        with open(new_param_path, 'w') as f:
            json.dump(params,
                      f,
                      sort_keys=True,
                      indent=4,
                      separators=(',', ': '))

        # TODO: can use variant to modify here.
        dynamics_opt_params = params['dynamics_opt_params']
        dynamics_opt_params['stop_critereon'] = stop_critereon(
            threshold=dynamics_opt_params['stop_critereon']['threshold'],
            offset=dynamics_opt_params['stop_critereon']['offset'])
        dynamics_opt_params = Dynamics_opt_params(**dynamics_opt_params)

        policy_opt_params = params['policy_opt_params']
        policy_opt_params['stop_critereon'] = stop_critereon(
            threshold=policy_opt_params['stop_critereon']['threshold'],
            offset=policy_opt_params['stop_critereon']['offset'],
            percent_models_threshold=policy_opt_params['stop_critereon']
            ['percent_models_threshold'])
        policy_opt_params = Policy_opt_params(**policy_opt_params)

        rollout_params = params['rollout_params']
        rollout_params['monitorpath'] = os.path.join(snapshot_dir, 'videos')
        rollout_params = Rollout_params(**rollout_params)

        assert params['rollout_params']['max_timestep'] == \
               params['policy_opt_params']['oracle_maxtimestep'] == \
               params['policy_opt_params']['T']
        '''
        Policy model
        '''
        def build_policy_from_rllab(scope_name='training_policy'):
            '''
            Return both rllab policy and policy model function.
            '''
            sess = tf.get_default_session()

            ### Initialize training_policy to copy from policy
            from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
            output_nonlinearity = eval(params['policy']['output_nonlinearity'])

            training_policy = GaussianMLPPolicy(
                name=scope_name,
                env_spec=env.spec,
                hidden_sizes=params['policy']['hidden_layers'],
                init_std=policy_opt_params.trpo['init_std'],
                output_nonlinearity=output_nonlinearity)
            training_policy_vars = tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES, scope='training_policy')
            sess.run([tf.variables_initializer(training_policy_vars)])

            ### Compute policy model function using the same weights.
            training_layers = training_policy._mean_network.layers

            def policy_model(x, stochastic=0.0, collect_summary=False):
                assert (training_layers[0].shape[1] == x.shape[1])
                h = x
                for i, layer in enumerate(training_layers[1:]):
                    w = layer.W
                    b = layer.b
                    pre_h = tf.matmul(h, w) + b
                    h = layer.nonlinearity(pre_h, name='policy_out')
                    if collect_summary:
                        with tf.name_scope(scope_name + '/observation'):
                            variable_summaries(x)
                        with tf.name_scope(scope_name + '/layer%d' % i):
                            with tf.name_scope('weights'):
                                variable_summaries(w)
                            with tf.name_scope('biases'):
                                variable_summaries(b)
                            with tf.name_scope('Wx_plus_b'):
                                tf.summary.histogram('pre_activations', pre_h)
                            tf.summary.histogram('activations', h)
                std = training_policy._l_std_param.param
                h += stochastic * tf.random_normal(
                    shape=(tf.shape(x)[0], n_actions)) * tf.exp(std)
                return h

            return training_policy, policy_model

        '''
        Dynamics model
        '''

        def get_value(key, dict):
            return key in dict and dict[key]

        def prepare_input(xgu, xgu_norm, scope_name, variable_name,
                          collect_summary, prediction_type):
            name_scope = '%s/%s' % (scope_name, variable_name)
            assert n_states > 1 and n_actions > 1 \
                   and xgu.shape[1] == n_states + n_actions + n_goals
            xu = tf.concat([xgu[:, :n_states], xgu[:, n_states + n_goals:]],
                           axis=1)
            xu_norm = tf.concat(
                [xgu_norm[:, :n_states], xgu_norm[:, n_states + n_goals:]],
                axis=1)
            # Collect data summaries
            if collect_summary:
                with tf.name_scope(name_scope + '/inputs'):
                    with tf.name_scope('states'):
                        data_summaries(xgu[:, :n_states])
                    with tf.name_scope('goals'):
                        data_summaries(xgu[:, n_states:n_states + n_goals])
                    with tf.name_scope('actions'):
                        data_summaries(xgu[:, n_states + n_goals:])
            # Ignore xy in the current state.
            if get_value('ignore_xy_input', params['dynamics_model']):
                n_inputs = n_states + n_actions - 2
                nn_input = xu_norm[:, 2:]
            elif get_value('ignore_x_input', params['dynamics_model']):
                n_inputs = n_states + n_actions - 1
                nn_input = xu_norm[:, 1:]
            else:
                n_inputs = n_states + n_actions
                nn_input = xu_norm
            hidden_layers = list(params['dynamics_model']['hidden_layers'])
            nonlinearity = [
                eval(_x) for _x in params['dynamics_model']['nonlinearity']
            ]
            assert (len(nonlinearity) == len(hidden_layers))
            # Verify if the input type is valid.
            if prediction_type == 'state_change' or \
                            prediction_type == 'state_change_goal':
                n_outputs = n_states
            else:
                assert prediction_type == 'second_derivative' or \
                       prediction_type == 'second_derivative_goal'
                n_outputs = int(n_states / 2)
            nonlinearity.append(tf.identity)
            hidden_layers.append(n_outputs)
            return xu, nn_input, n_inputs, n_outputs, \
                   nonlinearity, hidden_layers

        def build_ff_neural_net(nn_input,
                                n_inputs,
                                hidden_layers,
                                nonlinearity,
                                scope_name,
                                variable_name,
                                collect_summary,
                                logit_weights=None,
                                initializer=layers.xavier_initializer()):
            assert len(hidden_layers) == len(nonlinearity)
            name_scope = '%s/%s' % (scope_name, variable_name)
            h = nn_input
            n_hiddens = n_inputs
            n_hiddens_next = hidden_layers[0]
            for i in range(len(hidden_layers)):
                w = get_scope_variable(scope_name,
                                       "%s/layer%d/weights" %
                                       (variable_name, i),
                                       shape=(n_hiddens, n_hiddens_next),
                                       initializer=initializer)
                b = get_scope_variable(scope_name,
                                       "%s/layer%d/biases" %
                                       (variable_name, i),
                                       shape=(n_hiddens_next, ),
                                       initializer=initializer)
                if collect_summary:
                    with tf.name_scope(name_scope + '/layer%d' % i):
                        with tf.name_scope('weights'):
                            variable_summaries(w)
                        with tf.name_scope('biases'):
                            variable_summaries(b)
                        with tf.name_scope('Wx_plus_b'):
                            pre_h = tf.matmul(h, w) + b
                            tf.summary.histogram('pre_activations', pre_h)
                        h = nonlinearity[i](pre_h, name='activation')
                        tf.summary.histogram('activations', h)
                else:
                    pre_h = tf.matmul(h, w) + b
                    h = nonlinearity[i](pre_h, name='activation')
                n_hiddens = hidden_layers[i]
                if i + 1 < len(hidden_layers):
                    n_hiddens_next = hidden_layers[i + 1]
                if logit_weights is not None and i == len(hidden_layers) - 2:
                    h *= logit_weights
            return h

        def build_dynamics_model(n_states,
                                 n_actions,
                                 n_goals,
                                 dt=None,
                                 input_rms=None,
                                 diff_rms=None):
            prediction_type = params['dynamics_model']['prediction_type']

            def dynamics_model(xgu,
                               scope_name,
                               variable_name,
                               collect_summary=False):
                '''
                :param xu: contains states, goals, actions
                :param scope_name:
                :param variable_name:
                :param dt:
                :return:
                '''
                xu, nn_input, n_inputs, n_outputs, nonlinearity, hidden_layers = \
                    prepare_input(xgu,
                                  (xgu - input_rms.mean)/input_rms.std,
                                  scope_name,
                                  variable_name,
                                  collect_summary,
                                  prediction_type)

                if "use_logit_weights" in params["dynamics_model"] and params[
                        "dynamics_model"]["use_logit_weights"]:
                    logit_weights = build_ff_neural_net(
                        nn_input, n_inputs, hidden_layers[:-1],
                        nonlinearity[:-2] + [tf.nn.sigmoid], scope_name,
                        variable_name + '_sig', collect_summary)
                else:
                    logit_weights = None
                nn_output = build_ff_neural_net(nn_input,
                                                n_inputs,
                                                hidden_layers,
                                                nonlinearity,
                                                scope_name,
                                                variable_name,
                                                collect_summary,
                                                logit_weights=logit_weights)

                # predict the delta instead (x_next-x_current)
                if 'state_change' in prediction_type:
                    next_state = tf.add(
                        diff_rms.mean[:n_states] +
                        diff_rms.std[:n_outputs] * nn_output, xu[:, :n_states])
                else:
                    assert 'second_derivative' in prediction_type
                    # We train 'out' to match state_dot_dot
                    # Currently only works for swimmer.
                    qpos = xu[:, :n_outputs] + dt * xu[:, n_outputs:n_states]
                    qvel = xu[:, n_outputs:n_states] + dt * nn_output
                    next_state = tf.concat([qpos, qvel], axis=1)
                if '_goal' in prediction_type:
                    assert n_goals > 1
                    g = xgu[:, n_states:n_states + n_goals]
                    next_state = tf.concat([next_state, g], axis=1)
                return tf.identity(next_state,
                                   name='%s/%s/dynamics_out' %
                                   (scope_name, variable_name))

            return dynamics_model

        def get_regularizer_loss(scope_name, variable_name):
            if params['dynamics_model']['regularization']['method'] in [
                    None, ''
            ]:
                return tf.constant(0.0, dtype=tf.float32)
            constant = params['dynamics_model']['regularization']['constant']
            regularizer = eval(
                params['dynamics_model']['regularization']['method'])
            hidden_layers = params['dynamics_model']['hidden_layers']
            reg_loss = 0.0
            for i in range(len(hidden_layers) + 1):
                w = get_scope_variable(
                    scope_name, "%s/layer%d/weights" % (variable_name, i))
                b = get_scope_variable(
                    scope_name, "%s/layer%d/biases" % (variable_name, i))
                reg_loss += regularizer(w) + regularizer(b)
            return constant * reg_loss

        '''
        Main
        '''
        # with get_session() as sess:
        if variant['mode'] == 'local':
            sess = get_session(interactive=True, mem_frac=0.1)
        else:
            sess = get_session(interactive=True,
                               mem_frac=1.0,
                               use_gpu=variant['use_gpu'])

        # data = joblib.load(os.path.join(working_dir, params['trpo_path']))
        env = get_env(variant['params']['env'])

        # policy = data['policy']
        training_policy, policy_model = build_policy_from_rllab()
        if hasattr(env._wrapped_env, '_wrapped_env'):
            inner_env = env._wrapped_env._wrapped_env
        else:
            inner_env = env._wrapped_env.env.unwrapped
        n_obs = inner_env.observation_space.shape[0]
        n_actions = inner_env.action_space.shape[0]
        cost_np = inner_env.cost_np
        cost_tf = inner_env.cost_tf
        cost_np_vec = inner_env.cost_np_vec
        if hasattr(inner_env, 'n_goals'):
            n_goals = inner_env.n_goals
            n_states = inner_env.n_states
            assert n_goals + n_states == n_obs
        else:
            n_goals = 0
            n_states = n_obs
        dt = None
        # Only necessary for second_derivative
        if hasattr(inner_env, 'model') and hasattr(inner_env, 'frame_skip'):
            dt = inner_env.model.opt.timestep * inner_env.frame_skip
        from running_mean_std import RunningMeanStd
        with tf.variable_scope('input_rms'):
            input_rms = RunningMeanStd(epsilon=0.0,
                                       shape=(n_states + n_goals + n_actions))
        with tf.variable_scope('diff_rms'):
            diff_rms = RunningMeanStd(epsilon=0.0, shape=(n_states + n_goals))
        dynamics_model = build_dynamics_model(n_states=n_states,
                                              n_actions=n_actions,
                                              n_goals=n_goals,
                                              dt=dt,
                                              input_rms=input_rms,
                                              diff_rms=diff_rms)

        kwargs = {}
        kwargs['input_rms'] = input_rms
        kwargs['diff_rms'] = diff_rms
        kwargs['mode'] = variant['mode']

        if params['algo'] == 'vpg':
            from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
            from algos.vpg import VPG
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = VPG(
                env=env,
                policy=training_policy,
                baseline=baseline,
                batch_size=policy_opt_params.vpg['batch_size'],
                max_path_length=policy_opt_params.T,
                discount=policy_opt_params.vpg['discount'],
            )
            kwargs['rllab_algo'] = algo
            if params["policy_opt_params"]["vpg"]["reset"]:
                kwargs['reset_opt'] = tf.assign(
                    training_policy._l_std_param.param,
                    np.log(params["policy_opt_params"]["vpg"]["init_std"]) *
                    np.ones(n_actions))
        elif params['algo'] == 'trpo':
            ### Write down baseline and algo
            from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
            from algos.trpo import TRPO
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = TRPO(
                env=env,
                policy=training_policy,
                baseline=baseline,
                batch_size=policy_opt_params.trpo['batch_size'],
                max_path_length=policy_opt_params.T,
                discount=policy_opt_params.trpo['discount'],
                step_size=policy_opt_params.trpo['step_size'],
            )
            kwargs['rllab_algo'] = algo
            if params["policy_opt_params"]["trpo"]["reset"]:
                kwargs['reset_opt'] = tf.assign(
                    training_policy._l_std_param.param,
                    np.log(params["policy_opt_params"]["trpo"]["init_std"]) *
                    np.ones(n_actions))
            # if "decay_rate" in params["policy_opt_params"]["trpo"]:
            #     kwargs['trpo_std_decay'] = tf.assign_sub(training_policy._l_std_param.param,
            #     np.log(params["policy_opt_params"]["trpo"]["decay_rate"])*np.ones(n_actions))
        kwargs['inner_env'] = inner_env
        kwargs['algo_name'] = params['algo']
        kwargs['logstd'] = training_policy._l_std_param.param
        # Save initial policy
        joblib.dump(training_policy,
                    os.path.join(snapshot_dir, 'params-initial.pkl'))

        train_models(env=env,
                     dynamics_model=dynamics_model,
                     dynamics_opt_params=dynamics_opt_params,
                     get_regularizer_loss=get_regularizer_loss,
                     policy_model=policy_model,
                     policy_opt_params=policy_opt_params,
                     rollout_params=rollout_params,
                     cost_np=cost_np,
                     cost_np_vec=cost_np_vec,
                     cost_tf=cost_tf,
                     snapshot_dir=snapshot_dir,
                     working_dir=working_dir,
                     n_models=params['n_models'],
                     sweep_iters=params['sweep_iters'],
                     sample_size=params['sample_size'],
                     verbose=False,
                     variant=variant,
                     saved_policy=training_policy,
                     **kwargs)  # Make sure not to reinitialize TRPO policy.

        # Save the final policy
        joblib.dump(training_policy, os.path.join(snapshot_dir, 'params.pkl'))

    except Exception as e:
        rmtree(snapshot_dir)
        import sys, traceback
        # traceback.print_exception(*sys.exc_info())
        from IPython.core.ultratb import ColorTB
        c = ColorTB()
        exc = sys.exc_info()
        print(''.join(c.structured_traceback(*exc)))
        print('Removed the experiment folder %s.' % snapshot_dir)
Example #16
# Build the key classes
if args.logger == "wandb":
    tracker = WandBTracker(args.name, args)
else:
    tracker = ConsoleTracker(args.name, args)
game_player = GamePlayer(args, shared_obs)
if action_type == "discrete":
    dist = Discrete(args.num_actions)
elif action_type == "continuous":
    dist = Normal(args.num_actions)
if args.model == "cnn":
    model = CNNBase(1, args.num_actions, dist).to(device)
elif args.model == "mlp":
    model = MLPBase(args.num_obs, args.num_actions, dist).to(device)
optim = torch.optim.Adam(model.parameters(), lr=args.lr)
reward_normalizer = RunningMeanStd(shape=())
obs_normalizer = RunningMeanStd(shape=(args.num_obs, ))

# Main loop
i = 0
for i in range(args.num_iterations):
    # Run num_steps of the game in each worker and accumulate results in
    # the data arrays
    game_player.run_rollout(args, shared_obs, rewards, discounted_rewards,
                            values, policy_probs, actions, model,
                            obs_normalizer, device, episode_ends)

    observations = shared_obs.copy()

    if args.model == "mlp":
        # Normalize rewards
Example #17
class DDPG(object):
    def __init__(self, memory, nb_status, nb_actions, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.),
                 actor_lr=1e-4, critic_lr=1e-3):
        self.nb_status = nb_status
        self.nb_actions = nb_actions
        self.action_range = action_range
        self.observation_range = observation_range
        self.normalize_observations = normalize_observations

        self.actor = Actor(self.nb_status, self.nb_actions)
        self.actor_target = Actor(self.nb_status, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic(self.nb_status, self.nb_actions)
        self.critic_target = Critic(self.nb_status, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        # Create replay buffer
        self.memory = memory  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.action_noise = action_noise

        # Hyper-parameters
        self.batch_size = batch_size
        self.tau = tau
        self.discount = gamma

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd()
        else:
            self.obs_rms = None

    def pi(self, obs, apply_noise=True, compute_Q=True):
        obs = np.array([obs])
        action = to_numpy(self.actor(to_tensor(obs))).squeeze(0)
        if compute_Q:
            q = self.critic([to_tensor(obs), to_tensor(action)]).cpu().data
        else:
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise

        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, (q[0][0] if q is not None else None)

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        next_q_values = self.critic_target([
            to_tensor(batch['obs1'], volatile=True),
            self.actor_target(to_tensor(batch['obs1'], volatile=True))])
        next_q_values.volatile = False

        target_q_batch = to_tensor(batch['rewards']) + \
                         self.discount * to_tensor(1 - batch['terminals1'].astype('float32')) * next_q_values

        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(batch['obs0']), to_tensor(batch['actions'])])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(batch['obs0']), self.actor(to_tensor(batch['obs0']))]).mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return value_loss.cpu().data[0], policy_loss.cpu().data[0]

    def initialize(self):
        hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

    def update_target_net(self):
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def reset(self):
        if self.action_noise is not None:
            self.action_noise.reset()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()
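
The class above stores obs_rms but never applies it in pi() or train(). A hedged sketch of the observation-normalization helper such agents typically add; the name and the clipping to observation_range are assumptions that follow the constructor's intent:

import numpy as np

def normalize_observation(obs, obs_rms, observation_range=(-5., 5.)):
    # Standardize with the running statistics, then clip to the configured range.
    if obs_rms is None:
        return obs
    norm = (obs - obs_rms.mean) / np.sqrt(obs_rms.var + 1e-8)
    return np.clip(norm, observation_range[0], observation_range[1])

pi() would then feed normalize_observation(obs, self.obs_rms, self.observation_range) to the actor instead of the raw observation.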
Example #18
class PpoOptimizer(object):
    envs = None

    def __init__(self, *, scope, ob_space, ac_space, stochpol, ent_coef, gamma,
                 lam, nepochs, lr, cliprange, nminibatches, normrew, normadv,
                 use_news, ext_coeff, int_coeff, nsteps_per_seg, nsegs_per_env,
                 dynamics):
        self.dynamics = dynamics
        self.use_recorder = True
        self.n_updates = 0
        self.scope = scope
        self.ob_space = ob_space
        self.ac_space = ac_space
        self.stochpol = stochpol
        self.nepochs = nepochs
        self.lr = lr
        self.cliprange = cliprange
        self.nsteps_per_seg = nsteps_per_seg
        self.nsegs_per_env = nsegs_per_env
        self.nminibatches = nminibatches
        self.gamma = gamma
        self.lam = lam
        self.normrew = normrew
        self.normadv = normadv
        self.use_news = use_news
        self.ent_coef = ent_coef
        self.ext_coeff = ext_coeff
        self.int_coeff = int_coeff

    def start_interaction(self, env_fns, dynamics, nlump=2):
        param_list = self.stochpol.param_list + self.dynamics.param_list + self.dynamics.auxiliary_task.param_list  # shared parameter references, not a deep copy
        self.optimizer = torch.optim.Adam(param_list, lr=self.lr)
        self.optimizer.zero_grad()

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)
        self.nlump = nlump
        self.lump_stride = nenvs // self.nlump
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
                   spaces=[self.ob_space, self.ac_space])
            for l in range(self.nlump)
        ]

        self.rollout = Rollout(ob_space=self.ob_space,
                               ac_space=self.ac_space,
                               nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env,
                               nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics)

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()

    def stop_interaction(self):
        for env in self.envs:
            env.close()

    def calculate_advantages(self, rews, use_news, gamma, lam):
        nsteps = self.rollout.nsteps
        lastgaelam = 0
        for t in range(nsteps - 1, -1, -1):  # nsteps-1 ... 0
            nextnew = (self.rollout.buf_news[:, t + 1]
                       if t + 1 < nsteps else self.rollout.buf_new_last)
            if not use_news:
                nextnew = 0
            nextvals = (self.rollout.buf_vpreds[:, t + 1]
                        if t + 1 < nsteps else self.rollout.buf_vpred_last)
            nextnotnew = 1 - nextnew
            delta = (rews[:, t] + gamma * nextvals * nextnotnew
                     - self.rollout.buf_vpreds[:, t])
            self.buf_advs[:, t] = lastgaelam = (
                delta + gamma * lam * nextnotnew * lastgaelam)
        self.buf_rets[:] = self.buf_advs + self.rollout.buf_vpreds

    def update(self):
        if self.normrew:
            rffs = np.array(
                [self.rff.update(rew) for rew in self.rollout.buf_rews.T])
            rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
            self.rff_rms.update_from_moments(rffs_mean, rffs_std**2,
                                             rffs_count)
            rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
        else:
            rews = np.copy(self.rollout.buf_rews)
        self.calculate_advantages(rews=rews,
                                  use_news=self.use_news,
                                  gamma=self.gamma,
                                  lam=self.lam)

        info = dict(advmean=self.buf_advs.mean(),
                    advstd=self.buf_advs.std(),
                    retmean=self.buf_rets.mean(),
                    retstd=self.buf_rets.std(),
                    vpredmean=self.rollout.buf_vpreds.mean(),
                    vpredstd=self.rollout.buf_vpreds.std(),
                    ev=explained_variance(self.rollout.buf_vpreds.ravel(),
                                          self.buf_rets.ravel()),
                    rew_mean=np.mean(self.rollout.buf_rews),
                    recent_best_ext_ret=self.rollout.current_max)
        if self.rollout.best_ext_ret is not None:
            info['best_ext_ret'] = self.rollout.best_ext_ret

        to_report = {
            'total': 0.0,
            'pg': 0.0,
            'vf': 0.0,
            'ent': 0.0,
            'approxkl': 0.0,
            'clipfrac': 0.0,
            'aux': 0.0,
            'dyn_loss': 0.0,
            'feat_var': 0.0
        }

        # normalize advantages
        if self.normadv:
            m, s = get_mean_and_std(self.buf_advs)
            self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
        envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
        envsperbatch = max(1, envsperbatch)
        envinds = np.arange(self.nenvs * self.nsegs_per_env)

        mblossvals = []

        for _ in range(self.nepochs):
            np.random.shuffle(envinds)
            for start in range(0, self.nenvs * self.nsegs_per_env,
                               envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]

                acs = self.rollout.buf_acs[mbenvinds]
                rews = self.rollout.buf_rews[mbenvinds]
                vpreds = self.rollout.buf_vpreds[mbenvinds]
                nlps = self.rollout.buf_nlps[mbenvinds]
                obs = self.rollout.buf_obs[mbenvinds]
                rets = self.buf_rets[mbenvinds]
                advs = self.buf_advs[mbenvinds]
                last_obs = self.rollout.buf_obs_last[mbenvinds]

                lr = self.lr
                cliprange = self.cliprange

                self.stochpol.update_features(obs, acs)
                self.dynamics.auxiliary_task.update_features(obs, last_obs)
                self.dynamics.update_features(obs, last_obs)

                feat_loss = torch.mean(self.dynamics.auxiliary_task.get_loss())
                dyn_loss = torch.mean(self.dynamics.get_loss())

                acs = torch.tensor(flatten_dims(acs, len(self.ac_space.shape)))
                neglogpac = self.stochpol.pd.neglogp(acs)
                entropy = torch.mean(self.stochpol.pd.entropy())
                vpred = self.stochpol.vpred
                vf_loss = 0.5 * torch.mean(
                    (vpred.squeeze() - torch.tensor(rets))**2)

                nlps = torch.tensor(flatten_dims(nlps, 0))
                ratio = torch.exp(nlps - neglogpac.squeeze())

                advs = flatten_dims(advs, 0)
                negadv = torch.tensor(-advs)
                pg_losses1 = negadv * ratio
                pg_losses2 = negadv * torch.clamp(
                    ratio, min=1.0 - cliprange, max=1.0 + cliprange)
                pg_loss_surr = torch.max(pg_losses1, pg_losses2)
                pg_loss = torch.mean(pg_loss_surr)
                ent_loss = (-self.ent_coef) * entropy

                approxkl = 0.5 * torch.mean((neglogpac - nlps)**2)
                clipfrac = torch.mean(
                    (torch.abs(pg_losses2 - pg_loss_surr) > 1e-6).float())
                feat_var = torch.std(self.dynamics.auxiliary_task.features)

                total_loss = pg_loss + ent_loss + vf_loss + feat_loss + dyn_loss

                total_loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()

                to_report['total'] += total_loss.data.numpy() / (
                    self.nminibatches * self.nepochs)
                to_report['pg'] += pg_loss.data.numpy() / (self.nminibatches *
                                                           self.nepochs)
                to_report['vf'] += vf_loss.data.numpy() / (self.nminibatches *
                                                           self.nepochs)
                to_report['ent'] += ent_loss.data.numpy() / (
                    self.nminibatches * self.nepochs)
                to_report['approxkl'] += approxkl.data.numpy() / (
                    self.nminibatches * self.nepochs)
                to_report['clipfrac'] += clipfrac.data.numpy() / (
                    self.nminibatches * self.nepochs)
                to_report['feat_var'] += feat_var.data.numpy() / (
                    self.nminibatches * self.nepochs)
                to_report['aux'] += feat_loss.data.numpy() / (
                    self.nminibatches * self.nepochs)
                to_report['dyn_loss'] += dyn_loss.data.numpy() / (
                    self.nminibatches * self.nepochs)

        info.update(to_report)
        self.n_updates += 1
        info["n_updates"] = self.n_updates
        info.update({
            dn: (np.mean(dvs) if len(dvs) > 0 else 0)
            for (dn, dvs) in self.rollout.statlists.items()
        })
        info.update(self.rollout.stats)
        if "states_visited" in info:
            info.pop("states_visited")
        tnow = time.time()
        info["ups"] = 1. / (tnow - self.t_last_update)
        info["total_secs"] = tnow - self.t_start
        info['tps'] = self.rollout.nsteps * self.nenvs / (
            tnow - self.t_last_update)  # MPI.COMM_WORLD.Get_size() *
        self.t_last_update = tnow

        return info

    def step(self):
        self.rollout.collect_rollout()
        update_info = self.update()
        return {'update': update_info}

    def get_var_values(self):
        return self.stochpol.get_var_values()

    def set_var_values(self, vv):
        self.stochpol.set_var_values(vv)
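
When normrew is set, the optimizer above also relies on a RewardForwardFilter. A minimal sketch consistent with how update() is called per environment column in update(); it keeps a discounted running sum of rewards whose variance feeds rff_rms:

class RewardForwardFilter(object):
    def __init__(self, gamma):
        self.rewems = None   # running discounted sum, one entry per environment
        self.gamma = gamma

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems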
Example #19
class DDPG(object):
    def __init__(self,
                 memory,
                 nb_status,
                 nb_actions,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 actor_lr=1e-4,
                 critic_lr=1e-3):
        self.nb_status = nb_status
        self.nb_actions = nb_actions
        self.action_range = action_range
        self.observation_range = observation_range
        self.normalize_observations = normalize_observations

        self.actor = Actor(self.nb_status, self.nb_actions)
        self.actor_target = Actor(self.nb_status, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic(self.nb_status, self.nb_actions)
        self.critic_target = Critic(self.nb_status, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        # Create replay buffer
        self.memory = memory  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.action_noise = action_noise

        # Hyper-parameters
        self.batch_size = batch_size
        self.tau = tau
        self.discount = gamma

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd()
        else:
            self.obs_rms = None

    def pi(self, obs, apply_noise=True, compute_Q=True):
        obs = np.array([obs])
        action = to_numpy(self.actor(to_tensor(obs))).squeeze(0)
        if compute_Q:
            q = self.critic([to_tensor(obs), to_tensor(action)]).cpu().data
        else:
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise

        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, (q[0][0] if q is not None else None)

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        next_q_values = self.critic_target([
            to_tensor(batch['obs1'], volatile=True),
            self.actor_target(to_tensor(batch['obs1'], volatile=True))
        ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(batch['rewards']) + \
                         self.discount * to_tensor(1 - batch['terminals1'].astype('float32')) * next_q_values

        self.critic.zero_grad()
        q_batch = self.critic(
            [to_tensor(batch['obs0']),
             to_tensor(batch['actions'])])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()
        policy_loss = -self.critic(
            [to_tensor(batch['obs0']),
             self.actor(to_tensor(batch['obs0']))]).mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return value_loss.item(), policy_loss.item()

    def initialize(self):
        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

    def update_target_net(self):
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def reset(self):
        if self.action_noise is not None:
            self.action_noise.reset()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()
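
The DDPG example above calls soft_update and hard_update helpers that are not included in the snippet. A minimal sketch of what such helpers typically look like, assuming the (target, source, tau) argument order used above; the bodies here are illustrative, not the original repository's code:

import torch


def soft_update_sketch(target, source, tau):
    """Sketch: Polyak averaging, target <- (1 - tau) * target + tau * source."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.copy_(t_param * (1.0 - tau) + s_param * tau)


def hard_update_sketch(target, source):
    """Sketch: copies the source network's weights into the target network."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.copy_(s_param)
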
Beispiel #20
0
def main():
    actor_critic = core.MLPActorCritic
    hidden_size = 64
    activation = torch.nn.Tanh
    seed = 5
    steps_per_epoch = 2048
    epochs = 1000
    gamma = 0.99
    lam = 0.97
    clip_ratio = 0.2
    pi_lr = 3e-4
    vf_lr = 1e-3
    train_pi_iters = 80
    train_vf_iters = 80
    max_ep_len = 1000
    target_kl = 0.01
    save_freq = 10
    obs_norm = True
    view_curve = False

    # make an environment
    #     env = gym.make('CartPole-v0')
    #     env = gym.make('CartPole-v1')
    #     env = gym.make('MountainCar-v0')
    #     env = gym.make('LunarLander-v2')
    env = gym.make('BipedalWalker-v3')
    print(f"reward_threshold: {env.spec.reward_threshold}")

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Random seed
    env.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space,
                      (hidden_size, hidden_size), activation)

    # Set up optimizers for policy and value function
    pi_optimizer = AdamW(ac.pi.parameters(), lr=pi_lr, eps=1e-6)
    vf_optimizer = AdamW(ac.v.parameters(), lr=vf_lr, eps=1e-6)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch)
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Prepare for interaction with environment
    o, ep_ret, ep_len = env.reset(), 0, 0
    ep_num = 0
    ep_ret_buf, eval_ret_buf = [], []
    loss_buf = {'pi': [], 'vf': []}
    obs_normalizer = RunningMeanStd(shape=env.observation_space.shape)
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            env.render()
            if obs_norm:
                obs_normalizer.update(np.array([o]))
                o_norm = np.clip(
                    (o - obs_normalizer.mean) / np.sqrt(obs_normalizer.var),
                    -10, 10)
                a, v, logp = ac.step(
                    torch.as_tensor(o_norm, dtype=torch.float32))
            else:
                a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            if obs_norm:
                buf.store(o_norm, a, r, v, logp)
            else:
                buf.store(o, a, r, v, logp)

            # Update obs
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if timeout or epoch_ended:
                    if obs_norm:
                        obs_normalizer.update(np.array([o]))
                        o_norm = np.clip((o - obs_normalizer.mean) /
                                         np.sqrt(obs_normalizer.var), -10, 10)
                        _, v, _ = ac.step(
                            torch.as_tensor(o_norm, dtype=torch.float32))
                    else:
                        _, v, _ = ac.step(
                            torch.as_tensor(o, dtype=torch.float32))
                else:
                    if obs_norm:
                        obs_normalizer.update(np.array([o]))
                    v = 0
                buf.finish_path(v)
                if terminal:
                    ep_ret_buf.append(ep_ret)
                    eval_ret_buf.append(np.mean(ep_ret_buf[-20:]))
                    ep_num += 1
                    if view_curve:
                        plot(ep_ret_buf, eval_ret_buf, loss_buf)
                    else:
                        print(f'Episode: {ep_num:3}\tReward: {ep_ret:3}')
                    if eval_ret_buf[-1] >= env.spec.reward_threshold:
                        print(f"\n{env.spec.id} is sloved! {ep_num} Episode")
                        torch.save(
                            ac.state_dict(),
                            f'./test/saved_models/{env.spec.id}_ep{ep_num}_clear_model_ppo.pt'
                        )
                        with open(
                                f'./test/saved_models/{env.spec.id}_ep{ep_num}_clear_norm_obs.pkl',
                                'wb') as f:
                            pickle.dump(obs_normalizer, f,
                                        pickle.HIGHEST_PROTOCOL)
                        return

                o, ep_ret, ep_len = env.reset(), 0, 0
        # Perform PPO update!
        update(buf, train_pi_iters, train_vf_iters, clip_ratio, target_kl, ac,
               pi_optimizer, vf_optimizer, loss_buf)
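
The obs_normalizer above is a RunningMeanStd tracker whose mean and var fields are read at every step. As a reference, here is a sketch of the standard parallel-moments update such a tracker usually performs; the class name and details below are assumptions, not necessarily this repository's implementation:

import numpy as np


class RunningMeanStdSketch:
    """Sketch of a running mean/variance tracker updated from batches of observations."""

    def __init__(self, shape=(), epsilon=1e-4):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        # Merge the batch moments into the running moments (parallel-variance formula).
        batch_mean = x.mean(axis=0)
        batch_var = x.var(axis=0)
        batch_count = x.shape[0]

        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + delta ** 2 * self.count * batch_count / tot_count

        self.mean, self.var, self.count = new_mean, m_2 / tot_count, tot_count
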
Beispiel #21
0
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    args_dir, logs_dir, models_dir, samples_dir = get_all_save_paths(
        args, 'pretrain', combine_action=args.combine_action)
    eval_log_dir = logs_dir + "_eval"
    utils.cleanup_log_dir(logs_dir)
    utils.cleanup_log_dir(eval_log_dir)

    _, _, intrinsic_models_dir, _ = get_all_save_paths(args,
                                                       'learn_reward',
                                                       load_only=True)
    if args.load_iter != 'final':
        intrinsic_model_file_name = os.path.join(
            intrinsic_models_dir,
            args.env_name + '_{}.pt'.format(args.load_iter))
    else:
        intrinsic_model_file_name = os.path.join(
            intrinsic_models_dir, args.env_name + '.pt')
    intrinsic_arg_file_name = os.path.join(args_dir, 'command.txt')

    # save args to arg_file
    with open(intrinsic_arg_file_name, 'w') as f:
        json.dump(args.__dict__, f, indent=2)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, logs_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:
        raise NotImplementedError

    if args.use_intrinsic:
        obs_shape = envs.observation_space.shape
        if len(obs_shape) == 3:
            action_dim = envs.action_space.n
        elif len(obs_shape) == 1:
            action_dim = envs.action_space.shape[0]

        if 'NoFrameskip' in args.env_name:
            file_name = os.path.join(
                args.experts_dir, "trajs_ppo_{}.pt".format(
                    args.env_name.split('-')[0].replace('NoFrameskip',
                                                        '').lower()))
        else:
            file_name = os.path.join(
                args.experts_dir,
                "trajs_ppo_{}.pt".format(args.env_name.split('-')[0].lower()))

        rff = RewardForwardFilter(args.gamma)
        intrinsic_rms = RunningMeanStd(shape=())

        if args.intrinsic_module == 'icm':
            print('Loading pretrained intrinsic module: %s' %
                  intrinsic_model_file_name)
            inverse_model, forward_dynamics_model, encoder = torch.load(
                intrinsic_model_file_name)
            icm =  IntrinsicCuriosityModule(envs, device, inverse_model, forward_dynamics_model, \
                                            inverse_lr=args.intrinsic_lr, forward_lr=args.intrinsic_lr,\
                                            )

        if args.intrinsic_module == 'vae':
            print('Loading pretrained intrinsic module: %s' %
                  intrinsic_model_file_name)
            vae = torch.load(intrinsic_model_file_name)
            icm =  GenerativeIntrinsicRewardModule(envs, device, \
                                                   vae, lr=args.intrinsic_lr, \
                                                   )

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            obs, reward, done, infos = envs.step(action)
            next_obs = obs

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, next_obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.use_intrinsic:
            for step in range(args.num_steps):
                state = rollouts.obs[step]
                action = rollouts.actions[step]
                next_state = rollouts.next_obs[step]
                if args.intrinsic_module == 'icm':
                    state = encoder(state)
                    next_state = encoder(next_state)
                with torch.no_grad():
                    rollouts.rewards[
                        step], pred_next_state = icm.calculate_intrinsic_reward(
                            state, action, next_state, args.lambda_true_action)
            if args.standardize == 'True':
                buf_rews = rollouts.rewards.cpu().numpy()
                intrinsic_rffs = np.array(
                    [rff.update(rew) for rew in buf_rews.T])
                rffs_mean, rffs_std, rffs_count = mpi_moments(
                    intrinsic_rffs.ravel())
                intrinsic_rms.update_from_moments(rffs_mean, rffs_std**2,
                                                  rffs_count)
                mean = intrinsic_rms.mean
                std = np.asarray(np.sqrt(intrinsic_rms.var))
                rollouts.rewards = rollouts.rewards / torch.from_numpy(std).to(
                    device)

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(models_dir, args.algo)
            policy_file_name = os.path.join(save_path, args.env_name + '.pt')

            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], policy_file_name)

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "{} Updates {}, num timesteps {}, FPS {} \n"
                " Last {} training episodes: mean/median reward {:.1f}/{:.1f},"
                " min/max reward {:.1f}/{:.1f}, entropy {:.2f},"
                " value loss {:.2f}, action loss {:.2f}\n".format(
                    args.env_name, j, total_num_steps,
                    int(total_num_steps / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), np.min(episode_rewards),
                    np.max(episode_rewards), dist_entropy, value_loss,
                    action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
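
The rff = RewardForwardFilter(args.gamma) object above feeds a running discounted sum of intrinsic rewards into intrinsic_rms, whose standard deviation then rescales rollouts.rewards. A minimal sketch of such a filter, under the assumption that it follows the common RND-style implementation:

class RewardForwardFilterSketch:
    """Sketch: keeps a discounted running sum of rewards, one entry per environment."""

    def __init__(self, gamma):
        self.gamma = gamma
        self.rewems = None  # running discounted rewards

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems
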
Beispiel #22
0
class RandomNetworkDistillation:
    def __init__(
        self,
        log_interval=10,
        lr=1e-5,
        use_cuda=False,
        verbose=0,
        log_tensorboard=False,
        path="rnd_model/",
    ):
        self.predictor = predictor_generator()
        self.target = target_generator()
        for param in self.target.parameters():
            param.requires_grad = False
        self.target.eval()

        self.log_interval = log_interval
        self.optimizer = torch.optim.Adam(self.predictor.parameters(), lr=lr)
        self.loss_function = torch.nn.MSELoss(reduction='mean')

        self.device = torch.device('cuda' if use_cuda else 'cpu')
        self.target.to(self.device)
        self.predictor.to(self.device)

        self.running_stats = RunningMeanStd()

        self.verbose = verbose
        self.writer = SummaryWriter() if log_tensorboard else None
        self.n_iter = 0

        self.save_path = path
        Path(path).mkdir(parents=True, exist_ok=True)

        self.early_stopping = EarlyStopping(save_dir=self.save_path)

    def set_data(self, train_tensor, test_tensor):
        train_target_tensor = self.target(train_tensor.to(self.device))
        train_dataset = TensorDataset(train_tensor, train_target_tensor)
        self.train_loader = DataLoader(train_dataset)

        test_target_tensor = self.target(test_tensor.to(self.device))
        test_dataset = TensorDataset(test_tensor, test_target_tensor)
        self.test_loader = DataLoader(test_dataset)
        return

    def learn(self, epochs):
        for epoch in range(epochs):
            self._train(epoch)
            test_loss = self._test()
        return test_loss

    def _train(self, epoch):
        self.predictor.train()
        for batch_idx, (data, target) in enumerate(self.train_loader):
            data, target = data.to(self.device), target.to(self.device)
            self.optimizer.zero_grad()
            output = self.predictor(data)
            loss = self.loss_function(output, target)
            loss.backward()
            self.optimizer.step()
            self.n_iter += 1
            self.running_stats.update(arr=array([loss.item()]))

            if self.verbose > 0 and batch_idx % self.log_interval == 0:
                print(
                    f"Train Epoch: {epoch} [{batch_idx*len(data)}/{len(self.train_loader.dataset)} ({100. * batch_idx/len(self.train_loader):.0f}%)]",
                    end="\t")
                print(f"Loss: {loss.item():.6f}")
            if self.writer is not None and self.n_iter % 100 == 0:
                self.writer.add_scalar("Loss/train", loss.item(), self.n_iter)
        return

    def _test(self):
        self.predictor.eval()
        test_loss = 0
        with torch.no_grad():
            for data, target in self.test_loader:
                data, target = data.to(self.device), target.to(self.device)
                output = self.predictor(data)
                test_loss += self.loss_function(output, target).item()
        test_loss /= len(self.test_loader.dataset)
        if self.verbose > 0:
            print(f"\nTest set: Average loss: {test_loss:.4f}\n")
        if self.writer is not None:
            self.writer.add_scalar("Loss/test", test_loss, self.n_iter)

        self.early_stopping(test_loss, self.predictor)
        if self.early_stopping.early_stop:
            print(">> save early stop checkpoint")
        return test_loss

    def get_intrinsic_reward(self, x: torch.Tensor):
        x = x.to(self.device)
        predict = self.predictor(x)
        target = self.target(x)
        intrinsic_reward = self.loss_function(predict,
                                              target).data.cpu().numpy()
        intrinsic_reward = (intrinsic_reward - self.running_stats.mean) / sqrt(
            self.running_stats.var)
        intrinsic_reward = clip(intrinsic_reward, -5, 5)
        return intrinsic_reward

    def save(self):
        path = self.save_path
        with open("{}/running_stat.pkl".format(path), 'wb') as f:
            pickle.dump(self.running_stats, f)
        torch.save(self.target.state_dict(), "{}/target.pt".format(path))
        torch.save(self.predictor.state_dict(), "{}/predictor.pt".format(path))
        return

    def load(self, path="rnd_model/", load_checkpoint=False):
        with open("{}/running_stat.pkl".format(path), 'rb') as f:
            self.running_stats = pickle.load(f)
        self.target.load_state_dict(
            torch.load("{}/target.pt".format(path),
                       map_location=torch.device(self.device)))
        if load_checkpoint:
            self.predictor.load_state_dict(
                torch.load("{}/checkpoint.pt".format(path),
                           map_location=torch.device(self.device)))
        else:
            self.predictor.load_state_dict(
                torch.load("{}/predictor.pt".format(path),
                           map_location=torch.device(self.device)))
        return
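
RandomNetworkDistillation relies on an EarlyStopping helper that is constructed with save_dir, called with (test_loss, model), exposes an early_stop flag, and writes the checkpoint.pt file that load(..., load_checkpoint=True) reads back. A minimal sketch of such a helper, assuming a simple patience counter; the class below is illustrative, not the original implementation:

import torch


class EarlyStoppingSketch:
    """Sketch: saves the best model seen so far and flags when the test loss stalls."""

    def __init__(self, save_dir, patience=10, delta=0.0):
        self.save_dir = save_dir
        self.patience = patience
        self.delta = delta
        self.best_loss = float("inf")
        self.counter = 0
        self.early_stop = False

    def __call__(self, test_loss, model):
        if test_loss < self.best_loss - self.delta:
            # Improvement: reset the counter and checkpoint the model.
            self.best_loss = test_loss
            self.counter = 0
            torch.save(model.state_dict(), f"{self.save_dir}/checkpoint.pt")
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
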
Beispiel #23
0
class Train:
    def __init__(self, env, test_env, env_name, n_iterations, agent, epochs,
                 mini_batch_size, epsilon, horizon):
        self.env = env
        self.env_name = env_name
        self.test_env = test_env
        self.agent = agent
        self.epsilon = epsilon
        self.horizon = horizon
        self.epochs = epochs
        self.mini_batch_size = mini_batch_size
        self.n_iterations = n_iterations

        self.start_time = 0
        self.state_rms = RunningMeanStd(shape=(self.agent.n_states, ))

        self.running_reward = 0

    @staticmethod
    def choose_mini_batch(mini_batch_size, states, actions, returns, advs,
                          values, log_probs):
        full_batch_size = len(states)
        for _ in range(full_batch_size // mini_batch_size):
            indices = np.random.randint(0, full_batch_size, mini_batch_size)
            yield states[indices], actions[indices], returns[indices], advs[indices], values[indices],\
                  log_probs[indices]

    def train(self, states, actions, advs, values, log_probs):

        values = np.vstack(values[:-1])
        log_probs = np.vstack(log_probs)
        returns = advs + values
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)
        actions = np.vstack(actions)
        for epoch in range(self.epochs):
            for state, action, return_, adv, old_value, old_log_prob in self.choose_mini_batch(
                    self.mini_batch_size, states, actions, returns, advs,
                    values, log_probs):
                state = torch.Tensor(state).to(self.agent.device)
                action = torch.Tensor(action).to(self.agent.device)
                return_ = torch.Tensor(return_).to(self.agent.device)
                adv = torch.Tensor(adv).to(self.agent.device)
                old_value = torch.Tensor(old_value).to(self.agent.device)
                old_log_prob = torch.Tensor(old_log_prob).to(self.agent.device)

                value = self.agent.critic(state)
                # clipped_value = old_value + torch.clamp(value - old_value, -self.epsilon, self.epsilon)
                # clipped_v_loss = (clipped_value - return_).pow(2)
                # unclipped_v_loss = (value - return_).pow(2)
                # critic_loss = 0.5 * torch.max(clipped_v_loss, unclipped_v_loss).mean()
                critic_loss = self.agent.critic_loss(value, return_)

                new_log_prob = self.calculate_log_probs(
                    self.agent.current_policy, state, action)

                ratio = (new_log_prob - old_log_prob).exp()
                actor_loss = self.compute_actor_loss(ratio, adv)

                self.agent.optimize(actor_loss, critic_loss)

        return actor_loss, critic_loss

    def step(self):
        state = self.env.reset()
        for iteration in range(1, 1 + self.n_iterations):
            states = []
            actions = []
            rewards = []
            values = []
            log_probs = []
            dones = []

            self.start_time = time.time()
            for t in range(self.horizon):
                # self.state_rms.update(state)
                state = np.clip((state - self.state_rms.mean) /
                                (self.state_rms.var**0.5 + 1e-8), -5, 5)
                dist = self.agent.choose_dist(state)
                action = dist.sample().cpu().numpy()[0]
                # action = np.clip(action, self.agent.action_bounds[0], self.agent.action_bounds[1])
                log_prob = dist.log_prob(torch.Tensor(action))
                value = self.agent.get_value(state)
                next_state, reward, done, _ = self.env.step(action)

                states.append(state)
                actions.append(action)
                rewards.append(reward)
                values.append(value)
                log_probs.append(log_prob)
                dones.append(done)

                if done:
                    state = self.env.reset()
                else:
                    state = next_state
            # self.state_rms.update(next_state)
            next_state = np.clip((next_state - self.state_rms.mean) /
                                 (self.state_rms.var**0.5 + 1e-8), -5, 5)
            next_value = self.agent.get_value(next_state) * (1 - done)
            values.append(next_value)

            advs = self.get_gae(rewards, values, dones)
            states = np.vstack(states)
            actor_loss, critic_loss = self.train(states, actions, advs, values,
                                                 log_probs)
            # self.agent.set_weights()
            self.agent.schedule_lr()
            eval_rewards = evaluate_model(self.agent, self.test_env,
                                          self.state_rms,
                                          self.agent.action_bounds)
            self.state_rms.update(states)
            self.print_logs(iteration, actor_loss, critic_loss, eval_rewards)

    @staticmethod
    def get_gae(rewards, values, dones, gamma=0.99, lam=0.95):

        advs = []
        gae = 0

        dones.append(0)
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * (values[step + 1]) * (
                1 - dones[step]) - values[step]
            gae = delta + gamma * lam * (1 - dones[step]) * gae
            advs.append(gae)

        advs.reverse()
        return np.vstack(advs)

    @staticmethod
    def calculate_log_probs(model, states, actions):
        policy_distribution = model(states)
        return policy_distribution.log_prob(actions)

    def compute_actor_loss(self, ratio, adv):
        pg_loss1 = adv * ratio
        pg_loss2 = adv * torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon)
        loss = -torch.min(pg_loss1, pg_loss2).mean()
        return loss

    def print_logs(self, iteration, actor_loss, critic_loss, eval_rewards):
        if iteration == 1:
            self.running_reward = eval_rewards
        else:
            self.running_reward = self.running_reward * 0.99 + eval_rewards * 0.01

        if iteration % 100 == 0:
            print(f"Iter:{iteration}| "
                  f"Ep_Reward:{eval_rewards:.3f}| "
                  f"Running_reward:{self.running_reward:.3f}| "
                  f"Actor_Loss:{actor_loss:.3f}| "
                  f"Critic_Loss:{critic_loss:.3f}| "
                  f"Iter_duration:{time.time() - self.start_time:.3f}| "
                  f"lr:{self.agent.actor_scheduler.get_last_lr()}")
            self.agent.save_weights(iteration, self.state_rms)

        with SummaryWriter(self.env_name + "/logs") as writer:
            writer.add_scalar("Episode running reward", self.running_reward,
                              iteration)
            writer.add_scalar("Episode reward", eval_rewards, iteration)
            writer.add_scalar("Actor loss", actor_loss, iteration)
            writer.add_scalar("Critic loss", critic_loss, iteration)
Beispiel #24
0
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    run_id = "alpha{}".format(args.gcn_alpha)
    if args.use_logger:
        from utils import Logger
        folder = "{}/{}".format(args.folder, run_id)
        logger = Logger(algo_name=args.algo,
                        environment_name=args.env_name,
                        folder=folder,
                        seed=args.seed)
        logger.save_args(args)

        print("---------------------------------------")
        print('Saving to', logger.save_folder)
        print("---------------------------------------")

    else:
        print("---------------------------------------")
        print('NOTE : NOT SAVING RESULTS')
        print("---------------------------------------")
    all_rewards = []

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          args.env_name,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size,
                              actor_critic.base.output_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    ############################
    # GCN Model and optimizer
    from pygcn.train import update_graph
    from pygcn.models import GCN, GAT, SAGE
    assert args.gnn in ['gcn', 'gat', 'sage']

    if args.gnn == 'gat':
        gcn_model = GAT(nfeat=actor_critic.base.output_size,
                        nhid=args.gcn_hidden)
    elif args.gnn == 'sage':
        gcn_model = SAGE(nfeat=actor_critic.base.output_size,
                         nhid=args.gcn_hidden)
    elif args.gnn == 'gcn':
        gcn_model = GCN(nfeat=actor_critic.base.output_size,
                        nhid=args.gcn_hidden)

    gcn_model.to(device)
    gcn_optimizer = optim.Adam(gcn_model.parameters(),
                               lr=args.gcn_lr,
                               weight_decay=args.gcn_weight_decay)
    gcn_loss = nn.NLLLoss()
    gcn_states = [[] for _ in range(args.num_processes)]
    Gs = [nx.Graph() for _ in range(args.num_processes)]
    node_ptrs = [0 for _ in range(args.num_processes)]
    rew_states = [[] for _ in range(args.num_processes)]
    ############################

    episode_rewards = deque(maxlen=100)
    avg_fwdloss = deque(maxlen=100)
    rew_rms = RunningMeanStd(shape=())
    delay_rew = torch.zeros([args.num_processes, 1])
    delay_step = torch.zeros([args.num_processes])

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob,\
                 recurrent_hidden_states, hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            delay_rew += reward
            delay_step += 1

            for idx, (info, hid,
                      eps_done) in enumerate(zip(infos, hidden_states, done)):

                if eps_done or delay_step[idx] == args.reward_freq:
                    reward[idx] = delay_rew[idx]
                    delay_rew[idx] = delay_step[idx] = 0
                else:
                    reward[idx] = 0

                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

                if args.gcn_alpha < 1.0:
                    gcn_states[idx].append(hid)
                    node_ptrs[idx] += 1
                    if not eps_done:
                        Gs[idx].add_edge(node_ptrs[idx] - 1, node_ptrs[idx])
                    if reward[idx] != 0. or eps_done:
                        rew_states[idx].append(
                            [node_ptrs[idx] - 1, reward[idx]])
                    if eps_done:
                        adj = nx.adjacency_matrix(Gs[idx]) if len(Gs[idx].nodes)\
                                        else sp.csr_matrix(np.eye(1,dtype='int64'))
                        update_graph(gcn_model, gcn_optimizer,
                                     torch.stack(gcn_states[idx]), adj,
                                     rew_states[idx], gcn_loss, args, envs)
                        gcn_states[idx] = []
                        Gs[idx] = nx.Graph()
                        node_ptrs[idx] = 0
                        rew_states[idx] = []

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks,
                            hidden_states)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau, gcn_model, args.gcn_alpha)
        agent.update(rollouts)
        rollouts.after_update()

        ####################### Saving and book-keeping #######################
        if (j % int(num_updates / 5.) == 0
                or j == num_updates - 1) and args.save_dir != "":
            print('Saving model')
            print()

            save_dir = "{}/{}/{}".format(args.save_dir, args.folder, run_id)
            save_path = os.path.join(save_dir, args.algo, 'seed' +
                                     str(args.seed)) + '_iter' + str(j)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            save_gcn = gcn_model
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
                save_gcn = copy.deepcopy(gcn_model).cpu()

            save_model = [
                save_gcn, save_model,
                hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + "ac.pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {}\
             training episodes: mean/median reward {:.2f}/{:.2f},\
              min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n".format(
                j,
                total_num_steps,
                int(total_num_steps / (end - start)),
                len(episode_rewards),
                np.mean(episode_rewards),
                np.median(episode_rewards),
                np.min(episode_rewards),
                np.max(episode_rewards),
                np.count_nonzero(np.greater(episode_rewards, 0)) /
                len(episode_rewards),
            ))

            all_rewards.append(np.mean(episode_rewards))
            if args.use_logger:
                logger.save_task_results(all_rewards)
        ####################### Saving and book-keeping #######################

    envs.close()
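
Both this example and Beispiel #21 anneal the learning rate through an update_linear_schedule helper called as update_linear_schedule(optimizer, j, num_updates, lr). A minimal sketch of the usual linear-decay implementation, written here as an assumption about that helper:

def update_linear_schedule_sketch(optimizer, epoch, total_num_epochs, initial_lr):
    """Sketch: linearly anneals the optimizer's learning rate from initial_lr towards zero."""
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
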
Beispiel #25
0
def train(env,
          ddpg_graph,
          actor,
          critic,
          cl_nn=None,
          pt=None,
          cl_mode=None,
          compare_with_sess=None,
          compare_with_actor=None,
          norm_complexity=0,
          **config):

    print('train: ' + config['output'] + ' started!')
    print("Noise: {} and {}".format(config["ou_sigma"], config["ou_theta"]))
    print("Actor learning rate {}".format(config["actor_lr"]))
    print("Critic learning rate {}".format(config["critic_lr"]))
    print("Minibatch size {}".format(config["minibatch_size"]))

    curriculums = []
    if config["curriculum"]:
        print("Following curriculum {}".format(config["curriculum"]))
        items = config["curriculum"].split(";")
        for item in items:
            params = item.split("_")
            x = np.array(params[1:]).astype(float)
            c = {'var': params[0], 'gen': cur_gen(config["steps"], x)}
            curriculums.append(c)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.15)
    with tf.Session(graph=ddpg_graph,
                    config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        # Check if a policy needs to be loaded
        sess = preload_policy(sess, config)

        # Initialize target network weights
        actor.update_target_network(sess)
        critic.update_target_network(sess)

        # Load curriculum neural network weights (provided parameters have priority)
        if cl_nn:
            sess = cl_nn.load(sess, config["cl_load"])

        # Initialize replay memory
        o_dims = env.observation_space.shape[-1]
        replay_buffer = ReplayBuffer(config, o_dims=o_dims)

        # Observation normalization.
        obs_range = [env.observation_space.low, env.observation_space.high]
        if config["normalize_observations"]:
            obs_rms = RunningMeanStd(shape=env.observation_space.shape)
        else:
            obs_rms = None

        # decide mode
        if cl_nn:
            v = pt.flatten()
            cl_mode_new, cl_threshold = cl_nn.predict(sess, v)
            #cl_threshold = pt.denormalize(cl_threshold)
        else:
            cl_mode_new = cl_mode
            cl_threshold = None

        # Initialize constants for exploration noise
        ou_sigma = config["ou_sigma"]
        ou_theta = config["ou_theta"]
        ou_mu = 0
        trial_return = 0
        max_trial_return = 0

        obs_dim = actor.s_dim
        act_dim = actor.a_dim
        max_action = np.minimum(np.absolute(env.action_space.high),
                                np.absolute(env.action_space.low))

        obs = np.zeros(obs_dim)
        action = np.zeros(act_dim)
        noise = np.zeros(act_dim)

        tt = 0
        ss = 0
        ss_all = 0
        terminal = 0
        reach_timeout_num = 0
        more_info = None
        ss_acc, td_acc, l2_reg_acc, action_grad_acc, actor_grad_acc = 0, 0, 0, 0, 0
        prev_l2_reg = critic.l2_reg_(sess)
        ti = config["test_interval"]
        test_returns = []
        avg_test_return = config['reach_return']

        # rewarding object if rewards in replay buffer are to be recalculated
        replay_buffer.load()
        if config['reassess_for']:
            print('Reassessing replay buffer for {}'.format(
                config['reassess_for']))
            evaluator = Evaluator(max_action)
            #pdb.set_trace()
            replay_buffer = evaluator.add_bonus(replay_buffer,
                                                how=config['reassess_for'])

        # Export trajectory
        if config['trajectory']:
            trajectory = []
            if config["compare_with"]:
                actor_sim = []

        # start environment
        for c in curriculums:
            c['ss'], val = next(c['gen'])
            d = {"action": "update_{}".format(c['var']), c['var']: val}
            env.reconfigure(d)
        test = (ti >= 0 and tt % (ti + 1) == ti)
        obs = env.reset(test=test)
        obs = obs_normalize(obs, obs_rms, obs_range, o_dims,
                            config["normalize_observations"])

        # Export environment state
        if cl_nn:
            more_info = ''.join('{:10.2f}'.format(indi)
                                for indi in [-100, -100, -100])
            more_info += ''.join('{:10.2f}'.format(vvv) for vv in v[0]
                                 for vvv in vv)
            more_info += ''.join('{:10.2f}'.format(th) for th in cl_threshold)
        env.log(more_info if cl_threshold is not None else '')

        # Main loop over steps or trials.
        # Finish when the trials finish,
        # or when the steps finish,
        # or when a new mode in the curriculum is switched,
        # or when a certain return is reached,
        # or if a trial happens to be longer than config['reach_balance'] twice in a row.
        while (config["trials"] == 0 or tt < config["trials"]) and \
              (config["steps"]  == 0 or ss < config["steps"]) and \
              (not cl_nn or cl_mode_new == cl_mode) and \
              (not config['reach_return'] or avg_test_return <= config['reach_return']) and \
              (not config['reach_timeout'] or (config['reach_timeout'] > 0 and reach_timeout_num < config['reach_timeout_num'])):

            # Compute OU noise and action
            if not test:
                noise = ExplorationNoise.ou_noise(ou_theta, ou_mu, ou_sigma,
                                                  noise, act_dim)

            action = compute_action(sess, actor, obs[:o_dims], noise,
                                    test)  # from [-1; 1]

            # obtain observation of a state
            next_obs, reward, terminal, info = env.step(action * max_action)
            #print('Forward promotion: ' + str(next_obs[-1]))
            #print('Reward: ' + str(reward))
            next_obs = obs_normalize(next_obs, obs_rms, obs_range, o_dims,
                                     config["normalize_observations"])

            reward *= config['reward_scale']

            # Add the transition to replay buffer
            if not test:
                replay_buffer.replay_buffer_add(obs, action, reward,
                                                terminal == 2, next_obs)

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if not test and replay_buffer.size() > config["rb_min_size"]:
                minibatch_size = config["minibatch_size"]
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                    minibatch_size)

                # Calculate targets
                target_q = critic.predict_target(
                    sess, s2_batch, actor.predict_target(sess, s2_batch))

                y_i = []
                for k in range(minibatch_size):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + config["gamma"] *
                                   target_q[k][0])  # target_q: list -> float

                if config['perf_td_error']:
                    q_i = critic.predict_target(sess, s_batch, a_batch)
                    td_acc += np.sum(
                        np.abs(q_i -
                               np.reshape(y_i, newshape=(minibatch_size, 1))))

                # Update the critic given the targets
                if config['perf_l2_reg']:
                    _, _, l2_reg = critic.train_(
                        sess, s_batch, a_batch,
                        np.reshape(y_i, (minibatch_size, 1)))
                    l2_reg_acc += (l2_reg - prev_l2_reg)
                    prev_l2_reg = l2_reg
                else:
                    critic.train(sess, s_batch, a_batch,
                                 np.reshape(y_i, (minibatch_size, 1)))

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(sess, s_batch)
                grad = critic.action_gradients(sess, s_batch, a_outs)[0]
                if config['perf_action_grad']:
                    action_grad_acc += np.linalg.norm(grad, ord=2)

                if config['perf_actor_grad']:
                    _, actor_grad = actor.train_(sess, s_batch, grad)
                    for ag in actor_grad:
                        actor_grad_acc += np.linalg.norm(ag, ord=2)
                else:
                    actor.train(sess, s_batch, grad)

                ss_acc += 1

                # Update target networks
                actor.update_target_network(sess)
                critic.update_target_network(sess)

            # Render
            if config["render"]:
                still_open = env.render("human")
                if still_open == False:
                    break

            # Record trajectory
            # Note that it exports all training and testing episodes
            if config['trajectory']:
                real_time = ss_all * config['env_timestep']
                trajectory.append([real_time] + obs[:o_dims].tolist() +
                                  (action * max_action).tolist() +
                                  next_obs[:o_dims].tolist() + [reward] +
                                  [terminal])  # + [info])
                if config["compare_with"]:
                    compare_with_action = compute_action(
                        compare_with_sess, compare_with_actor, obs[:o_dims],
                        noise, test)
                    actor_sim.append([real_time] + obs[:o_dims].tolist() +
                                     (compare_with_action *
                                      max_action).tolist())

            # Prepare next step
            obs = next_obs
            trial_return += reward

            # Logging performance at the end of the testing trial
            if terminal and test:

                # NN performance indicators
                more_info = ""
                s = info.split()
                norm_duration = float(s[0]) / config["env_timeout"]
                td_per_step = td_acc / ss_acc if ss_acc > 0 else 0
                norm_td_error = td_per_step / config["env_td_error_scale"]
                norm_complexity += l2_reg_acc / ss_acc if ss_acc > 0 else 0
                indicators = [norm_duration, norm_td_error, norm_complexity]
                more_info += ''.join('{:14.8f}'.format(indi)
                                     for indi in indicators)
                if cl_nn:
                    # update PerformanceTracker
                    pt.add(indicators)  # return, duration, damage
                    v = pt.flatten()
                    cl_mode_new, cl_threshold = cl_nn.predict(sess, v)
                    more_info += ''.join('{:14.8f}'.format(vvv) for vv in v[0]
                                         for vvv in vv)
                    more_info += ''.join('{:14.8f}'.format(th)
                                         for th in cl_threshold)
                ss_acc, td_acc, l2_reg_acc, action_grad_acc, actor_grad_acc = 0, 0, 0, 0, 0
                # report
                env.log(more_info)

                # check if performance is satisfactory
                test_returns.append(trial_return)
                avg_test_return = np.mean(
                    test_returns[max([0, len(test_returns) - 10]):])
                if float(info.split()[0]) > config['reach_timeout']:
                    reach_timeout_num += 1
                else:
                    reach_timeout_num = 0

                if not config['mp_debug']:
                    msg = "{:>10} {:>10} {:>10.3f} {:>10}" \
                        .format(tt, ss, trial_return, terminal)
                    print("{}".format(msg))

            # Save NN if performance is better then before
            if terminal and config['save'] and trial_return > max_trial_return:
                max_trial_return = trial_return
                save_policy(sess, config, suffix="-best")

            if not test:
                ss += 1
                for c in curriculums:
                    if ss > c['ss']:
                        c['ss'], val = next(c['gen'])
                        d = {
                            "action": "update_{}".format(c['var']),
                            c['var']: val
                        }
                        env.reconfigure(d)
            ss_all += 1

            if terminal:
                tt += 1
                test = (ti >= 0 and tt % (ti + 1) == ti)
                obs = env.reset(test=test)
                obs = obs_normalize(obs, obs_rms, obs_range, o_dims,
                                    config["normalize_observations"])
                reward = 0
                terminal = 0
                trial_return = 0
                noise = np.zeros(actor.a_dim)

        # Export the final performance, but only when the curriculum is not used
        # or the run did not terminate due to a curriculum switch, because data
        # is always exported whenever the curriculum is switched over.
        if (not cl_nn or cl_mode_new == cl_mode):
            env.log(more_info)

        # Export trajectory
        if config['trajectory']:
            dump_pkl_csv(config['trajectory'], trajectory)
            if config["compare_with"]:
                dump_pkl_csv(config['trajectory'] + '_sim', actor_sim)

        # verify replay_buffer
        #evaluator.reassess(replay_buffer, verify=True, task = config['reassess_for'])
        print('train: ' + config['output'] + ' finished!')

        # Save the last episode policy
        if config['save']:
            suffix = "-last"
            save_policy(sess, config, suffix=suffix)
            #save_policy(sess, saver, config, suffix=suffix)
            if config["normalize_observations"]:
                with open(config["output"] + suffix + '.obs_rms', 'w') as f:
                    data = {
                        'count': obs_rms.count,
                        'mean': obs_rms.mean.tolist(),
                        'std': obs_rms.std.tolist(),
                        'var': obs_rms.var.tolist()
                    }
                    json.dump(data, f)

        replay_buffer.save()

        # save curriculum network
        if cl_nn:
            cl_nn.save(sess, config["cl_save"])

        # extract damage from the last step
        damage = 0
        info = env.get_latest_info()
        if info:
            s = info.split()
            damage = float(s[1])

    print('train: ' + config['output'] + ' returning ' +
          '{} {} {} {}'.format(avg_test_return, damage, ss, cl_mode_new))

    return (avg_test_return, damage, ss, cl_mode_new, norm_complexity)
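
The exploration noise above comes from ExplorationNoise.ou_noise, which is not part of this snippet. A minimal sketch of a standard Ornstein-Uhlenbeck update with the same (theta, mu, sigma, noise, dim) argument order, offered as an assumption about the helper rather than its original code:

import numpy as np


class ExplorationNoiseSketch:
    """Sketch of an Ornstein-Uhlenbeck noise helper with the call order used above."""

    @staticmethod
    def ou_noise(theta, mu, sigma, noise, dim):
        # One Euler step of the OU process: dx = theta * (mu - x) dt + sigma * dW.
        return noise + theta * (mu - noise) + sigma * np.random.randn(dim)
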
Beispiel #26
0
class PPO(object):
    def __init__(self):
        self.sess = tf.Session()
        self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(self.sess, shape=S_DIM)

        # critic
        # l1 = self.feature #tf.layers.dense(self.feature, 100, tf.nn.relu)
        self.feature = self._build_feature_net('feature',
                                               self.tfs,
                                               reuse=False)
        self.v = self._build_cnet('value', self.feature, reuse=False)

        self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
        self.diff_r_v = self.tfdc_r - self.v
        self.closs = tf.reduce_mean(tf.square(self.diff_r_v))
        # self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)

        # actor
        self.pi, pi_params = self._build_anet('pi', trainable=True)
        oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)

        self.update_oldpi_op = [
            oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)
        ]

        self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')

        # # for continue action
        self.tfa = tf.placeholder(tf.float32, [None, 1], 'action')
        # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
        self.ratio = self.pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5)
        self.entropy = self.pi.entropy()
        self.sample_op = tf.squeeze(self.pi.sample(1),
                                    axis=0)  # operation of choosing action
        self.sample_op_stochastic = self.pi.loc
        self.std = self.pi.scale

        # # descrete action
        # self.tfa = tf.placeholder(tf.int32, [None], 'action')
        # self.pi_prob = tf.reduce_sum((self.pi) * tf.one_hot(self.tfa, A_DIM, dtype=tf.float32), axis=1, keep_dims=True)
        # oldpi_prob = tf.reduce_sum((oldpi) * tf.one_hot(self.tfa, A_DIM, dtype=tf.float32), axis=1, keep_dims=True)
        # self.ratio = self.pi_prob / (oldpi_prob + 1e-5) #tf.exp(self.log_pi - log_oldpi)
        # self.entropy = -tf.reduce_sum(self.pi * tf.log(self.pi + 1e-5), axis=1, keep_dims=True)

        self.surr1 = self.ratio * self.tfadv
        self.surr2 = tf.clip_by_value(self.ratio, 1. - EPSILON, 1. + EPSILON)
        self.surr = tf.minimum(self.surr1, self.surr2) + 0.0 * self.entropy
        self.aloss = -tf.reduce_mean(self.surr)

        # value replay
        self.tfs_history = tf.placeholder(tf.float32, [None, S_DIM],
                                          'state_history')  # for value replay
        self.return_history = tf.placeholder(
            tf.float32, [None, 1], 'history_return')  # for value replay

        self.feature_history = self._build_feature_net(
            'feature', self.tfs_history, reuse=True)  # for value replay
        self.v_history = self._build_cnet('value',
                                          self.feature_history,
                                          reuse=True)
        self.diff_history = self.return_history - self.v_history
        self.loss_history = tf.reduce_mean(tf.square(self.diff_history))

        # reward predict
        self.tfs_label = tf.placeholder(tf.float32, [None, S_DIM],
                                        'state_label')  # for reward prediction
        self.label = tf.placeholder(tf.int32, [None], 'true_label')

        self.feature_label = self._build_feature_net(
            'feature', self.tfs_label, reuse=True)  # for reward prediction
        self.pred_label = tf.layers.dense(self.feature_label, 2)
        self.loss_pred = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.pred_label, labels=self.label))

        ###########################################################################################
        self.total_loss = self.aloss + (self.closs * 1 + self.loss_pred * 0 +
                                        self.loss_history * 0)
        self.base_loss = self.aloss + self.closs * 1 + self.loss_history * 0

        global_step = tf.Variable(0, trainable=False)
        starter_learning_rate = LR
        end_learning_rate = LR / 10
        decay_steps = 10
        learning_rate = tf.train.polynomial_decay(starter_learning_rate,
                                                  global_step,
                                                  decay_steps,
                                                  end_learning_rate,
                                                  power=0.5)

        # optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        self.train_op = optimizer.minimize(self.total_loss,
                                           global_step=global_step)
        self.train_base_op = optimizer.minimize(self.base_loss,
                                                global_step=global_step)

        # self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()
        self.summary_writer = tf.summary.FileWriter('./log', self.sess.graph)
        # self.load_model()

    def get_entropy(self):
        a0 = self.pi - self.max(self.pi, axis=-1, keepdims=True)
        ea0 = tf.exp(a0)
        z0 = self.sum(ea0, axis=-1, keepdims=True)
        p0 = ea0 / z0
        entropy = self.sum(p0 * (tf.log(z0) - a0), axis=-1)
        return entropy

    def neglogp(self, pi, a):
        one_hot_actions = tf.one_hot(a, pi.get_shape().as_list()[-1])
        return tf.nn.softmax_cross_entropy_with_logits(logits=pi,
                                                       labels=one_hot_actions)

    def sum(self, x, axis=None, keepdims=False):
        axis = None if axis is None else [axis]
        return tf.reduce_sum(x, axis=axis, keep_dims=keepdims)

    def max(self, x, axis=None, keepdims=False):
        axis = None if axis is None else [axis]
        return tf.reduce_max(x, axis=axis, keep_dims=keepdims)

    def load_model(self):
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state('./model/rl/')
        if ckpt and ckpt.model_checkpoint_path:
            self.saver.restore(self.sess, ckpt.model_checkpoint_path)
            print('loaded')
        else:
            print('no model file')

    def write_summary(self, summary_name, value):
        summary = tf.Summary()
        summary.value.add(tag=summary_name, simple_value=float(value))
        self.summary_writer.add_summary(summary, GLOBAL_EP)
        self.summary_writer.flush()

    def get_rp_buffer(self, sample_goal_num, sample_crash_num):
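        # Sample up to sample_goal_num goal states (label 0) and sample_crash_num
        # crash states (label 1) from the global buffers for the reward-prediction
        # task; sampling uses replacement while a buffer is not yet full.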
        rp_states = []
        rp_label = []
        rp_return = []

        sample_goal_num = int(sample_goal_num)
        sample_crash_num = int(sample_crash_num)

        size = RP_buffer_size
        replace = False
        if Goal_buffer_full == False:
            size = Goal_count
            replace = True
        if size > 0 and sample_goal_num > 0:
            if sample_goal_num > size * 2:
                sample_goal_num = size * 2
            goal_selected = np.random.choice(size,
                                             sample_goal_num,
                                             replace=replace)
            for index in goal_selected:
                rp_states.append(Goal_states[index])
                rp_label.append(0)
                rp_return.append(Goal_return[index])

        size = RP_buffer_size
        replace = False
        if Crash_buffer_full == False:
            size = Crash_count
            replace = True
        if size > 0 and sample_crash_num > 0:
            if sample_crash_num > size * 2:
                sample_crash_num = size * 2
            crash_selected = np.random.choice(size,
                                              sample_crash_num,
                                              replace=replace)
            for index in crash_selected:
                rp_states.append(Crash_states[index])
                rp_label.append(1)
                rp_return.append(Crash_return[index])

        return np.array(rp_states), np.array(rp_label), np.array(
            rp_return)[:, np.newaxis]

    def get_vr_buffer(self, sample_num):
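        # Sample (state, return) pairs from the global history buffer for the
        # value-replay task; sampling uses replacement while the buffer is not
        # yet full.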
        vr_states = []
        vr_returns = []

        sample_num = int(sample_num)
        size = History_buffer_size
        replace = False
        if History_buffer_full == False:
            size = History_count
            replace = True
        if size > 0:
            if sample_num > size * 2:
                sample_num = size * 2

            index_selected = np.random.choice(size,
                                              sample_num,
                                              replace=replace)
            for index in index_selected:
                vr_states.append(History_states[index])
                vr_returns.append(History_return[index])

        return np.array(vr_states), np.array(vr_returns)[:, np.newaxis]

    def update_base_task(self, s, a, r, adv, vr_states, vr_returns):
        feed_dict = {
            self.tfs: s,
            self.tfa: a,
            self.tfdc_r: r,
            self.tfadv: adv,
            self.tfs_history: vr_states,
            self.return_history: vr_returns
        }
        # st = self.sess.run(self.aloss, feed_dict = feed_dict)
        # ratio = self.sess.run(self.ratio, feed_dict = feed_dict)
        # # st2 = self.sess.run(self.surr, feed_dict = feed_dict)
        # print('aloss', st.flatten())
        # print('ratio',ratio.flatten())
        # # print(st2)
        # # print(np.mean(st2))

        vr_loss = 0
        # tloss, aloss, vloss, entropy, _ = self.sess.run([self.base_loss, self.aloss, self.closs, self.entropy, self.train_base_op]
        tloss, aloss, vloss, vr_loss, entropy, _ = self.sess.run(
            [
                self.base_loss, self.aloss, self.closs, self.loss_history,
                self.entropy, self.train_base_op
            ],
            feed_dict=feed_dict)

        return tloss, aloss, vloss, 0, vr_loss, np.mean(entropy)

    def update_all_task(self, s, a, r, adv, rp_states, rp_labels, vr_states,
                        vr_returns):
        feed_dict = {
            self.tfs: s,
            self.tfa: a,
            self.tfdc_r: r,
            self.tfadv: adv,
            self.tfs_label: rp_states,
            self.label: rp_labels,
            self.tfs_history: vr_states,
            self.return_history: vr_returns
        }
        # st = self.sess.run(self.aloss, feed_dict = feed_dict)
        # print(st)
        tloss, aloss, vloss, rp_loss, vr_loss, entropy, _ = self.sess.run(
            [
                self.total_loss, self.aloss, self.closs, self.loss_pred,
                self.loss_history, self.entropy, self.train_op
            ],
            feed_dict=feed_dict)

        return tloss, aloss, vloss, rp_loss, vr_loss, np.mean(entropy)

    def shuffel_data(self, s, a, r, adv):
        index_shuffeled = np.random.choice(len(r), len(r), replace=False)
        s_shuf, a_shuf, r_shuf, adv_shuf = [], [], [], []

        for i in index_shuffeled:
            s_shuf.append(s[i])
            a_shuf.append(a[i])
            r_shuf.append(r[i])
            adv_shuf.append(adv[i])

        return s_shuf, a_shuf, r_shuf, adv_shuf

    def shuffel_history(self, history_states, history_returns):
        index_shuffeled = np.random.choice(len(history_returns),
                                           len(history_returns),
                                           replace=False)
        s_shuf, r_shuf = [], []

        for i in index_shuffeled:
            s_shuf.append(history_states[i])
            r_shuf.append(history_returns[i])

        return s_shuf, r_shuf  #, np.array(r_shuf)[:, np.newaxis]

    def get_vr_batch(self, s, r):
        # Start from the current rollout batch so the result is always defined,
        # then append replayed history / goal / crash states for value replay.
        combined_states = s
        combined_returns = r

        # history buffer
        if History_buffer_full or History_count > 0:
            if History_buffer_full:
                his_size = History_buffer_size
            else:
                his_size = History_count

            combined_states = np.concatenate(
                (combined_states, History_states[:his_size]), axis=0)
            combined_returns = np.concatenate(
                (combined_returns, np.array(
                    History_return[:his_size])[:, np.newaxis]),
                axis=0)

        # goal buffer
        if Goal_buffer_full or Goal_count > 0:
            if Goal_buffer_full:
                his_size = RP_buffer_size
            else:
                his_size = Goal_count

            combined_states = np.concatenate(
                (combined_states, Goal_states[:his_size]), axis=0)
            combined_returns = np.concatenate(
                (combined_returns, np.array(
                    Goal_return[:his_size])[:, np.newaxis]),
                axis=0)

        #crash buffer
        if Crash_buffer_full or Crash_count > 0:
            if Crash_buffer_full:
                his_size = RP_buffer_size
            else:
                his_size = Crash_count

            combined_states = np.concatenate(
                (combined_states, Crash_states[:his_size]), axis=0)
            combined_returns = np.concatenate(
                (combined_returns, np.array(
                    Crash_return[:his_size])[:, np.newaxis]),
                axis=0)

        return combined_states, combined_returns

    def update(self):
        global GLOBAL_UPDATE_COUNTER, G_ITERATION
        while not COORD.should_stop():
            UPDATE_EVENT.wait()  # wait until get batch of data
            self.sess.run(self.update_oldpi_op)  # copy pi to old pi
            data = [QUEUE.get() for _ in range(QUEUE.qsize())
                    ]  # collect data from all workers
            data = np.vstack(data)
            # s, a, r, adv = data[:, :S_DIM], data[:, S_DIM: S_DIM + A_DIM], data[:, S_DIM + A_DIM: S_DIM + A_DIM + 1], data[:, -1:]
            # Batch layout per row: state (S_DIM), action (1), discounted
            # return (1), raw reward (1), advantage (1).
            s = data[:, :S_DIM]
            a = data[:, S_DIM:S_DIM + 1]
            r = data[:, S_DIM + 1:S_DIM + 2]
            reward = data[:, S_DIM + 2:S_DIM + 3]
            adv = data[:, -1:]
            self.ob_rms.update(s)
            if adv.std() != 0:
                adv = (adv - adv.mean()) / adv.std()
                print('adv min max', adv.min(), adv.max())

            # print('adv', adv)
            # adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
            # update actor and critic in a update loop

            mean_return = np.mean(r)
            print(G_ITERATION, '  --------------- update! batch size:', len(a),
                  '-----------------')
            print(
                '--------------------------------------------------------------------------------------'
            )

            # combined_states, combined_returns = self.get_vr_batch(s, r)
            combined_states, combined_returns = s, r

            print('a batch', len(r), 'v batch', len(combined_returns))

            for iteration in range(UPDATE_STEP):
                # construct reward predict data
                tloss, aloss, vloss, rp_loss, vr_loss = [], [], [], [], []
                tloss_sum, aloss_sum, vloss_sum, rp_loss_sum, vr_loss_sum, entropy_sum = 0, 0, 0, 0, 0, 0

                # s, a, r, adv = self.shuffel_data(s, a, r, adv)

                combined_states, combined_returns = self.shuffel_history(
                    combined_states, combined_returns)

                count = 0
                for start in range(0, len(combined_returns), MIN_BATCH_SIZE):
                    # print('update',iteration, count)
                    end = start + MIN_BATCH_SIZE
                    if end > len(combined_returns) - 1:
                        break
                    count += 1
                    sub_s = combined_states[start:end]
                    # sub_a = a[start:end]
                    sub_r = combined_returns[start:end]
                    # sub_adv = adv[start:end]

                    rp_states, rp_labels, rp_returns = self.get_rp_buffer(
                        MIN_BATCH_SIZE * 1, MIN_BATCH_SIZE * 1)
                    # vr_states, vr_returns = self.get_vr_buffer(MIN_BATCH_SIZE*1)

                    # vr_states = np.concatenate((vr_states, s), axis=0)
                    # vr_returns = np.concatenate((vr_returns, r), axis=0)
                    # if len(rp_states) != 0:
                    #     vr_states = np.concatenate((vr_states, rp_states), axis=0)
                    #     vr_returns = np.concatenate((vr_returns, rp_returns), axis=0)

                    # if len(rp_states) != 0:
                    #     tloss, aloss, vloss, rp_loss, vr_loss, entropy = self.update_all_task(sub_s, sub_a, sub_r, sub_adv, rp_states, rp_labels, vr_states, vr_returns)
                    # else:
                    # tloss, aloss, vloss, rp_loss, vr_loss, entropy = self.update_base_task(sub_s, sub_a, sub_r, sub_adv, vr_states, vr_returns)
                    tloss, aloss, vloss, rp_loss, vr_loss, entropy = self.update_base_task(
                        s, a, r, adv, sub_s, sub_r)

                    tloss_sum += tloss
                    aloss_sum += aloss
                    vloss_sum += vloss
                    rp_loss_sum += rp_loss
                    vr_loss_sum += vr_loss
                    entropy_sum += entropy

                if count == 0:
                    count = 1
                    print(
                        '---------------  need more sample  --------------- ')
                    break

                print("aloss: %7.4f|, vloss: %7.4f|, rp_loss: %7.4f|, vr_loss: %7.4f|, entropy: %7.4f" % \
                                    (aloss_sum/count, vloss_sum/count, rp_loss_sum/count, vr_loss_sum/count, entropy_sum/count))

            # [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(UPDATE_STEP)]
            # [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(UPDATE_STEP)]

            print(Goal_count, Crash_count, History_count)
            print(Goal_buffer_full, Crash_buffer_full, History_buffer_full)
            entropy = self.sess.run(self.entropy, {self.tfs: s})
            self.write_summary('Loss/entropy', np.mean(entropy))
            self.write_summary('Loss/a loss', aloss_sum / count)
            self.write_summary('Loss/v loss', vloss_sum / count)
            self.write_summary('Loss/rp loss', rp_loss_sum / count)
            self.write_summary('Loss/vr loss', vr_loss_sum / count)
            self.write_summary('Loss/t loss', tloss_sum / count)
            self.write_summary('Perf/mean_reward', np.mean(reward))

            self.saver.save(self.sess, './model/rl/model.cptk')

            UPDATE_EVENT.clear()  # updating finished
            GLOBAL_UPDATE_COUNTER = 0  # reset counter
            G_ITERATION += 1
            ROLLING_EVENT.set()  # set roll-out available

    def _build_feature_net(self, name, input_state, reuse=False):
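        # Shared feature extractor: normalize the observation with the running
        # mean/std, split it into a grid part (the first num_img values) and a
        # small state vector (the last state_size values), pass each through
        # tanh layers, and concatenate the results.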
        w_init = tf.contrib.layers.xavier_initializer()
        # w_init = tf.zeros_initializer()
        with tf.variable_scope(name, reuse=reuse):
            state_size = 5
            num_img = S_DIM - state_size - 1  #
            img_size = int(math.sqrt(num_img))
            print(num_img, img_size)

            input_state = (input_state - self.ob_rms.mean) / self.ob_rms.std
            ob_grid = tf.slice(input_state, [0, 0], [-1, num_img])
            # tp_state = tf.slice(self.tfs, [0, num_img], [-1, 2])
            # rp_state = tf.slice(self.tfs, [0, num_img+2], [-1, 3])
            # action_taken = tf.slice(self.tfs, [0, num_img+4], [-1, 1])
            # index_in_ep = tf.slice(self.tfs, [0, num_img+5], [-1, 1])

            ob_state = tf.slice(input_state, [0, num_img], [-1, state_size])
            # ob_state = tf.concat([ob_state , index_in_ep], 1, name = 'concat_ob')
            # reshaped_grid = tf.reshape(ob_grid,shape=[-1, img_size, img_size, 1])
            ob_state = tf.reshape(ob_state, shape=[-1, state_size])

            x = (ob_grid - 0.5) * 2
            x = tf.layers.dense(x,
                                100,
                                tf.nn.tanh,
                                kernel_initializer=w_init,
                                name='x_fc1')
            x = tf.layers.dense(x,
                                50,
                                tf.nn.tanh,
                                kernel_initializer=w_init,
                                name='x_fc2')

            # process state
            state_rt = tf.layers.dense(ob_state,
                                       state_size * 10,
                                       tf.nn.tanh,
                                       kernel_initializer=w_init,
                                       name='rt_fc1')
            # state_rt = tf.layers.dense(state_rt, state_size*10, tf.nn.tanh, name='rt_fc2' )

            feature = tf.concat([x, state_rt], 1, name='concat')
            # feature = state_rt
            # feature = tf.layers.dense(state_concat, 100, tf.nn.tanh, name='feature_fc' )
        return feature

    def _build_anet(self, name, trainable):
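        # Gaussian policy head on the shared features: one tanh hidden layer,
        # a tanh-bounded mean and a softplus standard deviation per action
        # dimension.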
        # w_init = tf.random_normal_initializer(0., .1)
        # w_init = tf.zeros_initializer()
        w_init = tf.contrib.layers.xavier_initializer()
        with tf.variable_scope(name):
            l1 = tf.layers.dense(self.feature,
                                 100,
                                 tf.nn.tanh,
                                 trainable=trainable)
            # l1 = self.feature

            mu = tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable)
            # logstd = tf.get_variable(name="logstd", shape=[1, A_DIM], initializer=tf.zeros_initializer(), trainable=trainable)
            sigma = tf.layers.dense(l1,
                                    A_DIM,
                                    tf.nn.softplus,
                                    trainable=trainable)
            norm_dist = tf.distributions.Normal(
                loc=mu, scale=sigma)  #   tf.exp(logstd))

            # norm_dist = tf.layers.dense(l1, A_DIM, tf.nn.softmax, kernel_initializer=w_init, trainable=trainable)

        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
        return norm_dist, params

    def _build_cnet(self, name, input_state, reuse=False):
        w_init = tf.contrib.layers.xavier_initializer()
        # w_init = tf.zeros_initializer()
        with tf.variable_scope(name, reuse=reuse):
            l1 = tf.layers.dense(input_state,
                                 100,
                                 tf.nn.tanh,
                                 kernel_initializer=w_init)
            # l1 = input_state
            v = tf.layers.dense(l1, 1)
        return v

    def choose_action(self, s, stochastic=True, show_plot=False):
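        # Sample an action from the current policy (or take its mean when
        # stochastic=False); optionally scatter-plot the clipped action with
        # marker sizes proportional to the policy's standard deviation.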
        s = s[np.newaxis, :]
        if stochastic:
            a = self.sess.run(self.sample_op, {self.tfs: s})[0]
        else:
            a = self.sess.run(self.sample_op_stochastic, {self.tfs: s})[0]

        mean, scale = self.sess.run([self.sample_op_stochastic, self.std],
                                    {self.tfs: s})

        mean = mean[0]
        scale = scale[0]
        scale = np.append(scale, 0)  # keep lengths consistent with the extra point plotted below

        scale = np.pi * (20 * scale)**2
        a = np.clip(a, -1, 1)
        if show_plot:
            plt.clf()
            plt.scatter(range(A_DIM + 1),
                        np.append(a, 1.0).flatten(),
                        s=scale,
                        c=[10, 10, 10, 10])
            plt.pause(0.01)
            # print(prob)

        return a, 0

    # def choose_action(self, s, stochastic = True, show_plot = False):  # run by a local
    #     prob_weights = self.sess.run(self.pi, feed_dict={self.tfs: s[np.newaxis, :]})

    #     if stochastic:
    #         action = np.random.choice(range(prob_weights.shape[1]),
    #                               p=prob_weights.ravel())  # select action w.r.t the actions prob
    #     else:
    #         action = np.argmax(prob_weights.ravel())

    #     if show_plot:
    #         prob = prob_weights.ravel()
    #         plt.clf()
    #         plt.scatter(range(A_DIM+1), np.append(prob, 0.5).flatten() )
    #         plt.pause(0.01)
    #         # print(s[-6:])
    #         # print(prob)
    #     return action, prob_weights.ravel()

    def get_v(self, s):
        if s.ndim < 2: s = s[np.newaxis, :]
        return self.sess.run(self.v, {self.tfs: s})[0, 0]
Example #27
0
if __name__ == '__main__':
    torch.manual_seed(29)
    random.seed(21)
    np.random.seed(218)
    env_name = "Walker2d-v2"
    # env_name = "InvertedDoublePendulum-v2"
    env = gym.make(env_name)
    env.seed(2180)
    import os
    from datetime import datetime
    from gym import wrappers
    # now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    # aigym_path = os.path.join('.', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    from running_mean_std import RunningMeanStd
    rms = RunningMeanStd(env.observation_space.shape[0])
    from tensorboardX import SummaryWriter
    writer = SummaryWriter()

    obs_dims = (2, 2, 2, 2, 2, 2)
    act_dims = (1, 1, 1, 1, 1, 1)
    global_dim = 6
    # obs_dims = (1, 1, 1, 1)
    # obs_dims = (11, )
    # obs_dims = (3, 3)
    # act_dims = (1, 1)
    # global_dim = 6
    topo = ((0, 1), (1, 2), (0, 3), (3, 4), (4, 5))
    #
    # topo = ((0, 1), )
    import pickle
Example #28
0
    def __init__(self):
        self.sess = tf.Session()
        self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(self.sess, shape=S_DIM)

        # critic
        # l1 = self.feature #tf.layers.dense(self.feature, 100, tf.nn.relu)
        self.feature = self._build_feature_net('feature',
                                               self.tfs,
                                               reuse=False)
        self.v = self._build_cnet('value', self.feature, reuse=False)

        self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
        self.diff_r_v = self.tfdc_r - self.v
        self.closs = tf.reduce_mean(tf.square(self.diff_r_v))
        # self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)

        # actor
        self.pi, pi_params = self._build_anet('pi', trainable=True)
        oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)

        self.update_oldpi_op = [
            oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)
        ]

        self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')

        # # for continue action
        self.tfa = tf.placeholder(tf.float32, [None, 1], 'action')
        # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
        self.ratio = self.pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5)
        self.entropy = self.pi.entropy()
        self.sample_op = tf.squeeze(self.pi.sample(1),
                                    axis=0)  # operation of choosing action
        self.sample_op_stochastic = self.pi.loc
        self.std = self.pi.scale

        # # descrete action
        # self.tfa = tf.placeholder(tf.int32, [None], 'action')
        # self.pi_prob = tf.reduce_sum((self.pi) * tf.one_hot(self.tfa, A_DIM, dtype=tf.float32), axis=1, keep_dims=True)
        # oldpi_prob = tf.reduce_sum((oldpi) * tf.one_hot(self.tfa, A_DIM, dtype=tf.float32), axis=1, keep_dims=True)
        # self.ratio = self.pi_prob / (oldpi_prob + 1e-5) #tf.exp(self.log_pi - log_oldpi)
        # self.entropy = -tf.reduce_sum(self.pi * tf.log(self.pi + 1e-5), axis=1, keep_dims=True)

        self.surr1 = self.ratio * self.tfadv
        self.surr2 = tf.clip_by_value(self.ratio, 1. - EPSILON,
                                      1. + EPSILON) * self.tfadv  # clipped ratio times advantage
        self.surr = tf.minimum(self.surr1, self.surr2) + 0.0 * self.entropy
        self.aloss = -tf.reduce_mean(self.surr)

        # value replay
        self.tfs_history = tf.placeholder(tf.float32, [None, S_DIM],
                                          'state_history')  # for value replay
        self.return_history = tf.placeholder(
            tf.float32, [None, 1], 'history_return')  # for value replay

        self.feature_history = self._build_feature_net(
            'feature', self.tfs_history, reuse=True)  # for value replay
        self.v_history = self._build_cnet('value',
                                          self.feature_history,
                                          reuse=True)
        self.diff_history = self.return_history - self.v_history
        self.loss_history = tf.reduce_mean(tf.square(self.diff_history))

        # reward predict
        self.tfs_label = tf.placeholder(tf.float32, [None, S_DIM],
                                        'state_label')  # for reward prediction
        self.label = tf.placeholder(tf.int32, [None], 'true_label')

        self.feature_label = self._build_feature_net(
            'feature', self.tfs_label, reuse=True)  # for reward prediction
        self.pred_label = tf.layers.dense(self.feature_label, 2)
        self.loss_pred = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.pred_label, labels=self.label))

        ###########################################################################################
        self.total_loss = self.aloss + (self.closs * 1 + self.loss_pred * 0 +
                                        self.loss_history * 0)
        self.base_loss = self.aloss + self.closs * 1 + self.loss_history * 0

        global_step = tf.Variable(0, trainable=False)
        starter_learning_rate = LR
        end_learning_rate = LR / 10
        decay_steps = 10
        learning_rate = tf.train.polynomial_decay(starter_learning_rate,
                                                  global_step,
                                                  decay_steps,
                                                  end_learning_rate,
                                                  power=0.5)

        # optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        self.train_op = optimizer.minimize(self.total_loss,
                                           global_step=global_step)
        self.train_base_op = optimizer.minimize(self.base_loss,
                                                global_step=global_step)

        # self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()
        self.summary_writer = tf.summary.FileWriter('./log', self.sess.graph)
Example #29
0
                    episodes, ep_steps, ep_reward, rank))


if __name__ == '__main__':
    mp.set_start_method('spawn')
    obs_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    n_eval = 100
    net = ActorCriticNet(obs_space, action_space).to(device)
    net.share_memory()
    param_p = [p for n, p in net.named_parameters() if 'pol' in n]
    param_v = [p for n, p in net.named_parameters() if 'val' in n]
    optim_p = torch.optim.AdamW(param_p, lr=P_LR, eps=1e-6)
    optim_v = torch.optim.AdamW(param_v, lr=V_LR, eps=1e-6)
    optimizer = [optim_p, optim_v]
    norm_obs = RunningMeanStd(shape=env.observation_space.shape)

    jobs = []
    pipes = []
    trajectory = []
    rewards = deque(maxlen=n_eval)
    update = 0
    steps = 0
    for i in range(N_PROCESS):
        parent, child = mp.Pipe()
        p = mp.Process(target=roll_out,
                       args=(env, ROLL_LEN // N_PROCESS, i, child),
                       daemon=True)
        jobs.append(p)
        pipes.append(parent)
    def __init__(self, sess, obs_shape_list, summary_writer):
        self.sess = sess
        self.obs_shape_list = obs_shape_list
        self.summary_writer = summary_writer

        self.BS = 1

        self.s_t0_rms = RunningMeanStd(shape=obs_shape_list[0])
        self.s_t1_rms = RunningMeanStd(shape=obs_shape_list[1])
        self.s_t2_rms = RunningMeanStd(shape=obs_shape_list[2])
        self.s_t3_rms = RunningMeanStd(shape=obs_shape_list[3])
        self.s_t4_rms = RunningMeanStd(shape=obs_shape_list[4])
        self.goal_state0_rms = RunningMeanStd(shape=obs_shape_list[5])
        self.goal_state1_rms = RunningMeanStd(shape=obs_shape_list[6])
        self.goal_obs_rms = RunningMeanStd(shape=obs_shape_list[7])
        # achieved goals have the same shape with that of desired goals
        self.achvd_obs_rms = RunningMeanStd(shape=obs_shape_list[10])
        self.achvd_state0_rms = RunningMeanStd(shape=obs_shape_list[8])
        self.achvd_state1_rms = RunningMeanStd(shape=obs_shape_list[9])
# global values
steps = 0
ep_rewards = []
reward_eval = []
is_rollout = False
is_solved = False

# make memories
train_memory = []
roll_memory = []
obses = []
rews = []
rewards = []
values = []
norm_obs = RunningMeanStd(shape=env.observation_space.shape)
norm_rew = RunningMeanStd()

# make neural networks
net = ActorCriticNet(obs_space, action_space).to(device)
old_net = deepcopy(net)

# grouped_parameters = [
#         {'params': [p for n, p in net.named_parameters() if n == 'val'], 'lr': LR * 0.1},
#         {'params': [p for n, p in net.named_parameters() if n != 'val'], 'lr': LR}
# ]
param_p = [p for n, p in net.named_parameters() if 'val' not in n]
param_v = [p for n, p in net.named_parameters() if 'val' in n]
optim_p = torch.optim.AdamW(param_p, lr=LR, eps=1e-6)
optim_v = torch.optim.AdamW(param_v, lr=0.001, eps=1e-6)
optimizer = [optim_p, optim_v]
Example #32
0
class RandomNetworkDistillation():
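    """Random Network Distillation (RND) exploration bonus.

    A fixed, randomly initialized target network embeds observations; a trainable
    predictor is regressed onto those embeddings with an MSE loss. The prediction
    error for a state, normalized by running loss statistics and clipped to
    [-5, 5], is returned as the intrinsic (novelty) reward.
    """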
    def __init__(self,
                 input_size=8,
                 learning_late=1e-4,
                 verbose=1,
                 use_cuda=False,
                 tensorboard=False):
        self.target = torch.nn.Sequential(torch.nn.Linear(input_size, 64),
                                          torch.nn.Linear(64, 128),
                                          torch.nn.Linear(128, 64))

        self.predictor = torch.nn.Sequential(torch.nn.Linear(input_size, 64),
                                             torch.nn.Linear(64, 128),
                                             torch.nn.Linear(128, 128),
                                             torch.nn.Linear(128, 64))

        self.loss_function = torch.nn.MSELoss(reduction='mean')
        self.optimizer = torch.optim.Adam(self.predictor.parameters(),
                                          lr=learning_late)
        for param in self.target.parameters():
            param.requires_grad = False
        self.verbose = verbose
        self.tensorboard = tensorboard
        if self.tensorboard:
            self.summary = SummaryWriter()
        self.iteration = 0

        self.device = torch.device('cuda' if use_cuda else 'cpu')
        self.target.to(self.device)
        self.predictor.to(self.device)

        self.running_stats = RunningMeanStd()

    def learn(self, x, n_steps=500):
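        # Train the predictor to match the frozen target on the batch x for
        # n_steps gradient steps; the final loss updates the running statistics
        # used to normalize intrinsic rewards.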
        intrinsic_reward = self.get_intrinsic_reward(x[0])
        if self.tensorboard:
            self.summary.add_scalar('intrinsic-reward', intrinsic_reward,
                                    self.iteration)
        x = np.float32(x)
        x = torch.from_numpy(x).to(self.device)
        y_train = self.target(x)
        for t in range(n_steps):
            y_pred = self.predictor(x)
            loss = self.loss_function(y_pred, y_train)
            if t % 100 == 99:
                if self.verbose > 0:
                    print("timesteps: {}, loss: {}".format(t, loss.item()))
            self.optimizer.zero_grad()
            loss.backward(retain_graph=True)
            self.optimizer.step()
            if self.tensorboard:
                self.summary.add_scalar('loss/loss', loss.item(),
                                        self.iteration)
            self.iteration += 1
        self.running_stats.update(arr=np.array([loss.item()]))
        if self.tensorboard:
            self.summary.add_scalar('loss/running-mean',
                                    self.running_stats.mean, self.iteration)
            self.summary.add_scalar('loss/running-var', self.running_stats.var,
                                    self.iteration)

    def evaluate(self, x):
        x = np.float32(x)
        x = torch.from_numpy(x).to(self.device)
        y_test = self.target(x)
        y_pred = self.predictor(x)
        loss = self.loss_function(y_pred, y_test)
        print("evaluation loss: {}".format(loss.item()))
        return loss.item()

    def get_intrinsic_reward(self, x):
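        # Intrinsic reward = predictor-vs-target MSE on x, normalized by the
        # running loss statistics and clipped to [-5, 5].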
        x = np.float32(x)
        x = torch.from_numpy(x).to(self.device)
        predict = self.predictor(x)
        target = self.target(x)
        intrinsic_reward = self.loss_function(predict,
                                              target).data.cpu().numpy()
        intrinsic_reward = (intrinsic_reward - self.running_stats.mean
                            ) / np.sqrt(self.running_stats.var)
        intrinsic_reward = np.clip(intrinsic_reward, -5, 5)
        return intrinsic_reward

    def save(self, path="rnd_model/", subfix=None):
        # Ensure the checkpoint directory exists.
        Path(path).mkdir(parents=True, exist_ok=True)
        if subfix is not None:
            subfix = "_" + subfix
        else:
            subfix = ""
        with open("{}/running_stat.pkl".format(path), 'wb') as f:
            pickle.dump(self.running_stats, f)
        torch.save(self.target.state_dict(),
                   "{}/target{}.pt".format(path, subfix))
        torch.save(self.predictor.state_dict(),
                   "{}/predictor{}.pt".format(path, subfix))

    def load(self, path="rnd_model/", subfix=None):
        if subfix is not None:
            subfix = "_" + subfix
        else:
            subfix = ""
        with open("{}/running_stat.pkl".format(path), 'rb') as f:
            self.running_stats = pickle.load(f)
        self.target.load_state_dict(
            torch.load("{}/target{}.pt".format(path, subfix),
                       map_location=torch.device(self.device)))
        self.predictor.load_state_dict(
            torch.load("{}/predictor{}.pt".format(path, subfix),
                       map_location=torch.device(self.device)))

    def set_to_inference(self):
        self.target.eval()
        self.predictor.eval()
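
# Minimal usage sketch (added for illustration; not part of the original
# example). It assumes 8-dimensional observations, matching the default
# input_size above, and uses a random batch as stand-in data.
if __name__ == '__main__':
    import numpy as np

    rnd = RandomNetworkDistillation(input_size=8, verbose=1)
    states = np.random.rand(32, 8).astype(np.float32)   # hypothetical observation batch
    rnd.learn(states, n_steps=200)                       # fit the predictor to the frozen target
    bonus = rnd.get_intrinsic_reward(states[0])          # normalized, clipped novelty bonus
    print("intrinsic reward:", bonus)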