def __init__(self, input_size=8, learning_rate=1e-4, verbose=1,
             use_cuda=False, tensorboard=False):
    self.target = torch.nn.Sequential(
        torch.nn.Linear(input_size, 64),
        torch.nn.Linear(64, 128),
        torch.nn.Linear(128, 64),
    )
    self.predictor = torch.nn.Sequential(
        torch.nn.Linear(input_size, 64),
        torch.nn.Linear(64, 128),
        torch.nn.Linear(128, 128),
        torch.nn.Linear(128, 64),
    )
    self.loss_function = torch.nn.MSELoss(reduction='mean')
    self.optimizer = torch.optim.Adam(self.predictor.parameters(), lr=learning_rate)

    # The target network is fixed; only the predictor is trained.
    for param in self.target.parameters():
        param.requires_grad = False

    self.verbose = verbose
    self.tensorboard = tensorboard
    if self.tensorboard:
        self.summary = SummaryWriter()
    self.iteration = 0

    self.device = torch.device('cuda' if use_cuda else 'cpu')
    self.target.to(self.device)
    self.predictor.to(self.device)
    self.running_stats = RunningMeanStd()
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """

    def __init__(self, venv, ob=True, ret=False, clipob=5., cliprew=10.,
                 gamma=0.99, epsilon=1e-8, use_tf=False):
        VecEnvWrapper.__init__(self, venv)
        if use_tf:
            from running_mean_std import TfRunningMeanStd
            self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape,
                                           scope='ob_rms') if ob else None
            self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms') if ret else None
        else:
            from running_mean_std import RunningMeanStd
            self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
            self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)
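The RunningMeanStd tracker imported from running_mean_std is used throughout these snippets but never defined in them. Below is a minimal sketch of the usual NumPy implementation (the parallel-variance update popularized by OpenAI Baselines); treat it as an illustration of what that module typically provides, not the exact class these repos ship.

import numpy as np

class RunningMeanStd:
    """Tracks a running mean and variance over batches of samples."""

    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, np.float64)
        self.var = np.ones(shape, np.float64)
        self.count = epsilon

    def update(self, x):
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        self.update_from_moments(batch_mean, batch_var, x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        # Chan et al. parallel mean/variance combination.
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        self.mean, self.var, self.count = new_mean, m_2 / tot_count, tot_count

    @property
    def std(self):
        return np.sqrt(self.var)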
class Normalizer:
    """
    Normalizes states and rewards using running means and standard deviations.
    Based on OpenAI's Stable Baselines.
    """

    def __init__(self, env_params, gamma, clip_obs=5, clip_rew=5, eps=1e-8):
        with tf.variable_scope('obs_rms'):
            self.obs_rms = RunningMeanStd(shape=(env_params['observation'], ))
        with tf.variable_scope('ret_rms'):
            self.ret_rms = RunningMeanStd(shape=(1, ))
        self.clip_obs = clip_obs
        self.clip_rew = clip_rew
        self.epsilon = eps
        self.disc_reward = np.array([0])
        self.gamma = gamma

    def normalize_state(self, obs, training=True):
        observation = obs
        if training:
            self.obs_rms.update(np.array(observation))
        observation = np.clip(
            (observation - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon),
            -self.clip_obs, self.clip_obs)
        return observation

    def normalize_reward(self, reward, training=True):
        if training:
            self.disc_reward = self.disc_reward * self.gamma + reward
            self.ret_rms.update(self.disc_reward.flatten())
        r = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon),
                    -self.clip_rew, self.clip_rew)
        return r

    @staticmethod
    def load(load_path, venv):
        """
        Loads a saved VecNormalize object.

        :param load_path: the path to load from.
        :param venv: the VecEnv to wrap.
        :return: (VecNormalize)
        """
        with open(load_path, "rb") as file_handler:
            norm = pickle.load(file_handler)
        return norm

    def save(self, save_path):
        with open(save_path, "wb") as file_handler:
            pickle.dump(self, file_handler)
def __init__(self, obs, action_space, hid_size, num_hidden_layers,
             num_sub_policies, gaussian_fixed_var=True):
    super(PolicyNet, self).__init__()
    self.obs = obs
    self.action_space = action_space
    self.num_hidden_layers = num_hidden_layers
    self.num_sub_policies = num_sub_policies
    self.gaussian_fixed_var = gaussian_fixed_var
    self.hid_size = hid_size

    # Normalize observations with running statistics and clip to [-5, 5].
    ob_dim = int(self.obs.get_shape()[1])
    self.ob_rms = RunningMeanStd(shape=(ob_dim, ))
    obz = np.clip((self.obs - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    last_out = t.FloatTensor(obz)

    # Hidden stack: the first layer maps ob_dim -> hid_size, the remaining
    # layers map hid_size -> hid_size, each followed by a Tanh nonlinearity.
    self.hiddenlayer = nn.ModuleList()
    in_dim = ob_dim
    for _ in range(self.num_hidden_layers):
        self.hiddenlayer.append(nn.Sequential(nn.Linear(in_dim, self.hid_size), nn.Tanh()))
        in_dim = self.hid_size
    for layer in self.hiddenlayer:
        last_out = layer(last_out)
def __init__(self, obs_shape_list, sess=None, summary_writer=None):
    self.sess = sess
    _obs_shape_list = obs_shape_list
    self.summary_writer = summary_writer
    action_shape = (1, 8)
    self.BS = 1
    # self.full_stt_rms = RunningMeanStd(shape=obs_shape_list[1])
    self.s_t0_rms = RunningMeanStd(shape=_obs_shape_list[0])  # (100, 100, 3) image
    self.s_t1_rms = RunningMeanStd(shape=_obs_shape_list[1])  # (7,) joint positions
    self.s_t2_rms = RunningMeanStd(shape=_obs_shape_list[2])  # (7,) joint velocities
    self.s_t3_rms = RunningMeanStd(shape=_obs_shape_list[3])  # (7,) joint efforts
    self.s_t4_rms = RunningMeanStd(shape=_obs_shape_list[4])  # (1,) gripper
    self.s_t5_rms = RunningMeanStd(shape=_obs_shape_list[5])  # (7,) end-effector
    self.s_t6_rms = RunningMeanStd(shape=_obs_shape_list[6])  # (3,) auxiliary
    self.a_t_rms = RunningMeanStd(shape=action_shape)         # (1, 8) action
def __init__(self, name, ob, ac_space, network='mlp', gaussian_fixed_var=True,
             nsteps=None, nbatch=None, nlstm=256, states=None, masks=None, reuse=False):
    self.network = network
    shape = []
    for d in range(1, len(ob.shape)):
        shape.append(ob.shape[d])

    with tf.variable_scope(name, reuse=reuse):
        self.scope = tf.get_variable_scope().name

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=shape)
        obs = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        if network == 'mlp':
            hid_size = 64
            num_hid_layers = 2
            self.hid_size = hid_size
            self.num_hid_layers = num_hid_layers
            self.gaussian_fixed_var = gaussian_fixed_var
            self._mlp(obs, hid_size, num_hid_layers, ac_space, gaussian_fixed_var)
        elif network == 'cnn':
            self._cnn(obs, ac_space, gaussian_fixed_var)
        elif network == 'lstm':
            assert nsteps is not None and nbatch is not None
            assert states is not None and masks is not None
            assert isinstance(nsteps, int) and isinstance(nbatch, int)
            assert nsteps > 0 and nbatch > 0
            self._lstm(obs, states, masks, nlstm, ac_space, nbatch, nsteps)

        # sample actions
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        if network == 'mlp' or network == 'cnn':
            self._act = U.function([stochastic, ob], [ac, self.vpred])
        elif network == 'lstm':
            self._act = U.function([stochastic, ob, states, masks],
                                   [ac, self.vpred, self.snew])
def __init__(self, name, ob, ac_space, num_subpolicies, network='mlp',
             gaussian_fixed_var=True):
    self.num_subpolicies = num_subpolicies
    self.gaussian_fixed_var = gaussian_fixed_var
    shape = []
    for d in range(1, len(ob.shape)):
        shape.append(ob.shape[d])

    with tf.variable_scope("obfilter", reuse=tf.AUTO_REUSE):
        self.ob_rms = RunningMeanStd(shape=shape)
    obs = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    with tf.variable_scope(name):
        self.scope = tf.get_variable_scope().name
        if network == 'mlp':
            hid_size = 64
            num_hid_layers = 2
            self.hid_size = hid_size
            self.num_hid_layers = num_hid_layers
            self._mlp(obs, num_subpolicies, hid_size, num_hid_layers,
                      ac_space, gaussian_fixed_var)
        elif network == 'cnn':
            self._cnn(obs, num_subpolicies)

    # sample actions
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])

    # debug
    self._debug = U.function([stochastic, ob], [ac, self.selector])
    self._act_forced = U.function([stochastic, ob, self.selector], [ac, self.vpred])
def train(variant): set_global_seeds(variant['seed']) if variant['mode'] == 'local': import colored_traceback.always ''' Set-up folder and files ''' snapshot_dir = logger.get_snapshot_dir() working_dir = config.PROJECT_PATH param_path = os.path.join(working_dir, 'params/params.json') # copyfile(param_path, os.path.join(snapshot_dir,'params.json')) try: ''' Save parameters ''' if 'params' in variant: logger.log('Load params from variant.') params = variant['params'] else: logger.log('Load params from file.') with open(param_path, 'r') as f: params = json.load(f) # Save to snapshot dir new_param_path = os.path.join(snapshot_dir, 'params.json') with open(new_param_path, 'w') as f: json.dump(params, f, sort_keys=True, indent=4, separators=(',', ': ')) # TODO: can use variant to modify here. dynamics_opt_params = params['dynamics_opt_params'] dynamics_opt_params['stop_critereon'] = stop_critereon( threshold=dynamics_opt_params['stop_critereon']['threshold'], offset=dynamics_opt_params['stop_critereon']['offset']) dynamics_opt_params = Dynamics_opt_params(**dynamics_opt_params) policy_opt_params = params['policy_opt_params'] policy_opt_params['stop_critereon'] = stop_critereon( threshold=policy_opt_params['stop_critereon']['threshold'], offset=policy_opt_params['stop_critereon']['offset'], percent_models_threshold=policy_opt_params['stop_critereon'] ['percent_models_threshold']) policy_opt_params = Policy_opt_params(**policy_opt_params) rollout_params = params['rollout_params'] rollout_params['monitorpath'] = os.path.join(snapshot_dir, 'videos') rollout_params = Rollout_params(**rollout_params) assert params['rollout_params']['max_timestep'] == \ params['policy_opt_params']['oracle_maxtimestep'] == \ params['policy_opt_params']['T'] ''' Policy model ''' def build_policy_from_rllab(scope_name='training_policy'): ''' Return both rllab policy and policy model function. ''' sess = tf.get_default_session() ### Initialize training_policy to copy from policy from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy output_nonlinearity = eval(params['policy']['output_nonlinearity']) training_policy = GaussianMLPPolicy( name=scope_name, env_spec=env.spec, hidden_sizes=params['policy']['hidden_layers'], init_std=policy_opt_params.trpo['init_std'], output_nonlinearity=output_nonlinearity) training_policy_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope='training_policy') sess.run([tf.variables_initializer(training_policy_vars)]) ### Compute policy model function using the same weights. 
training_layers = training_policy._mean_network.layers def policy_model(x, stochastic=0.0, collect_summary=False): assert (training_layers[0].shape[1] == x.shape[1]) h = x for i, layer in enumerate(training_layers[1:]): w = layer.W b = layer.b pre_h = tf.matmul(h, w) + b h = layer.nonlinearity(pre_h, name='policy_out') if collect_summary: with tf.name_scope(scope_name + '/observation'): variable_summaries(x) with tf.name_scope(scope_name + '/layer%d' % i): with tf.name_scope('weights'): variable_summaries(w) with tf.name_scope('biases'): variable_summaries(b) with tf.name_scope('Wx_plus_b'): tf.summary.histogram('pre_activations', pre_h) tf.summary.histogram('activations', h) std = training_policy._l_std_param.param h += stochastic * tf.random_normal( shape=(tf.shape(x)[0], n_actions)) * tf.exp(std) return h return training_policy, policy_model ''' Dynamics model ''' def get_value(key, dict): return key in dict and dict[key] def prepare_input(xgu, xgu_norm, scope_name, variable_name, collect_summary, prediction_type): name_scope = '%s/%s' % (scope_name, variable_name) assert n_states > 1 and n_actions > 1 \ and xgu.shape[1] == n_states + n_actions + n_goals xu = tf.concat([xgu[:, :n_states], xgu[:, n_states + n_goals:]], axis=1) xu_norm = tf.concat( [xgu_norm[:, :n_states], xgu_norm[:, n_states + n_goals:]], axis=1) # Collect data summaries if collect_summary: with tf.name_scope(name_scope + '/inputs'): with tf.name_scope('states'): data_summaries(xgu[:, :n_states]) with tf.name_scope('goals'): data_summaries(xgu[:, n_states:n_states + n_goals]) with tf.name_scope('actions'): data_summaries(xgu[:, n_states + n_goals:]) # Ignore xy in the current state. if get_value('ignore_xy_input', params['dynamics_model']): n_inputs = n_states + n_actions - 2 nn_input = xu_norm[:, 2:] elif get_value('ignore_x_input', params['dynamics_model']): n_inputs = n_states + n_actions - 1 nn_input = xu_norm[:, 1:] else: n_inputs = n_states + n_actions nn_input = xu_norm hidden_layers = list(params['dynamics_model']['hidden_layers']) nonlinearity = [ eval(_x) for _x in params['dynamics_model']['nonlinearity'] ] assert (len(nonlinearity) == len(hidden_layers)) # Verify if the input type is valid. 
if prediction_type == 'state_change' or \ prediction_type == 'state_change_goal': n_outputs = n_states else: assert prediction_type == 'second_derivative' or \ prediction_type == 'second_derivative_goal' n_outputs = int(n_states / 2) nonlinearity.append(tf.identity) hidden_layers.append(n_outputs) return xu, nn_input, n_inputs, n_outputs, \ nonlinearity, hidden_layers def build_ff_neural_net(nn_input, n_inputs, hidden_layers, nonlinearity, scope_name, variable_name, collect_summary, logit_weights=None, initializer=layers.xavier_initializer()): assert len(hidden_layers) == len(nonlinearity) name_scope = '%s/%s' % (scope_name, variable_name) h = nn_input n_hiddens = n_inputs n_hiddens_next = hidden_layers[0] for i in range(len(hidden_layers)): w = get_scope_variable(scope_name, "%s/layer%d/weights" % (variable_name, i), shape=(n_hiddens, n_hiddens_next), initializer=initializer) b = get_scope_variable(scope_name, "%s/layer%d/biases" % (variable_name, i), shape=(n_hiddens_next), initializer=initializer) if collect_summary: with tf.name_scope(name_scope + '/layer%d' % i): with tf.name_scope('weights'): variable_summaries(w) with tf.name_scope('biases'): variable_summaries(b) with tf.name_scope('Wx_plus_b'): pre_h = tf.matmul(h, w) + b tf.summary.histogram('pre_activations', pre_h) h = nonlinearity[i](pre_h, name='activation') tf.summary.histogram('activations', h) else: pre_h = tf.matmul(h, w) + b h = nonlinearity[i](pre_h, name='activation') n_hiddens = hidden_layers[i] if i + 1 < len(hidden_layers): n_hiddens_next = hidden_layers[i + 1] if logit_weights is not None and i == len(hidden_layers) - 2: h *= logit_weights return h def build_dynamics_model(n_states, n_actions, n_goals, dt=None, input_rms=None, diff_rms=None): prediction_type = params['dynamics_model']['prediction_type'] def dynamics_model(xgu, scope_name, variable_name, collect_summary=False): ''' :param xu: contains states, goals, actions :param scope_name: :param variable_name: :param dt: :return: ''' xu, nn_input, n_inputs, n_outputs, nonlinearity, hidden_layers = \ prepare_input(xgu, (xgu - input_rms.mean)/input_rms.std, scope_name, variable_name, collect_summary, prediction_type) if "use_logit_weights" in params["dynamics_model"] and params[ "dynamics_model"]["use_logit_weights"]: logit_weights = build_ff_neural_net( nn_input, n_inputs, hidden_layers[:-1], nonlinearity[:-2] + [tf.nn.sigmoid], scope_name, variable_name + '_sig', collect_summary) else: logit_weights = None nn_output = build_ff_neural_net(nn_input, n_inputs, hidden_layers, nonlinearity, scope_name, variable_name, collect_summary, logit_weights=logit_weights) # predict the delta instead (x_next-x_current) if 'state_change' in prediction_type: next_state = tf.add( diff_rms.mean[:n_states] + diff_rms.std[:n_outputs] * nn_output, xu[:, :n_states]) else: assert 'second_derivative' in prediction_type # We train 'out' to match state_dot_dot # Currently only works for swimmer. 
qpos = xu[:, :n_outputs] + dt * xu[:, n_outputs:n_states] qvel = xu[:, n_outputs:n_states] + dt * nn_output next_state = tf.concat([qpos, qvel], axis=1) if '_goal' in prediction_type: assert n_goals > 1 g = xgu[:, n_states:n_states + n_goals] next_state = tf.concat([next_state, g], axis=1) return tf.identity(next_state, name='%s/%s/dynamics_out' % (scope_name, variable_name)) return dynamics_model def get_regularizer_loss(scope_name, variable_name): if params['dynamics_model']['regularization']['method'] in [ None, '' ]: return tf.constant(0.0, dtype=tf.float32) constant = params['dynamics_model']['regularization']['constant'] regularizer = eval( params['dynamics_model']['regularization']['method']) hidden_layers = params['dynamics_model']['hidden_layers'] reg_loss = 0.0 for i in range(len(hidden_layers) + 1): w = get_scope_variable( scope_name, "%s/layer%d/weights" % (variable_name, i)) b = get_scope_variable( scope_name, "%s/layer%d/biases" % (variable_name, i)) reg_loss += regularizer(w) + regularizer(b) return constant * reg_loss ''' Main ''' # with get_session() as sess: if variant['mode'] == 'local': sess = get_session(interactive=True, mem_frac=0.1) else: sess = get_session(interactive=True, mem_frac=1.0, use_gpu=variant['use_gpu']) # data = joblib.load(os.path.join(working_dir, params['trpo_path'])) env = get_env(variant['params']['env']) # policy = data['policy'] training_policy, policy_model = build_policy_from_rllab() if hasattr(env._wrapped_env, '_wrapped_env'): inner_env = env._wrapped_env._wrapped_env else: inner_env = env._wrapped_env.env.unwrapped n_obs = inner_env.observation_space.shape[0] n_actions = inner_env.action_space.shape[0] cost_np = inner_env.cost_np cost_tf = inner_env.cost_tf cost_np_vec = inner_env.cost_np_vec if hasattr(inner_env, 'n_goals'): n_goals = inner_env.n_goals n_states = inner_env.n_states assert n_goals + n_states == n_obs else: n_goals = 0 n_states = n_obs dt = None # Only necessary for second_derivative if hasattr(inner_env, 'model') and hasattr(inner_env, 'frame_skip'): dt = inner_env.model.opt.timestep * inner_env.frame_skip from running_mean_std import RunningMeanStd with tf.variable_scope('input_rms'): input_rms = RunningMeanStd(epsilon=0.0, shape=(n_states + n_goals + n_actions)) with tf.variable_scope('diff_rms'): diff_rms = RunningMeanStd(epsilon=0.0, shape=(n_states + n_goals)) dynamics_model = build_dynamics_model(n_states=n_states, n_actions=n_actions, n_goals=n_goals, dt=dt, input_rms=input_rms, diff_rms=diff_rms) kwargs = {} kwargs['input_rms'] = input_rms kwargs['diff_rms'] = diff_rms kwargs['mode'] = variant['mode'] if params['algo'] == 'vpg': from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline from algos.vpg import VPG baseline = LinearFeatureBaseline(env_spec=env.spec) algo = VPG( env=env, policy=training_policy, baseline=baseline, batch_size=policy_opt_params.vpg['batch_size'], max_path_length=policy_opt_params.T, discount=policy_opt_params.vpg['discount'], ) kwargs['rllab_algo'] = algo if params["policy_opt_params"]["vpg"]["reset"]: kwargs['reset_opt'] = tf.assign( training_policy._l_std_param.param, np.log(params["policy_opt_params"]["vpg"]["init_std"]) * np.ones(n_actions)) elif params['algo'] == 'trpo': ### Write down baseline and algo from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline from algos.trpo import TRPO baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=training_policy, baseline=baseline, batch_size=policy_opt_params.trpo['batch_size'], 
max_path_length=policy_opt_params.T, discount=policy_opt_params.trpo['discount'], step_size=policy_opt_params.trpo['step_size'], ) kwargs['rllab_algo'] = algo if params["policy_opt_params"]["trpo"]["reset"]: kwargs['reset_opt'] = tf.assign( training_policy._l_std_param.param, np.log(params["policy_opt_params"]["trpo"]["init_std"]) * np.ones(n_actions)) # if "decay_rate" in params["policy_opt_params"]["trpo"]: # kwargs['trpo_std_decay'] = tf.assign_sub(training_policy._l_std_param.param, # np.log(params["policy_opt_params"]["trpo"]["decay_rate"])*np.ones(n_actions)) kwargs['inner_env'] = inner_env kwargs['algo_name'] = params['algo'] kwargs['logstd'] = training_policy._l_std_param.param # Save initial policy joblib.dump(training_policy, os.path.join(snapshot_dir, 'params-initial.pkl')) train_models(env=env, dynamics_model=dynamics_model, dynamics_opt_params=dynamics_opt_params, get_regularizer_loss=get_regularizer_loss, policy_model=policy_model, policy_opt_params=policy_opt_params, rollout_params=rollout_params, cost_np=cost_np, cost_np_vec=cost_np_vec, cost_tf=cost_tf, snapshot_dir=snapshot_dir, working_dir=working_dir, n_models=params['n_models'], sweep_iters=params['sweep_iters'], sample_size=params['sample_size'], verbose=False, variant=variant, saved_policy=training_policy, **kwargs) # Make sure not to reinitialize TRPO policy. # Save the final policy joblib.dump(training_policy, os.path.join(snapshot_dir, 'params.pkl')) except Exception as e: rmtree(snapshot_dir) import sys, traceback # traceback.print_exception(*sys.exc_info()) from IPython.core.ultratb import ColorTB c = ColorTB() exc = sys.exc_info() print(''.join(c.structured_traceback(*exc))) print('Removed the experiment folder %s.' % snapshot_dir)
# Build the key classes
if args.logger == "wandb":
    tracker = WandBTracker(args.name, args)
else:
    tracker = ConsoleTracker(args.name, args)
game_player = GamePlayer(args, shared_obs)

if action_type == "discrete":
    dist = Discrete(args.num_actions)
elif action_type == "continuous":
    dist = Normal(args.num_actions)

if args.model == "cnn":
    model = CNNBase(1, args.num_actions, dist).to(device)
elif args.model == "mlp":
    model = MLPBase(args.num_obs, args.num_actions, dist).to(device)

optim = torch.optim.Adam(model.parameters(), lr=args.lr)
reward_normalizer = RunningMeanStd(shape=())
obs_normalizer = RunningMeanStd(shape=(args.num_obs, ))

# Main loop
i = 0
for i in range(args.num_iterations):
    # Run num_steps of the game in each worker and accumulate results in
    # the data arrays
    game_player.run_rollout(args, shared_obs, rewards, discounted_rewards,
                            values, policy_probs, actions, model,
                            obs_normalizer, device, episode_ends)
    observations = shared_obs.copy()

    if args.model == "mlp":
        # Normalize rewards
class DDPG(object):
    def __init__(self, memory, nb_status, nb_actions, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.),
                 actor_lr=1e-4, critic_lr=1e-3):
        self.nb_status = nb_status
        self.nb_actions = nb_actions
        self.action_range = action_range
        self.observation_range = observation_range
        self.normalize_observations = normalize_observations

        self.actor = Actor(self.nb_status, self.nb_actions)
        self.actor_target = Actor(self.nb_status, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic(self.nb_status, self.nb_actions)
        self.critic_target = Critic(self.nb_status, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        # Create replay buffer
        self.memory = memory  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.action_noise = action_noise

        # Hyper-parameters
        self.batch_size = batch_size
        self.tau = tau
        self.discount = gamma

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd()
        else:
            self.obs_rms = None

    def pi(self, obs, apply_noise=True, compute_Q=True):
        obs = np.array([obs])
        action = to_numpy(self.actor(to_tensor(obs))).squeeze(0)
        if compute_Q:
            q = self.critic([to_tensor(obs), to_tensor(action)]).cpu().data
        else:
            q = None
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q[0][0]

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        next_q_values = self.critic_target([
            to_tensor(batch['obs1'], volatile=True),
            self.actor_target(to_tensor(batch['obs1'], volatile=True))])
        next_q_values.volatile = False

        target_q_batch = to_tensor(batch['rewards']) + \
            self.discount * to_tensor(1 - batch['terminals1'].astype('float32')) * next_q_values

        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(batch['obs0']), to_tensor(batch['actions'])])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(batch['obs0']),
                                    self.actor(to_tensor(batch['obs0']))]).mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return value_loss.cpu().data[0], policy_loss.cpu().data[0]

    def initialize(self):
        hard_update(self.actor_target, self.actor)  # Make sure target starts with the same weights
        hard_update(self.critic_target, self.critic)

    def update_target_net(self):
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def reset(self):
        if self.action_noise is not None:
            self.action_noise.reset()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()
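The soft_update and hard_update helpers called above are not defined in this snippet. A minimal sketch of the conventional Polyak-averaging versions, assuming plain torch.nn.Module networks (the repo's own utilities may differ in detail):

def soft_update(target, source, tau):
    # target <- (1 - tau) * target + tau * source
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)


def hard_update(target, source):
    # Copy the source weights into the target network verbatim.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)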
class PpoOptimizer(object):
    envs = None

    def __init__(self, *, scope, ob_space, ac_space, stochpol, ent_coef, gamma, lam,
                 nepochs, lr, cliprange, nminibatches, normrew, normadv, use_news,
                 ext_coeff, int_coeff, nsteps_per_seg, nsegs_per_env, dynamics):
        self.dynamics = dynamics
        self.use_recorder = True
        self.n_updates = 0
        self.scope = scope
        self.ob_space = ob_space
        self.ac_space = ac_space
        self.stochpol = stochpol
        self.nepochs = nepochs
        self.lr = lr
        self.cliprange = cliprange
        self.nsteps_per_seg = nsteps_per_seg
        self.nsegs_per_env = nsegs_per_env
        self.nminibatches = nminibatches
        self.gamma = gamma
        self.lam = lam
        self.normrew = normrew
        self.normadv = normadv
        self.use_news = use_news
        self.ent_coef = ent_coef
        self.ext_coeff = ext_coeff
        self.int_coeff = int_coeff

    def start_interaction(self, env_fns, dynamics, nlump=2):
        # copy a link, not deepcopy.
        param_list = (self.stochpol.param_list + self.dynamics.param_list +
                      self.dynamics.auxiliary_task.param_list)
        self.optimizer = torch.optim.Adam(param_list, lr=self.lr)
        self.optimizer.zero_grad()
        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)
        self.nlump = nlump
        self.lump_stride = nenvs // self.nlump
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
                   spaces=[self.ob_space, self.ac_space]) for l in range(self.nlump)
        ]
        self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space,
                               nenvs=nenvs, nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                               envs=self.envs, policy=self.stochpol,
                               int_rew_coeff=self.int_coeff, ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder, dynamics=dynamics)
        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()

    def stop_interaction(self):
        for env in self.envs:
            env.close()

    def calculate_advantages(self, rews, use_news, gamma, lam):
        nsteps = self.rollout.nsteps
        lastgaelam = 0
        for t in range(nsteps - 1, -1, -1):  # nsteps-2 ... 0
            nextnew = self.rollout.buf_news[:, t + 1] if t + 1 < nsteps else self.rollout.buf_new_last
            if not use_news:
                nextnew = 0
            nextvals = self.rollout.buf_vpreds[:, t + 1] if t + 1 < nsteps else self.rollout.buf_vpred_last
            nextnotnew = 1 - nextnew
            delta = rews[:, t] + gamma * nextvals * nextnotnew - self.rollout.buf_vpreds[:, t]
            self.buf_advs[:, t] = lastgaelam = delta + gamma * lam * nextnotnew * lastgaelam
        self.buf_rets[:] = self.buf_advs + self.rollout.buf_vpreds

    def update(self):
        if self.normrew:
            rffs = np.array([self.rff.update(rew) for rew in self.rollout.buf_rews.T])
            rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
            self.rff_rms.update_from_moments(rffs_mean, rffs_std**2, rffs_count)
            rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
        else:
            rews = np.copy(self.rollout.buf_rews)
        self.calculate_advantages(rews=rews, use_news=self.use_news,
                                  gamma=self.gamma, lam=self.lam)

        info = dict(advmean=self.buf_advs.mean(),
                    advstd=self.buf_advs.std(),
                    retmean=self.buf_rets.mean(),
                    retstd=self.buf_rets.std(),
                    vpredmean=self.rollout.buf_vpreds.mean(),
                    vpredstd=self.rollout.buf_vpreds.std(),
                    ev=explained_variance(self.rollout.buf_vpreds.ravel(),
                                          self.buf_rets.ravel()),
                    rew_mean=np.mean(self.rollout.buf_rews),
                    recent_best_ext_ret=self.rollout.current_max)
        if self.rollout.best_ext_ret is not None:
            info['best_ext_ret'] = self.rollout.best_ext_ret

        to_report = {'total': 0.0, 'pg': 0.0, 'vf': 0.0, 'ent': 0.0,
                     'approxkl': 0.0, 'clipfrac': 0.0, 'aux': 0.0,
                     'dyn_loss': 0.0, 'feat_var': 0.0}

        # normalize advantages
        if self.normadv:
            m, s = get_mean_and_std(self.buf_advs)
            self.buf_advs = (self.buf_advs - m) / (s + 1e-7)

        envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
        envsperbatch = max(1, envsperbatch)
        envinds = np.arange(self.nenvs * self.nsegs_per_env)

        mblossvals = []
        for _ in range(self.nepochs):
            np.random.shuffle(envinds)
            for start in range(0, self.nenvs * self.nsegs_per_env, envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]

                acs = self.rollout.buf_acs[mbenvinds]
                rews = self.rollout.buf_rews[mbenvinds]
                vpreds = self.rollout.buf_vpreds[mbenvinds]
                nlps = self.rollout.buf_nlps[mbenvinds]
                obs = self.rollout.buf_obs[mbenvinds]
                rets = self.buf_rets[mbenvinds]
                advs = self.buf_advs[mbenvinds]
                last_obs = self.rollout.buf_obs_last[mbenvinds]

                lr = self.lr
                cliprange = self.cliprange

                self.stochpol.update_features(obs, acs)
                self.dynamics.auxiliary_task.update_features(obs, last_obs)
                self.dynamics.update_features(obs, last_obs)

                feat_loss = torch.mean(self.dynamics.auxiliary_task.get_loss())
                dyn_loss = torch.mean(self.dynamics.get_loss())

                acs = torch.tensor(flatten_dims(acs, len(self.ac_space.shape)))
                neglogpac = self.stochpol.pd.neglogp(acs)
                entropy = torch.mean(self.stochpol.pd.entropy())
                vpred = self.stochpol.vpred
                vf_loss = 0.5 * torch.mean((vpred.squeeze() - torch.tensor(rets))**2)

                nlps = torch.tensor(flatten_dims(nlps, 0))
                ratio = torch.exp(nlps - neglogpac.squeeze())

                advs = flatten_dims(advs, 0)
                negadv = torch.tensor(-advs)
                pg_losses1 = negadv * ratio
                pg_losses2 = negadv * torch.clamp(ratio, min=1.0 - cliprange, max=1.0 + cliprange)
                pg_loss_surr = torch.max(pg_losses1, pg_losses2)
                pg_loss = torch.mean(pg_loss_surr)
                ent_loss = (-self.ent_coef) * entropy

                approxkl = 0.5 * torch.mean((neglogpac - nlps)**2)
                clipfrac = torch.mean((torch.abs(pg_losses2 - pg_loss_surr) > 1e-6).float())
                feat_var = torch.std(self.dynamics.auxiliary_task.features)

                total_loss = pg_loss + ent_loss + vf_loss + feat_loss + dyn_loss
                total_loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()

                to_report['total'] += total_loss.data.numpy() / (self.nminibatches * self.nepochs)
                to_report['pg'] += pg_loss.data.numpy() / (self.nminibatches * self.nepochs)
                to_report['vf'] += vf_loss.data.numpy() / (self.nminibatches * self.nepochs)
                to_report['ent'] += ent_loss.data.numpy() / (self.nminibatches * self.nepochs)
                to_report['approxkl'] += approxkl.data.numpy() / (self.nminibatches * self.nepochs)
                to_report['clipfrac'] += clipfrac.data.numpy() / (self.nminibatches * self.nepochs)
                to_report['feat_var'] += feat_var.data.numpy() / (self.nminibatches * self.nepochs)
                to_report['aux'] += feat_loss.data.numpy() / (self.nminibatches * self.nepochs)
                to_report['dyn_loss'] += dyn_loss.data.numpy() / (self.nminibatches * self.nepochs)

        info.update(to_report)
        self.n_updates += 1
        info["n_updates"] = self.n_updates
        info.update({dn: (np.mean(dvs) if len(dvs) > 0 else 0)
                     for (dn, dvs) in self.rollout.statlists.items()})
        info.update(self.rollout.stats)
        if "states_visited" in info:
            info.pop("states_visited")
        tnow = time.time()
        info["ups"] = 1. / (tnow - self.t_last_update)
        info["total_secs"] = tnow - self.t_start
        info['tps'] = self.rollout.nsteps * self.nenvs / (tnow - self.t_last_update)  # MPI.COMM_WORLD.Get_size() *
        self.t_last_update = tnow

        return info

    def step(self):
        self.rollout.collect_rollout()
        update_info = self.update()
        return {'update': update_info}

    def get_var_values(self):
        return self.stochpol.get_var_values()

    def set_var_values(self, vv):
        self.stochpol.set_var_values(vv)
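RewardForwardFilter, which feeds the RunningMeanStd used for reward normalization above, is also not shown in these snippets. A minimal sketch matching the usual RND-style implementation: it keeps a discounted, non-resetting running return per environment so that the reward scale can be estimated from its standard deviation.

class RewardForwardFilter:
    def __init__(self, gamma):
        self.rewems = None
        self.gamma = gamma

    def update(self, rews):
        # Discounted running sum of rewards, never reset at episode boundaries.
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems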
def main():
    actor_critic = core.MLPActorCritic
    hidden_size = 64
    activation = torch.nn.Tanh
    seed = 5
    steps_per_epoch = 2048
    epochs = 1000
    gamma = 0.99
    lam = 0.97
    clip_ratio = 0.2
    pi_lr = 3e-4
    vf_lr = 1e-3
    train_pi_iters = 80
    train_vf_iters = 80
    max_ep_len = 1000
    target_kl = 0.01
    save_freq = 10
    obs_norm = True
    view_curve = False

    # make an environment
    # env = gym.make('CartPole-v0')
    # env = gym.make('CartPole-v1')
    # env = gym.make('MountainCar-v0')
    # env = gym.make('LunarLander-v2')
    env = gym.make('BipedalWalker-v3')
    print(f"reward_threshold: {env.spec.reward_threshold}")

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Random seed
    env.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space,
                      (hidden_size, hidden_size), activation)

    # Set up optimizers for policy and value function
    pi_optimizer = AdamW(ac.pi.parameters(), lr=pi_lr, eps=1e-6)
    vf_optimizer = AdamW(ac.v.parameters(), lr=vf_lr, eps=1e-6)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch)
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Prepare for interaction with environment
    o, ep_ret, ep_len = env.reset(), 0, 0
    ep_num = 0
    ep_ret_buf, eval_ret_buf = [], []
    loss_buf = {'pi': [], 'vf': []}
    obs_normalizer = RunningMeanStd(shape=env.observation_space.shape)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            env.render()
            if obs_norm:
                obs_normalizer.update(np.array([o]))
                o_norm = np.clip((o - obs_normalizer.mean) / np.sqrt(obs_normalizer.var), -10, 10)
                a, v, logp = ac.step(torch.as_tensor(o_norm, dtype=torch.float32))
            else:
                a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            if obs_norm:
                buf.store(o_norm, a, r, v, logp)
            else:
                buf.store(o, a, r, v, logp)

            # Update obs
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if timeout or epoch_ended:
                    if obs_norm:
                        obs_normalizer.update(np.array([o]))
                        o_norm = np.clip((o - obs_normalizer.mean) / np.sqrt(obs_normalizer.var), -10, 10)
                        _, v, _ = ac.step(torch.as_tensor(o_norm, dtype=torch.float32))
                    else:
                        _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    if obs_norm:
                        obs_normalizer.update(np.array([o]))
                    v = 0
                buf.finish_path(v)

                if terminal:
                    ep_ret_buf.append(ep_ret)
                    eval_ret_buf.append(np.mean(ep_ret_buf[-20:]))
                    ep_num += 1
                    if view_curve:
                        plot(ep_ret_buf, eval_ret_buf, loss_buf)
                    else:
                        print(f'Episode: {ep_num:3}\tReward: {ep_ret:3}')

                    if eval_ret_buf[-1] >= env.spec.reward_threshold:
                        print(f"\n{env.spec.id} is solved! {ep_num} Episode")
                        torch.save(
                            ac.state_dict(),
                            f'./test/saved_models/{env.spec.id}_ep{ep_num}_clear_model_ppo.pt')
                        with open(f'./test/saved_models/{env.spec.id}_ep{ep_num}_clear_norm_obs.pkl',
                                  'wb') as f:
                            pickle.dump(obs_normalizer, f, pickle.HIGHEST_PROTOCOL)
                        return

                o, ep_ret, ep_len = env.reset(), 0, 0

        # Perform PPO update!
        update(buf, train_pi_iters, train_vf_iters, clip_ratio, target_kl,
               ac, pi_optimizer, vf_optimizer, loss_buf)
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True args_dir, logs_dir, models_dir, samples_dir = get_all_save_paths( args, 'pretrain', combine_action=args.combine_action) eval_log_dir = logs_dir + "_eval" utils.cleanup_log_dir(logs_dir) utils.cleanup_log_dir(eval_log_dir) _, _, intrinsic_models_dir, _ = get_all_save_paths(args, 'learn_reward', load_only=True) if args.load_iter != 'final': intrinsic_model_file_name = os.path.join( intrinsic_models_dir, args.env_name + '_{}.pt'.format(args.load_iter)) else: intrinsic_model_file_name = os.path.join( intrinsic_models_dir, args.env_name + '.pt'.format(args.load_iter)) intrinsic_arg_file_name = os.path.join(args_dir, 'command.txt') # save args to arg_file with open(intrinsic_arg_file_name, 'w') as f: json.dump(args.__dict__, f, indent=2) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, logs_dir, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) else: raise NotImplementedError if args.use_intrinsic: obs_shape = envs.observation_space.shape if len(obs_shape) == 3: action_dim = envs.action_space.n elif len(obs_shape) == 1: action_dim = envs.action_space.shape[0] if 'NoFrameskip' in args.env_name: file_name = os.path.join( args.experts_dir, "trajs_ppo_{}.pt".format( args.env_name.split('-')[0].replace('NoFrameskip', '').lower())) else: file_name = os.path.join( args.experts_dir, "trajs_ppo_{}.pt".format(args.env_name.split('-')[0].lower())) rff = RewardForwardFilter(args.gamma) intrinsic_rms = RunningMeanStd(shape=()) if args.intrinsic_module == 'icm': print('Loading pretrained intrinsic module: %s' % intrinsic_model_file_name) inverse_model, forward_dynamics_model, encoder = torch.load( intrinsic_model_file_name) icm = IntrinsicCuriosityModule(envs, device, inverse_model, forward_dynamics_model, \ inverse_lr=args.intrinsic_lr, forward_lr=args.intrinsic_lr,\ ) if args.intrinsic_module == 'vae': print('Loading pretrained intrinsic module: %s' % intrinsic_model_file_name) vae = torch.load(intrinsic_model_file_name) icm = GenerativeIntrinsicRewardModule(envs, device, \ vae, lr=args.intrinsic_lr, \ ) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" 
else args.lr) for step in range(args.num_steps): with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) obs, reward, done, infos = envs.step(action) next_obs = obs for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, next_obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.use_intrinsic: for step in range(args.num_steps): state = rollouts.obs[step] action = rollouts.actions[step] next_state = rollouts.next_obs[step] if args.intrinsic_module == 'icm': state = encoder(state) next_state = encoder(next_state) with torch.no_grad(): rollouts.rewards[ step], pred_next_state = icm.calculate_intrinsic_reward( state, action, next_state, args.lambda_true_action) if args.standardize == 'True': buf_rews = rollouts.rewards.cpu().numpy() intrinsic_rffs = np.array( [rff.update(rew) for rew in buf_rews.T]) rffs_mean, rffs_std, rffs_count = mpi_moments( intrinsic_rffs.ravel()) intrinsic_rms.update_from_moments(rffs_mean, rffs_std**2, rffs_count) mean = intrinsic_rms.mean std = np.asarray(np.sqrt(intrinsic_rms.var)) rollouts.rewards = rollouts.rewards / torch.from_numpy(std).to( device) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(models_dir, args.algo) policy_file_name = os.path.join(save_path, args.env_name + '.pt') try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], policy_file_name) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "{} Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(args.env_name, j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
class RandomNetworkDistillation:
    def __init__(
        self,
        log_interval=10,
        lr=1e-5,
        use_cuda=False,
        verbose=0,
        log_tensorboard=False,
        path="rnd_model/",
    ):
        self.predictor = predictor_generator()
        self.target = target_generator()

        # The target network stays fixed; only the predictor is trained.
        for param in self.target.parameters():
            param.requires_grad = False
        self.target.eval()

        self.log_interval = log_interval
        self.optimizer = torch.optim.Adam(self.predictor.parameters(), lr=lr)
        self.loss_function = torch.nn.MSELoss(reduction='mean')

        self.device = torch.device('cuda' if use_cuda else 'cpu')
        self.target.to(self.device)
        self.predictor.to(self.device)

        self.running_stats = RunningMeanStd()
        self.verbose = verbose
        self.writer = SummaryWriter() if log_tensorboard else None
        self.n_iter = 0

        self.save_path = path
        Path(path).mkdir(parents=True, exist_ok=True)
        self.early_stopping = EarlyStopping(save_dir=self.save_path)

    def set_data(self, train_tensor, test_tensor):
        train_target_tensor = self.target(train_tensor.to(self.device))
        train_dataset = TensorDataset(train_tensor, train_target_tensor)
        self.train_loader = DataLoader(train_dataset)

        test_target_tensor = self.target(test_tensor.to(self.device))
        test_dataset = TensorDataset(test_tensor, test_target_tensor)
        self.test_loader = DataLoader(test_dataset)

    def learn(self, epochs):
        for epoch in range(epochs):
            self._train(epoch)
            test_loss = self._test()
        return test_loss

    def _train(self, epoch):
        self.predictor.train()
        for batch_idx, (data, target) in enumerate(self.train_loader):
            data, target = data.to(self.device), target.to(self.device)
            output = self.predictor(data)
            loss = self.loss_function(output, target)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            self.n_iter += 1
            self.running_stats.update(arr=array([loss.item()]))

            if self.verbose > 0 and batch_idx % self.log_interval == 0:
                progress = 100. * batch_idx / len(self.train_loader)
                print(f"Train Epoch: {epoch} "
                      f"[{batch_idx * len(data)}/{len(self.train_loader.dataset)} "
                      f"({progress:.0f}%)]", end="\t")
                print(f"Loss: {loss.item():.6f}")
            if self.writer is not None and self.n_iter % 100 == 0:
                self.writer.add_scalar("Loss/train", loss.item(), self.n_iter)

    def _test(self):
        self.predictor.eval()
        test_loss = 0
        with torch.no_grad():
            for data, target in self.test_loader:
                data, target = data.to(self.device), target.to(self.device)
                output = self.predictor(data)
                test_loss += self.loss_function(output, target).item()

        test_loss /= len(self.test_loader.dataset)
        if self.verbose > 0:
            print(f"\nTest set: Average loss: {test_loss:.4f}\n")
        if self.writer is not None:
            self.writer.add_scalar("Loss/test", test_loss, self.n_iter)

        self.early_stopping(test_loss, self.predictor)
        if self.early_stopping.early_stop:
            print(">> save early stop checkpoint")
        return test_loss

    def get_intrinsic_reward(self, x: torch.Tensor):
        x = x.to(self.device)
        predict = self.predictor(x)
        target = self.target(x)
        intrinsic_reward = self.loss_function(predict, target).data.cpu().numpy()
        # Standardize the prediction error with the running loss statistics, then clip.
        intrinsic_reward = (intrinsic_reward - self.running_stats.mean) / sqrt(self.running_stats.var)
        intrinsic_reward = clip(intrinsic_reward, -5, 5)
        return intrinsic_reward

    def save(self):
        path = self.save_path
        with open("{}/running_stat.pkl".format(path), 'wb') as f:
            pickle.dump(self.running_stats, f)
        torch.save(self.target.state_dict(), "{}/target.pt".format(path))
        torch.save(self.predictor.state_dict(), "{}/predictor.pt".format(path))

    def load(self, path="rnd_model/", load_checkpoint=False):
        with open("{}/running_stat.pkl".format(path), 'rb') as f:
            self.running_stats = pickle.load(f)
        self.target.load_state_dict(
            torch.load("{}/target.pt".format(path),
                       map_location=torch.device(self.device)))
        if load_checkpoint:
            self.predictor.load_state_dict(
                torch.load("{}/checkpoint.pt".format(path),
                           map_location=torch.device(self.device)))
        else:
            self.predictor.load_state_dict(
                torch.load("{}/predictor.pt".format(path),
                           map_location=torch.device(self.device)))
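A hypothetical usage sketch for the class above, assuming predictor_generator and target_generator build torch modules that accept 8-dimensional inputs; the tensors and shapes here are placeholders, not part of the original code.

# Fit the predictor to the fixed random target on a batch of visited states.
rnd = RandomNetworkDistillation(lr=1e-5, verbose=1)
rnd.set_data(train_tensor=torch.rand(1000, 8), test_tensor=torch.rand(200, 8))
rnd.learn(epochs=10)

# Novel states yield larger prediction errors, i.e. larger intrinsic rewards.
bonus = rnd.get_intrinsic_reward(torch.rand(1, 8))
rnd.save()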
class Train:
    def __init__(self, env, test_env, env_name, n_iterations, agent, epochs,
                 mini_batch_size, epsilon, horizon):
        self.env = env
        self.env_name = env_name
        self.test_env = test_env
        self.agent = agent
        self.epsilon = epsilon
        self.horizon = horizon
        self.epochs = epochs
        self.mini_batch_size = mini_batch_size
        self.n_iterations = n_iterations
        self.start_time = 0
        self.state_rms = RunningMeanStd(shape=(self.agent.n_states, ))
        self.running_reward = 0

    @staticmethod
    def choose_mini_batch(mini_batch_size, states, actions, returns, advs,
                          values, log_probs):
        full_batch_size = len(states)
        for _ in range(full_batch_size // mini_batch_size):
            indices = np.random.randint(0, full_batch_size, mini_batch_size)
            yield states[indices], actions[indices], returns[indices], \
                advs[indices], values[indices], log_probs[indices]

    def train(self, states, actions, advs, values, log_probs):
        values = np.vstack(values[:-1])
        log_probs = np.vstack(log_probs)
        returns = advs + values
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)
        actions = np.vstack(actions)
        for epoch in range(self.epochs):
            for state, action, return_, adv, old_value, old_log_prob in self.choose_mini_batch(
                    self.mini_batch_size, states, actions, returns, advs,
                    values, log_probs):
                state = torch.Tensor(state).to(self.agent.device)
                action = torch.Tensor(action).to(self.agent.device)
                return_ = torch.Tensor(return_).to(self.agent.device)
                adv = torch.Tensor(adv).to(self.agent.device)
                old_value = torch.Tensor(old_value).to(self.agent.device)
                old_log_prob = torch.Tensor(old_log_prob).to(self.agent.device)

                value = self.agent.critic(state)
                # clipped_value = old_value + torch.clamp(value - old_value, -self.epsilon, self.epsilon)
                # clipped_v_loss = (clipped_value - return_).pow(2)
                # unclipped_v_loss = (value - return_).pow(2)
                # critic_loss = 0.5 * torch.max(clipped_v_loss, unclipped_v_loss).mean()
                critic_loss = self.agent.critic_loss(value, return_)

                new_log_prob = self.calculate_log_probs(
                    self.agent.current_policy, state, action)
                ratio = (new_log_prob - old_log_prob).exp()
                actor_loss = self.compute_actor_loss(ratio, adv)

                self.agent.optimize(actor_loss, critic_loss)

        return actor_loss, critic_loss

    def step(self):
        state = self.env.reset()
        for iteration in range(1, 1 + self.n_iterations):
            states = []
            actions = []
            rewards = []
            values = []
            log_probs = []
            dones = []
            self.start_time = time.time()
            for t in range(self.horizon):
                # self.state_rms.update(state)
                state = np.clip((state - self.state_rms.mean) /
                                (self.state_rms.var**0.5 + 1e-8), -5, 5)
                dist = self.agent.choose_dist(state)
                action = dist.sample().cpu().numpy()[0]
                # action = np.clip(action, self.agent.action_bounds[0], self.agent.action_bounds[1])
                log_prob = dist.log_prob(torch.Tensor(action))
                value = self.agent.get_value(state)
                next_state, reward, done, _ = self.env.step(action)

                states.append(state)
                actions.append(action)
                rewards.append(reward)
                values.append(value)
                log_probs.append(log_prob)
                dones.append(done)

                if done:
                    state = self.env.reset()
                else:
                    state = next_state

            # self.state_rms.update(next_state)
            next_state = np.clip((next_state - self.state_rms.mean) /
                                 (self.state_rms.var**0.5 + 1e-8), -5, 5)
            next_value = self.agent.get_value(next_state) * (1 - done)
            values.append(next_value)

            advs = self.get_gae(rewards, values, dones)
            states = np.vstack(states)
            actor_loss, critic_loss = self.train(states, actions, advs,
                                                 values, log_probs)
            # self.agent.set_weights()
            self.agent.schedule_lr()
            eval_rewards = evaluate_model(self.agent, self.test_env,
                                          self.state_rms,
                                          self.agent.action_bounds)
            self.state_rms.update(states)
            self.print_logs(iteration, actor_loss, critic_loss, eval_rewards)

    @staticmethod
    def get_gae(rewards, values, dones, gamma=0.99, lam=0.95):
        advs = []
        gae = 0
        dones.append(0)
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * (values[step + 1]) * (
                1 - dones[step]) - values[step]
            gae = delta + gamma * lam * (1 - dones[step]) * gae
            advs.append(gae)
        advs.reverse()
        return np.vstack(advs)

    @staticmethod
    def calculate_log_probs(model, states, actions):
        policy_distribution = model(states)
        return policy_distribution.log_prob(actions)

    def compute_actor_loss(self, ratio, adv):
        pg_loss1 = adv * ratio
        pg_loss2 = adv * torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon)
        loss = -torch.min(pg_loss1, pg_loss2).mean()
        return loss

    def print_logs(self, iteration, actor_loss, critic_loss, eval_rewards):
        if iteration == 1:
            self.running_reward = eval_rewards
        else:
            self.running_reward = self.running_reward * 0.99 + eval_rewards * 0.01

        if iteration % 100 == 0:
            print(f"Iter:{iteration}| "
                  f"Ep_Reward:{eval_rewards:.3f}| "
                  f"Running_reward:{self.running_reward:.3f}| "
                  f"Actor_Loss:{actor_loss:.3f}| "
                  f"Critic_Loss:{critic_loss:.3f}| "
                  f"Iter_duration:{time.time() - self.start_time:.3f}| "
                  f"lr:{self.agent.actor_scheduler.get_last_lr()}")
            self.agent.save_weights(iteration, self.state_rms)

        with SummaryWriter(self.env_name + "/logs") as writer:
            writer.add_scalar("Episode running reward", self.running_reward, iteration)
            writer.add_scalar("Episode reward", eval_rewards, iteration)
            writer.add_scalar("Actor loss", actor_loss, iteration)
            writer.add_scalar("Critic loss", critic_loss, iteration)
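# Train.get_gae above is the standard GAE recursion:
#     delta_t = r_t + gamma * V(s_{t+1}) * (1 - d_t) - V(s_t)
#     A_t     = delta_t + gamma * lam * (1 - d_t) * A_{t+1}
# A tiny self-contained check of that recursion on made-up numbers (independent
# of the Train class; the trajectory below is purely illustrative):
import numpy as np

def gae(rewards, values, dones, gamma=0.99, lam=0.95):
    # `values` carries one extra bootstrap entry, as in Train.step above
    advs, running = [], 0.0
    dones = list(dones) + [0]
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] * (1 - dones[t]) - values[t]
        running = delta + gamma * lam * (1 - dones[t]) * running
        advs.append(running)
    return np.array(advs[::-1])

print(gae(rewards=[1.0, 0.0, 1.0], values=[0.5, 0.4, 0.3, 0.2], dones=[0, 0, 1]))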
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") run_id = "alpha{}".format(args.gcn_alpha) if args.use_logger: from utils import Logger folder = "{}/{}".format(args.folder, run_id) logger = Logger(algo_name=args.algo, environment_name=args.env_name, folder=folder, seed=args.seed) logger.save_args(args) print("---------------------------------------") print('Saving to', logger.save_folder) print("---------------------------------------") else: print("---------------------------------------") print('NOTE : NOT SAVING RESULTS') print("---------------------------------------") all_rewards = [] envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, args.env_name, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, actor_critic.base.output_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) ############################ # GCN Model and optimizer from pygcn.train import update_graph from pygcn.models import GCN, GAT, SAGE assert args.gnn in ['gcn', 'gat', 'sage'] if args.gnn == 'gat': gcn_model = GAT(nfeat=actor_critic.base.output_size, nhid=args.gcn_hidden) elif args.gnn == 'sage': gcn_model = SAGE(nfeat=actor_critic.base.output_size, nhid=args.gcn_hidden) elif args.gnn == 'gcn': gcn_model = GCN(nfeat=actor_critic.base.output_size, nhid=args.gcn_hidden) gcn_model.to(device) gcn_optimizer = optim.Adam(gcn_model.parameters(), lr=args.gcn_lr, weight_decay=args.gcn_weight_decay) gcn_loss = nn.NLLLoss() gcn_states = [[] for _ in range(args.num_processes)] Gs = [nx.Graph() for _ in range(args.num_processes)] node_ptrs = [0 for _ in range(args.num_processes)] rew_states = [[] for _ in range(args.num_processes)] ############################ episode_rewards = deque(maxlen=100) avg_fwdloss = deque(maxlen=100) rew_rms = RunningMeanStd(shape=()) delay_rew = torch.zeros([args.num_processes, 1]) delay_step = torch.zeros([args.num_processes]) start = time.time() for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob,\ recurrent_hidden_states, hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) delay_rew += reward delay_step += 1 for idx, (info, hid, eps_done) in enumerate(zip(infos, hidden_states, done)): if eps_done or delay_step[idx] == args.reward_freq: reward[idx] = delay_rew[idx] delay_rew[idx] = delay_step[idx] = 0 else: reward[idx] = 0 if 
'episode' in info.keys(): episode_rewards.append(info['episode']['r']) if args.gcn_alpha < 1.0: gcn_states[idx].append(hid) node_ptrs[idx] += 1 if not eps_done: Gs[idx].add_edge(node_ptrs[idx] - 1, node_ptrs[idx]) if reward[idx] != 0. or eps_done: rew_states[idx].append( [node_ptrs[idx] - 1, reward[idx]]) if eps_done: adj = nx.adjacency_matrix(Gs[idx]) if len(Gs[idx].nodes)\ else sp.csr_matrix(np.eye(1,dtype='int64')) update_graph(gcn_model, gcn_optimizer, torch.stack(gcn_states[idx]), adj, rew_states[idx], gcn_loss, args, envs) gcn_states[idx] = [] Gs[idx] = nx.Graph() node_ptrs[idx] = 0 rew_states[idx] = [] # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, hidden_states) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau, gcn_model, args.gcn_alpha) agent.update(rollouts) rollouts.after_update() ####################### Saving and book-keeping ####################### if (j % int(num_updates / 5.) == 0 or j == num_updates - 1) and args.save_dir != "": print('Saving model') print() save_dir = "{}/{}/{}".format(args.save_dir, args.folder, run_id) save_path = os.path.join(save_dir, args.algo, 'seed' + str(args.seed)) + '_iter' + str(j) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic save_gcn = gcn_model if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_gcn = copy.deepcopy(gcn_model).cpu() save_model = [ save_gcn, save_model, hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + "ac.pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print("Updates {}, num timesteps {}, FPS {} \n Last {}\ training episodes: mean/median reward {:.2f}/{:.2f},\ min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n".format( j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards), )) all_rewards.append(np.mean(episode_rewards)) if args.use_logger: logger.save_task_results(all_rewards) ####################### Saving and book-keeping ####################### envs.close()
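# The delay_rew / delay_step bookkeeping in the rollout loop above accumulates
# per-process rewards and only releases them every `reward_freq` steps or at
# episode end. A compact sketch of that bookkeeping in isolation (hypothetical
# function name; tensors shaped as in main(), i.e. reward is [num_processes, 1]):
import torch

def release_delayed_rewards(reward, done, delay_rew, delay_step, reward_freq):
    delay_rew += reward
    delay_step += 1
    for idx in range(reward.shape[0]):
        if done[idx] or delay_step[idx] == reward_freq:
            # hand back the accumulated reward and reset the counters
            reward[idx] = delay_rew[idx]
            delay_rew[idx] = delay_step[idx] = 0
        else:
            reward[idx] = 0
    return reward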
def train(env, ddpg_graph, actor, critic, cl_nn=None, pt=None, cl_mode=None, compare_with_sess=None, compare_with_actor=None, norm_complexity=0, **config): print('train: ' + config['output'] + ' started!') print("Noise: {} and {}".format(config["ou_sigma"], config["ou_theta"])) print("Actor learning rate {}".format(config["actor_lr"])) print("Critic learning rate {}".format(config["critic_lr"])) print("Minibatch size {}".format(config["minibatch_size"])) curriculums = [] if config["curriculum"]: print("Following curriculum {}".format(config["curriculum"])) items = config["curriculum"].split(";") for item in items: params = item.split("_") x = np.array(params[1:]).astype(np.float) c = {'var': params[0], 'gen': cur_gen(config["steps"], x)} curriculums.append(c) gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.15) with tf.Session(graph=ddpg_graph, config=tf.ConfigProto(gpu_options=gpu_options)) as sess: # Check if a policy needs to be loaded sess = preload_policy(sess, config) # Initialize target network weights actor.update_target_network(sess) critic.update_target_network(sess) # Load curriculum neural network weights (provided parametes have priority) if cl_nn: sess = cl_nn.load(sess, config["cl_load"]) # Initialize replay memory o_dims = env.observation_space.shape[-1] replay_buffer = ReplayBuffer(config, o_dims=o_dims) # Observation normalization. obs_range = [env.observation_space.low, env.observation_space.high] if config["normalize_observations"]: obs_rms = RunningMeanStd(shape=env.observation_space.shape) else: obs_rms = None # decide mode if cl_nn: v = pt.flatten() cl_mode_new, cl_threshold = cl_nn.predict(sess, v) #cl_threshold = pt.denormalize(cl_threshold) else: cl_mode_new = cl_mode cl_threshold = None # Initialize constants for exploration noise ou_sigma = config["ou_sigma"] ou_theta = config["ou_theta"] ou_mu = 0 trial_return = 0 max_trial_return = 0 obs_dim = actor.s_dim act_dim = actor.a_dim max_action = np.minimum(np.absolute(env.action_space.high), np.absolute(env.action_space.low)) obs = np.zeros(obs_dim) action = np.zeros(act_dim) noise = np.zeros(act_dim) tt = 0 ss = 0 ss_all = 0 terminal = 0 reach_timeout_num = 0 more_info = None ss_acc, td_acc, l2_reg_acc, action_grad_acc, actor_grad_acc = 0, 0, 0, 0, 0 prev_l2_reg = critic.l2_reg_(sess) ti = config["test_interval"] test_returns = [] avg_test_return = config['reach_return'] # rewarding object if rewards in replay buffer are to be recalculated replay_buffer.load() if config['reassess_for']: print('Reassessing replay buffer for {}'.format( config['reassess_for'])) evaluator = Evaluator(max_action) #pdb.set_trace() replay_buffer = evaluator.add_bonus(replay_buffer, how=config['reassess_for']) # Export trajectory if config['trajectory']: trajectory = [] if config["compare_with"]: actor_sim = [] # start environment for c in curriculums: c['ss'], val = next(c['gen']) d = {"action": "update_{}".format(c['var']), c['var']: val} env.reconfigure(d) test = (ti >= 0 and tt % (ti + 1) == ti) obs = env.reset(test=test) obs = obs_normalize(obs, obs_rms, obs_range, o_dims, config["normalize_observations"]) # Export environment state if cl_nn: more_info = ''.join('{:10.2f}'.format(indi) for indi in [-100, -100, -100]) more_info += ''.join('{:10.2f}'.format(vvv) for vv in v[0] for vvv in vv) more_info += ''.join('{:10.2f}'.format(th) for th in cl_threshold) env.log(more_info if cl_threshold is not None else '') # Main loop over steps or trials # Finish when trials finish # or Finish when steps finish # or Finishe when new 
mode in curriculum is switched # or Finish when certain return is reached # of Finish if trial happend to be longer then config['reach_balance'] twice in a row while (config["trials"] == 0 or tt < config["trials"]) and \ (config["steps"] == 0 or ss < config["steps"]) and \ (not cl_nn or cl_mode_new == cl_mode) and \ (not config['reach_return'] or avg_test_return <= config['reach_return']) and \ (not config['reach_timeout'] or (config['reach_timeout'] > 0 and reach_timeout_num < config['reach_timeout_num'])): # Compute OU noise and action if not test: noise = ExplorationNoise.ou_noise(ou_theta, ou_mu, ou_sigma, noise, act_dim) action = compute_action(sess, actor, obs[:o_dims], noise, test) # from [-1; 1] # obtain observation of a state next_obs, reward, terminal, info = env.step(action * max_action) #print('Forward promotion: ' + str(next_obs[-1])) #print('Reward: ' + str(reward)) next_obs = obs_normalize(next_obs, obs_rms, obs_range, o_dims, config["normalize_observations"]) reward *= config['reward_scale'] # Add the transition to replay buffer if not test: replay_buffer.replay_buffer_add(obs, action, reward, terminal == 2, next_obs) # Keep adding experience to the memory until # there are at least minibatch size samples if not test and replay_buffer.size() > config["rb_min_size"]: minibatch_size = config["minibatch_size"] s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch( minibatch_size) # Calculate targets target_q = critic.predict_target( sess, s2_batch, actor.predict_target(sess, s2_batch)) y_i = [] for k in range(minibatch_size): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + config["gamma"] * target_q[k][0]) # target_q: list -> float if config['perf_td_error']: q_i = critic.predict_target(sess, s_batch, a_batch) td_acc += np.sum( np.abs(q_i - np.reshape(y_i, newshape=(minibatch_size, 1)))) # Update the critic given the targets if config['perf_l2_reg']: _, _, l2_reg = critic.train_( sess, s_batch, a_batch, np.reshape(y_i, (minibatch_size, 1))) l2_reg_acc += (l2_reg - prev_l2_reg) prev_l2_reg = l2_reg else: critic.train(sess, s_batch, a_batch, np.reshape(y_i, (minibatch_size, 1))) # Update the actor policy using the sampled gradient a_outs = actor.predict(sess, s_batch) grad = critic.action_gradients(sess, s_batch, a_outs)[0] if config['perf_action_grad']: action_grad_acc += np.linalg.norm(grad, ord=2) if config['perf_actor_grad']: _, actor_grad = actor.train_(sess, s_batch, grad) for ag in actor_grad: actor_grad_acc += np.linalg.norm(ag, ord=2) else: actor.train(sess, s_batch, grad) ss_acc += 1 # Update target networks actor.update_target_network(sess) critic.update_target_network(sess) # Render if config["render"]: still_open = env.render("human") if still_open == False: break # Record trajectory # Note that it exports all training and testing episodes if config['trajectory']: real_time = ss_all * config['env_timestep'] trajectory.append([real_time] + obs[:o_dims].tolist() + (action * max_action).tolist() + next_obs[:o_dims].tolist() + [reward] + [terminal]) # + [info]) if config["compare_with"]: compare_with_action = compute_action( compare_with_sess, compare_with_actor, obs[:o_dims], noise, test) actor_sim.append([real_time] + obs[:o_dims].tolist() + (compare_with_action * max_action).tolist()) # Prepare next step obs = next_obs trial_return += reward # Logging performance at the end of the testing trial if terminal and test: # NN performance indicators more_info = "" s = info.split() norm_duration = float(s[0]) / config["env_timeout"] 
td_per_step = td_acc / ss_acc if ss_acc > 0 else 0 norm_td_error = td_per_step / config["env_td_error_scale"] norm_complexity += l2_reg_acc / ss_acc if ss_acc > 0 else 0 indicators = [norm_duration, norm_td_error, norm_complexity] more_info += ''.join('{:14.8f}'.format(indi) for indi in indicators) if cl_nn: # update PerformanceTracker pt.add(indicators) # return, duration, damage v = pt.flatten() cl_mode_new, cl_threshold = cl_nn.predict(sess, v) more_info += ''.join('{:14.8f}'.format(vvv) for vv in v[0] for vvv in vv) more_info += ''.join('{:14.8f}'.format(th) for th in cl_threshold) ss_acc, td_acc, l2_reg_acc, action_grad_acc, actor_grad_acc = 0, 0, 0, 0, 0 # report env.log(more_info) # check if performance is satisfactory test_returns.append(trial_return) avg_test_return = np.mean( test_returns[max([0, len(test_returns) - 10]):]) if float(info.split()[0]) > config['reach_timeout']: reach_timeout_num += 1 else: reach_timeout_num = 0 if not config['mp_debug']: msg = "{:>10} {:>10} {:>10.3f} {:>10}" \ .format(tt, ss, trial_return, terminal) print("{}".format(msg)) # Save NN if performance is better then before if terminal and config['save'] and trial_return > max_trial_return: max_trial_return = trial_return save_policy(sess, config, suffix="-best") if not test: ss += 1 for c in curriculums: if ss > c['ss']: c['ss'], val = next(c['gen']) d = { "action": "update_{}".format(c['var']), c['var']: val } env.reconfigure(d) ss_all += 1 if terminal: tt += 1 test = (ti >= 0 and tt % (ti + 1) == ti) obs = env.reset(test=test) obs = obs_normalize(obs, obs_rms, obs_range, o_dims, config["normalize_observations"]) reward = 0 terminal = 0 trial_return = 0 noise = np.zeros(actor.a_dim) # Export final performance, but when curriculum is not used or terminated # not due to the curriculum swithch. # Becasue data is always exported when curriculum is switched over. if (not cl_nn or cl_mode_new == cl_mode): env.log(more_info) # Export trajectory if config['trajectory']: dump_pkl_csv(config['trajectory'], trajectory) if config["compare_with"]: dump_pkl_csv(config['trajectory'] + '_sim', actor_sim) # verify replay_buffer #evaluator.reassess(replay_buffer, verify=True, task = config['reassess_for']) print('train: ' + config['output'] + ' finished!') # Save the last episode policy if config['save']: suffix = "-last" save_policy(sess, config, suffix=suffix) #save_policy(sess, saver, config, suffix=suffix) if config["normalize_observations"]: with open(config["output"] + suffix + '.obs_rms', 'w') as f: data = { 'count': obs_rms.count, 'mean': obs_rms.mean.tolist(), 'std': obs_rms.std.tolist(), 'var': obs_rms.var.tolist() } json.dump(data, f) replay_buffer.save() # save curriculum network if cl_nn: cl_nn.save(sess, config["cl_save"]) # extract damage from the last step damage = 0 info = env.get_latest_info() if info: s = info.split() damage = float(s[1]) print('train: ' + config['output'] + ' returning ' + '{} {} {} {}'.format(avg_test_return, damage, ss, cl_mode_new)) return (avg_test_return, damage, ss, cl_mode_new, norm_complexity)
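# train() above repeatedly calls an obs_normalize helper that is not defined in
# this snippet. A plausible reconstruction under stated assumptions: it updates
# the RunningMeanStd, clips the raw observation to the space bounds, and then
# standardizes the first o_dims entries. The original helper may differ.
import numpy as np

def obs_normalize(obs, obs_rms, obs_range, o_dims, normalize):
    obs = np.asarray(obs, dtype=np.float64)
    if not normalize or obs_rms is None:
        return obs
    obs_rms.update(obs[None, :o_dims])                      # track running stats
    clipped = np.clip(obs[:o_dims], obs_range[0], obs_range[1])
    obs[:o_dims] = (clipped - obs_rms.mean) / np.sqrt(obs_rms.var + 1e-8)
    return obs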
class PPO(object): def __init__(self): self.sess = tf.Session() self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state') with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(self.sess, shape=S_DIM) # critic # l1 = self.feature #tf.layers.dense(self.feature, 100, tf.nn.relu) self.feature = self._build_feature_net('feature', self.tfs, reuse=False) self.v = self._build_cnet('value', self.feature, reuse=False) self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r') self.diff_r_v = self.tfdc_r - self.v self.closs = tf.reduce_mean(tf.square(self.diff_r_v)) # self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs) # actor self.pi, pi_params = self._build_anet('pi', trainable=True) oldpi, oldpi_params = self._build_anet('oldpi', trainable=False) self.update_oldpi_op = [ oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params) ] self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage') # # for continue action self.tfa = tf.placeholder(tf.float32, [None, 1], 'action') # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa)) self.ratio = self.pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5) self.entropy = self.pi.entropy() self.sample_op = tf.squeeze(self.pi.sample(1), axis=0) # operation of choosing action self.sample_op_stochastic = self.pi.loc self.std = self.pi.scale # # descrete action # self.tfa = tf.placeholder(tf.int32, [None], 'action') # self.pi_prob = tf.reduce_sum((self.pi) * tf.one_hot(self.tfa, A_DIM, dtype=tf.float32), axis=1, keep_dims=True) # oldpi_prob = tf.reduce_sum((oldpi) * tf.one_hot(self.tfa, A_DIM, dtype=tf.float32), axis=1, keep_dims=True) # self.ratio = self.pi_prob / (oldpi_prob + 1e-5) #tf.exp(self.log_pi - log_oldpi) # self.entropy = -tf.reduce_sum(self.pi * tf.log(self.pi + 1e-5), axis=1, keep_dims=True) self.surr1 = self.ratio * self.tfadv self.surr2 = tf.clip_by_value(self.ratio, 1. - EPSILON, 1. 
+ EPSILON) self.surr = tf.minimum(self.surr1, self.surr2) + 0.0 * self.entropy self.aloss = -tf.reduce_mean(self.surr) # value replay self.tfs_history = tf.placeholder(tf.float32, [None, S_DIM], 'state_history') # for value replay self.return_history = tf.placeholder( tf.float32, [None, 1], 'history_return') # for value replay self.feature_history = self._build_feature_net( 'feature', self.tfs_history, reuse=True) # for value replay self.v_history = self._build_cnet('value', self.feature_history, reuse=True) self.diff_history = self.return_history - self.v_history self.loss_history = tf.reduce_mean(tf.square(self.diff_history)) # reward predict self.tfs_label = tf.placeholder(tf.float32, [None, S_DIM], 'state_label') # for reward prediction self.label = tf.placeholder(tf.int32, [None], 'true_label') self.feature_label = self._build_feature_net( 'feature', self.tfs_label, reuse=True) # for reward prediction self.pred_label = tf.layers.dense(self.feature_label, 2) self.loss_pred = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits( logits=self.pred_label, labels=self.label)) ########################################################################################### self.total_loss = self.aloss + (self.closs * 1 + self.loss_pred * 0 + self.loss_history * 0) self.base_loss = self.aloss + self.closs * 1 + self.loss_history * 0 global_step = tf.Variable(0, trainable=False) starter_learning_rate = LR end_learning_rate = LR / 10 decay_steps = 10 learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step, decay_steps, end_learning_rate, power=0.5) # optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9) optimizer = tf.train.AdamOptimizer(learning_rate) self.train_op = optimizer.minimize(self.total_loss, global_step=global_step) self.train_base_op = optimizer.minimize(self.base_loss, global_step=global_step) # self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss) self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver() self.summary_writer = tf.summary.FileWriter('./log', self.sess.graph) # self.load_model() def get_entropy(self): a0 = self.pi - self.max(self.pi, axis=-1, keepdims=True) ea0 = tf.exp(a0) z0 = self.sum(ea0, axis=-1, keepdims=True) p0 = ea0 / z0 entropy = self.sum(p0 * (tf.log(z0) - a0), axis=-1) return entropy def neglogp(self, pi, a): one_hot_actions = tf.one_hot(a, pi.get_shape().as_list()[-1]) return tf.nn.softmax_cross_entropy_with_logits(logits=pi, labels=one_hot_actions) def sum(self, x, axis=None, keepdims=False): axis = None if axis is None else [axis] return tf.reduce_sum(x, axis=axis, keep_dims=keepdims) def max(self, x, axis=None, keepdims=False): axis = None if axis is None else [axis] return tf.reduce_max(x, axis=axis, keep_dims=keepdims) def load_model(self): print('Loading Model...') ckpt = tf.train.get_checkpoint_state('./model/rl/') if ckpt and ckpt.model_checkpoint_path: self.saver.restore(self.sess, ckpt.model_checkpoint_path) print('loaded') else: print('no model file') def write_summary(self, summary_name, value): summary = tf.Summary() summary.value.add(tag=summary_name, simple_value=float(value)) self.summary_writer.add_summary(summary, GLOBAL_EP) self.summary_writer.flush() def get_rp_buffer(self, sample_goal_num, sample_crash_num): rp_states = [] rp_label = [] rp_return = [] sample_goal_num = int(sample_goal_num) sample_crash_num = int(sample_crash_num) size = RP_buffer_size replace = False if Goal_buffer_full == False: size = Goal_count replace = True if size > 0 and sample_goal_num > 0: if 
sample_goal_num > size * 2: sample_goal_num = size * 2 goal_selected = np.random.choice(size, sample_goal_num, replace=replace) for index in goal_selected: rp_states.append(Goal_states[index]) rp_label.append(0) rp_return.append(Goal_return[index]) size = RP_buffer_size replace = False if Crash_buffer_full == False: size = Crash_count replace = True if size > 0 and sample_crash_num > 0: if sample_crash_num > size * 2: sample_crash_num = size * 2 crash_selected = np.random.choice(size, sample_crash_num, replace=replace) for index in crash_selected: rp_states.append(Crash_states[index]) rp_label.append(1) rp_return.append(Crash_return[index]) return np.array(rp_states), np.array(rp_label), np.array( rp_return)[:, np.newaxis] def get_vr_buffer(self, sample_num): vr_states = [] vr_returns = [] sample_num = int(sample_num) size = History_buffer_size replace = False if History_buffer_full == False: size = History_count replace = True if size > 0: if sample_num > size * 2: sample_num = size * 2 index_selected = np.random.choice(size, sample_num, replace=replace) for index in index_selected: vr_states.append(History_states[index]) vr_returns.append(History_return[index]) return np.array(vr_states), np.array(vr_returns)[:, np.newaxis] def update_base_task(self, s, a, r, adv, vr_states, vr_returns): feed_dict = { self.tfs: s, self.tfa: a, self.tfdc_r: r, self.tfadv: adv, self.tfs_history: vr_states, self.return_history: vr_returns } # st = self.sess.run(self.aloss, feed_dict = feed_dict) # ratio = self.sess.run(self.ratio, feed_dict = feed_dict) # # st2 = self.sess.run(self.surr, feed_dict = feed_dict) # print('aloss', st.flatten()) # print('ratio',ratio.flatten()) # # print(st2) # # print(np.mean(st2)) vr_loss = 0 # tloss, aloss, vloss, entropy, _ = self.sess.run([self.base_loss, self.aloss, self.closs, self.entropy, self.train_base_op] tloss, aloss, vloss, vr_loss, entropy, _ = self.sess.run( [ self.base_loss, self.aloss, self.closs, self.loss_history, self.entropy, self.train_base_op ], feed_dict=feed_dict) return tloss, aloss, vloss, 0, vr_loss, np.mean(entropy) def update_all_task(self, s, a, r, adv, rp_states, rp_labels, vr_states, vr_returns): feed_dict = { self.tfs: s, self.tfa: a, self.tfdc_r: r, self.tfadv: adv, self.tfs_label: rp_states, self.label: rp_labels, self.tfs_history: vr_states, self.return_history: vr_returns } # st = self.sess.run(self.aloss, feed_dict = feed_dict) # print(st) tloss, aloss, vloss, rp_loss, vr_loss, entropy, _ = self.sess.run( [ self.total_loss, self.aloss, self.closs, self.loss_pred, self.loss_history, self.entropy, self.train_op ], feed_dict=feed_dict) return tloss, aloss, vloss, rp_loss, vr_loss, np.mean(entropy) def shuffel_data(self, s, a, r, adv): index_shuffeled = np.random.choice(len(r), len(r), replace=False) s_shuf, a_shuf, r_shuf, adv_shuf = [], [], [], [] for i in index_shuffeled: s_shuf.append(s[i]) a_shuf.append(a[i]) r_shuf.append(r[i]) adv_shuf.append(adv[i]) return s_shuf, a_shuf, r_shuf, adv_shuf def shuffel_history(self, history_states, history_returns): index_shuffeled = np.random.choice(len(history_returns), len(history_returns), replace=False) s_shuf, r_shuf = [], [] for i in index_shuffeled: s_shuf.append(history_states[i]) r_shuf.append(history_returns[i]) return s_shuf, r_shuf #, np.array(r_shuf)[:, np.newaxis] def get_vr_batch(self, s, r): # combined_states = s # combined_returns = r # history buffer if History_buffer_full or History_count > 0: if History_buffer_full: his_size = History_buffer_size else: his_size = History_count 
combined_states = History_states[:his_size] combined_returns = np.array(History_return[:his_size])[:, np.newaxis] # goal buffer if Goal_buffer_full or Goal_count > 0: if Goal_buffer_full: his_size = RP_buffer_size else: his_size = Goal_count combined_states = np.concatenate( (combined_states, Goal_states[:his_size]), axis=0) combined_returns = np.concatenate( (combined_returns, np.array( Goal_return[:his_size])[:, np.newaxis]), axis=0) #crash buffer if Crash_buffer_full or Crash_count > 0: if Crash_buffer_full: his_size = RP_buffer_size else: his_size = Crash_count combined_states = np.concatenate( (combined_states, Crash_states[:his_size]), axis=0) combined_returns = np.concatenate( (combined_returns, np.array( Crash_return[:his_size])[:, np.newaxis]), axis=0) return combined_states, combined_returns def update(self): global GLOBAL_UPDATE_COUNTER, G_ITERATION while not COORD.should_stop(): UPDATE_EVENT.wait() # wait until get batch of data self.sess.run(self.update_oldpi_op) # copy pi to old pi data = [QUEUE.get() for _ in range(QUEUE.qsize()) ] # collect data from all workers data = np.vstack(data) # s, a, r, adv = data[:, :S_DIM], data[:, S_DIM: S_DIM + A_DIM], data[:, S_DIM + A_DIM: S_DIM + A_DIM + 1], data[:, -1:] s, a, r, reward, adv = data[:, : S_DIM], data[:, S_DIM:S_DIM + 1], data[:, S_DIM + 1:S_DIM + 2], data[:, S_DIM + 2: S_DIM + 3], data[:, -1:] self.ob_rms.update(s) if adv.std() != 0: adv = (adv - adv.mean()) / adv.std() print('adv min max', adv.min(), adv.max()) # print('adv', adv) # adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r}) # update actor and critic in a update loop mean_return = np.mean(r) print(G_ITERATION, ' --------------- update! batch size:', len(a), '-----------------') print( '--------------------------------------------------------------------------------------' ) # combined_states, combined_returns = self.get_vr_batch(s, r) combined_states, combined_returns = s, r print('a batch', len(r), 'v batch', len(combined_returns)) for iteration in range(UPDATE_STEP): # construct reward predict data tloss, aloss, vloss, rp_loss, vr_loss = [], [], [], [], [] tloss_sum, aloss_sum, vloss_sum, rp_loss_sum, vr_loss_sum, entropy_sum = 0, 0, 0, 0, 0, 0 # s, a, r, adv = self.shuffel_data(s, a, r, adv) combined_states, combined_returns = self.shuffel_history( combined_states, combined_returns) count = 0 for start in range(0, len(combined_returns), MIN_BATCH_SIZE): # print('update',iteration, count) end = start + MIN_BATCH_SIZE if end > len(combined_returns) - 1: break count += 1 sub_s = combined_states[start:end] # sub_a = a[start:end] sub_r = combined_returns[start:end] # sub_adv = adv[start:end] rp_states, rp_labels, rp_returns = self.get_rp_buffer( MIN_BATCH_SIZE * 1, MIN_BATCH_SIZE * 1) # vr_states, vr_returns = self.get_vr_buffer(MIN_BATCH_SIZE*1) # vr_states = np.concatenate((vr_states, s), axis=0) # vr_returns = np.concatenate((vr_returns, r), axis=0) # if len(rp_states) != 0: # vr_states = np.concatenate((vr_states, rp_states), axis=0) # vr_returns = np.concatenate((vr_returns, rp_returns), axis=0) # if len(rp_states) != 0: # tloss, aloss, vloss, rp_loss, vr_loss, entropy = self.update_all_task(sub_s, sub_a, sub_r, sub_adv, rp_states, rp_labels, vr_states, vr_returns) # else: # tloss, aloss, vloss, rp_loss, vr_loss, entropy = self.update_base_task(sub_s, sub_a, sub_r, sub_adv, vr_states, vr_returns) tloss, aloss, vloss, rp_loss, vr_loss, entropy = self.update_base_task( s, a, r, adv, sub_s, sub_r) tloss_sum += tloss aloss_sum += aloss vloss_sum += 
vloss rp_loss_sum += rp_loss vr_loss_sum += vr_loss entropy_sum += entropy if count == 0: count = 1 print( '--------------- need more sample --------------- ') break print("aloss: %7.4f|, vloss: %7.4f|, rp_loss: %7.4f|, vr_loss: %7.4f|, entropy: %7.4f" % \ (aloss_sum/count, vloss_sum/count, rp_loss_sum/count, vr_loss_sum/count, entropy_sum/count)) # [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(UPDATE_STEP)] # [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(UPDATE_STEP)] print(Goal_count, Crash_count, History_count) print(Goal_buffer_full, Crash_buffer_full, History_buffer_full) entropy = self.sess.run(self.entropy, {self.tfs: s}) self.write_summary('Loss/entropy', np.mean(entropy)) self.write_summary('Loss/a loss', aloss_sum / count) self.write_summary('Loss/v loss', vloss_sum / count) self.write_summary('Loss/rp loss', rp_loss_sum / count) self.write_summary('Loss/vr loss', vr_loss_sum / count) self.write_summary('Loss/t loss', tloss_sum / count) self.write_summary('Perf/mean_reward', np.mean(reward)) self.saver.save(self.sess, './model/rl/model.cptk') UPDATE_EVENT.clear() # updating finished GLOBAL_UPDATE_COUNTER = 0 # reset counter G_ITERATION += 1 ROLLING_EVENT.set() # set roll-out available def _build_feature_net(self, name, input_state, reuse=False): w_init = tf.contrib.layers.xavier_initializer() # w_init = tf.zeros_initializer() with tf.variable_scope(name, reuse=reuse): state_size = 5 num_img = S_DIM - state_size - 1 # img_size = int(math.sqrt(num_img)) print(num_img, img_size) input_state = (input_state - self.ob_rms.mean) / self.ob_rms.std ob_grid = tf.slice(input_state, [0, 0], [-1, num_img]) # tp_state = tf.slice(self.tfs, [0, num_img], [-1, 2]) # rp_state = tf.slice(self.tfs, [0, num_img+2], [-1, 3]) # action_taken = tf.slice(self.tfs, [0, num_img+4], [-1, 1]) # index_in_ep = tf.slice(self.tfs, [0, num_img+5], [-1, 1]) ob_state = tf.slice(input_state, [0, num_img], [-1, state_size]) # ob_state = tf.concat([ob_state , index_in_ep], 1, name = 'concat_ob') # reshaped_grid = tf.reshape(ob_grid,shape=[-1, img_size, img_size, 1]) ob_state = tf.reshape(ob_state, shape=[-1, state_size]) x = (ob_grid - 0.5) * 2 x = tf.layers.dense(x, 100, tf.nn.tanh, kernel_initializer=w_init, name='x_fc1') x = tf.layers.dense(x, 50, tf.nn.tanh, kernel_initializer=w_init, name='x_fc2') # process state state_rt = tf.layers.dense(ob_state, state_size * 10, tf.nn.tanh, kernel_initializer=w_init, name='rt_fc1') # state_rt = tf.layers.dense(state_rt, state_size*10, tf.nn.tanh, name='rt_fc2' ) feature = tf.concat([x, state_rt], 1, name='concat') # feature = state_rt # feature = tf.layers.dense(state_concat, 100, tf.nn.tanh, name='feature_fc' ) return feature def _build_anet(self, name, trainable): # w_init = tf.random_normal_initializer(0., .1) # w_init = tf.zeros_initializer() w_init = tf.contrib.layers.xavier_initializer() with tf.variable_scope(name): l1 = tf.layers.dense(self.feature, 100, tf.nn.tanh, trainable=trainable) # l1 = self.feature mu = tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable) # logstd = tf.get_variable(name="logstd", shape=[1, A_DIM], initializer=tf.zeros_initializer(), trainable=trainable) sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable) norm_dist = tf.distributions.Normal( loc=mu, scale=sigma) # tf.exp(logstd)) # norm_dist = tf.layers.dense(l1, A_DIM, tf.nn.softmax, kernel_initializer=w_init, trainable=trainable) params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 
scope=name) return norm_dist, params def _build_cnet(self, name, input_state, reuse=False): w_init = tf.contrib.layers.xavier_initializer() # w_init = tf.zeros_initializer() with tf.variable_scope(name, reuse=reuse): l1 = tf.layers.dense(input_state, 100, tf.nn.tanh, kernel_initializer=w_init) # l1 = input_state v = tf.layers.dense(l1, 1) return v def choose_action(self, s, stochastic=True, show_plot=False): s = s[np.newaxis, :] if stochastic: a = self.sess.run(self.sample_op, {self.tfs: s})[0] else: a = self.sess.run(self.sample_op_stochastic, {self.tfs: s})[0] mean, scale = self.sess.run([self.sample_op_stochastic, self.std], {self.tfs: s}) mean = mean[0] scale = scale[0] np.append(scale, 0) scale = np.pi * (20 * scale)**2 a = np.clip(a, -1, 1) if show_plot: plt.clf() plt.scatter(range(A_DIM + 1), np.append(a, 1.0).flatten(), s=scale, c=[10, 10, 10, 10]) plt.pause(0.01) # print(prob) return a, 0 # def choose_action(self, s, stochastic = True, show_plot = False): # run by a local # prob_weights = self.sess.run(self.pi, feed_dict={self.tfs: s[np.newaxis, :]}) # if stochastic: # action = np.random.choice(range(prob_weights.shape[1]), # p=prob_weights.ravel()) # select action w.r.t the actions prob # else: # action = np.argmax(prob_weights.ravel()) # if show_plot: # prob = prob_weights.ravel() # plt.clf() # plt.scatter(range(A_DIM+1), np.append(prob, 0.5).flatten() ) # plt.pause(0.01) # # print(s[-6:]) # # print(prob) # return action, prob_weights.ravel() def get_v(self, s): if s.ndim < 2: s = s[np.newaxis, :] return self.sess.run(self.v, {self.tfs: s})[0, 0]
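# The surr1/surr2/aloss graph ops above implement PPO's clipped surrogate
# objective  L = -E[ min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ].
# A small NumPy illustration of the same clipping, independent of the
# TensorFlow graph (epsilon here is a stand-in for the global EPSILON above):
import numpy as np

def clipped_surrogate(ratio, adv, epsilon=0.2):
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * adv
    return -np.mean(np.minimum(surr1, surr2))

print(clipped_surrogate(np.array([0.5, 1.0, 1.6]), np.array([1.0, -1.0, 2.0])))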
if __name__ == '__main__':
    torch.manual_seed(29)
    random.seed(21)
    np.random.seed(218)

    env_name = "Walker2d-v2"
    # env_name = "InvertedDoublePendulum-v2"
    env = gym.make(env_name)
    env.seed(2180)

    import os
    from datetime import datetime
    from gym import wrappers
    # now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    # aigym_path = os.path.join('.', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)

    from running_mean_std import RunningMeanStd
    rms = RunningMeanStd(env.observation_space.shape[0])

    from tensorboardX import SummaryWriter
    writer = SummaryWriter()

    obs_dims = (2, 2, 2, 2, 2, 2)
    act_dims = (1, 1, 1, 1, 1, 1)
    global_dim = 6
    # obs_dims = (1, 1, 1, 1)
    # obs_dims = (11, )
    # obs_dims = (3, 3)
    # act_dims = (1, 1)
    # global_dim = 6
    topo = ((0, 1), (1, 2), (0, 3), (3, 4), (4, 5))
    # # topo = ((0, 1), )

    import pickle
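# The block above only constructs the environment, the RunningMeanStd and the
# SummaryWriter. A minimal sketch of how they are commonly wired together in a
# rollout loop (not part of the original script; assumes `rms` exposes
# update/mean/var, and note that depending on the local RunningMeanStd
# signature the constructor above may instead need shape=env.observation_space.shape):
obs = env.reset()
for step_i in range(1000):
    rms.update(obs[None, :])                                   # running statistics over states
    norm_obs = (obs - rms.mean) / np.sqrt(rms.var + 1e-8)      # standardized observation
    obs, reward, done, _ = env.step(env.action_space.sample())
    writer.add_scalar("debug/obs_mean_norm", float(np.linalg.norm(rms.mean)), step_i)
    if done:
        obs = env.reset()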
        episodes, ep_steps, ep_reward, rank))

if __name__ == '__main__':
    mp.set_start_method('spawn')
    obs_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    n_eval = 100

    net = ActorCriticNet(obs_space, action_space).to(device)
    net.share_memory()
    param_p = [p for n, p in net.named_parameters() if 'pol' in n]
    param_v = [p for n, p in net.named_parameters() if 'val' in n]
    optim_p = torch.optim.AdamW(param_p, lr=P_LR, eps=1e-6)
    optim_v = torch.optim.AdamW(param_v, lr=V_LR, eps=1e-6)
    optimizer = [optim_p, optim_v]

    norm_obs = RunningMeanStd(shape=env.observation_space.shape)

    jobs = []
    pipes = []
    trajectory = []
    rewards = deque(maxlen=n_eval)
    update = 0
    steps = 0

    for i in range(N_PROCESS):
        parent, child = mp.Pipe()
        p = mp.Process(target=roll_out,
                       args=(env, ROLL_LEN // N_PROCESS, i, child),
                       daemon=True)
        jobs.append(p)
        pipes.append(parent)
def __init__(self, sess, obs_shape_list, summary_writer):
    self.sess = sess
    obs_shape_list = obs_shape_list
    self.summary_writer = summary_writer
    self.BS = 1

    self.s_t0_rms = RunningMeanStd(shape=obs_shape_list[0])
    self.s_t1_rms = RunningMeanStd(shape=obs_shape_list[1])
    self.s_t2_rms = RunningMeanStd(shape=obs_shape_list[2])
    self.s_t3_rms = RunningMeanStd(shape=obs_shape_list[3])
    self.s_t4_rms = RunningMeanStd(shape=obs_shape_list[4])
    self.goal_state0_rms = RunningMeanStd(shape=obs_shape_list[5])
    self.goal_state1_rms = RunningMeanStd(shape=obs_shape_list[6])
    self.goal_obs_rms = RunningMeanStd(shape=obs_shape_list[7])
    # achieved goals have the same shape with that of desired goals
    self.achvd_obs_rms = RunningMeanStd(shape=obs_shape_list[10])
    self.achvd_state0_rms = RunningMeanStd(shape=obs_shape_list[8])
    self.achvd_state1_rms = RunningMeanStd(shape=obs_shape_list[9])
# global values
steps = 0
ep_rewards = []
reward_eval = []
is_rollout = False
is_solved = False

# make memories
train_memory = []
roll_memory = []
obses = []
rews = []
rewards = []
values = []
norm_obs = RunningMeanStd(shape=env.observation_space.shape)
norm_rew = RunningMeanStd()

# make neural networks
net = ActorCriticNet(obs_space, action_space).to(device)
old_net = deepcopy(net)
# grouped_parameters = [
#     {'params': [p for n, p in net.named_parameters() if n == 'val'], 'lr': LR * 0.1},
#     {'params': [p for n, p in net.named_parameters() if n != 'val'], 'lr': LR}
# ]
param_p = [p for n, p in net.named_parameters() if 'val' not in n]
param_v = [p for n, p in net.named_parameters() if 'val' in n]
optim_p = torch.optim.AdamW(param_p, lr=LR, eps=1e-6)
optim_v = torch.optim.AdamW(param_v, lr=0.001, eps=1e-6)
optimizer = [optim_p, optim_v]
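# The two parameter groups above allow the policy head and the value head to be
# stepped by separate optimizers. A minimal sketch of such a split update
# (hypothetical loss tensors; not part of the original script):
def split_update(policy_loss, value_loss, optimizer):
    optim_p, optim_v = optimizer
    optim_p.zero_grad()
    policy_loss.backward(retain_graph=True)   # keep the graph if the heads share a trunk
    optim_p.step()
    optim_v.zero_grad()
    value_loss.backward()
    optim_v.step()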
class RandomNetworkDistillation():
    def __init__(self,
                 input_size=8,
                 learning_late=1e-4,
                 verbose=1,
                 use_cuda=False,
                 tensorboard=False):
        self.target = torch.nn.Sequential(torch.nn.Linear(input_size, 64),
                                          torch.nn.Linear(64, 128),
                                          torch.nn.Linear(128, 64))
        self.predictor = torch.nn.Sequential(torch.nn.Linear(input_size, 64),
                                             torch.nn.Linear(64, 128),
                                             torch.nn.Linear(128, 128),
                                             torch.nn.Linear(128, 64))
        self.loss_function = torch.nn.MSELoss(reduction='mean')
        self.optimizer = torch.optim.Adam(self.predictor.parameters(),
                                          lr=learning_late)
        # the target network stays fixed; only the predictor is trained
        for param in self.target.parameters():
            param.requires_grad = False
        self.verbose = verbose
        self.tensorboard = tensorboard
        if self.tensorboard:
            self.summary = SummaryWriter()
        self.iteration = 0
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        self.target.to(self.device)
        self.predictor.to(self.device)
        self.running_stats = RunningMeanStd()

    def learn(self, x, n_steps=500):
        intrinsic_reward = self.get_intrinsic_reward(x[0])
        if self.tensorboard:
            self.summary.add_scalar('intrinsic-reward', intrinsic_reward,
                                    self.iteration)
        x = np.float32(x)
        x = torch.from_numpy(x).to(self.device)
        y_train = self.target(x)
        for t in range(n_steps):
            y_pred = self.predictor(x)
            loss = self.loss_function(y_pred, y_train)
            if t % 100 == 99:
                if self.verbose > 0:
                    print("timesteps: {}, loss: {}".format(t, loss.item()))
            self.optimizer.zero_grad()
            loss.backward(retain_graph=True)
            self.optimizer.step()
            if self.tensorboard:
                self.summary.add_scalar('loss/loss', loss.item(),
                                        self.iteration)
            self.iteration += 1
            # track the prediction-error statistics used to normalize rewards
            self.running_stats.update(arr=np.array([loss.item()]))
            if self.tensorboard:
                self.summary.add_scalar('loss/running-mean',
                                        self.running_stats.mean,
                                        self.iteration)
                self.summary.add_scalar('loss/running-var',
                                        self.running_stats.var,
                                        self.iteration)

    def evaluate(self, x):
        x = np.float32(x)
        x = torch.from_numpy(x).to(self.device)
        y_test = self.target(x)
        y_pred = self.predictor(x)
        loss = self.loss_function(y_pred, y_test)
        print("evaluation loss: {}".format(loss.item()))
        return loss.item()

    def get_intrinsic_reward(self, x):
        x = np.float32(x)
        x = torch.from_numpy(x).to(self.device)
        predict = self.predictor(x)
        target = self.target(x)
        intrinsic_reward = self.loss_function(predict,
                                              target).data.cpu().numpy()
        # standardize the prediction error with running statistics and clip it
        intrinsic_reward = (intrinsic_reward - self.running_stats.mean
                            ) / np.sqrt(self.running_stats.var)
        intrinsic_reward = np.clip(intrinsic_reward, -5, 5)
        return intrinsic_reward

    def save(self, path="rnd_model/", subfix=None):
        Path(path).mkdir(parents=True, exist_ok=True)
        if not os.path.isdir(path):
            os.mkdir(path)
        if subfix is not None:
            subfix = "_" + subfix
        else:
            subfix = ""
        with open("{}/running_stat.pkl".format(path), 'wb') as f:
            pickle.dump(self.running_stats, f)
        torch.save(self.target.state_dict(),
                   "{}/target{}.pt".format(path, subfix))
        torch.save(self.predictor.state_dict(),
                   "{}/predictor{}.pt".format(path, subfix))

    def load(self, path="rnd_model/", subfix=None):
        if subfix is not None:
            subfix = "_" + subfix
        else:
            subfix = ""
        with open("{}/running_stat.pkl".format(path), 'rb') as f:
            self.running_stats = pickle.load(f)
        self.target.load_state_dict(
            torch.load("{}/target{}.pt".format(path, subfix),
                       map_location=torch.device(self.device)))
        self.predictor.load_state_dict(
            torch.load("{}/predictor{}.pt".format(path, subfix),
                       map_location=torch.device(self.device)))

    def set_to_inference(self):
        self.target.eval()
        self.predictor.eval()
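# A short usage sketch for the class above: fit the predictor on a batch of
# visited states, then query an intrinsic reward for a new state. The random
# arrays below merely stand in for environment observations:
import numpy as np

rnd = RandomNetworkDistillation(input_size=8, verbose=0)
batch = np.random.randn(32, 8).astype(np.float32)   # stand-in for visited states
rnd.learn(batch, n_steps=200)                       # train predictor against the frozen target
novel_state = np.random.randn(8).astype(np.float32)
print("intrinsic reward:", rnd.get_intrinsic_reward(novel_state))
rnd.save(path="rnd_model/")                         # persists both nets and the running stats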