def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
          ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001,
          save_interval=None, lrschedule='linear', load_path=None, **network_kwargs):
    set_global_seeds(seed)

    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps=nsteps,
                               ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, lrschedule=lrschedule)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", np.nan if len(rewards) == 0 else np.mean(rewards))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    coord.request_stop()
    coord.join(enqueue_threads)
    return model
def learn(
        network,
        env,
        save_path,
        seed=None,
        nsteps=10,
        total_timesteps=int(80e6),
        vf_coef=0.5,
        ent_coef=0.01,
        max_grad_norm=0.5,
        lr=7e-4,
        lrschedule='linear',
        epsilon=1e-5,
        alpha=0.99,
        gamma=0.99,
        # log_interval=100,
        log_interval=10,
        load_path=None,
        **network_kwargs):

    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    set_global_seeds(seed)
    assert save_path is not None

    # Get the nb of env
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)

    # Calculate the batch_size (here nsteps is set to 1)
    nbatch = nenvs * nsteps

    observation = []
    action = []
    for update in range(1, total_timesteps // nbatch + 1):
        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, output = runner.run()
        observation.append(obs)
        print('times', update)

    obs = np.concatenate(observation)
    # Compute the Fisher matrix
    FM = model.compute_fisher(obs, plot_diffs=True, disp_freq=10)
    # FM = model.compute_exact_fisher(obs, plot_diffs=True, disp_freq=10)
    joblib.dump(FM, save_path)
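# --- Usage sketch (not part of the original source) ---------------------------
# The function above serializes the Fisher-matrix estimate to `save_path` with
# joblib. A minimal, hedged sketch of reading it back later (e.g. to build an
# EWC-style penalty); the path below is a hypothetical placeholder, and the
# structure of the loaded object mirrors whatever model.compute_fisher() returned.
if __name__ == '__main__':
    import joblib

    fisher_estimate = joblib.load('fisher_matrix/example_agent')  # use the same value that was passed as save_path
    print(type(fisher_estimate))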
def learn(
    network,
    env,
    seed=None,
    nsteps=5,
    total_timesteps=int(80e6),
    vf_coef=0.5,
    ent_coef=0.01,
    max_grad_norm=0.5,
    lr=7e-4,
    lrschedule='linear',
    epsilon=1e-5,
    alpha=0.99,
    gamma=0.99,
    log_interval=100,
    load_path=None,
    **network_kwargs):
    '''
    Main entrypoint for the A2C algorithm. Train a policy with a given network architecture on a given
    environment using the a2c algorithm.

    Parameters:
    -----------

    network:            policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small,
                        conv_only - see baselines.common/models.py for the full list) specifying a standard network
                        architecture, or a function that takes a tensorflow tensor as input and returns a tuple
                        (output_tensor, extra_feed) where output_tensor is the last network layer output,
                        extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing
                        how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies.

    env:                RL environment. Should implement an interface similar to VecEnv (baselines.common/vec_env)
                        or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py).

    seed:               seed to make the random number sequence in the algorithm reproducible. Defaults to None,
                        which means the seed comes from the system noise generator (not reproducible).

    nsteps:             int, number of steps of the vectorized environment per update (i.e. the batch size is
                        nsteps * nenv where nenv is the number of environment copies simulated in parallel).

    total_timesteps:    int, total number of timesteps to train on (default: 80M)

    vf_coef:            float, coefficient in front of the value function loss in the total loss function (default: 0.5)

    ent_coef:           float, coefficient in front of the policy entropy in the total loss function (default: 0.01)

    max_grad_norm:      float, the gradient is clipped to have a global L2 norm no more than this value (default: 0.5)

    lr:                 float, learning rate for RMSProp (the current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule:         schedule of the learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1]
                        that takes the fraction of the training progress as input and returns the fraction of the
                        learning rate (specified as lr) as output.

    epsilon:            float, RMSProp epsilon (stabilizes the square root computation in the denominator of the
                        RMSProp update) (default: 1e-5)

    alpha:              float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting parameter (default: 0.99)

    log_interval:       int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
                        and the arguments to a particular type of network. For instance, the 'mlp' network architecture
                        has arguments num_hidden and num_layers.
    '''
    set_global_seeds(seed)

    # Get the nb of env
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch_size
    nbatch = nenvs * nsteps

    # Start total timer
    tstart = time.time()

    for update in range(1, total_timesteps // nbatch + 1):
        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        nseconds = time.time() - tstart

        # Calculate the fps (frames per second)
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            # Calculates if the value function is a good predictor of the returns (ev > 1)
            # or if it's just worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model
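# --- Usage sketch (not part of the original source) ---------------------------
# A minimal, hedged example of how the a2c learn() above is usually driven with a
# vectorized environment, as described in its docstring. The environment id, seed,
# and timestep budget are illustrative assumptions, not values taken from this file.
if __name__ == '__main__':
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    # Wrap a single Gym environment so it exposes the VecEnv interface learn() expects.
    venv = DummyVecEnv([lambda: gym.make('CartPole-v1')])
    trained_model = learn(network='mlp', env=venv, seed=0, nsteps=5,
                          total_timesteps=int(1e5), log_interval=100)
    venv.close()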
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
          ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001,
          save_interval=None, lrschedule='linear', load_path=None, is_async=True, **network_kwargs):
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space

    if isinstance(network, str):
        network_type = network
        policy_network_fn = get_network_builder(network_type)(**network_kwargs)
        policy = policy_network_fn(ob_space.shape)

    model = Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps=nsteps,
                  ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef, lr=lr,
                  max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, lrschedule=lrschedule, is_async=is_async)

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    if is_async:
        # TODO: q_runner
        enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    else:
        enqueue_threads = []

    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        obs = tf.constant(obs)
        if states is not None:
            states = tf.constant(states)
        rewards = tf.constant(rewards)
        masks = tf.constant(masks)
        actions = tf.constant(actions)
        values = tf.constant(values)

        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    coord.request_stop()
    coord.join(enqueue_threads)
    return model
def learn(network, env, save_path, seed=None, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32,
          nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001,
          save_interval=None, lrschedule='linear', load_path=None, is_async=False, **network_kwargs):
    set_global_seeds(seed)

    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    model = Model(policy, nenvs, total_timesteps, nprocs=nprocs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                  vf_fisher_coef=vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                  lrschedule=lrschedule, is_async=is_async)

    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)

    # Calculate the batch_size (here nsteps is set to 1)
    nbatch = nenvs * nsteps
    print(nbatch)
    tstart = time.time()

    F = []
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, output = runner.run()
        fisher = model.compute_fisher(obs)

        # f = []
        # l = len(obs)
        # efficient = 0.6
        # weight = np.logspace(l, 1, l, base=efficient)
        #
        # for index in range(l):
        #     observation = obs[index]
        #     fisher = model.compute_fisher(observation)
        #     for i, j in enumerate(fisher):
        #         if index == 0:
        #             f.append(weight[index] * fisher[j])
        #         else:
        #             f[i] += weight[index] * fisher[j]
        # for i in range(len(f)):
        #     f[i] = f[i] / np.sum(weight)
        # model.old_obs = obs

        nseconds = time.time() - tstart
        print(update)

        # if update == 1:
        #     for x in f:
        #         F.append(x)
        # else:
        #     for x in range(len(f)):
        #         F[x] += f[x]

        if update == 1:
            for i in fisher:
                F.append(fisher[i])
        else:
            for i, j in enumerate(fisher):
                F[i] += fisher[j]

    for i in range(len(F)):
        F[i] /= total_timesteps

    joblib.dump(F, 'fisher_matrix/simple_agent_random_4000')
    return model
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
          ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001,
          save_interval=None, lrschedule='linear', load_path=None, is_async=True, **network_kwargs):
    set_global_seeds(seed)

    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps=nsteps,
                               ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, lrschedule=lrschedule,
                               is_async=is_async)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    if is_async:
        enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    else:
        enqueue_threads = []

    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    coord.request_stop()
    coord.join(enqueue_threads)
    return model
    nbatch = nenvs * nsteps

    # Start total timer
    tstart = time.time()

    last_rewards = []
    graph_names = ('policy_entropy', 'value_loss', 'policy_loss', 'values_mean', 'explained_variance',
                   'rewards_mean', 'rewards_min', 'rewards_max', 'rewards_median', 'rewards_std',
                   'values_mean', 'values_min', 'values_max', 'values_median', 'values_std')
    graph_data = {k: [] for k in graph_names}

    for update in range(1, total_timesteps // nbatch + 1):
        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, epinfos = runner.run()

        # invert
        inv_obs = env.invert_states(obs)
        obs = np.vstack((obs, inv_obs))
        rewards = np.hstack((rewards, rewards))
        masks = np.hstack((masks, masks))
        inv_actions = env.invert_actions(actions)
        actions = np.hstack((actions, inv_actions))
        values = np.hstack((values, values))

        epinfobuf.extend(epinfos)
        # policy_loss, value_loss, policy_entropy = model.train(inv_obs, states, rewards, masks, inv_actions, values)
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
def learn(network, env, seed=None, nsteps=5, noptions=64, top_n_options=8, replay_buffer_size=1000,
          total_timesteps=int(80e6), start_op_at=0.8, options_update_iter=10, vf_coef=0.5, ent_coef=0.01,
          max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, diverse_r_coef=0.1, alpha=0.99,
          gamma=0.99, log_interval=100, load_path=None, **network_kwargs):
    '''
    Main entrypoint for the VFO algorithm. Train a policy with a given network architecture on a given
    environment using the vfo algorithm.

    Parameters:
    -----------

    network:              policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small,
                          conv_only - see baselines.common/models.py for the full list) specifying a standard network
                          architecture, or a function that takes a tensorflow tensor as input and returns a tuple
                          (output_tensor, extra_feed) where output_tensor is the last network layer output,
                          extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing
                          how to feed state into the network for recurrent neural nets.
                          See baselines.common/policies.py/lstm for more details on using recurrent nets in policies.

    env:                  RL environment. Should implement an interface similar to VecEnv (baselines.common/vec_env)
                          or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py).

    seed:                 seed to make the random number sequence in the algorithm reproducible. Defaults to None,
                          which means the seed comes from the system noise generator (not reproducible).

    nsteps:               int, number of steps of the vectorized environment per update (i.e. the batch size is
                          nsteps * nenv where nenv is the number of environment copies simulated in parallel).

    noptions:             int, number of options for VFO, i.e. channels of the last Conv layer

    top_n_options:        int, number of top candidate options for the selective option step

    replay_buffer_size:   int, size of the replay buffer which is used to train options

    total_timesteps:      int, total number of timesteps to train on (default: 80M)

    start_op_at:          float, after training the mf policy for `start_op_at * total_timesteps` steps,
                          begin to train the options policy

    options_update_iter:  int, number of calls to train_options per sample

    vf_coef:              float, coefficient in front of the value function loss in the total loss function (default: 0.5)

    ent_coef:             float, coefficient in front of the policy entropy in the total loss function (default: 0.01)

    max_grad_norm:        float, the gradient is clipped to have a global L2 norm no more than this value (default: 0.5)

    lr:                   float, learning rate for RMSProp (the current implementation has RMSProp hardcoded in)
                          (default: 7e-4)

    lrschedule:           schedule of the learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1]
                          that takes the fraction of the training progress as input and returns the fraction of the
                          learning rate (specified as lr) as output.

    epsilon:              float, RMSProp epsilon (stabilizes the square root computation in the denominator of the
                          RMSProp update) (default: 1e-5)

    diverse_r_coef:       float, scaling factor for the diversity reward when training the option policy

    alpha:                float, RMSProp decay parameter (default: 0.99)

    gamma:                float, reward discounting parameter (default: 0.99)

    log_interval:         int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:     keyword arguments to the policy / network builder.
                          See baselines.vfo/policies.py/build_policy and the arguments to a particular type of network.
    '''
    set_global_seeds(seed)

    nenvs = env.num_envs
    policy = build_policy(env, network, noptions, **network_kwargs)

    assert replay_buffer_size > 100, 'Replay buffer is too small'
    replay_buffer = Buffer(env, nsteps, size=replay_buffer_size)

    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon,
                  diverse_r_coef=diverse_r_coef, gamma=gamma,
                  total_timesteps=total_timesteps, lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    options_runner = OptionsRunner(env, model, noptions, nsteps=nsteps, gamma=gamma,
                                   use_selective_option=True, top_n_options=top_n_options)

    nbatch = nenvs * nsteps
    tstart = time.time()
    to_train_options, init_replay_buffer_done = False, False
    total_updates = total_timesteps // nbatch + 1
    for update in range(1, total_updates):
        if update % 300 == 0:
            model.save(os.path.join(logger.get_dir(), "snapshot"))

        if not to_train_options:
            obs, states, rewards, masks, actions, values = runner.run()
            policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
            nseconds = time.time() - tstart
            fps = int((update * nbatch) / nseconds)

            if update % log_interval == 0 or update == 1:
                ev = explained_variance(values, rewards)
                logger.record_tabular("nupdates", update)
                logger.record_tabular("total_timesteps", update * nbatch)
                logger.record_tabular("fps", fps)
                logger.record_tabular("policy_entropy", float(policy_entropy))
                logger.record_tabular("value_loss", float(value_loss))
                logger.record_tabular("policy_loss", float(policy_loss))
                logger.record_tabular("explained_variance", float(ev))
                logger.dump_tabular()

            if update > total_updates * start_op_at:
                to_train_options = True
        else:
            obs, next_obs, states, next_states, masks, next_masks, actions, \
                actions_full, rewards, values, dones, options_z = options_runner.run()
            replay_buffer.put(obs, next_obs, states, next_states, masks, next_masks,
                              actions, actions_full, dones, options_z)
            options_runner.sample_option_z(prior=model.prior_op_z)

            if replay_buffer.num_in_buffer > 100:
                init_replay_buffer_done = True
            if not init_replay_buffer_done:
                logger.info('Sample data using option policy...')
                continue

            policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)

            for _ in range(options_update_iter):
                obs, next_obs, states, next_states, masks, next_masks, \
                    actions, actions_full, dones, options_z = replay_buffer.get()
                distillation_loss_value = model.distill_mf_to_options(obs, states, masks)
                record_loss_values = model.train_options(obs, next_obs, states, next_states, masks, next_masks,
                                                         actions, actions_full, dones, options_z)
                record_loss_values.append(('distillation_loss', distillation_loss_value))

            nseconds = time.time() - tstart
            fps = int((update * nbatch) / nseconds)

            if update % log_interval == 0 or update == 1:
                ev = explained_variance(values, rewards)
                logger.record_tabular("nupdates", update)
                logger.record_tabular("total_timesteps", update * nbatch)
                logger.record_tabular("fps", fps)
                logger.record_tabular("policy_entropy", float(policy_entropy))
                logger.record_tabular("value_loss", float(value_loss))
                logger.record_tabular("policy_loss", float(policy_loss))
                logger.record_tabular("explained_variance", float(ev))
                for loss_name, loss_value in record_loss_values:
                    logger.record_tabular(loss_name, loss_value)
                logger.dump_tabular()

    env.close()
    return model
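# --- Usage sketch (not part of the original source) ---------------------------
# A hedged example of how the VFO learn() above might be driven, following its
# docstring. The environment id and hyperparameters are illustrative assumptions;
# in practice the env would be an image-based VecEnv (e.g. with Atari wrappers)
# so that the conv-channel options are meaningful.
if __name__ == '__main__':
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    venv = DummyVecEnv([lambda: gym.make('PongNoFrameskip-v4')])
    model = learn(network='cnn', env=venv, seed=0, nsteps=5, noptions=64,
                  total_timesteps=int(1e6), start_op_at=0.8)
    venv.close()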
def learn(network, env, seed=None, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01,
          max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99,
          log_interval=100, load_path=None, **network_kwargs):
    '''
    Main entrypoint for the A2C algorithm. Train a policy with a given network architecture on a given
    environment using the a2c algorithm.

    Parameters:
    -----------

    network:            policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small,
                        conv_only - see baselines.common/models.py for the full list) specifying a standard network
                        architecture, or a function that takes a tensorflow tensor as input and returns a tuple
                        (output_tensor, extra_feed) where output_tensor is the last network layer output,
                        extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing
                        how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies.

    env:                RL environment. Should implement an interface similar to VecEnv (baselines.common/vec_env)
                        or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py).

    seed:               seed to make the random number sequence in the algorithm reproducible. Defaults to None,
                        which means the seed comes from the system noise generator (not reproducible).

    nsteps:             int, number of steps of the vectorized environment per update (i.e. the batch size is
                        nsteps * nenv where nenv is the number of environment copies simulated in parallel).

    total_timesteps:    int, total number of timesteps to train on (default: 80M)

    vf_coef:            float, coefficient in front of the value function loss in the total loss function (default: 0.5)

    ent_coef:           float, coefficient in front of the policy entropy in the total loss function (default: 0.01)

    max_grad_norm:      float, the gradient is clipped to have a global L2 norm no more than this value (default: 0.5)

    lr:                 float, learning rate for RMSProp (the current implementation has RMSProp hardcoded in) (default: 7e-4)

    lrschedule:         schedule of the learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1]
                        that takes the fraction of the training progress as input and returns the fraction of the
                        learning rate (specified as lr) as output.

    epsilon:            float, RMSProp epsilon (stabilizes the square root computation in the denominator of the
                        RMSProp update) (default: 1e-5)

    alpha:              float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting parameter (default: 0.99)

    log_interval:       int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
                        and the arguments to a particular type of network. For instance, the 'mlp' network architecture
                        has arguments num_hidden and num_layers.
    '''
    set_global_seeds(seed)

    # Get the nb of env
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch_size
    nbatch = nenvs * nsteps

    # Start total timer
    tstart = time.time()

    for update in range(1, total_timesteps // nbatch + 1):
        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        nseconds = time.time() - tstart

        # Calculate the fps (frames per second)
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            # Calculates if the value function is a good predictor of the returns (ev > 1)
            # or if it's just worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=100, nprocs=32, nsteps=20,
          ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001,
          eval_env=None, save_interval=None, lrschedule='linear', load_path=None, is_async=True, augment=False,
          **network_kwargs):
    set_global_seeds(seed)

    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps=nsteps,
                               ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, lrschedule=lrschedule,
                               is_async=is_async)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma, augment=augment)
    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma)
        eval_epinfobuf = deque(maxlen=100)

    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    if is_async:
        enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    else:
        enqueue_threads = []

    best_rew = float('-inf')
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_obs, eval_states, eval_returns, eval_masks, eval_actions, eval_values, eval_epinfos = eval_runner.run()  # pylint: disable=E0632
            eval_epinfobuf.extend(eval_epinfos)

        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.record_tabular('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.record_tabular('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.dump_tabular()

        if safemean([epinfo['r'] for epinfo in epinfobuf]) > best_rew and logger.get_dir():
            best_rew = safemean([epinfo['r'] for epinfo in epinfobuf])
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, 'best.ckpt')
            print(f"Best model w/ rew {best_rew}. Saving to", savepath)
            model.save(savepath)

    coord.request_stop()
    coord.join(enqueue_threads)
    return model
def learn(network, env, seed, env_id=None, total_timesteps=int(40e6), gamma=0.99, log_interval=100, nprocs=32,
          nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001,
          save_interval=None, save_path=None, lrschedule='linear', load_path=None, is_async=True, **network_kwargs):
    info_env = gym.make(env_id)
    algo = 'acktr'
    # wandb.init(project="floorplan_generator", name=algo)
    # wandb.config.algo = algo
    # # wandb.config.action_space = info_env.action_type
    # wandb.config.step_size = info_env.step_size
    # wandb.config.active_rewards = info_env.active_rewards
    # print("\n \n \n \n \n HI21 \n \n \n \n \n")

    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)
    # print("\n \n \n \n \n HI22 \n \n \n \n \n")

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps=nsteps,
                               ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, lrschedule=lrschedule,
                               is_async=is_async)

    # if save_interval and logger.get_dir():
    #     import cloudpickle
    #     print(osp.join(logger.get_dir(), 'make_model.pkl'))
    #     with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb+') as fh:
    #         print(make_model)
    #         fh.write(cloudpickle.dumps(make_model))

    model = make_model()
    # print("\n \n \n \n \n HI23 \n \n \n \n \n")

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    if is_async:
        enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    else:
        enqueue_threads = []
    # print("\n \n \n \n \n HI24 \n \n \n \n \n")

    for update in range(1, total_timesteps // nbatch + 1):
        # print("step1")
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        # print("step2")
        epinfobuf.extend(epinfos)
        # print("step3")
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        # print("step4")
        model.old_obs = obs
        # print("step5")
        nseconds = time.time() - tstart
        # print("step6")
        fps = int((update * nbatch) / nseconds)

        if update % log_interval == 0 or update == 1:
            # images = env.get_images()
            # image = images[0]
            # writer.add_image('imresult', image, update, dataformats='HWC')
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
            # wandb.log({'eprewmean': safemean([epinfo['r'] for epinfo in epinfobuf]),
            #            'eplenmean': safemean([epinfo['l'] for epinfo in epinfobuf])})

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            savepath = save_path
            print('Saving to', savepath)
            model.save(savepath)

    coord.request_stop()
    coord.join(enqueue_threads)
    return model