def simple_actor_critic(hidden_sizes=(32, 32), activation='relu', activation_output=None,
                        kernel_initalizer='glorot_uniform', name='simple_actor_critic', env_info=EnvInfo):
    # Separate MLPs for the actor (policy logits) and the critic (state value)
    _actor = mlp(hidden_sizes=hidden_sizes, output_size=env_info.act_size, activation=activation,
                 activation_output=activation_output, name=name, kernel_initalizer=kernel_initalizer)
    _critic = mlp(hidden_sizes=hidden_sizes, output_size=1, activation=activation,
                  activation_output=activation_output, name=name, kernel_initalizer=kernel_initalizer)

    log('Model Summary: ' + name)
    _actor.build(input_shape=(None, ) + env_info.shapes['vec'])
    _actor.summary()
    _critic.build(input_shape=(None, ) + env_info.shapes['vec'])
    _critic.summary()

    def forward(inp=None):
        # Vector observations are fed to both heads
        logits = _actor(inp['vec_obs'])
        values = _critic(inp['vec_obs'])
        return logits, values

    return {"forward": forward, "trainable_networks": [_actor, _critic]}
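# The factories here rely on an `mlp` helper that is not shown in this section. A minimal
# sketch of such a builder, assuming it simply stacks Dense layers into a tf.keras.Sequential
# with the signature used above (this is an assumption, not the project's actual implementation):
import tensorflow as tf

def mlp(hidden_sizes=(32, 32), output_size=1, activation='relu', activation_output=None,
        name='mlp', kernel_initalizer='glorot_uniform'):
    model = tf.keras.Sequential(name=name)
    for units in hidden_sizes:
        model.add(tf.keras.layers.Dense(units, activation=activation,
                                        kernel_initializer=kernel_initalizer))
    # Output head: act_size logits for the actor, a single value for the critic
    model.add(tf.keras.layers.Dense(output_size, activation=activation_output,
                                    kernel_initializer=kernel_initalizer))
    return model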
def vis_vec_actor_critic(hidden_sizes=(32, 32), activation='relu', activation_output=None,
                         kernel_initalizer='glorot_uniform', name='vis_vec_actor_critic', env_info=EnvInfo):
    # Shared CNN encoder for visual observations plus separate actor/critic MLP heads
    cnn, out_units = cnn_simple()
    _mlp_actor = mlp(hidden_sizes=hidden_sizes, output_size=env_info.act_size, activation=activation,
                     activation_output=activation_output, name=name, kernel_initalizer=kernel_initalizer)
    _mlp_critic = mlp(hidden_sizes=hidden_sizes, output_size=1, activation=activation,
                      activation_output=activation_output, name=name, kernel_initalizer=kernel_initalizer)

    log('Model Summary: ' + name)
    cnn.build(input_shape=(None, ) + env_info.shapes['vis'])
    cnn.summary()
    _mlp_actor.build(input_shape=(None, env_info.shapes['vec'][0] + out_units))
    _mlp_actor.summary()
    _mlp_critic.build(input_shape=(None, env_info.shapes['vec'][0] + out_units))
    _mlp_critic.summary()

    def forward(inp=None):
        out_cnn = cnn(inp['vis_obs'])  # Convolutional network for visual observations
        # out_vec_mlp = _mlp_vec(inp['vec_obs'])  # Optionally embed vec_obs through an MLP if it has too many features
        mixed = tf.concat([out_cnn, inp['vec_obs']], -1)  # Concatenate CNN output with vec_obs (or out_vec_mlp)
        # out_mixer_mlp = _mlp_mixer(mixed)  # Optional state-mixer MLP if needed
        logits = _mlp_actor(mixed)  # Feed the raw mixed features (or out_mixer_mlp)
        values = _mlp_critic(mixed)
        return logits, values

    return {"forward": forward, "trainable_networks": [cnn, _mlp_actor, _mlp_critic]}
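# `cnn_simple()` is referenced above but not defined in this section. It is assumed to return
# a small convolutional encoder together with the size of its flattened output (`out_units`),
# which is why the MLP heads are built on vec_dim + out_units features. A minimal sketch under
# that assumption (layer sizes are illustrative, not the project's actual architecture):
import tensorflow as tf

def cnn_simple(out_units=256):
    model = tf.keras.Sequential(name='cnn_simple')
    model.add(tf.keras.layers.Conv2D(16, 8, strides=4, activation='relu'))
    model.add(tf.keras.layers.Conv2D(32, 4, strides=2, activation='relu'))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(out_units, activation='relu'))
    return model, out_units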
def cnn_simple_actor_critic(hidden_sizes=(32, 32), activation='relu', activation_output=None,
                            kernel_initalizer='glorot_uniform', name='cnn_simple_actor_critic', env_info=EnvInfo):
    # One CNN encoder shared by both heads, wrapped into two Sequential models
    cnn, _ = cnn_simple()
    _actor = tf.keras.Sequential(name='actor')
    _critic = tf.keras.Sequential(name='critic')
    _actor.add(cnn)
    _critic.add(cnn)
    _mlp_actor = mlp(hidden_sizes=hidden_sizes, output_size=env_info.act_size, activation=activation,
                     activation_output=activation_output, name=name, kernel_initalizer=kernel_initalizer)
    _actor.add(_mlp_actor)
    _mlp_critic = mlp(hidden_sizes=hidden_sizes, output_size=1, activation=activation,
                      activation_output=activation_output, name=name, kernel_initalizer=kernel_initalizer)
    _critic.add(_mlp_critic)

    log('Model Summary: ' + name)
    _actor.build(input_shape=(None, ) + env_info.shapes['vis'])
    _actor.summary()
    _critic.build(input_shape=(None, ) + env_info.shapes['vis'])
    _critic.summary()

    def forward(inp=None):
        logits = _actor(inp['vis_obs'])
        values = _critic(inp['vis_obs'])
        return logits, values

    return {"forward": forward, "trainable_networks": [_actor, _critic]}
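# Every factory returns the same interface: a `forward` callable plus the list of trainable
# networks. A minimal usage sketch (the observation shapes and the `env_info` object below
# are illustrative assumptions):
import tensorflow as tf

net = cnn_simple_actor_critic(hidden_sizes=(64, 64), env_info=env_info)
batch = {'vis_obs': tf.zeros((8, 64, 64, 3))}     # batch of 8 visual observations
logits, values = net['forward'](batch)            # logits: (8, act_size), values: (8, 1)

# Variables handed to the optimizer come from 'trainable_networks'
train_vars = [v for m in net['trainable_networks'] for v in m.trainable_variables]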
def update(self, rollouts):
    """
    Update the policy and the value network.

    Inputs:  obs, act, advantages, returns, logp-t
    Returns: loss-pi, loss-entropy, approx-ent, kl, loss-v, loss-total
    """
    inds = np.arange(self.nbatch)
    for i in range(self.train_iters):
        losses = self._inner_update_loop(rollouts['vec_obses'], rollouts['vis_obses'],
                                         rollouts['actions'], rollouts['advs'],
                                         rollouts['returns'], rollouts['logp'], inds)
        if losses['approx_kl'] > 1.5 * self.target_kl:
            log("Early stopping at step %d due to reaching max kl." % i)
            break
    return losses  # Return metrics from the last update iteration
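# `_inner_update_loop` is not shown here. It presumably shuffles `inds` into minibatches and
# minimizes PPO's combined objective; the sketch below illustrates that objective and the
# `approx_kl` estimate used for the early-stopping check above (coefficient values and names
# are assumptions, not the project's actual code):
import tensorflow as tf

def ppo_combined_loss(logp_new, logp_old, advs, values, returns, entropy,
                      clip_ratio=0.2, vf_coef=0.5, ent_coef=0.01):
    ratio = tf.exp(logp_new - logp_old)                        # pi_new(a|s) / pi_old(a|s)
    clipped = tf.clip_by_value(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * advs, clipped * advs))  # clipped surrogate
    v_loss = tf.reduce_mean(tf.square(returns - values))       # value regression loss
    entropy_loss = -tf.reduce_mean(entropy)                    # encourages exploration
    approx_kl = tf.reduce_mean(logp_old - logp_new)            # drives the early-stopping check
    total_loss = pi_loss + vf_coef * v_loss + ent_coef * entropy_loss
    return {'pi_loss': pi_loss, 'v_loss': v_loss, 'entropy_loss': entropy_loss,
            'approx_kl': approx_kl, 'total_loss': total_loss}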
tf.random.set_seed(params.env.seed)   # Set random seeds for np and tf
np.random.seed(params.env.seed)

env = BitcoinTradingEnv(data)                           # Create the trading environment
LOGGER = Logger('academy_name', os.getcwd(), config)    # Set up the logger

# network = network_builder(params.trainer.nn_architecure) \
#     (hidden_sizes=params.policy.hidden_sizes, env_info=env.env_info)  # Build neural network with forward pass
network = simple_actor_critic(env_info=env.env_info, hidden_sizes=(128, 128))  # Our own simple_actor_critic that fits the environment

model = CategoricalModel_2(network=network, env_info=env.env_info)  # Build the model for discrete action spaces

if params.trainer.load_model:
    log('Loading Model ...')
    model.load_weights(LOGGER.tf_model_path('model_weights'))  # Load model if the load_model flag is set

roller = Roller(env, model, params.trainer.steps_per_epoch,
                params.trainer.gamma, params.trainer.lam)   # Roller generates rollouts for training
ppo = PolicyCombinedLoss(model=model, num_envs=1)           # PPO policy with combined loss

for epoch in range(params.trainer.epochs):                  # Main training loop over n epochs
    rollouts, infos = roller.rollout()                      # Collect a rollout and episode infos
    outs = ppo.update(rollouts)                             # Push the rollout into PPO and update the policy
    LOGGER.store('Loss Pi', outs['pi_loss'])
    LOGGER.store('Loss V', outs['v_loss'])
    LOGGER.store('Loss Ent', outs['entropy_loss'])
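# For reference, `ppo.update` consumes the rollout dictionary with the keys used in the
# update method above. A sketch of what `roller.rollout()` presumably returns; all shapes
# and sizes are illustrative assumptions, not the actual Roller implementation:
import numpy as np

steps, vec_dim = 2048, 10                                          # illustrative sizes only
rollouts = {
    'vec_obses': np.zeros((steps, vec_dim), dtype=np.float32),     # vector observations
    'vis_obses': np.zeros((steps, 64, 64, 3), dtype=np.float32),   # visual observations (unused by simple_actor_critic)
    'actions':   np.zeros((steps,), dtype=np.int32),               # sampled discrete actions
    'advs':      np.zeros((steps,), dtype=np.float32),             # GAE(lambda) advantages (from gamma and lam)
    'returns':   np.zeros((steps,), dtype=np.float32),             # discounted returns / value targets
    'logp':      np.zeros((steps,), dtype=np.float32),             # log-probs of the actions under the old policy
}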