def main(args):
    # Create directories
    if not os.path.exists("./logs"):
        os.makedirs("./logs")

    # Set logs
    log = set_log(args)

    # Create env
    env = make_env(log, args)

    # Set seeds
    random.seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Visualize environment
    observations = env.reset()
    for _ in range(args.ep_max_timesteps):
        env.render()

        prey_action = env.action_space.sample()
        predator1_action = env.action_space.sample()
        predator2_action = env.action_space.sample()
        actions = [prey_action, predator1_action, predator2_action]

        observations, reward, done, _ = env.step(actions)

        if done:
            break
def main():
    parser = argparse.ArgumentParser()
    set_args(parser)
    args = parser.parse_args()

    specify_path(args)
    specify_device(args)
    specify_seed(args)

    print_args = {k: v for k, v in vars(args).items() if k != 'device'}
    print_args = argparse.Namespace(**print_args)
    logger.info('CONFIG:\n%s' % json.dumps(vars(print_args), indent=4, sort_keys=True))

    if not args.no_log:
        set_log(args.log_dir)

    tokenizer = RobertaTokenizer.from_pretrained(args.pretrain_bert_dir)
    model = RobertaForPreTraining.from_pretrained(
        args.pretrain_bert_dir,
        pos_tag_embedding=is_pos_embedding,
        senti_embedding=is_senti_embedding,
        polarity_embedding=is_polarity_embedding)
    if args.fp16:
        model.half()
    model.to(args.device)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Load pre-training data
    logging.info('Loading yelp pretraining data...')
    yelp = Yelp(args, tokenizer, max_seq_length=args.max_seq_length)
    sampler = RandomSampler(yelp)
    loader = DataLoader(yelp,
                        sampler=sampler,
                        batch_size=args.batch_size,
                        num_workers=WORKERS,
                        pin_memory=args.cuda)

    optimization_step = int(
        len(yelp) / args.batch_size / args.grad_accum_steps) * args.epochs
    optimizer, scheduler = init_optimizer(args, model, optimization_step)

    train(args, model, tokenizer, loader, optimizer, scheduler)
def run_polopt_agent(env_fn,
                     agent=PPOAgent(),
                     actor_critic=mlp_actor_critic,
                     ac_kwargs=dict(),
                     seed=0,
                     render=False,
                     # Experience collection:
                     steps_per_epoch=4000,
                     epochs=50,
                     max_ep_len=1000,
                     # Discount factors:
                     gamma=0.99,
                     lam=0.97,
                     cost_gamma=0.99,
                     cost_lam=0.97,
                     # Policy learning:
                     pi_lr=3e-4,
                     ent_reg=0.,
                     # Cost constraints / penalties:
                     cost_lim=25,
                     penalty_init=1.,
                     penalty_lr=5e-2,
                     # KL divergence:
                     target_kl=0.01,
                     # Value learning:
                     vf_lr=1e-3,
                     vf_iters=80,
                     # Logging:
                     logger=None,
                     logger_kwargs=dict(),
                     save_freq=1,
                     prefix="",
                     custom_log=None,
                     args=None):

    tb_writer = SummaryWriter('./log/tb_{}'.format(args.log_name))

    #=========================================================================#
    #  Prepare logger, seed, and environment in this process                  #
    #=========================================================================#

    logger = EpochLogger(**logger_kwargs) if logger is None else logger
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()

    agent.set_logger(logger)

    #=========================================================================#
    #  Create computation graph for actor and critic (not training routine)   #
    #=========================================================================#

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph from environment spaces
    x_ph, a_ph = placeholders_from_spaces(env.observation_space,
                                          env.action_space)

    # Inputs to computation graph for batch data
    adv_ph, cadv_ph, ret_ph, cret_ph, logp_old_ph = placeholders(
        *(None for _ in range(5)))

    # Inputs to computation graph for special purposes
    surr_cost_rescale_ph = tf.placeholder(tf.float32, shape=())
    cur_cost_ph = tf.placeholder(tf.float32, shape=())

    # Outputs from actor critic
    ac_outs = actor_critic(x_ph, a_ph, **ac_kwargs)
    pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent, v, vc = ac_outs

    # Organize placeholders for zipping with data from buffer on updates
    buf_phs = [x_ph, a_ph, adv_ph, cadv_ph, ret_ph, cret_ph, logp_old_ph]
    buf_phs += values_as_sorted_list(pi_info_phs)

    # Organize symbols we have to compute at each step of acting in env
    get_action_ops = dict(pi=pi, v=v, logp_pi=logp_pi, pi_info=pi_info)

    # If agent is reward penalized, it doesn't use a separate value function
    # for costs and we don't need to include it in get_action_ops; otherwise we do.
    if not (agent.reward_penalized):
        get_action_ops['vc'] = vc

    # Count variables
    var_counts = tuple(count_vars(scope) for scope in ['pi', 'vf', 'vc'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d, \t vc: %d\n' %
               var_counts)

    # Make a sample estimate for entropy to use as sanity check
    approx_ent = tf.reduce_mean(-logp)

    #=========================================================================#
    #  Create replay buffer                                                   #
    #=========================================================================#

    # Obs/act shapes
    obs_shape = env.observation_space.shape
    act_shape = env.action_space.shape

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    pi_info_shapes = {k: v.shape.as_list()[1:] for k, v in pi_info_phs.items()}
    buf = CPOBuffer(local_steps_per_epoch, obs_shape, act_shape,
                    pi_info_shapes, gamma, lam, cost_gamma, cost_lam)

    #=========================================================================#
    #  Create computation graph for penalty learning, if applicable           #
    #=========================================================================#

    if agent.use_penalty:
        with tf.variable_scope('penalty'):
            # param_init = np.log(penalty_init)
            param_init = np.log(max(np.exp(penalty_init) - 1, 1e-8))
            penalty_param = tf.get_variable('penalty_param',
                                            initializer=float(param_init),
                                            trainable=agent.learn_penalty,
                                            dtype=tf.float32)
        # penalty = tf.exp(penalty_param)
        penalty = tf.nn.softplus(penalty_param)

    if agent.learn_penalty:
        if agent.penalty_param_loss:
            penalty_loss = -penalty_param * (cur_cost_ph - cost_lim)
        else:
            penalty_loss = -penalty * (cur_cost_ph - cost_lim)
        train_penalty = MpiAdamOptimizer(
            learning_rate=penalty_lr).minimize(penalty_loss)

    #=========================================================================#
    #  Create computation graph for policy learning                           #
    #=========================================================================#

    # Likelihood ratio
    ratio = tf.exp(logp - logp_old_ph)

    # Surrogate advantage / clipped surrogate advantage
    if agent.clipped_adv:
        min_adv = tf.where(adv_ph > 0,
                           (1 + agent.clip_ratio) * adv_ph,
                           (1 - agent.clip_ratio) * adv_ph)
        surr_adv = tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    else:
        surr_adv = tf.reduce_mean(ratio * adv_ph)

    # Surrogate cost
    surr_cost = tf.reduce_mean(ratio * cadv_ph)

    # Create policy objective function, including entropy regularization
    pi_objective = surr_adv + ent_reg * ent

    # Possibly include surr_cost in pi_objective
    if agent.objective_penalized:
        pi_objective -= penalty * surr_cost
        pi_objective /= (1 + penalty)

    # Loss function for pi is negative of pi_objective
    pi_loss = -pi_objective

    # Optimizer-specific symbols
    if agent.trust_region:

        # Symbols needed for CG solver for any trust region method
        pi_params = get_vars('pi')
        flat_g = tro.flat_grad(pi_loss, pi_params)
        v_ph, hvp = tro.hessian_vector_product(d_kl, pi_params)
        if agent.damping_coeff > 0:
            hvp += agent.damping_coeff * v_ph

        # Symbols needed for CG solver for CPO only
        flat_b = tro.flat_grad(surr_cost, pi_params)

        # Symbols for getting and setting params
        get_pi_params = tro.flat_concat(pi_params)
        set_pi_params = tro.assign_params_from_flat(v_ph, pi_params)

        training_package = dict(flat_g=flat_g,
                                flat_b=flat_b,
                                v_ph=v_ph,
                                hvp=hvp,
                                get_pi_params=get_pi_params,
                                set_pi_params=set_pi_params)

    elif agent.first_order:

        # Optimizer for first-order policy optimization
        train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)

        # Prepare training package for agent
        training_package = dict(train_pi=train_pi)

    else:
        raise NotImplementedError

    # Provide training package to agent
    training_package.update(
        dict(pi_loss=pi_loss,
             surr_cost=surr_cost,
             d_kl=d_kl,
             target_kl=target_kl,
             cost_lim=cost_lim))
    agent.prepare_update(training_package)

    #=========================================================================#
    #  Create computation graph for value learning                            #
    #=========================================================================#

    # Value losses
    v_loss = tf.reduce_mean((ret_ph - v)**2)
    vc_loss = tf.reduce_mean((cret_ph - vc)**2)

    # If agent uses penalty directly in reward function, don't train a separate
    # value function for predicting cost returns. (Only use one vf for r - p*c.)
    if agent.reward_penalized:
        total_value_loss = v_loss
    else:
        total_value_loss = v_loss + vc_loss

    # Optimizer for value learning
    train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(total_value_loss)

    #=========================================================================#
    #  Create session, sync across procs, and set up saver                    #
    #=========================================================================#

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={'x': x_ph},
                          outputs={'pi': pi, 'v': v, 'vc': vc})

    #=========================================================================#
    #  Provide session to agent                                               #
    #=========================================================================#

    agent.prepare_session(sess)

    #=========================================================================#
    #  Create function for running update (called at end of each epoch)       #
    #=========================================================================#

    def update():
        cur_cost = logger.get_stats('EpCost')[0]
        c = cur_cost - cost_lim
        if c > 0 and agent.cares_about_cost:
            logger.log('Warning! Safety constraint is already violated.', 'red')

        #=====================================================================#
        #  Prepare feed dict                                                  #
        #=====================================================================#

        inputs = {k: v for k, v in zip(buf_phs, buf.get())}
        inputs[surr_cost_rescale_ph] = logger.get_stats('EpLen')[0]
        inputs[cur_cost_ph] = cur_cost

        #=====================================================================#
        #  Make some measurements before updating                             #
        #=====================================================================#

        measures = dict(LossPi=pi_loss,
                        SurrCost=surr_cost,
                        LossV=v_loss,
                        Entropy=ent)
        if not (agent.reward_penalized):
            measures['LossVC'] = vc_loss
        if agent.use_penalty:
            measures['Penalty'] = penalty

        pre_update_measures = sess.run(measures, feed_dict=inputs)
        logger.store(**pre_update_measures)

        #=====================================================================#
        #  Update penalty if learning penalty                                 #
        #=====================================================================#
        if agent.learn_penalty:
            sess.run(train_penalty, feed_dict={cur_cost_ph: cur_cost})

        #=====================================================================#
        #  Update policy                                                      #
        #=====================================================================#
        agent.update_pi(inputs)

        #=====================================================================#
        #  Update value function                                              #
        #=====================================================================#
        for _ in range(vf_iters):
            sess.run(train_vf, feed_dict=inputs)

        #=====================================================================#
        #  Make some measurements after updating                              #
        #=====================================================================#

        del measures['Entropy']
        measures['KL'] = d_kl

        post_update_measures = sess.run(measures, feed_dict=inputs)
        deltas = dict()
        for k in post_update_measures:
            if k in pre_update_measures:
                deltas['Delta' + k] = (post_update_measures[k] -
                                       pre_update_measures[k])
        logger.store(KL=post_update_measures['KL'], **deltas)

    #=========================================================================#
    #  Run main environment interaction loop                                  #
    #=========================================================================#

    start_time = time.time()
    o, r, d, c, ep_ret, ep_cost, ep_len = env.reset(), 0, False, 0, 0, 0, 0
    cur_penalty = 0
    cum_cost = 0
    counter = 0

    from utils import set_log
    log = set_log(args)

    for epoch in range(epochs):

        if agent.use_penalty:
            cur_penalty = sess.run(penalty)

        for t in range(local_steps_per_epoch):

            # Possibly render
            if render and proc_id() == 0 and t < 1000:
                env.render()

            # Get outputs from policy
            obs = o
            if len(o.shape) == 1:
                obs = np.expand_dims(o, 0)
            get_action_outs = sess.run(get_action_ops, feed_dict={x_ph: obs})
            a = get_action_outs['pi']
            v_t = get_action_outs['v']
            vc_t = get_action_outs.get('vc', 0)  # Agent may not use cost value func
            logp_t = get_action_outs['logp_pi']
            pi_info_t = get_action_outs['pi_info']

            # Step in environment
            o2, r, d, info = env.step(a)

            # Include penalty on cost
            c = info.get('cost', 0)

            # Track cumulative cost over training
            cum_cost += c

            # save and log
            if agent.reward_penalized:
                r_total = r - cur_penalty * c
                r_total = r_total / (1 + cur_penalty)
                buf.store(o, a, r_total, v_t, 0, 0, logp_t, pi_info_t)
            else:
                buf.store(o, a, r, v_t, c, vc_t, logp_t, pi_info_t)
            logger.store(VVals=v_t, CostVVals=vc_t)

            o = o2
            ep_ret += r * pow(0.99, ep_len)
            ep_cost += c * pow(0.99, ep_len)
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):

                # If trajectory didn't reach terminal state, bootstrap value target(s)
                if d and not (ep_len == max_ep_len):
                    # Note: we do not count env time out as true terminal state
                    last_val, last_cval = 0, 0
                else:
                    feed_dict = {x_ph: o[np.newaxis]}
                    if agent.reward_penalized:
                        last_val = sess.run(v, feed_dict=feed_dict)
                        last_cval = 0
                    else:
                        last_val, last_cval = sess.run([v, vc],
                                                       feed_dict=feed_dict)
                buf.finish_path(last_val, last_cval)

                # Only save EpRet / EpLen if trajectory finished
                if terminal:
                    logger.store(EpRet=ep_ret, EpLen=ep_len, EpCost=ep_cost)

                    tb_writer.add_scalar("cost", ep_cost - cost_lim, counter)
                    tb_writer.add_scalar("return", ep_ret, counter)
                    log[args.log_name].info("At iteration {}, cost: {}".format(
                        counter, ep_cost - cost_lim))
                    log[args.log_name].info(
                        "At iteration {}, return: {}".format(counter, ep_ret))
                    counter += 1

                    if counter > 200000:
                        import sys
                        sys.exit()
                else:
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)

                # Reset environment
                o, r, d, c, ep_ret, ep_len, ep_cost = (env.reset(), 0, False,
                                                       0, 0, 0, 0)

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        #=====================================================================#
        #  Run RL update                                                      #
        #=====================================================================#
        update()

        #=====================================================================#
        #  Cumulative cost calculations                                       #
        #=====================================================================#
        cumulative_cost = mpi_sum(cum_cost)
        cost_rate = cumulative_cost / ((epoch + 1) * steps_per_epoch)

        #=====================================================================#
        #  Log performance and stats                                          #
        #=====================================================================#

        logger.log_tabular('Epoch', epoch)

        # Performance stats
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpCost', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('CumulativeCost', cumulative_cost)
        logger.log_tabular('CostRate', cost_rate)

        # Value function values
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('CostVVals', with_min_and_max=True)

        # Pi loss and change
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)

        # Surr cost and change
        logger.log_tabular('SurrCost', average_only=True)
        logger.log_tabular('DeltaSurrCost', average_only=True)

        # V loss and change
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)

        # Vc loss and change, if applicable (reward_penalized agents don't use vc)
        if not (agent.reward_penalized):
            logger.log_tabular('LossVC', average_only=True)
            logger.log_tabular('DeltaLossVC', average_only=True)

        if agent.use_penalty or agent.save_penalty:
            logger.log_tabular('Penalty', average_only=True)
            logger.log_tabular('DeltaPenalty', average_only=True)
        else:
            logger.log_tabular('Penalty', 0)
            logger.log_tabular('DeltaPenalty', 0)

        # Anything from the agent?
        agent.log()

        # Policy stats
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)

        # Time and steps elapsed
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('Time', time.time() - start_time)

        # Show results!
        logger.dump_tabular()
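# A minimal, hypothetical invocation sketch (not part of the original source).
# It assumes a Gym-style environment whose step() info dict carries a 'cost'
# key, the PPOAgent / mlp_actor_critic pair referenced above, and an `args`
# namespace providing `log_name` (used by the SummaryWriter and set_log calls).
# The environment id below is only an example placeholder.
if __name__ == '__main__':
    import argparse
    import gym

    example_parser = argparse.ArgumentParser()
    example_parser.add_argument('--log_name', type=str, default='polopt_example')
    example_parser.add_argument('--env_name', type=str,
                                default='Safexp-PointGoal1-v0')  # assumed example id
    example_args = example_parser.parse_args()

    run_polopt_agent(env_fn=lambda: gym.make(example_args.env_name),
                     agent=PPOAgent(),
                     actor_critic=mlp_actor_critic,
                     steps_per_epoch=4000,
                     epochs=50,
                     cost_lim=25,
                     args=example_args)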
                receiver_elem.attrib.pop('{%s}permission' % MANIFEST_NAMESPACE)

            # Service
            for service_elem in manifest_tree.iter(tag='service'):
                service_elem.set('{%s}exported' % MANIFEST_NAMESPACE, 'true')
                if service_elem.attrib.has_key('{%s}permission' % MANIFEST_NAMESPACE):
                    service_elem.attrib.pop('{%s}permission' % MANIFEST_NAMESPACE)

            manifest_tree.write(manifest_path)
        except Exception, ex:
            self.has_error = True
            self.log.exception(ex)


if __name__ == '__main__':
    # Set log
    utils.set_log()

    # Parse args
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', action='store', dest='launcher_type')
    parser.add_argument('-p', action='store', dest='emu_port')
    parser.add_argument('-n', action='store', dest='emu_name')
    # args = parser.parse_args(['-t', '1', '-p', '5554', '-n', 'Android'])
    args = parser.parse_args()

    try:
        launcher_type = int(args.launcher_type)
        emu_port = args.emu_port
        emu_name = args.emu_name
def ppo(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Proximal Policy Optimization (by clipping),
    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v``
            module. The ``step`` method should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing
                                           | the log probability, according to
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical:
                                           | make sure to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`.

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    # Set up tensorboard logging
    tb_writer = SummaryWriter("tb_ppo_seed::" + str(seed))

    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_cost, ep_len, ep_count = env.reset(), 0, 0, 0, 0

    log_name = "ppo_seed::" + str(seed) + "_log"
    from utils import set_log
    log = set_log(log_name)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        logging_ret, logging_cost = [], []
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, info = env.step(a)
            ep_ret += r * pow(0.99, ep_len)
            ep_cost += info["cost"] * pow(0.99, ep_len)
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len, flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logging_ret.append(ep_ret)
                    logging_cost.append(ep_cost)
                    logger.store(EpRet=ep_ret, EpLen=ep_len)

                    log[log_name].info("At iteration {}, cost: {}".format(
                        ep_count, ep_cost))
                    log[log_name].info("At iteration {}, return: {}".format(
                        ep_count, ep_ret))
                    tb_writer.add_scalar("objective", ep_ret, ep_count)
                    tb_writer.add_scalar("cost", ep_cost, ep_count)
                    ep_count += 1

                    if ep_count >= 200000:
                        import sys
                        sys.exit()

                o, ep_ret, ep_cost, ep_len = env.reset(), 0, 0, 0

        logging_ret = sum(logging_ret) / len(logging_ret)
        logging_cost = sum(logging_cost) / len(logging_cost)
        print(epoch, logging_ret, logging_cost)

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
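# A minimal, hypothetical usage sketch (not part of the original source).
# It assumes a Gym-style environment whose step() returns an info dict with a
# 'cost' key (the training loop above reads info["cost"]); the environment id
# below is only an assumed example, and `core.MLPActorCritic` is the
# actor-critic constructor already imported by this file.
if __name__ == '__main__':
    import gym

    example_env_name = 'Safexp-PointGoal1-v0'  # assumed example id

    ppo(env_fn=lambda: gym.make(example_env_name),
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(hidden_sizes=(64, 64)),
        seed=0,
        steps_per_epoch=4000,
        epochs=50)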
parser.add_argument("func", choices=func_map.keys(), help="functions. train/val/test") parser.add_argument("-f", "--config", type=str, required=True, help="configure file") parser.add_argument("-w", "--weight", type=str, default=None, help="weight for initialization or validate") parser.add_argument("-r", "--result", type=str, default=None, help="results for shows hard case") parser.add_argument("-l", "--outlog", type=str, help="output log file") args = parser.parse_args() return args if __name__ == "__main__": args = get_command() utils.set_log(args.outlog) logging.info(args) cfgs = get_config(args) logging.info("Start %s the network" % args.func) func_map[args.func](cfgs)
parser.add_argument('-m', type=str, help="pretrained model", default=None)
config = parser.parse_args()

# Constant values
N_EPOCH = config.epoch
N_ITER = config.iteration
SEGLEN = 128

# =============== Directories and data ===============
# Make directories and create log file
save_path = os.path.join(config.save_root, "vcc")
logprint = utils.set_log(save_path, add=False)[1]

# Set input directories and data paths
if config.dataset == "vcc":
    data_root = "./data/vcc/"
    src_folders = sorted(os.listdir(data_root))
    data_paths = ["{}{}/cspec/".format(data_root, f) for f in src_folders]
    stat_paths = [
        "{}{}/train_cspecstat.npy".format(data_root, f) for f in src_folders
    ]
    label_paths = ["{}{}/label.npy".format(data_root, f) for f in src_folders]
    n_src = len(src_folders)
    src_data = [sorted(os.listdir(p)) for p in data_paths]
    n_src_data = [len(d) for d in src_data]
    src_batch_size = [math.floor(n) // N_ITER for n in n_src_data]