def train(env_fn=None,
          spectrum=False,
          a2c_arch=None,
          nenvs=16,
          nsteps=100,
          max_iters=1e6,
          gamma=0.99,
          pg_coeff=1.0,
          vf_coeff=0.5,
          ent_coeff=0.01,
          max_grad_norm=0.5,
          lr=7e-4,
          alpha=0.99,
          epsilon=1e-5,
          log_interval=100,
          summarize=True,
          load_path=None,
          log_path=None,
          cpu_cores=1):
    # Construct the vectorized parallel environments
    envs = [env_fn for _ in range(nenvs)]
    envs = SubprocVecEnv(envs)

    # Set some random seeds for the environment
    envs.seed(0)
    if spectrum:
        envs.spectrum()

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=cpu_cores,
                               intra_op_parallelism_threads=cpu_cores)
    tf_config.gpu_options.allow_growth = True

    with tf.Session(config=tf_config) as sess:
        actor_critic = ActorCritic(sess, a2c_arch, ob_space, ac_space,
                                   pg_coeff, vf_coeff, ent_coeff,
                                   max_grad_norm, lr, alpha, epsilon,
                                   summarize)

        load_count = 0
        if load_path is not None:
            actor_critic.load(load_path)
            print('Loaded a2c')

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        batch_ob_shape = (-1, nw, nh, nc)
        dones = [False for _ in range(nenvs)]
        episode_rewards = np.zeros((nenvs, ))
        final_rewards = np.zeros((nenvs, ))

        print('a2c Training Start!')
        print('Model will be saved on intervals of %i' % (log_interval))
        for i in tqdm(range(load_count + 1, int(max_iters) + 1),
                      ascii=True,
                      desc='ActorCritic'):
            # Create the minibatch lists
            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_depth = [], [], [], [], [], []
            total_reward = 0
            for n in range(nsteps):
                # Get the actions and values from the actor critic
                # (neglogp is not needed for the A2C update)
                actions, values, neglogp = actor_critic.act(obs)

                mb_obs.append(np.copy(obs))
                mb_actions.append(actions)
                mb_values.append(values)
                mb_dones.append(dones)

                obs, rewards, dones, info = envs.step(actions)

                # Track per-environment episode returns, resetting on done
                total_reward += np.sum(rewards)
                episode_rewards += rewards
                masks = 1 - np.array(dones)
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                mb_rewards.append(rewards)
                mb_depth.append(
                    np.array(
                        [info_item['scramble_depth'] for info_item in info]))
            mb_dones.append(dones)

            # Convert batch steps to batch rollouts
            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(
                1, 0).reshape(batch_ob_shape)
            mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
            mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
            mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
            mb_dones = np.asarray(mb_dones, dtype=np.float32).swapaxes(1, 0)
            mb_depth = np.asarray(mb_depth, dtype=np.int32).swapaxes(1, 0)
            mb_masks = mb_dones[:, :-1]
            mb_dones = mb_dones[:, 1:]

            last_values = actor_critic.critique(obs).tolist()

            # Discount the rewards along each rollout, bootstrapping from the
            # critic's value estimate when the last step is not terminal
            for n, (rewards, d, value) in enumerate(
                    zip(mb_rewards, mb_dones, last_values)):
                rewards = rewards.tolist()
                d = d.tolist()
                if d[-1] == 0:
                    rewards = discount_with_dones(rewards + [value], d + [0],
                                                  gamma)[:-1]
                else:
                    rewards = discount_with_dones(rewards, d, gamma)
                mb_rewards[n] = rewards

            # Flatten the whole minibatch
            mb_rewards = mb_rewards.flatten()
            mb_actions = mb_actions.flatten()
            mb_values = mb_values.flatten()
            mb_masks = mb_masks.flatten()
            mb_depth = mb_depth.flatten()

            # Run the training step and save the information to tensorboard
            if summarize:
                loss, policy_loss, value_loss, policy_ent, mrew, mdp, _, summary = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values,
                    mb_depth, i, summary_op)
                writer.add_summary(summary, i)
            else:
                loss, policy_loss, value_loss, policy_ent, mrew, mdp, _ = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values,
                    mb_depth, i)

            if i % log_interval == 0:
                actor_critic.save(log_path, i)

        actor_critic.save(log_path, 'final')
        print('a2c model is finished training')
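# The n-step returns above are computed with `discount_with_dones`, which is
# imported elsewhere in this repo and not shown in this listing. The sketch
# below is only an illustrative assumption consistent with how it is called
# here (and with the OpenAI Baselines helper of the same name), not
# necessarily the repo's exact implementation.
def discount_with_dones(rewards, dones, gamma):
    """Backward-accumulate discounted returns, resetting at episode ends."""
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        # A done flag of 1 zeroes the running return so that episodes do not
        # bleed into one another
        r = reward + gamma * r * (1. - done)
        discounted.append(r)
    return discounted[::-1]


# Example: discount_with_dones([1, 1, 1], [0, 0, 1], 0.99)
# returns [2.9701, 1.99, 1.0]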
def train(env_fn=None,
          spectrum=False,
          vae_arch=None,
          a2c_arch=None,
          nenvs=16,
          nsteps=100,
          max_iters=1e6,
          kl_coeff=0.5,
          lr=7e-4,
          log_interval=100,
          summarize=True,
          vae_load_path=None,
          a2c_load_path=None,
          log_path=None,
          cpu_cores=1):
    # Construct the vectorized parallel environments
    envs = [env_fn for _ in range(nenvs)]
    envs = SubprocVecEnv(envs)

    # Set some random seeds for the environment
    envs.seed(0)
    if spectrum:
        envs.spectrum()

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=cpu_cores,
                               intra_op_parallelism_threads=cpu_cores)
    tf_config.gpu_options.allow_growth = True

    with tf.Session(config=tf_config) as sess:
        actor_critic = RandomActorCritic(sess, a2c_arch, ob_space, ac_space,
                                         nenvs, nsteps)

        if a2c_load_path is not None:
            actor_critic.load(a2c_load_path)
            print('Loaded a2c')
        else:
            actor_critic.epsilon = -1
            print('WARNING: No Actor Critic model loaded. Using a random agent')

        vae = VariationalAutoEncoder(sess, vae_arch, ob_space, ac_space, lr,
                                     kl_coeff, summarize)

        load_count = 0
        if vae_load_path is not None:
            vae.load(vae_load_path)

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        print('VAE Training Start!')
        print('Model will be saved on intervals of %i' % (log_interval))
        for i in tqdm(range(load_count + 1, int(max_iters) + 1),
                      ascii=True,
                      desc='VarAutoEncoder'):
            # Collect (state, action, reward, next state, done) transitions by
            # rolling out the (possibly random) actor critic in the envs
            mb_s, mb_a, mb_r, mb_ns, mb_d = [], [], [], [], []
            for s, a, r, ns, d in model_play_games(actor_critic, envs, nsteps):
                mb_s.append(s)
                mb_a.append(a)
                mb_r.append(r)
                mb_ns.append(ns)
                mb_d.append(d)

            mb_s = np.concatenate(mb_s)
            mb_a = np.concatenate(mb_a)
            mb_r = np.concatenate(mb_r)
            mb_ns = np.concatenate(mb_ns)
            mb_d = np.concatenate(mb_d)

            # Run the VAE training step and save the summaries to tensorboard
            if summarize:
                loss, recon_loss, kl_loss, _, smy = vae.train(
                    mb_s, mb_a, mb_ns, mb_r, summary_op)
                writer.add_summary(smy, i)
            else:
                loss, recon_loss, kl_loss, _ = vae.train(
                    mb_s, mb_a, mb_ns, mb_r)

            if i % log_interval == 0:
                vae.save(log_path, i)

        vae.save(log_path, 'final')
        print('Variational AutoEncoder is finished training')
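# `model_play_games` is defined elsewhere in this repo. Judging from how it is
# consumed above, it yields batches of (state, action, reward, next_state,
# done) transitions gathered by stepping the actor critic through the
# vectorized environments. The sketch below is a hypothetical reconstruction
# under that assumption, not the repo's actual implementation.
import numpy as np


def model_play_games(actor_critic, envs, nsteps):
    """Yield one transition batch per step across all parallel envs."""
    obs = envs.reset()
    for _ in range(nsteps):
        # act() is assumed to return (actions, values, neglogp) as in the
        # A2C trainer above; only the actions are needed here
        actions, _, _ = actor_critic.act(obs)
        next_obs, rewards, dones, _ = envs.step(actions)
        yield (np.copy(obs), np.asarray(actions), np.asarray(rewards),
               np.copy(next_obs), np.asarray(dones))
        obs = next_obs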