def evaluate(step, episode_number):
    """Evaluate the current policy and checkpoint it when the mean reward improves.

    Runs ``n_eval_episodes`` episodes on the module-level ``env`` with
    deterministic actions from ``act`` (``stochastic=False``). When
    ``eval_std`` is non-zero, every greedy action component is replaced by a
    Gaussian sample centred on it; samples whose grid index falls outside
    ``[0, num_actions_pad)`` are rejected and redrawn.

    The total reward divided by ``n_eval_episodes`` is appended to
    ``game_scores`` and ``step`` to ``score_timesteps``. If that mean beats
    ``max_eval_reward_mean`` (or no best exists yet), the TF state is saved to
    ``model_file`` and an ``ActWrapper`` is saved next to it.

    Args:
        step: Value recorded in ``score_timesteps`` for this evaluation.
        episode_number: Embedded in the file name of the saved act function.
    """
    global max_eval_reward_mean, model_saved

    print('Evaluate...')
    eval_reward_sum = 0.0
    # Run evaluation episodes
    for _ in range(n_eval_episodes):
        obs = env.reset()
        done = False
        while not done:
            # Choose action
            action_idxes = np.array(
                act(np.array(obs)[None], stochastic=False))  # deterministic
            # Map grid indices to continuous actions: idx / grains * range + low
            actions_greedy = action_idxes / num_action_grains * actions_range + low
            if eval_std == 0.0:
                action = actions_greedy
            else:
                action = []
                for index, a_greedy in enumerate(actions_greedy):
                    out_of_range_action = True
                    while out_of_range_action:
                        a_stoch = np.random.normal(loc=a_greedy, scale=eval_std)
                        # Inverse of the mapping above. The original added
                        # high[index], which is only equivalent when the
                        # action range is symmetric (low == -high).
                        a_idx_stoch = np.rint(
                            (a_stoch - low[index]) / actions_range[index]
                            * num_action_grains)
                        if a_idx_stoch >= 0 and a_idx_stoch < num_actions_pad:
                            action.append(a_stoch)
                            out_of_range_action = False
            # Step
            obs, rew, done, _ = env.step(action)
            eval_reward_sum += rew

    # Average the rewards and log
    eval_reward_mean = eval_reward_sum / n_eval_episodes
    print(eval_reward_mean, 'over', n_eval_episodes, 'episodes')
    game_scores.append(eval_reward_mean)
    score_timesteps.append(step)

    if max_eval_reward_mean is None or eval_reward_mean > max_eval_reward_mean:
        logger.log(
            "Saving model due to mean eval increase: {} -> {}".format(
                max_eval_reward_mean, eval_reward_mean))
        U.save_state(model_file)
        model_saved = True
        max_eval_reward_mean = eval_reward_mean
        intact = ActWrapper(act, act_params)
        # Build the target name once so the saved path and the printed path
        # cannot drift apart.
        act_file = (model_file + "_" + str(episode_number) + "_"
                    + str(int(np.round(max_eval_reward_mean))))
        intact.save(act_file)
        print('Act saved to ' + act_file)
def evaluate(step, episode_number):
    """Evaluate the current policy, log the result to CSV and checkpoint on improvement.

    Runs ``n_eval_episodes`` episodes on the module-level ``env`` with
    deterministic actions from ``act`` (``stochastic=False``). When
    ``eval_std`` is non-zero, every greedy action component is replaced by a
    Gaussian sample centred on it; samples whose grid index falls outside
    ``[0, num_actions_pad)`` are rejected and redrawn.

    One tab-separated row ``[episode_number, step, eval_reward_mean]`` is
    appended to ``results/<time_stamp>_<env_name>_eval.csv``. If the mean
    beats ``max_eval_reward_mean`` (or no best exists yet), the TF state is
    saved to ``model_file``.

    Args:
        step: Step counter written to the CSV row.
        episode_number: Episode counter written to the CSV row.
    """
    global max_eval_reward_mean, model_saved

    print('Evaluate...')
    eval_reward_sum = 0.0
    # Run evaluation episodes
    for _ in range(n_eval_episodes):
        obs = env.reset()
        done = False
        while not done:
            # Choose action
            action_idxes = np.array(
                act(np.array(obs)[None], stochastic=False))  # deterministic
            # Map grid indices to continuous actions: idx / grains * range + low
            actions_greedy = action_idxes / num_action_grains * actions_range + low
            if eval_std == 0.0:
                action = actions_greedy
            else:
                action = []
                for index, a_greedy in enumerate(actions_greedy):
                    out_of_range_action = True
                    while out_of_range_action:
                        a_stoch = np.random.normal(loc=a_greedy, scale=eval_std)
                        # Inverse of the mapping above. The original added
                        # high[index], which is only equivalent when the
                        # action range is symmetric (low == -high).
                        a_idx_stoch = np.rint(
                            (a_stoch - low[index]) / actions_range[index]
                            * num_action_grains)
                        if a_idx_stoch >= 0 and a_idx_stoch < num_actions_pad:
                            action.append(a_stoch)
                            out_of_range_action = False
            # Step
            obs, rew, done, _ = env.step(action)
            eval_reward_sum += rew

    # Average the rewards and log
    eval_reward_mean = eval_reward_sum / n_eval_episodes
    print(eval_reward_mean, 'over', n_eval_episodes, 'episodes')
    with open("results/{}_{}_eval.csv".format(time_stamp, env_name), "a") as eval_fw:
        eval_writer = csv.writer(
            eval_fw,
            delimiter="\t",
            lineterminator="\n",
        )
        eval_writer.writerow([episode_number, step, eval_reward_mean])

    if max_eval_reward_mean is None or eval_reward_mean > max_eval_reward_mean:
        logger.log(
            "Saving model due to mean eval increase: {} -> {}".format(
                max_eval_reward_mean, eval_reward_mean))
        U.save_state(model_file)
        model_saved = True
        max_eval_reward_mean = eval_reward_mean
def learn(env, policy_func, dataset, pretrained, optim_batch_size=128,
          max_iters=1e4, adam_epsilon=1e-5, optim_stepsize=3e-4,
          ckpt_dir=None, log_dir=None, task_name=None):
    """Fit a policy to expert data by minimising squared action error.

    Builds policy ``pi`` with ``policy_func``, defines the loss
    ``mean((ac - pi.ac) ** 2)`` and optimises it with ``MpiAdam`` for
    ``int(max_iters)`` iterations on training batches from ``dataset``.
    About ten times over the run the loss on the full validation split is
    logged.

    Args:
        env: Supplies ``observation_space`` and ``action_space``.
        policy_func: Callable ``(name, ob_space, ac_space) -> policy``.
        dataset: Object exposing ``get_next_batch(batch_size, split)``.
        pretrained: When falsy, per-iteration loss summaries are written to
            ``log_dir`` and checkpoints to ``ckpt_dir/task_name`` at each
            validation step. When truthy, the final policy variables are
            saved to a temporary path, which is returned.
        optim_batch_size: Training batch size.
        max_iters: Number of optimisation iterations (cast to ``int``).
        adam_epsilon: Epsilon passed to ``MpiAdam``.
        optim_stepsize: Step size passed to ``adam.update``.
        ckpt_dir: Checkpoint directory (used only when ``pretrained`` is falsy).
        log_dir: Summary directory (used only when ``pretrained`` is falsy).
        task_name: Checkpoint file name inside ``ckpt_dir``.

    Returns:
        The path the policy variables were saved to when ``pretrained`` is
        truthy, otherwise ``None``.
    """
    # Validate roughly every tenth of the run. Clamp to 1 so that
    # max_iters < 10 does not make the modulo below divide by zero.
    val_per_iter = max(1, int(max_iters / 10))
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic],
                             [loss] + [U.flatgrad(loss, var_list)])

    if not pretrained:
        writer = U.FileWriter(log_dir)
        ep_stats = stats(["Loss"])
    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        # From here on `loss` is the evaluated value, not the graph tensor;
        # lossandgrad has already captured the tensor.
        loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if not pretrained:
            ep_stats.add_all_summary(writer, [loss], iter_so_far)
        if iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            loss, g = lossandgrad(ob_expert, ac_expert, False)
            logger.log("Validation:")
            logger.log("Loss: %f" % loss)
            if not pretrained:
                U.save_state(os.path.join(ckpt_dir, task_name),
                             counter=iter_so_far)

    if pretrained:
        # NOTE(review): only the generated name is kept; the
        # TemporaryDirectory object itself is discarded right away, so the
        # name serves purely as a unique save path.
        savedir_fname = tempfile.TemporaryDirectory().name
        U.save_state(savedir_fname, var_list=pi.get_variables())
        return savedir_fname
    return None
def save(self, path):
    """Serialise the model state and act parameters into one pickle at `path`.

    The state is first written into a temporary directory with
    ``U.save_state``, every resulting file is packed into a zip archive, and
    the archive bytes are dumped with ``dill`` together with
    ``self._act_params`` as the tuple ``(model_data, act_params)``.
    """
    with tempfile.TemporaryDirectory() as workdir:
        U.save_state(os.path.join(workdir, "model"))
        archive_path = os.path.join(workdir, "packed.zip")
        with zipfile.ZipFile(archive_path, 'w') as archive:
            for folder, _subdirs, filenames in os.walk(workdir):
                for filename in filenames:
                    candidate = os.path.join(folder, filename)
                    # The archive lives in the directory being walked;
                    # never pack it into itself.
                    if candidate == archive_path:
                        continue
                    archive.write(candidate,
                                  os.path.relpath(candidate, workdir))
        with open(archive_path, "rb") as packed:
            model_data = packed.read()
    payload = (model_data, self._act_params)
    with open(path, "wb") as out:
        dill.dump(payload, out)
agent2.experience(obs, [act2], rews[1], obs_n, goals[1]) print(env.scores, act1, act2, rews, goals, win) if win: break ep_score = env.scores agent1_score.append(ep_score[0]) agent2_score.append(ep_score[1]) q_loss1, p_loss1 = agent1.learn(arglist.batch_size, arglist.gamma) q_loss2, p_loss2 = 0,0 if arglist.adv_agent == "agent": q_loss2, p_loss2 = agent2.learn(arglist.batch_size, arglist.gamma) agent1_q_loss.append(q_loss1) agent1_p_loss.append(p_loss1) agent2_q_loss.append(q_loss2) agent2_p_loss.append(p_loss2) print("Episodes {} scores: {}, losses {}".format(ep, ep_score,[q_loss1, p_loss1 , q_loss2, p_loss2 ])) log.write('{}\n'.format(','.join(map(str, [ep_score[0], ep_score[1] , q_loss1, p_loss1 , q_loss2, p_loss2 ])))) print("Epoch {} scores: {}, losses {}".format( epo, [np.mean(agent1_score), np.mean(agent2_score)], [np.mean(agent1_q_loss), np.mean(agent1_p_loss) , np.mean(agent2_q_loss), np.mean(agent2_p_loss) ])) U.save_state(model_path, saver)
def save(self, save_path):
    """Save the state of this object's session to `save_path`.

    Delegates to ``tf_util.save_state``, passing ``self.sess`` as the session.
    """
    session = self.sess
    tf_util.save_state(save_path, sess=session)
def learn(
        env,
        policy_func,
        discriminator,
        expert_dataset,
        embedding_z,
        pretrained,
        pretrained_weight,
        *,
        g_step,
        d_step,
        timesteps_per_batch,  # what to train on
        max_kl,
        cg_iters,
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        d_stepsize=3e-4,
        vf_iters=3,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        save_per_iter=100,
        ckpt_dir=None,
        log_dir=None,
        load_model_path=None,
        task_name=None):
    """Alternate KL-constrained policy updates with discriminator updates.

    Each outer iteration performs ``g_step`` policy updates (natural-gradient
    direction from conjugate gradient, then a backtracking line search that
    keeps the mean KL to the old policy under ``1.5 * max_kl``; the value
    function is fitted with ``MpiAdam``), followed by discriminator updates on
    minibatches of generated and expert ``(ob, ac)`` pairs. Gradients and
    losses are averaged across MPI workers.

    Args:
        env: Supplies ``observation_space`` / ``action_space`` and is handed
            to ``traj_segment_generator``.
        policy_func: Callable building a policy; called for ``"pi"`` (with
            ``reuse`` set when ``pretrained_weight`` is given) and ``"oldpi"``.
        discriminator: Exposes ``get_trainable_variables``, ``lossandgrad``,
            ``loss_name`` and optionally ``obs_rms``.
        expert_dataset: Exposes ``get_next_batch(batch_size)``.
        embedding_z: Forwarded to ``traj_segment_generator`` as ``embedding``.
        pretrained: Not used by this function.
        pretrained_weight: Optional path whose state is loaded into ``pi``.
        g_step: Policy updates per outer iteration.
        d_step: Divides the rollout into discriminator minibatches of size
            ``len(ob) // d_step``.
        timesteps_per_batch: Rollout length per segment.
        max_kl: KL bound used to scale the step and in the line search.
        cg_iters: Conjugate-gradient iterations.
        gamma, lam: Passed to ``add_vtarg_and_adv``.
        entcoeff: Weight of the entropy bonus.
        cg_damping: Damping added to the Fisher-vector product.
        vf_stepsize, vf_iters: Value-function step size and epochs.
        d_stepsize: Discriminator step size.
        max_timesteps, max_episodes, max_iters: Stopping criteria; exactly one
            must be positive.
        callback: Optional callable invoked as ``callback(locals(), globals())``
            at the start of every iteration.
        save_per_iter: Checkpoint period in iterations.
        ckpt_dir, task_name: Checkpoint location (skipped when ``ckpt_dir`` is
            ``None``).
        log_dir: Directory for the summary writer.
        load_model_path: Optional path of a full state to restore.
    """
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space,
                     reuse=(pretrained_weight is not None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = U.mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    # Policy and value-function variables are separated by the second
    # component of their variable name ("pol..." / "vf...").
    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("pol")
    ]
    vf_var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("vf")
    ]
    d_adam = MpiAdam(discriminator.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None],
                                  name="flat_tan")
    # Slice the flat tangent vector back into per-variable shapes.
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n(
        [U.sum(g * tangent)
         for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(),
                                        pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function(
        [ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        # Only rank 0 prints timing information.
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        # Element-wise mean of `x` over all MPI workers.
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    writer = U.FileWriter(log_dir)
    U.initialize()
    # Make every worker start from rank 0's policy parameters.
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, discriminator,
                                     embedding=embedding_z,
                                     timesteps_per_batch=timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(discriminator.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if provide pretrained weight
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())
    # if provided model path
    if load_model_path is not None:
        U.load_state(load_model_path)

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)

        def fisher_vector_product(p):
            # Reads `fvpargs` from the enclosing scope; it is assigned in the
            # policy-update loop below before this is first called.
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            # The Fisher-vector product is evaluated on every 5th sample only.
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
                # No step was taken, so the losses are exactly those measured
                # before the update. Without this, `meanlosses` would be
                # unbound (first iteration) or stale (later ones) below.
                meanlosses = lossbefore
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product, g,
                                 cg_iters=cg_iters, verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                # Backtracking line search: halve the step up to 10 times.
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    # Sanity check that all workers hold identical parameters.
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                            (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(
                                mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

            g_losses = meanlosses
            for (lossname, lossval) in zip(loss_names, meanlosses):
                logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, discriminator.loss_name))
        ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob))
        batch_size = len(ob) // d_step
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch in dataset.iterbatches(
                (ob, ac), include_final_partial_batch=False,
                batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            # update running mean/std for discriminator
            if hasattr(discriminator, "obs_rms"):
                discriminator.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = discriminator.lossandgrad(ob_batch, ac_batch,
                                                      ob_expert, ac_expert)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
            g_loss_stats.add_all_summary(writer, g_losses, iters_so_far)
            d_loss_stats.add_all_summary(writer, np.mean(d_losses, axis=0),
                                         iters_so_far)
            ep_stats.add_all_summary(writer, [
                np.mean(true_rewbuffer),
                np.mean(rewbuffer),
                np.mean(lenbuffer)
            ], iters_so_far)
def train_maddpg(arglist):
    """Run the multi-agent training / benchmarking / display loop.

    Creates the environment and one trainer per agent, optionally restores a
    previous state, then steps the environment until either benchmarking
    finishes or more than ``arglist.num_episodes`` episodes have been played.
    Every ``arglist.save_rate`` episodes the state is saved and reward
    statistics are printed; at the end the reward curves and the argument
    object are pickled under ``arglist.plots_dir``.

    Args:
        arglist: Parsed options. Attributes read here: ``scenario``,
            ``benchmark``, ``num_adversaries``, ``good_policy``,
            ``adv_policy``, ``load_dir``, ``save_dir``, ``display``,
            ``restore``, ``real_q_log``, ``max_episode_len``,
            ``benchmark_iters``, ``benchmark_dir``, ``exp_name``,
            ``save_rate``, ``plots_dir``, ``num_episodes``.
    """
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist,
                                good_agent_mode=arglist.good_policy,
                                adv_agent_mode=arglist.adv_policy)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=None)
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        if arglist.real_q_log:
            world_state_buffer, action_n_buffer, start_episode_step_buffer, obs_n_buffer = [], [], [], []
            q_means, real_means = [], []

        print('Starting iterations...')
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)  # note: unused, never happens
            terminal = (episode_step >= arglist.max_episode_len)
            done = done or terminal

            if arglist.real_q_log:
                world_state_buffer.append(deepcopy(env.world))
                obs_n_buffer.append(obs_n)
                action_n_buffer.append(action_n)
                start_episode_step_buffer.append(episode_step)

            # collect experience
            # NOTE(review): every agent receives the combined `done` flag
            # (which already includes `terminal`), not its own done_n[i].
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done, terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                # add element for next episode
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                # NOTE(review): the loop variable `info` is unused and
                # info_n['n'] is appended on each pass - confirm the intended
                # structure of info_n.
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers (not reached in display or benchmark mode)
            for agent in trainers:
                agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                if arglist.save_dir != '/tmp/policy/':
                    U.save_state(arglist.save_dir + arglist.exp_name,
                                 saver=saver,
                                 global_step=len(episode_rewards))
                else:
                    U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(
                            train_step, len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate:-1]),
                            round(time.time() - t_start, 3)))
                else:
                    # The last entry of every reward list is the 0 placeholder
                    # of the episode that has just started. It is sliced off
                    # here as in every other statistic in this block (the
                    # original averaged it into the per-agent value).
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(
                            train_step, len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate:-1]),
                            [
                                np.mean(rew[-arglist.save_rate:-1])
                                for rew in agent_rewards
                            ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:-1]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:-1]))

                if arglist.real_q_log and (len(episode_rewards) %
                                           (5 * arglist.save_rate) == 0):
                    q_mean, real_mean = calculate_real_q_value(
                        deepcopy(env), trainers,
                        world_state_buffer=world_state_buffer,
                        action_n_buffer=action_n_buffer,
                        obs_n_buffer=obs_n_buffer,
                        start_episode_step_buffer=start_episode_step_buffer,
                        num_start_states=200,
                        args=arglist)
                    world_state_buffer, action_n_buffer, start_episode_step_buffer, obs_n_buffer = [], [], [], []
                    q_means.append(q_mean)
                    real_means.append(real_mean)
                    print('Q-mean: ' + str(q_mean) + ' Real mean: ' +
                          str(real_mean))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                args_file_name = arglist.plots_dir + arglist.exp_name + '_args.pkl'
                with open(args_file_name, 'wb') as fp:
                    pickle.dump(arglist, fp)
                if arglist.real_q_log:
                    real_q_path = arglist.plots_dir + arglist.exp_name + '_q_values.pkl'
                    with open(real_q_path, 'wb') as fp:
                        pickle.dump(
                            {
                                'q_means': q_means,
                                'real_means': real_means
                            }, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
def learn(
        env,
        policy_func,
        discriminator,
        expert_dataset,
        timesteps_per_batch,
        *,
        g_step,
        d_step,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        d_stepsize=3e-4,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        save_per_iter=100,
        ckpt_dir=None,
        task="train",
        sample_stochastic=True,
        load_model_path=None,
        task_name=None,
        max_sample_traj=1500):
    """Alternate clipped-surrogate policy updates with discriminator updates.

    Each outer iteration performs ``g_step`` policy updates (``optim_epochs``
    passes of minibatch ``MpiAdam`` on the clipped surrogate plus entropy
    penalty plus value loss), followed by discriminator updates on
    minibatches of generated and expert ``(ob, ac)`` pairs.

    With ``task == 'sample_trajectory'`` no training happens:
    ``sample_trajectory`` is called and the process exits.

    Args:
        env: Supplies ``observation_space`` / ``action_space`` and is handed
            to the rollout generators.
        policy_func: Callable ``(name, ob_space, ac_space) -> policy``.
        discriminator: Exposes ``get_trainable_variables``, ``lossandgrad``,
            ``loss_name`` and optionally ``obs_rms``.
        expert_dataset: Exposes ``get_next_batch(batch_size)``.
        timesteps_per_batch: Rollout length per segment.
        g_step: Policy updates per outer iteration.
        d_step: Divides the rollout into discriminator minibatches of size
            ``len(ob) // d_step``.
        clip_param: Clipping range, multiplied by the learning-rate multiplier.
        entcoeff: Entropy coefficient.
        optim_epochs, optim_stepsize, optim_batchsize: Policy optimisation
            settings; a falsy ``optim_batchsize`` means the whole batch.
        gamma, lam: Passed to ``add_vtarg_and_adv``.
        max_timesteps, max_episodes, max_iters, max_seconds: Stopping
            criteria; exactly one must be positive.
        callback: Optional callable invoked as ``callback(locals(), globals())``.
        adam_epsilon: Epsilon of the policy optimiser.
        d_stepsize: Discriminator step size.
        schedule: ``'constant'`` or ``'linear'`` (the latter divides by
            ``max_timesteps``).
        save_per_iter: Checkpoint period in iterations.
        ckpt_dir, task_name: Checkpoint location (skipped when ``ckpt_dir`` is
            ``None``).
        task: ``'sample_trajectory'`` switches to sampling mode.
        sample_stochastic, load_model_path, max_sample_traj: Forwarded to the
            trajectory generator / ``sample_trajectory``.

    Raises:
        NotImplementedError: For an unknown ``schedule``.
    """
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    d_adam = MpiAdam(discriminator.get_trainable_variables())
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(),
                                        pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    # Make every worker start from rank 0's policy parameters.
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    adam.sync()

    def allmean(x):
        # Element-wise mean of `x` over all MPI workers.
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, discriminator,
                                     timesteps_per_batch, stochastic=True)
    traj_gen = traj_episode_generator(pi, env, timesteps_per_batch,
                                      stochastic=sample_stochastic)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=100)

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    if task == 'sample_trajectory':
        # not elegant, i know :(
        sample_trajectory(load_model_path, max_sample_traj, traj_gen,
                          task_name, sample_stochastic)
        sys.exit()

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            # NOTE(review): divides by max_timesteps, so 'linear' only works
            # when max_timesteps is the active stopping criterion.
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)
        for _ in range(g_step):
            seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)

            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            optim_batchsize = optim_batchsize or ob.shape[0]

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            logger.log("Optimizing...")
            logger.log(fmt_row(13, loss_names))
            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                # From here on `losses` holds evaluated values; the graph
                # tensors were already captured by the U.function objects.
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                                batch["atarg"],
                                                batch["vtarg"], cur_lrmult)
                    adam.update(g, optim_stepsize * cur_lrmult)
                    losses.append(newlosses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

            logger.log("Evaluating losses...")
            losses = []
            for batch in d.iterate_once(optim_batchsize):
                newlosses = compute_losses(batch["ob"], batch["ac"],
                                           batch["atarg"], batch["vtarg"],
                                           cur_lrmult)
                losses.append(newlosses)
            meanlosses, _, _ = mpi_moments(losses, axis=0)

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, discriminator.loss_name))
        # The original repeated the next three statements verbatim, drawing
        # (and discarding) one extra expert batch per iteration.
        ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob))
        batch_size = len(ob) // d_step
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch in dataset.iterbatches(
                (ob, ac), include_final_partial_batch=False,
                batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            # update running mean/std for discriminator
            if hasattr(discriminator, "obs_rms"):
                discriminator.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = discriminator.lossandgrad(ob_batch, ac_batch,
                                                      ob_expert, ac_expert)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        # ----------------- logger --------------------
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        true_rewbuffer.extend(true_rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if rank == 0:
            logger.dump_tabular()
def learn(env_id, q_func, lr=5e-4, max_timesteps=10000, buffer_size=5000,
          exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1,
          train_steps=10, learning_starts=500, batch_size=32, print_freq=10,
          checkpoint_freq=100, model_dir=None, gamma=1.0,
          target_network_update_freq=50, prioritized_replay=False,
          prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6,
          param_noise=False, player_processes=None, player_connections=None):
    """Train a Q-network from experience received over player connections.

    The learner does not step an environment itself: on every iteration it
    receives ``(experiences, reward)`` from each entry of
    ``player_connections``, stores the experiences in a replay buffer and,
    once ``learning_starts`` experiences have been collected, runs
    ``train_steps`` optimisation steps every ``train_freq`` iterations.
    Model, policy and replay buffer are persisted under ``model_dir`` and
    reloaded from there if present.

    Args:
        env_id: Passed to ``create_gvgai_environment``; the environment is
            used only for its observation/action spaces and then closed.
        q_func: Q-function builder forwarded to ``build_train``.
        lr: Adam learning rate.
        max_timesteps: Number of outer iterations.
        buffer_size: Capacity of a newly created replay buffer.
        exploration_fraction, exploration_final_eps: Parameters of the
            exploration schedule (used here only for logging).
        train_freq, train_steps: Train every ``train_freq`` iterations, for
            ``train_steps`` batches.
        learning_starts: Minimum number of stored experiences before training.
        batch_size: Replay sample size.
        print_freq: Logging period, or ``None`` to disable.
        checkpoint_freq: Checkpoint period (saves only on improved mean reward).
        model_dir: Directory for ``Policy.pkl``, ``model/`` and the replay
            buffer pickle. The default ``None`` is not usable, because the
            path joins below require a string.
        gamma: Discount factor.
        target_network_update_freq: Target network sync period.
        prioritized_replay*: Prioritised replay settings.
        param_noise: Forwarded to ``build_train``.
        player_processes: Processes joined after training.
        player_connections: Connections polled for experience.

    Returns:
        The result of ``act.load(policy_path)``.
    """
    global terminate_learning

    env, _, _ = create_gvgai_environment(env_id)
    # Create all the functions necessary to train the model
    # expert_decision_maker = ExpertDecisionMaker(env=env)

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space = env.observation_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, train, update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise)

    session = tf.Session()
    session.__enter__()

    policy_path = os.path.join(model_dir, "Policy.pkl")
    model_path = os.path.join(model_dir, "model", "model")

    # Wrap `act` unconditionally. The original built the wrapper only when no
    # saved model existed, so a resumed run kept a plain function and the
    # later act.save / act.load calls failed.
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }
    act = ActWrapper(act, act_params)

    if os.path.isdir(os.path.join(model_dir, "model")):
        load_state(model_path)
    else:
        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()
        act.save(policy_path)
        save_state(model_path)
    env.close()

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer_path = os.path.join(model_dir, "Prioritized_replay.pkl")
        if os.path.isfile(replay_buffer_path):
            # NOTE(review): pickle.load executes arbitrary code from the
            # file; only point model_dir at trusted data.
            with open(replay_buffer_path, 'rb') as input_file:
                replay_buffer = pickle.load(input_file)
        else:
            replay_buffer = PrioritizedReplayBuffer(
                buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer_path = os.path.join(model_dir, "Normal_replay.pkl")
        if os.path.isfile(replay_buffer_path):
            # NOTE(review): see the pickle.load remark above.
            with open(replay_buffer_path, 'rb') as input_file:
                replay_buffer = pickle.load(input_file)
        else:
            replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    episode_rewards = list()
    saved_mean_reward = -999999999
    # Bound before the loop so that the final comparison is defined even if
    # training never got past the warm-up `continue` (or the loop is empty).
    mean_100ep_reward = saved_mean_reward

    signal.signal(signal.SIGQUIT, signal_handler)

    total_timesteps = 0
    for timestep in range(max_timesteps):
        if terminate_learning:
            break

        for connection in player_connections:
            experiences, reward = connection.recv()
            episode_rewards.append(reward)
            for experience in experiences:
                replay_buffer.add(*experience)
                total_timesteps += 1

        if total_timesteps < learning_starts:
            if timestep % 10 == 0:
                print("not strated yet", flush=True)
            continue

        if timestep % train_freq == 0:
            for i in range(train_steps):
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size,
                        beta=beta_schedule.value(total_timesteps))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1,
                                  dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

        if timestep % target_network_update_freq == 0:
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if print_freq is not None and timestep % print_freq == 0:
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 100 episode reward",
                                  mean_100ep_reward)
            logger.record_tabular(
                "% time spent exploring",
                int(100 * exploration.value(total_timesteps)))
            logger.dump_tabular()

        if timestep % checkpoint_freq == 0 and mean_100ep_reward > saved_mean_reward:
            act.save(policy_path)
            save_state(model_path)
            saved_mean_reward = mean_100ep_reward
            with open(replay_buffer_path, 'wb') as output_file:
                pickle.dump(replay_buffer, output_file,
                            pickle.HIGHEST_PROTOCOL)
        send_message_to_all(player_connections, Message.UPDATE)

    send_message_to_all(player_connections, Message.TERMINATE)
    if mean_100ep_reward > saved_mean_reward:
        act.save(policy_path)
        with open(replay_buffer_path, 'wb') as output_file:
            pickle.dump(replay_buffer, output_file, pickle.HIGHEST_PROTOCOL)
    for player_process in player_processes:
        player_process.join()
        # player_process.terminate()
    return act.load(policy_path)
def _window_observations(prefix_obs, batch_obs, batch_size, timesteps,
                         ob_dim):
    """Cut a minibatch of observations into overlapping windows.

    `prefix_obs` (the `timesteps - 1` observations that precede the batch) is
    prepended to `batch_obs`; the result is flattened by `np.append` and
    reshaped to `(batch_size + timesteps - 1, ob_dim)`. Window `i` is rows
    `i .. i + timesteps - 1` of that array.

    Returns:
        A `(windows, carry)` pair. `windows` is a list of `batch_size`
        slices of the stacked array, one per sample. `carry` is the last
        `timesteps - 1` rows, i.e. the prefix for the next consecutive batch.
    """
    stacked = np.append(prefix_obs, batch_obs).reshape(
        batch_size + timesteps - 1, ob_dim)
    windows = [stacked[start:start + timesteps] for start in range(batch_size)]
    # Slice from the front instead of `stacked[-(timesteps - 1):]`: with
    # timesteps == 1 that expression is `stacked[-0:]`, i.e. the WHOLE array,
    # and the next batch's reshape then fails. For timesteps >= 2 both forms
    # select the same rows.
    carry = stacked[batch_size:]
    return windows, carry


def learn(
        env,
        policy_func,
        *,
        timesteps=4,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        save_per_iter=100,
        ckpt_dir=None,
        task="train",
        sample_stochastic=True,
        load_model_path=None,
        task_name=None,
        max_sample_traj=1500):
    """Run clipped-surrogate PPO on windows of `timesteps` observations.

    The policy surrogate (plus entropy penalty) and the value-function loss
    are optimised separately: variables whose second name component starts
    with "vf" get their own gradient function and their own MpiAdam.

    Args:
        env: Environment. Only `observation_space` / `action_space` are read
            here; the object is otherwise handed to the rollout generators.
        policy_func: Called as `policy_func(name, timesteps, ob_space,
            ac_space)` to build the "pi" and "oldpi" networks.
        timesteps: Number of consecutive observations in each window fed to
            the networks.
        timesteps_per_batch: Forwarded to the rollout generators.
        clip_param: PPO clipping epsilon; scaled by the learning-rate
            multiplier of the current iteration.
        entcoeff: Weight of the entropy bonus (enters the loss negated).
        optim_epochs: Number of logged optimisation passes over each segment.
        optim_stepsize: Adam step size, scaled by the learning-rate
            multiplier.
        optim_batchsize: Minibatch size; a falsy value means "the whole
            segment" (`ob.shape[0]`).
        gamma, lam: Forwarded to `add_vtarg_and_adv`.
        max_timesteps, max_episodes, max_iters, max_seconds: Stopping
            criteria; exactly one must be positive.
        callback: If truthy, called at the top of every iteration with
            `(locals(), globals())`.
        adam_epsilon: Epsilon for both MpiAdam optimisers.
        schedule: 'constant' (multiplier 1.0) or 'linear' (decays with
            `timesteps_so_far / max_timesteps`, so it needs `max_timesteps`
            to be the active constraint).
        save_per_iter: Checkpoint period, in iterations.
        ckpt_dir: Checkpoint directory; `None` disables checkpointing.
        task: If equal to 'sample_trajectory', `sample_trajectory(...)` is
            called and the process exits instead of training.
        sample_stochastic: `stochastic` flag of the episode generator; also
            forwarded to `sample_trajectory`.
        load_model_path, max_sample_traj: Forwarded to `sample_trajectory`.
        task_name: Checkpoint file name inside `ckpt_dir`; also forwarded to
            `sample_trajectory`.

    Raises:
        AssertionError: If the number of positive time constraints is not 1.
        NotImplementedError: If `schedule` is neither 'constant' nor 'linear'.
    """
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", timesteps, ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", timesteps, ob_space,
                        ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    # Not referenced again in this function; kept so the graph is unchanged.
    pi_vpred = tf.placeholder(dtype=tf.float32, shape=[None])

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    # vf_loss is deliberately left out of total_loss: it is minimised through
    # its own gradient function and optimiser below.
    total_loss = pol_surr + pol_entpen
    losses = [pol_surr, pol_entpen, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "kl", "ent"]

    # Split the trainable variables into value-function and policy groups by
    # the second component of the variable name.
    var_list = pi.get_trainable_variables()
    vf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("vf")
    ]
    pol_var_list = [
        v for v in var_list if not v.name.split("/")[1].startswith("vf")
    ]
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, pol_var_list)])
    vf_grad = U.function([ob, ac, atarg, ret, lrmult],
                         U.flatgrad(vf_loss, vf_var_list))
    pol_adam = MpiAdam(pol_var_list, epsilon=adam_epsilon)
    vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    pol_adam.sync()
    vf_adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     timesteps,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)
    traj_gen = traj_episode_generator(pi,
                                      env,
                                      timesteps_per_batch,
                                      stochastic=sample_stochastic)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    EpRewMean_MAX = 2.5e3  # best rolling mean reward seen so far (threshold)
    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    if task == 'sample_trajectory':
        # not elegant, i know :(
        sample_trajectory(load_model_path, max_sample_traj, traj_gen,
                          task_name, sample_stochastic)
        sys.exit()

    while True:
        # The callback sees this function's locals, so the names of the
        # training state variables are part of the de-facto interface.
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            # Divides by max_timesteps: only usable when that is the active
            # time constraint.
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)
        seg = next(seg_gen)
        # presumably fills seg["adv"] and seg["tdlamret"], which are read
        # right below - confirm against add_vtarg_and_adv
        add_vtarg_and_adv(seg, gamma, lam)

        ob, ac, atarg, vpred, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "vpred"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        # Standardized advantage function estimate. A zero standard deviation
        # (all advantages equal) would turn every entry into NaN and feed NaN
        # gradients to both optimisers, so divide by 1 in that case.
        adv_std = atarg.std()
        atarg = (atarg - atarg.mean()) / (adv_std if adv_std > 0 else 1.0)
        # shuffle=False: consecutive batches must stay in rollout order, since
        # each batch's windows are prefixed with the tail of the previous one.
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vpred=vpred,
                         vtarg=tdlamret),
                    shuffle=False)
        optim_batchsize = optim_batchsize or ob.shape[0]
        ob_dim = list(ob_space.shape)[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # one list of loss values per minibatch
            # Feed ob: put timesteps-1 copies of the env.reset observation in
            # front of the first batch, then split each batch's observations
            # with a sliding window.
            pre_obs = [seg["ob_reset"]] * (timesteps - 1)
            for batch in d.iterate_once(optim_batchsize):
                ob_fin, pre_obs = _window_observations(
                    pre_obs, batch['ob'], optim_batchsize, timesteps, ob_dim)
                *newlosses, g = lossandgrad(ob_fin, batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                # (original author's note: the g here seems to be all zeros)
                pol_adam.update(g, optim_stepsize * cur_lrmult)
                vf_g = vf_grad(ob_fin, batch["ac"], batch["atarg"],
                               batch["vtarg"], cur_lrmult)
                vf_adam.update(vf_g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        # NOTE(review): one more, unlogged optimisation pass over the data.
        # The source formatting was collapsed, so whether this pass belongs
        # after the epoch loop (as placed here) or inside it could not be
        # determined - confirm against the original file. Behaviour of the
        # pass itself is unchanged.
        pre_obs = [seg["ob_reset"]] * (timesteps - 1)
        for batch in d.iterate_once(optim_batchsize):
            ob_fin, pre_obs = _window_observations(
                pre_obs, batch['ob'], optim_batchsize, timesteps, ob_dim)
            *newlosses, g = lossandgrad(ob_fin, batch["ac"], batch["atarg"],
                                        batch["vtarg"], cur_lrmult)
            pol_adam.update(g, optim_stepsize * cur_lrmult)
            vf_g = vf_grad(ob_fin, batch["ac"], batch["atarg"],
                           batch["vtarg"], cur_lrmult)
            vf_adam.update(vf_g, optim_stepsize * cur_lrmult)

        logger.log("Evaluating losses...")
        losses = []
        loss_pre_obs = [seg["ob_reset"]] * (timesteps - 1)
        for batch in d.iterate_once(optim_batchsize):
            ob_fin, loss_pre_obs = _window_observations(
                loss_pre_obs, batch['ob'], optim_batchsize, timesteps, ob_dim)
            newlosses = compute_losses(ob_fin, batch["ac"], batch["atarg"],
                                       batch["vtarg"], cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        ep_rew_mean = np.mean(rewbuffer)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", ep_rew_mean)
        # Report every new best rolling mean reward on stdout.
        if ep_rew_mean > EpRewMean_MAX:
            EpRewMean_MAX = ep_rew_mean
            print(iters_so_far)
            print(ep_rew_mean)
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()