def init(self):
    import tensorflow as tf
    self.env = self.env_producer.get_new_environment()
    self.s0 = self.env.reset()
    self.session = utils.create_session(self.env_opts, False)
    with tf.device("/cpu:0"):
        with tf.variable_scope("gather-%s" % self.idx):
            pol = get_policy(self.env_opts, self.session)
            self.agent = PPOAgent(pol, self.session, "gather-%s" % self.idx, self.env_opts)
            self.trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "gather-%s" % self.idx)
            self.accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False)
                               for tv in self.trainable_vars]
            assign_ops = [self.trainable_vars[i].assign(self.accum_vars[i])
                          for i in range(len(self.trainable_vars))]
            self.assign_op = tf.group(assign_ops)
    self.session.run(tf.global_variables_initializer())
    self.cur_hidden_state = self.agent.get_init_hidden_state()
    self.episode = [self.s0], [], [], [], [], [self.cur_hidden_state], []
def evaluate(args):
    env = gym.make(args.env)
    env_params = get_env_params(env, args)
    env.close()

    agent = PPOAgent(args, env_params)
    agent.load_model(load_model_remark=args.load_model_remark)

    parent_conn, child_conn = Pipe()
    worker = AtariEnvironment(args.env, 1, child_conn, is_render=True,
                              max_episode_step=args.max_episode_step)
    worker.start()

    for i_episode in range(100):
        obs = worker.reset()
        while True:
            obs = np.expand_dims(obs, axis=0)
            action = agent.choose_action(obs / 255)
            parent_conn.send(action[0])
            obs_, r, done, info = parent_conn.recv()
            obs = obs_
            if done:
                break
def main():
    device = torch.device("cpu")
    env = UnityEnvironment(file_name='reacher20/reacher', base_port=64739)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = PPOAgent(state_size=state_size, action_size=action_size, hidden_size=256,
                     num_agents=num_agents, random_seed=0, ppo_epochs=4,
                     mini_batch_size=128, normalize_advantages=True, learning_rate=3e-4,
                     clip_gradients=True, gamma=0.99, tau=0.95, device=device)
    agent.load_model('assets/ppo_checkpoint_37.10.pth')
    test_agent(env, brain_name, agent, device, real_time=True)
def load_policy(self, file_path):
    tf.reset_default_graph()
    with tf.Session() as session:
        with tf.variable_scope(MASTER_NAME) as scope:
            policy = get_policy(env_opts, session)
            master_agent = PPOAgent(policy, session, 'master-0', env_opts)
        saver = tf.train.Saver(max_to_keep=1)
        saver.restore(session, tf.train.latest_checkpoint(file_path))
def init_agent(self):
    import tensorflow as tf
    env_opts = environments.get_env_options(self.env_name, self.env_producer.get_use_gpu())
    self.session = utils.create_session(env_opts, True)
    with tf.variable_scope("worker-%s" % self.idx):
        pol = get_policy(env_opts, self.session)
        self.agent = PPOAgent(pol, self.session, "worker-%s" % self.idx, env_opts)
        self.trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "worker-%s" % self.idx)
        self.accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False)
                           for tv in self.trainable_vars]
        p_vars = self.agent.p_opt.variables()
        v_vars = self.agent.v_opt.variables()
        self.p_opt_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False) for tv in p_vars]
        self.v_opt_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False) for tv in v_vars]
        p_assign_ops = [p_vars[i].assign(self.p_opt_vars[i]) for i in range(len(p_vars))]
        v_assign_ops = [v_vars[i].assign(self.v_opt_vars[i]) for i in range(len(v_vars))]
        assign_ops = [self.trainable_vars[i].assign(self.accum_vars[i])
                      for i in range(len(self.trainable_vars))]
        self.assign_op = tf.group(assign_ops + p_assign_ops + v_assign_ops)
    self.session.run(tf.global_variables_initializer())
    self.run()
def start(env):
    env = gym.make(env)
    frames = []
    MASTER_NAME = "master-0"
    IMAGE_PATH = "images/%s.gif" % env.spec.id
    tf.reset_default_graph()
    with tf.Session() as session:
        with tf.variable_scope(MASTER_NAME) as scope:
            env_opts = environments.get_env_options(env, False)
            policy = get_policy(env_opts, session)
            master_agent = PPOAgent(policy, session, MASTER_NAME, env_opts)
        saver = tf.train.Saver(max_to_keep=1)
        # Wrap the checkpoint restore in the try block so the fallback actually runs on failure.
        try:
            saver = tf.train.import_meta_graph(
                tf.train.latest_checkpoint("models/%s/" % env.spec.id) + ".meta")
            saver.restore(session, tf.train.latest_checkpoint("models/%s/" % env.spec.id))
        except:
            print("Failed to restore model, starting from scratch")
            session.run(tf.global_variables_initializer())
        global_step = 0
        while global_step < 1000:
            terminal = False
            s0 = env.reset()
            cum_rew = 0
            cur_hidden_state = master_agent.get_init_hidden_state()
            episode_count = 0
            while not terminal:
                episode_count += 1
                frames.append(env.render(mode='rgb_array'))
                action, h_out = master_agent.get_strict_sample(s0, cur_hidden_state)
                cur_hidden_state = h_out
                s0, r, terminal, _ = env.step(action)
                cum_rew += r
                global_step += 1
            print(episode_count, cum_rew)
        imageio.mimsave(IMAGE_PATH, frames, duration=1.0 / 60.0)
def ppo(env, brain_name, policy, config, train):
    if train:
        optimizer = optim.Adam(policy.parameters(),
                               config['hyperparameters']['adam_learning_rate'],
                               eps=config['hyperparameters']['adam_epsilon'])
        agent = PPOAgent(env, brain_name, policy, optimizer, config)
        all_scores = []
        averages = []
        last_max = 30.0
        for i in tqdm.tqdm(range(config['hyperparameters']['episode_count'])):
            agent.step()
            last_mean_reward = play_round(env, brain_name, policy, config)
            if i == 0:
                last_average = last_mean_reward
            else:
                last_average = (np.mean(np.array(all_scores[-100:]))
                                if len(all_scores) > 100 else np.mean(np.array(all_scores)))
            all_scores.append(last_mean_reward)
            averages.append(last_average)
            if last_average > last_max:
                torch.save(policy.state_dict(),
                           f"reacher-ppo/models/ppo-max-hiddensize-{config['hyperparameters']['hidden_size']}.pth")
                last_max = last_average
            clear_output(True)
            print('Episode: {} Total score this episode: {} Last {} average: {}'.format(
                i + 1, last_mean_reward, min(i + 1, 100), last_average))
        return all_scores, averages
    else:
        all_scores = []
        for i in range(20):
            score = play_round(env, brain_name, policy, config, train)
            all_scores.append(score)
        return [score], [np.mean(score)]
def start(env):
    env = gym.make(env)
    MASTER_NAME = "master-0"
    tf.reset_default_graph()
    with tf.Session() as session:
        with tf.variable_scope(MASTER_NAME) as scope:
            env_opts = environments.get_env_options(env, False)
            policy = get_policy(env_opts, session)
            master_agent = PPOAgent(policy, session, MASTER_NAME, env_opts)
        saver = tf.train.Saver(max_to_keep=1)
        # Wrap the checkpoint restore in the try block so the fallback actually runs on failure.
        try:
            saver = tf.train.import_meta_graph(
                tf.train.latest_checkpoint("models/%s/" % env.spec.id) + ".meta")
            saver.restore(session, tf.train.latest_checkpoint("models/%s/" % env.spec.id))
        except:
            print("Failed to restore model, starting from scratch")
            session.run(tf.global_variables_initializer())
        while True:
            terminal = False
            s0 = env.reset()
            cum_rew = 0
            cur_hidden_state = master_agent.get_init_hidden_state()
            episode_count = 0
            while not terminal:
                episode_count += 1
                env.render()
                action, h_out = master_agent.get_strict_sample(s0, cur_hidden_state)
                cur_hidden_state = h_out
                s0, r, terminal, _ = env.step(action)
                cum_rew += r
            print(episode_count, cum_rew)
def main(args):
    model_store_sprefix = "snapshot"

    # NormalizedEnv
    env = gym.make(args.env)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    env, generator, model, cont = get_functions(env, args)
    optimizer = optim.Adam(model.parameters(), lr=args.rllr)
    memory = Memory(args)
    agent = PPOAgent(args, model, optimizer, env, generator, memory, cont)

    if args.resume:
        agent.load_model(model_store_sprefix)
    agent.train(model_store_sprefix, args.save_interval)
state_size = list(states[0][0].transpose(2, 0, 1).shape)
state_size[0] *= NUM_CONSEQ_FRAMES

# create policy
policy = ActorCritic(state_size, action_size, model_path=ckpt_path).to(device)

trajectory_collector = TrajectoryCollector(env, policy, num_agents, is_visual=True,
                                           visual_state_size=NUM_CONSEQ_FRAMES, is_training=False)
agent = PPOAgent(policy)
state = trajectory_collector.last_states

is_random_run = [0, 1, 2]
for is_random in is_random_run:
    print(f"Starting {'' if is_random else 'non' } random run...")
    total_rewards = []
    avg_episode_length = 0
    episode_lengths = []
    for i_run in range(NUM_RUNS):
        sum_reward = 0
        ep = 0
        while True:
            ep += 1
            if is_random == 1:
class GatheringWorker:
    def __init__(self, idx, env_producer, env_opts, rollout_size, worker_queue, weights_queue):
        self.session = None
        self.idx = idx
        self.env_producer = env_producer
        self.env = None
        self.s0 = None
        self.trainable_vars = None
        self.agent = None
        self.env_opts = env_opts
        self.cur_hidden_state = None
        self.episode = None
        self.episodes = []
        self.batch_size = env_opts["batch_size"]
        self.terminal = False
        self.recurrent_policy = env_opts["recurrent"]
        self.timestep_size = env_opts["timestep_size"]
        if not self.recurrent_policy:
            self.timestep_size = 1
        self.discount_factor = env_opts["discount_factor"]
        self.gae_factor = env_opts["gae_factor"]
        self.max_episode_steps = env_opts["max_episode_steps"]
        self.rollout_size = rollout_size
        self.discrete_env = env_opts["discrete"]
        self.ep_count = 0
        self.episode_step = 0
        self.cum_rew = 0
        self.global_step = 0
        self.sampled_action = None
        self.sampled_a_prob = None
        self.accum_vars = None
        self.assign_op = None
        self.worker_queue = worker_queue
        self.weights_queue = weights_queue
        self.stats = []
        self.get_experience()

    def get_experience(self):
        self.init()
        action, a_prob, h_out, v_out = self.agent.get_sample(self.s0, self.cur_hidden_state)
        self.sampled_action = action
        self.sampled_a_prob = a_prob
        while True:
            self.stats = []
            self.apply_weights()
            self.episodes = []
            for i in range(self.rollout_size):
                if self.terminal:
                    if self.episode_step == self.max_episode_steps and len(self.episode[1]) > 0:
                        self.episode[4][-1] = False
                    self.episode_step = 0
                    self.s0 = self.env.reset()
                    self.episodes.append(self.episode)
                    self.cur_hidden_state = self.agent.get_init_hidden_state()
                    self.episode = [self.s0], [], [], [], [], [self.cur_hidden_state], []
                    self.stats.append({
                        "reward": self.cum_rew,
                        "step": self.ep_count,
                        "a_probs": self.sampled_a_prob,
                        "picked_a": self.sampled_action,
                        "a_dim": self.env_opts["action_dim"],
                        "discrete": self.env_opts["discrete"]
                    })
                    self.terminal = False
                    self.ep_count += 1
                    self.cum_rew = 0
                action, a_prob, h_out, v_out = self.agent.get_sample(self.s0, self.cur_hidden_state)
                self.episode_step += 1
                self.global_step += 1
                if np.random.random() > 0.99:
                    self.sampled_action = action
                    self.sampled_a_prob = a_prob
                self.cur_hidden_state = h_out
                self.s0, r, self.terminal, _ = self.env.step(action)
                self.cum_rew += r
                self.episode[0].append(self.s0)
                self.episode[1].append(self.agent.transform_reward(r))
                self.episode[2].append(action)
                self.episode[3].append(a_prob)
                self.episode[4].append(self.terminal)
                self.episode[5].append(h_out)
                self.episode[6].append(v_out)
            self.episodes.append(self.episode)
            self.episode = [self.s0], [], [], [], [], [self.cur_hidden_state], []
            result = self.process_episodes(self.episodes)
            self.worker_queue.put(result)

    def apply_weights(self):
        weights = self.weights_queue.get()
        feed_dict = {}
        for i, t in enumerate(self.accum_vars):
            feed_dict[t] = weights[i]
        self.session.run(self.assign_op, feed_dict=feed_dict)

    def init(self):
        import tensorflow as tf
        self.env = self.env_producer.get_new_environment()
        self.s0 = self.env.reset()
        self.session = utils.create_session(self.env_opts, False)
        with tf.device("/cpu:0"):
            with tf.variable_scope("gather-%s" % self.idx):
                pol = get_policy(self.env_opts, self.session)
                self.agent = PPOAgent(pol, self.session, "gather-%s" % self.idx, self.env_opts)
                self.trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "gather-%s" % self.idx)
                self.accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False)
                                   for tv in self.trainable_vars]
                assign_ops = [self.trainable_vars[i].assign(self.accum_vars[i])
                              for i in range(len(self.trainable_vars))]
                self.assign_op = tf.group(assign_ops)
        self.session.run(tf.global_variables_initializer())
        self.cur_hidden_state = self.agent.get_init_hidden_state()
        self.episode = [self.s0], [], [], [], [], [self.cur_hidden_state], []

    def process_episodes(self, episodes):
        all_states = []
        all_advantages = []
        all_returns = []
        all_picked_actions = []
        all_old_actions_probs = []
        all_pred_values = []
        all_hidden_states = []
        for episode in episodes:
            st, rewards, picked_actions, old_action_probs, terminals, hidden_states, values = episode
            if len(rewards) == 0:
                continue
            states = np.asarray(st)
            pred_values = np.zeros(len(values) + 1)
            pred_values[:-1] = np.array(values)
            episode_len = len(rewards)
            advantages = np.zeros((episode_len,))
            returns = np.zeros((episode_len + 1,))
            if terminals[-1]:
                pred_values[-1] = 0
            else:
                _, _, _, v_out = self.agent.get_sample(states[-1], hidden_states[-1])
                pred_values[-1] = v_out
            returns[-1] = pred_values[-1]
            for i in reversed(range(episode_len)):
                r = rewards[i]
                next_v = pred_values[i + 1]
                cur_v = pred_values[i]
                diff = r + self.discount_factor * next_v - cur_v
                if i == episode_len - 1:
                    advantages[i] = diff
                else:
                    advantages[i] = diff + self.discount_factor * self.gae_factor * advantages[i + 1]
                returns[i] = r + self.discount_factor * returns[i + 1]
            returns = returns[:-1]
            ep_states = states[:-1]
            ep_advantages = advantages
            ep_returns = returns
            ep_picked_actions = np.array(picked_actions)
            ep_old_action_probs = np.array(old_action_probs)
            ep_all_pred_values = pred_values
            ep_hidden_state = np.array(hidden_states[:-1])
            splitted = utils.split_episode(ep_states, ep_advantages, ep_returns, ep_picked_actions,
                                           ep_old_action_probs, ep_all_pred_values, ep_hidden_state,
                                           self.timestep_size)
            for (b_states, b_hidden_state, b_advantages, b_returns, b_picked_actions,
                 b_old_action_probs, b_all_pred_values) in splitted:
                all_states.append(b_states)
                all_advantages.append(b_advantages)
                all_returns.append(b_returns)
                all_picked_actions.append(b_picked_actions)
                all_old_actions_probs.append(b_old_action_probs)
                all_pred_values.append(b_all_pred_values)
                all_hidden_states.append(b_hidden_state)
        all_states = np.array(all_states)
        all_advantages = np.array(all_advantages)
        all_picked_actions = np.array(all_picked_actions)
        all_returns = np.array(all_returns)
        all_old_actions_probs = np.array(all_old_actions_probs)
        all_pred_values = np.array(all_pred_values)
        all_hidden_states = np.array(all_hidden_states)
        return [all_states, all_advantages, all_picked_actions, all_returns, all_old_actions_probs,
                all_pred_values, all_hidden_states, self.ep_count, self.stats, self.idx]
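# --- Illustrative aside (not taken from any of the examples above) ---
# The advantage/return recursion that GatheringWorker.process_episodes implements,
# isolated as a small NumPy helper for readability. "gamma" and "lam" correspond to
# the discount_factor and gae_factor fields used above; the helper name and signature
# are hypothetical, a minimal sketch rather than part of that codebase.
import numpy as np

def gae_advantages_and_returns(rewards, values, bootstrap_value, gamma, lam):
    """rewards: [T]; values: [T] predicted V(s_t); bootstrap_value: V(s_T), 0 if the episode ended."""
    T = len(rewards)
    v = np.append(np.asarray(values, dtype=np.float64), bootstrap_value)
    advantages = np.zeros(T)
    returns = np.zeros(T + 1)
    returns[-1] = bootstrap_value
    gae = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * v[t + 1] - v[t]      # TD residual
        gae = delta + gamma * lam * gae                   # GAE recursion
        advantages[t] = gae
        returns[t] = rewards[t] + gamma * returns[t + 1]  # discounted return target
    return advantages, returns[:-1]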
state_size = list(states[0][0].transpose(2, 0, 1).shape)
state_size[0] *= NUM_CONSEQ_FRAMES

# torch.manual_seed(SEED)
# np.random.seed(SEED)

# create policy to be trained & optimizer
policy = ActorCritic(state_size, action_size).to(device)

writer = tensorboardX.SummaryWriter(comment=f"-ejik")

trajectory_collector = TrajectoryCollector(env, policy, num_agents, tmax=TMAX, gamma=GAMMA,
                                           gae_lambda=GAE_LAMBDA, debug=debug, is_visual=True,
                                           visual_state_size=NUM_CONSEQ_FRAMES)
tb_tracker = TBMeanTracker(writer, EPOCHS)

agent = PPOAgent(policy, tb_tracker, LR, EPSILON, BETA)

# scheduler = lr_scheduler.LambdaLR(agent.optimizer, lambda ep: 0.1 if ep == STEP_DECAY else 1)
scheduler = lr_scheduler.StepLR(agent.optimizer, step_size=STEP_DECAY, gamma=GAMMA)

n_episodes = 0
max_score = -np.Inf

traj_attributes = ["states", "actions", "log_probs", "advantages", "returns"]
solved = False
start = None
step = 0

with RewardTracker(writer, mean_window=AVG_WIN, print_every=AVG_WIN // 2) as reward_tracker:
    d = datetime.datetime.today()
    print(f"Started training run: at {d.strftime('%d-%m-%Y %H:%M:%S')}")
class Worker:
    def __init__(self, env_producer, idx, env_opts, num_gather_workers,
                 master_weights_in_queue, master_weights_out_queue):
        self.env_opts = env_opts
        self.num_gather_workers = num_gather_workers
        self.env_producer = env_producer
        self.batch_size = env_opts["batch_size"]
        self.clip_eps = env_opts["clip_eps"]
        self.grad_step = env_opts["grad_step"]
        self.epochs = env_opts["epochs"]
        self.entropy_coef = env_opts["entropy_coef"]
        self.state_dim = env_opts["state_dim"]
        self.idx = idx
        self.session = None
        self.episode_step = 0
        self.initialized = False
        self.beta = self.env_opts["init_beta"]
        self.eta = self.env_opts["eta"]
        self.kl_target = self.env_opts["kl_target"]
        self.use_kl_loss = self.env_opts["use_kl_loss"]
        self.lr_multiplier = 1.0
        self.prev_batch = None
        self.variables_file_path = "models/%s/variables.txt" % env_opts["env_name"]
        self.worker_queue = Queue()
        self.weights_queues = [Queue() for _ in range(self.num_gather_workers)]
        self.master_weights_in_queue = master_weights_in_queue
        self.master_weights_out_queue = master_weights_out_queue
        self.init_workers()
        self.agent = None
        self.trainable_vars = None
        self.accum_vars = None
        self.assign_op = None
        self.p_opt_vars = None
        self.v_opt_vars = None
        self.init_agent()

    def init_agent(self):
        import tensorflow as tf
        self.session = utils.create_session(self.env_opts, True)
        with tf.variable_scope("worker-%s" % self.idx):
            pol = get_policy(self.env_opts, self.session)
            self.agent = PPOAgent(pol, self.session, "worker-%s" % self.idx, self.env_opts)
            self.trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "worker-%s" % self.idx)
            self.accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False)
                               for tv in self.trainable_vars]
            p_vars = self.agent.p_opt.variables()
            v_vars = self.agent.v_opt.variables()
            self.p_opt_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False) for tv in p_vars]
            self.v_opt_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False) for tv in v_vars]
            p_assign_ops = [p_vars[i].assign(self.p_opt_vars[i]) for i in range(len(p_vars))]
            v_assign_ops = [v_vars[i].assign(self.v_opt_vars[i]) for i in range(len(v_vars))]
            assign_ops = [self.trainable_vars[i].assign(self.accum_vars[i])
                          for i in range(len(self.trainable_vars))]
            self.assign_op = tf.group(assign_ops + p_assign_ops + v_assign_ops)
        self.session.run(tf.global_variables_initializer())
        self.run()

    def init_workers(self):
        for i in range(self.num_gather_workers):
            rollout_size = self.env_opts["rollout_size"] // self.num_gather_workers
            t = Process(target=make_worker,
                        args=(i, self.env_producer, self.env_opts, self.worker_queue,
                              self.weights_queues[i], rollout_size))
            t.start()

    def run(self):
        while True:
            self.apply_shared_variables()
            self.apply_weights_to_gather_workers()
            stats = self.compute_grads_and_stats()
            self.send_to_master(stats)

    def send_to_master(self, stats):
        weights, p_opt_weights, v_opt_weights = self.session.run([
            self.trainable_vars,
            self.agent.p_opt.variables(),
            self.agent.v_opt.variables()
        ])
        arr = [self.beta, self.lr_multiplier, p_opt_weights, v_opt_weights, weights, stats]
        self.master_weights_out_queue.put(arr)

    def apply_weights_to_gather_workers(self):
        weights = self.session.run(self.trainable_vars)
        for q in self.weights_queues:
            q.put(weights)

    def apply_shared_variables(self):
        beta, lr_multiplier, p_opt_weights, v_opt_weights, weights = self.master_weights_in_queue.get()
        self.beta = beta
        self.lr_multiplier = lr_multiplier
        fd = {}
        for i, t in enumerate(self.accum_vars):
            fd[t] = weights[i]
        for i, t in enumerate(self.p_opt_vars):
            fd[t] = p_opt_weights[i]
        for i, t in enumerate(self.v_opt_vars):
            fd[t] = v_opt_weights[i]
        self.session.run(self.assign_op, feed_dict=fd)

    def compute_grads_and_stats(self):
        results = []
        for i in range(self.num_gather_workers):
            results.append(self.worker_queue.get())
        w_idx = list(range(self.num_gather_workers))
        cur_all_states = np.concatenate([results[i][0] for i in w_idx], axis=0)
        cur_all_advantages = np.concatenate([results[i][1] for i in w_idx], axis=0)
        cur_all_picked_actions = np.concatenate([results[i][2] for i in w_idx], axis=0)
        cur_all_returns = np.concatenate([results[i][3] for i in w_idx], axis=0)
        cur_all_old_actions_probs = np.concatenate([results[i][4] for i in w_idx], axis=0)
        cur_all_pred_values = np.concatenate([results[i][5] for i in w_idx], axis=0)
        cur_all_hidden_states = np.concatenate([results[i][6] for i in w_idx], axis=0)
        if self.prev_batch is not None:
            (prev_all_states, prev_all_advantages, prev_all_picked_actions, prev_all_returns,
             prev_all_old_actions_probs, prev_all_pred_values, prev_all_hidden_states) = self.prev_batch
            all_states = np.concatenate([cur_all_states, prev_all_states], axis=0)
            all_advantages = np.concatenate([cur_all_advantages, prev_all_advantages], axis=0)
            all_picked_actions = np.concatenate([cur_all_picked_actions, prev_all_picked_actions], axis=0)
            all_returns = np.concatenate([cur_all_returns, prev_all_returns], axis=0)
            all_old_actions_probs = np.concatenate([cur_all_old_actions_probs, prev_all_old_actions_probs], axis=0)
            all_pred_values = np.concatenate([cur_all_pred_values, prev_all_pred_values], axis=0)
            all_hidden_states = np.concatenate([cur_all_hidden_states, prev_all_hidden_states], axis=0)
        else:
            all_states = cur_all_states
            all_advantages = cur_all_advantages
            all_picked_actions = cur_all_picked_actions
            all_returns = cur_all_returns
            all_old_actions_probs = cur_all_old_actions_probs
            all_pred_values = cur_all_pred_values
            all_hidden_states = cur_all_hidden_states
        self.prev_batch = [cur_all_states, cur_all_advantages, cur_all_picked_actions, cur_all_returns,
                           cur_all_old_actions_probs, cur_all_pred_values, cur_all_hidden_states]
        all_advantages = (all_advantages - all_advantages.mean()) / (max(all_advantages.std(), 1e-4))
        first_gather = [x for x in results if x[9] == 0][0]
        self.episode_step = first_gather[7]
        stats = first_gather[8]
        sz = len(all_states)
        n_batches = (sz - 1) // self.batch_size + 1
        steps = 0
        cur_kl = 0
        entropy = 0
        hinge = 0
        src_policy_loss = 0
        vloss = 0
        ploss = 0
        for cur_epoch in range(self.epochs):
            idx = np.arange(len(all_states))
            np.random.shuffle(idx)
            all_states = all_states[idx]
            all_returns = all_returns[idx]
            all_picked_actions = all_picked_actions[idx]
            all_old_actions_probs = all_old_actions_probs[idx]
            all_advantages = all_advantages[idx]
            all_pred_values = all_pred_values[idx]
            all_hidden_states = all_hidden_states[idx]
            for b in range(n_batches):
                start = b * self.batch_size
                end = min(sz, (b + 1) * self.batch_size)
                states_b = all_states[start:end]
                returns_b = all_returns[start:end]
                picked_actions_b = all_picked_actions[start:end]
                old_action_probs_b = all_old_actions_probs[start:end]
                advantages_b = all_advantages[start:end]
                hidden_states_b = all_hidden_states[start:end]
                old_values_b = all_pred_values[start:end]
                cur_kl, entropy, hinge, src_policy_loss, vloss, ploss = self.agent.train(
                    states_b, advantages_b, returns_b, picked_actions_b, old_action_probs_b,
                    hidden_states_b, old_values_b, self.clip_eps, self.beta, self.eta,
                    self.grad_step * self.lr_multiplier)
                steps += 1
            if cur_kl > self.kl_target * 4 and self.use_kl_loss:
                break
        if self.use_kl_loss:
            if cur_kl > self.kl_target * 2:
                self.beta = np.minimum(35, 1.5 * self.beta)
                if self.beta > 30.0:
                    self.lr_multiplier /= 1.5
            elif cur_kl < self.kl_target / 2:
                self.beta = np.maximum(1 / 35, self.beta / 1.5)
                if self.beta <= 1 / 30.0:
                    self.lr_multiplier *= 1.5
            self.lr_multiplier = max(min(self.lr_multiplier, 3.0), 0.1)
        train_stats = {
            "stats": stats,
            "kl": cur_kl,
            "entropy": entropy,
            "hinge": hinge,
            "src_policy_loss": src_policy_loss,
            "vloss": vloss,
            "ploss": ploss,
            "lr_multiplier": self.lr_multiplier,
            "beta": self.beta,
            "step": self.episode_step,
            "idx": self.idx
        }
        return train_stats
config.gradient_clip = 5
config.rollout_length = 20 * 512
config.optimization_epochs = 10
config.num_mini_batches = 512
config.ppo_ratio_clip = 0.2
config.log_interval = 3 * 200 * 512
config.max_steps = 2e7
config.eval_episodes = 10
# config.logger = get_logger()

select_device(0)
print("GPU available: {}".format(torch.cuda.is_available()))
print("GPU tensor test: {}".format(torch.rand(3, 3).cuda()))

agent = PPOAgent(config)
random_seed()
config = agent.config
agent.actor_critic.load_state_dict(torch.load('../checkpoints/ppo_checkpoint.pth'))

score = 0  # initialize the score
for i in range(3):
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations
    for j in range(2000):
        action = agent.act(state)
        env_info = env.step(action.cpu().detach().numpy())[brain_name]
def train(args):
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    envs = MultiprocessEnvironment.create_mario_env(num_envs=args.jobs,
                                                    world=args.world,
                                                    stage=args.stage)
    actor_critic = RecurrentPolicy(state_frame_channels=envs.observation_shape[0],
                                   action_space_size=envs.action_space_size,
                                   hidden_layer_size=args.hidden_size,
                                   prev_actions_out_size=args.prev_actions_hidden_size,
                                   recurrent_hidden_size=args.recurrent_hidden_size,
                                   device=device)
    experience = ExperienceStorage(num_steps=args.steps_per_update,
                                   num_envs=args.jobs,
                                   observation_shape=envs.observation_shape,
                                   recurrent_hidden_size=args.recurrent_hidden_size,
                                   device=device)

    initial_observations = envs.reset()
    experience.insert_initial_observations(initial_observations)

    tb_writer = SummaryWriter()

    num_updates = args.steps // (args.jobs * args.steps_per_update)
    agent = PPOAgent(actor_critic,
                     lr=args.lr,
                     lr_lambda=lambda step: 1 - (step / float(num_updates)),
                     policy_loss_coef=args.policy_loss_coef,
                     value_loss_coef=args.value_loss_coef,
                     entropy_loss_coef=args.entropy_loss_coef,
                     max_grad_norm=args.max_grad_norm,
                     clip_threshold=args.ppo_clip_threshold,
                     epochs=args.ppo_epochs,
                     minibatches=args.ppo_minibatches)

    for update_step in tqdm(range(num_updates)):
        episode_rewards = []
        for step in range(args.steps_per_update):
            with torch.no_grad():
                actor_input = experience.get_actor_input(step)
                (values,
                 actions,
                 action_log_probs,
                 _,  # Action distribution entropy is not needed.
                 recurrent_hidden_states) = actor_critic.act(*actor_input)

            observations, rewards, done_values, info_dicts = envs.step(actions)
            masks = 1 - done_values
            experience.insert(observations, actions, action_log_probs, rewards,
                              values, masks, recurrent_hidden_states)

            for done, info in zip(done_values, info_dicts):
                if done:
                    level_completed_percentage = info['x_pos'] / MAX_X
                    episode_rewards.append(level_completed_percentage)

        with torch.no_grad():
            critic_input = experience.get_critic_input()
            next_value = actor_critic.value(*critic_input)

        experience.compute_gae_returns(next_value,
                                       gamma=args.discount,
                                       gae_lambda=args.gae_lambda)

        losses = agent.update(experience)

        if episode_rewards:
            with torch.no_grad():
                cumulative_reward = experience.rewards.sum((0, 2))
                mean_reward = cumulative_reward.mean()
                std_reward = cumulative_reward.std()

            tb_writer.add_scalar('mario/lr', agent.current_lr(), update_step)
            tb_writer.add_scalars('mario/level_progress', {
                'min': np.min(episode_rewards),
                'max': np.max(episode_rewards),
                'mean': np.mean(episode_rewards),
                'median': np.median(episode_rewards),
            }, update_step)
            tb_writer.add_scalars('mario/reward', {'mean': mean_reward, 'std': std_reward}, update_step)
            tb_writer.add_scalars('mario/loss', {
                'policy': losses['policy_loss'],
                'value': losses['value_loss'],
            }, update_step)
            tb_writer.add_scalar('mario/action_dist_entropy', losses['action_dist_entropy'], update_step)

            if np.min(episode_rewards) == 1.0:
                model_path = 'models/super_model_{}.bin'.format(update_step + 1)
                torch.save(actor_critic.state_dict(), model_path)

        save_model = (update_step % args.save_interval) == (args.save_interval - 1)
        if save_model:
            model_path = 'models/model_{}.bin'.format(update_step + 1)
            torch.save(actor_critic.state_dict(), model_path)

    tb_writer.close()
def start(self):
    import tensorflow as tf
    self.summary_writer = tf.summary.FileWriter("logs/%s" % self.env_opts["env_name"])
    self.session = utils.create_session(self.env_opts, True)
    with tf.variable_scope("master-0"):
        pol = get_policy(self.env_opts, self.session)
        self.agent = PPOAgent(pol, self.session, "master-0", self.env_opts)
        self.trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "master-0")
        self.accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False)
                           for tv in self.trainable_vars]
        p_vars = self.agent.p_opt.variables()
        v_vars = self.agent.v_opt.variables()
        self.p_opt_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False) for tv in p_vars]
        self.v_opt_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False) for tv in v_vars]
        p_assign_ops = [p_vars[i].assign(self.p_opt_vars[i]) for i in range(len(p_vars))]
        v_assign_ops = [v_vars[i].assign(self.v_opt_vars[i]) for i in range(len(v_vars))]
        assign_ops = [self.trainable_vars[i].assign(self.accum_vars[i])
                      for i in range(len(self.trainable_vars))]
        self.assign_op = tf.group(assign_ops + p_assign_ops + v_assign_ops)
    self.restore_variables()
    self.saver = tf.train.Saver(max_to_keep=1)
    self.session.run(tf.global_variables_initializer())
    try:
        self.saver = tf.train.import_meta_graph(
            tf.train.latest_checkpoint("models/%s/" % self.env_opts["env_name"]) + ".meta")
        self.saver.restore(self.session,
                           tf.train.latest_checkpoint("models/%s/" % self.env_opts["env_name"]))
    except:
        print("failed to restore model")
    while True:
        if self.iter_count % 10 == 0:
            print("Saving model...")
            self.save_variables()
            self.saver.save(self.session, self.model_path, self.iter_count)
            print("Model saved")
        self.broadcast_weights()
        self.merge_weights()
        self.iter_count += 1
def ppo():
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env = UnityEnvironment(file_name="../Reacher_Linux/Reacher.x86_64", no_graphics=True)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents in the environment
    print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    state = env_info.vector_observations[0]
    print('States look like:', state)
    state_size = len(state)
    print('States have length:', state_size)

    config = Config()
    config.env = env
    config.actor_critic_fn = lambda: ActorCritic(actor=Actor(state_size, action_size),
                                                 critic=Critic(state_size))
    config.discount = 0.99
    config.use_gae = True
    config.gae_tau = 0.95
    config.gradient_clip = 5
    config.rollout_length = 2048
    config.optimization_epochs = 5
    config.num_mini_batches = 512
    config.ppo_ratio_clip = 0.2
    config.log_interval = 10 * 2048
    config.max_steps = 2e7
    config.eval_episodes = 10
    # config.logger = get_logger()

    print("GPU available: {}".format(torch.cuda.is_available()))
    print("GPU tensor test: {}".format(torch.rand(3, 3).cuda()))

    agent = PPOAgent(config)
    random_seed()
    config = agent.config

    t0 = time.time()
    scores = []
    scores_window = deque(maxlen=100)  # last 100 scores
    while True:
        if config.log_interval and not agent.total_steps % config.log_interval and len(agent.episode_rewards):
            rewards = agent.episode_rewards
            for reward in rewards:
                scores.append(reward)
                scores_window.append(reward)
            agent.episode_rewards = []
            print('\r===> Average Score: {:d} episodes {:.2f}'.format(len(scores), np.mean(scores_window)))
            if np.mean(scores_window) >= 1.0:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                    len(scores_window), np.mean(scores_window)))
                torch.save(agent.actor_critic.state_dict(), '../checkpoints/ppo_checkpoint.pth')
                break
            print('Total steps %d, returns %d/%.2f/%.2f/%.2f/%.2f (count/mean/median/min/max), %.2f steps/s' % (
                agent.total_steps, len(rewards), np.mean(rewards), np.median(rewards),
                np.min(rewards), np.max(rewards), config.log_interval / (time.time() - t0)))
            t0 = time.time()
        agent.step()
    return scores
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 13 16:49:00 2022

@author: nbrow
"""
from agent import PPOAgent
import gym
import numpy as np
import os
from UC_Env import UC_Env

# tf.set_random_seed(0)

if __name__ == "__main__":
    # newest gym fixed bugs in 'BipedalWalker-v2' and now it's called 'BipedalWalker-v3'
    env = UC_Env()
    agent = PPOAgent(env)
    agent.run_batch()  # train as PPO
    # agent.run_multiprocesses(num_worker=16)  # train PPO multiprocessed (fastest)
    # agent.test()
                    default=False)
parser.add_argument('-threshold_score', type=int, default=200)
parser.add_argument('-best_avg_reward', type=int, default=-200)
parser.add_argument('-test_env', type=bool, default=False)
args = parser.parse_args()

env = gym.make(args.env)
envs = SubprocVecEnv([make_env(args.env) for i in range(args.n_envs)])
n_inputs = envs.observation_space.shape[0]
n_outs = envs.action_space.n

agent = PPOAgent(lr=args.lr, n_inputs=n_inputs, n_hidden=args.n_hidden, n_outs=n_outs,
                 td_n=args.td_n, ppo_epochs=args.ppo_epochs, mini_batch_size=args.mini_batch_size)

if args.load_best_pretrained_model:
    agent.load_model('../models/ppo/model.pt')
    print('Loaded pretrained model')

if args.test_env:
    state = env.reset()
    done = False
    score = 0
    while not done:
        env.render()
        dist, value = agent.step(state)
def experiment(hidden_size=64, lr=3e-4, num_steps=2048, mini_batch_size=32, ppo_epochs=10,
               threshold_reward=10, max_episodes=15, nrmlz_adv=True, gamma=0.99, tau=0.95,
               clip_gradients=True):
    '''
    :param hidden_size: number of neurons for the layers of the model
    :param lr: learning rate
    :param num_steps: maximum duration of one epoch
    :param mini_batch_size: mini batch size for ppo
    :param ppo_epochs: number of epochs for ppo to learn
    :param threshold_reward: what is the goal of the training
    :param max_episodes: maximum duration of the training
    :param nrmlz_adv: True, if advantages should be normalized before PPO
    :param clip_gradients: True if gradients should be clipped after PPO
    :return: list of scores and list of test_rewards
    '''
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    scores_window = deque(maxlen=100)
    test_rewards = []
    moving_averages = []

    env = UnityEnvironment(file_name='reacher20/reacher', base_port=64739)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = PPOAgent(learning_rate=lr, state_size=state_size, action_size=action_size,
                     hidden_size=hidden_size, num_agents=num_agents, random_seed=0,
                     ppo_epochs=ppo_epochs, mini_batch_size=mini_batch_size,
                     normalize_advantages=nrmlz_adv, clip_gradients=clip_gradients,
                     gamma=gamma, tau=tau, device=device)

    # while episode < max_episodes and not early_stop:
    for episode in tqdm(range(max_episodes)):
        log_probs = []
        values = []
        states_list = []
        actions_list = []
        rewards = []
        masks = []
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations

        for duration in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            action, value, log_prob = agent.act(state)

            env_info = env.step(action.cpu().data.numpy())[brain_name]  # send all actions to the environment
            next_state = env_info.vector_observations  # get next state (for each agent)
            reward = env_info.rewards  # get reward (for each agent)
            dones = np.array(env_info.local_done)  # see if episode finished
            if reward == None:
                pass

            log_probs.append(log_prob)
            values.append(value)
            reward_t = torch.FloatTensor(reward).unsqueeze(1).to(device)
            masks_t = torch.FloatTensor(1 - dones)
            rewards.append(reward_t)
            masks.append(masks_t)
            states_list.append(state)
            actions_list.append(action)

            state = next_state
            if np.any(dones):
                break

        next_state = torch.FloatTensor(state).to(device)
        _, next_value, _ = agent.act(next_state)
        agent.step(states=states_list, actions=actions_list, values=values, log_probs=log_probs,
                   rewards=rewards, masks=masks, next_value=next_value)

        test_mean_reward = test_agent(env, brain_name, agent, device)
        test_rewards.append(test_mean_reward)
        scores_window.append(test_mean_reward)
        moving_averages.append(np.mean(scores_window))
        print('Episode {}, Total score this episode: {}, Last {} average: {}'.format(
            episode, test_mean_reward, min(episode, 100), np.mean(scores_window)))
        if np.mean(scores_window) > threshold_reward:
            agent.save_model(
                f"ppo_checkpoint_{test_mean_reward}_e{episode}_hs{hidden_size}_lr{lr}_st{num_steps}_b{mini_batch_size}_ppo{ppo_epochs}_r{threshold_reward}_e{episode}_adv{nrmlz_adv}_{test_mean_reward}.pth")
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                episode, test_mean_reward))
            break
        episode += 1

    env.close()
    return scores_window, test_rewards, moving_averages
def main():
    mujoco = True
    render = True
    save_models = False  # Save the models
    training_mode = False  # Train the agent or test a memory model
    reward_threshold = None
    # reward_threshold = 290

    update_threshold = 800  # Iterations before update the Policy
    plot_batch_threshold = 500  # Episodes included in the partial plot
    episode_max = 30000

    # update_threshold = 1000  # Iterations before update the Policy
    # plot_batch_threshold = 100  # Episodes included in the partial plot
    # episode_max = 3000

    if mujoco:
        env_name = 'Humanoid-v2'
        epsilon_discount = 5.0e-3
    else:
        env_name = 'MountainCarContinuous-v0'
        epsilon_discount = 4.0e-4

    env = gym.make(env_name)
    check_folder(env_name)
    env.seed(69)
    np.random.seed(69)
    tf.random.set_seed(69)

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    epsilon = 0.9

    agent = PPOAgent(state_dim, action_dim, env, epsilon, mujoco)

    if not training_mode:
        path = 'test_models/' + env_name
        agent.load_models(path)

    rewards = []
    rewards_means = []
    batch_rewards = []
    batch_solved_reward = []
    times = []
    batch_times = []
    updates_counter = 0

    tb_writer = agent.get_summary_writer()
    rewards_metric = tf.keras.metrics.Mean(name='rewards_metric')

    for epis in range(1, episode_max + 1):
        try:
            total_reward, time, updates_counter = run_episode(env, agent, state_dim, render,
                                                              training_mode, updates_counter,
                                                              update_threshold)
            print('Episode {} Elapsed time: {} Total reward: {} Epsilon: {}'.format(
                epis, time, int(total_reward), agent.get_epsilon()))
            batch_rewards.append(int(total_reward))
            batch_times.append(time)
            epsilon -= epsilon_discount

            rewards_metric(total_reward)
            with tb_writer.as_default():
                tf.summary.scalar('rewards', rewards_metric.result(), step=epis)
            rewards_metric.reset_states()

            if epsilon >= 0.2 and training_mode:
                agent.set_epsilon(epsilon)

            if save_models:
                agent.save_models(epis, '')

            if epis % plot_batch_threshold == 0:
                print('=====================')
                print('|-------Batch-------|')
                print('=====================')
                plot(env_name, batch_rewards, "+", 'Rewards of batch until episode {}'.format(epis),
                     'Episodes', 'Rewards', str(epis) + '_Batch')
                plot(env_name, batch_times, ".", 'Times of batch until episode {}'.format(epis),
                     'Episodes', 'Times', str(epis) + '_Batch')
                rewards_mean = np.mean(batch_rewards)
                print('Max Reward:', np.max(batch_rewards))
                print('Min Reward:', np.min(batch_rewards))
                print('Avg Reward:', rewards_mean)
                print('')
                rewards = rewards + batch_rewards
                times = times + batch_times
                rewards_means.append(rewards_mean)
                batch_rewards = []
                batch_times = []
                print('============================')
                print('|-------Accumulative-------|')
                print('============================')
                plot(env_name, rewards, "+", 'Total rewards until episode {}'.format(epis),
                     'Episodes', 'Rewards', str(epis) + '_Total')
                plot(env_name, times, ".", 'Total times until episode {}'.format(epis),
                     'Episodes', 'Times', str(epis) + '_Total')

            if reward_threshold:
                if len(batch_solved_reward) == 100:
                    if np.mean(batch_solved_reward) >= reward_threshold:
                        rewards = rewards + batch_rewards
                        times = times + batch_times
                        print('============================')
                        print('Reward threshold reached after {} episodes'.format(epis))
                        print('============================')
                        agent.save_models(epis, 'solved')
                        break
                    else:
                        del batch_solved_reward[0]
                        batch_solved_reward.append(total_reward)
                else:
                    batch_solved_reward.append(total_reward)

        except KeyboardInterrupt:
            print('Training loop interrupted, saving last models . . .')
            agent.save_models(epis, 'forced')
            plot(env_name, rewards_means, "ro-",
                 'Average reward per batch until episode {}'.format(epis),
                 'Batchs', 'Rewards', str(epis) + '_BatchAverage')
            exit()

    agent.save_models(episode_max, 'finalized')
    plot(env_name, rewards_means, "ro-",
         'Average reward per batch until episode {}'.format(epis),
         'Batchs', 'Rewards', str(epis) + '_BatchAverage')
args = get_args()
torch.set_default_tensor_type('torch.cuda.FloatTensor')
writer = SummaryWriter(log_dir='logs/circuit')

if os.name == 'nt':  # windows
    binary = os.path.join('cicuit2', 'circuit_2')
else:
    binary = 'circuit_linux/circuit_linux.x86_64'

env = UnityEnvironment(file_name=binary, worker_id=0)
print(str(env))
train_mode = True

agent = PPOAgent()
agent.model = agent.model.cuda()

load_weights = True
if (load_weights):
    agent.model.load_state_dict(torch.load("pretrained_weights/saved_model_ppo_epoch_23040"))

optimizer = torch.optim.Adam(agent.parameters(), lr=args.lr)

default_brain = env.brain_names[0]
brain = env.brains[default_brain]

config = {
    "WrongDirectionPenalty": 0.01,
    'PenaltyCarCollision': 1.0,
    'MaxAngleReward': 35,
    'TimePenalty': 0.015
def paralle_train(args):
    logger = SummaryWriter(log_dir='results/{}_{}_{}'.format(
        args.env, args.seed, datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")))

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    env = gym.make(args.env)
    env_params = get_env_params(env, args)
    env.close()

    agent = PPOAgent(args, env_params)
    workers, parent_conns, children_conns = workers_initialize(args)

    obs = np.zeros(shape=[args.num_worker, 4, 84, 84], dtype=np.float32)

    # initialize obs_normalizer
    print('Start initialize obs normalizer....')
    next_obs_batch = []
    for step in range(args.initialize_episode * args.max_episode_step):
        actions = np.random.randint(0, env_params['a_dim'], size=(args.num_worker))
        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)
        for parent_conn in parent_conns:
            obs_, r, done, info = parent_conn.recv()
            next_obs_batch.append(obs_)
        if len(next_obs_batch) % (10 * args.num_worker) == 0:
            next_obs_batch = np.stack(next_obs_batch)
            agent.normalizer_obs.update(next_obs_batch)
            next_obs_batch = []
    print('End initialize obs normalizer....')

    log_reward_ex = 0
    log_reward_in = 0
    log_step = 0
    log_episode = 0

    for i_epoch in range(args.max_epoch):
        epoch_obs, epoch_action, epoch_ri, epoch_re, epoch_mask, epoch_next_obs, epoch_logprob = [], [], [], [], [], [], []
        for i_step in range(args.rollout_len):
            actions, log_probs = agent.choose_action(obs)
            for action, parent_conn in zip(actions, parent_conns):
                parent_conn.send(action)

            batch_re, batch_mask, batch_next_obs = [], [], []
            for parent_conn in parent_conns:
                obs_, r_e, done, info = parent_conn.recv()
                batch_next_obs.append(obs_)
                batch_re.append(r_e)
                batch_mask.append(0 if done else 1)
            batch_next_obs = np.stack(batch_next_obs)
            batch_re = np.stack(batch_re)
            batch_mask = np.stack(batch_mask)
            batch_ri = agent.compute_intrinsic_reward(batch_next_obs.copy())

            # for log
            log_reward_ex += batch_re[args.log_env_idx]
            log_reward_in += batch_ri[args.log_env_idx]
            log_step += 1
            if batch_mask[args.log_env_idx] == 0:
                log_episode += 1
                logger.add_scalar('Indicator/Reward_ex', log_reward_ex, log_episode)
                logger.add_scalar('Indicator/Reward_in', log_reward_in, log_episode)
                log_reward_ex = 0
                log_reward_in = 0

            epoch_obs.append(obs)
            epoch_action.append(actions)
            epoch_next_obs.append(batch_next_obs)
            epoch_ri.append(batch_ri)
            epoch_re.append(batch_re)
            epoch_mask.append(batch_mask)
            epoch_logprob.append(log_probs)

            obs = batch_next_obs[:, :, :, :]

        epoch_obs = np.stack(epoch_obs)
        epoch_action = np.stack(epoch_action)
        epoch_ri = np.stack(epoch_ri)
        epoch_re = np.stack(epoch_re)
        epoch_mask = np.stack(epoch_mask)
        epoch_next_obs = np.stack(epoch_next_obs)
        epoch_logprob = np.stack(epoch_logprob)

        epoch_obs = np.transpose(epoch_obs, axes=[1, 0, 2, 3, 4])
        epoch_action = np.transpose(epoch_action, axes=[1, 0])
        epoch_ri = np.transpose(epoch_ri, axes=[1, 0])
        epoch_re = np.transpose(epoch_re, axes=[1, 0])
        epoch_mask = np.transpose(epoch_mask, axes=[1, 0])
        epoch_next_obs = np.transpose(epoch_next_obs, axes=[1, 0, 2, 3, 4])
        epoch_logprob = np.transpose(epoch_logprob, axes=[1, 0])

        loss_rnd, loss_a, loss_c = agent.update(epoch_obs, epoch_action, epoch_ri, epoch_re,
                                                epoch_mask, epoch_next_obs, epoch_logprob)

        used_sample_num = args.rollout_len * args.num_worker * i_epoch
        logger.add_scalar('Loss/loss_RND', loss_rnd, used_sample_num)
        logger.add_scalar('Loss/loss_a', loss_a, used_sample_num)
        logger.add_scalar('Loss/loss_c', loss_c, used_sample_num)

        if i_epoch % args.save_model_interval == 0:
            agent.save_model(remark='{}'.format(i_epoch))
def main():
    parser = argparse.ArgumentParser(description='Reinforce')
    parser.add_argument('--data', type=str, default=config.data_dir,
                        help='location of the data corpus')
    parser.add_argument('--unk_threshold', type=int, default=config.unk_threshold,
                        help='minimum word frequency to be in dictionary')
    parser.add_argument('--alice_model_file', type=str, help='Alice model file')
    parser.add_argument('--bob_model_file', type=str, help='Bob model file')
    parser.add_argument('--output_model_file', type=str, help='output model file')
    parser.add_argument('--context_file', type=str, help='context file')
    parser.add_argument('--temperature', type=float, default=config.rl_temperature,
                        help='temperature')
    parser.add_argument('--cuda', action='store_true', default=config.cuda, help='use CUDA')
    parser.add_argument('--verbose', action='store_true', default=config.verbose,
                        help='print out conversations')
    parser.add_argument('--seed', type=int, default=config.seed, help='random seed')
    parser.add_argument('--score_threshold', type=int, default=config.rl_score_threshold,
                        help='successful dialog should have more than score_threshold in score')
    parser.add_argument('--log_file', type=str, default='',
                        help='log successful dialogs to file for training')
    parser.add_argument('--smart_bob', action='store_true', default=False,
                        help='make Bob smart again')
    parser.add_argument('--gamma', type=float, default=config.rl_gamma, help='discount factor')
    parser.add_argument('--eps', type=float, default=config.rl_eps, help='eps greedy')
    parser.add_argument('--nesterov', action='store_true', default=config.nesterov,
                        help='enable nesterov momentum')
    parser.add_argument('--momentum', type=float, default=config.rl_momentum,
                        help='momentum for sgd')
    parser.add_argument('--lr', type=float, default=config.rl_lr, help='learning rate')
    parser.add_argument('--clip', type=float, default=config.rl_clip, help='gradient clip')
    parser.add_argument('--rl_lr', type=float, default=config.rl_reinforcement_lr,
                        help='RL learning rate')
    parser.add_argument('--rl_clip', type=float, default=config.rl_reinforcement_clip,
                        help='RL gradient clip')
    parser.add_argument('--ref_text', type=str, help='file with the reference text')
    parser.add_argument('--bsz', type=int, default=config.rl_bsz, help='batch size')
    parser.add_argument('--sv_train_freq', type=int, default=config.rl_sv_train_freq,
                        help='supervision train frequency')
    parser.add_argument('--nepoch', type=int, default=config.rl_nepoch, help='number of epochs')
    parser.add_argument('--visual', action='store_true', default=config.plot_graphs,
                        help='plot graphs')
    parser.add_argument('--domain', type=str, default=config.domain,
                        help='domain for the dialogue')
    parser.add_argument('--eps_clip', type=float, default=0.2,
                        help='clipping threshold for PPO surrogate loss 2')
    parser.add_argument('--ppo_epochs', type=int, default=5,
                        help='Number of epochs to perform PPO policy update')
    # TODO: split policy update epochs from supervised model update
    args = parser.parse_args()

    device_id = utils.use_cuda(args.cuda)
    logging.info("Starting training using pytorch version:%s" % (str(torch.__version__)))
    logging.info("CUDA is %s" % ("enabled. Using device_id:" + str(device_id)
                                 + " version:" + str(torch.version.cuda)
                                 + " on gpu:" + torch.cuda.get_device_name(0)
                                 if args.cuda else "disabled"))

    alice_model = utils.load_model(args.alice_model_file)
    # we don't want to use Dropout during RL
    alice_model.eval()
    # Alice is a RL based agent, meaning that she will be learning while selfplaying
    logging.info("Creating RlAgent from alice_model: %s" % (args.alice_model_file))
    alice = PPOAgent(alice_model, args, name="Alice")

    # we keep Bob frozen, i.e. we don't update his parameters
    logging.info("Creating Bob's (--smart_bob) LstmRolloutAgent" if args.smart_bob
                 else "Creating Bob's (not --smart_bob) LstmAgent")
    bob_ty = LstmRolloutAgent if args.smart_bob else LstmAgent
    bob_model = utils.load_model(args.bob_model_file)
    bob_model.eval()
    bob = bob_ty(bob_model, args, name='Bob')

    logging.info("Initializing communication dialogue between Alice and Bob")
    dialog = Dialog([alice, bob], args)
    logger = DialogLogger(verbose=args.verbose, log_file=args.log_file)
    ctx_gen = ContextGenerator(args.context_file)

    logging.info("Building word corpus, requiring minimum word frequency of %d for dictionary"
                 % (args.unk_threshold))
    corpus = data.WordCorpus(args.data, freq_cutoff=args.unk_threshold)
    engine = Engine(alice_model, args, device_id, verbose=False)

    logging.info("Starting Reinforcement Learning")
    reinforce = PPO(dialog, ctx_gen, args, engine, corpus, logger)
    reinforce.run()

    logging.info("Saving updated Alice model to %s" % (args.output_model_file))
    utils.save_model(alice.model, args.output_model_file)