def create_agent(agent_type, *args, **kwargs):
    agent_type = agent_type.lower()
    if agent_type == 'a2c':
        return A2C(*args, **kwargs)
    # previously fell through and returned an undefined name for unknown types
    raise ValueError("Unknown agent type: %s" % agent_type)
def diayn_test(path):
    path_plot = path + "plot_pt\\"
    if not os.path.exists(path_plot):
        os.makedirs(path_plot)
    path_save = path + "save_pt\\"
    if not os.path.exists(path_save):
        os.makedirs(path_save)

    diayn_mod = build_diayn(2)
    diayn_mod.load(path + "save_diayn\\")
    mountaincar(diayn_mod, path_plot + "pretrained_trajectoires_0")

    pretrained = A2C.from_diayn(diayn_mod, 0)
    model = pretrained
    for k in range(0, 1):
        iter_ = 200
        model.train(iter_)
        model.plot_rewards(path_plot + "pretrained_rewards_" + str((k + 1) * iter_))
        mountaincar_baseline(
            model,
            path_plot + "pretrained_trajectoires_" + str((k + 1) * iter_))
        model.save(path_save)
def run(self):
    self.agent = A2C(self.id)

    def treatQueue():
        msg = self.conn.recv()
        if msg == "load":
            self.agent.load_model()
            print("Process " + str(self.id) + " loaded the master (0) model.")
        elif msg[0] == "train_with_batchs":
            print("Master process is training ...")
            t0 = time.time()
            self.agent.train_with_batchs(msg[1])
            self.agent.save_model()
            print("Master process finished training. Time : " + str(time.time() - t0) + " \n")
            self.conn.send("saved")

    while True:
        if self.id != 0:
            batch_values = []
            batch_states = []
            batch_actions = []
            print("Process " + str(self.id) + " starts playing " + str(self.n_games) + " games.")
            scores = []
            env = SnakeEnv()
            overall_data = 0
            for i in range(self.n_games):
                state = env.init()
                t = 0
                lastScoring = -1
                while True:
                    action = self.agent([state])
                    newState, reward, done = env.step(action)
                    if reward == 1:
                        for j in range(t - lastScoring):
                            batch_values.append(1)
                        lastScoring = t
                    batch_states.append([state])
                    batch_actions.append(action)
                    t += 1
                    if done or (t - lastScoring >= 100):
                        for j in range(t - lastScoring - 1):
                            batch_values.append(0)
                        break
                    state = newState
                scores.append(env.score)
                overall_data += t
                if overall_data >= 10000:
                    break
            print("Process " + str(self.id) + " finished playing.")
            batch = (batch_states, batch_actions, batch_values)
            self.conn.send((np.mean(scores), batch))
        treatQueue()
def plot_results(path):
    env = gym.make("MountainCar-v0")
    baseline = A2C(env, {"actor": [30, 30], "critic": [30, 30]}, gamma=0.99)
    baseline.load(path + "save_bl\\")

    diayn_mod = build_diayn(2)
    diayn_mod.load(path + "save_diayn\\")
    pretrained = A2C.from_diayn(diayn_mod, 0)
    pretrained.load(path + "save_pt\\")

    plt.figure()
    plt.plot(range(99, len(pretrained.rewards)),
             np.convolve(pretrained.rewards, np.ones(100) / 100, "valid"),
             label="pretrained")
    plt.plot(range(99, len(baseline.rewards)),
             np.convolve(baseline.rewards, np.ones(100) / 100, "valid"),
             label="baseline")
    plt.legend()
    plt.show()
    plt.savefig(path + "results")
    plt.pause(1)
def __init__(self):
    # self.name = name
    action_mask = [1, 1, 1, 1]
    self.controlTLIds = traci.trafficlight.getIDList()
    self.controlTLIds = self.controlTLIds[0]
    self.phaseDefs = ['GrrrGrrr', 'rGrrrGrr', 'rrGrrrGr', 'rrrGrrrG']
    self.yelloPhases = ['yrrryrrr', 'ryrrryrr', 'rryrrryr', 'rrryrrry']
    self.detectorIDs = traci.inductionloop.getIDList()
    self.controlLanes = get_laneID(self.detectorIDs)
    self.reset()
    state_size = len(self.state)
    self.learner = A2C(state_size, action_mask)
    self.buffer_reset()
    return
def baseline_test(path):
    path_plot = path + "plot_bl\\"
    if not os.path.exists(path_plot):
        os.makedirs(path_plot)
    path_save = path + "save_bl\\"
    if not os.path.exists(path_save):
        os.makedirs(path_save)

    env = gym.make("MountainCar-v0")
    baseline = A2C(env, {"actor": [30, 30], "critic": [30, 30]}, gamma=0.99)
    model = baseline
    for k in range(0, 1):
        iter_ = 200
        model.train(iter_)
        model.plot_rewards(path_plot + "baseline_rewards_" + str((k + 1) * iter_))
        mountaincar_baseline(
            model, path_plot + "baseline_trajectoires_" + str((k + 1) * iter_))
        model.save(path_save)
def main(flags):
    '''
    Runs an agent in an environment.

    params:
        flags (dict): configuration
    '''
    env = gym.make('CartPole-v0')
    agent = A2C(env,
                gamma=flags.gamma,
                lambd=flags.lambd,
                learning_rate=flags.learning_rate,
                num_units=flags.num_units,
                num_layers=flags.num_layers,
                update_frequency=flags.update_frequency)
    trainer = ActorCriticTrainer(env, agent, flags)
    rewards, lengths = trainer.train(flags.num_episodes, flags.max_steps)
    plot_results(rewards, lengths)
def main():
    args = parser.parse_args()
    with open(args.config) as f:
        config = yaml.safe_load(f)
    set_seed(config['seed'])

    writer = None
    # Will ERROR if outdir already exists
    if not os.path.exists(config['outdir']):
        os.makedirs(config['outdir'])
        if config['use_tensorboard']:
            os.makedirs(os.path.join(config['outdir'], 'tensorboard'))
            writer = SummaryWriter(os.path.join(config['outdir'], 'tensorboard'))
        # save a copy of the config file
        shutil.copyfile(args.config, os.path.join(config['outdir'], 'config.yaml'))
    else:
        print("ERROR: directory './{}' already exists!".format(config['outdir']))
        raise EnvironmentError

    logger = get_logger(config)

    # create environment
    env = make_atari_env(config['task'], num_env=config['parallel_envs'],
                         seed=config['seed'])
    env = VecFrameStack(env, n_stack=config['state_frames'])

    # default device for torch tensors
    device = torch.device('cuda') if config['use_gpu'] else torch.device('cpu')

    # start training
    a2c = A2C(config, env, device, logger, writer)
    a2c.train()
def main():
    a2c_config = A2CConfig()
    set_seed(a2c_config.seed)

    # initialize environment
    env = football_env.create_environment(
        env_name=a2c_config.env_name,
        representation="simple115",
        number_of_left_players_agent_controls=1,
        stacked=False,
        logdir="/tmp/football",
        write_goal_dumps=False,
        write_full_episode_dumps=False,
        render=False)

    # state and action space
    state_space_size = env.observation_space.shape[0]  # we are using simple115 representation
    if a2c_config.forbid_actions:
        action_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 13, 15]  # forbid some actions
    else:
        action_list = list(range(env.action_space.n))  # default action space
    action_space_size = len(action_list)

    # initialize model
    model_config = FFNModelConfig(state_space_size=state_space_size,
                                  action_space_size=action_space_size)
    model = FFN(model_config)

    # TODO multiprocessing env
    a2c = A2C(env=env, model=model, a2c_config=a2c_config, action_list=action_list)
    a2c.learn()
def main():
    map_name = args.map
    envs_num = args.envs
    max_windows = args.max_windows
    total_updates = args.iters

    env_args = dict(
        map_name=map_name,
        battle_net_map=False,
        players=[sc2_env.Agent(sc2_env.Race.terran)],
        agent_interface_format=sc2_env.parse_agent_interface_format(
            feature_screen=32,
            feature_minimap=32,
            rgb_screen=None,
            rgb_minimap=None,
            action_space=None,
            use_feature_units=False,
            use_raw_units=False),
        step_mul=8,
        game_steps_per_episode=None,
        disable_fog=False,
        visualize=False
    )
    vis_env_args = env_args.copy()
    vis_env_args['visualize'] = True
    num_vis = min(envs_num, max_windows)
    env_fns = [partial(make_sc2env, **vis_env_args)] * num_vis
    num_no_vis = envs_num - num_vis
    if num_no_vis > 0:
        env_fns.extend([partial(make_sc2env, **env_args)] * num_no_vis)
    envs = SubprocVecEnv(env_fns)

    # A random implementation, used for debugging
    '''agents = []
    for i in range(envs_num):
        agent = RandomAgent()
        agents.append(agent)'''

    '''observation_spec = envs.observation_spec()
    action_spec = envs.action_spec()
    processor = pro(observation_spec)
    for agent, obs_spec, act_spec in zip(agents, observation_spec, action_spec):
        agent.setup(obs_spec[0], act_spec[0])
    try:
        while True:
            num_frames = 0
            timesteps = envs.reset()
            for a in agents:
                a.reset()
            while True:
                num_frames += 1
                last_timesteps = timesteps
                actions = [agent.step(timestep) for agent, timestep in zip(agents, timesteps)]
                timesteps = envs.step(actions)
                obs = processor.preprocess_obs(timesteps)
                a = 1
    except KeyboardInterrupt:
        pass'''

    while True:
        if args.algorithm == 'a2c':
            agent = A2C(envs, args)
        elif args.algorithm == 'ppo':
            agent = PPO(envs, args)
        agent.reset()
        if os.path.exists(args.load_model):
            agent.net.load_state_dict(torch.load(args.load_model))
        # try:
        while True:
            agent.train()
            if agent.sum_episode > total_updates:
                print("over############################\n\n\n")
                break
        # except:
        #     print(agent.last_obs['available_actions'])
        envs.close()
torch.cuda.manual_seed_all(args.seed)
torch.set_num_threads(1)
device = torch.device('cuda' if args.cuda else 'cpu')
# device = torch.device('cpu')

env = gym.make(args.env_name)
env.seed(args.seed)

# reset env and preprocess to obtain the shape of the input (fed into the nn)
obs = torch.from_numpy(utils.preprocess(env.reset())).float().unsqueeze(0).unsqueeze(0)

shared_ac = Policy(obs.shape)
shared_ac.to(device)
shared_ac.share_memory()

agent = A2C(shared_ac, args)

if args.cuda:
    # somehow need it to enable cuda.. but super slow
    torch.multiprocessing.set_start_method('spawn')

processes = []
counter = mp.Value('i', 0)
lock = mp.Lock()
for idx in range(0, args.num_processes):
    # p = mp.Process(target=train, args=(agent, shared_ac, args, 'cpu', idx, counter, lock))
    p = mp.Process(target=train, args=(agent, shared_ac, args, device, idx, counter, lock))
    p.start()
    processes.append(p)
for p in processes:
    p.join()
def main(_):
    if FLAGS.debug:
        tf.config.experimental_run_functions_eagerly(True)

    with open(f"configs/{FLAGS.algo}.yaml") as file:
        kwargs = yaml.load(file, Loader=yaml.FullLoader)

    os.makedirs(FLAGS.logs_dir, exist_ok=True)
    tf.random.set_seed(FLAGS.seed)

    envs = make_vec_envs(FLAGS.env_name, FLAGS.seed, kwargs['num_processes'],
                         FLAGS.logs_dir)

    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    def get_obs():
        return envs.stackedobs

    def env_step(action):
        next_obs, reward, done, _ = envs.step(action)
        return next_obs, reward.astype(np.float32), done.astype(np.float32)

    batch_size = kwargs['num_steps'] * kwargs['num_processes']

    if FLAGS.algo == 'ppo':
        actor_critic = PPO((-1, *envs.observation_space.shape),
                           envs.action_space.n, FLAGS.entropy_coef,
                           FLAGS.value_loss_coef, FLAGS.gamma, **kwargs)
    else:
        del kwargs['num_processes']
        actor_critic = A2C((-1, *envs.observation_space.shape),
                           envs.action_space.n, FLAGS.entropy_coef,
                           FLAGS.value_loss_coef, FLAGS.gamma, **kwargs)

    num_updates = FLAGS.max_timesteps // batch_size
    val_loss, act_loss, ent_loss = 0, 0, 0

    hparam_str = utils.get_haram_str(env_name=FLAGS.env_name, seed=FLAGS.seed)
    writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.save_dir, 'tb', hparam_str))
    writer.set_as_default()

    envs.reset()
    for i in tqdm(range(num_updates), unit_scale=batch_size, smoothing=0.1):
        actor_critic.set_learning_rate(kwargs['learning_rate'] *
                                       (1.0 - i / num_updates))
        value_loss, action_loss, entropy_loss = actor_critic.update(
            env_step, get_obs)

        val_loss += value_loss
        act_loss += action_loss
        ent_loss += entropy_loss

        if i % FLAGS.log_interval == 0 and i > 0:
            tf.summary.scalar("losses/value_loss",
                              val_loss / FLAGS.log_interval,
                              step=batch_size * i)
            tf.summary.scalar("losses/action_loss",
                              act_loss / FLAGS.log_interval,
                              step=batch_size * i)
            tf.summary.scalar("losses/entropy_loss",
                              ent_loss / FLAGS.log_interval,
                              step=batch_size * i)
            tf.summary.flush()

            val_loss = 0
            act_loss = 0
            ent_loss = 0
                    help="Random seed for the environment.")
parser.add_argument('--num_episodes', type=int, default=1,
                    help="Number of test episodes.")
parser.add_argument('--stochastic', action='store_true',
                    help="Use stochastic policy in testing.")
parser.add_argument('--record', action='store_true',
                    help="Record videos of test episodes.")
parser.add_argument('--video_dir', help="Directory to store recorded videos.")
args = parser.parse_args()

env = gym.make('LunarLander-v2')
env.seed(args.seed)
if args.record:
    env = gym.wrappers.Monitor(env, args.video_dir, force=True)

if args.agent_type == 'reinforce':
    agent = Reinforce(env, 0)
elif args.agent_type == 'a2c':
    agent = A2C(env, 0, args.n)
else:
    print('Unknown agent type %s' % args.agent_type)
    exit(1)

agent.model.load_state_dict(
    torch.load(args.model_path, map_location=lambda storage, loc: storage))
stochastic = True if args.stochastic else False
r_avg, r_std = agent.eval(args.num_episodes, stochastic=stochastic)
print('Reward average %.6f std %.6f' % (r_avg, r_std))
###############################
# MAKE NET AND POLICY
critic_net = FFNet(in_size=2, out_size=1)
actor_net = FFNet(in_size=2, out_size=2)

plc = None
if train_config['policy'] == 'angular':
    plc = policy.AngularPolicy(actor_net, train_config['sigma'])
elif train_config['policy'] == 'gauss':
    plc = policy.GaussianPolicy(actor_net, train_config['sigma'])
else:
    raise RuntimeError('Not a valid policy: %s' % train_config['policy'])

###############################
# CREATE ENVIRONMENT AND RUN
algo = A2C(plc, critic_net, train_config['lr'], train_config['gamma'])
sampler = sampler.BatchSampler(plc, **train_config)

cumulative_rewards = np.array([]).reshape((0, 3))
cur_update = 0
finished_episodes = 0

sampler.reset()
while cur_update < train_config['num_updates']:
    batch, terminal = sampler.sample()
    algo.update(batch, terminal)
    cr = sampler.cumulative_reward

    # save cumulative rewards
    for i, t in enumerate(terminal):
        if t:
from hyperparams import HyperParams
from a2c import A2C
import torch.multiprocessing as mp

if __name__ == "__main__":
    mp.set_start_method('forkserver')
    a2c_trainer = A2C()

    hyps = dict()
    hyps['exp_name'] = "pongbptt"
    hyps['env_type'] = "Pong-v0"
    hyps['model_type'] = 'conv'
    hyps['use_bptt'] = True
    hyps['entr_coef'] = .01
    hyps['entr_coef_low'] = .001
    hyps['decay_entr'] = True
    hyps['val_coef'] = .5
    hyps['lr'] = 5e-4
    hyps['lr_low'] = 1e-6
    hyps['decay_lr'] = True
    hyps['gamma'] = .98
    hyps['lambda_'] = .95
    hyps['n_tsteps'] = 32
    hyps['n_rollouts'] = 36
    hyps['n_envs'] = 13
    hyps['max_tsteps'] = 40000000
    hyps['n_frame_stack'] = 3
    hyps['optim_type'] = 'rmsprop'
    hyper_params = HyperParams(hyps)
    a2c_trainer.train(hyper_params.hyps)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_env', type=str, default="BanditTenArmedRandomRandom-v0",
                        help='env for meta-training')
    parser.add_argument('--train_eps', type=int, default=100,
                        help='training episodes per trial')
    parser.add_argument('--train_trial_n', type=int, default=1000,
                        help='number of trials during training')
    parser.add_argument('--seed', type=int, default=1, help='experiment seed')
    # Training Hyperparameters
    parser.add_argument('--hidden', type=int, default=48, help='hidden layer dimensions')
    parser.add_argument('--gamma', type=float, default=0.8, help='discount factor')
    args = parser.parse_args()

    x, y, e = [], [], []
    for trial in range(1, args.train_trial_n + 1):
        env = gym.make(args.train_env)
        env._seed(args.seed)
        env.reset()

        # initialize algorithm at first iteration
        if trial == 1:
            action_dim = env.action_space.n
            input_dim = 3
            algo = A2C(session=get_session(),
                       policy_cls=LSTMPolicy,
                       input_dim=input_dim,
                       hidden_dim=args.hidden,
                       action_dim=action_dim,
                       scope='a2c')
        algo.reset()

        """
        what does the env.unwrapped do exactly?
        https://discuss.pytorch.org/t/in-the-official-q-learning-example-what-does-the-env-unwrapped-do-exactly/28695
        there is a core super class called gym.Env and there are other sub classes of this to
        implement different environments (CartPoleEnv, MountainCarEnv etc). This unwrapped
        property is used to get the underlying gym.Env object from other environments.
        """
        save_iter = args.train_trial_n // 20
        tot_returns = []
        prop_reward = []
        tot_regret = []
        tot_subopt = []

        ep_X, ep_R, ep_A, ep_V, ep_D = [], [], [], [], []
        track_R = 0
        track_regret = np.max(env.unwrapped.p_dist) * args.train_eps
        best_action = np.argmax(env.unwrapped.p_dist)
        num_suboptimal = 0
        action_hist = np.zeros(env.action_space.n)
        action = 0
        rew = 0

        # begin a trial
        for ep in range(args.train_eps):
            # run policy
            # print(action, rew, ep)
            algo_input = np.array([action, rew, ep])
            # print(algo_input)
            if len(algo_input.shape) <= 1:
                algo_input = algo_input[None]
            action, value = algo.get_actions(algo_input)
            new_obs, rew, done, info = env.step(action)
            track_R += rew
            num_suboptimal += int(action != best_action)
            action_hist[action] += 1
            if ep == 0:
                ep_X = algo_input
            else:
                ep_X = np.concatenate([ep_X, algo_input], axis=0)
            ep_A.append(action)
            ep_V.append(value)
            ep_R.append(rew)
            ep_D.append(done)

        # update policy
        ep_X = np.asarray(ep_X, dtype=np.float32)
        ep_R = np.asarray(ep_R, dtype=np.float32)
        ep_A = np.asarray(ep_A, dtype=np.int32)
        ep_V = np.squeeze(np.asarray(ep_V, dtype=np.float32))
        ep_D = np.asarray(ep_D, dtype=np.float32)
        last_value = value
        if ep_D[-1] == 0:
            disc_rew = discount_with_dones(
                ep_R.tolist() + [np.squeeze(last_value)],
                ep_D.tolist() + [0], args.gamma)[:-1]
        else:
            disc_rew = discount_with_dones(ep_R.tolist(), ep_D.tolist(), args.gamma)
        ep_adv = disc_rew - ep_V

        prop_reward.append(track_R / track_regret)
        track_regret -= track_R
        train_info = algo.train(ep_X=ep_X, ep_A=ep_A, ep_R=ep_R, ep_adv=ep_adv)
        tot_returns.append(track_R)
        tot_regret.append(track_regret)
        tot_subopt.append(num_suboptimal)

        if trial % save_iter == 0 and trial != 0:
            print("Episode: {}".format(trial))
            print("MeanReward: {}".format(np.mean(tot_returns[-save_iter:])))
            print("StdReward: {}".format(np.std(tot_returns[-save_iter:])))
            print("MeanRegret: {}".format(np.mean(tot_regret[-save_iter:])))
            print("StdRegret: {}".format(np.std(tot_regret[-save_iter:])))
            print("NumSuboptimal: {}".format(np.mean(tot_subopt[-save_iter:])))
            cur_y = np.mean(prop_reward[-save_iter:])
            cur_e = np.std(prop_reward[-save_iter:])
            x.append(trial)
            y.append(cur_y)
            e.append(cur_e)
            print("MeanPropReward: {}".format(cur_y))
            print("StdPropReward: {}".format(cur_e))

    x = np.asarray(x, dtype=np.int)
    y = np.asarray(y, dtype=np.float32)
    e = np.asarray(e, dtype=np.float32)
    # plt.errorbar(x, y, e)
    # plt.show()

    # database
    db = {}
    db['x'] = x
    db['y'] = y
    db['e'] = e
    file_name = args.train_env[:-3] + str(args.train_trial_n)
    pickle.dump(db, open(file_name + ".p", "wb"))
runner = Runner(env_name, n_envs, pool)

# Create model and optimizer
action_dim = 2  # Pong specific number of possible actions
net = model.Model(runner.obs_shape, action_dim, batch_norm=batch_norm)
if torch.cuda.is_available():
    net = net.cuda()
    torch.FloatTensor = torch.cuda.FloatTensor
    torch.LongTensor = torch.cuda.LongTensor
optimizer = optim.Adam(net.parameters(), lr=lr)
a2c = A2C(net, n_envs, pool, val_const=val_const, entropy_const=entropy_const,
          spatio_const=spatio_const, gamma=gamma, lambda_=lambda_,
          predict_spatio=predict_spatio)

if resume:
    net.load_state_dict(torch.load(net_save_file))
    optimizer.load_state_dict(torch.load(optim_save_file))
    logger = open(log_file, 'a+')
else:
    logger = open(log_file, 'w+')

if batch_norm:
    logger.write("Batch Norm = True\n")
else:
    logger.write("Batch Norm = False\n")
import os

writer = SummaryWriter(os.path.join('runs', name_dir(config_enhanced)))

# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"] = config_enhanced["GPU_id"]

print("Current config_enhanced is:")
pprint(config_enhanced)
writer.add_text("config", str(config_enhanced))

env = CholeskyTaskGraph(**config_enhanced['env_settings'])
# env.reset()

# model = Net
# model = SimpleNet
model = ResNetG
# model = SimpleNetMax

agent = A2C(config_enhanced, env, model=model, writer=writer)

# rewards = Parallel(n_jobs=config_enhanced['num_cores'])(
#     delayed(wrap_non_picklable_objects(agent.training_batch))(config_enhanced['epochs'],
#                                                               config_enhanced['nbatch'])
#     for i in range(config_enhanced['num_cores']))

agent.training_batch()

# TODO: evaluate test_mode and save if better than previous
# TODO: Transfer ?
# TODO: load training batch during GPU training
def main(
    _run,
    _log,
    num_env_steps,
    env_name,
    seed,
    algorithm,
    dummy_vecenv,
    time_limit,
    wrappers,
    save_dir,
    eval_dir,
    loss_dir,
    log_interval,
    save_interval,
    eval_interval,
):
    if loss_dir:
        loss_dir = path.expanduser(loss_dir.format(id=str(_run._id)))
        utils.cleanup_log_dir(loss_dir)
        writer = SummaryWriter(loss_dir)
    else:
        writer = None

    eval_dir = path.expanduser(eval_dir.format(id=str(_run._id)))
    save_dir = path.expanduser(save_dir.format(id=str(_run._id)))
    utils.cleanup_log_dir(eval_dir)
    utils.cleanup_log_dir(save_dir)

    torch.set_num_threads(1)
    envs = make_vec_envs(
        env_name,
        seed,
        dummy_vecenv,
        algorithm["num_processes"],
        time_limit,
        wrappers,
        algorithm["device"],
    )

    agents = [
        A2C(i, osp, asp)
        for i, (osp, asp) in enumerate(zip(envs.observation_space, envs.action_space))
    ]
    obs = envs.reset()

    for i in range(len(obs)):
        agents[i].storage.obs[0].copy_(obs[i])
        agents[i].storage.to(algorithm["device"])

    start = time.time()
    num_updates = (
        int(num_env_steps) // algorithm["num_steps"] // algorithm["num_processes"]
    )

    all_infos = deque(maxlen=10)

    for j in range(1, num_updates + 1):
        for step in range(algorithm["num_steps"]):
            # Sample actions
            with torch.no_grad():
                n_value, n_action, n_action_log_prob, n_recurrent_hidden_states = zip(
                    *[
                        agent.model.act(
                            agent.storage.obs[step],
                            agent.storage.recurrent_hidden_states[step],
                            agent.storage.masks[step],
                        )
                        for agent in agents
                    ]
                )
            # Observe reward and next obs
            obs, reward, done, infos = envs.step(n_action)
            # envs.envs[0].render()

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [
                    [0.0] if info.get("TimeLimit.truncated", False) else [1.0]
                    for info in infos
                ]
            )
            for i in range(len(agents)):
                agents[i].storage.insert(
                    obs[i],
                    n_recurrent_hidden_states[i],
                    n_action[i],
                    n_action_log_prob[i],
                    n_value[i],
                    reward[:, i].unsqueeze(1),
                    masks,
                    bad_masks,
                )

            for info in infos:
                if info:
                    all_infos.append(info)

        # value_loss, action_loss, dist_entropy = agent.update(rollouts)
        for agent in agents:
            agent.compute_returns()

        for agent in agents:
            loss = agent.update([a.storage for a in agents])
            for k, v in loss.items():
                if writer:
                    writer.add_scalar(f"agent{agent.agent_id}/{k}", v, j)

        for agent in agents:
            agent.storage.after_update()

        if j % log_interval == 0 and len(all_infos) > 1:
            squashed = _squash_info(all_infos)

            total_num_steps = (
                (j + 1) * algorithm["num_processes"] * algorithm["num_steps"]
            )
            end = time.time()
            _log.info(
                f"Updates {j}, num timesteps {total_num_steps}, FPS {int(total_num_steps / (end - start))}"
            )
            _log.info(
                f"Last {len(all_infos)} training episodes mean reward {squashed['episode_reward'].sum():.3f}"
            )

            for k, v in squashed.items():
                _run.log_scalar(k, v, j)
            all_infos.clear()

        if save_interval is not None and (
            j > 0 and j % save_interval == 0 or j == num_updates
        ):
            cur_save_dir = path.join(save_dir, f"u{j}")
            for agent in agents:
                save_at = path.join(cur_save_dir, f"agent{agent.agent_id}")
                os.makedirs(save_at, exist_ok=True)
                agent.save(save_at)
            archive_name = shutil.make_archive(cur_save_dir, "xztar", save_dir, f"u{j}")
            shutil.rmtree(cur_save_dir)
            _run.add_artifact(archive_name)

        if eval_interval is not None and (
            j > 0 and j % eval_interval == 0 or j == num_updates
        ):
            evaluate(
                agents,
                os.path.join(eval_dir, f"u{j}"),
            )
            videos = glob.glob(os.path.join(eval_dir, f"u{j}") + "/*.mp4")
            for i, v in enumerate(videos):
                _run.add_artifact(v, f"u{j}.{i}.mp4")

    envs.close()
def train():
    """
    1. Process data.
    2. Train actor supervised in SL model
    3. Train critic supervised
    4. Train RL agent as a function of actor and critic weights.
    """
    # process data
    state_inputs, prev_order_inputs, prev_orders_game_labels, season_names, \
        supply_center_owners, board_dict_list = get_data("data/standard_no_press.jsonl", num_games=1)

    # train SL actor
    print("Training SL actor")
    actor_sl = SL_model(num_board_blocks=16, num_order_blocks=16)
    actor_sl.train(state_inputs, prev_order_inputs, prev_orders_game_labels,
                   season_names, board_dict_list)

    # save actor weights
    print("Saving SL actor weights")
    weights_file = open("actor_weights.pickle", "wb+")
    pickle.dump(actor_sl.get_weights(), weights_file)
    weights_file.close()

    # train SL critic
    print("Training SL critic")
    critic_sl = CriticSL()
    critic_sl.train(state_inputs, supply_center_owners)

    # save critic weights
    print("Saving SL critic weights")
    weights_file = open("critic_weights.pickle", "wb+")
    pickle.dump(critic_sl.get_weights(), weights_file)
    weights_file.close()

    # load actor, critic weights from SL
    print("Loading actor, critic weights ready for RL training")
    ### LOADING ACTOR DOESN'T WORK BECAUSE YOU NEED TO CALL IT ON SOMETHING FIRST ###
    ## see https://stackoverflow.com/questions/55719047/is-loading-in-eager-tensorflow-broken-right-now
    new_weights_file = open("sl_weights_50_chunks.pickle", "rb")
    new_weights_actor = pickle.load(new_weights_file)
    new_weights_file.close()
    actor_rl = ActorRL(num_board_blocks=16, num_order_blocks=16)
    # actor_rl.call(state_inputs[0], prev_order_inputs[0], season_names[0], board_dict_list[0], "AUSTRIA")
    ##########################################################################

    new_weights_file = open("critic_weights.pickle", "rb")
    new_weights = pickle.load(new_weights_file)
    new_weights_file.close()
    critic_rl = CriticRL()
    train_data = critic_sl.process_data(state_inputs, supply_center_owners)[0][0]
    # needed so that critic_rl knows input shapes or something
    set_rl_weights(new_weights, critic_rl, train_data)

    # Train RL A2C
    print("Training A2C")
    a2c = A2C(actor_rl, critic_rl)
    a2c.train(num_episodes=1)
    actor_rl.set_weights(new_weights_actor)
    a2c.train(num_episodes=1)

    # save actor/critic RL weights
    print("Saving RL actor/critic weights")
    weights_file = open("critic_rl_weights.pickle", "wb+")
    pickle.dump(critic_rl.get_weights(), weights_file)
    weights_file.close()

    weights_file = open("actor_rl_weights.pickle", "wb+")
    pickle.dump(actor_rl.get_weights(), weights_file)
    weights_file.close()

    print("Done!")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_env', type=str, default="MediumBandit-v0",
                        help='env for meta-training')
    parser.add_argument('--test_env', type=str, default="EasyBandit-v0",
                        help='env for meta-testing')
    parser.add_argument('--train_eps', type=int, default=int(2e4), help='training episodes')
    parser.add_argument('--test_eps', type=int, default=300, help='test episodes')
    parser.add_argument('--seed', type=int, default=1, help='experiment seed')
    # Training Hyperparameters
    parser.add_argument('--hidden', type=int, default=48, help='hidden layer dimensions')
    parser.add_argument('--gamma', type=float, default=0.8, help='discount factor')
    args = parser.parse_args()

    env = gym.make(args.train_env)
    env.seed(args.seed)
    eval_env = gym.make(args.test_env)
    eval_env.seed(args.seed)

    algo = A2C(env=env,
               session=get_session(),
               policy_cls=LSTMPolicy,
               hidden_dim=args.hidden,
               action_dim=env.action_space.n,
               scope='a2c')

    save_iter = args.train_eps // 20
    average_returns = []
    average_regret = []
    average_subopt = []

    for ep in range(args.train_eps):
        obs = env.reset()
        done = False
        ep_X, ep_R, ep_A, ep_V, ep_D = [], [], [], [], []
        track_R = 0
        track_regret = np.max(env.unwrapped.probs) * env.unwrapped.n
        best_action = np.argmax(env.unwrapped.probs)
        num_suboptimal = 0
        action_hist = np.zeros(env.action_space.n)

        algo.reset()
        while not done:
            action, value = algo.get_actions(obs[None])
            new_obs, rew, done, info = env.step(action)
            track_R += rew
            num_suboptimal += int(action != best_action)
            action_hist[action] += 1

            ep_X.append(obs)
            ep_A.append(action)
            ep_V.append(value)
            ep_R.append(rew)
            ep_D.append(done)
            obs = new_obs
        _, last_value = algo.get_actions(obs[None])

        ep_X = np.asarray(ep_X, dtype=np.float32)
        ep_R = np.asarray(ep_R, dtype=np.float32)
        ep_A = np.asarray(ep_A, dtype=np.int32)
        ep_V = np.squeeze(np.asarray(ep_V, dtype=np.float32))
        ep_D = np.asarray(ep_D, dtype=np.float32)

        if ep_D[-1] == 0:
            disc_rew = discount_with_dones(
                ep_R.tolist() + [np.squeeze(last_value)],
                ep_D.tolist() + [0], args.gamma)[:-1]
        else:
            disc_rew = discount_with_dones(ep_R.tolist(), ep_D.tolist(), args.gamma)
        ep_adv = disc_rew - ep_V

        track_regret -= track_R
        train_info = algo.train(ep_X=ep_X, ep_A=ep_A, ep_R=ep_R, ep_adv=ep_adv)

        average_returns.append(track_R)
        average_regret.append(track_regret)
        average_subopt.append(num_suboptimal)

        if ep % save_iter == 0 and ep != 0:
            print("Episode: {}".format(ep))
            print("ActionHist: {}".format(action_hist))
            print("Probs: {}".format(env.unwrapped.probs))
            print("MeanReward: {}".format(np.mean(average_returns[-50:])))
            print("MeanRegret: {}".format(np.mean(average_regret[-50:])))
            print("NumSuboptimal: {}".format(np.mean(average_subopt[-50:])))
            print()

    test_regrets = []
    test_rewards = []
    for test_ep in range(args.test_eps):
        obs = eval_env.reset()
        algo.reset()
        done = False
        track_regret = np.max(eval_env.unwrapped.probs) * eval_env.unwrapped.n
        track_R = 0
        while not done:
            action, value = algo.get_actions(obs[None])
            new_obs, rew, done, info = eval_env.step(action)
            obs = new_obs
            track_R += rew
        test_regrets.append(track_regret - track_R)
        test_rewards.append(track_R)

    print('Mean Test Cumulative Regret: {}'.format(np.mean(test_regrets)))
    print('Mean Test Reward: {}'.format(np.mean(test_rewards)))
def main():
    env = MyDoom()
    agent = A2C(unsup, envWrap, designHead, noReward)
    actions = [[True, False, False], [False, True, False], [False, False, True]]

    last_state = env.reset()
    last_features = agent.network.get_initial_features()  # reset lstm memory
    length = 0
    rewards = 0
    values = 0
    ep_bonus = 0
    life_bonus = 0
    timestep_limit = 524  # 2100/4
    episodes = 0
    total_steps = 0

    f_loss = open('./logs/loss.txt', 'a')
    f_pred_loss = open('./logs/pred_loss.txt', 'a')
    f_reward = open('./logs/reward.txt', 'a')

    with tf.Session() as sess, sess.as_default():
        init_op(sess)
        while True:
            terminal_end = False
            rollout = PartialRollout(True)

            for _ in range(constants['ROLLOUT_MAXLEN']):
                # run policy
                fetched = agent.network.act(last_state, *last_features)
                action, value_, features = fetched[0], fetched[1], fetched[2:]

                # run environment: get action_index from sampled one-hot 'action'
                stepAct = action.argmax()

                # action repeat
                state, reward, terminal = env.skip_step(actions[stepAct])
                total_steps += 1

                if terminal:
                    state = last_state
                if noReward:
                    reward = 0.

                bonus = agent.ap_network.pred_bonus(last_state, state, action)
                curr_tuple = [last_state, action, reward, value_, terminal,
                              last_features, bonus, state]
                life_bonus += bonus
                ep_bonus += bonus

                # collect the experience
                rollout.add(*curr_tuple)
                rewards += reward
                length += 1
                values += value_[0]

                last_state = state
                last_features = features

                if terminal or length >= timestep_limit:
                    # prints summary of each life if envWrap==True else each game
                    print("Episode %d finished. Sum of shaped rewards: %.2f. Length: %d. Bonus: %.4f."
                          % (episodes, rewards, length, life_bonus))
                    f_reward.write(str(total_steps) + "," + str(rewards) + "\n")
                    f_loss.flush()
                    f_pred_loss.flush()
                    f_reward.flush()
                    if episodes % 100 == 0:
                        env.make_gif("./video/" + str(episodes) + ".gif")
                    life_bonus = 0
                    length = 0
                    rewards = 0
                    terminal_end = True
                    last_features = agent.network.get_initial_features()  # reset lstm memory
                    last_state = env.reset()
                    episodes += 1

                if terminal_end:
                    break

            if not terminal_end:
                rollout.r = agent.network.value(last_state, *last_features)

            loss, pred_loss = agent.process(sess, rollout)
            f_loss.write(str(total_steps) + "," + str(loss) + "\n")
            f_pred_loss.write(str(total_steps) + "," + str(pred_loss) + "\n")

    env.close()
    f_reward.close()
    f_loss.close()
    f_pred_loss.close()
import gym

from a2c import A2C
from utils.a2c_runner import vector_train
from utils.a2c_runner import evaluate

if __name__ == "__main__":
    env = gym.vector.make("CartPole-v1", num_envs=4, asynchronous=True)
    actor = A2C(env.single_observation_space, env.single_action_space)
    returns = vector_train(actor, env, 100000, 300)

    eval_env = gym.make("CartPole-v1")
    evaluate(actor, eval_env, 1, True)
import gym
import torch

from a2c import A2C
from wrappers import RecordEpisodeStatistics, TimeLimit

path = "pretrained/rware-small-4ag"
env_name = "rware-small-4ag-v1"
time_limit = 500  # 25 for LBF

RUN_STEPS = 1500

env = gym.make(env_name)
env = TimeLimit(env, time_limit)
env = RecordEpisodeStatistics(env)

agents = [
    A2C(i, osp, asp, 0.1, 0.1, False, 1, 1, "cpu")
    for i, (osp, asp) in enumerate(zip(env.observation_space, env.action_space))
]
for agent in agents:
    agent.restore(path + f"/agent{agent.agent_id}")

obs = env.reset()

for i in range(RUN_STEPS):
    obs = [torch.from_numpy(o) for o in obs]
    _, actions, _, _ = zip(
        *[agent.model.act(obs[agent.agent_id], None, None) for agent in agents])
    actions = [a.item() for a in actions]