def play(env_name, seed, load_file, total_timesteps, n_cpu):
    np.set_printoptions(precision=5)

    def padding_obss(obss, dummy_obss):
        dummy_obss[0, :, :, :] = obss
        return dummy_obss

    # A trained LSTM model cannot change its number of envs,
    # so the single-env observation is padded with dummy data
    # to match the shape the model was trained with.
    dummy_obss = np.zeros((n_cpu, 64, 64, 4))
    env = SubprocVecEnv([make_env(env_name, 0, seed)])
    model = PPO2.load(load_file, verbose=1)

    obss = env.reset()
    obss = padding_obss(obss, dummy_obss)
    rewards_buf = []
    steps_buf = []
    # TODO: single
    for i in range(total_timesteps):
        actions, _states = model.predict(obss)
        actions = actions[0:1]
        obss, rewards, dones, infos = env.step(actions)
        obss = padding_obss(obss, dummy_obss)
        # env.render()  # dummy
        if dones[0]:
            rewards_buf.append(infos[0]['episode']['r'])
            steps_buf.append(infos[0]['episode']['l'])
            line = np.array([np.mean(rewards_buf), np.std(rewards_buf),
                             np.mean(steps_buf), np.std(steps_buf)])
            print(len(rewards_buf), line)
            obss = env.reset()
            obss = padding_obss(obss, dummy_obss)
    env.close()
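Several of these snippets call a make_env helper that is not shown. Below is a minimal sketch of what such a factory commonly looks like in stable-baselines code, assuming env_id is a registered gym id; the Monitor wrapper is what populates the info['episode'] records read above. The original helper may differ.

import gym
from stable_baselines.bench import Monitor

def make_env(env_id, rank, seed=0):
    # Return a thunk; SubprocVecEnv calls it inside each worker process.
    # (Hypothetical reconstruction; the original helper is not shown.)
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        # Monitor records episode reward/length into info['episode'].
        return Monitor(env, filename=None, allow_early_resets=True)
    return _init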
def main():
    # env = SubprocVecEnv([(lambda i=i: SwocGym(i+1, GameServicePath, i, fieldWidth=10, fieldHeight=10, saveEpisode=True)) for i in range(1)])
    env = SubprocVecEnv([
        (lambda i=i: MazeGym(mazeWidth=10, mazeHeight=10, nrWallsToRemove=10))
        for i in range(1)
    ])
    try:
        model = PPO2("MlpPolicy", env, verbose=1,
                     tensorboard_log='/home/ralph/swoc2019/log')
        if SaveFile.exists():
            print('loading...')
            model.load_parameters(SaveFile)
        else:
            print('Warning: No save file loaded')

        print('evaluating...', end='')
        obs = env.reset()
        totalRewards = None
        for i in range(100):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            totalRewards = (totalRewards + rewards) if totalRewards is not None else rewards
            env.render()
            sleep(0.2)
        print(f'mean reward: {np.mean(totalRewards)}')
    except KeyboardInterrupt:
        print('closing...')
    finally:
        env.close()
        print('closed')
def train():
    def callback(_locals, _globals):
        # Save model
        _locals['self'].save(MODEL_NAME)

    envs = [create_env_headless for _ in range(ENV_COUNT)]
    vec_envs = SubprocVecEnv(envs)
    model = PPO2('CnnPolicy', vec_envs, verbose=1, ent_coef=0.0001, n_steps=256)
    if not os.path.isfile(MODEL_NAME):
        model.save(MODEL_NAME)
        vec_envs.close()
        print("Run again to train")
    else:
        model.learn(total_timesteps=TIMESTEPS, callback=callback)
        model.save(MODEL_NAME)
        vec_envs.close()
        print("Training Done")

        # Evaluation
        print("Evaluation")
        vec_env = create_env_headless()
        vec_env = DummyVecEnv([lambda: vec_env])
        model = PPO2.load(MODEL_NAME)
        print(evaluate_policy(model, vec_env, n_eval_episodes=100))
        print(evaluate_policy(model, vec_env, n_eval_episodes=100))
        vec_env.close()
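This train() and the later variant that uses create_env_headless_monitor both assume zero-argument env factories that are not shown. A hedged sketch, with ENV_NAME as a hypothetical placeholder for the actual gym id:

import gym
from stable_baselines.bench import Monitor

ENV_NAME = "CartPole-v1"  # hypothetical; the real env id is not given in the source

def create_env_headless():
    # Zero-argument factory: SubprocVecEnv calls this in each worker process.
    return gym.make(ENV_NAME)

def create_env_headless_monitor():
    # Same factory wrapped in Monitor so episode stats reach callbacks/logs.
    return Monitor(create_env_headless(), filename=None, allow_early_resets=True)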
def train_policy():
    ppo_config = {
        "gamma": 0.9988,
        "n_steps": 200,
        "ent_coef": 0,
        "learning_rate": 0.001,
        "vf_coef": 0.99,
        "max_grad_norm": 0.1,
        "lam": 0.95,
        "nminibatches": 5,
        "noptepochs": 100,
        "cliprange": 0.2,
        "tensorboard_log": log_relative_path
    }
    os.makedirs(log_relative_path)
    policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[256, 128])
    env = SubprocVecEnv([_make_env(rank=i) for i in range(5)])
    model = PPO2(MlpPolicy, env, _init_setup_model=True,
                 policy_kwargs=policy_kwargs, verbose=1, **ppo_config)
    model.learn(total_timesteps=1000, tb_log_name="ppo2",
                reset_num_timesteps=False)
    model.save(os.path.join(log_relative_path, 'model'))
    env.env_method("save_world", log_relative_path)
    env.close()
    return
def main():
    env = SubprocVecEnv([
        (lambda i=i: SwocGym(i + 1, GameServicePath, i, actionRepeat=4, oneTarget=True))
        for i in range(4)
    ])
    try:
        model = PPO2("MlpPolicy", env, verbose=1,
                     policy_kwargs={
                         'net_arch': [256, 256, 256, 128, 128, 128],
                         'act_fun': tf.nn.relu
                     },
                     n_steps=256, ent_coef=0.0, learning_rate=1e-5)
        if SaveFile.exists():
            print('loading...')
            model.load_parameters(SaveFile)
        else:
            print('Warning: No save file loaded')

        print('evaluating...', end='')
        totalRewards = evaluate(env, model)
        print(f'mean reward: {np.mean(totalRewards)}')
    except KeyboardInterrupt:
        print('closing...')
    finally:
        env.close()
        print('closed')
def play(env_name, seed, load_file, total_timesteps, n_cpu):
    np.set_printoptions(precision=5)

    def padding_obss(obss, dummy_obss):
        dummy_obss[0, :, :, :] = obss
        return dummy_obss

    # In GUI mode the number of envs is reduced to 1 to limit GUI windows,
    # but a trained LSTM model cannot change its number of envs, so the
    # observation is reshaped by padding with dummy data.
    isGUI = 'GUI' in env_name
    dummy_obss = np.zeros((n_cpu, 64, 64, 4)) if isGUI else None
    env = SubprocVecEnv([make_env(env_name, i, seed)
                         for i in range(1 if isGUI else n_cpu)])
    model = PPO2.load(load_file, verbose=1)

    obss = env.reset()
    obss = padding_obss(obss, dummy_obss) if isGUI else obss
    rewards_buf = []
    steps_buf = []
    # TODO: single
    for i in range(total_timesteps):
        actions, _states = model.predict(obss)
        actions = actions[0:1] if isGUI else actions
        obss, rewards, dones, infos = env.step(actions)
        obss = padding_obss(obss, dummy_obss) if isGUI else obss
        # env.render()  # dummy
        if dones.any():
            rewards_buf.extend([info['episode']['r'] for info in infos
                                if 'episode' in info])
            steps_buf.extend([info['episode']['l'] for info in infos
                              if 'episode' in info])
            line = np.array([np.mean(rewards_buf), np.std(rewards_buf),
                             np.mean(steps_buf), np.std(steps_buf)])
            print(len(rewards_buf), line)
    env.close()
def run_experiment(args):
    hyperparam_file = os.path.join(HYPERPARAM_DIR, args.agent + ".yml")
    hyperparams = yaml.safe_load(open(hyperparam_file))
    hyperparams = hyperparams[args.env]

    n_envs = hyperparams.pop("n_envs", 1)
    n_timesteps = int(hyperparams.pop("n_timesteps"))
    policy = hyperparams.pop("policy")
    normalize = hyperparams.pop("normalize", None)

    vecEnv = []
    for i in range(n_envs):
        # A bit of trickery here: without the extra lambda layer,
        # every closure would reference the same (final) "i".
        vecEnv.append((lambda idx: lambda: create_env(args, idx))(i))

    if args.subprocenv:
        vecEnv = SubprocVecEnv(vecEnv)
    else:
        vecEnv = DummyVecEnv(vecEnv)

    # Handle learning rates
    # Taken from rl-zoo/train.py
    for key in ['learning_rate', 'cliprange', 'cliprange_vf']:
        if key not in hyperparams or args.agent == "dqn":
            continue
        if key == 'learning_rate' and args.agent == "a2c":
            continue
        if isinstance(hyperparams[key], str):
            schedule, initial_value = hyperparams[key].split('_')
            initial_value = float(initial_value)
            hyperparams[key] = linear_schedule(initial_value)
        elif isinstance(hyperparams[key], (float, int)):
            # Negative value: ignore (ex: for clipping)
            if hyperparams[key] < 0:
                continue
            hyperparams[key] = constfn(float(hyperparams[key]))

    if args.forced_cliprange is not None:
        hyperparams["cliprange"] = args.forced_cliprange

    agent_class = AVAILABLE_ALGORITHMS[args.agent]
    agent = agent_class(policy, vecEnv, verbose=1, **hyperparams)

    # Prepare callback
    checkpoint_dir = os.path.join(args.output, CHECKPOINT_DIR)
    os.makedirs(checkpoint_dir)
    # Note that save_freq is counted in number of agent step-calls,
    # not env step-calls.
    save_freq = n_timesteps // (args.num_snapshots * n_envs)
    checkpoint_callback = CheckpointCallback(save_freq, checkpoint_dir)

    agent.learn(total_timesteps=n_timesteps, callback=checkpoint_callback)
    vecEnv.close()
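The double-lambda construction above guards against Python's late-binding closures: a plain lambda: create_env(args, i) would evaluate i only when called, so every worker would get the final loop value. Below is a small demonstration of that pitfall, plus rough reconstructions of the linear_schedule/constfn helpers in the style of rl-zoo (stable-baselines schedules receive a progress fraction that anneals from 1 to 0); these are sketches, not the exact source.

# Late-binding demo: why the extra lambda layer is needed.
bad = [lambda: i for i in range(3)]
good = [(lambda idx: lambda: idx)(i) for i in range(3)]
assert [f() for f in bad] == [2, 2, 2]
assert [f() for f in good] == [0, 1, 2]

def linear_schedule(initial_value):
    # Linear decay from initial_value to 0 as training progresses.
    def func(progress):  # progress goes 1 -> 0 during training
        return progress * initial_value
    return func

def constfn(val):
    # Wrap a constant so it matches the schedule-callable interface.
    def func(_):
        return val
    return func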
def train():
    n_cpu = os.cpu_count()
    env = SubprocVecEnv([lambda: DemoEnv() for _ in range(n_cpu)])
    model = PPO2(MlpPolicy, env, verbose=1,
                 policy_kwargs={'net_arch': [dict(vf=[4], pi=[4])]})
    model.learn(total_timesteps=int(1e6))
    model.save("ppo2_DemoEnv")
    env.close()
    del model
def run_training(config: Dict):
    """Runs training based on the config passed in."""
    print("Run configuration:")
    print(config)

    seed(config['seed'])

    # read config
    hyperparameters = read_hyperparameters(config)
    graphs = graphs_from_args(config['graphs'])
    policy, policy_kwargs = policy_from_args(config, graphs)
    demands = demands_from_args(config, graphs)
    env_kwargs = env_kwargs_from_args(config)
    env_name = config['env_name']
    timesteps = config['timesteps']
    parallelism = config['parallelism']
    log_name = config['log_name']
    model_name = config['model_name']
    tensorboard_log = config['tensorboard_log']
    oblivious_routings = None

    # make env
    env = lambda: gym.make(env_name,
                           dm_sequence=demands,
                           graphs=graphs,
                           oblivious_routings=oblivious_routings,
                           **env_kwargs)
    vec_env = SubprocVecEnv([env for _ in range(parallelism)],
                            start_method="spawn")

    # make model
    model = PPO2(policy,
                 vec_env,
                 cliprange_vf=-1,
                 verbose=1,
                 policy_kwargs=policy_kwargs,
                 tensorboard_log=tensorboard_log,
                 **hyperparameters)

    # learn (note: both branches are currently identical)
    if env_name == 'ddr-iterative-v0':
        model.learn(total_timesteps=timesteps, tb_log_name=log_name)
    else:
        model.learn(total_timesteps=timesteps, tb_log_name=log_name)

    # save it
    model.save(model_name)

    # make sure everything stopped correctly
    vec_env.close()
# def get_rewards(self, skills=[], train_total_timesteps=10, eval_times=10, eval_max_steps=10, model_save_name=None, add_info={}):
def get_rewards(self, skills=[], train_total_timesteps=5000000,
                eval_times=100, eval_max_steps=10000,
                model_save_name=None, add_info={}):
    """
    :param skills: (list) the available action sequences for the agent,
        e.g. [[0,2,2],[0,1,1]]
    :param train_total_timesteps: (int) total timesteps to train
    :param eval_times: (int) number of evaluation episodes, e.g. eval_times=100
        evaluates the policy by averaging the reward over 100 episodes
    :param eval_max_steps: (int) maximum timesteps per episode when evaluating
    :param model_save_name: (str) name of the saved model (should not repeat)
    :param add_info: (dict) other information to log in log.txt
    """
    # env = SkillWrapper(self.env, skills=skills)
    if self.num_cpu > 1:
        env = SubprocVecEnv([
            self.make_env(self.env_creator, i, skills)
            for i in range(self.num_cpu)
        ])
    else:
        env = DummyVecEnv([lambda: self.env_creator()])

    model = self.model(self.policy, env, verbose=self.verbose)

    self.strat_time = time.time()
    print("start to train agent...")
    model.learn(total_timesteps=train_total_timesteps,
                reset_num_timesteps=self.reset_num_timesteps)
    print("Finish train agent")
    if self.save_path is not None:
        if self.preserve_model > 0:
            self.save_model(model, model_save_name, skills=skills)

    # evaluate
    info = self.evaluate(env, model, eval_times, eval_max_steps)
    env.close()

    # log result
    info.update(add_info)
    self.log(info)

    self._serial_num = self._serial_num + 1
    return info["ave_score"], info["ave_action_reward"]
class SimulatorModel(object):
    def __init__(self, _make_env_func, parallel_agents):
        """
        This class instantiates a dynamics model based on the pybullet
        simulator (i.e. it simulates exactly the result of the actions);
        it can be used for reward tuning, verifying tasks, etc.

        :param _make_env_func: (func) a function that, when called, returns
                               a gym environment.
        :param parallel_agents: (int) number of parallel agents to simulate
                                to evaluate the actions.
        """
        self.parallel_agents = parallel_agents
        self.envs = SubprocVecEnv(
            [_make_env_func() for i in range(self.parallel_agents)])
        return

    def evaluate_trajectories(self, action_sequences):
        """
        Evaluates the given action sequences and returns the corresponding
        reward for each sequence.

        :param action_sequences: (nd.array) actions to be evaluated
                                 (number of sequences, horizon length)
        :return: (nd.array) sum of rewards for each action sequence.
        """
        horizon_length = action_sequences.shape[1]
        num_of_particles = action_sequences.shape[0]
        rewards = np.zeros([num_of_particles])
        assert ((float(num_of_particles) / self.parallel_agents).is_integer())
        for j in range(0, num_of_particles, self.parallel_agents):
            self.envs.reset()
            total_reward = np.zeros([self.parallel_agents])
            for k in range(horizon_length):
                actions = action_sequences[j:j + self.parallel_agents, k]
                task_observations, current_reward, done, info = \
                    self.envs.step(actions)
                total_reward += current_reward
            rewards[j:j + self.parallel_agents] = total_reward
        return rewards

    def end_sim(self):
        """
        Closes the environments that were used for simulation.
        """
        self.envs.close()
        return
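A sketch of how SimulatorModel might be driven from a random-shooting planner. The factory, horizon, and particle count here are illustrative (and assume a 1-D continuous action space), not taken from the source:

import numpy as np

# 32 candidate action sequences, horizon 10, evaluated 8 envs at a time
# (32 is divisible by 8, satisfying the assertion in evaluate_trajectories).
sim = SimulatorModel(my_env_factory, parallel_agents=8)  # my_env_factory is hypothetical
action_sequences = np.random.uniform(-1.0, 1.0, size=(32, 10))
returns = sim.evaluate_trajectories(action_sequences)
best_sequence = action_sequences[np.argmax(returns)]
sim.end_sim()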
def train():
    def callback(_locals, _globals):
        global n_steps
        if (n_steps + 1) % 100 == 0:
            _locals['self'].save(MODEL_NAME)
        n_steps += 1

    envs = [create_env_headless_monitor for _ in range(ENV_COUNT)]
    envs = SubprocVecEnv(envs)
    model = PPO2('CnnPolicy', envs, verbose=1, ent_coef=0.0001, n_steps=512)
    model.save(MODEL_NAME)
    model.learn(total_timesteps=TIMESTEPS, callback=callback)
    model.save(MODEL_NAME)
    print("Training Done")
    envs.close()
def _eval_model(model, env_id, ob_shape, num_eps, plot=False):
    test_env = SubprocVecEnv([make_env(env_id)])
    sharpe_ratios = []
    for episode in range(num_eps):
        # Pad with zeros so the observation matches the shape of the
        # training env (the recurrent model expects NUM_CPU envs).
        zero_completed_obs = np.zeros((NUM_CPU,) + ob_shape)
        zero_completed_obs[0, :] = test_env.reset()
        state = None
        for _ in range(L):
            action, state = model.predict(zero_completed_obs, state=state,
                                          deterministic=True)
            zero_completed_obs[0, :], reward, done, _ = \
                test_env.env_method('step', action[0], indices=0)[0]
        sharpe_ratios.append(test_env.env_method('get_sharpe_ratio', indices=0)[0])
        if plot:
            test_env.env_method('render', indices=0)
    test_env.close()

    # Return the average Sharpe ratio
    return sum(sharpe_ratios) / len(sharpe_ratios)
def test():
    # Parallel environments
    n_cpu = 4
    env = SubprocVecEnv([lambda: RSEnv() for _ in range(n_cpu)])
    model = A2C(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=600000, log_interval=10)
    model.save("sba2c")
    env.close()  # close the training envs before switching to the test env

    env = TestRSEnv()
    obs = env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
    env.close()
def main():
    # env = SubprocVecEnv([(lambda i=i: SwocGym(i+1, GameServicePath, i, fieldWidth=10, fieldHeight=10)) for i in range(16)])
    env = SubprocVecEnv([
        (lambda i=i: MazeGym(mazeWidth=10, mazeHeight=10, nrWallsToRemove=60))
        for i in range(12)
    ])
    try:
        # model = PPO2("MlpPolicy", env, verbose=1, ent_coef=0.01, tensorboard_log='/home/ralph/swoc2019/log')
        # model = PPO2("MlpPolicy", env, verbose=1, policy_kwargs={'net_arch': [1024,1024,512,512,256,256,128,128,64,64], 'act_fun': tf.nn.relu},
        #              n_steps=64, ent_coef=0.01, learning_rate=1e-5, tensorboard_log='/home/ralph/swoc2019/log')
        model = PPO2("MlpPolicy", env, verbose=1,
                     policy_kwargs={
                         'net_arch': [1024, 1024, 512, 512, 256, 256, 128, 128, 64, 64],
                         'act_fun': tf.nn.relu
                     })
        if SaveFile.exists():
            print('loading...', end='')
            model.load_parameters(SaveFile)
            print('loaded!')
        else:
            # No weights loaded, so remove history
            with open(RewardsLog, 'w+') as file:
                file.write('')
        try:
            print('learning...')
            model.learn(total_timesteps=100000000, callback=callback)
        finally:
            print('saving...', end='')
            model.save(SaveFile)
            print('saved!')
    except KeyboardInterrupt:
        print('closing...', end='')
    finally:
        env.close()
        print('closed!')
def learn(env_name, seed, load_file, save_file, tensorboard_log,
          total_timesteps, n_cpu):
    best_mean_reward = -np.inf
    best_mean_step = np.inf
    save_file = env_name if save_file is None else save_file
    best_save_file = save_file + BEST_SAVE_FILE_SUFFIX
    start_time = time.time()

    def callback(_locals, _globals):
        nonlocal best_mean_reward, best_mean_step
        t = (time.time() - start_time) / 3600.0
        print(f'hours: {t:.2f}')
        ep_info_buf = _locals['ep_info_buf']
        if len(ep_info_buf) < ep_info_buf.maxlen:
            return True
        mean_reward = np.mean([ep_info['r'] for ep_info in ep_info_buf])
        mean_step = np.mean([ep_info['l'] for ep_info in ep_info_buf])
        if mean_reward > best_mean_reward:
            best_mean_reward = mean_reward
            print('best_mean_reward:', best_mean_reward)
            print('saving new best model:', best_save_file)
            _locals['self'].save(best_save_file)
        if mean_step < best_mean_step:
            best_mean_step = mean_step
            print('best_mean_step:', best_mean_step)
        return True  # returning False would end learning

    # policy = CnnPolicy
    policy = CnnLstmPolicy
    # policy = CnnLnLstmPolicy
    print(env_name, policy)

    # Run this to enable SubprocVecEnv on Mac OS X:
    # export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
    # see https://github.com/rtomayko/shotgun/issues/69#issuecomment-338401331
    env = SubprocVecEnv([make_env(env_name, i, seed) for i in range(n_cpu)])

    if load_file is not None:
        model = PPO2.load(load_file, env, verbose=1,
                          tensorboard_log=tensorboard_log)
    else:
        model = PPO2(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=total_timesteps, log_interval=5,
                callback=callback)

    print('saving model:', save_file)
    model.save(save_file)
    env.close()
def learn(env_name, seed, load_path, save_path, tensorboard_log,
          total_timesteps, n_cpu):
    save_path = env_name if save_path is None else save_path
    checkpoint_callback = CheckpointCallback(save_freq=2000, save_path=save_path)
    eval_env = make_env(env_name, n_cpu, seed)()
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=save_path + '/best',
                                 log_path=tensorboard_log,
                                 eval_freq=1000)
    callback = CallbackList([checkpoint_callback, eval_callback])

    policy = CnnPolicy
    # policy = CnnLstmPolicy
    # policy = CnnLnLstmPolicy
    print(env_name, policy)

    # Run this to enable SubprocVecEnv on Mac OS X:
    # export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
    # see https://github.com/rtomayko/shotgun/issues/69#issuecomment-338401331
    env = SubprocVecEnv([make_env(env_name, i, seed) for i in range(n_cpu)])

    if load_path is not None:
        model = PPO2.load(load_path, env, verbose=1,
                          tensorboard_log=tensorboard_log)
    else:
        model = PPO2(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=total_timesteps, log_interval=5,
                callback=callback)

    print('saving model:', save_path + '/latest_model')
    model.save(save_path + '/latest_model')
    env.close()
def main():
    env = SubprocVecEnv([
        (lambda i=i: SwocGym(i + 1, GameServicePath, i, actionRepeat=4, oneTarget=True))
        for i in range(16)
    ])
    try:
        model = PPO2("MlpPolicy", env, verbose=1,
                     policy_kwargs={
                         'net_arch': [256, 256, 256, 128, 128, 128],
                         'act_fun': tf.nn.relu
                     },
                     n_steps=32, ent_coef=0.1, learning_rate=1e-4,
                     tensorboard_log='/home/ralph/swoc2019/log')
        if SaveFile.exists():
            print('loading...')
            model.load_parameters(SaveFile)
        else:
            # No weights loaded, so remove history
            with open(RewardsLog, 'w+') as file:
                file.write('')
        try:
            print('learning...')
            model.learn(total_timesteps=100000000, callback=callback)
        finally:
            print('saving...')
            model.save(SaveFile)
            print('saved!')
    except KeyboardInterrupt:
        print('closing...')
    finally:
        env.close()
        print('closed')
def train(exp_name, env_name, n_envs, **kwargs):
    # Train 10 runs
    for n in range(1, 11):  # PPO2_n
        # Configure logger
        log_folder = 'training_logs/' + exp_name + '_' + str(n) + '/'
        logger.configure(log_folder, ['csv'])

        print("[+] Starting training", n)
        env = SubprocVecEnv([
            make_env(log_folder, env_name, i, (n - 1) * 32)
            for i in range(n_envs)
        ])
        model = PPO2(
            policy=MlpPolicy,
            env=env,
            verbose=True,
            # Make it deterministic
            seed=32 * n,      # fixed seed
            n_cpu_tf_sess=1,  # force deterministic results
            # Pass arguments
            **kwargs)
        model.learn(
            total_timesteps=int(250e3),
            log_interval=1,  # log each update
        )

        # Saving model
        os.makedirs("trained_models", exist_ok=True)
        model.save("trained_models/" + exp_name + "_" + str(n))

        env.close()
        del env
        del model
    # (excerpt) training branch of an if/else whose condition is not shown
    env = SubprocVecEnv([make_env(env_id, params) for _ in range(4)],
                        start_method='fork')
    policy_kwargs = dict()
    model = A2C('MlpLstmPolicy', env, learning_rate=1e-3, verbose=1,
                n_steps=64, tensorboard_log="/tmp", gamma=0.99,
                policy_kwargs=policy_kwargs)
    model.learn(total_timesteps=int(params["steps"]))
    print("Done learning, saving model")
    model.save("agents/SBL_{}".format(params["ID"]))
    print("Saved model, closing env")
    env.close()
    print("Finished training with ID: {}".format(ID))
else:
    # env_vec = SubprocVecEnv([make_env(env_id, params) for _ in range(4)], start_method='fork')
    env = env_id(params["env_list"], max_n_envs=1, specific_env_len=70,
                 s_len=150, walls=True, target_vel=params["target_vel"],
                 use_contacts=params["use_contacts"])
    print("Testing")
    policy_name = "QWZ"  # LX3: joints + contacts + yaw
    policy_path = 'agents/SBL_{}'.format(policy_name)
    model = A2C.load(policy_path)
def main():
    args = get_args()
    log_dir = create_log_dir(args)
    if not args.test:
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    SEED = 721
    if args.ram_obs or args.env == "slimevolley_v0":
        obs_type = 'ram'
    else:
        obs_type = 'rgb_image'

    # TODO: used for providing spaces info; could also modify the SubprocVecEnv wrapper
    env = make_env(args.env, SEED, obs_type=obs_type)

    # https://stable-baselines.readthedocs.io/en/master/guide/vec_envs.html?highlight=multiprocessing
    envs = SubprocVecEnv([
        lambda: make_env(args.env, obs_type=obs_type)
        for _ in range(args.num_envs)
    ], start_method='spawn')
    # envs.seed(np.random.randint(1000, size=args.num_envs).tolist())  # random seeding
    envs.seed(SEED)  # fixed seeding

    state_spaces = env.observation_spaces
    action_spaces = env.action_spaces
    print('state_spaces: ', state_spaces, ', action_spaces: ', action_spaces)

    learner_args = {'device': args.device}
    env.reset()
    agents = env.agents
    print('agents: ', agents)

    if args.train_both:
        fixed_agents = []
    else:
        # SlimeVolley: the opponent is the first agent; the second is the learnable one
        fixed_agents = ['first_0']

    if obs_type == 'ram':
        model = ParallelMultiPPODiscrete(args.num_envs, agents, state_spaces,
                                         action_spaces, 'MLP', fixed_agents,
                                         learner_args, **hyperparams).to(args.device)
    else:
        model = ParallelMultiPPODiscrete(args.num_envs, agents, state_spaces,
                                         action_spaces, 'CNN', fixed_agents,
                                         learner_args, **hyperparams).to(args.device)

    load_model(model, args)

    path = f"model/{args.env}/"
    os.makedirs(path, exist_ok=True)
    if args.fictitious:
        path = path + 'fictitious_'

    parallel_rollout(envs, model, writer, max_eps=max_eps,
                     max_timesteps=max_timesteps,
                     selfplay_interval=selfplay_interval,
                     render=args.render, model_path=path,
                     against_baseline=args.against_baseline,
                     selfplay=args.selfplay,
                     fictitious=args.fictitious, test=args.test)

    envs.close()
def main():
    args = get_configuration()
    args.state_dim = util.get_state_dim(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir, exist_ok=True)

    if args.graph_embedding:
        class MyPolicy(EmbeddingPolicy):
            def __init__(self, sess, ob_space, ac_space, n_env, n_steps,
                         n_batch, reuse=True, **_kwargs):
                super().__init__(sess, ob_space, ac_space, n_env, n_steps,
                                 n_batch, args, reuse=reuse, **_kwargs)
    else:
        class MyPolicy(EnigmaPolicy):
            def __init__(self, sess, ob_space, ac_space, n_env, n_steps,
                         n_batch, reuse=True, **_kwargs):
                super().__init__(sess, ob_space, ac_space, n_env, n_steps,
                                 n_batch, args, reuse=reuse, **_kwargs)

    t0 = time.time()

    from mpi4py import MPI as mpi
    comm = mpi.COMM_WORLD
    rank = comm.Get_rank()
    world_size = comm.Get_size()

    gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(',')
    gpu_count = len(gpus)
    gpu = gpus[rank % gpu_count]
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    print("My rank is {} out of {}, using GPU {}".format(rank, world_size, gpu))

    if args.model_type == "ppo2":
        from stable_baselines import PPO2 as PPO
        env = SubprocVecEnv([(lambda: ProofEnv.ProofEnv(args))
                             for _ in range(args.parallel_envs)])  # , start_method="spawn")
    elif args.model_type == "ppo1":
        args.parallel_envs = 1
        env = DummyVecEnv([lambda: ProofEnv.ProofEnv(args)])
        # from stable_baselines import PPO1 as PPO
        from ppo import PPO1 as PPO

    if args.saved_model is None:
        myPolicy = MyPolicy
        if args.model_type == "ppo2":
            model = PPO(
                policy=myPolicy,
                env=env,
                n_steps=args.actorbatch,
                # nminibatches=args.optim_stepsize,
                lam=0.95,
                gamma=args.gamma,
                noptepochs=4,
                ent_coef=args.entcoeff,
                learning_rate=lambda f: f * 2.5e-4,
                cliprange=lambda f: f * 0.1,
                verbose=1)
        elif args.model_type == "ppo1":
            model = PPO(myPolicy, env, verbose=2,
                        timesteps_per_actorbatch=args.actorbatch,
                        schedule=args.lr_schedule,
                        optim_stepsize=args.optim_stepsize,
                        entcoeff=args.entcoeff,
                        optim_batchsize=args.optim_batchsize,
                        gamma=args.gamma)
    else:
        print("Loading model from {}".format(args.saved_model))
        model = PPO.load(args.saved_model)
        model.set_env(env)

    counter = 0
    for ind in range(args.parallel_envs):
        env.env_method("set_model", model,
                       indices=list(range(args.parallel_envs)))

    modelfiles = []
    for train_timestep, train_dir in zip(args.train_timesteps, args.train_dirs):
        problem_files = sorted(util.list_problems(train_dir))
        problem_files = util.split_list(problem_files, world_size)[rank]
        problem_files_splitted = util.split_list(problem_files,
                                                 args.parallel_envs,
                                                 extensible=False)
        if args.add_repeating_pretraining:
            for ind in range(args.parallel_envs):
                env.env_method("set_source", problem_files_splitted[ind],
                               indices=[ind], generator_type="repeating")
            # all_thread_timestep = train_timestep * world_size
            print("PRETRAINING")
            model.learn(total_timesteps=train_timestep)
            print("Pretraining on {} finished in {}".format(
                train_dir, util.format_time(time.time() - t0)))

        for ind in range(args.parallel_envs):
            env.env_method("set_source", problem_files_splitted[ind],
                           indices=[ind])
        # all_thread_timestep = train_timestep * world_size
        model.learn(total_timesteps=train_timestep)

        modelfile = "{}/ppo1_fcop_train_{}".format(args.outdir, counter)
        modelfiles.append(modelfile)
        if rank == 0:
            model.save(modelfile)
        # logger.logkv("finished_train_problems", counter)
        counter += 1
        print("Training on {} finished in {}".format(
            train_dir, util.format_time(time.time() - t0)))

    statistics_list = env.get_attr("statistics",
                                   indices=list(range(args.parallel_envs)))
    blacklist_list = env.get_attr("blacklist",
                                  indices=list(range(args.parallel_envs)))
    for i, statistics in enumerate(statistics_list):
        print("ENV {} - {} - blacklist: {}\n".format(rank, i, blacklist_list[i]))
        util.print_problemdict(statistics, rank)
        # for f in statistics:
        #     statistics[f]["mcts"].display_tree([0])

    # util.print_problemdict(env.envs[0].statistics)

    if len(args.train_dirs) > 0 and len(args.train_timesteps) > 0:
        # we did training
        print("We have finished training, rank {}".format(rank))
        # for p in problem_files:
        #     vis_policy.vis_policy(env.envs[0], model, p)
        env.close()
        del env
        del model

    # here we wait for everyone
    comm.Barrier()
    print("We have started evaluation, rank {}".format(rank))

    # evaluation without training
    if (args.saved_model is not None) and (len(args.train_dirs) == 0):
        # no training, just evaluation
        modelfiles = [args.saved_model]

    for evaldir in args.evaldirs:
        for model_index, modelfile in enumerate(modelfiles):
            eval.eval_mpi(args, evaldir, modelfile, model_index)
        # here we wait for everyone
        comm.Barrier()
    # (excerpt) body of a callback whose enclosing def is not shown
    callbacks += 1
    if RENDER_TO_SCREEN:
        locals["self"].env.render()
    # Saves the model every 10000 calls
    if callbacks % 10000 == 0:
        locals['self'].save("models/" + folderName + "/" + runName
                            + "-" + str(callbacks))
    return True  # returning False ends the training

n = 6
env_fns = [lambda: gym.make('gvgai-aliens-lvl0-v0') for _ in range(n)] + \
          [lambda: gym.make('gvgai-aliens-lvl1-v0') for _ in range(n)] + \
          [lambda: gym.make('gvgai-boulderdash-lvl0-v0') for _ in range(n)] + \
          [lambda: gym.make('gvgai-boulderdash-lvl1-v0') for _ in range(n)] + \
          [lambda: gym.make('gvgai-missilecommand-lvl0-v0') for _ in range(n)] + \
          [lambda: gym.make('gvgai-missilecommand-lvl1-v0') for _ in range(n)]

# multiprocess environment
n_cpu = multiprocessing.cpu_count()
venv = SubprocVecEnv(env_fns)
venv = EnvWrapper(venv, (128, 128, 3))  # (110, 300, 3)
model = A2C(ONet, venv, verbose=1,
            tensorboard_log="tensorboard/" + folderName + "/",
            n_steps=stepsUpdate)
model.learn(total_timesteps=int(1e8), tb_log_name=runName, callback=callback)
venv.close()
model.save("models/" + folderName + "/" + runName + "-Final")