def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config, env='BomberMan-v0')
    # Remember the freshly initialized weights of policy_01 before restoring.
    init_w = trainer.get_policy('policy_01').get_weights()
    trainer.restore(
        'C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-25_08-56-43eo23nmho\\checkpoint_002360\\checkpoint-2360'
    )
    # Overwrite policy_01 on every rollout worker with the initial weights.
    trainer.workers.foreach_worker(
        lambda w: w.get_policy('policy_01').set_weights(init_w))
    trainer.restore('.\\kill-policy-0\\checkpoint')
    trainer.import_model()  # NOTE: import_model() requires a model-file path argument.
    iter = 0
    # def update_phase(ev):
    #     ev.foreach_env(lambda e: e.set_phase(phase))
    while True:
        iter += 1
        result = trainer.train()
        if iter % 200 == 0:
            if not os.path.exists(f'./model-{iter}'):
                # trainer.get_policy('policy_01').export_model(f'./model-{iter}')
                trainer.export_policy_model(f'./model-{iter}/main', 'policy_01')
                trainer.export_policy_model(f'./model-{iter}/collect', 'policy_collect')
                trainer.export_policy_model(f'./model-{iter}/destroy', 'policy_destroy')
                trainer.export_policy_model(f'./model-{iter}/kill', 'policy_kill')
            else:
                print("model already saved")
def load_agent():
    # Initialize training environment.
    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRaySinglePlayerEnvironment(
            board_size=13, num_players=4, agent=agent)

    env = environment_creater()
    tune.register_env("tron_single_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure Deep Q-Learning with reasonable values:
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    # config['num_gpus'] = 1
    # config["timesteps_per_iteration"] = 1024
    # config['target_network_update_freq'] = 256
    # config['buffer_size'] = 100_000
    # config['schedule_max_timesteps'] = 200_000
    # config['exploration_fraction'] = 0.02
    # config['compress_observations'] = False
    # config['n_step'] = 2
    # config['seed'] = SEED

    # Configure for PPO:
    # config["sample_batch_size"] = 100
    # config["train_batch_size"] = 200
    # config["sgd_minibatch_size"] = 60

    # Configure A3C with reasonable values.

    # We will use a simple convolution network with 3 layers as our feature extractor.
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'

    # Begin training or evaluation.
    # trainer = DDPGTrainer(config, "tron_single_player")
    # trainer = A3CTrainer(config, "tron_single_player")
    # trainer = DQNTrainer(config, "tron_single_player")
    trainer = PPOTrainer(config, "tron_single_player")
    trainer.restore("./ppo_checkpoint_201/checkpoint-201")

    return trainer  # .get_policy("trainer")
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config, env='BomberMan-v0')
    trainer.restore(
        'C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-26_20-15-082mjvde9i\\checkpoint_008980\\checkpoint-8980'
    )
    iter = 0
    while True:
        iter += 1
        result = trainer.train()
        if iter % 200 == 0:
            if not os.path.exists(f'./model-{iter}'):
                trainer.get_policy('policy_01').export_model(f'./model-{iter}')
            else:
                print("model already saved")
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config, env='BomberMan-v0')
    trainer.restore(
        'C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-16_09-20-44984tj3ip\\checkpoint_002770\\checkpoint-2770'
    )
    iter = 0
    # def update_phase(ev):
    #     ev.foreach_env(lambda e: e.set_phase(phase))
    while True:
        iter += 1
        result = trainer.train()
        if iter % 250 == 0:
            if not os.path.exists(f'./model-{iter}'):
                trainer.get_policy('policy_01').export_model(f'./model-{iter}')
            else:
                print("model already saved")
def train_model(args):
    # We are using a custom model and environment, which need to be registered
    # in ray/rllib. Names can be anything.
    register_env("DuckieTown-MultiMap", lambda _: DiscreteWrapper(MultiMapEnv()))

    # Define the trainer. Apart from env, config/framework and config/model
    # are common among trainers.
    trainer = PPOTrainer(
        env="DuckieTown-MultiMap",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-ppo",
            },
            "sgd_minibatch_size": 64,
            "output": None,
            "compress_observations": True,
            "num_workers": 0,
        }
    )

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    plot = plotter.Plotter('ppo_agent')
    for i in range(args.epochs):  # Number of training iterations (basically epochs).
        print(f'----------------------- Starting epoch {i} ----------------------- ')
        # train() runs a single training iteration.
        result = trainer.train()
        print(result)
        plot.add_results(result)

        # Save the model so far.
        checkpoint_path = trainer.save()
        print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Clean up CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

    plot.plot('PPO DuckieTown-MultiMap')
def build_bot():
    ray.init(local_mode=True)
    trainer = PPOTrainer(env=ExternalAtari, config=dict(**CONFIG_PPO))
    model_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'ckpts')
    last_iter = 0
    for name in os.listdir(model_dir):
        print(name)
        it = int(name.split('_')[1])
        if it > last_iter:
            last_iter = it
    print(
        os.listdir(
            os.path.join(os.path.abspath(os.path.dirname(__file__)),
                         'ckpts/checkpoint_{}'.format(last_iter))))
    trainer.restore(
        os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            'ckpts/checkpoint_{}/checkpoint-{}'.format(last_iter, last_iter)))
    return trainer
def get_trainer(checkpoint_path=None, extra_config=None, num_workers=10):
    config = dict(
        num_gpus=0,
        num_workers=num_workers,
        num_cpus_per_worker=1,
        horizon=1000,
        lr=0.0,
        batch_mode="complete_episodes",
        callbacks=DrivingCallbacks,
        # explore=False,  # Add this line to only use the mean for actions.

        # Setup the correct environment.
        env=GeneralizationRacing,
        env_config=dict(environment_num=10000))
    if extra_config:
        config.update(extra_config)
    trainer = PPOTrainer(config=config)
    if checkpoint_path is not None:
        trainer.restore(os.path.expanduser(checkpoint_path))
    return trainer
def evaluate_model(args):
    if args.model_path == '':
        print('Cannot evaluate model, no --model_path set')
        exit(1)

    def get_env():
        # The simulator env uses a single map, so it is better suited for
        # evaluation/testing. DiscreteWrapper just converts wheel velocities
        # to high-level discrete actions.
        return DiscreteWrapper(simulator.Simulator(
            map_name=args.map,
            max_steps=2000,
        ))

    # Rather than reusing one env instance, another one is created later,
    # because register_env expects a factory function, not an env object.
    register_env('DuckieTown-Simulator', lambda _: get_env())
    trainer = PPOTrainer(
        env="DuckieTown-Simulator",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-ppo",
            },
        },
    )
    trainer.restore(args.model_path)

    sim_env = get_env()

    # Standard OpenAI Gym reset/action/step/render loop.
    # This matches how the `enjoy_reinforcement.py` script works, see:
    # https://git.io/J3js2
    done = False
    observation = sim_env.reset()
    episode_reward = 0
    while not done:
        action = trainer.compute_action(observation)
        observation, reward, done, _ = sim_env.step(action)
        episode_reward += reward
        sim_env.render()

    print(f'Episode complete, total reward: {episode_reward}')
def my_train_fn(config, reporter):
    # Train for n iterations with high LR.
    agent1 = PPOTrainer(env="CartPole-v0", config=config)
    for _ in range(10):
        result = agent1.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()

    # Train for n iterations with low LR.
    config["lr"] = 0.0001
    agent2 = PPOTrainer(env="CartPole-v0", config=config)
    agent2.restore(state)
    for _ in range(10):
        result = agent2.train()
        result["phase"] = 2
        result["timesteps_total"] += phase1_time  # keep time moving forward
        reporter(**result)
    agent2.stop()
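# A minimal sketch of how a custom train function like my_train_fn is typically
# launched through Tune (the starting config values here are assumptions, not
# part of the snippet above):
import ray
from ray import tune
from ray.rllib.agents.ppo import PPOTrainer

if __name__ == "__main__":
    ray.init()
    config = {"lr": 0.01, "num_workers": 0}  # assumed starting config
    # Reserve the same resources a PPOTrainer would normally request for itself.
    resources = PPOTrainer.default_resource_request(config).to_json()
    tune.run(my_train_fn, resources_per_trial=resources, config=config)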
def get_trainer(friction, checkpoint_path=None, extra_config=None):
    config = dict(
        num_gpus=0,
        num_workers=10,
        num_cpus_per_worker=1,
        horizon=1000,
        lr=0.0,
        batch_mode="complete_episodes",
        callbacks=DrivingCallbacks,

        # Setup the correct environment.
        env=GeneralizationRacing,
        env_config=dict(
            # The start seed defaults to 0, so the test environments are unseen before.
            environment_num=200,
            vehicle_config=dict(wheel_friction=friction)))
    if extra_config:
        config.update(extra_config)
    trainer = PPOTrainer(config=config)
    if checkpoint_path is not None:
        trainer.restore(os.path.expanduser(checkpoint_path))
    return trainer
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config)
    if checkpoint_dir:
        trainer.load_checkpoint(checkpoint_dir)

    chk_freq = 10

    if useModelFromLowLevelTrain:
        config_low["num_workers"] = 0
        config_low["num_envs_per_worker"] = 1
        config_low["num_gpus"] = 1
        agentLow = PPOTrainer(config_low)
        agentLow.restore(
            "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".format(
                experiment_name, experiment_id, checkpoint_num, checkpoint_num))

        lowWeight = agentLow.get_policy().get_weights()
        highWeight = trainer.get_policy("low_level_policy").get_weights()
        lowState = agentLow.get_policy().get_state()
        # Re-key the optimizer variables from "default_policy" to "low_level_policy".
        importedOptState = OrderedDict([
            (k.replace("default_policy", "low_level_policy"), v)
            for k, v in lowState["_optimizer_variables"].items()
        ])
        # Map the low-level weights onto the high-level policy's weight keys.
        importedPolicy = {
            hw: lowWeight[lw]
            for hw, lw in zip(highWeight.keys(), lowWeight.keys())
        }
        importedPolicy["_optimizer_variables"] = importedOptState
        trainer.get_policy("low_level_policy").set_state(importedPolicy)
        chk_freq = 1  # Only needed once at the start to save the imported model.

    while True:
        result = trainer.train()
        tune.report(**result)
        if trainer._iteration % chk_freq == 0:
            with tune.checkpoint_dir(step=trainer._iteration) as checkpoint_dir:
                trainer.save(checkpoint_dir)
from ray.tune import function
import pickle
from collections import OrderedDict

from train_config import config_hier, config_low, single_env

if __name__ == "__main__":
    ray.shutdown()
    ray.init(ignore_reinit_error=True)

    agentLow = PPOTrainer(config_low)
    experiment_name = "HWalk_Low_Mimic"
    experiment_id = "PPO_HumanoidBulletEnvLow-v0_699c9_00000_0_2021-04-18_22-14-39"
    checkpoint_num = "1930"
    agentLow.restore(
        "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".format(
            experiment_name, experiment_id, checkpoint_num, checkpoint_num))
    # agent.export_policy_model("out/model", "default_policy")
    # agent.import_model("out/model")
    # agent.get_policy("default_policy").import_model_from_h5

    agentHigh = PPOTrainer(config_hier)

    lowWeight = agentLow.get_policy().get_weights()
    highWeight = agentHigh.get_policy("low_level_policy").get_weights()
    importedPolicy = {
        hw: lowWeight[lw]
        for hw, lw in zip(highWeight.keys(), lowWeight.keys())
    }
    s1 = agentLow.get_policy().get_state()
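    # The excerpt ends before the imported state is applied. A hedged sketch of
    # the likely continuation, mirroring the train() snippet above (the
    # re-keying of "_optimizer_variables" is an assumption based on that
    # snippet, not part of this excerpt):
    importedPolicy["_optimizer_variables"] = OrderedDict([
        (k.replace("default_policy", "low_level_policy"), v)
        for k, v in s1["_optimizer_variables"].items()
    ])
    agentHigh.get_policy("low_level_policy").set_state(importedPolicy)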
"episodes_this_iter": "train_episodes", "policy_reward_mean/main": "reward", "win_rate": "win_rate", "league_size": "league_size", }, sort_by_metric=True, ), ) # Restore trained trainer (set to non-explore behavior) and play against # human on command line. if args.num_episodes_human_play > 0: num_episodes = 0 trainer = PPOTrainer(config=dict(config, **{"explore": False})) if args.from_checkpoint: trainer.restore(args.from_checkpoint) else: checkpoint = results.get_last_checkpoint() if not checkpoint: raise ValueError("No last checkpoint found in results!") trainer.restore(checkpoint) # Play from the command line against the trained agent # in an actual (non-RLlib-wrapped) open-spiel env. human_player = 1 env = Environment(args.env) while num_episodes < args.num_episodes_human_play: print("You play as {}".format("o" if human_player else "x")) time_step = env.reset() while not time_step.last():
    config['rollout_fragment_length'] = rollout_fragment_length
    adjust_config(config, spec['run'])

    if args.mode == "load":
        adjust_config_for_loading(config, spec['run'])
        if spec["run"] == "PPO":
            from ray.rllib.agents.ppo import PPOTrainer as Trainer
        else:
            raise NotImplementedError("Not a supported algorithm")
        trainer = Trainer(env=env_module.env_cls, config=config)
        if args.checkpoint is not None:
            trainer.restore(args.checkpoint)
        env_module.rm.initialize()
        env = env_module.env_cls(config['env_config'])
        cam = env_module.default_cam()
        renderer = env_module.EnvRenderer(trainer=trainer, env=env, cam=cam)
        renderer.run()
    else:
        tune.run(
            spec['run'],
            name=spec['name'],
            stop=spec['stop'],
            local_dir=spec['local_dir'],
            checkpoint_freq=spec['checkpoint_freq'],
    # Whether to shuffle sequences in the batch when training.
    agent_cfg["shuffle_sequences"] = True
    # Clamp the norm of the gradient during optimization (None to disable).
    agent_cfg["grad_clip"] = None

    # ====================== Run the optimization ======================
    agent_cfg["lr"] = 1.0e-4
    agent_cfg["lr_schedule"] = None
    train_agent = Trainer(agent_cfg, "env", logger_creator)
    checkpoint_path = train(train_agent, max_timesteps=100000)

    # ===================== Enjoy the trained agent ======================
    test_agent = Trainer(agent_cfg, "env", logger_creator)
    test_agent.restore(checkpoint_path)
    test(test_agent, explore=False)

    # =================== Terminate the Ray backend ====================
    train_agent.stop()
    test_agent.stop()
    ray.shutdown()
            }
        },
        "multiagent": {
            "policies": env_list,
            "policy_mapping_fn": lambda agent_id: agent_id
        },
        "lr": 3e-4,
        "num_sgd_iter": 5,
        "vf_loss_coeff": 0.0003,
        "log_level": "WARN",
        "clip_param": 10.0,
        "vf_clip_param": 10.0
    }

    trainer = PPOTrainer(env="fire_mage", config=rnn_config)
    trainer.restore('./checkpoints_iter_24/checkpoint_138/checkpoint-138')
    # trainer.restore('./checkpoints_iter_20/checkpoint_325/checkpoint-325')
    # trainer.restore('./checkpoints_iter_13/checkpoint_193/checkpoint-193')
    # trainer.restore('./checkpoints_iter_12/checkpoint_206/checkpoint-206')

    state_list = []
    for key, val in env_list.items():
        dummy_model = RNNModel(val[1], val[2], 0, rnn_config['model'], 'happy')
        state = dummy_model.get_initial_state()
        state_list.append((key, [s.detach().numpy() for s in state]))
    state_list = dict(state_list)

    iters = 100
    best_ckpt = 1
    ckpt_to_restore = None
    # Restore the latest checkpoint if one exists:
    for ckpt in os.listdir(ckpt_dir):
        if ckpt == ".gitkeep":
            continue
        ckpt_indx = int(ckpt.split("_")[1])
        if ckpt_indx > best_ckpt:
            best_ckpt = ckpt_indx
    if best_ckpt > 1:
        ckpt_to_restore = os.path.join(ckpt_dir,
                                       "checkpoint_" + str(best_ckpt),
                                       "checkpoint-" + str(best_ckpt))
        trainer.restore(ckpt_to_restore)
        print("Checkpoint number " + str(best_ckpt) + " restored")
    else:
        print("No checkpoint found, training starting from scratch...")

    # Serving and training loop.
    env = trainer.env_creator({})
    # obs_state = {}
    # obs_state["obs"] = obs[list(obs.keys())[0]]
    player1 = Connect4Config.PLAYER1
    player1_id = Connect4Config.PLAYER1_ID
    player2 = Connect4Config.PLAYER2
    player2_id = Connect4Config.PLAYER2_ID
    actual_player = player1
    actual_player_id = player1_id
    obs = env.reset(player1_id)
    return env

env = env_creator()
env_name = 'pistonball_v4'
register_env(env_name, lambda config: PettingZooEnv(env_creator()))

with open(params_path, "rb") as f:
    config = pickle.load(f)
    # num_workers is not needed since we are not training.
    del config['num_workers']
    del config['num_gpus']

ray.init(num_cpus=8, num_gpus=1)
PPOagent = PPOTrainer(env=env_name, config=config)
PPOagent.restore(checkpoint_path)

reward_sum = 0
frame_list = []
i = 0
env.reset()

for agent in env.agent_iter():
    observation, reward, done, info = env.last()
    reward_sum += reward
    if done:
        action = None
    else:
        action, _, _ = PPOagent.get_policy("policy_0").compute_single_action(observation)
def run_saved(args):
    if args.OSM[0] == 1 and args.OSM[1] == 0:
        setting = "RLvsOSM"
    elif args.OSM[0] == 1 and args.OSM[1] == 1:
        setting = "OSMvsOSM"
    else:
        setting = "RL{0}".format(len(args.alphas) - sum(args.honest))
    if args.save_path == 'none':
        checkpointnum = 0
    else:
        checkpointnum = args.save_path.split('-')[-1]
    env_name = "{setting}_{spirit}_{blocks}_{alpha:04d}_{spy}_{checkpointnum}".format(
        spirit=int(args.team_spirit * 100),
        blocks=int(args.blocks),
        alpha=int(args.alphas[0] * 10000),
        spy=args.spy[1],
        setting=setting,
        checkpointnum=checkpointnum)
    ray.init(local_mode=True,
             memory=700 * 1024 * 1024,
             object_store_memory=100 * 1024 * 1024,
             driver_object_store_memory=100 * 1024 * 102)
    print("Testing {0}".format(setting), env_name)

    def select_policy(agent_id):
        return agent_id

    ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
    register_env(env_name, lambda config: ParametricBitcoin(config))

    if args.extended:
        action_n = 6
    else:
        action_n = 4

    # Define the state spaces: one for parties that have access to spy info
    # and one for those without.
    spy_state_space = constants.make_spy_space(len(args.alphas), args.blocks)
    blind_state_space = constants.make_blind_space(len(args.alphas), args.blocks)
    policies = dict()
    osm_space = spaces.Box(
        low=np.zeros(4),
        high=np.array([args.blocks + 4, args.blocks + 4, args.blocks + 4, 3.]))
    if sum(args.OSM) > 0:
        osm = OSM_strategy(
            osm_space, spaces.Discrete(4), {
                'alpha': args.alphas[0],
                'gamma': args.gammas[0],
                'blocks': args.blocks
            })
    blind_dim = 0
    for space in blind_state_space:
        blind_dim += get_preprocessor(space)(space).size
    spy_dim = 0
    for space in spy_state_space:
        spy_dim += get_preprocessor(space)(space).size
    spy_state_space_wrapped = spaces.Dict({
        "action_mask": spaces.Box(0, 1, shape=(action_n, )),
        "avail_actions": spaces.Box(-10, 10, shape=(action_n, action_n)),
        "bitcoin": spaces.Box(0, np.inf, shape=(spy_dim, ))
    })
    blind_state_space_wrapped = spaces.Dict({
        "action_mask": spaces.Box(0, 1, shape=(action_n, )),
        "avail_actions": spaces.Box(-10, 10, shape=(action_n, action_n)),
        "bitcoin": spaces.Box(0, np.inf, shape=(blind_dim, ))
    })
    preps = [None for i in range(len(args.alphas))]
    for i in range(len(args.alphas)):
        if args.spy[i] == 1:
            policies[str(i)] = (None, spy_state_space_wrapped,
                                spaces.Discrete(action_n), {
                                    "model": {
                                        "use_lstm": args.use_lstm,
                                        "custom_model": "pa_model",
                                        "custom_options": {
                                            "parties": len(args.alphas),
                                            "spy": True,
                                            "blocks": args.blocks,
                                            "extended": args.extended
                                        }
                                    }
                                })
            preps[i] = get_preprocessor(spy_state_space_wrapped)(
                spy_state_space_wrapped)
        elif args.OSM[i] == 1:
            policies[str(i)] = (OSM_strategy, osm_space, spaces.Discrete(4), {
                'alpha': args.alphas[0],
                'gamma': args.gammas[0],
                'blocks': args.blocks
            })
        elif args.honest[i] == 1:
            policies[str(i)] = (Honest, osm_space, spaces.Discrete(6), {
                'alpha': args.alphas[0],
                'gamma': args.gammas[0],
                'blocks': args.blocks,
                'fiftyone': args.fiftyone[i],
                'extended': args.extended
            })
        else:
            policies[str(i)] = (None, blind_state_space_wrapped,
                                spaces.Discrete(action_n), {
                                    "model": {
                                        "use_lstm": args.use_lstm,
                                        "custom_model": "pa_model",
                                        "custom_options": {
                                            "parties": len(args.alphas),
                                            "spy": False,
                                            "blocks": args.blocks,
                                            "extended": args.extended
                                        }
                                    }
                                })
            preps[i] = get_preprocessor(blind_state_space_wrapped)(
                blind_state_space_wrapped)
    env_config = {
        'max_hidden_block': args.blocks,
        'alphas': args.alphas,
        'gammas': args.gammas,
        'ep_length': args.ep_length,
        'print': args.debug,
        'spy': args.spy,
        'team_spirit': args.team_spirit,
        'OSM': args.OSM,
        'extended': args.extended,
        'honest': args.honest,
    }
    policies_to_train = [
        str(i) for i in range(len(args.alphas))
        if args.OSM[i] != 1 and args.honest[i] != 1
    ]
    env = ParametricBitcoin(env_config=env_config)
    if len(policies_to_train) != 0:
        if args.trainer == 'PPO':
            trainer = PPOTrainer(
                env=BitcoinEnv,
                config={
                    "num_workers": 0,
                    "multiagent": {
                        "policies_to_train": policies_to_train,
                        "policies": policies,
                        "policy_mapping_fn": select_policy,
                    },
                    "env_config": env_config
                })
        else:
            trainer = DQNTrainer(
                env=env_name,
                config={
                    "eager": True,
                    "multiagent": {
                        "policies_to_train": policies_to_train,
                        "policies": policies,
                        "policy_mapping_fn": select_policy,
                    },
                    "env_config": env_config
                })
        model = trainer.get_policy().model
        print(model.base_model.summary())
        print("Restoring model")
        trainer.restore(args.save_path)
    loaded_policies = dict()
    for k in range(len(args.alphas)):
        if args.OSM[k] == 1:
            loaded_policies[str(k)] = osm
        elif args.honest[k] == 1:
            honest = Honest(
                osm_space, spaces.Discrete(6), {
                    'alpha': args.alphas[0],
                    'gamma': args.gammas[0],
                    'blocks': args.blocks,
                    'fiftyone': args.fiftyone[k],
                    'extended': args.extended
                })
            loaded_policies[str(k)] = honest
            preps[k] = None
        else:
            loaded_policies[str(k)] = trainer.get_policy(str(k))
    trials = 100000
    reslist = []
    for j in range(3):
        blocks = np.zeros(len(args.alphas) + 1)
        event_blocks = np.zeros(len(args.alphas) + 1)
        action_dist = {
            str(i): np.zeros(action_n)
            for i in range(len(args.alphas))
        }
        res = dict()
        for i in range(trials):
            obs = env.reset()
            isDone = False
            RNNstates = {str(i): [] for i in range(len(args.alphas))}
            while not isDone:
                action_dict = dict()
                for k in range(len(policies)):
                    prep = preps[k]
                    if not prep:
                        action_dict[str(k)], _, _ = loaded_policies[str(k)] \
                            .compute_single_action(obs=obs[str(k)], state=[])
                    else:
                        action_dict[str(k)], _, _ = loaded_policies[str(k)] \
                            .compute_single_action(
                                obs=prep.transform(obs[str(k)]), state=[])
                    action_dist[str(k)][action_dict[str(k)]] += 1
                obs, _, done, _ = env.step(action_dict)
                isDone = done['__all__']
            if i == 0 and j == 0:
                with open(
                        os.path.join('/afs/ece/usr/charlieh/eval_results',
                                     env_name + '_trace.txt'), 'w+') as f:
                    f.write(env.wrapped._debug_string)
            blocks += env.wrapped._accepted_blocks
            event_blocks += env.wrapped._total_blocks
            total_event_blocks = np.sum(event_blocks)
            if i % 100 == 0:
                print("Relative rewards", blocks / np.sum(blocks))
                print("Relative received", event_blocks / total_event_blocks)
                for p in range(len(args.alphas)):
                    print("Action dist", str(p),
                          action_dist[str(p)] / np.sum(action_dist[str(p)]))
        res['blocks'] = blocks
        res['action dist'] = action_dist
        res['blocks norm'] = blocks / np.sum(blocks)
        res['actions norm'] = {
            str(i): action_dist[str(i)] / np.sum(action_dist[str(i)])
            for i in range(len(args.alphas))
        }
        reslist.append(res)
    np.save(os.path.join('/afs/ece/usr/charlieh/eval_results', env_name),
            reslist,
            allow_pickle=True)
"model": { "custom_model": "3rd_model", "use_lstm": True, } }), }, "policy_mapping_fn": (lambda agent_id: "ppo_policy"), "policies_to_train": ["ppo_policy"], }, }, env=v0.RllibPomme) # fdb733b6 checkpoint = 950 checkpoint_dir = "/home/nhatminh2947/ray_results/3rd_model_no_wood_static/PPO_PommeMultiAgent_283d4406_0_2020-03-24_04-09-09mjgzr90e" ppo_agent.restore("{}/checkpoint_{}/checkpoint-{}".format( checkpoint_dir, checkpoint, checkpoint)) agents_list = [ agents.StaticAgent(), agents.StaticAgent(), agents.StaticAgent(), agents.StaticAgent() ] env_id = "PommeTeam-nowood-v0" env = pommerman.make(env_id, agents_list) penv = v0.RllibPomme({ "agent_names": agent_names, "env_id": env_id, "phase": 0 })
# config['seed'] = SEED

# We will use a simple convolution network with 3 layers as our feature extractor.
config['model']['vf_share_layers'] = True
config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
config['model']['fcnet_hiddens'] = [256]
config['model']['custom_preprocessor'] = 'tron_prep'

# Begin training or evaluation.
trainer = PPOTrainer(config, "tron_single_player")

num_epoch = 10000
test_epoch = 2
if LOAD_FROM_CHECKPOINT:
    # np.random.seed(42)
    trainer.restore("./ppo_model/checkpoint_400/checkpoint-400")
    for epoch in range(num_epoch):
        print("Training iteration: {}".format(epoch), end='')
        res = trainer.train()
        print(f", Average reward: {res['episode_reward_mean']}")
        if epoch % test_epoch == 0:
            reward = env.test(trainer)
        if epoch % 300 == 0:
            trainer.save()
    trainer.save()
else:
    for epoch in range(num_epoch):
        # print(type(trainer))
        print("Training iteration: {}".format(epoch), end='')
def load_agent():
    # Initialize training environment.
    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRayEnvironment(board_size=13, num_players=4)

    env = environment_creater()
    tune.register_env("tron_multi_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure Deep Q-Learning with reasonable values:
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    # config['num_gpus'] = 1
    # config["timesteps_per_iteration"] = 1024
    # config['target_network_update_freq'] = 256
    # config['buffer_size'] = 100_000
    # config['schedule_max_timesteps'] = 200_000
    # config['exploration_fraction'] = 0.02
    # config['compress_observations'] = False
    # config['n_step'] = 2
    # config['seed'] = SEED

    # Configure for PPO:
    # config["sample_batch_size"] = 100
    # config["train_batch_size"] = 200
    # config["sgd_minibatch_size"] = 60

    # Configure A3C with reasonable values.

    # We will use a simple convolution network with 3 layers as our feature extractor.
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'

    # All of the models will use the same network as before.
    agent_config = {
        "model": {
            "vf_share_layers": True,
            "conv_filters": [(512, 5, 1), (256, 3, 2), (128, 3, 2)],
            "fcnet_hiddens": [256],
            "custom_preprocessor": 'tron_prep'
        }
    }

    def policy_mapping_function(x):
        if x == '0':
            return "trainer"
        return "opponent"

    config['multiagent'] = {
        "policy_mapping_fn": policy_mapping_function,
        "policies": {
            "trainer": (None, env.observation_space, env.action_space,
                        agent_config),
            "opponent": (None, env.observation_space, env.action_space,
                         agent_config)
        },
        "policies_to_train": ["trainer"]
    }

    # Begin training or evaluation.
    # trainer = DDPGTrainer(config, "tron_single_player")
    # trainer = A3CTrainer(config, "tron_single_player")
    # trainer = MARWILTrainer(config, "tron_single_player")
    trainer = PPOTrainer(config, "tron_multi_player")
    trainer.restore("./sp_checkpoint_1802/checkpoint-1802")

    return trainer.get_policy("trainer")
NUM_GPUS = args.num_gpus
TOTAL_STEPS = int(args.total_steps)
launch_script = "./launchClient_quiet.sh"

register_env(ENV_NAME, create_env)

# Update the config with evaluation resources and switch exploration off.
config = get_config(checkpoint_file)
config["num_workers"] = args.num_workers
config["num_gpus"] = args.num_gpus
config["explore"] = False

# Load the agent.
ray.init()
trainer = PPOTrainer(config)
trainer.restore(checkpoint_file)
policy = trainer.get_policy()

# Start Malmo instances.
GAME_INSTANCE_PORTS = [COMMAND_PORT + i for i in range(NUM_WORKERS)]
instances = launch_minecraft(GAME_INSTANCE_PORTS, launch_script=launch_script)

# Connect to the Java instances.
env = create_env(config)

# Custom evaluation loop.
print(f"running evaluations for {EPISODES} episodes")
for ep in range(EPISODES):
    state = env.reset()
    done = False
# noptepochs (int): number of epochs when optimizing the surrogate.
ppo_config['num_workers'] = 4
# cliprange (float or callable): clipping parameter; it can be a function.
ppo_config['clip_param'] = 0.2
# cliprange_vf (float or callable): clipping parameter for the value function;
# it can be a function. This is a parameter specific to the OpenAI
# implementation. If None is passed (default), then cliprange (the one used for
# the policy) will be used. IMPORTANT: this clipping depends on the reward
# scaling. To deactivate value-function clipping (and recover the original PPO
# implementation), pass a negative value (e.g. -1).
ppo_config['vf_clip_param'] = 1
ppo_config['env_config'] = env_config
ppo_config['train_batch_size'] = 4000
ppo_config['explore'] = False

PPO_agent = PPOTrainer(config=ppo_config, env=SSA_Tasker_Env)
PPO_agent.restore(ppo_checkpoint)
PPO_agent.get_policy().config['explore'] = False

logdir = ('/home/ash/ray_results/ssa_experiences/agent_visible_greedy_spoiled/'
          + str(env_config['rso_count']) + 'RSOs_jones_flatten_10000episodes/')

marwil_config = MARWIL_CONFIG.copy()
marwil_config['evaluation_num_workers'] = 1
marwil_config['env_config'] = env_config
marwil_config['evaluation_interval'] = 1
marwil_config['evaluation_config'] = {'input': 'sampler'}
marwil_config['beta'] = 1  # 0
marwil_config['input'] = logdir
marwil_config['env_config'] = env_config
marwil_config['explore'] = False
"multiagent": { "policies": policies, "policy_mapping_fn": policy_mapping, "policies_to_train": policies_to_train, }, "observation_filter": "NoFilter", "clip_actions": False, "framework": "torch" }, env="MinerEnv-v0") id = 2050 checkpoint_dir = "/home/lucius/ray_results/gold_miner_2/PPO_MinerEnv-v0_0_2020-09-13_00-54-26q3mjnpej" checkpoint = "{}/checkpoint_{}/checkpoint-{}".format(checkpoint_dir, id, id) ppo_agent.restore(checkpoint) for i in range(8): mem_size = 0 weights = ppo_agent.get_policy(f"policy_{i}").get_weights() for key in weights: parameters = 1 for value in weights[key].shape: parameters *= value mem_size += parameters weights[key] = torch.tensor(weights[key]) print(mem_size) torch.save( weights,
config['multiagent'] = {
    "policy_mapping_fn": policy_mapping_function,
    "policies": {
        "trainer": (None, env.observation_space, env.action_space,
                    agent_config),
        "opponent": (None, env.observation_space, env.action_space,
                     agent_config)
    },
    "policies_to_train": ["trainer"]
}

trainer = PPOTrainer(config, "tron_multi_player")
# trainer.restore("./desktop_version/checkpoint_1802/checkpoint-1802")
trainer.restore("./ppo_selfplay/sp_checkpoint_2257/checkpoint-2257")

num_epoch = 1000
save_epochs = 50
update_times = 0
# update_percentage = update_times * 0.01
epoch_update = 0

for epoch in range(num_epoch):
    print("Training iteration: {}".format(epoch), end='\t')
    res = trainer.train()
    win_percentage = (res["policy_reward_mean"]["trainer"] -
                      res["episode_len_mean"]) / 11 - 10 / 11 + 1
    print("Win percentage: ", win_percentage, end='\t')
    print("Average reward: ", res["policy_reward_mean"]["trainer"])
    update_percentage = update_times * 0.01
"timesteps_total": args.stop_timesteps, "episode_reward_mean": args.stop_reward, } results = tune.run( args.run, config=config, stop=stop, verbose=2, checkpoint_at_end=True) if args.as_test: check_learning_achieved(results, args.stop_reward) checkpoints = results.get_trial_checkpoints_paths( trial=results.get_best_trial("episode_reward_mean", mode="max"), metric="episode_reward_mean") checkpoint_path = checkpoints[0][0] trainer = PPOTrainer(config) trainer.restore(checkpoint_path) # Inference loop. env = StatelessCartPole() # Run manual inference loop for n episodes. for _ in range(10): episode_reward = 0.0 reward = 0.0 action = 0 done = False obs = env.reset() while not done: # Create a dummy action using the same observation n times, # as well as dummy prev-n-actions and prev-n-rewards. action, state, logits = trainer.compute_single_action(
    config = {**env_config, **agent_config, **general_config}
    agent = PPOTrainer(config=config)
elif args.run == "SAC":
    agent_config = config_SAC
    config = {**env_config, **agent_config, **general_config}
    agent = SACTrainer(config=config)
elif args.run == "DDPG":
    agent_config = config_DDPG
    config = {**env_config, **agent_config, **general_config}
    agent = DDPGTrainer(config=config)

# '/home/david/ray_results/SAC/SAC_FarmEnv_ff600_00000_0_2021-02-06_14-34-11/checkpoint_50/checkpoint-50'
checkpoint_path = '/home/david/ray_results/SAC/SAC_FarmEnv_305d2_00000_0_2021-03-24_08-40-22/checkpoint_10/checkpoint-10'
agent.restore(checkpoint_path=checkpoint_path)

font = pygame.font.Font('freesansbold.ttf', 20)
textX = 10
textY = 10

# Arrow indicating the wind direction.
arrow_Img = pygame.image.load('wind-compass.png')
arrow_x = 250
arrow_y = 5

screen = pygame.display.set_mode((800, 600))
pygame.display.set_caption('WindAI')

def update_env():
config['num_workers'] = 1
config['num_gpus'] = 1
config['framework'] = "torch"
config['gamma'] = 0.1
config['monitor'] = False

# PPO config ...
# config['lr'] = 1e-4
# config['train_batch_size']
config['model']['dim'] = 21
config['model']['conv_filters'] = [[8, [4, 4], 2], [16, [2, 2], 2],
                                   [512, [6, 6], 1]]
# [config['train_batch_size'], 4, 1, 1]]

# trainer = PPOTrainer(config=config, env="mars_explorer:explorer-v01")
trainer = PPOTrainer(config=config, env="custom-explorer")

# import pdb; pdb.set_trace()
PATH = "/home/dkoutras/ray_results/290_out_of_400/checkpoint_2991/checkpoint-2991"
trainer.restore(PATH)

import pdb
pdb.set_trace()

for _ in range(10):
    initial_time = time.time()
    result = trainer.train()
    print(
        f"mean:{result['episode_reward_mean']} time:{time.time() - initial_time:.2f}[sec]"
    )
        verbose=1,
        checkpoint_freq=1,
        checkpoint_at_end=True,
    )
    print("Pre-training done.")

    best_checkpoint = results.get_best_checkpoint(results.trials[0], mode="max")
    print(f".. best checkpoint was: {best_checkpoint}")

    # Create a new dummy Trainer to "fix" our checkpoint.
    new_trainer = PPOTrainer(config=config)
    # Get untrained weights for all policies.
    untrained_weights = new_trainer.get_weights()
    # Restore all policies from checkpoint.
    new_trainer.restore(best_checkpoint)
    # Set back all weights (except for 1st agent) to original
    # untrained weights.
    new_trainer.set_weights(
        {pid: w for pid, w in untrained_weights.items() if pid != "policy_0"})
    # Create the checkpoint from which tune can pick up the experiment.
    new_checkpoint = new_trainer.save()
    new_trainer.stop()
    print(".. checkpoint to restore from (all policies reset, "
          f"except policy_0): {new_checkpoint}")

    print("Starting new tune.run")

    # Start our actual experiment.