import os

import dill
import ray
from ray.rllib.agents import dqn
from ray.rllib.agents.ppo import PPOTrainer
from ray.tune.registry import register_env

# SSBMEnv and RllibDQNModel are project-local and imported elsewhere.


def jim_load(path, env_params, jim=True):
    if not jim:
        # Read in the params that were used to create the trainer.
        config_path = os.path.join(os.path.dirname(path), "config.pkl")
        with open(config_path, "rb") as f:
            # We use dill (instead of pickle) here because we must
            # deserialize functions.
            config = dill.load(f)
        config["rllib_params"]["env_config"] = env_params
    else:
        model_params = {
            "NUM_HIDDEN_LAYERS": 0,
            "SIZE_HIDDEN_LAYERS": 256,
            "NUM_FILTERS": 64,
            "NUM_CONV_LAYERS": 3,
        }
        config = {
            "model": {
                "custom_model_config": model_params,
                "custom_model": RllibDQNModel,
            },
            "gamma": 0.995,
            "framework": "torch",
            "env_config": env_params,
            "hiddens": [256, 256],
            "output": "brawl-training/results",
            "lr": 1e-4,
            "v_min": -300.0,
            "v_max": 300.0,
            "noisy": True,
            "sigma0": 0.2,
            "n_step": 5,
            "exploration_config": {
                "type": "EpsilonGreedy",
                "initial_epsilon": 1.0,
                "final_epsilon": 0.01,
                "epsilon_timesteps": 200000,
            },
        }

    ray.shutdown()
    ray.init()

    def env_creator(env_config):
        return SSBMEnv(**env_config)

    register_env("SSBM", env_creator)

    if jim:
        trainer = dqn.DQNTrainer(env="SSBM", config=config)
    else:
        trainer = PPOTrainer(env="melee", config=config["rllib_params"])
    trainer.restore(path)
    return trainer
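# A minimal evaluation sketch using the restored trainer. This is an
# assumption-laden example, not part of the original: the checkpoint path is a
# placeholder, and `env_params` is assumed to match the training setup above.
trainer = jim_load("brawl-training/results/checkpoint-100", env_params)
env = SSBMEnv(**env_params)
obs = env.reset()
done = False
total_reward = 0.0
while not done:
    action = trainer.compute_action(obs)
    obs, reward, done, info = env.step(action)
    total_reward += reward
print(f"Episode reward: {total_reward}")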
import time

import gym
from ray.rllib.agents.ppo import PPOTrainer

# CustomKukaEnv and the PPO `config` dict are defined elsewhere in the module.


def main(checkpoint):
    env = CustomKukaEnv(dict(renders=True, isDiscrete=False, maxSteps=10000000))

    class EnvPlaceholder(gym.Env):
        """Stub env exposing only the spaces RLlib needs to rebuild the trainer."""

        def __init__(self, env_config):
            super(EnvPlaceholder, self).__init__()
            self.observation_space = env.observation_space
            self.action_space = env.action_space

    trainer = PPOTrainer(config=config, env=EnvPlaceholder)
    trainer.restore(checkpoint)

    done = False
    i = 0
    while not done:
        time.sleep(0.01)
        # NOTE: actions are sampled randomly here; the restored trainer is not queried.
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        obs = env.getExtendedObservation()
        print(i)
        print(f"Action: {action}")
        print(f"Observation: {obs}")
        i += 1
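# Hypothetical variant, not from the original: drive the rollout with the
# restored policy instead of random sampling. `trainer` and `env` are assumed
# to be constructed exactly as in main() above.
def policy_rollout(trainer, env):
    state = env.reset()
    done = False
    while not done:
        action = trainer.compute_action(state)
        state, reward, done, info = env.step(action)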
ray.init()

# `config`, `envs`, and `agents` are defined earlier in the script.
results = {}
N = 100  # evaluation episodes per agent
config["num_workers"] = 1
config["num_gpus"] = 0

# You may have to run each agent in a separate session
# to avoid PyBullet restrictions.
agent = "ALP-GMM"
# agent = "Manual"
# agent = "No Curriculum"

print(f"Evaluating agent: {agent}")
results[agent] = []
trainer = PPOTrainer(config=config, env=envs[agent])
trainer.restore(agents[agent])
env = envs[agent](dict(config["env_config"], **{"in_training": False}))

for i in range(N):
    print(agent, i)
    done = False
    obs = env.reset()
    ep_reward = 0
    while not done:
        action = trainer.compute_action(obs)
        obs, reward, done, info = env.step(action)
        ep_reward += reward
        if done:
            obs = env.reset()
    results[agent].append(ep_reward)

print(f"Agent {agent} score: {np.round(np.mean(results[agent]), 2)}")
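# Hypothetical sketch, not from the original: evaluating all three agents in
# one session. Note the caveat above -- PyBullet may refuse multiple GUI
# instances, which is why the original runs each agent separately.
for agent in ["ALP-GMM", "Manual", "No Curriculum"]:
    results[agent] = []
    trainer = PPOTrainer(config=config, env=envs[agent])
    trainer.restore(agents[agent])
    env = envs[agent](dict(config["env_config"], **{"in_training": False}))
    for i in range(N):
        done = False
        obs = env.reset()
        ep_reward = 0
        while not done:
            obs, reward, done, info = env.step(trainer.compute_action(obs))
            ep_reward += reward
        results[agent].append(ep_reward)
    print(f"Agent {agent} score: {np.round(np.mean(results[agent]), 2)}")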
ray.init()

num_policies = 4
policies = {
    "policy_{}".format(i): (None, env.observation_space, env.action_space, {})
    for i in range(num_policies)
}
policy_ids = list(policies.keys())

config = {
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": (lambda agent_id: random.choice(policy_ids)),
    },
    "framework": "tf",
}

# trainer = ApexTrainer(env=TicTacToe, config=config)
trainer = PPOTrainer(env=TicTacToe, config=config)
trainer.restore("ttt_model/checkpoint_51/checkpoint-51")

obs = env.reset()
print(obs)
done = False
while not done:
    env.render()
    player = list(obs)[0]
    if player == "X":
        action = int(input(f"Player {player} - enter action 1-9:")) - 1
    else:
        action = trainer.compute_action(np.array(obs["O"]), policy_id="policy_1")
    obs, rewards, dones, infos = env.step({player: action})
    done = dones["__all__"]
    print(obs, rewards, dones, infos)
env.render()
def __init__(self, ticker_list, time_interval, drl_lib, agent, cwd, net_dim,
             state_dim, action_dim, API_KEY, API_SECRET, APCA_API_BASE_URL,
             tech_indicator_list, turbulence_thresh=30, max_stock=1e2,
             latency=None):
    # Load agent.
    self.drl_lib = drl_lib
    if agent == 'ppo':
        if drl_lib == 'elegantrl':
            from elegantrl.agent import AgentPPO
            from elegantrl.run import Arguments, init_agent

            config = {
                'state_dim': state_dim,
                'action_dim': action_dim,
            }
            args = Arguments(agent=AgentPPO, env=StockEnvEmpty(config))
            args.cwd = cwd
            args.net_dim = net_dim
            # Load agent.
            try:
                agent = init_agent(args, gpu_id=0)
                self.act = agent.act
                self.device = agent.device
            except BaseException:
                raise ValueError("Fail to load agent!")

        elif drl_lib == 'rllib':
            from ray.rllib.agents import ppo
            from ray.rllib.agents.ppo.ppo import PPOTrainer

            config = ppo.DEFAULT_CONFIG.copy()
            config['env'] = StockEnvEmpty
            config["log_level"] = "WARN"
            config['env_config'] = {
                'state_dim': state_dim,
                'action_dim': action_dim,
            }
            trainer = PPOTrainer(env=StockEnvEmpty, config=config)
            try:
                trainer.restore(cwd)
                self.agent = trainer
                print("Restoring from checkpoint path", cwd)
            except BaseException:
                raise ValueError('Fail to load agent!')

        elif drl_lib == 'stable_baselines3':
            from stable_baselines3 import PPO

            try:
                # Load agent.
                self.model = PPO.load(cwd)
                print("Successfully load model", cwd)
            except BaseException:
                raise ValueError('Fail to load agent!')

        else:
            raise ValueError(
                'The DRL library input is NOT supported yet. Please check your input.'
            )
    else:
        raise ValueError('Agent input is NOT supported yet.')

    # Connect to the Alpaca trading API.
    try:
        self.alpaca = tradeapi.REST(API_KEY, API_SECRET, APCA_API_BASE_URL, 'v2')
    except BaseException:
        raise ValueError(
            'Fail to connect Alpaca. Please check account info and internet connection.'
        )

    # Read the trading time interval.
    if time_interval == '1s':
        self.time_interval = 1
    elif time_interval == '5s':
        self.time_interval = 5
    elif time_interval == '1Min':
        self.time_interval = 60
    elif time_interval == '5Min':
        self.time_interval = 60 * 5
    elif time_interval == '15Min':
        self.time_interval = 60 * 15
    else:
        raise ValueError('Time interval input is NOT supported yet.')

    # Read trading settings.
    self.tech_indicator_list = tech_indicator_list
    self.turbulence_thresh = turbulence_thresh
    self.max_stock = max_stock

    # Initialize account state.
    self.stocks = np.asarray([0] * len(ticker_list))  # stock holdings
    self.stocks_cd = np.zeros_like(self.stocks)
    self.cash = None  # cash record
    self.stocks_df = pd.DataFrame(self.stocks, columns=['stocks'],
                                  index=ticker_list)
    self.asset_list = []
    self.price = np.asarray([0] * len(ticker_list))
    self.stockUniverse = ticker_list
    self.turbulence_bool = 0
    self.equities = []
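# Hypothetical instantiation of the class whose __init__ is shown above. The
# class name AlpacaPaperTrading and every argument value below are
# placeholders, not confirmed by the snippet.
trader = AlpacaPaperTrading(
    ticker_list=["AAPL", "MSFT"],
    time_interval="1Min",
    drl_lib="rllib",
    agent="ppo",
    cwd="./checkpoints/checkpoint-100",
    net_dim=512,
    state_dim=30,
    action_dim=2,
    API_KEY="<your-key>",
    API_SECRET="<your-secret>",
    APCA_API_BASE_URL="https://paper-api.alpaca.markets",
    tech_indicator_list=["macd", "rsi_30"],
)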
import numpy as np
import ray
from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG
from ray.rllib.agents.ppo.ppo import PPOTrainer

from inventory_env import InventoryEnv

config = DEFAULT_CONFIG.copy()
config["env"] = InventoryEnv

ray.init()
trainer = PPOTrainer(config=config, env=InventoryEnv)
trainer.restore(
    # Replace this with your checkpoint path.
    "/home/enes/ray_results/PPO_InventoryEnv_2020-10-06_04-58-04t8r36o9o/checkpoint_781/checkpoint-781"
)

if __name__ == "__main__":
    np.random.seed(0)
    env = InventoryEnv()
    episode_reward_avgs = []
    episode_total_rewards = []
    for i in range(2000):
        print(f"Episode: {i+1}")
        state = env.reset()
        done = False
        ep_rewards = []
        while not done:
            action = trainer.compute_action(state)
            state, reward, done, info = env.step(action)
            ep_rewards.append(reward)
        episode_reward_avgs.append(np.mean(ep_rewards))
        episode_total_rewards.append(np.sum(ep_rewards))
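    # Hypothetical continuation, not in the original snippet: summarize the
    # evaluation once all episodes have finished, using the two lists
    # populated in the loop above.
    print(f"Mean episode reward: {np.mean(episode_total_rewards):.2f}")
    print(f"Std of episode reward: {np.std(episode_total_rewards):.2f}")
    print(f"Mean per-step reward: {np.mean(episode_reward_avgs):.4f}")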
config = {
    "num_gpus": 1,
    "env": "yaniv",
    "env_config": env_config,
    "framework": "torch",
    "multiagent": {
        "policies": {
            "policy_1": (None, obs_space, act_space, {}),
            "policy_2": (None, obs_space, act_space, {}),
            "policy_3": (None, obs_space, act_space, {}),
            "policy_4": (None, obs_space, act_space, {}),
        },
        "policy_mapping_fn": policy_mapping_fn,
        "policies_to_train": ["policy_1"],
    },
    "model": {
        "custom_model": "yaniv_mask",
        "fcnet_hiddens": [512, 512],
    },
}

ray.init(include_dashboard=False)

trainer = PPOTrainer(env="yaniv", config=config)
trainer.restore(args.checkpoint)

tourny = YanivTournament(env_config, trainers=[trainer])
tourny.run(args.eval_num)
print("\n\nRESULTS:\n")
tourny.print_stats()
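# The `policy_mapping_fn` referenced in the config above is defined elsewhere.
# A minimal hypothetical version, assuming agent ids of the form
# "player_0".."player_3", could pin each seat to its own policy:
def policy_mapping_fn(agent_id):
    seat = int(agent_id.split("_")[-1])
    return "policy_{}".format(seat + 1)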
def main():
    """Main function."""
    ray.init()

    if 'NUM_WORKERS' in os.environ:
        num_of_workers = int(os.environ['NUM_WORKERS'])
    else:
        num_of_workers = DEFAULT_NUM_WORKERS

    if os.path.isfile(WORLDS_JSON_PATH):
        with open(WORLDS_JSON_PATH) as jsonfile:
            dict_worlds = json.load(jsonfile)
    else:
        dict_worlds = None

    if os.path.isfile(MASTER_URI_JSON_PATH):
        with open(MASTER_URI_JSON_PATH) as jsonfile:
            list_master_uri = json.load(jsonfile)['master_uri']
    else:
        list_master_uri = None

    config = ppo.DEFAULT_CONFIG.copy()
    config.update({
        'env_config': {
            'dict_worlds': dict_worlds,
            'list_master_uri': list_master_uri,  # when using the parallel-simulation script
            # 'list_master_uri': None,  # when running a single simulation on the default ROS master URI
            'use_random_heading': True,
            'result_csv': RESULT_CSV_NAME,
            'num_workers': num_of_workers
        },
        'num_gpus': 0,  # set to match the number of GPUs in use
        'num_workers': num_of_workers,
        'train_batch_size': 10000,
        'batch_mode': 'complete_episodes'
    })

    register_env('gazebo', lambda cfg: DroneSimEnv(cfg))
    trainer = PPOTrainer(env='gazebo', config=config)
    num_iteration = 10000

    # Find and restore the latest checkpoint, if any.
    latest_index = 0
    checkpoint_path = None
    checkpoint_name = None
    for name in [name for name in os.listdir(CHECKPOINT_PATH_BASE)
                 if 'checkpoint_' in name]:
        index = int(name.replace('checkpoint_', ''))
        if index > latest_index:
            latest_index = index
            checkpoint_path = CHECKPOINT_PATH_BASE + name + '/'
            checkpoint_name = 'checkpoint-' + str(index)
    if checkpoint_name:
        print('Running using (', checkpoint_name, ').')
        trainer.restore(checkpoint_path + checkpoint_name)
        print(checkpoint_name, '==========================================')

    # Goal/collision data init.
    success_cnt = 0
    goal_rate_filename = 'goal_rate_{}.csv'.format(
        WORLDS_JSON_NAME.replace('curriculum/', '').replace('.json', ''))
    if not os.path.isfile(goal_rate_filename):
        with open(goal_rate_filename, 'w') as goal_rate_logfile:
            goal_rate_logfile.write("training_iteration,goal_rate\n")

    while True:
        # Goal/collision data create (truncate the per-iteration result CSV).
        with open(RESULT_CSV_NAME, 'w+') as file_:
            pass

        result = trainer.train()
        print(pretty_print(result))

        # Save a recovery checkpoint every 5 iterations.
        if result['training_iteration'] % 5 == 0:
            checkpoint = trainer.save(CHECKPOINT_PATH_BASE)
            print("checkpoint saved at", checkpoint)
        # Save a checkpoint for result inspection every 100 iterations.
        if result['training_iteration'] % 100 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)

        # Goal/collision data read.
        with open(RESULT_CSV_NAME, 'r') as file_:
            episodes_raw = file_.read()
        goal_list = episodes_raw.split(',')
        goal_cnt = goal_list.count('1')
        if goal_cnt == 0:
            goal_ratio = 0
        else:
            goal_ratio = goal_cnt / (goal_cnt + goal_list.count('0'))
        print('goal rate:', goal_ratio)
        with open(goal_rate_filename, 'a') as goal_rate_logfile:
            goal_rate_logfile.write(
                str(result['training_iteration']) + ',' + str(goal_ratio) + '\n')

        if goal_ratio >= 0.95:
            success_cnt += 1
            print('successes in a row:', success_cnt)
        else:
            success_cnt = 0

        if success_cnt >= 5 and EXIT_ON_SUCCESS:
            if result['training_iteration'] % 5 != 0:
                checkpoint = trainer.save(CHECKPOINT_PATH_BASE)
                print("checkpoint saved at", checkpoint)
            break
        if result['training_iteration'] >= num_iteration:
            break

    print('PPO training is done.')
"policies": { "policy_1": (None, obs_space, act_space, {}), "policy_2": (None, obs_space, act_space, {}), "policy_3": (None, obs_space, act_space, {}), "policy_4": (None, obs_space, act_space, {}), }, "policy_mapping_fn": policy_mapping_fn, "policies_to_train": ["policy_1"], }, "model": { "custom_model": "yaniv_mask", "fcnet_hiddens": [512, 512], }, } ray.init(include_dashboard=False, local_mode=True) ppo = PPOTrainer(env="yaniv", config=config) ppo.restore(args.ppo_checkpoint) a3c = A3CTrainer(env="yaniv", config=config) a3c.restore(args.a3c_checkpoint) tourney = YanivTournament(env_config, trainers=[a3c], opponent="intermediate") # tourney.run_episode(True) # tourney.print_stats() tourney.run(args.eval_num) print("\n\nRESULTS:\n") tourney.print_stats()