def train(config, reporter):
    trainer = DDPGTrainer(config=config, env=imuCalibrEnv_seq)
    while True:
        result = trainer.train()
        reporter(**result)
        # Curriculum schedule: switch every env to the harder phase once enough
        # timesteps have been collected since the last restore.
        if result["timesteps_since_restore"] > 250:
            phase = 1
        else:
            phase = 0
        trainer.workers.foreach_worker(
            lambda ev: ev.foreach_env(lambda env: env.set_phase(phase)))
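# imuCalibrEnv_seq is defined elsewhere in this project. The hypothetical sketch below
# (not the real calibration env) only illustrates, under that assumption, the
# set_phase() hook that the foreach_env() call above relies on.
import gym
import numpy as np
from gym import spaces


class PhasedEnvSketch(gym.Env):
    """Toy environment whose difficulty can be switched at runtime via set_phase()."""

    def __init__(self, env_config=None):
        self.observation_space = spaces.Box(-np.inf, np.inf, shape=(4,), dtype=np.float32)
        self.action_space = spaces.Box(-1.0, 1.0, shape=(1,), dtype=np.float32)
        self.phase = 0  # 0 = easy phase, 1 = hard phase
        self.t = 0

    def set_phase(self, phase):
        # Invoked via trainer.workers.foreach_worker(lambda ev: ev.foreach_env(...)).
        self.phase = phase

    def reset(self):
        self.t = 0
        return np.zeros(4, dtype=np.float32)

    def step(self, action):
        self.t += 1
        noise = 0.1 if self.phase == 0 else 0.5  # harder phase adds more reward noise
        reward = float(-abs(float(action[0])) - noise * np.random.rand())
        done = self.t >= 50
        return np.zeros(4, dtype=np.float32), reward, done, {}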
def train(config, reporter):
    trainer = DDPGTrainer(config=config, env=CamImuCalibrEnv_seq)
    # checkpoint_path = trainer.save()
    policy = trainer.get_policy()
    print(policy.dist_class)
    i = 0
    while True:
        result = trainer.train()
        reporter(**result)
        # Curriculum switching, disabled for this run:
        # if result["timesteps_since_restore"] > 200:
        #     phase = 1
        # else:
        #     phase = 0
        # trainer.workers.foreach_worker(
        #     lambda ev: ev.foreach_env(
        #         lambda env: env.set_phase(phase)))
        # Resume from an earlier checkpoint on the first iteration only.
        if i == 0:
            trainer.restore(
                "/home/yunke/ray_results/DDPG_CamImuCalibrEnv_seq_2020-06-03_23-18-37xwsq706i/checkpoint_437/checkpoint-437"
            )
        # Start saving checkpoints after a short warm-up.
        if i > 3:
            checkpoint_path = trainer.save()
            print(checkpoint_path)
        auto_garbage_collect()
        i += 1
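# auto_garbage_collect() is called above and below but not defined in these excerpts.
# A common pattern such a helper follows (this is an assumption about its behavior,
# not the project's actual implementation) is to force a collection pass only when
# system memory usage gets high:
import gc
import psutil


def auto_garbage_collect(pct=80.0):
    """Call gc.collect() when system-wide memory usage exceeds `pct` percent."""
    if psutil.virtual_memory().percent >= pct:
        gc.collect()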
def evaluate_model(args):
    if args.model_path == '':
        print('Cannot evaluate model, no --model_path set')
        exit(1)

    def get_env():
        # The single-map simulator env is better suited for evaluation/testing.
        # return SteeringToWheelVelWrapper(DuckietownLF(
        # ))
        return MultiMapSteeringToWheelVelWrapper(
            simulator.Simulator(
                map_name=args.map,
                max_steps=2000,
            ))

    # Rather than reusing one env, a second one is created below, because register_env
    # expects a factory callable rather than an already-constructed object.
    register_env('DuckieTown-Simulator', lambda _: get_env())
    trainer = DDPGTrainer(
        env="DuckieTown-Simulator",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-ddpg",
            },
            "num_gpus": args.gpu_use,
        },
    )
    trainer.restore(args.model_path)

    sim_env = get_env()

    # Standard OpenAI Gym reset/action/step/render loop. This matches how the
    # `enjoy_reinforcement.py` script works, see: https://git.io/J3js2
    done = False
    observation = sim_env.reset()
    episode_reward = 0
    while not done:
        action = trainer.compute_action(observation)
        observation, reward, done, _ = sim_env.step(action)
        episode_reward += reward
        sim_env.render()

    print(f'Episode complete, total reward: {episode_reward}')
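# The "image-ddpg" custom model referenced in the config above has to be registered
# with RLlib's ModelCatalog before the trainer is built. The real model class lives
# elsewhere in this project; the sketch below only shows the registration call, with
# ImageDDPGModel standing in as a hypothetical placeholder.
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2


class ImageDDPGModel(TorchModelV2):
    # Hypothetical stand-in; the actual network (conv layers over image observations)
    # is defined in the project's model module.
    pass


ModelCatalog.register_custom_model("image-ddpg", ImageDDPGModel)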
class DDPGrl(object):

    def __init__(self, env, env_config, config):
        self.config = config
        self.config['env_config'] = env_config
        self.env = env(env_config)
        self.agent = DDPGTrainer(config=self.config, env=env)

    def fit(self, checkpoint=None):
        if checkpoint is None:
            checkpoint = os.path.join(os.getcwd(), 'data/checkpoint_rl.pkl')
        for idx in trange(5):
            result = self.agent.train()
            LOGGER.warning('result: %s', result)
            # Pickle the trainer state every 5 iterations.
            if (idx + 1) % 5 == 0:
                LOGGER.warning('Save checkpoint at: {}'.format(idx + 1))
                state = self.agent.save_to_object()
                with open(checkpoint, 'wb') as fp:
                    pickle.dump(state, fp, protocol=pickle.HIGHEST_PROTOCOL)
        return result

    def predict(self, checkpoint=None):
        # Optionally restore a previously pickled trainer state before rolling out.
        if checkpoint is not None:
            with open(checkpoint, 'rb') as fp:
                state = pickle.load(fp)
            self.agent.restore_from_object(state)
        done = False
        episode_reward = 0
        obs = self.env.reset()
        actions = []
        while not done:
            action = self.agent.compute_action(obs)
            actions.append(action)
            obs, reward, done, info = self.env.step(action)
            episode_reward += reward
        results = {'action': actions, 'reward': episode_reward}
        return results
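# Usage sketch for the DDPGrl wrapper above, reusing the PhasedEnvSketch toy env from
# the earlier sketch (any gym.Env whose constructor accepts an env_config dict works).
# The config here is simply RLlib's DDPG default with the torch framework; the
# checkpoint filename is an arbitrary choice for illustration.
import ray
from ray.rllib.agents.ddpg import DEFAULT_CONFIG

ray.init(ignore_reinit_error=True)

config = DEFAULT_CONFIG.copy()
config['framework'] = 'torch'
model = DDPGrl(env=PhasedEnvSketch, env_config={}, config=config)
model.fit(checkpoint='checkpoint_rl.pkl')               # trains and pickles the trainer state
print(model.predict(checkpoint='checkpoint_rl.pkl')['reward'])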
def train(config, reporter):
    trainer = DDPGTrainer(config=config, env=imuCalibrEnv_seq)
    # checkpoint_path = trainer.save()
    policy = trainer.get_policy()
    print(policy.dist_class)
    i = 0
    while True:
        result = trainer.train()
        reporter(**result)
        # Curriculum switching, disabled for this run:
        # if result["timesteps_since_restore"] > 200:
        #     phase = 1
        # else:
        #     phase = 0
        # trainer.workers.foreach_worker(
        #     lambda ev: ev.foreach_env(
        #         lambda env: env.set_phase(phase)))
        # if i == 0:
        #     trainer.restore("/home/yunke/ray_results/DDPG_imuCalibrEnv_seq_2020-06-27_01-48-53hwk9uq89/checkpoint_995/checkpoint-995")
        # Save a checkpoint every 100 iterations, after a short warm-up.
        if i > 3 and i % 100 == 0:
            checkpoint_path = trainer.save()
            print(checkpoint_path)
        auto_garbage_collect()
        i += 1
def train_model(args, config):
    # Define the trainer. Apart from env, config/framework, and config/model, which are
    # common among trainers, the default config keys/values are listed here:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # For DDPG specifically there are additionally these keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#ddpg
    trainer = DDPGTrainer(
        env="DuckieTown-MultiMap",
        config=config,
    )

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    # TODO(balbok0): Start values from checkpoint, if available.
    best_mean_reward = -np.inf
    epoch_of_best_mean_reward = 0
    path_of_best_mean_reward = None

    for i in trange(args.epochs, desc="Epochs", leave=False):  # Number of episodes (basically epochs)
        # print(f'----------------------- Starting epoch {i} ----------------------- ')
        # train() runs a single training iteration.
        result = trainer.train()
        # print(result)

        # Save the model so far.
        checkpoint_path = trainer.save()
        # print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Track the best-performing checkpoint seen so far.
        if result["episode_reward_mean"] > best_mean_reward:
            best_mean_reward = result["episode_reward_mean"]
            epoch_of_best_mean_reward = i
            path_of_best_mean_reward = checkpoint_path

        # Clean up CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        # print(torch.cuda.memory_summary(device=None, abbreviated=False))

    return best_mean_reward, epoch_of_best_mean_reward, path_of_best_mean_reward
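# A plausible way to call train_model() above. The config mirrors the inline config
# used in the variant further below; the argparse defaults here are assumptions for
# illustration, not the project's actual CLI.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--epochs", type=int, default=10)
parser.add_argument("--model_path", type=str, default="")
args = parser.parse_args()

config = {
    "framework": "torch",
    "model": {
        "custom_model": "image-ddpg",
    },
    "learning_starts": 0,
    "train_batch_size": 16,
}
best_reward, best_epoch, best_path = train_model(args, config)
print(f"best mean reward {best_reward} at epoch {best_epoch}: {best_path}")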
def train_model(args):
    # Define the trainer. Apart from env, config/framework, and config/model, which are
    # common among trainers, the default config keys/values are listed here:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # For DDPG specifically there are additionally these keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#ddpg
    trainer = DDPGTrainer(
        env="DuckieTown-MultiMap",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-ddpg",
            },
            "learning_starts": 0,
            "train_batch_size": 16,
        })

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    plot = plotter.Plotter('ddpg_agent')
    for i in range(args.epochs):  # Number of episodes (basically epochs)
        print(f'----------------------- Starting epoch {i} ----------------------- ')
        # train() runs a single training iteration.
        result = trainer.train()
        print(result)
        plot.add_results(result)

        # Save the model so far.
        checkpoint_path = trainer.save()
        print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Clean up CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

    plot.plot('DDPG DuckieTown-MultiMap')
def main():
    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')

    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config', type=str, default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo', type=str, default='DQN',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference', action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str,
                        help='checkpoint to restore for inference')
    parser.add_argument('--epoch', type=int, default=10,
                        help='number of training epochs')
    parser.add_argument('--num_step', type=int, default=10**3,
                        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq', type=int, default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='batch size')
    parser.add_argument('--state_time_span', type=int, default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span', type=int, default=30,
                        help='time interval to collect data')
    args = parser.parse_args()

    model_dir = "model/{}_{}".format(args.algo, date)
    result_dir = "result/{}_{}".format(args.algo, date)

    config_env = env_config(args)
    # ray.tune.register_env('gym_cityflow', lambda env_config: CityflowGymEnv(config_env))
    config_agent = agent_config(config_env)

    # Build the CityFlow environment and the DDPG trainer.
    trainer = DDPGTrainer(env=CityflowGymEnv, config=config_agent)
    for i in range(1000):
        # Perform one iteration of training the policy.
        result = trainer.train()
        print(pretty_print(result))

        if i % 20 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
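# CityflowGymEnv is passed directly as a class above, which RLlib accepts as long as
# its constructor takes a single env_config dict. The commented-out register_env line
# hints at the alternative; spelled out under that same assumption, registering the
# env by name would look like this:
from ray import tune

tune.register_env('gym_cityflow', lambda env_config: CityflowGymEnv(config_env))
trainer = DDPGTrainer(env='gym_cityflow', config=config_agent)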
})

results = tune.run(
    args.run,
    config=config,
    scheduler=scheduler,
    num_samples=4,
    stop=stop,
    # checkpoint_freq=10,
    checkpoint_at_end=True,
    # restore="/home/david/ray_results/SAC/SAC_FarmEnv_5aa8e_00000_0_2021-01-21_18-23-19/checkpoint_199/checkpoint-199",
)

# Rebuild a trainer of the same algorithm so the tuned weights can be loaded back in.
if args.run == "PPO":
    agent = PPOTrainer(config=config)
elif args.run == "SAC":
    agent = SACTrainer(config=config)
elif args.run == "DDPG":
    agent = DDPGTrainer(config=config)

# List of lists: one list per checkpoint; each inner list contains the path first and
# the metric value second.
checkpoints = results.get_trial_checkpoints_paths(
    trial=results.get_best_trial("episode_reward_mean", mode='max'),
    metric="episode_reward_mean")
checkpoint_path, _ = checkpoints[0]
print(f'checkpoint_path {checkpoint_path}')
# agent = PPOTrainer(config=config_PPO)

if args.as_test:
    check_learning_achieved(results, args.stop_reward)

ray.shutdown()
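# The snippet above builds `agent` and locates the best trial's checkpoint but never
# loads it. A natural follow-up, which would slot in before the ray.shutdown() call,
# is to restore the weights and roll out one greedy episode; `make_env` is a
# hypothetical factory returning the same environment that config["env"] refers to.
agent.restore(checkpoint_path)

env = make_env()  # hypothetical factory, see note above
obs, done, total_reward = env.reset(), False, 0.0
while not done:
    action = agent.compute_action(obs, explore=False)
    obs, reward, done, _ = env.step(action)
    total_reward += reward
print(f"evaluation reward: {total_reward}")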
        config_copy = config.copy()
        config_copy['explore'] = False
        trainer = A2CTrainer(config=config_copy, env='Bertrand')
    elif trainer_choice == 'MADDPG':
        from ray.rllib.contrib.maddpg import MADDPGTrainer
        config['agent_id'] = 0
        # Exploration is disabled in the copy used for evaluation afterward.
        config_copy = config.copy()
        config_copy['explore'] = False
        trainer = MADDPGTrainer(config=config_copy, env='Bertrand')
    elif trainer_choice == 'DDPG':
        from ray.rllib.agents.ddpg import DDPGTrainer
        # Exploration is disabled in the copy used for evaluation afterward.
        config_copy = config.copy()
        config_copy['explore'] = False
        trainer = DDPGTrainer(config=config_copy, env='Bertrand')

    analysis = tune.run(
        trainer_choice,
        # num_samples=4,
        config=config,
        local_dir='./log',
        stop={'training_iteration': sessions},
        mode='max',
        metric='episode_reward_mean',
        checkpoint_at_end=True)

    # Load the best checkpoint from the tune run into the non-exploring trainer.
    trainer.restore(checkpoint_path=analysis.best_checkpoint)

    # analysis = tune.run(
    #     trainer_choice,