def train(config, reporter):
    trainer = SACTrainer(config=config, env=CamCalibrEnv_seq)
    policy = trainer.get_policy()
    print(policy.dist_class)
    while True:
        result = trainer.train()
        reporter(**result)
        # Simple curriculum: move every worker's envs to phase 1 after 200 timesteps.
        if result["timesteps_since_restore"] > 200:
            phase = 1
        else:
            phase = 0
        trainer.workers.foreach_worker(
            lambda ev: ev.foreach_env(lambda env: env.set_phase(phase)))
        checkpoint_path = trainer.save()
        print(checkpoint_path)
def train_zero(config, reporter):
    agent = SACTrainer(config)
    # agent.restore("/home/yunke/ray_results/AlphaZero_BlackjackEnv_zero_2020-05-01_22-50-303ae70oaq/checkpoint_1981/checkpoint-1981")  # continue training
    # Training curriculum, start with phase 0.
    episodes = 0
    i = 0
    while True:
        result = agent.train()
        if reporter is None:
            continue
        else:
            reporter(**result)
        if i % 50 == 0:  # save every 50th training iteration
            checkpoint_path = agent.save()
            print(checkpoint_path)
        i += 1
class SACrl(object):
    def __init__(self, env, env_config, config):
        self.config = config
        self.config['env_config'] = env_config
        self.env = env(env_config)
        self.agent = SACTrainer(config=self.config, env=env)

    def fit(self, checkpoint=None):
        if checkpoint is None:
            checkpoint = os.path.join(os.getcwd(), 'data/checkpoint_rl.pkl')
        for idx in trange(5):
            result = self.agent.train()
            LOGGER.warning('result: %s', result)
            if (idx + 1) % 5 == 0:
                LOGGER.warning('Save checkpoint at: {}'.format(idx + 1))
                state = self.agent.save_to_object()
                with open(checkpoint, 'wb') as fp:
                    pickle.dump(state, fp, protocol=pickle.HIGHEST_PROTOCOL)
        return result

    def predict(self, checkpoint=None):
        if checkpoint is not None:
            with open(checkpoint, 'rb') as fp:
                state = pickle.load(fp)
            self.agent.restore_from_object(state)
        done = False
        episode_reward = 0
        obs = self.env.reset()
        actions = []
        while not done:
            action = self.agent.compute_action(obs)
            actions.append(action)
            obs, reward, done, info = self.env.step(action)
            episode_reward += reward
        results = {'action': actions, 'reward': episode_reward}
        return results
def train(config, reporter):
    trainer = SACTrainer(config=config, env=imuCalibrEnv_seq)
    # checkpoint_path = trainer.save()
    policy = trainer.get_policy()
    print(policy.dist_class)
    i = 0
    while True:
        result = trainer.train()
        reporter(**result)
        # if result["timesteps_since_restore"] > 200:
        #     phase = 1
        # else:
        #     phase = 0
        # trainer.workers.foreach_worker(
        #     lambda ev: ev.foreach_env(
        #         lambda env: env.set_phase(phase)))
        # if i == 0:
        #     trainer.restore("/home/yunke/ray_results/SAC_imuCalibrEnv_seq_2020-05-21_23-27-20ig3rw_2c/checkpoint_1/checkpoint-1")
        if i % 100 == 0:
            checkpoint_path = trainer.save()
            print(checkpoint_path)
        auto_garbage_collect()
        i += 1
class SACAgent(Agent):
    def __init__(self, name, environment, training_iterations=10000,
                 checkpoint_path=None, gpu=True):
        self.name = name
        self.env = environment
        self.config = DEFAULT_CONFIG
        self.config['num_gpus'] = 1 if gpu else 0
        self.config['num_gpus_per_worker'] = 1 if gpu else 0
        self.iterations = training_iterations
        # Pass the configured resources through to the trainer.
        self.trainer = SACTrainer(config=self.config, env=self.env)
        # Load a previously saved model if a checkpoint path was given.
        if checkpoint_path:
            self.trainer.restore(checkpoint_path)

    def action(self, obs):
        act = self.trainer.compute_action(obs)
        return act

    def train(self, save_iter=100):
        for it in range(self.iterations):
            self.trainer.train()
            if it % save_iter == 0:
                checkpoint = self.trainer.save()
                print("checkpoint saved at", checkpoint)
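A minimal usage sketch for the SACAgent wrapper above, assuming ray has been initialized, the SACTrainer/DEFAULT_CONFIG imports are in scope, and the old Gym API where reset() returns only the observation; "Pendulum-v0" and the small iteration counts are stand-ins, not part of the original code:

import gym
import ray

ray.init()

# Hypothetical instantiation: any Gym environment id that RLlib can resolve works here.
agent = SACAgent("sac_pendulum", "Pendulum-v0",
                 training_iterations=10, checkpoint_path='', gpu=False)
agent.train(save_iter=5)  # saves a checkpoint at iterations 0 and 5

# Query the trained policy for a single action.
obs = gym.make("Pendulum-v0").reset()
print(agent.action(obs))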
config['num_workers'] = args.workers
config['num_gpus'] = 1
config['framework'] = "torch"
config['gamma'] = args.gamma
config['monitor'] = False
config['model']['dim'] = 21
# Conv filters sized so the 21x21 observation collapses to a 1x1x512 feature (21 -> 11 -> 6 -> 1).
config['model']['conv_filters'] = [
    [8, [3, 3], 2],
    [16, [2, 2], 2],
    [512, [6, 6], 1]]
config["buffer_size"] = 500_000
# If True prioritized replay buffer will be used.
config["prioritized_replay"] = True

trainer = SACTrainer(config=config, env="mars_explorer:explorer-v01")

if PATH != "":
    print(f"\nLoading trainer from dir {PATH}")
    trainer.restore(PATH)
else:
    print("Starting training without a priori knowledge")

N_start = 0
N_finish = args.steps
results = []
episode_data = []
episode_json = []
writer = SummaryWriter(comment="SAC-GEP")
DEFAULT_CONFIG = SACTrainer.merge_trainer_configs(
    SAC_DEFAULT_CONFIG,
    {
        # Batch mode (see common config)
        "batch_mode": "complete_episodes",
        # If True prioritized replay buffer will be used.
        "prioritized_replay": False,
        # RNNSAC does not support n-step > 1 yet!
        "n_step": 1,
        # If True, assume a zero-initialized state input (no matter where in
        # the episode the sequence is located).
        # If False, store the initial states along with each SampleBatch, use
        # it (as initial state when running through the network for training),
        # and update that initial state during training (from the internal
        # state outputs of the immediately preceding sequence).
        "zero_init_states": True,
        # If > 0, use the `burn_in` first steps of each replay-sampled sequence
        # (starting either from all 0.0-values if `zero_init_state=True` or
        # from the already stored values) to calculate an even more accurate
        # initial states for the actual sequence (starting after this burn-in
        # window). In the burn-in case, the actual length of the sequence
        # used for loss calculation is `n - burn_in` time steps
        # (n=LSTM's/attention net's max_seq_len).
        "burn_in": 0,
        # Set automatically: The number of contiguous environment steps to
        # replay at once. Will be calculated via
        # model->max_seq_len + burn_in.
        # Do not set this to any valid value!
        "replay_sequence_length": -1,
    },
    _allow_unknown_configs=True,
)
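For reference, a small sketch (not from the original source) of what this merge yields: the override dict wins on key collisions, every other SAC default is inherited unchanged, and keys unknown to base SAC (such as burn_in) are kept because of _allow_unknown_configs=True:

# Assumed sanity checks on the merged dict; SAC_DEFAULT_CONFIG is the base config.
assert DEFAULT_CONFIG["n_step"] == 1
assert DEFAULT_CONFIG["batch_mode"] == "complete_episodes"
assert DEFAULT_CONFIG["prioritized_replay"] is False
assert DEFAULT_CONFIG["burn_in"] == 0  # unknown to base SAC, kept anyway
assert DEFAULT_CONFIG["twin_q"] == SAC_DEFAULT_CONFIG["twin_q"]  # inherited, unchanged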
    }
})

results = tune.run(
    args.run,
    config=config,
    scheduler=scheduler,
    num_samples=4,
    stop=stop,
    # checkpoint_freq=10,
    checkpoint_at_end=True,
    # restore="/home/david/ray_results/SAC/SAC_FarmEnv_5aa8e_00000_0_2021-01-21_18-23-19/checkpoint_199/checkpoint-199",
)

if args.run == "PPO":
    agent = PPOTrainer(config=config)
elif args.run == "SAC":
    agent = SACTrainer(config=config)
elif args.run == "DDPG":
    agent = DDPGTrainer(config=config)

# list of lists: one list per checkpoint; each checkpoint list contains
# 1st the path, 2nd the metric value
checkpoints = results.get_trial_checkpoints_paths(
    trial=results.get_best_trial("episode_reward_mean", mode='max'),
    metric="episode_reward_mean")
checkpoint_path, _ = checkpoints[0]
print(f'checkpoint_path {checkpoint_path}')
# agent = PPOTrainer(config=config_PPO)

if args.as_test:
    check_learning_achieved(results, args.stop_reward)

ray.shutdown()
    'framework': 'tfe' if args.tfe else 'tf',
    # 'target_entropy': args.target_entropy,
    # 'asymmetric': args.asymmetric,
    # "multiagent": {
    #     "policy_mapping_fn": lambda x: f"worker_p{x}",  # if x == 0 else "worker_p2",
    #     'policies': policies,
    #     'policies_to_train': [f"worker_p{e}" for e in range(args.ensemble_size)]
    # },
}

ray.init(
    num_cpus=args.num_cpus or None,
    local_mode=args.local_mode,
    # redis_max_memory=int(4e9),
    # object_store_memory=int(10e9),
    # memory=int(16e9)
)

if args.debug:
    trainer = SACTrainer(config=config)
    # trainer = PPOTrainer(config=config)
    while True:
        results = trainer.train()  # distributed training step
        print(
            f"Iter: {results['training_iteration']}, R: {results['episode_reward_mean']}"
        )
else:
    tune.run(
        SACTrainer,  # PPOTrainer,
        verbose=args.verbose,
        config=config)
def main():
    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')

    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config', type=str, default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo', type=str, default='PPO',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference', action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str,
                        help='checkpoint path for inference')
    parser.add_argument('--epoch', type=int, default=10,
                        help='number of training epochs')
    parser.add_argument('--num_step', type=int, default=10**3,
                        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq', type=int, default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='batch size')
    parser.add_argument('--state_time_span', type=int, default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span', type=int, default=30,
                        help='time interval to collect data')
    args = parser.parse_args()

    config_env = env_config(args)
    # ray.tune.register_env('gym_cityflow', lambda env_config: CityflowGymEnv(config_env))
    config_agent = agent_config(config_env)

    # build cityflow environment
    trainer = SACTrainer(env=CityflowGymEnv, config=config_agent)
    for i in range(500):
        # Perform one iteration of training the policy with SAC.
        result = trainer.train()
        print(pretty_print(result))

        if i % 30 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
            (self.log_return_series, self.metric_series)).transpose()
        self.observation = np.concatenate(
            (price_lookback, metrics, self.position_series), axis=1)
        return self.observation

    def transaction_cost(self, new_action, old_action):
        # Fee of 5 bps per unit of turnover, applied as a log-return penalty
        # (see the worked check after this snippet).
        turnover = np.abs(new_action - old_action)
        fees = 0.9995
        tcost = turnover * np.log(fees)
        return tcost


# Train agent
agent = SACTrainer(config, Equitydaily)
best_reward = -0.4
for i in range(50000):
    result = agent.train()
    if (result["episode_reward_mean"] > best_reward + 0.01) or (i % 1000 == 500):
        path = agent.save("sacagent")
        print(path)
    if result["episode_reward_mean"] > best_reward + 0.01:
        best_reward = result["episode_reward_mean"]
        print(i, best_reward)
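To make the fee arithmetic in transaction_cost concrete, a quick worked check under assumed positions (a full flip from long +1.0 to short -1.0, i.e. a turnover of 2.0); the numbers are illustrative only:

import numpy as np

new_action, old_action = -1.0, 1.0            # hypothetical positions
turnover = np.abs(new_action - old_action)    # 2.0
tcost = turnover * np.log(0.9995)             # ~ -0.0010 log-return penalty (5 bps per unit traded)
print(turnover, tcost)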
def main_sac():
    from ray.rllib.agents.sac import SACTrainer, DEFAULT_CONFIG

    wandb.init(project='duocopter', sync_tensorboard=True)
    ray.init()

    env_config = {
        'copter_weight_kg': 0.5,
        'g': 9.81,
        'max_thrust_N': 2 * 9.81,
        'max_wattage_W': 2 * 350,  # TODO: Use power curve
        'k1_m': 0.01,  # TODO: Change
        'k2_m': 24E-3,
        'theta_deg': 0,
        'dyn_fric_coeff': 0.14,
        'cart_height_m': 0.2104,
        'thrust_centerline_distance_m': 0.01,  # TODO: Change
        'dt': 1E-3,
        'max_height_m': 1.44,
        'sampling_rate_hz': 20,
        'log': True
    }

    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 10
    config['env_config'] = env_config
    config['framework'] = 'torch'
    config['Q_model']['fcnet_hiddens'] = [64, 64]
    config['policy_model']['fcnet_hiddens'] = [64, 64]
    config['timesteps_per_iteration'] = 5000
    config['rollout_fragment_length'] = 1
    config['buffer_size'] = 30000
    config['prioritized_replay'] = True
    config['train_batch_size'] = 1024
    config['n_step'] = 5
    config['target_network_update_freq'] = 5
    # config['lambda'] = 0.9
    # config['lr'] = 5e-5
    # config['rollout_fragment_length'] = 500
    # config['model']['fcnet_hiddens'] = [64, 64]

    trainer = SACTrainer(config=config, env=SimEnv)
    for i in range(100):
        result = trainer.train()
        print(pretty_print(result))

    # Evaluate the trained policy in a fresh environment without exploration.
    env = SimEnv(env_config)
    state = env.reset()
    done = False
    ep_reward = 0
    while not done:
        thrust = trainer.compute_action(state, explore=False)
        state, rw, done, _ = env.step(thrust)
        ep_reward += rw

    print(env.calc_rms())
    env.plot()