def train(config, reporter):
    trainer = SACTrainer(config=config, env=CamCalibrEnv_seq)
    policy = trainer.get_policy()
    print(policy.dist_class)
    while True:
        result = trainer.train()
        reporter(**result)
        # Curriculum: switch every worker's env to phase 1 after 200 timesteps.
        if result["timesteps_since_restore"] > 200:
            phase = 1
        else:
            phase = 0
        trainer.workers.foreach_worker(
            lambda ev: ev.foreach_env(lambda env: env.set_phase(phase)))
        checkpoint_path = trainer.save()
        print(checkpoint_path)
def train_zero(config, reporter):
    agent = SACTrainer(config)
    # agent.restore("/home/yunke/ray_results/AlphaZero_BlackjackEnv_zero_2020-05-01_22-50-303ae70oaq/checkpoint_1981/checkpoint-1981")  # continue training
    # training curriculum, start with phase 0
    episodes = 0
    i = 0
    while True:
        result = agent.train()
        if reporter is None:
            continue
        else:
            reporter(**result)
        if i % 50 == 0:  # save a checkpoint every 50 training iterations
            checkpoint_path = agent.save()
            print(checkpoint_path)
        i += 1
class SACAgent(Agent):
    def __init__(self, name, environment, training_iterations=10000,
                 checkpoint_path=None, gpu=True):
        self.name = name
        self.env = environment
        # Copy the defaults so the shared DEFAULT_CONFIG dict is not mutated.
        self.config = DEFAULT_CONFIG.copy()
        self.config['num_gpus'] = 1 if gpu else 0
        self.config['num_gpus_per_worker'] = 1 if gpu else 0
        self.iterations = training_iterations
        self.trainer = SACTrainer(env=self.env, config=self.config)
        # Load a previously saved model if a checkpoint path was given.
        if checkpoint_path:
            self.trainer.restore(checkpoint_path)

    def action(self, obs):
        return self.trainer.compute_action(obs)

    def train(self, save_iter=100):
        for it in range(self.iterations):
            self.trainer.train()
            if it % save_iter == 0:
                checkpoint = self.trainer.save()
                print("checkpoint saved at", checkpoint)
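# Usage sketch (an assumption, not from the original project): instantiate the
# SACAgent above and train it. "CartPole-v0" is only an illustrative Gym
# environment standing in for the real one, and ray.init() must run first.
import ray

ray.init()
demo_agent = SACAgent(name="sac_demo", environment="CartPole-v0",
                      training_iterations=200, checkpoint_path=None, gpu=False)
demo_agent.train(save_iter=50)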
def train(config, reporter):
    trainer = SACTrainer(config=config, env=imuCalibrEnv_seq)
    # checkpoint_path = trainer.save()
    policy = trainer.get_policy()
    print(policy.dist_class)
    i = 0
    while True:
        result = trainer.train()
        reporter(**result)
        # if result["timesteps_since_restore"] > 200:
        #     phase = 1
        # else:
        #     phase = 0
        # trainer.workers.foreach_worker(
        #     lambda ev: ev.foreach_env(
        #         lambda env: env.set_phase(phase)))
        # if i == 0:
        #     trainer.restore("/home/yunke/ray_results/SAC_imuCalibrEnv_seq_2020-05-21_23-27-20ig3rw_2c/checkpoint_1/checkpoint-1")
        if i % 100 == 0:
            checkpoint_path = trainer.save()
            print(checkpoint_path)
        auto_garbage_collect()
        i += 1
writer = SummaryWriter(comment="SAC-GEP") for batch in range(N_start, N_finish): initial_time = time.time() result = trainner.train() results.append(result) episode = {'n': batch, 'episode_reward_min': result['episode_reward_min'], 'episode_reward_mean': result['episode_reward_mean'], 'episode_reward_max': result['episode_reward_max'], 'episode_len_mean': result['episode_len_mean']} episode_data.append(episode) episode_json.append(json.dumps(episode)) writer.add_scalar("reward_min", result['episode_reward_min'], batch) writer.add_scalar("reward_mean", result['episode_reward_mean'], batch) writer.add_scalar("reward_max", result['episode_reward_max'], batch) if batch % 10 == 0: checkpoint = trainner.save() print("checkpoint saved at", checkpoint) print(f'{batch:3d}: Min/Mean/Max reward: {result["episode_reward_min"]:8.4f}/{result["episode_reward_mean"]:8.4f}/{result["episode_reward_max"]:8.4f} time:{time.time() - initial_time:.2f}[sec]') writer.close() print("\n Finished successfully")
def main():
    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')

    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config', type=str, default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo', type=str, default='PPO',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference', action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str,
                        help='checkpoint path to restore for inference')
    parser.add_argument('--epoch', type=int, default=10,
                        help='number of training epochs')
    parser.add_argument('--num_step', type=int, default=10**3,
                        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq', type=int, default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='training batch size')
    parser.add_argument('--state_time_span', type=int, default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span', type=int, default=30,
                        help='time interval to collect data')
    args = parser.parse_args()

    config_env = env_config(args)
    # ray.tune.register_env('gym_cityflow', lambda env_config: CityflowGymEnv(config_env))
    config_agent = agent_config(config_env)

    # Build the CityFlow environment and the SAC trainer.
    trainer = SACTrainer(env=CityflowGymEnv, config=config_agent)
    for i in range(500):
        # Perform one iteration of training the policy with SAC.
        result = trainer.train()
        print(pretty_print(result))

        if i % 30 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
            (self.log_return_series, self.metric_series)).transpose()
        self.observation = np.concatenate(
            (price_lookback, metrics, self.position_series), axis=1)
        return self.observation

    def transaction_cost(self, new_action, old_action):
        turnover = np.abs(new_action - old_action)
        fees = 0.9995
        tcost = turnover * np.log(fees)
        return tcost


# Train agent
agent = SACTrainer(config, Equitydaily)
best_reward = -0.4
for i in range(50000):
    result = agent.train()
    if (result["episode_reward_mean"] > best_reward + 0.01) or (i % 1000 == 500):
        path = agent.save("sacagent")
        print(path)
    if result["episode_reward_mean"] > best_reward + 0.01:
        best_reward = result["episode_reward_mean"]
        print(i, best_reward)
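# Worked example of the transaction-cost term above (illustrative values, not
# from the original project): moving the position from 0.0 to 1.0 gives
# turnover = 1.0, and with fees = 0.9995 the penalty is
# 1.0 * log(0.9995) ~= -5.0e-4, i.e. roughly 5 basis points of negative
# reward per unit of turnover.
import numpy as np

print(np.abs(1.0 - 0.0) * np.log(0.9995))  # ~ -0.0005001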