Example #1
def train(config, reporter):
    trainer = SACTrainer(config=config, env=CamCalibrEnv_seq)
    policy = trainer.get_policy()
    print(policy.dist_class)
    while True:
        result = trainer.train()
        reporter(**result)
        # simple curriculum: switch the environments to phase 1 after 200 timesteps
        if result["timesteps_since_restore"] > 200:
            phase = 1
        else:
            phase = 0
        trainer.workers.foreach_worker(
            lambda ev: ev.foreach_env(lambda env: env.set_phase(phase)))
        checkpoint_path = trainer.save()
        print(checkpoint_path)
def train_zero(config, reporter):
    agent = SACTrainer(config)
    #agent.restore("/home/yunke/ray_results/AlphaZero_BlackjackEnv_zero_2020-05-01_22-50-303ae70oaq/checkpoint_1981/checkpoint-1981") #continue training
    #training curriculum, start with phase 0

    episodes = 0
    i = 0
    while True:
        result = agent.train()
        if reporter is None:
            continue
        else:
            reporter(**result)
        if i % 50 == 0: # save every 50th training iteration
            checkpoint_path = agent.save()
            print(checkpoint_path)

        i += 1
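Both train and train_zero follow the older Ray Tune function-trainable signature (config, reporter). A minimal sketch of launching one of them with tune.run, assuming the RLlib SAC default config; the worker count and stop criterion below are illustrative, not taken from the example:

import ray
from ray import tune
from ray.rllib.agents.sac import DEFAULT_CONFIG

if __name__ == "__main__":
    ray.init()
    config = DEFAULT_CONFIG.copy()
    config["num_workers"] = 1              # assumed value for illustration
    tune.run(
        train,                             # the (config, reporter) trainable defined above
        config=config,
        stop={"training_iteration": 200},  # assumed stop criterion
    )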
Example #3
class SACAgent(Agent):
    def __init__(self, name, environment, training_iterations=10000, checkpoint_path=None, gpu=True):
        self.name = name
        self.env = environment
        self.config = DEFAULT_CONFIG.copy()  # copy so the shared defaults are not mutated
        self.config['num_gpus'] = 1 if gpu else 0
        self.config['num_gpus_per_worker'] = 1 if gpu else 0
        self.iterations = training_iterations
        self.trainer = SACTrainer(env=self.env, config=self.config)
        # load model from a checkpoint, if one was given
        if checkpoint_path:
            self.trainer.restore(checkpoint_path)
        
    def action(self, obs):
        act = self.trainer.compute_action(obs)
        return act

    def train(self, save_iter=100):
        for it in range(self.iterations):
            self.trainer.train()
            if it % save_iter == 0:
                checkpoint = self.trainer.save()
                print("checkpoint saved at", checkpoint)
Example #4
def train(config, reporter):
    trainer = SACTrainer(config=config, env=imuCalibrEnv_seq)
    #checkpoint_path = trainer.save()
    policy = trainer.get_policy()
    print(policy.dist_class)

    i = 0
    while True:
        result = trainer.train()
        reporter(**result)
        # if result["timesteps_since_restore"] > 200:
        #     phase = 1
        # else:
        #     phase = 0
        # trainer.workers.foreach_worker(
        #     lambda ev: ev.foreach_env(
        #         lambda env: env.set_phase(phase)))
        # if i==0:
        #     trainer.restore("/home/yunke/ray_results/SAC_imuCalibrEnv_seq_2020-05-21_23-27-20ig3rw_2c/checkpoint_1/checkpoint-1")
        if i % 100 == 0:
            checkpoint_path = trainer.save()
            print(checkpoint_path)
        auto_garbage_collect()
        i += 1
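The loop calls an auto_garbage_collect() helper that is defined outside this snippet. A minimal sketch of what such a helper typically looks like, assuming it forces a collection pass once system memory use crosses a threshold (the 80% figure is an assumption):

import gc
import psutil

def auto_garbage_collect(pct=80.0):
    # Trigger a manual garbage-collection pass when system memory usage exceeds pct percent.
    if psutil.virtual_memory().percent >= pct:
        gc.collect()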
Example #5
    writer = SummaryWriter(comment="SAC-GEP")

    for batch in range(N_start, N_finish):

        initial_time = time.time()

        result = trainner.train()
        results.append(result)

        episode = {'n': batch,
                   'episode_reward_min':  result['episode_reward_min'],
                   'episode_reward_mean': result['episode_reward_mean'],
                   'episode_reward_max':  result['episode_reward_max'],
                   'episode_len_mean':    result['episode_len_mean']}

        episode_data.append(episode)
        episode_json.append(json.dumps(episode))

        writer.add_scalar("reward_min", result['episode_reward_min'], batch)
        writer.add_scalar("reward_mean", result['episode_reward_mean'], batch)
        writer.add_scalar("reward_max", result['episode_reward_max'], batch)

        if batch % 10 == 0:
            checkpoint = trainner.save()
            print("checkpoint saved at", checkpoint)

        print(f'{batch:3d}: Min/Mean/Max reward: {result["episode_reward_min"]:8.4f}/{result["episode_reward_mean"]:8.4f}/{result["episode_reward_max"]:8.4f} time:{time.time() - initial_time:.2f}[sec]')

    writer.close()
    print("\n Finished successfully")
def main():
    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')
    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config',
                        type=str,
                        default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo',
                        type=str,
                        default='PPO',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference',
                        action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str, help='checkpoint path to restore for inference')
    parser.add_argument('--epoch',
                        type=int,
                        default=10,
                        help='number of training epochs')
    parser.add_argument(
        '--num_step',
        type=int,
        default=10**3,
        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq',
                        type=int,
                        default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size',
                        type=int,
                        default=128,
                        help='training batch size')
    parser.add_argument('--state_time_span',
                        type=int,
                        default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span',
                        type=int,
                        default=30,
                        help='time interval to collect data')

    args = parser.parse_args()

    config_env = env_config(args)
    # ray.tune.register_env('gym_cityflow', lambda env_config:CityflowGymEnv(config_env))

    config_agent = agent_config(config_env)

    # # build cityflow environment

    trainer = SACTrainer(env=CityflowGymEnv, config=config_agent)
    for i in range(500):
        # Perform one iteration of training the policy with SAC
        result = trainer.train()
        print(pretty_print(result))

        if i % 30 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
Example #7
            (self.log_return_series, self.metric_series)).transpose()
        self.observation = np.concatenate(
            (price_lookback, metrics, self.position_series), axis=1)

        return self.observation

    def transaction_cost(
        self,
        new_action,
        old_action,
    ):
        turnover = np.abs(new_action - old_action)
        fees = 0.9995
        tcost = turnover * np.log(fees)
        return tcost


# Train agent
agent = SACTrainer(config, Equitydaily)

best_reward = -0.4
for i in range(50000):
    result = agent.train()
    if (result["episode_reward_mean"] > best_reward + 0.01) or (i % 1000
                                                                == 500):
        path = agent.save("sacagent")
        print(path)
        if result["episode_reward_mean"] > best_reward + 0.01:
            best_reward = result["episode_reward_mean"]
            print(i, best_reward)
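A worked example of the transaction-cost term from Example #7: with fees = 0.9995 (roughly a 5 bp proportional cost), a position change of 0.1 costs about 5e-05 in log-return terms. The values below are illustrative only:

import numpy as np

turnover = np.abs(0.6 - 0.5)        # position moves from 0.5 to 0.6
tcost = turnover * np.log(0.9995)   # ~ -5.0e-05, a small negative log-return penalty
print(tcost)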