Example #1
    def __init__(self, name, environment, training_iterations=10000, checkpoint_path=None, gpu=True):
        self.name = name
        self.env = environment
        self.config = DEFAULT_CONFIG.copy()  # copy so the shared SAC defaults are not mutated
        self.config['num_gpus'] = 1 if gpu else 0
        self.config['num_gpus_per_worker'] = 1 if gpu else 0
        self.iterations = training_iterations
        self.trainer = SACTrainer(config=self.config, env=self.env)  # pass the config so the GPU settings take effect
        # load model weights from a checkpoint, if one was given
        if checkpoint_path:
            self.trainer.restore(checkpoint_path)
Example #2
def train(config, reporter):
    trainer = SACTrainer(config=config, env=CamCalibrEnv_seq)
    policy = trainer.get_policy()
    print(policy.dist_class)
    while True:
        result = trainer.train()
        reporter(**result)
        # simple curriculum: move every env to phase 1 once 200 timesteps have elapsed
        if result["timesteps_since_restore"] > 200:
            phase = 1
        else:
            phase = 0
        trainer.workers.foreach_worker(
            lambda ev: ev.foreach_env(lambda env: env.set_phase(phase)))
        checkpoint_path = trainer.save()
        print(checkpoint_path)
def train_zero(config, reporter):
    agent = SACTrainer(config)
    #agent.restore("/home/yunke/ray_results/AlphaZero_BlackjackEnv_zero_2020-05-01_22-50-303ae70oaq/checkpoint_1981/checkpoint-1981") #continue training
    #training curriculum, start with phase 0

    episodes = 0
    i = 0
    while True:
        result = agent.train()
        if reporter is not None:
            reporter(**result)
        if i % 50 == 0:  # save every 50th training iteration
            checkpoint_path = agent.save()
            print(checkpoint_path)

        i += 1
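Both train() and train_zero() follow the legacy Tune function-trainable signature, so they are normally launched through tune.run rather than called directly. A minimal launch sketch, assuming an older Ray release that still passes a reporter into function trainables; the config overrides, resources, and stop criterion below are illustrative only:

import ray
from ray import tune

ray.init()
tune.run(
    train,                                    # the (config, reporter) function defined above
    config={"num_workers": 2},                # illustrative SAC config overrides
    resources_per_trial={"cpu": 2, "gpu": 0},
    stop={"training_iteration": 100},         # terminates the otherwise endless while-loop
)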
Example #4
class SACrl(object):
    def __init__(self, env, env_config, config):
        self.config = config
        self.config['env_config'] = env_config
        self.env = env(env_config)
        self.agent = SACTrainer(config=self.config, env=env)

    def fit(self, checkpoint=None):
        if checkpoint is None:
            checkpoint = os.path.join(os.getcwd(), 'data/checkpoint_rl.pkl')
        for idx in trange(5):
            result = self.agent.train()
            LOGGER.warning('result: %s', result)
            if (idx + 1) % 5 == 0:
                LOGGER.warning('Save checkpoint at: {}'.format(idx + 1))
                state = self.agent.save_to_object()
                with open(checkpoint, 'wb') as fp:
                    pickle.dump(state, fp, protocol=pickle.HIGHEST_PROTOCOL)
        return result

    def predict(self, checkpoint=None):
        if checkpoint is not None:
            with open(checkpoint, 'rb') as fp:
                state = pickle.load(fp)
            self.agent.restore_from_object(state)
        done = False
        episode_reward = 0
        obs = self.env.reset()
        actions = []
        while not done:
            action = self.agent.compute_action(obs)
            actions.append(action)
            obs, reward, done, info = self.env.step(action)
            episode_reward += reward
        results = {'action': actions, 'reward': episode_reward}
        return results
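A minimal usage sketch for the SACrl wrapper above; MyEnv, its env_config, and the config overrides are placeholder assumptions rather than part of the original snippet:

from ray.rllib.agents.sac import DEFAULT_CONFIG

config = DEFAULT_CONFIG.copy()
config['num_workers'] = 1

rl = SACrl(env=MyEnv, env_config={'episode_len': 200}, config=config)
rl.fit()                                       # trains and pickles the trainer state to data/checkpoint_rl.pkl
print(rl.predict('data/checkpoint_rl.pkl'))    # reloads the state and rolls out one episode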
Example #5
def train(config, reporter):
    trainer = SACTrainer(config=config, env=imuCalibrEnv_seq)
    #checkpoint_path = trainer.save()
    policy = trainer.get_policy()
    print(policy.dist_class)

    i = 0
    while True:
        result = trainer.train()
        reporter(**result)
        # if result["timesteps_since_restore"] > 200:
        #     phase = 1
        # else:
        #     phase = 0
        # trainer.workers.foreach_worker(
        #     lambda ev: ev.foreach_env(
        #         lambda env: env.set_phase(phase)))
        # if i==0:
        #     trainer.restore("/home/yunke/ray_results/SAC_imuCalibrEnv_seq_2020-05-21_23-27-20ig3rw_2c/checkpoint_1/checkpoint-1")
        if i % 100 == 0:
            checkpoint_path = trainer.save()
            print(checkpoint_path)
        auto_garbage_collect()
        i += 1
Example #6
class SACAgent(Agent):
    def __init__(self, name, environment, training_iterations=10000, checkpoint_path=None, gpu=True):
        self.name = name
        self.env = environment
        self.config = DEFAULT_CONFIG.copy()  # copy so the shared SAC defaults are not mutated
        self.config['num_gpus'] = 1 if gpu else 0
        self.config['num_gpus_per_worker'] = 1 if gpu else 0
        self.iterations = training_iterations
        self.trainer = SACTrainer(config=self.config, env=self.env)  # pass the config so the GPU settings take effect
        # load model weights from a checkpoint, if one was given
        if checkpoint_path:
            self.trainer.restore(checkpoint_path)
        
    def action(self, obs):
        act = self.trainer.compute_action(obs)
        return act

    def train(self, save_iter=100):
        for it in range(self.iterations):
            self.trainer.train()
            if it % save_iter == 0:
                checkpoint = self.trainer.save()
                print("checkpoint saved at", checkpoint)
Example #7
    def __init__(self, env, env_config, config):
        self.config = config
        self.config['env_config'] = env_config
        self.env = env(env_config)
        self.agent = SACTrainer(config=self.config, env=env)
Example #8
    config['num_workers'] = args.workers
    config['num_gpus'] = 1
    config['framework'] = "torch"
    config['gamma'] = args.gamma

    config['monitor'] = False

    config['model']['dim'] = 21
    config['model']['conv_filters'] = [ [8, [3, 3], 2],
                                        [16, [2, 2], 2],
                                        [512, [6, 6], 1]]

    config["buffer_size"] = 500_000
    # If True prioritized replay buffer will be used.
    config["prioritized_replay"] = True
    trainer = SACTrainer(config=config, env="mars_explorer:explorer-v01")

    if PATH != "":
        print(f"\nLoading trainer from dir {PATH}")
        trainer.restore(PATH)
    else:
        print("Starting training without a priori knowledge")

    N_start = 0
    N_finish = args.steps
    results = []
    episode_data = []
    episode_json = []

    writer = SummaryWriter(comment="SAC-GEP")
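The excerpt stops right after the SummaryWriter is created; below is a hedged sketch of the kind of loop that would typically follow, logging the mean episode reward to TensorBoard. The result keys are standard RLlib fields; everything else is an assumption, not the original code.

# hypothetical continuation: iterate training and log to TensorBoard
for n in range(N_start, N_finish):
    result = trainer.train()
    results.append(result)
    episode_data.append(result["episode_reward_mean"])
    writer.add_scalar("reward/episode_reward_mean",
                      result["episode_reward_mean"], n)
    if n % 10 == 0:
        print(f"iter {n}: mean reward {result['episode_reward_mean']:.2f}")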
Example #9
DEFAULT_CONFIG = SACTrainer.merge_trainer_configs(
    SAC_DEFAULT_CONFIG,
    {
        # Batch mode (see common config)
        "batch_mode": "complete_episodes",
        # If True prioritized replay buffer will be used.
        "prioritized_replay": False,
        # RNNSAC does not support n-step > 1 yet!
        "n_step": 1,
        # If True, assume a zero-initialized state input (no matter where in
        # the episode the sequence is located).
        # If False, store the initial states along with each SampleBatch, use
        # it (as initial state when running through the network for training),
        # and update that initial state during training (from the internal
        # state outputs of the immediately preceding sequence).
        "zero_init_states": True,
        # If > 0, use the `burn_in` first steps of each replay-sampled sequence
        # (starting either from all 0.0-values if `zero_init_state=True` or
        # from the already stored values) to calculate an even more accurate
        # initial states for the actual sequence (starting after this burn-in
        # window). In the burn-in case, the actual length of the sequence
        # used for loss calculation is `n - burn_in` time steps
        # (n=LSTM’s/attention net’s max_seq_len).
        "burn_in": 0,
        # Set automatically: The number of contiguous environment steps to
        # replay at once. Will be calculated via
        # model->max_seq_len + burn_in.
        # Do not set this to any valid value!
        "replay_sequence_length": -1,
    },
    _allow_unknown_configs=True,
)
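For context, merge_trainer_configs deep-merges the override dict into a copy of SAC's base config, and _allow_unknown_configs=True lets the new recurrent-replay keys through. A rough, approximately equivalent sketch for these top-level keys (not the library's actual implementation):

import copy

merged = copy.deepcopy(SAC_DEFAULT_CONFIG)
merged.update({
    "batch_mode": "complete_episodes",
    "prioritized_replay": False,
    "n_step": 1,
    "zero_init_states": True,
    "burn_in": 0,
    "replay_sequence_length": -1,   # later derived as model max_seq_len + burn_in
})
assert merged["batch_mode"] == "complete_episodes"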
Example #10
            }
        })
    results = tune.run(
        args.run,
        config=config,
        scheduler=scheduler,
        num_samples=4,
        stop=stop,
        # checkpoint_freq=10,
        checkpoint_at_end=True,
        # restore="/home/david/ray_results/SAC/SAC_FarmEnv_5aa8e_00000_0_2021-01-21_18-23-19/checkpoint_199/checkpoint-199",
    )
    if args.run == "PPO":
        agent = PPOTrainer(config=config)
    elif args.run == "SAC":
        agent = SACTrainer(config=config)
    elif args.run == "DDPG":
        agent = DDPGTrainer(config=config)

    # list of lists: one list per checkpoint; each checkpoint list contains
    # 1st the path, 2nd the metric value
    checkpoints = results.get_trial_checkpoints_paths(
        trial=results.get_best_trial("episode_reward_mean", mode='max'),
        metric="episode_reward_mean")
    checkpoint_path, _ = checkpoints[0]
    print(f'checkpoint_path {checkpoint_path}')
    #  agent = PPOTrainer(config=config_PPO)

    if args.as_test:
        check_learning_achieved(results, args.stop_reward)
    ray.shutdown()
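The script above selects the best checkpoint but never loads it. Below is a hedged sketch of an evaluation step that could run just before ray.shutdown(); env is assumed to be an instance of the same environment class the trial was trained on and is not defined in the original code.

agent.restore(checkpoint_path)           # load weights from the best trial

obs = env.reset()
done, total_reward = False, 0.0
while not done:
    action = agent.compute_action(obs, explore=False)   # deterministic rollout
    obs, reward, done, _ = env.step(action)
    total_reward += reward
print(f"evaluation return: {total_reward}")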
Example #11
        'framework': 'tfe' if args.tfe else 'tf',
        # 'target_entropy': args.target_entropy,
        # 'asymmetric': args.asymmetric,
        # "multiagent": {
        #     "policy_mapping_fn": lambda x: f"worker_p{x}", # if x == 0 else "worker_p2",
        #     'policies': policies,
        #     'policies_to_train': [f"worker_p{e}" for e in range(args.ensemble_size)]
        # },
    }
    ray.init(
        num_cpus=args.num_cpus or None,
        local_mode=args.local_mode,
        # redis_max_memory=int(4e9),
        # object_store_memory=int(10e9),
        # memory=int(16e9)
    )
    if args.debug:
        trainer = SACTrainer(config=config)
        # trainer = PPOTrainer(config=config)
        while True:
            results = trainer.train()  # distributed training step
            print(
                f"Iter: {results['training_iteration']}, R: {results['episode_reward_mean']}"
            )
    else:
        tune.run(
            SACTrainer,
            # PPOTrainer,
            verbose=args.verbose,
            config=config)
def main():
    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')
    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config',
                        type=str,
                        default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo',
                        type=str,
                        default='PPO',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference',
                        action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str, help='checkpoint path to restore for inference')
    parser.add_argument('--epoch',
                        type=int,
                        default=10,
                        help='number of training epochs')
    parser.add_argument(
        '--num_step',
        type=int,
        default=10**3,
        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq',
                        type=int,
                        default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size',
                        type=int,
                        default=128,
                        help='training batch size')
    parser.add_argument('--state_time_span',
                        type=int,
                        default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span',
                        type=int,
                        default=30,
                        help='time interval to collect data')

    args = parser.parse_args()

    config_env = env_config(args)
    # ray.tune.register_env('gym_cityflow', lambda env_config:CityflowGymEnv(config_env))

    config_agent = agent_config(config_env)

    # # build cityflow environment

    trainer = SACTrainer(env=CityflowGymEnv, config=config_agent)
    for i in range(500):
        # Perform one iteration of training the policy with SAC
        result = trainer.train()
        print(pretty_print(result))

        if i % 30 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
Example #13
            (self.log_return_series, self.metric_series)).transpose()
        self.observation = np.concatenate(
            (price_lookback, metrics, self.position_series), axis=1)

        return self.observation

    def transaction_cost(
        self,
        new_action,
        old_action,
    ):
        turnover = np.abs(new_action - old_action)
        fees = 0.9995
        tcost = turnover * np.log(fees)
        return tcost


# Train agent
agent = SACTrainer(config, Equitydaily)

best_reward = -0.4
for i in range(50000):
    result = agent.train()
    if (result["episode_reward_mean"] > best_reward + 0.01) or (i % 1000
                                                                == 500):
        path = agent.save("sacagent")
        print(path)
        if result["episode_reward_mean"] > best_reward + 0.01:
            best_reward = result["episode_reward_mean"]
            print(i, best_reward)
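A quick numeric check of the transaction_cost term defined above: with fees = 0.9995, log(fees) is about -5.0e-4 per unit of turnover, so flipping a full position (turnover = 2) costs roughly 0.1% in log-return terms.

import numpy as np

turnover = abs(1.0 - (-1.0))             # full flip from short to long
tcost = turnover * np.log(0.9995)
print(tcost)                             # ≈ -0.0010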
Example #14
def main_sac():

    from ray.rllib.agents.sac import SACTrainer, DEFAULT_CONFIG

    wandb.init(project='duocopter', sync_tensorboard=True)
    ray.init()

    env_config = {
        'copter_weight_kg': 0.5,
        'g': 9.81,
        'max_thrust_N': 2*9.81,
        'max_wattage_W': 2*350, # TODO: Use power curve
        'k1_m': 0.01, # TODO: Change
        'k2_m': 24E-3,
        'theta_deg': 0,
        'dyn_fric_coeff': 0.14,
        'cart_height_m': 0.2104,
        'thrust_centerline_distance_m': 0.01, #TODO: Change
        'dt': 1E-3,
        'max_height_m': 1.44,
        'sampling_rate_hz': 20,
        'log': True
        }

    config = DEFAULT_CONFIG.copy()

    config['num_workers'] = 10
    config['env_config'] = env_config
    config['framework'] = 'torch'
    config['Q_model']['fcnet_hiddens'] = [64, 64]
    config['policy_model']['fcnet_hiddens'] = [64, 64]
    config['timesteps_per_iteration'] = 5000
    config['rollout_fragment_length'] = 1
    config['buffer_size'] = 30000
    config['prioritized_replay'] = True
    config['train_batch_size'] = 1024
    config['n_step'] = 5
    config['target_network_update_freq'] = 5
    #config['lambda'] = 0.9
    #config['lr'] = 5e-5
    #config['rollout_fragment_length'] = 500
    #config['model']['fcnet_hiddens'] = [64, 64]

    trainer = SACTrainer(config=config, env=SimEnv)

    for i in range(100):
        result = trainer.train()
        print(pretty_print(result))

    env = SimEnv(env_config)
    state = env.reset()
    done = False
    ep_reward = 0

    while not done:
        thrust = trainer.compute_action(state, explore=False)
        state, rw, done, _ = env.step(thrust)
        ep_reward += rw

    print(env.calc_rms())
    env.plot()