Example #1
from ray.rllib.agents.sac import SACTrainer, DEFAULT_CONFIG

# `Agent` is assumed to be a project-specific base class defined elsewhere.
class SACAgent(Agent):
    def __init__(self, name, environment, training_iterations=10000, checkpoint_path=None, gpu=True):
        self.name = name
        self.env = environment
        self.config = DEFAULT_CONFIG.copy()  # copy so the shared default config is not mutated
        self.config['num_gpus'] = 1 if gpu else 0
        self.config['num_gpus_per_worker'] = 1 if gpu else 0
        self.iterations = training_iterations
        # Build the trainer with the assembled config.
        self.trainer = SACTrainer(config=self.config, env=self.env)
        # Restore from an existing checkpoint if one was given.
        if checkpoint_path:
            self.trainer.restore(checkpoint_path)
        
    def action(self, obs):
        # Compute a single action from the current policy.
        return self.trainer.compute_action(obs)

    def train(self, save_iter=100):
        # Run the configured number of training iterations,
        # checkpointing every `save_iter` iterations.
        for it in range(self.iterations):
            self.trainer.train()
            if it % save_iter == 0:
                checkpoint = self.trainer.save()
                print("checkpoint saved at", checkpoint)
Example #2
import os
import pickle
import logging

from tqdm import trange
from ray.rllib.agents.sac import SACTrainer

LOGGER = logging.getLogger(__name__)


class SACrl(object):
    def __init__(self, env, env_config, config):
        self.config = config
        self.config['env_config'] = env_config
        # Keep a local instance of the environment for evaluation rollouts.
        self.env = env(env_config)
        self.agent = SACTrainer(config=self.config, env=env)

    def fit(self, checkpoint=None):
        if checkpoint is None:
            checkpoint = os.path.join(os.getcwd(), 'data/checkpoint_rl.pkl')
        for idx in trange(5):
            result = self.agent.train()
            LOGGER.warning('result: %s', result)
            if (idx + 1) % 5 == 0:
                LOGGER.warning('Save checkpoint at: {}'.format(idx + 1))
                state = self.agent.save_to_object()
                with open(checkpoint, 'wb') as fp:
                    pickle.dump(state, fp, protocol=pickle.HIGHEST_PROTOCOL)
        return result

    def predict(self, checkpoint=None):
        if checkpoint is not None:
            with open(checkpoint, 'rb') as fp:
                state = pickle.load(fp)
            self.agent.restore_from_object(state)
        # Roll out one evaluation episode with the (possibly restored) policy.
        done = False
        episode_reward = 0
        obs = self.env.reset()
        actions = []
        while not done:
            action = self.agent.compute_action(obs)
            actions.append(action)
            obs, reward, done, info = self.env.step(action)
            episode_reward += reward
        results = {'action': actions, 'reward': episode_reward}
        return results
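
A minimal usage sketch for SACrl; MyEnv is a placeholder for a user-defined gym.Env subclass whose constructor takes the env_config dict. Note that fit() writes its pickle to data/checkpoint_rl.pkl by default, so that directory must exist:

import ray
from ray.rllib.agents.sac import DEFAULT_CONFIG

ray.init()
config = DEFAULT_CONFIG.copy()
config['framework'] = 'torch'
runner = SACrl(env=MyEnv, env_config={}, config=config)
runner.fit()                # train for a few iterations and pickle the trainer state
results = runner.predict()  # roll out one episode with the trained policy
print(results['reward'])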
Example #3
def main_sac():

    import ray
    import wandb
    from ray.rllib.agents.sac import SACTrainer, DEFAULT_CONFIG
    from ray.tune.logger import pretty_print

    wandb.init(project='duocopter', sync_tensorboard=True)
    ray.init()

    env_config = {
        'copter_weight_kg': 0.5,
        'g': 9.81,
        'max_thrust_N': 2*9.81,
        'max_wattage_W': 2*350, # TODO: Use power curve
        'k1_m': 0.01, # TODO: Change
        'k2_m': 24E-3,
        'theta_deg': 0,
        'dyn_fric_coeff': 0.14,
        'cart_height_m': 0.2104,
        'thrust_centerline_distance_m': 0.01, #TODO: Change
        'dt': 1E-3,
        'max_height_m': 1.44,
        'sampling_rate_hz': 20,
        'log': True
    }

    config = DEFAULT_CONFIG.copy()

    config['num_workers'] = 10
    config['env_config'] = env_config
    config['framework'] = 'torch'
    config['Q_model']['fcnet_hiddens'] = [64, 64]
    config['policy_model']['fcnet_hiddens'] = [64, 64]
    config['timesteps_per_iteration'] = 5000
    config['rollout_fragment_length'] = 1
    config['buffer_size'] = 30000
    config['prioritized_replay'] = True
    config['train_batch_size'] = 1024
    config['n_step'] = 5
    config['target_network_update_freq'] = 5
    #config['lambda'] = 0.9
    #config['lr'] = 5e-5
    #config['rollout_fragment_length'] = 500
    #config['model']['fcnet_hiddens'] = [64, 64]

    # SimEnv is the project's custom simulation environment, assumed to be defined/imported elsewhere.
    trainer = SACTrainer(config=config, env=SimEnv)

    for i in range(100):
        result = trainer.train()
        print(pretty_print(result))

    env = SimEnv(env_config)
    state = env.reset()
    done = False
    ep_reward = 0

    while not done:
        thrust = trainer.compute_action(state, explore=False)
        state, rw, done, _ = env.step(thrust)
        ep_reward += rw

    print(env.calc_rms())
    env.plot()
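
A minimal entry-point sketch for the script above, assuming SimEnv is available in the same module:

if __name__ == '__main__':
    main_sac()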