Example #1
    def __init__(self, state_dim, action_dim, max_action):
        # Actor network, its target copy and its optimizer
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=1e-4)

        # Critic network, target copy and optimizer (weight decay on the critic as in DDPG)
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 weight_decay=1e-2)

        # Separable CEM that evolves the actor's flattened parameter vector
        self.es = sepCEM(self.actor.get_size(),
                         mu_init=self.actor.get_params(),
                         sigma_init=1e-3,
                         damp=1e-3,
                         damp_limit=1e-5,
                         pop_size=10,
                         antithetic=True,
                         parents=4,
                         elitism=False)
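
The sepCEM constructor above is seeded with the actor's flattened weights (get_params()) and their count (get_size()). The short sketch below illustrates, under the assumption that these helpers simply flatten and restore the network's parameters, how such a round-trip can be written; it is a sketch of the idea, not the repository's own implementation.

    import numpy as np
    import torch

    # Flatten every network weight into a single numpy vector so the CEM can
    # treat the whole actor as one point in parameter space, and write such a
    # vector back into the network (sketch, assumed behaviour of get_params/set_params).
    def get_flat_params(net):
        return np.concatenate([p.data.cpu().numpy().ravel() for p in net.parameters()])

    def set_flat_params(net, flat):
        offset = 0
        for p in net.parameters():
            n = p.numel()
            chunk = torch.from_numpy(flat[offset:offset + n]).view_as(p).to(p.device)
            p.data.copy_(chunk)          # copy_ casts the dtype if needed
            offset += n
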
Example #2
    def __init__(self, config):
        self.config = config
        self.task = config.task_fn()
        # Worker network holds both actor and critic; the target network is a frozen copy
        self.worker_network = config.network_fn()
        self.target_network = config.network_fn()
        self.target_network.load_state_dict(self.worker_network.state_dict())
        self.actor_opt = config.actor_optimizer_fn(
            self.worker_network.actor.parameters())
        self.critic_opt = config.critic_optimizer_fn(
            self.worker_network.critic.parameters())
        self.replay = config.replay_fn()
        self.random_process = config.random_process_fn()
        self.criterion = nn.MSELoss()
        self.total_steps = 0
        # CEM hyper-parameters
        self.sigma_init = 1e-3
        self.damp = 1e-3
        self.damp_limit = 1e-5
        self.pop_size = 10
        self.elitism = True  # keep the best individual across generations
        self.n_grad = 5      # population members that also receive RL gradient updates
        self.start_steps = 1000  # 10000
        self.n_episodes = 1
        self.n_noisy = 0

        self.state_normalizer = Normalizer(self.task.state_dim)  # alternative: null_normaliser
        self.reward_normalizer = Normalizer(1)

        # Separable CEM over the actor's flattened parameters; mirrored (antithetic)
        # sampling is enabled only when the population size is even
        self.es = sepCEM(self.worker_network.actor.get_size(),
                         mu_init=self.worker_network.actor.get_params(),
                         sigma_init=self.sigma_init,
                         damp=self.damp,
                         damp_limit=self.damp_limit,
                         pop_size=self.pop_size,
                         antithetic=not self.pop_size % 2,
                         parents=self.pop_size // 2,
                         elitism=self.elitism)
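
Note that this constructor ties mirrored sampling to an even population size (antithetic=not self.pop_size % 2) and keeps half of the population as parents. As a rough illustration of what antithetic sampling means for a diagonal-covariance CEM (the general idea, not sepCEM's exact code):

    import numpy as np

    def sample_antithetic(mu, sigma, pop_size, rng=np.random):
        # Draw noise in +/- pairs around the mean: each sample has a mirrored twin,
        # which is why an even population size is required.
        assert pop_size % 2 == 0
        eps = rng.randn(pop_size // 2, mu.shape[0])
        eps = np.concatenate([eps, -eps], axis=0)
        return mu + eps * sigma      # sigma is a per-coordinate (separable) std vector
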
Example #3

    # (excerpt begins mid-statement: this is the tail of an exploration-noise
    # process constructor configured from the ou_* command-line arguments)
                                           theta=args.ou_theta,
                                           sigma=args.ou_sigma)

    if USE_CUDA:
        critic.cuda()
        critic_t.cuda()
        actor.cuda()
        actor_t.cuda()

    print("OK 4")
    # CEM optimizer over the actor's flattened parameters
    es = sepCEM(actor.get_size(),
                mu_init=actor.get_params(),
                sigma_init=args.sigma_init,
                damp=args.damp,
                damp_limit=args.damp_limit,
                pop_size=args.pop_size,
                antithetic=not args.pop_size % 2,
                parents=args.pop_size // 2,
                elitism=args.elitism)
    # es = Control(actor.get_size(), pop_size=args.pop_size, mu_init=actor.get_params())

    # training
    step_cpt = 0
    total_steps = 0
    actor_steps = 0
    df = pd.DataFrame(columns=[
        "total_steps", "average_score", "average_score_rl", "average_score_ea",
        "best_score"
    ])
    print("OK 5")