    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
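A minimal sketch, assuming the Keras-style model wrappers used above, of the soft-update step that self.tau feeds into (not part of the original listing; it would be called with e.g. self.actor_local.model and self.actor_target.model):

    def soft_update(self, local_model, target_model):
        # Blend local weights into target weights: w_target = tau * w_local + (1 - tau) * w_target
        new_weights = [self.tau * lw + (1.0 - self.tau) * tw
                       for lw, tw in zip(local_model.get_weights(),
                                         target_model.get_weights())]
        target_model.set_weights(new_weights)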
Example 2
    def __init__(self, alpha, beta, input_dims, tau, env, n_actions=2,
                 gamma=0.99, update_actor_interval=2, fc1Dms=400, fc2Dms=300,
                 max_size=1000000, batch_size=100, warmup=1000, noise=0.1):
        self.alpha = alpha
        self.beta = beta
        self.tau = tau
        self.batch_size = batch_size
        self.max_action = env.action_space.high
        self.min_action = env.action_space.low
        self.gamma = gamma 
        self.n_actions = n_actions
        self.learn_step_cntr = 0
        self.time_step = 0
        self.warmup = warmup
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.update_actor_iter = update_actor_interval

        self.critic_1 = CriticNet(beta, input_dims, n_actions, fc1Dms, fc2Dms,
                                  name='Critic1')
        self.critic_2 = CriticNet(beta, input_dims, n_actions, fc1Dms, fc2Dms,
                                  name='Critic2')
        self.actor = ActorNet(alpha, input_dims, n_actions, fc1Dms, fc2Dms,
                              name='actor')
        # target networks
        self.target_critic_1 = CriticNet(beta, input_dims, n_actions, fc1Dms, fc2Dms,
                                         name='Target_critic1')
        self.target_critic_2 = CriticNet(beta, input_dims, n_actions, fc1Dms, fc2Dms,
                                         name='Target_critic2')
        self.target_actor = ActorNet(alpha, input_dims, n_actions, fc1Dms, fc2Dms,
                                     name='Target_actor')
        self.noise = noise  # scale (std) of the exploration noise added to actions
        # initialize the target networks as exact copies of the online networks (hard update)
        self.update_network_parameters(tau=1)
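A minimal sketch of the update_network_parameters call used above, assuming ActorNet/CriticNet are torch.nn.Module subclasses (an assumption, not shown here); with tau=1 it hard-copies the online weights into the targets, and with the default self.tau it performs the usual Polyak soft update:

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau
        # Polyak-average each online network into its target counterpart
        pairs = [(self.target_actor, self.actor),
                 (self.target_critic_1, self.critic_1),
                 (self.target_critic_2, self.critic_2)]
        for target_net, net in pairs:
            for target_param, param in zip(target_net.parameters(), net.parameters()):
                target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)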
Example 3
    def __init__(self, args, env, env_params):
        self.args = args
        self.env = env
        self.env_params = env_params

        # build the actor/critic evaluation (online) networks
        self.actor_net = Actor(env_params, hidden_units=256)
        self.critic_net = Critic(env_params, hidden_units=256)

        # sync the network parameters across the CPUs for parallel (MPI) training
        sync_networks(self.actor_net)
        sync_networks(self.critic_net)

        # build the actor/critic target networks
        self.actor_target_net = Actor(env_params, hidden_units=256)
        self.critic_target_net = Critic(env_params, hidden_units=256)

        # move the networks to the GPU if requested
        if self.args.cuda:
            self.actor_net.cuda()
            self.critic_net.cuda()
            self.actor_target_net.cuda()
            self.critic_target_net.cuda()

        # optimizers for the networks
        self.actor_optimizer = torch.optim.Adam(
            self.actor_net.parameters(), lr=self.args.learning_rate_actor)
        self.critic_optimizer = torch.optim.Adam(
            self.critic_net.parameters(), lr=self.args.learning_rate_critic)

        # HER sample function
        self.her_sample = HER(self.args.replay_strategy,
                              self.args.replay_ratio, self.env.compute_reward)

        # experience buffer
        self.exp_buffer = ReplayBuffer(self.env_params, self.args.buffer_size,
                                       self.her_sample.her_sample_transitions)

        # normalizers for the observations and goals
        self.obs_norm = Normalizer(size=env_params['obs'],
                                   clip_range=self.args.clip_range)
        self.goal_norm = Normalizer(size=env_params['d_goal'],
                                    clip_range=self.args.clip_range)

        # create the directory to save the model (only on MPI rank 0)
        if MPI.COMM_WORLD.Get_rank() == 0:
            if not os.path.exists(self.args.save_dir):
                os.mkdir(self.args.save_dir)

            # get the model path
            self.model_path = os.path.join(self.args.save_dir,
                                           self.args.env_name)
            if not os.path.exists(self.model_path):
                os.mkdir(self.model_path)
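A hypothetical helper (not in the original snippet) illustrating how the two normalizers are typically applied before a forward pass; it assumes Normalizer exposes a normalize() method, that numpy is imported as np, and that the normalized observation and goal are concatenated as the actor/critic input:

    def _preprocess_inputs(self, obs, goal):
        obs = self.obs_norm.normalize(obs)
        goal = self.goal_norm.normalize(goal)
        inputs = np.concatenate([obs, goal])
        inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
        if self.args.cuda:
            inputs = inputs.cuda()
        return inputs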
Example 4
    def __init__(self, env, sess):
        # Environment
        self.n_state = env.observation_space.shape[0]
        self.n_action = env.action_space.shape[0]

        # Neural Networks
        self.sess = sess
        self.actor = Actor(self.sess, self.n_state, self.n_action)
        self.critic = Critic(self.sess, self.n_state, self.n_action)

        # Replay Buffer
        self.replay_buffer = ReplayBuffer(BUFFER_SIZE)
        # Ornstein-Uhlenbeck Noise
        self.exploration_noise = OUNoise(self.n_action)
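For reference, a minimal sketch of the Ornstein-Uhlenbeck process that an OUNoise class like the one above typically implements; the default mu/theta/sigma values mirror the first example and are assumptions, and numpy is assumed to be imported as np:

class OUNoise:
    """Temporally correlated exploration noise: dx = theta * (mu - x) + sigma * N(0, 1)."""
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # reset the internal state to the mean
        self.state = np.copy(self.mu)

    def sample(self):
        # drift toward the mean plus Gaussian diffusion
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state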
Example 5
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 n_actions,
                 gamma=0.99,
                 fc1Dms=400,
                 fc2Dms=300,
                 max_size=1000000,
                 batch_size=64):
        self.alpha = alpha
        self.tau = tau
        self.beta = beta
        self.batch_size = batch_size
        self.gamma = gamma
        self.n_actions = n_actions
        print(batch_size, fc1Dms, fc2Dms)  # debug print of the chosen hyperparameters
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.actor = ActorNet(alpha=alpha,
                              input_dims=input_dims,
                              n_actions=n_actions,
                              fc1Dms=fc1Dms,
                              fc2Dms=fc2Dms,
                              name='actor')
        self.critic = CriticNet(beta=beta,
                                input_dims=input_dims,
                                n_actions=n_actions,
                                fc1Dms=fc1Dms,
                                fc2Dms=fc2Dms,
                                name='critic')
        self.target_actor = ActorNet(alpha=alpha,
                                     input_dims=input_dims,
                                     n_actions=n_actions,
                                     fc1Dms=fc1Dms,
                                     fc2Dms=fc2Dms,
                                     name='target_actor')
        self.target_critic = CriticNet(beta=beta,
                                       input_dims=input_dims,
                                       n_actions=n_actions,
                                       fc1Dms=fc1Dms,
                                       fc2Dms=fc2Dms,
                                       name='target_critic')

        self.update_network_parameters(tau=1)
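A hypothetical companion method (not in the original listing) showing how the OU noise above is usually combined with the deterministic policy during training; it assumes ActorNet is a torch.nn.Module with a device attribute and that OUActionNoise instances are callable:

    def choose_action(self, observation):
        self.actor.eval()  # avoid updating batch-norm statistics during action selection
        state = torch.tensor([observation], dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(state)
        mu_prime = mu + torch.tensor(self.noise(), dtype=torch.float).to(self.actor.device)
        self.actor.train()
        return mu_prime.cpu().detach().numpy()[0]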
Example 6
    ##################### Make env and store env information #######################
    env = gym.make(args.env_name)
    args.action_dim = env.action_space.shape[0]
    args.max_action = int(env.action_space.high[0])
    args.state_dim = env.observation_space.shape[0]

    ################################### Set seed ###################################
    env.seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    ############### Set replay-buffer, rl-agent, es-agent, and actor ###############
    replay_buffer = ReplayBuffer(args,
                                 args.state_dim,
                                 args.action_dim,
                                 max_size=args.buffer_size)
    rl_agent = TD3(args.state_dim, args.action_dim, args.max_action, args)
    es_agent = CEM(args,
                   num_params=rl_agent.actor.get_size(),
                   mu_init=rl_agent.actor.get_params())
    actor = copy.deepcopy(rl_agent.actor)

    ##################### Set data-frame for saving results ########################
    df_log = pd.DataFrame(columns=["Step", "AvgES", "BestES", "RL"])
    df_steps = pd.DataFrame(columns=["Step"] +
                            [f"Ind{i}" for i in range(1, args.pop_size + 1)])
    df_fitness = pd.DataFrame(columns=["Step"] +
                              [f"Ind{i}" for i in range(1, args.pop_size + 1)])
    df_mu = pd.DataFrame(columns=["Step", "Mean", "Std"] +
                         [f"Reward{i}" for i in range(1, args.n_eval + 1)])