Example #1
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.3
        self.exploration_theta = 2.0
        self.exploration_sigma = 20
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 10
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
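
The OUNoise class used above is not shown in the snippet. Below is a minimal sketch of an Ornstein-Uhlenbeck noise process matching the OUNoise(size, mu, theta, sigma) constructor; treat it as an assumption about the interface rather than the actual implementation.

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process (sketch; the real class is not shown above)."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Apply mean-reverting drift plus Gaussian noise and return the new state."""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state
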
Example #2
    def __init__(self,
                 lr_actor=0.0003,
                 lr_critic=0.0003,
                 state_dim=8,
                 discount=0.99,
                 action_dim=1,
                 replay_buffer_capacity=1000000,
                 tau=0.005,
                 batch_size=256,
                 reward_scaling=1,
                 rbc_controller=RBCAgent,
                 safe_exploration=None,
                 hidden_dim=None):
        self.gamma = discount
        self.tau = tau
        self.memory = ReplayBuffer(input_shape=state_dim,
                                   n_actions=action_dim,
                                   max_mem_size=replay_buffer_capacity)
        self.batch_size = batch_size
        self.n_actions = action_dim
        self.rbc_controller = rbc_controller
        self.safe_exploration = safe_exploration
        self.hidden_size = hidden_dim

        self.actor = ActorNetwork(learning_rate=lr_actor,
                                  input_size=state_dim,
                                  max_action=1,
                                  n_actions=action_dim,
                                  name='actor',
                                  hidden_size=self.hidden_size)
        self.critic_1 = CriticNetwork(learning_rate=lr_critic,
                                      input_size=state_dim,
                                      n_actions=action_dim,
                                      name='critic_1',
                                      hidden_size=self.hidden_size)
        self.critic_2 = CriticNetwork(learning_rate=lr_critic,
                                      input_size=state_dim,
                                      n_actions=action_dim,
                                      name='critic_2',
                                      hidden_size=self.hidden_size)

        self.value = ValueNetwork(learning_rate=lr_critic,
                                  input_size=state_dim,
                                  name='value',
                                  hidden_size=self.hidden_size)
        self.target_value = ValueNetwork(learning_rate=lr_critic,
                                         input_size=state_dim,
                                         name='target_value',
                                         hidden_size=self.hidden_size)

        self.scale = reward_scaling
        self.update_network_parameters(tau=1)
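
The call to self.update_network_parameters(tau=1) performs a hard copy of the value network into target_value at construction time; later calls with the default tau blend them softly. A minimal sketch of that method, assuming PyTorch-style modules exposing .parameters() (the framework behind ValueNetwork is not shown in the snippet):

    def update_network_parameters(self, tau=None):
        """Soft update: theta_target <- tau * theta + (1 - tau) * theta_target."""
        if tau is None:
            tau = self.tau
        # With tau=1 this reduces to a hard copy of the value network.
        for target_param, param in zip(self.target_value.parameters(),
                                       self.value.parameters()):
            target_param.data.copy_(tau * param.data +
                                    (1.0 - tau) * target_param.data)
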
Example #3
    def __init__(self,
                 task,
                 expl_mu,
                 expl_th,
                 expl_sigma,
                 gamma,
                 tau,
                 batch=64):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = expl_mu
        self.exploration_theta = expl_th
        self.exploration_sigma = expl_sigma
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 200000
        self.batch_size = batch
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters
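
Examples #1 and #3 initialize the target networks with a hard copy via set_weights/get_weights; tau is typically used later for a soft update in the same Keras idiom. A minimal sketch of that step (the actual learn/soft-update code is not shown above):

    def soft_update(self, local_model, target_model):
        """Blend local weights into target weights: w_target <- tau*w_local + (1-tau)*w_target."""
        new_weights = [self.tau * lw + (1.0 - self.tau) * tw
                       for lw, tw in zip(local_model.model.get_weights(),
                                         target_model.model.get_weights())]
        target_model.model.set_weights(new_weights)
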
Example #4
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.10
        self.exploration_sigma = 0.15
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # score tracker
        self.best_score = -np.inf
        self.achievement = False

        # Episode variables
        self.reset_episode()
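
The DDPG-style examples above build the memory as ReplayBuffer(buffer_size, batch_size). A minimal deque-based sketch matching that two-argument constructor; the stored field names and method names here are assumptions, not the original class:

import random
from collections import deque, namedtuple

Experience = namedtuple('Experience',
                        ['state', 'action', 'reward', 'next_state', 'done'])

class ReplayBuffer:
    """Fixed-size buffer of experience tuples (sketch of the interface used above)."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest entries are dropped first
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)
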
Example #5
def train():

    # build SFDQN
    print('building SFDQN')
    deep_sf = DeepSF(keras_model_handle=sf_model_lambda, **sfdqn_params)
    sfdqn = SFDQN(deep_sf=deep_sf,
                  buffer=ReplayBuffer(sfdqn_params['buffer_params']),
                  **sfdqn_params,
                  **agent_params)

    # train SFDQN
    print('training SFDQN')
    train_tasks, test_tasks = generate_tasks(False)
    sfdqn_perf = sfdqn.train(train_tasks,
                             n_samples,
                             test_tasks=test_tasks,
                             n_test_ev=agent_params['n_test_ev'])

    # build DQN
    print('building DQN')
    dqn = DQN(model_lambda=dqn_model_lambda,
              buffer=ReplayBuffer(dqn_params['buffer_params']),
              **dqn_params,
              **agent_params)

    # training DQN
    print('training DQN')
    train_tasks, test_tasks = generate_tasks(True)
    dqn_perf = dqn.train(train_tasks,
                         n_samples,
                         test_tasks=test_tasks,
                         n_test_ev=agent_params['n_test_ev'])

    # smooth data
    def smooth(y, box_pts):
        return np.convolve(y, np.ones(box_pts) / box_pts, mode='same')

    sfdqn_perf = smooth(sfdqn_perf, 10)[:-5]
    dqn_perf = smooth(dqn_perf, 10)[:-5]
    x = np.linspace(0, 4, sfdqn_perf.size)

    # reporting progress
    ticksize = 14
    textsize = 18
    plt.rc('font', size=textsize)  # controls default text sizes
    plt.rc('axes', titlesize=textsize)  # fontsize of the axes title
    plt.rc('axes', labelsize=textsize)  # fontsize of the x and y labels
    plt.rc('xtick', labelsize=ticksize)  # fontsize of the tick labels
    plt.rc('ytick', labelsize=ticksize)  # fontsize of the tick labels
    plt.rc('legend', fontsize=ticksize)  # legend fontsize

    plt.figure(figsize=(8, 6))
    ax = plt.gca()
    ax.plot(x, sfdqn_perf, label='SFDQN')
    ax.plot(x, dqn_perf, label='DQN')
    plt.xlabel('training task index')
    plt.ylabel('averaged test episode reward')
    plt.title('Testing Reward Averaged over all Test Tasks')
    plt.tight_layout()
    plt.legend(frameon=False)
    plt.savefig('figures/sfdqn_return.png')
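
The smooth helper above is a simple box filter; because np.convolve with mode='same' zero-pads at the boundaries, the last few points are biased low, which is why the curves are trimmed with [:-5]. A small standalone check of that edge effect:

import numpy as np

y = np.ones(20)                     # a flat signal
box = np.ones(10) / 10
smoothed = np.convolve(y, box, mode='same')
print(smoothed[:3], smoothed[-3:])  # interior values equal 1.0; both ends sag due to zero padding
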
Example #6
    def __init__(self,
                 observation_space=None,
                 action_space=None,
                 hidden_dim=None,
                 discount=0.99,
                 tau=0.005,
                 lr=None,
                 batch_size=256,
                 replay_buffer_capacity=1e5,
                 start_training=None,
                 exploration_period=None,
                 action_scaling_coef=1.,
                 reward_scaling=1.,
                 update_per_step=1,
                 iterations_as=2,
                 seed=0,
                 deterministic=None,
                 rbc_controller=None,
                 safe_exploration=None):

        if hidden_dim is None:
            hidden_dim = [256, 256]
        self.start_training = start_training
        self.discount = discount
        self.batch_size = batch_size
        self.tau = tau
        self.action_scaling_coef = action_scaling_coef
        self.reward_scaling = reward_scaling
        t.manual_seed(seed)
        np.random.seed(seed)
        self.deterministic = deterministic
        self.update_per_step = update_per_step
        self.iterations_as = iterations_as
        self.exploration_period = exploration_period
        self.action_list_ = []
        self.action_list2_ = []
        self.hidden_dim = hidden_dim
        self.rbc_controller = rbc_controller
        self.safe_exploration = safe_exploration
        self.reset_action_tracker()

        self.reset_reward_tracker()

        self.time_step = 0
        self.action_space = action_space
        self.observation_space = observation_space

        # Optimizers/Loss using the Huber loss
        self.soft_q_criterion = nn.SmoothL1Loss()

        # device
        self.device = t.device("cuda" if t.cuda.is_available() else "cpu")

        state_dim = self.observation_space.shape[0]
        action_dim = self.action_space.shape[0]
        self.alpha = 0.05

        self.memory = ReplayBuffer(input_shape=int(state_dim),
                                   n_actions=int(action_dim),
                                   max_mem_size=int(replay_buffer_capacity))

        # init networks
        self.soft_q_net1 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.device)
        self.soft_q_net2 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.device)

        self.target_soft_q_net1 = SoftQNetwork(state_dim, action_dim,
                                               hidden_dim).to(self.device)
        self.target_soft_q_net2 = SoftQNetwork(state_dim, action_dim,
                                               hidden_dim).to(self.device)

        for target_param, param in zip(self.target_soft_q_net1.parameters(),
                                       self.soft_q_net1.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.target_soft_q_net2.parameters(),
                                       self.soft_q_net2.parameters()):
            target_param.data.copy_(param.data)

        # Policy
        self.policy_net = PolicyNetwork(state_dim, action_dim,
                                        self.action_space,
                                        self.action_scaling_coef,
                                        hidden_dim).to(self.device)
        self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(),
                                            lr=lr)
        self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(),
                                            lr=lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.target_entropy = -np.prod(self.action_space.shape).item()
        self.log_alpha = t.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=lr)
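
Example #6 sets target_entropy to the negative action dimensionality and creates an optimizer over log_alpha; the entropy temperature is then usually adjusted during each learning step with the standard SAC objective. A minimal sketch of that step, assuming new_log_pi holds the log-probabilities of actions freshly sampled from the current policy (a name not in the snippet above):

        # inside the learning update, after sampling actions from the current policy
        alpha_loss = -(self.log_alpha *
                       (new_log_pi + self.target_entropy).detach()).mean()

        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()

        self.alpha = self.log_alpha.exp().item()  # temperature used in the actor/critic losses
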
Example #7
    def __init__(self,
                 state_dim=None,
                 action_dim=None,
                 hidden_dim=None,
                 discount=0.99,
                 tau=0.005,
                 lr_actor=None,
                 lr_critic=None,
                 batch_size=256,
                 replay_buffer_capacity=1e5,
                 learning_start=None,
                 reward_scaling=1.,
                 seed=0,
                 rbc_controller=None,
                 safe_exploration=None,
                 automatic_entropy_tuning=False,
                 alpha=1):

        if hidden_dim is None:
            hidden_dim = [256, 256]
        self.learning_start = learning_start
        self.discount = discount
        self.batch_size = batch_size
        self.tau = tau
        self.reward_scaling = reward_scaling
        t.manual_seed(seed)
        np.random.seed(seed)
        self.action_list_ = []
        self.action_list2_ = []
        self.hidden_dim = hidden_dim
        self.rbc_controller = rbc_controller
        self.safe_exploration = safe_exploration
        self.automatic_entropy_tuning = automatic_entropy_tuning

        self.time_step = 0

        # Optimizers/Loss using the Huber loss
        # self.soft_q_criterion = f.mse_loss

        # device
        self.device = t.device("cuda" if t.cuda.is_available() else "cpu")

        self.memory = ReplayBuffer(input_shape=int(state_dim),
                                   n_actions=int(1),
                                   max_mem_size=int(replay_buffer_capacity))

        # init networks
        self.soft_q_net1 = SoftQNetworkDiscrete(state_dim, action_dim,
                                                hidden_dim).to(self.device)
        self.soft_q_net2 = SoftQNetworkDiscrete(state_dim, action_dim,
                                                hidden_dim).to(self.device)

        self.target_soft_q_net1 = SoftQNetworkDiscrete(
            state_dim, action_dim, hidden_dim).to(self.device)
        self.target_soft_q_net2 = SoftQNetworkDiscrete(
            state_dim, action_dim, hidden_dim).to(self.device)

        for target_param, param in zip(self.target_soft_q_net1.parameters(),
                                       self.soft_q_net1.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.target_soft_q_net2.parameters(),
                                       self.soft_q_net2.parameters()):
            target_param.data.copy_(param.data)

        # Policy
        self.policy_net = PolicyNetworkDiscrete(state_dim, action_dim,
                                                hidden_dim).to(self.device)
        self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(),
                                            lr=lr_critic)
        self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(),
                                            lr=lr_critic)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=lr_actor)

        if self.automatic_entropy_tuning:
            # we set the max possible entropy as the target entropy
            self.target_entropy = -np.log((1.0 / action_dim)) * 0.98
            self.log_alpha = t.zeros(1, requires_grad=True, device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optimizer = optim.Adam([self.log_alpha],
                                              lr=lr_critic,
                                              eps=1e-4)
        else:
            self.alpha = alpha
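
Example #7 is a discrete-action SAC variant: the critics output one Q-value per action and the policy outputs a categorical distribution, so the Bellman target is usually an expectation over next-state action probabilities rather than a sample. A minimal sketch of that target, assuming PolicyNetworkDiscrete returns (action_probs, log_action_probs) for a batch of next states and that next_states, rewards, and dones are minibatch tensors (the actual update method is not shown above):

        # sketch of the soft Q target for the discrete case (hypothetical variable names)
        with t.no_grad():
            next_probs, next_log_probs = self.policy_net(next_states)
            q1_next = self.target_soft_q_net1(next_states)   # shape: (batch, n_actions)
            q2_next = self.target_soft_q_net2(next_states)
            min_q_next = t.min(q1_next, q2_next)
            # expectation over the categorical policy, with an entropy bonus
            next_value = (next_probs *
                          (min_q_next - self.alpha * next_log_probs)).sum(dim=1)
            q_target = rewards + (1.0 - dones) * self.discount * next_value
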