Example no. 1
    def __init__(self, state_size, action_size, max_action, minibatch_size,
                 a_lr, c_lr, gamma, tau):
        self.state_size = state_size
        self.action_size = action_size
        self.max_action = max_action

        self.critic_lr = c_lr
        self.actor_lr = a_lr

        self.actor_network = Actor(self.state_size, self.action_size,
                                   self.max_action, self.actor_lr)
        self.actor_target_network = Actor(self.state_size, self.action_size,
                                          self.max_action, self.actor_lr)
        self.critic_network = Critic(self.state_size, self.action_size,
                                     self.critic_lr)
        self.critic_target_network = Critic(self.state_size, self.action_size,
                                            self.critic_lr)

        self.actor_target_network.set_weights(self.actor_network.get_weights())
        self.critic_target_network.set_weights(
            self.critic_network.get_weights())

        self.critic_optimizer = optimizers.Adam(learning_rate=self.critic_lr)
        self.actor_optimizer = optimizers.Adam(learning_rate=self.actor_lr)

        self.replay_buffer = ReplayBuffer(1e6)
        self.MINIBATCH_SIZE = minibatch_size
        self.GAMMA = tf.cast(gamma, dtype=tf.float64)
        self.TAU = tau
        self.noise = OUNoise(self.action_size)
Example no. 2
    def __init__(self, agent_id, state_size, action_size, rand_seed,
                 meta_agent):
        """ Creates a new DDPG Agent """

        self.agent_id = agent_id
        self.action_size = action_size

        # Defines the Actor Networks
        self.actor_local = Actor(state_size, action_size, rand_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  rand_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Defines the Critic Networks
        self.critic_local = Critic(state_size, action_size,
                                   meta_agent.agents_qty, rand_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    meta_agent.agents_qty,
                                    rand_seed).to(device)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=LR_CRITIC)  #, weight_decay=WEIGHT_DECAY)

        self.noise = OUNoise(action_size, rand_seed)

        # Refers to the MA agent memory
        self.memory = meta_agent.memory

        self.t_step = 0
Example no. 3
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.1  # for soft update of target parameters
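Example no. 3 keeps tau "for soft update of target parameters" but does not show the update itself. Below is a minimal sketch of how such Keras-based agents typically blend the weights; the helper name soft_update and the commented call sites are assumptions, not part of the snippet.

def soft_update(local_model, target_model, tau):
    """Blend target weights toward local weights: target <- tau*local + (1-tau)*target."""
    local_weights = local_model.get_weights()
    target_weights = target_model.get_weights()
    new_weights = [tau * lw + (1.0 - tau) * tw
                   for lw, tw in zip(local_weights, target_weights)]
    target_model.set_weights(new_weights)

# hypothetical call sites, e.g. once per learning step:
# soft_update(agent.actor_local.model, agent.actor_target.model, agent.tau)
# soft_update(agent.critic_local.model, agent.critic_target.model, agent.tau)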
Example no. 4
def test(env, trained_model):
    actor_net = NCS_nn.NCS_net(48, 4, 0.8).to(device)
    model = torch.load(trained_model)
    actor_net.load_state_dict(model)
    actor_net.eval()

    # IF YOU WANT TO START AT A RANDOM INTERMEDIATE STATE
    # file_name = open("data_cube_5_10_07_19_1612.pkl", "rb")
    # data = pickle.load(file_name)
    # states = np.array(data["states"])
    # random_states_index = np.random.randint(0, len(states), size = len(states))

    noise = OUNoise(4)
    expl_noise = OUNoise(4, sigma=0.001)
    for _ in range(10):
        # inference
        obs, done = env.reset(), False
        # obs = env.env.intermediate_state_reset(states[np.random.choice(random_states_index, 1)[0]])
        print("start")
        # while not done:
        for _ in range(150):
            obs = torch.FloatTensor(np.array(obs).reshape(1, -1)).to(
                device)  # + expl_noise.noise()
            action = actor_net(obs).cpu().data.numpy().flatten()
            print(action)
            obs, reward, done, _ = env.step(action)
Example no. 5
    def __init__(self, state_size, action_size, num_agents):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state for each agent
            action_size (int): dimension of each action for each agent
        """
        self.state_size = state_size
        self.action_size = action_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(DEVICE)
        self.actor_target = Actor(state_size, action_size).to(DEVICE)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR,
                                          weight_decay=WEIGHT_DECAY_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(num_agents * state_size,
                                   num_agents * action_size).to(DEVICE)
        self.critic_target = Critic(num_agents * state_size,
                                    num_agents * action_size).to(DEVICE)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY_critic)

        # Noise process
        self.noise = OUNoise(action_size)  # single agent only
        self.noise_scale = NOISE_START

        # Make sure the target is initialized with the same weights as the source (makes a big difference)
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)
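Example no. 5 relies on a hard_update helper to copy the freshly created local weights into the target networks. A minimal PyTorch sketch of that helper, together with the Polyak-style soft_update such agents normally use during learning, is given below; only hard_update appears in the snippet, so the soft_update signature is an assumption.

def hard_update(target, source):
    """Copy every parameter of source into target (equivalent to tau = 1)."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)

def soft_update(target, source, tau):
    """Polyak average: target <- tau*source + (1 - tau)*target."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)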
Example no. 6
 def __init__(self,
              C,
              b,
              x,
              action_output_num,
              actor_size,
              replay_size=1000000,
              ou_noise=True,
              param_noise=True,
              noise_scale=0.3,
              final_noise_scale=0.3):
     self.C = C
     self.b = b
     self.x = x
     self.hd = action_output_num
     self.actor_size = actor_size
     self.memory = ReplayMemory(replay_size)
     self.new_b = None
     self.env = None
     self.agent = None
     self.ou_noise = ou_noise
     self.noise_scale = noise_scale
     self.final_noise_scale = final_noise_scale
     self.ounoise = OUNoise(action_output_num) if ou_noise else None
     self.param_noise = AdaptiveParamNoiseSpec(
         initial_stddev=0.05,
         desired_action_stddev=noise_scale,
         adaptation_coefficient=1.05) if param_noise else None
Example no. 7
    def __init__(self, agent_id, model, action_size=2, seed=0):
        """Initialize an Agent object.
        """
        self.seed = random.seed(seed)
        self.id = agent_id
        self.action_size = action_size

        # Actor Network
        self.actor_local = model.actor_local
        self.actor_target = model.actor_target
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network
        self.critic_local = model.critic_local
        self.critic_target = model.critic_target
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Initialize target actor and critic weights to match the local networks
        self.hard_copy_weights(self.actor_target, self.actor_local)
        self.hard_copy_weights(self.critic_target, self.critic_local)

        # Noise process
        self.noise = OUNoise(action_size, seed)
Example no. 8
    def __init__(self, task):
        # Hyperparameters
        self.learning_rate_actor = 1e-4
        self.learning_rate_critic = 1e-3
        self.gamma = 0.99
        self.tau = 0.001

        # Define net
        self.sess = tf.Session()
        self.task = task
        self.actor = ActorNet(self.sess, self.task.state_size, self.task.action_size, self.learning_rate_actor, \
                     self.task.action_low, self.task.action_high, self.tau)
        self.critic = CriticNet(self.sess, self.task.state_size, self.task.action_size, self.learning_rate_critic, self.tau)

        # Define noise
        self.mu = 0
        self.theta = 0.15
        self.sigma = 0.20
        self.noise = OUNoise(self.task.action_size, self.mu, self.theta, self.sigma)

        # Define memory replay
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = Replay(self.buffer_size, self.batch_size)

        # Score
        self.best_score = -np.inf
        self.best_reward = -np.inf
Example no. 9
    def __init__(self, config, state_size, action_size):
        super(DDPGAgent, self).__init__()
        l1 = config['network']['hidden']
        l2 = int(config['network']['hidden'] / 2)
        self.actor = Actor(state_size, action_size, config['seed']['agent'],
                           l1, l2).to(device)
        self.critic = Critic(state_size, action_size, config['seed']['agent'],
                             l1, l2).to(device)
        self.target_actor = Actor(state_size, action_size,
                                  config['seed']['agent'], l1, l2).to(device)
        self.target_critic = Critic(state_size, action_size,
                                    config['seed']['agent'], l1, l2).to(device)

        self.noise = OUNoise(action_size,
                             mu=config['noise']['mu'],
                             sigma=config['noise']['sigma'],
                             theta=config['noise']['theta'])

        # initialize targets same as original networks
        self.hard_update(self.target_actor, self.actor)
        self.hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=config['LR_ACTOR'])
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=config['LR_CRITIC'])
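Example no. 9 pulls every hyperparameter from a nested config dict. Only the keys below are actually read by the snippet; the values shown are illustrative assumptions.

config = {
    'network': {'hidden': 256},   # l1 = hidden, l2 = hidden // 2
    'seed': {'agent': 0},
    'noise': {'mu': 0.0, 'theta': 0.15, 'sigma': 0.2},
    'LR_ACTOR': 1e-4,
    'LR_CRITIC': 1e-3,
}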
Example no. 10
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
Example no. 11
    def __init__(
        self,
        num_agents,
        state_size,
        action_size,
        buffer_size=int(1e5),
        batch_size=128,
        gamma=0.99,
        tau=1e-3,
        lr_actor=1e-4,
        lr_critic=1e-3,
        weight_decay=0,
        random_seed=2,
    ):
        """Initialize an Agent object.

        Params
        ======
            num_agents (int): number of agents
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(
            action_size=action_size,
            buffer_size=buffer_size,
            batch_size=batch_size,
            seed=random_seed,
        )
Example no. 12
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 device,
                 gamma=GAMMA,
                 tau=TAU,
                 lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC,
                 random_seed=0):
        """
            Initialize an Agent object.
        :param state_size: size of state
        :param action_size: size of action
        :param num_agents: number of agents
        :param gamma: discount factor
        :param tau: factor for soft update of target parameters
        :param lr_actor: Learning rate of actor
        :param lr_critic: Learning rate of critic
        :param random_seed: Random seed
        :param device: cuda or cpu
        """

        self.device = device
        self.gamma = gamma
        self.tau = tau

        self.num_agents = num_agents

        self.state_size = state_size
        self.action_size = action_size
        self.full_state_size = state_size * num_agents
        self.full_action_size = action_size * num_agents
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, device,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, device,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.full_state_size,
                                   self.full_action_size,
                                   device=device,
                                   random_seed=random_seed).to(device)
        self.critic_target = Critic(self.full_state_size,
                                    self.full_action_size,
                                    device=device,
                                    random_seed=random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=0)

        self.noise = OUNoise(action_size, random_seed)
Example no. 13
    def __init__(self, device, state_size, n_agents, action_size, random_seed, \
                         buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic, weight_decay,  \
                         learn_interval, learn_num, ou_sigma, ou_theta, checkpoint_folder = './'):

        # Set Computational device
        self.DEVICE = device

        # Init State, action and agent dimensions
        self.state_size = state_size
        self.n_agents = n_agents
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.l_step = 0
        self.log_interval = 200

        # Init Hyperparameters
        self.BUFFER_SIZE = buffer_size
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = TAU
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.WEIGHT_DECAY = weight_decay
        self.LEARN_INTERVAL = learn_interval
        self.LEARN_NUM = learn_num

        # Init Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Init Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Init Noise Process
        self.noise = OUNoise((n_agents, action_size),
                             random_seed,
                             mu=0.,
                             theta=ou_theta,
                             sigma=ou_sigma)

        # Init Replay Memory
        self.memory = ReplayBuffer(device, action_size, buffer_size,
                                   batch_size, random_seed)
Example no. 14
 def test_random_action():
     env = gym.make('gym_kinova_gripper:kinovagripper-v0')
     obs, done = env.reset(), False
     noise = OUNoise(3)
     max_action = float(env.action_space.high[0])
     correct = 0
     noise.reset()
     cum_reward = 0.0
     for i in range(100):
         finger_actions = noise.noise().clip(-max_action, max_action)
         # actions = np.array([0.0, finger_actions[0], finger_actions[1], finger_actions[2]])
         actions = np.array([0.4, 0.5, 0.5, 0.5])
         obs, reward, done, _ = env.step(actions)
         inputs = torch.FloatTensor(np.array(obs)).to(device)
Example no. 15
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 random_seed=0,
                 params=params):
        """
        Initialize an Agent object.
        Params
        ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.params = params

        # Actor (Policy) Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(self.params['DEVICE'])
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(self.params['DEVICE'])
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.params['LR_ACTOR'])

        # Critic (Value) Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(self.params['DEVICE'])
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(self.params['DEVICE'])
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.params['LR_CRITIC'],
            weight_decay=self.params['WEIGHT_DECAY'])

        # Initialize target and local to same weights
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.params['BUFFER_SIZE'],
                                   self.params['BATCH_SIZE'], random_seed)
Example no. 16
def main():
    ddpg = DDPG(GAMMA, TAU, torch.cuda.is_available())
    memory = ReplayMemory(REPLAY_SIZE)
    env.init_state()

    if os.path.exists('models/ddpg_actor_'):
        ddpg.load_model()

    updates = 0
    for i_episode in range(NUM_EPISODES):
        while True:
            ounoise = OUNoise(1,
                              scale=NOISE_SCALE -
                              NOISE_SCALE // NUM_EPISODES * i_episode)
            action = ddpg.select_action(env.state, ounoise)
            transition = env.step(action)
            memory.push(transition)

            if len(memory) > BATCH_SIZE:
                for _ in range(UPDATES_PER_STEP):
                    transitions = memory.sample(BATCH_SIZE)
                    random.shuffle(transitions)

                    batch = Transition(*zip(*transitions))
                    value_loss, policy_loss = ddpg.update_parameters(batch)

                    print(
                        "Episode: {}, Updates: {}, Value Loss: {}, Policy Loss: {}"
                        .format(i_episode, updates, value_loss, policy_loss))
                    updates += 1

                break

        if (i_episode + 1) % 100 == 0:
            ddpg.save_model()
Example no. 17
    def __init__(self, env):
        self.env = env
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.shape[0]
        self.action_low = self.env.action_space.low[0]
        self.action_high = self.env.action_space.high[0]

        # Learning rates
        self.actor_learning_rate = 1e-4
        self.critic_learning_rate = 1e-3

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_learning_rate)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_learning_rate)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.critic_learning_rate)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.critic_learning_rate)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.1
        self.exploration_sigma = 0.1
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters
Example no. 18
    def __init__(self, task):
        self.task = task
        self.session = K.get_session()
        init = tf.global_variables_initializer()
        self.session.run(init)
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.score = -math.inf
        self.best_score = -math.inf
        self.last_loss = math.inf

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        self.noise_scale = (self.exploration_mu, self.exploration_theta,
                            self.exploration_sigma)
        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 16
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters
Example no. 19
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_high = task.action_high
        self.action_low = task.action_low

        # actor policy model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_high, self.action_low)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_high, self.action_low)

        # critic value model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.25
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # replay buffer
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # algorithm parameters
        self.gamma = 0.9  # discount rate
        self.tau = 0.1  # soft update parameter

        self.total_reward = 0
        self.count = 0
        self.score = 0
        self.best_score = -np.inf

        self.reset_episode()
Example no. 20
    def __init__(self, state_size, action_size, random_seed):
        """ Creates a new DDPG agent initilizing the networks """
        self.state_size = state_size
        self.action_size = action_size

        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

        self.critic = Critic(state_size, action_size, 17).to(device)
        self.critic_target = Critic(state_size, action_size, 17).to(device)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        self.actor = Actor(state_size, action_size, 17).to(device)
        self.actor_target = Actor(state_size, action_size, 17).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=LR_ACTOR)

        self.seed = random.seed(random_seed)
        # Noise process
        self.noise = OUNoise(action_size, random_seed)
Example no. 21
def main():
    my_env = env()

    agent = NAF_CNN(0.99, 0.001, 128, my_env.observation_space.shape[0],
                    my_env.action_space)

    parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
    parser.add_argument('--noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='initial noise scale (default: 0.3)')
    parser.add_argument('--final_noise_scale',
                        type=float,
                        default=0.3,
                        metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end',
                        type=int,
                        default=100,
                        metavar='N',
                        help='number of episodes with noise (default: 100)')
    args = parser.parse_args()

    ounoise = OUNoise(my_env.action_space.shape[0])
    ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
        0, args.exploration_end -
        1) / args.exploration_end + args.final_noise_scale
    ounoise.reset()

    state = my_env.reset()
    i = 10
    while i > 0:
        action = agent.select_action(state, ounoise)
        print("action: {}".format(action))
        next_state, reward, done = my_env.step(action)
        if done:
            break
        print(reward)
        i = i - 1
Example no. 22
    def __init__(self, task, sess):
        self.sess = sess
        self.env = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_lr = 0.0001
        self.tau = 0.001
        self.minibatch_size = 64
        self.critic_lr = 0.001
        self.gamma = 0.99
        self.buffer_size = 1000000
        self.random_seed = 1234
        self.summary_dir = "/"
        #self.max_episode = 100
        #self.max_episode_len = 100
        self.mu = 0

        self.actor = ActorNetwork(self.sess, self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_lr, self.tau, self.minibatch_size)

        self.critic = CriticNetwork(self.sess, self.state_size,
                                    self.action_size, self.critic_lr, self.tau,
                                    self.gamma,
                                    self.actor.get_num_trainable_vars())

        # Initialize replay memory
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.random_seed)
        self.sess.run(tf.global_variables_initializer())
        self.actor.update_target_network()
        self.critic.update_target_network()

        self.noise = OUNoise(self.action_size, self.mu)

        self.sess.run(tf.global_variables_initializer())
Example no. 23
    def __init__(self, state_space, action_space, max_action, device):
        self.state_size = state_space.shape[0]
        self.action_size = action_space.shape[0]
        self.max_action = max_action
        self.device = device
        self.actor_local = Actor(state_space.shape, action_space.high.size,
                                 max_action)
        self.actor_target = Actor(state_space.shape, action_space.high.size,
                                  max_action)
        self.actor_optimizer = optimizers.Adam(LR_ACTOR)
        # let target be equal to local
        self.actor_target.set_weights(self.actor_local.get_weights())

        self.critic_local = Critic(state_space.shape, action_space.high.size)
        self.critic_target = Critic(state_space.shape, action_space.high.size)
        self.critic_optimizer = optimizers.Adam(LR_CRITIC)
        # let target be equal to local
        self.critic_target.set_weights(self.critic_local.get_weights())

        self.noise = OUNoise(self.action_size)
        self.memory = ReplayBuffer(BUFFER_SIZE)

        self.current_steps = 0
Example no. 24
 def __init__(self, args, env):
     self.args = args
     self.env = env
     # get the number of inputs...
     num_inputs = self.env.observation_space.shape[0]
     num_actions = self.env.action_space.shape[0]
     self.action_scale = self.env.action_space.high[0]
     # build up the network
     self.actor_net = Actor(num_inputs, num_actions)
     self.critic_net = Critic(num_inputs, num_actions)
     # get the target network...
     self.actor_target_net = Actor(num_inputs, num_actions)
     self.critic_target_net = Critic(num_inputs, num_actions)
     if self.args.cuda:
         self.actor_net.cuda()
         self.critic_net.cuda()
         self.actor_target_net.cuda()
         self.critic_target_net.cuda()
     # copy the parameters..
     self.actor_target_net.load_state_dict(self.actor_net.state_dict())
     self.critic_target_net.load_state_dict(self.critic_net.state_dict())
     # setup the optimizer...
     self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(),
                                             lr=self.args.actor_lr)
     self.optimizer_critic = torch.optim.Adam(
         self.critic_net.parameters(),
         lr=self.args.critic_lr,
         weight_decay=self.args.critic_l2_reg)
     # setting up the noise
     self.ou_noise = OUNoise(num_actions)
     # check some dir
     if not os.path.exists(self.args.save_dir):
         os.mkdir(self.args.save_dir)
     self.model_path = self.args.save_dir + self.args.env_name + '/'
     if not os.path.exists(self.model_path):
         os.mkdir(self.model_path)
Example no. 25
    def __init__(self, fd, cfg, memory, explore=True):
        threading.Thread.__init__(self)
        self.fd = fd
        self.cfg = cfg
        self.memory = memory
        self.explore = explore
        self.agent = torch.load(cfg.get('nafcnn', 'agent'))
        self.ounoise = OUNoise(action_dimension=1)
        mpsched.persist_state(fd)

        self.env = Env(fd=self.fd,
                       time=self.cfg.getfloat('env', 'time'),
                       k=self.cfg.getint('env', 'k'),
                       alpha=self.cfg.getfloat('env', 'alpha'),
                       b=self.cfg.getfloat('env', 'b'),
                       c=self.cfg.getfloat('env', 'c'))
Example no. 26
    def __init__(self,
                 env=gym.make('Pendulum-v0'),
                 s_dim=2,
                 a_dim=1,
                 gamma=0.99,
                 episodes=100,
                 tau=0.001,
                 buffer_size=1e06,
                 minibatch_size=64,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 save_name='final_weights',
                 render=False):
        self.save_name = save_name
        self.render = render
        self.env = env
        self.upper_bound = env.action_space.high[0]
        self.lower_bound = env.action_space.low[0]
        self.EPISODES = episodes
        self.MAX_TIME_STEPS = 200
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.GAMMA = gamma
        self.TAU = tau
        self.buffer_size = buffer_size
        self.minibatch_size = minibatch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        self.ou_noise = OUNoise(mean=np.zeros(1))

        self.actor = Actor(self.s_dim, self.a_dim).model()
        self.target_actor = Actor(self.s_dim, self.a_dim).model()
        self.actor_opt = tf.keras.optimizers.Adam(learning_rate=self.actor_lr)
        self.target_actor.set_weights(self.actor.get_weights())

        self.critic = Critic(self.s_dim, self.a_dim).model()
        self.critic_opt = tf.keras.optimizers.Adam(
            learning_rate=self.critic_lr)
        self.target_critic = Critic(self.s_dim, self.a_dim).model()
        self.target_critic.set_weights(self.critic.get_weights())

        self.replay_buffer = ReplayBuffer(self.buffer_size)
Example no. 27
class DDPG():
    def __init__(self, task, sess):
        self.sess = sess
        self.env = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_lr = 0.0001
        self.tau = 0.001
        self.minibatch_size = 64
        self.critic_lr = 0.001
        self.gamma = 0.99
        self.buffer_size = 1000000
        self.random_seed = 1234
        self.summary_dir = "/"
        #self.max_episode = 100
        #self.max_episode_len = 100
        self.mu = 0

        self.actor = ActorNetwork(self.sess, self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_lr, self.tau, self.minibatch_size)

        self.critic = CriticNetwork(self.sess, self.state_size,
                                    self.action_size, self.critic_lr, self.tau,
                                    self.gamma,
                                    self.actor.get_num_trainable_vars())

        # Initialize replay memory
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.random_seed)
        self.sess.run(tf.global_variables_initializer())
        self.actor.update_target_network()
        self.critic.update_target_network()

        self.noise = OUNoise(self.action_size, self.mu)

        self.sess.run(tf.global_variables_initializer())

    def reset_episode(self):
        #self.actor_noise.reset()
        state = self.env.reset()
        self.last_state = state
        self.ep_ave_max_q = 0
        self.ep_reward = 0
        return state

    def step(self, s, a, r, terminal, s2):
        # Save experience / reward
        #self.memory.add(self.last_state, action, reward, next_state, done)
        #summary_ops, summary_vars = self.build_summaries()
        self.replay_buffer.add(np.reshape(s, (self.actor.s_dim, )),
                               np.reshape(a, (self.actor.a_dim, )), r,
                               terminal, np.reshape(s2, (self.actor.s_dim, )))
        # Learn, if enough samples are available in memory
        if self.replay_buffer.size() > self.minibatch_size:

            s_batch, a_batch, r_batch, t_batch, s2_batch = self.replay_buffer.sample_batch(
                self.minibatch_size)
            #self.train(s_batch, a_batch, r_batch, t_batch, s2_batch)
            target_q = self.critic.predict_target(
                s2_batch, self.actor.predict_target(s2_batch))

            y_i = []
            for k in range(self.minibatch_size):
                if t_batch[k]:
                    y_i.append(r_batch[k])
                else:
                    y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

            # Update the critic given the targets
            predicted_q_value, _ = self.critic.train(
                s_batch, a_batch, np.reshape(y_i, (self.minibatch_size, 1)))

            #self.ep_ave_max_q += np.amax(predicted_q_value)

            # Update the actor policy using the sampled gradient
            a_outs = self.actor.predict(s_batch)
            grads = self.critic.action_gradients(s_batch, a_outs)
            self.actor.train(s_batch, grads[0])

            # Update target networks
            self.actor.update_target_network()
            self.critic.update_target_network()

        # Roll over last state and action
        self.last_state = s2
        '''
        self.ep_reward +=r
        
        if terminal:
            
            summary_str = self.sess.run(
            , feed_dict={summary_vars[0]: self.ep_reward, summary_vars[1]: self.ep_ave_max_q / float(j)})

            writer.add_summary(summary_str, i)
            #writer.flush()
            
            print('| Reward: {:d} |Qmax: {:.4f}'.format(int(self.ep_reward), \
                             (self.ep_ave_max_q / float(j))))
             '''

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])

        actions = self.actor.predict(states)[0]
        #actornoises = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.action_size))
        #print(actions)

        return actions + self.noise.sample()  # add some noise for exploration

    def train(self, s_batch, a_batch, r_batch, t_batch, s2_batch):

        target_q = self.critic.predict_target(
            s2_batch, self.actor.predict_target(s2_batch))

        y_i = []
        for k in range(self.minibatch_size):
            if t_batch[k]:
                y_i.append(r_batch[k])
            else:
                y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

        # Update the critic given the targets
        predicted_q_value, _ = self.critic.train(
            s_batch, a_batch, np.reshape(y_i, (self.minibatch_size, 1)))

        #self.ep_ave_max_q += np.amax(predicted_q_value)

        # Update the actor policy using the sampled gradient
        a_outs = self.actor.predict(s_batch)
        grads = self.critic.action_gradients(s_batch, a_outs)
        self.actor.train(s_batch, grads[0])

        # Update target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

    def build_summaries(self):
        episode_reward = tf.Variable(0.)
        tf.summary.scalar("Reward", episode_reward)
        episode_ave_max_q = tf.Variable(0.)
        tf.summary.scalar("Qmax Value", episode_ave_max_q)

        summary_vars = [episode_reward, episode_ave_max_q]
        summary_ops = tf.summary.merge_all()

        return summary_ops, summary_vars
Example no. 28
    '''
    DEFINE THE ACTOR RL AGENT
    '''
    if args.algo == "NAF":
        agent = NAF(args.gamma, args.tau, args.hidden_size,
                    env.observation_space.shape[0], env.action_space)
        print("Initialized NAF")
    else:
        agent = DDPG(args.gamma, args.tau, args.hidden_size,
                     env.observation_space.shape[0], env.action_space)
        print("Initialized DDPG actor")
    '''
    DEFINE REPLAY BUFFER AND NOISE
    '''
    memory = ReplayMemory(args.replay_size)
    ounoise = OUNoise(env.action_space.shape[0])
    '''
    #############################
    Initialize the Evolution Part
    #############################
    '''
    evo = Evo(10)
    evo.initialize_fitness()

    # TODO: MOVE THE TRAINING CODE BELOW TO ITS RESPECTIVE FUNCTIONS
    rewards = []  # during training
    rewards_test_ERL = []  # during testing ERL policy
    rewards_test_DDPG = []

    print("Number of hidden units = " + str(args.hidden_size))
    print("Batch size = " + str(args.batch_size))
Example no. 29
class Agent(object):
    def __init__(self, state_space, action_space, max_action, device):
        self.state_size = state_space.shape[0]
        self.action_size = action_space.shape[0]
        self.max_action = max_action
        self.device = device
        self.actor_local = Actor(state_space.shape, action_space.high.size,
                                 max_action)
        self.actor_target = Actor(state_space.shape, action_space.high.size,
                                  max_action)
        self.actor_optimizer = optimizers.Adam(LR_ACTOR)
        # let target be equal to local
        self.actor_target.set_weights(self.actor_local.get_weights())

        self.critic_local = Critic(state_space.shape, action_space.high.size)
        self.critic_target = Critic(state_space.shape, action_space.high.size)
        self.critic_optimizer = optimizers.Adam(LR_CRITIC)
        # let target be equal to local
        self.critic_target.set_weights(self.critic_local.get_weights())

        self.noise = OUNoise(self.action_size)
        self.memory = ReplayBuffer(BUFFER_SIZE)

        self.current_steps = 0

    def step(self,
             state,
             action,
             reward,
             done,
             next_state,
             train=True) -> None:
        self.memory.store(state, action, reward, done, next_state)
        if train and self.memory.count > BATCH_SIZE and self.memory.count > MIN_MEM_SIZE:
            if self.current_steps % UPDATE_STEPS == 0:
                experiences = self.memory.sample(BATCH_SIZE)
                self.learn(experiences, GAMMA)
            self.current_steps += 1

    @tf.function
    def critic_train(self, states, actions, rewards, dones, next_states):
        with tf.device(self.device):
            # Compute yi
            u_t = self.actor_target(next_states)
            q_t = self.critic_target([next_states, u_t])
            yi = tf.cast(rewards, dtype=tf.float64) + \
                 tf.cast(GAMMA, dtype=tf.float64) * \
                 tf.cast((1 - tf.cast(dones, dtype=tf.int64)), dtype=tf.float64) * \
                 tf.cast(q_t, dtype=tf.float64)

            # Compute MSE
            with tf.GradientTape() as tape:
                q_l = tf.cast(self.critic_local([states, actions]),
                              dtype=tf.float64)
                loss = (q_l - yi) * (q_l - yi)
                loss = tf.reduce_mean(loss)
                # Update critic by minimizing loss
                dloss_dql = tape.gradient(loss,
                                          self.critic_local.trainable_weights)
            self.critic_optimizer.apply_gradients(
                zip(dloss_dql, self.critic_local.trainable_weights))
        return

    @tf.function
    def actor_train(self, states):
        with tf.device(self.device):
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(self.actor_local.trainable_variables)
                u_l = self.actor_local(states)
                q_l = -tf.reduce_mean(self.critic_local([states, u_l]))
            j = tape.gradient(q_l, self.actor_local.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(j, self.actor_local.trainable_variables))
        return

    def learn(self, experiences, gamma) -> None:
        states, actions, rewards, dones, next_states = experiences
        states = np.array(states).reshape(BATCH_SIZE, self.state_size)
        states = tf.convert_to_tensor(states)
        actions = np.array(actions).reshape(BATCH_SIZE, self.action_size)
        actions = tf.convert_to_tensor(actions)
        rewards = np.array(rewards).reshape(BATCH_SIZE, 1)
        next_states = np.array(next_states).reshape(BATCH_SIZE,
                                                    self.state_size)
        dones = np.array(dones).reshape(BATCH_SIZE, 1)

        self.critic_train(states, actions, rewards, dones, next_states)
        self.actor_train(states)
        self.update_local()
        return

    def update_local(self):
        def soft_updates(local_model: tf.keras.Model,
                         target_model: tf.keras.Model) -> np.ndarray:
            local_weights = np.array(local_model.get_weights())
            target_weights = np.array(target_model.get_weights())

            assert len(local_weights) == len(target_weights)
            new_weights = TAU * local_weights + (1 - TAU) * target_weights
            return new_weights

        self.actor_target.set_weights(
            soft_updates(self.actor_local, self.actor_target))
        self.critic_target.set_weights(
            soft_updates(self.critic_local, self.critic_target))

    def store_weights(self, episode: int) -> None:
        self.actor_target.save_weights(
            join(CKPTS_PATH, ACTOR_CKPTS, f'cp-{episode}'))
        self.critic_target.save_weights(
            join(CKPTS_PATH, CRITIC_CKPTS, f'cp-{episode}'))
        return

    def act(self, state, add_noise=True) -> (float, float):
        state = np.array(state).reshape(1, self.state_size)
        pure_action = self.actor_local.predict(state)[0]
        action = self.noise.get_action(pure_action)
        return action, pure_action

    def reset(self):
        self.noise.reset()
Example no. 30
        'done_comparison_data': done_comparison_data,
        'scores': scores
    })

    # Actions generation
    exploration_mu = 0
    exploration_theta = 0.15
    exploration_sigma = 0.2
    action_size = 3
    action_low = np.array([1, 0, 1])
    action_high = np.array([10, 359, 2000])
    action_range = action_high - action_low

    # Start with a random action
    action = np.array([np.random.uniform() for _ in action_low])
    noise = OUNoise(action.shape[0], exploration_mu, exploration_theta,
                    exploration_sigma)

    time_limit = 10
    for i in range(10):
        start_time = time.time()
        env.reset()
        done = False
        j = 0
        while not done:
            j += 1
            ns = noise.sample()
            action = action + ns

            v_size, angle, speed = np.array(transform_action(
                action, action_range, action_low),
                                            dtype='uint8')
Example no. 31
writer = SummaryWriter()

env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)
if args.algo == "NAF":
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                      env.observation_space.shape[0], env.action_space)
else:
    agent = DDPG(args.gamma, args.tau, args.hidden_size,
                      env.observation_space.shape[0], env.action_space)

memory = ReplayMemory(args.replay_size)

ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05, 
    desired_action_stddev=args.noise_scale, adaptation_coefficient=1.05) if args.param_noise else None

rewards = []
total_numsteps = 0
updates = 0

for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])

    if args.ou_noise: 
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                      i_episode) / args.exploration_end + args.final_noise_scale
        ounoise.reset()
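All of the examples above assume an OUNoise helper, although its constructor varies between code bases: some take just the action size, others add a random seed, and others take (size, mu, theta, sigma) explicitly. For reference, here is a minimal NumPy sketch of the Ornstein-Uhlenbeck process they approximate; the signature, defaults, and the method name sample() are assumptions and should be adapted to whichever variant a snippet expects (several of them call noise(), get_action(), or set a scale attribute instead).

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, seed=None, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state back to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as noise."""
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state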