Example #1
    def __init__(
        self,
        obs_dim,
        action_dim,
        action_gain,
        actor_learning_rate=0.0001,
        critic_learning_rate=0.001,
        gamma=0.99,
        tau=0.001,
    ):
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.tau = tau

        # make main networks
        self.actor = Actor(obs_dim, action_dim, action_gain,
                           actor_learning_rate)
        self.critic = Critic(obs_dim, action_dim, critic_learning_rate)

        # make target networks
        self.target_actor = Actor(obs_dim, action_dim, action_gain)
        self.target_actor.model.set_weights(self.actor.model.get_weights())
        self.target_critic = Critic(obs_dim, action_dim)
        self.target_critic.model.set_weights(self.critic.model.get_weights())
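The tau argument stored above is typically consumed by a Polyak soft update of the target networks. A minimal sketch, assuming Keras-style models that expose get_weights/set_weights (the agent's actual update method is not shown here):

def soft_update(target_model, source_model, tau):
    # target <- tau * source + (1 - tau) * target, applied weight array by weight array
    new_weights = [tau * src + (1.0 - tau) * tgt
                   for src, tgt in zip(source_model.get_weights(),
                                       target_model.get_weights())]
    target_model.set_weights(new_weights)

# Hypothetical usage inside the agent's update step:
# soft_update(self.target_actor.model, self.actor.model, self.tau)
# soft_update(self.target_critic.model, self.critic.model, self.tau)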
Example #2
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.eps = eps_start
        self.t_step = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
                
        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
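The OUNoise class constructed above is not defined in this snippet; a common implementation of the Ornstein-Uhlenbeck exploration noise it refers to looks roughly like the sketch below (the default mu/theta/sigma values are assumptions):

import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        np.random.seed(seed)
        self.reset()

    def reset(self):
        # Return the internal state to the long-running mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): drift toward mu plus Gaussian noise.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state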
Example #3
    def __init__(self, state_size, action_size, random_seed, num_agents):
        """Initialize an Agent object.
         """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, sigma=0.1)

        # Replay buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        self.num_agents = num_agents
Example #4
    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPS

        #--- actor -----#

        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=1e-3)

        #---- critic -----#

        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=1e-3,
                                           weight_decay=0)

        self.noise = OUNoise(action_size, random_seed)

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
Example #5
    def __init__(
        self,
        state_size=24,
        action_size=2,
        BATCH_SIZE=128,
        BUFFER_SIZE=int(1e6),
        discount_factor=1,
        tau=1e-2,
        noise_coefficient_start=5,
        noise_coefficient_decay=0.99,
        LR_ACTOR=1e-3,
        LR_CRITIC=1e-3,
        WEIGHT_DECAY=1e-3,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")):
        """
			state_size (int): dimension of each state
			action_size (int): dimension of each action
			BATCH_SIZE (int): mini batch size
			BUFFER_SIZE (int): experience storing lenght, keep it as high as possible
			discount_factor (float): discount factor for calculating Q_target
			tau (float): interpolation parameter for updating target network
			noise_coefficient_start (float): value to be multiplied to OUNoise sample
			noise_coefficient_decay (float): exponential decay factor for value to be multiplied to OUNoise sample
			LR_ACTOR (float): learning rate for actor network
			LR_CRITIC (float): learning rate for critic network
			WEIGHT_DECAY (float): Weight decay for critic network optimizer
			device : "cuda:0" if torch.cuda.is_available() else "cpu"
		"""

        self.state_size = state_size
        print(device)
        self.action_size = action_size
        self.BATCH_SIZE = BATCH_SIZE
        self.BUFFER_SIZE = BUFFER_SIZE
        self.discount_factor = discount_factor
        self.tau = tau
        self.noise_coefficient = noise_coefficient_start
        self.noise_coefficient_decay = noise_coefficient_decay
        self.steps_completed = 0
        self.device = device
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(self.device)
        self.actor_target = Actor(state_size, action_size).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size).to(self.device)
        self.critic_target = Critic(state_size, action_size).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((1, action_size))

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE,
                                   self.BATCH_SIZE)
Example #6
    def __init__(self,
                 env: gym.Env,
                 memory_size: int,
                 batch_size: int,
                 ou_noise_theta: float,
                 ou_noise_sigma: float,
                 gamma: float = 0.99,
                 tau: float = 5e-3,
                 initial_random_episode: int = 1e4,
                 name_cases='myproject'):
        """ Initialize. """

        # Logger
        self.wandb = wandb.init(project=name_cases)

        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]

        self.env = env
        self.memory = ReplayBuffer(memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.initial_random_episode = initial_random_episode

        # noise
        self.noise = OUNoise(
            action_dim,
            theta=ou_noise_theta,
            sigma=ou_noise_sigma,
        )

        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        print(self.device)

        # networks
        self.actor = Actor(obs_dim, action_dim).to(self.device)
        self.actor_target = Actor(obs_dim, action_dim).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())

        self.critic = Critic(obs_dim + action_dim).to(self.device)
        self.critic_target = Critic(obs_dim + action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # optimizer
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=3e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        # transition to store in memory
        self.transition = list()

        # total steps count
        self.total_step = 0
        # mode: train / test
        self.is_test = False
        self.populate(self.initial_random_episode)
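The populate(self.initial_random_episode) call above is not defined in the snippet. A plausible sketch, under the assumptions that the project's ReplayBuffer exposes a store method and that the argument counts random environment steps:

    def populate(self, steps: int):
        """Warm up the replay buffer with transitions from a uniform random policy."""
        state = self.env.reset()
        for _ in range(int(steps)):
            action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(action)
            self.memory.store(state, action, reward, next_state, done)  # assumed ReplayBuffer API
            state = self.env.reset() if done else next_state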
Example #7
    def __init__(self,
                 state_size,
                 action_size,
                 action_sigma=0.1,
                 memory_size=1000000,
                 batch=128,
                 sigma=0.2,
                 noise_clip=0.5,
                 gamma=0.99,
                 update_frequency=2,
                 seed=0):
        '''
        TD3 Agent
        :param state_size: State Dimension
        :param action_size: Action dimension
        :param action_sigma: standard deviation of the noise to be added to the action
        :param memory_size:
        :param batch:
        :param sigma: Standard deviation of the noise to be added to the target function (Chapter 5.3 of TD3 Paper)
        :param noise_clip: How much noise to allow
        :param gamma:
        :param update_frequency:
        :param seed:
        '''

        self.state_size = state_size
        self.action_size = action_size

        self.action_sigma = action_sigma
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.update_frequency = update_frequency
        self.seed = seed

        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic0 = Critic(self.state_size, self.action_size).to(device)
        # second Critic as described in the TD3 paper
        # https://arxiv.org/pdf/1802.09477.pdf
        self.critic1 = Critic(self.state_size, self.action_size).to(device)

        self.target_actor = Actor(self.state_size, self.action_size).to(device)
        self.target_critic0 = Critic(self.state_size,
                                     self.action_size).to(device)
        # second Critic as described in the paper
        # https://arxiv.org/pdf/1802.09477.pdf
        self.target_critic1 = Critic(self.state_size,
                                     self.action_size).to(device)

        self.memory = ReplayBuffer(memory_size, batch, seed=seed)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

        self.soft_update(self.actor, self.target_actor, 1)
        self.soft_update(self.critic0, self.target_critic0, 1)
        self.soft_update(self.critic1, self.target_critic1, 1)
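The sigma and noise_clip hyperparameters above correspond to TD3's target-policy smoothing. A minimal sketch of that step, assuming actions are bounded in [-1, 1] (an assumption; the agent's actual learning method is not shown here):

import torch

def smoothed_target_action(target_actor, next_states, sigma, noise_clip,
                           act_low=-1.0, act_high=1.0):
    # Add clipped Gaussian noise to the target policy's action (TD3 paper, Section 5.3).
    with torch.no_grad():
        next_action = target_actor(next_states)
        noise = (torch.randn_like(next_action) * sigma).clamp(-noise_clip, noise_clip)
        return (next_action + noise).clamp(act_low, act_high)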
Example #8
    def create_actor(self, alpha, hidden_layers):
        params = {
            'input_shape':      self.env.observation_space.shape,
            'output_shape':     self.env.action_space.shape,
            'hidden_layers':    hidden_layers
        }
        self.actor = OpenStruct()
        self.actor.online = Actor("{}.actor.online".format(self.name), **params)
        self.actor.target = Actor("{}.actor.target".format(self.name), **params)
Example #9
    def __init__(self, n, state_size, action_size, random_seed, params):
        """Initialize an Agent object.
        
        Params
        ======
            n (int): number of agents in env
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            params (dict): dictionary with hyperparameters name-value pairs
        """
        self.n = n
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.BUFFER_SIZE = params["BUFFER_SIZE"]
        self.BATCH_SIZE = params["BATCH_SIZE"]
        self.GAMMA = params["GAMMA"]
        self.TAU = params["TAU"]
        self.LR_ACTOR = params["LR_ACTOR"]
        self.LR_CRITIC = params["LR_CRITIC"]
        self.WEIGHT_DECAY = params["WEIGHT_DECAY"]
        self.N_UPDATES = params["N_UPDATES"]
        self.UPDATE_STEP = params["UPDATE_STEP"]

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.LR_CRITIC,
                                           weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(self.n, action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE,
                                   self.BATCH_SIZE, random_seed)

        #Count timesteps
        self.timestep = 0
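For illustration, a hypothetical params dictionary matching the keys read above (all values are placeholders, and the class name Agent is assumed):

params = {
    "BUFFER_SIZE": int(1e6),
    "BATCH_SIZE": 128,
    "GAMMA": 0.99,
    "TAU": 1e-3,
    "LR_ACTOR": 1e-4,
    "LR_CRITIC": 1e-3,
    "WEIGHT_DECAY": 0.0,
    "N_UPDATES": 10,
    "UPDATE_STEP": 20,
}
agent = Agent(n=2, state_size=24, action_size=2, random_seed=0, params=params)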
Example #10
    def __init__(self,
                 init_pose=None,
                 init_velocities=None,
                 init_angle_velocities=None,
                 runtime=5.,
                 target_pos=None,
                 buffer_size=150000,
                 batch_size=32,
                 gamma=0.99,
                 replay_alpha=0.5,
                 beta_limit=10000):

        self.task = Task(init_pose, init_velocities, init_angle_velocities,
                         runtime, target_pos)

        self.state_size = self.task.state_size
        self.action_size = self.task.action_size

        self.state = self.task.reset()

        self.memory = PrioritizedReplay(buffer_size, batch_size, replay_alpha,
                                        beta_limit)

        self.actor = Actor(self.state_size, self.action_size,
                           self.task.action_low, self.task.action_high)
        self.actor_weights = self.actor.model.trainable_weights
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.task.action_low, self.task.action_high)

        self.critic = Critic(self.state_size, self.action_size)
        self.critic_weights = self.critic.model.trainable_weights
        self.critic_target = Critic(self.state_size, self.action_size)

        self.gamma = gamma

        # how much influence older weights have when updating target
        self.tau = 0.03

        #noise
        # GENTLE LANDING
        #self.mu = 0
        #self.theta = 0.1
        #self.sigma = 25
        self.mu = 0
        self.theta = 0.1
        self.sigma = 9
        self.noise = Noise(self.action_size, self.mu, self.theta, self.sigma)

        self.episodes = 0
        self.training_step = 0
Example #11
    def __init__(self,
                 env,
                 act_dim,
                 state_dim,
                 goal_dim,
                 act_range,
                 buffer_size=int(1e6),
                 gamma=0.98,
                 lr=0.001,
                 tau=0.95):
        """ Initialization
        """
        # Environment and A2C parameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.env_dim = state_dim + goal_dim
        self.gamma = gamma
        self.lr = lr
        self.tau = tau
        self.env = env

        # Create actor and critic networks
        self.actor_network = Actor(self.env_dim, act_dim, act_range)
        self.actor_target_network = Actor(self.env_dim, act_dim, act_range)
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())

        self.critic_network = Critic(self.env_dim, act_dim, act_range)
        self.critic_target_network = Critic(self.env_dim, act_dim, act_range)
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())

        sync_networks(self.actor_network)
        sync_networks(self.critic_network)

        # Optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=lr)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=lr)

        # Replay buffer
        # self.buffer = MemoryBuffer(buffer_size)
        self.buffer = ReplayMemory(buffer_size)

        # Normalizers
        self.goal_normalizer = Normalizer(
            goal_dim, default_clip_range=5)  # Clip between [-5, 5]
        self.state_normalizer = Normalizer(state_dim, default_clip_range=5)
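The Normalizer used here is not shown; one common HER-style implementation keeps running statistics and clips the normalized output. A sketch under that assumption (not necessarily this project's version):

import numpy as np

class Normalizer:
    """Running mean/std normalizer that clips the normalized values."""

    def __init__(self, size, eps=1e-2, default_clip_range=5.0):
        self.size = size
        self.eps = eps
        self.default_clip_range = default_clip_range
        self.sum = np.zeros(size)
        self.sumsq = np.zeros(size)
        self.count = 0

    def update(self, v):
        # Accumulate statistics from a batch of vectors shaped (n, size).
        v = np.asarray(v).reshape(-1, self.size)
        self.sum += v.sum(axis=0)
        self.sumsq += np.square(v).sum(axis=0)
        self.count += v.shape[0]

    def normalize(self, v, clip_range=None):
        clip_range = self.default_clip_range if clip_range is None else clip_range
        mean = self.sum / max(self.count, 1)
        std = np.sqrt(np.maximum(self.sumsq / max(self.count, 1) - np.square(mean),
                                 np.square(self.eps)))
        return np.clip((np.asarray(v) - mean) / std, -clip_range, clip_range)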
Example #12
    def __init__(self, env, time_steps, hidden_dim):
        self.name = 'DDPG'  # name for uploading results
        self.scale = env.asset
        self.unit = env.unit
        self.seed = env.rd_seed

        self.time_dim = time_steps
        self.state_dim = env.observation_space.shape[1]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = 64
        self.memory_size = self.time_dim + self.batch_size * 10
        self.start_size = self.time_dim + self.batch_size * 2

        # Initialise actor & critic networks
        self.actor_network = Actor(self.time_dim, self.state_dim,
                                   self.action_dim, hidden_dim)
        self.critic_network = Critic(self.time_dim, self.state_dim,
                                     self.action_dim, hidden_dim)

        # Initialize replay buffer
        self.replay_state = torch.zeros(
            (self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_next_state = torch.zeros(
            (self.start_size - 1, 3, self.state_dim), device=cuda)
        self.replay_action = torch.zeros(
            (self.start_size - 1, 1, self.state_dim), device=cuda)
        self.replay_reward = torch.zeros((self.start_size - 1, ), device=cuda)

        # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
        self.exploration_noise = OUNoise(self.action_dim,
                                         sigma=0.01 / self.action_dim)
        self.initial()
Example #13
    def test_actor(self):
        Actor_obj = Actor(1, 16, 4)
        Critic_obj = Critic(4, 16, 1)
        # actor_optimizer = optim.SGD(Actor_obj.parameters(), lr=0.1, momentum=0.5)

        # Forward Propagation
        y = Actor_obj.forward(torch.FloatTensor([1]))
        self.assertTrue(len(y) == 4)
Example #14
    def test_critic(self):
        Actor_obj = Actor(1, 16, 4)
        Critic_obj = Critic(4, 16, 1)
        # critic_optimizer = optim.SGD(Critic_obj.parameters(), lr=C_learning_rate)

        y = Actor_obj.forward(torch.FloatTensor([1]))
        # Forward Propagation
        y_pred = Critic_obj.forward(y)
        self.assertTrue(len(y_pred) == 1)
Example #15
    def __init__(self,
                 env=gym.make('Pendulum-v0'),
                 s_dim=2,
                 a_dim=1,
                 gamma=0.99,
                 episodes=100,
                 tau=0.001,
                 buffer_size=1e06,
                 minibatch_size=64,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 save_name='final_weights',
                 render=False):
        self.save_name = save_name
        self.render = render
        self.env = env
        self.upper_bound = env.action_space.high[0]
        self.lower_bound = env.action_space.low[0]
        self.EPISODES = episodes
        self.MAX_TIME_STEPS = 200
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.GAMMA = gamma
        self.TAU = tau
        self.buffer_size = buffer_size
        self.minibatch_size = minibatch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        self.ou_noise = OUNoise(mean=np.zeros(1))

        self.actor = Actor(self.s_dim, self.a_dim).model()
        self.target_actor = Actor(self.s_dim, self.a_dim).model()
        self.actor_opt = tf.keras.optimizers.Adam(learning_rate=self.actor_lr)
        self.target_actor.set_weights(self.actor.get_weights())

        self.critic = Critic(self.s_dim, self.a_dim).model()
        self.critic_opt = tf.keras.optimizers.Adam(
            learning_rate=self.critic_lr)
        self.target_critic = Critic(self.s_dim, self.a_dim).model()
        self.target_critic.set_weights(self.critic.get_weights())

        self.replay_buffer = ReplayBuffer(self.buffer_size)
Example #16
    def __init__(self,
                 n_state,
                 n_action,
                 a_limit,
                 model_folder=None,
                 memory_size=10000,
                 batch_size=32,
                 tau=0.01,
                 gamma=0.99,
                 var=3.0):
        # Record the parameters
        self.n_state = n_state
        self.n_action = n_action
        self.a_limit = a_limit
        self.memory_size = memory_size
        self.model_folder = model_folder
        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.var = var

        # Create the network and related objects
        self.memory = np.zeros(
            [self.memory_size, 2 * self.n_state + self.n_action + 1],
            dtype=np.float32)
        self.memory_counter = 0
        self.eval_actor = Actor(self.n_state, self.n_action, self.a_limit)
        self.eval_critic = Critic(self.n_state, self.n_action)
        self.target_actor = Actor(self.n_state,
                                  self.n_action,
                                  self.a_limit,
                                  trainable=False)
        self.target_critic = Critic(self.n_state,
                                    self.n_action,
                                    trainable=False)

        self.actor_optimizer = Adam(self.eval_actor.parameters(), lr=0.001)
        self.critic_optimizer = Adam(self.eval_critic.parameters(), lr=0.002)
        self.criterion = nn.MSELoss()

        # Make sure the parameter of target network is the same as evaluate network
        self.hardCopy()
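The flat numpy replay memory above stores [state, action, reward, next_state] rows of width 2 * n_state + n_action + 1. A sketch of the store method such a layout implies (the method name and exact packing are assumptions):

    def store_transition(self, state, action, reward, next_state):
        # Pack one transition into a flat row and overwrite the oldest slot when full.
        transition = np.hstack((state, action, [reward], next_state))
        index = self.memory_counter % self.memory_size  # ring-buffer index
        self.memory[index, :] = transition
        self.memory_counter += 1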
Example #17
    def __init__(self,
                 env,
                 memory_size=1000000,
                 batch=128,
                 sigma=0.2,
                 noise_clip=0.5,
                 gamma=0.99,
                 update_frequency=2):

        self.states = env.observation_space
        self.state_size = env.observation_space.shape[0]
        self.actions = env.action_space
        self.action_size = env.action_space.shape[0]
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.update_frequency = update_frequency

        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic0 = Critic(self.state_size, self.action_size).to(device)
        self.critic1 = Critic(self.state_size, self.action_size).to(device)

        self.target_actor = Actor(self.state_size, self.action_size).to(device)
        self.target_critic0 = Critic(self.state_size,
                                     self.action_size).to(device)
        self.target_critic1 = Critic(self.state_size,
                                     self.action_size).to(device)

        self.memory = ReplayBuffer(memory_size, batch)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

        self.soft_update(self.actor, self.target_actor, 1)
        self.soft_update(self.critic0, self.target_critic0, 1)
        self.soft_update(self.critic1, self.target_critic1, 1)
Example #18
def MountainCar():
    env = gym.make('MountainCar-v0')
    env = env.unwrapped
    env.reset()
    env.render()

    n_features = env.observation_space.shape[0]
    n_actions = env.action_space.n

    sess = tf.Session()

    actor = Actor(sess, n_features, n_actions, lr=LR_A)
    critic = Critic(sess, n_features, lr=LR_C)
    sess.run(tf.global_variables_initializer())

    game = Game(env, actor, critic)
    game.run_mountain_car()
Example #19
def main(args):
    with tf.device(args['device']):

        # tf
        tf.set_random_seed(args['rand_seed'])
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

        # env
        env = gym.make('TestEnv-v0')
        env.seed(args['rand_seed'])
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]
        concat_dim = 2
        batched_s_dim = [None, s_dim, concat_dim]
        batched_a_dim = [None, a_dim]

        # agents
        actor = Actor(sess, args['actor_lr'], args['tau'], args['batch_size'],
                      args['clip_val'], batched_s_dim, batched_a_dim)
        critic = Critic(sess, args['critic_lr'], args['tau'], args['clip_val'],
                        batched_s_dim, batched_a_dim)

        # experience
        exp = Experience(args['buffer_size'], args['batch_size'],
                         args['rand_seed'])

        # noise
        actor_noise = ActorNoise(actor.predict,
                                 a_dim,
                                 noise_type=args['noise_type'])

        # initialize
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = Model(sess, args['restore_path'])
        saver.restore_model()

        # training
        her = HER(saver, exp, env, actor, critic, actor_noise)
        if args['mode'] == 'train':
            her.train(args['gamma'], args['her_k'], args['max_episodes'],
                      args['max_episode_len'], args['replay_len'])
        else:
            her.play(args['max_episodes'], args['max_episode_len'])
Example #20
    def __init__(self, env):
        LR_A = 0.001  # learning rate for actor
        LR_C = 0.01  # learning rate for critic
        num_features = env.observation_space.shape[0]
        # num_features = 14
        num_actions = env.action_space.shape[0]

        self.action_space = env.action_space

        sess = tf.Session()
        self.actor = Actor(
            sess,
            n_features=num_features,
            action_bound=[env.action_space.low[0], env.action_space.high[0]],
            lr=LR_A)
        self.critic = Critic(
            sess, n_features=num_features, lr=LR_C
        )  # we need a good teacher, so the teacher should learn faster than the actor
        sess.run(tf.global_variables_initializer())
Example #21
def CartPoleAC():
    env1 = gym.make('CartPole-v0')
    # env2 = gym.make('CartPole-v0')
    env1.seed(10)
    # env2.seed(2)
    env1 = env1.unwrapped
    env1.reset()
    # env2 = env2.unwrapped
    # env2.reset()

    n_features = env1.observation_space.shape[0]
    n_actions = env1.action_space.n

    sess = tf.Session()

    actor = Actor(sess, n_features, n_actions, lr=LR_A)
    critic = Critic(sess, n_features, lr=LR_C)
    sess.run(tf.global_variables_initializer())

    g = Game(env1, actor, critic)
    g.run()
Example #22
import tensorflow as tf
from actor_critic import Actor,Critic

env = gym.make('CartPole-v0')
env.seed(1)     
env = env.unwrapped

N_S = env.observation_space.shape[0]
N_A = env.action_space.n 

DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater than this threshold
RENDER = False # rendering wastes time

sess = tf.Session()

actor = Actor(s_dim=N_S,a_dim=N_A,learning_rate=0.01,sess=sess)
critic = Critic(s_dim=N_S,learning_rate=0.05,reward_decay=0.9,sess=sess)

sess.run(tf.global_variables_initializer())

for i_episode in range(3000):

	s = env.reset()
	t = 0
	track_r = []

	while True:
		if RENDER:env.render()

		a = actor.choose_action(s)
		s_,r,done,info = env.step(a)
Example #23
    EPISODES = 5000
    GAMMA = 0.98
    ALPHA = 0.005
    EPSILON = 0.5
    EPSILON_DECAY = 0.1

    env = gym.make('Pendulum-v0')
    # env = Pendulum()
    a_dim = env.action_space.shape[0]
    layer_size = [32, 32]
    s_dim = env.observation_space.shape[0]
    ddpg = DDPG(env, s_dim=s_dim, a_dim=a_dim)
    # actor_trained.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mean_squared_error')
    # x, y = np.random.rand(2, 3), np.array([0.8, 0.4])
    # actor_trained.train_on_batch(x, y)
    actor_trained = Actor(s_dim, a_dim).model()
    actor_trained.load_weights('training/target_actor_weights')
    # actor_untrained = ddpg.actor
    print('hi')

    def collect_data(act_net):
        a_all, states_all = [], []
        obs = env.reset()
        for t in range(1000):
            obs = np.squeeze(obs)
            if obs.ndim == 1:
                a = act_net(obs[None, :])
            else:
                a = act_net(obs)
            obs, _, done, _ = env.step(a)
            states_all.append(obs)
Example #24
    +'_d'+str(config.lr_decay_step)+'_'+str(config.lr_decay_rate) \
    + '_T'+str(config.temperature)+ '_steps'+str(config.nb_steps)+'_i'+str(config.init_B) 

print(dir_)


######################################          TEST    #################################


config.is_training = False
config.batch_size = 500 ##### #####
#config.max_length = 50 ##### #####
config.temperature = 1.2 ##### #####

tf.reset_default_graph()
actor = Actor(config) # Build graph

variables_to_save = [v for v in tf.global_variables() if 'Adam' not in v.name] # Save & restore all the variables.
saver = tf.train.Saver(var_list=variables_to_save, keep_checkpoint_every_n_hours=1.0)   


with tf.Session() as sess:  # start session
    sess.run(tf.global_variables_initializer()) # Run initialize op
    
    save_path = "save/"+dir_
    
    
    predictions_length, predictions_length_w2opt, time_mmodel, time_l2opt = [], [], [], []
    pred_all_2opt, time_all_2opt = [], []
    for i in tqdm(range(1000)): # test instance
        seed_ = 1+i
Example #25
File: test.py Project: bencottier/DRL-TF2
def test_policy(output_dir, env_name, episodes, checkpoint_number):
    """
    Run a learned policy with visualisation in the environment.

    Args:
        output_dir: str. Directory containing a JSON file named 
            'config.json' with experiment metadata, and a subfolder
            named 'training_checkpoints' containing model checkpoints.
        env_name: str. Name of the environment to run the policy in.
        episodes: int. Number of episodes to run the policy for.
        checkpoint_number: int. Index of the checkpoint to restore.
            If None, the most recent checkpoint is used.
    """
    # Load experimental metadata from file
    with open(os.path.join(output_dir, 'config.json'), 'r') as f:
        exp_data = json.load(f)

    # Create environment
    env = gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Share information about action space with policy architecture
    ac_kwargs = dict(hidden_sizes=exp_data['ac_kwargs']['hidden_sizes'])
    ac_kwargs['action_space'] = env.action_space

    # Randomly initialise critic and actor networks
    critic = Critic(input_shape=(exp_data['batch_size'], obs_dim + act_dim),
                    **ac_kwargs)
    actor = Actor(input_shape=(exp_data['batch_size'], obs_dim), **ac_kwargs)

    # Optimizers
    critic_optimizer = tf.keras.optimizers.Adam(exp_data['q_lr'])
    actor_optimizer = tf.keras.optimizers.Adam(exp_data['pi_lr'])

    checkpoint_dir = os.path.join(output_dir, 'training_checkpoints')
    checkpoint = tf.train.Checkpoint(critic_optimizer=critic_optimizer,
                                     actor_optimizer=actor_optimizer,
                                     critic=critic,
                                     actor=actor)
    if checkpoint_number is not None:
        checkpoint.restore(
            os.path.join(checkpoint_dir,
                         f'ckpt-{checkpoint_number}')).expect_partial()
    else:
        checkpoint.restore(
            tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()

    # Run policy for specified number of episodes, recording return
    ep_rets = np.zeros(episodes)
    for i in range(episodes):
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        while not (d or (ep_len == exp_data['max_ep_len'])):
            env.render()
            o, r, d, _ = env.step(actor(o.reshape(1, -1)))
            if type(r) == np.ndarray:
                r = r[0]
            ep_ret += r
            ep_len += 1
        ep_rets[i] = ep_ret
        print(f'Episode {i}: return={ep_ret:.0f} length={ep_len}')
    # Summary stats
    print(f'avg={ep_rets.mean():.0f} std={ep_rets.std():.0f} ' \
            f'min={ep_rets.min():.0f} max={ep_rets.max():.0f}')
    env.close()
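A hypothetical invocation (the output directory and environment name below are made up for illustration):

test_policy('out/ddpg-pendulum', 'Pendulum-v0', episodes=10, checkpoint_number=None)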
Example #26
File: learn.py Project: ilosea/RLSched
from job_generator import JobGenerator
from element import Machine, Job
import plot
import tensorflow as tf
import numpy as np
from actor_critic import Actor, Critic
import os

LOG_DIR = "./log"
LOG_FILE = "log_rl"
MODEL_DIR = "./model"

if __name__ == '__main__':
    sess = tf.Session()
    pa = Parameter()
    actor = Actor(sess, pa)
    critic = Critic(sess, pa)
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter(LOG_DIR, sess.graph)
    saver = tf.train.Saver()
    logger = open(LOG_FILE, "w")  # file to record the logs

    if not os.path.exists(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    Machine.reset()
    Job.reset()
    env = Environment(pa)
    mac_gen = MacGenerator(pa)
    job_gen = JobGenerator(pa)
Example #27
    return inputs

if __name__ == '__main__':
    # load the model param
    model_path = 'saved_models/%s/%s/model.pt' % (args.env, args.her_strat)
    o_mean, o_std, g_mean, g_std, model = torch.load(model_path, map_location=lambda storage, loc: storage)
    # create the environment
    env = gym.make(args.env)
    # get the env param
    obs = env.reset()
    # get the environment params
    act_dim = env.action_space.shape[0]
    env_dim = obs['observation'].shape[0] + obs['desired_goal'].shape[0]
    act_range = env.action_space.high[0]
    # create the actor network
    actor_network = Actor(env_dim, act_dim, act_range)
    actor_network.load_state_dict(model)
    actor_network.eval()
    for i in range(DEMO_LENGHT):
        observation = env.reset()
        # start to do the demo
        obs = observation['observation']
        g = observation['desired_goal']
        for t in range(env._max_episode_steps):
            env.render()
            inputs = process_inputs(obs, g, o_mean, o_std, g_mean, g_std)
            with torch.no_grad():
                pi = actor_network(inputs)
            action = pi.detach().numpy().squeeze()
            # put actions into the environment
            observation_new, reward, done, info = env.step(action)
Example #28
    def train(self, max_episode=10, max_path_length=200, verbose=0):
        env = self.env
        avg_reward_sum = 0.

        #f_eps = open("episode.csv","w")
        #write_eps = csv.write(f_eps)

        for e in range(max_episode):
            env._reset()
            observation = env._reset()
            game_over = False
            reward_sum = 0

            inputs = []
            outputs = []
            predicteds = []
            rewards = []

            #f_iter = open("episode_{0}.csv".format(e),"w")
            #write_iter = csv.writer(f_iter)
            f_episode = "episode_{0}.csv".format(e)
            os.system("rm -rf {0}".format(f_episode))

            print(observation[0].shape, observation[1].shape)

            sess = tf.Session()

            actor = Actor(sess,
                          n_actions=self.env.action_space.n
                          # output_graph=True,
                          )

            critic = Critic(
                sess, n_actions=self.env.action_space.n
            )  # we need a good teacher, so the teacher should learn faster than the actor

            sess.run(tf.global_variables_initializer())

            while not game_over:

                action, aprob = actor.choose_action(observation)

                inputs.append(observation)
                predicteds.append(aprob)

                y = np.zeros([self.env.action_space.n])
                y[action] = 1.
                outputs.append(y)

                observation_, reward, actual_reward, game_over, info = self.env._step(
                    action)
                reward_sum += float(actual_reward)

                print(reward)
                #rewards.append(float(reward))
                rewards.append(float(reward))

                # After env.step
                td_error = critic.learn(
                    observation, reward_sum,
                    observation_)  # gradient = grad[r + gamma * V(s_) - V(s)]
                actor.learn(
                    observation, action,
                    td_error)  # true_gradient = grad[logPi(s,a) * td_error]

                # check memory for RNN model
                if len(inputs) > self.max_memory:
                    del inputs[0]
                    del outputs[0]
                    del predicteds[0]
                    del rewards[0]

                if verbose > 0:
                    if env.actions[action] == "LONG" or env.actions[
                            action] == "SHORT":
                        #if env.actions[action] == "LONG" or env.actions[action] == "SHORT" or env.actions[action] == "HOLD":
                        color = bcolors.FAIL if env.actions[
                            action] == "LONG" else bcolors.OKBLUE
                        print("%s:\t%s\t%.2f\t%.2f\t" %
                              (info["dt"], color + env.actions[action] +
                               bcolors.ENDC, reward_sum, info["cum"]) +
                              ("\t".join([
                                  "%s:%.2f" % (l, i)
                                  for l, i in zip(env.actions, aprob.tolist())
                              ])))
                    #write_iter.writerow("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], env.actions[action], reward_sum, info["cum"]) + ("\t".join(["%s:%.2f" % (l, i) for l, i in zip(env.actions, aprob.tolist())])))
                    os.system("echo %s >> %s" %
                              ("%s:\t%s\t%.2f\t%.2f\t" %
                               (info["dt"], env.actions[action], reward_sum,
                                info["cum"]) +
                               ("\t".join([
                                   "%s:%.2f" % (l, i)
                                   for l, i in zip(env.actions, aprob.tolist())
                               ])), f_episode))

                avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
                toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (
                    e, info["code"],
                    (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) +
                    ("%.2f" % reward_sum) + bcolors.ENDC, info["cum"],
                    avg_reward_sum)
                print(toPrint)
                if self.history_filename != None:
                    os.system("echo %s >> %s" %
                              (toPrint, self.history_filename))

                dim = len(inputs[0])
                inputs_ = [[] for i in range(dim)]
                for obs in inputs:
                    for i, block in enumerate(obs):
                        inputs_[i].append(block[0])
                inputs_ = [np.array(inputs_[i]) for i in range(dim)]

                outputs_ = np.vstack(outputs)
                predicteds_ = np.vstack(predicteds)
                rewards_ = np.vstack(rewards)

                print("shape: ", np.shape(rewards))

                print("fit model input.shape %s, output.shape %s" %
                      ([inputs_[i].shape
                        for i in range(len(inputs_))], outputs_.shape))

                np.set_printoptions(linewidth=200, suppress=True)
                print("currentTargetIndex:", env.currentTargetIndex)
Example #29
MAX_EPISODE = 1000
MAX_EP_STEPS = 2000
LR_A = 0.001
LR_C = 0.01

env = gym.make('MountainCar-v0')
env = env.unwrapped
env.seed(1)

n_features = env.observation_space.shape[0]
n_actions = env.action_space.n

sess0 = tf.Session()
sess1 = tf.Session()
rl = [[
    Actor(sess0, n_features, n_actions, name='actor0', lr=LR_A),
    Critic(sess0, n_features, name='critic0', lr=LR_C)
],
      [
          Actor(sess1, n_features, n_actions, name='actor1', lr=LR_A),
          Critic(sess1, n_features, name='critic1', lr=LR_C)
      ]]
sess0.run(tf.global_variables_initializer())
sess1.run(tf.global_variables_initializer())

episode_positive = []
episode_negative = []
episode_mix = []
for episode in range(MAX_EPISODE):
    for i in range(len(rl)):
        step = 0
Example #30
File: ddpg.py Project: bencottier/DRL-TF2
def ddpg(env_fn, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, 
         replay_size=int(1e6), discount=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, 
         batch_size=100, start_steps=10000, act_noise=0.1, max_ep_len=1000, 
         logger_kwargs=dict(), save_freq=1):
    """
    Implements the deep deterministic policy gradient algorithm.

    Performance statistics are logged to stdout and to file in 
    CSV format, and models are saved regularly during training.

    Args:
        env_fn: callable. Must load an instance of an environment 
            that implements the OpenAI Gym API.
        ac_kwargs: dict. Additional keyword arguments to be passed 
            to the Actor and Critic constructors.
        seed: int. Random seed.
        steps_per_epoch: int. Number of training steps or 
            environment interactions that make up one epoch.
        epochs: int. Number of epochs for training.
        replay_size: int. Maximum number of transitions that 
            can be stored in the replay buffer.
        discount: float. Rate of discounting on future reward, 
            usually denoted with the Greek letter gamma. Normally 
            between 0 and 1.
        polyak: float. Weighting of target estimator parameters
            in the target update (which is a "Polyak" average).
        pi_lr: float. Learning rate for the policy or actor estimator.
        q_lr: float. Learning rate for the Q or critic estimator.
        batch_size: int. Number of transitions to sample from the 
            replay buffer per gradient update of the estimators.
        start_steps: int. Number of initial training steps where 
            actions are chosen at random instead of the policy, 
            as a means of increasing exploration.
        act_noise: float. Scale (standard deviation) of the Gaussian 
            noise added to the policy for exploration during training.
        max_ep_len: int. Maximum number of steps for one episode in 
            the environment. Episode length may be shorter if there
            are terminal states.
        logger_kwargs: dict. Keyword arguments to be passed to the 
            logger. Can be set up using utils.setup_logger_kwargs().
        save_freq: int. Models are saved per this number of epochs.
        
    """
    # Set up logging
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Set random seed for relevant modules
    tf.random.set_seed(seed)
    np.random.seed(seed)

    # Create environment
    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    if env._max_episode_steps < max_ep_len:
        max_ep_len = env._max_episode_steps
    if steps_per_epoch % max_ep_len != 0:
        """
        Training steps are batched at the end of a trajectory, so if 
        episode length does not divide steps per epoch, the size of 
        training step log arrays can be inconsistent. This takes the 
        upper bound on size, which wastes some memory but is easy.
        """
        max_logger_steps = steps_per_epoch + max_ep_len - (steps_per_epoch % max_ep_len)
    else:
        max_logger_steps = steps_per_epoch

    # Action limit for clipping
    # Assumes all dimensions have the same limit
    act_limit = env.action_space.high[0]

    # Give actor-critic model access to action space
    ac_kwargs['action_space'] = env.action_space

    # Randomly initialise critic and actor networks
    critic = Critic(input_shape=(batch_size, obs_dim + act_dim), lr=q_lr, **ac_kwargs)
    actor = Actor(input_shape=(batch_size, obs_dim), lr=pi_lr, **ac_kwargs)

    # Initialise target networks with the same weights as main networks
    critic_target = Critic(input_shape=(batch_size, obs_dim + act_dim), **ac_kwargs)
    actor_target = Actor(input_shape=(batch_size, obs_dim), **ac_kwargs)
    critic_target.set_weights(critic.get_weights())
    actor_target.set_weights(actor.get_weights())

    # Initialise replay buffer for storing and getting batches of transitions
    replay_buffer = ReplayBuffer(obs_dim, act_dim, size=replay_size)

    # Set up model checkpointing so we can resume training or test separately
    checkpoint_dir = os.path.join(logger.output_dir, 'training_checkpoints')
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(critic=critic, actor=actor)

    def get_action(o, noise_scale):
        """
        Computes an action from the policy (as a function of the 
        observation `o`) with added noise (scaled by `noise_scale`),
        clipped within the bounds of the action space.
        """
        a = actor(o.reshape(1, -1))
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    @tf.function
    def train_step(batch):
        """
        Performs a gradient update on the actor and critic estimators
        from the given batch of transitions.

        Args:
            batch: dict. A batch of transitions. Must store valid 
                values for 'obs1', 'acts', 'obs2', 'rwds', and 'done'. 
                Obtained from ReplayBuffer.sample_batch().
        Returns:
            A tuple of the Q values, critic loss, and actor loss.
        """
        with tf.GradientTape(persistent=True) as tape:
            # Critic loss
            q = critic(batch['obs1'], batch['acts'])
            q_pi_targ = critic_target(batch['obs2'], actor_target(batch['obs2']))
            backup = tf.stop_gradient(batch['rwds'] + discount * (1 - batch['done']) * q_pi_targ)
            q_loss = tf.reduce_mean((q - backup)**2)
            # Actor loss
            pi = actor(batch['obs1'])
            q_pi = critic(batch['obs1'], pi)
            pi_loss = -tf.reduce_mean(q_pi)
        # Q learning update
        critic_gradients = tape.gradient(q_loss, critic.trainable_variables)
        critic.optimizer.apply_gradients(zip(critic_gradients, critic.trainable_variables))
        # Policy update
        actor_gradients = tape.gradient(pi_loss, actor.trainable_variables)
        actor.optimizer.apply_gradients(zip(actor_gradients, actor.trainable_variables))
        return q, q_loss, pi_loss

    def test_agent(n=10):
        """
        Evaluates the deterministic (noise-free) policy with a sample 
        of `n` trajectories.
        """
        for _ in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(n, TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    for t in range(total_steps):
        """
        Start with `start_steps` number of steps with random actions,
        to improve exploration. Then use the learned policy with some 
        noise added to keep up exploration (but less so).
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Execute a step in the environment
        o2, r, d, _ = env.step(a)
        o2 = np.squeeze(o2)  # bug fix for Pendulum-v0 environment, where act_dim == 1
        ep_ret += r
        ep_len += 1
        
        """
        Ignore the "done" signal if it comes from hitting the time
        horizon (that is, when it's an artificial terminal signal
        that isn't based on the agent's state)
        """
        d = False if ep_len==max_ep_len else d

        # Store transition in replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Advance the stored state
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for _ in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)

                # Actor-critic update
                q, q_loss, pi_loss = train_step(batch)
                logger.store((max_logger_steps, batch_size), QVals=q.numpy())
                logger.store(max_logger_steps, LossQ=q_loss.numpy(), LossPi=pi_loss.numpy())

                # Target update
                critic_target.polyak_update(critic, polyak)
                actor_target.polyak_update(actor, polyak)

            logger.store(max_logger_steps // max_ep_len, EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Post-training for this epoch: save, test and write logs
        if t > 0 and (t+1) % steps_per_epoch == 0:
            epoch = (t+1) // steps_per_epoch

            # Save the model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                checkpoint.save(file_prefix=checkpoint_prefix)

            # Test the performance of the deterministic policy
            test_agent()

            # Log info about the epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t+1)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
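train_step() above samples batches as a dict with the keys 'obs1', 'acts', 'obs2', 'rwds', and 'done'. A minimal ReplayBuffer sketch consistent with that usage (the project's actual buffer may differ in detail):

import numpy as np

class ReplayBuffer:
    """FIFO experience replay buffer keyed to the batch fields used in train_step."""

    def __init__(self, obs_dim, act_dim, size):
        self.obs1_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.acts_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.rwds_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rwd, next_obs, done):
        # Overwrite the oldest transition once the buffer is full.
        self.obs1_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rwds_buf[self.ptr] = rwd
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs1=self.obs1_buf[idxs], obs2=self.obs2_buf[idxs],
                    acts=self.acts_buf[idxs], rwds=self.rwds_buf[idxs],
                    done=self.done_buf[idxs])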