Example #1
 def init(self, args, env):
     names = ['state0', 'action', 'state1', 'reward', 'terminal', 'goal']
     self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy())
     if args['--imit'] != '0':
         names.append('expVal')
         self.bufferImit = ReplayBuffer(limit=int(1e6), names=names.copy())
     self.critic = CriticDQNG(args, env)
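The constructor call above assumes a buffer keyed by field names ('state0', 'action', ...). As a rough illustration only, since the project's actual ReplayBuffer class is not shown here, a minimal dict-of-deques buffer with that limit/names interface might look like the sketch below (class and method names are assumptions):

import random
from collections import deque

class NamedReplayBuffer:
    """Sketch: one deque per named field, capped at `limit` entries."""
    def __init__(self, limit, names):
        self.names = list(names)
        self.contents = {name: deque(maxlen=limit) for name in self.names}

    def append(self, experience):
        # `experience` is a dict mapping each field name to one value
        for name in self.names:
            self.contents[name].append(experience[name])

    def sample(self, batch_size):
        idxs = random.sample(range(len(self.contents[self.names[0]])), batch_size)
        return {name: [self.contents[name][i] for i in idxs] for name in self.names}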
Example #2
    def __init__(self, state_size, action_size, seed, framework, buffer_type):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            framework (str): framework identifier stored on the agent
            buffer_type (str): replay buffer variant, 'ReplayBuffer' or 'PER_ReplayBuffer'
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.framework = framework
        self.buffer_type = buffer_type

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        # def __init__(self, device, buffer_size, batch_size, alpha, beta):
        if self.buffer_type == 'PER_ReplayBuffer':
            self.memory = PER_ReplayBuffer(device, BUFFER_SIZE, BATCH_SIZE,
                                           ALPHA, BETA)
        elif self.buffer_type == 'ReplayBuffer':
            self.memory = ReplayBuffer(device, action_size, BUFFER_SIZE,
                                       BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
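Agents built this way usually pair the constructor with a step() method that stores each transition and triggers learning every UPDATE_EVERY steps once the buffer holds enough samples. A minimal sketch of that pattern, assuming a learn(experiences, gamma) method and module-level UPDATE_EVERY and GAMMA constants (not shown in the snippet):

    def step(self, state, action, reward, next_state, done):
        # Store the transition, then learn every UPDATE_EVERY time steps.
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)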
Example #3
    def __init__(self,
                 env,
                 gamma=0.99,
                 tau=1e-3,
                 pol_lr=1e-4,
                 q_lr=5e-3,
                 batch_size=64,
                 buffer_size=10000,
                 target_noise=0.2,
                 action_noise=0.1,
                 clip_range=0.5,
                 update_delay=2):

        # environment stuff
        self.env = env
        self.num_act = env.action_space.shape[0]
        self.num_obs = env.observation_space.shape[0]
        self.eval_env = copy.deepcopy(env)

        # hyper parameters
        self.gamma = gamma
        self.tau = tau
        self.pol_lr = pol_lr
        self.q_lr = q_lr
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.target_noise = target_noise
        self.action_noise = action_noise
        self.clip_range = clip_range
        self.update_delay = update_delay

        # networks
        self.pol = Actor(self.num_obs, self.num_act, [400, 300]).double()
        self.q1 = Critic(self.num_obs, self.num_act, [400, 300]).double()
        self.q2 = Critic(self.num_obs, self.num_act, [400, 300]).double()
        self.pol.init_weights()
        self.q1.init_weights()
        self.q2.init_weights()
        self.target_pol = copy.deepcopy(self.pol).double()
        self.target_q1 = copy.deepcopy(self.q1).double()
        self.target_q2 = copy.deepcopy(self.q2).double()

        # optimizers, buffer
        self.pol_opt = torch.optim.Adam(self.pol.parameters(), lr=self.pol_lr)
        self.q1_opt = torch.optim.Adam(
            self.q1.parameters(),
            lr=self.q_lr,
        )
        self.q2_opt = torch.optim.Adam(
            self.q2.parameters(),
            lr=self.q_lr,
        )
        self.buffer = ReplayBuffer(self.buffer_size, 1000)
        self.mse_loss = torch.nn.MSELoss()

        self.cum_q1_loss = 0
        self.cum_q2_loss = 0
        self.cum_obj = 0
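The tau hyperparameter above is typically consumed by a Polyak (soft) target update after each learning step: the target parameters move a fraction tau toward the online parameters. A sketch of such a helper, which is not part of the snippet itself:

    def soft_update(self, net, target_net):
        # target <- tau * online + (1 - tau) * target, applied parameter-wise
        with torch.no_grad():
            for p, p_targ in zip(net.parameters(), target_net.parameters()):
                p_targ.mul_(1.0 - self.tau)
                p_targ.add_(self.tau * p)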
Example #4
 def init(self, args, env):
     names = ['s0', 'a', 's1', 'r', 't', 'g']
     metrics = ['loss_dqn', 'loss_actor']
     self.buffer = ReplayBuffer(limit=int(1e6),
                                names=names.copy(),
                                args=args)
     self.actorCritic = ActorCriticDDPGG(args, env)
     for metric in metrics:
         self.metrics[metric] = 0
Example #5
    def __init__(self,
                 env,
                 state_dim: int,
                 action_dim: int,
                 config: Dict,
                 device=None,
                 writer=None):
        self.logger = logging.getLogger("MADDPG")
        self.device = device if device is not None else DEVICE
        self.writer = writer

        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.agents_number = config['agents_number']

        hidden_layers = config.get('hidden_layers', (400, 300))
        noise_scale = config.get('noise_scale', 0.2)
        noise_sigma = config.get('noise_sigma', 0.1)
        actor_lr = config.get('actor_lr', 1e-3)
        actor_lr_decay = config.get('actor_lr_decay', 0)
        critic_lr = config.get('critic_lr', 1e-3)
        critic_lr_decay = config.get('critic_lr_decay', 0)
        self.actor_tau = config.get('actor_tau', 0.002)
        self.critic_tau = config.get('critic_tau', 0.002)
        create_agent = lambda: DDPGAgent(state_dim,
                                         action_dim,
                                         agents=self.agents_number,
                                         hidden_layers=hidden_layers,
                                         actor_lr=actor_lr,
                                         actor_lr_decay=actor_lr_decay,
                                         critic_lr=critic_lr,
                                         critic_lr_decay=critic_lr_decay,
                                         noise_scale=noise_scale,
                                         noise_sigma=noise_sigma,
                                         device=self.device)
        self.agents = [create_agent() for _ in range(self.agents_number)]

        self.discount = config.get('discount', 0.99)
        self.gradient_clip = config.get('gradient_clip', 1.0)

        self.warm_up = config.get('warm_up', 1e3)
        self.buffer_size = config.get('buffer_size', int(1e6))
        self.batch_size = config.get('batch_size', 128)
        self.p_batch_size = config.get('p_batch_size',
                                       int(self.batch_size // 2))
        self.n_batch_size = config.get('n_batch_size',
                                       int(self.batch_size // 4))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.update_every_iterations = config.get('update_every_iterations', 2)
        self.number_updates = config.get('number_updates', 2)

        self.reset()
Example #6
 def init(self, args, env):
     names = ['s0', 'a', 's1', 'r', 't', 'g', 'm', 'task', 'mcr']
     metrics = ['loss_dqn', 'qval', 'val']
     self.buffer = ReplayBuffer(limit=int(1e6),
                                names=names.copy(),
                                args=args)
     self.actorCritic = ActorCriticDQNGM(args, env)
     for metric in metrics:
         self.metrics[metric] = 0
     self.goalcounts = np.zeros((len(self.env.goals), ))
Example #7
    def __init__(self, state_size, action_size, args):
        """
        Initialize a D4PG Agent.
        """

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.action_size = action_size
        self.state_size = state_size
        self.framework = args.framework
        self.eval = args.eval
        self.agent_count = 1
        self.learn_rate = args.learn_rate
        self.batch_size = args.batch_size
        self.buffer_size = args.buffer_size
        self.C = args.C
        self._epsilon = args.epsilon
        self.epsilon_decay = args.epsilon_decay
        self.epsilon_min = args.epsilon_min

        self.gamma = 0.99
        self.rollout = args.rollout
        self.tau = args.tau
        self.momentum = 1
        self.l2_decay = 0.0001
        self.update_type = "hard"

        self.t_step = 0
        self.episode = 0
        self.seed = 0

        # Set up memory buffers
        if args.prioritized_experience_replay:
            # Note: the frame-stack depth is assumed to be provided via args.
            self.memory = PERBuffer(self.buffer_size, self.batch_size,
                                    args.framestack, self.device, args.alpha,
                                    args.beta)
            self.criterion = WeightedLoss()
        else:
            self.memory = ReplayBuffer(self.device, self.buffer_size,
                                       self.gamma, self.rollout)

        #                    Initialize Q networks                         #
        self.q = self._make_model(state_size, action_size, args.pixels)
        self.q_target = self._make_model(state_size, action_size, args.pixels)
        self._hard_update(self.q, self.q_target)
        self.q_optimizer = self._set_optimizer(self.q.parameters(),
                                               lr=self.learn_rate,
                                               decay=self.l2_decay,
                                               momentum=self.momentum)

        self.new_episode()
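The epsilon fields above suggest a decaying exploration schedule. A possible sketch of the decay step, assuming multiplicative decay floored at epsilon_min (the helper is not shown in the snippet):

    def _decay_epsilon(self):
        # Multiplicative decay, never dropping below the minimum exploration rate.
        self._epsilon = max(self.epsilon_min, self._epsilon * self.epsilon_decay)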
Example #8
 def __init__(self, s_dim, num_actions, lr):
     self.step = 0
     self.epStep = 0
     self.ep = 0
     self.tutorListened = True
     self.tutorInput = ''
     self.sDim = s_dim
     self.num_actions = num_actions
     self.learning_rate = lr
     self.names = ['state0', 'action', 'feedback', 'fWeight']
     self.buffer = ReplayBuffer(limit=int(1e6), names=self.names)
     self.batchSize = 64
     self.episode = deque(maxlen=400)
     self.model = self.create_model()
Example #9
    def __init__(self, env, hyperparameters, device, summary_writer=None):
        """Set parameters, initialize network."""

        state_space_shape = env.observation_space.shape
        action_space_size = env.action_space.n

        self.env = env

        self.online_network = DQN(state_space_shape,
                                  action_space_size).to(device)

        self.target_network = DQN(state_space_shape,
                                  action_space_size).to(device)

        # XXX maybe not really necessary?
        self.update_target_network()

        self.experience_replay = None

        self.accumulated_loss = []
        self.device = device

        self.optimizer = optim.Adam(self.online_network.parameters(),
                                    lr=hyperparameters['learning_rate'])

        self.double_DQN = hyperparameters['double_DQN']

        # Discount factor
        self.gamma = hyperparameters['gamma']

        # Number of steps used for multi-step (n-step) returns
        self.n_multi_step = hyperparameters['n_multi_step']

        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                          hyperparameters['n_multi_step'],
                                          hyperparameters['gamma'])

        self.birth_time = time.time()

        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.summary_writer = summary_writer

        # Greedy search hyperparameters
        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']
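Because the buffer is constructed with n_multi_step and gamma, it presumably folds n consecutive transitions into a single n-step transition before storage. A sketch of that folding step, assuming transitions are (state, action, reward, next_state, done) tuples:

def n_step_transition(step_buffer, gamma):
    # Collapse the oldest n transitions into (s_t, a_t, R, s_{t+n}, done),
    # truncating the return at the first terminal flag.
    state, action = step_buffer[0][0], step_buffer[0][1]
    reward, next_state, done = 0.0, step_buffer[-1][3], step_buffer[-1][4]
    for _, _, r, s2, d in reversed(step_buffer):
        reward = r + gamma * reward * (1.0 - d)
        if d:
            next_state, done = s2, d
    return state, action, reward, next_state, done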
Example #10
    def __init__(
        self,
        env,
        sub_states,
        layers,
        gamma=0.99,
        tau=1e-3,
        pol_lr=1e-4,
        q_lr=1e-3,
        batch_size=64,
        buffer_size=10000,
    ):

        # environment stuff
        self.env = env
        self.num_act = env.action_space.shape[0]
        self.num_obs = env.observation_space.shape[0]
        self.eval_env = copy.deepcopy(env)
        self.sub_states = sub_states
        self.layers = layers

        # hyper parameters
        self.gamma = gamma
        self.tau = tau
        self.pol_lr = pol_lr
        self.q_lr = q_lr
        self.batch_size = batch_size
        self.buffer_size = buffer_size

        # networks
        self.pol = Actor(self.num_obs, self.num_act, [400, 300]).double()
        # decomp critic
        self.q = DecompCritic(self.sub_states, self.num_act, layers).double()
        self.pol.init_weights()
        self.q.init_weights()
        self.target_pol = copy.deepcopy(self.pol).double()
        self.target_q = copy.deepcopy(self.q).double()

        # optimizers, buffer
        self.pol_opt = torch.optim.Adam(self.pol.parameters(), lr=self.pol_lr)
        self.q_opt = torch.optim.Adam(
            self.q.parameters(),
            lr=self.q_lr,
        )
        self.buffer = ReplayBuffer(self.buffer_size, 1000)
        self.mse_loss = torch.nn.MSELoss()

        self.cum_loss = 0
        self.cum_obj = 0
Example #11
 def __init__(self,
              n_actions,
              buffer_size=1000000,
              behaviour_policy='epsilon_greedy',
              discount_factor=0.99,
              clip_grad_norm_value=10.0,
              policy_args={}):
     self.discount_factor = discount_factor
     self.clip_grad_norm_value = clip_grad_norm_value
     self.replay_buffer = ReplayBuffer(capacity=buffer_size)
     if behaviour_policy == 'epsilon_greedy':
         self.policy = EpsilonGreedyPolicy(policy_args)
     else:
         self.policy = SoftPolicy()
     self.q_network = QNetwork(n_actions).to(device)
Example #12
    def __init__(self, env, device, hyperparameters, summary_writer=None):
        '''
        Agent initialization. Creates the CentralControl that handles all the low-level operations.
        '''
        self.rewards = []
        self.total_reward = 0
        self.birth_time = 0
        self.n_iter = 0
        self.n_games = 0
        self.ts_frame = 0
        self.ts = time.time()

        self.Memory = namedtuple(
            'Memory', ['obs', 'action', 'new_obs', 'reward', 'done'],
            rename=False)

        # The CentralControl is the 'brain' of the agent
        self.cc = CentralControl(env.observation_space.shape,
                                 env.action_space.n, hyperparameters['gamma'],
                                 hyperparameters['n_multi_step'],
                                 hyperparameters['double_DQN'],
                                 hyperparameters['noisy_net'],
                                 hyperparameters['dueling'], device)

        self.cc.set_optimizer(hyperparameters['learning_rate'])

        self.birth_time = time.time()

        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']

        self.accumulated_loss = []
        self.device = device

        # initialize the replay buffer (i.e. the memory) of the agent
        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                          hyperparameters['n_multi_step'],
                                          hyperparameters['gamma'])
        self.summary_writer = summary_writer

        self.noisy_net = hyperparameters['noisy_net']

        self.env = env
Example #13
    def __init__(self, env, args):
        super(PlayroomGM, self).__init__(env)

        self.gamma = float(args['--gamma'])
        self.eps = float(args['--eps'])
        self.demo_f = [int(f) for f in args['--demo'].split(',')]

        self.feat = np.array([int(f) for f in args['--features'].split(',')])
        self.N = self.feat.shape[0]
        vs = np.zeros(shape=(self.N, self.state_dim[0]))
        vs[np.arange(self.N), self.feat] = 1
        self.vs = vs / np.sum(vs, axis=1, keepdims=True)
        self.R = 100
        self.idx = -1
        self.v = np.zeros(shape=(self.state_dim[0], 1))
        self.g = np.ones(shape=(self.state_dim[0]))
        self.queues = [CompetenceQueue() for _ in range(self.N)]
        self.names = ['s0', 'r0', 'a', 's1', 'r1', 'g', 'v', 'o', 'u']
        self.buffer = ReplayBuffer(limit=int(1e5), names=self.names, N=self.N)
Example #14
    def __init__(self, env, device, cfg, summary_writer=None):
        '''
        Agent initialization. Creates the CentralControl that handles all the low-level operations.
        '''

        # The CentralControl is the 'brain' of the agent
        self.cc = CentralControl(env.observation_space.shape,
                                 env.action_space.n, cfg.rl.gamma,
                                 cfg.rl.n_multi_step,
                                 cfg.neural_net.double_dqn,
                                 cfg.neural_net.noisy_net,
                                 cfg.neural_net.dueling, device)

        self.cc.set_optimizer(cfg.train.learning_rate)

        self.birth_time = time.time()

        self.iter_update_target = cfg.replay.n_iter_update_target
        self.buffer_start_size = cfg.replay.buffer_start_size

        self.epsilon_start = cfg.rl.epsilon_start
        self.epsilon = cfg.rl.epsilon_start
        self.epsilon_decay = cfg.rl.epsilon_decay
        self.epsilon_final = cfg.rl.epsilon_final

        self.accumulated_loss = []
        self.device = device

        # initialize the replay buffer (i.e. the memory) of the agent
        self.replay_buffer = ReplayBuffer(cfg.replay.buffer_capacity,
                                          cfg.rl.n_multi_step, cfg.rl.gamma)
        self.summary_writer = summary_writer

        self.noisy_net = cfg.neural_net.noisy_net

        self.env = env

        self.total_reward = 0
        self.n_iter = 0
        self.n_games = 0
        self.ts_frame = 0
        self.ts = time.time()
        self.rewards = []
Example #15
File: sac.py Project: vietbt/RLpp
    def __init__(self,
                 env,
                 gamma=0.99,
                 tau=0.005,
                 learning_rate=3e-4,
                 buffer_size=50000,
                 learning_starts=100,
                 train_freq=1,
                 batch_size=64,
                 target_update_interval=1,
                 gradient_steps=1,
                 target_entropy='auto',
                 ent_coef='auto',
                 random_exploration=0.0,
                 discrete=True,
                 regularized=True,
                 feature_extraction="cnn"):
        self.env = env
        self.learning_starts = learning_starts
        self.random_exploration = random_exploration
        self.train_freq = train_freq
        self.target_update_interval = target_update_interval
        self.batch_size = batch_size
        self.gradient_steps = gradient_steps
        self.learning_rate = learning_rate

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf.Session(graph=self.graph)
            self.replay_buffer = ReplayBuffer(buffer_size)
            self.agent = SACAgent(self.sess,
                                  env,
                                  discrete=discrete,
                                  regularized=regularized,
                                  feature_extraction=feature_extraction)
            self.model = SACModel(self.sess, self.agent, target_entropy,
                                  ent_coef, gamma, tau)
            with self.sess.as_default():
                self.sess.run(tf.global_variables_initializer())
                self.sess.run(self.model.target_init_op)
        self.num_timesteps = 0
Example #16
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #17
def main():
    with tf.Session() as sess:

        actor = ActorNetwork(sess, STATE_DIM, ACTION_DIM, ACTION_BOUND,
                             ACTOR_LEARNING_RATE, TAU, MINIBATCH_SIZE)
        critic = CriticNetwork(sess, STATE_DIM, ACTION_DIM,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        #actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))

        #TODO: Ornstein-Uhlenbeck noise.

        sess.run(tf.global_variables_initializer())

        # initialize target net
        actor.update_target_network()
        critic.update_target_network()

        # initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE)

        # main loop.
        for ep in range(MAX_EPISODES):

            episode_reward = 0
            ep_batch_avg_q = 0

            s = ENV.reset()

            for step in range(MAX_EP_STEPS):

                a = actor.predict(np.reshape(s,
                                             (1, STATE_DIM)))  #+ actor_noise()
                s2, r, terminal, info = ENV.step(a[0])
                #print(s2)

                replay_buffer.add(np.reshape(s, (STATE_DIM,)), \
                                np.reshape(a, (ACTION_DIM,)), \
                                r, \
                                terminal, \
                                np.reshape(s2, (STATE_DIM,)))

                # Batch sampling.
                if replay_buffer.size() > MINIBATCH_SIZE and \
                    step % TRAIN_INTERVAL == 0:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Compute the target Q-values.
                    target_action = actor.predict_target(s2_batch)
                    target_q = critic.predict_target(s2_batch, target_action)

                    # Compute the TD targets for the critic.
                    targets = []
                    for i in range(MINIBATCH_SIZE):
                        if t_batch[i]:
                            # terminal
                            targets.append(r_batch[i])
                        else:
                            targets.append(r_batch[i] + GAMMA * target_q[i])

                    # Train the critic.
                    # TODO: pred_q comes from a random batch rather than a single episode, so an episode_avg_max statistic is not appropriate.
                    pred_q, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(targets, (MINIBATCH_SIZE, 1)))

                    # Train the actor.
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    #print(grads[0].shape)
                    #exit(1)
                    actor.train(s_batch, grads[0])

                    # Update target networks.
                    # Should this be done only once every few batches?
                    actor.update_target_network()
                    critic.update_target_network()

                    ep_batch_avg_q += np.mean(pred_q)

                s = s2
                episode_reward += r

                if terminal:
                    print('Episode:', ep, 'Reward:', episode_reward)
                    reward_log.append(episode_reward)
                    q_log.append(ep_batch_avg_q / step)

                    break
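The loop above only needs the buffer to expose add(s, a, r, terminal, s2), size(), and sample_batch(n). A minimal deque-based class matching that interface, shown purely as a sketch rather than the project's actual implementation:

import random
from collections import deque

import numpy as np

class SimpleReplayBuffer:
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, s, a, r, t, s2):
        self.buffer.append((s, a, r, t, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # Uniformly sample transitions and stack each field into an array.
        batch = random.sample(self.buffer, batch_size)
        s, a, r, t, s2 = map(np.array, zip(*batch))
        return s, a, r, t, s2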
Example #18
def train_expert(env_name):
    """Train expert policy in given environment."""
    if env_name == 'InvertedPendulum-v2':
        env = ExpertInvertedPendulumEnv()
        episode_limit = 200
        return_threshold = 200
    elif env_name == 'InvertedDoublePendulum-v2':
        env = ExpertInvertedDoublePendulumEnv()
        episode_limit = 50
        return_threshold = 460
    elif env_name == 'ThreeReacherEasy-v2':
        env = ThreeReacherEasyEnv()
        episode_limit = 50
        return_threshold = -0.8
    elif env_name == 'ReacherEasy-v2':
        env = ReacherEasyEnv()
        episode_limit = 50
        return_threshold = -0.8
    elif env_name == 'Hopper-v2':
        env = HopperEnv()
        episode_limit = 200
        return_threshold = 600
    elif env_name == 'HalfCheetah-v2':
        env = ExpertHalfCheetahEnv()
        episode_limit = 200
        return_threshold = 1000
    elif env_name == 'StrikerHumanSim-v2':
        env = StrikerHumanSimEnv()
        episode_limit = 200
        return_threshold = -190
    elif env_name == 'PusherHumanSim-v2':
        env = PusherHumanSimEnv()
        episode_limit = 200
        return_threshold = -80
    else:
        raise NotImplementedError
    buffer_size = 1000000
    init_random_samples = 1000
    exploration_noise = 0.2
    learning_rate = 3e-4
    batch_size = 256
    epochs = 200
    steps_per_epoch = 5000
    updates_per_step = 1
    update_actor_every = 1
    start_training = 512
    gamma = 0.99
    polyak = 0.995
    entropy_coefficient = 0.2
    clip_actor_gradients = False
    visual_env = True
    action_size = env.action_space.shape[0]
    tune_entropy_coefficient = True
    target_entropy = -1 * action_size

    def make_actor():
        actor = StochasticActor([
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(action_size * 2)
        ])
        return actor

    def make_critic():
        critic = Critic([
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(1)
        ])
        return critic

    optimizer = tf.keras.optimizers.Adam(learning_rate)

    replay_buffer = ReplayBuffer(buffer_size)
    sampler = Sampler(env,
                      episode_limit=episode_limit,
                      init_random_samples=init_random_samples,
                      visual_env=visual_env)
    agent = SAC(make_actor,
                make_critic,
                make_critic,
                actor_optimizer=optimizer,
                critic_optimizer=optimizer,
                gamma=gamma,
                polyak=polyak,
                entropy_coefficient=entropy_coefficient,
                tune_entropy_coefficient=tune_entropy_coefficient,
                target_entropy=target_entropy,
                clip_actor_gradients=clip_actor_gradients)
    if visual_env:
        obs = np.expand_dims(env.reset()['obs'], axis=0)
    else:
        obs = np.expand_dims(env.reset(), axis=0)
    agent(obs)
    agent.summary()

    mean_test_returns = []
    mean_test_std = []
    steps = []

    step_counter = 0
    for e in range(epochs):
        while step_counter < (e + 1) * steps_per_epoch:
            traj_data = sampler.sample_trajectory(agent, exploration_noise)
            replay_buffer.add(traj_data)
            if step_counter > start_training:
                agent.train(replay_buffer,
                            batch_size=batch_size,
                            n_updates=updates_per_step * traj_data['n'],
                            act_delay=update_actor_every)
            step_counter += traj_data['n']
        print('Epoch {}/{} - total steps {}'.format(e + 1, epochs,
                                                    step_counter))
        out = sampler.evaluate(agent, 10)
        mean_test_returns.append(out['mean'])
        mean_test_std.append(out['std'])
        steps.append(step_counter)
        if out['mean'] >= return_threshold:
            print('Early termination due to reaching return threshold')
            break
    plt.errorbar(steps, mean_test_returns, mean_test_std)
    plt.xlabel('steps')
    plt.ylabel('returns')
    plt.show()
    return agent
Example #19
 def init(self, args, env):
     self.critic = np.zeros(shape=(5, 5, 4))
     self.buffer = ReplayBuffer(limit=int(1e6), names=self.names)
Example #20
    def learn(self, timesteps=10000, verbose=0, seed=None):
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)

        self.eps_range = self._eps_range(timesteps)
        replay_buffer = ReplayBuffer(self.buffer_size)

        self._init_model()

        obs = self.env.reset()
        for step in range(timesteps):
            # while not done:
            cur_eps = next(self.eps_range, None)
            if cur_eps is None:
                cur_eps = self.final_eps

            action = self._select_action(obs, cur_eps)

            new_obs, rewards, done, info = self.env.step(action)
            if done:
                new_obs = [
                    np.nan
                ] * self.obs_shape[0]  # hacky way to keep dimensions correct
            replay_buffer.add(obs, action, rewards, new_obs)

            obs = new_obs

            # learn gradient
            if step > self.learning_starts:
                if len(replay_buffer.buffer
                       ) < self.batch_size:  # buffer too small
                    continue
                samples = replay_buffer.sample(self.batch_size, self.device)
                obs_batch, actions_batch, rewards_batch, new_obs_batch = samples

                predicted_q_values = self._predictQValue(
                    self.step_model, obs_batch, actions_batch)
                ys = self._expectedLabels(self.target_model, new_obs_batch,
                                          rewards_batch)

                loss = F.smooth_l1_loss(predicted_q_values, ys)

                self.optim.zero_grad()
                loss.backward()
                for param in self.step_model.parameters():
                    param.grad.clamp_(min=-1, max=1)  # clip to mitigate exploding gradients
                    # param.grad.clamp_(min=-10, max=10)
                self.optim.step()

                # update target
                if step % self.target_network_update_freq == 0:
                    self.target_model.load_state_dict(
                        self.step_model.state_dict())

            if done:
                obs = self.env.reset()
            if verbose == 1:
                if step % (timesteps * 0.1) == 0:
                    perc = int(step / (timesteps * 0.1))
                    print(f"At step {step}")
                    print(f"{perc}% done")
Example #21
    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        kwargs["expl_noise"] = args.expl_noise
        kwargs["tau"] = args.tau
        policy = TD3(**kwargs)
    elif args.policy == "SAC":
        kwargs["policy_freq"] = args.policy_freq
        kwargs["tau"] = args.tau
        policy = SAC(**kwargs)
    elif args.policy == "MPO":
        policy = MPO(**kwargs)
    if args.load_model != "":
        policy_file = (args.file_name
                       if args.load_model == "default" else args.load_model)
        policy.load(f"./models/{policy_file}")

    replay_buffer = ReplayBuffer(
        state_dim,
        action_dim,
        max_size=int(args.buffer_size),
    )

    train_loop = TRAIN_LOOPS[args.policy]

    train_loop(args, policy, replay_buffer, env)
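The ReplayBuffer here is constructed with state_dim, action_dim, and max_size, which points at a pre-allocated, array-backed ring buffer in the style of the reference TD3 code. A sketch under that assumption (the class name is chosen to avoid implying it is the real one):

import numpy as np

class ArrayReplayBuffer:
    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        self.max_size, self.ptr, self.size = max_size, 0, 0
        self.state = np.zeros((max_size, state_dim))
        self.action = np.zeros((max_size, action_dim))
        self.next_state = np.zeros((max_size, state_dim))
        self.reward = np.zeros((max_size, 1))
        self.not_done = np.zeros((max_size, 1))

    def add(self, state, action, next_state, reward, done):
        # Overwrite the oldest entry once the buffer is full (ring buffer).
        self.state[self.ptr], self.action[self.ptr] = state, action
        self.next_state[self.ptr], self.reward[self.ptr] = next_state, reward
        self.not_done[self.ptr] = 1.0 - done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        return (self.state[idx], self.action[idx], self.next_state[idx],
                self.reward[idx], self.not_done[idx])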
Example #22
 def init(self, args, env):
     names = ['state0', 'action', 'state1', 'reward', 'terminal']
     self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy())
     self.actorCritic = ActorCriticDDPG(args, env)
Example #23
    actor = ActorNetwork(sess, STATE_DIM, ACTION_DIM, ACTION_BOUND,
                         MINIBATCH_SIZE, TAU, ACTOR_LEARNING_RATE,
                         critic.model)

    actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))

    #TODO: Ornstein-Uhlenbeck noise.

    sess.run(tf.global_variables_initializer())

    # initialize target net
    actor.update_target_network()
    critic.update_target_network()

    # initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE)

    # log variables
    reward_log = []

    # main loop.
    for ep in range(MAX_EPISODES):

        episode_reward = 0

        s = ENV.reset()

        for step in range(MAX_EP_STEPS):

            a = actor.predict(np.reshape(s, (1, STATE_DIM))) + actor_noise()
            s2, r, terminal, info = ENV.step(a[0])
Example #24
 def init(self, args, env):
     names = ['state0', 'action', 'state1', 'reward', 'terminal']
     self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy())
     self.critic = CriticDQN(args, env)
     for metric_name in ['loss_dqn', 'qval', 'val']:
         self.metrics[metric_name] = 0
Example #25
    def __init__(self,
                 env,
                 args,
                 e_decay=1,
                 e_min=0.05,
                 l2_decay=0.0001,
                 update_type="hard"):
        """
        Initialize a D4PG Agent.
        """

        self.device = args.device
        self.framework = "D4PG"
        self.eval = args.eval
        self.agent_count = env.agent_count
        self.actor_learn_rate = args.actor_learn_rate
        self.critic_learn_rate = args.critic_learn_rate
        self.batch_size = args.batch_size
        self.buffer_size = args.buffer_size
        self.action_size = env.action_size
        self.state_size = env.state_size
        self.C = args.C
        self._e = args.e
        self.e_decay = e_decay
        self.e_min = e_min
        self.gamma = args.gamma
        self.rollout = args.rollout
        self.tau = args.tau
        self.update_type = update_type

        self.num_atoms = args.num_atoms
        self.vmin = args.vmin
        self.vmax = args.vmax
        self.atoms = torch.linspace(self.vmin, self.vmax,
                                    self.num_atoms).to(self.device)

        self.t_step = 0
        self.episode = 0

        # Set up memory buffers, currently only standard replay is implemented #
        self.memory = ReplayBuffer(self.device, self.buffer_size, self.gamma,
                                   self.rollout)

        #                    Initialize ACTOR networks                         #
        self.actor = ActorNet(args.layer_sizes, self.state_size,
                              self.action_size).to(self.device)
        self.actor_target = ActorNet(args.layer_sizes, self.state_size,
                                     self.action_size).to(self.device)
        self._hard_update(self.actor, self.actor_target)
        self.actor_optim = optim.Adam(self.actor.parameters(),
                                      lr=self.actor_learn_rate,
                                      weight_decay=l2_decay)

        #                   Initialize CRITIC networks                         #
        self.critic = CriticNet(args.layer_sizes, self.state_size,
                                self.action_size,
                                self.num_atoms).to(self.device)
        self.critic_target = CriticNet(args.layer_sizes, self.state_size,
                                       self.action_size,
                                       self.num_atoms).to(self.device)
        self._hard_update(self.critic, self.critic_target)
        self.critic_optim = optim.Adam(self.critic.parameters(),
                                       lr=self.critic_learn_rate,
                                       weight_decay=l2_decay)

        self.new_episode()
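_hard_update is called above to copy the freshly initialized online weights into the corresponding target networks. A plausible one-line implementation, since the actual helper is not shown in the snippet:

    def _hard_update(self, source, target):
        # Overwrite the target network's parameters with the source network's.
        target.load_state_dict(source.state_dict())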