Example #1
 def init(self, args, env):
     names = ['state0', 'action', 'state1', 'reward', 'terminal', 'goal']
     self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy())
     if args['--imit'] != '0':
         names.append('expVal')
         self.bufferImit = ReplayBuffer(limit=int(1e6), names=names.copy())
     self.critic = CriticDQNG(args, env)
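The ReplayBuffer(limit=..., names=...) variant constructed here (and in Examples #4, #6, #7, #13, #17, #22, #26 and #28) is not shown on this page. A minimal sketch consistent with the calls made on it (append, sample, nb_entries) could look like the code below; the class body is an assumption for illustration, not the project's actual implementation.

import numpy as np
from collections import deque

class ReplayBuffer:
    """Dict-keyed buffer: one bounded deque per named field."""

    def __init__(self, limit, names, **kwargs):
        self.names = list(names)
        self._data = {name: deque(maxlen=limit) for name in self.names}

    @property
    def nb_entries(self):
        # all fields are appended together, so any one of them gives the count
        return len(self._data[self.names[0]])

    def append(self, experience):
        # experience is a dict holding one value per registered name
        for name in self.names:
            self._data[name].append(experience[name])

    def sample(self, batch_size):
        idxs = np.random.randint(0, self.nb_entries, size=batch_size)
        return {name: np.array([self._data[name][i] for i in idxs])
                for name in self.names}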
Example #2
    def __init__(self, state_size, action_size, seed, framework, buffer_type):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.framework = framework
        self.buffer_type = buffer_type

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        # def __init__(self, device, buffer_size, batch_size, alpha, beta):
        if self.buffer_type == 'PER_ReplayBuffer':
            self.memory = PER_ReplayBuffer(device, BUFFER_SIZE, BATCH_SIZE,
                                           ALPHA, BETA)
        if self.buffer_type == 'ReplayBuffer':
            self.memory = ReplayBuffer(device, action_size, BUFFER_SIZE,
                                       BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
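Neither PER_ReplayBuffer nor the plain ReplayBuffer selected above is defined in this snippet. A uniform-sampling buffer consistent with the ReplayBuffer(device, action_size, BUFFER_SIZE, BATCH_SIZE, seed) call, returning torch tensors on the given device, might be sketched as follows; field names and tensor shapes are assumptions.

import random
from collections import deque, namedtuple

import numpy as np
import torch

class ReplayBuffer:
    """Uniform replay consistent with ReplayBuffer(device, action_size, buffer_size, batch_size, seed)."""

    Experience = namedtuple("Experience",
                            ["state", "action", "reward", "next_state", "done"])

    def __init__(self, device, action_size, buffer_size, batch_size, seed):
        self.device = device
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.Experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        as_float = lambda xs: torch.as_tensor(
            np.asarray(xs), dtype=torch.float32, device=self.device)
        states = as_float([e.state for e in batch])
        actions = torch.as_tensor(np.asarray([e.action for e in batch]),
                                  dtype=torch.long, device=self.device).unsqueeze(1)
        rewards = as_float([e.reward for e in batch]).unsqueeze(1)
        next_states = as_float([e.next_state for e in batch])
        dones = as_float([float(e.done) for e in batch]).unsqueeze(1)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)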
Example #3
    def __init__(self,
                 env,
                 gamma=0.99,
                 tau=1e-3,
                 pol_lr=1e-4,
                 q_lr=5e-3,
                 batch_size=64,
                 buffer_size=10000,
                 target_noise=0.2,
                 action_noise=0.1,
                 clip_range=0.5,
                 update_delay=2):

        # environment stuff
        self.env = env
        self.num_act = env.action_space.shape[0]
        self.num_obs = env.observation_space.shape[0]
        self.eval_env = copy.deepcopy(env)

        # hyper parameters
        self.gamma = gamma
        self.tau = tau
        self.pol_lr = pol_lr
        self.q_lr = q_lr
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.target_noise = target_noise
        self.action_noise = action_noise
        self.clip_range = clip_range
        self.update_delay = update_delay

        # networks
        self.pol = Actor(self.num_obs, self.num_act, [400, 300]).double()
        self.q1 = Critic(self.num_obs, self.num_act, [400, 300]).double()
        self.q2 = Critic(self.num_obs, self.num_act, [400, 300]).double()
        self.pol.init_weights()
        self.q1.init_weights()
        self.q2.init_weights()
        self.target_pol = copy.deepcopy(self.pol).double()
        self.target_q1 = copy.deepcopy(self.q1).double()
        self.target_q2 = copy.deepcopy(self.q2).double()

        # optimizers, buffer
        self.pol_opt = torch.optim.Adam(self.pol.parameters(), lr=self.pol_lr)
        self.q1_opt = torch.optim.Adam(
            self.q1.parameters(),
            lr=self.q_lr,
        )
        self.q2_opt = torch.optim.Adam(
            self.q2.parameters(),
            lr=self.q_lr,
        )
        self.buffer = ReplayBuffer(self.buffer_size, 1000)
        self.mse_loss = torch.nn.MSELoss()

        self.cum_q1_loss = 0
        self.cum_q2_loss = 0
        self.cum_obj = 0
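Example #3 creates target copies of the actor and both critics and stores tau, but the target update itself falls outside this snippet. The usual TD3-style Polyak soft update, shown here as a generic sketch rather than this project's code:

import torch

@torch.no_grad()
def soft_update(target_net, source_net, tau):
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
        t_param.mul_(1.0 - tau).add_(tau * s_param)

# assumed usage inside the delayed update step:
#   soft_update(self.target_q1, self.q1, self.tau)
#   soft_update(self.target_q2, self.q2, self.tau)
#   soft_update(self.target_pol, self.pol, self.tau)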
Example #4
 def init(self, args, env):
     names = ['s0', 'a', 's1', 'r', 't', 'g']
     metrics = ['loss_dqn', 'loss_actor']
     self.buffer = ReplayBuffer(limit=int(1e6),
                                names=names.copy(),
                                args=args)
     self.actorCritic = ActorCriticDDPGG(args, env)
     for metric in metrics:
         self.metrics[metric] = 0
Example #5
    def __init__(self,
                 env,
                 state_dim: int,
                 action_dim: int,
                 config: Dict,
                 device=None,
                 writer=None):
        self.logger = logging.getLogger("MADDPG")
        self.device = device if device is not None else DEVICE
        self.writer = writer

        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.agents_number = config['agents_number']

        hidden_layers = config.get('hidden_layers', (400, 300))
        noise_scale = config.get('noise_scale', 0.2)
        noise_sigma = config.get('noise_sigma', 0.1)
        actor_lr = config.get('actor_lr', 1e-3)
        actor_lr_decay = config.get('actor_lr_decay', 0)
        critic_lr = config.get('critic_lr', 1e-3)
        critic_lr_decay = config.get('critic_lr_decay', 0)
        self.actor_tau = config.get('actor_tau', 0.002)
        self.critic_tau = config.get('critic_tau', 0.002)
        create_agent = lambda: DDPGAgent(state_dim,
                                         action_dim,
                                         agents=self.agents_number,
                                         hidden_layers=hidden_layers,
                                         actor_lr=actor_lr,
                                         actor_lr_decay=actor_lr_decay,
                                         critic_lr=critic_lr,
                                         critic_lr_decay=critic_lr_decay,
                                         noise_scale=noise_scale,
                                         noise_sigma=noise_sigma,
                                         device=self.device)
        self.agents = [create_agent() for _ in range(self.agents_number)]

        self.discount = 0.99 if 'discount' not in config else config['discount']
        self.gradient_clip = 1.0 if 'gradient_clip' not in config else config[
            'gradient_clip']

        self.warm_up = 1e3 if 'warm_up' not in config else config['warm_up']
        self.buffer_size = int(
            1e6) if 'buffer_size' not in config else config['buffer_size']
        self.batch_size = config.get('batch_size', 128)
        self.p_batch_size = config.get('p_batch_size',
                                       int(self.batch_size // 2))
        self.n_batch_size = config.get('n_batch_size',
                                       int(self.batch_size // 4))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.update_every_iterations = config.get('update_every_iterations', 2)
        self.number_updates = config.get('number_updates', 2)

        self.reset()
Example #6
 def init(self, args, env):
     names = ['s0', 'a', 's1', 'r', 't', 'g', 'm', 'task', 'mcr']
     metrics = ['loss_dqn', 'qval', 'val']
     self.buffer = ReplayBuffer(limit=int(1e6),
                                names=names.copy(),
                                args=args)
     self.actorCritic = ActorCriticDQNGM(args, env)
     for metric in metrics:
         self.metrics[metric] = 0
     self.goalcounts = np.zeros((len(self.env.goals), ))
Example #7
class DDPGG(DDPG):
    def __init__(self, args, env, env_test, logger):
        super(DDPGG, self).__init__(args, env, env_test, logger)

    def init(self, args, env):
        names = ['s0', 'a', 's1', 'r', 't', 'g']
        metrics = ['loss_dqn', 'loss_actor']
        self.buffer = ReplayBuffer(limit=int(1e6),
                                   names=names.copy(),
                                   args=args)
        self.actorCritic = ActorCriticDDPGG(args, env)
        for metric in metrics:
            self.metrics[metric] = 0

    def train(self):

        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            targets_dqn = self.actorCritic.get_targets_dqn(
                exp['r'], exp['t'], exp['s1'], exp['g'])
            inputs = [exp['s0'], exp['a'], exp['g'], targets_dqn]
            loss_dqn = self.actorCritic.trainQval(inputs)
            action, criticActionGrads, invertedCriticActionGrads = self.actorCritic.trainActor(
                [exp['s0'], exp['g']])
            self.metrics['loss_dqn'] += np.squeeze(loss_dqn)
            self.actorCritic.target_train()

    def make_input(self, state, mode):
        if mode == 'train':
            input = [np.expand_dims(i, axis=0) for i in [state, self.env.goal]]
        else:
            input = [
                np.expand_dims(i, axis=0) for i in [state, self.env_test.goal]
            ]
        return input

    def reset(self):

        if self.trajectory:
            self.env.end_episode(self.trajectory)
            for expe in self.trajectory:
                self.buffer.append(expe.copy())
            if self.args['--her'] != '0':
                augmented_ep = self.env.augment_episode(self.trajectory)
                for e in augmented_ep:
                    self.buffer.append(e)
            self.trajectory.clear()

        state = self.env.reset()
        self.episode_step = 0

        return state
Example #8
    def __init__(self, state_size, action_size, args):
        """
        Initialize a D4PG Agent.
        """

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.action_size = action_size
        self.state_size = state_size
        self.framework = args.framework
        self.eval = args.eval
        self.agent_count = 1
        self.learn_rate = args.learn_rate
        self.batch_size = args.batch_size
        self.buffer_size = args.buffer_size
        self.C = args.C
        self._epsilon = args.epsilon
        self.epsilon_decay = args.epsilon_decay
        self.epsilon_min = args.epsilon_min

        self.gamma = 0.99
        self.rollout = args.rollout
        self.tau = args.tau
        self.momentum = 1
        self.l2_decay = 0.0001
        self.update_type = "hard"

        self.t_step = 0
        self.episode = 0
        self.seed = 0

        # Set up memory buffers
        if args.prioritized_experience_replay:
            self.memory = PERBuffer(self.buffer_size, self.batch_size,
                                    self.framestack, self.device, args.alpha,
                                    args.beta)
            self.criterion = WeightedLoss()
        else:
            self.memory = ReplayBuffer(self.device, self.buffer_size,
                                       self.gamma, self.rollout)

        #                    Initialize Q networks                         #
        self.q = self._make_model(state_size, action_size, args.pixels)
        self.q_target = self._make_model(state_size, action_size, args.pixels)
        self._hard_update(self.q, self.q_target)
        self.q_optimizer = self._set_optimizer(self.q.parameters(),
                                               lr=self.learn_rate,
                                               decay=self.l2_decay,
                                               momentum=self.momentum)

        self.new_episode()
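The PERBuffer and WeightedLoss classes chosen when prioritized replay is enabled are not included in this excerpt. A minimal proportional prioritized buffer in the spirit of the alpha/beta arguments above (a sketch of standard PER, not this repository's implementation) could be:

import numpy as np

class ProportionalPER:
    """Sketch of proportional PER: P(i) ~ p_i**alpha, IS weights w_i ~ (N * P(i))**-beta."""

    def __init__(self, capacity, batch_size, alpha=0.6, beta=0.4):
        self.capacity = capacity
        self.batch_size = batch_size
        self.alpha, self.beta = alpha, beta
        self.data = []
        self.priorities = np.zeros(capacity, dtype=np.float64)
        self.pos = 0

    def add(self, transition):
        max_p = self.priorities.max() if self.data else 1.0
        if len(self.data) < self.capacity:
            self.data.append(transition)
        else:
            self.data[self.pos] = transition
        self.priorities[self.pos] = max_p      # new samples get max priority
        self.pos = (self.pos + 1) % self.capacity

    def sample(self):
        p = self.priorities[:len(self.data)] ** self.alpha
        probs = p / p.sum()
        idxs = np.random.choice(len(self.data), self.batch_size, p=probs)
        weights = (len(self.data) * probs[idxs]) ** (-self.beta)
        weights /= weights.max()               # normalize importance-sampling weights
        batch = [self.data[i] for i in idxs]
        return batch, idxs, weights

    def update_priorities(self, idxs, td_errors, eps=1e-6):
        self.priorities[idxs] = np.abs(td_errors) + eps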
Example #9
 def __init__(self, s_dim, num_actions, lr):
     self.step = 0
     self.epStep = 0
     self.ep = 0
     self.tutorListened = True
     self.tutorInput = ''
     self.sDim = s_dim
     self.num_actions = num_actions
     self.learning_rate = lr
     self.names = ['state0', 'action', 'feedback', 'fWeight']
     self.buffer = ReplayBuffer(limit=int(1e6), names=self.names)
     self.batchSize = 64
     self.episode = deque(maxlen=400)
     self.model = self.create_model()
Example #10
    def __init__(
        self,
        env,
        sub_states,
        layers,
        gamma=0.99,
        tau=1e-3,
        pol_lr=1e-4,
        q_lr=1e-3,
        batch_size=64,
        buffer_size=10000,
    ):

        # environment stuff
        self.env = env
        self.num_act = env.action_space.shape[0]
        self.num_obs = env.observation_space.shape[0]
        self.eval_env = copy.deepcopy(env)
        self.sub_states = sub_states
        self.layers = layers

        # hyper parameters
        self.gamma = gamma
        self.tau = tau
        self.pol_lr = pol_lr
        self.q_lr = q_lr
        self.batch_size = batch_size
        self.buffer_size = buffer_size

        # networks
        self.pol = Actor(self.num_obs, self.num_act, [400, 300]).double()
        # decomp critic
        self.q = DecompCritic(self.sub_states, self.num_act, layers).double()
        self.pol.init_weights()
        self.q.init_weights()
        self.target_pol = copy.deepcopy(self.pol).double()
        self.target_q = copy.deepcopy(self.q).double()

        # optimizers, buffer
        self.pol_opt = torch.optim.Adam(self.pol.parameters(), lr=self.pol_lr)
        self.q_opt = torch.optim.Adam(
            self.q.parameters(),
            lr=self.q_lr,
        )
        self.buffer = ReplayBuffer(self.buffer_size, 1000)
        self.mse_loss = torch.nn.MSELoss()

        self.cum_loss = 0
        self.cum_obj = 0
Example #11
    def __init__(self, env, hyperparameters, device, summary_writer=None):
        """Set parameters, initialize network."""

        state_space_shape = env.observation_space.shape
        action_space_size = env.action_space.n

        self.env = env

        self.online_network = DQN(state_space_shape,
                                  action_space_size).to(device)

        self.target_network = DQN(state_space_shape,
                                  action_space_size).to(device)

        # XXX maybe not really necessary?
        self.update_target_network()

        self.experience_replay = None

        self.accumulated_loss = []
        self.device = device

        self.optimizer = optim.Adam(self.online_network.parameters(),
                                    lr=hyperparameters['learning_rate'])

        self.double_DQN = hyperparameters['double_DQN']

        # Discount factor
        self.gamma = hyperparameters['gamma']

        # XXX ???
        self.n_multi_step = hyperparameters['n_multi_step']

        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                          hyperparameters['n_multi_step'],
                                          hyperparameters['gamma'])

        self.birth_time = time.time()

        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.summary_writer = summary_writer

        # Greedy search hyperparameters
        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']
Example #12
 def __init__(self,
              n_actions,
              buffer_size=1000000,
              behaviour_policy='epsilon_greedy',
              discount_factor=0.99,
              clip_grad_norm_value=10.0,
              policy_args={}):
     self.discount_factor = discount_factor
     self.clip_grad_norm_value = clip_grad_norm_value
     self.replay_buffer = ReplayBuffer(capacity=buffer_size)
     if behaviour_policy == 'epsilon_greedy':
         self.policy = EpsilonGreedyPolicy(policy_args)
     else:
         self.policy = SoftPolicy()
     self.q_network = QNetwork(n_actions).to(device)
Example #13
class DQNG(DQN):
    def __init__(self, args, env, env_test, logger):
        super(DQNG, self).__init__(args, env, env_test, logger)

    def init(self, args, env):
        names = ['state0', 'action', 'state1', 'reward', 'terminal', 'goal']
        self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy())
        if args['--imit'] != '0':
            names.append('expVal')
            self.bufferImit = ReplayBuffer(limit=int(1e6), names=names.copy())
        self.critic = CriticDQNG(args, env)

    def train(self):

        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            s0, a0, s1, r, t, g = [exp[name] for name in self.buffer.names]
            targets_dqn = self.critic.get_targets_dqn(r, t, s1, g)
            inputs = [s0, a0, g]
            loss = self.critic.qvalModel.train_on_batch(inputs, targets_dqn)
            for i, metric in enumerate(self.critic.qvalModel.metrics_names):
                self.metrics[metric] += loss[i]

            if (self.args['--imit'] != '0'
                    and self.bufferImit.nb_entries > self.batch_size):
                exp = self.bufferImit.sample(self.batch_size)
                s0, a0, s1, r, t, g, e = [
                    exp[name] for name in self.bufferImit.names
                ]
                targets_dqn = self.critic.get_targets_dqn(r, t, s1, g)
                targets = [
                    targets_dqn,
                    np.zeros((self.batch_size, 1)),
                    np.zeros((self.batch_size, 1))
                ]
                inputs = [s0, a0, g, e]
                loss = self.critic.imitModel.train_on_batch(inputs, targets)
                for i, metric in enumerate(
                        self.critic.imitModel.metrics_names):
                    self.imitMetrics[metric] += loss[i]

            self.critic.target_train()

    def make_input(self, state, t):
        input = [np.expand_dims(i, axis=0) for i in [state, self.env.goal]]
        # temp = self.env.explor_temp(t)
        input.append(np.expand_dims([0.5], axis=0))
        return input
Example #14
    def __init__(self, env, device, hyperparameters, summary_writer=None):
        '''
		Agent initialization. Creates the CentralControl that controls all of the agent's low-level logic.
		'''
        self.rewards = []
        self.total_reward = 0
        self.birth_time = 0
        self.n_iter = 0
        self.n_games = 0
        self.ts_frame = 0
        self.ts = time.time()

        self.Memory = namedtuple(
            'Memory', ['obs', 'action', 'new_obs', 'reward', 'done'],
            rename=False)

        # The CentralControl is the 'brain' of the agent
        self.cc = CentralControl(env.observation_space.shape,
                                 env.action_space.n, hyperparameters['gamma'],
                                 hyperparameters['n_multi_step'],
                                 hyperparameters['double_DQN'],
                                 hyperparameters['noisy_net'],
                                 hyperparameters['dueling'], device)

        self.cc.set_optimizer(hyperparameters['learning_rate'])

        self.birth_time = time.time()

        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']

        self.accumulated_loss = []
        self.device = device

        # initialize the replay buffer (i.e. the memory) of the agent
        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                          hyperparameters['n_multi_step'],
                                          hyperparameters['gamma'])
        self.summary_writer = summary_writer

        self.noisy_net = hyperparameters['noisy_net']

        self.env = env
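The ReplayBuffer(buffer_capacity, n_multi_step, gamma) built here (and in Examples #11, #15 and #25) is not shown. Below is a sketch of an n-step buffer compatible with the Memory namedtuple defined above and with the append / sample / len calls in Example #25; it folds n consecutive transitions into one discounted transition, and the project's actual class may differ.

import random
from collections import deque

class NStepReplayBuffer:
    """Sketch: stores n-step transitions built from the Memory namedtuple above."""

    def __init__(self, capacity, n_multi_step, gamma):
        self.buffer = deque(maxlen=capacity)
        self.n_step_queue = deque(maxlen=n_multi_step)
        self.n_multi_step = n_multi_step
        self.gamma = gamma

    def __len__(self):
        return len(self.buffer)

    def _emit(self):
        # fold the queued transitions into one n-step transition for the queue head
        reward, tail = 0.0, self.n_step_queue[-1]
        new_obs, done = tail.new_obs, tail.done
        for i, m in enumerate(self.n_step_queue):
            reward += (self.gamma ** i) * m.reward
            if m.done:
                new_obs, done = m.new_obs, True
                break
        head = self.n_step_queue[0]
        self.buffer.append(head._replace(new_obs=new_obs, reward=reward, done=done))

    def append(self, memory):
        self.n_step_queue.append(memory)
        if len(self.n_step_queue) == self.n_multi_step:
            self._emit()                      # the bounded deque drops the head on the next append
        if memory.done:
            if len(self.n_step_queue) < self.n_multi_step:
                self._emit()                  # short episode: emit the current head too
            while len(self.n_step_queue) > 1:
                self.n_step_queue.popleft()   # flush the remaining heads at episode end
                self._emit()
            self.n_step_queue.clear()

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)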
Example #15
    def __init__(self, env, device, cfg, summary_writer=None):
        '''
		Agent initialization. Creates the CentralControl that controls all of the agent's low-level logic.
		'''

        # The CentralControl is the 'brain' of the agent
        self.cc = CentralControl(env.observation_space.shape,
                                 env.action_space.n, cfg.rl.gamma,
                                 cfg.rl.n_multi_step,
                                 cfg.neural_net.double_dqn,
                                 cfg.neural_net.noisy_net,
                                 cfg.neural_net.dueling, device)

        self.cc.set_optimizer(cfg.train.learning_rate)

        self.birth_time = time.time()

        self.iter_update_target = cfg.replay.n_iter_update_target
        self.buffer_start_size = cfg.replay.buffer_start_size

        self.epsilon_start = cfg.rl.epsilon_start
        self.epsilon = cfg.rl.epsilon_start
        self.epsilon_decay = cfg.rl.epsilon_decay
        self.epsilon_final = cfg.rl.epsilon_final

        self.accumulated_loss = []
        self.device = device

        # initialize the replay buffer (i.e. the memory) of the agent
        self.replay_buffer = ReplayBuffer(cfg.replay.buffer_capacity,
                                          cfg.rl.n_multi_step, cfg.rl.gamma)
        self.summary_writer = summary_writer

        self.noisy_net = cfg.neural_net.noisy_net

        self.env = env

        self.total_reward = 0
        self.n_iter = 0
        self.n_games = 0
        self.ts_frame = 0
        self.ts = time.time()
        self.rewards = []
Example #16
    def __init__(self, env, args):
        super(PlayroomGM, self).__init__(env)

        self.gamma = float(args['--gamma'])
        self.eps = float(args['--eps'])
        self.demo_f = [int(f) for f in args['--demo'].split(',')]

        self.feat = np.array([int(f) for f in args['--features'].split(',')])
        self.N = self.feat.shape[0]
        vs = np.zeros(shape=(self.N, self.state_dim[0]))
        vs[np.arange(self.N), self.feat] = 1
        self.vs = vs / np.sum(vs, axis=1, keepdims=True)
        self.R = 100
        self.idx = -1
        self.v = np.zeros(shape=(self.state_dim[0], 1))
        self.g = np.ones(shape=(self.state_dim[0]))
        self.queues = [CompetenceQueue() for _ in range(self.N)]
        self.names = ['s0', 'r0', 'a', 's1', 'r1', 'g', 'v', 'o', 'u']
        self.buffer = ReplayBuffer(limit=int(1e5), names=self.names, N=self.N)
Example #17
class Qoff(Agent):
    def __init__(self, args, env, env_test, logger):
        super(Qoff, self).__init__(args, env, env_test, logger)
        self.args = args
        self.gamma = 0.99
        self.lr = 0.1
        self.names = ['state0', 'action', 'state1', 'reward', 'terminal']
        self.init(args, env)

    def init(self, args, env):
        self.critic = np.zeros(shape=(5, 5, 4))
        self.buffer = ReplayBuffer(limit=int(1e6), names=self.names)

    def train(self):
        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            s0, a0, s1, r, t = [exp[name] for name in self.names]
            for k in range(self.batch_size):
                target = r[k] + (1 - t[k]) * self.gamma * np.max(
                    self.critic[tuple(s1[k])])
                self.critic[tuple(s0[k])][a0[k]] = self.lr * target + \
                                                       (1 - self.lr) * self.critic[tuple(s0[k])][a0[k]]

    def act(self, state):
        if np.random.rand() < 0.2:
            action = np.random.randint(self.env.action_space.n)
        else:
            action = np.argmax(self.critic[tuple(state)])
        return action

    def reset(self):

        if self.trajectory:
            self.env.processEp(self.trajectory)
            for expe in reversed(self.trajectory):
                self.buffer.append(expe.copy())
            self.trajectory.clear()

        state = self.env.reset()
        self.episode_step = 0

        return state
Example #18
File: sac.py Project: vietbt/RLpp
    def __init__(self,
                 env,
                 gamma=0.99,
                 tau=0.005,
                 learning_rate=3e-4,
                 buffer_size=50000,
                 learning_starts=100,
                 train_freq=1,
                 batch_size=64,
                 target_update_interval=1,
                 gradient_steps=1,
                 target_entropy='auto',
                 ent_coef='auto',
                 random_exploration=0.0,
                 discrete=True,
                 regularized=True,
                 feature_extraction="cnn"):
        self.env = env
        self.learning_starts = learning_starts
        self.random_exploration = random_exploration
        self.train_freq = train_freq
        self.target_update_interval = target_update_interval
        self.batch_size = batch_size
        self.gradient_steps = gradient_steps
        self.learning_rate = learning_rate

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf.Session(graph=self.graph)
            self.replay_buffer = ReplayBuffer(buffer_size)
            self.agent = SACAgent(self.sess,
                                  env,
                                  discrete=discrete,
                                  regularized=regularized,
                                  feature_extraction=feature_extraction)
            self.model = SACModel(self.sess, self.agent, target_entropy,
                                  ent_coef, gamma, tau)
            with self.sess.as_default():
                self.sess.run(tf.global_variables_initializer())
                self.sess.run(self.model.target_init_op)
        self.num_timesteps = 0
Example #19
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #20
def muzero(config: MuZeroConfig):
    ray.init()
    storage = SharedStorage.remote(config)
    replay_buffer = ReplayBuffer.remote(config)

    leaner = Leaner.remote(config, storage, replay_buffer)

    temperatures = list(np.linspace(1.0, 0.1, num=config.num_actors))
    actors = [
        Actor.remote(config, storage, replay_buffer, temperature)
        for temperature in temperatures
    ]
    workers = [leaner] + actors

    ray.get([worker.start.remote() for worker in workers])
    ray.shutdown()
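SharedStorage.remote, ReplayBuffer.remote, Leaner.remote and Actor.remote all rely on those classes being declared as Ray actors. A sketch of that pattern is shown below; the MuZeroConfig fields and the method names are assumptions, not this project's API.

import ray

@ray.remote
class ReplayBuffer:
    """Decorating the class with @ray.remote lets ReplayBuffer.remote(config)
    spawn it as a long-lived actor shared by the learner and the self-play actors."""

    def __init__(self, config):
        self.window_size = config.window_size   # assumed MuZeroConfig field
        self.games = []

    def save_game(self, game):                  # assumed method name
        if len(self.games) >= self.window_size:
            self.games.pop(0)
        self.games.append(game)

    def size(self):
        return len(self.games)

# assumed usage from another worker:
#   replay_buffer = ReplayBuffer.remote(config)
#   replay_buffer.save_game.remote(game)
#   n = ray.get(replay_buffer.size.remote())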
Example #21
File: sac.py Project: vietbt/RLpp
class SAC:
    def __init__(self,
                 env,
                 gamma=0.99,
                 tau=0.005,
                 learning_rate=3e-4,
                 buffer_size=50000,
                 learning_starts=100,
                 train_freq=1,
                 batch_size=64,
                 target_update_interval=1,
                 gradient_steps=1,
                 target_entropy='auto',
                 ent_coef='auto',
                 random_exploration=0.0,
                 discrete=True,
                 regularized=True,
                 feature_extraction="cnn"):
        self.env = env
        self.learning_starts = learning_starts
        self.random_exploration = random_exploration
        self.train_freq = train_freq
        self.target_update_interval = target_update_interval
        self.batch_size = batch_size
        self.gradient_steps = gradient_steps
        self.learning_rate = learning_rate

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf.Session(graph=self.graph)
            self.replay_buffer = ReplayBuffer(buffer_size)
            self.agent = SACAgent(self.sess,
                                  env,
                                  discrete=discrete,
                                  regularized=regularized,
                                  feature_extraction=feature_extraction)
            self.model = SACModel(self.sess, self.agent, target_entropy,
                                  ent_coef, gamma, tau)
            with self.sess.as_default():
                self.sess.run(tf.global_variables_initializer())
                self.sess.run(self.model.target_init_op)
        self.num_timesteps = 0

    def train(self, learning_rate):
        batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = self.replay_buffer.sample(
            self.batch_size)
        # print("batch_actions:", batch_actions.shape)
        # print("self.agent.actions_ph:", self.agent.actions_ph)

        feed_dict = {
            self.agent.obs_ph: batch_obs,
            self.agent.next_obs_ph: batch_next_obs,
            self.model.rewards_ph: batch_rewards.reshape(self.batch_size, -1),
            self.model.terminals_ph: batch_dones.reshape(self.batch_size, -1),
            self.model.learning_rate_ph: learning_rate
        }
        if not self.agent.discrete:
            feed_dict[self.agent.actions_ph] = batch_actions
        else:
            batch_actions = batch_actions.reshape(-1)
            feed_dict[self.agent.actions_ph] = batch_actions
        policy_loss, qf1_loss, qf2_loss, value_loss, *values = self.sess.run(
            self.model.step_ops, feed_dict)
        return policy_loss, qf1_loss, qf2_loss

    def learn(self, total_timesteps):
        learning_rate = get_schedule_fn(self.learning_rate)
        episode_rewards = [0]
        mb_losses = []
        obs = self.env.reset()
        for step in range(total_timesteps):
            if (self.num_timesteps < self.learning_starts
                    or np.random.rand() < self.random_exploration):
                unscaled_action = self.env.action_space.sample()
                action = scale_action(self.env.action_space, unscaled_action)
            else:
                action = self.agent.step(obs[None]).flatten()
                unscaled_action = unscale_action(self.env.action_space, action)
            # print("\nunscaled_action:", unscaled_action)
            new_obs, reward, done, _ = self.env.step(unscaled_action)
            self.num_timesteps += 1
            self.replay_buffer.add(obs, action, reward, new_obs, done)
            obs = new_obs

            if self.num_timesteps % self.train_freq == 0:
                for grad_step in range(self.gradient_steps):
                    if (not self.replay_buffer.can_sample(self.batch_size)
                            or self.num_timesteps < self.learning_starts):
                        break
                    frac = 1.0 - step / total_timesteps
                    current_lr = learning_rate(frac)
                    mb_losses.append(self.train(current_lr))
                    if (step + grad_step) % self.target_update_interval == 0:
                        self.sess.run(self.model.target_update_op)

            episode_rewards[-1] += reward
            if done:
                obs = self.env.reset()
                episode_rewards.append(0)

            mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)
            loss_str = "/".join([f"{x:.3f}" for x in np.mean(mb_losses, 0)
                                 ]) if len(mb_losses) > 0 else "NaN"
            print(f"Step {step} - reward: {mean_reward} - loss: {loss_str}",
                  end="\n" if step % 500 == 0 else "\r")
Example #22
class Agent():
    def __init__(self, s_dim, num_actions, lr):
        self.step = 0
        self.epStep = 0
        self.ep = 0
        self.tutorListened = True
        self.tutorInput = ''
        self.sDim = s_dim
        self.num_actions = num_actions
        self.learning_rate = lr
        self.names = ['state0', 'action', 'feedback', 'fWeight']
        self.buffer = ReplayBuffer(limit=int(1e6), names=self.names)
        self.batchSize = 64
        self.episode = deque(maxlen=400)
        self.model = self.create_model()

    def create_model(self):
        state = Input(shape=self.sDim)
        action = Input(shape=(1,), dtype='uint8')
        l1 = Dense(400, activation="relu")(state)
        feedback = Dense(self.num_actions, activation=None, kernel_initializer='random_uniform')(l1)
        feedback = Reshape((1, self.num_actions))(feedback)
        mask = Lambda(K.one_hot, arguments={'num_classes': self.num_actions},
                      output_shape=(self.num_actions,))(action)
        feedback = multiply([feedback, mask])
        feedback = Lambda(K.sum, arguments={'axis': 2})(feedback)
        feedbackModel = Model(inputs=[state, action], outputs=feedback)
        feedbackModel.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return feedbackModel

    def train(self):
        loss = 0
        if self.buffer.nb_entries > self.batchSize:
            samples = self.buffer.sample(self.batchSize)
            s, a, targets, weights = [np.array(samples[name]) for name in self.names]
            loss = self.model.train_on_batch(x=[s,a], y=targets, sample_weight=weights)
        return loss

    def tutorListener(self):
        self.tutorInput = input("> ")
        print("maybe updating...the kbdInput variable is: {}".format(self.tutorInput))
        self.tutorListened = True

    def run(self):
        state0 = np.random.randint(0, 4, size=(5,))
        while self.step < 100000:

            if self.tutorInput != '':
                print("Received new keyboard Input. Setting playing ID to keyboard input value")
                for i in range(1,10):
                    self.episode[-i]['fWeight'] = 1
                    self.episode[-i]['feedback'] = self.tutorInput
                self.tutorInput = ''
            else:
                action = np.random.randint(self.num_actions)
                state1 = np.random.randint(0, 4, size=(5,))
                self.step += 1
                self.epStep += 1
                experience = {'state0': state0, 'action': action, 'fWeight': 0}
                self.episode.append(experience)
                self.loss = self.train()
                state0 = state1
                time.sleep(0.001)

            if self.tutorListened:
                self.tutorListened = False
                self.listener = Thread(target=self.tutorListener)
                self.listener.start()

            if self.epStep >= 200:
                if self.ep > 0:
                    for s in range(self.epStep):
                        exp = self.episode.popleft()
                        if exp['fWeight'] != 0:
                            self.buffer.append(exp)
                self.epStep = 0
                self.ep += 1
                state0 = np.random.randint(0, 4, size=(5,))
            if self.step % 1000 == 0:
                print(self.step, self.loss)

    def input(self):
        while True:
            if input() == '+':
                inputStep = self.step
                time.sleep(2)
                print('input +1, step = ', inputStep)
            elif input() == '-':
                inputStep = self.step
                time.sleep(2)
                print('input -1, step = ', inputStep)
            else:
                print('wrong input')
Example #23
File: main.py Project: yosider/RLSnipets
def main():
    with tf.Session() as sess:

        actor = ActorNetwork(sess, STATE_DIM, ACTION_DIM, ACTION_BOUND,
                             ACTOR_LEARNING_RATE, TAU, MINIBATCH_SIZE)
        critic = CriticNetwork(sess, STATE_DIM, ACTION_DIM,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        #actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))

        #TODO: Ornstein-Uhlenbeck noise.

        sess.run(tf.global_variables_initializer())

        # initialize target net
        actor.update_target_network()
        critic.update_target_network()

        # initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE)

        # main loop.
        for ep in range(MAX_EPISODES):

            episode_reward = 0
            ep_batch_avg_q = 0

            s = ENV.reset()

            for step in range(MAX_EP_STEPS):

                a = actor.predict(np.reshape(s,
                                             (1, STATE_DIM)))  #+ actor_noise()
                s2, r, terminal, info = ENV.step(a[0])
                #print(s2)

                replay_buffer.add(np.reshape(s, (STATE_DIM,)), \
                                np.reshape(a, (ACTION_DIM,)), \
                                r, \
                                terminal, \
                                np.reshape(s2, (STATE_DIM,)))

                # Batch sampling.
                if replay_buffer.size() > MINIBATCH_SIZE and \
                    step % TRAIN_INTERVAL == 0:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Compute the target Q values.
                    target_action = actor.predict_target(s2_batch)
                    target_q = critic.predict_target(s2_batch, target_action)

                    # Compute the target values for the critic.
                    targets = []
                    for i in range(MINIBATCH_SIZE):
                        if t_batch[i]:
                            # terminal
                            targets.append(r_batch[i])
                        else:
                            targets.append(r_batch[i] + GAMMA * target_q[i])

                    # Train the critic.
                    #TODO: predQ comes from a random batch, not an episode, so the episode_avg_max statistic is not appropriate.
                    pred_q, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(targets, (MINIBATCH_SIZE, 1)))

                    # Train the actor.
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    #print(grads[0].shape)
                    #exit(1)
                    actor.train(s_batch, grads[0])

                    # Update target networks.
                    # Should this be done only once every few batches?
                    actor.update_target_network()
                    critic.update_target_network()

                    ep_batch_avg_q += np.mean(pred_q)

                s = s2
                episode_reward += r

                if terminal:
                    print('Episode:', ep, 'Reward:', episode_reward)
                    reward_log.append(episode_reward)
                    q_log.append(ep_batch_avg_q / step)

                    break
Example #24
def train_expert(env_name):
    """Train expert policy in given environment."""
    if env_name == 'InvertedPendulum-v2':
        env = ExpertInvertedPendulumEnv()
        episode_limit = 200
        return_threshold = 200
    elif env_name == 'InvertedDoublePendulum-v2':
        env = ExpertInvertedDoublePendulumEnv()
        episode_limit = 50
        return_threshold = 460
    elif env_name == 'ThreeReacherEasy-v2':
        env = ThreeReacherEasyEnv()
        episode_limit = 50
        return_threshold = -0.8
    elif env_name == 'ReacherEasy-v2':
        env = ReacherEasyEnv()
        episode_limit = 50
        return_threshold = -0.8
    elif env_name == 'Hopper-v2':
        env = HopperEnv()
        episode_limit = 200
        return_threshold = 600
    elif env_name == 'HalfCheetah-v2':
        env = ExpertHalfCheetahEnv()
        episode_limit = 200
        return_threshold = 1000
    elif env_name == 'StrikerHumanSim-v2':
        env = StrikerHumanSimEnv()
        episode_limit = 200
        return_threshold = -190
    elif env_name == 'PusherHumanSim-v2':
        env = PusherHumanSimEnv()
        episode_limit = 200
        return_threshold = -80
    else:
        raise NotImplementedError
    buffer_size = 1000000
    init_random_samples = 1000
    exploration_noise = 0.2
    learning_rate = 3e-4
    batch_size = 256
    epochs = 200
    steps_per_epoch = 5000
    updates_per_step = 1
    update_actor_every = 1
    start_training = 512
    gamma = 0.99
    polyak = 0.995
    entropy_coefficient = 0.2
    clip_actor_gradients = False
    visual_env = True
    action_size = env.action_space.shape[0]
    tune_entropy_coefficient = True
    target_entropy = -1 * action_size

    def make_actor():
        actor = StochasticActor([
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(action_size * 2)
        ])
        return actor

    def make_critic():
        critic = Critic([
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(1)
        ])
        return critic

    optimizer = tf.keras.optimizers.Adam(learning_rate)

    replay_buffer = ReplayBuffer(buffer_size)
    sampler = Sampler(env,
                      episode_limit=episode_limit,
                      init_random_samples=init_random_samples,
                      visual_env=visual_env)
    agent = SAC(make_actor,
                make_critic,
                make_critic,
                actor_optimizer=optimizer,
                critic_optimizer=optimizer,
                gamma=gamma,
                polyak=polyak,
                entropy_coefficient=entropy_coefficient,
                tune_entropy_coefficient=tune_entropy_coefficient,
                target_entropy=target_entropy,
                clip_actor_gradients=clip_actor_gradients)
    if visual_env:
        obs = np.expand_dims(env.reset()['obs'], axis=0)
    else:
        obs = np.expand_dims(env.reset(), axis=0)
    agent(obs)
    agent.summary()

    mean_test_returns = []
    mean_test_std = []
    steps = []

    step_counter = 0
    for e in range(epochs):
        while step_counter < (e + 1) * steps_per_epoch:
            traj_data = sampler.sample_trajectory(agent, exploration_noise)
            replay_buffer.add(traj_data)
            if step_counter > start_training:
                agent.train(replay_buffer,
                            batch_size=batch_size,
                            n_updates=updates_per_step * traj_data['n'],
                            act_delay=update_actor_every)
            step_counter += traj_data['n']
        print('Epoch {}/{} - total steps {}'.format(e + 1, epochs,
                                                    step_counter))
        out = sampler.evaluate(agent, 10)
        mean_test_returns.append(out['mean'])
        mean_test_std.append(out['std'])
        steps.append(step_counter)
        if out['mean'] >= return_threshold:
            print('Early termination due to reaching return threshold')
            break
    plt.errorbar(steps, mean_test_returns, mean_test_std)
    plt.xlabel('steps')
    plt.ylabel('returns')
    plt.show()
    return agent
Example #25
class DQNAgent():
    '''
	Agent class. It controls all of the agent's functionality.
	'''
    rewards = []
    total_reward = 0
    birth_time = 0
    n_iter = 0
    n_games = 0
    ts_frame = 0
    ts = time.time()

    Memory = namedtuple('Memory',
                        ['obs', 'action', 'new_obs', 'reward', 'done'],
                        rename=False)

    def __init__(self, env, device, hyperparameters, summary_writer=None):
        '''
		Agent initialization. Creates the CentralControl that controls all of the agent's low-level logic.
		'''

        # The CentralControl is the 'brain' of the agent
        self.cc = CentralControl(env.observation_space.shape,
                                 env.action_space.n, hyperparameters['gamma'],
                                 hyperparameters['n_multi_step'],
                                 hyperparameters['double_DQN'],
                                 hyperparameters['noisy_net'],
                                 hyperparameters['dueling'], device)

        self.cc.set_optimizer(hyperparameters['learning_rate'])

        self.birth_time = time.time()

        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']

        self.accumulated_loss = []
        self.device = device

        # initialize the replay buffer (i.e. the memory) of the agent
        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                          hyperparameters['n_multi_step'],
                                          hyperparameters['gamma'])
        self.summary_writer = summary_writer

        self.noisy_net = hyperparameters['noisy_net']

        self.env = env

    def act(self, obs):
        '''
		Greedy action outputted by the NN in the CentralControl
		'''
        return self.cc.get_max_action(obs)

    def act_eps_greedy(self, obs):
        '''
		E-greedy action
		'''

        # In case of a noisy net, it takes a greedy action
        if self.noisy_net:
            return self.act(obs)

        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return self.act(obs)

    def add_env_feedback(self, obs, action, new_obs, reward, done):
        '''
		Acquire new feedback from the environment: the new observation, the reward and the done flag.
		'''

        # Create the new memory and update the buffer
        new_memory = self.Memory(obs=obs,
                                 action=action,
                                 new_obs=new_obs,
                                 reward=reward,
                                 done=done)
        self.replay_buffer.append(new_memory)

        # update the variables
        self.n_iter += 1
        # decrease epsilon
        self.epsilon = max(
            self.epsilon_final,
            self.epsilon_start - self.n_iter / self.epsilon_decay)
        self.total_reward += reward

    def sample_and_optimize(self, batch_size):
        '''
		Sample batch_size memories from the buffer and optimize them
		'''

        if len(self.replay_buffer) > self.buffer_start_size:
            # sample
            mini_batch = self.replay_buffer.sample(batch_size)
            # optimize
            l_loss = self.cc.optimize(mini_batch)
            self.accumulated_loss.append(l_loss)

        # update target NN
        if self.n_iter % self.iter_update_target == 0:
            self.cc.update_target()

    def reset_stats(self):
        '''
		Reset the agent's statistics
		'''
        self.rewards.append(self.total_reward)
        self.total_reward = 0
        self.accumulated_loss = []
        self.n_games += 1

    def print_info(self):
        '''
		Print information about the agent
		'''
        fps = (self.n_iter - self.ts_frame) / (time.time() - self.ts)
        print('%d %d rew:%d mean_rew:%.2f eps:%.2f, fps:%d, loss:%.4f' %
              (self.n_iter, self.n_games, self.total_reward,
               np.mean(self.rewards[-40:]), self.epsilon, fps,
               np.mean(self.accumulated_loss)))

        self.ts_frame = self.n_iter
        self.ts = time.time()

        if self.summary_writer != None:
            self.summary_writer.add_scalar('reward', self.total_reward,
                                           self.n_games)
            self.summary_writer.add_scalar('mean_reward',
                                           np.mean(self.rewards[-40:]),
                                           self.n_games)
            self.summary_writer.add_scalar('10_mean_reward',
                                           np.mean(self.rewards[-10:]),
                                           self.n_games)
            self.summary_writer.add_scalar('epsilon', self.epsilon,
                                           self.n_games)
            self.summary_writer.add_scalar('loss',
                                           np.mean(self.accumulated_loss),
                                           self.n_games)
Example #26
 def init(self, args, env):
     self.critic = np.zeros(shape=(5, 5, 4))
     self.buffer = ReplayBuffer(limit=int(1e6), names=self.names)
Example #27
    def learn(self, timesteps=10000, verbose=0, seed=None):
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)

        self.eps_range = self._eps_range(timesteps)
        replay_buffer = ReplayBuffer(self.buffer_size)

        self._init_model()

        obs = self.env.reset()
        for step in range(timesteps):
            # while not done:
            cur_eps = next(self.eps_range, None)
            if cur_eps is None:
                cur_eps = self.final_eps

            action = self._select_action(obs, cur_eps)

            new_obs, rewards, done, info = self.env.step(action)
            if done:
                new_obs = [
                    np.nan
                ] * self.obs_shape[0]  # hacky way to keep dimensions correct
            replay_buffer.add(obs, action, rewards, new_obs)

            obs = new_obs

            # learn gradient
            if step > self.learning_starts:
                if len(replay_buffer.buffer) < self.batch_size:  # buffer too small
                    continue
                samples = replay_buffer.sample(self.batch_size, self.device)
                obs_batch, actions_batch, rewards_batch, new_obs_batch = samples

                predicted_q_values = self._predictQValue(
                    self.step_model, obs_batch, actions_batch)
                ys = self._expectedLabels(self.target_model, new_obs_batch,
                                          rewards_batch)

                loss = F.smooth_l1_loss(predicted_q_values, ys)

                self.optim.zero_grad()
                loss.backward()
                for i in self.step_model.parameters():
                    i.grad.clamp_(min=-1, max=1)  # exploding gradient
                    # i.grad.clamp_(min=-10, max=10) # exploding gradient
                self.optim.step()

                # update target
                if step % self.target_network_update_freq == 0:
                    self.target_model.load_state_dict(
                        self.step_model.state_dict())

            if done:
                obs = self.env.reset()
            if verbose == 1:
                if step % (timesteps * 0.1) == 0:
                    perc = int(step / (timesteps * 0.1))
                    print(f"At step {step}")
                    print(f"{perc}% done")
Example #28
class DDPG(Agent):
    def __init__(self, args, env, env_test, logger):
        super(DDPG, self).__init__(args, env, env_test, logger)
        self.args = args
        self.init(args, env)
        for metric in self.critic.model.metrics_names:
            self.metrics[self.critic.model.name + '_' + metric] = 0

    def init(self, args, env):
        names = ['state0', 'action', 'state1', 'reward', 'terminal']
        self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy())
        self.critic = CriticDDPG(args, env)
        self.actor = ActorDDPG(args, env)

    def train(self):

        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            s0, a0, s1, r, t = [exp[name] for name in self.buffer.names]
            a1 = self.actor.target_model.predict_on_batch(s1)
            a1 = np.clip(a1, self.env.action_space.low,
                         self.env.action_space.high)
            q = self.critic.Tmodel.predict_on_batch([s1, a1])
            targets = r + (1 - t) * self.critic.gamma * np.squeeze(q)
            targets = np.clip(targets, self.env.minR / (1 - self.critic.gamma),
                              self.env.maxR)
            inputs = [s0, a0]
            loss = self.critic.model.train_on_batch(inputs, targets)
            for i, metric in enumerate(self.critic.model.metrics_names):
                self.metrics[metric] += loss[i]

            # a2 = self.actor.model.predict_on_batch(s0)
            # grads = self.critic.gradsModel.predict_on_batch([s0, a2])
            # low = self.env.action_space.low
            # high = self.env.action_space.high
            # for d in range(grads[0].shape[0]):
            #     width = high[d] - low[d]
            #     for k in range(self.batch_size):
            #         if grads[k][d] >= 0:
            #             grads[k][d] *= (high[d] - a2[k][d]) / width
            #         else:
            #             grads[k][d] *= (a2[k][d] - low[d]) / width
            # self.actor.train(s0, grads)

            self.actor.target_train()
            self.critic.target_train()

    def reset(self):

        if self.trajectory:
            T = int(self.trajectory[-1]['terminal'])
            R = np.sum([
                self.env.unshape(exp['reward'], exp['terminal'])
                for exp in self.trajectory
            ])
            S = len(self.trajectory)
            self.env.processEp(R, S, T)
            for expe in reversed(self.trajectory):
                self.buffer.append(expe.copy())

            self.trajectory.clear()

        state = self.env.reset()
        self.episode_step = 0

        return state

    def make_input(self, state):
        input = [np.reshape(state, (1, self.actor.s_dim[0]))]
        return input

    def act(self, state):
        input = self.make_input(state)
        action = self.actor.model.predict(input, batch_size=1)
        noise = np.random.normal(0., 0.1, size=action.shape)
        action = noise + action
        action = np.clip(action, self.env.action_space.low,
                         self.env.action_space.high)
        action = action.squeeze()
        return action
Example #29
class PlayroomGM(Wrapper):
    def __init__(self, env, args):
        super(PlayroomGM, self).__init__(env)

        self.gamma = float(args['--gamma'])
        self.eps = float(args['--eps'])
        self.demo_f = [int(f) for f in args['--demo'].split(',')]

        self.feat = np.array([int(f) for f in args['--features'].split(',')])
        self.N = self.feat.shape[0]
        vs = np.zeros(shape=(self.N, self.state_dim[0]))
        vs[np.arange(self.N), self.feat] = 1
        self.vs = vs / np.sum(vs, axis=1, keepdims=True)
        self.R = 100
        self.idx = -1
        self.v = np.zeros(shape=(self.state_dim[0], 1))
        self.g = np.ones(shape=(self.state_dim[0]))
        self.queues = [CompetenceQueue() for _ in range(self.N)]
        self.names = ['s0', 'r0', 'a', 's1', 'r1', 'g', 'v', 'o', 'u']
        self.buffer = ReplayBuffer(limit=int(1e5), names=self.names, N=self.N)

    def reset(self, exp):
        self.idx, self.v = self.sample_v(exp['s0'])
        exp['g'] = self.g
        exp['v'] = self.v
        return exp

    def get_r(self, s, g, v):
        return self.R * np.sum(np.multiply(v, s == g), axis=1, keepdims=True)

    def sample_v(self, s):
        remaining_v = [i for i in range(self.N) if s[self.feat[i]] != 1]
        probs = self.get_probs(idxs=remaining_v, eps=self.eps)
        idx = np.random.choice(remaining_v, p=probs)
        v = self.vs[idx]
        return idx, v

    def sampleT(self, batch_size):
        idxs = [
            i for i in range(self.N)
            if self.buffer._tutorBuffers[i]._numsamples > batch_size
        ]
        probs = self.get_probs(idxs=idxs, eps=self.eps)
        t = np.random.choice(idxs, p=probs)
        samples = self.buffer.sampleT(batch_size, t)
        return samples, t

    def end_episode(self, episode):
        term = episode[-1]['r1'][self.idx] == self.R
        self.queues[self.idx].process_ep(episode, term)
        base_util = np.zeros(shape=(self.N, ))
        base_util[self.idx] = 1
        self.process_trajectory(episode, base_util=base_util)

    def process_trajectory(self, trajectory, base_util=None):
        if base_util is None:
            u = np.zeros(shape=(self.N, ))
        else:
            u = base_util
        u = np.expand_dims(u, axis=1)
        # mcr = np.zeros(shape=(self.N,))
        for exp in reversed(trajectory):
            u = self.gamma * u
            u[np.where(exp['r1'] > exp['r0'])] = 1

            # u_idx = np.where(u != 0)
            # mcr[u_idx] = exp['r1'][u_idx] + self.gamma * mcr[u_idx]
            exp['u'] = u.squeeze()
            # exp['mcr'] = mcr
            if any(u != 0):
                self.buffer.append(exp.copy())

    # def sample(self, batchsize):
    #     probs = self.get_probs(idxs=range(self.N), eps=self.eps2)
    #     idx = np.random.choice(self.N, p=probs)
    #     samples = self.buffer.sample(batchsize, idx)
    #     if samples is not None:
    #         self.queues[idx].process_samples(samples)
    #     return idx, samples
    #
    # def sampleT(self, batchsize):
    #     probs = self.get_probs(idxs=range(self.N), eps=self.eps3)
    #     idx = np.random.choice(self.N, p=probs)
    #     samples = self.buffer.sampleT(batchsize, idx)
    #     if samples is not None:
    #         self.queues[idx].process_samplesT(samples)
    #     return idx, samples

    def get_demo(self):
        demo = []
        exp = {}
        exp['s0'] = self.env.reset()
        exp['r0'] = self.get_r(exp['s0'], self.g, self.vs).squeeze()
        exp['g'] = self.g
        task = np.random.choice(self.demo_f)
        exp['v'] = self.vs[list(self.feat).index(task)]
        while True:
            a, done = self.opt_action(task)
            if done:
                break
            else:
                exp['a'] = np.expand_dims(a, axis=1)
                exp['s1'] = self.env.step(exp['a'], True)[0]
                exp['r1'] = self.get_r(exp['s1'], self.g, self.vs).squeeze()
                exp['o'] = 1
                demo.append(exp.copy())
                exp['s0'] = exp['s1']
                exp['r0'] = exp['r1']

        return demo, task

    def opt_action(self, t):
        return self.env.opt_action(t)

    def get_stats(self):
        stats = {}
        for i, f in enumerate(self.feat):
            self.queues[i].update()
            for key, val in self.queues[i].get_stats().items():
                stats[key + str(f)] = val
            self.queues[i].init_stat()
        return stats

    def get_cps(self):
        return [np.maximum(abs(q.CP + 0.05) - 0.05, 0) for q in self.queues]

    def get_probs(self, idxs, eps):
        cps = self.get_cps()
        vals = [cps[idx] for idx in idxs]
        l = len(vals)
        s = np.sum(vals)
        if s == 0:
            probs = [1 / l] * l
        else:
            probs = [eps / l + (1 - eps) * v / s for v in vals]
        return probs

    @property
    def state_dim(self):
        return 8,

    @property
    def goal_dim(self):
        return 8,

    @property
    def action_dim(self):
        return 5
Example #30
    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        kwargs["expl_noise"] = args.expl_noise
        kwargs["tau"] = args.tau
        policy = TD3(**kwargs)
    elif args.policy == "SAC":
        kwargs["policy_freq"] = args.policy_freq
        kwargs["tau"] = args.tau
        policy = SAC(**kwargs)
    elif args.policy == "MPO":
        policy = MPO(**kwargs)
    if args.load_model != "":
        policy_file = (args.file_name
                       if args.load_model == "default" else args.load_model)
        policy.load(f"./models/{policy_file}")

    replay_buffer = ReplayBuffer(
        state_dim,
        action_dim,
        max_size=int(args.buffer_size),
    )

    train_loop = TRAIN_LOOPS[args.policy]

    train_loop(args, policy, replay_buffer, env)
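The ReplayBuffer(state_dim, action_dim, max_size=...) constructed above is not defined in this excerpt. A sketch in the style of the widely used TD3 reference buffer, with preallocated arrays and a circular write pointer, is given below; the project's actual class may differ.

import numpy as np
import torch

class ReplayBuffer:
    """Sketch of an array-backed buffer implied by ReplayBuffer(state_dim, action_dim, max_size)."""

    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        self.max_size = max_size
        self.ptr = 0
        self.size = 0
        self.state = np.zeros((max_size, state_dim))
        self.action = np.zeros((max_size, action_dim))
        self.next_state = np.zeros((max_size, state_dim))
        self.reward = np.zeros((max_size, 1))
        self.not_done = np.zeros((max_size, 1))

    def add(self, state, action, next_state, reward, done):
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1.0 - done
        self.ptr = (self.ptr + 1) % self.max_size        # overwrite oldest entries when full
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        ind = np.random.randint(0, self.size, size=batch_size)
        return (torch.FloatTensor(self.state[ind]),
                torch.FloatTensor(self.action[ind]),
                torch.FloatTensor(self.next_state[ind]),
                torch.FloatTensor(self.reward[ind]),
                torch.FloatTensor(self.not_done[ind]))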