def init(self, args, env):
    names = ['state0', 'action', 'state1', 'reward', 'terminal', 'goal']
    self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy())
    if args['--imit'] != '0':
        names.append('expVal')
        self.bufferImit = ReplayBuffer(limit=int(1e6), names=names.copy())
    self.critic = CriticDQNG(args, env)
def __init__(self, state_size, action_size, seed, framework, buffer_type):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.framework = framework
    self.buffer_type = buffer_type

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    # def __init__(self, device, buffer_size, batch_size, alpha, beta):
    if self.buffer_type == 'PER_ReplayBuffer':
        self.memory = PER_ReplayBuffer(device, BUFFER_SIZE, BATCH_SIZE, ALPHA, BETA)
    if self.buffer_type == 'ReplayBuffer':
        self.memory = ReplayBuffer(device, action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, env, gamma=0.99, tau=1e-3, pol_lr=1e-4, q_lr=5e-3,
             batch_size=64, buffer_size=10000, target_noise=0.2,
             action_noise=0.1, clip_range=0.5, update_delay=2):
    # environment stuff
    self.env = env
    self.num_act = env.action_space.shape[0]
    self.num_obs = env.observation_space.shape[0]
    self.eval_env = copy.deepcopy(env)

    # hyper parameters
    self.gamma = gamma
    self.tau = tau
    self.pol_lr = pol_lr
    self.q_lr = q_lr
    self.batch_size = batch_size
    self.buffer_size = buffer_size
    self.target_noise = target_noise
    self.action_noise = action_noise
    self.clip_range = clip_range
    self.update_delay = update_delay

    # networks
    self.pol = Actor(self.num_obs, self.num_act, [400, 300]).double()
    self.q1 = Critic(self.num_obs, self.num_act, [400, 300]).double()
    self.q2 = Critic(self.num_obs, self.num_act, [400, 300]).double()
    self.pol.init_weights()
    self.q1.init_weights()
    self.q2.init_weights()
    self.target_pol = copy.deepcopy(self.pol).double()
    self.target_q1 = copy.deepcopy(self.q1).double()
    self.target_q2 = copy.deepcopy(self.q2).double()

    # optimizers, buffer
    self.pol_opt = torch.optim.Adam(self.pol.parameters(), lr=self.pol_lr)
    self.q1_opt = torch.optim.Adam(self.q1.parameters(), lr=self.q_lr)
    self.q2_opt = torch.optim.Adam(self.q2.parameters(), lr=self.q_lr)
    self.buffer = ReplayBuffer(self.buffer_size, 1000)
    self.mse_loss = torch.nn.MSELoss()
    self.cum_q1_loss = 0
    self.cum_q2_loss = 0
    self.cum_obj = 0
def init(self, args, env):
    names = ['s0', 'a', 's1', 'r', 't', 'g']
    metrics = ['loss_dqn', 'loss_actor']
    self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy(), args=args)
    self.actorCritic = ActorCriticDDPGG(args, env)
    for metric in metrics:
        self.metrics[metric] = 0
def __init__(self, env, state_dim: int, action_dim: int, config: Dict,
             device=None, writer=None):
    self.logger = logging.getLogger("MADDPG")
    self.device = device if device is not None else DEVICE
    self.writer = writer
    self.env = env
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.agents_number = config['agents_number']

    hidden_layers = config.get('hidden_layers', (400, 300))
    noise_scale = config.get('noise_scale', 0.2)
    noise_sigma = config.get('noise_sigma', 0.1)
    actor_lr = config.get('actor_lr', 1e-3)
    actor_lr_decay = config.get('actor_lr_decay', 0)
    critic_lr = config.get('critic_lr', 1e-3)
    critic_lr_decay = config.get('critic_lr_decay', 0)
    self.actor_tau = config.get('actor_tau', 0.002)
    self.critic_tau = config.get('critic_tau', 0.002)

    create_agent = lambda: DDPGAgent(state_dim, action_dim,
                                     agents=self.agents_number,
                                     hidden_layers=hidden_layers,
                                     actor_lr=actor_lr,
                                     actor_lr_decay=actor_lr_decay,
                                     critic_lr=critic_lr,
                                     critic_lr_decay=critic_lr_decay,
                                     noise_scale=noise_scale,
                                     noise_sigma=noise_sigma,
                                     device=self.device)
    self.agents = [create_agent() for _ in range(self.agents_number)]

    self.discount = config.get('discount', 0.99)
    self.gradient_clip = config.get('gradient_clip', 1.0)
    self.warm_up = config.get('warm_up', 1e3)
    self.buffer_size = config.get('buffer_size', int(1e6))
    self.batch_size = config.get('batch_size', 128)
    self.p_batch_size = config.get('p_batch_size', int(self.batch_size // 2))
    self.n_batch_size = config.get('n_batch_size', int(self.batch_size // 4))
    self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)
    self.update_every_iterations = config.get('update_every_iterations', 2)
    self.number_updates = config.get('number_updates', 2)

    self.reset()
def init(self, args, env):
    names = ['s0', 'a', 's1', 'r', 't', 'g', 'm', 'task', 'mcr']
    metrics = ['loss_dqn', 'qval', 'val']
    self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy(), args=args)
    self.actorCritic = ActorCriticDQNGM(args, env)
    for metric in metrics:
        self.metrics[metric] = 0
    self.goalcounts = np.zeros((len(self.env.goals),))
class DDPGG(DDPG):
    def __init__(self, args, env, env_test, logger):
        super(DDPGG, self).__init__(args, env, env_test, logger)

    def init(self, args, env):
        names = ['s0', 'a', 's1', 'r', 't', 'g']
        metrics = ['loss_dqn', 'loss_actor']
        self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy(), args=args)
        self.actorCritic = ActorCriticDDPGG(args, env)
        for metric in metrics:
            self.metrics[metric] = 0

    def train(self):
        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            targets_dqn = self.actorCritic.get_targets_dqn(exp['r'], exp['t'],
                                                           exp['s1'], exp['g'])
            inputs = [exp['s0'], exp['a'], exp['g'], targets_dqn]
            loss_dqn = self.actorCritic.trainQval(inputs)
            action, criticActionGrads, invertedCriticActionGrads = \
                self.actorCritic.trainActor([exp['s0'], exp['g']])
            self.metrics['loss_dqn'] += np.squeeze(loss_dqn)
            self.actorCritic.target_train()

    def make_input(self, state, mode):
        if mode == 'train':
            input = [np.expand_dims(i, axis=0) for i in [state, self.env.goal]]
        else:
            input = [np.expand_dims(i, axis=0) for i in [state, self.env_test.goal]]
        return input

    def reset(self):
        if self.trajectory:
            self.env.end_episode(self.trajectory)
            for expe in self.trajectory:
                self.buffer.append(expe.copy())
            if self.args['--her'] != '0':
                augmented_ep = self.env.augment_episode(self.trajectory)
                for e in augmented_ep:
                    self.buffer.append(e)
            self.trajectory.clear()
        state = self.env.reset()
        self.episode_step = 0
        return state
def __init__(self, state_size, action_size, args):
    """Initialize a D4PG Agent."""
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.action_size = action_size
    self.state_size = state_size
    self.framework = args.framework
    self.eval = args.eval
    self.agent_count = 1
    self.learn_rate = args.learn_rate
    self.batch_size = args.batch_size
    self.buffer_size = args.buffer_size
    self.C = args.C
    self._epsilon = args.epsilon
    self.epsilon_decay = args.epsilon_decay
    self.epsilon_min = args.epsilon_min
    self.gamma = 0.99
    self.rollout = args.rollout
    self.tau = args.tau
    self.momentum = 1
    self.l2_decay = 0.0001
    self.update_type = "hard"
    self.t_step = 0
    self.episode = 0
    self.seed = 0

    # Set up memory buffers
    if args.prioritized_experience_replay:
        self.memory = PERBuffer(args.buffersize, self.batchsize, self.framestack,
                                self.device, args.alpha, args.beta)
        self.criterion = WeightedLoss()
    else:
        self.memory = ReplayBuffer(self.device, self.buffer_size, self.gamma,
                                   self.rollout)

    # Initialize Q networks #
    self.q = self._make_model(state_size, action_size, args.pixels)
    self.q_target = self._make_model(state_size, action_size, args.pixels)
    self._hard_update(self.q, self.q_target)
    self.q_optimizer = self._set_optimizer(self.q.parameters(),
                                           lr=self.learn_rate,
                                           decay=self.l2_decay,
                                           momentum=self.momentum)
    self.new_episode()
def __init__(self, s_dim, num_actions, lr):
    self.step = 0
    self.epStep = 0
    self.ep = 0
    self.tutorListened = True
    self.tutorInput = ''
    self.sDim = s_dim
    self.num_actions = num_actions
    self.learning_rate = lr
    self.names = ['state0', 'action', 'feedback', 'fWeight']
    self.buffer = ReplayBuffer(limit=int(1e6), names=self.names)
    self.batchSize = 64
    self.episode = deque(maxlen=400)
    self.model = self.create_model()
def __init__(self, env, sub_states, layers, gamma=0.99, tau=1e-3, pol_lr=1e-4,
             q_lr=1e-3, batch_size=64, buffer_size=10000):
    # environment stuff
    self.env = env
    self.num_act = env.action_space.shape[0]
    self.num_obs = env.observation_space.shape[0]
    self.eval_env = copy.deepcopy(env)
    self.sub_states = sub_states
    self.layers = layers

    # hyper parameters
    self.gamma = gamma
    self.tau = tau
    self.pol_lr = pol_lr
    self.q_lr = q_lr
    self.batch_size = batch_size
    self.buffer_size = buffer_size

    # networks
    self.pol = Actor(self.num_obs, self.num_act, [400, 300]).double()
    # decomp critic
    self.q = DecompCritic(self.sub_states, self.num_act, layers).double()
    self.pol.init_weights()
    self.q.init_weights()
    self.target_pol = copy.deepcopy(self.pol).double()
    self.target_q = copy.deepcopy(self.q).double()

    # optimizers, buffer
    self.pol_opt = torch.optim.Adam(self.pol.parameters(), lr=self.pol_lr)
    self.q_opt = torch.optim.Adam(self.q.parameters(), lr=self.q_lr)
    self.buffer = ReplayBuffer(self.buffer_size, 1000)
    self.mse_loss = torch.nn.MSELoss()
    self.cum_loss = 0
    self.cum_obj = 0
def __init__(self, env, hyperparameters, device, summary_writer=None):
    """Set parameters, initialize network."""
    state_space_shape = env.observation_space.shape
    action_space_size = env.action_space.n
    self.env = env

    self.online_network = DQN(state_space_shape, action_space_size).to(device)
    self.target_network = DQN(state_space_shape, action_space_size).to(device)
    # XXX maybe not really necessary?
    self.update_target_network()

    self.experience_replay = None
    self.accumulated_loss = []
    self.device = device
    self.optimizer = optim.Adam(self.online_network.parameters(),
                                lr=hyperparameters['learning_rate'])
    self.double_DQN = hyperparameters['double_DQN']

    # Discount factor
    self.gamma = hyperparameters['gamma']
    # XXX ???
    self.n_multi_step = hyperparameters['n_multi_step']
    self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                      hyperparameters['n_multi_step'],
                                      hyperparameters['gamma'])
    self.birth_time = time.time()
    self.iter_update_target = hyperparameters['n_iter_update_target']
    self.buffer_start_size = hyperparameters['buffer_start_size']
    self.summary_writer = summary_writer

    # Greedy search hyperparameters
    self.epsilon_start = hyperparameters['epsilon_start']
    self.epsilon = hyperparameters['epsilon_start']
    self.epsilon_decay = hyperparameters['epsilon_decay']
    self.epsilon_final = hyperparameters['epsilon_final']
def __init__(self, n_actions, buffer_size=1000000, behaviour_policy='epsilon_greedy',
             discount_factor=0.99, clip_grad_norm_value=10.0, policy_args={}):
    self.discount_factor = discount_factor
    self.clip_grad_norm_value = clip_grad_norm_value
    self.replay_buffer = ReplayBuffer(capacity=buffer_size)

    if behaviour_policy == 'epsilon_greedy':
        self.policy = EpsilonGreedyPolicy(policy_args)
    else:
        self.policy = SoftPolicy()

    self.q_network = QNetwork(n_actions).to(device)
class DQNG(DQN):
    def __init__(self, args, env, env_test, logger):
        super(DQNG, self).__init__(args, env, env_test, logger)

    def init(self, args, env):
        names = ['state0', 'action', 'state1', 'reward', 'terminal', 'goal']
        self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy())
        if args['--imit'] != '0':
            names.append('expVal')
            self.bufferImit = ReplayBuffer(limit=int(1e6), names=names.copy())
        self.critic = CriticDQNG(args, env)

    def train(self):
        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            s0, a0, s1, r, t, g = [exp[name] for name in self.buffer.names]
            targets_dqn = self.critic.get_targets_dqn(r, t, s1, g)
            inputs = [s0, a0, g]
            loss = self.critic.qvalModel.train_on_batch(inputs, targets_dqn)
            for i, metric in enumerate(self.critic.qvalModel.metrics_names):
                self.metrics[metric] += loss[i]

        if self.args['--imit'] != '0' and self.bufferImit.nb_entries > self.batch_size:
            exp = self.bufferImit.sample(self.batch_size)
            s0, a0, s1, r, t, g, e = [exp[name] for name in self.bufferImit.names]
            targets_dqn = self.critic.get_targets_dqn(r, t, s1, g)
            targets = [targets_dqn,
                       np.zeros((self.batch_size, 1)),
                       np.zeros((self.batch_size, 1))]
            inputs = [s0, a0, g, e]
            loss = self.critic.imitModel.train_on_batch(inputs, targets)
            for i, metric in enumerate(self.critic.imitModel.metrics_names):
                self.imitMetrics[metric] += loss[i]

        self.critic.target_train()

    def make_input(self, state, t):
        input = [np.expand_dims(i, axis=0) for i in [state, self.env.goal]]
        # temp = self.env.explor_temp(t)
        input.append(np.expand_dims([0.5], axis=0))
        return input
def __init__(self, env, device, hyperparameters, summary_writer=None):
    '''
    Agent initialization. It creates the CentralControl that controls all the low-level logic.
    '''
    self.rewards = []
    self.total_reward = 0
    self.birth_time = 0
    self.n_iter = 0
    self.n_games = 0
    self.ts_frame = 0
    self.ts = time.time()

    self.Memory = namedtuple('Memory',
                             ['obs', 'action', 'new_obs', 'reward', 'done'],
                             rename=False)

    # The CentralControl is the 'brain' of the agent
    self.cc = CentralControl(env.observation_space.shape, env.action_space.n,
                             hyperparameters['gamma'], hyperparameters['n_multi_step'],
                             hyperparameters['double_DQN'], hyperparameters['noisy_net'],
                             hyperparameters['dueling'], device)
    self.cc.set_optimizer(hyperparameters['learning_rate'])

    self.birth_time = time.time()
    self.iter_update_target = hyperparameters['n_iter_update_target']
    self.buffer_start_size = hyperparameters['buffer_start_size']

    self.epsilon_start = hyperparameters['epsilon_start']
    self.epsilon = hyperparameters['epsilon_start']
    self.epsilon_decay = hyperparameters['epsilon_decay']
    self.epsilon_final = hyperparameters['epsilon_final']

    self.accumulated_loss = []
    self.device = device

    # initialize the replay buffer (i.e. the memory) of the agent
    self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                      hyperparameters['n_multi_step'],
                                      hyperparameters['gamma'])
    self.summary_writer = summary_writer

    self.noisy_net = hyperparameters['noisy_net']
    self.env = env
def __init__(self, env, device, cfg, summary_writer=None):
    '''
    Agent initialization. It creates the CentralControl that controls all the low-level logic.
    '''
    # The CentralControl is the 'brain' of the agent
    self.cc = CentralControl(env.observation_space.shape, env.action_space.n,
                             cfg.rl.gamma, cfg.rl.n_multi_step,
                             cfg.neural_net.double_dqn, cfg.neural_net.noisy_net,
                             cfg.neural_net.dueling, device)
    self.cc.set_optimizer(cfg.train.learning_rate)

    self.birth_time = time.time()
    self.iter_update_target = cfg.replay.n_iter_update_target
    self.buffer_start_size = cfg.replay.buffer_start_size

    self.epsilon_start = cfg.rl.epsilon_start
    self.epsilon = cfg.rl.epsilon_start
    self.epsilon_decay = cfg.rl.epsilon_decay
    self.epsilon_final = cfg.rl.epsilon_final

    self.accumulated_loss = []
    self.device = device

    # initialize the replay buffer (i.e. the memory) of the agent
    self.replay_buffer = ReplayBuffer(cfg.replay.buffer_capacity,
                                      cfg.rl.n_multi_step, cfg.rl.gamma)
    self.summary_writer = summary_writer

    self.noisy_net = cfg.neural_net.noisy_net
    self.env = env

    self.total_reward = 0
    self.n_iter = 0
    self.n_games = 0
    self.ts_frame = 0
    self.ts = time.time()
    self.rewards = []
def __init__(self, env, args):
    super(PlayroomGM, self).__init__(env)

    self.gamma = float(args['--gamma'])
    self.eps = float(args['--eps'])
    self.demo_f = [int(f) for f in args['--demo'].split(',')]

    self.feat = np.array([int(f) for f in args['--features'].split(',')])
    self.N = self.feat.shape[0]

    vs = np.zeros(shape=(self.N, self.state_dim[0]))
    vs[np.arange(self.N), self.feat] = 1
    self.vs = vs / np.sum(vs, axis=1, keepdims=True)

    self.R = 100
    self.idx = -1
    self.v = np.zeros(shape=(self.state_dim[0], 1))
    self.g = np.ones(shape=(self.state_dim[0]))

    self.queues = [CompetenceQueue() for _ in range(self.N)]
    self.names = ['s0', 'r0', 'a', 's1', 'r1', 'g', 'v', 'o', 'u']
    self.buffer = ReplayBuffer(limit=int(1e5), names=self.names, N=self.N)
class Qoff(Agent):
    def __init__(self, args, env, env_test, logger):
        super(Qoff, self).__init__(args, env, env_test, logger)
        self.args = args
        self.gamma = 0.99
        self.lr = 0.1
        self.names = ['state0', 'action', 'state1', 'reward', 'terminal']
        self.init(args, env)

    def init(self, args, env):
        self.critic = np.zeros(shape=(5, 5, 4))
        self.buffer = ReplayBuffer(limit=int(1e6), names=self.names)

    def train(self):
        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            s0, a0, s1, r, t = [exp[name] for name in self.names]
            for k in range(self.batch_size):
                target = r[k] + (1 - t[k]) * self.gamma * np.max(self.critic[tuple(s1[k])])
                self.critic[tuple(s0[k])][a0[k]] = self.lr * target + \
                    (1 - self.lr) * self.critic[tuple(s0[k])][a0[k]]

    def act(self, state):
        if np.random.rand() < 0.2:
            action = np.random.randint(self.env.action_space.n)
        else:
            action = np.argmax(self.critic[tuple(state)])
        return action

    def reset(self):
        if self.trajectory:
            self.env.processEp(self.trajectory)
            for expe in reversed(self.trajectory):
                self.buffer.append(expe.copy())
            self.trajectory.clear()
        state = self.env.reset()
        self.episode_step = 0
        return state
def __init__(self, env, gamma=0.99, tau=0.005, learning_rate=3e-4, buffer_size=50000,
             learning_starts=100, train_freq=1, batch_size=64,
             target_update_interval=1, gradient_steps=1, target_entropy='auto',
             ent_coef='auto', random_exploration=0.0, discrete=True,
             regularized=True, feature_extraction="cnn"):
    self.env = env
    self.learning_starts = learning_starts
    self.random_exploration = random_exploration
    self.train_freq = train_freq
    self.target_update_interval = target_update_interval
    self.batch_size = batch_size
    self.gradient_steps = gradient_steps
    self.learning_rate = learning_rate

    self.graph = tf.Graph()
    with self.graph.as_default():
        self.sess = tf.Session(graph=self.graph)
        self.replay_buffer = ReplayBuffer(buffer_size)
        self.agent = SACAgent(self.sess, env, discrete=discrete,
                              regularized=regularized,
                              feature_extraction=feature_extraction)
        self.model = SACModel(self.sess, self.agent, target_entropy, ent_coef,
                              gamma, tau)
        with self.sess.as_default():
            self.sess.run(tf.global_variables_initializer())
            self.sess.run(self.model.target_init_op)

    self.num_timesteps = 0
def __init__(self, state_size, action_size, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
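# The t_step counter above exists to gate learning to every UPDATE_EVERY calls of
# step(). A minimal sketch of a companion step() method, assuming the buffer
# exposes add(...), sample() and __len__, and that a learn() method and a GAMMA
# constant exist alongside the constants referenced above (these names are
# assumptions, not shown in the snippet):
def step(self, state, action, reward, next_state, done):
    # Store the transition, then learn from a random batch every UPDATE_EVERY steps.
    self.memory.add(state, action, reward, next_state, done)  # assumed buffer API
    self.t_step = (self.t_step + 1) % UPDATE_EVERY
    if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
        experiences = self.memory.sample()
        self.learn(experiences, GAMMA)  # learn() and GAMMA are assumed here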
def muzero(config: MuZeroConfig):
    ray.init()

    storage = SharedStorage.remote(config)
    replay_buffer = ReplayBuffer.remote(config)

    leaner = Leaner.remote(config, storage, replay_buffer)
    temperatures = list(np.linspace(1.0, 0.1, num=config.num_actors))
    actors = [Actor.remote(config, storage, replay_buffer, temperature)
              for temperature in temperatures]
    workers = [leaner] + actors

    ray.get([worker.start.remote() for worker in workers])
    ray.shutdown()
class SAC:
    def __init__(self, env, gamma=0.99, tau=0.005, learning_rate=3e-4,
                 buffer_size=50000, learning_starts=100, train_freq=1,
                 batch_size=64, target_update_interval=1, gradient_steps=1,
                 target_entropy='auto', ent_coef='auto', random_exploration=0.0,
                 discrete=True, regularized=True, feature_extraction="cnn"):
        self.env = env
        self.learning_starts = learning_starts
        self.random_exploration = random_exploration
        self.train_freq = train_freq
        self.target_update_interval = target_update_interval
        self.batch_size = batch_size
        self.gradient_steps = gradient_steps
        self.learning_rate = learning_rate

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf.Session(graph=self.graph)
            self.replay_buffer = ReplayBuffer(buffer_size)
            self.agent = SACAgent(self.sess, env, discrete=discrete,
                                  regularized=regularized,
                                  feature_extraction=feature_extraction)
            self.model = SACModel(self.sess, self.agent, target_entropy, ent_coef,
                                  gamma, tau)
            with self.sess.as_default():
                self.sess.run(tf.global_variables_initializer())
                self.sess.run(self.model.target_init_op)

        self.num_timesteps = 0

    def train(self, learning_rate):
        batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = \
            self.replay_buffer.sample(self.batch_size)
        # print("batch_actions:", batch_actions.shape)
        # print("self.agent.actions_ph:", self.agent.actions_ph)
        feed_dict = {
            self.agent.obs_ph: batch_obs,
            self.agent.next_obs_ph: batch_next_obs,
            self.model.rewards_ph: batch_rewards.reshape(self.batch_size, -1),
            self.model.terminals_ph: batch_dones.reshape(self.batch_size, -1),
            self.model.learning_rate_ph: learning_rate
        }
        if not self.agent.discrete:
            feed_dict[self.agent.actions_ph] = batch_actions
        else:
            batch_actions = batch_actions.reshape(-1)
            feed_dict[self.agent.actions_ph] = batch_actions

        policy_loss, qf1_loss, qf2_loss, value_loss, *values = self.sess.run(
            self.model.step_ops, feed_dict)
        return policy_loss, qf1_loss, qf2_loss

    def learn(self, total_timesteps):
        learning_rate = get_schedule_fn(self.learning_rate)
        episode_rewards = [0]
        mb_losses = []
        obs = self.env.reset()

        for step in range(total_timesteps):
            if (self.num_timesteps < self.learning_starts
                    or np.random.rand() < self.random_exploration):
                unscaled_action = self.env.action_space.sample()
                action = scale_action(self.env.action_space, unscaled_action)
            else:
                action = self.agent.step(obs[None]).flatten()
                unscaled_action = unscale_action(self.env.action_space, action)
                # print("\nunscaled_action:", unscaled_action)

            new_obs, reward, done, _ = self.env.step(unscaled_action)
            self.num_timesteps += 1
            self.replay_buffer.add(obs, action, reward, new_obs, done)
            obs = new_obs

            if self.num_timesteps % self.train_freq == 0:
                for grad_step in range(self.gradient_steps):
                    if (not self.replay_buffer.can_sample(self.batch_size)
                            or self.num_timesteps < self.learning_starts):
                        break
                    frac = 1.0 - step / total_timesteps
                    current_lr = learning_rate(frac)
                    mb_losses.append(self.train(current_lr))
                    if (step + grad_step) % self.target_update_interval == 0:
                        self.sess.run(self.model.target_update_op)

            episode_rewards[-1] += reward
            if done:
                obs = self.env.reset()
                episode_rewards.append(0)

            mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)
            loss_str = "/".join([f"{x:.3f}" for x in np.mean(mb_losses, 0)]) \
                if len(mb_losses) > 0 else "NaN"
            print(f"Step {step} - reward: {mean_reward} - loss: {loss_str}",
                  end="\n" if step % 500 == 0 else "\r")
class Agent():
    def __init__(self, s_dim, num_actions, lr):
        self.step = 0
        self.epStep = 0
        self.ep = 0
        self.tutorListened = True
        self.tutorInput = ''
        self.sDim = s_dim
        self.num_actions = num_actions
        self.learning_rate = lr
        self.names = ['state0', 'action', 'feedback', 'fWeight']
        self.buffer = ReplayBuffer(limit=int(1e6), names=self.names)
        self.batchSize = 64
        self.episode = deque(maxlen=400)
        self.model = self.create_model()

    def create_model(self):
        state = Input(shape=self.sDim)
        action = Input(shape=(1,), dtype='uint8')
        l1 = Dense(400, activation="relu")(state)
        feedback = Dense(self.num_actions,
                         activation=None,
                         kernel_initializer='random_uniform')(l1)
        feedback = Reshape((1, self.num_actions))(feedback)
        mask = Lambda(K.one_hot,
                      arguments={'num_classes': self.num_actions},
                      output_shape=(self.num_actions,))(action)
        feedback = multiply([feedback, mask])
        feedback = Lambda(K.sum, arguments={'axis': 2})(feedback)
        feedbackModel = Model(inputs=[state, action], outputs=feedback)
        feedbackModel.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return feedbackModel

    def train(self):
        loss = 0
        if self.buffer.nb_entries > self.batchSize:
            samples = self.buffer.sample(self.batchSize)
            s, a, targets, weights = [np.array(samples[name]) for name in self.names]
            loss = self.model.train_on_batch(x=[s, a], y=targets, sample_weight=weights)
        return loss

    def tutorListener(self):
        self.tutorInput = input("> ")
        print("maybe updating...the kbdInput variable is: {}".format(self.tutorInput))
        self.tutorListened = True

    def run(self):
        state0 = np.random.randint(0, 4, size=(5,))
        while self.step < 100000:
            if self.tutorInput != '':
                print("Received new keyboard Input. Setting playing ID to keyboard input value")
                for i in range(1, 10):
                    self.episode[-i]['fWeight'] = 1
                    self.episode[-i]['feedback'] = self.tutorInput
                self.tutorInput = ''
            else:
                action = np.random.randint(self.num_actions)
                state1 = np.random.randint(0, 4, size=(5,))
                self.step += 1
                self.epStep += 1
                experience = {'state0': state0, 'action': action, 'fWeight': 0}
                self.episode.append(experience)
                self.loss = self.train()
                state0 = state1
                time.sleep(0.001)

            if self.tutorListened:
                self.tutorListened = False
                self.listener = Thread(target=self.tutorListener)
                self.listener.start()

            if self.epStep >= 200:
                if self.ep > 0:
                    for s in range(self.epStep):
                        exp = self.episode.popleft()
                        if exp['fWeight'] != 0:
                            self.buffer.append(exp)
                self.epStep = 0
                self.ep += 1
                state0 = np.random.randint(0, 4, size=(5,))

            if self.step % 1000 == 0:
                print(self.step, self.loss)

    def input(self):
        while True:
            if input() == '+':
                inputStep = self.step
                time.sleep(2)
                print('input +1, step = ', inputStep)
            elif input() == '-':
                inputStep = self.step
                time.sleep(2)
                print('input -1, step = ', inputStep)
            else:
                print('wrong input')
def main():
    with tf.Session() as sess:
        actor = ActorNetwork(sess, STATE_DIM, ACTION_DIM, ACTION_BOUND,
                             ACTOR_LEARNING_RATE, TAU, MINIBATCH_SIZE)
        critic = CriticNetwork(sess, STATE_DIM, ACTION_DIM,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())
        # actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))
        # TODO: Ornstein-Uhlenbeck noise.

        sess.run(tf.global_variables_initializer())

        # initialize target networks
        actor.update_target_network()
        critic.update_target_network()

        # initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE)

        # main loop
        for ep in range(MAX_EPISODES):
            episode_reward = 0
            ep_batch_avg_q = 0
            s = ENV.reset()

            for step in range(MAX_EP_STEPS):
                a = actor.predict(np.reshape(s, (1, STATE_DIM)))  # + actor_noise()
                s2, r, terminal, info = ENV.step(a[0])
                # print(s2)
                replay_buffer.add(np.reshape(s, (STATE_DIM,)),
                                  np.reshape(a, (ACTION_DIM,)),
                                  r,
                                  terminal,
                                  np.reshape(s2, (STATE_DIM,)))

                # Batch sampling.
                if replay_buffer.size() > MINIBATCH_SIZE and step % TRAIN_INTERVAL == 0:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Compute the target Q values.
                    target_action = actor.predict_target(s2_batch)
                    target_q = critic.predict_target(s2_batch, target_action)

                    # Compute the critic's target values.
                    targets = []
                    for i in range(MINIBATCH_SIZE):
                        if t_batch[i]:  # terminal
                            targets.append(r_batch[i])
                        else:
                            targets.append(r_batch[i] + GAMMA * target_q[i])

                    # Train the critic.
                    # TODO: pred_q comes from a random batch rather than an episode,
                    # so the episode_avg_max statistic is not appropriate.
                    pred_q, _ = critic.train(s_batch, a_batch,
                                             np.reshape(targets, (MINIBATCH_SIZE, 1)))

                    # Train the actor.
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    # print(grads[0].shape)
                    # exit(1)
                    actor.train(s_batch, grads[0])

                    # Update target networks.
                    # Should this be done only once every few batches?
                    actor.update_target_network()
                    critic.update_target_network()

                    ep_batch_avg_q += np.mean(pred_q)

                s = s2
                episode_reward += r

                if terminal:
                    print('Episode:', ep, 'Reward:', episode_reward)
                    reward_log.append(episode_reward)
                    q_log.append(ep_batch_avg_q / step)
                    break
def train_expert(env_name):
    """Train expert policy in given environment."""
    if env_name == 'InvertedPendulum-v2':
        env = ExpertInvertedPendulumEnv()
        episode_limit = 200
        return_threshold = 200
    elif env_name == 'InvertedDoublePendulum-v2':
        env = ExpertInvertedDoublePendulumEnv()
        episode_limit = 50
        return_threshold = 460
    elif env_name == 'ThreeReacherEasy-v2':
        env = ThreeReacherEasyEnv()
        episode_limit = 50
        return_threshold = -0.8
    elif env_name == 'ReacherEasy-v2':
        env = ReacherEasyEnv()
        episode_limit = 50
        return_threshold = -0.8
    elif env_name == 'Hopper-v2':
        env = HopperEnv()
        episode_limit = 200
        return_threshold = 600
    elif env_name == 'HalfCheetah-v2':
        env = ExpertHalfCheetahEnv()
        episode_limit = 200
        return_threshold = 1000
    elif env_name == 'StrikerHumanSim-v2':
        env = StrikerHumanSimEnv()
        episode_limit = 200
        return_threshold = -190
    elif env_name == 'PusherHumanSim-v2':
        env = PusherHumanSimEnv()
        episode_limit = 200
        return_threshold = -80
    else:
        raise NotImplementedError

    buffer_size = 1000000
    init_random_samples = 1000
    exploration_noise = 0.2
    learning_rate = 3e-4
    batch_size = 256
    epochs = 200
    steps_per_epoch = 5000
    updates_per_step = 1
    update_actor_every = 1
    start_training = 512
    gamma = 0.99
    polyak = 0.995
    entropy_coefficient = 0.2
    clip_actor_gradients = False
    visual_env = True
    action_size = env.action_space.shape[0]
    tune_entropy_coefficient = True
    target_entropy = -1 * action_size

    def make_actor():
        actor = StochasticActor([
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(action_size * 2)
        ])
        return actor

    def make_critic():
        critic = Critic([
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(1)
        ])
        return critic

    optimizer = tf.keras.optimizers.Adam(learning_rate)
    replay_buffer = ReplayBuffer(buffer_size)
    sampler = Sampler(env, episode_limit=episode_limit,
                      init_random_samples=init_random_samples,
                      visual_env=visual_env)
    agent = SAC(make_actor, make_critic, make_critic,
                actor_optimizer=optimizer,
                critic_optimizer=optimizer,
                gamma=gamma,
                polyak=polyak,
                entropy_coefficient=entropy_coefficient,
                tune_entropy_coefficient=tune_entropy_coefficient,
                target_entropy=target_entropy,
                clip_actor_gradients=clip_actor_gradients)

    if visual_env:
        obs = np.expand_dims(env.reset()['obs'], axis=0)
    else:
        obs = np.expand_dims(env.reset(), axis=0)
    agent(obs)
    agent.summary()

    mean_test_returns = []
    mean_test_std = []
    steps = []

    step_counter = 0
    for e in range(epochs):
        while step_counter < (e + 1) * steps_per_epoch:
            traj_data = sampler.sample_trajectory(agent, exploration_noise)
            replay_buffer.add(traj_data)
            if step_counter > start_training:
                agent.train(replay_buffer,
                            batch_size=batch_size,
                            n_updates=updates_per_step * traj_data['n'],
                            act_delay=update_actor_every)
            step_counter += traj_data['n']
        print('Epoch {}/{} - total steps {}'.format(e + 1, epochs, step_counter))
        out = sampler.evaluate(agent, 10)
        mean_test_returns.append(out['mean'])
        mean_test_std.append(out['std'])
        steps.append(step_counter)
        if out['mean'] >= return_threshold:
            print('Early termination due to reaching return threshold')
            break

    plt.errorbar(steps, mean_test_returns, mean_test_std)
    plt.xlabel('steps')
    plt.ylabel('returns')
    plt.show()
    return agent
class DQNAgent():
    '''
    Agent class. It controls all the agent functionalities.
    '''
    rewards = []
    total_reward = 0
    birth_time = 0
    n_iter = 0
    n_games = 0
    ts_frame = 0
    ts = time.time()

    Memory = namedtuple('Memory', ['obs', 'action', 'new_obs', 'reward', 'done'],
                        rename=False)

    def __init__(self, env, device, hyperparameters, summary_writer=None):
        '''
        Agent initialization. It creates the CentralControl that controls all the low-level logic.
        '''
        # The CentralControl is the 'brain' of the agent
        self.cc = CentralControl(env.observation_space.shape, env.action_space.n,
                                 hyperparameters['gamma'], hyperparameters['n_multi_step'],
                                 hyperparameters['double_DQN'], hyperparameters['noisy_net'],
                                 hyperparameters['dueling'], device)
        self.cc.set_optimizer(hyperparameters['learning_rate'])

        self.birth_time = time.time()
        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']

        self.accumulated_loss = []
        self.device = device

        # initialize the replay buffer (i.e. the memory) of the agent
        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                          hyperparameters['n_multi_step'],
                                          hyperparameters['gamma'])
        self.summary_writer = summary_writer

        self.noisy_net = hyperparameters['noisy_net']
        self.env = env

    def act(self, obs):
        '''
        Greedy action outputted by the NN in the CentralControl
        '''
        return self.cc.get_max_action(obs)

    def act_eps_greedy(self, obs):
        '''
        E-greedy action
        '''
        # In case of a noisy net, it takes a greedy action
        if self.noisy_net:
            return self.act(obs)

        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return self.act(obs)

    def add_env_feedback(self, obs, action, new_obs, reward, done):
        '''
        Acquire a new feedback from the environment. The feedback is constituted
        by the new observation, the reward and the done boolean.
        '''
        # Create the new memory and update the buffer
        new_memory = self.Memory(obs=obs, action=action, new_obs=new_obs,
                                 reward=reward, done=done)
        self.replay_buffer.append(new_memory)

        # update the variables
        self.n_iter += 1
        # decrease epsilon
        self.epsilon = max(self.epsilon_final,
                           self.epsilon_start - self.n_iter / self.epsilon_decay)
        self.total_reward += reward

    def sample_and_optimize(self, batch_size):
        '''
        Sample batch_size memories from the buffer and optimize them
        '''
        if len(self.replay_buffer) > self.buffer_start_size:
            # sample
            mini_batch = self.replay_buffer.sample(batch_size)
            # optimize
            l_loss = self.cc.optimize(mini_batch)
            self.accumulated_loss.append(l_loss)

        # update target NN
        if self.n_iter % self.iter_update_target == 0:
            self.cc.update_target()

    def reset_stats(self):
        '''
        Reset the agent's statistics
        '''
        self.rewards.append(self.total_reward)
        self.total_reward = 0
        self.accumulated_loss = []
        self.n_games += 1

    def print_info(self):
        '''
        Print information about the agent
        '''
        fps = (self.n_iter - self.ts_frame) / (time.time() - self.ts)
        print('%d %d rew:%d mean_rew:%.2f eps:%.2f, fps:%d, loss:%.4f' %
              (self.n_iter, self.n_games, self.total_reward,
               np.mean(self.rewards[-40:]), self.epsilon, fps,
               np.mean(self.accumulated_loss)))
        self.ts_frame = self.n_iter
        self.ts = time.time()

        if self.summary_writer is not None:
            self.summary_writer.add_scalar('reward', self.total_reward, self.n_games)
            self.summary_writer.add_scalar('mean_reward',
                                           np.mean(self.rewards[-40:]), self.n_games)
            self.summary_writer.add_scalar('10_mean_reward',
                                           np.mean(self.rewards[-10:]), self.n_games)
            self.summary_writer.add_scalar('epsilon', self.epsilon, self.n_games)
            self.summary_writer.add_scalar('loss',
                                           np.mean(self.accumulated_loss), self.n_games)
def init(self, args, env):
    self.critic = np.zeros(shape=(5, 5, 4))
    self.buffer = ReplayBuffer(limit=int(1e6), names=self.names)
def learn(self, timesteps=10000, verbose=0, seed=None):
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    self.eps_range = self._eps_range(timesteps)
    replay_buffer = ReplayBuffer(self.buffer_size)
    self._init_model()

    obs = self.env.reset()
    for step in range(timesteps):  # while not done:
        cur_eps = next(self.eps_range, None)
        if cur_eps is None:
            cur_eps = self.final_eps

        action = self._select_action(obs, cur_eps)
        new_obs, rewards, done, info = self.env.step(action)
        if done:
            new_obs = [np.nan] * self.obs_shape[0]  # hacky way to keep dimensions correct
        replay_buffer.add(obs, action, rewards, new_obs)
        obs = new_obs

        # learn gradient
        if step > self.learning_starts:
            if len(replay_buffer.buffer) < self.batch_size:  # buffer too small
                continue
            samples = replay_buffer.sample(self.batch_size, self.device)
            obs_batch, actions_batch, rewards_batch, new_obs_batch = samples
            predicted_q_values = self._predictQValue(self.step_model, obs_batch,
                                                     actions_batch)
            ys = self._expectedLabels(self.target_model, new_obs_batch, rewards_batch)

            loss = F.smooth_l1_loss(predicted_q_values, ys)
            self.optim.zero_grad()
            loss.backward()
            for i in self.step_model.parameters():
                i.grad.clamp_(min=-1, max=1)  # exploding gradient
                # i.grad.clamp_(min=-10, max=10)  # exploding gradient
            self.optim.step()

            # update target
            if step % self.target_network_update_freq == 0:
                self.target_model.load_state_dict(self.step_model.state_dict())

        if done:
            obs = self.env.reset()

        if verbose == 1:
            if step % (timesteps * 0.1) == 0:
                perc = int(step / (timesteps * 0.1))
                print(f"At step {step}")
                print(f"{perc}% done")
class DDPG(Agent):
    def __init__(self, args, env, env_test, logger):
        super(DDPG, self).__init__(args, env, env_test, logger)
        self.args = args
        self.init(args, env)
        for metric in self.critic.model.metrics_names:
            self.metrics[self.critic.model.name + '_' + metric] = 0

    def init(self, args, env):
        names = ['state0', 'action', 'state1', 'reward', 'terminal']
        self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy())
        self.actorCritic = ActorCriticDDPG(args, env)
        # self.critic = CriticDDPG(args, env)
        # self.actor = ActorDDPG(args, env)

    def train(self):
        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            s0, a0, s1, r, t = [exp[name] for name in self.buffer.names]
            a1 = self.actor.target_model.predict_on_batch(s1)
            a1 = np.clip(a1, self.env.action_space.low, self.env.action_space.high)
            q = self.critic.Tmodel.predict_on_batch([s1, a1])
            targets = r + (1 - t) * self.critic.gamma * np.squeeze(q)
            targets = np.clip(targets, self.env.minR / (1 - self.critic.gamma),
                              self.env.maxR)
            inputs = [s0, a0]
            loss = self.critic.model.train_on_batch(inputs, targets)
            for i, metric in enumerate(self.critic.model.metrics_names):
                self.metrics[metric] += loss[i]

            # a2 = self.actor.model.predict_on_batch(s0)
            # grads = self.critic.gradsModel.predict_on_batch([s0, a2])
            # low = self.env.action_space.low
            # high = self.env.action_space.high
            # for d in range(grads[0].shape[0]):
            #     width = high[d] - low[d]
            #     for k in range(self.batch_size):
            #         if grads[k][d] >= 0:
            #             grads[k][d] *= (high[d] - a2[k][d]) / width
            #         else:
            #             grads[k][d] *= (a2[k][d] - low[d]) / width
            # self.actor.train(s0, grads)

            self.actor.target_train()
            self.critic.target_train()

    def reset(self):
        if self.trajectory:
            T = int(self.trajectory[-1]['terminal'])
            R = np.sum([self.env.unshape(exp['reward'], exp['terminal'])
                        for exp in self.trajectory])
            S = len(self.trajectory)
            self.env.processEp(R, S, T)
            for expe in reversed(self.trajectory):
                self.buffer.append(expe.copy())
            self.trajectory.clear()
        state = self.env.reset()
        self.episode_step = 0
        return state

    def make_input(self, state):
        input = [np.reshape(state, (1, self.actor.s_dim[0]))]
        return input

    def act(self, state):
        input = self.make_input(state)
        action = self.actor.model.predict(input, batch_size=1)
        noise = np.random.normal(0., 0.1, size=action.shape)
        action = noise + action
        action = np.clip(action, self.env.action_space.low, self.env.action_space.high)
        action = action.squeeze()
        return action
class PlayroomGM(Wrapper):
    def __init__(self, env, args):
        super(PlayroomGM, self).__init__(env)

        self.gamma = float(args['--gamma'])
        self.eps = float(args['--eps'])
        self.demo_f = [int(f) for f in args['--demo'].split(',')]

        self.feat = np.array([int(f) for f in args['--features'].split(',')])
        self.N = self.feat.shape[0]

        vs = np.zeros(shape=(self.N, self.state_dim[0]))
        vs[np.arange(self.N), self.feat] = 1
        self.vs = vs / np.sum(vs, axis=1, keepdims=True)

        self.R = 100
        self.idx = -1
        self.v = np.zeros(shape=(self.state_dim[0], 1))
        self.g = np.ones(shape=(self.state_dim[0]))

        self.queues = [CompetenceQueue() for _ in range(self.N)]
        self.names = ['s0', 'r0', 'a', 's1', 'r1', 'g', 'v', 'o', 'u']
        self.buffer = ReplayBuffer(limit=int(1e5), names=self.names, N=self.N)

    def reset(self, exp):
        self.idx, self.v = self.sample_v(exp['s0'])
        exp['g'] = self.g
        exp['v'] = self.v
        return exp

    def get_r(self, s, g, v):
        return self.R * np.sum(np.multiply(v, s == g), axis=1, keepdims=True)

    def sample_v(self, s):
        remaining_v = [i for i in range(self.N) if s[self.feat[i]] != 1]
        probs = self.get_probs(idxs=remaining_v, eps=self.eps)
        idx = np.random.choice(remaining_v, p=probs)
        v = self.vs[idx]
        return idx, v

    def sampleT(self, batch_size):
        idxs = [i for i in range(self.N)
                if self.buffer._tutorBuffers[i]._numsamples > batch_size]
        probs = self.get_probs(idxs=idxs, eps=self.eps)
        t = np.random.choice(idxs, p=probs)
        samples = self.buffer.sampleT(batch_size, t)
        return samples, t

    def end_episode(self, episode):
        term = episode[-1]['r1'][self.idx] == self.R
        self.queues[self.idx].process_ep(episode, term)
        base_util = np.zeros(shape=(self.N,))
        base_util[self.idx] = 1
        self.process_trajectory(episode, base_util=base_util)

    def process_trajectory(self, trajectory, base_util=None):
        if base_util is None:
            u = np.zeros(shape=(self.N,))
        else:
            u = base_util
        u = np.expand_dims(u, axis=1)
        # mcr = np.zeros(shape=(self.N,))
        for exp in reversed(trajectory):
            u = self.gamma * u
            u[np.where(exp['r1'] > exp['r0'])] = 1
            # u_idx = np.where(u != 0)
            # mcr[u_idx] = exp['r1'][u_idx] + self.gamma * mcr[u_idx]
            exp['u'] = u.squeeze()
            # exp['mcr'] = mcr
            if any(u != 0):
                self.buffer.append(exp.copy())

    # def sample(self, batchsize):
    #     probs = self.get_probs(idxs=range(self.N), eps=self.eps2)
    #     idx = np.random.choice(self.N, p=probs)
    #     samples = self.buffer.sample(batchsize, idx)
    #     if samples is not None:
    #         self.queues[idx].process_samples(samples)
    #     return idx, samples
    #
    # def sampleT(self, batchsize):
    #     probs = self.get_probs(idxs=range(self.N), eps=self.eps3)
    #     idx = np.random.choice(self.N, p=probs)
    #     samples = self.buffer.sampleT(batchsize, idx)
    #     if samples is not None:
    #         self.queues[idx].process_samplesT(samples)
    #     return idx, samples

    def get_demo(self):
        demo = []
        exp = {}
        exp['s0'] = self.env.reset()
        exp['r0'] = self.get_r(exp['s0'], self.g, self.vs).squeeze()
        exp['g'] = self.g
        task = np.random.choice(self.demo_f)
        exp['v'] = self.vs[list(self.feat).index(task)]
        while True:
            a, done = self.opt_action(task)
            if done:
                break
            else:
                exp['a'] = np.expand_dims(a, axis=1)
                exp['s1'] = self.env.step(exp['a'], True)[0]
                exp['r1'] = self.get_r(exp['s1'], self.g, self.vs).squeeze()
                exp['o'] = 1
                demo.append(exp.copy())
                exp['s0'] = exp['s1']
                exp['r0'] = exp['r1']
        return demo, task

    def opt_action(self, t):
        return self.env.opt_action(t)

    def get_stats(self):
        stats = {}
        for i, f in enumerate(self.feat):
            self.queues[i].update()
            for key, val in self.queues[i].get_stats().items():
                stats[key + str(f)] = val
            self.queues[i].init_stat()
        return stats

    def get_cps(self):
        return [np.maximum(abs(q.CP + 0.05) - 0.05, 0) for q in self.queues]

    def get_probs(self, idxs, eps):
        cps = self.get_cps()
        vals = [cps[idx] for idx in idxs]
        l = len(vals)
        s = np.sum(vals)
        if s == 0:
            probs = [1 / l] * l
        else:
            probs = [eps / l + (1 - eps) * v / s for v in vals]
        return probs

    @property
    def state_dim(self):
        return 8,

    @property
    def goal_dim(self):
        return 8,

    @property
    def action_dim(self):
        return 5
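# A standalone numeric check of the get_probs mixture above (not part of the
# original module): each index gets a uniform floor of eps / len(vals) plus a
# (1 - eps) share proportional to its competence value, so the result always
# sums to 1 and no index ever has zero sampling probability.
import numpy as np

def mixture_probs(vals, eps):
    # Same arithmetic as PlayroomGM.get_probs, applied to a plain list of values.
    l = len(vals)
    s = np.sum(vals)
    if s == 0:
        return [1 / l] * l
    return [eps / l + (1 - eps) * v / s for v in vals]

print(mixture_probs([0.2, 0.0, 0.3], eps=0.1))
# -> [0.393..., 0.033..., 0.573...]; the probabilities sum to 1.0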
# Initialize policy
if args.policy == "TD3":
    # Target policy smoothing is scaled wrt the action scale
    kwargs["policy_noise"] = args.policy_noise * max_action
    kwargs["noise_clip"] = args.noise_clip * max_action
    kwargs["policy_freq"] = args.policy_freq
    kwargs["expl_noise"] = args.expl_noise
    kwargs["tau"] = args.tau
    policy = TD3(**kwargs)
elif args.policy == "SAC":
    kwargs["policy_freq"] = args.policy_freq
    kwargs["tau"] = args.tau
    policy = SAC(**kwargs)
elif args.policy == "MPO":
    policy = MPO(**kwargs)

if args.load_model != "":
    policy_file = args.file_name if args.load_model == "default" else args.load_model
    policy.load(f"./models/{policy_file}")

replay_buffer = ReplayBuffer(
    state_dim,
    action_dim,
    max_size=int(args.buffer_size),
)

train_loop = TRAIN_LOOPS[args.policy]
train_loop(args, policy, replay_buffer, env)