def __init__(self, env):
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.batch_size = BATCH_SIZE
    self.name = 'POMDP'
    self.environment = env
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]
    self.max_len_trajectory = self.environment.spec.max_episode_steps + 1
    self.noise = ou_noise.OUNoise(action_dimension=self.action_dim, theta=0.023, sigma=0.02)

    self.Actor_eval = actor.ActorNet(self.state_dim, self.action_dim).cuda()
    self.Actor_target = actor.ActorNet(self.state_dim, self.action_dim).cuda()
    self.Critic_eval = critic.CriticNet(self.state_dim, self.action_dim).cuda()
    self.Critic_target = critic.CriticNet(self.state_dim, self.action_dim).cuda()

    self.replay_buffer = replay_buffer.ReplayBuffer(REPLAY_BUFFER_SIZE, DIRECTORY,
                                                    self.max_len_trajectory,
                                                    self.Actor_eval.last_epi)
    # self.replay_buffer = np.zeros((REPLAY_BUFFER_SIZE, self.state_dim * 2 + 1 + 2 + self.action_dim))
    self.buffer_counter = 0  # replay buffer counter

    self.ctrain = optim.Adam(self.Critic_eval.parameters(), lr=LR_C)
    self.atrain = optim.Adam(self.Actor_eval.parameters(), lr=LR_A)
    self.loss_td = nn.MSELoss().cuda()
    self.trace_length = TRACE_LENGTH

    # Hidden state shape: (num_layers * num_directions, mini_batch, hidden_size [output size])
    self.hidden_a = torch.from_numpy(self.state_initializer(
        shape=(actor.NUM_RNN_LAYER, BATCH_SIZE, self.Actor_eval.out_put_size), mode='z')).cuda()
    self.hidden_c = torch.from_numpy(self.state_initializer(
        shape=(actor.NUM_RNN_LAYER, BATCH_SIZE, self.Critic_eval.out_put_size), mode='z')).cuda()

    # (num_layers * num_directions, mini_batch, hidden_size [output size])
    self.target_actor_init_h_batch = self.actor_init_h_batch = (self.hidden_a, self.hidden_a)
    self.target_critic_init_h_batch = self.critic_init_h_batch = (self.hidden_c, self.hidden_c)

    self.discounting_mat_dict = {}
def __init__(self, state_size, action_size, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # Q-Network
    self.qnetwork_local = models.QNetwork(state_size, action_size, seed,
                                          fc1_units=2 * state_size).to(device)
    self.qnetwork_target = models.QNetwork(state_size, action_size, seed,
                                           fc1_units=2 * state_size).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = replay_buffer.ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0

    print("The network used for the Simple Double Q-Learning agent:")
    print(self.qnetwork_local)
def testOverWrite(self):
    with self.test_session():
        buffer = replay_buffer.ReplayBuffer()
        for i in range(100000):
            buffer.add(np.array([i, 2 * i]), np.array([i % 5]),
                       np.array([i]), False, np.array([2 * i, 3 * i]))
        num_samples = 32
        self.assertEqual(replay_buffer.MAX_SIZE, buffer.size())
def test_store_many_1d():
    rb = RB.ReplayBuffer(1, 1, 100)
    S, A, R, Sn, D = [np.random.randn(10, 1) for i in range(5)]
    rb.store_many(S, A, R, Sn, D)
    assert rb.S1.shape == (100, 1)
    assert rb.Sn.shape == (100, 1)
    assert rb.A1.shape == (100, 1)
    assert rb.R1n.shape == (100, 1)
    assert rb.Done1.shape == (100, 1)
def create_replay_memory(self, transition_content):
    if self.config['dqn_rm_type'] == 'uniform':
        self.replay_memory = replay_buffer.ReplayBuffer(
            self.config['dqn_rm_max'], transition_content=transition_content)
    elif self.config['dqn_rm_type'] == 'per':
        self.replay_memory = replay_buffer.PrioritizedReplayBuffer(
            self.config['dqn_rm_max'], transition_content=transition_content,
            alpha=self.config['dqn_per_alpha'])
        self.per_beta = self.config['dqn_per_beta']
        self.per_beta_inc = (1.0 - self.per_beta) / float(self.total_optimiser_steps)
def __init__(self, num_actions):
    self.num_actions = num_actions
    self.epsilon = INITIAL_EPSILON
    self.epsilon_step = (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORATION_STEPS
    self.t = 0

    # Parameters used for summary
    self.total_reward = 0
    self.total_q_max = 0
    self.total_loss = 0
    self.duration = 0
    self.episode = 0
    self.all_reward = []
    self.all_v = []
    self.all_target_v = []

    # Create replay memory
    self.replay_memory = replay_buffer.ReplayBuffer(NUM_REPLAY_MEMORY)

    # Create q network
    self.s, self.q_values, q_network = self.build_network()
    q_network_weights = q_network.trainable_weights

    # Create target network
    self.st, self.target_q_values, target_network = self.build_network()
    target_network_weights = target_network.trainable_weights

    # Define target network update operation
    self.update_target_network = [
        target_network_weights[i].assign(q_network_weights[i])
        for i in range(len(target_network_weights))
    ]

    # Define loss and gradient update operation
    self.a, self.y, self.loss, self.grads_update = self.build_training_op(q_network_weights)

    self.sess = tf.InteractiveSession()
    self.saver = tf.train.Saver(q_network_weights)
    self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()

    if not os.path.exists(SAVE_NETWORK_PATH):
        os.makedirs(SAVE_NETWORK_PATH)

    self.sess.run(tf.initialize_all_variables())

    # Load network
    if LOAD_NETWORK:
        self.load_network()

    # Initialize target network
    self.sess.run(self.update_target_network)
def test_store_many_3d():
    rb = RB.ReplayBuffer(3, 3, 100, reward_steps=3)
    S, A, R, Sn = [np.random.randn(10, 3) for i in range(4)]
    D = np.random.randn(10, 1)
    rb.store_many(S, A, R, Sn, D)
    assert rb.S1.shape == (100, 3)
    assert rb.Sn.shape == (100, 3)
    assert rb.A1.shape == (100, 3)
    assert rb.R1n.shape == (100, 3)
    assert rb.Done1.shape == (100, 1)
def __init__(self, input_length, output_length, device):
    self.device = device
    self.dqn, self.target_dqn = (
        MolDQN(input_length, output_length).to(self.device),
        MolDQN(input_length, output_length).to(self.device),
    )
    for p in self.target_dqn.parameters():
        p.requires_grad = False
    self.replay_buffer = replay_buffer.ReplayBuffer(REPLAY_BUFFER_CAPACITY)
    self.optimizer = getattr(opt, hyp.optimizer)(self.dqn.parameters(), lr=hyp.learning_rate)
def __init__(self, env, gamma=0.99):
    self.env = env
    self.obs_space = self.env.observation_space
    self.action_space = self.env.action_space.n
    self.policy = networks.MlpDQNLayer(env.observation_space, env.action_space)
    self.double = networks.MlpDQNLayer(env.observation_space, env.action_space)
    self.replay_buffer = replay_buffer.ReplayBuffer()
    self.gamma = gamma
def test_custom_batch_size():
    rb = RB.ReplayBuffer(3, 3, 100, reward_steps=3, batch_size=17)
    for t in range(50):
        S, A, R, Sn = [np.random.randn(10, 3) for i in range(4)]
        D = np.random.randn(10, 1)
        rb.store_many(S, A, R, Sn, D)
    S, A, R, Sn, D = rb.sample_batch()
    assert S.shape == (17, 3)
    assert A.shape == (17, 3)
    assert R.shape == (17, 3)
    assert Sn.shape == (17, 3)
    assert D.shape == (17, 1)
def test_store_many_3d_repeatedly_and_sample():
    rb = RB.ReplayBuffer(3, 3, 100, reward_steps=3)
    for t in range(50):
        S, A, R, Sn = [np.random.randn(10, 3) for i in range(4)]
        D = np.random.randn(10, 1)
        rb.store_many(S, A, R, Sn, D)
    S, A, R, Sn, D = rb.sample_batch()
    assert S.shape == (64, 3)
    assert A.shape == (64, 3)
    assert R.shape == (64, 3)
    assert Sn.shape == (64, 3)
    assert D.shape == (64, 1)
def train(self):
    os.makedirs(self.config.results_path, exist_ok=True)

    # Initialize workers
    training_worker = trainer.Trainer(copy.deepcopy(self.muzero_weights), self.config)
    shared_storage_worker = shared_storage.SharedStorage(
        copy.deepcopy(self.muzero_weights), self.game_name, self.config,
    )
    replay_buffer_worker = replay_buffer.ReplayBuffer(self.config)

    # Pre-load buffer if pulling from persistent storage
    if self.replay_buffer:
        for game_history_id in self.replay_buffer:
            replay_buffer_worker.save_game(self.replay_buffer[game_history_id])
        print("\nLoaded {} games from replay buffer.".format(len(self.replay_buffer)))

    self_play_workers = [
        self_play.SelfPlay(
            copy.deepcopy(self.muzero_weights),
            self.Game(self.config.seed + seed),
            self.config,
        )
        for seed in range(self.config.num_actors)
    ]

    # Launch workers
    [
        self_play_worker.continuous_self_play(shared_storage_worker, replay_buffer_worker)
        for self_play_worker in self_play_workers
    ]
    training_worker.continuous_update_weights(replay_buffer_worker, shared_storage_worker)

    # Save performance in TensorBoard
    print("Printing Logging info")
    self._logging_loop(shared_storage_worker, replay_buffer_worker)

    # Pull final weights and buffer from the worker instances
    self.muzero_weights = shared_storage_worker.get_weights()
    self.replay_buffer = replay_buffer_worker.get_buffer()

    # Persist replay buffer to disk
    print("\n\nPersisting replay buffer games to disk...")
    pickle.dump(
        self.replay_buffer,
        open(os.path.join(self.config.results_path, "replay_buffer.pkl"), "wb"),
    )
def __init__(self, alpha, beta, input_dims, tau, env, num_actions,
             layer1_size, layer2_size, layer3_size, layer4_size, output_dir,
             gamma=0.99, batch_size=64, max_size=100000):
    self.alpha = alpha
    self.beta = beta
    self.gamma = gamma
    self.tau = tau
    self.input_dims = input_dims
    self.batch_size = batch_size

    self.memory = replay_buffer.ReplayBuffer(max_size, input_dims, num_actions)
    print("max_size", max_size)
    self.her_memory = her.HERbuffer(max_size, input_dims, num_actions, env)

    self.actor = actor_nw.ActorNw('Actor', alpha, input_dims, layer1_size,
                                  layer2_size, layer3_size, layer4_size,
                                  num_actions, output_dir)
    self.critic = critic_nw.CriticNw('Critic', beta, input_dims, layer1_size,
                                     layer2_size, layer3_size, layer4_size,
                                     num_actions, output_dir)
    self.target_actor = actor_nw.ActorNw('TargetActor', alpha, input_dims,
                                         layer1_size, layer2_size, layer3_size,
                                         layer4_size, num_actions, output_dir)
    self.target_critic = critic_nw.CriticNw('TargetCritic', beta, input_dims,
                                            layer1_size, layer2_size, layer3_size,
                                            layer4_size, num_actions, output_dir)

    self.noise = noise.OUActionNoise(mu=np.zeros(num_actions))
    self.update_network_parameters(tau=1)
def test_buffer_wrapping():
    N = 11
    rb = RB.ReplayBuffer(3, 3, 21, reward_steps=3)
    for t in range(2):
        S, A, R, Sn = [np.linspace((3, 4, 5), (8.5, 8.5, 8.5), N) for i in range(4)]
        D = np.random.randn(N, 1)
        rb.store_many(S, A, R, Sn, D)
    S, A, R, Sn = rb.S1, rb.A1, rb.R1n, rb.Sn
    x = np.array((8.5, 8.5, 8.5))
    assert np.all(S[0, :] == x)
    assert np.all(A[0, :] == x)
    assert np.all(R[0, :] == x)
    assert np.all(Sn[0, :] == x)
def testSample(self):
    with self.test_session():
        buffer = replay_buffer.ReplayBuffer()
        for i in range(1000):
            buffer.add(np.array([i, 2 * i]), np.array([i % 5]),
                       np.array([i]), False, np.array([2 * i, 3 * i]))
        num_samples = 32
        for j in range(50):
            old_states, actions, rewards, dones, new_states = buffer.sample(num_samples)
            reward_set = set()
            for s in range(num_samples):
                i = rewards[s][0]
                self.assertNotIn(i, reward_set)
                reward_set.add(i)
                self.assertFalse(dones[s])
                self.assertTrue((actions[s] == i % 5).all())
                self.assertTrue((old_states[s] == np.array([i, 2 * i])).all())
                self.assertTrue((new_states[s] == np.array([2 * i, 3 * i])).all())
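# The unittest-style cases in this file (testOverWrite, testSample, testAdd, testInit) exercise
# an add/sample/size interface with a MAX_SIZE overwrite cap, but the buffer implementation
# itself is not shown. Below is a minimal sketch that would satisfy those assertions; the deque
# storage, the capacity constant, and the without-replacement sampling are assumptions made for
# illustration, not this repo's actual code.
import random
from collections import deque

import numpy as np

MAX_SIZE = 10000  # assumed capacity constant; the real module defines its own value


class ReplayBuffer:
    """Fixed-capacity FIFO buffer with uniform, without-replacement sampling."""

    def __init__(self, max_size=MAX_SIZE):
        self._storage = deque(maxlen=max_size)  # oldest entries are overwritten when full

    def add(self, old_state, action, reward, done, new_state):
        self._storage.append((old_state, action, reward, done, new_state))

    def size(self):
        return len(self._storage)

    def sample(self, num_samples):
        # random.sample draws without replacement, matching the uniqueness check in testSample
        batch = random.sample(self._storage, num_samples)
        old_states, actions, rewards, dones, new_states = map(np.array, zip(*batch))
        return old_states, actions, rewards, dones, new_states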
def __init__(self, n_actions, starter_learning_rate=0.000025, gamma=0.99,
             memory_size=50000, batch_size=32, n_explore=10000,
             frame_per_action=4, replace_target_iter=500):
    self.n_actions = n_actions
    self.gamma = gamma
    self.memory_size = memory_size
    self.batch_size = batch_size
    self.n_explore = n_explore
    self.frame_per_action = frame_per_action
    self.replace_target_iter = replace_target_iter
    self.time_step = 0

    self.replay_memory = replay_buffer.ReplayBuffer(memory_size)

    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.lr = tf.train.exponential_decay(starter_learning_rate, self.global_step, 10000, 0.96)

    self.createNetwork()
    q_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Q_network')
    t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_network')
    self.replace_target_op = [tf.assign(t, q) for t, q in zip(t_params, q_params)]

    self.merged = tf.summary.merge_all()
    self.saver = tf.train.Saver()
    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())
    self.sess.graph.finalize()

    ckpt = tf.train.get_checkpoint_state(SIGN)
    if ckpt and ckpt.model_checkpoint_path:
        self.saver.restore(self.sess, ckpt.model_checkpoint_path)
        print("Successfully loaded:", ckpt.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    self.writer = tf.summary.FileWriter("logs/" + SIGN, self.sess.graph)
def __init__(self, env):
    self._buffer = replay_buffer.ReplayBuffer()
    self.env = env

    (self.critic_state_input, self.action_input, self.critic) = self._create_critic_model()
    [self.variable_summaries(x, "critic") for x in self.critic.trainable_weights]

    (self.actor_state_input, self.actor) = self._create_actor_model()
    [self.variable_summaries(x, "actor") for x in self.actor.trainable_weights]

    self.target_critic = Model.from_config(self.critic.get_config())
    self.target_critic.set_weights(self.critic.get_weights())
    [self.variable_summaries(x, "target_critic") for x in self.target_critic.trainable_weights]

    self.target_actor = Model.from_config(self.actor.get_config())
    self.target_actor.set_weights(self.actor.get_weights())
    [self.variable_summaries(x, "target_actor") for x in self.target_actor.trainable_weights]

    if FLAGS.load_model_from_file:
        self.critic.load_model(TARGET_CRITIC_FILE)
        self.actor.load_model(TARGET_ACTOR_FILE)
        self.critic.load_model(CRITIC_FILE)
        self.actor.load_model(ACTOR_FILE)

    self.epsilon_min = FLAGS.epsilon_min
    self.epsilon = FLAGS.epsilon
    self.epsilon_decay = FLAGS.epsilon_decay

    self.assign_weights = self.get_weight_assignment_op()

    self.sess = tf.Session()
    K.set_session(self.sess)
    self.summarize = None
    self.train_writer = tf.summary.FileWriter(
        "train_summaries/train_%d" % time.time(), self.sess.graph)
    self.action_grads = K.gradients(self.critic.outputs[0] / BATCH_SIZE, self.action_input)
    self.sess.run(tf.global_variables_initializer())
def __init__(self, q_network=networks.q_func.QFunc, policy_network=networks.policy.PolicyNet,
             tau=0.005, batch_size=256, look_ahead=1, look_behind=1,
             gate_width=1, gate_height=0.5, gamma=0.9):
    super().__init__()
    self.q_net_1 = q_network()
    self.q_net_2 = q_network()
    self.target_1 = q_network()
    self.target_2 = q_network()
    self.policy_net = policy_network()
    self.replay_buffer = replay_buffer.ReplayBuffer()
    self.standard_normal = torch.distributions.Normal(0, 1)
    self.tau = tau
    self.batch_size = batch_size
    self.init_target()
    self.state_size = (1 + look_ahead + look_behind) * 3
    self.action_size = 2
    self.look_ahead = look_ahead
    self.look_behind = look_behind
    # Use the constructor arguments rather than hard-coded values
    self.gate_width = gate_width
    self.gate_height = gate_height
    self.env = envs.race_traj.RaceTrajEnv(gate_height=gate_height, gate_width=gate_width)
    self.alpha = torch.abs(self.standard_normal.sample()) * -1
    self.gamma = gamma
    self.entropy_target = -2
def train(self):
    # Manage GPUs
    '''
    if 0 < self.num_gpus:
        num_gpus_per_worker = self.num_gpus / (
            self.config.train_on_gpu
            + self.config.num_workers * self.config.selfplay_on_gpu
            + log_in_tensorboard * self.config.selfplay_on_gpu
            + self.config.use_last_model_value * self.config.reanalyse_on_gpu
        )
        if 1 < num_gpus_per_worker:
            num_gpus_per_worker = math.floor(num_gpus_per_worker)
    else:
        num_gpus_per_worker = 0
    '''

    # Initialize worker threads
    for SP_worker_index in range(self.config.num_workers):
        self.self_play_workers.append(
            self_play.SelfPlay(self.checkpoint, self.Game, self.config,
                               self.config.seed + SP_worker_index))
    self.training_worker = trainer.Trainer(self.checkpoint, self.config)
    self.replay_buffer_worker = replay_buffer.ReplayBuffer(
        self.checkpoint, self.replay_buffer, self.config)
    self.shared_storage_worker = shared_storage.SharedStorage(self.checkpoint, self.config)
    self.shared_storage_worker.set_info("terminate", False)

    # Launch workers
    play_thread = threading.Thread(
        target=self.self_play_workers[0].continuous_self_play,
        args=(self.shared_storage_worker, self.replay_buffer_worker))
    train_thread = threading.Thread(
        target=self.training_worker.continuous_update_weights,
        args=(self.shared_storage_worker, self.replay_buffer_worker))
    play_thread.start()
    train_thread.start()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--policy_name", default="TD3")                         # Policy name
    parser.add_argument("--env_name", default="Pendulum-v0")                    # OpenAI gym environment name
    parser.add_argument("--replay_buffer", default="prioritized")               # Replay buffer type
    parser.add_argument("--replay_buffer_size", default=5e4, type=int)          # Replay buffer capacity
    parser.add_argument("--replay_buffer_alpha", default=0.6, type=float)       # Replay buffer prioritization weight
    parser.add_argument("--seed", default=0, type=int)                          # Sets Gym, PyTorch and Numpy seeds
    parser.add_argument("--start_timesteps", default=1e4, type=int)             # How many time steps the purely random policy runs for
    parser.add_argument("--eval_freq", default=1e3, type=float)                 # How often (time steps) we evaluate
    parser.add_argument("--max_timesteps", default=5e4, type=float)             # Max time steps to run the environment for
    parser.add_argument("--save_models", default="True", type=bool)             # Whether or not models are saved
    parser.add_argument("--expl_noise", default=0.1, type=float)                # Std of Gaussian exploration noise
    parser.add_argument("--batch_size", default=100, type=int)                  # Batch size for both actor and critic
    parser.add_argument("--discount", default=0.99, type=float)                 # Discount factor
    parser.add_argument("--tau", default=0.005, type=float)                     # Target network update rate
    parser.add_argument("--policy_noise", default=0.2, type=float)              # Noise added to target policy during critic update
    parser.add_argument("--noise_clip", default=0.5, type=float)                # Range to clip target policy noise
    parser.add_argument("--policy_freq", default=2, type=int)                   # Frequency of delayed policy updates
    parser.add_argument("--lr_actor", default=0.001, type=float)                # Learning rate of actor
    parser.add_argument("--lr_critic", default=0.001, type=float)               # Learning rate of critic
    parser.add_argument("--prioritized_replay_eps", default=1e-3, type=float)   # Replay buffer epsilon (PER)
    parser.add_argument("--prioritized_replay_beta0", default=0.4, type=float)  # Replay buffer initial beta (PER)
    args = parser.parse_args()

    # Training kwargs
    kwargs = {
        "policy_name": args.policy_name,
        "env_name": args.env_name,
        "replay_buffer": args.replay_buffer,
        "replay_buffer_size": args.replay_buffer_size,
        "replay_buffer_alpha": args.replay_buffer_alpha,
        "seed": args.seed,
        "start_timesteps": args.start_timesteps,
        "eval_freq": args.eval_freq,
        "max_timesteps": args.max_timesteps,
        "save_models": args.save_models,
        "expl_noise": args.expl_noise,
        "batch_size": args.batch_size,
        "discount": args.discount,
        "tau": args.tau,
        "policy_noise": args.policy_noise,
        "noise_clip": args.noise_clip,
        "policy_freq": args.policy_freq,
        "lr_actor": args.lr_actor,
        "prioritized_replay_eps": args.prioritized_replay_eps,
        "prioritized_replay_beta0": args.prioritized_replay_beta0
    }

    # cls
    os.system('cls' if os.name == 'nt' else 'clear')

    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_models and not os.path.exists("./pytorch_models"):
        os.makedirs("./pytorch_models")

    # Time stamp for repeated test names
    ts = time.time()
    ts = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H-%M-%S')
    test_name = "%s_%s_%s_%s" % (args.policy_name, args.env_name, str(args.seed), ts)
    plot_name = "%s_%s_%s_%s_plot.png" % (args.policy_name, args.env_name, str(args.seed), ts)
    kwargs_name = "%s_%s_%s_%s_kwargs.csv" % (args.policy_name, args.env_name, str(args.seed), ts)
    scores_name = "%s_%s_%s_%s_scores.csv" % (args.policy_name, args.env_name, str(args.seed), ts)

    print("---------------------------------------")
print("Settings: %s" % (test_name)) utils.save_kwargs(kwargs, "./results/%s" % (kwargs_name)) print("---------------------------------------") # Environment and Agent instantiation env = gym.make(args.env_name) # Set seeds env.seed(args.seed) torch.manual_seed(args.seed) np.random.seed(args.seed) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] max_action = float(env.action_space.high[0]) # Instantiate Replay Buffer if args.replay_buffer == "vanilla": replay_buffer = rb.ReplayBuffer(size = args.replay_buffer_size) PER = False elif args.replay_buffer == "prioritized": replay_buffer = rb.PrioritizedReplayBuffer(size = int(np.round(np.sqrt(args.replay_buffer_size))), alpha = args.replay_buffer_alpha) PER = True prioritized_replay_beta_iters = args.max_timesteps prioritized_replay_beta0 = args.prioritized_replay_beta0 beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p = prioritized_replay_beta0, final_p = 1.0) # Instantiate policy if args.policy_name == "TD3": policy = TD3.TD3(state_dim, action_dim, max_action, args.lr_actor, args.lr_critic, PER, args.prioritized_replay_eps) elif args.policy_name == "DDPG": policy = DDPG.DDPG(state_dim, action_dim, max_action, args.lr_actor, args.lr_critic, PER, args.prioritized_replay_eps) # Evaluate untrained policy evaluations = [evaluate_policy(env, policy)] # Training loop ####################################### total_timesteps = 0 timesteps_since_eval = 0 episode_num = 0 episode_rewards = [] done = True while total_timesteps < args.max_timesteps: if done: if total_timesteps != 0: print('Total T: {} Episode Num: {} Episode T: {} Reward: {}'.format(total_timesteps, episode_num, episode_timesteps, episode_reward)) episode_rewards.append(episode_reward) # PER Beta scheduled update if PER: beta = beta_schedule.value(total_timesteps) else: beta = 0. 
                # Policy update step
                if args.policy_name == "TD3":
                    policy.train(replay_buffer, episode_timesteps, args.batch_size,
                                 args.discount, args.tau, args.policy_noise,
                                 args.noise_clip, args.policy_freq, beta)
                else:
                    policy.train(replay_buffer, episode_timesteps, args.batch_size,
                                 args.discount, args.tau, beta)

            # Evaluate episode
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(evaluate_policy(env, policy))  # save evaluation
                # if args.save_models: policy.save(test_name, directory="./pytorch_models")
                # np.save("./results/%s" % (test_name), evaluations)

            # Reset environment
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.select_action(np.array(obs))
            if args.expl_noise != 0:
                action = (action + np.random.normal(0, args.expl_noise,
                                                    size=env.action_space.shape[0])
                          ).clip(env.action_space.low, env.action_space.high)

        # Perform action
        new_obs, reward, done, _ = env.step(action)
        done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        episode_reward += reward

        # Push experience into replay buffer
        experience = (obs, action, reward, new_obs, done_bool)
        replay_buffer.add(experience)

        obs = new_obs
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # Final evaluation
    evaluations.append(evaluate_policy(env, policy))

    # Save results
    if args.save_models:
        policy.save("%s" % (test_name), directory="./pytorch_models")
    # np.save("./results/%s" % (evaluations_file), evaluations)
    # np.save("./results/%s" % ('rewards.txt'), episode_rewards)
    utils.save_scores(episode_rewards, "./results/%s" % (scores_name))
    utils.plot(episode_rewards, "./results/%s" % (plot_name), 1)
        args.name: {
            'autoencoder': {},
            'correlation': {},
            'policies': {}
        }
    }
})
LOG.setup(os.path.join('.', args.run_dir, args.name))

env = SimpleGridworld()
dummy_env = SimpleGridworld()
net = IndepFeatureLearner(lmbda=args.lmbda, learning_rate=args.learning_rate,
                          gpu_num=args.gpu_num)
buffer = replay_buffer.ReplayBuffer(10000)
visualization_freq = 10000
batch_size = args.batch_size


def run_training_step(buffer: replay_buffer.ReplayBuffer, net: IndepFeatureLearner):
    positions, _, _, _, _ = buffer.sample(batch_size)
    s_list = []
    sp_list = []
    action_list = []
    for pos in positions:
        s = dummy_env.get_observation(pos)
        # actions = net.([s])  # [1, num_factors]
        sp = []
def __init__(self, hp):
    self.hp = hp
    self.memory = replay_buffer.ReplayBuffer(hp)
    self.agents = [ddpg.Agent(self.hp) for _ in range(self.hp.num_agents)]
    self.losses = (0., 0.)
def main():
    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    n_units = FLAGS.target_units if FLAGS.target_units is not None else 32
    hidden_sizes = [n_units for _ in range(FLAGS.target_layers)]
    policy_args = {'hidden_sizes': hidden_sizes}

    n_units = FLAGS.behavior_units if FLAGS.behavior_units is not None else 32
    hidden_sizes = [n_units for _ in range(FLAGS.behavior_layers)]
    behavior_args = {'hidden_sizes': hidden_sizes}

    true_value, _, avg_length, policy_args = \
        common.load_policy_args(FLAGS.restore_path, policy_args)
    if FLAGS.behavior_path is not None:
        _, _, _, behavior_args = common.load_policy_args(FLAGS.behavior_path, behavior_args)
    else:
        _, _, _, behavior_args = common.load_policy_args(FLAGS.restore_path, policy_args)

    policy_args['seed'] = FLAGS.seed
    behavior_args['seed'] = FLAGS.seed
    obs_len = FLAGS.obs_len
    print(policy_args, behavior_args)

    policy_str = 'Gaussian'
    if FLAGS.env == 'CartPole-v0':
        policy_str = 'boltzmann'

    distribution = rl.ReinforcementLearning(
        FLAGS.env, policy_str,
        max_path_length=FLAGS.max_path_length,
        scope=FLAGS.scope,
        policy_args=policy_args)
    behavior_dist = rl.ReinforcementLearning(
        FLAGS.env, policy_str,
        max_path_length=FLAGS.max_path_length,
        scope=FLAGS.behavior_scope,
        policy_args=behavior_args)

    if tf.gfile.Exists('%s.meta' % FLAGS.restore_path):
        distribution.policy.load_policy(FLAGS.restore_path)
    if FLAGS.behavior_path is not None:
        if tf.gfile.Exists('%s.meta' % FLAGS.behavior_path):
            behavior_dist.policy.load_policy(FLAGS.behavior_path)
    else:
        if tf.gfile.Exists('%s.meta' % FLAGS.restore_path):
            behavior_dist.policy.load_policy(FLAGS.restore_path)

    n_units = FLAGS.hidden_units if FLAGS.hidden_units is not None else 32
    hidden_sizes = [n_units for _ in range(FLAGS.hidden_layers)]
    mle_args = {
        'train_type': 'supervised',
        'seed': FLAGS.seed,
        'hidden_sizes': hidden_sizes,
        'entropy_coeff': FLAGS.entropy_coeff,
        'weight_decay': FLAGS.weight_decay,
        'act_fn': tf.nn.relu,
        'learning_rate': 1e-03
    }

    obs_space = distribution.env.observation_space
    act_space = distribution.env.action_space
    obs_space = common.elongate_space(obs_space, act_space, obs_len)

    if policy_str == 'Gaussian':
        policy_cls = policies.GaussianPolicy
        mle_args['learn_std'] = FLAGS.learn_std
    else:
        policy_cls = policies.ContinuousStateBoltzmannPolicy
    mle_policy = policy_cls(obs_space, act_space, scope='mle', **mle_args)

    learning_iters = FLAGS.num_iters
    batch_size = FLAGS.batch_size
    ope_paths = []
    validation_paths = []
    replay_buffer = rb.ReplayBuffer()
    eval_buffer = rb.ReplayBuffer()

    results = blackbox_results_pb2.FitResults()
    results.method_name = 'nn_%d_%d' % (FLAGS.hidden_layers, n_units)

    replay_buffer.empty()
    eval_buffer.empty()

    # Get true value for eval policy. Either from file or with MC eval.
    if true_value is None:
        # if we couldn't load the true value or we are using a mixture policy
        true_value = 0.0
        num_true_trajs = max(10000, 10 * batch_size)
        length = 0.0
        for _ in range(num_true_trajs):
            path, G = distribution.sample()
            true_value += G
            length += len(path['rewards'])
        true_value /= num_true_trajs
        avg_length = length / num_true_trajs
    results.true_value = true_value
    print('Avg path length %f' % avg_length)
    print('True value %f' % true_value)

    def mse(x):
        return (x - true_value) ** 2

    # Collect paths with behavior policy
    for _ in range(batch_size):
        path, G = behavior_dist.sample()
        replay_buffer.add_path(path, G)
        ope_paths.append(path)

    # Off-policy evaluation with true behavior policy
    common.load_importance_weights(distribution.policy, ope_paths, obs_len=1)
    is_estimate, is_variance = eval_target_policy(ope_paths, weighted=False)
    is_mse = mse(is_estimate)
    results.density_estimate = is_estimate
    results.density_variance = is_variance
    results.density_mse = is_mse
    print('###################')
    print('True Value %f' % true_value)
    print_results('True IS', is_estimate, is_variance, is_mse)

    pct_batch = 0.2
    for _ in range(int(pct_batch * batch_size)):
        path, G = behavior_dist.sample()
        validation_paths.append(path)

    # Get eval data
    _, _, eval_obs, eval_acts = common.get_train_test_data(validation_paths,
                                                           split=0.0, obs_len=obs_len)
    train_obs, train_acts, _, _ = common.get_train_test_data(ope_paths,
                                                             split=1.0, obs_len=obs_len)
    policy_eval_paths = ope_paths
    inds = np.arange(len(train_obs))

    common.load_importance_weights(distribution.policy, policy_eval_paths,
                                   mle_policy, obs_len=obs_len)
    entropy = mle_policy.entropy(train_obs)
    print('Entropy %f' % entropy)
    train_loss = eval_pi_loss(mle_policy, train_obs, train_acts)
    eval_loss = eval_pi_loss(mle_policy, eval_obs, eval_acts)
    is_estimate, is_variance = eval_target_policy(policy_eval_paths, weighted=False)
    is_mse = mse(is_estimate)
    print_results('RIS', is_estimate, is_variance, is_mse)
    print('Training Loss %f' % train_loss)
    print('Validation Loss %f' % eval_loss)
    print('Entropy %f' % entropy)
    print('###################')
    add_results(results, is_estimate, is_variance, is_mse, entropy,
                train_loss, eval_loss, 0)

    for itr in range(learning_iters):
        if FLAGS.mini_batch_size is not None:
            m = len(train_obs)
            inds = np.random.randint(m, size=FLAGS.mini_batch_size)
        obs_batch, acts_batch = train_obs[inds], train_acts[inds]

        # loss is training error, v_loss is validation error computed on
        # samples that we will also use in policy evaluation, eval_loss is
        # validation loss on samples that will not be used in
        # policy evaluation.
        mle_policy.supervised_update(obs_batch, acts_batch)
        train_loss = eval_pi_loss(mle_policy, train_obs, train_acts)
        eval_loss = eval_pi_loss(mle_policy, eval_obs, eval_acts)
        entropy = mle_policy.entropy(train_obs)

        if itr > 0 and itr % FLAGS.eval_freq == 0:
            common.load_importance_weights(distribution.policy, policy_eval_paths,
                                           mle_policy, obs_len=obs_len)
            is_estimate, is_variance = eval_target_policy(policy_eval_paths, weighted=False)
            is_mse = mse(is_estimate)

            if itr % FLAGS.print_freq == 0:
                print('###################')
                print('Iteration %d' % itr)
                print_results('RIS', is_estimate, is_variance, is_mse)
                print('Training Loss %f' % train_loss)
                print('Validation Loss %f' % eval_loss)
                print('Entropy %f' % entropy)
                print('###################')
            add_results(results, is_estimate, is_variance, is_mse, entropy,
                        train_loss, eval_loss, itr)

    if FLAGS.result_file is not None:
        with open(FLAGS.result_file, 'wb') as w:
            w.write(results.SerializeToString())
def testAdd(self):
    with self.test_session():
        buffer = replay_buffer.ReplayBuffer()
        for i in range(1000):
            buffer.add(np.array([i, 2 * i]), np.array([i % 5]),
                       np.array([i]), False, np.array([2 * i, 3 * i]))
        self.assertEqual(1000, buffer.size())
def testInit(self):
    with self.test_session():
        buffer = replay_buffer.ReplayBuffer()
def test_init():
    rb = RB.ReplayBuffer(2, 1, 100)
    assert rb.S1.shape == (100, 2)
    assert rb.A1.shape == (100, 1)
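# The pytest-style cases (test_init, test_store_many_*, test_buffer_wrapping,
# test_custom_batch_size) pin down the shape contract of RB.ReplayBuffer -- pre-allocated
# S1/A1/R1n/Sn/Done1 arrays, vectorised wrap-around storage, and a default sample_batch size
# of 64 -- without showing the implementation. A minimal sketch consistent with those
# assertions follows; the pointer/count bookkeeping and zero-initialisation are assumptions.
import numpy as np


class ReplayBuffer:
    """Pre-allocated ring buffer with vectorised store_many and uniform sampling."""

    def __init__(self, state_dim, action_dim, size, reward_steps=1, batch_size=64):
        self.size, self.batch_size = size, batch_size
        self.ptr, self.count = 0, 0
        self.S1 = np.zeros((size, state_dim))      # states
        self.A1 = np.zeros((size, action_dim))     # actions
        self.R1n = np.zeros((size, reward_steps))  # (n-step) rewards
        self.Sn = np.zeros((size, state_dim))      # successor states
        self.Done1 = np.zeros((size, 1))           # terminal flags

    def store_many(self, S, A, R, Sn, D):
        # Consecutive slots modulo capacity, so writes wrap around when the buffer is full
        idx = (self.ptr + np.arange(len(S))) % self.size
        self.S1[idx] = S
        self.A1[idx] = A
        self.R1n[idx] = R
        self.Sn[idx] = Sn
        self.Done1[idx] = D
        self.ptr = (self.ptr + len(S)) % self.size
        self.count = min(self.count + len(S), self.size)

    def sample_batch(self):
        # Uniform sampling with replacement over the filled portion of the buffer
        idx = np.random.randint(0, self.count, self.batch_size)
        return self.S1[idx], self.A1[idx], self.R1n[idx], self.Sn[idx], self.Done1[idx]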
def train(self):
    # ray.init()
    os.makedirs(self.config.results_path, exist_ok=True)

    # Initialize workers
    # training_worker = trainer.Trainer.options(
    #     num_gpus=1 if "cuda" in self.config.training_device else 0
    # ).remote(copy.deepcopy(self.muzero_weights), self.config)
    training_worker = trainer.Trainer(copy.deepcopy(self.muzero_weights), self.config)

    # shared_storage_worker = shared_storage.SharedStorage.remote(
    #     copy.deepcopy(self.muzero_weights), self.game_name, self.config,
    # )
    shared_storage_worker = shared_storage.SharedStorage(
        copy.deepcopy(self.muzero_weights), self.game_name, self.config,
    )

    # replay_buffer_worker = replay_buffer.ReplayBuffer.remote(self.config)
    replay_buffer_worker = replay_buffer.ReplayBuffer(self.config)

    # Pre-load buffer if pulling from persistent storage
    if self.replay_buffer:
        for game_history_id in self.replay_buffer:
            # replay_buffer_worker.save_game.remote(
            replay_buffer_worker.save_game(self.replay_buffer[game_history_id])
        print("\nLoaded {} games from replay buffer.".format(len(self.replay_buffer)))

    self_play_workers = [
        # self_play.SelfPlay.remote(
        self_play.SelfPlay(
            copy.deepcopy(self.muzero_weights),
            self.Game(self.config.seed + seed),
            self.config,
        )
        for seed in range(self.config.num_actors)
    ]

    # # Launch workers
    # [
    #     # self_play_worker.continuous_self_play.remote(
    #     self_play_worker.continuous_self_play(
    #         shared_storage_worker, replay_buffer_worker
    #     )
    #     for self_play_worker in self_play_workers
    # ]
    # # training_worker.continuous_update_weights.remote(
    # training_worker.continuous_update_weights(
    #     replay_buffer_worker, shared_storage_worker
    # )
    # # Save performance in TensorBoard
    # self._logging_loop(shared_storage_worker, replay_buffer_worker)

    while True:
        # play a game
        [
            self_play_worker.joe_self_play(shared_storage_worker, replay_buffer_worker)
            for self_play_worker in self_play_workers
        ]
        self._joe_logging(shared_storage_worker, replay_buffer_worker)
        training_worker.joe_update_weights(replay_buffer_worker, shared_storage_worker)
        info = shared_storage_worker.get_info()
        if info["training_step"] >= self.config.training_steps:
            break

    # self.muzero_weights = ray.get(shared_storage_worker.get_weights.remote())
    self.muzero_weights = shared_storage_worker.get_weights()
    # self.replay_buffer = ray.get(replay_buffer_worker.get_buffer.remote())
    self.replay_buffer = replay_buffer_worker.get_buffer()

    # Persist replay buffer to disk
    print("\n\nPersisting replay buffer games to disk...")
    pickle.dump(
        self.replay_buffer,
        open(os.path.join(self.config.results_path, "replay_buffer.pkl"), "wb"),
    )
def __init__(self, id, config, session, type):
    self.id = id
    self.name = 'AGENT_' + type.upper() + '_' + str(id)
    self.session = session
    self.type = type

    # Extract relevant configuration:
    self.config = {}
    self.config['env_n_actions'] = config['env_n_actions']
    self.config['env_obs_dims'] = config['env_obs_dims']
    self.config['env_type'] = config['env_type']

    dqn_config_params = [
        'dqn_gamma', 'dqn_rm_init', 'dqn_rm_max', 'dqn_target_update',
        'dqn_batch_size', 'dqn_learning_rate', 'dqn_train_period',
        'dqn_adam_epsilon', 'dqn_epsilon_start', 'dqn_epsilon_final',
        'dqn_epsilon_steps', 'dqn_huber_loss_delta'
    ]
    for param in dqn_config_params:
        self.config[param] = config[param]

    self.epsilon = self.config['dqn_epsilon_start']
    self.epsilon_step_size = (self.config['dqn_epsilon_start'] - self.config['dqn_epsilon_final']) \
        / self.config['dqn_epsilon_steps']

    # Scoped names
    self.name_online = self.name + '/' + 'DQN_ONLINE'
    self.name_target = self.name + '/' + 'DQN_TARGET'

    self.obs, self.q_values, self.evaluation, self.latent_features = \
        self.build_model(self.name_online, self.config['env_n_actions'])
    self.obs_target, self.q_values_target, self.evaluation_target, self.latent_features_target = \
        self.build_model(self.name_target, self.config['env_n_actions'])

    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name_online)
    trainable_vars_by_name = {var.name[len(self.name_online):]: var for var in trainable_vars}
    trainable_vars_t = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name_target)
    trainable_vars_by_name_t = {var.name[len(self.name_target):]: var for var in trainable_vars_t}
    copy_ops = [
        target_var.assign(trainable_vars_by_name[var_name])
        for var_name, target_var in trainable_vars_by_name_t.items()
    ]
    self.update_target_weights = tf.group(*copy_ops)

    self.action, self.td_target, self.td_error, self.loss, self.grads_update = \
        self.build_training_ops()

    self.replay_memory = replay_buffer.ReplayBuffer(self.config['dqn_rm_max'])

    # --------------------------------------------------------------------------------------------------------------
    self.post_init_steps = 0
    self.training_steps = 0
    self.n_episode = 0
    self.sample_loss_mean = 0.0
    self.sample_n = 0.0
    self.self_loss_mean = 0.0
    self.self_loss_n = 0.0
    self.expert_loss_mean = 0.0
    self.expert_loss_n = 0.0
    self.ep_r_steps = 0
    self.rnd_rm = None
            rpbuffer.add((s0, action, r1, terminal, s1))
            s0 = s1
    env.close()


def play(env, actor, games=20):
    for i in range(games):
        terminal = False
        s0 = env.reset()
        while not terminal:
            env.render()
            action = np.random.choice([0, 1])
            s0, _, terminal, _ = env.step(action)
    env.close()


if __name__ == "__main__":
    env = gym.make(ENV)
    actor = None
    rpbuffer = replay_buffer.ReplayBuffer(FRAME_SZ)

    if "-t" in sys.argv:
        train(env, actor, rpbuffer)
    if "-p" in sys.argv:
        play(env, actor)
her = False
render = False

if __name__ == '__main__':
    env = gym.make(env_name)
    env.seed(0)
    random.seed(0)
    np.random.seed(0)

    # Make a directory to store the learned policies
    dirname = datetime.datetime.now().isoformat()
    os.mkdir(dirname)

    replay_buffer = replay_buffer.ReplayBuffer(buffer_size)
    sample_batch = replay_buffer.get_batch
    ddpg = ddpg.DDPG(env, replay_buffer, sample_batch, train_iter, gamma, tau,
                     batch_size, n_train, n_episode)

    for epoch in range(n_epoch):
        print("Start training epoch", epoch)
        for cycle in range(n_cycles):
            for episode in range(n_episode):
                state = env.reset()
                state = np.concatenate((state['observation'],
                                        state['achieved_goal'],
                                        state['desired_goal']))
                tot_reward = 0
                ddpg.reset_noise()