Example #1
    def __init__(self, env):

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.batch_size = BATCH_SIZE
        self.name = 'POMDP'
        self.environment = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.max_len_trajectory = self.environment.spec.max_episode_steps + 1
        self.noise = ou_noise.OUNoise(action_dimension=self.action_dim, theta=0.023, sigma=0.02)
        self.Actor_eval = actor.ActorNet(self.state_dim, self.action_dim).to(self.device)
        self.Actor_target = actor.ActorNet(self.state_dim, self.action_dim).to(self.device)
        self.Critic_eval = critic.CriticNet(self.state_dim, self.action_dim).to(self.device)
        self.Critic_target = critic.CriticNet(self.state_dim, self.action_dim).to(self.device)
        self.replay_buffer = replay_buffer.ReplayBuffer(REPLAY_BUFFER_SIZE,
                                                        DIRECTORY, self.max_len_trajectory, self.Actor_eval.last_epi)
        # self.replay_buffer = np.zeros((REPLAY_BUFFER_SIZE, self.state_dim * 2 + 1 + 2 + self.action_dim))
        self.buffer_counter = 0  # replay memory counter
        self.ctrain = optim.Adam(self.Critic_eval.parameters(), lr=LR_C)
        self.atrain = optim.Adam(self.Actor_eval.parameters(), lr=LR_A)
        self.loss_td = nn.MSELoss().to(self.device)
        self.trace_length = TRACE_LENGTH

        # (num_layers * num_directions, mini_batch, hidden_size[out_put size])
        self.hidden_a = torch.from_numpy(self.state_initializer(shape=(actor.NUM_RNN_LAYER, BATCH_SIZE,
                                                                       self.Actor_eval.out_put_size), mode='z')).to(self.device)
        self.hidden_c = torch.from_numpy(self.state_initializer(shape=(actor.NUM_RNN_LAYER, BATCH_SIZE,
                                                                       self.Critic_eval.out_put_size), mode='z')).to(self.device)
        # (num_layers * num_directions, mini_batch, hidden_size[out_put size])

        self.target_actor_init_h_batch = self.actor_init_h_batch = (self.hidden_a, self.hidden_a)
        self.target_critic_init_h_batch = self.critic_init_h_batch = (self.hidden_c, self.hidden_c)
        self.discounting_mat_dict = {}
Example #2
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = models.QNetwork(state_size,
                                              action_size,
                                              seed,
                                              fc1_units=2 *
                                              state_size).to(device)
        self.qnetwork_target = models.QNetwork(state_size,
                                               action_size,
                                               seed,
                                               fc1_units=2 *
                                               state_size).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = replay_buffer.ReplayBuffer(action_size, BUFFER_SIZE,
                                                 BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        print("The network used for the Simple Double Q-Learning agent:")
        print(self.qnetwork_local)
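
Note: the t_step counter and replay memory above are typically driven by a step() method that stores each transition and learns only every UPDATE_EVERY steps once a full batch is buffered. A minimal sketch of that pattern, assuming the memory.add / memory.sample / len(memory) API and the UPDATE_EVERY and GAMMA constants, none of which are shown in this example:

    # Sketch only: memory.add / memory.sample / len(memory) and the UPDATE_EVERY and
    # GAMMA constants are assumptions, not taken from this example.
    def step(self, state, action, reward, next_state, done):
        # Save the transition, then learn every UPDATE_EVERY time steps.
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)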
Example #3
 def testOverWrite(self):
   with self.test_session():
     buffer = replay_buffer.ReplayBuffer()
     for i in range(100000):
       buffer.add(np.array([i, 2*i]), np.array([i % 5,]), np.array([i]), False, np.array([2*i, 3*i]))
      self.assertEqual(replay_buffer.MAX_SIZE, buffer.size())
Example #4
def test_store_many_1d():
    rb = RB.ReplayBuffer(1, 1, 100)
    S, A, R, Sn, D = [np.random.randn(10, 1) for i in range(5)]
    rb.store_many(S, A, R, Sn, D)
    assert rb.S1.shape == (100, 1)
    assert rb.Sn.shape == (100, 1)
    assert rb.A1.shape == (100, 1)
    assert rb.R1n.shape == (100, 1)
    assert rb.Done1.shape == (100, 1)
Example #5
 def create_replay_memory(self, transition_content):
     if self.config['dqn_rm_type'] == 'uniform':
         self.replay_memory = replay_buffer.ReplayBuffer(self.config['dqn_rm_max'],
                                                         transition_content=transition_content)
     elif self.config['dqn_rm_type'] == 'per':
         self.replay_memory = replay_buffer.PrioritizedReplayBuffer(self.config['dqn_rm_max'],
                                                                    transition_content=transition_content,
                                                                    alpha=self.config['dqn_per_alpha'])
         self.per_beta = self.config['dqn_per_beta']
         self.per_beta_inc = (1.0 - self.per_beta) / float(self.total_optimiser_steps)
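
Note: the per_beta / per_beta_inc pair prepared above is normally annealed toward 1.0 once per optimiser step and passed to the prioritized buffer at sampling time. A hedged sketch of that usage; the sample(batch_size, beta=...) signature is an assumption and may differ from this project's buffer:

    # Sketch only: the sample() signature below is an assumption, not this project's API.
    def sample_from_memory(self, batch_size):
        if self.config['dqn_rm_type'] == 'per':
            batch = self.replay_memory.sample(batch_size, beta=self.per_beta)
            # Anneal beta linearly toward 1.0 over the remaining optimiser steps.
            self.per_beta = min(1.0, self.per_beta + self.per_beta_inc)
        else:
            batch = self.replay_memory.sample(batch_size)
        return batch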
Example #6
    def __init__(self, num_actions):
        self.num_actions = num_actions
        self.epsilon = INITIAL_EPSILON
        self.epsilon_step = (INITIAL_EPSILON -
                             FINAL_EPSILON) / EXPLORATION_STEPS
        self.t = 0

        # Parameters used for summary
        self.total_reward = 0
        self.total_q_max = 0
        self.total_loss = 0
        self.duration = 0
        self.episode = 0

        self.all_reward = []
        self.all_v = []
        self.all_target_v = []

        # Create replay memory
        self.replay_memory = replay_buffer.ReplayBuffer(NUM_REPLAY_MEMORY)

        # Create q network
        self.s, self.q_values, q_network = self.build_network()
        q_network_weights = q_network.trainable_weights

        # Create target network
        self.st, self.target_q_values, target_network = self.build_network()
        target_network_weights = target_network.trainable_weights

        # Define target network update operation
        self.update_target_network = [
            target_network_weights[i].assign(q_network_weights[i])
            for i in range(len(target_network_weights))
        ]

        # Define loss and gradient update operation
        self.a, self.y, self.loss, self.grads_update = self.build_training_op(
            q_network_weights)

        self.sess = tf.InteractiveSession()
        self.saver = tf.train.Saver(q_network_weights)
        self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary(
        )

        if not os.path.exists(SAVE_NETWORK_PATH):
            os.makedirs(SAVE_NETWORK_PATH)

        self.sess.run(tf.global_variables_initializer())

        # Load network
        if LOAD_NETWORK:
            self.load_network()

        # Initialize target network
        self.sess.run(self.update_target_network)
Example #7
def test_store_many_3d():
    rb = RB.ReplayBuffer(3, 3, 100, reward_steps=3)
    S, A, R, Sn = [np.random.randn(10, 3) for i in range(4)]
    D = np.random.randn(10, 1)
    rb.store_many(S, A, R, Sn, D)

    assert rb.S1.shape == (100, 3)
    assert rb.Sn.shape == (100, 3)
    assert rb.A1.shape == (100, 3)
    assert rb.R1n.shape == (100, 3)
    assert rb.Done1.shape == (100, 1)
Example #8
 def __init__(self, input_length, output_length, device):
     self.device = device
     self.dqn, self.target_dqn = (
         MolDQN(input_length, output_length).to(self.device),
         MolDQN(input_length, output_length).to(self.device),
     )
     for p in self.target_dqn.parameters():
         p.requires_grad = False
     self.replay_buffer = replay_buffer.ReplayBuffer(REPLAY_BUFFER_CAPACITY)
     self.optimizer = getattr(opt, hyp.optimizer)(self.dqn.parameters(),
                                                  lr=hyp.learning_rate)
Example #9
    def __init__(self, env, gamma=0.99):

        self.env = env
        self.obs_space = self.env.observation_space
        self.action_space = self.env.action_space.n
        self.policy = networks.MlpDQNLayer(env.observation_space,
                                           env.action_space)
        self.double = networks.MlpDQNLayer(env.observation_space,
                                           env.action_space)
        self.replay_buffer = replay_buffer.ReplayBuffer()
        self.gamma = gamma
Example #10
def test_custom_batch_size():
    rb = RB.ReplayBuffer(3, 3, 100, reward_steps=3, batch_size=17)
    for t in range(50):

        S, A, R, Sn = [np.random.randn(10, 3) for i in range(4)]
        D = np.random.randn(10, 1)
        rb.store_many(S, A, R, Sn, D)
    S, A, R, Sn, D = rb.sample_batch()
    assert S.shape == (17, 3)
    assert A.shape == (17, 3)
    assert R.shape == (17, 3)
    assert Sn.shape == (17, 3)
    assert D.shape == (17, 1)
Example #11
def test_store_many_3d_repeatedly_and_sample():
    rb = RB.ReplayBuffer(3, 3, 100, reward_steps=3)
    for t in range(50):

        S, A, R, Sn = [np.random.randn(10, 3) for i in range(4)]
        D = np.random.randn(10, 1)
        rb.store_many(S, A, R, Sn, D)
    S, A, R, Sn, D = rb.sample_batch()
    assert S.shape == (64, 3)
    assert A.shape == (64, 3)
    assert R.shape == (64, 3)
    assert Sn.shape == (64, 3)
    assert D.shape == (64, 1)
Example #12
    def train(self):
        os.makedirs(self.config.results_path, exist_ok=True)

        # Initialize workers
        training_worker = trainer.Trainer(copy.deepcopy(self.muzero_weights),
                                          self.config)
        shared_storage_worker = shared_storage.SharedStorage(
            copy.deepcopy(self.muzero_weights),
            self.game_name,
            self.config,
        )
        replay_buffer_worker = replay_buffer.ReplayBuffer(self.config)
        # Pre-load buffer if pulling from persistent storage
        if self.replay_buffer:
            for game_history_id in self.replay_buffer:
                replay_buffer_worker.save_game(
                    self.replay_buffer[game_history_id])
            print("\nLoaded {} games from replay buffer.".format(
                len(self.replay_buffer)))
        self_play_workers = [
            self_play.SelfPlay(
                copy.deepcopy(self.muzero_weights),
                self.Game(self.config.seed + seed),
                self.config,
            ) for seed in range(self.config.num_actors)
        ]

        # Launch workers
        [
            self_play_worker.continuous_self_play(shared_storage_worker,
                                                  replay_buffer_worker)
            for self_play_worker in self_play_workers
        ]
        training_worker.continuous_update_weights(replay_buffer_worker,
                                                  shared_storage_worker)

        # Save performance in TensorBoard
        print("Printing Logging info")
        self._logging_loop(shared_storage_worker, replay_buffer_worker)

        self.muzero_weights = shared_storage_worker.get_weights()
        self.replay_buffer = replay_buffer_worker.get_buffer()
        # Persist replay buffer to disk
        print("\n\nPersisting replay buffer games to disk...")
        pickle.dump(
            self.replay_buffer,
            open(os.path.join(self.config.results_path, "replay_buffer.pkl"),
                 "wb"),
        )
Example #13
    def __init__(self,
                 alpha,
                 beta,
                 input_dims,
                 tau,
                 env,
                 num_actions,
                 layer1_size,
                 layer2_size,
                 layer3_size,
                 layer4_size,
                 output_dir,
                 gamma=0.99,
                 batch_size=64,
                 max_size=100000):
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.tau = tau
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.memory = replay_buffer.ReplayBuffer(max_size, input_dims,
                                                 num_actions)
        print("max_size", max_size)
        self.her_memory = her.HERbuffer(max_size, input_dims, num_actions, env)

        self.actor = actor_nw.ActorNw('Actor', alpha, input_dims, layer1_size,
                                      layer2_size, layer3_size, layer4_size,
                                      num_actions, output_dir)

        self.critic = critic_nw.CriticNw('Critic', beta, input_dims,
                                         layer1_size, layer2_size, layer3_size,
                                         layer4_size, num_actions, output_dir)

        self.target_actor = actor_nw.ActorNw('TargetActor', alpha, input_dims,
                                             layer1_size, layer2_size,
                                             layer3_size, layer4_size,
                                             num_actions, output_dir)

        self.target_critic = critic_nw.CriticNw('TargetCritic', beta,
                                                input_dims, layer1_size,
                                                layer2_size, layer3_size,
                                                layer4_size, num_actions,
                                                output_dir)

        self.noise = noise.OUActionNoise(mu=np.zeros(num_actions))

        self.update_network_parameters(tau=1)
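
Note: update_network_parameters(tau=1) above hard-copies the online weights into the target networks at start-up; later calls with the stored tau perform a soft (Polyak) update. A minimal sketch of that update, assuming the networks expose get_weights()/set_weights() helpers (the actual ActorNw/CriticNw classes may use framework-specific assign ops instead):

    # Sketch only: get_weights()/set_weights() are assumed helpers, not this project's API.
    def update_network_parameters(self, tau=None):
        tau = self.tau if tau is None else tau
        for online, target in ((self.actor, self.target_actor),
                               (self.critic, self.target_critic)):
            # Polyak averaging: target <- tau * online + (1 - tau) * target
            mixed = [tau * w + (1.0 - tau) * tw
                     for w, tw in zip(online.get_weights(), target.get_weights())]
            target.set_weights(mixed)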
Example #14
def test_buffer_wrapping():
    N = 11
    rb = RB.ReplayBuffer(3, 3, 21, reward_steps=3)
    for t in range(2):
        S, A, R, Sn = [
            np.linspace((3, 4, 5), (8.5, 8.5, 8.5), N) for i in range(4)
        ]
        D = np.random.randn(N, 1)
        rb.store_many(S, A, R, Sn, D)

    S, A, R, Sn = rb.S1, rb.A1, rb.R1n, rb.Sn
    x = np.array((8.5, 8.5, 8.5))

    assert np.all(S[0, :] == x)
    assert np.all(A[0, :] == x)
    assert np.all(R[0, :] == x)
    assert np.all(Sn[0, :] == x)
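
Note: the RB.ReplayBuffer behaviour exercised by these tests (pre-allocated S1/A1/R1n/Sn/Done1 arrays, store_many with circular wrapping, sample_batch with a default batch size of 64) can be reproduced with a short NumPy buffer. The sketch below is only an illustration consistent with the assertions above, not the project's actual implementation:

import numpy as np

class ReplayBuffer:
    # Illustrative circular buffer matching the shapes asserted in the tests above.
    def __init__(self, state_dim, action_dim, size, reward_steps=1, batch_size=64):
        self.size, self.batch_size = size, batch_size
        self.S1 = np.zeros((size, state_dim))      # states
        self.A1 = np.zeros((size, action_dim))     # actions
        self.R1n = np.zeros((size, reward_steps))  # (n-step) rewards
        self.Sn = np.zeros((size, state_dim))      # next states
        self.Done1 = np.zeros((size, 1))           # done flags
        self.ptr, self.count = 0, 0

    def store_many(self, S, A, R, Sn, D):
        for s, a, r, sn, d in zip(S, A, R, Sn, D):
            self.S1[self.ptr], self.A1[self.ptr] = s, a
            self.R1n[self.ptr], self.Sn[self.ptr], self.Done1[self.ptr] = r, sn, d
            self.ptr = (self.ptr + 1) % self.size        # wrap around when full
            self.count = min(self.count + 1, self.size)

    def sample_batch(self):
        idx = np.random.randint(0, self.count, self.batch_size)
        return self.S1[idx], self.A1[idx], self.R1n[idx], self.Sn[idx], self.Done1[idx]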
Example #15
 def testSample(self):
   with self.test_session():
     buffer = replay_buffer.ReplayBuffer()
     for i in range(1000):
       buffer.add(np.array([i, 2*i]), np.array([i % 5,]), np.array([i]), False, np.array([2*i, 3*i]))
     num_samples = 32
     for j in range(50):
       old_states, actions, rewards, dones, new_states = buffer.sample(num_samples)
       reward_set = set()
       for s in range(num_samples):
         i = rewards[s][0]
         self.assertNotIn(i, reward_set)
         reward_set.add(i)
         self.assertFalse(dones[s])
         self.assertTrue((actions[s] == i % 5).all())
         self.assertTrue((old_states[s] == np.array([i, 2*i])).all())
         self.assertTrue((new_states[s] == np.array([2*i, 3*i])).all())
Example #16
    def __init__(self,
                 n_actions,
                 starter_learning_rate=0.000025,
                 gamma=0.99,
                 memory_size=50000,
                 batch_size=32,
                 n_explore=10000,
                 frame_per_action=4,
                 replace_target_iter=500):
        self.n_actions = n_actions
        self.gamma = gamma
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.n_explore = n_explore
        self.frame_per_action = frame_per_action
        self.replace_target_iter = replace_target_iter

        self.time_step = 0
        self.replay_memory = replay_buffer.ReplayBuffer(memory_size)

        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        self.lr = tf.train.exponential_decay(starter_learning_rate,
                                             self.global_step, 10000, 0.96)
        self.createNetwork()
        q_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                     scope='Q_network')
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                     scope='target_network')
        self.replace_target_op = [
            tf.assign(t, q) for t, q in zip(t_params, q_params)
        ]

        self.merged = tf.summary.merge_all()
        self.saver = tf.train.Saver()
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.sess.graph.finalize()

        ckpt = tf.train.get_checkpoint_state(SIGN)
        if ckpt and ckpt.model_checkpoint_path:
            self.saver.restore(self.sess, ckpt.model_checkpoint_path)
            print("Successfully loaded:", ckpt.model_checkpoint_path)
        else:
            print("Could not find old network weights")

        self.writer = tf.summary.FileWriter("logs/" + SIGN, self.sess.graph)
Example #17
 def __init__(self, env):
     self._buffer = replay_buffer.ReplayBuffer()
     self.env = env
     (self.critic_state_input, self.action_input,
      self.critic) = self._create_critic_model()
     [
         self.variable_summaries(x, "critic")
         for x in self.critic.trainable_weights
     ]
     (self.actor_state_input, self.actor) = self._create_actor_model()
     [
         self.variable_summaries(x, "actor")
         for x in self.actor.trainable_weights
     ]
     self.target_critic = Model.from_config(self.critic.get_config())
     self.target_critic.set_weights(self.critic.get_weights())
     [
         self.variable_summaries(x, "target_critic")
         for x in self.target_critic.trainable_weights
     ]
     self.target_actor = Model.from_config(self.actor.get_config())
     self.target_actor.set_weights(self.actor.get_weights())
     [
         self.variable_summaries(x, "target_actor")
         for x in self.target_actor.trainable_weights
     ]
     if FLAGS.load_model_from_file:
         self.critic.load_model(TARGET_CRITIC_FILE)
         self.actor.load_model(TARGET_ACTOR_FILE)
         self.critic.load_model(CRITIC_FILE)
         self.actor.load_model(ACTOR_FILE)
     self.epsilon_min = FLAGS.epsilon_min
     self.epsilon = FLAGS.epsilon
     self.epsilon_decay = FLAGS.epsilon_decay
     self.assign_weights = self.get_weight_assignment_op()
     self.sess = tf.Session()
     K.set_session(self.sess)
     self.summarize = None
     self.train_writer = tf.summary.FileWriter(
         "train_summaries/train_%d" % time.time(), self.sess.graph)
     self.action_grads = K.gradients(self.critic.outputs[0] / BATCH_SIZE,
                                     self.action_input)
     self.sess.run(tf.global_variables_initializer())
Example #18
    def __init__(self,
                 q_network=networks.q_func.QFunc,
                 policy_network=networks.policy.PolicyNet,
                 tau=0.005,
                 batch_size=256,
                 look_ahead=1,
                 look_behind=1,
                 gate_width=1,
                 gate_height=0.5,
                 gamma=0.9):
        super().__init__()
        self.q_net_1 = q_network()
        self.q_net_2 = q_network()

        self.target_1 = q_network()
        self.target_2 = q_network()

        self.policy_net = policy_network()
        self.replay_buffer = replay_buffer.ReplayBuffer()
        self.standard_normal = torch.distributions.Normal(0, 1)

        self.tau = tau
        self.batch_size = batch_size

        self.init_target()

        self.state_size = (1 + look_ahead + look_behind) * 3
        self.action_size = 2
        self.look_ahead = look_ahead
        self.look_behind = look_behind
        self.gate_width = gate_width
        self.gate_height = gate_height
        self.env = envs.race_traj.RaceTrajEnv(gate_height=gate_height,
                                              gate_width=gate_width)

        self.alpha = torch.abs(self.standard_normal.sample()) * -1
        self.gamma = gamma
        self.entropy_target = -2
Example #19
    def train(self):
        # Manage GPUs
        '''
        if 0 < self.num_gpus:
            num_gpus_per_worker = self.num_gpus / (
                self.config.train_on_gpu
                + self.config.num_workers * self.config.selfplay_on_gpu
                + log_in_tensorboard * self.config.selfplay_on_gpu
                + self.config.use_last_model_value * self.config.reanalyse_on_gpu
            )
            if 1 < num_gpus_per_worker:
                num_gpus_per_worker = math.floor(num_gpus_per_worker)
        else:
            num_gpus_per_worker = 0
        '''

        # Initialize Worker Threads
        for SP_worker_index in range(self.config.num_workers):
            self.self_play_workers.append(
                self_play.SelfPlay(self.checkpoint, self.Game, self.config,
                                   self.config.seed + SP_worker_index))
        self.training_worker = trainer.Trainer(self.checkpoint, self.config)

        self.replay_buffer_worker = replay_buffer.ReplayBuffer(
            self.checkpoint, self.replay_buffer, self.config)
        self.shared_storage_worker = shared_storage.SharedStorage(
            self.checkpoint, self.config)
        self.shared_storage_worker.set_info("terminate", False)
        #Launch Workers
        play_thread = threading.Thread(
            target=self.self_play_workers[0].continuous_self_play,
            args=(self.shared_storage_worker, self.replay_buffer_worker))
        train_thread = threading.Thread(
            target=self.training_worker.continuous_update_weights,
            args=(self.shared_storage_worker, self.replay_buffer_worker))
        play_thread.start()
        train_thread.start()
Example #20
def main():
	
	parser = argparse.ArgumentParser()
	parser.add_argument("--policy_name", default="TD3")							# Policy name
	parser.add_argument("--env_name", default="Pendulum-v0")					# OpenAI gym environment name
	parser.add_argument("--replay_buffer", default="prioritized")				# Replay Buffer type
	parser.add_argument("--replay_buffer_size", default=5e4, type=int)			# Replay Buffer capacity
	parser.add_argument("--replay_buffer_alpha", default=0.6, type=float)		# Replay Buffer prioritization weight
	parser.add_argument("--seed", default=0, type=int)							# Sets Gym, PyTorch and Numpy seeds
	parser.add_argument("--start_timesteps", default=1e4, type=int)				# How many time steps purely random policy is run for
	parser.add_argument("--eval_freq", default=1e3, type=float)					# How often (time steps) we evaluate
	parser.add_argument("--max_timesteps", default=5e4, type=float)				# Max time steps to run environment for
	parser.add_argument("--save_models", default="True", type=bool)				# Whether or not models are saved
	parser.add_argument("--expl_noise", default=0.1, type=float)				# Std of Gaussian exploration noise
	parser.add_argument("--batch_size", default=100, type=int)					# Batch size for both actor and critic
	parser.add_argument("--discount", default=0.99, type=float)					# Discount factor
	parser.add_argument("--tau", default=0.005, type=float)						# Target network update rate
	parser.add_argument("--policy_noise", default=0.2, type=float)				# Noise added to target policy during critic update
	parser.add_argument("--noise_clip", default=0.5, type=float)				# Range to clip target policy noise
	parser.add_argument("--policy_freq", default=2, type=int)					# Frequency of delayed policy updates
	parser.add_argument("--lr_actor", default=0.001, type=float)				# Learning rate of actor
	parser.add_argument("--lr_critic", default=0.001, type=float)				# Learning rate of critic
	parser.add_argument("--prioritized_replay_eps", default=1e-3, type=float)	# Replay Buffer epsilon (PRE)
	parser.add_argument("--prioritized_replay_beta0", default=0.4, type=float)	# Replay Buffer initial beta (PRE)
	args = parser.parse_args()

	# Training kwargs
	kwargs = {  "policy_name": args.policy_name,
				"env_name": args.env_name,
				"replay_buffer": args.replay_buffer,
				"replay_buffer_size": args.replay_buffer_size,
				"replay_buffer_alpha": args.replay_buffer_alpha,
				"seed": args.seed,
				"start_timesteps": args.start_timesteps,
				"eval_freq": args.eval_freq,
				"max_timesteps": args.max_timesteps,
				"save_models": args.save_models,
				"expl_noise": args.expl_noise,
				"batch_size": args.batch_size,
				"discount": args.discount,
				"tau": args.tau,
				"policy_noise": args.policy_noise,
				"noise_clip": args.noise_clip,
				"policy_freq": args.policy_freq,
				"lr_actor": args.lr_actor,
				"prioritized_replay_eps": args.prioritized_replay_eps,
				"prioritized_replay_beta0": args.prioritized_replay_beta0
         }

	# cls
	os.system('cls' if os.name == 'nt' else 'clear')

	if not os.path.exists("./results"):
    		os.makedirs("./results")
	if args.save_models and not os.path.exists("./pytorch_models"):
		os.makedirs("./pytorch_models")

	# Time stamp for repeated test names
	ts = time.time()
	ts = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H-%M-%S')

	test_name = "%s_%s_%s_%s" % (args.policy_name, args.env_name, str(args.seed), ts)
	plot_name = "%s_%s_%s_%s_plot.png" % (args.policy_name, args.env_name, str(args.seed), ts)
	kwargs_name = "%s_%s_%s_%s_kwargs.csv" % (args.policy_name, args.env_name, str(args.seed), ts)
	scores_name = "%s_%s_%s_%s_scores.csv" % (args.policy_name, args.env_name, str(args.seed), ts)

	print("---------------------------------------")
	print("Settings: %s" % (test_name))
	utils.save_kwargs(kwargs, "./results/%s" % (kwargs_name))
	print("---------------------------------------")

	# Environment and Agent instantiation

	env = gym.make(args.env_name)

	# Set seeds
	env.seed(args.seed)
	torch.manual_seed(args.seed)
	np.random.seed(args.seed)
	
	state_dim = env.observation_space.shape[0]
	action_dim = env.action_space.shape[0] 
	max_action = float(env.action_space.high[0])

	# Instantiate Replay Buffer	
	if args.replay_buffer == "vanilla": 
		replay_buffer = rb.ReplayBuffer(size = args.replay_buffer_size)
		PER = False
	elif args.replay_buffer == "prioritized": 
		replay_buffer = rb.PrioritizedReplayBuffer(size = int(np.round(np.sqrt(args.replay_buffer_size))), 
												   alpha = args.replay_buffer_alpha)
		PER = True
		prioritized_replay_beta_iters = args.max_timesteps
		prioritized_replay_beta0 = args.prioritized_replay_beta0
		beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p = prioritized_replay_beta0,
                                       final_p = 1.0)

	# Instantiate policy
	if args.policy_name == "TD3": policy = TD3.TD3(state_dim, action_dim, max_action, args.lr_actor, args.lr_critic, PER, args.prioritized_replay_eps)
	elif args.policy_name == "DDPG": policy = DDPG.DDPG(state_dim, action_dim, max_action, args.lr_actor, args.lr_critic, PER, args.prioritized_replay_eps)

	# Evaluate untrained policy
	evaluations = [evaluate_policy(env, policy)] 

	# Training loop #######################################

	total_timesteps = 0
	timesteps_since_eval = 0
	episode_num = 0
	episode_rewards = []
	done = True 

	while total_timesteps < args.max_timesteps:
		
		if done: 

			if total_timesteps != 0: 
				print('Total T: {} Episode Num: {} Episode T: {} Reward: {}'.format(total_timesteps, episode_num, episode_timesteps, episode_reward))
				episode_rewards.append(episode_reward)
				
				# PER Beta scheduled update 
				if PER: beta = beta_schedule.value(total_timesteps)
				else: beta = 0.
				# Policy update step
				if args.policy_name == "TD3":
					policy.train(replay_buffer, episode_timesteps, args.batch_size, args.discount, args.tau, args.policy_noise, args.noise_clip, args.policy_freq, beta)
				else: 
					policy.train(replay_buffer, episode_timesteps, args.batch_size, args.discount, args.tau, beta)
			
			# Evaluate episode
			if timesteps_since_eval >= args.eval_freq:
				timesteps_since_eval %= args.eval_freq
				evaluations.append(evaluate_policy(env, policy))
				
				# save evaluation
				#if args.save_models: policy.save(test_name, directory="./pytorch_models")
				#np.save("./results/%s" % (test_name), evaluations) 
			
			# Reset environment
			obs = env.reset()
			done = False
			episode_reward = 0
			episode_timesteps = 0
			episode_num += 1 
		
		# Select action randomly or according to policy
		if total_timesteps < args.start_timesteps:
			action = env.action_space.sample()
		else:
			action = policy.select_action(np.array(obs))
			if args.expl_noise != 0: 
				action = (action + np.random.normal(0, args.expl_noise, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)

		# Perform action
		new_obs, reward, done, _ = env.step(action) 
		done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
		episode_reward += reward

		# Push experience into replay buffer
		experience = (obs, action, reward, new_obs, done_bool)
		replay_buffer.add(experience)

		obs = new_obs

		episode_timesteps += 1
		total_timesteps += 1
		timesteps_since_eval += 1
		
	# Final evaluation 
	evaluations.append(evaluate_policy(env, policy))
	
	# Save results
	if args.save_models: policy.save("%s" % (test_name), directory="./pytorch_models")
	#np.save("./results/%s" % (evaluations_file), evaluations)  
	#np.save("./results/%s" % ('rewards.txt'), episode_rewards) 
	utils.save_scores(episode_rewards, "./results/%s" % (scores_name))
	utils.plot(episode_rewards, "./results/%s" % (plot_name), 1)
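
Note: the LinearSchedule used for the PER beta above is the usual baselines-style helper that interpolates from initial_p to final_p over a fixed number of timesteps and then holds final_p. A minimal sketch, in case that dependency is not already available in the project:

class LinearSchedule:
	# Minimal sketch: linearly interpolate from initial_p to final_p over schedule_timesteps.
	def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.0):
		self.schedule_timesteps = schedule_timesteps
		self.initial_p = initial_p
		self.final_p = final_p

	def value(self, t):
		fraction = min(float(t) / self.schedule_timesteps, 1.0)
		return self.initial_p + fraction * (self.final_p - self.initial_p)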
Example #21
            args.name: {
                'autoencoder': {},
                'correlation': {},
                'policies': {}
            }
        }
    })

LOG.setup(os.path.join('.', args.run_dir, args.name))

env = SimpleGridworld()
dummy_env = SimpleGridworld()
net = IndepFeatureLearner(lmbda=args.lmbda,
                          learning_rate=args.learning_rate,
                          gpu_num=args.gpu_num)
buffer = replay_buffer.ReplayBuffer(10000)

visualization_freq = 10000
batch_size = args.batch_size


def run_training_step(buffer: replay_buffer.ReplayBuffer,
                      net: IndepFeatureLearner):
    positions, _, _, _, _ = buffer.sample(batch_size)
    s_list = []
    sp_list = []
    action_list = []
    for pos in positions:
        s = dummy_env.get_observation(pos)
        #actions = net.([s]) # [1, num_factors]
        sp = []
Example #22
 def __init__(self, hp):
     self.hp = hp
     self.memory = replay_buffer.ReplayBuffer(hp)
     self.agents = [ddpg.Agent(self.hp) for _ in range(self.hp.num_agents)]
     self.losses = (0., 0.)
Example #23
def main():

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    n_units = FLAGS.target_units if FLAGS.target_units is not None else 32
    hidden_sizes = [n_units for _ in range(FLAGS.target_layers)]
    policy_args = {'hidden_sizes': hidden_sizes}
    n_units = FLAGS.behavior_units if FLAGS.behavior_units is not None else 32
    hidden_sizes = [n_units for _ in range(FLAGS.behavior_layers)]
    behavior_args = {'hidden_sizes': hidden_sizes}

    true_value, _, avg_length, policy_args =\
        common.load_policy_args(FLAGS.restore_path, policy_args)

    if FLAGS.behavior_path is not None:
        _, _, _, behavior_args = common.load_policy_args(
            FLAGS.behavior_path, behavior_args)
    else:
        _, _, _, behavior_args = common.load_policy_args(
            FLAGS.restore_path, policy_args)

    policy_args['seed'] = FLAGS.seed
    behavior_args['seed'] = FLAGS.seed
    obs_len = FLAGS.obs_len
    print(policy_args, behavior_args)
    policy_str = 'Gaussian'
    if FLAGS.env == 'CartPole-v0':
        policy_str = 'boltzmann'
    distribution = rl.ReinforcementLearning(
        FLAGS.env,
        policy_str,
        max_path_length=FLAGS.max_path_length,
        scope=FLAGS.scope,
        policy_args=policy_args)
    behavior_dist = rl.ReinforcementLearning(
        FLAGS.env,
        policy_str,
        max_path_length=FLAGS.max_path_length,
        scope=FLAGS.behavior_scope,
        policy_args=behavior_args)

    if tf.gfile.Exists('%s.meta' % FLAGS.restore_path):
        distribution.policy.load_policy(FLAGS.restore_path)
    if FLAGS.behavior_path is not None:
        if tf.gfile.Exists('%s.meta' % FLAGS.behavior_path):
            behavior_dist.policy.load_policy(FLAGS.behavior_path)
    else:
        if tf.gfile.Exists('%s.meta' % FLAGS.restore_path):
            behavior_dist.policy.load_policy(FLAGS.restore_path)

    n_units = FLAGS.hidden_units if FLAGS.hidden_units is not None else 32
    hidden_sizes = [n_units for _ in range(FLAGS.hidden_layers)]
    mle_args = {
        'train_type': 'supervised',
        'seed': FLAGS.seed,
        'hidden_sizes': hidden_sizes,
        'entropy_coeff': FLAGS.entropy_coeff,
        'weight_decay': FLAGS.weight_decay,
        'act_fn': tf.nn.relu,
        'learning_rate': 1e-03
    }
    obs_space = distribution.env.observation_space
    act_space = distribution.env.action_space
    obs_space = common.elongate_space(obs_space, act_space, obs_len)
    if policy_str == 'Gaussian':
        policy_cls = policies.GaussianPolicy
        mle_args['learn_std'] = FLAGS.learn_std
    else:
        policy_cls = policies.ContinuousStateBoltzmannPolicy
    mle_policy = policy_cls(obs_space, act_space, scope='mle', **mle_args)
    learning_iters = FLAGS.num_iters
    batch_size = FLAGS.batch_size
    ope_paths = []
    validation_paths = []
    replay_buffer = rb.ReplayBuffer()
    eval_buffer = rb.ReplayBuffer()

    results = blackbox_results_pb2.FitResults()
    results.method_name = 'nn_%d_%d' % (FLAGS.hidden_layers, n_units)

    replay_buffer.empty()
    eval_buffer.empty()

    # Get true value for eval policy. Either from file or with MC eval.
    if true_value is None:
        # if we couldn't load true value or we are using a mixture policy
        true_value = 0.0
        num_true_trajs = max(10000, 10 * batch_size)
        length = 0.0
        for _ in range(num_true_trajs):
            path, G = distribution.sample()
            true_value += G
            length += len(path['rewards'])
        true_value /= num_true_trajs
        avg_length = length / num_true_trajs
    results.true_value = true_value
    print('Avg path length %f' % avg_length)
    print('True value %f' % true_value)

    def mse(x):
        return (x - true_value)**2

    # Collect paths with behavior policy
    for _ in range(batch_size):
        path, G = behavior_dist.sample()
        replay_buffer.add_path(path, G)
        ope_paths.append(path)

    # Off-policy evaluation with true behavior policy
    common.load_importance_weights(distribution.policy, ope_paths, obs_len=1)
    is_estimate, is_variance = eval_target_policy(ope_paths, weighted=False)
    is_mse = mse(is_estimate)
    results.density_estimate = is_estimate
    results.density_variance = is_variance
    results.density_mse = is_mse
    print('###################')
    print('True Value %f' % true_value)
    print_results('True IS', is_estimate, is_variance, is_mse)

    pct_batch = 0.2
    for _ in range(int(pct_batch * batch_size)):
        path, G = behavior_dist.sample()
        validation_paths.append(path)

    # Get eval data
    _, _, eval_obs, eval_acts = common.get_train_test_data(validation_paths,
                                                           split=0.0,
                                                           obs_len=obs_len)
    train_obs, train_acts, _, _ = common.get_train_test_data(ope_paths,
                                                             split=1.0,
                                                             obs_len=obs_len)
    policy_eval_paths = ope_paths

    inds = np.arange(len(train_obs))

    common.load_importance_weights(distribution.policy,
                                   policy_eval_paths,
                                   mle_policy,
                                   obs_len=obs_len)
    entropy = mle_policy.entropy(train_obs)
    print('Entropy %f' % entropy)
    train_loss = eval_pi_loss(mle_policy, train_obs, train_acts)
    eval_loss = eval_pi_loss(mle_policy, eval_obs, eval_acts)

    is_estimate, is_variance = eval_target_policy(policy_eval_paths,
                                                  weighted=False)
    is_mse = mse(is_estimate)

    print_results('RIS', is_estimate, is_variance, is_mse)
    print('Training Loss %f' % train_loss)
    print('Validation Loss %f' % eval_loss)
    print('Entropy %f' % entropy)
    print('###################')
    add_results(results, is_estimate, is_variance, is_mse, entropy, train_loss,
                eval_loss, 0)

    for itr in range(learning_iters):

        if FLAGS.mini_batch_size is not None:
            m = len(train_obs)
            inds = np.random.randint(m, size=FLAGS.mini_batch_size)

        obs_batch, acts_batch = train_obs[inds], train_acts[inds]

        # loss is training error, v_loss is validation error computed on
        # samples that we will also use in policy evaluation, eval_loss is
        # validation loss on samples that will not be used in
        # policy evaluation.
        mle_policy.supervised_update(obs_batch, acts_batch)
        train_loss = eval_pi_loss(mle_policy, train_obs, train_acts)
        eval_loss = eval_pi_loss(mle_policy, eval_obs, eval_acts)
        entropy = mle_policy.entropy(train_obs)

        if itr > 0 and itr % FLAGS.eval_freq == 0:
            common.load_importance_weights(distribution.policy,
                                           policy_eval_paths,
                                           mle_policy,
                                           obs_len=obs_len)
            is_estimate, is_variance = eval_target_policy(policy_eval_paths,
                                                          weighted=False)

            is_mse = mse(is_estimate)

            if itr % FLAGS.print_freq == 0:
                print('###################')
                print('Iteration %d' % itr)
                print_results('RIS', is_estimate, is_variance, is_mse)
                print('Training Loss %f' % train_loss)
                print('Validation Loss %f' % eval_loss)
                print('Entropy %f' % entropy)
                print('###################')
            add_results(results, is_estimate, is_variance, is_mse, entropy,
                        train_loss, eval_loss, itr)

    if FLAGS.result_file is not None:
        with open(FLAGS.result_file, 'wb') as w:
            w.write(results.SerializeToString())
Example #24
 def testAdd(self):
   with self.test_session():
     buffer = replay_buffer.ReplayBuffer()
     for i in range(1000):
       buffer.add(np.array([i, 2*i]), np.array([i % 5,]), np.array([i]), False, np.array([2*i, 3*i]))
     self.assertEqual(1000, buffer.size())
Example #25
 def testInit(self):
   with self.test_session():
     buffer = replay_buffer.ReplayBuffer()
Example #26
def test_init():
    rb = RB.ReplayBuffer(2, 1, 100)
    assert rb.S1.shape == (100, 2)
    assert rb.A1.shape == (100, 1)
Example #27
    def train(self):
        # ray.init()
        os.makedirs(self.config.results_path, exist_ok=True)

        # Initialize workers
        # training_worker = trainer.Trainer.options(
        #     num_gpus=1 if "cuda" in self.config.training_device else 0
        # ).remote(copy.deepcopy(self.muzero_weights), self.config)
        training_worker = trainer.Trainer(copy.deepcopy(self.muzero_weights),
                                          self.config)
        # shared_storage_worker = shared_storage.SharedStorage.remote(
        #     copy.deepcopy(self.muzero_weights), self.game_name, self.config,
        # )
        shared_storage_worker = shared_storage.SharedStorage(
            copy.deepcopy(self.muzero_weights),
            self.game_name,
            self.config,
        )
        # replay_buffer_worker = replay_buffer.ReplayBuffer.remote(self.config)
        replay_buffer_worker = replay_buffer.ReplayBuffer(self.config)
        # Pre-load buffer if pulling from persistent storage
        if self.replay_buffer:
            for game_history_id in self.replay_buffer:
                # replay_buffer_worker.save_game.remote(
                replay_buffer_worker.save_game(
                    self.replay_buffer[game_history_id])
            print("\nLoaded {} games from replay buffer.".format(
                len(self.replay_buffer)))
        self_play_workers = [
            # self_play.SelfPlay.remote(
            self_play.SelfPlay(
                copy.deepcopy(self.muzero_weights),
                self.Game(self.config.seed + seed),
                self.config,
            ) for seed in range(self.config.num_actors)
        ]

        # # Launch workers
        # [
        #     # self_play_worker.continuous_self_play.remote(
        #     self_play_worker.continuous_self_play(
        #         shared_storage_worker, replay_buffer_worker
        #     )
        #     for self_play_worker in self_play_workers
        # ]
        # # training_worker.continuous_update_weights.remote(
        # training_worker.continuous_update_weights(
        #     replay_buffer_worker, shared_storage_worker
        # )
        # # Save performance in TensorBoard
        # self._logging_loop(shared_storage_worker, replay_buffer_worker)

        while True:
            # play a game
            [
                self_play_worker.joe_self_play(shared_storage_worker,
                                               replay_buffer_worker)
                for self_play_worker in self_play_workers
            ]
            self._joe_logging(shared_storage_worker, replay_buffer_worker)
            training_worker.joe_update_weights(replay_buffer_worker,
                                               shared_storage_worker)
            info = shared_storage_worker.get_info()
            if info["training_step"] >= self.config.training_steps:
                break

        # self.muzero_weights = ray.get(shared_storage_worker.get_weights.remote())
        self.muzero_weights = shared_storage_worker.get_weights()
        # self.replay_buffer = ray.get(replay_buffer_worker.get_buffer.remote())
        self.replay_buffer = replay_buffer_worker.get_buffer()
        # Persist replay buffer to disk
        print("\n\nPersisting replay buffer games to disk...")
        pickle.dump(
            self.replay_buffer,
            open(os.path.join(self.config.results_path, "replay_buffer.pkl"),
                 "wb"),
        )
Example #28
    def __init__(self, id, config, session, type):
        self.id = id
        self.name = 'AGENT_' + type.upper() + '_' + str(id)
        self.session = session
        self.type = type

        # Extract relevant configuration:
        self.config = {}
        self.config['env_n_actions'] = config['env_n_actions']
        self.config['env_obs_dims'] = config['env_obs_dims']
        self.config['env_type'] = config['env_type']

        dqn_config_params = [
            'dqn_gamma', 'dqn_rm_init', 'dqn_rm_max', 'dqn_target_update',
            'dqn_batch_size', 'dqn_learning_rate', 'dqn_train_period',
            'dqn_adam_epsilon', 'dqn_epsilon_start', 'dqn_epsilon_final',
            'dqn_epsilon_steps', 'dqn_huber_loss_delta'
        ]

        for param in dqn_config_params:
            self.config[param] = config[param]

        self.epsilon = self.config['dqn_epsilon_start']
        self.epsilon_step_size = (self.config['dqn_epsilon_start'] - self.config['dqn_epsilon_final']) \
                                 / self.config['dqn_epsilon_steps']

        # Scoped names
        self.name_online = self.name + '/' + 'DQN_ONLINE'
        self.name_target = self.name + '/' + 'DQN_TARGET'

        self.obs, self.q_values, self.evaluation, self.latent_features = \
            self.build_model(self.name_online, self.config['env_n_actions'])

        self.obs_target, self.q_values_target, self.evaluation_target, self.latent_features_target = \
            self.build_model(self.name_target, self.config['env_n_actions'])

        trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           scope=self.name_online)
        trainable_vars_by_name = {
            var.name[len(self.name_online):]: var
            for var in trainable_vars
        }

        trainable_vars_t = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                             scope=self.name_target)
        trainable_vars_by_name_t = {
            var.name[len(self.name_target):]: var
            for var in trainable_vars_t
        }

        copy_ops = [
            target_var.assign(trainable_vars_by_name[var_name])
            for var_name, target_var in trainable_vars_by_name_t.items()
        ]
        self.update_target_weights = tf.group(*copy_ops)

        self.action, self.td_target, self.td_error, self.loss, self.grads_update = self.build_training_ops(
        )

        self.replay_memory = replay_buffer.ReplayBuffer(
            self.config['dqn_rm_max'])

        # --------------------------------------------------------------------------------------------------------------

        self.post_init_steps = 0
        self.training_steps = 0
        self.n_episode = 0

        self.sample_loss_mean = 0.0
        self.sample_n = 0.0

        self.self_loss_mean = 0.0
        self.self_loss_n = 0.0
        self.expert_loss_mean = 0.0
        self.expert_loss_n = 0.0
        self.ep_r_steps = 0

        self.rnd_rm = None
Example #29
            rpbuffer.add((s0, action, r1, terminal, s1))
            s0 = s1

    env.close()


def play(env, actor, games=20):
    for i in range(games):
        terminal = False
        s0 = env.reset()

        while not terminal:
            env.render()
            action = np.random.choice([0, 1])
            s0, _, terminal, _ = env.step(action)

    env.close()


if __name__ == "__main__":

    env = gym.make(ENV)
    actor = None
    rpbuffer = replay_buffer.ReplayBuffer(FRAME_SZ)

    if "-t" in sys.argv:
        train(env, actor, rpbuffer)

    if "-p" in sys.argv:
        play(env, actor)
Example #30
her = False
render = False



if __name__ == '__main__':
	env = gym.make(env_name)
	env.seed(0)
	random.seed(0)
	np.random.seed(0)
	
	# Make a directory to store the learned policies
	dirname = datetime.datetime.now().isoformat()
	os.mkdir(dirname)
	
	replay_buffer = replay_buffer.ReplayBuffer(buffer_size)
	sample_batch = replay_buffer.get_batch
	
	
	ddpg = ddpg.DDPG(env, replay_buffer, sample_batch, train_iter, gamma, tau,
		batch_size, n_train, n_episode)
		
		
	for epoch in range(n_epoch):
		print("Start training epoch", epoch)
		for cycle in range(n_cycles):
			for episode in range(n_episode):
				state = env.reset()
				state = np.concatenate((state['observation'], state['achieved_goal'], state['desired_goal']))
				tot_reward = 0
				ddpg.reset_noise()