Example #1
    def __init__(self, input_dim, output_dim, lr, gamma, seed_num=False):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.actions = range(output_dim)
        self.lr = lr
        self.gamma = gamma
        self.tau = 0.1
        self.seed_num = seed_num

        #For experience replay
        self.memory = []
        self.memory_size = 10000
        self.batchsize = 32

        #Actor & critic
        self.actor = Actor(input_dim, output_dim, self.lr)
        self.critic = Critic(input_dim, output_dim, self.lr, self.gamma)

        if seed_num != False:
            set_random_seed(seed_num)  #seed tensorflow
            seed(seed_num)  #seed numpy
Example #2
    def __init__(self, task):
        self.task = task
        # For quadcopter task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters
        
        # Score tracker and learning parameters
        self.best_score = -np.inf
        self.best_w_actor = None
        self.best_w_critic = None
        self.score = 0
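
Several of these agents construct an OUNoise(action_size, mu, theta, sigma) helper that is not shown on this page. Below is a minimal sketch of such an Ornstein-Uhlenbeck noise process, assuming the constructor signature used in Example #2; it is an illustration, not the code these repositories actually ship.

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as noise."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state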
Example #3
class Agent:
    def __init__(self, state_size, batch_size, is_eval = False):
        self.state_size = state_size #
        self.action_size = 3
        self.buffer_size = 1000000
        self.batch_size = batch_size
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        self.inventory = []
        self.is_eval = is_eval    
        self.gamma = 0.99 
        self.tau = 0.001 
        self.actor_local = Actor(self.state_size, self.action_size) 
        self.actor_target = Actor(self.state_size, self.action_size)
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)    
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())
        
    def act(self, state):
        # The actor outputs a probability distribution over the 3 actions
        options = self.actor_local.model.predict(state)
        self.last_state = state
        if not self.is_eval:
            # During training, sample stochastically for exploration
            return choice(range(3), p=options[0])
        # During evaluation, act greedily
        return np.argmax(options[0])
    
    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)
        # Roll the state forward regardless of whether learning happened
        self.last_state = next_state
            
    def learn(self, experiences):
        # Unpack the sampled experience tuples into batched arrays
        states = np.vstack([e.state for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.state_size)
        actions = np.vstack([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.state_size)
        # Bootstrapped Q-targets from the target networks
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)
        # Train the actor along the critic's action gradients
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])
        # Soft-update both target networks
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)
        
    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights)
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
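
The ReplayBuffer(buffer_size, batch_size) used above is likewise external to this page; learn() assumes every sampled experience exposes .state, .action, .reward, .next_state and .done attributes. A minimal sketch under those assumptions (illustrative only):

import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples random minibatches."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        return random.sample(self.memory, batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)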
Example #4
    def __init__(self, n_states, n_actions, lr_actor, lr_critic, tau, gamma,
                 mem_size, actor_l1_size, actor_l2_size, critic_l1_size,
                 critic_l2_size, batch_size):

        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(mem_size, n_states, n_actions)
        self.batch_size = batch_size

        self.actor = Actor(lr_actor, n_states, n_actions, actor_l1_size,
                           actor_l2_size)
        self.critic = Critic(lr_critic, n_states, n_actions, critic_l1_size,
                             critic_l2_size)

        self.target_actor = Actor(lr_actor, n_states, n_actions, actor_l1_size,
                                  actor_l2_size)
        self.target_critic = Critic(lr_critic, n_states, n_actions,
                                    critic_l1_size, critic_l2_size)

        self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma=0.005)

        self.update_network_parameters(tau=1)
Example #5
    def __init__(self,
                 state_space_dim,
                 action_space_dim,
                 min_action_val,
                 max_action_val,
                 hidden_layer_size=512,
                 gamma=0.99,
                 tau=0.0001,
                 path_to_load=None):
        self.gamma = gamma
        self.tau = tau
        self.min_action_val = min_action_val
        self.max_action_val = max_action_val
        self.buffer = Buffer(state_space_dim, action_space_dim)
        self.noise_generator = GaussianNoise(0., 0.2, action_space_dim)

        self.actor = Actor(state_space_dim, action_space_dim, max_action_val,
                           hidden_layer_size)
        self.critic = Critic(state_space_dim, action_space_dim,
                             hidden_layer_size)

        if path_to_load is not None:
            if os.path.exists(path_to_load + "_actor.h5") and \
                    os.path.exists(path_to_load + "_critic.h5"):
                self.load(path_to_load)

        self.target_actor = Actor(state_space_dim, action_space_dim,
                                  max_action_val, hidden_layer_size)
        self.target_critic = Critic(state_space_dim, action_space_dim,
                                    hidden_layer_size)

        self.target_actor.model.set_weights(self.actor.model.get_weights())
        self.target_critic.model.set_weights(self.critic.model.get_weights())

        critic_lr = 0.002
        actor_lr = 0.001

        self.critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
        self.actor_optimizer = tf.keras.optimizers.Adam(actor_lr)
Example #6
    def __init__(self, input_dim, output_dim, lr, gamma):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.actions = range(output_dim)
        self.lr = lr
        self.gamma = gamma
        self.route = []
        self.state = 'searching'
        self.idle_time = 0
        self.active_time = 0

        #These will store the samples from which the agent will learn
        self.states = []
        self.action_samples = []
        self.rewards = []

        #Make actor and critic
        self.actor = Actor(input_dim, output_dim, self.lr)
        self.critic = Critic(input_dim, output_dim, self.lr)

        self.train_actor = self.actor.optimizer()
        self.train_critic = self.critic.optimizer()
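
Agents like this one, which accumulate per-episode states, action_samples and rewards, typically convert the reward list into discounted returns before feeding the actor and critic training functions. A small illustrative helper (hypothetical, not taken from the example's repository):

import numpy as np


def discounted_returns(rewards, gamma):
    """Return G_t = r_t + gamma * G_{t+1}, computed backwards over one finished episode."""
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns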
Example #7
 def __init__(self, gamma=0.99):
     self.gamma = gamma
     # self.a_opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
     # self.c_opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
     self.a_opt = tf.keras.optimizers.Adam(learning_rate=7e-3)
     self.c_opt = tf.keras.optimizers.Adam(learning_rate=7e-3)
     self.f1 = tf.keras.layers.Dense(22, activation='relu')
     self.f2 = tf.keras.layers.Dense(22, activation='relu')
     self.sigma = tf.keras.layers.Dense(1, activation=None)
     self.mu = tf.keras.layers.Dense(1, activation=None)
     self.actor = Actor()
     self.critic = Critic()
     self.clip_pram = 0.2
Example #8
    def __init__(self, state_size, batch_size, is_eval=False):
        self.state_size = state_size
        self.action_size = 3
        self.buffer_size = 1000000
        self.batch_size = batch_size
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        self.inventory = []
        self.is_eval = is_eval

        self.gamma = 0.99
        self.tau = 0.001

        self.actor_local = Actor(self.state_size, self.action_size)
        self.actor_target = Actor(self.state_size, self.action_size)

        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
Example #9
    def __init__(self, input_dim, output_dim, lr, gamma, tau, clipnorm,
                 verbose):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.actions = range(output_dim)
        self.lr = lr
        self.gamma = gamma
        self.tau = tau

        #Buffer for experience replay
        self.S = []
        self.A = []
        self.R = []
        self.S1 = []
        self.D = []
        self.memory_size = 10**3

        #Make actor and critic
        self.actor = Actor(input_dim, output_dim, lr, gamma, tau, clipnorm,
                           verbose)
        self.critic = Critic(input_dim, output_dim, lr, gamma, tau, clipnorm,
                             verbose)
Example #10
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.01
        self.exploration_theta = 0.2
        self.exploration_sigma = 0.15
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 10000000
        self.batch_size = 128
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor
        self.tau = 0.1  # for soft update of target parameters
Example #11
    def __init__(self,
                 input_dim,
                 output_dim,
                 tau=0.001,
                 gamma=0.99,
                 train_batch_size=640):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.tau = tau
        self.gamma = gamma
        self.train_batch_size = train_batch_size
        self.main_critic = Critic(input_dim, output_dim, tau, gamma)
        self.target_critic = Critic(input_dim, output_dim, tau, gamma)

        self.main_actor = Actor(input_dim, output_dim, tau, gamma)
        self.target_actor = Actor(input_dim, output_dim, tau, gamma)

        self.target_critic.model.set_weights(
            self.main_critic.model.get_weights())
        self.target_actor.model.set_weights(
            self.main_actor.model.get_weights())

        self.memory = ReplayBuffer(batch_size=train_batch_size)
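
Examples #10 and #11 copy the local/main weights into the target networks once at construction; during training the same pairs are usually blended with the Polyak factor tau. A possible Keras-style helper for that step is sketched below (tau=1 reproduces the hard copy above); the name soft_update is an assumption, not an API from these repositories.

def soft_update(main_model, target_model, tau):
    """Blend target weights toward the main network: w_target <- tau * w_main + (1 - tau) * w_target."""
    new_weights = [tau * mw + (1.0 - tau) * tw
                   for mw, tw in zip(main_model.get_weights(),
                                     target_model.get_weights())]
    target_model.set_weights(new_weights)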
Example #12
def train(show_baseline=False, continue_train=False, model_save_path='best_model',
          learn_freq=5, memory_size=20000, memory_warmup_size=2000, batch_size=32,
          learning_rate=0.001, gamma=0.9, alpha=0.9, max_episode=1000):

    evaluate_env_list_path = 'env_list_set1'
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env()
    action_dim = 4
    obs_dim_1 = 45
    request_dim = 17
    obs_dim_2 = 10
    obs_dim = obs_dim_1 + obs_dim_2 * 7
    encoder = Encoder(input_size=request_dim, output_size=obs_dim_2, \
        use_rnn=False, use_gru=True, use_lstm=False)
    rpm = ReplayMemory(memory_size)  # experience replay buffer for DQN
    critic = Critic(obs_dim=obs_dim, action_dim=action_dim, encoder=encoder)
    agent = Agent(critic=critic,
                  obs_dim=obs_dim,
                  action_dim=action_dim,
                  lr=learning_rate,
                  gamma=gamma,
                  alpha=alpha)

    if continue_train:
        agent.load(model_save_path)

    # Pre-fill the replay buffer so that early training batches have enough sample diversity
    while len(rpm) < memory_warmup_size:
        run_episode(env, agent, rpm, memory_warmup_size, learn_freq,
                    batch_size)

    # start train
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; evaluation rounds are not counted
        # train part
        for i in range(0, 100):
            total_reward = run_episode(env, agent, rpm, memory_warmup_size,
                                       learn_freq, batch_size)
            episode += 1
        # for parameter in critic.parameters():
        #     print(parameter)
        #     break
        # test part
        # print(critic.parameters())
        eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
        print('episode:{}  Test reward:{}'.format(episode, eval_reward))
    agent.save(model_save_path)
Example #13
    def _build_graph(self):
        self.actor = Actor()
        self.critic = Critic()

        if self.mode == tf.estimator.ModeKeys.TRAIN:
            ave_ep_reward = tf.placeholder(tf.float32, name='ave_ep_reward')
            tf.summary.scalar('ave_ep_reward', ave_ep_reward)
            self.loss = ave_ep_reward
            global_step = tf.train.get_global_step()
            self.train_op = tf.assign_add(global_step, 1)
            self.training_hooks = [TrainingHook(self)]
        else:
            self.loss = tf.constant(1)
            self.evaluation_hooks = [EvalHook(self)]
Example #14
    def __init__(self, state_space, action_space, max_action, device):
        self.state_size = state_space.shape[0]
        self.action_size = action_space.shape[0]
        self.max_action = max_action
        self.device = device
        self.actor_local = Actor(state_space.shape, action_space.high.size,
                                 max_action)
        self.actor_target = Actor(state_space.shape, action_space.high.size,
                                  max_action)
        self.actor_optimizer = optimizers.Adam(LR_ACTOR)
        # let target be equal to local
        self.actor_target.set_weights(self.actor_local.get_weights())

        self.critic_local = Critic(state_space.shape, action_space.high.size)
        self.critic_target = Critic(state_space.shape, action_space.high.size)
        self.critic_optimizer = optimizers.Adam(LR_CRITIC)
        # let target be equal to local
        self.critic_target.set_weights(self.critic_local.get_weights())

        self.noise = OUNoise(self.action_size)
        self.memory = ReplayBuffer(BUFFER_SIZE)

        self.current_steps = 0
Example #15
 def __init__(self,
              act_dim,
              env_dim,
              act_range,
              k,
              buffer_size=10000,
              gamma=0.99,
              lr=0.001,
              tau=0.001):
     """ Initialization
     """
     # Environment and A2C parameters
     self.act_dim = act_dim
     self.act_range = act_range
     self.env_dim = (1, ) + (13, )
     self.gamma = gamma
     # Create actor and critic networks
     self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
     self.critic = Critic(self.env_dim, act_dim, lr, tau)
     # self.buffer = MemoryBuffer(buffer_size)
     self.buffer = deque(maxlen=buffer_size)
     self.count = 0
     self.buffer_size = buffer_size
Example #16
    def __init__(self, state_size, action_size, config, seed):
        """Initialize a DDPG agent
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            config (Config): configuration of the DDPG agent
            seed (int): random seed
        """

        self.gamma = config.gamma
        self.tau = config.tau
        self.seed = np.random.seed(seed)

        # actor networks
        self.actor_local = Actor(state_size, action_size, config.units_actor,
                                 seed).to(device)
        self.actor_target = Actor(state_size, action_size, config.units_actor,
                                  seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          config.lr_actor)

        # critic networks
        self.critic_local = Critic(state_size, action_size,
                                   config.units_critic, seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    config.units_critic, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           config.lr_critic)

        # Noise process
        self.noise = OUNoise(action_size, seed, config.mu, config.theta,
                             config.sigma)

        # Replay Buffer
        self.memory = ReplayBuffer(config.buffer_size, config.batch_size, seed)
Example #17
    def __init__(self,
                 num_agents=8,
                 env_name='LunarLanderContinuous-v2',
                 network='mlp',
                 num_steps=32):
        # set up environment, observation memory
        self.num_agents = num_agents
        self.num_steps = num_steps
        self.network = network

        temp_env = gym.make(env_name)
        self.obs_space_size = temp_env.observation_space.shape[0]

        self.memory = None

        # Initialize model, loss and optimizer
        self.actor = Actor(temp_env, network)
        self.critic = Critic()
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
        self.mse = tf.keras.losses.MeanSquaredError()
        self.actor_loss = None
        self.critic_loss = None

        # instantiate variable to store recurrent states of the agents
        self.agents_recurrent_state = None
        self.update_recurrent_state = None

        # store action distribution during update
        self.action_dist = None

        # Set up checkpoint paths
        self.checkpoint_directory_a = f"./training_checkpoints/{self.network}/actor"
        self.checkpoint_directory_c = f"./training_checkpoints/{self.network}/critic"

        # instantiate multiple agents (ray actors) and set first one as chief
        self.agent_list = [
            A2CAgent.remote(self.num_steps, env_name)
            for _ in range(num_agents)
        ]
        self.agent_list[0].set_chief.remote()

        # Prepare Tensorboard
        current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
        train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
        self.train_summary_writer = tf.summary.create_file_writer(
            train_log_dir)

        self.step = 0
Example #18
 def test_get_gradient(self):
     image_shape = (256, 1, 28, 28)
     device = 'cuda'
     z_dim = 64
     gen = Generator(z_dim).to(device)
     crit = Critic().to(device)
     real = torch.randn(*image_shape, device=device) + 1
     fake = torch.randn(*image_shape, device=device) - 1
     epsilon_shape = [1 for _ in image_shape]
     epsilon_shape[0] = image_shape[0]
     epsilon = torch.rand(epsilon_shape, device=device).requires_grad_()
     gradient = get_gradient(crit, real, fake, epsilon)
     self.assertEqual(tuple(gradient.shape), image_shape)
     self.assertGreater(gradient.max(), 0)
     self.assertLess(gradient.min(), 0)
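
The get_gradient under test here is not shown on this page. A common implementation in WGAN-GP gradient-penalty code interpolates real and fake images with epsilon and differentiates the critic's scores with respect to that mixture; the sketch below is consistent with the assertions above but is offered only as an illustration.

import torch


def get_gradient(crit, real, fake, epsilon):
    # Interpolate real and fake images with the (broadcast) epsilon weights
    mixed_images = real * epsilon + fake * (1 - epsilon)
    # Score the interpolated images with the critic
    mixed_scores = crit(mixed_images)
    # Gradient of the scores with respect to the interpolated images
    gradient = torch.autograd.grad(
        outputs=mixed_scores,
        inputs=mixed_images,
        grad_outputs=torch.ones_like(mixed_scores),
        create_graph=True,
        retain_graph=True,
    )[0]
    return gradient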
Example #19
 def __init__(self,
              state_dim,
              action_dim,
              final_activation=tf.nn.tanh,
              action_bound=0.4,
              training_batch_size=32,
              GAMMA=0.95,
              lr=0.001,
              replay_buffer_size=1024):
     self.ID = random_string(10)
     self.state_dim = state_dim
     self.action_dim = action_dim
     self.final_activation = final_activation
     self.action_bound = action_bound
     self.GAMMA = GAMMA
     self.lr = lr
     self.replay_buffer_size = replay_buffer_size
     self.replay_buffer = ReplayBuffer(replay_buffer_size)
     self.training_batch_size = training_batch_size
     with tf.variable_scope(self.ID) as scope:
         self.actor = Actor(self.state_dim, self.action_dim,
                            self.action_bound, self.lr,
                            self.final_activation)
         self.critic = Critic(self.state_dim, self.action_dim, self.lr)
Example #20
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
Example #21
    def __init__(self, state_size, action_size, max_action, minibatch_size, a_lr, c_lr, gamma, tau):
        self.state_size = state_size
        self.action_size = action_size
        self.max_action = max_action
        
        self.critic_lr = c_lr
        self.actor_lr = a_lr

        self.actor_network = Actor(self.state_size, self.action_size, self.max_action, self.actor_lr)
        self.actor_target_network = Actor(self.state_size, self.action_size, self.max_action, self.actor_lr)
        self.critic_network = Critic(self.state_size, self.action_size, self.critic_lr)
        self.critic_target_network = Critic(self.state_size, self.action_size, self.critic_lr)

        self.actor_target_network.set_weights(self.actor_network.get_weights())
        self.critic_target_network.set_weights(self.critic_network.get_weights())

        self.critic_optimizer = optimizers.Adam(learning_rate=self.critic_lr)
        self.actor_optimizer = optimizers.Adam(learning_rate=self.actor_lr)

        self.replay_buffer = ReplayBuffer(100000)
        self.MINIBATCH_SIZE = minibatch_size
        self.GAMMA = tf.cast(gamma, dtype=tf.float64)
        self.TAU = tau
        self.noise = OUNoise(self.action_size)
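
Example #21 builds the optimizers but does not show the update step itself. The sketch below shows how such a TF2 agent typically applies them, assuming the Actor/Critic objects are callable Keras models and the critic takes [states, actions] as input; the function name and argument layout are illustrative assumptions.

import tensorflow as tf


def ddpg_update(actor, critic, target_actor, target_critic,
                actor_opt, critic_opt, gamma,
                states, actions, rewards, next_states, dones):
    # Critic update: regress Q(s, a) toward the bootstrapped target
    target_q = target_critic([next_states, target_actor(next_states)])
    y = rewards + gamma * (1.0 - dones) * target_q
    with tf.GradientTape() as tape:
        critic_loss = tf.reduce_mean(tf.square(y - critic([states, actions])))
    grads = tape.gradient(critic_loss, critic.trainable_variables)
    critic_opt.apply_gradients(zip(grads, critic.trainable_variables))

    # Actor update: ascend the critic's valuation of the actor's own actions
    with tf.GradientTape() as tape:
        actor_loss = -tf.reduce_mean(critic([states, actor(states)]))
    grads = tape.gradient(actor_loss, actor.trainable_variables)
    actor_opt.apply_gradients(zip(grads, actor.trainable_variables))
    return critic_loss, actor_loss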
Example #22
def main():
  with tf.Session() as sess:
    env = gym.make('Humanoid-v1')

    np.random.seed(123)
    env.seed(123)
    tf.set_random_seed(123)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    action_bounds = env.action_space.high

    actor = Actor(sess, state_size, action_size, action_bounds, actor_learning_rate, temperature)

    critic = Critic(sess, state_size, action_size, critic_learning_rate, temperature, actor.get_trainable_vars())

    train(sess, env, actor, critic)
Example #23
def train(show_baseline=False, continue_train=False, model_save_path='best_model',
          learn_freq=5, memory_size=20000, memory_warmup_size=2000, batch_size=32,
          learning_rate=0.001, gamma=0.9, alpha=0.9, max_episode=1000):

    evaluate_env_list_path = 'env_list_set1'
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env()
    action_dim = 4
    obs_dim = 45
    rpm = ReplayMemory(memory_size)  # experience replay buffer for DQN

    critic = Critic(obs_dim=obs_dim, action_dim=action_dim)
    agent = Agent(critic=critic,
                  obs_dim=obs_dim,
                  action_dim=action_dim,
                  lr=learning_rate,
                  gamma=gamma,
                  alpha=alpha)

    if continue_train:
        agent.load(model_save_path)

    # Pre-fill the replay buffer so that early training batches have enough sample diversity
    while len(rpm) < memory_warmup_size:
        run_episode(env, agent, rpm, memory_warmup_size, learn_freq,
                    batch_size)

    # start train
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; evaluation rounds are not counted
        # train part
        for i in range(0, 10):
            total_reward = run_episode(env, agent, rpm, memory_warmup_size,
                                       learn_freq, batch_size)
            episode += 1
        # for name, param in critic.state_dict().items():
        #     # name: str
        #     # param: Tensor
        #     print(param)
        # test part
        eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
        print('episode:{}  Test reward:{}'.format(episode, eval_reward))
    agent.save(model_save_path)
Example #24
def main():
    with tf.Session() as sess:
        env = gym.make('Pendulum-v0')
        np.random.seed(random_seed)
        tf.set_random_seed(random_seed)
        env.seed(random_seed)
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.shape[0]
        action_bound_high = env.action_space.high
        action_bound_low = env.action_space.low
        if action_bound_high == -action_bound_low:
            action_bounds = action_bound_high
        actor = Actor(sess, state_size, action_size, action_bounds,
                      actor_learning_rate, temperature)
        critic = Critic(sess, state_size, action_size, critic_learning_rate,
                        temperature, actor.get_trainable_vars())
        train(sess, env, actor, critic)
Example #25
    def __init__(self,
                 params,
                 tasks,
                 layers,
                 icm_layers,
                 build_graph=GRAPH_PATH):
        self.params = params
        self.curr_length = 1
        self.tasks = tasks

        actor_layers, critic_layers = layers

        self.actor_layers = actor_layers
        self.critic_layers = critic_layers
        self.icm_layers = icm_layers

        self.encoder = Encoder(params)
        self.icm = ICM(params, self.icm_layers, self.encoder)
        self.subpolicy = Subpolicy(params, self.actor_layers, self.encoder)
        self.critic = Critic(params, self.critic_layers, self.encoder)
        self.taskpolicy = Taskpolicy(self.subpolicy, self.critic, self.icm,
                                     params)

        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        tf.get_default_graph().finalize()
        if build_graph is not None:
            self.writer = tf.summary.FileWriter(build_graph,
                                                self.session.graph)

            filename = 'PARAMS/' + str(self.params['iteration']) + '.txt'
            if not os.path.exists(os.path.dirname(filename)):
                try:
                    os.makedirs(os.path.dirname(filename))
                except OSError as exc:  # Guard against race condition
                    if exc.errno != errno.EEXIST:
                        raise
            with open(filename, 'w') as file:
                for kw in self.params:
                    arg = self.params[kw]
                    if callable(arg):
                        file.write(str(kw) + ' ' + arg.__name__)
                    else:
                        file.write(str(kw) + ' ' + str(arg))
                    file.write("\n")
Example #26
class Agent:
    def __init__(self, state_size, action_size, buffer_size, minibatch_size,
                 action_high, action_low):
        sess = tf.Session()
        self.actor = Actor(sess, state_size, action_size, action_high,
                           action_low)
        self.critic = Critic(sess, state_size, action_size)
        self.noiser = OUnoise(action_size, action_high, action_low)
        self.buffer = Buffer(buffer_size)
        self.minibatch_size = minibatch_size
        self.action_high = action_high
        self.action_low = action_low
        self.training = False
        sess.run(tf.global_variables_initializer())

    def choose_action(self, state):
        state = np.array([state])
        action = self.actor.action(state)[0]
        action = action + self.noiser._noise()
        #clip
        for i, (high, low) in enumerate(zip(self.action_high,
                                            self.action_low)):
            if action[i] > high:
                action[i] = high
            elif action[i] < low:
                action[i] = low
        return action

    def train(self, transition):
        self.buffer.store(transition)
        if not self.training and len(
                self.buffer.transitions) == self.minibatch_size:
            self.training = True
        if self.training:
            minibatch = np.array(
                random.sample(self.buffer.transitions, self.minibatch_size))
            state_batch = np.vstack(minibatch[:, 0])
            action_batch = np.vstack(minibatch[:, 1])
            next_state_batch = np.vstack(minibatch[:, 2])
            reward_batch = np.vstack(minibatch[:, 3])
            done_batch = np.vstack(minibatch[:, 4])
            next_action_batch = self.actor.next_action(next_state_batch)
            q_target = reward_batch + (
                1 - done_batch) * gamma * self.critic.next_q_value(
                    next_state_batch, next_action_batch)
            self.critic.train(state_batch, action_batch, q_target)
            action_grad_batch = self.critic.action_grad(
                state_batch, self.actor.action(state_batch))
            self.actor.train(state_batch, action_grad_batch)
            self.critic.update_target()
            self.actor.update_target()
Example #27
    def __init__(self, env_name, num_threads, gamma=0.99, actor_learning_rate=0.001,
                 actor_batch_size=64, critic_learning_rate=0.01, entropy_beta=0.01,
                 critic_batch_size=16, critic_epochs=100, max_episodes_per_thread=100,
                 episode_to_train=4):
        self.envs = [gym.make(env_name).env for _ in range(num_threads)]
        if self.envs[0].observation_space.shape == ():
            input_shape = 1
        else:
            input_shape = self.envs[0].observation_space.shape[0]

        self.actor = Actor(actor_learning_rate, actor_batch_size, input_shape,
                           self.envs[0].action_space.n, entropy_beta)
        self.critic = Critic(critic_learning_rate, critic_batch_size,
                             critic_epochs, input_shape, 1)
        batch = Batch(self.actor, self.critic, batch_size=actor_batch_size)
        lock = Lock()
        self.threads = [
            Env_thread("thread" + str(i), lock, batch, self.envs[i],
                       self.actor, self.critic, gamma, max_episodes_per_thread,
                       episode_to_train) for i in range(num_threads)
        ]
Example #28
    def _build_update_op(self):
        global_step = tf.train.get_global_step()
        tf.assign_add(global_step, 1, name='global_step_add')

        # with tf.variable_scope('eval_net'):
        #     self.eval_critic = Critic(self.eval_actor.actions)
        #
        #     self.eval_actor.build_train_op(self.eval_critic.qa_value)

        with tf.variable_scope('target_net'):
            self.target_actor = Actor()
            self.target_critic = Critic(self.target_actor.actions)

        actor_update_op = [
            tf.assign(
                target_param,
                target_param * (1 - Config.train.TAU) +
                train_param * Config.train.TAU) for train_param, target_param
            in zip(self.eval_actor.params, self.target_actor.params)
        ]

        critic_update_op = [
            tf.assign(
                target_param,
                target_param * (1 - Config.train.TAU) +
                train_param * Config.train.TAU) for train_param, target_param
            in zip(self.eval_critic.params, self.target_critic.params)
        ]

        actor_init_op = [
            tf.assign(target_param,
                      train_param) for train_param, target_param in zip(
                          self.eval_actor.params, self.target_actor.params)
        ]

        critic_init_op = [
            tf.assign(target_param,
                      train_param) for train_param, target_param in zip(
                          self.eval_critic.params, self.target_critic.params)
        ]

        self.update_target_op = tf.group(actor_update_op + critic_update_op)
        self.init_target_op = tf.group(actor_init_op + critic_init_op)
Example #29
    def play_game(self):
        sess = tf.Session()

        # init states, add neigh-dim
        # order in states is important
        states = collections.defaultdict(list)
        for t in self.generator.Ts:
            for t_customer in t.customers_C:
                states[t.id].append(100)
            for t_customer in t.customers_CP:
                states[t.id].append(100)
            for t_customer in t.customers_M:
                states[t.id].append(100)
            for t_peer in t.peers_T:
                states[t.id].append(100)

            # reachable end-to-end throughput (all advertised are considered here)
            for destination in t.table:
                states[t.id].append(0)
            for destination in t.table:
                states[t.id].append(0)

        # create AC-model, define action set
        for i in self.Ns:
            # node i
            n_features = len(states[i.id])
            actor = Actor(sess, n_features, i.n_actions, i.id)
            critic = Critic(sess, n_features, i.id)
            i.set_rl_setting(actor, critic)
            sess.run(tf.global_variables_initializer())
        '''
            loop time as time epoch
        '''
        for t in self.MAX:
            # TODO: generate TF; handle flows that are not reachable, or assume all destinations are reachable

            # every node takes the actions
            actions = []
            for i in self.Ns:
                # node i
                s = np.array(states[i.id])
                actions.append(i.actor.choose_action(s))
Example #30
 def __init__(self,input_dim, output_dim, lr, gamma, loss_clipping, c1):
     self.input_dim = input_dim
     self.output_dim = output_dim
     self.actions = range(output_dim)  
     self.lr = lr
     self.gamma = gamma
     self.loss_clipping = loss_clipping  #for actor loss function
     self.c1 = c1   #weight for entropy term in actor loss function
     self.num_epochs = 10
     self.batchsize = 10
     
     #These will store the samples from which the agent will learn
     self.states = []
     self.actions = []
     self.pi_vecs = []
     self.rewards = []
     
     #Make actor and critic
     self.actor = Actor(input_dim,output_dim,lr,gamma,loss_clipping,c1)
     self.critic = Critic(input_dim,output_dim, self.lr)
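
Example #30 parameterizes a PPO-style actor with loss_clipping and an entropy weight c1, but the loss itself lives inside the Actor class and is not shown. For reference, here is a NumPy sketch of the clipped surrogate objective those parameters usually control (to be maximized; purely illustrative, with pi_new/pi_old standing for stored policy vectors such as pi_vecs above and actions_onehot for one-hot encoded actions).

import numpy as np


def ppo_clipped_objective(pi_new, pi_old, actions_onehot, advantages, loss_clipping, c1):
    """Clipped PPO surrogate plus an entropy bonus weighted by c1."""
    # Probability each policy assigned to the action actually taken
    prob_new = np.sum(pi_new * actions_onehot, axis=-1)
    prob_old = np.sum(pi_old * actions_onehot, axis=-1)
    ratio = prob_new / (prob_old + 1e-10)
    clipped = np.clip(ratio, 1.0 - loss_clipping, 1.0 + loss_clipping)
    surrogate = np.minimum(ratio * advantages, clipped * advantages)
    # Entropy of the new policy distribution encourages exploration
    entropy = -np.sum(pi_new * np.log(pi_new + 1e-10), axis=-1)
    return np.mean(surrogate) + c1 * np.mean(entropy)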