Example #1
File: ai.py Project: ipa-maa/safety
 def __init__(self, state_shape, nb_actions, action_dim, reward_dim, history_len=1, gamma=.99,
              learning_rate=0.00025, epsilon=0.05, final_epsilon=0.05, test_epsilon=0.0,
              minibatch_size=32, replay_max_size=100, update_freq=50, learning_frequency=1,
              num_units=250, remove_features=False, use_mean=False, use_hra=True, rng=None):
     self.rng = rng
     self.history_len = history_len
     self.state_shape = [1] + state_shape
     self.nb_actions = nb_actions
     self.action_dim = action_dim
     self.reward_dim = reward_dim
     self.gamma = gamma
     self.learning_rate = learning_rate
     self.learning_rate_start = learning_rate
     self.epsilon = epsilon
     self.start_epsilon = epsilon
     self.test_epsilon = test_epsilon
     self.final_epsilon = final_epsilon
     self.minibatch_size = minibatch_size
     self.update_freq = update_freq
     self.update_counter = 0
     self.nb_units = num_units
     self.use_mean = use_mean
     self.use_hra = use_hra
     self.remove_features = remove_features
     self.learning_frequency = learning_frequency
     self.replay_max_size = replay_max_size
     self.transitions = ExperienceReplay(max_size=self.replay_max_size, history_len=history_len, rng=self.rng,
                                         state_shape=state_shape, action_dim=action_dim, reward_dim=reward_dim)
     self.networks = [self._build_network() for _ in range(self.reward_dim)]
     self.target_networks = [self._build_network() for _ in range(self.reward_dim)]
     self.all_params = flatten([network.trainable_weights for network in self.networks])
     self.all_target_params = flatten([target_network.trainable_weights for target_network in self.target_networks])
     self.weight_transfer(from_model=self.networks, to_model=self.target_networks)
     self._compile_learning()
     print('Compiled Model and Learning.')
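
The `weight_transfer` helper called at the end of Example #1 is not shown on this page. A minimal sketch of what it could look like, assuming the networks are Keras models held in two equally long lists (the actual implementation in ipa-maa/safety may differ):

    @staticmethod
    def weight_transfer(from_model, to_model):
        # Sketch: copy weights network-by-network from the online models
        # into the corresponding target models.
        for source, target in zip(from_model, to_model):
            target.set_weights(source.get_weights())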
Example #2
    def __init__(self, num_state, num_action, configDict, train=True):
        super(AlgoA2C, self).__init__(num_state,
                                      num_action,
                                      configDict,
                                      createResults=False)

        # parameters of Internal DRL algorithm:
        ## Memory:
        self.MEMORY_CAPACITY = 100000
        self.GAMMA = 0.95
        ## Deep network:
        self.MEMORY_BATCH_SIZE = 64  # number of samples drawn for one training step (could in principle be as large as MEMORY_CAPACITY)

        self.train = train
        if train:
            ## RL algorithm:
            ## Random selection proportion:
            self.MAX_EPSILON = 1.0
            self.MIN_EPSILON = 0.01
            self.LAMBDA = 0.005  # speed of decay
            self.epsilon = self.MAX_EPSILON
        else:
            self.epsilon = 0.0

        self.brain = Brain(num_state,
                           num_action,
                           configDict,
                           RL_GAMMA=self.GAMMA)

        self.memory = ExperienceReplay(self.MEMORY_CAPACITY)
        self.next_model(configDict)
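
`MAX_EPSILON`, `MIN_EPSILON` and `LAMBDA` ("speed of decay") are usually combined into an exponential schedule that shrinks the random-action proportion as experience accumulates. A hedged sketch of that schedule; the `steps` argument and the function name are assumptions, not part of the snippet:

    import math

    def decay_epsilon(agent, steps):
        # Exponential interpolation from MAX_EPSILON down to MIN_EPSILON;
        # LAMBDA controls how fast the exploration rate shrinks.
        agent.epsilon = agent.MIN_EPSILON + \
            (agent.MAX_EPSILON - agent.MIN_EPSILON) * math.exp(-agent.LAMBDA * steps)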
Example #3
 def __init__(self, state_size, action_size, num_agents, \
              gamma=0.99, tau=1e-3, lr_actor=1e-3, lr_critic=1e-2, \
              buffer_size = 1e5, buffer_type = 'replay', policy_update = 1, \
              noise_init = 1.0, noise_decay=0.9995, min_noise=0.1):
     # General info
     self.state_size = state_size
     self.action_size = action_size
     self.num_agents = num_agents
     self.t_step = 0
     self.gamma = gamma
     # Actor Networks -- Policy-based
     self.actors = [
         DDPG_Actor(state_size, action_size, hidden_dims=(128, 128))
         for i in range(num_agents)
     ]
     self.actor_optimizers = [
         optim.Adam(actor.parameters(), lr=lr_actor)
         for actor in self.actors
     ]
     # targets
     self.target_actors = [
         DDPG_Actor(state_size, action_size, hidden_dims=(128, 128))
         for i in range(num_agents)
     ]
     # hard-copy each actor's weights into its corresponding target actor
     for i in range(num_agents):
         self.hard_update(self.actors[i], self.target_actors[i])
     # Critic Network -- Value-based --> in this approach we will use one common network for all the actors
     self.critic = DDPG_Critic(state_size,
                               action_size,
                               hidden_dims=(128, 128))
     self.target_critic = DDPG_Critic(state_size,
                                      action_size,
                                      hidden_dims=(128, 128))
     self.hard_update(self.critic, self.target_critic)
     self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                        lr=lr_critic)
     # How to update networks
     self.tau = tau
     self.policy_update = policy_update
     # Replay memory
     self.buffer_type = buffer_type
     self.memory = ExperienceReplay(action_size,
                                    int(buffer_size))  #ExperienceReplay
     self.per = PrioritizedExperienceReplay(capacity=int(buffer_size),
                                            alpha=0.6,
                                            beta=0.9,
                                            error_offset=0.001)
     # NormalNoiseStrategy
     self.normal_noise = NormalNoiseStrategy(noise_init=noise_init,\
                                             noise_decay=noise_decay,\
                                             min_noise_ratio = min_noise)
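
Example #3 calls `hard_update` (not shown) and keeps a `tau` for later target updates, which is the usual pair of DDPG target-network operations. A sketch of both, assuming PyTorch modules; the function names mirror the ones used in the snippet but are otherwise illustrative:

    def hard_update(local_model, target_model):
        # Copy the local weights into the target network verbatim (used at init).
        target_model.load_state_dict(local_model.state_dict())

    def soft_update(local_model, target_model, tau):
        # Polyak averaging: target <- tau * local + (1 - tau) * target.
        for t_param, l_param in zip(target_model.parameters(), local_model.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)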
Example #4
    def __init__(self, baseline, state_shape=[4], nb_actions=9, action_dim=1, reward_dim=1, history_len=1, gamma=.99,
                 learning_rate=0.00025, epsilon=0.05, final_epsilon=0.05, test_epsilon=0.0, annealing_steps=1000,
                 minibatch_size=32, replay_max_size=100, update_freq=50, learning_frequency=1, ddqn=False, learning_type='pi_b',
                 network_size='nature', normalize=1., device=None, kappa=0.003, minimum_count=0, epsilon_soft=0):

        self.history_len = history_len
        self.state_shape = state_shape
        self.nb_actions = nb_actions
        self.action_dim = action_dim
        self.reward_dim = reward_dim
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.start_learning_rate = learning_rate
        self.epsilon = epsilon
        self.start_epsilon = epsilon
        self.test_epsilon = test_epsilon
        self.final_epsilon = final_epsilon
        self.decay_steps = annealing_steps
        self.minibatch_size = minibatch_size
        self.network_size = network_size
        self.update_freq = update_freq
        self.update_counter = 0
        self.normalize = normalize
        self.learning_frequency = learning_frequency  # how often (in environment steps) a learning update is performed
        self.replay_max_size = replay_max_size
        self.transitions = ExperienceReplay(max_size=self.replay_max_size, history_len=history_len,
                                            state_shape=state_shape, action_dim=action_dim, reward_dim=reward_dim)
        self.ddqn = ddqn
        self.device = device
        self.network = self._build_network()
        self.target_network = self._build_network()
        self.weight_transfer(from_model=self.network, to_model=self.target_network)
        self.network.to(self.device)
        self.target_network.to(self.device)
        self.optimizer = optim.RMSprop(self.network.parameters(), lr=self.learning_rate, alpha=0.95, eps=1e-07)

        # SPIBB parameters
        self.baseline = baseline
        self.learning_type = learning_type
        self.kappa = kappa
        self.minimum_count = minimum_count
        self.epsilon_soft = epsilon_soft
        self.training_step = 0
        self.interaction_step = 0  # counts interactions with the environment (during training and evaluation)
        self.logger = None
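
`start_epsilon`, `final_epsilon` and `decay_steps` in Example #4 suggest a linear exploration schedule applied during training. A minimal sketch under that assumption (the function name is hypothetical):

    def anneal_epsilon(agent):
        # Linear annealing from start_epsilon to final_epsilon over decay_steps steps.
        decrement = (agent.start_epsilon - agent.final_epsilon) / agent.decay_steps
        agent.epsilon = max(agent.final_epsilon, agent.epsilon - decrement)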
Example #5
 def __init__(self, state_size, action_size, num_agents, seed, \
              gamma=0.99, tau=1e-3, lr_actor=1e-3, lr_critic=1e-2, \
              buffer_size = 10e5, buffer_type = 'replay', policy_update = 1):
     # General info
     self.state_size = state_size
     self.action_size = action_size
     self.num_agents = num_agents
     self.seed = random.seed(seed)
     self.t_step = 0
     self.gamma = gamma
     # Actor Network -- Policy-based
     self.actor = DDPG_Actor(state_size,
                             action_size,
                             hidden_dims=(128, 128),
                             seed=seed)
     self.target_actor = DDPG_Actor(state_size,
                                    action_size,
                                    hidden_dims=(128, 128),
                                    seed=seed)
     self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
     # Critic Network -- Value-based
     self.critic = DDPG_Critic(state_size,
                               action_size,
                               hidden_dims=(128, 128),
                               seed=seed)
     self.target_critic = DDPG_Critic(state_size,
                                      action_size,
                                      hidden_dims=(128, 128),
                                      seed=seed)
     self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                        lr=lr_critic)
     self.tau = tau
     # Replay memory
     self.buffer_type = buffer_type
     self.memory = ExperienceReplay(action_size,
                                    int(buffer_size))  #ExperienceReplay
     self.per = PrioritizedExperienceReplay(capacity=int(buffer_size),
                                            alpha=0.6,
                                            beta=0.9,
                                            error_offset=0.001)
     # NormalNoiseStrategy
     self.normal_noise = NormalNoiseStrategy()
     # Delayed Updates from TD3
     self.policy_update = policy_update
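
The `policy_update` parameter in Example #5 is labelled "Delayed Updates from TD3": the critic is trained on every learning step, while the actor is only updated every `policy_update`-th step. A hedged sketch of that branch; the `states` batch and the function name are assumptions about the surrounding training loop:

    def delayed_actor_update(agent, states):
        # TD3-style delayed policy update: skip the actor on most steps.
        if agent.t_step % agent.policy_update != 0:
            return
        # Standard DDPG actor loss: maximise the critic's value of the
        # actor's actions for the sampled states.
        actor_loss = -agent.critic(states, agent.actor(states)).mean()
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()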
Example #6
    def __init__(self,
                 osize,
                 asize,
                 seed,
                 buffersize=int(1e6),
                 gamma=0.99,
                 epsilon=0.05,
                 epsilondecay=1e6,
                 epsilonmin=0.1,
                 minibatchsize=128,
                 lr=0.01,
                 tau=0.01):
        """
        Initialize DQN agent parameters.
        """

        # initialize agent parameters
        self.osize = osize
        self.asize = asize
        self.gamma = gamma
        self.epsilon0 = epsilon
        self.epsilon = epsilon
        self.epsilondecay = epsilondecay
        self.epsilonmin = epsilonmin
        self.minibatchsize = minibatchsize
        self.lr = lr
        self.tau = tau
        self.stepcount = 0
        self.loss_log = []

        # set the random seed
        self.seed = torch.manual_seed(seed)

        # device used by the Q networks (not defined elsewhere in this snippet)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # create local and target Q networks
        self.Q = QNetwork(osize, asize).to(self.device)
        self.targetQ = QNetwork(osize, asize).to(self.device)

        # initialize optimizer
        self.optimizer = optim.Adam(self.Q.parameters(), lr=self.lr)

        # initialize experience replay
        self.replay = ExperienceReplay(asize, buffersize, minibatchsize, seed)
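
With `epsilon`, `asize` and the local `Q` network of Example #6, action selection is typically epsilon-greedy. A short sketch under those assumptions (function name and state handling are illustrative):

    import random
    import torch

    def select_action(agent, state):
        # Epsilon-greedy: random action with probability epsilon, greedy otherwise.
        if random.random() < agent.epsilon:
            return random.randrange(agent.asize)
        state_t = torch.as_tensor(state, dtype=torch.float32,
                                  device=agent.device).unsqueeze(0)
        with torch.no_grad():
            q_values = agent.Q(state_t)
        return int(q_values.argmax(dim=1).item())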
Example #7
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size=int(1e5),
                 batch_size=256,
                 learn_every=1,
                 update_every=1,
                 gamma=0.99,
                 tau=0.02,
                 lr_actor=2e-4,
                 lr_critic=2e-3,
                 random_seed=None,
                 use_asn=True,
                 asn_kwargs={},
                 use_psn=False,
                 psn_kwargs={},
                 use_per=False,
                 restore=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.update_every = update_every
        self.learn_every = learn_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        # Keep track of how many times we've updated weights
        self.i_updates = 0
        self.i_step = 0
        self.use_asn = use_asn
        self.use_psn = use_psn
        self.use_per = use_per

        if random_seed is not None:
            random.seed(random_seed)

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        if self.use_psn:
            self.actor_perturbed = Actor(state_size, action_size).to(device)
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)

        # restore networks if needed
        if restore is not None:
            checkpoint = torch.load(restore, map_location=device)
            self.actor_local.load_state_dict(checkpoint[0]['actor'])
            self.actor_target.load_state_dict(checkpoint[0]['actor'])
            if self.use_psn:
                self.actor_perturbed.load_state_dict(checkpoint[0]['actor'])
            self.critic_local.load_state_dict(checkpoint[0]['critic'])
            self.critic_target.load_state_dict(checkpoint[0]['critic'])

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic)

        # Hard copy weights from local to target networks
        policy_update(self.actor_local, self.actor_target, 1.0)
        policy_update(self.critic_local, self.critic_target, 1.0)

        # Noise process
        if self.use_asn:
            self.action_noise = OUNoise(action_size, **asn_kwargs)

        if self.use_psn:
            self.param_noise = ParameterSpaceNoise(**psn_kwargs)

        if self.use_per:
            self.buffer = PrioritizedExperienceReplay(buffer_size, batch_size,
                                                      random_seed)
        else:
            self.buffer = ExperienceReplay(buffer_size, batch_size,
                                           random_seed)
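
Because `use_per` switches Example #7 between `PrioritizedExperienceReplay` and plain `ExperienceReplay`, the learning path normally has to branch as well: only the prioritized buffer produces importance-sampling weights and needs its priorities refreshed. The sketch below is an assumption about that surrounding code; the buffer methods `sample` and `update_priorities` and the `learn` helper are not shown on this page:

    def sample_and_learn(agent):
        # Sketch only: the buffer and learn() interfaces are assumed.
        if len(agent.buffer) < agent.batch_size:
            return
        if agent.use_per:
            # PER also returns importance-sampling weights plus the indices
            # needed to update priorities once the TD errors are known.
            experiences, is_weights, indices = agent.buffer.sample()
            td_errors = agent.learn(experiences, is_weights)
            agent.buffer.update_priorities(indices, td_errors)
        else:
            agent.learn(agent.buffer.sample())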
Example #8
    def __init__(self,
                 state_shape,
                 nb_actions,
                 action_dim,
                 reward_dim,
                 history_len=1,
                 gamma=.99,
                 is_aggregator=True,
                 learning_rate=0.00025,
                 transfer_lr=0.0001,
                 final_lr=0.001,
                 annealing_lr=True,
                 annealing=True,
                 annealing_episodes=5000,
                 epsilon=1.0,
                 final_epsilon=0.05,
                 test_epsilon=0.001,
                 minibatch_size=32,
                 replay_max_size=100,
                 replay_memory_size=50000,
                 update_freq=50,
                 learning_frequency=1,
                 num_units=250,
                 remove_features=False,
                 use_mean=False,
                 use_hra=True,
                 rng=None,
                 test=False,
                 transfer_learn=False):
        self.test = test
        self.transfer_learn = transfer_learn

        self.rng = rng
        self.history_len = history_len
        # self.state_shape = [1] + state_shape  # it is unclear why this operation was needed
        self.state_shape = state_shape
        self.nb_actions = nb_actions
        self.action_dim = action_dim
        self.reward_dim = reward_dim
        self.gamma = gamma

        self.is_aggregator = is_aggregator
        self.agg_w = np.ones((self.reward_dim, 1, 1))

        self.qs = np.zeros((self.reward_dim, 1, self.nb_actions))
        self.agg_q = np.zeros((self.reward_dim, 1, self.nb_actions))
        self.merged_q = np.zeros((1, self.nb_actions))
        self.qs_list = []
        self.agg_q_list = []
        self.merged_q_list = []

        self.epsilon = epsilon
        self.start_epsilon = epsilon
        self.test_epsilon = test_epsilon
        self.final_epsilon = final_epsilon
        self.annealing = annealing
        self.annealing_episodes = annealing_episodes
        self.annealing_episode = (self.start_epsilon -
                                  self.final_epsilon) / self.annealing_episodes

        if not self.transfer_learn:
            self.learning_rate = learning_rate
            self.start_lr = learning_rate
        else:
            self.learning_rate = transfer_lr
            self.start_lr = transfer_lr
        self.final_lr = final_lr
        self.annealing_lr = annealing_lr
        self.annealing_episode_lr = (self.start_lr -
                                     self.final_lr) / self.annealing_episodes

        self.get_action_time_channel = np.zeros(4)
        self.get_max_a_time_channel = np.zeros(3)

        self.minibatch_size = minibatch_size
        self.update_freq = update_freq
        self.update_counter = 0
        self.nb_units = num_units
        self.use_mean = use_mean
        self.use_hra = use_hra
        self.remove_features = remove_features
        self.learning_frequency = learning_frequency
        self.replay_max_size = replay_max_size
        self.replay_memory_size = replay_memory_size

        self.transitions = ExperienceReplay(max_size=self.replay_max_size,
                                            history_len=history_len,
                                            rng=self.rng,
                                            state_shape=state_shape,
                                            action_dim=action_dim,
                                            reward_dim=reward_dim)

        # Build the networks
        self.networks = [self._build_network() for _ in range(self.reward_dim)]
        self.target_networks = [
            self._build_network() for _ in range(self.reward_dim)
        ]

        # Keep the parameters: flatten the layer weights of all reward_dim networks
        self.all_params = flatten(
            [network.trainable_weights for network in self.networks])
        self.all_target_params = flatten([
            target_network.trainable_weights
            for target_network in self.target_networks
        ])

        # Update the weights of the target networks.
        self.weight_transfer(from_model=self.networks,
                             to_model=self.target_networks)

        # Compile the networks: define the loss, etc.
        self._compile_learning()
        if not self.test:
            if self.transfer_learn:
                self.load_weights(
                    weights_file_path=
                    './learned_weights/init_weights_7chan/q_network_weights.h5'
                )
                print('Compiled Model. -- Transfer Learning -- ')
                print('learning rate: ' + str(self.learning_rate))
            else:
                print('Compiled Model. -- Learning -- ')

        else:
            # self.load_weights(weights_file_path='./results/test_weights/q_network_weights.h5')
            # self.load_weights(weights_file_path='./learned_weights/test_weights_7chan/q_network_weights.h5')
            self.load_weights(
                weights_file_path=
                './learned_weights/test_weights_7chan_8room/q_network_weights.h5'
            )

            print('Compiled Model and Load weights. -- Testing -- ')
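
The `annealing_episode` and `annealing_episode_lr` values computed in Example #8 are per-episode decrements, so applying them at the end of each episode yields linear schedules for the exploration rate and the learning rate. A minimal sketch (the function name is hypothetical; clamping of the learning rate is omitted):

    def anneal_per_episode(agent):
        # Linear per-episode schedules: after annealing_episodes episodes the
        # exploration rate reaches final_epsilon and the learning rate final_lr.
        if agent.annealing:
            agent.epsilon = max(agent.final_epsilon,
                                agent.epsilon - agent.annealing_episode)
        if agent.annealing_lr:
            agent.learning_rate -= agent.annealing_episode_lr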