Example #1
    def __init__(self, actor_id, config, dev, shared_state, shared_queue, eps):
        #        self.env = suite.load(domain_name="walker", task_name="run")
        #        self.action_size = self.env.action_spec().shape[0]
        #        self.obs_size = get_obs(self.env.reset().observation).shape[1]

        self.env = env_cover(config, dev)
        self.num_env = config['num_envs']
        self.shared_queue = shared_queue
        self.shared_state = shared_state
        self.dev = dev

        self.actor_id = actor_id
        self.burn_in_length = config['burn_in_length']  # 40-80
        self.learning_length = config['learning_length']
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = config['n_step']
        self.sequence = []
        self.recurrent_state = []
        self.priority = []
        self.td_loss = deque(maxlen=self.learning_length)
        #        self.memory_sequence_size = 1000
        #        self.memory = ReplayMemory(memory_sequence_size=self.memory_sequence_size)
        #        self.memory_save_interval = 3
        self.max_frame = config['actor_max_frame']
        self.gamma = config['gamma']
        #        self.actor_parameter_update_interval = config['actor_parameter_update_interval']
        self.max_shared_q_size = config['max_shared_q_size']

        self.model_path = './'
        self.memory_path = './'

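        # Local actor/critic networks and their target copies for this actor process.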
        self.actor = ActorNet(dev, config).to(self.dev)
        self.target_actor = ActorNet(dev, config).to(self.dev)
        self.critic = CriticNet(dev, config).to(self.dev)
        self.target_critic = CriticNet(dev, config).to(self.dev)

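        # Sync the local copies with the parameters currently held in shared_state.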
        self.actor.load_state_dict(self.shared_state["actor"].state_dict())
        self.target_actor.load_state_dict(
            self.shared_state["target_actor"].state_dict())
        self.critic.load_state_dict(self.shared_state["critic"].state_dict())
        self.target_critic.load_state_dict(
            self.shared_state["target_critic"].state_dict())

        #        self.actor.load_state_dict(self.shared_state["actor"])
        #        self.target_actor.load_state_dict(self.shared_state["target_actor"])
        #        self.critic.load_state_dict(self.shared_state["critic"])
        #        self.target_critic.load_state_dict(self.shared_state["target_critic"])
        self.action_argmax = config['action_argmax']

        #        self.load_model()
        self.epsilon = eps
Example #2
    def load_model(self):
        if os.path.isfile(self.model_path + 'model.pt'):
            # Retry until the checkpoint can be read; another process may
            # still be writing it.
            while True:
                try:
                    # TODO: Delete
                    self.actor = ActorNet(self.obs_size, self.action_size,
                                          self.actor_id % 2 + 1).cuda().eval()
                    self.target_actor = deepcopy(self.actor)
                    self.critic = CriticNet(self.obs_size, self.action_size,
                                            self.actor_id % 2 + 1).cuda().eval()
                    self.target_critic = deepcopy(self.critic)
                    #model_dict = torch.load(self.model_path + 'model.pt', map_location={'cuda:0':'cuda:{}'.format(self.actor_id%2+1)})
                    model_dict = torch.load(self.model_path + 'model.pt')
                    self.actor.load_state_dict(model_dict['actor'])
                    self.target_actor.load_state_dict(
                        model_dict['target_actor'])
                    self.critic.load_state_dict(model_dict['critic'])
                    self.target_critic.load_state_dict(
                        model_dict['target_critic'])
                    self.actor.cuda(self.actor_id % 2 + 1)
                    self.target_actor.cuda(self.actor_id % 2 + 1)
                    self.critic.cuda(self.actor_id % 2 + 1)
                    self.target_critic.cuda(self.actor_id % 2 + 1)
                except Exception:
                    sleep(np.random.rand() * 5 + 2)
                else:
                    break
Example #3
    def __init__(self, actor_id):
        self.env = suite.load(domain_name="walker", task_name="run")
        self.action_size = self.env.action_spec().shape[0]
        self.obs_size = get_obs(self.env.reset().observation).shape[1]

        self.actor_id = actor_id
        self.burn_in_length = 20  # 40-80
        self.learning_length = 40
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = 5
        self.sequence = []
        self.recurrent_state = []
        self.priority = []
        self.td_loss = deque(maxlen=self.learning_length)
        self.memory_sequence_size = 1000
        self.memory = ReplayMemory(
            memory_sequence_size=self.memory_sequence_size)
        self.memory_save_interval = 3

        self.gamma = 0.997
        self.actor_parameter_update_interval = 500
        self.model_path = './model_data/'
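        # Each actor pins its networks to GPU 1 or 2, chosen by its actor id.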
        self.actor = ActorNet(self.obs_size,
                              self.action_size,
                              cuda_id=self.actor_id % 2 +
                              1).cuda(self.actor_id % 2 + 1).eval()
        self.target_actor = deepcopy(self.actor)
        self.critic = CriticNet(self.obs_size,
                                self.action_size,
                                cuda_id=self.actor_id % 2 +
                                1).cuda(self.actor_id % 2 + 1).eval()
        self.target_critic = deepcopy(self.critic)
        self.load_model()
        self.epsilon = 1
        self.last_obs = None
Example #4
    def reset(self):
        self.action_space = self.env.action_space
        obs_space = self.env.observation_space.spaces
        obs_len = obs_space['observation'].shape[0]
        goal_len = obs_space['desired_goal'].shape[0]
        self.state_size = obs_len + goal_len
        self.actions_size = self.action_space.shape[0]
        max_action = float(self.env.action_space.high[0])

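        # Networks take the concatenated observation and desired goal as input.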
        self.actor = ActorNet(self.state_size, *self.config['net_sizes'],
                              self.actions_size, max_action)
        self.critic = CriticNet(self.state_size, *self.config['net_sizes'],
                                self.actions_size)
        self.actor_target = ActorNet(self.state_size,
                                     *self.config['net_sizes'],
                                     self.actions_size, max_action)
        self.critic_target = CriticNet(self.state_size,
                                       *self.config['net_sizes'],
                                       self.actions_size)
        self.actor_optim = Adam(self.actor.parameters(),
                                lr=self.config['learning_rate'])
        self.critic_optim = Adam(self.critic.parameters(),
                                 lr=self.config['learning_rate'])

        self.update(self.critic_target, self.critic, 1)
        self.update(self.actor_target, self.actor, 1)

        self.epsilon = self.config['epsilon']
        self.epsilon_decay = self.config['epsilon_decay']
        self.gamma = self.config['gamma']

        if self.config['PER']:
            self.memory = PrioritizedMemory(
                self.config['memory_size'], self.config["memory_alpha"],
                self.config["memory_epsilon"], self.config["memory_beta"],
                self.config["memory_beta_increment"])
        else:
            self.memory = ReplayBuffer(self.config['memory_size'])

        self.batch_size = self.config['batch_size']
        self.normalizer = Normalizer(obs_len, goal_len)
        # warm up the normalizer
        self.normalizer.observe(self.env.reset())
Example #5
    def __init__(self, state_size, action_size, num_agents,
                 hidden_actor, hidden_critic, lr_actor, lr_critic,
                 buffer_size, agent_id, use_PER=False, seed=0):

        super(DDPGAgent, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.agent_id = agent_id

        # Centralized critic: it sees the states and actions of all agents.
        self.actor_local = ActorNet(state_size, hidden_actor, action_size, seed=seed).to(device)
        self.critic_local = CriticNet(num_agents*state_size, num_agents*action_size, hidden_critic, 1, seed=seed).to(device)
        self.actor_target = ActorNet(state_size, hidden_actor, action_size, seed=seed).to(device)
        self.critic_target = CriticNet(num_agents*state_size, num_agents*action_size, hidden_critic, 1, seed=seed).to(device)

        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=0.) #weight_decay=1.e-5

        self.memory = ReplayBuffer(buffer_size, num_agents, state_size, action_size, use_PER)

        # initialize targets same as original networks
        hard_update(self.actor_target, self.actor_local)
        hard_update(self.critic_target, self.critic_local)
Example #6
    def __init__(self, n_actors):
        self.env = suite.load(domain_name="walker", task_name="run")
        self.n_actions = self.env.action_spec().shape[0]
        self.obs_size = get_obs(self.env.reset().observation).shape[1]

        self.n_actors = n_actors
        self.burn_in_length = 20  # 40-80
        self.learning_length = 40
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = 5
        self.memory_sequence_size = 5000000
        self.batch_size = 32
        self.memory = LearnerReplayMemory(
            memory_sequence_size=self.memory_sequence_size,
            batch_size=self.batch_size)

        self.model_path = './model_data/'
        self.memory_path = './memory_data/'
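        # Learner networks live on the GPU; the target copies are kept in eval mode.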
        self.actor = ActorNet(self.obs_size, self.n_actions, 0).cuda()
        self.target_actor = deepcopy(self.actor).eval()
        self.critic = CriticNet(self.obs_size, self.n_actions, 0).cuda()
        self.target_critic = deepcopy(self.critic).eval()
        self.model_save_interval = 50  # 50
        self.memory_update_interval = 50  # 50
        self.target_update_inverval = 500  # 100

        self.gamma = 0.997
        self.actor_lr = 1e-4
        self.critic_lr = 1e-3
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.actor_lr)
        self.actor_criterion = nn.MSELoss()
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=self.critic_lr)
        self.critic_criterion = nn.MSELoss()
        self.save_model()
Example #7
    def __init__(self, learner_id, config, dev, shared_state, shared_queue):

        self.action_size = config['action_space']
        self.obs_size = config['obs_space']

        self.shared_queue = shared_queue
        self.shared_state = shared_state

        self.dev = dev
        self.id = learner_id
        self.burn_in_length = config['burn_in_length']  # 40-80
        self.learning_length = config['learning_length']
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = config['n_step']
        self.sequence = []
        self.recurrent_state = []
        self.priority = []
        self.td_loss = deque(maxlen=self.learning_length)

        self.gamma = config['gamma']
        #        self.actor_parameter_update_interval = config['actor_parameter_update_interval']

        self.actor = ActorNet(dev, config).to(self.dev)
        self.target_actor = ActorNet(dev, config).to(self.dev)
        self.critic = CriticNet(dev, config).to(self.dev)
        self.target_critic = CriticNet(dev, config).to(self.dev)

        self.actor.load_state_dict(self.shared_state["actor"].state_dict())
        self.target_actor.load_state_dict(
            self.shared_state["target_actor"].state_dict())
        self.critic.load_state_dict(self.shared_state["critic"].state_dict())
        self.target_critic.load_state_dict(
            self.shared_state["target_critic"].state_dict())

        #        self.actor.load_state_dict(self.shared_state["actor"])
        #        self.target_actor.load_state_dict(self.shared_state["target_actor"])
        #        self.critic.load_state_dict(self.shared_state["critic"])
        #        self.target_critic.load_state_dict(self.shared_state["target_critic"])

        self.learner_actor_rate = config['learner_actor_rate']

        self.num_actors = learner_id
        self.n_actions = 1
        self.max_frame = config['learner_max_frame']

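        # Sequence replay buffer the learner samples training batches from.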
        self.memory_sequence_size = config['memory_sequence_size']
        self.batch_size = config['batch_size']
        self.memory = LearnerReplayMemory(self.memory_sequence_size, config,
                                          dev)

        self.model_path = './'
        #        self.memory_path = './memory_data/'
        #        self.model_save_interval = 10 # 50
        self.learner_parameter_update_interval = config[
            'learner_parameter_update_interval']  # 50
        self.target_update_inverval = config['target_update_interval']  # 100

        self.gamma = config['gamma']
        self.actor_lr = config['actor_lr']
        self.critic_lr = config['critic_lr']
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.actor_lr)
        self.actor_criterion = nn.MSELoss()
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=self.critic_lr)
        self.critic_criterion = nn.MSELoss()
Example #8
    num_processes = config['num_processes']
    use_cuda = torch.cuda.is_available()
    dev_cpu = torch.device('cpu')
    dev_gpu = torch.device('cuda' if use_cuda else 'cpu')

    #    manager = mp.Manager()
    #    shared_state = manager.dict()
    #    shared_queue = manager.Queue()

    shared_queue = mp.Queue()

    #    shared_queue = queue.Queue()
    shared_state = dict()

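    # The shared networks are placed in shared memory so all worker processes see in-place parameter updates.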
    shared_state["actor"] = ActorNet(dev_cpu, config).share_memory()
    shared_state["critic"] = CriticNet(dev_cpu, config).share_memory()
    shared_state["target_actor"] = ActorNet(dev_cpu, config).share_memory()
    shared_state["target_critic"] = CriticNet(dev_cpu, config).share_memory()
    #    shared_state["frame"] = mp.Array('i', [0 for i in range(num_processes)])
    #    shared_state["sleep"] = mp.Array('i', [0 for i in range(num_processes)])
    shared_state["update"] = mp.Array('i', [0 for i in range(num_processes)])

    #    shared_state["actor"] = ActorNet(config['obs_space'], config['action_space'],dev_cpu)
    #    shared_state["critic"] = CriticNet(config['obs_space'], config['action_space'],dev_cpu)
    #    shared_state["target_actor"] = ActorNet(config['obs_space'], config['action_space'],dev_cpu)
    #    shared_state["target_critic"] = CriticNet(config['obs_space'], config['action_space'],dev_cpu)
    #    shared_state["frame"] = [0 for i in range(num_processes)]
    #    shared_state["sleep"] = [0 for i in range(num_processes)]
    #    shared_state["update"]=False

Example #9
    def __init__(self,
                 env,
                 args,
                 e_decay=1,
                 e_min=0.05,
                 l2_decay=0.0001,
                 update_type="hard"):
        """
        Initialize a D4PG Agent.
        """

        self.device = args.device
        self.framework = "D4PG"
        self.eval = args.eval
        self.agent_count = env.agent_count
        self.actor_learn_rate = args.actor_learn_rate
        self.critic_learn_rate = args.critic_learn_rate
        self.batch_size = args.batch_size
        self.buffer_size = args.buffer_size
        self.action_size = env.action_size
        self.state_size = env.state_size
        self.C = args.C
        self._e = args.e
        self.e_decay = e_decay
        self.e_min = e_min
        self.gamma = args.gamma
        self.rollout = args.rollout
        self.tau = args.tau
        self.update_type = update_type

        self.num_atoms = args.num_atoms
        self.vmin = args.vmin
        self.vmax = args.vmax
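        # Support of the categorical value distribution used by the distributional critic.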
        self.atoms = torch.linspace(self.vmin, self.vmax,
                                    self.num_atoms).to(self.device)

        self.t_step = 0
        self.episode = 0

        # Set up memory buffers, currently only standard replay is implemented #
        self.memory = ReplayBuffer(self.device, self.buffer_size, self.gamma,
                                   self.rollout)

        #                    Initialize ACTOR networks                         #
        self.actor = ActorNet(args.layer_sizes, self.state_size,
                              self.action_size).to(self.device)
        self.actor_target = ActorNet(args.layer_sizes, self.state_size,
                                     self.action_size).to(self.device)
        self._hard_update(self.actor, self.actor_target)
        self.actor_optim = optim.Adam(self.actor.parameters(),
                                      lr=self.actor_learn_rate,
                                      weight_decay=l2_decay)

        #                   Initialize CRITIC networks                         #
        self.critic = CriticNet(args.layer_sizes, self.state_size,
                                self.action_size,
                                self.num_atoms).to(self.device)
        self.critic_target = CriticNet(args.layer_sizes, self.state_size,
                                       self.action_size,
                                       self.num_atoms).to(self.device)
        self._hard_update(self.critic, self.critic_target)
        self.critic_optim = optim.Adam(self.critic.parameters(),
                                       lr=self.critic_learn_rate,
                                       weight_decay=l2_decay)

        self.new_episode()