Example #1
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    target_net.load_state_dict(online_net.state_dict())
    online_net.share_memory()
    target_net.share_memory()

    optimizer = SharedAdam(online_net.parameters(), lr=lr)
    global_ep = mp.Value('i', 0)
    global_ep_r = mp.Value('d', 0.)
    res_queue = mp.Queue()

    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()

    workers = [
        Worker(online_net, target_net, optimizer, global_ep, global_ep_r,
               res_queue, i) for i in range(mp.cpu_count())
    ]
    for w in workers:
        w.start()
    res = []
    while True:
        r = res_queue.get()
        if r is not None:
            res.append(r)
            [ep, ep_r, loss] = r
            writer.add_scalar('log/score', float(ep_r), ep)
            writer.add_scalar('log/loss', float(loss), ep)
        else:
            break
    for w in workers:
        w.join()
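
This main() assumes a SharedAdam optimizer defined elsewhere in the repository. Below is a minimal sketch of the classic A3C shared-optimizer pattern such a class usually follows: Adam whose moment buffers are allocated eagerly and placed in shared memory so every worker process updates the same statistics. The class body, defaults, and method names are assumptions, not the repository's implementation.

import math

import torch
import torch.optim as optim


class SharedAdam(optim.Adam):
    """Sketch of an Adam whose state tensors live in shared memory."""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        super().__init__(params, lr=lr, betas=betas, eps=eps,
                         weight_decay=weight_decay)
        # Allocate Adam's per-parameter state eagerly so it can be shared.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        # Move the optimizer state into shared memory; the parameters
        # themselves are shared via model.share_memory() as in main() above.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    @torch.no_grad()
    def step(self, closure=None):
        # Re-implement the Adam update so the shared `step` counter is used.
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(p, alpha=group['weight_decay'])
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step'].item()
                bias_correction2 = 1 - beta2 ** state['step'].item()
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                p.addcdiv_(exp_avg, denom, value=-step_size)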
Example #2
class Actor:
    def __init__(self, actor_id, n_actors, shared_dict, device='cpu'):
        # params
        self.gamma = 0.99
        self.epsilon = 0.4 ** (1 + actor_id * 7 / (n_actors - 1))
        self.bootstrap_steps = 3
        self.alpha = 0.6
        self.priority_epsilon = 1e-6
        self.device = device
        self.actor_id = actor_id

        # path
        self.memory_path = os.path.join('./', 'logs', 'memory')

        # memory
        self.memory_size = 50000
        self.batch_size = 32
        self.action_repeat = 4
        self.n_stacks = 4
        self.burn_in_length = 10
        self.learning_length = 10
        self.overlap_length = 10
        self.eta = 0.9
        self.sequence_length = self.burn_in_length + self.learning_length
        self.stack_count = self.n_stacks // self.action_repeat
        self.memory_save_interval = 5
        self.episode_start_index = 0
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size, self.bootstrap_steps)

        # net
        self.shared_dict = shared_dict
        self.net_load_interval = 5
        self.net = QNet(self.device).to(self.device)
        self.target_net = QNet(self.device).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())

        # env
        self.env = PongEnv(self.action_repeat, self.n_stacks)
        self.episode_reward = 0
        self.n_episodes = 0
        self.n_steps = 0
        self.memory_count = 0
        self.state = self.env.reset()
    
    def run(self):
        while True:
            self.step()

    def step(self):
        state = self.state
        action, q_value, h, c, target_q_value, target_h, target_c = self.select_action(state)
        q_value = q_value.detach().cpu().numpy()
        target_q_value = target_q_value.detach().cpu().numpy()
        next_state, reward, done, _ = self.env.step(action)
        self.episode_reward += reward
        self.n_steps += 1

        self.n_steps_memory.add(q_value, state[-self.action_repeat:], h, c, target_h, target_c, action, reward, self.stack_count)
        if self.stack_count > 1:
            self.stack_count -= 1
        
        if self.n_steps > self.bootstrap_steps:
            pre_q_value, state, h, c, target_h, target_c, action, reward, stack_count = self.n_steps_memory.get()
            priority = self.calc_priority(pre_q_value, action, reward, q_value, target_q_value, done)
            self.replay_memory.add(state, h, c, target_h, target_c, action, reward, done, stack_count, priority)
            self.memory_count += 1
        self.state = next_state.copy()

        if done:
            while self.n_steps_memory.size > 0:
                pre_q_value, state, h, c, target_h, target_c, action, reward, stack_count = self.n_steps_memory.get()
                priority = self.calc_priority(pre_q_value, action, reward, q_value, target_q_value, done)
                self.replay_memory.add(state, h, c, target_h, target_c, action, reward, done, stack_count, priority)
                self.memory_count += 1
            self.reset()
    
    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_value, h, c = self.net(state, True)
            target_q_value, target_h, target_c = self.target_net(state, True)
        if np.random.random() < self.epsilon:
            action = np.random.randint(6)
        else:
            action = q_value.argmax().item()

        return action, q_value, h, c, target_q_value, target_h, target_c
    
    def reset(self):
        if self.n_episodes % 1 == 0:
            print('episodes:', self.n_episodes, 'actor_id:', self.actor_id, 'return:', self.episode_reward)

        self.net.reset()
        self.target_net.reset()
        self.set_seq_start_index()
        self.state = self.env.reset()
        self.episode_start_index = self.replay_memory.index
        self.episode_reward = 0
        self.n_episodes += 1
        self.n_steps = 0
        self.memory_count = 0
        self.stack_count = self.n_stacks // self.action_repeat

        # reset n_step memory
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)

        # save replay memory
        if self.n_episodes % self.memory_save_interval == 0:
            self.replay_memory.save(self.memory_path, self.actor_id)
            self.replay_memory = ReplayMemory(self.memory_size, self.batch_size, self.bootstrap_steps)
            self.episode_start_index = 0
            gc.collect()
        
        # load net
        if self.n_episodes % self.net_load_interval == 0:
            self.load_model()
    
    def load_model(self):
        try:
            self.net.load_state_dict(self.shared_dict['net_state'])
            self.target_net.load_state_dict(self.shared_dict['target_net_state'])
        except Exception as e:
            print('failed to load shared network weights:', e)

    def calc_priority(self, q_value, action, reward, next_q_value, target_next_q_value, done):
        q_value = q_value.reshape(-1)[action]
        target_next_q_value = target_next_q_value.reshape(-1)

        if done:
            target_q_value = reward
        else:
            next_action = next_q_value.argmax(-1)
            target_next_q_value = target_next_q_value[next_action]
            target_q_value = reward + (self.gamma**self.bootstrap_steps) * target_next_q_value
        priority = np.abs(q_value - target_q_value) + self.priority_epsilon
        priority = priority ** self.alpha
    
        return priority
    
    def set_seq_start_index(self):
        last_index = self.replay_memory.index
        start_index = self.episode_start_index

        seq_start_index = [i for i in range(start_index, last_index-self.sequence_length, self.overlap_length)]
        seq_start_index.append(last_index - self.sequence_length)
        seq_start_index = np.array(seq_start_index)
        self.replay_memory.update_sequence_priority(seq_start_index)
        self.replay_memory.memory['is_seq_start'][seq_start_index] = 1
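
The actor above relies on an NStepMemory that folds each new reward into the discounted returns of the transitions still waiting in the buffer. The real class also carries Q-values and recurrent states (h, c); the sketch below keeps only the core n-step bookkeeping and is an assumption about its behaviour, not the actual implementation.

from collections import deque


class NStepMemorySketch:
    """Sketch: n-step return folding only; the real NStepMemory also
    stores Q-values and recurrent states."""

    def __init__(self, bootstrap_steps, gamma):
        self.bootstrap_steps = bootstrap_steps
        self.gamma = gamma
        self.buffer = deque()

    @property
    def size(self):
        return len(self.buffer)

    def add(self, state, action, reward):
        # Fold the new reward into every pending transition's return:
        # the transition added k steps ago receives gamma**k * reward.
        for i in range(len(self.buffer)):
            s, a, r = self.buffer[i]
            self.buffer[i] = (s, a, r + self.gamma ** (len(self.buffer) - i) * reward)
        self.buffer.append((state, action, reward))

    def get(self):
        # Pop the oldest transition; when popped on schedule its reward is
        # the n-step return over `bootstrap_steps` subsequent rewards.
        return self.buffer.popleft()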
Example #3
            else:
                mask = 1

            memory.push(history, next_history, action, reward, mask)

            score += reward
            history = deepcopy(next_history)

            if steps > hp.initial_exploration:
                if epsilon > 0.1:
                    episode_len += 1
                    epsilon -= 0.0001

            if steps % hp.update_target == 0:
                update_target_model(model, target_model)

            if done:
                print('episode: ', episode, 'steps: ', steps, 'epsilon: ',
                      round(epsilon, 4), ' score: ', score)
                batch = memory.sample()
                for _ in range(episode_len):
                    train_model(model, target_model, batch, optimizer)
                break

        if episode % hp.save_freq == 0:
            score = int(score)
            directory = 'save_model/'
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save(model.state_dict(),
                       os.path.join(directory, str(score) + 'model.pt'))
Example #4
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    print('state size:', state_size)
    print('action size:', action_size)

    q_net = QNet(state_size, action_size, args)
    target_q_net = QNet(state_size, action_size, args)
    optimizer = optim.Adam(q_net.parameters(), lr=0.001)

    update_target_model(q_net, target_q_net)

    writer = SummaryWriter(args.logdir)

    replay_buffer = deque(maxlen=10000)
    running_score = 0
    steps = 0

    for episode in range(args.max_iter_num):
        done = False
        score = 0

        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if args.render:
                env.render()

            steps += 1

            q_values = q_net(torch.Tensor(state))
            action = get_action(q_values, action_size, args.epsilon)

            next_state, reward, done, _ = env.step(action)

            next_state = np.reshape(next_state, [1, state_size])
            reward = reward if not done or score == 499 else -1
            mask = 0 if done else 1

            replay_buffer.append((state, action, reward, next_state, mask))

            state = next_state
            score += reward

            if steps > args.initial_exploration:
                args.epsilon -= args.epsilon_decay
                args.epsilon = max(args.epsilon, 0.1)

                mini_batch = random.sample(replay_buffer, args.batch_size)

                q_net.train()
                target_q_net.train()
                train_model(q_net, target_q_net, optimizer, mini_batch)

                if steps % args.update_target == 0:
                    update_target_model(q_net, target_q_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score

        if episode % args.log_interval == 0:
            print(
                '{} episode | running_score: {:.2f} | epsilon: {:.2f}'.format(
                    episode, running_score, args.epsilon))
            writer.add_scalar('log/score', float(score), episode)

        if running_score > args.goal_score:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)

            ckpt_path = os.path.join(args.save_path, 'model.pth.tar')
            torch.save(q_net.state_dict(), ckpt_path)
            print('Running score exceeds {}, so training ends'.format(
                args.goal_score))
            break
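
get_action, update_target_model, and train_model are defined elsewhere in this repository. The sketches below only mirror how the first two are called above (epsilon-greedy selection and a hard target copy); treat them as assumptions rather than the repository's code.

import random


def get_action(q_values, action_size, epsilon):
    # Epsilon-greedy: random action with probability epsilon, else greedy.
    # Hedged sketch; the actual helper may differ.
    if random.random() <= epsilon:
        return random.randrange(action_size)
    return int(q_values.argmax().item())


def update_target_model(q_net, target_q_net):
    # Hard update: copy the online network's weights into the target network.
    target_q_net.load_state_dict(q_net.state_dict())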
Example #5
def main():

    if not os.path.isdir("logs"):
        os.makedirs("logs")

    if args.entropy and args.boltzmann:
        raise ValueError("Entropy and Boltzmann exploration cannot both be enabled.")

    print(args)

    working_dir = "logs/" + args.dir
    if not os.path.isdir(working_dir):
        os.mkdir(working_dir)

    env = QubeSwingupEnv(use_simulator=True)

    num_inputs = env.observation_space.shape[0]
    num_actions = NUMBER_OF_ACTIONS
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)

    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter(working_dir)

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory_With_TDError(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    beta = beta_start
    loss = 0
    training_started = False

    best_running_score = -1000

    for e in range(args.e):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        start_time = time.time()

        while not done:
            steps += 1
            action = get_action(state,
                                target_net,
                                epsilon,
                                use_entropy=args.entropy,
                                use_boltzmann=args.boltzmann)
            next_state, reward, done, info = env.step(
                get_continuous_action(action))

            reward = give_me_reward(info["alpha"], info["theta"])

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            action_one_hot = np.zeros(NUMBER_OF_ACTIONS)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

            if steps > initial_exploration:
                if not training_started:
                    print("---------------- training started ---------------")
                    training_started = True
                epsilon -= 0.000005
                epsilon = max(epsilon, 0.1)
                beta += 0.000005
                beta = min(1, beta)

                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights, device)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        end_time = time.time()
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print(
                '{} episode | score: {:.2f} | epsilon: {:.2f} | beta: {:.2f}'.
                format(e, running_score, epsilon, beta))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > best_running_score and args.save:
            torch.save(online_net.state_dict(),
                       working_dir + "/best_model.pth")
            best_running_score = running_score
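
The get_action used here supports entropy and Boltzmann exploration flags but is not shown. Below is a hedged sketch of what a Boltzmann (softmax) selector combined with epsilon-greedy could look like; the signature and behaviour are assumptions, not the repository's implementation.

import numpy as np
import torch
import torch.nn.functional as F


def get_action_sketch(state, net, epsilon, use_entropy=False, use_boltzmann=False):
    # Hedged sketch of an action selector; the real get_action may differ.
    with torch.no_grad():
        q_values = net(state).squeeze(0)
    if use_boltzmann:
        # Sample actions in proportion to softmax(Q): better actions are
        # picked more often, but every action keeps non-zero probability.
        probs = F.softmax(q_values, dim=-1).cpu().numpy()
        probs = probs / probs.sum()  # guard against float round-off
        return int(np.random.choice(len(probs), p=probs))
    # Default: epsilon-greedy (the entropy option is not sketched here).
    if np.random.rand() < epsilon:
        return int(np.random.randint(len(q_values)))
    return int(q_values.argmax().item())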
Example #6
class QTDAgent(object):
    def __init__(self,
                 state_dim,
                 action_dim,
                 learning_rate=0.001,
                 reward_decay=0.99,
                 e_greedy=0.9):
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.lr = learning_rate
        self.gamma = reward_decay  # discount factor, matching the gamma in the Q-learning formulation
        self.epsilon = e_greedy
        self.EPS_START = 0.9
        self.EPS_END = 0.05
        self.EPS_DECAY = 30000  # this decay is too slow. TODO: relate the decay to the
        # total number of steps and find a better annealing strategy.
        use_cuda = torch.cuda.is_available()
        self.LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
        self.FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
        self.model = QNet(self.state_dim, self.action_dim)
        if use_cuda:
            self.model = self.model.cuda()

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        # self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=10000, gamma=0.5)  # decay the learning rate by a factor `gamma` every 10000 steps

        util.weights_init(self.model)

    def sbc(self, v):
        # add a batch dimension and convert the state to a tensor
        return self.FloatTensor(np.expand_dims(v, 0).tolist())

    def get_actions(self, state):
        with torch.no_grad():
            return self.model(self.sbc(state))

    def select_action(self, state, steps_done):
        util.adjust_learning_rate(self.optimizer,
                                  self.lr,
                                  steps_done,
                                  10000,
                                  lr_decay=0.2)  # global steps_done
        sample = random.random()
        eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * \
            np.exp(-1. * steps_done / self.EPS_DECAY)

        if sample > eps_threshold:
            actions = self.get_actions(state)
            action = actions.max(1)[1].view(1, 1)
            return action
        else:
            return self.LongTensor([[random.randrange(self.action_dim)]])

    def update(self, pending):  # pending holds (s, a, r, s_, a_, done) tuples
        loss = 0
        for s, a, r, s_, a_, done in reversed(pending):
            if done:
                expect_state_action_value = r
            else:
                # bootstrap from the greedy value of the next state
                with torch.no_grad():
                    next_state_value = self.model(self.sbc(s_)).max(1)[0]
                expect_state_action_value = r + self.gamma * next_state_value
            state_action_value = self.model(self.sbc(s))[0, a]
            loss += 0.5 * (state_action_value -
                           expect_state_action_value).pow(2)
        self.optimizer.zero_grad()
        loss.backward()
        # for param in self.model.parameters():
        #     param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def save_model(self, path):
        torch.save(self.model.state_dict(), '{}QTDAgent.pt'.format(path))
        # torch.save(self.target_critic.state_dict(), '{}/critic.pt'.format(path))
        print('Models saved successfully')

    def load_model(self, name):
        self.model.load_state_dict(name)
Example #7
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(net, target_net)

    optimizer = optim.Adam(net.parameters(), lr=0.001)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    
    net.to(device)
    target_net.to(device)
    net.train()
    target_net.train()
    memory = Memory(10000)
    running_score = 0
    epsilon = 1.0
    steps = 0
    
    for e in range(3000):
        done = False
        
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            if args.render:
                env.render()

            steps += 1
            qvalue = net(state)
            action = get_action(epsilon, qvalue, num_actions)
            next_state, reward, done, _ = env.step(action)
            
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            
            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            memory.push(state, next_state, action, reward, mask)

            score += reward
            state = next_state

            if steps > args.initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(args.batch_size)
                train_model(net, target_net, optimizer, batch, args.batch_size)

                if steps % args.update_target == 0:
                    update_target_model(net, target_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))
            writer.add_scalar('log/score', float(score), e)

        if running_score > args.goal_score:
            ckpt_path = os.path.join(args.save_path, 'model.pth')
            torch.save(net.state_dict(), ckpt_path)
            print('Running score exceeds {}, so training ends'.format(
                args.goal_score))
            break
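
train_model is not shown in these listings. The sketch below illustrates the one-step TD update such a helper typically performs, assuming the sampled batch is a namedtuple-like object with the fields pushed above (state, next_state, action, reward, mask); field names and tensor layout are assumptions.

import torch
import torch.nn.functional as F


def train_model_sketch(net, target_net, optimizer, batch, gamma=0.99):
    # Hedged sketch of a DQN update; the repository's train_model may differ.
    states = torch.cat(batch.state)            # [B, state_size]
    next_states = torch.cat(batch.next_state)  # [B, state_size]
    device = states.device
    actions = torch.tensor(batch.action, dtype=torch.int64, device=device).unsqueeze(1)
    rewards = torch.tensor(batch.reward, dtype=torch.float32, device=device)
    masks = torch.tensor(batch.mask, dtype=torch.float32, device=device)

    # Q(s, a) for the actions that were actually taken.
    q_values = net(states).gather(1, actions).squeeze(1)

    # Bootstrapped one-step target from the (frozen) target network.
    with torch.no_grad():
        next_q = target_net(next_states).max(1)[0]
        targets = rewards + gamma * masks * next_q

    loss = F.mse_loss(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()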
Example #8
class Actor:
    def __init__(self, actor_id, n_actors, device='cpu'):
        # params
        self.gamma = 0.99
        self.epsilon = 0.4**(1 + actor_id * 7 / (n_actors - 1))
        self.bootstrap_steps = 3
        self.alpha = 0.6
        self.priority_epsilon = 1e-6
        self.device = device
        self.actor_id = actor_id

        # path
        self.memory_path = os.path.join('./', 'logs', 'memory')
        self.net_path = os.path.join('./', 'logs', 'model', 'net.pt')
        self.target_net_path = os.path.join('./', 'logs', 'model',
                                            'target_net.pt')

        # memory
        self.memory_size = 50000
        self.batch_size = 32
        self.action_repeat = 4
        self.n_stacks = 4
        self.stack_count = self.n_stacks // self.action_repeat
        self.memory_save_interval = 1
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size,
                                          self.bootstrap_steps)

        # net
        self.net_load_interval = 5
        self.net = QNet(self.net_path).to(self.device)
        self.target_net = QNet(self.target_net_path).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())

        # env
        self.env = PongEnv(self.action_repeat, self.n_stacks)
        self.episode_reward = 0
        self.n_episodes = 0
        self.n_steps = 0
        self.memory_count = 0
        self.state = self.env.reset()

    def run(self):
        while True:
            self.step()

    def step(self):
        state = self.state
        action = self.select_action(state)
        next_state, reward, done, _ = self.env.step(action)
        self.episode_reward += reward
        self.n_steps += 1

        self.n_steps_memory.add(state[-self.action_repeat:], action, reward,
                                self.stack_count)
        if self.stack_count > 1:
            self.stack_count -= 1

        if self.n_steps > self.bootstrap_steps:
            state, action, reward, stack_count = self.n_steps_memory.get()
            self.replay_memory.add(state, action, reward, done, stack_count)
            self.memory_count += 1
        self.state = next_state.copy()

        if done:
            while self.n_steps_memory.size > 0:
                state, action, reward, stack_count = self.n_steps_memory.get()
                self.replay_memory.add(state, action, reward, done,
                                       stack_count)
                self.memory_count += 1
            self.reset()

    def select_action(self, state):
        if np.random.random() < self.epsilon:
            action = np.random.randint(6)
        else:
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            with torch.no_grad():
                q_val = self.net(state)
                action = q_val.argmax().item()
        return action

    def reset(self):
        if self.n_episodes % 1 == 0:
            print('episodes:', self.n_episodes, 'actor_id:', self.actor_id,
                  'return:', self.episode_reward)

        self.calc_priority()
        self.state = self.env.reset()
        self.episode_reward = 0
        self.n_episodes += 1
        self.n_steps = 0
        self.memory_count = 0
        self.stack_count = self.n_stacks // self.action_repeat

        # reset n_step memory
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)

        # save replay memory
        if self.n_episodes % self.memory_save_interval == 0:
            self.replay_memory.save(self.memory_path, self.actor_id)
            self.replay_memory = ReplayMemory(self.memory_size,
                                              self.batch_size,
                                              self.bootstrap_steps)

        # load net
        if self.n_episodes % self.net_load_interval == 0:
            self.net.load()
            self.target_net.load()

    def calc_priority(self):
        last_index = self.replay_memory.size
        start_index = last_index - self.memory_count

        batch, index = self.replay_memory.indexing_sample(
            start_index, last_index, self.device)
        batch_size = batch['state'].shape[0]
        priority = np.zeros(batch_size, dtype=np.float32)

        mini_batch_size = 500
        for start_index in range(0, batch_size, mini_batch_size):
            last_index = min(start_index + mini_batch_size, batch_size)
            mini_batch = dict()
            for key in batch.keys():
                if key in ['reward', 'done']:
                    mini_batch[key] = batch[key][start_index:last_index]
                else:
                    mini_batch[key] = torch.tensor(
                        batch[key][start_index:last_index]).to(self.device)
            mini_batch['action'] = mini_batch['action'].view(-1, 1).long()

            with torch.no_grad():
                # q_value
                q_value = self.net(mini_batch['state']).gather(
                    1, mini_batch['action']).view(-1, 1).cpu().numpy()

                # target_q_value
                next_action = torch.argmax(self.net(mini_batch['next_state']),
                                           1).view(-1, 1)
                next_q_value = self.target_net(
                    mini_batch['next_state']).gather(
                        1, next_action).cpu().numpy()

            target_q_value = mini_batch['reward'] + (
                self.gamma**
                self.bootstrap_steps) * next_q_value * (1 - mini_batch['done'])
            delta = np.abs(q_value -
                           target_q_value).reshape(-1) + self.priority_epsilon
            delta = delta**self.alpha
            priority[start_index:last_index] = delta

        self.replay_memory.update_priority(index, priority)
Example #9
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    num_actions = 3
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = QNet(num_actions)
    target_net = QNet(num_actions)
    update_target_model(net, target_net)

    optimizer = optim.RMSprop(net.parameters(), lr=0.00025, eps=0.01)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    net.to(device)
    target_net.to(device)
    net.train()
    target_net.train()
    memory = Memory(100000)
    running_score = 0
    epsilon = 1.0
    steps = 0

    for e in range(10000):
        done = False
        dead = False

        score = 0
        avg_loss = []
        start_life = 5
        state = env.reset()

        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        history = torch.stack((state, state, state, state))

        for i in range(3):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            state = pre_process(state)
            state = torch.Tensor(state).to(device)
            state = state.unsqueeze(0)
            history = torch.cat((state, history[:-1]), dim=0)

        while not done:
            if args.render:
                env.render()

            steps += 1
            qvalue = net(history.unsqueeze(0))
            action = get_action(epsilon, qvalue, num_actions)

            next_state, reward, done, info = env.step(action + 1)

            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            next_history = torch.cat((next_state, history[:-1]), dim=0)

            if start_life > info['ale.lives']:
                dead = True
                start_life = info['ale.lives']

            score += reward
            reward = np.clip(reward, -1, 1)

            mask = 0 if dead else 1
            memory.push(history.cpu(), next_history.cpu(), action, reward,
                        mask)

            if dead:
                dead = False

            if steps > args.initial_exploration:
                epsilon -= 1e-6
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(args.batch_size)
                loss = train_model(net, target_net, optimizer, batch)

                if steps % args.update_target == 0:
                    update_target_model(net, target_net)
            else:
                loss = 0

            avg_loss.append(loss)
            history = next_history

        if e % args.log_interval == 0:
            print(
                '{} episode | score: {:.2f} | epsilon: {:.4f} | steps: {} | loss: {:.4f}'
                .format(e, score, epsilon, steps, np.mean(avg_loss)))
            writer.add_scalar('log/score', float(score), steps)
            writer.add_scalar('log/loss', np.mean(avg_loss), steps)

        if score > args.goal_score:
            ckpt_path = os.path.join(args.save_path, 'model.pth')
            torch.save(net.state_dict(), ckpt_path)
            print('Score exceeds {}, so training ends'.format(args.goal_score))
            break
Example #10
def train(render):
    online_net = QNet(h=84, w=84, outputs=36)
    online_net.load_state_dict(torch.load('saved/online_net.pt'))
    target_net = QNet(h=84, w=84, outputs=36)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)
    memory = torch.load('saved/model_memory.pt')
    epsilon = 0.1
    steps = 0
    beta = beta_start
    loss = 0

    for e in range(100000):
        #level = random.choice(LEVEL_SET)
        level = 'Level01'
        env = make_retro(game=env_name,
                         state=level,
                         use_restricted_actions=retro.Actions.DISCRETE)

        done = False

        total_reward = 0.0
        state = env.reset()
        state = torch.Tensor(state).to(device).permute(2, 0, 1)
        #state = state.view(state.size()[0], -1)
        state = state.unsqueeze(0)

        while not done:
            steps += 1
            action = get_action(state.to(device), target_net, epsilon, env)

            if render:
                env.render()

            next_state, reward, done, info = env.step(action)

            next_state = torch.Tensor(next_state).permute(2, 0, 1)
            #next_state = next_state.view(next_state.size()[0], -1)
            next_state = next_state.unsqueeze(0)

            total_reward += reward

            mask = 0 if done else 1
            action_one_hot = torch.zeros(36)
            action_one_hot[action] = 1

            reward = torch.tensor([info['score']]).to(device)
            memory.push(state, next_state, action_one_hot, reward, mask)

            state = next_state

            if len(memory) > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.02)
                beta += 0.00005
                beta = min(1, beta)

                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        if e % 1 == 0:
            print('{} episode | Total Reward: {}'.format(e, total_reward))
            torch.save(online_net.state_dict(), 'saved/online_net.pt')
            torch.save(memory, 'saved/model_memory.pt')
        env.close()
Example #11
class Learner:
    def __init__(self, n_actors, shared_dict, device='cuda:0'):
        # params
        self.gamma = 0.99
        self.alpha = 0.6
        self.bootstrap_steps = 3
        self.initial_exploration = 50000
        self.priority_epsilon = 1e-6
        self.device = device
        self.n_epochs = 0
        self.n_actors = n_actors

        # path
        self.memory_path = os.path.join('./', 'logs', 'memory')

        # memory
        self.burn_in_length = 10
        self.learning_length = 10
        self.sequence_length = self.burn_in_length + self.learning_length
        self.memory_size = 500000
        self.batch_size = 8
        self.memory_load_interval = 20
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size,
                                          self.bootstrap_steps)

        # net
        self.shared_dict = shared_dict
        self.net_save_interval = 100
        self.target_update_interval = 1000
        self.net = QNet(self.device).to(self.device)
        self.target_net = QNet(self.device).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())
        self.save_model()
        self.optim = optim.RMSprop(self.net.parameters(),
                                   lr=0.00025 / 4.0,
                                   alpha=0.95,
                                   eps=1.5e-7,
                                   centered=True)

    def run(self):
        while True:
            if self.replay_memory.size > self.initial_exploration:
                self.train()
                if self.n_epochs % 100 == 0:
                    print('trained', self.n_epochs, 'epochs')
            self.interval()

    def train(self):
        batch, seq_index, index = self.replay_memory.sample(self.device)

        self.net.set_state(batch['hs'], batch['cs'])
        self.target_net.set_state(batch['target_hs'], batch['target_cs'])

        ### burn-in step ###
        state = batch['state'][:self.burn_in_length]
        next_state = batch['next_state'][:self.burn_in_length]
        with torch.no_grad():
            _ = self.net(state)
            _ = self.target_net(next_state)

        ### learning step ###
        state = batch['state'][self.burn_in_length:]
        next_state = batch['next_state'][self.burn_in_length:]

        # q_value
        q_value = self.net(state).gather(1, batch['action'].view(-1, 1))

        # target q_value
        with torch.no_grad():
            next_action = torch.argmax(self.net(next_state), 1).view(-1, 1)
            next_q_value = self.target_net(next_state).gather(1, next_action)
            target_q_value = batch["reward"].view(
                -1, 1) + (self.gamma**self.bootstrap_steps) * next_q_value * (
                    1 - batch['done'].view(-1, 1))

        # update
        self.optim.zero_grad()
        loss = torch.mean(0.5 * (q_value - target_q_value)**2)
        loss.backward()
        self.optim.step()

        priority = (np.abs(
            (q_value - target_q_value).detach().cpu().numpy()).reshape(-1) +
                    self.priority_epsilon)**self.alpha
        self.replay_memory.update_priority(
            index[self.burn_in_length:].reshape(-1), priority)
        self.replay_memory.update_sequence_priority(seq_index, True)

    def interval(self):
        self.n_epochs += 1
        if self.n_epochs % self.target_update_interval == 0:
            self.target_net.load_state_dict(self.net.state_dict())
        if self.n_epochs % self.net_save_interval == 0:
            self.save_model()
        if self.n_epochs % self.memory_load_interval == 0:
            for i in range(self.n_actors):
                self.replay_memory.load(self.memory_path, i)

    def save_model(self):
        self.shared_dict['net_state'] = deepcopy(self.net).cpu().state_dict()
        self.shared_dict['target_net_state'] = deepcopy(
            self.target_net).cpu().state_dict()
Example #12
        cur_loss = trainEval(train_loader, model, optimizer, args, True)
        val_loss = trainEval(val_loader, model, optimizer, args, False)
        test_loss = trainEval(test_loader, model, optimizer, args, False)
       
        metrics = {'epoch': epoch}
        metrics['mse_train'] = cur_loss
        metrics['mse_val'] = val_loss
        metrics['mse_test'] = test_loss
        # DataFrame.append was removed in pandas 2.0; assumes pandas is imported as pd
        log = pd.concat([log, pd.DataFrame([metrics])], ignore_index=True)
        log.to_csv(log_file, index=False)
        
        a_t.append(cur_loss)
        a_v.append(val_loss)
        a_te.append(test_loss)

        if best_mse is None or (val_loss < best_mse):
            plotGraph(a_t, a_v, a_te, '.', run_name)
            plotGraph(a_t, a_v, a_te, run_dir, run_name)
            best_mse = val_loss
            ckpt = os.path.join(ckpt_dir, 'ckpt_e{}.pth'.format(epoch))
            torch.save({
                'epoch': epoch,
                'mse_train': cur_loss,
                'mse_val': val_loss,
                'mse_test': test_loss,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict()
            }, ckpt)

        scheduler.step(val_loss)
Example #13
    optimizer.step()
    return loss


# Build environment
env = make_atari('PongNoFrameskip-v4', stack=2)
env = wrap_pytorch(env)

number_actions = env.action_space.n
replay_buffer = ReplayBuffer(replay_memory_size)

# Separate target net & policy net
input_shape = env.reset().shape
current_net = QNet(input_shape, number_actions).to(device)
target_net = QNet(input_shape, number_actions).to(device)  # with older weights
target_net.load_state_dict(current_net.state_dict())
target_net.eval()
optimizer = opt_algorithm(current_net.parameters(), lr=learning_rate)

n_episode = 1
episode_return = 0
best_return = 0
returns = []
state = env.reset()
for i in count():
    # env.render()
    eps = get_epsilon(i)
    action = select_action(state,
                           current_net,
                           eps,
                           number_action=number_actions)
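
get_epsilon is defined elsewhere in this example's source; a common choice is an exponential decay toward a floor value, sketched below with assumed constants.

import math


def get_epsilon_sketch(step, eps_start=1.0, eps_end=0.02, eps_decay=30000):
    # Hedged sketch: exponentially anneal epsilon from eps_start toward eps_end.
    return eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)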
Example #14
def main(L, mouse_initial_indices, rewardlist, actions_list):
    if mouse_initial_indices is None:
        all_possible_starting_positions = np.array([*np.where(L == 1)]).T
    scores = [0]
    best_scores = [0]
    env = deepcopy(L)
    torch.manual_seed(2020)

    num_inputs = 2 + 1
    num_actions = 4
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    # writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0
    inint = mouse_initial_indices
    best_score = 0
    number_episode = 1000
    for e in range(number_episode):
        if inint is None:
            mouse_initial_indices = all_possible_starting_positions[
                np.random.choice(range(len(all_possible_starting_positions)))]

        done = False
        env = deepcopy(L)
        eaubue = 0.
        score = 0
        state = np.array(mouse_initial_indices)
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            action = get_action(state, target_net, epsilon, env, eaubue=eaubue)
            newstate = state + torch.Tensor(np.array(
                actions_list[action])).to(device)
            if env[int(newstate[0][0].tolist()),
                   int(newstate[0][1].tolist())] != 0:
                next_state = newstate
                new_eaubue = eaubue
                reward = rewardlist[env[int(newstate[0][0].tolist()),
                                        int(newstate[0][1].tolist())]]
                if env[int(newstate[0][0].tolist()),
                       int(newstate[0][1].tolist())] == 2:
                    done = True
                # if the mouse is on a water cell, it drinks the water,
                # which is then gone
                if env[int(newstate[0][0].tolist()),
                       int(newstate[0][1].tolist())] == 4:
                    env[int(newstate[0][0].tolist()),
                        int(newstate[0][1].tolist())] = 5
                    new_eaubue = 1.
            else:
                next_state = state
                reward = rewardlist[0]
                new_eaubue = eaubue

            mask = 0 if done else 1
            action_one_hot = np.zeros(4)
            action_one_hot[action] = 1
            memory.push(
                torch.cat((
                    state,
                    torch.tensor(eaubue).unsqueeze(0).unsqueeze(0).to(device)),
                          1),
                torch.cat((next_state, torch.tensor(new_eaubue).unsqueeze(
                    0).unsqueeze(0).to(device)), 1), action_one_hot, reward,
                mask)

            score += reward
            state = next_state
            eaubue = new_eaubue

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        # print("OK")
        if score > 35:
            print(score)
        running_score = 0.99 * running_score + 0.01 * score
        # running_score=score
        scores.append(running_score)
        best_scores.append(
            score if score > best_scores[-1] else best_scores[-1])
        if e % log_interval == 0:
            print(
                '{} episode | score: {:.2f} | best score: {:.2f} | epsilon: {:.2f}'
                .format(e, running_score, best_score, epsilon))
            # writer.add_scalar('log/score', float(running_score), e)
            # writer.add_scalar('log/loss', float(loss), e)
            if score > best_score:
                best_score = score
            torch.save(online_net.state_dict(), "./qlearning_model")

        if running_score > goal_score:
            break

    return number_episode, scores, best_scores
Example #15
class Learner:
    def __init__(self, n_actors, device='cuda:0'):
        # params
        self.gamma = 0.99
        self.alpha = 0.6
        self.bootstrap_steps = 3
        self.initial_exploration = 50000
        self.priority_epsilon = 1e-6
        self.device = device
        self.n_epochs = 0
        self.n_actors = n_actors

        # path
        self.memory_path = os.path.join('./', 'logs', 'memory')
        self.net_path = os.path.join('./', 'logs', 'model', 'net.pt')
        self.target_net_path = os.path.join('./', 'logs', 'model',
                                            'target_net.pt')

        # memory
        self.memory_size = 500000
        self.batch_size = 128
        self.memory_load_interval = 10
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size,
                                          self.bootstrap_steps)

        # net
        self.net_save_interval = 50
        self.target_update_interval = 1000
        self.net = QNet(self.net_path, self.device).to(self.device)
        self.target_net = QNet(self.target_net_path,
                               self.device).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())
        self.net.save()
        self.target_net.save()
        self.optim = optim.RMSprop(self.net.parameters(),
                                   lr=0.00025 / 4.0,
                                   alpha=0.95,
                                   eps=1.5e-7,
                                   centered=True)

    def run(self):
        while True:
            if self.replay_memory.size > self.initial_exploration:
                self.train()
            self.interval()

    def train(self):
        batch, index, weights = self.replay_memory.sample(self.device)

        # q_value
        q_value = self.net(batch['state'])
        q_value = q_value.gather(1, batch['action'])

        # target q_value
        with torch.no_grad():
            next_action = torch.argmax(self.net(batch["next_state"]),
                                       1).view(-1, 1)
            next_q_value = self.target_net(batch["next_state"]).gather(
                1, next_action)
            target_q_value = batch["reward"] + (
                self.gamma**
                self.bootstrap_steps) * next_q_value * (1 - batch['done'])

        # update
        self.optim.zero_grad()
        loss = torch.mean(0.5 * (q_value - target_q_value)**2)
        loss.backward()
        self.optim.step()

        priority = (np.abs(
            (q_value - target_q_value).detach().cpu().numpy()).reshape(-1) +
                    self.priority_epsilon)**self.alpha
        self.replay_memory.update_priority(index, priority)

    def interval(self):
        self.n_epochs += 1
        if self.n_epochs % self.target_update_interval == 0:
            self.target_net.load_state_dict(self.net.state_dict())
        if self.n_epochs % self.net_save_interval == 0:
            self.net.save()
            self.target_net.save()
        if self.n_epochs % self.memory_load_interval == 0:
            for i in range(self.n_actors):
                self.replay_memory.load(self.memory_path, i)
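
The Actor classes above and this Learner communicate only through files on disk (replay-memory shards and saved network weights). A hedged sketch of how such actor and learner processes could be launched together; the process count, entry-point names, and the spawn start method are assumptions.

import torch.multiprocessing as mp


def run_actor(actor_id, n_actors):
    # Each actor collects experience on CPU and periodically saves its
    # replay-memory shard for the learner to load.
    Actor(actor_id, n_actors, device='cpu').run()


def run_learner(n_actors):
    # The learner trains on GPU and periodically saves network weights
    # that the actors reload.
    Learner(n_actors, device='cuda:0').run()


if __name__ == '__main__':
    mp.set_start_method('spawn')  # required when child processes use CUDA
    n_actors = 4
    procs = [mp.Process(target=run_actor, args=(i, n_actors))
             for i in range(n_actors)]
    procs.append(mp.Process(target=run_learner, args=(n_actors,)))
    for p in procs:
        p.start()
    for p in procs:
        p.join()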