Code Example #1
def experiment(NUM_EXP, MAX_EPISODE, PUNISHMENT, ALPHA, GAMMA,
               EPS_START, EPS_END, EPS_DECAY, BATCH_SIZE, TARGET_UPDATE, MEMORY_SIZE,
               HIDDEN_DIM1, HIDDEN_DIM2, DEVICE, file):
    graph_data = instance(file)
    ggg, kdata = graph_data[0], graph_data[1]
    
    origin = np.array([ggg.vs.select(name = i).indices[0] for i in kdata[0,:]])
    destination = np.array([ggg.vs.select(name = i).indices[0] for i in kdata[1,:]])

    env = Env(ggg, origin, destination, kdata[2,:], 0)
    
    setup_dict = {'num_exp': NUM_EXP, 'max_episodes': MAX_EPISODE, 'punishment': PUNISHMENT,
                  'alpha': ALPHA, 'gamma': GAMMA, 'eps_start': EPS_START, 'eps_end': EPS_END,
                  'eps_decay': EPS_DECAY, 'batch_size': BATCH_SIZE, 'target_update': TARGET_UPDATE,
                  'memory_size': MEMORY_SIZE, 'hidden_dim1': HIDDEN_DIM1, 'hidden_dim2': HIDDEN_DIM2}
    
    EXP_DATA = []
    file_name1 = time.strftime("%Y%m%d-%H%M%S")
    for j in range(NUM_EXP):
        
        print(j)
        memory = [ReplayMemory(MEMORY_SIZE) for i in range(env.numagent)]
        multi = [DQN_Agent(i, env, memory[i],
                           hidden_dim1=HIDDEN_DIM1, hidden_dim2=HIDDEN_DIM2,
                           device=DEVICE, alpha=ALPHA, gamma=GAMMA,
                           batch_size=BATCH_SIZE, eps_start=EPS_START,
                           eps_end=EPS_END, eps_decay=EPS_DECAY)
                 for i in range(env.numagent)]
    
        start = time.time()
        episode_rewards, episode_success, episode_length, best_states, best_actions = train(
            env, multi, memory, TARGET_UPDATE, MAX_EPISODE, PUNISHMENT, DEVICE)
        end = time.time()
        episode_time = end-start
        
        best_answer = np.max([episode_rewards[i].sum() for i in range(MAX_EPISODE)])
        
        target_policy_answer = sum(test(env, multi, "target"))
#        if target_policy_answer > 0:
#            target_policy_answer -= ARRIVAL_BONUS*env.numagent
        
        parameters = [multi[i].target_net.state_dict() for i in range(env.numagent)]
        
        save = [episode_rewards, episode_success, episode_length, episode_time,
                best_states, best_actions, best_answer, target_policy_answer, parameters]
        EXP_DATA.append(save)

    file_name2 = time.strftime("%Y%m%d-%H%M%S")
    with open('/home/sle175/rlcombopt/data/%s__%s.p' % (file_name1, file_name2), 'wb') as f:
        pickle.dump(setup_dict, f)
        pickle.dump(EXP_DATA, f)
        
    return
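Note that experiment() dumps two objects back to back into a single pickle file: first the hyper-parameter dictionary, then the list of per-run results, so a reader has to call pickle.load twice in the same order. A minimal sketch of reading such a file back (the path and variable names are illustrative, not from the original project):

import pickle

results_path = '...'  # the '<file_name1>__<file_name2>.p' file written by experiment()
with open(results_path, 'rb') as f:
    setup_dict = pickle.load(f)   # hyper-parameter dictionary
    exp_data = pickle.load(f)     # one [rewards, success, length, time, ...] entry per run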
Code Example #2
os.makedirs('result/model', exist_ok=True)
os.makedirs('result/test', exist_ok=True)
with open('result/config.txt', 'w') as f:
    f.write("base reward: {:f}\n".format(BASE_REWARD))
    f.write("batch size: {:d}\n".format(BATCH_SIZE))
    f.write("gamma: {:f}\n".format(GAMMA))
    f.write("num input: {:d}\n".format(NUM_INPUT))

policy_net = DQN(HIDDEN, NUM_ACTION, NUM_INPUT)
target_net = DQN(HIDDEN, NUM_ACTION, NUM_INPUT)
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(8000)
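# CosineAnnealingLR(optimizer, T_max, eta_min): the learning rate follows a cosine
# decay from the optimizer's initial lr down to 0.0001 over max_epoch scheduler steps.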
lr_schedular = optim.lr_scheduler.CosineAnnealingLR(optimizer, max_epoch, 0.0001)

def select_action(state, test=False):
    if test:
        with torch.no_grad():
            a = policy_net(state).max(1)[1].view(1, 1)
        return a
    else:
        global epoch
        sample = random.random()
        eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * epoch / EPS_DECAY)
        if sample > eps_threshold:
            with torch.no_grad():
                a = policy_net(state).max(1)[1].view(1, 1)
            print('act according to model: %d\n' % a.squeeze())
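            # NOTE: the original excerpt is truncated here; the lines below are an
            # assumed completion, not from the source file. With probability
            # eps_threshold the agent falls back to a uniformly random action.
            return a
        else:
            a = torch.tensor([[random.randrange(NUM_ACTION)]], dtype=torch.long)
            return a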
Code Example #3
    'eps_end': EPS_END,
    'eps_decay': EPS_DECAY,
    'batch_size': BATCH_SIZE,
    'target_update': TARGET_UPDATE,
    'memory_size': MEMORY_SIZE,
    'hidden_dim1': HIDDEN_DIM1,
    'hidden_dim2': HIDDEN_DIM2
}

#%%
EXP_DATA = []
file_name1 = time.strftime("%Y%m%d-%H%M%S")
for j in range(NUM_EXP):

    print(j)
    memory = [ReplayMemory(MEMORY_SIZE) for i in range(env.numagent)]
    multi = [
        DQN_Agent(i,
                  env,
                  memory[i],
                  hidden_dim1=HIDDEN_DIM1,
                  hidden_dim2=HIDDEN_DIM2,
                  device=DEVICE,
                  alpha=ALPHA,
                  gamma=GAMMA,
                  batch_size=BATCH_SIZE,
                  eps_start=EPS_START,
                  eps_end=EPS_END,
                  eps_decay=EPS_DECAY) for i in range(env.numagent)
    ]
Code Example #4
class MoveToBeacon(base_agent.BaseAgent):
    """An agent specifically for solving the MoveToBeacon map."""
    def __init__(self):
        super(MoveToBeacon, self).__init__()

        self.num_actions = len(available_actions)
        self.input_flat = 84 * 84  # Size of the screen
        self.wh = 84
        # Minimap sizes
        self.mm_input_flat = 64 * 64
        self.mm_wh = 64

        self.batch_size = 32
        self.max_memory_size = 2000

        self.gamma = .99
        self.learning_rate = 1e-4
        self.epsilon = 1.
        self.final_epsilon = .05
        self.epsilon_decay = 0.999

        self.total_rewards = deque(maxlen=100)
        self.current_reward = 0
        self.actions_taken = np.zeros(self.num_actions)
        self.rewards = []

        self.total_actions = []

        self.memory = ReplayMemory(self.num_actions, self.batch_size,
                                   self.max_memory_size, self.gamma)
        self.model = Model(self.wh, self.input_flat, self.mm_wh,
                           self.mm_input_flat, 1, self.num_actions,
                           self.learning_rate, self.memory)
        if self.model.loaded_model:
            self.epsilon = 0.05

    def step(self, obs):
        # Current observable state
        screen_player_relative = obs.observation["screen"][_PLAYER_RELATIVE]
        current_state = screen_player_relative.flatten()
        mm_player_relative = obs.observation['minimap'][_MM_PLAYER_RELATIVE]
        minimap_state = mm_player_relative.flatten()

        army_state = obs.observation['screen'][_SELECT].flatten()
        # army_selected = np.array([1]) if 1 in obs.observation['screen'][_SELECT] else np.array([0])

        if len(self.memory.memory) > 0:
            self.memory.update([current_state, minimap_state, army_state])
            self.model.train()

        super(MoveToBeacon, self).step(obs)

        legal_actions = obs.observation['available_actions']

        if random.random() < self.epsilon:
            action = legal_actions[random.randint(0, len(legal_actions) - 1)]
            action = available_actions.index(action)
        else:
            # feed_dict = {self.model.screen_input: [current_state], self.model.minimap_input: [minimap_state],
            #              self.model.army_input: [army_selected]}
            feed_dict = {self.model.army_input: [army_state]}
            output = self.model.session.run(self.model.output, feed_dict)[0]
            output = [
                value if action in legal_actions else -9e10
                for action, value in zip(available_actions, output)
            ]
            action = np.argmax(output)
            self.actions_taken[int(action)] += 1
        self.total_actions.append(action)

        # print('Action taken: {}'.format(action))
        reward = obs.reward

        self.current_reward += reward
        if obs.last():
            self.total_rewards.append(self.current_reward)
            self.rewards.append(self.current_reward)
            self.current_reward = 0
            if self.episodes % 100 == 0 and self.episodes > 0:
                self.model.save()
                print('Highest: {} | Lowest: {} | Average: {}'.format(
                    max(self.total_rewards), min(self.total_rewards),
                    np.mean(self.total_rewards)))
                print(self.actions_taken)
            if self.episodes % 1000 == 0 and self.episodes > 0:
                pickle.dump(
                    self.total_actions,
                    open('/home/rob/Documents/uni/fyp/sc2/actions8.pkl', 'wb'))
                pickle.dump(
                    self.rewards,
                    open('/home/rob/Documents/uni/fyp/sc2/rewards8.pkl', 'wb'))
                exit(0)

        if self.epsilon > self.final_epsilon:
            self.epsilon = self.epsilon * self.epsilon_decay

        self.memory.add([current_state, minimap_state, army_state], action,
                        reward, obs.last())
        # self.model.train()

        if available_actions[action] == _NO_OP:
            return actions.FunctionCall(_NO_OP, [])
        elif available_actions[action] == _SELECT_ARMY:
            return actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
        elif available_actions[action] == _ATTACK_SCREEN \
                or available_actions[action] == _MOVE_SCREEN \
                or available_actions[action] == _PATROL_SCREEN \
                or available_actions[action] == _SMART_SCREEN:
            # This is the scripted one
            neutral_y, neutral_x = (
                screen_player_relative == _PLAYER_NEUTRAL).nonzero()
            target = [int(neutral_x.mean()), int(neutral_y.mean())]
            return actions.FunctionCall(available_actions[action],
                                        [_NOT_QUEUED, target])
        elif available_actions[action] == _STOP_QUICK:
            return actions.FunctionCall(available_actions[action],
                                        [_NOT_QUEUED])
        elif available_actions[action] == _HOLD_POSITION_QUICK:
            return actions.FunctionCall(available_actions[action],
                                        [_NOT_QUEUED])
        elif available_actions[action] == _ATTACK_MINIMAP \
                or available_actions[action] == _MOVE_MINIMAP \
                or available_actions[action] == _PATROL_MINIMAP \
                or available_actions[action] == _SMART_MINIMAP:
            neutral_y, neutral_x = (
                mm_player_relative == _PLAYER_NEUTRAL).nonzero()
            target = [int(neutral_x.mean()), int(neutral_y.mean())]
            return actions.FunctionCall(available_actions[action],
                                        [_NOT_QUEUED, target])
        else:
            return actions.FunctionCall(_NO_OP, [])
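The list comprehension in step() masks unavailable actions by overwriting their Q-values with a large negative number before taking the argmax. The same idea can be written as a small vectorised numpy helper; masked_argmax and its arguments are illustrative names, not part of the original agent:

import numpy as np

def masked_argmax(q_values, action_ids, legal_actions):
    # Q-values whose action id is not currently available get -inf,
    # so they can never win the argmax.
    q = np.asarray(q_values, dtype=np.float64).copy()
    q[~np.isin(action_ids, legal_actions)] = -np.inf
    return int(np.argmax(q))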
Code Example #5
File: train.py  Project: suraj-nair-1/cs244b_project
n_actions = env.action_space.n
n_agents = env.num_agents
grid_size = env.grid_size
input_size = grid_size * grid_size
output_size = 1  #n_actions * n_agents
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

policy_net = DQN(input_size, HIDDEN_SIZE, output_size, n_actions,
                 n_agents).to(device)
target_net = DQN(input_size, HIDDEN_SIZE, output_size, n_actions,
                 n_agents).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters())
memory = ReplayMemory(1000000)

steps_done = 0

logger = Logger('./logs/ours9/')

all_acts = []
for agent1_act in range(n_actions):
    for agent2_act in range(n_actions):
        for agent3_act in range(n_actions):
            a = torch.zeros((n_agents, n_actions)).to(device)
            a[0, agent1_act] = 1
            a[1, agent2_act] = 1
            a[2, agent3_act] = 1
            all_acts.append(a)
all_acts = torch.stack(all_acts, 0)
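The three nested loops above enumerate every joint action but hard-code exactly three agents, even though n_agents is read from the environment. A sketch of the same one-hot enumeration for an arbitrary agent count using itertools.product; enumerate_joint_actions is an illustrative helper, not part of the original script:

import itertools

import torch

def enumerate_joint_actions(n_agents, n_actions, device):
    # Build one (n_agents, n_actions) one-hot matrix per joint action,
    # in the same lexicographic order as the hand-written loops.
    joint = []
    for combo in itertools.product(range(n_actions), repeat=n_agents):
        a = torch.zeros((n_agents, n_actions), device=device)
        for agent_idx, act in enumerate(combo):
            a[agent_idx, act] = 1
        joint.append(a)
    return torch.stack(joint, 0)  # shape: (n_actions ** n_agents, n_agents, n_actions)

# all_acts = enumerate_joint_actions(n_agents, n_actions, device)  # equivalent when n_agents == 3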
Code Example #6
    os.makedirs(savedir_pre)

n_alpha = 5
gamma = 0.9  #since it may take several moves to goal, making gamma high
epsilon = 0.9  # epsilon for exploration or exploitation
input_size = 11 * 11 * 5  # 11x11 is the size of the gridworld, 5 channels include walls, goals etc
mseloss = torch.nn.MSELoss()

for j in range(n_agents):
    time0 = time.time()
    pi = get_policy(co_alpha, n_alpha,
                    rand_seed=j)  # get the policy with dirichlet distribution

    #new memory for each agent
    buffer = 32
    memory = ReplayMemory(buffer)

    # Network and the optimizer
    charnet = charNet(in_channels=10, out_channels=2, lstm_hidden=16)
    charnet = charnet.float()
    prenet = preNet(in_channels=7, out_channels=5)
    optimizer = optim.Adam(
        [{'params': charnet.parameters()},
         {'params': prenet.parameters(), 'lr': 0.01}],
        lr=1e-2)
    actions_save = []
    BATCH_SIZE = random.randint(2, 10)  # N_past ~ U(2, 10); randint is inclusive of both endpoints
Code Example #7
class Agent(object):
    '''
    Implements training and testing methods
    '''
    def __init__(self, skip=True, episodic=True):
        self.env = wrap_dqn(gym.make('BreakoutDeterministic-v4'), skip,
                            episodic)
        self.num_actions = self.env.action_space.n
        self.dqn = DQN(self.num_actions).cuda()
        self.target_dqn = DQN(self.num_actions).cuda()

        self.buffer = ReplayMemory(200000)
        self.gamma = 0.99

        self.optimizer = optim.RMSprop(self.dqn.parameters(),
                                       lr=0.00025,
                                       eps=0.001,
                                       alpha=0.95)
        self.out_dir = '/scratch/ab8084/atari/saved/'
        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)

        self.reward_episodes = []
        self.lengths_episodes = []
        self.benchmark = -10000

    def to_var(self, x):
        '''
        Converts torch tensor x to torch variable
        '''
        return Variable(x).cuda()

    def predict_q_values(self, states):
        '''
        Computes q values of states by passing them through the behavior network
        states: numpy array, shape is (batch_size,frames,width,height)
        returns actions: shape is (batch_size, num_actions)
        '''
        states = self.to_var(torch.from_numpy(states).float())
        actions = self.dqn(states)
        return actions

    def predict_q_target_values(self, states):
        '''
        Computes q values of next states by passing them through the target network
        states: numpy array, shape is (batch_size,frames,width,height)
        returns actions: shape is (batch_size, num_actions)
        '''
        states = self.to_var(torch.from_numpy(states).float())
        actions = self.target_dqn(states)
        return actions

    def select_action(self, state, epsilon):
        choice = np.random.choice([0, 1], p=(epsilon, (1 - epsilon)))

        if choice == 0:
            return np.random.choice(range(self.num_actions))
        else:
            state = np.expand_dims(state, 0)
            actions = self.predict_q_values(state)
            return np.argmax(actions.data.cpu().numpy())

    def update(self, states, targets, actions):
        '''
        Calculates loss and updates the weights of the behavior network using backprop
        states: numpy array, shape is (batch_size,frames,width,height)
        actions: numpy array, shape is (batch_size,)
        targets: numpy array, shape is (batch_size)
        '''
        targets = self.to_var(
            torch.unsqueeze(torch.from_numpy(targets).float(), -1))
        actions = self.to_var(
            torch.unsqueeze(torch.from_numpy(actions).long(), -1))
        predicted_values = self.predict_q_values(states)
        affected_values = torch.gather(predicted_values, 1, actions)
        loss = F.smooth_l1_loss(affected_values, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def calculate_q_targets(self, next_states, rewards, dones):
        '''
        Calculate targets from target network
        next_states: numpy array, shape is (batch_size, frames, width, height)
        rewards: numpy array, shape is (batch_size,)
        dones: numpy array, shape is (batch_size,)
        '''
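        # One-step Q-learning targets: r + gamma * max_a Q_target(s', a),
        # with the bootstrap term zeroed out for terminal transitions.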
        dones_mask = (dones == 1)
        predicted_q_target_values = self.predict_q_target_values(next_states)
        next_max_q_values = np.max(
            predicted_q_target_values.data.cpu().numpy(), axis=1)
        next_max_q_values[dones_mask] = 0
        q_targets = rewards + self.gamma * next_max_q_values
        return q_targets

    def sync_target_network(self):
        '''
        Copies weights from estimation to target network
        '''
        primary_params = list(self.dqn.parameters())
        target_params = list(self.target_dqn.parameters())
        for i in range(0, len(primary_params)):
            target_params[i].data[:] = primary_params[i].data[:]
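        # Equivalent, more idiomatic weight copy:
        #   self.target_dqn.load_state_dict(self.dqn.state_dict())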

    def play(self, episodes):
        '''
        plays for the given number of episodes
        '''
        for i in range(1, episodes + 1):
            done = False
            state = self.env.reset()
            plt.imshow(state)
            plt.axis('off')
            plt.show()
            while not done:
                action = self.select_action(state, 0)
                state, reward, done, _ = self.env.step(action)
                display.clear_output(wait=True)
                plt.imshow(self.env.render())
                plt.axis('off')
                plt.show()
                time.sleep(0.03)

    def close_env(self):
        '''
        Clean up
        '''
        self.env.close()

    def get_epsilon(self, total_steps, max_epsilon_steps, epsilon_start,
                    epsilon_final):
        return max(epsilon_final,
                   epsilon_start - total_steps / max_epsilon_steps)

    def save_final_model(self):
        '''
        Saves final model to the disk
        '''
        filename = '{}/final_model_breakout_skipTrue.pth'.format(self.out_dir)
        torch.save(
            {
                'model_state_dict': self.dqn.state_dict(),
                'benchmark': self.benchmark,
                'lengths_episodes': self.lengths_episodes,
                'rewards_episodes': self.reward_episodes
            }, filename)

    def load_model(self, filename):
        '''
        Loads model from the disk
        
        filename: model filename
        '''
        try:
            checkpoint = torch.load(
                '/scratch/ab8084/atari/saved/final_model_breakout_skipTrue.pth'
            )
            self.dqn.load_state_dict(checkpoint['model_state_dict'])
            self.benchmark = checkpoint['benchmark']
        except:
            self.dqn.load_state_dict(torch.load(filename))

        self.sync_target_network()

    def train(self, replay_buffer_fill_len, batch_size, episodes, stop_reward,
              max_epsilon_steps, epsilon_start, epsilon_final,
              sync_target_net_freq):
        '''
        replay_buffer_fill_len: how many elements the replay buffer should contain before training starts
        batch_size: batch size
        episodes: how many episodes (max. value) to iterate
        stop_reward: running reward value to be reached; upon reaching it the training is stopped
        max_epsilon_steps: maximum number of epsilon steps
        epsilon_start: start epsilon value
        epsilon_final: final epsilon value, effectively a limit
        sync_target_net_freq: how often to sync estimation and target networks
        '''

        start_time = time.time()
        print('Start training at: ' + time.asctime(time.localtime(start_time)))

        total_steps = 0
        running_episode_reward = 0

        print('Populating Replay Buffer')
        print('\n')

        state = self.env.reset()
        for i in range(replay_buffer_fill_len):
            done = False
            action = self.select_action(state, 0.05)
            next_state, reward, done, _ = self.env.step(action)
            self.buffer.add(state, action, reward, done, next_state)
            state = next_state
            if done:
                state = self.env.reset()

        print(
            'Replay Buffer populated with {} transitions, starting training...'
            .format(self.buffer.count()))
        print('\n')

        for i in range(1, episodes + 1):
            done = False
            state = self.env.reset()

            episode_reward = 0
            episode_length = 0

            while not done:
                if (total_steps % sync_target_net_freq) == 0:
                    print('synchronizing target network...')
                    #print('\n')
                    self.sync_target_network()

                epsilon = self.get_epsilon(total_steps, max_epsilon_steps,
                                           epsilon_start, epsilon_final)
                action = self.select_action(state, epsilon)

                next_state, reward, done, _ = self.env.step(action)
                self.buffer.add(state, action, reward, done, next_state)
                s_batch, a_batch, r_batch, d_batch, next_s_batch = self.buffer.sample(
                    batch_size)
                q_targets = self.calculate_q_targets(next_s_batch, r_batch,
                                                     d_batch)

                self.update(s_batch, q_targets, a_batch)

                state = next_state
                total_steps += 1
                episode_length += 1
                episode_reward += np.sign(reward)

            self.reward_episodes.append(episode_reward)
            self.lengths_episodes.append(episode_length)

            running_episode_reward = running_episode_reward * 0.9 + 0.1 * episode_reward
            if (i % 1000) == 0 or (running_episode_reward > stop_reward):
                print(
                    'global step: {}'.format(total_steps),
                    ' | episode: {}'.format(i),
                    ' | mean episode_length: {}'.format(
                        np.mean(self.lengths_episodes[-1000:])),
                    ' | mean episode reward: {}'.format(
                        np.mean(self.reward_episodes[-1000:])))
                #self.lengths_episodes=[]
                #self.reward_episodes=[]
                #print('episode: {}'.format(i))
                #print('current epsilon: {}'.format(round(epsilon, 2)))
                #print('mean episode_length: {}'.format(np.mean(lengths_episodes[-50:])))
                #print('mean episode reward: {}'.format(np.mean(reward_episodes[-50:])))
                #print('\n')
            if episode_reward > self.benchmark:
                print('global step: {}'.format(total_steps),
                      ' | episode: {}'.format(i),
                      ' | episode_length: {}'.format(episode_length),
                      ' | episode reward: {}'.format(episode_reward))
                self.benchmark = episode_reward
                self.save_final_model()

            if running_episode_reward > stop_reward:
                print('stop reward reached!')
                print('saving final model...')
                print('\n')
                #self.save_final_model()
                break

        print('Finish training at: ' +
              time.asctime(time.localtime(time.time())))
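
None of the excerpts above include the ReplayMemory class itself, and its constructor signature differs between projects. For orientation, here is a minimal deque-based sketch that matches the add(state, action, reward, done, next_state) / sample(batch_size) / count() calls made by the Breakout agent above; the tuple layout, uniform sampling, and numpy batching are assumptions rather than the original implementation:

import random
from collections import deque

import numpy as np


class ReplayMemory:
    """Minimal FIFO replay buffer (a sketch; real implementations may differ)."""

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def add(self, state, action, reward, done, next_state):
        self.memory.append((state, action, reward, done, next_state))

    def count(self):
        return len(self.memory)

    def sample(self, batch_size):
        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, dones, next_states = zip(*batch)
        return (np.stack(states), np.array(actions), np.array(rewards, dtype=np.float32),
                np.array(dones, dtype=np.uint8), np.stack(next_states))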