Example no. 1
    def __init__(self,
                 model_path,
                 dtype,
                 seed=451):
        self._seed = seed
        self._idx = 0
        self.np_random, _ = seeding.np_random(seed)
        self._dtype = dtype
        self.env = MancalaEnv(seed)
        state = self.env._reset()

        self._model = ActorCritic(
            state.shape[0], self.env.action_space).type(dtype)
        self._model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu'))) 
Example no. 2
class AgentA3C(Agent):
    '''Agent which leverages Actor Critic Learning'''

    def __init__(self,
                 model_path,
                 dtype,
                 seed=451):
        self._seed = seed
        self._idx = 0
        self.np_random, _ = seeding.np_random(seed)
        self._dtype = dtype
        self.env = MancalaEnv(seed)
        state = self.env._reset()

        self._model = ActorCritic(
            state.shape[0], self.env.action_space).type(dtype)
        self._model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu'))) 

    def _move(self, game):
        '''Return move which ends in score hole'''
        assert not game.over()
        self._idx += 1
        game_clone, rot_flag = game.clone_turn()
        move_options = Agent.valid_indices(game_clone)

        state = self.env.force(game_clone)
        state = torch.from_numpy(state).type(self._dtype)
        cx = Variable(torch.zeros(1, 400).type(self._dtype), volatile=True)
        hx = Variable(torch.zeros(1, 400).type(self._dtype), volatile=True)

        _, logit, (hx, cx) = self._model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        scores = [(action, score) for action, score in enumerate(
            prob[0].data.tolist()) if action in move_options]

        valid_actions = [action for action, _ in scores]
        valid_scores = np.array([score for _, score in scores])

        final_move = self.np_random.choice(valid_actions, 1, p=valid_scores/valid_scores.sum())[0]

        return Game.rotate_board(rot_flag, final_move)
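A hypothetical usage sketch for the class above. The model path, the Game() constructor, and calling _move directly are assumptions drawn from the method bodies, not from a documented API:

import torch

# hypothetical usage; the model path and the Game() constructor are assumptions
agent = AgentA3C(model_path='a3c_mancala.p', dtype=torch.FloatTensor)
game = Game()
if not game.over():
    hole = agent._move(game)   # index of the hole the A3C policy chooses
    print('A3C agent plays hole', hole)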
Example no. 3
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model((Variable(
            state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1, keepdim=True)[1].data.numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
Example no. 4
def test(rank, params, shared_model):
    # desynchronising the agents
    torch.manual_seed(params.seed + rank)
    # create the environment
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    # since this is test mode, put the model in evaluation mode
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    # initialize all the required parameters
    reward_sum = 0
    done = True
    # start time to measure the time of computations
    start_time = time.time()
    actions = deque(maxlen = 100)
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            # reload last state of the model
            model.load_state_dict(shared_model.state_dict())
            # reinitialize the cell and hidden states
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            # we keep the same cell and hidden states
            # while making sure they are torch variables
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        # get the predictions of the model
        # output of critic, output of actor, hidden and cell states
        value, action_value, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        # immediately play the action because there is no need to train
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0, 0])
        reward_sum += reward
        if done: # when the game is done
            print("Time {}, episode reward {}, episode length {}".format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, episode_length))
            # reinitialize everything after game is done
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            # pause for 60 seconds to let the other agents train
            time.sleep(60)
        # get new state
        state = torch.from_numpy(state)
Example no. 5
def test(rank, args, shared_model, dtype):
    test_ctr = 0
    torch.manual_seed(args.seed + rank)

    #set up logger
    timestring = str(date.today()) + '_' + time.strftime(
        "%Hh-%Mm-%Ss", time.localtime(time.time()))
    run_name = args.save_name + '_' + timestring
    configure("logs/run_" + run_name, flush_secs=5)

    env = create_atari_env(args.env_name, args.evaluate, run_name)
    env.seed(args.seed + rank)
    state = env.reset()

    model = ActorCritic(state.shape[0], env.action_space).type(dtype)

    model.eval()

    state = torch.from_numpy(state).type(dtype)
    reward_sum = 0
    max_reward = -99999999
    done = True

    start_time = time.time()

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256).type(dtype), volatile=True)
            hx = Variable(torch.zeros(1, 256).type(dtype), volatile=True)
        else:
            cx = Variable(cx.data.type(dtype), volatile=True)
            hx = Variable(hx.data.type(dtype), volatile=True)

        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0),
                                                 volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))

            # if not stuck or args.evaluate:
            log_value('Reward', reward_sum, test_ctr)
            log_value('Episode length', episode_length, test_ctr)

            if reward_sum >= max_reward:
                pickle.dump(shared_model.state_dict(),
                            open(args.save_name + '_max' + '.p', 'wb'))
                max_reward = reward_sum

            reward_sum = 0
            episode_length = 0
            state = env.reset()
            test_ctr += 1

            if test_ctr % 10 == 0 and not args.evaluate:
                pickle.dump(shared_model.state_dict(),
                            open(args.save_name + '.p', 'wb'))

            if not args.evaluate:
                time.sleep(60)
            elif test_ctr == evaluation_episodes:
                # Ensure the environment is closed so we can complete the submission
                env.close()
                gym.upload('monitor/' + run_name, api_key=api_key)

        state = torch.from_numpy(state).type(dtype)
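The test worker above pickles shared_model.state_dict() whenever a new best episode reward is reached, so a saved run can later be restored into a fresh ActorCritic. A minimal sketch, assuming the one-argument create_atari_env form used elsewhere in these examples; the environment id and checkpoint file name are illustrative:

import pickle
import torch

env = create_atari_env('PongDeterministic-v4')        # env id is illustrative
state = env.reset()
model = ActorCritic(state.shape[0], env.action_space)
model.load_state_dict(pickle.load(open('a3c_run_max.p', 'rb')))  # illustrative file name
model.eval()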
Example no. 6
def train(rank, args, shared_model, dtype):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)
    state = env.reset()

    model = ActorCritic(state.shape[0], env.action_space).type(dtype)

    optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    values = []
    log_probs = []

    state = torch.from_numpy(state).type(dtype)
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256).type(dtype))
            hx = Variable(torch.zeros(1, 256).type(dtype))
        else:
            cx = Variable(cx.data.type(dtype))
            hx = Variable(hx.data.type(dtype))

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.cpu().numpy())
            done = done or episode_length >= args.max_episode_length

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state).type(dtype)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1).type(dtype)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1).type(dtype)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - args.beta * entropies[i]

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
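The training loop above calls ensure_shared_grads(model, shared_model) right before optimizer.step(), but its definition is not part of the excerpt. In the widely used pytorch-a3c implementations it copies the worker's gradients onto the shared model roughly as follows (a sketch, possibly not byte-identical to the function these snippets import):

def ensure_shared_grads(model, shared_model):
    # copy the local worker's gradients onto the shared model's parameters so
    # the shared optimizer can apply them; skip if the shared gradients are
    # already populated
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad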
Example no. 7
        self.seed = 1
        self.num_processes = 16
        self.num_steps = 20
        self.max_episode_length = 10000
        self.env_name = 'Breakout-v0'


# Main run
os.environ['OMP_NUM_THREADS'] = '1'  # limit each process to one OpenMP thread
params = Params()  # get all our parameters and initialize them
torch.manual_seed(params.seed)  # set the seed
env = create_atari_env(
    params.env_name
)  # get the environment, create an optimized env using universe
shared_model = ActorCritic(
    env.observation_space.shape[0], env.action_space
)  # the model shared by every agent, kept in shared memory
shared_model.share_memory()
optimizer = a3c_custom_optim.SharedAdam(
    shared_model.parameters(),
    lr=params.lr)  # link the optimizer to the shared model so updates act on the shared parameters
optimizer.share_memory()  # keep the optimizer state in shared memory
processes = []
p = mp.Process(target=test, args=(
    params.num_processes, params,
    shared_model))  # runs a function in an independent process (torch.multiprocessing)
p.start()
processes.append(p)
for rank in range(0, params.num_processes):
    p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
    p.start()
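The excerpt stops right after starting the training workers. The usual ending of such a launcher (a sketch of the common pattern, not shown in this excerpt) keeps a handle on each worker and joins everything so the main script does not exit early:

for rank in range(0, params.num_processes):
    p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
    p.start()
    processes.append(p)   # keep a handle on each training worker

for p in processes:       # wait for the test worker and all training workers
    p.join()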
Example no. 8
    help='path/prefix for the filename to load shared model\'s parameters')
parser.add_argument('--evaluate',
                    action="store_true",
                    help='whether to evaluate results and upload to gym')

if __name__ == '__main__':
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

    env = create_atari_env(args.env_name)
    state = env.reset()
    shared_model = ActorCritic(state.shape[0], env.action_space).type(dtype)
    if args.load_name is not None:
        shared_model.load_state_dict(
            pickle.load(open(args.load_name + '.p', 'rb')))
    shared_model.share_memory()

    # train(1,args,shared_model,dtype)
    processes = []

    p = mp.Process(target=test,
                   args=(args.num_processes, args, shared_model, dtype))
    p.start()
    processes.append(p)

    if not args.evaluate:
        for rank in range(0, args.num_processes):
Example no. 9
def train(rank, args, shared_model, counter, lock, optimizer):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    obs = []

    state = env.reset()
    state = torch.from_numpy(state)
    obs.append(state)
    done = True

    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        human_obs = []
        action_dists = []
        values = []
        log_probs = []
        actions = []
        rewards = []
        entropies = []
        '''Requiring trajectories:
        value, logit, (hx, cx)
        '''

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            # print('value', value.data.numpy())
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            # print('log_prob', log_prob.data.numpy())
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)
            # action_dists.append(prob.data.numpy())
            action_dists.append(prob)
            # print('action_dists', action_dists)

            action = prob.multinomial().data
            actions.append(action)
            # print('actions', np.array(actions))
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, info = env.step(action.numpy())

            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            obs.append(state)
            # values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            human_obs.append(info.get("human_obs"))

            # print('log_probs', np.concatenate(log_probs))
            # print('human_obs', human_obs)
            # print('action_dists', action_dists)
            if done:
                # path = create_path(obs, human_obs, action_dists, rewards, actions)
                # print('create_path',)
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        path = create_path(obs, human_obs, action_dists, rewards, actions)
        print('create_path')
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
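This variant of train also receives a shared counter and a lock. Their creation is not shown in the excerpt; in the reference pytorch-a3c layout they are plain torch.multiprocessing primitives built in the launcher, roughly like this (args, shared_model and optimizer are assumed to exist as in Example no. 8):

import torch.multiprocessing as mp

counter = mp.Value('i', 0)   # global step counter shared by all workers
lock = mp.Lock()             # serializes increments of the counter

processes = []
for rank in range(args.num_processes):
    p = mp.Process(target=train,
                   args=(rank, args, shared_model, counter, lock, optimizer))
    p.start()
    processes.append(p)
for p in processes:
    p.join()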
Example no. 10
def train(rank, params, shared_model, optimizer):
    # desynchronize the training agents:
    # use rank to shift each seed; with n agents, rank runs from 0 to n-1
    torch.manual_seed(params.seed + rank) # desync each training agent
    # create environment for breakout
    env = create_atari_env(params.env_name)
    # align the environment's seed with the agent:
    # each agent has its own copy of the environment,
    # so each one gets a different seed and therefore an independent environment
    env.seed(params.seed + rank)
    # create a3c model
    model = ActorCritic(env.observation_space.shape[0], env.action_space) # input/output sizes come from the environment
    # get the initial state of the env: a 1 x 42 x 42 image (one grayscale channel)
    state = env.reset()
    # convert into torch tensors
    state = torch.from_numpy(state)
    # done is when game is over
    done = True
    episode_length = 0 # initialize the episode length
    while True:
        episode_length += 1
        model.load_state_dict(shared_model.state_dict())
        if done:
            # reinitialize the hidden and cell states
            # since output is 256 we need 256 zeroes
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            # keep the same cell and hidden states
            cx = Variable(cx.data)
            hx = Variable(hx.data)
        values = [] # value of the critic
        log_probs = []
        rewards = []
        entropies = []
        # loop over exploration steps
        for step in range(params.num_steps):
            # get the predictions of the model: the critic's value of the V function,
            # the actor's action scores, and the new hidden and cell states
            value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx))) # the state is unsqueezed to add a batch dimension
            # get the probabilities using softmax
            prob = F.softmax(action_values)
            # entropy is minus the sum of prob times log prob
            log_prob = F.log_softmax(action_values)
            entropy = -(log_prob * prob).sum(1)
            # append to entropies
            entropies.append(entropy)
            
            action = prob.multinomial().data # take a random draw of the actions available
            log_prob = log_prob.gather(1, Variable(action)) # associate with the action
            # append to values and log_probs
            values.append(value)
            log_probs.append(log_prob)
            # stepping the environment returns the new state, the reward, and the done flag
            state, reward, done, _ = env.step(action.numpy())
            # make sure agent is not stuck in a state
            # limit the time by limiting max_episode_length
            done = (done or episode_length >= params.max_episode_length)
            # make sure reward between -1 and +1
            reward = max(min(reward, 1), -1)
            # check if game is done and then restart environment
            if done:
                episode_length = 0
                state = env.reset()
            # remember that state is an image in the form of a numpy array
            state = torch.from_numpy(state)
            # append reward to the rewards now
            rewards.append(reward)
            if done: # stop exploration if done
                break
        # cumulative reward
        R = torch.zeros(1, 1)
        if not done: # bootstrap the cumulative reward from the value of the last state
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data
        values.append(Variable(R))
        # calculate loss now
        # remember we have 2 types of loss
        policy_loss = 0
        value_loss = 0
        R = Variable(R) # R must be a torch Variable since it enters the value loss
        # initialise the GAE generalised advantage estimation (advantage of action in state compared to another state)
        gae = torch.zeros(1, 1) # A(a, s) = Q(a, s) - V(s)
        # iterate backwards in time over the collected steps to accumulate the losses
        for i in reversed(range(len(rewards))):
            R = params.gamma * R + rewards[i] # we will get R = r_0 + gamma * r_1 + gamma^2 * r_2 + ... + gamma^(n-1) * r_(n-1) + gamma^nb_steps * V(last state)
            # compute the advantage of reward against the value
            advantage = R - values[i]
            # get the value loss Q*(a*, s) = V*(s)
            value_loss = value_loss + 0.5 * advantage.pow(2) # loss generated by the predictions of the V function output by the critic
            # use GAE for policy loss, temporal diff of state value
            TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
            gae = gae * params.gamma * params.tau + TD # gae = sum_i (gamma*tau)^i * TD(i)
            # we can now finally calculate policy loss
            # log of probability of the entropy are negative values
            # we maximise the probability of playing the action that will maximise the advantage
            # purpose of entropy is to prevent falling too quickly into a trap
            # where all actions 0 but one is 1, entropy is to control that from happening
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i] # policy_loss = - sum_i [log(pi_i) * gae_i + 0.01 * entropy_i]
        # apply stochastic gradient descent
        optimizer.zero_grad()
        # give more importance to the policy loss since it is smaller than the value loss
        (policy_loss + 0.5 * value_loss).backward()
        # prevent gradient from generating very large values
        # 40 is such that the norm of the gradient stays between 0 and 40
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)
        # make sure model and share_model share the same grad
        ensure_shared_grads(model, shared_model)
        # now optimize to reduce the losses
        optimizer.step()
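To see what the backward loop computes, here is a tiny standalone numeric check of the same discounted-return and GAE recursions on made-up rewards and values (pure illustration, independent of the snippets above):

# Toy check of the R and GAE recursions with gamma = 0.99, tau = 1.0.
gamma, tau = 0.99, 1.0
rewards = [0.0, 0.0, 1.0]          # made-up rewards
values = [0.5, 0.6, 0.7, 0.8]      # V(s_0..s_2) plus the bootstrap value V(s_3)

R = values[-1]
gae = 0.0
for i in reversed(range(len(rewards))):
    R = gamma * R + rewards[i]                                   # discounted return
    delta_t = rewards[i] + gamma * values[i + 1] - values[i]     # TD error
    gae = gae * gamma * tau + delta_t                            # generalized advantage
    print(i, round(R, 4), round(gae, 4))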
Example no. 11
                        type=str,
                        help='saved directory')
    parser.add_argument('--max-episode-length', default=8e7, type=int)
    parser.add_argument('--no_shared', default=False, type=bool, help='')
    return parser.parse_args()


# def printlog(args, s, end='\n', mode='a'):
#     print(s, end=end)
#     f=open(args.save_dir+'log.txt',mode) ; f.write(s+'\n') ; f.close()

args = get_args()
# print('args', args)
# env = make_env(args.env)
env = create_atari_env(args.env_name)
shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
shared_model.share_memory()

if args.no_shared:
    optimizer = None
else:
    optimizer = SharedAdam(shared_model.parameters(), lr=args.lr)
    optimizer.share_memory()
# exp_name = slugify(args.env)
# print('num_actions', env.action_space.n)
# n_pretrain_labels = 0

# episode_logger = EpisodeLogger('test')

# reward_model = OriginalEnvironmentReward(episode_logger)
# args.pretrain_iters = 0  # Don't bother pre-training a traditional RL agent
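Example no. 11 instantiates a SharedAdam whose definition is not included. In the common pytorch-a3c pattern it is an Adam subclass that pre-allocates its optimizer state and moves it into shared memory so every worker updates the same moment estimates; a compressed sketch of that pattern (the step() override that actually uses these shared tensors is omitted here):

import torch
import torch.optim as optim

class SharedAdam(optim.Adam):
    """Adam with its optimizer state kept in shared memory (sketch of the common pattern)."""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)
        # pre-allocate the per-parameter state so it can be shared before training starts
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        # move every state tensor into shared memory so all workers see the same statistics
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()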