Example #1
def drop_piece(self, action, old_board):
    # Animate dropping a piece on a copy of the board so the original stays untouched.
    board = old_board.clone()
    # action supplies (rel_y, rotation_idx); rel_x is reset to 1 before the drop.
    board.rel_x, board.rel_y, board.rotation_idx = 1, *action
    # While a (1, 0) move is still available, erase the piece, advance one row,
    # redraw it, and refresh the GUI with a short delay for the animation.
    while env.is_available(board, (1, 0)):
        board = env.make(board, env.EMPTY)
        board.rel_x += 1
        board = env.make(board, env.PIECE)
        self.board = board.area
        self.update()
        self.root.update()
        time.sleep(self.speed)
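# Usage sketch (hypothetical names): animate a chosen placement on a copy of the
# live board, e.g.
#   self.drop_piece(action=(rel_y, rotation_idx), old_board=self.current_board)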
Example #2
def main():
    args = arg_parser()

    if args.mode == "train":
        env = environment.make(args.env, args)
        if args.networks == "MLP":
            nn = MLP(env.observation_space.shape[0], env.action_space,
                     args.n_frames)
        elif args.networks == "CONV":
            nn = CONV(args.n_frames, env.action_space)

        optimizer = SharedAdam(nn.parameters())

        threads = []
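        # One process runs test() to monitor the shared model and save checkpoints,
        # while args.n_workers processes run train() and update the shared parameters
        # through the SharedAdam optimizer.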
        thread = mp.Process(target=test, args=(args, nn))
        thread.start()
        threads.append(thread)

        for i in range(args.n_workers):
            thread = mp.Process(target=train, args=(i, args, nn, optimizer))
            thread.start()
            threads.append(thread)

        for thread in threads:
            thread.join()
    elif args.mode == "test":
        evaluate(args)
def test(args, nn):
    ptitle('Test Agent')

    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    env = environment.make(args.env, args)

    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)

    # Mirror the network choice used for the shared model so load_state_dict matches.
    if args.networks == "MLP":
        player.model = MLP(player.env.observation_space.shape[0],
                           player.env.action_space, args.n_frames)
    elif args.networks == "CONV":
        player.model = CONV(args.n_frames, player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()

    player.model.eval()
    max_score = 0

    while True:
        if player.done:
            player.model.load_state_dict(nn.state_dict())

        player.action_test()
        reward_sum += player.reward

        if player.done:
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, reward {1}, average reward {2:.4f}".format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, reward_mean))

            if reward_sum >= max_score:
                max_score = reward_sum
                state_to_save = player.model.state_dict()
                torch.save(state_to_save, '{}.dat'.format(args.model_save_dir))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            time.sleep(60)
            player.state = torch.from_numpy(state).float()
def evaluate(args):
    torch.set_default_tensor_type('torch.FloatTensor')

    saved_state = torch.load(
        '{}.dat'.format(args.model_load_dir),
        map_location=lambda storage, loc: storage
    )

    log = {}
    setup_logger('{}_eval_log'.format(args.env), r'{0}{1}_eval_log'.format(
        args.log, args.env))
    log['{}_eval_log'.format(args.env)] = logging.getLogger(
        '{}_eval_log'.format(args.env))

    d_args = vars(args)
    for k in d_args.keys():
        log['{}_eval_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    env = environment.make("{}".format(args.env), args)
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)

    if args.networks == "MLP":
        player.model = MLP(env.observation_space.shape[0], env.action_space, args.n_frames)
    elif args.networks == "CONV":
        player.model = CONV(args.n_frames, env.action_space)

    # Record every evaluation episode with gym's Monitor wrapper.
    player.env = gym.wrappers.Monitor(
        player.env, "{}_monitor".format(args.env), lambda episode_id: True, force=True)

    player.model.load_state_dict(saved_state)

    player.model.eval()
    for i_episode in range(args.rollout):
        player.state = player.env.reset()
        player.state = torch.from_numpy(player.state).float()
        player.eps_len = 0
        reward_sum = 0
        while True:
            if args.render:
                player.env.render()

            player.action_test()
            reward_sum += player.reward

            if player.done:
                num_tests += 1
                reward_total_sum += reward_sum
                reward_mean = reward_total_sum / num_tests
                log['{}_eval_log'.format(args.env)].info(
                    "reward, {0}, average reward, {1:.4f}".format(reward_sum, reward_mean))
                break
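
# Sketch (assumption): a minimal arg_parser() covering the command-line flags read
# by main(), test(), evaluate() and train(); the flag names come from the code,
# but every default shown here is hypothetical.
import argparse


def arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['train', 'test'], default='train')
    parser.add_argument('--env', default='CartPole-v0')
    parser.add_argument('--networks', choices=['MLP', 'CONV'], default='MLP')
    parser.add_argument('--n_frames', type=int, default=4)
    parser.add_argument('--n_workers', type=int, default=4)
    parser.add_argument('--n_steps', type=int, default=20)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--tau', type=float, default=1.0)
    parser.add_argument('--log', default='logs/')
    parser.add_argument('--model_save_dir', default='trained_models/model')
    parser.add_argument('--model_load_dir', default='trained_models/model')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--rollout', type=int, default=10)
    return parser.parse_args()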
Example #5
if __name__ == "__main__":
    logging.disable(logging.NOTSET)
    nElevator = int(sys.argv[1])
    nFloor = int(sys.argv[2])
    spawnRates = [1/360]+[1/360]*(nFloor-1)
    avgWeight = 135
    weightLimit = 1200
    loadTime = 1
    beta = 0.01
    lr = 1e-4

"""
Initialize environment and optimizers
"""
# initialize environment
env = gym.make(nElevator, nFloor, spawnRates, avgWeight, weightLimit, loadTime)
obssize = env.observation_space_size
actsize = env.action_space_size
print("state space dimension", obssize)
print("action space size", actsize)

# initialize tensorflow session
sess = tf.Session()

# initialize an optimizer for each elevator
optimizer_list = []
for i in range(nElevator):
    optimizer_list.append(tf.train.AdamOptimizer(lr))

# initialize a NNet for each elevator
Q = []
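# Sketch (assumption): the snippet ends here; a plausible continuation creates one
# network per elevator with a hypothetical QNetwork class, e.g.
#   for i in range(nElevator):
#       Q.append(QNetwork(obssize, actsize, optimizer_list[i], sess))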
Example #6
def main():
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--cuda", default=True, action='store_true',
    # help='Enable cuda')
    # parser.add_argument("--env", default=DEFAULT_ENV_NAME,
    # help='Name of the environment, default = ' + DEFAULT_ENV_NAME)
    # parser.add_argument('--reward', type=float, default=MEAN_REWARD_BOUND,
    # help='Mean reward boundary for stopping of training, default = %.2f'%(MEAN_REWARD_BOUND))
    # args = parser.parse_args()
    # device = torch.device('cuda' if args.cuda else 'cpu')
    device = torch.device('cuda')
    env = make('glovedatarms.npy', 'labels2.npy', 4, 1, -1)
    net = dqn.DQN(env.observation_shape, env.n_actions).to(device)
    tgt_net = dqn.DQN(env.observation_shape, env.n_actions).to(device)
    print(net)
    net.load_state_dict(torch.load('best.dat'))
    buffer = ExperienceBuffer(REPLAY_SIZE)
    # Restore the replay buffer saved by a previous run (replaces the fresh buffer above).
    with open('replay_mem.obj', 'rb') as f:
        buffer = pickle.load(f)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    total_rewards = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward = None
    while True:
        frame_idx += 1
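        # Linear epsilon decay: anneal from EPSILON_START toward EPSILON_FINAL over
        # the first EPSILON_DECAY_LAST_FRAME frames, then hold at the floor value.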
        epsilon = max(EPSILON_FINAL,
                      EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)
        reward = agent.play_step(net, epsilon, device)
        if reward is not None:
            total_rewards.append(reward)
            speed = (frame_idx - ts_frame) / (time.time() - ts)
            ts_frame = frame_idx
            ts = time.time()
            mean_reward = np.mean(total_rewards[-100:])
            print(
                "%d: done %d passes, mean reward %.3f, eps %.2f, speed %.2f f/s"
                % (frame_idx, len(total_rewards), mean_reward, epsilon, speed))
            if best_mean_reward is None or best_mean_reward < mean_reward:
                torch.save(net.state_dict(), 'best.dat')
                with open('replay_mem.obj', 'wb') as fh:
                    pickle.dump(buffer, fh)
                if best_mean_reward is not None:
                    print(
                        "Best mean reward updated %.3f -> %.3f, model saved" %
                        (best_mean_reward, mean_reward))
                best_mean_reward = mean_reward
            # Stop once the 100-episode mean reward clears the reward bound.
            if mean_reward > MEAN_REWARD_BOUND:
                print("Solved in %d frames!" % frame_idx)
                break
        if len(buffer) < REPLAY_START_SIZE:
            continue
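        # Periodically sync the target network with the online network
        # (standard DQN target-network update).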
        if frame_idx % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())
        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        loss_t = calc_loss(batch, net, tgt_net, device=device)
        loss_t.backward()
        optimizer.step()
# NOTE: the original definition of `resize` is truncated at this point in the
# source; a typical reconstruction (the exact resize size is an assumption) is:
import numpy as np
import torch
import torchvision.transforms as T

resize = T.Compose([T.ToPILImage(),
                    T.Resize(84),
                    T.ToTensor()])


def get_screen(env, device):
    # transpose into torch order (CHW)
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))

    # Strip off the top and bottom of the screen
    screen = screen[:, 160:320]

    # Convert to float, rescale, convert to torch tensor
    # (this doesn't require a copy)
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)
    # Resize, and add a batch dimension (BCHW)
    return resize(screen).unsqueeze(0).to(device)


if __name__ == "__main__":
    import matplotlib.pyplot as plt
    from environment import make

    env = make("unity")
    env.start()

    plt.figure()
    plt.imshow(get_screen(env, torch.device("cpu")).cpu().squeeze(0).permute(
        1, 2, 0).numpy(),
               interpolation='none')
    plt.title('Example extracted screen')
    plt.show()
def train(rank, args, nn, optimizer):
    ptitle('Training Agent: {}'.format(rank))

    env = environment.make(args.env, args)
    env.seed(RANDOM_SEED + rank)

    player = Agent(None, env, args, None)
    if args.networks == "MLP":
        player.model = MLP(player.env.observation_space.shape[0],
                           player.env.action_space, args.n_frames)
    elif args.networks == "CONV":
        player.model = CONV(args.n_frames, player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.train()

    while True:
        player.model.load_state_dict(nn.state_dict())
        if player.done:
            player.cx = Variable(torch.zeros(1, 128))
            player.hx = Variable(torch.zeros(1, 128))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.n_steps):

            player.action_train()

            if player.done:
                break

        if player.done:
            player.eps_len = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()

        R = torch.zeros(1, 1)

        if not player.done:
            state = player.state
            value, _, _, _ = player.model(
                (Variable(state), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
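        # Generalized Advantage Estimation: walk the rollout backwards, accumulating
        # the discounted return R and the advantage signal
        #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        #   gae_t   = gamma * tau * gae_{t+1} + delta_t
        # The policy loss weights log-probabilities by gae; the value loss regresses
        # V(s_t) toward R.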
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                (player.log_probs[i].sum() * Variable(gae)) - \
                (0.01 * player.entropies[i].sum())

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
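        # Copy this worker's gradients into the shared model so the shared optimizer
        # step updates the parameters every worker loads at the top of the loop.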
        ensure_shared_grads(player.model, nn, gpu=False)
        optimizer.step()
        player.clear_actions()