Example #1
    def __init__(self):
        # self.env = gym.make(ENV_NAME)
        self.env = Environment(DRAW_TRAINING)
        self.state = self.env.reset()
        self.rewards = collections.defaultdict(float)
        self.transits = collections.defaultdict(collections.Counter)
        self.values = collections.defaultdict(float)
Example #2
class Agent:
    def __init__(self):
        self.env = Environment(DRAW_TRAINING)
        self.state = self.env.reset()
        self.values = collections.defaultdict(
            float)  # less memory wasted, only store q-values

    # sample one transition (s, a, r, new_state) from the environment
    def sample_env(self):
        action = self.env.get_action_random()
        old_state = self.state
        new_state, reward, is_done, _ = self.env.step(action)
        self.state = self.env.reset() if is_done else new_state
        return old_state, action, reward, new_state

    # iterate over all action values and return the best one
    def best_value_and_action(self, state):
        best_value, best_action = None, None
        for action in range(self.env.get_action_size()):
            action_value = self.values[(state, action)]
            if best_value is None or best_value < action_value:
                best_value = action_value
                best_action = action
        return best_value, best_action

    # calculate the Q-value for (s, a) and blend it into the existing table entry
    def value_update(self, s, a, r, next_s):
        best_v, _ = self.best_value_and_action(next_s)
        new_val = r + GAMMA * best_v
        old_val = self.values[(s, a)]
        self.values[(s, a)] = old_val * (1 - ALPHA) + new_val * ALPHA

    # the value table is not altered; this only evaluates the agent
    def play_episode(self, env):
        total_reward = 0.0
        state = env.reset()
        while True:
            _, action = self.best_value_and_action(state)
            new_state, reward, is_done, _ = env.step(action)
            total_reward += reward
            if is_done:
                break
            state = new_state
        return total_reward
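
The blend in value_update is the standard tabular Q-learning update, Q(s, a) <- (1 - ALPHA) * Q(s, a) + ALPHA * (r + GAMMA * max_a' Q(s', a')). A small worked example of that blend; the GAMMA and ALPHA values here are chosen purely for illustration and are not the project's constants:

GAMMA, ALPHA = 0.9, 0.2        # illustrative values only; the real constants are defined elsewhere
old_q = 1.0                    # current Q(s, a)
r, best_next = 0.5, 2.0        # reward and max_a' Q(s', a')
new_q = old_q * (1 - ALPHA) + (r + GAMMA * best_next) * ALPHA
print(new_q)                   # 0.8 * 1.0 + 0.2 * (0.5 + 0.9 * 2.0) = 1.26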
Example #3
class Agent:
    def __init__(self):
        # self.env = gym.make(ENV_NAME)
        self.env = Environment(DRAW_TRAINING)
        self.state = self.env.reset()
        self.rewards = collections.defaultdict(float)
        self.transits = collections.defaultdict(collections.Counter)
        self.values = collections.defaultdict(float)

    def play_n_random_steps(self, count):
        # rand = random.uniform(0.2, 0.8)     # outside roughly this range the bird does almost nothing
        for _ in range(count):
            # if _ % 1000 == 0:
            #     rand = random.uniform(0.2, 0.8)
            #     print(rand)
            # action = np.random.choice((0, 1), 1, p=(rand, 1 - rand))
            # action = action.item(0)
            action = self.env.get_action_random()
            new_state, reward, is_done, _ = self.env.step(action)
            self.rewards[(self.state, action, new_state)] = reward
            self.transits[(self.state, action)][new_state] += 1
            self.state = self.env.reset() if is_done else new_state
            # print(len(self.transits))

    def select_action(self, state):
        best_action, best_value = None, None
        for action in range(self.env.get_action_size()):
            action_value = self.values[(state, action)]
            if best_value is None or best_value < action_value:
                best_value = action_value
                best_action = action
        return best_action

    def play_episode(self, env):
        total_reward = 0.0
        state = env.reset()
        while True:
            action = self.select_action(state)
            new_state, reward, is_done, _ = env.step(action)
            self.rewards[(state, action, new_state)] = reward
            self.transits[(state, action)][new_state] += 1
            total_reward += reward
            if is_done:
                break
            state = new_state
        return total_reward

    def value_iteration(self):
        for state in range(self.env.get_observation_size()):
            for action in range(self.env.get_action_size()):
                action_value = 0.0
                target_counts = self.transits[(state, action)]
                total = sum(target_counts.values())
                for tgt_state, count in target_counts.items():
                    reward = self.rewards[(state, action, tgt_state)]
                    best_action = self.select_action(tgt_state)
                    action_value += (count / total) * (
                        reward + GAMMA * self.values[(tgt_state, best_action)])
                self.values[(state, action)] = action_value
Example #4
class Agent:
    def __init__(self):
        self.env = Environment(DRAW_TRAINING)
        self.state = self.env.reset()
        self.rewards = collections.defaultdict(float)
        self.transits = collections.defaultdict(collections.Counter)
        self.values = collections.defaultdict(float)

    def play_n_random_steps(self, count):
        for _ in range(count):
            action = self.env.get_action_random()
            new_state, reward, is_done, _ = self.env.step(action)
            self.rewards[(self.state, action, new_state)] = reward
            self.transits[(self.state, action)][new_state] += 1
            self.state = self.env.reset() if is_done else new_state

    def calc_action_value(self, state, action):
        target_counts = self.transits[(state, action)]
        total = sum(target_counts.values())
        action_value = 0.0
        for tgt_state, count in target_counts.items():
            reward = self.rewards[(state, action, tgt_state)]
            action_value += (count / total) * (reward +
                                               GAMMA * self.values[tgt_state])
        return action_value

    def select_action(self, state):
        best_action, best_value = None, None
        # for action in range(self.env.action_space.n):
        for action in range(self.env.get_action_size()):
            action_value = self.calc_action_value(state, action)
            if best_value is None or best_value < action_value:
                best_value = action_value
                best_action = action
        return best_action

    def play_episode(self, env):
        total_reward = 0.0
        state = env.reset()
        while True:
            action = self.select_action(state)
            new_state, reward, is_done, _ = env.step(action)
            self.rewards[(state, action, new_state)] = reward
            self.transits[(state, action)][new_state] += 1
            total_reward += reward
            if is_done:
                break
            state = new_state
        return total_reward

    def value_iteration(self):
        for state in range(self.env.get_observation_size()):
            state_values = [
                self.calc_action_value(state, action)
                for action in range(self.env.get_action_size())
            ]
            self.values[state] = max(state_values)
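
The value-table agent above is normally driven by alternating random exploration, a value-iteration sweep, and greedy test episodes. A minimal sketch of that loop, reusing Environment, DRAW and TEST_EPISODES from the surrounding examples; the step count and the stopping threshold are assumptions:

if __name__ == "__main__":
    test_env = Environment(DRAW)
    agent = Agent()

    iter_no = 0
    best_reward = 0.0
    while True:
        iter_no += 1
        agent.play_n_random_steps(100)   # gather transition and reward statistics
        agent.value_iteration()          # one sweep over the whole state space

        # measure the greedy policy without changing the statistics
        reward = 0.0
        for _ in range(TEST_EPISODES):
            reward += agent.play_episode(test_env)
        reward /= TEST_EPISODES

        if reward > best_reward:
            print("Best reward updated %.3f -> %.3f" % (best_reward, reward))
            best_reward = reward
        if best_reward > 50:             # assumed stopping threshold
            break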
Example #5
def play_flappyb():
    env = Environment(  # True, 1, True, DIFFICULTY_PLAY, OBS_THIS_PIPE_PLAY)
        draw=True,
        fps=1,
        debug=True,
        dist_to_pipe=DIFFICULTY_PLAY,
        dist_between_pipes=DIST_BETWEEN_PIPES,
        obs_this_pipe=OBS_THIS_PIPE_PLAY)

    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()

    model = keras.models.load_model('models/dqn/{}.h5'.format(LOAD_NAME))
    dqn_solver = DQNSolver(observation_space, action_space, model)

    for i in range(20):
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        is_done = False
        while not is_done:
            action = dqn_solver.act_free(state)
            # action = env.get_action_random()
            state_next, reward, terminal, info = env.step_buffer(action)
            is_done = terminal
            state = np.reshape(state_next, [1, observation_space])
Example #6
def learn_flappyb():
    env = Environment(  # DRAW, 1, False, DIFFICULTY_LEARN)
        draw=DRAW,
        fps=1,
        debug=False,
        dist_to_pipe=DIFFICULTY_LEARN,
        dist_between_pipes=DIST_BETWEEN_PIPES,
        obs_this_pipe=OBS_THIS_PIPE_LEARN)
    writer = None
    if WRITE:
        writer = SummaryWriter(comment=NAME)
    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()

    model = load_model('models/dqn/{}.h5'.format(LOAD_NAME))
    dqn_solver = DQNSolver(observation_space, action_space, model)
    run = 0

    if SAVE_MODEL:
        name = '{}-PART={}'.format(NAME, run)
        dqn_solver.model.save('models/dqn/{}.h5'.format(name))
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        reward_score = 0

        while True:
            step += 1
            action = dqn_solver.act(state, env)
            state_next, reward, terminal, info = env.step_buffer(action)
            reward_score += reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " +
                      str(dqn_solver.exploration_rate) + ", score: " +
                      str(reward_score))
                if WRITE:
                    writer.add_scalar("reward", reward_score, run)
                break
            dqn_solver.experience_replay()
        if (run % 100 == 0) and SAVE_MODEL:
            name = '{}-PART={}'.format(NAME, run)
            dqn_solver.model.save('models/dqn/{}.h5'.format(name))
    if WRITE:
        writer.close()
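
learn_flappyb and play_flappyb rely on a DQNSolver class that is not shown in these examples. Below is a minimal sketch of a solver exposing the same interface (act, act_free, remember, experience_replay, model, exploration_rate), assuming a compiled Keras model and epsilon-greedy exploration; every hyperparameter value here is an assumption, not taken from the project:

import random
from collections import deque

import numpy as np


class DQNSolver:
    def __init__(self, observation_space, action_space, model,
                 memory_size=100000, batch_size=32, gamma=0.95,
                 exploration_max=1.0, exploration_min=0.01,
                 exploration_decay=0.995):
        self.observation_space = observation_space
        self.action_space = action_space
        self.model = model                      # compiled Keras model
        self.memory = deque(maxlen=memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.exploration_rate = exploration_max
        self.exploration_min = exploration_min
        self.exploration_decay = exploration_decay

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, env):
        # epsilon-greedy action selection during training
        if np.random.rand() < self.exploration_rate:
            return env.get_action_random()
        q_values = self.model.predict(state)
        return int(np.argmax(q_values[0]))

    def act_free(self, state):
        # greedy action only, used when playing with a trained model
        q_values = self.model.predict(state)
        return int(np.argmax(q_values[0]))

    def experience_replay(self):
        if len(self.memory) < self.batch_size:
            return
        for state, action, reward, state_next, terminal in random.sample(
                self.memory, self.batch_size):
            q_update = reward
            if not terminal:
                q_update = reward + self.gamma * np.amax(
                    self.model.predict(state_next)[0])
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate = max(self.exploration_min,
                                    self.exploration_rate * self.exploration_decay)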
Example #7
    def __init__(self):
        self.env = Environment(DRAW_TRAINING)
        self.state = self.env.reset()
        self.values = collections.defaultdict(
            float)  # less memory wasted, only store q-values
Example #8
    # the value table is not altered; this only evaluates the agent
    def play_episode(self, env):
        total_reward = 0.0
        state = env.reset()
        while True:
            _, action = self.best_value_and_action(state)
            new_state, reward, is_done, _ = env.step(action)
            total_reward += reward
            if is_done:
                break
            state = new_state
        return total_reward


if __name__ == "__main__":
    test_env = Environment(DRAW)
    agent = Agent()
    writer = None
    if WRITE:
        writer = SummaryWriter(comment=NAME)
    iter_no = 0
    best_reward = 0.0
    while True:
        iter_no += 1
        print('#', iter_no)
        s, a, r, next_s = agent.sample_env()
        agent.value_update(s, a, r, next_s)

        reward = 0.0
        for _ in range(TEST_EPISODES):
            reward += agent.play_episode(test_env)
    train_obs = []
    train_act = []
    elite_batch = []
    for example, discounted_reward in zip(batch, disc_rewards):
        if discounted_reward > reward_bound:
            train_obs.extend(map(lambda step: step.observation, example.steps))
            train_act.extend(map(lambda step: step.action, example.steps))
            elite_batch.append(example)

    return elite_batch, train_obs, train_act, reward_bound
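
The cross-entropy fragment that follows loads weights into a Net(obs_size, n_actions), but the network class itself does not appear in these examples. Below is a minimal sketch of a network matching that constructor call; the single hidden layer and its size are assumptions:

import torch.nn as nn


class Net(nn.Module):
    def __init__(self, obs_size, n_actions, hidden_size=128):
        super(Net, self).__init__()
        # outputs raw logits; nn.CrossEntropyLoss applies the softmax internally
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        )

    def forward(self, x):
        return self.net(x)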


if __name__ == "__main__":
    random.seed(12345)
    env = Environment(DRAW)         # activate save

    obs_size = env.get_observation_size()
    n_actions = env.get_action_size()

    net = Net(obs_size, n_actions)
    net.load_state_dict(torch.load('models/cross_entropy/{}-PART=240.pt'.format(NAME)))
    net.eval()

    # torch.save(net.state_dict(), 'models/cross_entropy/{}-PART=0.pt'.format(NAME))

    objective = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=net.parameters(), lr=LEARNING_RATE)

    writer = None
    if WRITE:
        writer = SummaryWriter(comment=NAME)


class Agent:
    def __init__(self):
        self.total_reward = 0

    def step(self, env):
        current_obs = env.get_observation_space()  # empty for now
        action = env.get_action_random()
        obs, reward, is_done, _ = env.step(action)
        self.total_reward += reward


# HUMAN PLAYS

env = Environment(draw=True,
                  fps=20,
                  debug=True,
                  dist_to_pipe=40,
                  dist_between_pipes=150,
                  obs_this_pipe=True)
env.run_human_game()

# RANDOM AGENT

# agent = Agent()
# env = Environment(True, 10)

# for i in range(10):
# 	env.reset()
# 	while not env.is_done:
# 	    agent.step(env)

# print("Total reward = {}".format(agent.total_reward))