Example 1
#%matplotlib inline
from collections import namedtuple

import gym
import gym_minigrid  # assumed to register the MiniGrid-* environments on import
import matplotlib
import matplotlib.pyplot as plt
import torch
import torchvision.transforms as T
import yaml
from PIL import Image

# Plotter is a project-local plotting helper whose module is not shown in this excerpt

env = gym.make("MiniGrid-Empty-8x8-v0").unwrapped

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

screen_width = 600

resize = T.Compose([T.ToPILImage(), T.Resize(40, interpolation=Image.BICUBIC), T.ToTensor()])

plotter = Plotter()

with open("config.yml", 'r') as ymlfile:
    cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display
plt.ion()

env.reset()


class ReplayMemory(object):

    def __init__(self, capacity):
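The excerpt stops at the constructor. Purely as a hedged sketch (the original body is not shown), a minimal replay buffer in the usual PyTorch DQN-tutorial style, reusing the Transition namedtuple above and assuming `import random`, might look like:

class ReplayMemory(object):
    """Fixed-size cyclic buffer of Transition tuples (sketch, not the original body)."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # Save a transition, overwriting the oldest entry once the buffer is full
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        # Uniform random minibatch (assumes: import random)
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)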
Example 2
def main():
    print(
        "Warning: location-based QL is deprecated and non-functional; use QL-obs instead"
    )
    return

    parser = OptionParser()
    parser.add_option("-e",
                      "--env-name",
                      dest="env_name",
                      help="gym environment to load",
                      default='MiniGrid-Empty-8x8-v0')
    (options, args) = parser.parse_args()

    # Load the gym environment
    env = gym.make(options.env_name)

    def resetEnv():
        env.seed()
        env.reset()

    # Convert action from numeric value to environmental directional actions
    def get_action(temp_action):
        act = None

        if temp_action == 0:
            act = env.actions.left
        elif temp_action == 1:
            act = env.actions.up
        elif temp_action == 2:
            act = env.actions.right
        elif temp_action == 3:
            act = env.actions.down
        else:
            print("unknown key")
            return

        return act

    # Map 2-D grid positions (rows/columns) to 1-D state indices,
    # e.g. the walkable cells of an 8x8 grid map to states 0..35
    def table_conversion():
        width = env.width - 2
        pos_loc = []

        for i in range(width):
            pos_loc.append(np.arange(width * i, width * (i + 1)))
        return pos_loc
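    # e.g. for MiniGrid-Empty-8x8-v0 the outer wall is excluded, so width = 6 and
    # table_conversion() returns [[0..5], [6..11], ..., [30..35]];
    # in general pos_loc[i][j] == width * i + j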

    plotter = Plotter()

    # Initialize environment
    resetEnv()

    # parameters, can be adjusted
    episodes = 500
    epsilon = 0.8
    decay = 0.99
    alpha = 0.1
    gamma = 0.6

    # metrics
    steps_to_complete = []

    # Initialize q-table [observation space x action space]
    q_table = np.zeros([env.observation_space.n, env.action_space.n])
    table_locator = table_conversion()

    for e in range(episodes):
        # Calculate new epsilon-decay value -- decays with each new episode
        epsilon = epsilon * decay
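        # epsilon decays geometrically each episode (0.8 * 0.99**e with the defaults above),
        # ending near 0.005 after the 500 episodes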

        # Initial agents
        agents = env.reset()

        states = {}
        for agent_id, agent_pos in agents.agent_pos.items():
            # Convert the grid position to a 1-D state value
            states[agent_id] = table_locator[agent_pos[0] - 1][agent_pos[1] - 1]

        while True:
            renderer = env.render('human')

            time.sleep(5)

            # Determine whether to explore or exploit for all agents during current step
            if random.uniform(0, 1) < epsilon:
                exploit = False  #explore
            else:
                exploit = True  #exploit

            # Determine action for each agent
            actions = {}
            for agent_id, agent_pos in agents.agent_pos.items():
                if exploit is False:
                    temp_action = env.action_space.sample()  #explore
                else:
                    temp_action = np.argmax(
                        q_table[states[agent_id]])  #exploit

                # Convert action from numeric to environment-accepted directional action
                actions[agent_id] = get_action(temp_action)

            # Take step
            obs, reward, done, agents, info = env.step(actions)
            # print('reward=%.2f' % (reward))
            print(obs['image'][0])

            # Calculate q-table values for each agent
            for agent_id, agent_pos in agents.agent_pos.items():
                # Using the agent's new position returned from the environment, convert grid coordinates to the table-based next state
                next_state = table_locator[agent_pos[0] - 1][agent_pos[1] - 1]
                old_val = q_table[states[agent_id], actions[agent_id]]

                # New possible max at the next state for q table calculations
                next_max = np.max(q_table[next_state])

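                # Standard tabular Q-learning update:
                #   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))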
                # Calculate the new Q-value
                new_q_val = (1 - alpha) * old_val + alpha * (reward[agent_id] + gamma * next_max)
                print(
                    str(agent_id) + ':' +
                    'step=%s,reward=%.2f, new_q_val=%.2f, state=%i, action=%s'
                    % (env.step_count, reward[agent_id], new_q_val,
                       states[agent_id], actions[agent_id]))
                # print(obs[agent_id])
                q_table[states[agent_id], actions[agent_id]] = new_q_val

                states[agent_id] = next_state

            # time.sleep(10000)
            # time.sleep(1.5)

            if done:
                # plot steps by episode
                steps_to_complete.append(env.step_count)
                plotter.plot_steps(steps_to_complete, '-lr')

                print('done!')
                print(q_table)
                break

    print("Training finished.\n")
Example 3
def main():
    parser = OptionParser()
    parser.add_option(
        "-e",
        "--env-name",
        dest="env_name",
        help="gym environment to load",
        default='MiniGrid-Empty-8x8-v0'
    )
    (options, args) = parser.parse_args()

    # Load the gym environment
    env = gym.make(options.env_name)

    # def resetEnv():
    #     env.seed()
    #     env.reset()

    # def sha1(s):
    #     return hashlib.sha1(s).hexdigest()

    # Convert action from numeric value to environmental directional actions
    def get_action(temp_action):
        act = None

        if temp_action == 0:
            act = env.actions.left
        elif temp_action == 1:
            act = env.actions.up
        elif temp_action == 2:
            act = env.actions.right
        elif temp_action == 3:
            act = env.actions.down
        else:
            print("unknown key")
            return

        return act

    plotter = Plotter()

    with open("config.yml", 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    # render boolean
    grid_render = cfg['rnd']['grid_render']
    grid_obs_render = cfg['rnd']['grid_obs_render']
    obs_render = cfg['rnd']['obs_render']
    gray = cfg['rnd']['grayscale']
    sleep = cfg['rnd']['sleep']

    # parameters, can be adjusted in config.yml    
    episodes = cfg['ql']['episodes']
    epsilon = cfg['ql']['epsilon']
    decay = cfg['ql']['decay']
    alpha = cfg['ql']['alpha']
    gamma = cfg['ql']['gamma']
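    # The config.yml read above is assumed to look roughly like this
    # (keys taken from the lookups in this script; the values are only illustrative,
    # mirroring the hard-coded defaults of the location-based variant):
    #   rnd: {grid_render: true, grid_obs_render: false, obs_render: false, grayscale: false, sleep: 0.0}
    #   ql:  {episodes: 500, epsilon: 0.8, decay: 0.99, alpha: 0.1, gamma: 0.6}
    #   env: {obstacles: 0}   # 'obstacles' is also read when naming the steps CSV below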

    # metrics
    steps_to_complete = []

    # Initialize q-table [observation space x action space]
    # q_table = defaultdict(lambda: np.random.uniform(size=(env.action_space.n,)))
    q_table = defaultdict(lambda: np.zeros(shape=(len(env.actions),)))
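    # Keys are the flattened observation strings built below; unseen observations
    # default to a zero Q-value vector over the environment's action set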

    run_ep = 0
    for e in range(episodes+1000):
        if e >= episodes and run_ep == 0:
            grid_obs_render = True
            grid_render = True
            run_ep = int(input("Enter number of episodes: "))
            sleep = float(input("Enter sleep interval: "))

        if run_ep > 0:
            run_ep -= 1

        # Calculate new epsilon-decay value -- decays with each new episode
        epsilon = epsilon*decay

        # Reset the environment and get the initial per-agent observations
        obs = env.reset()

        states = {}
        for agent_id in obs:
            # Flatten the agent's observation grid into a string key for the Q-table
            # states[agent_id] = sha1(np.array(init_obs[agent_id]))
            temp_obs = ''
            for row in obs[agent_id]:
                temp_obs += ',' + ','.join(map(str, row))
            states[agent_id] = temp_obs

        while True:
            if obs_render:
                env.get_obs_render(obs, grayscale=gray)
            if grid_render:
                env.render('human', highlight=grid_obs_render, grayscale=gray, info="Episode: %s \tStep: %s" % (str(e),str(env.step_count)))

            time.sleep(sleep)

            # Determine whether to explore or exploit for all agents during current step
            if np.random.uniform(0, 1) < epsilon:
                exploit = False #explore
            else:
                exploit = True  #exploit

            # Determine action for each agent
            actions = {}
            for agent_id in obs:
                if exploit is False:
                    temp_action = env.action_space.sample() #explore
                else:
                    temp_action = np.argmax(q_table[states[agent_id]]) #exploit

                # Convert action from numeric to environment-accepted directional action
                actions[agent_id] = get_action(temp_action)

            # Take step
            obs, reward, done, agents, info = env.step(actions)
            
            # Calculate q-table values for each agent
            for agent_id in obs:
                # Flatten the agent's new observation into the next-state key
                # next_state = sha1(np.array(obs[agent_id]))
                next_state = ''
                for row in obs[agent_id]:
                    next_state += ',' + ','.join(map(str, row))
                old_val = q_table[states[agent_id]][actions[agent_id]]
                
                # New possible max at the next state for q table calculations
                next_max = np.max(q_table[next_state])

                # Calculate new q value
                new_q_val = (1-alpha) * old_val + alpha * (reward[agent_id] + gamma * next_max)
                print(str(agent_id) + ':' + 'episode=%s, step=%s, reward=%.2f, new_q_val=%.2f, state=%s, action=%s' \
                            % (e, env.step_count, reward[agent_id], new_q_val, states[agent_id], actions[agent_id]))
                
                q_table[states[agent_id]][actions[agent_id]] = new_q_val

                states[agent_id] = next_state
           
            if done:
                print('done!')

                # plot steps by episode
                steps_to_complete.append(env.step_count)

                # if e % 1000 == 0:
                #     plotter.plot_steps(steps_to_complete)
                #     with open("qt_output.csv", "w") as outfile:
                #         writer = csv.writer(outfile)
                #         for key, val in q_table.items():
                #             writer.writerow([key, *val])
                break


    print("Training finished.\n")

    # csv store steps_to_complete
    filename = "steps_{}x{}_o{}_a{}_r{}_t{}.csv".format(env.grid_size, env.grid_size, cfg['env']['obstacles'], env.n_agents, env.obs_radius, env.reward_type)
    with open(filename, "w", newline="") as outfile:
        w = csv.writer(outfile)
        for i, steps in enumerate(steps_to_complete):
            w.writerow([i, steps])

    # png save plot/show
    plotter.plot_steps(steps_to_complete)

    # #csv store q_table
    # w = csv.writer(open("qt_output.csv", "w+"))
    # for key, val in q_table.items():
    #     w.writerow([key, val])

    # pickle the q_table as a plain dict (the defaultdict's lambda factory cannot be pickled)
    for key in q_table:
        print(key)
    with open("qt.pkl", "wb") as f:
        pickle.dump(dict(q_table), f)
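As a hedged follow-up sketch (not part of the original script), the pickled qt.pkl could be reloaded for greedy evaluation roughly as follows; obs_to_key simply mirrors the flattening used in the training loop above, and the action count is inferred from the stored values:

import pickle
from collections import defaultdict

import numpy as np


def load_q_table(path="qt.pkl", n_actions=4):
    # Rebuild the defaultdict so unseen observation keys still yield zero Q-values
    with open(path, "rb") as f:
        data = pickle.load(f)
    if data:
        n_actions = len(next(iter(data.values())))
    q_table = defaultdict(lambda: np.zeros(shape=(n_actions,)))
    q_table.update(data)
    return q_table


def obs_to_key(agent_obs):
    # Same flattening used during training: each row joined with commas,
    # prefixed by a leading comma
    key = ''
    for row in agent_obs:
        key += ',' + ','.join(map(str, row))
    return key


def greedy_action(q_table, agent_obs):
    # Pick the highest-valued action for this observation
    return int(np.argmax(q_table[obs_to_key(agent_obs)]))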