Esempio n. 1
0
def evaluate(name='or', max_trajectory=20):
    """Run one greedy episode for the Boolean task `name` and return its return.

    `name` selects both the value function to follow and the matching goal
    predicate over items (tested through their `colour` and `shape`
    attributes). Names use 'B' for blue, 'S' for square, '.' for and, '+' for
    or and '-' for not. An unrecognised name prints "Invalid name" and
    returns None.

    Args:
        name: Task identifier, e.g. 'B.S', '-(B+S)' or 'BxorS'.
        max_trajectory: Step limit given to MaxLength and used as the bound of
            the rollout loop.

    Returns:
        The sum of rewards collected during the episode, or None for an
        invalid name.
    """
    def blue(item):
        return item.colour == 'blue'

    def square(item):
        return item.shape == 'square'

    # name -> (network builder, goal predicate). Builders are lazy so that a
    # ComposedDQN is only constructed for the task that was actually requested.
    tasks = {
        'B.S': (lambda: dqn_and,
                lambda x: blue(x) and square(x)),
        'B.-S': (lambda: ComposedDQN([dqn_blue, dqn_not_square], compose="and"),
                 lambda x: blue(x) and not square(x)),
        'S.-B': (lambda: ComposedDQN([dqn_square, dqn_not_blue], compose="and"),
                 lambda x: square(x) and not blue(x)),
        '-(B+S)': (lambda: dqn_not_or,
                   lambda x: not (blue(x) or square(x))),
        'B': (lambda: dqn_blue,
              blue),
        '-B': (lambda: dqn_not_blue,
               lambda x: not blue(x)),
        'S': (lambda: dqn_square,
              square),
        '-S': (lambda: dqn_not_square,
               lambda x: not square(x)),
        'B+S': (lambda: dqn_or,
                lambda x: blue(x) or square(x)),
        'B+-S': (lambda: ComposedDQN([dqn_blue, dqn_not_square], compose="or"),
                 lambda x: blue(x) or not square(x)),
        'S+-B': (lambda: ComposedDQN([dqn_square, dqn_not_blue], compose="or"),
                 lambda x: square(x) or not blue(x)),
        '-(B.S)': (lambda: dqn_not_and,
                   lambda x: not (blue(x) and square(x))),
        '-BxorS': (lambda: ComposedDQN([dqn_xor], compose="not"),
                   lambda x: not ((blue(x) or square(x))
                                  and not (blue(x) and square(x)))),
        'BxorS': (lambda: dqn_xor,
                  lambda x: (blue(x) or square(x))
                  and not (blue(x) and square(x))),
    }

    selected = tasks.get(name)
    if selected is None:
        print("Invalid name")
        return None

    build_dqn, goal_condition = selected
    dqn = build_dqn()

    env = MaxLength(
        WarpFrame(CollectEnv(start_positions=start_positions,
                             goal_condition=goal_condition)),
        max_trajectory)

    G = 0
    with torch.no_grad():
        obs = env.reset()
        for _step in range(max_trajectory):
            state = torch.from_numpy(obs).type(FloatTensor).unsqueeze(0)

            # Query the network once per goal, with the goal concatenated to
            # the observation along dimension 3.
            per_goal = []
            for goal in goals:
                goal_t = torch.from_numpy(np.array(goal)).type(FloatTensor).unsqueeze(0)
                per_goal.append(dqn(torch.cat((state, goal_t), dim=3)).squeeze(0))

            # Rows are goals after the transpose: take the maximum over goals,
            # then the index of the best remaining entry as the action.
            table = torch.stack(per_goal, 1).t()
            action = table.data.max(0)[0].max(0)[1].item()

            obs, reward, done, _ = env.step(action)
            G += reward
            if done:
                break
    return G
        'player': (3, 4),
        'crate_purple': (6, 3),
        'circle_purple': (7, 7),
        'circle_beige': (1, 7),
        'crate_beige': (2, 2),
        'crate_blue': (8, 1),
        'circle_blue': (2, 8)
    }
    # Environment whose goal predicate accepts only items that are both
    # square and blue.
    env = WarpFrame(
        CollectEnv(start_positions=start_positions,
                   goal_condition=lambda x: x.shape == 'square' and x.colour ==
                   'blue'))

    # Load the two single-task networks and combine them with equal weights.
    # NOTE(review): or_compose=False presumably selects the conjunctive (AND)
    # composition -- confirm against ComposedDQN.
    dqn_blue = load('../../models/blue/model.dqn', env)
    dqn_crate = load('../../models/crate/model.dqn', env)
    dqn = ComposedDQN([dqn_blue, dqn_crate], [1, 1], or_compose=False)

    # Float array with the same shape as the board; presumably filled with one
    # value per cell further down (not visible in this excerpt).
    values = np.zeros_like(env.env.board, dtype=float)
    # The iterable is evaluated once, from the initial env, so rebinding `env`
    # inside the loop does not change which cells are visited.
    for pos in env.env.free_spaces:
        # Work on a deep copy so `start_positions` itself is never modified.
        positions = copy.deepcopy(start_positions)

        # presumably drops any item already located at `pos` -- confirm in remove()
        positions = remove(positions, pos)

        # Put the player on the probed cell and rebuild the environment there.
        positions['player'] = pos
        env = WarpFrame(
            CollectEnv(start_positions=positions,
                       goal_condition=lambda x: x.shape == 'square' and x.
                       colour == 'blue'))
        obs = env.reset()
        # Convert the observation to a FloatTensor with a new leading dimension.
        obs = torch.from_numpy(obs).type(FloatTensor).unsqueeze(0)
        v = get_value(dqn, obs)
Esempio n. 3
0
# Tasks = ["B.S", "B.-S", "S.-B", "-(B+S)", "B", "-B", "S", "-S", "B+S", "B+-S", "S+-B", "-(B.S)", "-BxorS", "BxorS"]
# Tasks_N = [1, 1, 2, 2, 2, 4, 3, 3, 4, 4, 5, 5, 3, 3]
# Task names handled by this script (a subset of the commented-out list above).
Tasks = ["B", "S", "B+S", "B.S", "BxorS"]
# One list of coordinate pairs per entry of Tasks.
# NOTE(review): presumably the board positions that satisfy each task -- confirm.
Tasks_P = [[(1,8),(8,1)], [(8,1),(1,1),(6,3)], [(1,8),(8,1),(1,1),(6,3)], [(8,1)], [(1,8),(1,1),(6,3)]]

# Goals are read from goals.h5 when the file exists; otherwise the list stays empty.
goals = []
if os.path.exists('./goals.h5'):
    goals = dd.io.load('goals.h5')

# Load the two base networks and move them to the GPU when one is available.
env = CollectEnv()
dqn_blue = load('./models/blue/model.dqn', env)
dqn_square = load('./models/crate/model.dqn', env)
if torch.cuda.is_available():
    dqn_blue.cuda()
    dqn_square.cuda()
# Boolean compositions built from the two base networks. Exclusive-or is
# expressed as (blue OR square) AND NOT (blue AND square).
dqn_not_blue = ComposedDQN([dqn_blue], compose="not")
dqn_not_square = ComposedDQN([dqn_square], compose="not")
dqn_or = ComposedDQN([dqn_blue,dqn_square], compose="or")
dqn_not_or = ComposedDQN([dqn_or], compose="not")
dqn_and = ComposedDQN([dqn_blue,dqn_square], compose="and")
dqn_not_and = ComposedDQN([dqn_and], compose="not")
dqn_xor = ComposedDQN([dqn_or,dqn_not_and], compose="and")
    
def evaluate(name='or', max_trajectory = 20):    
            
    if name == 'B.S':
        dqn = dqn_and
        goal_condition=lambda x: x.colour == 'blue' and x.shape == 'square'
    elif name == 'B.-S':
        dqn = ComposedDQN([dqn_blue,dqn_not_square], compose="and")
        goal_condition=lambda x: x.colour == 'blue' and not x.shape == 'square'
               ('beige', 'circle')}
    # Multi-item task. The first callable is true once every (colour, shape)
    # pair in `targets` appears among the collected items; the second accepts
    # an item whose (colour, shape) pair is in `targets`.
    # NOTE(review): `max_trajectory` is read here but only assigned further
    # down, so it must already be defined before this excerpt -- confirm.
    task = MaxLength(WarpFrame(
        MultiCollectEnv(lambda collected: targets.issubset({(c.colour, c.shape) for c in collected}),
                        lambda x: (x.colour, x.shape) in targets)), max_trajectory)

    #agent = train('./models/temporal3/results', task) # 1 million
    #save('./models/temporal3/model.dqn', agent)

    dqn = load('./models/temporal3/model.dqn', task)  # dqn trained on full task

    max_episodes = 50000
    max_trajectory = 50

    # Single-colour networks combined with ComposedDQN's default settings.
    dqn1 = load('./models/purple/model.dqn', task)
    dqn2 = load('./models/blue/model.dqn', task)
    dqn3 = load('./models/beige/model.dqn', task)
    dqn_composed = ComposedDQN([dqn1, dqn2, dqn3])

    # The list literal is built before the loop rebinds `dqn`, so its first
    # entry still refers to the full-task network loaded above.
    for dqn, name in [(dqn, 'full_task'),  (dqn_composed, 'composed')]:
        # Each agent gets its own Monitor output directory; force=True is passed
        # so Monitor accepts an already-populated directory (per gym's Monitor).
        env = Monitor(task, './experiment_temporal/' + name + '/', video_callable=False, force=True)
        for episode in range(max_episodes):
            # Progress indicator every 1000 episodes.
            if episode % 1000 == 0:
                print(episode)
            obs = env.reset()
            for _ in range(max_trajectory):
                # Convert the observation to a FloatTensor with a new leading dimension.
                obs = torch.from_numpy(obs).type(FloatTensor).unsqueeze(0)
                action = get_action(dqn, obs)
                obs, reward, done, _ = env.step(action)
                env.render()
                if done:
                    break
Esempio n. 5
0
# NOTE(review): one integer per task; the matching task list is not visible in
# this excerpt, so the meaning of these counts should be confirmed there.
Tasks_N = [2, 2, 3, 4, 1, 3]

# Goals are read from goals.h5 when the file exists; otherwise the list stays empty.
goals = []
if os.path.exists('./goals.h5'):
    goals = dd.io.load('goals.h5')

# Load the three base networks and move them to the GPU when one is available.
env = CollectEnv()
dqn_purple = load('./models/purple/model.dqn', env)
dqn_blue = load('./models/blue/model.dqn', env)
dqn_square = load('./models/crate/model.dqn', env)
if torch.cuda.is_available():
    dqn_purple.cuda()
    dqn_blue.cuda()
    dqn_square.cuda()

# Composition of all three base networks, passed as `dqn_max` to every "not"
# composition below.
# NOTE(review): presumably the upper bound that negation is taken against --
# confirm in ComposedDQN.
max_evf = ComposedDQN([dqn_purple, dqn_blue, dqn_square])
dqn_not_blue = ComposedDQN([dqn_blue], dqn_max=max_evf, compose="not")
dqn_not_square = ComposedDQN([dqn_square], dqn_max=max_evf, compose="not")
dqn_or_purple = ComposedDQN([dqn_purple, dqn_blue], compose="or")
dqn_or = ComposedDQN([dqn_blue, dqn_square], compose="or")
dqn_not_or = ComposedDQN([dqn_or], dqn_max=max_evf, compose="not")
dqn_and = ComposedDQN([dqn_blue, dqn_square], compose="and")
dqn_not_and = ComposedDQN([dqn_and], dqn_max=max_evf, compose="not")
# Exclusive-or expressed as (blue OR square) AND NOT (blue AND square).
dqn_xor = ComposedDQN([dqn_or, dqn_not_and], compose="and")


def evaluate(name='or', save_trajectories=True, max_trajectory=20):

    if name == 'B.S':
        dqn = dqn_and
        goal_condition = lambda x: x.colour == 'blue' and x.shape == 'square'
from trainer import load
from wrappers import WarpFrame, MaxLength

# Script entry point: runs three agents on the "blue square" task and records
# each one through a Monitor wrapper.
if __name__ == '__main__':

    max_episodes = 50000
    max_trajectory = 50

    # Task whose goal predicate accepts only items that are both blue and square.
    task = MaxLength(WarpFrame(CollectEnv(goal_condition=lambda x: x.colour == 'blue' and x.shape == 'square')),
                     max_trajectory)

    # One network loaded from the blue_crate model, plus the two single-attribute networks.
    dqn_blue_crate = load('./models/blue_crate/model.dqn', task)
    dqn_blue = load('./models/blue/model.dqn', task)
    dqn_crate = load('./models/crate/model.dqn', task)

    # The same pair of networks composed twice: once with ComposedDQN's default
    # settings and once with or_compose=False.
    dqn_composed_or = ComposedDQN([dqn_blue, dqn_crate])
    dqn_composed_and = ComposedDQN([dqn_blue, dqn_crate], or_compose=False)

    for dqn, name in [(dqn_blue_crate, 'blue_crate'), (dqn_composed_or, 'blue_or_crate'),
                      (dqn_composed_and, 'blue_and_crate')]:

        # Each agent gets its own Monitor output directory.
        env = Monitor(task, './experiment_approx_and/' + name + '/', video_callable=False, force=True)
        for episode in range(max_episodes):
            # Progress indicator every 1000 episodes.
            if episode % 1000 == 0:
                print(episode)
            obs = env.reset()
            for _ in range(max_trajectory):
                # NOTE(review): `obs` is handed to get_action as returned by the
                # environment, whereas sibling scripts first convert it with
                # torch.from_numpy(...).type(FloatTensor).unsqueeze(0) -- confirm
                # that this version of get_action performs the conversion itself.
                action = get_action(dqn, obs)
                obs, reward, done, _ = env.step(action)
                if done:
                    break
Esempio n. 7
0
def compose(dqns, weights):
    """Build and return a ComposedDQN from `dqns` and their `weights`.

    Both arguments are forwarded positionally to ComposedDQN unchanged.
    """
    composed = ComposedDQN(dqns, weights)
    return composed
Esempio n. 8
0
    CollectEnv(start_positions=start_positions,
               goal_condition=goal_condition)))

# Goals are read from goals.h5 when the file exists; otherwise the list stays empty.
goals = []
if os.path.exists('./goals.h5'):
    goals = dd.io.load('goals.h5')

# Load the three base networks (using the `env` built above this excerpt) and
# move them to the GPU when one is available.
dqn_blue = load('./models/blue/model.dqn', env)
dqn_purple = load('./models/purple/model.dqn', env)
dqn_crate = load('./models/crate/model.dqn', env)
if torch.cuda.is_available():
    dqn_blue.cuda()
    dqn_purple.cuda()
    dqn_crate.cuda()

# Boolean compositions of the blue and crate networks. Here exclusive-or is
# expressed as (blue AND NOT crate) OR (crate AND NOT blue).
dqn_or = ComposedDQN([dqn_blue, dqn_crate], compose="or")
dqn_and = ComposedDQN([dqn_blue, dqn_crate], compose="and")
dqn_not_blue = ComposedDQN([dqn_blue], compose="not")
dqn_not_crate = ComposedDQN([dqn_crate], compose="not")
dqn_blue_not_crate = ComposedDQN([dqn_blue, dqn_not_crate], compose="and")
dqn_crate_not_blue = ComposedDQN([dqn_not_blue, dqn_crate], compose="and")
dqn_xor = ComposedDQN([dqn_blue_not_crate, dqn_crate_not_blue], compose="or")

# Network and task name used by the code that follows; the two are set by
# hand and must be kept in agreement with each other.
dqn = dqn_and
name = 'and'

if name == 'blue':
    goal_condition = lambda x: x.colour == 'blue'
elif name == 'purple':
    goal_condition = lambda x: x.colour == 'purple'
elif name == 'square':
    def exp(name='or',
            save_trajectories=True,
            max_episodes=4,
            max_trajectory=7):
        """Record greedy rollouts of min(blue, crate) values and save them as a GIF.

        Every episode moves the 'crate_beige' item to the next entry of a
        fixed list of positions (updating the outer `start_positions` mapping
        in place), rebuilds the environment, and acts greedily on the
        element-wise minimum of the blue and crate value estimates.

        NOTE(review): `name` currently only determines the output file name.
        The network selected from it (`dqn`) is never used by the rollout,
        which always combines `dqn_blue` and `dqn_crate` -- confirm that this
        is intended.

        Args:
            name: Task label; used for the GIF file name.
            save_trajectories: When true, write the collected frames to
                './trajectories/<name>.gif'.
            max_episodes: Number of episodes to roll out. Positions are reused
                cyclically when this exceeds the number of listed positions.
            max_trajectory: Maximum number of steps per episode.
        """
        env = CollectEnv()
        dqn_blue = load('./models/blue/model.dqn', env)
        dqn_crate = load('./models/crate/model.dqn', env)
        if torch.cuda.is_available():
            dqn_blue.cuda()
            dqn_crate.cuda()

        dqn_not = ComposedDQN([dqn_blue], compose="not")
        dqn_or = ComposedDQN([dqn_blue, dqn_crate], compose="or")
        dqn_and = ComposedDQN([dqn_blue, dqn_crate], compose="and")
        dqn_not_and = ComposedDQN([dqn_and], compose="not")
        dqn_xor = ComposedDQN([dqn_or, dqn_not_and], compose="and")

        goals = []
        if os.path.exists('./goals.h5'):
            goals = dd.io.load('goals.h5')

        # Kept from the original experiment; see the NOTE in the docstring.
        if name == 'blue':
            dqn = dqn_blue
            goal_condition = lambda x: x.colour == 'blue'
        elif name == 'square':
            dqn = dqn_crate
            goal_condition = lambda x: x.shape == 'square'
        elif name == 'not':
            dqn = dqn_not
            goal_condition = lambda x: not x.colour == 'blue'
        elif name == 'or':
            dqn = dqn_or
            goal_condition = lambda x: x.colour == 'blue' or x.shape == 'square'
        elif name == 'and':
            dqn = dqn_and
            goal_condition = lambda x: x.colour == 'blue' and x.shape == 'square'
        elif name == 'xor':
            dqn = dqn_xor
            goal_condition = lambda x: (
                x.colour == 'blue' or x.shape == 'square') and not (
                    x.colour == 'blue' and x.shape == 'square')
        # else:
        #     print("Invalid name")
        #     return

        # env = MaxLength(WarpFrame(CollectEnv(start_positions=start_positions,goal_condition=lambda x: True)), max_trajectory)

        # The goal tensors do not change between steps, so convert them once.
        goal_tensors = [
            torch.from_numpy(np.array(goal)).type(FloatTensor).unsqueeze(0)
            for goal in goals
        ]

        def max_over_goals(model, state):
            """Return, for each action, the largest value `model` gives over all goals."""
            per_goal = [
                model(torch.cat((state, goal_t), dim=3)).squeeze(0)
                for goal_t in goal_tensors
            ]
            return torch.stack(per_goal, 1).t().data.max(0)[0]

        poss = [(3, 4), (1, 2), (5, 7), (5, 2)]
        trajectories = []
        with torch.no_grad():
            for episode in range(max_episodes):
                # Cycle through the positions so that asking for more episodes
                # than listed positions no longer raises IndexError.
                start_positions['crate_beige'] = poss[episode % len(poss)]
                env = (WarpFrame(
                    CollectEnv(start_positions=start_positions,
                               changePlayerPos=False,
                               goal_condition=lambda x: True)))
                obs = env.reset()

                trajectory = []
                for _ in range(max_trajectory):
                    trajectory.append(
                        Image.fromarray(np.uint8(env.render(mode='rgb_img'))))

                    obs = torch.from_numpy(obs).type(FloatTensor).unsqueeze(0)
                    valuesb = max_over_goals(dqn_blue, obs)
                    valuess = max_over_goals(dqn_crate, obs)
                    # Element-wise minimum of the two estimates, then the
                    # index of the best remaining entry as the action.
                    values = torch.stack((valuesb, valuess), 0).min(0)[0]
                    action = values.max(0)[1].item()
                    obs, reward, done, _ = env.step(action)
                    if done:
                        break
                # The last recorded frame of each episode is dropped.
                trajectories += trajectory[:-1]

        # Nothing to write when no frames were collected (e.g. zero episodes
        # or one-step episodes); indexing the empty list used to raise here.
        if save_trajectories and trajectories:
            trajectories[0].save('./trajectories/' + name + '.gif',
                                 save_all=True,
                                 append_images=trajectories[1:],
                                 optimize=False,
                                 duration=250,
                                 loop=0)
Esempio n. 10
0
        'player': (2, 1),
        'crate_purple': (6, 3),
        'circle_purple': (7, 7),
        'circle_beige': (1, 7),
        'crate_beige': (2, 2),
        'crate_blue': (8, 1),
        'circle_blue': (2, 8)
    }
    # Environment whose goal predicate accepts any purple or blue item.
    env = WarpFrame(
        CollectEnv(start_positions=start_positions,
                   goal_condition=lambda x: x.colour == 'purple' or x.colour ==
                   'blue'))

    # Compose the purple and blue networks with weights 1 and 1.1 respectively.
    dqn1 = load('../../models/purple/model.dqn', env)
    dqn2 = load('../../models/blue/model.dqn', env)
    dqn = ComposedDQN([dqn1, dqn2], [1, 1.1])
    obs = env.reset()
    # Player positions visited so far, starting with the position after reset.
    positions = list()
    positions.append(env.env.player.position)
    env.render()

    # Fixed budget of 100 steps; a finished episode resets the environment and
    # the loop keeps going, so `positions` can span several episodes.
    for _ in range(100):
        obs = np.array(obs)
        # Convert the observation to a FloatTensor with a new leading dimension.
        obs = torch.from_numpy(obs).type(FloatTensor).unsqueeze(0)

        action = get_action(dqn, obs)
        obs, reward, done, _ = env.step(action)
        env.render()
        positions.append(env.env.player.position)
        if done:
            obs = env.reset()
Esempio n. 11
0
    # Task whose goal predicate accepts either the beige square or the purple circle.
    task = MaxLength(WarpFrame(CollectEnv(goal_condition=lambda x: (x.colour == 'beige' and x.shape == 'square')
                                                                   or (x.colour == 'purple' and x.shape == 'circle'))),
                     max_trajectory)
    env = Monitor(task, './experiment_weighted_or/', video_callable=False, force=True)

    dqn_purple_circle = load('./models/purple_circle/model.dqn', task)  # entropy regularised functions
    dqn_beige_crate = load('./models/beige_crate/model.dqn', task)  # entropy regularised functions
    # Candidate weights from 1/3 up to (but excluding) 3.01 in steps of 0.05.
    weights = np.arange(1/3, 3.01, 0.05)

    # One result list per weight index.
    tally = {i: [] for i in range(len(weights))}

    # NOTE(review): `iter` shadows the built-in of the same name.
    for iter in range(max_iterations):
        for i, weight in enumerate(weights):
            # [episodes ending with a beige square, episodes ending with a purple circle]
            collected_count = [0, 0]
            # NOTE(review): this overwrites the weight taken from `weights`, so
            # every pass composes the networks with weight 1 and the sweep has
            # no effect -- looks like a debugging leftover; confirm and remove.
            weight = 1
            dqn_composed = ComposedDQN([dqn_beige_crate, dqn_purple_circle], [weight, 1])
            for episode in range(max_episodes):
                # Progress indicator every 1000 episodes.
                if episode % 1000 == 0:
                    print(episode)
                obs = env.reset()

                for _ in range(max_trajectory):
                    # Convert the observation to a FloatTensor with a new leading dimension.
                    obs = torch.from_numpy(obs).type(FloatTensor).unsqueeze(0)
                    # action = dqn_composed(Variable(obs, volatile=True)).data.max(1)[1].view(1, 1)[0][0]
                    action = get_action(dqn_composed, obs)
                    obs, reward, done, info = env.step(action)
                    if done:
                        # Classify the finished episode by what was collected;
                        # the beige square takes precedence when both are present.
                        collected = info['collected']
                        if len([c for c in collected if c.colour == 'beige' and c.shape == 'square']) > 0:
                            collected_count[0] += 1
                        elif len([c for c in collected if c.colour == 'purple' and c.shape == 'circle']) > 0:
Esempio n. 12
0
    'player': (3, 4),
    'crate_purple': (6, 3),
    'circle_purple': (7, 7),
    'circle_beige': (1, 7),
    'crate_beige': (2, 2),
    'crate_blue': (8, 1),
    'circle_blue': (2, 8)
}
# Environment whose goal predicate accepts any purple or blue item.
env = WarpFrame(
    CollectEnv(
        start_positions=start_positions,
        goal_condition=lambda x: x.colour == 'purple' or x.colour == 'blue'))

# Compose the blue and purple networks with equal weights.
dqn_blue = load('../../models/blue/model.dqn', env)
dqn_purple = load('../../models/purple/model.dqn', env)
dqn = ComposedDQN([dqn_blue, dqn_purple], [1, 1])

# Float array with the same shape as the board; presumably filled with one
# value per cell further down (not visible in this excerpt).
values = np.zeros_like(env.env.board, dtype=float)
# The iterable is evaluated once, from the initial env, so rebinding `env`
# inside the loop does not change which cells are visited.
for pos in env.env.free_spaces:
    # Work on a deep copy so `start_positions` itself is never modified.
    positions = copy.deepcopy(start_positions)

    # presumably drops any item already located at `pos` -- confirm in remove()
    positions = remove(positions, pos)

    # Put the player on the probed cell and rebuild the environment there.
    positions['player'] = pos
    env = WarpFrame(
        CollectEnv(start_positions=positions,
                   goal_condition=lambda x: x.colour == 'purple' or x.colour ==
                   'blue'))
    obs = env.reset()
    # Convert the observation to a FloatTensor with a new leading dimension.
    obs = torch.from_numpy(obs).type(FloatTensor).unsqueeze(0)
    v = get_value(dqn, obs)
Esempio n. 13
0
        'player': (5, 5),
        'crate_purple': (6, 3),
        'circle_purple': (7, 7),
        'circle_beige': (1, 7),
        'crate_beige': (2, 2),
        'crate_blue': (8, 1),
        'circle_blue': (2, 8)
    }
    # Environment whose goal predicate accepts only items that are both
    # square and blue.
    env = WarpFrame(
        CollectEnv(start_positions=start_positions,
                   goal_condition=lambda x: x.shape == 'square' and x.colour ==
                   'blue'))

    # Compose the crate and blue networks with equal weights.
    # NOTE(review): or_compose=False presumably selects the conjunctive (AND)
    # composition -- confirm against ComposedDQN.
    dqn1 = load('../../models/crate/model.dqn', env)
    dqn2 = load('../../models/blue/model.dqn', env)
    dqn = ComposedDQN([dqn1, dqn2], [1, 1], or_compose=False)
    obs = env.reset()
    # Player positions visited so far, starting with the position after reset.
    positions = list()
    positions.append(env.env.player.position)
    env.render()

    # Fixed budget of 100 steps; a finished episode resets the environment and
    # the loop keeps going, so `positions` can span several episodes.
    for _ in range(100):
        obs = np.array(obs)
        # Convert the observation to a FloatTensor with a new leading dimension.
        obs = torch.from_numpy(obs).type(FloatTensor).unsqueeze(0)

        action = get_action(dqn, obs)
        obs, reward, done, _ = env.step(action)
        env.render()
        positions.append(env.env.player.position)
        if done:
            obs = env.reset()
Esempio n. 14
0
    # Multi-item environment: the episode terminates once every (colour, shape)
    # pair in `targets` is among the collected items, and the reward condition
    # accepts any item whose pair is in `targets`.
    env = WarpFrame(
        MultiCollectEnv(termination_condition=lambda collected: targets.
                        issubset({(c.colour, c.shape)
                                  for c in collected}),
                        reward_condition=lambda x:
                        (x.colour, x.shape) in targets,
                        start_positions=start_positions))

    dqn1 = load('../../models/purple/model.dqn', env)
    # dqn2 = load('../../models/purple_circle/model.dqn', env)
    dqn3 = load('../../models/blue/model.dqn', env)
    dqn4 = load('../../models/beige/model.dqn', env)
    # dqn = ComposedDQN([dqn1, dqn2, dqn3, dqn4], [1,1,1,1])
    #dqn1 = load('../../models/crate/model.dqn', env)
    #dqn2 = load('../../models/blue/model.dqn', env)
    # Active configuration: purple, blue and beige networks composed with
    # ComposedDQN's default settings.
    dqn = ComposedDQN([dqn1, dqn3, dqn4])

    obs = env.reset()
    # Player positions visited so far, starting with the position after reset.
    positions = list()
    positions.append(env.env.player.position)
    env.render()

    # Fixed budget of 100 steps; a finished episode resets the environment and
    # the loop keeps going, so `positions` can span several episodes.
    for _ in range(100):
        obs = np.array(obs)
        # Convert the observation to a FloatTensor with a new leading dimension.
        obs = torch.from_numpy(obs).type(FloatTensor).unsqueeze(0)
        action = get_action(dqn, obs)
        obs, reward, done, _ = env.step(action)
        env.render()
        positions.append(env.env.player.position)
        if done:
            obs = env.reset()
Esempio n. 15
0
    # Fixed initial layout: the player plus six items, keyed '<shape>_<colour>'.
    start_positions = {'player': (3, 4),
                       'crate_purple': (6, 3),
                       'circle_purple': (7, 7),
                       'circle_beige': (1, 7),
                       'crate_beige': (2, 2),
                       'crate_blue': (8, 1),
                       'circle_blue': (2, 8)}

    # Environment whose goal predicate accepts either the beige square or the
    # purple circle.
    env = WarpFrame(CollectEnv(start_positions=start_positions,
                               goal_condition=lambda x: (x.colour == 'beige' and x.shape == 'square')
                                                        or (x.colour == 'purple' and x.shape == 'circle')))

    # Compose the two networks with weights 3 (purple circle) and 2 (beige crate).
    dqn_purple_circle = load('../../models/purple_circle/model.dqn', env)
    dqn_beige_crate = load('../../models/beige_crate/model.dqn', env)
    dqn = ComposedDQN([dqn_purple_circle, dqn_beige_crate], [3, 2])  # TODO put weights here!

    # Float array with the same shape as the board; presumably filled with one
    # value per cell further down (not visible in this excerpt).
    values = np.zeros_like(env.env.board, dtype=float)
    # The iterable is evaluated once, from the initial env, so rebinding `env`
    # inside the loop does not change which cells are visited.
    for pos in env.env.free_spaces:
        # Work on a deep copy so `start_positions` itself is never modified.
        positions = copy.deepcopy(start_positions)

        # presumably drops any item already located at `pos` -- confirm in remove()
        positions = remove(positions, pos)

        # Put the player on the probed cell and rebuild the environment there.
        positions['player'] = pos
        env = WarpFrame(CollectEnv(start_positions=positions,
                                   goal_condition=lambda x: (x.colour == 'beige' and x.shape == 'square')
                                                            or (x.colour == 'purple' and x.shape == 'circle')))

        obs = env.reset()
        # Convert the observation to a FloatTensor with a new leading dimension.
        obs = torch.from_numpy(obs).type(FloatTensor).unsqueeze(0)
        v = get_value(dqn, obs)
Esempio n. 16
0
    def exp(name='or',
            save_trajectories=True,
            max_episodes=4,
            max_trajectory=20):
        """Roll out the value function for task `name` greedily and save a GIF.

        The networks are loaded from ./models, the requested Boolean
        composition is selected, and episodes are run in a CollectEnv whose
        goal condition matches the task. The rendered frames of every finished
        episode are collected, with the last frame of each episode dropped.

        Args:
            name: One of 'blue', 'purple', 'square', 'not', 'or', 'and' or
                'xor'. Any other value prints "Invalid name" and returns
                without running anything.
            save_trajectories: When true, write the collected frames to
                './trajectories/<name>.gif'.
            max_episodes: Number of finished episodes to record. Only episodes
                that report `done` within `max_trajectory` steps are counted.
            max_trajectory: Step limit given to MaxLength and used as the
                bound of the per-episode loop.
        """
        env = CollectEnv()
        dqn_purple = load('./models/purple/model.dqn', env)
        dqn_blue = load('./models/blue/model.dqn', env)
        dqn_crate = load('./models/crate/model.dqn', env)
        if torch.cuda.is_available():
            dqn_purple.cuda()
            dqn_blue.cuda()
            dqn_crate.cuda()

        # The composition of all three base networks is passed as `dqn_max`
        # to every "not" composition.
        dqn_max = ComposedDQN([dqn_purple, dqn_blue, dqn_crate], compose="or")
        dqn_not = ComposedDQN([dqn_blue], dqn_max=dqn_max, compose="not")
        dqn_or = ComposedDQN([dqn_blue, dqn_crate], compose="or")
        dqn_and = ComposedDQN([dqn_blue, dqn_crate], compose="and")
        dqn_not_and = ComposedDQN([dqn_and], dqn_max=dqn_max, compose="not")
        dqn_xor = ComposedDQN([dqn_or, dqn_not_and], compose="and")

        goals = []
        if os.path.exists('./goals.h5'):
            goals = dd.io.load('goals.h5')

        if name == 'blue':
            dqn = dqn_blue
            goal_condition = lambda x: x.colour == 'blue'
        elif name == 'purple':
            dqn = dqn_purple
            goal_condition = lambda x: x.colour == 'purple'
        elif name == 'square':
            dqn = dqn_crate
            goal_condition = lambda x: x.shape == 'square'
        # This branch used to be a separate `if`, which started a second chain:
        # 'blue', 'purple' and 'square' then fell through to the final `else`
        # and were rejected as invalid.
        elif name == 'not':
            dqn = dqn_not
            goal_condition = lambda x: not x.colour == 'blue'
        elif name == 'or':
            dqn = dqn_or
            goal_condition = lambda x: x.colour == 'blue' or x.shape == 'square'
        elif name == 'and':
            dqn = dqn_and
            goal_condition = lambda x: x.colour == 'blue' and x.shape == 'square'
        elif name == 'xor':
            dqn = dqn_xor
            goal_condition = lambda x: (
                x.colour == 'blue' or x.shape == 'square') and not (
                    x.colour == 'blue' and x.shape == 'square')
        else:
            print("Invalid name")
            return

        env = MaxLength(WarpFrame(CollectEnv(goal_condition=goal_condition)),
                        max_trajectory)

        # The goal tensors do not change between steps, so convert them once.
        goal_tensors = [
            torch.from_numpy(np.array(goal)).type(FloatTensor).unsqueeze(0)
            for goal in goals
        ]

        trajectories = []
        with torch.no_grad():
            episode = 0
            while episode < max_episodes:
                obs = env.reset()
                trajectory = []
                for _ in range(max_trajectory):
                    trajectory.append(
                        Image.fromarray(np.uint8(env.render(mode='rgb_img'))))

                    obs = torch.from_numpy(obs).type(FloatTensor).unsqueeze(0)
                    # One network query per goal, with the goal concatenated
                    # to the observation along dimension 3.
                    values = [
                        dqn(torch.cat((obs, goal_t), dim=3)).squeeze(0)
                        for goal_t in goal_tensors
                    ]
                    values = torch.stack(values, 1).t()
                    # Maximum over goals, then the index of the best
                    # remaining entry as the action.
                    action = values.data.max(0)[0].max(0)[1].item()
                    obs, reward, done, _ = env.step(action)
                    if done:
                        episode += 1
                        trajectories += trajectory[:-1]
                        break

        # Nothing to write when no frames were collected (e.g. max_episodes
        # of zero); indexing the empty list used to raise IndexError here.
        if save_trajectories and trajectories:
            trajectories[0].save('./trajectories/' + name + '.gif',
                                 save_all=True,
                                 append_images=trajectories[1:],
                                 optimize=False,
                                 duration=250,
                                 loop=0)