Example #1
    def astar(self, world, start, goal, robots=[]):
        '''Single-agent A* query on the grid world.

        robots is a list of (row, col) positions of other robots to add to
        the world as temporary obstacles for the duration of the search.'''
        for (i, j) in robots:
            world[i, j] = 1
        try:
            # single start/goal pair; the last two arguments are the inflation
            # factor and the time limit (cf. Example #2)
            path = cpp_mstar.find_path(world, [start], [goal], 1, 5)
        except NoSolutionError:
            path = None
        # restore the map before returning
        for (i, j) in robots:
            world[i, j] = 0
        return path
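A minimal usage sketch, not part of the repository: it assumes the method above lives on an environment/worker object (called env here for illustration), that world is a NumPy grid in which 1 marks an obstacle, and that cpp_mstar.find_path takes (world, starts, goals, inflation, time_limit) as in the examples on this page.

    import numpy as np

    world = np.zeros((10, 10), dtype=int)    # empty 10x10 grid
    world[4, 2:7] = 1                        # a wall segment

    # plan a single-agent path from (0, 0) to (9, 9), treating two other
    # robots as temporary obstacles for the duration of the search
    path = env.astar(world, (0, 0), (9, 9), robots=[(1, 1), (7, 7)])
    if path is None:
        print("no collision-free path found")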
Example #2
    def expert_until_first_goal(self, inflation=2.0, time_limit=60.0):
        world = self.getObstacleMap()
        start_positions = []
        goals = []
        start_positions_dir = self.getPositions()
        goals_dir = self.getGoals()
        for i in range(1, self.world.num_agents + 1):
            start_positions.append(start_positions_dir[i])
            goals.append(goals_dir[i])
        mstar_path = None
        start_time = time.time()
        try:
            # give the C++ M* implementation a fraction of the overall time budget
            mstar_path = cpp_mstar.find_path(world, start_positions, goals,
                                             inflation, time_limit / 5.0)

        except OutOfTimeError:
            # M* timed out
            print("timeout")
            print('World', world)
            print('Start Pos', start_positions)
            print('Goals', goals)
        except NoSolutionError:
            print("nosol????")
            print('World', world)
            print('Start Pos', start_positions)
            print('Goals', goals)

        except:
            # any other failure, typically a crash inside the C++ extension
            c_time = time.time() - start_time
            if c_time > time_limit:
                return mstar_path

            # cpp_mstar most likely crashed; retry with the pure-Python M* implementation
            try:
                mstar_path = od_mstar.find_path(world, start_positions, goals,
                                                inflation=inflation, time_limit=time_limit)
            except OutOfTimeError:
                # M* timed out
                print("timeout")
                print('World', world)
                print('Start Pos', start_positions)
                print('Goals', goals)
            except NoSolutionError:
                print("nosol????")
                print('World', world)
                print('Start Pos', start_positions)
                print('Goals', goals)
            except:
                print("Unknown bug?!")

        return mstar_path
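A minimal usage sketch, not part of the repository: it assumes env is the MAPF environment instance these methods belong to (the name is illustrative) and that a successful call returns a joint M* plan with one tuple of agent positions per timestep.

    plan = env.expert_until_first_goal(inflation=2.0, time_limit=60.0)
    if plan is None:
        print("expert planner timed out or found no solution")
    else:
        print("M* plan: {} timesteps for {} agents".format(len(plan), len(plan[0])))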
Example #3
    def work(self, max_episode_length, gamma, sess, coord, saver):
        global episode_count, swarm_reward, episode_rewards, episode_lengths, episode_mean_values, episode_invalid_ops, episode_wrong_blocking  #, episode_invalid_goals
        total_steps, i_buf = 0, 0
        episode_buffers = [[] for _ in range(NUM_BUFFERS)]
        s1Values = [[] for _ in range(NUM_BUFFERS)]

        with sess.as_default(), sess.graph.as_default():
            while self.shouldRun(coord, episode_count):
                sess.run(self.pull_global)

                episode_buffer, episode_values = [], []
                episode_reward = episode_step_count = episode_inv_count = 0
                d = False

                # Initial state from the environment
                if self.agentID == 1:
                    self.env._reset(self.agentID)
                self.synchronize()  # synchronize starting time of the threads
                validActions = self.env._listNextValidActions(self.agentID)
                s = self.env._observe(self.agentID)
                blocking = False
                p = self.env.world.getPos(self.agentID)
                on_goal = self.env.world.goals[p[0], p[1]] == self.agentID
                rnn_state = self.local_AC.state_init
                rnn_state0 = rnn_state
                RewardNb = 0
                wrong_blocking = 0
                wrong_on_goal = 0

                if self.agentID == 1:
                    global demon_probs
                    demon_probs[self.metaAgentID] = np.random.rand()
                self.synchronize()  # synchronize starting time of the threads

                # reset swarm_reward (for tensorboard)
                swarm_reward[self.metaAgentID] = 0
                if episode_count < PRIMING_LENGTH or demon_probs[
                        self.metaAgentID] < DEMONSTRATION_PROB:
                    #for the first PRIMING_LENGTH episodes, or with a certain probability
                    #don't train on the episode and instead observe a demonstration from M*
                    if self.workerID == 1 and episode_count % 100 == 0:
                        saver.save(
                            sess, model_path + '/model-' +
                            str(int(episode_count)) + '.cptk')
                    global rollouts
                    rollouts[self.metaAgentID] = None
                    if self.agentID == 1:
                        world = self.env.getObstacleMap()
                        start_positions = tuple(self.env.getPositions())
                        goals = tuple(self.env.getGoals())
                        try:
                            mstar_path = cpp_mstar.find_path(
                                world, start_positions, goals, 2, 5)
                            rollouts[self.metaAgentID] = self.parse_path(
                                mstar_path)
                        except OutOfTimeError:
                            #M* timed out
                            print("timeout", episode_count)
                        except NoSolutionError:
                            print("nosol????", episode_count, start_positions)
                    self.synchronize()
                    if rollouts[self.metaAgentID] is not None:
                        i_l = self.train(
                            rollouts[self.metaAgentID][self.agentID - 1],
                            sess,
                            gamma,
                            None,
                            rnn_state0,
                            imitation=True)
                        episode_count += 1. / num_workers
                        if self.agentID == 1:
                            summary = tf.Summary()
                            summary.value.add(tag='Losses/Imitation loss',
                                              simple_value=i_l)
                            global_summary.add_summary(summary,
                                                       int(episode_count))
                            global_summary.flush()
                        continue
                    continue
                saveGIF = False
                if OUTPUT_GIFS and self.workerID == 1 and (
                    (not TRAINING) or (episode_count >= self.nextGIF)):
                    saveGIF = True
                    self.nextGIF = episode_count + 64
                    GIF_episode = int(episode_count)
                    episode_frames = [
                        self.env._render(mode='rgb_array',
                                         screen_height=900,
                                         screen_width=900)
                    ]

                while not self.env.finished:  # run the episode until the environment reports it is finished
                    #Take an action using probabilities from policy network output.
                    a_dist, v, rnn_state, pred_blocking, pred_on_goal = sess.run(
                        [
                            self.local_AC.policy, self.local_AC.value,
                            self.local_AC.state_out, self.local_AC.blocking,
                            self.local_AC.on_goal
                        ],
                        feed_dict={
                            self.local_AC.inputs: [s[0]],
                            self.local_AC.goal_pos: [s[1]],
                            self.local_AC.state_in[0]: rnn_state[0],
                            self.local_AC.state_in[1]: rnn_state[1]
                        })

                    # count how often the policy's greedy (argmax) action would have been invalid
                    if np.argmax(a_dist.flatten()) not in validActions:
                        episode_inv_count += 1
                    train_valid = np.zeros(a_size)
                    train_valid[validActions] = 1

                    valid_dist = np.array([a_dist[0, validActions]])
                    valid_dist /= np.sum(valid_dist)

                    if TRAINING:
                        if (pred_blocking.flatten()[0] < 0.5) == blocking:
                            wrong_blocking += 1
                        if (pred_on_goal.flatten()[0] < 0.5) == on_goal:
                            wrong_on_goal += 1
                        a = validActions[np.random.choice(
                            range(valid_dist.shape[1]), p=valid_dist.ravel())]
                        train_val = 1.
                    else:
                        a = np.argmax(a_dist.flatten())
                        if a not in validActions or not GREEDY:
                            a = validActions[np.random.choice(
                                range(valid_dist.shape[1]),
                                p=valid_dist.ravel())]
                        train_val = 1.

                    _, r, _, _, on_goal, blocking, _ = self.env._step(
                        (self.agentID, a), episode=episode_count)

                    self.synchronize()  # synchronize threads

                    # Get common observation for all agents after all individual actions have been performed
                    s1 = self.env._observe(self.agentID)
                    validActions = self.env._listNextValidActions(
                        self.agentID, a, episode=episode_count)
                    d = self.env.finished

                    if saveGIF:
                        episode_frames.append(
                            self.env._render(mode='rgb_array',
                                             screen_width=900,
                                             screen_height=900))

                    episode_buffer.append([
                        s[0], a, r, s1, d, v[0, 0], train_valid, pred_on_goal,
                        int(on_goal), pred_blocking,
                        int(blocking), s[1], train_val
                    ])
                    episode_values.append(v[0, 0])
                    episode_reward += r
                    s = s1
                    total_steps += 1
                    episode_step_count += 1

                    if r > 0:
                        RewardNb += 1
                    if d:
                        print('\n{} Goodbye World. We did it!'.format(
                            episode_step_count),
                              end='\n')

                    # If the episode hasn't ended, but the experience buffer is full, then we
                    # make an update step using that experience rollout.
                    if TRAINING and (len(episode_buffer) %
                                     EXPERIENCE_BUFFER_SIZE == 0 or d):
                        # Since we don't know the true final return, we "bootstrap" from our
                        # current value estimate (see the sketch after this example).
                        if len(episode_buffer) >= EXPERIENCE_BUFFER_SIZE:
                            episode_buffers[i_buf] = episode_buffer[
                                -EXPERIENCE_BUFFER_SIZE:]
                        else:
                            episode_buffers[i_buf] = episode_buffer[:]

                        if d:
                            s1Values[i_buf] = 0
                        else:
                            s1Values[i_buf] = sess.run(
                                self.local_AC.value,
                                feed_dict={
                                    self.local_AC.inputs: np.array([s[0]]),
                                    self.local_AC.goal_pos: [s[1]],
                                    self.local_AC.state_in[0]: rnn_state[0],
                                    self.local_AC.state_in[1]: rnn_state[1]
                                })[0, 0]

                        if (episode_count - EPISODE_START) < NUM_BUFFERS:
                            i_rand = np.random.randint(i_buf + 1)
                        else:
                            i_rand = np.random.randint(NUM_BUFFERS)
                            tmp = np.array(episode_buffers[i_rand])
                            while tmp.shape[0] == 0:
                                i_rand = np.random.randint(NUM_BUFFERS)
                                tmp = np.array(episode_buffers[i_rand])
                        v_l, p_l, valid_l, e_l, b_l, og_l, g_n, v_n = self.train(
                            episode_buffers[i_rand], sess, gamma,
                            s1Values[i_rand], rnn_state0)

                        i_buf = (i_buf + 1) % NUM_BUFFERS
                        rnn_state0 = rnn_state
                        episode_buffers[i_buf] = []

                    self.synchronize()  # synchronize threads
                    # sess.run(self.pull_global)
                    if episode_step_count >= max_episode_length or d:
                        break

                episode_lengths[self.metaAgentID].append(episode_step_count)
                episode_mean_values[self.metaAgentID].append(
                    np.nanmean(episode_values))
                episode_invalid_ops[self.metaAgentID].append(episode_inv_count)
                episode_wrong_blocking[self.metaAgentID].append(wrong_blocking)

                # Periodically save gifs of episodes, model parameters, and summary statistics.
                if episode_count % EXPERIENCE_BUFFER_SIZE == 0 and printQ:
                    print(
                        '                                                                                   ',
                        end='\r')
                    print('{} Episode terminated ({},{})'.format(
                        episode_count, self.agentID, RewardNb),
                          end='\r')

                swarm_reward[self.metaAgentID] += episode_reward

                self.synchronize()  # synchronize threads

                episode_rewards[self.metaAgentID].append(
                    swarm_reward[self.metaAgentID])

                if not TRAINING:
                    mutex.acquire()
                    if episode_count < NUM_EXPS:
                        plan_durations[episode_count] = episode_step_count
                    if self.workerID == 1:
                        episode_count += 1
                        print(
                            '({}) Thread {}: {} steps, {:.2f} reward ({} invalids).'
                            .format(episode_count, self.workerID,
                                    episode_step_count, episode_reward,
                                    episode_inv_count))
                    GIF_episode = int(episode_count)
                    mutex.release()
                else:
                    episode_count += 1. / num_workers

                    if episode_count % SUMMARY_WINDOW == 0:
                        if episode_count % 100 == 0:
                            print('Saving Model', end='\n')
                            saver.save(
                                sess, model_path + '/model-' +
                                str(int(episode_count)) + '.cptk')
                            print('Saved Model', end='\n')
                        SL = SUMMARY_WINDOW * num_workers
                        mean_reward = np.nanmean(
                            episode_rewards[self.metaAgentID][-SL:])
                        mean_length = np.nanmean(
                            episode_lengths[self.metaAgentID][-SL:])
                        mean_value = np.nanmean(
                            episode_mean_values[self.metaAgentID][-SL:])
                        mean_invalid = np.nanmean(
                            episode_invalid_ops[self.metaAgentID][-SL:])
                        mean_wrong_blocking = np.nanmean(
                            episode_wrong_blocking[self.metaAgentID][-SL:])
                        current_learning_rate = sess.run(
                            lr, feed_dict={global_step: episode_count})

                        summary = tf.Summary()
                        summary.value.add(tag='Perf/Learning Rate',
                                          simple_value=current_learning_rate)
                        summary.value.add(tag='Perf/Reward',
                                          simple_value=mean_reward)
                        summary.value.add(tag='Perf/Length',
                                          simple_value=mean_length)
                        summary.value.add(
                            tag='Perf/Valid Rate',
                            simple_value=(mean_length - mean_invalid) /
                            mean_length)
                        summary.value.add(
                            tag='Perf/Blocking Prediction Accuracy',
                            simple_value=(mean_length - mean_wrong_blocking) /
                            mean_length)

                        summary.value.add(tag='Losses/Value Loss',
                                          simple_value=v_l)
                        summary.value.add(tag='Losses/Policy Loss',
                                          simple_value=p_l)
                        summary.value.add(tag='Losses/Blocking Loss',
                                          simple_value=b_l)
                        summary.value.add(tag='Losses/On Goal Loss',
                                          simple_value=og_l)
                        summary.value.add(tag='Losses/Valid Loss',
                                          simple_value=valid_l)
                        summary.value.add(tag='Losses/Grad Norm',
                                          simple_value=g_n)
                        summary.value.add(tag='Losses/Var Norm',
                                          simple_value=v_n)
                        global_summary.add_summary(summary, int(episode_count))

                        global_summary.flush()

                        if printQ:
                            print('{} Tensorboard updated ({})'.format(
                                episode_count, self.workerID),
                                  end='\r')

                if saveGIF:
                    # Dump episode frames for external gif generation (otherwise, makes the jupyter kernel crash)
                    time_per_step = 0.1
                    images = np.array(episode_frames)
                    if TRAINING:
                        make_gif(
                            images, '{}/episode_{:d}_{:d}_{:.1f}.gif'.format(
                                gifs_path, GIF_episode, episode_step_count,
                                swarm_reward[self.metaAgentID]))
                    else:
                        make_gif(images,
                                 '{}/episode_{:d}_{:d}.gif'.format(
                                     gifs_path, GIF_episode,
                                     episode_step_count),
                                 duration=len(images) * time_per_step,
                                 true_image=True,
                                 salience=False)
                if SAVE_EPISODE_BUFFER:
                    with open('gifs3D/episode_{}.dat'.format(GIF_episode),
                              'wb') as file:
                        pickle.dump(episode_buffer, file)
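The bootstrapped update above can be summarized with a short, self-contained sketch. It is not taken from the repository; the helper name discounted_returns and its inputs are illustrative. When a rollout is cut off before the episode ends, the missing tail of the return is approximated by the critic's value estimate of the next state (the s1Values[i_buf] computed above), and 0 is used instead when the episode actually terminated.

    import numpy as np

    def discounted_returns(rewards, gamma, bootstrap_value):
        """Discounted returns for a (possibly truncated) rollout.

        rewards         -- per-step rewards from the experience buffer
        gamma           -- discount factor
        bootstrap_value -- value estimate of the state after the last step,
                           or 0 if the episode terminated there
        """
        returns = np.zeros(len(rewards))
        running = bootstrap_value
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            returns[t] = running
        return returns

    # example: three rewards, gamma = 0.95, bootstrapping from a value estimate of 2.0
    print(discounted_returns([0.0, -0.1, 1.0], 0.95, 2.0))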