    def __init__(self,
                 config,
                 global_network,
                 thread_index,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):
        self.thread_index = thread_index
        self.config = config
        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]
        self.local_network = global_network
        self.env = Environment({
            'scene_name': self.scene_scope,
            'terminal_state_id': int(self.task_scope)
        })
        self.env.reset()
        self.expert = Expert(self.env)
        self.local_t = 0
        self.episode_length = 0
        self.first_iteration = True  # first iteration of DAgger
        # training dataset
        self.states = []
        self.actions = []
        self.targets = []
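    # --- Hedged sketch, not part of the original source: one plausible DAgger data-collection
    # --- step for the worker above. `Expert.get_next_action()` and `rollout_policy` are assumed
    # --- interfaces used purely for illustration; the real Expert API may differ.
    def _collect_expert_labels(self, rollout_policy, max_steps=100):
        """Roll out a policy and label every visited state with the expert's action."""
        self.env.reset()
        for _ in range(max_steps):
            expert_action = self.expert.get_next_action()  # assumed Expert API
            self.states.append(self.env.s_t)
            self.targets.append(self.env.target)
            self.actions.append(expert_action)
            # the first DAgger iteration follows the expert, later ones follow the learner
            action = expert_action if self.first_iteration else rollout_policy(self.env)
            self.env.step(action)
            self.env.update()
            if self.env.terminal:
                break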
Example 2
    def _evaluate(self, sess, list_of_tasks, num_episodes, max_steps,
                  success_cutoff):

        scene_scopes = list_of_tasks.keys()
        results = {}

        for scene_scope in scene_scopes:

            for task_scope in list_of_tasks[scene_scope]:

                env = Environment({
                    'scene_name': scene_scope,
                    'terminal_state_id': int(task_scope)
                })
                ep_lengths = []
                ep_collisions = []
                oracle_lengths = []
                ep_successes = []

                scopes = [self.network_scope, scene_scope, task_scope]

                for i_episode in range(num_episodes):

                    env.reset()
                    oracle_lengths.append(env.shortest_path_distances[
                        env.current_state_id][env.terminal_state_id])

                    terminal = False
                    ep_length = 0
                    ep_collision = 0

                    while not terminal:

                        pi_values = self.local_network.run_policy(
                            sess, env.s_t, env.target, scopes)
                        action = sample_action(pi_values)
                        env.step(action)
                        env.update()

                        terminal = env.terminal
                        if ep_length == max_steps: break
                        if env.collided: ep_collision += 1
                        ep_length += 1

                    ep_lengths.append(ep_length)
                    ep_collisions.append(ep_collision)
                    ep_successes.append(int(ep_length < success_cutoff))

                results[scene_scope + task_scope] = [
                    np.mean(ep_lengths),
                    np.mean(ep_collisions),
                    np.mean(oracle_lengths),
                    np.mean(ep_successes)
                ]

        return results
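# --- Hedged sketch, not part of the original source: `sample_action` used by `_evaluate`
# --- above is not defined in this snippet; a minimal version that samples an action index
# --- from the policy's probability vector could look like this.
import numpy as np

def sample_action(pi_values):
    """Draw an action index with probability proportional to pi_values."""
    pi_values = np.asarray(pi_values, dtype=np.float64)
    pi_values /= pi_values.sum()  # guard against small normalisation drift
    return np.random.choice(len(pi_values), p=pi_values)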
Example 3
def main(args):
    scene_scope = 'bathroom_02'
    task_scope = 37  #26 43 53 32 41
    env = Environment({'scene_name': scene_scope, 'terminal_state_id': int(task_scope)})
    env.reset()
    Policy = Policy_net('policy', env)  # build the actor-critic graph / object

    Old_Policy = Policy_net('old_policy', env)  # old-policy copy required by PPOTrain (cf. Example 12)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)  # gradient-update object / graph
    pdb.set_trace()
    D = Discriminator(env)  # discriminator for the GAN-style imitation objective
Example 4
    f = open("/home/hmi/Desktop/visual-navigation-checkpoint/path.py", "w")
    #fo = open("/home/hmi/Desktop/visual-navigation-checkpoint/Dang_result.py", "a")

    counter = 0
    for scene_scope in scene_scopes:

        scene_stats[scene_scope] = []
        f.write('PATH = {')
        #fo.write(scene_scope +'\n')
        for task_scope in list_of_tasks[scene_scope]:
            #fo.write('%s \n', %task_scope)

            env = Environment({
                'scene_name': scene_scope,
                'terminal_state_id': task_scope[0],
                'checkpoint_state_id': task_scope[1]
            })
            ep_rewards = []
            ep_lengths = []
            ep_collisions = []

            scopes = [network_scope, scene_scope, task_scope]

            print('evaluation: %s %s' % (scene_scope, task_scope))

            viewer = SimpleImageViewer()
            #NUM_EVAL_EPISODES
            for i_episode in range(2):

                env.reset()
Example 5
    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):

        if self.env is None:
            # lazy evaluation
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.training_scene,
                'terminal_state_id': self.task_scope,
                'checkpoint_state_id': self.checkpoint_scope
            })
            self.env.reset()

        states = []
        actions = []
        rewards = []
        values = []
        targets = []
        checkpoints = []
        positions = []
        auxilaries = []
        auxilaries_cl = []
        aclists = []
        colists = []
        isCheckpointed = []
        collision = []
        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.s_position, self.env.checkpoint,
                self.env.s_a_t, self.env.s_c_t, self.env.isCheckpoint,
                self.env.s_aux_cl, self.scopes)
            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)
            checkpoints.append(self.env.checkpoint)
            positions.append(self.env.s_position)
            aclists.append(self.env.s_a_t)
            colists.append(self.env.s_c_t)
            collision.append(self.env.s_aux_cl)
            isCheckpointed.append(int(self.env.isCheckpoint))
            if VERBOSE and (self.thread_index
                            == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward
            terminal = self.env.terminal

            #if self.env.isCheckpoint:
            #  sys.stdout.write("CHECKPOINT \n")
            if self.episode_length > 5e3: terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()
            auxilaries.append(self.env.s_aux)
            auxilaries_cl.append(self.env.s_aux_cl)

            if terminal:
                terminal_end = True
                sys.stdout.write(
                    "time %d | thread #%d | scene %s | target #%s\n%s %s episode reward = %.3f\n%s %s episode length = %d\n%s %s episode max Q  = %.3f\n"
                    % (global_t, self.thread_index, self.scene_scope,
                       self.task_scope, self.scene_scope, self.task_scope,
                       self.episode_reward, self.scene_scope, self.task_scope,
                       self.episode_length, self.scene_scope, self.task_scope,
                       self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                self.env.reset()

                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.s_position,
                                             self.env.checkpoint,
                                             self.env.s_a_t, self.env.s_c_t,
                                             self.env.isCheckpoint,
                                             self.env.s_aux_cl, self.scopes)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()
        positions.reverse()
        auxilaries.reverse()
        auxilaries_cl.reverse()
        aclists.reverse()
        colists.reverse()
        isCheckpointed.reverse()
        collision.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []
        batch_t = []
        batch_c = []
        batch_p = []
        batch_aux = []
        batch_aux_cl = []
        batch_al = []
        batch_cl = []
        batch_ic = []
        batch_collision = []
        # compute and accumulate gradients
        for (ai, ri, si, Vi, ti, ci, pi, auxi, aux_cl_i, ali, cli, ici,
             coli) in zip(actions, rewards, states, values, targets,
                          checkpoints, positions, auxilaries, auxilaries_cl,
                          aclists, colists, isCheckpointed, collision):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1
            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_t.append(ti)
            batch_c.append(ci)
            batch_p.append(pi)
            batch_aux.append(auxi)
            batch_aux_cl.append(aux_cl_i)
            batch_al.append(ali)
            batch_cl.append(cli)
            batch_ic.append(ici)
            batch_collision.append(coli)

        sess.run(self.accum_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.t: batch_p,
                     self.local_network.c: batch_c,
                     self.local_network.td: batch_td,
                     self.local_network.aux: batch_aux,
                     self.local_network.aux_cl: batch_aux_cl,
                     self.local_network.al: batch_al,
                     self.local_network.cl: batch_cl,
                     self.local_network.ic: batch_ic,
                     self.local_network.col: batch_collision,
                     self.local_network.r: batch_R
                 })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t, terminal
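# --- Hedged sketch, not part of the original source: the reversed loop above implements a
# --- standard n-step discounted return with a bootstrapped tail value; the same computation
# --- as a standalone helper (the original code feeds the batch in reversed order instead).
import numpy as np

def n_step_returns(rewards, values, bootstrap_value, gamma=0.99):
    """Return (returns, advantages) for one rollout, bootstrapping from the last state."""
    R = bootstrap_value  # 0.0 if the episode terminated, V(s_T) otherwise
    returns, advantages = [], []
    for r, v in zip(reversed(rewards), reversed(values)):
        R = r + gamma * R
        returns.append(R)
        advantages.append(R - v)  # TD target minus the value estimate
    returns.reverse()
    advantages.reverse()
    return np.array(returns), np.array(advantages)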
Example 6
    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):

        if self.env is None:
            # lazy evaluation
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })

            self.env.reset()  #resetting the environment for each thread

        self.env_Oracle = Environment({
            # on every iteration the expert starts from the agent's current state
            'scene_name': self.scene_scope,
            'terminal_state_id': int(self.task_scope),
            'initial_state': self.env.current_state_id
        })

        self.env_Oracle.reset()

        states = []  # buffers for states, actions, targets, etc.
        actions = []
        rewards = []
        values = []
        targets = []
        dones = []

        states_oracle = []
        actions_oracle = []
        targets_oracle = []

        terminal_end = False  # no terminal state reached yet

        sess.run(self.reset_gradients)  # reset the accumulated gradients at the start of each iteration
        sess.run(self.sync)  # copy weights from shared to local

        # discriminator sync
        ##########################
        sess.run(self.sync_discriminator_G_l)  # copy the discriminator weights from the shared network to the local one
        sess.run(self.reset_gradients_d)  # reset the discriminator's accumulated gradient slots
        ########################

        start_local_t = self.local_t
        self.oracle = ShortestPathOracle(self.env_Oracle, ACTION_SIZE)

        #########################################################################################
        # Sample the expert trajectories
        for i in range(100):
            # run the expert trajectory to completion (capped at 100 steps)
            oracle_pi = self.oracle.run_policy(
                self.env_Oracle.current_state_id
            )  # oracle policy: shortest-path action distribution for the current state
            oracle_action = self.choose_action(oracle_pi)

            states_oracle.append(self.env_Oracle.s_t)
            actions_oracle.append(oracle_action)
            targets_oracle.append(self.env_Oracle.target)

            self.env_Oracle.step(oracle_action)

            terminal_o = self.env_Oracle.terminal

            self.env_Oracle.update()

            if terminal_o:
                break

        ##############################################################################################

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            # each thread runs at most LOCAL_T_MAX (5) steps before a gradient update

            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.target, self.scopes)

            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)

            if VERBOSE and (self.thread_index
                            == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward  # reward from the environment
            terminal = self.env.terminal  # whether the agent reached a terminal state

            # ad-hoc reward for navigation
            reward = 10.0 if terminal else -0.01  # shaped reward: 10 on reaching the target, -0.01 per step (collisions are not penalised)
            if self.episode_length > 5e3:
                terminal = True  # cap episodes at 5000 steps by forcing a terminal
            # forcing terminal here does not award the 10.0 reward, since the reward was already set above

            self.episode_reward += reward
            self.episode_length += 1
            # track the maximum value estimate seen in the episode (starts at -inf)
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip the reward to [-1, 1] (so the terminal reward of 10 becomes 1)
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()

            if terminal:  # on a terminal state, log the episode and end the rollout
                score = self.local_discriminator.run_critic(
                    sess, states, targets, actions, self.scopes_d)
                sys.stdout.write("Critic_Score = {0}".format(score))
                terminal_end = True
                sys.stdout.write(
                    "time %d | thread #%d | scene %s | target #%s\n%s %s episode reward = %.3f\n%s %s episode length = %d\n%s %s episode max Q  = %.3f\n"
                    % (global_t, self.thread_index, self.scene_scope,
                       self.task_scope, self.scene_scope, self.task_scope,
                       self.episode_reward, self.scene_scope, self.task_scope,
                       self.episode_length, self.scene_scope, self.task_scope,
                       self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                self.episode_reward = 0  # reset the episode statistics after termination
                self.episode_length = 0  # the agent restarts from a new position
                self.episode_max_q = -np.inf
                self.env.reset()

                break

        R = 0.0  # no bootstrap return if the episode terminated

        if not terminal_end:  # otherwise bootstrap with the value of the current state
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.target, self.scopes)

        #Agent's Samples
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # Expert's samples
        states_oracle.reverse()
        actions_oracle.reverse()
        targets_oracle.reverse()

        #Agent's batch
        batch_si = []
        batch_a = []
        batch_actions = []
        batch_td = []
        batch_R = []
        batch_t = []

        #Expert's Batch
        batch_si_ex = []
        batch_a_ex = []
        batch_t_ex = []

        batch_si_d = []
        batch_t_d = []
        batch_actions_d = []

        # stack the expert's samples into batches
        for (s_e, a_e, t_e) in zip(states_oracle, actions_oracle,
                                   targets_oracle):
            batch_si_ex.append(s_e)
            batch_a_ex.append(a_e)
            batch_t_ex.append(t_e)

        for (ai, si, ti) in zip(actions, states, targets):

            batch_actions_d.append(ai)
            batch_si_d.append(si)
            batch_t_d.append(ti)

        cur_learning_rate = self._anneal_learning_rate(global_t)

        for i in range(10):

            #sess.run(self.reset_gradients_d)

            sess.run(
                self.accum_gradients_d,  # order does not matter: the update only needs (state, target, action) tuples
                feed_dict={
                    self.local_discriminator.s_e: batch_si_ex,
                    self.local_discriminator.Actions_e: batch_a_ex,
                    self.local_discriminator.s_a: batch_si_d,
                    self.local_discriminator.Actions_a: batch_actions_d,
                    self.local_discriminator.t_e: batch_t_ex,
                    self.local_discriminator.t_a: batch_t_d
                })

            sess.run(
                self.apply_gradients_discriminator,  # gradients are applied directly to the global discriminator
                feed_dict={self.learning_rate_input: 0.00005})

            loss = sess.run(self.local_discriminator.total_loss_d,
                            feed_dict={
                                self.local_discriminator.s_e: batch_si_ex,
                                self.local_discriminator.Actions_e: batch_a_ex,
                                self.local_discriminator.s_a: batch_si_d,
                                self.local_discriminator.Actions_a:
                                batch_actions_d,
                                self.local_discriminator.t_e: batch_t_ex,
                                self.local_discriminator.t_a: batch_t_d
                            })

            sess.run(self.clip_local_d_weights)  # clip the discriminator weights after every update (WGAN-style)

        critic_r = self.local_discriminator.run_critic(sess, batch_si_d,
                                                       batch_t_d,
                                                       batch_actions_d,
                                                       self.scopes_d)
        critic_r = critic_r * 0.1

        rewards = rewards + critic_r  # add the scaled critic score to the environment rewards

        # compute the discounted returns and advantages, and stack them into the agent's batches
        for (ai, ri, si, Vi, ti) in zip(actions, rewards, states, values,
                                        targets):
            R = ri + GAMMA * R  # accumulate the discounted return
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1  # one-hot encode the action
            batch_actions.append(ai)
            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_t.append(ti)

        # sync the new parameters to the old network for the PPO-style update
        sess.run(self.old_new_sync)
        for i in range(4):
            #sess.run(self.reset_gradients) #reset the gradients
            sess.run(
                self.accum_gradients,  # order does not matter: the update only needs (state, action, advantage, return) tuples
                feed_dict={
                    self.local_network.s: batch_si,
                    self.local_network.a: batch_a,
                    self.local_network.t: batch_t,
                    self.local_network.td: batch_td,
                    self.local_network.r: batch_R,
                })

            sess.run(
                self.apply_gradients_local,  # apply the gradients to the local network
                feed_dict={self.learning_rate_input: cur_learning_rate})

        # in principle a single accumulate-gradients op would suffice here
        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        sess.run(self.sync_discriminator_l_G)  # sync the discriminator parameters from the local network to the global one

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
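# --- Hedged sketch, not part of the original source: `clip_local_d_weights` above suggests
# --- WGAN-style weight clipping of the discriminator; one possible way to build such clip
# --- ops in TF1 (the variable-scope name is an assumption for illustration).
import tensorflow as tf

def build_weight_clip_ops(scope='local_discriminator', clip_value=0.01):
    """Return a grouped op that clips every trainable variable in `scope` to [-c, c]."""
    d_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
    clip_ops = [tf.assign(v, tf.clip_by_value(v, -clip_value, clip_value)) for v in d_vars]
    return tf.group(*clip_ops)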
Example 7
        # check collision
        if env.collided:
            print('Collision occurs.')
            env.collided = False

        # check quit command
        if stop_requested: break

        viewer.imshow(env.observation)


if __name__ == '__main__':

    env = Environment({
        'scene_name': 'kitchen_02',
        'terminal_state_id': 'chair1',
        'checkpoint_state_id': 'k_c1'
    })
    # manually disable terminal states
    #env.terminals = np.zeros_like(env.terminals)
    #env.terminal_states, = np.where(env.terminals)
    env.reset()
    #print(env.s_position)

    human_agent_action = None
    human_wants_restart = False
    stop_requested = False

    viewer = SimpleImageViewer()
    viewer.imshow(env.observation)
    viewer.window.on_key_press = key_press
Example 8
            '/data/xinchao5')[1]
        saver.restore(sess, checkpoint_path)
        print("checkpoint loaded: {}".format(checkpoint_path))
    else:
        print("Could not find old checkpoint")

    scene_stats = dict()
    action_list = []
    for scene_scope in scene_scopes:

        scene_stats[scene_scope] = []
        for task_scope in list_of_tasks[scene_scope]:

            env = Environment({
                'scene_name': 'bedroom_05',
                'terminal_state_id': int(task_scope),
                #'initial_state': EVAL_INIT_LOC,
            })
            real_target_xz.append(
                [xz_numpy[int(task_scope)][0], xz_numpy[int(task_scope)][1]])
            ep_rewards = []
            ep_lengths = []
            ep_collisions = []

            scopes = [network_scope, scene_scope, task_scope]
            #time.sleep(5)
            if 1:
                time.sleep(1)
                cv2.imshow('target image', env.observation_target)
                cv2.waitKey(0)
            viewer = SimpleImageViewer()
Example 9
    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):

        if self.env is None:
            # lazy evaluation
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })

        states = []
        actions = []
        rewards = []
        values = []
        targets = []

        rnn_inits = []
        state_representation = []
        usf = []
        reward_vector = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        #At each  episode start we set the initial state of the RNN to zero
        start_local_t = self.local_t

        start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(LOCAL_T_MAX):

            pi_, value_, usf_s_g = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.target, self.scopes)

            imidia_s = self.local_network.run_state(sess, self.env.s_t,
                                                    self.scopes)

            #usf_s_g = self.local_network.run_usf(sess, self.env.s_t, self.env.target,self.rnn_state_init[0] ,self.rnn_state_init[1] ,self.scopes)

            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)
            usf.append(usf_s_g)

            state_representation.append(imidia_s)

            if VERBOSE and (self.thread_index
                            == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward
            terminal = self.env.terminal

            # ad-hoc reward for navigation
            reward = 10.0 if terminal else -0.01
            if self.episode_length > 5e3: terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()

            if i == (LOCAL_T_MAX - 1) or terminal:

                imidiate_state_representation_next = []
                usf_next = []
                #reward_vector_predictor_next=[]

                last_state = self.env.s_t
                imidia_s_next = self.local_network.run_state(
                    sess, self.env.s_t, self.scopes)
                state_representation_next = state_representation[1:] + [
                    imidia_s_next
                ]

                if terminal:
                    usf_next_imi = 0
                else:
                    usf_next_imi = self.local_network.run_usf(
                        sess, self.env.s_t, self.env.target, self.scopes)

                usf_next = usf[1:] + [usf_next_imi]

            if terminal:
                terminal_end = True
                sys.stdout.write(
                    "time %d | thread #%d | scene %s | target #%s\n%s %s episode reward = %.3f\n%s %s episode length = %d\n%s %s episode max Q  = %.3f\n"
                    % (global_t, self.thread_index, self.scene_scope,
                       self.task_scope, self.scene_scope, self.task_scope,
                       self.episode_reward, self.scene_scope, self.task_scope,
                       self.episode_length, self.scene_scope, self.task_scope,
                       self.episode_max_q))

                oneResult = [
                    global_t, self.thread_index, self.scene_scope,
                    self.task_scope, self.episode_reward, self.episode_length,
                    self.episode_max_q
                ]
                with open('trainingOutput.csv', 'a+') as fp:
                    # fd.write(oneResult)
                    wr = csv.writer(fp)
                    wr.writerow(oneResult)

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                self.local_network.reset_state()
                self.env.reset()

                break

        R = 0.0
        usf_R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.target, self.scopes)
            usf_R = self.local_network.run_usf(sess, self.env.s_t,
                                               self.env.target, self.scopes)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        state_representation.reverse()
        state_representation_next.reverse()

        usf_next.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []
        batch_usf_R = []
        batch_t = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi, ti, state,
             usf_n) in zip(actions, rewards, states, values, targets,
                           state_representation_next, usf_next):

            R = ri + GAMMA * R
            usf_R = state + GAMMA * usf_R
            #usf_R = state + GAMMA*usf_n

            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_usf_R.append(usf_R)
            batch_t.append(ti)

        # reverse back: training unrolls the LSTM forward over the 5 steps, unlike inference

        batch_si.reverse()
        batch_a.reverse()
        batch_td.reverse()
        batch_R.reverse()
        batch_usf_R.reverse()
        batch_t.reverse()

        sess.run(self.accum_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.t: batch_t,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R,
                     self.local_network.return_usf: batch_usf_R,
                     self.local_network.initial_lstm_state: start_lstm_state,
                     self.local_network.step_size: [len(batch_a)],
                 })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
Example 10
    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):

        if self.env is None:
            # lazy evaluation
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })
            self.env.reset()  #resetting the environment for each thread

        states = []  # buffers for states, actions, targets, etc.
        actions = []
        rewards = []
        values = []
        targets = []
        dones = []

        terminal_end = False  # no terminal state reached yet

        # reset accumulated gradients at the start of each iteration
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            # each thread runs at most LOCAL_T_MAX (5) steps before a gradient update

            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.target, self.scopes)
            #pi_Old, value_Old = self.local_network.run_policy_and_value_old(sess, self.env.s_t, self.env.target, self.scopes)

            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)

            if VERBOSE and (self.thread_index
                            == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward  # reward from the environment
            terminal = self.env.terminal  # whether the agent reached a terminal state

            # ad-hoc reward for navigation
            reward = 10.0 if terminal else -0.01  # shaped reward: 10 on reaching the target, -0.01 per step (collisions are not penalised)
            if self.episode_length > 5e3:
                terminal = True  # cap episodes at 5000 steps by forcing a terminal
            # forcing terminal here does not award the 10.0 reward, since the reward was already set above

            self.episode_reward += reward
            self.episode_length += 1
            # track the maximum value estimate seen in the episode (starts at -inf)
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip the reward to [-1, 1] (so the terminal reward of 10 becomes 1)
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()

            if terminal:  # on a terminal state, log the episode and end the rollout
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))
                terminal_end = True
                sys.stdout.write(
                    "time %d | thread #%d | scene %s | target #%s\n%s %s episode reward = %.3f\n%s %s episode length = %d\n%s %s episode max Q  = %.3f\n"
                    % (global_t, self.thread_index, self.scene_scope,
                       self.task_scope, self.scene_scope, self.task_scope,
                       self.episode_reward, self.scene_scope, self.task_scope,
                       self.episode_length, self.scene_scope, self.task_scope,
                       self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                self.episode_reward = 0  # reset the episode statistics after termination
                self.episode_length = 0  # the agent restarts from a new position
                self.episode_max_q = -np.inf
                self.env.reset()

                break
        # Here one could call the discriminator and get a reward signal from it:
        #   R_D = sess.run(D.get_reward(state, action))

        R = 0.0  # no bootstrap return if the episode terminated

        if not terminal_end:  # otherwise bootstrap with the value of the current state
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.target, self.scopes)

        Returns = np.zeros_like(rewards)
        Advants = np.zeros_like(rewards)
        lastgaelam = 0
        LAMBDA = 0.9
        GAM = 0.9

        self.nsteps = len(rewards)

        # before the PPO-style update, all parameters are assigned to the old-policy network (see old_new_sync below)

        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                nextnonterminal = 1.0 - bool(R == 0)  # R == 0 means the episode ended at a terminal state
                nextvalues = R

            else:
                nextnonterminal = 1.0 - bool(R == 0)
                nextvalues = values[t + 1]
            delta = rewards[t] + GAM * nextvalues * nextnonterminal - values[t]
            Advants[
                t] = lastgaelam = delta + GAM * LAMBDA * lastgaelam * nextnonterminal
            Returns[t] = Advants[t] + values[t]

        #Returns=Advants+values #This is more of the v_next

        Advants = (Advants - Advants.mean()) / (Advants.std() + 1e-5)
        #Returns=(Returns - Returns.mean()) / (Returns.std() + 1e-5)

        Returns = Returns.tolist()
        Advants = Advants.tolist()

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()
        Returns.reverse()
        Advants.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []
        batch_t = []
        batch_advant = []
        batch_Return = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi, ti, Re, Ad) in zip(actions, rewards, states,
                                                values, targets, Returns,
                                                Advants):
            R = ri + GAMMA * R  # accumulate the discounted return
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1  # one-hot encode the action

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_t.append(ti)
            batch_advant.append(Ad)
            batch_Return.append(Re)

        sess.run(self.old_new_sync)
        cur_learning_rate = self._anneal_learning_rate(global_t)

        for i in range(3):

            sess.run(
                self.accum_gradients,  # order does not matter: the update only needs (state, action, advantage, return) tuples
                feed_dict={
                    self.local_network.s: batch_si,
                    self.local_network.a: batch_a,
                    self.local_network.t: batch_t,
                    self.local_network.td: batch_td,
                    self.local_network.r: batch_R,
                    self.local_network.Returns: batch_Return,
                    self.local_network.Advantages: batch_advant
                })

            sess.run(self.apply_gradients_local,
                     feed_dict={self.learning_rate_input: cur_learning_rate})

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
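# --- Hedged sketch, not part of the original source: the advantage loop above is generalized
# --- advantage estimation (GAE); the same computation as a standalone helper. The original
# --- infers terminality from `R == 0`; here it is passed explicitly as a flag.
import numpy as np

def compute_gae(rewards, values, bootstrap_value, terminal, gamma=0.9, lam=0.9):
    """Return (returns, advantages) computed with GAE(gamma, lambda)."""
    n = len(rewards)
    advantages = np.zeros(n, dtype=np.float32)
    lastgaelam = 0.0
    nonterminal = 0.0 if terminal else 1.0
    for t in reversed(range(n)):
        next_value = bootstrap_value if t == n - 1 else values[t + 1]
        delta = rewards[t] + gamma * next_value * nonterminal - values[t]
        lastgaelam = delta + gamma * lam * lastgaelam * nonterminal
        advantages[t] = lastgaelam
    returns = advantages + np.asarray(values, dtype=np.float32)
    return returns, advantages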
Example 11
    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):  #This is to run the process
        if self.env is None:
            # lazy evaluation
            time.sleep(self.thread_index * 1.0)
            self.env = Environment({
                # the environment is the THORDiscreteEnvironment from scene_loader
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })

            self.env.reset()
            self.oracle = ShortestPathOracle(
                self.env, ACTION_SIZE
            )  # gives the shortest-path action probabilities towards the target

        states = []
        targets = []
        oracle_pis = []  #expert policies

        terminal_end = False

        if self.mode is "train":  #if the trainign is there
            # reset accumulated gradients
            sess.run(self.reset_gradients)  #reet all the gradients

            # copy weights from shared to local
            sess.run(self.sync)  #

        start_local_t = self.local_t

        # t_max times loop (5 steps)
        for i in range(LOCAL_T_MAX):  # This is for the training

            flipped_run = self.encourage_symmetry and np.random.random() > 0.5

            if flipped_run:
                s_t = self.env.target
                g = self.env.s_t
            else:
                s_t = self.env.s_t
                g = self.env.target  # initially the state is the same frame repeated 4 times as the stacked history (2048-d features per frame)

            smashnet_pi = self.local_network.run_policy(
                sess, s_t, g,
                self.scopes)  # get the policy from the local network
            if flipped_run: smashnet_pi = self._flip_policy(smashnet_pi)

            oracle_pi = self.oracle.run_policy(
                self.env.current_state_id
            )  # oracle policy: shortest-path action distribution for the current state

            diffidence_rate = self._anneal_diffidence_rate(global_t)

            action = self.choose_action(smashnet_pi, oracle_pi,
                                        diffidence_rate)

            states.append(s_t)  # stack the state
            targets.append(g)  # stack the target
            if flipped_run: oracle_pis.append(self._flip_policy(oracle_pi))
            else: oracle_pis.append(oracle_pi)  # stack the expert's policy

            # if VERBOSE and global_t % 10000 == 0:
            #       print("Thread %d" % (self.thread_index))
            #       sys.stdout.write("SmashNet Pi = {}, Oracle Pi = {}\n".format(["{:0.2f}".format(i) for i in smashnet_pi], ["{:0.2f}".format(i) for i in oracle_pi]))

            if VALIDATE and global_t % VALIDATE_FREQUENCY == 0 and global_t > 0 and self.thread_index == 0:  # periodic validation of the results
                results = self._evaluate(sess,
                                         list_of_tasks=VALID_TASK_LIST,
                                         num_episodes=NUM_VAL_EPISODES,
                                         max_steps=MAX_VALID_STEPS,
                                         success_cutoff=SUCCESS_CUTOFF)
                print("Thread %d" % (self.thread_index))
                print("Validation results: %s" % (results))

            self.env.step(action)  # advance the environment

            is_terminal = self.env.terminal or self.episode_length > 5e3
            if self.mode == "val" and self.episode_length > 1e3:
                is_terminal = True

            self.episode_length += 1
            self.episode_pi_sim += 1. - cosine(smashnet_pi, oracle_pi)

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()  # s_t1 -> s_t

            if is_terminal:
                terminal_end = True
                if self.mode is "val":
                    sess.run(self.sync)
                    sys.stdout.write(
                        "time %d | thread #%d | scene %s | target %s | episode length = %d\n"
                        % (global_t, self.thread_index, self.scene_scope,
                           self.task_scope, self.episode_length))

                summary_values = {
                    "episode_length_input":
                    float(self.episode_length),
                    "episode_pi_sim_input":
                    self.episode_pi_sim / float(self.episode_length),
                    "episode_loss_input":
                    float(self.episode_loss)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                self.episode_length = 0
                self.episode_pi_sim = 0
                self.episode_loss = 0
                self.env.reset()

                break

        if self.mode is "train":
            states.reverse()
            oracle_pis.reverse()

            batch_si = []
            batch_ti = []
            batch_opi = []

            # compute and accumulate gradients
            for (si, ti, opi) in zip(states, targets, oracle_pis):

                batch_si.append(si)
                batch_ti.append(ti)
                batch_opi.append(opi)

            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.t: batch_ti,
                         self.local_network.opi: batch_opi
                     })

            self.episode_loss += sum(
                sess.run(self.local_network.loss,
                         feed_dict={
                             self.local_network.s: batch_si,
                             self.local_network.t: batch_ti,
                             self.local_network.opi: batch_opi
                         }))

            cur_learning_rate = self._anneal_learning_rate(global_t)
            sess.run(self.apply_gradients,
                     feed_dict={self.learning_rate_input: cur_learning_rate})

        # if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
        #   sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
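# --- Hedged sketch, not part of the original source: one plausible form of the
# --- `choose_action(smashnet_pi, oracle_pi, diffidence_rate)` call above, mixing the
# --- learner's policy with the oracle's via the annealed diffidence rate; the authors'
# --- exact mixing rule is not shown in this snippet.
import numpy as np

def choose_action(smashnet_pi, oracle_pi, diffidence_rate):
    """With probability `diffidence_rate` follow the oracle, otherwise the learner."""
    pi = oracle_pi if np.random.random() < diffidence_rate else smashnet_pi
    pi = np.asarray(pi, dtype=np.float64)
    pi /= pi.sum()  # renormalise before sampling
    return np.random.choice(len(pi), p=pi)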
Example 12
def main(args):
    scene_scope = 'bathroom_02'
    task_scope = 26  #26 43 53 32 41
    env = Environment({
        'scene_name': scene_scope,
        'terminal_state_id': int(task_scope)
    })

    S_Class = SIAMESE()  # create the siamese network object

    Policy = Policy_net(
        'policy', S_Class)  # build the actor-critic graph / object
    Old_Policy = Policy_net('old_policy',
                            S_Class)  # old-policy copy used by PPO

    PPO = PPOTrain(Policy, Old_Policy,
                   gamma=args.gamma)  # gradient-update object / graph
    D = Discriminator(S_Class)  # GAIL-style discriminator
    '''
    batch_n=tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Siamese')
    '''

    # Load expert data: states, targets, actions
    expert_observations = np.genfromtxt(
        'trajectory/observations.csv')  # load expert demonstrations
    expert_targets = np.genfromtxt('trajectory/targets.csv')
    expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)
    expert_observations = np.reshape(expert_observations,
                                     newshape=[-1, 2048, 4])
    expert_targets = np.reshape(expert_targets, newshape=[-1, 2048, 4])

    saver = tf.train.Saver()  # a second saver below can be used to restore BC-pretrained weights
    if args.restore:  # a separate saver restores only the parameters trained with behaviour cloning
        saver2 = tf.train.Saver(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='policy') +
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Siamese'))

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())  # initialises variables of both the old and new policy nets

        if args.restore:
            if args.model == '':
                saver2.restore(
                    sess,
                    args.modeldir + '/' + args.alg + '/' + 'shamane.ckpt')
                print("Model Reastored")
            else:
                saver.restore(
                    sess, args.modeldir + '/' + args.alg + '/' +
                    'model.ckpt-' + args.model)

        success_num = 0  # counts how many times the agent reached the terminal state

        #var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

        for iteration in range(
                100000):  #args.iteration)  # start of the adversarial training loop
            print("Starting iteration:", iteration)
            observations = []
            actions = []
            #rewards = []
            targets = []  # targets for the GAIL discriminator
            v_preds = []
            run_policy_steps = 0

            while True:  # sample a trajectory from the current (still under-trained) agent
                run_policy_steps += 1
                obs = np.stack([env.s_t]).astype(
                    dtype=np.float32
                )  # prepare to feed the placeholder Policy.obs (current observation)
                target = np.stack([env.s_target]).astype(
                    dtype=np.float32
                )  # ensure the input has shape [batch_size, 2048, 4]

                act, v_pred, prob = Policy.act(
                    state=obs, target=target,
                    stochastic=True)  # Agents action and values

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                observations.append(obs)  #save the set of observations
                targets.append(target)
                actions.append(act)  #save the set of actions
                v_preds.append(v_pred)

                #next_obs, reward, done, info = env.step(act)  #get the next observation and reward acording to the observation
                next_obs, is_terminal, is_collided = env.step(act)

                if is_terminal:
                    success_num = success_num + 1
                    print("The agent reached the terminal state:",
                          env.terminal_state_id)

                if is_collided:
                    print("The agent collided and did not reach the terminal state:",
                          env.terminal_state_id)

                if (is_terminal or is_collided
                        or (run_policy_steps
                            == 100)):  # run one episode until termination (or 100 steps)
                    print("Number of steps explored by the agent:",
                          run_policy_steps)
                    v_preds_next = v_preds[1:] + [
                        0
                    ]  # the state after the terminal state has value 0; used to update the value net
                    print("Environment is resetting after the collision/terminal")
                    obs = env.reset()
                    #reward = -1
                    break  # end the episode; the lists above are re-initialised at the start of the next iteration

            #print(sum(rewards))

            writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag='episode_length',
                                     simple_value=run_policy_steps)
                ]), iteration)
            #writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))])
            #, iteration)

            if success_num >= 5000:
                saver.save(sess, args.savedir + '/model.ckpt')
                print('Clear!! Model saved.')
                break
            #else:
            #success_num = 0

            # convert list to numpy array for feeding tf.placeholder
            observations = np.reshape(observations,
                                      newshape=[-1, 2048,
                                                4])  #collect observations
            targets = np.reshape(targets, newshape=[-1, 2048, 4])
            actions = np.array(actions).astype(
                dtype=np.int32)  #collect the actions

            # train discriminator  #Here comes the Discriminator !!
            Dis_input = [
                expert_observations, expert_targets, expert_actions,
                observations, targets, actions
            ]
            High = min(observations.shape[0], expert_observations.shape[0])
            for i in range(100):
                sample_indices = np.random.randint(low=0, high=High, size=32)
                sampled_inp_D = [
                    np.take(a=a, indices=sample_indices, axis=0)
                    for a in Dis_input
                ]

                D.train(expert_s=sampled_inp_D[0],
                        expert_t=sampled_inp_D[1],
                        expert_a=sampled_inp_D[2],
                        agent_s=sampled_inp_D[3],
                        agent_t=sampled_inp_D[4],
                        agent_a=sampled_inp_D[5])
                '''
               
                D.train(expert_s=expert_observations,        
                        expert_t=expert_targets,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_t=targets,
                        agent_a=actions)
                '''


            # to get rewards one could also use an RNN and read a reward at every time step
            d_rewards = D.get_rewards(
                agent_s=observations, agent_t=targets, agent_a=actions
            )  # how well the agent performed with respect to the expert
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(
                dtype=np.float32)  # one reward per state-action pair

            gaes = PPO.get_gaes(
                rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next
            )  # compute the generalized advantage estimates for PPO
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(
                dtype=np.float32)  #This is the next value function

            #train policy
            inp = [
                observations, targets, actions, gaes, d_rewards, v_preds_next
            ]
            PPO.assign_policy_parameters(
            )  #copy the current policy weights into the old-policy network used for the PPO ratio
            for epoch in range(
                    100
            ):  #train the agent (actor-critic) on the collected rollouts and the trained discriminator
                sample_indices = np.random.randint(
                    low=0, high=observations.shape[0],
                    size=32)  # indices are in [low, high)

                sampled_inp = [
                    np.take(a=a, indices=sample_indices, axis=0) for a in inp
                ]  # minibatch for training the policy network

                PPO.train(state=sampled_inp[0],
                          targets=sampled_inp[1],
                          actions=sampled_inp[2],
                          gaes=sampled_inp[3],
                          rewards=sampled_inp[4],
                          v_preds_next=sampled_inp[5])

            summary = PPO.get_summary(obs=inp[0],
                                      target=inp[1],
                                      actions=inp[2],
                                      gaes=inp[3],
                                      rewards=inp[4],
                                      v_preds_next=inp[5])

            writer.add_summary(summary, iteration)
        writer.close()
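
PPO.get_gaes is called above but its body is not part of this example. Below is a minimal standalone sketch of the computation it presumably stands for, assuming standard Generalized Advantage Estimation; the gamma and lam defaults are illustrative, not values taken from the snippet.

def get_gaes(rewards, v_preds, v_preds_next, gamma=0.95, lam=1.0):
    """Generalized Advantage Estimation over one rollout (sketch)."""
    # one-step TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = [r + gamma * v_next - v
              for r, v_next, v in zip(rewards, v_preds_next, v_preds)]
    gaes = list(deltas)
    # accumulate the residuals backwards in time
    for t in reversed(range(len(gaes) - 1)):
        gaes[t] = gaes[t] + gamma * lam * gaes[t + 1]
    return gaes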
Esempio n. 13
0
def train(rank,
          scene_scope,
          task_scope,
          args,
          shared_model,
          counter,
          lock,
          optimizer=None):
    torch.manual_seed(args.seed + rank)

    #env = create_atari_env(args.env_name)
    #env.seed(args.seed + rank)

    env = Environment({
        'scene_name': scene_scope,
        'terminal_state_id': int(task_scope)
    })

    model = ActorCriticFFNetwork(ACTION_SIZE)

    if optimizer is None:
        # TODO: Discount learning rate based on episode length
        optimizer = my_optim.SharedRMSprop(shared_model.parameters(),
                                           lr=args.lr,
                                           alpha=args.alpha,
                                           eps=args.eps)
        optimizer.share_memory()

    model.train()

    env.reset()
    state = torch.from_numpy(env.s_t)
    done = True

    episode_length = 0
    for i in range(int(args.max_episode_length)):
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        '''
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)
        '''

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            print('Thread: ', rank, ', step: ', step, 'epochs:', i)
            episode_length += 1
            logit, value = model(env.s_t, env.target)
            prob = F.softmax(logit, dim=1)
            log_prob = F.log_softmax(logit, dim=1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).data
            log_prob = log_prob.gather(1, Variable(action))

            env.step(action)
            #state, reward, done, _ = env.step(action.numpy())
            env.update()
            state = env.s_t
            reward = env.reward
            done = env.terminal

            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            with lock:
                if counter.value % 1000 == 0:
                    print('Now saving data. Please wait.')
                    torch.save(shared_model.state_dict(),
                               CHECKPOINT_DIR + '/' + 'checkpoint.pth.tar')
                counter.value += 1

            if done:
                episode_length = 0
                if env.terminal:
                    print('Task completed')
                with lock:
                    counter.value += 1
                env.reset()
                state = env.s_t

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            _, value = model(env.s_t, env.target)
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for t in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[t]
            advantage = R - values[t]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[t] + args.gamma * \
                values[t + 1].data - values[t].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[t] * Variable(gae) - args.entropy_coef * entropies[t]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
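
ensure_shared_grads is referenced above but not defined in this snippet. A common A3C-style implementation links each local gradient to the corresponding shared parameter, skipping the copy once the shared gradients are already set; the sketch below assumes that convention rather than reproducing the repository's actual helper.

def ensure_shared_grads(model, shared_model):
    # point the shared model's gradients at the locally computed ones;
    # once shared_param.grad is set in this process, no further copying is needed
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad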
Esempio n. 14
0
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded: {}".format(checkpoint.model_checkpoint_path))
    else:
        print("Could not find old checkpoint")

    scene_stats = dict()
    for scene_scope in scene_scopes:

        scene_stats[scene_scope] = []
        for task_scope in list_of_tasks[scene_scope]:

            print(task_scope)
            print("______________________________________________")

            env = Environment({
                'scene_name': scene_scope,
                'terminal_state_id': int(task_scope)
            })
            ep_rewards = []
            ep_lengths = []
            ep_collisions = []

            scopes = [network_scope, scene_scope, task_scope]

            for i_episode in range(NUM_EVAL_EPISODES):

                env.reset()
                terminal = False
                ep_reward = 0
                ep_collision = 0
                ep_t = 0
Esempio n. 15
0
    def save_expert(self):

        states = []
        action_list = []
        target_list = []
        action_history_list = []

        for i in range(10):
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': int(self.task_scope)
            })
            print(
                "Starting -",
                "*******************************************************************-----------",
                i)
            print("From the environment Current State ID--",
                  self.env.current_state_id)
            print("From the environment Traget State ID--",
                  self.env.terminal_state_id)
            print("Frm the environment Number of possible states",
                  self.env.n_locations)
            print(
                "________________________________________________________________________________"
            )

            self.oracle = ShortestPathOracle(self.env, ACTION_SIZE)
            action_itr = 0  #To stack action history

            while (not (self.env.terminal)):

                s_t = self.env.s_t
                target = self.env.s_target
                #self.oracle = ShortestPathOracle(self.env, ACTION_SIZE)
                oracle_pi = self.oracle.run_policy(
                    self.env.current_state_id
                )  #get the oracle policy, i.e. the shortest-path action distribution for the current state

                action = self.choose_action(
                    oracle_pi)  #select the action probabilistically

                if action_itr == 0:
                    action_his = np.tile(action, (4))
                    action_itr = 1
                else:
                    action_his = np.append(action_his[1:], action)

                #Saving current state, target and action triplets
                states.append(s_t)  #stack the current state
                action_list.append(action)
                target_list.append(target)
                action_history_list.append(action_his)

                self.env.step(action)  #here we change the next step

                is_terminal = self.env.terminal
                is_collided = self.env.collided
                self.local_t += 1

                if is_collided:
                    print(
                        "Wrong action -- collision detected while following the oracle"
                    )
                    break

                # s_t1 -> s_t
                self.env.update()  #update the new state
                #self.env.reset() #With this

            print("Done with one epoach one start state to end goal ")

            self.env.reset()

        states = np.reshape(states, newshape=[-1, 8192])
        target_list = np.reshape(target_list, newshape=[-1, 8192])
        self.open_file_and_save('trajectory/observations.csv', states)
        print(np.shape(states))
        print(np.shape(action_list))
        print(np.shape(action_history_list))
        print(np.shape(target_list))
        self.open_file_and_save('trajectory/actions.csv', action_list)
        self.open_file_and_save('trajectory/targets.csv', target_list)
        self.open_file_and_save('trajectory/actions_history.csv',
                                action_history_list)
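
self.open_file_and_save is not shown in this example. A minimal standalone sketch, assuming it simply appends the rows of the given array to a CSV-style file; the format string and append-mode behaviour are assumptions, not taken from the original code.

import numpy as np

def open_file_and_save(file_path, data):
    # append the rows of `data` to the file, creating it on first use
    with open(file_path, 'ab') as f_handle:
        np.savetxt(f_handle, data, fmt='%s')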
Esempio n. 16
0
    def process(self, sess, global_t, summary_writer, summary_op,
                summary_placeholders):

        if self.env is None:
            # lazy evaluation
            time.sleep(self.thread_index * 1.0)
            self.task_scope_name = random.randint(1, 468) - 1
            self.env = Environment({
                'scene_name': self.scene_scope,
                'terminal_state_id': self.task_scope_name
            })
            self.env.reset()

        states = []
        actions = []
        rewards = []
        values = []
        targets = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.env.s_t, self.env.target, self.scopes)
            action = self.choose_action(pi_)

            states.append(self.env.s_t)
            actions.append(action)
            values.append(value_)
            targets.append(self.env.target)

            if VERBOSE and (self.thread_index
                            == 0) and (self.local_t % 1000) == 0:
                sys.stdout.write("Pi = {0} V = {1}\n".format(pi_, value_))

            # process game
            self.env.step(action)

            # receive game result
            reward = self.env.reward
            terminal = self.env.terminal

            # ad-hoc reward for navigation
            reward = 10.0 if terminal else -0.01
            if self.episode_length > 5e3: terminal = True

            self.episode_reward += reward
            self.episode_length += 1
            self.episode_max_q = max(self.episode_max_q, np.max(value_))

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.env.update()

            if terminal:
                terminal_end = True
                print('----------')
                print('real terminal id is {}'.format(self.task_scope_name))
                sys.stdout.write(
                    "time %d | thread #%d | scene %s | target #%s\n%s %s episode reward = %.3f\n%s %s episode length = %d\n%s %s episode max Q  = %.3f\n"
                    % (global_t, self.thread_index, self.scene_scope,
                       self.task_scope, self.scene_scope, self.task_scope,
                       self.episode_reward, self.scene_scope, self.task_scope,
                       self.episode_length, self.scene_scope, self.task_scope,
                       self.episode_max_q))

                summary_values = {
                    "episode_reward_input": self.episode_reward,
                    "episode_length_input": float(self.episode_length),
                    "episode_max_q_input": self.episode_max_q,
                    "learning_rate_input": self._anneal_learning_rate(global_t)
                }

                self._record_score(sess, summary_writer, summary_op,
                                   summary_placeholders, summary_values,
                                   global_t)
                self.episode_reward = 0
                self.episode_length = 0
                self.episode_max_q = -np.inf
                self.task_scope_name = random.randint(1, 468) - 1
                self.env = Environment({
                    'scene_name':
                    self.scene_scope,
                    'terminal_state_id':
                    self.task_scope_name
                })
                self.env.reset()
                print('init id is {}'.format(self.env.current_state_id))
                print('----------')
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.env.s_t,
                                             self.env.target, self.scopes)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []
        batch_t = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi, ti) in zip(actions, rewards, states, values,
                                        targets):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)
            batch_t.append(ti)

        if USE_LSTM:
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()
            batch_t.reverse()

            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.t: batch_t,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R,
                         self.local_network.initial_lstm_state:
                         start_lstm_state,
                     })
            #  self.local_network.step_size: [len(batch_a)]
        else:
            sess.run(self.accum_gradients,
                     feed_dict={
                         self.local_network.s: batch_si,
                         self.local_network.a: batch_a,
                         self.local_network.t: batch_t,
                         self.local_network.td: batch_td,
                         self.local_network.r: batch_R
                     })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if VERBOSE and (self.thread_index == 0) and (self.local_t % 100) == 0:
            sys.stdout.write("Local timestep %d\n" % self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
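
_anneal_learning_rate and choose_action are used by process() but defined elsewhere. They are sketched below as methods of the same training-thread class under common assumptions: a linear decay of the learning rate to zero over the training horizon, and sampling the action index from the policy's output probabilities. initial_learning_rate and max_global_time_step are assumed attributes, and pi_values is assumed to be a normalized probability vector.

    def _anneal_learning_rate(self, global_time_step):
        # linearly decay the learning rate from its initial value to zero
        lr = self.initial_learning_rate * \
            (self.max_global_time_step - global_time_step) / self.max_global_time_step
        return max(lr, 0.0)

    def choose_action(self, pi_values):
        # sample an action index according to the policy probabilities
        return np.random.choice(len(pi_values), p=pi_values)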
Esempio n. 17
0
        else:
            checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(sess, checkpoint_path)
        print("checkpoint loaded: {}".format(checkpoint_path))
    else:
        print("Could not find old checkpoint")

    scene_stats = dict()
    for scene_scope in scene_scopes:

        scene_stats[scene_scope] = []
        for task_scope in list_of_tasks[scene_scope]:

            env = Environment({
                'scene_name': scene_scope,
                'terminal_state_id': int(task_scope),
                'initial_state': EVAL_INIT_LOC,
            })
            ep_rewards = []
            ep_lengths = []
            ep_collisions = []

            scopes = [network_scope, scene_scope, task_scope]

            for i_episode in range(NUM_EVAL_EPISODES):

                env.reset()
                terminal = False
                ep_reward = 0
                ep_collision = 0
                ep_t = 0
Esempio n. 18
0
def test(rank, scene_scope, task_scope, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    
    env = Environment({
        'scene_name': scene_scope,
        'terminal_state_id': int(task_scope)
        })
    
    model = ActorCriticFFNetwork(ACTION_SIZE)

    model.eval()

    height, width, layers = env.observation.shape
    video = cv2.VideoWriter('video/' + task_scope + '.mp4', -1, 1, (width, height))

    env.reset()
    state = torch.from_numpy(env.s_t)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0

    img = cv2.cvtColor(env.observation, cv2.COLOR_BGR2RGB)
    video.write(img)
    for i in range(100):
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())

        logit, value = model(env.s_t, env.target)
        prob = F.softmax(logit, dim=1)
        action = prob.max(1, keepdim=True)[1].data.numpy()
        env.step(action[0, 0])
        env.update()        
        img = cv2.cvtColor(env.observation, cv2.COLOR_BGR2RGB)
        video.write(img)
        
        reward = env.reward
        state = env.s_t
        done = env.terminal
        print(env.terminal_state_id, env.current_state_id)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            env.reset()
            state = env.s_t
            break

        state = torch.from_numpy(state)
    cv2.destroyAllWindows()
    video.release()
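
train() and test() above are written as torch.multiprocessing workers sharing one model. The launch code is not part of this example; the sketch below shows one plausible way to wire them together, assuming an args namespace that provides num_processes, and using 'bathroom_02' / task '37' only as illustrative scene and target values.

import torch.multiprocessing as mp

if __name__ == '__main__':
    shared_model = ActorCriticFFNetwork(ACTION_SIZE)
    shared_model.share_memory()  # make the weights visible to all worker processes

    counter = mp.Value('i', 0)  # shared global step counter
    lock = mp.Lock()

    processes = []
    # one evaluation worker writes videos via test()
    p = mp.Process(target=test,
                   args=(0, 'bathroom_02', '37', args, shared_model, counter))
    p.start()
    processes.append(p)
    # the remaining workers run the A3C-style training loop
    for rank in range(1, args.num_processes):
        p = mp.Process(target=train,
                       args=(rank, 'bathroom_02', '37', args, shared_model,
                             counter, lock))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()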
Esempio n. 19
0
        saver.restore(sess, checkpoint_path)
        print("checkpoint loaded: {}".format(checkpoint_path))
    else:
        print("Could not find old checkpoint")

    scene_stats = dict()
    action_list = []
    flag_i = 0
    for scene_scope in scene_scopes:

        scene_stats[scene_scope] = []
        for task_scope in list_of_tasks[scene_scope]:

            env = Environment({
                'scene_name': SCENE_NAME,
                'terminal_state_id': int(task_scope),
                #'initial_state': EVAL_INIT_LOC,
            })
            real_target_xz.append(
                [xz_numpy[int(task_scope)][0], xz_numpy[int(task_scope)][1]])
            ep_rewards = []
            ep_lengths = []
            ep_collisions = []

            scopes = [network_scope, scene_scope, task_scope]
            #time.sleep(5)
            a = np.arange(300 * 400 * 3)
            imx = a.reshape(300, 400, 3).astype('uint8')
            # im0 = img_from_thor()
            #env.last_event.frame.astype('float32')
            show_im = env.observation_target