Example #1
 def evaluate(self, visualize=False):
     # Try the policy
     traj_list = []
     for n_iter in range(5):
         problem = ConveyorBelt()  # different "initial" state
         traj = problem.execute_policy(self, 20, self.v)
         traj_list.append(traj)
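         # destroy the problem's environment and the OpenRAVE state so the next rollout starts fresh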
         problem.env.Destroy()
         RaveDestroy()
     avg_J = np.mean([np.sum(traj['r']) for traj in traj_list])
     std_J = np.std([np.sum(traj['r']) for traj in traj_list])
     return avg_J, std_J
Example #2
 def evaluate(self, visualize=False):
     # Try the policy
     traj_list = []
     for n_iter in range(5):
         problem = ConveyorBelt()  # different "initial" state
         traj = problem.execute_policy(self, 20, self.v)
         traj_list.append(traj)
         problem.env.Destroy()
         RaveDestroy()
     avg_J = np.mean([np.sum(traj['r']) for traj in traj_list])
     std_J = np.std([np.sum(traj['r']) for traj in traj_list])
     #pfile = open(self.save_folder+'/performance.txt','a')
     #pfile.write(str(i)+','+str(avg_J)+','+str(std_J)+'\n')
     #pfile.close()
     return avg_J, std_J
Example #3
    def parallel_rollout(self):
        n_procs = 5
        pool = ThreadPool(n_procs)
        procs = []
        problems = []
        for i in range(n_procs):
            problems.append(ConveyorBelt())  # different "initial" state

        traj_list = []
        for i in range(n_procs):
            print 'applying', i
            procs.append(
                pool.apply_async(self.rollout_thread, args=(problems[i], i)))

        pool.close()
        pool.join()
        print [p.successful() for p in procs]
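        # if a worker raised an exception, redo its rollout serially so every problem still yields a trajectory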
        for pidx, p in enumerate(procs):
            if not p.successful():  # Why does it ever fail?
                print pidx, 'Unsuccessful'
                traj_list.append(self.rollout_thread(problems[pidx], pidx))
            else:
                traj_list.append(p.get())

        return traj_list
Example #4
    def train(self, states, actions, rewards, sprimes, sumR, traj_lengths,
              epochs=500, d_lr=1e-3, g_lr=1e-4):
        states = states.squeeze()
        sprimes = sprimes.squeeze()
        true_performance_list = []
        G_performance_list = []
        mse_list = []

        K.set_value(self.opt_G.lr, g_lr)
        K.set_value(self.opt_D.lr, d_lr)

        print self.opt_G.get_config()
        print "Fitting V..."
        current_best_J = -np.inf

        stime = time.time()

        # one initial update of V, the advantages, and the policy on the data passed in
        self.update_V(states, sumR)
        adv = self.compute_A(states, actions, sprimes, rewards, traj_lengths)
        self.update_pi(states, actions, adv)

        self.saveWeights(additional_name='epoch_' + str(0))
        print time.time() - stime

        # train pi
        for i in range(1, epochs):
            stime = time.time()
            print 'Completed: %.2f%%' % (i / float(epochs) * 100)
            # Try policy - 5 trajectories, each 20 long
            traj_list = []
            for n_iter in range(5):  # N = 5, T = 20, using the notation from the PPO paper
                problem = ConveyorBelt()  # different "initial" state
                traj = problem.execute_policy(self,
                                              20,
                                              visualize=self.visualize)
                traj_list.append(traj)
                problem.env.Destroy()
                RaveDestroy()
            avg_J = np.mean([np.sum(traj['r']) for traj in traj_list])
            std_J = np.std([np.sum(traj['r']) for traj in traj_list])
            pfile = open(self.save_folder + '/performance.txt', 'a')
            pfile.write(str(i) + ',' + str(avg_J) + ',' + str(std_J) + '\n')
            pfile.close()
            print 'Score of this policy', avg_J
            print time.time() - stime

            # Format the new rollouts and update V and the policy on them
            new_s, new_a, new_r, new_sprime, new_sumR, _, new_traj_lengths = format_RL_data(
                traj_list)
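            # rescale the fresh actions with self.a_scaler (presumably the same scaler fit on the original training actions)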
            new_a = self.a_scaler.transform(new_a)

            self.update_V(new_s, new_sumR)
            new_sumA = self.compute_A(new_s, new_a, new_sprime, new_r,
                                      new_traj_lengths)
            self.update_pi(new_s, new_a, new_sumA)

            if avg_J > current_best_J:
                current_best_J = avg_J
                theta_star = self.save_folder + '/policy_search_' + str(i) + '.h5'
                self.saveWeights(additional_name='epoch_' + str(i) + '_' + str(avg_J))
Example #5
 def __init__(self, problem_idx, n_actions_per_node):
     ConveyorBelt.__init__(self, problem_idx, n_actions_per_node)
     self.set_objects_not_in_goal(self.objects)
Example #6
    def train(self, states, actions,
              epochs=500, d_lr=1e-3, g_lr=1e-4):
        states = states.squeeze()

        true_performance_list = []
        G_performance_list = []
        mse_list = []

        n_data = states.shape[0]
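        # batch size: 10% of the provided (real) actions, capped at 32 and floored at 1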
        BATCH_SIZE = np.min([32, int(len(actions) * 0.1)])
        if BATCH_SIZE == 0:
            BATCH_SIZE = 1
        print BATCH_SIZE

        K.set_value(self.opt_G.lr, g_lr)
        K.set_value(self.opt_D.lr, d_lr)
        print self.opt_G.get_config()

        current_best_J = -np.inf
        n_score_train = 1
        performance_list = []
        pfile = open(self.save_folder + '/performance.txt', 'w')
        for i in range(1, epochs):
            stime = time.time()

            # Rollouts
            # 5 trajectories, each 20 long
            stime = time.time()
            traj_list = []
            for n_iter in range(5):
                problem = ConveyorBelt()  # different "initial" state
                traj = problem.execute_policy(self, 20, self.v)
                traj_list.append(traj)
                problem.env.Destroy()
                RaveDestroy()
            avg_J = np.mean([np.sum(traj['r']) for traj in traj_list])
            std_J = np.std([np.sum(traj['r']) for traj in traj_list])
            pfile = open(self.save_folder + '/performance.txt', 'a')
            pfile.write(str(avg_J) + ',' + str(std_J) + '\n')
            pfile.close()
            print 'Score of this policy', avg_J

            # new rollout dataset
            new_s, new_a, new_r, new_sprime, new_sumR, _, new_traj_lengths = format_RL_data(
                traj_list)
            new_a = self.a_scaler.transform(new_a)

            # choose a batch of data
            indices = np.random.randint(0, actions.shape[0], size=BATCH_SIZE)
            s_batch = np.array(states[indices, :])  # collision vector
            a_batch = np.array(actions[indices, :])

            pi_indices = np.random.randint(0, new_a.shape[0], size=BATCH_SIZE)
            pi_s_batch = np.array(new_s[pi_indices, :])  # collision vector
            pi_a_batch = np.array(new_a[pi_indices, :])

            # make their scores
            fake_scores = np.zeros((BATCH_SIZE, 1))
            real_scores = np.ones((BATCH_SIZE, 1))
            batch_x = np.vstack([pi_a_batch, a_batch])
            batch_w = np.vstack([pi_s_batch, s_batch])
            batch_scores = np.vstack([fake_scores, real_scores])

            # Update D
            self.disc.fit({'x': batch_x, 'w': batch_w},
                          batch_scores,
                          epochs=1,
                          verbose=False)
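            # re-score the rollouts with the freshly updated discriminator to obtain learned rewards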
            new_r, new_sumR = self.compute_r_using_D(traj_list)

            # update value function
            self.update_V(new_s, new_sumR)

            # update policy
            new_sumA = self.compute_A(new_s, new_a, new_sprime, new_r,
                                      new_traj_lengths)
            self.update_pi(new_s, new_a, new_sumA)

            self.saveWeights(additional_name='epoch_' + str(i) + '_' + str(avg_J))

            print 'Completed: %.2f%%' % (i / float(epochs) * 100)
            print "Epoch took: %.2fs" % (time.time() - stime)
Example #7
    def train(self, states, actions, rewards, sprimes,
              epochs=500, d_lr=1e-3, g_lr=1e-4):
        states = states.squeeze()
        sprimes = sprimes.squeeze()
        true_performance_list = []
        G_performance_list = []
        mse_list = []

        n_data = states.shape[0]
        BATCH_SIZE = np.min([32, int(len(actions) * 0.1)])
        if BATCH_SIZE == 0:
            BATCH_SIZE = 1
        print BATCH_SIZE

        K.set_value(self.opt_G.lr, g_lr)
        K.set_value(self.opt_D.lr, d_lr)
        print self.opt_G.get_config()

        current_best_J = -np.inf
        pfile = open(self.save_folder + '/performance.txt', 'w')

        # n_episodes = epochs*5
        # T = 20, but we only update the policy once all T steps have been executed.
        # This is because this is an episodic task - you can only learn meaningful moves
        # if you go deep into the trajectory.
        # So we have 300*5*20 RL data points.
        for i in range(1, epochs):
            print 'Completed: %.2f%%' % (i / float(epochs) * 100)
            stime = time.time()

            # transitions whose s' sums to zero are treated as terminal
            # and excluded from bootstrapping
            terminal_state_idxs = np.where(
                np.sum(np.sum(sprimes, axis=-1), axis=-1) == 0)[0]
            nonterminal_mask = np.ones((sprimes.shape[0], 1))
            nonterminal_mask[terminal_state_idxs, :] = 0

            # make the targets: r + disc(pi(s'), s') for non-terminal
            # transitions, and just r for terminal ones
            fake = self.a_gen.predict([sprimes])  # actions predicted by pi at s'
            real = actions

            real_targets = rewards + np.multiply(
                self.disc.predict([fake, sprimes]), nonterminal_mask)
            stime = time.time()
            self.update_disc(real, states, real_targets, BATCH_SIZE)
            self.update_pi(states, BATCH_SIZE)
            print 'Fitting time', time.time() - stime

            # Technically speaking, we should update the policy every timestep.
            # What if we update it 100 times after executing 5 episodes of 20 timesteps each?
            stime = time.time()
            traj_list = []
            for n_iter in range(5):
                problem = ConveyorBelt()  # different "initial" state
                traj = problem.execute_policy(self, 20, self.v)
                traj_list.append(traj)
                problem.env.Destroy()
                RaveDestroy()
            avg_J = np.mean([np.sum(traj['r']) for traj in traj_list])
            std_J = np.std([np.sum(traj['r']) for traj in traj_list])
            pfile = open(self.save_folder + '/performance.txt', 'a')
            pfile.write(str(i) + ',' + str(avg_J) + ',' + str(std_J) + '\n')
            pfile.close()
            print 'Score of this policy', avg_J

            # Add new data to the buffer - only if this was a non-zero trajectory
            if avg_J > 1.0:
                new_s, new_a, new_r, new_sprime, new_sumR, _, new_traj_lengths = format_RL_data(
                    traj_list)
                new_a = self.a_scaler.transform(new_a)
                states = np.r_[states, new_s.squeeze()]
                actions = np.r_[actions, new_a]
                rewards = np.r_[rewards, new_r]
                sprimes = np.r_[sprimes, new_sprime.squeeze()]
                print "Rollout time", time.time() - stime

            if avg_J > current_best_J:
                current_best_J = avg_J
                theta_star = self.save_folder + '/policy_search_' + str(i) + '.h5'
                self.saveWeights(additional_name='tau_' + str(self.tau) + 'epoch_' +
                                 str(i) + '_' + str(avg_J))