Example #1
    def path_callback(self, path):
        path_length = len(path["obs"])
        self._steps_since_last_training += path_length

        self.agent_logger.log_episode(path)

        # We may be in a new part of the environment, so we take new segments to build comparisons from
        segment = sample_segment_from_path(path, int(self._frames_per_segment))
        if segment:
            self.recent_segments.append(segment)

        # If we need more comparisons, then we build them from our recent segments,
        # but avoid flooding the queue while comparisons are still awaiting labels
        too_many_pending = len(
            self.comparison_collector.unlabeled_comparisons) > 20
        if len(self.comparison_collector) < int(
                self.label_schedule.n_desired_labels
        ) and not too_many_pending and len(self.recent_segments) > 1:
            # Pick two distinct recent segments to compare
            i1, i2 = random.sample(range(len(self.recent_segments)), 2)
            self.comparison_collector.add_segment_pair(
                self.recent_segments[i1], self.recent_segments[i2])

        # Train our predictor every X steps
        if self._steps_since_last_training >= int(
                self._n_timesteps_per_predictor_training):
            self.train_predictor()
            self._steps_since_last_training = 0
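
All of these path_callback variants delegate segment extraction to sample_segment_from_path, which is not shown on this page. A minimal sketch of what such a helper might look like, assuming a path is a dict of parallel per-timestep lists keyed by "obs", "actions", and so on (the real helper's keys and edge-case handling may differ):

    import random

    def sample_segment_from_path(path, segment_length):
        # Hypothetical sketch: return a random contiguous window of
        # segment_length frames, or None if the path is too short.
        path_length = len(path["obs"])
        if path_length < segment_length:
            return None
        start = random.randint(0, path_length - segment_length)
        # Slice every per-timestep list so the segment stays internally aligned
        return {key: values[start:start + segment_length]
                for key, values in path.items()}
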
Example #2
    def path_callback(self, path):
        # Record video clips to elicit human feedback every X paths.
        if (self._num_paths_seen % self.paths_per_wait <=
                self.paths_per_selection) and (
                    self.clip_manager.total_number_of_clips <
                    self.label_schedule.n_desired_labels):
            if len(self.collected_paths) < self.paths_per_selection:
                self.collected_paths.append(path)
            elif len(self.collected_paths) == self.paths_per_selection:
                # Batch is full: let the selector pick which paths to clip
                selected_paths, selection_time = self.selector.select(
                    self.collected_paths)
                for selected_path in selected_paths:
                    segment = sample_segment_from_path(
                        selected_path, int(self.model._frames_per_segment))
                    if segment:
                        self.model.clip_manager.add(
                            segment, source="on-policy callback")

                self.model.clip_manager.sort_clips(
                    wait_until_database_fully_sorted=True)
                self.collected_paths = []
                print("clips sorted.")

        self._num_paths_seen += 1
        self.model.path_callback(path)
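
This variant routes the collected paths through a selector before clipping. The selector itself is not shown; a minimal random selector that satisfies the select(paths) -> (selected_paths, selection_time) contract used above might look like this (hypothetical sketch, not the original implementation):

    import random
    import time

    class RandomPathSelector:
        # Hypothetical stand-in for the selector used above: picks
        # n_select paths uniformly at random and reports elapsed time.
        def __init__(self, n_select=1):
            self.n_select = n_select

        def select(self, paths):
            start = time.time()
            selected = random.sample(paths, min(self.n_select, len(paths)))
            return selected, time.time() - start
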
Example #3
    def path_callback_explore(self, path, other_paths):
        path_length = len(path["obs"])
        self._steps_since_last_training += path_length / 2.0  # explore paths count at half weight

        self.agent_logger.log_episode(path)

        # We may be in a new part of the environment, so we take new segments to build comparisons from
        segment = sample_segment_from_path(path, int(self._frames_per_segment))
        if segment:
            self.recent_segments.append(segment)

            # Compare the new segment against a segment sampled from each other path
            for other_path in other_paths:
                other_segment = sample_segment_from_path(
                    other_path, int(self._frames_per_segment))
                if other_segment:
                    self.comparison_collector.add_segment_pair(segment, other_segment)
        # Train our predictor every X steps
        if self._steps_since_last_training >= int(self._n_timesteps_per_predictor_training):
            self.train_predictor()
            self._steps_since_last_training = 0
Example #4
    def path_callback(self, path):
        path_length = len(path["obs"])
        self._steps_since_last_training += path_length

        self.agent_logger.log_episode(path)

        # We may be in a new part of the environment, so we take new segments to build comparisons from
        segment = sample_segment_from_path(path, int(self._frames_per_segment))
        if segment:
            self.recent_segments.append(segment)

        # If we need more comparisons, then we build them from our recent segments.
        # (Two experiments were disabled here: comparing against a segment from a
        # fresh random rollout via basic_segments_from_rand_rollout_1, and picking
        # the candidate pair with the highest ensemble disagreement; see the
        # sketch after this example.)
        if self.recent_segments:
            self.comparison_collector.add_segment_pair(
                random.choice(self.recent_segments),
                random.choice(self.recent_segments))

        # Train our predictor every X steps
        if self._steps_since_last_training >= int(
                self._n_timesteps_per_predictor_training):
            self.train_predictor()
            self._steps_since_last_training = 0
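
The commented-out experiment in this example selected the candidate pair whose preference the reward-model ensemble disagreed on most. A cleaned-up sketch of that idea, assuming self.num_r reward heads and the predict_segment_individual_reward helper referenced in the disabled code:

    import numpy as np

    def pick_most_disputed_pair(self, n_cand_pairs=20):
        # Sketch of the disabled scheme: sample candidate index pairs, ask
        # each of the self.num_r reward heads which segment it prefers, and
        # return the pair with the highest vote spread.
        idx = np.random.randint(len(self.recent_segments), size=(n_cand_pairs, 2))
        best_pair, best_std = None, -1.0
        for i in range(n_cand_pairs):
            seg1 = self.recent_segments[idx[i, 0]]
            seg2 = self.recent_segments[idx[i, 1]]
            votes = np.array([
                float(self.predict_segment_individual_reward(seg1, j) >
                      self.predict_segment_individual_reward(seg2, j))
                for j in range(self.num_r)
            ])
            if votes.std() > best_std:
                best_pair, best_std = (seg1, seg2), votes.std()
        return best_pair
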
Example #5
    def path_callback(self, path):
        super().path_callback(path)
        self._episode_count += 1

        # We may be in a new part of the environment, so we take a clip to learn from if requested
        if self.clip_manager.total_number_of_clips < self.label_schedule.n_desired_labels:
            new_clip = sample_segment_from_path(path,
                                                int(self._frames_per_segment))
            if new_clip:
                self.clip_manager.add(new_clip, source="on-policy callback")

        # Train our model every X episodes
        if self._episode_count % self._episodes_per_training == 0:
            self.train(iterations=self._iterations_per_training,
                       report_frequency=25)

        # Save our model every X episodes
        if self._episode_count % self._episodes_per_checkpoint == 0:
            self.save_model_checkpoint()
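
Nearly every snippet on this page gates collection on label_schedule.n_desired_labels, which is not defined here. A hypothetical linear schedule that grows the label budget with elapsed timesteps (names and parameters are assumptions, not the original class):

    class LinearLabelSchedule:
        # Hypothetical sketch: the desired number of labels grows linearly
        # with elapsed timesteps toward a fixed final budget.
        def __init__(self, final_labels, total_timesteps):
            self.final_labels = final_labels
            self.total_timesteps = total_timesteps
            self.timesteps_elapsed = 0

        @property
        def n_desired_labels(self):
            frac = min(1.0, self.timesteps_elapsed / self.total_timesteps)
            return frac * self.final_labels
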
Example #6
    def path_callback(self, path):
        path_length = len(path["obs"])
        self._steps_since_last_training += path_length

        self.agent_logger.log_episode(path)

        # We may be in a new part of the environment, so we take new segments to build comparisons from
        segment = sample_segment_from_path(path, int(self._frames_per_segment))
        if segment:
            self.recent_segments.append(segment)

        # If we need more comparisons, then we build them from our recent segments
        if len(self.comparison_collector) < int(
                self.label_schedule.n_desired_labels):
            self.comparison_collector.add_segment_pair(
                random.choice(self.recent_segments),
                random.choice(self.recent_segments))

        # Train our predictor every X steps
        if self._steps_since_last_training >= int(
                self._n_timesteps_per_predictor_training):
            self.train_predictor()
            print("Finished training the predictor")
            self._steps_since_last_training = 0
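
The comparison-based variants all share the same collector interface: len() for the total number of comparisons, add_segment_pair for queueing, and unlabeled_comparisons for the backlog. A minimal in-memory sketch of that interface (the real collector likely persists comparisons and talks to a labeling UI):

    class ComparisonCollector:
        # Hypothetical in-memory sketch of the interface used above.
        def __init__(self):
            self._comparisons = []

        def __len__(self):
            return len(self._comparisons)

        @property
        def unlabeled_comparisons(self):
            return [c for c in self._comparisons if c["label"] is None]

        def add_segment_pair(self, left, right):
            # Queued without a label; a human labels it later
            self._comparisons.append({"left": left, "right": right, "label": None})

        def add_segment_pair_with_label(self, left, right, label):
            # Used when the caller is confident enough to auto-label
            # (label 0 taken to mean the left segment is preferred)
            self._comparisons.append({"left": left, "right": right, "label": label})
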
Example #7
    def path_callback(self, path):
        path_length = len(path["obs"])
        self._steps_since_last_training += path_length / 2.0

        self.agent_logger.log_episode(path)

        # We may be in a new part of the environment, so we take new segments to build comparisons from
        segment = sample_segment_from_path(path, int(self._frames_per_segment))
        if segment:
            self.recent_segments.append(segment)

        # Auto-label a pair only when the best sampled score clears the upper
        # threshold and the worst falls below the lower one
        score_threshold_up = 0.8
        score_threshold_low = 0.2
        # If we need more comparisons, then we build them from our recent segments
        if len(self.comparison_collector) < int(
                self.label_schedule.n_desired_labels) and len(self.recent_segments) > 10:
            # Score a sample of recent segments with the quality predictor
            num_sampled_segments = 20
            sampled_segments = []
            sampled_segments_score = np.zeros(num_sampled_segments)
            for i in range(num_sampled_segments):
                sampled_segment = random.choice(self.recent_segments)
                sampled_segments.append(sampled_segment)
                # Second entry of the prediction: the segment's "good" score
                sampled_segments_score[i] = self.predict_segment_quality(sampled_segment)[1]

            good_segment = sampled_segments[np.argmax(sampled_segments_score)]
            bad_segment = sampled_segments[np.argmin(sampled_segments_score)]

            # (A rejection-sampling variant that drew segments until their
            # predicted quality crossed 0.5 was disabled here.)
            if (np.max(sampled_segments_score) > score_threshold_up
                    and np.min(sampled_segments_score) < score_threshold_low):
                # Extremes are confidently separated: auto-label the pair
                # (label 0 taken to mean the first, good segment is preferred)
                self.comparison_collector.add_segment_pair_with_label(
                    good_segment, bad_segment, 0)
            else:
                # Not confident enough; queue the pair for a human label
                self.comparison_collector.add_segment_pair(
                    good_segment, bad_segment)

        # (The ensemble-disagreement pair selection sketched after Example #4
        # was also disabled here.)
        # Train our predictor every X steps
        if self._steps_since_last_training >= int(self._n_timesteps_per_predictor_training):
            self.train_predictor()
            self._steps_since_last_training = 0
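
The quality score used above comes from predict_segment_quality, which is not shown. Indexing its result with [1] suggests it returns per-class scores from a binary bad/good classifier; a hypothetical sketch of that contract, reusing the obs_act_combine helper referenced in the disabled code (quality_model is an assumed attribute):

    def predict_segment_quality(self, segment):
        # Hypothetical sketch: return [p_bad, p_good] for a segment, e.g.
        # the output of a binary quality classifier over obs/action pairs.
        obs_act = self.obs_act_combine(segment["obs"], segment["actions"])
        p_good = float(self.quality_model.predict(obs_act))  # assumed model attribute
        return [1.0 - p_good, p_good]
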
Example #8
    def path_callback(self, path):
        path_length = len(path["obs"])
        self._steps_since_last_training += path_length

        self.agent_logger.log_episode(path)

        # We may be in a new part of the environment, so we take new segments to build comparisons from
        segment = sample_segment_from_path(path, int(self._frames_per_segment))
        if segment:
            self.recent_segments.append(segment)

        # If we need more comparisons, then we build them from our recent segments
        if self.use_bnn and self._elapsed_predictor_training_iters < self.random_sample_break:
            self.rew_bnn.save_params()
            best_kl = float("-inf")
            best_a = 0
            best_b = 0
            pair_indices = np.random.randint(low=0,
                                             high=len(self.recent_segments),
                                             size=(self.info_gain_samples, 2))
            for a, b in pair_indices:
                if a == b:
                    continue
                with self.graph.as_default():
                    seg1_obs = self.recent_segments[a]["obs"]
                    seg2_obs = self.recent_segments[b]["obs"]
                    seg1_acts = self.recent_segments[a]["actions"]
                    seg2_acts = self.recent_segments[b]["actions"]
                    # Shared feeds for this candidate pair
                    base_feed = {
                        self.segment_obs_placeholder: [seg1_obs],
                        self.segment_act_placeholder: [seg1_acts],
                        self.segment_alt_obs_placeholder: [seg2_obs],
                        self.segment_alt_act_placeholder: [seg2_acts],
                        K.learning_phase(): False,
                    }
                    # KL of the planning distribution under each hypothetical label
                    kl1 = self.sess.run(self.planning_kl,
                                        feed_dict={**base_feed,
                                                   self.plan_labels: [0]})
                    kl2 = self.sess.run(self.planning_kl,
                                        feed_dict={**base_feed,
                                                   self.plan_labels: [1]})
                    # Current preference probabilities for this pair
                    prob = self.sess.run(self.softmax_rew, feed_dict=base_feed)
                    p1, p2 = prob[0]
                    # Expected information gain: weight each label's KL by the
                    # probability the model currently assigns to that label
                    kl_val = p1 * kl1 + p2 * kl2
                    if kl_val > best_kl:
                        best_kl = kl_val
                        best_a, best_b = a, b
            segments = [
                self.recent_segments[best_a], self.recent_segments[best_b]
            ]
        else:
            segments = random.sample(
                self.recent_segments,
                2) if len(self.recent_segments) >= 2 else None

        # Only queue the pair if we actually got one and still need labels
        if segments and len(self.comparison_collector) < int(
                self.label_schedule.n_desired_labels):
            self.comparison_collector.add_segment_pair(segments[0],
                                                       segments[1])

        # Train our predictor every X steps
        if self._steps_since_last_training >= int(
                self._n_timesteps_per_predictor_training):
            self.train_predictor()
            self._steps_since_last_training = 0
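
Each candidate pair above costs three separate session runs. If the placeholders accept a leading batch dimension (the list-wrapped feeds suggest they do), the whole candidate set could be scored in three runs total; a hypothetical batched sketch, assuming planning_kl and softmax_rew are computed per batch element:

    pairs = [(a, b) for a, b in pair_indices if a != b]
    feed = {
        self.segment_obs_placeholder: [self.recent_segments[a]["obs"] for a, _ in pairs],
        self.segment_act_placeholder: [self.recent_segments[a]["actions"] for a, _ in pairs],
        self.segment_alt_obs_placeholder: [self.recent_segments[b]["obs"] for _, b in pairs],
        self.segment_alt_act_placeholder: [self.recent_segments[b]["actions"] for _, b in pairs],
        K.learning_phase(): False,
    }
    kl1 = self.sess.run(self.planning_kl,
                        feed_dict={**feed, self.plan_labels: [0] * len(pairs)})
    kl2 = self.sess.run(self.planning_kl,
                        feed_dict={**feed, self.plan_labels: [1] * len(pairs)})
    probs = self.sess.run(self.softmax_rew, feed_dict=feed)  # assumed shape (n_pairs, 2)
    gains = probs[:, 0] * kl1 + probs[:, 1] * kl2
    best_a, best_b = pairs[int(np.argmax(gains))]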