def path_callback(self, path):
    path_length = len(path["obs"])
    self._steps_since_last_training += path_length
    self.agent_logger.log_episode(path)

    # We may be in a new part of the environment, so we take new segments to build comparisons from
    segment = sample_segment_from_path(path, int(self._frames_per_segment))
    if segment:
        self.recent_segments.append(segment)

    # If we need more comparisons, then we build them from our recent segments,
    # but stop once too many comparisons are already waiting for labels
    too_many_labels = len(self.comparison_collector.unlabeled_comparisons) > 20
    if (len(self.comparison_collector) < int(self.label_schedule.n_desired_labels)
            and not too_many_labels and len(self.recent_segments) > 1):
        i1 = random.randint(0, len(self.recent_segments) - 1)
        i2 = random.randint(0, len(self.recent_segments) - 1)
        if i1 != i2:
            self.comparison_collector.add_segment_pair(
                self.recent_segments[i1], self.recent_segments[i2])

    # Train our predictor every X steps
    if self._steps_since_last_training >= int(self._n_timesteps_per_predictor_training):
        self.train_predictor()
        self._steps_since_last_training = 0
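# A minimal sketch of what `sample_segment_from_path` presumably does, inferred
# from how it is called above (the real helper lives elsewhere in the repo and
# may slice a different set of keys or handle clips differently). It returns
# None when the path is shorter than the requested segment, which is why every
# caller guards with `if segment:`.
import numpy as np

def sample_segment_from_path_sketch(path, segment_length):
    path_length = len(path["obs"])
    if path_length < segment_length:
        return None
    start = np.random.randint(0, path_length - segment_length + 1)
    # Slice every per-timestep field so obs/actions/rewards stay aligned
    return {key: np.asarray(values[start:start + segment_length])
            for key, values in path.items()}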
def path_callback(self, path):
    # Collect clips to elicit human feedback on a fixed cadence of paths.
    if (self._num_paths_seen % self.paths_per_wait <= self.paths_per_selection) and (
            self.clip_manager.total_number_of_clips < self.label_schedule.n_desired_labels):
        if len(self.collected_paths) < self.paths_per_selection:
            self.collected_paths.append(path)
        elif len(self.collected_paths) == self.paths_per_selection:
            selected_paths, selection_time = self.selector.select(self.collected_paths)
            for selected_path in selected_paths:
                segment = sample_segment_from_path(
                    selected_path, int(self.model._frames_per_segment))
                if segment:
                    self.model.clip_manager.add(segment, source="on-policy callback")
            self.model.clip_manager.sort_clips(wait_until_database_fully_sorted=True)
            self.collected_paths = []
            print("clips sorted.")
    self._num_paths_seen += 1
    self.model.path_callback(path)
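# A toy, self-contained illustration (all numbers made up) of the collection
# window implied by `_num_paths_seen % paths_per_wait <= paths_per_selection`:
# a burst of paths is collected at the start of every window of
# `paths_per_wait` paths, and the remainder of the window is skipped.
paths_per_wait, paths_per_selection = 10, 3
collecting = [n for n in range(20) if n % paths_per_wait <= paths_per_selection]
assert collecting == [0, 1, 2, 3, 10, 11, 12, 13]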
def path_callback_explore(self, path, other_paths):
    path_length = len(path["obs"])
    self._steps_since_last_training += path_length / 2.0
    self.agent_logger.log_episode(path)

    # We may be in a new part of the environment, so we take new segments to build comparisons from
    segment = sample_segment_from_path(path, int(self._frames_per_segment))
    if segment:
        self.recent_segments.append(segment)

    # Pair the on-policy segment against a segment from each exploratory rollout
    for other_path in other_paths:
        other_segment = sample_segment_from_path(other_path, int(self._frames_per_segment))
        if other_segment and segment:
            self.comparison_collector.add_segment_pair(segment, other_segment)

    # Train our predictor every X steps
    if self._steps_since_last_training >= int(self._n_timesteps_per_predictor_training):
        self.train_predictor()
        self._steps_since_last_training = 0
def path_callback(self, path):
    path_length = len(path["obs"])
    self._steps_since_last_training += path_length
    self.agent_logger.log_episode(path)

    # We may be in a new part of the environment, so we take new segments to build comparisons from
    segment = sample_segment_from_path(path, int(self._frames_per_segment))
    if segment:
        self.recent_segments.append(segment)

    # Disabled experiment: score candidate pairs by disagreement across the
    # reward ensemble and query the pair with the highest std. The random
    # rollout below was only consumed by this experiment, so it is commented
    # out with it.
    # if len(self.comparison_collector) < int(self.label_schedule.n_desired_labels):
    #     n_cand_pairs = 20
    #     cand_pairs_idx = np.random.randint(len(self.recent_segments), size=(n_cand_pairs, 2))
    #     segment_pairs = []
    #     random_segment = basic_segments_from_rand_rollout_1(
    #         self.env_id, make_with_torque_removed,
    #         n_desired_segments=1, clip_length_in_seconds=CLIP_LENGTH)[0]
    #     for i in range(n_cand_pairs):
    #         segment_pair = {}
    #         segment_pair['segment1'] = self.recent_segments[cand_pairs_idx[i, 0]]
    #         segment_pair['segment2'] = self.recent_segments[cand_pairs_idx[i, 1]]
    #         # segment_pair['segment2'] = random_segment
    #         which_seg = np.zeros(self.num_r)
    #         for j in range(self.num_r):
    #             which_seg[j] = (self.predict_segment_individual_reward(segment_pair['segment1'], j)
    #                             > self.predict_segment_individual_reward(segment_pair['segment2'], j))
    #         segment_pair['std'] = np.std(which_seg)
    #         segment_pairs.append(segment_pair)
    #     max_std_idx = np.argmax(np.array([pair['std'] for pair in segment_pairs]))
    #     chosen_pair = segment_pairs[max_std_idx]
    #     self.comparison_collector.add_segment_pair(
    #         chosen_pair['segment1'], chosen_pair['segment2'])

    # Build comparisons from random recent segments. Note that the label-budget
    # guard is commented out, so a pair is added on every episode.
    # if len(self.comparison_collector) < int(self.label_schedule.n_desired_labels):
    self.comparison_collector.add_segment_pair(
        random.choice(self.recent_segments),
        random.choice(self.recent_segments))

    # Train our predictor every X steps
    if self._steps_since_last_training >= int(self._n_timesteps_per_predictor_training):
        self.train_predictor()
        self._steps_since_last_training = 0
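# A standalone sketch of the disabled disagreement-query experiment above,
# under the assumption that `reward_fns` is a list of per-ensemble-member
# functions mapping a segment to a scalar return (the real code calls
# `predict_segment_individual_reward(segment, j)` instead). Pairs the ensemble
# disagrees about most are the most informative to ask a human about.
import random
import numpy as np

def pick_max_disagreement_pair(segments, reward_fns, n_cand_pairs=20):
    best_pair, best_std = None, -1.0
    for _ in range(n_cand_pairs):
        s1, s2 = random.choice(segments), random.choice(segments)
        # 1.0 if member j prefers s1, else 0.0; the std across members
        # measures how much the ensemble disagrees about this pair
        votes = np.array([float(r(s1) > r(s2)) for r in reward_fns])
        if votes.std() > best_std:
            best_pair, best_std = (s1, s2), votes.std()
    return best_pair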
def path_callback(self, path):
    super().path_callback(path)
    self._episode_count += 1

    # We may be in a new part of the environment, so we take a clip to learn from if requested
    if self.clip_manager.total_number_of_clips < self.label_schedule.n_desired_labels:
        new_clip = sample_segment_from_path(path, int(self._frames_per_segment))
        if new_clip:
            self.clip_manager.add(new_clip, source="on-policy callback")

    # Train our model every X episodes
    if self._episode_count % self._episodes_per_training == 0:
        self.train(iterations=self._iterations_per_training, report_frequency=25)

    # Save our model every X episodes
    if self._episode_count % self._episodes_per_checkpoint == 0:
        self.save_model_checkpoint()
def path_callback(self, path):
    path_length = len(path["obs"])
    self._steps_since_last_training += path_length
    self.agent_logger.log_episode(path)

    # We may be in a new part of the environment, so we take new segments to build comparisons from
    segment = sample_segment_from_path(path, int(self._frames_per_segment))
    if segment:
        self.recent_segments.append(segment)

    # If we need more comparisons, then we build them from our recent segments
    if len(self.comparison_collector) < int(self.label_schedule.n_desired_labels):
        self.comparison_collector.add_segment_pair(
            random.choice(self.recent_segments),
            random.choice(self.recent_segments))

    # Train our predictor every X steps
    if self._steps_since_last_training >= int(self._n_timesteps_per_predictor_training):
        self.train_predictor()
        print("Finished training the predictor.")
        self._steps_since_last_training = 0
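# An assumed, minimal sketch of the collector interface these callbacks rely
# on; the repo's actual ComparisonCollector will differ (e.g. it likely stores
# rendered media for the labeling UI), but this captures the calls used here:
# __len__, unlabeled_comparisons, add_segment_pair, add_segment_pair_with_label.
class ComparisonCollectorSketch:
    def __init__(self):
        self._comparisons = []

    def __len__(self):
        return len(self._comparisons)

    @property
    def unlabeled_comparisons(self):
        return [c for c in self._comparisons if c["label"] is None]

    def add_segment_pair(self, left, right):
        # Queued without a label; a human comparison fills it in later
        self._comparisons.append({"left": left, "right": right, "label": None})

    def add_segment_pair_with_label(self, left, right, label):
        # Pre-labeled pair that skips the human query entirely
        self._comparisons.append({"left": left, "right": right, "label": label})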
def path_callback(self, path):
    path_length = len(path["obs"])
    self._steps_since_last_training += path_length / 2.0
    self.agent_logger.log_episode(path)

    # We may be in a new part of the environment, so we take new segments to build comparisons from
    segment = sample_segment_from_path(path, int(self._frames_per_segment))
    if segment:
        self.recent_segments.append(segment)

    score_threshold_up = 0.8
    score_threshold_low = 0.2

    # If we need more comparisons, then we build them from our recent segments
    if (len(self.comparison_collector) < int(self.label_schedule.n_desired_labels)
            and len(self.recent_segments) > 10):
        # Score a batch of sampled segments with the quality classifier
        num_sampled_segments = 20
        sampled_segments = []
        sampled_segment_scores = np.zeros(num_sampled_segments)
        for i in range(num_sampled_segments):
            sampled_segment = random.choice(self.recent_segments)
            sampled_segments.append(sampled_segment)
            sampled_segment_scores[i] = self.predict_segment_quality(sampled_segment)[1]
        good_segment = sampled_segments[np.argmax(sampled_segment_scores)]
        bad_segment = sampled_segments[np.argmin(sampled_segment_scores)]

        # If the classifier is confident about both extremes, auto-label the
        # pair; otherwise queue it for a human label
        if (np.max(sampled_segment_scores) > score_threshold_up
                and np.min(sampled_segment_scores) < score_threshold_low):
            self.comparison_collector.add_segment_pair_with_label(good_segment, bad_segment, 0)
        else:
            self.comparison_collector.add_segment_pair(good_segment, bad_segment)

    # Train our predictor every X steps
    if self._steps_since_last_training >= int(self._n_timesteps_per_predictor_training):
        self.train_predictor()
        self._steps_since_last_training = 0
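# A toy run of the confidence gate above (scores are made up). The label 0
# presumably marks the first, higher-scoring segment as preferred; when either
# extreme falls inside the ambiguous band, the pair goes to a human instead.
import numpy as np

scores = np.array([0.91, 0.45, 0.12, 0.67])  # classifier P(good) per segment
good, bad = int(np.argmax(scores)), int(np.argmin(scores))
if scores[good] > 0.8 and scores[bad] < 0.2:
    label = 0      # confident: auto-label without a human query
else:
    label = None   # ambiguous: queue the pair for human comparison
assert label == 0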
def path_callback(self, path):
    path_length = len(path["obs"])
    self._steps_since_last_training += path_length
    self.agent_logger.log_episode(path)

    # We may be in a new part of the environment, so we take new segments to build comparisons from
    segment = sample_segment_from_path(path, int(self._frames_per_segment))
    if segment:
        self.recent_segments.append(segment)

    # If we need more comparisons, then we build them from our recent segments.
    # While below the random_sample_break threshold, pick the candidate pair
    # with the highest expected information gain under the BNN; otherwise fall
    # back to a random pair.
    if self.use_bnn and self._elapsed_predictor_training_iters < self.random_sample_break:
        self.rew_bnn.save_params()
        best_kl = float("-inf")
        best_a = 0
        best_b = 0
        pair_indices = np.random.randint(
            low=0, high=len(self.recent_segments), size=(self.info_gain_samples, 2))
        for a, b in pair_indices:
            if a == b:
                continue
            with self.graph.as_default():
                feed_dict = {
                    self.segment_obs_placeholder: [self.recent_segments[a]["obs"]],
                    self.segment_act_placeholder: [self.recent_segments[a]["actions"]],
                    self.segment_alt_obs_placeholder: [self.recent_segments[b]["obs"]],
                    self.segment_alt_act_placeholder: [self.recent_segments[b]["actions"]],
                    K.learning_phase(): False,
                }
                # KL divergence of the hypothetical posterior update under each label
                kl1 = self.sess.run([self.planning_kl],
                                    feed_dict={**feed_dict, self.plan_labels: [0]})[0]
                kl2 = self.sess.run([self.planning_kl],
                                    feed_dict={**feed_dict, self.plan_labels: [1]})[0]
                # Predicted probability of each label
                prob = self.sess.run([self.softmax_rew], feed_dict=feed_dict)
                p1 = prob[0][0][0]
                p2 = prob[0][0][1]
                # Expected information gain of querying this pair
                kl_val = p1 * kl1 + p2 * kl2
                if kl_val > best_kl:
                    best_kl = kl_val
                    best_a = a
                    best_b = b
        segments = [self.recent_segments[best_a], self.recent_segments[best_b]]
    else:
        segments = random.sample(self.recent_segments, 2) if len(self.recent_segments) > 2 else None

    # Guard against the case where no pair was available yet
    if segments and len(self.comparison_collector) < int(self.label_schedule.n_desired_labels):
        self.comparison_collector.add_segment_pair(segments[0], segments[1])

    # Train our predictor every X steps
    if self._steps_since_last_training >= int(self._n_timesteps_per_predictor_training):
        self.train_predictor()
        self._steps_since_last_training = 0
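# A toy numpy illustration of the expected-information-gain score computed in
# the BNN branch above (all numbers made up). For each candidate pair, the
# score sums P(label) * KL(posterior-after-label || prior) over both labels;
# the pair with the highest expected gain is the one worth querying.
import numpy as np

kls = np.array([[0.8, 0.1], [0.3, 0.4], [2.0, 0.01]])     # KL under label 0 / label 1
probs = np.array([[0.5, 0.5], [0.9, 0.1], [0.05, 0.95]])  # P(label 0), P(label 1)
expected_gain = (probs * kls).sum(axis=1)                  # p1*kl1 + p2*kl2 per pair
best_pair = int(np.argmax(expected_gain))                  # -> 0 for these numbers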