def draw_state(self, return_list=False):
    if not constants.DRAWING:
        return
    from utils import drawing
    curr_image = self.game_state.detection_image.copy()
    curr_depth = self.game_state.s_t_depth
    if curr_depth is not None:
        curr_depth = self.game_state.s_t_depth.copy()
        # Pin two pixels to the extremes so the depth colormap range is stable.
        curr_depth[0, 0] = 0
        curr_depth[0, 1] = constants.MAX_DEPTH

    label = np.flipud(self.get_label())
    patch = np.flipud(self.game_state.graph.get_graph_patch(self.pose)[0])
    state_image = self.game_state.draw_state().copy()
    memory_map = np.flipud(self.game_state.graph.memory.copy())
    memory_map = np.concatenate(
        (memory_map[:, :, [0]], np.zeros(memory_map[:, :, [0]].shape), memory_map[:, :, 1:]),
        axis=2)

    images = [
        curr_image,
        state_image,
        np.minimum(memory_map[:, :, 0], 200),
        np.argmax(memory_map[:, :, 1:], axis=2),
        label[:, :],
        np.minimum(patch[:, :, 0], 10),
    ]
    if return_list:
        return images
    action_str = 'action: %s possible %.3f' % (
        self.action_util.actions[np.where(self.action == 1)[0].squeeze()]['action'],
        self.is_possible)
    titles = [
        '%07d' % self.num_steps,
        action_str,
        'Occupancy Map',
        'Objects Map',
        'Label Patch',
        'Learned Patch',
    ]
    image = drawing.subplot(images, 4, 3, curr_image.shape[1], curr_image.shape[0],
                            titles=titles, border=3)
    return image
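# Note on the helper used throughout these snippets: drawing.subplot appears to
# take (images, rows, cols, width, height, border=..., titles=...) and tile
# mixed inputs (RGB frames, 2-D float maps) into one grid image. The sketch
# below is a hypothetical stand-in for that assumed contract, NOT the repo's
# utils.drawing; it uses naive min-max grayscale in place of real colormapping
# and ignores titles and border text.
import numpy as np

def subplot_sketch(images, rows, cols, width, height, border=0, titles=None):
    canvas = np.zeros((rows * (height + border), cols * (width + border), 3), dtype=np.uint8)
    for idx, img in enumerate(images[:rows * cols]):
        img = np.asarray(img, dtype=np.float32)
        if img.ndim == 2:
            # Scalar map: min-max normalize and replicate to 3 channels.
            rng = img.max() - img.min()
            img = (img - img.min()) / (rng if rng > 0 else 1.0) * 255.0
            img = np.repeat(img[:, :, np.newaxis], 3, axis=2)
        # Nearest-neighbor resize to the cell size.
        ys = np.linspace(0, img.shape[0] - 1, height).astype(int)
        xs = np.linspace(0, img.shape[1] - 1, width).astype(int)
        cell = img[ys][:, xs].astype(np.uint8)
        rr, cc = divmod(idx, cols)
        canvas[rr * (height + border):rr * (height + border) + height,
               cc * (width + border):cc * (width + border) + width] = cell
    return canvas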
def run():
    try:
        with tf.variable_scope('nav_global_network'):
            network = FreeSpaceNetwork(constants.GRU_SIZE, constants.BATCH_SIZE, constants.NUM_UNROLLS)
            network.create_net()
            training_step = network.training_op

        with tf.variable_scope('loss'):
            loss_summary_op = tf.summary.merge([
                tf.summary.scalar('loss', network.loss),
            ])

        summary_full = tf.summary.merge_all()
        conv_var_list = [v for v in tf.trainable_variables()
                         if 'conv' in v.name and 'weight' in v.name and
                         (v.get_shape().as_list()[0] != 1 or v.get_shape().as_list()[1] != 1)]
        for var in conv_var_list:
            tf_util.conv_variable_summaries(var, scope=var.name.replace('/', '_')[:-2])
        summary_with_images = tf.summary.merge_all()

        # Prepare session.
        sess = tf_util.Session()
        seq_inds = np.zeros((constants.BATCH_SIZE, 2), dtype=np.int32)

        sequence_generators = []
        for thread_index in range(constants.PARALLEL_SIZE):
            gpus = str(constants.GPU_ID).split(',')
            sequence_generator = SequenceGenerator(sess)
            sequence_generators.append(sequence_generator)

        sess.run(tf.global_variables_initializer())

        if not (constants.DEBUG or constants.DRAWING):
            from utils import py_util
            time_str = py_util.get_time_str()
            summary_writer = tf.summary.FileWriter(
                os.path.join(constants.LOG_FILE, time_str), sess.graph)
        else:
            summary_writer = None

        saver = tf.train.Saver(max_to_keep=3)

        # Initialize from scratch or load the latest checkpoint.
        start_it = tf_util.restore_from_dir(sess, constants.CHECKPOINT_DIR)

        sess.graph.finalize()

        data_lock = threading.Lock()

        def load_new_data(thread_index):
            global data_buffer
            global data_counts

            sequence_generator = sequence_generators[thread_index]
            counter = 0
            while True:
                # Wait until there is room in the replay buffer or stale data to evict.
                while not (len(data_buffer) < constants.REPLAY_BUFFER_SIZE or np.max(data_counts) > 0):
                    time.sleep(1)
                counter += 1
                if constants.DEBUG:
                    print('\nThread %d' % thread_index)
                new_data, bounds, goal_pose = sequence_generator.generate_episode()
                new_data = {key: [new_data[ii][key] for ii in range(len(new_data))] for key in new_data[0]}
                new_data['goal_pose'] = goal_pose
                new_data['memory'] = np.zeros(
                    (constants.SPATIAL_MAP_HEIGHT, constants.SPATIAL_MAP_WIDTH, constants.MEMORY_SIZE))
                new_data['gru_state'] = np.zeros(constants.GRU_SIZE)
                if constants.DRAWING:
                    new_data['debug_images'] = sequence_generator.debug_images
                data_lock.acquire()
                if len(data_buffer) < constants.REPLAY_BUFFER_SIZE:
                    data_counts[len(data_buffer)] = 0
                    data_buffer.append(new_data)
                    counts = data_counts[:len(data_buffer)]
                    if counter % 10 == 0:
                        print('Buffer size %d Num used %d Max used amount %d' % (
                            len(data_buffer), len(counts[counts > 0]), np.max(counts)))
                else:
                    # Replace the most-used episode with the fresh one.
                    max_count_ind = np.argmax(data_counts)
                    data_buffer[max_count_ind] = new_data
                    data_counts[max_count_ind] = 0
                    if counter % 10 == 0:
                        print('Num used %d Max used amount %d' % (
                            len(data_counts[data_counts > 0]), np.max(data_counts)))
                data_lock.release()

        threads = []
        for i in range(constants.PARALLEL_SIZE):
            load_data_thread = threading.Thread(target=load_new_data, args=(i,))
            load_data_thread.daemon = True
            load_data_thread.start()
            threads.append(load_data_thread)
            time.sleep(1)

        sequences = [None] * constants.BATCH_SIZE
        curr_it = 0
        dataTimeTotal = 0.00001
        solverTimeTotal = 0.00001
        summaryTimeTotal = 0.00001
        totalTimeTotal = 0.00001
        chosen_inds = set()
        loc_to_chosen_ind = {}
        for iteration in range(start_it, constants.MAX_TIME_STEP):
            if iteration == start_it or iteration % 10 == 1:
                currentTimeStart = time.time()
            tStart = time.time()

            batch_data = []
            batch_action = []
            batch_memory = []
            batch_gru_state = []
            batch_labels = []
            batch_pose = []
            batch_mask = []
            batch_goal_pose = []
            batch_pose_indicator = []
            batch_possible_label = []
            batch_debug_images = []
            for bb in range(constants.BATCH_SIZE):
                if seq_inds[bb, 0] == seq_inds[bb, 1]:
                    # Pick a new random sequence.
                    pickable_inds = set(np.where(data_counts < 100)[0]) - chosen_inds
                    count_size = len(pickable_inds)
                    while count_size == 0:
                        pickable_inds = set(np.where(data_counts < 100)[0]) - chosen_inds
                        count_size = len(pickable_inds)
                        time.sleep(1)
                    random_ind = random.sample(pickable_inds, 1)[0]
                    data_lock.acquire()
                    sequences[bb] = data_buffer[random_ind]
                    goal_pose = sequences[bb]['goal_pose']
                    sequences[bb]['memory'] = np.zeros(
                        (constants.SPATIAL_MAP_HEIGHT, constants.SPATIAL_MAP_WIDTH, constants.MEMORY_SIZE))
                    sequences[bb]['gru_state'] = np.zeros(constants.GRU_SIZE)
                    data_counts[random_ind] += 1
                    if bb in loc_to_chosen_ind:
                        chosen_inds.remove(loc_to_chosen_ind[bb])
                    loc_to_chosen_ind[bb] = random_ind
                    chosen_inds.add(random_ind)
                    data_lock.release()
                    seq_inds[bb, 0] = 0
                    seq_inds[bb, 1] = len(sequences[bb]['color'])
                data_len = min(constants.NUM_UNROLLS, seq_inds[bb, 1] - seq_inds[bb, 0])
                ind0 = seq_inds[bb, 0]
                ind1 = seq_inds[bb, 0] + data_len
                data = sequences[bb]['color'][ind0:ind1]
                action = sequences[bb]['action'][ind0:ind1]
                labels = sequences[bb]['label'][ind0:ind1]
                memory = sequences[bb]['memory'].copy()
                gru_state = sequences[bb]['gru_state'].copy()
                pose = sequences[bb]['pose'][ind0:ind1]
                goal_pose = sequences[bb]['goal_pose']
                mask = sequences[bb]['weight'][ind0:ind1]
                pose_indicator = sequences[bb]['pose_indicator'][ind0:ind1]
                possible_label = sequences[bb]['possible_label'][ind0:ind1]
                if constants.DRAWING:
                    batch_debug_images.append(sequences[bb]['debug_images'][ind0:ind1])
                if data_len < constants.NUM_UNROLLS:
                    # Sequence exhausted: zero-pad to NUM_UNROLLS and mark it finished.
                    seq_inds[bb, :] = 0
                    data.extend([np.zeros_like(data[0])
                                 for _ in range(constants.NUM_UNROLLS - data_len)])
                    action.extend([np.zeros_like(action[0])
                                   for _ in range(constants.NUM_UNROLLS - data_len)])
                    labels.extend([np.zeros_like(labels[0])
                                   for _ in range(constants.NUM_UNROLLS - data_len)])
                    pose.extend([pose[-1]
                                 for _ in range(constants.NUM_UNROLLS - data_len)])
                    mask.extend([np.zeros_like(mask[0])
                                 for _ in range(constants.NUM_UNROLLS - data_len)])
                    pose_indicator.extend([np.zeros_like(pose_indicator[0])
                                           for _ in range(constants.NUM_UNROLLS - data_len)])
                    possible_label.extend([np.zeros_like(possible_label[0])
                                           for _ in range(constants.NUM_UNROLLS - data_len)])
                else:
                    seq_inds[bb, 0] += constants.NUM_UNROLLS
                batch_data.append(data)
                batch_action.append(action)
                batch_memory.append(memory)
                batch_gru_state.append(gru_state)
                batch_pose.append(pose)
                batch_goal_pose.append(goal_pose)
                batch_labels.append(labels)
                batch_mask.append(mask)
                batch_pose_indicator.append(pose_indicator)
                batch_possible_label.append(possible_label)

            feed_dict = {
                network.image_placeholder: np.ascontiguousarray(batch_data),
                network.action_placeholder: np.ascontiguousarray(batch_action),
                network.gru_placeholder: np.ascontiguousarray(batch_gru_state),
                network.pose_placeholder: np.ascontiguousarray(batch_pose),
                network.goal_pose_placeholder: np.ascontiguousarray(batch_goal_pose),
                network.labels_placeholder: np.ascontiguousarray(batch_labels)[..., np.newaxis],
                network.mask_placeholder: np.ascontiguousarray(batch_mask),
                network.pose_indicator_placeholder: np.ascontiguousarray(batch_pose_indicator),
                network.possible_label_placeholder: np.ascontiguousarray(batch_possible_label),
                network.memory_placeholders: np.ascontiguousarray(batch_memory),
            }
            dataTEnd = time.time()

            summaryTime = 0
            if constants.DEBUG or constants.DRAWING:
                outputs = sess.run(
                    [training_step, network.loss, network.gru_state,
                     network.patch_weights_sigm, network.gru_outputs_full,
                     network.is_possible_sigm, network.pose_indicator_placeholder,
                     network.terminal_patches, network.gru_outputs],
                    feed_dict=feed_dict)
            else:
                if iteration == start_it + 10:
                    # Trace one step for profiling in TensorBoard.
                    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    outputs = sess.run(
                        [training_step, network.loss, network.gru_state,
                         summary_with_images, network.gru_outputs],
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    loss_summary = outputs[3]
                    summary_writer.add_run_metadata(run_metadata, 'step_%07d' % iteration)
                    summary_writer.add_summary(loss_summary, iteration)
                    summary_writer.flush()
                elif iteration % 10 == 0:
                    if iteration % 100 == 0:
                        outputs = sess.run(
                            [training_step, network.loss, network.gru_state,
                             summary_with_images, network.gru_outputs],
                            feed_dict=feed_dict)
                    else:
                        outputs = sess.run(
                            [training_step, network.loss, network.gru_state,
                             loss_summary_op, network.gru_outputs],
                            feed_dict=feed_dict)
                    loss_summary = outputs[3]
                    summaryTStart = time.time()
                    summary_writer.add_summary(loss_summary, iteration)
                    summary_writer.flush()
                    summaryTime = time.time() - summaryTStart
                else:
                    outputs = sess.run(
                        [training_step, network.loss, network.gru_state,
                         network.gru_outputs],
                        feed_dict=feed_dict)

            # Carry the recurrent state and spatial memory across unrolls.
            gru_state_out = outputs[2]
            memory_out = outputs[-1]
            for mm in range(constants.BATCH_SIZE):
                sequences[mm]['memory'] = memory_out[mm, ...]
                sequences[mm]['gru_state'] = gru_state_out[mm, ...]
            loss = outputs[1]
            solverTEnd = time.time()

            if constants.DEBUG or constants.DRAWING:
                # Look at outputs.
                patch_weights = outputs[3]
                is_possible = outputs[5]
                pose_indicator = outputs[6]
                terminal_patches = outputs[7]
                data_lock.acquire()
                for bb in range(constants.BATCH_SIZE):
                    for tt in range(constants.NUM_UNROLLS):
                        if batch_mask[bb][tt] == 0:
                            break
                        if constants.DRAWING:
                            import cv2
                            import scipy.misc
                            from utils import drawing
                            curr_image = batch_data[bb][tt]
                            label = np.flipud(batch_labels[bb][tt])
                            debug_images = batch_debug_images[bb][tt]
                            color_image = debug_images['color']
                            state_image = debug_images['state_image']
                            label_memory_image = debug_images['label_memory'][:, :, 0]
                            label_memory_image_class = np.argmax(
                                debug_images['label_memory'][:, :, 1:], axis=2)
                            label_memory_image_class[0, 0] = constants.NUM_CLASSES
                            label_patch = debug_images['label']
                            print('Possible pred %.3f' % is_possible[bb, tt])
                            print('Possible label %.3f' % batch_possible_label[bb][tt])
                            patch = np.flipud(patch_weights[bb, tt, ...])
                            patch_occupancy = patch[:, :, 0]
                            print('occ', patch_occupancy)
                            print('label', label)
                            terminal_patch = np.flipud(np.sum(terminal_patches[bb, tt, ...], axis=2))
                            image_list = [
                                debug_images['color'],
                                state_image,
                                debug_images['label_memory'][:, :, 0],
                                debug_images['memory_map'][:, :, 0],
                                label[:, :],
                                patch_occupancy,
                                np.flipud(pose_indicator[bb, tt]),
                                terminal_patch,
                            ]
                            image = drawing.subplot(image_list, 4, 2,
                                                    constants.SCREEN_WIDTH, constants.SCREEN_HEIGHT)
                            cv2.imshow('image', image[:, :, ::-1])
                            cv2.waitKey(0)
                        else:
                            pdb.set_trace()
                data_lock.release()

            if not (constants.DEBUG or constants.DRAWING) and (
                    iteration % 500 == 0 or iteration == constants.MAX_TIME_STEP - 1):
                saverTStart = time.time()
                tf_util.save(saver, sess, constants.CHECKPOINT_DIR, iteration)
                saverTEnd = time.time()
                print('Saver: %.3f' % (saverTEnd - saverTStart))

            curr_it += 1
            dataTimeTotal += dataTEnd - tStart
            summaryTimeTotal += summaryTime
            solverTimeTotal += solverTEnd - dataTEnd - summaryTime
            totalTimeTotal += time.time() - tStart
            if iteration == start_it or iteration % 10 == 0:
                print('Iteration: %d' % iteration)
                print('Loss: %.3f' % loss)
                print('Data: %.3f' % (dataTimeTotal / curr_it))
                print('Solver: %.3f' % (solverTimeTotal / curr_it))
                print('Summary: %.3f' % (summaryTimeTotal / curr_it))
                print('Total: %.3f' % (totalTimeTotal / curr_it))
                print('Current: %.3f\n' % ((time.time() - currentTimeStart) / min(10, curr_it)))
    except:
        import traceback
        traceback.print_exc()
    finally:
        # Save the final model.
        if not (constants.DEBUG or constants.DRAWING):
            tf_util.save(saver, sess, constants.CHECKPOINT_DIR, iteration)
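# These training scripts look like they are meant to be launched directly; a
# minimal entry point, assuming the repo's constants module (GPU_ID,
# BATCH_SIZE, DEBUG, DRAWING, ...) is configured at import time:
if __name__ == '__main__':
    run()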
def draw_state(self, return_list=False, action=None):
    if not constants.DRAWING:
        return
    from utils import drawing
    curr_image = self.game_state.detection_image.copy()
    state_image = self.game_state.draw_state()

    # Render the policy as a histogram: one bar per action, height proportional
    # to probability, filled with the action index so a colormap can tell bars apart.
    pi = self.pi.copy().squeeze()
    action_size = max(len(pi), 100)
    action_hist = np.zeros((action_size, action_size))
    for ii, pi_i in enumerate(pi):
        action_hist[:max(int(np.round(pi_i * action_size)), 1),
                    int(ii * action_size / len(pi)):int((ii + 1) * action_size / len(pi))] = (ii + 1)
    action_hist = np.flipud(action_hist)

    images = [
        curr_image,
        np.argmax(self.game_state.detection_mask_image, 2),
        action_hist,
        state_image,
    ]
    if type(action) == int:
        action = self.game_state.get_action(action)[0]
    action_str = game_util.get_action_str(action)
    if action_str == 'Answer':
        if self.game_state.question_type_ind != 1:
            action_str += ' ' + str(self.answer > 0.5)
        else:
            action_str += ' ' + str(np.argmax(self.answer))
    if self.game_state.question_type_ind == 0:
        question_str = '%03d Ex Q: %s A: %s' % (
            self.num_steps,
            constants.OBJECTS[self.game_state.question_target],
            bool(self.game_state.answer))
    elif self.game_state.question_type_ind == 1:
        question_str = '%03d # Q: %s A: %d' % (
            self.num_steps,
            constants.OBJECTS[self.game_state.question_target],
            self.game_state.answer)
    elif self.game_state.question_type_ind == 2:
        question_str = '%03d Q: %s in %s A: %d' % (
            self.num_steps,
            constants.OBJECTS[self.game_state.question_target[0]],
            constants.OBJECTS[self.game_state.question_target[1]],
            self.game_state.answer)
    else:
        raise Exception('No matching question number')
    titles = [
        question_str,
        action_str,
        'reward %.3f, value %.3f' % (self.reward, self.v),
    ]
    if return_list:
        return action_hist
    image = drawing.subplot(images, 2, 2, curr_image.shape[1], curr_image.shape[0], titles=titles)
    if not os.path.exists('visualizations/images'):
        os.makedirs('visualizations/images')
    cv2.imwrite('visualizations/images/state_%05d.jpg' % self.global_step_id, image[:, :, ::-1])
    return image
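# The policy-histogram loop above draws each action's probability as a vertical
# bar whose fill value encodes the action index. A tiny self-contained repro of
# just that rendering, using a toy 3-action policy:
import numpy as np

pi = np.array([0.1, 0.6, 0.3])  # toy policy over 3 actions
size = max(len(pi), 100)
hist = np.zeros((size, size))
for ii, p in enumerate(pi):
    # Bar height ~ probability (at least 1 px); column band for action ii.
    hist[:max(int(np.round(p * size)), 1),
         int(ii * size / len(pi)):int((ii + 1) * size / len(pi))] = ii + 1
hist = np.flipud(hist)  # flip so bars grow upward in image coordinates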
num_total += 1
action_key = ''
state.reset(*question)
while action_key != 'answer':
    if constants.DEBUG:
        images = [
            state.s_t,
            state.detection_image,
            state.s_t_depth,
            state.event.class_segmentation_frame,
            state.event.instance_segmentation_frame,
        ]
        titles = [
            'state',
            'detections',
            'depth',
            'class segmentation',
            'instance segmentation',
        ]
        image = drawing.subplot(images, 2, 3, constants.SCREEN_WIDTH, constants.SCREEN_HEIGHT, 5, titles)
        cv2.imshow('image', image[:, :, ::-1])
        cv2.waitKey(10)
    print('w: MoveAhead\n'
          'a: RotateLeft\n'
          's: RotateRight\n'
          'o: OpenObject\n'
          'c: CloseObject\n'
          '+: LookUp\n'
          '-: LookDown\n'
          'answer: Open answer dialog. type {true, false, yes, no}\n'
          'q: quit\n'
          'dd: enter debug')
    new_action_key = input('>> ')
    if new_action_key != '':
        action_key = new_action_key
    state.step(action_key)
answer = None
while answer is None:
    answer = input('answer: ').lower()
    # The original excerpt is cut off here; assumed completion: re-prompt
    # until one of the recognized answers is typed.
    if answer not in {'true', 'false', 'yes', 'no'}:
        answer = None
def visualize_results(self, input, output, target, step, add_to_keys):
    if 'reconstructed_rgb' in output and 'reconstructed_rgb' in target:
        batch_to_visualize = 0
        output_reconstruct = output['reconstructed_rgb'][batch_to_visualize]
        target_reconstruct = target['reconstructed_rgb'][batch_to_visualize]
        all_images = [channel_last(normalize(img))
                      for img in [target_reconstruct, output_reconstruct]]
        combined_images_before_append = torch.stack(all_images, dim=1)
        combined_images = combine_image_table(combined_images_before_append)
        combined_images = channel_first(combined_images)
        self.log_writer.add_image(tag='reconstruct_rgb_viz' + '/' + add_to_keys,
                                  img_tensor=combined_images, global_step=step)

    if 'verb_class' in output and 'verb_class' in target:
        batch_to_visualize = 10
        output_verb_class = output['verb_actual_class'][:batch_to_visualize]
        target_verb_class = target['verb_class'][:batch_to_visualize]
        rgb_images = input['rgb'][:batch_to_visualize]
        batch_size, seq_len, c, w, h = rgb_images.shape
        rgb_images = channel_last(normalize(rgb_images))
        combined_images_before_append = put_epic_class_text_on_images(
            rgb_images, target_verb_class, output_verb_class,
            self.dataset.VERB_CLASS_TO_NAME)
        combined_images = combine_image_table(combined_images_before_append)
        combined_images = channel_first(combined_images)
        self.log_writer.add_image(tag='verb_class_rgb_viz' + '/' + add_to_keys,
                                  img_tensor=combined_images, global_step=step)

    if 'class_probs' in output and 'class_probs' in target:
        batch_to_visualize = 10
        output_verb_class = output['class_probs'][:batch_to_visualize]
        target_verb_class = target['class_probs'][:batch_to_visualize]
        rgb_images = input['rgb'][:batch_to_visualize]
        batch_size, seq_len, c, w, h = rgb_images.shape
        rgb_images = channel_last(normalize(rgb_images))
        class_names = self.dataset.VERB_CLASS_TO_NAME
        _, output_top_k = torch.topk(output_verb_class, k=5, dim=-1)
        _, target_top_k = torch.topk(target_verb_class, k=5, dim=-1)
        output_top_k = output_top_k.squeeze(1)
        target_top_k = target_top_k.squeeze(1)
        output_text_list = []
        target_text_list = []
        for b_ind in range(len(output_top_k)):
            output_verbs = '/'.join([class_names[cls.item()].split(' ')[0].split('/')[0]
                                     for cls in output_top_k[b_ind]])
            target_verbs = '/'.join([class_names[cls.item()].split(' ')[0].split('/')[0]
                                     for cls in target_top_k[b_ind]])
            output_text_list.append(output_verbs)
            target_text_list.append(target_verbs)
        combined_images_before_append = put_text_on_images(
            rgb_images, output_text_list, target_text_list,
            color_list=None, font_scale=0.3, line_type=1)
        combined_images = combine_image_table(combined_images_before_append)
        combined_images = channel_first(combined_images)
        self.log_writer.add_image(tag='verb_class_rgb_viz' + '/' + add_to_keys,
                                  img_tensor=combined_images, global_step=step)

    if 'vind_class' in output and 'vind_class' in target:
        batch_to_visualize = 10
        output_pose_class = output['vind_actual_class'][:batch_to_visualize]
        target_pose_class = target['vind_class'][:batch_to_visualize]
        rgb_images = input['rgb'][:batch_to_visualize]
        mask_images = target['combined_mask'][:batch_to_visualize]
        batch_size, seq_len, c, w, h = rgb_images.shape
        rgb_images = channel_last(normalize(rgb_images))
        mask_images = channel_last(normalize(mask_images))
        combined_images = torch.cat([rgb_images, mask_images], dim=1)
        combined_images_before_append = put_epic_class_text_on_images(
            combined_images, target_pose_class, output_pose_class,
            {i: str(i) for i in range(100)})
        combined_images = combine_image_table(combined_images_before_append)
        combined_images = channel_first(combined_images)
        self.log_writer.add_image(tag='vind_class' + '/' + add_to_keys,
                                  img_tensor=combined_images, global_step=step)

    if 'scene_class' in output and 'scene_class' in target:
        batch_to_visualize = 10
        output_scene_class = output['scene_actual_class'][:batch_to_visualize]
        target_scene_class = target['scene_class'][:batch_to_visualize]
        rgb_images = input['rgb'][:batch_to_visualize]
        batch_size, seq_len, c, w, h = rgb_images.shape
        rgb_images = channel_last(normalize(rgb_images))
        combined_images_before_append = put_epic_class_text_on_images(
            rgb_images, target_scene_class, output_scene_class,
            self.dataset.SUN_SCENE_INDEX_TO_NAME)
        combined_images = combine_image_table(combined_images_before_append)
        combined_images = channel_first(combined_images)
        self.log_writer.add_image(tag='scene_class_rgb_viz' + '/' + add_to_keys,
                                  img_tensor=combined_images, global_step=step)

    if 'move_label' in output and 'move_label' in target:
        from utils.constants import IMU_INDEX_TO_NAME

        def get_one_set_str(imus, imu_indices):
            result = ''
            for imu_ind in range(len(imu_indices)):
                imu_name = IMU_INDEX_TO_NAME[imu_indices[imu_ind]]
                move_label = imus[imu_ind]
                if move_label == 0:
                    result += imu_name + '0-'
                elif move_label == 1:
                    result += imu_name + '1-'
            return result

        def translate_move_label(move_labels, list_of_imus):
            seq_len, num_imus = move_labels.shape
            result = []
            for seq_ind in range(seq_len):
                this_item = get_one_set_str(move_labels[seq_ind], list_of_imus)
                result.append(this_item)
            return result

        batch_to_visualize = 0
        output_move_label = output['cleaned_move_label'][batch_to_visualize]
        target_move_label = target['move_label'][batch_to_visualize]
        rgb_images = target['rgb'][batch_to_visualize]
        seq_len = rgb_images.shape[0]
        output_images = channel_last(normalize(rgb_images)).cpu().numpy()
        target_images = channel_last(normalize(rgb_images)).cpu().numpy()
        list_of_images = [target_images[i] for i in range(seq_len)]
        list_of_images += [output_images[i] for i in range(seq_len)]
        target_titles = translate_move_label(target_move_label, self.imus)
        output_titles = translate_move_label(output_move_label, self.imus)
        target_titles = ['gt-' + x for x in target_titles]
        titles = target_titles + output_titles
        combined_images = drawing.subplot(list_of_images, 2, seq_len, 224, 224, 5, titles)
        combined_images = channel_first(combined_images)
        self.log_writer.add_image(tag='move_label_viz' + '/' + add_to_keys,
                                  img_tensor=combined_images, global_step=step)

    if 'gaze_points' in output and 'gaze_points' in target:
        batch_to_visualize = 0
        rgb_images = channel_last(normalize(target['rgb'][batch_to_visualize]))
        target_gaze_images = visualize_gaze(rgb_images, target['gaze_points'][batch_to_visualize])
        output_gaze_images = visualize_gaze(rgb_images, output['gaze_points'][batch_to_visualize])
        combined_images_before_append = torch.stack(
            [target_gaze_images, output_gaze_images], dim=1)
        combined_images = combine_image_table(combined_images_before_append)
        combined_images = channel_first(combined_images)
        self.log_writer.add_image(tag='gaze_viz' + '/' + add_to_keys,
                                  img_tensor=combined_images, global_step=step)

    if 'depth' in output:
        self.visualize_feature(input, output, target, step, add_to_keys, 'depth', depth_normalize)

    if 'walk' in output:
        output['real_walk'] = torch.argmax(output['walk'], dim=2).unsqueeze(2)
        target['real_walk'] = target['walk']
        self.visualize_feature(input, output, target, step, add_to_keys, 'real_walk', identity)
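# visualize_feature is called with a per-feature display normalizer above
# (depth_normalize for depth maps, identity for discrete walk labels). Neither
# helper appears in this excerpt; the definitions below are plausible sketches
# under that assumption, not the repo's actual implementations.
import torch

def identity(x):
    # Pass-through for features that are already displayable.
    return x

def depth_normalize(depth):
    # Assumed min-max normalization of a depth tensor to [0, 1] for display.
    d_min, d_max = depth.min(), depth.max()
    return (depth - d_min) / (d_max - d_min + 1e-8)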
def draw_state(self, return_list=False, action=None):
    if not constants.DRAWING:
        return
    # Spatial-map memory channel layout:
    # 0     - map weights (not fed to the decision network)
    # 1, 2  - meshgrid
    # 3     - coverage
    # 4     - teleport locations
    # 5     - free space map
    # 6     - visited locations
    # 7+    - object locations
    from utils import drawing
    curr_image = self.game_state.detection_image.copy()
    state_image = self.game_state.draw_state()

    action_hist = np.zeros((3, 3, 3))
    pi = self.pi.copy()
    if constants.STEPS_AHEAD == 5:
        action_hist = np.concatenate((pi, np.zeros(3)))
        action_hist = action_hist.reshape(7, 5)
    elif constants.STEPS_AHEAD == 1:
        action_hist = np.concatenate((pi, np.zeros(1)))
        action_hist = action_hist.reshape(3, 3)

    flat_action_size = max(len(pi), 100)
    flat_action_hist = np.zeros((flat_action_size, flat_action_size))
    for ii, flat_action_i in enumerate(pi):
        flat_action_hist[:max(int(np.round(flat_action_i * flat_action_size)), 1),
                         int(ii * flat_action_size / len(pi)):int((ii + 1) * flat_action_size / len(pi))] = (ii + 1)
    flat_action_hist = np.flipud(flat_action_hist)

    # Answer histogram.
    ans = self.answer
    if len(ans) == 1:
        ans = [1 - ans[0], ans[0]]
    ans_size = max(len(ans), 100)
    ans_hist = np.zeros((ans_size, ans_size))
    for ii, ans_i in enumerate(ans):
        ans_hist[:max(int(np.round(ans_i * ans_size)), 1),
                 int(ii * ans_size / len(ans)):int((ii + 1) * ans_size / len(ans))] = (ii + 1)
    ans_hist = np.flipud(ans_hist)

    dil = np.flipud(self.dilation)
    dil[0, 0] = 4
    coverage = int(self.coverage * 100 / self.max_coverage)

    possible = np.zeros((3, 3, 3))
    possible_pred = np.zeros((3, 3, 3))
    if constants.STEPS_AHEAD == 5:
        possible = self.possible_moves.copy()
        possible = np.concatenate((possible, np.zeros(4)))
        possible = possible.reshape(constants.STEPS_AHEAD + 2, constants.STEPS_AHEAD)
        possible_pred = self.possible_moves_pred.copy()
        possible_pred = np.concatenate((possible_pred, np.zeros(4)))
        possible_pred = possible_pred.reshape(constants.STEPS_AHEAD + 2, constants.STEPS_AHEAD)
    elif constants.STEPS_AHEAD == 1:
        possible = self.possible_moves.copy()
        possible = np.concatenate((possible, np.zeros(2)))
        possible = possible.reshape(3, 3)
        possible_pred = self.possible_moves_pred.copy()
        possible_pred = np.concatenate((possible_pred, np.zeros(2)))
        possible_pred = possible_pred.reshape(3, 3)

    if self.game_state.question_type_ind in {2, 3}:
        obj_mem = self.spatial_map.memory[:, :, 7 + self.game_state.question_target[1]].copy()
        obj_mem += self.spatial_map.memory[:, :, 7 + self.game_state.object_target] * 2
    else:
        obj_mem = self.spatial_map.memory[:, :, 7 + self.game_state.object_target].copy()
    obj_mem[0, 0] = 2

    memory_map = np.flipud(self.spatial_map.memory[:, :, 7:].copy())
    curr_objs = np.argmax(memory_map, axis=2)
    gt_objs = np.flipud(np.argmax(self.game_state.xray_graph.memory[:, :, 1:], 2))
    curr_objs[0, 0] = np.max(gt_objs)
    memory_crop = self.memory_crops[0, ...].copy()
    memory_crop_cov = np.argmax(np.flipud(memory_crop), axis=2)
    gt_semantic_crop = np.flipud(np.argmax(self.next_memory_crops_rot, axis=2))

    images = [
        curr_image,
        state_image,
        dil + np.max(np.flipud(self.spatial_map.memory[:, :, 3:5]) * np.array([1, 3]), axis=2),
        memory_crop_cov,
        ans_hist,
        flat_action_hist,
        np.flipud(action_hist),
        np.flipud(possible),
        np.flipud(possible_pred),
        gt_objs,
        curr_objs,
        np.flipud(obj_mem),
    ]
    if type(action) == int:
        action = self.game_state.get_action(action)[0]
    action_str = game_util.get_action_str(action)
    if action_str == 'Answer':
        if self.game_state.question_type_ind != 1:
            action_str += ' ' + str(self.answer > 0.5)
        else:
            action_str += ' ' + str(np.argmax(self.answer))
    if self.game_state.question_type_ind == 0:
        question_str = '%03d S %s Ex Q: %s A: %s' % (
            self.num_steps, self.game_state.scene_name[9:],
            constants.OBJECTS[self.game_state.question_target],
            bool(self.game_state.answer))
    elif self.game_state.question_type_ind == 1:
        question_str = '%03d S %s # Q: %s A: %d' % (
            self.num_steps, self.game_state.scene_name[9:],
            constants.OBJECTS[self.game_state.question_target],
            self.game_state.answer)
    elif self.game_state.question_type_ind == 2:
        question_str = '%03d S %s Q: %s in %s A: %s' % (
            self.num_steps, self.game_state.scene_name[9:],
            constants.OBJECTS[self.game_state.question_target[0]],
            constants.OBJECTS[self.game_state.question_target[1]],
            bool(self.game_state.answer))
    else:
        raise Exception('No matching question number')
    titles = [
        question_str,
        str(self.answer),
        action_str,
        'coverage %d%% can end %s' % (coverage, bool(self.game_state.can_end)),
        'reward %.3f, value %.3f' % (self.reward, self.v),
    ]
    if return_list:
        return action_hist
    image = drawing.subplot(images, 4, 3, curr_image.shape[1], curr_image.shape[0],
                            titles=titles, border=3)
    if not os.path.exists('visualizations/images'):
        os.makedirs('visualizations/images')
    cv2.imwrite('visualizations/images/state_%05d.jpg' % self.global_step_id, image[:, :, ::-1])
    return image
gt_map = (2 - im_dict['label_memory'][:, :, 0])
image_list = [
    im_dict['detections'] if constants.OBJECT_DETECTION else im_dict['color'],
    im_dict['state_image'],
    im_dict['memory_map'][:, :, 0],
    gt_map + np.argmax(im_dict['memory_map'][:, :, 1:constants.NUM_RECEPTACLES + 2], axis=2),
    gt_map + np.argmax(im_dict['memory_map'][:, :, constants.NUM_RECEPTACLES + 2:], axis=2),
]
titles = [
    'color',
    'state',
    'occupied',
    'label receptacles',
    'label objects',
]
print('possible pred', im_dict['possible_pred'])
# Assumed fix: a 2x3 grid so all five panels (and titles) fit; the flattened
# original said 2x2, which leaves no slot for the fifth image.
image = drawing.subplot(image_list, 2, 3, constants.SCREEN_WIDTH, constants.SCREEN_HEIGHT,
                        titles=titles)
cv2.imshow('image', image[:, :, ::-1])
cv2.waitKey(0)
def run():
    try:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(constants.GPU_ID)
        with tf.variable_scope('global_network'):
            network = QAPlannerNetwork(constants.RL_GRU_SIZE, int(constants.BATCH_SIZE), 1)
            network.create_net()
            training_step = network.training(network.rl_total_loss)

        conv_var_list = [v for v in tf.trainable_variables()
                         if 'conv' in v.name and 'weight' in v.name and
                         (v.get_shape().as_list()[0] != 1 or v.get_shape().as_list()[1] != 1)]
        for var in conv_var_list:
            tf_util.conv_variable_summaries(var, scope=var.name.replace('/', '_')[:-2])
        summary_with_images = tf.summary.merge_all()

        with tf.variable_scope('supervised_loss'):
            loss_ph = tf.placeholder(tf.float32)
            accs_ph = [tf.placeholder(tf.float32) for _ in range(4)]
            loss_summary_op = tf.summary.merge([
                tf.summary.scalar('supervised_loss', loss_ph),
                tf.summary.scalar('acc_1_exist', accs_ph[0]),
                tf.summary.scalar('acc_2_count', accs_ph[1]),
                tf.summary.scalar('acc_3_contains', accs_ph[2]),
            ])

        # Prepare session.
        sess = tf_util.Session()
        sess.run(tf.global_variables_initializer())

        if not (constants.DEBUG or constants.DRAWING):
            from utils import py_util
            time_str = py_util.get_time_str()
            summary_writer = tf.summary.FileWriter(
                os.path.join(constants.LOG_FILE, time_str), sess.graph)
        else:
            summary_writer = None

        # Initialize from scratch or load the latest checkpoint with the saver.
        saver = tf.train.Saver(max_to_keep=3)
        start_it = tf_util.restore_from_dir(sess, constants.CHECKPOINT_DIR)

        sess.graph.finalize()

        import h5py
        h5_file = sorted(glob.glob('question_data_dump/*.h5'), key=os.path.getmtime)[-1]
        dataset = h5py.File(h5_file)
        num_entries = np.sum(np.sum(dataset['question_data/pose_placeholder'][...], axis=1) > 0)
        print('num_entries', num_entries)
        start_inds = dataset['question_data/new_episode'][:num_entries]
        start_inds = np.where(start_inds[1:] != 1)[0]

        curr_it = 0
        data_time_total = 0
        solver_time_total = 0
        total_time_total = 0

        for iteration in range(start_it, constants.MAX_TIME_STEP):
            if iteration == start_it or iteration % 10 == 1:
                current_time_start = time.time()
            t_start = time.time()

            rand_inds = np.sort(np.random.choice(start_inds, int(constants.BATCH_SIZE), replace=False))
            rand_inds = rand_inds.tolist()
            existence_answer_placeholder = dataset['question_data/existence_answer_placeholder'][rand_inds]
            counting_answer_placeholder = dataset['question_data/counting_answer_placeholder'][rand_inds]
            question_type_placeholder = dataset['question_data/question_type_placeholder'][rand_inds]
            question_object_placeholder = dataset['question_data/question_object_placeholder'][rand_inds]
            question_container_placeholder = dataset['question_data/question_container_placeholder'][rand_inds]
            pose_placeholder = dataset['question_data/pose_placeholder'][rand_inds]
            image_placeholder = dataset['question_data/image_placeholder'][rand_inds]
            map_mask_placeholder = np.ascontiguousarray(dataset['question_data/map_mask_placeholder'][rand_inds])
            meta_action_placeholder = dataset['question_data/meta_action_placeholder'][rand_inds]
            possible_move_placeholder = dataset['question_data/possible_move_placeholder'][rand_inds]
            taken_action = dataset['question_data/taken_action'][rand_inds]
            answer_weight = np.ones((constants.BATCH_SIZE, 1))

            map_mask_placeholder = np.ascontiguousarray(map_mask_placeholder, dtype=np.float32)
            map_mask_placeholder[:, :, :, :2] -= 2
            map_mask_placeholder[:, :, :, :2] /= constants.STEPS_AHEAD
            map_mask_starts = map_mask_placeholder.copy()

            for bb in range(0, constants.BATCH_SIZE):
                object_ind = int(question_object_placeholder[bb])
                question_type_ind = int(question_type_placeholder[bb])
                if question_type_ind in {2, 3}:
                    container_ind = np.argmax(question_container_placeholder[bb])
                max_map_inds = np.argmax(map_mask_placeholder[bb, ...], axis=2)
                map_range = np.where(max_map_inds > 0)
                map_range_x = (np.min(map_range[1]), np.max(map_range[1]))
                map_range_y = (np.min(map_range[0]), np.max(map_range[0]))
                for jj in range(random.randint(0, 100)):
                    tmp_patch_start = (random.randint(map_range_x[0], map_range_x[1]),
                                       random.randint(map_range_y[0], map_range_y[1]))
                    tmp_patch_end = (random.randint(map_range_x[0], map_range_x[1]),
                                     random.randint(map_range_y[0], map_range_y[1]))
                    patch_start = (min(tmp_patch_start[0], tmp_patch_end[0]),
                                   min(tmp_patch_start[1], tmp_patch_end[1]))
                    patch_end = (max(tmp_patch_start[0], tmp_patch_end[0]),
                                 max(tmp_patch_start[1], tmp_patch_end[1]))
                    patch = map_mask_placeholder[bb, patch_start[1]:patch_end[1],
                                                 patch_start[0]:patch_end[0], :]
                    if question_type_ind in {2, 3}:
                        obj_mem = patch[:, :, 6 + container_ind] + patch[:, :, 6 + object_ind]
                    else:
                        obj_mem = patch[:, :, 6 + object_ind].copy()
                    obj_mem += patch[:, :, 2]  # Make sure seen locations stay marked.
                    # Zero the object channels of this random patch if it
                    # contains nothing relevant to the question.
                    if patch.size > 0 and np.max(obj_mem) == 0:
                        map_mask_placeholder[bb, patch_start[1]:patch_end[1],
                                             patch_start[0]:patch_end[0], 6:] = 0

            feed_dict = {
                network.existence_answer_placeholder: np.ascontiguousarray(existence_answer_placeholder),
                network.counting_answer_placeholder: np.ascontiguousarray(counting_answer_placeholder),
                network.question_type_placeholder: np.ascontiguousarray(question_type_placeholder),
                network.question_object_placeholder: np.ascontiguousarray(question_object_placeholder),
                network.question_container_placeholder: np.ascontiguousarray(question_container_placeholder),
                network.question_direction_placeholder: np.zeros((constants.BATCH_SIZE, 4), dtype=np.float32),
                network.pose_placeholder: np.ascontiguousarray(pose_placeholder),
                network.image_placeholder: np.ascontiguousarray(image_placeholder),
                network.map_mask_placeholder: map_mask_placeholder,
                network.meta_action_placeholder: np.ascontiguousarray(meta_action_placeholder),
                network.possible_move_placeholder: np.ascontiguousarray(possible_move_placeholder),
                network.taken_action: np.ascontiguousarray(taken_action),
                network.answer_weight: np.ascontiguousarray(answer_weight),
                network.episode_length_placeholder: np.ones(constants.BATCH_SIZE),
                network.question_count_placeholder: np.zeros(constants.BATCH_SIZE),
            }

            # Add a singleton time dimension to every batched input.
            new_feed_dict = {}
            for key, value in feed_dict.items():
                if len(value.squeeze().shape) > 1:
                    new_feed_dict[key] = np.reshape(
                        value, [int(constants.BATCH_SIZE), 1] + list(value.squeeze().shape[1:]))
                else:
                    new_feed_dict[key] = np.reshape(value, [int(constants.BATCH_SIZE), 1])
            feed_dict = new_feed_dict
            feed_dict[network.taken_action] = np.reshape(
                feed_dict[network.taken_action], (constants.BATCH_SIZE, -1))
            feed_dict[network.gru_placeholder] = np.zeros(
                (int(constants.BATCH_SIZE), constants.RL_GRU_SIZE))
            data_t_end = time.time()

            if constants.DEBUG or constants.DRAWING:
                outputs = sess.run(
                    [training_step, network.rl_total_loss, network.existence_answer,
                     network.counting_answer, network.possible_moves,
                     network.memory_crops_rot, network.taken_action],
                    feed_dict=feed_dict)
            else:
                if iteration == start_it + 10:
                    # Trace one step for profiling in TensorBoard.
                    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    outputs = sess.run(
                        [training_step, network.rl_total_loss, summary_with_images],
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    loss_summary = outputs[2]
                    summary_writer.add_run_metadata(run_metadata, 'step_%07d' % iteration)
                    summary_writer.add_summary(loss_summary, iteration)
                    summary_writer.flush()
                elif iteration % 10 == 0:
                    if iteration % 100 == 0:
                        outputs = sess.run(
                            [training_step, network.rl_total_loss, summary_with_images],
                            feed_dict=feed_dict)
                        loss_summary = outputs[2]
                    else:
                        outputs = sess.run(
                            [training_step, network.rl_total_loss,
                             network.existence_answer, network.counting_answer],
                            feed_dict=feed_dict)
                        outputs[2] = outputs[2].reshape(-1, 1)
                        # Per-question-type accuracies over the batch.
                        acc_q0 = np.sum((existence_answer_placeholder == (outputs[2] > 0.5)) *
                                        (question_type_placeholder == 0)) / np.maximum(
                            1, np.sum(question_type_placeholder == 0))
                        acc_q1 = np.sum((counting_answer_placeholder ==
                                         np.argmax(outputs[3], axis=1)[..., np.newaxis]) *
                                        (question_type_placeholder == 1)) / np.maximum(
                            1, np.sum(question_type_placeholder == 1))
                        acc_q2 = np.sum((existence_answer_placeholder == (outputs[2] > 0.5)) *
                                        (question_type_placeholder == 2)) / np.maximum(
                            1, np.sum(question_type_placeholder == 2))
                        acc_q3 = np.sum((existence_answer_placeholder == (outputs[2] > 0.5)) *
                                        (question_type_placeholder == 3)) / np.maximum(
                            1, np.sum(question_type_placeholder == 3))
                        curr_loss = outputs[1]
                        outputs = sess.run([loss_summary_op], feed_dict={
                            accs_ph[0]: acc_q0,
                            accs_ph[1]: acc_q1,
                            accs_ph[2]: acc_q2,
                            accs_ph[3]: acc_q3,
                            loss_ph: curr_loss,
                        })
                        loss_summary = outputs[0]
                        outputs.append(curr_loss)
                    summary_writer.add_summary(loss_summary, iteration)
                    summary_writer.flush()
                else:
                    outputs = sess.run([training_step, network.rl_total_loss], feed_dict=feed_dict)

            loss = outputs[1]
            solver_t_end = time.time()

            if constants.DEBUG or constants.DRAWING:
                # Look at outputs.
                guess_bool = outputs[2].flatten()
                guess_count = outputs[3]
                possible_moves_pred = outputs[4]
                memory_crop = outputs[5]
                print('loss', loss)
                for bb in range(constants.BATCH_SIZE):
                    if constants.DRAWING:
                        import cv2
                        import scipy.misc
                        from utils import drawing
                        object_ind = int(question_object_placeholder[bb])
                        question_type_ind = question_type_placeholder[bb]
                        if question_type_ind == 1:
                            answer = counting_answer_placeholder[bb]
                            guess = guess_count[bb]
                        else:
                            answer = existence_answer_placeholder[bb]
                            guess = np.concatenate(([1 - guess_bool[bb]], [guess_bool[bb]]))
                        if question_type_ind[0] in {2, 3}:
                            container_ind = np.argmax(question_container_placeholder[bb])
                            obj_mem = np.flipud(map_mask_placeholder[bb, :, :, 6 + container_ind]).copy()
                            obj_mem += 2 * np.flipud(map_mask_placeholder[bb, :, :, 6 + object_ind])
                        else:
                            obj_mem = np.flipud(map_mask_placeholder[bb, :, :, 6 + object_ind])
                        possible = possible_move_placeholder[bb, ...].flatten()
                        possible = np.concatenate((possible, np.zeros(4)))
                        possible = possible.reshape(constants.STEPS_AHEAD + 2, constants.STEPS_AHEAD)
                        possible_pred = possible_moves_pred[bb, ...].flatten()
                        possible_pred = np.concatenate((possible_pred, np.zeros(4)))
                        possible_pred = possible_pred.reshape(constants.STEPS_AHEAD + 2, constants.STEPS_AHEAD)
                        mem2 = np.flipud(np.argmax(memory_crop[bb, ...], axis=2))
                        mem2[0, 0] = memory_crop.shape[-1] - 2
                        # Answer histogram.
                        ans = guess
                        if len(ans) == 1:
                            ans = [ans[0], 1 - ans[0]]
                        ans_size = max(len(ans), 100)
                        ans_hist = np.zeros((ans_size, ans_size))
                        for ii, ans_i in enumerate(ans):
                            ans_hist[:max(int(np.round(ans_i * ans_size)), 1),
                                     int(ii * ans_size / len(ans)):int((ii + 1) * ans_size / len(ans))] = (ii + 1)
                        ans_hist = np.flipud(ans_hist)
                        image_list = [
                            image_placeholder[bb, ...],
                            ans_hist,
                            np.flipud(possible),
                            np.flipud(possible_pred),
                            mem2,
                            np.flipud(np.argmax(map_mask_starts[bb, :, :, 2:], axis=2)),
                            obj_mem,
                        ]
                        if question_type_ind == 0:
                            question_str = 'Ex Q: %s A: %s' % (constants.OBJECTS[object_ind], bool(answer))
                        elif question_type_ind == 1:
                            question_str = '# Q: %s A: %d' % (constants.OBJECTS[object_ind], answer)
                        elif question_type_ind == 2:
                            question_str = 'Q: %s in %s A: %s' % (
                                constants.OBJECTS[object_ind],
                                constants.OBJECTS[container_ind],
                                bool(answer))
                        image = drawing.subplot(image_list, 4, 2,
                                                constants.SCREEN_WIDTH, constants.SCREEN_HEIGHT,
                                                titles=[question_str, 'A: %s' % str(np.argmax(guess))],
                                                border=2)
                        cv2.imshow('image', image[:, :, ::-1])
                        cv2.waitKey(0)
                    else:
                        pdb.set_trace()

            if not (constants.DEBUG or constants.DRAWING) and (
                    iteration % 1000 == 0 or iteration == constants.MAX_TIME_STEP - 1):
                saver_t_start = time.time()
                tf_util.save(saver, sess, constants.CHECKPOINT_DIR, iteration)
                saver_t_end = time.time()
                print('Saver: %.3f' % (saver_t_end - saver_t_start))

            curr_it += 1
            data_time_total += data_t_end - t_start
            solver_time_total += solver_t_end - data_t_end
            total_time_total += time.time() - t_start
            if iteration == start_it or iteration % 10 == 0:
                print('Iteration: %d' % iteration)
                print('Loss: %.3f' % loss)
                print('Data: %.3f' % (data_time_total / curr_it))
                print('Solver: %.3f' % (solver_time_total / curr_it))
                print('Total: %.3f' % (total_time_total / curr_it))
                print('Current: %.3f\n' % ((time.time() - current_time_start) / min(10, curr_it)))
    except:
        import traceback
        traceback.print_exc()
    finally:
        # Save the final model.
        if not (constants.DEBUG or constants.DRAWING):
            tf_util.save(saver, sess, constants.CHECKPOINT_DIR, iteration)