def get_next_image_crops(images, labels, dd, noisy_box, mirrored, real_motion, network_outs):
    """Build the two network input crops and the scaled label for frame ``dd``.

    Args:
        images: Indexable sequence of frames. ``images[0].shape[1]`` and
            ``images[0].shape[0]`` are forwarded to ``add_noise``.
        labels: Indexable sequence of per-frame boxes; ``labels[dd]`` is the
            target box for this step.
        dd: Index of the current frame.
        noisy_box: Crop box carried over from the previous step. Only read
            when ``network_outs`` is given.
        mirrored: Not used by this function.
        real_motion: When truthy (and ``dd != 0``) the crop box comes from
            ``fix_bbox_intersection`` rather than ``add_noise``.
        network_outs: Output of the previous forward pass, or ``None``.

    Returns:
        Tuple ``(image0, image1, xyxy_labels, noisy_box)`` where ``image0`` is
        the crop of the previous frame, ``image1`` the crop of the current
        frame, ``xyxy_labels`` the target box expressed relative to the new
        crop box and multiplied by 10, and ``noisy_box`` the crop box that was
        used for ``image1``.
    """
    used_network = network_outs is not None

    if used_network:
        crop_prediction = network_outs.squeeze() / 10
        # NOTE(review): the prediction is mapped back to image coordinates but
        # the result is never used; the previous crop box is reused instead.
        # get_data_sequence feeds the predicted box forward at the equivalent
        # point -- confirm which behaviour is intended.
        bb_util.from_crop_coordinate_system(crop_prediction, noisy_box, CROP_PAD, 1)
        previous_box = noisy_box
    else:
        previous_box = labels[dd] if dd == 0 else labels[dd - 1]

    target_box = labels[dd]

    if dd == 0:
        # First frame: crop exactly around the target box.
        crop_box = target_box.copy()
    elif real_motion or used_network:
        crop_box = fix_bbox_intersection(previous_box, target_box)
    else:
        crop_box = add_noise(target_box, target_box, images[0].shape[1], images[0].shape[0])

    previous_frame = images[max(dd - 1, 0)]
    current_frame = images[dd]
    image0 = im_util.get_cropped_input(previous_frame, previous_box, CROP_PAD, CROP_SIZE)[0]
    image1 = im_util.get_cropped_input(current_frame, crop_box, CROP_PAD, CROP_SIZE)[0]

    # Express the target relative to the crop, round-trip it through the
    # xywh helper exactly as before, and apply the x10 label scaling.
    box_in_crop = bb_util.to_crop_coordinate_system(target_box, crop_box, CROP_PAD, 1)
    xywh_labels = bb_util.xyxy_to_xywh(box_in_crop)
    xyxy_labels = bb_util.xywh_to_xyxy(xywh_labels) * 10

    return image0, image1, xyxy_labels, crop_box
def returnConvLayers(self, bbox, image, starting_boxes=None):
    """Evaluate ``self.conv_layers`` on the crop of ``image`` around ``bbox``.

    Args:
        bbox: Box handed to ``im_util.get_cropped_input`` to select the crop.
        image: Either a file path (``str``), which is read with ``cv2.imread``
            and has its channel axis reversed, or an already decoded array,
            which is copied before use.
        starting_boxes: Accepted for interface compatibility; not used.

    Returns:
        The result of ``self.sess.run([self.conv_layers], ...)``, i.e. a
        one-element list holding the evaluated ``self.conv_layers``.
    """
    if isinstance(image, str):
        # Reverse the last axis of the decoded image (channel order).
        image = cv2.imread(image)[:, :, ::-1]
    else:
        image = image.copy()

    cropped, _ = im_util.get_cropped_input(image, bbox, CROP_PAD, CROP_SIZE)

    # NOTE(review): debugging display left in the inference path; it needs a
    # GUI-enabled OpenCV build. Kept to preserve behaviour -- confirm whether
    # it can be removed.
    cv2.imshow('', cropped)

    # The image placeholder is fed a pair of crops; duplicate the single crop.
    network_input = np.tile(cropped[np.newaxis, ...], (2, 1, 1, 1))
    feed_dict = {
        self.imagePlaceholder: network_input,
    }
    return self.sess.run([self.conv_layers], feed_dict=feed_dict)
def track(self, unique_id, image, starting_box=None):
    """Advance the track ``unique_id`` by one frame and return its box.

    Args:
        unique_id: Key under which the track state lives in
            ``self.tracked_data``.
        image: Either a file path (``str``), read with ``cv2.imread`` with the
            channel axis reversed, or an already decoded array.
        starting_box: When given, (re)initialises the track at this box; the
            returned box is then this box rather than the network output.

    Returns:
        The box for this frame in full-image coordinates.

    Raises:
        Exception: If ``starting_box`` is ``None`` and ``unique_id`` has no
            stored state.
    """
    start_time = time.time()
    if isinstance(image, str):
        image = cv2.imread(image)[:, :, ::-1]
    # NOTE(review): array inputs are stored in self.tracked_data without a
    # copy, so the caller must not modify the array before the next call.
    image_read_time = time.time() - start_time

    if starting_box is not None:
        lstmState = [np.zeros((1, LSTM_SIZE)) for _ in range(4)]
        # Turns a list into a numpy array if needed and copies for safety.
        pastBBox = np.array(starting_box)
        prevImage = image
        originalFeatures = None
        forwardCount = 0
    elif unique_id in self.tracked_data:
        lstmState, pastBBox, prevImage, originalFeatures, forwardCount = self.tracked_data[unique_id]
    else:
        raise Exception('Unique_id %s with no initial bounding box' % unique_id)

    # Crop the previous and the current frame around the previous box.
    self._profiler.start(self._re3_crop_profiler)
    croppedInput0, pastBBoxPadded = im_util.get_cropped_input(prevImage, pastBBox, CROP_PAD, CROP_SIZE)
    croppedInput1, _ = im_util.get_cropped_input(image, pastBBox, CROP_PAD, CROP_SIZE)
    self._profiler.stop(self._re3_crop_profiler)

    feed_dict = {
        self.imagePlaceholder: [croppedInput0, croppedInput1],
        self.prevLstmState: lstmState,
        self.batch_size: 1,
    }
    self._profiler.start(self._re3_sess_profiler)
    rawOutput, s1, s2 = self.sess.run([self.outputs, self.state1, self.state2], feed_dict=feed_dict)
    lstmState = [s1[0], s1[1], s2[0], s2[1]]
    self._profiler.stop(self._re3_sess_profiler)

    if forwardCount == 0:
        # Remember the state after the first pass; it seeds the periodic reset.
        originalFeatures = [s1[0], s1[1], s2[0], s2[1]]

    # Shift output box to full image coordinate system.
    outputBox = bb_util.from_crop_coordinate_system(rawOutput.squeeze() / 10.0, pastBBoxPadded, 1, 1)

    if forwardCount > 0 and forwardCount % MAX_TRACK_LENGTH == 0:
        # Periodic reset: re-run the network on the new box starting from the
        # first-frame state. Only the resulting LSTM state is kept.
        self._profiler.start(self._re3_crop2_profiler)
        croppedInput, _ = im_util.get_cropped_input(image, outputBox, CROP_PAD, CROP_SIZE)
        resetInput = np.tile(croppedInput[np.newaxis, ...], (2, 1, 1, 1))
        self._profiler.stop(self._re3_crop2_profiler)
        feed_dict = {
            self.imagePlaceholder: resetInput,
            self.prevLstmState: originalFeatures,
            self.batch_size: 1,
        }
        self._profiler.start(self._re3_sess2_profiler)
        _, s1, s2 = self.sess.run([self.outputs, self.state1, self.state2], feed_dict=feed_dict)
        self._profiler.stop(self._re3_sess2_profiler)
        lstmState = [s1[0], s1[1], s2[0], s2[1]]

    forwardCount += 1
    self.total_forward_count += 1

    if starting_box is not None:
        # Use label if it's given.
        outputBox = np.array(starting_box)

    self.tracked_data[unique_id] = (lstmState, outputBox, image, originalFeatures, forwardCount)

    end_time = time.time()
    trackTime = end_time - start_time - image_read_time
    if self.total_forward_count > 0:
        self.time += trackTime
    if SPEED_OUTPUT and self.total_forward_count % 100 == 0:
        # Both durations can be exactly zero on coarse clocks (the read time
        # always is near zero for array input), so clamp every denominator
        # the same way the mean already is.
        print('Current tracking speed: %.3f FPS' % (1 / max(.00001, trackTime)))
        print('Current image read speed: %.3f FPS' % (1 / max(.00001, image_read_time)))
        print('Mean tracking speed: %.3f FPS\n' % (self.total_forward_count / max(.00001, self.time)))
    return outputBox
def multi_track(self, unique_ids, image, starting_boxes=None):
    """Advance several tracks by one frame in a single batched forward pass.

    Args:
        unique_ids: List (at least two entries) of track keys.
        image: Either a file path (``str``), read with ``cv2.imread`` with the
            channel axis reversed, or an already decoded array (copied).
        starting_boxes: Optional mapping ``unique_id -> box``. Every id present
            is (re)initialised at that box and that box is returned for it.

    Returns:
        Array of shape ``(len(unique_ids), 4)`` with one box per id, in the
        order of ``unique_ids``.

    Raises:
        AssertionError: If ``unique_ids`` is not a list or has fewer than two
            entries.
        Exception: If an id has neither a starting box nor stored state.
    """
    start_time = time.time()
    assert isinstance(unique_ids, list), 'unique_ids must be a list for multi_track'
    assert len(unique_ids) > 1, 'unique_ids must be at least 2 elements'
    if isinstance(image, str):
        image = cv2.imread(image)[:, :, ::-1]
    else:
        image = image.copy()
    image_read_time = time.time() - start_time

    # Get inputs for each track.
    images = []
    lstmStates = [[] for _ in range(4)]
    pastBBoxesPadded = []
    if starting_boxes is None:
        starting_boxes = dict()
    for unique_id in unique_ids:
        if unique_id in starting_boxes:
            lstmState = [np.zeros((1, LSTM_SIZE)) for _ in range(4)]
            # Turns a list into a numpy array if needed and copies for safety.
            pastBBox = np.array(starting_boxes[unique_id])
            prevImage = image
            originalFeatures = None
            forwardCount = 0
            self.tracked_data[unique_id] = (lstmState, pastBBox, image, originalFeatures, forwardCount)
        elif unique_id in self.tracked_data:
            lstmState, pastBBox, prevImage, originalFeatures, forwardCount = self.tracked_data[unique_id]
        else:
            raise Exception('Unique_id %s with no initial bounding box' % unique_id)

        self._profiler.start(self._re3_crop_profiler)
        croppedInput0, pastBBoxPadded = im_util.get_cropped_input(prevImage, pastBBox, CROP_PAD, CROP_SIZE)
        croppedInput1, _ = im_util.get_cropped_input(image, pastBBox, CROP_PAD, CROP_SIZE)
        self._profiler.stop(self._re3_crop_profiler)
        pastBBoxesPadded.append(pastBBoxPadded)
        images.extend([croppedInput0, croppedInput1])
        # Collect each of the four state tensors across tracks.
        for ss, state in enumerate(lstmState):
            lstmStates[ss].append(state.squeeze())

    lstmStateArrays = [np.array(state) for state in lstmStates]

    feed_dict = {
        self.imagePlaceholder: images,
        self.prevLstmState: lstmStateArrays,
        # Two crops per track. Integer division keeps the batch size an int
        # on Python 3 (true division would feed a float).
        self.batch_size: len(images) // 2,
    }
    self._profiler.start(self._re3_sess_profiler)
    rawOutput, s1, s2 = self.sess.run([self.outputs, self.state1, self.state2], feed_dict=feed_dict)
    self._profiler.stop(self._re3_sess_profiler)

    outputBoxes = np.zeros((len(unique_ids), 4))
    for uu, unique_id in enumerate(unique_ids):
        _, _, _, originalFeatures, forwardCount = self.tracked_data[unique_id]
        # Slice with [uu] (a list) so each state keeps its leading batch axis.
        lstmState = [s1[0][[uu], :], s1[1][[uu], :], s2[0][[uu], :], s2[1][[uu], :]]
        if forwardCount == 0:
            originalFeatures = [s1[0][[uu], :], s1[1][[uu], :], s2[0][[uu], :], s2[1][[uu], :]]

        # Shift output box to full image coordinate system.
        pastBBoxPadded = pastBBoxesPadded[uu]
        outputBox = bb_util.from_crop_coordinate_system(rawOutput[uu, :].squeeze() / 10.0, pastBBoxPadded, 1, 1)

        if forwardCount > 0 and forwardCount % MAX_TRACK_LENGTH == 0:
            # Periodic reset for this track only; results go to separate
            # names so the batched s1/s2 stay intact for the remaining ids.
            self._profiler.start(self._re3_crop2_profiler)
            croppedInput, _ = im_util.get_cropped_input(image, outputBox, CROP_PAD, CROP_SIZE)
            resetInput = np.tile(croppedInput[np.newaxis, ...], (2, 1, 1, 1))
            self._profiler.stop(self._re3_crop2_profiler)
            feed_dict = {
                self.imagePlaceholder: resetInput,
                self.prevLstmState: originalFeatures,
                self.batch_size: 1,
            }
            self._profiler.start(self._re3_sess2_profiler)
            _, s1_new, s2_new = self.sess.run([self.outputs, self.state1, self.state2], feed_dict=feed_dict)
            self._profiler.stop(self._re3_sess2_profiler)
            lstmState = [s1_new[0], s1_new[1], s2_new[0], s2_new[1]]

        forwardCount += 1
        self.total_forward_count += 1

        if unique_id in starting_boxes:
            # Use label if it's given.
            outputBox = np.array(starting_boxes[unique_id])

        outputBoxes[uu, :] = outputBox
        self.tracked_data[unique_id] = (lstmState, outputBox, image, originalFeatures, forwardCount)

    end_time = time.time()
    trackTime = end_time - start_time - image_read_time
    if self.total_forward_count > 0:
        self.time += trackTime
    if SPEED_OUTPUT and self.total_forward_count % 100 == 0:
        # Durations can be exactly zero on coarse clocks; clamp every
        # denominator the same way the mean already is.
        safeTrackTime = max(.00001, trackTime)
        print('Current tracking speed per object: %.3f FPS' % (len(unique_ids) / safeTrackTime))
        print('Current tracking speed per frame: %.3f FPS' % (1 / safeTrackTime))
        print('Current image read speed: %.3f FPS' % (1 / max(.00001, image_read_time)))
        print('Mean tracking speed per object: %.3f FPS\n' % (self.total_forward_count / max(.00001, self.time)))
    return outputBoxes
def track(self, unique_id, image, starting_box=None):
    """Advance the track ``unique_id`` by one frame, keeping conv features.

    Args:
        unique_id: Key under which the track state lives in
            ``self.tracked_data``.
        image: Either a file path (``str``), read with ``cv2.imread`` with the
            channel axis reversed, or an already decoded array (copied).
        starting_box: When given, (re)initialises the track at this box; the
            returned box is then this box rather than the network output.

    Returns:
        Tuple ``(outputBox, self.tracked_data)``. Each ``tracked_data`` entry is
        ``(lstmState, box, image, originalFeatures, forwardCount,
        convFeatures)`` where ``convFeatures`` is a one-element list holding
        the evaluated ``self.conv_layers`` of the most recent forward pass.

    Raises:
        Exception: If ``starting_box`` is ``None`` and ``unique_id`` has no
            stored state.
    """
    start_time = time.time()
    if isinstance(image, str):
        image = cv2.imread(image)[:, :, ::-1]
    else:
        image = image.copy()
    image_read_time = time.time() - start_time

    if starting_box is not None:
        lstmState = [np.zeros((1, LSTM_SIZE)) for _ in range(4)]
        # NOTE(review): debug output on every (re)initialisation; kept to
        # preserve behaviour -- confirm whether it can be removed.
        print("starting_box>>")
        # Turns a list into a numpy array if needed and copies for safety.
        pastBBox = np.array(starting_box)
        prevImage = image
        originalFeatures = None
        forwardCount = 0
    elif unique_id in self.tracked_data:
        # The stored conv features are recomputed below, so they are ignored.
        lstmState, pastBBox, prevImage, originalFeatures, forwardCount, _ = self.tracked_data[unique_id]
    else:
        raise Exception('Unique_id %s with no initial bounding box' % unique_id)

    croppedInput0, pastBBoxPadded = im_util.get_cropped_input(prevImage, pastBBox, CROP_PAD, CROP_SIZE)
    croppedInput1, _ = im_util.get_cropped_input(image, pastBBox, CROP_PAD, CROP_SIZE)

    # Fetch everything in a single run. Previously self.conv_layers was
    # obtained from a second sess.run with the identical feed_dict, which
    # evaluated the network twice per frame.
    # NOTE(review): self.conv_layers1 is fetched but its value is never used;
    # confirm which of the two feature sets is meant to be stored.
    fetches = [self.outputs, self.state1, self.state2, self.conv_layers1, self.conv_layers]

    feed_dict = {
        self.imagePlaceholder: [croppedInput0, croppedInput1],
        self.prevLstmState: lstmState,
        self.batch_size: 1,
    }
    rawOutput, s1, s2, _, convLayersValue = self.sess.run(fetches, feed_dict=feed_dict)
    # Wrapped in a list to match the shape sess.run([self.conv_layers]) returns.
    convFeatures1 = [convLayersValue]
    lstmState = [s1[0], s1[1], s2[0], s2[1]]

    if forwardCount == 0:
        # Remember the state after the first pass; it seeds the periodic reset.
        originalFeatures = [s1[0], s1[1], s2[0], s2[1]]

    # Shift output box to full image coordinate system.
    outputBox = bb_util.from_crop_coordinate_system(rawOutput.squeeze() / 10.0, pastBBoxPadded, 1, 1)

    if forwardCount > 0 and forwardCount % MAX_TRACK_LENGTH == 0:
        # Periodic reset: re-run on the new box from the first-frame state.
        # The LSTM state and conv features of this pass replace the ones above.
        croppedInput, _ = im_util.get_cropped_input(image, outputBox, CROP_PAD, CROP_SIZE)
        resetInput = np.tile(croppedInput[np.newaxis, ...], (2, 1, 1, 1))
        feed_dict = {
            self.imagePlaceholder: resetInput,
            self.prevLstmState: originalFeatures,
            self.batch_size: 1,
        }
        _, s1, s2, _, convLayersValue = self.sess.run(fetches, feed_dict=feed_dict)
        convFeatures1 = [convLayersValue]
        lstmState = [s1[0], s1[1], s2[0], s2[1]]

    forwardCount += 1
    self.total_forward_count += 1

    if starting_box is not None:
        # Use label if it's given.
        outputBox = np.array(starting_box)

    self.tracked_data[unique_id] = (lstmState, outputBox, image, originalFeatures, forwardCount, convFeatures1)

    end_time = time.time()
    if self.total_forward_count > 0:
        self.time += (end_time - start_time - image_read_time)
    return outputBox, self.tracked_data
def get_data_sequence(self):
    """Assemble one training sequence of ``self.delta`` crop pairs and labels.

    Each step ``dd`` produces a pair of crops (``tImage[dd, 0]`` taken around
    ``bboxPrev``, ``tImage[dd, 1]`` taken around ``noisyBox``) and a label
    that expresses the ground-truth box relative to ``noisyBox``. Frames come
    either from ``simulator`` or from ``self.getData()``, chosen at random.
    When ``gtType < USE_NETWORK_PROB`` the network itself is run after each
    step (except the last) and its prediction becomes the next ``bboxPrev``;
    otherwise the ground truth does.

    Returns:
        ``(tImage, xyxyLabels)`` where ``tImage`` is a uint8 array reshaped to
        ``(self.delta * 2, CROP_SIZE, CROP_SIZE, 3)`` and ``xyxyLabels`` is a
        float32 array with one row of 4 values per step, multiplied by 10.
        If any exception is raised, the traceback is printed, ``pdb`` is
        entered, and the function falls through returning ``None``.
    """
    try:
        # Preallocate the space for the images and labels.
        tImage = np.zeros((self.delta, 2, CROP_SIZE, CROP_SIZE, 3), dtype=np.uint8)
        xywhLabels = np.zeros((self.delta, 4), dtype=np.float32)

        # Per-sequence random choices. The order of these draws determines how
        # a seeded random stream is consumed, so it must not be rearranged.
        mirrored = random.random() < 0.5
        useSimulator = random.random() < USE_SIMULATOR
        gtType = random.random()
        realMotion = random.random() < REAL_MOTION_PROB

        # Initialize first frame (give the network context).
        if useSimulator:
            # Initialize the simulation and run through a few frames.
            trackingObj, trackedObjects, background = simulator.create_new_track()
            # Step a random number of frames, stopping early once the
            # measured occlusion exceeds .2.
            for _ in range(random.randint(0, 200)):
                simulator.step(trackedObjects)
                bbox = trackingObj.get_object_box()
                occlusion = simulator.measure_occlusion(
                    bbox, trackingObj.occluder_boxes, cropPad=1)
                if occlusion > .2:
                    break
            # Then keep stepping (at most 1000 times) until the measured
            # occlusion drops below 0.01.
            for _ in range(1000):
                bbox = trackingObj.get_object_box()
                occlusion = simulator.measure_occlusion(
                    bbox, trackingObj.occluder_boxes, cropPad=1)
                if occlusion < 0.01:
                    break
                simulator.step(trackedObjects)
            initBox = trackingObj.get_object_box()
            if self.debug:
                images = [
                    simulator.get_image_for_frame(trackedObjects, background)
                ]
            else:
                # Placeholder frame: outside debug mode `images` is only read
                # for images[0].shape when useSimulator is set.
                images = [np.zeros((SIMULATION_HEIGHT, SIMULATION_WIDTH))]
        else:
            # Read a new data sequence from batch cache and get the ground truth.
            (batchKey, images) = self.getData()
            gtKey = batchKey
            imageIndex = self.key_lookup[gtKey]
            initBox = self.datasets[gtKey[0]][imageIndex, :4].copy()
        if self.debug:
            # Network predictions collected for the visualisation below.
            bboxes = []
            cropBBoxes = []

        # bboxPrev starts at the initial box and is the best guess (or gt) for the image0 location.
        # noisyBox holds the bboxPrev estimate plus some noise.
        bboxPrev = initBox
        lstmState = None

        for dd in range(self.delta):
            # bboxOn is the gt location in image1
            if useSimulator:
                bboxOn = trackingObj.get_object_box()
            else:
                # Advance element 3 of the key by dd to look up this frame.
                newKey = list(gtKey)
                newKey[3] += dd
                newKey = tuple(newKey)
                imageIndex = self.key_lookup[newKey]
                bboxOn = self.datasets[newKey[0]][imageIndex, :4].copy()
            if dd == 0:
                # First step: crop exactly around the ground truth.
                noisyBox = bboxOn.copy()
            elif not realMotion and not useSimulator and gtType >= USE_NETWORK_PROB:
                noisyBox = self.add_noise(bboxOn, bboxOn, images[0].shape[1], images[0].shape[0])
            else:
                noisyBox = self.fix_bbox_intersection(
                    bboxPrev, bboxOn, images[0].shape[1], images[0].shape[0])

            # First crop of the pair, taken around bboxPrev.
            if useSimulator:
                patch = simulator.render_patch(bboxPrev, background, trackedObjects)
                tImage[dd, 0, ...] = patch
                if dd > 0:
                    # Step the simulation only after the first crop has been
                    # rendered, then refresh the ground truth and crop box.
                    simulator.step(trackedObjects)
                    bboxOn = trackingObj.get_object_box()
                    noisyBox = self.fix_bbox_intersection(
                        bboxPrev, bboxOn, images[0].shape[1], images[0].shape[0])
            else:
                tImage[dd, 0, ...] = im_util.get_cropped_input(
                    images[max(dd - 1, 0)], bboxPrev, CROP_PAD, CROP_SIZE)[0]

            # Second crop of the pair, taken around noisyBox.
            if useSimulator:
                patch = simulator.render_patch(noisyBox, background, trackedObjects)
                tImage[dd, 1, ...] = patch
                if self.debug:
                    images.append(
                        simulator.get_image_for_frame(
                            trackedObjects, background))
            else:
                tImage[dd, 1, ...] = im_util.get_cropped_input(
                    images[dd], noisyBox, CROP_PAD, CROP_SIZE)[0]

            # Label: ground-truth box expressed relative to the noisyBox crop.
            shiftedBBox = bb_util.to_crop_coordinate_system(
                bboxOn, noisyBox, CROP_PAD, 1)
            shiftedBBoxXYWH = bb_util.xyxy_to_xywh(shiftedBBox)
            xywhLabels[dd, :] = shiftedBBoxXYWH

            if gtType < USE_NETWORK_PROB:
                # Run through a single forward pass to get the next box estimate.
                # Skipped on the last step, where no further crop needs it.
                if dd < self.delta - 1:
                    if dd == 0:
                        lstmState = self.initialLstmState

                    feed_dict = {
                        self.forwardNetworkImagePlaceholder: tImage[dd, ...],
                        self.prevLstmState: lstmState
                    }
                    networkOuts, s1, s2 = self.sess.run(
                        [self.networkOutputs, self.state1, self.state2],
                        feed_dict=feed_dict)
                    # Carry the LSTM state over to the next step.
                    lstmState = (s1[0], s1[1], s2[0], s2[1])

                    # Undo the x10 scaling and map the prediction from crop
                    # coordinates back using noisyBox.
                    xyxyPred = networkOuts.squeeze() / 10
                    outputBox = bb_util.from_crop_coordinate_system(
                        xyxyPred, noisyBox, CROP_PAD, 1)

                    bboxPrev = outputBox
                    if self.debug:
                        bboxes.append(outputBox)
                        cropBBoxes.append(xyxyPred)
            else:
                # Ground truth is used as the previous box for the next step.
                bboxPrev = bboxOn

            if self.debug:
                # Look at the inputs to make sure they are correct.
                image0 = tImage[dd, 0, ...].copy()
                image1 = tImage[dd, 1, ...].copy()

                xyxyLabel = bb_util.xywh_to_xyxy(
                    xywhLabels[dd, :].squeeze())
                print('xyxy raw', xyxyLabel, 'actual', xyxyLabel * CROP_PAD)
                # NOTE(review): `label` is drawn into but not included in
                # `plots` below, so it is never displayed.
                label = np.zeros((CROP_PAD, CROP_PAD))
                drawing.drawRect(label, xyxyLabel * CROP_PAD, 0, 1)
                drawing.drawRect(
                    image0,
                    bb_util.xywh_to_xyxy(np.full((4, 1), .5) * CROP_SIZE), 2,
                    [255, 0, 0])
                bigImage0 = images[max(dd - 1, 0)].copy()
                bigImage1 = images[dd].copy()
                if dd < len(cropBBoxes):
                    # Only steps that ran the network have a prediction.
                    drawing.drawRect(bigImage1, bboxes[dd], 5, [255, 0, 0])
                    drawing.drawRect(image1, cropBBoxes[dd] * CROP_SIZE, 1,
                                     [0, 255, 0])
                    print('pred raw', cropBBoxes[dd], 'actual',
                          cropBBoxes[dd] * CROP_PAD)
                print('\n')

                label[0, 0] = 1
                label[0, 1] = 0
                plots = [bigImage0, bigImage1, image0, image1]
                subplot = drawing.subplot(plots, 2, 2,
                                          outputWidth=OUTPUT_WIDTH,
                                          outputHeight=OUTPUT_HEIGHT,
                                          border=5)
                # The channel axis is reversed before display.
                cv2.imshow('debug', subplot[:, :, ::-1])
                cv2.waitKey(1)

        if mirrored:
            # Move the two image axes to the front so np.fliplr (which flips
            # axis 1) reverses original axis 3 of every crop, then restore the
            # axis order. The first label component is mirrored to match.
            tImage = np.fliplr(tImage.transpose(2, 3, 4, 0, 1)).transpose(
                3, 4, 0, 1, 2)
            xywhLabels[..., 0] = 1 - xywhLabels[..., 0]

        # Merge the (step, pair) axes into a single leading axis.
        tImage = tImage.reshape([self.delta * 2] + list(tImage.shape[2:]))
        xyxyLabels = bb_util.xywh_to_xyxy(xywhLabels.T).T * 10
        xyxyLabels = xyxyLabels.astype(np.float32)
        return tImage, xyxyLabels
    except Exception as e:
        # NOTE(review): any failure drops into an interactive debugger and
        # then returns None implicitly; this blocks unattended runs. Confirm
        # whether this is still wanted outside of debugging sessions.
        import traceback
        traceback.print_exc()
        import pdb
        pdb.set_trace()
        print('exception')
def track(self, unique_id, image, starting_box=None):
    """Advance the track ``unique_id`` by one frame and return its box.

    Args:
        unique_id: Key under which the track state lives in
            ``self.tracked_data``.
        image: Either a file path (``str``), read with ``cv2.imread`` with the
            channel axis reversed, or an already decoded array (copied).
        starting_box: When given, (re)initialises the track at this box; the
            returned box is then this box rather than the network output.

    Returns:
        The box for this frame in full-image coordinates.

    Raises:
        Exception: If ``starting_box`` is ``None`` and ``unique_id`` has no
            stored state.
    """
    start_time = time.time()
    if isinstance(image, str):
        image = cv2.imread(image)[:, :, ::-1]
    else:
        image = image.copy()
    image_read_time = time.time() - start_time

    if starting_box is not None:
        lstm_state = None
        # Turns a list into a numpy array if needed and copies for safety.
        past_bbox = np.array(starting_box)
        prev_image = image
        original_features = None
        forward_count = 0
    elif unique_id in self.tracked_data:
        lstm_state, past_bbox, prev_image, original_features, forward_count = self.tracked_data[unique_id]
    else:
        raise Exception("Unique_id %s with no initial bounding box" % unique_id)

    # Crop the previous and the current frame around the previous box.
    cropped_input0, past_b_box_padded = im_util.get_cropped_input(prev_image, past_bbox, CROP_PAD, CROP_SIZE)
    cropped_input1, _ = im_util.get_cropped_input(image, past_bbox, CROP_PAD, CROP_SIZE)

    image_input = pt_util.from_numpy(np.stack([cropped_input0, cropped_input1]))
    raw_output = self.network(image_input, lstm_state)
    raw_output = pt_util.to_numpy_array(raw_output)
    # The network exposes its updated recurrent state as an attribute.
    lstm_state = self.network.lstm_state

    if forward_count == 0:
        # Detached copies of the first-pass state seed the periodic reset.
        original_features = [var.clone().detach() for var in self.network.lstm_state]

    # Shift output box to full image coordinate system.
    output_box = bb_util.from_crop_coordinate_system(raw_output.squeeze() / 10.0, past_b_box_padded, 1, 1)

    if forward_count > 0 and forward_count % MAX_TRACK_LENGTH == 0:
        # Periodic reset: re-run on the new box from the first-frame state.
        # Only the resulting recurrent state is kept.
        cropped_input, _ = im_util.get_cropped_input(image, output_box, CROP_PAD, CROP_SIZE)
        image_input = pt_util.from_numpy(np.tile(cropped_input[np.newaxis, ...], (2, 1, 1, 1)))
        self.network(image_input, original_features)
        lstm_state = self.network.lstm_state

    forward_count += 1
    self.total_forward_count += 1

    if starting_box is not None:
        # Use label if it's given.
        output_box = np.array(starting_box)

    self.tracked_data[unique_id] = (lstm_state, output_box, image, original_features, forward_count)

    end_time = time.time()
    track_time = end_time - start_time - image_read_time
    if self.total_forward_count > 0:
        self.t_time += track_time
    if SPEED_OUTPUT and self.total_forward_count % 100 == 0:
        # Durations can be exactly zero on coarse clocks; clamp every
        # denominator the same way the mean already is.
        print("Current tracking speed: %.3f FPS" % (1 / max(0.00001, track_time)))
        print("Current image read speed: %.3f FPS" % (1 / max(0.00001, image_read_time)))
        print("Mean tracking speed: %.3f FPS\n" % (self.total_forward_count / max(0.00001, self.t_time)))
    return output_box