def forward_step(self, input_var, hidden, encoder_outputs, encoder_mask):
    """Decode ``input_var`` and return per-step output distributions.

    The tokens are embedded, passed through input dropout and the LSTM, then
    optionally through the attention module. When copying is enabled, a
    sigmoid gate is computed from the concatenation of the decoder output
    and the embedded input.

    Args:
        input_var: token ids; dim 0 is the batch size and dim 1 the number
            of decoding steps.
        hidden: recurrent state passed to ``self.lstm``.
        encoder_outputs: encoder states forwarded to ``self.attention``.
        encoder_mask: source mask forwarded to ``self.attention``.

    Returns:
        ``(predicted_softmax, hidden, attn, p_copy)``. ``predicted_softmax``
        is reshaped to ``(batch, steps, -1)``; ``attn`` is ``None`` without
        attention and ``p_copy`` is ``None`` without copying, otherwise
        ``p_copy`` has shape ``(batch, steps)``.
    """
    n_batch, n_steps = input_var.size(0), input_var.size(1)

    token_vecs = self.input_dropout(self.embedding(input_var))
    output, hidden = self.lstm(token_vecs, hidden)

    attn = None
    if self.use_attention:
        # output ~ [ht, attn_ctx]
        output, attn = self.attention(output, encoder_outputs, encoder_mask)

    p_copy = None
    if self.use_copy:
        # One gate value per (batch, step) position.
        gate_input = torch.cat((output, token_vecs), dim=2)
        gate_logits = self.copy(gate_input.view(n_batch * n_steps, -1))
        p_copy = F.sigmoid(gate_logits).squeeze(1).view(n_batch, n_steps)

    flat_scores = self.out(output.contiguous().view(-1, self.hidden_dim))
    predicted_softmax = stable_softmax(flat_scores).view(n_batch, n_steps, -1)

    return predicted_softmax, hidden, attn, p_copy
def forward(self, x):
    """Run one forward pass and cache every intermediate on ``self``.

    Stores the hidden pre-activation (``z1``), the hidden activation
    (``h``), the output pre-activation (``z2``) and the softmax output
    (``y_hat``). Nothing is returned.

    Args:
        x: input passed to ``self.W1.dot``.
    """
    relu_elementwise = np.vectorize(rectified_linear_unit)

    # Hidden layer: wrap the product as a column before adding the bias.
    hidden_pre = np.array([self.W1.dot(x)]).T + self.b1
    self.z1 = hidden_pre
    # Back to a flat vector for the next matrix product.
    self.h = relu_elementwise(hidden_pre).T[0]

    # Output layer.
    output_pre = np.array([self.W2.dot(self.h)]).T + self.b2
    self.z2 = output_pre
    self.y_hat = stable_softmax(output_pre)
def forward(self, output, context, mask):
    """Attend over ``context`` for every decoder step in ``output``.

    Scores are additive, ``v(tanh(W_q * output + U_h * context))``, modelled
    on the OpenNMT-py global attention module:
    https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/modules/global_attention.py#L121

    Args:
        output: decoder states of shape ``(batch, tgt_len, dim)``.
        context: encoder states of shape ``(batch, src_len, dim)``.
        mask: source mask; it is unsqueezed at dim 1 and expanded to
            ``(batch, tgt_len, src_len)`` before being passed to
            ``stable_softmax``.
            NOTE(review): the retired inline implementation multiplied the
            exponentials by ``1 - mask``, so 1 presumably marks positions to
            ignore -- confirm against ``stable_softmax``.

    Returns:
        ``(output, attn)``: the attended decoder states of shape
        ``(batch, tgt_len, dim)`` and the attention weights returned by
        ``stable_softmax`` for scores of shape ``(batch, tgt_len, src_len)``.

    Raises:
        AssertionError: if batch sizes or feature sizes disagree with each
            other or with ``self.dim``.
        SystemExit: with status 1 if a NaN shows up in any intermediate.
    """
    dim = self.dim
    src_batch, src_len, src_dim = context.size()
    tgt_batch, tgt_len, tgt_dim = output.size()
    assert src_batch == tgt_batch
    assert src_dim == tgt_dim
    assert src_dim == self.dim

    def _abort_on_nan(label, tensor, show_rows=False):
        """Dump ``tensor`` and terminate the process if it contains a NaN."""
        if not bool(torch.isnan(tensor).any()):
            return
        print('nan found in {}:'.format(label), tensor)
        if show_rows:
            # Also print each (batch, step) row that holds a NaN.
            for i in range(tgt_batch):
                for j in range(tgt_len):
                    row = tensor[i, j, :]
                    if bool(torch.isnan(row).any()):
                        print(row)
        # A bare sys.exit() reports success (status 0); a NaN abort is a
        # failure and must be visible to whatever launched the run.
        sys.exit(1)

    _abort_on_nan('output', output, show_rows=True)
    _abort_on_nan('context', context)

    # Project queries and broadcast them over the source positions.
    wq = self.linear_query(output.contiguous().view(-1, dim))
    wq = wq.view(tgt_batch, tgt_len, 1, dim)
    wq = wq.expand(tgt_batch, tgt_len, src_len, dim)
    _abort_on_nan('wq', wq)

    # Project the context and broadcast it over the target steps.
    uh = self.linear_context(context.contiguous().view(-1, dim))
    uh = uh.view(src_batch, 1, src_len, dim)
    uh = uh.expand(src_batch, tgt_len, src_len, dim)
    _abort_on_nan('uh', uh)

    wquh = F.tanh(wq + uh)
    _abort_on_nan('wquh', wquh)

    score = self.v(wquh.view(-1, dim)).view(tgt_batch, tgt_len, src_len)
    _abort_on_nan('score', score, show_rows=True)

    # mask = [batch_size, tgt_len, src_len]
    mask = mask.unsqueeze(1).expand_as(score)
    attn = stable_softmax(score, mask)

    # (batch, out_len, in_len) * (batch, in_len, dim) -> (batch, out_len, dim)
    mix = torch.bmm(attn, context)

    # concat -> (batch, out_len, 2*dim)
    combined = torch.cat((mix, output), dim=2)

    # output -> (batch, out_len, dim)
    output = self.linear_out(combined.view(-1, 2 * dim)).view(
        tgt_batch, -1, dim)

    # output ~ [ht, attn_ctx]
    return output, attn
def detect_video(model, dataset, video_path, start_frame=16900,
                 end_frame=17200, output_path="video_real.avi"):
    '''Experimental: estimate the pose on a window of video frames.

    Frames are read sequentially from ``video_path``. Each frame whose
    1-based index is greater than ``start_frame`` is cropped, padded,
    converted to grayscale, run through ``model.detect`` and written to an
    MJPG video at ``output_path`` with the estimated axes drawn on it.
    Reading stops at the end of the video or once the index exceeds
    ``end_frame``.

    Args:
        model: detector exposing ``detect`` and ``config``.
        dataset: provides ``camera`` (width, height, fov_x, fov_y) and
            ``ori_histogram_map``.
        video_path: path of the input video.
        start_frame: frames up to and including this index are skipped.
        end_frame: processing stops after the first frame whose index is
            greater than this value.
        output_path: file name of the video that is written.
    '''
    import cv2

    # Video capture
    vcapture = cv2.VideoCapture(video_path)
    vwriter = None
    try:
        fps = vcapture.get(cv2.CAP_PROP_FPS)

        # Camera projection mat
        # TODO: work on original image size not 1/2
        width = dataset.camera.width / 2
        height = dataset.camera.height / 2

        fx = width / (2 * np.tan(dataset.camera.fov_x / 2))
        fy = -height / (2 * np.tan(dataset.camera.fov_y / 2))
        K = np.matrix([[fx, 0, width / 2], [0, fy, height / 2], [0, 0, 1]])

        R_cam_unreal = np.matrix([[0, 1, 0], [0, 0, 1], [1, 0, 0]])

        # Define codec and create video writer
        # NOTE(review): the writer size comes from the dataset camera, while
        # the frames written below are slices of molded_image -- confirm the
        # two sizes agree.
        vwriter = cv2.VideoWriter(output_path,
                                  cv2.VideoWriter_fourcc(*'MJPG'), fps,
                                  (int(width), int(height)))

        count = 0
        # NOTE(review): collected per frame but neither returned nor stored.
        pose_est_acc = []
        success = True
        while success:
            print("frame: ", count)
            count += 1

            # Read next image
            success, image = vcapture.read()

            if success and count > start_frame:
                # OpenCV returns images as BGR, convert to RGB
                image = image[..., ::-1]

                image = image[:, 1:-150, :]  # crop
                image = np.pad(image, [(400, 400), (400, 400), (0, 0)],
                               mode='constant', constant_values=0)

                # Weighted grayscale, replicated over the three channels
                image[:, :, 0] = (0.21 * image[:, :, 0] +
                                  0.72 * image[:, :, 1] +
                                  0.07 * image[:, :, 2])
                image[:, :, 1] = image[:, :, 0]
                image[:, :, 2] = image[:, :, 0]

                # Resize to network input shape
                molded_image, window, scale, padding, crop = \
                    utils.resize_image(
                        image,
                        min_dim=model.config.IMAGE_MIN_DIM,
                        min_scale=model.config.IMAGE_MIN_SCALE,
                        max_dim=model.config.IMAGE_MAX_DIM,
                        mode=model.config.IMAGE_RESIZE_MODE)

                # Detect objects
                results = model.detect([image], verbose=0)[0]

                loc_est = results['loc']
                ori_pmf = utils.stable_softmax(results['ori'])
                q_est, q_est_cov = se3lib.quat_weighted_avg(
                    dataset.ori_histogram_map, ori_pmf)

                print(str(loc_est[2]) + " " + str(loc_est[0]) + " " +
                      str(loc_est[1]))

                # Recover Unreal orientation: R_wo
                R_co = se3lib.quat2SO3(q_est)
                R_co = R_cam_unreal.T * R_co
                R_wc = se3lib.euler2SO3_unreal(0, 0, 0)
                R_wo = R_wc * R_co
                roll, pitch, yaw = se3lib.SO32euler(R_wo)

                # Stack frame gt
                pose_est = np.array([loc_est[2], loc_est[0], loc_est[1],
                                     -pitch, yaw, -roll])
                pose_est_acc.append(pose_est)

                # Crop and resize image to match original input size
                margin = (model.config.IMAGE_MAX_DIM - 480) // 2
                image = molded_image[
                    margin:model.config.IMAGE_MAX_DIM - margin, :, :]

                # Show image
                utils.plot_axes(image, q_est, loc_est, K, 5.0)

                nr_bins_per_dim = model.config.ORI_BINS_PER_DIM
                utils.visualize_weights(ori_pmf, ori_pmf, nr_bins_per_dim)

                # Add image to video writer
                vwriter.write(image)

            if count > end_frame:
                success = False
    finally:
        # Release both handles even if a frame fails, so the capture is not
        # leaked and the output file is finalised.
        vcapture.release()
        if vwriter is not None:
            vwriter.release()
def detect_dataset(model, dataset, nr_images):
    """ Tests model on N random images of the dataset and shows the results.

    For each of ``nr_images`` randomly chosen image ids the pose is
    estimated (regressed, or decoded from the predicted PMFs, depending on
    ``model.config``), the location and angular errors are printed, and the
    ground-truth and estimated poses are plotted.

    Args:
        model: detector exposing ``detect`` and ``config``.
        dataset: dataset providing images, ground-truth poses, camera
            intrinsics and histogram maps.
        nr_images: number of random images to process.
    """

    # Variance used only for prob. orientation estimation
    delta = model.config.BETA / model.config.ORI_BINS_PER_DIM
    var = delta**2 / 12

    for _ in range(nr_images):

        image_id = random.choice(dataset.image_ids)

        # Load pose in all formats
        loc_gt = dataset.load_location(image_id)
        q_gt = dataset.load_quaternion(image_id)
        I, I_meta, loc_encoded_gt, ori_encoded_gt = \
            net.load_image_gt(dataset, model.config, image_id)
        image_ori = dataset.load_image(image_id)

        info = dataset.image_info[image_id]

        # Run detection
        results = model.detect([image_ori], verbose=1)

        # Retrieve location
        if model.config.REGRESS_LOC:
            loc_est = results[0]['loc']
        else:
            loc_pmf = utils.stable_softmax(results[0]['loc'])

            # Compute location mean according to first moment
            loc_est = np.asmatrix(loc_pmf) * np.asmatrix(
                dataset.histogram_3D_map)

            # Decode the encoded ground truth the same way (plotted below)
            loc_decoded_gt = np.asmatrix(loc_encoded_gt) * np.asmatrix(
                dataset.histogram_3D_map)

        # Retrieve orientation
        if model.config.REGRESS_ORI:

            if model.config.ORIENTATION_PARAM == 'quaternion':
                q_est = results[0]['ori']
            elif model.config.ORIENTATION_PARAM == 'euler_angles':
                q_est = se3lib.SO32quat(
                    se3lib.euler2SO3_left(results[0]['ori'][0],
                                          results[0]['ori'][1],
                                          results[0]['ori'][2]))
            elif model.config.ORIENTATION_PARAM == 'angle_axis':
                theta = np.linalg.norm(results[0]['ori'])
                if theta < 1e-6:
                    v = [0, 0, 0]
                else:
                    v = results[0]['ori'] / theta
                q_est = se3lib.angleaxis2quat(v, theta)
        else:
            ori_pmf = utils.stable_softmax(results[0]['ori'])

            # Compute mean quaternion
            q_est, q_est_cov = se3lib.quat_weighted_avg(
                dataset.ori_histogram_map, ori_pmf)

            # Multimodal estimation
            # Uncomment this block to try the EM framework
            # nr_EM_iterations = 5
            # Q_mean, Q_var, Q_priors, model_scores = fit_GMM_to_orientation(dataset.ori_histogram_map, ori_pmf, nr_EM_iterations, var)
            # print('Multimodal errors',2 * np.arccos(np.abs(np.asmatrix(Q_mean) * np.asmatrix(q_gt).transpose())) * 180 / np.pi)
            #
            # q_est_1 = Q_mean[0, :]
            # q_est_2 = Q_mean[1, :]
            # utils.polar_plot(q_est_1, q_est_2)

        # Compute Errors
        # |<q_est, q_gt>| can exceed 1 by rounding, which would make arccos
        # return NaN; cap it at 1.
        q_dot = np.abs(np.asmatrix(q_est) * np.asmatrix(q_gt).transpose())
        angular_err = 2 * np.arccos(np.minimum(q_dot, 1.0)) * 180 / np.pi
        loc_err = np.linalg.norm(loc_est - loc_gt)

        print('GT location: ', loc_gt)
        print('Est location: ', loc_est)
        print('Processed Image:', info['path'])
        print('Est orientation: ', q_est)
        print('GT_orientation: ', q_gt)

        print('Location error: ', loc_err)
        print('Angular error: ', angular_err)

        # Visualize PMFs
        if not model.config.REGRESS_ORI:
            nr_bins_per_dim = model.config.ORI_BINS_PER_DIM
            utils.visualize_weights(ori_encoded_gt, ori_pmf, nr_bins_per_dim)

        # Show image
        fig, (ax_1, ax_2) = plt.subplots(1, 2, figsize=(12, 8))
        ax_1.imshow(image_ori)
        ax_1.set_xticks([])
        ax_1.set_yticks([])
        ax_2.imshow(image_ori)
        ax_2.set_xticks([])
        ax_2.set_yticks([])

        height_ori = np.shape(image_ori)[0]
        width_ori = np.shape(image_ori)[1]

        # Recover focal lengths
        fx = dataset.camera.fx
        fy = dataset.camera.fy

        K = np.matrix([[fx, 0, width_ori / 2], [0, fy, height_ori / 2],
                       [0, 0, 1]])

        # Speed labels expresses q_obj_cam whereas
        # Urso labels expresses q_cam_obj
        if dataset.name == 'Speed':
            q_est = se3lib.quat_inv(q_est)
            q_gt = se3lib.quat_inv(q_gt)

        utils.visualize_axes(ax_1, q_gt, loc_gt, K, 100)
        utils.visualize_axes(ax_2, q_est, loc_est, K, 100)

        utils.polar_plot(q_gt, q_est)

        # Location overlap visualization
        fig, ax = plt.subplots()
        ax.imshow(image_ori)

        # Project 3D coords for visualization
        # When location is decoded from the PMF, loc_est is a 1x3 matrix and
        # loc_est[2] would raise IndexError; index a flat view instead.
        loc_est_xyz = np.asarray(loc_est).ravel()
        x_est = loc_est_xyz[0] / loc_est_xyz[2]
        y_est = loc_est_xyz[1] / loc_est_xyz[2]

        x_gt = loc_gt[0] / loc_gt[2]
        y_gt = loc_gt[1] / loc_gt[2]

        if not model.config.REGRESS_LOC:
            x_decoded_gt = loc_decoded_gt[0, 0] / loc_decoded_gt[0, 2]
            y_decoded_gt = loc_decoded_gt[0, 1] / loc_decoded_gt[0, 2]
            circ = Circle((x_decoded_gt * fx + width_ori / 2,
                           height_ori / 2 + y_decoded_gt * fy),
                          7, facecolor='b', label='encoded')
            ax.add_patch(circ)

        # Plot locations
        circ_gt = Circle(
            (x_gt * fx + width_ori / 2, height_ori / 2 + y_gt * fy),
            15, facecolor='r', label='gt')
        ax.add_patch(circ_gt)

        circ = Circle(
            (x_est * fx + width_ori / 2, height_ori / 2 + y_est * fy),
            10, facecolor='g', label='pred')
        ax.add_patch(circ)

        ax.legend(loc='upper right', shadow=True, fontsize='x-small')
        plt.show()
def evaluate(model, dataset):
    """ Evaluates model on all dataset images. Assumes all images have corresponding pose labels.

    For every image id the pose is estimated (from keypoints, by regression,
    or decoded from the predicted PMFs, depending on ``model.config``) and
    the location error, angular error in degrees and ESA score are
    accumulated. The means are printed and the per-image orientation
    errors, location errors and ground-truth depths are written to
    ``ori_err.csv``, ``loc_err.csv`` and ``dists_err.csv``.

    Args:
        model: detector exposing ``detect`` and ``config``.
        dataset: dataset providing images, ground-truth poses and histogram
            maps.
    """

    def _quat_angle(q_a, q_b):
        """Return ``2 * arccos(|<q_a, q_b>|)`` in radians as a 1x1 matrix."""
        q_dot = np.abs(np.asmatrix(q_a) * np.asmatrix(q_b).transpose())
        # Rounding can push |dot| just above 1, where arccos returns NaN and
        # would poison the means below; cap it at 1.
        return 2 * np.arccos(np.minimum(q_dot, 1.0))

    loc_err_acc = []
    loc_encoded_err_acc = []
    ori_err_acc = []
    ori_encoded_err_acc = []
    distances_acc = []
    esa_scores_acc = []

    # Variance used only for prob. orientation estimation
    delta = model.config.BETA / model.config.ORI_BINS_PER_DIM
    var = delta**2 / 12

    for image_id in dataset.image_ids:

        print('Image ID:', image_id)

        # Load pose in all formats
        loc_gt = dataset.load_location(image_id)
        q_gt = dataset.load_quaternion(image_id)
        image = dataset.load_image(image_id)

        results = model.detect([image], verbose=1)

        if model.config.REGRESS_KEYPOINTS:
            # Experimental

            I, I_meta, loc_gt, k1_gt, k2_gt = \
                net.load_image_gt(dataset, model.config, image_id)

            loc_est = results[0]['loc']
            k1_est = results[0]['k1']
            k2_est = results[0]['k2']

            # Prepare keypoint matches
            # TODO: take scale into account and get rid of magic numbers
            P1 = np.zeros((3, 3))
            P1[2, 0] = 3.0
            P1[1, 1] = 3.0

            P2 = np.zeros((3, 3))
            P2[:, 0] = k1_est
            P2[:, 1] = k2_est
            P2[:, 2] = loc_est

            t, R = se3lib.pose_3Dto3D(np.asmatrix(P1), np.asmatrix(P2))
            q_est = se3lib.SO32quat(R.T)

        else:

            I, I_meta, loc_encoded_gt, ori_encoded_gt = \
                net.load_image_gt(dataset, model.config, image_id)

            # Retrieve location
            if model.config.REGRESS_LOC:
                loc_est = results[0]['loc']
            else:
                loc_pmf = utils.stable_softmax(results[0]['loc'])

                # Compute location mean according to first moment
                loc_est = np.asmatrix(loc_pmf) * np.asmatrix(
                    dataset.histogram_3D_map)

                # Compute loc encoding error
                loc_decoded_gt = np.asmatrix(loc_encoded_gt) * np.asmatrix(
                    dataset.histogram_3D_map)
                loc_encoded_err = np.linalg.norm(loc_decoded_gt - loc_gt)
                loc_encoded_err_acc.append(loc_encoded_err)

            # Retrieve orientation
            if model.config.REGRESS_ORI:

                if model.config.ORIENTATION_PARAM == 'quaternion':
                    q_est = results[0]['ori']
                elif model.config.ORIENTATION_PARAM == 'euler_angles':
                    q_est = se3lib.SO32quat(
                        se3lib.euler2SO3_left(results[0]['ori'][0],
                                              results[0]['ori'][1],
                                              results[0]['ori'][2]))
                elif model.config.ORIENTATION_PARAM == 'angle_axis':
                    theta = np.linalg.norm(results[0]['ori'])
                    if theta < 1e-6:
                        v = [0, 0, 0]
                    else:
                        v = results[0]['ori'] / theta
                    q_est = se3lib.angleaxis2quat(v, theta)
            else:

                ori_pmf = utils.stable_softmax(results[0]['ori'])

                # Compute mean quaternion
                q_est, q_est_cov = se3lib.quat_weighted_avg(
                    dataset.ori_histogram_map, ori_pmf)

                # Multimodal estimation
                # Uncomment this block to try the EM framework
                # nr_EM_iterations = 5
                # Q_mean, Q_var, Q_priors, model_scores = fit_GMM_to_orientation(dataset.ori_histogram_map, ori_pmf,
                #                                                                nr_EM_iterations, var)
                #
                # print('Err:', angular_err)
                # angular_err = 2*np.arccos(np.abs(np.asmatrix(Q_mean)*np.asmatrix(q_gt).transpose()))*180/np.pi
                #
                # # Select best of two
                # if len(angular_err) == 1 or angular_err[0]<angular_err[1]:
                #     q_est = Q_mean[0, :]
                # else:
                #     q_est = Q_mean[1, :]
                #
                # print('Err:',angular_err)

                # Compute encoded error
                q_encoded_gt, _ = se3lib.quat_weighted_avg(
                    dataset.ori_histogram_map, ori_encoded_gt)
                ori_encoded_err = _quat_angle(q_encoded_gt,
                                              q_gt) * 180 / np.pi
                ori_encoded_err_acc.append(ori_encoded_err)

        # 3. Angular error
        angular_err_rad = _quat_angle(q_est, q_gt)
        angular_err = angular_err_rad * 180 / np.pi
        ori_err_acc.append(angular_err.item(0))

        # 4. Loc error
        loc_err = np.linalg.norm(loc_est - loc_gt)
        loc_err_acc.append(loc_err)

        print('Loc Error: ', loc_err)
        print('Ori Error: ', angular_err)

        # Compute ESA score
        esa_score = loc_err / np.linalg.norm(loc_gt) + angular_err_rad
        esa_scores_acc.append(esa_score)

        # Store depth
        distances_acc.append(loc_gt[2])

    print('Mean est. location error: ', np.mean(loc_err_acc))
    print('Mean est. orientation error: ', np.mean(ori_err_acc))
    print('ESA score: ', np.mean(esa_scores_acc))
    # Only filled when location is decoded from a PMF; the mean of an empty
    # list is NaN plus a RuntimeWarning.
    if loc_encoded_err_acc:
        print('Mean encoded location error: ', np.mean(loc_encoded_err_acc))

    # Dump results
    pd.DataFrame(np.asarray(ori_err_acc)).to_csv("ori_err.csv")
    pd.DataFrame(np.asarray(loc_err_acc)).to_csv("loc_err.csv")
    pd.DataFrame(np.asarray(distances_acc)).to_csv("dists_err.csv")
def _estimate_submission_pose(model, dataset, image_id):
    """Detect one image and return ``(filename, q_rect, loc_est)``.

    ``filename`` is the last ``/``-separated component of the image path,
    ``q_rect`` is the estimated quaternion reordered as ``[q[3], q[0],
    q[1], q[2]]`` and ``loc_est`` is the estimated location.
    """
    image = dataset.load_image(image_id)
    info = dataset.image_info[image_id]

    results = model.detect([image], verbose=1)

    # Retrieve location
    if model.config.REGRESS_LOC:
        loc_est = results[0]['loc']
    else:
        loc_pmf = utils.stable_softmax(results[0]['loc'])

        # Compute location mean according to first moment
        loc_est = np.asmatrix(loc_pmf) * np.asmatrix(
            dataset.histogram_3D_map)

    # Retrieve orientation
    if model.config.REGRESS_ORI:

        if model.config.ORIENTATION_PARAM == 'quaternion':
            q_est = results[0]['ori']
        elif model.config.ORIENTATION_PARAM == 'euler_angles':
            q_est = se3lib.SO32quat(
                se3lib.euler2SO3_left(results[0]['ori'][0],
                                      results[0]['ori'][1],
                                      results[0]['ori'][2]))
        elif model.config.ORIENTATION_PARAM == 'angle_axis':
            theta = np.linalg.norm(results[0]['ori'])
            if theta < 1e-6:
                v = [0, 0, 0]
            else:
                v = results[0]['ori'] / theta
            q_est = se3lib.angleaxis2quat(v, theta)
    else:
        ori_pmf = utils.stable_softmax(results[0]['ori'])

        # Compute mean quaternion
        q_est, _ = se3lib.quat_weighted_avg(dataset.ori_histogram_map,
                                            ori_pmf)

    # Change quaternion order
    q_rect = [q_est[3], q_est[0], q_est[1], q_est[2]]

    return info['path'].split('/')[-1], q_rect, loc_est


def test_and_submit(model, dataset_virtual, dataset_real):
    """ Evaluates model on ESA challenge test-set (no labels)
    and outputs submission file in a format compatible with the ESA server (probably down by now)

    Args:
        model: detector exposing ``detect`` and ``config``.
        dataset_virtual: synthetic test set; entries are added with
            ``SubmissionWriter.append_test``.
        dataset_real: real test set; entries are added with
            ``SubmissionWriter.append_real_test``.
    """

    # ESA API
    from submission import SubmissionWriter
    submission = SubmissionWriter()

    # Synthetic test set first, then the real one; both go through the same
    # estimation code and differ only in the writer method.
    test_sets = (
        (dataset_virtual, submission.append_test),
        (dataset_real, submission.append_real_test),
    )

    for dataset, append_entry in test_sets:
        for image_id in dataset.image_ids:

            print('Image ID:', image_id)

            filename, q_rect, loc_est = _estimate_submission_pose(
                model, dataset, image_id)
            append_entry(filename, q_rect, loc_est)

    submission.export(suffix='debug')
    print('Submission exported.')
def evaluate_image(model, dataset, image_id):
    """Estimate the pose of one image and score it against ground truth.

    Args:
        model: detector exposing ``detect`` and ``config``.
        dataset: dataset providing the image, ground-truth pose and
            histogram maps.
        image_id: id of the image to evaluate.

    Returns:
        ``(loc_err, angular_err, loc_rel_err, esa_score)`` where ``loc_err``
        is the norm of the location error, ``angular_err`` is
        ``2 * arccos(|<q_est, q_gt>|)`` in radians, ``loc_rel_err`` is
        ``loc_err`` divided by the norm of the ground-truth location, and
        ``esa_score`` is ``loc_rel_err + angular_err``.
    """

    def _quat_angle(q_a, q_b):
        """Return ``2 * arccos(|<q_a, q_b>|)`` in radians as a 1x1 matrix."""
        q_dot = np.abs(np.asmatrix(q_a) * np.asmatrix(q_b).transpose())
        # Rounding can push |dot| just above 1, where arccos returns NaN;
        # cap it at 1 so the returned scores stay finite.
        return 2 * np.arccos(np.minimum(q_dot, 1.0))

    # Load pose in all formats
    loc_gt = dataset.load_location(image_id)
    q_gt = dataset.load_quaternion(image_id)
    image = dataset.load_image(image_id)

    I, I_meta, loc_encoded_gt, ori_encoded_gt = \
        net.load_image_gt(dataset, model.config, image_id)

    results = model.detect([image], verbose=1)

    # Retrieve location
    if model.config.REGRESS_LOC:
        loc_est = results[0]['loc']
    else:
        loc_pmf = utils.stable_softmax(results[0]['loc'])

        # Compute location mean according to first moment
        loc_est = np.asmatrix(loc_pmf) * np.asmatrix(dataset.histogram_3D_map)

        # Compute loc encoding error
        # NOTE(review): computed but neither returned nor reported.
        loc_decoded_gt = np.asmatrix(loc_encoded_gt) * np.asmatrix(
            dataset.histogram_3D_map)
        loc_encoded_err = np.linalg.norm(loc_decoded_gt - loc_gt)

    # Retrieve orientation
    if model.config.REGRESS_ORI:

        if model.config.ORIENTATION_PARAM == 'quaternion':
            q_est = results[0]['ori']
        elif model.config.ORIENTATION_PARAM == 'euler_angles':
            q_est = se3lib.SO32quat(
                se3lib.euler2SO3_left(results[0]['ori'][0],
                                      results[0]['ori'][1],
                                      results[0]['ori'][2]))
        elif model.config.ORIENTATION_PARAM == 'angle_axis':
            theta = np.linalg.norm(results[0]['ori'])
            if theta < 1e-6:
                v = [0, 0, 0]
            else:
                v = results[0]['ori'] / theta
            q_est = se3lib.angleaxis2quat(v, theta)
    else:

        ori_pmf = utils.stable_softmax(results[0]['ori'])

        # Compute mean quaternion
        q_est, q_est_cov = se3lib.quat_weighted_avg(dataset.ori_histogram_map,
                                                    ori_pmf)

        # Compute encoded error
        # NOTE(review): computed but neither returned nor reported.
        q_encoded_gt, _ = se3lib.quat_weighted_avg(dataset.ori_histogram_map,
                                                   ori_encoded_gt)
        ori_encoded_err = _quat_angle(q_encoded_gt, q_gt) * 180 / np.pi

    # Compute errors
    angular_err = _quat_angle(q_est, q_gt)
    # angular_err_in_deg = angular_err* 180 / np.pi
    loc_err = np.linalg.norm(loc_est - loc_gt)
    loc_rel_err = loc_err / np.linalg.norm(loc_gt)

    # Compute ESA score
    esa_score = loc_rel_err + angular_err

    return loc_err, angular_err, loc_rel_err, esa_score