def find_best_permutation_prec_recall(gt, output, acceptable_window=np.pi / 18):
    """Finds the best permutation for evaluation, then uses it for precision/recall.

    Pads the shorter list with np.inf sentinels so both have equal length, then
    brute-forces all permutations (O(n!), fine for the small source counts used
    here) to find the assignment with the most angular matches.

    Inputs:
        gt, output: lists of source angles (radians); lengths may differ.
        acceptable_window: max angular distance (radians) to count as a match.
    Returns:
        (permutation, (tp, fn, fp)) from localization_precision_recall for the
        best-matching permutation.
    """
    n = max(len(gt), len(output))
    # Pad LOCAL copies — the original mutated the caller's lists in place
    # via `+=`, leaking np.inf sentinels back to the caller.
    gt = list(gt) + [np.inf] * (n - len(gt))
    output = list(output) + [np.inf] * (n - len(output))

    best_perm = None
    best_inliers = -1
    for perm in itertools.permutations(range(n)):
        # Count how many gt/output pairs this assignment matches.
        curr_inliers = sum(
            1 for idx1, idx2 in enumerate(perm)
            if angular_distance(gt[idx1], output[idx2]) < acceptable_window)
        if curr_inliers > best_inliers:
            best_inliers = curr_inliers
            best_perm = list(perm)

    return localization_precision_recall(best_perm, gt, output,
                                         acceptable_window)
def localization_precision_recall(permutation, gt, output, acceptable_window=np.pi / 18):
    """Tallies true positives, false negatives and false positives for one assignment.

    Inputs:
        permutation: output index assigned to each gt index.
        gt, output: equal-length angle lists, padded with np.inf where one side
            had fewer sources than the other.
        acceptable_window: max angular distance (radians) to count as a match.
    Returns:
        (permutation, (tp, fn, fp)).
    """
    tp = fn = fp = 0
    for gt_idx, out_idx in enumerate(permutation):
        # Close enough in angle: a correct detection.
        if angular_distance(gt[gt_idx], output[out_idx]) < acceptable_window:
            tp += 1
            continue
        if gt[gt_idx] == np.inf:
            # Matched against gt padding: the output is an extra detection.
            fp += 1
        elif output[out_idx] == np.inf:
            # Matched against output padding: this gt source was missed.
            fn += 1
        else:
            # Both real but too far apart: one miss plus one spurious detection.
            fn += 1
            fp += 1
    return permutation, (tp, fn, fp)
def nms(candidate_voices, nms_cutoff):
    """Runs non-max suppression on the candidate voices.

    Greedily keeps the loudest remaining candidate and discards every other
    candidate that is both close to it in angle (within NMS_RADIUS) and similar
    in content (si_sdr >= nms_cutoff, i.e. NOT different enough).

    Inputs:
        candidate_voices: candidates, indexable (x[1] is the volume used for
            ranking) with .angle and .data attributes.
        nms_cutoff: si_sdr threshold below which two candidates count as
            having different content.
    Returns:
        List of surviving candidates, loudest first.
    """
    final_proposals = []
    # Sort ONCE, loudest first. The original re-sorted inside the loop every
    # iteration; filtering preserves order, so a single stable sort is
    # equivalent and avoids the redundant O(n log n) work per round.
    remaining = sorted(candidate_voices, key=lambda x: x[1], reverse=True)
    while remaining:
        # Choose the loudest voice still in play.
        best_candidate_voice = remaining.pop(0)
        final_proposals.append(best_candidate_voice)

        # Keep only candidates that differ from the chosen one in location
        # or in content; the rest are suppressed as duplicates.
        survivors = []
        for candidate_voice in remaining:
            different_locations = utils.angular_distance(
                candidate_voice.angle,
                best_candidate_voice.angle) > NMS_RADIUS
            different_content = si_sdr(
                candidate_voice.data[0],
                best_candidate_voice.data[0]) < nms_cutoff
            if different_locations or different_content:
                survivors.append(candidate_voice)
        remaining = survivors

    return final_proposals
def evaluate_dir(idx):
    """Evaluates one example directory: separates sources, then records either
    precision/recall counts (args.prec_recall) or angular error and SDR.

    Inputs:
        idx: index into the module-level all_dirs list; also keys the
            module-level result containers.
    Side effects:
        Appends/stores into all_tp/all_fn/all_fp or all_angle_errors,
        all_input_sdr, all_output_sdr; optionally writes debug wavs.
    """
    if args.debug:
        curr_writing_dir = "{:05d}".format(idx)
        if not os.path.exists(curr_writing_dir):
            os.makedirs(curr_writing_dir)
        args.writing_dir = curr_writing_dir

    curr_dir = all_dirs[idx]

    # Loads the data
    mixed_data, gt = get_items(curr_dir, args)

    # Prevents CUDA out of memory. try/finally guarantees the lock is released
    # even on the early return below or on an exception — the original leaked
    # the lock on the "not enough outputs" return path, deadlocking other
    # workers.
    gpu_lock.acquire()
    try:
        if args.prec_recall:
            # Case where we don't know the number of sources.
            candidate_voices = run_separation(mixed_data, model, args)
        else:
            # Case where we know the number of sources.
            # Normal run
            if not args.oracle_position:
                candidate_voices = run_separation(mixed_data, model, args,
                                                  0.005)

            # In order to compute SDR or angle error, the number of outputs
            # must match gt. We set a very low threshold to ensure we get the
            # correct number of outputs.
            if args.oracle_position or len(candidate_voices) < len(gt):
                print("Had to go again\n")
                candidate_voices = run_separation(mixed_data, model, args,
                                                  0.000001)

            if args.oracle_position:
                # Use the GT positions to find the best sources.
                trimmed_voices = []
                for gt_idx in range(args.n_voices):
                    best_idx = np.argmin(
                        np.array([
                            angular_distance(x.angle, gt[gt_idx].angle)
                            for x in candidate_voices
                        ]))
                    trimmed_voices.append(candidate_voices[best_idx])
                candidate_voices = trimmed_voices
            else:
                # Take the top N voices.
                candidate_voices = candidate_voices[:args.n_voices]

            if len(candidate_voices) != len(gt):
                print(
                    f"Not enough outputs for dir {curr_dir}. Lower threshold to evaluate."
                )
                return

        if args.debug:
            sf.write(os.path.join(args.writing_dir, "mixed.wav"),
                     mixed_data[0], args.sr)
            for voice in candidate_voices:
                fname = "out_angle{:.2f}.wav".format(voice.angle * 180 / np.pi)
                sf.write(os.path.join(args.writing_dir, fname), voice.data[0],
                         args.sr)
    finally:
        gpu_lock.release()

    curr_angle_errors = []
    curr_input_sdr = []
    curr_output_sdr = []

    best_permutation, (tp, fn, fp) = find_best_permutation_prec_recall(
        [x.angle for x in gt], [x.angle for x in candidate_voices])

    if args.prec_recall:
        all_tp.append(tp)
        all_fn.append(fn)
        all_fp.append(fp)
    else:
        # Evaluate SDR and Angular Error.
        for gt_idx, output_idx in enumerate(best_permutation):
            angle_error = angular_distance(candidate_voices[output_idx].angle,
                                           gt[gt_idx].angle)
            curr_angle_errors.append(angle_error)

            # To speed up we only evaluate channel 0. For rigorous results
            # set single_channel to False.
            input_sdr = compute_sdr(gt[gt_idx].data,
                                    mixed_data,
                                    single_channel=True)
            output_sdr = compute_sdr(gt[gt_idx].data,
                                     candidate_voices[output_idx].data,
                                     single_channel=True)
            curr_input_sdr.append(input_sdr)
            curr_output_sdr.append(output_sdr)

        all_angle_errors[idx] = curr_angle_errors
        all_input_sdr[idx] = curr_input_sdr
        all_output_sdr[idx] = curr_output_sdr