def sed_event_wise_tps_fns_fps(sed_pds, sed_gts, at_detected2d, sed_thres):
    """Count event-based true positives, false negatives and false positives.

    Frame-wise predictions are binarised against `sed_thres`, masked by the
    clip-level detections in `at_detected2d`, turned into event pairs with
    `vad.activity_detection`, and matched to the ground-truth events by onset
    with `mir_eval.transcription.match_note_onsets`.

    Args:
        sed_pds: frame-wise predictions, shape (n_clips, n_time, n_classes).
        sed_gts: frame-wise ground truth, indexed as [clip, :, class]
            (presumably the same shape as `sed_pds` -- TODO confirm).
        at_detected2d: clip-level mask, broadcast over the time axis via
            `[:, None, :]` (presumably shape (n_clips, n_classes)).
        sed_thres: threshold used to binarise `sed_pds`; it is also passed
            as the threshold argument of `vad.activity_detection`.

    Returns:
        (tps, fns, fps): three lists holding one integer total per class,
        accumulated over all clips.
    """
    n_clips, _, n_classes = sed_pds.shape

    # 1 where the prediction is above the threshold, 0 where it is below
    # (0.5 where it is exactly equal), then zero out classes that were not
    # detected at clip level.
    detected3d = (np.sign(sed_pds - sed_thres) + 1.) / 2
    detected3d *= at_detected2d[:, None, :]

    # Event based SED.
    tps, fns, fps = [], [], []

    # `range` instead of `xrange` so the function also runs on Python 3.
    for class_idx in range(n_classes):
        total_tp = 0
        total_fn = 0
        total_fp = 0

        for clip_idx in range(n_clips):
            pd_pairs = np.array(vad.activity_detection(
                detected3d[clip_idx, :, class_idx], sed_thres,
                n_smooth=24, n_salt=4))
            gt_pairs = np.array(vad.activity_detection(
                sed_gts[clip_idx, :, class_idx], sed_thres,
                n_smooth=24, n_salt=4))

            if len(gt_pairs) > 0 and len(pd_pairs) > 0:
                # Tolerance is expressed in frames; the original author
                # annotated 12 frames as an onset collar of 500 ms.
                matching = mir_eval.transcription.match_note_onsets(
                    gt_pairs, pd_pairs, onset_tolerance=12)
                tp = len(matching)
            else:
                # Nothing can be matched when either side has no events.
                tp = 0

            total_tp += tp
            total_fn += len(gt_pairs) - tp
            total_fp += len(pd_pairs) - tp

        tps.append(total_tp)
        fns.append(total_fn)
        fps.append(total_fp)

    return tps, fns, fps
def get_est_event_list(sed_outputs, audio_names, labels):
    """Convert frame-wise SED outputs into a flat list of estimated events.

    Args:
        sed_outputs: array indexed as [clip, :, class].
        audio_names: iterable of clip names; its order gives the clip index.
        labels: sequence whose length gives the number of classes scanned.

    Returns:
        List of dicts with keys 'filename', 'onset', 'offset' and
        'event_label'. Onset and offset are the frame indices returned by
        `activity_detection` multiplied by the frame hop in seconds, which
        is (config.window_size - config.overlap) / config.sample_rate.
    """
    frame_hop = config.window_size - config.overlap
    seconds_per_frame = frame_hop / float(config.sample_rate)
    ix_to_lb = config.ix_to_lb

    est_event_list = []

    for clip_idx, audio_name in enumerate(audio_names):
        for class_idx in range(len(labels)):
            pairs = activity_detection(
                sed_outputs[clip_idx, :, class_idx],
                thres=0.2, low_thres=0.1, n_smooth=10, n_salt=10)

            # No detected activity for this clip/class combination.
            if len(pairs) == 0:
                continue

            est_event_list.extend(
                {
                    'filename': audio_name,
                    'onset': bgn * seconds_per_frame,
                    'offset': fin * seconds_per_frame,
                    'event_label': ix_to_lb[class_idx],
                }
                for bgn, fin in pairs
            )

    return est_event_list
def frame_prediction_to_event_prediction(output_dict, sed_params_dict):
    """Convert frame-wise predictions into a list of sound events.

    A class is only searched for events in a clip when its clip-wise output
    exceeds that class's audio tagging threshold.

    Args:
        output_dict: {
            'audio_name': (audios_num),
            'clipwise_output': (audios_num, classes_num),
            'framewise_output': (audios_num, frames_num, classes_num)}
        sed_params_dict: {
            'audio_tagging_threshold': float between 0 and 1,
            'sed_high_threshold': float between 0 and 1,
            'sed_low_threshold': float between 0 and 1,
            'n_smooth': int, silence between the same sound event shorter
                than this number will be filled with the sound event
            'n_salt': int, sound event shorter than this number will be
                removed}
            Each value may be a single scalar, applied to every class, or
            a sequence holding one value per class.

    Returns:
        List of dicts with keys 'filename', 'onset', 'offset' and
        'event_label'; onset and offset are frame indices divided by
        config.frames_per_second.

    Note:
        `sed_params_dict` is normalised in place: scalar entries are
        replaced by per-class lists of length classes_num.
    """
    (audios_num, frames_num, classes_num) = \
        output_dict['framewise_output'].shape
    frames_per_second = float(config.frames_per_second)
    labels = config.labels

    def _per_class(x):
        # Anything with a length (list, tuple, 1-D array, ...) is taken as
        # already holding one value per class; scalars are repeated.
        try:
            len(x)
        except TypeError:
            return [x] * classes_num
        return x

    for key in ('audio_tagging_threshold', 'sed_high_threshold',
                'sed_low_threshold', 'n_smooth', 'n_salt'):
        sed_params_dict[key] = _per_class(sed_params_dict[key])

    event_list = []

    for n in range(audios_num):
        for k in range(classes_num):
            # Clip-level gate: skip classes not tagged in this clip.
            if not (output_dict['clipwise_output'][n, k]
                    > sed_params_dict['audio_tagging_threshold'][k]):
                continue

            bgn_fin_pairs = activity_detection(
                x=output_dict['framewise_output'][n, :, k],
                thres=sed_params_dict['sed_high_threshold'][k],
                low_thres=sed_params_dict['sed_low_threshold'][k],
                n_smooth=sed_params_dict['n_smooth'][k],
                n_salt=sed_params_dict['n_salt'][k])

            for pair in bgn_fin_pairs:
                event_list.append({
                    'filename': output_dict['audio_name'][n],
                    'onset': pair[0] / frames_per_second,
                    'offset': pair[1] / frames_per_second,
                    'event_label': labels[k]})

    return event_list
def write_submission(output_dict, sed_params_dict, submission_path):
    '''Write detected sound events to a tab-separated submission file.

    The file starts with the header row filename / onset / offset /
    event_label, followed by one row per detected event.

    Args:
        output_dict: {
            'audio_name': (audios_num),
            'clipwise_output': (audios_num, classes_num),
            'framewise_output': (audios_num, frames_num, classes_num)}
        sed_params_dict: {
            'sed_high_threshold': float between 0 and 1,
            'sed_low_threshold': float between 0 and 1,
            'n_smooth': int, silence between the same sound event shorter
                than this number will be filled with the sound event
            'n_salt': int, sound event shorter than this number will be
                removed}
        submission_path: string, path to write out submission

    NOTE(review): the clip-wise gate below compares against
    'sed_high_threshold'; an 'audio_tagging_threshold' entry, if present,
    is not read by this function -- confirm this is intended.
    '''
    (audios_num, frames_num, classes_num) = \
        output_dict['framewise_output'].shape
    frames_per_second = float(config.frames_per_second)
    labels = config.labels

    # The context manager closes the file even if detection or formatting
    # raises part-way through.
    with open(submission_path, 'w') as f:
        f.write('{}\t{}\t{}\t{}\n'.format(
            'filename', 'onset', 'offset', 'event_label'))

        for n in range(audios_num):
            for k in range(classes_num):
                if not (output_dict['clipwise_output'][n, k]
                        > sed_params_dict['sed_high_threshold']):
                    continue

                bgn_fin_pairs = activity_detection(
                    x=output_dict['framewise_output'][n, :, k],
                    thres=sed_params_dict['sed_high_threshold'],
                    low_thres=sed_params_dict['sed_low_threshold'],
                    n_smooth=sed_params_dict['n_smooth'],
                    n_salt=sed_params_dict['n_salt'])

                for pair in bgn_fin_pairs:
                    # Frame indices -> seconds.
                    bgn_time = pair[0] / frames_per_second
                    fin_time = pair[1] / frames_per_second
                    f.write('{}\t{}\t{}\t{}\n'.format(
                        output_dict['audio_name'][n], bgn_time, fin_time,
                        labels[k]))

    logging.info(' Write submission file to {}'
        ''.format(submission_path))
def calculate_estimated_event_list(audio_names, predictions, frame_wise_probs,
                                   seconds_per_frame, sed_thres,
                                   sed_low_thres):
    """Build the estimated event list from frame-wise probabilities.

    Args:
        audio_names: list of str.
        predictions: indexable by clip; `predictions[n]` is iterated and
            every element is used directly as a class index.
            NOTE(review): if callers pass an (audios_num, classes_num) 0/1
            indicator matrix, the values 0 and 1 themselves would be used
            as class indices -- confirm against callers.
        frame_wise_probs: (audios_num, time_steps, classes_num)
        seconds_per_frame: float, multiplies frame indices to get seconds.
        sed_thres: passed to `activity_detection` as `thres`.
        sed_low_thres: passed to `activity_detection` as `low_thres`.

    Returns:
        estimated_event_list: list of dicts with keys 'filename', 'onset',
        'offset' and 'event_label'.
    """
    ix_to_lb = config.ix_to_lb
    estimated_event_list = []

    for clip_idx, audio_name in enumerate(audio_names):
        for event_index in predictions[clip_idx]:
            pairs = activity_detection(
                frame_wise_probs[clip_idx, :, event_index],
                thres=sed_thres, low_thres=sed_low_thres,
                n_smooth=1, n_salt=0)

            estimated_event_list.extend(
                {
                    'filename': audio_name,
                    'onset': bgn * seconds_per_frame,
                    'offset': fin * seconds_per_frame,
                    'event_label': ix_to_lb[event_index],
                }
                for bgn, fin in pairs
            )

    return estimated_event_list
def print_stats(at_pds, at_gts, sed_pds, sed_gts):
    """Log audio tagging, frame-based SED and event-based SED statistics.

    Three sections are written with `logging.info`:
      1. Audio tagging: tp/fn/fp/tn counts, then precision, recall,
         f-value, AUC and EER (micro, macro and per class).
      2. Frame-based SED: the same counts on the predictions flattened to
         (n_clips * n_time, n_classes), then precision, recall, f-value,
         AUC and error rate with its n_sub / n_del / n_ins terms.
      3. Event-based SED: tp/fn/fp from `sed_event_wise_tps_fns_fps`, then
         precision, recall, f-value and error rate derived from them.

    Args:
        at_pds: clip-level predictions, passed to the tagging metrics.
        at_gts: clip-level ground truth.
        sed_pds: frame-wise predictions, shape (n_clips, n_time, n_classes).
        sed_gts: frame-wise ground truth, reshaped to
            (n_clips * n_time, n_classes) alongside `sed_pds`.

    Returns:
        None.

    Note:
        `cfg`, `at_thres`, `sed_thres`, `metrics` and the metric helpers
        are not defined in this function; they must be resolvable at
        module scope.
    """
    import sys

    # sys.maxsize disables array summarisation; NaN is rejected by current
    # NumPy releases with a ValueError.
    np.set_printoptions(threshold=sys.maxsize, linewidth=1000, precision=3,
                        suppress=True)
    events = cfg.events

    # AT evaluate.
    # TP, FN, FP, TN
    logging.info("====== Audio tagging (AT) ======")
    logging.info("%stp\tfn\tfp\ttn" % "".ljust(16))
    (tp, fn, fp, tn) = tp_fn_fp_tn(at_pds, at_gts, at_thres, average='micro')
    logging.info("%s%d\t%d\t%d\t%d\t" % ("Global".ljust(16), tp, fn, fp, tn))
    (tps, fns, fps, tns) = tp_fn_fp_tn(at_pds, at_gts, at_thres, average=None)
    for i1 in range(len(tps)):
        logging.info(
            "%s%d\t%d\t%d\t%d\t"
            % (events[i1].ljust(16), tps[i1], fns[i1], fps[i1], tns[i1]))

    # Prec, recall, fvalue, AUC, eer
    logging.info("%sPrec\tRecall\tFvalue\tAUC\tEER" % "".ljust(16))
    (prec, recall, fvalue) = prec_recall_fvalue(
        at_pds, at_gts, at_thres, average='micro')
    auc = metrics.roc_auc_score(at_gts, at_pds, average='micro')
    eer = equal_error_rate(at_pds, at_gts, average='micro')
    logging.info(
        "%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f"
        % ("*Global".ljust(16), prec, recall, fvalue, auc, eer))

    (prec, recall, fvalue) = prec_recall_fvalue(
        at_pds, at_gts, at_thres, average='macro')
    auc = metrics.roc_auc_score(at_gts, at_pds, average='macro')
    eer = equal_error_rate(at_pds, at_gts, average='macro')
    logging.info(
        "%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f"
        % ("*Avg. of each".ljust(16), prec, recall, fvalue, auc, eer))

    (precs, recalls, fvalues) = prec_recall_fvalue(
        at_pds, at_gts, at_thres, average=None)
    aucs = metrics.roc_auc_score(at_gts, at_pds, average=None)
    eers = equal_error_rate(at_pds, at_gts, average=None)
    for i1 in range(len(tps)):
        logging.info(
            "%s%.3f\t%.3f\t%.3f\t%.3f\t%.3f"
            % (events[i1].ljust(16), precs[i1], recalls[i1], fvalues[i1],
               aucs[i1], eers[i1]))

    # SED evaluate
    logging.info("====== Frame based SED ======")
    (n_clips, n_time, n_classes) = sed_pds.shape
    logging.info("%stp\tfn\tfp\ttn" % "".ljust(16))

    # Treat every frame of every clip as one sample.
    sed_pds_2d = sed_pds.reshape((n_clips * n_time, n_classes))
    sed_gts_2d = sed_gts.reshape((n_clips * n_time, n_classes))

    (tp, fn, fp, tn) = tp_fn_fp_tn(
        sed_pds_2d, sed_gts_2d, sed_thres, average='micro')
    logging.info(
        "%s*%d\t*%d\t*%d\t*%d\t" % ("*Global".ljust(16), tp, fn, fp, tn))
    (tps, fns, fps, tns) = tp_fn_fp_tn(
        sed_pds_2d, sed_gts_2d, sed_thres, average=None)
    for i1 in range(len(tps)):
        logging.info(
            "%s%d\t%d\t%d\t%d\t"
            % (events[i1].ljust(16), tps[i1], fns[i1], fps[i1], tns[i1]))

    # Prec, recall, fvalue
    logging.info(
        "%sPrec\tRecall\tFvalue\tAUC\tER\tn_sub\tn_del\tn_ins"
        % "".ljust(16))
    (prec, recall, fvalue) = prec_recall_fvalue(
        sed_pds_2d, sed_gts_2d, sed_thres, average='micro')
    auc = metrics.roc_auc_score(sed_gts_2d, sed_pds_2d, average='micro')
    (er, n_sub, n_del, n_ins) = error_rate(
        sed_pds_2d, sed_gts_2d, sed_thres, average='micro')
    logging.info(
        "%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f"
        % ("*Global".ljust(16), prec, recall, fvalue, auc, er, n_sub,
           n_del, n_ins))

    (prec, recall, fvalue) = prec_recall_fvalue(
        sed_pds_2d, sed_gts_2d, sed_thres, average='macro')
    auc = metrics.roc_auc_score(sed_gts_2d, sed_pds_2d, average='macro')
    (er, n_sub, n_del, n_ins) = error_rate(
        sed_pds_2d, sed_gts_2d, sed_thres, average='macro')
    logging.info(
        "%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f"
        % ("*Avg. of each".ljust(16), prec, recall, fvalue, auc, er, n_sub,
           n_del, n_ins))

    (precs, recalls, fvalues) = prec_recall_fvalue(
        sed_pds_2d, sed_gts_2d, sed_thres, average=None)
    aucs = metrics.roc_auc_score(sed_gts_2d, sed_pds_2d, average=None)
    (ers, n_subs, n_dels, n_inss) = error_rate(
        sed_pds_2d, sed_gts_2d, sed_thres, average=None)
    for i1 in range(len(tps)):
        logging.info(
            "%s%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f"
            % (events[i1].ljust(16), precs[i1], recalls[i1], fvalues[i1],
               aucs[i1], ers[i1], n_subs[i1], n_dels[i1], n_inss[i1]))

    logging.info("====== Event based SED ======")
    # 1 where the clip-level prediction exceeds at_thres, 0 where below.
    at_detected2d = (np.sign(at_pds - at_thres) + 1.) / 2
    (tps, fns, fps) = sed_event_wise_tps_fns_fps(
        sed_pds, sed_gts, at_detected2d, sed_thres)
    logging.info("%stp\tfn\tfp" % "".ljust(16))
    logging.info(
        "%s*%d\t*%d\t*%d"
        % ("*Total:".ljust(16), np.sum(tps), np.sum(fns), np.sum(fps)))
    for i1 in range(n_classes):
        logging.info(
            "%s%d\t%d\t%d"
            % (events[i1].ljust(16), tps[i1], fns[i1], fps[i1]))

    logging.info("------ Prec, recall, fvalue ------")
    logging.info(
        "%sPrec\tRecall\tFvalue\tAUC\tER\tn_sub\tn_del\tn_ins"
        % "".ljust(16))

    # NOTE(review): no AUC is computed for the event-based section; the
    # `auc` / `aucs` values logged below are the ones left over from the
    # frame-based section above.
    (prec, recall, fvalue) = prec_recall_fvalue_from_tp_fn_fp(
        tps, fns, fps, average='micro')
    (er, n_sub, n_del, n_ins) = error_rate_from_tp_fn_fp(
        tps, fns, fps, average='micro')
    logging.info(
        "%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f"
        % ("*Global".ljust(16), prec, recall, fvalue, auc, er, n_sub,
           n_del, n_ins))

    (prec, recall, fvalue) = prec_recall_fvalue_from_tp_fn_fp(
        tps, fns, fps, average='macro')
    (er, n_sub, n_del, n_ins) = error_rate_from_tp_fn_fp(
        tps, fns, fps, average='macro')
    logging.info(
        "%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f"
        % ("*Avg. of each".ljust(16), prec, recall, fvalue, auc, er, n_sub,
           n_del, n_ins))

    (precs, recalls, fvalues) = prec_recall_fvalue_from_tp_fn_fp(
        tps, fns, fps, average=None)
    (ers, n_subs, n_dels, n_inss) = error_rate_from_tp_fn_fp(
        tps, fns, fps, average=None)
    for i1 in range(n_classes):
        logging.info(
            "%s%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f"
            % (events[i1].ljust(16), precs[i1], recalls[i1], fvalues[i1],
               aucs[i1], ers[i1], n_subs[i1], n_dels[i1], n_inss[i1]))