def predict_new(args):
    """Load a trained model and run it on freshly simulated input.

    Prints the clip-level output shape and the shapes of the
    intermediate classification and attention outputs.
    """
    # Restore the trained model from the workspace.
    model_path = os.path.join(args.workspace, "models", "main", args.model_name)
    md = serializations.load(model_path)

    # Simulate new data: (n_clips, n_time, n_in).
    x_new = np.random.normal(size=(3, 10, 128))

    # Final classification probability per audio clip: (n_clips, n_out).
    [y] = md.predict(x_new)
    print("y.shape: %s" % (y.shape, ))

    # Intermediate classification & attention values inside the network.
    observe_nodes = [
        md.find_layer('cla').output_,
        md.find_layer('att').output_,
    ]
    f_forward = md.get_observe_forward_func(observe_nodes)  # Forward function.
    [cla, att] = md.run_function(f_forward, x_new, batch_size=None, tr_phase=0.)
    print("classification.shape: %s" % (cla.shape, ))  # (n_clips, n_time, n_out)
    print("attention.shape: %s" % (att.shape, ))  # (n_clips, n_time, n_out)
def recognize(args):
    """Run the trained model on the test fold, dump audio-tagging (AT)
    probabilities and segmentation masks to pickle, then print stats.
    """
    workspace = cfg.workspace
    events = cfg.events
    n_events = args.n_events
    snr = args.snr
    md_na = args.model_name
    lb_to_ix = cfg.lb_to_ix
    n_out = len(cfg.events)
    te_fold = cfg.te_fold

    # Load trained model.
    md_path = os.path.join(workspace, "models", pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr, md_na)
    md = serializations.load(md_path)

    # Load data.
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    # NOTE(review): `is_scale` is not defined in this function — presumably a
    # module-level flag; verify it exists at module scope.
    (tr_x, tr_at_y, tr_sed_y, tr_na_list, te_x, te_at_y, te_sed_y,
     te_na_list) = pp_data.load_data(feature_dir=feature_dir,
                                     yaml_dir=yaml_dir,
                                     te_fold=te_fold,
                                     snr=snr,
                                     is_scale=is_scale)
    x = te_x
    at_gts = te_at_y
    sed_gts = te_sed_y
    na_list = te_na_list

    # Recognize.
    [at_pds] = md.predict(x)  # (N, 16)
    observe_nodes = [md.find_layer('detect').output_]
    f_forward = md.get_observe_forward_func(observe_nodes)
    [seg_masks] = md.run_function(f_forward, x, batch_size=500, tr_phase=0.)
    # (n_clips, n_time, n_out) -> (n_clips, n_out, n_time, 1)
    seg_masks = np.transpose(seg_masks, (0, 2, 1))[:, :, :, np.newaxis]

    # Dump to pickle.  Fix: close the output files deterministically with
    # `with` instead of leaking the handles returned by open().
    out_dir = os.path.join(workspace, "preds", pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr, os.path.splitext(md_na)[0])
    pp_data.create_folder(out_dir)
    out_at_path = os.path.join(out_dir, "at_probs.p")
    out_seg_masks_path = os.path.join(out_dir, "seg_masks.p")
    with open(out_at_path, 'wb') as at_file:
        cPickle.dump(at_pds, at_file, protocol=cPickle.HIGHEST_PROTOCOL)
    with open(out_seg_masks_path, 'wb') as masks_file:
        cPickle.dump(seg_masks, masks_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # Print stats.
    sed_pds = np.mean(seg_masks, axis=-1)  # (N, n_out, n_time)
    sed_pds = np.transpose(sed_pds, (0, 2, 1))  # (N, n_time, n_out)
    print_stats(at_pds, at_gts, sed_pds, sed_gts)
def detect_cv(): # init paths if type=='home': fe_fd = cfg.dev_fe_mel_home_fd labels = cfg.labels_home lb_to_id = cfg.lb_to_id_home id_to_lb = cfg.id_to_lb_home tr_txt = cfg.dev_evaluation_fd + '/home_fold' + str(fold) + '_train.txt' te_txt = cfg.dev_evaluation_fd + '/home_fold' + str(fold) + '_evaluate.txt' meta_fd = cfg.dev_meta_home_fd if type=='resi': fe_fd = cfg.dev_fe_mel_resi_fd labels = cfg.labels_resi lb_to_id = cfg.lb_to_id_resi id_to_lb = cfg.id_to_lb_resi tr_txt = cfg.dev_evaluation_fd + '/residential_area_fold' + str(fold) + '_train.txt' te_txt = cfg.dev_evaluation_fd + '/residential_area_fold' + str(fold) + '_evaluate.txt' meta_fd = cfg.dev_meta_resi_fd n_out = len( labels ) # load model md = serializations.load( md_path ) # get wav names to be detected te_names = pp_dev_data.GetWavNamesFromTxt( te_txt ) # do recognize for each test audio names = os.listdir( fe_fd ) names = sorted( names ) y_pred_list = [] # detect and write out to txt pp_dev_data.CreateFolder( cfg.dev_results_fd ) file_list = [] for na in names: if na[0:4] in te_names: print na gt_file = meta_fd + '/' + na[0:4] + '.ann' out_file = cfg.dev_results_fd + '/'+na[0:4]+'_detect.ann' X = cPickle.load( open( fe_fd+'/'+na, 'rb' ) ) X = mat_2d_to_3d( X, agg_num, hop ) y_pred = md.predict( X ) y_pred_list.append( y_pred ) out_list = pp_dev_data.OutMatToList( y_pred, thres, id_to_lb ) pp_dev_data.PrintListToTxt( out_list, out_file ) file_list.append( { 'reference_file': gt_file, 'estimated_file': out_file } ) # print results for this fold pp_dev_data.PrintScore( file_list, labels )
def recognize(md_path, te_fe_fd, te_csv_file, n_concat, hop, scaler): """Recognize and get statistics. Args: md_path: string. Path of model. te_fe_fd: string. Folder path containing testing features. te_csv_file: string. Path of test csv file. n_concat: integar. Number of frames to concatenate. hop: integar. Number of frames to hop. scaler: None | scaler object. """ # Load model md = serializations.load(md_path) # Recognize and get statistics n_labels = len(cfg.labels) confuse_mat = np.zeros((n_labels, n_labels)) # confusion matrix frame_based_accs = [] # Get test file names with open(te_csv_file, 'rb') as f: reader = csv.reader(f) lis = list(reader) # Predict for each scene for li in lis: # Load data [na, lb] = li[0].split('\t') na = na.split('/')[1][0:-4] path = te_fe_fd + '/' + na + '.f' x = cPickle.load(open(path, 'rb')) if scaler: x = scaler.transform(x) x = mat_2d_to_3d(x, n_concat, hop) # Predict p_y_preds = md.predict(x)[0] # (n_block,label) pred_ids = np.argmax(p_y_preds, axis=-1) # (n_block,) pred_id = int(get_mode_value(pred_ids)) gt_id = cfg.lb_to_id[lb] # Statistics confuse_mat[gt_id, pred_id] += 1 n_correct_frames = list(pred_ids).count(gt_id) frame_based_accs += [float(n_correct_frames) / len(pred_ids)] clip_based_acc = np.sum(np.diag( np.diag(confuse_mat))) / np.sum(confuse_mat) frame_based_acc = np.mean(frame_based_accs) print 'event_acc:', clip_based_acc print 'frame_acc:', frame_based_acc print confuse_mat
def recognize(md_path, te_fe_fd, te_csv_file, n_concat, hop, scaler): """Recognize and get statistics. Args: md_path: string. Path of model. te_fe_fd: string. Folder path containing testing features. te_csv_file: string. Path of test csv file. n_concat: integar. Number of frames to concatenate. hop: integar. Number of frames to hop. scaler: None | scaler object. """ # Load model md = serializations.load(md_path) # Recognize and get statistics n_labels = len(cfg.labels) confuse_mat = np.zeros((n_labels, n_labels)) # confusion matrix frame_based_accs = [] # Get test file names with open(te_csv_file, 'rb') as f: reader = csv.reader(f) lis = list(reader) # Predict for each scene for li in lis: # Load data [na, lb] = li[0].split('\t') na = na.split('/')[1][0:-4] path = te_fe_fd + '/' + na + '.f' x = cPickle.load(open(path, 'rb')) if scaler: x = scaler.transform(x) x = mat_2d_to_3d(x, n_concat, hop) # Predict p_y_preds = md.predict(x)[0] # (n_block,label) pred_ids = np.argmax(p_y_preds, axis=-1) # (n_block,) pred_id = int(get_mode_value(pred_ids)) gt_id = cfg.lb_to_id[lb] # Statistics confuse_mat[gt_id, pred_id] += 1 n_correct_frames = list(pred_ids).count(gt_id) frame_based_accs += [float(n_correct_frames) / len(pred_ids)] clip_based_acc = np.sum(np.diag(np.diag(confuse_mat))) / np.sum(confuse_mat) frame_based_acc = np.mean(frame_based_accs) print 'event_acc:', clip_based_acc print 'frame_acc:', frame_based_acc print confuse_mat
def plot_seg_masks(args):
    """Plot learned segmentation masks next to the log Mel spectrogram
    for each 0 dB test mixture.
    """
    # Fix: `workspace` was read below without ever being assigned
    # (NameError at runtime); take it from args as sibling commands do.
    workspace = args.workspace

    # Load data.
    te_pack_path = os.path.join(workspace, "packed_features", "logmel", "testing.h5")
    scaler_path = os.path.join(workspace, "scalers", "logmel", "training.scaler")
    with h5py.File(te_pack_path, 'r') as hf:
        te_na_list = list(hf.get('na_list'))
        te_x = np.array(hf.get('x'))
        te_y = np.array(hf.get('y'))
    te_x_unscaled = te_x  # unscaled x for plot.
    # Fix: close the scaler file deterministically instead of leaking
    # the handle returned by open().
    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)
    te_x = pp_data.do_scaler_on_x3d(te_x, scaler)

    # Load model.
    md_path = os.path.join(workspace, "models", pp_data.get_filename(__file__),
                           args.model_name)
    md = serializations.load(md_path)

    # Observe function.
    observe_nodes = [md.find_layer('seg_masks').output_]
    f_forward = md.get_observe_forward_func(observe_nodes)
    [seg_masks] = md.run_function(f_forward, te_x, batch_size=50, tr_phase=0.)
    print("Segmentation masks: %s" % (seg_masks.shape, ))

    # Plot segmentation masks.
    for i1 in xrange(len(seg_masks)):
        na = te_na_list[i1]
        if ".mix_0db.wav" in na:
            print(na)
            gt_y = te_y[i1].astype(np.float32)
            print(gt_y)
            print("Ground truth: %s" % cfg.events[np.argmax(gt_y)])
            events_ex = cfg.events + ['bg']
            fig, axs = plt.subplots(3, 2, sharex=True)
            axs[0, 0].matshow(te_x_unscaled[i1].T, origin='lower', aspect='auto')
            axs[0, 0].set_title("log Mel spectrogram")
            for i2 in xrange(0, 4):
                # Fix: explicit floor division keeps the subplot indices
                # integral under true (Python 3) division as well.
                axs[i2 // 2 + 1, i2 % 2].matshow(seg_masks[i1, i2].T,
                                                 origin='lower', aspect='auto',
                                                 vmin=0, vmax=1)
                axs[i2 // 2 + 1, i2 % 2].set_title(events_ex[i2])
            plt.show()
def plot_hotmap(args): workspace = cfg.workspace events = cfg.events md_na = args.model_name n_events = args.n_events te_fold = cfg.te_fold feature_dir = os.path.join(workspace, "features", "logmel", "n_events=%d" % n_events) yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events) (tr_x, tr_at_y, tr_sed_y, tr_na_list, te_x, te_at_y, te_sed_y, te_na_list) = pp_data.load_data(feature_dir=feature_dir, yaml_dir=yaml_dir, te_fold=te_fold, is_scale=is_scale) md_path = os.path.join(workspace, "models", pp_data.get_filename(__file__), "n_events=%d" % n_events, md_na) md = serializations.load(md_path) x = te_x y = te_at_y observe_nodes = [md.find_layer('hotmap').output_] f_forward = md.get_observe_forward_func(observe_nodes) [a4] = md.run_function(f_forward, x, batch_size=500, tr_phase=0.) print a4.shape for i1 in xrange(len(a4)): # if te_na_list[i1] == 'CR_lounge_220110_0731.s2700_chunk48': print(y[i1]) # print np.mean(a4[i1], axis=(1,2)) fig, axs = plt.subplots(5, 4, sharex=True) axs[0, 0].matshow(x[i1].T, origin='lower', aspect='auto') for i2 in xrange(16): axs[i2 / 4 + 1, i2 % 4].matshow(a4[i1, i2].T, origin='lower', aspect='auto', vmin=0, vmax=1) axs[i2 / 4 + 1, i2 % 4].set_title(events[i2]) plt.show()
def detect(): # init paths if type == 'home': fe_fd = cfg.eva_fe_mel_home_fd labels = cfg.labels_home lb_to_id = cfg.lb_to_id_home id_to_lb = cfg.id_to_lb_home if type == 'resi': fe_fd = cfg.eva_fe_mel_resi_fd labels = cfg.labels_resi lb_to_id = cfg.lb_to_id_resi id_to_lb = cfg.id_to_lb_resi n_out = len(labels) # load model md = serializations.load(md_path) # do recognize for each test audio names = os.listdir(fe_fd) names = sorted(names) pp_dev_data.CreateFolder(cfg.eva_results_fd) pp_dev_data.CreateFolder(cfg.eva_results_fd + '/' + type) # detect and write out for all audios for na in names: X = cPickle.load(open(fe_fd + '/' + na, 'rb')) X = mat_2d_to_3d(X, agg_num, hop) y_pred = md.predict(X) outlist = pp_dev_data.OutMatToList(y_pred, thres, id_to_lb) full_na = type + '/audio/' + na[0:4] + '.wav' out_txt_path = cfg.eva_results_fd + '/' + type + '/' + na[ 0:4] + '_detect.ann' f = open(out_txt_path, 'w') for li in outlist: f.write(full_na + '\t' + str(li['event_onset']) + '\t' + str(li['event_offset']) + '\t' + li['event_label'] + '\n') print 'Write out detection result to', out_txt_path, 'successfully!' f.close()
def detect(): # init paths if type=='home': fe_fd = cfg.eva_fe_mel_home_fd labels = cfg.labels_home lb_to_id = cfg.lb_to_id_home id_to_lb = cfg.id_to_lb_home if type=='resi': fe_fd = cfg.eva_fe_mel_resi_fd labels = cfg.labels_resi lb_to_id = cfg.lb_to_id_resi id_to_lb = cfg.id_to_lb_resi n_out = len( labels ) # load model md = serializations.load( md_path ) # do recognize for each test audio names = os.listdir( fe_fd ) names = sorted( names ) pp_dev_data.CreateFolder( cfg.eva_results_fd ) pp_dev_data.CreateFolder( cfg.eva_results_fd+'/'+type ) # detect and write out for all audios for na in names: X = cPickle.load( open( fe_fd+'/'+na, 'rb' ) ) X = mat_2d_to_3d( X, agg_num, hop ) y_pred = md.predict( X ) outlist = pp_dev_data.OutMatToList( y_pred, thres, id_to_lb ) full_na = type + '/audio/' + na[0:4] + '.wav' out_txt_path = cfg.eva_results_fd+'/'+type+'/'+na[0:4]+'_detect.ann' f = open( out_txt_path, 'w') for li in outlist: f.write( full_na + '\t' + str(li['event_onset']) + '\t' + str(li['event_offset']) + '\t' + li['event_label'] + '\n' ) print 'Write out detection result to', out_txt_path, 'successfully!' f.close()
import config as cfg
from main_dnn import mul
from main_rnn import get_last
from mir_eval.separation import bss_eval_sources

# Feature-framing hyper-parameters.
n_freq = 513
agg_num = 3  # This value should be the same as the training phase!
hop = 1  # hop must be 1
n_hid = 500

# load data
# NOTE(review): pp_data, serializations and np are used below but not
# imported in this fragment — presumably imported earlier in the file; verify.
te_X2d_mix, te_X3d_mix, te_y2d_chn0, te_y2d_chn1, te_y3d_chn0, te_y3d_chn1 = pp_data.LoadData(
    cfg.fe_fft_fd, agg_num, hop, [cfg.te_list[0]])

# load model
md = serializations.load('Md/md100.p')

# get predicted abs spectrogram
[out_chn0, out_chn1] = md.predict(np.abs(te_X3d_mix))

# recover wav
s_out_chn0 = pp_data.recover_wav_from_abs(out_chn0, te_X2d_mix)
s_out_chn1 = pp_data.recover_wav_from_abs(out_chn1, te_X2d_mix)
s_gt_chn0 = pp_data.recover_wav_from_cmplx(te_y2d_chn0)
s_gt_chn1 = pp_data.recover_wav_from_cmplx(te_y2d_chn1)

# write out wavs
pp_data.write_wav(s_out_chn0, 16000., cfg.results_fd + '/' + 'recover_chn0.wav')
pp_data.write_wav(s_out_chn1, 16000., cfg.results_fd + '/' + 'recover_chn1.wav')
'''
SUMMARY:  plot 1-st autoencoder learned weights
AUTHOR:   Qiuqiang Kong
Created:  2016.10.06
Modified: -
--------------------------------------
'''
from hat import serializations
import matplotlib.pyplot as plt

# Restore the trained first-layer autoencoder and grab its weights.
md = serializations.load('Results/md_ae1.p')
W = md.find_layer('a1').W_

# Show the first few learned filters, each reshaped to a 28x28 image.
num_to_plot = 10
for col in range(num_to_plot):
    plt.matshow(W[:, col].reshape((28, 28)))
plt.show()
from hat import serializations
import prepare_dev_data as pp_dev_data
import config as cfg
from evaluation import *
import pickle
import cPickle
import os

# hyper-params
agg_num = 100  # should be same as training phase
hop = 1
eva_fe_fd = cfg.eva_fe_mel_fd
thres = 0.2

# load model
md = serializations.load(cfg.dev_md_fd + '/md100.p')

# evaluate for each test feature
names = os.listdir(cfg.eva_wav_fd)
names = sorted(names)
results = []
if not os.path.exists(cfg.eva_results_fd):
    os.makedirs(cfg.eva_results_fd)
for na in names:
    print na
    # load data
    te_fe = eva_fe_fd + '/' + na[0:-4] + '.f'
    X = cPickle.load(open(te_fe, 'rb'))
    # mat_2d_to_3d presumably comes from `evaluation` (star import) — verify.
    X3d = mat_2d_to_3d(X, agg_num, hop)
    # detect
    # NOTE(review): the loop body continues past this chunk.
def recognize_on_test_data(): # test_fe_fd = cfg.test_denoise_fe_enhance_mel_fd test_fe_fd = cfg.test_denoise_fe_enhance_pool_fft_fd # load data md = serializations.load( cfg.wbl_dev_md_fd+'/cnn_fft/md3000_iters.p' ) names = os.listdir( test_fe_fd ) names = sorted(names) i1 = 0 f = open(cfg.scrap_fd + "/test_bird_result.csv", 'w') for na in names: if i1!=0: f.write("\n") if i1%1==0: path = test_fe_fd + "/" + na X = cPickle.load( open( path, 'rb' ) ) [n_chunks, n_freq] = X.shape #X = pp_data.wipe_click2d( X ) X = X.reshape( (1, n_chunks, n_freq) ) X *=10000 # amplitude test data, which is useful n_pad = int( cfg.n_duration/2 ) X, mask = pad_trunc_seqs( X, n_pad, 'post' ) mask = pp_data.cut_test_fe_tail( mask ) X *= mask[:,:,None] [out3d, detect3d] = md.predict( [X, mask], batch_size=100 ) out3d *= mask[:,:,None] detect3d *= mask[:,:,None] uni_mu = detect3d[0,:,0] / ( np.sum( detect3d[0,:,0] ) + 1e-8 ) score = np.sum( out3d[0,:,0] * uni_mu ) if score < 0.00001: score=0 print i1, na, score, np.sum(out3d[0,:,0]*detect3d[0,:,0]), np.sum(detect3d[0,:,0]) # # Plot for debug! # fig, axs = plt.subplots(4, sharex=True) # axs[0].matshow( np.log(X[0,:,:].T), origin='lower', aspect='auto' ) # axs[0].set_title('mel spectrogram') # # axs[1].stem( detect3d[0,:,0] ) # axs[1].set_ylim([0,1]) # axs[1].set_title('detector') # # # axs[2].stem( out3d[0] ) # axs[2].set_ylim([0,1]) # axs[2].set_title('classifier') # # # axs[3].stem( detect3d[0,:,0]*out3d[0,:,0] ) # axs[3].set_ylim([0,1]) # axs[3].set_title(score) # plt.show() f.write(na[0:-2] + "," + str(score)) i1 += 1 f.close()
def recognize(args):
    """Run audio tagging (AT) and sound event detection (SED) on the
    testing mixtures and dump predictions plus ground truths to pickle.
    """
    workspace = args.workspace
    md_path = os.path.join(workspace, "models", pp_data.get_filename(__file__),
                           args.model_name)
    t1 = time.time()

    # Load scaler.
    scaler_path = os.path.join(workspace, "scalers", "logmel", "training.scaler")
    # Fix: close the scaler file deterministically instead of leaking
    # the handle returned by open().
    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    # Load model.
    md = serializations.load(md_path)

    # Observe function.
    observe_nodes = [md.find_layer('seg_masks').output_]
    f_forward = md.get_observe_forward_func(observe_nodes)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    at_pd_ary = []
    at_gt_ary = []
    sed_pd_ary = []
    sed_gt_ary = []

    # For all audio clips.
    for na in names:
        if '.mix_0db.wav' in na:
            logging.info(na)

            # Load audio.
            bare_na = os.path.splitext(os.path.splitext(na)[0])[0]
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, fs) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            # Load yaml.
            yaml_path = os.path.join(audio_dir, "%s.yaml" % bare_na)
            with open(yaml_path, 'r') as f:
                # NOTE(review): yaml.load without an explicit Loader is unsafe
                # on untrusted input — consider yaml.safe_load here.
                data = yaml.load(f)
            event_type = data['event_type']

            # Calculate feature.
            x = pp_data.calc_feat(mixed_audio)
            x3d = pp_data.do_scaler_on_x3d(x[np.newaxis, ...], scaler)

            # Ground truth.
            gt_y = [0, 0, 0, 0]
            gt_y[cfg.lb_to_ix[event_type]] = 1
            at_gt_ary.append(gt_y)

            # Audio tagging (AT) prediction.
            [pred_y] = md.predict(x3d)  # (1, n_events+1)
            pred_y = pred_y[0]  # (n_events+1,)
            at_pd_ary.append(pred_y)

            # Sound event detection (SED) prediction.
            [masks] = md.run_function(f_forward, x3d, batch_size=10, tr_phase=0.)
            # (1, n_events+1, n_time, n_freq)
            masks = masks[0]  # (n_events+1, n_time, n_freq)
            sed_pd = np.mean(masks, axis=-1).T  # (n_time, n_events+1)
            sed_pd_ary.append(sed_pd)

            # SED ground truth from the annotated event segment.
            sed_gt = np.zeros_like(sed_pd)
            [bgn_sec, fin_sec] = data['event_segment']
            bgn_fr = int(bgn_sec * cfg.sample_rate / float(cfg.n_window - cfg.n_overlap))
            fin_fr = int(fin_sec * cfg.sample_rate / float(cfg.n_window - cfg.n_overlap))
            sed_gt[bgn_fr:fin_fr, cfg.lb_to_ix[event_type]] = 1
            sed_gt_ary.append(sed_gt)

    at_pd_ary = np.array(at_pd_ary)
    at_gt_ary = np.array(at_gt_ary)
    sed_pd_ary = np.array(sed_pd_ary)
    sed_gt_ary = np.array(sed_gt_ary)

    # Write out AT and SED presence probabilites.
    logging.info("at_pd_ary.shape: %s" % (at_pd_ary.shape, ))
    logging.info("at_gt_ary.shape: %s" % (at_gt_ary.shape, ))
    logging.info("sed_pd_ary.shape: %s" % (sed_pd_ary.shape, ))
    logging.info("sed_gt_ary.shape: %s" % (sed_gt_ary.shape, ))

    # Fix: don't shadow the builtin `dict`.
    out_dict = {}
    out_dict['at_pd_ary'] = at_pd_ary
    out_dict['at_gt_ary'] = at_gt_ary
    out_dict['sed_pd_ary'] = sed_pd_ary
    out_dict['sed_gt_ary'] = sed_gt_ary
    out_path = os.path.join(workspace, "_tmp", "_at_sed_dict.p")
    pp_data.create_folder(os.path.dirname(out_path))
    # Fix: close the output file deterministically.
    with open(out_path, 'wb') as out_file:
        cPickle.dump(out_dict, out_file, protocol=cPickle.HIGHEST_PROTOCOL)

    logging.info("Recognize time: %s" % (time.time() - t1, ))
from hat.metrics import prec_recall_fvalue
import hat.backend as K
import config as cfg
import prepare_dev_data as pp_dev_data
import prepare_eva_data as pp_eva_data
import csv
import cPickle

# hyper-params
agg_num = 11
hop = 15
fold = 1
n_labels = len(cfg.labels)

# load model
# NOTE(review): `serializations` is not imported in this fragment —
# presumably `from hat import serializations` earlier in the file; verify.
md = serializations.load(cfg.eva_md_fd + '/md10.p')

# prepare data
te_X = pp_eva_data.GetAllData(cfg.eva_fe_mel_fd, cfg.eva_csv_path, agg_num, hop)

# do recognize and evaluation
thres = 0.4  # thres, tune to prec=recall
n_labels = len(cfg.labels)
pp_dev_data.CreateFolder(cfg.eva_results_fd)
txt_out_path = cfg.eva_results_fd + '/task4_results.txt'
fwrite = open(txt_out_path, 'w')
with open(cfg.eva_csv_path, 'rb') as f:
    reader = csv.reader(f)
    lis = list(reader)
    # NOTE(review): the script continues past this chunk; `fwrite` is
    # presumably closed later.
def jsc_separation(args):
    """Joint separation-classification (JSC) source separation.

    For every test mixture: predicts Mel-domain segmentation masks,
    maps them back to the linear-frequency domain, and either plots
    them (args.plot_only) or writes separated wav files per event.
    """
    workspace = args.workspace

    scaler_path = os.path.join(workspace, "scalers", "logmel", "training.scaler")
    scaler = pickle.load(open(scaler_path, 'rb'))

    md_path = os.path.join(workspace, "models", "main", args.model_name)
    md = serializations.load(md_path)

    out_dir = os.path.join(workspace, "separated_wavs", "jsc_separation")
    pp_data.create_folder(out_dir)

    # Node that emits the per-event segmentation masks.
    observe_nodes = [md.find_layer('seg_masks').output_]
    f_forward = md.get_observe_forward_func(observe_nodes)

    audio_dir = os.path.join(os.path.join(workspace, "mixed_audio", "testing"))
    names = os.listdir(audio_dir)

    # STFT / Mel-filterbank parameters.
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win**2).sum())
    melW = librosa.filters.mel(sr=fs, n_fft=n_window, n_mels=64, fmin=0., fmax=fs / 2)
    # Maps Mel-domain masks back to linear frequency bins.
    inverse_melW = get_inverse_W(melW)

    for na in names:
        if ".mix" in na:
            # Read yaml
            bare_name = os.path.splitext(os.path.splitext(na)[0])[0]
            yaml_path = os.path.join(audio_dir, "%s.yaml" % bare_name)
            with open(yaml_path, 'r') as f:
                # NOTE(review): yaml.load without an explicit Loader is
                # unsafe on untrusted input — consider yaml.safe_load.
                data = yaml.load(f)
            event_type = data['event_type']
            print(na, event_type)

            # Read audio
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, _) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            # Complex spectrograms of background, event and mixture.
            [f, t, bg_spec] = signal.spectral.spectrogram(x=bg_audio,
                                                          window=ham_win,
                                                          nperseg=n_window,
                                                          noverlap=n_overlap,
                                                          detrend=False,
                                                          return_onesided=True,
                                                          scaling='density',
                                                          mode='complex')
            [f, t, event_spec] = signal.spectral.spectrogram(x=event_audio,
                                                             window=ham_win,
                                                             nperseg=n_window,
                                                             noverlap=n_overlap,
                                                             detrend=False,
                                                             return_onesided=True,
                                                             scaling='density',
                                                             mode='complex')
            [f, t, mixed_spec] = signal.spectral.spectrogram(x=mixed_audio,
                                                             window=ham_win,
                                                             nperseg=n_window,
                                                             noverlap=n_overlap,
                                                             detrend=False,
                                                             return_onesided=True,
                                                             scaling='density',
                                                             mode='complex')
            # Transpose to (n_time, n_freq).
            bg_spec = bg_spec.T
            event_spec = event_spec.T
            mixed_spec = mixed_spec.T

            # Log Mel spectrogram
            mixed_x = pp_data.calc_feat(mixed_audio)
            x3d = pp_data.do_scaler_on_x3d(mixed_x[np.newaxis, ...], scaler)

            # Segmentation masks
            [mel_masks] = md.run_function(f_forward, x3d, batch_size=10, tr_phase=0.)
            # NOTE(review): annotated as (n_time, 64), but the spec_masks[0..3]
            # indexing below suggests a leading per-event axis — confirm shape.
            mel_masks = mel_masks[0]  # (n_time, 64)
            spec_masks = np.dot(mel_masks, inverse_melW)  # (n_time, 513)

            if args.plot_only:
                # Debug view: compare predicted masks against an ideal
                # binary mask (IBM) derived from the oracle stems.
                mixed_mel_spec = np.dot(np.abs(mixed_spec), melW.T)
                bg_mel_spec = np.dot(np.abs(bg_spec), melW.T)
                event_mel_spec = np.dot(np.abs(event_spec), melW.T)
                ratio = 1.7  # 5 dB
                event_mask = (np.sign(event_mel_spec / (bg_mel_spec * ratio) - 1) + 1) / 2
                fig, axs = plt.subplots(3, 2, sharex=True)
                axs[0, 0].matshow(np.log(mixed_mel_spec.T), origin='lower', aspect='auto')
                axs[0, 1].matshow(event_mask.T, origin='lower', aspect='auto')
                axs[1, 0].matshow(spec_masks[0].T, origin='lower', aspect='auto', vmin=0., vmax=1.)
                axs[1, 1].matshow(spec_masks[1].T, origin='lower', aspect='auto', vmin=0., vmax=1.)
                axs[2, 0].matshow(spec_masks[2].T, origin='lower', aspect='auto', vmin=0., vmax=1.)
                axs[2, 1].matshow(spec_masks[3].T, origin='lower', aspect='auto', vmin=0., vmax=1.)
                axs[0, 0].set_title('log Mel of mixture')
                axs[0, 1].set_title('IBM of event')
                axs[1, 0].set_title('babycry')
                axs[1, 1].set_title('glassbreak')
                axs[2, 0].set_title('gunshot')
                axs[2, 1].set_title('bg')
                plt.show()
            else:
                # Separated spec: apply each mask to the mixture magnitude.
                separated_specs = spec_masks * np.abs(mixed_spec)[None, :, :]

                # Write out all events and bg
                enlarged_events = cfg.events + ['bg']
                for i1 in xrange(4):
                    s = spectrogram_to_wave.recover_wav(
                        separated_specs[i1], mixed_spec, n_overlap=n_overlap,
                        winfunc=np.hamming, wav_len=len(mixed_audio))
                    s *= recover_scaler
                    pp_data.write_audio(
                        os.path.join(out_dir, "%s.sep_%s.wav" % (bare_name, enlarged_events[i1])),
                        s, fs)

                # Write out event
                s = spectrogram_to_wave.recover_wav(
                    separated_specs[cfg.lb_to_ix[event_type]], mixed_spec,
                    n_overlap=n_overlap, winfunc=np.hamming,
                    wav_len=len(mixed_audio))
                s *= recover_scaler
                pp_data.write_audio(
                    os.path.join(out_dir, "%s.sep_event.wav" % bare_name), s, fs)

                # Write out origin mix
                pp_data.write_audio(
                    os.path.join(out_dir, "%s.sep_mix.wav" % bare_name),
                    mixed_audio, fs)
from hat.metrics import prec_recall_fvalue
import hat.backend as K
import config as cfg
import prepare_dev_data as pp_dev_data
import prepare_eva_data as pp_eva_data
import csv
import cPickle

# hyper-params
agg_num = 11
hop = 15
fold = 1
n_labels = len(cfg.labels)

# load model
# NOTE(review): `serializations` is not imported in this fragment —
# presumably `from hat import serializations` earlier in the file; verify.
md = serializations.load(cfg.eva_md_fd + '/md10.p')

# prepare data
te_X = pp_eva_data.GetAllData(cfg.eva_fe_mel_fd, cfg.eva_csv_path, agg_num, hop)

# do recognize and evaluation
thres = 0.4  # thres, tune to prec=recall
n_labels = len(cfg.labels)
pp_dev_data.CreateFolder(cfg.eva_results_fd)
txt_out_path = cfg.eva_results_fd + '/task4_results.txt'
fwrite = open(txt_out_path, 'w')
with open(cfg.eva_csv_path, 'rb') as f:
    reader = csv.reader(f)
    lis = list(reader)
    # NOTE(review): the script continues past this chunk; `fwrite` is
    # presumably closed later.
def recognize0():
    """Score the test set with models saved at several training epochs
    and plot accuracy-vs-threshold curves (one per epoch).
    """
    # load data
    dict = cPickle.load(open(cfg.scrap_fd + '/denoise_enhance_pool_fft_all0.p', 'rb'))
    tr_X, tr_mask, tr_y, tr_na_list, te_X, te_mask, te_y, te_na_list = dict['tr_X'], dict['tr_mask'], dict['tr_y'], dict['tr_na_list'], dict['te_X'], dict['te_mask'], dict['te_y'], dict['te_na_list']
    tr_X = pp_data.wipe_click(tr_X, tr_na_list)
    te_X = pp_data.wipe_click(te_X, te_na_list)
    print tr_X.shape, tr_y.shape, te_X.shape, te_y.shape

    # Evaluate on the test split only.
    x = te_X
    mask = te_mask
    y = te_y
    na_list = te_na_list
    [n_songs, n_chunks, n_freq] = x.shape

    # K = 10
    K = n_songs  # number of clips to score; set smaller for quick runs.
    x = x[0:K]
    mask = mask[0:K]

    # One curve per checkpoint epoch.
    for epoch in np.arange(1000, 5100, 1000):
        md = serializations.load(cfg.wbl_dev_md_fd + '/cnn_fft/md' + str(epoch) + '_iters.p')
        [out3d, detect3d] = md.predict([x, mask], batch_size=100)  # shape: (K, n_chunks, n_out)
        out3d *= mask[:, :, None]
        detect3d *= mask[:, :, None]

        score_ary = []
        gt_ary = []
        for i1 in xrange(K):
            # Attention-weighted clip score from detector + classifier.
            uni_mu = detect3d[i1, :, 0] / np.sum(detect3d[i1, :, 0])
            score = np.sum(out3d[i1, :, 0] * uni_mu)
            score_ary.append(score)
            gt_ary.append(y[i1])

            # plot, debug, DO NOT DELETE!
            # print i1, y[i1], na_list[i1], score, np.sum(out3d[i1,:,0]*detect3d[i1,:,0]), np.sum(detect3d[i1,:,0])
            #
            # fig, axs = plt.subplots(4, sharex=True)
            # axs[0].matshow( np.log(x[i1,:,:].T), origin='lower', aspect='auto' )
            # axs[0].set_title('mel spectrogram')
            #
            # axs[1].stem( detect3d[i1,:,0] )
            # axs[1].set_ylim([0,1])
            # axs[1].set_title('detector')
            #
            # axs[2].stem( out3d[i1] )
            # axs[2].set_ylim([0,1])
            # axs[2].set_title('classifier')
            #
            # axs[3].stem( detect3d[i1,:,0]*out3d[i1,:,0] )
            # axs[3].set_ylim([0,1])
            # axs[3].set_title('overall')
            # plt.show()

        acc_ary, auc = pp_data.get_auc(score_ary, gt_ary)
        # Later epochs are drawn more opaque (alpha ~ epoch/5000).
        plt.plot(np.arange(0, 1 + 1e-6, 0.1), acc_ary, alpha=epoch / float(5000), color='r')
        plt.axis([0, 1, 0, 1])
        print auc
    # NOTE(review): placement of this show() (inside vs after the epoch
    # loop) is ambiguous in the mangled source — confirm.
    plt.show()
import prepare_dev_data as pp_dev_data
import config as cfg
from evaluation import *
import pickle
import cPickle
import os

# hyper-params
agg_num = 100
hop = 1
te_fe_fd = cfg.dev_te_fe_mel_fd
test_noise = '-6'  # can be '0_', '6_', '-6'
thres = 0.2

# load model
# NOTE(review): `serializations` is not imported in this fragment —
# presumably imported earlier in the file; verify.
md = serializations.load(cfg.dev_md_fd + '/md100.p')

# evaluate for each test feature
names = os.listdir(cfg.dev_ann_fd)
names = sorted(names)
results = []
if not os.path.exists(cfg.dev_results_fd):
    os.makedirs(cfg.dev_results_fd)
for na in names:
    # Only evaluate annotations matching the chosen noise condition.
    if na[10:12] == test_noise:
        print na
        # load data
        ann_path = cfg.dev_ann_fd + '/' + na
        gt_list = pp_dev_data.ReadAnn(ann_path)  # ground truth list
        te_fe = te_fe_fd + '/' + na[0:-4] + '.f'
        X = cPickle.load(open(te_fe, 'rb'))
        # NOTE(review): the loop body continues past this chunk.
import os
from hat.models import Model
from hat.layers.core import InputLayer, Dense, Dropout
from hat.callbacks import SaveModel, Validation
from hat.preprocessing import sparse_to_categorical
from hat.optimizers import SGD, Adam
from hat import serializations

# Network dimensions.
n_in = 784
n_hid = 500
n_out = 10

# Build a two-hidden-layer MLP with dropout after each hidden layer.
lay_in = InputLayer(in_shape=(n_in, ))
hidden = Dense(n_out=n_hid, act='relu')(lay_in)
hidden = Dropout(p_drop=0.2)(hidden)
hidden = Dense(n_out=n_hid, act='relu')(hidden)
hidden = Dropout(p_drop=0.2)(hidden)
lay_out = Dense(n_out=n_out, act='softmax')(hidden)

md = Model(in_layers=[lay_in], out_layers=[lay_out])
md.compile()
md.summary()

# Round-trip the model through the serializer: save, then load back.
md_path = 'model.p'
serializations.save(md=md, path=md_path)
md_load = serializations.load(md_path)