def create_cv_csv(out_path):
    """Create cross validation csv file."""
    dataset_dir = cfg.dataset_dir
    workspace = cfg.workspace
    events = cfg.events
    n_folds = cfg.n_folds

    pp_data.create_folder(os.path.dirname(out_path))
    f = open(out_path, 'w')
    f.write("name\tfold\n")

    names = os.listdir(dataset_dir)
    for event in events:
        event_names = [e for e in names if event in e]
        kf = KFold(n_splits=n_folds, shuffle=False, random_state=None)
        fold = 0
        for (tr_idxes, te_idxes) in kf.split(event_names):
            for idx in te_idxes:
                event_name = event_names[idx]
                f.write("%s\t%d\n" % (event_name, fold))
            fold += 1
    f.close()
    print("Write out to %s" % out_path)
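# Hedged illustration of the csv that create_cv_csv() writes: a tab-separated
# header followed by one row per event file. The file names below are
# hypothetical; the real names come from cfg.dataset_dir.
#
#   name            fold
#   babycry_001.wav 0
#   babycry_002.wav 1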
def recognize(args):
    workspace = cfg.workspace
    events = cfg.events
    n_events = args.n_events
    snr = args.snr
    md_na = args.model_name
    lb_to_ix = cfg.lb_to_ix
    n_out = len(cfg.events)
    te_fold = cfg.te_fold

    md_path = os.path.join(workspace, "models", pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr, md_na)
    md = serializations.load(md_path)

    # Load data.
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (tr_x, tr_at_y, tr_sed_y, tr_na_list,
     te_x, te_at_y, te_sed_y, te_na_list) = pp_data.load_data(
        feature_dir=feature_dir, yaml_dir=yaml_dir, te_fold=te_fold,
        snr=snr, is_scale=is_scale)

    x = te_x
    at_gts = te_at_y
    sed_gts = te_sed_y
    na_list = te_na_list

    # Recognize.
    [at_pds] = md.predict(x)  # (N, 16)

    observe_nodes = [md.find_layer('detect').output_]
    f_forward = md.get_observe_forward_func(observe_nodes)
    [seg_masks] = md.run_function(f_forward, x, batch_size=500,
                                  tr_phase=0.)  # (n_clips, n_time, n_out)
    seg_masks = np.transpose(seg_masks, (0, 2, 1))[:, :, :, np.newaxis]

    # Dump to pickle.
    out_dir = os.path.join(workspace, "preds", pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr, os.path.splitext(md_na)[0])
    pp_data.create_folder(out_dir)
    out_at_path = os.path.join(out_dir, "at_probs.p")
    out_seg_masks_path = os.path.join(out_dir, "seg_masks.p")
    cPickle.dump(at_pds, open(out_at_path, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(seg_masks, open(out_seg_masks_path, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    # Print stats.
    sed_pds = np.mean(seg_masks, axis=-1)  # (N, n_out, n_time)
    sed_pds = np.transpose(sed_pds, (0, 2, 1))  # (N, n_time, n_out)
    print_stats(at_pds, at_gts, sed_pds, sed_gts)
def no_separation(args):
    """Write out the un-separated mixture as a baseline."""
    workspace = args.workspace

    out_dir = os.path.join(workspace, "separated_wavs", "no_separation")
    pp_data.create_folder(out_dir)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    for na in names:
        if '.mix_0db.wav' in na:
            print(na)
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, fs) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio
            bare_na = os.path.splitext(os.path.splitext(na)[0])[0]

            # The mixture itself serves as both "separated" outputs.
            pp_data.write_audio(os.path.join(out_dir, bare_na + ".sep_bg.wav"),
                                mixed_audio, fs)
            pp_data.write_audio(os.path.join(out_dir, bare_na + ".sep_event.wav"),
                                mixed_audio, fs)

    print("Write out finished!")
def calculate_scalar(args):
    workspace = args.workspace
    stack_num = args.stack_num
    hop_frames = args.hop_frames
    filename = args.filename
    audio_type = 'speech'

    hdf5_file = os.path.join(args.workspace, "features", "cmplx_spectrogram.h5")
    data_type = 'train'
    batch_size = 500

    data_loader = pp_data.DataLoader(hdf5_file, data_type, audio_type,
                                     stack_num, hop_frames, center_only=True,
                                     batch_size=batch_size)

    # Accumulate a subset of batches to estimate the scaler.
    x_all = []
    iter = 0
    max_iter = 100
    for (batch_x, batch_y) in data_loader.generate():
        x_all.append(batch_x)
        iter += 1
        if iter == max_iter:
            break

    x_all = np.concatenate(x_all, axis=0)
    x_all = np.abs(x_all)
    x_all = transform(x_all, type='numpy')

    (mean_, std_) = pp_data.calculate_scalar(x_all)

    out_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    pp_data.create_folder(os.path.dirname(out_path))
    cPickle.dump((mean_, std_), open(out_path, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)
    print("Scalar saved to %s" % out_path)
def train(tr_fe_fd, tr_csv_file, te_fe_fd, te_csv_file,
          n_concat, hop, scaler, out_md_fd):
    # Prepare data
    tr_x, tr_y = pp_data.get_matrix_format_data(
        fe_fd=tr_fe_fd, csv_file=tr_csv_file,
        n_concat=n_concat, hop=hop, scaler=scaler)
    te_x, te_y = pp_data.get_matrix_format_data(
        fe_fd=te_fe_fd, csv_file=te_csv_file,
        n_concat=n_concat, hop=hop, scaler=scaler)

    n_freq = tr_x.shape[2]
    print 'tr_x.shape:', tr_x.shape  # (n_samples, n_concat, n_freq)
    print 'tr_y.shape:', tr_y.shape  # (n_samples, n_labels)

    # Build model
    n_out = len(cfg.labels)
    seq = Sequential()
    seq.add(InputLayer((n_concat, n_freq)))
    seq.add(Flatten())
    seq.add(Dropout(0.2))
    seq.add(Dense(200, act='relu'))
    seq.add(Dropout(0.2))
    seq.add(Dense(200, act='relu'))
    seq.add(Dropout(0.2))
    seq.add(Dense(n_out, act='softmax'))
    md = seq.compile()
    md.summary()

    # Validation.
    # tr_err, te_err are frame based. To get event based err, run recognize.py
    validation = Validation(tr_x=tr_x, tr_y=tr_y, va_x=None, va_y=None,
                            te_x=te_x, te_y=te_y, batch_size=500,
                            call_freq=1, dump_path=None)

    # Save model
    pp_data.create_folder(out_md_fd)
    save_model = SaveModel(out_md_fd, call_freq=2)

    # Callbacks
    callbacks = [validation, save_model]

    # Optimizer
    optimizer = Adam(1e-3)

    # Fit model
    md.fit(x=tr_x, y=tr_y, batch_size=100, n_epochs=101,
           loss_func='categorical_crossentropy',
           optimizer=optimizer, callbacks=callbacks)
def get_avg_stats(args, file_name, bgn_iter, fin_iter, interval_iter):
    eval_hdf5_path = os.path.join(args.cpickle_dir, "eval.h5")
    workspace = args.workspace

    # Load ground truth
    (te_x, te_y, te_id_list) = pp_data.load_data(eval_hdf5_path)
    y = te_y

    # Average prediction probabilities of several iterations
    prob_dir = os.path.join(workspace, "probs", file_name, "test")
    names = os.listdir(prob_dir)

    probs = []
    iters = range(bgn_iter, fin_iter, interval_iter)
    for iter in iters:
        pickle_path = os.path.join(prob_dir, "prob_%d_iters.p" % iter)
        prob = cPickle.load(open(pickle_path, 'rb'))
        probs.append(prob)
    avg_prob = np.mean(np.array(probs), axis=0)

    # Compute stats
    t1 = time.time()
    n_out = y.shape[1]
    stats = []
    for k in range(n_out):
        (precisions, recalls, thresholds) = metrics.precision_recall_curve(
            y[:, k], avg_prob[:, k])
        avg_precision = metrics.average_precision_score(y[:, k],
                                                        avg_prob[:, k],
                                                        average=None)
        (fpr, tpr, thresholds) = metrics.roc_curve(y[:, k], avg_prob[:, k])
        auc = metrics.roc_auc_score(y[:, k], avg_prob[:, k], average=None)
        # eer = pp_data.eer(avg_prob[:, k], y[:, k])
        skip = 1000
        dict = {'precisions': precisions[0::skip],
                'recalls': recalls[0::skip],
                'AP': avg_precision,
                'fpr': fpr[0::skip],
                'fnr': 1. - tpr[0::skip],
                'auc': auc}
        stats.append(dict)
    logging.info("Callback time: %s" % (time.time() - t1,))

    # Dump stats
    dump_path = os.path.join(workspace, "stats",
                             pp_data.get_filename(__file__), "test",
                             "avg_%d_%d_%d.p" % (bgn_iter, fin_iter,
                                                 interval_iter))
    pp_data.create_folder(os.path.dirname(dump_path))
    cPickle.dump(stats, open(dump_path, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    # Write out to log
    logging.info("bgn_iter, fin_iter, interval_iter: %d, %d, %d" %
                 (bgn_iter, fin_iter, interval_iter))
    logging.info("mAP: %f" % np.mean([e['AP'] for e in stats]))
    auc = np.mean([e['auc'] for e in stats])
    logging.info("auc: %f" % auc)
    logging.info("d_prime: %f" % pp_data.d_prime(auc))
def my_plot(pd, gt, picture_path, threshold=None):
    classes = cfg.classes
    ig = cfg.ig
    mg = cfg.mg

    estimate_path = picture_path.replace("picture", "estimate_txt")
    estimate_path = estimate_path.replace("jpg", "txt")

    folder, _ = os.path.split(picture_path)
    if not os.path.exists(folder):
        create_folder(folder)
    folder, _ = os.path.split(estimate_path)
    if not os.path.exists(folder):
        create_folder(folder)

    result = open(estimate_path, 'at')
    n_cls = len(classes)
    if threshold is None:
        pd_ = pd.argmax(axis=-1)

    for i in range(n_cls):
        # Plot ground truth curve and predicted probabilities.
        plt.subplot(221 + i)
        plt.plot(range(240), gt[:, i], 'r')
        plt.bar(left=range(240), height=pd[:, i], width=1, color='b')
        plt.xlim(0, 251)
        plt.ylim(0, 1.1)

        if i != 0:
            if threshold is None:
                class_ind = np.where(pd_ == i)[0]
                pd_class = np.zeros(pd_.shape)
                pd_class[class_ind] = 1
                segments = pro_boundary(pd_class, 0, mg[i], ig[i])
            else:
                segments = pro_boundary(pd[:, i], threshold[i], mg[i], ig[i])

            for j in range(len(segments)):
                # Mark segment boundaries on the plot.
                plt.plot([segments[j][0]] * 240,
                         np.arange(240) / 240.0 * 1.1, 'g')
                plt.plot([segments[j][1]] * 240,
                         np.arange(240) / 240.0 * 1.1, 'g')
                result.write(str(segments[j][0] * cfg.step_time) + '\t' +
                             str(segments[j][1] * cfg.step_time) + '\t' +
                             classes[i] + '\n')

    plt.savefig(picture_path)
    plt.close()
    result.close()
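# Hedged example of one line my_plot() appends to the estimate txt: onset and
# offset in seconds (segment frame index * cfg.step_time) and the class name,
# tab separated. Values below are hypothetical:
#
#   0.96    2.88    <class_name>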
def eval(md, x, y, out_dir, out_probs_dir, iter_):
    # Predict
    t1 = time.time()
    (n_clips, n_time, n_freq) = x.shape
    (x, y) = pp_data.transform_data(x, y)
    prob = md.predict(x)
    prob = prob.astype(np.float32)

    if out_dir:
        pp_data.create_folder(out_dir)

    # Dump predicted probabilities for future averaging
    if out_probs_dir:
        pp_data.create_folder(out_probs_dir)
        out_prob_path = os.path.join(out_probs_dir, "prob_%d_iters.p" % iter_)
        cPickle.dump(prob, open(out_prob_path, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)

    # Compute and dump stats
    n_out = y.shape[1]
    stats = []
    t1 = time.time()
    for k in range(n_out):
        (precisions, recalls, thresholds) = metrics.precision_recall_curve(
            y[:, k], prob[:, k])
        avg_precision = metrics.average_precision_score(y[:, k], prob[:, k],
                                                        average=None)
        (fpr, tpr, thresholds) = metrics.roc_curve(y[:, k], prob[:, k])
        auc = metrics.roc_auc_score(y[:, k], prob[:, k], average=None)
        # eer = pp_data.eer(prob[:, k], y[:, k])
        skip = 1000
        dict = {'precisions': precisions[0::skip],
                'recalls': recalls[0::skip],
                'AP': avg_precision,
                'fpr': fpr[0::skip],
                'fnr': 1. - tpr[0::skip],
                'auc': auc}
        stats.append(dict)
    logging.info("Callback time: %s" % (time.time() - t1,))

    dump_path = os.path.join(out_dir, "md%d_iters.p" % iter_)
    cPickle.dump(stats, open(dump_path, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    logging.info("mAP: %f" % np.mean([e['AP'] for e in stats]))
def write_out_at_sed(md, gen, f_forward, x, at_y, sed_y, n_events, snr,
                     te_fold):
    workspace = cfg.workspace

    pred_at_all = []
    seg_masks_all = []
    gt_at_all = []
    gt_sed_all = []

    # DO NOT SHUFFLE DATA! Predictions must stay aligned with ground truth.
    for [batch_x, batch_at_y, batch_sed_y] in gen.generate(zs=[x, at_y, sed_y]):
        # AT.
        [at_pred] = md.predict(batch_x, batch_size=None)
        pred_at_all.append(at_pred)

        # SED.
        [seg_masks] = md.run_function(func=f_forward, z=[batch_x],
                                      batch_size=500, tr_phase=0.)
        seg_masks_all.append(seg_masks)

        gt_at_all.append(batch_at_y)
        gt_sed_all.append(batch_sed_y)

    pred_at_all = np.concatenate(pred_at_all, axis=0)
    seg_masks_all = np.concatenate(seg_masks_all, axis=0)
    gt_at_all = np.concatenate(gt_at_all, axis=0)
    gt_sed_all = np.concatenate(gt_sed_all, axis=0)

    # Compress to float16 to reduce space.
    pred_at_all = pred_at_all.astype(np.float16)
    seg_masks_all = seg_masks_all.astype(np.float16)
    print(pred_at_all.shape)
    print(seg_masks_all.shape)
    print(pred_at_all.dtype)

    out_dir = os.path.join(workspace, "callbacks", "preds",
                           pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr, "md%d_iters" % md.iter_)
    pp_data.create_folder(out_dir)
    out_at_path = os.path.join(out_dir, "at_probs.p")
    out_seg_masks_path = os.path.join(out_dir, "seg_masks.p")
    cPickle.dump(pred_at_all, open(out_at_path, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(seg_masks_all, open(out_seg_masks_path, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    thres = 0.5
    (tp, fn, fp, tn) = tp_fn_fp_tn(pred_at_all, gt_at_all, thres,
                                   average='macro')
    (prec, recall, fvalue) = prec_recall_fvalue(pred_at_all, gt_at_all, thres,
                                                average='macro')
    logging.info("tp, fn, fp, tn: %d %d %d %d" % (tp, fn, fp, tn))
    logging.info("prec, recall, fvalue: %f %f %f" % (prec, recall, fvalue))
def eval(model, x, y, out_dir, out_probs_dir, md_iter):
    pp_data.create_folder(out_dir)

    # Predict
    t1 = time.time()
    (n_clips, n_time_, n_freq) = x.shape
    (x, y) = pp_data.transform_data(x, y)
    prob = model.predict(x)
    prob = prob.astype(np.float32)
    print("Evaluation at iteration %d." % md_iter)

    # Dump predicted probabilities for future averaging (currently disabled).
    if out_probs_dir:
        pp_data.create_folder(out_probs_dir)
        out_prob_path = os.path.join(out_probs_dir,
                                     "prob_%d_iters.p" % md_iter)
        # cPickle.dump(prob, open(out_prob_path, 'wb'))

    # Compute and dump stats.
    n_out = y.shape[1]
    stats = []
    t1 = time.time()
    for k in range(n_out):
        (precisions, recalls, thresholds) = metrics.precision_recall_curve(
            y[:, k], prob[:, k])
        avg_precision = metrics.average_precision_score(y[:, k], prob[:, k],
                                                        average=None)
        (fpr, tpr, thresholds) = metrics.roc_curve(y[:, k], prob[:, k])
        auc = metrics.roc_auc_score(y[:, k], prob[:, k], average=None)
        eer = pp_data.eer(prob[:, k], y[:, k])
        skip = 1000
        dict = {'precisions': precisions[0::skip],
                'recalls': recalls[0::skip],
                'AP': avg_precision,
                'fpr': fpr[0::skip],
                'fnr': 1. - tpr[0::skip],
                'auc': auc}
        stats.append(dict)
    logging.info("Callback time: %s" % (time.time() - t1,))

    dump_path = os.path.join(out_dir, "model_%d_iters.p" % (md_iter,))
    cPickle.dump(stats, open(dump_path, 'wb'))

    mAP = np.mean([e['AP'] for e in stats])
    logging.info("mAP at iteration %d: %f" % (md_iter, mAP))
    return mAP
def plot_training_stat(args):
    """Plot training and testing loss.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      bgn_iter: int, plot from bgn_iter.
      fin_iter: int, plot finishes at fin_iter.
      interval_iter: int, interval of stat files.
    """
    workspace = args.workspace
    tr_snr = args.tr_snr
    bgn_iter = args.bgn_iter
    fin_iter = args.fin_iter
    interval_iter = args.interval_iter

    tr_losses, te_losses, iters = [], [], []

    # Load stats.
    stats_dir = os.path.join(workspace, "training_stats", "%ddb" % int(tr_snr))
    for iter in xrange(bgn_iter, fin_iter, interval_iter):
        stats_path = os.path.join(stats_dir, "%diters.p" % iter)
        dict = cPickle.load(open(stats_path, 'rb'))
        tr_losses.append(dict['tr_loss'])
        te_losses.append(dict['te_loss'])
        iters.append(dict['iter'])

    # Plot.
    line_tr, = plt.plot(tr_losses, c='b', label="Train")
    line_te, = plt.plot(te_losses, c='r', label="Test")
    plt.axis([0, len(iters), 0, max(tr_losses)])
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    plt.legend(handles=[line_tr, line_te])
    plt.xticks(np.arange(len(iters)), iters)

    out_path = os.path.join(workspace, "figures", "train_history.png")
    pp_data.create_folder(os.path.dirname(out_path))
    plt.savefig(out_path)
def train(args):
    num_classes = cfg.num_classes

    tr_data = h5py.File(args.tr_hdf5_path, 'r+')
    te_data = h5py.File(args.te_hdf5_path, 'r+')
    tr_shape = tr_data['x'].shape
    print("tr_x.shape: %s" % (tr_shape,))

    # Build model
    model = create_model(num_classes, tr_shape)

    # Save model callback
    filepath = os.path.join(
        args.out_model_dir,
        "gatedAct_rationBal44_lr0.001_normalization_at_cnnRNN_64newMel_240fr"
        ".{epoch:02d}-{val_acc:.4f}.hdf5")
    print(filepath)
    create_folder(os.path.dirname(filepath))
    save_model = ModelCheckpoint(filepath=filepath, monitor='val_acc',
                                 verbose=0, save_best_only=False,
                                 save_weights_only=False, mode='auto',
                                 period=1)

    num_examples = 41498
    batch_size = 8

    # Data generator
    gen = RatioDataGenerator(batch_size=batch_size, type='train')

    # Train
    model.fit_generator(
        generator=gen.generate(tr_data),
        steps_per_epoch=5.5 * 100,  # 100 iters is called an 'epoch'
        epochs=31,  # Maximum 'epoch' to train; with a larger dataset the loss
                    # increased after epoch 28.
        verbose=1,
        callbacks=[save_model],
        validation_data=(te_data['x'], te_data['y']))
def ibm_separation(args):
    """Ideal binary mask (IBM) source separation."""
    workspace = args.workspace

    out_dir = os.path.join(workspace, "separated_wavs", "ibm_separation")
    pp_data.create_folder(out_dir)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_sec = cfg.clip_sec
    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win ** 2).sum())

    for na in names:
        if '.mix_0db.wav' in na:
            print(na)
            bare_na = os.path.splitext(os.path.splitext(na)[0])[0]
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, fs) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            [f, t, bg_spec] = signal.spectral.spectrogram(
                x=bg_audio, window=ham_win, nperseg=n_window,
                noverlap=n_overlap, detrend=False, return_onesided=True,
                scaling='density', mode='magnitude')
            [f, t, event_spec] = signal.spectral.spectrogram(
                x=event_audio, window=ham_win, nperseg=n_window,
                noverlap=n_overlap, detrend=False, return_onesided=True,
                scaling='density', mode='magnitude')
            [f, t, mixed_spec] = signal.spectral.spectrogram(
                x=mixed_audio, window=ham_win, nperseg=n_window,
                noverlap=n_overlap, detrend=False, return_onesided=True,
                scaling='density', mode='complex')

            bg_spec = bg_spec.T
            event_spec = event_spec.T
            mixed_spec = mixed_spec.T

            # Binary mask: 1 where the event exceeds the background by ~5 dB.
            ratio = 1.7  # 5 dB
            event_mask = (np.sign(event_spec / (bg_spec * ratio) - 1) + 1) / 2
            bg_mask = 1. - event_mask

            bg_separated_spec = np.abs(mixed_spec) * bg_mask
            event_separated_spec = np.abs(mixed_spec) * event_mask

            # Write out separated background.
            s = spectrogram_to_wave.recover_wav(
                bg_separated_spec, mixed_spec, n_overlap=n_overlap,
                winfunc=np.hamming, wav_len=int(fs * clip_sec))
            s *= recover_scaler
            pp_data.write_audio(os.path.join(out_dir, bare_na + ".sep_bg.wav"),
                                s, fs)

            # Write out separated event.
            s = spectrogram_to_wave.recover_wav(
                event_separated_spec, mixed_spec, n_overlap=n_overlap,
                winfunc=np.hamming, wav_len=int(fs * clip_sec))
            s *= recover_scaler
            pp_data.write_audio(os.path.join(out_dir, bare_na + ".sep_event.wav"),
                                s, fs)

    print("Finished!")
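# A minimal sketch relating the hard-mask threshold above to dB, assuming the
# "ratio = 1.7  # 5 dB" comment means an amplitude ratio: 10 ** (5 / 20.)
# ~= 1.78, so 1.7 is a rounded 5 dB amplitude threshold. Up to ties, the sign
# trick (np.sign(a / (b * ratio) - 1) + 1) / 2 equals this comparison:
def ibm_mask(event_spec, bg_spec, snr_db=5.0):
    ratio = 10.0 ** (snr_db / 20.0)  # dB -> amplitude ratio
    return (event_spec > bg_spec * ratio).astype(np.float64)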
def jsc_separation(args):
    """Joint separation-classification (JSC) source separation."""
    workspace = args.workspace

    scaler_path = os.path.join(workspace, "scalers", "logmel",
                               "training.scaler")
    scaler = pickle.load(open(scaler_path, 'rb'))

    md_path = os.path.join(workspace, "models", "main", args.model_name)
    md = serializations.load(md_path)

    out_dir = os.path.join(workspace, "separated_wavs", "jsc_separation")
    pp_data.create_folder(out_dir)

    observe_nodes = [md.find_layer('seg_masks').output_]
    f_forward = md.get_observe_forward_func(observe_nodes)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win ** 2).sum())
    melW = librosa.filters.mel(sr=fs, n_fft=n_window, n_mels=64,
                               fmin=0., fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)

    for na in names:
        if ".mix" in na:
            # Read yaml
            bare_name = os.path.splitext(os.path.splitext(na)[0])[0]
            yaml_path = os.path.join(audio_dir, "%s.yaml" % bare_name)
            with open(yaml_path, 'r') as f:
                data = yaml.load(f)
            event_type = data['event_type']
            print(na, event_type)

            # Read audio
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, _) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            # Spectrogram
            [f, t, bg_spec] = signal.spectral.spectrogram(
                x=bg_audio, window=ham_win, nperseg=n_window,
                noverlap=n_overlap, detrend=False, return_onesided=True,
                scaling='density', mode='complex')
            [f, t, event_spec] = signal.spectral.spectrogram(
                x=event_audio, window=ham_win, nperseg=n_window,
                noverlap=n_overlap, detrend=False, return_onesided=True,
                scaling='density', mode='complex')
            [f, t, mixed_spec] = signal.spectral.spectrogram(
                x=mixed_audio, window=ham_win, nperseg=n_window,
                noverlap=n_overlap, detrend=False, return_onesided=True,
                scaling='density', mode='complex')

            bg_spec = bg_spec.T
            event_spec = event_spec.T
            mixed_spec = mixed_spec.T

            # Log Mel spectrogram
            mixed_x = pp_data.calc_feat(mixed_audio)
            x3d = pp_data.do_scaler_on_x3d(mixed_x[np.newaxis, ...], scaler)

            # Segmentation masks. The class axis comes first, since
            # spec_masks is indexed per class below.
            [mel_masks] = md.run_function(f_forward, x3d, batch_size=10,
                                          tr_phase=0.)
            mel_masks = mel_masks[0]  # (n_out, n_time, 64)
            spec_masks = np.dot(mel_masks, inverse_melW)  # (n_out, n_time, 513)

            if args.plot_only:
                mixed_mel_spec = np.dot(np.abs(mixed_spec), melW.T)
                bg_mel_spec = np.dot(np.abs(bg_spec), melW.T)
                event_mel_spec = np.dot(np.abs(event_spec), melW.T)

                ratio = 1.7  # 5 dB
                event_mask = (np.sign(event_mel_spec /
                                      (bg_mel_spec * ratio) - 1) + 1) / 2

                fig, axs = plt.subplots(3, 2, sharex=True)
                axs[0, 0].matshow(np.log(mixed_mel_spec.T), origin='lower',
                                  aspect='auto')
                axs[0, 1].matshow(event_mask.T, origin='lower', aspect='auto')
                axs[1, 0].matshow(spec_masks[0].T, origin='lower',
                                  aspect='auto', vmin=0., vmax=1.)
                axs[1, 1].matshow(spec_masks[1].T, origin='lower',
                                  aspect='auto', vmin=0., vmax=1.)
                axs[2, 0].matshow(spec_masks[2].T, origin='lower',
                                  aspect='auto', vmin=0., vmax=1.)
                axs[2, 1].matshow(spec_masks[3].T, origin='lower',
                                  aspect='auto', vmin=0., vmax=1.)
                axs[0, 0].set_title('log Mel of mixture')
                axs[0, 1].set_title('IBM of event')
                axs[1, 0].set_title('babycry')
                axs[1, 1].set_title('glassbreak')
                axs[2, 0].set_title('gunshot')
                axs[2, 1].set_title('bg')
                plt.show()
            else:
                # Separated spec
                separated_specs = spec_masks * np.abs(mixed_spec)[None, :, :]

                # Write out all events and bg
                enlarged_events = cfg.events + ['bg']
                for i1 in xrange(4):
                    s = spectrogram_to_wave.recover_wav(
                        separated_specs[i1], mixed_spec, n_overlap=n_overlap,
                        winfunc=np.hamming, wav_len=len(mixed_audio))
                    s *= recover_scaler
                    pp_data.write_audio(
                        os.path.join(out_dir, "%s.sep_%s.wav" %
                                     (bare_name, enlarged_events[i1])), s, fs)

                # Write out event
                s = spectrogram_to_wave.recover_wav(
                    separated_specs[cfg.lb_to_ix[event_type]], mixed_spec,
                    n_overlap=n_overlap, winfunc=np.hamming,
                    wav_len=len(mixed_audio))
                s *= recover_scaler
                pp_data.write_audio(
                    os.path.join(out_dir, "%s.sep_event.wav" % bare_name),
                    s, fs)

                # Write out original mixture
                pp_data.write_audio(
                    os.path.join(out_dir, "%s.sep_mix.wav" % bare_name),
                    mixed_audio, fs)
def evaluate_separation(args):
    workspace = cfg.workspace
    events = cfg.events
    te_fold = cfg.te_fold
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_duration = cfg.clip_duration
    n_events = args.n_events
    snr = args.snr

    # Load ground truth data.
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (tr_x, tr_at_y, tr_sed_y, tr_na_list,
     te_x, te_at_y, te_sed_y, te_na_list) = pp_data.load_data(
        feature_dir=feature_dir, yaml_dir=yaml_dir, te_fold=te_fold,
        snr=snr, is_scale=is_scale)

    at_y = te_at_y
    sed_y = te_sed_y
    na_list = te_na_list

    audio_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    sep_dir = os.path.join(workspace, "sep_audio",
                           pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr)

    sep_stats = {}
    for e in events:
        sep_stats[e] = {'sdr': [], 'sir': [], 'sar': []}

    cnt = 0
    for (i1, na) in enumerate(na_list):
        bare_na = os.path.splitext(na)[0]
        gt_audio_path = os.path.join(audio_dir, "%s.wav" % bare_na)
        (stereo_audio, _) = pp_data.read_stereo_audio(gt_audio_path,
                                                      target_fs=fs)
        gt_event_audio = stereo_audio[:, 0]
        gt_noise_audio = stereo_audio[:, 1]

        print(na)
        for j1 in xrange(len(events)):
            if at_y[i1][j1] == 1:
                sep_event_audio_path = os.path.join(
                    sep_dir, "%s.%s.wav" % (bare_na, events[j1]))
                (sep_event_audio, _) = pp_data.read_audio(
                    sep_event_audio_path, target_fs=fs)
                sep_noise_audio_path = os.path.join(
                    sep_dir, "%s.noise.wav" % bare_na)
                (sep_noise_audio, _) = pp_data.read_audio(
                    sep_noise_audio_path, target_fs=fs)

                ref_array = np.array((gt_event_audio, gt_noise_audio))
                est_array = np.array((sep_event_audio, sep_noise_audio))
                (sdr, sir, sar) = sdr_sir_sar(ref_array, est_array,
                                              sed_y[i1, :, j1],
                                              inside_only=True)
                print(sdr, sir, sar)

                sep_stats[events[j1]]['sdr'].append(sdr)
                sep_stats[events[j1]]['sir'].append(sir)
                sep_stats[events[j1]]['sar'].append(sar)
        cnt += 1
        # if cnt == 5: break

    print(sep_stats)

    sep_stat_path = os.path.join(workspace, "sep_stats",
                                 pp_data.get_filename(__file__),
                                 "n_events=%d" % n_events,
                                 "fold=%d" % te_fold, "snr=%d" % snr,
                                 "sep_stat.p")
    pp_data.create_folder(os.path.dirname(sep_stat_path))
    cPickle.dump(sep_stats, open(sep_stat_path, 'wb'))
def train(args): """Train the neural network. Write out model every several iterations. Args: workspace: str, path of workspace. tr_snr: float, training SNR. te_snr: float, testing SNR. lr: float, learning rate. """ print(args) workspace = args.workspace tr_snr = args.tr_snr te_snr = args.te_snr lr = args.lr iteration = args.iter # Load data. t1 = time.time() tr_hdf5_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "%ddb" % int(tr_snr), "data.h5") te_hdf5_path = os.path.join(workspace, "packed_features", "spectrogram", "test", "%ddb" % int(te_snr), "data.h5") tr_adapt_utt_path = os.path.join(workspace, "adaptive_utterance", "train", "adaptive_utterance_spec.p") te_adapt_utt_path = os.path.join(workspace, "adaptive_utterance", "test", "adaptive_utterance_spec.p") tr_adapt_utt = cPickle.load(open(tr_adapt_utt_path, 'rb')) te_adapt_utt = cPickle.load(open(te_adapt_utt_path, 'rb')) tr_adapt_utt_len_path = os.path.join(workspace, "adaptive_utterance", "train", "adaptive_utterance_max_len.p") te_adapt_utt_len_path = os.path.join(workspace, "adaptive_utterance", "test", "adaptive_utterance_max_len.p") tr_adapt_utt_len = cPickle.load(open(tr_adapt_utt_len_path, 'rb')) te_adapt_utt_len = cPickle.load(open(te_adapt_utt_len_path, 'rb')) max_len = max(tr_adapt_utt_len, te_adapt_utt_len) (tr_x1, tr_x2, tr_y1, tr_y2, tr_name) = pp_data.load_hdf5(tr_hdf5_path) (te_x1, te_x2, te_y1, te_y2, te_name) = pp_data.load_hdf5(te_hdf5_path) print(tr_x1.shape, tr_y1.shape, tr_x2.shape, tr_y2.shape) print(te_x1.shape, te_y1.shape, te_x2.shape, te_y2.shape) print("Load data time: %s s" % (time.time() - t1,)) batch_size = 500 print("%d iterations / epoch" % int(tr_x1.shape[0] / batch_size)) # Scale data. if not True: t1 = time.time() scaler_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "%ddb" % int(tr_snr), "scaler.p") scaler = pickle.load(open(scaler_path, 'rb')) tr_x1 = pp_data.scale_on_3d(tr_x1, scaler) tr_y1 = pp_data.scale_on_2d(tr_y1, scaler) te_x1 = pp_data.scale_on_3d(te_x1, scaler) te_y1 = pp_data.scale_on_2d(te_y1, scaler) tr_x2 = pp_data.scale_on_2d(tr_x2, scaler) tr_y2 = pp_data.scale_on_2d(tr_y2, scaler) te_x2 = pp_data.scale_on_2d(te_x2, scaler) te_y2 = pp_data.scale_on_2d(te_y2, scaler) print("Scale data time: %s s" % (time.time() - t1,)) # Debug plot. 
    if False:
        plt.matshow(tr_x[0:1000, 0, :].T, origin='lower', aspect='auto',
                    cmap='jet')
        plt.show()
        pause  # intentional NameError to halt here when debugging

    # Build model
    (_, n_concat, n_freq) = tr_x1.shape
    n_hid = 2048
    input_dim1 = (257 + 40 + 30) * 2
    input_dim2 = (257 + 40 + 30)
    out_dim1 = (257 + 40 + 30) * 2
    out_dim1_irm = 257 + 40 + 64
    out_dim2 = (257 + 40 + 30)
    out_dim2_irm = (257 + 40 + 64)
    num_factorize = 30

    def multiplication(pair_tensors):
        """
        :param pair_tensors:
          x: (num_factorize,)
          y: (num_factorize, n_hid)
        :return: (n_hid,) sum(x[i] * y[i, :], axis=1)
        """
        x, y = pair_tensors
        return K.sum(tf.multiply(y, K.expand_dims(x, -1)), axis=1)

    # Adaptation branch: summarise the adaptation utterance into alpha.
    adapt_input = Input(shape=(None,), name='adapt_input')
    layer = Reshape((-1, 257), name='reshape')(adapt_input)
    layer = Dense(512, activation='relu', name='adapt_dense1')(layer)
    layer = Dense(512, activation='relu', name='adapt_dense2')(layer)
    layer = Dense(num_factorize, activation='softmax', name='adapt_out')(layer)
    alpha = Lambda(lambda x: K.sum(x, axis=1),
                   output_shape=(num_factorize,),
                   name='sequence_sum')(layer)

    # Main branch, stage 1.
    input1 = Input(shape=(n_concat, input_dim1), name='input1')
    layer = Flatten(name='flatten')(input1)
    layer = Dense(n_hid * num_factorize, name='dense0')(layer)
    layer = Reshape((num_factorize, n_hid), name='reshape2')(layer)
    layer = Lambda(multiplication, name='multiply')([alpha, layer])
    layer = Dense(n_hid, activation='relu', name='dense1')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(n_hid, activation='relu', name='dense2')(layer)
    layer = Dropout(0.2)(layer)
    partial_out1 = Dense(out_dim1, name='1_out_linear')(layer)
    partial_out1_irm = Dense(out_dim1_irm, name='1_out_irm',
                             activation='sigmoid')(layer)
    out1 = concatenate([partial_out1, partial_out1_irm], name='out1')

    # Main branch, stage 2.
    input2 = Input(shape=(input_dim2,), name='input2')
    layer = concatenate([input2, out1], name='merge')
    layer = Dense(n_hid, activation='relu', name='dense3')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(n_hid, activation='relu', name='dense4')(layer)
    layer = Dropout(0.2)(layer)
    partial_out2 = Dense(out_dim2, name='2_out_linear')(layer)
    partial_out2_irm = Dense(out_dim2_irm, name='2_out_irm',
                             activation='sigmoid')(layer)
    out2 = concatenate([partial_out2, partial_out2_irm], name='out2')

    model = Model(inputs=[input1, input2, adapt_input], outputs=[out1, out2])
    model.summary()
    sys.stdout.flush()
    model.compile(loss='mean_absolute_error',
                  optimizer=Adam(lr=lr, epsilon=1e-03))

    # Data generator.
    tr_gen = DataGenerator(batch_size=batch_size, type='train',
                           max_len=max_len)
    eval_te_gen = DataGenerator(batch_size=batch_size, type='test',
                                te_max_iter=100, max_len=max_len)
    eval_tr_gen = DataGenerator(batch_size=batch_size, type='test',
                                te_max_iter=100, max_len=max_len)

    # Directories for saving models and training stats
    model_dir = os.path.join(workspace, "models", "%ddb" % int(tr_snr))
    pp_data.create_folder(model_dir)
    stats_dir = os.path.join(workspace, "training_stats", "%ddb" % int(tr_snr))
    pp_data.create_folder(stats_dir)

    # Print loss before training.
    iter = 0
    tr_loss = eval(model, eval_tr_gen, tr_x1, tr_x2, tr_y1, tr_y2, tr_name,
                   tr_adapt_utt)
    te_loss = eval(model, eval_te_gen, te_x1, te_x2, te_y1, te_y2, te_name,
                   te_adapt_utt)
    print("Iteration: %d, tr_loss: %f, te_loss: %f" % (iter, tr_loss, te_loss))

    # Save out training stats.
    stat_dict = {'iter': iter, 'tr_loss': tr_loss, 'te_loss': te_loss}
    stat_path = os.path.join(stats_dir, "%diters.p" % iter)
    cPickle.dump(stat_dict, open(stat_path, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    # Train.
    t1 = time.time()
    for (batch_x, batch_y) in tr_gen.generate([tr_x1, tr_x2, tr_name],
                                              [tr_y1, tr_y2], tr_adapt_utt):
        loss = model.train_on_batch(batch_x, batch_y)
        iter += 1

        # Validate and save training stats.
        if iter % 100 == 0:
            tr_loss = eval(model, eval_tr_gen, tr_x1, tr_x2, tr_y1, tr_y2,
                           tr_name, tr_adapt_utt)
            te_loss = eval(model, eval_te_gen, te_x1, te_x2, te_y1, te_y2,
                           te_name, te_adapt_utt)
            print("Iteration: %d, tr_loss: %f, te_loss: %f" %
                  (iter, tr_loss, te_loss))
            sys.stdout.flush()

            # Save out training stats.
            stat_dict = {'iter': iter, 'tr_loss': tr_loss, 'te_loss': te_loss}
            stat_path = os.path.join(stats_dir, "%diters.p" % iter)
            cPickle.dump(stat_dict, open(stat_path, 'wb'),
                         protocol=cPickle.HIGHEST_PROTOCOL)

        # Save model.
        if iter % (iteration / 20) == 0:
            model_path = os.path.join(model_dir, "md_%diters.h5" % iter)
            model.save(model_path)
            print("Saved model to %s" % model_path)

        if iter == iteration + 1:
            break

    print("Training time: %s s" % (time.time() - t1,))
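# A minimal numpy sketch of the 'multiplication' Lambda in train() above, for
# a single example: alpha weights the num_factorize rows of the factorized
# weight tensor and sums them into one hidden vector. Shapes follow the code
# (num_factorize=30, n_hid=2048); the batch version just adds a leading axis.
def factorized_combination(alpha, W):
    # alpha: (num_factorize,), W: (num_factorize, n_hid) -> (n_hid,)
    return np.sum(alpha[:, np.newaxis] * W, axis=0)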
def train(args):
    if os.path.exists(args.out_model_dir):
        shutil.rmtree(args.out_model_dir)
    create_folder(args.out_model_dir)

    num_classes = cfg.num_classes

    # Load training & testing data
    (tr_x, tr_y, tr_na_list) = load_hdf5(args.tr_hdf5_path, verbose=1)
    (te_x, te_y, te_na_list) = load_hdf5(args.te_hdf5_path, verbose=1)
    print("")

    # Scale data
    tr_x = do_scale(tr_x, args.scaler_path, verbose=1)
    te_x = do_scale(te_x, args.scaler_path, verbose=1)

    # Build model
    (_, n_time, n_freq) = tr_x.shape

    input = Input(shape=(n_time, n_freq), name='input_layer')
    input_ = Reshape((n_time, n_freq, 1))(input)

    block1 = block_b(input_, 8)
    block1 = block_b(block1, 32)
    block1 = block_b(block1, 64)
    block1 = MaxPooling2D(pool_size=(1, 2))(block1)

    block2 = block_c(block1, 64)
    block2 = MaxPooling2D(pool_size=(1, 2))(block2)

    block3 = block_c(block2, 64)
    block3 = MaxPooling2D(pool_size=(1, 2))(block3)

    block4 = block_c(block3, 64)
    block4 = MaxPooling2D(pool_size=(1, 2))(block4)

    cnnout = Conv_BN(block4, 128, (1, 1), act="relu", bias=True)
    cnnout = MaxPooling2D(pool_size=(1, 2))(cnnout)
    cnnout = Reshape((240, 256))(cnnout)

    rnn = Bidirectional(
        GRU(128, activation='relu', return_sequences=True,
            kernel_regularizer=regularizers.l2(0.01),
            recurrent_regularizer=regularizers.l2(0.01)))(cnnout)

    out = TimeDistributed(
        Dense(num_classes, activation='softmax',
              kernel_regularizer=regularizers.l2(0.0)),
        name='output_layer')(rnn)

    model = Model(input, out)
    model.summary()

    # Compile model
    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, decay=0.009)
    sgd = optimizers.SGD(lr=0.01, momentum=0.9, decay=0.0)
    model.compile(loss=focal_loss(alpha=[1, 1, 1, 1], gamma=1),
                  optimizer="adam",
                  metrics=[myacc(threshold=0.5)])

    # Save model callback
    filepath = os.path.join(
        args.out_model_dir,
        "aed-batchsize_50-lr_0.01-{epoch:04d}-{val_Acc:.4f}.hdf5")
    save_model = ModelCheckpoint(filepath=filepath, monitor='val_Acc',
                                 verbose=0, save_best_only=False,
                                 save_weights_only=False, mode='auto',
                                 period=1)

    # Data generator
    gen = Generator(batch_size=50, type='train')

    # Train
    history = model.fit_generator(
        generator=gen.generate([tr_x], [tr_y]),
        steps_per_epoch=300,  # 300 iters is called an 'epoch'
        epochs=100,  # Maximum 'epoch' to train
        verbose=1,
        class_weight="auto",
        callbacks=[save_model],
        validation_data=(te_x, te_y))

    with open('src/log.py', 'w') as f:
        f.write("history=")
        f.write(str(history.history))
def inference_wiener(args):
    workspace = args.workspace
    iter = args.iteration
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)

    # Audio
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)

    # Load models, one per target type.
    target_type = ['speech', 'noise']
    model_dict = {}
    for e in target_type:
        n_freq = 257
        model = DNN(stack_num, n_freq)
        model_path = os.path.join(workspace, "models", filename, e,
                                  "md_%d_iters.tar" % iter)
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])

        # Move model to GPU.
        if cuda:
            model.cuda()
        model.eval()
        model_dict[e] = model

    # Load scalar
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    if mini_num > 0:
        n_every = len(names) / mini_num
    else:
        n_every = 1

    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio, _) = pp_data.read_audio(audio_path, sample_rate)
            audio = pp_data.normalize(audio)
            cmplx_sp = pp_data.calc_sp(audio, fft_size, hop_size, window)
            x = np.abs(cmplx_sp)

            # Process data.
            n_pad = (stack_num - 1) / 2
            x = pp_data.pad_with_border(x, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            # Predict speech and noise magnitudes.
            pred_dict = {}
            for e in target_type:
                pred = forward(model_dict[e], x, mean_, std_, cuda)
                pred = pred.data.cpu().numpy()
                pred_dict[e] = pred
            print(cnt, name)

            # Wiener filter.
            pred_mag_sp = pred_dict['speech'] / \
                (pred_dict['speech'] + pred_dict['noise']) * np.abs(cmplx_sp)

            pred_cmplx_sp = stft.real_to_complex(pred_mag_sp, cmplx_sp)
            frames = stft.istft(pred_cmplx_sp)

            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(frames, hop_size, cola_constant)
            seq = seq[0:len(audio)]

            # Write out wav
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)

            if visualize:
                vmin = -5.
                vmax = 5.
                fig, axs = plt.subplots(3, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(cmplx_sp)).T, origin='lower',
                               aspect='auto', cmap='jet')
                axs[1].matshow(np.log(np.abs(pred_dict['speech'])).T,
                               origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(np.log(np.abs(pred_dict['noise'])).T,
                               origin='lower', aspect='auto', cmap='jet')
                plt.show()
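# A minimal sketch of the magnitude-domain Wiener-like mask used above,
# assuming s_hat and n_hat are the DNN's speech and noise magnitude
# estimates. The classical Wiener filter uses power (squared) spectra;
# the code above applies the ratio directly on magnitudes.
def wiener_like_mask(s_hat, n_hat, eps=1e-8):
    return s_hat / (s_hat + n_hat + eps)  # soft mask in [0, 1]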
def inference(args): """Inference all test data, write out recovered wavs to disk. Args: workspace: str, path of workspace. tr_snr: float, training SNR. te_snr: float, testing SNR. n_concat: int, number of frames to concatenta, should equal to n_concat in the training stage. iter: int, iteration of model to load. visualize: bool, plot enhanced spectrogram for debug. """ print(args) workspace = args.workspace tr_snr = args.tr_snr te_snr = args.te_snr n_concat = args.n_concat iter = args.iteration n_noise_frame = args.noise_frame n_hop = args.n_hop n_window = cfg.n_window n_overlap = cfg.n_overlap fs = cfg.sample_rate scale = False # Load model. model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr), "md_%diters.h5" % iter) model = load_model(model_path) # Load scaler. # scaler_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "%ddb" % int(tr_snr), "scaler.p") # scaler = pickle.load(open(scaler_path, 'rb')) # Load test data. feat_dir = os.path.join(workspace, "features", "spectrogram", "test", "%ddb" % int(te_snr)) names = os.listdir(feat_dir) mel_basis = librosa.filters.mel(cfg.sample_rate, cfg.n_window, n_mels=40) for (cnt, na) in enumerate(names): # Load feature. feat_path = os.path.join(feat_dir, na) data = cPickle.load(open(feat_path, 'rb')) [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data input1_3d, input2, out1, out2 = pp_data.get_input_output_layer( mixed_cmplx_x, speech_x, noise_x, alpha, n_concat, n_noise_frame, n_hop, mel_basis) # Predict. pred = model.predict([input1_3d, input2]) print(cnt, na) sys.stdout.flush() # Inverse scale. if scale: mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler) speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler) pred = pp_data.inverse_scale_on_2d(pred, scaler) # post processing pred_speech_lps = 1 / 3.0 * (pred[0][:, :257] + pred[1][:, :257] + np.log(np.abs(mixed_cmplx_x) + 1e-08) + np.log(pred[1][:, 327:584])) # Debug plot. if args.visualize: out_path = os.path.join(workspace, "figures", "test", "%ddb" % int(te_snr), "%s.all.png" % na) pp_data.create_folder(os.path.dirname(out_path)) fig, axs = plt.subplots(3, 1, sharex=False) axs[0].matshow(np.log(np.abs(mixed_cmplx_x.T) + 1e-08), origin='lower', aspect='auto', cmap='jet') axs[1].matshow(np.log(speech_x.T + 1e-08), origin='lower', aspect='auto', cmap='jet') axs[2].matshow(pred_speech_lps.T, origin='lower', aspect='auto', cmap='jet') axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr)) axs[1].set_title("Clean speech log spectrogram") axs[2].set_title("Enhanced speech log spectrogram") for j1 in xrange(3): axs[j1].xaxis.tick_bottom() plt.tight_layout() plt.savefig(out_path) plt.close('all') # plt.show() out_path = os.path.join(workspace, "figures", "test", "%ddb" % int(te_snr), "%s.mixture.png" % na) display.specshow(np.log(np.abs(mixed_cmplx_x.T) + 1e-08)) plt.title("%ddb mixture log spectrogram" % int(te_snr)) plt.savefig(out_path) out_path = os.path.join(workspace, "figures", "test", "%ddb" % int(te_snr), "%s.clean.png" % na) display.specshow(np.log(speech_x.T + 1e-08)) plt.title("Clean speech log spectrogram") plt.savefig(out_path) out_path = os.path.join(workspace, "figures", "test", "%ddb" % int(te_snr), "%s.enh.png" % na) display.specshow(pred_speech_lps.T) plt.title("Enhanced speech log spectrogram") plt.savefig(out_path) plt.close('all') # Recover enhanced wav. 
        pred_sp = np.exp(pred_speech_lps)
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        # Scaler to compensate the amplitude change after spectrogram
        # and IFFT.
        s *= np.sqrt((np.hamming(n_window) ** 2).sum())

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test",
                                "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
def inference(args):
    cuda = args.use_cuda and torch.cuda.is_available()
    workspace = args.workspace
    model_name = args.model_name
    feat_type = args.feat_type
    script_na = args.script_na

    # Load data.
    te_packed_feat_path = os.path.join(workspace, "packed_features",
                                       feat_type, "test.p")
    [te_x_list, te_y_list, te_na_list] = cPickle.load(
        open(te_packed_feat_path, 'rb'))

    # Scale.
    if True:
        scale_path = os.path.join(workspace, "scalers", feat_type, "scaler.p")
        scaler = pickle.load(open(scale_path, 'rb'))
        te_x_list = pp_data.scale_on_x_list(te_x_list, scaler)

    # Construct model topology.
    n_concat = 3
    te_n_hop = 1
    n_freq = te_x_list[0].shape[-1]
    n_out = te_y_list[0].shape[-1]
    model = Net(n_concat, n_freq, n_out)

    # Init the weights of the model using trained weights.
    model_path = os.path.join(workspace, "models", script_na, feat_type,
                              model_name)
    if os.path.isfile(model_path):
        print("Loading checkpoint '%s'" % model_path)
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])
    else:
        raise Exception("Model path %s does not exist!" % model_path)

    # Move model to GPU.
    if cuda:
        model.cuda()

    # Directory to write out transcribed midi files.
    out_midi_dir = os.path.join(workspace, "out_midis",
                                pp_data.get_filename(__file__), feat_type)
    pp_data.create_folder(out_midi_dir)

    # Data to 3d.
    n_half = (n_concat - 1) / 2
    for i1 in xrange(len(te_x_list)):
        x = te_x_list[i1]  # (n_time, n_freq)
        y = te_y_list[i1]  # (n_time, n_out)
        bare_na = os.path.splitext(te_na_list[i1])[0]
        (n_time, n_freq) = x.shape

        zero_pad = np.zeros((n_half, n_freq))
        x = np.concatenate((zero_pad, x, zero_pad), axis=0)
        x3d = pp_data.mat_2d_to_3d(x, n_concat, te_n_hop)  # (n_time, n_concat, n_freq)

        # Move data to GPU.
        x3d = torch.Tensor(x3d)
        x3d = Variable(x3d)
        if cuda:
            x3d = x3d.cuda()

        # Inference.
        model.eval()
        pred = model(x3d)  # (n_time, n_out)

        # Convert data type to numpy.
        pred = pred.data.cpu().numpy()

        # Threshold and write out predicted piano roll to midi file.
        mid_roll = pp_data.prob_to_midi_roll(pred, 0.5)
        out_path = os.path.join(out_midi_dir, "%s.mid" % bare_na)
        print("Write out to: %s" % out_path)
        pp_data.write_midi_roll_to_midi(mid_roll, out_path)

        # Debug plot.
        if True:
            fig, axs = plt.subplots(3, 1, sharex=True)
            axs[0].matshow(y.T, origin='lower', aspect='auto')
            axs[1].matshow(pred.T, origin='lower', aspect='auto')
            binary_pred = (np.sign(pred - 0.5) + 1) / 2
            axs[2].matshow(binary_pred.T, origin='lower', aspect='auto')
            axs[0].set_title("Ground truth")
            axs[1].set_title("DNN output probability")
            axs[2].set_title("DNN output probability after thresholding")
            for j1 in xrange(3):
                axs[j1].set_ylabel('note index')
                axs[j1].set_xlabel('frames')
                axs[j1].xaxis.set_label_coords(1.06, -0.01)
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()
def inference(args):
    workspace = args.workspace
    iter = args.iteration
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    audio_type = 'speech'
    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)

    # Audio
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)
    speech_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/timit_wavs/subtest"

    # Load model
    model_path = os.path.join(workspace, "models", filename, audio_type,
                              "md_%d_iters.tar" % iter)
    n_freq = 257
    model = DNN(stack_num, n_freq)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])

    if cuda:
        model.cuda()

    # Load scalar
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    if mini_num > 0:
        n_every = len(names) / mini_num
    else:
        n_every = 1

    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)

    dft = pp_data.DFT(fft_size, cuda)

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio0, _) = pp_data.read_audio(audio_path, sample_rate)
            audio = pp_data.normalize(audio0)

            # Enframe
            frames = stft.enframe(audio, fft_size, hop_size)

            # Process data.
            n_pad = (stack_num - 1) / 2
            x = pp_data.pad_with_border(frames, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            pred_frames = forward(model, x, mean_, std_, cuda)
            pred_frames = pred_frames.data.cpu().numpy()

            # Overlap-add the windowed predicted frames.
            pred_frames *= window
            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(pred_frames, hop_size, cola_constant)
            seq = seq[0:len(audio)]

            # Write out wav
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)

            if visualize:
                clean_audio_path = os.path.join(speech_dir,
                                                name.split('.')[0] + ".WAV")
                (clean_audio, _) = pp_data.read_audio(clean_audio_path,
                                                      sample_rate)
                clean_audio = pp_data.normalize(clean_audio)
                clean_frames = stft.enframe(clean_audio, fft_size, hop_size)

                mix_sp = np.abs(np.fft.rfft(frames * window, norm='ortho'))
                enh_sp = np.abs(np.fft.rfft(pred_frames * window,
                                            norm='ortho'))
                clean_sp = np.abs(np.fft.rfft(clean_frames * window,
                                              norm='ortho'))

                K = 10
                fig, axs = plt.subplots(K / 2, 2, sharex=True)
                for k in range(K):
                    axs[k / 2, k % 2].plot(frames[k + 100], color='y')
                    axs[k / 2, k % 2].plot(clean_frames[k + 100], color='r')
                    axs[k / 2, k % 2].plot(pred_frames[k + 100], color='b')
                plt.show()

                vmin = -5.
                vmax = 5.
                fig, axs = plt.subplots(3, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(mix_sp)).T, origin='lower',
                               aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                axs[1].matshow(np.log(np.abs(clean_sp)).T, origin='lower',
                               aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                axs[2].matshow(np.log(np.abs(enh_sp)).T, origin='lower',
                               aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                plt.show()
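# A minimal numpy sketch of the overlap-add reconstruction that
# stft.overlap_add presumably performs above: windowed frames are summed at
# hop_size spacing and normalized by the COLA constant (for a COLA-compliant
# window/hop pair, the shifted windows sum to that constant).
def overlap_add_sketch(frames, hop, cola_constant):
    # frames: (n_frames, frame_len) -> 1-D signal
    n_frames, frame_len = frames.shape
    out = np.zeros((n_frames - 1) * hop + frame_len)
    for i in range(n_frames):
        out[i * hop: i * hop + frame_len] += frames[i]
    return out / cola_constant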
def create_mixture_csv(args):
    """Create csv containing mixture information.

    Each line in the .csv file contains [speech_name, noise_name,
    interfere_name, noise_onset, noise_offset, interfere_onset,
    interfere_offset].

    Args:
      workspace: str, path of workspace.
      speech_dir: str, path of speech data.
      noise_dir: str, path of noise data.
      data_type: str, 'train' | 'test'.
      magnification: int, only used when data_type='train'. Number of noises
        selected to mix with each speech, e.g., magnification=3 means 4620
        speech files will create 4620*3 mixtures. magnification should not
        be larger than the number of noise types.
    """
    workspace = args.workspace
    speech_dir = args.speech_dir
    noise_dir = args.noise_dir
    interfere_dir = args.interfere_dir
    data_type = args.data_type
    magnification = args.magnification
    fs = cfg.sample_rate

    speech_names = [na for na in os.listdir(speech_dir)
                    if na.lower().endswith(".wav")]
    noise_names = [na for na in os.listdir(noise_dir)
                   if na.lower().endswith(".wav")]
    interfere_names = [na for na in os.listdir(interfere_dir)
                       if na.lower().endswith(".wav")]

    rs = np.random.RandomState(0)
    out_csv_path = os.path.join(workspace, "mixture_csvs",
                                "%s.csv" % data_type)
    pp_data.create_folder(os.path.dirname(out_csv_path))

    cnt = 0
    f = open(out_csv_path, 'w')
    f.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
            ("speech_name", "noise_name", "interfere_name", "noise_onset",
             "noise_offset", "interfere_onset", "interfere_offset"))
    for speech_na in speech_names:
        # Read speech.
        speech_path = os.path.join(speech_dir, speech_na)
        (speech_audio, _) = read_audio(speech_path, fs)
        len_speech = len(speech_audio)

        # For training data, mix each speech with #magnification randomly
        # picked noises.
        if data_type == 'train':
            selected_noise_names = rs.choice(noise_names, size=magnification,
                                             replace=False)
        # For test data, mix each speech with all noises.
        elif data_type == 'test':
            selected_noise_names = noise_names
        else:
            raise Exception("data_type must be train | test!")

        selected_interfere_names = rs.choice(interfere_names, size=1,
                                             replace=False)

        # Mix one speech with different noises many times.
        for idx, noise_na in enumerate(selected_noise_names):
            noise_path = os.path.join(noise_dir, noise_na)
            (noise_audio, _) = read_audio(noise_path, fs)
            interfere_path = os.path.join(interfere_dir,
                                          selected_interfere_names[0])
            interfere_audio, _ = read_audio(interfere_path, fs)
            len_interfere = len(interfere_audio)

            if len_interfere <= len_speech:
                interfere_onset = 0
                interfere_offset = len_speech
            # If the interference is longer than the speech, randomly select
            # a segment of it.
            else:
                interfere_onset = rs.randint(0, len_interfere - len_speech,
                                             size=1)[0]
                interfere_offset = interfere_onset + len_speech

            len_noise = len(noise_audio)
            if len_noise <= len_speech:
                noise_onset = 0
                noise_offset = len_speech
            # If the noise is longer than the speech, randomly select a
            # segment of it.
            else:
                noise_onset = rs.randint(0, len_noise - len_speech, size=1)[0]
                noise_offset = noise_onset + len_speech

            if cnt % 100 == 0:
                print(cnt)
            cnt += 1
            f.write("%s\t%s\t%s\t%d\t%d\t%d\t%d\n" %
                    (speech_na, noise_na, selected_interfere_names[0],
                     noise_onset, noise_offset, interfere_onset,
                     interfere_offset))
    f.close()
    print(out_csv_path)
    print("Create %s mixture csv finished!" % data_type)
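# Hedged illustration of the rows create_mixture_csv() emits. File names and
# sample offsets below are hypothetical; onsets/offsets are in samples:
#
#   speech_name  noise_name  interfere_name  noise_onset  noise_offset  interfere_onset  interfere_offset
#   sp001.wav    n001.wav    intf003.wav     0            48000         12345            60345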
def create_mix_yaml(cv_path, n_events, out_path):
    """Create yaml file containing the mixture information."""
    workspace = cfg.workspace
    events = cfg.events
    n_folds = cfg.n_folds
    onset_list = cfg.onset_list

    rs = np.random.RandomState(0)

    # Read cross validation csv. (Note: the cv_path argument is overridden
    # here; the csv is always read from the workspace.)
    cv_path = os.path.join(workspace, "cross_validation.csv")
    with open(cv_path, 'rb') as f:
        reader = csv.reader(f, delimiter='\t')
        lis = list(reader)

    yaml_data = []
    cnt = 0
    for tar_fold in xrange(n_folds):
        for loop in xrange(n_events):
            # Initialize dict
            dict = {}
            for e in events:
                dict[e] = []

            # Read all rows in cross validation csv
            for i1 in xrange(1, len(lis)):
                [name, fold] = lis[i1]
                fold = int(fold)
                if fold == tar_fold:
                    for e in events:
                        if e in name:
                            dict[e].append(name)

            while _get_n_elements_in_dict(dict) >= n_events:
                # Randomly select event files.
                selected_names = []
                events_pool = _get_n_largest_events(dict, n_events, rs)
                selected_events = rs.choice(events_pool, size=n_events,
                                            replace=False)
                for e in selected_events:
                    sel_na = rs.choice(dict[e], replace=False)
                    sel_na = str(sel_na)
                    selected_names.append(sel_na)
                    dict[e].remove(sel_na)
                    if len(dict[e]) == 0:
                        dict.pop(e)

                # Combine yaml info.
                mixture_data = {'name': "%05d.wav" % cnt,
                                'fold': tar_fold,
                                'events': []}
                cnt += 1
                for (j1, na) in enumerate(selected_names):
                    event_data = {'file_name': na,
                                  'event': re.split('(\d+)', na)[0],
                                  'onset': onset_list[j1],
                                  'fold': 0}
                    mixture_data['events'].append(event_data)
                yaml_data.append(mixture_data)

    # Write out yaml file.
    pp_data.create_folder(os.path.dirname(out_path))
    with open(out_path, 'w') as f:
        f.write(yaml.dump(yaml_data, default_flow_style=False))

    print("len(yaml_file): %d" % len(yaml_data))
    print("Write out to %s" % out_path)
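# Hedged illustration of one mixture entry create_mix_yaml() appends to
# yaml_data. Field values below are hypothetical; 'onset' comes from
# cfg.onset_list and 'event' is the file name up to its first digit run:
#
# - name: 00000.wav
#   fold: 0
#   events:
#   - {file_name: babycry_001.wav, event: babycry_, onset: 0.5, fold: 0}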
def train(args): """Train the neural network. Write out model every several iterations. Args: workspace: str, path of workspace. tr_snr: float, training SNR. te_snr: float, testing SNR. lr: float, learning rate. """ print(args) workspace = args.workspace tr_snr = args.tr_snr te_snr = args.te_snr lr = args.lr # Load data. t1 = time.time() tr_hdf5_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "%ddb" % int(tr_snr), "data.h5") te_hdf5_path = os.path.join(workspace, "packed_features", "spectrogram", "test", "%ddb" % int(te_snr), "data.h5") (tr_x, tr_y) = pp_data.load_hdf5(tr_hdf5_path) (te_x, te_y) = pp_data.load_hdf5(te_hdf5_path) print(tr_x.shape, tr_y.shape) print(te_x.shape, te_y.shape) print("Load data time: %s s" % (time.time() - t1,)) batch_size = 128 print("%d iterations / epoch" % int(tr_x.shape[0] / batch_size)) # Build model _, n_freq = tr_x.shape # encode T = 1 data = Input(shape=[n_freq]) x = Reshape([1, T, n_freq])(data) x1 = Conv2D(10, (T, 11), strides=(10, 1), data_format='channels_first', padding='same')(x) x1 = BatchNormalization(axis=-1)(x1) x1 = Activation('relu')(x1) x2 = Conv2D(12, (T, 7), strides=(10, 1), data_format='channels_first', padding='same')(x1) x2 = BatchNormalization(axis=-1)(x2) x2 = Activation('relu')(x2) x3 = Conv2D(14, (T, 5), strides=(10, 1), data_format='channels_first', padding='same')(x2) x3 = BatchNormalization(axis=-1)(x3) x3 = Activation('relu')(x3) x4 = Conv2D(15, (T, 5), strides=(10, 1), data_format='channels_first', padding='same')(x3) x4 = BatchNormalization(axis=-1)(x4) x4 = Activation('relu')(x4) x5 = Conv2D(19, (1, 5), strides=(10, 1), data_format='channels_first', padding='same')(x4) x5 = BatchNormalization(axis=-1)(x5) x5 = Activation('relu')(x5) x6 = Conv2D(21, (1, 5), strides=(10, 1), data_format='channels_first', padding='same')(x5) x6 = BatchNormalization(axis=-1)(x6) x6 = Activation('relu')(x6) x7 = Conv2D(23, (1, 7), strides=(10, 1), data_format='channels_first', padding='same')(x6) x7 = BatchNormalization(axis=-1)(x7) x7 = Activation('relu')(x7) x8 = Conv2D(25, (1, 11), strides=(10, 1), data_format='channels_first', padding='same')(x7) x8 = BatchNormalization(axis=-1)(x8) x8 = Activation('relu')(x8) # decode y1 = Conv2D(23, (1, 7), strides=(10, 1), data_format='channels_first', padding='same')(x8) y1 = Add()([y1, x7]) y1 = BatchNormalization(axis=-1)(y1) y1 = Activation('relu')(y1) y2 = Conv2D(21, (1, 5), strides=(10, 1), data_format='channels_first', padding='same')(y1) y2 = Add()([y2, x6]) y2 = BatchNormalization(axis=-1)(y2) y2 = Activation('relu')(y2) y3 = Conv2D(19, (1, 5), strides=(10, 1), data_format='channels_first', padding='same')(y2) y3 = Add()([y3, x5]) y3 = BatchNormalization(axis=-1)(y3) y3 = Activation('relu')(y3) y4 = Conv2D(15, (1, 5), strides=(10, 1), data_format='channels_first', padding='same')(y3) y4 = Add()([y4, x4]) y4 = BatchNormalization(axis=-1)(y4) y4 = Activation('relu')(y4) y5 = Conv2D(14, (1, 5), strides=(10, 1), data_format='channels_first', padding='same')(y4) y5 = Add()([y5, x3]) y5 = BatchNormalization(axis=-1)(y5) y5 = Activation('relu')(y5) y6 = Conv2D(12, (1, 7), strides=(10, 1), data_format='channels_first', padding='same')(y5) y6 = Add()([y6, x2]) y6 = BatchNormalization(axis=-1)(y6) y6 = Activation('relu')(y6) y7 = Conv2D(10, (1, 11), strides=(10, 1), data_format='channels_first', padding='same')(y6) y7 = Add()([y7, x1]) y7 = BatchNormalization(axis=-1)(y7) y7 = Activation('relu')(y7) y8 = Conv2D(1, (1, n_freq), strides=(10, 1), data_format='channels_first', 
                padding='same')(y7)
    y8 = Activation('relu')(y8)
    out = Reshape([n_freq])(y8)

    model = Model(inputs=data, outputs=out)
    adam = optimizers.Adam(lr=lr)
    model.compile(loss='mean_absolute_error', optimizer=adam)
    model.summary()

    # Data generator.
    tr_gen = DataGenerator(batch_size=batch_size, type='train')
    te_gen = DataGenerator(batch_size=batch_size, type='test', te_max_iter=200)

    # Directories for saving models and training stats
    model_dir = os.path.join(workspace, "models", "%ddb" % int(tr_snr))
    pp_data.create_folder(model_dir)

    # Train.
    t1 = time.time()
    model.fit_generator(tr_gen.generate(xs=[tr_x], ys=[tr_y]),
                        validation_data=te_gen.generate(xs=[te_x], ys=[te_y]),
                        validation_steps=100,
                        steps_per_epoch=200,
                        epochs=200)
    print("Training complete.")

    model_name = 'FullyCNN.h5'
    model_path = os.path.join(model_dir, model_name)
    model.save(model_path)
    print("Training time: %s s" % (time.time() - t1,))
def separate(args, bgn_iter, fin_iter, interval):
    workspace = cfg.workspace
    events = cfg.events
    te_fold = cfg.te_fold
    n_events = args.n_events
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_duration = cfg.clip_duration
    snr = args.snr

    # Load ground truth data.
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (tr_x, tr_at_y, tr_sed_y, tr_na_list,
     te_x, te_at_y, te_sed_y, te_na_list) = pp_data.load_data(
        feature_dir=feature_dir, yaml_dir=yaml_dir, te_fold=te_fold,
        snr=snr, is_scale=is_scale)

    at_y = te_at_y
    sed_y = te_sed_y
    na_list = te_na_list

    # Load and average segmentation masks over several iterations.
    preds_dir = os.path.join(workspace, "preds",
                             pp_data.get_filename(__file__),
                             "n_events=%d" % n_events, "fold=%d" % te_fold,
                             "snr=%d" % snr)
    at_probs_list, seg_masks_list = [], []
    for iter in xrange(bgn_iter, fin_iter, interval):
        seg_masks_path = os.path.join(preds_dir, "md%d_iters" % iter,
                                      "seg_masks.p")
        seg_masks = cPickle.load(open(seg_masks_path, 'rb'))
        seg_masks_list.append(seg_masks)
    seg_masks = np.mean(seg_masks_list, axis=0)  # (n_clips, n_classes, n_time, n_freq)
    print(seg_masks.shape)

    audio_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    sep_dir = os.path.join(workspace, "sep_audio",
                           pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr)
    pp_data.create_folder(sep_dir)

    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win ** 2).sum())
    melW = librosa.filters.mel(sr=fs, n_fft=n_window, n_mels=64,
                               fmin=0., fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)  # (64, 513)

    seg_stats = {}
    for e in events:
        seg_stats[e] = {'fvalue': [], 'auc': [], 'iou': [], 'hit': [],
                        'fa': [], 'tp': [], 'fn': [], 'fp': []}

    cnt = 0
    for (i1, na) in enumerate(na_list):
        bare_na = os.path.splitext(na)[0]
        audio_path = os.path.join(audio_dir, "%s.wav" % bare_na)
        (stereo_audio, _) = pp_data.read_stereo_audio(audio_path, target_fs=fs)
        event_audio = stereo_audio[:, 0]
        noise_audio = stereo_audio[:, 1]
        mixed_audio = event_audio + noise_audio

        mixed_cmplx_sp = pp_data.calc_sp(mixed_audio, fs, ham_win, n_window,
                                         n_overlap)
        mixed_sp = np.abs(mixed_cmplx_sp)
        event_sp = np.abs(pp_data.calc_sp(event_audio, fs, ham_win, n_window,
                                          n_overlap))
        noise_sp = np.abs(pp_data.calc_sp(noise_audio, fs, ham_win, n_window,
                                          n_overlap))

        sm = seg_masks[i1]  # (n_classes, n_time, n_freq)
        sm_upsampled = np.dot(sm, inverse_melW)  # (n_classes, n_time, 513)

        print(na)

        # Write out separated events.
        for j1 in xrange(len(events)):
            if at_y[i1][j1] == 1:
                (fvalue, auc, iou, tp, fn, fp) = fvalue_iou(
                    sm_upsampled[j1], event_sp, noise_sp, sed_y[i1, :, j1],
                    seg_thres, inside_only=True)
                (hit, fa) = hit_fa(sm_upsampled[j1], event_sp, noise_sp,
                                   sed_y[i1, :, j1], seg_thres,
                                   inside_only=True)
                seg_stats[events[j1]]['fvalue'].append(fvalue)
                seg_stats[events[j1]]['auc'].append(auc)
                seg_stats[events[j1]]['iou'].append(iou)
                seg_stats[events[j1]]['hit'].append(hit)
                seg_stats[events[j1]]['fa'].append(fa)
                seg_stats[events[j1]]['tp'].append(tp)
                seg_stats[events[j1]]['fn'].append(fn)
                seg_stats[events[j1]]['fp'].append(fp)

                sep_event_sp = sm_upsampled[j1] * mixed_sp
                sep_event_s = spectrogram_to_wave.recover_wav(
                    sep_event_sp, mixed_cmplx_sp, n_overlap=n_overlap,
                    winfunc=np.hamming, wav_len=int(fs * clip_duration))
                sep_event_s *= recover_scaler
                out_event_audio_path = os.path.join(
                    sep_dir, "%s.%s.wav" % (bare_na, events[j1]))
                pp_data.write_audio(out_event_audio_path, sep_event_s, fs)

        # Write out separated noise.
        sm_noise_upsampled = np.clip(1. - np.sum(sm_upsampled, axis=0), 0., 1.)
        sep_noise_sp = sm_noise_upsampled * mixed_sp
        sep_noise_s = spectrogram_to_wave.recover_wav(
            sep_noise_sp, mixed_cmplx_sp, n_overlap=n_overlap,
            winfunc=np.hamming, wav_len=int(fs * clip_duration))
        sep_noise_s *= recover_scaler
        out_noise_audio_path = os.path.join(sep_dir, "%s.noise.wav" % bare_na)
        pp_data.write_audio(out_noise_audio_path, sep_noise_s, fs)

        cnt += 1
        # if cnt == 2: break

    # Average the per-clip statistics for each event class and log them.
    fvalues, aucs, ious, hits, fas, tps, fns, fps = [], [], [], [], [], [], [], []
    for e in events:
        fvalues.append(np.mean(seg_stats[e]['fvalue']))
        ious.append(np.mean(seg_stats[e]['iou']))
        aucs.append(np.mean(seg_stats[e]['auc']))
        hits.append(np.mean(seg_stats[e]['hit']))
        fas.append(np.mean(seg_stats[e]['fa']))
        tps.append(np.mean(seg_stats[e]['tp']))
        fns.append(np.mean(seg_stats[e]['fn']))
        fps.append(np.mean(seg_stats[e]['fp']))

    logging.info("%sfvalue\tauc\tiou\tHit\tFa\tHit-Fa\tTP\tFN\tFP"
                 % ("".ljust(16)))
    logging.info("%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f"
                 % ("*Avg. of each".ljust(16), np.mean(fvalues), np.mean(aucs),
                    np.mean(ious), np.mean(hits), np.mean(fas),
                    np.mean(hits) - np.mean(fas), np.mean(tps), np.mean(fns),
                    np.mean(fps)))
    for i1 in xrange(len(events)):
        logging.info("%s%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f"
                     % (events[i1].ljust(16), fvalues[i1], aucs[i1], ious[i1],
                        hits[i1], fas[i1], hits[i1] - fas[i1], tps[i1],
                        fns[i1], fps[i1]))
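# get_inverse_W() is called in separate() above but is defined elsewhere. A
# minimal sketch of what it may compute, assuming it returns a pseudo-inverse
# of the mel filterbank used to upsample (n_classes, n_time, n_mels) masks
# back to the linear STFT frequency axis; the exact original definition may
# differ:
import numpy as np

def get_inverse_W(melW):
    """Return a (n_mels, n_freq) matrix mapping mel bins back to STFT bins.

    melW: (n_mels, n_freq) mel filterbank, e.g. from librosa.filters.mel.
    """
    # Moore-Penrose pseudo-inverse, transposed so that
    # np.dot(mask_mel, inverse_melW) has shape (..., n_freq).
    return np.linalg.pinv(melW).T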
    pp.write_audio(clean_path, clean_new, conf1.fs)
    clean_spec = pp.calc_sp(clean_new, mode='magnitude')
    mixed_spec = pp.calc_sp(mixed, mode='complex')
    clean_all.append(clean_spec)
    mixed_all.append(mixed_spec)
    print(len(clean_all), ',', len(mixed_all))

    num_te = pp.pack_features(mixed_all, clean_all, 'test')
    compute_scaler('test')
    return num_tr, num_te, get_gpu()


pp.create_folder(conf1.train_folder)
pp.create_folder(conf1.test_folder)
pp.create_folder(conf1.packed_feature_dir)
pp.create_folder(conf1.data_train_dir)
pp.create_folder(conf1.data_test_dir)
pp.create_folder(conf1.logs)
pp.create_folder(conf1.model_dir)
pp.create_folder(conf1.stats_dir)

t1 = time.time()
# prepare_database() returns three values; the GPU handle is unused here.
num_tr, num_te, _ = prepare_database()
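# pp.calc_sp() is used above with mode='magnitude' or mode='complex' but is
# defined elsewhere. A minimal sketch of such a helper built on
# scipy.signal.spectrogram; the sample rate, window length, and hop are
# placeholder assumptions:
import numpy as np
from scipy import signal

def calc_sp_sketch(audio, mode, fs=16000, n_window=512, n_overlap=256):
    """Compute a spectrogram; 'complex' keeps phase, 'magnitude' drops it."""
    (f, t, x) = signal.spectrogram(audio, fs=fs, window=np.hamming(n_window),
                                   nperseg=n_window, noverlap=n_overlap,
                                   detrend=False, mode=mode)
    return x.T  # (n_time, n_freq)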
def train(args):
    workspace = cfg.workspace
    te_fold = cfg.te_fold
    n_events = args.n_events
    snr = args.snr

    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (tr_x, tr_at_y, tr_sed_y, tr_na_list,
     te_x, te_at_y, te_sed_y, te_na_list) = pp_data.load_data(
        feature_dir=feature_dir, yaml_dir=yaml_dir, te_fold=te_fold,
        snr=snr, is_scale=is_scale)
    print(tr_x.shape, tr_at_y.shape)
    print(te_x.shape, te_at_y.shape)
    (_, n_time, n_freq) = tr_x.shape
    n_out = len(cfg.events)

    # Debug plot.
    if False:
        for e in tr_x:
            plt.matshow(e.T, origin='lower', aspect='auto')
            plt.show()

    # Build model.
    lay_in = InputLayer(in_shape=(n_time, n_freq))
    a = Reshape((1, n_time, n_freq))(lay_in)

    # Six identical conv blocks; dropout after every second block.
    for i1 in xrange(6):
        a = Conv2D(n_outfmaps=64, n_row=3, n_col=5, act='linear',
                   strides=(1, 1), border_mode=(1, 2))(a)
        a = BN(axis=(0, 2, 3))(a)
        a = Activation('relu')(a)
        if i1 % 2 == 1:
            a = Dropout(p_drop=0.2)(a)

    # Per-class sigmoid segmentation masks, pooled to clip-level outputs.
    a = Conv2D(n_outfmaps=n_out, n_row=1, n_col=1, act='sigmoid',
               border_mode=(0, 0), name='seg_masks')(a)
    a8 = Lambda(_global_avg_pooling, name='a8')(a)

    md = Model([lay_in], [a8])
    md.compile()
    md.summary(is_logging=True)

    # Callbacks.
    md_dir = os.path.join(workspace, "models", pp_data.get_filename(__file__),
                          "n_events=%d" % n_events, "fold=%d" % te_fold,
                          "snr=%d" % snr)
    pp_data.create_folder(md_dir)
    save_model = SaveModel(md_dir, call_freq=50, type='iter', is_logging=True)
    validation = Validation(te_x=te_x, te_y=te_at_y, batch_size=50,
                            call_freq=50, metrics=['binary_crossentropy'],
                            dump_path=None, is_logging=True)
    callbacks = [save_model, validation]

    observe_nodes = [md.find_layer('seg_masks').output_]
    f_forward = md.get_observe_forward_func(observe_nodes)

    # Generators.
    tr_gen = DataGenerator(batch_size=32, type='train')
    eva_gen = DataGenerator2(batch_size=32, type='test')

    # Train.
    loss_ary = []
    t1 = time.time()
    optimizer = Adam(1e-3)
    for (batch_x, batch_y) in tr_gen.generate(xs=[tr_x], ys=[tr_at_y]):
        if md.iter_ % 50 == 0:
            logging.info("iter: %d  tr_loss: %f  time: %s"
                         % (md.iter_, np.mean(loss_ary), time.time() - t1))
            t1 = time.time()
            loss_ary = []
        # if md.iter_ % 200 == 0:
        #     write_out_at_sed(md, eva_gen, f_forward, te_x, te_at_y,
        #                      te_sed_y, n_events, snr, te_fold)
        if md.iter_ == 5001:
            break
        loss = md.train_on_batch(batch_x, batch_y,
                                 loss_func='binary_crossentropy',
                                 optimizer=optimizer, callbacks=callbacks)
        loss_ary.append(loss)
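# _global_avg_pooling() is referenced in the Lambda layer above but defined
# elsewhere. A sketch of the usual definition for this architecture: average
# the per-class sigmoid segmentation masks over time and frequency to obtain
# clip-level event probabilities. This assumes the toolbox's Theano-style
# tensors, where .mean() accepts an axis tuple:
def _global_avg_pooling(input):
    """(n_clips, n_classes, n_time, n_freq) -> (n_clips, n_classes)."""
    return input.mean(axis=(2, 3))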
    parser_get_sep_stats.add_argument('--n_events', type=int)
    parser_get_sep_stats.add_argument('--snr', type=int)

    parser_b2 = subparsers.add_parser('avg_recognize')
    parser_b2.add_argument('--n_events', type=int)
    parser_b2.add_argument('--snr', type=int)

    parser_c = subparsers.add_parser('plot_hotmap')
    parser_c.add_argument('--model_name', type=str)
    parser_c.add_argument('--n_events', type=int)

    args = parser.parse_args()

    logs_dir = os.path.join(cfg.workspace, "logs",
                            pp_data.get_filename(__file__))
    pp_data.create_folder(logs_dir)
    logging = pp_data.create_logging(logs_dir, filemode='w')
    logging.info(os.path.abspath(__file__))
    logging.info(sys.argv)

    if args.mode == "train":
        train(args)
    elif args.mode == "recognize":
        recognize(args)
    elif args.mode == "get_stats":
        bgn_iter, fin_iter, interval = 2000, 3001, 200
        get_stats(args, bgn_iter, fin_iter, interval)
    elif args.mode == "separate":
        bgn_iter, fin_iter, interval = 2000, 3001, 200
        separate(args, bgn_iter, fin_iter, interval)
    elif args.mode == "evaluate_separation":
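# Example invocations for the modes wired up above. The script name and
# argument values are illustrative assumptions, not taken from the original:
#
#   python main.py train --n_events=3 --snr=20
#   python main.py recognize --model_name=md5000_iters.p --n_events=3 --snr=20
#   python main.py separate --n_events=3 --snr=20
#   python main.py avg_recognize --n_events=3 --snr=20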
def train(args):
    num_classes = cfg.num_classes

    # Load training & testing data.
    (tr_x, tr_y, tr_na_list) = load_hdf5_data(args.tr_hdf5_path, verbose=1)
    (te_x, te_y, te_na_list) = load_hdf5_data(args.te_hdf5_path, verbose=1)
    print("tr_x.shape: %s" % (tr_x.shape,))

    # Scale data.
    tr_x = do_scale(tr_x, args.scaler_path, verbose=1)
    te_x = do_scale(te_x, args.scaler_path, verbose=1)

    # Build model.
    (_, n_time, n_freq) = tr_x.shape    # (N, 240, 64)
    input_logmel = Input(shape=(n_time, n_freq), name='in_layer')   # (N, 240, 64)
    a1 = Reshape((n_time, n_freq, 1))(input_logmel)                 # (N, 240, 64, 1)

    a1 = block(a1)
    a1 = block(a1)
    a1 = MaxPooling2D(pool_size=(1, 2))(a1)     # (N, 240, 32, 128)

    a1 = block(a1)
    a1 = block(a1)
    a1 = MaxPooling2D(pool_size=(1, 2))(a1)     # (N, 240, 16, 128)

    a1 = block(a1)
    a1 = block(a1)
    a1 = MaxPooling2D(pool_size=(1, 2))(a1)     # (N, 240, 8, 128)

    a1 = block(a1)
    a1 = block(a1)
    a1 = MaxPooling2D(pool_size=(1, 2))(a1)     # (N, 240, 4, 128)

    a1 = Conv2D(256, (3, 3), padding="same", activation="relu",
                use_bias=True)(a1)
    a1 = MaxPooling2D(pool_size=(1, 4))(a1)     # (N, 240, 1, 256)
    a1 = Reshape((240, 256))(a1)                # (N, 240, 256)

    # Gated BGRU.
    rnnout = Bidirectional(GRU(128, activation='linear',
                               return_sequences=True))(a1)
    rnnout_gate = Bidirectional(GRU(128, activation='sigmoid',
                                    return_sequences=True))(a1)
    a2 = Multiply()([rnnout, rnnout_gate])

    # Attention.
    cla = TimeDistributed(Dense(num_classes, activation='sigmoid'),
                          name='localization_layer')(a2)
    att = TimeDistributed(Dense(num_classes, activation='softmax'))(a2)
    out = Lambda(outfunc, output_shape=(num_classes,))([cla, att])

    model = Model(input_logmel, out)
    model.summary()

    adam_optimizer = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999,
                                           epsilon=None, decay=0.0,
                                           amsgrad=False)

    # Compile model. ('categorical_accuracy' may be a better-suited metric for
    # this task; 'accuracy' is kept here pending verification of the results.)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer,
                  metrics=['accuracy'])

    # Save-model callback.
    filepath = os.path.join(args.out_model_dir,
                            "{0}_{1}.hdf5".format(args.model_name, args.epochs))
    create_folder(os.path.dirname(filepath))
    save_model = ModelCheckpoint(filepath=filepath, monitor='val_acc',
                                 verbose=0, save_best_only=False,
                                 save_weights_only=False, mode='auto',
                                 period=1)

    # Train.
    t_train = time.time()
    use_generator = False
    if use_generator:
        gen = RatioDataGenerator(batch_size=args.batch_size, type='train')
        model.fit_generator(generator=gen.generate([tr_x], [tr_y]),
                            steps_per_epoch=args.steps_p_epoch,
                            epochs=args.epochs,
                            verbose=1,
                            callbacks=[save_model],
                            validation_data=(te_x, te_y))
    else:
        model.fit(x=tr_x, y=tr_y, batch_size=20, epochs=args.epochs,
                  verbose=1, callbacks=[save_model], validation_split=0.05,
                  shuffle=True, class_weight=None, sample_weight=None,
                  initial_epoch=args.init_epoch, steps_per_epoch=None,
                  validation_steps=None)

    # Save the final model explicitly; the ModelCheckpoint above already
    # writes out a model once per epoch.
    model.save(os.path.join(args.out_model_dir,
                            "final_model_{}_{}epochs.h5".format(
                                args.model_name, args.epochs)))
    print("Training finished, time taken: %s s" % (time.time() - t_train,))
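# outfunc() above combines the classification and attention branches. A sketch
# of the standard attention-weighted pooling used with this architecture,
# written with Keras backend ops; the epsilon clipping is an assumption for
# numerical safety:
import keras.backend as K

def outfunc(vects):
    """Weighted average of cla over time, with att as the weights.

    cla, att: (N, n_time, num_classes) -> out: (N, num_classes)
    """
    cla, att = vects
    att = K.clip(att, 1e-7, 1.)
    out = K.sum(cla * att, axis=1) / K.sum(att, axis=1)
    return out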
def train(args):
    """Train the neural network. Write out the model every several iterations.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      lr: float, learning rate.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    lr = args.lr
    iteration = args.iter

    # Load data.
    t1 = time.time()
    tr_hdf5_path = os.path.join(workspace, "packed_features", "spectrogram",
                                "train", "%ddb" % int(tr_snr), "data.h5")
    te_hdf5_path = os.path.join(workspace, "packed_features", "spectrogram",
                                "test", "%ddb" % int(te_snr), "data.h5")
    (tr_x1, tr_x2, tr_y1, tr_y2) = pp_data.load_hdf5(tr_hdf5_path)
    (te_x1, te_x2, te_y1, te_y2) = pp_data.load_hdf5(te_hdf5_path)
    print(tr_x1.shape, tr_y1.shape, tr_x2.shape, tr_y2.shape)
    print(te_x1.shape, te_y1.shape, te_x2.shape, te_y2.shape)
    print("Load data time: %s s" % (time.time() - t1,))

    batch_size = 500
    print("%d iterations / epoch" % int(tr_x1.shape[0] / batch_size))

    # Scale data. (Disabled: features are used unscaled here.)
    if False:
        t1 = time.time()
        scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                                   "train", "%ddb" % int(tr_snr), "scaler.p")
        scaler = pickle.load(open(scaler_path, 'rb'))
        tr_x1 = pp_data.scale_on_3d(tr_x1, scaler)
        tr_y1 = pp_data.scale_on_2d(tr_y1, scaler)
        te_x1 = pp_data.scale_on_3d(te_x1, scaler)
        te_y1 = pp_data.scale_on_2d(te_y1, scaler)
        tr_x2 = pp_data.scale_on_2d(tr_x2, scaler)
        tr_y2 = pp_data.scale_on_2d(tr_y2, scaler)
        te_x2 = pp_data.scale_on_2d(te_x2, scaler)
        te_y2 = pp_data.scale_on_2d(te_y2, scaler)
        print("Scale data time: %s s" % (time.time() - t1,))

    # Debug plot.
    if False:
        plt.matshow(tr_x1[0:1000, 0, :].T, origin='lower', aspect='auto',
                    cmap='jet')
        plt.show()

    # Build model.
    (_, n_concat, n_freq) = tr_x1.shape
    n_hid = 2048
    input_dim1 = (257 + 40 + 30) * 2
    input_dim2 = 257 + 40 + 30
    out_dim1 = (257 + 40 + 30) * 2
    out_dim1_irm = 257 + 40 + 64
    out_dim2 = 257 + 40 + 30
    out_dim2_irm = 257 + 40 + 64

    # model = Sequential()
    # model.add(Flatten(input_shape=(n_concat, n_freq)))
    # model.add(Dense(n_hid, activation='relu'))
    # model.add(Dropout(0.2))
    # model.add(Dense(n_hid, activation='relu'))
    # model.add(Dropout(0.2))
    # model.add(Dense(n_hid, activation='relu'))
    # model.add(Dropout(0.2))
    # model.add(Dense(n_freq, activation='linear'))

    # Stage 1: predict a linear output and an IRM (sigmoid) output from the
    # stacked input frames.
    input1 = Input(shape=(n_concat, input_dim1), name='input1')
    layer = Flatten(name='flatten')(input1)
    layer = Dense(n_hid, activation='relu', name='dense1')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(n_hid, activation='relu', name='dense2')(layer)
    layer = Dropout(0.2)(layer)
    partial_out1 = Dense(out_dim1, name='1_out_linear')(layer)
    partial_out1_irm = Dense(out_dim1_irm, name='1_out_irm',
                             activation='sigmoid')(layer)
    out1 = concatenate([partial_out1, partial_out1_irm], name='out1')

    # Stage 2: refine, conditioned on the stage-1 output and a second input.
    input2 = Input(shape=(input_dim2,), name='input2')
    layer = concatenate([input2, out1], name='merge')
    layer = Dense(n_hid, activation='relu', name='dense3')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(n_hid, activation='relu', name='dense4')(layer)
    layer = Dropout(0.2)(layer)
    partial_out2 = Dense(out_dim2, name='2_out_linear')(layer)
    partial_out2_irm = Dense(out_dim2_irm, name='2_out_irm',
                             activation='sigmoid')(layer)
    out2 = concatenate([partial_out2, partial_out2_irm], name='out2')

    model = Model(inputs=[input1, input2], outputs=[out1, out2])
    model.summary()
    sys.stdout.flush()
    model.compile(loss='mean_absolute_error',
                  optimizer=Adam(lr=lr, epsilon=1e-03))

    # Data generators.
    tr_gen = DataGenerator(batch_size=batch_size, type='train')
    eval_te_gen = DataGenerator(batch_size=batch_size, type='test',
                                te_max_iter=100)
    eval_tr_gen = DataGenerator(batch_size=batch_size, type='test',
                                te_max_iter=100)

    # Directories for saving models and training stats.
    model_dir = os.path.join(workspace, "models", "%ddb" % int(tr_snr))
    pp_data.create_folder(model_dir)
    stats_dir = os.path.join(workspace, "training_stats", "%ddb" % int(tr_snr))
    pp_data.create_folder(stats_dir)

    # Print loss before training.
    iter = 0
    tr_loss = eval(model, eval_tr_gen, tr_x1, tr_x2, tr_y1, tr_y2)
    te_loss = eval(model, eval_te_gen, te_x1, te_x2, te_y1, te_y2)
    print("Iteration: %d, tr_loss: %f, te_loss: %f" % (iter, tr_loss, te_loss))

    # Save out training stats.
    stat_dict = {'iter': iter, 'tr_loss': tr_loss, 'te_loss': te_loss}
    stat_path = os.path.join(stats_dir, "%diters.p" % iter)
    cPickle.dump(stat_dict, open(stat_path, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    # Train.
    t1 = time.time()
    for (batch_x, batch_y) in tr_gen.generate(xs=[tr_x1, tr_x2],
                                              ys=[tr_y1, tr_y2]):
        loss = model.train_on_batch(batch_x, batch_y)
        iter += 1

        # Validate and save training stats.
        if iter % 100 == 0:
            tr_loss = eval(model, eval_tr_gen, tr_x1, tr_x2, tr_y1, tr_y2)
            te_loss = eval(model, eval_te_gen, te_x1, te_x2, te_y1, te_y2)
            print("Iteration: %d, tr_loss: %f, te_loss: %f"
                  % (iter, tr_loss, te_loss))
            sys.stdout.flush()

            stat_dict = {'iter': iter, 'tr_loss': tr_loss, 'te_loss': te_loss}
            stat_path = os.path.join(stats_dir, "%diters.p" % iter)
            cPickle.dump(stat_dict, open(stat_path, 'wb'),
                         protocol=cPickle.HIGHEST_PROTOCOL)

        # Save model.
        if iter % (iteration / 20) == 0:
            model_path = os.path.join(model_dir, "md_%diters.h5" % iter)
            model.save(model_path)
            print("Saved model to %s" % model_path)

        if iter == iteration + 1:
            break

    print("Training time: %s s" % (time.time() - t1,))
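# eval() above is a project helper (note that it shadows the Python builtin),
# defined elsewhere. A minimal sketch of the evaluation it presumably
# performs: average the model loss over the batches yielded by a capped test
# generator. The generator protocol mirrors tr_gen.generate() above; the body
# is an assumption:
import numpy as np

def eval_sketch(model, gen, x1, x2, y1, y2):
    """Mean loss of `model` over the batches yielded by `gen`."""
    losses = []
    for (batch_x, batch_y) in gen.generate(xs=[x1, x2], ys=[y1, y2]):
        # test_on_batch returns the loss for one batch without training.
        losses.append(model.test_on_batch(batch_x, batch_y))
    return np.mean(losses)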
def train(args):
    workspace = args.workspace
    audio_type = args.audio_type
    stack_num = args.stack_num
    hop_frames = args.hop_frames
    filename = args.filename
    cuda = args.use_cuda and torch.cuda.is_available()
    fft_size = cfg.fft_size
    print("cuda:", cuda)

    hdf5_file = os.path.join(args.workspace, "features",
                             "cmplx_spectrogram.h5")
    data_type = 'train'

    # Data loaders.
    t1 = time.time()
    batch_size = 500
    shuffle = False
    load_raw = False
    data_loader = pp_data.DataLoader(hdf5_file, data_type, audio_type,
                                     stack_num, hop_frames, center_only=True,
                                     batch_size=batch_size, shuffle=shuffle,
                                     load_raw=load_raw)
    eval_tr_data_loader = pp_data.DataLoader(hdf5_file, 'train', audio_type,
                                             stack_num, hop_frames,
                                             center_only=True,
                                             batch_size=batch_size,
                                             shuffle=shuffle,
                                             load_raw=load_raw)
    eval_te_data_loader = pp_data.DataLoader(hdf5_file, 'test', audio_type,
                                             stack_num, hop_frames,
                                             center_only=True,
                                             batch_size=batch_size,
                                             shuffle=shuffle,
                                             load_raw=load_raw)
    print("Load time: %s" % (time.time() - t1,))

    # Load scalar.
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda)
    std_ = move_data_to_gpu(std_, cuda)

    # Model.
    n_freq = 257
    model = DNN(stack_num, n_freq)
    if cuda:
        model.cuda()
    dft = pp_data.DFT(fft_size, cuda)

    # Optimizer.
    optimizer = optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.999),
                           eps=1e-08, weight_decay=0)

    # Train.
    iter = 0
    model_dir = os.path.join(workspace, "models", filename, audio_type)
    pp_data.create_folder(model_dir)
    t_train = time.time()
    for (batch_x, batch_y) in data_loader.generate():
        output = forward(model, batch_x, mean_, std_, dft, cuda)

        batch_y = np.abs(batch_y)
        batch_y = move_data_to_gpu(batch_y, cuda)
        # batch_y = transform(batch_y, type='torch')
        # batch_y = pp_data.scale(batch_y, mean_, std_)
        loss = mse_loss(output, batch_y)

        # Backward.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        iter += 1

        # Evaluate.
        if iter % 500 == 0:
            t_eval = time.time()
            tr_loss = evaluate(model, eval_tr_data_loader, mean_, std_, dft,
                               cuda)
            te_loss = evaluate(model, eval_te_data_loader, mean_, std_, dft,
                               cuda)
            print("Iter: %d, train err: %f, test err: %f, "
                  "train time: %s, eval time: %s"
                  % (iter, tr_loss, te_loss, time.time() - t_train,
                     time.time() - t_eval))
            t_train = time.time()

        # Save model.
        if iter % 5000 == 0:
            save_out_dict = {
                'iter': iter,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                # NOTE: this stores the loss of the most recent training
                # batch, not a held-out test loss.
                'te_loss': loss,
            }
            save_out_path = os.path.join(model_dir, "md_%d_iters.tar" % iter)
            torch.save(save_out_dict, save_out_path)
            print("Save model to %s" % save_out_path)
            t1 = time.time()
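# move_data_to_gpu() and mse_loss() are project helpers defined elsewhere.
# Minimal sketches consistent with how they are called above; the bodies are
# assumptions, not the original implementations:
import torch

def move_data_to_gpu(x, cuda):
    """Wrap a numpy array as a float torch tensor, on GPU if requested."""
    x = torch.Tensor(x)
    if cuda:
        x = x.cuda()
    return x

def mse_loss(output, target):
    """Mean squared error between predicted and target magnitude spectra."""
    return torch.mean((output - target) ** 2)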