Example #1
def create_cv_csv(out_path):
    """Create cross validation csv file. 
    """
    dataset_dir = cfg.dataset_dir
    workspace = cfg.workspace
    events = cfg.events
    n_folds = cfg.n_folds

    pp_data.create_folder(os.path.dirname(out_path))
    f = open(out_path, 'w')
    f.write("name\tfold\n")

    names = sorted(os.listdir(dataset_dir))  # Sort for reproducible folds.

    for event in events:
        event_names = [e for e in names if event in e]
        kf = KFold(n_splits=n_folds, shuffle=False, random_state=None)
        fold = 0
        for (tr_idxes, te_idxes) in kf.split(event_names):
            for idx in te_idxes:
                event_name = event_names[idx]
                f.write("%s\t%d\n" % (event_name, fold))
            fold += 1
    f.close()

    print("Write out to %s" % n_folds)
def recognize(args):
    workspace = cfg.workspace
    events = cfg.events
    n_events = args.n_events
    snr = args.snr
    md_na = args.model_name
    lb_to_ix = cfg.lb_to_ix
    n_out = len(cfg.events)
    te_fold = cfg.te_fold

    md_path = os.path.join(workspace, "models", pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr, md_na)
    md = serializations.load(md_path)

    # Load data.
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (tr_x, tr_at_y, tr_sed_y, tr_na_list, te_x, te_at_y, te_sed_y,
     te_na_list) = pp_data.load_data(feature_dir=feature_dir,
                                     yaml_dir=yaml_dir,
                                     te_fold=te_fold,
                                     snr=snr,
                                     is_scale=is_scale)  # is_scale: module-level flag in the source file (not shown).

    x = te_x
    at_gts = te_at_y
    sed_gts = te_sed_y
    na_list = te_na_list

    # Recognize.
    [at_pds] = md.predict(x)  # (N, 16)

    observe_nodes = [md.find_layer('detect').output_]
    f_forward = md.get_observe_forward_func(observe_nodes)
    [seg_masks] = md.run_function(f_forward, x, batch_size=500,
                                  tr_phase=0.)  # (n_clips, n_time, n_out)
    seg_masks = np.transpose(seg_masks, (0, 2, 1))[:, :, :, np.newaxis]

    # Dump to pickle.
    out_dir = os.path.join(workspace, "preds", pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr,
                           os.path.splitext(md_na)[0])
    pp_data.create_folder(out_dir)
    out_at_path = os.path.join(out_dir, "at_probs.p")
    out_seg_masks_path = os.path.join(out_dir, "seg_masks.p")

    cPickle.dump(at_pds,
                 open(out_at_path, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(seg_masks,
                 open(out_seg_masks_path, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    # Print stats.
    sed_pds = np.mean(seg_masks, axis=-1)  # (N, n_out, n_time)
    sed_pds = np.transpose(sed_pds, (0, 2, 1))  # (N, n_time, n_out)
    print_stats(at_pds, at_gts, sed_pds, sed_gts)
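
# print_stats is not included in this snippet. Below is a hypothetical sketch
# of what such a helper might compute (clip-level and frame-level mAP with
# sklearn), assuming shapes (N, n_out) for tagging and (N, n_time, n_out) for
# SED; an assumption, not the source's definition:
from sklearn import metrics

def print_stats(at_pds, at_gts, sed_pds, sed_gts):
    # Clip-level audio tagging mAP, macro-averaged over classes.
    at_map = metrics.average_precision_score(at_gts, at_pds, average='macro')
    # Frame-level SED mAP: fold time into the sample axis.
    (n_clips, n_time, n_out) = sed_pds.shape
    sed_map = metrics.average_precision_score(
        sed_gts.reshape((n_clips * n_time, n_out)),
        sed_pds.reshape((n_clips * n_time, n_out)), average='macro')
    print("AT mAP: %f, SED mAP: %f" % (at_map, sed_map))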
Example #3
def no_separation(args):
    """Write out un-separated mixture as baseline. 
    """
    workspace = args.workspace

    out_dir = os.path.join(workspace, "separated_wavs", "no_separation")
    pp_data.create_folder(out_dir)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    for na in names:
        if '.mix_0db.wav' in na:
            print(na)
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, fs) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            bare_na = os.path.splitext(os.path.splitext(na)[0])[0]
            pp_data.write_audio(os.path.join(out_dir, bare_na + ".sep_bg.wav"),
                                mixed_audio, fs)
            pp_data.write_audio(
                os.path.join(out_dir, bare_na + ".sep_event.wav"), mixed_audio,
                fs)

    print("Write out finished!")
Example #4
def calculate_scalar(args):
    workspace = args.workspace
    stack_num = args.stack_num
    hop_frames = args.hop_frames
    filename = args.filename
    audio_type = 'speech'
    
    hdf5_file = os.path.join(args.workspace, "features", "cmplx_spectrogram.h5")
    data_type = 'train'
    batch_size = 500
    data_loader = pp_data.DataLoader(hdf5_file, data_type, audio_type, stack_num, hop_frames, center_only=True, batch_size=batch_size)
    
    
    x_all = []
    n_iter = 0
    max_iter = 100
    
    for (batch_x, batch_y) in data_loader.generate():
        x_all.append(batch_x)
        
        n_iter += 1
        if n_iter == max_iter:
            break
            
    x_all = np.concatenate(x_all, axis=0)
    
    x_all = np.abs(x_all)
    x_all = transform(x_all, type='numpy')
    (mean_, std_) = pp_data.calculate_scalar(x_all)
    
    out_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    pp_data.create_folder(os.path.dirname(out_path))
    cPickle.dump((mean_, std_), open(out_path, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)
    print("Scalar saved to %s" % out_path)
Example #5
def train(tr_fe_fd, tr_csv_file, te_fe_fd, te_csv_file, 
          n_concat, hop, scaler, out_md_fd):
    # Prepare data 
    tr_x, tr_y = pp_data.get_matrix_format_data(
                     fe_fd=tr_fe_fd, 
                     csv_file=tr_csv_file, 
                     n_concat=n_concat, hop=hop, scaler=scaler)
                     
    te_x, te_y = pp_data.get_matrix_format_data(
                     fe_fd=te_fe_fd, 
                     csv_file=te_csv_file, 
                     n_concat=n_concat, hop=hop, scaler=scaler)
    
    n_freq = tr_x.shape[2]
    print("tr_x.shape: %s" % (tr_x.shape,))    # (n_samples, n_concat, n_freq)
    print("tr_y.shape: %s" % (tr_y.shape,))    # (n_samples, n_labels)
    
    
    # Build model
    n_out = len(cfg.labels)
    seq = Sequential()
    seq.add(InputLayer((n_concat, n_freq)))
    seq.add(Flatten())
    seq.add(Dropout(0.2))
    seq.add(Dense(200, act='relu'))
    seq.add(Dropout(0.2))
    seq.add(Dense(200, act='relu'))
    seq.add(Dropout(0.2))
    seq.add(Dense(n_out, act='softmax'))
    md = seq.compile()
    md.summary()
    
    # Validation. 
    # tr_err, te_err are frame based. To get event based err, run recognize.py
    validation = Validation(tr_x=tr_x, tr_y=tr_y, 
                            va_x=None, va_y=None, 
                            te_x=te_x, te_y=te_y, 
                            batch_size=500, call_freq=1, dump_path=None)
    
    # Save model
    pp_data.create_folder(out_md_fd)
    save_model = SaveModel(out_md_fd, call_freq=2)
    
    # Callbacks
    callbacks = [validation, save_model]
    
    # Optimizer
    optimizer = Adam(1e-3)
    
    # fit model
    md.fit(x=tr_x, y=tr_y, 
           batch_size=100, 
           n_epochs=101, 
           loss_func='categorical_crossentropy', 
           optimizer=optimizer, 
           callbacks=callbacks)
def get_avg_stats(args, file_name, bgn_iter, fin_iter, interval_iter):
    eval_hdf5_path = os.path.join(args.cpickle_dir, "eval.h5")
    workspace = args.workspace
    
    # Load ground truth
    (te_x, te_y, te_id_list) = pp_data.load_data(eval_hdf5_path)
    y = te_y
    
    # Average prediction probabilities of several iterations
    prob_dir = os.path.join(workspace, "probs", file_name, "test")
    names = os.listdir(prob_dir)
    
    probs = []
    iters = range(bgn_iter, fin_iter, interval_iter)
    for it in iters:
        pickle_path = os.path.join(prob_dir, "prob_%d_iters.p" % it)
        prob = cPickle.load(open(pickle_path, 'rb'))
        probs.append(prob)
    #print(len(probs))
    avg_prob = np.mean(np.array(probs), axis=0)

    # Compute stats
    t1 = time.time()
    n_out = y.shape[1]
    stats = []
    for k in range(n_out):
        (precisions, recalls, thresholds) = metrics.precision_recall_curve(y[:, k], avg_prob[:, k])
        avg_precision = metrics.average_precision_score(y[:, k], avg_prob[:, k], average=None)
        (fpr, tpr, thresholds) = metrics.roc_curve(y[:, k], avg_prob[:, k])
        auc = metrics.roc_auc_score(y[:, k], avg_prob[:, k], average=None)
        #eer = pp_data.eer(avg_prob[:, k], y[:, k])
        
        skip = 1000
        stat = {'precisions': precisions[0::skip], 'recalls': recalls[0::skip], 'AP': avg_precision, 
                'fpr': fpr[0::skip], 'fnr': 1. - tpr[0::skip], 'auc': auc}
        
        stats.append(stat)
    logging.info("Callback time: %s" % (time.time() - t1,))
    
    # Dump stats
    dump_path = os.path.join(workspace, "stats", pp_data.get_filename(__file__), "test", "avg_%d_%d_%d.p" % (bgn_iter, fin_iter, interval_iter))
    pp_data.create_folder(os.path.dirname(dump_path))
    cPickle.dump(stats, open(dump_path, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)
    #print(stats.shape)
    #for i, e in enumerate(stats):
    #  logging.info("%d. mAP: %f, auc: %f, d_prime: %f" % (i, e['AP'], e['auc'], pp_data.d_prime(e['auc'])))

    # Write out to log
    logging.info("bgn_iter, fin_iter, interval_iter: %d, %d, %d" % (bgn_iter, fin_iter, interval_iter))
    logging.info("mAP: %f" % np.mean([e['AP'] for e in stats]))
    auc = np.mean([e['auc'] for e in stats])
    logging.info("auc: %f" % auc)
    logging.info("d_prime: %f" % pp_data.d_prime(auc))
def my_plot(pd, gt, picture_path, threshold=None):
    classes = cfg.classes
    ig = cfg.ig
    mg = cfg.mg

    estimate_path = picture_path.replace("picture", "estimate_txt")
    estimate_path = estimate_path.replace("jpg", "txt")

    folder, _ = os.path.split(picture_path)
    if not os.path.exists(folder):
        create_folder(folder)

    folder, _ = os.path.split(estimate_path)
    if not os.path.exists(folder):
        create_folder(folder)

    result = open(estimate_path, 'at')
    n_cls = len(classes)
    if threshold is None:
        pd_ = pd.argmax(axis=-1)
    for i in range(n_cls):
        #'''
        plt.subplot(221 + i)
        plt.plot(range(240), gt[:, i], 'r')
        plt.bar(left=range(240), height=pd[:, i], width=1, color='b')
        plt.xlim(0, 251)
        plt.ylim(0, 1.1)
        #'''
        if i != 0:
            if threshold is None:
                class_ind = np.where(pd_ == i)[0]
                pd_class = np.zeros(pd_.shape)
                pd_class[class_ind] = 1
                segments = pro_boundary(pd_class, 0, mg[i], ig[i])
            else:
                segments = pro_boundary(pd[:, i], threshold[i], mg[i], ig[i])
            for j in range(len(segments)):
                #'''
                plt.plot([segments[j][0]] * 240,
                         np.arange(240) / 240.0 * 1.1, 'g')
                plt.plot([segments[j][1]] * 240,
                         np.arange(240) / 240.0 * 1.1, 'g')
                #'''
                result.write(
                    str(segments[j][0] * cfg.step_time) + '\t' +
                    str(segments[j][1] * cfg.step_time) + '\t' + classes[i] +
                    '\n')
    #'''
    plt.savefig(picture_path)
    #plt.show()
    plt.close()
    #'''
    result.close()
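
# pro_boundary is an external helper; from its call sites it appears to turn
# a per-frame activation into (onset, offset) segments using a threshold plus
# the per-class mg (minimum duration) and ig (ignore gap) settings. A
# hypothetical minimal version, not the source's implementation:
def pro_boundary(activation, threshold, mg, ig):
    # Frames above threshold form segments; gaps shorter than ig are merged
    # and segments shorter than mg are dropped.
    active = activation > threshold
    segments = []
    start = None
    for (i, a) in enumerate(active):
        if a and start is None:
            start = i
        elif not a and start is not None:
            segments.append([start, i])
            start = None
    if start is not None:
        segments.append([start, len(active)])
    merged = []
    for seg in segments:
        if merged and seg[0] - merged[-1][1] < ig:
            merged[-1][1] = seg[1]
        else:
            merged.append(seg)
    return [s for s in merged if s[1] - s[0] >= mg]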
def eval(md, x, y, out_dir, out_probs_dir, iter_):

    # Predict
    t1 = time.time()
    (n_clips, n_time, n_freq) = x.shape
    (x, y) = pp_data.transform_data(x, y)
    prob = md.predict(x)
    prob = prob.astype(np.float32)

    if out_dir:
        pp_data.create_folder(out_dir)
        #out_prob_path = os.path.join(out_probs_dir, "prob_%d_iters.p" %iter_)
    # Dump predicted probabilites for future average
    if out_probs_dir:
        pp_data.create_folder(out_probs_dir)
        out_prob_path = os.path.join(out_probs_dir, "prob_%d_iters.p" % iter_)
        cPickle.dump(prob,
                     open(out_prob_path, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)

    # Compute and dump stats
    n_out = y.shape[1]
    stats = []
    t1 = time.time()
    for k in range(n_out):
        (precisions, recalls,
         thresholds) = metrics.precision_recall_curve(y[:, k], prob[:, k])
        avg_precision = metrics.average_precision_score(y[:, k],
                                                        prob[:, k],
                                                        average=None)
        (fpr, tpr, thresholds) = metrics.roc_curve(y[:, k], prob[:, k])
        auc = metrics.roc_auc_score(y[:, k], prob[:, k], average=None)
        #eer = pp_data.eer(prob[:, k], y[:, k])
        skip = 1000
        stat = {
            'precisions': precisions[0::skip],
            'recalls': recalls[0::skip],
            'AP': avg_precision,
            'fpr': fpr[0::skip],
            'fnr': 1. - tpr[0::skip],
            'auc': auc
        }

        stats.append(stat)
    logging.info("Callback time: %s" % (time.time() - t1, ))

    dump_path = os.path.join(out_dir, "md%d_iters.p" % iter_)
    cPickle.dump(stats,
                 open(dump_path, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    logging.info("mAP: %f" % np.mean([e['AP'] for e in stats]))
Example #9
def write_out_at_sed(md, gen, f_forward, x, at_y, sed_y, n_events, snr, te_fold):
    workspace = cfg.workspace
    pred_at_all = []
    seg_masks_all = []
    gt_at_all = []
    gt_sed_all = []
    for [batch_x, batch_at_y, batch_sed_y] in gen.generate(zs=[x, at_y, sed_y]):
        # AT. 
        [at_pred] = md.predict(batch_x, batch_size=None)
        pred_at_all.append(at_pred)
        
        # SED. 
        [seg_masks] = md.run_function(func=f_forward, z=[batch_x], batch_size=500, tr_phase=0.)
        seg_masks_all.append(seg_masks)
        
        gt_at_all.append(batch_at_y)
        gt_sed_all.append(batch_sed_y)
        
    # DO NOT SHUFFLE DATA!
    pred_at_all = np.concatenate(pred_at_all, axis=0)
    seg_masks_all = np.concatenate(seg_masks_all, axis=0)
    
    gt_at_all = np.concatenate(gt_at_all, axis=0)
    gt_sed_all = np.concatenate(gt_sed_all, axis=0)

    # Compress to float16 to reduce space. 
    pred_at_all = pred_at_all.astype(np.float16)
    seg_masks_all = seg_masks_all.astype(np.float16)
    
    print(pred_at_all.shape)
    print(seg_masks_all.shape)
    print(pred_at_all.dtype)
    
    out_dir = os.path.join(workspace, "callbacks", "preds", pp_data.get_filename(__file__), 
                          "n_events=%d" % n_events, "fold=%d" % te_fold, "snr=%d" % snr, 
                          "md%d_iters" % md.iter_)
    pp_data.create_folder(out_dir)
    out_at_path = os.path.join(out_dir, "at_probs.p")
    out_seg_masks_path = os.path.join(out_dir, "seg_masks.p")
    
    cPickle.dump(pred_at_all, open(out_at_path, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(seg_masks_all, open(out_seg_masks_path, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)
    
    thres = 0.5
    (tp, fn, fp, tn) = tp_fn_fp_tn(pred_at_all, gt_at_all, thres, average='macro')
    (prec, recall, fvalue) = prec_recall_fvalue(pred_at_all, gt_at_all, thres, average='macro')
    logging.info("tp, fn, fp, tn: %d %d %d %d" % (tp, fn, fp, tn))
    logging.info("prec, recall, fvalue: %f %f %f" % (prec, recall, fvalue))
def eval(model, x, y, out_dir, out_probs_dir, md_iter):
    pp_data.create_folder(out_dir)

    # Predict
    t1 = time.time()
    (n_clips, n_time_, n_freq) = x.shape
    (x, y) = pp_data.transform_data(x, y)
    prob = model.predict(x)
    prob = prob.astype(np.float32)
    print("The %d time into evalution." % md_iter)
    if out_probs_dir:
        pp_data.create_folder(out_probs_dir)
        out_prob_path = os.path.join(out_probs_dir,
                                     "prob_%d_iters.p" % md_iter)
        # Dump predicted probabilities for future averaging (disabled).
        #cPickle.dump(prob, open(out_prob_path, 'wb'))
    n_out = y.shape[1]
    stats = []
    t1 = time.time()
    for k in range(n_out):
        (precisions, recalls,
         thresholds) = metrics.precision_recall_curve(y[:, k], prob[:, k])
        avg_precision = metrics.average_precision_score(y[:, k],
                                                        prob[:, k],
                                                        average=None)
        (fpr, tpr, thresholds) = metrics.roc_curve(y[:, k], prob[:, k])
        auc = metrics.roc_auc_score(y[:, k], prob[:, k], average=None)
        eer = pp_data.eer(prob[:, k], y[:, k])
        skip = 1000
        stat = {
            'precisions': precisions[0::skip],
            'recalls': recalls[0::skip],
            'AP': avg_precision,
            'fpr': fpr[0::skip],
            'fnr': 1. - tpr[0::skip],
            'auc': auc
        }
        stats.append(stat)

    logging.info("Callback time: %s" % (time.time() - t1, ))
    dump_path = os.path.join(out_dir, "model_%d_iters.p" % (md_iter, ))
    cPickle.dump(stats, open(dump_path, 'wb'))
    mAP = np.mean([e['AP'] for e in stats])
    logging.info("mAP of %d iteration: %f" % (md_iter, mAP))
    return mAP
Example #11
def plot_training_stat(args):
    """Plot training and testing loss.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      bgn_iter: int, plot from bgn_iter
      fin_iter: int, plot finish at fin_iter
      interval_iter: int, interval of files.
    """
    workspace = args.workspace
    tr_snr = args.tr_snr
    bgn_iter = args.bgn_iter
    fin_iter = args.fin_iter
    interval_iter = args.interval_iter

    tr_losses, te_losses, iters = [], [], []

    # Load stats.
    stats_dir = os.path.join(workspace, "training_stats", "%ddb" % int(tr_snr))
    for it in xrange(bgn_iter, fin_iter, interval_iter):
        stats_path = os.path.join(stats_dir, "%diters.p" % it)
        stat_dict = cPickle.load(open(stats_path, 'rb'))
        tr_losses.append(stat_dict['tr_loss'])
        te_losses.append(stat_dict['te_loss'])
        iters.append(stat_dict['iter'])

    # Plot
    line_tr, = plt.plot(tr_losses, c='b', label="Train")
    line_te, = plt.plot(te_losses, c='r', label="Test")
    plt.axis([0, len(iters), 0, max(tr_losses)])
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    plt.legend(handles=[line_tr, line_te])
    plt.xticks(np.arange(len(iters)), iters)
    # plt.show()
    out_path = os.path.join(workspace, "figures", "train_history.png")
    pp_data.create_folder(os.path.dirname(out_path))
    plt.savefig(out_path)
def train(args):
    num_classes = cfg.num_classes

    tr_data = h5py.File(args.tr_hdf5_path, 'r+')
    te_data = h5py.File(args.te_hdf5_path, 'r+')

    tr_shape = tr_data['x'].shape

    print("tr_x.shape: %s" % (tr_shape,))
    
    # Build model
    model = create_model(num_classes, tr_shape)
    
    # Save model callback
    filepath = os.path.join(args.out_model_dir, "gatedAct_rationBal44_lr0.001_normalization_at_cnnRNN_64newMel_240fr.{epoch:02d}-{val_acc:.4f}.hdf5")
    print(filepath)
    create_folder(os.path.dirname(filepath))
    save_model = ModelCheckpoint(filepath=filepath,
                                 monitor='val_acc', 
                                 verbose=0,
                                 save_best_only=False,
                                 save_weights_only=False,
                                 mode='auto',
                                 period=1)  
    num_examples = 41498
    batch_size = 8

    # Data generator
    gen = RatioDataGenerator(batch_size=batch_size, type='train')

    # Train
    model.fit_generator(generator=gen.generate(tr_data), 
                        steps_per_epoch=550,    # 550 iters are treated as one 'epoch'
                        epochs=31,              # Maximum 'epoch' to train - With larger dataset loss increased after epoch 28
                        verbose=1, 
                        callbacks=[save_model], 
                        validation_data=(te_data['x'], te_data['y']))
Example #13
def ibm_separation(args):
    """Ideal binary mask (IBM) source separation. 
    """
    workspace = args.workspace

    out_dir = os.path.join(workspace, "separated_wavs", "ibm_separation")
    pp_data.create_folder(out_dir)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_sec = cfg.clip_sec

    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win**2).sum())

    for na in names:
        if '.mix_0db.wav' in na:
            print(na)
            bare_na = os.path.splitext(os.path.splitext(na)[0])[0]
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, fs) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            [f, t, bg_spec] = signal.spectral.spectrogram(x=bg_audio,
                                                          window=ham_win,
                                                          nperseg=n_window,
                                                          noverlap=n_overlap,
                                                          detrend=False,
                                                          return_onesided=True,
                                                          scaling='density',
                                                          mode='magnitude')

            [f, t,
             event_spec] = signal.spectral.spectrogram(x=event_audio,
                                                       window=ham_win,
                                                       nperseg=n_window,
                                                       noverlap=n_overlap,
                                                       detrend=False,
                                                       return_onesided=True,
                                                       scaling='density',
                                                       mode='magnitude')

            [f, t,
             mixed_spec] = signal.spectral.spectrogram(x=mixed_audio,
                                                       window=ham_win,
                                                       nperseg=n_window,
                                                       noverlap=n_overlap,
                                                       detrend=False,
                                                       return_onesided=True,
                                                       scaling='density',
                                                       mode='complex')

            bg_spec = bg_spec.T
            event_spec = event_spec.T
            mixed_spec = mixed_spec.T

            ratio = 1.7  # 5 dB
            event_mask = (np.sign(event_spec / (bg_spec * ratio) - 1) + 1) / 2
            bg_mask = 1. - event_mask

            bg_separated_spec = np.abs(mixed_spec) * bg_mask
            event_separated_spec = np.abs(mixed_spec) * event_mask

            # Write out separated background
            s = spectrogram_to_wave.recover_wav(bg_separated_spec,
                                                mixed_spec,
                                                n_overlap=n_overlap,
                                                winfunc=np.hamming,
                                                wav_len=int(fs * clip_sec))
            s *= recover_scaler
            pp_data.write_audio(os.path.join(out_dir, bare_na + ".sep_bg.wav"),
                                s, fs)

            # Write out separated event
            s = spectrogram_to_wave.recover_wav(event_separated_spec,
                                                mixed_spec,
                                                n_overlap=n_overlap,
                                                winfunc=np.hamming,
                                                wav_len=int(fs * clip_sec))
            s *= recover_scaler
            pp_data.write_audio(
                os.path.join(out_dir, bare_na + ".sep_event.wav"), s, fs)

    print("Finished!")
Example #14
def jsc_separation(args):
    """Joing separation-classification (JSC) source separation. 
    """
    workspace = args.workspace

    scaler_path = os.path.join(workspace, "scalers", "logmel",
                               "training.scaler")
    scaler = pickle.load(open(scaler_path, 'rb'))

    md_path = os.path.join(workspace, "models", "main", args.model_name)
    md = serializations.load(md_path)

    out_dir = os.path.join(workspace, "separated_wavs", "jsc_separation")
    pp_data.create_folder(out_dir)

    observe_nodes = [md.find_layer('seg_masks').output_]
    f_forward = md.get_observe_forward_func(observe_nodes)

    audio_dir = os.path.join(os.path.join(workspace, "mixed_audio", "testing"))
    names = os.listdir(audio_dir)

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win**2).sum())

    melW = librosa.filters.mel(sr=fs,
                               n_fft=n_window,
                               n_mels=64,
                               fmin=0.,
                               fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)

    for na in names:
        if ".mix" in na:
            # Read yaml
            bare_name = os.path.splitext(os.path.splitext(na)[0])[0]
            yaml_path = os.path.join(audio_dir, "%s.yaml" % bare_name)
            with open(yaml_path, 'r') as f:
                data = yaml.load(f)
            event_type = data['event_type']
            print(na, event_type)

            # Read audio
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, _) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            # Spectrogram
            [f, t, bg_spec] = signal.spectral.spectrogram(x=bg_audio,
                                                          window=ham_win,
                                                          nperseg=n_window,
                                                          noverlap=n_overlap,
                                                          detrend=False,
                                                          return_onesided=True,
                                                          scaling='density',
                                                          mode='complex')

            [f, t,
             event_spec] = signal.spectral.spectrogram(x=event_audio,
                                                       window=ham_win,
                                                       nperseg=n_window,
                                                       noverlap=n_overlap,
                                                       detrend=False,
                                                       return_onesided=True,
                                                       scaling='density',
                                                       mode='complex')

            [f, t,
             mixed_spec] = signal.spectral.spectrogram(x=mixed_audio,
                                                       window=ham_win,
                                                       nperseg=n_window,
                                                       noverlap=n_overlap,
                                                       detrend=False,
                                                       return_onesided=True,
                                                       scaling='density',
                                                       mode='complex')

            bg_spec = bg_spec.T
            event_spec = event_spec.T
            mixed_spec = mixed_spec.T

            # Log Mel spectrogram
            mixed_x = pp_data.calc_feat(mixed_audio)
            x3d = pp_data.do_scaler_on_x3d(mixed_x[np.newaxis, ...], scaler)

            # Segmentation masks
            [mel_masks] = md.run_function(f_forward,
                                          x3d,
                                          batch_size=10,
                                          tr_phase=0.)
            mel_masks = mel_masks[0]  # (n_out, n_time, 64)
            spec_masks = np.dot(mel_masks, inverse_melW)  # (n_out, n_time, 513)

            if args.plot_only:
                mixed_mel_spec = np.dot(np.abs(mixed_spec), melW.T)
                bg_mel_spec = np.dot(np.abs(bg_spec), melW.T)
                event_mel_spec = np.dot(np.abs(event_spec), melW.T)
                ratio = 1.7  # 5 dB
                event_mask = (np.sign(event_mel_spec /
                                      (bg_mel_spec * ratio) - 1) + 1) / 2

                fig, axs = plt.subplots(3, 2, sharex=True)
                axs[0, 0].matshow(np.log(mixed_mel_spec.T),
                                  origin='lower',
                                  aspect='auto')
                axs[0, 1].matshow(event_mask.T, origin='lower', aspect='auto')
                axs[1, 0].matshow(spec_masks[0].T,
                                  origin='lower',
                                  aspect='auto',
                                  vmin=0.,
                                  vmax=1.)
                axs[1, 1].matshow(spec_masks[1].T,
                                  origin='lower',
                                  aspect='auto',
                                  vmin=0.,
                                  vmax=1.)
                axs[2, 0].matshow(spec_masks[2].T,
                                  origin='lower',
                                  aspect='auto',
                                  vmin=0.,
                                  vmax=1.)
                axs[2, 1].matshow(spec_masks[3].T,
                                  origin='lower',
                                  aspect='auto',
                                  vmin=0.,
                                  vmax=1.)
                axs[0, 0].set_title('log Mel of mixture')
                axs[0, 1].set_title('IBM of event')
                axs[1, 0].set_title('babycry')
                axs[1, 1].set_title('glassbreak')
                axs[2, 0].set_title('gunshot')
                axs[2, 1].set_title('bg')

                plt.show()

            else:
                # Separated spec
                separated_specs = spec_masks * np.abs(mixed_spec)[None, :, :]

                # Write out all events and bg
                enlarged_events = cfg.events + ['bg']
                for i1 in xrange(4):
                    s = spectrogram_to_wave.recover_wav(
                        separated_specs[i1],
                        mixed_spec,
                        n_overlap=n_overlap,
                        winfunc=np.hamming,
                        wav_len=len(mixed_audio))
                    s *= recover_scaler
                    pp_data.write_audio(
                        os.path.join(
                            out_dir, "%s.sep_%s.wav" %
                            (bare_name, enlarged_events[i1])), s, fs)

                # Write out event
                s = spectrogram_to_wave.recover_wav(
                    separated_specs[cfg.lb_to_ix[event_type]],
                    mixed_spec,
                    n_overlap=n_overlap,
                    winfunc=np.hamming,
                    wav_len=len(mixed_audio))
                s *= recover_scaler
                pp_data.write_audio(
                    os.path.join(out_dir, "%s.sep_event.wav" % bare_name), s,
                    fs)

                # Write out origin mix
                pp_data.write_audio(
                    os.path.join(out_dir, "%s.sep_mix.wav" % bare_name),
                    mixed_audio, fs)
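
# get_inverse_W maps the 64-bin mel masks back to the 513-bin linear axis. A
# plausible implementation (an assumption; the source's helper is not shown)
# normalizes each linear-frequency column of the mel filterbank so the masks
# are redistributed as convex weights:
import numpy as np

def get_inverse_W(W, eps=1e-10):
    # W: (n_mels, n_freq) mel filterbank from librosa. Returns (n_mels, n_freq)
    # so that np.dot(mel_masks, inverse_melW) gives masks on the linear axis.
    return W / (np.sum(W, axis=0, keepdims=True) + eps)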
def evaluate_separation(args):
    workspace = cfg.workspace
    events = cfg.events
    te_fold = cfg.te_fold
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_duration = cfg.clip_duration
    n_events = args.n_events
    snr = args.snr

    # Load ground truth data.
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (tr_x, tr_at_y, tr_sed_y, tr_na_list, te_x, te_at_y, te_sed_y,
     te_na_list) = pp_data.load_data(feature_dir=feature_dir,
                                     yaml_dir=yaml_dir,
                                     te_fold=te_fold,
                                     snr=snr,
                                     is_scale=is_scale)  # is_scale: module-level flag in the source file (not shown).

    at_y = te_at_y
    sed_y = te_sed_y
    na_list = te_na_list

    audio_dir = os.path.join(workspace, "mixed_audio",
                             "n_events=%d" % n_events)

    sep_dir = os.path.join(workspace, "sep_audio",
                           pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr)

    sep_stats = {}
    for e in events:
        sep_stats[e] = {'sdr': [], 'sir': [], 'sar': []}

    cnt = 0
    for (i1, na) in enumerate(na_list):
        bare_na = os.path.splitext(na)[0]
        gt_audio_path = os.path.join(audio_dir, "%s.wav" % bare_na)
        (stereo_audio, _) = pp_data.read_stereo_audio(gt_audio_path,
                                                      target_fs=fs)
        gt_event_audio = stereo_audio[:, 0]
        gt_noise_audio = stereo_audio[:, 1]

        print(na)
        for j1 in xrange(len(events)):
            if at_y[i1][j1] == 1:
                sep_event_audio_path = os.path.join(
                    sep_dir, "%s.%s.wav" % (bare_na, events[j1]))
                (sep_event_audio, _) = pp_data.read_audio(sep_event_audio_path,
                                                          target_fs=fs)
                sep_noise_audio_path = os.path.join(sep_dir,
                                                    "%s.noise.wav" % bare_na)
                (sep_noise_audio, _) = pp_data.read_audio(sep_noise_audio_path,
                                                          target_fs=fs)
                ref_array = np.array((gt_event_audio, gt_noise_audio))
                est_array = np.array((sep_event_audio, sep_noise_audio))
                (sdr, sir, sar) = sdr_sir_sar(ref_array,
                                              est_array,
                                              sed_y[i1, :, j1],
                                              inside_only=True)
                print(sdr, sir, sar)
                sep_stats[events[j1]]['sdr'].append(sdr)
                sep_stats[events[j1]]['sir'].append(sir)
                sep_stats[events[j1]]['sar'].append(sar)

        cnt += 1
        # if cnt == 5: break

    print(sep_stats)
    sep_stat_path = os.path.join(workspace, "sep_stats",
                                 pp_data.get_filename(__file__),
                                 "n_events=%d" % n_events, "fold=%d" % te_fold,
                                 "snr=%d" % snr, "sep_stat.p")
    pp_data.create_folder(os.path.dirname(sep_stat_path))
    cPickle.dump(sep_stats, open(sep_stat_path, 'wb'))
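
# sdr_sir_sar is not shown. BSS metrics like these are commonly computed with
# mir_eval; inside_only suggests the signals are first trimmed to frames where
# sed_y marks the event as active. A sketch under those assumptions (the
# trimming itself is omitted):
from mir_eval.separation import bss_eval_sources

def sdr_sir_sar(ref_array, est_array, frame_activity=None, inside_only=True):
    # ref_array / est_array: (n_sources, n_samples) with the event first.
    (sdr, sir, sar, _perm) = bss_eval_sources(ref_array, est_array,
                                              compute_permutation=False)
    return (sdr[0], sir[0], sar[0])  # Metrics for the event source.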
Example #16
def train(args):
    """Train the neural network. Write out model every several iterations. 
    
    Args:
      workspace: str, path of workspace. 
      tr_snr: float, training SNR. 
      te_snr: float, testing SNR. 
      lr: float, learning rate. 
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    lr = args.lr
    iteration = args.iter

    # Load data. 
    t1 = time.time()
    tr_hdf5_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "%ddb" % int(tr_snr), "data.h5")
    te_hdf5_path = os.path.join(workspace, "packed_features", "spectrogram", "test", "%ddb" % int(te_snr), "data.h5")
    tr_adapt_utt_path = os.path.join(workspace, "adaptive_utterance", "train", "adaptive_utterance_spec.p")
    te_adapt_utt_path = os.path.join(workspace, "adaptive_utterance", "test", "adaptive_utterance_spec.p")
    tr_adapt_utt = cPickle.load(open(tr_adapt_utt_path, 'rb'))
    te_adapt_utt = cPickle.load(open(te_adapt_utt_path, 'rb'))
    tr_adapt_utt_len_path = os.path.join(workspace, "adaptive_utterance", "train", "adaptive_utterance_max_len.p")
    te_adapt_utt_len_path = os.path.join(workspace, "adaptive_utterance", "test", "adaptive_utterance_max_len.p")
    tr_adapt_utt_len = cPickle.load(open(tr_adapt_utt_len_path, 'rb'))
    te_adapt_utt_len = cPickle.load(open(te_adapt_utt_len_path, 'rb'))
    max_len = max(tr_adapt_utt_len, te_adapt_utt_len)
    (tr_x1, tr_x2, tr_y1, tr_y2, tr_name) = pp_data.load_hdf5(tr_hdf5_path)
    (te_x1, te_x2, te_y1, te_y2, te_name) = pp_data.load_hdf5(te_hdf5_path)
    print(tr_x1.shape, tr_y1.shape, tr_x2.shape, tr_y2.shape)
    print(te_x1.shape, te_y1.shape, te_x2.shape, te_y2.shape)
    print("Load data time: %s s" % (time.time() - t1,))

    batch_size = 500
    print("%d iterations / epoch" % int(tr_x1.shape[0] / batch_size))

    # Scale data (disabled; set to True to enable).
    if False:
        t1 = time.time()
        scaler_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "%ddb" % int(tr_snr),
                                   "scaler.p")
        scaler = pickle.load(open(scaler_path, 'rb'))
        tr_x1 = pp_data.scale_on_3d(tr_x1, scaler)
        tr_y1 = pp_data.scale_on_2d(tr_y1, scaler)
        te_x1 = pp_data.scale_on_3d(te_x1, scaler)
        te_y1 = pp_data.scale_on_2d(te_y1, scaler)
        tr_x2 = pp_data.scale_on_2d(tr_x2, scaler)
        tr_y2 = pp_data.scale_on_2d(tr_y2, scaler)
        te_x2 = pp_data.scale_on_2d(te_x2, scaler)
        te_y2 = pp_data.scale_on_2d(te_y2, scaler)
        print("Scale data time: %s s" % (time.time() - t1,))

    # Debug plot (disabled).
    if False:
        plt.matshow(tr_x1[0: 1000, 0, :].T, origin='lower', aspect='auto', cmap='jet')
        plt.show()
        raise SystemExit  # Halt after the debug plot.

    # Build model
    (_, n_concat, n_freq) = tr_x1.shape
    n_hid = 2048
    input_dim1 = (257 + 40 + 30) * 2
    input_dim2 = (257 + 40 + 30)
    out_dim1 = (257 + 40 + 30) * 2
    out_dim1_irm = 257 + 40 + 64
    out_dim2 = (257 + 40 + 30)
    out_dim2_irm = (257 + 40 + 64)
    num_factorize = 30

    def multiplication(pair_tensors):
        '''
        :param pair_tensors: x: (num_factorize,)
                            y: (num_factorize, n_hid)
        :return: (n_hid,) sum(x[i]*y[i,:],axis=1)
        '''
        x, y = pair_tensors
        return K.sum(tf.multiply(y, K.expand_dims(x, -1)), axis=1)

    adapt_input = Input(shape=(None,), name='adapt_input')
    layer = Reshape((-1, 257), name='reshape')(adapt_input)
    layer = Dense(512, activation='relu', name='adapt_dense1')(layer)
    layer = Dense(512, activation='relu', name='adapt_dense2')(layer)
    layer = Dense(num_factorize, activation='softmax', name='adapt_out')(layer)
    alpha = Lambda(lambda x: K.sum(x, axis=1), output_shape=(num_factorize,), name='sequence_sum')(layer)
    input1 = Input(shape=(n_concat, input_dim1), name='input1')
    layer = Flatten(name='flatten')(input1)
    layer = Dense(n_hid * num_factorize, name='dense0')(layer)
    layer = Reshape((num_factorize, n_hid), name='reshape2')(layer)
    layer = Lambda(multiplication, name='multiply')([alpha, layer])
    layer = Dense(n_hid, activation='relu', name='dense1')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(n_hid, activation='relu', name='dense2')(layer)
    layer = Dropout(0.2)(layer)
    partial_out1 = Dense(out_dim1, name='1_out_linear')(layer)
    partial_out1_irm = Dense(out_dim1_irm, name='1_out_irm', activation='sigmoid')(layer)
    out1 = concatenate([partial_out1, partial_out1_irm], name='out1')
    input2 = Input(shape=(input_dim2,), name='input2')
    layer = concatenate([input2, out1], name='merge')
    layer = Dense(n_hid, activation='relu', name='dense3')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(n_hid, activation='relu', name='dense4')(layer)
    layer = Dropout(0.2)(layer)
    partial_out2 = Dense(out_dim2, name='2_out_linear')(layer)
    partial_out2_irm = Dense(out_dim2_irm, name='2_out_irm', activation='sigmoid')(layer)
    out2 = concatenate([partial_out2, partial_out2_irm], name='out2')
    model = Model(inputs=[input1, input2, adapt_input], outputs=[out1, out2])

    model.summary()
    sys.stdout.flush()
    model.compile(loss='mean_absolute_error',
                  optimizer=Adam(lr=lr, epsilon=1e-03))
    # Data generator.
    tr_gen = DataGenerator(batch_size=batch_size, type='train', max_len=max_len)
    eval_te_gen = DataGenerator(batch_size=batch_size, type='test', te_max_iter=100, max_len=max_len)
    eval_tr_gen = DataGenerator(batch_size=batch_size, type='test', te_max_iter=100, max_len=max_len)

    # Directories for saving models and training stats
    model_dir = os.path.join(workspace, "models", "%ddb" % int(tr_snr))
    pp_data.create_folder(model_dir)

    stats_dir = os.path.join(workspace, "training_stats", "%ddb" % int(tr_snr))
    pp_data.create_folder(stats_dir)

    # Print loss before training. 
    iter = 0
    tr_loss = eval(model, eval_tr_gen, tr_x1, tr_x2, tr_y1, tr_y2, tr_name, tr_adapt_utt)
    te_loss = eval(model, eval_te_gen, te_x1, te_x2, te_y1, te_y2, te_name, te_adapt_utt)
    print("Iteration: %d, tr_loss: %f, te_loss: %f" % (iter, tr_loss, te_loss))

    # Save out training stats. 
    stat_dict = {'iter': iter,
                 'tr_loss': tr_loss,
                 'te_loss': te_loss, }
    stat_path = os.path.join(stats_dir, "%diters.p" % iter)
    cPickle.dump(stat_dict, open(stat_path, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)

    # Train. 
    t1 = time.time()
    for (batch_x, batch_y) in tr_gen.generate([tr_x1, tr_x2, tr_name], [tr_y1, tr_y2], tr_adapt_utt):
        loss = model.train_on_batch(batch_x, batch_y)
        iter += 1

        # Validate and save training stats. 
        if iter % 100 == 0:
            tr_loss = eval(model, eval_tr_gen, tr_x1, tr_x2, tr_y1, tr_y2, tr_name, tr_adapt_utt)
            te_loss = eval(model, eval_te_gen, te_x1, te_x2, te_y1, te_y2, te_name, te_adapt_utt)
            print("Iteration: %d, tr_loss: %f, te_loss: %f" % (iter, tr_loss, te_loss))
            sys.stdout.flush()

            # Save out training stats. 
            stat_dict = {'iter': iter,
                         'tr_loss': tr_loss,
                         'te_loss': te_loss, }
            stat_path = os.path.join(stats_dir, "%diters.p" % iter)
            cPickle.dump(stat_dict, open(stat_path, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)

        # Save model. 
        if iter % (iteration / 20) == 0:
            model_path = os.path.join(model_dir, "md_%diters.h5" % iter)
            model.save(model_path)
            print("Saved model to %s" % model_path)

        if iter == iteration + 1:
            break

    print("Training time: %s s" % (time.time() - t1,))
def train(args):
    if os.path.exists(args.out_model_dir):
        shutil.rmtree(args.out_model_dir)
    create_folder(args.out_model_dir)
    num_classes = cfg.num_classes
    # Load training & testing data
    (tr_x, tr_y, tr_na_list) = load_hdf5(args.tr_hdf5_path, verbose=1)
    (te_x, te_y, te_na_list) = load_hdf5(args.te_hdf5_path, verbose=1)
    print("")

    # Scale data
    tr_x = do_scale(tr_x, args.scaler_path, verbose=1)
    te_x = do_scale(te_x, args.scaler_path, verbose=1)
    # Build model
    (_, n_time, n_freq) = tr_x.shape

    #pdb.set_trace()

    input = Input(shape=(n_time, n_freq), name='input_layer')
    input_ = Reshape((n_time, n_freq, 1))(input)
    '''
    block1 = Conv_BN(input_, 8, (3, 3), act="relu")
    block1 = Conv_BN(block1, 32, (3, 3), act="relu")
    block1 = Conv_BN(block1, 64, (3, 3), act="relu")

    block1 = block_a(input_, 8)
    block1 = block_a(block1, 32)
    block1 = block_a(block1, 64)
    '''
    block1 = block_b(input_, 8)
    block1 = block_b(block1, 32)
    block1 = block_b(block1, 64)
    block1 = MaxPooling2D(pool_size=(1, 2))(block1)

    block2 = block_c(block1, 64)
    block2 = MaxPooling2D(pool_size=(1, 2))(block2)

    block3 = block_c(block2, 64)
    block3 = MaxPooling2D(pool_size=(1, 2))(block3)

    block4 = block_c(block3, 64)
    block4 = MaxPooling2D(pool_size=(1, 2))(block4)

    cnnout = Conv_BN(block4, 128, (1, 1), act="relu", bias=True)
    cnnout = MaxPooling2D(pool_size=(1, 2))(cnnout)
    cnnout = Reshape((240, 256))(cnnout)

    rnn = Bidirectional(
        GRU(128,
            activation='relu',
            return_sequences=True,
            kernel_regularizer=regularizers.l2(0.01),
            recurrent_regularizer=regularizers.l2(0.01)))(cnnout)

    out = TimeDistributed(Dense(
        num_classes,
        activation='softmax',
        kernel_regularizer=regularizers.l2(0.0),
    ),
                          name='output_layer')(rnn)

    model = Model(input, out)
    model.summary()

    # Compile model
    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, decay=0.009)
    sgd = optimizers.SGD(lr=0.01, momentum=0.9, decay=0.0)
    model.compile(loss=focal_loss(alpha=[1, 1, 1, 1], gamma=1),
                  optimizer="adam",  # Note: the adam/sgd instances above are unused as written.
                  metrics=[myacc(threshold=0.5)])

    # Save model callback
    filepath = os.path.join(
        args.out_model_dir,
        "aed-batchsize_50-lr_0.01-{epoch:04d}-{val_Acc:.4f}.hdf5")
    save_model = ModelCheckpoint(filepath=filepath,
                                 monitor='val_Acc',
                                 verbose=0,
                                 save_best_only=False,
                                 save_weights_only=False,
                                 mode='auto',
                                 period=1)

    # Train
    '''
    history=model.fit(  x=tr_x, 
			y=tr_y, 
			batch_size=50, 
			epochs=200, 
			verbose=1,
			shuffle=True,
			class_weight="auto", 
			callbacks=[save_model], 
			validation_data=(te_x,te_y)
		      ) 

    '''
    # Data generator
    gen = Generator(batch_size=50, type='train')
    history = model.fit_generator(
        generator=gen.generate([tr_x], [tr_y]),
        steps_per_epoch=300,  # 300 iters are treated as one 'epoch'
        epochs=100,  # Maximum 'epoch' to train
        verbose=1,
        class_weight="auto",
        callbacks=[save_model],
        validation_data=(te_x, te_y))

    with open('src/log.py', 'w') as f:
        f.write("history=")
        f.write(str(history.history))
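
# focal_loss and myacc are project helpers. A common Keras-style focal loss
# matching the focal_loss(alpha, gamma) call above is sketched here as an
# assumption, not the repository's definition:
import keras.backend as K

def focal_loss(alpha, gamma):
    alpha = K.constant(alpha, dtype='float32')
    def loss(y_true, y_pred):
        # Clip to avoid log(0), then down-weight easy examples by (1 - p)^gamma.
        y_pred = K.clip(y_pred, K.epsilon(), 1. - K.epsilon())
        ce = -y_true * K.log(y_pred)
        return K.sum(alpha * K.pow(1. - y_pred, gamma) * ce, axis=-1)
    return loss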
Example #18
def inference_wiener(args):
    workspace = args.workspace
    iter = args.iteration
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)
    else:
        raise NotImplementedError("Unsupported window type: %s" % window_type)

    # Audio
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    # audio_dir = "/user/HS229/qk00006/my_code2015.5-/python/pub_speech_enhancement/mixture2clean_dnn/workspace/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)

    # Load model.
    target_type = ['speech', 'noise']
    model_dict = {}
    for e in target_type:
        n_freq = 257
        model = DNN(stack_num, n_freq)
        model_path = os.path.join(workspace, "models", filename, e,
                                  "md_%d_iters.tar" % iter)
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])

        # Move model to GPU.
        if cuda:
            model.cuda()
        model.eval()

        model_dict[e] = model

    # Load scalar
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    if mini_num > 0:
        n_every = len(names) / mini_num
    else:
        n_every = 1

    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio, _) = pp_data.read_audio(audio_path, sample_rate)

            audio = pp_data.normalize(audio)
            cmplx_sp = pp_data.calc_sp(audio, fft_size, hop_size, window)
            x = np.abs(cmplx_sp)

            # Process data.
            n_pad = (stack_num - 1) / 2
            x = pp_data.pad_with_border(x, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            # Predict.
            pred_dict = {}
            for e in target_type:
                pred = forward(model_dict[e], x, mean_, std_, cuda)
                pred = pred.data.cpu().numpy()
                pred_dict[e] = pred
            print(cnt, name)

            # Wiener filter.
            pred_mag_sp = pred_dict['speech'] / (
                pred_dict['speech'] + pred_dict['noise']) * np.abs(cmplx_sp)

            pred_cmplx_sp = stft.real_to_complex(pred_mag_sp, cmplx_sp)
            frames = stft.istft(pred_cmplx_sp)

            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(frames, hop_size, cola_constant)
            seq = seq[0:len(audio)]

            # Write out wav
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)

            if visualize:
                vmin = -5.
                vmax = 5.
                fig, axs = plt.subplots(3, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(cmplx_sp)).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[1].matshow(np.log(np.abs(pred_dict['speech'])).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[2].matshow(np.log(np.abs(pred_dict['noise'])).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                plt.show()
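
# forward and move_data_to_gpu come from the surrounding project. A minimal
# sketch of what forward plausibly does (standardize with the loaded scalar,
# then run the DNN), written against the same old-style PyTorch API used above:
import torch
from torch.autograd import Variable

def forward(model, x, mean_, std_, cuda):
    x = Variable(torch.Tensor(x), volatile=True)
    if cuda:
        x = x.cuda()
    return model((x - mean_) / std_)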
Example #19
def inference(args):
    """Inference all test data, write out recovered wavs to disk. 
    
    Args:
      workspace: str, path of workspace. 
      tr_snr: float, training SNR. 
      te_snr: float, testing SNR. 
      n_concat: int, number of frames to concatenate, should equal n_concat 
          in the training stage. 
      iter: int, iteration of model to load. 
      visualize: bool, plot enhanced spectrogram for debug. 
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration
    n_noise_frame = args.noise_frame
    n_hop = args.n_hop

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = False
    # Load model.
    model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                              "md_%diters.h5" % iter)
    model = load_model(model_path)

    # Load scaler.
    # scaler_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "%ddb" % int(tr_snr), "scaler.p")
    # scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)
    mel_basis = librosa.filters.mel(cfg.sample_rate, cfg.n_window, n_mels=40)
    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        input1_3d, input2, out1, out2 = pp_data.get_input_output_layer(
            mixed_cmplx_x, speech_x, noise_x, alpha, n_concat, n_noise_frame,
            n_hop, mel_basis)

        # Predict.
        pred = model.predict([input1_3d, input2])
        print(cnt, na)
        sys.stdout.flush()

        # Inverse scale.
        if scale:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)

        # post processing
        pred_speech_lps = 1 / 3.0 * (pred[0][:, :257] + pred[1][:, :257] +
                                     np.log(np.abs(mixed_cmplx_x) + 1e-08) +
                                     np.log(pred[1][:, 327:584]))

        # Debug plot.
        if args.visualize:
            out_path = os.path.join(workspace, "figures", "test",
                                    "%ddb" % int(te_snr), "%s.all.png" % na)
            pp_data.create_folder(os.path.dirname(out_path))
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(np.log(np.abs(mixed_cmplx_x.T) + 1e-08),
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[1].matshow(np.log(speech_x.T + 1e-08),
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[2].matshow(pred_speech_lps.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in xrange(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.savefig(out_path)
            plt.close('all')
            # plt.show()
            out_path = os.path.join(workspace, "figures", "test",
                                    "%ddb" % int(te_snr),
                                    "%s.mixture.png" % na)
            display.specshow(np.log(np.abs(mixed_cmplx_x.T) + 1e-08))
            plt.title("%ddb mixture log spectrogram" % int(te_snr))
            plt.savefig(out_path)
            out_path = os.path.join(workspace, "figures", "test",
                                    "%ddb" % int(te_snr), "%s.clean.png" % na)
            display.specshow(np.log(speech_x.T + 1e-08))
            plt.title("Clean speech log spectrogram")
            plt.savefig(out_path)
            out_path = os.path.join(workspace, "figures", "test",
                                    "%ddb" % int(te_snr), "%s.enh.png" % na)
            display.specshow(pred_speech_lps.T)
            plt.title("Enhanced speech log spectrogram")
            plt.savefig(out_path)
            plt.close('all')

        # Recover enhanced wav.
        pred_sp = np.exp(pred_speech_lps)
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(n_window)**2
                      ).sum())  # Scale to compensate for the amplitude
        # change introduced by the spectrogram and IFFT.

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test",
                                "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
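
# recover_wav (from spectrogram_to_wave) presumably follows the standard
# recipe: keep the predicted magnitude, reuse the mixture's phase, then
# inverse-STFT with overlap-add. The phase-reuse step, as a small sketch:
import numpy as np

def magnitude_with_mixture_phase(pred_sp, mixed_cmplx_x, eps=1e-10):
    # Combine predicted magnitude with the mixture phase before the inverse
    # STFT (the overlap-add part of recover_wav is omitted here).
    phase = mixed_cmplx_x / (np.abs(mixed_cmplx_x) + eps)
    return pred_sp * phase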
def inference(args):
    cuda = args.use_cuda and torch.cuda.is_available()
    workspace = args.workspace
    model_name = args.model_name
    feat_type = args.feat_type
    script_na = args.script_na

    # Load data.
    te_packed_feat_path = os.path.join(workspace, "packed_features", feat_type,
                                       "test.p")
    [te_x_list, te_y_list,
     te_na_list] = cPickle.load(open(te_packed_feat_path, 'rb'))

    # Scale.
    if True:
        scale_path = os.path.join(workspace, "scalers", feat_type, "scaler.p")
        scaler = pickle.load(open(scale_path, 'rb'))
        te_x_list = pp_data.scale_on_x_list(te_x_list, scaler)

    # Construct model topology.
    n_concat = 3
    te_n_hop = 1
    n_freq = te_x_list[0].shape[-1]
    n_out = te_y_list[0].shape[-1]
    model = Net(n_concat, n_freq, n_out)

    # Init the weights of model using trained weights.
    model_path = os.path.join(workspace, "models", script_na, feat_type,
                              model_name)
    if os.path.isfile(model_path):
        print("Loading checkpoint '%s'" % model_path)
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])
    else:
        raise Exception("Model path %s does not exist!" % model_path)

    # Move model to GPU.
    if cuda:
        model.cuda()

    # Directory to write out transcript midi files.
    out_midi_dir = os.path.join(workspace, "out_midis",
                                pp_data.get_filename(__file__), feat_type)
    pp_data.create_folder(out_midi_dir)

    # Data to 3d.
    n_half = (n_concat - 1) // 2  # Integer division; n_concat is assumed odd.
    for i1 in xrange(len(te_x_list)):
        x = te_x_list[i1]  # (n_time, n_freq)
        y = te_y_list[i1]  # (n_time, n_out)
        bare_na = os.path.splitext(te_na_list[i1])[0]
        (n_time, n_freq) = x.shape

        zero_pad = np.zeros((n_half, n_freq))
        x = np.concatenate((zero_pad, x, zero_pad), axis=0)
        x3d = pp_data.mat_2d_to_3d(x, n_concat,
                                   te_n_hop)  # (n_time, n_concat, n_freq)

        # Move data to GPU.
        x3d = torch.Tensor(x3d)
        x3d = Variable(x3d)
        if cuda:
            x3d = x3d.cuda()

        # Inference.
        model.eval()
        pred = model(x3d)  # (n_time, n_out)

        # Convert data type to numpy.
        pred = pred.data.cpu().numpy()

        # Threshold and write out predicted piano roll to midi file.
        mid_roll = pp_data.prob_to_midi_roll(pred, 0.5)
        out_path = os.path.join(out_midi_dir, "%s.mid" % bare_na)
        print("Write out to: %s" % out_path)
        pp_data.write_midi_roll_to_midi(mid_roll, out_path)

        # Debug plot.
        if True:
            fig, axs = plt.subplots(3, 1, sharex=True)
            axs[0].matshow(y.T, origin='lower', aspect='auto')
            axs[1].matshow(pred.T, origin='lower', aspect='auto')
            binary_pred = (np.sign(pred - 0.5) + 1) / 2
            axs[2].matshow(binary_pred.T, origin='lower', aspect='auto')
            axs[0].set_title("Ground truth")
            axs[1].set_title("DNN output probability")
            axs[2].set_title("DNN output probability after thresholding")
            for j1 in xrange(3):
                axs[j1].set_ylabel('note index')
                axs[j1].set_xlabel('frames')
                axs[j1].xaxis.set_label_coords(1.06, -0.01)
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()
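
`pp_data.prob_to_midi_roll` is project code; a minimal stand-in for the thresholding it performs might look like the sketch below (the function name and the 0/1 convention are assumptions, matching the `binary_pred` plot above):

import numpy as np

def prob_to_midi_roll(prob, threshold):
    """Binarize a (n_time, n_pitch) probability matrix into a 0/1 piano roll.

    Equivalent to the (np.sign(prob - threshold) + 1) / 2 trick above, except
    that values exactly at the threshold map to 1 rather than 0.5.
    """
    return (prob >= threshold).astype(np.int32)
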
Example #21
def inference(args):
    workspace = args.workspace
    iter = args.iteration
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)
    audio_type = 'speech'
    
    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)

    # Audio
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    # audio_dir = "/user/HS229/qk00006/my_code2015.5-/python/pub_speech_enhancement/mixture2clean_dnn/workspace/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)
    
    speech_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/timit_wavs/subtest"
    
    # Load model
    model_path = os.path.join(workspace, "models", filename, audio_type, "md_%d_iters.tar" % iter)
    n_freq = 257
    model = DNN(stack_num, n_freq)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])
    
    if cuda:
        model.cuda()
        
    # Load scaler (mean, std); the project stores it as "scalar.p".
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)
    
    if mini_num > 0:
        n_every = max(1, len(names) // mini_num)  # Process only every n-th file.
    else:
        n_every = 1
        
    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)
    
    dft = pp_data.DFT(fft_size, cuda)
        
    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio0, _) = pp_data.read_audio(audio_path, sample_rate)
            
            audio = pp_data.normalize(audio0)
            
            # Enframe
            frames = stft.enframe(audio, fft_size, hop_size)
            
            # Process data. 
            n_pad = (stack_num - 1) // 2
            x = pp_data.pad_with_border(frames, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)
            
            pred_frames = forward(model, x, mean_, std_, cuda)
            
            pred_frames = pred_frames.data.cpu().numpy()
            
            # cola_constant = 0.5
            # seq = stft.overlap_add(pred_frames, hop_size, cola_constant)
            
            pred_frames *= window
            
            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(pred_frames, hop_size, cola_constant)
            seq = seq[0 : len(audio)]
            
            
            # Write out wav
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)
            
            if visualize:
                
                clean_audio_path = os.path.join(speech_dir, name.split('.')[0] + ".WAV")
                (clean_audio, _) = pp_data.read_audio(clean_audio_path, sample_rate)
                clean_audio = pp_data.normalize(clean_audio)
                clean_frames = stft.enframe(clean_audio, fft_size, hop_size)
                
                mix_sp = np.abs(np.fft.rfft(frames * window, norm='ortho'))
                enh_sp = np.abs(np.fft.rfft(pred_frames * window, norm='ortho'))
                clean_sp = np.abs(np.fft.rfft(clean_frames * window, norm='ortho'))
                
                K = 10
                fig, axs = plt.subplots(K // 2, 2, sharex=True)
                for k in range(K):
                    axs[k // 2, k % 2].plot(frames[k + 100], color='y')
                    axs[k // 2, k % 2].plot(clean_frames[k + 100], color='r')
                    axs[k // 2, k % 2].plot(pred_frames[k + 100], color='b')
                plt.show()
                
                vmin = -5.
                vmax = 5.
                fig, axs = plt.subplots(3,1, sharex=True)
                # mix_sp/clean_sp/enh_sp are already magnitudes; the epsilon guards log(0).
                axs[0].matshow(np.log(mix_sp + 1e-08).T, origin='lower', aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                axs[1].matshow(np.log(clean_sp + 1e-08).T, origin='lower', aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                axs[2].matshow(np.log(enh_sp + 1e-08).T, origin='lower', aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                plt.show()
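
`stft.get_cola_constant` and `stft.overlap_add` are project helpers. Under the usual constant-overlap-add (COLA) assumption the synthesis step can be sketched as below; this is a simplified stand-in, not the project's exact implementation:

import numpy as np

def overlap_add(frames, hop_size, cola_constant):
    """Reconstruct a 1-D signal from (n_frames, frame_size) windowed frames."""
    (n_frames, frame_size) = frames.shape
    out = np.zeros((n_frames - 1) * hop_size + frame_size)
    for i in range(n_frames):
        out[i * hop_size : i * hop_size + frame_size] += frames[i]
    # For a COLA-compliant window the overlapped windows sum to a constant,
    # so dividing by it undoes the windowing.
    return out / cola_constant
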
Example #22
def create_mixture_csv(args):
    """Create csv containing mixture information.
    Each line in the .csv file contains [speech_name, noise_name, noise_onset, noise_offset]

    Args:
      workspace: str, path of workspace.
      speech_dir: str, path of speech data.
      noise_dir: str, path of noise data.
      data_type: str, 'train' | 'test'.
      magnification: int, only used when data_type='train', number of noise
          selected to mix with a speech. E.g., when magnication=3, then 4620
          speech with create 4620*3 mixtures. magnification should not larger
          than the species of noises.
    """
    workspace = args.workspace
    speech_dir = args.speech_dir
    noise_dir = args.noise_dir
    interfere_dir = args.interfere_dir
    data_type = args.data_type
    magnification = args.magnification
    fs = cfg.sample_rate

    speech_names = [
        na for na in os.listdir(speech_dir) if na.lower().endswith(".wav")
    ]
    noise_names = [
        na for na in os.listdir(noise_dir) if na.lower().endswith(".wav")
    ]
    interfere_names = [
        na for na in os.listdir(interfere_dir) if na.lower().endswith(".wav")
    ]

    rs = np.random.RandomState(0)
    out_csv_path = os.path.join(workspace, "mixture_csvs",
                                "%s.csv" % data_type)
    pp_data.create_folder(os.path.dirname(out_csv_path))

    cnt = 0
    f = open(out_csv_path, 'w')
    f.write("%s\t%s\t%s\t%s\t%s\t%s\n" %
            ("speech_name", "noise_name", "noise_onset", "noise_offset",
             "interfere_onset", "interfere_offset"))
    for speech_na in speech_names:
        # Read speech.
        speech_path = os.path.join(speech_dir, speech_na)
        (speech_audio, _) = read_audio(speech_path, fs)
        len_speech = len(speech_audio)

        # For training data, mix each speech with randomly picked #magnification noises.
        if data_type == 'train':
            selected_noise_names = rs.choice(noise_names,
                                             size=magnification,
                                             replace=False)
        # For test data, mix each speech with all noises.
        elif data_type == 'test':
            selected_noise_names = noise_names
        else:
            raise Exception("data_type must be train | test!")

        selected_interfere_names = rs.choice(interfere_names,
                                             size=1,
                                             replace=False)

        # Mix one speech with different noises many times.
        for idx, noise_na in enumerate(selected_noise_names):
            noise_path = os.path.join(noise_dir, noise_na)
            (noise_audio, _) = read_audio(noise_path, fs)
            interfere_path = os.path.join(interfere_dir,
                                          selected_interfere_names[0])
            interfere_audio, _ = read_audio(interfere_path, fs)
            len_infer = len(interfere_audio)

            if len_infer <= len_speech:
                infer_onset = 0
                infer_offset = len_speech
            # If the interference is longer than the speech, randomly select a segment of it.
            else:
                infer_onset = rs.randint(0, len_infer - len_speech, size=1)[0]
                infer_offset = infer_onset + len_speech

            len_noise = len(noise_audio)

            if len_noise <= len_speech:
                noise_onset = 0
                noise_offset = len_speech
            # If the noise is longer than the speech, randomly select a segment of it.
            else:
                noise_onset = rs.randint(0, len_noise - len_speech, size=1)[0]
                noise_offset = noise_onset + len_speech

            if cnt % 100 == 0:
                print(cnt)

            cnt += 1
            f.write("%s\t%s\t%s\t%d\t%d\t%d\t%d\n" %
                    (speech_na, noise_na, selected_interfere_names[0],
                     noise_onset, noise_offset, infer_onset, infer_offset))
    f.close()
    print(out_csv_path)
    print("Create %s mixture csv finished!" % data_type)
Example #23
def create_mix_yaml(cv_path, n_events, out_path):
    """Create yaml file containing the mixture information. 
    """
    workspace = cfg.workspace
    events = cfg.events
    n_folds = cfg.n_folds
    onset_list = cfg.onset_list

    rs = np.random.RandomState(0)

    # Read the cross validation csv given by cv_path.
    with open(cv_path, 'rb') as f:
        reader = csv.reader(f, delimiter='\t')
        lis = list(reader)

    yaml_data = []
    cnt = 0
    for tar_fold in xrange(n_folds):
        for loop in xrange(n_events):

            # Initialize a dict mapping each event to its file names.
            name_dict = {}
            for e in events:
                name_dict[e] = []

            # Read all rows in the cross validation csv.
            for i1 in xrange(1, len(lis)):
                [name, fold] = lis[i1]
                fold = int(fold)
                if fold == tar_fold:
                    for e in events:
                        if e in name:
                            name_dict[e].append(name)

            while _get_n_elements_in_dict(name_dict) >= n_events:
                # Randomly select event files.
                selected_names = []
                events_pool = _get_n_largest_events(name_dict, n_events, rs)

                selected_events = rs.choice(events_pool,
                                            size=n_events,
                                            replace=False)
                for e in selected_events:
                    sel_na = rs.choice(name_dict[e], replace=False)
                    sel_na = str(sel_na)
                    selected_names.append(sel_na)
                    name_dict[e].remove(sel_na)
                    if len(name_dict[e]) == 0:
                        name_dict.pop(e)

                # Combine yaml info.
                mixture_data = {
                    'name': "%05d.wav" % cnt,
                    'fold': tar_fold,
                    'events': []
                }
                cnt += 1
                for (j1, na) in enumerate(selected_names):
                    event_data = {
                        'file_name': na,
                        'event': re.split(r'(\d+)', na)[0],
                        'onset': onset_list[j1],
                        'fold': 0
                    }
                    mixture_data['events'].append(event_data)

                yaml_data.append(mixture_data)

    # Write out yaml file.
    pp_data.create_folder(os.path.dirname(out_path))
    with open(out_path, 'w') as f:
        f.write(yaml.dump(yaml_data, default_flow_style=False))
    print("len(yaml_file): %d" % len(yaml_data))
    print("Write out to %s" % out_path)
Example #24
def train(args):
    """Train the neural network. Write out model every several iterations.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      lr: float, learning rate.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    lr = args.lr
    # Load data.
    t1 = time.time()
    tr_hdf5_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "%ddb" % int(tr_snr), "data.h5")
    te_hdf5_path = os.path.join(workspace, "packed_features", "spectrogram", "test", "%ddb" % int(te_snr), "data.h5")
    (tr_x, tr_y) = pp_data.load_hdf5(tr_hdf5_path)
    (te_x, te_y) = pp_data.load_hdf5(te_hdf5_path)
    print(tr_x.shape, tr_y.shape)
    print(te_x.shape, te_y.shape)
    print("Load data time: %s s" % (time.time() - t1,))
    batch_size = 128
    print("%d iterations / epoch" % int(tr_x.shape[0] / batch_size))
    # Build model
    _, n_freq = tr_x.shape
    # encode
    T = 1
    data = Input(shape=[n_freq])
    x = Reshape([1, T, n_freq])(data)
    x1 = Conv2D(10, (T, 11), strides=(10, 1), data_format='channels_first', padding='same')(x)
    x1 = BatchNormalization(axis=-1)(x1)
    x1 = Activation('relu')(x1)

    x2 = Conv2D(12, (T, 7), strides=(10, 1), data_format='channels_first', padding='same')(x1)
    x2 = BatchNormalization(axis=-1)(x2)
    x2 = Activation('relu')(x2)

    x3 = Conv2D(14, (T, 5), strides=(10, 1), data_format='channels_first', padding='same')(x2)
    x3 = BatchNormalization(axis=-1)(x3)
    x3 = Activation('relu')(x3)

    x4 = Conv2D(15, (T, 5), strides=(10, 1), data_format='channels_first', padding='same')(x3)
    x4 = BatchNormalization(axis=-1)(x4)
    x4 = Activation('relu')(x4)

    x5 = Conv2D(19, (1, 5), strides=(10, 1), data_format='channels_first', padding='same')(x4)
    x5 = BatchNormalization(axis=-1)(x5)
    x5 = Activation('relu')(x5)

    x6 = Conv2D(21, (1, 5), strides=(10, 1), data_format='channels_first', padding='same')(x5)
    x6 = BatchNormalization(axis=-1)(x6)
    x6 = Activation('relu')(x6)

    x7 = Conv2D(23, (1, 7), strides=(10, 1), data_format='channels_first', padding='same')(x6)
    x7 = BatchNormalization(axis=-1)(x7)
    x7 = Activation('relu')(x7)

    x8 = Conv2D(25, (1, 11), strides=(10, 1), data_format='channels_first', padding='same')(x7)
    x8 = BatchNormalization(axis=-1)(x8)
    x8 = Activation('relu')(x8)

    # decode
    y1 = Conv2D(23, (1, 7), strides=(10, 1), data_format='channels_first', padding='same')(x8)
    y1 = Add()([y1, x7])
    y1 = BatchNormalization(axis=-1)(y1)
    y1 = Activation('relu')(y1)

    y2 = Conv2D(21, (1, 5), strides=(10, 1), data_format='channels_first', padding='same')(y1)
    y2 = Add()([y2, x6])
    y2 = BatchNormalization(axis=-1)(y2)
    y2 = Activation('relu')(y2)

    y3 = Conv2D(19, (1, 5), strides=(10, 1), data_format='channels_first', padding='same')(y2)
    y3 = Add()([y3, x5])
    y3 = BatchNormalization(axis=-1)(y3)
    y3 = Activation('relu')(y3)

    y4 = Conv2D(15, (1, 5), strides=(10, 1), data_format='channels_first', padding='same')(y3)
    y4 = Add()([y4, x4])
    y4 = BatchNormalization(axis=-1)(y4)
    y4 = Activation('relu')(y4)

    y5 = Conv2D(14, (1, 5), strides=(10, 1), data_format='channels_first', padding='same')(y4)
    y5 = Add()([y5, x3])
    y5 = BatchNormalization(axis=-1)(y5)
    y5 = Activation('relu')(y5)

    y6 = Conv2D(12, (1, 7), strides=(10, 1), data_format='channels_first', padding='same')(y5)
    y6 = Add()([y6, x2])
    y6 = BatchNormalization(axis=-1)(y6)
    y6 = Activation('relu')(y6)

    y7 = Conv2D(10, (1, 11), strides=(10, 1), data_format='channels_first', padding='same')(y6)
    y7 = Add()([y7, x1])
    y7 = BatchNormalization(axis=-1)(y7)
    y7 = Activation('relu')(y7)

    y8 = Conv2D(1, (1, n_freq), strides=(10, 1), data_format='channels_first', padding='same')(y7)
    # y5 = BatchNormalization(axis=-1)(y5)
    y8 = Activation('relu')(y8)

    out = Reshape([n_freq])(y8)

    model = Model(inputs=data, outputs=out)
    adam = optimizers.Adam(lr=lr)
    model.compile(loss='mean_absolute_error', optimizer=adam)
    model.summary()

    # Data generator.
    tr_gen = DataGenerator(batch_size=batch_size, type='train')
    te_gen = DataGenerator(batch_size=batch_size, type='test', te_max_iter=200)
    # Directories for saving models and training stats
    model_dir = os.path.join(workspace, "models", "%ddb" % int(tr_snr))
    pp_data.create_folder(model_dir)
    # Train.
    t1 = time.time()
    model.fit_generator(tr_gen.generate(xs=[tr_x], ys=[tr_y]), validation_data=te_gen.generate(xs=[te_x], ys=[te_y]),
                        validation_steps=100, steps_per_epoch=200, epochs=200)
    print("Training complete.")
    model_name = 'FullyCNN.h5'
    model_path = os.path.join(model_dir, model_name)
    model.save(model_path)
    print("Training time: %s s" % (time.time() - t1,))
def separate(args, bgn_iter, fin_iter, interval):
    workspace = cfg.workspace
    events = cfg.events
    te_fold = cfg.te_fold
    n_events = args.n_events
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_duration = cfg.clip_duration
    snr = args.snr

    # Load ground truth data.
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (tr_x, tr_at_y, tr_sed_y, tr_na_list, te_x, te_at_y, te_sed_y,
     te_na_list) = pp_data.load_data(feature_dir=feature_dir,
                                     yaml_dir=yaml_dir,
                                     te_fold=te_fold,
                                     snr=snr,
                                     is_scale=is_scale)

    at_y = te_at_y
    sed_y = te_sed_y
    na_list = te_na_list

    # Load and sum
    preds_dir = os.path.join(workspace, "preds",
                             pp_data.get_filename(__file__),
                             "n_events=%d" % n_events, "fold=%d" % te_fold,
                             "snr=%d" % snr)

    at_probs_list, seg_masks_list = [], []
    for iter in xrange(bgn_iter, fin_iter, interval):
        seg_masks_path = os.path.join(preds_dir, "md%d_iters" % iter,
                                      "seg_masks.p")
        seg_masks = cPickle.load(open(seg_masks_path, 'rb'))
        seg_masks_list.append(seg_masks)
    seg_masks = np.mean(seg_masks_list,
                        axis=0)  # (n_clips, n_classes, n_time, n_freq)

    print(seg_masks.shape)

    #
    audio_dir = os.path.join(workspace, "mixed_audio",
                             "n_events=%d" % n_events)

    sep_dir = os.path.join(workspace, "sep_audio",
                           pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr)
    pp_data.create_folder(sep_dir)

    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win**2).sum())
    melW = librosa.filters.mel(sr=fs,
                               n_fft=n_window,
                               n_mels=64,
                               fmin=0.,
                               fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)  # (64, 513)

    seg_stats = {}
    for e in events:
        seg_stats[e] = {
            'fvalue': [],
            'auc': [],
            'iou': [],
            'hit': [],
            'fa': [],
            'tp': [],
            'fn': [],
            'fp': []
        }

    cnt = 0
    for (i1, na) in enumerate(na_list):
        bare_na = os.path.splitext(na)[0]
        audio_path = os.path.join(audio_dir, "%s.wav" % bare_na)
        (stereo_audio, _) = pp_data.read_stereo_audio(audio_path, target_fs=fs)
        event_audio = stereo_audio[:, 0]
        noise_audio = stereo_audio[:, 1]
        mixed_audio = event_audio + noise_audio

        mixed_cmplx_sp = pp_data.calc_sp(mixed_audio, fs, ham_win, n_window,
                                         n_overlap)
        mixed_sp = np.abs(mixed_cmplx_sp)
        event_sp = np.abs(
            pp_data.calc_sp(event_audio, fs, ham_win, n_window, n_overlap))
        noise_sp = np.abs(
            pp_data.calc_sp(noise_audio, fs, ham_win, n_window, n_overlap))

        sm = seg_masks[i1]  # (n_classes, n_time, n_freq)
        sm_upsampled = np.dot(sm, inverse_melW)  # (n_classes, n_time, 513)

        print(na)

        # Write out separated events.
        for j1 in xrange(len(events)):
            if at_y[i1][j1] == 1:
                (fvalue, auc, iou, tp, fn, fp) = fvalue_iou(sm_upsampled[j1],
                                                            event_sp,
                                                            noise_sp,
                                                            sed_y[i1, :, j1],
                                                            seg_thres,
                                                            inside_only=True)
                (hit, fa) = hit_fa(sm_upsampled[j1],
                                   event_sp,
                                   noise_sp,
                                   sed_y[i1, :, j1],
                                   seg_thres,
                                   inside_only=True)
                seg_stats[events[j1]]['fvalue'].append(fvalue)
                seg_stats[events[j1]]['auc'].append(auc)
                seg_stats[events[j1]]['iou'].append(iou)
                seg_stats[events[j1]]['hit'].append(hit)
                seg_stats[events[j1]]['fa'].append(fa)
                seg_stats[events[j1]]['tp'].append(tp)
                seg_stats[events[j1]]['fn'].append(fn)
                seg_stats[events[j1]]['fp'].append(fp)

                sep_event_sp = sm_upsampled[j1] * mixed_sp
                sep_event_s = spectrogram_to_wave.recover_wav(
                    sep_event_sp,
                    mixed_cmplx_sp,
                    n_overlap=n_overlap,
                    winfunc=np.hamming,
                    wav_len=int(fs * clip_duration))
                sep_event_s *= recover_scaler

                out_event_audio_path = os.path.join(
                    sep_dir, "%s.%s.wav" % (bare_na, events[j1]))
                pp_data.write_audio(out_event_audio_path, sep_event_s, fs)

        # Write out separated noise.
        sm_noise_upsampled = np.clip(1. - np.sum(sm_upsampled, axis=0), 0., 1.)
        sep_noise_sp = sm_noise_upsampled * mixed_sp
        sep_noise_s = spectrogram_to_wave.recover_wav(sep_noise_sp,
                                                      mixed_cmplx_sp,
                                                      n_overlap=n_overlap,
                                                      winfunc=np.hamming,
                                                      wav_len=int(
                                                          fs * clip_duration))
        sep_noise_s *= recover_scaler
        out_noise_audio_path = os.path.join(sep_dir, "%s.noise.wav" % bare_na)
        pp_data.write_audio(out_noise_audio_path, sep_noise_s, fs)

        cnt += 1
        # if cnt == 2: break


    fvalues, aucs, ious, hits, fas, tps, fns, fps = [], [], [], [], [], [], [], []
    for e in events:
        fvalues.append(np.mean(seg_stats[e]['fvalue']))
        ious.append(np.mean(seg_stats[e]['iou']))
        aucs.append(np.mean(seg_stats[e]['auc']))
        hits.append(np.mean(seg_stats[e]['hit']))
        fas.append(np.mean(seg_stats[e]['fa']))
        tps.append(np.mean(seg_stats[e]['tp']))
        fns.append(np.mean(seg_stats[e]['fn']))
        fps.append(np.mean(seg_stats[e]['fp']))

    logging.info("%sfvalue\tauc\tiou\tHit\tFa\tHit-Fa\tTP\tFN\tFP" %
                 ("".ljust(16)))
    logging.info(
        "%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f" %
        ("*Avg. of each".ljust(16), np.mean(fvalues), np.mean(aucs),
         np.mean(ious), np.mean(hits), np.mean(fas), np.mean(hits) -
         np.mean(fas), np.mean(tps), np.mean(fns), np.mean(fps)))
    for i1 in xrange(len(events)):
        logging.info(
            "%s%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f" %
            (events[i1].ljust(16), fvalues[i1], aucs[i1], ious[i1], hits[i1],
             fas[i1], hits[i1] - fas[i1], tps[i1], fns[i1], fps[i1]))
Example #26
            pp.write_audio(clean_path, clean_new, conf1.fs)

        clean_spec = pp.calc_sp(clean_new, mode='magnitude')
        mixed_spec = pp.calc_sp(mixed, mode='complex')

        clean_all.append(clean_spec)
        mixed_all.append(mixed_spec)

    print(len(clean_all), ',', len(mixed_all))

    num_te = pp.pack_features(mixed_all, clean_all, 'test')

    compute_scaler('test')

    return num_tr, num_te


get_gpu()

pp.create_folder(conf1.train_folder)
pp.create_folder(conf1.test_folder)
pp.create_folder(conf1.packed_feature_dir)
pp.create_folder(conf1.data_train_dir)
pp.create_folder(conf1.data_test_dir)
pp.create_folder(conf1.logs)
pp.create_folder(conf1.model_dir)
pp.create_folder(conf1.stats_dir)

t1 = time.time()

num_tr, num_te = prepare_database()
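
compute_scaler is called above but not shown; a typical version fits a per-frequency-bin scaler on the packed 2-D features and pickles it (a sketch under assumed shapes and paths, not the project's exact code):

import pickle
from sklearn.preprocessing import StandardScaler

def compute_scaler(x2d, out_path):
    """Fit a StandardScaler on (n_frames, n_freq) features and save it."""
    scaler = StandardScaler().fit(x2d)
    with open(out_path, 'wb') as f:
        pickle.dump(scaler, f, protocol=pickle.HIGHEST_PROTOCOL)
    return scaler
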
def train(args):
    workspace = cfg.workspace
    te_fold = cfg.te_fold
    n_events = args.n_events
    snr = args.snr

    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (tr_x, tr_at_y, tr_sed_y, tr_na_list, te_x, te_at_y, te_sed_y,
     te_na_list) = pp_data.load_data(feature_dir=feature_dir,
                                     yaml_dir=yaml_dir,
                                     te_fold=te_fold,
                                     snr=snr,
                                     is_scale=is_scale)

    print(tr_x.shape, tr_at_y.shape)
    print(te_x.shape, te_at_y.shape)
    (_, n_time, n_freq) = tr_x.shape
    n_out = len(cfg.events)

    if False:
        for e in tr_x:
            plt.matshow(e.T, origin='lower', aspect='auto')
            plt.show()

    # Build model.
    lay_in = InputLayer(in_shape=(n_time, n_freq))

    a = Reshape((1, n_time, n_freq))(lay_in)
    a = Conv2D(n_outfmaps=64,
               n_row=3,
               n_col=5,
               act='linear',
               strides=(1, 1),
               border_mode=(1, 2))(a)
    a = BN(axis=(0, 2, 3))(a)
    a = Activation('relu')(a)
    a = Conv2D(n_outfmaps=64,
               n_row=3,
               n_col=5,
               act='linear',
               strides=(1, 1),
               border_mode=(1, 2))(a)
    a = BN(axis=(0, 2, 3))(a)
    a = Activation('relu')(a)
    a = Dropout(p_drop=0.2)(a)

    a = Conv2D(n_outfmaps=64,
               n_row=3,
               n_col=5,
               act='linear',
               strides=(1, 1),
               border_mode=(1, 2))(a)
    a = BN(axis=(0, 2, 3))(a)
    a = Activation('relu')(a)
    a = Conv2D(n_outfmaps=64,
               n_row=3,
               n_col=5,
               act='linear',
               strides=(1, 1),
               border_mode=(1, 2))(a)
    a = BN(axis=(0, 2, 3))(a)
    a = Activation('relu')(a)
    a = Dropout(p_drop=0.2)(a)

    a = Conv2D(n_outfmaps=64,
               n_row=3,
               n_col=5,
               act='linear',
               strides=(1, 1),
               border_mode=(1, 2))(a)
    a = BN(axis=(0, 2, 3))(a)
    a = Activation('relu')(a)
    a = Conv2D(n_outfmaps=64,
               n_row=3,
               n_col=5,
               act='linear',
               strides=(1, 1),
               border_mode=(1, 2))(a)
    a = BN(axis=(0, 2, 3))(a)
    a = Activation('relu')(a)
    a = Dropout(p_drop=0.2)(a)

    a = Conv2D(n_outfmaps=n_out,
               n_row=1,
               n_col=1,
               act='sigmoid',
               border_mode=(0, 0),
               name='seg_masks')(a)

    a8 = Lambda(_global_avg_pooling, name='a8')(a)

    md = Model([lay_in], [a8])
    md.compile()
    md.summary(is_logging=True)

    # Callbacks.
    md_dir = os.path.join(workspace, "models", pp_data.get_filename(__file__),
                          "n_events=%d" % n_events, "fold=%d" % te_fold,
                          "snr=%d" % snr)
    pp_data.create_folder(md_dir)
    save_model = SaveModel(md_dir, call_freq=50, type='iter', is_logging=True)
    validation = Validation(te_x=te_x,
                            te_y=te_at_y,
                            batch_size=50,
                            call_freq=50,
                            metrics=['binary_crossentropy'],
                            dump_path=None,
                            is_logging=True)

    callbacks = [save_model, validation]

    observe_nodes = [md.find_layer('seg_masks').output_]
    f_forward = md.get_observe_forward_func(observe_nodes)

    # Generator.
    tr_gen = DataGenerator(batch_size=32, type='train')
    eva_gen = DataGenerator2(batch_size=32, type='test')

    # Train.
    loss_ary = []
    t1 = time.time()
    optimizer = Adam(1e-3)
    for (batch_x, batch_y) in tr_gen.generate(xs=[tr_x], ys=[tr_at_y]):
        if md.iter_ % 50 == 0:
            logging.info("iter: %d tr_loss: %f time: %s" % (
                md.iter_,
                np.mean(loss_ary),
                time.time() - t1,
            ))
            t1 = time.time()
            loss_ary = []
        # if md.iter_ % 200 == 0:
        # write_out_at_sed(md, eva_gen, f_forward, te_x, te_at_y, te_sed_y, n_events, snr, te_fold)
        if md.iter_ == 5001:
            break
        loss = md.train_on_batch(batch_x,
                                 batch_y,
                                 loss_func='binary_crossentropy',
                                 optimizer=optimizer,
                                 callbacks=callbacks)
        loss_ary.append(loss)
if __name__ == '__main__':
    # The scraped snippet omits the top of this block; a minimal reconstruction:
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='mode')
    # ... subparsers for 'train', 'recognize', 'get_stats', 'separate' and
    # 'evaluate_separation' are defined here in the full script ...

    parser_get_sep_stats = subparsers.add_parser('get_sep_stats')
    parser_get_sep_stats.add_argument('--n_events', type=int)
    parser_get_sep_stats.add_argument('--snr', type=int)

    parser_b2 = subparsers.add_parser('avg_recognize')
    parser_b2.add_argument('--n_events', type=int)
    parser_b2.add_argument('--snr', type=int)

    parser_c = subparsers.add_parser('plot_hotmap')
    parser_c.add_argument('--model_name', type=str)
    parser_c.add_argument('--n_events', type=int)

    args = parser.parse_args()

    logs_dir = os.path.join(cfg.workspace, "logs",
                            pp_data.get_filename(__file__))
    pp_data.create_folder(logs_dir)
    logging = pp_data.create_logging(logs_dir, filemode='w')
    logging.info(os.path.abspath(__file__))
    logging.info(sys.argv)

    if args.mode == "train":
        train(args)
    elif args.mode == "recognize":
        recognize(args)
    elif args.mode == "get_stats":
        bgn_iter, fin_iter, interval = 2000, 3001, 200
        get_stats(args, bgn_iter, fin_iter, interval)
    elif args.mode == "separate":
        bgn_iter, fin_iter, interval = 2000, 3001, 200
        separate(args, bgn_iter, fin_iter, interval)
    elif args.mode == "evaluate_separation":
def train(args):
    num_classes = cfg.num_classes
    
    # Load training & testing data
    (tr_x, tr_y, tr_na_list) = load_hdf5_data(args.tr_hdf5_path, verbose=1)
    (te_x, te_y, te_na_list) = load_hdf5_data(args.te_hdf5_path, verbose=1)
    print("tr_x.shape: %s" % (tr_x.shape,))#removed this dec4 since its not helpful really

    # Scale data
    tr_x = do_scale(tr_x, args.scaler_path, verbose=1)
    te_x = do_scale(te_x, args.scaler_path, verbose=1)
    #print("delme dec 1, tr_x.shape", tr_x.shape)#output=51, 240, 64
    #print("delme dec 1, te_x.shape", te_x.shape)#:51, 240, 64
    # Build model
    (_, n_time, n_freq) = tr_x.shape    # (N, 240, 64)
    input_logmel = Input(shape=(n_time, n_freq), name='in_layer')   # (N, 240, 64)
    a1 = Reshape((n_time, n_freq, 1))(input_logmel) # (N, 240, 64, 1)
    
    a1 = block(a1)
    a1 = block(a1)
    a1 = MaxPooling2D(pool_size=(1, 2))(a1) # (N, 240, 32, 128)
    
    a1 = block(a1)
    a1 = block(a1)
    a1 = MaxPooling2D(pool_size=(1, 2))(a1) # (N, 240, 16, 128)
    
    a1 = block(a1)
    a1 = block(a1)
    a1 = MaxPooling2D(pool_size=(1, 2))(a1) # (N, 240, 8, 128)
    
    a1 = block(a1)
    a1 = block(a1)
    a1 = MaxPooling2D(pool_size=(1, 2))(a1) # (N, 240, 4, 128)
    
    a1 = Conv2D(256, (3, 3), padding="same", activation="relu", use_bias=True)(a1)
    a1 = MaxPooling2D(pool_size=(1, 4))(a1) # (N, 240, 1, 256)
    
    a1 = Reshape((n_time, 256))(a1) # (N, 240, 256); n_time == 240 here
    
    # Gated BGRU
    rnnout = Bidirectional(GRU(128, activation='linear', return_sequences=True))(a1)
    rnnout_gate = Bidirectional(GRU(128, activation='sigmoid', return_sequences=True))(a1)
    a2 = Multiply()([rnnout, rnnout_gate])
    
    # Attention
    cla = TimeDistributed(Dense(num_classes, activation='sigmoid'), name='localization_layer')(a2)
    att = TimeDistributed(Dense(num_classes, activation='softmax'))(a2)
    out = Lambda(outfunc, output_shape=(num_classes,))([cla, att])

    model = Model(input_logmel, out)
    model.summary()
    adam_optimizer = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    # Compile model
    model.compile(loss='binary_crossentropy',
                  optimizer=adam_optimizer,
                  metrics=['accuracy'])
    
    # Save model callback
    print("working here 1")
    filepath = os.path.join(args.out_model_dir, "{0}_{1}.hdf5".format(args.model_name, args.epochs)) 
    create_folder(os.path.dirname(filepath))
    save_model = ModelCheckpoint(filepath=filepath,
                                 monitor='val_acc',
                                 verbose=0,
                                 save_best_only=False,
                                 save_weights_only=False,
                                 mode='auto',
                                 period=1)  
    
    # Data generator (used only when use_generator is True below).
    # Train
    t_train = time.time()
    print("FINN Training started")#this is really just me seeing if this is where most of the time is spent
    use_generator = False
    if use_generator:
        gen = RatioDataGenerator(batch_size=args.batch_size, type='train')#batch size should be manipulated from 44

        model.fit_generator(generator=gen.generate([tr_x], [tr_y]), 
                        steps_per_epoch=args.steps_p_epoch,    # 100 iters is called an 'epoch'
                        epochs=args.epochs, #31             # Maximum 'epoch' to train
                        verbose=1, 
                        callbacks=[save_model], 
                        validation_data=(te_x, te_y))
    else:
        model.fit(x=tr_x, y=tr_y, batch_size=20, epochs=args.epochs, verbose=1, callbacks=[save_model], validation_split=0.05, shuffle=True, class_weight=None, sample_weight=None, initial_epoch=args.init_epoch, steps_per_epoch=None, validation_steps=None)
    model.save(os.path.join(args.out_model_dir, "final_model_{}_{}epochs.h5".format(args.model_name, args.epochs)))#am not sure if fit will save the final epoch.. pretty sure it does tho
    print("FINN Training finished, time taken: ", (time.time()-t_train))#this is really just me seeing if this is where most of the time is spent
Example #30
def train(args):
    """Train the neural network. Write out model every several iterations. 
    
    Args:
      workspace: str, path of workspace. 
      tr_snr: float, training SNR. 
      te_snr: float, testing SNR. 
      lr: float, learning rate. 
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    lr = args.lr
    iteration = args.iter

    # Load data.
    t1 = time.time()
    tr_hdf5_path = os.path.join(workspace, "packed_features", "spectrogram",
                                "train", "%ddb" % int(tr_snr), "data.h5")
    te_hdf5_path = os.path.join(workspace, "packed_features", "spectrogram",
                                "test", "%ddb" % int(te_snr), "data.h5")
    (tr_x1, tr_x2, tr_y1, tr_y2) = pp_data.load_hdf5(tr_hdf5_path)
    (te_x1, te_x2, te_y1, te_y2) = pp_data.load_hdf5(te_hdf5_path)
    print(tr_x1.shape, tr_y1.shape, tr_x2.shape, tr_y2.shape)
    print(te_x1.shape, te_y1.shape, te_x2.shape, te_y2.shape)
    print("Load data time: %s s" % (time.time() - t1, ))

    batch_size = 500
    print("%d iterations / epoch" % int(tr_x1.shape[0] / batch_size))

    # Scale data.
    if False:  # Flip to True to load a scaler and scale the data.
        t1 = time.time()
        scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                                   "train", "%ddb" % int(tr_snr), "scaler.p")
        scaler = pickle.load(open(scaler_path, 'rb'))
        tr_x1 = pp_data.scale_on_3d(tr_x1, scaler)
        tr_y1 = pp_data.scale_on_2d(tr_y1, scaler)
        te_x1 = pp_data.scale_on_3d(te_x1, scaler)
        te_y1 = pp_data.scale_on_2d(te_y1, scaler)
        tr_x2 = pp_data.scale_on_2d(tr_x2, scaler)
        tr_y2 = pp_data.scale_on_2d(tr_y2, scaler)
        te_x2 = pp_data.scale_on_2d(te_x2, scaler)
        te_y2 = pp_data.scale_on_2d(te_y2, scaler)
        print("Scale data time: %s s" % (time.time() - t1, ))

    # Debug plot.
    if False:
        plt.matshow(tr_x1[0:1000, 0, :].T,
                    origin='lower',
                    aspect='auto',
                    cmap='jet')
        plt.show()

    # Build model
    (_, n_concat, n_freq) = tr_x1.shape
    n_hid = 2048
    input_dim1 = (257 + 40 + 30) * 2
    input_dim2 = (257 + 40 + 30)
    out_dim1 = (257 + 40 + 30) * 2
    out_dim1_irm = 257 + 40 + 64
    out_dim2 = (257 + 40 + 30)
    out_dim2_irm = (257 + 40 + 64)

    # model = Sequential()
    # model.add(Flatten(input_shape=(n_concat, n_freq)))
    # model.add(Dense(n_hid, activation='relu'))
    # model.add(Dropout(0.2))
    # model.add(Dense(n_hid, activation='relu'))
    # model.add(Dropout(0.2))
    # model.add(Dense(n_hid, activation='relu'))
    # model.add(Dropout(0.2))
    # model.add(Dense(n_freq, activation='linear'))
    input1 = Input(shape=(n_concat, input_dim1), name='input1')
    layer = Flatten(name='flatten')(input1)
    layer = Dense(n_hid, activation='relu', name='dense1')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(n_hid, activation='relu', name='dense2')(layer)
    layer = Dropout(0.2)(layer)
    partial_out1 = Dense(out_dim1, name='1_out_linear')(layer)
    partial_out1_irm = Dense(out_dim1_irm,
                             name='1_out_irm',
                             activation='sigmoid')(layer)
    out1 = concatenate([partial_out1, partial_out1_irm], name='out1')
    input2 = Input(shape=(input_dim2, ), name='input2')
    layer = concatenate([input2, out1], name='merge')
    layer = Dense(n_hid, activation='relu', name='dense3')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(n_hid, activation='relu', name='dense4')(layer)
    layer = Dropout(0.2)(layer)
    partial_out2 = Dense(out_dim2, name='2_out_linear')(layer)
    partial_out2_irm = Dense(out_dim2_irm,
                             name='2_out_irm',
                             activation='sigmoid')(layer)
    out2 = concatenate([partial_out2, partial_out2_irm], name='out2')
    model = Model(inputs=[input1, input2], outputs=[out1, out2])

    model.summary()
    sys.stdout.flush()
    model.compile(loss='mean_absolute_error',
                  optimizer=Adam(lr=lr, epsilon=1e-03))
    # Data generator.
    tr_gen = DataGenerator(batch_size=batch_size, type='train')
    eval_te_gen = DataGenerator(batch_size=batch_size,
                                type='test',
                                te_max_iter=100)
    eval_tr_gen = DataGenerator(batch_size=batch_size,
                                type='test',
                                te_max_iter=100)

    # Directories for saving models and training stats
    model_dir = os.path.join(workspace, "models", "%ddb" % int(tr_snr))
    pp_data.create_folder(model_dir)

    stats_dir = os.path.join(workspace, "training_stats", "%ddb" % int(tr_snr))
    pp_data.create_folder(stats_dir)

    # Print loss before training.
    iter = 0
    tr_loss = eval(model, eval_tr_gen, tr_x1, tr_x2, tr_y1, tr_y2)
    te_loss = eval(model, eval_te_gen, te_x1, te_x2, te_y1, te_y2)
    print("Iteration: %d, tr_loss: %f, te_loss: %f" % (iter, tr_loss, te_loss))

    # Save out training stats.
    stat_dict = {
        'iter': iter,
        'tr_loss': tr_loss,
        'te_loss': te_loss,
    }
    stat_path = os.path.join(stats_dir, "%diters.p" % iter)
    cPickle.dump(stat_dict,
                 open(stat_path, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    # Train.
    t1 = time.time()
    for (batch_x, batch_y) in tr_gen.generate(xs=[tr_x1, tr_x2],
                                              ys=[tr_y1, tr_y2]):
        loss = model.train_on_batch(batch_x, batch_y)
        iter += 1

        # Validate and save training stats.
        if iter % 100 == 0:
            tr_loss = eval(model, eval_tr_gen, tr_x1, tr_x2, tr_y1, tr_y2)
            te_loss = eval(model, eval_te_gen, te_x1, te_x2, te_y1, te_y2)
            print("Iteration: %d, tr_loss: %f, te_loss: %f" %
                  (iter, tr_loss, te_loss))
            sys.stdout.flush()

            # Save out training stats.
            stat_dict = {
                'iter': iter,
                'tr_loss': tr_loss,
                'te_loss': te_loss,
            }
            stat_path = os.path.join(stats_dir, "%diters.p" % iter)
            cPickle.dump(stat_dict,
                         open(stat_path, 'wb'),
                         protocol=cPickle.HIGHEST_PROTOCOL)

        # Save model.
        if iter % (iteration // 20) == 0:  # Save 20 checkpoints over the run.
            model_path = os.path.join(model_dir, "md_%diters.h5" % iter)
            model.save(model_path)
            print("Saved model to %s" % model_path)

        if iter == iteration + 1:
            break

    print("Training time: %s s" % (time.time() - t1, ))
Example #31
def train(args):
    workspace = args.workspace
    audio_type = args.audio_type
    stack_num = args.stack_num
    hop_frames = args.hop_frames
    filename = args.filename
    cuda = args.use_cuda and torch.cuda.is_available()
    fft_size = cfg.fft_size
    print("cuda:", cuda)

    hdf5_file = os.path.join(args.workspace, "features",
                             "cmplx_spectrogram.h5")
    data_type = 'train'

    t1 = time.time()
    batch_size = 500
    shuffle = False
    load_raw = False
    data_loader = pp_data.DataLoader(hdf5_file,
                                     data_type,
                                     audio_type,
                                     stack_num,
                                     hop_frames,
                                     center_only=True,
                                     batch_size=batch_size,
                                     shuffle=shuffle,
                                     load_raw=load_raw)
    eval_tr_data_loader = pp_data.DataLoader(hdf5_file,
                                             'train',
                                             audio_type,
                                             stack_num,
                                             hop_frames,
                                             center_only=True,
                                             batch_size=batch_size,
                                             shuffle=shuffle,
                                             load_raw=load_raw)
    eval_te_data_loader = pp_data.DataLoader(hdf5_file,
                                             'test',
                                             audio_type,
                                             stack_num,
                                             hop_frames,
                                             center_only=True,
                                             batch_size=batch_size,
                                             shuffle=shuffle,
                                             load_raw=load_raw)
    print("Load time: %s" % (time.time() - t1))

    # Load scaler (mean, std); the project stores it as "scalar.p".
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda)
    std_ = move_data_to_gpu(std_, cuda)

    # Model
    n_freq = 257
    model = DNN(stack_num, n_freq)

    if cuda:
        model.cuda()

    dft = pp_data.DFT(fft_size, cuda)

    # Optimizer
    optimizer = optim.Adam(model.parameters(),
                           lr=1e-4,
                           betas=(0.9, 0.999),
                           eps=1e-08,
                           weight_decay=0)

    # Train
    iter = 0
    model_dir = os.path.join(workspace, "models", filename, audio_type)
    pp_data.create_folder(model_dir)
    t_train = time.time()

    for (batch_x, batch_y) in data_loader.generate():

        output = forward(model, batch_x, mean_, std_, dft, cuda)

        batch_y = np.abs(batch_y)
        batch_y = move_data_to_gpu(batch_y, cuda)
        # batch_y = transform(batch_y, type='torch')
        # batch_y = pp_data.scale(batch_y, mean_, std_)

        loss = mse_loss(output, batch_y)

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        iter += 1

        # Evaluate.
        if iter % 500 == 0:
            t_eval = time.time()
            tr_loss = evaluate(model, eval_tr_data_loader, mean_, std_, dft,
                               cuda)
            # tr_loss = -1
            te_loss = evaluate(model, eval_te_data_loader, mean_, std_, dft,
                               cuda)
            print("Iter: %d, train err: %f, test err: %f, train time: %s, eval time: %s" % \
                    (iter, tr_loss, te_loss, time.time() - t_train, time.time() - t_eval))
            t_train = time.time()

        # Save model.
        if iter % 5000 == 0:
            save_out_dict = {
                'iter': iter,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'tr_loss': loss,  # loss of the last training batch
            }
            save_out_path = os.path.join(model_dir, "md_%d_iters.tar" % iter)
            torch.save(save_out_dict, save_out_path)
            print("Save model to %s" % save_out_path)

        t1 = time.time()
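
move_data_to_gpu and forward are project helpers. A minimal move_data_to_gpu consistent with the old-style Variable usage in these snippets could be the following (an assumption matching the calls above, written for pre-0.4 PyTorch):

import torch
from torch.autograd import Variable

def move_data_to_gpu(x, cuda, volatile=False):
    """Wrap a numpy array as a (possibly CUDA) Variable (pre-0.4 PyTorch style)."""
    x = torch.Tensor(x)
    if cuda:
        x = x.cuda()
    return Variable(x, volatile=volatile)
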