Example #1
0
def main(wavfile, destfile, win_size, hop_size, nfbank, zoom, eps):
    """Extract pairwise filter-bank GCC-PHAT features and save them.

    The signal is transformed with an STFT, trimmed to the lowest frequency
    bins (up to ``_FREQ_MAX``), and fed to ``apkit.gcc_phat_fbanks``.  The
    results of all channel pairs ``(i, j)`` with ``i < j`` are stacked into
    one array, quantised to 16-bit integers and written with ``np.save``.

    Args:
        wavfile: path of the input audio, passed to ``apkit.load_wav``.
        destfile: destination passed to ``np.save``.
        win_size: STFT window size in samples.
        hop_size: STFT hop size in samples.
        nfbank: number of filter banks, passed to
            ``apkit.mel_freq_fbank_weight``.
        zoom: forwarded to ``apkit.gcc_phat_fbanks``
            (presumably the delay range -- confirm against apkit).
        eps: forwarded to ``apkit.gcc_phat_fbanks``.
    """
    # load signal
    fs, sig = apkit.load_wav(wavfile)
    tf = apkit.stft(sig, apkit.cola_hamming, win_size, hop_size)
    nch = tf.shape[0]

    # trim freq bins: the bin count is used as a slice bound, so it must be
    # an integer regardless of the interpreter's division semantics
    nfbin = int(_FREQ_MAX * win_size // fs)
    freq = np.fft.fftfreq(win_size)[:nfbin]
    tf = tf[:, :, :nfbin]

    # compute pairwise gcc on f-banks
    ecov = apkit.empirical_cov_mat(tf, fw=1, tw=1)
    fbw = apkit.mel_freq_fbank_weight(nfbank,
                                      freq,
                                      fs,
                                      fmax=_FREQ_MAX,
                                      fmin=_FREQ_MIN)
    fbcc = apkit.gcc_phat_fbanks(ecov, fbw, zoom, freq, eps=eps)

    # merge to a single numpy array, indexed by 'tpbd'
    #                                           (time, pair, bank, delay)
    # pairs are enumerated in the order (0, 1), (0, 2), ..., (nch-2, nch-1)
    feature = np.asarray(
        [fbcc[(i, j)] for i in range(nch) for j in range(i + 1, nch)])
    feature = np.moveaxis(feature, 2, 0)

    # and map [-1.0, 1.0] to 16-bit integer, to save storage space
    # NOTE(review): assumes the gcc values lie within [-1, 1] -- confirm
    dtype = np.int16
    vmax = np.iinfo(dtype).max
    feature = (feature * vmax).astype(dtype)

    np.save(destfile, feature)
Example #2
0
def _load_feature(datafile, extract_ft, win_size, hop_size, n_ctx, n_ahd):
    """Extract one feature per analysis window of an audio file.

    Windows of ``win_size`` samples are taken every ``hop_size`` samples.
    Each window is extended by ``_pad_context`` before being handed to
    ``extract_ft``.

    Args:
        datafile: path of the audio file, passed to ``apkit.load_wav``.
        extract_ft: callable ``(fs, signal) -> feature``.
        win_size: window size in samples.
        hop_size: hop between consecutive windows in samples.
        n_ctx: context length, in units of ``win_size / 8`` samples.
        n_ahd: look-ahead length, in units of ``win_size / 8`` samples.

    Returns:
        ``np.ndarray`` stacking the features of all windows.
    """
    fs, sig = apkit.load_wav(datafile)
    _, nsamples = sig.shape

    # Sample counts must be integers; floor division gives the same result
    # on Python 2 and 3.  They do not depend on the window, so compute once.
    ctx_size = n_ctx * win_size // 8
    ahd_size = n_ahd * win_size // 8

    return np.array([
        extract_ft(fs, _pad_context(sig, o, win_size, ctx_size, ahd_size))
        for o in range(0, nsamples - win_size + 1, hop_size)
    ])
Example #3
0
 def __getitem__(self, index):
     """Return ``(feature, ground_truth)`` for the sample at ``index``.

     One window of ``self.win_size`` samples is read from the wav file
     named by ``self.names[index]``, starting at frame
     ``self.fids[index]``, and passed to ``self.extract_ft``.  The ground
     truth is ``None`` when no ``prepare_gt`` callable is configured.
     """
     wav_path = os.path.join(self.datadir, self.names[index] + _WAV_SUFFIX)
     first_sample = self.fids[index] * self.hop_size
     fs, sig = apkit.load_wav(wav_path,
                              offset=first_sample,
                              nsamples=self.win_size)
     # the file must contain the complete window
     assert sig.shape[1] == self.win_size
     feat = self.extract_ft(fs, sig)
     gt = (None if self.prepare_gt is None else self.prepare_gt(
         self.gts[index]))
     return (feat, gt)
Example #4
0
 def __getitem__(self, index):
     """Return ``(feature, ground_truth)`` for the sample at ``index``.

     The analysis window is loaded together with ``self.ctx_size`` samples
     before it and ``self.ahd_size`` samples after it.  Whatever part of
     that range is missing from the file is filled with zeros, so the
     signal given to ``self.extract_ft`` always has
     ``win_size + ctx_size + ahd_size`` samples.  Integer-typed features
     are converted to float32 and scaled by the magnitude of the integer
     type's minimum value.
     """
     wav_path = os.path.join(self.datadir, self.names[index] + _WAV_SUFFIX)
     frame_start = self.fids[index] * self.hop_size
     load_from = max(0, frame_start - self.ctx_size)
     load_to = frame_start + self.win_size + self.ahd_size
     n_requested = load_to - load_from
     fs, sig = apkit.load_wav(wav_path,
                              offset=load_from,
                              nsamples=n_requested)

     # context missing before the start of the file
     pad_before = self.ctx_size - (frame_start - load_from)
     # samples requested but not returned by the loader
     pad_after = n_requested - sig.shape[1]
     if pad_before > 0 or pad_after > 0:
         nch = sig.shape[0]
         head = np.zeros((nch, pad_before))
         tail = np.zeros((nch, pad_after))
         sig = np.concatenate((head, sig, tail), axis=1)
     assert sig.shape[1] == self.win_size + self.ctx_size + self.ahd_size

     feat = self.extract_ft(fs, sig)
     src_dtype = feat.dtype
     feat = feat.astype('float32', copy=False)
     if np.issubdtype(src_dtype, np.integer):
         # normalize by the largest magnitude the integer type can hold
         feat /= abs(float(np.iinfo(src_dtype).min))
     return (feat, self.prepare_gt(self.gts[index]))
Example #5
0
def load_cpsd(afile, win_size, hop_size):
    """Load an audio file and return ``apkit.pairwise_cpsd`` of its STFT.

    Args:
        afile: path of the audio file, passed to ``apkit.load_wav``.
        win_size: STFT window size in samples.
        hop_size: STFT hop size in samples.
    """
    _, samples = apkit.load_wav(afile)
    spectrum = apkit.stft(samples, apkit.cola_hamming, win_size, hop_size)
    return apkit.pairwise_cpsd(spectrum)
Example #6
0
def load_frame(path, sid, fid, win_size, hop_size, wav_dir=_DEFAULT_WAV_DIR):
    """Load a single frame of ``win_size`` samples from a segment's wav file.

    Args:
        path: dataset root directory.
        sid: segment id; the file name is ``sid + _WAV_SUFFIX``.
        fid: frame index; the frame starts at sample ``fid * hop_size``.
        win_size: number of samples to read.
        hop_size: hop between consecutive frames in samples.
        wav_dir: sub-directory of ``path`` that holds the wav files.

    Returns:
        Whatever ``apkit.load_wav`` returns for the requested range.
    """
    wav_path = os.path.join(path, wav_dir, sid + _WAV_SUFFIX)
    first_sample = fid * hop_size
    return apkit.load_wav(wav_path, offset=first_sample, nsamples=win_size)
Example #7
0
def load_wav(path, sid, wav_dir=_DEFAULT_WAV_DIR):
    """Load the complete wav file of segment ``sid``.

    The file is looked up at ``path/wav_dir/<sid><_WAV_SUFFIX>`` and read
    with ``apkit.load_wav``, whose result is returned unchanged.
    """
    wav_path = os.path.join(path, wav_dir, sid + _WAV_SUFFIX)
    return apkit.load_wav(wav_path)
Example #8
0
 def _load_frame(self, sid, fid):
     """Load frame ``fid`` (``self.win_size`` samples) of segment ``sid``.

     The frame starts at sample ``fid * self.hop_size`` of the file
     ``<self.wav_dir>/<sid><_WAV_SUFFIX>``; the result of
     ``apkit.load_wav`` is returned unchanged.
     """
     wav_path = os.path.join(self.wav_dir, sid + _WAV_SUFFIX)
     first_sample = fid * self.hop_size
     return apkit.load_wav(wav_path,
                           offset=first_sample,
                           nsamples=self.win_size)
Example #9
0
def main(infile, outdir, afunc, win_size, hop_size, block_size, block_hop,
         min_sc):
    """Compute an angular spectrum per frame/block and save it to ``outdir``.

    Progress messages with the elapsed time are printed to stderr after
    each processing stage.

    Args:
        infile: path of the input audio, passed to ``apkit.load_wav``.
        outdir: existing directory receiving one text file per frame/block,
            named by the zero-padded block index.
        afunc: local angular spectrum function, called as
            ``afunc(ecov, delays, fbins)``; its result is indexed as
            ``phi[doa_index, block_index]``.
        win_size: STFT window size in samples.
        hop_size: STFT hop size in samples.
        block_size: block length passed to
            ``apkit.empirical_cov_mat_by_block``; ``None`` selects
            ``apkit.empirical_cov_mat`` instead.
        block_hop: block hop passed to ``apkit.empirical_cov_mat_by_block``.
        min_sc: threshold passed to ``apkit.local_maxima`` as ``th_phi``.
    """
    stime = time.time()

    def log(msg):
        # progress report: elapsed seconds since start, then the stage name
        print('%.3fs: %s' % (time.time() - stime, msg), file=sys.stderr)

    # load candidate DOAs
    pts = apkit.load_pts_on_sphere()
    pts = pts[pts[:, 2] > -0.05]  # use upper half of the sphere
    # NOTE: alternatively use only points on the horizontal plane
    # pts = apkit.load_pts_horizontal(360)
    log('load points (%d)' % len(pts))

    # compute neighbors (for peak finding)
    nlist = apkit.neighbor_list(pts, math.pi / 180.0 * 8.0)
    log('neighbor list')

    # load signal
    fs, sig = apkit.load_wav(infile)
    log('load signal')

    # compute delays (delay for each candidate DOA and each microphone)
    delays = apkit.compute_delay(_MICROPHONE_COORDINATES, pts, fs=fs)
    log('compute delays')

    # compute empirical covariance matrix
    tf = apkit.stft(sig, apkit.cola_hamming, win_size, hop_size)
    max_fbin = _MAX_FREQ * win_size // fs  # int
    assert max_fbin <= win_size // 2
    tf = tf[:, :, :max_fbin]  # keep only bins below _MAX_FREQ
    fbins = np.arange(max_fbin, dtype=float) / win_size
    if block_size is None:
        ecov = apkit.empirical_cov_mat(tf)
    else:
        ecov = apkit.empirical_cov_mat_by_block(tf, block_size, block_hop)
    _, _, nblock, nfbin = ecov.shape
    log('empirical cov matrix (nfbin=%d)' % nfbin)

    # local angular spectrum function
    phi = afunc(ecov, delays, fbins)
    log('compute phi')

    # find local maxima
    lmax = apkit.local_maxima(phi, nlist, th_phi=min_sc)
    log('find local maxima')

    # merge predictions that have similar azimuth predictions
    # NOTE: skip this step if the candidate DOAs are on the horizontal plane
    lmax = apkit.merge_lm_on_azimuth(phi, lmax, pts, math.pi / 180.0 * 5.0)
    log('refine local maxima')

    # save results
    # each file contains the predicted angular spectrum for each frame/block
    # each line has five tokens:
    #   (1) x coordinate of the candidate DOA
    #   (2) y coordinate of the candidate DOA
    #   (3) z coordinate of the candidate DOA
    #   (4) angular spectrum value
    #   (5) 1 if this is a local maximum, otherwise 0
    for t in range(nblock):
        # build the peak set once per block: O(1) membership per candidate
        # instead of scanning the list of maxima for every point
        peaks = set(lmax[t])
        with open(f'{outdir}/{t:06d}', 'w') as f:
            f.writelines(
                '%g %g %g %g %d\n' %
                (pt[0], pt[1], pt[2], phi[i, t], 1 if i in peaks else 0)
                for i, pt in enumerate(pts))
    log('save results')
Example #10
0
def load_ncov(path, win_size, hop_size):
    """Load an audio file and return ``apkit.cov_matrix`` of its STFT.

    Only the first ``_MAX_FREQ * win_size // fs`` frequency bins of the
    STFT are kept before the covariance matrix is computed.

    Args:
        path: path of the audio file, passed to ``apkit.load_wav``.
        win_size: STFT window size in samples.
        hop_size: STFT hop size in samples.
    """
    fs, sig = apkit.load_wav(path)
    n_kept_bins = _MAX_FREQ * win_size // fs  # bins below _MAX_FREQ
    spectrum = apkit.stft(sig, apkit.cola_hamming, win_size, hop_size)
    return apkit.cov_matrix(spectrum[:, :, :n_kept_bins])
Example #11
0
def _save_azimuths(path, azimuths):
    """Write azimuths to ``path``, or the placeholder -9 if there are none."""
    np.savetxt(path, azimuths if len(azimuths) > 0 else [-9], fmt='%.5g')


def _save_doa_azimuths(path, doas):
    """Write the azimuth of every DOA vector in ``doas`` (-9 if empty)."""
    azimuths = np.arctan2(doas[:, 1], doas[:, 0]) if len(doas) > 0 else []
    _save_azimuths(path, azimuths)


def _save_frame_ground_truth(odatadir, fid, sources, add_sns):
    """Save one frame's ground-truth azimuths; return the DOAs for drawing.

    ``sources`` is an iterable of ``(loc, stype, spkid)`` tuples.  The
    ``g`` file holds every source when ``add_sns`` is set and only the
    sources with ``stype == 1`` otherwise.  The ``f`` file, written only
    with ``add_sns``, holds the sources with ``stype != 1``.

    Returns:
        ``(gdoa, gndoa)``; ``gndoa`` is ``None`` when ``add_sns`` is false.
    """
    gdoa = np.asarray(
        [loc for loc, stype, _ in sources if add_sns or stype == 1])
    _save_doa_azimuths(os.path.join(odatadir, 'g%06d' % fid), gdoa)

    gndoa = None
    if add_sns:
        gndoa = np.asarray([loc for loc, stype, _ in sources if stype != 1])
        _save_doa_azimuths(os.path.join(odatadir, 'f%06d' % fid), gndoa)
    return gdoa, gndoa


def _save_frame_predictions(odatadir, fid, doa_azi, peaks, score, min_sns):
    """Save the azimuths of one frame's predicted peaks.

    Without ``score`` all peaks go to the ``p`` file.  With ``score``
    (per-DOA ``sns * phi`` of this frame) the peaks scoring above
    ``min_sns`` go to ``p`` and those scoring at most ``min_sns`` to ``q``.
    """
    p_path = os.path.join(odatadir, 'p%06d' % fid)
    if score is None:
        _save_azimuths(p_path, doa_azi[peaks])
        return
    _save_azimuths(
        p_path, doa_azi[[pid for pid in peaks if score[pid] > min_sns]])
    _save_azimuths(
        os.path.join(odatadir, 'q%06d' % fid),
        doa_azi[[pid for pid in peaks if score[pid] <= min_sns]])


def _draw_gt_circles(img, doas, color):
    """Project ``doas`` into the image and draw a circle at each of them."""
    if doas is None or len(doas) == 0:
        return
    # OpenCV >= 3 names the anti-aliased line type LINE_AA, OpenCV 2 CV_AA
    line_type = cv2.LINE_AA if hasattr(cv2, 'LINE_AA') else cv2.CV_AA
    n2d, _ = cv2.projectPoints(doas, _CAM_R, _CAM_T, _CAM_MATRIX, _CAM_DIST)
    n2d = np.asarray(n2d, dtype=int)
    n2d[:, 0, 0] += _X_EXPAND
    for p in n2d:
        try:
            cv2.circle(img, (int(p[0][0]), int(p[0][1])), 40, color, 5,
                       lineType=line_type)
        except OverflowError:
            # best effort: a projection too large for OpenCV's integer
            # coordinates is simply not drawn
            pass


def _write_gnuplot_script(script_file, doa_file, odatadir, ofigdir, nframe,
                          frame_period, method_name, add_sns, no_video):
    """Write the gnuplot script that renders one polar plot per frame."""
    lines = [
        'set terminal pngcairo size 240,320'
        if not no_video else 'set terminal pngcairo size 800,600',
        'set polar',
        'unset border',
        'unset margin',
        'set tics scale 0',
        'unset xtics',
        'unset ytics',
        'set rtics ("" 0, "" 0.25, "" 0.5, "" 0.75, "" 1.0)',
        'unset raxis',
        'set trange [-2*pi:2*pi]',
        'set grid polar pi/6',
        'set size square',
    ]
    if not no_video:
        lines.append('set key bm')
    else:
        lines += [
            'set key bot rm',
            'set xrange [-1.3:1.3]',
            'set yrange [-1.3:1.3]',
        ]
    lines += [
        'set label at 1.2,0 "right" center rotate by -90 tc rgb "gray"',
        'set label at -1.2,0 "left" center rotate by 90 tc rgb "gray"',
        'set label at 0,1.2 "front" center tc rgb "gray"',
        'set label at 0,-1.2 "rear" center tc rgb "gray"',
        'do for [ii=0:%d] {' % (nframe - 1),
        '  data=sprintf("< paste %s %s/h%%06d", ii)' % (doa_file, odatadir),
        '  gdata=sprintf("%s/g%%06d", ii)' % (odatadir),
        '  pdata=sprintf("%s/p%%06d", ii)' % (odatadir),
    ]
    if add_sns:
        lines += [
            '  fdata=sprintf("%s/f%%06d", ii)' % (odatadir),
            '  qdata=sprintf("%s/q%%06d", ii)' % (odatadir),
        ]
    lines.append('  set output sprintf("%s/t%%06d.png", ii)' % ofigdir)
    if no_video:
        lines.append(
            '  set title sprintf("Method %s; Time %%.2fs; Frame #%%06d", ii * %g, ii)'
            % (method_name, frame_period))
    if add_sns:
        colors = tuple(
            _bgrtuple2rgbstr(c) for c in [
                _COLOR_OUTPUT_SSL, _COLOR_OUTPUT_SNS, _COLOR_GT_SPEECH,
                _COLOR_GT_NOISE, _COLOR_PRED_SPEECH, _COLOR_PRED_NOISE
            ])
        lines.append(
            ('  plot 1.1 w l lw 2 lc rgb "gray" notitle,'
             ' data u ($1+0.5*pi):2 w l lc rgb "%s" lw 2 title "SSL Likelihood",'
             ' data u ($1+0.5*pi):3 w l lc rgb "%s" lw 1 title "SNS Likelihood",'
             ' gdata u ($1+0.5*pi):(1.05) pt 6 ps 3 lw 3 lc rgb "%s" title "GT. Speech",'
             ' fdata u ($1+0.5*pi):(1.05) pt 6 ps 3 lw 3 lc rgb "%s" title "GT. Noise",'
             ' pdata u ($1+0.5*pi):(1.05) pt 2 ps 3 lw 3 lc rgb "%s" title "Pred. Speech",'
             ' qdata u ($1+0.5*pi):(1.05) pt 2 ps 3 lw 3 lc rgb "%s" title "Pred. Noise"'
             ) % colors)
    else:
        lines.append(
            '  plot 1.1 w l lw 2 lc rgb "gray" notitle, data u ($1+0.5*pi):2 w l lc rgb "blue" lw 2 title "output value", gdata u ($1+0.5*pi):(1.05) pt 6 ps 3 lw 3 lc rgb "red" title "ground truth", pdata u ($1+0.5*pi):(1.05) pt 2 ps 3 lw 3 lc rgb "green" title "prediction"'
        )
    lines.append('}')
    with open(script_file, 'w') as s:
        s.write('\n'.join(lines) + '\n')


def main(root, outdir, method, sid, vmin, vmax, win_size, hop_size, min_sc,
         no_gt, add_sns, audio_onset, no_video, min_sns, method_name):
    """Export data, figures and a gnuplot script visualising SSL results.

    Creates ``outdir`` with the sub-directories ``data`` and ``fig``.  For
    every frame it writes the heat values (``h``), ground-truth azimuths
    (``g``, and ``f`` with ``add_sns``) and predicted azimuths (``p``, and
    ``q`` with ``add_sns``) to ``data``.  A file holding the single value
    -9 stands for "no entries".  Unless ``no_video`` is set, an annotated
    video frame is also rendered into ``fig``.  Finally the shell commands
    that turn the output into a video are printed to stdout.

    Args:
        root: dataset root containing ``results``, ``data`` and
            ``video_proc``.
        outdir: output directory; must not exist yet.
        method: name of the result sub-directory under ``root/results``.
        sid: segment id.
        vmin: unused by this function.
        vmax: unused by this function.
        win_size: analysis window size in samples.
        hop_size: hop between frames in samples.
        min_sc: threshold passed to ``apkit.local_maxima`` as ``th_phi``.
        no_gt: if true, no ground truth is loaded.
        add_sns: if true, also load and visualise the second (SNS) output.
        audio_onset: offset in seconds added to the audio time when
            looking up the video frame.
        no_video: if true, skip rendering of video frames.
        min_sns: threshold on ``sns * phi`` separating speech predictions
            from noise predictions.
        method_name: method label shown in the plots.
    """
    # create output directory and sub-dir
    os.mkdir(outdir)
    odatadir = os.path.join(outdir, 'data')
    ofigdir = os.path.join(outdir, 'fig')
    os.mkdir(odatadir)
    os.mkdir(ofigdir)

    # load result
    rdir = os.path.join(root, 'results', method)

    # DOAs, also saved sorted by azimuth (perm is the sorting permutation)
    doa = np.load(os.path.join(rdir, 'doas.npy'))
    doa_azi = np.arctan2(doa[:, 1], doa[:, 0])
    index_doa_azi_sort = sorted(enumerate(doa_azi), key=lambda x: x[1])
    perm = [index for index, _ in index_doa_azi_sort]
    doa_azi_sort = [a for _, a in index_doa_azi_sort]
    doa_file = os.path.join(odatadir, 'doa')
    np.savetxt(doa_file, doa_azi_sort, fmt='%.5g')
    nlist = apkit.neighbor_list(doa, math.pi / 180 * 5)

    # project doa to image x-coordinate
    if not no_video:
        a2d, _ = cv2.projectPoints(doa * 2, _CAM_R, _CAM_T, _CAM_MATRIX,
                                   _CAM_DIST)
        # None marks DOAs with dx <= 0 or an extreme projected x coordinate
        a2dx = [
            int(x) + _X_EXPAND if abs(x) < 100000 and dx > 0.0 else None
            for ((x, y), ), (dx, dy, dz) in zip(a2d, doa)
        ]

    # heat values
    if add_sns:
        phi, sns = evaluation.load_2tasks_heat(rdir, sid)
    else:
        phi = np.load(os.path.join(rdir, sid + '.npy'))
        sns = None

    ndoa, nframe = phi.shape
    assert ndoa == len(doa)
    heat_data = np.stack([phi, sns], axis=-1) if add_sns else phi
    for fid in range(nframe):
        np.savetxt(os.path.join(odatadir, 'h%06d' % fid),
                   heat_data[:, fid][perm],
                   fmt='%.5g')

    # find local maxima
    lmax = apkit.local_maxima(phi, nlist, th_phi=min_sc)

    # load ground truth
    fid2gt = {}
    if not no_gt:
        # pickle data is binary, hence 'rb'.
        # NOTE(review): unpickling can execute arbitrary code; only use
        # trusted ground-truth files here.
        with open(
                os.path.join(root, 'data',
                             _GT_PATTERN % (sid, win_size, hop_size)),
                'rb') as f:
            gt = pickle.load(f)
        fid2gt = dict(gt)

    # load audio (only to see the frame rate)
    wav_file = os.path.join(root, 'data', '%s.wav' % sid)
    fs, _ = apkit.load_wav(wav_file)
    fr = Fraction(fs, hop_size)

    # load video
    if not no_video:
        vdir = os.path.join(root, 'video_proc', sid)
        stamps = np.loadtxt(os.path.join(vdir, 'stamps'))

    for fid in range(nframe):
        peaks = lmax[fid]
        score = sns[:, fid] * phi[:, fid] if add_sns else None

        # data files are written whether or not video frames are rendered
        gdoa, gndoa = _save_frame_ground_truth(odatadir, fid,
                                               fid2gt.get(fid, ()), add_sns)
        _save_frame_predictions(odatadir, fid, doa_azi, peaks, score, min_sns)

        if no_video:
            continue

        # time of the window centre; floor division keeps it on a sample
        t = float(fid * hop_size + win_size // 2) / fs + audio_onset
        vimg = cv2.imread(os.path.join(vdir, 'r%06d.png' % qvfid(t, stamps)))
        h, w, _ = vimg.shape
        # white canvas: video frame on top, heat map strip below, and a
        # margin of _X_EXPAND pixels on both sides
        img = np.ones((h + _HMAP_HEIGHT, w + 2 * _X_EXPAND, 3),
                      dtype=vimg.dtype) * 255
        img[:h, _X_EXPAND:-_X_EXPAND] = vimg

        img = plot_grid(img, a2dx)

        _draw_gt_circles(img, gdoa, _COLOR_GT_SPEECH)
        if add_sns:
            _draw_gt_circles(img, gndoa, _COLOR_GT_NOISE)

        # hmap in fov
        if add_sns:
            plot_hmap_fov(img[h:], a2dx, phi[:, fid], sns[:, fid])
        else:
            plot_hmap_fov(img[h:], a2dx, phi[:, fid])

        # plot prediction as semi-transparent vertical bars
        ol = img.copy()
        for pid in peaks:
            px = a2dx[pid]
            if px is None:
                continue
            if add_sns and score[pid] > min_sns:
                pcolor = _COLOR_PRED_SPEECH
            else:
                pcolor = _COLOR_PRED_NOISE
            # the bar spans the full image height, i.e. shape[0] rows
            cv2.rectangle(ol, (px - 10, 0), (px + 10, img.shape[0]), pcolor,
                          -1)
        img = 0.6 * img + 0.4 * ol

        # plot legend
        plot_legend(img, t, sid, method_name, add_sns)

        cv2.imwrite(os.path.join(ofigdir, 'v%06d.png' % fid), img)

    script_file = os.path.join(outdir, 'plot.gp')
    _write_gnuplot_script(script_file, doa_file, odatadir, ofigdir, nframe,
                          1.0 * hop_size / fs, method_name, add_sns, no_video)

    # tell the user how to turn the generated files into a video
    audio_temp = os.path.join(outdir, '%s.wav' % sid)
    print('data and script generated, now run')
    print('  gnuplot %s && \\' % script_file)
    if not no_video:
        print(
            '  for x in %s/v*.png; do z=${x##*/}; y=${x%%/*}/${z/v/t}; o=${x%%/*}/${z/v/m}; convert -page +0+0 ${x} -page +880+100 ${y} -flatten ${o}; done && \\'
            % ofigdir)
    print((
        '  gst-launch-1.0 filesrc location="%s" ! decodebin ! audioresample ! "audio/x-raw,rate=16000" ! deinterleave name=d'
        '    interleave name=i ! audioconvert ! wavenc ! filesink location="%s"'
        '    d.src_0 ! queue ! i.sink_0'
        '    d.src_1 ! queue ! i.sink_1 && \\') % (wav_file, audio_temp))
    print((
        '  gst-launch-1.0 multifilesrc location="%s/%s%%06d.png" '
        '    caps="image/png,framerate=%d/%d,pixel-aspect-ratio=1/1" '
        '    ! decodebin ! videorate ! videoconvert ! theoraenc ! oggmux name=mux ! filesink location=%s/%s.ogv '
        '    filesrc location="%s" ! decodebin ! audioconvert ! vorbisenc ! mux. '
    ) % (ofigdir, 'm' if not no_video else 't', fr.numerator, fr.denominator,
         outdir, sid, audio_temp))