def main(wavfile, destfile, win_size, hop_size, nfbank, zoom, eps):
    """Extract pairwise filter-bank GCC-PHAT features from a wav file.

    The signal is transformed with an STFT, truncated to the lowest
    frequency bins (up to ``_FREQ_MAX``), and the pairwise GCC-PHAT on
    filter banks is computed for every microphone pair ``(i, j)`` with
    ``i < j``.  The result is quantised to 16-bit integers and written
    with ``np.save``.

    Args:
        wavfile: path of the input audio file (read by ``apkit.load_wav``).
        destfile: destination passed to ``np.save``.
        win_size: STFT window size in samples.
        hop_size: STFT hop size in samples.
        nfbank: number of filter banks.
        zoom: forwarded to ``apkit.gcc_phat_fbanks``.
        eps: forwarded to ``apkit.gcc_phat_fbanks`` as ``eps``.
    """
    # load signal
    fs, sig = apkit.load_wav(wavfile)
    tf = apkit.stft(sig, apkit.cola_hamming, win_size, hop_size)
    nch, _, _ = tf.shape

    # trim freq bins (original note: 0-8kHz)
    # Floor division + int() keeps the bin count a valid slice bound on
    # both Python 2 and Python 3 (true division would yield a float).
    nfbin = int(_FREQ_MAX * win_size // fs)
    freq = np.fft.fftfreq(win_size)[:nfbin]
    tf = tf[:, :, :nfbin]

    # compute pairwise gcc on f-banks
    ecov = apkit.empirical_cov_mat(tf, fw=1, tw=1)
    fbw = apkit.mel_freq_fbank_weight(nfbank, freq, fs,
                                      fmax=_FREQ_MAX, fmin=_FREQ_MIN)
    fbcc = apkit.gcc_phat_fbanks(ecov, fbw, zoom, freq, eps=eps)

    # merge to a single numpy array, indexed by 'tpbd'
    # (time, pair, bank, delay)
    # Pairs are enumerated with i < j, in the same order as before.
    feature = np.asarray([
        fbcc[(i, j)] for i in range(nch) for j in range(i + 1, nch)
    ])
    feature = np.moveaxis(feature, 2, 0)

    # and map [-1.0, 1.0] to 16-bit integer, to save storage space
    dtype = np.int16
    vmax = np.iinfo(dtype).max
    feature = (feature * vmax).astype(dtype)

    np.save(destfile, feature)
def _load_feature(datafile, extract_ft, win_size, hop_size, n_ctx, n_ahd):
    """Compute one feature per analysis frame of an audio file.

    Frames start every ``hop_size`` samples and only offsets where a full
    ``win_size`` window fits are used.  Each frame is extended by
    ``_pad_context`` before being handed to ``extract_ft(fs, signal)``.

    Returns:
        ``np.array`` stacking the per-frame outputs of ``extract_ft``.
    """
    fs, sig = apkit.load_wav(datafile)
    _, nsamples = sig.shape

    def _frame_feature(onset):
        # Context / look-ahead lengths are given in units of win_size / 8.
        # NOTE(review): under Python 3 ``/`` yields floats here -- confirm
        # that _pad_context accepts them or that the file targets Python 2.
        padded = _pad_context(sig, onset, win_size,
                              n_ctx * win_size / 8,
                              n_ahd * win_size / 8)
        return extract_ft(fs, padded)

    onsets = range(0, nsamples - win_size + 1, hop_size)
    return np.array([_frame_feature(onset) for onset in onsets])
def __getitem__(self, index):
    """Return ``(feature, ground_truth)`` for the frame at ``index``.

    Exactly ``self.win_size`` samples are read starting at
    ``fid * self.hop_size``; the ground truth is ``None`` when no
    ``prepare_gt`` callable is configured.
    """
    wav_path = os.path.join(self.datadir, self.names[index] + _WAV_SUFFIX)
    start = self.fids[index] * self.hop_size
    fs, sig = apkit.load_wav(wav_path, offset=start, nsamples=self.win_size)
    # the loader must return a complete frame
    assert sig.shape[1] == self.win_size

    feat = self.extract_ft(fs, sig)
    gt = None if self.prepare_gt is None else self.prepare_gt(self.gts[index])
    return (feat, gt)
def __getitem__(self, index):
    """Return ``(feature, ground_truth)`` for a frame with surrounding context.

    The signal read spans ``ctx_size`` samples before the frame and
    ``ahd_size`` samples after it.  Where the file does not cover that
    span (before sample 0, or when fewer samples than requested come
    back), the missing part is filled with zeros.  Integer-typed features
    are converted to float32 and divided by the magnitude of the integer
    type's minimum value.
    """
    frame_begin = self.fids[index] * self.hop_size
    span_begin = max(0, frame_begin - self.ctx_size)
    span_end = frame_begin + self.win_size + self.ahd_size
    wav_path = os.path.join(self.datadir, self.names[index] + _WAV_SUFFIX)
    fs, sig = apkit.load_wav(wav_path, offset=span_begin,
                             nsamples=(span_end - span_begin))

    # amount of zero padding needed on each side
    pad_before = self.ctx_size - (frame_begin - span_begin)
    pad_after = (span_end - span_begin) - sig.shape[1]
    if pad_before > 0 or pad_after > 0:
        nch = sig.shape[0]
        head = np.zeros((nch, pad_before))
        tail = np.zeros((nch, pad_after))
        sig = np.concatenate((head, sig, tail), axis=1)
    assert sig.shape[1] == self.win_size + self.ctx_size + self.ahd_size

    feat = self.extract_ft(fs, sig)
    src_dtype = feat.dtype
    feat = feat.astype('float32', copy=False)
    if np.issubdtype(src_dtype, np.integer):
        # normalize
        feat /= abs(float(np.iinfo(src_dtype).min))

    return (feat, self.prepare_gt(self.gts[index]))
def load_cpsd(afile, win_size, hop_size):
    """Load an audio file and return ``apkit.pairwise_cpsd`` of its STFT.

    The STFT uses ``apkit.cola_hamming`` with the given window and hop
    sizes; the sample rate returned by the loader is not used.
    """
    _, samples = apkit.load_wav(afile)
    spectrum = apkit.stft(samples, apkit.cola_hamming, win_size, hop_size)
    return apkit.pairwise_cpsd(spectrum)
def load_frame(path, sid, fid, win_size, hop_size, wav_dir=_DEFAULT_WAV_DIR):
    """Read one frame of ``win_size`` samples from a segment's wav file.

    The file is ``<path>/<wav_dir>/<sid><_WAV_SUFFIX>`` and the frame
    starts at sample ``fid * hop_size``.  Returns whatever
    ``apkit.load_wav`` returns.
    """
    wav_path = os.path.join(path, wav_dir, sid + _WAV_SUFFIX)
    start = fid * hop_size
    return apkit.load_wav(wav_path, offset=start, nsamples=win_size)
def load_wav(path, sid, wav_dir=_DEFAULT_WAV_DIR):
    """Read the whole wav file of segment ``sid``.

    The file is ``<path>/<wav_dir>/<sid><_WAV_SUFFIX>``.  Returns whatever
    ``apkit.load_wav`` returns.
    """
    wav_path = os.path.join(path, wav_dir, sid + _WAV_SUFFIX)
    return apkit.load_wav(wav_path)
def _load_frame(self, sid, fid):
    """Read frame ``fid`` (``self.win_size`` samples) of segment ``sid``.

    The frame starts at sample ``fid * self.hop_size`` of the file
    ``<self.wav_dir>/<sid><_WAV_SUFFIX>``.
    """
    wav_path = os.path.join(self.wav_dir, sid + _WAV_SUFFIX)
    start = fid * self.hop_size
    return apkit.load_wav(wav_path, offset=start, nsamples=self.win_size)
def main(infile, outdir, afunc, win_size, hop_size, block_size, block_hop,
         min_sc):
    """Compute an angular spectrum per frame/block and dump it to text files.

    One file ``<outdir>/<block index, 6 digits>`` is written per
    frame/block.  Each line describes one candidate DOA with five tokens:
    x, y, z coordinates, the angular spectrum value, and ``1`` if the
    candidate is a local maximum (``0`` otherwise).  Progress messages
    with elapsed time go to stderr.

    Args:
        infile: path of the input audio file.
        outdir: existing directory receiving the per-block files.
        afunc: angular spectrum function, called as
            ``afunc(ecov, delays, fbins)``.
        win_size: STFT window size in samples.
        hop_size: STFT hop size in samples.
        block_size: block length forwarded to
            ``apkit.empirical_cov_mat_by_block``; ``None`` selects
            ``apkit.empirical_cov_mat`` instead.
        block_hop: block hop forwarded together with ``block_size``.
        min_sc: threshold passed to ``apkit.local_maxima`` as ``th_phi``.
    """
    stime = time.time()

    def _progress(stage):
        # Elapsed-time trace on stderr, formatted exactly as before.
        print('%.3fs: %s' % (time.time() - stime, stage), file=sys.stderr)

    # load candidate DOAs
    pts = apkit.load_pts_on_sphere()
    pts = pts[pts[:, 2] > -0.05]  # use upper half of the sphere
    # NOTE: alternatively use only points on the horizontal plane
    # pts = apkit.load_pts_horizontal(360)
    _progress('load points (%d)' % len(pts))

    # compute neighbors (for peak finding)
    nlist = apkit.neighbor_list(pts, math.pi / 180.0 * 8.0)
    _progress('neighbor list')

    # load signal
    fs, sig = apkit.load_wav(infile)
    _progress('load signal')

    # compute delays (delay for each candidate DOA and each microphone)
    delays = apkit.compute_delay(_MICROPHONE_COORDINATES, pts, fs=fs)
    _progress('compute delays')

    # compute empirical covariance matrix
    tf = apkit.stft(sig, apkit.cola_hamming, win_size, hop_size)
    max_fbin = _MAX_FREQ * win_size // fs  # int
    assert max_fbin <= win_size // 2
    tf = tf[:, :, :max_fbin]  # 0-8kHz
    fbins = np.arange(max_fbin, dtype=float) / win_size
    if block_size is None:
        ecov = apkit.empirical_cov_mat(tf)
    else:
        ecov = apkit.empirical_cov_mat_by_block(tf, block_size, block_hop)
    _, _, nblock, nfbin = ecov.shape
    _progress('empirical cov matrix (nfbin=%d)' % nfbin)

    # local angular spectrum function
    phi = afunc(ecov, delays, fbins)
    _progress('compute phi')

    # find local maxima
    lmax = apkit.local_maxima(phi, nlist, th_phi=min_sc)
    _progress('find local maxima')

    # merge predictions that have similar azimuth predictions
    # NOTE: skip this step if the candidate DOAs are on the horizontal plane
    lmax = apkit.merge_lm_on_azimuth(phi, lmax, pts, math.pi / 180.0 * 5.0)
    _progress('refine local maxima')

    # save results
    # each file contains the predicted angular spectrum for each frame/block
    # each line has five tokens:
    #   (1) x coordinate of the candidate DOA
    #   (2) y coordinate of the candidate DOA
    #   (3) z coordinate of the candidate DOA
    #   (4) angular spectrum value
    #   (5) 1 if this is a local maximum, otherwise 0
    for t in range(nblock):
        # Build the membership set once per block instead of scanning
        # lmax[t] for every candidate point.
        peaks = set(lmax[t])
        with open(f'{outdir}/{t:06d}', 'w') as f:
            for i, pt in enumerate(pts):
                print('%g %g %g %g %d' % (pt[0], pt[1], pt[2], phi[i, t],
                                          1 if i in peaks else 0),
                      file=f)
    _progress('save results')
def load_ncov(path, win_size, hop_size):
    """Return ``apkit.cov_matrix`` of the low-frequency STFT of a wav file.

    Only the first ``_MAX_FREQ * win_size // fs`` frequency bins of the
    STFT are kept (original note: 0-8kHz).
    """
    fs, sig = apkit.load_wav(path)
    spectrum = apkit.stft(sig, apkit.cola_hamming, win_size, hop_size)
    n_keep = _MAX_FREQ * win_size // fs  # 0-8kHz
    return apkit.cov_matrix(spectrum[:, :, :n_keep])
def main(root, outdir, method, sid, vmin, vmax, win_size, hop_size, min_sc,
         no_gt, add_sns, audio_onset, no_video, min_sns, method_name):
    """Export per-frame result data, optional video overlays and a gnuplot script.

    Written for Python 2 (``xrange``, ``print >>`` statements).

    Files produced under ``outdir`` (which must not exist yet, since it is
    created with ``os.mkdir``):

    * ``data/doa``      -- candidate azimuths, sorted ascending
    * ``data/hNNNNNN``  -- heat values of frame N, in sorted-azimuth order
    * ``data/gNNNNNN``  -- ground-truth azimuths (``-9`` when there is none)
    * ``data/fNNNNNN``  -- second ground-truth file (sources with
      ``stype != 1``), only with ``add_sns``
    * ``data/pNNNNNN``  -- predicted azimuths (``-9`` when there is none)
    * ``data/qNNNNNN``  -- predictions at or below ``min_sns``, only with
      ``add_sns``
    * ``fig/vNNNNNN.png`` -- annotated video frame, unless ``no_video``
    * ``plot.gp``       -- gnuplot script drawing one polar plot per frame

    Finally the shell commands that turn these into a video are printed
    to stdout; they are not executed here.

    Args:
        root: dataset root; results are read from ``root/results/<method>``
            and audio / ground truth from ``root/data``.
        outdir: output directory to create.
        method: name of the result sub-directory.
        sid: segment id (base name of the result, wav and video data).
        vmin, vmax: not referenced in this function body.
        win_size, hop_size: frame window / hop in samples; used for the
            ground-truth file name and for frame timing.
        min_sc: threshold passed to ``apkit.local_maxima`` as ``th_phi``.
        no_gt: if true, no ground truth is loaded.
        add_sns: if true, a second heat map ``sns`` is loaded and
            predictions are split by ``sns * phi > min_sns``.
        audio_onset: offset added to the frame time before looking up the
            video frame.
        no_video: if true, no video frames are read or written.
        min_sns: threshold on ``sns * phi`` (see ``add_sns``).
        method_name: label used in the legend / plot title.
    """
    # create output directory and sub-dir
    os.mkdir(outdir)
    odatadir = os.path.join(outdir, 'data')
    ofigdir = os.path.join(outdir, 'fig')
    os.mkdir(odatadir)
    os.mkdir(ofigdir)

    # load result
    rdir = os.path.join(root, 'results', method)

    # DOAs
    doa = np.load(os.path.join(rdir, 'doas.npy'))
    doa_azi = np.arctan2(doa[:, 1], doa[:, 0])
    # sort candidates by azimuth; perm maps sorted position -> original index
    index_doa_azi_sort = sorted(enumerate(doa_azi), key=lambda x: x[1])
    perm = [index for index, _ in index_doa_azi_sort]
    doa_azi_sort = [a for _, a in index_doa_azi_sort]
    doa_file = os.path.join(odatadir, 'doa')
    np.savetxt(doa_file, doa_azi_sort, fmt='%.5g')
    nlist = apkit.neighbor_list(doa, math.pi / 180 * 5)

    # project doa to image x-coordinate
    if not no_video:
        a2d, _ = cv2.projectPoints(doa * 2, _CAM_R, _CAM_T, _CAM_MATRIX,
                                   _CAM_DIST)
        # None marks candidates that are not drawn: projected |x| too large
        # or first DOA coordinate not positive
        a2dx = [
            int(x) + _X_EXPAND if abs(x) < 100000 and dx > 0.0 else None
            for ((x, y), ), (dx, dy, dz) in zip(a2d, doa)
        ]

    # heat values
    if not add_sns:
        phi = np.load(os.path.join(rdir, sid + '.npy'))
    else:
        phi, sns = evaluation.load_2tasks_heat(rdir, sid)
    ndoa, nframe = phi.shape
    assert ndoa == len(doa)
    # with add_sns each row of an 'h' file holds two columns: phi and sns
    heat_data = np.stack([phi, sns], axis=-1) if add_sns else phi
    for t in xrange(nframe):
        np.savetxt(os.path.join(odatadir, 'h%06d' % t),
                   heat_data[:, t][perm],
                   fmt='%.5g')

    # find local maxima
    lmax = apkit.local_maxima(phi, nlist, th_phi=min_sc)

    # load ground truth
    # NOTE(review): pickle.load runs arbitrary code from the file; only use
    # on trusted data.
    if not no_gt:
        with open(
                os.path.join(root, 'data',
                             _GT_PATTERN % (sid, win_size, hop_size)),
                'r') as f:
            gt = pickle.load(f)
        fid2gt = dict(gt)

    # load audio (only to see the frame rate)
    wav_file = os.path.join(root, 'data', '%s.wav' % sid)
    fs, _ = apkit.load_wav(wav_file)
    fr = Fraction(fs, hop_size)

    # load video
    if not no_video:
        vdir = os.path.join(root, 'video_proc', sid)
        stamps = np.loadtxt(os.path.join(vdir, 'stamps'))

        for fid in xrange(nframe):
            # time of the frame centre, shifted by the audio onset
            t = float(fid * hop_size + win_size / 2) / fs + audio_onset
            vimg = cv2.imread(
                os.path.join(vdir, 'r%06d.png' % qvfid(t, stamps)))
            h, w, _ = vimg.shape
            # white canvas: video frame plus side margins and a strip of
            # _HMAP_HEIGHT rows below it
            img = np.ones((h + _HMAP_HEIGHT, w + 2 * _X_EXPAND, 3),
                          dtype=vimg.dtype) * 255
            img[:h, _X_EXPAND:-_X_EXPAND] = vimg
            img = plot_grid(img, a2dx)

            if not no_gt and fid in fid2gt and len(fid2gt[fid]) > 0:
                # NOTE(review): with add_sns, gdoa holds every source (no
                # stype filter) while gndoa below holds stype != 1 --
                # confirm this is intended.
                if add_sns:
                    gdoa = np.asarray(
                        [loc for loc, stype, spkid in fid2gt[fid]])
                else:
                    gdoa = np.asarray([
                        loc for loc, stype, spkid in fid2gt[fid] if stype == 1
                    ])
                if len(gdoa) > 0:
                    n2d, _ = cv2.projectPoints(gdoa, _CAM_R, _CAM_T,
                                               _CAM_MATRIX, _CAM_DIST)
                    n2d = np.asarray(n2d, dtype=int)
                    n2d[:, 0, 0] += _X_EXPAND
                    for p in n2d:
                        try:
                            cv2.circle(img,
                                       tuple(p[0]),
                                       40,
                                       _COLOR_GT_SPEECH,
                                       5,
                                       lineType=cv2.CV_AA)
                        except OverflowError:
                            # circle is skipped when drawing overflows
                            pass
                    # save ground truth data
                    gdoa_azi = np.arctan2(gdoa[:, 1], gdoa[:, 0])
                    np.savetxt(os.path.join(odatadir, 'g%06d' % fid),
                               gdoa_azi,
                               fmt='%.5g')
                else:
                    # -9 is the placeholder for "no entry"
                    np.savetxt(os.path.join(odatadir, 'g%06d' % fid), [-9],
                               fmt='%.5g')
                if add_sns:
                    gndoa = np.asarray([
                        loc for loc, stype, spkid in fid2gt[fid] if stype != 1
                    ])
                    if len(gndoa) > 0:
                        n2d, _ = cv2.projectPoints(gndoa, _CAM_R, _CAM_T,
                                                   _CAM_MATRIX, _CAM_DIST)
                        n2d = np.asarray(n2d, dtype=int)
                        n2d[:, 0, 0] += _X_EXPAND
                        for p in n2d:
                            try:
                                cv2.circle(img,
                                           tuple(p[0]),
                                           40,
                                           _COLOR_GT_NOISE,
                                           5,
                                           lineType=cv2.CV_AA)
                            except OverflowError:
                                pass
                        # save ground truth data
                        gndoa_azi = np.arctan2(gndoa[:, 1], gndoa[:, 0])
                        np.savetxt(os.path.join(odatadir, 'f%06d' % fid),
                                   gndoa_azi,
                                   fmt='%.5g')
                    else:
                        np.savetxt(os.path.join(odatadir, 'f%06d' % fid),
                                   [-9],
                                   fmt='%.5g')
            else:
                # no ground truth for this frame: write placeholders
                np.savetxt(os.path.join(odatadir, 'g%06d' % fid), [-9],
                           fmt='%.5g')
                if add_sns:
                    np.savetxt(os.path.join(odatadir, 'f%06d' % fid), [-9],
                               fmt='%.5g')

            # hmap in fov
            if add_sns:
                plot_hmap_fov(img[h:], a2dx, phi[:, fid], sns[:, fid])
            else:
                plot_hmap_fov(img[h:], a2dx, phi[:, fid])

            # plot prediction
            # bars are drawn on a copy and blended 60/40 with the image
            ol = img.copy()
            for pid in lmax[fid]:
                px = a2dx[pid]
                if px is not None:
                    if add_sns and sns[pid, fid] * phi[pid, fid] > min_sns:
                        pcolor = _COLOR_PRED_SPEECH
                    else:
                        pcolor = _COLOR_PRED_NOISE
                    # NOTE(review): img.shape[1] is the canvas width but is
                    # used as the y extent here -- confirm whether
                    # img.shape[0] was intended.
                    cv2.rectangle(ol, (px - 10, 0), (px + 10, img.shape[1]),
                                  pcolor, -1)
            img = 0.6 * img + 0.4 * ol

            # save prediction
            if add_sns:
                pdoa_azi = doa_azi[[
                    pid for pid in lmax[fid]
                    if sns[pid, fid] * phi[pid, fid] > min_sns
                ]]
                qdoa_azi = doa_azi[[
                    pid for pid in lmax[fid]
                    if sns[pid, fid] * phi[pid, fid] <= min_sns
                ]]
            else:
                pdoa_azi = doa_azi[lmax[fid]]
            if len(pdoa_azi) > 0:
                np.savetxt(os.path.join(odatadir, 'p%06d' % fid),
                           pdoa_azi,
                           fmt='%.5g')
            else:
                np.savetxt(os.path.join(odatadir, 'p%06d' % fid), [-9],
                           fmt='%.5g')
            if add_sns:
                if len(qdoa_azi) > 0:
                    np.savetxt(os.path.join(odatadir, 'q%06d' % fid),
                               qdoa_azi,
                               fmt='%.5g')
                else:
                    np.savetxt(os.path.join(odatadir, 'q%06d' % fid), [-9],
                               fmt='%.5g')

            # plot legend
            plot_legend(img, t, sid, method_name, add_sns)
            cv2.imwrite(os.path.join(ofigdir, 'v%06d.png' % fid), img)
    else:
        # no video
        # same data files as in the video branch, without any drawing
        for fid in xrange(nframe):
            if not no_gt and fid in fid2gt and len(fid2gt[fid]) > 0:
                if add_sns:
                    gdoa = np.asarray(
                        [loc for loc, stype, spkid in fid2gt[fid]])
                else:
                    gdoa = np.asarray([
                        loc for loc, stype, spkid in fid2gt[fid] if stype == 1
                    ])
                if len(gdoa) > 0:
                    # save ground truth data
                    gdoa_azi = np.arctan2(gdoa[:, 1], gdoa[:, 0])
                    np.savetxt(os.path.join(odatadir, 'g%06d' % fid),
                               gdoa_azi,
                               fmt='%.5g')
                else:
                    np.savetxt(os.path.join(odatadir, 'g%06d' % fid), [-9],
                               fmt='%.5g')
                if add_sns:
                    gndoa = np.asarray([
                        loc for loc, stype, spkid in fid2gt[fid] if stype != 1
                    ])
                    if len(gndoa) > 0:
                        # save ground truth data
                        gndoa_azi = np.arctan2(gndoa[:, 1], gndoa[:, 0])
                        np.savetxt(os.path.join(odatadir, 'f%06d' % fid),
                                   gndoa_azi,
                                   fmt='%.5g')
                    else:
                        np.savetxt(os.path.join(odatadir, 'f%06d' % fid),
                                   [-9],
                                   fmt='%.5g')
            else:
                np.savetxt(os.path.join(odatadir, 'g%06d' % fid), [-9],
                           fmt='%.5g')
                if add_sns:
                    np.savetxt(os.path.join(odatadir, 'f%06d' % fid), [-9],
                               fmt='%.5g')

            # save prediction
            if add_sns:
                pdoa_azi = doa_azi[[
                    pid for pid in lmax[fid]
                    if sns[pid, fid] * phi[pid, fid] > min_sns
                ]]
                qdoa_azi = doa_azi[[
                    pid for pid in lmax[fid]
                    if sns[pid, fid] * phi[pid, fid] <= min_sns
                ]]
            else:
                pdoa_azi = doa_azi[lmax[fid]]
            if len(pdoa_azi) > 0:
                np.savetxt(os.path.join(odatadir, 'p%06d' % fid),
                           pdoa_azi,
                           fmt='%.5g')
            else:
                np.savetxt(os.path.join(odatadir, 'p%06d' % fid), [-9],
                           fmt='%.5g')
            if add_sns:
                if len(qdoa_azi) > 0:
                    np.savetxt(os.path.join(odatadir, 'q%06d' % fid),
                               qdoa_azi,
                               fmt='%.5g')
                else:
                    np.savetxt(os.path.join(odatadir, 'q%06d' % fid), [-9],
                               fmt='%.5g')

    # write the gnuplot script: one polar plot per frame
    script_file = os.path.join(outdir, 'plot.gp')
    with open(script_file, 'w') as s:
        # smaller canvas when the plot is composited next to a video frame
        if not no_video:
            print >> s, 'set terminal pngcairo size 240,320'
        else:
            print >> s, 'set terminal pngcairo size 800,600'
        print >> s, 'set polar'
        print >> s, 'unset border'
        print >> s, 'unset margin'
        print >> s, 'set tics scale 0'
        print >> s, 'unset xtics'
        print >> s, 'unset ytics'
        print >> s, 'set rtics ("" 0, "" 0.25, "" 0.5, "" 0.75, "" 1.0)'
        print >> s, 'unset raxis'
        print >> s, 'set trange [-2*pi:2*pi]'
        print >> s, 'set grid polar pi/6'
        print >> s, 'set size square'
        if not no_video:
            print >> s, 'set key bm'
        else:
            print >> s, 'set key bot rm'
        print >> s, 'set xrange [-1.3:1.3]'
        print >> s, 'set yrange [-1.3:1.3]'
        print >> s, 'set label at 1.2,0 "right" center rotate by -90 tc rgb "gray"'
        print >> s, 'set label at -1.2,0 "left" center rotate by 90 tc rgb "gray"'
        print >> s, 'set label at 0,1.2 "front" center tc rgb "gray"'
        print >> s, 'set label at 0,-1.2 "rear" center tc rgb "gray"'
        # per-frame loop in gnuplot; %% keeps a literal % for gnuplot's sprintf
        print >> s, 'do for [ii=0:%d] {' % (nframe - 1)
        print >> s, ' data=sprintf("< paste %s %s/h%%06d", ii)' % (doa_file, odatadir)
        print >> s, ' gdata=sprintf("%s/g%%06d", ii)' % (odatadir)
        print >> s, ' pdata=sprintf("%s/p%%06d", ii)' % (odatadir)
        if add_sns:
            print >> s, ' fdata=sprintf("%s/f%%06d", ii)' % (odatadir)
            print >> s, ' qdata=sprintf("%s/q%%06d", ii)' % (odatadir)
        print >> s, ' set output sprintf("%s/t%%06d.png", ii)' % ofigdir
        if no_video:
            # without video the title carries method, time and frame number
            print >> s, ' set title sprintf("Method %s; Time %%.2fs; Frame #%%06d", ii * %g, ii)' % (
                method_name, 1.0 * hop_size / fs)
        if add_sns:
            print >> s, ' plot 1.1 w l lw 2 lc rgb "gray" notitle,' \
                        ' data u ($1+0.5*pi):2 w l lc rgb "%s" lw 2 title "SSL Likelihood",' \
                        ' data u ($1+0.5*pi):3 w l lc rgb "%s" lw 1 title "SNS Likelihood",' \
                        ' gdata u ($1+0.5*pi):(1.05) pt 6 ps 3 lw 3 lc rgb "%s" title "GT. Speech",' \
                        ' fdata u ($1+0.5*pi):(1.05) pt 6 ps 3 lw 3 lc rgb "%s" title "GT. Noise",' \
                        ' pdata u ($1+0.5*pi):(1.05) pt 2 ps 3 lw 3 lc rgb "%s" title "Pred. Speech",' \
                        ' qdata u ($1+0.5*pi):(1.05) pt 2 ps 3 lw 3 lc rgb "%s" title "Pred. Noise"' \
                        % tuple([_bgrtuple2rgbstr(c) for c in [_COLOR_OUTPUT_SSL, _COLOR_OUTPUT_SNS, _COLOR_GT_SPEECH, _COLOR_GT_NOISE, _COLOR_PRED_SPEECH, _COLOR_PRED_NOISE]])
        else:
            print >> s, ' plot 1.1 w l lw 2 lc rgb "gray" notitle, data u ($1+0.5*pi):2 w l lc rgb "blue" lw 2 title "output value", gdata u ($1+0.5*pi):(1.05) pt 6 ps 3 lw 3 lc rgb "red" title "ground truth", pdata u ($1+0.5*pi):(1.05) pt 2 ps 3 lw 3 lc rgb "green" title "prediction"'
        print >> s, '}'

    # print (do not run) the shell commands that assemble the final video
    audio_temp = os.path.join(outdir, '%s.wav' % sid)
    print 'data and script generated, now run'
    print ' gnuplot %s && \\' % script_file
    if not no_video:
        # composite each video frame (v*) with its polar plot (t*) into m*
        print ' for x in %s/v*.png; do z=${x##*/}; y=${x%%/*}/${z/v/t}; o=${x%%/*}/${z/v/m}; convert -page +0+0 ${x} -page +880+100 ${y} -flatten ${o}; done && \\' % ofigdir
    print ' gst-launch-1.0 filesrc location="%s" ! decodebin ! audioresample ! "audio/x-raw,rate=16000" ! deinterleave name=d' \
          ' interleave name=i ! audioconvert ! wavenc ! filesink location="%s"' \
          ' d.src_0 ! queue ! i.sink_0' \
          ' d.src_1 ! queue ! i.sink_1 && \\' % (wav_file, audio_temp)
    # image sequence: 'm' (composited) with video, 't' (plots only) without
    print ' gst-launch-1.0 multifilesrc location="%s/%s%%06d.png" ' \
          ' caps="image/png,framerate=%d/%d,pixel-aspect-ratio=1/1" ' \
          ' ! decodebin ! videorate ! videoconvert ! theoraenc ! oggmux name=mux ! filesink location=%s/%s.ogv ' \
          ' filesrc location="%s" ! decodebin ! audioconvert ! vorbisenc ! mux. ' \
          % (ofigdir, 'm' if not no_video else 't', fr.numerator, fr.denominator, outdir, sid, audio_temp)