def segments_to_bin_vad(segments_file, num_frames_file, frame_shift,
                        output_path, part_idx, num_parts):
    """Convert a segments file into one binary VAD vector per recording.

    Args:
      segments_file: file loaded with ``SegmentList.load``.
      num_frames_file: optional whitespace-separated table with one
        ``file_id num_frames`` row per recording. When ``None``,
        ``num_frames=None`` is forwarded to ``to_bin_vad``.
      frame_shift: forwarded to ``SegmentList.to_bin_vad``.
      output_path: output specifier handed to ``DWF.create``.
      part_idx: part handled by this job; only used when ``num_parts > 1``.
      num_parts: number of parts the segment list is split into.
    """
    utt2num_frames = None
    if num_frames_file is not None:
        # Raw string: '\s' is an invalid escape in a normal string literal.
        utt2num_frames = pd.read_csv(
            num_frames_file, sep=r'\s+', header=None,
            names=['file_id', 'num_frames'], index_col=0)

    segments = SegmentList.load(segments_file)
    if num_parts > 1:
        segments = segments.split(part_idx, num_parts)

    with DWF.create(output_path) as writer:
        for file_id in segments.uniq_file_id:
            logging.info('processing VAD for %s' % (file_id))
            num_frames = None
            if utt2num_frames is not None:
                num_frames = int(utt2num_frames.loc[file_id]['num_frames'])

            vad = segments.to_bin_vad(
                file_id, frame_shift=frame_shift, num_frames=num_frames)
            num_speech_frames = np.sum(vad)

            # Without a num-frames table there is no expected length to
            # report, so fall back to the length of the produced vector
            # (assumes vad is a 1-D sequence -- TODO confirm). Formatting
            # None with %d used to raise TypeError here.
            total_frames = len(vad) if num_frames is None else num_frames
            logging.info(
                'for %s detected %d/%d (%.2f %%) speech frames' % (
                    file_id, num_speech_frames, total_frames,
                    num_speech_frames / total_frames * 100))
            writer.write(file_id, vad)
def extract_embed(seq_file, model_file, preproc_file, output_path,
                  max_seq_length, pooling_output, write_format, **kwargs):
    """Extract embeddings for every sequence in ``seq_file`` and write them.

    ``model.predict_embed`` returns two vectors per sequence (stored here
    as ``p1`` and ``p2``); ``write_format`` selects what is written.

    Args:
      seq_file: input specifier handed to ``SDRF.create``.
      model_file: model loaded with ``SeqQEmbed.load``.
      preproc_file: optional file loaded with ``TransformList.load`` and
        passed to the reader as ``transform``; ``None`` disables it.
      output_path: output specifier handed to ``DWF.create``.
      max_seq_length: forwarded to ``model.build``.
      pooling_output: forwarded to ``model.build_embed``.
      write_format: ``'p1'`` writes the first output, ``'p1+p2'`` writes
        both stacked horizontally, anything else writes the second output.
      **kwargs: extra options; the reader's are picked out with
        ``SDRF.filter_args``.
    """
    set_float_cpu('float32')

    sr_args = SDRF.filter_args(**kwargs)
    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    sr = SDRF.create(seq_file, transform=preproc, **sr_args)

    t1 = time.time()

    model = SeqQEmbed.load(model_file)
    model.build(max_seq_length)
    model.build_embed(pooling_output)
    y_dim = model.embed_dim

    # Only the number of sequences is needed; rewind the reader afterwards.
    _, seq_lengths = sr.read_num_rows()
    sr.reset()
    num_seqs = len(seq_lengths)

    p1_y = np.zeros((num_seqs, y_dim), dtype=float_keras())
    p2_y = np.zeros((num_seqs, y_dim), dtype=float_keras())
    keys = []

    # range instead of xrange: works on both Python 2 and Python 3.
    for i in range(num_seqs):
        ti1 = time.time()
        key, data = sr.read(1)
        # The reader returns one-element containers when asked for 1 item.
        key = key[0]
        x = data[0]
        ti2 = time.time()

        logging.info('Extracting embeddings %d/%d for %s, num_frames: %d' %
                     (i, num_seqs, key, x.shape[0]))
        keys.append(key)
        p1_y[i], p2_y[i] = model.predict_embed(x)

        ti4 = time.time()
        # Log the key string itself (the list repr used to be printed).
        logging.info(
            'Elapsed time embeddings %d/%d for %s, total: %.2f read: %.2f, vae: %.2f' %
            (i, num_seqs, key, ti4 - ti1, ti2 - ti1, ti4 - ti2))

    logging.info('Extract elapsed time: %.2f' % (time.time() - t1))

    if write_format == 'p1':
        y = p1_y
    elif write_format == 'p1+p2':
        y = np.hstack((p1_y, p2_y))
    else:
        y = p2_y

    hw = DWF.create(output_path)
    hw.write(keys, y)
def compute_mfcc_feats(input_path, output_path, compress,
                       compression_method, write_num_frames, **kwargs):
    """Compute MFCC features for every item of the input and write them.

    Args:
      input_path: input specifier; read with ``AR`` when
        ``mfcc.input_step == 'wave'`` and with ``DRF.create`` otherwise.
      output_path: output specifier handed to ``DWF.create``.
      compress: forwarded to ``DWF.create``.
      compression_method: forwarded to ``DWF.create``.
      write_num_frames: optional path of a text file that receives one
        ``key num_frames`` line per item; ``None`` disables it.
      **kwargs: extra options, filtered with ``MFCC.filter_args`` and the
        reader's ``filter_args``.
    """
    mfcc_args = MFCC.filter_args(**kwargs)
    mfcc = MFCC(**mfcc_args)
    wave_input = mfcc.input_step == 'wave'

    if wave_input:
        input_args = AR.filter_args(**kwargs)
        reader = AR(input_path, **input_args)
    else:
        input_args = DRF.filter_args(**kwargs)
        reader = DRF.create(input_path, **input_args)

    writer = DWF.create(output_path, scp_sep=' ',
                        compress=compress,
                        compression_method=compression_method)

    f_num_frames = None
    if write_num_frames is not None:
        f_num_frames = open(write_num_frames, 'w')

    # try/finally so the num-frames file is closed even when feature
    # extraction fails half-way through the list.
    try:
        for data in reader:
            if wave_input:
                # Wave readers also yield a third value (unused here).
                key, x, fs = data
            else:
                key, x = data

            logging.info('Extracting MFCC for %s' % (key))
            t1 = time.time()
            y = mfcc.compute(x)
            dt = (time.time() - t1) * 1000  # elapsed time in ms
            # NOTE(review): dt is in ms, so frame_shift is presumably in ms
            # as well -- confirm.
            rtf = mfcc.frame_shift * y.shape[0] / dt
            logging.info(
                'Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f' %
                (key, y.shape[0], dt, rtf))
            writer.write([key], [y])

            if f_num_frames is not None:
                f_num_frames.write('%s %d\n' % (key, y.shape[0]))

            mfcc.reset()
    finally:
        if f_num_frames is not None:
            f_num_frames.close()
def compute_vad(input_path, output_path, **kwargs):
    """Compute an energy-based VAD for every recording and write it.

    Args:
      input_path: audio input specifier handed to ``AR``.
      output_path: output specifier handed to ``DWF.create``.
      **kwargs: extra options, filtered with ``EnergyVAD.filter_args`` and
        ``AR.filter_args``.
    """
    # The original instantiated a misspelled class (EnergVAD) and bound it
    # to `mfcc` while the loop below used `vad`; both raised NameError.
    vad_args = EnergyVAD.filter_args(**kwargs)
    vad = EnergyVAD(**vad_args)

    input_args = AR.filter_args(**kwargs)
    reader = AR(input_path, **input_args)

    writer = DWF.create(output_path, scp_sep=' ')

    for data in reader:
        # The reader yields a third value per item, which is unused here.
        key, x, fs = data
        logging.info('Extracting VAD for %s' % (key))
        t1 = time.time()
        y = vad.compute(x)
        dt = (time.time() - t1) * 1000  # elapsed time in ms
        # NOTE(review): dt is in ms, so frame_shift is presumably in ms as
        # well -- confirm.
        rtf = vad.frame_shift * y.shape[0] / dt
        logging.info(
            'Extracted VAD for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f' %
            (key, y.shape[0], dt, rtf))
        writer.write([key], [y])

        vad.reset()
def rttm_to_bin_vad(rttm_file, num_frames_file, frame_shift, output_path,
                    part_idx, num_parts):
    """Convert an RTTM file into one binary VAD vector per recording.

    Recordings listed in ``num_frames_file`` that have no segment in the
    RTTM get an all-ones VAD; this is done only by the job with
    ``part_idx == 1`` so that it happens once across the split jobs.

    Args:
      rttm_file: file loaded with ``RTTM.load``.
      num_frames_file: optional whitespace-separated table with one
        ``file_id num_frames`` row per recording. When ``None``,
        ``num_frames=None`` is forwarded to ``to_bin_vad`` and no
        all-ones VADs are generated (the recordings are unknown).
      frame_shift: forwarded to ``to_bin_vad``.
      output_path: output specifier handed to ``DWF.create``.
      part_idx: part handled by this job.
      num_parts: number of parts the segment list is split into.
    """
    utt2num_frames = None
    if num_frames_file is not None:
        # Raw string: '\s' is an invalid escape in a normal string literal.
        utt2num_frames = pd.read_csv(
            num_frames_file, sep=r'\s+', header=None,
            names=['file_id', 'num_frames'], index_col=0)

    segments = RTTM.load(rttm_file).to_segment_list()
    # Remember every recording that has segments *before* splitting, so
    # the job that fills in segment-less recordings sees the full list.
    # A set gives O(1) lookups and makes a deep copy unnecessary.
    file_ids_with_segments = set(segments.uniq_file_id)

    if num_parts > 1:
        segments = segments.split(part_idx, num_parts)

    with DWF.create(output_path) as writer:
        for file_id in segments.uniq_file_id:
            logging.info('processing VAD for %s' % (file_id))
            num_frames = None
            if utt2num_frames is not None:
                num_frames = int(utt2num_frames.loc[file_id]['num_frames'])

            vad = segments.to_bin_vad(
                file_id, frame_shift=frame_shift, num_frames=num_frames)
            num_speech_frames = np.sum(vad)

            # Fall back to the produced length when no expected length is
            # known (assumes vad is a 1-D sequence -- TODO confirm);
            # formatting None with %d used to raise TypeError here.
            total_frames = len(vad) if num_frames is None else num_frames
            logging.info(
                'for %s detected %d/%d (%.2f %%) speech frames' % (
                    file_id, num_speech_frames, total_frames,
                    num_speech_frames / total_frames * 100))
            writer.write(file_id, vad)

        # Without the num-frames table, utt2num_frames used to be an
        # unbound name here; skip the fill-in step in that case.
        if part_idx == 1 and utt2num_frames is not None:
            for file_id in utt2num_frames.index:
                if file_id in file_ids_with_segments:
                    continue
                logging.warning(
                    'not speeech detected in %s, putting all to 1' % (file_id))
                num_frames = int(utt2num_frames.loc[file_id]['num_frames'])
                vad = np.ones((num_frames, ), dtype='float32')
                writer.write(file_id, vad)
def compute_mfcc_feats(input_path, output_path, compress,
                       compression_method, write_num_frames, use_gpu,
                       nn_model_path, chunk_size, context, **kwargs):
    """Compute features, pass them through an enhancement net, write them.

    Args:
      input_path: input specifier; read with ``AR`` when
        ``mfcc.input_step == 'wave'`` and with ``DRF.create`` otherwise.
      output_path: output specifier handed to ``DWF.create``.
      compress: forwarded to ``DWF.create``.
      compression_method: forwarded to ``DWF.create``.
      write_num_frames: optional path of a text file that receives one
        ``key num_frames`` line per item; ``None`` disables it.
      use_gpu: use CUDA device 0 when CUDA is available, CPU otherwise.
      nn_model_path: checkpoint loaded with ``torch.load``; its
        ``'state_dict'`` entry is loaded into the network.
      chunk_size: forwarded to ``apply_nnet``.
      context: forwarded to ``apply_nnet``.
      **kwargs: extra options, filtered with ``MFCC.filter_args`` and the
        reader's ``filter_args``.
    """
    # Select the device.
    if use_gpu and torch.cuda.is_available():
        # .get(): the variable may be unset, which used to raise KeyError
        # from a statement that only logs.
        logging.info('CUDA_VISIBLE_DEVICES=%s' %
                     os.environ.get('CUDA_VISIBLE_DEVICES'))
        logging.info('init gpu device')
        device = torch.device('cuda', 0)
        # Presumably forces the CUDA context to be created right away.
        torch.tensor([0]).to(device)
    else:
        logging.info('init cpu device')
        device = torch.device('cpu')

    mfcc_args = MFCC.filter_args(**kwargs)
    mfcc = MFCC(**mfcc_args)
    wave_input = mfcc.input_step == 'wave'

    # PUT YOUR NNET MODEL HERE!!!!
    enhancer = CAN(num_channels=45)
    enhancer.load_state_dict(
        torch.load(nn_model_path, map_location=device)['state_dict'])
    enhancer.to(device)
    enhancer.eval()

    if wave_input:
        input_args = AR.filter_args(**kwargs)
        reader = AR(input_path, **input_args)
    else:
        input_args = DRF.filter_args(**kwargs)
        reader = DRF.create(input_path, **input_args)

    writer = DWF.create(output_path, scp_sep=' ',
                        compress=compress,
                        compression_method=compression_method)

    f_num_frames = None
    if write_num_frames is not None:
        f_num_frames = open(write_num_frames, 'w')

    # try/finally so the num-frames file is closed even when processing
    # fails half-way through the list.
    try:
        for data in reader:
            if wave_input:
                # Wave readers also yield a third value (unused here).
                key, x, fs = data
            else:
                key, x = data

            logging.info('Extracting filter-banks for %s' % (key))
            t1 = time.time()
            y = mfcc.compute(x)

            # Run the features through the enhancement network.
            logging.info('Running enhancement network')
            y = apply_nnet(y, enhancer, chunk_size, context, device)

            dt = (time.time() - t1) * 1000  # elapsed time in ms
            # NOTE(review): dt is in ms, so frame_shift is presumably in ms
            # as well -- confirm.
            rtf = mfcc.frame_shift * y.shape[0] / dt
            logging.info(
                'Extracted filter-banks for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f'
                % (key, y.shape[0], dt, rtf))
            writer.write([key], [y])

            if f_num_frames is not None:
                f_num_frames.write('%s %d\n' % (key, y.shape[0]))

            mfcc.reset()
    finally:
        if f_num_frames is not None:
            f_num_frames.close()
def extract_embed(seq_file, model_file, preproc_file, output_path,
                  max_length, layer_names, **kwargs):
    """Extract one embedding per sequence in ``seq_file`` and write them.

    Sequences no longer than the effective maximum length are zero-padded
    into a single ``(1, max_length, x_dim)`` buffer. Longer sequences are
    cut into ``num_chunks`` chunks of equal size (the last chunk is taken
    from the end of the sequence, so it may overlap the previous one) and
    the chunk embeddings are averaged.

    Args:
      seq_file: input specifier handed to ``SDRF.create``.
      model_file: model loaded with ``SeqEmbed.load``.
      preproc_file: optional file loaded with ``TransformList.load`` and
        passed to the reader as ``transform``; ``None`` disables it.
      output_path: output specifier handed to ``DWF.create``.
      max_length: upper bound on the number of frames fed to the model at
        once; capped at the longest sequence in the input.
      layer_names: forwarded to ``model.build_embed``.
      **kwargs: extra options; the reader's are picked out with
        ``SDRF.filter_args``.
    """
    set_float_cpu('float32')

    sr_args = SDRF.filter_args(**kwargs)
    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    sr = SDRF.create(seq_file, transform=preproc, **sr_args)

    t1 = time.time()

    model = SeqEmbed.load(model_file)
    model.build()
    model.build_embed(layer_names)
    y_dim = model.embed_dim

    _, seq_lengths = sr.read_num_rows()
    sr.reset()
    num_seqs = len(seq_lengths)

    # No need for a buffer longer than the longest sequence.
    max_length = np.minimum(np.max(seq_lengths), max_length)

    y = np.zeros((num_seqs, y_dim), dtype=float_keras())
    xx = np.zeros((1, max_length, model.x_dim), dtype=float_keras())
    keys = []

    # range instead of xrange: works on both Python 2 and Python 3.
    for i in range(num_seqs):
        ti1 = time.time()
        data = sr.read(1)
        key = data[0][0]
        x = data[1][0]
        ti2 = time.time()

        logging.info('Extracting embeddings %d/%d for %s, num_frames: %d' %
                     (i, num_seqs, key, x.shape[0]))
        keys.append(key)

        # Clear whatever the previous sequence left in the buffer.
        xx[:, :, :] = 0
        num_frames = x.shape[0]

        if num_frames <= max_length:
            xx[0, :num_frames] = x
            y[i] = model.predict_embed(xx, batch_size=1)
        else:
            num_chunks = int(np.ceil(float(num_frames) / max_length))
            chunk_size = int(np.ceil(float(num_frames) / num_chunks))
            for j in range(num_chunks - 1):
                start = j * chunk_size
                xx[0, :chunk_size] = x[start:start + chunk_size]
                y[i] += model.predict_embed(xx, batch_size=1).ravel()
            # Last chunk: take the final chunk_size frames so it is full.
            xx[0, :chunk_size] = x[-chunk_size:]
            y[i] += model.predict_embed(xx, batch_size=1).ravel()
            y[i] /= num_chunks

        ti4 = time.time()
        logging.info(
            'Elapsed time embeddings %d/%d for %s, total: %.2f read: %.2f, vae: %.2f'
            % (i, num_seqs, key, ti4 - ti1, ti2 - ti1, ti4 - ti2))

    logging.info('Extract elapsed time: %.2f' % (time.time() - t1))

    hw = DWF.create(output_path)
    hw.write(keys, y)
def plot_vector_tsne(iv_file, v_list, preproc_file, output_path, save_embed,
                     output_dim, perplexity, exag, lr, num_iter, init_method,
                     rng_seed, verbose, pca_dim, max_classes, **kwargs):
    """Project vectors with t-SNE and save 2-D and 3-D scatter plots.

    Args:
      iv_file: vector file handed to ``VCR``.
      v_list: list file handed to ``VCR``.
      preproc_file: optional file loaded with ``TransformList.load`` and
        handed to ``VCR``; ``None`` disables it.
      output_path: directory for the plots and embedding files; created
        when it does not exist.
      save_embed: when true, the t-SNE embeddings are written to
        ``embed_*.h5`` files in ``output_path``.
      output_dim: when greater than 3, an additional t-SNE with this many
        components is computed (and saved if ``save_embed``).
      perplexity, exag, lr, num_iter, init_method, rng_seed, verbose:
        forwarded to ``TSNE`` as ``perplexity``, ``early_exaggeration``,
        ``learning_rate``, ``n_iter``, ``init``, ``random_state`` and
        ``verbose``.
      pca_dim: when positive, vectors are reduced with ``PCA`` first.
      max_classes: when positive, only samples whose class id is below
        this value are kept.
      **kwargs: extra options; the reader's are picked out with
        ``VCR.filter_args``.
    """
    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    vr_args = VCR.filter_args(**kwargs)
    vcr = VCR(iv_file, v_list, preproc, **vr_args)

    x, class_ids = vcr.read()

    t1 = time.time()

    if pca_dim > 0:
        pca = PCA(pca_dim=pca_dim)
        pca.fit(x)
        x = pca.predict(x)

    if not os.path.exists(output_path):
        # Fixed typo: `ouput_path` raised NameError.
        os.makedirs(output_path)

    def tsne_obj(n):
        """Build a TSNE with ``n`` components and the shared settings."""
        return TSNE(n_components=n, perplexity=perplexity,
                    early_exaggeration=exag, learning_rate=lr,
                    n_iter=num_iter, init=init_method,
                    random_state=rng_seed, verbose=verbose)

    index = None
    if max_classes > 0:
        index = class_ids < max_classes
        x = x[index]
        class_ids = class_ids[index]

    keys = None
    if save_embed:
        keys = vcr.u2c.key
        if index is not None:
            # Keep the keys aligned with the rows that survived the class
            # filter; otherwise keys and embeddings have different lengths.
            keys = np.asarray(keys)[index]

    if output_dim > 3:
        tsne = tsne_obj(output_dim)
        y = tsne.fit_transform(x)
        if save_embed:
            # Fixed typo: `ouput_dim` raised NameError.
            h5_file = '%s/embed_%dd.h5' % (output_path, output_dim)
            hw = DWF.create(h5_file)
            hw.write(keys, y)

    # 2-D projection.
    tsne = tsne_obj(2)
    y = tsne.fit_transform(x)
    if save_embed:
        h5_file = '%s/embed_2d.h5' % output_path
        hw = DWF.create(h5_file)
        hw.write(keys, y)

    fig_file = '%s/tsne_2d.pdf' % (output_path)
    # One (color, marker) pair per class id; `colors` and `markers` are
    # module-level names defined elsewhere in the file.
    color_marker = [(c, m) for m in markers for c in colors]
    unique_classes = np.unique(class_ids)
    for c in unique_classes:
        idx = class_ids == c
        plt.scatter(y[idx, 0], y[idx, 1],
                    c=color_marker[c][0], marker=color_marker[c][1],
                    label=vcr.class_names[c])

    plt.legend()
    plt.grid(True)
    # Save before show: with an interactive backend show() can leave an
    # empty figure behind, which would then be what gets saved.
    plt.savefig(fig_file)
    plt.show()
    plt.clf()

    # 3-D projection.
    tsne = tsne_obj(3)
    y = tsne.fit_transform(x)
    if save_embed:
        h5_file = '%s/embed_3d.h5' % output_path
        hw = DWF.create(h5_file)
        hw.write(keys, y)

    fig_file = '%s/tsne_3d.pdf' % (output_path)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    for c in unique_classes:
        idx = class_ids == c
        ax.scatter(y[idx, 0], y[idx, 1], y[idx, 2],
                   c=color_marker[c][0], marker=color_marker[c][1],
                   label=vcr.class_names[c])

    plt.grid(True)
    plt.savefig(fig_file)
    plt.show()
    plt.clf()

    logging.info('Elapsed time: %.2f s.' % (time.time() - t1))
def compute_mfcc_feats(input_path, output_path, compress,
                       compression_method, write_num_frames, use_gpu,
                       nn_model_path, chunk_size, context, **kwargs):
    """Compute MFCCs with an enhancement net applied to the filter-banks.

    The pipeline is split in two ``MFCC`` stages: the first one stops at
    the ``'logfb'`` step, the network is applied to its output, and the
    second one starts from ``'logfb'`` to finish the computation.

    Args:
      input_path: input specifier; read with ``AR`` when
        ``mfcc1.input_step == 'wave'`` and with ``DRF.create`` otherwise.
      output_path: output specifier handed to ``DWF.create``.
      compress: forwarded to ``DWF.create``.
      compression_method: forwarded to ``DWF.create``.
      write_num_frames: optional path of a text file that receives one
        ``key num_frames`` line per item; ``None`` disables it.
      use_gpu: try to use a free GPU when CUDA is available; falls back to
        CPU after repeated failures.
      nn_model_path: state dict loaded with ``torch.load``.
      chunk_size: forwarded to ``apply_nnet``.
      context: forwarded to ``apply_nnet``.
      **kwargs: extra options, filtered with ``MFCC.filter_args`` and the
        reader's ``filter_args``.
    """
    # Select the device.
    if use_gpu and torch.cuda.is_available():
        os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
        max_tries = 100
        for g in range(max_tries):
            try:
                gpu_ids = find_free_gpus()
                os.environ['CUDA_VISIBLE_DEVICES'] = gpu_ids
                logging.info('CUDA_VISIBLE_DEVICES=%s' %
                             os.environ['CUDA_VISIBLE_DEVICES'])
                logging.info('init gpu device')
                device = torch.device('cuda', 0)
                # Presumably forces the CUDA context to be created now, so
                # a busy GPU fails here and triggers a retry.
                torch.tensor([0]).to(device)
                break
            except Exception:
                # `except Exception` instead of a bare except, so Ctrl-C
                # and SystemExit can still interrupt the retry loop.
                if g < max_tries - 1:
                    logging.info('failing init gpu, trying again')
                    time.sleep(10)
                else:
                    logging.info('failing init gpu, using cpu')
                    device = torch.device('cpu')
    else:
        logging.info('init cpu device')
        device = torch.device('cpu')

    # Stage 1 stops at log filter-banks, stage 2 resumes from them.
    mfcc_args1 = MFCC.filter_args(**kwargs)
    mfcc_args2 = copy.deepcopy(mfcc_args1)
    mfcc_args1['output_step'] = 'logfb'
    mfcc_args2['input_step'] = 'logfb'
    print(kwargs)
    print(mfcc_args1)
    print(mfcc_args2)

    mfcc1 = MFCC(**mfcc_args1)
    mfcc2 = MFCC(**mfcc_args2)
    wave_input = mfcc1.input_step == 'wave'

    mvn = MVN(norm_var=False, left_context=150, right_context=150)

    # PUT YOUR NNET MODEL HERE!!!!
    enhancer = CGN()
    # The checkpoint is loaded directly as the state dict (no
    # 'state_dict' wrapper key).
    enhancer.load_state_dict(torch.load(nn_model_path, map_location=device))
    enhancer.to(device)
    enhancer.eval()

    if wave_input:
        input_args = AR.filter_args(**kwargs)
        reader = AR(input_path, **input_args)
    else:
        input_args = DRF.filter_args(**kwargs)
        reader = DRF.create(input_path, **input_args)

    writer = DWF.create(output_path, scp_sep=' ',
                        compress=compress,
                        compression_method=compression_method)

    f_num_frames = None
    if write_num_frames is not None:
        f_num_frames = open(write_num_frames, 'w')

    # try/finally so the num-frames file is closed even when processing
    # fails half-way through the list.
    try:
        for data in reader:
            if wave_input:
                # Wave readers also yield a third value (unused here).
                key, x, fs = data
            else:
                key, x = data

            logging.info('Extracting filter-banks for %s' % (key))
            t1 = time.time()
            y = mfcc1.compute(x)

            # Separate the log-energy (first column) from the filter-banks.
            logE = y[:, 0]
            y = y[:, 1:]

            # Estimate the log-energy from the filter-banks before
            # enhancement.
            logEy1 = logsumexp(y, axis=-1)

            # Normalize and run the enhancement network.
            logging.info('Running enhancement network')
            y = mvn.normalize(y)
            y = apply_nnet(y, enhancer, chunk_size, context, device)

            # Rescale the log-energy by the change observed on the
            # filter-bank estimate.
            logEy2 = logsumexp(y, axis=-1)
            logE = logE + (logEy2 - logEy1)

            # Put the log-energy back as the first column.
            y = np.concatenate((logE[:, None], y), axis=-1)

            # Apply the remaining steps (DCT).
            logging.info('Applying DCT')
            y = mfcc2.compute(y)

            dt = (time.time() - t1) * 1000  # elapsed time in ms
            # NOTE(review): dt is in ms, so frame_shift is presumably in ms
            # as well -- confirm.
            rtf = mfcc1.frame_shift * y.shape[0] / dt
            logging.info(
                'Extracted filter-banks for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f' %
                (key, y.shape[0], dt, rtf))
            writer.write([key], [y])

            if f_num_frames is not None:
                f_num_frames.write('%s %d\n' % (key, y.shape[0]))

            # NOTE(review): only mfcc1 is reset between items; confirm
            # mfcc2 keeps no per-item state.
            mfcc1.reset()
    finally:
        if f_num_frames is not None:
            f_num_frames.close()
# PUT YOUR NNET MODEL HERE!!!! enhancer = CGN() #enhancer.load_state_dict(torch.load(nn_model_path, map_location=device)['state_dict']) enhancer.load_state_dict(torch.load(nn_model_path, map_location=device)) enhancer.to(device) enhancer.eval() if mfcc1.input_step == 'wave': input_args = AR.filter_args(**kwargs) reader = AR(input_path, **input_args) else: input_args = DRF.filter_args(**kwargs) reader = DRF.create(input_path, **input_args) writer = DWF.create(output_path, scp_sep=' ', compress=compress, compression_method=compression_method) if write_num_frames is not None: f_num_frames = open(write_num_frames, 'w') for data in reader: if mfcc1.input_step == 'wave': key, x, fs = data else: key, x = data logging.info('Extracting filter-banks for %s' % (key)) t1 = time.time() y = mfcc1.compute(x) # separate logE and filterbanks