def extract_embed(seq_file, model_file, preproc_file, output_path,
                  max_seq_length, pooling_output, write_format, **kwargs):

    set_float_cpu('float32')

    sr_args = SDRF.filter_args(**kwargs)
    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    sr = SDRF.create(seq_file, transform=preproc, **sr_args)

    t1 = time.time()

    model = SeqQEmbed.load(model_file)
    model.build(max_seq_length)
    model.build_embed(pooling_output)
    y_dim = model.embed_dim

    _, seq_lengths = sr.read_num_rows()
    sr.reset()
    num_seqs = len(seq_lengths)

    p1_y = np.zeros((num_seqs, y_dim), dtype=float_keras())
    p2_y = np.zeros((num_seqs, y_dim), dtype=float_keras())
    keys = []
    for i in range(num_seqs):
        ti1 = time.time()
        key, data = sr.read(1)
        ti2 = time.time()
        logging.info('Extracting embeddings %d/%d for %s, num_frames: %d' %
                     (i, num_seqs, key[0], data[0].shape[0]))
        keys.append(key[0])
        p1_y[i], p2_y[i] = model.predict_embed(data[0])
        ti4 = time.time()
        logging.info('Elapsed time embeddings %d/%d for %s, total: %.2f read: %.2f, vae: %.2f' %
                     (i, num_seqs, key[0], ti4 - ti1, ti2 - ti1, ti4 - ti2))

    logging.info('Extract elapsed time: %.2f' % (time.time() - t1))

    if write_format == 'p1':
        y = p1_y
    elif write_format == 'p1+p2':
        y = np.hstack((p1_y, p2_y))
    else:
        y = p2_y

    hw = DWF.create(output_path)
    hw.write(keys, y)
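# Shape note for write_format: with 'p1' or 'p2' the written matrix is
# (num_seqs, y_dim); with 'p1+p2' the two parts are stacked along the feature
# axis, e.g. (toy numbers, not from this model):
#   p1_y.shape == (10, 400), p2_y.shape == (10, 400)
#   np.hstack((p1_y, p2_y)).shape == (10, 800)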
def convert(input_file, output_file, class_file):

    r = DRF.create(input_file)
    seg_set, score_mat = r.read(0, squeeze=True)

    with open(class_file, 'r') as f:
        model_set = [line.rstrip().split()[0] for line in f]

    scores = TrialScores(model_set, seg_set, score_mat.T)
    scores.save(output_file)
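# Minimal sketch of the orientation handled by convert() above, assuming the
# input matrix is stored as (num_segs, num_classes) with one row per segment;
# the transpose puts models on rows, as TrialScores expects (toy data):
#
#   import numpy as np
#   score_mat = np.zeros((3, 2))   # 3 segments x 2 classes
#   score_mat.T.shape              # (2, 3): num_models x num_segs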
def compute_mfcc_feats(input_path, output_path, compress, compression_method,
                       write_num_frames, **kwargs):

    mfcc_args = MFCC.filter_args(**kwargs)
    mfcc = MFCC(**mfcc_args)

    if mfcc.input_step == 'wave':
        input_args = AR.filter_args(**kwargs)
        reader = AR(input_path, **input_args)
    else:
        input_args = DRF.filter_args(**kwargs)
        reader = DRF.create(input_path, **input_args)

    writer = DWF.create(output_path, scp_sep=' ', compress=compress,
                        compression_method=compression_method)

    if write_num_frames is not None:
        f_num_frames = open(write_num_frames, 'w')

    for data in reader:
        if mfcc.input_step == 'wave':
            key, x, fs = data
        else:
            key, x = data
        logging.info('Extracting MFCC for %s' % (key))
        t1 = time.time()
        y = mfcc.compute(x)
        dt = (time.time() - t1) * 1000
        rtf = mfcc.frame_shift * y.shape[0] / dt
        logging.info('Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f' %
                     (key, y.shape[0], dt, rtf))
        writer.write([key], [y])
        if write_num_frames is not None:
            f_num_frames.write('%s %d\n' % (key, y.shape[0]))
        mfcc.reset()

    if write_num_frames is not None:
        f_num_frames.close()
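# The real-time factor above divides audio duration by processing time; a
# minimal standalone sketch, assuming frame_shift and elapsed time are both
# in milliseconds (rtf > 1 means faster than real time):
def real_time_factor(frame_shift_ms, num_frames, elapsed_ms):
    """Audio duration (ms) divided by processing time (ms)."""
    return frame_shift_ms * num_frames / elapsed_ms

# e.g. real_time_factor(10.0, 500, 250.0) == 20.0: 5 s of audio in 0.25 s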
def compute_mfcc_feats(input_path, output_path, compress, compression_method,
                       write_num_frames, use_gpu, nn_model_path, chunk_size,
                       context, **kwargs):

    # open device
    if use_gpu and torch.cuda.is_available():
        logging.info('CUDA_VISIBLE_DEVICES=%s' % os.environ['CUDA_VISIBLE_DEVICES'])
        logging.info('init gpu device')
        device = torch.device('cuda', 0)
        torch.tensor([0]).to(device)
    else:
        logging.info('init cpu device')
        device = torch.device('cpu')

    mfcc_args = MFCC.filter_args(**kwargs)
    mfcc = MFCC(**mfcc_args)

    # PUT YOUR NNET MODEL HERE!!!!
    enhancer = CAN(num_channels=45)
    enhancer.load_state_dict(
        torch.load(nn_model_path, map_location=device)['state_dict'])
    enhancer.to(device)
    enhancer.eval()

    if mfcc.input_step == 'wave':
        input_args = AR.filter_args(**kwargs)
        reader = AR(input_path, **input_args)
    else:
        input_args = DRF.filter_args(**kwargs)
        reader = DRF.create(input_path, **input_args)

    writer = DWF.create(output_path, scp_sep=' ', compress=compress,
                        compression_method=compression_method)

    if write_num_frames is not None:
        f_num_frames = open(write_num_frames, 'w')

    for data in reader:
        if mfcc.input_step == 'wave':
            key, x, fs = data
        else:
            key, x = data
        logging.info('Extracting filter-banks for %s' % (key))
        t1 = time.time()
        y = mfcc.compute(x)
        # apply the enhancement network to the filter-bank features
        logging.info('Running enhancement network')
        y = apply_nnet(y, enhancer, chunk_size, context, device)
        dt = (time.time() - t1) * 1000
        rtf = mfcc.frame_shift * y.shape[0] / dt
        logging.info('Extracted filter-banks for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f' %
                     (key, y.shape[0], dt, rtf))
        writer.write([key], [y])
        if write_num_frames is not None:
            f_num_frames.write('%s %d\n' % (key, y.shape[0]))
        mfcc.reset()

    if write_num_frames is not None:
        f_num_frames.close()
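# apply_nnet is not defined in this file. A minimal sketch of a chunked
# forward pass, assuming numpy/torch are imported as in this script, that the
# enhancer maps a (1, T, F) float tensor to a same-shape tensor, and that
# chunk_size and context are in frames; the context frames on each side are
# run through the network but discarded, to limit edge artifacts
# (hypothetical helper, not necessarily the repo's implementation):
def apply_nnet(x, model, chunk_size, context, device):
    T = x.shape[0]
    y = np.zeros_like(x)
    start = 0
    while start < T:
        stop = min(start + chunk_size, T)
        c_start = max(start - context, 0)
        c_stop = min(stop + context, T)
        xx = torch.tensor(x[c_start:c_stop], dtype=torch.float32,
                          device=device).unsqueeze(0)
        with torch.no_grad():
            yy = model(xx).squeeze(0).cpu().numpy()
        # keep only the central chunk, dropping the context frames
        y[start:stop] = yy[start - c_start:stop - c_start]
        start = stop
    return y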
if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        fromfile_prefix_chars='@',
        description='Compute filter-bank features and enhance with pytorch model')

    parser.add_argument('--input', dest='input_path', required=True)
    parser.add_argument('--output', dest='output_path', required=True)
    parser.add_argument('--write-num-frames', dest='write_num_frames', default=None)

    DRF.add_argparse_args(parser)
    MFCC.add_argparse_args(parser)

    parser.add_argument('--compress', dest='compress', default=False,
                        action='store_true', help='Compress the features')
    parser.add_argument('--compression-method', dest='compression_method',
                        default='auto', choices=compression_methods,
                        help='Compression method')
    parser.add_argument('-v', '--verbose', dest='verbose', default=1,
                        choices=[0, 1, 2, 3], type=int, help='Verbose level')
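# Example invocation (hypothetical paths and Kaldi-style rspecifiers; the
# exact --input/--output formats depend on the AR/DRF reader configuration):
#
#   python compute_mfcc_feats.py \
#       --input scp:data/wav.scp --output ark,scp:feats.ark,feats.scp \
#       --write-num-frames utt2num_frames --compress -v 2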
def extract_embed(seq_file, model_file, preproc_file, output_path,
                  max_length, layer_names, **kwargs):

    set_float_cpu('float32')

    sr_args = SDRF.filter_args(**kwargs)
    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    sr = SDRF.create(seq_file, transform=preproc, **sr_args)

    t1 = time.time()

    model = SeqEmbed.load(model_file)
    model.build()
    model.build_embed(layer_names)
    y_dim = model.embed_dim

    _, seq_lengths = sr.read_num_rows()
    sr.reset()
    num_seqs = len(seq_lengths)
    max_length = np.minimum(np.max(seq_lengths), max_length)

    y = np.zeros((num_seqs, y_dim), dtype=float_keras())
    xx = np.zeros((1, max_length, model.x_dim), dtype=float_keras())
    keys = []
    for i in range(num_seqs):
        ti1 = time.time()
        data = sr.read(1)
        key = data[0][0]
        x = data[1][0]
        ti2 = time.time()
        logging.info('Extracting embeddings %d/%d for %s, num_frames: %d' %
                     (i, num_seqs, key, x.shape[0]))
        keys.append(key)
        xx[:, :, :] = 0
        if x.shape[0] <= max_length:
            # short sequence: zero-pad to max_length and embed in one shot
            xx[0, :x.shape[0]] = x
            y[i] = model.predict_embed(xx, batch_size=1)
        else:
            # long sequence: split into roughly equal chunks and average
            # the chunk embeddings
            num_chunks = int(np.ceil(float(x.shape[0]) / max_length))
            chunk_size = int(np.ceil(float(x.shape[0]) / num_chunks))
            for j in range(num_chunks - 1):
                start = j * chunk_size
                xx[0, :chunk_size] = x[start:start + chunk_size]
                y[i] += model.predict_embed(xx, batch_size=1).ravel()
            # the last chunk is taken from the end so it is always full-size
            xx[0, :chunk_size] = x[-chunk_size:]
            y[i] += model.predict_embed(xx, batch_size=1).ravel()
            y[i] /= num_chunks
        ti4 = time.time()
        logging.info('Elapsed time embeddings %d/%d for %s, total: %.2f read: %.2f, vae: %.2f' %
                     (i, num_seqs, key, ti4 - ti1, ti2 - ti1, ti4 - ti2))

    logging.info('Extract elapsed time: %.2f' % (time.time() - t1))

    hw = DWF.create(output_path)
    hw.write(keys, y)
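# Worked example of the chunking above: for x.shape[0] = 1000 frames and
# max_length = 400, num_chunks = ceil(1000/400) = 3 and
# chunk_size = ceil(1000/3) = 334, so the embedded chunks are x[0:334],
# x[334:668] and x[-334:] (i.e. x[666:1000]), and the final embedding is the
# average of the three chunk embeddings.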
def compute_mfcc_feats(input_path, output_path, compress, compression_method,
                       write_num_frames, use_gpu, nn_model_path, chunk_size,
                       context, **kwargs):

    # open device, retrying in case all gpus are busy
    if use_gpu and torch.cuda.is_available():
        os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
        max_tries = 100
        for g in range(max_tries):
            try:
                gpu_ids = find_free_gpus()
                os.environ['CUDA_VISIBLE_DEVICES'] = gpu_ids
                logging.info('CUDA_VISIBLE_DEVICES=%s' % os.environ['CUDA_VISIBLE_DEVICES'])
                logging.info('init gpu device')
                device = torch.device('cuda', 0)
                torch.tensor([0]).to(device)
                break
            except Exception:
                if g < max_tries - 1:
                    logging.info('failing init gpu, trying again')
                    time.sleep(10)
                else:
                    logging.info('failing init gpu, using cpu')
                    device = torch.device('cpu')
    else:
        logging.info('init cpu device')
        device = torch.device('cpu')

    # split the mfcc pipeline in two: mfcc1 computes log filter-banks from
    # the input, mfcc2 applies the DCT to the enhanced filter-banks
    mfcc_args1 = MFCC.filter_args(**kwargs)
    mfcc_args2 = copy.deepcopy(mfcc_args1)
    mfcc_args1['output_step'] = 'logfb'
    mfcc_args2['input_step'] = 'logfb'
    logging.debug(kwargs)
    logging.debug(mfcc_args1)
    logging.debug(mfcc_args2)
    mfcc1 = MFCC(**mfcc_args1)
    mfcc2 = MFCC(**mfcc_args2)
    mvn = MVN(norm_var=False, left_context=150, right_context=150)

    # PUT YOUR NNET MODEL HERE!!!!
    enhancer = CGN()
    #enhancer.load_state_dict(torch.load(nn_model_path, map_location=device)['state_dict'])
    enhancer.load_state_dict(torch.load(nn_model_path, map_location=device))
    enhancer.to(device)
    enhancer.eval()

    if mfcc1.input_step == 'wave':
        input_args = AR.filter_args(**kwargs)
        reader = AR(input_path, **input_args)
    else:
        input_args = DRF.filter_args(**kwargs)
        reader = DRF.create(input_path, **input_args)

    writer = DWF.create(output_path, scp_sep=' ', compress=compress,
                        compression_method=compression_method)

    if write_num_frames is not None:
        f_num_frames = open(write_num_frames, 'w')

    for data in reader:
        if mfcc1.input_step == 'wave':
            key, x, fs = data
        else:
            key, x = data
        logging.info('Extracting filter-banks for %s' % (key))
        t1 = time.time()
        y = mfcc1.compute(x)

        # separate logE and filter-banks
        logE = y[:, 0]
        y = y[:, 1:]
        # estimate log energy from the filter-banks before enhancement
        logEy1 = logsumexp(y, axis=-1)

        # apply the enhancement network to the filter-banks
        logging.info('Running enhancement network')
        y = mvn.normalize(y)
        y = apply_nnet(y, enhancer, chunk_size, context, device)

        # rescale logE based on the enhanced filter-banks
        logEy2 = logsumexp(y, axis=-1)
        logE = logE + (logEy2 - logEy1)

        # concatenate logE and filter-banks
        y = np.concatenate((logE[:, None], y), axis=-1)

        # apply DCT
        logging.info('Applying DCT')
        y = mfcc2.compute(y)

        dt = (time.time() - t1) * 1000
        rtf = mfcc1.frame_shift * y.shape[0] / dt
        logging.info('Extracted filter-banks for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f' %
                     (key, y.shape[0], dt, rtf))
        writer.write([key], [y])
        if write_num_frames is not None:
            f_num_frames.write('%s %d\n' % (key, y.shape[0]))
        mfcc1.reset()

    if write_num_frames is not None:
        f_num_frames.close()
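# find_free_gpus is not defined in this file. A minimal sketch, assuming it
# returns a comma-separated string of GPU indices with little memory in use,
# parsed from nvidia-smi (hypothetical helper, not necessarily the repo's
# implementation):
import subprocess

def find_free_gpus(num_gpus=1, max_used_mb=100):
    out = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=index,memory.used',
         '--format=csv,noheader,nounits']).decode()
    free = [line.split(',')[0].strip()
            for line in out.strip().splitlines()
            if int(line.split(',')[1]) <= max_used_mb]
    return ','.join(free[:num_gpus])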