コード例 #1
0
def segments_to_bin_vad(segments_file, num_frames_file, frame_shift,
                        output_path, part_idx, num_parts):
    """Convert a segments file into one binary VAD vector per file id.

    Args:
        segments_file: Path loaded with ``SegmentList.load``.
        num_frames_file: Optional whitespace-separated table with two
            columns (file id, number of frames). When ``None`` the number
            of frames is left for ``to_bin_vad`` to decide.
        frame_shift: Forwarded unchanged to ``SegmentList.to_bin_vad``.
        output_path: Output specifier handed to ``DWF.create``.
        part_idx: Index of the part to process; forwarded to
            ``SegmentList.split`` when ``num_parts > 1``.
        num_parts: Number of parts the segment list is split into.
    """
    utt2num_frames = None
    if num_frames_file is not None:
        utt2num_frames = pd.read_csv(num_frames_file,
                                     sep=r'\s+',
                                     header=None,
                                     names=['file_id', 'num_frames'],
                                     index_col=0)

    segments = SegmentList.load(segments_file)
    if num_parts > 1:
        segments = segments.split(part_idx, num_parts)

    with DWF.create(output_path) as writer:
        for file_id in segments.uniq_file_id:
            logging.info('processing VAD for %s' % (file_id))
            num_frames = None
            if utt2num_frames is not None:
                num_frames = int(utt2num_frames.loc[file_id]['num_frames'])
            vad = segments.to_bin_vad(file_id,
                                      frame_shift=frame_shift,
                                      num_frames=num_frames)
            num_speech_frames = np.sum(vad)
            # Without a num-frames table there is no expected length, so
            # report against the length of the VAD vector actually produced
            # (formatting None with %d would raise TypeError).
            total_frames = num_frames if num_frames is not None else len(vad)
            if total_frames > 0:
                speech_pct = num_speech_frames / total_frames * 100
            else:
                speech_pct = 0.0
            logging.info('for %s detected %d/%d (%.2f %%) speech frames' %
                         (file_id, num_speech_frames, total_frames,
                          speech_pct))
            writer.write(file_id, vad)
コード例 #2
0
def extract_embed(seq_file, model_file, preproc_file, output_path,
                  max_seq_length, pooling_output, write_format, **kwargs):
    """Extract embeddings for every sequence in ``seq_file`` and write them.

    ``model.predict_embed`` returns two vectors per sequence (kept here as
    ``p1`` and ``p2``); ``write_format`` selects what is written.

    Args:
        seq_file: Input specifier handed to ``SDRF.create``.
        model_file: Path loaded with ``SeqQEmbed.load``.
        preproc_file: Optional path loaded with ``TransformList.load`` and
            passed to the reader as ``transform``.
        output_path: Output specifier handed to ``DWF.create``.
        max_seq_length: Forwarded to ``model.build``.
        pooling_output: Forwarded to ``model.build_embed``.
        write_format: ``'p1'`` writes the first vector, ``'p1+p2'`` writes
            both stacked horizontally, anything else writes the second.
        **kwargs: Filtered through ``SDRF.filter_args`` for the reader.
    """
    set_float_cpu('float32')

    sr_args = SDRF.filter_args(**kwargs)

    preproc = None
    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)

    sr = SDRF.create(seq_file, transform=preproc, **sr_args)

    t1 = time.time()

    model = SeqQEmbed.load(model_file)
    model.build(max_seq_length)
    model.build_embed(pooling_output)
    y_dim = model.embed_dim

    # Only the number of sequences is needed; rewind the reader afterwards.
    _, seq_lengths = sr.read_num_rows()
    sr.reset()
    num_seqs = len(seq_lengths)

    p1_y = np.zeros((num_seqs, y_dim), dtype=float_keras())
    p2_y = np.zeros((num_seqs, y_dim), dtype=float_keras())
    keys = []

    for i in range(num_seqs):
        ti1 = time.time()
        key, data = sr.read(1)
        seq_key = key[0]
        seq_data = data[0]

        ti2 = time.time()
        logging.info('Extracting embeddings %d/%d for %s, num_frames: %d' %
                     (i, num_seqs, seq_key, seq_data.shape[0]))
        keys.append(seq_key)
        p1_y[i], p2_y[i] = model.predict_embed(seq_data)

        ti4 = time.time()
        # Log the key itself rather than the one-element list from read(1).
        logging.info('Elapsed time embeddings %d/%d for %s, total: %.2f read: %.2f, vae: %.2f' %
                     (i, num_seqs, seq_key, ti4-ti1, ti2-ti1, ti4-ti2))

    logging.info('Extract elapsed time: %.2f' % (time.time() - t1))

    if write_format == 'p1':
        y = p1_y
    elif write_format == 'p1+p2':
        y = np.hstack((p1_y, p2_y))
    else:
        y = p2_y

    # Context manager so the writer is closed even if write() fails.
    with DWF.create(output_path) as hw:
        hw.write(keys, y)
コード例 #3
0
def compute_mfcc_feats(input_path, output_path,
                       compress, compression_method, write_num_frames, **kwargs):
    """Compute MFCC features for every item in ``input_path``.

    Args:
        input_path: Input specifier; read with ``AR`` when the extractor's
            ``input_step`` is ``'wave'``, otherwise with ``DRF.create``.
        output_path: Output specifier handed to ``DWF.create``.
        compress: Forwarded to ``DWF.create``.
        compression_method: Forwarded to ``DWF.create``.
        write_num_frames: Optional path of a text file receiving one
            ``"<key> <num_frames>"`` line per item.
        **kwargs: Filtered through ``MFCC.filter_args`` and the reader's
            ``filter_args``.
    """
    mfcc_args = MFCC.filter_args(**kwargs)
    mfcc = MFCC(**mfcc_args)

    from_wave = mfcc.input_step == 'wave'
    if from_wave:
        input_args = AR.filter_args(**kwargs)
        reader = AR(input_path, **input_args)
    else:
        input_args = DRF.filter_args(**kwargs)
        reader = DRF.create(input_path, **input_args)

    with DWF.create(output_path, scp_sep=' ',
                    compress=compress,
                    compression_method=compression_method) as writer:
        f_num_frames = None
        if write_num_frames is not None:
            f_num_frames = open(write_num_frames, 'w')

        try:
            for data in reader:
                if from_wave:
                    key, x, fs = data
                else:
                    key, x = data
                logging.info('Extracting MFCC for %s' % (key))
                t1 = time.time()
                y = mfcc.compute(x)
                dt = (time.time() - t1)*1000
                # A coarse clock can report zero elapsed time.
                if dt > 0:
                    rtf = mfcc.frame_shift*y.shape[0]/dt
                else:
                    rtf = float('inf')
                logging.info('Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f' %
                             (key, y.shape[0], dt, rtf))
                writer.write([key], [y])

                if f_num_frames is not None:
                    f_num_frames.write('%s %d\n' % (key, y.shape[0]))

                mfcc.reset()
        finally:
            # Close the frame-count file even when extraction fails midway.
            if f_num_frames is not None:
                f_num_frames.close()
コード例 #4
0
def compute_vad(input_path, output_path,  **kwargs):
    """Compute an energy-based VAD for every audio item in ``input_path``.

    Args:
        input_path: Input specifier read with ``AR``.
        output_path: Output specifier handed to ``DWF.create``.
        **kwargs: Filtered through ``EnergyVAD.filter_args`` and
            ``AR.filter_args``.
    """
    # The original bound the detector to `mfcc` and misspelled the class
    # name, so every later `vad.*` reference raised NameError.
    vad_args = EnergyVAD.filter_args(**kwargs)
    vad = EnergyVAD(**vad_args)

    input_args = AR.filter_args(**kwargs)
    reader = AR(input_path, **input_args)

    with DWF.create(output_path, scp_sep=' ') as writer:
        for data in reader:
            key, x, fs = data
            logging.info('Extracting VAD for %s' % (key))
            t1 = time.time()
            y = vad.compute(x)
            dt = (time.time() - t1)*1000
            # A coarse clock can report zero elapsed time.
            if dt > 0:
                rtf = vad.frame_shift*y.shape[0]/dt
            else:
                rtf = float('inf')
            logging.info('Extracted VAD for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f' %
                         (key, y.shape[0], dt, rtf))
            writer.write([key], [y])

            vad.reset()
コード例 #5
0
ファイル: rttm-to-bin-vad.py プロジェクト: whkanggg/hyperion
def rttm_to_bin_vad(rttm_file, num_frames_file, frame_shift, output_path,
                    part_idx, num_parts):
    """Convert an RTTM file into one binary VAD vector per file id.

    When ``part_idx == 1`` and a num-frames table is available, files listed
    in the table but absent from the RTTM get an all-ones VAD.

    Args:
        rttm_file: Path loaded with ``RTTM.load``.
        num_frames_file: Optional whitespace-separated table with two
            columns (file id, number of frames).
        frame_shift: Forwarded unchanged to ``to_bin_vad``.
        output_path: Output specifier handed to ``DWF.create``.
        part_idx: Index of the part to process; forwarded to ``split`` when
            ``num_parts > 1``.
        num_parts: Number of parts the segment list is split into.
    """
    utt2num_frames = None
    if num_frames_file is not None:
        utt2num_frames = pd.read_csv(num_frames_file,
                                     sep=r'\s+',
                                     header=None,
                                     names=['file_id', 'num_frames'],
                                     index_col=0)

    segments = RTTM.load(rttm_file).to_segment_list()
    # Keep the unsplit list: the "no speech" fallback below must consider
    # every file in the RTTM, not only those in this part.
    segments_orig = copy.deepcopy(segments)
    if num_parts > 1:
        segments = segments.split(part_idx, num_parts)

    with DWF.create(output_path) as writer:
        for file_id in segments.uniq_file_id:
            logging.info('processing VAD for %s' % (file_id))
            num_frames = None
            if utt2num_frames is not None:
                num_frames = int(utt2num_frames.loc[file_id]['num_frames'])
            vad = segments.to_bin_vad(file_id,
                                      frame_shift=frame_shift,
                                      num_frames=num_frames)
            num_speech_frames = np.sum(vad)
            # Without a num-frames table, report against the produced length
            # (formatting None with %d would raise TypeError).
            total_frames = num_frames if num_frames is not None else len(vad)
            if total_frames > 0:
                speech_pct = num_speech_frames / total_frames * 100
            else:
                speech_pct = 0.0
            logging.info('for %s detected %d/%d (%.2f %%) speech frames' %
                         (file_id, num_speech_frames, total_frames,
                          speech_pct))
            writer.write(file_id, vad)

        # The fallback needs the frame counts, so it is skipped when no
        # num-frames table was given (previously a NameError).
        if part_idx == 1 and utt2num_frames is not None:
            files_with_segments = set(segments_orig.uniq_file_id)
            for file_id in utt2num_frames.index:
                if file_id not in files_with_segments:
                    logging.warning(
                        'not speeech detected in %s, putting all to 1' %
                        (file_id))
                    num_frames = int(utt2num_frames.loc[file_id]['num_frames'])
                    vad = np.ones((num_frames, ), dtype='float32')
                    writer.write(file_id, vad)
def compute_mfcc_feats(input_path, output_path, compress, compression_method,
                       write_num_frames, use_gpu, nn_model_path, chunk_size,
                       context, **kwargs):
    """Compute features and run them through an enhancement network.

    Args:
        input_path: Input specifier; read with ``AR`` when the extractor's
            ``input_step`` is ``'wave'``, otherwise with ``DRF.create``.
        output_path: Output specifier handed to ``DWF.create``.
        compress: Forwarded to ``DWF.create``.
        compression_method: Forwarded to ``DWF.create``.
        write_num_frames: Optional path of a text file receiving one
            ``"<key> <num_frames>"`` line per item.
        use_gpu: Use CUDA device 0 when CUDA is available.
        nn_model_path: Checkpoint loaded with ``torch.load``; its
            ``'state_dict'`` entry is loaded into the enhancer.
        chunk_size: Forwarded to ``apply_nnet``.
        context: Forwarded to ``apply_nnet``.
        **kwargs: Filtered through ``MFCC.filter_args`` and the reader's
            ``filter_args``.
    """
    # open device
    if use_gpu and torch.cuda.is_available():
        # CUDA_VISIBLE_DEVICES may be unset; indexing os.environ directly
        # would raise KeyError just for a log line.
        logging.info('CUDA_VISIBLE_DEVICES=%s' %
                     os.environ.get('CUDA_VISIBLE_DEVICES', ''))
        logging.info('init gpu device')
        device = torch.device('cuda', 0)
        # Touch the device once so initialization problems surface here.
        torch.tensor([0]).to(device)
    else:
        logging.info('init cpu device')
        device = torch.device('cpu')

    mfcc_args = MFCC.filter_args(**kwargs)
    mfcc = MFCC(**mfcc_args)
    # PUT YOUR NNET MODEL HERE!!!!
    enhancer = CAN(num_channels=45)
    enhancer.load_state_dict(
        torch.load(nn_model_path, map_location=device)['state_dict'])
    enhancer.to(device)
    enhancer.eval()

    from_wave = mfcc.input_step == 'wave'
    if from_wave:
        input_args = AR.filter_args(**kwargs)
        reader = AR(input_path, **input_args)
    else:
        input_args = DRF.filter_args(**kwargs)
        reader = DRF.create(input_path, **input_args)

    with DWF.create(output_path,
                    scp_sep=' ',
                    compress=compress,
                    compression_method=compression_method) as writer:
        f_num_frames = None
        if write_num_frames is not None:
            f_num_frames = open(write_num_frames, 'w')

        try:
            for data in reader:
                if from_wave:
                    key, x, fs = data
                else:
                    key, x = data
                logging.info('Extracting filter-banks for %s' % (key))
                t1 = time.time()
                y = mfcc.compute(x)

                # run the enhancement network on the extracted features
                logging.info('Running enhancement network')
                y = apply_nnet(y, enhancer, chunk_size, context, device)

                dt = (time.time() - t1) * 1000
                # A coarse clock can report zero elapsed time.
                if dt > 0:
                    rtf = mfcc.frame_shift * y.shape[0] / dt
                else:
                    rtf = float('inf')
                logging.info(
                    'Extracted filter-banks for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f'
                    % (key, y.shape[0], dt, rtf))
                writer.write([key], [y])

                if f_num_frames is not None:
                    f_num_frames.write('%s %d\n' % (key, y.shape[0]))

                mfcc.reset()
        finally:
            # Close the frame-count file even when extraction fails midway.
            if f_num_frames is not None:
                f_num_frames.close()
コード例 #7
0
def extract_embed(seq_file, model_file, preproc_file, output_path, max_length,
                  layer_names, **kwargs):
    """Extract one embedding per sequence in ``seq_file`` and write them.

    Sequences no longer than the effective maximum length are zero-padded
    into a fixed-size buffer. Longer ones are cut into equally sized chunks
    (the last chunk is taken from the end of the sequence, so it may overlap
    the previous one) and the chunk embeddings are averaged.

    Args:
        seq_file: Input specifier handed to ``SDRF.create``.
        model_file: Path loaded with ``SeqEmbed.load``.
        preproc_file: Optional path loaded with ``TransformList.load`` and
            passed to the reader as ``transform``.
        output_path: Output specifier handed to ``DWF.create``.
        max_length: Upper bound on the buffer length; the longest sequence
            length is used instead when it is smaller.
        layer_names: Forwarded to ``model.build_embed``.
        **kwargs: Filtered through ``SDRF.filter_args`` for the reader.
    """
    set_float_cpu('float32')

    sr_args = SDRF.filter_args(**kwargs)

    preproc = None
    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)

    sr = SDRF.create(seq_file, transform=preproc, **sr_args)

    t1 = time.time()

    model = SeqEmbed.load(model_file)
    model.build()
    model.build_embed(layer_names)
    y_dim = model.embed_dim

    _, seq_lengths = sr.read_num_rows()
    sr.reset()
    num_seqs = len(seq_lengths)
    max_length = np.minimum(np.max(seq_lengths), max_length)

    y = np.zeros((num_seqs, y_dim), dtype=float_keras())
    # Single reusable input buffer of shape (1, max_length, x_dim).
    xx = np.zeros((1, max_length, model.x_dim), dtype=float_keras())
    keys = []

    for i in range(num_seqs):
        ti1 = time.time()
        data = sr.read(1)
        key = data[0][0]
        x = data[1][0]

        ti2 = time.time()
        logging.info('Extracting embeddings %d/%d for %s, num_frames: %d' %
                     (i, num_seqs, key, x.shape[0]))
        keys.append(key)
        # Clear padding left over from the previous sequence.
        xx[:, :, :] = 0

        if x.shape[0] <= max_length:
            xx[0, :x.shape[0]] = x
            y[i] = model.predict_embed(xx, batch_size=1)
        else:
            num_chunks = int(np.ceil(float(x.shape[0]) / max_length))
            chunk_size = int(np.ceil(float(x.shape[0]) / num_chunks))
            for j in range(num_chunks - 1):
                start = j * chunk_size
                xx[0, :chunk_size] = x[start:start + chunk_size]
                y[i] += model.predict_embed(xx, batch_size=1).ravel()
            # Last chunk is anchored at the end so it is always full-size.
            xx[0, :chunk_size] = x[-chunk_size:]
            y[i] += model.predict_embed(xx, batch_size=1).ravel()
            y[i] /= num_chunks

        ti4 = time.time()
        logging.info(
            'Elapsed time embeddings %d/%d for %s, total: %.2f read: %.2f, vae: %.2f'
            % (i, num_seqs, key, ti4 - ti1, ti2 - ti1, ti4 - ti2))

    logging.info('Extract elapsed time: %.2f' % (time.time() - t1))

    # Context manager so the writer is closed even if write() fails.
    with DWF.create(output_path) as hw:
        hw.write(keys, y)
コード例 #8
0
ファイル: plot-vector-tsne.py プロジェクト: whkanggg/hyperion
def plot_vector_tsne(iv_file, v_list, preproc_file, output_path, save_embed,
                     output_dim, perplexity, exag, lr, num_iter, init_method,
                     rng_seed, verbose, pca_dim, max_classes, **kwargs):
    """Project vectors with t-SNE and save 2-D and 3-D scatter plots.

    Args:
        iv_file: Vector file handed to ``VCR``.
        v_list: List/spec handed to ``VCR``.
        preproc_file: Optional path loaded with ``TransformList.load``.
        output_path: Directory for plots and (optionally) embeddings;
            created when it does not exist.
        save_embed: When true, the t-SNE outputs are also written as
            ``embed_<N>d.h5`` files.
        output_dim: When greater than 3, an extra t-SNE with this many
            components is computed (only written when ``save_embed``).
        perplexity, exag, lr, num_iter, init_method, rng_seed, verbose:
            Forwarded to ``TSNE`` as ``perplexity``, ``early_exaggeration``,
            ``learning_rate``, ``n_iter``, ``init``, ``random_state`` and
            ``verbose``.
        pca_dim: When positive, vectors are first reduced with ``PCA``.
        max_classes: When positive, only samples with
            ``class_ids < max_classes`` are kept.
        **kwargs: Filtered through ``VCR.filter_args``.
    """
    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    vr_args = VCR.filter_args(**kwargs)
    vcr = VCR(iv_file, v_list, preproc, **vr_args)

    x, class_ids = vcr.read()
    keys = vcr.u2c.key

    t1 = time.time()

    if pca_dim > 0:
        pca = PCA(pca_dim=pca_dim)
        pca.fit(x)
        x = pca.predict(x)

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    def tsne_obj(n):
        # Fresh estimator per dimensionality, sharing all hyper-parameters.
        return TSNE(n_components=n,
                    perplexity=perplexity,
                    early_exaggeration=exag,
                    learning_rate=lr,
                    n_iter=num_iter,
                    init=init_method,
                    random_state=rng_seed,
                    verbose=verbose)

    def write_embed(num_dims, embed):
        # Store one embedding matrix, closing the writer afterwards.
        h5_file = '%s/embed_%dd.h5' % (output_path, num_dims)
        with DWF.create(h5_file) as hw:
            hw.write(keys, embed)

    if max_classes > 0:
        index = class_ids < max_classes
        x = x[index]
        class_ids = class_ids[index]
        # Keep keys aligned with the filtered rows; otherwise the saved
        # embeddings have fewer rows than keys.
        # NOTE(review): assumes vcr.u2c.key is ordered like the rows of x,
        # as the original write(vcr.u2c.key, y) call implied -- confirm.
        keys = np.asarray(keys)[index]

    if output_dim > 3:
        tsne = tsne_obj(output_dim)
        y = tsne.fit_transform(x)

        if save_embed:
            write_embed(output_dim, y)

    tsne = tsne_obj(2)
    y = tsne.fit_transform(x)
    if save_embed:
        write_embed(2, y)

    fig_file = '%s/tsne_2d.pdf' % (output_path)

    # One (color, marker) pair per class id; `markers` and `colors` are
    # expected to be defined at module level.
    color_marker = [(c, m) for m in markers for c in colors]
    for c in np.unique(class_ids):
        idx = class_ids == c
        plt.scatter(y[idx, 0],
                    y[idx, 1],
                    c=color_marker[c][0],
                    marker=color_marker[c][1],
                    label=vcr.class_names[c])

    plt.legend()
    plt.grid(True)
    # Save before show(): with an interactive backend the figure can be
    # gone once show() returns, which would save an empty page.
    plt.savefig(fig_file)
    plt.show()
    plt.clf()

    tsne = tsne_obj(3)
    y = tsne.fit_transform(x)
    if save_embed:
        write_embed(3, y)

    fig_file = '%s/tsne_3d.pdf' % (output_path)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    for c in np.unique(class_ids):
        idx = class_ids == c
        ax.scatter(y[idx, 0],
                   y[idx, 1],
                   y[idx, 2],
                   c=color_marker[c][0],
                   marker=color_marker[c][1],
                   label=vcr.class_names[c])

    plt.grid(True)
    plt.savefig(fig_file)
    plt.show()
    plt.clf()

    logging.info('Elapsed time: %.2f s.' % (time.time() - t1))
コード例 #9
0
def compute_mfcc_feats(input_path, output_path,
                       compress, compression_method, write_num_frames, 
                       use_gpu, nn_model_path, chunk_size, context,
                       **kwargs):
    """Compute MFCCs with an enhancement network applied to the log filter-banks.

    Two ``MFCC`` objects are built from the same arguments: the first stops
    at ``'logfb'`` output, the second starts from ``'logfb'`` input. Between
    them the first column (treated as log-energy) is split off, the
    remaining columns are mean-normalized and passed through the enhancer,
    and the log-energy is shifted by the change in ``logsumexp`` of those
    columns.

    Args:
        input_path: Input specifier; read with ``AR`` when the first
            extractor's ``input_step`` is ``'wave'``, otherwise with
            ``DRF.create``.
        output_path: Output specifier handed to ``DWF.create``.
        compress: Forwarded to ``DWF.create``.
        compression_method: Forwarded to ``DWF.create``.
        write_num_frames: Optional path of a text file receiving one
            ``"<key> <num_frames>"`` line per item.
        use_gpu: Try to use a free GPU when CUDA is available.
        nn_model_path: State dict loaded with ``torch.load`` into the
            enhancer.
        chunk_size: Forwarded to ``apply_nnet``.
        context: Forwarded to ``apply_nnet``.
        **kwargs: Filtered through ``MFCC.filter_args`` and the reader's
            ``filter_args``.
    """
    # open device
    if use_gpu and torch.cuda.is_available():
        os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
        max_tries = 100
        for g in range(max_tries):
            try:
                gpu_ids = find_free_gpus()
                os.environ['CUDA_VISIBLE_DEVICES'] = gpu_ids
                logging.info('CUDA_VISIBLE_DEVICES=%s' % os.environ['CUDA_VISIBLE_DEVICES'])
                logging.info('init gpu device')
                device = torch.device('cuda', 0)
                torch.tensor([0]).to(device)
                break
            except Exception:
                # `except Exception` (not bare) so Ctrl-C and SystemExit
                # still abort the up-to-100 x 10 s retry loop.
                if g < max_tries-1:
                    logging.info('failing init gpu, trying again')
                    time.sleep(10)
                else:
                    logging.info('failing init gpu, using cpu')
                    device = torch.device('cpu')

    else:
        logging.info('init cpu device')
        device = torch.device('cpu')

    mfcc_args1 = MFCC.filter_args(**kwargs)
    mfcc_args2 = copy.deepcopy(mfcc_args1)
    mfcc_args1['output_step'] = 'logfb'
    mfcc_args2['input_step'] = 'logfb'
    logging.debug('kwargs=%s', kwargs)
    logging.debug('mfcc_args1=%s', mfcc_args1)
    logging.debug('mfcc_args2=%s', mfcc_args2)
    mfcc1 = MFCC(**mfcc_args1)
    mfcc2 = MFCC(**mfcc_args2)

    mvn = MVN(norm_var=False, left_context=150, right_context=150)

    # PUT YOUR NNET MODEL HERE!!!!
    enhancer = CGN()
    enhancer.load_state_dict(torch.load(nn_model_path, map_location=device))
    enhancer.to(device)
    enhancer.eval()

    from_wave = mfcc1.input_step == 'wave'
    if from_wave:
        input_args = AR.filter_args(**kwargs)
        reader = AR(input_path, **input_args)
    else:
        input_args = DRF.filter_args(**kwargs)
        reader = DRF.create(input_path, **input_args)

    with DWF.create(output_path, scp_sep=' ',
                    compress=compress,
                    compression_method=compression_method) as writer:
        f_num_frames = None
        if write_num_frames is not None:
            f_num_frames = open(write_num_frames, 'w')

        try:
            for data in reader:
                if from_wave:
                    key, x, fs = data
                else:
                    key, x = data
                logging.info('Extracting filter-banks for %s' % (key))
                t1 = time.time()
                y = mfcc1.compute(x)

                # separate logE and filterbanks
                logE = y[:, 0]
                y = y[:, 1:]

                # estimate log energy from filterbanks
                logEy1 = logsumexp(y, axis=-1)

                # normalize, then run the enhancement network
                logging.info('Running enhancement network')
                y = mvn.normalize(y)
                y = apply_nnet(y, enhancer, chunk_size, context, device)

                # rescale logE by the energy change of the enhanced filterbanks
                logEy2 = logsumexp(y, axis=-1)
                logE = logE + (logEy2 - logEy1)

                # concatenate logE and filterbanks
                y = np.concatenate((logE[:, None], y), axis=-1)

                # apply DCT
                logging.info('Applying DCT')
                y = mfcc2.compute(y)

                dt = (time.time() - t1)*1000
                # A coarse clock can report zero elapsed time.
                if dt > 0:
                    rtf = mfcc1.frame_shift*y.shape[0]/dt
                else:
                    rtf = float('inf')
                logging.info('Extracted filter-banks for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f' %
                             (key, y.shape[0], dt, rtf))
                writer.write([key], [y])

                if f_num_frames is not None:
                    f_num_frames.write('%s %d\n' % (key, y.shape[0]))

                mfcc1.reset()
        finally:
            # Close the frame-count file even when extraction fails midway.
            if f_num_frames is not None:
                f_num_frames.close()
コード例 #10
0
    # PUT YOUR NNET MODEL HERE!!!!
    enhancer = CGN()
    #enhancer.load_state_dict(torch.load(nn_model_path, map_location=device)['state_dict'])
    enhancer.load_state_dict(torch.load(nn_model_path, map_location=device))
    enhancer.to(device)
    enhancer.eval()

    if mfcc1.input_step == 'wave':
        input_args = AR.filter_args(**kwargs)
        reader = AR(input_path, **input_args)
    else:
        input_args = DRF.filter_args(**kwargs)
        reader = DRF.create(input_path, **input_args)

    writer = DWF.create(output_path, scp_sep=' ',
                        compress=compress,
                        compression_method=compression_method)

    if write_num_frames is not None:
        f_num_frames = open(write_num_frames, 'w')
    
    for data in reader:
        if mfcc1.input_step == 'wave':
            key, x, fs = data
        else:
            key, x = data
        logging.info('Extracting filter-banks for %s' % (key))
        t1 = time.time()
        y = mfcc1.compute(x)

        # separate logE and filterbanks