Example #1
import os

import chainer
import h5py
import numpy as np
from chainer import Variable, serializers
from scipy.ndimage import shift  # assumed source of the shift() used below

# feature, kaldi_data, system_info, TransformerDiarization, use_single_gpu and
# _gen_chunk_indices are project-local modules and helpers from the surrounding
# package; they are used here as-is.


def infer(args):
    system_info.print_system_info()

    # Prepare model
    in_size = feature.get_input_dim(args.frame_size, args.context_size,
                                    args.input_transform)

    if args.model_type == 'Transformer':
        model = TransformerDiarization(
            in_size,
            n_units=args.hidden_size,
            n_heads=args.transformer_encoder_n_heads,
            n_layers=args.transformer_encoder_n_layers,
            dropout=0,
            alpha=0)
    else:
        raise ValueError('Unknown model type.')

    serializers.load_npz(args.model_file, model)

    if args.gpu >= 0:
        gpuid = use_single_gpu()
        model.to_gpu()

    kaldi_obj = kaldi_data.KaldiData(args.data_dir)
    for recid in kaldi_obj.wavs:
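        # Load audio and compute frame-level features:
        # STFT -> transform -> context splicing -> subsampling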
        data, rate = kaldi_obj.load_wav(recid)
        Y = feature.stft(data, args.frame_size, args.frame_shift)
        Y = feature.transform(Y, transform_type=args.input_transform)
        Y = feature.splice(Y, context_size=args.context_size)
        Y = Y[::args.subsampling]
        out_chunks = []
        with chainer.no_backprop_mode(), chainer.using_config('train', False):
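            # Decode chunk by chunk, carrying the hidden state hs across chunks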
            hs = None
            for start, end in _gen_chunk_indices(len(Y), args.chunk_size):
                Y_chunked = Variable(Y[start:end])
                if args.gpu >= 0:
                    Y_chunked.to_gpu(gpuid)
                hs, ys = model.estimate_sequential(hs, [Y_chunked])
                if args.gpu >= 0:
                    ys[0].to_cpu()
                out_chunks.append(ys[0].data)
                if args.save_attention_weight == 1:
                    att_fname = f"{recid}_{start}_{end}.att.npy"
                    att_path = os.path.join(args.out_dir, att_fname)
                    model.save_attention_weight(att_path)
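        # Stack the chunk outputs (shifted by label_delay if the model defines
        # one) and save them as 'T_hat' in a per-recording HDF5 file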
        outfname = recid + '.h5'
        outpath = os.path.join(args.out_dir, outfname)
        if hasattr(model, 'label_delay'):
            outdata = shift(np.vstack(out_chunks), (-model.label_delay, 0))
        else:
            outdata = np.vstack(out_chunks)
        with h5py.File(outpath, 'w') as wf:
            wf.create_dataset('T_hat', data=outdata)
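The chunking helper _gen_chunk_indices is project-local and not shown in this example; the loop above only relies on it yielding consecutive (start, end) index pairs that cover the feature sequence in windows of at most chunk_size frames. A minimal sketch with that assumed behaviour, not necessarily the project's actual implementation:

def _gen_chunk_indices(data_len, chunk_size):
    # Yield consecutive (start, end) windows of at most chunk_size frames
    # covering the whole sequence (sketch of the assumed behaviour only).
    start = 0
    while start < data_len:
        end = min(data_len, start + chunk_size)
        yield start, end
        start = end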
Example #2

    def __init__(
        self,
        data_dir,
        dtype=np.float32,
        chunk_size=2000,
        context_size=0,
        frame_size=1024,
        frame_shift=256,
        subsampling=1,
        rate=16000,
        input_transform=None,
        use_last_samples=False,
        label_delay=0,
        n_speakers=None,
        shuffle=False,
    ):
        self.data_dir = data_dir
        self.dtype = dtype
        self.chunk_size = chunk_size
        self.context_size = context_size
        self.frame_size = frame_size
        self.frame_shift = frame_shift
        self.subsampling = subsampling
        self.input_transform = input_transform
        self.n_speakers = n_speakers
        self.chunk_indices = []
        self.label_delay = label_delay

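        # Parse the Kaldi-style data directory; the recording list and
        # durations are used below to build chunk indices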
        self.data = kaldi_data.KaldiData(self.data_dir)

        # make chunk indices: filepath, start_frame, end_frame
        for rec in self.data.wavs:
            data_len = int(self.data.reco2dur[rec] * rate / frame_shift)
            data_len = int(data_len / self.subsampling)
            for st, ed in _gen_frame_indices(data_len,
                                             chunk_size,
                                             chunk_size,
                                             use_last_samples,
                                             label_delay=self.label_delay,
                                             subsampling=self.subsampling):
                self.chunk_indices.append(
                    (rec, st * self.subsampling, ed * self.subsampling))
        print(f"{len(self.chunk_indices)} chunks")

        self.shuffle = shuffle
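The snippet above only shows the constructor of the dataset class. Purely as an illustration (not part of the original code), and assuming the class follows Chainer's DatasetMixin convention (hence get_example), a __len__/get_example pair consistent with how chunk_indices is built and with the feature pipeline from Example #1 might look like the sketch below; a real implementation would also return the speaker-activity labels for the chunk.

    def __len__(self):
        return len(self.chunk_indices)

    def get_example(self, i):
        # Sketch only: reuse the feature pipeline from Example #1 for one chunk.
        rec, st, ed = self.chunk_indices[i]
        data, rate = self.data.load_wav(rec)
        Y = feature.stft(data, self.frame_size, self.frame_shift)
        Y = feature.transform(Y, transform_type=self.input_transform)
        Y = feature.splice(Y, context_size=self.context_size)
        # st and ed were stored in pre-subsampling frame units.
        return Y[st:ed:self.subsampling].astype(self.dtype)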