def main(args):
    with open(args.trials) as f:
        trials = [x.split() for x in f]
    utt1s = [x[0] for x in trials]
    utt2s = [x[1] for x in trials]
    if len(trials[0]) == 3:
        tar2int = {'nontarget': 0, 'target': 1}
        target = [tar2int[x[2]] for x in trials]
    else:
        target = None

    with kaldiio.ReadHelper(
            f'scp:{args.enroll_scp_dir}/{args.enroll_scp}') as reader:
        utt2embd_enroll = {utt: embd for utt, embd in reader}

        l_out, l_out_label = (
            np.array([utt2embd_enroll[i] for i in utt2embd_enroll]),
            np.array([i for i in utt2embd_enroll]),
        )

        l_out, l_out_label, _, _ = frontend(args, l_out, l_out_label,
                                            np.zeros((512, 512)),
                                            np.zeros((512, )))

        utt2embd_enroll = {utt: embd for utt, embd in zip(l_out_label, l_out)}

    with kaldiio.ReadHelper(
            f'scp:{args.trial_scp_dir}/{args.trial_scp}') as reader:
        utt2embd_trial = {utt: embd for utt, embd in reader}

        u_out, u_out_label = (
            np.array([utt2embd_trial[i] for i in utt2embd_trial]),
            np.array([i for i in utt2embd_trial]),
        )

        utt2embd_trial = {utt: embd for utt, embd in zip(u_out_label, u_out)}

    utt2embd_enroll = [utt2embd_enroll[utt] for utt in utt1s]
    utt2embd_trial = [utt2embd_trial[utt] for utt in utt2s]

    scores = cosine_scoring(utt2embd_enroll, utt2embd_trial)
    score_file_kaldi = []
    for enroll, trial, score in zip(utt1s, utt2s, scores):
        score_file_kaldi.append([enroll, trial, str(score)])

    with open(args.output, "w") as txt_file:
        for line in score_file_kaldi:
            txt_file.write(" ".join(line) +
                           "\n")  # works with any number of elements in a line

    if target is not None:
        eer, threshold = compute_eer(scores, target)
        print("ROC_EER: {:.2f}".format(eer * 100))
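For reference: cosine_scoring is called above but not shown anywhere in this
listing. A minimal sketch consistent with that call (one score per
enroll/trial pair, plain cosine similarity); treat it as an assumption, not
the original helper.

import numpy as np

def cosine_scoring(enroll_embds, trial_embds):
    enroll = np.asarray(enroll_embds)
    trial = np.asarray(trial_embds)
    # Row-wise dot product divided by the product of the L2 norms.
    dots = np.sum(enroll * trial, axis=1)
    norms = np.linalg.norm(enroll, axis=1) * np.linalg.norm(trial, axis=1)
    return dots / norms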
Example 2
def prepare_data(ivec_scp, data_dir):

    with kaldiio.ReadHelper('scp:' + ivec_scp) as reader:
        ivectors = {}
        for k, iv in reader:
            ivectors[k] = iv

    with open('{}/utt2lang'.format(data_dir), 'r') as input_utt2lang:
        utt2lang_dict = {}
        for line in input_utt2lang:
            utt2lang_dict[line.split(' ')[0]] = line.split(' ')[1].strip('\n')

    with open('{}/utt2spk'.format(data_dir), 'r') as input_utt2spk:
        utt2spk_dict = {}
        for line in input_utt2spk:
            utt2spk_dict[line.split(' ')[0]] = line.split(' ')[1].strip('\n')

    with open('{}/utt2sent'.format(data_dir), 'r') as input_utt2sent:
        utt2sent_dict = {}
        for line in input_utt2sent:
            utt2sent_dict[line.split(' ')[0]] = line.split(' ')[1].strip('\n')

    ivectors_df = pd.DataFrame.from_dict(ivectors, orient='index').sort_index()
    labels_df = pd.DataFrame.from_dict(utt2lang_dict,
                                       orient='index',
                                       columns=["lang"]).sort_index()
    labels_df["spk"] = pd.DataFrame.from_dict(utt2spk_dict, orient='index')
    labels_df["sent"] = pd.DataFrame.from_dict(utt2sent_dict, orient='index')

    return ivectors_df, labels_df
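A hypothetical call (paths invented for illustration), showing that the two
returned frames align row-wise by utterance id thanks to the sort_index()
calls:

ivectors_df, labels_df = prepare_data('exp/ivectors/ivector.scp', 'data/train')
assert (ivectors_df.index == labels_df.index).all()
X = ivectors_df.to_numpy()        # one i-vector per row
y = labels_df["lang"].to_numpy()  # language labels in the same row order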
Example 3
def compute_spectrum():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = float(args.sample_rate)
    config['output_type'] = int(args.output_type)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length

    spectrum = Spectrum.params(config).instantiate()

    # Create the session once; opening a new session per utterance is wasteful.
    sess = tf.compat.v1.Session()

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
          KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                      compress=args.compress, compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            spectrum_test = spectrum(audio_data, args.sample_rate)
            spectrum_feats = spectrum_test.eval(session=sess)
            writer[utt_id] = spectrum_feats
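Note that the loop above still adds a new tf.constant node to the TF1 graph
for every utterance. A sketch of a placeholder-based variant that keeps the
graph fixed; whether the Spectrum op accepts a placeholder this way is an
assumption, so this is an illustration rather than the original code:

audio_ph = tf.compat.v1.placeholder(tf.float32, shape=[None])
feats_op = spectrum(audio_ph, args.sample_rate)
with tf.compat.v1.Session() as sess:
    for utt_id, (sample_rate, array) in reader:
        writer[utt_id] = sess.run(
            feats_op, feed_dict={audio_ph: array.astype(np.float32)})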
Example 4
def main():
    args = parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logger.info(get_commandline_args())

    utt_text_speaker = consolidate_utt_info(scp=None,
                                            text=args.text_file,
                                            utt2spk=args.utt2spk_file)

    with kaldiio.ReadHelper(
            args.rspecifier,
            segments=args.segments) as reader, file_writer_helper(
                args.wspecifier,
                filetype=args.archive_format,
                compress=args.compress,
                compression_method=args.compression_method,
                sample_frequency=args.sample_frequency,
                transform=Transformation(args.feature_config)) as writer:
        for utt_id, (rate, wave) in tqdm.tqdm(reader,
                                              miniters=100,
                                              maxinterval=30):
            utt_dict = {"x": wave, "rate": rate}
            utt_dict.update(utt_text_speaker.get(utt_id, {}))
            try:
                writer[utt_id] = utt_dict
            except Exception as e:
                logger.warning(
                    f"Failed to process utterance {utt_id} with exception:\n{str(e)}"
                )
                continue
Example 5
    def __iter__(self):
        with kaldiio.ReadHelper(self.rspecifier,
                                segments=self.segments) as reader:
            for key, array in reader:
                if self.return_shape:
                    array = array.shape
                yield key, array
Example 6
def compute_fbank_pitch():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = float(args.sample_rate)
    config['upper_frequency_limit'] = float(args.upper_frequency_limit)
    config['lower_frequency_limit'] = float(args.lower_frequency_limit)
    config['filterbank_channel_count'] = float(args.filterbank_channel_count)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length
    config['thres_autoc'] = args.thres_autoc
    config['output_type'] = args.output_type

    fbank_pitch = FbankPitch.params(config).instantiate()

    # Create the session once; opening a new session per utterance is wasteful.
    sess = tf.Session()

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
          KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                      compress=args.compress, compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            fbank_pitch_test = fbank_pitch(audio_data, args.sample_rate)
            fbank_pitch_feats = fbank_pitch_test.eval(session=sess)
            writer[utt_id] = fbank_pitch_feats
Example 7
def compute_stft():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = int(args.sample_rate)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length

    stft = Analyfiltbank.params(config).instantiate()

    # Create the session once; opening a new session per utterance is wasteful.
    sess = tf.Session()

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
          KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                      compress=args.compress, compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            power_spectrum, phase_spectrum = stft(audio_data, args.sample_rate)
            if args.output_type == 1:
                out_feats = power_spectrum.eval(session=sess)
            else:
                out_feats = phase_spectrum.eval(session=sess)
            writer[utt_id] = out_feats
Example 8
def main():
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info('Apply preprocessing: {}'.format(preprocessing))
    else:
        preprocessing = None

    with file_writer_helper(args.wspecifier,
                            filetype=args.filetype,
                            write_num_frames=args.write_num_frames,
                            compress=args.compress,
                            compression_method=args.compression_method,
                            pcm_format=args.format) as writer:
        for utt_id, (rate,
                     array) in kaldiio.ReadHelper(args.rspecifier,
                                                  args.segments):
            if args.filetype == 'mat':
                # Kaldi-matrix doesn't support integer
                array = array.astype(numpy.float32)

            if array.ndim == 1:
                # (Time) -> (Time, Channel)
                array = array[:, None]

            if args.normalize is not None and args.normalize != 1:
                array = array.astype(numpy.float32)
                array = array / (1 << (args.normalize - 1))

            if preprocessing is not None:
                orgtype = array.dtype
                out = preprocessing(array, uttid_list=utt_id)
                out = out.astype(orgtype)

                if args.keep_length:
                    if len(out) < len(array):
                        out = numpy.pad(out, [(0, len(array) - len(out))] +
                                        [(0, 0) for _ in range(out.ndim - 1)],
                                        mode='constant')
                    elif len(out) > len(array):
                        # The length can be changed by stft, for example.
                        out = out[:len(array)]

                array = out

            # shape = (Time, Channel)
            if args.filetype in ['sound.hdf5', 'sound']:
                # Write Tuple[int, numpy.ndarray] (scipy style)
                writer[utt_id] = (rate, array)
            else:
                writer[utt_id] = array
Example 9
def main():
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
            file_writer_helper(args.wspecifier,
                               filetype=args.filetype,
                               write_num_frames=args.write_num_frames,
                               compress=args.compress,
                               compression_method=args.compression_method
                               ) as writer:
        for utt_id, (_, array) in reader:
            array = array.astype(numpy.float32)
            if args.normalize is not None and args.normalize != 1:
                array = array / (1 << (args.normalize - 1))
            spc = spectrogram(x=array,
                              n_fft=args.n_fft,
                              n_shift=args.n_shift,
                              win_length=args.win_length,
                              window=args.window)
            writer[utt_id] = spc
Example 10
def apply_cmvn():
    args = get_parser().parse_args()

    if ':' in args.stats_rspecifier_or_rxfilename:
        is_rspecifier = True
        stats_filetype = 'ark'
        stats_dict = dict(KaldiReader(args.stats_rspecifier_or_rxfilename))
    else:
        is_rspecifier = False
        stats_filetype = 'mat'
        stats = kaldiio.load_mat(args.stats_rspecifier_or_rxfilename)
        stats_dict = {None: stats}

    config = {}
    config['norm_means'] = args.norm_means
    config['norm_vars'] = args.norm_vars
    config['utt2spk'] = args.utt2spk
    config['spk2utt'] = args.spk2utt
    config['reverse'] = args.reverse
    config['std_floor'] = args.std_floor
    config['filetype'] = stats_filetype

    cmvn = CMVN.params(config).instantiate()
    cmvn.call(stats_dict)

    with KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                  compress=args.compress, compression_method=args.compression_method) as writer, \
      kaldiio.ReadHelper(args.rspecifier) as reader:
        for utt, mat in reader:
            mat_new = cmvn.apply_cmvn(mat, utt)
            writer[utt] = mat_new
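For reference, the core of what CMVN applies per feature dimension is mean
(and optionally variance) normalization. A bare-bones per-utterance sketch
with an illustrative name; the CMVN class above additionally handles
speaker-level stats, reverse mode, and the std floor from the config:

import numpy as np

def apply_utt_cmvn(mat, norm_vars=True, std_floor=1e-20):
    # mat: (num_frames, feat_dim) feature matrix for one utterance.
    out = mat - mat.mean(axis=0)
    if norm_vars:
        out = out / np.maximum(mat.std(axis=0), std_floor)
    return out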
Example 11
def compute_mfcc():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = int(args.sample_rate)
    config['upper_frequency_limit'] = float(args.upper_frequency_limit)
    config['lower_frequency_limit'] = float(args.lower_frequency_limit)
    config['filterbank_channel_count'] = float(args.filterbank_channel_count)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length
    config['output_type'] = args.output_type
    config['window_type'] = args.window_type
    config['snip_edges'] = args.snip_edges
    config['preeph_coeff'] = args.preeph_coeff
    config['remove_dc_offset'] = args.remove_dc_offset
    config['is_fbank'] = args.is_fbank
    config['cepstral_lifter'] = args.cepstral_lifter
    config['coefficient_count'] = args.coefficient_count

    mfcc = Mfcc.params(config).instantiate()

    # Create the session once; opening a new session per utterance is wasteful.
    sess = tf.Session()

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
          KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                      compress=args.compress, compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            mfcc_test = tf.squeeze(mfcc(audio_data, args.sample_rate))
            mfcc_feats = mfcc_test.eval(session=sess)
            writer[utt_id] = mfcc_feats
Example 12
def main():
    parser = get_parser()
    args = parser.parse_args()

    # set logger
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    if not os.path.exists(args.figdir):
        os.makedirs(args.figdir)

    with kaldiio.ReadHelper(args.rspecifier) as reader, \
            codecs.open(args.wspecifier, "w", encoding="utf-8") as f:
        for utt_id, (rate, array) in reader:
            assert rate == args.fs
            array = array.astype(numpy.float32)
            if args.normalize is not None and args.normalize != 1:
                array = array / (1 << (args.normalize - 1))
            array_trim, idx = librosa.effects.trim(
                y=array,
                top_db=args.threshold,
                frame_length=args.win_length,
                hop_length=args.shift_length
            )
            start, end = idx / args.fs

            # save figure
            plt.subplot(2, 1, 1)
            plt.plot(array)
            plt.title("Original")
            plt.subplot(2, 1, 2)
            plt.plot(array_trim)
            plt.title("Trim")
            plt.tight_layout()
            plt.savefig(args.figdir + "/" + utt_id + ".png")
            plt.close()

            # added minimum silence part
            start = max(0.0, start - args.min_silence)
            end = min(len(array) / args.fs, end + args.min_silence)

            # write to segments file
            segment = "%s %s %f %f\n" % (
                utt_id, utt_id, start, end
            )
            f.write(segment)
Example 13
def main():
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    # Find the number of utterances
    with open(args.segments) as f:
        n_utt = sum(1 for _ in f)
    logging.info("%d utterances found to be processed." % n_utt)

    # Compute fbank features
    with kaldiio.ReadHelper(
            args.rspecifier,
            segments=args.segments) as reader, file_writer_helper(
                args.wspecifier,
                filetype=args.filetype,
                write_num_frames=args.write_num_frames,
                compress=args.compress,
                compression_method=args.compression_method,
            ) as writer:
        for i, struct in enumerate(reader, start=1):
            logging.info("processing %d/%d(%.2f%%)" %
                         (i, n_utt, 100 * i / n_utt))
            utt_id, (rate, array) = struct
            try:
                assert rate == args.fs
                array = array.astype(numpy.float32)
                if args.normalize is not None and args.normalize != 1:
                    array = array / (1 << (args.normalize - 1))

                lmspc = logmelspectrogram(
                    x=array,
                    fs=args.fs,
                    n_mels=args.n_mels,
                    n_fft=args.n_fft,
                    n_shift=args.n_shift,
                    win_length=args.win_length,
                    window=args.window,
                    fmin=args.fmin,
                    fmax=args.fmax,
                )
                writer[utt_id] = lmspc
            except Exception:
                logging.warning("failed to compute fbank for utt_id=`%s`" %
                                utt_id)
Example 14
def main(args):
    with kaldiio.ReadHelper(f'scp:{args.emb_in}') as reader:
        x_vector_u = {utt:embd for utt, embd in reader}

    R = np.load(args.rotation)

    # Convert from dictionaries to numpy arrays
    u_out, u_out_label = (
        np.array([x_vector_u[i] for i in x_vector_u]),
        np.array([i for i in x_vector_u]),
    )
    _, _, emb, emb_label = frontend(args, np.zeros((512,512)), np.zeros((512,)), u_out, u_out_label)

    R_emb = np.dot(emb, R)

    scp_data = {utt:embd for utt, embd in zip(emb_label, R_emb)}

    kaldiio.save_ark(f'{args.emb_out}/transformed_xvector.ark', scp_data, scp=f'{args.emb_out}/transformed_xvector.scp')
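If R is expected to be a proper rotation (orthogonal matrix), a quick sanity
check could be inserted in main() before the np.dot(emb, R) line; this is an
optional addition, not part of the original example:

assert R.shape[0] == R.shape[1] == emb.shape[1]
assert np.allclose(R.T @ R, np.eye(R.shape[0]), atol=1e-5)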
Example 15
def compute_pitch():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = int(args.sample_rate)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length
    config['snip_edges'] = args.snip_edges
    config['preemph_coeff'] = args.preemph_coeff
    config['min_f0'] = args.min_f0
    config['max_f0'] = args.max_f0
    config['soft_min_f0'] = args.soft_min_f0
    config['penalty_factor'] = args.penalty_factor
    config['lowpass_cutoff'] = args.lowpass_cutoff
    config['resample_freq'] = args.resample_freq
    config['delta_pitch'] = args.delta_pitch
    config['nccf_ballast'] = args.nccf_ballast
    config['lowpass_filter_width'] = args.lowpass_filter_width
    config['upsample_filter_width'] = args.upsample_filter_width
    config['max_frames_latency'] = args.max_frames_latency
    config['frames_per_chunk'] = args.frames_per_chunk
    config['simulate_first_pass_online'] = args.simulate_first_pass_online
    config['recompute_frame'] = args.recompute_frame
    config['nccf_ballast_online'] = args.nccf_ballast_online

    pitch = Pitch.params(config).instantiate()

    # Create the session once; opening a new session per utterance is wasteful.
    sess = tf.Session()

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
          KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                      compress=args.compress, compression_method=args.compression_method) as writer:
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            pitch_test = tf.squeeze(pitch(audio_data, args.sample_rate))
            pitch_feats = pitch_test.eval(session=sess)
            writer[utt_id] = pitch_feats
Example 16
def wav_generator(rspecifier, segments=None):
    """Generate a multi-channel wav array from multiple wav rspecifiers.

    :param List[str] rspecifier:
    :param str segments:

    """

    readers = [kaldiio.ReadHelper(r, segments=segments) for r in rspecifier]
    for vs in zip(*readers):
        for (_, v), r in zip(vs, rspecifier):
            # kaldiio.load_mat can handle both wavfile and kaldi-matrix,
            # and if it is wavfile, returns (rate, ndarray), else ndarray
            if not isinstance(v, tuple):
                raise RuntimeError('"{}" is an invalid wav file.'.format(r))

        utts = [utt_id for utt_id, _ in vs]
        if not all(u == utts[0] for u in utts):
            raise RuntimeError(
                'All keys must be common among wav-rspecifiers: {}'.format(
                    rspecifier))
        rates = [rate for utt_id, (rate, array) in vs]
        if not all(rates[i] == rates[0] for i in range(len(vs))):
            raise RuntimeError('The sampling rate must be common '
                               'among wav-rspecifiers: {}'.format(rspecifier))

        arrays = []
        for utt_id, (rate, array) in vs:
            if array.ndim == 1:
                # shape = (Time, 1)
                array = array[:, None]
            arrays.append(array)

        utt_id = utts[0]
        rate = rates[0]

        # [Time, Channel]
        array = numpy.concatenate(arrays, axis=1)
        yield utt_id, (rate, array)
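A hypothetical call (scp names invented): reading two parallel wav
rspecifiers yields one (rate, array) pair per utterance with the channels
concatenated along the second axis.

for utt_id, (rate, array) in wav_generator(
        ['scp:wav_ch1.scp', 'scp:wav_ch2.scp']):
    print(utt_id, rate, array.shape)  # array has shape (Time, 2)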
Example 17
def main(args):
    with open(os.path.join(args.data, 'trials')) as f:
        trials = [x.split() for x in f]
    utt1s = [x[0] for x in trials]
    utt2s = [x[1] for x in trials]
    if len(trials[0]) == 3:
        tar2int = {'nontarget': 0, 'target': 1}
        target = [tar2int[x[2]] for x in trials]
    else:
        target = None

    embd_scp = os.path.join(args.data, 'embedding.scp')
    with kaldiio.ReadHelper(f'scp:{embd_scp}') as reader:
        utt2embd = {utt: embd for utt, embd in reader}

    embd1s = [utt2embd[utt] for utt in utt1s]
    embd2s = [utt2embd[utt] for utt in utt2s]

    scores = cosine_scoring(embd1s, embd2s)
    score_path = os.path.join(args.data, 'scores.txt')
    np.savetxt(score_path, scores, fmt='%.4f')

    if target is not None:
        eer, threshold = compute_eer(scores, target)
        print("EER: {:.2f}%".format(eer * 100))
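compute_eer is likewise not included in this listing. A common sketch based
on scikit-learn's ROC curve, where the equal error rate is read off at the
point where the false-positive and false-negative rates cross (assumes
sklearn is available; an illustration, not the original helper):

import numpy as np
from sklearn.metrics import roc_curve

def compute_eer(scores, labels):
    fpr, tpr, thresholds = roc_curve(labels, scores)
    fnr = 1 - tpr
    idx = np.nanargmin(np.abs(fnr - fpr))  # where the two error rates meet
    eer = (fpr[idx] + fnr[idx]) / 2
    return eer, thresholds[idx]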
Example 18
        help="name of output file (ivectors.h5f or lda_ivectors.h5f)")
    args, leftovers = parser.parse_known_args()

    print(args.output_name)
    try:
        shutil.rmtree('{}/tmp'.format(args.target_dir))
    except OSError:
        pass

    if os.path.exists('{}/{}'.format(args.target_dir, args.output_name)):
        os.remove('{}/{}'.format(args.target_dir, args.output_name))

    os.makedirs('{}/tmp'.format(args.target_dir))

    with kaldiio.ReadHelper('scp:{}'.format(args.feats_file)) as reader:
        filenames = []
        times = np.array([0])
        for key, numpy_array in reader:
            filenames.append(key)
            ivector_2d = np.expand_dims(numpy_array.astype(np.float64), axis=0)
            np.savez('{}/tmp/{}'.format(args.target_dir, key),
                     features=ivector_2d,
                     time=times)
    any2h5features.convert('{}/tmp/'.format(args.target_dir),
                           '{}/{}'.format(args.target_dir, args.output_name))
    print(args.target_dir, args.output_name)

    # shutil.rmtree('{}/tmp'.format(args.target_dir))
Example 19
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--fs', type=int, help='Sampling frequency')
    parser.add_argument('--fmax',
                        type=int,
                        default=None,
                        nargs='?',
                        help='Maximum frequency')
    parser.add_argument('--fmin',
                        type=int,
                        default=None,
                        nargs='?',
                        help='Minimum frequency')
    parser.add_argument('--n_mels',
                        type=int,
                        default=80,
                        help='Number of mel basis')
    parser.add_argument('--n_fft',
                        type=int,
                        default=1024,
                        help='FFT length in point')
    parser.add_argument('--n_shift',
                        type=int,
                        default=512,
                        help='Shift length in point')
    parser.add_argument('--win_length',
                        type=int,
                        default=None,
                        nargs='?',
                        help='Analysis window length in point')
    parser.add_argument('--window',
                        type=str,
                        default='hann',
                        choices=['hann', 'hamming'],
                        help='Type of window')
    parser.add_argument('--write-num-frames',
                        type=str,
                        help='Specify wspecifier for utt2num_frames')
    parser.add_argument('--filetype',
                        type=str,
                        default='mat',
                        choices=['mat', 'hdf5'],
                        help='Specify the file format for output. '
                        '"mat" is the matrix format in kaldi')
    parser.add_argument('--compress',
                        type=strtobool,
                        default=False,
                        help='Save in compressed format')
    parser.add_argument(
        '--compression-method',
        type=int,
        default=2,
        help='Specify the method(if mat) or gzip-level(if hdf5)')
    parser.add_argument('--verbose',
                        '-V',
                        default=0,
                        type=int,
                        help='Verbose option')
    parser.add_argument('--normalize',
                        choices=[1, 16, 24, 32],
                        type=int,
                        default=None,
                        help='Give the bit depth of the PCM, '
                        'then normalizes data to scale in [-1,1]')
    parser.add_argument('rspecifier', type=str, help='WAV scp file')
    parser.add_argument('--segments',
                        type=str,
                        help='segments-file format: each line is '
                        '<segment-id> <recording-id> <start-time> <end-time>, '
                        'e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5')
    parser.add_argument('wspecifier', type=str, help='Write specifier')
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
            FileWriterWrapper(args.wspecifier,
                              filetype=args.filetype,
                              write_num_frames=args.write_num_frames,
                              compress=args.compress,
                              compression_method=args.compression_method
                              ) as writer:
        for utt_id, (rate, array) in reader:
            assert rate == args.fs
            array = array.astype(numpy.float32)
            if args.normalize is not None and args.normalize != 1:
                array = array / (1 << (args.normalize - 1))

            lmspc = logmelspectrogram(x=array,
                                      fs=args.fs,
                                      n_mels=args.n_mels,
                                      n_fft=args.n_fft,
                                      n_shift=args.n_shift,
                                      win_length=args.win_length,
                                      window=args.window,
                                      fmin=args.fmin,
                                      fmax=args.fmax)
            writer[utt_id] = lmspc
Example 20
import torch
from data_io import load_dataset, read_lab_fea_refac01
import kaldiio
import os
import numpy as np


def check_dir(path):
    if not os.path.exists(path):
        os.makedirs(path)


if __name__ == '__main__':
    feat_opts = "apply-cmvn --utt2spk=ark:/users/liuli/project/kaldi/egs/timit/s5/data/GSC_enroll_customized/utt2spk ark:/users/liuli/project/kaldi/egs/timit/s5/GSC_fbank/cmvn_GSC_enroll_customized.ark ark:- ark:- |"
    scp_file = "/users/liuli/project/kaldi/egs/timit/s5/data/GSC_enroll_customized/feats.scp"
    output_dir = "/users/liuli/database/features/GSC_V2/win25ms_hop10ms_41fbank_cmvn/enroll_customized"

    ark_file = "ark:copy-feats scp:" + scp_file + " ark:- | " + feat_opts
    idx = 0
    with kaldiio.ReadHelper(ark_file) as reader:
        for key, numpy_array in reader:
            idx += 1
            label = key.split("-")[0]
            file_name = key.split("-")[1]
            save_dir = os.path.join(output_dir, label)
            check_dir(save_dir)
            save_path = os.path.join(save_dir, file_name + ".npy")
            np.save(save_path, numpy_array)
            if idx % 1000 == 0:
                print("{}/{} files finished".format(idx, 3600))
Example 21
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--fs', type=int, help='Sampling frequency')
    parser.add_argument('--threshold',
                        type=float,
                        default=60,
                        help='Threshold in decibels')
    parser.add_argument('--win_length',
                        type=int,
                        default=1024,
                        help='Analysis window length in point')
    parser.add_argument('--shift_length',
                        type=int,
                        default=256,
                        help='Shift length in point')
    parser.add_argument('--min_silence',
                        type=float,
                        default=0.01,
                        help='minimum silence length')
    parser.add_argument('--verbose',
                        '-V',
                        default=0,
                        type=int,
                        help='Verbose option')
    parser.add_argument('--normalize',
                        choices=[1, 16, 24, 32],
                        type=int,
                        default=None,
                        help='Give the bit depth of the PCM, '
                        'then normalizes data to scale in [-1,1]')
    parser.add_argument('rspecifier', type=str, help='WAV scp file')
    parser.add_argument('wspecifier', type=str, help='Segments file')
    args = parser.parse_args()

    # set logger
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    with kaldiio.ReadHelper(args.rspecifier) as reader, \
            codecs.open(args.wspecifier, "w", encoding="utf-8") as f:
        for utt_id, (rate, array) in reader:
            assert rate == args.fs
            array = array.astype(numpy.float32)
            if args.normalize is not None and args.normalize != 1:
                array = array / (1 << (args.normalize - 1))
            array_trim, idx = librosa.effects.trim(
                y=array,
                top_db=args.threshold,
                frame_length=args.win_length,
                hop_length=args.shift_length)
            start, end = idx / args.fs

            # added minimum silence part
            start = max(0.0, start - args.min_silence)
            end = min(len(array) / args.fs, end + args.min_silence)

            # write to segments file
            segment = "%s_%s_%s %s %f %f\n" % (utt_id, _time_to_str(start),
                                               _time_to_str(end), utt_id,
                                               start, end)
            f.write(segment)
Example 22
def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description=
        "Preprocess audio and then extract features (See detail in parallel_wavegan/bin/preprocess.py)."
    )
    parser.add_argument(
        "--scp",
        default=None,
        type=str,
        help=
        "kaldi-style wav.scp file. you need to specify either scp or rootdir.")
    parser.add_argument(
        "--segments",
        default=None,
        type=str,
        help=
        "kaldi-style segments file. if used, you must specify both scp and segments."
    )
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        help=
        "directory including wav files. you need to specify either scp or rootdir."
    )
    parser.add_argument("--dumpdir",
                        type=str,
                        required=True,
                        help="directory to dump feature files.")
    parser.add_argument("--config",
                        type=str,
                        required=True,
                        help="yaml format configuration file.")
    parser.add_argument("--n_jobs",
                        type=int,
                        default=16,
                        help="number of parallel jobs. (default=16)")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning('skip DEBUG/INFO messages')

    # load config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check arguments
    if (args.scp is not None and args.rootdir is not None) or \
            (args.scp is None and args.rootdir is None):
        raise ValueError("Please specify either rootdir or scp.")

    # get dataset
    if args.scp is not None:
        dataset = kaldiio.ReadHelper(f"scp:{args.scp}", segments=args.segments)
    else:
        dataset = AudioDataset(args.rootdir,
                               "*.wav",
                               audio_load_fn=sf.read,
                               return_filename=True)

    # check directory existence
    if not os.path.exists(args.dumpdir):
        os.makedirs(args.dumpdir, exist_ok=True)

    # define function for parallel processing
    def _process_single_file(data):
        # parse inputs
        if args.scp is not None:
            utt_id, (fs, audio) = data
            audio = audio.astype(np.float32)
            audio /= (1 << (16 - 1))  # assume that wav is PCM 16 bit
        else:
            name, (audio, fs) = data
            utt_id = os.path.basename(name).replace(".wav", "")

        # check
        assert len(audio.shape) == 1, \
            f"{utt_id} seems to be multi-channel signal."
        assert fs == config["sampling_rate"], \
            f"{utt_id} seems to have a different sampling rate."
        assert np.abs(audio).max() <= 1.0, \
            f"{utt_id} seems to be different from 16 bit PCM."

        # trim silence
        if config["trim_silence"]:
            audio, _ = librosa.effects.trim(
                audio,
                top_db=config["trim_threshold_in_db"],
                frame_length=config["trim_frame_size"],
                hop_length=config["trim_hop_size"])

        # extract feature
        mel = logmelfilterbank(audio,
                               fs,
                               fft_size=config["fft_size"],
                               hop_size=config["hop_size"],
                               win_length=config["win_length"],
                               window=config["window"],
                               num_mels=config["num_mels"],
                               fmin=config["fmin"],
                               fmax=config["fmax"])

        # make sure the audio length and feature length are matched
        audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
        audio = audio[:len(mel) * config["hop_size"]]
        assert len(mel) * config["hop_size"] == len(audio)

        # apply global gain
        if config["global_gain_scale"] > 0.0:
            audio *= config["global_gain_scale"]
            if np.abs(audio).max() > 1.0:
                logging.warning(f"{utt_id} causes clipping. "
                                f"it is better to re-consider global gain scale.")
                return

        # save
        if config["format"] == "hdf5":
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "wave",
                       audio.astype(np.float32))
            write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "feats",
                       mel.astype(np.float32))
        elif config["format"] == "npy":
            np.save(os.path.join(args.dumpdir, f"{utt_id}-wave.npy"),
                    audio.astype(np.float32),
                    allow_pickle=False)
            np.save(os.path.join(args.dumpdir, f"{utt_id}-feats.npy"),
                    mel.astype(np.float32),
                    allow_pickle=False)
        else:
            raise ValueError("support only hdf5 or npy format.")

    # process in parallel
    Parallel(n_jobs=args.n_jobs, verbose=args.verbose)(
        [delayed(_process_single_file)(data) for data in tqdm(dataset)])
Example 23
from apc_model import APCModel
from utils import PrenetConfig, RNNConfig
# added by Sameer
import kaldiio
import sys

feats_scp = sys.argv[1]
segments = sys.argv[2]
scp_file = sys.argv[3]

ark_file = scp_file.replace('.scp', '.ark')
writer = kaldiio.WriteHelper('ark,scp:%s,%s' % (ark_file, scp_file))

if segments:
    reader = kaldiio.ReadHelper('scp:%s' % feats_scp, segments=segments)
else:
    reader = kaldiio.ReadHelper('scp:%s' % feats_scp)


def main():
    prenet_config = None
    rnn_config = RNNConfig(input_size=80,
                           hidden_size=512,
                           num_layers=3,
                           dropout=0.,
                           residual=True)  # Sameer Added residual=True
    pretrained_apc = APCModel(mel_dim=80,
                              prenet_config=prenet_config,
                              rnn_config=rnn_config).cuda()
Example 24
    def __iter__(self):
        if self.filetype == 'mat':
            with kaldiio.ReadHelper(self.rspecifier) as reader:
                for key, array in reader:
                    if self.return_shape:
                        array = array.shape
                    yield key, array

        elif self.filetype == 'sound':
            if ':' not in self.rspecifier:
                raise ValueError('Give "rspecifier" such as "scp:some.scp": {}'
                                 .format(self.rspecifier))
            ark_or_scp, filepath = self.rspecifier.split(':', 1)
            if ark_or_scp != 'scp':
                raise ValueError('Only supporting "scp" for sound file: {}'
                                 .format(ark_or_scp))
            with io.open(filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    key, sound_file_path = line.rstrip().split(None, 1)
                    # Assume PCM16
                    array, rate = soundfile.read(sound_file_path,
                                                 dtype='int16')
                    # Change Tuple[ndarray, int] -> Tuple[int, ndarray]
                    # (soundfile style -> scipy style)

                    if self.return_shape:
                        array = array.shape
                    yield key, (rate, array)

        elif self.filetype in ['hdf5', 'sound.hdf5']:
            if ':' not in self.rspecifier:
                raise ValueError('Give "rspecifier" such as "ark:some.ark": {}'
                                 .format(self.rspecifier))
            ark_or_scp, filepath = self.rspecifier.split(':', 1)
            if ark_or_scp not in ['ark', 'scp']:
                raise ValueError('Must be scp or ark: {}'.format(ark_or_scp))

            if ark_or_scp == 'scp':
                hdf5_dict = {}
                with io.open(filepath, 'r', encoding='utf-8') as f:
                    for line in f:
                        key, value = line.rstrip().split(None, 1)

                        if ':' not in value:
                            raise RuntimeError(
                                'scp file for hdf5 should be like: '
                                '"uttid filepath.h5:key": {}({})'
                                .format(line, filepath))
                        path, h5_key = value.split(':', 1)

                        hdf5_file = hdf5_dict.get(path)
                        if hdf5_file is None:
                            if self.filetype == 'sound.hdf5':
                                hdf5_file = SoundHDF5File(path, 'r')
                            else:
                                hdf5_file = h5py.File(path, 'r')
                            hdf5_dict[path] = hdf5_file

                        if self.filetype == 'sound.hdf5':
                            # Change Tuple[ndarray, int] -> Tuple[int, ndarray]
                            # (soundfile style -> scipy style)
                            array, rate = hdf5_file[h5_key]

                            if self.return_shape:
                                array = array.shape
                            yield key, (rate, array)
                        else:
                            if self.return_shape:
                                yield key, hdf5_file[h5_key].shape
                            else:
                                yield key, hdf5_file[h5_key][()]

                # Closing all files
                for k in hdf5_dict:
                    hdf5_dict[k].close()

            else:
                if filepath == '-':
                    # Required h5py>=2.9
                    if PY2:
                        filepath = io.BytesIO(sys.stdin.read())
                    else:
                        filepath = io.BytesIO(sys.stdin.buffer.read())
                if self.filetype == 'sound.hdf5':
                    for key, (r, a) in SoundHDF5File(filepath, 'r').items():
                        if self.return_shape:
                            a = a.shape
                        yield key, (r, a)
                else:
                    with h5py.File(filepath, 'r') as f:
                        for key in f:
                            if self.return_shape:
                                yield key, f[key].shape
                            else:
                                yield key, f[key][()]
        else:
            raise ValueError(
                'Not supporting: filetype={}'.format(self.filetype))
Example 25
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--write-num-frames',
                        type=str,
                        help='Specify wspecifier for utt2num_frames')
    parser.add_argument('--filetype',
                        type=str,
                        default='mat',
                        choices=['mat', 'hdf5', 'sound.hdf5', 'sound'],
                        help='Specify the file format for output. '
                        '"mat" is the matrix format in kaldi')
    parser.add_argument('--format',
                        type=str,
                        default=None,
                        help='The file format for output pcm. '
                        'This option is only valid '
                        'when "--filetype" is "sound.hdf5" or "sound"')
    parser.add_argument('--compress',
                        type=strtobool,
                        default=False,
                        help='Save in compressed format')
    parser.add_argument(
        '--compression-method',
        type=int,
        default=2,
        help='Specify the method(if mat) or gzip-level(if hdf5)')
    parser.add_argument('--verbose',
                        '-V',
                        default=0,
                        type=int,
                        help='Verbose option')
    parser.add_argument('--normalize',
                        choices=[1, 16, 24, 32],
                        type=int,
                        default=None,
                        help='Give the bit depth of the PCM, '
                        'then normalizes data to scale in [-1,1]')
    parser.add_argument('--preprocess-conf',
                        type=str,
                        default=None,
                        help='The configuration file for the pre-processing')
    parser.add_argument('--keep-length',
                        type=strtobool,
                        default=True,
                        help='Truncating or zero padding if the output length '
                        'is changed from the input by preprocessing')
    parser.add_argument('rspecifier', type=str, help='WAV scp file')
    parser.add_argument('--segments',
                        type=str,
                        help='segments-file format: each line is '
                        '<segment-id> <recording-id> <start-time> <end-time>, '
                        'e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5')
    parser.add_argument('wspecifier', type=str, help='Write specifier')
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info('Apply preprocessing: {}'.format(preprocessing))
    else:
        preprocessing = None

    with FileWriterWrapper(args.wspecifier,
                           filetype=args.filetype,
                           write_num_frames=args.write_num_frames,
                           compress=args.compress,
                           compression_method=args.compression_method,
                           pcm_format=args.format) as writer:
        for utt_id, (rate,
                     array) in kaldiio.ReadHelper(args.rspecifier,
                                                  args.segments):
            if args.filetype == 'mat':
                # Kaldi-matrix doesn't support integer
                array = array.astype(numpy.float32)

            if array.ndim == 1:
                # (Time) -> (Time, Channel)
                array = array[:, None]

            if args.normalize is not None and args.normalize != 1:
                array = array.astype(numpy.float32)
                array = array / (1 << (args.normalize - 1))

            if preprocessing is not None:
                orgtype = array.dtype
                out = preprocessing(array, uttid_list=utt_id)
                out = out.astype(orgtype)

                if args.keep_length:
                    if len(out) < len(array):
                        out = numpy.pad(out, [(0, len(array) - len(out))] +
                                        [(0, 0) for _ in range(out.ndim - 1)],
                                        mode='constant')
                    elif len(out) > len(array):
                        # The length can be changed by stft, for example.
                        out = out[:len(array)]

                array = out

            # shape = (Time, Channel)
            if args.filetype in ['sound.hdf5', 'sound']:
                # Write Tuple[int, numpy.ndarray] (scipy style)
                writer[utt_id] = (rate, array)
            else:
                writer[utt_id] = array
Example 26
    parser.add_argument(
        "train",
        help="train directory - just basename. Eg: <train_bil_eng-ger>")
    args, leftovers = parser.parse_known_args()

    train = args.train
    with open('data/emime/{}/utt2lang'.format(train), 'r') as input_utt2lang:
        utt2lang_dict = {}
        for line in input_utt2lang:
            utt2lang_dict[line.split(' ')[0]] = line.split(' ')[1].strip('\n')

    ivec = 'exp_emime/ivectors-deltassdc/ivectors_128_tr-{}_ts-{}/ivector.scp'.format(
        train, train)

    with kaldiio.ReadHelper('scp:' + ivec) as reader:
        ivectors = {}
        for k, iv in reader:
            ivectors[k] = iv

    # ivectors_df = pd.DataFrame.from_dict(ivectors)
    ivectors_df = pd.DataFrame.from_dict(ivectors,
                                         orient='index')  # this is our y

    # utt2lang_df = pd.DataFrame(utt2lang_dict, index=["lang"])
    predictor_df = pd.DataFrame.from_dict(
        utt2lang_dict, orient='index', columns=["lang"],
        dtype="category")  # category dtype so we can do dummy encoding
    predictor_df["lang_dich"] = predictor_df[
        "lang"].cat.codes  # encode language as a dichotomous code
Example 27
def main():
    """Run decoding process."""
    parser = argparse.ArgumentParser(
        description="Decode dumped features with trained Parallel WaveGAN Generator.")
    parser.add_argument("--scp", default=None, type=str,
                        help="Kaldi-style feats.scp file.")
    parser.add_argument("--dumpdir", default=None, type=str,
                        help="Directory including feature files.")
    parser.add_argument("--outdir", default=None, type=str, required=True,
                        help="Directory to save generated speech.")
    parser.add_argument("--checkpoint", default=None, type=str, required=True,
                        help="Checkpoint file.")
    parser.add_argument("--config", default=None, type=str,
                        help="Yaml format configuration file.")
    parser.add_argument("--verbose", type=int, default=1,
                        help="logging level (higher is more logging)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning("skip DEBUG/INFO messages")

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # load config
    if args.config is None:
        dirname = os.path.dirname(args.checkpoint)
        args.config = os.path.join(dirname, "config.yml")
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    # check arguments
    if (args.scp is not None and args.dumpdir is not None) or \
            (args.scp is None and args.dumpdir is None):
        raise ValueError("Please specify either dumpdir or scp.")

    # get dataset
    if args.scp is None:
        if config["format"] == "hdf5":
            mel_query = "*.h5"
            mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
        elif config["format"] == "npy":
            mel_query = "*-feats.npy"
            mel_load_fn = np.load
        else:
            raise ValueError("support only hdf5 or npy format.")
        dataset = MelDataset(
            args.dumpdir,
            mel_query=mel_query,
            mel_load_fn=mel_load_fn,
            return_filename=True)
        logging.info(f"the number of features to be decoded = {len(dataset)}.")
    else:
        dataset = kaldiio.ReadHelper(f"scp:{args.scp}")
        logging.info(f"features loaded from {args.scp}.")

    # setup
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model = ParallelWaveGANGenerator(**config["generator_params"])
    model.load_state_dict(torch.load(args.checkpoint, map_location="cpu")["model"]["generator"])
    model.remove_weight_norm()
    model = model.eval().to(device)
    logging.info(f"loaded model parameters from {args.checkpoint}.")

    # start generation
    pad_size = (config["generator_params"]["aux_context_window"],
                config["generator_params"]["aux_context_window"])
    total_rtf = 0.0
    with torch.no_grad(), tqdm(dataset, desc="[decode]") as pbar:
        for idx, (feat_path, c) in enumerate(pbar, 1):
            # generate each utterance
            z = torch.randn(1, 1, c.shape[0] * config["hop_size"]).to(device)
            c = np.pad(c, (pad_size, (0, 0)), "edge")
            c = torch.FloatTensor(c).unsqueeze(0).transpose(2, 1).to(device)
            start = time.time()
            y = model(z, c).view(-1).cpu().numpy()
            rtf = (time.time() - start) / (len(y) / config["sampling_rate"])
            pbar.set_postfix({"RTF": rtf})
            total_rtf += rtf

            # save as PCM 16 bit wav file
            utt_id = os.path.splitext(os.path.basename(feat_path))[0]
            sf.write(os.path.join(config["outdir"], f"{utt_id}_gen.wav"),
                     y, config["sampling_rate"], "PCM_16")

    # report average RTF
    logging.info(f"finished generation of {idx} utterances (RTF = {total_rtf / idx:.03f}).")
Example 28
def main(cmd_args):
    parser = get_parser()
    args, _ = parser.parse_known_args(cmd_args)

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format=
            '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s')
    else:
        logging.basicConfig(
            level=logging.WARN,
            format=
            '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s')
        logging.warning('Skip DEBUG/INFO messages')

    # display PYTHONPATH
    logging.info('python path = ' + os.environ.get('PYTHONPATH', '(None)'))

    # set random seed
    logging.info('random seed = %d' % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    set_deterministic_pytorch(args)

    logging.info("total number of speakers is %d" % args.nClasses)
    spk_model = SpeakerNet(nClasses=args.nClasses,
                           nPerSpeaker=args.nPerSpeaker,
                           trainfunc=args.trainfunc,
                           nOut=512)

    if args.spk_model is not None:
        spk_model.loadParameters(args.spk_model)
    spk_model.eval()
    mean = np.array([[
        -1.7101e+08, -1.727767e+08, -1.654258e+08, -1.568423e+08, -1.47768e+08,
        -1.355978e+08, -1.337955e+08, -1.290715e+08, -1.292888e+08,
        -1.333105e+08, -1.380836e+08, -1.388845e+08, -1.445241e+08,
        -1.438754e+08, -1.428372e+08, -1.428697e+08, -1.417773e+08,
        -1.400568e+08, -1.448087e+08, -1.459874e+08, -1.47229e+08,
        -1.490556e+08, -1.499799e+08, -1.522063e+08, -1.590756e+08,
        -1.618226e+08, -1.651485e+08, -1.684847e+08, -1.692581e+08,
        -1.714363e+08, -1.763494e+08, -1.776152e+08, -1.789162e+08,
        -1.805202e+08, -1.798933e+08, -1.818852e+08, -1.852947e+08,
        -1.860893e+08, -1.873477e+08, -1.889484e+08, -1.873008e+08,
        -1.891793e+08, -1.917609e+08, -1.932594e+08, -1.934982e+08,
        -1.90069e+08, -1.967007e+08, -1.955583e+08, -1.932292e+08,
        -2.001965e+08, -1.926799e+08, -2.013976e+08, -1.932717e+08,
        -1.997551e+08, -1.955731e+08, -1.958617e+08, -1.967825e+08,
        -1.952326e+08, -1.931164e+08, -1.947601e+08, -1.94064e+08,
        -1.937533e+08, -1.93948e+08, -1.940927e+08, -1.945755e+08,
        -1.955468e+08, -1.96344e+08, -1.963595e+08, -1.971519e+08,
        -1.991344e+08, -1.989762e+08, -2.000582e+08, -2.019397e+08,
        -2.019519e+08, -2.024301e+08, -2.031892e+08, -2.029932e+08,
        -2.029679e+08, -2.033156e+08, -2.033823e+08, -2.03208e+08,
        -2.036384e+08, -2.03879e+08, -2.04647e+08, -2.06028e+08, -2.060116e+08,
        -2.070609e+08, -2.071168e+08, -2.083309e+08, -2.092469e+08,
        -2.103796e+08, -2.122868e+08, -2.135678e+08, -2.144521e+08,
        -2.158103e+08, -2.171439e+08, -2.176665e+08, -2.191257e+08,
        -2.193856e+08, -2.21079e+08, -2.226874e+08, -2.247855e+08,
        -2.267768e+08, -2.286809e+08, -2.311216e+08, -2.33142e+08,
        -2.352095e+08, -2.373178e+08, -2.393992e+08, -2.415607e+08,
        -2.436022e+08, -2.450806e+08, -2.462217e+08, -2.47608e+08,
        -2.483978e+08, -2.495429e+08, -2.495807e+08, -2.501201e+08,
        -2.504308e+08, -2.506836e+08, -2.518955e+08, -2.528667e+08,
        -2.538843e+08, -2.553601e+08, -2.571577e+08, -2.592016e+08,
        -2.737314e+08, -3.25694e+08
    ]])
    var = np.array([[
        3.875797e+08, 3.972777e+08, 3.76892e+08, 3.590407e+08, 3.36797e+08,
        2.982351e+08, 2.993923e+08, 2.900205e+08, 2.903182e+08, 3.00258e+08,
        3.139445e+08, 3.133095e+08, 3.316776e+08, 3.290742e+08, 3.259625e+08,
        3.292938e+08, 3.253266e+08, 3.20113e+08, 3.353506e+08, 3.40549e+08,
        3.424283e+08, 3.454718e+08, 3.482779e+08, 3.577333e+08, 3.827005e+08,
        3.899876e+08, 4.01662e+08, 4.141465e+08, 4.154033e+08, 4.238292e+08,
        4.437099e+08, 4.463138e+08, 4.495017e+08, 4.545714e+08, 4.517053e+08,
        4.601415e+08, 4.730579e+08, 4.755685e+08, 4.813327e+08, 4.884872e+08,
        4.809006e+08, 4.883675e+08, 5.00223e+08, 5.064776e+08, 5.080264e+08,
        4.91717e+08, 5.215152e+08, 5.169479e+08, 5.060737e+08, 5.381505e+08,
        5.023963e+08, 5.430141e+08, 5.040811e+08, 5.339064e+08, 5.142676e+08,
        5.158492e+08, 5.202875e+08, 5.131353e+08, 5.043084e+08, 5.129934e+08,
        5.087678e+08, 5.064136e+08, 5.083315e+08, 5.083852e+08, 5.09834e+08,
        5.150194e+08, 5.177091e+08, 5.167306e+08, 5.197394e+08, 5.282414e+08,
        5.270312e+08, 5.324564e+08, 5.408028e+08, 5.407178e+08, 5.426285e+08,
        5.456758e+08, 5.454526e+08, 5.462478e+08, 5.481372e+08, 5.508704e+08,
        5.496423e+08, 5.518889e+08, 5.532486e+08, 5.56079e+08, 5.627578e+08,
        5.617894e+08, 5.666932e+08, 5.67652e+08, 5.73079e+08, 5.768822e+08,
        5.817027e+08, 5.912957e+08, 5.977753e+08, 6.0268e+08, 6.094717e+08,
        6.166043e+08, 6.196362e+08, 6.269311e+08, 6.276106e+08, 6.369116e+08,
        6.44361e+08, 6.551513e+08, 6.656342e+08, 6.762929e+08, 6.899264e+08,
        7.008929e+08, 7.117181e+08, 7.238042e+08, 7.350025e+08, 7.47482e+08,
        7.59422e+08, 7.681328e+08, 7.75756e+08, 7.834833e+08, 7.868992e+08,
        7.938968e+08, 7.929719e+08, 7.966068e+08, 7.983973e+08, 7.993377e+08,
        8.061261e+08, 8.111478e+08, 8.169364e+08, 8.25449e+08, 8.366562e+08,
        8.486715e+08, 9.377093e+08, 1.289456e+09
    ]])
    num_sum = 8.478675e+07
    with kaldiio.ReadHelper("scp:%s" %
                            args.read_file) as reader, kaldiio.WriteHelper(
                                'ark,scp:%s.ark,%s.scp' %
                                (args.write_file, args.write_file)) as writer:
        for key, numpy_array in reader:
            with torch.no_grad():
                length = len(numpy_array)
                numpy_array = numpy_array[20:-20]
                # numpy_array = numpy_array[20:]
                # numpy_array = numpy_array[:-20]
                # numpy_array = numpy_array - mean/num_sum
                # numpy_array = numpy_array / ( var/num_sum - (mean/num_sum)**2)
                torch_array = torch.from_numpy(numpy_array).unsqueeze(
                    0).float()

                logging.info(torch_array.size())
                writer[key] = spk_model(torch_array).squeeze(0).numpy()