Example #1
def adv_ex(model, x_in, parameters, sampling_rate, target, eps, n_adv, sess,
           multi_model, attack):

    if parameters['feature_type'] == 'raw':
        hop_size_samples = tools.sec_to_samples(parameters['hop_size'],
                                                sampling_rate)

        x, _ = torchaudio.load(x_in)

        num_frames = np.floor(x.shape[1] / hop_size_samples)
        x = x[:, :int(num_frames * hop_size_samples) - 1]
    else:
        x = fe.compute_features_with_context(x_in, **parameters)
        x = np.reshape(x, (x.shape[0], (x.shape[1] * x.shape[2])), order='C')

    signal_length = x.shape[1]
    window_size_samples = tools.next_pow2_samples(parameters['window_size'],
                                                  sampling_rate)
    hop_size_samples = tools.sec_to_samples(parameters['hop_size'],
                                            sampling_rate)

    num_frames = tools.get_num_frames(signal_length, window_size_samples,
                                      hop_size_samples) + 1

    # if target length does not fit signal length
    if target.shape[0] != num_frames:
        x = x[:, :-hop_size_samples]
        signal_length = x.shape[1]
        num_frames = tools.get_num_frames(signal_length, window_size_samples,
                                          hop_size_samples) + 1

    adv, single_advs = targeted(model, x.shape, sess, x, target, eps, n_adv,
                                attack, multi_model)  # x.cpu().numpy(),
    return adv, single_advs
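
A hedged usage sketch for adv_ex: everything here is hypothetical setup, since the model, session, attack object and target state sequence come from code outside this listing.

# Hypothetical call; model, sess, multi_model and attack are assumed to be
# created by the surrounding attack setup, and the wav path, target state
# sequence and eps value are placeholders.
adv, single_advs = adv_ex(model=model,
                          x_in='data/raw/TEST/wav/utterance.wav',  # placeholder path
                          parameters=feature_parameters,           # dict as in Example #5
                          sampling_rate=16000,                     # assumed rate
                          target=target_states,                    # per-frame target states
                          eps=0.05,                                # perturbation budget
                          n_adv=1,
                          sess=sess,
                          multi_model=multi_model,
                          attack=attack)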
Example #2
def generator(model,
              hmm,
              x_dirs,
              y_dirs,
              sampling_rate,
              parameters,
              viterbi_training=False):
    """
    creates feature-target-pairs out of files lists for training.
    :param model: trained dnn model
    :param hmm: hmm class instance
    :param x_dirs: *.wav file list
    :param y_dirs: *.TextGrid file list
    :param sampling_rate: sampling frequency in hz
    :param parameters: parameters for feature extraction
    :param viterbi_training: flag for viterbi training
    :return: x, y: feature-target-pair
    """
    # set random seed
    random.seed(42)
    # init A for viterbi training
    hmm.A_count = np.ceil(hmm.A)
    # same values for all utterances
    window_size_samples = tools.next_pow2_samples(parameters['window_size'],
                                                  sampling_rate)
    hop_size_samples = tools.sec_to_samples(parameters['hop_size'],
                                            sampling_rate)
    # generator
    while True:
        x_dirs, y_dirs = tools.shuffle_list(x_dirs, y_dirs)
        for audio_file, target_dir in zip(x_dirs, y_dirs):
            # get features and target
            y = tools.praat_file_to_word_target(target_dir, sampling_rate,
                                                window_size_samples,
                                                hop_size_samples, hmm)
            x, _ = torchaudio.load(audio_file)

            # to have the same number of frames as the targets
            num_frames = np.floor(x.shape[1] / hop_size_samples)
            x = x[:, :int(num_frames * hop_size_samples) - 1]

            yield x, y, target_dir
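
A minimal sketch of consuming the generator, assuming model, hmm, the file lists and the parameter dict are set up as in the other examples; the generator loops forever, so a single pair is drawn here.

# Hypothetical usage; x_dirs/y_dirs are the *.wav and *.TextGrid lists,
# and 16000 Hz is only an assumed sampling rate.
gen = generator(model, hmm, x_dirs, y_dirs,
                sampling_rate=16000,
                parameters=parameters)
x, y, target_dir = next(gen)
print(x.shape, target_dir)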
Example #3
def preprocess_dataset(model_type,
                       data_dir,
                       feature_parameters,
                       device='cuda'):
    def load_raw_data_dir(dataset_dir, device='cuda'):
        dataset_dir = dataset_dir.resolve()  # To resolve symlinks!
        # find raw data
        wav_files = [
            f for f in sorted(
                dataset_dir.joinpath('wav').resolve().glob('*.wav'))
        ]
        praat_files = [
            f for f in sorted(
                dataset_dir.joinpath('TextGrid').resolve().glob('*.TextGrid'))
        ]
        lab_files = [
            f for f in sorted(
                dataset_dir.joinpath('lab').resolve().glob('*.lab'))
        ]

        # load raw data
        X = []
        Y = []
        texts = []
        for wav_file, praat_file, lab_file in tqdm(
                zip(wav_files, praat_files, lab_files),
                total=len(wav_files),
                bar_format='    load raw     {l_bar}{bar:30}{r_bar}'):
            # sanity check
            assert wav_file.stem == praat_file.stem == lab_file.stem
            ## load x
            x, _ = torchaudio.load(wav_file)
            # truncate to an integral number of full frames
            num_frames = np.floor(x.shape[1] / hop_size_samples)
            x = x[:, :int(num_frames * hop_size_samples)].to(device)
            X.append(x)
            ## load y
            # optional: convert praats into jsons
            # dataset_dir.joinpath('align').mkdir(parents=True, exist_ok=True)
            # tg = tgio.openTextgrid(praat_file)
            # align_dict = tools.textgrid_to_dict(tg)
            # json_file = Path(str(praat_file).replace('TextGrid', 'align')).with_suffix('.json')
            # json_file.write_text(json.dumps(align_dict, indent=4))
            # y = tools.json_file_to_target(json_file, sampling_rate, window_size_samples, hop_size_samples, hmm)
            y = tools.praat_file_to_target(praat_file, sampling_rate,
                                           window_size_samples,
                                           hop_size_samples, hmm)
            y = torch.from_numpy(y).to(device)
            Y.append(y)
            ## load text
            text = lab_file.read_text().strip()
            texts.append(text)
        return wav_files, X, Y, texts

    """
    Creates two datasets:
        - plain is simply a pre-processed version of TIDIGITS 
        - aligned replaces the targets Y with more precise targets (obtained via viterbi training)

    """

    # check if data dir exists
    raw_data_dir = Path(data_dir).joinpath('raw')
    assert raw_data_dir.is_dir()

    # data config
    sampling_rate = feature_parameters['sampling_rate']
    window_size_samples = tools.next_pow2_samples(
        feature_parameters['window_size'], sampling_rate)
    hop_size_samples = tools.sec_to_samples(feature_parameters['hop_size'],
                                            sampling_rate)

    # check if dataset is already pre-processed
    plain_out_dir = Path(data_dir).joinpath(model_type, 'plain')
    aligend_out_dir = Path(data_dir).joinpath(model_type, 'aligned')
    if plain_out_dir.joinpath('hmm.h5').is_file() and aligend_out_dir.joinpath(
            'hmm.h5').is_file():
        logging.info(f"[+] Dataset already pre-processed")
        return
    shutil.rmtree(plain_out_dir, ignore_errors=True)
    plain_out_dir.mkdir(parents=True)
    shutil.rmtree(aligend_out_dir, ignore_errors=True)
    aligend_out_dir.mkdir(parents=True)

    # Step 1: plain data
    # -> wavs are split into individual frames (the Xs)
    # -> each frame is mapped to the corresponding target state
    #    of the hmm (the Ys)
    #
    # As these targets always depend on a particular hmm,
    # we save the hmm alongside the data
    hmm = HMM.HMM('word')
    pickle.dump(hmm, plain_out_dir.joinpath('hmm.h5').open('wb'))

    # pre-process plain data
    dataset_names = [
        d.name for d in Path(raw_data_dir).glob('*') if d.is_dir()
    ]
    for dataset_name in dataset_names:
        logging.info(f"[+] Pre-process {dataset_name}")
        wav_files, X, Y, texts = load_raw_data_dir(
            raw_data_dir.joinpath(dataset_name))
        ## dump plain
        X_out_dir = plain_out_dir.joinpath(dataset_name, 'X')
        X_out_dir.mkdir(parents=True)
        Y_out_dir = plain_out_dir.joinpath(dataset_name, 'Y')
        Y_out_dir.mkdir(parents=True)
        text_out_dir = plain_out_dir.joinpath(dataset_name, 'text')
        text_out_dir.mkdir(parents=True)
        wav_out_dir = plain_out_dir.joinpath(dataset_name, 'wavs')
        wav_out_dir.mkdir(parents=True)
        for wav_file, x, y, text in tqdm(
                zip(wav_files, X, Y, texts),
                total=len(wav_files),
                bar_format='    dump plain  {l_bar}{bar:30}{r_bar}'):
            filename = wav_file.stem
            torch.save(y, Y_out_dir.joinpath(filename).with_suffix('.pt'))
            torch.save(x, X_out_dir.joinpath(filename).with_suffix('.pt'))
            text_out_dir.joinpath(filename).with_suffix('.txt').write_text(
                text)
            shutil.copyfile(wav_file,
                            wav_out_dir.joinpath(filename).with_suffix('.wav'))

    # Step 2: align data
    # -> for the plain data we only used relatively vague alignments between
    #    input frame and target
    # -> to improve this we create a second dataset that uses an hmm
    #    trained with viterbi to obtain more precise alignments

    # first we need to train the hmm with viterbi training
    dataset = Dataset(plain_out_dir.joinpath('TRAIN'), feature_parameters)
    model = init_model(model_type, feature_parameters, hmm)
    model.train_model(dataset, epochs=12, batch_size=32)
    model.train_model(dataset, epochs=1, batch_size=32, viterbi_training=True)
    model.hmm.A = hmm.modifyTransitions(model.hmm.A_count)
    model.train_model(dataset, epochs=2, batch_size=32, viterbi_training=True)
    # again, save hmm alongside the data
    pickle.dump(hmm, aligend_out_dir.joinpath('hmm.h5').open('wb'))

    # pre-process aligned data
    dataset_names = [
        d.name for d in Path(raw_data_dir).glob('*') if d.is_dir()
    ]
    for dataset_name in dataset_names:
        logging.info(f"[+] Pre-process {dataset_name}")
        # wav_files, X, Y, texts = load_raw_data_dir(raw_data_dir.joinpath(dataset_name), device=device)
        dst_path = plain_out_dir.joinpath(dataset_name)
        dataset = Dataset(dst_path, feature_parameters)
        ## dump aligned
        X_out_dir = aligend_out_dir.joinpath(dataset_name, 'X')
        X_out_dir.mkdir(parents=True)
        Y_out_dir = aligend_out_dir.joinpath(dataset_name, 'Y')
        Y_out_dir.mkdir(parents=True)
        text_out_dir = aligend_out_dir.joinpath(dataset_name, 'text')
        text_out_dir.mkdir(parents=True)
        wav_out_dir = aligend_out_dir.joinpath(dataset_name, 'wavs')
        wav_out_dir.mkdir(parents=True)

        with tqdm(
                total=len(wav_files),
                bar_format='    dump aligned {l_bar}{bar:30}{r_bar}') as pbar:
            for X_batch, Y_batch, texts_batch, y_true_length, x_true_length, filenames in dataset.generator(
                    return_filename=True, batch_size=32, return_x_length=True):

                posteriors = model.features_to_posteriors(X_batch)
                Y_batch = hmm.viterbi_train(posteriors, y_true_length, Y_batch,
                                            texts_batch)

                for filename, x, y, y_length, x_length, text in zip(
                        filenames, X_batch, Y_batch, y_true_length,
                        x_true_length, texts_batch):
                    torch.save(y.clone()[:y_length],
                               Y_out_dir.joinpath(filename).with_suffix('.pt'))
                    torch.save(x.clone()[:x_length].unsqueeze(dim=0),
                               X_out_dir.joinpath(filename).with_suffix('.pt'))
                    text_out_dir.joinpath(filename).with_suffix(
                        '.txt').write_text(text)
                    shutil.copyfile(
                        dst_path.joinpath('wavs',
                                          filename).with_suffix('.wav'),
                        wav_out_dir.joinpath(filename).with_suffix('.wav'))
                    pbar.update(1)
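
A sketch of how preprocess_dataset might be driven; the feature dictionary mirrors the one in Example #5, while the sampling rate, model type and data directory are placeholder assumptions.

# Hypothetical driver; preprocess_dataset expects data_dir/raw/<DATASET>/{wav,TextGrid,lab}.
feature_parameters = {
    'window_size': 25e-3,
    'hop_size': 12.5e-3,
    'feature_type': 'raw',
    'num_ceps': 13,
    'left_context': 4,
    'right_context': 4,
    'sampling_rate': 16000,  # assumed; Example #5 reads it from the data dir instead
}
preprocess_dataset(model_type='raw',   # placeholder model type
                   data_dir='data',    # placeholder data directory
                   feature_parameters=feature_parameters,
                   device='cuda')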
Example #4
def preprocess(data_dir, feature_parameters):
    def load_raw_data_dir(dataset_dir, device='cuda'):
        dataset_dir = dataset_dir.resolve()  # To resolve symlinks!
        # find raw data
        wav_files = [
            f for f in sorted(
                dataset_dir.joinpath('wav').resolve().glob('*.wav'))
        ]
        praat_files = [
            f for f in sorted(
                dataset_dir.joinpath('TextGrid').resolve().glob('*.TextGrid'))
        ]
        lab_files = [
            f for f in sorted(
                dataset_dir.joinpath('lab').resolve().glob('*.lab'))
        ]

        # load raw data
        X = []
        Y = []
        texts = []
        for wav_file, praat_file, lab_file in tqdm(
                zip(wav_files, praat_files, lab_files),
                total=len(wav_files),
                bar_format='    load raw     {l_bar}{bar:30}{r_bar}'):
            # sanity check
            assert wav_file.stem == praat_file.stem == lab_file.stem, f'{wav_file.stem} {praat_file.stem} {lab_file.stem}'
            ## load x
            x, _ = torchaudio.load(wav_file)
            # truncate to an integral number of full frames
            num_frames = np.floor(x.shape[1] / hop_size_samples)
            x = x[:, :int(num_frames * hop_size_samples)].to(device)
            X.append(x)
            ## load y
            # optional: convert praats into jsons
            # dataset_dir.joinpath('align').mkdir(parents=True, exist_ok=True)
            # tg = tgio.openTextgrid(praat_file)
            # align_dict = tools.textgrid_to_dict(tg)
            # json_file = Path(str(praat_file).replace('TextGrid', 'align')).with_suffix('.json')
            # json_file.write_text(json.dumps(align_dict, indent=4))
            # y = tools.json_file_to_target(json_file, sampling_rate, window_size_samples, hop_size_samples, hmm)
            y = tools.praat_file_to_target(praat_file, sampling_rate,
                                           window_size_samples,
                                           hop_size_samples, hmm)
            y = torch.from_numpy(y).to(device)
            Y.append(y)
            ## load text
            text = lab_file.read_text().strip()
            texts.append(text)
        return wav_files, X, Y, texts

    raw_data_dir = Path(data_dir).joinpath('raw')
    assert raw_data_dir.is_dir()

    # data config
    sampling_rate = feature_parameters['sampling_rate']
    window_size_samples = tools.next_pow2_samples(
        feature_parameters['window_size'], sampling_rate)
    hop_size_samples = tools.sec_to_samples(feature_parameters['hop_size'],
                                            sampling_rate)

    plain_out_dir = Path(data_dir).joinpath('plain')
    plain_out_dir.mkdir()

    hmm = HMM.HMM('word')
    pickle.dump(hmm, plain_out_dir.joinpath('hmm.h5').open('wb'))

    # pre-process plain data
    dataset_names = [
        d.name for d in Path(raw_data_dir).glob('*') if d.is_dir()
    ]
    for dataset_name in dataset_names:
        wav_files, X, Y, texts = load_raw_data_dir(
            raw_data_dir.joinpath(dataset_name))
        ## dump plain
        X_out_dir = plain_out_dir.joinpath(dataset_name, 'X')
        X_out_dir.mkdir(parents=True)
        Y_out_dir = plain_out_dir.joinpath(dataset_name, 'Y')
        Y_out_dir.mkdir(parents=True)
        text_out_dir = plain_out_dir.joinpath(dataset_name, 'text')
        text_out_dir.mkdir(parents=True)
        wav_out_dir = plain_out_dir.joinpath(dataset_name, 'wavs')
        wav_out_dir.mkdir(parents=True)
        for wav_file, x, y, text in tqdm(
                zip(wav_files, X, Y, texts),
                total=len(wav_files),
                bar_format='    dump plain  {l_bar}{bar:30}{r_bar}'):
            filename = wav_file.stem
            torch.save(y, Y_out_dir.joinpath(filename).with_suffix('.pt'))
            torch.save(x, X_out_dir.joinpath(filename).with_suffix('.pt'))
            text_out_dir.joinpath(filename).with_suffix('.txt').write_text(
                text)
            shutil.copyfile(wav_file,
                            wav_out_dir.joinpath(filename).with_suffix('.wav'))
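
Once preprocess has run, every utterance is stored as an X tensor, a Y tensor and a transcript; a small sketch of loading one sample back, where the dataset name and utterance id are placeholders.

from pathlib import Path
import torch

# Placeholder paths: 'TRAIN' and the utterance stem depend on the raw data.
plain_dir = Path('data').joinpath('plain', 'TRAIN')
utt = 'some_utterance'
x = torch.load(plain_dir.joinpath('X', utt).with_suffix('.pt'))    # waveform, truncated to full frames
y = torch.load(plain_dir.joinpath('Y', utt).with_suffix('.pt'))    # per-frame HMM target states
text = plain_dir.joinpath('text', utt).with_suffix('.txt').read_text()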
Example #5
    assert params.model_type in params.attack_dir, "It seems you are trying to evaluate " \
                                                   "results generated for a different model type"
    # assert params.model_type in str(params.data_dir), "You are using the wrong hmm (and aligned data)!"

    feature_parameters = {
        'window_size': 25e-3,
        'hop_size': 12.5e-3,
        'feature_type': 'raw',
        'num_ceps': 13,
        'left_context': 4,
        'right_context': 4,
        'sampling_rate': tools.get_sampling_rate(params.data_dir.parent)
    }
    feature_parameters['hop_size_samples'] = tools.sec_to_samples(
        feature_parameters['hop_size'], feature_parameters['sampling_rate'])
    feature_parameters['window_size_samples'] = tools.next_pow2_samples(
        feature_parameters['window_size'], feature_parameters['sampling_rate'])

    tools.set_seed(params.seed)

    attack_dir = Path(params.attack_dir)

    assert os.path.exists(attack_dir)

    if not attack_dir.joinpath('log.txt').is_file():
        assert len(list(attack_dir.iterdir())) == 1, \
            "more than one instance of the attack exists!"
        attack_dir = list(attack_dir.iterdir())[0]

    attack_step_dirs = [s for s in attack_dir.iterdir() if s.is_dir()]
    attack_step_dirs = sorted(attack_step_dirs, key=lambda s: int(s.name))
    attack_last_step_dir = attack_step_dirs[-1]
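
For orientation, a worked example of the sample counts implied by this feature configuration, assuming a 16 kHz sampling rate and that sec_to_samples converts seconds to samples while next_pow2_samples additionally rounds up to the next power of two (both are assumptions; the actual rate is read from the data directory).

import math

sampling_rate = 16000                                                    # assumed
hop_size_samples = int(round(12.5e-3 * sampling_rate))                   # 200 samples
window_size_samples = 2 ** math.ceil(math.log2(25e-3 * sampling_rate))   # 400 -> 512 samples
print(hop_size_samples, window_size_samples)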