Code example #1
def adv_ex(model, x_in, parameters, sampling_rate, target, eps, n_adv, sess,
           multi_model, attack):

    if parameters['feature_type'] == 'raw':
        hop_size_samples = tools.sec_to_samples(parameters['hop_size'],
                                                sampling_rate)

        x, _ = torchaudio.load(x_in)

        num_frames = np.floor(x.shape[1] / hop_size_samples)
        x = x[:, :int(num_frames * hop_size_samples) - 1]
    else:
        x = fe.compute_features_with_context(x_in, **parameters)
        x = np.reshape(x, (x.shape[0], (x.shape[1] * x.shape[2])), order='C')

    signal_length = x.shape[1]
    window_size_samples = tools.next_pow2_samples(parameters['window_size'],
                                                  sampling_rate)
    hop_size_samples = tools.sec_to_samples(parameters['hop_size'],
                                            sampling_rate)

    num_frames = tools.get_num_frames(signal_length, window_size_samples,
                                      hop_size_samples) + 1

    # if the target length does not fit the signal length
    if target.shape[0] != num_frames:
        x = x[:, :-hop_size_samples]
        signal_length = x.shape[1]
        num_frames = tools.get_num_frames(signal_length, window_size_samples,
                                          hop_size_samples) + 1

    adv, single_advs = targeted(model, x.shape, sess, x, target, eps, n_adv,
                                attack, multi_model)  # x.cpu().numpy(),
    return adv, single_advs
Code example #2
def generator(x_dirs, y_dirs, hmm, sampling_rate, parameters):
    feats_list = []
    target_list = []

    length_feats = 0
    length_target = 0
    number_features = parameters['num_ceps'] * 3
    number_context = parameters['left_context'] + parameters[
        'right_context'] + 1

    for i in range(len(x_dirs)):
        # convert audio file to feature matrix
        # get path to audio file
        audio_file = x_dirs[i]

        # compute features
        feats = fe.compute_features_with_context(audio_file, **parameters)

        # get label
        target_dir = y_dirs[i]

        # calculate window size and hop size
        window_size_samples = tools.sec_to_samples(parameters['window_size'],
                                                   sampling_rate)
        window_size_samples = 2**tools.next_pow2(window_size_samples)

        hop_size_samples = tools.sec_to_samples(parameters['hop_size'],
                                                sampling_rate)

        # calculate target
        target = tools.praat_file_to_target(target_dir, sampling_rate,
                                            window_size_samples,
                                            hop_size_samples, hmm)

        # append to list with features and targets
        length_feats += len(feats)
        length_target += len(target)
        feats_list.append(feats)
        target_list.append(target)

    target_list = list(chain.from_iterable(target_list))
    feats_list = list(chain.from_iterable(feats_list))

    feats_list_new = np.reshape(np.array(feats_list),
                                newshape=(length_feats, number_features,
                                          number_context))
    target_list_new = np.reshape(np.array(target_list),
                                 newshape=(length_feats, hmm.get_num_states()))

    return feats_list_new, target_list_new
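For orientation, the reshape above yields one row per frame with num_ceps * 3 coefficients and left_context + right_context + 1 context frames. Below is a minimal shape sketch, using the values from the feature_parameters dict in code example #10 (num_ceps=13, left_context=right_context=4) as an assumption:

import numpy as np

num_frames = 100                        # stand-in value for illustration
number_features = 13 * 3                # static MFCCs + deltas + delta-deltas
number_context = 4 + 4 + 1              # left context + right context + current frame

flat = np.zeros((num_frames, number_features * number_context))
feats = np.reshape(flat, (num_frames, number_features, number_context))
print(feats.shape)                      # (100, 39, 9)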
Code example #3
def return_start_words(praat_file,
                       sampling_rate=16000,
                       window_size=25e-3,
                       hop_size=12.5e-3):
    """
    Returns the start and end frames of the word intervals in a praat file.

    :param praat_file: *.TextGrid file.
    :param sampling_rate: sampling rate in Hz.
    :param window_size: window size in seconds.
    :param hop_size: hop size (frame shift) in seconds.
    :return: starts, ends: lists with the start and end frame of each word interval.
    """
    window_size_samples = tools.sec_to_samples(window_size, sampling_rate)
    hop_size_samples = tools.sec_to_samples(hop_size, sampling_rate)

    intervals, min_time, max_time = tools.praat_to_word_Interval(praat_file)

    # parse intervals
    starts = []
    ends = []
    for interval in intervals:
        start_frame = tools.sec_to_frame(interval.start, sampling_rate,
                                         window_size_samples, hop_size_samples)
        end_frame = tools.sec_to_frame(interval.end, sampling_rate,
                                       window_size_samples, hop_size_samples)
        starts.append(start_frame)
        ends.append(end_frame)
    return starts, ends
Code example #4
def get_mel_filters(sampling_rate,
                    window_size_sec,
                    n_filters,
                    f_min=0,
                    f_max=8000):
    """
    Returns a mel filterbank for a given set of specifications.

    :param sampling_rate: sampling rate in Hz.
    :param window_size_sec: window size in seconds.
    :param n_filters: number of filters.
    :param f_min: minimum frequency covered by mel filterbank in Hz (default: 0).
    :param f_max: maximum frequency covered by mel filterbank in Hz (default: 8000).
    :return: m x d array representing the mel filterbank, where m is the number of FFT frequency bins and d is the number of mel filters.
    """

    # calculate max and min frequency in mel
    f_min_mel = tools.hz_to_mel(f_min)
    f_max_mel = tools.hz_to_mel(f_max)

    # create vector with frequency points for filterbank in mel scale (equidistant)
    freq_points_mel = np.linspace(f_min_mel, f_max_mel, n_filters + 2)
    # transform it into Hertz scale
    freq_points_hz = tools.mel_to_hz(freq_points_mel)

    # calculate number of FFT frequency points
    fft_samples = int((2**tools.next_pow2(
        tools.sec_to_samples(window_size_sec, sampling_rate)) / 2) + 1)

    # find the corresponding indices for the filterbank in the FFT
    f = []
    for i in range(n_filters + 2):
        f.append(np.round((fft_samples) * freq_points_hz[i] / f_max))

    # initialize filterbank matrix H
    H = np.zeros((fft_samples, n_filters))

    # calculate filterbank matrix H
    for m in range(1, n_filters + 1):
        for k in range(fft_samples):
            if k < f[m - 1]:
                H[k, m - 1] = 0
            elif f[m - 1] <= k and k < f[m]:
                H[k, m - 1] = (2 * (k - f[m - 1])) / ((f[m + 1] - f[m - 1]) *
                                                      (f[m] - f[m - 1]))
            elif f[m] <= k and k <= f[m + 1]:
                H[k, m - 1] = (2 * (f[m + 1] - k)) / ((f[m + 1] - f[m - 1]) *
                                                      (f[m + 1] - f[m]))
            elif k > f[m + 1]:
                H[k, m - 1] = 0
    return H
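The helpers tools.hz_to_mel and tools.mel_to_hz are not shown above. The sketch below assumes they implement the usual HTK-style mel scale and works out the filterbank shape for the window size used elsewhere in these examples (25 ms at 16 kHz); both are assumptions, not the project's own definitions:

import numpy as np

def hz_to_mel(f_hz):
    # assumed HTK-style mel scale; tools.hz_to_mel may differ
    return 2595.0 * np.log10(1.0 + f_hz / 700.0)

def mel_to_hz(f_mel):
    # inverse of hz_to_mel
    return 700.0 * (10.0 ** (f_mel / 2595.0) - 1.0)

# Shape check: a 25 ms window at 16 kHz is 400 samples, padded up to the next
# power of two (512), so fft_samples = 512 / 2 + 1 = 257 and a filterbank with
# n_filters = 24 would have shape (257, 24).
print(hz_to_mel(8000.0))                 # roughly 2840 mel
print(mel_to_hz(hz_to_mel(8000.0)))      # back to 8000.0 Hz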
Code example #5
def make_frames(audio_data, sampling_rate, window_size, hop_size):
    """
    Splits an audio signal into subsequent frames.
 
    :param audio_data: array representing the audio signal.
    :param sampling_rate: sampling rate in Hz.
    :param window_size: window size in seconds. 
    :param hop_size: hop size (frame shift) in seconds.
    :return: n x m array of signal frames, where n is the number of frames and m is the window size in samples.
    """

    # transform window size in seconds to samples and calculate next higher power of two
    window_size_samples = tools.sec_to_samples(window_size, sampling_rate)
    window_size_samples = 2**tools.next_pow2(window_size_samples)

    # assign hamming window
    hamming_window = np.hamming(window_size_samples)

    # transform hop size in seconds to samples
    hop_size_samples = tools.sec_to_samples(hop_size, sampling_rate)

    # get number of frames from function in tools.py
    n_frames = tools.get_num_frames(len(audio_data), window_size_samples,
                                    hop_size_samples)

    # initialize n x m matrix (n is the number of frames, m is the window size in samples)
    # initialized with zeros so that the last, incomplete frame is zero-padded
    frames = np.zeros([n_frames, window_size_samples], dtype=float)

    # write frames in matrix
    for i in range(n_frames):
        start = i * hop_size_samples
        end = i * hop_size_samples + window_size_samples
        frames[i, 0:len(audio_data[start:end])] = audio_data[start:end]
        frames[i, :] = frames[i, :] * hamming_window

    return frames
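Below is a self-contained sketch of the same framing idea without the project's tools module. The window and hop sizes are assumed (512-sample windows and a 200-sample hop, i.e. 25 ms and 12.5 ms at 16 kHz), and the frame count uses a simple ceiling rule that may differ from tools.get_num_frames:

import numpy as np

audio = np.random.randn(16000)              # 1 s of noise at 16 kHz (stand-in signal)
window_size_samples = 512                   # 25 ms at 16 kHz, padded to 2**9
hop_size_samples = 200                      # 12.5 ms at 16 kHz
hamming_window = np.hamming(window_size_samples)

# simple ceiling rule; tools.get_num_frames may use another convention
n_frames = int(np.ceil((len(audio) - window_size_samples) / hop_size_samples)) + 1

frames = np.zeros((n_frames, window_size_samples))
for i in range(n_frames):
    start = i * hop_size_samples
    chunk = audio[start:start + window_size_samples]
    frames[i, :len(chunk)] = chunk          # zero-pad the last, incomplete frame
    frames[i, :] *= hamming_window

print(frames.shape)                         # (79, 512)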
Code example #6
def generator(model,
              hmm,
              x_dirs,
              y_dirs,
              sampling_rate,
              parameters,
              viterbi_training=False):
    """
    Creates feature-target pairs out of file lists for training.
    :param model: trained dnn model
    :param hmm: hmm class instance
    :param x_dirs: *.wav file list
    :param y_dirs: *.TextGrid file list
    :param sampling_rate: sampling frequency in hz
    :param parameters: parameters for feature extraction
    :param viterbi_training: flag for viterbi training
    :return: x, y: feature-target-pair
    """
    # set random seed
    random.seed(42)
    # init A for Viterbi training
    hmm.A_count = np.ceil(hmm.A)
    # same values for all utterances
    window_size_samples = tools.next_pow2_samples(parameters['window_size'],
                                                  sampling_rate)
    hop_size_samples = tools.sec_to_samples(parameters['hop_size'],
                                            sampling_rate)
    # generator
    while True:
        x_dirs, y_dirs = tools.shuffle_list(x_dirs, y_dirs)
        for audio_file, target_dir in zip(x_dirs, y_dirs):
            # get features and target
            y = tools.praat_file_to_word_target(target_dir, sampling_rate,
                                                window_size_samples,
                                                hop_size_samples, hmm)
            x, _ = torchaudio.load(audio_file)

            # trim the waveform so it has the same number of frames as the targets
            num_frames = np.floor(x.shape[1] / hop_size_samples)
            x = x[:, :int(num_frames * hop_size_samples) - 1]

            yield x, y, target_dir
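For intuition, here is a small numeric illustration of the trimming above (values are assumed for illustration: 16 kHz audio and a 12.5 ms hop, i.e. 200 samples per hop):

import numpy as np

hop_size_samples = 200                                   # 12.5e-3 * 16000
x = np.zeros((1, 16123))                                 # stand-in waveform
num_frames = np.floor(x.shape[1] / hop_size_samples)     # floor(16123 / 200) = 80
x = x[:, :int(num_frames * hop_size_samples) - 1]        # keep 80 * 200 - 1 = 15999 samples
print(x.shape)                                           # (1, 15999)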
Code example #7
    def __init__(self,
                 feature_parameters,
                 hmm,
                 dropout=0.0,
                 test_dropout_enabled=False):
        super(BaseModel, self).__init__()

        self.feature_parameters = feature_parameters
        self.hop_size_samples = tools.sec_to_samples(
            self.feature_parameters['hop_size'],
            self.feature_parameters['sampling_rate'])
        self.left_context = feature_parameters['left_context']
        self.right_context = feature_parameters['right_context']
        self.n_mfcc = feature_parameters['num_ceps']
        self.dropout = dropout
        self.hmm = hmm

        self.test_dropout_enabled = test_dropout_enabled

        # mfcc
        self.mfcc = torchaudio.transforms.MFCC(n_mfcc=self.n_mfcc)

        # delta and deltadeltas
        self.deltas = torchaudio.transforms.ComputeDeltas()
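torchaudio.transforms.MFCC and ComputeDeltas, as instantiated above, are typically chained as in the following sketch to obtain num_ceps * 3 coefficients per frame (static MFCCs plus deltas and delta-deltas). This is only an illustration of the transform chain, not necessarily the model's actual forward pass:

import torch
import torchaudio

n_mfcc = 13
mfcc_transform = torchaudio.transforms.MFCC(sample_rate=16000, n_mfcc=n_mfcc)
delta_transform = torchaudio.transforms.ComputeDeltas()

waveform = torch.randn(1, 16000)                # 1 s of noise as a stand-in input
mfcc = mfcc_transform(waveform)                 # (1, 13, num_frames)
deltas = delta_transform(mfcc)                  # (1, 13, num_frames)
delta_deltas = delta_transform(deltas)          # (1, 13, num_frames)

features = torch.cat([mfcc, deltas, delta_deltas], dim=1)
print(features.shape)                           # (1, 39, num_frames)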
Code example #8
File: dataset.py  Project: 9yte/VenoMave
def preprocess_dataset(model_type,
                       data_dir,
                       feature_parameters,
                       device='cuda'):
    def load_raw_data_dir(dataset_dir, device='cuda'):
        dataset_dir = dataset_dir.resolve()  # To resolve symlinks!
        # find raw data
        wav_files = [
            f for f in sorted(
                dataset_dir.joinpath('wav').resolve().glob('*.wav'))
        ]
        praat_files = [
            f for f in sorted(
                dataset_dir.joinpath('TextGrid').resolve().glob('*.TextGrid'))
        ]
        lab_files = [
            f for f in sorted(
                dataset_dir.joinpath('lab').resolve().glob('*.lab'))
        ]

        # load raw data
        X = []
        Y = []
        texts = []
        for wav_file, praat_file, lab_file in tqdm(
                zip(wav_files, praat_files, lab_files),
                total=len(wav_files),
                bar_format='    load raw     {l_bar}{bar:30}{r_bar}'):
            # sanity check
            assert wav_file.stem == praat_file.stem == lab_file.stem
            ## load x
            x, _ = torchaudio.load(wav_file)
            # truncate to a whole number of frames
            num_frames = np.floor(x.shape[1] / hop_size_samples)
            x = x[:, :int(num_frames * hop_size_samples)].to(device)
            X.append(x)
            ## load y
            # optional: convert praats into jsons
            # dataset_dir.joinpath('align').mkdir(parents=True, exist_ok=True)
            # tg = tgio.openTextgrid(praat_file)
            # align_dict = tools.textgrid_to_dict(tg)
            # json_file = Path(str(praat_file).replace('TextGrid', 'align')).with_suffix('.json')
            # json_file.write_text(json.dumps(align_dict, indent=4))
            # y = tools.json_file_to_target(json_file, sampling_rate, window_size_samples, hop_size_samples, hmm)
            y = tools.praat_file_to_target(praat_file, sampling_rate,
                                           window_size_samples,
                                           hop_size_samples, hmm)
            y = torch.from_numpy(y).to(device)
            Y.append(y)
            ## load text
            text = lab_file.read_text().strip()
            texts.append(text)
        return wav_files, X, Y, texts

    """
    Creates two datasets:
        - plain is simply a pre-processed version of TIDIGITS 
        - aligned replaces the targets Y with more precise targets (obtained via viterbi training)

    """

    # check if data dir exist
    raw_data_dir = Path(data_dir).joinpath('raw')
    assert raw_data_dir.is_dir()

    # data config
    sampling_rate = feature_parameters['sampling_rate']
    window_size_samples = tools.next_pow2_samples(
        feature_parameters['window_size'], sampling_rate)
    hop_size_samples = tools.sec_to_samples(feature_parameters['hop_size'],
                                            sampling_rate)

    # check if dataset is already pre-processed
    plain_out_dir = Path(data_dir).joinpath(model_type, 'plain')
    aligend_out_dir = Path(data_dir).joinpath(model_type, 'aligned')
    if plain_out_dir.joinpath('hmm.h5').is_file() and aligend_out_dir.joinpath(
            'hmm.h5').is_file():
        logging.info(f"[+] Dataset already pre-processed")
        return
    shutil.rmtree(plain_out_dir, ignore_errors=True)
    plain_out_dir.mkdir(parents=True)
    shutil.rmtree(aligend_out_dir, ignore_errors=True)
    aligend_out_dir.mkdir(parents=True)

    # Step 1: plain data
    # -> wavs are split into individual frames (the Xs)
    # -> each frame is mapped to the corresponding target state
    #    of the hmm (the Ys)
    #
    # As these targets always depend on a particular hmm,
    # we save the hmm alongside the data
    hmm = HMM.HMM('word')
    pickle.dump(hmm, plain_out_dir.joinpath('hmm.h5').open('wb'))

    # pre-process plain data
    dataset_names = [
        d.name for d in Path(raw_data_dir).glob('*') if d.is_dir()
    ]
    for dataset_name in dataset_names:
        logging.info(f"[+] Pre-process {dataset_name}")
        wav_files, X, Y, texts = load_raw_data_dir(
            raw_data_dir.joinpath(dataset_name))
        ## dump plain
        X_out_dir = plain_out_dir.joinpath(dataset_name, 'X')
        X_out_dir.mkdir(parents=True)
        Y_out_dir = plain_out_dir.joinpath(dataset_name, 'Y')
        Y_out_dir.mkdir(parents=True)
        text_out_dir = plain_out_dir.joinpath(dataset_name, 'text')
        text_out_dir.mkdir(parents=True)
        wav_out_dir = plain_out_dir.joinpath(dataset_name, 'wavs')
        wav_out_dir.mkdir(parents=True)
        for wav_file, x, y, text in tqdm(
                zip(wav_files, X, Y, texts),
                total=len(wav_files),
                bar_format='    dump plain  {l_bar}{bar:30}{r_bar}'):
            filename = wav_file.stem
            torch.save(y, Y_out_dir.joinpath(filename).with_suffix('.pt'))
            torch.save(x, X_out_dir.joinpath(filename).with_suffix('.pt'))
            text_out_dir.joinpath(filename).with_suffix('.txt').write_text(
                text)
            shutil.copyfile(wav_file,
                            wav_out_dir.joinpath(filename).with_suffix('.wav'))

    # Step 2: align data
    # -> for the plain data we only used relatively vague alignments between
    #    input frame and target
    # -> to improve this we create a second dataset that uses an hmm
    #    that is trained with Viterbi to obtain more precise alignments

    # first we need to train the hmm with viterbi training
    dataset = Dataset(plain_out_dir.joinpath('TRAIN'), feature_parameters)
    model = init_model(model_type, feature_parameters, hmm)
    model.train_model(dataset, epochs=12, batch_size=32)
    model.train_model(dataset, epochs=1, batch_size=32, viterbi_training=True)
    model.hmm.A = hmm.modifyTransitions(model.hmm.A_count)
    model.train_model(dataset, epochs=2, batch_size=32, viterbi_training=True)
    # again, save hmm alongside the data
    pickle.dump(hmm, aligend_out_dir.joinpath('hmm.h5').open('wb'))

    # pre-process aligned data
    dataset_names = [
        d.name for d in Path(raw_data_dir).glob('*') if d.is_dir()
    ]
    for dataset_name in dataset_names:
        logging.info(f"[+] Pre-process {dataset_name}")
        # wav_files, X, Y, texts = load_raw_data_dir(raw_data_dir.joinpath(dataset_name), device=device)
        dst_path = plain_out_dir.joinpath(dataset_name)
        dataset = Dataset(dst_path, feature_parameters)
        ## dump aligned
        X_out_dir = aligend_out_dir.joinpath(dataset_name, 'X')
        X_out_dir.mkdir(parents=True)
        Y_out_dir = aligend_out_dir.joinpath(dataset_name, 'Y')
        Y_out_dir.mkdir(parents=True)
        text_out_dir = aligend_out_dir.joinpath(dataset_name, 'text')
        text_out_dir.mkdir(parents=True)
        wav_out_dir = aligend_out_dir.joinpath(dataset_name, 'wavs')
        wav_out_dir.mkdir(parents=True)

        with tqdm(
                total=len(wav_files),
                bar_format='    dump aligned {l_bar}{bar:30}{r_bar}') as pbar:
            for X_batch, Y_batch, texts_batch, y_true_length, x_true_length, filenames in dataset.generator(
                    return_filename=True, batch_size=32, return_x_length=True):

                posteriors = model.features_to_posteriors(X_batch)
                Y_batch = hmm.viterbi_train(posteriors, y_true_length, Y_batch,
                                            texts_batch)

                for filename, x, y, y_length, x_length, text in zip(
                        filenames, X_batch, Y_batch, y_true_length,
                        x_true_length, texts_batch):
                    torch.save(y.clone()[:y_length],
                               Y_out_dir.joinpath(filename).with_suffix('.pt'))
                    torch.save(x.clone()[:x_length].unsqueeze(dim=0),
                               X_out_dir.joinpath(filename).with_suffix('.pt'))
                    text_out_dir.joinpath(filename).with_suffix(
                        '.txt').write_text(text)
                    shutil.copyfile(
                        dst_path.joinpath('wavs',
                                          filename).with_suffix('.wav'),
                        wav_out_dir.joinpath(filename).with_suffix('.wav'))
                    pbar.update(1)
Code example #9
File: eval.py  Project: 9yte/VenoMave
def preprocess(data_dir, feature_parameters):
    def load_raw_data_dir(dataset_dir, device='cuda'):
        dataset_dir = dataset_dir.resolve()  # To resolve symlinks!
        # find raw data
        wav_files = [
            f for f in sorted(
                dataset_dir.joinpath('wav').resolve().glob('*.wav'))
        ]
        praat_files = [
            f for f in sorted(
                dataset_dir.joinpath('TextGrid').resolve().glob('*.TextGrid'))
        ]
        lab_files = [
            f for f in sorted(
                dataset_dir.joinpath('lab').resolve().glob('*.lab'))
        ]

        # load raw data
        X = []
        Y = []
        texts = []
        for wav_file, praat_file, lab_file in tqdm(
                zip(wav_files, praat_files, lab_files),
                total=len(wav_files),
                bar_format='    load raw     {l_bar}{bar:30}{r_bar}'):
            # sanity check
            assert wav_file.stem == praat_file.stem == lab_file.stem, f'{wav_file.stem} {praat_file.stem} {lab_file.stem}'
            ## load x
            x, _ = torchaudio.load(wav_file)
            # truncate to a whole number of frames
            num_frames = np.floor(x.shape[1] / hop_size_samples)
            x = x[:, :int(num_frames * hop_size_samples)].to(device)
            X.append(x)
            ## load y
            # optional: convert praats into jsons
            # dataset_dir.joinpath('align').mkdir(parents=True, exist_ok=True)
            # tg = tgio.openTextgrid(praat_file)
            # align_dict = tools.textgrid_to_dict(tg)
            # json_file = Path(str(praat_file).replace('TextGrid', 'align')).with_suffix('.json')
            # json_file.write_text(json.dumps(align_dict, indent=4))
            # y = tools.json_file_to_target(json_file, sampling_rate, window_size_samples, hop_size_samples, hmm)
            y = tools.praat_file_to_target(praat_file, sampling_rate,
                                           window_size_samples,
                                           hop_size_samples, hmm)
            y = torch.from_numpy(y).to(device)
            Y.append(y)
            ## load text
            text = lab_file.read_text().strip()
            texts.append(text)
        return wav_files, X, Y, texts

    raw_data_dir = Path(data_dir).joinpath('raw')
    assert raw_data_dir.is_dir()

    # data config
    sampling_rate = feature_parameters['sampling_rate']
    window_size_samples = tools.next_pow2_samples(
        feature_parameters['window_size'], sampling_rate)
    hop_size_samples = tools.sec_to_samples(feature_parameters['hop_size'],
                                            sampling_rate)

    plain_out_dir = Path(data_dir).joinpath('plain')
    plain_out_dir.mkdir()

    hmm = HMM.HMM('word')
    pickle.dump(hmm, plain_out_dir.joinpath('hmm.h5').open('wb'))

    # pre-process plain data
    dataset_names = [
        d.name for d in Path(raw_data_dir).glob('*') if d.is_dir()
    ]
    for dataset_name in dataset_names:
        wav_files, X, Y, texts = load_raw_data_dir(
            raw_data_dir.joinpath(dataset_name))
        ## dump plain
        X_out_dir = plain_out_dir.joinpath(dataset_name, 'X')
        X_out_dir.mkdir(parents=True)
        Y_out_dir = plain_out_dir.joinpath(dataset_name, 'Y')
        Y_out_dir.mkdir(parents=True)
        text_out_dir = plain_out_dir.joinpath(dataset_name, 'text')
        text_out_dir.mkdir(parents=True)
        wav_out_dir = plain_out_dir.joinpath(dataset_name, 'wavs')
        wav_out_dir.mkdir(parents=True)
        for wav_file, x, y, text in tqdm(
                zip(wav_files, X, Y, texts),
                total=len(wav_files),
                bar_format='    dump plain  {l_bar}{bar:30}{r_bar}'):
            filename = wav_file.stem
            torch.save(y, Y_out_dir.joinpath(filename).with_suffix('.pt'))
            torch.save(x, X_out_dir.joinpath(filename).with_suffix('.pt'))
            text_out_dir.joinpath(filename).with_suffix('.txt').write_text(
                text)
            shutil.copyfile(wav_file,
                            wav_out_dir.joinpath(filename).with_suffix('.wav'))
Code example #10
File: eval.py  Project: 9yte/VenoMave
    params.data_dir = params.data_dir.joinpath(params.model_type)

    assert params.model_type in params.attack_dir, "It seems you are trying to evaluate " \
                                                   "results generated for a different model type"
    # assert params.model_type in str(params.data_dir), "You are using the wrong hmm (and aligned data)!"

    feature_parameters = {
        'window_size': 25e-3,
        'hop_size': 12.5e-3,
        'feature_type': 'raw',
        'num_ceps': 13,
        'left_context': 4,
        'right_context': 4,
        'sampling_rate': tools.get_sampling_rate(params.data_dir.parent)
    }
    feature_parameters['hop_size_samples'] = tools.sec_to_samples(
        feature_parameters['hop_size'], feature_parameters['sampling_rate'])
    feature_parameters['window_size_samples'] = tools.next_pow2_samples(
        feature_parameters['window_size'], feature_parameters['sampling_rate'])

    tools.set_seed(params.seed)

    attack_dir = Path(params.attack_dir)

    assert os.path.exists(attack_dir)

    if not attack_dir.joinpath('log.txt').is_file():
        assert len(list(attack_dir.iterdir())) == 1, \
            "more than one instance of the attack exists!"
        attack_dir = list(attack_dir.iterdir())[0]

    attack_step_dirs = [s for s in attack_dir.iterdir() if s.is_dir()]
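For reference, these are the sample-domain values produced by the two helper calls in the feature_parameters setup above, assuming a 16 kHz sampling rate (the default used in code example #3). The arithmetic below is an assumption about what tools.sec_to_samples and tools.next_pow2_samples compute, not their actual implementation:

import numpy as np

# sec_to_samples(12.5e-3, 16000): 12.5 ms at 16 kHz
hop_size_samples = int(12.5e-3 * 16000)                             # 200
# next_pow2_samples(25e-3, 16000): 25 ms at 16 kHz (400 samples), rounded up to a power of two
window_size_samples = 2 ** int(np.ceil(np.log2(25e-3 * 16000)))     # 512
print(hop_size_samples, window_size_samples)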