# Example no. 1
def validating_noise_data(in_path_raw):
    """Collect the noise-corpus files that are actually present on disk.

    For each of the 'musan' and 'rirs' datasets that is listed in the
    module-level ``sre_file_list`` and has a base directory in
    ``in_path_raw``, every listed file is joined with that base directory
    and kept only if the resulting path exists.

    Parameters
    ----------
    in_path_raw : Mapping
        dataset name -> base directory of the raw data

    Returns
    -------
    dict
        dataset name -> ``np.array`` of rows sorted by path, with columns
        (0) path, (1) channel, (2) name, (3) noise_type, (4) duration
    """
    # preparing
    noise_dataset = ['musan', 'rirs']
    all_files = defaultdict(list)
    n_files = sum(
        len(sre_file_list[i]) for i in noise_dataset if i in sre_file_list)
    n_non_exist = 0
    n_exist = 0
    prog = Progbar(target=n_files,
                   name="Validating noise dataset")
    prog.set_summarizer(key='#Non-exist', fn=lambda x: x[-1])
    prog.set_summarizer(key='#Exist', fn=lambda x: x[-1])
    # check all dataset
    for ds_name in noise_dataset:
        # a dataset without a file list or without a base directory
        # cannot be validated, skip it
        if ds_name not in sre_file_list:
            continue
        if ds_name not in in_path_raw:
            continue
        base_path = in_path_raw[ds_name]
        base_ds = all_files[ds_name]
        # start validating
        for row in sre_file_list[ds_name]:
            # check file
            path, channel, name, noise_type, duration = row[:5]
            path = os.path.join(base_path, path)
            if os.path.exists(path):
                base_ds.append([path, channel, name, noise_type, duration])
                n_exist += 1
            else:
                n_non_exist += 1
            # update progress
            prog['ds'] = ds_name
            prog['#Exist'] = n_exist
            prog['#Non-exist'] = n_non_exist
            # NOTE(review): nothing here advances the progress bar;
            # presumably a call such as `prog.add(1)` belongs at this
            # point -- confirm against the Progbar API.
    # ====== return ====== #
    # Header:
    #  0       1      2         3         4
    # path, channel, name, noise_type, duration
    return {
        key: np.array(sorted(val, key=lambda x: x[0]))
        for key, val in all_files.items()
    }
# Example no. 3
# NOTE(review): every parameter after `X` was missing from the signature as
# found; the list below is rebuilt from the docstring and from the names the
# body reads -- confirm order and defaults against the callers.
def filter_utterances(X, indices, spkid,
                      min_dur=None, min_utt=None,
                      remove_min_length=True, remove_min_uttspk=True,
                      n_speakers=None, ncpu=None, save_path=None,
                      title=''):
    """Drop broken or unwanted utterances and return the remaining indices.

    An utterance is discarded when its slice of `X` is empty, is shorter
    than the minimum number of frames, contains a frame with (close to)
    zero variance, has too many low-variance frames, or overflows while
    computing the variance.  Optionally, speakers with too few utterances
    are removed and the remaining speakers are randomly down-sampled.

    Parameters
    ----------
    X : 2-D matrix
      input features

    indices : Mapping
      utterance_name -> (start, end) in `X`

    spkid : Mapping
      utterance_name -> speaker_id

    min_dur : {None, number} (default: None)
      minimum utterance duration, MINIMUM_UTT_DURATION if None; divided by
      `Config.STEP_LENGTH` to obtain the minimum number of frames

    min_utt : {None, int} (default: None)
      minimum number of utterances per speaker,
      MINIMUM_UTT_PER_SPEAKERS if None

    remove_min_length : bool (default: True)
      if True, remove all files shorter than MINIMUM_UTT_DURATION

    remove_min_uttspk : bool (default: True)
      if True, remove all speakers with lower amount of utterances than
      MINIMUM_UTT_PER_SPEAKERS

    n_speakers : {None, int} (default: None)
      if given, downsample the dataset by given number of speakers

    ncpu : {None, int} (default: None)
      number of processes given to `mpi.MPI`, NCPU if None

    save_path : {None, str} (default: None)
      if given, pickle all filtered files to disk

    title : str (default: '')
      appended to the name of the progress bar

    Returns
    -------
    dict
      utterance_name -> (start, end) for the utterances that were kept
    """
    if min_dur is None:
        min_dur = MINIMUM_UTT_DURATION
    if min_utt is None:
        min_utt = MINIMUM_UTT_PER_SPEAKERS

    minimum_amount_of_frames = min_dur / Config.STEP_LENGTH
    save_data = {}

    prog = Progbar(target=len(indices),
                   name='Filtering broken utterances: %s' % title)
    prog.set_summarizer('zero-length', fn=lambda x: x[-1])
    prog.set_summarizer('min-frames', fn=lambda x: x[-1])
    prog.set_summarizer('zero-var', fn=lambda x: x[-1])
    prog.set_summarizer('small-var', fn=lambda x: x[-1])
    prog.set_summarizer('overflow', fn=lambda x: x[-1])

    # ====== mpi function for checking ====== #
    # NOTE: this helper is not called anywhere in this function; it is
    # kept as the faster alternative described in its own comments.
    @nb.jit(nopython=True, nogil=True)
    def _fast_mean_var_ax0(z):
        # using this function for calculating mean and variance
        # can double the speed but cannot check overflow,
        # only accept float32 or float64 input
        s1 = np.zeros(shape=(z.shape[1], ), dtype=z.dtype)
        s2 = np.zeros(shape=(z.shape[1], ), dtype=z.dtype)
        for i in range(z.shape[0]):
            s1 += z[i]
            s2 += np.power(z[i], 2)
        mean = s1 / z.shape[0]
        var = s2 / z.shape[0] - np.power(mean, 2)
        return mean, var

    def _mpi_func(jobs):
        # yields one tuple of flags for every (name, (start, end)) job
        for name, (start, end) in jobs:
            y = X[start:end]
            # flags
            is_zero_len = False
            is_zero_var = False
            is_small_var = False
            is_min_frames = False
            is_overflow = False
            # checking length
            if y.shape[0] == 0:
                is_zero_len = True
            elif y.shape[0] < minimum_amount_of_frames:
                is_min_frames = True
            # checking statistics (only for utterances that are long enough)
            else:
                # presumably turns RuntimeWarning into a raised exception so
                # it can be caught below -- verify against the helper
                with catch_warnings_error(RuntimeWarning):
                    try:
                        # variance over the last axis of the slice
                        var = np.var(y, axis=-1)
                    # numerical unstable
                    except RuntimeWarning as w:
                        if 'overflow encountered' in str(w):
                            is_overflow = True
                        else:
                            # any other warning is only reported, the
                            # utterance is not flagged
                            print(name, ':', w)
                    # process with more numerical filtering
                    else:
                        if np.any(np.isclose(var, 0)):
                            is_zero_var = True
                        # very heuristic and aggressive here
                        # filter-out anything with ~16.67% of low-var
                        # this could remove 1/3 of the original data
                        if np.sum(var < 0.01) > (len(y) / 6):
                            is_small_var = True
            # return the flags
            yield (name, is_zero_len, is_min_frames, is_zero_var, is_small_var,
                   is_overflow)

    # ====== running the multiprocessing filter ====== #
    zero_len_files = {}
    min_frame_files = {}
    zero_var_files = {}
    small_var_files = {}
    overflow_files = {}
    # NOTE(review): `func` and `batch` were missing from the call as found.
    # `_mpi_func` iterates over its argument, so jobs are presumably
    # delivered in batches -- confirm the keyword names and the batch size
    # against the mpi.MPI API.
    for res in mpi.MPI(jobs=sorted(indices.items(), key=lambda x: x[1][0]),
                       func=_mpi_func,
                       ncpu=NCPU if ncpu is None else int(ncpu),
                       batch=250):
        name = res[0]
        if res[1]:
            zero_len_files[name] = 1
        if res[2]:
            min_frame_files[name] = 1
        if res[3]:
            zero_var_files[name] = 1
        if res[4]:
            small_var_files[name] = 1
        if res[5]:
            overflow_files[name] = 1
        # update progress
        prog['name'] = name[:48]
        prog['zero-length'] = len(zero_len_files)
        prog['min-frames'] = len(min_frame_files)
        prog['zero-var'] = len(zero_var_files)
        prog['small-var'] = len(small_var_files)
        prog['overflow'] = len(overflow_files)
        # NOTE(review): nothing here advances the progress bar; presumably
        # a call such as `prog.add(1)` belongs at this point -- confirm.
    # ====== remove broken files ====== #
    # short utterances are only dropped on request
    if not bool(remove_min_length):
        min_frame_files = {}
    new_indices = {
        name: (start, end)
        for name, (start, end) in indices.items()
        if name not in zero_len_files
        and name not in min_frame_files and name not in zero_var_files
        and name not in small_var_files and name not in overflow_files
    }
    print("Filtered #utterances: %s/%s (files)" %
          (ctext(len(indices) - len(new_indices),
                 'lightcyan'), ctext(len(indices), 'cyan')))
    indices = new_indices
    # ====== store save data ====== #
    save_data['zero_len'] = zero_len_files
    save_data['min_dur'] = min_frame_files
    save_data['zero_var'] = zero_var_files
    save_data['small_var'] = small_var_files
    save_data['overflow'] = overflow_files
    # ====== filter-out by number of utt-per-speaker ====== #
    if bool(remove_min_uttspk):
        # group the remaining utterance names by speaker
        spk2utt = defaultdict(list)
        for name in indices.keys():
            spk2utt[spkid[name]].append(name)

        n_utt_removed = 0
        n_spk_removed = 0
        removed_utt = []
        keep_utt = []
        for spk, utt in spk2utt.items():
            if len(utt) < min_utt:
                n_utt_removed += len(utt)
                n_spk_removed += 1
                removed_utt += utt
            else:
                keep_utt += utt

        removed_utt = set(removed_utt)
        keep_utt = set(keep_utt)
        save_data['min_utt'] = removed_utt

        print("Removed min-utt/spk:  %s/%s(utt)  %s/%s(spk)" %
              (ctext(n_utt_removed, 'lightcyan'), ctext(len(indices), 'cyan'),
               ctext(n_spk_removed, 'lightcyan'), ctext(len(spk2utt), 'cyan')))
        assert len(indices) == n_utt_removed + len(keep_utt), "Not possible!"

        indices = {
            name: (start, end)
            for name, (start, end) in indices.items() if name in keep_utt
        }
    # ====== sample by number of speakers ====== #
    if isinstance(n_speakers, Number) and n_speakers > 0:
        spk2utt = defaultdict(list)
        for name, (start, end) in indices.items():
            spk2utt[spkid[name]].append((name, (start, end)))

        n_org_spk = len(spk2utt)
        n_org_ids = len(indices)
        # only need down-sampling with smaller number of speaker
        if n_speakers < n_org_spk:
            # seeded generator keeps the selection reproducible
            rand = np.random.RandomState(seed=Config.SUPER_SEED)
            tmp = list(spk2utt.keys())
            # NOTE(review): the generator was created but never used in the
            # code as found; the shuffle is restored on that basis.
            rand.shuffle(tmp)
            # int() because any Number passes the isinstance check above,
            # while slicing needs an integer
            sampled_spk = tmp[:int(n_speakers)]

            indices = []
            for spk in sampled_spk:
                indices += spk2utt[spk]
            indices = dict(indices)
        else:
            sampled_spk = spk2utt
        # print some log
        print("Selected: %s/%s(spk) which have %s/%s(utt)" %
              (ctext(len(sampled_spk), 'lightcyan'), ctext(n_org_spk, 'cyan'),
               ctext(len(indices), 'lightcyan'), ctext(n_org_ids, 'cyan')))
    # ====== return the new indices ====== #
    if save_path is not None:
        # saving is best-effort: a failure is reported, not raised
        try:
            with open(save_path, 'wb') as save_file:
                pickle.dump(save_data, save_file)
        except Exception as e:
            print("Cannot save filtering data to path: '%s', error: '%s'" %
                  (save_path, str(e)))
    return indices
