Ejemplo n.º 1
0
def validate_features_dataset(output_dataset_path, ds_validation_path):
    """Plot a sample of the extracted features for visual inspection.

    Every dataset entry whose key contains ``'indices_'`` is taken as the
    index table of one feature; the feature name is the text after the last
    underscore of that key.  File names are sampled from the first index
    table (``sampling_iter`` with ``k=250`` and ``Config.SUPER_SEED``), names
    that are missing from any other index table are dropped, and one
    multi-feature figure is drawn for each remaining file.

    Parameters
    ----------
    output_dataset_path : path given to ``F.Dataset`` (opened read-only)
    ds_validation_path : destination given to ``V.plot_save``
    """
    dataset = F.Dataset(output_dataset_path, read_only=True)
    print(dataset)

    # feature name -> (index table, feature data)
    feat_map = {}
    for key, table in dataset.items():
        if 'indices_' not in key:
            continue
        feat_name = key.split('_')[-1]
        feat_map[feat_name] = (table, dataset[feat_name])

    index_tables = [table for table, _ in feat_map.values()]
    # ====== sampling 250 files ====== #
    candidates = sampling_iter(it=index_tables[0].keys(),
                               k=250,
                               seed=Config.SUPER_SEED)
    # keep only the files known to every feature
    all_files = []
    for fname in candidates:
        if all(fname in table for table in index_tables):
            all_files.append(fname)
    print("#Samples:", ctext(len(all_files), 'cyan'))

    def _load_features(fname):
        # slice every feature array with the (start, end) of this file
        loaded = {}
        for feat_name, (table, data) in feat_map.items():
            begin, stop = table[fname]
            loaded[feat_name] = data[begin:stop][:].astype('float32')
        return loaded

    # ====== ignore the 20-figures warning ====== #
    with catch_warnings_ignore(RuntimeWarning):
        for fname in all_files:
            X = _load_features(fname)
            header = '[%s]%s' % (dataset['dsname'][fname], fname)
            V.plot_multiple_features(features=X, fig_width=20, title=header)

    V.plot_save(ds_validation_path, dpi=12)
Ejemplo n.º 2
0
    def test_speech_processor(self):
        """Run ``F.SpeechProcessor`` on the digit wav data and compare the
        sum of each stored entry with ``test_speech_features``.

        When the digit data cannot be loaded the error is printed and the
        test returns without asserting anything.
        """
        try:
            datapath = F.load_digit_wav()
        except Exception as e:
            print('Error (skip this test):', str(e))
            return
        output_path = utils.get_datasetpath(name='digit', override=True)
        config = dict(audio_ext='wav',
                      sr_new=8000,
                      win=0.02,
                      shift=0.01,
                      nb_melfilters=40,
                      nb_ceps=13,
                      get_delta=2,
                      get_energy=True,
                      pitch_threshold=0.8,
                      get_spec=True,
                      get_mspec=True,
                      get_mfcc=True,
                      get_pitch=True,
                      get_vad=True,
                      save_stats=True,
                      substitute_nan=None,
                      dtype='float32',
                      datatype='memmap',
                      ncache=0.12,
                      ncpu=4)
        feat = F.SpeechProcessor(datapath, output_path, **config)
        feat.run()
        ds = F.Dataset(output_path)

        def is_equal(x1, x2):
            # Loose comparison: both values are cast to float32, rendered as
            # text, and considered equal when at least half of the characters
            # (relative to the longer text) match position by position.
            text1 = repr(np.array(x1, 'float32').tolist())
            text2 = repr(np.array(x2, 'float32').tolist())
            n_match = sum(1 for a, b in zip(text1, text2) if a == b)
            return n_match >= max(len(text1), len(text2)) // 2

        def total(key):
            # float32 sum over the whole stored array
            return np.sum(ds[key][:], dtype='float32')

        # these numbers are numerically very unstable
        for key in ds.keys():
            if key == 'indices.csv':
                self.assertTrue(isinstance(ds[key], str))
                continue
            if '_' not in key:
                # plain feature: only checked when a matching PCA entry exists
                if (key + '_pca') in ds:
                    self.assertTrue(
                        is_equal(total(key), test_speech_features[key]))
                continue
            if '_pca' in key:
                observed = np.sum(ds[key].components_)
            else:
                observed = total(key)
            self.assertTrue(is_equal(observed, test_speech_features[key]))
Ejemplo n.º 3
0
def prepare_ivec_data(recipe, feat):
    """Load one feature array and split its file indices into train/test.

    Files listed in ``TRAIN_DATA`` form the training set; every other entry
    of the dataset's ``'indices'`` table goes to the test set.

    Returns
    -------
    X : the feature stored under `feat`
    train_indices : dict name -> index entry for the training files
    test_indices : dict name -> index entry for the remaining files
    """
    dataset = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe),
                        read_only=True)
    X = dataset[feat]

    train_indices = {}
    for name in TRAIN_DATA.keys():
        train_indices[name] = dataset['indices'][name]

    test_indices = {}
    for name, start_end in dataset['indices'].items():
        if name in TRAIN_DATA:
            continue
        test_indices[name] = start_end

    print("#Train files:", ctext(len(train_indices), 'cyan'))
    print("#Test files:", ctext(len(test_indices), 'cyan'))
    return X, train_indices, test_indices
Ejemplo n.º 4
0
    get_logpath(name="analyze_data.log",
                increasing=True,
                odin_base=False,
                root=ANALYSIS_DIR))
# Echo the selected recipe and feature, then make sure the recipe's feature
# directory exists before anything is loaded from it.
print(ctext(FEATURE_RECIPE, 'lightyellow'))
print(ctext(FEATURE_NAME, 'lightyellow'))
assert os.path.isdir(os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE))
# ====== essential path ====== #
# Output PDF is named '<recipe without underscores>_<feature>.pdf' and is
# placed inside ANALYSIS_DIR.
figure_path = os.path.join(
    ANALYSIS_DIR,
    '%s_%s.pdf' % (FEATURE_RECIPE.replace('_', ''), FEATURE_NAME))
print(ctext(figure_path, 'lightyellow'))
# ===========================================================================
# Load the data
# ===========================================================================
ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE),
               read_only=True)
X = ds[FEATURE_NAME]
# remove all noise data: only entries whose name does not contain '/' are
# kept (mapping: name -> (start, end))
indices = {
    name: (start, end)
    for name, (start, end) in ds['indices_%s' % FEATURE_NAME].items()
    if '/' not in name
}

# sorted list of the distinct values stored in the 'dsname' table
all_dataset = sorted(set(ds['dsname'].values()))
print("All dataset:", ctext(all_dataset, 'cyan'))
# ===========================================================================
# Helpers
# ===========================================================================
# fractions in (0, 1); presumably the percentiles reported by the helpers
# defined further down -- TODO confirm against their usage
all_percentiles = [.01, .05, .1, .25, .5, .75, .9, .95, .99]
Ejemplo n.º 5
0
def prepare_dnn_data(recipe, feat, utt_length, seed=87654321):
    """Create the training/validation feeders and the cached test segments.

    Parameters
    ----------
    recipe : name of the feature directory inside `PATH_ACOUSTIC_FEAT`
    feat : name of the feature array stored in the dataset
    utt_length : utterance length; divided by `FRAME_SHIFT` to obtain the
        number of frames per segment
    seed : seed forwarded to `train_valid_test_split`

    Return
    ------
    train_feeder : Feeder for training
    valid_feeder : Feeder for validating
    test_ids : dict, test file name -> (start, end) rows in `test_dat`
    test_dat : memory-mapped array holding the segmented test utterances
    all_speakers : sorted list of all speakers in the training set
    """
    n_frames = int(utt_length / FRAME_SHIFT)
    dataset = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe),
                        read_only=True)
    X = dataset[feat]

    # ====== partition the files ====== #
    train_indices = {}
    for name in TRAIN_DATA.keys():
        train_indices[name] = dataset['indices'][name]
    test_indices = {}
    for name, start_end in dataset['indices'].items():
        if name not in TRAIN_DATA:
            test_indices[name] = start_end
    train_indices, valid_indices = train_valid_test_split(
        x=list(train_indices.items()),
        train=0.9,
        inc_test=False,
        seed=seed)
    all_speakers = sorted(set(TRAIN_DATA.values()))
    # speaker labels are used directly as class ids
    n_speakers = max(all_speakers) + 1
    print("#Train files:", ctext(len(train_indices), 'cyan'))
    print("#Valid files:", ctext(len(valid_indices), 'cyan'))
    print("#Test files:", ctext(len(test_indices), 'cyan'))
    print("#Speakers:", ctext(n_speakers, 'cyan'))

    # ====== feeders ====== #
    recipes = [
        F.recipes.Sequencing(frame_length=n_frames,
                             step_length=n_frames,
                             end='pad',
                             pad_value=0,
                             pad_mode='post',
                             data_idx=0),
        F.recipes.Name2Label(lambda name: TRAIN_DATA[name], ref_idx=0),
        F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
    ]

    def _make_feeder(indices, ncpu, buffer_size):
        # both feeders read from the same feature array in 'batch' mode
        return F.Feeder(data_desc=F.IndexedData(data=X, indices=indices),
                        batch_mode='batch',
                        ncpu=ncpu,
                        buffer_size=buffer_size)

    train_feeder = _make_feeder(train_indices, ncpu=7, buffer_size=12)
    valid_feeder = _make_feeder(valid_indices, ncpu=2, buffer_size=4)
    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)

    # ====== cache the test data ====== #
    tag = (feat, int(utt_length))
    cache_dat = os.path.join(PATH_EXP, 'test_%s_%d.dat' % tag)
    cache_ids = os.path.join(PATH_EXP, 'test_%s_%d.ids' % tag)
    # validate cache files: the data file is only trusted together with an
    # index file describing the same number of test files
    if not os.path.exists(cache_ids):
        if os.path.exists(cache_dat):
            os.remove(cache_dat)
    else:
        with open(cache_ids, 'rb') as f:
            cached_ids = pickle.load(f)
        if len(cached_ids) != len(test_indices):
            os.remove(cache_ids)
            if os.path.exists(cache_dat):
                os.remove(cache_dat)

    # caching
    if not os.path.exists(cache_dat):
        writer = F.MmapData(cache_dat,
                            dtype='float16',
                            shape=(0, n_frames, X.shape[1]))
        segment_ids = {}
        prog = Progbar(target=len(test_indices))
        offset = 0
        for name, (start, end) in test_indices.items():
            segments = segment_axis(X[start:end],
                                    axis=0,
                                    frame_length=n_frames,
                                    step_length=n_frames,
                                    end='pad',
                                    pad_value=0,
                                    pad_mode='post')
            writer.append(segments)
            # update indices
            n_segments = len(segments)
            segment_ids[name] = (offset, offset + n_segments)
            offset += n_segments
            # update progress
            prog.add(1)
        writer.flush()
        writer.close()
        with open(cache_ids, 'wb') as f:
            pickle.dump(segment_ids, f)

    # ====== re-load ====== #
    dat = F.MmapData(cache_dat, read_only=True)
    with open(cache_ids, 'rb') as f:
        ids = pickle.load(f)

    # ====== save some sample ====== #
    sample_path = os.path.join(PATH_EXP, 'test_%s_%d.pdf' % tag)
    V.plot_figure(nrow=9, ncol=6)
    chosen = sampling_iter(it=sorted(ids.items(), key=lambda x: x[0]),
                           k=12,
                           seed=87654321)
    for i, (name, (start, end)) in enumerate(chosen):
        x = dat[start:end][:].astype('float32')
        # show one randomly picked segment of the utterance
        picked = x[np.random.randint(0, len(x))]
        ax = V.plot_spectrogram(picked.T, ax=(12, 1, i + 1), title='')
        ax.set_title(name)
    V.plot_save(sample_path)
    return (train_feeder, valid_feeder, ids, dat, all_speakers)
Ejemplo n.º 6
0
def prepare_data(feat, label, utt_length=0.4, for_ivec=False):
    """
  Load the acoustic dataset at PATH_ACOUSTIC and prepare the data for the
  classification task named by `label`.

  Parameters
  ----------
  feat : str
    name of the feature; must exist in the dataset
  label : str
    task name (lower-cased before use); must be a key of `_support_label`,
    whose value is the position of the label inside the '_'-separated
    file name
  utt_length : float
    value in (0, 1]; the segment length is this fraction of the longest
    utterance (in frames)
  for_ivec : bool
    if True, return the i-vector outputs below without building feeders

  Returns (i-vector)
  ------------------
  ds[feat]
  train_files
  y_train
  test_files
  y_test
  labels

  Returns (x-vector)
  ------------------
  train : Feeder
    feeder for training data for iterating over pair of (X, y)
  valid : Feeder
    feeder for validating data for iterating over pair of (X, y)
  X_test_name : list of file names
    file names are appended with '.%d' for the cut segment ID
  X_test_true : list of integer
    label of each sample
  X_test_data : array
    list of test data, same length as X_test_name
  labels : list of string
    list of labels for classification task

  Example
  -------
  (train, valid,
   X_test_name, X_test_true, X_test_data,
   labels) = prepare_data(feat=FEAT, label='gender')

  """
    label = str(label).lower()
    assert label in _support_label, "No support for label: %s" % label
    assert 0 < utt_length <= 1.
    # ====== load dataset ====== #
    if not os.path.exists(PATH_ACOUSTIC):
        raise RuntimeError(
            "Cannot find extracted acoustic features at path: '%s',"
            "run the code speech_features_extraction.py!" % PATH_ACOUSTIC)
    ds = F.Dataset(PATH_ACOUSTIC, read_only=True)
    assert feat in ds, "Cannot find feature with name: %s" % feat
    # list of (name, (start, end)), shuffled in place with the backend RNG
    indices = list(ds['indices'].items())
    K.get_rng().shuffle(indices)

    # ====== helper ====== #
    def is_train(x):
        # a file belongs to the training set when its name starts with 'train_'
        return x.split('_')[0] == 'train'

    def extract_label(x):
        # pick the '_'-separated field of the file name selected by the task
        return x.split('_')[_support_label[label]]

    print("Task:", ctext(label, 'cyan'))
    # fn_label is called below with a file name and compared with
    # np.argmax(y), i.e. it returns the integer label of that file
    fn_label, labels = unique_labels([i[0] for i in indices],
                                     key_func=extract_label,
                                     return_labels=True)
    print("Labels:", ctext(labels, 'cyan'))
    # ====== training and test data ====== #
    train_files = []  # (name, (start, end)) ...
    test_files = []
    for name, (start, end) in indices:
        if is_train(name):
            train_files.append((name, (start, end)))
        else:
            test_files.append((name, (start, end)))
    # name for each dataset, useful for later
    print("#Train:", ctext(len(train_files), 'cyan'))
    print("#Test:", ctext(len(test_files), 'cyan'))
    # ====== for i-vectors ====== #
    y_train = np.array([fn_label(i[0]) for i in train_files])
    y_test = np.array([fn_label(i[0]) for i in test_files])
    if bool(for_ivec):
        return ds[feat], train_files, y_train, test_files, y_test, labels
    # ====== length ====== #
    # segment length is a fraction of the longest utterance (train and test);
    # segments do not overlap because the step equals the frame length
    length = [(end - start) for _, (start, end) in indices]
    max_length = max(length)
    frame_length = int(max_length * utt_length)
    step_length = frame_length
    print("Max length  :", ctext(max_length, 'yellow'))
    print("Frame length:", ctext(frame_length, 'yellow'))
    print("Step length :", ctext(step_length, 'yellow'))
    # ====== split dataset ====== #
    # split by speaker ID
    train_files, valid_files = train_valid_test_split(
        x=train_files,
        train=0.8,
        cluster_func=None,
        idfunc=lambda x: x[0].split('_')[4],  # split by speaker
        inc_test=False)
    print("#File train:", ctext(len(train_files), 'cyan'))
    print("#File valid:", ctext(len(valid_files), 'cyan'))
    print("#File test :", ctext(len(test_files), 'cyan'))

    # the same recipe list is shared by the three feeders
    recipes = [
        F.recipes.Sequencing(frame_length=frame_length,
                             step_length=step_length,
                             end='pad',
                             pad_mode='post',
                             pad_value=0),
        F.recipes.Name2Label(converter_func=fn_label),
        F.recipes.LabelOneHot(nb_classes=len(labels), data_idx=-1)
    ]
    feeder_train = F.Feeder(F.IndexedData(ds[feat], indices=train_files),
                            ncpu=6,
                            batch_mode='batch')
    feeder_valid = F.Feeder(F.IndexedData(ds[feat], indices=valid_files),
                            ncpu=4,
                            batch_mode='batch')
    # the test feeder runs in 'file' mode and yields (name, idx, X, y)
    feeder_test = F.Feeder(F.IndexedData(ds[feat], indices=test_files),
                           ncpu=4,
                           batch_mode='file')
    feeder_train.set_recipes(recipes)
    feeder_valid.set_recipes(recipes)
    feeder_test.set_recipes(recipes)
    print(feeder_train)

    # ====== process X_test, y_test in advance for faster evaluation ====== #
    # NOTE(review): the arguments are not used inside the body; presumably
    # they only serve as the key of the @cache_disk cache -- confirm.
    @cache_disk
    def _extract_test_data(feat, label, utt_length):
        prog = Progbar(target=len(feeder_test),
                       print_summary=True,
                       name="Preprocessing test set")
        X_test = defaultdict(list)
        for name, idx, X, y in feeder_test:
            # validate everything as expected
            assert fn_label(name) == np.argmax(y), name  # label is right
            # save to list
            X_test[name].append((idx, X))
            prog.add(X.shape[0])
        # ====== create 1 array for data and dictionary for indices ====== #
        X_test_name = []
        X_test_data = []
        for name, X in X_test.items():
            # restore the segment order of each file (sorted by idx) and
            # store the result as float16
            X = np.concatenate([x[1] for x in sorted(X, key=lambda i: i[0])],
                               axis=0).astype('float16')
            # one entry '<file name>.<segment id>' per segment
            X_test_name += [name + '.%d' % i for i in range(len(X))]
            X_test_data.append(X)
        X_test_name = np.array(X_test_name)
        X_test_data = np.concatenate(X_test_data, axis=0)
        return X_test_name, X_test_data

    # NOTE(review): the original comment here said "convert everything back
    # to float32", but no conversion is performed; X_test_data is returned
    # as produced by _extract_test_data (float16).
    X_test_name, X_test_data = _extract_test_data(feat, label, utt_length)
    # the label is recovered from the file name, i.e. the part before '.'
    X_test_true = np.array([fn_label(i.split('.')[0]) for i in X_test_name])
    return feeder_train, feeder_valid, \
    X_test_name, X_test_true, X_test_data, labels
Ejemplo n.º 7
0
        processor = pp.FeatureProcessor(jobs=all_files,
                                        path=PATH_ACOUSTIC_FEATURES,
                                        extractor=extractors,
                                        n_cache=120,
                                        ncpu=None,
                                        override=True,
                                        identifier='name',
                                        log_path=os.path.join(
                                            EXP_DIR, 'processor.log'),
                                        stop_on_failure=True)
        processor.run()
        # pp.validate_features(processor,
        #                      nb_samples=12,
        #                      path=os.path.join(EXP_DIR, 'feature_validation'),
        #                      override=True)
# Open the extracted features read-only and report the distribution of
# utterance lengths (end - start of every index entry) for the chosen feature.
ds = F.Dataset(PATH_ACOUSTIC_FEATURES, read_only=True)
print(ds)
indices = list(ds['indices_%s' % args.feat].items())
print("Utterances length:")
print("   ",
      describe([end - start for name, (start, end) in indices], shorten=True))
# ===========================================================================
# Basic path for GMM, T-matrix and I-vector
# ===========================================================================
# EXP_DIR is rebound to a sub-directory named '<FEAT>_<NMIX>_<TV_DIM>'; every
# later use of EXP_DIR therefore refers to this experiment-specific folder.
EXP_DIR = os.path.join(EXP_DIR, '%s_%d_%d' % (FEAT, NMIX, TV_DIM))
LOG_PATH = get_logpath(name='log.txt',
                       override=False,
                       root=EXP_DIR,
                       odin_base=False)
# presumably redirects/duplicates standard output to LOG_PATH -- TODO confirm
stdio(LOG_PATH)
print("Exp-dir:", ctext(EXP_DIR, 'cyan'))
Ejemplo n.º 8
0
        "`-aug` option was not provided, choose: 'rirs' or 'musan'")
# Seed NumPy's global RNG with the project-wide seed.
np.random.seed(Config.SUPER_SEED)
# percentage of data will be used for augmentation
PERCENTAGE_AUGMENTATION = 0.8
# ===========================================================================
# Constant
# ===========================================================================
# Candidate datasets, restricted to those present in ALL_DATASET.
AUGMENTATION_DATASET = ['swb', 'sre04', 'sre05', 'sre06', 'sre08', 'sre10']
AUGMENTATION_DATASET = [i for i in AUGMENTATION_DATASET if i in ALL_DATASET]
print("Augmenting following dataset: %s" %
      ctext(', '.join(AUGMENTATION_DATASET), 'yellow'))
# ====== get the duration ====== #
path = os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE)
assert os.path.exists(path), \
"Acoustic feature must be extracted first, and stored at path: %s" % path
ds = F.Dataset(path, read_only=True)
# copy the 'duration' table into a plain dict so the dataset can be closed
all_duration = dict(ds['duration'].items())
ds.close()
# ====== select a new file list ====== #
# AUG_FILES holds the selected rows, each extended with its duration.
# row[4] is compared with the dataset names and row[2] is the key into the
# duration table -- NOTE(review): column meaning inferred from usage only.
AUG_FILES = []
missing_duration = []
for row in ALL_FILES:
    if row[4] not in AUGMENTATION_DATASET:
        continue
    # rows without a known duration are collected separately and skipped
    if row[2] not in all_duration:
        missing_duration.append(row)
        continue
    dur = all_duration[row[2]]
    AUG_FILES.append([i for i in row] + [dur])
print("#Files missing duration:", ctext(len(missing_duration), 'cyan'))
assert len(AUG_FILES), "Cannot find any files for augmentation"
Ejemplo n.º 9
0
def prepare_dnn_data(save_dir,
                     feat_name=None,
                     utt_length=None,
                     seq_mode=None,
                     min_dur=None,
                     min_utt=None,
                     exclude=None,
                     train_proportion=None,
                     return_dataset=False):
    """Build the training and validation feeders for the DNN.

    The train/valid split and the speaker mapping are cached as pickle files
    inside `save_dir` ('filtered_files.pkl', 'train_files.pkl',
    'speaker_info.pkl').  They are recomputed only when at least one of the
    three files is missing.

    NOTE(review): when the cache is loaded, `exclude`, `min_dur`, `min_utt`
    and `_args.downsample` have no effect on the returned split.

    Parameters
    ----------
    save_dir : existing directory used for the cached pickle files
    feat_name : feature name; defaults to FEATURE_NAME
    utt_length : defaults to int(_args.utt)
    seq_mode : defaults to str(_args.seq), stripped and lower-cased
    min_dur : defaults to MINIMUM_UTT_DURATION
    min_utt : defaults to MINIMUM_UTT_PER_SPEAKERS
    exclude : comma-separated dataset names to drop; defaults to
        str(_args.exclude), stripped
    train_proportion : None, or a value in (0, 1) giving the fraction of
        training files that is kept (random subset)
    return_dataset : if true, the opened dataset is returned as well

    Returns
    -------
    (train_feeder, valid_feeder, all_speakers), followed by `ds` when
    `return_dataset` is true.
    """
    assert os.path.isdir(save_dir), \
        "Path to '%s' is not a directory" % save_dir
    # ====== fill in defaults from module-level configuration ====== #
    if feat_name is None:
        feat_name = FEATURE_NAME
    if utt_length is None:
        utt_length = int(_args.utt)
    if seq_mode is None:
        seq_mode = str(_args.seq).strip().lower()
    if min_dur is None:
        min_dur = MINIMUM_UTT_DURATION
    if min_utt is None:
        min_utt = MINIMUM_UTT_PER_SPEAKERS
    if exclude is None:
        exclude = str(_args.exclude).strip()
    print("Minimum duration: %s(s)" % ctext(min_dur, 'cyan'))
    print("Minimum utt/spk : %s(utt)" % ctext(min_utt, 'cyan'))
    # ******************** prepare dataset ******************** #
    path = os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE)
    assert os.path.exists(
        path), "Cannot find acoustic dataset at path: %s" % path
    ds = F.Dataset(path=path, read_only=True)
    # single seeded RNG; the results below depend on the order of its draws
    rand = np.random.RandomState(seed=Config.SUPER_SEED)
    # ====== find the right feature ====== #
    assert feat_name in ds, "Cannot find feature with name: %s" % feat_name
    X = ds[feat_name]
    ids_name = 'indices_%s' % feat_name
    assert ids_name in ds, "Cannot find indices with name: %s" % ids_name
    # ====== basic path ====== #
    path_filtered_data = os.path.join(save_dir, 'filtered_files.pkl')
    path_train_files = os.path.join(save_dir, 'train_files.pkl')
    path_speaker_info = os.path.join(save_dir, 'speaker_info.pkl')
    # ******************** cannot find cached data ******************** #
    if any(not os.path.exists(p)
           for p in [path_filtered_data, path_train_files, path_speaker_info]):
        # ====== exclude some dataset ====== #
        if len(exclude) > 0:
            # dict used as a set of excluded dataset names
            exclude_dataset = {i: 1 for i in exclude.split(',')}
            print("* Excluded dataset:", ctext(exclude_dataset, 'cyan'))
            indices = {
                name: (start, end)
                for name, (start, end) in ds[ids_name].items()
                if ds['dsname'][name] not in exclude_dataset
            }
            # special case exclude all the noise data
            # (noise entries are the ones whose name contains '/')
            if 'noise' in exclude_dataset:
                indices = {
                    name: (start, end)
                    for name, (start, end) in indices.items()
                    if '/' not in name
                }
        else:
            indices = {i: j for i, j in ds[ids_name].items()}
        # ====== down-sampling if necessary ====== #
        # only applied when the requested number of files exceeds 1000
        if _args.downsample > 1000:
            dataset2name = defaultdict(list)
            # ordering the indices so we sample the same set every time
            for name in sorted(indices.keys()):
                dataset2name[ds['dsname'][name]].append(name)
            n_total_files = len(indices)
            n_sample_files = int(_args.downsample)
            # get the percentage of each dataset
            dataset2per = {
                i: len(j) / n_total_files
                for i, j in dataset2name.items()
            }
            # sampling based on percentage
            # (each dataset keeps its share of the requested total; `_` is
            # used as a temporary dict for the selected entries)
            _ = {}
            for dsname, flist in dataset2name.items():
                rand.shuffle(flist)
                n_dataset_files = int(dataset2per[dsname] * n_sample_files)
                _.update({i: indices[i] for i in flist[:n_dataset_files]})
            indices = _
        # ====== * filter out "bad" sample ====== #
        # filter_utterances is given path_filtered_data as save_path;
        # presumably it writes 'filtered_files.pkl' there -- TODO confirm
        indices = filter_utterances(X=X,
                                    indices=indices,
                                    spkid=ds['spkid'],
                                    min_utt=min_utt,
                                    min_dur=min_dur,
                                    remove_min_length=True,
                                    remove_min_uttspk=True,
                                    n_speakers=None,
                                    ncpu=None,
                                    save_path=path_filtered_data)
        # ====== all training file name ====== #
        # modify here to train full dataset
        # names are sorted first so the two seeded shuffles are reproducible
        all_name = sorted(indices.keys())
        rand.shuffle(all_name)
        rand.shuffle(all_name)
        n_files = len(all_name)
        print("#Files:", ctext(n_files, 'cyan'))
        # ====== speaker mapping ====== #
        # speaker id -> integer label, numbered in sorted speaker order
        name2spk = {name: ds['spkid'][name] for name in all_name}
        all_speakers = sorted(set(name2spk.values()))
        spk2label = {spk: i for i, spk in enumerate(all_speakers)}
        name2label = {name: spk2label[spk] for name, spk in name2spk.items()}
        assert len(name2label) == len(all_name)
        print("#Speakers:", ctext(len(all_speakers), 'cyan'))
        # ====== stratify sampling based on speaker ====== #
        valid_name = []
        # create speakers' cluster
        label2name = defaultdict(list)
        for name, label in sorted(name2label.items(), key=lambda x: x[0]):
            label2name[label].append(name)
        # for each speaker with >= 3 utterance
        # (speakers with fewer utterances contribute to training only)
        for label, name_list in sorted(label2name.items(), key=lambda x: x[0]):
            if len(name_list) < 3:
                continue
            n = max(1, int(0.05 * len(name_list)))  # 5% for validation
            valid_name += rand.choice(a=name_list, size=n,
                                      replace=False).tolist()
        # train list is the rest
        _ = set(valid_name)
        train_name = [i for i in all_name if i not in _]
        # ====== split training and validation ====== #
        train_indices = {name: indices[name] for name in train_name}
        valid_indices = {name: indices[name] for name in valid_name}
        # ====== save cached data ====== #
        with open(path_train_files, 'wb') as fout:
            pickle.dump({'train': train_indices, 'valid': valid_indices}, fout)
        with open(path_speaker_info, 'wb') as fout:
            pickle.dump(
                {
                    'all_speakers': all_speakers,
                    'name2label': name2label,
                    'spk2label': spk2label
                }, fout)
    # ******************** load cached data ******************** #
    else:
        # NOTE(review): pickle files are loaded as-is; only use a save_dir
        # whose content is trusted.
        with open(path_train_files, 'rb') as fin:
            obj = pickle.load(fin)
            train_indices = obj['train']
            valid_indices = obj['valid']
        with open(path_speaker_info, 'rb') as fin:
            obj = pickle.load(fin)
            all_speakers = obj['all_speakers']
            name2label = obj['name2label']
            spk2label = obj['spk2label']

    # ******************** print log ******************** #

    def summary_indices(ids):
        # Print, per dataset, the number of utterances and distinct speakers,
        # followed by an MD5 checksum of the sorted "name + (start, end)"
        # text of the split.
        datasets = defaultdict(int)
        speakers = defaultdict(list)
        text = ''
        for name in sorted(ids.keys()):
            text += name + str(ids[name])
            dsname = ds['dsname'][name]
            datasets[dsname] += 1
            speakers[dsname].append(ds['spkid'][name])
        for dsname in sorted(datasets.keys()):
            print('  %-18s: %s(utt) %s(spk)' %
                  (dsname, ctext('%6d' % datasets[dsname], 'cyan'),
                   ctext(len(set(speakers[dsname])), 'cyan')))
        print('  MD5 checksum:', ctext(crypto.md5_checksum(text), 'lightcyan'))

    # ====== training files ====== #
    # number of files, distinct speaker labels and noise files ('/' in name)
    print(
        "#Train files:", ctext('%-8d' % len(train_indices), 'cyan'), "#spk:",
        ctext(len(set(name2label[name] for name in train_indices.keys())),
              'cyan'), "#noise:",
        ctext(len([name for name in train_indices.keys() if '/' in name]),
              'cyan'))
    summary_indices(ids=train_indices)
    # ====== valid files ====== #
    print(
        "#Valid files:", ctext('%-8d' % len(valid_indices), 'cyan'), "#spk:",
        ctext(len(set(name2label[name] for name in valid_indices.keys())),
              'cyan'), "#noise:",
        ctext(len([name for name in valid_indices.keys() if '/' in name]),
              'cyan'))
    summary_indices(ids=valid_indices)
    # ******************** create the recipe ******************** #
    # every file of both splits must have a speaker label
    assert all(name in name2label for name in train_indices.keys())
    assert all(name in name2label for name in valid_indices.keys())
    recipes = prepare_dnn_feeder_recipe(name2label=name2label,
                                        n_speakers=len(all_speakers),
                                        utt_length=utt_length,
                                        seq_mode=seq_mode)
    # ====== downsample training set for analyzing if required ====== #
    # applied after the summary above, so the printed statistics describe
    # the full training split
    if train_proportion is not None:
        assert 0 < train_proportion < 1
        n_training = len(train_indices)
        train_indices = list(train_indices.items())
        rand.shuffle(train_indices)
        rand.shuffle(train_indices)
        train_indices = dict(train_indices[:int(n_training *
                                                train_proportion)])
    # ====== create feeder ====== #
    train_feeder = F.Feeder(data_desc=F.IndexedData(data=X,
                                                    indices=train_indices),
                            batch_mode='batch',
                            ncpu=NCPU,
                            buffer_size=256)

    valid_feeder = F.Feeder(data_desc=F.IndexedData(data=X,
                                                    indices=valid_indices),
                            batch_mode='batch',
                            ncpu=max(2, NCPU // 4),
                            buffer_size=64)

    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)
    print(valid_feeder)
    # ====== debugging ====== #
    # In debug mode the validation set is iterated once, a few samples are
    # plotted to '/tmp/tmp.pdf', and the process exits; nothing is returned.
    if IS_DEBUGGING:
        import matplotlib
        matplotlib.use('Agg')
        prog = Progbar(target=len(valid_feeder),
                       print_summary=True,
                       name="Iterating validation set")
        samples = []
        n_visual = 250
        # NOTE(review): the loop variable X rebinds the feature array X of
        # the enclosing scope; harmless here because of the exit() below.
        for name, idx, X, y in valid_feeder.set_batch(batch_size=100000,
                                                      batch_mode='file',
                                                      seed=None,
                                                      shuffle_level=0):
            # with this batch size every file is expected in a single batch
            assert idx == 0, "Utterances longer than %.2f(sec)" % (
                100000 * Config.STEP_LENGTH)
            prog['X'] = X.shape
            prog['y'] = y.shape
            prog.add(X.shape[0])
            # random sampling
            # (about half of the files, 4 random segments each, until
            # n_visual samples have been collected)
            if rand.rand(1) < 0.5 and len(samples) < n_visual:
                for i in rand.randint(0, X.shape[0], size=4, dtype='int32'):
                    samples.append((name, X[i], np.argmax(y[i], axis=-1)))
        # plot the spectrogram
        n_visual = len(samples)
        V.plot_figure(nrow=n_visual, ncol=8)
        for i, (name, X, y) in enumerate(samples):
            is_noise = '/' in name
            # y is the label index stored when the sample was collected
            assert name2label[
                name] == y, "Speaker label mismatch for file: %s" % name
            # strip the part after '/' before looking up dataset and speaker
            name = name.split('/')[0]
            dsname = ds['dsname'][name]
            spkid = ds['spkid'][name]
            # NOTE(review): y is already an index and is not used after this
            # line; this argmax looks redundant.
            y = np.argmax(y, axis=-1)
            ax = V.plot_spectrogram(X.T,
                                    ax=(n_visual, 1, i + 1),
                                    title='#%d' % (i + 1))
            ax.set_title(
                '[%s][%s]%s  %s' %
                ('noise' if is_noise else 'clean', dsname, name, spkid),
                fontsize=6)
        # don't need to be high resolutions
        V.plot_save('/tmp/tmp.pdf', dpi=12)
        exit()
    # ====== return ====== #
    if bool(return_dataset):
        return train_feeder, valid_feeder, all_speakers, ds
    return train_feeder, valid_feeder, all_speakers
# python two_way_2_group_data_into_batch.py -m memory_profiler
# group: 165 + 10.9 MB and 24.3 (s/iter)
# group2: 164 + 43.6 MB (old method) and 15.6 (s/iter)
from __future__ import print_function, division, absolute_import

import os
os.environ['ODIN'] = 'theano,cpu,float32'
from six.moves import zip_longest, cPickle

import numpy as np
from odin import backend as K, nnet as N, fuel as F
from odin.utils import UnitTimer

from memory_profiler import profile

# Open the dataset and load its index table; every row is unpacked below
# as (name, start, end).
ds = F.Dataset('/home/trung/data/estonia_audio32')
indices = np.genfromtxt(ds['indices.csv'], dtype=str, delimiter=' ')

# Build three (name, [features, features]) samples from the first three
# index rows; the same MFCC slice is listed twice in each sample.
_samples = []
for name, start, end in (indices[0], indices[1], indices[2]):
    _mfcc = ds['mfcc'][int(start):int(end)]
    _samples.append((name, [_mfcc, _mfcc]))
x0, x1, x2 = _samples

Ejemplo n.º 11
0
                         vad_minlen=0.1,
                         pca=True,
                         pca_whiten=False,
                         center=True,
                         save_stats=True,
                         substitute_nan=None,
                         dtype='float16',
                         datatype='memmap',
                         ncache=0.12,
                         ncpu=8)
# Run the processor inside a UnitTimer context
# (presumably reports the elapsed time -- confirm against odin.utils).
with utils.UnitTimer():
    feat.run()
# Carry the source README over to the generated dataset folder.
readme_name = 'README.md'
shutil.copy(os.path.join(datapath, readme_name),
            os.path.join(output_path, readme_name))
# ====== check the preprocessed dataset ====== #
# Re-open the output read-only so the inspection cannot modify it.
ds = F.Dataset(output_path, read_only=True)
print('Output path:', output_path)
print(ds)

for n in ds.keys():
    if '_pca' in n:
        pca = ds[n]
        if pca.components_ is None:
            print(n, 'components is None !')
        elif np.any(np.isnan(pca.components_)):
            print(n, 'contains NaN !')
        else:
            print(
                n, ':', ' '.join([
                    '%.2f' % i + '-' + '%.2f' % j
                    for i, j in zip(pca.explained_variance_ratio_[:8],
Ejemplo n.º 12
0
# The extractor is identified by the text before the first underscore of
# the recipe name.
EXTRACTOR_NAME = FEATURE_RECIPE.partition("_")[0]
extractor = get_module_from_path(
    identifier=EXTRACTOR_NAME,
    path=get_script_path(),
    prefix='feature_recipes')
assert len(extractor) > 0, \
    "Cannot find extractor with name: %s" % EXTRACTOR_NAME
# Use the first match and call it to obtain the extractor object.
extractor = extractor[0]()
# ====== initializing ====== #
# mapping from
# scoring_data_name -> [features 2-D array,
#                       indices {name: (start, end)},
#                       spkid_or_meta {name: spkid_or_meta},
#                       path {name: path}]
acoustic_features = {}
# Features of the training recipe, opened read-only.
training_ds = F.Dataset(
    path=os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE),
    read_only=True)
# Distinct dataset names found in the training features.
all_training_dataset = set(training_ds['dsname'].values())
print("All training dataset:", ctext(all_training_dataset, 'cyan'))
# ====== extract the feature if not exists ====== #
for dsname, file_list in sorted(list(SCORING_DATASETS.items()) +
                                list(BACKEND_DATASETS.items()),
                                key=lambda x: x[0]):
    # acoustic features already extracted in training dataset
    if dsname in all_training_dataset:
        assert FEATURE_NAME in training_ds, \
            "Cannot find feature with name: %s, from: %s" % (FEATURE_NAME, training_ds.path)
        X = training_ds[FEATURE_NAME]
        indices = {
            name: (start, end)
            for name, (start, end) in training_ds['indices_%s' %
Ejemplo n.º 13
0
    def test_feeders(self):
        """Round-trip a synthetic dataset through ``F.Feeder``.

        A temporary dataset is created with ``X`` = ``arange(10000)``
        reshaped to 2000 rows x 5 columns, split into 100 utterances
        (``name_0`` .. ``name_99``) of 20 rows each, plus a random
        per-row transcription. The feeder is then iterated in three
        configurations:

        1. no recipes: every value of ``X`` must be produced exactly once;
        2. ``Name2Trans`` + ``CreateBatch``: same data check, and the sum of
           all produced labels must match the value derived from the index
           table;
        3. ``TransLoader`` + ``CreateBatch``: every produced row must be
           paired with the transcription value generated for that row.

        Each configuration is checked with and without shuffling.
        """
        with utils.TemporaryDirectory() as temppath:
            # fixed seed so the random transcription is reproducible
            np.random.seed(1208251813)
            # maps str(row values) -> transcription value for that row
            transcription_test = {}
            # ====== create fake dataset ====== #
            ds = F.Dataset(os.path.join(temppath, 'ds'))
            ds['X'] = np.arange(0, 10000).reshape(-1, 5)
            # generate fake indices: [name, start_row, end_row], 20 rows each
            indices = []
            for i, j in enumerate(range(0, ds['X'].shape[0], 20)):
                indices.append(['name_%d' % i, j, j + 20])
            # Expected sum of labels when every row of an utterance is
            # labeled with the numeric suffix of its name (the same
            # conversion given to Name2Trans below). For the data above
            # this is 99000, the value the test previously hard-coded.
            expected_label_sum = sum(
                int(name.split('_')[-1]) * (end - start)
                for name, start, end in indices)
            np.savetxt(os.path.join(ds.path, 'indices.csv'),
                       indices,
                       fmt='%s',
                       delimiter=' ')
            # generate fake transcription
            transcription = F.MmapDict(
                os.path.join(ds.path, 'transcription.dict'))
            for name, start, end in indices:
                trans = np.random.randint(0, 4, size=(20, )).tolist()
                transcription[name] = trans
                # remember which label belongs to which row; rows are
                # unique, so their string form is a usable key
                for i, j in zip(ds['X'][start:end], trans):
                    transcription_test[str(i.tolist())] = j
            transcription.flush()
            transcription.close()
            ds.flush()
            ds.close()
            # ====== test feeder ====== #
            ds = F.Dataset(os.path.join(temppath, 'ds'), read_only=True)
            # all values of X in ascending order (X is an arange)
            REF = ds['X'][:].ravel().tolist()
            feeder = F.Feeder(ds['X'],
                              ds['indices.csv'],
                              ncpu=2,
                              buffer_size=2)

            # ==================== No recipes ==================== #
            def test_iter_no_trans(it):
                """Check that `it` yields every value of X exactly once."""
                x = []
                n = 0
                for i in it:
                    x += i.ravel().tolist()
                    n += i.shape[0]
                # sort so the comparison is independent of batch order
                x = np.sort(x).tolist()
                self.assertEqual(x, REF)
                self.assertEqual(n, ds['X'].shape[0])

            # ====== NO shuffle ====== #
            test_iter_no_trans(feeder.set_batch(12, seed=None,
                                                shuffle_level=0))
            # ====== shuffle 0 ====== #
            test_iter_no_trans(feeder.set_batch(12, seed=1203,
                                                shuffle_level=0))
            # ====== shuffle 2 ====== #
            test_iter_no_trans(feeder.set_batch(12, seed=1203,
                                                shuffle_level=2))
            # ==================== Convert name to indices ==================== #
            feeder.set_recipes([
                F.recipes.Name2Trans(
                    converter_func=lambda name: int(name.split('_')[-1])),
                F.recipes.CreateBatch()
            ])

            def test_iter_trans(it):
                """Check data coverage and the total of all labels in `it`."""
                x = []
                y = 0
                n = 0
                for i, j in it:
                    x += i.ravel().tolist()
                    n += i.shape[0]
                    y += np.sum(j)
                x = np.sort(x).tolist()
                self.assertEqual(x, REF)
                self.assertEqual(y, expected_label_sum)
                self.assertEqual(n, ds['X'].shape[0])

            # ====== NO shuffle ====== #
            test_iter_trans(feeder.set_batch(12, seed=None, shuffle_level=0))
            # ====== shuffle 0 ====== #
            test_iter_trans(feeder.set_batch(12, seed=1203, shuffle_level=0))
            # ====== shuffle 2 ====== #
            test_iter_trans(feeder.set_batch(12, seed=1203, shuffle_level=2))
            # ==================== Transcription ==================== #
            del feeder
            ds = F.Dataset(os.path.join(temppath, 'ds'))
            feeder = F.Feeder(ds['X'],
                              indices=ds['indices.csv'],
                              ncpu=2,
                              buffer_size=2)
            feeder.set_recipes([
                F.recipes.TransLoader(ds['transcription.dict'], dtype='int32'),
                F.recipes.CreateBatch()
            ])
            n = 0
            X = []
            for i, j in feeder.set_batch(12, seed=1208251813, shuffle_level=2):
                X += i.ravel().tolist()
                n += i.shape[0]
                # every row must carry the label generated for it above;
                # assertEqual reports both values on failure
                for x, y in zip(i, j):
                    self.assertEqual(transcription_test[str(x.tolist())], y)
            X = np.sort(X).tolist()
            self.assertEqual(X, REF)
            self.assertEqual(n, ds['X'].shape[0])
Ejemplo n.º 14
0
    # A positive `args.ncpu` is used as given; otherwise fall back to
    # all cores but two, capped at 18.
    if args.ncpu <= 0:
        n_workers = min(18, cpu_count() - 2)
    else:
        n_workers = int(args.ncpu)
    processor = pp.FeatureProcessor(
        jobs=all_files,
        path=PATH_ACOUSTIC,
        extractor=extractors,
        n_cache=0.12,
        ncpu=n_workers,
        override=True,
        identifier='name',
        log_path=os.path.join(PATH_EXP, 'processor.log'),
        stop_on_failure=True  # small dataset, enable stop on failure
    )
    with UnitTimer():
        processor.run()
    # number of logged errors, checked once processing is done
    n_error = len(processor.error_log)
    print(processor)
# ====== copy readme and check the preprocessed dataset ====== #
if n_error == 0:
    # The README is auxiliary: copy the first matching file when one
    # exists, but do not abort validation of a successful run with an
    # IndexError just because the source folder has no README.
    readme_files = [i for i in os.listdir(audio.path) if 'README' in i]
    if readme_files:
        shutil.copy(os.path.join(audio.path, readme_files[0]),
                    os.path.join(PATH_ACOUSTIC, 'README.md'))
    else:
        print("No README found in: %s (skip copying)" % audio.path)

    # Re-open the output read-only and validate the extracted features.
    ds = F.Dataset(PATH_ACOUSTIC, read_only=True)
    print(ds)
    pp.validate_features(ds,
                         path=os.path.join(PATH_EXP, 'acoustic'),
                         nb_samples=12,
                         override=True)
else:
    print("%s errors happened during processing!" % ctext(n_error, 'red'))