Python train_valid_test_splitの例、odin.stats.train_valid_test_split Pythonの例

コード例 #1

0

ファイルを表示

ファイル: cifar10.py プロジェクト: johndpope/odin-ai

# Script fragment: compile the train/test/predict functions, carve a
# validation set out of the training data and configure the training loop.
# NOTE(review): the fragment is cut off inside `task.set_callbacks([`.
print("#Parameters:", len(parameters))
updates = optz(ce, parameters)
K.initialize_all_variables()
# ====== function ====== #
print('Building training functions ...')
# the training function outputs [ce, optz.norm, cm] and applies `updates`
f_train = K.function(inputs, [ce, optz.norm, cm], updates=updates, training=True)
print('Building testing functions ...')
f_test = K.function(inputs, [ce, acc, cm], training=False)
print('Building predicting functions ...')
# prediction only takes the first input and returns outputs['prob']
f_pred = K.function(inputs[0], outputs['prob'], training=False)
# ===========================================================================
# Build trainer
# ===========================================================================
# ====== splitting the data ====== #
# split the sample indices (not the data itself) with train=0.8, fixed seed
idx = np.arange(len(X_train), dtype='int32')
idx_train, idx_valid = train_valid_test_split(idx, train=0.8,
                                              inc_test=False, seed=1234)
# the validation arrays must be extracted first, because X_train and y_train
# are overwritten by their training subsets right after
X_valid = X_train[idx_valid]
y_valid = y_train[idx_valid]
X_train = X_train[idx_train]
y_train = y_train[idx_train]
print("#Train:", X_train.shape, y_train.shape)
print("#Valid:", X_valid.shape, y_valid.shape)
print("#Test:", X_test.shape, y_test.shape)
# ====== training ====== #
print('Start training ...')
task = training.MainLoop(batch_size=128, seed=1234, shuffle_level=2,
                         allow_rollback=True)
task.set_checkpoint(MODEL_PATH, model)
task.set_callbacks([
    training.NaNDetector(),
    training.EarlyStopGeneralizationLoss('valid', ce, threshold=5, patience=3)

コード例 #2

0

ファイルを表示

def prepare_data(feat, label, utt_length=0.4, for_ivec=False):
    """Load the acoustic dataset and prepare data for i-vector or x-vector.

    Parameters
    ----------
    feat : str
        name of the feature stored in the dataset at ``PATH_ACOUSTIC``
    label : str
        name of the target label (case-insensitive), must be a key of
        ``_support_label``; the mapped value is the position of the label
        within the '_'-separated file name
    utt_length : float
        fraction, in the range (0, 1], of the longest utterance that is
        used as both frame length and step length when cutting segments
    for_ivec : bool
        if True, return right after the train/test partition with the
        i-vector outputs (no validation split, no feeders)

    Returns (i-vector)
    ------------------
    ds[feat]
        the feature data of the whole dataset
    train_files : list of (name, (start, end))
    y_train : array
        label of each file in `train_files`
    test_files : list of (name, (start, end))
    y_test : array
        label of each file in `test_files`
    labels : list of string
        list of labels for classification task

    Returns (x-vector)
    ------------------
    train : Feeder
        feeder for training data for iterating over pair of (X, y)
    valid : Feeder
        feeder for validating data for iterating over pair of (X, y)
    X_test_name : array of file names
        file names are appended with '.%d' for cut segment ID
    X_test_true : array of integer
        label of each sample
    X_test_data : array (float16)
        test data, same length as X_test_name
    labels : list of string
        list of labels for classification task

    Raises
    ------
    AssertionError
        if `label` is not supported, `utt_length` is outside (0, 1] or
        `feat` is not found in the dataset
    RuntimeError
        if ``PATH_ACOUSTIC`` does not exist

    Example
    -------
    (train, valid,
     X_test_name, X_test_true, X_test_data,
     labels) = prepare_data(feat=FEAT, label='gender')

    """
    label = str(label).lower()
    assert label in _support_label, "No support for label: %s" % label
    assert 0 < utt_length <= 1.
    # ====== load dataset ====== #
    if not os.path.exists(PATH_ACOUSTIC):
        raise RuntimeError(
            "Cannot find extracted acoustic features at path: '%s',"
            "run the code speech_features_extraction.py!" % PATH_ACOUSTIC)
    ds = F.Dataset(PATH_ACOUSTIC, read_only=True)
    assert feat in ds, "Cannot find feature with name: %s" % feat
    indices = list(ds['indices'].items())
    K.get_rng().shuffle(indices)

    # ====== helper ====== #
    def is_train(x):
        # the first '_'-separated field of the file name marks the partition
        return x.split('_')[0] == 'train'

    def extract_label(x):
        return x.split('_')[_support_label[label]]

    print("Task:", ctext(label, 'cyan'))
    fn_label, labels = unique_labels([i[0] for i in indices],
                                     key_func=extract_label,
                                     return_labels=True)
    print("Labels:", ctext(labels, 'cyan'))
    # ====== training and test data ====== #
    train_files = []  # (name, (start, end)) ...
    test_files = []
    for name, (start, end) in indices:
        if is_train(name):
            train_files.append((name, (start, end)))
        else:
            test_files.append((name, (start, end)))
    # name for each dataset, useful for later
    print("#Train:", ctext(len(train_files), 'cyan'))
    print("#Test:", ctext(len(test_files), 'cyan'))
    # ====== for i-vectors ====== #
    y_train = np.array([fn_label(i[0]) for i in train_files])
    y_test = np.array([fn_label(i[0]) for i in test_files])
    if for_ivec:
        return ds[feat], train_files, y_train, test_files, y_test, labels
    # ====== length ====== #
    # the segment size is a fraction of the longest utterance (train + test)
    max_length = max((end - start) for _, (start, end) in indices)
    frame_length = int(max_length * utt_length)
    step_length = frame_length  # no overlap between consecutive segments
    print("Max length  :", ctext(max_length, 'yellow'))
    print("Frame length:", ctext(frame_length, 'yellow'))
    print("Step length :", ctext(step_length, 'yellow'))
    # ====== split dataset ====== #
    # split by speaker ID (5th '_'-separated field of the file name)
    train_files, valid_files = train_valid_test_split(
        x=train_files,
        train=0.8,
        cluster_func=None,
        idfunc=lambda x: x[0].split('_')[4],
        inc_test=False)
    print("#File train:", ctext(len(train_files), 'cyan'))
    print("#File valid:", ctext(len(valid_files), 'cyan'))
    print("#File test :", ctext(len(test_files), 'cyan'))

    # the same recipes are shared by all three feeders
    recipes = [
        F.recipes.Sequencing(frame_length=frame_length,
                             step_length=step_length,
                             end='pad',
                             pad_mode='post',
                             pad_value=0),
        F.recipes.Name2Label(converter_func=fn_label),
        F.recipes.LabelOneHot(nb_classes=len(labels), data_idx=-1)
    ]
    feeder_train = F.Feeder(F.IndexedData(ds[feat], indices=train_files),
                            ncpu=6,
                            batch_mode='batch')
    feeder_valid = F.Feeder(F.IndexedData(ds[feat], indices=valid_files),
                            ncpu=4,
                            batch_mode='batch')
    feeder_test = F.Feeder(F.IndexedData(ds[feat], indices=test_files),
                           ncpu=4,
                           batch_mode='file')
    feeder_train.set_recipes(recipes)
    feeder_valid.set_recipes(recipes)
    feeder_test.set_recipes(recipes)
    print(feeder_train)

    # ====== process X_test, y_test in advance for faster evaluation ====== #
    # the arguments are not used in the body, they are only given to the
    # `cache_disk` decorator
    # NOTE(review): presumably they serve as the cache key - confirm
    @cache_disk
    def _extract_test_data(feat, label, utt_length):
        prog = Progbar(target=len(feeder_test),
                       print_summary=True,
                       name="Preprocessing test set")
        X_test = defaultdict(list)
        for name, idx, X, y in feeder_test:
            # validate everything as expected
            assert fn_label(name) == np.argmax(y), name  # label is right
            # save to list
            X_test[name].append((idx, X))
            prog.add(X.shape[0])
        # ====== create 1 array for data and dictionary for indices ====== #
        X_test_name = []
        X_test_data = []
        for name, X in X_test.items():
            # restore the order of the chunks of one file before merging
            X = np.concatenate([x[1] for x in sorted(X, key=lambda i: i[0])],
                               axis=0).astype('float16')
            X_test_name += [name + '.%d' % i for i in range(len(X))]
            X_test_data.append(X)
        X_test_name = np.array(X_test_name)
        X_test_data = np.concatenate(X_test_data, axis=0)
        return X_test_name, X_test_data

    # NOTE: X_test_data is returned as float16 (as cast above), callers that
    # need float32 have to convert it themselves
    X_test_name, X_test_data = _extract_test_data(feat, label, utt_length)
    # only strip the trailing '.<segment ID>' so that a file name which
    # itself contains '.' is still mapped to the right label
    X_test_true = np.array(
        [fn_label(i.rsplit('.', 1)[0]) for i in X_test_name])
    return (feeder_train, feeder_valid,
            X_test_name, X_test_true, X_test_data, labels)

コード例 #3

0

ファイルを表示

ファイル: utils.py プロジェクト: trungnt13/odin-ai

def prepare_dnn_data(recipe, feat, utt_length, seed=87654321):
    """Create train/valid feeders and a disk-cached, segmented test set.

    Parameters
    ----------
    recipe : path component joined onto PATH_ACOUSTIC_FEAT to find the dataset
    feat : name of the feature read from the dataset
    utt_length : utterance length, divided by FRAME_SHIFT to get the number
      of frames of one segment
    seed : seed given to `train_valid_test_split`

    Return
    ------
    train_feeder : Feeder for training
    valid_feeder : Feeder for validating
    test_ids : dictionary, test file name -> (start, end) position in test_dat
    test_dat : Data array (read-only memory-mapped, segmented test data)
    all_speakers : sorted list of all speaker in training set
    """
    # ====== load dataset ====== #
    n_frames = int(utt_length / FRAME_SHIFT)
    dataset = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe),
                        read_only=True)
    features = dataset[feat]
    # ====== partition the files ====== #
    # every file listed in TRAIN_DATA is for training, the rest is for test
    train_map = {}
    for name in TRAIN_DATA.keys():
        train_map[name] = dataset['indices'][name]
    test_map = {}
    for name, span in dataset['indices'].items():
        if name not in TRAIN_DATA:
            test_map[name] = span
    train_items, valid_items = train_valid_test_split(
        x=list(train_map.items()), train=0.9, inc_test=False, seed=seed)
    all_speakers = sorted(set(TRAIN_DATA.values()))
    # speakers are integer IDs, so the number of classes is the biggest ID + 1
    n_speakers = max(all_speakers) + 1
    print("#Train files:", ctext(len(train_items), 'cyan'))
    print("#Valid files:", ctext(len(valid_items), 'cyan'))
    print("#Test files:", ctext(len(test_map), 'cyan'))
    print("#Speakers:", ctext(n_speakers, 'cyan'))
    # ====== feeders ====== #
    recipes = [
        F.recipes.Sequencing(frame_length=n_frames,
                             step_length=n_frames,
                             end='pad',
                             pad_value=0,
                             pad_mode='post',
                             data_idx=0),
        F.recipes.Name2Label(lambda name: TRAIN_DATA[name], ref_idx=0),
        F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
    ]
    train_feeder = F.Feeder(
        data_desc=F.IndexedData(data=features, indices=train_items),
        batch_mode='batch', ncpu=7, buffer_size=12)
    valid_feeder = F.Feeder(
        data_desc=F.IndexedData(data=features, indices=valid_items),
        batch_mode='batch', ncpu=2, buffer_size=4)
    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)
    # ====== cache the test data ====== #
    cache_dat = os.path.join(PATH_EXP,
                             'test_%s_%d.dat' % (feat, int(utt_length)))
    cache_ids = os.path.join(PATH_EXP,
                             'test_%s_%d.ids' % (feat, int(utt_length)))
    # the cache is only valid if both files exist and the number of cached
    # files matches the current test set, otherwise remove what is left
    if os.path.exists(cache_ids):
        with open(cache_ids, 'rb') as fin:
            cached_ids = pickle.load(fin)
        if len(cached_ids) != len(test_map):
            os.remove(cache_ids)
            if os.path.exists(cache_dat):
                os.remove(cache_dat)
    elif os.path.exists(cache_dat):
        os.remove(cache_dat)
    # (re-)create the cache
    if not os.path.exists(cache_dat):
        writer = F.MmapData(cache_dat,
                            dtype='float16',
                            shape=(0, n_frames, features.shape[1]))
        new_ids = {}
        prog = Progbar(target=len(test_map))
        offset = 0
        for name, (start, end) in test_map.items():
            segments = segment_axis(features[start:end],
                                    axis=0,
                                    frame_length=n_frames,
                                    step_length=n_frames,
                                    end='pad',
                                    pad_value=0,
                                    pad_mode='post')
            writer.append(segments)
            # position of the segments of this file within the cache
            n_segments = len(segments)
            new_ids[name] = (offset, offset + n_segments)
            offset += n_segments
            prog.add(1)
        writer.flush()
        writer.close()
        with open(cache_ids, 'wb') as fout:
            pickle.dump(new_ids, fout)
    # ====== re-load ====== #
    test_dat = F.MmapData(cache_dat, read_only=True)
    with open(cache_ids, 'rb') as fin:
        test_ids = pickle.load(fin)
    # ====== save some sample ====== #
    sample_path = os.path.join(PATH_EXP,
                               'test_%s_%d.pdf' % (feat, int(utt_length)))
    V.plot_figure(nrow=9, ncol=6)
    ordered_files = sorted(test_ids.items(), key=lambda item: item[0])
    sampled_files = sampling_iter(it=ordered_files, k=12, seed=87654321)
    for row, (name, (start, end)) in enumerate(sampled_files, 1):
        segments = test_dat[start:end][:].astype('float32')
        # plot one randomly picked segment of the file
        picked = np.random.randint(0, len(segments))
        ax = V.plot_spectrogram(segments[picked].T,
                                ax=(12, 1, row),
                                title='')
        ax.set_title(name)
    V.plot_save(sample_path)
    return (train_feeder, valid_feeder, test_ids, test_dat, all_speakers)

コード例 #4

0

ファイルを表示

ファイル: single_digit_e2e.py プロジェクト: professorlust/odin-ai

# Script fragment: build the plot legends, then split the training files into
# train/valid sets by speaker and report the content of each set.
# one legend entry for each (digit color, gender marker) pair
legends = OrderedDict()
for g, m in zip(genders, gender_markers):
    for d, c in zip(digits, digit_colors):
        legends[(c, m)] = str(g) + '-' + str(d)
# ===========================================================================
# Split dataset
# ===========================================================================
# ID functions, each takes an item whose first element is the file name.
# Judging from the report labels below, the '_'-separated fields of the name
# are: [1] gender, [2] age, [3] dialect, [4] speaker, [-1] digit.
split_spkID = lambda x: x[0].split('_')[4]
split_dialID_spkID = lambda x: x[0].split('_')[3] + x[0].split('_')[4]
split_genID_spkID = lambda x: x[0].split('_')[1] + x[0].split('_')[4]
split_genID = lambda x: x[0].split('_')[1]
split_ageID = lambda x: x[0].split('_')[2]
# stratified sampling for each digit, split based on speaker ID
# NOTE(review): 10e8 is a float literal (1e9), confirm `randint` accepts a
# float upper bound with the installed numpy version
train, valid = train_valid_test_split(x=train,
                                      train=0.6,
                                      inc_test=False,
                                      idfunc=split_spkID,
                                      seed=K.get_rng().randint(0, 10e8))
# make sure both train and valid set have all the numbers
assert set(i[0].split('_')[-1] for i in train) == set(digits)
assert set(i[0].split('_')[-1] for i in valid) == set(digits)
# ====== report ====== #
# sorted unique values of the idx-th '_'-separated field of the file names
report_info = lambda idx, flist: sorted(
    list(set(i[0].split('_')[idx] for i in flist)))
print(ctext("#File train:", 'yellow'), len(train), train[:2])
print(' * Genders:', ctext(report_info(1, train), 'cyan'))
print(' * Age:', ctext(report_info(2, train), 'cyan'))
print(' * Dialects:', ctext(report_info(3, train), 'cyan'))
print(' * Speakers:', ctext(report_info(4, train), 'cyan'))
print(ctext("#File valid:", 'yellow'), len(valid), valid[:2])
print(' * Genders:', ctext(report_info(1, valid), 'cyan'))

コード例 #5

0

ファイルを表示

ファイル: baselines.py プロジェクト: trungnt13/sisua

# Script fragment: extract the protein targets from the dataset metadata and
# split all arrays into train/valid/test subsets with shared indices.
# the protein columns are the metadata columns 12..21, the prefix
# "protein.count." is removed from their names
y_prot_names = [
    i.replace("protein.count.", "") for i in ds['meta_cols'][12:22]
]
y_prot = ds['metadata'][:, 12:22]
# ====== sanity check: names must match the expected orders ====== #
assert y_mRNA_names == mRNA_ORDER and y_prot_names == PROTEIN_ORDER
# ====== load thresholded protein values ====== #
# NOTE(review): both values fall back to None when missing from the dataset,
# but they are indexed unconditionally below (y_bin[train], y_prob[train]),
# which raises TypeError for None - confirm the dataset always provides them
y_bin = ds['y_bin'] if 'y_bin' in ds else None
y_prob = ds['y_prob'] if 'y_prob' in ds else None
# ===========================================================================
# split the data
# ===========================================================================
num_samples = data.shape[0]
# the split is performed on shuffled sample indices, so the same indices can
# be applied to all of the arrays
ids = get_rng().permutation(num_samples)
# NOTE(review): 10e8 is a float literal (1e9), confirm `randint` accepts a
# float upper bound with the installed numpy version
train, valid, test = train_valid_test_split(ids,
                                            train=TRAIN_PERCENTAGE,
                                            inc_test=True,
                                            seed=get_rng().randint(0, 10e8))
X_train, y_mRNA_train, y_prot_train, y_bin_train, y_prob_train = data[
    train], y_mRNA[train], y_prot[train], y_bin[train], y_prob[train]
X_valid, y_mRNA_valid, y_prot_valid, y_bin_valid, y_prob_valid = data[
    valid], y_mRNA[valid], y_prot[valid], y_bin[valid], y_prob[valid]
X_test, y_mRNA_test, y_prot_test, y_bin_test, y_prob_test = data[test], y_mRNA[
    test], y_prot[test], y_bin[test], y_prob[test]
print(ctext("Train:", 'cyan'), X_train.shape, y_mRNA_train.shape,
      y_prot_train.shape, y_bin_train.shape, y_prob_train.shape)
print(ctext("Valid:", 'cyan'), X_valid.shape, y_mRNA_valid.shape,
      y_prot_valid.shape, y_bin_valid.shape, y_prob_valid.shape)
print(ctext("Test:", 'cyan'), X_test.shape, y_mRNA_test.shape,
      y_prot_test.shape, y_bin_test.shape, y_prob_test.shape)
# ===========================================================================
# Evaluation

コード例 #6

0

ファイルを表示

# Script fragment: split the utterance indices into train/valid/test sets and
# create one feeder for each of them.
# NOTE(review): the fragment is cut off inside `F.recipes.Sequencing(`.
print(ds)
nb_classes = 10  # 10 digits (0-9)

# ===========================================================================
# Create feeder
# ===========================================================================
# flatten the indices to (name, start, end) tuples
indices = [(name, start, end) for name, (start, end) in ds['indices']]
longest_utterances = max(
    int(end) - int(start) - 1 for i, start, end in indices)
# longest single voiced segment over all the files
longest_vad = max(end - start for name, vad in ds['vadids']
                  for (start, end) in vad)
print("Longest Utterance:", longest_utterances)
print("Longest Vad:", longest_vad)

# shuffle in place before splitting (the split itself is called without seed)
np.random.shuffle(indices)
train, valid, test = train_valid_test_split(indices, train=0.6, inc_test=True)
# int(i[0][0]): the first character of the file name is read as a digit
# NOTE(review): presumably this is the spoken digit (the label) - confirm
print('Nb train:', len(train), freqcount([int(i[0][0]) for i in train]))
print('Nb valid:', len(valid), freqcount([int(i[0][0]) for i in valid]))
print('Nb test:', len(test), freqcount([int(i[0][0]) for i in test]))

train_feeder = F.Feeder(ds['mspec'], train, ncpu=1)
test_feeder = F.Feeder(ds['mspec'], test, ncpu=2)
valid_feeder = F.Feeder(ds['mspec'], valid, ncpu=2)

recipes = [
    # same conversion as in the report above: first character -> integer
    F.recipes.Name2Trans(converter_func=lambda x: int(x[0])),
    F.recipes.Normalization(mean=ds['mspec_mean'],
                            std=ds['mspec_std'],
                            local_normalize=False),
    F.recipes.Sequencing(frame_length=longest_utterances,
                         hop_length=1,

コード例 #7

0

ファイルを表示

ファイル: utils.py プロジェクト: imito/odin

def prepare_dnn_data(recipe, feat, utt_length, seed=52181208):
  """Create train/valid feeders and a disk-cached, segmented test set.

  Parameters
  ----------
  recipe : path component joined onto PATH_ACOUSTIC_FEAT to find the dataset
  feat : name of the feature read from the dataset
  utt_length : utterance length, divided by FRAME_SHIFT to get the number
    of frames of one segment
  seed : seed given to `train_valid_test_split`

  Return
  ------
  train_feeder : Feeder for training
  valid_feeder : Feeder for validating
  test_ids : dictionary, test file name -> (start, end) position in test_dat
  test_dat : Data array (read-only memory-mapped, segmented test data)
  all_speakers : sorted list of all speaker in training set
  """
  # ====== load dataset ====== #
  n_frames = int(utt_length / FRAME_SHIFT)
  dataset = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe),
                      read_only=True)
  features = dataset[feat]
  # ====== partition the files ====== #
  # every file listed in TRAIN_DATA is for training, the rest is for test
  train_map = {}
  for name in TRAIN_DATA.keys():
    train_map[name] = dataset['indices'][name]
  test_map = {}
  for name, span in dataset['indices'].items():
    if name not in TRAIN_DATA:
      test_map[name] = span
  train_items, valid_items = train_valid_test_split(
      x=list(train_map.items()), train=0.9, inc_test=False, seed=seed)
  all_speakers = sorted(set(TRAIN_DATA.values()))
  # speakers are integer IDs, so the number of classes is the biggest ID + 1
  n_speakers = max(all_speakers) + 1
  print("#Train files:", ctext(len(train_items), 'cyan'))
  print("#Valid files:", ctext(len(valid_items), 'cyan'))
  print("#Test files:", ctext(len(test_map), 'cyan'))
  print("#Speakers:", ctext(n_speakers, 'cyan'))
  # ====== feeders ====== #
  recipes = [
      F.recipes.Sequencing(frame_length=n_frames,
                           step_length=n_frames,
                           end='pad',
                           pad_value=0,
                           pad_mode='post',
                           data_idx=0),
      F.recipes.Name2Label(lambda name:TRAIN_DATA[name], ref_idx=0),
      F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
  ]
  train_feeder = F.Feeder(
      data_desc=F.IndexedData(data=features, indices=train_items),
      batch_mode='batch', ncpu=7, buffer_size=12)
  valid_feeder = F.Feeder(
      data_desc=F.IndexedData(data=features, indices=valid_items),
      batch_mode='batch', ncpu=2, buffer_size=4)
  train_feeder.set_recipes(recipes)
  valid_feeder.set_recipes(recipes)
  print(train_feeder)
  # ====== cache the test data ====== #
  cache_dat = os.path.join(PATH_EXP,
                           'test_%s_%d.dat' % (feat, int(utt_length)))
  cache_ids = os.path.join(PATH_EXP,
                           'test_%s_%d.ids' % (feat, int(utt_length)))
  # the cache is only valid if both files exist and the number of cached
  # files matches the current test set, otherwise remove what is left
  if os.path.exists(cache_ids):
    with open(cache_ids, 'rb') as fin:
      cached_ids = pickle.load(fin)
    if len(cached_ids) != len(test_map):
      os.remove(cache_ids)
      if os.path.exists(cache_dat):
        os.remove(cache_dat)
  elif os.path.exists(cache_dat):
    os.remove(cache_dat)
  # (re-)create the cache
  if not os.path.exists(cache_dat):
    writer = F.MmapData(cache_dat, dtype='float16',
                        shape=(0, n_frames, features.shape[1]))
    new_ids = {}
    prog = Progbar(target=len(test_map))
    offset = 0
    for name, (start, end) in test_map.items():
      segments = segment_axis(features[start:end], axis=0,
                              frame_length=n_frames, step_length=n_frames,
                              end='pad', pad_value=0, pad_mode='post')
      writer.append(segments)
      # position of the segments of this file within the cache
      n_segments = len(segments)
      new_ids[name] = (offset, offset + n_segments)
      offset += n_segments
      prog.add(1)
    writer.flush()
    writer.close()
    with open(cache_ids, 'wb') as fout:
      pickle.dump(new_ids, fout)
  # ====== re-load ====== #
  test_dat = F.MmapData(cache_dat, read_only=True)
  with open(cache_ids, 'rb') as fin:
    test_ids = pickle.load(fin)
  # ====== save some sample ====== #
  sample_path = os.path.join(PATH_EXP,
                             'test_%s_%d.pdf' % (feat, int(utt_length)))
  V.plot_figure(nrow=9, ncol=6)
  ordered_files = sorted(test_ids.items(), key=lambda item: item[0])
  sampled_files = sampling_iter(it=ordered_files, k=12, seed=52181208)
  for row, (name, (start, end)) in enumerate(sampled_files, 1):
    segments = test_dat[start:end][:].astype('float32')
    # plot one randomly picked segment of the file
    picked = np.random.randint(0, len(segments))
    ax = V.plot_spectrogram(segments[picked].T,
                            ax=(12, 1, row), title='')
    ax.set_title(name)
  V.plot_save(sample_path)
  return (train_feeder, valid_feeder, test_ids, test_dat, all_speakers)

コード例 #8

0

ファイルを表示

ファイル: utils.py プロジェクト: imito/odin

def prepare_data(feat, label, utt_length=0.4, for_ivec=False):
  """Load the acoustic dataset and prepare data for i-vector or x-vector.

  Parameters
  ----------
  feat : str
    name of the feature stored in the dataset at ``PATH_ACOUSTIC``
  label : str
    name of the target label (case-insensitive), must be a key of
    ``_support_label``; the mapped value is the position of the label
    within the '_'-separated file name
  utt_length : float
    fraction, in the range (0, 1], of the longest utterance that is
    used as both frame length and step length when cutting segments
  for_ivec : bool
    if True, return right after the train/test partition with the
    i-vector outputs (no validation split, no feeders)

  Returns (i-vector)
  ------------------
  ds[feat]
    the feature data of the whole dataset
  train_files : list of (name, (start, end))
  y_train : array
    label of each file in `train_files`
  test_files : list of (name, (start, end))
  y_test : array
    label of each file in `test_files`
  labels : list of string
    list of labels for classification task

  Returns (x-vector)
  ------------------
  train : Feeder
    feeder for training data for iterating over pair of (X, y)
  valid : Feeder
    feeder for validating data for iterating over pair of (X, y)
  X_test_name : array of file names
    file names are appended with '.%d' for cut segment ID
  X_test_true : array of integer
    label of each sample
  X_test_data : array (float16)
    test data, same length as X_test_name
  labels : list of string
    list of labels for classification task

  Raises
  ------
  AssertionError
    if `label` is not supported, `utt_length` is outside (0, 1] or
    `feat` is not found in the dataset
  RuntimeError
    if ``PATH_ACOUSTIC`` does not exist

  Example
  -------
  (train, valid,
   X_test_name, X_test_true, X_test_data,
   labels) = prepare_data(feat=FEAT, label='gender')

  """
  label = str(label).lower()
  assert label in _support_label, "No support for label: %s" % label
  assert 0 < utt_length <= 1.
  # ====== load dataset ====== #
  if not os.path.exists(PATH_ACOUSTIC):
    raise RuntimeError("Cannot find extracted acoustic features at path: '%s',"
                       "run the code speech_features_extraction.py!" % PATH_ACOUSTIC)
  ds = F.Dataset(PATH_ACOUSTIC, read_only=True)
  assert feat in ds, "Cannot find feature with name: %s" % feat
  indices = list(ds['indices'].items())
  K.get_rng().shuffle(indices)

  # ====== helper ====== #
  def is_train(x):
    # the first '_'-separated field of the file name marks the partition
    return x.split('_')[0] == 'train'

  def extract_label(x):
    return x.split('_')[_support_label[label]]

  print("Task:", ctext(label, 'cyan'))
  fn_label, labels = unique_labels([i[0] for i in indices],
                                   key_func=extract_label,
                                   return_labels=True)
  print("Labels:", ctext(labels, 'cyan'))
  # ====== training and test data ====== #
  train_files = [] # (name, (start, end)) ...
  test_files = []
  for name, (start, end) in indices:
    if is_train(name):
      train_files.append((name, (start, end)))
    else:
      test_files.append((name, (start, end)))
  # name for each dataset, useful for later
  print("#Train:", ctext(len(train_files), 'cyan'))
  print("#Test:", ctext(len(test_files), 'cyan'))
  # ====== for i-vectors ====== #
  y_train = np.array([fn_label(i[0]) for i in train_files])
  y_test = np.array([fn_label(i[0]) for i in test_files])
  if for_ivec:
    return ds[feat], train_files, y_train, test_files, y_test, labels
  # ====== length ====== #
  # the segment size is a fraction of the longest utterance (train + test)
  max_length = max((end - start) for _, (start, end) in indices)
  frame_length = int(max_length * utt_length)
  step_length = frame_length # no overlap between consecutive segments
  print("Max length  :", ctext(max_length, 'yellow'))
  print("Frame length:", ctext(frame_length, 'yellow'))
  print("Step length :", ctext(step_length, 'yellow'))
  # ====== split dataset ====== #
  # split by speaker ID (5th '_'-separated field of the file name)
  train_files, valid_files = train_valid_test_split(
      x=train_files, train=0.8,
      cluster_func=None,
      idfunc=lambda x: x[0].split('_')[4],
      inc_test=False)
  print("#File train:", ctext(len(train_files), 'cyan'))
  print("#File valid:", ctext(len(valid_files), 'cyan'))
  print("#File test :", ctext(len(test_files), 'cyan'))

  # the same recipes are shared by all three feeders
  recipes = [
      F.recipes.Sequencing(frame_length=frame_length, step_length=step_length,
                           end='pad', pad_mode='post', pad_value=0),
      F.recipes.Name2Label(converter_func=fn_label),
      F.recipes.LabelOneHot(nb_classes=len(labels), data_idx=-1)
  ]
  feeder_train = F.Feeder(F.IndexedData(ds[feat], indices=train_files),
                          ncpu=6, batch_mode='batch')
  feeder_valid = F.Feeder(F.IndexedData(ds[feat], indices=valid_files),
                          ncpu=4, batch_mode='batch')
  feeder_test = F.Feeder(F.IndexedData(ds[feat], indices=test_files),
                         ncpu=4, batch_mode='file')
  feeder_train.set_recipes(recipes)
  feeder_valid.set_recipes(recipes)
  feeder_test.set_recipes(recipes)
  print(feeder_train)

  # ====== process X_test, y_test in advance for faster evaluation ====== #
  # the arguments are not used in the body, they are only given to the
  # `cache_disk` decorator
  # NOTE(review): presumably they serve as the cache key - confirm
  @cache_disk
  def _extract_test_data(feat, label, utt_length):
    prog = Progbar(target=len(feeder_test),
                   print_summary=True, name="Preprocessing test set")
    X_test = defaultdict(list)
    for name, idx, X, y in feeder_test:
      # validate everything as expected
      assert fn_label(name) == np.argmax(y), name # label is right
      # save to list
      X_test[name].append((idx, X))
      prog.add(X.shape[0])
    # ====== create 1 array for data and dictionary for indices ====== #
    X_test_name = []
    X_test_data = []
    for name, X in X_test.items():
      # restore the order of the chunks of one file before merging
      X = np.concatenate([x[1] for x in sorted(X, key=lambda i: i[0])],
                         axis=0).astype('float16')
      X_test_name += [name + '.%d' % i for i in range(len(X))]
      X_test_data.append(X)
    X_test_name = np.array(X_test_name)
    X_test_data = np.concatenate(X_test_data, axis=0)
    return X_test_name, X_test_data

  # NOTE: X_test_data is returned as float16 (as cast above), callers that
  # need float32 have to convert it themselves
  X_test_name, X_test_data = _extract_test_data(feat, label, utt_length)
  # only strip the trailing '.<segment ID>' so that a file name which
  # itself contains '.' is still mapped to the right label
  X_test_true = np.array([fn_label(i.rsplit('.', 1)[0])
                          for i in X_test_name])
  return (feeder_train, feeder_valid,
          X_test_name, X_test_true, X_test_data, labels)