Code example #1
File: rnn.py  Project: mtingzhi/returnn
def analyze_data(config):  # pylint: disable=redefined-outer-name
    """
  :param Config config:
  """
    dss = config.value('analyze_dataset', 'train')
    ds = {"train": train_data, "dev": dev_data, "eval": eval_data}[dss]
    epoch = config.int('epoch', 1)
    print("Analyze dataset", dss, "epoch", epoch, file=log.v1)
    ds.init_seq_order(epoch=epoch)
    stat_prefix = config.value('statistics_save_prefix', 'statistics')
    dtype = config.value('statistics_dtype', 'float64')
    target = config.value('target', 'classes')
    data_key = config.value('data_key', 'data')
    assert ds.is_data_sparse(target), "need for prior calculation"
    assert not ds.is_data_sparse(data_key), "needed for mean/var estimation"
    from Util import inplace_increment, progress_bar_with_time, NumbersDict

    priors = numpy.zeros((ds.get_data_dim(target), ), dtype=dtype)
    mean = numpy.zeros((ds.get_data_dim(data_key), ), dtype=dtype)
    mean_sq = numpy.zeros((ds.get_data_dim(data_key), ), dtype=dtype)
    total_targets_len = 0
    total_data_len = 0

    # Note: This is not stable! See :class:`Util.Stats` for a better alternative.
    seq_idx = 0
    while ds.is_less_than_num_seqs(seq_idx):
        progress_bar_with_time(ds.get_complete_frac(seq_idx))
        ds.load_seqs(seq_idx, seq_idx + 1)
        targets = ds.get_data(seq_idx, target)
        inplace_increment(priors, targets, 1)
        total_targets_len += targets.shape[0]
        data = ds.get_data(seq_idx, data_key)
        new_total_data_len = total_data_len + data.shape[0]
        f = float(total_data_len) / new_total_data_len
        mean = mean * f + numpy.sum(data, axis=0) * (1.0 - f)
        mean_sq = mean_sq * f + numpy.sum(data * data, axis=0) * (1.0 - f)
        total_data_len = new_total_data_len
        seq_idx += 1
    log_priors = numpy.log(priors)
    log_priors -= numpy.log(NumbersDict(ds.get_num_timesteps())[target])
    std_dev = numpy.sqrt(mean_sq - mean * mean)
    print("Finished. %i total target frames, %i total data frames" %
          (total_targets_len, total_data_len),
          file=log.v1)
    priors_fn = stat_prefix + ".log_priors.txt"
    mean_fn = stat_prefix + ".mean.txt"
    std_dev_fn = stat_prefix + ".std_dev.txt"
    print("Dump priors to", priors_fn, file=log.v1)
    numpy.savetxt(priors_fn, log_priors)
    print("Dump mean to", mean_fn, file=log.v1)
    numpy.savetxt(mean_fn, mean)
    print("Dump std dev to", std_dev_fn, file=log.v1)
    numpy.savetxt(std_dev_fn, std_dev)
    print("Done.", file=log.v1)
Code example #2
File: rnn.py  Project: rwth-i6/returnn
def analyze_data(config):  # pylint: disable=redefined-outer-name
  """
  :param Config config:
  """
  dss = config.value('analyze_dataset', 'train')
  ds = {"train": train_data, "dev": dev_data, "eval": eval_data}[dss]
  epoch = config.int('epoch', 1)
  print("Analyze dataset", dss, "epoch", epoch, file=log.v1)
  ds.init_seq_order(epoch=epoch)
  stat_prefix = config.value('statistics_save_prefix', 'statistics')
  dtype = config.value('statistics_dtype', 'float64')
  target = config.value('target', 'classes')
  data_key = config.value('data_key', 'data')
  assert ds.is_data_sparse(target), "need for prior calculation"
  assert not ds.is_data_sparse(data_key), "needed for mean/var estimation"
  from Util import inplace_increment, progress_bar_with_time, NumbersDict

  priors = numpy.zeros((ds.get_data_dim(target),), dtype=dtype)
  mean = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  mean_sq = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  total_targets_len = 0
  total_data_len = 0

  # Note: This is not stable! See :class:`Util.Stats` for a better alternative.
  seq_idx = 0
  while ds.is_less_than_num_seqs(seq_idx):
    progress_bar_with_time(ds.get_complete_frac(seq_idx))
    ds.load_seqs(seq_idx, seq_idx + 1)
    targets = ds.get_data(seq_idx, target)
    inplace_increment(priors, targets, 1)
    total_targets_len += targets.shape[0]
    data = ds.get_data(seq_idx, data_key)
    new_total_data_len = total_data_len + data.shape[0]
    f = float(total_data_len) / new_total_data_len
    mean = mean * f + numpy.sum(data, axis=0) * (1.0 - f)
    mean_sq = mean_sq * f + numpy.sum(data * data, axis=0) * (1.0 - f)
    total_data_len = new_total_data_len
    seq_idx += 1
  log_priors = numpy.log(priors)
  log_priors -= numpy.log(NumbersDict(ds.get_num_timesteps())[target])
  std_dev = numpy.sqrt(mean_sq - mean * mean)
  print("Finished. %i total target frames, %i total data frames" % (total_targets_len, total_data_len), file=log.v1)
  priors_fn = stat_prefix + ".log_priors.txt"
  mean_fn = stat_prefix + ".mean.txt"
  std_dev_fn = stat_prefix + ".std_dev.txt"
  print("Dump priors to", priors_fn, file=log.v1)
  numpy.savetxt(priors_fn, log_priors)
  print("Dump mean to", mean_fn, file=log.v1)
  numpy.savetxt(mean_fn, mean)
  print("Dump std dev to", std_dev_fn, file=log.v1)
  numpy.savetxt(std_dev_fn, std_dev)
  print("Done.", file=log.v1)
Code example #3
def analyze_data(config):
    dss = config.value('analyze_dataset', 'train')
    ds = {"train": train_data, "dev": dev_data, "eval": eval_data}[dss]
    epoch = config.int('epoch', 1)
    print >> log.v1, "Analyze dataset", dss, "epoch", epoch
    ds.init_seq_order(epoch=epoch)
    stat_prefix = config.value('statistics_save_prefix', 'statistics')
    dtype = config.value('statistics_dtype', 'float64')
    target = config.value('target', 'classes')
    data_key = config.value('data_key', 'data')
    assert ds.is_data_sparse(target), "need for prior calculation"
    assert not ds.is_data_sparse(data_key), "needed for mean/var estimation"
    from Util import inplace_increment, progress_bar_with_time, NumbersDict

    priors = numpy.zeros((ds.get_data_dim(target), ), dtype=dtype)
    mean = numpy.zeros((ds.get_data_dim(data_key), ), dtype=dtype)
    mean_sq = numpy.zeros((ds.get_data_dim(data_key), ), dtype=dtype)
    total_targets_len = 0
    total_data_len = 0

    seq_idx = 0
    while ds.is_less_than_num_seqs(seq_idx):
        progress_bar_with_time(ds.get_complete_frac(seq_idx))
        ds.load_seqs(seq_idx, seq_idx + 1)
        targets = ds.get_data(seq_idx, target)
        inplace_increment(priors, targets, 1)
        total_targets_len += targets.shape[0]
        data = ds.get_data(seq_idx, data_key)
        new_total_data_len = total_data_len + data.shape[0]
        f = float(total_data_len) / new_total_data_len
        mean = mean * f + numpy.sum(data, axis=0) * (1.0 - f)
        mean_sq = mean_sq * f + numpy.sum(data * data, axis=0) * (1.0 - f)
        total_data_len = new_total_data_len
        seq_idx += 1
    log_priors = numpy.log(priors)
    log_priors -= numpy.log(NumbersDict(ds.get_num_timesteps())[target])
    var = numpy.sqrt(mean_sq - mean * mean)  # note: despite the name, this is the standard deviation
    print >> log.v1, "Finished. %i total target frames, %i total data frames" % (
        total_targets_len, total_data_len)
    priors_fn = stat_prefix + ".log_priors.txt"
    mean_fn = stat_prefix + ".mean.txt"
    var_fn = stat_prefix + ".var.txt"
    print >> log.v1, "Dump priors to", priors_fn
    numpy.savetxt(priors_fn, log_priors)
    print >> log.v1, "Dump mean to", mean_fn
    numpy.savetxt(mean_fn, mean)
    print >> log.v1, "Dump var to", var_fn
    numpy.savetxt(var_fn, var)
    print >> log.v1, "Done."
Code example #4
File: rnn.py  Project: atuxhe/returnn
def analyze_data(config):
  dss = config.value('analyze_dataset', 'train')
  ds = {"train": train_data, "dev": dev_data, "eval": eval_data}[dss]
  epoch = config.int('epoch', 1)
  print >> log.v1, "Analyze dataset", dss, "epoch", epoch
  ds.init_seq_order(epoch=epoch)
  stat_prefix = config.value('statistics_save_prefix', 'statistics')
  dtype = config.value('statistics_dtype', 'float64')
  target = config.value('target', 'classes')
  data_key = config.value('data_key', 'data')
  assert ds.is_data_sparse(target), "need for prior calculation"
  assert not ds.is_data_sparse(data_key), "needed for mean/var estimation"
  from Util import inplace_increment, progress_bar_with_time, NumbersDict

  priors = numpy.zeros((ds.get_data_dim(target),), dtype=dtype)
  mean = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  mean_sq = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  total_targets_len = 0
  total_data_len = 0

  seq_idx = 0
  while ds.is_less_than_num_seqs(seq_idx):
    progress_bar_with_time(ds.get_complete_frac(seq_idx))
    ds.load_seqs(seq_idx, seq_idx + 1)
    targets = ds.get_data(seq_idx, target)
    inplace_increment(priors, targets, 1)
    total_targets_len += targets.shape[0]
    data = ds.get_data(seq_idx, data_key)
    new_total_data_len = total_data_len + data.shape[0]
    f = float(total_data_len) / new_total_data_len
    mean = mean * f + numpy.sum(data, axis=0) * (1.0 - f)
    mean_sq = mean_sq * f + numpy.sum(data * data, axis=0) * (1.0 - f)
    total_data_len = new_total_data_len
    seq_idx += 1
  log_priors = numpy.log(priors)
  log_priors -= numpy.log(NumbersDict(ds.get_num_timesteps())[target])
  var = numpy.sqrt(mean_sq - mean * mean)  # note: despite the name, this is the standard deviation
  print >> log.v1, "Finished. %i total target frames, %i total data frames" % (total_targets_len, total_data_len)
  priors_fn = stat_prefix + ".log_priors.txt"
  mean_fn = stat_prefix + ".mean.txt"
  var_fn = stat_prefix + ".var.txt"
  print >> log.v1, "Dump priors to", priors_fn
  numpy.savetxt(priors_fn, log_priors)
  print >> log.v1, "Dump mean to", mean_fn
  numpy.savetxt(mean_fn, mean)
  print >> log.v1, "Dump var to", var_fn
  numpy.savetxt(var_fn, var)
  print >> log.v1, "Done."
Code example #5
File: hdf_dump.py  Project: julian90/returnn
def hdf_dump_from_dataset(dataset, hdf_dataset, parser_args):
    """
  :param Dataset dataset: could be any dataset implemented as child of Dataset
  :type hdf_dataset: h5py._hl.files.File
  :param parser_args: argparse object from main()
  :return:
  """
    print("Work on epoch: %i" % parser_args.epoch, file=log.v3)
    dataset.init_seq_order(parser_args.epoch)

    data_keys = sorted(dataset.get_data_keys())
    print("Data keys:", data_keys, file=log.v3)
    if "orth" in data_keys:
        data_keys.remove("orth")

    # We need to do one run through the dataset to collect some stats like total len.
    print("Collect stats, iterate through all data...", file=log.v3)
    seq_idx = parser_args.start_seq
    seq_idxs = []
    seq_tags = []
    seq_lens = []
    total_seq_len = NumbersDict(0)
    max_tag_len = 0
    dataset_num_seqs = try_run(lambda: dataset.num_seqs,
                               default=None)  # can be unknown
    if parser_args.end_seq != float("inf"):
        if dataset_num_seqs is not None:
            dataset_num_seqs = min(dataset_num_seqs, parser_args.end_seq)
        else:
            dataset_num_seqs = parser_args.end_seq
    if dataset_num_seqs is not None:
        dataset_num_seqs -= parser_args.start_seq
        assert dataset_num_seqs > 0
    while dataset.is_less_than_num_seqs(
            seq_idx) and seq_idx <= parser_args.end_seq:
        seq_idxs += [seq_idx]
        dataset.load_seqs(seq_idx, seq_idx + 1)
        seq_len = dataset.get_seq_length(seq_idx)
        seq_lens += [seq_len]
        tag = dataset.get_tag(seq_idx)
        seq_tags += [tag]
        max_tag_len = max(len(tag), max_tag_len)
        total_seq_len += seq_len
        if dataset_num_seqs is not None:
            progress_bar_with_time(
                float(seq_idx - parser_args.start_seq) / dataset_num_seqs)
        seq_idx += 1
    num_seqs = len(seq_idxs)

    assert num_seqs > 0
    shapes = {}
    for data_key in data_keys:
        assert data_key in total_seq_len.dict
        shape = [total_seq_len[data_key]]
        shape += dataset.get_data_shape(data_key)
        print("Total len of %r is %s, shape %r, dtype %s" %
              (data_key, human_size(
                  shape[0]), shape, dataset.get_data_dtype(data_key)),
              file=log.v3)
        shapes[data_key] = shape

    print("Set seq tags...", file=log.v3)
    hdf_dataset.create_dataset('seqTags',
                               shape=(num_seqs, ),
                               dtype="S%i" % (max_tag_len + 1))
    for i, tag in enumerate(seq_tags):
        hdf_dataset['seqTags'][i] = numpy.array(tag,
                                                dtype="S%i" %
                                                (max_tag_len + 1))
        progress_bar_with_time(float(i) / num_seqs)

    print("Set seq len info...", file=log.v3)
    hdf_dataset.create_dataset(HDFDataset.attr_seqLengths,
                               shape=(num_seqs, 2),
                               dtype="int32")
    for i, seq_len in enumerate(seq_lens):
        data_len = seq_len["data"]
        targets_len = seq_len["classes"]
        for data_key in dataset.get_target_list():
            if data_key == "orth":
                continue
            assert seq_len[
                data_key] == targets_len, "different lengths in multi-target not supported"
        if targets_len is None:
            targets_len = data_len
        hdf_dataset[HDFDataset.attr_seqLengths][i] = [data_len, targets_len]
        progress_bar_with_time(float(i) / num_seqs)

    print("Create arrays in HDF...", file=log.v3)
    hdf_dataset.create_group('targets/data')
    hdf_dataset.create_group('targets/size')
    hdf_dataset.create_group('targets/labels')
    for data_key in data_keys:
        if data_key == "data":
            hdf_dataset.create_dataset('inputs',
                                       shape=shapes[data_key],
                                       dtype=dataset.get_data_dtype(data_key))
        else:
            hdf_dataset['targets/data'].create_dataset(
                data_key,
                shape=shapes[data_key],
                dtype=dataset.get_data_dtype(data_key))
            hdf_dataset['targets/size'].attrs[data_key] = dataset.num_outputs[
                data_key]

        if data_key in dataset.labels:
            labels = dataset.labels[data_key]
            assert len(labels) == dataset.num_outputs[data_key][0]
        else:
            labels = [
                "%s-class-%i" % (data_key, i)
                for i in range(dataset.get_data_dim(data_key))
            ]
        print("Labels for %s:" % data_key, labels[:3], "...", file=log.v5)
        max_label_len = max(map(len, labels))
        if data_key != "data":
            hdf_dataset['targets/labels'].create_dataset(
                data_key, (len(labels), ), dtype="S%i" % (max_label_len + 1))
            for i, label in enumerate(labels):
                hdf_dataset['targets/labels'][data_key][i] = numpy.array(
                    label, dtype="S%i" % (max_label_len + 1))

    # Again iterate through dataset, and set the data
    print("Write data...", file=log.v3)
    dataset.init_seq_order(parser_args.epoch)
    offsets = NumbersDict(0)
    for seq_idx, tag in zip(seq_idxs, seq_tags):
        dataset.load_seqs(seq_idx, seq_idx + 1)
        tag_ = dataset.get_tag(seq_idx)
        assert tag == tag_  # Just a check for sanity. We expect the same order.
        seq_len = dataset.get_seq_length(seq_idx)
        for data_key in data_keys:
            if data_key == "data":
                hdf_data = hdf_dataset['inputs']
            else:
                hdf_data = hdf_dataset['targets/data'][data_key]
            data = dataset.get_data(seq_idx, data_key)
            hdf_data[offsets[data_key]:offsets[data_key] +
                     seq_len[data_key]] = data

        progress_bar_with_time(float(offsets["data"]) / total_seq_len["data"])

        offsets += seq_len

    assert offsets == total_seq_len  # Sanity check.

    # Set some old-format attribs. Not needed for newer CRNN versions.
    hdf_dataset.attrs[HDFDataset.attr_inputPattSize] = dataset.num_inputs
    hdf_dataset.attrs[HDFDataset.attr_numLabels] = dataset.num_outputs.get(
        "classes", (0, 0))[0]

    print("All done.", file=log.v3)
Code example #6
File: hdf_dump.py  Project: atuxhe/returnn
def hdf_dump_from_dataset(dataset, hdf_dataset, parser_args):
  """
  :param Dataset dataset: could be any dataset implemented as child of Dataset
  :type hdf_dataset: h5py._hl.files.File
  :param parser_args: argparse object from main()
  :return:
  """
  print >> log.v3, "Work on epoch: %i" % parser_args.epoch
  dataset.init_seq_order(parser_args.epoch)

  data_keys = sorted(dataset.get_data_keys())
  print >> log.v3, "Data keys:", data_keys

  # We need to do one run through the dataset to collect some stats like total len.
  print >> log.v3, "Collect stats, iterate through all data..."
  seq_idx = parser_args.start_seq
  seq_idxs = []
  seq_tags = []
  seq_lens = []
  total_seq_len = NumbersDict(0)
  max_tag_len = 0
  dataset_num_seqs = try_run(lambda: dataset.num_seqs, default=None)  # can be unknown
  if parser_args.end_seq != float("inf"):
    if dataset_num_seqs is not None:
      dataset_num_seqs = min(dataset_num_seqs, parser_args.end_seq)
    else:
      dataset_num_seqs = parser_args.end_seq
  if dataset_num_seqs is not None:
    dataset_num_seqs -= parser_args.start_seq
    assert dataset_num_seqs > 0
  while dataset.is_less_than_num_seqs(seq_idx) and seq_idx <= parser_args.end_seq:
    seq_idxs += [seq_idx]
    dataset.load_seqs(seq_idx, seq_idx + 1)
    seq_len = dataset.get_seq_length(seq_idx)
    seq_lens += [seq_len]
    tag = dataset.get_tag(seq_idx)
    seq_tags += [tag]
    max_tag_len = max(len(tag), max_tag_len)
    total_seq_len += seq_len
    if dataset_num_seqs is not None:
      progress_bar_with_time(float(seq_idx - parser_args.start_seq) / dataset_num_seqs)
    seq_idx += 1
  num_seqs = len(seq_idxs)

  assert num_seqs > 0
  shapes = {}
  for data_key in data_keys:
    assert data_key in total_seq_len.dict
    shape = [total_seq_len[data_key]]
    shape += dataset.get_data_shape(data_key)
    print >> log.v3, "Total len of %r is %s, shape %r, dtype %s" % (
                     data_key, human_size(shape[0]), shape, dataset.get_data_dtype(data_key))
    shapes[data_key] = shape

  print >> log.v3, "Set seq tags..."
  hdf_dataset.create_dataset('seqTags', shape=(num_seqs,), dtype="S%i" % (max_tag_len + 1))
  for i, tag in enumerate(seq_tags):
    hdf_dataset['seqTags'][i] = tag
    progress_bar_with_time(float(i) / num_seqs)

  print >> log.v3, "Set seq len info..."
  hdf_dataset.create_dataset(HDFDataset.attr_seqLengths, shape=(num_seqs, 2), dtype="int32")
  for i, seq_len in enumerate(seq_lens):
    data_len = seq_len["data"]
    targets_len = seq_len["classes"]
    for data_key in dataset.get_target_list():
      assert seq_len[data_key] == targets_len, "different lengths in multi-target not supported"
    if targets_len is None:
      targets_len = data_len
    hdf_dataset[HDFDataset.attr_seqLengths][i] = [data_len, targets_len]
    progress_bar_with_time(float(i) / num_seqs)

  print >> log.v3, "Create arrays in HDF..."
  hdf_dataset.create_group('targets/data')
  hdf_dataset.create_group('targets/size')
  hdf_dataset.create_group('targets/labels')
  for data_key in data_keys:
    if data_key == "data":
      hdf_dataset.create_dataset(
        'inputs', shape=shapes[data_key], dtype=dataset.get_data_dtype(data_key))
    else:
      hdf_dataset['targets/data'].create_dataset(
        data_key, shape=shapes[data_key], dtype=dataset.get_data_dtype(data_key))
      hdf_dataset['targets/size'].attrs[data_key] = dataset.num_outputs[data_key]

    if data_key in dataset.labels:
      labels = dataset.labels[data_key]
      assert len(labels) == dataset.num_outputs[data_key][0]
    else:
      labels = ["%s-class-%i" % (data_key, i) for i in range(dataset.get_data_dim(data_key))]
    print >> log.v5, "Labels for %s:" % data_key, labels[:3], "..."
    max_label_len = max(map(len, labels))
    hdf_dataset['targets/labels'].create_dataset(data_key, (len(labels),), dtype="S%i" % (max_label_len + 1))
    for i, label in enumerate(labels):
      hdf_dataset['targets/labels'][data_key][i] = label

  # Again iterate through dataset, and set the data
  print >> log.v3, "Write data..."
  dataset.init_seq_order(parser_args.epoch)
  offsets = NumbersDict(0)
  for seq_idx, tag in zip(seq_idxs, seq_tags):
    dataset.load_seqs(seq_idx, seq_idx + 1)
    tag_ = dataset.get_tag(seq_idx)
    assert tag == tag_  # Just a check for sanity. We expect the same order.
    seq_len = dataset.get_seq_length(seq_idx)
    for data_key in data_keys:
      if data_key == "data":
        hdf_data = hdf_dataset['inputs']
      else:
        hdf_data = hdf_dataset['targets/data'][data_key]
      data = dataset.get_data(seq_idx, data_key)
      hdf_data[offsets[data_key]:offsets[data_key] + seq_len[data_key]] = data

    progress_bar_with_time(float(offsets["data"]) / total_seq_len["data"])

    offsets += seq_len

  assert offsets == total_seq_len  # Sanity check.

  # Set some old-format attribs. Not needed for newer CRNN versions.
  hdf_dataset.attrs[HDFDataset.attr_inputPattSize] = dataset.num_inputs
  hdf_dataset.attrs[HDFDataset.attr_numLabels] = dataset.num_outputs.get("classes", (0, 0))[0]

  print >> log.v3, "All done."
Code example #7
def demo():
    print("SprintDataset demo.")
    from argparse import ArgumentParser
    from Util import hms, progress_bar_with_time
    from Log import log
    from Config import Config
    from Dataset import init_dataset
    arg_parser = ArgumentParser()
    arg_parser.add_argument("--config",
                            help="config with ExternSprintDataset",
                            required=True)
    arg_parser.add_argument("--sprint_cache_dataset",
                            help="kwargs dict for SprintCacheDataset",
                            required=True)
    arg_parser.add_argument("--max_num_seqs", default=sys.maxint, type=int)
    arg_parser.add_argument("--action",
                            default="compare",
                            help="compare or benchmark")
    args = arg_parser.parse_args()
    log.initialize(verbosity=[4])
    sprint_cache_dataset_kwargs = eval(args.sprint_cache_dataset)
    assert isinstance(sprint_cache_dataset_kwargs, dict)
    sprint_cache_dataset = SprintCacheDataset(**sprint_cache_dataset_kwargs)
    print("SprintCacheDataset: %r" % sprint_cache_dataset)
    config = Config()
    config.load_file(args.config)
    dataset = init_dataset(config.typed_value("train"))
    print("Dataset via config: %r" % dataset)
    assert sprint_cache_dataset.num_inputs == dataset.num_inputs
    assert tuple(sprint_cache_dataset.num_outputs["classes"]) == tuple(
        dataset.num_outputs["classes"])
    sprint_cache_dataset.init_seq_order(epoch=1)

    if args.action == "compare":
        print("Iterating through dataset...")
        seq_idx = 0
        dataset.init_seq_order(epoch=1)
        while seq_idx < args.max_num_seqs:
            if not dataset.is_less_than_num_seqs(seq_idx):
                break
            dataset.load_seqs(seq_idx, seq_idx + 1)
            tag = dataset.get_tag(seq_idx)
            assert not tag.startswith(
                "seq-"), "dataset does not provide tag-names for seqs"
            dataset_seq = sprint_cache_dataset.get_dataset_seq_for_name(tag)
            data = dataset.get_data(seq_idx, "data")
            targets = dataset.get_data(seq_idx, "classes")
            assert data.shape == dataset_seq.features.shape
            assert targets.shape == dataset_seq.targets["classes"].shape
            assert numpy.allclose(data, dataset_seq.features)
            assert numpy.allclose(targets, dataset_seq.targets["classes"])
            seq_idx += 1
            progress_bar_with_time(dataset.get_complete_frac(seq_idx))

        print("Finished through dataset. Num seqs: %i" % seq_idx)
        print("SprintCacheDataset has num seqs: %i." %
              sprint_cache_dataset.num_seqs)

    elif args.action == "benchmark":
        print("Iterating through dataset...")
        start_time = time.time()
        seq_tags = []
        seq_idx = 0
        dataset.init_seq_order(epoch=1)
        while seq_idx < args.max_num_seqs:
            if not dataset.is_less_than_num_seqs(seq_idx):
                break
            dataset.load_seqs(seq_idx, seq_idx + 1)
            tag = dataset.get_tag(seq_idx)
            assert not tag.startswith(
                "seq-"), "dataset does not provide tag-names for seqs"
            seq_tags.append(tag)
            dataset.get_data(seq_idx, "data")
            dataset.get_data(seq_idx, "classes")
            seq_idx += 1
            progress_bar_with_time(dataset.get_complete_frac(seq_idx))
        print("Finished through dataset. Num seqs: %i, time: %f" %
              (seq_idx, time.time() - start_time))
        print("SprintCacheDataset has num seqs: %i." %
              sprint_cache_dataset.num_seqs)
        if hasattr(dataset, "exit_handler"):
            dataset.exit_handler()
        else:
            print("No way to stop any background tasks.")
        del dataset

        start_time = time.time()
        print("Iterating through SprintCacheDataset...")
        for i, tag in enumerate(seq_tags):
            sprint_cache_dataset.get_dataset_seq_for_name(tag)
            progress_bar_with_time(float(i) / len(seq_tags))
        print("Finished through SprintCacheDataset. time: %f" %
              (time.time() - start_time, ))

    else:
        raise Exception("invalid action: %r" % args.action)
Code example #8
File: SprintDataset.py  Project: rwth-i6/returnn
def demo():
  """
  Demo.
  """
  print("SprintDataset demo.")
  from argparse import ArgumentParser
  from Util import progress_bar_with_time
  from Log import log
  from Config import Config
  from Dataset import init_dataset
  arg_parser = ArgumentParser()
  arg_parser.add_argument("--config", help="config with ExternSprintDataset", required=True)
  arg_parser.add_argument("--sprint_cache_dataset", help="kwargs dict for SprintCacheDataset", required=True)
  arg_parser.add_argument("--max_num_seqs", default=sys.maxsize, type=int)
  arg_parser.add_argument("--action", default="compare", help="compare or benchmark")
  args = arg_parser.parse_args()
  log.initialize(verbosity=[4])
  sprint_cache_dataset_kwargs = eval(args.sprint_cache_dataset)
  assert isinstance(sprint_cache_dataset_kwargs, dict)
  sprint_cache_dataset = SprintCacheDataset(**sprint_cache_dataset_kwargs)
  print("SprintCacheDataset: %r" % sprint_cache_dataset)
  config = Config()
  config.load_file(args.config)
  dataset = init_dataset(config.typed_value("train"))
  print("Dataset via config: %r" % dataset)
  assert sprint_cache_dataset.num_inputs == dataset.num_inputs
  assert tuple(sprint_cache_dataset.num_outputs["classes"]) == tuple(dataset.num_outputs["classes"])
  sprint_cache_dataset.init_seq_order(epoch=1)

  if args.action == "compare":
    print("Iterating through dataset...")
    seq_idx = 0
    dataset.init_seq_order(epoch=1)
    while seq_idx < args.max_num_seqs:
      if not dataset.is_less_than_num_seqs(seq_idx):
        break
      dataset.load_seqs(seq_idx, seq_idx + 1)
      tag = dataset.get_tag(seq_idx)
      assert not tag.startswith("seq-"), "dataset does not provide tag-names for seqs"
      dataset_seq = sprint_cache_dataset.get_dataset_seq_for_name(tag)
      data = dataset.get_data(seq_idx, "data")
      targets = dataset.get_data(seq_idx, "classes")
      assert data.shape == dataset_seq.features["data"].shape
      assert targets.shape == dataset_seq.features["classes"].shape
      assert numpy.allclose(data, dataset_seq.features["data"])
      assert numpy.allclose(targets, dataset_seq.features["classes"])
      seq_idx += 1
      progress_bar_with_time(dataset.get_complete_frac(seq_idx))

    print("Finished through dataset. Num seqs: %i" % seq_idx)
    print("SprintCacheDataset has num seqs: %i." % sprint_cache_dataset.num_seqs)

  elif args.action == "benchmark":
    print("Iterating through dataset...")
    start_time = time.time()
    seq_tags = []
    seq_idx = 0
    dataset.init_seq_order(epoch=1)
    while seq_idx < args.max_num_seqs:
      if not dataset.is_less_than_num_seqs(seq_idx):
        break
      dataset.load_seqs(seq_idx, seq_idx + 1)
      tag = dataset.get_tag(seq_idx)
      assert not tag.startswith("seq-"), "dataset does not provide tag-names for seqs"
      seq_tags.append(tag)
      dataset.get_data(seq_idx, "data")
      dataset.get_data(seq_idx, "classes")
      seq_idx += 1
      progress_bar_with_time(dataset.get_complete_frac(seq_idx))
    print("Finished through dataset. Num seqs: %i, time: %f" % (seq_idx, time.time() - start_time))
    print("SprintCacheDataset has num seqs: %i." % sprint_cache_dataset.num_seqs)
    if hasattr(dataset, "exit_handler"):
      dataset.exit_handler()
    else:
      print("No way to stop any background tasks.")
    del dataset

    start_time = time.time()
    print("Iterating through SprintCacheDataset...")
    for i, tag in enumerate(seq_tags):
      sprint_cache_dataset.get_dataset_seq_for_name(tag)
      progress_bar_with_time(float(i) / len(seq_tags))
    print("Finished through SprintCacheDataset. time: %f" % (time.time() - start_time,))

  else:
    raise Exception("invalid action: %r" % args.action)
Code example #9
File: HDFDataset.py  Project: nikita68/returnn
  def dump_from_dataset(self, dataset, epoch=1, start_seq=0, end_seq=float("inf"), use_progress_bar=True):
    """
    :param Dataset dataset: could be any dataset implemented as child of Dataset
    :param int epoch: for dataset
    :param int start_seq:
    :param int|float end_seq:
    :param bool use_progress_bar:
    """
    from Util import NumbersDict, human_size, progress_bar_with_time, try_run, PY3
    hdf_dataset = self.file

    print("Work on epoch: %i" % epoch, file=log.v3)
    dataset.init_seq_order(epoch)

    data_keys = sorted(dataset.get_data_keys())
    print("Data keys:", data_keys, file=log.v3)
    if "orth" in data_keys:  # special workaround for now, not handled
      data_keys.remove("orth")
    data_target_keys = [key for key in dataset.get_target_list() if key in data_keys]
    data_input_keys = [key for key in data_keys if key not in data_target_keys]
    assert len(data_input_keys) > 0 and len(data_target_keys) > 0
    if len(data_input_keys) > 1:
      if "data" in data_input_keys:
        default_data_input_key = "data"
      else:
        raise Exception("not sure which input data key to use from %r" % (data_input_keys,))
    else:
      default_data_input_key = data_input_keys[0]
    print("Using input data key:", default_data_input_key)
    if len(data_target_keys) > 1:
      if "classes" in data_target_keys:
        default_data_target_key = "classes"
      else:
        raise Exception("not sure which target data key to use from %r" % (data_target_keys,))
    else:
      default_data_target_key = data_target_keys[0]
    print("Using target data key:", default_data_target_key)

    hdf_data_key_map = {key: key for key in data_keys if key != default_data_input_key}
    if "data" in hdf_data_key_map:
      hdf_data_key_map["data"] = "classes"  # Replace "data" which is reserved for input key in HDFDataset.
      assert "classes" not in hdf_data_key_map

    # We need to do one run through the dataset to collect some stats like total len.
    print("Collect stats, iterate through all data...", file=log.v3)
    seq_idx = start_seq
    seq_idxs = []
    seq_tags = []
    seq_lens = []
    total_seq_len = NumbersDict(0)
    max_tag_len = 0
    dataset_num_seqs = try_run(lambda: dataset.num_seqs, default=None)  # can be unknown
    if end_seq != float("inf"):
      if dataset_num_seqs is not None:
        dataset_num_seqs = min(dataset_num_seqs, end_seq)
      else:
        dataset_num_seqs = end_seq
    if dataset_num_seqs is not None:
      dataset_num_seqs -= start_seq
      assert dataset_num_seqs > 0
    while dataset.is_less_than_num_seqs(seq_idx) and seq_idx <= end_seq:
      seq_idxs += [seq_idx]
      dataset.load_seqs(seq_idx, seq_idx + 1)
      seq_len = dataset.get_seq_length(seq_idx)
      seq_lens += [seq_len]
      tag = dataset.get_tag(seq_idx)
      seq_tags += [tag]
      max_tag_len = max(len(tag), max_tag_len)
      total_seq_len += seq_len
      if use_progress_bar and dataset_num_seqs is not None:
        progress_bar_with_time(float(seq_idx - start_seq) / dataset_num_seqs)
      seq_idx += 1
    num_seqs = len(seq_idxs)

    assert num_seqs > 0
    shapes = {}
    for data_key in data_keys:
      assert data_key in total_seq_len.dict
      shape = [total_seq_len[data_key]]
      shape += dataset.get_data_shape(data_key)
      print("Total len of %r is %s, shape %r, dtype %s" % (
        data_key, human_size(shape[0]), shape, dataset.get_data_dtype(data_key)), file=log.v3)
      shapes[data_key] = shape

    print("Set seq tags...", file=log.v3)
    hdf_dataset.create_dataset('seqTags', shape=(num_seqs,), dtype="S%i" % (max_tag_len + 1))
    for i, tag in enumerate(seq_tags):
      hdf_dataset['seqTags'][i] = numpy.array(tag, dtype="S%i" % (max_tag_len + 1))
      if use_progress_bar:
        progress_bar_with_time(float(i) / num_seqs)

    print("Set seq len info...", file=log.v3)
    hdf_dataset.create_dataset(attr_seqLengths, shape=(num_seqs, 2), dtype="int32")
    for i, seq_len in enumerate(seq_lens):
      data_len = seq_len[default_data_input_key]
      targets_len = seq_len[default_data_target_key]
      for data_key in data_target_keys:
        assert seq_len[data_key] == targets_len, "different lengths in multi-target not supported"
      if targets_len is None:
        targets_len = data_len
      hdf_dataset[attr_seqLengths][i] = [data_len, targets_len]
      if use_progress_bar:
        progress_bar_with_time(float(i) / num_seqs)

    print("Create arrays in HDF...", file=log.v3)
    hdf_dataset.create_group('targets/data')
    hdf_dataset.create_group('targets/size')
    hdf_dataset.create_group('targets/labels')
    for data_key in data_keys:
      if data_key == default_data_input_key:
        hdf_dataset.create_dataset(
          'inputs', shape=shapes[data_key], dtype=dataset.get_data_dtype(data_key))
      else:
        hdf_dataset['targets/data'].create_dataset(
          hdf_data_key_map[data_key], shape=shapes[data_key], dtype=dataset.get_data_dtype(data_key))
        hdf_dataset['targets/size'].attrs[hdf_data_key_map[data_key]] = dataset.num_outputs[data_key]
      if data_key in dataset.labels:
        labels = dataset.labels[data_key]
        if PY3:
          labels = [label.encode("utf8") for label in labels]
        assert len(labels) == dataset.num_outputs[data_key][0]
      else:
        labels = ["%s-class-%i" % (data_key, i) for i in range(dataset.get_data_dim(data_key))]
      print("Labels for %s:" % data_key, labels[:3], "...", file=log.v5)
      max_label_len = max(map(len, labels))
      if data_key != default_data_input_key:
        hdf_dataset['targets/labels'].create_dataset(hdf_data_key_map[data_key],
                                                     (len(labels),), dtype="S%i" % (max_label_len + 1))
        for i, label in enumerate(labels):
          hdf_dataset['targets/labels'][hdf_data_key_map[data_key]][i] = numpy.array(
            label, dtype="S%i" % (max_label_len + 1))

    # Again iterate through dataset, and set the data
    print("Write data...", file=log.v3)
    dataset.init_seq_order(epoch)
    offsets = NumbersDict(0)
    for seq_idx, tag in zip(seq_idxs, seq_tags):
      dataset.load_seqs(seq_idx, seq_idx + 1)
      tag_ = dataset.get_tag(seq_idx)
      assert tag == tag_  # Just a check for sanity. We expect the same order.
      seq_len = dataset.get_seq_length(seq_idx)
      for data_key in data_keys:
        if data_key == default_data_input_key:
          hdf_data = hdf_dataset['inputs']
        else:
          hdf_data = hdf_dataset['targets/data'][hdf_data_key_map[data_key]]
        data = dataset.get_data(seq_idx, data_key)
        hdf_data[offsets[data_key]:offsets[data_key] + seq_len[data_key]] = data

      if use_progress_bar:
        progress_bar_with_time(float(offsets[default_data_input_key]) / total_seq_len[default_data_input_key])

      offsets += seq_len

    assert offsets == total_seq_len  # Sanity check.

    # Set some old-format attribs. Not needed for newer CRNN versions.
    hdf_dataset.attrs[attr_inputPattSize] = dataset.num_inputs
    hdf_dataset.attrs[attr_numLabels] = dataset.num_outputs.get(default_data_target_key, (0, 0))[0]

    print("All done.", file=log.v3)
Code example #10
File: HDFDataset.py  Project: rwth-i6/returnn
  def dump_from_dataset(self, dataset, epoch=1, start_seq=0, end_seq=float("inf"), use_progress_bar=True):
    """
    :param Dataset dataset: could be any dataset implemented as child of Dataset
    :param int epoch: for dataset
    :param int start_seq:
    :param int|float end_seq:
    :param bool use_progress_bar:
    """
    from Util import NumbersDict, human_size, progress_bar_with_time, try_run, PY3
    hdf_dataset = self.file

    print("Work on epoch: %i" % epoch, file=log.v3)
    dataset.init_seq_order(epoch)

    data_keys = sorted(dataset.get_data_keys())
    print("Data keys:", data_keys, file=log.v3)
    if "orth" in data_keys:  # special workaround for now, not handled
      data_keys.remove("orth")
    data_target_keys = [key for key in dataset.get_target_list() if key in data_keys]
    data_input_keys = [key for key in data_keys if key not in data_target_keys]
    assert len(data_input_keys) > 0 and len(data_target_keys) > 0
    if len(data_input_keys) > 1:
      if "data" in data_input_keys:
        default_data_input_key = "data"
      else:
        raise Exception("not sure which input data key to use from %r" % (data_input_keys,))
    else:
      default_data_input_key = data_input_keys[0]
    print("Using input data key:", default_data_input_key)
    if len(data_target_keys) > 1:
      if "classes" in data_target_keys:
        default_data_target_key = "classes"
      else:
        raise Exception("not sure which target data key to use from %r" % (data_target_keys,))
    else:
      default_data_target_key = data_target_keys[0]
    print("Using target data key:", default_data_target_key)

    hdf_data_key_map = {key: key for key in data_keys if key != default_data_input_key}
    if "data" in hdf_data_key_map:
      hdf_data_key_map["data"] = "classes"  # Replace "data" which is reserved for input key in HDFDataset.
      assert "classes" not in hdf_data_key_map

    # We need to do one run through the dataset to collect some stats like total len.
    print("Collect stats, iterate through all data...", file=log.v3)
    seq_idx = start_seq
    seq_idxs = []
    seq_tags = []
    seq_lens = []
    total_seq_len = NumbersDict(0)
    max_tag_len = 0
    dataset_num_seqs = try_run(lambda: dataset.num_seqs, default=None)  # can be unknown
    if end_seq != float("inf"):
      if dataset_num_seqs is not None:
        dataset_num_seqs = min(dataset_num_seqs, end_seq)
      else:
        dataset_num_seqs = end_seq
    if dataset_num_seqs is not None:
      dataset_num_seqs -= start_seq
      assert dataset_num_seqs > 0
    while dataset.is_less_than_num_seqs(seq_idx) and seq_idx <= end_seq:
      seq_idxs += [seq_idx]
      dataset.load_seqs(seq_idx, seq_idx + 1)
      seq_len = dataset.get_seq_length(seq_idx)
      seq_lens += [seq_len]
      tag = dataset.get_tag(seq_idx)
      seq_tags += [tag]
      max_tag_len = max(len(tag), max_tag_len)
      total_seq_len += seq_len
      if use_progress_bar and dataset_num_seqs is not None:
        progress_bar_with_time(float(seq_idx - start_seq) / dataset_num_seqs)
      seq_idx += 1
    num_seqs = len(seq_idxs)

    assert num_seqs > 0
    shapes = {}
    for data_key in data_keys:
      assert data_key in total_seq_len.dict
      shape = [total_seq_len[data_key]]
      shape += dataset.get_data_shape(data_key)
      print("Total len of %r is %s, shape %r, dtype %s" % (
        data_key, human_size(shape[0]), shape, dataset.get_data_dtype(data_key)), file=log.v3)
      shapes[data_key] = shape

    print("Set seq tags...", file=log.v3)
    hdf_dataset.create_dataset('seqTags', shape=(num_seqs,), dtype="S%i" % (max_tag_len + 1))
    for i, tag in enumerate(seq_tags):
      hdf_dataset['seqTags'][i] = numpy.array(tag, dtype="S%i" % (max_tag_len + 1))
      if use_progress_bar:
        progress_bar_with_time(float(i) / num_seqs)

    print("Set seq len info...", file=log.v3)
    hdf_dataset.create_dataset(attr_seqLengths, shape=(num_seqs, 2), dtype="int32")
    for i, seq_len in enumerate(seq_lens):
      data_len = seq_len[default_data_input_key]
      targets_len = seq_len[default_data_target_key]
      for data_key in data_target_keys:
        assert seq_len[data_key] == targets_len, "different lengths in multi-target not supported"
      if targets_len is None:
        targets_len = data_len
      hdf_dataset[attr_seqLengths][i] = [data_len, targets_len]
      if use_progress_bar:
        progress_bar_with_time(float(i) / num_seqs)

    print("Create arrays in HDF...", file=log.v3)
    hdf_dataset.create_group('targets/data')
    hdf_dataset.create_group('targets/size')
    hdf_dataset.create_group('targets/labels')
    for data_key in data_keys:
      if data_key == default_data_input_key:
        hdf_dataset.create_dataset(
          'inputs', shape=shapes[data_key], dtype=dataset.get_data_dtype(data_key))
      else:
        hdf_dataset['targets/data'].create_dataset(
          hdf_data_key_map[data_key], shape=shapes[data_key], dtype=dataset.get_data_dtype(data_key))
        hdf_dataset['targets/size'].attrs[hdf_data_key_map[data_key]] = dataset.num_outputs[data_key]
      if data_key in dataset.labels:
        labels = dataset.labels[data_key]
        if PY3:
          labels = [label.encode("utf8") for label in labels]
        assert len(labels) == dataset.num_outputs[data_key][0]
      else:
        labels = ["%s-class-%i" % (data_key, i) for i in range(dataset.get_data_dim(data_key))]
      print("Labels for %s:" % data_key, labels[:3], "...", file=log.v5)
      max_label_len = max(map(len, labels))
      if data_key != default_data_input_key:
        hdf_dataset['targets/labels'].create_dataset(hdf_data_key_map[data_key],
                                                     (len(labels),), dtype="S%i" % (max_label_len + 1))
        for i, label in enumerate(labels):
          hdf_dataset['targets/labels'][hdf_data_key_map[data_key]][i] = numpy.array(
            label, dtype="S%i" % (max_label_len + 1))

    # Again iterate through dataset, and set the data
    print("Write data...", file=log.v3)
    dataset.init_seq_order(epoch)
    offsets = NumbersDict(0)
    for seq_idx, tag in zip(seq_idxs, seq_tags):
      dataset.load_seqs(seq_idx, seq_idx + 1)
      tag_ = dataset.get_tag(seq_idx)
      assert tag == tag_  # Just a check for sanity. We expect the same order.
      seq_len = dataset.get_seq_length(seq_idx)
      for data_key in data_keys:
        if data_key == default_data_input_key:
          hdf_data = hdf_dataset['inputs']
        else:
          hdf_data = hdf_dataset['targets/data'][hdf_data_key_map[data_key]]
        data = dataset.get_data(seq_idx, data_key)
        hdf_data[offsets[data_key]:offsets[data_key] + seq_len[data_key]] = data

      if use_progress_bar:
        progress_bar_with_time(float(offsets[default_data_input_key]) / total_seq_len[default_data_input_key])

      offsets += seq_len

    assert offsets == total_seq_len  # Sanity check.

    # Set some old-format attribs. Not needed for newer CRNN versions.
    hdf_dataset.attrs[attr_inputPattSize] = dataset.num_inputs
    hdf_dataset.attrs[attr_numLabels] = dataset.num_outputs.get(default_data_target_key, (0, 0))[0]

    print("All done.", file=log.v3)