Example #1
def analyze_data(config):  # pylint: disable=redefined-outer-name
    """
  :param Config config:
  """
    dss = config.value('analyze_dataset', 'train')
    ds = {"train": train_data, "dev": dev_data, "eval": eval_data}[dss]
    epoch = config.int('epoch', 1)
    print("Analyze dataset", dss, "epoch", epoch, file=log.v1)
    ds.init_seq_order(epoch=epoch)
    stat_prefix = config.value('statistics_save_prefix', 'statistics')
    dtype = config.value('statistics_dtype', 'float64')
    target = config.value('target', 'classes')
    data_key = config.value('data_key', 'data')
    assert ds.is_data_sparse(target), "need for prior calculation"
    assert not ds.is_data_sparse(data_key), "needed for mean/var estimation"
    from Util import inplace_increment, progress_bar_with_time, NumbersDict

    priors = numpy.zeros((ds.get_data_dim(target), ), dtype=dtype)
    mean = numpy.zeros((ds.get_data_dim(data_key), ), dtype=dtype)
    mean_sq = numpy.zeros((ds.get_data_dim(data_key), ), dtype=dtype)
    total_targets_len = 0
    total_data_len = 0

    # Note: This is not stable! See :class:`Util.Stats` for a better alternative.
    seq_idx = 0
    while ds.is_less_than_num_seqs(seq_idx):
        progress_bar_with_time(ds.get_complete_frac(seq_idx))
        ds.load_seqs(seq_idx, seq_idx + 1)
        targets = ds.get_data(seq_idx, target)
        inplace_increment(priors, targets, 1)
        total_targets_len += targets.shape[0]
        data = ds.get_data(seq_idx, data_key)
        new_total_data_len = total_data_len + data.shape[0]
        f = float(total_data_len) / new_total_data_len
        mean = mean * f + numpy.sum(data, axis=0) * (1.0 - f)
        mean_sq = mean_sq * f + numpy.sum(data * data, axis=0) * (1.0 - f)
        total_data_len = new_total_data_len
        seq_idx += 1
    log_priors = numpy.log(priors)
    log_priors -= numpy.log(NumbersDict(ds.get_num_timesteps())[target])
    std_dev = numpy.sqrt(mean_sq - mean * mean)
    print("Finished. %i total target frames, %i total data frames" %
          (total_targets_len, total_data_len),
          file=log.v1)
    priors_fn = stat_prefix + ".log_priors.txt"
    mean_fn = stat_prefix + ".mean.txt"
    std_dev_fn = stat_prefix + ".std_dev.txt"
    print("Dump priors to", priors_fn, file=log.v1)
    numpy.savetxt(priors_fn, log_priors)
    print("Dump mean to", mean_fn, file=log.v1)
    numpy.savetxt(mean_fn, mean)
    print("Dump std dev to", std_dev_fn, file=log.v1)
    numpy.savetxt(std_dev_fn, std_dev)
    print("Done.", file=log.v1)
Example #2
def analyze_data(config):  # pylint: disable=redefined-outer-name
  """
  :param Config config:
  """
  dss = config.value('analyze_dataset', 'train')
  ds = {"train": train_data, "dev": dev_data, "eval": eval_data}[dss]
  epoch = config.int('epoch', 1)
  print("Analyze dataset", dss, "epoch", epoch, file=log.v1)
  ds.init_seq_order(epoch=epoch)
  stat_prefix = config.value('statistics_save_prefix', 'statistics')
  dtype = config.value('statistics_dtype', 'float64')
  target = config.value('target', 'classes')
  data_key = config.value('data_key', 'data')
  assert ds.is_data_sparse(target), "need for prior calculation"
  assert not ds.is_data_sparse(data_key), "needed for mean/var estimation"
  from Util import inplace_increment, progress_bar_with_time, NumbersDict

  priors = numpy.zeros((ds.get_data_dim(target),), dtype=dtype)
  mean = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  mean_sq = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  total_targets_len = 0
  total_data_len = 0

  # Note: This is not stable! See :class:`Util.Stats` for a better alternative.
  seq_idx = 0
  while ds.is_less_than_num_seqs(seq_idx):
    progress_bar_with_time(ds.get_complete_frac(seq_idx))
    ds.load_seqs(seq_idx, seq_idx + 1)
    targets = ds.get_data(seq_idx, target)
    inplace_increment(priors, targets, 1)
    total_targets_len += targets.shape[0]
    data = ds.get_data(seq_idx, data_key)
    new_total_data_len = total_data_len + data.shape[0]
    f = float(total_data_len) / new_total_data_len
    mean = mean * f + numpy.sum(data, axis=0) * (1.0 - f)
    mean_sq = mean_sq * f + numpy.sum(data * data, axis=0) * (1.0 - f)
    total_data_len = new_total_data_len
    seq_idx += 1
  log_priors = numpy.log(priors)
  log_priors -= numpy.log(NumbersDict(ds.get_num_timesteps())[target])
  std_dev = numpy.sqrt(mean_sq - mean * mean)
  print("Finished. %i total target frames, %i total data frames" % (total_targets_len, total_data_len), file=log.v1)
  priors_fn = stat_prefix + ".log_priors.txt"
  mean_fn = stat_prefix + ".mean.txt"
  std_dev_fn = stat_prefix + ".std_dev.txt"
  print("Dump priors to", priors_fn, file=log.v1)
  numpy.savetxt(priors_fn, log_priors)
  print("Dump mean to", mean_fn, file=log.v1)
  numpy.savetxt(mean_fn, mean)
  print("Dump std dev to", std_dev_fn, file=log.v1)
  numpy.savetxt(std_dev_fn, std_dev)
  print("Done.", file=log.v1)
Example #3
def analyze_data(config):
    dss = config.value('analyze_dataset', 'train')
    ds = {"train": train_data, "dev": dev_data, "eval": eval_data}[dss]
    epoch = config.int('epoch', 1)
    print >> log.v1, "Analyze dataset", dss, "epoch", epoch
    ds.init_seq_order(epoch=epoch)
    stat_prefix = config.value('statistics_save_prefix', 'statistics')
    dtype = config.value('statistics_dtype', 'float64')
    target = config.value('target', 'classes')
    data_key = config.value('data_key', 'data')
    assert ds.is_data_sparse(target), "need for prior calculation"
    assert not ds.is_data_sparse(data_key), "needed for mean/var estimation"
    from Util import inplace_increment, progress_bar_with_time, NumbersDict

    priors = numpy.zeros((ds.get_data_dim(target), ), dtype=dtype)
    mean = numpy.zeros((ds.get_data_dim(data_key), ), dtype=dtype)
    mean_sq = numpy.zeros((ds.get_data_dim(data_key), ), dtype=dtype)
    total_targets_len = 0
    total_data_len = 0

    seq_idx = 0
    while ds.is_less_than_num_seqs(seq_idx):
        progress_bar_with_time(ds.get_complete_frac(seq_idx))
        ds.load_seqs(seq_idx, seq_idx + 1)
        targets = ds.get_data(seq_idx, target)
        inplace_increment(priors, targets, 1)
        total_targets_len += targets.shape[0]
        data = ds.get_data(seq_idx, data_key)
        new_total_data_len = total_data_len + data.shape[0]
        f = float(total_data_len) / new_total_data_len
        mean = mean * f + numpy.sum(data, axis=0) * (1.0 - f)
        mean_sq = mean_sq * f + numpy.sum(data * data, axis=0) * (1.0 - f)
        total_data_len = new_total_data_len
        seq_idx += 1
    log_priors = numpy.log(priors)
    log_priors -= numpy.log(NumbersDict(ds.get_num_timesteps())[target])
    var = numpy.sqrt(mean_sq - mean * mean)  # note: despite the name, sqrt(E[x^2] - E[x]^2) is the standard deviation, not the variance
    print >> log.v1, "Finished. %i total target frames, %i total data frames" % (
        total_targets_len, total_data_len)
    priors_fn = stat_prefix + ".log_priors.txt"
    mean_fn = stat_prefix + ".mean.txt"
    var_fn = stat_prefix + ".var.txt"
    print >> log.v1, "Dump priors to", priors_fn
    numpy.savetxt(priors_fn, log_priors)
    print >> log.v1, "Dump mean to", mean_fn
    numpy.savetxt(mean_fn, mean)
    print >> log.v1, "Dump var to", var_fn
    numpy.savetxt(var_fn, var)
    print >> log.v1, "Done."
Example #4
def analyze_data(config):
  dss = config.value('analyze_dataset', 'train')
  ds = {"train": train_data, "dev": dev_data, "eval": eval_data}[dss]
  epoch = config.int('epoch', 1)
  print >> log.v1, "Analyze dataset", dss, "epoch", epoch
  ds.init_seq_order(epoch=epoch)
  stat_prefix = config.value('statistics_save_prefix', 'statistics')
  dtype = config.value('statistics_dtype', 'float64')
  target = config.value('target', 'classes')
  data_key = config.value('data_key', 'data')
  assert ds.is_data_sparse(target), "need for prior calculation"
  assert not ds.is_data_sparse(data_key), "needed for mean/var estimation"
  from Util import inplace_increment, progress_bar_with_time, NumbersDict

  priors = numpy.zeros((ds.get_data_dim(target),), dtype=dtype)
  mean = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  mean_sq = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  total_targets_len = 0
  total_data_len = 0

  seq_idx = 0
  while ds.is_less_than_num_seqs(seq_idx):
    progress_bar_with_time(ds.get_complete_frac(seq_idx))
    ds.load_seqs(seq_idx, seq_idx + 1)
    targets = ds.get_data(seq_idx, target)
    inplace_increment(priors, targets, 1)
    total_targets_len += targets.shape[0]
    data = ds.get_data(seq_idx, data_key)
    new_total_data_len = total_data_len + data.shape[0]
    f = float(total_data_len) / new_total_data_len
    mean = mean * f + numpy.sum(data, axis=0) * (1.0 - f)
    mean_sq = mean_sq * f + numpy.sum(data * data, axis=0) * (1.0 - f)
    total_data_len = new_total_data_len
    seq_idx += 1
  log_priors = numpy.log(priors)
  log_priors -= numpy.log(NumbersDict(ds.get_num_timesteps())[target])
  var = numpy.sqrt(mean_sq - mean * mean)  # note: despite the name, sqrt(E[x^2] - E[x]^2) is the standard deviation, not the variance
  print >> log.v1, "Finished. %i total target frames, %i total data frames" % (total_targets_len, total_data_len)
  priors_fn = stat_prefix + ".log_priors.txt"
  mean_fn = stat_prefix + ".mean.txt"
  var_fn = stat_prefix + ".var.txt"
  print >> log.v1, "Dump priors to", priors_fn
  numpy.savetxt(priors_fn, log_priors)
  print >> log.v1, "Dump mean to", mean_fn
  numpy.savetxt(mean_fn, mean)
  print >> log.v1, "Dump var to", var_fn
  numpy.savetxt(var_fn, var)
  print >> log.v1, "Done."