def analyze_data(config):  # pylint: disable=redefined-outer-name
  """
  Compute dataset statistics and dump them to text files:
  log-priors of the sparse target labels, and per-feature mean and
  standard deviation of the dense input data.

  :param Config config: reads 'analyze_dataset', 'epoch', 'statistics_save_prefix',
    'statistics_dtype', 'target' and 'data_key'.
  """
  dss = config.value('analyze_dataset', 'train')
  ds = {"train": train_data, "dev": dev_data, "eval": eval_data}[dss]
  epoch = config.int('epoch', 1)
  print("Analyze dataset", dss, "epoch", epoch, file=log.v1)
  ds.init_seq_order(epoch=epoch)
  stat_prefix = config.value('statistics_save_prefix', 'statistics')
  dtype = config.value('statistics_dtype', 'float64')
  target = config.value('target', 'classes')
  data_key = config.value('data_key', 'data')
  assert ds.is_data_sparse(target), "need for prior calculation"
  assert not ds.is_data_sparse(data_key), "needed for mean/var estimation"
  from Util import inplace_increment, progress_bar_with_time, NumbersDict
  priors = numpy.zeros((ds.get_data_dim(target),), dtype=dtype)
  mean = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  mean_sq = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  total_targets_len = 0
  total_data_len = 0
  # Note: This is not stable! See :class:`Util.Stats` for a better alternative.
  seq_idx = 0
  while ds.is_less_than_num_seqs(seq_idx):
    progress_bar_with_time(ds.get_complete_frac(seq_idx))
    ds.load_seqs(seq_idx, seq_idx + 1)
    targets = ds.get_data(seq_idx, target)
    inplace_increment(priors, targets, 1)  # histogram of the sparse labels
    total_targets_len += targets.shape[0]
    data = ds.get_data(seq_idx, data_key)
    new_total_data_len = total_data_len + data.shape[0]
    f = float(total_data_len) / new_total_data_len
    # Running per-frame moments. Fixed: fold in the per-seq *mean*, not the
    # sum -- mean*f + sum(data)*(1-f) overweights each update by the seq length,
    # whereas mean*f + mean(data)*(1-f) == (old_sum + seq_sum) / new_total_len.
    mean = mean * f + numpy.mean(data, axis=0) * (1.0 - f)
    mean_sq = mean_sq * f + numpy.mean(data * data, axis=0) * (1.0 - f)
    total_data_len = new_total_data_len
    seq_idx += 1
  log_priors = numpy.log(priors)
  log_priors -= numpy.log(NumbersDict(ds.get_num_timesteps())[target])
  std_dev = numpy.sqrt(mean_sq - mean * mean)
  # Fixed: the message string was broken across a physical line (syntax error).
  print("Finished. %i total target frames, %i total data frames" % (
    total_targets_len, total_data_len), file=log.v1)
  priors_fn = stat_prefix + ".log_priors.txt"
  mean_fn = stat_prefix + ".mean.txt"
  std_dev_fn = stat_prefix + ".std_dev.txt"
  print("Dump priors to", priors_fn, file=log.v1)
  numpy.savetxt(priors_fn, log_priors)
  print("Dump mean to", mean_fn, file=log.v1)
  numpy.savetxt(mean_fn, mean)
  print("Dump std dev to", std_dev_fn, file=log.v1)
  numpy.savetxt(std_dev_fn, std_dev)
  print("Done.", file=log.v1)
def analyze_data(config):  # pylint: disable=redefined-outer-name
  """
  Compute dataset statistics (target log-priors, per-feature mean and
  standard deviation of the dense data) and save them via numpy.savetxt.

  :param Config config: reads 'analyze_dataset', 'epoch', 'statistics_save_prefix',
    'statistics_dtype', 'target' and 'data_key'.
  """
  dss = config.value('analyze_dataset', 'train')
  ds = {"train": train_data, "dev": dev_data, "eval": eval_data}[dss]
  epoch = config.int('epoch', 1)
  print("Analyze dataset", dss, "epoch", epoch, file=log.v1)
  ds.init_seq_order(epoch=epoch)
  stat_prefix = config.value('statistics_save_prefix', 'statistics')
  dtype = config.value('statistics_dtype', 'float64')
  target = config.value('target', 'classes')
  data_key = config.value('data_key', 'data')
  assert ds.is_data_sparse(target), "need for prior calculation"
  assert not ds.is_data_sparse(data_key), "needed for mean/var estimation"
  from Util import inplace_increment, progress_bar_with_time, NumbersDict
  priors = numpy.zeros((ds.get_data_dim(target),), dtype=dtype)
  mean = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  mean_sq = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  total_targets_len = 0
  total_data_len = 0
  # Note: This is not stable! See :class:`Util.Stats` for a better alternative.
  seq_idx = 0
  while ds.is_less_than_num_seqs(seq_idx):
    progress_bar_with_time(ds.get_complete_frac(seq_idx))
    ds.load_seqs(seq_idx, seq_idx + 1)
    targets = ds.get_data(seq_idx, target)
    inplace_increment(priors, targets, 1)  # histogram of the sparse labels
    total_targets_len += targets.shape[0]
    data = ds.get_data(seq_idx, data_key)
    new_total_data_len = total_data_len + data.shape[0]
    f = float(total_data_len) / new_total_data_len
    # Running per-frame moments. Fixed: fold in the per-seq *mean*, not the
    # sum -- the sum form overweights each update by the sequence length.
    mean = mean * f + numpy.mean(data, axis=0) * (1.0 - f)
    mean_sq = mean_sq * f + numpy.mean(data * data, axis=0) * (1.0 - f)
    total_data_len = new_total_data_len
    seq_idx += 1
  log_priors = numpy.log(priors)
  log_priors -= numpy.log(NumbersDict(ds.get_num_timesteps())[target])
  std_dev = numpy.sqrt(mean_sq - mean * mean)
  # Fixed: the message string was broken across a physical line (syntax error).
  print("Finished. %i total target frames, %i total data frames" % (
    total_targets_len, total_data_len), file=log.v1)
  priors_fn = stat_prefix + ".log_priors.txt"
  mean_fn = stat_prefix + ".mean.txt"
  std_dev_fn = stat_prefix + ".std_dev.txt"
  print("Dump priors to", priors_fn, file=log.v1)
  numpy.savetxt(priors_fn, log_priors)
  print("Dump mean to", mean_fn, file=log.v1)
  numpy.savetxt(mean_fn, mean)
  print("Dump std dev to", std_dev_fn, file=log.v1)
  numpy.savetxt(std_dev_fn, std_dev)
  print("Done.", file=log.v1)
def analyze_data(config):
  """
  Older duplicate of analyze_data: dumps target log-priors, per-feature mean,
  and a "var" file.  NOTE: the value written to "<prefix>.var.txt" is
  sqrt(variance), i.e. the standard deviation; the file name and log messages
  are kept unchanged for backward compatibility.

  :param Config config:
  """
  dss = config.value('analyze_dataset', 'train')
  ds = {"train": train_data, "dev": dev_data, "eval": eval_data}[dss]
  epoch = config.int('epoch', 1)
  # Fixed: Python-2 "print >> log.v1" statements converted to the print()
  # function form already used by the other definitions in this file.
  print("Analyze dataset", dss, "epoch", epoch, file=log.v1)
  ds.init_seq_order(epoch=epoch)
  stat_prefix = config.value('statistics_save_prefix', 'statistics')
  dtype = config.value('statistics_dtype', 'float64')
  target = config.value('target', 'classes')
  data_key = config.value('data_key', 'data')
  assert ds.is_data_sparse(target), "need for prior calculation"
  assert not ds.is_data_sparse(data_key), "needed for mean/var estimation"
  from Util import inplace_increment, progress_bar_with_time, NumbersDict
  priors = numpy.zeros((ds.get_data_dim(target),), dtype=dtype)
  mean = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  mean_sq = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  total_targets_len = 0
  total_data_len = 0
  seq_idx = 0
  while ds.is_less_than_num_seqs(seq_idx):
    progress_bar_with_time(ds.get_complete_frac(seq_idx))
    ds.load_seqs(seq_idx, seq_idx + 1)
    targets = ds.get_data(seq_idx, target)
    inplace_increment(priors, targets, 1)  # histogram of the sparse labels
    total_targets_len += targets.shape[0]
    data = ds.get_data(seq_idx, data_key)
    new_total_data_len = total_data_len + data.shape[0]
    f = float(total_data_len) / new_total_data_len
    # Fixed: fold in the per-seq *mean*, not the sum -- the sum form
    # overweights each update by the sequence length.
    mean = mean * f + numpy.mean(data, axis=0) * (1.0 - f)
    mean_sq = mean_sq * f + numpy.mean(data * data, axis=0) * (1.0 - f)
    total_data_len = new_total_data_len
    seq_idx += 1
  log_priors = numpy.log(priors)
  log_priors -= numpy.log(NumbersDict(ds.get_num_timesteps())[target])
  var = numpy.sqrt(mean_sq - mean * mean)  # sqrt => std dev, despite the name
  # Fixed: the message string was broken across a physical line (syntax error).
  print("Finished. %i total target frames, %i total data frames" % (
    total_targets_len, total_data_len), file=log.v1)
  priors_fn = stat_prefix + ".log_priors.txt"
  mean_fn = stat_prefix + ".mean.txt"
  var_fn = stat_prefix + ".var.txt"
  print("Dump priors to", priors_fn, file=log.v1)
  numpy.savetxt(priors_fn, log_priors)
  print("Dump mean to", mean_fn, file=log.v1)
  numpy.savetxt(mean_fn, mean)
  print("Dump var to", var_fn, file=log.v1)
  numpy.savetxt(var_fn, var)
  print("Done.", file=log.v1)
def analyze_data(config):
  """
  Older duplicate of analyze_data: dumps target log-priors, per-feature mean,
  and a "var" file.  NOTE: the value written to "<prefix>.var.txt" is
  sqrt(variance), i.e. the standard deviation; the file name and log messages
  are kept unchanged for backward compatibility.

  :param Config config:
  """
  dss = config.value('analyze_dataset', 'train')
  ds = {"train": train_data, "dev": dev_data, "eval": eval_data}[dss]
  epoch = config.int('epoch', 1)
  # Fixed: Python-2 "print >> log.v1" statements converted to the print()
  # function form already used by the other definitions in this file.
  print("Analyze dataset", dss, "epoch", epoch, file=log.v1)
  ds.init_seq_order(epoch=epoch)
  stat_prefix = config.value('statistics_save_prefix', 'statistics')
  dtype = config.value('statistics_dtype', 'float64')
  target = config.value('target', 'classes')
  data_key = config.value('data_key', 'data')
  assert ds.is_data_sparse(target), "need for prior calculation"
  assert not ds.is_data_sparse(data_key), "needed for mean/var estimation"
  from Util import inplace_increment, progress_bar_with_time, NumbersDict
  priors = numpy.zeros((ds.get_data_dim(target),), dtype=dtype)
  mean = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  mean_sq = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  total_targets_len = 0
  total_data_len = 0
  seq_idx = 0
  while ds.is_less_than_num_seqs(seq_idx):
    progress_bar_with_time(ds.get_complete_frac(seq_idx))
    ds.load_seqs(seq_idx, seq_idx + 1)
    targets = ds.get_data(seq_idx, target)
    inplace_increment(priors, targets, 1)  # histogram of the sparse labels
    total_targets_len += targets.shape[0]
    data = ds.get_data(seq_idx, data_key)
    new_total_data_len = total_data_len + data.shape[0]
    f = float(total_data_len) / new_total_data_len
    # Fixed: fold in the per-seq *mean*, not the sum -- the sum form
    # overweights each update by the sequence length.
    mean = mean * f + numpy.mean(data, axis=0) * (1.0 - f)
    mean_sq = mean_sq * f + numpy.mean(data * data, axis=0) * (1.0 - f)
    total_data_len = new_total_data_len
    seq_idx += 1
  log_priors = numpy.log(priors)
  log_priors -= numpy.log(NumbersDict(ds.get_num_timesteps())[target])
  var = numpy.sqrt(mean_sq - mean * mean)  # sqrt => std dev, despite the name
  # Fixed: the message string was broken across a physical line (syntax error).
  print("Finished. %i total target frames, %i total data frames" % (
    total_targets_len, total_data_len), file=log.v1)
  priors_fn = stat_prefix + ".log_priors.txt"
  mean_fn = stat_prefix + ".mean.txt"
  var_fn = stat_prefix + ".var.txt"
  print("Dump priors to", priors_fn, file=log.v1)
  numpy.savetxt(priors_fn, log_priors)
  print("Dump mean to", mean_fn, file=log.v1)
  numpy.savetxt(mean_fn, mean)
  print("Dump var to", var_fn, file=log.v1)
  numpy.savetxt(var_fn, var)
  print("Done.", file=log.v1)