def analyze_data(config):  # pylint: disable=redefined-outer-name
  """
  :param Config config:
  """
  dss = config.value('analyze_dataset', 'train')
  ds = {"train": train_data, "dev": dev_data, "eval": eval_data}[dss]
  epoch = config.int('epoch', 1)
  print("Analyze dataset", dss, "epoch", epoch, file=log.v1)
  ds.init_seq_order(epoch=epoch)
  stat_prefix = config.value('statistics_save_prefix', 'statistics')
  dtype = config.value('statistics_dtype', 'float64')
  target = config.value('target', 'classes')
  data_key = config.value('data_key', 'data')
  assert ds.is_data_sparse(target), "need for prior calculation"
  assert not ds.is_data_sparse(data_key), "needed for mean/var estimation"
  from Util import inplace_increment, progress_bar_with_time, NumbersDict
  priors = numpy.zeros((ds.get_data_dim(target),), dtype=dtype)
  mean = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  mean_sq = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  total_targets_len = 0
  total_data_len = 0

  # Note: This is not stable! See :class:`Util.Stats` for a better alternative.
  seq_idx = 0
  while ds.is_less_than_num_seqs(seq_idx):
    progress_bar_with_time(ds.get_complete_frac(seq_idx))
    ds.load_seqs(seq_idx, seq_idx + 1)
    targets = ds.get_data(seq_idx, target)
    inplace_increment(priors, targets, 1)
    total_targets_len += targets.shape[0]
    data = ds.get_data(seq_idx, data_key)
    new_total_data_len = total_data_len + data.shape[0]
    f = float(total_data_len) / new_total_data_len
    mean = mean * f + numpy.sum(data, axis=0) * (1.0 - f)
    mean_sq = mean_sq * f + numpy.sum(data * data, axis=0) * (1.0 - f)
    total_data_len = new_total_data_len
    seq_idx += 1
  log_priors = numpy.log(priors)
  log_priors -= numpy.log(NumbersDict(ds.get_num_timesteps())[target])
  std_dev = numpy.sqrt(mean_sq - mean * mean)
  print("Finished. %i total target frames, %i total data frames" % (
    total_targets_len, total_data_len), file=log.v1)
  priors_fn = stat_prefix + ".log_priors.txt"
  mean_fn = stat_prefix + ".mean.txt"
  std_dev_fn = stat_prefix + ".std_dev.txt"
  print("Dump priors to", priors_fn, file=log.v1)
  numpy.savetxt(priors_fn, log_priors)
  print("Dump mean to", mean_fn, file=log.v1)
  numpy.savetxt(mean_fn, mean)
  print("Dump std dev to", std_dev_fn, file=log.v1)
  numpy.savetxt(std_dev_fn, std_dev)
  print("Done.", file=log.v1)
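# The loop above folds each sequence into the running statistics via the factor
# f = total_data_len / new_total_data_len. Below is a self-contained sketch (not
# part of the original) of the same accumulation idea in its textbook form, where
# each chunk's sum is divided by the new running frame count; as the note above
# says, Util.Stats is the more robust alternative for real use. The helper names
# and the random test chunks are illustrative only.
import numpy


def incremental_mean_std(chunks):
  """chunks: list of (frames, dims) arrays; returns per-dim (mean, std_dev)."""
  dims = chunks[0].shape[1]
  mean = numpy.zeros((dims,), dtype="float64")
  mean_sq = numpy.zeros((dims,), dtype="float64")
  total_len = 0
  for data in chunks:
    new_total_len = total_len + data.shape[0]
    f = float(total_len) / new_total_len
    mean = mean * f + numpy.sum(data, axis=0) / new_total_len
    mean_sq = mean_sq * f + numpy.sum(data * data, axis=0) / new_total_len
    total_len = new_total_len
  return mean, numpy.sqrt(mean_sq - mean * mean)


def _check_incremental_mean_std():
  """Usage example: the incremental result matches numpy over the concatenation."""
  rnd = numpy.random.RandomState(1)
  chunks = [rnd.rand(n, 3) for n in (5, 7, 11)]
  all_data = numpy.concatenate(chunks, axis=0)
  mean, std_dev = incremental_mean_std(chunks)
  assert numpy.allclose(mean, all_data.mean(axis=0))
  assert numpy.allclose(std_dev, all_data.std(axis=0))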
def analyze_data(config):
  dss = config.value('analyze_dataset', 'train')
  ds = {"train": train_data, "dev": dev_data, "eval": eval_data}[dss]
  epoch = config.int('epoch', 1)
  print >> log.v1, "Analyze dataset", dss, "epoch", epoch
  ds.init_seq_order(epoch=epoch)
  stat_prefix = config.value('statistics_save_prefix', 'statistics')
  dtype = config.value('statistics_dtype', 'float64')
  target = config.value('target', 'classes')
  data_key = config.value('data_key', 'data')
  assert ds.is_data_sparse(target), "need for prior calculation"
  assert not ds.is_data_sparse(data_key), "needed for mean/var estimation"
  from Util import inplace_increment, progress_bar_with_time, NumbersDict
  priors = numpy.zeros((ds.get_data_dim(target),), dtype=dtype)
  mean = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  mean_sq = numpy.zeros((ds.get_data_dim(data_key),), dtype=dtype)
  total_targets_len = 0
  total_data_len = 0
  seq_idx = 0
  while ds.is_less_than_num_seqs(seq_idx):
    progress_bar_with_time(ds.get_complete_frac(seq_idx))
    ds.load_seqs(seq_idx, seq_idx + 1)
    targets = ds.get_data(seq_idx, target)
    inplace_increment(priors, targets, 1)
    total_targets_len += targets.shape[0]
    data = ds.get_data(seq_idx, data_key)
    new_total_data_len = total_data_len + data.shape[0]
    f = float(total_data_len) / new_total_data_len
    mean = mean * f + numpy.sum(data, axis=0) * (1.0 - f)
    mean_sq = mean_sq * f + numpy.sum(data * data, axis=0) * (1.0 - f)
    total_data_len = new_total_data_len
    seq_idx += 1
  log_priors = numpy.log(priors)
  log_priors -= numpy.log(NumbersDict(ds.get_num_timesteps())[target])
  var = numpy.sqrt(mean_sq - mean * mean)
  print >> log.v1, "Finished. %i total target frames, %i total data frames" % (
    total_targets_len, total_data_len)
  priors_fn = stat_prefix + ".log_priors.txt"
  mean_fn = stat_prefix + ".mean.txt"
  var_fn = stat_prefix + ".var.txt"
  print >> log.v1, "Dump priors to", priors_fn
  numpy.savetxt(priors_fn, log_priors)
  print >> log.v1, "Dump mean to", mean_fn
  numpy.savetxt(mean_fn, mean)
  print >> log.v1, "Dump var to", var_fn
  numpy.savetxt(var_fn, var)
  print >> log.v1, "Done."
def hdf_dump_from_dataset(dataset, hdf_dataset, parser_args):
  """
  :param Dataset dataset: could be any dataset implemented as child of Dataset
  :type hdf_dataset: h5py._hl.files.File
  :param parser_args: argparse object from main()
  :return:
  """
  print("Work on epoch: %i" % parser_args.epoch, file=log.v3)
  dataset.init_seq_order(parser_args.epoch)
  data_keys = sorted(dataset.get_data_keys())
  print("Data keys:", data_keys, file=log.v3)
  if "orth" in data_keys:
    data_keys.remove("orth")

  # We need to do one run through the dataset to collect some stats like total len.
  print("Collect stats, iterate through all data...", file=log.v3)
  seq_idx = parser_args.start_seq
  seq_idxs = []
  seq_tags = []
  seq_lens = []
  total_seq_len = NumbersDict(0)
  max_tag_len = 0
  dataset_num_seqs = try_run(lambda: dataset.num_seqs, default=None)  # can be unknown
  if parser_args.end_seq != float("inf"):
    if dataset_num_seqs is not None:
      dataset_num_seqs = min(dataset_num_seqs, parser_args.end_seq)
    else:
      dataset_num_seqs = parser_args.end_seq
  if dataset_num_seqs is not None:
    dataset_num_seqs -= parser_args.start_seq
    assert dataset_num_seqs > 0
  while dataset.is_less_than_num_seqs(seq_idx) and seq_idx <= parser_args.end_seq:
    seq_idxs += [seq_idx]
    dataset.load_seqs(seq_idx, seq_idx + 1)
    seq_len = dataset.get_seq_length(seq_idx)
    seq_lens += [seq_len]
    tag = dataset.get_tag(seq_idx)
    seq_tags += [tag]
    max_tag_len = max(len(tag), max_tag_len)
    total_seq_len += seq_len
    if dataset_num_seqs is not None:
      progress_bar_with_time(float(seq_idx - parser_args.start_seq) / dataset_num_seqs)
    seq_idx += 1
  num_seqs = len(seq_idxs)
  assert num_seqs > 0
  shapes = {}
  for data_key in data_keys:
    assert data_key in total_seq_len.dict
    shape = [total_seq_len[data_key]]
    shape += dataset.get_data_shape(data_key)
    print("Total len of %r is %s, shape %r, dtype %s" % (
      data_key, human_size(shape[0]), shape, dataset.get_data_dtype(data_key)), file=log.v3)
    shapes[data_key] = shape

  print("Set seq tags...", file=log.v3)
  hdf_dataset.create_dataset('seqTags', shape=(num_seqs,), dtype="S%i" % (max_tag_len + 1))
  for i, tag in enumerate(seq_tags):
    hdf_dataset['seqTags'][i] = numpy.array(tag, dtype="S%i" % (max_tag_len + 1))
    progress_bar_with_time(float(i) / num_seqs)

  print("Set seq len info...", file=log.v3)
  hdf_dataset.create_dataset(HDFDataset.attr_seqLengths, shape=(num_seqs, 2), dtype="int32")
  for i, seq_len in enumerate(seq_lens):
    data_len = seq_len["data"]
    targets_len = seq_len["classes"]
    for data_key in dataset.get_target_list():
      if data_key == "orth":
        continue
      assert seq_len[data_key] == targets_len, "different lengths in multi-target not supported"
    if targets_len is None:
      targets_len = data_len
    hdf_dataset[HDFDataset.attr_seqLengths][i] = [data_len, targets_len]
    progress_bar_with_time(float(i) / num_seqs)

  print("Create arrays in HDF...", file=log.v3)
  hdf_dataset.create_group('targets/data')
  hdf_dataset.create_group('targets/size')
  hdf_dataset.create_group('targets/labels')
  for data_key in data_keys:
    if data_key == "data":
      hdf_dataset.create_dataset(
        'inputs', shape=shapes[data_key], dtype=dataset.get_data_dtype(data_key))
    else:
      hdf_dataset['targets/data'].create_dataset(
        data_key, shape=shapes[data_key], dtype=dataset.get_data_dtype(data_key))
      hdf_dataset['targets/size'].attrs[data_key] = dataset.num_outputs[data_key]
    if data_key in dataset.labels:
      labels = dataset.labels[data_key]
      assert len(labels) == dataset.num_outputs[data_key][0]
    else:
      labels = ["%s-class-%i" % (data_key, i) for i in range(dataset.get_data_dim(data_key))]
    print("Labels for %s:" % data_key, labels[:3], "...", file=log.v5)
    max_label_len = max(map(len, labels))
    if data_key != "data":
      hdf_dataset['targets/labels'].create_dataset(
        data_key, (len(labels),), dtype="S%i" % (max_label_len + 1))
      for i, label in enumerate(labels):
        hdf_dataset['targets/labels'][data_key][i] = numpy.array(label, dtype="S%i" % (max_label_len + 1))

  # Again iterate through dataset, and set the data
  print("Write data...", file=log.v3)
  dataset.init_seq_order(parser_args.epoch)
  offsets = NumbersDict(0)
  for seq_idx, tag in zip(seq_idxs, seq_tags):
    dataset.load_seqs(seq_idx, seq_idx + 1)
    tag_ = dataset.get_tag(seq_idx)
    assert tag == tag_  # Just a check for sanity. We expect the same order.
    seq_len = dataset.get_seq_length(seq_idx)
    for data_key in data_keys:
      if data_key == "data":
        hdf_data = hdf_dataset['inputs']
      else:
        hdf_data = hdf_dataset['targets/data'][data_key]
      data = dataset.get_data(seq_idx, data_key)
      hdf_data[offsets[data_key]:offsets[data_key] + seq_len[data_key]] = data
    progress_bar_with_time(float(offsets["data"]) / total_seq_len["data"])
    offsets += seq_len
  assert offsets == total_seq_len  # Sanity check.

  # Set some old-format attribs. Not needed for newer CRNN versions.
  hdf_dataset.attrs[HDFDataset.attr_inputPattSize] = dataset.num_inputs
  hdf_dataset.attrs[HDFDataset.attr_numLabels] = dataset.num_outputs.get("classes", (0, 0))[0]

  print("All done.", file=log.v3)
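# A minimal usage sketch (not part of the original) for hdf_dump_from_dataset()
# above. parser_args only needs the attributes the function reads (epoch,
# start_seq, end_seq), and hdf_dataset is an h5py File opened for writing, as the
# docstring states. The dataset config dict and the output filename below are
# hypothetical placeholders.
import argparse
import h5py
from Dataset import init_dataset


def _example_hdf_dump():
  dataset = init_dataset({"class": "HDFDataset", "files": ["existing_data.hdf"]})  # hypothetical source dataset
  parser_args = argparse.Namespace(epoch=1, start_seq=0, end_seq=float("inf"))
  with h5py.File("dump.hdf", "w") as hdf_dataset:  # hypothetical output file
    hdf_dump_from_dataset(dataset=dataset, hdf_dataset=hdf_dataset, parser_args=parser_args)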
def hdf_dump_from_dataset(dataset, hdf_dataset, parser_args):
  """
  :param Dataset dataset: could be any dataset implemented as child of Dataset
  :type hdf_dataset: h5py._hl.files.File
  :param parser_args: argparse object from main()
  :return:
  """
  print >> log.v3, "Work on epoch: %i" % parser_args.epoch
  dataset.init_seq_order(parser_args.epoch)
  data_keys = sorted(dataset.get_data_keys())
  print >> log.v3, "Data keys:", data_keys

  # We need to do one run through the dataset to collect some stats like total len.
  print >> log.v3, "Collect stats, iterate through all data..."
  seq_idx = parser_args.start_seq
  seq_idxs = []
  seq_tags = []
  seq_lens = []
  total_seq_len = NumbersDict(0)
  max_tag_len = 0
  dataset_num_seqs = try_run(lambda: dataset.num_seqs, default=None)  # can be unknown
  if parser_args.end_seq != float("inf"):
    if dataset_num_seqs is not None:
      dataset_num_seqs = min(dataset_num_seqs, parser_args.end_seq)
    else:
      dataset_num_seqs = parser_args.end_seq
  if dataset_num_seqs is not None:
    dataset_num_seqs -= parser_args.start_seq
    assert dataset_num_seqs > 0
  while dataset.is_less_than_num_seqs(seq_idx) and seq_idx <= parser_args.end_seq:
    seq_idxs += [seq_idx]
    dataset.load_seqs(seq_idx, seq_idx + 1)
    seq_len = dataset.get_seq_length(seq_idx)
    seq_lens += [seq_len]
    tag = dataset.get_tag(seq_idx)
    seq_tags += [tag]
    max_tag_len = max(len(tag), max_tag_len)
    total_seq_len += seq_len
    if dataset_num_seqs is not None:
      progress_bar_with_time(float(seq_idx - parser_args.start_seq) / dataset_num_seqs)
    seq_idx += 1
  num_seqs = len(seq_idxs)
  assert num_seqs > 0
  shapes = {}
  for data_key in data_keys:
    assert data_key in total_seq_len.dict
    shape = [total_seq_len[data_key]]
    shape += dataset.get_data_shape(data_key)
    print >> log.v3, "Total len of %r is %s, shape %r, dtype %s" % (
      data_key, human_size(shape[0]), shape, dataset.get_data_dtype(data_key))
    shapes[data_key] = shape

  print >> log.v3, "Set seq tags..."
  hdf_dataset.create_dataset('seqTags', shape=(num_seqs,), dtype="S%i" % (max_tag_len + 1))
  for i, tag in enumerate(seq_tags):
    hdf_dataset['seqTags'][i] = tag
    progress_bar_with_time(float(i) / num_seqs)

  print >> log.v3, "Set seq len info..."
  hdf_dataset.create_dataset(HDFDataset.attr_seqLengths, shape=(num_seqs, 2), dtype="int32")
  for i, seq_len in enumerate(seq_lens):
    data_len = seq_len["data"]
    targets_len = seq_len["classes"]
    for data_key in dataset.get_target_list():
      assert seq_len[data_key] == targets_len, "different lengths in multi-target not supported"
    if targets_len is None:
      targets_len = data_len
    hdf_dataset[HDFDataset.attr_seqLengths][i] = [data_len, targets_len]
    progress_bar_with_time(float(i) / num_seqs)

  print >> log.v3, "Create arrays in HDF..."
  hdf_dataset.create_group('targets/data')
  hdf_dataset.create_group('targets/size')
  hdf_dataset.create_group('targets/labels')
  for data_key in data_keys:
    if data_key == "data":
      hdf_dataset.create_dataset(
        'inputs', shape=shapes[data_key], dtype=dataset.get_data_dtype(data_key))
    else:
      hdf_dataset['targets/data'].create_dataset(
        data_key, shape=shapes[data_key], dtype=dataset.get_data_dtype(data_key))
      hdf_dataset['targets/size'].attrs[data_key] = dataset.num_outputs[data_key]
    if data_key in dataset.labels:
      labels = dataset.labels[data_key]
      assert len(labels) == dataset.num_outputs[data_key][0]
    else:
      labels = ["%s-class-%i" % (data_key, i) for i in range(dataset.get_data_dim(data_key))]
    print >> log.v5, "Labels for %s:" % data_key, labels[:3], "..."
    max_label_len = max(map(len, labels))
    hdf_dataset['targets/labels'].create_dataset(data_key, (len(labels),), dtype="S%i" % (max_label_len + 1))
    for i, label in enumerate(labels):
      hdf_dataset['targets/labels'][data_key][i] = label

  # Again iterate through dataset, and set the data
  print >> log.v3, "Write data..."
  dataset.init_seq_order(parser_args.epoch)
  offsets = NumbersDict(0)
  for seq_idx, tag in zip(seq_idxs, seq_tags):
    dataset.load_seqs(seq_idx, seq_idx + 1)
    tag_ = dataset.get_tag(seq_idx)
    assert tag == tag_  # Just a check for sanity. We expect the same order.
    seq_len = dataset.get_seq_length(seq_idx)
    for data_key in data_keys:
      if data_key == "data":
        hdf_data = hdf_dataset['inputs']
      else:
        hdf_data = hdf_dataset['targets/data'][data_key]
      data = dataset.get_data(seq_idx, data_key)
      hdf_data[offsets[data_key]:offsets[data_key] + seq_len[data_key]] = data
    progress_bar_with_time(float(offsets["data"]) / total_seq_len["data"])
    offsets += seq_len
  assert offsets == total_seq_len  # Sanity check.

  # Set some old-format attribs. Not needed for newer CRNN versions.
  hdf_dataset.attrs[HDFDataset.attr_inputPattSize] = dataset.num_inputs
  hdf_dataset.attrs[HDFDataset.attr_numLabels] = dataset.num_outputs.get("classes", (0, 0))[0]

  print >> log.v3, "All done."
def demo():
  print("SprintDataset demo.")
  from argparse import ArgumentParser
  from Util import hms, progress_bar_with_time
  from Log import log
  from Config import Config
  from Dataset import init_dataset
  arg_parser = ArgumentParser()
  arg_parser.add_argument("--config", help="config with ExternSprintDataset", required=True)
  arg_parser.add_argument("--sprint_cache_dataset", help="kwargs dict for SprintCacheDataset", required=True)
  arg_parser.add_argument("--max_num_seqs", default=sys.maxint, type=int)
  arg_parser.add_argument("--action", default="compare", help="compare or benchmark")
  args = arg_parser.parse_args()
  log.initialize(verbosity=[4])
  sprint_cache_dataset_kwargs = eval(args.sprint_cache_dataset)
  assert isinstance(sprint_cache_dataset_kwargs, dict)
  sprint_cache_dataset = SprintCacheDataset(**sprint_cache_dataset_kwargs)
  print("SprintCacheDataset: %r" % sprint_cache_dataset)
  config = Config()
  config.load_file(args.config)
  dataset = init_dataset(config.typed_value("train"))
  print("Dataset via config: %r" % dataset)
  assert sprint_cache_dataset.num_inputs == dataset.num_inputs
  assert tuple(sprint_cache_dataset.num_outputs["classes"]) == tuple(dataset.num_outputs["classes"])
  sprint_cache_dataset.init_seq_order(epoch=1)

  if args.action == "compare":
    print("Iterating through dataset...")
    seq_idx = 0
    dataset.init_seq_order(epoch=1)
    while seq_idx < args.max_num_seqs:
      if not dataset.is_less_than_num_seqs(seq_idx):
        break
      dataset.load_seqs(seq_idx, seq_idx + 1)
      tag = dataset.get_tag(seq_idx)
      assert not tag.startswith("seq-"), "dataset does not provide tag-names for seqs"
      dataset_seq = sprint_cache_dataset.get_dataset_seq_for_name(tag)
      data = dataset.get_data(seq_idx, "data")
      targets = dataset.get_data(seq_idx, "classes")
      assert data.shape == dataset_seq.features.shape
      assert targets.shape == dataset_seq.targets["classes"].shape
      assert numpy.allclose(data, dataset_seq.features)
      assert numpy.allclose(targets, dataset_seq.targets["classes"])
      seq_idx += 1
      progress_bar_with_time(dataset.get_complete_frac(seq_idx))

    print("Finished through dataset. Num seqs: %i" % seq_idx)
    print("SprintCacheDataset has num seqs: %i." % sprint_cache_dataset.num_seqs)

  elif args.action == "benchmark":
    print("Iterating through dataset...")
    start_time = time.time()
    seq_tags = []
    seq_idx = 0
    dataset.init_seq_order(epoch=1)
    while seq_idx < args.max_num_seqs:
      if not dataset.is_less_than_num_seqs(seq_idx):
        break
      dataset.load_seqs(seq_idx, seq_idx + 1)
      tag = dataset.get_tag(seq_idx)
      assert not tag.startswith("seq-"), "dataset does not provide tag-names for seqs"
      seq_tags.append(tag)
      dataset.get_data(seq_idx, "data")
      dataset.get_data(seq_idx, "classes")
      seq_idx += 1
      progress_bar_with_time(dataset.get_complete_frac(seq_idx))
    print("Finished through dataset. Num seqs: %i, time: %f" % (seq_idx, time.time() - start_time))
    print("SprintCacheDataset has num seqs: %i." % sprint_cache_dataset.num_seqs)
    if hasattr(dataset, "exit_handler"):
      dataset.exit_handler()
    else:
      print("No way to stop any background tasks.")
    del dataset

    start_time = time.time()
    print("Iterating through SprintCacheDataset...")
    for i, tag in enumerate(seq_tags):
      sprint_cache_dataset.get_dataset_seq_for_name(tag)
      progress_bar_with_time(float(i) / len(seq_tags))
    print("Finished through SprintCacheDataset. time: %f" % (time.time() - start_time,))

  else:
    raise Exception("invalid action: %r" % args.action)
def demo():
  """
  Demo.
  """
  print("SprintDataset demo.")
  from argparse import ArgumentParser
  from Util import progress_bar_with_time
  from Log import log
  from Config import Config
  from Dataset import init_dataset
  arg_parser = ArgumentParser()
  arg_parser.add_argument("--config", help="config with ExternSprintDataset", required=True)
  arg_parser.add_argument("--sprint_cache_dataset", help="kwargs dict for SprintCacheDataset", required=True)
  arg_parser.add_argument("--max_num_seqs", default=sys.maxsize, type=int)
  arg_parser.add_argument("--action", default="compare", help="compare or benchmark")
  args = arg_parser.parse_args()
  log.initialize(verbosity=[4])
  sprint_cache_dataset_kwargs = eval(args.sprint_cache_dataset)
  assert isinstance(sprint_cache_dataset_kwargs, dict)
  sprint_cache_dataset = SprintCacheDataset(**sprint_cache_dataset_kwargs)
  print("SprintCacheDataset: %r" % sprint_cache_dataset)
  config = Config()
  config.load_file(args.config)
  dataset = init_dataset(config.typed_value("train"))
  print("Dataset via config: %r" % dataset)
  assert sprint_cache_dataset.num_inputs == dataset.num_inputs
  assert tuple(sprint_cache_dataset.num_outputs["classes"]) == tuple(dataset.num_outputs["classes"])
  sprint_cache_dataset.init_seq_order(epoch=1)

  if args.action == "compare":
    print("Iterating through dataset...")
    seq_idx = 0
    dataset.init_seq_order(epoch=1)
    while seq_idx < args.max_num_seqs:
      if not dataset.is_less_than_num_seqs(seq_idx):
        break
      dataset.load_seqs(seq_idx, seq_idx + 1)
      tag = dataset.get_tag(seq_idx)
      assert not tag.startswith("seq-"), "dataset does not provide tag-names for seqs"
      dataset_seq = sprint_cache_dataset.get_dataset_seq_for_name(tag)
      data = dataset.get_data(seq_idx, "data")
      targets = dataset.get_data(seq_idx, "classes")
      assert data.shape == dataset_seq.features["data"].shape
      assert targets.shape == dataset_seq.features["classes"].shape
      assert numpy.allclose(data, dataset_seq.features["data"])
      assert numpy.allclose(targets, dataset_seq.features["classes"])
      seq_idx += 1
      progress_bar_with_time(dataset.get_complete_frac(seq_idx))

    print("Finished through dataset. Num seqs: %i" % seq_idx)
    print("SprintCacheDataset has num seqs: %i." % sprint_cache_dataset.num_seqs)

  elif args.action == "benchmark":
    print("Iterating through dataset...")
    start_time = time.time()
    seq_tags = []
    seq_idx = 0
    dataset.init_seq_order(epoch=1)
    while seq_idx < args.max_num_seqs:
      if not dataset.is_less_than_num_seqs(seq_idx):
        break
      dataset.load_seqs(seq_idx, seq_idx + 1)
      tag = dataset.get_tag(seq_idx)
      assert not tag.startswith("seq-"), "dataset does not provide tag-names for seqs"
      seq_tags.append(tag)
      dataset.get_data(seq_idx, "data")
      dataset.get_data(seq_idx, "classes")
      seq_idx += 1
      progress_bar_with_time(dataset.get_complete_frac(seq_idx))
    print("Finished through dataset. Num seqs: %i, time: %f" % (seq_idx, time.time() - start_time))
    print("SprintCacheDataset has num seqs: %i." % sprint_cache_dataset.num_seqs)
    if hasattr(dataset, "exit_handler"):
      dataset.exit_handler()
    else:
      print("No way to stop any background tasks.")
    del dataset

    start_time = time.time()
    print("Iterating through SprintCacheDataset...")
    for i, tag in enumerate(seq_tags):
      sprint_cache_dataset.get_dataset_seq_for_name(tag)
      progress_bar_with_time(float(i) / len(seq_tags))
    print("Finished through SprintCacheDataset. time: %f" % (time.time() - start_time,))

  else:
    raise Exception("invalid action: %r" % args.action)
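# A minimal sketch (hypothetical arguments) of driving demo() above without a
# shell. demo() reads sys.argv via argparse, so the arguments can be injected
# before the call; the config path and the SprintCacheDataset kwargs dict below
# are illustrative placeholders, not values from the original.
import sys


def _example_run_demo():
  sys.argv = [
    "sprint_cache_demo",
    "--config", "mytask.config",  # hypothetical config providing an ExternSprintDataset under "train"
    "--sprint_cache_dataset", repr({"data": {"filename": "features.cache"}}),  # hypothetical kwargs
    "--action", "benchmark",
    "--max_num_seqs", "100",
  ]
  demo()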
def dump_from_dataset(self, dataset, epoch=1, start_seq=0, end_seq=float("inf"), use_progress_bar=True):
  """
  :param Dataset dataset: could be any dataset implemented as child of Dataset
  :param int epoch: for dataset
  :param int start_seq:
  :param int|float end_seq:
  :param bool use_progress_bar:
  """
  from Util import NumbersDict, human_size, progress_bar_with_time, try_run, PY3
  hdf_dataset = self.file

  print("Work on epoch: %i" % epoch, file=log.v3)
  dataset.init_seq_order(epoch)
  data_keys = sorted(dataset.get_data_keys())
  print("Data keys:", data_keys, file=log.v3)
  if "orth" in data_keys:  # special workaround for now, not handled
    data_keys.remove("orth")
  data_target_keys = [key for key in dataset.get_target_list() if key in data_keys]
  data_input_keys = [key for key in data_keys if key not in data_target_keys]
  assert len(data_input_keys) > 0 and len(data_target_keys) > 0
  if len(data_input_keys) > 1:
    if "data" in data_input_keys:
      default_data_input_key = "data"
    else:
      raise Exception("not sure which input data key to use from %r" % (data_input_keys,))
  else:
    default_data_input_key = data_input_keys[0]
  print("Using input data key:", default_data_input_key)
  if len(data_target_keys) > 1:
    if "classes" in data_target_keys:
      default_data_target_key = "classes"
    else:
      raise Exception("not sure which target data key to use from %r" % (data_target_keys,))
  else:
    default_data_target_key = data_target_keys[0]
  print("Using target data key:", default_data_target_key)

  hdf_data_key_map = {key: key for key in data_keys if key != default_data_input_key}
  if "data" in hdf_data_key_map:
    hdf_data_key_map["data"] = "classes"  # Replace "data" which is reserved for input key in HDFDataset.
    assert "classes" not in hdf_data_key_map

  # We need to do one run through the dataset to collect some stats like total len.
  print("Collect stats, iterate through all data...", file=log.v3)
  seq_idx = start_seq
  seq_idxs = []
  seq_tags = []
  seq_lens = []
  total_seq_len = NumbersDict(0)
  max_tag_len = 0
  dataset_num_seqs = try_run(lambda: dataset.num_seqs, default=None)  # can be unknown
  if end_seq != float("inf"):
    if dataset_num_seqs is not None:
      dataset_num_seqs = min(dataset_num_seqs, end_seq)
    else:
      dataset_num_seqs = end_seq
  if dataset_num_seqs is not None:
    dataset_num_seqs -= start_seq
    assert dataset_num_seqs > 0
  while dataset.is_less_than_num_seqs(seq_idx) and seq_idx <= end_seq:
    seq_idxs += [seq_idx]
    dataset.load_seqs(seq_idx, seq_idx + 1)
    seq_len = dataset.get_seq_length(seq_idx)
    seq_lens += [seq_len]
    tag = dataset.get_tag(seq_idx)
    seq_tags += [tag]
    max_tag_len = max(len(tag), max_tag_len)
    total_seq_len += seq_len
    if use_progress_bar and dataset_num_seqs is not None:
      progress_bar_with_time(float(seq_idx - start_seq) / dataset_num_seqs)
    seq_idx += 1
  num_seqs = len(seq_idxs)
  assert num_seqs > 0
  shapes = {}
  for data_key in data_keys:
    assert data_key in total_seq_len.dict
    shape = [total_seq_len[data_key]]
    shape += dataset.get_data_shape(data_key)
    print("Total len of %r is %s, shape %r, dtype %s" % (
      data_key, human_size(shape[0]), shape, dataset.get_data_dtype(data_key)), file=log.v3)
    shapes[data_key] = shape

  print("Set seq tags...", file=log.v3)
  hdf_dataset.create_dataset('seqTags', shape=(num_seqs,), dtype="S%i" % (max_tag_len + 1))
  for i, tag in enumerate(seq_tags):
    hdf_dataset['seqTags'][i] = numpy.array(tag, dtype="S%i" % (max_tag_len + 1))
    if use_progress_bar:
      progress_bar_with_time(float(i) / num_seqs)

  print("Set seq len info...", file=log.v3)
  hdf_dataset.create_dataset(attr_seqLengths, shape=(num_seqs, 2), dtype="int32")
  for i, seq_len in enumerate(seq_lens):
    data_len = seq_len[default_data_input_key]
    targets_len = seq_len[default_data_target_key]
    for data_key in data_target_keys:
      assert seq_len[data_key] == targets_len, "different lengths in multi-target not supported"
    if targets_len is None:
      targets_len = data_len
    hdf_dataset[attr_seqLengths][i] = [data_len, targets_len]
    if use_progress_bar:
      progress_bar_with_time(float(i) / num_seqs)

  print("Create arrays in HDF...", file=log.v3)
  hdf_dataset.create_group('targets/data')
  hdf_dataset.create_group('targets/size')
  hdf_dataset.create_group('targets/labels')
  for data_key in data_keys:
    if data_key == default_data_input_key:
      hdf_dataset.create_dataset(
        'inputs', shape=shapes[data_key], dtype=dataset.get_data_dtype(data_key))
    else:
      hdf_dataset['targets/data'].create_dataset(
        hdf_data_key_map[data_key], shape=shapes[data_key], dtype=dataset.get_data_dtype(data_key))
      hdf_dataset['targets/size'].attrs[hdf_data_key_map[data_key]] = dataset.num_outputs[data_key]
    if data_key in dataset.labels:
      labels = dataset.labels[data_key]
      if PY3:
        labels = [label.encode("utf8") for label in labels]
      assert len(labels) == dataset.num_outputs[data_key][0]
    else:
      labels = ["%s-class-%i" % (data_key, i) for i in range(dataset.get_data_dim(data_key))]
    print("Labels for %s:" % data_key, labels[:3], "...", file=log.v5)
    max_label_len = max(map(len, labels))
    if data_key != default_data_input_key:
      hdf_dataset['targets/labels'].create_dataset(
        hdf_data_key_map[data_key], (len(labels),), dtype="S%i" % (max_label_len + 1))
      for i, label in enumerate(labels):
        hdf_dataset['targets/labels'][hdf_data_key_map[data_key]][i] = numpy.array(
          label, dtype="S%i" % (max_label_len + 1))

  # Again iterate through dataset, and set the data
  print("Write data...", file=log.v3)
  dataset.init_seq_order(epoch)
  offsets = NumbersDict(0)
  for seq_idx, tag in zip(seq_idxs, seq_tags):
    dataset.load_seqs(seq_idx, seq_idx + 1)
    tag_ = dataset.get_tag(seq_idx)
    assert tag == tag_  # Just a check for sanity. We expect the same order.
    seq_len = dataset.get_seq_length(seq_idx)
    for data_key in data_keys:
      if data_key == default_data_input_key:
        hdf_data = hdf_dataset['inputs']
      else:
        hdf_data = hdf_dataset['targets/data'][hdf_data_key_map[data_key]]
      data = dataset.get_data(seq_idx, data_key)
      hdf_data[offsets[data_key]:offsets[data_key] + seq_len[data_key]] = data
    if use_progress_bar:
      progress_bar_with_time(float(offsets[default_data_input_key]) / total_seq_len[default_data_input_key])
    offsets += seq_len
  assert offsets == total_seq_len  # Sanity check.

  # Set some old-format attribs. Not needed for newer CRNN versions.
  hdf_dataset.attrs[attr_inputPattSize] = dataset.num_inputs
  hdf_dataset.attrs[attr_numLabels] = dataset.num_outputs.get(default_data_target_key, (0, 0))[0]

  print("All done.", file=log.v3)
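# A minimal read-back sketch (not part of the original) showing the HDF layout
# that dump_from_dataset() creates above: 'seqTags', 'inputs', and per-target
# entries under 'targets/data', 'targets/size' and 'targets/labels'. The filename
# is a hypothetical placeholder.
import h5py


def _example_inspect_dump(filename="dump.hdf"):
  with h5py.File(filename, "r") as f:
    print("num seqs:", f['seqTags'].shape[0])
    print("first seq tag:", f['seqTags'][0])  # stored as fixed-size bytes ("S..." dtype)
    print("total input frames:", f['inputs'].shape[0])
    for key in f['targets/data']:
      print("target %r: shape %r, num_outputs %r" % (
        key, f['targets/data'][key].shape, f['targets/size'].attrs[key]))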