def run(self):
    args = self.args
    if args.verbosity > 1:
        print("Running lidbox command '{}' with arguments:".format(
            self.__class__.__name__.lower()))
        lidbox.yaml_pprint(vars(args))
        print()

def create_model(self, config, skip_training=False):
    model_cache_dir = os.path.join(self.cache_dir, self.model_id)
    tensorboard_log_dir = os.path.join(model_cache_dir, "tensorboard", "logs")
    tensorboard_dir = os.path.join(tensorboard_log_dir, now_str())
    default_tensorboard_config = {
        "log_dir": tensorboard_dir,
        "profile_batch": 0,
        "histogram_freq": 1,
    }
    tensorboard_config = dict(default_tensorboard_config, **config.get("tensorboard", {}))
    checkpoint_dir = self.get_checkpoint_dir()
    checkpoint_format = "epoch{epoch:06d}.hdf5"
    if "checkpoints" in config and "format" in config["checkpoints"]:
        checkpoint_format = config["checkpoints"].pop("format")
    default_checkpoints_config = {
        "filepath": os.path.join(checkpoint_dir, checkpoint_format),
    }
    checkpoints_config = dict(default_checkpoints_config, **config.get("checkpoints", {}))
    callbacks_kwargs = {
        "checkpoints": checkpoints_config,
        "early_stopping": config.get("early_stopping"),
        "tensorboard": tensorboard_config,
        "other_callbacks": config.get("other_callbacks", []),
    }
    if not skip_training:
        self.make_named_dir(tensorboard_dir, "tensorboard")
        self.make_named_dir(checkpoint_dir, "checkpoints")
    if self.args.verbosity > 1:
        print("KerasWrapper callback parameters will be set to:")
        yaml_pprint(callbacks_kwargs)
        print()
    return models.KerasWrapper(self.model_id, config["model_definition"], **callbacks_kwargs)

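# A rough sketch of the "experiment" section keys that create_model reads above.
# The key names come from the code; the values and the early_stopping contents are
# illustrative assumptions, not lidbox defaults:
#
#   experiment:
#     name: my-experiment              # becomes self.model_id in train/predict
#     model_definition: ...            # passed as-is to models.KerasWrapper
#     tensorboard:
#       histogram_freq: 1
#     checkpoints:
#       format: "epoch{epoch:06d}.hdf5"
#       monitor: val_loss
#       mode: min
#     early_stopping:
#       patience: 10
#     other_callbacks: []
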
def run(self):
    args = self.args
    max_loglevel = len(VERBOSITY_TO_LOGLEVEL) - 1
    if lidbox.DEBUG:
        print("lidbox.DEBUG is True, overriding given --verbosity setting {} with maximum log level {}".format(args.verbosity, max_loglevel))
        args.verbosity = max_loglevel
    loglevel = min(max_loglevel, max(0, args.verbosity))
    lidbox.reset_global_loglevel(VERBOSITY_TO_LOGLEVEL[loglevel])
    if args.verbosity > 1:
        print("Running lidbox command '{}' with arguments:".format(self.__class__.__name__.lower()))
        lidbox.yaml_pprint(vars(args))
        print()

def run(self):
    super().run()
    args = self.args
    if args.verbosity > 1:
        print("Running subcommand '{}' with arguments:".format(
            self.__class__.__name__.lower()))
        yaml_pprint(vars(args))
        print()
    if args.verbosity:
        print("Loading experiment config from '{}'".format(
            args.experiment_config))
    self.experiment_config = system.load_yaml(args.experiment_config)
    if args.verbosity > 1:
        print("Experiment config is:")
        yaml_pprint(self.experiment_config)
        print()
    self.cache_dir = os.path.abspath(self.experiment_config["cache"])
    if args.verbosity > 1:
        print("Cache dir is '{}'".format(self.cache_dir))

def evaluate_metrics_for_predictions(utt2prediction, utt2target, eval_confs, labels):
    import sklearn.metrics
    logger.info("Stacking predictions to numpy arrays")
    # Ensure true labels are always in the same order as in predictions
    predictions = np.stack([p for _, p in utt2prediction])
    min_score = np.amin(predictions)
    max_score = np.amax(predictions)
    true_labels_sparse = np.array([utt2target[u] for u, _ in utt2prediction])
    pred_labels_sparse = np.argmax(predictions, axis=1)
    def onehot(i):
        o = np.zeros(len(labels))
        o[i] = 1
        return o
    true_labels_dense = np.stack([onehot(t) for t in true_labels_sparse])
    logger.info(
        "Evaluating metrics on true labels of shape %s and predicted labels of shape %s."
        " Min prediction score %.3f max prediction score %.3f",
        true_labels_sparse.shape, pred_labels_sparse.shape, float(min_score), float(max_score))
    for metric in eval_confs:
        result = None
        if metric["name"].endswith("average_detection_cost"):
            logger.info("Evaluating minimum average detection cost")
            thresholds = np.linspace(min_score, max_score, metric.get("num_thresholds", 50))
            if metric["name"].startswith("sparse_"):
                cavg = lidbox.metrics.SparseAverageDetectionCost(len(labels), thresholds)
                cavg.update_state(true_labels_sparse, predictions)
            else:
                cavg = lidbox.metrics.AverageDetectionCost(len(labels), thresholds)
                cavg.update_state(true_labels_dense, predictions)
            result = float(cavg.result().numpy())
            logger.info("%s: %.6f", metric["name"], result)
        elif metric["name"].endswith("average_equal_error_rate"):
            #TODO sparse EER, generate one-hot true_labels
            logger.info("Evaluating average equal error rate")
            eer = np.zeros(len(labels))
            for l, label in enumerate(labels):
                # https://stackoverflow.com/a/46026962
                fpr, tpr, _ = sklearn.metrics.roc_curve(
                    true_labels_dense[:, l], predictions[:, l])
                fnr = 1 - tpr
                eer[l] = fpr[np.nanargmin(np.absolute(fnr - fpr))]
            result = {
                "avg": float(eer.mean()),
                "by_label": {label: float(eer[l]) for l, label in enumerate(labels)},
            }
            logger.info("%s: %s", metric["name"], lidbox.yaml_pprint(result, to_string=True))
        elif metric["name"] == "average_f1_score":
            logger.info("Evaluating average F1 score")
            f1 = sklearn.metrics.f1_score(
                true_labels_sparse,
                pred_labels_sparse,
                labels=list(range(len(labels))),
                average="weighted")
            result = {"avg": float(f1)}
            logger.info("%s: %.6f", metric["name"], f1)
        elif metric["name"] == "sklearn_classification_report":
            logger.info("Generating full sklearn classification report")
            result = sklearn.metrics.classification_report(
                true_labels_sparse,
                pred_labels_sparse,
                labels=list(range(len(labels))),
                target_names=labels,
                output_dict=True,
                zero_division=0)
            logger.info("%s:\n%s", metric["name"], lidbox.yaml_pprint(result, left_pad=2, to_string=True))
        elif metric["name"] == "confusion_matrix":
            logger.info("Generating confusion matrix")
            result = sklearn.metrics.confusion_matrix(true_labels_sparse, pred_labels_sparse)
            logger.info("%s:\n%s", metric["name"], format_confusion_matrix(result, labels))
            result = result.tolist()
        else:
            logger.error("Cannot evaluate unknown metric '%s'", metric["name"])
        yield {"name": metric["name"], "result": result}

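# A hypothetical usage sketch of evaluate_metrics_for_predictions; the utterance ids,
# labels and scores are made up, but the argument shapes match what the generator
# above expects:
#
#   labels = ["est", "fin", "swe"]
#   utt2prediction = [("utt-0001", np.array([0.1, 0.7, 0.2])),
#                     ("utt-0002", np.array([0.6, 0.3, 0.1]))]
#   utt2target = {"utt-0001": 1, "utt-0002": 0}
#   eval_confs = [{"name": "sparse_average_detection_cost", "num_thresholds": 50},
#                 {"name": "confusion_matrix"}]
#   for metric_result in evaluate_metrics_for_predictions(utt2prediction, utt2target, eval_confs, labels):
#       logger.info("%s -> %s", metric_result["name"], metric_result["result"])
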
def evaluate_test_set(split2ds, split2meta, labels, config):
    import sklearn.metrics
    from lidbox.dataset.steps import as_supervised, initialize
    from lidbox.models.keras_utils import best_model_checkpoint_from_config, experiment_cache_from_config
    test_conf = config["experiment"]["data"]["test"]
    test_ds = (split2ds[test_conf["split"]]
               .batch(test_conf["batch_size"])
               .apply(as_supervised))
    predictions = None
    if "user_script" in config:
        user_script = load_user_script_as_module(config["user_script"])
        if hasattr(user_script, "predict"):
            logger.info("User script has defined a 'predict' function, will use it")
            predictions = user_script.predict(test_ds, config)
            if predictions is None:
                logger.error("Function 'predict' in the user script '%s' did not return predictions", config["user_script"])
                return
    if predictions is None:
        logger.info("User script has not defined a 'predict' function, will use default approach")
        keras_wrapper = KerasWrapper.from_config(config)
        logger.info("Model initialized:\n%s", str(keras_wrapper))
        best_checkpoint = best_model_checkpoint_from_config(config)
        logger.info("Loading weights from checkpoint file '%s'", best_checkpoint)
        keras_wrapper.load_weights(best_checkpoint)
        logger.info("Starting prediction with model '%s'", keras_wrapper.model_key)
        predictions = keras_wrapper.keras_model.predict(test_ds)
    logger.info("Model returned predictions of shape %s, now gathering all test set ids", repr(predictions.shape))
    test_ids = [x["id"].decode("utf-8") for x in split2ds[test_conf["split"]].as_numpy_iterator()]
    utt2prediction = sorted(zip(test_ids, predictions), key=lambda t: t[0])
    del test_ids
    has_chunks = False
    if "chunks" in config.get("pre_process", {}):
        logger.info("Original signals were divided into chunks, merging chunk scores by averaging")
        has_chunks = True
    if "chunks" in config.get("post_process", {}):
        logger.info("Extracted features were divided into chunks, merging chunk scores by averaging")
        has_chunks = True
    if has_chunks:
        utt2prediction = group_chunk_predictions_by_parent_id(utt2prediction)
        predictions = np.array([p for _, p in utt2prediction])
    # Collect targets from the test set iterator
    test_meta_ds = initialize(None, labels, split2meta[test_conf["split"]])
    utt2target = {x["id"].decode("utf-8"): x["target"] for x in test_meta_ds.as_numpy_iterator()}
    missed_utterances = set(utt2target.keys()) - set(u for u, _ in utt2prediction)
    min_score = np.amin(predictions)
    max_score = np.amax(predictions)
    if missed_utterances:
        logger.info("%d test samples had no predictions and worst-case scores %.3f will be generated for them for every label",
                    len(missed_utterances), min_score)
        utt2prediction.extend([(utt, np.array([min_score for _ in labels])) for utt in sorted(missed_utterances)])
    scores_file = os.path.join(experiment_cache_from_config(config), "predictions", "scores")
    os.makedirs(os.path.dirname(scores_file), exist_ok=True)
    logger.info("Writing predicted scores to '%s'", scores_file)
    if os.path.exists(scores_file):
        logger.warning("Overwriting existing '%s'", scores_file)
    with open(scores_file, "w") as scores_f:
        print_predictions(utt2prediction, labels, file=scores_f)
    metric_results = []
    # Ensure true labels are always in the same order as in predictions
    predictions = np.array([p for _, p in utt2prediction])
    true_labels_sparse = np.array([utt2target[u] for u, _ in utt2prediction])
    pred_labels_sparse = np.argmax(predictions, axis=1)
    # Dense one-hot true labels and the set of labels actually present in the test set
    # are needed by some of the metrics below
    true_labels_dense = np.eye(len(labels))[true_labels_sparse]
    all_testset_labels = set(labels[t] for t in true_labels_sparse)
    logger.info("Evaluating metrics on true labels of shape %s and predicted labels of shape %s",
                true_labels_sparse.shape, pred_labels_sparse.shape)
    for metric in test_conf["evaluate_metrics"]:
        result = None
        if metric["name"].endswith("average_detection_cost"):
            logger.info("Evaluating minimum average detection cost")
            thresholds = np.linspace(min_score, max_score, metric.get("num_thresholds", 200))
            if metric["name"].startswith("sparse_"):
                cavg = lidbox.metrics.SparseAverageDetectionCost(len(labels), thresholds)
                cavg.update_state(np.expand_dims(true_labels_sparse, -1), predictions)
            else:
                cavg = lidbox.metrics.AverageDetectionCost(len(labels), thresholds)
                cavg.update_state(true_labels_dense, predictions)
            result = float(cavg.result().numpy())
            logger.info("%s: %.6f", metric["name"], result)
        elif metric["name"].endswith("average_equal_error_rate"):
            #TODO sparse EER, generate one-hot true_labels
            logger.info("Evaluating average equal error rate")
            eer = np.zeros(len(labels))
            for l, label in enumerate(labels):
                if label not in all_testset_labels:
                    eer[l] = 0
                    continue
                # https://stackoverflow.com/a/46026962
                fpr, tpr, _ = sklearn.metrics.roc_curve(true_labels_dense[:, l], predictions[:, l])
                fnr = 1 - tpr
                eer[l] = fpr[np.nanargmin(np.absolute(fnr - fpr))]
            result = {
                "avg": float(eer.mean()),
                "by_label": {label: float(eer[l]) for l, label in enumerate(labels)},
            }
            logger.info("%s: %s", metric["name"], lidbox.yaml_pprint(result, to_string=True))
        elif metric["name"] == "average_f1_score":
            logger.info("Evaluating average F1 score")
            f1 = sklearn.metrics.f1_score(
                true_labels_sparse,
                pred_labels_sparse,
                labels=list(range(len(labels))),
                average="weighted")
            result = {"avg": float(f1)}
            logger.info("%s: %.6f", metric["name"], f1)
        elif metric["name"] == "sklearn_classification_report":
            logger.info("Generating full sklearn classification report")
            result = sklearn.metrics.classification_report(
                true_labels_sparse,
                pred_labels_sparse,
                labels=list(range(len(labels))),
                target_names=labels,
                output_dict=True,
                zero_division=0)
            logger.info("%s:\n%s", metric["name"], lidbox.yaml_pprint(result, left_pad=2, to_string=True))
        elif metric["name"] == "confusion_matrix":
            logger.info("Generating confusion matrix")
            result = sklearn.metrics.confusion_matrix(true_labels_sparse, pred_labels_sparse)
            logger.info("%s:\n%s", metric["name"], format_confusion_matrix(result, labels))
            result = result.tolist()
        else:
            logger.error("Cannot evaluate unknown metric '%s'", metric["name"])
        metric_results.append({"name": metric["name"], "result": result})
    return metric_results

def predict(self):
    args = self.args
    if args.verbosity:
        print("Preparing model for prediction")
    self.model_id = self.experiment_config["experiment"]["name"]
    if not args.trials:
        args.trials = os.path.join(self.cache_dir, self.model_id, "predictions", "trials")
    if not args.scores:
        args.scores = os.path.join(self.cache_dir, self.model_id, "predictions", "scores")
    self.make_named_dir(os.path.dirname(args.trials))
    self.make_named_dir(os.path.dirname(args.scores))
    training_config = self.experiment_config["experiment"]
    feat_config = self.experiment_config["features"]
    if args.verbosity > 1:
        print("Using model parameters:")
        yaml_pprint(training_config)
        print()
    if args.verbosity > 1:
        print("Using feature extraction parameters:")
        yaml_pprint(feat_config)
        print()
    model = self.create_model(dict(training_config), skip_training=True)
    if args.verbosity > 1:
        print("Preparing model")
    labels = self.experiment_config["dataset"]["labels"]
    model.prepare(labels, training_config)
    checkpoint_dir = self.get_checkpoint_dir()
    if args.checkpoint:
        checkpoint_path = os.path.join(checkpoint_dir, args.checkpoint)
    elif "best_checkpoint" in self.experiment_config.get("prediction", {}):
        checkpoint_path = os.path.join(checkpoint_dir, self.experiment_config["prediction"]["best_checkpoint"])
    else:
        checkpoints = os.listdir(checkpoint_dir) if os.path.isdir(checkpoint_dir) else []
        if not checkpoints:
            print("Error: Cannot evaluate with a model that has no checkpoints, i.e. is not trained.")
            return 1
        if "checkpoints" in training_config:
            monitor_value = training_config["checkpoints"]["monitor"]
            monitor_mode = training_config["checkpoints"].get("mode")
        else:
            monitor_value = "epoch"
            monitor_mode = None
        checkpoint_path = os.path.join(checkpoint_dir, models.get_best_checkpoint(checkpoints, key=monitor_value, mode=monitor_mode))
    if args.verbosity:
        print("Loading model weights from checkpoint file '{}'".format(checkpoint_path))
    model.load_weights(checkpoint_path)
    if args.verbosity:
        print("\nEvaluating testset with model:")
        print(str(model))
        print()
    ds = "test"
    if args.verbosity > 2:
        print("Dataset config for '{}'".format(ds))
        yaml_pprint(training_config[ds])
    ds_config = dict(training_config, **training_config[ds])
    del ds_config["train"], ds_config["validation"]
    if args.verbosity and "dataset_logger" in ds_config:
        print("Warning: dataset_logger in the test datagroup has no effect.")
    datagroup_key = ds_config.pop("datagroup")
    datagroup = self.experiment_config["dataset"]["datagroups"][datagroup_key]
    utt2path_path = os.path.join(datagroup["path"], datagroup.get("utt2path", "utt2path"))
    utt2label_path = os.path.join(datagroup["path"], datagroup.get("utt2label", "utt2label"))
    utt2path = collections.OrderedDict(
        row[:2] for row in parse_space_separated(utt2path_path))
    utt2label = collections.OrderedDict(
        row[:2] for row in parse_space_separated(utt2label_path))
    utterance_list = list(utt2path.keys())
    if args.file_limit:
        utterance_list = utterance_list[:args.file_limit]
    if args.verbosity > 3:
        print("Using utterance ids:")
        yaml_pprint(utterance_list)
    int2label = self.experiment_config["dataset"]["labels"]
    label2int, OH = make_label2onehot(int2label)
    def label2onehot(label):
        return OH[label2int.lookup(label)]
    labels_set = set(int2label)
    paths = []
    paths_meta = []
    for utt in utterance_list:
        label = utt2label[utt]
        if label not in labels_set:
            continue
        paths.append(utt2path[utt])
        paths_meta.append((utt, label))
    if args.verbosity:
        print("Extracting test set features for prediction")
    features = self.extract_features(
        feat_config,
        "test",
        trim_audio=False,
        debug_squeeze_last_dim=(ds_config["input_shape"][-1] == 1),
    )
    conf_json, conf_checksum = config_checksum(self.experiment_config, datagroup_key)
    features = tf_data.prepare_dataset_for_training(
        features,
        ds_config,
        feat_config,
        label2onehot,
        self.model_id,
        verbosity=args.verbosity,
        conf_checksum=conf_checksum,
    )
    # drop meta wavs required only for vad
    features = features.map(lambda *t: t[:3])
    if ds_config.get("persistent_features_cache", True):
        features_cache_dir = os.path.join(self.cache_dir, "features")
    else:
        features_cache_dir = "/tmp/tensorflow-cache"
    features_cache_path = os.path.join(
        features_cache_dir,
        self.experiment_config["dataset"]["key"],
        ds,
        feat_config["type"],
        conf_checksum,
    )
    self.make_named_dir(os.path.dirname(features_cache_path), "features cache")
    if not os.path.exists(features_cache_path + ".md5sum-input"):
        with open(features_cache_path + ".md5sum-input", "w") as f:
            print(conf_json, file=f, end='')
        if args.verbosity:
            print("Writing features into new cache: '{}'".format(features_cache_path))
    else:
        if args.verbosity:
            print("Loading features from existing cache: '{}'".format(features_cache_path))
    features = features.cache(filename=features_cache_path)
    if args.verbosity:
        print("Gathering all utterance ids from features dataset iterator")
    # Gather utterance ids, this also causes the extraction pipeline to be evaluated
    utterance_ids = []
    i = 0
    if args.verbosity > 1:
        print(now_str(date=True), "- 0 samples done")
    for _, _, uttids in features.as_numpy_iterator():
        for uttid in uttids:
            utterance_ids.append(uttid.decode("utf-8"))
            i += 1
            if args.verbosity > 1 and i % 10000 == 0:
                print(now_str(date=True), "-", i, "samples done")
    if args.verbosity > 1:
        print(now_str(date=True), "- all", i, "samples done")
    if args.verbosity:
        print("Features extracted, writing target and non-target language information for each utterance to '{}'.".format(args.trials))
    with open(args.trials, "w") as trials_f:
        for utt, target in utt2label.items():
            for lang in int2label:
                print(lang, utt, "target" if target == lang else "nontarget", file=trials_f)
    if args.verbosity:
        print("Starting prediction with model")
    predictions = model.predict(features.map(lambda *t: t[0]))
    if args.verbosity > 1:
        print("Done predicting, model returned predictions of shape {}. Writing them to '{}'.".format(predictions.shape, args.scores))
    num_predictions = 0
    with open(args.scores, "w") as scores_f:
        print(*int2label, file=scores_f)
        for utt, pred in zip(utterance_ids, predictions):
            pred_scores = [np.format_float_positional(x, precision=args.score_precision) for x in pred]
            print(utt, *pred_scores, sep=args.score_separator, file=scores_f)
            num_predictions += 1
    if args.verbosity:
        print("Wrote {} prediction scores to '{}'.".format(num_predictions, args.scores))

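# For reference, both output files written by predict are plain text. A hedged example
# of their layout, with made-up utterance ids and three labels (the scores file uses
# args.score_separator between columns, shown here as a single space):
#
#   trials  -- one "<label> <utt> target|nontarget" row per (label, utterance) pair:
#       est utt-0001 target
#       fin utt-0001 nontarget
#       swe utt-0001 nontarget
#
#   scores  -- a header row of labels, then one "<utt> <score per label>" row:
#       est fin swe
#       utt-0001 -0.12 3.4 0.56
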
def train(self):
    args = self.args
    if args.verbosity:
        print("Preparing model for training")
    training_config = self.experiment_config["experiment"]
    feat_config = self.experiment_config["features"]
    if args.verbosity > 1:
        print("Using model parameters:")
        yaml_pprint(training_config)
        print()
    if args.verbosity > 1:
        print("Using feature extraction parameters:")
        yaml_pprint(feat_config)
        print()
    if args.dataset_config:
        dataset_config = system.load_yaml(args.dataset_config)
        self.experiment_config["datasets"] = [d for d in dataset_config if d["key"] in self.experiment_config["datasets"]]
    labels = sorted(set(l for d in self.experiment_config["datasets"] for l in d["labels"]))
    label2int, OH = make_label2onehot(labels)
    def label2onehot(label):
        return OH[label2int.lookup(label)]
    if args.verbosity > 2:
        print("Generating onehot encoding from labels:", ', '.join(labels))
        print("Generated onehot encoding as tensors:")
        for l in labels:
            l = tf.constant(l, dtype=tf.string)
            tf_data.tf_print(l, "\t", label2onehot(l))
    self.model_id = training_config["name"]
    model = self.create_model(dict(training_config), args.skip_training)
    if args.verbosity > 1:
        print("Preparing model")
    model.prepare(labels, training_config)
    if args.verbosity:
        print("Using model:\n{}".format(str(model)))
    dataset = {}
    for ds in ("train", "validation"):
        if args.verbosity > 2:
            print("Dataset config for '{}'".format(ds))
            yaml_pprint(training_config[ds])
        ds_config = dict(training_config, **training_config[ds])
        del ds_config["train"], ds_config["validation"]
        summary_kwargs = dict(ds_config.get("dataset_logger", {}))
        debug_squeeze_last_dim = ds_config["input_shape"][-1] == 1
        datagroup_key = ds_config.pop("datagroup")
        conf_json, conf_checksum = config_checksum(self.experiment_config, datagroup_key)
        if args.verbosity > 2:
            print("Config md5 checksum '{}' computed from json string:".format(conf_checksum))
            print(conf_json)
        extractor_ds = self.extract_features(
            self.experiment_config["datasets"],
            json.loads(json.dumps(feat_config)),
            datagroup_key,
            summary_kwargs.pop("trim_audio", False),
            debug_squeeze_last_dim,
        )
        if ds_config.get("persistent_features_cache", True):
            features_cache_dir = os.path.join(self.cache_dir, "features")
        else:
            features_cache_dir = "/tmp/tensorflow-cache"
        features_cache_path = os.path.join(
            features_cache_dir,
            datagroup_key,
            feat_config["type"],
            conf_checksum,
        )
        self.make_named_dir(os.path.dirname(features_cache_path), "features cache")
        if not os.path.exists(features_cache_path + ".md5sum-input"):
            with open(features_cache_path + ".md5sum-input", "w") as f:
                print(conf_json, file=f, end='')
            if args.verbosity:
                print("Writing features into new cache: '{}'".format(features_cache_path))
        else:
            if args.verbosity:
                print("Loading features from existing cache: '{}'".format(features_cache_path))
        extractor_ds = extractor_ds.cache(filename=features_cache_path)
        if args.exhaust_dataset_iterator:
            if args.verbosity:
                print("--exhaust-dataset-iterator given, now iterating once over the dataset iterator to fill the features cache.")
            # This forces the extractor_ds pipeline to be evaluated, and the features being serialized into the cache
            i = 0
            if args.verbosity > 1:
                print(now_str(date=True), "- 0 samples done")
            for i, (feats, *meta) in enumerate(extractor_ds.as_numpy_iterator(), start=1):
                if args.verbosity > 1 and i % 10000 == 0:
                    print(now_str(date=True), "-", i, "samples done")
                if args.verbosity > 3:
                    tf_data.tf_print("sample:", i, "features shape:", tf.shape(feats), "metadata:", *meta)
            if args.verbosity > 1:
                print(now_str(date=True), "- all", i, "samples done")
        dataset[ds] = tf_data.prepare_dataset_for_training(
            extractor_ds,
            ds_config,
            feat_config,
            label2onehot,
            self.model_id,
            conf_checksum=conf_checksum,
            verbosity=args.verbosity,
        )
        if args.debug_dataset:
            if args.verbosity:
                print("--debug-dataset given, iterating over the dataset to gather stats")
            if args.verbosity > 1:
                print("Counting all unique dim sizes of elements at index 0 in dataset")
            for axis, size_counts in enumerate(count_dim_sizes(dataset[ds], 0, len(ds_config["input_shape"]) + 1)):
                print("axis {}\n[count size]:".format(axis))
                tf_data.tf_print(size_counts, summarize=10)
        if summary_kwargs:
            logdir = os.path.join(os.path.dirname(model.tensorboard.log_dir), "dataset", ds)
            if os.path.isdir(logdir):
                if args.verbosity:
                    print("summary_kwargs available, but '{}' already exists, not iterating over dataset again".format(logdir))
            else:
                if args.verbosity:
                    print("Datagroup '{}' has a dataset logger defined. We will iterate over {} batches of samples from the dataset to create TensorBoard summaries of the input data into '{}'.".format(ds, summary_kwargs.get("num_batches", "'all'"), logdir))
                self.make_named_dir(logdir)
                writer = tf.summary.create_file_writer(logdir)
                summary_kwargs["debug_squeeze_last_dim"] = debug_squeeze_last_dim
                with writer.as_default():
                    logged_dataset = tf_data.attach_dataset_logger(dataset[ds], feat_config["type"], **summary_kwargs)
                    if args.verbosity:
                        print("Dataset logger attached to '{0}' dataset iterator, now exhausting the '{0}' dataset logger iterator once to write TensorBoard summaries of model input data".format(ds))
                    i = 0
                    max_outputs = summary_kwargs.get("max_outputs", 10)
                    for i, (samples, labels, *meta) in enumerate(logged_dataset.as_numpy_iterator()):
                        if args.verbosity > 1 and i % (2000//ds_config.get("batch_size", 1)) == 0:
                            print(i, "batches done")
                        if args.verbosity > 3:
                            tf_data.tf_print(
                                "batch:", i,
                                "utts", meta[0][:max_outputs],
                                "samples shape:", tf.shape(samples),
                                "onehot shape:", tf.shape(labels),
                                "wav.audio.shape", meta[1].audio.shape,
                                "wav.sample_rate[0]", meta[1].sample_rate[0])
                    if args.verbosity > 1:
                        print(i, "batches done")
                del logged_dataset
    checkpoint_dir = self.get_checkpoint_dir()
    checkpoints = [c.name for c in os.scandir(checkpoint_dir) if c.is_file()] if os.path.isdir(checkpoint_dir) else []
    if checkpoints:
        if "checkpoints" in training_config:
            monitor_value = training_config["checkpoints"]["monitor"]
            monitor_mode = training_config["checkpoints"].get("mode")
        else:
            monitor_value = "epoch"
            monitor_mode = None
        checkpoint_path = os.path.join(checkpoint_dir, models.get_best_checkpoint(checkpoints, key=monitor_value, mode=monitor_mode))
        if args.verbosity:
            print("Loading model weights from checkpoint file '{}' according to monitor value '{}'".format(checkpoint_path, monitor_value))
        model.load_weights(checkpoint_path)
    if args.verbosity:
        print("\nStarting training")
    if args.skip_training:
        print("--skip-training given, will not call model.fit")
        return
    history = model.fit(dataset["train"], dataset["validation"], training_config)
    if args.verbosity:
        print("\nTraining finished after {} epochs at epoch {}".format(len(history.epoch), history.epoch[-1] + 1))
        print("metric:\tmin (epoch),\tmax (epoch):")
        for name, epoch_vals in history.history.items():
            vals = np.array(epoch_vals)
            print("{}:\t{:.6f} ({:d}),\t{:.6f} ({:d})".format(
                name,
                vals.min(), vals.argmin() + 1,
                vals.max(), vals.argmax() + 1))
    history_cache_dir = os.path.join(self.cache_dir, self.model_id, "history")
    now_s = now_str()
    for name, epoch_vals in history.history.items():
        history_file = os.path.join(history_cache_dir, now_s, name)
        self.make_named_dir(os.path.dirname(history_file), "training history")
        with open(history_file, "w") as f:
            for epoch, val in enumerate(epoch_vals, start=1):
                print(epoch, val, file=f)
        if args.verbosity > 1:
            print("wrote history file '{}'".format(history_file))

def extract_features(self, datasets, config, datagroup_key, trim_audio, debug_squeeze_last_dim):
    args = self.args
    utt2path = collections.OrderedDict()
    utt2meta = collections.OrderedDict()
    if args.verbosity > 1:
        print("Extracting features from datagroup '{}'".format(datagroup_key))
        if args.verbosity > 2:
            yaml_pprint(config)
    num_utts_dropped = collections.Counter()
    for ds_config in datasets:
        if args.verbosity > 1:
            print("Dataset '{}'".format(ds_config["key"]))
        datagroup = ds_config["datagroups"][datagroup_key]
        utt2path_path = os.path.join(datagroup["path"], datagroup.get("utt2path", "utt2path"))
        utt2label_path = os.path.join(datagroup["path"], datagroup.get("utt2label", "utt2label"))
        if args.verbosity:
            print("Reading labels for utterances from utt2label file '{}'".format(utt2label_path))
        if args.verbosity > 1:
            print("Expected labels (utterances with other labels will be ignored):")
            for l in ds_config["labels"]:
                print(" {}".format(l))
        enabled_labels = set(ds_config["labels"])
        skipped_utterances = set()
        for utt, label, *rest in parse_space_separated(utt2label_path):
            if label not in enabled_labels:
                skipped_utterances.add(utt)
                continue
            assert utt not in utt2meta, "duplicate utterance id found when parsing labels: '{}'".format(utt)
            utt2meta[utt] = {"label": label, "dataset": ds_config["key"], "duration_sec": -1.0}
        utt2dur_path = os.path.join(datagroup["path"], datagroup.get("utt2dur", "utt2dur"))
        if os.path.exists(utt2dur_path):
            if args.verbosity:
                print("Reading durations from utt2dur file '{}'".format(utt2dur_path))
            for utt, duration, *rest in parse_space_separated(utt2dur_path):
                if utt in skipped_utterances:
                    continue
                assert utt in utt2meta, "utterance id without label found when parsing durations: '{}'".format(utt)
                utt2meta[utt]["duration_sec"] = float(duration)
        else:
            if args.verbosity:
                print("Skipping signal duration parse since utt2dur file '{}' does not exist".format(utt2dur_path))
        if args.verbosity:
            print("Reading paths of wav files from utt2path file '{}'".format(utt2path_path))
        for utt, path, *rest in parse_space_separated(utt2path_path):
            if utt in skipped_utterances:
                continue
            assert utt not in utt2path, "duplicate utterance id found when parsing paths: '{}'".format(utt)
            utt2path[utt] = path
    if args.verbosity > 1:
        print("Total amount of non-empty lines read from utt2path {}, and utt2meta {}".format(len(utt2path), len(utt2meta)))
        if skipped_utterances:
            print("Utterances skipped due to unexpected labels: {}".format(len(skipped_utterances)))
    # All utterance ids must be present in both files
    assert set(utt2path) == set(utt2meta), "Mismatching sets of utterances in utt2path and utt2meta, the utterance ids must be exactly the same"
    utterance_list = list(utt2path.keys())
    if args.shuffle_utt2path or datagroup.get("shuffle_utt2path", False):
        if args.verbosity > 1:
            print("Shuffling utterance ids, all wavpaths in the utt2path list will be processed in random order.")
        random.shuffle(utterance_list)
    else:
        if args.verbosity > 1:
            print("Not shuffling utterance ids, all wavs will be processed in order of the utt2path list.")
    if args.file_limit:
        if args.verbosity > 1:
            print("--file-limit set at {0}, using at most {0} utterances from the utterance id list, starting at the beginning of utt2path".format(args.file_limit))
        utterance_list = utterance_list[:args.file_limit]
    if args.verbosity > 3:
        print("Using utterance ids:")
        yaml_pprint(utterance_list)
    paths = []
    paths_meta = []
    for utt in utterance_list:
        paths.append(utt2path[utt])
        meta = utt2meta[utt]
        paths_meta.append((utt, meta["label"], meta["dataset"], meta["duration_sec"]))
    if args.verbosity:
        print("Starting feature extraction for datagroup '{}' from {} files".format(datagroup_key, len(paths)))
    if args.verbosity > 3:
        print("All utterances:")
        for path, (utt, label, dataset, *rest) in zip(paths, paths_meta):
            print(utt, label, dataset, sep='\t')
    # Flat utterance id to label mapping, needed by the sparsespeech and kaldi parsers below
    utt2label = {utt: meta["label"] for utt, meta in utt2meta.items()}
    if config["type"] == "sparsespeech":
        seg2utt_path = os.path.join(datagroup["path"], "segmented", datagroup.get("seg2utt", "seg2utt"))
        if args.verbosity:
            print("Parsing SparseSpeech features")
            print("Reading utterance segmentation data from seg2utt file '{}'".format(seg2utt_path))
        seg2utt = collections.OrderedDict(
            row[:2] for row in parse_space_separated(seg2utt_path))
        enc_path = config["sparsespeech_paths"]["output"][datagroup_key]
        feat_path = config["sparsespeech_paths"]["input"][datagroup_key]
        if args.verbosity:
            print("SparseSpeech input: '{}' and encoding: '{}'".format(feat_path, enc_path))
        feat = tf_data.parse_sparsespeech_features(config, enc_path, feat_path, seg2utt, utt2label)
    elif config["type"] == "kaldi":
        feat_conf = dict(config["datagroups"][datagroup_key])
        kaldi_feats_scp = feat_conf.pop("features_path")
        expected_shape = feat_conf.pop("shape")
        if args.verbosity:
            print("Parsing Kaldi features from '{}' with expected shape {}".format(kaldi_feats_scp, expected_shape))
        feat = tf_data.parse_kaldi_features(utterance_list, kaldi_feats_scp, utt2label, expected_shape, feat_conf)
    else:
        feat = tf_data.extract_features_from_paths(
            config,
            paths,
            paths_meta,
            datagroup_key,
            trim_audio=trim_audio,
            debug_squeeze_last_dim=debug_squeeze_last_dim,
            verbosity=args.verbosity,
        )
    return feat

def extract_features_from_paths(feat_config, paths, meta, datagroup_key, trim_audio=None, debug_squeeze_last_dim=False, verbosity=0):
    paths, meta = list(paths), [m[:3] for m in meta]
    assert len(paths) == len(meta), (
        "Cannot extract features from paths when the amount of metadata {} does not match the amount of wavfile paths {}"
        .format(len(meta), len(paths)))
    wav_config = feat_config.get("wav_config")
    if wav_config:
        dataset_types = ((tf.float32, tf.int32), tf.string, tf.string)
        dataset_shapes = ((tf.TensorShape([None]), tf.TensorShape([])), tf.TensorShape([]), tf.TensorShape([]))
        if "chunks" in wav_config:
            chunk_loader_fn = get_chunk_loader(wav_config, verbosity, datagroup_key)
            def ds_generator(*args):
                return tf.data.Dataset.from_generator(chunk_loader_fn, dataset_types, dataset_shapes, args=args)
            paths_t = tf.constant(paths, tf.string)
            meta_t = tf.constant(meta, tf.string)
            wavs = (tf.data.Dataset
                    .from_tensor_slices((paths_t, meta_t))
                    .interleave(
                        ds_generator,
                        # Hide IO latency from reading wav files by using several workers per CPU
                        # The exact amount of workers is chosen by TensorFlow due to autotune, but this will be the maximum
                        cycle_length=wav_config.get("workers_per_cpu", 16) * len(os.sched_getaffinity(0)),
                        num_parallel_calls=TF_AUTOTUNE))
        else:
            print("unknown, non-empty wav_config given:")
            yaml_pprint(wav_config)
            raise NotImplementedError
        wavs = wavs.map(lambda wav, *meta: (audio_feat.Wav(wav[0], wav[1]), *meta))
    else:
        wav_paths = tf.data.Dataset.from_tensor_slices((
            tf.constant(paths, dtype=tf.string),
            tf.constant(meta, dtype=tf.string)))
        load_wav_with_meta = lambda path, *meta: (load_wav(path), *meta)
        wavs = wav_paths.map(load_wav_with_meta, num_parallel_calls=TF_AUTOTUNE)
    if "batch_wavs_by_length" in feat_config:
        window_size = feat_config["batch_wavs_by_length"]["max_batch_size"]
        if verbosity:
            print("Batching all wavs by equal length into batches of max size {}".format(window_size))
        key_fn = lambda wav, *meta: tf.cast(tf.size(wav.audio), tf.int64)
        reduce_fn = lambda key, group_ds: group_ds.batch(window_size)
        group_by_wav_length = tf.data.experimental.group_by_window(key_fn, reduce_fn, window_size)
        wavs_batched = wavs.apply(group_by_wav_length)
    else:
        batch_size = feat_config.get("batch_size", 1)
        if verbosity:
            print("Batching wavs with batch size", batch_size)
        wavs_batched = wavs.batch(batch_size)
    if verbosity:
        print("Applying feature extractor to batched wavs")
    feat_extract_args = feat_extraction_args_as_list(feat_config)
    # This function expects batches of wavs
    extract_feats = lambda wavs, *meta: (extract_features(wavs, *feat_extract_args), (*meta, wavs))
    features = wavs_batched.map(extract_feats, num_parallel_calls=TF_AUTOTUNE)
    if "mean_var_norm_numpy" in feat_config:
        window_len = tf.constant(feat_config["mean_var_norm_numpy"]["window_len"], tf.int32)
        normalize_variance = tf.constant(feat_config["mean_var_norm_numpy"].get("normalize_variance", True), tf.bool)
        if verbosity:
            tf_print("Using numpy to apply mean_var_norm sliding window of length", window_len,
                     "without padding. Will also normalize variance:", normalize_variance)
        def apply_mean_var_norm_numpy(feats, *rest):
            normalized = tf.numpy_function(
                mean_var_norm_nopad_slide_numpy,
                [feats, window_len, normalize_variance],
                feats.dtype)
            normalized.set_shape(feats.shape.as_list())
            return (normalized, *rest)
        features = features.map(apply_mean_var_norm_numpy, num_parallel_calls=TF_AUTOTUNE)
    features = features.unbatch()
    return features

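# A rough sketch of the feature config keys that extract_features_from_paths reads;
# the key names are from the code, while the feature type string and all values are
# illustrative assumptions:
#
#   features:
#     type: logmelspectrogram
#     wav_config:
#       workers_per_cpu: 16
#       chunks:
#         ...                     # chunking parameters consumed by get_chunk_loader
#     batch_size: 1
#     # alternatively, instead of batch_size:
#     # batch_wavs_by_length:
#     #   max_batch_size: 64
#     mean_var_norm_numpy:
#       window_len: 300
#       normalize_variance: true
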
def prepare_dataset_for_training(ds, config, feat_config, label2onehot, model_id, conf_checksum='', verbosity=0):
    if "frames" in config:
        raise NotImplementedError("todo")
        if verbosity:
            print("Dividing features time dimension into frames")
        assert "convert_to_images" not in config, "todo, time dim random chunks for image data"
        # frame_axis = 1 if "convert_to_images" in config else 0
        # if frame_axis == 1:
        #     ds = ds.map(lambda f, *meta: (tf.transpose(f, perm=(1, 0, 2, 3)), *meta))
        if config["frames"].get("random", False):
            if verbosity:
                print("Dividing features time dimension randomly")
            assert isinstance(config["frames"]["length"], dict), "key 'frames.length' must map to a dict type when doing random chunking of frames"
            frame_chunker_fn = make_random_frame_chunker_fn(config["frames"]["length"])
            chunk_timedim_randomly = lambda f, *meta: (frame_chunker_fn(f), *meta)
            ds = ds.map(chunk_timedim_randomly, num_parallel_calls=TF_AUTOTUNE)
        else:
            if verbosity:
                print("Dividing features time dimension into fixed length chunks")
            # Extract frames from all features, using the same metadata for each frame of one sample of features
            seq_len = config["frames"]["length"]
            seq_step = config["frames"]["step"]
            pad_zeros = config["frames"].get("pad_zeros", False)
            to_frames = lambda feats, *meta: (tf.signal.frame(feats, seq_len, seq_step, pad_end=pad_zeros, axis=0), *meta)
            ds = ds.map(to_frames)
        if config["frames"].get("flatten", True):
            def _unbatch_ragged_frames(frames, *meta):
                frames_ds = tf.data.Dataset.from_tensor_slices(frames)
                inf_repeated_meta_ds = [tf.data.Dataset.from_tensors(m).repeat() for m in meta]
                return tf.data.Dataset.zip((frames_ds, *inf_repeated_meta_ds))
            ds = ds.flat_map(_unbatch_ragged_frames)
            ds = ds.filter(lambda frames, *meta: tf.shape(frames)[0] > 0)
        if "normalize" in config["frames"]:
            axis = config["frames"]["normalize"]["axis"]
            if verbosity:
                print("Normalizing means frame-wise over axis {}".format(axis))
            def normalize_frames(frames, *meta):
                return (frames - tf.math.reduce_mean(frames, axis=axis, keepdims=True), *meta)
            ds = ds.map(normalize_frames)
    # Transform dataset such that 2 first elements will always be (sample, onehot_label) and rest will be metadata that can be safely dropped when training starts
    to_model_input = lambda feats, meta: (feats, label2onehot(meta[1]), meta[0], *meta[2:])
    ds = ds.map(to_model_input)
    if "min_shape" in config:
        if verbosity:
            print("Filtering features by minimum shape", config["min_shape"])
        ds = filter_with_min_shape(ds, config["min_shape"])
    shuffle_buffer_size = config.get("shuffle_buffer", {"before_cache": 0})["before_cache"]
    if shuffle_buffer_size:
        if verbosity:
            print("Shuffling features with shuffle buffer size", shuffle_buffer_size)
        ds = ds.shuffle(shuffle_buffer_size)
    if "padded_batch" in config:
        pad_kwargs = config["padded_batch"]["kwargs"]
        if verbosity:
            print("Batching features with padded batch kwargs:")
            yaml_pprint(pad_kwargs)
        pad_kwargs["padded_shapes"] = tuple(pad_kwargs["padded_shapes"])
        pad_kwargs["padding_values"] = tuple(tf.constant(float(val), dtype=tf.float32) for val in pad_kwargs["padding_values"])
        ds = without_metadata(ds).padded_batch(**pad_kwargs)
    elif "batch_size" in config:
        if verbosity:
            print("Batching features with batch size", config["batch_size"])
        ds = ds.batch(config["batch_size"], drop_remainder=True)
    if "bucket_by_sequence_length" in config:
        if verbosity:
            print("Batching features by bucketing samples into fixed length, padded sequence length buckets")
        seq_len_fn = lambda feats, meta: tf.shape(feats)[0]
        bucket_conf = config["bucket_by_sequence_length"]
        bucket_boundaries = np.linspace(
            bucket_conf["bins"]["min"],
            bucket_conf["bins"]["max"],
            bucket_conf["bins"]["num"],
            dtype=np.int32)
        bucket_batch_sizes = [1] + (len(bucket_boundaries) - 1) * [bucket_conf["batch_size"]] + [1]
        bucketing_fn = tf.data.experimental.bucket_by_sequence_length(
            seq_len_fn,
            bucket_boundaries,
            bucket_batch_sizes,
            **bucket_conf.get("kwargs", {}))
        ds = ds.apply(bucketing_fn)
    elif "group_by_sequence_length" in config:
        max_batch_size = tf.constant(config["group_by_sequence_length"]["max_batch_size"], tf.int64)
        if verbosity:
            tf_print("Grouping samples by sequence length into batches of max size", max_batch_size)
        get_seq_len = lambda feat, meta: tf.cast(tf.shape(feat)[0], tf.int64)
        group_to_batch = lambda key, group: group.batch(max_batch_size)
        ds = ds.apply(tf.data.experimental.group_by_window(get_seq_len, group_to_batch, window_size=max_batch_size))
        if "min_batch_size" in config["group_by_sequence_length"]:
            min_batch_size = config["group_by_sequence_length"]["min_batch_size"]
            if verbosity:
                print("Dropping batches smaller than min_batch_size", min_batch_size)
            min_batch_size = tf.constant(min_batch_size, tf.int32)
            ds = ds.filter(lambda batch, meta: (tf.shape(batch)[0] >= min_batch_size))
    if config.get("copy_cache_to_tmp", False):
        tmp_cache_path = "/tmp/tensorflow-cache/{}/training-prepared_{}_{}".format(model_id, int(time.time()), conf_checksum)
        if verbosity:
            print("Caching prepared dataset iterator to '{}'".format(tmp_cache_path))
        os.makedirs(os.path.dirname(tmp_cache_path), exist_ok=True)
        ds = ds.cache(filename=tmp_cache_path)
    cache_shuffle_buffer_size = config.get("shuffle_buffer", {"after_cache": 0})["after_cache"]
    if cache_shuffle_buffer_size:
        if verbosity:
            print("Shuffling cached features with shuffle buffer size", cache_shuffle_buffer_size)
        ds = ds.shuffle(cache_shuffle_buffer_size)
    # assume autotuned prefetch (turned off when config["prefetch"] is None)
    if "prefetch" not in config:
        if verbosity:
            print("Using autotune value", TF_AUTOTUNE, "for prefetching batches")
        ds = ds.prefetch(TF_AUTOTUNE)
    elif config["prefetch"] is not None:
        if verbosity:
            print("Using fixed size prefetch value", config["prefetch"])
        ds = ds.prefetch(config["prefetch"])
    return ds
