Example no. 1
 def run(self):
     args = self.args
     if args.verbosity > 1:
         print("Running lidbox command '{}' with arguments:".format(
             self.__class__.__name__.lower()))
         lidbox.yaml_pprint(vars(args))
         print()
Example no. 2
 def create_model(self, config, skip_training=False):
     model_cache_dir = os.path.join(self.cache_dir, self.model_id)
     tensorboard_log_dir = os.path.join(model_cache_dir, "tensorboard", "logs")
     tensorboard_dir = os.path.join(tensorboard_log_dir, now_str())
     default_tensorboard_config = {
         "log_dir": tensorboard_dir,
         "profile_batch": 0,
         "histogram_freq": 1,
     }
     tensorboard_config = dict(default_tensorboard_config, **config.get("tensorboard", {}))
     checkpoint_dir = self.get_checkpoint_dir()
     checkpoint_format = "epoch{epoch:06d}.hdf5"
     if "checkpoints" in config and "format" in config["checkpoints"]:
         checkpoint_format = config["checkpoints"].pop("format")
     default_checkpoints_config = {
         "filepath": os.path.join(checkpoint_dir, checkpoint_format),
     }
     checkpoints_config = dict(default_checkpoints_config, **config.get("checkpoints", {}))
     callbacks_kwargs = {
         "checkpoints": checkpoints_config,
         "early_stopping": config.get("early_stopping"),
         "tensorboard": tensorboard_config,
         "other_callbacks": config.get("other_callbacks", []),
     }
     if not skip_training:
         self.make_named_dir(tensorboard_dir, "tensorboard")
         self.make_named_dir(checkpoint_dir, "checkpoints")
     if self.args.verbosity > 1:
         print("KerasWrapper callback parameters will be set to:")
         yaml_pprint(callbacks_kwargs)
         print()
     return models.KerasWrapper(self.model_id, config["model_definition"], **callbacks_kwargs)
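
For reference, create_model only inspects a handful of keys in the config dict it receives; a minimal sketch of that subset is shown below (the key names follow the code above, the values are illustrative placeholders, not taken from a real experiment file):

# Hypothetical subset of the experiment config read by create_model above;
# all values are placeholders.
example_config = {
    "model_definition": {"name": "my_model"},              # forwarded to models.KerasWrapper
    "tensorboard": {"histogram_freq": 0},                  # merged over default_tensorboard_config
    "checkpoints": {"format": "epoch{epoch:06d}.hdf5"},    # "format" is popped, the rest overrides defaults
    "early_stopping": {"monitor": "val_loss", "patience": 5},
    "other_callbacks": [],
}
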
Example no. 3
 def run(self):
     args = self.args
     max_loglevel = len(VERBOSITY_TO_LOGLEVEL) - 1
     if lidbox.DEBUG:
         print("lidbox.DEBUG is True, overriding given --verbosity setting {} with maximum log level {}".format(args.verbosity, max_loglevel))
         args.verbosity = max_loglevel
     loglevel = min(max_loglevel, max(0, args.verbosity))
     lidbox.reset_global_loglevel(VERBOSITY_TO_LOGLEVEL[loglevel])
     if args.verbosity > 1:
         print("Running lidbox command '{}' with arguments:".format(self.__class__.__name__.lower()))
         lidbox.yaml_pprint(vars(args))
         print()
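
VERBOSITY_TO_LOGLEVEL is only used here as an indexable sequence of log levels that the clamped --verbosity count selects from; a plausible sketch (the actual contents in lidbox may differ) is:

# Hypothetical verbosity-to-loglevel sequence; index 0 is the quietest level.
VERBOSITY_TO_LOGLEVEL = ["ERROR", "WARNING", "INFO", "DEBUG"]

# Clamping as in run() above: any verbosity beyond the last index maps to "DEBUG".
verbosity = 5
loglevel = min(len(VERBOSITY_TO_LOGLEVEL) - 1, max(0, verbosity))
assert VERBOSITY_TO_LOGLEVEL[loglevel] == "DEBUG"
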
Example no. 4
 def run(self):
     super().run()
     args = self.args
     if args.verbosity > 1:
         print("Running subcommand '{}' with arguments:".format(
             self.__class__.__name__.lower()))
         yaml_pprint(vars(args))
         print()
     if args.verbosity:
         print("Loading experiment config from '{}'".format(
             args.experiment_config))
     self.experiment_config = system.load_yaml(args.experiment_config)
     if args.verbosity > 1:
         print("Experiment config is:")
         yaml_pprint(self.experiment_config)
         print()
     self.cache_dir = os.path.abspath(self.experiment_config["cache"])
     if args.verbosity > 1:
         print("Cache dir is '{}'".format(self.cache_dir))
Example no. 5
def evaluate_metrics_for_predictions(utt2prediction, utt2target, eval_confs,
                                     labels):
    import sklearn.metrics

    logger.info("Stacking predictions to numpy arrays")
    # Ensure true labels are always in the same order as in predictions
    predictions = np.stack([p for _, p in utt2prediction])
    min_score = np.amin(predictions)
    max_score = np.amax(predictions)
    true_labels_sparse = np.array([utt2target[u] for u, _ in utt2prediction])
    pred_labels_sparse = np.argmax(predictions, axis=1)

    def onehot(i):
        o = np.zeros(len(labels))
        o[i] = 1
        return o

    true_labels_dense = np.stack([onehot(t) for t in true_labels_sparse])
    logger.info(
        "Evaluating metrics on true labels of shape %s and predicted labels of shape %s."
        " Min prediction score %.3f max prediction score %.3f",
        true_labels_sparse.shape, pred_labels_sparse.shape, float(min_score),
        float(max_score))

    for metric in eval_confs:
        result = None
        if metric["name"].endswith("average_detection_cost"):
            logger.info("Evaluating minimum average detection cost")
            thresholds = np.linspace(min_score, max_score,
                                     metric.get("num_thresholds", 50))
            if metric["name"].startswith("sparse_"):
                cavg = lidbox.metrics.SparseAverageDetectionCost(
                    len(labels), thresholds)
                cavg.update_state(true_labels_sparse, predictions)
            else:
                cavg = lidbox.metrics.AverageDetectionCost(
                    len(labels), thresholds)
                cavg.update_state(true_labels_dense, predictions)
            result = float(cavg.result().numpy())
            logger.info("%s: %.6f", metric["name"], result)
        elif metric["name"].endswith("average_equal_error_rate"):
            #TODO sparse EER, generate one-hot true_labels
            logger.info("Evaluating average equal error rate")
            eer = np.zeros(len(labels))
            for l, label in enumerate(labels):
                # https://stackoverflow.com/a/46026962
                fpr, tpr, _ = sklearn.metrics.roc_curve(
                    true_labels_dense[:, l], predictions[:, l])
                fnr = 1 - tpr
                eer[l] = fpr[np.nanargmin(np.absolute(fnr - fpr))]
            result = {
                "avg": float(eer.mean()),
                "by_label":
                {label: float(eer[l])
                 for l, label in enumerate(labels)}
            }
            logger.info("%s: %s", metric["name"],
                        lidbox.yaml_pprint(result, to_string=True))
        elif metric["name"] == "average_f1_score":
            logger.info("Evaluating average F1 score")
            f1 = sklearn.metrics.f1_score(true_labels_sparse,
                                          pred_labels_sparse,
                                          labels=list(range(len(labels))),
                                          average="weighted")
            result = {"avg": float(f1)}
            logger.info("%s: %.6f", metric["name"], f1)
        elif metric["name"] == "sklearn_classification_report":
            logger.info("Generating full sklearn classification report")
            result = sklearn.metrics.classification_report(
                true_labels_sparse,
                pred_labels_sparse,
                labels=list(range(len(labels))),
                target_names=labels,
                output_dict=True,
                zero_division=0)
            logger.info("%s:\n%s", metric["name"],
                        lidbox.yaml_pprint(result, left_pad=2, to_string=True))
        elif metric["name"] == "confusion_matrix":
            logger.info("Generating confusion matrix")
            result = sklearn.metrics.confusion_matrix(true_labels_sparse,
                                                      pred_labels_sparse)
            logger.info("%s:\n%s", metric["name"],
                        format_confusion_matrix(result, labels))
            result = result.tolist()
        else:
            logger.error("Cannot evaluate unknown metric '%s'", metric["name"])
        yield {"name": metric["name"], "result": result}
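
The per-label equal error rate above is approximated by taking the ROC operating point where the false positive and false negative rates are closest; the same trick in isolation, on synthetic scores for a single label column:

import numpy as np
import sklearn.metrics

# Synthetic binary targets and scores for one label column.
rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=1000)
y_score = y_true + rng.normal(0.0, 0.7, size=1000)

# EER approximation: FPR at the threshold where |FNR - FPR| is smallest.
fpr, tpr, _ = sklearn.metrics.roc_curve(y_true, y_score)
fnr = 1 - tpr
eer = fpr[np.nanargmin(np.absolute(fnr - fpr))]
print("approximate EER: {:.3f}".format(float(eer)))
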
Example no. 6
def evaluate_test_set(split2ds, split2meta, labels, config):
    from lidbox.dataset.steps import as_supervised, initialize
    from lidbox.models.keras_utils import best_model_checkpoint_from_config, experiment_cache_from_config
    import sklearn.metrics
    test_conf = config["experiment"]["data"]["test"]
    test_ds = (split2ds[test_conf["split"]].batch(
        test_conf["batch_size"]).apply(as_supervised))
    predictions = None
    if "user_script" in config:
        user_script = load_user_script_as_module(config["user_script"])
        if hasattr(user_script, "predict"):
            logger.info(
                "User script has defined a 'predict' function, will use it")
            predictions = user_script.predict(test_ds, config)
            if predictions is None:
                logger.error(
                    "Function 'predict' in the user script '%s' did not return predictions",
                    config["user_script"])
                return
    if predictions is None:
        logger.info(
            "User script has not defined a 'predict' function, will use default approach"
        )
        keras_wrapper = KerasWrapper.from_config(config)
        logger.info("Model initialized:\n%s", str(keras_wrapper))
        best_checkpoint = best_model_checkpoint_from_config(config)
        logger.info("Loading weights from checkpoint file '%s'",
                    best_checkpoint)
        keras_wrapper.load_weights(best_checkpoint)
        logger.info("Starting prediction with model '%s'",
                    keras_wrapper.model_key)
        predictions = keras_wrapper.keras_model.predict(test_ds)
    logger.info(
        "Model returned predictions of shape %s, now gathering all test set ids",
        repr(predictions.shape))
    test_ids = [
        x["id"].decode("utf-8")
        for x in split2ds[test_conf["split"]].as_numpy_iterator()
    ]
    utt2prediction = sorted(zip(test_ids, predictions), key=lambda t: t[0])
    del test_ids
    has_chunks = False
    if "chunks" in config.get("pre_process", {}):
        logger.info(
            "Original signals were divided into chunks, merging chunk scores by averaging"
        )
        has_chunks = True
    if "chunks" in config.get("post_process", {}):
        logger.info(
            "Extracted features were divided into chunks, merging chunk scores by averaging"
        )
        has_chunks = True
    if has_chunks:
        utt2prediction = group_chunk_predictions_by_parent_id(utt2prediction)
        predictions = np.array([p for _, p in utt2prediction])
    # Collect targets from the test set iterator
    test_meta_ds = initialize(None, labels, split2meta[test_conf["split"]])
    utt2target = {
        x["id"].decode("utf-8"): x["target"]
        for x in test_meta_ds.as_numpy_iterator()
    }
    missed_utterances = set(utt2target.keys()) - set(
        u for u, _ in utt2prediction)
    min_score = np.amin(predictions)
    max_score = np.amax(predictions)
    if missed_utterances:
        logger.info(
            "%d test samples had no predictions and worst-case scores %.3f will be generated for them for every label",
            len(missed_utterances), min_score)
        utt2prediction.extend([(utt, np.array([min_score for _ in labels]))
                               for utt in sorted(missed_utterances)])
    scores_file = os.path.join(experiment_cache_from_config(config),
                               "predictions", "scores")
    os.makedirs(os.path.dirname(scores_file), exist_ok=True)
    logger.info("Writing predicted scores to '%s'", scores_file)
    if os.path.exists(scores_file):
        logger.warning("Overwriting existing '%s'", scores_file)
    with open(scores_file, "w") as scores_f:
        print_predictions(utt2prediction, labels, file=scores_f)
    metric_results = []
    # Ensure true labels are always in the same order as in predictions
    predictions = np.array([p for _, p in utt2prediction])
    true_labels_sparse = np.array([utt2target[u] for u, _ in utt2prediction])
    pred_labels_sparse = np.argmax(predictions, axis=1)
    # One-hot encode the true labels and record which labels occur in the
    # test set, needed by the dense detection cost and EER metrics below
    true_labels_dense = np.eye(len(labels))[true_labels_sparse]
    all_testset_labels = set(labels[t] for t in true_labels_sparse)
    logger.info(
        "Evaluating metrics on true labels of shape %s and predicted labels of shape %s",
        true_labels_sparse.shape, pred_labels_sparse.shape)
    for metric in test_conf["evaluate_metrics"]:
        result = None
        if metric["name"].endswith("average_detection_cost"):
            logger.info("Evaluating minimum average detection cost")
            thresholds = np.linspace(min_score, max_score,
                                     metric.get("num_thresholds", 200))
            if metric["name"].startswith("sparse_"):
                cavg = lidbox.metrics.SparseAverageDetectionCost(
                    len(labels), thresholds)
                cavg.update_state(np.expand_dims(true_labels_sparse, -1),
                                  predictions)
            else:
                cavg = lidbox.metrics.AverageDetectionCost(
                    len(labels), thresholds)
                cavg.update_state(true_labels_dense, predictions)
            result = float(cavg.result().numpy())
            logger.info("%s: %.6f", metric["name"], result)
        elif metric["name"].endswith("average_equal_error_rate"):
            #TODO sparse EER, generate one-hot true_labels
            logger.info("Evaluating average equal error rate")
            eer = np.zeros(len(labels))
            for l, label in enumerate(labels):
                if label not in all_testset_labels:
                    eer[l] = 0
                    continue
                # https://stackoverflow.com/a/46026962
                fpr, tpr, _ = sklearn.metrics.roc_curve(
                    true_labels_dense[:, l], predictions[:, l])
                fnr = 1 - tpr
                eer[l] = fpr[np.nanargmin(np.absolute(fnr - fpr))]
            result = {
                "avg": float(eer.mean()),
                "by_label":
                {label: float(eer[l])
                 for l, label in enumerate(labels)}
            }
            logger.info("%s: %s", metric["name"],
                        lidbox.yaml_pprint(result, to_string=True))
        elif metric["name"] == "average_f1_score":
            logger.info("Evaluating average F1 score")
            f1 = sklearn.metrics.f1_score(true_labels_sparse,
                                          pred_labels_sparse,
                                          labels=list(range(len(labels))),
                                          average="weighted")
            result = {"avg": float(f1)}
            logger.info("%s: %.6f", metric["name"], f1)
        elif metric["name"] == "sklearn_classification_report":
            logger.info("Generating full sklearn classification report")
            result = sklearn.metrics.classification_report(
                true_labels_sparse,
                pred_labels_sparse,
                labels=list(range(len(labels))),
                target_names=labels,
                output_dict=True,
                zero_division=0)
            logger.info("%s:\n%s", metric["name"],
                        lidbox.yaml_pprint(result, left_pad=2, to_string=True))
        elif metric["name"] == "confusion_matrix":
            logger.info("Generating confusion matrix")
            result = sklearn.metrics.confusion_matrix(true_labels_sparse,
                                                      pred_labels_sparse)
            logger.info("%s:\n%s", metric["name"],
                        format_confusion_matrix(result, labels))
            result = result.tolist()
        else:
            logger.error("Cannot evaluate unknown metric '%s'", metric["name"])
        metric_results.append({"name": metric["name"], "result": result})
    return metric_results
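
group_chunk_predictions_by_parent_id is not shown in this example; assuming chunk ids embed the parent utterance id as a prefix (for example parent-000, parent-001, which is an assumption about the naming scheme), merging "chunk scores by averaging" could be sketched as:

import collections
import numpy as np

def group_chunk_predictions_by_parent_id(utt2prediction):
    # Assumed chunk id scheme: "<parent_id>-<chunk_index>".
    parent2preds = collections.OrderedDict()
    for utt, pred in utt2prediction:
        parent_id = utt.rsplit("-", 1)[0]
        parent2preds.setdefault(parent_id, []).append(pred)
    # Merge chunk scores of each parent utterance by averaging.
    return [(parent_id, np.mean(preds, axis=0))
            for parent_id, preds in parent2preds.items()]
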
Example no. 7
 def predict(self):
     args = self.args
     if args.verbosity:
         print("Preparing model for prediction")
     self.model_id = self.experiment_config["experiment"]["name"]
     if not args.trials:
         args.trials = os.path.join(self.cache_dir, self.model_id, "predictions", "trials")
     if not args.scores:
         args.scores = os.path.join(self.cache_dir, self.model_id, "predictions", "scores")
     self.make_named_dir(os.path.dirname(args.trials))
     self.make_named_dir(os.path.dirname(args.scores))
     training_config = self.experiment_config["experiment"]
     feat_config = self.experiment_config["features"]
     if args.verbosity > 1:
         print("Using model parameters:")
         yaml_pprint(training_config)
         print()
     if args.verbosity > 1:
         print("Using feature extraction parameters:")
         yaml_pprint(feat_config)
         print()
     model = self.create_model(dict(training_config), skip_training=True)
     if args.verbosity > 1:
         print("Preparing model")
     labels = self.experiment_config["dataset"]["labels"]
     model.prepare(labels, training_config)
     checkpoint_dir = self.get_checkpoint_dir()
     if args.checkpoint:
         checkpoint_path = os.path.join(checkpoint_dir, args.checkpoint)
     elif "best_checkpoint" in self.experiment_config.get("prediction", {}):
         checkpoint_path = os.path.join(checkpoint_dir, self.experiment_config["prediction"]["best_checkpoint"])
     else:
         checkpoints = os.listdir(checkpoint_dir) if os.path.isdir(checkpoint_dir) else []
         if not checkpoints:
             print("Error: Cannot evaluate with a model that has no checkpoints, i.e. is not trained.")
             return 1
         if "checkpoints" in training_config:
             monitor_value = training_config["checkpoints"]["monitor"]
             monitor_mode = training_config["checkpoints"].get("mode")
         else:
             monitor_value = "epoch"
             monitor_mode = None
         checkpoint_path = os.path.join(checkpoint_dir, models.get_best_checkpoint(checkpoints, key=monitor_value, mode=monitor_mode))
     if args.verbosity:
         print("Loading model weights from checkpoint file '{}'".format(checkpoint_path))
     model.load_weights(checkpoint_path)
     if args.verbosity:
         print("\nEvaluating testset with model:")
         print(str(model))
         print()
     ds = "test"
     if args.verbosity > 2:
         print("Dataset config for '{}'".format(ds))
         yaml_pprint(training_config[ds])
     ds_config = dict(training_config, **training_config[ds])
     del ds_config["train"], ds_config["validation"]
     if args.verbosity and "dataset_logger" in ds_config:
         print("Warning: dataset_logger in the test datagroup has no effect.")
     datagroup_key = ds_config.pop("datagroup")
     datagroup = self.experiment_config["dataset"]["datagroups"][datagroup_key]
     utt2path_path = os.path.join(datagroup["path"], datagroup.get("utt2path", "utt2path"))
     utt2label_path = os.path.join(datagroup["path"], datagroup.get("utt2label", "utt2label"))
     utt2path = collections.OrderedDict(
         row[:2] for row in parse_space_separated(utt2path_path)
     )
     utt2label = collections.OrderedDict(
         row[:2] for row in parse_space_separated(utt2label_path)
     )
     utterance_list = list(utt2path.keys())
     if args.file_limit:
         utterance_list = utterance_list[:args.file_limit]
         if args.verbosity > 3:
             print("Using utterance ids:")
             yaml_pprint(utterance_list)
     int2label = self.experiment_config["dataset"]["labels"]
     label2int, OH = make_label2onehot(int2label)
     def label2onehot(label):
         return OH[label2int.lookup(label)]
     labels_set = set(int2label)
     paths = []
     paths_meta = []
     for utt in utterance_list:
         label = utt2label[utt]
         if label not in labels_set:
             continue
         paths.append(utt2path[utt])
         paths_meta.append((utt, label))
     if args.verbosity:
         print("Extracting test set features for prediction")
     features = self.extract_features(
         feat_config,
         "test",
         trim_audio=False,
         debug_squeeze_last_dim=(ds_config["input_shape"][-1] == 1),
     )
     conf_json, conf_checksum = config_checksum(self.experiment_config, datagroup_key)
     features = tf_data.prepare_dataset_for_training(
         features,
         ds_config,
         feat_config,
         label2onehot,
         self.model_id,
         verbosity=args.verbosity,
         conf_checksum=conf_checksum,
     )
     # drop meta wavs required only for vad
     features = features.map(lambda *t: t[:3])
     if ds_config.get("persistent_features_cache", True):
         features_cache_dir = os.path.join(self.cache_dir, "features")
     else:
         features_cache_dir = "/tmp/tensorflow-cache"
     features_cache_path = os.path.join(
         features_cache_dir,
         self.experiment_config["dataset"]["key"],
         ds,
         feat_config["type"],
         conf_checksum,
     )
     self.make_named_dir(os.path.dirname(features_cache_path), "features cache")
     if not os.path.exists(features_cache_path + ".md5sum-input"):
         with open(features_cache_path + ".md5sum-input", "w") as f:
             print(conf_json, file=f, end='')
         if args.verbosity:
             print("Writing features into new cache: '{}'".format(features_cache_path))
     else:
         if args.verbosity:
             print("Loading features from existing cache: '{}'".format(features_cache_path))
     features = features.cache(filename=features_cache_path)
     if args.verbosity:
         print("Gathering all utterance ids from features dataset iterator")
     # Gather utterance ids, this also causes the extraction pipeline to be evaluated
     utterance_ids = []
     i = 0
     if args.verbosity > 1:
         print(now_str(date=True), "- 0 samples done")
     for _, _, uttids in features.as_numpy_iterator():
         for uttid in uttids:
             utterance_ids.append(uttid.decode("utf-8"))
             i += 1
             if args.verbosity > 1 and i % 10000 == 0:
                 print(now_str(date=True), "-", i, "samples done")
     if args.verbosity > 1:
         print(now_str(date=True), "- all", i, "samples done")
     if args.verbosity:
         print("Features extracted, writing target and non-target language information for each utterance to '{}'.".format(args.trials))
     with open(args.trials, "w") as trials_f:
         for utt, target in utt2label.items():
             for lang in int2label:
                 print(lang, utt, "target" if target == lang else "nontarget", file=trials_f)
     if args.verbosity:
         print("Starting prediction with model")
     predictions = model.predict(features.map(lambda *t: t[0]))
     if args.verbosity > 1:
         print("Done predicting, model returned predictions of shape {}. Writing them to '{}'.".format(predictions.shape, args.scores))
     num_predictions = 0
     with open(args.scores, "w") as scores_f:
         print(*int2label, file=scores_f)
         for utt, pred in zip(utterance_ids, predictions):
             pred_scores = [np.format_float_positional(x, precision=args.score_precision) for x in pred]
             print(utt, *pred_scores, sep=args.score_separator, file=scores_f)
             num_predictions += 1
     if args.verbosity:
         print("Wrote {} prediction scores to '{}'.".format(num_predictions, args.scores))
Example no. 8
 def train(self):
     args = self.args
     if args.verbosity:
         print("Preparing model for training")
     training_config = self.experiment_config["experiment"]
     feat_config = self.experiment_config["features"]
     if args.verbosity > 1:
         print("Using model parameters:")
         yaml_pprint(training_config)
         print()
     if args.verbosity > 1:
         print("Using feature extraction parameters:")
         yaml_pprint(feat_config)
         print()
     if args.dataset_config:
         dataset_config = system.load_yaml(args.dataset_config)
         self.experiment_config["datasets"] = [d for d in dataset_config if d["key"] in self.experiment_config["datasets"]]
     labels = sorted(set(l for d in self.experiment_config["datasets"] for l in d["labels"]))
     label2int, OH = make_label2onehot(labels)
     def label2onehot(label):
         return OH[label2int.lookup(label)]
     if args.verbosity > 2:
         print("Generating onehot encoding from labels:", ', '.join(labels))
         print("Generated onehot encoding as tensors:")
         for l in labels:
             l = tf.constant(l, dtype=tf.string)
             tf_data.tf_print(l, "\t", label2onehot(l))
     self.model_id = training_config["name"]
     model = self.create_model(dict(training_config), args.skip_training)
     if args.verbosity > 1:
         print("Preparing model")
     model.prepare(labels, training_config)
     if args.verbosity:
         print("Using model:\n{}".format(str(model)))
     dataset = {}
     for ds in ("train", "validation"):
         if args.verbosity > 2:
             print("Dataset config for '{}'".format(ds))
             yaml_pprint(training_config[ds])
         ds_config = dict(training_config, **training_config[ds])
         del ds_config["train"], ds_config["validation"]
         summary_kwargs = dict(ds_config.get("dataset_logger", {}))
         debug_squeeze_last_dim = ds_config["input_shape"][-1] == 1
         datagroup_key = ds_config.pop("datagroup")
         conf_json, conf_checksum = config_checksum(self.experiment_config, datagroup_key)
         if args.verbosity > 2:
             print("Config md5 checksum '{}' computed from json string:".format(conf_checksum))
             print(conf_json)
         extractor_ds = self.extract_features(
             self.experiment_config["datasets"],
             json.loads(json.dumps(feat_config)),
             datagroup_key,
             summary_kwargs.pop("trim_audio", False),
             debug_squeeze_last_dim,
         )
         if ds_config.get("persistent_features_cache", True):
             features_cache_dir = os.path.join(self.cache_dir, "features")
         else:
             features_cache_dir = "/tmp/tensorflow-cache"
         features_cache_path = os.path.join(
             features_cache_dir,
             datagroup_key,
             feat_config["type"],
             conf_checksum,
         )
         self.make_named_dir(os.path.dirname(features_cache_path), "features cache")
         if not os.path.exists(features_cache_path + ".md5sum-input"):
             with open(features_cache_path + ".md5sum-input", "w") as f:
                 print(conf_json, file=f, end='')
             if args.verbosity:
                 print("Writing features into new cache: '{}'".format(features_cache_path))
         else:
             if args.verbosity:
                 print("Loading features from existing cache: '{}'".format(features_cache_path))
         extractor_ds = extractor_ds.cache(filename=features_cache_path)
         if args.exhaust_dataset_iterator:
             if args.verbosity:
                 print("--exhaust-dataset-iterator given, now iterating once over the dataset iterator to fill the features cache.")
             # This forces the extractor_ds pipeline to be evaluated, and the features being serialized into the cache
             i = 0
             if args.verbosity > 1:
                 print(now_str(date=True), "- 0 samples done")
             for i, (feats, *meta) in enumerate(extractor_ds.as_numpy_iterator(), start=1):
                 if args.verbosity > 1 and i % 10000 == 0:
                     print(now_str(date=True), "-", i, "samples done")
                 if args.verbosity > 3:
                     tf_data.tf_print("sample:", i, "features shape:", tf.shape(feats), "metadata:", *meta)
             if args.verbosity > 1:
                 print(now_str(date=True), "- all", i, "samples done")
         dataset[ds] = tf_data.prepare_dataset_for_training(
             extractor_ds,
             ds_config,
             feat_config,
             label2onehot,
             self.model_id,
             conf_checksum=conf_checksum,
             verbosity=args.verbosity,
         )
         if args.debug_dataset:
             if args.verbosity:
                 print("--debug-dataset given, iterating over the dataset to gather stats")
             if args.verbosity > 1:
                 print("Counting all unique dim sizes of elements at index 0 in dataset")
             for axis, size_counts in enumerate(count_dim_sizes(dataset[ds], 0, len(ds_config["input_shape"]) + 1)):
                 print("axis {}\n[count size]:".format(axis))
                 tf_data.tf_print(size_counts, summarize=10)
             if summary_kwargs:
                 logdir = os.path.join(os.path.dirname(model.tensorboard.log_dir), "dataset", ds)
                 if os.path.isdir(logdir):
                     if args.verbosity:
                         print("summary_kwargs available, but '{}' already exists, not iterating over dataset again".format(logdir))
                 else:
                     if args.verbosity:
                         print("Datagroup '{}' has a dataset logger defined. We will iterate over {} batches of samples from the dataset to create TensorBoard summaries of the input data into '{}'.".format(ds, summary_kwargs.get("num_batches", "'all'"), logdir))
                     self.make_named_dir(logdir)
                     writer = tf.summary.create_file_writer(logdir)
                     summary_kwargs["debug_squeeze_last_dim"] = debug_squeeze_last_dim
                     with writer.as_default():
                         logged_dataset = tf_data.attach_dataset_logger(dataset[ds], feat_config["type"], **summary_kwargs)
                         if args.verbosity:
                             print("Dataset logger attached to '{0}' dataset iterator, now exhausting the '{0}' dataset logger iterator once to write TensorBoard summaries of model input data".format(ds))
                         i = 0
                         max_outputs = summary_kwargs.get("max_outputs", 10)
                         for i, (samples, labels, *meta) in enumerate(logged_dataset.as_numpy_iterator()):
                             if args.verbosity > 1 and i % (2000//ds_config.get("batch_size", 1)) == 0:
                                 print(i, "batches done")
                             if args.verbosity > 3:
                                 tf_data.tf_print(
                                         "batch:", i,
                                         "utts", meta[0][:max_outputs],
                                         "samples shape:", tf.shape(samples),
                                         "onehot shape:", tf.shape(labels),
                                         "wav.audio.shape", meta[1].audio.shape,
                                         "wav.sample_rate[0]", meta[1].sample_rate[0])
                         if args.verbosity > 1:
                             print(i, "batches done")
                         del logged_dataset
     checkpoint_dir = self.get_checkpoint_dir()
     checkpoints = [c.name for c in os.scandir(checkpoint_dir) if c.is_file()] if os.path.isdir(checkpoint_dir) else []
     if checkpoints:
         if "checkpoints" in training_config:
             monitor_value = training_config["checkpoints"]["monitor"]
             monitor_mode = training_config["checkpoints"].get("mode")
         else:
             monitor_value = "epoch"
             monitor_mode = None
         checkpoint_path = os.path.join(checkpoint_dir, models.get_best_checkpoint(checkpoints, key=monitor_value, mode=monitor_mode))
         if args.verbosity:
             print("Loading model weights from checkpoint file '{}' according to monitor value '{}'".format(checkpoint_path, monitor_value))
         model.load_weights(checkpoint_path)
     if args.verbosity:
         print("\nStarting training")
     if args.skip_training:
         print("--skip-training given, will not call model.fit")
         return
     history = model.fit(dataset["train"], dataset["validation"], training_config)
     if args.verbosity:
         print("\nTraining finished after {} epochs at epoch {}".format(len(history.epoch), history.epoch[-1] + 1))
         print("metric:\tmin (epoch),\tmax (epoch):")
         for name, epoch_vals in history.history.items():
             vals = np.array(epoch_vals)
             print("{}:\t{:.6f} ({:d}),\t{:.6f} ({:d})".format(
                 name,
                 vals.min(),
                 vals.argmin() + 1,
                 vals.max(),
                 vals.argmax() + 1
             ))
     history_cache_dir = os.path.join(self.cache_dir, self.model_id, "history")
     now_s = now_str()
     for name, epoch_vals in history.history.items():
         history_file = os.path.join(history_cache_dir, now_s, name)
         self.make_named_dir(os.path.dirname(history_file), "training history")
         with open(history_file, "w") as f:
             for epoch, val in enumerate(epoch_vals, start=1):
                 print(epoch, val, file=f)
         if args.verbosity > 1:
             print("wrote history file '{}'".format(history_file))
Example no. 9
 def extract_features(self, datasets, config, datagroup_key, trim_audio, debug_squeeze_last_dim):
     args = self.args
     utt2path = collections.OrderedDict()
     utt2meta = collections.OrderedDict()
     if args.verbosity > 1:
         print("Extracting features from datagroup '{}'".format(datagroup_key))
         if args.verbosity > 2:
             yaml_pprint(config)
     num_utts_dropped = collections.Counter()
     for ds_config in datasets:
         if args.verbosity > 1:
             print("Dataset '{}'".format(ds_config["key"]))
         datagroup = ds_config["datagroups"][datagroup_key]
         utt2path_path = os.path.join(datagroup["path"], datagroup.get("utt2path", "utt2path"))
         utt2label_path = os.path.join(datagroup["path"], datagroup.get("utt2label", "utt2label"))
         if args.verbosity:
             print("Reading labels for utterances from utt2label file '{}'".format(utt2label_path))
         if args.verbosity > 1:
             print("Expected labels (utterances with other labels will be ignored):")
             for l in ds_config["labels"]:
                 print("  {}".format(l))
         enabled_labels = set(ds_config["labels"])
         skipped_utterances = set()
         for utt, label, *rest in parse_space_separated(utt2label_path):
             if label not in enabled_labels:
                 skipped_utterances.add(utt)
                 continue
             assert utt not in utt2meta, "duplicate utterance id found when parsing labels: '{}'".format(utt)
             utt2meta[utt] = {"label": label, "dataset": ds_config["key"], "duration_sec": -1.0}
         utt2dur_path = os.path.join(datagroup["path"], datagroup.get("utt2dur", "utt2dur"))
         if os.path.exists(utt2dur_path):
             if args.verbosity:
                 print("Reading durations from utt2dur file '{}'".format(utt2dur_path))
             for utt, duration, *rest in parse_space_separated(utt2dur_path):
                 if utt in skipped_utterances:
                     continue
                 assert utt in utt2meta, "utterance id without label found when parsing durations: '{}'".format(utt)
                 utt2meta[utt]["duration_sec"] = float(duration)
         else:
             if args.verbosity:
                 print("Skipping signal duration parse since utt2dur file '{}' does not exist".format(utt2dur_path))
         if args.verbosity:
             print("Reading paths of wav files from utt2path file '{}'".format(utt2path_path))
         for utt, path, *rest in parse_space_separated(utt2path_path):
             if utt in skipped_utterances:
                 continue
             assert utt not in utt2path, "duplicate utterance id found when parsing paths: '{}'".format(utt)
             utt2path[utt] = path
     if args.verbosity > 1:
         print("Total amount of non-empty lines read from utt2path {}, and utt2meta {}".format(len(utt2path), len(utt2meta)))
         if skipped_utterances:
             print("Utterances skipped due to unexpected labels: {}".format(len(skipped_utterances)))
     # All utterance ids must be present in both files
     assert set(utt2path) == set(utt2meta), "Mismatching sets of utterances in utt2path and utt2meta, the utterance ids must be exactly the same"
     utterance_list = list(utt2path.keys())
     if args.shuffle_utt2path or datagroup.get("shuffle_utt2path", False):
         if args.verbosity > 1:
             print("Shuffling utterance ids, all wavpaths in the utt2path list will be processed in random order.")
         random.shuffle(utterance_list)
     else:
         if args.verbosity > 1:
             print("Not shuffling utterance ids, all wavs will be processed in order of the utt2path list.")
     if args.file_limit:
         if args.verbosity > 1:
             print("--file-limit set at {0}, using at most {0} utterances from the utterance id list, starting at the beginning of utt2path".format(args.file_limit))
         utterance_list = utterance_list[:args.file_limit]
         if args.verbosity > 3:
             print("Using utterance ids:")
             yaml_pprint(utterance_list)
     paths = []
     paths_meta = []
     for utt in utterance_list:
         paths.append(utt2path[utt])
         meta = utt2meta[utt]
         paths_meta.append((utt, meta["label"], meta["dataset"], meta["duration_sec"]))
     if args.verbosity:
         print("Starting feature extraction for datagroup '{}' from {} files".format(datagroup_key, len(paths)))
         if args.verbosity > 3:
             print("All utterances:")
             for path, (utt, label, dataset, *rest) in zip(paths, paths_meta):
                 print(utt, label, dataset, sep='\t')
     # utt2label mapping is needed by the sparsespeech and kaldi parsers below
     utt2label = {utt: meta["label"] for utt, meta in utt2meta.items()}
     if config["type"] == "sparsespeech":
         seg2utt_path = os.path.join(datagroup["path"], "segmented", datagroup.get("seg2utt", "seg2utt"))
         if args.verbosity:
             print("Parsing SparseSpeech features")
             print("Reading utterance segmentation data from seg2utt file '{}'".format(seg2utt_path))
         seg2utt = collections.OrderedDict(
             row[:2] for row in parse_space_separated(seg2utt_path)
         )
         enc_path = config["sparsespeech_paths"]["output"][datagroup_key]
         feat_path = config["sparsespeech_paths"]["input"][datagroup_key]
         if args.verbosity:
             print("SparseSpeech input: '{}' and encoding: '{}'".format(feat_path, enc_path))
         feat = tf_data.parse_sparsespeech_features(config, enc_path, feat_path, seg2utt, utt2label)
     elif config["type"] == "kaldi":
         feat_conf = dict(config["datagroups"][datagroup_key])
         kaldi_feats_scp = feat_conf.pop("features_path")
         expected_shape = feat_conf.pop("shape")
         if args.verbosity:
             print("Parsing Kaldi features from '{}' with expected shape {}".format(kaldi_feats_scp, expected_shape))
         feat = tf_data.parse_kaldi_features(utterance_list, kaldi_feats_scp, utt2label, expected_shape, feat_conf)
     else:
         feat = tf_data.extract_features_from_paths(
             config,
             paths,
             paths_meta,
             datagroup_key,
             trim_audio=trim_audio,
             debug_squeeze_last_dim=debug_squeeze_last_dim,
             verbosity=args.verbosity,
         )
     return feat
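
The utt2path, utt2label and utt2dur files parsed above follow the Kaldi-style convention of one utterance id per line followed by its value. parse_space_separated is assumed to yield the whitespace-split fields of each non-empty line, roughly:

def parse_space_separated(path):
    # Assumed behaviour: yield each non-empty line split on whitespace, e.g.
    # "utt-001 /data/wav/utt-001.wav" -> ["utt-001", "/data/wav/utt-001.wav"].
    with open(path) as f:
        for line in f:
            fields = line.strip().split()
            if fields:
                yield fields
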
Example no. 10
def extract_features_from_paths(feat_config,
                                paths,
                                meta,
                                datagroup_key,
                                trim_audio=None,
                                debug_squeeze_last_dim=False,
                                verbosity=0):
    paths, meta = list(paths), [m[:3] for m in meta]
    assert len(paths) == len(
        meta
    ), "Cannot extract features from paths when the amount of metadata {} does not match the amount of wavfile paths {}".format(
        len(meta), len(paths))
    wav_config = feat_config.get("wav_config")
    if wav_config:
        dataset_types = ((tf.float32, tf.int32), tf.string, tf.string)
        dataset_shapes = ((tf.TensorShape([None]), tf.TensorShape([])),
                          tf.TensorShape([]), tf.TensorShape([]))
        if "chunks" in wav_config:
            chunk_loader_fn = get_chunk_loader(wav_config, verbosity,
                                               datagroup_key)

            def ds_generator(*args):
                return tf.data.Dataset.from_generator(chunk_loader_fn,
                                                      dataset_types,
                                                      dataset_shapes,
                                                      args=args)

            paths_t = tf.constant(paths, tf.string)
            meta_t = tf.constant(meta, tf.string)
            wavs = (
                tf.data.Dataset.from_tensor_slices((paths_t, meta_t)).
                interleave(
                    ds_generator,
                    # Hide IO latency from reading wav files by using several workers per CPU
                    # The exact amount of workers is chosen by TensorFlow due to autotune, but this will be the maximum
                    cycle_length=wav_config.get("workers_per_cpu", 16) *
                    len(os.sched_getaffinity(0)),
                    num_parallel_calls=TF_AUTOTUNE))
        else:
            print("unknown, non-empty wav_config given:")
            yaml_pprint(wav_config)
            raise NotImplementedError
        wavs = wavs.map(lambda wav, *meta:
                        (audio_feat.Wav(wav[0], wav[1]), *meta))
    else:
        wav_paths = tf.data.Dataset.from_tensor_slices(
            (tf.constant(paths,
                         dtype=tf.string), tf.constant(meta, dtype=tf.string)))
        load_wav_with_meta = lambda path, *meta: (load_wav(path), *meta)
        wavs = wav_paths.map(load_wav_with_meta,
                             num_parallel_calls=TF_AUTOTUNE)
    if "batch_wavs_by_length" in feat_config:
        window_size = feat_config["batch_wavs_by_length"]["max_batch_size"]
        if verbosity:
            print(
                "Batching all wavs by equal length into batches of max size {}"
                .format(window_size))
        key_fn = lambda wav, *meta: tf.cast(tf.size(wav.audio), tf.int64)
        reduce_fn = lambda key, group_ds: group_ds.batch(window_size)
        group_by_wav_length = tf.data.experimental.group_by_window(
            key_fn, reduce_fn, window_size)
        wavs_batched = wavs.apply(group_by_wav_length)
    else:
        batch_size = feat_config.get("batch_size", 1)
        if verbosity:
            print("Batching wavs with batch size", batch_size)
        wavs_batched = wavs.batch(batch_size)
    if verbosity:
        print("Applying feature extractor to batched wavs")
    feat_extract_args = feat_extraction_args_as_list(feat_config)
    # This function expects batches of wavs
    extract_feats = lambda wavs, *meta: (extract_features(
        wavs, *feat_extract_args), (*meta, wavs))
    features = wavs_batched.map(extract_feats, num_parallel_calls=TF_AUTOTUNE)
    if "mean_var_norm_numpy" in feat_config:
        window_len = tf.constant(
            feat_config["mean_var_norm_numpy"]["window_len"], tf.int32)
        normalize_variance = tf.constant(
            feat_config["mean_var_norm_numpy"].get("normalize_variance", True),
            tf.bool)
        if verbosity:
            tf_print(
                "Using numpy to apply mean_var_norm sliding window of length",
                window_len, "without padding. Will also normalize variance:",
                normalize_variance)

        def apply_mean_var_norm_numpy(feats, *rest):
            normalized = tf.numpy_function(
                mean_var_norm_nopad_slide_numpy,
                [feats, window_len, normalize_variance], feats.dtype)
            normalized.set_shape(feats.shape.as_list())
            return (normalized, *rest)

        features = features.map(apply_mean_var_norm_numpy,
                                num_parallel_calls=TF_AUTOTUNE)
    features = features.unbatch()
    return features
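
The group_by_window construct above batches together only signals whose sample counts are exactly equal, since the grouping key is the wav length itself; a minimal, self-contained illustration with toy tensors:

import tensorflow as tf

# Toy dataset of 1-D "signals" with two distinct lengths.
signals = [tf.zeros(4), tf.ones(4), tf.zeros(8), tf.ones(8), 2.0 * tf.ones(4)]
ds = tf.data.Dataset.from_generator(
    lambda: signals, output_signature=tf.TensorSpec([None], tf.float32))

max_batch_size = 2
key_fn = lambda x: tf.cast(tf.size(x), tf.int64)
reduce_fn = lambda key, group: group.batch(max_batch_size)
ds = ds.apply(tf.data.experimental.group_by_window(key_fn, reduce_fn, max_batch_size))

for batch in ds:
    # Every batch contains only signals of one length, e.g. (2, 4) or (2, 8).
    print(batch.shape)
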
Example no. 11
def prepare_dataset_for_training(ds,
                                 config,
                                 feat_config,
                                 label2onehot,
                                 model_id,
                                 conf_checksum='',
                                 verbosity=0):
    if "frames" in config:
        raise NotImplementedError("todo")
        if verbosity:
            print("Dividing features time dimension into frames")
        assert "convert_to_images" not in config, "todo, time dim random chunks for image data"
        # frame_axis = 1 if "convert_to_images" in config else 0
        # if frame_axis == 1:
        # ds = ds.map(lambda f, *meta: (tf.transpose(f, perm=(1, 0, 2, 3)), *meta))
        if config["frames"].get("random", False):
            if verbosity:
                print("Dividing features time dimension randomly")
            assert isinstance(
                config["frames"]["length"], dict
            ), "key 'frames.length' must map to a dict type when doing random chunking of frames"
            frame_chunker_fn = make_random_frame_chunker_fn(
                config["frames"]["length"])
            chunk_timedim_randomly = lambda f, *meta: (frame_chunker_fn(f),
                                                       *meta)
            ds = ds.map(chunk_timedim_randomly, num_parallel_calls=TF_AUTOTUNE)
        else:
            if verbosity:
                print(
                    "Dividing features time dimension into fixed length chunks"
                )
            # Extract frames from all features, using the same metadata for each frame of one sample of features
            seq_len = config["frames"]["length"]
            seq_step = config["frames"]["step"]
            pad_zeros = config["frames"].get("pad_zeros", False)
            to_frames = lambda feats, *meta: (tf.signal.frame(
                feats, seq_len, seq_step, pad_end=pad_zeros, axis=0), *meta)
            ds = ds.map(to_frames)
        if config["frames"].get("flatten", True):

            def _unbatch_ragged_frames(frames, *meta):
                frames_ds = tf.data.Dataset.from_tensor_slices(frames)
                inf_repeated_meta_ds = [
                    tf.data.Dataset.from_tensors(m).repeat() for m in meta
                ]
                return tf.data.Dataset.zip((frames_ds, *inf_repeated_meta_ds))

            ds = ds.flat_map(_unbatch_ragged_frames)
        ds = ds.filter(lambda frames, *meta: tf.shape(frames)[0] > 0)
        if "normalize" in config["frames"]:
            axis = config["frames"]["normalize"]["axis"]
            if verbosity:
                print("Normalizing means frame-wise over axis {}".format(axis))

            def normalize_frames(frames, *meta):
                return (frames -
                        tf.math.reduce_mean(frames, axis=axis, keepdims=True),
                        *meta)

            ds = ds.map(normalize_frames)
    # Transform dataset such that 2 first elements will always be (sample, onehot_label) and rest will be metadata that can be safely dropped when training starts
    to_model_input = lambda feats, meta: (feats, label2onehot(meta[1]), meta[
        0], *meta[2:])
    ds = ds.map(to_model_input)
    if "min_shape" in config:
        if verbosity:
            print("Filtering features by minimum shape", config["min_shape"])
        ds = filter_with_min_shape(ds, config["min_shape"])
    shuffle_buffer_size = config.get("shuffle_buffer",
                                     {"before_cache": 0})["before_cache"]
    if shuffle_buffer_size:
        if verbosity:
            print("Shuffling features with shuffle buffer size",
                  shuffle_buffer_size)
        ds = ds.shuffle(shuffle_buffer_size)
    if "padded_batch" in config:
        pad_kwargs = config["padded_batch"]["kwargs"]
        if verbosity:
            print("Batching features with padded batch kwargs:")
            yaml_pprint(pad_kwargs)
        pad_kwargs["padded_shapes"] = tuple(pad_kwargs["padded_shapes"])
        pad_kwargs["padding_values"] = tuple(
            tf.constant(float(val), dtype=tf.float32)
            for val in pad_kwargs["padding_values"])
        ds = without_metadata(ds).padded_batch(**pad_kwargs)
    elif "batch_size" in config:
        if verbosity:
            print("Batching features with batch size", config["batch_size"])
        ds = ds.batch(config["batch_size"], drop_remainder=True)
    if "bucket_by_sequence_length" in config:
        if verbosity:
            print(
                "Batching features by bucketing samples into fixed length, padded sequence length buckets"
            )
        seq_len_fn = lambda feats, meta: tf.shape(feats)[0]
        bucket_conf = config["bucket_by_sequence_length"]
        bucket_boundaries = np.linspace(bucket_conf["bins"]["min"],
                                        bucket_conf["bins"]["max"],
                                        bucket_conf["bins"]["num"],
                                        dtype=np.int32)
        bucket_batch_sizes = [
            1
        ] + (len(bucket_boundaries) - 1) * [bucket_conf["batch_size"]] + [1]
        bucketing_fn = tf.data.experimental.bucket_by_sequence_length(
            seq_len_fn, bucket_boundaries, bucket_batch_sizes,
            **bucket_conf.get("kwargs", {}))
        ds = ds.apply(bucketing_fn)
    elif "group_by_sequence_length" in config:
        max_batch_size = tf.constant(
            config["group_by_sequence_length"]["max_batch_size"], tf.int64)
        if verbosity:
            tf_print(
                "Grouping samples by sequence length into batches of max size",
                max_batch_size)
        get_seq_len = lambda feat, meta: tf.cast(tf.shape(feat)[0], tf.int64)
        group_to_batch = lambda key, group: group.batch(max_batch_size)
        ds = ds.apply(
            tf.data.experimental.group_by_window(get_seq_len,
                                                 group_to_batch,
                                                 window_size=max_batch_size))
        if "min_batch_size" in config["group_by_sequence_length"]:
            min_batch_size = config["group_by_sequence_length"][
                "min_batch_size"]
            if verbosity:
                print("Dropping batches smaller than min_batch_size",
                      min_batch_size)
            min_batch_size = tf.constant(min_batch_size, tf.int32)
            ds = ds.filter(lambda batch, meta:
                           (tf.shape(batch)[0] >= min_batch_size))
    if config.get("copy_cache_to_tmp", False):
        tmp_cache_path = "/tmp/tensorflow-cache/{}/training-prepared_{}_{}".format(
            model_id, int(time.time()), conf_checksum)
        if verbosity:
            print("Caching prepared dataset iterator to '{}'".format(
                tmp_cache_path))
        os.makedirs(os.path.dirname(tmp_cache_path), exist_ok=True)
        ds = ds.cache(filename=tmp_cache_path)
        cache_shuffle_buffer_size = config.get(
            "shuffle_buffer", {"after_cache": 0})["after_cache"]
        if cache_shuffle_buffer_size:
            if verbosity:
                print("Shuffling cached features with shuffle buffer size",
                      cache_shuffle_buffer_size)
            ds = ds.shuffle(cache_shuffle_buffer_size)
    # assume autotuned prefetch (turned off when config["prefetch"] is None)
    if "prefetch" not in config:
        if verbosity:
            print("Using autotune value", TF_AUTOTUNE,
                  "for prefetching batches")
        ds = ds.prefetch(TF_AUTOTUNE)
    elif config["prefetch"] is not None:
        if verbosity:
            print("Using fixed size prefetch value", config["prefetch"])
        ds = ds.prefetch(config["prefetch"])
    return ds
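
Which batching branch runs above is decided purely by which keys are present in config; a hypothetical config enabling padded batching, a pre-cache shuffle and the autotuned prefetch default (all values are placeholders) would look like:

# Hypothetical subset of the training pipeline config read by
# prepare_dataset_for_training above; values are placeholders.
example_config = {
    "shuffle_buffer": {"before_cache": 10000, "after_cache": 0},
    "padded_batch": {
        "kwargs": {
            "batch_size": 32,
            # (features, onehot_label); metadata is dropped by without_metadata
            "padded_shapes": ([None, 40], [None]),
            "padding_values": [0.0, 0.0],
        },
    },
    # "prefetch" omitted on purpose: the code then falls back to TF_AUTOTUNE.
}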