def create_preprocessor(hyper_config, destination, **kwargs):
    """
    Export the audio preprocessor as a TFLite model and smoke-test the result.

    The preprocessor is configured with the first hyper-parameter combination
    (iteration 0), stored as a SavedModel in a "preprocessor" directory next to
    `destination`, converted to TFLite and written to `destination`. The
    written file is then loaded into a TFLite interpreter and invoked on random
    input; tensor details, shapes, the output and the duration of one
    invocation are logged.

    Args:
        hyper_config: str
            path of the hyper-parameter configuration file
        destination: str
            path of the TFLite file to create
        **kwargs:
            not used by this function
    """
    hyper_parameter_list = HyperParameterList(config_file_name=hyper_config)
    hparam_values = hyper_parameter_list.get_values(iteration_no=0)
    working_directory = dirname(destination)

    preprocess = PreprocessAudio(hparams=hparam_values, name="dsl_audio_preprocessor")
    # Call the preprocessor once on random (1, 16000) data before exporting it
    # (presumably needed so the export has a traced graph - TODO confirm).
    sample_audio = tf.convert_to_tensor(np.array(np.random.random_sample((1, 16000)), dtype=np.float32),
                                        dtype=tf.float32)
    preprocess.preprocess(sample_audio)

    # ATTENTION: antialias is not supported in tflite
    tmp_save_path = os.path.join(working_directory, "preprocessor")
    os.makedirs(tmp_save_path, exist_ok=True)
    tf.saved_model.save(preprocess, tmp_save_path)

    converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir=tmp_save_path)
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
                                           tf.lite.OpsSet.SELECT_TF_OPS]
    converter.experimental_new_converter = True
    tflite_quant_model = converter.convert()
    # Close the file explicitly so the model is completely on disk before the
    # interpreter below reads it back.
    with open(destination, "wb") as tflite_file:
        tflite_file.write(tflite_quant_model)

    interpreter = tf.lite.Interpreter(model_path=destination)
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    log.info(input_details)
    log.info(output_details)

    interpreter.allocate_tensors()

    # First, untimed invocation on random (1, 16000) input.
    interpreter.set_tensor(input_details[0]['index'],
                           tf.convert_to_tensor(np.array(np.random.random_sample((1, 16000)), dtype=np.float32),
                                                dtype=tf.float32))
    interpreter.invoke()
    interpreter.get_tensor(output_details[0]['index'])

    # Test model on random input data.
    input_shape = input_details[0]['shape']
    log.info("input shape:")
    log.info(input_shape)
    log.info("output shape:")
    log.info(output_details[0]['shape'])
    input_data = np.array(np.random.random_sample(input_shape), dtype=np.float32)
    interpreter.set_tensor(input_details[0]['index'], input_data)
    start_time = time.time()
    interpreter.invoke()
    stop_time = time.time()
    output_data = interpreter.get_tensor(output_details[0]['index'])

    log.info(output_data)
    log.info('time: {:.3f}ms'.format((stop_time - start_time) * 1000))
    log.info("Finished creating the TFLite preprocessor")
Exemple #2
0
    def __init__(self,
                 hy_params: HyperParameterList,
                 input_shape: tuple,
                 data_classes,
                 run_id: int,
                 run_dir: str = None,
                 use_ram: bool = True
                 ):
        """
        Store the run configuration and select the metrics of the model
        Args:
            hy_params: HyperParameterList
                parameters for gridsearch; the values of combination `run_id` are used
            input_shape: tuple
                size of input of the model
            data_classes:
                classes of the data. For categorical prediction one recall and
                one precision metric is created per class
            run_id: int
                index of the hyper parameter combination of this run
            run_dir: str (optional)
                log directory of tensorboard Default: None
            use_ram: bool (optional)
                If enabled, the whole train data set will be saved in memory.
                Otherwise only the current batch will be loaded to memory. Default: True
        Raises:
            ValueError: the configured prediction type is neither
                'categorical' nor 'regression'
        """
        self._run_id = run_id
        self.run_dir = run_dir
        self.use_ram = use_ram
        self.input_shape = input_shape
        self.data_classes = data_classes
        self.verbose = 0
        self.confusion_matrix = None

        self.hy_params = hy_params.get_values(iteration_no=self._run_id)
        self.hy_params_tb = hy_params.get_values_tensorboard(iteration_no=self._run_id)

        # 'categorical' is the fallback when the hyper parameters do not name a type
        if 'prediction_type' in self.hy_params:
            self.prediction_type = self.hy_params['prediction_type']
        else:
            self.prediction_type = 'categorical'

        if self.prediction_type == 'regression':
            self._metrics = [keras.metrics.MeanAbsoluteError(name="mae"),
                             keras.metrics.RootMeanSquaredError(name="rmse"),
                             keras.metrics.MeanSquaredError(name="mse")]
        elif self.prediction_type == 'categorical':
            metrics = [keras.metrics.Precision(name="precision"),
                       keras.metrics.Recall(name="recall"),
                       categorical_accuracy]
            # per-class metrics: all recalls first, then all precisions
            metrics += [keras.metrics.Recall(name="recall_class_" + str(class_id), class_id=class_id)
                        for class_id in range(len(self.data_classes))]
            metrics += [keras.metrics.Precision(name="precision_class_" + str(class_id), class_id=class_id)
                        for class_id in range(len(self.data_classes))]
            self._metrics = metrics
        else:
            raise ValueError('prediction_type "' + self.prediction_type + '" not implemented')
def predict(ctx, model_dir, data_dir, class_config, hyper_config, **kwargs):
    """
    Predict the class of all wav files below `data_dir` with every matching model.

    For each model file matching `model_dir`, the chunk-wise class
    probabilities and the predicted class are written to
    `<model parent directory>/test/test/test.chunks.predictions.csv`.

    Args:
        ctx:
            context object; `ctx.obj['verbose']` is passed on to `model.predict`
        model_dir: str
            glob pattern of the model files. The number after the last "_" in
            the name of a model file's parent directory selects the
            hyper-parameter combination
        data_dir: str
            directory that is searched recursively for wav files
        class_config: str
            path of the JSON file defining the data classes
        hyper_config: str
            path of the hyper-parameter configuration file
        **kwargs:
            not used by this function
    Raises:
        ValueError: the class configuration contains no data classes
    """
    verbose = ctx.obj['verbose']
    with open(class_config) as f:
        data_classes = json.load(f)

    # Validate before the classes are used to build the placeholder labels.
    if data_classes is None:
        raise ValueError('no data classes defined')

    target_names = list(data_classes)

    data_dir = os.path.join(data_dir, '')

    wav_files = sorted(glob.glob(f'{data_dir}/**/*.wav', recursive=True))
    filenames = [os.path.relpath(wav_file, start=data_dir) for wav_file in wav_files]
    # No ground truth is given here: every file gets the first class as label.
    labels = [target_names[0]] * len(wav_files)
    duration_frames = []
    for fn in filenames:
        # NOTE(review): `sr` of the last loaded file is used below for all
        # chunk times - assumes all files share one sample rate; confirm.
        y, sr = librosa.load(os.path.join(data_dir, fn), sr=None)
        duration_frames.append(y.shape[0])

    log.info('Found %d wav files' % len(filenames))

    hyper_parameter_list = HyperParameterList(config_file_name=hyper_config)
    log.info("Search within rule: " + model_dir)
    model_dir_list = glob.glob(model_dir)
    log.info("Found "+ str(len(model_dir_list)) + " files")

    dataset_name = 'test'

    for model_filename in model_dir_list:
        log.info("Load " + model_filename)
        parent = Path(model_filename).parent

        result_dir = os.path.join(parent, "test")
        # the parent directory name ends with "_<iteration number>"
        iteration_no = int(parent.name.split("_")[-1])

        log.info('--- Testing trial: %s' % iteration_no)
        hparam_values = hyper_parameter_list.get_values(iteration_no=iteration_no)
        log.info(hparam_values)

        test_data = pd.DataFrame({'filename': filenames, 'label': labels, 'duration_frames': duration_frames})

        print("Loading model: " + model_filename)
        model = tf.keras.models.load_model(model_filename,
                                           custom_objects={'AugmentableModel': AugmentableModel, 'ARelu': ARelu},
                                           compile=False)
        model.set_hyper_parameters(hparam_values)
        log.info("Successfully loaded model: " + model_filename)

        dataset_result_dir = os.path.join(result_dir, dataset_name)
        os.makedirs(dataset_result_dir, exist_ok=True)

        data_pipeline = DataPipeline(name=dataset_name+'_data_set', data_classes=data_classes,
                                     enable_gpu=True, verbose=True, enable_augmentation=False,
                                     hparams=hparam_values, run_id=iteration_no)
        data_pipeline.set_data(test_data)
        data_pipeline.set_filename_prepend(prepend_filename_str=data_dir)
        data_pipeline.preprocess()
        filename_list = data_pipeline.filenames
        dataset = data_pipeline.pipeline(cache=False, shuffle=False, drop_remainder=False)

        X_probs = model.predict(x=dataset, verbose=verbose)
        predicted_ids = tf.argmax(X_probs, axis=1).numpy()

        # column 0 of filename_list is used as file name, column 1 as the
        # chunk offset that is divided by the sample rate
        df = pd.DataFrame(data=filename_list[...,0], columns=["filename"])
        df['filename'] = df['filename'].apply(os.path.basename)
        df['time'] = [int(offset)/sr for offset in filename_list[...,1]]
        for i, target in enumerate(target_names):
            df[f'prob_{target}'] = X_probs[:, i]
        df['prediction'] = [target_names[class_id] for class_id in predicted_ids]

        df.to_csv(os.path.join(dataset_result_dir, dataset_name+".chunks.predictions.csv"), index=False)

        log.info("Finished testing")
Exemple #4
0
def devel_test(model_dir, data_dir, class_config, hyper_config, label_file,
               **kwargs):
    """
    Evaluate every model matching `model_dir` on the devel and test partition.

    For each model and partition, confusion matrix, classification report and
    UAR (macro-averaged recall) are logged twice: per chunk and per file
    (majority vote over the chunks of a file). Predictions are written as CSV
    and metrics as JSON to
    `<model parent directory>/evaluation/<partition>/`.

    Args:
        model_dir: str
            glob pattern of the model files. The number after the last "_" in
            the name of a model file's parent directory selects the
            hyper-parameter combination
        data_dir: str
            directory that is prepended to the file names of the label file
        class_config: str
            path of the JSON file defining the data classes
        hyper_config: str
            path of the hyper-parameter configuration file
        label_file: str
            path of the label file handed to the label parser
        **kwargs:
            not used by this function
    Raises:
        ValueError: no data classes are defined, or the 'label_parser' hyper
            parameter is not of the form path.to.parser_file.py:ParserClass
    """
    with open(class_config) as f:
        data_classes = json.load(f)

    if data_classes is None:
        raise ValueError('no data classes defined')

    data_dir = os.path.join(data_dir, '')

    target_names = list(data_classes)
    # Fixed label set: keeps reports and confusion matrices at full size and
    # avoids the classification_report error when a class does not occur.
    label_ids = list(range(len(target_names)))

    hyper_parameter_list = HyperParameterList(config_file_name=hyper_config)

    log.info("Search by rule: " + model_dir)
    model_dir_list = glob.glob(model_dir)
    log.info("Found " + str(len(model_dir_list)) + " files")

    for model_filename in model_dir_list:
        log.info("Load " + model_filename)
        parent = Path(model_filename).parent

        result_dir = os.path.join(parent, "evaluation")

        # the parent directory name ends with "_<iteration number>"
        iteration_no = int(parent.name.split("_")[-1])

        log.info('--- Testing trial: %s' % iteration_no)
        hparam_values = hyper_parameter_list.get_values(
            iteration_no=iteration_no)
        log.info(hparam_values)

        label_parser_key = hparam_values['label_parser']

        if ":" not in label_parser_key:
            raise ValueError(
                'Please provide the parser in the following format: path.to.parser_file.py:ParserClass'
            )

        # Load the parser class from its source file; the path is resolved
        # relative to the directory of this file.
        log.info(f'Using custom external parser: {label_parser_key}')
        path, class_name = label_parser_key.split(':')
        module_name = os.path.splitext(os.path.basename(path))[0]
        dir_path = os.path.dirname(os.path.realpath(__file__))
        path = os.path.join(dir_path, path)
        spec = importlib.util.spec_from_file_location(module_name, path)
        parser_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(parser_module)
        parser_class = getattr(parser_module, class_name)

        parser = parser_class(file_path=label_file)
        _, devel_data, test_data = parser.parse_labels()
        log.info("Successfully parsed labels: " + label_file)
        model = tf.keras.models.load_model(model_filename,
                                           custom_objects={
                                               'AugmentableModel':
                                               AugmentableModel,
                                               'ARelu': ARelu
                                           },
                                           compile=False)
        model.set_hyper_parameters(hparam_values)
        log.info("Successfully loaded model: " + model_filename)

        for dataset_name, data_raw in (("devel", devel_data),
                                       ("test", test_data)):
            log.info("===== Dataset Partition: " + dataset_name)

            dataset_result_dir = os.path.join(result_dir, dataset_name)

            os.makedirs(dataset_result_dir, exist_ok=True)

            data_pipeline = DataPipeline(name=dataset_name + '_data_set',
                                         data_classes=data_classes,
                                         enable_gpu=True,
                                         verbose=True,
                                         enable_augmentation=False,
                                         hparams=hparam_values,
                                         run_id=iteration_no)
            data_pipeline.set_data(data_raw)
            data_pipeline.set_filename_prepend(prepend_filename_str=data_dir)
            data_pipeline.preprocess()
            filename_list = data_pipeline.filenames
            dataset = data_pipeline.pipeline(cache=False,
                                             shuffle=False,
                                             drop_remainder=False)

            ###### chunks #######

            X_pred = model.predict(x=dataset)
            true_categories = tf.concat([y for _, y in dataset], axis=0)

            # reduce model outputs and targets to class indices
            X_pred = tf.argmax(X_pred, axis=1)
            X_pred_ny = X_pred.numpy()

            true_categories = tf.argmax(true_categories, axis=1)
            true_np = true_categories.numpy()

            cm = tf.math.confusion_matrix(true_categories,
                                          X_pred,
                                          num_classes=len(target_names)).numpy()
            log.info("Confusion Matrix (chunks):")
            log.info(cm)

            log.info(
                classification_report(y_true=true_np,
                                      y_pred=X_pred_ny,
                                      labels=label_ids,
                                      target_names=target_names,
                                      digits=4))

            recall = recall_score(y_true=true_np,
                                  y_pred=X_pred_ny,
                                  average='macro')
            log.info("UAR: " + str(recall * 100))

            json_cm_dir = os.path.join(dataset_result_dir,
                                       dataset_name + ".chunks.metrics.json")
            with open(json_cm_dir, 'w') as f:
                json.dump(
                    {
                        "cm": cm.tolist(),
                        "uar": round(recall * 100, 4)
                    }, f)

            X_pred_pd = pd.DataFrame(data=X_pred_ny, columns=["prediction"])
            pd_filename_list = pd.DataFrame(data=filename_list[..., 0],
                                            columns=["filename"])

            df = pd_filename_list.join(X_pred_pd, how='outer')
            df['filename'] = df['filename'].apply(os.path.basename)

            df.to_csv(os.path.join(dataset_result_dir,
                                   dataset_name + ".chunks.predictions.csv"),
                      index=False)

            ###### grouped #######

            # majority vote: the most frequent chunk prediction of a file
            grouped_data = df.groupby(
                'filename',
                as_index=False).agg(lambda x: Counter(x).most_common(1)[0][0])
            grouped_data.to_csv(os.path.join(
                dataset_result_dir, dataset_name + ".grouped.predictions.csv"),
                                index=False)
            grouped_X_pred = grouped_data.values[..., 1].tolist()

            # the true label of a file is derived the same way from the
            # labels of its chunks
            pd_filename_list = pd.DataFrame(data=filename_list[..., 0],
                                            columns=["filename"])
            true_pd = pd.DataFrame(data=true_np, columns=["label"])
            df = pd_filename_list.join(true_pd, how='outer')
            df['filename'] = df['filename'].apply(os.path.basename)
            data_raw_labels = df.groupby(
                'filename',
                as_index=False).agg(lambda x: Counter(x).most_common(1)[0][0])

            grouped_true = data_raw_labels.values[..., 1].tolist()
            cm = confusion_matrix(grouped_true,
                                  grouped_X_pred,
                                  labels=label_ids)
            log.info("Confusion Matrix (grouped):")
            log.info(cm)

            log.info(
                classification_report(y_true=grouped_true,
                                      y_pred=grouped_X_pred,
                                      labels=label_ids,
                                      target_names=target_names,
                                      digits=4))

            recall = recall_score(y_true=grouped_true,
                                  y_pred=grouped_X_pred,
                                  average='macro')
            log.info("UAR: " + str(recall * 100))

            json_cm_dir = os.path.join(dataset_result_dir,
                                       dataset_name + ".grouped.metrics.json")
            with open(json_cm_dir, 'w') as f:
                json.dump({
                    "cm": cm.tolist(),
                    "uar": round(recall * 100, 4)
                }, f)
Exemple #5
0
def train(model_dir, data_dir, class_config, hyper_config, label_file, disable_cache, **kwargs):
    """
    Train one model per hyper-parameter combination (grid search).

    If the environment variable SLURM_ARRAY_TASK_ID is set, only the
    combination with that index is trained. Tensorboard logs are written to
    `<model_dir>/logs/<tb_experiment>/<run identifier>` and models are saved
    to `<model_dir>/models/<tb_experiment>/<run identifier>`.

    Args:
        model_dir: str
            base directory for logs and saved models
        data_dir: str
            directory that is prepended to the file names of the label file
        class_config: str
            path of the JSON file defining the data classes
        hyper_config: str
            path of the hyper-parameter configuration file
        label_file: str
            path of the label file handed to the label parser
        disable_cache: bool
            if True, the data pipelines are built with cache=False
        **kwargs:
            not used by this function
    Raises:
        ValueError: no data classes are defined, the slurm job id is out of
            bound, the 'label_parser' hyper parameter is malformed or the
            'model_name' hyper parameter names an unknown model
    """
    import tensorflow as tf
    from tensorboard.plugins.hparams import api as hp
    import numpy as np
    # the submodule has to be imported explicitly to be available for sure
    import importlib.util
    from deepspectrumlite import HyperParameterList, TransferBaseModel, DataPipeline, \
        METRIC_ACCURACY, METRIC_MAE, METRIC_RMSE, METRIC_RECALL, METRIC_PRECISION, METRIC_F_SCORE

    enable_cache = not disable_cache
    data_dir = os.path.join(data_dir, '') # add trailing slash

    with open(class_config) as f:
        data_classes = json.load(f)

    if data_classes is None:
        raise ValueError('no data classes defined')

    available_ai_models = {
        'TransferBaseModel': TransferBaseModel
    }

    tensorboard_initialised = False

    log.info("Physical devices:")
    log.info(tf.config.experimental.list_physical_devices('GPU'))

    hyper_parameter_list = HyperParameterList(config_file_name=hyper_config)

    max_iterations = hyper_parameter_list.get_max_iteration()
    log.info('Loaded hyperparameter configuration.')
    log.info("Recognised combinations of settings: " + str(max_iterations) + "")

    slurm_jobid = os.getenv('SLURM_ARRAY_TASK_ID')

    if slurm_jobid is None:
        iterations = range(max_iterations)
    else:
        slurm_jobid = int(slurm_jobid)

        if not 0 <= slurm_jobid < max_iterations:
            raise ValueError('slurm jobid ' + str(slurm_jobid) + ' is out of bound')

        # a slurm array task trains only its own combination
        iterations = [slurm_jobid]

    for iteration_no in iterations:
        hparam_values = hyper_parameter_list.get_values(iteration_no=iteration_no)
        hparam_values_tensorboard = hyper_parameter_list.get_values_tensorboard(iteration_no=iteration_no)

        run_identifier = hparam_values['tb_run_id'] + '_config_' + str(iteration_no)

        tensorboard_dir = hparam_values['tb_experiment']

        log_dir = os.path.join(model_dir, 'logs', tensorboard_dir)
        run_log_dir = os.path.join(log_dir, run_identifier)
        # `model_dir` itself must stay untouched: it is the base of the
        # paths of every iteration.
        run_model_dir = os.path.join(model_dir, 'models', tensorboard_dir, run_identifier)
        # delete old log
        if os.path.isdir(run_log_dir):
            shutil.rmtree(run_log_dir)

        if not tensorboard_initialised:
            # create tensorboard
            with tf.summary.create_file_writer(log_dir).as_default():
                hp.hparams_config(
                    hparams=hyper_parameter_list.get_hparams(),
                    metrics=[hp.Metric(METRIC_ACCURACY, display_name='accuracy'),
                             hp.Metric(METRIC_PRECISION, display_name='precision'),
                             hp.Metric(METRIC_RECALL, display_name='unweighted recall'),
                             hp.Metric(METRIC_F_SCORE, display_name='f1 score'),
                             hp.Metric(METRIC_MAE, display_name='mae'),
                             hp.Metric(METRIC_RMSE, display_name='rmse')
                             ],
                )
                tensorboard_initialised = True

        # Use a label file parser to load data
        label_parser_key = hparam_values['label_parser']

        if ":" not in label_parser_key:
            raise ValueError('Please provide the parser in the following format: path.to.parser_file.py:ParserClass')

        # Load the parser class from its source file; the path is resolved
        # relative to the directory of this file.
        log.info(f'Using custom external parser: {label_parser_key}')
        path, class_name = label_parser_key.split(':')
        module_name = os.path.splitext(os.path.basename(path))[0]
        dir_path = os.path.dirname(os.path.realpath(__file__))
        path = os.path.join(dir_path, path)
        spec = importlib.util.spec_from_file_location(module_name, path)
        parser_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(parser_module)
        parser_class = getattr(parser_module, class_name)

        parser = parser_class(file_path=label_file)
        train_data, devel_data, test_data = parser.parse_labels()

        # reset seed values to make keras reproducible
        np.random.seed(0)
        tf.compat.v1.set_random_seed(0)

        log.info('--- Starting trial: %s' % run_identifier)
        log.info({h.name: hparam_values_tensorboard[h] for h in hparam_values_tensorboard})

        log.info("Load data pipeline ...")

        ########### TRAIN DATA ###########
        train_data_pipeline = DataPipeline(name='train_data_set', data_classes=data_classes,
                                           enable_gpu=True, verbose=True, enable_augmentation=False,
                                           hparams=hparam_values, run_id=iteration_no)
        train_data_pipeline.set_data(train_data)
        train_data_pipeline.set_filename_prepend(prepend_filename_str=data_dir)
        train_data_pipeline.preprocess()
        train_data_pipeline.up_sample()
        train_dataset = train_data_pipeline.pipeline(cache=enable_cache)

        # NOTE(review): unlike the train pipeline, the devel and test
        # pipelines are built without calling preprocess() - confirm that
        # this is intended.

        ########### DEVEL DATA ###########
        devel_data_pipeline = DataPipeline(name='devel_data_set', data_classes=data_classes,
                                           enable_gpu=True, verbose=True, enable_augmentation=False,
                                           hparams=hparam_values, run_id=iteration_no)
        devel_data_pipeline.set_data(devel_data)
        devel_data_pipeline.set_filename_prepend(prepend_filename_str=data_dir)
        devel_dataset = devel_data_pipeline.pipeline(cache=enable_cache, shuffle=False, drop_remainder=False)

        ########### TEST DATA ###########
        test_data_pipeline = DataPipeline(name='test_data_set', data_classes=data_classes,
                                          enable_gpu=True, verbose=True, enable_augmentation=False,
                                          hparams=hparam_values, run_id=iteration_no)
        test_data_pipeline.set_data(test_data)
        test_data_pipeline.set_filename_prepend(prepend_filename_str=data_dir)
        test_dataset = test_data_pipeline.pipeline(cache=enable_cache, shuffle=False, drop_remainder=False)

        log.info("All data pipelines have been successfully loaded.")
        log.info("Caching in memory is: " + str(enable_cache))

        model_name = hparam_values['model_name']

        if model_name not in available_ai_models:
            raise ValueError("Unknown model name: " + model_name)

        model = available_ai_models[model_name](hyper_parameter_list,
                                                train_data_pipeline.get_model_input_shape(),
                                                run_dir=run_log_dir,
                                                data_classes=data_classes,
                                                use_ram=True,
                                                run_id=iteration_no)

        model.run(train_dataset=train_dataset,
                  test_dataset=test_dataset,
                  devel_dataset=devel_dataset,
                  save_model=True,
                  save_dir=run_model_dir)