Example #1
0
def config():
    # Default settings of the evaluation run.
    # NOTE(review): presumably registered as a sacred config scope (`ex` is
    # used below), i.e. the local names are the config entries — confirm
    # against the decorator, which is not visible here.

    # Experiment directory. The empty default trips the assert below, so the
    # value has to be overridden (e.g. on the command line).
    exp_dir = ''
    assert len(exp_dir) > 0, 'Set the exp_dir on the command line.'
    # New sub folder below `<exp_dir>/eval`, named with `id_naming='time'`.
    storage_dir = get_new_subdir(Path(exp_dir) / 'eval',
                                 id_naming='time',
                                 consider_mpi=True)
    # Reuse the "database_json" entry stored in the experiment's config.json.
    database_json = load_json(Path(exp_dir) / 'config.json')["database_json"]
    test_set = 'test_clean'
    # NOTE(review): None presumably means "no limit" — verify in the consumer.
    max_examples = None
    device = 0
    # Store the run information of this evaluation in `storage_dir`.
    ex.observers.append(FileStorageObserver.create(storage_dir))
Example #2
0
def config():
    # Default settings of the evaluation run.
    # NOTE(review): presumably a sacred config scope, i.e. the local names
    # are the config entries — confirm against the decorator, which is not
    # visible here.

    # Experiment directory. The empty default trips the assert below, so the
    # value has to be overridden (e.g. on the command line).
    exp_dir = ''
    assert len(exp_dir) > 0, 'Set the model path on the command line.'
    # New sub folder below `<exp_dir>/eval`, named with `id_naming='time'`.
    # Converted to `str` so that the config entry is a plain string.
    storage_dir = str(
        get_new_subdir(Path(exp_dir) / 'eval',
                       id_naming='time',
                       consider_mpi=True))
    # Reuse the "database_json" entry stored in the experiment's config.json.
    database_json = load_json(Path(exp_dir) / 'config.json')["database_json"]
    num_workers = 8
    batch_size = 32
    # NOTE(review): presumably the tolerated padding fraction per batch —
    # verify in the code that consumes this entry.
    max_padding_rate = .05
    device = 0
    # File name of the checkpoint that is evaluated.
    ckpt_name = 'ckpt_best_map.pth'
Example #3
0
def test_index():
    """Check the folder names produced by `get_new_subdir`.

    All checks share one TemporaryDirectory on purpose: every call has to
    take the folders created by the previous calls into account.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_dir = Path(tmp_dir)

        # Default id naming (running index), optionally decorated with a
        # prefix and/or a suffix. Each variant is requested twice and has to
        # yield the index 1 first and the index 2 afterwards.
        index_cases = [
            ({}, '{}'),
            ({'prefix': 'prefix'}, 'prefix_{}'),
            ({'suffix': 'suffix'}, '{}_suffix'),
            ({'prefix': 'prefix', 'suffix': 'suffix'}, 'prefix_{}_suffix'),
        ]
        for kwargs, name_template in index_cases:
            for index in (1, 2):
                new = get_new_subdir(tmp_dir, **kwargs)
                assert new.name == name_template.format(index)

        new = get_new_subdir(tmp_dir, id_naming='time')
        # depends on the current time -> no assert

        # Word based names. The rng is always seeded identically, hence the
        # second request of a variant has to evade the folder that was
        # created by the first request.
        generator_cases = [
            ((), ['nice_tomato_fox', 'related_apricot_ant']),
            (
                (('adjectives', 'animals', range(10)),),
                ['nice_cardinal_5', 'related_prawn_3'],
            ),
        ]
        for generator_args, expected_names in generator_cases:
            for expected_name in expected_names:
                rng = np.random.RandomState(0)
                new = get_new_subdir(
                    tmp_dir,
                    id_naming=NameGenerator(*generator_args, rng=rng),
                )
                assert new.name == expected_name

        default_generator = NameGenerator(rng=rng)
        assert default_generator.possibilities() == 21_376_680
        numbered_generator = NameGenerator(('adjectives', 'animals', range(10)))
        assert numbered_generator.possibilities() == 4_110_900

        # Example, how you get time stamp and word combination
        # Note, you shouldn't use here the rng argument from NameGenerator.
        def id_naming():
            stamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
            return stamp + NameGenerator()()

        new = get_new_subdir(tmp_dir, id_naming=id_naming)
Example #4
0
def main(_run, exp_dir, storage_dir, database_json, test_set, max_examples, device):
    """Evaluate the WaveNet reconstruction of a stored model on `test_set`.

    The examples are distributed with `split_managed`; every process collects
    the summed squared error per example. The master gathers the results,
    prints the overall RMSE, writes the per-example RMSE (sorted ascending)
    to `<eval_dir>/rmse.json` and stores the reconstructed audio of the 10
    examples with the smallest and the 10 examples with the largest RMSE in
    `<eval_dir>/audio`.

    Args:
        _run: Passed to `commands.print_config` on the master.
        exp_dir: Directory that contains `config.json` and the stored model.
        storage_dir: Directory in which a new evaluation folder is created.
        database_json: Passed to `JsonDatabase`.
        test_set: Name of the dataset that is requested from the database.
        max_examples: If not None, only this many examples of a shuffle with
            a fixed seed are evaluated.
        device: Device to which the model and the examples are moved.
    """
    if IS_MASTER:
        commands.print_config(_run)

    exp_dir = Path(exp_dir)
    storage_dir = Path(storage_dir)
    eval_dir = get_new_subdir(storage_dir, id_naming='time', consider_mpi=True)
    audio_dir = eval_dir / 'audio'
    # This line is executed by every process. With `consider_mpi=True` the
    # processes are expected to share `eval_dir`, hence the folder may
    # already have been created by another process.
    audio_dir.mkdir(parents=True, exist_ok=True)

    config = load_json(exp_dir / 'config.json')

    model = Model.from_storage_dir(exp_dir, consider_mpi=True)
    model.to(device)
    model.eval()

    db = JsonDatabase(database_json)
    test_data = db.get_dataset(test_set)
    if max_examples is not None:
        test_data = test_data.shuffle(rng=np.random.RandomState(0))[:max_examples]
    test_data = prepare_dataset(
        test_data, audio_reader=config['audio_reader'], stft=config['stft'],
        max_length_in_sec=None, batch_size=1, shuffle=False
    )
    # Entries: (example_id, summed squared error, number of samples)
    squared_err = list()
    with torch.no_grad():
        for example in split_managed(
            test_data, is_indexable=False,
            progress_bar=True, allow_single_worker=True
        ):
            example = model.example_to_device(example, device)
            target = example['audio_data'].squeeze(1)
            x = model.feature_extraction(example['stft'], example['seq_len'])
            x = model.wavenet.infer(
                x.squeeze(1),
                chunk_length=48_000,
                chunk_overlap=16_000,
            )
            # wavenet also reconstructs padded samples which need to be discarded.
            # Assert that the number of discarded samples are less than shift,
            # i.e., less equal the maximum possible pad width.
            assert config['stft']['shift'] > (x.shape[-1] - target.shape[-1]) >= 0, (target.shape, x.shape)
            x = x[..., :target.shape[-1]]
            squared_err.extend([
                (ex_id, mse.cpu().detach().numpy(), x.shape[1])
                for ex_id, mse in zip(
                    example['example_id'], ((x-target)**2).sum(1)
                )
            ])

    squared_err_list = COMM.gather(squared_err, root=MASTER)

    if IS_MASTER:
        print(f'\nlen(squared_err_list): {len(squared_err_list)}')
        # Flatten the per-process lists to one list.
        squared_err = [
            entry for partial in squared_err_list for entry in partial
        ]
        _, err, t = list(zip(*squared_err))
        print('rmse:', np.sqrt(np.sum(err)/np.sum(t)))
        rmse = sorted(
            [(ex_id, np.sqrt(err/t)) for ex_id, err, t in squared_err],
            key=lambda x: x[1]
        )
        dump_json(rmse, eval_dir / 'rmse.json', indent=4, sort_keys=False)
        ex_ids_ordered = [x[0] for x in rmse]
        # Build the selection once instead of once per filtered example.
        selected_ids = set(ex_ids_ordered[:10] + ex_ids_ordered[-10:])
        # Use the evaluated dataset (`test_set`), otherwise the selected ids
        # can not be found when another dataset than the default is evaluated.
        test_data = db.get_dataset(test_set).shuffle(
            rng=np.random.RandomState(0))[:max_examples].filter(
            lambda x: x['example_id'] in selected_ids,
            lazy=False
        )
        test_data = prepare_dataset(
            test_data, audio_reader=config['audio_reader'], stft=config['stft'],
            max_length_in_sec=None, batch_size=1, shuffle=False
        )
        with torch.no_grad():
            for example in test_data:
                example = model.example_to_device(example, device)
                x = model.feature_extraction(example['stft'], example['seq_len'])
                x = model.wavenet.infer(
                    x.squeeze(1),
                    chunk_length=48_000,
                    chunk_overlap=16_000,
                )
                # NOTE(review): in contrast to the RMSE computation the
                # padded samples are not discarded here — confirm that this
                # is intended.
                for i, audio in enumerate(x.cpu().detach().numpy()):
                    wavfile.write(
                        str(audio_dir / f'{example["example_id"][i]}.wav'),
                        model.sample_rate, audio
                    )
Example #5
0
def get_new_storage_dir(
    experiment_name: str,
    *,
    id_naming: [str, callable] = 'index',
    mkdir: bool = True,
    prefix: str = None,
    suffix: str = None,
    consider_mpi: bool = False,
    dry_run: bool = False,
):
    """Return a new, not yet existing storage_dir inside
        `os.environ['STORAGE_ROOT'] / experiment_name`

    Thin wrapper around `paderbox.io.new_subdir.get_new_subdir`: The base
    directory is derived from the environment variable STORAGE_ROOT and all
    keyword arguments are forwarded unchanged.

    Features:
     - With mkdir: Thread and process safe.
     - Several conventions for the ID naming, the default is a running index.
     - MPI aware: One worker determines the folder and distributes it to the
       other workers.

    Args:
        experiment_name:
            Name of the sub folder inside STORAGE_ROOT.
        id_naming:
            Determines how the folder name is generated.
             - str: 'index':
                The largest index in basedir + 1.
                e.g.: '1', '2', ...
             - str: 'time': A timestamp with the format %Y-%m-%d-%H-%M-%S
                e.g. '2020-08-13-17-02-57'
             - callable: Each call should generate a new name.
        mkdir:
            Create the folder. This makes the function process/thread safe,
            i.e. two concurrent calls can not obtain the same folder.
            Example:
                A program is launched several times and each instance
                should get its own folder (e.g. hyperparameter search). When
                inspecting basedir, several instances may conclude that '2'
                is free. With this option only one of them gets the '2' and
                the remaining instances search for another free id.
        prefix:
            Optional prefix for the id. e.g.: '2' -> '{prefix}_2'
        suffix:
            Optional suffix for the id. e.g.: '2' -> '2_{suffix}'
        consider_mpi:
            If True, the folder is searched on one mpi process only and the
            folder name is distributed to the other processes.
            When mpi is used and `consider_mpi is False`, the following
            can/will happen
             - When mkdir is True, every process gets another folder,
               i.e. each process has a folder just for itself.
             - Warning: Never use mpi, when `mkdir is False` and
               `consider_mpi is False`. Depending on some random factors
               (e.g. python startup time) all workers could get the same
               folder, but mostly some get the same folder and some a
               different one. You never want this.
        dry_run:
            When true, disables mkdir and prints the folder name.

    Returns:
        pathlib.Path of the new subdir

    >>> import tempfile
    >>> with tempfile.TemporaryDirectory() as tmp_dir:
    ...     os.environ['STORAGE_ROOT'] = tmp_dir  # simulate environment variable for doctest
    ...     print(get_new_storage_dir('fance_nn_experiment').relative_to(tmp_dir))
    ...     print(get_new_storage_dir('fance_nn_experiment').relative_to(tmp_dir))
    fance_nn_experiment/1
    fance_nn_experiment/2

    """
    storage_root = Path(os.environ['STORAGE_ROOT'])
    return get_new_subdir(
        basedir=storage_root / experiment_name,
        id_naming=id_naming,
        mkdir=mkdir,
        prefix=prefix,
        suffix=suffix,
        consider_mpi=consider_mpi,
        dry_run=dry_run,
    )
Example #6
0
def main(_run, model_path, load_ckpt, batch_size, device, store_misclassified):
    """Evaluate a stored speaker classifier on its test set.

    The batches are distributed with `split_managed`; every process collects
    the hits, the misclassified examples and, per correctly predicted label,
    the audio paths of the correctly classified examples. The master merges
    the gathered summaries, optionally creates symlinks for the misclassified
    examples and writes the accuracy and the misclassifications to
    `<eval_dir>/results.json`.

    Args:
        _run: Passed to `commands.print_config` on the master.
        model_path: Directory that contains `config.json` and the model.
        load_ckpt: Name of the checkpoint that is loaded.
        batch_size: Passed to `get_datasets`.
        device: Device to which the model and the batches are moved.
        store_misclassified: If true, create for each misclassified example a
            folder with symlinks to the example and to one correctly
            classified example of the predicted speaker.
    """
    if IS_MASTER:
        commands.print_config(_run)

    model_path = Path(model_path)
    eval_dir = get_new_subdir(model_path / 'eval',
                              id_naming='time',
                              consider_mpi=True)
    # perform evaluation on a sub-set (10%) of the dataset used for training
    config = load_json(model_path / 'config.json')
    database_json = config['database_json']
    dataset = config['dataset']

    model = pt.Model.from_storage_dir(model_path,
                                      checkpoint_name=load_ckpt,
                                      consider_mpi=True)
    model.to(device)
    # Turn on evaluation mode for, e.g., BatchNorm and Dropout modules
    model.eval()

    _, _, test_set = get_datasets(model_path,
                                  database_json,
                                  dataset,
                                  batch_size,
                                  return_indexable=device == 'cpu')
    summary = dict(misclassified_examples=dict(),
                   correct_classified_examples=dict(),
                   hits=list())
    with torch.no_grad():
        for batch in split_managed(test_set,
                                   is_indexable=device == 'cpu',
                                   progress_bar=True,
                                   allow_single_worker=True):
            output = model(pt.data.example_to_device(batch, device))
            prediction = torch.argmax(output, dim=-1).cpu().numpy()
            confidence = torch.softmax(output, dim=-1).max(dim=-1).values.cpu()\
                .numpy()
            label = np.array(batch['speaker_id'])
            example_ids = np.array(batch['example_id'])
            audio_paths = np.array(batch['audio_path'])
            hits = (label == prediction).astype('bool')
            summary['hits'].extend(hits.tolist())
            summary['misclassified_examples'].update({
                k: {
                    'true_label': v1,
                    'predicted_label': v2,
                    'audio_path': v3,
                    'confidence': f'{v4:.2%}',
                }
                for k, v1, v2, v3, v4 in zip(
                    example_ids[~hits], label[~hits], prediction[~hits],
                    audio_paths[~hits], confidence[~hits])
            })
            # for each correct predicted label, collect the audio paths
            # Append one by one: a label may occur several times in a batch
            # and a dict comprehension would keep only the last path of it.
            correct_classified = summary['correct_classified_examples']
            for predicted_label, audio_path in zip(prediction[hits],
                                                   audio_paths[hits]):
                correct_classified.setdefault(predicted_label, []).append(
                    audio_path)

    summary_list = COMM.gather(summary, root=MASTER)

    if IS_MASTER:
        print(f'\nlen(summary_list): {len(summary_list)}')
        if len(summary_list) > 1:
            # Merge the partial summaries of all processes.
            summary = dict(
                misclassified_examples=dict(),
                correct_classified_examples=dict(),
                hits=list(),
            )
            for partial_summary in summary_list:
                summary['hits'].extend(partial_summary['hits'])
                summary['misclassified_examples'].update(
                    partial_summary['misclassified_examples'])
                for label, audio_path_list in \
                        partial_summary['correct_classified_examples'].items():
                    summary['correct_classified_examples'].setdefault(
                        label, []).extend(audio_path_list)
        hits = summary['hits']
        misclassified_examples = summary['misclassified_examples']
        correct_classified_examples = summary['correct_classified_examples']
        accuracy = np.array(hits).astype('float').mean()
        if store_misclassified:
            misclassified_dir = eval_dir / 'misclassified_examples'
            for example_id, v in misclassified_examples.items():
                # Access by key instead of relying on the insertion order.
                label = v['true_label']
                prediction_label = v['predicted_label']
                audio_path = v['audio_path']
                # Only the lookup can raise the KeyError that is handled
                # here, so keep the remaining statements out of the try.
                try:
                    predicted_speaker_audio_path = \
                        correct_classified_examples[prediction_label][0]
                except KeyError:
                    warnings.warn(
                        'There were no correctly predicted inputs from speaker '
                        f'with speaker label {prediction_label}')
                    continue
                example_dir = \
                    misclassified_dir / f'{example_id}_{label}_{prediction_label}'
                example_dir.mkdir(parents=True)
                os.symlink(audio_path, example_dir / 'example.wav')
                os.symlink(predicted_speaker_audio_path,
                           example_dir / 'predicted_speaker_example.wav')
        outputs = dict(
            accuracy=f'{accuracy:.2%} ({np.sum(hits)}/{len(hits)})',
            misclassifications=misclassified_examples,
        )
        print(f'Speaker classification accuracy on test set: {accuracy:.2%}')
        # Report the file only after it has actually been written.
        dump_json(outputs, eval_dir / 'results.json')
        print(f'Wrote results to {eval_dir / "results.json"}')