Beispiel #1
0
def test_resolve_glob_pattern_folders_match(tmpdir):
    """Check that ``match_folders`` decides whether directories show up in the results."""
    # Name every path of the fixture tree once so the expectations below stay readable
    folder_a = tmpdir / 'folderA'
    folder_b = tmpdir / 'folderB'
    nested_c = folder_b / 'C'

    root_info = tmpdir / 'root.info'
    a_txt = folder_a / 'a.txt'
    c_info = folder_b / 'c.info'
    nested_c_txt = nested_c / 'c.txt'

    for fpath in (root_info, a_txt, c_info, nested_c_txt):
        touch(fpath)

    # First level: 'folder*' only matches directories, so nothing is left without folders
    files_only = resolve_glob_pattern('folder*', tmpdir, match_folders=False)
    assert files_only == []

    with_folders = resolve_glob_pattern('folder*', tmpdir, match_folders=True)
    assert sorted(with_folders) == [folder_a, folder_b]

    # Recursive pattern: directories are listed next to the files they contain
    everything = resolve_glob_pattern('**/*', tmpdir, match_folders=True)
    assert sorted(everything) == [
        folder_a, a_txt, folder_b, nested_c, nested_c_txt, c_info, root_info
    ]

    # Recursive pattern without folders: only the four files remain
    recursive_files = resolve_glob_pattern('**/*', tmpdir, match_folders=False)
    assert sorted(recursive_files) == [a_txt, nested_c_txt, c_info, root_info]
Beispiel #2
0
    def file_paths(self) -> List[Path]:
        """Return the files matched by the patterns in ``self.file_patterns``.

        Each pattern is resolved with ``match_folders=False``. Matches are
        concatenated in pattern order and are not de-duplicated.
        """
        return [
            fpath
            for pattern in self.file_patterns
            for fpath in resolve_glob_pattern(pattern, match_folders=False)
        ]
Beispiel #3
0
def export_metrics(model_dir: str) -> None:
    """
    Export the metrics found in the model directory events files to ``metrics.csv``.

    The export only happens when exactly one train events file is found directly
    in the model directory; otherwise a warning is logged and nothing is written.
    :param model_dir: The model directory that is searched for events files and receives the csv file
    """
    train_matches = io.resolve_glob_pattern("*events.out.tfevents*", model_dir)
    validation_matches = io.resolve_glob_pattern("eval/*events.out.tfevents*", model_dir)

    # Guard: without a single, unambiguous train events file there is nothing to export
    if len(train_matches) != 1:
        logger.warning("Could not export metrics because a single train events file should be found in %s", model_dir)
        return

    # Validation events are optional: they are only passed on when unambiguous
    validation_events_file_path = None
    if len(validation_matches) == 1:
        validation_events_file_path = str(validation_matches[0])

    output_file_path = f"{model_dir}/metrics.csv"
    _export_last_step_event_metrics_to_csv(
        train_events_file_path=str(train_matches[0]),
        validation_events_file_path=validation_events_file_path,
        output_file_path=output_file_path,
    )

    logger.info("Exported metrics in %s", output_file_path)
Beispiel #4
0
def total_examples(pattern) -> int:
    """
    Get total examples for all the files matched with the given input file pattern.
    """
    # NOTE(review): the docstring above is kept verbatim because it presumably
    # doubles as the click help text -- confirm against the decorators.
    #
    # pattern: glob pattern handed to resolve_glob_pattern.
    # Returns the summed ``total_records`` of every file whose info could be read,
    # which is what the ``-> int`` annotation already promised.

    files = resolve_glob_pattern(pattern)
    click.echo(f"{len(files)} files matched with the pattern.")

    total_rows = 0
    for file in files:
        try:
            total_rows += get_fileinfo(file).total_records
        except Exception as e:
            # Still best-effort (one unreadable file must not abort the count), but
            # tell the user on stderr so an incomplete total is not mistaken for
            # the real one; stdout stays exactly as before.
            click.echo(f'Skipping file {file} because of: {e!s}', err=True)

    click.echo(f"Total number of examples: {total_rows}")
    return total_rows
Beispiel #5
0
def generate_metadata(pattern, force, compression_type):
    """
    Generate metadata for tfrecord files.

    With this util you can generate metadata from tfrecords based on a matching
    glob pattern.

    Example: Generate metadata for training dataset
      deep utils generate-metadata 'dataset/train-*'
    """
    # NOTE(review): docstring left untouched -- presumably it is shown as click help text.
    # NOTE(review): ``force`` is accepted but not used anywhere in this function.

    files = resolve_glob_pattern(pattern)
    click.echo(f"{len(files)} files matched with the pattern.")

    # Keep the progress bar under its own name instead of rebinding ``files``
    with click.progressbar(files) as progress:
        for fpath in progress:
            try:
                generate_fileinfo(fpath, compression_type=compression_type)
            except Exception as e:
                # A failing file is reported and skipped so the rest still gets processed
                click.echo(f'Skipping file {fpath} because of: {e!s}')

    click.echo('Finished generating metadata')
Beispiel #6
0
def test_resolve_glob_pattern(tmpdir):
    """Exercise resolve_glob_pattern with relative, cwd-based and absolute patterns."""
    # Prepare a test structure
    touch(tmpdir / "root.txt")
    touch(tmpdir / "root.info")
    touch(tmpdir / "folderA" / "a.txt")
    touch(tmpdir / "folderA" / "b.txt")
    touch(tmpdir / "folderA" / "b.info")
    touch(tmpdir / "folderB" / "b.txt")
    touch(tmpdir / "folderB" / "c.txt")
    touch(tmpdir / "folderB" / "c.info")
    touch(tmpdir / "folderB" / "C" / "c.txt")
    touch(tmpdir / "folderB" / "C" / "c.info")

    # Non-recursive pattern only sees the starting directory
    root_text = resolve_glob_pattern("*.txt", tmpdir)
    assert sorted(root_text) == sorted([tmpdir / 'root.txt'])

    # Recursive pattern descends into every sub folder
    all_text = resolve_glob_pattern("**/*.txt", tmpdir)
    assert sorted(all_text) == sorted([
        tmpdir / 'root.txt',
        tmpdir / 'folderA' / 'a.txt',
        tmpdir / 'folderA' / 'b.txt',
        tmpdir / 'folderB' / 'b.txt',
        tmpdir / 'folderB' / 'c.txt',
        tmpdir / 'folderB' / 'C' / 'c.txt',
    ])

    # Check on current working directory.
    # The cwd is process-wide state: restore it afterwards so later tests do not
    # run inside (or depend on) this test's temporary directory.
    previous_cwd = os.getcwd()
    os.chdir(tmpdir / 'folderA')
    try:
        folder_a_info = resolve_glob_pattern("*.info")
    finally:
        os.chdir(previous_cwd)
    assert sorted(folder_a_info) == sorted([
        tmpdir / 'folderA' / 'b.info',
    ])

    # glob with absolute pattern
    folder_b_recurse_info = resolve_glob_pattern(
        str(tmpdir / 'folderB' / "**/*.info"))
    assert sorted(folder_b_recurse_info) == sorted([
        tmpdir / 'folderB' / 'c.info',
        tmpdir / 'folderB' / 'C' / 'c.info',
    ])

    # raise when starting directory and absolute path have been set
    with pytest.raises(ValueError):
        resolve_glob_pattern(str(tmpdir / '**/*.txt'), tmpdir)
Beispiel #7
0
def validate(pattern: str, shallow_check: bool):
    """
    Validate each one of the files matched using the input file pattern.
    """
    # NOTE(review): docstring left untouched -- presumably it is shown as click help text.
    start = time.time()

    files = resolve_glob_pattern(pattern)
    click.echo(f"{len(files)} files matched with the pattern.")

    # Keep the progress bar under its own name instead of rebinding ``files``
    with click.progressbar(files) as progress:
        for fpath in progress:
            try:
                # Reading the file info is also where the validation step happens
                get_fileinfo(fpath, shallow_check)
            except (TFRecordValidationError, TFRecordInfoMissingError):
                # Validation and missing-info failures abort the whole run
                raise
            except Exception as e:  # Probably not a valid tfrecords file
                click.echo(f'Probably not a valid tf_record file {e}')

    end = time.time()

    click.echo(f"Total execution time: {end - start}")