def test_resolve_glob_pattern_folders_match(tmpdir):
    """Directories appear in results only when match_folders is enabled."""
    # Fixture layout: one root file, an empty-ish folder, and a nested folder.
    touch(tmpdir / "root.info")
    touch(tmpdir / "folderA" / "a.txt")
    touch(tmpdir / "folderB" / "c.info")
    touch(tmpdir / "folderB" / "C" / "c.txt")

    # First level: folders excluded, then included.
    assert resolve_glob_pattern('folder*', tmpdir, match_folders=False) == []
    first_level = resolve_glob_pattern('folder*', tmpdir, match_folders=True)
    assert sorted(first_level) == [tmpdir / 'folderA', tmpdir / 'folderB']

    # Recursive: folders included.
    everything = resolve_glob_pattern('**/*', tmpdir, match_folders=True)
    assert sorted(everything) == [
        tmpdir / 'folderA',
        tmpdir / 'folderA' / 'a.txt',
        tmpdir / 'folderB',
        tmpdir / 'folderB' / 'C',
        tmpdir / 'folderB' / 'C' / 'c.txt',
        tmpdir / 'folderB' / 'c.info',
        tmpdir / 'root.info'
    ]

    # Recursive: folders excluded — only plain files remain.
    files_only = resolve_glob_pattern('**/*', tmpdir, match_folders=False)
    assert sorted(files_only) == [
        tmpdir / 'folderA' / 'a.txt',
        tmpdir / 'folderB' / 'C' / 'c.txt',
        tmpdir / 'folderB' / 'c.info',
        tmpdir / 'root.info'
    ]
def file_paths(self) -> List[Path]:
    """The list of files that matched the given patterns"""
    # Flatten the matches of every pattern into a single list
    # (folders are deliberately excluded).
    return [
        path
        for pattern in self.file_patterns
        for path in resolve_glob_pattern(pattern, match_folders=False)
    ]
def export_metrics(model_dir: str) -> None:
    """
    Export training metrics to a csv file in the model directory

    :param model_dir: The model directory to save the csv file in
    """
    train_paths = io.resolve_glob_pattern("*events.out.tfevents*", model_dir)
    # Exactly one train events file is required; bail out otherwise.
    if len(train_paths) != 1:
        logger.warning("Could not export metrics because a single train events file should be found in %s", model_dir)
        return

    eval_paths = io.resolve_glob_pattern("eval/*events.out.tfevents*", model_dir)
    # Validation metrics are optional — only used when exactly one file matches.
    eval_path = str(eval_paths[0]) if len(eval_paths) == 1 else None

    output_file_path = f"{model_dir}/metrics.csv"
    _export_last_step_event_metrics_to_csv(train_events_file_path=str(train_paths[0]),
                                           validation_events_file_path=eval_path,
                                           output_file_path=output_file_path)
    logger.info("Exported metrics in %s", output_file_path)
def total_examples(pattern) -> int:
    """
    Get total examples for all the files matched with the given input file pattern.

    :param pattern: Glob pattern used to select the files to count.
    :return: The total number of records across all readable matched files.
    """
    files = resolve_glob_pattern(pattern)
    click.echo(f"{len(files)} files matched with the pattern.")
    total_rows = 0
    for file in files:
        try:
            total_rows += get_fileinfo(file).total_records
        except Exception as e:
            # Best-effort counting: an unreadable file is skipped, but the
            # reason is surfaced (consistent with generate-metadata) instead
            # of being silently swallowed.
            click.echo(f'Skipping file {file} because of: {e!s}')
    click.echo(f"Total number of examples: {total_rows}")
    # The annotation promises an int; previously the function returned None.
    return total_rows
def generate_metadata(pattern, force, compression_type):
    """
    Generate metadata for tfrecord files.

    With this util you can generate metadata from tfrecords based on a matching glob pattern.

    Example:

    Generate metadata for training dataset

    deep utils generate-metadata 'dataset/train-*'
    """
    matched = resolve_glob_pattern(pattern)
    click.echo(f"{len(matched)} files matched with the pattern.")
    # Show progress while generating per-file metadata; unreadable files
    # are skipped with an explanation rather than aborting the run.
    with click.progressbar(matched) as progress:
        for fpath in progress:
            try:
                generate_fileinfo(fpath, compression_type=compression_type)
            except Exception as e:
                click.echo(f'Skipping file {fpath} because of: {e!s}')
    click.echo('Finished generating metadata')
def test_resolve_glob_pattern(tmpdir):
    """Exercise relative, recursive and absolute glob patterns.

    Fix: the original called os.chdir without restoring the previous working
    directory, leaking CWD state into every test that ran afterwards. The
    chdir-dependent section is now wrapped in try/finally.
    """
    # Prepare a test structure
    touch(tmpdir / "root.txt")
    touch(tmpdir / "root.info")
    touch(tmpdir / "folderA" / "a.txt")
    touch(tmpdir / "folderA" / "b.txt")
    touch(tmpdir / "folderA" / "b.info")
    touch(tmpdir / "folderB" / "b.txt")
    touch(tmpdir / "folderB" / "c.txt")
    touch(tmpdir / "folderB" / "c.info")
    touch(tmpdir / "folderB" / "C" / "c.txt")
    touch(tmpdir / "folderB" / "C" / "c.info")

    root_text = resolve_glob_pattern("*.txt", tmpdir)
    assert sorted(root_text) == sorted([tmpdir / 'root.txt'])

    all_text = resolve_glob_pattern("**/*.txt", tmpdir)
    assert sorted(all_text) == sorted([
        tmpdir / 'root.txt',
        tmpdir / 'folderA' / 'a.txt',
        tmpdir / 'folderA' / 'b.txt',
        tmpdir / 'folderB' / 'b.txt',
        tmpdir / 'folderB' / 'c.txt',
        tmpdir / 'folderB' / 'C' / 'c.txt',
    ])

    # Check on current working directory; restore CWD afterwards so this
    # test does not affect any test that runs after it.
    original_cwd = os.getcwd()
    os.chdir(tmpdir / 'folderA')
    try:
        folder_a_info = resolve_glob_pattern("*.info")
        assert sorted(folder_a_info) == sorted([
            tmpdir / 'folderA' / 'b.info',
        ])

        # glob with absolute pattern
        folder_b_recurse_info = resolve_glob_pattern(
            str(tmpdir / 'folderB' / "**/*.info"))
        assert sorted(folder_b_recurse_info) == sorted([
            tmpdir / 'folderB' / 'c.info',
            tmpdir / 'folderB' / 'C' / 'c.info',
        ])

        # raise when starting directory and absolute path have been set
        with pytest.raises(ValueError):
            resolve_glob_pattern(str(tmpdir / '**/*.txt'), tmpdir)
    finally:
        os.chdir(original_cwd)
def validate(pattern: str, shallow_check: bool):
    """
    Validate each one of the files matched using the input file pattern.
    """
    start = time.time()
    matched = resolve_glob_pattern(pattern)
    click.echo(f"{len(matched)} files matched with the pattern.")
    with click.progressbar(matched) as progress:
        for fpath in progress:
            try:
                # get_fileinfo performs the validation step as a side effect.
                get_fileinfo(fpath, shallow_check)
            except (TFRecordValidationError, TFRecordInfoMissingError):
                # Hard failures propagate unchanged.
                raise
            except Exception as e:
                # Probably not a valid tfrecords file
                click.echo(f'Probably not a valid tf_record file {e}')
    end = time.time()
    click.echo(f"Total execution time: {end - start}")