def test_deleted(self, tmpdir, datadir):
    """Asking for info on a record file removed after generation raises FileNotFoundError."""
    record_name = 'train-00000-of-00001'
    shutil.copy(datadir / 'tfrecords' / 'simple' / record_name, tmpdir)
    fpath = Path(tmpdir / record_name)

    # Generate the info first; the returned object is not inspected by this test.
    generate_fileinfo(fpath)

    # Delete the record file, then request its info with a non-shallow check.
    os.unlink(fpath)
    with pytest.raises(FileNotFoundError):
        get_fileinfo(fpath, shallow_check=False)
def test_generate(self, tmpdir, datadir):
    """Invoke the ``generate-metadata`` CLI command on two copied files and check the resulting info."""
    # (file name, expected md5) for every fixture file used by this test.
    expected_md5 = (
        ('train-00000-of-00001', '3c8c216b7293fdef623b04e01bb5878a'),
        ('validation-00000-of-00001', 'fdebe01f545d90f127a15ea2f28d3d1d'),
    )
    source_dir = datadir / 'tfrecords' / 'simple'
    for record_name, _ in expected_md5:
        shutil.copy(source_dir / record_name, tmpdir)

    # Run the command over everything in the temporary directory.
    result = self._invoke_cli('generate-metadata', str(tmpdir / '*'))
    assert result.exit_code == 0
    assert "2 files matched" in result.output
    assert "Finished" in result.output

    # Both fixture files share size and record count; only the hash differs.
    for record_name, md5 in expected_md5:
        info = get_fileinfo(Path(tmpdir / record_name))
        assert info.file_size == 350
        assert info.total_records == 10
        assert info.md5_hash == md5
def test_invalid_hash(self, tmpdir, datadir):
    """Overwriting the first bytes of a record file must be detected.

    After info has been generated for the pristine file, the file's leading
    bytes are replaced (the size stays the same). A non-shallow
    ``get_fileinfo`` is then expected to raise ``TFRecordValidationError``,
    and regenerating the info is expected to raise
    ``tf.errors.DataLossError``.
    """
    shutil.copy(datadir / 'tfrecords' / 'simple' / 'train-00000-of-00001', tmpdir)
    fpath = Path(tmpdir / 'train-00000-of-00001')

    # Generate info for the untouched file; the returned object is not needed.
    generate_fileinfo(fpath)

    # Change the content a bit by overwriting the beginning of the file in
    # place ('r+b' does not truncate, so the file size is unchanged).
    with open(fpath, 'r+b') as f:
        f.seek(0, io.SEEK_SET)
        f.write(b'junk')

    with pytest.raises(TFRecordValidationError):
        get_fileinfo(fpath, shallow_check=False)

    # Try to regenerate, should fail because the file is no longer valid tf records
    with pytest.raises(tf.errors.DataLossError):
        generate_fileinfo(fpath)
def test_invalid_size(self, tmpdir, datadir):
    """Appending bytes to a record file fails validation, but info can still be regenerated."""
    record_name = 'train-00000-of-00001'
    shutil.copy(datadir / 'tfrecords' / 'simple' / record_name, tmpdir)
    fpath = Path(tmpdir / record_name)

    # Info for the untouched file, kept for comparison below.
    original_info = generate_fileinfo(fpath)

    # Grow the file by appending four extra bytes at the end.
    with open(fpath, 'ab') as handle:
        handle.write(b'junk')

    with pytest.raises(TFRecordValidationError):
        get_fileinfo(fpath, shallow_check=False)

    # Regenerating is expected to succeed (original note: tf can handle
    # trailing rubbish) and to yield a distinct, different info object.
    regenerated = generate_fileinfo(fpath)
    assert original_info is not regenerated
    assert original_info != regenerated
    assert regenerated.file_size == 354
    assert regenerated.md5_hash == '76a086a01e382560309ccfc232711dec'
def test_non_existing_meta_file_generate(self, tmpdir, datadir):
    """Info is missing before generation, and retrievable (equal, not identical) afterwards."""
    record_name = 'train-00000-of-00001'
    shutil.copy(datadir / 'tfrecords' / 'simple' / record_name, tmpdir)
    fpath = Path(tmpdir / record_name)

    # Nothing has been generated yet, so the lookup must fail.
    with pytest.raises(TFRecordInfoMissingError):
        get_fileinfo(fpath)

    # Generate the info and check its fields.
    generated = generate_fileinfo(fpath)
    assert generated.md5_hash == '3c8c216b7293fdef623b04e01bb5878a'
    assert generated.file_size == 350
    assert generated.name == 'train-00000-of-00001'
    assert generated.full_path == Path(fpath)

    # A second lookup now succeeds and returns an equal but separate object.
    fetched = get_fileinfo(fpath)
    assert generated is not fetched
    assert generated == fetched
def test_generation_multiple_file_info(self, tmpdir, datadir):
    """Info generated for two files in the same directory can each be fetched back unchanged."""
    train_name = 'train-00000-of-00001'
    validation_name = 'validation-00000-of-00001'
    shutil.copy(datadir / 'tfrecords' / 'simple' / train_name, tmpdir)
    shutil.copy(datadir / 'tfrecords' / 'simple' / validation_name, tmpdir)
    train_path = Path(tmpdir / train_name)
    validation_path = Path(tmpdir / validation_name)

    # Generate info for both files before checking either.
    generated_train = generate_fileinfo(train_path)
    generated_validation = generate_fileinfo(validation_path)
    assert generated_train.md5_hash == '3c8c216b7293fdef623b04e01bb5878a'
    assert generated_validation.md5_hash == 'fdebe01f545d90f127a15ea2f28d3d1d'

    # Fetch both back; each must equal its generated counterpart without
    # being the very same object.
    fetched_train = get_fileinfo(train_path)
    fetched_validation = get_fileinfo(validation_path)
    assert fetched_train is not generated_train
    assert fetched_train == generated_train
    assert fetched_validation is not generated_validation
    assert fetched_validation == generated_validation
def validate(pattern: str, shallow_check: bool):
    """ Validate each one of the files matched using the input file pattern. """
    # NOTE(review): docstring kept verbatim -- if this function is registered
    # as a click command elsewhere, it may double as the help text. TODO confirm.
    #
    # pattern: file pattern handed to resolve_glob_pattern.
    # shallow_check: forwarded unchanged to get_fileinfo.
    # Raises: TFRecordValidationError / TFRecordInfoMissingError from
    # get_fileinfo are propagated; any other exception is reported and the
    # loop moves on to the next file.

    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # system clock adjustments.
    started = time.perf_counter()
    files = resolve_glob_pattern(pattern)
    click.echo(f"{len(files)} files matched with the pattern.")
    # Use a separate name for the progress bar instead of rebinding `files`.
    with click.progressbar(files) as progress:
        for file in progress:
            try:
                # Per the original note, the validation step happens inside
                # get_fileinfo.
                get_fileinfo(file, shallow_check)
            except (TFRecordValidationError, TFRecordInfoMissingError):
                raise
            except Exception as e:
                # Probably not a valid tfrecords file
                click.echo(f'Probably not a valid tf_record file {e}')
    elapsed = time.perf_counter() - started
    click.echo(f"Total execution time: {elapsed}")
def total_examples(pattern) -> int:
    """ Get total examples for all the files matched with the given input file pattern. """
    # NOTE(review): docstring kept verbatim -- if this function is registered
    # as a click command elsewhere, it may double as the help text. TODO confirm.
    #
    # pattern: file pattern handed to resolve_glob_pattern.
    # Returns: the summed `total_records` of every file whose info could be
    # read (the original was annotated `-> int` but returned nothing).
    files = resolve_glob_pattern(pattern)
    click.echo(f"{len(files)} files matched with the pattern.")
    total_rows = 0
    skipped = 0
    for file in files:
        try:
            total_rows += get_fileinfo(file).total_records
        except Exception:
            # Best effort: keep counting the remaining files, but remember
            # that the total is incomplete instead of hiding the failure.
            skipped += 1
    if skipped:
        # Reported on stderr so the stdout lines stay exactly as before.
        click.echo(
            f"Warning: {skipped} file(s) could not be read and were not counted.",
            err=True,
        )
    click.echo(f"Total number of examples: {total_rows}")
    return total_rows
def test_non_existing_meta(self, tmpdir, datadir):
    """Looking up info for a freshly copied file with nothing generated raises TFRecordInfoMissingError."""
    record_name = 'train-00000-of-00001'
    shutil.copy(datadir / 'tfrecords' / 'simple' / record_name, tmpdir)

    # No info has been generated for the copy, so the lookup must fail.
    with pytest.raises(TFRecordInfoMissingError):
        get_fileinfo(Path(tmpdir / record_name))
def test_get_info_non_existing_file(self, tmpdir):
    """Both fetching and generating info raise FileNotFoundError for a path that does not exist."""
    # Same expectation for the lookup first, then for the generation.
    for info_func in (get_fileinfo, generate_fileinfo):
        with pytest.raises(FileNotFoundError):
            info_func(Path(tmpdir / 'nonexisting.tfrecords'))