def test_is_valid_manifest_format_with_invalid_urls(caplog): """ Test that invalid urls are detected and error logged Test that empty arrays and empty quote pairs are detected and error logged """ result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_invalid_urls.tsv" ) error_log = caplog.text assert '"wrong_protocol://test_bucket/test.txt"' in error_log assert '"test/test.txt"' in error_log assert '"testaws/aws/test.txt"' in error_log assert '"://test_bucket/test.txt"' in error_log assert '"s3://"' in error_log assert '"gs://"' in error_log assert '"s3://bucket_without_object"' in error_log assert '"s3://bucket_without_object/"' in error_log assert '"test_bucket/aws/test.txt"' in error_log assert '"s3:/test_bucket/aws/test.txt"' in error_log assert '"s3:test_bucket/aws/test.txt"' in error_log assert '"://test_bucket/aws/test.txt"' in error_log assert '"s3test_bucket/aws/test.txt"' in error_log assert '"https://www.uchicago.edu"' in error_log assert '"https://www.uchicago.edu/about"' in error_log assert '"google.com/path"' in error_log assert '""""' in error_log assert "\"''\"" in error_log assert '"[]"' in error_log assert "\"['']\"" in error_log assert '"[""]"' in error_log assert '"["", ""]"' in error_log assert '"["", \'\']"' in error_log assert result == False
def test_is_valid_manifest_format_with_no_errors(caplog): """ Test that no errors occur for manifest without errors """ assert (is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_no_errors.tsv") == True) assert caplog.text == ""
def test_is_valid_manifest_format_with_empty_url(caplog): """ Test that by default, completely empty url values are allowed """ result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_empty_url.tsv", ) assert caplog.text == "" assert result == True
def test_is_valid_manifest_format_with_csv(logfile): """ Test that alternative delimiter can be automatically detected """ logging.setLevel(default_logging.ERROR) assert is_valid_manifest_format("tests/test_data/test_manifest.csv") == True assert logfile.read() == "" logging.setLevel(default_logging.WARNING)
def test_is_valid_manifest_with_wide_row(logfile): """ Test that warning is generated for a wide row with an extra value """ result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_wide_row.tsv", ) wide_warning = f"line 3, number of fields (6) in row is unequal to number of column names in manifest (5)" assert wide_warning in logfile.read() assert result == True
def test_is_valid_manifest_format_with_many_types_of_errors(caplog): """ Test that errors with md5, file size, url, and authz all get detected and error logged """ result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_many_types_of_errors.tsv", ) error_log = caplog.text manifest_with_many_types_of_errors_helper(error_log) assert result == False
def test_is_valid_manifest_format_with_no_errors(logfile): """ Test that no errors occur for manifest without errors """ assert ( is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_no_errors.tsv" ) == True ) assert logfile.read() == ""
def test_is_valid_manifest_format_using_error_on_empty_url(caplog): """ Test that completely empty urls are detected and reported in error log when using error_on_empty_url """ result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_empty_url.tsv", error_on_empty_url=True, ) assert '""' in caplog.text assert result == False
def test_is_valid_manifest_with_wide_row(caplog): """ Test that warning is generated for a wide row with an extra value """ logging.getLogger().setLevel(logging.WARNING) result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_wide_row.tsv", ) wide_warning = f"line 3, number of fields (6) in row is unequal to number of column names in manifest (5)" assert wide_warning in caplog.text assert result == True
def test_is_valid_manifest_with_missing_md5_column(caplog): """ Test that completely missing md5 column is detected and reported in error log """ result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_missing_md5_column.tsv", ) missing_md5_message = ( 'could not find a column name corresponding to required "Columns.MD5"') assert missing_md5_message in caplog.text assert result == False
def _verify_manifest_format(tsv): return indexing.is_valid_manifest_format( manifest_path=tsv, column_names_to_enums=None, allowed_protocols=[ "s3", "gs", "https", "htsget", "gds", "file", "ftp", "gsiftp", "globus" ], allow_base64_encoded_md5=False, error_on_empty_url=False, line_limit=None, )
def test_is_valid_manifest_format_with_invalid_sizes(caplog): """ Test that invalid sizes are detected and error logged """ result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_invalid_sizes.tsv" ) error_log = caplog.text assert "-1" in error_log assert "not_an_int" in error_log assert "3.34" in error_log assert "string_with_42" in error_log assert result == False
def test_is_valid_manifest_format_with_empty_url(logfile): """ Test that by default, completely empty url values are allowed """ logging.setLevel(default_logging.ERROR) result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_empty_url.tsv", ) assert logfile.read() == "" assert result == True logging.setLevel(default_logging.WARNING)
def test_is_valid_manifest_with_missing_url_column(logfile): """ Test that a warning is generated for completely missing url column by default """ result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_missing_url_column.tsv", ) missing_size_message = ( 'could not find a column name corresponding to required "Columns.URL"' ) assert missing_size_message in logfile.read() assert result == True
def test_is_valid_manifest_with_missing_size_column(logfile): """ Test that completely missing size column is detected and reported in error log """ result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_missing_size_column.tsv", ) missing_size_message = ( 'could not find a column name corresponding to required "Columns.SIZE"' ) assert missing_size_message in logfile.read() assert result == False
def test_is_valid_manifest_format_with_invalid_md5_values(caplog): """ Test that invalid md5 errors are detected and error logged """ result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_invalid_md5_values.tsv" ) error_log = caplog.text manifest_with_invalid_md5_values_helper(error_log) base64_encoded_md5 = '"jd2L5LF5pSmvpfL/rkuYWA=="' assert base64_encoded_md5 in error_log assert result == False
def test_is_valid_manifest_with_missing_url_column(caplog): """ Test that a warning is generated for completely missing url column by default """ logging.getLogger().setLevel(logging.WARNING) result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_missing_url_column.tsv", ) missing_size_message = ( 'could not find a column name corresponding to required "Columns.URL"') assert missing_size_message in caplog.text assert result == True
def test_is_valid_manifest_with_missing_url_column_and_error_on_empty_url( caplog): """ Test that an error is generated for completely missing url column when using error_on_empty_url """ result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_missing_url_column.tsv", error_on_empty_url=True, ) missing_size_message = ( 'could not find a column name corresponding to required "Columns.URL"') assert missing_size_message in caplog.text assert result == False
def test_is_valid_manifest_format_using_line_limit(caplog): """ Test that only first few lines of manifest can be validated """ result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_invalid_sizes.tsv", line_limit=3, ) error_log = caplog.text assert "line 2" in error_log assert "line 3" in error_log assert "line 4" not in error_log assert "line 5" not in error_log assert result == False
def test_is_valid_manifest_format_with_invalid_authz_resources(caplog): """ Test that invalid authz resources are detected and reported in error log """ result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_invalid_authz_resources.tsv", ) error_log = caplog.text assert '"invalid_authz"' in error_log assert '"/"' in error_log assert '"//"' in error_log assert '"///"' in error_log assert '"invalid_authz2"' in error_log assert result == False
def test_is_valid_manifest_format_allowing_base64_encoded_md5(caplog): """ Test that valid Base64 encoded md5 does not get reported in error log when allow_base64_encoded_md5 is used """ result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_invalid_md5_values.tsv", allow_base64_encoded_md5=True, ) error_log = caplog.text manifest_with_invalid_md5_values_helper(error_log) base64_encoded_md5 = '"jd2L5LF5pSmvpfL/rkuYWA=="' assert base64_encoded_md5 not in error_log assert result == False
def test_is_valid_manifest_format_using_allowed_protocols(caplog): """ Test that user defined protocols can be used """ result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_custom_url_protocols.tsv", allowed_protocols=["s3", "gs", "http", "https"], ) error_log = caplog.text assert "gs://test/test.txt" not in error_log assert "s3://testaws/aws/test.txt" not in error_log assert "https://www.uchicago.edu/about" not in error_log assert "http://en.wikipedia.org/wiki/University_of_Chicago" not in error_log assert '"s3://bucket_without_path"' in error_log assert '"wrong_protocol://test_bucket/test.txt"' in error_log assert result == False
def test_is_valid_manifest_format_using_column_names_to_enums(caplog): """ Test that custom manifest column names can be used """ column_names_to_enums = { "md5_with_underscores": Columns.MD5, "file size with spaces": Columns.SIZE, "Urls With Caps": Columns.URL, "authz with special chars!@*&": Columns.AUTHZ, } result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_custom_column_names.tsv", column_names_to_enums=column_names_to_enums, ) error_log = caplog.text manifest_with_many_types_of_errors_helper(error_log) assert result == False
def objects_manifest_validate_format( ctx, file, allowed_protocols, allow_base64_encoded_md5, error_on_empty_url, line_limit, ): if not file: file = click.prompt( "Enter Discovery metadata file path to validate format for") is_valid = is_valid_manifest_format( manifest_path=file, column_names_to_enums=None, allowed_protocols=allowed_protocols.split(" "), allow_base64_encoded_md5=allow_base64_encoded_md5, error_on_empty_url=error_on_empty_url, line_limit=line_limit, ) # non-zero exit code if not is_valid: sys.exit(1)
def test_is_valid_manifest_format_with_invalid_urls(logfile): """ Test that invalid urls are detected and error logged Test that empty arrays and empty quote pairs are detected and error logged """ result = is_valid_manifest_format( "tests/validate_manifest_format/manifests/manifest_with_invalid_urls.tsv" ) error_log = logfile.read() assert '"wrong_protocol://test_bucket/test.txt"' in error_log assert '"test/test.txt"' in error_log assert '"testaws/aws/test.txt"' in error_log assert '"://test_bucket/test.txt"' in error_log assert '"s3://"' in error_log assert '"gs://"' in error_log assert '"s3://bucket_without_object"' in error_log assert '"s3://bucket_without_object/"' in error_log assert '"test_bucket/aws/test.txt"' in error_log assert '"s3:/test_bucket/aws/test.txt"' in error_log assert '"s3:test_bucket/aws/test.txt"' in error_log assert '"://test_bucket/aws/test.txt"' in error_log assert '"s3test_bucket/aws/test.txt"' in error_log assert '"https://www.uchicago.edu"' in error_log assert '"https://www.uchicago.edu/about"' in error_log assert '"google.com/path"' in error_log # if the url resolves to nothing after replacing characters, the log may just say # "is empty" and not list the original value assert '""""' in error_log or "is empty" in error_log assert "\"''\"" in error_log or "is empty" in error_log assert '"[]"' in error_log or "is empty" in error_log assert "\"['']\"" in error_log or "is empty" in error_log assert '"[""]"' in error_log or "is empty" in error_log assert '"["" ""]"' in error_log or "is empty" in error_log assert '"["" \'\']"' in error_log or "is empty" in error_log assert result == False
def test_is_valid_manifest_format_with_csv(caplog): """ Test that alternative delimiter can be automatically detected """ assert is_valid_manifest_format("tests/test_manifest.csv") == True assert caplog.text == ""