def test_delete_correct_rows_from_json_file_with_composite_types_with_nullable_or_undefined_identifiers():
    """A composite matcher deletes only rows where every listed column matches,
    leaving rows with null, missing, or empty identifier objects untouched."""
    # Arrange
    queries = [
        {
            "Columns": ["user.name", "parents.mother"],
            "MatchIds": [["John", "23456"]],
            "Type": "Composite",
        }
    ]
    source_lines = [
        '{"user": {"id": "12345", "name": "John"}, "parents": {"mother": "23456"}}',
        '{"user": {"id": "23456", "name": "John"}, "parents": {"mother": null}}',
        '{"user": {"id": "34567", "name": "John"}}',
        '{"user": {"id": "45678", "name": "John"}, "parents": {}}',
        '{"user": {"id": "45678", "name": "John"}, "parents": null}',
    ]
    input_file = to_json_file("".join(line + "\n" for line in source_lines))
    # Act
    output, stats = delete_matches_from_json_file(input_file, queries)
    # Assert: only the first row fully matched the composite query.
    assert isinstance(output, pa.BufferOutputStream)
    assert stats == {"ProcessedRows": 5, "DeletedRows": 1}
    assert to_json_string(output) == "".join(line + "\n" for line in source_lines[1:])
def test_it_throws_meaningful_error_for_serialization_issues():
    """A malformed JSON line should surface as a ValueError whose message
    pinpoints the parse failure (line/column/offset)."""
    # Arrange
    matchers = [{"Column": "customer_id", "MatchIds": ["23456"], "Type": "Simple"}]
    payload = (
        '{"customer_id": "12345", "x": 1.2, "d":"2001-01-01"}\n'
        '{"customer_id": "23456", "x": 2.3, "d":"invalid\n'
        '{"customer_id": "34567", "x": 3.4, "d":"2001-01-05"}\n'
    )
    input_file = to_json_file(payload)
    # Act / Assert
    with pytest.raises(ValueError) as exc_info:
        delete_matches_from_json_file(input_file, matchers)
    assert exc_info.value.args[0] == (
        "Serialization error when parsing JSON lines: "
        "Unterminated string starting at: line 2 column 40 (char 39)"
    )
def test_delete_correct_rows_from_json_file_with_complex_types():
    """A simple matcher on a nested column deletes the matching row even when
    other columns hold complex (array) values, including empty arrays."""
    # Arrange
    to_delete = [{"Column": "user.id", "MatchIds": ["23456"], "Type": "Simple"}]
    data = (
        '{"user": {"id": "12345", "name": "John"}, "d":["2001-01-01"]}\n'
        '{"user": {"id": "23456", "name": "Jane"}, "d":[]}\n'
        '{"user": {"id": "34567", "name": "Mary"}, "d":["2001-01-08"]}\n'
    )
    out_stream = to_json_file(data)
    # Act
    # Pass an explicit False for the compressed flag (the sibling gzip test
    # passes True here); the original used a merely-falsy [] placeholder.
    out, stats = delete_matches_from_json_file(out_stream, to_delete, False)
    # Assert
    assert isinstance(out, pa.BufferOutputStream)
    assert {"ProcessedRows": 3, "DeletedRows": 1} == stats
    assert to_json_string(out) == (
        '{"user": {"id": "12345", "name": "John"}, "d":["2001-01-01"]}\n'
        '{"user": {"id": "34567", "name": "Mary"}, "d":["2001-01-08"]}\n'
    )
def test_delete_correct_rows_when_missing_newline_at_the_end():
    """Input without a trailing newline is still processed fully, and the
    output is normalised to end with a newline."""
    # Arrange
    to_delete = [{"Column": "customer_id", "MatchIds": ["23456"], "Type": "Simple"}]
    data = (
        '{"customer_id": "12345", "x": 1.2, "d":"2001-01-01"}\n'
        '{"customer_id": "23456", "x": 2.3, "d":"2001-01-03"}\n'
        '{"customer_id": "34567", "x": 3.4, "d":"2001-01-05"}'
    )
    out_stream = to_json_file(data)
    # Act
    # Pass an explicit False for the compressed flag (the sibling gzip test
    # passes True here); the original used a merely-falsy [] placeholder.
    out, stats = delete_matches_from_json_file(out_stream, to_delete, False)
    # Assert
    assert isinstance(out, pa.BufferOutputStream)
    assert {"ProcessedRows": 3, "DeletedRows": 1} == stats
    assert to_json_string(out) == (
        '{"customer_id": "12345", "x": 1.2, "d":"2001-01-01"}\n'
        '{"customer_id": "34567", "x": 3.4, "d":"2001-01-05"}\n'
    )
def test_it_handles_json_with_gzip_compression():
    """With the compressed flag set, gzip input is read and gzip output is
    produced, with matching rows removed."""
    # Arrange
    matchers = [{"Column": "customer_id", "MatchIds": ["23456"], "Type": "Simple"}]
    payload = (
        '{"customer_id": "12345", "x": 7, "d":"2001-01-01"}\n'
        '{"customer_id": "23456", "x": 8, "d":"2001-01-03"}\n'
        '{"customer_id": "34567", "x": 9, "d":"2001-01-05"}\n'
    )
    compressed_file = to_compressed_json_file(payload)
    # Act
    result, stats = delete_matches_from_json_file(compressed_file, matchers, True)
    # Assert
    assert isinstance(result, pa.BufferOutputStream)
    assert stats == {"ProcessedRows": 3, "DeletedRows": 1}
    expected = (
        '{"customer_id": "12345", "x": 7, "d":"2001-01-01"}\n'
        '{"customer_id": "34567", "x": 9, "d":"2001-01-05"}\n'
    )
    assert to_decompressed_json_string(result) == expected
def test_delete_correct_rows_from_json_file_with_multiple_identifiers():
    """Several simple matchers are applied together: a row is deleted when
    any one of them matches."""
    # Arrange
    matchers = [
        {"Column": "user.id", "MatchIds": ["23456"], "Type": "Simple"},
        {"Column": "mother", "MatchIds": ["23456"], "Type": "Simple"},
    ]
    payload = (
        '{"user": {"id": "12345", "name": "John"}, "mother": "23456"}\n'
        '{"user": {"id": "23456", "name": "Jane"}, "mother": null}\n'
        '{"user": {"id": "34567", "name": "Mary"}}\n'
    )
    input_file = to_json_file(payload)
    # Act
    result, stats = delete_matches_from_json_file(input_file, matchers)
    # Assert: row 1 matched on "mother", row 2 on "user.id".
    assert isinstance(result, pa.BufferOutputStream)
    assert stats == {"ProcessedRows": 3, "DeletedRows": 2}
    assert to_json_string(result) == '{"user": {"id": "34567", "name": "Mary"}}\n'
def test_delete_correct_rows_from_json_file_with_lower_cased_column_id():
    """Column identifiers are matched case-insensitively: a lower-cased
    "userid" matcher still hits the camel-cased "userId" field."""
    # Arrange
    matchers = [{"Column": "userid", "MatchIds": ["23456"], "Type": "Simple"}]
    rows = [
        '{"userId": "12345", "fullName": "JohnDoe"}',
        '{"userId": "23456", "fullName": "JaneDoe"}',
        '{"userId": "34567", "fullName": "MaryMary"}',
    ]
    input_file = to_json_file("".join(r + "\n" for r in rows))
    # Act
    result, stats = delete_matches_from_json_file(input_file, matchers)
    # Assert: only the middle row is removed.
    assert isinstance(result, pa.BufferOutputStream)
    assert stats == {"ProcessedRows": 3, "DeletedRows": 1}
    assert to_json_string(result) == rows[0] + "\n" + rows[2] + "\n"
def test_it_generates_new_json_file_without_matches():
    """Baseline happy path: a single simple matcher removes exactly the
    matching row and preserves the others verbatim."""
    # Arrange
    matchers = [{"Column": "customer_id", "MatchIds": ["23456"], "Type": "Simple"}]
    rows = [
        '{"customer_id": "12345", "x": 1.2, "d":"2001-01-01"}',
        '{"customer_id": "23456", "x": 2.3, "d":"2001-01-03"}',
        '{"customer_id": "34567", "x": 3.4, "d":"2001-01-05"}',
    ]
    input_file = to_json_file("".join(r + "\n" for r in rows))
    # Act
    result, stats = delete_matches_from_json_file(input_file, matchers)
    # Assert
    assert isinstance(result, pa.BufferOutputStream)
    assert stats == {"ProcessedRows": 3, "DeletedRows": 1}
    assert to_json_string(result) == rows[0] + "\n" + rows[2] + "\n"
def test_delete_correct_rows_containing_newlines_as_content():
    """Rows whose string values contain a unicode line separator (U+2028) or
    an escaped \\n must not be split into extra records; only the row matched
    by customer_id is deleted."""
    # Arrange
    to_delete = [{"Column": "customer_id", "MatchIds": ["12345"], "Type": "Simple"}]
    # The second row embeds U+2028 (UNICODE LINE SEPARATOR) plus a literal
    # backslash-n inside a JSON string value.
    data = (
        '{"customer_id": "12345", "d": "foo"}\n'
        '{"customer_id": "23456", "d": "foo\u2028\\nbar"}\n'
        '{"customer_id": "34567", "d": "bar"}\n'
    )
    out_stream = to_json_file(data)
    # Act
    # Pass an explicit False for the compressed flag (the sibling gzip test
    # passes True here); the original used a merely-falsy [] placeholder.
    out, stats = delete_matches_from_json_file(out_stream, to_delete, False)
    # Assert
    assert isinstance(out, pa.BufferOutputStream)
    assert {"ProcessedRows": 3, "DeletedRows": 1} == stats
    assert to_json_string(out) == (
        '{"customer_id": "23456", "d": "foo\u2028\\nbar"}\n'
        '{"customer_id": "34567", "d": "bar"}\n'
    )
def test_delete_correct_rows_from_json_file_with_composite_types_single_col():
    """A composite matcher over a single column behaves like a simple matcher
    and can delete multiple rows sharing that value."""
    # Arrange
    matchers = [
        {
            "Columns": ["last_name"],
            "MatchIds": [["Doe"]],
            "Type": "Composite",
        }
    ]
    payload = (
        '{"customer_id": 12345, "first_name": "John", "last_name": "Doe"}\n'
        '{"customer_id": 23456, "first_name": "Jane", "last_name": "Doe"}\n'
        '{"customer_id": 34567, "first_name": "Mary", "last_name": "Hey"}\n'
    )
    input_file = to_json_file(payload)
    # Act
    result, stats = delete_matches_from_json_file(input_file, matchers)
    # Assert: both "Doe" rows are gone.
    assert isinstance(result, pa.BufferOutputStream)
    assert stats == {"ProcessedRows": 3, "DeletedRows": 2}
    expected = '{"customer_id": 34567, "first_name": "Mary", "last_name": "Hey"}\n'
    assert to_json_string(result) == expected