def test_csv_output_custom_quote_char(client, log_output):
    """Exercise CSV output serialization with custom quote/escape characters.

    A bucket with a generated name is created, every case in the table below
    is handed to ``test_sql_api`` with ``QuoteFields="ALWAYS"`` CSV output,
    and the bucket is removed again even when a case raises. The JSON report
    of ``log_output`` is printed at the end.
    """
    bucket_name = generate_bucket_name()
    log_output.args['bucket_name'] = bucket_name

    # Each row: (output quote char, output escape char, object content,
    #            expected select output).
    # NOTE(review): an Exception instance as the expected value presumably
    # marks a request that must fail -- confirm against test_sql_api.
    cases = [
        ("''", "''", b'col1,col2,col3\n', Exception()),
        ("'", "'", b'col1,col2,col3\n', b"'col1','col2','col3'\n"),
        ("", '"', b'col1,col2,col3\n', b'\x00col1\x00,\x00col2\x00,\x00col3\x00\n'),
        ('"', '"', b'col1,col2,col3\n', b'"col1","col2","col3"\n'),
        ('"', '"', b'col"1,col2,col3\n', b'"col""1","col2","col3"\n'),
        ('"', '"', b'""""\n', b'""""\n'),
        ('"', '"', b'\n', b''),
        ("'", "\\", b'col1,col2,col3\n', b"'col1','col2','col3'\n"),
        ("'", "\\", b'col""1,col2,col3\n', b"'col\"\"1','col2','col3'\n"),
        ("'", "\\", b'col\'1,col2,col3\n', b"'col\\'1','col2','col3'\n"),
        ("'", "\\", b'"col\'1","col2","col3"\n', b"'col\\'1','col2','col3'\n"),
        ("'", "\\", b'col\'\n', b"'col\\''\n"),
        # Input holding two consecutive escaped quotes
        ("'", "\\", b'"a"""""\n', b"'a\"\"'\n"),
    ]

    client.make_bucket(bucket_name)
    try:
        for idx, case in enumerate(cases):
            quote_char, escape_char, input_data, expected_output = case

            # The input side is the same for every case: plain CSV quoted
            # and escaped with a double quote.
            csv_in = CSVInput(
                FileHeaderInfo="NONE",
                RecordDelimiter="\n",
                FieldDelimiter=",",
                QuoteCharacter='"',
                QuoteEscapeCharacter='"',
                Comments="#",
                AllowQuotedRecordDelimiter="FALSE",
            )
            source_format = InputSerialization(
                compression_type="NONE",
                csv=csv_in,
            )

            # Only the output quote/escape characters vary between cases.
            csv_out = CSVOutput(
                QuoteFields="ALWAYS",
                RecordDelimiter="\n",
                FieldDelimiter=",",
                QuoteCharacter=quote_char,
                QuoteEscapeCharacter=escape_char,
            )
            result_format = OutputSerialization(csv=csv_out)

            sql_opts = SelectObjectOptions(
                expression="select * from s3object",
                input_serialization=source_format,
                output_serialization=result_format,
                request_progress=RequestProgress(enabled="False"),
            )

            test_sql_api(f'test_{idx}', client, bucket_name,
                         input_data, sql_opts, expected_output)
    finally:
        client.remove_bucket(bucket_name)

    # All cases completed without raising
    print(log_output.json_report())
def test_xml_marshal_select(self):
    """Check the XML produced by ``xml_marshal_select`` for CSV in/out options.

    Builds a ``SelectObjectOptions`` with CSV input, CSV output and request
    progress set, marshals it, and compares the result with the expected
    request document byte for byte.
    """
    csv_in = CSVInput(
        FileHeaderInfo="USE",
        RecordDelimiter="\n",
        FieldDelimiter=",",
        QuoteCharacter='"',
        QuoteEscapeCharacter='"',
        Comments="#",
        AllowQuotedRecordDelimiter="FALSE",
    )
    source_format = InputSerialization(compression_type="NONE", csv=csv_in)

    csv_out = CSVOutput(
        QuoteFields="ASNEEDED",
        RecordDelimiter="\n",
        FieldDelimiter=",",
        QuoteCharacter='"',
        QuoteEscapeCharacter='"',
    )
    result_format = OutputSerialization(csv=csv_out)

    options = SelectObjectOptions(
        expression="select * from s3object",
        input_serialization=source_format,
        output_serialization=result_format,
        request_progress=RequestProgress(enabled="TRUE"),
    )

    # Expected document, one fragment per element group.
    expected_string = b''.join((
        b'<SelectObjectContentRequest>',
        b'<Expression>select * from s3object</Expression>',
        b'<ExpressionType>SQL</ExpressionType>',
        b'<InputSerialization>',
        b'<CompressionType>NONE</CompressionType>',
        b'<CSV><FileHeaderInfo>USE</FileHeaderInfo>',
        b'<RecordDelimiter>\n</RecordDelimiter>',
        b'<FieldDelimiter>,</FieldDelimiter>',
        b'<QuoteCharacter>"</QuoteCharacter>',
        b'<QuoteEscapeCharacter>"</QuoteEscapeCharacter>',
        b'<Comments>#</Comments>',
        b'<AllowQuotedRecordDelimiter>false',
        b'</AllowQuotedRecordDelimiter></CSV>',
        b'</InputSerialization>',
        b'<OutputSerialization><CSV>',
        b'<QuoteFields>ASNEEDED</QuoteFields>',
        b'<RecordDelimiter>\n</RecordDelimiter>',
        b'<FieldDelimiter>,</FieldDelimiter>',
        b'<QuoteCharacter>"</QuoteCharacter>',
        b'<QuoteEscapeCharacter>"</QuoteEscapeCharacter>',
        b'</CSV></OutputSerialization>',
        b'<RequestProgress>',
        b'<Enabled>true</Enabled>',
        b'</RequestProgress>',
        b'</SelectObjectContentRequest>',
    ))

    actual_string = xml_marshal_select(options)
    eq_(expected_string, actual_string)
def test_sql_expressions_custom_input_output(client, input_bytes, sql_input,
                                             sql_output, tests, log_output):
    """Run a table of select expressions against one uploaded object.

    The text in ``input_bytes`` is encoded as UTF-8 and uploaded to a newly
    created bucket. Each entry of ``tests`` is then executed through
    ``client.select_object_content`` and its streamed result compared with
    the expected output.

    Args:
        client: Client used for the bucket, object and select calls.
        input_bytes: Object content as text; it is encoded to UTF-8 here.
        sql_input: Value passed as ``input_serialization``.
        sql_output: Value passed as ``output_serialization``.
        tests: Iterable of ``(test_name, select_expression, expected_output)``.
            Entries whose expression is ``''`` are skipped. When the output
            differs and ``expected_output`` is a ``datetime``, the received
            text only has to parse as ``'%Y-%m-%dT%H:%M:%S.%f%z'``.
        log_output: Its ``args['total_tests']`` and ``args['total_success']``
            counters are reset and updated as entries run.

    Failures of individual entries are swallowed (see the TODO below); they
    are visible only as a gap between the two counters. The object and the
    bucket are removed in all cases.
    """
    bucket_name = generate_bucket_name()
    object_name = generate_object_name()

    log_output.args['total_tests'] = 0
    log_output.args['total_success'] = 0

    client.make_bucket(bucket_name)

    try:
        # Encode once and report the length of the encoded payload; the
        # character count differs from the byte count for non-ASCII text.
        payload = bytes(input_bytes, 'utf-8')
        client.put_object(bucket_name, object_name,
                          io.BytesIO(payload), len(payload))

        for idx, (_test_name, select_expression, expected_output) in enumerate(tests):
            if select_expression == '':
                continue
            try:
                log_output.args['total_tests'] += 1
                options = SelectObjectOptions(
                    expression=select_expression,
                    input_serialization=sql_input,
                    output_serialization=sql_output,
                    request_progress=RequestProgress(enabled="False"))

                data = client.select_object_content(
                    bucket_name, object_name, options)

                # Get the records
                records = io.BytesIO()
                for d in data.stream(10 * 1024):
                    records.write(d.encode('utf-8'))
                got_output = records.getvalue()

                if got_output != expected_output:
                    if isinstance(expected_output, datetime):
                        # Attempt to parse the date which will throw an
                        # exception for any issue
                        datetime.strptime(
                            got_output.decode("utf-8").strip(),
                            '%Y-%m-%dT%H:%M:%S.%f%z')
                    else:
                        raise ValueError(
                            'Test {}: data mismatch. Expected : {}. Received: {}.'
                            .format(idx + 1, expected_output, got_output))

                log_output.args['total_success'] += 1
            except Exception:
                # Deliberate best-effort: a failing entry is simply not
                # counted as a success.
                # TODO, raise instead
                continue
    finally:
        client.remove_object(bucket_name, object_name)
        client.remove_bucket(bucket_name)
def test_csv_input_custom_quote_char(client, log_output):
    """Exercise CSV input parsing with custom quote/escape characters.

    A bucket with a generated name is created, every case in the table below
    is handed to ``test_sql_api`` with JSON output, and the bucket is removed
    again even when a case raises. The JSON report of ``log_output`` is
    printed at the end.
    """
    bucket_name = generate_bucket_name()
    log_output.args['bucket_name'] = bucket_name

    # Each row: (input quote char, input escape char, object content,
    #            expected select output).
    cases = [
        # Invalid quote character, should fail
        ('""', '"', b'col1,col2,col3\n', Exception()),
        # UTF-8 quote character
        ('ع', '"', 'عcol1ع,عcol2ع,عcol3ع\n'.encode(),
         b'{"_1":"col1","_2":"col2","_3":"col3"}\n'),
        # Only one field is quoted
        ('"', '"', b'"col1",col2,col3\n',
         b'{"_1":"col1","_2":"col2","_3":"col3"}\n'),
        ('"', '"', b'"col1,col2,col3"\n', b'{"_1":"col1,col2,col3"}\n'),
        ('\'', '"', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        ('', '"', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        ('', '"', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        ('', '"', b'"col1","col2","col3"\n',
         b'{"_1":"\\"col1\\"","_2":"\\"col2\\"","_3":"\\"col3\\""}\n'),
        ('"', '"', b'""""""\n', b'{"_1":"\\"\\""}\n'),
        ('"', '"', b'A",B\n', b'{"_1":"A\\"","_2":"B"}\n'),
        ('"', '"', b'A"",B\n', b'{"_1":"A\\"\\"","_2":"B"}\n'),
        ('"', '\\', b'A\\B,C\n', b'{"_1":"A\\\\B","_2":"C"}\n'),
        ('"', '"', b'"A""B","CD"\n', b'{"_1":"A\\"B","_2":"CD"}\n'),
        ('"', '\\', b'"A\\B","CD"\n', b'{"_1":"AB","_2":"CD"}\n'),
        ('"', '\\', b'"A\\,","CD"\n', b'{"_1":"A,","_2":"CD"}\n'),
        ('"', '\\', b'"A\\"B","CD"\n', b'{"_1":"A\\"B","_2":"CD"}\n'),
        ('"', '\\', b'"A\\""\n', b'{"_1":"A\\""}\n'),
        ('"', '\\', b'"A\\"\\"B"\n', b'{"_1":"A\\"\\"B"}\n'),
        ('"', '\\', b'"A\\"","\\"B"\n', b'{"_1":"A\\"","_2":"\\"B"}\n'),
    ]

    client.make_bucket(bucket_name)
    try:
        for idx, case in enumerate(cases):
            quote_char, escape_char, data, expected_output = case

            # Only the input quote/escape characters vary between cases.
            csv_in = CSVInput(
                FileHeaderInfo="NONE",
                RecordDelimiter="\n",
                FieldDelimiter=",",
                QuoteCharacter=quote_char,
                QuoteEscapeCharacter=escape_char,
                Comments="#",
                AllowQuotedRecordDelimiter="FALSE",
            )
            source_format = InputSerialization(
                compression_type="NONE",
                csv=csv_in,
            )

            # Results are always requested as newline-delimited JSON.
            result_format = OutputSerialization(
                json=JsonOutput(RecordDelimiter="\n"),
            )

            sql_opts = SelectObjectOptions(
                expression="select * from s3object",
                input_serialization=source_format,
                output_serialization=result_format,
                request_progress=RequestProgress(enabled="False"),
            )

            test_sql_api(f'test_{idx}', client, bucket_name,
                         data, sql_opts, expected_output)
    finally:
        client.remove_bucket(bucket_name)

    # All cases completed without raising
    print(log_output.json_report())
def test_csv_output_quote_char(client, log_output):
    """Check CSV output quoting for several output quote characters.

    For every case the object content is queried through ``exec_select`` with
    the case's quote character used as both ``QuoteCharacter`` and
    ``QuoteEscapeCharacter`` of the CSV output. A case whose expected value
    is an ``Exception`` instance must make ``exec_select`` raise; any other
    case must return exactly the expected bytes.

    Raises:
        ValueError: A case failed unexpectedly, succeeded although a failure
            was expected, or returned different data.
    """
    # Get a unique bucket_name
    log_output.args['bucket_name'] = bucket_name = generate_bucket_name()

    # Each row: (output quote char, object content, expected output)
    tests = [
        # Two-character quote string; the request is expected to fail
        ("''", b'col1,col2,col3\n', Exception()),
        ("'", b'col1,col2,col3\n', b"'col1','col2','col3'\n"),
        ("", b'col1,col2,col3\n', b'\x00col1\x00,\x00col2\x00,\x00col3\x00\n'),
        ('"', b'col1,col2,col3\n', b'"col1","col2","col3"\n'),
        ('"', b'col"1,col2,col3\n', b'"col""1","col2","col3"\n'),
        ('"', b'\n', b''),
    ]

    # Create the bucket outside the try block: if creation fails there is
    # nothing to clean up, and a failing remove_bucket must not hide the
    # original error.
    client.make_bucket(bucket_name)

    try:
        for idx, (quote_char, object_content, expected_output) in enumerate(tests):
            options = SelectObjectOptions(
                expression="select * from s3object",
                input_serialization=InputSerialization(
                    compression_type="NONE",
                    csv=CSVInput(
                        FileHeaderInfo="NONE",
                        RecordDelimiter="\n",
                        FieldDelimiter=",",
                        QuoteCharacter='"',
                        QuoteEscapeCharacter='"',
                        Comments="#",
                        AllowQuotedRecordDelimiter="FALSE",
                    ),
                ),
                output_serialization=OutputSerialization(csv=CSVOutput(
                    QuoteFields="ALWAYS",
                    RecordDelimiter="\n",
                    FieldDelimiter=",",
                    QuoteCharacter=quote_char,
                    QuoteEscapeCharacter=quote_char,
                )),
                request_progress=RequestProgress(enabled="False"))

            try:
                got_output = exec_select(client, bucket_name, object_content,
                                         options, log_output)
            except Exception as select_err:
                if not isinstance(expected_output, Exception):
                    # Chain the cause so the underlying error stays visible.
                    raise ValueError(
                        'Test {} unexpectedly failed with: {}'.format(
                            idx + 1, select_err)) from select_err
            else:
                if isinstance(expected_output, Exception):
                    raise ValueError(
                        'Test {}: expected an exception, got {}'.format(
                            idx + 1, got_output))
                if got_output != expected_output:
                    raise ValueError(
                        'Test {}: data mismatch. Expected : {}. Received: {}.'.
                        format(idx + 1, expected_output, got_output))
    finally:
        # Errors propagate with their original type instead of being
        # re-wrapped in a bare Exception.
        client.remove_bucket(bucket_name)

    # Test passes
    print(log_output.json_report())
def test_csv_input_quote_char(client, log_output):
    """Check CSV input parsing for several input quote characters.

    For every case the object content is queried through ``exec_select`` with
    the case's quote character used as both ``QuoteCharacter`` and
    ``QuoteEscapeCharacter`` of the CSV input, and JSON output. A case whose
    expected value is an ``Exception`` instance must make ``exec_select``
    raise; any other case must return exactly the expected bytes.

    Raises:
        ValueError: A case failed unexpectedly, succeeded although a failure
            was expected, or returned different data.
    """
    # Get a unique bucket_name
    log_output.args['bucket_name'] = bucket_name = generate_bucket_name()

    # Each row: (input quote char, object content, expected output)
    tests = [
        # Invalid quote character, should fail
        ('""', b'col1,col2,col3\n', Exception()),
        # UTF-8 quote character
        ('ع', b'\xd8\xb9col1\xd8\xb9,\xd8\xb9col2\xd8\xb9,\xd8\xb9col3\xd8\xb9\n',
         b'{"_1":"col1","_2":"col2","_3":"col3"}\n'),
        # Only one field is quoted
        ('"', b'"col1",col2,col3\n',
         b'{"_1":"col1","_2":"col2","_3":"col3"}\n'),
        ('"', b'"col1,col2,col3"\n', b'{"_1":"col1,col2,col3"}\n'),
        ('\'', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        ('', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        ('', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        ('', b'"col1","col2","col3"\n',
         b'{"_1":"\\"col1\\"","_2":"\\"col2\\"","_3":"\\"col3\\""}\n'),
        ('"', b'""""""\n', b'{"_1":"\\"\\""}\n'),
    ]

    # Create the bucket outside the try block: if creation fails there is
    # nothing to clean up, and a failing remove_bucket must not hide the
    # original error.
    client.make_bucket(bucket_name)

    try:
        for idx, (quote_char, object_content, expected_output) in enumerate(tests):
            options = SelectObjectOptions(
                expression="select * from s3object",
                input_serialization=InputSerialization(
                    compression_type="NONE",
                    csv=CSVInput(
                        FileHeaderInfo="NONE",
                        RecordDelimiter="\n",
                        FieldDelimiter=",",
                        QuoteCharacter=quote_char,
                        QuoteEscapeCharacter=quote_char,
                        Comments="#",
                        AllowQuotedRecordDelimiter="FALSE",
                    ),
                ),
                output_serialization=OutputSerialization(
                    json=JsonOutput(RecordDelimiter="\n", )),
                request_progress=RequestProgress(enabled="False"))

            try:
                got_output = exec_select(client, bucket_name, object_content,
                                         options, log_output)
            except Exception as select_err:
                if not isinstance(expected_output, Exception):
                    # Chain the cause so the underlying error stays visible.
                    raise ValueError(
                        'Test {} unexpectedly failed with: {}'.format(
                            idx + 1, select_err)) from select_err
            else:
                if isinstance(expected_output, Exception):
                    raise ValueError(
                        'Test {}: expected an exception, got {}'.format(
                            idx + 1, got_output))
                if got_output != expected_output:
                    raise ValueError(
                        'Test {}: data mismatch. Expected : {}, Received {}'.
                        format(idx + 1, expected_output, got_output))
    finally:
        # Errors propagate with their original type instead of being
        # re-wrapped in a bare Exception.
        client.remove_bucket(bucket_name)

    # Test passes
    print(log_output.json_report())
# Options for a select request that returns every record of the object
# ("select * from s3object"), reading uncompressed CSV and writing CSV.
options = SelectObjectOptions(
    expression="select * from s3object",
    input_serialization=InputSerialization(
        compression_type="NONE",
        # CSV input: newline-delimited records, comma-separated fields,
        # double quote as both quote and quote-escape character, "#" as the
        # comment marker.
        csv=CSVInput(
            FileHeaderInfo="USE",
            RecordDelimiter="\n",
            FieldDelimiter=",",
            QuoteCharacter='"',
            QuoteEscapeCharacter='"',
            Comments="#",
            AllowQuotedRecordDelimiter="FALSE",
        ),
        # If input is JSON
        # json=JSONInput(Type="DOCUMENT",)
    ),
    output_serialization=OutputSerialization(
        # CSV output with the same delimiters and quote characters as the
        # input; QuoteFields is set to "ASNEEDED".
        csv=CSVOutput(
            QuoteFields="ASNEEDED",
            RecordDelimiter="\n",
            FieldDelimiter=",",
            QuoteCharacter='"',
            QuoteEscapeCharacter='"',
        )
        # Alternative when JSON output is wanted:
        # json = JsonOutput(
        #     RecordDelimiter="\n",
        # )
    ),
    # Request progress is passed as the string "False" here.
    request_progress=RequestProgress(enabled="False"))
secret_key='YOUR-SECRETKEY') options = SelectObjectOptions( expression="select * from s3object", input_serialization=InputSerialization( compression_type="NONE", csv=CSVInput( file_header_info="USE", record_delimiter="\n", field_delimiter=",", quote_character='"', quote_escape_character='"', comments="#", allow_quoted_record_delimiter="FALSE", ), # If input is JSON # json=JSONInput(json_type="DOCUMENT") ), output_serialization=OutputSerialization( csv=CSVOutput( quote_fields="ASNEEDED", record_delimiter="\n", field_delimiter=",", quote_character='"', quote_escape_character='"', ), # json = JSONOutput(record_delimiter="\n") ), request_progress=RequestProgress(enabled="False")) try: