def test_csv_primitive_column(self):
    """Read one CSV record with a column of every primitive type and check each value.

    The record is handed to ``_build_csv_job`` as ``lines``; that helper takes the
    input lines as a required argument and writes the CSV file itself, so the
    test no longer writes the file by hand.
    """
    schema = (
        CsvSchema.builder()
        .add_number_column('tinyint', DataTypes.TINYINT())
        .add_number_column('smallint', DataTypes.SMALLINT())
        .add_number_column('int', DataTypes.INT())
        .add_number_column('bigint', DataTypes.BIGINT())
        .add_number_column('float', DataTypes.FLOAT())
        .add_number_column('double', DataTypes.DOUBLE())
        .add_number_column('decimal', DataTypes.DECIMAL(2, 0))
        .add_boolean_column('boolean')
        .add_string_column('string')
        .build()
    )
    # Adjacent literals concatenate, so this is a single CSV record.
    lines = [
        '127,'
        '-32767,'
        '2147483647,'
        '-9223372036854775808,'
        '3e38,'
        '2e-308,'
        '1.5,'
        'true,'
        'string\n',
    ]

    self._build_csv_job(schema, lines)
    self.env.execute('test_csv_primitive_column')

    row = self.test_sink.get_results(True, False)[0]
    self.assertEqual(row['tinyint'], 127)
    self.assertEqual(row['smallint'], -32767)
    self.assertEqual(row['int'], 2147483647)
    self.assertEqual(row['bigint'], -9223372036854775808)
    # Floating-point columns are compared with a tolerance rather than exactly.
    self.assertAlmostEqual(row['float'], 3e38, delta=1e31)
    self.assertAlmostEqual(row['double'], 2e-308, delta=2e-301)
    # The input 1.5 is declared as DECIMAL(2, 0); the expected value is 2.
    self.assertAlmostEqual(row['decimal'], 2)
    self.assertEqual(row['boolean'], True)
    self.assertEqual(row['string'], 'string')
def _create_csv_primitive_column_schema_and_lines() -> Tuple[CsvSchema, List[str]]:
    """Build a schema with one column per primitive type and a matching CSV record.

    :return: The built schema and a one-element list holding a single
        newline-terminated CSV line with one value per column.
    """
    builder = CsvSchema.builder()
    number_columns = (
        ('tinyint', DataTypes.TINYINT()),
        ('smallint', DataTypes.SMALLINT()),
        ('int', DataTypes.INT()),
        ('bigint', DataTypes.BIGINT()),
        ('float', DataTypes.FLOAT()),
        ('double', DataTypes.DOUBLE()),
        ('decimal', DataTypes.DECIMAL(2, 0)),
    )
    for column_name, column_type in number_columns:
        builder = builder.add_number_column(column_name, column_type)
    builder = builder.add_boolean_column('boolean')
    builder = builder.add_string_column('string')

    # Values appear in the same order as the columns declared above.
    values = (
        '127',
        '-32767',
        '2147483647',
        '-9223372036854775808',
        '3e38',
        '2e-308',
        '1.5',
        'true',
        'string',
    )
    record = ','.join(values) + '\n'
    return builder.build(), [record]
def test_csv_add_columns_from(self):
    """Copy the primitive-column schema with ``add_columns_from`` and check the job output."""
    source_schema, csv_lines = _create_csv_primitive_column_schema_and_lines()
    # The job runs against the copied schema, not the one the helper returned.
    copied_schema = CsvSchema.builder().add_columns_from(source_schema).build()

    self._build_csv_job(copied_schema, csv_lines)
    self.env.execute('test_csv_schema_copy')

    results = self.test_sink.get_results(True, False)
    _check_csv_primitive_column_results(self, results)
def test_csv_default_quote_char(self):
    """Check that a double-quoted field is read back without its quotes.

    The schema sets no quote character explicitly. The input line is passed to
    ``_build_csv_job`` as ``lines``; that helper takes the lines as a required
    argument and writes the CSV file itself.
    """
    schema = CsvSchema.builder().add_string_column('string').build()
    lines = ['"string"\n']

    self._build_csv_job(schema, lines)
    self.env.execute('test_csv_default_quote_char')

    row = self.test_sink.get_results(True, False)[0]
    # The surrounding double quotes must not be part of the value.
    self.assertEqual(row['string'], 'string')
def _create_csv_default_quote_char_schema_and_lines() -> Tuple[CsvSchema, List[str]]:
    """Build a two-string-column, ``|``-separated schema and a double-quoted record.

    No quote character is configured on the schema; both fields of the record
    are wrapped in double quotes.

    :return: The built schema and a list with the single input line.
    """
    builder = CsvSchema.builder()
    for column_name in ('string', 'string2'):
        builder = builder.add_string_column(column_name)
    schema = builder.set_column_separator('|').build()

    record = '"string"|"string2"\n'
    return schema, [record]
def _create_csv_use_header_schema_and_lines() -> Tuple[CsvSchema, List[str]]:
    """Build a string/number schema with ``set_use_header`` and header-plus-data input.

    :return: The built schema and two input lines: ``h1,h2`` followed by one
        data record (presumably the first line is treated as the header).
    """
    builder = CsvSchema.builder()
    builder = builder.add_string_column('string')
    builder = builder.add_number_column('number')
    schema = builder.set_use_header().build()

    header_line = 'h1,h2\n'
    data_line = 'string,123\n'
    return schema, [header_line, data_line]
def _build_csv_job(self, schema: CsvSchema, lines):
    """Write ``lines`` to the CSV input file and wire a CSV source-to-sink job.

    The job reads ``self.csv_file_name`` with a ``CsvReaderFormat`` for
    ``schema``, passes each element through an identity map typed with the
    schema's type info, and writes to a ``FileSink`` under
    ``self.csv_dir_name`` using a ``CsvBulkWriter`` for the same schema.
    The job is only built here; this method does not execute it.

    :param schema: CSV schema used for both reading and writing.
    :param lines: Iterable of strings written verbatim to the input file.
    """
    # Opening with 'w' truncates the file, so it holds exactly ``lines``.
    with open(self.csv_file_name, 'w') as csv_file:
        csv_file.writelines(lines)

    reader_format = CsvReaderFormat.for_schema(schema)
    source = FileSource.for_record_stream_format(reader_format, self.csv_file_name).build()
    stream = self.env.from_source(source, WatermarkStrategy.no_watermarks(), 'csv-source')

    bulk_writer = CsvBulkWriter.for_schema(schema)
    sink = FileSink.for_bulk_format(self.csv_dir_name, bulk_writer).build()

    typed_stream = stream.map(lambda e: e, output_type=schema.get_type_info())
    typed_stream.sink_to(sink)
def _create_csv_set_escape_char_schema_and_lines() -> Tuple[CsvSchema, List[str]]:
    """Build a two-string-column schema with a backslash escape character.

    The input record contains a backslash before a comma in the first field
    and backslashes before the double quotes around the second field.

    :return: The built schema and a list with the single input line.
    """
    builder = CsvSchema.builder()
    for column_name in ('string', 'string2'):
        builder = builder.add_string_column(column_name)
    builder = builder.set_column_separator(',')
    schema = builder.set_escape_char('\\').build()

    # On disk this line reads: string\,,\"string2\"
    record = 'string\\,,\\"string2\\"\n'
    return schema, [record]
def _create_csv_customize_quote_char_schema_lines() -> Tuple[CsvSchema, List[str]]:
    """Build a ``|``-separated two-string-column schema quoted with backticks.

    :return: The built schema and a list with one input line whose two fields
        are each wrapped in backticks.
    """
    builder = CsvSchema.builder()
    for column_name in ('string', 'string2'):
        builder = builder.add_string_column(column_name)
    builder = builder.set_column_separator('|')
    schema = builder.set_quote_char('`').build()

    record = '`string`|`string2`\n'
    return schema, [record]
def _create_csv_allow_comments_schema_and_lines() -> Tuple[CsvSchema, List[str]]:
    """Build a single-string-column schema with ``set_allow_comments`` and its input.

    :return: The built schema and three input lines; the middle one starts
        with ``#`` (presumably treated as a comment by the reader).
    """
    builder = CsvSchema.builder().add_string_column('string')
    schema = builder.set_allow_comments().build()

    first_record = 'a\n'
    comment_line = '# this is comment\n'
    second_record = 'b\n'
    return schema, [first_record, comment_line, second_record]
def test_csv_allow_comments(self):
    """Check that a ``#`` line is skipped when comments are allowed on the schema.

    The input lines are passed to ``_build_csv_job`` as ``lines``; that helper
    takes the lines as a required argument and writes the CSV file itself.
    """
    schema = (
        CsvSchema.builder()
        .add_string_column('string')
        .set_allow_comments()
        .build()
    )
    lines = [
        'a\n',
        '# this is comment\n',
        'b\n',
    ]

    self._build_csv_job(schema, lines)
    self.env.execute('test_csv_allow_comments')

    rows = self.test_sink.get_results(True, False)
    # The comment line must not show up between the two data rows.
    self.assertEqual(rows[0]['string'], 'a')
    self.assertEqual(rows[1]['string'], 'b')
def test_csv_use_header(self):
    """Check that with ``set_use_header`` the first result row is the data record.

    The input lines are passed to ``_build_csv_job`` as ``lines``; that helper
    takes the lines as a required argument and writes the CSV file itself.
    """
    schema = (
        CsvSchema.builder()
        .add_string_column('string')
        .add_number_column('number')
        .set_use_header()
        .build()
    )
    lines = [
        'h1,h2\n',
        'string,123\n',
    ]

    self._build_csv_job(schema, lines)
    self.env.execute('test_csv_use_header')

    # The first result must be the second input line, not the 'h1,h2' header.
    row = self.test_sink.get_results(True, False)[0]
    self.assertEqual(row['string'], 'string')
    self.assertEqual(row['number'], 123)
def _create_csv_array_column_schema_and_lines() -> Tuple[CsvSchema, List[str]]:
    """Build a schema of three array columns and one matching ``|``-separated record.

    Each array column uses its own element separator (``;``, ``:`` and ``,``).
    The schema uses ``|`` as the column separator and has the quote character
    disabled.

    :return: The built schema and a one-element list with the input line.
    """
    builder = CsvSchema.builder()
    # (column name, element separator, element type)
    array_columns = (
        ('number_array', ';', DataTypes.INT()),
        ('boolean_array', ':', DataTypes.BOOLEAN()),
        ('string_array', ',', DataTypes.STRING()),
    )
    for column_name, element_separator, element_type in array_columns:
        builder = builder.add_array_column(
            column_name, separator=element_separator, element_type=element_type)
    schema = builder.set_column_separator('|').disable_quote_char().build()

    fields = ('1;2;3', 'true:false', 'a,b,c')
    record = '|'.join(fields) + '\n'
    return schema, [record]
def test_csv_array_column(self):
    """Read int, boolean and string array columns and check the parsed lists.

    Each array column has its own element separator and columns are separated
    by ``|``. The record is passed to ``_build_csv_job`` as ``lines``; that
    helper takes the lines as a required argument and writes the CSV file
    itself.
    """
    schema = (
        CsvSchema.builder()
        .add_array_column('number_array', separator=';', element_type=DataTypes.INT())
        .add_array_column('boolean_array', separator=':', element_type=DataTypes.BOOLEAN())
        .add_array_column('string_array', separator=',', element_type=DataTypes.STRING())
        .set_column_separator('|')
        .build()
    )
    # Adjacent literals concatenate, so this is a single CSV record.
    lines = [
        '1;2;3|'
        'true:false|'
        'a,b,c\n',
    ]

    self._build_csv_job(schema, lines)
    self.env.execute('test_csv_array_column')

    row = self.test_sink.get_results(True, False)[0]
    self.assertListEqual(row['number_array'], [1, 2, 3])
    self.assertListEqual(row['boolean_array'], [True, False])
    self.assertListEqual(row['string_array'], ['a', 'b', 'c'])