Exemple #1
0
 def test_csv_primitive_column(self):
     """Read one CSV row with a column per primitive type and check each value.

     The DECIMAL(2, 0) column is fed '1.5' and is expected to read back as
     (approximately) 2.
     """
     schema = (
         CsvSchema.builder()
         .add_number_column('tinyint', DataTypes.TINYINT())
         .add_number_column('smallint', DataTypes.SMALLINT())
         .add_number_column('int', DataTypes.INT())
         .add_number_column('bigint', DataTypes.BIGINT())
         .add_number_column('float', DataTypes.FLOAT())
         .add_number_column('double', DataTypes.DOUBLE())
         .add_number_column('decimal', DataTypes.DECIMAL(2, 0))
         .add_boolean_column('boolean')
         .add_string_column('string')
         .build()
     )

     # One fragment per column, in schema order; together they form one row.
     row_fragments = (
         '127,',
         '-32767,',
         '2147483647,',
         '-9223372036854775808,',
         '3e38,',
         '2e-308,',
         '1.5,',
         'true,',
         'string\n',
     )
     with open(self.csv_file_name, 'w') as csv_file:
         csv_file.writelines(row_fragments)

     self._build_csv_job(schema)
     self.env.execute('test_csv_primitive_column')

     results = self.test_sink.get_results(True, False)
     record = results[0]
     self.assertEqual(record['tinyint'], 127)
     self.assertEqual(record['smallint'], -32767)
     self.assertEqual(record['int'], 2147483647)
     self.assertEqual(record['bigint'], -9223372036854775808)
     # Floating-point columns are compared with a tolerance, not exactly.
     self.assertAlmostEqual(record['float'], 3e38, delta=1e31)
     self.assertAlmostEqual(record['double'], 2e-308, delta=2e-301)
     self.assertAlmostEqual(record['decimal'], 2)
     self.assertEqual(record['boolean'], True)
     self.assertEqual(record['string'], 'string')
Exemple #2
0
def _create_csv_primitive_column_schema_and_lines() -> Tuple[CsvSchema, List[str]]:
    """Return a schema with one column per primitive type and one matching CSV row.

    :return: ``(schema, lines)`` where ``lines`` holds a single
        newline-terminated row.
    """
    schema = (
        CsvSchema.builder()
        .add_number_column('tinyint', DataTypes.TINYINT())
        .add_number_column('smallint', DataTypes.SMALLINT())
        .add_number_column('int', DataTypes.INT())
        .add_number_column('bigint', DataTypes.BIGINT())
        .add_number_column('float', DataTypes.FLOAT())
        .add_number_column('double', DataTypes.DOUBLE())
        .add_number_column('decimal', DataTypes.DECIMAL(2, 0))
        .add_boolean_column('boolean')
        .add_string_column('string')
        .build()
    )
    # The fragments are listed per column but form exactly one CSV line.
    single_row = ''.join((
        '127,',
        '-32767,',
        '2147483647,',
        '-9223372036854775808,',
        '3e38,',
        '2e-308,',
        '1.5,',
        'true,',
        'string\n',
    ))
    return schema, [single_row]
Exemple #3
0
    def test_csv_add_columns_from(self):
        """Build a schema via ``add_columns_from`` on the primitive-column schema and run it.

        The sink results are handed to ``_check_csv_primitive_column_results``.
        """
        source_schema, csv_lines = _create_csv_primitive_column_schema_and_lines()
        copied_builder = CsvSchema.builder().add_columns_from(source_schema)
        self._build_csv_job(copied_builder.build(), csv_lines)

        self.env.execute('test_csv_schema_copy')
        results = self.test_sink.get_results(True, False)
        _check_csv_primitive_column_results(self, results)
Exemple #4
0
 def test_csv_default_quote_char(self):
     """Expect a double-quoted field to be read back without its quotes.

     The schema sets no quote character explicitly.
     """
     schema = CsvSchema.builder().add_string_column('string').build()

     with open(self.csv_file_name, 'w') as csv_file:
         csv_file.write('"string"\n')

     self._build_csv_job(schema)
     self.env.execute('test_csv_default_quote_char')

     results = self.test_sink.get_results(True, False)
     first_record = results[0]
     self.assertEqual(first_record['string'], 'string')
Exemple #5
0
def _create_csv_default_quote_char_schema_and_lines() -> Tuple[CsvSchema, List[str]]:
    """Return a two-string-column, '|'-separated schema and one double-quoted row.

    :return: ``(schema, lines)`` with a single newline-terminated line.
    """
    builder = CsvSchema.builder()
    builder = builder.add_string_column('string')
    builder = builder.add_string_column('string2')
    builder = builder.set_column_separator('|')
    return builder.build(), ['"string"|"string2"\n']
Exemple #6
0
def _create_csv_use_header_schema_and_lines() -> Tuple[CsvSchema, List[str]]:
    """Return a header-enabled (string, number) schema with a header line and one data line.

    :return: ``(schema, lines)``; the first line is the header row.
    """
    schema = (
        CsvSchema.builder()
        .add_string_column('string')
        .add_number_column('number')
        .set_use_header()
        .build()
    )
    header_line = 'h1,h2\n'
    data_line = 'string,123\n'
    return schema, [header_line, data_line]
Exemple #7
0
 def _build_csv_job(self, schema: CsvSchema, lines):
     """Write ``lines`` to the CSV input file and assemble the read/write pipeline.

     The pipeline is: CSV file source -> identity map -> CSV bulk file sink.
     Nothing is executed here; the caller runs the environment.

     :param schema: Schema used for both the reader format and the bulk writer.
     :param lines: Strings written verbatim to ``self.csv_file_name``.
     """
     with open(self.csv_file_name, 'w') as csv_file:
         csv_file.writelines(lines)

     reader_format = CsvReaderFormat.for_schema(schema)
     source = FileSource.for_record_stream_format(
         reader_format, self.csv_file_name).build()
     stream = self.env.from_source(
         source, WatermarkStrategy.no_watermarks(), 'csv-source')

     bulk_writer = CsvBulkWriter.for_schema(schema)
     sink = FileSink.for_bulk_format(self.csv_dir_name, bulk_writer).build()

     # The identity map carries the schema's type info into the sink.
     typed_stream = stream.map(lambda e: e, output_type=schema.get_type_info())
     typed_stream.sink_to(sink)
Exemple #8
0
def _create_csv_set_escape_char_schema_and_lines() -> Tuple[CsvSchema, List[str]]:
    """Return a two-string-column schema with a backslash escape char and one escaped row.

    The row escapes a comma in the first field and the double quotes around
    the second field.

    :return: ``(schema, lines)`` with a single newline-terminated line.
    """
    schema = (
        CsvSchema.builder()
        .add_string_column('string')
        .add_string_column('string2')
        .set_column_separator(',')
        .set_escape_char('\\')
        .build()
    )
    escaped_row = 'string\\,,\\"string2\\"\n'
    return schema, [escaped_row]
Exemple #9
0
def _create_csv_customize_quote_char_schema_lines() -> Tuple[CsvSchema, List[str]]:
    """Return a '|'-separated two-string-column schema quoted with backticks, plus one row.

    :return: ``(schema, lines)`` with a single newline-terminated line.
    """
    builder = CsvSchema.builder()
    builder = builder.add_string_column('string')
    builder = builder.add_string_column('string2')
    builder = builder.set_column_separator('|')
    builder = builder.set_quote_char('`')
    return builder.build(), ['`string`|`string2`\n']
Exemple #10
0
def _create_csv_allow_comments_schema_and_lines() -> Tuple[CsvSchema, List[str]]:
    """Return a comment-enabled single-string-column schema and three input lines.

    The middle line starts with '#'.

    :return: ``(schema, lines)``.
    """
    schema = (
        CsvSchema.builder()
        .add_string_column('string')
        .set_allow_comments()
        .build()
    )
    return schema, ['a\n', '# this is comment\n', 'b\n']
Exemple #11
0
 def test_csv_allow_comments(self):
     """Expect only the two data rows back when a '#' line sits between them."""
     schema = (
         CsvSchema.builder()
         .add_string_column('string')
         .set_allow_comments()
         .build()
     )

     file_lines = ('a\n', '# this is comment\n', 'b\n')
     with open(self.csv_file_name, 'w') as csv_file:
         csv_file.writelines(file_lines)

     self._build_csv_job(schema)
     self.env.execute('test_csv_allow_comments')

     records = self.test_sink.get_results(True, False)
     self.assertEqual(records[0]['string'], 'a')
     self.assertEqual(records[1]['string'], 'b')
Exemple #12
0
 def test_csv_use_header(self):
     """Expect the first result to be the data line when the file starts with a header line."""
     schema = (
         CsvSchema.builder()
         .add_string_column('string')
         .add_number_column('number')
         .set_use_header()
         .build()
     )

     file_lines = ('h1,h2\n', 'string,123\n')
     with open(self.csv_file_name, 'w') as csv_file:
         csv_file.writelines(file_lines)

     self._build_csv_job(schema)
     self.env.execute('test_csv_use_header')

     results = self.test_sink.get_results(True, False)
     first_record = results[0]
     self.assertEqual(first_record['string'], 'string')
     self.assertEqual(first_record['number'], 123)
Exemple #13
0
def _create_csv_array_column_schema_and_lines() -> Tuple[CsvSchema, List[str]]:
    """Return a schema of three array columns (int, boolean, string) and one matching row.

    Columns are separated by '|', each array has its own element separator,
    and quoting is disabled on the schema.

    :return: ``(schema, lines)`` with a single newline-terminated line.
    """
    schema = (
        CsvSchema.builder()
        .add_array_column('number_array', separator=';', element_type=DataTypes.INT())
        .add_array_column('boolean_array', separator=':', element_type=DataTypes.BOOLEAN())
        .add_array_column('string_array', separator=',', element_type=DataTypes.STRING())
        .set_column_separator('|')
        .disable_quote_char()
        .build()
    )
    # One fragment per array column; joined they form a single CSV line.
    single_row = ''.join((
        '1;2;3|',
        'true:false|',
        'a,b,c\n',
    ))
    return schema, [single_row]
Exemple #14
0
 def test_csv_array_column(self):
     """Read one row of three array columns and check each parsed list."""
     schema = (
         CsvSchema.builder()
         .add_array_column('number_array', separator=';', element_type=DataTypes.INT())
         .add_array_column('boolean_array', separator=':', element_type=DataTypes.BOOLEAN())
         .add_array_column('string_array', separator=',', element_type=DataTypes.STRING())
         .set_column_separator('|')
         .build()
     )

     # One fragment per array column; together they form a single CSV line.
     row_fragments = ('1;2;3|', 'true:false|', 'a,b,c\n')
     with open(self.csv_file_name, 'w') as csv_file:
         csv_file.writelines(row_fragments)

     self._build_csv_job(schema)
     self.env.execute('test_csv_array_column')

     record = self.test_sink.get_results(True, False)[0]
     expected_by_column = (
         ('number_array', [1, 2, 3]),
         ('boolean_array', [True, False]),
         ('string_array', ['a', 'b', 'c']),
     )
     # Checked in column order so the first failure reported is unchanged.
     for column, expected in expected_by_column:
         self.assertListEqual(record[column], expected)