Ejemplos de schema en Python, ejemplos de pyarrow.schema en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: test_schema.py Proyecto: rok/arrow

def test_schema():
    """Build a schema from ``pa.field`` objects and check its accessors."""
    field_list = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    schema = pa.schema(field_list)
    first_type = field_list[0].type

    # Names and types are reported in declaration order.
    assert schema.names == ['foo', 'bar', 'baz']
    assert schema.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

    # Lookup by position and by name must agree on name and type.
    assert len(schema) == 3
    by_position = schema[0]
    assert by_position.name == 'foo'
    assert by_position.type == first_type
    by_name = schema.field_by_name('foo')
    assert by_name.name == 'foo'
    assert by_name.type == first_type

    expected_repr = (
        "foo: int32\n"
        "bar: string\n"
        "baz: list<item: int8>\n"
        "  child 0, item: int8"
    )
    assert repr(schema) == expected_repr

    # A schema cannot be built from something that is not a field.
    with pytest.raises(TypeError):
        pa.schema([None])

Ejemplo n.º 2

0

Mostrar archivo

Archivo: test_convert_pandas.py Proyecto: marklavrynenko-original/arrow

    def test_timestamps_notimezone_nulls(self):
        """Round-trip naive timestamp columns containing a null.

        Runs once with millisecond data (``timestamps_to_ms=True``) and
        once with nanosecond data (``timestamps_to_ms=False``).
        """
        cases = [
            ('ms', True, ['2007-07-13T01:23:34.123',
                          None,
                          '2010-08-13T05:46:57.437']),
            ('ns', False, ['2007-07-13T01:23:34.123456789',
                           None,
                           '2010-08-13T05:46:57.437699912']),
        ]
        for unit, to_ms, stamps in cases:
            frame = pd.DataFrame({
                'datetime64': np.array(
                    stamps, dtype='datetime64[{}]'.format(unit)),
            })
            expected = pa.schema([pa.field('datetime64', pa.timestamp(unit))])
            self._check_pandas_roundtrip(
                frame,
                timestamps_to_ms=to_ms,
                expected_schema=expected,
            )

Ejemplo n.º 3

0

Mostrar archivo

Archivo: test_csv.py Proyecto: laurentgo/arrow

    def test_custom_nulls(self):
        """CSV type inference honours user-supplied null spellings."""
        # Case 1: only 'Xxx' and 'Zzz' are configured as null values.
        custom = ConvertOptions(null_values=['Xxx', 'Zzz'])
        table = self.read_bytes(
            b"a,b,c,d\nZzz,Xxx,1,2\nXxx,#N/A,,Zzz\n",
            convert_options=custom)
        expected_schema = pa.schema([
            ('a', pa.null()),
            ('b', pa.string()),
            ('c', pa.string()),
            ('d', pa.int64()),
        ])
        expected_values = {
            'a': [None, None],
            'b': [u"Xxx", u"#N/A"],
            'c': [u"1", u""],
            'd': [2, None],
        }
        assert table.schema == expected_schema
        assert table.to_pydict() == expected_values

        # Case 2: an empty null list means nothing is read as null.
        no_nulls = ConvertOptions(null_values=[])
        table = self.read_bytes(b"a,b\n#N/A,\n", convert_options=no_nulls)
        expected_schema = pa.schema([
            ('a', pa.string()),
            ('b', pa.string()),
        ])
        expected_values = {
            'a': [u"#N/A"],
            'b': [u""],
        }
        assert table.schema == expected_schema
        assert table.to_pydict() == expected_values

Ejemplo n.º 4

0

Mostrar archivo

Archivo: test_schema.py Proyecto: rok/arrow

def test_schema_equals_propagates_check_metadata():
    """Field-level metadata takes part in ``Schema.equals`` (ARROW-4088)."""
    foo = pa.field('foo', pa.int32())
    plain_bar = pa.field('bar', pa.string())
    tagged_bar = pa.field('bar', pa.string(), metadata={'a': 'alpha'})

    without_metadata = pa.schema([foo, plain_bar])
    with_metadata = pa.schema([foo, tagged_bar])

    # Differ only in field metadata: unequal unless the check is disabled.
    assert not without_metadata.equals(with_metadata)
    assert without_metadata.equals(with_metadata, check_metadata=False)

Ejemplo n.º 5

0

Mostrar archivo

Archivo: test_schema.py Proyecto: giantwhale/arrow

def test_schema_equals():
    """Schemas with the same fields are equal; dropping a field breaks it."""
    field_list = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]

    full = pa.schema(field_list)
    full_again = pa.schema(field_list)
    assert full.equals(full_again)

    # Remove the trailing field and rebuild from the shortened list.
    field_list.pop()
    shorter = pa.schema(field_list)
    assert not full.equals(shorter)

Ejemplo n.º 6

0

Mostrar archivo

Archivo: test_table.py Proyecto: rok/arrow

def test_table_from_pydict():
    """Exercise ``pa.Table.from_pydict`` with several kinds of input.

    Covers an empty dict, values given as arrays, chunked arrays and plain
    lists, and the ``metadata`` / ``schema`` keyword arguments. Later
    sections reuse ``data`` and ``schema`` from earlier ones, so the order
    of the sections matters.
    """
    # An empty mapping gives an empty table with an empty schema.
    table = pa.Table.from_pydict({})
    assert table.num_columns == 0
    assert table.num_rows == 0
    assert table.schema == pa.schema([])
    assert table.to_pydict() == {}

    # With arrays as values
    data = OrderedDict([('strs', pa.array([u'', u'foo', u'bar'])),
                        ('floats', pa.array([4.5, 5, None]))])
    schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64())])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # With chunked arrays as values
    data = OrderedDict([('strs', pa.chunked_array([[u''], [u'foo', u'bar']])),
                        ('floats', pa.chunked_array([[4.5], [5, None]]))])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema

    # With lists as values
    data = OrderedDict([('strs', [u'', u'foo', u'bar']),
                        ('floats', [4.5, 5, None])])
    table = pa.Table.from_pydict(data)
    assert table.num_columns == 2
    assert table.num_rows == 3
    assert table.schema == schema
    assert table.to_pydict() == data

    # With metadata and inferred schema
    metadata = {b'foo': b'bar'}
    # From here on `schema` carries the metadata as well.
    schema = schema.add_metadata(metadata)
    table = pa.Table.from_pydict(data, metadata=metadata)
    assert table.schema == schema
    assert table.schema.metadata == metadata
    assert table.to_pydict() == data

    # With explicit schema
    table = pa.Table.from_pydict(data, schema=schema)
    assert table.schema == schema
    assert table.schema.metadata == metadata
    assert table.to_pydict() == data

    # Cannot pass both schema and metadata
    with pytest.raises(ValueError):
        pa.Table.from_pydict(data, schema=schema, metadata=metadata)

Ejemplo n.º 7

0

Mostrar archivo

Archivo: test_schema.py Proyecto: NonVolatileComputing/arrow

def test_type_schema_pickling():
    """Data types, a field and a whole schema survive a pickle round trip."""
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    # Each case on its own.
    for original in cases:
        restored = pickle.loads(pickle.dumps(original))
        assert original == restored

    # Bare types get a generated '_f<index>' name; the one real Field in
    # `cases` is used unchanged.
    schema_fields = [
        item if isinstance(item, pa.Field)
        else pa.field('_f{}'.format(position), item)
        for position, item in enumerate(cases)
    ]

    schema = pa.schema(schema_fields, metadata={b'foo': b'bar'})
    restored_schema = pickle.loads(pickle.dumps(schema))
    assert schema == restored_schema

Ejemplo n.º 8

0

Mostrar archivo

Archivo: test_convert_pandas.py Proyecto: NonVolatileComputing/arrow

    def test_float_nulls(self):
        """Masked float32/float64 arrays convert back to pandas as NaN."""
        count = 100

        # Random mask: a position is null when its draw from [0, 10) is < 3.
        is_null = np.random.randint(0, 10, size=count) < 3
        specs = [('f4', pa.float32()), ('f8', pa.float64())]
        names = ['f4', 'f8']

        arrow_arrays = []
        arrow_fields = []
        expected_cols = []
        for np_code, arrow_type in specs:
            raw = np.random.randn(count).astype(np_code)

            arrow_arrays.append(
                pa.array(raw, from_pandas=True, mask=is_null))
            arrow_fields.append(pa.field(np_code, arrow_type))

            # Only after the conversion: the expected frame has NaN at
            # every masked position.
            raw[is_null] = np.nan
            expected_cols.append(raw)

        expected = pd.DataFrame(dict(zip(names, expected_cols)),
                                columns=names)

        table = pa.Table.from_arrays(arrow_arrays, names)
        assert table.schema.equals(pa.schema(arrow_fields))
        tm.assert_frame_equal(table.to_pandas(), expected)

Ejemplo n.º 9

0

Mostrar archivo

Archivo: test_table.py Proyecto: emkornfield/arrow

def test_table_safe_casting():
    """Cast a table to a new schema with the default (safe) settings."""
    column_names = tuple('abcd')
    signed = [-10, -5, 0, 5, 10]
    strings = ['ab', 'bc', 'cd', 'de', 'ef']

    source = pa.Table.from_arrays([
        pa.array(range(5), type=pa.int64()),
        pa.array(signed, type=pa.int32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(strings, type=pa.string())
    ], names=column_names)

    expected = pa.Table.from_arrays([
        pa.array(range(5), type=pa.int32()),
        pa.array(signed, type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(strings, type=pa.string())
    ], names=column_names)

    target = pa.schema([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int16()),
        pa.field('c', pa.int64()),
        pa.field('d', pa.string())
    ])

    assert source.cast(target).equals(expected)

Ejemplo n.º 10

0

Mostrar archivo

Archivo: test_orc.py Proyecto: dremio/arrow

def test_orcfile_empty():
    """The empty ORC example file reads as zero rows with the full schema."""
    from pyarrow import orc

    # The same two-field struct appears inside several nested columns.
    inner = pa.struct([
        ('int1', pa.int32()),
        ('string1', pa.string()),
    ])
    inner_list = pa.list_(inner)

    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', inner_list),
        ])),
        ('list', inner_list),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', inner),
        ]))),
    ])

    orc_file = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = orc_file.read()
    assert table.num_rows == 0
    assert table.schema == expected_schema

Ejemplo n.º 11

0

Mostrar archivo

Archivo: test_cuda.py Proyecto: emkornfield/arrow

def make_recordbatch(length):
    """Return a record batch with two random int16 columns, 'f0' and 'f1'.

    Each column holds ``length`` values drawn with
    ``np.random.randint(0, 255, ...)``.
    """
    column_names = ('f0', 'f1')
    schema = pa.schema([pa.field(name, pa.int16()) for name in column_names])
    # One independent draw per column, in column order.
    columns = [
        pa.array(np.random.randint(0, 255, size=length, dtype=np.int16))
        for _ in column_names
    ]
    return pa.RecordBatch.from_arrays(columns, schema)

Ejemplo n.º 12

0

Mostrar archivo

Archivo: test_table.py Proyecto: dremio/arrow

def test_recordbatch_basics():
    """Check RecordBatch sizes, contents, indexing and explicit schema."""
    columns = [
        pa.array(range(5)),
        pa.array([-10, -5, 0, 5, 10])
    ]

    # Built from column names only: no schema metadata expected.
    named = pa.RecordBatch.from_arrays(columns, ['c0', 'c1'])
    assert not named.schema.metadata

    assert len(named) == 5
    assert named.num_rows == 5
    assert named.num_columns == len(columns)
    expected_dict = OrderedDict([
        ('c0', [0, 1, 2, 3, 4]),
        ('c1', [-10, -5, 0, 5, 10])
    ])
    assert named.to_pydict() == expected_dict

    # bounds checking
    with pytest.raises(IndexError):
        named[2]

    # Schema passed explicitly
    explicit = pa.schema(
        [pa.field('c0', pa.int16()), pa.field('c1', pa.int32())],
        metadata={b'foo': b'bar'})
    typed = pa.RecordBatch.from_arrays(columns, explicit)
    assert typed.schema == explicit

Ejemplo n.º 13

0

Mostrar archivo

Archivo: test_table.py Proyecto: emkornfield/arrow

def test_table_unsafe_casting():
    """Casting fractional floats to int fails unless ``safe=False``."""
    column_names = tuple('abcd')
    signed = [-10, -5, 0, 5, 10]
    strings = ['ab', 'bc', 'cd', 'de', 'ef']

    source = pa.Table.from_arrays([
        pa.array(range(5), type=pa.int64()),
        pa.array(signed, type=pa.int32()),
        pa.array([1.1, 2.2, 3.3, 4.4, 5.5], type=pa.float64()),
        pa.array(strings, type=pa.string())
    ], names=column_names)

    expected = pa.Table.from_arrays([
        pa.array(range(5), type=pa.int32()),
        pa.array(signed, type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(strings, type=pa.string())
    ], names=column_names)

    target = pa.schema([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int16()),
        pa.field('c', pa.int64()),
        pa.field('d', pa.string())
    ])

    # The default (safe) cast must reject the fractional values in 'c'.
    with pytest.raises(pa.ArrowInvalid,
                       match='Floating point value truncated'):
        source.cast(target)

    # With safe=False the cast goes through and matches `expected`.
    assert source.cast(target, safe=False).equals(expected)

Ejemplo n.º 14

0

Mostrar archivo

Archivo: test_convert_pandas.py Proyecto: NonVolatileComputing/arrow

 def test_int_object_nulls(self):
     """Round-trip an object column of ints and None as int64."""
     raw = np.array([None, 1, np.int64(3)] * 5, dtype=object)
     frame = pd.DataFrame({'ints': raw})
     expected_frame = pd.DataFrame({'ints': pd.to_numeric(raw)})
     expected_schema = pa.schema([pa.field('ints', pa.int64())])
     self._check_pandas_roundtrip(
         frame,
         expected=expected_frame,
         expected_schema=expected_schema,
     )

Ejemplo n.º 15

0

Mostrar archivo

Archivo: test_convert_pandas.py Proyecto: NonVolatileComputing/arrow

    def test_unicode(self):
        """Round-trip a unicode string column containing None and NaN."""
        pattern = [u'foo', None, u'bar', u'mañana', np.nan]
        frame = pd.DataFrame({'strings': pattern * 1000})
        expected_schema = pa.schema([pa.field('strings', pa.string())])

        self._check_pandas_roundtrip(frame, expected_schema=expected_schema)

Ejemplo n.º 16

0

Mostrar archivo

Archivo: test_schema.py Proyecto: rok/arrow

def test_empty_table():
    """``Schema.empty_table`` returns a zero-row Table with that schema."""
    one_field = pa.schema([pa.field('oneField', pa.int64())])
    empty = one_field.empty_table()
    assert isinstance(empty, pa.Table)
    assert empty.num_rows == 0
    assert empty.schema == one_field

Ejemplo n.º 17

0

Mostrar archivo

Archivo: test_convert_pandas.py Proyecto: NonVolatileComputing/arrow

    def test_boolean_no_nulls(self):
        """Round-trip a boolean column with no nulls as Arrow bool."""
        count = 100

        # Fixed seed, so the generated column is the same on every run.
        np.random.seed(0)
        flags = np.random.randn(count) > 0

        frame = pd.DataFrame({'bools': flags})
        expected_schema = pa.schema([pa.field('bools', pa.bool_())])
        self._check_pandas_roundtrip(frame, expected_schema=expected_schema)

Ejemplo n.º 18

0

Mostrar archivo

Archivo: test_schema.py Proyecto: rok/arrow

def test_schema_from_tuples():
    """``pa.schema`` accepts ``(name, type)`` tuples in place of fields."""
    pairs = [
        ('foo', pa.int32()),
        ('bar', pa.string()),
        ('baz', pa.list_(pa.int8())),
    ]
    schema = pa.schema(pairs)

    assert schema.names == ['foo', 'bar', 'baz']
    assert schema.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(schema) == 3

    expected_repr = (
        "foo: int32\n"
        "bar: string\n"
        "baz: list<item: int8>\n"
        "  child 0, item: int8"
    )
    assert repr(schema) == expected_repr

    # A tuple whose type is None is rejected.
    with pytest.raises(TypeError):
        pa.schema([('foo', None)])

Ejemplo n.º 19

0

Mostrar archivo

Archivo: test_convert_pandas.py Proyecto: NonVolatileComputing/arrow

 def test_fixed_size_bytes(self):
     """3-byte values (with nulls) round-trip through ``pa.binary(3)``."""
     frame = pd.DataFrame(
         {'strings': [b'foo', None, b'bar', None, None, b'hey']})
     fixed_schema = pa.schema([pa.field('strings', pa.binary(3))])
     table = pa.Table.from_pandas(frame, schema=fixed_schema)

     wanted = fixed_schema[0]
     assert table.schema[0].type == wanted.type
     assert table.schema[0].name == wanted.name
     tm.assert_frame_equal(table.to_pandas(), frame)

Ejemplo n.º 20

0

Mostrar archivo

Archivo: test_table.py Proyecto: emkornfield/arrow

def test_table_from_batches_and_schema():
    """``Table.from_batches`` accepts a matching schema and rejects others."""
    two_columns = pa.schema([
        pa.field('a', pa.int64()),
        pa.field('b', pa.float64()),
    ])
    ints = pa.array([1])
    floats = pa.array([3.14])
    good_batch = pa.RecordBatch.from_arrays([ints, floats],
                                            names=['a', 'b'])

    table = pa.Table.from_batches([good_batch], two_columns)
    assert table.schema.equals(two_columns)
    assert table.column(0) == pa.column('a', pa.array([1]))
    assert table.column(1) == pa.column('b', pa.array([3.14]))

    # A schema with fewer fields than the batch is rejected.
    one_column = pa.schema([pa.field('a', pa.int64())])
    with pytest.raises(pa.ArrowInvalid):
        pa.Table.from_batches([good_batch], one_column)

    # A batch with fewer columns than the schema is rejected.
    short_batch = pa.RecordBatch.from_arrays([pa.array([1])], ['a'])
    with pytest.raises(pa.ArrowInvalid):
        pa.Table.from_batches([short_batch], two_columns)

Ejemplo n.º 21

0

Mostrar archivo

Archivo: test_convert_pandas.py Proyecto: NonVolatileComputing/arrow

    def test_table_str_to_categorical(self):
        """``strings_to_categorical=True`` gives a categorical column."""
        raw = [None, 'a', 'b', np.nan]
        frame = pd.DataFrame({'strings': raw})
        string_schema = pa.schema([pa.field('strings', pa.string())])
        table = pa.Table.from_pandas(frame, schema=string_schema)

        converted = table.to_pandas(strings_to_categorical=True)
        wanted = pd.DataFrame({'strings': pd.Categorical(raw)})
        tm.assert_frame_equal(converted, wanted, check_dtype=True)

Ejemplo n.º 22

0

Mostrar archivo

Archivo: pandas_examples.py Proyecto: NonVolatileComputing/arrow

def dataframe_with_arrays(include_index=False):
    """
    Dataframe with numpy arrays columns of every possible primitive type.

    Each column holds numpy arrays (or None) as its cell values; the schema
    describes every column as an Arrow list of the matching element type.

    Parameters
    ----------
    include_index: bool, default False
        If True, an int64 field named ``__index_level_0__`` is appended to
        the returned schema. The dataframe itself is built the same way
        either way.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    # (numpy dtype code, Arrow element type) pairs; the dtype code is also
    # used as the column name.
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        # Four rows per column: arrays of length 10, 5 and 1, plus a None.
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    # String column: object arrays of unicode strings, with one None row.
    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    # Timestamp column: arrays are created with dtype datetime64[ms] and
    # contain None entries; two of the four rows are None.
    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))
    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema

Ejemplo n.º 23

0

Mostrar archivo

Archivo: test_convert_pandas.py Proyecto: NonVolatileComputing/arrow

 def test_decimal_128_from_pandas(self):
     """26-digit decimals with 11 fractional digits infer decimal(26, 11)."""
     values = [
         decimal.Decimal('394092382910493.12341234678'),
         -decimal.Decimal('314292388910493.12343437128'),
     ]
     frame = pd.DataFrame({'decimals': values})
     table = pa.Table.from_pandas(frame, preserve_index=False)
     wanted = pa.schema([pa.field('decimals', pa.decimal(26, 11))])
     assert table.schema.equals(wanted)

Ejemplo n.º 24

0

Mostrar archivo

Archivo: test_convert_pandas.py Proyecto: NonVolatileComputing/arrow

 def test_decimal_64_from_pandas(self):
     """12-digit decimals with 6 fractional digits infer decimal(12, 6)."""
     values = [
         decimal.Decimal('-129934.123331'),
         decimal.Decimal('129534.123731'),
     ]
     frame = pd.DataFrame({'decimals': values})
     table = pa.Table.from_pandas(frame, preserve_index=False)
     wanted = pa.schema([pa.field('decimals', pa.decimal(12, 6))])
     assert table.schema.equals(wanted)

Ejemplo n.º 25

0

Mostrar archivo

Archivo: test_convert_pandas.py Proyecto: giantwhale/arrow

 def test_decimal_32_from_pandas(self):
     """7-digit decimals with 3 fractional digits infer decimal128(7, 3)."""
     values = [
         decimal.Decimal('-1234.123'),
         decimal.Decimal('1234.439'),
     ]
     frame = pd.DataFrame({'decimals': values})
     table = pa.Table.from_pandas(frame, preserve_index=False)
     wanted = pa.schema([pa.field('decimals', pa.decimal128(7, 3))])
     assert table.schema.equals(wanted)

Ejemplo n.º 26

0

Mostrar archivo

Archivo: test_csv.py Proyecto: wesm/arrow

def test_convert_options():
    """Check defaults, setters and constructor keywords of ConvertOptions.

    Each attribute is first read at its default, then assigned and read
    back. A second instance is then built with every option passed as a
    constructor keyword.
    """
    cls = ConvertOptions
    opts = cls()

    # Boolean flags: the default is asserted first, then the flipped value.
    assert opts.check_utf8 is True
    opts.check_utf8 = False
    assert opts.check_utf8 is False

    assert opts.strings_can_be_null is False
    opts.strings_can_be_null = True
    assert opts.strings_can_be_null is True

    # column_types always reads back as a {name: DataType} dict, whatever
    # form was assigned.
    assert opts.column_types == {}
    # Pass column_types as mapping
    opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert opts.column_types == {'b': pa.int16(), 'c': pa.float32()}
    # Type names given as strings read back as DataType objects.
    opts.column_types = {'v': 'int16', 'w': 'null'}
    assert opts.column_types == {'v': pa.int16(), 'w': pa.null()}
    # Pass column_types as schema
    schema = pa.schema([('a', pa.int32()), ('b', pa.string())])
    opts.column_types = schema
    assert opts.column_types == {'a': pa.int32(), 'b': pa.string()}
    # Pass column_types as sequence
    opts.column_types = [('x', pa.binary())]
    assert opts.column_types == {'x': pa.binary()}

    # Invalid assignments raise TypeError.
    with pytest.raises(TypeError, match='DataType expected'):
        opts.column_types = {'a': None}
    with pytest.raises(TypeError):
        opts.column_types = 0

    # The default null spellings include '' and 'N/A'.
    assert isinstance(opts.null_values, list)
    assert '' in opts.null_values
    assert 'N/A' in opts.null_values
    opts.null_values = ['xxx', 'yyy']
    assert opts.null_values == ['xxx', 'yyy']

    assert isinstance(opts.true_values, list)
    opts.true_values = ['xxx', 'yyy']
    assert opts.true_values == ['xxx', 'yyy']

    assert isinstance(opts.false_values, list)
    opts.false_values = ['xxx', 'yyy']
    assert opts.false_values == ['xxx', 'yyy']

    # Every option can also be given as a constructor keyword.
    opts = cls(check_utf8=False, column_types={'a': pa.null()},
               null_values=['N', 'nn'], true_values=['T', 'tt'],
               false_values=['F', 'ff'], strings_can_be_null=True)
    assert opts.check_utf8 is False
    assert opts.column_types == {'a': pa.null()}
    assert opts.null_values == ['N', 'nn']
    assert opts.false_values == ['F', 'ff']
    assert opts.true_values == ['T', 'tt']
    assert opts.strings_can_be_null is True

Ejemplo n.º 27

0

Mostrar archivo

Archivo: test_csv.py Proyecto: wesm/arrow

 def test_simple_timestamps(self):
     """A column of ISO dates is inferred as ``timestamp('s')``."""
     csv_bytes = b"a,b\n1970,1970-01-01\n1989,1989-07-14\n"
     table = self.read_bytes(csv_bytes)

     expected_schema = pa.schema([
         ('a', pa.int64()),
         ('b', pa.timestamp('s')),
     ])
     expected_values = {
         'a': [1970, 1989],
         'b': [datetime(1970, 1, 1), datetime(1989, 7, 14)],
     }
     assert table.schema == expected_schema
     assert table.to_pydict() == expected_values

Ejemplo n.º 28

0

Mostrar archivo

Archivo: test_table.py Proyecto: emkornfield/arrow

def test_table_cast_to_incompatible_schema():
    """``Table.cast`` raises ValueError when field names do not match."""
    table = pa.Table.from_arrays(
        [
            pa.array(range(5)),
            pa.array([-10, -5, 0, 5, 10]),
        ],
        names=tuple('ab'))

    # First field is named 'A', not 'a'.
    wrong_name = pa.schema([
        pa.field('A', pa.int32()),
        pa.field('b', pa.int16()),
    ])
    # Field 'b' is missing.
    missing_field = pa.schema([
        pa.field('a', pa.int32()),
    ])

    pattern = ("Target schema's field names are not matching the table's "
               "field names:.*")
    for bad_schema in (wrong_name, missing_field):
        with pytest.raises(ValueError, match=pattern):
            table.cast(bad_schema)

Ejemplo n.º 29

0

Mostrar archivo

Archivo: test_convert_pandas.py Proyecto: NonVolatileComputing/arrow

    def test_list_metadata(self):
        """Pandas metadata records a list column as ``list[int64]``."""
        frame = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]})
        list_field = pa.field('data', type=pa.list_(pa.int64()))
        table = pa.Table.from_pandas(frame, schema=pa.schema([list_field]))

        schema_metadata = table.schema.metadata
        assert b'mixed' not in schema_metadata[b'pandas']

        # The b'pandas' entry is JSON; look at its first column description.
        decoded = json.loads(schema_metadata[b'pandas'].decode('utf8'))
        first_column = decoded['columns'][0]
        assert first_column['pandas_type'] == 'list[int64]'
        assert first_column['numpy_type'] == 'object'

Ejemplo n.º 30

0

Mostrar archivo

Archivo: test_convert_pandas.py Proyecto: NonVolatileComputing/arrow

    def test_partial_schema(self):
        """A schema covering only some columns still round-trips all three."""
        signed = [-10, -5, 0, 5, 10]
        frame = pd.DataFrame(OrderedDict([
            ('a', [0, 1, 2, 3, 4]),
            ('b', np.array(signed, dtype=np.int32)),
            ('c', signed)
        ]))

        field_a = pa.field('a', pa.int64())
        field_b = pa.field('b', pa.int32())
        field_c = pa.field('c', pa.int64())

        # Only 'a' and 'b' are given; 'c' is expected to come out as int64.
        given = pa.schema([field_a, field_b])
        wanted = pa.schema([field_a, field_b, field_c])

        self._check_pandas_roundtrip(frame, schema=given,
                                     expected_schema=wanted)

Ejemplo n.º 31

0

Mostrar archivo

Archivo: test_convert_pandas.py Proyecto: crystalzyan/arrow

 def test_fixed_size_bytes_does_not_accept_varying_lengths(self):
     """A 2-byte value in a ``pa.binary(3)`` column raises ArrowInvalid."""
     # b'ba' is the only value that is not exactly three bytes long.
     frame = pd.DataFrame(
         {'strings': [b'foo', None, b'ba', None, None, b'hey']})
     fixed_schema = pa.schema([pa.field('strings', pa.binary(3))])
     with self.assertRaises(pa.ArrowInvalid):
         pa.Table.from_pandas(frame, schema=fixed_schema)

Ejemplo n.º 32

0

Mostrar archivo

def test_column_selection(tempdir):
    """Write a nested table to ORC and read back column subsets.

    Columns are selected by top-level name, by dotted path into nested
    fields, and by integer index. Unknown names and out-of-range indices
    must raise.

    Parameters
    ----------
    tempdir
        Directory in which ``test.orc`` is written; joined with ``/``, so
        presumably a pathlib-style path supplied by a fixture (confirm
        against the test configuration).
    """
    from pyarrow import orc

    # create a table with nested types
    inner = pa.field('inner', pa.int64())
    middle = pa.field('middle', pa.struct([inner]))
    fields = [
        pa.field('basic', pa.int32()),
        pa.field(
            'list', pa.list_(pa.field('item', pa.int32()))
        ),
        pa.field(
            'struct', pa.struct([middle, pa.field('inner2', pa.int64())])
        ),
        pa.field(
            'list-struct', pa.list_(pa.field(
                'item', pa.struct([
                    pa.field('inner1', pa.int64()),
                    pa.field('inner2', pa.int64())
                ])
            ))
        ),
        pa.field('basic2', pa.int64()),
    ]
    # One single-row column per entry in `fields`, in the same order.
    arrs = [
        [0], [[1, 2]], [{"middle": {"inner": 3}, "inner2": 4}],
        [[{"inner1": 5, "inner2": 6}, {"inner1": 7, "inner2": 8}]], [9]]
    table = pa.table(arrs, schema=pa.schema(fields))

    path = str(tempdir / 'test.orc')
    orc.write_table(table, path)
    orc_file = orc.ORCFile(path)

    # default selecting all columns
    result1 = orc_file.read()
    assert result1.equals(table)

    # selecting with columns names
    result2 = orc_file.read(columns=["basic", "basic2"])
    assert result2.equals(table.select(["basic", "basic2"]))

    result3 = orc_file.read(columns=["list", "struct", "basic2"])
    assert result3.equals(table.select(["list", "struct", "basic2"]))

    # using dotted paths
    # The expected tables keep only the selected nested field(s).
    result4 = orc_file.read(columns=["struct.middle.inner"])
    expected4 = pa.table({"struct": [{"middle": {"inner": 3}}]})
    assert result4.equals(expected4)

    result5 = orc_file.read(columns=["struct.inner2"])
    expected5 = pa.table({"struct": [{"inner2": 4}]})
    assert result5.equals(expected5)

    # Selecting every child of 'struct' by path equals the whole column.
    result6 = orc_file.read(
        columns=["list", "struct.middle.inner", "struct.inner2"]
    )
    assert result6.equals(table.select(["list", "struct"]))

    result7 = orc_file.read(columns=["list-struct.inner1"])
    expected7 = pa.table({"list-struct": [[{"inner1": 5}, {"inner1": 7}]]})
    assert result7.equals(expected7)

    # selecting with (Arrow-based) field indices
    # (result2 / result3 are reused here for the index-based reads)
    result2 = orc_file.read(columns=[0, 4])
    assert result2.equals(table.select(["basic", "basic2"]))

    result3 = orc_file.read(columns=[1, 2, 3])
    assert result3.equals(table.select(["list", "struct", "list-struct"]))

    # error on non-existing name or index
    with pytest.raises(IOError):
        # liborc returns ParseError, which gets translated into IOError
        # instead of ValueError
        orc_file.read(columns=["wrong"])

    # The table has five columns (indices 0-4), so 5 is out of range.
    with pytest.raises(ValueError):
        orc_file.read(columns=[5])

Ejemplo n.º 33

0

Mostrar archivo

def test_schema_to_string_with_metadata():
    lorem = """\
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla accumsan vel
turpis et mollis. Aliquam tincidunt arcu id tortor blandit blandit. Donec
eget leo quis lectus scelerisque varius. Class aptent taciti sociosqu ad
litora torquent per conubia nostra, per inceptos himenaeos. Praesent
faucibus, diam eu volutpat iaculis, tellus est porta ligula, a efficitur
turpis nulla facilisis quam. Aliquam vitae lorem erat. Proin a dolor ac libero
dignissim mollis vitae eu mauris. Quisque posuere tellus vitae massa
pellentesque sagittis. Aenean feugiat, diam ac dignissim fermentum, lorem
sapien commodo massa, vel volutpat orci nisi eu justo. Nulla non blandit
sapien. Quisque pretium vestibulum urna eu vehicula."""
    # ARROW-7063
    my_schema = pa.schema([pa.field("foo", "int32", False,
                                    metadata={"key1": "value1"}),
                           pa.field("bar", "string", True,
                                    metadata={"key3": "value3"})],
                          metadata={"lorem": lorem})

    assert my_schema.to_string() == """\
foo: int32 not null
  -- field metadata --
  key1: 'value1'
bar: string
  -- field metadata --
  key3: 'value3'
-- schema metadata --
lorem: '""" + lorem[:65] + "' + " + str(len(lorem) - 65)

    # Metadata that exactly fits
    result = pa.schema([('f0', 'int32')],
                       metadata={'key': 'value' + 'x' * 62}).to_string()
    assert result == """\
f0: int32
-- schema metadata --
key: 'valuexxxxxxxxxxxxxxxxxxxxxxxxxxxxx\
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'"""

    assert my_schema.to_string(truncate_metadata=False) == """\
foo: int32 not null
  -- field metadata --
  key1: 'value1'
bar: string
  -- field metadata --
  key3: 'value3'
-- schema metadata --
lorem: '{}'""".format(lorem)

    assert my_schema.to_string(truncate_metadata=False,
                               show_field_metadata=False) == """\
foo: int32 not null
bar: string
-- schema metadata --
lorem: '{}'""".format(lorem)

    assert my_schema.to_string(truncate_metadata=False,
                               show_schema_metadata=False) == """\
foo: int32 not null
  -- field metadata --
  key1: 'value1'
bar: string
  -- field metadata --
  key3: 'value3'"""

    assert my_schema.to_string(truncate_metadata=False,
                               show_field_metadata=False,
                               show_schema_metadata=False) == """\

Ejemplo n.º 34

0

Mostrar archivo

Archivo: test_flight.py Proyecto: vikram/arrow

 def list_flights(self, context, criteria):
     """Yield one FlightInfo for '/foo', then raise FlightInternalError.

     ``context`` and ``criteria`` are not used.
     """
     descriptor = flight.FlightDescriptor.for_path('/foo')
     info = flight.FlightInfo(pa.schema([]), descriptor, [], -1, -1)
     yield info
     # The error is raised only after the first item has been yielded.
     raise flight.FlightInternalError("foo")

Ejemplo n.º 35

0

Mostrar archivo

Archivo: test_flight.py Proyecto: vikram/arrow

 def list_flights(self, context, criteria):
     """Yield one FlightInfo for '/foo' when criteria equals self.CRITERIA.

     For any other ``criteria`` nothing is yielded. ``context`` is not used.
     """
     if criteria == self.CRITERIA:
         descriptor = flight.FlightDescriptor.for_path('/foo')
         empty_schema = pa.schema([])
         yield flight.FlightInfo(empty_schema, descriptor, [], -1, -1)

Ejemplo n.º 36

0

Mostrar archivo

# Arrow type of each flow column, keyed by column name.
PQ_TYPES = {
    'flow_id': pa.int64(),
    'net_src': pa.string(),
    'ip_src': pa.string(),
    'ip_dst': pa.string(),
    'port_src': pa.int64(),
    'port_dst': pa.int64(),
    'tcp_flags_0': pa.int64(),
    'start_time': pa.int64(),
    'end_time': pa.int64(),
    'duration': pa.int64(),
    'proto': pa.int64(),
    'total_byte_cnt_0': pa.int64(),
    'total_pkt_cnt_0': pa.int64(),
}
# One field per PQ_TYPES entry, in the dict's iteration order.
PQ_SCHEMA = pa.schema([
    pa.field(column, arrow_type) for column, arrow_type in PQ_TYPES.items()
])


def timestamps_to_index_dates(start_ts, end_ts, margin=600):
    """Return the per-day index names covering a timestamp interval.

    The interval is widened by ``margin`` seconds on both sides, both bounds
    are truncated to local midnight, and one name is produced for every
    ``DAY_SECS`` step from the first day start through the last day start
    (inclusive).

    Args:
        start_ts: Start of the interval as a POSIX timestamp (seconds).
        end_ts: End of the interval as a POSIX timestamp (seconds).
        margin: Seconds subtracted from ``start_ts`` and added to ``end_ts``
            before truncating to day boundaries.

    Returns:
        A list of strings formatted with ``'%y%m%d00'``, in ascending order.
    """
    def ts_to_day_start(ts):
        # Local-time midnight of the day containing `ts`, as an int timestamp.
        return int(time.mktime(datetime.fromtimestamp(ts).date().timetuple()))

    start_day_ts = ts_to_day_start(start_ts - margin)
    end_day_ts = ts_to_day_start(end_ts + margin)
    # `range` rather than the Python-2-only `xrange`: it exists on both
    # major versions, and only a handful of day starts are ever produced.
    # The stop is one step past the last day so that day is included.
    day_starts = range(start_day_ts, end_day_ts + DAY_SECS, DAY_SECS)
    return [
        datetime.fromtimestamp(day_ts).strftime('%y%m%d00')
        for day_ts in day_starts
    ]


def get_flow_batch(start_ts, end_ts):
    es_client = Elasticsearch(ES_CONF, **ES_OPTS)
    indices = ','.join([

Ejemplo n.º 37

0

Mostrar archivo

        + bq_attr_fields
        + [
            f"{ds.GetRasterBand(iband + 1).GetDescription()}:{outtype}"
            for iband in range(ds.RasterCount)
        ]
    )
)

# Parquet schema: the two geography columns, then the attribute fields,
# then one column per raster band named after the band's description.
fields = (
    [("geography", pa.string()), ("geography_polygon", pa.string())]
    + pq_attr_fields
    + [
        (ds.GetRasterBand(iband + 1).GetDescription(), pq_type)
        for iband in range(ds.RasterCount)
    ]
)
pq_schema = pa.schema(fields)
# Pandas dataframe column names, in schema order.
columns = [name for name, _ in fields]

# Nothing more to do unless an output file was given as the second argument.
if len(sys.argv) < 3:
    exit(0)
outfile = sys.argv[2]

# Prepare the transform from pixel to raster coordinates.
gt = ds.GetGeoTransform()

# Halves of gt[1] and of -gt[5] (presumably half the pixel width and height,
# assuming the usual GDAL geotransform layout -- confirm).
hw = gt[1] * 0.5
hh = -gt[5] * 0.5

with pq.ParquetWriter(outfile, pq_schema) as writer:

Ejemplo n.º 38

0

Mostrar archivo

    def initialize_write(
        df,
        fs,
        path,
        append=False,
        partition_on=None,
        ignore_divisions=False,
        division_info=None,
        schema=None,
        index_cols=None,
        **kwargs,
    ):
        """Resolve the Arrow schema and validate an append before writing.

        Parameters
        ----------
        df
            Partitioned dataframe to be written. The code reads
            ``_meta_nonempty``, ``columns``, ``dtypes``, ``npartitions`` and
            ``to_delayed()`` from it.
        fs
            Filesystem object; ``fs.mkdirs(path, exist_ok=True)`` is called
            and it is passed to ``pq.ParquetDataset``.
        path
            Root directory of the dataset.
        append : bool
            Try to append to an existing dataset at ``path``. Silently
            downgraded to a fresh write when opening the dataset raises
            ``IOError``, ``ValueError`` or ``IndexError``.
        partition_on : list of str, optional
            Columns excluded from the column-name comparison against the
            existing dataset. ``None`` is treated as "no such columns".
        ignore_divisions : bool
            Skip the check that new divisions start after the existing data.
            Forced to ``True`` when ``division_info`` is ``None`` or names a
            column that the existing dataset does not contain.
        division_info : dict, optional
            Mapping with keys ``"name"`` (division column) and
            ``"divisions"`` (sequence whose first element is compared with
            the largest statistic already written).
        schema
            ``"infer"`` to infer a schema from ``df``, a ``dict`` of
            overrides applied on top of the inferred schema, or any other
            value, which is returned unchanged.
        index_cols
            Columns set as index on ``df._meta_nonempty`` before inferring.
        **kwargs
            Accepted and ignored.

        Returns
        -------
        tuple
            ``(fmd, schema, i_offset)``: the existing dataset's metadata (or
            ``None``), the resolved schema, and the number of pieces already
            present in the dataset (``0`` when not appending).

        Raises
        ------
        NotImplementedError
            Appending with division checks to a dataset that has no
            ``_metadata``.
        ValueError
            Appended column names or dtypes differ from the existing
            dataset, or the new divisions overlap the existing ones.
        """
        # Infer schema if "infer"
        # (also start with inferred schema if user passes a dict)
        if schema == "infer" or isinstance(schema, dict):

            # Start with schema from _meta_nonempty
            _schema = pa.Schema.from_pandas(
                df._meta_nonempty.set_index(index_cols) if index_cols else df.
                _meta_nonempty)

            # Use dict to update our inferred schema
            if isinstance(schema, dict):
                schema = pa.schema(schema)
                for name in schema.names:
                    i = _schema.get_field_index(name)
                    j = schema.get_field_index(name)
                    _schema = _schema.set(i, schema.field(j))

            # If we have object columns, we need to sample partitions
            # until we find non-null data for each column in `sample`
            sample = [col for col in df.columns if df[col].dtype == "object"]
            if schema_field_supported and sample and schema == "infer":
                delayed_schema_from_pandas = delayed(pa.Schema.from_pandas)
                for part in range(df.npartitions):
                    # Keep data on worker
                    _s = delayed_schema_from_pandas(
                        df[sample].to_delayed()[part]).compute()
                    for name, typ in zip(_s.names, _s.types):
                        if typ != "null":
                            # Distinct names for the field positions so the
                            # partition counter above is not shadowed.
                            target_idx = _schema.get_field_index(name)
                            source_idx = _s.get_field_index(name)
                            _schema = _schema.set(target_idx,
                                                  _s.field(source_idx))
                            sample.remove(name)
                    if not sample:
                        break

            # Final (inferred) schema
            schema = _schema

        dataset = fmd = None
        i_offset = 0
        if append and division_info is None:
            ignore_divisions = True
        fs.mkdirs(path, exist_ok=True)

        if append:
            try:
                # Allow append if the dataset exists.
                # Also need dataset.metadata object if
                # ignore_divisions is False (to check divisions)
                dataset = pq.ParquetDataset(path, filesystem=fs)
                if not dataset.metadata and not ignore_divisions:
                    # TODO: Be more flexible about existing metadata.
                    raise NotImplementedError(
                        "_metadata file needed to `append` "
                        "with `engine='pyarrow'` "
                        "unless `ignore_divisions` is `True`")
                fmd = dataset.metadata
            except (IOError, ValueError, IndexError):
                # Original dataset does not exist - cannot append
                append = False
        if append:
            # NOTE(review): `dataset.metadata` can still be falsy here when
            # `ignore_divisions` is True; the attribute access below would
            # then fail -- confirm whether that combination is reachable.
            names = dataset.metadata.schema.names
            # Convert once; the Arrow schema is needed several times below.
            arrow_schema = dataset.schema.to_arrow_schema()
            has_pandas_metadata = (
                arrow_schema.metadata is not None
                and b"pandas" in arrow_schema.metadata)
            if has_pandas_metadata:
                pandas_metadata = json.loads(
                    arrow_schema.metadata[b"pandas"].decode("utf8"))
                categories = [
                    c["name"] for c in pandas_metadata["columns"]
                    if c["pandas_type"] == "categorical"
                ]
            else:
                categories = None
            dtypes = _get_pyarrow_dtypes(arrow_schema, categories)
            # `partition_on` defaults to None, which `set()` cannot consume.
            if set(names) != set(df.columns) - set(partition_on or ()):
                raise ValueError("Appended columns not the same.\n"
                                 "Previous: {} | New: {}".format(
                                     names, list(df.columns)))
            elif (pd.Series(dtypes).loc[names] != df[names].dtypes).any():
                # TODO Coerce values for compatible but different dtypes
                # `Series.items()` replaces `iteritems()`, which newer
                # pandas releases no longer provide.
                raise ValueError("Appended dtypes differ.\n{}".format(
                    set(dtypes.items()) ^ set(df.dtypes.items())))
            i_offset = len(dataset.pieces)

            # Without division info there is nothing to compare against;
            # guard the lookup instead of subscripting None.
            if division_info is None or division_info["name"] not in names:
                ignore_divisions = True
            if not ignore_divisions:
                old_end = None
                row_groups = [
                    dataset.metadata.row_group(i)
                    for i in range(dataset.metadata.num_row_groups)
                ]
                for row_group in row_groups:
                    for i, name in enumerate(names):
                        if name != division_info["name"]:
                            continue
                        column = row_group.column(i)
                        if column.statistics:
                            # `is None`, not truthiness: a maximum of 0 or ""
                            # is a valid running value and must not be reset.
                            if old_end is None:
                                old_end = column.statistics.max
                            else:
                                old_end = max(old_end, column.statistics.max)
                            break

                divisions = division_info["divisions"]
                if divisions[0] < old_end:
                    raise ValueError(
                        "Appended divisions overlapping with the previous ones"
                        " (set ignore_divisions=True to append anyway).\n"
                        "Previous: {} | New: {}".format(old_end, divisions[0]))

        return fmd, schema, i_offset

Ejemplo n.º 39

0

Mostrar archivo

def _determine_schemas_to_compare(schemas, ignore_pandas):
    """
    Iterate over `pyarrow.Schema` objects and prepare them for comparison by picking a reference
    schema and determining the null columns of every other schema.

    .. note::

        If pandas metadata is taken into account, the ``pandas_version`` stored in the metadata is
        overwritten with the currently installed version since we expect to stay backwards compatible.

    Parameters
    ----------
    schemas: Iterable[Union[Schema, SchemaWrapper]]
        Schemas to prepare. Items that are not already a ``SchemaWrapper`` are wrapped with
        ``origin=None``.
    ignore_pandas: bool
        If true, pandas metadata is neither required nor normalized and schemas are used as given.

    Returns
    -------
    reference: Optional[SchemaWrapper]
        The reference schema, or ``None`` if ``schemas`` is empty. The loop keeps the null columns
        of the reference a subset of the null columns of every schema placed in ``list_of_schemas``.
        If neither the current schema nor the reference satisfies this, the one with fewer null
        columns becomes the reference and its remaining null fields are replaced with the fields
        of the other schema via ``_swap_fields_by_name``.
    list_of_schemas: List[Tuple[SchemaWrapper, Set[str]]]
        A list holding pairs of (schema, null_columns) where the null_columns are all columns which
        are null and must be removed before comparing the schemas.

    Raises
    ------
    ValueError
        If pandas metadata is taken into account (``_pandas_in_schemas(schemas)`` is truthy and
        ``ignore_pandas`` is false) and one of the schemas has no ``b"pandas"`` metadata entry.
    """
    has_pandas = _pandas_in_schemas(schemas) and not ignore_pandas
    schemas_to_evaluate = []
    reference = None
    null_cols_in_reference = set()

    for schema in schemas:
        if not isinstance(schema, SchemaWrapper):
            schema = SchemaWrapper(schema, None)

        if has_pandas:
            metadata = schema.metadata
            if metadata is None or b"pandas" not in metadata:
                raise ValueError(
                    "Pandas and non-Pandas schemas are not comparable. "
                    "Use ignore_pandas=True if you only want to compare "
                    "on Arrow level.")
            pandas_metadata = load_json(metadata[b"pandas"].decode("utf8"))

            # we don't care about the pandas version, since we assume it's safe
            # to read datasets that were written by older or newer versions.
            pandas_metadata["pandas_version"] = "{}".format(pd.__version__)

            # Work on a copy so the metadata of the input schema is left untouched.
            metadata_clean = deepcopy(metadata)
            metadata_clean[b"pandas"] = _dict_to_binary(pandas_metadata)
            current = SchemaWrapper(pa.schema(schema, metadata_clean),
                                    schema.origin)
        else:
            current = schema

        # If a field is null we cannot compare it and must therefore reject it
        null_columns = {
            field.name
            for field in current if field.type == pa.null()
        }

        # Determine a valid reference schema. A valid reference schema is considered to be the schema
        # of all input schemas with the least empty columns.
        # The reference schema ought to be a schema whose empty columns are a subset of every other
        # set of empty columns. This ensures that the actual reference schema is the schema with the
        # most information possible. A schema which doesn't fulfil this requirement would weaken the
        # comparison and would allow for false positives

        # Trivial case: the first schema becomes the initial reference.
        if reference is None:
            reference = current
            null_cols_in_reference = null_columns
        # The reference has enough information to validate against current schema.
        # Append it to the list of schemas to be verified
        elif null_cols_in_reference.issubset(null_columns):
            schemas_to_evaluate.append((current, null_columns))
        # current schema includes all information of reference and more.
        # Add reference to schemas_to_evaluate and update reference
        elif null_columns.issubset(null_cols_in_reference):
            schemas_to_evaluate.append((reference, null_cols_in_reference))
            reference = current
            null_cols_in_reference = null_columns
        # If there is no clear subset available elect the schema with the least null columns as `reference`.
        # Iterate over the null columns of `reference` and replace it with a non-null field of the `current`
        # schema which recovers the loop invariant (null columns of `reference` is subset of `current`)
        else:
            if len(null_columns) < len(null_cols_in_reference):
                reference, current = current, reference
                null_cols_in_reference, null_columns = (
                    null_columns,
                    null_cols_in_reference,
                )

            # The set difference is a new set, so removing from
            # `null_cols_in_reference` inside the loop is safe.
            for col in null_cols_in_reference - null_columns:
                # Enrich the information in the reference by grabbing the missing fields
                # from the current iteration. This assumes that we only check for global validity and
                # isn't relevant where the reference comes from.
                reference = _swap_fields_by_name(reference, current, col)
                null_cols_in_reference.remove(col)
            schemas_to_evaluate.append((current, null_columns))

    # NOTE(review): `reference` is set by the first loop iteration, before anything is appended to
    # `schemas_to_evaluate`, so this fallback looks unreachable -- confirm before relying on it.
    if reference is None and schemas_to_evaluate:
        reference = schemas_to_evaluate.pop()[0]

    return reference, schemas_to_evaluate

Ejemplo n.º 40

0

Mostrar archivo

    def map(self,
            function,
            with_indices: bool = False,
            batched: bool = False,
            batch_size: Optional[int] = 1000,
            remove_columns: Optional[List[str]] = None,
            keep_in_memory: bool = False,
            load_from_cache_file: bool = True,
            cache_file_name: Optional[str] = None,
            writer_batch_size: Optional[int] = 1000,
            arrow_schema: Optional[pa.Schema] = None,
            disable_nullable: bool = True):
        """ Apply a function to all the elements in the table (individually or in batches)
            and update the table (if the function returns updated examples).

            Before the main loop, `function` is called on the first example (or on a batch of up to
            two examples when `batched=True`) to find out whether it returns a mapping and, if
            needed, to infer the output schema.

            Args:
                `function` (`callable`): with one of the following signature:
                    - `function(example: Dict) -> Union[Dict, Any]` if `batched=False` and `with_indices=False`
                    - `function(example: Dict, indices: int) -> Union[Dict, Any]` if `batched=False` and `with_indices=True`
                    - `function(batch: Dict[List]) -> Union[Dict, Any]` if `batched=True` and `with_indices=False`
                    - `function(batch: Dict[List], indices: List[int]) -> Union[Dict, Any]` if `batched=True` and `with_indices=True`
                `with_indices` (`bool`, default: `False`): Provide example indices to `function`
                `batched` (`bool`, default: `False`): Provide batch of examples to `function`
                `batch_size` (`Optional[int]`, default: `1000`): Number of examples per batch provided to `function` if `batched=True`
                    `batch_size <= 0` or `batch_size == None`: Provide the full dataset as a single batch to `function`
                `remove_columns` (`Optional[List[str]]`, default: `None`): Remove a selection of columns while doing the mapping.
                    Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding
                    columns with names in `remove_columns`, these columns will be kept.
                `keep_in_memory` (`bool`, default: `False`): Keep the dataset in memory instead of writing it to a cache file.
                `load_from_cache_file` (`bool`, default: `True`): If a cache file storing the current computation from `function`
                    can be identified, use it instead of recomputing.
                `cache_file_name` (`Optional[str]`, default: `None`): Provide the name of a cache file to use to store the
                    results of the computation instead of the automatically generated cache file name.
                `writer_batch_size` (`int`, default: `1000`): Number of rows per write operation for the cache file writer.
                    Higher value gives smaller cache files, lower value consume less temporary memory while running `.map()`.
                `arrow_schema` (`Optional[pa.Schema]`, default: `None`): Use a specific Apache Arrow Schema to store the cache file
                    instead of the automatically generated one.
                `disable_nullable` (`bool`, default: `True`): If `True`, every field of the automatically generated schema
                    is declared non-nullable. Has no effect when `arrow_schema` is given.

            Returns:
                A new `Dataset` (loaded from the cache file or from the in-memory buffer) if `function`
                returns a mapping; otherwise `self`. An empty dataset is returned as is.

            Raises:
                `ValueError`: if `remove_columns` names a column that is not in the dataset.
        """
        # If the array is empty we do nothing
        if len(self) == 0:
            return self

        # Select the columns (arrow columns) to process
        if remove_columns is not None:
            missing_columns = [col for col in remove_columns if col not in self._data.column_names]
            if missing_columns:
                raise ValueError("Column to remove {} not in the dataset. Current columns in the dataset: {}".format(
                                 missing_columns,
                                 self._data.column_names))

        # If we do batch computation but no batch size is provided, default to the full dataset
        if batched and (batch_size is None or batch_size <= 0):
            batch_size = self._data.num_rows

        # The probe batch is `self[:2]`, which holds a single row for a one-row dataset;
        # its indices must have the same length as the batch.
        num_test_rows = min(2, len(self))

        # Check if the function returns updated examples
        def does_function_return_dict(inputs, indices):
            """ Does the function return a dict. """
            processed_inputs = function(inputs, indices) if with_indices else function(inputs)
            return isinstance(processed_inputs, Mapping)

        # We only update the data table (and use the cache) if the function returns a dict.
        # Test it on the first element or a small batch (0, 1) for batched inputs
        test_inputs = self[:2] if batched else self[0]
        test_indices = list(range(num_test_rows)) if batched else 0
        update_data = does_function_return_dict(test_inputs, test_indices)

        def apply_function_on_filtered_inputs(inputs, indices):
            """ Utility to apply the function on a selection of columns. """
            processed_inputs = function(inputs, indices) if with_indices else function(inputs)
            if not update_data:
                return None  # Nothing to update, let's move on
            if remove_columns is not None:
                for column in remove_columns:
                    inputs.pop(column)
            inputs.update(processed_inputs)
            return inputs

        # Find the output schema if none is given.
        # The probe inputs are fetched again because the first probe call may have modified them.
        test_inputs = self[:2] if batched else self[0]
        test_indices = list(range(num_test_rows)) if batched else 0
        test_output = apply_function_on_filtered_inputs(test_inputs, test_indices)
        if arrow_schema is None and update_data:
            if not batched:
                test_output = self.nest(test_output)
            test_output = convert_tuples_in_lists(test_output)
            arrow_schema = pa.Table.from_pydict(test_output).schema
            if disable_nullable:
                arrow_schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in arrow_schema)

        # Check if we've already cached this computation (indexed by a hash)
        if self._data_files and update_data:
            if cache_file_name is None:
                # we create a unique hash from the function, current dataset file and the mapping args
                cache_kwargs = {"with_indices": with_indices,
                                "batched": batched,
                                "batch_size": batch_size,
                                "remove_columns": remove_columns,
                                "keep_in_memory": keep_in_memory,
                                "load_from_cache_file": load_from_cache_file,
                                "cache_file_name": cache_file_name,
                                "writer_batch_size": writer_batch_size,
                                "arrow_schema": arrow_schema,
                                "disable_nullable": disable_nullable}
                cache_file_name = self._get_cache_file_path(function, cache_kwargs)
            if os.path.exists(cache_file_name) and load_from_cache_file:
                logger.info("Loading cached processed dataset at %s", cache_file_name)
                return Dataset.from_file(cache_file_name)

        # Prepare output buffer and batched writer in memory or on file if we update the table
        if update_data:
            if keep_in_memory or not self._data_files:
                buf_writer = pa.BufferOutputStream()
                writer = ArrowWriter(schema=arrow_schema, stream=buf_writer, writer_batch_size=writer_batch_size)
            else:
                buf_writer = None
                logger.info("Caching processed dataset at %s", cache_file_name)
                writer = ArrowWriter(schema=arrow_schema, path=cache_file_name, writer_batch_size=writer_batch_size)

        # Loop over single examples or batches and write to buffer/file if examples are to be updated
        if not batched:
            for i, example in tqdm(enumerate(self)):
                example = apply_function_on_filtered_inputs(example, i)
                if update_data:
                    writer.write(example)
        else:
            for i in tqdm(range(0, len(self), batch_size)):
                batch = self[i:i+batch_size]
                # `slice.indices` clips the upper bound to the number of rows for the last batch
                indices = list(range(*(slice(i, i+batch_size).indices(self._data.num_rows))))  # Something simpler?
                batch = apply_function_on_filtered_inputs(batch, indices)
                if update_data:
                    writer.write_batch(batch)

        if update_data:
            writer.finalize()  # close_stream=bool(buf_writer is None))  # We only close if we are writing in a file

            # Create new Dataset from buffer or file
            if buf_writer is None:
                return Dataset.from_file(cache_file_name)
            else:
                return Dataset.from_buffer(buf_writer.getvalue())
        else:
            return self

Ejemplo n.º 41

0

Mostrar archivo

def test_recordbatch_from_arrays_validate_schema():
    """ARROW-6263: building a batch from an int array under a list<utf8> field must fail."""
    list_of_strings = pa.field('f0', pa.list_(pa.utf8()))
    mismatched_schema = pa.schema([list_of_strings])
    ints = pa.array([1, 2])
    with pytest.raises(NotImplementedError):
        pa.record_batch([ints], schema=mismatched_schema)

Ejemplo n.º 42

0

Mostrar archivo

Archivo: wordcount.py Proyecto: kwadie/dataflow-templates-cicd

# Avro record description with the two columns `word` and `count`.
AVRO_SCHEMA = {
    'namespace': 'avro.wordcount',
    'type': 'record',
    'name': 'WordCount',
    'fields': [
        {'name': 'word', 'type': 'string'},
        {'name': 'count', 'type': 'int'},
    ],
}
# Arrow schema with the same two columns; `count` is a 64-bit integer here.
PARQUET_SCHEMA = pyarrow.schema([
    ('word', pyarrow.string()),
    ('count', pyarrow.int64()),
])
DEFAULT_CODEC = 'snappy'


class WordExtractingDoFn(beam.DoFn):
    """Parse each line of input text into words."""
    def __init__(self):
        """Create the metrics this DoFn reports, all namespaced by its class."""
        namespace = self.__class__
        self.words_counter = Metrics.counter(namespace, 'words')
        self.word_lengths_counter = Metrics.counter(namespace, 'word_lengths')
        self.word_lengths_dist = Metrics.distribution(namespace,
                                                      'word_len_dist')
        self.empty_line_counter = Metrics.counter(namespace, 'empty_lines')

    def process(self, element):

Ejemplo n.º 43

0

Mostrar archivo

Archivo: test_dataset.py Proyecto: vrash/arrow

def test_filesystem_factory(mockfs, paths_or_selector):
    """Exercise FileSystemDatasetFactory: options, inspection, finish, scan and to_table."""
    read_options = ds.ParquetReadOptions(dictionary_columns={"str"})
    parquet_format = ds.ParquetFileFormat(read_options=read_options)

    options = ds.FileSystemFactoryOptions('subdir')
    partition_schema = pa.schema([
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ])
    options.partitioning = ds.DirectoryPartitioning(partition_schema)
    assert options.partition_base_dir == 'subdir'
    assert options.selector_ignore_prefixes == ['.', '_']
    assert options.exclude_invalid_files is False

    factory = ds.FileSystemDatasetFactory(
        mockfs, paths_or_selector, parquet_format, options)
    inspected_schema = factory.inspect()

    # File columns first, then the two partition columns.
    expected_schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
        pa.field('str', pa.dictionary(pa.int32(), pa.string())),
        pa.field('const', pa.int64()),
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ])
    assert factory.inspect().equals(expected_schema, check_metadata=False)

    assert isinstance(factory.inspect_schemas(), list)
    finished_with_schema = factory.finish(inspected_schema)
    assert isinstance(finished_with_schema, ds.FileSystemDataset)
    assert factory.root_partition.equals(ds.scalar(True))

    dataset = factory.finish()
    assert isinstance(dataset, ds.FileSystemDataset)
    assert len(list(dataset.scan())) == 2

    scanner = ds.Scanner.from_dataset(dataset)
    row_values = [0, 1, 2, 3, 4]
    expected_i64 = pa.array(row_values, type=pa.int64())
    expected_f64 = pa.array(row_values, type=pa.float64())
    expected_str = pa.DictionaryArray.from_arrays(
        pa.array(row_values, type=pa.int32()),
        pa.array("0 1 2 3 4".split(), type=pa.string())
    )
    partition_values = [(1, 'xxx'), (2, 'yyy')]
    for task, (group, key) in zip(scanner.scan(), partition_values):
        # Expected columns in batch order: i64, f64, str, const, group, key.
        expected_columns = [
            expected_i64,
            expected_f64,
            expected_str,
            pa.array([group - 1] * 5, type=pa.int64()),
            pa.array([group] * 5, type=pa.int32()),
            pa.array([key] * 5, type=pa.string()),
        ]
        for batch in task.execute():
            assert batch.num_columns == 6
            for position, expected in enumerate(expected_columns):
                assert batch[position].equals(expected)

    table = dataset.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 6

Ejemplo n.º 44

0

Mostrar archivo

Archivo: pandas_compat.py Proyecto: joshuawinter/beamer

def dataframe_to_arrays(df,
                        schema,
                        preserve_index,
                        nthreads=1,
                        columns=None,
                        safe=True):
    """Convert the columns of a pandas DataFrame to Arrow arrays plus a schema.

    Parameters
    ----------
    df : DataFrame to convert.
    schema : optional Arrow schema; when given, index fields are appended to
        it, otherwise a schema is built from the converted arrays.
    preserve_index : forwarded to ``_get_columns_to_convert`` and
        ``construct_metadata``.
    nthreads : number of conversion threads; ``None`` picks
        ``pa.cpu_count()`` when the frame has more than 100 rows per column
        and 1 otherwise.
    columns : forwarded to ``_get_columns_to_convert``.
    safe : forwarded to ``pa.array``.

    Returns
    -------
    (arrays, schema) where ``schema`` carries the metadata produced by
    ``construct_metadata``.

    Raises
    ------
    ValueError
        If a field is declared non-nullable but its converted column
        contains nulls.
    """
    (all_names, column_names, index_column_names, index_descriptors,
     index_columns, columns_to_convert,
     convert_fields) = _get_columns_to_convert(df, schema, preserve_index,
                                               columns)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether the
    # nrows > 100 * ncols.
    if nthreads is None:
        n_rows, n_cols = len(df), len(df.columns)
        nthreads = pa.cpu_count() if n_rows > n_cols * 100 else 1

    def convert_column(col, field):
        # Without a field, let Arrow infer the type and allow nulls.
        nullable = True if field is None else field.nullable
        target_type = None if field is None else field.type

        try:
            converted = pa.array(col, type=target_type, from_pandas=True,
                                 safe=safe)
        except (pa.ArrowInvalid, pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            # Attach the offending column to the error before re-raising.
            e.args += (
                "Conversion failed for column {0!s} with type {1!s}".format(
                    col.name, col.dtype), )
            raise e
        if not nullable and converted.null_count > 0:
            raise ValueError("Field {} was non-nullable but pandas column "
                             "had {} null values".format(
                                 str(field), converted.null_count))
        return converted

    if nthreads == 1:
        arrays = []
        for col, field in zip(columns_to_convert, convert_fields):
            arrays.append(convert_column(col, field))
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as pool:
            arrays = list(
                pool.map(convert_column, columns_to_convert, convert_fields))

    types = [arr.type for arr in arrays]

    if schema is None:
        # Build the schema from scratch; a missing name becomes 'None'.
        schema = pa.schema([
            pa.field('None' if name is None else name, type_)
            for name, type_ in zip(all_names, types)
        ])
    else:
        # add index columns: their types follow the data columns in `types`
        index_types = types[len(column_names):]
        for name, type_ in zip(index_column_names, index_types):
            field_name = 'None' if name is None else name
            schema = schema.append(pa.field(field_name, type_))

    metadata = construct_metadata(df, column_names, index_columns,
                                  index_descriptors, preserve_index, types)
    return arrays, schema.add_metadata(metadata)

Ejemplo n.º 45

0

Mostrar archivo

Archivo: test_convert_pandas.py Proyecto: crystalzyan/arrow

 def test_boolean_object_nulls(self):
     """Round-trip an object column of bools and Nones; expect an Arrow bool field."""
     values = np.array([False, None, True] * 100, dtype=object)
     frame = pd.DataFrame({'bools': values})
     expected = pa.schema([pa.field('bools', pa.bool_())])
     self._check_pandas_roundtrip(frame, expected_schema=expected)

Ejemplo n.º 46

0

Mostrar archivo

def make_meta(obj, origin, partition_keys=None):
    """
    Create metadata object for DataFrame.

    .. note::
        This function can, for convenience reasons, also be applied to schema objects: a
        ``SchemaWrapper`` is returned as is and a ``pyarrow.Schema`` is wrapped without
        further processing.

    .. warning::
        Information for categoricals will be stripped!

    :meth:`normalize_type` will be applied to normalize type information and :meth:`normalize_column_order` will be
    applied to reorder column information.

    Parameters
    ----------
    obj: Union[DataFrame, Schema]
        Object to extract metadata from.
    origin: str
        Origin of the schema data, used for debugging and error reporting.
    partition_keys: Union[None, List[str]]
        Partition keys used to split the dataset.

    Returns
    -------
    schema: SchemaWrapper
        Schema information for DataFrame.

    Raises
    ------
    ValueError
        If ``obj`` is neither a schema nor a pandas DataFrame.
    """
    if isinstance(obj, SchemaWrapper):
        return obj
    if isinstance(obj, pa.Schema):
        return SchemaWrapper(obj, origin)

    if not isinstance(obj, pd.DataFrame):
        raise ValueError(
            "Input must be a pyarrow schema, or a pandas dataframe")

    if ARROW_LARGER_EQ_0130:
        schema = pa.Schema.from_pandas(obj)
    else:
        # Fall back to a full table conversion and drop the table right away.
        table = pa.Table.from_pandas(obj)
        schema = table.schema
        del table
    pandas_metadata = _pandas_meta_from_schema(schema)

    # normalize types
    fields = {field.name: field.type for field in schema}
    for cmd in pandas_metadata["columns"]:
        name = cmd.get("name")
        if name is None:
            continue
        field_name = cmd["field_name"]
        field_idx = schema.get_field_index(field_name)
        field = schema[field_idx]
        if pa.types.is_dictionary(field.type):
            # TODO: remove this with Arrow 0.10 when we can access the dictionary
            #       information.
            # Derive the value type from a one-column frame of the categories.
            tmp_df = pd.DataFrame({field.name: obj[field.name].cat.categories})
            tmp_schema = pa.Table.from_pandas(tmp_df).schema
            field = tmp_schema[0]
            # Read the column description from the temporary one-column schema:
            # entry 0 of the full frame's metadata would describe the frame's
            # first column, not this categorical column.
            tmp_metadata = _pandas_meta_from_schema(tmp_schema)
            # NOTE(review): `cmd` now points into the temporary metadata, so the
            # assignments below do not update `pandas_metadata` for this column
            # -- confirm that this is intended.
            cmd = tmp_metadata["columns"][0]
        fields[field_name], cmd["pandas_type"], cmd["numpy_type"], cmd[
            "metadata"] = normalize_type(field.type, cmd["pandas_type"],
                                         cmd["numpy_type"], cmd["metadata"])
    metadata = schema.metadata
    metadata[b"pandas"] = _dict_to_binary(pandas_metadata)
    schema = pa.schema([pa.field(n, t) for n, t in fields.items()], metadata)
    return normalize_column_order(SchemaWrapper(schema, origin),
                                  partition_keys)

Ejemplo n.º 47

0

Mostrar archivo

Archivo: test_dataset.py Proyecto: vrash/arrow

def test_filesystem_dataset(mockfs):
    """Exercise construction and fragment inspection of ``ds.FileSystemDataset``.

    Covers keyword and positional construction, ``TypeError`` validation of
    missing or mistyped arguments, partition expressions on the dataset and
    on its fragments, and splitting fragments into row-group fragments.

    Parameters
    ----------
    mockfs
        Filesystem fixture passed as ``filesystem``; presumably it already
        holds the two parquet files listed in ``paths`` (defined outside this
        block -- confirm against the fixture).
    """
    schema = pa.schema([
        pa.field('const', pa.int64())
    ])

    file_format = ds.ParquetFileFormat()

    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.scalar(True), ds.scalar(True)]

    dataset = ds.FileSystemDataset(
        schema=schema,
        format=file_format,
        filesystem=mockfs,
        paths_or_selector=paths,
        partitions=partitions
    )

    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # the root_partition and partitions keywords have defaults
    dataset = ds.FileSystemDataset(
        paths, schema, format=file_format, filesystem=mockfs,
    )

    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # validation of required arguments
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, format=file_format, filesystem=mockfs)
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, filesystem=mockfs)
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, format=file_format)
    # validation of root_partition
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, format=file_format,
                             filesystem=mockfs, root_partition=1)

    root_partition = ds.field('level') == ds.scalar(1337)
    partitions = [ds.field('part') == x for x in range(1, 3)]
    dataset = ds.FileSystemDataset(
        paths_or_selector=paths,
        schema=schema,
        root_partition=root_partition,
        filesystem=mockfs,
        partitions=partitions,
        format=file_format
    )
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    for fragment, partition, path in zip(fragments, partitions, paths):
        assert fragment.partition_expression.equals(partition)
        assert fragment.path == path
        assert isinstance(fragment.format, ds.ParquetFileFormat)
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert fragment.row_groups is None

        row_group_fragments = list(fragment.get_row_group_fragments())
        assert len(row_group_fragments) == 1
        # Check the type of the row-group fragment itself; the parent
        # fragment's type was already asserted above.
        assert isinstance(row_group_fragments[0], ds.ParquetFileFragment)
        assert row_group_fragments[0].path == path
        assert row_group_fragments[0].row_groups == {0}

    fragments = list(dataset.get_fragments(filter=ds.field("const") == 0))
    assert len(fragments) == 2

Ejemplo n.º 48

0

Mostrar archivo

def normalize_column_order(schema, partition_keys=None):
    """
    Return a copy of ``schema`` with its columns in canonical order.

    The resulting order is:

    1. Partition keys, in the order given by ``partition_keys``
    2. Named DataFrame columns, sorted by name
    3. Columns without a name in the pandas metadata (typically index
       columns generated by pyarrow), in their original relative order

    The ``columns`` list of the embedded pandas metadata is reordered the
    same way.

    Parameters
    ----------
    schema: SchemaWrapper
        Schema information for DataFrame. Anything that is not a
        ``SchemaWrapper`` is wrapped with origin ``None``.
    partition_keys: Union[None, List[str]]
        Partition keys used to split the dataset.

    Returns
    -------
    schema: SchemaWrapper
        Schema information for DataFrame.
    """
    wrapped = (schema if isinstance(schema, SchemaWrapper)
               else SchemaWrapper(schema, None))
    keys = [] if partition_keys is None else list(partition_keys)

    pandas_metadata = _pandas_meta_from_schema(wrapped)
    origin = wrapped.origin

    by_partition_key = {}
    payload = []
    unnamed = []

    for column_meta in pandas_metadata["columns"]:
        name = column_meta.get("name")
        position = wrapped.get_field_index(column_meta["field_name"])
        # A negative index means the metadata entry has no matching field.
        entry = (column_meta, wrapped[position] if position >= 0 else None)

        if name is None:
            unnamed.append(entry)
        elif name in keys:
            by_partition_key[name] = entry
        else:
            payload.append((name, entry))

    # Sort on the name only; the sort is stable, so equal names keep their
    # original relative order.
    payload.sort(key=lambda item: item[0])

    ordered = [by_partition_key[key] for key in keys
               if key in by_partition_key]
    ordered.extend(entry for _name, entry in payload)
    ordered.extend(unnamed)

    pandas_metadata["columns"] = [meta for meta, _field in ordered]
    fields = [field for _meta, field in ordered if field is not None]

    metadata = wrapped.metadata
    metadata[b"pandas"] = _dict_to_binary(pandas_metadata)
    return SchemaWrapper(pa.schema(fields, metadata), origin)

Ejemplo n.º 49

0

Mostrar archivo

 def make_schema():
     """Build a schema with a single int64 column named ``field``."""
     only_column = ('field', pa.int64())
     return pa.schema([only_column])

Ejemplo n.º 50

0

Mostrar archivo

 def _to_arrow_schema(row_type):
     """Build a pyarrow schema from the field names and types of ``row_type``.

     Each field type is converted with ``to_arrow_type`` and its
     ``_nullable`` flag is passed on as the arrow field's nullability.
     """
     names = row_type.field_names()
     types = row_type.field_types()
     arrow_fields = []
     for field_name, field_type in zip(names, types):
         arrow_fields.append(
             pa.field(field_name, to_arrow_type(field_type),
                      field_type._nullable))
     return pa.schema(arrow_fields)

Ejemplo n.º 51

0

Mostrar archivo

Archivo: test_flight.py Proyecto: vikram/arrow

def test_flight_get_schema():
    """GetSchema must report the single int32 column ``a``."""
    with GetInfoFlightServer() as server:
        location = ('localhost', server.port)
        client = FlightClient(location)
        descriptor = flight.FlightDescriptor.for_command(b'')
        result = client.get_schema(descriptor)
        expected = pa.schema([('a', pa.int32())])
        assert result.schema == expected

Ejemplo n.º 52

0

Mostrar archivo

def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Read the data at ``filepath`` and return it as a pandas DataFrame.

    Parameters
    ----------
    filepath
        Location of the file. Paths starting with ``s3://`` are read through
        ``fs.S3FileSystem`` (region ``eu-west-1``); anything else is read
        from the local filesystem.
    table_params
        Per-table options. Keys read here: ``expect-header``, ``row-limit``,
        ``headers-ignore-case`` and ``only-test-cols-in-metadata``.
    metadata
        Table metadata dict. Keys read here: ``columns``, ``partitions``
        (optional) and ``file_format``.

    Returns
    -------
    The loaded DataFrame, optionally sampled down to ``row-limit`` rows,
    with lower-cased column names and/or restricted to the metadata columns.

    Raises
    ------
    ValueError
        If ``metadata["file_format"]`` is not a csv, json or parquet format.
    """

    meta_col_names = [
        c["name"] for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string based file types convert make arrow readers read them in as strings
    # validators will still treat these as dates but will run validation against strings
    # cols expecting values to match a timestamp format
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith(("time", "date")):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        # Sub-schema holding only the date/time columns that must be read
        # as strings; it stays empty (falsy) when there are none.
        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(
                arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)

    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:

            # Safer CSV load for newlines_in_values set to True
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                po = csv.ParseOptions(newlines_in_values=True,
                                      column_names=meta_col_names)

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:

            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )

            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}.")

    row_limit = table_params.get("row-limit")
    if row_limit:
        # DataFrame.sample raises ValueError when asked for more rows than
        # the frame holds (sampling without replacement), so cap the sample
        # size at the number of rows actually read.
        df = df.sample(min(row_limit, len(df)))

    if table_params.get("headers-ignore-case"):
        df.columns = [c.lower() for c in df.columns]

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df

Ejemplo n.º 53

0

Mostrar archivo

Archivo: test_flight.py Proyecto: vikram/arrow

 def do_get(self, context, ticket):
     """Stream the output of ``self.slow_stream()`` with a one-column schema.

     ``context`` and ``ticket`` are accepted but not used.
     """
     stream_schema = pa.schema([('a', pa.int32())])
     batches = self.slow_stream()
     return flight.GeneratorStream(stream_schema, batches)

Ejemplo n.º 54

0

Mostrar archivo

Archivo: parquet_schema.py Proyecto: shubhank-saxena/OpenWPM

import pyarrow as pa

# Registry of pyarrow schemas, keyed by table name. Each section below builds
# the field list for one table and stores the resulting schema under that
# table's name. The module-level name ``fields`` is reused for every table.
PQ_SCHEMAS = dict()

# site_visits: every column except site_rank is declared non-nullable.
fields = [
    pa.field('visit_id', pa.int64(), nullable=False),
    pa.field('crawl_id', pa.uint32(), nullable=False),
    pa.field('instance_id', pa.uint32(), nullable=False),
    pa.field('site_url', pa.string(), nullable=False),
    pa.field('site_rank', pa.uint32())
]
PQ_SCHEMAS['site_visits'] = pa.schema(fields)

# crawl_history: the three id columns are non-nullable; the command details
# (command, arguments, retry_number, command_status, error, traceback) use
# pa.field's default nullability.
fields = [
    pa.field('crawl_id', pa.uint32(), nullable=False),
    pa.field('visit_id', pa.int64(), nullable=False),
    pa.field('instance_id', pa.uint32(), nullable=False),
    pa.field('command', pa.string()),
    pa.field('arguments', pa.string()),
    pa.field('retry_number', pa.int8()),
    pa.field('command_status', pa.string()),
    pa.field('error', pa.string()),
    pa.field('traceback', pa.string())
]
PQ_SCHEMAS['crawl_history'] = pa.schema(fields)

# http_requests
fields = [
    pa.field('incognito', pa.int32()),

Ejemplo n.º 55

0

Mostrar archivo

def test_to_column_info():
    """``to_column_info`` must describe every arrow field of the schema.

    The expected result is a tuple with one dict per schema field, in schema
    order, each carrying ``Name``, ``Nullable``, ``Precision``, ``Scale``
    and ``Type``.
    """
    arrow_fields = [
        pa.field("col_boolean", pa.bool_()),
        pa.field("col_tinyint", pa.int32()),
        pa.field("col_smallint", pa.int32()),
        pa.field("col_int", pa.int32()),
        pa.field("col_bigint", pa.int64()),
        pa.field("col_float", pa.float32()),
        pa.field("col_double", pa.float64()),
        pa.field("col_string", pa.string()),
        pa.field("col_varchar", pa.string()),
        pa.field("col_timestamp", pa.timestamp("ns")),
        pa.field("col_date", pa.date32()),
        pa.field("col_binary", pa.binary()),
        pa.field("col_array", pa.list_(pa.field("array_element", pa.int32()))),
        pa.field("col_map", pa.map_(pa.int32(),
                                    pa.field("entries", pa.int32()))),
        pa.field(
            "col_struct",
            pa.struct([pa.field("a", pa.int32()),
                       pa.field("b", pa.int32())]),
        ),
        pa.field("col_decimal", pa.decimal128(10, 1)),
    ]
    schema = pa.schema(arrow_fields)

    # (Name, Precision, Scale, Type); every column is expected to be
    # reported as "NULLABLE".
    expected_rows = [
        ("col_boolean", 0, 0, "boolean"),
        ("col_tinyint", 10, 0, "integer"),
        ("col_smallint", 10, 0, "integer"),
        ("col_int", 10, 0, "integer"),
        ("col_bigint", 19, 0, "bigint"),
        ("col_float", 17, 0, "float"),
        ("col_double", 17, 0, "double"),
        ("col_string", 2147483647, 0, "varchar"),
        ("col_varchar", 2147483647, 0, "varchar"),
        ("col_timestamp", 3, 0, "timestamp"),
        ("col_date", 0, 0, "date"),
        ("col_binary", 1073741824, 0, "varbinary"),
        ("col_array", 0, 0, "array"),
        ("col_map", 0, 0, "map"),
        ("col_struct", 0, 0, "row"),
        ("col_decimal", 10, 1, "decimal"),
    ]
    expected = tuple(
        {
            "Name": col_name,
            "Nullable": "NULLABLE",
            "Precision": precision,
            "Scale": scale,
            "Type": type_name,
        }
        for col_name, precision, scale, type_name in expected_rows
    )

    assert to_column_info(schema) == expected

Ejemplo n.º 56

0

Mostrar archivo

def convert_csv_to_parquet(csv_file: str, parquet_dir: str, partitioned: bool):
    """Read a ``;``-delimited CSV file and write it as a parquet dataset.

    The CSV is read without a header row, using a fixed list of seven column
    names; five of those columns are given explicit arrow types. Progress is
    reported with ``print``.

    Parameters
    ----------
    csv_file
        Path of the CSV file to read.
    parquet_dir
        Root directory of the parquet dataset to write.
    partitioned
        When true, an existing ``parquet_dir`` directory is deleted first and
        the dataset is partitioned by ``start_year``; otherwise the dataset
        is written unpartitioned.
    """
    print("Start ", datetime.now())

    print(csv_file)
    print(parquet_dir)

    print("Abs path of csv file: " + os.path.abspath(csv_file))

    # Remove old partitions
    if partitioned and Path(parquet_dir).is_dir():
        shutil.rmtree(parquet_dir)

    # https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html
    source_columns = [
        "unit_id", "value", "start", "stop",
        "start_year", "start_epoch_days",
        "stop_epoch_days",
    ]
    read_opts = pv.ReadOptions(skip_rows=0,
                               encoding="utf8",
                               column_names=source_columns)

    # https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
    parse_opts = pv.ParseOptions(delimiter=';')

    # Types: https://arrow.apache.org/docs/python/api/datatypes.html
    # TODO nullable parameter does not work as expected!
    typed_fields = [
        pa.field(name='start_year', type=pa.string(), nullable=True),
        pa.field(name='unit_id', type=pa.uint64(), nullable=False),
        pa.field(name='value', type=pa.string(), nullable=False),
        pa.field(name='start_epoch_days', type=pa.int16(), nullable=True),
        pa.field(name='stop_epoch_days', type=pa.int16(), nullable=True),
    ]
    data_schema = pa.schema(typed_fields)

    # https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html
    convert_opts = pv.ConvertOptions(column_types=data_schema)

    # https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html
    table = pv.read_csv(input_file=csv_file,
                        read_options=read_opts,
                        parse_options=parse_opts,
                        convert_options=convert_opts)

    # Only the partitioned variant passes partition_cols.
    write_kwargs = {"root_path": parquet_dir}
    if partitioned:
        write_kwargs["partition_cols"] = ['start_year']
    pq.write_to_dataset(table, **write_kwargs)

    print("End ", datetime.now())

Ejemplo n.º 57

0

Mostrar archivo

def test_flight_get_schema():
    """GetSchema must report the single int32 column ``a``."""
    with flight_server(GetInfoFlightServer) as server_location:
        client = flight.FlightClient.connect(server_location)
        descriptor = flight.FlightDescriptor.for_command(b'')
        result = client.get_schema(descriptor)
        expected = pa.schema([('a', pa.int32())])
        assert result.schema == expected

Ejemplo n.º 58

0

Mostrar archivo

Archivo: test_dataset.py Proyecto: vrash/arrow

def test_union_dataset_from_other_datasets(tempdir, multisourcefs):
    """Combine existing datasets into a ``ds.UnionDataset`` and check schemas.

    Checks the unified schema of the union, that extra arguments are rejected
    when the sources are datasets, that an explicit ``schema`` is honoured
    (including a column none of the children provide), and that children
    with conflicting column types cannot be merged.
    """
    shared = dict(filesystem=multisourcefs, format='parquet')
    plain = ds.dataset('/plain', **shared)
    by_directory = ds.dataset('/schema', partitioning=['week', 'color'],
                              **shared)
    by_hive = ds.dataset('/hive', partitioning='hive', **shared)

    assert plain.schema != by_directory.schema != by_hive.schema

    union = ds.dataset([plain, by_directory, by_hive])
    assert isinstance(union, ds.UnionDataset)

    with pytest.raises(ValueError,
                       match='cannot pass any additional arguments'):
        ds.dataset([plain, by_directory], filesystem=multisourcefs)

    # Columns expected in both unions checked below.
    data_columns = [
        ('date', pa.date32()),
        ('index', pa.int64()),
        ('value', pa.float64()),
        ('color', pa.string()),
    ]
    year_month = [
        ('year', pa.int32()),
        ('month', pa.int32()),
    ]

    all_children_schema = pa.schema(
        data_columns + [('week', pa.int32())] + year_month)
    assert union.schema.equals(all_children_schema)
    assert union.to_table().schema.equals(all_children_schema)

    union = ds.dataset([plain, by_hive])
    plain_and_hive_schema = pa.schema(data_columns + year_month)
    assert union.schema.equals(plain_and_hive_schema)
    assert union.to_table().schema.equals(plain_and_hive_schema)

    projected_schema = pa.schema([
        ('month', pa.int32()),
        ('color', pa.string()),
        ('date', pa.date32()),
    ])
    union = ds.dataset([plain, by_hive], schema=projected_schema)
    assert union.to_table().schema.equals(projected_schema)

    with_missing_column = pa.schema([
        ('month', pa.int32()),
        ('color', pa.string()),
        ('unkown', pa.string())  # fill with nulls
    ])
    union = ds.dataset([plain, by_hive], schema=with_missing_column)
    assert union.to_table().schema.equals(with_missing_column)

    # incompatible schemas, date and index columns have conflicting types
    conflicting_table = pa.table(
        [range(9), [0.] * 4 + [1.] * 5, 'abcdefghj'],
        names=['date', 'value', 'index'])
    _, conflicting_path = _create_single_file(tempdir,
                                              table=conflicting_table)
    conflicting = ds.dataset(conflicting_path)

    with pytest.raises(pa.ArrowInvalid, match='Unable to merge'):
        ds.dataset([plain, conflicting])

Ejemplo n.º 59

0

Mostrar archivo

 def schema(self):
     """Return the arrow schema built from ``self.features.type``.

     Returns ``None`` when ``self.features`` is ``None``.
     """
     if self.features is None:
         return None
     return pa.schema(self.features.type)

Ejemplo n.º 60

0

Mostrar archivo

import pyarrow as pa

# Schema with a single non-nullable uint8 field named "input". Both the field
# and the schema carry fletcher_* key/value metadata.
input_schema = pa.schema([
    pa.field("input", pa.uint8(), False).with_metadata({b'fletcher_epc': b'8'})
]).with_metadata({
    b'fletcher_mode': b'read',
    b'fletcher_name': b'input'
})

# Write the serialized schema to "in.as". The context manager closes the
# stream deterministically instead of leaving that to garbage collection.
with pa.output_stream("in.as") as _schema_sink:
    _schema_sink.write(input_schema.serialize())

test_rec = """{
"timestamp": "2005-09-09T11:59:06-10:01",  
    "accel_decel": 11446688,
    "timezone": 883,
    "vin": 8834555,
    "odometer": 99711112,
    "hypermiling": false,
    "avgspeed": 156,
    "sec_in_band": [3403, 893, 2225, 78, 162, 2332, 1473, 2587, 3446, 178, 997, 2403],
    "miles_in_time_range": [3376, 2553, 2146, 919, 2241, 1044, 1079, 3751, 1665, 2062, 46, 2868, 375, 3305, 4109, 3319, 627, 3523, 2225, 357, 1653, 2757, 3477, 3549],
    "const_speed_miles_in_band": [4175, 2541, 2841, 157, 2922, 651, 315, 2484, 2696, 165, 1366, 958],
    "vary_speed_miles_in_band": [2502, 155, 1516, 1208, 2229, 1850, 4032, 3225, 2704, 2064, 484, 3073],
    "sec_decel": [722, 2549, 547, 3468, 844, 3064, 2710, 1515, 763, 2972],
    "sec_accel": [4175, 2541, 2841, 157, 2922, 651, 315, 2484, 2696, 165, 1366, 958],
    "accel": [1780, 228, 1267, 2389, 437, 871],
    "orientation": false,
    "braking": [724, 2549, 547],
    "small_speed_var": [724, 2549, 54788],
    "large_speed_var": [724, 2549, 5478],
    "speed_changes": 156