from collections.abc import Iterator

import pytest

import pyarrow as pa


def test_key_value_metadata():
    m = pa.KeyValueMetadata({'a': 'A', 'b': 'B'})
    assert len(m) == 2
    assert m['a'] == b'A'
    assert m[b'a'] == b'A'
    assert m['b'] == b'B'
    assert 'a' in m
    assert b'a' in m
    assert 'c' not in m

    m1 = pa.KeyValueMetadata({'a': 'A', 'b': 'B'})
    m2 = pa.KeyValueMetadata(a='A', b='B')
    m3 = pa.KeyValueMetadata([('a', 'A'), ('b', 'B')])

    assert m1 != 2
    assert m1 == m2
    assert m2 == m3
    assert m1 == {'a': 'A', 'b': 'B'}
    assert m1 != {'a': 'A', 'b': 'C'}

    with pytest.raises(TypeError):
        pa.KeyValueMetadata({'a': 1})
    with pytest.raises(TypeError):
        pa.KeyValueMetadata({1: 'a'})
    with pytest.raises(TypeError):
        pa.KeyValueMetadata(a=1)

    expected = [(b'a', b'A'), (b'b', b'B')]
    result = [(k, v) for k, v in m3.items()]
    assert result == expected
    assert list(m3.items()) == expected
    assert list(m3.keys()) == [b'a', b'b']
    assert list(m3.values()) == [b'A', b'B']
    assert len(m3) == 2

    # test duplicate key support
    md = pa.KeyValueMetadata([
        ('a', 'alpha'),
        ('b', 'beta'),
        ('a', 'Alpha'),
        ('a', 'ALPHA'),
    ], b='BETA')

    expected = [
        (b'a', b'alpha'),
        (b'b', b'beta'),
        (b'a', b'Alpha'),
        (b'a', b'ALPHA'),
        (b'b', b'BETA'),
    ]
    assert len(md) == 5
    assert isinstance(md.keys(), Iterator)
    assert isinstance(md.values(), Iterator)
    assert isinstance(md.items(), Iterator)
    assert list(md.items()) == expected
    assert list(md.keys()) == [k for k, _ in expected]
    assert list(md.values()) == [v for _, v in expected]

    # lookups return the first occurrence of a duplicated key
    assert md['a'] == b'alpha'
    assert md['b'] == b'beta'
    assert md.get_all('a') == [b'alpha', b'Alpha', b'ALPHA']
    assert md.get_all('b') == [b'beta', b'BETA']
    assert md.get_all('unknown') == []
def test_key_value_metadata_duplicates():
    meta = pa.KeyValueMetadata({'a': '1', 'b': '2'})

    # merging keyword arguments into existing metadata rejects a keyword
    # whose key is already present
    with pytest.raises(KeyError):
        pa.KeyValueMetadata(meta, a='3')
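# A minimal sketch (an illustration, not one of the tests above) of the merge
# behavior the duplicates test relies on: keyword arguments are folded into an
# existing KeyValueMetadata, and KeyError is raised only when a keyword
# collides with a key that is already present. The function name and the
# non-colliding key 'c' are hypothetical.
def example_merge_without_collision():
    meta = pa.KeyValueMetadata({'a': '1', 'b': '2'})
    merged = pa.KeyValueMetadata(meta, c='3')  # 'c' is new, so no KeyError
    assert merged['c'] == b'3'
    assert len(merged) == 3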
def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
                        safe=True):
    (all_names,
     column_names,
     index_column_names,
     index_descriptors,
     index_columns,
     columns_to_convert,
     convert_fields) = _get_columns_to_convert(df, schema, preserve_index,
                                               columns)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

    def convert_column(col, field):
        if field is None:
            field_nullable = True
            type_ = None
        else:
            field_nullable = field.nullable
            type_ = field.type

        try:
            result = pa.array(col, type=type_, from_pandas=True, safe=safe)
        except (pa.ArrowInvalid,
                pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            # annotate the error with the offending column before re-raising
            e.args += ("Conversion failed for column {!s} with type {!s}"
                       .format(col.name, col.dtype),)
            raise e
        if not field_nullable and result.null_count > 0:
            raise ValueError("Field {} was non-nullable but pandas column "
                             "had {} null values".format(str(field),
                                                         result.null_count))
        return result

    if nthreads == 1:
        arrays = [convert_column(c, f)
                  for c, f in zip(columns_to_convert, convert_fields)]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(executor.map(convert_column, columns_to_convert,
                                       convert_fields))

    types = [x.type for x in arrays]

    if schema is None:
        fields = []
        for name, type_ in zip(all_names, types):
            name = name if name is not None else 'None'
            fields.append(pa.field(name, type_))
        schema = pa.schema(fields)

    pandas_metadata = construct_metadata(df, column_names, index_columns,
                                         index_descriptors, preserve_index,
                                         types)
    metadata = pa.KeyValueMetadata(schema.metadata or {}, **pandas_metadata)
    schema = schema.with_metadata(metadata)

    return arrays, schema
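# A minimal usage sketch, assuming it runs in a context where the private
# helpers used above (_get_columns_to_convert, construct_metadata) are
# available; end users would normally reach this code path through
# pa.Table.from_pandas(df) rather than calling dataframe_to_arrays directly.
# The function name and the example DataFrame are illustrative only.
def example_dataframe_to_arrays():
    import pandas as pd

    df = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})
    arrays, schema = dataframe_to_arrays(df, schema=None,
                                         preserve_index=False)
    table = pa.Table.from_arrays(arrays, schema=schema)
    # construct_metadata's output was merged into the schema above, so the
    # pandas round-trip information travels with the table (metadata keys
    # are exposed as bytes).
    assert b'pandas' in table.schema.metadata
    return table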