def _test_convert_nested(orig, expected_type, expected_value):
    """Check deep conversion of a single value through ``apply_schema``.

    ``orig`` becomes the only cell of a one-row dataset, which is converted
    against the one-column schema ``"a:" + expected_type`` with ``deep=True``:
    once with the default ``copy`` setting and once with ``copy=False``.

    :param orig: raw value to convert
    :param expected_type: type expression appended to ``"a:"``
    :param expected_value: value that both conversions must produce
    """
    rows = [[orig]]
    pa_schema = Schema("a:" + expected_type).pa_schema
    # The default-copy conversion runs first, before the copy=False call
    # gets to hand back (and reuse) the input row itself.
    copied_row = list(apply_schema(pa_schema, rows, deep=True))[0]
    in_place_row = list(
        apply_schema(pa_schema, rows, copy=False, deep=True)
    )[0]
    for converted_row in (copied_row, in_place_row):
        assert expected_value == converted_row[0]
    # Default copy must yield a new row object; copy=False must return the
    # very same row object that was passed in.
    assert copied_row is not rows[0]
    assert in_place_row is rows[0]
def _test_convert(orig, expected_type, expected_value):
    """Check conversion of a single value through ``apply_schema``.

    ``orig`` becomes the only cell of a one-row dataset, which is converted
    against the one-column schema ``"a:" + expected_type``: once with the
    default ``copy`` setting and once with ``copy=False``.

    :param orig: raw value to convert
    :param expected_type: type expression appended to ``"a:"``
    :param expected_value: value that both conversions must produce; a float
      NaN is checked with ``math.isnan`` and ``pd.NaT`` by identity, since
      neither can be verified with ``==``
    """
    rows = [[orig]]
    pa_schema = Schema("a:" + expected_type).pa_schema
    # The default-copy conversion runs first, before the copy=False call
    # gets to hand back (and reuse) the input row itself.
    copied_row = list(apply_schema(pa_schema, rows))[0]
    in_place_row = list(apply_schema(pa_schema, rows, copy=False))[0]

    # How the expectation has to be compared does not depend on the row.
    expects_nan = isinstance(expected_value, float) and math.isnan(
        expected_value
    )
    expects_nat = expected_value is pd.NaT

    for converted_row in (copied_row, in_place_row):
        actual = converted_row[0]
        if expects_nan:
            assert math.isnan(actual)
        elif expects_nat:
            assert actual is pd.NaT
        else:
            assert expected_value == actual

    # Default copy must yield a new row object; copy=False must return the
    # very same row object that was passed in.
    assert copied_row is not rows[0]
    assert in_place_row is rows[0]
def as_array_iterable(
    self,
    df: T,
    schema: Optional[pa.Schema] = None,
    columns: Optional[List[str]] = None,
    type_safe: bool = False,
) -> Iterable[List[Any]]:
    """Iterate over a pandas like dataframe row by row, each row as a list.

    :param df: pandas like dataframe
    :param schema: schema of the input. When None it is inferred from
      ``df``; the inferred schema may be wrong for nested types, so prefer
      passing it explicitly
    :param columns: columns to output, None for all columns
    :param type_safe: whether to enforce the types in schema, if False, the
      original values from the dataframe are returned
    :return: iterable of rows, each row is a list

    :Notice:

    If there are nested types in schema, the conversion can be slower
    """
    if self.empty(df):
        return
    if schema is None:
        schema = self.to_schema(df)
    if columns is not None:
        # Narrow both the data and the schema to the requested columns,
        # in the requested order.
        df = df[columns]
        schema = pa.schema([schema.field(name) for name in columns])

    if not type_safe:
        # No type enforcement: emit the dataframe values as they are.
        yield from map(list, df.itertuples(index=False, name=None))
        return

    if any(pa.types.is_nested(tp) for tp in schema.types):
        # If schema has nested types, the conversion will be much slower
        yield from apply_schema(
            schema,
            df.itertuples(index=False, name=None),
            copy=True,
            deep=True,
            str_as_json=True,
        )
        return

    # Flat schema: convert through arrow, then transpose the per-column
    # value lists back into rows.
    column_data = self.as_arrow(df, schema).to_pydict()
    ordered_columns = [column_data[name] for name in schema.names]
    yield from map(list, zip(*ordered_columns))
def as_array_iterable(
    self, columns: Optional[List[str]] = None, type_safe: bool = False
) -> Iterable[Any]:
    """Iterate over the rows held in ``self.native``.

    :param columns: columns to output, None for all columns. When given,
      the work is delegated to a new ``IterableDataFrame`` built from this
      dataframe and the extracted sub-schema
    :param type_safe: if True, rows are passed through ``apply_schema``
      using this dataframe's schema; if False, the native rows are yielded
      unchanged
    :return: iterable of rows
    """
    if columns is not None:
        projected = IterableDataFrame(self, self.schema.extract(columns))
        yield from projected.as_array_iterable(type_safe=type_safe)
        return

    if not type_safe:
        yield from self.native
        return

    yield from apply_schema(
        self.schema.pa_schema,
        self.native,
        copy=True,
        deep=True,
        str_as_json=True,
    )
def as_array_iterable(
    self, columns: Optional[List[str]] = None, type_safe: bool = False
) -> Iterable[Any]:
    """Iterate over the rows produced by ``self._iter_cols``.

    :param columns: columns to output, None for all columns. When given it
      must resolve to at least one column position
    :param type_safe: if True, rows are passed through ``apply_schema``
      using the schema of the selected columns; if False, rows are yielded
      as ``self._iter_cols`` produces them
    :return: iterable of rows
    """
    # An empty position list is what gets passed when no columns are
    # requested.
    pos = []
    if columns is not None:
        pos = [self.schema.index_of_key(name) for name in columns]
        assert_or_throw(len(pos) > 0, "columns if set must be non empty")

    if not type_safe:
        yield from self._iter_cols(pos)
        return

    target_schema = (
        self.schema.extract(columns) if columns is not None else self.schema
    )
    yield from apply_schema(
        target_schema.pa_schema,
        self._iter_cols(pos),
        copy=True,
        deep=True,
        str_as_json=True,
    )
def _assert_not_supported(orig, expected_type, deep=False):
    """Assert that converting ``orig`` raises ``NotImplementedError``.

    The schema construction happens inside the ``raises`` block as well, so
    a ``NotImplementedError`` from building ``"a:" + expected_type`` also
    satisfies the check.

    :param orig: raw value to convert
    :param expected_type: type expression appended to ``"a:"``
    :param deep: forwarded to ``apply_schema``
    """
    with raises(NotImplementedError):
        rows = [[orig]]
        pa_schema = Schema("a:" + expected_type).pa_schema
        # Consume the result and touch the first row so that any error
        # surfaces inside the raises block.
        _ = list(apply_schema(pa_schema, rows, deep=deep))[0]
def _assert_raise(orig, expected_type, deep=False):
    """Assert that converting ``orig`` raises ``ValueError``.

    The schema construction happens inside the ``raises`` block as well, so
    a ``ValueError`` from building ``"a:" + expected_type`` also satisfies
    the check.

    :param orig: raw value to convert
    :param expected_type: type expression appended to ``"a:"``
    :param deep: forwarded to ``apply_schema``
    """
    with raises(ValueError):
        rows = [[orig]]
        pa_schema = Schema("a:" + expected_type).pa_schema
        # Consume the result and touch the first row so that any error
        # surfaces inside the raises block.
        _ = list(apply_schema(pa_schema, rows, deep=deep))[0]