def as_local(self) -> LocalDataFrame:
    """Materialize this dataframe as a local dataframe.

    When no column type in ``self.schema`` is nested (as reported by
    ``pa.types.is_nested``), the native dataframe is converted with
    ``toPandas()`` and wrapped in a ``PandasDataFrame``. Otherwise the rows
    are collected, passed through ``to_type_safe_input`` and wrapped in an
    ``ArrayDataFrame``.

    :return: a local dataframe carrying the same schema and metadata
    """
    # TODO: does it make sense to also include the metadata?
    has_nested_column = any(
        pa.types.is_nested(col_type) for col_type in self.schema.types
    )
    if not has_nested_column:
        return PandasDataFrame(self.native.toPandas(), self.schema, self.metadata)
    # NOTE(review): nested types take the collect() route, presumably because
    # the pandas conversion does not handle them well -- confirm.
    collected = self.native.collect()
    rows = list(to_type_safe_input(collected, self.schema))
    return ArrayDataFrame(rows, self.schema, self.metadata)
def as_array_iterable(
    self, columns: Optional[List[str]] = None, type_safe: bool = False
) -> Iterable[Any]:
    """Lazily yield the rows of this dataframe.

    :param columns: forwarded to ``self._withColumns`` to obtain the
        dataframe that is actually iterated
    :param type_safe: when False, rows come from
        ``rdd.toLocalIterator()`` passed through ``to_type_safe_input``;
        when True, those same rows are wrapped in an ``IterableDataFrame``
        and re-iterated with ``type_safe=True``
    :return: a generator of rows
    """
    selected = self._withColumns(columns)
    if type_safe:
        # Reuse the non-type-safe path as the row source and let
        # IterableDataFrame perform the type-safe conversion.
        untyped_rows = selected.as_array_iterable(type_safe=False)
        wrapper = IterableDataFrame(untyped_rows, selected.schema)
        yield from wrapper.as_array_iterable(type_safe=True)
    else:
        local_rows = selected.native.rdd.toLocalIterator()
        yield from to_type_safe_input(local_rows, selected.schema)
def run(self, no: int, rows: Iterable[ps.Row]) -> Iterable[Any]:
    """Apply ``self.map_func`` to the given rows and yield the output rows.

    The rows are wrapped in an ``IterableDataFrame``; if it is empty nothing
    is yielded. Otherwise ``self.on_init`` (when set) is called once with
    ``no`` and the wrapped dataframe, the data is split according to
    ``self.partition_spec``, and ``self.map_func`` is invoked once per
    sub-partition with the shared cursor.

    :param no: number passed to ``get_cursor`` and ``on_init``
    :param rows: the input rows to process
    :return: a generator of type-safe rows produced by ``self.map_func``
    """
    input_df = IterableDataFrame(
        to_type_safe_input(rows, self.schema), self.schema, self.metadata
    )
    if input_df.empty:  # pragma: no cover
        return
    cursor = self.partition_spec.get_cursor(self.schema, no)
    if self.on_init is not None:
        self.on_init(no, input_df)

    partitions: Iterable[Tuple[int, int, EmptyAwareIterable]]
    if not self.partition_spec.empty:
        partitions = self.partition_spec.get_partitioner(self.schema).partition(
            input_df.native
        )
    else:
        # No partitioning requested: treat the whole input as one partition.
        partitions = [(0, 0, input_df.native)]

    # NOTE(review): the two ints are presumably the partition number and the
    # slice number (originally named pn/sn) -- confirm against the partitioner.
    for part_no, slice_no, chunk in partitions:
        # Point the cursor at the first row of this chunk before mapping.
        cursor.set(chunk.peek(), part_no, slice_no)
        chunk_df = IterableDataFrame(chunk, self.schema)
        # Assigned directly (not via the constructor), as in the original.
        chunk_df._metadata = self.metadata
        mapped = self.map_func(cursor, chunk_df)
        yield from mapped.as_array_iterable(type_safe=True)