def get_dfs(seq):
    """Yield one dataframe per recognized marker character in ``seq``.

    ``"e"`` -> empty IterableDataFrame, ``"v"`` -> single-row
    IterableDataFrame, ``"o"`` -> empty ArrayDataFrame.  Any other
    character yields nothing.
    """
    factories = {
        "e": lambda: IterableDataFrame([], "a:int,b:int"),
        "v": lambda: IterableDataFrame([[1, 10]], "a:int,b:int"),
        # bad schema but empty dataframe doesn't matter
        "o": lambda: ArrayDataFrame([], "a:int,b:str"),
    }
    for marker in seq:
        if marker in factories:
            yield factories[marker]()
def as_array_iterable(
    self, columns: Optional[List[str]] = None, type_safe: bool = False
) -> Iterable[Any]:
    """Stream rows of this dataframe as lists.

    :param columns: optional subset of column names to select
    :param type_safe: when True, coerce values to the schema's types
    :return: an iterable of rows (each row a list)
    """
    selected = self._withColumns(columns)
    if type_safe:
        # Wrap the raw (non type-safe) stream and let IterableDataFrame
        # perform the per-value coercion.
        wrapper = IterableDataFrame(
            selected.as_array_iterable(type_safe=False), selected.schema
        )
        yield from wrapper.as_array_iterable(type_safe=True)
    else:
        yield from to_type_safe_input(
            selected.native.rdd.toLocalIterator(), selected.schema
        )
def to_output_df(
    self, output: EmptyAwareIterable[Dict[str, Any]], schema: Any
) -> DataFrame:
    """Convert a stream of dict rows into an IterableDataFrame.

    :param output: iterable of dict rows keyed by column name
    :param schema: a Schema or anything convertible to one
    :return: an IterableDataFrame streaming the rows in schema order
    """
    target = schema if isinstance(schema, Schema) else Schema(schema)

    def _rows() -> Iterable[List[Any]]:
        # project each dict row onto the schema's column order, lazily
        for record in output:
            yield [record[name] for name in target.names]

    return IterableDataFrame(_rows(), target)
def _test_as_array_perf():
    """Rough micro-benchmark: as_array with vs. without type_safe.

    Builds a 300-column schema (int / int-with-float-values / str) and
    5000 identical rows, then prints the cumulative seconds for ten runs
    of each mode.
    """
    schema = Schema()
    template = []
    for i in range(100):
        schema.append(f"a{i}:int")
        template.append(i)
    for i in range(100):
        # float values under an int column: exercises coercion in type_safe mode
        schema.append(f"b{i}:int")
        template.append(float(i))
    for i in range(100):
        schema.append(f"c{i}:str")
        template.append(str(i))
    data = [list(template) for _ in range(5000)]
    df = IterableDataFrame(data, schema)
    # warm-up passes before timing
    res = df.as_array()
    res = df.as_array(type_safe=True)
    nts, ts = 0.0, 0.0
    for _ in range(10):
        start = datetime.now()
        res = df.as_array()
        nts += (datetime.now() - start).total_seconds()
        start = datetime.now()
        res = df.as_array(type_safe=True)
        ts += (datetime.now() - start).total_seconds()
    print(nts, ts)
def run(self, no: int, rows: Iterable[ps.Row]) -> Iterable[Any]:
    """Map one physical partition of rows through ``self.map_func``.

    :param no: physical partition number
    :param rows: the raw rows of this physical partition
    :return: iterable of type-safe output rows from the mapped dataframes
    """
    # Wrap the incoming rows in a streaming dataframe with type coercion.
    df = IterableDataFrame(
        to_type_safe_input(rows, self.schema), self.schema, self.metadata
    )
    if df.empty:  # pragma: no cover
        return
    cursor = self.partition_spec.get_cursor(self.schema, no)
    if self.on_init is not None:
        self.on_init(no, df)
    if self.partition_spec.empty:
        # No logical partitioning requested: the whole physical partition
        # is a single (physical 0, slice 0) logical partition.
        partitions: Iterable[Tuple[int, int, EmptyAwareIterable]] = [
            (0, 0, df.native)
        ]
    else:
        partitioner = self.partition_spec.get_partitioner(self.schema)
        partitions = partitioner.partition(df.native)
    for pn, sn, sub in partitions:
        # peek() inspects the first row without consuming the stream,
        # so the cursor is positioned before sub is wrapped below.
        cursor.set(sub.peek(), pn, sn)
        sub_df = IterableDataFrame(sub, self.schema)
        sub_df._metadata = self.metadata
        res = self.map_func(cursor, sub_df)
        for r in res.as_array_iterable(type_safe=True):
            yield r
def test_nested():
    """Type-safe conversion of nested struct and list-of-struct columns."""
    # dict rows are coerced to the declared struct; JSON strings are parsed
    data = [[dict(a=1, b=[3, 4], d=1.0)], [json.dumps(dict(b=[30, "40"]))]]
    df = IterableDataFrame(data, "a:{a:str,b:[int]}")
    converted = df.as_array(type_safe=True)
    assert converted == [[dict(a="1", b=[3, 4])], [dict(a=None, b=[30, 40])]]

    # one level deeper: a list of structs
    data = [[[json.dumps(dict(b=[30, "40"]))]]]
    df = IterableDataFrame(data, "a:[{a:str,b:[int]}]")
    converted = df.as_array(type_safe=True)
    assert converted == [[[dict(a=None, b=[30, 40])]]]
def f21(e: List[Dict[str, Any]], a: Iterable[Dict[str, Any]]) -> DataFrame:
    """Append all of ``a`` to ``e`` in place, then project column ``a`` as int.

    :param e: list of dict rows, mutated by extending with ``a``
    :param a: additional dict rows to append
    :return: an IterableDataFrame with the single int column ``a``
    """
    e.extend(a)
    projected = [[record["a"]] for record in e]
    return IterableDataFrame(projected, "a:int")
def f20(e: List[List[Any]], a: Iterable[List[Any]]) -> LocalDataFrame:
    """Append all rows of ``a`` to ``e`` in place and wrap the result.

    :param e: list of rows, mutated by extending with ``a``
    :param a: additional rows to append
    :return: an IterableDataFrame over the combined rows with schema ``a:int``
    """
    e.extend(a)
    return IterableDataFrame(e, "a:int")
def to_output_df(
    self, output: EmptyAwareIterable[List[Any]], schema: Any
) -> DataFrame:
    """Wrap the row stream in an IterableDataFrame with the given schema."""
    return IterableDataFrame(output, schema)
def to_output_df(
    self, output: Iterable[List[Any]], schema: Any, ctx: Any
) -> DataFrame:
    """Wrap the row iterable in an IterableDataFrame; ``ctx`` is unused."""
    return IterableDataFrame(output, schema)
def test_simple_methods():
    """Basic behaviors: count, peek, as_array and as_pandas."""
    df = IterableDataFrame([["a", 1], ["b", "2"]], "x:str,y:double")
    # counting would require materializing the stream
    raises(InvalidOperationError, lambda: df.count())
    assert not df.empty
    # peek coerces the first row without consuming the stream
    assert df.peek_array() == ["a", 1.0]
    assert df.peek_dict() == dict(x="a", y=1.0)
    # non type-safe as_array returns the raw values
    assert df.as_array() == [["a", 1], ["b", "2"]]

    # as_pandas applies the schema's types
    df = IterableDataFrame([["a", 1], ["b", "2"]], "x:str,y:double")
    pdf = df.as_pandas()
    assert pdf.values.tolist() == [["a", 1.0], ["b", 2.0]]

    # empty dataframe converts to an empty pandas frame
    df = IterableDataFrame([], "x:str,y:double")
    pdf = df.as_pandas()
    assert pdf.values.tolist() == []
def test_init():
    """Cover the constructor inputs accepted by IterableDataFrame."""
    df = IterableDataFrame(schema="a:str,b:int")
    assert df.empty
    assert df.schema == "a:str,b:int"
    assert not df.is_bounded

    data = [["a", 1], ["b", 2]]

    def assert_rows(frame, expected):
        # fully consume the frame with type coercion and compare
        assert frame.as_array(type_safe=True) == expected

    # raw data + schema, with coercion to the declared types
    df = IterableDataFrame(data, "a:str,b:str")
    assert_rows(df, [["a", "1"], ["b", "2"]])
    assert df.empty  # after iterating all items

    assert_rows(IterableDataFrame(data, "a:str,b:int"), [["a", 1], ["b", 2]])
    assert_rows(IterableDataFrame(data, "a:str,b:double"), [["a", 1.0], ["b", 2.0]])

    # construct from another IterableDataFrame, with and without a new schema
    src = IterableDataFrame(data, "a:str,b:double")
    assert_rows(IterableDataFrame(src), [["a", 1.0], ["b", 2.0]])

    src = IterableDataFrame(data, "a:str,b:double")
    assert_rows(IterableDataFrame(src, "a:str,b:float64"), [["a", 1.0], ["b", 2.0]])

    src = IterableDataFrame(data, "a:str,b:double")
    assert_rows(IterableDataFrame(src, "b:str,a:str"), [["1", "a"], ["2", "b"]])

    # column selection by name list
    src = IterableDataFrame(data, "a:str,b:double")
    ddf = IterableDataFrame(src, ["b"])
    assert ddf.schema == "b:double"
    assert_rows(ddf, [[1.0], [2.0]])

    # schema expressions inside a list
    src = IterableDataFrame(data, "a:str,b:double")
    assert_rows(IterableDataFrame(src, ["a:str,b:str"]), [["a", "1"], ["b", "2"]])

    src = IterableDataFrame(data, "a:str,b:double")
    assert_rows(IterableDataFrame(src, ["b:str"]), [["1"], ["2"]])

    # construct from a PandasDataFrame
    pdf = PandasDataFrame(data, "a:str,b:double")
    assert_rows(IterableDataFrame(pdf, "a:str,b:double"), [["a", 1.0], ["b", 2.0]])
    assert_rows(IterableDataFrame(pdf, "b:str,a:str"), [["1.0", "a"], ["2.0", "b"]])

    df = IterableDataFrame([], "x:str,y:double")
    assert df.empty
    assert df.is_local

    raises(FugueDataFrameInitError, lambda: IterableDataFrame(123))
def df(
    self, data: Any = None, schema: Any = None, metadata: Any = None
) -> IterableDataFrame:
    """Build an IterableDataFrame from the given data, schema and metadata."""
    return IterableDataFrame(data, schema, metadata)