def comap(cursor, dfs):
    """Summarise a group of dataframes as a single-row dataframe.

    The summary string concatenates, for every ``(key, frame)`` pair in
    ``dfs.items()``, the key followed by ``frame.count()``; pairs are joined
    with commas.

    :param cursor: object exposing ``key_value_array`` and ``key_schema``
    :param dfs: dataframe collection; must report a falsy ``has_key``
    :return: one-row ``ArrayDataFrame`` holding the cursor's key values (if
        any) followed by the summary string in column ``v``
    """
    assert not dfs.has_key
    summary = ",".join(
        [name + str(frame.count()) for name, frame in dfs.items()]
    )
    key_values = cursor.key_value_array
    if len(key_values) != 0:
        # Prefix the summary with the key values of the current partition.
        return ArrayDataFrame(
            [key_values + [summary]], cursor.key_schema + "v:str"
        )
    return ArrayDataFrame([[summary]], "v:str")
def get_dfs():
    """Yield ``ArrayDataFrame`` pieces built from the enclosing ``data``.

    ``data``, ``schema`` and ``metadata`` are read from the surrounding scope.
    For list data, every row is emitted as its own one-row dataframe, each
    preceded by an empty "noise" dataframe; when ``schema`` is ``None`` one
    more empty dataframe is emitted at the end. Non-list, non-``None`` data is
    wrapped in a single dataframe. ``None`` data yields nothing.
    """
    if not isinstance(data, list):
        if data is not None:
            yield ArrayDataFrame(data, schema, metadata)
        return
    for record in data:
        # noise: an empty dataframe ahead of every real row
        yield ArrayDataFrame([], schema, metadata)
        yield ArrayDataFrame([record], schema, metadata)
    if schema is None:
        # noise: trailing empty dataframe when no schema was supplied
        yield ArrayDataFrame([], schema, metadata)
def test_map_with_binary(self):
    """Check that ``engine.map`` works on a column of pickled objects.

    The input holds pickled ``BinaryObject("a")`` and ``BinaryObject("b")``;
    after mapping with ``binary_map`` the result must equal, in order, the
    pickled ``BinaryObject("ax")`` and ``BinaryObject("bx")``.
    """
    engine = self.engine

    def pickled_rows(*payloads):
        # One single-cell row per payload, each cell a pickled BinaryObject.
        return [[pickle.dumps(BinaryObject(p))] for p in payloads]

    source = ArrayDataFrame(pickled_rows("a", "b"), "a:bytes")
    mapped = engine.map(source, binary_map, source.schema, PartitionSpec())
    expected = ArrayDataFrame(pickled_rows("ax", "bx"), "a:bytes")
    df_eq(expected, mapped, no_pandas=True, check_order=True, throw=True)
def get_dfs(seq):
    """Yield test dataframes selected by the codes in ``seq``.

    Codes:
      * ``"e"`` -- empty ``IterableDataFrame`` with schema ``a:int,b:int``
      * ``"v"`` -- ``IterableDataFrame`` with the single row ``[1, 10]``
      * ``"o"`` -- empty ``ArrayDataFrame`` with schema ``a:int,b:str``

    Any other element yields nothing.
    """
    int_schema = "a:int,b:int"
    for code in seq:
        if code == "e":
            yield IterableDataFrame([], int_schema)
        if code == "v":
            yield IterableDataFrame([[1, 10]], int_schema)
        if code == "o":
            # bad schema but empty dataframe doesn't matter
            yield ArrayDataFrame([], "a:int,b:str")
def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
    """Apply the wrapped transformer to one partition.

    The cursor is stored on the transformer and this object's metadata is
    stored on ``df`` before the transform runs.

    :param cursor: cursor of the partition being processed
    :param df: local dataframe of the partition
    :return: the transformer's output; when ``ignore_errors`` is non-empty
        and one of those exceptions is raised, an empty ``ArrayDataFrame``
        with the transformer's output schema
    """
    transformer = self.transformer
    transformer._cursor = cursor  # type: ignore
    df._metadata = self.metadata
    if len(self.ignore_errors) != 0:
        try:
            # presumably converted here so that lazily raised errors surface
            # inside the try block -- TODO confirm
            return to_local_bounded_df(transformer.transform(df))
        except self.ignore_errors:  # type: ignore
            return ArrayDataFrame([], transformer.output_schema)
    return transformer.transform(df)
def _test_as_array_perf():
    """Ad-hoc benchmark of ``as_array`` with and without ``type_safe``.

    Builds a 5000-row, 300-column ``ArrayDataFrame`` (100 ``int`` columns fed
    ints, 100 ``int`` columns fed floats, 100 ``str`` columns fed strings),
    warms both code paths once, then times 10 rounds of each and prints the
    two accumulated durations in seconds (plain first, type-safe second).

    The leading underscore in the name keeps it from being collected as a
    regular test.
    """
    # Local import: a monotonic, high-resolution clock is the right tool for
    # measuring elapsed time; wall-clock datetimes can jump.
    from time import perf_counter

    s = Schema()
    arr = []
    # (column prefix, declared type, value factory); the "b" columns are
    # declared int but deliberately populated with floats.
    column_groups = (("a", "int", int), ("b", "int", float), ("c", "str", str))
    for prefix, type_name, make_value in column_groups:
        for i in range(100):
            s.append(f"{prefix}{i}:{type_name}")
            arr.append(make_value(i))

    data = [list(arr) for _ in range(5000)]
    df = ArrayDataFrame(data, s)

    # Warm-up calls, excluded from the measurement.
    df.as_array()
    df.as_array(type_safe=True)

    nts, ts = 0.0, 0.0
    for _ in range(10):
        start = perf_counter()
        df.as_array()
        nts += perf_counter() - start

        start = perf_counter()
        df.as_array(type_safe=True)
        ts += perf_counter() - start
    print(nts, ts)
def test_nested():
    """Check type-safe conversion of nested struct and list-of-struct columns.

    Cells are given either as dicts or as JSON strings. The expected output
    shows values coerced to the schema types (``1`` -> ``"1"``, ``"40"`` ->
    ``40``), the undeclared key ``d`` dropped, and the missing key ``a``
    filled with ``None``.
    """
    json_cell = json.dumps({"b": [30, "40"]})

    # Struct column: one dict cell and one JSON-encoded cell.
    struct_df = ArrayDataFrame(
        [[{"a": 1, "b": [3, 4], "d": 1.0}], [json_cell]],
        "a:{a:str,b:[int]}",
    )
    struct_rows = struct_df.as_array(type_safe=True)
    expected_struct = [
        [{"a": "1", "b": [3, 4]}],
        [{"a": None, "b": [30, 40]}],
    ]
    assert expected_struct == struct_rows

    # List-of-struct column whose single element is JSON-encoded.
    list_df = ArrayDataFrame([[[json_cell]]], "a:[{a:str,b:[int]}]")
    list_rows = list_df.as_array(type_safe=True)
    assert [[[{"a": None, "b": [30, 40]}]]] == list_rows
def test_run_outputter():
    """Exercise outputters built from ``t3`` and ``t5`` via call and ``process``.

    Each outputter is invoked directly and through ``process`` with both a
    keyed and an unkeyed ``DataFrames`` collection; every invocation must
    leave ``value == 4`` on the shared holder object.
    """
    frame = ArrayDataFrame([[0]], "a:int")
    keyed = DataFrames(df1=frame, df2=frame)
    unkeyed = DataFrames(frame, frame)
    assert not unkeyed.has_key

    class Ct(object):
        pass

    # Outputter from t3: direct call, then process() without an engine.
    holder = Ct()
    outputter = _to_outputter(t3)
    outputter(frame, frame, 2, holder)
    assert 4 == holder.value
    for frames in (keyed, unkeyed):
        holder.value = 0
        outputter._params = ParamDict([("a", 2), ("b", holder)], deep=False)
        outputter._execution_engine = None
        outputter.process(frames)
        assert 4 == holder.value

    # Outputter from t5: direct calls with a placeholder first argument,
    # then process() with a fresh native engine each time.
    holder = Ct()
    outputter = _to_outputter(t5)
    outputter("dummy", keyed, 2, holder)
    assert 4 == holder.value
    holder.value = 0
    outputter("dummy", unkeyed, 2, holder)
    assert 4 == holder.value
    for frames in (keyed, unkeyed):
        holder.value = 0
        outputter._params = ParamDict([("a", 2), ("b", holder)], deep=False)
        outputter._execution_engine = NativeExecutionEngine()
        outputter.process(frames)
        assert 4 == holder.value
def test_to_df(self):
    """Check ``engine.to_df`` for array, arrow, raw-list and pandas inputs.

    For the array and arrow inputs the converted frame must be a new object,
    and the first column of each collected native row must be ``1.0`` or
    ``None``. The raw-list input is compared with ``df_eq``; the pandas input
    must round-trip its nested struct cell.
    """
    engine = self.engine

    def check_converted(converted, source):
        # Conversion must not return the source object itself, and the first
        # column of both collected rows is either 1.0 or null.
        assert converted is not source
        collected = converted.native.collect()
        assert collected[0][0] == 1.0 or collected[0][0] is None
        assert collected[1][0] == 1.0 or collected[1][0] is None

    array_df = ArrayDataFrame(
        [[1, 2], [None, 3]], "a:double,b:int", {"a": 1}
    )
    from_array = engine.to_df(array_df)
    check_converted(from_array, array_df)
    df_eq(from_array, array_df, throw=True)

    arrow_df = ArrowDataFrame(
        [[1, 2], [None, 3]], "a:double,b:int", {"a": 1}
    )
    check_converted(engine.to_df(arrow_df), arrow_df)

    from_rows = engine.to_df([[1, None]], "a:int,b:int", {"a": 1})
    df_eq(from_rows, [[1, None]], "a:int,b:int", {"a": 1}, throw=True)

    pandas_df = PandasDataFrame(
        [[{"a": "b"}, 2]], "a:{a:str},b:int", {"a": 1}
    )
    from_pandas = engine.to_df(pandas_df)
    assert from_pandas is not pandas_df
    nested_rows = from_pandas.as_array(type_safe=True)
    assert nested_rows[0][0] == {"a": "b"}