Exemple #1
0
 def comap(cursor, dfs):
     """Collapse the co-partitioned dataframes into a single summary row.

     The summary is a comma separated string made of each dataframe's key
     in ``dfs`` immediately followed by its row count. When the cursor
     carries partition key values they are emitted in front of the summary.
     """
     assert not dfs.has_key
     name_counts = [name + str(frame.count()) for name, frame in dfs.items()]
     summary = ",".join(name_counts)
     key_values = cursor.key_value_array
     if len(key_values) > 0:
         # Partition keys come first, followed by the summary column.
         row = key_values + [summary]
         return ArrayDataFrame([row], cursor.key_schema + "v:str")
     return ArrayDataFrame([[summary]], "v:str")
Exemple #2
0
 def get_dfs():
     """Yield dataframes built from the enclosing ``data``.

     A list is emitted one row per dataframe, each preceded by an empty
     "noise" dataframe; any other non-``None`` value is wrapped as a whole.
     """
     if not isinstance(data, list):
         if data is not None:
             yield ArrayDataFrame(data, schema, metadata)
         return
     for record in data:
         yield ArrayDataFrame([], schema, metadata)  # noise
         yield ArrayDataFrame([record], schema, metadata)
     if schema is None:
         yield ArrayDataFrame([], schema, metadata)  # noise
Exemple #3
0
 def test_map_with_binary(self):
     """Map ``binary_map`` over pickled ``BinaryObject`` rows and compare
     the result, in order, against the expected pickled rows."""

     def pickled_rows(*names):
         # One single-column row per name, holding the pickled object.
         return [[pickle.dumps(BinaryObject(name))] for name in names]

     engine = self.engine
     schema = "a:bytes"
     source = ArrayDataFrame(pickled_rows("a", "b"), schema)
     mapped = engine.map(source, binary_map, source.schema, PartitionSpec())
     expected = ArrayDataFrame(pickled_rows("ax", "bx"), schema)
     df_eq(expected, mapped, no_pandas=True, check_order=True, throw=True)
Exemple #4
0
 def get_dfs(seq):
     """Yield one dataframe per recognised code in ``seq``.

     ``"e"`` gives an empty iterable dataframe, ``"v"`` a one-row iterable
     dataframe and ``"o"`` an empty array dataframe with a different schema.
     Unrecognised codes yield nothing.
     """
     int_schema = "a:int,b:int"
     for code in seq:
         if code == "e":
             yield IterableDataFrame([], int_schema)
         if code == "v":
             yield IterableDataFrame([[1, 10]], int_schema)
         if code == "o":
             # bad schema but empty dataframe doesn't matter
             yield ArrayDataFrame([], "a:int,b:str")
Exemple #5
0
 def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
     """Run the wrapped transformer on ``df``.

     The cursor is attached to the transformer and this object's metadata
     to ``df`` before transforming. With no ``ignore_errors`` configured
     the transformer's result is returned directly. Otherwise the result
     is passed through ``to_local_bounded_df`` inside the ``try``, and any
     of the ignored exception types produces an empty dataframe with the
     transformer's output schema.
     """
     self.transformer._cursor = cursor  # type: ignore
     df._metadata = self.metadata
     if len(self.ignore_errors) == 0:
         return self.transformer.transform(df)
     try:
         # NOTE(review): presumably converted here so that errors raised
         # lazily by the result surface inside the try -- confirm.
         transformed = self.transformer.transform(df)
         return to_local_bounded_df(transformed)
     except self.ignore_errors:  # type: ignore
         return ArrayDataFrame([], self.transformer.output_schema)
def _test_as_array_perf():
    """Print the accumulated time of ``as_array`` with and without
    ``type_safe`` on a 5000-row, 300-column dataframe (10 rounds each)."""
    # (column prefix, declared type, value builder); the ``b`` columns are
    # declared int but filled with floats.
    column_groups = (("a", "int", int), ("b", "int", float), ("c", "str", str))
    s = Schema()
    template_row = []
    for prefix, type_name, make_value in column_groups:
        for idx in range(100):
            s.append(f"{prefix}{idx}:{type_name}")
            template_row.append(make_value(idx))
    rows = [list(template_row) for _ in range(5000)]
    df = ArrayDataFrame(rows, s)

    def seconds_for(**options):
        # Wall-clock seconds spent in one as_array call.
        started = datetime.now()
        df.as_array(**options)
        return (datetime.now() - started).total_seconds()

    # Warm-up calls, not measured.
    df.as_array()
    df.as_array(type_safe=True)

    nts, ts = 0.0, 0.0
    for _ in range(10):
        nts += seconds_for()
        ts += seconds_for(type_safe=True)
    print(nts, ts)
def test_nested():
    """Check type-safe ``as_array`` output for nested struct/list schemas,
    including cells supplied as JSON strings."""
    json_cell = json.dumps({"b": [30, "40"]})

    # Struct column: one cell given as a dict, one as a JSON string.
    struct_rows = [[{"a": 1, "b": [3, 4], "d": 1.0}], [json_cell]]
    struct_df = ArrayDataFrame(struct_rows, "a:{a:str,b:[int]}")
    struct_result = struct_df.as_array(type_safe=True)
    expected_struct = [[{"a": "1", "b": [3, 4]}], [{"a": None, "b": [30, 40]}]]
    assert expected_struct == struct_result

    # List-of-struct column whose only element is a JSON string.
    list_rows = [[[json.dumps({"b": [30, "40"]})]]]
    list_df = ArrayDataFrame(list_rows, "a:[{a:str,b:[int]}]")
    list_result = list_df.as_array(type_safe=True)
    expected_list = [[[{"a": None, "b": [30, 40]}]]]
    assert expected_list == list_result
Exemple #8
0
def test_run_outputter():
    """Exercise outputters built from ``t3`` and ``t5`` through direct calls
    and through ``process``, with keyed and unkeyed ``DataFrames``."""
    df = ArrayDataFrame([[0]], "a:int")
    keyed = DataFrames(df1=df, df2=df)
    unkeyed = DataFrames(df, df)
    assert not unkeyed.has_key

    class Ct(object):
        pass

    def check_process(outputter, holder, frames, engine):
        # Reset the holder, configure the outputter, then run process().
        holder.value = 0
        outputter._params = ParamDict([("a", 2), ("b", holder)], deep=False)
        outputter._execution_engine = engine
        outputter.process(frames)
        assert 4 == holder.value

    # Outputter from t3: called with the dataframes passed separately.
    holder = Ct()
    outputter = _to_outputter(t3)
    outputter(df, df, 2, holder)
    assert 4 == holder.value
    for frames in (keyed, unkeyed):
        check_process(outputter, holder, frames, None)

    # Outputter from t5: called with a leading placeholder argument and a
    # DataFrames collection.
    holder = Ct()
    outputter = _to_outputter(t5)
    outputter("dummy", keyed, 2, holder)
    assert 4 == holder.value
    holder.value = 0
    outputter("dummy", unkeyed, 2, holder)
    assert 4 == holder.value
    for frames in (keyed, unkeyed):
        check_process(outputter, holder, frames, NativeExecutionEngine())
    def test_to_df(self):
        """Check ``engine.to_df`` for several kinds of input.

        Inputs covered: an ``ArrayDataFrame``, an ``ArrowDataFrame``, a raw
        list with an explicit schema and metadata, and a ``PandasDataFrame``
        holding a nested struct column.
        """
        e = self.engine
        # ArrayDataFrame input: the result must be a different object.
        o = ArrayDataFrame(
            [[1, 2], [None, 3]],
            "a:double,b:int",
            dict(a=1),
        )
        a = e.to_df(o)
        assert a is not o
        res = a.native.collect()
        # Each row's first column may be 1.0 or None.
        # NOTE(review): presumably because collect() does not guarantee
        # row order -- confirm.
        assert res[0][0] == 1.0 or res[0][0] is None
        assert res[1][0] == 1.0 or res[1][0] is None
        df_eq(a, o, throw=True)

        # Same data and checks with an ArrowDataFrame input (no df_eq here).
        o = ArrowDataFrame(
            [[1, 2], [None, 3]],
            "a:double,b:int",
            dict(a=1),
        )
        a = e.to_df(o)
        assert a is not o
        res = a.native.collect()
        assert res[0][0] == 1.0 or res[0][0] is None
        assert res[1][0] == 1.0 or res[1][0] is None

        # Raw list input with schema and metadata passed to to_df directly.
        a = e.to_df([[1, None]], "a:int,b:int", dict(a=1))
        df_eq(a, [[1, None]], "a:int,b:int", dict(a=1), throw=True)

        # PandasDataFrame input with a struct column: the nested dict must
        # come back unchanged from a type-safe as_array.
        o = PandasDataFrame(
            [[{
                "a": "b"
            }, 2]],
            "a:{a:str},b:int",
            dict(a=1),
        )
        a = e.to_df(o)
        assert a is not o
        res = a.as_array(type_safe=True)
        assert res[0][0] == {"a": "b"}