Beispiel #1
0
 def _serialize_by_partition(
     self,
     df: DataFrame,
     partition_spec: PartitionSpec,
     df_name: str,
     temp_path: Optional[str] = None,
     to_file_threshold: Any = -1,
     has_name: bool = False,
 ) -> DataFrame:
     """Map ``df`` through a ``_PartitionSerializer``, partition by partition.

     Partition keys and presort entries of ``partition_spec`` that are not
     present in ``df.schema`` are dropped first. If no partition key
     remains, the dataframe is processed as a single partition and the
     output schema holds only the serialized string column; otherwise the
     output schema is the key schema followed by that column.

     :param df: dataframe to serialize
     :param partition_spec: requested partitioning and presort
     :param df_name: name of the dataframe, used to derive the output
         column name and as the key in the metadata dictionaries
     :param temp_path: forwarded to ``_PartitionSerializer``
         (presumably where oversized partitions are written -- confirm)
     :param to_file_threshold: normalized by ``_get_file_threshold`` and
         forwarded to ``_PartitionSerializer``
     :param has_name: stored in the metadata as ``serialized_has_name``
     :return: the result of ``self.map`` over the serializer
     """
     threshold = _get_file_threshold(to_file_threshold)
     # Keep only the keys and presort columns that exist in this dataframe.
     keys = [key for key in partition_spec.partition_by if key in df.schema]
     sorts = [
         item for item in partition_spec.presort.items() if item[0] in df.schema
     ]
     col_name = _df_name_to_serialize_col(df_name)
     if keys:
         spec = PartitionSpec(partition_spec, by=keys, presort=sorts)
         output_schema = spec.get_key_schema(df.schema) + f"{col_name}:str"
     else:
         # Nothing to partition on: process everything as one partition.
         spec = PartitionSpec(partition_spec, num=1, by=[], presort=sorts)
         output_schema = Schema(f"{col_name}:str")
     serializer = _PartitionSerializer(output_schema, temp_path, threshold)
     metadata = {
         "serialized": True,
         "serialized_cols": {df_name: col_name},
         "schemas": {df_name: str(df.schema)},
         "serialized_has_name": has_name,
     }
     return self.map(df, serializer.run, output_schema, spec, metadata)
Beispiel #2
0
def test_partition_spec():
    """Exercise PartitionSpec construction, validation and derived values.

    Covers the defaults of an empty spec, construction from JSON, dicts and
    keyword arguments, copy construction, the rejected inputs, the
    schema-derived helpers, and overriding fields of an existing spec.
    """
    # Defaults of an empty spec. Every comparison must be asserted; a bare
    # ``a == b`` expression is evaluated and discarded, checking nothing.
    p = PartitionSpec()
    assert [] == p.partition_by
    assert "0" == p.num_partitions
    assert {} == p.presort
    assert "hash" == p.algo
    assert p.empty

    p = PartitionSpec(None)
    assert p.empty
    p2 = PartitionSpec(p)
    assert p2.empty

    # Construction from a JSON string.
    p = PartitionSpec(json.dumps(dict(partition_by=["a", "b", "c"], num_partitions=1)))
    assert ["a", "b", "c"] == p.partition_by
    assert "1" == p.num_partitions
    assert {} == p.presort
    assert "hash" == p.algo
    assert not p.empty

    # Construction from a dict using the short ``by`` key.
    p = PartitionSpec(dict(by=["a", "b", "c"], presort="d asc,e desc"))
    assert ["a", "b", "c"] == p.partition_by
    assert "0" == p.num_partitions
    assert dict(d=True, e=False) == p.presort
    assert "hash" == p.algo
    assert not p.empty

    # Construction from keyword arguments; algo is expected in lower case.
    p = PartitionSpec(by=["a", "b", "c"], num=5, presort="d,e desc", algo="EvEN")
    assert ["a", "b", "c"] == p.partition_by
    assert "5" == p.num_partitions
    assert dict(d=True, e=False) == p.presort
    assert "even" == p.algo
    assert not p.empty

    # Copy construction preserves the full JSON representation.
    p = PartitionSpec(
        partition_by=["a", "b", "c"],
        presort="d,e desc",
        algo="EvEN",
        num_partitions="ROWCOUNT*3",
        row_limit=4,
        size_limit="5k",
    )
    p2 = PartitionSpec(p)
    assert p2.jsondict == p.jsondict
    assert "d ASC,e DESC" == p2.presort_expr
    assert not p.empty
    assert not p2.empty

    # partition by overlaps with presort
    raises(
        SyntaxError,
        lambda: PartitionSpec(
            partition_by=["a", "b", "c"], presort="a asc,e desc", algo="EvEN"
        ),
    )

    # partition by has dups
    raises(SyntaxError, lambda: PartitionSpec(partition_by=["a", "b", "b"]))

    # bad input
    raises(TypeError, lambda: PartitionSpec(1))

    # bad presort
    raises(SyntaxError, lambda: PartitionSpec(presort="a xsc,e desc"))
    raises(SyntaxError, lambda: PartitionSpec(presort="a asc,a desc"))
    raises(SyntaxError, lambda: PartitionSpec(presort="a b asc,a desc"))

    # Helpers derived from a schema.
    p = PartitionSpec(dict(partition_by=["a"], presort="d asc,e desc"))
    assert dict(a=True, d=True, e=False) == p.get_sorts(
        Schema("a:int,b:int,d:int,e:int")
    )
    p = PartitionSpec(dict(partition_by=["e", "a"], presort="d asc"))
    assert p.get_key_schema(Schema("a:int,b:int,d:int,e:int")) == "e:int,a:int"

    # modification
    a = PartitionSpec(by=["a", "b"])
    b = PartitionSpec(a, by=["a"], num=2)
    assert ["a", "b"] == a.partition_by
    assert "0" == a.num_partitions
    assert ["a"] == b.partition_by
    assert "2" == b.num_partitions

    # Presort comparison is order sensitive; overriding presort on a copy
    # accepts both an existing presort mapping and a list of pairs.
    a = PartitionSpec(by=["a"], presort="b DESC, c")
    b = PartitionSpec(by=["a"], presort="c,b DESC")
    assert a.presort != b.presort
    c = PartitionSpec(b, presort=a.presort)
    assert a.presort == c.presort
    c = PartitionSpec(b, presort=[("b", False), ("c", True)])
    assert a.presort == c.presort