Example #1
0
    def partition_and_write(self, cluster_spec: ClusterSpec, df: DataFrame) -> Result[ExecutedJob, InvalidJob]:
        """Write ``df`` to ``self.output_path``, split by partition columns if any are set.

        When ``self.partition_columns`` is empty, the frame goes through
        ``self.repartition`` and is written with a single ``df_to`` call.
        Otherwise the outcome of ``self.check_columns`` is mapped over a writer
        that groups the frame by the partition columns and writes every group
        separately, labelled ``col=value&col=value``.
        """
        if len(self.partition_columns) == 0:
            balanced = self.repartition(df, cluster_spec)
            return flatten(balanced.map(lambda frame: df_to(frame, self.output_path, self.output_format)))

        def _write_group(group: PDataFrame) -> Result[ExecutedJob, InvalidJob]:
            # Partition count falls back from worker count to self.partitions to 10.
            n_parts = cluster_spec.num_workers() or self.partitions or 10
            dask_group = dd.from_pandas(group, npartitions=n_parts)
            # The first distinct value of each partition column names the group.
            tags = {col: str(group[col].unique()[0]) for col in self.partition_columns}
            label = "&".join("=".join(pair) for pair in tags.items())
            return df_to(dask_group, self.output_path, self.output_format, label)

        def _write_all_groups(frame: DataFrame) -> Result[ExecutedJob, InvalidJob]:
            columns = self.partition_columns
            # NOTE(review): the per-group values returned by _write_group are
            # discarded here, so a failed group write would presumably still be
            # reported as Success below -- confirm whether that is intended.
            frame.groupby(columns).apply(_write_group, meta=str).compute()
            return Success(ExecutedJob(f"Repartitioned dataframe according to columns {self.partition_columns} and output to {self.output_path}"))

        validated = self.check_columns(df, self.partition_columns)
        return flatten(validated.map(_write_all_groups))
Example #2
0
 def df_filter_columns(self, cluster_spec: ClusterSpec, df: DataFrame) -> Result[DataFrame, InvalidJob]:
     """Restrict ``df`` to ``self.filter_columns`` and repartition the selection.

     When ``self.filter_columns`` is empty, ``df`` is returned unchanged inside
     ``Success``. Otherwise the outcome of ``self.check_columns`` is mapped over
     a helper that selects the filter columns and passes the selection to
     ``self.repartition``; the nested result is flattened before returning.
     """
     if len(self.filter_columns) == 0:
         return Success(df)

     validated = self.check_columns(df, self.filter_columns)

     def _select_then_repartition(frame: DataFrame) -> Result[DataFrame, InvalidJob]:
         return self.repartition(frame[self.filter_columns], cluster_spec)

     return flatten(validated.map(_select_then_repartition))
Example #3
0
def test_flatten_context(container, merged):
    """Check that flattened ``container`` and ``merged`` agree when both are called with ``...``."""
    flattened = flatten(container)
    assert flattened(...) == merged(...)
Example #4
0
def test_flatten_context():
    """Check that ``flatten`` collapses a doubly wrapped ``Context`` holding ``1``."""
    nested = Context.unit(Context.unit(1))
    flattened = flatten(nested)
    assert flattened(Context.Empty) == 1
Example #5
0
def test_flatten(container, merged):
    """Check that ``flatten(container)`` compares equal to ``merged``."""
    flattened = flatten(container)
    assert flattened == merged