def _save_parquet(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
    """Write ``df`` to ``p.uri`` as a parquet file through pandas.

    The ``pyarrow`` engine and the dataframe's own pyarrow schema
    (``df.schema.pa_schema``) are used unless the caller supplies ``engine``
    or ``schema`` in ``kwargs``; caller-provided values take precedence.

    :param df: dataframe to save; converted with ``as_pandas()``
    :param p: parsed target whose ``uri`` is the output location
    :param kwargs: extra keyword arguments forwarded to ``to_parquet``
    """
    pdf = df.as_pandas()
    # Start from the defaults, then let the caller's keywords win.
    options = dict(engine="pyarrow", schema=df.schema.pa_schema)
    options.update(kwargs)
    pdf.to_parquet(p.uri, **options)
def _save_avro(df: LocalDataFrame, p: FileParser, **kwargs: Any):
    """Write ``df`` to ``p.uri`` as an avro file via ``pandavro.to_avro``.

    ``schema``, ``append`` and ``times_as_micros`` are taken out of ``kwargs``
    and forwarded explicitly; every remaining keyword is passed through to
    ``pandavro.to_avro`` unchanged. If providing your own schema, the usage of
    the ``schema`` argument is preferred.

    :param df: dataframe to save; converted with ``as_pandas()``
    :param p: parsed target whose ``uri`` is the output location
    :param schema: avro schema that determines the saved dtypes,
        defaults to None
    :param append: defaults to False, i.e. overwrite instead of append
    :param times_as_micros: defaults to True
    """
    import pandavro as pdx

    extra = ParamDict(kwargs)

    # Options forwarded by name, with the value used when the caller gives none.
    explicit = {"schema": None, "append": False, "times_as_micros": True}
    for name in ("schema", "append", "times_as_micros"):
        if name in extra:
            explicit[name] = extra[name]
            # Drop the key so it is not forwarded a second time via **extra.
            del extra[name]

    frame = df.as_pandas()
    pdx.to_avro(p.uri, frame, **explicit, **extra)
def transform(self, df: LocalDataFrame) -> LocalDataFrame:
    """Return ``df`` as a pandas-backed dataframe with columns ``p`` and ``ct``.

    ``p`` is filled with the ``"p"`` entry of ``self.params`` (``1`` when it is
    missing) and ``ct`` with the number of rows in the input.

    :param df: input dataframe; its metadata must contain ``"x"``
    :return: a ``PandasDataFrame`` built with ``self.output_schema``
    :raises AssertionError: if ``on_init_called`` is not 1, ``"test"`` is not
        in ``self.workflow_conf``, or ``"x"`` is not in ``df.metadata``
    """
    # Preconditions checked before any data is touched.
    assert self.on_init_called == 1
    assert "test" in self.workflow_conf
    assert "x" in df.metadata

    frame = df.as_pandas()
    frame["p"] = self.params.get("p", 1)
    row_count = frame.shape[0]
    frame["ct"] = row_count
    return PandasDataFrame(frame, self.output_schema)
def _save_avro(
    df: LocalDataFrame, p: FileParser, columns: Any = None, **kwargs: Any
) -> None:
    """Save a dataframe as an avro file via ``pdx.to_avro``.

    If providing your own schema, the usage of the ``schema`` argument is
    preferred. ``schema``, ``infer_schema``, ``append`` and
    ``times_as_micros`` are consumed from ``kwargs``; the remaining keywords
    are forwarded to ``pdx.to_avro`` unchanged. ``infer_schema`` is only
    validated and is never forwarded.

    :param df: dataframe to save; converted with ``as_pandas()``
    :param p: parsed target whose ``uri`` is the output location
    :param columns: when ``schema`` is passed explicitly as None and
        ``columns`` is not None, the avro schema is built with
        ``_convert_pyarrow_to_avro_schema(df, columns)``
    :param schema: avro schema to write with, defaults to None
    :param infer_schema: must be falsy when a schema is in effect
    :param append: defaults to False, i.e. overwrite instead of append
    :param times_as_micros: defaults to True
    :raises ValueError: if both ``schema`` and ``columns`` are provided, or if
        ``infer_schema`` is truthy while a schema is in effect
    """
    kw = ParamDict(kwargs)

    # pandavro defaults
    schema = None
    append = False
    times_as_micros = True

    # NOTE(review): ``columns`` is only consulted when the "schema" key is
    # present (with value None); it is ignored when "schema" is absent.
    # Kept as in the original -- confirm this is intended.
    if "schema" in kw:
        schema = kw["schema"]
        if schema is None:
            if columns is not None:
                schema = _convert_pyarrow_to_avro_schema(df, columns)
        elif columns:
            # both schema and columns provided
            raise ValueError("set columns to None when schema is provided")
        # Remove the consumed key: it is passed explicitly below, so leaving
        # it in ``kw`` would hand ``schema`` to ``to_avro`` twice.
        del kw["schema"]

    if "infer_schema" in kw:
        if kw["infer_schema"] and schema is not None:
            # infer_schema set to True but schema was provided
            raise ValueError("set infer_schema to False when schema is provided")
        del kw["infer_schema"]

    if "append" in kw:
        # default is overwrite (False) instead of append (True)
        append = kw["append"]
        del kw["append"]

    if "times_as_micros" in kw:
        times_as_micros = kw["times_as_micros"]
        del kw["times_as_micros"]

    pdf = df.as_pandas()
    pdx.to_avro(
        p.uri,
        pdf,
        schema=schema,
        append=append,
        times_as_micros=times_as_micros,
        **kw,
    )
def _save_json(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
    """Write ``df`` to ``p.uri`` as JSON through pandas.

    Defaults to ``orient="records"`` with ``lines=True``; either can be
    overridden through ``kwargs``, which take precedence over the defaults.

    :param df: dataframe to save; converted with ``as_pandas()``
    :param p: parsed target whose ``uri`` is the output location
    :param kwargs: extra keyword arguments forwarded to ``to_json``
    """
    pdf = df.as_pandas()
    # Start from the defaults, then let the caller's keywords win.
    options = dict(orient="records", lines=True)
    options.update(kwargs)
    pdf.to_json(p.uri, **options)
def _save_csv(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
    """Write ``df`` to ``p.uri`` as CSV through pandas.

    By default neither the index nor a header row is written
    (``index=False``, ``header=False``); either can be overridden through
    ``kwargs``, which take precedence over the defaults.

    :param df: dataframe to save; converted with ``as_pandas()``
    :param p: parsed target whose ``uri`` is the output location
    :param kwargs: extra keyword arguments forwarded to ``to_csv``
    """
    pdf = df.as_pandas()
    # Start from the defaults, then let the caller's keywords win.
    options = dict(index=False, header=False)
    options.update(kwargs)
    pdf.to_csv(p.uri, **options)
def _save_json(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
    """Write ``df`` to ``p.uri`` as JSON through pandas.

    No defaults are applied here: ``kwargs`` are forwarded to ``to_json``
    exactly as given.

    :param df: dataframe to save; converted with ``as_pandas()``
    :param p: parsed target whose ``uri`` is the output location
    :param kwargs: keyword arguments forwarded to ``to_json``
    """
    pdf = df.as_pandas()
    destination = p.uri
    pdf.to_json(destination, **kwargs)