def pyarrow_transform(batch: pa.Table) -> pa.Table:
    """Keep the "Versicolor" rows and replace sepal length with a normalized column.

    The table is first filtered to the rows whose ``variety`` column equals
    ``"Versicolor"``. A ``normalized.sepal.length`` column is then appended,
    computed as ``sepal.length`` divided by the maximum ``sepal.length`` of
    the remaining rows, and the original ``sepal.length`` column is dropped.
    """
    is_versicolor = pac.equal(batch["variety"], "Versicolor")
    versicolor = batch.filter(is_versicolor)

    sepal_length = versicolor["sepal.length"]
    # The maximum is taken after filtering, so it reflects only the kept rows.
    normalized = pac.divide(sepal_length, pac.max(sepal_length))

    with_normalized = versicolor.append_column(
        "normalized.sepal.length",
        normalized,
    )
    return with_normalized.drop(["sepal.length"])
def _split_into_per_realization_tables(table: pa.Table) -> Dict[int, pa.Table]:
    """Split ``table`` into one sub-table per distinct value of its REAL column.

    Returns a dict keyed by each unique REAL value. Every entry holds the rows
    of ``table`` matching that value, with the REAL column itself removed.
    """
    real_values = table.column("REAL").unique().to_pylist()
    # pylint: disable=no-member
    return {
        real_value: table.filter(
            # Single-element value set: the mask selects this realization only.
            pc.is_in(table["REAL"], value_set=pa.array([real_value]))
        ).drop(["REAL"])
        for real_value in real_values
    }
def find_intersected_dates_between_realizations(table: pa.Table) -> np.ndarray:
    """Return the dates that occur in every realization of ``table``.

    ``table`` must have both a REAL and a DATE column. Neither column needs to
    be sorted. For each unique REAL value the unique DATE values of its rows
    are collected, and the running intersection across realizations is
    returned. With a single realization its unique dates are returned as-is.
    When the table has no realizations, an empty ``datetime64`` array is
    returned.
    """
    common_dates = None
    real_column = table["REAL"]

    for real_value in table.column("REAL").unique().to_numpy():
        # pylint: disable=no-member
        in_this_real = pc.is_in(real_column, value_set=pa.array([real_value]))
        real_dates = table.filter(in_this_real).column("DATE").unique().to_numpy()

        # Both operands hold unique values already, hence assume_unique=True.
        common_dates = (
            real_dates
            if common_dates is None
            else np.intersect1d(common_dates, real_dates, assume_unique=True)
        )

    if common_dates is None:
        # The loop never ran: there were no realizations to intersect.
        return np.empty(0, dtype=np.datetime64)
    return common_dates
def pyarrow_filter_rows(batch: pyarrow.Table) -> pyarrow.Table:
    """Return only the rows of ``batch`` whose ``variety`` is ``"Versicolor"``."""
    variety_matches = pyarrow.compute.equal(batch["variety"], "Versicolor")
    return batch.filter(variety_matches)