def test_sum_array(arrow_type):
    arr = pa.array([1, 2, 3, 4], type=arrow_type)
    assert arr.sum() == 10
    assert pc.sum(arr) == 10

    arr = pa.array([], type=arrow_type)
    assert arr.sum() == None  # noqa: E711
    assert pc.sum(arr) == None  # noqa: E711
def test_sum_array(arrow_type):
    arr = pa.array([1, 2, 3, 4], type=arrow_type)
    assert arr.sum().as_py() == 10
    assert pc.sum(arr).as_py() == 10

    arr = pa.array([1, 2, 3, 4, None], type=arrow_type)
    assert arr.sum().as_py() == 10
    assert pc.sum(arr).as_py() == 10

    arr = pa.array([None], type=arrow_type)
    assert arr.sum().as_py() is None
    assert pc.sum(arr).as_py() is None

    arr = pa.array([], type=arrow_type)
    assert arr.sum().as_py() is None
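# The null results in the test above follow from pc.sum's default
# min_count=1: with fewer than one valid input value, the kernel emits a null
# scalar. A minimal sketch of toggling that behavior (the min_count and
# skip_nulls keywords are assumed to be available, as in recent pyarrow):
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([], type=pa.int64())
assert pc.sum(arr).as_py() is None            # default min_count=1 -> null
assert pc.sum(arr, min_count=0).as_py() == 0  # min_count=0 -> empty sums to 0

arr = pa.array([1, 2, None], type=pa.int64())
assert pc.sum(arr).as_py() == 3                       # nulls skipped by default
assert pc.sum(arr, skip_nulls=False).as_py() is None  # propagate nulls instead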
def sort_and_partition(self, boundaries: List[T], key: SortKeyT,
                       descending: bool) -> List["Block[T]"]:
    if len(key) > 1:
        raise NotImplementedError(
            "sorting by multiple columns is not supported yet")

    import pyarrow.compute as pac

    indices = pac.sort_indices(self._table, sort_keys=key)
    table = self._table.take(indices)
    if len(boundaries) == 0:
        return [table]

    # For each boundary value, count the number of items that are less
    # than it. Since the block is sorted, these counts partition the items
    # such that boundaries[i] <= x < boundaries[i + 1] for each x in
    # partition[i]. If `descending` is true, `boundaries` would also be
    # in descending order and we only need to count the number of items
    # *greater than* the boundary value instead.
    col, _ = key[0]
    comp_fn = pac.greater if descending else pac.less
    boundary_indices = [
        pac.sum(comp_fn(table[col], b)).as_py() for b in boundaries
    ]
    ret = []
    prev_i = 0
    for i in boundary_indices:
        ret.append(table.slice(prev_i, i - prev_i))
        prev_i = i
    ret.append(table.slice(prev_i))
    return ret
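# The boundary counting above works because pac.less/pac.greater return a
# boolean mask and pac.sum counts its true values. A standalone sketch of
# that trick, with a made-up sorted column and boundary value:
import pyarrow as pa
import pyarrow.compute as pc

col = pa.array([1, 3, 3, 5, 8, 13])   # already sorted ascending
boundary = 5

# Number of values strictly less than the boundary, i.e. the split index.
split_idx = pc.sum(pc.less(col, boundary)).as_py()
assert split_idx == 3                 # [1, 3, 3] | [5, 8, 13]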
def do_scan(file, cols):
    table = ds.dataset(file, format=format_).to_table(use_threads=False)
    table = table.flatten()
    print(table.num_rows)
    val1 = pc.stddev(table.column(4))
    val2 = pc.variance(table.column(4))
    val3 = pc.mean(table.column(4))
    val4 = pc.sum(table.column(4))
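# The four compute calls in do_scan return pyarrow scalars; their Python
# values come out via .as_py(). A small sketch on a made-up in-memory table
# instead of a dataset scan (pc.variance and pc.stddev default to ddof=0,
# i.e. population statistics):
import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"x": [1.0, 2.0, 3.0, 4.0]})
col = table.column("x")

print(pc.sum(col).as_py())             # 10.0
print(pc.mean(col).as_py())            # 2.5
print(pc.variance(col).as_py())        # 1.25 (population variance)
print(pc.stddev(col, ddof=1).as_py())  # sample standard deviation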
def test_sum_chunked_array(arrow_type):
    arr = pa.chunked_array([pa.array([1, 2, 3, 4], type=arrow_type)])
    assert pc.sum(arr).as_py() == 10

    arr = pa.chunked_array(
        [pa.array([1, 2], type=arrow_type),
         pa.array([3, 4], type=arrow_type)])
    assert pc.sum(arr).as_py() == 10

    arr = pa.chunked_array([
        pa.array([1, 2], type=arrow_type),
        pa.array([], type=arrow_type),
        pa.array([3, 4], type=arrow_type)
    ])
    assert pc.sum(arr).as_py() == 10

    arr = pa.chunked_array((), type=arrow_type)
    assert arr.num_chunks == 0
    assert pc.sum(arr).as_py() is None
def sort_and_partition(self, boundaries: List[T], key: SortKeyT,
                       descending: bool) -> List["Block[T]"]:
    if len(key) > 1:
        raise NotImplementedError(
            "sorting by multiple columns is not supported yet")

    if self._table.num_rows == 0:
        # If the pyarrow table is empty we may not have schema
        # so calling sort_indices() will raise an error.
        return [
            pyarrow.Table.from_pydict({})
            for _ in range(len(boundaries) + 1)
        ]

    import pyarrow.compute as pac

    indices = pac.sort_indices(self._table, sort_keys=key)
    table = self._table.take(indices)
    if len(boundaries) == 0:
        return [table]

    # For each boundary value, count the number of items that are less
    # than it. Since the block is sorted, these counts partition the items
    # such that boundaries[i] <= x < boundaries[i + 1] for each x in
    # partition[i]. If `descending` is true, `boundaries` would also be
    # in descending order and we only need to count the number of items
    # *greater than* the boundary value instead.
    col, _ = key[0]
    comp_fn = pac.greater if descending else pac.less

    # TODO(ekl) this is O(n^2) but in practice it's much faster than the
    # O(n) algorithm, could be optimized.
    boundary_indices = [
        pac.sum(comp_fn(table[col], b)).as_py() for b in boundaries
    ]
    ### Compute the boundary indices in O(n) time via scan.  # noqa
    # boundary_indices = []
    # remaining = boundaries.copy()
    # values = table[col]
    # for i, x in enumerate(values):
    #     while remaining and not comp_fn(x, remaining[0]).as_py():
    #         remaining.pop(0)
    #         boundary_indices.append(i)
    # for _ in remaining:
    #     boundary_indices.append(len(values))

    ret = []
    prev_i = 0
    for i in boundary_indices:
        # Slices need to be copied to avoid including the base table
        # during serialization.
        ret.append(_copy_table(table.slice(prev_i, i - prev_i)))
        prev_i = i
    ret.append(_copy_table(table.slice(prev_i)))
    return ret
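# A possible answer to the TODO above: since table[col] is already sorted, the
# boundary indices can also be found with a binary search instead of one
# comparison pass per boundary. A hedged sketch, assuming the column converts
# cleanly to NumPy (numeric type, no nulls) and ignoring the descending case:
import numpy as np
import pyarrow as pa

def boundary_indices_via_searchsorted(sorted_col: pa.ChunkedArray, boundaries):
    values = sorted_col.to_numpy()   # materializes the sorted column
    # side="left" counts values strictly less than each boundary, matching
    # pac.sum(pac.less(values, b)) for ascending order.
    return np.searchsorted(values, np.asarray(boundaries), side="left").tolist()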
def sum_of_squared_diffs_from_mean(
    self,
    on: KeyFn,
    ignore_nulls: bool,
    mean: Optional[U] = None,
) -> Optional[U]:
    import pyarrow.compute as pac

    if mean is None:
        # If precomputed mean not given, we compute it ourselves.
        mean = self.mean(on, ignore_nulls)
        if mean is None:
            return None
    return self._apply_arrow_compute(
        lambda col, skip_nulls: pac.sum(
            pac.power(pac.subtract(col, mean), 2),
            skip_nulls=skip_nulls,
        ),
        on,
        ignore_nulls,
    )
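# The helper above reduces variance to two aggregates: a mean and a sum of
# squared deviations. A self-contained sketch of the same arithmetic in plain
# pyarrow (the class plumbing around _apply_arrow_compute is not reproduced):
import pyarrow as pa
import pyarrow.compute as pc

col = pa.array([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])
mean = pc.mean(col).as_py()
ssd = pc.sum(pc.power(pc.subtract(col, mean), 2)).as_py()

population_variance = ssd / len(col)
assert population_variance == pc.variance(col).as_py()  # ddof=0 by default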
def merge(self, states: pa.Array) -> None:
    # Not nice since pyarrow scalars can't be summed yet.
    # This breaks on `None`
    self._sum = pa.scalar(self._sum.as_py() + pc.sum(states).as_py())
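# As the comment in merge notes, the expression breaks when either side is
# None. A hedged, drop-in sketch of a null-tolerant version of the same
# method (self._sum is assumed to be a pyarrow scalar, as above):
import pyarrow as pa
import pyarrow.compute as pc

def merge(self, states: pa.Array) -> None:
    partial = pc.sum(states).as_py()   # None when every state is null
    current = self._sum.as_py()        # None when nothing was accumulated yet
    if partial is None:
        return
    self._sum = pa.scalar(partial if current is None else current + partial)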
#
# Prepopulate registry with simple functions
#
registry = UDFRegistry.registry()
registry.add(ScalarUDF("lower", 1, lambda col: compute.utf8_lower(col.cast(string()))))
registry.add(ScalarUDF("upper", 1, lambda col: compute.utf8_upper(col.cast(string()))))

#
# Prepopulate with incremental aggregation functions
#
registry.add(AggUDF("count", 1, lambda col: compute.count(col).cast(float64())))
registry.add(AggUDF("avg", 1, lambda col: compute.mean(col).cast(float64())))
registry.add(AggUDF("sum", 1, lambda col: compute.sum(col).cast(float64())))

# Welford's algorithm for online std
std_init = lambda: [0, 0., 0]

def std_update(s, v):
    s[0] += 1
    d = v - s[1]
    s[1] += d / s[0]
    s[2] += d * (v - s[1])
    return s

def std_finalize(s):
    if s[0] < 2:
        return float('nan')
    return s[2] / (s[0] - 1)

registry.add(IncAggUDF("std", 1, np.std, std_init, std_update, std_finalize))
registry.add(IncAggUDF("stdev", 1, np.std, std_init, std_update, std_finalize))
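# Note that std_finalize returns M2 / (n - 1), i.e. the sample variance;
# taking the square root gives the standard deviation. A small check of the
# incremental state against the standard library:
import math
import statistics

values = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
state = std_init()
for v in values:
    state = std_update(state, v)

sample_std = math.sqrt(std_finalize(state))
assert math.isclose(sample_std, statistics.stdev(values))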
def convert_feather_v1_to_v2_vice_versa(
    input_ct_db_filename: str,
    output_ct_db_filename: str,
    compression: Optional[str] = "zstd",
    compression_level: int = 6,
    to_version: int = 2,
):
    """
    Convert cisTarget Feather database from Feather v1 to v2 format (with or
    without compression) and vice versa.

    :param input_ct_db_filename: input cisTarget database filename.
    :param output_ct_db_filename: output cisTarget database filename.
    :param compression: Compression method: "zstd" (default), "lz4" or "uncompressed".
    :param compression_level: Compression level for "zstd" or "lz4".
    :param to_version: Output Feather file format version: 1 (legacy) or 2 (default).
    :return:
    """
    if to_version != 2 and to_version != 1:
        raise ValueError(
            "Feather file version only supports 1 (legacy) or 2 (default).")

    if to_version == 1:
        # Compression is not supported in Feather v1 format.
        compression = "uncompressed"
        compression_level = None

    if compression not in {"zstd", "lz4", "uncompressed"}:
        raise ValueError(
            f'Unsupported compression value "{compression}". Choose "zstd" (default), "lz4" or "uncompressed".'
        )

    # Read input cisTarget database as a pyarrow Table.
    df_pa_table = pf.read_table(source=input_ct_db_filename)

    # Get all column names.
    all_column_names = df_pa_table.column_names

    try:
        # Check if we have an old database that still used a "features" column
        # and rename it.
        features_idx = all_column_names.index("features")

        # Get column which contains motif or track names.
        motifs_or_track_names = df_pa_table.column(features_idx)

        if pc.sum(pc.starts_with(motifs_or_track_names, "jaspar")).as_py() > 0:
            # It is a motif vs genes/regions database if JASPAR motif names
            # were found in the "features" column.
            all_column_names[features_idx] = "motifs"
        else:
            all_column_names[features_idx] = "tracks"

        # Rename features column in database to "motifs" or "tracks".
        df_pa_table = df_pa_table.rename_columns(all_column_names)
    except ValueError:
        # No old database (with "features" column).
        pass

    # Get database index column ("motifs", "tracks", "regions" or "genes"
    # depending on the database type).
    for column_idx, column_name in enumerate(all_column_names):
        if column_name in {"motifs", "tracks", "regions", "genes"}:
            index_column = df_pa_table.column(column_idx)
            break

    # Sort column names (non-index columns) and add index column as last column.
    column_names_sorted_and_index = sorted([
        column_name
        for column_name in all_column_names
        if column_name != index_column._name
    ])
    column_names_sorted_and_index.append(index_column._name)

    # Create a new pyarrow Table with columns in the new order.
    df_pa_table = df_pa_table.select(column_names_sorted_and_index)

    # Write the cisTarget database to a new Feather file with the requested
    # compression/version settings.
    pf.write_feather(
        df=df_pa_table,
        dest=output_ct_db_filename,
        compression=compression,
        compression_level=compression_level,
        version=to_version,
    )
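# A minimal usage sketch of the converter above; the filenames are
# hypothetical and pf is assumed to be pyarrow.feather, consistent with the
# read_table/write_feather calls in the function body:
convert_feather_v1_to_v2_vice_versa(
    input_ct_db_filename="motifs_vs_regions.v1.feather",    # hypothetical path
    output_ct_db_filename="motifs_vs_regions.v2.feather",   # hypothetical path
    compression="zstd",
    compression_level=6,
    to_version=2,
)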