Example 1
def test_sum_array(arrow_type):
    arr = pa.array([1, 2, 3, 4], type=arrow_type)
    assert arr.sum() == 10
    assert pc.sum(arr) == 10

    arr = pa.array([], type=arrow_type)
    assert arr.sum() == None  # noqa: E711
    assert pc.sum(arr) == None  # noqa: E711
Example 2
def test_sum_array(arrow_type):
    arr = pa.array([1, 2, 3, 4], type=arrow_type)
    assert arr.sum().as_py() == 10
    assert pc.sum(arr).as_py() == 10

    arr = pa.array([1, 2, 3, 4, None], type=arrow_type)
    assert arr.sum().as_py() == 10
    assert pc.sum(arr).as_py() == 10

    arr = pa.array([None], type=arrow_type)
    assert arr.sum().as_py() is None
    assert pc.sum(arr).as_py() is None

    arr = pa.array([], type=arrow_type)
    assert arr.sum().as_py() is None
    assert pc.sum(arr).as_py() is None
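
A minimal sketch of the null/empty semantics these tests pin down (assuming only that pyarrow is installed): pc.sum skips nulls by default and returns a null scalar for empty or all-null input, so .as_py() yields None; the min_count option changes that.

import pyarrow as pa
import pyarrow.compute as pc

# Nulls are skipped by default, so the sum of [1, 2, None] is 3.
print(pc.sum(pa.array([1, 2, None])).as_py())  # 3

# Empty or all-null input sums to a null scalar; as_py() gives None.
print(pc.sum(pa.array([], type=pa.int64())).as_py())      # None
print(pc.sum(pa.array([None], type=pa.int64())).as_py())  # None

# With min_count=0 the empty sum is the additive identity instead.
print(pc.sum(pa.array([], type=pa.int64()), min_count=0).as_py())  # 0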
Example 3
    def sort_and_partition(self, boundaries: List[T], key: SortKeyT,
                           descending: bool) -> List["Block[T]"]:
        if len(key) > 1:
            raise NotImplementedError(
                "sorting by multiple columns is not supported yet")

        import pyarrow.compute as pac

        indices = pac.sort_indices(self._table, sort_keys=key)
        table = self._table.take(indices)
        if len(boundaries) == 0:
            return [table]

        # For each boundary value, count the number of items that are less
        # than it. Since the block is sorted, these counts partition the items
        # such that boundaries[i] <= x < boundaries[i + 1] for each x in
        # partition[i]. If `descending` is true, `boundaries` would also be
        # in descending order and we only need to count the number of items
        # *greater than* the boundary value instead.
        col, _ = key[0]
        comp_fn = pac.greater if descending else pac.less
        boundary_indices = [
            pac.sum(comp_fn(table[col], b)).as_py() for b in boundaries
        ]
        ret = []
        prev_i = 0
        for i in boundary_indices:
            ret.append(table.slice(prev_i, i - prev_i))
            prev_i = i
        ret.append(table.slice(prev_i))
        return ret
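
To see why counting values below each boundary yields the split points, here is a hedged, self-contained sketch of the same trick on a toy table (the column name "v" is invented for illustration):

import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"v": [1, 3, 3, 7, 9]})  # already sorted ascending
boundaries = [3, 8]

# On a sorted column, the number of values strictly less than a boundary
# is exactly the row index where that boundary's partition begins.
split_points = [pc.sum(pc.less(table["v"], b)).as_py() for b in boundaries]
print(split_points)  # [1, 4] -> partitions [1], [3, 3, 7], [9]

parts = []
prev = 0
for i in split_points:
    parts.append(table.slice(prev, i - prev))
    prev = i
parts.append(table.slice(prev))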
Example 4
def do_scan(file, cols):
    # `format_` (e.g. "parquet") is defined in the enclosing benchmark
    # scope; `cols` is accepted for API symmetry but unused here.
    table = ds.dataset(file, format=format_).to_table(use_threads=False)
    table = table.flatten()
    print(table.num_rows)
    val1 = pc.stddev(table.column(4))
    val2 = pc.variance(table.column(4))
    val3 = pc.mean(table.column(4))
    val4 = pc.sum(table.column(4))
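
A hedged, end-to-end variant of the same scan-plus-aggregate pattern (the file path and column name are invented so the sketch is runnable):

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds
import pyarrow.parquet as pq

# Write a toy file so the scan below has something to read.
pq.write_table(pa.table({"x": [1.0, 2.0, 3.0, 4.0]}), "/tmp/toy.parquet")

table = ds.dataset("/tmp/toy.parquet", format="parquet").to_table(use_threads=False)
print(table.num_rows)                 # 4
print(pc.stddev(table["x"]).as_py())  # population stddev (ddof=0 by default)
print(pc.variance(table["x"]).as_py())
print(pc.mean(table["x"]).as_py())
print(pc.sum(table["x"]).as_py())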
Example 5
def test_sum_chunked_array(arrow_type):
    arr = pa.chunked_array([pa.array([1, 2, 3, 4], type=arrow_type)])
    assert pc.sum(arr).as_py() == 10

    arr = pa.chunked_array(
        [pa.array([1, 2], type=arrow_type),
         pa.array([3, 4], type=arrow_type)])
    assert pc.sum(arr).as_py() == 10

    arr = pa.chunked_array([
        pa.array([1, 2], type=arrow_type),
        pa.array([], type=arrow_type),
        pa.array([3, 4], type=arrow_type)
    ])
    assert pc.sum(arr).as_py() == 10

    arr = pa.chunked_array((), type=arrow_type)
    assert arr.num_chunks == 0
    assert pc.sum(arr).as_py() is None
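
The same behavior outside a test harness, as a short hedged sketch: sums flow across chunk boundaries, and a chunked array with zero chunks behaves like an empty array.

import pyarrow as pa
import pyarrow.compute as pc

chunked = pa.chunked_array([[1, 2], [], [3, 4]], type=pa.int64())
print(pc.sum(chunked).as_py())  # 10

empty = pa.chunked_array([], type=pa.int64())
print(empty.num_chunks, pc.sum(empty).as_py())  # 0 None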
Example 6
    def sort_and_partition(self, boundaries: List[T], key: SortKeyT,
                           descending: bool) -> List["Block[T]"]:
        if len(key) > 1:
            raise NotImplementedError(
                "sorting by multiple columns is not supported yet")

        if self._table.num_rows == 0:
            # If the pyarrow table is empty we may not have schema
            # so calling sort_indices() will raise an error.
            return [
                pyarrow.Table.from_pydict({})
                for _ in range(len(boundaries) + 1)
            ]

        import pyarrow.compute as pac

        indices = pac.sort_indices(self._table, sort_keys=key)
        table = self._table.take(indices)
        if len(boundaries) == 0:
            return [table]

        # For each boundary value, count the number of items that are less
        # than it. Since the block is sorted, these counts partition the items
        # such that boundaries[i] <= x < boundaries[i + 1] for each x in
        # partition[i]. If `descending` is true, `boundaries` would also be
        # in descending order and we only need to count the number of items
        # *greater than* the boundary value instead.
        col, _ = key[0]
        comp_fn = pac.greater if descending else pac.less

        # TODO(ekl) this is O(n^2) but in practice it's much faster than the
        # O(n) algorithm, could be optimized.
        boundary_indices = [
            pac.sum(comp_fn(table[col], b)).as_py() for b in boundaries
        ]
        ### Compute the boundary indices in O(n) time via scan.  # noqa
        # boundary_indices = []
        # remaining = boundaries.copy()
        # values = table[col]
        # for i, x in enumerate(values):
        #     while remaining and not comp_fn(x, remaining[0]).as_py():
        #         remaining.pop(0)
        #         boundary_indices.append(i)
        # for _ in remaining:
        #     boundary_indices.append(len(values))

        ret = []
        prev_i = 0
        for i in boundary_indices:
            # Slices need to be copied to avoid including the base table
            # during serialization.
            ret.append(_copy_table(table.slice(prev_i, i - prev_i)))
            prev_i = i
        ret.append(_copy_table(table.slice(prev_i)))
        return ret
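
_copy_table is a helper from the surrounding module and is not shown in this excerpt. A hedged sketch of one way to get the copy it needs (an IPC round-trip, which materializes only the sliced rows so serialization will not drag along the base table's buffers):

import pyarrow as pa

def _copy_table_sketch(table: pa.Table) -> pa.Table:
    # Writing the slice to an in-memory IPC stream and reading it back
    # forces fresh buffers that no longer reference the parent table.
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)
    return pa.ipc.open_stream(sink.getvalue()).read_all()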
Example 7
    def sum_of_squared_diffs_from_mean(
        self,
        on: KeyFn,
        ignore_nulls: bool,
        mean: Optional[U] = None,
    ) -> Optional[U]:
        import pyarrow.compute as pac

        if mean is None:
            # If precomputed mean not given, we compute it ourselves.
            mean = self.mean(on, ignore_nulls)
            if mean is None:
                return None
        return self._apply_arrow_compute(
            lambda col, skip_nulls: pac.sum(
                pac.power(pac.subtract(col, mean), 2),
                skip_nulls=skip_nulls,
            ),
            on,
            ignore_nulls,
        )
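
For context, a hedged standalone sketch of the same pyarrow.compute pipeline: summing squared deviations from a precomputed mean gives M2, which divided by n (or n - 1) is the variance.

import pyarrow as pa
import pyarrow.compute as pc

col = pa.array([1.0, 2.0, 3.0, 4.0])
mean = pc.mean(col).as_py()  # 2.5

m2 = pc.sum(
    pc.power(pc.subtract(col, mean), 2),
    skip_nulls=True,
).as_py()  # 5.0

print(m2 / len(col))        # 1.25, matches pc.variance(col, ddof=0)
print(m2 / (len(col) - 1))  # ~1.667, matches pc.variance(col, ddof=1)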
Example 8
    def merge(self, states: pa.Array) -> None:
        # Not nice since pyarrow scalars can't be summed yet.
        # This breaks on `None`.
        self._sum = pa.scalar(self._sum.as_py() + pc.sum(states).as_py())
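
A hedged sketch of the limitation the comment refers to: pyarrow scalars do not define +, so the running sum is bounced through Python, and as_py() returns None for a null scalar, which makes the addition raise TypeError.

import pyarrow as pa
import pyarrow.compute as pc

running = pa.scalar(10)
states = pa.array([1, 2, 3])

# running + pc.sum(states) would fail: pa.Scalar has no __add__.
merged = pa.scalar(running.as_py() + pc.sum(states).as_py())
print(merged)  # 16

# Guarding against the None case the comment warns about:
part = pc.sum(pa.array([], type=pa.int64())).as_py()  # None
merged = pa.scalar(running.as_py() + (part or 0))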
Example 9

#
# Prepopulate registry with simple functions
#
registry = UDFRegistry.registry()
registry.add(ScalarUDF("lower", 1, lambda col: compute.utf8_lower(col.cast(string()))))
registry.add(ScalarUDF("upper", 1, lambda col: compute.utf8_upper(col.cast(string()))))

#
# Prepopulate with incremental aggregation functions
#

registry.add(AggUDF("count", 1, lambda col: compute.count(col).cast(float64())))
registry.add(AggUDF("avg", 1, lambda col: compute.mean(col).cast(float64())))
registry.add(AggUDF("sum", 1, lambda col: compute.sum(col).cast(float64())))

# Welford's algorithm for online std.
# State: [n, running mean, M2], where M2 is the sum of squared deviations.
std_init = lambda: [0, 0., 0]
def std_update(s, v):
  s[0] += 1
  d = v - s[1]
  s[1] += d / s[0]
  s[2] += d * (v - s[1])
  return s
def std_finalize(s):
  if s[0] < 2: return float('nan')
  # M2 / (n - 1) is the sample variance; the square root gives the std.
  return (s[2] / (s[0] - 1)) ** 0.5

registry.add(IncAggUDF("std", 1, np.std, std_init, std_update, std_finalize))
registry.add(IncAggUDF("stdev", 1, np.std, std_init, std_update, std_finalize))
Example 10
from typing import Optional

import pyarrow.compute as pc
import pyarrow.feather as pf


def convert_feather_v1_to_v2_vice_versa(
    input_ct_db_filename: str,
    output_ct_db_filename: str,
    compression: Optional[str] = "zstd",
    compression_level: Optional[int] = 6,
    to_version: int = 2,
):
    """
    Convert cisTarget Feather database from Feather v1 to v2 format (with or without compression) and vice versa.

    :param input_ct_db_filename: input cisTarget database filename.
    :param output_ct_db_filename: output cisTarget database filename.
    :param compression: Compression method: "zstd" (default), "lz4" or "uncompressed".
    :param compression_level: Compression level for "zstd" or "lz4".
    :param to_version: Output Feather file format version: 1 (legacy) or 2 (default).
    :return:
    """

    if to_version != 2 and to_version != 1:
        raise ValueError(
            "Feather file version only supports 1 (legacy) or 2 (default).")

    if to_version == 1:
        # Compression is not supported in Feather v1 format.
        compression = "uncompressed"
        compression_level = None

    if compression not in {"zstd", "lz4", "uncompressed"}:
        raise ValueError(
            f'Unsupported compression value "{compression}". Choose "zstd" (default), "lz4" or "uncompressed".'
        )

    # Read input cisTarget database as a pyarrow Table.
    df_pa_table = pf.read_table(source=input_ct_db_filename)

    # Get all column names.
    all_column_names = df_pa_table.column_names

    try:
        # Check if we have an old database that still used a "features" column and rename it.
        features_idx = all_column_names.index("features")

        # Get column which contains motif or track names.
        motifs_or_track_names = df_pa_table.column(features_idx)

        if pc.sum(pc.starts_with(motifs_or_track_names, "jaspar")).as_py() > 0:
            # It is a motif vs genes/regions database if JASPAR motif names were found in the "features" column.
            all_column_names[features_idx] = "motifs"
        else:
            all_column_names[features_idx] = "tracks"

        # Rename the features column in the database to "motifs" or "tracks".
        df_pa_table = df_pa_table.rename_columns(all_column_names)
    except ValueError:
        # No old database (with "features" column).
        pass

    # Get the database index column name ("motifs", "tracks", "regions" or
    # "genes", depending on the database type).
    index_column_name = None
    for column_name in all_column_names:
        if column_name in {"motifs", "tracks", "regions", "genes"}:
            index_column_name = column_name
            break

    # Sort the non-index column names and add the index column as the last
    # column.
    column_names_sorted_and_index = sorted([
        column_name for column_name in all_column_names
        if column_name != index_column_name
    ])
    column_names_sorted_and_index.append(index_column_name)

    # Create a new pyarrow Table with columns in the new order.
    df_pa_table = df_pa_table.select(column_names_sorted_and_index)

    # Write the cisTarget database to a new Feather file with the requested compression/version settings.
    pf.write_feather(df=df_pa_table,
                     dest=output_ct_db_filename,
                     compression=compression,
                     compression_level=compression_level,
                     version=to_version)
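
A hedged usage sketch (the database file names are invented for illustration):

# Convert a legacy Feather v1 cisTarget database to Feather v2 with zstd
# compression; pass to_version=1 to convert back the other way.
convert_feather_v1_to_v2_vice_versa(
    input_ct_db_filename="motifs_vs_regions.v1.feather",
    output_ct_db_filename="motifs_vs_regions.v2.feather",
    compression="zstd",
    compression_level=6,
    to_version=2,
)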