Example #1
0
def test_row_ordering_multiple_groups(ms, group_cols, index_cols, chunks):
    """Check row ids and chunking of the orderings for a two-group query.

    The group ordering query is built on ``ms`` and the per-group row
    orderings are derived using ``chunks``. For each of the two expected
    groups the computed row ids and the chunks of the lazy row id array
    are verified.

    ``assert_liveness`` is invoked after each construction step and once
    more after the local references are deleted.
    NOTE(review): presumably it counts live proxies/executors -- confirm
    against the helper's definition.
    """
    taql_proxy = group_ordering_taql(table_proxy(ms), group_cols, index_cols)
    assert_liveness(2, 1)
    group_orders = group_row_ordering(taql_proxy, group_cols,
                                      index_cols, chunks)
    assert_liveness(2, 1)
    exemplar_rows = taql_proxy.getcol("__firstrow__").result()
    assert_liveness(2, 1)

    # Exactly two groups are expected, starting at rows 0 and 7
    assert len(group_orders) == len(exemplar_rows) == 2
    assert_array_equal(exemplar_rows, [0, 7])

    # The row id array is the first entry of each group's ordering
    lazy_rowids = tuple(order[0] for order in group_orders)
    (computed_rowids, ) = dask.compute(lazy_rowids)

    # First group: seven rows, chunked as requested for group 0
    want_chunks = da.core.normalize_chunks(chunks[0]['row'], (7, ))
    assert_array_equal(computed_rowids[0], [6, 5, 4, 3, 2, 1, 0])
    assert lazy_rowids[0].chunks == want_chunks

    # Second group: three rows. When chunks were supplied for the first
    # group only, its chunking is re-used for the second group
    chunk_idx = 0 if len(chunks) == 1 else 1
    want_chunks = da.core.normalize_chunks(chunks[chunk_idx]['row'], (3, ))
    assert_array_equal(computed_rowids[1], [9, 8, 7])
    assert lazy_rowids[1].chunks == want_chunks

    # Drop every local reference before the final liveness check
    del exemplar_rows, group_orders, lazy_rowids, taql_proxy
    assert_liveness(0, 0)
Example #2
0
def test_ordering_multiple_groups(ms, group_cols, index_cols):
    """Check the row ids of each group's ordering for a six-group query.

    The group ordering query is built on ``ms`` and the per-group row
    orderings are derived with a row chunk size of 2. The first row of
    each group and the computed row ids of every group are compared with
    the expected values.

    ``assert_liveness`` is invoked after each construction step and once
    more after the local references are deleted.
    NOTE(review): presumably it counts live proxies/executors -- confirm
    against the helper's definition.
    """
    taql_proxy = group_ordering_taql(table_proxy(ms), group_cols, index_cols)
    assert_liveness(2, 1)
    row_chunking = [{
        'row': 2
    }]
    group_orders = group_row_ordering(taql_proxy, group_cols,
                                      index_cols, row_chunking)
    assert_liveness(2, 1)
    exemplar_rows = taql_proxy.getcol("__firstrow__").result()
    assert_liveness(2, 1)

    # Six groups are expected
    assert len(exemplar_rows) == len(group_orders) == 6

    assert_array_equal(exemplar_rows, [0, 1, 3, 4, 7, 8])

    # The row id array is the first entry of each group's ordering
    lazy_rowids = tuple(order[0] for order in group_orders)
    (computed_rowids, ) = dask.compute(lazy_rowids)

    # Expected row ids, one entry per group (six, as asserted above)
    expected_rowids = ([2, 0], [1], [5, 3], [6, 4], [9, 7], [8])

    for got, want in zip(computed_rowids, expected_rowids):
        assert_array_equal(got, want)

    # Drop every local reference before the final liveness check
    del exemplar_rows, group_orders, lazy_rowids, taql_proxy
    assert_liveness(0, 0)
Example #3
0
    def datasets(self):
        """Create the datasets for the table, optionally with keywords.

        The strategy depends on ``self.group_cols``:

        * empty: a single dataset built from the row ordering;
        * exactly ``["__row__"]``: one dataset per row, each built from
          a single-row block of the row ordering;
        * anything else: datasets produced by ``self._group_datasets``
          from the group ordering query.

        Returns the datasets on their own when neither
        ``self.table_keywords`` nor ``self.column_keywords`` is ``True``.
        Otherwise returns a tuple holding the datasets followed by the
        table keywords and/or the column keywords, in that order.
        """
        proxy = self._table_proxy()
        group_cols = self.group_cols

        if len(group_cols) == 0:
            # No grouping: everything goes into one dataset
            taql = ordering_taql(proxy, self.index_cols, self.taql_where)
            ordering = row_ordering(taql, self.index_cols, self.chunks[0])
            result = [self._single_dataset(ordering)]
        elif len(group_cols) == 1 and group_cols[0] == "__row__":
            # Group by row: force a row chunk size of one so that
            # every block of the ordering holds a single row
            taql = ordering_taql(proxy, self.index_cols, self.taql_where)
            per_row_chunks = dict(self.chunks[0], row=1)
            sorted_rows, row_runs = row_ordering(taql, self.index_cols,
                                                 per_row_chunks)

            row_blocks = sorted_rows.blocks
            run_blocks = row_runs.blocks

            # The sorted rows double as the exemplar rows. They are
            # computed here so that a concrete value can be attached
            # to each dataset
            exemplars = sorted_rows.compute()

            result = []

            for block, exemplar in enumerate(exemplars):
                ordering = (row_blocks[block], run_blocks[block])
                result.append(self._single_dataset(ordering,
                                                   exemplar_row=exemplar))
        else:
            # Grouping on columns
            taql = group_ordering_taql(proxy, group_cols,
                                       self.index_cols, self.taql_where)
            ordering = group_row_ordering(taql, group_cols,
                                          self.index_cols, self.chunks)

            group_values = [taql.getcol(col).result() for col in group_cols]
            first_rows = taql.getcol("__firstrow__").result()
            assert len(ordering) == len(first_rows)

            result = self._group_datasets(group_values, first_rows, ordering)

        # Collect whichever keyword sets were requested
        extras = []

        if self.table_keywords is True:
            extras.append(proxy.getkeywords().result())

        if self.column_keywords is True:
            col_kw_future = proxy.submit(_col_keyword_getter, READLOCK)
            extras.append(col_kw_future.result())

        # Bare datasets unless keywords were requested
        if not extras:
            return result

        return (result, ) + tuple(extras)