Ejemplo n.º 1
0
    def _fit_eig(self, x):
        """ Fits the estimator from the covariance matrix of x.

        The covariance matrix is estimated from the scatter matrix of `x`
        and handed to `_decompose`, which is expected to fill the value and
        vector blocks that back `explained_variance_` and `components_`.

        Parameters
        ----------
        x : ds-array
            Input data; `x.shape[0]` is passed to the covariance estimation
            and `x.shape[1]` is the number of columns of `components_`.

        Returns
        -------
        self
            The estimator, with `components_` and `explained_variance_` set.
        """
        scatter = _scatter_matrix(x, self.arity)
        covariance = _estimate_covariance(scatter, x.shape[0])

        n_features = x.shape[1]
        col_bsize = x._reg_shape[1]

        # a falsy n_components (None or 0) means "keep one per feature"
        n_kept = self.n_components if self.n_components else n_features
        n_kept_blocks = int(ceil(n_kept / col_bsize))

        # placeholders that _decompose receives as outputs
        value_blocks = Array._get_out_blocks((1, n_kept_blocks))
        vector_blocks = Array._get_out_blocks((n_kept_blocks, x._n_blocks[1]))

        _decompose(covariance, self.n_components, col_bsize, value_blocks,
                   vector_blocks)

        # both results use square blocks sized after the column block of x
        square_bshape = (col_bsize, col_bsize)

        self.components_ = Array(vector_blocks, square_bshape, square_bshape,
                                 (n_kept, n_features), False)
        self.explained_variance_ = Array(value_blocks, square_bshape,
                                         square_bshape, (1, n_kept), False)

        return self
Ejemplo n.º 2
0
def _compute_u_sorted(a, sorting):
    """ Builds a new ds-array from `a` in two passes over its column blocks.

    First, every column of blocks of `a` is passed to
    `_compute_u_block_sorted` together with `sorting`, producing one partial
    block per destination column. Then the partial blocks gathered for each
    destination column are combined with `_merge_svd_block` and the partial
    blocks are deleted.

    Parameters
    ----------
    a : ds-array
        Input array.
    sorting : object
        Forwarded untouched to `_compute_u_block_sorted` and
        `_merge_svd_block`.

    Returns
    -------
    out : ds-array
        Array with the same block layout, shape and sparsity flag as `a`.
    """
    n_col_blocks = a._n_blocks[1]
    col_bsize = a._reg_shape[1]

    # partials[j] collects the pieces destined for column block j
    partials = [[] for _ in range(n_col_blocks)]

    for col_idx, column in enumerate(a._iterator("columns")):
        # one distinct placeholder per destination column block
        pieces = [object() for _ in range(n_col_blocks)]
        _compute_u_block_sorted(column._blocks, col_idx, col_bsize, sorting,
                                pieces)

        for dest, piece in enumerate(pieces):
            partials[dest].append(piece)

    row_bsize = a._reg_shape[0]
    merged = Array._get_out_blocks(a._n_blocks)

    for col_idx, column_pieces in enumerate(partials):
        # one distinct placeholder per row block of the output column
        merged_column = [object() for _ in range(a._n_blocks[0])]
        _merge_svd_block(column_pieces, col_idx, col_bsize, row_bsize,
                         sorting, merged_column)

        for row_idx, block in enumerate(merged_column):
            merged[row_idx][col_idx] = block

        # the partial pieces are not needed once they have been merged
        for piece in column_pieces:
            compss_delete_object(piece)

    return Array(merged, a._top_left_shape, a._reg_shape, a.shape,
                 a._sparse)
Ejemplo n.º 3
0
def _sort_v(v, sorting):
    """ Builds a new ds-array from `v` in two passes over its column blocks.

    Every column of blocks of `v` is passed to `_sort_v_block` together with
    `sorting`, producing one partial result per destination column. The
    partial results gathered for each destination column are then combined
    with `_merge_svd_block`, after which they are deleted.

    Parameters
    ----------
    v : ds-array
        Input array.
    sorting : object
        Forwarded untouched to `_sort_v_block` and `_merge_svd_block`.

    Returns
    -------
    out : ds-array
        Array with the same block layout, shape and sparsity flag as `v`.
    """
    n_col_blocks = v._n_blocks[1]
    col_bsize = v._reg_shape[1]

    # gathered[j] collects the pieces destined for column block j
    gathered = [[] for _ in range(n_col_blocks)]

    for col_idx, column in enumerate(v._iterator("columns")):
        # empty lists are the placeholders handed to the sorting task
        pieces = [[] for _ in range(n_col_blocks)]
        _sort_v_block(column._blocks, col_idx, col_bsize, sorting, pieces)

        for dest, piece in enumerate(pieces):
            gathered[dest].append(piece)

    row_bsize = v._reg_shape[0]
    merged = Array._get_out_blocks(v._n_blocks)

    for col_idx, column_pieces in enumerate(gathered):
        # one distinct placeholder per row block of the output column
        merged_column = [object() for _ in range(v._n_blocks[0])]
        _merge_svd_block(column_pieces, col_idx, col_bsize, row_bsize,
                         sorting, merged_column)

        for row_idx, block in enumerate(merged_column):
            merged[row_idx][col_idx] = block

        # the partial pieces are not needed once they have been merged
        for piece in column_pieces:
            compss_delete_object(piece)

    return Array(merged, v._top_left_shape, v._reg_shape, v.shape,
                 v._sparse)
Ejemplo n.º 4
0
def load_svmlight_file(path, block_size, n_features, store_sparse):
    """ Loads a SVMLight file into a distributed array.

    The file is read line by line; every `block_size[0]` lines (plus the
    remaining lines at the end of the file, if any) are handed to
    `_read_svmlight`, which receives the placeholders of one row of blocks
    of X and one block of y.

    Parameters
    ----------
    path : string
        File path.
    block_size : tuple (int, int)
        Size of the blocks for the output ds-array.
    n_features : int
        Number of features.
    store_sparse : boolean
        Whether to use scipy.sparse data structures to store data. If False,
        numpy.array is used instead.

    Returns
    -------
    x, y : (ds-array, ds-array)
        A distributed representation (ds-array) of the X and y.
    """
    n, m = block_size
    x_blocks, y_blocks = [], []

    def _read_chunk(chunk):
        # row 0 of the placeholders -> blocks of X, row 1 -> block of y
        row_blocks = Array._get_out_blocks((1, ceil(n_features / m)))
        row_blocks.append([object()])
        _read_svmlight(chunk,
                       row_blocks,
                       col_size=m,
                       n_features=n_features,
                       store_sparse=store_sparse)
        # only the inner lists are kept (row_blocks has depth 2)
        x_blocks.append(row_blocks[0])
        y_blocks.append(row_blocks[1])

    n_rows = 0
    pending = []

    with open(path, "r") as f:
        for n_rows, line in enumerate(f, start=1):
            pending.append(line.encode())

            if len(pending) == n:
                _read_chunk(pending)
                pending = []

    # lines left over when the file length is not a multiple of n
    if pending:
        _read_chunk(pending)

    x = Array(x_blocks,
              top_left_shape=block_size,
              reg_shape=block_size,
              shape=(n_rows, n_features),
              sparse=store_sparse)

    # y has a single value per sample, so it is stored as one 'column'
    y = Array(y_blocks,
              top_left_shape=(n, 1),
              reg_shape=(n, 1),
              shape=(n_rows, 1),
              sparse=False)

    return x, y
Ejemplo n.º 5
0
def kron(a, b, block_size=None):
    """ Kronecker product of two ds-arrays.

    Each block of `a` is combined with each block of `b` through `_kron`,
    and the resulting blocks are placed so that the block structure of `b`
    is repeated once per element of `a`.

    Parameters
    ----------
    a, b : ds-arrays
        Input ds-arrays.
    block_size : tuple of two ints, optional
        Block size of the resulting array. Defaults to the block size of `b`.

    Returns
    -------
    out : ds-array

    Raises
    ------
    NotImplementedError
        If a or b are sparse.
    """
    if a._sparse or b._sparse:
        raise NotImplementedError("Sparse ds-arrays not supported.")

    b_vblocks = b._n_blocks[0]
    b_hblocks = b._n_blocks[1]

    # one copy of the block grid of b per element of a
    k_blocks = Array._get_out_blocks((a.shape[0] * b_vblocks,
                                      a.shape[1] * b_hblocks))

    # row_offset / col_offset: position, in elements of a, of the top-left
    # element of the block of a being processed
    row_offset = 0

    for i in range(a._n_blocks[0]):
        col_offset = 0

        for j in range(a._n_blocks[1]):
            a_bshape = a._get_block_shape(i, j)

            for k in range(b_vblocks):
                for q in range(b_hblocks):
                    # one output block per element of the block of a
                    products = Array._get_out_blocks(a_bshape)
                    _kron(a._blocks[i][j], b._blocks[k][q], products)

                    for m in range(a_bshape[0]):
                        bi = (row_offset + m) * b_vblocks + k

                        for n in range(a_bshape[1]):
                            bj = (col_offset + n) * b_hblocks + q
                            k_blocks[bi][bj] = products[m][n]

            col_offset += a_bshape[1]
        row_offset += a_bshape[0]

    shape = (a.shape[0] * b.shape[0], a.shape[1] * b.shape[1])
    bsize = block_size or b._reg_shape

    # The blocks can be used as they are only when the requested block size
    # is the one of b and every block of b has that exact size. Otherwise
    # the repeated block structure of b has to be rechunked.
    keeps_b_blocking = not block_size or block_size == b._reg_shape

    if (keeps_b_blocking
            and b.shape[0] % b._reg_shape[0] == 0
            and b.shape[1] % b._reg_shape[1] == 0
            and b._is_regular()):
        return Array(k_blocks, bsize, bsize, shape, False)

    out = Array._rechunk(k_blocks, shape, bsize, _kron_shape_f, b)

    # the intermediate blocks are no longer needed after rechunking
    for block_row in k_blocks:
        for block in block_row:
            compss_delete_object(block)

    return out