Example #1
def _svm_mapper(array, ex, labels, alpha, w, m, scale, lambda_n):
  '''
  Local linear SVM solver.

  Args:
    array(DistArray): features of the training data.
    ex(Extent): Region being processed.
    labels(DistArray): labels of the training data.
    alpha(DistArray): alpha vector, the parameter optimized by the SVM.
    w(DistArray): weight vector of the previous iteration.
    m(int): number of samples to train on (currently the whole local data set).
    scale(int): number of tiles.
    lambda_n: lambda / size(total training data), the regularization parameter of this SVM model.
  '''
  X = array.fetch(ex)
  Y = labels.fetch(extent.create((ex.ul[0], 0), (ex.lr[0], 1), labels.shape))
  
  tile_id = ex.ul[0]/(ex.lr[0]-ex.ul[0])
  ex_alpha = extent.create((tile_id*m, 0), ((tile_id+1)*m, 1), alpha.shape)
  old_alpha = alpha.fetch(ex_alpha)
  
  old_w = np.zeros((X.shape[1],1)) if w is None else w[:]
  
  new_w, new_alpha = _svm_disdca_train(X, Y, old_alpha, old_w, m, scale, lambda_n)
  
  # update the alpha vector
  alpha.update(ex_alpha, new_alpha)
  
  # reduce the weight vector
  yield extent.create((0,0),(array.shape[1],1),(array.shape[1], 1)), new_w
Example #2
def cholesky(A):
  '''
  Cholesky matrix decomposition.
 
  Args:
    A(Expr): matrix to be decomposed
  '''
 
  A = expr.force(A)
  n = int(math.sqrt(len(A.tiles)))
  tile_size = A.shape[0] / n
  for k in range(n):
    # A[k,k] = DPOTRF(A[k,k])
    diag_ex = get_ex(k, k, tile_size, A.shape)
    A = expr.region_map(A, diag_ex, _cholesky_dpotrf_mapper)
    
    if k == n - 1: break
    
    # A[l,k] = DTRSM(A[k,k], A[l,k]) l -> [k+1,n)
    col_ex = extent.create(((k+1)*tile_size, k*tile_size),(n*tile_size, (k+1)*tile_size), A.shape)
    A = expr.region_map(A, col_ex, _cholesky_dtrsm_mapper, fn_kw=dict(diag_ex=diag_ex))
    
    # A[m,m] = DSYRK(A[m,k], A[m,m]) m -> [k+1,n)
    # A[l,m] = DGEMM(A[l,k], A[m,k], A[l,m]) m -> [k+1,n) l -> [m+1,n)
    col_exs = list([extent.create((m*tile_size, m*tile_size), (n*tile_size, (m+1)*tile_size), A.shape) for m in range(k+1,n)])
    A = expr.region_map(A, col_exs, _cholesky_dsyrk_dgemm_mapper, fn_kw=dict(k=k))
  
  
  # zero out the blocks above the diagonal (the upper-right corner)
  col_exs = list([extent.create((0, m*tile_size),(m*tile_size, (m+1)*tile_size),A.shape) for m in range(1,n)])
  A = expr.region_map(A, col_exs, lambda input, array, ex: np.zeros(input.shape, input.dtype))
  return A
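The block updates in the loop above follow the standard right-looking blocked Cholesky factorization. For reference, here is a minimal single-machine NumPy sketch of the same update pattern (an illustration only; it does not use the Spartan API):

import numpy as np

def blocked_cholesky(A, b):
  '''Right-looking blocked Cholesky of a symmetric positive-definite matrix A.

  b is the block size; A is overwritten and the lower-triangular factor is returned.
  '''
  n = A.shape[0]
  for k in range(0, n, b):
    kk = slice(k, k + b)
    # A[k,k] = DPOTRF(A[k,k]): factor the diagonal block.
    A[kk, kk] = np.linalg.cholesky(A[kk, kk])
    if k + b >= n:
      break
    rest = slice(k + b, n)
    # A[l,k] = DTRSM(A[k,k], A[l,k]): compute the panel below the diagonal.
    A[rest, kk] = np.linalg.solve(A[kk, kk], A[rest, kk].T).T
    # Trailing update: the DSYRK/DGEMM steps folded into one rank-b update.
    A[rest, rest] -= A[rest, kk].dot(A[rest, kk].T)
  return np.tril(A)

With b equal to the tile size, each pass of this loop corresponds to one round of the region_map calls above.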
Example #3
def _solve_U_or_M_mapper(array, ex, U_or_M, la, alpha, implicit_feedback):
  '''
  Given A and U (or M), solve for M (or U) such that A = U M',
  using the alternating least-squares (ALS) factorization method.

  Args:
    array(DistArray): the user-item (or item-user) rating matrix.
    ex(Extent): region being processed.
    U_or_M(DistArray): the matrix U (or M).
    la(float): the regularization parameter of ALS.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether to use the implicit-feedback method for ALS.
  '''
  rating_matrix = array.fetch(extent.create((ex.ul[0], 0), (ex.lr[0], array.shape[1]), array.shape))
  U_or_M = U_or_M[:]
  
  if implicit_feedback:
    Y = U_or_M
    YT = Y.T
    YTY = np.dot(YT, Y)
 
  result = np.zeros((rating_matrix.shape[0], U_or_M.shape[1]))
  for i in range(rating_matrix.shape[0]):
    if implicit_feedback:
      result[i] = _implicit_feedback_als_solver(rating_matrix[i], la, alpha, Y, YT, YTY)
    else:
      non_zero_idx = rating_matrix[i].nonzero()[0]
      rating_vector = rating_matrix[i, non_zero_idx]
      feature_vectors = U_or_M[non_zero_idx]
      result[i] = _als_solver(feature_vectors, rating_vector, la)
    
  yield extent.create((ex.ul[0], 0), (ex.lr[0], U_or_M.shape[1]), (array.shape[0], U_or_M.shape[1])), result
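_als_solver itself is not shown here; in a typical ALS implementation the per-row solve it performs is a small ridge regression over the rated items. A plain NumPy sketch of that step (an assumption about what the helper does, not its actual code):

import numpy as np

def als_row_solve(feature_vectors, rating_vector, la):
  '''Solve (F^T F + la * I) x = F^T r for one row of U (or M).

  feature_vectors: (n_rated, n_factors) rows of M (or U) for the rated items.
  rating_vector:   (n_rated,) observed ratings in this row.
  la:              regularization weight.
  '''
  n_factors = feature_vectors.shape[1]
  lhs = feature_vectors.T.dot(feature_vectors) + la * np.eye(n_factors)
  rhs = feature_vectors.T.dot(rating_vector)
  return np.linalg.solve(lhs, rhs)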
Example #4
def cholesky(A):
    '''
  Cholesky matrix decomposition.

  Args:
    A(Expr): matrix to be decomposed
  '''

    A = expr.force(A)
    n = int(math.sqrt(len(A.tiles)))
    tile_size = A.shape[0] / n
    for k in range(n):
        # A[k,k] = DPOTRF(A[k,k])
        diag_ex = get_ex(k, k, tile_size, A.shape)
        A = expr.map2(A, ((0, 1), ),
                      fn=_cholesky_dpotrf_mapper,
                      shape=A.shape,
                      update_region=diag_ex)

        if k == n - 1: break

        # A[l,k] = DTRSM(A[k,k], A[l,k]) l -> [k+1,n)
        col_ex = extent.create(((k + 1) * tile_size, k * tile_size),
                               (n * tile_size, (k + 1) * tile_size), A.shape)
        diag_tile = A.force().fetch(diag_ex)
        A = expr.map2(A, ((0, 1), ),
                      fn=_cholesky_dtrsm_mapper,
                      fn_kw=dict(array=force(A), diag_tile=diag_tile),
                      shape=A.shape,
                      update_region=col_ex)

        # A[m,m] = DSYRK(A[m,k], A[m,m]) m -> [k+1,n)
        # A[l,m] = DGEMM(A[l,k], A[m,k], A[l,m]) m -> [k+1,n) l -> [m+1,n)
        col_exs = list([
            extent.create((m * tile_size, m * tile_size),
                          (n * tile_size, (m + 1) * tile_size), A.shape)
            for m in range(k + 1, n)
        ])
        dgemm_1 = expr.transpose(A)[(k * tile_size):((k + 1) * tile_size), :]
        dgemm_2 = A[:, (k * tile_size):((k + 1) * tile_size)]
        A = expr.map2((A, dgemm_1, dgemm_2), ((0, 1), 1, 0),
                      fn=_cholesky_dsyrk_dgemm_mapper,
                      fn_kw=dict(array=force(A), k=k),
                      shape=A.shape,
                      update_region=col_exs)

    # zero out the blocks above the diagonal (the upper-right corner)
    col_exs = list([
        extent.create((0, m * tile_size), (m * tile_size, (m + 1) * tile_size),
                      A.shape) for m in range(1, n)
    ])
    A = expr.map2(A, ((0, 1), ),
                  fn=_zero_mapper,
                  shape=A.shape,
                  update_region=col_exs)
    return A
Example #5
def kmeans_map2_center_mapper(ex, tile, centers=None, m=None):
  X = tile[0]
  weights = tile[1] ** m
  new_centers = np.dot(X.T, weights).T
  # the reduction target covers the full centers array
  target_ex = extent.create((0, 0), (centers.shape[0], centers.shape[1]),
                            (centers.shape[0], centers.shape[1]))
  yield target_ex, new_centers
Example #6
def _init_label_mapper(array, ex):
  data = array.fetch(extent.create((ex.ul[0], 0), (ex.lr[0], array.shape[1]), array.shape))
  
  labels = np.zeros((data.shape[0], 1), dtype=np.int64)
  for i in range(data.shape[0]):
    if data[i,0] > data[i,1]:
      labels[i,0] = 1.0
    else:
      labels[i,0] = -1.0
    
  yield extent.create((ex.ul[0], 0), (ex.lr[0], 1), (array.shape[0], 1)), labels
Example #7
def _cholesky_dsyrk_dgemm_mapper(input, array, ex, k):
  
  mk_ex = extent.create((ex.ul[1], k*input.shape[1]), (ex.lr[1], (k+1)*input.shape[1]), array.shape)
  A_mk = array.fetch(mk_ex)
  
  if ex.ul[0] == ex.ul[1] and ex.lr[0] == ex.lr[1]:
    # diag block
    return linalg.blas.dsyrk(-1.0, A_mk, 1.0, input, lower=1)
  else:
    # other block
    lk_ex = extent.create((ex.ul[0], k*input.shape[1]), (ex.lr[0], (k+1)*input.shape[1]), array.shape)
    A_lk = array.fetch(lk_ex)
    return linalg.blas.dgemm(-1.0, A_lk, A_mk.T, 1.0, input)
Example #8
def cholesky(A):
    '''
  Cholesky matrix decomposition.

  Args:
    A(Expr): matrix to be decomposed
  '''
    n = int(math.sqrt(FLAGS.num_workers))
    tile_size = A.shape[0] / n
    print n, tile_size
    for k in range(n):
        # A[k,k] = DPOTRF(A[k,k])
        diag_ex = get_ex(k, k, tile_size, A.shape)
        A = expr.map2(A, ((0, 1), ),
                      fn=_cholesky_dpotrf_mapper,
                      shape=A.shape,
                      update_region=diag_ex)

        if k == n - 1: break

        # A[l,k] = DTRSM(A[k,k], A[l,k]) l -> [k+1,n)
        col_ex = extent.create(((k + 1) * tile_size, k * tile_size),
                               (n * tile_size, (k + 1) * tile_size), A.shape)
        A = expr.map2((A, A[diag_ex.to_slice()]), ((0, 1), None),
                      fn=_cholesky_dtrsm_mapper,
                      shape=A.shape,
                      update_region=col_ex)

        # A[m,m] = DSYRK(A[m,k], A[m,m]) m -> [k+1,n)
        # A[l,m] = DGEMM(A[l,k], A[m,k], A[l,m]) m -> [k+1,n) l -> [m+1,n)
        col_exs = list([
            extent.create((m * tile_size, m * tile_size),
                          (n * tile_size, (m + 1) * tile_size), A.shape)
            for m in range(k + 1, n)
        ])
        dgemm = A[:, (k * tile_size):((k + 1) * tile_size)]
        A = expr.map2((A, expr.transpose(dgemm), dgemm), ((0, 1), 1, 0),
                      fn=_cholesky_dsyrk_dgemm_mapper,
                      shape=A.shape,
                      update_region=col_exs).optimized()

    # zero out the blocks above the diagonal (the upper-right corner)
    col_exs = list([
        extent.create((0, m * tile_size), (m * tile_size, (m + 1) * tile_size),
                      A.shape) for m in range(1, n)
    ])
    A = expr.map2(A, ((0, 1), ),
                  fn=_zero_mapper,
                  shape=A.shape,
                  update_region=col_exs)
    return A
Example #9
def _cluster_mapper(array, ex, centers):
    '''
  label the cluster id for each data point.
  
  Args:
    array(DistArray): the input data points matrix.
    ex(Extent): region being processed.
    centers(numpy.array): the center points for each cluster.
  '''
    points = array.fetch(ex)
    labels = np.zeros(points.shape[0], dtype=np.int32)
    for i in range(points.shape[0]):
        point = points[i]
        max_pdf = -1
        max_id = -1
        for j in range(centers.shape[0]):
            dist = np.square(centers[j] - point).sum()
            pdf = 1.0 / (1 + dist)
            if max_pdf < pdf:
                max_pdf = pdf
                max_id = j

        labels[i] = max_id

    yield extent.create((ex.ul[0], ), (ex.lr[0], ), (array.shape[0], )), labels
Example #10
def _lda_doc_topic_mapper(
    ex_a, term_docs_matrix, ex_b, local_topic_term_counts, k_topics, alpha, eta, max_iter_per_doc
):
    """
  Last iteration that uses Collapsed Variational Bayes method (Mahout implementation) to calculate the final document/topic inference.

  Args:
    ex_a(Extent): region of the term/document count matrix being processed.
    term_docs_matrix: local tile with the count of each term in each document.
    ex_b(Extent): region of the topic/term counts being processed.
    local_topic_term_counts: local matrix holding p(topic x | term).
    k_topics: the number of topics we need to find.
    alpha(float): parameter of the LDA model.
    eta(float): parameter of the LDA model.
    max_iter_per_doc(int): the max iterations to train each document.
  """
    # term_docs_matrix = array.fetch(extent.create((0, ex.ul[1]), (array.shape[0], ex.lr[1]), array.shape))
    # local_topic_term_counts = topic_term_counts[:]
    local_topic_sums = np.linalg.norm(local_topic_term_counts, 1, axis=1)

    doc_topics = np.ones((term_docs_matrix.shape[1], k_topics), dtype=np.float64) / k_topics

    local_topic_term_counts = _lda_train(
        term_docs_matrix, local_topic_term_counts, local_topic_sums, doc_topics, k_topics, alpha, eta, max_iter_per_doc
    )

    # yield extent.create((ex.ul[1], 0), (ex.lr[1], k_topics), (array.shape[1], k_topics)), doc_topics
    yield (extent.create((ex_a.ul[1], 0), (ex_a.lr[1], k_topics), (ex_a.array_shape[1], k_topics)), doc_topics)
Example #11
def _row_similarity_mapper(array, ex, similarity_measurement):
    '''
  calculate distances for each pair of points.
  
  Args:
    array(DistArray): the input data points matrix.
    ex(Extent): region being processed.
    similarity_measurement(str): distance method used to measure similarity between two points.
  '''
    measurement = distance_methods[similarity_measurement]
    points = array.fetch(ex)
    result = np.zeros((points.shape[0], array.shape[0]))
    for other_ex in array.tiles:
        if ex == other_ex:
            other_points = points
        else:
            other_points = array.fetch(other_ex)

        for i in range(points.shape[0]):
            for j in range(other_points.shape[0]):
                result[i, other_ex.ul[0] + j] = measurement(
                    points[i], other_points[j])

    yield extent.create((ex.ul[0], 0), (ex.lr[0], array.shape[0]),
                        (array.shape[0], array.shape[0])), result
Example #12
def _lda_mapper(ex_a, term_docs_matrix, ex_b, local_topic_term_counts,
                k_topics, alpha, eta, max_iter_per_doc):
    '''
  Using Collapsed Variational Bayes method (Mahout implementation) to train local LDA model.

  Args:
    ex_a(Extent): region of the term/document count matrix being processed.
    term_docs_matrix: local tile with the count of each term in each document.
    ex_b(Extent): region of the topic/term counts being processed.
    local_topic_term_counts: local matrix holding p(topic x | term).
    k_topics: the number of topics we need to find.
    alpha(float): parameter of the LDA model.
    eta(float): parameter of the LDA model.
    max_iter_per_doc(int): the max iterations to train each document.
  '''
    #term_docs_matrix = array.fetch(extent.create((0, ex.ul[1]), (array.shape[0], ex.lr[1]), array.shape))
    #local_topic_term_counts = topic_term_counts[:]
    local_topic_sums = np.linalg.norm(local_topic_term_counts, 1, axis=1)

    local_topic_term_counts = _lda_train(term_docs_matrix,
                                         local_topic_term_counts,
                                         local_topic_sums, None, k_topics,
                                         alpha, eta, max_iter_per_doc)

    #yield extent.create((0, 0), (k_topics, array.shape[0]), (k_topics, array.shape[0])), local_topic_term_counts
    yield (extent.create(
        (0, 0), (k_topics, ex_a.array_shape[0]),
        (k_topics, ex_a.array_shape[0])), local_topic_term_counts)
Example #13
def _lda_doc_topic_mapper(ex_a, term_docs_matrix, ex_b,
                          local_topic_term_counts, k_topics, alpha, eta,
                          max_iter_per_doc):
    '''
  Last iteration that uses Collapsed Variational Bayes method (Mahout implementation) to calculate the final document/topic inference.

  Args:
    ex_a(Extent): region of the term/document count matrix being processed.
    term_docs_matrix: local tile with the count of each term in each document.
    ex_b(Extent): region of the topic/term counts being processed.
    local_topic_term_counts: local matrix holding p(topic x | term).
    k_topics: the number of topics we need to find.
    alpha(float): parameter of the LDA model.
    eta(float): parameter of the LDA model.
    max_iter_per_doc(int): the max iterations to train each document.
  '''
    #term_docs_matrix = array.fetch(extent.create((0, ex.ul[1]), (array.shape[0], ex.lr[1]), array.shape))
    #local_topic_term_counts = topic_term_counts[:]
    local_topic_sums = np.linalg.norm(local_topic_term_counts, 1, axis=1)

    doc_topics = np.ones(
        (term_docs_matrix.shape[1], k_topics), dtype=np.float64) / k_topics

    local_topic_term_counts = _lda_train(term_docs_matrix,
                                         local_topic_term_counts,
                                         local_topic_sums, doc_topics,
                                         k_topics, alpha, eta,
                                         max_iter_per_doc)

    #yield extent.create((ex.ul[1], 0), (ex.lr[1], k_topics), (array.shape[1], k_topics)), doc_topics
    yield (extent.create((ex_a.ul[1], 0), (ex_a.lr[1], k_topics),
                         (ex_a.array_shape[1], k_topics)), doc_topics)
Example #14
def _lda_mapper(ex_a, term_docs_matrix, ex_b, local_topic_term_counts, k_topics, alpha, eta, max_iter_per_doc):
    """
  Using Collapsed Variational Bayes method (Mahout implementation) to train local LDA model.

  Args:
    ex_a(Extent): region of the term/document count matrix being processed.
    term_docs_matrix: local tile with the count of each term in each document.
    ex_b(Extent): region of the topic/term counts being processed.
    local_topic_term_counts: local matrix holding p(topic x | term).
    k_topics: the number of topics we need to find.
    alpha(float): parameter of the LDA model.
    eta(float): parameter of the LDA model.
    max_iter_per_doc(int): the max iterations to train each document.
  """
    # term_docs_matrix = array.fetch(extent.create((0, ex.ul[1]), (array.shape[0], ex.lr[1]), array.shape))
    # local_topic_term_counts = topic_term_counts[:]
    local_topic_sums = np.linalg.norm(local_topic_term_counts, 1, axis=1)

    local_topic_term_counts = _lda_train(
        term_docs_matrix, local_topic_term_counts, local_topic_sums, None, k_topics, alpha, eta, max_iter_per_doc
    )

    # yield extent.create((0, 0), (k_topics, array.shape[0]), (k_topics, array.shape[0])), local_topic_term_counts
    yield (
        extent.create((0, 0), (k_topics, ex_a.array_shape[0]), (k_topics, ex_a.array_shape[0])),
        local_topic_term_counts,
    )
Example #15
def kmeans_outer_dist_mapper(ex_a, tile_a, ex_b, tile_b):
  points = tile_a
  centers = tile_b
  target_ex = extent.create((ex_a[0].ul[0], ),
                            (ex_a[0].lr[0], ),
                            (ex_a[0].array_shape[0], ))
  yield target_ex, np.argmin(cdist(points, centers), axis=1)
Example #16
def _cluster_mapper(array, ex, centers):
  '''
  label the cluster id for each data point.

  Args:
    array(DistArray): the input data points matrix.
    ex(Extent): region being processed.
    centers(numpy.array): the center points for each cluster.
  '''
  points = array.fetch(ex)
  labels = np.zeros(points.shape[0], dtype=np.int32)
  for i in range(points.shape[0]):
    point = points[i]
    max_pdf = -1
    max_id = -1
    for j in range(centers.shape[0]):
      dist = np.square(centers[j] - point).sum()
      pdf = 1.0 / (1 + dist)
      if max_pdf < pdf:
        max_pdf = pdf
        max_id = j

    labels[i] = max_id

  yield extent.create((ex.ul[0],), (ex.lr[0],), (array.shape[0],)), labels
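Because 1.0 / (1 + dist) decreases monotonically with dist, taking the largest pdf is equivalent to taking the nearest center. A vectorized NumPy version of the loop above, for comparison (a sketch, not the Spartan mapper):

import numpy as np

def nearest_centers(points, centers):
  # Squared Euclidean distances from every point to every center: (n_points, n_centers).
  dists = ((points[:, np.newaxis, :] - centers[np.newaxis, :, :]) ** 2).sum(axis=2)
  # argmin of the distance equals argmax of 1.0 / (1 + distance).
  return np.argmin(dists, axis=1).astype(np.int32)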
Example #17
def _fuzzy_kmeans_mapper(array, ex, old_centers, centers, counts, labels, m):
  '''
  Update the new centers, new counts and labels using fuzzy kmeans method.
  
  Args:
    array(DistArray): the input data points matrix.
    ex(Extent): region being processed.
    old_centers(DistArray): the current centers of each cluster.
    centers(DistArray): the new centers to be updated.
    counts(DistArray): the new counts to be updated.
    labels(DistArray): the new labels for each point to be updated.
    m(float): the parameter of fuzzy kmeans. 
  '''
  points = array.fetch(ex)
  old_centers = old_centers[:]
  
  new_centers = np.zeros_like(old_centers)
  new_counts = np.zeros((old_centers.shape[0], 1))
  new_labels = np.zeros(points.shape[0], dtype=np.int)
  for i in range(points.shape[0]):
    point = points[i]    
    prob = _calc_probability(point, old_centers, m)
    new_labels[i] = np.argmax(prob)
    
    # use a separate index so the outer loop variable i is not shadowed
    for j in prob.nonzero()[0]:
      new_counts[j] += prob[j]
      new_centers[j] += prob[j] * point
      
  centers.update(extent.from_shape(centers.shape), new_centers)
  counts.update(extent.from_shape(counts.shape), new_counts)
  labels.update(extent.create((ex.ul[0],), (ex.lr[0],), labels.shape), new_labels)
  return []  
Example #18
def _solve_U_or_M_mapper(ex_a, rating_matrix, ex_b, U_or_M, la, alpha, implicit_feedback, shape=None):
  '''
  Given A and U (or M), solve for M (or U) such that A = U M',
  using the alternating least-squares (ALS) factorization method.

  Args:
    rating_matrix: the user-item (or item-user) rating matrix.
    U_or_M: the matrix U (or M).
    la(float): the regularization parameter of ALS.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether to use the implicit-feedback method for ALS.
  '''
  if implicit_feedback:
    Y = U_or_M
    YT = Y.T
    YTY = np.dot(YT, Y)

  result = np.zeros((rating_matrix.shape[0], U_or_M.shape[1]))
  if implicit_feedback:
    for i in range(rating_matrix.shape[0]):
      result[i] = _implicit_feedback_als_solver(rating_matrix[i], la, alpha, Y, YT, YTY)
  else:
    for i in range(rating_matrix.shape[0]):
      non_zero_idx = rating_matrix[i].nonzero()[0]
      rating_vector = rating_matrix[i, non_zero_idx]
      feature_vectors = U_or_M[non_zero_idx]
      result[i] = _als_solver(feature_vectors, rating_vector, la)

  target_ex = extent.create((ex_a.ul[0], 0), (ex_a.lr[0], U_or_M.shape[1]), shape)
  yield target_ex, result
Example #19
def _init_label_mapper(array, ex):
  data = array.fetch(ex)
  
  labels = np.zeros((data.shape[0], 1), dtype=np.int64)
  for i in range(data.shape[0]):
    labels[i] = np.argmax(data[i])
    
  yield extent.create((ex.ul[0], 0), (ex.lr[0], 1), (array.shape[0], 1)), labels
Example #20
def _init_label_mapper(array, ex, data):
  data = data.fetch(extent.create((ex.ul[0], 0), (ex.lr[0], data.shape[1]), data.shape))
  
  labels = np.zeros((data.shape[0], 1), dtype=np.int64)
  for i in range(data.shape[0]):
    labels[i] = np.argmax(data[i])
    
  yield ex, labels
Example #21
def test_intersection():
    a = extent.create((0, 0), (10, 10), None)
    b = extent.create((5, 5), (6, 6), None)

    Assert.eq(extent.intersection(a, b), extent.create((5, 5), (6, 6), None))
    Assert.eq(extent.intersection(b, a), extent.create((5, 5), (6, 6), None))

    a = extent.create((5, 5), (10, 10), None)
    b = extent.create((4, 6), (6, 8), None)
    Assert.eq(extent.intersection(a, b), extent.create((5, 6), (6, 8), None))

    a = extent.create((5, 5), (5, 5), None)
    b = extent.create((1, 1), (2, 2), None)
    assert extent.intersection(a, b) is None
Example #22
def _sum_instance_by_label_mapper(array, ex, labels, label_size):
  '''
  For each label, compute the sum of the feature vectors which belong to that label.
  
  Args:
    array(DistArray): tf-idf normalized training data.
    ex(Extent): Region being processed.
    labels(DistArray): labels of the training data.
    label_size: the number of different labels.
  '''
  X = array.fetch(extent.create((ex.ul[0], 0), (ex.lr[0], array.shape[1]), array.shape))
  Y = labels.fetch(extent.create((ex.ul[0], 0), (ex.lr[0], 1), labels.shape))
  
  sum_instance_by_label = np.zeros((label_size, X.shape[1]))
  for i in xrange(Y.shape[0]):
    sum_instance_by_label[Y[i, 0]] += X[i]
    
  yield extent.create((0, 0), (label_size, X.shape[1]), (label_size, X.shape[1])), sum_instance_by_label
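The per-row accumulation above can also be written as a single scatter-add. A small self-contained NumPy check (the values of X, Y and label_size are made up for illustration):

import numpy as np

X = np.array([[1., 0., 2.],
              [0., 3., 1.],
              [2., 2., 0.]])      # three instances, three features
Y = np.array([[0], [1], [0]])     # the label of each instance
label_size = 2

sum_instance_by_label = np.zeros((label_size, X.shape[1]))
np.add.at(sum_instance_by_label, Y[:, 0], X)   # add each row of X into its label's slot
assert np.allclose(sum_instance_by_label, [[3., 2., 2.], [0., 3., 1.]])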
Example #23
def test_unravel():
    for i in range(100):
        shp = (20, 77)
        ul = (random.randint(0, 19), random.randint(0, 76))
        lr = (random.randint(ul[0] + 1, 20), random.randint(ul[1] + 1, 77))

        a = extent.create(ul, lr, shp)
        ravelled = a.ravelled_pos()
        unravelled = extent.unravelled_pos(ravelled, a.array_shape)
        Assert.eq(a.ul, unravelled)
Example #24
def test_ravelled_pos():
    a = extent.create((2, 2), (7, 7), (10, 10))
    for i in range(0, 10):
        for j in range(0, 10):
            assert extent.ravelled_pos((i, j), a.array_shape) == 10 * i + j

    Assert.eq(a.to_global(0, axis=None), 22)
    Assert.eq(a.to_global(10, axis=None), 42)
    Assert.eq(a.to_global(11, axis=None), 43)
    Assert.eq(a.to_global(20, axis=None), 62)
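The expected values in this test follow directly from the extent's upper-left corner: a local linear offset is unravelled inside the (5, 5) extent, shifted by ul = (2, 2), and re-ravelled in the (10, 10) array. A quick arithmetic check in plain NumPy (independent of Spartan):

import numpy as np

ul, extent_shape, array_shape = (2, 2), (5, 5), (10, 10)

def to_global(idx):
  # Unravel the local offset inside the extent, shift by its upper-left corner,
  # then re-ravel inside the full array.
  r, c = np.unravel_index(idx, extent_shape)
  return np.ravel_multi_index((r + ul[0], c + ul[1]), array_shape)

assert [to_global(i) for i in (0, 10, 11, 20)] == [22, 42, 43, 62]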
Example #25
def kmeans_map2_dist_mapper(ex, tile, centers=None, m=None):
  points = tile[0]
  target_ex = extent.create((ex[0].ul[0], 0),
                            (ex[0].lr[0], centers.shape[0]),
                            (ex[0].array_shape[0], centers.shape[0]))
  distances = cdist(points, centers)
  distances[distances == 0] = 0.0000000001
  distances **= 1.0 / (m - 1)
  distances /= np.sum(distances, axis=1)[:, np.newaxis]
  yield target_ex, distances
Example #26
def test_ravelled_pos():
  a = extent.create((2, 2), (7, 7), (10, 10))
  for i in range(0, 10):
    for j in range(0, 10):
      assert extent.ravelled_pos((i, j), a.array_shape) == 10 * i + j
      
  Assert.eq(a.to_global(0, axis=None), 22)
  Assert.eq(a.to_global(10, axis=None), 42)
  Assert.eq(a.to_global(11, axis=None), 43)
  Assert.eq(a.to_global(20, axis=None), 62)
Example #27
def test_unravel():
  for i in range(100):
    shp = (20, 77)
    ul = (random.randint(0, 19), random.randint(0, 76))
    lr = (random.randint(ul[0] + 1, 20), random.randint(ul[1] + 1, 77))
                         
    a = extent.create(ul, lr, shp)
    ravelled = a.ravelled_pos()
    unravelled = extent.unravelled_pos(ravelled, a.array_shape)
    Assert.eq(a.ul, unravelled)
Example #28
def _local_read_sparse_mm(array, ex, fn, data_begin):
  '''
  1. Note that the Matrix Market format doesn't require (row, col) entries to
     be sorted. If the file is sorted (by either row or col), each worker will
     return only a part of the array. If the file is unsorted, each worker may
     return a very large, sparser sub-array of the original array. In the
     worst case, the sub-array can be as large as the original array, only
     sparser.
  2. We can't know how many lines there are without reading the whole file, so
     we simply decide the region this worker should read based on the file size.
  '''
  data_size = os.path.getsize(fn) - data_begin
  array_size = np.product(array.shape)
  begin = extent.ravelled_pos(ex.ul, array.shape)
  begin = math.ceil(((begin * 1.0) / array_size) * data_size) + data_begin
  end = extent.ravelled_pos([(i - 1) for i in ex.lr], array.shape)
  end = math.floor(((end * 1.0) / array_size) * data_size) + data_begin

  ul = [array.shape[0], array.shape[1]]
  lr = [0, 0]
  rows = []
  cols = []
  data = []
  with open(fn) as fp:
    fp.seek(begin)
    if begin != data_begin:
      fp.seek(begin - 1)
      a = fp.read(1)
      if a != '\n':
        line = fp.readline()

    pos = fp.tell()
    for line in fp:
      if pos > end + 1: # +1 in case end locates on \n
        break
      pos += len(line)
      (_row, _col), val = _extract_mm_coordinate(line)
      _row -= 1
      _col -= 1
      rows.append(_row)
      cols.append(_col)
      data.append(float(val))
      ul[0] = _row if _row < ul[0] else ul[0]
      ul[1] = _col if _col < ul[1] else ul[1]
      lr[0] = _row if _row > lr[0] else lr[0]
      lr[1] = _col if _col > lr[1] else lr[1]

  # Adjust rows and cols based on the ul of this submatrix.
  for i in xrange(len(rows)):
    rows[i] -= ul[0]
    cols[i] -= ul[1]

  new_ex = extent.create(ul, [lr[0] + 1, lr[1] + 1], array.shape)
  new_array = sp.coo_matrix((data, (rows, cols)), new_ex.shape)
  return new_ex, sparse.convert_sparse_array(new_array)
Example #29
def _local_read_sparse_mm(array, ex, fn, data_begin):
    '''
  1. Note that the Matrix Market format doesn't require (row, col) entries to
     be sorted. If the file is sorted (by either row or col), each worker will
     return only a part of the array. If the file is unsorted, each worker may
     return a very large, sparser sub-array of the original array. In the
     worst case, the sub-array can be as large as the original array, only
     sparser.
  2. We can't know how many lines there are without reading the whole file, so
     we simply decide the region this worker should read based on the file size.
  '''
    data_size = os.path.getsize(fn) - data_begin
    array_size = np.product(array.shape)
    begin = extent.ravelled_pos(ex.ul, array.shape)
    begin = math.ceil(((begin * 1.0) / array_size) * data_size) + data_begin
    end = extent.ravelled_pos([(i - 1) for i in ex.lr], array.shape)
    end = math.floor(((end * 1.0) / array_size) * data_size) + data_begin

    ul = [array.shape[0], array.shape[1]]
    lr = [0, 0]
    rows = []
    cols = []
    data = []
    with open(fn) as fp:
        fp.seek(begin)
        if begin != data_begin:
            fp.seek(begin - 1)
            a = fp.read(1)
            if a != '\n':
                line = fp.readline()

        pos = fp.tell()
        for line in fp:
            if pos > end + 1:  # +1 in case end locates on \n
                break
            pos += len(line)
            (_row, _col), val = _extract_mm_coordinate(line)
            _row -= 1
            _col -= 1
            rows.append(_row)
            cols.append(_col)
            data.append(float(val))
            ul[0] = _row if _row < ul[0] else ul[0]
            ul[1] = _col if _col < ul[1] else ul[1]
            lr[0] = _row if _row > lr[0] else lr[0]
            lr[1] = _col if _col > lr[1] else lr[1]

    # Adjust rows and cols based on the ul of this submatrix.
    for i in xrange(len(rows)):
        rows[i] -= ul[0]
        cols[i] -= ul[1]

    new_ex = extent.create(ul, [lr[0] + 1, lr[1] + 1], array.shape)
    new_array = sp.coo_matrix((data, (rows, cols)), new_ex.shape)
    return new_ex, sparse.convert_sparse_array(new_array)
Example #30
def _transpose_mapper(array, ex, orig_array):
  '''
  Transpose ``orig_array`` into ``array``.
  
  Args:
    array(DistArray): destination array.
    ex(Extent): region being processed.
    orig_array(DistArray): array to be transposed.
  '''
  orig_ex = extent.create(ex.ul[::-1], ex.lr[::-1], orig_array.shape)
  yield ex, orig_array.fetch(orig_ex).transpose()
Example #31
def _sum_instance_by_label_mapper(array, ex, labels, label_size):
    '''
  For each label, compute the sum of the feature vectors which belong to that label.
  
  Args:
    array(DistArray): tf-idf normalized training data.
    ex(Extent): Region being processed.
    labels(DistArray): labels of the training data.
    label_size: the number of different labels.
  '''
    X = array.fetch(
        extent.create((ex.ul[0], 0), (ex.lr[0], array.shape[1]), array.shape))
    Y = labels.fetch(extent.create((ex.ul[0], 0), (ex.lr[0], 1), labels.shape))

    sum_instance_by_label = np.zeros((label_size, X.shape[1]))
    for i in xrange(Y.shape[0]):
        sum_instance_by_label[Y[i, 0]] += X[i]

    yield extent.create((0, 0), (label_size, X.shape[1]),
                        (label_size, X.shape[1])), sum_instance_by_label
Example #32
def test_intersection():
  a = extent.create((0, 0), (10, 10), None)
  b = extent.create((5, 5), (6, 6), None)
  
  Assert.eq(extent.intersection(a, b),
            extent.create((5,5), (6,6), None))
  Assert.eq(extent.intersection(b, a),
            extent.create((5,5), (6,6), None))
  
  a = extent.create((5, 5), (10, 10), None)
  b = extent.create((4, 6), (6, 8), None)
  Assert.eq(extent.intersection(a, b),
            extent.create((5,6), (6, 8), None))

  a = extent.create((5, 5), (5, 5), None)
  b = extent.create((1, 1), (2, 2), None)
  assert extent.intersection(a, b) is None
Example #33
def test_tilesharing(ctx):
  print "#worker:", ctx.num_workers
  N_EXAMPLES = 5 * ctx.num_workers
  x = expr.ones((N_EXAMPLES, 1), tile_hint=(N_EXAMPLES / ctx.num_workers, 1))
  y = expr.region_map(x, extent.create((0, 0), (3, 1), (N_EXAMPLES, 1)), fn=lambda data, ex, a: data+a, fn_kw={'a': 1})

  npx = np.ones((N_EXAMPLES, 1))
  npy = np.ones((N_EXAMPLES, 1))
  npy[0:3, 0] += 1

  assert np.all(np.equal(x.glom(), npx))
  assert np.all(np.equal(y.glom(), npy))
Example #34
def _svm_mapper(array, ex, labels, alpha, w, lambda_n):
  '''
  Local linear SVM solver.

  Args:
    array(DistArray): features of the training data.
    ex(Extent): Region being processed.
    labels(DistArray): labels of the training data.
    alpha(DistArray): alpha vector, the parameter optimized by the SVM.
    w(DistArray): weight vector of the previous iteration.
    lambda_n: lambda / size(total training data), the regularization parameter of this SVM model.
  '''
  X = array.fetch(extent.create((ex.ul[0], 0), (ex.lr[0], array.shape[1]), array.shape))
  Y = labels.fetch(extent.create((ex.ul[0], 0), (ex.lr[0], 1), labels.shape))
  old_alpha = alpha.fetch(extent.create((ex.ul[0], 0), (ex.lr[0], 1), alpha.shape))
  old_w = w[:]
  
  new_alpha = _svm_disdca_train(X, Y, old_alpha, old_w, len(array.tiles), lambda_n)
  
  # update the alpha vector
  yield extent.create((ex.ul[0], 0), (ex.lr[0], 1), alpha.shape), new_alpha
Example #35
def _write_mapper(ex, source=None, sregion=None, dst_slice=None):
    intersection = extent.intersection(ex, sregion)

    futures = rpc.FutureGroup()
    if intersection is not None:
        dst_lr = np.asarray(intersection.lr) - np.asarray(sregion.ul)
        dst_ul = np.asarray(intersection.ul) - np.asarray(sregion.ul)
        dst_ex = extent.create(tuple(dst_ul), tuple(dst_lr), dst_slice.shape)
        v = dst_slice.fetch(dst_ex)
        futures.append(source.update(intersection, v, wait=False))

    return LocalKernelResult(result=None, futures=futures)
Example #36
def _write_mapper(ex, source=None, sregion=None, dst_slice=None):
  intersection = extent.intersection(ex, sregion)

  futures = rpc.FutureGroup()
  if intersection is not None:
    dst_lr = np.asarray(intersection.lr) - np.asarray(sregion.ul)
    dst_ul = np.asarray(intersection.ul) - np.asarray(sregion.ul)
    dst_ex = extent.create(tuple(dst_ul), tuple(dst_lr), dst_slice.shape)
    v = dst_slice.fetch(dst_ex)
    futures.append(source.update(intersection, v, wait=False))

  return LocalKernelResult(result=None, futures=futures)
Example #37
def _init_M_mapper(array, ex, avg_rating):
  '''
  Initialize the M matrix with its first column equal to avg_rating.
  
  Args:
    array(DistArray): the array to be created.
    ex(Extent): region being processed.
    avg_rating(DistArray): the average rating for each item.
  '''
  avg_rating = avg_rating.fetch(extent.create((ex.ul[0],), (ex.lr[0],), avg_rating.shape))
  M = np.zeros(ex.shape)
  for i in avg_rating.nonzero()[0]:
    M[i, 0] = avg_rating[i]
    M[i, 1:] = np.random.rand(M.shape[1]-1)
  yield ex, M
Example #38
def cholesky(A):
  '''
  Cholesky matrix decomposition.

  Args:
    A(Expr): matrix to be decomposed
  '''
  n = int(math.sqrt(FLAGS.num_workers))
  tile_size = A.shape[0] / n
  print n, tile_size
  for k in range(n):
    # A[k,k] = DPOTRF(A[k,k])
    diag_ex = get_ex(k, k, tile_size, A.shape)
    A = expr.map2(A, ((0, 1), ), fn=_cholesky_dpotrf_mapper,
                  shape=A.shape, update_region=diag_ex)

    if k == n - 1: break

    # A[l,k] = DTRSM(A[k,k], A[l,k]) l -> [k+1,n)
    col_ex = extent.create(((k+1)*tile_size, k*tile_size), (n*tile_size, (k+1)*tile_size), A.shape)
    A = expr.map2((A, A[diag_ex.to_slice()]), ((0, 1), None), fn=_cholesky_dtrsm_mapper,
                  shape=A.shape, update_region=col_ex)

    # A[m,m] = DSYRK(A[m,k], A[m,m]) m -> [k+1,n)
    # A[l,m] = DGEMM(A[l,k], A[m,k], A[l,m]) m -> [k+1,n) l -> [m+1,n)
    col_exs = list([extent.create((m*tile_size, m*tile_size), (n*tile_size, (m+1)*tile_size), A.shape) for m in range(k+1, n)])
    dgemm = A[:, (k * tile_size):((k + 1) * tile_size)]
    A = expr.map2((A, expr.transpose(dgemm), dgemm), ((0, 1), 1, 0),
                  fn=_cholesky_dsyrk_dgemm_mapper,
                  shape=A.shape, update_region=col_exs).optimized()

  # zero out the blocks above the diagonal (the upper-right corner)
  col_exs = list([extent.create((0, m*tile_size), (m*tile_size, (m+1)*tile_size), A.shape) for m in range(1, n)])
  A = expr.map2(A, ((0, 1), ), fn=_zero_mapper,
                shape=A.shape, update_region=col_exs)
  return A
Example #39
def _naive_bayes_mapper(array, ex, weights_per_label, alpha):
  '''
  Train local naive Bayes weights.
  
  Args:
    array(DistArray): weights for each label and feature.
    ex(Extent): Region being processed.
    weights_per_label(DistArray): weights for each label.
    alpha: naive Bayes smoothing parameter.
  '''
  weights_per_label_and_feature = array.fetch(ex)
  weights_per_label = weights_per_label.fetch(extent.create((ex.ul[0],), (ex.lr[0],), weights_per_label.shape))
  weights_per_label = weights_per_label.reshape((weights_per_label.shape[0], 1))

  weights_per_label_and_feature = np.log((weights_per_label_and_feature + alpha) / 
                                         (weights_per_label + alpha * weights_per_label_and_feature.shape[1]))

  yield ex, weights_per_label_and_feature
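The expression above is the usual additively smoothed log-likelihood, log((count(label, feature) + alpha) / (count(label) + alpha * n_features)). A tiny NumPy check with made-up counts (in the mapper the per-label totals come from a separate DistArray rather than a row sum):

import numpy as np

alpha = 1.0
weights_per_label_and_feature = np.array([[2., 0., 1.],
                                          [0., 4., 2.]])    # per-label, per-feature counts
weights_per_label = weights_per_label_and_feature.sum(axis=1).reshape(-1, 1)

log_p = np.log((weights_per_label_and_feature + alpha) /
               (weights_per_label + alpha * weights_per_label_and_feature.shape[1]))
# Each row of exp(log_p) sums to 1, as a smoothed conditional distribution should.
assert np.allclose(np.exp(log_p).sum(axis=1), 1.0)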
Example #40
def _find_cluster_mapper(inputs, ex, d_pts, old_centers,
                         new_centers, new_counts, labels):
  centers = old_centers
  pts = d_pts.fetch(ex)
  closest = _find_closest(pts, centers)

  l_counts = np.zeros((centers.shape[0], 1), dtype=np.int)
  l_centers = np.zeros_like(centers)

  for i in range(centers.shape[0]):
    matching = (closest == i)
    l_counts[i] = matching.sum()
    l_centers[i] = pts[matching].sum(axis=0)

  # update centroid positions
  new_centers.update(extent.from_shape(new_centers.shape), l_centers)
  new_counts.update(extent.from_shape(new_counts.shape), l_counts)
  labels.update(extent.create(ex.ul, (ex.lr[0], 1), labels.shape),
                closest.reshape(pts.shape[0], 1))
  return []
Example #41
def _make_site_sparse(tile, ex,
                      num_outlinks=None,
                      same_site_prob=None):

  if ex.shape[0] == tile.shape[0]:
    tile_pages = ex.shape[1]
    ul, lr = ex.ul[1], ex.lr[1]
  else:
    tile_pages = ex.shape[0]
    ul, lr = ex.ul[0], ex.lr[0]

  same_site = np.random.rand(num_outlinks * tile_pages) <= same_site_prob
  outlink = np.zeros(num_outlinks * tile_pages, dtype=np.int32)
  outlink[same_site] = np.random.randint(ul, lr, np.count_nonzero(same_site))
  outlink[~same_site] = np.random.randint(0, tile.shape[0], np.count_nonzero(~same_site))

  rows, cols, data = _build_site_coo(tile_pages, num_outlinks, outlink, ul, lr)
  result = scipy.sparse.coo_matrix((data, (rows, cols)), shape=(tile.shape[0], tile_pages), dtype=np.float32)
  result_ex = extent.create((0, ul), (tile.shape[0], lr), tile.shape)
  yield result_ex, result
Example #42
def _find_cluster_mapper(inputs, ex, d_pts, old_centers, new_centers,
                         new_counts, labels):
    centers = old_centers
    pts = d_pts.fetch(ex)
    closest = _find_closest(pts, centers)

    l_counts = np.zeros((centers.shape[0], 1), dtype=np.int)
    l_centers = np.zeros_like(centers)

    for i in range(centers.shape[0]):
        matching = (closest == i)
        l_counts[i] = matching.sum()
        l_centers[i] = pts[matching].sum(axis=0)

    # update centroid positions
    new_centers.update(extent.from_shape(new_centers.shape), l_centers)
    new_counts.update(extent.from_shape(new_counts.shape), l_counts)
    labels.update(extent.create(ex.ul, (ex.lr[0], 1), labels.shape),
                  closest.reshape(pts.shape[0], 1))
    return []
Example #43
def _make_site_sparse(tile, ex, num_outlinks=None, same_site_prob=None):

    if ex.shape[0] == tile.shape[0]:
        tile_pages = ex.shape[1]
        ul, lr = ex.ul[1], ex.lr[1]
    else:
        tile_pages = ex.shape[0]
        ul, lr = ex.ul[0], ex.lr[0]

    same_site = np.random.rand(num_outlinks * tile_pages) <= same_site_prob
    outlink = np.zeros(num_outlinks * tile_pages, dtype=np.int32)
    outlink[same_site] = np.random.randint(ul, lr, np.count_nonzero(same_site))
    outlink[~same_site] = np.random.randint(0, tile.shape[0],
                                            np.count_nonzero(~same_site))

    rows, cols, data = _build_site_coo(tile_pages, num_outlinks, outlink, ul,
                                       lr)
    result = scipy.sparse.coo_matrix((data, (rows, cols)),
                                     shape=(tile.shape[0], tile_pages),
                                     dtype=np.float32)
    result_ex = extent.create((0, ul), (tile.shape[0], lr), tile.shape)
    yield result_ex, result
Example #44
def _solve_U_or_M_mapper(ex_a,
                         rating_matrix,
                         ex_b,
                         U_or_M,
                         la,
                         alpha,
                         implicit_feedback,
                         shape=None):
    '''
  Given A and U (or M), solve for M (or U) such that A = U M',
  using the alternating least-squares (ALS) factorization method.

  Args:
    rating_matrix: the user-item (or item-user) rating matrix.
    U_or_M: the matrix U (or M).
    la(float): the regularization parameter of ALS.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether to use the implicit-feedback method for ALS.
  '''
    if implicit_feedback:
        Y = U_or_M
        YT = Y.T
        YTY = np.dot(YT, Y)

    result = np.zeros((rating_matrix.shape[0], U_or_M.shape[1]))
    if implicit_feedback:
        for i in range(rating_matrix.shape[0]):
            result[i] = _implicit_feedback_als_solver(rating_matrix[i], la,
                                                      alpha, Y, YT, YTY)
    else:
        for i in range(rating_matrix.shape[0]):
            non_zero_idx = rating_matrix[i].nonzero()[0]
            rating_vector = rating_matrix[i, non_zero_idx]
            feature_vectors = U_or_M[non_zero_idx]
            result[i] = _als_solver(feature_vectors, rating_vector, la)

    target_ex = extent.create((ex_a.ul[0], 0), (ex_a.lr[0], U_or_M.shape[1]),
                              shape)
    yield target_ex, result
Example #45
def _row_similarity_mapper(array, ex, similarity_measurement):
  '''
  calculate distances for each pair of points.
  
  Args:
    array(DistArray): the input data points matrix.
    ex(Extent): region being processed.
    similarity_measurement(str): distance method used to measure similarity between two points.
  '''
  measurement = distance_methods[similarity_measurement]
  points = array.fetch(ex)
  result = np.zeros((points.shape[0], array.shape[0]))
  for other_ex in array.tiles:
    if ex == other_ex:
      other_points = points
    else:
      other_points = array.fetch(other_ex)
    
    for i in range(points.shape[0]):
      for j in range(other_points.shape[0]):
        result[i, other_ex.ul[0] + j] = measurement(points[i], other_points[j])
    
  yield extent.create((ex.ul[0], 0), (ex.lr[0], array.shape[0]), (array.shape[0], array.shape[0])), result
Example #46
def kmeans_center_mapper(extents, tiles, centers_count):
  points = tiles[0]
  labels = tiles[1]
  target_ex = extent.create((0, 0), (centers_count, points.shape[1]),
                            (centers_count, points.shape[1]))
  #new_centers = np.ndarray((centers_count, points.shape[1]))
  #sorted_labels = np.sort(tiles[1])
  #argsorted_labels = np.argsort(tiles[1])
  #index = np.searchsorted(sorted_labels, np.arange(centers_count), side='right')
  #for i in xrange(centers_count):
    #if i == 0 or sorted_labels[index[i] - 1] != i:
      #continue
    #else:
      #if i == 0:
        #new_centers[i] = np.sum(argsorted_labels[0:index[0]], axis=0)
      #else:
        #new_centers[i] = np.sum(argsorted_labels[index[i - 1]:index[i]], axis=0)
  new_centers = np.zeros((centers_count, points.shape[1]))
  for i in xrange(centers_count):
    matching = (labels == i)
    new_centers[i] = points[matching].sum(axis=0)

  yield target_ex, new_centers
Example #47
def kmeans_center_mapper(extents, tiles, centers_count):
    points = tiles[0]
    labels = tiles[1]
    target_ex = extent.create((0, 0), (centers_count, points.shape[1]),
                              (centers_count, points.shape[1]))
    #new_centers = np.ndarray((centers_count, points.shape[1]))
    #sorted_labels = np.sort(tiles[1])
    #argsorted_labels = np.argsort(tiles[1])
    #index = np.searchsorted(sorted_labels, np.arange(centers_count), side='right')
    #for i in xrange(centers_count):
    #if i == 0 or sorted_labels[index[i] - 1] != i:
    #continue
    #else:
    #if i == 0:
    #new_centers[i] = np.sum(argsorted_labels[0:index[0]], axis=0)
    #else:
    #new_centers[i] = np.sum(argsorted_labels[index[i - 1]:index[i]], axis=0)
    new_centers = np.zeros((centers_count, points.shape[1]))
    for i in xrange(centers_count):
        matching = (labels == i)
        new_centers[i] = points[matching].sum(axis=0)

    yield target_ex, new_centers
Example #48
def _select_most_k_similar_mapper(array, ex, top_k_similar_indices, k):
    ''' Find the top k similar items for each item.
  Parameters
  ----------
  top_k_similar_indices: Spartan array of shape (N, k)
                         The indices of top k similar items.

  k : Integer
  '''
    local_similarity_table = array.fetch(ex)
    local_top_k_values = np.zeros((ex.shape[0], k))

    start_idx = ex.ul[0]
    # Find the k largest values of each row. This function is adapted from
    # bottleneck.argpartsort.
    sorted_indices = argpartsort(local_similarity_table, k, axis=1)[:, :k]

    for i in range(sorted_indices.shape[0]):
        local_top_k_values[i] = local_similarity_table[i, sorted_indices[i]]

    top_k_similar_indices[ex.ul[0]:ex.lr[0], :] = sorted_indices
    yield extent.create((ex.ul[0], 0), (ex.lr[0], k),
                        (array.shape[0], k)), local_top_k_values
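For reference, the k-per-row selection that bottleneck.argpartsort provides can also be expressed with NumPy's argpartition; a short sketch picking the k largest values of each row, as the comment above describes (an equivalent illustration, not the Spartan code):

import numpy as np

def top_k_per_row(similarity, k):
  '''Indices and values of the k largest entries in each row.

  Like a partial sort, the order within the selected k is unspecified.
  '''
  idx = np.argpartition(similarity, -k, axis=1)[:, -k:]
  vals = np.take_along_axis(similarity, idx, axis=1)
  return idx, vals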
Example #49
def _local_read_sparse_npy(array, ex, fn):
    '''
  1. Note that the coo_matrix format doesn't require row[] or col[] to be
     sorted. If one of them is sorted, each worker will return only a part of
     the array. If the file is unsorted, each worker may return a very large,
     sparser sub-array of the original array. In the worst case, the sub-array
     can be as large as the original array, only sparser.
  2. For the numpy format, we can evenly distribute the portions of the files
     each worker needs to read.
  '''
    #data_begin = {}
    #dtype = {}
    #dtype_size = {}
    #shape = {}
    #fp = {}
    #read_next = {}
    attr = {
        'data_begin': {},
        'dtype': {},
        'shape': None,
        'read_next': {},
        'fn': {}
    }
    types = ['row', 'col', 'data']
    dtype_name = {'float64': 'd', 'float32': 'f', 'int64': 'q', 'int32': 'i'}

    for i in types:
        _fn = '%s_%s.npy' % (fn, i)
        attr['fn'][i] = _fn
        _shape, attr['dtype'][i], attr['data_begin'][i] = _parse_npy_header(
            _fn)
        if attr['shape'] is not None:
            assert attr['shape'] == _shape
        else:
            attr['shape'] = _shape
    #shape['row'], dtype['row'], data_begin['row'] = _parse_npy_header(fn + '_row.npy')
    #shape['col'], dtype['col'], data_begin['col'] = _parse_npy_header(fn + '_col.npy')
    #shape['data'], dtype['data'], data_begin['data'] = _parse_npy_header(fn + '_data.npy')

    item_count = np.product(array.shape)
    begin_item = extent.ravelled_pos(ex.ul, array.shape)
    begin_item = int(
        math.ceil(((begin_item * 1.0) / item_count) * attr['shape'][0]))
    end_item = extent.ravelled_pos([(i - 1) for i in ex.lr], array.shape)
    end_item = int(math.floor(
        (end_item * 1.0) / item_count * attr['shape'][0])) + 1
    end_item = attr['shape'][0] if end_item > attr['shape'][0] else end_item

    ul = [array.shape[0], array.shape[1]]
    lr = [0, 0]
    rows = []
    cols = []
    data = []
    with FileHelper(row=open(attr['fn']['row'], 'rb'),
                    col=open(attr['fn']['col'], 'rb'),
                    data=open(attr['fn']['data'], 'rb')) as fp:
        for k in types:
            _dtype = attr['dtype'][k]
            _dtype_size = _dtype.itemsize
            _fp = getattr(fp, k)

            _fp.seek(attr['data_begin'][k] + begin_item * _dtype_size)
            attr['read_next'][k] = _bulk_read(_fp, _dtype_size)
            attr['dtype'][k] = dtype_name[_dtype.name]

        for i in xrange(begin_item, end_item):
            _row = struct.unpack(attr['dtype']['row'],
                                 attr['read_next']['row'].next())[0]
            rows.append(_row)
            _col = struct.unpack(attr['dtype']['col'],
                                 attr['read_next']['col'].next())[0]
            cols.append(_col)
            _data = struct.unpack(attr['dtype']['data'],
                                  attr['read_next']['data'].next())[0]
            data.append(_data)

            ul[0] = _row if _row < ul[0] else ul[0]
            ul[1] = _col if _col < ul[1] else ul[1]
            lr[0] = _row if _row > lr[0] else lr[0]
            lr[1] = _col if _col > lr[1] else lr[1]

    for i in xrange(len(rows)):
        rows[i] -= ul[0]
        cols[i] -= ul[1]

    new_ex = extent.create(ul, [lr[0] + 1, lr[1] + 1], array.shape)
    new_array = sp.coo_matrix((data, (rows, cols)), new_ex.shape)
    return new_ex, sparse.convert_sparse_array(new_array)
Example #50
def get_ex(i, j, step, array_shape):
    return extent.create((i * step, j * step),
                         ((i + 1) * step, (j + 1) * step), array_shape)
Example #51
def _similarity_mapper(array, ex, item_norm, step):
    ''' Find all-pairs similarities between items.
  Parameters
  ----------
  item_norm : Spartan array of shape (N,)
              The norm values of each item.

  step : Integer.
         How many items to fetch in each iteration; currently this equals the
         number of columns of a tile.
  '''
    M = array.shape[0]
    N = array.shape[1]

    local_ratings = array.fetch(ex)
    local_item_norm = item_norm[ex.ul[1]:ex.lr[1]]
    local_item_norm = local_item_norm.reshape(local_item_norm.shape[0], 1)

    assert local_ratings.shape[0] == M

    # The start index of the items this worker is responsible for.
    local_start_idx = ex.ul[1]
    # The start index of the items which will be fetched next.
    fetch_start_idx = 0
    count = 0

    while fetch_start_idx < N:
        util.log_info("Round : %s on %s", count, socket.gethostname())
        # The last tile of the rating matrix may not have a full step of items.
        if N - fetch_start_idx <= step:
            step = N - fetch_start_idx

        count += 1

        with util.TIMER.item_fetching:
            # Fetch the ratings of remote items. The matrix is sparse, so this step
            # will not be very expensive.
            remote_ratings = array[:, fetch_start_idx:fetch_start_idx + step]
            remote_item_norm = item_norm[fetch_start_idx:fetch_start_idx +
                                         step]
            remote_item_norm = remote_item_norm.reshape(
                1, remote_item_norm.shape[0])

        with util.TIMER.calculate_similarities:
            '''
      Calculate the all-pairs similarities between local items and remote items.
      local_ratings is a local matrix of shape (M, N1); remote_ratings is a local
      matrix of shape (M, N2).

      We calculate the cosine similarity, which is defined as:

          simi(V1, V2) = dot(V1, V2) / (|| V1 || * || V2 ||)

      For efficiency, we compute this as a matrix multiplication.

      "local_ratings.T.dot(remote_ratings)" generates an N1 x N2 matrix S.
      S[i, j] equals dot(Vi, Vj).

      "local_item_norm.dot(remote_item_norm)" generates an N1 x N2 matrix N.
      N[i, j] equals (|| Vi || * || Vj ||).

      In the final step, we divide S by N, which yields the all-pairs similarities.
      '''
            similarities = local_ratings.T.dot(remote_ratings)
            similarities = np.array(similarities.todense())
            norms = local_item_norm.dot(remote_item_norm)
            similarities = similarities / norms
            # In case some norms are zero.
            similarities = np.nan_to_num(similarities)

        # Update this to global array.
        yield extent.create((local_start_idx, fetch_start_idx),
                            (local_start_idx + similarities.shape[0],
                             fetch_start_idx + similarities.shape[1]),
                            (array.shape[1], array.shape[1])), similarities

        # Update fetch_start_idx, fetch next part of table.
        fetch_start_idx += step
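The identity described in the inner docstring (one matrix product divided by an outer product of norms gives all pairwise cosine similarities) can be checked with a few lines of dense NumPy on toy data:

import numpy as np

local = np.random.rand(6, 3)    # M x N1 block; columns are items
remote = np.random.rand(6, 4)   # M x N2 block; columns are items

sims = local.T.dot(remote)                                  # S[i, j] = dot(Vi, Vj)
norms = np.linalg.norm(local, axis=0)[:, np.newaxis] * \
        np.linalg.norm(remote, axis=0)[np.newaxis, :]       # N[i, j] = ||Vi|| * ||Vj||
cosine = np.nan_to_num(sims / norms)

for i in range(local.shape[1]):
  for j in range(remote.shape[1]):
    expected = local[:, i].dot(remote[:, j]) / (
        np.linalg.norm(local[:, i]) * np.linalg.norm(remote[:, j]))
    assert np.isclose(cosine[i, j], expected)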
Example #52
def test_local_offset():
    a = extent.create((0, 0), (5, 5), None)
    b = extent.create((2, 2), (3, 3), None)
    util.log_info('%s', extent.offset_from(a, b))
Example #53
import unittest

import numpy as np
from spartan import expr, util
from spartan.array import tile
from spartan.util import Assert
import spartan.array.extent as extent
import test_common
import scipy.sparse as sp

ARRAY_SIZE = (10, 10)
UPDATE_SHAPE = (8, 8)
UPDATE_SUBSLICE = extent.create((0, 0), (8, 8), UPDATE_SHAPE).to_slice()


class TestTile(test_common.ClusterTest):
    def test_create_dense(self):
        t = tile.from_shape(ARRAY_SIZE,
                            dtype=np.float32,
                            tile_type=tile.TYPE_DENSE)
        t._initialize()
        Assert.eq(t.mask.shape, ARRAY_SIZE)

    def test_create_sparse(self):
        t = tile.from_shape(ARRAY_SIZE,
                            dtype=np.float32,
                            tile_type=tile.TYPE_SPARSE)
        t._initialize()
        Assert.eq(t.data.shape, ARRAY_SIZE)
        Assert.eq(t.mask, None)
Example #54
def kmeans_count_mapper(extents, tiles, centers_count):
    target_ex = extent.create((0, ), (centers_count, ), (centers_count, ))
    result = np.bincount(tiles[0].astype(np.int), minlength=centers_count)
    yield target_ex, result
Example #55
def kmeans_map2_dist_mapper(ex, tile, centers=None):
    points = tile[0]
    target_ex = extent.create((ex[0].ul[0], ), (ex[0].lr[0], ),
                              (ex[0].array_shape[0], ))
    yield target_ex, np.argmin(cdist(points, centers), axis=1)
Example #56
def kmeans_outer_dist_mapper(ex_a, tile_a, ex_b, tile_b):
    points = tile_a
    centers = tile_b
    target_ex = extent.create((ex_a[0].ul[0], ), (ex_a[0].lr[0], ),
                              (ex_a[0].array_shape[0], ))
    yield target_ex, np.argmin(cdist(points, centers), axis=1)