コード例 #1
0
def _select_most_k_similar_mapper(array, ex, top_k_similar_indices, k):
    ''' Select the top k similar items for every row of one tile.

    Fetches the part of the similarity table covered by ``ex``, asks
    ``argpartsort`` for ``k`` column indices per row, stores those indices
    in ``top_k_similar_indices`` and yields the matching similarity values.

    Parameters
    ----------
    array : Spartan array
            The similarity table.
            NOTE(review): assumed to be a dense 2-D table -- confirm
            against the caller.

    ex : extent
         The tile of ``array`` handled by this call; ``ul``, ``lr`` and
         ``shape`` are read from it.

    top_k_similar_indices : Spartan array of shape (N, k)
                            The indices of top k similar items. Rows
                            ``ex.ul[0]:ex.lr[0]`` are overwritten here.

    k : Integer
        Number of similar items kept for each row.

    Yields
    ------
    (extent, numpy.ndarray)
        The extent of rows ``ex.ul[0]:ex.lr[0]`` within an
        ``(array.shape[0], k)`` result, paired with the selected
        similarity values for those rows.
    '''
    local_similarity_table = array.fetch(ex)
    row_begin, row_end = ex.ul[0], ex.lr[0]

    # Pick k column indices per row. ``argpartsort`` is defined outside this
    # block; according to the original note it is adapted from
    # bottleneck.argpartsort and is expected to place the k most similar
    # columns first -- TODO confirm the ordering it guarantees.
    sorted_indices = argpartsort(local_similarity_table, k, axis=1)[:, :k]

    # Gather the selected value of every row with one integer-array index
    # instead of a Python-level loop over rows. The result is written into
    # a float buffer of the tile's row count, as the per-row loop did.
    num_rows = sorted_indices.shape[0]
    row_ids = np.arange(num_rows)[:, np.newaxis]
    local_top_k_values = np.zeros((ex.shape[0], k))
    local_top_k_values[:num_rows] = local_similarity_table[row_ids,
                                                           sorted_indices]

    # Side effect: publish the chosen indices for this tile's rows.
    top_k_similar_indices[row_begin:row_end, :] = sorted_indices
    yield extent.create((row_begin, 0), (row_end, k),
                        (array.shape[0], k)), local_top_k_values
コード例 #2
0
def _select_most_k_similar_mapper(array, ex, 
                                  top_k_similar_indices, 
                                  k):
  ''' Select the top k similar items for every row of one tile.

  Fetches the part of the similarity table covered by ``ex``, asks
  ``argpartsort`` for ``k`` column indices per row, stores those indices
  in ``top_k_similar_indices`` and yields the matching similarity values.

  Parameters
  ----------
  array : Spartan array
          The similarity table.
          NOTE(review): assumed to be a dense 2-D table -- confirm
          against the caller.

  ex : extent
       The tile of ``array`` handled by this call; ``ul``, ``lr`` and
       ``shape`` are read from it.

  top_k_similar_indices: Spartan array of shape (N, k)
                         The indices of top k similar items. Rows
                         ``ex.ul[0]:ex.lr[0]`` are overwritten here.

  k : Integer
      Number of similar items kept for each row.

  Yields
  ------
  (extent, numpy.ndarray)
      The extent of rows ``ex.ul[0]:ex.lr[0]`` within an
      ``(array.shape[0], k)`` result, paired with the selected
      similarity values for those rows.
  '''
  local_similarity_table = array.fetch(ex)
  row_begin, row_end = ex.ul[0], ex.lr[0]

  # Pick k column indices per row. ``argpartsort`` is defined outside this
  # block; according to the original note it is adapted from
  # bottleneck.argpartsort and is expected to place the k most similar
  # columns first -- TODO confirm the ordering it guarantees.
  sorted_indices = argpartsort(local_similarity_table, k, axis=1)[:, :k]

  # Gather the selected value of every row with one integer-array index
  # instead of a Python-level loop over rows. The result is written into
  # a float buffer of the tile's row count, as the per-row loop did.
  num_rows = sorted_indices.shape[0]
  row_ids = np.arange(num_rows)[:, np.newaxis]
  local_top_k_values = np.zeros((ex.shape[0], k))
  local_top_k_values[:num_rows] = local_similarity_table[row_ids,
                                                         sorted_indices]

  # Side effect: publish the chosen indices for this tile's rows.
  top_k_similar_indices[row_begin:row_end, :] = sorted_indices
  yield extent.create((row_begin, 0), (row_end, k),
                      (array.shape[0], k)), local_top_k_values
コード例 #3
0
def _similarity_mapper(array, ex, item_norm, step):
    ''' Compute cosine similarities between this tile's items and all items.

    The rating columns covered by ``ex`` are compared against the whole
    rating matrix, ``step`` columns at a time. Every batch yields one block
    of the item-by-item similarity matrix.

    Parameters
    ----------
    array : Spartan array of shape (M, N)
            The rating matrix; each column holds the ratings of one item.

    ex : extent
         The tile handled by this call. It has to span all M rows of
         ``array`` (checked with an assertion).

    item_norm : Spartan array of shape(N,)
                The norm values of each item.

    step : Integer.
           How many items are fetched in each iteration; for now this
           equals the number of columns of a tile.

    Yields
    ------
    (extent, numpy.ndarray)
        The target region inside the (N, N) similarity matrix together
        with the dense similarities computed for that region.
    '''
    num_rows, num_items = array.shape[0], array.shape[1]

    tile_ratings = array.fetch(ex)
    # First item (column) this worker is responsible for.
    first_item = ex.ul[1]
    tile_norm = item_norm[first_item:ex.lr[1]]
    # Column vector of the local norms, shape (N1, 1).
    tile_norm = tile_norm.reshape(tile_norm.shape[0], 1)

    assert tile_ratings.shape[0] == num_rows

    # First item (column) of the batch that will be fetched next.
    batch_start = 0
    round_no = 0

    while batch_start < num_items:
        util.log_info("Round : %s on %s", round_no, socket.gethostname())
        round_no += 1

        # The last tile of the rating matrix may hold fewer than ``step``
        # items; shrink the batch so it ends exactly at the last item.
        remaining = num_items - batch_start
        if remaining <= step:
            step = remaining
        batch_end = batch_start + step

        with util.TIMER.item_fetching:
            # Fetch the ratings of the remote items. The matrix is sparse,
            # so this is not expected to be very expensive.
            batch_ratings = array[:, batch_start:batch_end]
            batch_norm = item_norm[batch_start:batch_end]
            # Row vector of the remote norms, shape (1, N2).
            batch_norm = batch_norm.reshape(1, batch_norm.shape[0])

        with util.TIMER.calculate_similarities:
            # All-pairs similarities between local and remote items.
            # tile_ratings has shape (M, N1), batch_ratings has shape
            # (M, N2). The cosine similarity is defined as
            #
            #     simi(V1, V2) = dot(V1, V2) / (|| V1 || * || V2 ||)
            #
            # For efficiency it is evaluated as matrix products:
            #   * tile_ratings.T.dot(batch_ratings) is an N1 x N2 matrix S
            #     with S[i, j] = dot(Vi, Vj);
            #   * tile_norm.dot(batch_norm) is an N1 x N2 matrix P with
            #     P[i, j] = || Vi || * || Vj ||.
            # Dividing S by P element-wise gives the similarities.
            dot_products = np.array(
                tile_ratings.T.dot(batch_ratings).todense())
            norm_products = tile_norm.dot(batch_norm)
            # nan_to_num cleans up entries produced by zero norms.
            similarities = np.nan_to_num(dot_products / norm_products)

        # Region of the global similarity matrix this block belongs to.
        upper_left = (first_item, batch_start)
        lower_right = (first_item + similarities.shape[0],
                       batch_start + similarities.shape[1])
        yield extent.create(upper_left, lower_right,
                            (num_items, num_items)), similarities

        # Move on to the next part of the table.
        batch_start = batch_end
コード例 #4
0
def _similarity_mapper(array, ex, item_norm, step):
  ''' Compute cosine similarities between this tile's items and all items.

  The rating columns covered by ``ex`` are compared against the whole
  rating matrix, ``step`` columns at a time. Every batch yields one block
  of the item-by-item similarity matrix.

  Parameters
  ----------
  array : Spartan array of shape (M, N)
          The rating matrix; each column holds the ratings of one item.

  ex : extent
       The tile handled by this call. It has to span all M rows of
       ``array`` (checked with an assertion).

  item_norm : Spartan array of shape(N,)
              The norm values of each item.

  step : Integer.
         How many items are fetched in each iteration; for now this
         equals the number of columns of a tile.

  Yields
  ------
  (extent, numpy.ndarray)
      The target region inside the (N, N) similarity matrix together
      with the dense similarities computed for that region.
  '''
  num_rows, num_items = array.shape[0], array.shape[1]

  tile_ratings = array.fetch(ex)
  # First item (column) this worker is responsible for.
  first_item = ex.ul[1]
  tile_norm = item_norm[first_item:ex.lr[1]]
  # Column vector of the local norms, shape (N1, 1).
  tile_norm = tile_norm.reshape(tile_norm.shape[0], 1)

  assert tile_ratings.shape[0] == num_rows

  # First item (column) of the batch that will be fetched next.
  batch_start = 0
  round_no = 0

  while batch_start < num_items:
    util.log_info("Round : %s on %s", round_no, socket.gethostname())
    round_no += 1

    # The last tile of the rating matrix may hold fewer than ``step``
    # items; shrink the batch so it ends exactly at the last item.
    remaining = num_items - batch_start
    if remaining <= step:
      step = remaining
    batch_end = batch_start + step

    with util.TIMER.item_fetching:
      # Fetch the ratings of the remote items. The matrix is sparse,
      # so this is not expected to be very expensive.
      batch_ratings = array[:, batch_start:batch_end]
      batch_norm = item_norm[batch_start:batch_end]
      # Row vector of the remote norms, shape (1, N2).
      batch_norm = batch_norm.reshape(1, batch_norm.shape[0])

    with util.TIMER.calculate_similarities:
      # All-pairs similarities between local and remote items.
      # tile_ratings has shape (M, N1), batch_ratings has shape (M, N2).
      # The cosine similarity is defined as
      #
      #     simi(V1, V2) = dot(V1, V2) / (|| V1 || * || V2 ||)
      #
      # For efficiency it is evaluated as matrix products:
      #   * tile_ratings.T.dot(batch_ratings) is an N1 x N2 matrix S
      #     with S[i, j] = dot(Vi, Vj);
      #   * tile_norm.dot(batch_norm) is an N1 x N2 matrix P with
      #     P[i, j] = || Vi || * || Vj ||.
      # Dividing S by P element-wise gives the similarities.
      dot_products = np.array(tile_ratings.T.dot(batch_ratings).todense())
      norm_products = tile_norm.dot(batch_norm)
      # nan_to_num cleans up entries produced by zero norms.
      similarities = np.nan_to_num(dot_products / norm_products)

    # Region of the global similarity matrix this block belongs to.
    upper_left = (first_item, batch_start)
    lower_right = (first_item + similarities.shape[0],
                   batch_start + similarities.shape[1])
    yield extent.create(upper_left, lower_right,
                        (num_items, num_items)), similarities

    # Move on to the next part of the table.
    batch_start = batch_end