Example 1
    def _phase2(self):
        """
		Execute phase 2 of the SP region. This phase is used to compute the
		active columns.
		
		Note - This should only be called after phase 1 has been called and
		after the inhibition radius and neighborhood have been updated.
		"""

        # Shift the outputs
        self.y[:, 1:] = self.y[:, :-1]
        self.y[:, 0] = 0

        # Calculate k
        #   - For a column to be active its overlap must be above the overlap
        #     value of the k-th largest column in its neighborhood.
        k = self._get_num_cols()

        if self.global_inhibition:
            # The neighborhood is all columns, thus the set of active columns
            # is simply columns that have an overlap above the k-th largest
            # in the entire region

            # Compute the winning column indexes
            if self.learn:
                # Randomly break ties
                ix = bn.argpartsort(
                    -self.overlap[:, 0] -
                    self.prng.uniform(.1, .2, self.ncolumns), k)[:k]
            else:
                # Choose the same set of columns each time
                ix = bn.argpartsort(-self.overlap[:, 0], k)[:k]

            # Set the active columns
            self.y[ix, 0] = self.overlap[ix, 0] > 0
        else:
            # The neighborhood is bounded by the inhibition radius, therefore
            # each column's neighborhood must be considered

            for i in xrange(self.ncolumns):
                # Get the neighbors
                ix = np.where(self.neighbors[i])[0]

                # Compute the minimum top overlap
                if ix.shape[0] <= k:
                    # Desired number of candidates is at or below the desired
                    # activity level, so find the overall max
                    m = max(bn.nanmax(self.overlap[ix, 0]), 1)
                else:
                    # Desired number of candidates is above the desired
                    # activity level, so find the k-th largest
                    m = max(-bn.partsort(-self.overlap[ix, 0], k + 1)[k], 1)

                # Set the column activity
                if self.overlap[i, 0] >= m: self.y[i, 0] = True
Example 2
	def _phase2(self):
		"""
		Execute phase 2 of the SP region. This phase is used to compute the
		active columns.
		
		Note - This should only be called after phase 1 has been called and
		after the inhibition radius and neighborhood have been updated.
		"""
		
		# Shift the outputs
		self.y[:, 1:] = self.y[:, :-1]
		self.y[:, 0] = 0
		
		# Calculate k
		#   - For a column to be active its overlap must be above the overlap
		#     value of the k-th largest column in its neighborhood.
		k = self._get_num_cols()
		
		if self.global_inhibition:
			# The neighborhood is all columns, thus the set of active columns
			# is simply columns that have an overlap above the k-th largest
			# in the entire region
			
			# Compute the winning column indexes
			if self.learn:				
				# Randomly break ties
				ix = bn.argpartsort(-self.overlap[:, 0] -
					self.prng.uniform(.1, .2, self.ncolumns), k)[:k]
			else:
				# Choose the same set of columns each time
				ix = bn.argpartsort(-self.overlap[:, 0], k)[:k]
			
			# Set the active columns
			self.y[ix, 0] = self.overlap[ix, 0] > 0
		else:
			# The neighborhood is bounded by the inhibition radius, therefore
			# each column's neighborhood must be considered
			
			for i in xrange(self.ncolumns):
				# Get the neighbors
				ix = np.where(self.neighbors[i])[0]
				
				# Compute the minimum top overlap
				if ix.shape[0] <= k:
					# Desired number of candidates is at or below the desired
					# activity level, so find the overall max
					m = max(bn.nanmax(self.overlap[ix, 0]), 1)
				else:
					# Desired number of candidates is above the desired
					# activity level, so find the k-th largest
					m = max(-bn.partsort(-self.overlap[ix, 0], k + 1)[k], 1)
				
				# Set the column activity
				if self.overlap[i, 0] >= m: self.y[i, 0] = True
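Note (not part of the examples above): bn.argpartsort only exists in Bottleneck releases older than 1.0; NumPy's np.argpartition gives the same unordered partial sort. A minimal, self-contained sketch of the global-inhibition branch above, with toy stand-ins for self.overlap, self.ncolumns and k:

import numpy as np

# Hypothetical stand-ins for the SP attributes used in _phase2
overlap = np.array([3., 0., 5., 5., 1., 2.])
ncolumns = overlap.size
k = 3

# Randomly break ties, as in the learning branch above
noise = np.random.uniform(.1, .2, ncolumns)
ix = np.argpartition(-(overlap + noise), k - 1)[:k]

# A column only becomes active if its overlap is positive
active = np.zeros(ncolumns, dtype=bool)
active[ix] = overlap[ix] > 0
print(active)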
Example 3
def extractIrIcR(line, d1):
    # zeroIdx assumes that there is no offset in the axis
    zeroIdx = round(d1.start*d1.pt/(d1.start-d1.stop))
    zeroIdx = 452  # Temp override the autofound value
    l1 = line[:zeroIdx]
    l2 = line[zeroIdx:]
    IrIdx = np.sort(bn.argpartsort(-l1, 3)[:3])
    IcIdx = np.sort(bn.argpartsort(-l2, 13)[:13]) + zeroIdx
    # store only peaks closest to zero
    Ir = d1.lin[IrIdx[-1]]*1e-6
    Ic = d1.lin[IcIdx[1]]*1e-6
    R = np.mean(line[IrIdx[:]])
    return Ir, Ic, R
Example 4
def MAP_at_k_batch(train_data, heldout_data, Et, Eb, user_idx, mu=None, k=100,
                   vad_data=None):
    '''
    mean average precision@k
    '''
    batch_users = user_idx.stop - user_idx.start

    X_pred = _make_prediction(train_data, Et, Eb, user_idx, batch_users, mu=mu,
                              vad_data=vad_data)
    idx_topk_part = bn.argpartsort(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]

    aps = np.zeros(batch_users)
    for i, idx in enumerate(xrange(user_idx.start, user_idx.stop)):
        actual = heldout_data[idx].nonzero()[1]
        if len(actual) > 0:
            predicted = idx_topk[i]
            aps[i] = apk(actual, predicted, k=k)
        else:
            aps[i] = np.nan
    return aps
Example 5
def precision_at_k_batch(train_data,
                         vad_data,
                         test_data,
                         Et,
                         Eb,
                         user_idx,
                         k=20,
                         normalize=True):
    batch_users = user_idx.stop - user_idx.start

    X_pred = _make_prediction(train_data, vad_data, Et, Eb, user_idx,
                              batch_users)
    idx = bn.argpartsort(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.tile(np.arange(batch_users), (k, 1)).T, idx[:, :k]] = True

    X_true_binary = (test_data[user_idx] > 0).toarray()
    tmp = (np.logical_and(X_true_binary,
                          X_pred_binary).sum(axis=1)).astype(np.float32)

    if normalize:
        precision = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    else:
        precision = tmp / k
    return precision
Example 6
 def get_non_elites(self, limit):
     # get indexes of chromosomes that are not elites
     indexes = bn.argpartsort(-self.fitnesses, n=self.NUM_ELITES)
     num_no_elites = self.NUM_CHROMOSOMES - self.NUM_ELITES
     no_elites = indexes[-num_no_elites:]
     np.random.shuffle(no_elites)
     return no_elites[:limit]
Example 7
def min_k_indices(arr, k, inv_ind=False):
    """Returns indices of the k-smallest values in each row, unsorted.
  The `inv_ind` flag returns the tuple (k-smallest,(n-k)-largest). """
    psorted = argpartsort(arr, k)
    if inv_ind:
        return psorted[..., :k], psorted[..., k:]
    return psorted[..., :k]
Example 8
 def get_non_elites(self, limit):
     # get indexes of chromosomes that are not elites
     indexes = bn.argpartsort(-self.fitnesses, n=self.NUM_ELITES)
     num_no_elites = self.NUM_CHROMOSOMES - self.NUM_ELITES
     no_elites = indexes[-num_no_elites:]
     np.random.shuffle(no_elites)
     return no_elites[:limit]
Example 9
def min_k_indices(arr, k, inv_ind=False):
    '''Returns indices of the k-smallest values in each row, unsorted.
  The `inv_ind` flag returns the tuple (k-smallest,(n-k)-largest). '''
    psorted = argpartsort(arr, k)
    if inv_ind:
        return psorted[..., :k], psorted[..., k:]
    return psorted[..., :k]
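A hedged usage sketch for min_k_indices: the same contract rebuilt on np.argpartition (argpartsort is the pre-1.0 Bottleneck spelling), applied to a toy 2-D array:

import numpy as np

def min_k_indices_np(arr, k, inv_ind=False):
    # same behaviour as min_k_indices above, but using NumPy's partition
    psorted = np.argpartition(arr, k - 1, axis=-1)
    if inv_ind:
        return psorted[..., :k], psorted[..., k:]
    return psorted[..., :k]

arr = np.array([[9., 1., 7., 3.],
                [2., 8., 4., 6.]])
small, rest = min_k_indices_np(arr, 2, inv_ind=True)
print(small)  # per-row indices of the 2 smallest values, unordered
print(rest)   # per-row indices of the remaining (larger) values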
Example 10
def recall_at_k_batch(train_data,
                      heldout_data,
                      Et1,
                      Eb1,
                      Et2,
                      Eb2,
                      user_idx,
                      k=20,
                      normalize=True,
                      mu=None,
                      vad_data=None):
    batch_users = user_idx.stop - user_idx.start

    X_pred = _make_prediction(train_data,
                              Et1,
                              Eb1,
                              Et2,
                              Eb2,
                              user_idx,
                              batch_users,
                              mu=mu,
                              vad_data=vad_data)
    idx = bn.argpartsort(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

    X_true_binary = (heldout_data[user_idx] > 0).toarray()
    tmp = (np.logical_and(X_true_binary,
                          X_pred_binary).sum(axis=1)).astype(np.float32)
    recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    return recall
Example 11
def eval_multiple(true_scores, pred_scores, topk):
    if not isinstance(pred_scores, np.ndarray):
        pred_scores = np.array(pred_scores)
    if not isinstance(true_scores, np.ndarray):
        true_scores = np.array(true_scores)
    # convert to arrays first: negating a plain list would fail below
    idx = bottleneck.argpartsort(-pred_scores, topk)[:topk]
    noise = np.random.random(topk)
    rec = sorted(zip(pred_scores[idx], noise, true_scores[idx]), reverse=True)
    nhits = 0.0
    nhits_topk = 0.0
    k = topk if topk >= 0 else len(rec)
    sumap = 0.0
    for i in range(len(rec)):
        if rec[i][-1] != 0.0:
            nhits += 1.0
            if i < k:
                nhits_topk += 1
                sumap += nhits / (i + 1.0)
    nhits = np.sum(true_scores)
    if nhits != 0:
        sumap /= min(nhits, k)
        map_at_k = sumap
        recall_at_k = nhits_topk / nhits
        precision_at_k = nhits_topk / k
    else:
        map_at_k = 0.0
        recall_at_k = 0.0
        precision_at_k = 0.0

    return map_at_k, recall_at_k, precision_at_k
Example 12
 def process(self, image):
     '''Performs llc encoding.
     '''
     K = self.specs.get('k', 5)
     reg = self.specs.get('reg', 1e-4)
     D = self.dictionary
     shape = image.shape[:-1]
     X = image.reshape((np.prod(shape), image.shape[-1]))
     # D_norm is the precomputed norm of the entries
     if 'D_norm' not in self.specs:
         self.specs['D_norm'] = (D**2).sum(1) / 2.
     D_norm = self.specs['D_norm']
     distance = mathutil.dot(X, -D.T)
     distance += D_norm
     # find the K closest indices
     if bn is not None:
         # use bottleneck which would be faster
         IDX = bn.argpartsort(distance, K, axis=1)[:, :K]
     else:
         IDX = np.argsort(distance,1)[:, :K]
     # do LLC approximate coding
     coeff = np.zeros((X.shape[0], D.shape[0]))
     ONES = np.ones(K)
     Z = np.empty((K, D.shape[1]))
     for i in range(X.shape[0]):
         # shift to origin
         Z[:] = D[IDX[i]]
         Z -= X[i]
         # local covariance
         C = mathutil.dot(Z,Z.T)
         # add regularization
         C.flat[::K+1] += reg * C.trace()
         w = np.linalg.solve(C,ONES)
         coeff[i][IDX[i]] = w / w.sum()
     return coeff.reshape(shape + (coeff.shape[1],))
Example 13
def NDCG_binary_at_k_batch(train_data, heldout_data, Et, Eb, user_idx,
                           mu=None, k=100, vad_data=None):
    '''
    normalized discounted cumulative gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
    '''
    batch_users = user_idx.stop - user_idx.start

    X_pred = _make_prediction(train_data, Et, Eb, user_idx,
                              batch_users, mu=mu, vad_data=vad_data)
    idx_topk_part = bn.argpartsort(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # build the discount template
    tp = 1. / np.log2(np.arange(2, k + 2))

    heldout_batch = heldout_data[user_idx]
    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG
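As a reading aid for the discount template tp above: it holds the usual binary-relevance DCG weights 1/log2(rank+1) for ranks 1..k. A tiny dense illustration with hypothetical toy values (the real function works on sparse heldout data):

import numpy as np

k = 5
tp = 1. / np.log2(np.arange(2, k + 2))        # discounts for ranks 1..k

ranked_relevance = np.array([1, 0, 1, 1, 0])  # toy relevance of the ranked top-k
DCG = (ranked_relevance * tp).sum()

n_relevant = 3                                # toy number of held-out items
IDCG = tp[:min(n_relevant, k)].sum()          # best achievable ordering
print(DCG / IDCG)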
Example 14
def recall_at_k_batch(train_data,
                      heldout_data,
                      Et,
                      Eb,
                      user_idx,
                      k=20,
                      vad_data=None):
    batch_users = user_idx.stop - user_idx.start

    X_pred = _make_prediction(train_data,
                              Et,
                              Eb,
                              user_idx,
                              batch_users,
                              vad_data=vad_data)
    idx = bn.argpartsort(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.tile(np.arange(batch_users), (k, 1)).T, idx[:, :k]] = True

    X_true_binary = (heldout_data[user_idx] > 0).toarray()
    tmp = (np.logical_and(X_true_binary,
                          X_pred_binary).sum(axis=1)).astype(np.float32)

    recall = tmp / X_true_binary.sum(axis=1)
    return recall
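The recall variants above all build the same row-wise top-k boolean mask; a self-contained sketch of just that step, using np.argpartition in place of bn.argpartsort and made-up scores:

import numpy as np

X_pred = np.array([[.1, .9, .3, .8, .2],      # toy scores: 2 users x 5 items
                   [.7, .2, .6, .1, .4]])
k = 2

idx = np.argpartition(-X_pred, k - 1, axis=1)[:, :k]  # unordered top-k per row
X_pred_binary = np.zeros_like(X_pred, dtype=bool)
X_pred_binary[np.arange(X_pred.shape[0])[:, np.newaxis], idx] = True
print(X_pred_binary)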
Example 15
def newton_optimized(values, k, bottom=False):
    T = values[:k]
    prec_avg = weighted_average(T)
    while True:
        if bottom:
            T_next = argpartsort(([vj - prec_avg * wj for (vj, wj) in values]),
                                 k)[:k]
        else:
            T_next = argpartsort(([prec_avg * wj - vj for (vj, wj) in values]),
                                 k)[:k]

        now_avg = weighted_average(
            ((values[j][0], values[j][1]) for j in T_next))
        if abs(prec_avg - now_avg) < 10**(-9):
            break
        prec_avg = now_avg
    return now_avg
Example 16
 def _knn_euclidean(X, mask, Xc, k, verbose=False):
     t = time.time()
     nn_ind = np.zeros((X.shape[0], k), dtype=int)
     for n in xrange(X.shape[0]):
         dists = euclidean_distances(Xc[:, mask[n, :]], X[n, mask[n, :]], squared=True).flatten()
         nn_ind[n, :] = bn.argpartsort(dists, k)[:k]
     if verbose:
         print('Finished knn in {:.3f} s'.format(time.time() - t))
     return nn_ind
Example 17
 def _knn_dot(X, mask, Xc, k, verbose=False):
     t = time.time()
     nn_ind = np.zeros((X.shape[0], k), dtype=int)
     for n in xrange(X.shape[0]):
         dists = -np.dot(Xc[:, mask[n, :]], X[n, mask[n, :]])
         nn_ind[n, :] = bn.argpartsort(dists, k)[:k]
     if verbose:
         print('Finished knn in {:.3f} s'.format(time.time() - t))
     return nn_ind
Example 18
 def argsort(x, topn=None):
     if topn is None:
         topn = x.size
     if topn <= 0:
         return []
     if topn >= x.size:
         return numpy.argsort(x)[::-1]
     biggest = bottleneck.argpartsort(x, x.size - topn)[-topn:]
     return biggest.take(numpy.argsort(x.take(biggest))[::-1])
Example 19
 def _knn_dot(X, mask, Xc, k, verbose=False):
     t = time.time()
     nn_ind = np.zeros((X.shape[0], k), dtype=int)
     for n in xrange(X.shape[0]):
         dists = -np.dot(Xc[:, mask[n, :]], X[n, mask[n, :]])
         nn_ind[n, :] = bn.argpartsort(dists, k)[:k]
     if verbose:
         print('Finished knn in {:.3f} s'.format(time.time() - t))
     return nn_ind
Example 20
def retrieve(code, k):
    """ Retrieve top k nearest images from training set """
    assert(train_hash_code.shape[0] > k), 'Invalid k'
    inv_code = 1 - code
    hamming_dist = 48 - (np.dot(train_hash_code, code) + np.dot(inv_train_hash_code, inv_code))
    top_k_idx_unordered = bn.argpartsort(hamming_dist, k)[:k] # get indices of k minimum
    top_k_hamm_dist = hamming_dist[top_k_idx_unordered]
    top_k_idx_ordered = top_k_idx_unordered[top_k_hamm_dist.argsort()]
    return top_k_idx_ordered
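The retrieve function relies on module-level hash-code arrays, so here is a self-contained sketch of its "partition first, then sort only the k survivors" pattern on made-up distances (the names here are hypothetical, not the module's globals):

import numpy as np

hamming_dist = np.array([5, 1, 9, 3, 7, 2])
k = 3

top_k_unordered = np.argpartition(hamming_dist, k - 1)[:k]  # k minima, unordered
top_k_ordered = top_k_unordered[hamming_dist[top_k_unordered].argsort()]
print(top_k_ordered)  # indices of the 3 closest codes, nearest first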
Example 21
    def get_top_n(self, probs, labels):
        '''
		Get top n most likely subreddits. Each row in probs should correspond to the subreddit probability distribution 
		over an item in the prediction set. Labels should be subreddit labels which reflect the ordering of probs.
		
		Resources: http://stackoverflow.com/questions/10337533/a-fast-way-to-find-the-largest-n-elements-in-an-numpy-array
		'''

        return labels[bn.argpartsort(-probs, self.num_suggestions,
                                     axis=1)[:, :self.num_suggestions]]
Example 22
	def get_top_n(self, probs, labels):

		'''
		Get top n most likely subreddits. Each row in probs should correspond to the subreddit probability distribution 
		over an item in the prediction set. Labels should be subreddit labels which reflect the ordering of probs.
		
		Resources: http://stackoverflow.com/questions/10337533/a-fast-way-to-find-the-largest-n-elements-in-an-numpy-array
		'''

		return labels[bn.argpartsort(-probs,self.num_suggestions,axis=1)[:,:self.num_suggestions]]
Example 23
def find_geo_NN(lat, long, location_data, K=1):
    #location_data is a 2-d nx2 numpy array of lat-long coordinates.
    v = ((location_data - np.array([lat, long]))**2).sum(axis=1)
    ix2 = bn.argpartsort(v, K, axis=None)[:K]
    #print ix
    #ix2 =ix[ np.nonzero( v[ix] < 100)]
    #ix2 = np.append( ix2, np.random.randint(0, location_data.shape[0], (1, K - ix2.shape[0] )  )  )

    #print ix2
    return ix2
Example 24
def find_geo_NN( lat, long, location_data, K = 1 ):
    #location_data is a 2-d nx2 numpy array of lat-long coordinates.
    v = (( location_data - np.array( [lat, long] )  )**2).sum(axis=1)
    ix2 = bn.argpartsort( v, K, axis=None)[:K]
    #print ix
    #ix2 =ix[ np.nonzero( v[ix] < 100)]
    #ix2 = np.append( ix2, np.random.randint(0, location_data.shape[0], (1, K - ix2.shape[0] )  )  )
    
    #print ix2
    return ix2
Example 25
 def argsort(x, topn=None):
     """Return indices of the `topn` greatest elements in numpy array `x`, in order."""
     if topn is None:
         topn = x.size
     if topn <= 0:
         return []
     if topn >= x.size:
         return numpy.argsort(x)[::-1]
     biggest = bottleneck.argpartsort(x, x.size - topn)[-topn:]
     # the indices in `biggest` are not sorted by magnitude => sort & return
     return biggest.take(numpy.argsort(x.take(biggest))[::-1])
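A usage sketch for the argsort helper above. The same partial-sort trick is shown with plain NumPy, since bottleneck.argpartsort is only present in Bottleneck < 1.0:

import numpy as np

x = np.array([0.1, 0.7, 0.3, 0.9, 0.5])
topn = 3

biggest = np.argpartition(x, x.size - topn)[-topn:]        # unordered top-n indices
print(biggest.take(np.argsort(x.take(biggest))[::-1]))     # -> [3 1 4], best first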
Example 26
 def _knn_euclidean(X, mask, Xc, k, verbose=False):
     t = time.time()
     nn_ind = np.zeros((X.shape[0], k), dtype=int)
     for n in xrange(X.shape[0]):
         dists = euclidean_distances(Xc[:, mask[n, :]],
                                     X[n, mask[n, :]],
                                     squared=True).flatten()
         nn_ind[n, :] = bn.argpartsort(dists, k)[:k]
     if verbose:
         print('Finished knn in {:.3f} s'.format(time.time() - t))
     return nn_ind
Example 27
    def get_rec(self, algname, user, max=10, **param):
        """Wrapper aroung the real recommendation getter to set parameters"""
        if algname == 'AsySVD':
            param = dict(param, userToTest=user)

        rec = self._get_rec(algname, user, **param)
        rec = numpy.squeeze(rec)
        indexes = bottleneck.argpartsort(rec, rec.size - max, axis=0)[-max:]
        rec = [(rec[index], index) for index in indexes]
        rec.sort(reverse=True)
        return rec
Example 28
 def argsort(x, topn=None):
     """Return indices of the `topn` greatest elements in numpy array `x`, in order."""
     if topn is None:
         topn = x.size
     if topn <= 0:
         return []
     if topn >= x.size:
         return numpy.argsort(x)[::-1]
     biggest = bottleneck.argpartsort(x, x.size - topn)[-topn:]
     # the indices in `biggest` are not sorted by magnitude => sort & return
     return biggest.take(numpy.argsort(x.take(biggest))[::-1])
Example 29
    def get_rec(self, algname, user, max=10, **param):
        """Wrapper aroung the real recommendation getter to set parameters"""
        if algname == 'AsySVD':
            param = dict(param, userToTest=user)

        rec = self._get_rec(algname, user, **param)
        rec = numpy.squeeze(rec)
        indexes = bottleneck.argpartsort(rec, rec.size-max, axis=0)[-max:]
        rec = [(rec[index], index) for index in indexes]
        rec.sort(reverse=True)
        return rec
Example 30
 def _knn_euclid_helper(self, D):
     """Helper function to reduce memory consumption.
     """        
     dist = distance.cdist(D, self.C, 'sqeuclidean')
     L = D.shape[0]  # number of samples
     k = self.K  # number of nearest neighbours to return
     k_idx = argpartsort(dist, k, 1)[:, :k]  # getting K smallest indices, unordered
     k_dist = dist[[[t]*k for t in xrange(L)], k_idx]  # get distances, unordered
     idx = np.argsort(k_dist, 1)  # get correct ordering
     k_dist = k_dist[[[t]*k for t in xrange(L)], idx]  # apply ordering to distances
     k_idx = k_idx[[[t]*k for t in xrange(L)], idx]  # apply ordering to indices
     return k_idx, k_dist
Example 31
 def most_similar_to(self, vec):
     """ Get top num most similar vectors
     """
     vecnorm = NP.sqrt(NP.sum(vec * vec))
     numerator = NP.sum(vec.reshape(1, -1) * self.vecs, axis=1)
     denominator = vecnorm * self.vecsnorm
     sims = numerator / denominator
     if 0 < self.simnum < sims.shape[0]:
         n_idc = BN.argpartsort(-sims, self.simnum, axis=None)[:self.simnum]
         return NP.array([self.vecs[i] for i in n_idc], dtype=NP.float32), \
             NP.array(sims[n_idc], dtype=NP.float32)
     else:
         return self.vecs, sims
Example 32
def CalcSimilarUsersSongs(userid):
	usersongsset = userDict[userid].keys()
	usersongintersection = p.DataFrame(index=[userid])
	top5similarusers = []
	for otheruserid in userDict.iterkeys():
		otherusersongsset = userDict[otheruserid].keys()
		usersongintersection.insert(0, otheruserid, len(np.intersect1d(usersongsset, otherusersongsset, False)), False)
	# rank the 5 most similar users once all intersections are known
	top5similarusers = usersongintersection.loc[userid][bn.argpartsort(-usersongintersection.loc[userid], 5)[:5]].index.values
	unlistenedsongs = np.array([])
	for similaruserid in top5similarusers:
		othersongs = userDict[similaruserid].keys()
		unlistenedsongs = np.union1d(unlistenedsongs, np.setdiff1d(othersongs, usersongsset))
		if len(unlistenedsongs) >= 5:
			break
	return unlistenedsongs
Example 33
def recall_at_k_batch(train_data, heldout_data, Et, Eb, user_idx,
                      k=20, normalize=True, mu=None, vad_data=None):
    batch_users = user_idx.stop - user_idx.start

    X_pred = _make_prediction(train_data, Et, Eb, user_idx,
                              batch_users, mu=mu, vad_data=vad_data)
    idx = bn.argpartsort(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

    X_true_binary = (heldout_data[user_idx] > 0).toarray()
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(
        np.float32)
    recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    return recall
Example 34
def get_dist_bln(P, Q, A, m, K):
    """Distance function using Bottleneck.
    """
    Z = (P+Q).dot(A)
    Z = Z**m
    Z[Z==0] = 1
    D = (P-Q)/Z
    # calc only diagonal of dot operation, for multiple vectors at a time
    DA = D.dot(A)
    sqdist = np.einsum('ij,ji->i', DA, D.T)  # squared distance
    # getting the closest K members
    k_idx = argpartsort(sqdist, K)[:K]
    k_dist = sqdist[k_idx]
    idx = np.argsort(k_dist)
    k_dist = k_dist[idx] ** 0.5
    k_idx = k_idx[idx]
    return (k_idx, k_dist)
Example 35
def recall_at_multiple_ks_batch(train_data, heldout_data, Et, Eb, user_idx,
                                topks, vad_data):
    batch_users = user_idx.stop - user_idx.start

    X_pred = rec_eval._make_prediction(train_data, Et, Eb, user_idx,
                                       batch_users, vad_data=vad_data)
    recalls = np.empty((len(topks), batch_users))
    for i, k in enumerate(topks):
        idx = bn.argpartsort(-X_pred, k, axis=1)
        X_pred_binary = np.zeros_like(X_pred, dtype=bool)
        X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

        X_true_binary = (heldout_data[user_idx] > 0).toarray()
        tmp = (np.logical_and(X_true_binary,
                              X_pred_binary).sum(axis=1)).astype(np.float32)
        recalls[i] = tmp / X_true_binary.sum(axis=1)
    return recalls
Example 36
def get_n_max_sal(sal, n):
    """
    Receives:
        saliency map
        n, number of points to return

    Returns:
        numpy array with the indices of maximum points
    """

    # max number of candidate indices
    max_cand = n

    # distance threshold for distance based cutting out
    #thresh = 10

    sal_ = sal
    #max_indices = bn.argpartsort(-1*sal_.flatten(), n)[:n]
    size = sal_.shape[0] * sal.shape[1]
    #cand_indices = bn.argpartsort(-1*sal_.flatten(), max_cand)[:max_cand]
    max_indices = bn.argpartsort(-1 * sal_.flatten(), n)[:n]

    # pdist needs matrix, not 1d array
    #max_indices = np.reshape(max_indices, (max_indices.shape, 1))

    # iterating over max indices to see if there are points too close to each
    # other. if there is, we must eliminate one of them and pick the next one
    # after the n max saliency points
    """
    incr = 0

    for i in range(n):

        # obtaining distance matrix  
        dist_mat = squareform(pdist(max_indices, 'minkowski', 1)) 

        for j in range(n):

            if (dist_mat[i, j] < thres) and (i != j):

                max_indices[j] = cand_indices[n+incr]
                incr += 1
    """
    max_indices = np.unravel_index(max_indices, sal_.shape)
    print(max_indices)
    return max_indices
Example 37
def get_wifi_top(wifi_str='', ntop=15):  # get the shop IDs for the top-15 strongest wifi signals
    str_list = wifi_str.split(';')
    wifi_ifos = np.array([x.strip(' ').split('|')[:3] for x in str_list])
    w_name = wifi_ifos[:, 0]  # get the wifi names from the list
    w_value = [int(x) for x in wifi_ifos[:, 1]]
    w_state = wifi_ifos[:, 2]  # connection status
    if 'true' in w_state:
        connection_wifi_name = w_name[w_state.tolist().index('true')]
    else:
        connection_wifi_name = 'unkown'
    if len(wifi_ifos) > ntop:
        top_5_idx = bottleneck.argpartsort(-np.array(w_value), ntop)[:ntop]
        return wf_name_2_idx(w_name[top_5_idx]), wf_name_2_idx([connection_wifi_name])
    else:
        sort_idx = np.argsort(-np.array(w_value))
        w_name = w_name[sort_idx].tolist()
        w_name.extend(['b_null'] * (ntop - len(wifi_ifos)))
        return wf_name_2_idx(w_name), wf_name_2_idx([connection_wifi_name])
Example 38
File: learn.py Project: xgdgsc/plsi
    def output(self,outpath,words,z):
        #topIndices=bn.argpartsort(-self.p_wz_n[:][z])
        with open(os.path.join(outpath,str(z)+"-topic.txt"),'w') as outfile:
            for i in range(z):
                #print self.p_wz_n.shape
                #print self.p_wz_n[:,i]
                topIndices=bn.argpartsort(-self.p_wz_n[:,i],20)[:20]
                #                print topIndices
                topList=[]
                for index in topIndices:
                    topList.append([index,self.p_wz_n[index,i]])
                sortedList=sorted(topList,key=lambda x:(-x[1]))

                outfile.write("Topic "+str(i)+":\n")
                for w in sortedList:
                    outfile.write(words[w[0]+1]+":"+str(w[1])+"\n")


        with open(os.path.join(outpath,"k-likelihood-time.txt"),'a') as outfile:
            outfile.write(str(z)+' '+str(self.log_like)+' '+str(self.time)+'\n')
Example 39
def nn(feat, feats, distance='euclidean', K=-1):
    """
    Exact nearest neighbor search through exhaustive comparison.
    """
    if distance == 'manhattan':
        dists = metrics.manhattan_distances(feat, feats)
    elif distance == 'euclidean':
        dists = metrics.euclidean_distances(feat, feats, squared=True)
    elif distance == 'chi_square':
        dists = -metrics.additive_chi2_kernel(feat, feats)

    dists = dists.flatten()
    if K > 0:
        nn_ind = bn.argpartsort(dists, K).flatten()[:K]
        nn_ind = nn_ind[np.argsort(dists[nn_ind])]
    else:
        nn_ind = np.argsort(dists)
    nn_dist = dists[nn_ind]

    return nn_ind, nn_dist
Example 40
def bn_topargn(arr, N, ascending=None):
    """
    Return the indices of the top N results. 
    The following should be equivalent

    >>> res1 = arr[bn_topargn(arr, 10)] 
    >>> res2 = bn_topn(arr, 10)
    >>> np.all(res1 == res2)
        True
    """
    if arr.ndim > 1:
        raise Exception("Only works on ndim=1")
    if ascending is None:
        ascending = not N > 0

    na_mask = np.isnan(arr)
    has_na = na_mask.sum()
    if has_na:
        # store the old indices for translating back later
        old_index_map = np.where(~na_mask)[0]
        arr = arr[~na_mask]

    if N > 0: # nlargest
        N = len(arr) - abs(N)
        sl = slice(N, None)
    else: # nsmallest
        N = abs(N)
        sl = slice(None, N)
    out = nb.argpartsort(arr, N)
    index = out[sl]
    # sort the index by their values
    index_sort = np.argsort(arr[index])
    if not ascending:
        index_sort = index_sort[::-1]
    index = index[index_sort]

    # index is only correct with arr without nans. 
    # Map back to old_index if needed
    if has_na:
        index = old_index_map[index]
    return index
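A hedged illustration of what bn_topargn is expected to return on a small array with a NaN, checked here with plain NumPy rather than the pre-1.0 Bottleneck alias nb:

import numpy as np

arr = np.array([3., np.nan, 7., 1., 5.])

valid = np.where(~np.isnan(arr))[0]
order = valid[np.argsort(arr[valid])[::-1]]   # valid indices, descending by value
print(order[:2])         # expected bn_topargn(arr, 2):  [2 4], two largest
print(order[-2:][::-1])  # expected bn_topargn(arr, -2): [3 0], two smallest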
Example 41
def keep_k_best(co_occ, k=200):
    """
    Keep the ``k`` best values in the matrix and set the rest to 0. Relies on the bottleneck library for fast sort.

    Args:

     * ``co_occ`` (*ndarray*): input matrix.
     * ``k`` (*int, optional*): number of values to keep. Defaults to 200.

    Returns:

     * ``normalized`` (*ndarray*): normalized matrix.
    """
    import bottleneck as bn
    part = bn.argpartsort(-co_occ, k, axis=1)[:, :k]
    for line in xrange(co_occ.shape[0]):
        c = co_occ[line, :]
        kbest = c[part[line, -1]]
        c[c < kbest] = 0.
        co_occ[line, :] = c
    return co_occ
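A toy illustration of keep_k_best's intended effect, rebuilt on np.partition so it runs without a pre-1.0 Bottleneck install: each row keeps its k largest values and zeroes the rest.

import numpy as np

def keep_k_best_np(co_occ, k=2):
    # per-row threshold at the k-th largest value; everything below it -> 0
    thresh = -np.partition(-co_occ, k - 1, axis=1)[:, k - 1:k]
    co_occ[co_occ < thresh] = 0.
    return co_occ

m = np.array([[5., 1., 3., 2.],
              [0., 4., 4., 1.]])
print(keep_k_best_np(m, k=2))
# [[5. 0. 3. 0.]
#  [0. 4. 4. 0.]]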
Example 42
    def get_knn(self, D):
        """Indexes and distances of *k* nearest neighbours.
        
        Uses *argpartsort* for fast determination of kNN.
        
        Currently used.
        """
        if self.C is None:
            return "Uninitialized centroids"

        
        dist = distance.cdist(D, self.C, 'sqeuclidean')
        L = D.shape[0]  # number of samples
        k = cf._nn_count  # number of nearest neighbours
        k_idx = argpartsort(dist, k, 1)[:, :k]  # getting K smallest indices, unordered
        k_dist = dist[[[t]*k for t in xrange(L)], k_idx]  # get distances, unordered
        idx = np.argsort(k_dist, 1)  # get correct ordering
        k_dist = k_dist[[[t]*k for t in xrange(L)], idx]  # apply ordering to distances
        k_idx = k_idx[[[t]*k for t in xrange(L)], idx]  # apply ordering to indices
        
        return (k_idx, k_dist)
Example 43
    def predict(self, testDat):

        pred = np.zeros(testDat.shape[0])
            
        for j,t in enumerate(testDat):
            
            distances = cdist(t[np.newaxis,:], self.X, 'euclidean').ravel()
            
            index = bn.argpartsort(distances, n=self.k)
            label = self.y[index[:self.k]]
            votes = {}

            for i in label:
                if i in votes:
                    votes[i] += 1
                else:
                    votes[i]  = 1

            pred[j] = max(votes.iteritems(), key=operator.itemgetter(1))[0]
    
        return pred
Example 44
def detect_line(img):
    img_edge = edge_detect.full_detect(img, is_binary=False)

    sum_x = np.sum(img_edge, axis=0)
    # max_xs = bn.argpartsort(-sum_x, 10)[:10]
    max_xs = sum_x.argsort()[-10:]

    sum_y = np.sum(img_edge, axis=1)
    max_ys = bn.argpartsort(-sum_y, 10)[:10]

    height, width = img_edge.shape[:2]

    for x in max_xs:
        # print (x, 0), (x, height)
        cv2.line(img, (x, 0), (x, height), (255, 255, 255), 1)

    for y in max_ys:
        # print (0, y), (width, y)
        cv2.line(img, (0, y), (width, y), (255, 255, 255), 1)

    return img
Example 45
def recall_at_multiple_ks_batch(train_data, heldout_data, Et, Eb, user_idx,
                                topks, vad_data):
    batch_users = user_idx.stop - user_idx.start

    X_pred = rec_eval._make_prediction(train_data,
                                       Et,
                                       Eb,
                                       user_idx,
                                       batch_users,
                                       vad_data=vad_data)
    recalls = np.empty((len(topks), batch_users))
    for i, k in enumerate(topks):
        idx = bn.argpartsort(-X_pred, k, axis=1)
        X_pred_binary = np.zeros_like(X_pred, dtype=bool)
        X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

        X_true_binary = (heldout_data[user_idx] > 0).toarray()
        tmp = (np.logical_and(X_true_binary,
                              X_pred_binary).sum(axis=1)).astype(np.float32)
        recalls[i] = tmp / X_true_binary.sum(axis=1)
    return recalls
Example 46
def detect_line(img):
    img_edge = edge_detect.full_detect(img, is_binary=False)

    sum_x = np.sum(img_edge, axis=0)
    # max_xs = bn.argpartsort(-sum_x, 10)[:10]
    max_xs = sum_x.argsort()[-10:]

    sum_y = np.sum(img_edge, axis=1)
    max_ys = bn.argpartsort(-sum_y, 10)[:10]

    height, width = img_edge.shape[:2]

    for x in max_xs:
        # print (x, 0), (x, height)
        cv2.line(img, (x, 0), (x, height), (255, 255, 255), 1)

    for y in max_ys:
        # print (0, y), (width, y)
        cv2.line(img, (0, y), (width, y), (255, 255, 255), 1)

    return img
Example 47
 def process(self, image, out=None):
     '''Performs llc encoding.
     '''
     K = self.specs.get('k', 5)
     reg = self.specs.get('reg', 1e-4)
     D = self.dictionary
     shape = image.shape[:-1]
     X = image.reshape((np.prod(shape), image.shape[-1]))
     # D_norm is the precomputed norm of the entries
     if 'D_norm' not in self.specs:
         self.specs['D_norm'] = (D**2).sum(1) / 2.
     D_norm = self.specs['D_norm']
     distance = mathutil.dot(X, -D.T)
     distance += D_norm
     # find the K closest indices
     if bn is not None:
         # use bottleneck which would be faster
         IDX = bn.argpartsort(distance, K, axis=1)[:, :K]
     else:
         IDX = np.argsort(distance,1)[:, :K]
     # do LLC approximate coding
     if out is None:
         out = np.zeros((X.shape[0], D.shape[0]))
     else:
         out.resize((X.shape[0], D.shape[0]))
         out[:] = 0
     ONES = np.ones(K)
     Z = np.empty((K, D.shape[1]))
     for i in range(X.shape[0]):
         # shift to origin
         Z[:] = D[IDX[i]]
         Z -= X[i]
         # local covariance
         C = mathutil.dot(Z,Z.T)
         # add regularization
         C.flat[::K+1] += reg * C.trace()
         w = np.linalg.solve(C,ONES)
         out[i][IDX[i]] = w / w.sum()
     out.resize(shape + (out.shape[1],))
     return out
Example 48
def nn(feat, feats, distance='euclidean', K=-1):
    """
    Exact nearest neighbor search through exhaustive comparison.
    """
    if distance == 'manhattan':
        dists = metrics.manhattan_distances(feat, feats)
    elif distance == 'euclidean':
        dists = metrics.euclidean_distances(feat, feats, squared=True)
    elif distance == 'chi_square':
        dists = -metrics.additive_chi2_kernel(feat, feats)
    elif distance == 'dot':
        dists = -np.dot(feat, feats)

    dists = dists.flatten()
    if K > 0:
        nn_ind = bn.argpartsort(dists, K).flatten()[:K]
        nn_ind = nn_ind[np.argsort(dists[nn_ind])]
    else:
        nn_ind = np.argsort(dists)
    nn_dist = dists[nn_ind]

    return nn_ind, nn_dist
Example 49
 def brute_radius_search(self, v, radius2=None, limit=None):
     v = v.flatten().astype(self._data_dtype)
     v_norm2 = bottleneck.ss(v)  # same as sum(v * v)
     d_norm2 = self.get_dataset('norm2', mmap_mode='r')
     dists = d_norm2 + v_norm2 - 2 * np.dot(self.data, v)
     #assert dists.ndim == 1 and not bottleneck.anynan(dists)
     ids = self.ids
     if radius2:
         mask = (dists < radius2)
         dists = dists[mask]
         ids = ids[mask]
     if limit:
         if limit == 1:
             imin = np.argmin(dists)
             return [(dists[imin], ids[imin])]
         else:
             # limit to the smallest values
             smallest_indices = bottleneck.argpartsort(dists, limit)[:limit]
             dists = dists[smallest_indices]
             ids = ids[smallest_indices]
     order = np.argsort(dists)
     return [(dists[i], ids[i]) for i in order]
Example 50
def CalcSimilarSongs(userid):
	totalplaycount, meanplaycount = 0, 0
	usersongs = userDict[userid]
	totalnumofsongs = len(usersongs)

	for songid,pc in usersongs.iteritems():
		totalplaycount = totalplaycount + pc

	meanplaycount = totalplaycount/totalnumofsongs
	highestnormalizedpc = 0.00

	for songid, pc in usersongs.iteritems():
		usersongs[songid] = pc/meanplaycount
		if(usersongs[songid] > highestnormalizedpc):
			highestnormalizedpc = usersongs[songid]
			highnormpcsongid = songid




	top5similarsongs = songdataframe.loc[highnormpcsongid][bn.argpartsort(-songdataframe.loc[highnormpcsongid], 5)[:5]].index.values
	return top5similarsongs
Example 51
def solve(X_train, y_train, X_test, y_test):
    """
    The basic genetic algorithm.
    :return: An uncompiled keras model. The best found Neural Network for the data.
    """
    # Generate pop_size number of random solutions. Solutions may be infeasible
    population = generate_population(pop_size, 136)
    for generation in trange(num_gen):
        # Calculate the "goodness" of each solution and give it a score
        fitness_scores = evaluate_fitness(population, X_train, y_train, X_test, y_test)
        # Introduce concept of elitism. In each generation, num_elite number of best solutions will be chosen to be
        # carried forward to the next iteration without any modifications to them. Instead of sorting entire array,
        # sort it partially so that only top num_elite number of solutions are sorted, and the rest remains same
        if num_elite != 0:
            part_sorted = bn.argpartsort(fitness_scores, fitness_scores.shape[0] - num_elite)
            elite_indices = part_sorted[-num_elite:]  # Sorted top num_elite no of solutions
            remaining_indices = part_sorted[:-num_elite]  # Unsorted solutions
        # Print the best fitness score every 10 generations to see how the algorithm is performing.
        if generation % 10 == 0:
            print(np.max(fitness_scores))
        # Select parents which will create the next generation
        if num_elite != 0:
            parents = select_parents(population[remaining_indices], fitness_scores[remaining_indices])
        else:
            parents = select_parents(population, fitness_scores)
        # Perform a crossover operation on selected parents
        children = crossover(parents, xover_rate)
        # Perform a mutation operation on crossovered parents
        mutate(children, mut_rate)

        if num_elite != 0:
            # Add the elite solutions that were removed from parents, and set them as population (parents) for next gen
            population = np.vstack((children, population[elite_indices]))
        else:
            population = children

    # Find solution with highest fitness scores, i.e. solution with least number of vertices.
    best_solution_index = np.argmax(fitness_scores)
    return population[best_solution_index]
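The elitism step above, isolated from the GA loop as a small sketch with toy fitness values and np.argpartition standing in for bn.argpartsort:

import numpy as np

fitness_scores = np.array([0.4, 0.9, 0.1, 0.7, 0.3])
num_elite = 2

part_sorted = np.argpartition(fitness_scores, fitness_scores.size - num_elite)
elite_indices = part_sorted[-num_elite:]      # indices of the best solutions
remaining_indices = part_sorted[:-num_elite]  # everyone else, unordered
print(elite_indices, remaining_indices)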
Example 52
                def calc():
                    if(k<dist_local.shape[0]):
                        index_local  = bn.argpartsort(dist_local,k,axis=0)[:k,:]
                        
                        for r in xrange(stop_point-ii):
                            dists_cpu_buffer_local = dist_local[index_local[:,r],r];
                            indexes_cpu_buffer_local = index_local[:,r]
                         
                
                            index_local2  = np.argsort(dists_cpu_buffer_local,axis=0)
                        
                            dists[ii+r,:] = dists_cpu_buffer_local[index_local2]
                            indexes[ii+r,:] = indexes_cpu_buffer_local[index_local2]
                            #print ii+r,indexes_cpu_buffer_local[index_local2[0]]
                    else:
                        for r in xrange(stop_point-ii):
                            dists_cpu_buffer_local = dist_local[:,r];

                            index_local2  = np.argsort(dists_cpu_buffer_local,axis=0)
                        
                            dists[ii+r,:] = dists_cpu_buffer_local[index_local2]
                            indexes[ii+r,:] = index_local2                        
Example 53
def frameAnalysis():
    image = np.zeros((4000, 4000), np.uint8)
    for index in range(len(pointAngleArray)):
        theta = pointAngleArray[index]
        rho = pointDistanceArray[index]
        x, y = pol2cart(rho, math.radians(theta))
        if config.debug:
            print x, y
        if -2000 < x < 2000 and -2000 < y < 2000:
            image[2000 + x][2000 + y] = 200
    accumulator, thetas, rhos = hough_line(image)
    arr = np.ravel(accumulator)
    sortedArr = bn.argpartsort(arr, n=arr.shape[0] - config.maxLines)
    possibleLines = sortedArr[-config.maxLines :]
    lines = []
    for x in range(len(possibleLines)):
        index = possibleLines[x]
        rho = rhos[index / accumulator.shape[1]]
        theta = thetas[index % accumulator.shape[1]]
        for i in range(x + 1, len(possibleLines)):
            index2 = possibleLines[i]
            rho2 = rhos[index2 / accumulator.shape[1]]
            theta2 = thetas[index2 % accumulator.shape[1]]
            # print "Theta1: " + repr(theta) + " Theta2: " + repr(theta2) + " Rho1: " + repr(rho) + " Rho2: " + repr(rho2)
            if abs(theta - theta2) < 0.04 and ((rho > 0) == (rho2 > 0)):
                # print "Merge suceeded"
                arr[index] += arr[index2]
                arr[index2] = 0
                theta = (theta + theta2) / 2.0
                rho = (rho + rho2) / 2
            # else:
            #    print "Merge failed"
        if arr[index] >= config.minLength:
            print "index:"
            print arr[index]
            lines.append([arr[index], theta, rho])
            print "rho={0:.2f}, theta={1:.0f}".format(rho, np.rad2deg(theta))
    cv2.imwrite("houghlines3.jpg", image)
Example 54
def frameAnalysis():
    image = np.zeros((4000, 4000), np.uint8)
    for index in range(len(pointAngleArray)):
        theta = pointAngleArray[index]
        rho = pointDistanceArray[index]
        x, y = pol2cart(rho, math.radians(theta))
        if config.debug:
            print x, y
        if -2000 < x < 2000 and -2000 < y < 2000:
            image[2000 + x][2000 + y] = 200
    accumulator, thetas, rhos = hough_line(image)
    arr = np.ravel(accumulator)
    sortedArr = bn.argpartsort(arr, n=arr.shape[0] - config.maxLines)
    possibleLines = sortedArr[-config.maxLines:]
    lines = []
    for x in range(len(possibleLines)):
        index = possibleLines[x]
        rho = rhos[index / accumulator.shape[1]]
        theta = thetas[index % accumulator.shape[1]]
        for i in range(x + 1, len(possibleLines)):
            index2 = possibleLines[i]
            rho2 = rhos[index2 / accumulator.shape[1]]
            theta2 = thetas[index2 % accumulator.shape[1]]
            #print "Theta1: " + repr(theta) + " Theta2: " + repr(theta2) + " Rho1: " + repr(rho) + " Rho2: " + repr(rho2)
            if abs(theta - theta2) < .04 and ((rho > 0) == (rho2 > 0)):
                #print "Merge suceeded"
                arr[index] += arr[index2]
                arr[index2] = 0
                theta = (theta + theta2) / 2.
                rho = (rho + rho2) / 2
            #else:
            #    print "Merge failed"
        if arr[index] >= config.minLength:
            print "index:"
            print arr[index]
            lines.append([arr[index], theta, rho])
            print "rho={0:.2f}, theta={1:.0f}".format(rho, np.rad2deg(theta))
    cv2.imwrite('houghlines3.jpg', image)
Example 55
 def argpartition(a, kth, axis=-1):
     return bottleneck.argpartsort(a, kth, axis)
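The one-line shim above bridges the rename: Bottleneck 1.0 dropped argpartsort(a, n) in favor of argpartition(a, kth), mirroring NumPy, and the two arguments differ by one (n counts the smallest elements, kth names the index that ends up in sorted position). A version-agnostic sketch, not part of the original module:

import numpy as np

try:
    from bottleneck import argpartition          # Bottleneck >= 1.0
except ImportError:
    try:
        from bottleneck import argpartsort       # Bottleneck < 1.0
        def argpartition(a, kth, axis=-1):
            # old API counts elements, new API names an index, hence kth + 1
            return argpartsort(a, kth + 1, axis)
    except ImportError:
        argpartition = np.argpartition           # pure-NumPy fallback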
Example 56
     _step = 0
     _num += 1
 if _num > 1:
     break
 batch_xs, batch_ys = get_next_batch_rnn(_step)
 Pred, _ = sess.run([pred, train_op],
                    feed_dict={
                        _x: batch_xs,
                        _y: batch_ys
                    })
 loss = sess.run(cost, feed_dict={_x: batch_xs, _y: batch_ys})
 test_index_max3 = np.zeros([size, out_times])
 batch_test_xs, batch_test_ys = random_get_batch()
 Pred_test = sess.run(tf.nn.softmax(pred),
                      feed_dict={_x: batch_test_xs})
 test_index = bottleneck.argpartsort(-batch_test_ys, 1,
                                     axis=1)[:, :1]
 test_index_max3[:][:] = test_index
 Pred_index_max3 = bottleneck.argpartsort(-Pred_test,
                                          out_times,
                                          axis=1)[:, :out_times]
 test_acc = np.amax(1 * np.equal(Pred_index_max3, test_index_max3),
                    axis=1)
 test_acc = 1.0 * sum(test_acc) / len(test_acc)
 result = np.concatenate((test_index, Pred_test), axis=1)
 text = "_num:{0} _step:{1} _loss:{2} _accuracy:{3}".format(
     _num, _step, loss, test_acc)
 # text = "_num:{0} _step:{1} _loss:{2} _accuracy:{3} _Pred_test:{4}".format(_num, _step, loss, test_acc, result)
 print(text)
 if test_acc >= acc:
     saver.save(sess, save_path=mkdir(acc), global_step=_step)
     acc += 0.02
Example 57
def get_topics(files_array=[], n_clusters=100):
    if not len(files_array):
        return

    start_time = time.time()
    print 'Time at start: %.3f' % (time.time() - start_time)
    sys.stdout.flush()

    train_texts = get_training_text()
    print 'Time after getting train text from XML: %.3f' % (time.time() -
                                                            start_time)
    sys.stdout.flush()

    vectorizer = CountVectorizer(max_df=0.5)
    train_mat_init = vectorizer.fit_transform(train_texts)
    del train_texts
    vocab = vectorizer.vocabulary_
    vocab_rev = {v: k for k, v in vocab.items()}
    print 'Time after CountVectorizer on train data: %.3f' % (time.time() -
                                                              start_time)
    sys.stdout.flush()

    tfidf_fit = TfidfTransformer().fit(train_mat_init)
    train_mat = tfidf_fit.transform(train_mat_init)
    del train_mat_init
    print 'Time after TFIDF on train data: %.3f' % (time.time() - start_time)
    sys.stdout.flush()

    NMF_fit = NMF(n_components=n_clusters, init='nndsvda').fit(train_mat)
    del train_mat
    H = NMF_fit.components_
    #W = NMF_fit.transform(train_mat)
    num_best = 50
    best_indices = map(
        lambda v: list(bn.argpartsort(-v, num_best)[0:num_best]), H)
    for i in range(len(best_indices)):
        best_indices[i].sort(key=lambda j: -H[i, j])
    best_words = [[vocab_rev[i] for i in lst] for lst in best_indices]

    print 'Time after NMF fit: %.3f\n' % (time.time() - start_time)

    if best_words_filename is not None:
        with open(best_words_filename, 'wb') as best_words_file:
            for c, lst in enumerate(best_words):
                best_words_file.write(
                    str(c) + ' [' +
                    ', '.join(map(lambda s: '\'' + s + '\'', lst)) + ']\n')
    else:
        print 'BEST WORDS FOR EACH CLUSTER:'
        for c, lst in enumerate(best_words):
            print '%d' % c, lst
        sys.stdout.flush()

    print '\nTime after NMF output: %.3f' % (time.time() - start_time)
    sys.stdout.flush()

    test_data = map(get_data, files_array)
    test_texts = [t for f, y, j, t in test_data]

    test_mat_init = vectorizer.transform(test_texts)
    del test_texts
    test_mat = tfidf_fit.transform(test_mat_init)
    del test_mat_init
    test_W = NMF_fit.transform(test_mat)
    del test_mat
    test_clusters = map(np.argmax, test_W)

    print 'Time after NMF test transform: %.3f\n' % (time.time() - start_time)

    print 'NUMBER OF CASES PER CLUSTER:'
    cluster_sizes = [
        np.sum(np.array(test_clusters) == c) for c in range(n_clusters)
    ]
    for c, sz in enumerate(cluster_sizes):
        print '%d: %d' % (c, sz)
    print
    sys.stdout.flush()

    results = zip(test_clusters, test_data)

    results.sort(
        key=lambda
        (c, (f, y, j, t)): 2000 * c + y.year)  # sort by cluster, then by year
    with open(cluster_output_filename, 'ab') as output_file:
        writer = csv.writer(output_file)
        for c, (f, y, j, t) in results:
            writer.writerow((f, y, j, c))

    print '\nTime after all remaining output: %.3f\n' % (time.time() -
                                                         start_time)

    cluster_weights = zip(files_array, test_W)
    n_cases = 20
    if best_match_cases is not None:
        with open(best_match_cases, 'ab') as best_matches_file:
            for cluster_id in range(n_clusters):
                best_matches_file.write('FOR CLUSTER %d:\n' % cluster_id)
                cluster_weights.sort(
                    key=lambda (case, weights): -weights[cluster_id])
                clusters_ranked = map(lambda (c, w): c,
                                      cluster_weights[0:n_cases])
                for case in clusters_ranked:
                    best_matches_file.write(case + '\n')
                best_matches_file.write('\n')