class BirchImpl:
    """Thin adapter that forwards estimator calls to a wrapped ``Op`` model.

    ``Op`` is defined outside this block (presumably a Birch estimator --
    confirm against the file's imports). The keyword arguments given to the
    constructor are stored and passed to ``Op`` unchanged.
    """

    def __init__(self, **hyperparams):
        """Store the hyperparameters and build the wrapped model from them."""
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model and return ``self``.

        ``y`` is only forwarded when it is not ``None``, so the wrapped
        model's own default for the target is used otherwise.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Return the wrapped model's ``transform(X)`` result."""
        model = self._wrapped_model
        return model.transform(X)

    def predict(self, X):
        """Return the wrapped model's ``predict(X)`` result."""
        model = self._wrapped_model
        return model.predict(X)
# Sparse coding of word centroids with Birch.
#
# Pass 1 feeds every batch to ``partial_fit`` so the clustering is built
# incrementally; pass 2 re-reads the same batches and maps them through the
# fitted model. ``batches(...)`` is therefore called twice and must yield the
# batches in the same order both times, otherwise ``words`` and the rows of
# ``word_embeddings`` will not line up.
# (MiniBatchKMeans(n_clusters=args.dim, init='k-means++', max_iter=4,
# batch_size=batch_size) was an earlier alternative to Birch here.)
# logging.info("Fitting Birch clustering for sparse coding ...")
birch = Birch(threshold=0.5, branching_factor=50, n_clusters=args.dim)

words = []
for i, batch in enumerate(batches(sparse_word_centroids, batch_size)):
    # batch[0] holds the words of this batch, batch[1] the matching vectors.
    words.append(batch[0])
    birch.partial_fit(batch[1])
    # Log only once the batch has actually been fitted.
    logging.info("Fitted the %d th batch...", i)
words = list(chain.from_iterable(words))

# Transform batch by batch, but stack once at the end: calling np.vstack
# inside the loop re-copies the whole growing matrix on every iteration.
# With zero batches np.vstack raises ValueError (the previous code failed
# with a NameError at the first use of ``word_embeddings`` instead).
embedding_blocks = [
    birch.transform(batch[1])
    for batch in batches(sparse_word_centroids, batch_size)
]
word_embeddings = np.vstack(embedding_blocks)
# Original note: word_embeddings.shape = (vocab_size, args.dim).
# NOTE(review): Birch.transform yields one column per subcluster centre,
# which need not equal args.dim -- confirm.

logging.info("DB Vocabulary size %d ...", index.vocab_size)
logging.info("Vectorizer vocabulary size %d ...", len(vectorizer.vocabulary_))
# Report the matrix that is written out below (this used to print the shape
# of birch.subcluster_centers_, which is not the embedding matrix).
logging.info("Shape of resulting embedding matrix: ({}, {})".format(
    word_embeddings.shape[0], word_embeddings.shape[1]))
logging.info("Writing word vectors into file %s ...", args.output)

# Kept for the code that follows this fragment; not used within it.
write = partial(indexing.write_given_embedding, fname=args.output)

# Header line: "<number of rows> <number of columns>".
with open(args.output, "w") as f:
    f.write("{} {}\n".format(word_embeddings.shape[0],
                             word_embeddings.shape[1]))
def qmrf_regions(data, edges, nbow=20, lamda=1, sampling='random',
                 nsamples=10000, label_potential='l1', unary_sq=True,
                 online=True, gamma=None, max_iter=5, truncated=False,
                 rng=42, verbose=True, return_centers=False,
                 return_edge_costs=True):
    """Cluster node features and label the graph by minimising an MRF energy.

    The features are clustered into a set of centres; the distance of every
    node to every centre becomes the unary cost, the distance between centres
    becomes the label (pairwise) cost, and the labelling is obtained from
    ``solve_binary`` (exactly two labels) or ``solve_aexpansion``.

    Parameters
    ----------
    data : array
        Node features, indexed by row (presumably ``(n_nodes, n_features)``
        -- confirm against callers).
    edges : array
        Pairs of node indices; columns 0 and 1 are used to index ``data``.
    nbow : int or 'birch'
        Number of k-means clusters, or ``'birch'`` to use ``Birch`` and its
        subcluster centres instead.
    lamda : number
        Multiplier applied to the label cost matrix.
    sampling : str
        ``'random'`` draws training rows at random; any other value uses a
        regular grid over ``image`` (see the note in the body).
    nsamples : int or None
        Number of rows used to fit the clustering; ``None`` fits on all of
        ``data``. For random sampling the draw is clamped to the number of
        available rows.
    label_potential : {'l1', 'l2', 'potts', 'color'}
        How the cost between two labels is computed. ``'color'`` also
        replaces the unary cost with ``colordiff`` to each centre.
    unary_sq : bool
        Square the unary costs.
    online : bool
        Use ``MiniBatchKMeans`` instead of ``KMeans`` (ignored for Birch).
    gamma : number, 'auto', 'color' or None
        Edge-cost mode: a real number gives ``exp(-gamma * diff)``,
        ``'auto'`` gives ``exp(-mean(diff) * diff)``, ``'color'`` gives
        ``1 / (1 + colordiff)``; anything else gives unit edge costs.
    max_iter : int
        Unused by this function.
    truncated : bool
        Apply ``np.maximum(1, label_cost)`` before scaling by ``lamda``.
    rng : int
        ``random_state`` for the k-means estimators. Not used for the random
        row sampling, which draws from NumPy's global random state.
    verbose : bool
        Print a summary of the problem and forward verbosity to k-means.
    return_centers : bool
        Also return the cluster centres.
    return_edge_costs : bool
        Unused by this function; edge costs are never returned.

    Returns
    -------
    tuple
        ``(labels, label_cost)`` or, when ``return_centers`` is true,
        ``(labels, label_cost, centers)``.

    Raises
    ------
    ValueError
        If ``label_potential`` is not one of the supported values.
    """
    import numbers  # local import: only needed for the gamma type check

    # Fail early with a clear message; previously an unknown value surfaced
    # as an UnboundLocalError only after the clustering had already run.
    if label_potential not in ('l1', 'l2', 'potts', 'color'):
        raise ValueError(
            "label_potential must be 'l1', 'l2', 'potts' or 'color', "
            "got %r" % (label_potential,))

    use_birch = nbow == 'birch'

    with Timer('Colors'):
        if use_birch:
            clf = Birch(threshold=0.8, branching_factor=100)
        elif online:
            clf = MiniBatchKMeans(n_clusters=nbow, verbose=verbose,
                                  random_state=rng, batch_size=100,
                                  max_iter=100, max_no_improvement=10)
        else:
            clf = KMeans(n_clusters=nbow, verbose=verbose, random_state=rng)

        if nsamples is None:
            dist = clf.fit_transform(data)
        else:
            if sampling == 'random':
                # replace=False cannot draw more rows than exist, so clamp
                # the request instead of raising on small inputs.
                n_draw = min(nsamples, data.shape[0])
                idx = np.random.choice(data.shape[0], n_draw, replace=False)
            else:
                # NOTE(review): ``image`` is neither a parameter nor a local;
                # this branch only works if a module-level ``image`` exists.
                # Left as-is -- confirm where the image shape should come from.
                n = np.sqrt(nsamples)
                ratio = image.shape[0] / float(image.shape[1])
                ny = int(n * ratio)
                nx = int(n / ratio)
                y = (np.linspace(0, image.shape[0], ny, endpoint=False)
                     + (image.shape[0] // ny // 2))
                x = (np.linspace(0, image.shape[1], nx, endpoint=False)
                     + (image.shape[1] // nx // 2))
                xx, yy = np.meshgrid(x, y)
                idx = np.round(yy * image.shape[1] + xx).astype(int).flatten()
            clf.fit(data[idx])
            dist = clf.transform(data)

        centers = clf.subcluster_centers_ if use_birch else clf.cluster_centers_

    with Timer('Unary'):
        K = centers.shape[0]
        if label_potential == 'color':
            unary_cost = np.zeros((data.shape[0], K), np.float32)
            for i in range(K):
                unary_cost[:, i] = colordiff(data, centers[i:i + 1])
        else:
            unary_cost = dist.astype(np.float32)
        if unary_sq:
            unary_cost **= 2

    with Timer('Pairwise'):
        if label_potential == 'l1':
            label_cost = np.abs(
                centers[:, None, :] - centers[None, ...]).sum(-1)
        elif label_potential == 'l2':
            label_cost = np.sqrt(
                ((centers[:, None, :] - centers[None, ...]) ** 2).sum(-1))
        elif label_potential == 'potts':
            label_cost = np.ones((K, K), int) - np.eye(K, dtype=int)
        else:  # 'color' (validated above)
            label_cost = np.zeros((K, K), np.float32)
            for i in range(K):
                label_cost[:, i] = colordiff(centers, centers[i:i + 1])

        if truncated:
            # NOTE(review): this floors every cost at 1 (diagonal included);
            # a truncated potential usually caps costs with np.minimum.
            # Behaviour left unchanged -- confirm the intent.
            label_cost = np.maximum(1, label_cost)
        label_cost = (label_cost * lamda).astype(np.float32)

    if verbose:
        print("=================")
        print("Minimizing graph:")
        print("Nodes: %d, edges: %d, labels: %d" %
              (unary_cost.shape[0], edges.shape[0], label_cost.shape[0]))
        print("UnarySq: %s, LabelPotential: %s, EdgeCost: %s" %
              (unary_sq, label_potential, (gamma is not None)))
        print("#################")

    with Timer('Edge Cost'):
        diff = ((data[edges[:, 0]] - data[edges[:, 1]]) ** 2).sum(axis=1)
        # numbers.Real also matches NumPy scalar types, which the former
        # ``type(gamma) in [int, float]`` test silently rejected (falling
        # through to unit edge costs). bool stays excluded, as before.
        gamma_is_number = (isinstance(gamma, numbers.Real)
                           and not isinstance(gamma, bool))
        if gamma_is_number:
            edge_costs = np.exp(-gamma * diff).astype(np.float32)
        elif gamma == 'auto':
            edge_costs = np.exp(-diff.mean() * diff).astype(np.float32)
        elif gamma == 'color':
            edge_costs = 1. / (1. + colordiff(data[edges[:, 0]],
                                              data[edges[:, 1]]))
            edge_costs = edge_costs.astype(np.float32)
        else:
            edge_costs = np.ones(edges.shape[0], dtype=np.float32)

    with Timer('Minimize'):
        # Exactly two labels go to the binary solver; everything else to
        # alpha-expansion.
        if label_cost.shape[0] == 2:
            labels = solve_binary(edges, unary_cost, edge_costs, label_cost)
        else:
            labels = solve_aexpansion(edges, unary_cost, edge_costs,
                                      label_cost)

    if return_centers:
        return labels, label_cost, centers
    return labels, label_cost