コード例 #1
0
class BirchImpl:
    """Adapter that forwards estimator calls to a wrapped ``Op`` instance.

    The keyword arguments given at construction time are stored on the
    wrapper and passed unchanged to ``Op``.
    """

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model and return this wrapper.

        ``y`` is forwarded to the wrapped model only when it is not ``None``.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Return the wrapped model's ``transform`` result for ``X``."""
        model = self._wrapped_model
        return model.transform(X)

    def predict(self, X):
        """Return the wrapped model's ``predict`` result for ``X``."""
        model = self._wrapped_model
        return model.predict(X)
コード例 #2
0
# 
# Two passes over the batches are required: the first grows the Birch tree
# incrementally, the second projects every batch onto the final subclusters
# (transforming during the first pass would use a partially built tree).
logging.info("Fitting Birch clustering for sparse coding ...")
# Alternative estimator noted by the original author:
# MiniBatchKMeans(n_clusters=args.dim, init='k-means++', max_iter=4, batch_size=batch_size)
birch = Birch(threshold=0.5, branching_factor=50, n_clusters=args.dim)
words = []
for i, batch in enumerate(batches(sparse_word_centroids, batch_size)):
    # presumably batch[0] is the list of words and batch[1] the matching
    # feature rows -- verify against `batches`
    words.extend(batch[0])
    birch.partial_fit(batch[1])
    # Report only after the batch has actually been fitted.
    logging.info("Fitted the %d th batch...", i)

# Collect the per-batch projections and stack them once; stacking inside the
# loop would re-copy the whole accumulated matrix on every iteration.
embedding_chunks = []
for batch in batches(sparse_word_centroids, batch_size):
    embedding_chunks.append(birch.transform(batch[1]))
word_embeddings = np.vstack(embedding_chunks)

# word_embeddings.shape = (vocab_size, args.dim)
# NOTE(review): Birch.transform appears to yield one column per subcluster,
# so the width may differ from args.dim -- confirm.
logging.info("DB Vocabulary size %d ...", index.vocab_size)
logging.info("Vectorizer vocabulary size %d ...", len(vectorizer.vocabulary_))
# NOTE(review): this reports the shape of the subcluster centers, not of
# word_embeddings (whose shape is what gets written below) -- confirm intent.
logging.info("Shape of resulting embedding matrix: ({}, {})".format(
    birch.subcluster_centers_.shape[0],
    birch.subcluster_centers_.shape[1]))
logging.info("Writing word vectors into file %s ...", args.output)
write = partial(indexing.write_given_embedding, fname=args.output)

# Header line: "<number of rows> <number of columns>".
with open(args.output, "w") as f:
    f.write("{} {}\n".format(word_embeddings.shape[0],
                             word_embeddings.shape[1]))
コード例 #3
0
def qmrf_regions(data,
                 edges,
                 nbow=20,
                 lamda=1,
                 sampling='random',
                 nsamples=10000,
                 label_potential='l1',
                 unary_sq=True,
                 online=True,
                 gamma=None,
                 max_iter=5,
                 truncated=False,
                 rng=42,
                 verbose=True,
                 return_centers=False,
                 return_edge_costs=True):
    """Cluster ``data`` and label every row by minimizing a pairwise graph energy.

    The rows of ``data`` are clustered; distances to the cluster centers
    become unary costs, distances between centers become label costs, and
    ``solve_binary`` / ``solve_aexpansion`` pick the final labels.

    Parameters
    ----------
    data : array-like
        One row per graph node (presumably a NumPy array of features).
    edges : array-like
        Pairs of row indices into ``data``; columns 0 and 1 are the endpoints.
    nbow : int or 'birch'
        Number of k-means clusters, or ``'birch'`` to cluster with Birch.
    lamda : number
        Multiplier applied to the label cost matrix.
    sampling : str
        ``'random'`` fits on randomly chosen rows; any other value fits on a
        regular grid derived from ``image`` (see the note in the body).
    nsamples : int or None
        Number of rows used for fitting; ``None`` fits on all rows.
    label_potential : {'l1', 'l2', 'potts', 'color'}
        How the label cost matrix (and, for ``'color'``, the unary cost) is
        computed.
    unary_sq : bool
        Square the unary costs.
    online : bool
        Use ``MiniBatchKMeans`` instead of ``KMeans`` (ignored for Birch).
    gamma : number, 'auto', 'color' or None
        Edge weighting; any other value gives unit edge costs.
    truncated : bool
        Apply ``np.maximum(1, label_cost)`` before scaling by ``lamda``.
    rng : int
        ``random_state`` for the k-means estimators.
    verbose : bool
        Print a summary and pass ``verbose`` to the k-means estimators.
    return_centers : bool
        Also return the cluster centers.
    max_iter, return_edge_costs
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(labels, label_cost)`` or, with ``return_centers``,
        ``(labels, label_cost, centers)``. ``labels`` is whatever the solver
        returns.

    Raises
    ------
    ValueError
        If ``label_potential`` is not one of the supported values.
    """
    # Fail early: an unknown potential would otherwise leave `label_cost`
    # unbound and crash only after the expensive clustering step.
    supported_potentials = ('l1', 'l2', 'potts', 'color')
    if label_potential not in supported_potentials:
        raise ValueError(
            "label_potential must be one of %s, got %r" %
            (supported_potentials, label_potential))

    with Timer('Colors'):
        if nbow == 'birch':
            clf = Birch(threshold=0.8, branching_factor=100)
        elif online:
            clf = MiniBatchKMeans(n_clusters=nbow,
                                  verbose=verbose,
                                  random_state=rng,
                                  batch_size=100,
                                  max_iter=100,
                                  max_no_improvement=10)
        else:
            clf = KMeans(n_clusters=nbow, verbose=verbose, random_state=rng)

        if nsamples is None:
            dist = clf.fit_transform(data)
        else:
            if sampling == 'random':
                # Sampling without replacement cannot draw more rows than
                # exist, so cap the request at the number of rows.
                # NOTE(review): this draw uses the global NumPy RNG, not `rng`.
                n_pick = min(nsamples, data.shape[0])
                idx = np.random.choice(data.shape[0], n_pick, replace=False)
            else:
                # NOTE(review): `image` is not a parameter; it must be
                # available from an enclosing/module scope -- confirm.
                height, width = image.shape[0], image.shape[1]
                n = np.sqrt(nsamples)
                ratio = height / float(width)
                # Keep at least one grid line per axis to avoid dividing by 0.
                ny = max(1, int(n * ratio))
                nx = max(1, int(n / ratio))
                y = np.linspace(0, height, ny,
                                endpoint=False) + (height // ny // 2)
                x = np.linspace(0, width, nx,
                                endpoint=False) + (width // nx // 2)
                # Truncate to whole pixel coordinates BEFORE flattening: a
                # fractional row multiplied by the width would otherwise
                # shift the column of the sampled pixel.
                xx, yy = np.meshgrid(x.astype(int), y.astype(int))
                idx = (yy * width + xx).flatten()
            clf.fit(data[idx])
            dist = clf.transform(data)

        if nbow == 'birch':
            centers = clf.subcluster_centers_
        else:
            centers = clf.cluster_centers_

    with Timer('Unary'):
        K = centers.shape[0]

        if label_potential == 'color':
            unary_cost = np.zeros((data.shape[0], K), np.float32)
            for i in range(K):
                unary_cost[:, i] = colordiff(data, centers[i:i + 1])
        else:
            unary_cost = dist.astype(np.float32)

        if unary_sq:
            unary_cost **= 2

    with Timer('Pairwise'):
        if label_potential == 'l1':
            label_cost = np.abs(centers[:, None, :] -
                                centers[None, ...]).sum(-1)
        elif label_potential == 'l2':
            label_cost = np.sqrt(
                ((centers[:, None, :] - centers[None, ...])**2).sum(-1))
        elif label_potential == 'potts':
            label_cost = np.ones((K, K), int) - np.eye(K, dtype=int)
        else:  # 'color' (validated at the top of the function)
            label_cost = np.zeros((K, K), np.float32)
            for i in range(K):
                label_cost[:, i] = colordiff(centers, centers[i:i + 1])
        if truncated:
            # NOTE(review): this raises every cost (including the diagonal)
            # to at least 1; a truncated potential is usually a cap
            # (np.minimum) -- confirm intent before changing.
            label_cost = np.maximum(1, label_cost)
        label_cost = (label_cost * lamda).astype(np.float32)

    if verbose:
        print("=================")
        print("Minimizing graph:")
        print("Nodes: %d, edges: %d, labels: %d" %
              (unary_cost.shape[0], edges.shape[0], label_cost.shape[0]))
        print("UnarySq: %s, LabelPotential: %s, EdgeCost: %s" %
              (unary_sq, label_potential, (gamma is not None)))
        print("#################")

    with Timer('Edge Cost'):
        # Accept NumPy scalars as well as plain numbers; bool stays excluded
        # so that True/False keep falling through to unit edge costs.
        numeric_gamma = (
            isinstance(gamma, (int, float, np.integer, np.floating))
            and not isinstance(gamma, bool))
        if numeric_gamma or gamma == 'auto':
            # Squared feature difference across each edge.
            diff = ((data[edges[:, 0]] - data[edges[:, 1]])**2).sum(axis=1)
            scale = gamma if numeric_gamma else diff.mean()
            edge_costs = np.exp(-scale * diff).astype(np.float32)
        elif gamma == 'color':
            edge_costs = 1. / (1. +
                               colordiff(data[edges[:, 0]], data[edges[:, 1]]))
            edge_costs = edge_costs.astype(np.float32)
        else:
            edge_costs = np.ones(edges.shape[0], dtype=np.float32)

    with Timer('Minimize'):
        # Two labels go to the binary solver; otherwise alpha-expansion.
        if label_cost.shape[0] == 2:
            labels = solve_binary(edges, unary_cost, edge_costs, label_cost)
        else:
            labels = solve_aexpansion(edges, unary_cost, edge_costs,
                                      label_cost)

    if return_centers:
        return labels, label_cost, centers

    return labels, label_cost