Example #1
def term_clustering(terms: List[str], wv: Dict[str, np.ndarray],
                    n_clusters: int) -> Tuple[List[int], List[str]]:
    """Use spherical k-means to cluster word vectors.

  Args:
    terms: A list of terms to cluster.
    wv: A dictionary of word to their vectors.
    n_clusters: Number of output clusters.

  Returns:
    labels: A list of clustering assignment for each word.
    terms: A list of words, aligned with labels.
  """
    X = []
    X_terms = []
    n_out_of_vocab = 0
    logger.debug(f"#wv {len(wv)}")
    logger.debug(terms[:20])
    for term in terms:
        if term in wv:
            X.append(wv[term])
            X_terms.append(term)
        else:
            n_out_of_vocab += 1

    logger.warning(f"{n_out_of_vocab} / {len(terms)} words out of vocab")
    logger.info(f"Clustering {len(X)} words")
    clus = SphericalKMeans(n_clusters=n_clusters)
    clus.fit(X)
    logger.info(f"Clustering complete")
    return clus.labels_, X_terms
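A minimal usage sketch for term_clustering above (assumptions: spherecluster is installed, a module-level logger exists, and the toy 4-dimensional word vectors are placeholders for real word2vec/GloVe embeddings):

import logging

import numpy as np
from spherecluster import SphericalKMeans

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hypothetical toy vectors; real ones would come from word2vec/GloVe.
wv = {
    "cat": np.array([1.0, 0.1, 0.0, 0.0]),
    "dog": np.array([0.9, 0.2, 0.0, 0.1]),
    "car": np.array([0.0, 0.1, 1.0, 0.2]),
}
labels, kept = term_clustering(["cat", "dog", "car", "oov"], wv, n_clusters=2)
print(list(zip(kept, labels)))  # "oov" is skipped as out-of-vocabulary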
Example #2
def testSphericalKMeans(X):
    # Find K clusters from data matrix X (n_examples x n_features)
    # using spherical k-means

    skm = SphericalKMeans(n_clusters=3)
    skm.fit(X)
    print(skm.labels_)
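A quick way to exercise the test above is to draw random points and project them onto the unit sphere (spherical k-means expects directional data); a sketch:

import numpy as np
from sklearn.preprocessing import normalize
from spherecluster import SphericalKMeans

rng = np.random.RandomState(0)
X = normalize(rng.randn(90, 16))  # 90 unit-norm samples with 16 features
testSphericalKMeans(X)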
Example #3
    def fit(self, data, norm=False):
        '''
        Args:
            data (numpy.ndarray): [m, n] matrix of m samples, each with n dimensions.
            norm (bool): False by default; set True if data is already centered
                and L2-normalized.
        '''
        if not norm:
            self.mean = np.mean(data, axis=0)
            centered = data - self.mean
            process_data = self.l2norm(centered)
        else:
            process_data = data
        #clusterid, error, nfound = kcluster(process_data, nclusters=self.ncluster)#dist="u"
        #cdata, cmask = clustercentroids(process_data, mask=None, transpose=0, clusterid=clusterid, method='a')
        skm = SphericalKMeans(n_clusters=self.ncluster, verbose=0)
        skm.fit(process_data)
        self.clusters = skm.cluster_centers_
        self.clusterid = skm.labels_
        self.loss = skm.inertia_
        scores = []
        for i in range(self.ncluster):
            idxs = np.where(self.clusterid == i)[0].tolist()
            cluster_data = process_data[idxs, :]
            confs = np.dot(cluster_data, self.clusters[i, :].T)
            #print(confs)
            score = np.mean(confs)
            scores.append(score)
        #print(scores)
        # main_id is the cluster whose members are least aligned with its
        # center (lowest mean cosine similarity).
        self.main_id = np.argmin(scores)
        print(self.main_id)

        return self.clusters, self.clusterid, self.main_id
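The l2norm helper referenced in fit is not part of this snippet; a plausible row-wise implementation, offered purely as an assumption about the author's intent, is:

import numpy as np

def l2norm(self, data):
    # Row-wise L2 normalization; the epsilon guards against all-zero rows.
    norms = np.linalg.norm(data, axis=1, keepdims=True)
    return data / (norms + 1e-12)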
Example #4
def calc_logit_regress_stats(inputs, outputs, plot_name, K_SIZE):
    skm = SphericalKMeans(n_clusters=K_SIZE)
    skm.fit(inputs)
    input_labels = skm.labels_

    out_keys = list(set(outputs))
    out_idx_mapping = {out: idx for idx, out in enumerate(out_keys)}
    #out_key_list = [out_idx_mapping[key] for key in out_keys]
    print(out_idx_mapping)

    k_center_labels = [[0] * K_SIZE for x in range(len(out_keys))]
    for k_c, lab in zip(input_labels, outputs):
        out_idx = out_idx_mapping[lab]
        k_center_labels[out_idx][k_c] += 1

    k_center_labels = np.asarray(k_center_labels)
    ind = np.arange(K_SIZE)
    plots = []
    bottom = np.zeros(K_SIZE)
    for x in range(len(out_keys)):
        plots.append(plt.bar(ind, k_center_labels[x], bottom=bottom))
        bottom += k_center_labels[x]

    plt.title('Song genres in spherical k-means clusters')
    plt.xticks(ind, ["K" + str(i + 1) for i in range(K_SIZE)])
    #plt.yticks(np.arange(0, 81, 10))
    plt.legend(plots, out_keys)
    plt.savefig(plot_name)
Example #5
 def cluster_test(self, test_file, clusters=(10,)):
     df_test1 = pd.read_csv(test_file)
     output = {}
     for K in clusters:
         vectors = list()
         y_true = list()
         sections = dict()
         idx = 0
         for word, section, y in df_test1.values:
             sliceIdx = self.yearDict[str(y)]
             if word in self.vocabularies[sliceIdx]:
                 if section not in sections:
                     sections[section] = idx
                     idx += 1
                 y_true.append(sections[section])
                 vectors.append(self.matrices_norm[sliceIdx][
                     self.vocabularies[sliceIdx][word]])
         skm = SphericalKMeans(n_clusters=K, max_iter=100000)
         skm.fit(np.array(vectors))
         metric = normalized_mutual_info_score(skm.predict(
             np.array(vectors)),
                                               y_true,
                                               average_method='arithmetic')
         y_true_bool = [(triplet1 == triplet2) for triplet2 in y_true
                        for triplet1 in y_true]
         y_pred = skm.predict(np.array(vectors))
         y_pred_bool = [(triplet1 == triplet2) for triplet2 in y_pred
                        for triplet1 in y_pred]
         metric2 = fbeta_score(y_true_bool, y_pred_bool, beta=5)
         output[f'NMI({K})'] = metric
         output[f'F_beta-score({K})'] = metric2
     return output
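The pairwise label-agreement lists above grow quadratically with the number of vectors; a vectorized sketch with the same semantics (including self-pairs), assuming numpy arrays, is:

import numpy as np
from sklearn.metrics import fbeta_score

def pairwise_fbeta(y_true, y_pred, beta=5):
    # A pair (i, j) counts as positive when both items carry the same label.
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    true_pairs = (y_true[:, None] == y_true[None, :]).ravel()
    pred_pairs = (y_pred[:, None] == y_pred[None, :]).ravel()
    return fbeta_score(true_pairs, pred_pairs, beta=beta)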
Example #6
def doc_clustering(model, cluster_num):
    doc_num = len(model.docvecs.doctags.keys())
    train_data = np.array(
        [model.docvecs['a_' + str(doc + 1)] for doc in range(doc_num)])
    clusterer = SphericalKMeans(cluster_num)
    print('Start clustering...')
    clusterer.fit(train_data)
    print('Done.')
    return clusterer
Example #7
 def _init_match(self):
     skm = SphericalKMeans(n_clusters=self.config['cluster_nums'],
                           init='k-means++',
                           n_init=20)
     data = self.data
     data = data[data['qs_embed'].apply(
         lambda x: np.linalg.norm(x) > 0)]
     skm.fit(data['qs_embed'].tolist())
     data['skm_label'] = skm.labels_
     data = data[['qid', 'skm_label']]
     self.data = pd.merge(self.data, data, how='left', on=['qid'])
     self.data['skm_label'] = self.data['skm_label'].fillna(-1)
     self._cluster_centers = skm.cluster_centers_
Example #8
def kmeans_codebook(patches, k=30):
    shape = patches[0].shape

    x = patches.reshape(-1, shape[0] * shape[1])
    # normalize
    #x = x / ( 1e-6 + x.sum(axis=1, keepdims=True) )

    est = SphericalKMeans(k)
    #est = KMeans(n_clusters=k)
    est.fit(x)

    codebook = est.cluster_centers_.reshape(-1, shape[0], shape[1])
    return codebook
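Once the codebook is learned, patches are commonly encoded by cosine similarity to each centroid; the encode helper below is hypothetical, not part of the snippet above:

import numpy as np

def encode(patches, codebook):
    # Flatten patches and centroids, unit-normalize, take dot products.
    x = patches.reshape(len(patches), -1).astype(float)
    c = codebook.reshape(len(codebook), -1).astype(float)
    x = x / (np.linalg.norm(x, axis=1, keepdims=True) + 1e-12)
    c = c / (np.linalg.norm(c, axis=1, keepdims=True) + 1e-12)
    return x @ c.T  # (n_patches, k) cosine-similarity features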
Example #9
def SphericalkMeansCluster(X, nfclusters):
	# Find K clusters from data matrix X (n_examples x n_features)

	# spherical k-means
	
	skm = SphericalKMeans(nfclusters)
	skm.fit(X)

	#print(skm.cluster_centers_)
	#print("Labels =")
	#print(skm.labels_)
	#print("Inertia = ")
	#print(nfclusters,skm.inertia_)
	#return skm.inertia_
	return skm.labels_
Example #10
class Clusterer:
    def __init__(self, data, n_cluster):
        self.data = data
        self.n_cluster = n_cluster
        self.clus = SphericalKMeans(n_cluster)
        self.clusters = defaultdict(list)  # cluster id -> members
        self.membership = None  # a list contain the membership of the data points
        self.center_ids = None  # a list contain the ids of the cluster centers
        self.inertia_scores = None

    def fit(self):
        self.clus.fit(self.data)
        labels = self.clus.labels_
        for idx, label in enumerate(labels):
            self.clusters[label].append(idx)
        self.membership = labels
        self.center_ids = self.gen_center_idx()
        self.inertia_scores = self.clus.inertia_
        print('Clustering concentration score:', self.inertia_scores)

    # find the idx of each cluster center
    def gen_center_idx(self):
        ret = []
        for cluster_id in range(self.n_cluster):
            center_idx = self.find_center_idx_for_one_cluster(cluster_id)
            ret.append((cluster_id, center_idx))
        return ret

    def find_center_idx_for_one_cluster(self, cluster_id):
        query_vec = self.clus.cluster_centers_[cluster_id]
        members = self.clusters[cluster_id]
        best_similarity, ret = -1, -1
        for member_idx in members:
            member_vec = self.data[member_idx]
            cosine_sim = self.calc_cosine(query_vec, member_vec)
            if cosine_sim > best_similarity:
                best_similarity = cosine_sim
                ret = member_idx
        return ret

    def calc_cosine(self, vec_a, vec_b):
        return 1 - cosine(vec_a, vec_b)
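The per-member loop in find_center_idx_for_one_cluster can be vectorized; an equivalent sketch, assuming self.data is a 2-D numpy array:

import numpy as np

def find_center_idx_vectorized(data, center, member_idxs):
    # Cosine similarity of every member to the centroid in one shot.
    members = data[member_idxs]
    sims = members @ center / (
        np.linalg.norm(members, axis=1) * np.linalg.norm(center) + 1e-12)
    return member_idxs[int(np.argmax(sims))]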
Example #11
def cluster_doc(doc_emb, K, method):
    y_pred = []
    if method == "kmeans":
        # k-means
        print("Clustering using K-Means")
        from sklearn.cluster import KMeans
        km = KMeans(n_clusters=K, n_init=1)
        km.fit(doc_emb)
        y_pred = km.labels_
    elif method == "skmeans":
        # spherical k-means
        print("Clustering using Spherical K-Means")
        from spherecluster import SphericalKMeans
        skm = SphericalKMeans(n_clusters=K, n_init=1)
        skm.fit(doc_emb)
        y_pred = skm.labels_
    return y_pred
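A usage sketch for cluster_doc, with random placeholder embeddings standing in for real document vectors:

import numpy as np

doc_emb = np.random.RandomState(0).randn(100, 32)
labels = cluster_doc(doc_emb, K=5, method="skmeans")
print(np.bincount(labels))  # documents per cluster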
Example #12
    def initialize(self):

        self.R12_train = np.multiply(NMTF1.R12, self.M)
        """spherical k-means"""
        skm1 = SphericalKMeans(n_clusters=self.K[0])
        skm1.fit(self.R12_train.transpose())
        skm2 = SphericalKMeans(n_clusters=self.K[1])
        skm2.fit(self.R12_train)

        self.G1 = skm1.cluster_centers_.transpose()
        self.G2 = skm2.cluster_centers_.transpose()

        self.S12 = np.linalg.multi_dot(
            [self.G1.transpose(), self.R12_train, self.G2])

        #Save the factor matrices for the next models
        NMTF1.G1 = self.G1
        NMTF1.G2 = self.G2
Example #13
    def semantic_sim_driver(self, time_mapping, log_filename="yao_test1.txt"):

        df = pd.read_csv("eval/yao/testset_1.csv")

        try:
            df["real_year"] = df.year.apply(lambda x: int(time_mapping[str(x)]))
        except Exception as e:
            print(e)
            print(time_mapping.keys())
            print(df.year.unique())
            df["real_year"] = df.year.apply(
                lambda x: int(time_mapping[str(x // 10 * 10) + "s"]))

        labels = set(df.label.unique())
        labels_mapping = {label: index for index, label in enumerate(labels)}
        df["label_id"] = df.label.apply(lambda x: labels_mapping[x])
        # print(df.label_id)

        embeddings, known_index = self.get_embedding_in_a_year(
            df.word, df.real_year.tolist(), return_known_index=True)

        from spherecluster import SphericalKMeans

        scores = []
        for n in [10, 15, 20]:
            skm = SphericalKMeans(n_clusters=n)
            skm.fit(embeddings)
            # print(skm.labels_.shape)
            # print(len(df.label_id[known_index]))
            # print(sum(known_index))
            score = get_score(skm.labels_, df.label_id[known_index])
            score1 = get_score1(skm.labels_, df.label_id[known_index])
            scores.append(score)
            scores.append(score1)

        print(scores)

        with open(log_filename, "w", encoding="utf-8") as f:
            line = "\t".join(["{0:.4f}".format(s) for s in scores]) + "\n"
            print(line)
            f.write(line)

        return None
Example #14
    def initialize(self):

        self.R12_train = np.multiply(NMTF2.R12, self.M)
        """spherical k-means"""
        skm3 = SphericalKMeans(n_clusters=self.K[2])
        skm3.fit(NMTF2.R23)

        #Reload matrices that have already been used before
        self.G1 = NMTF1.G1
        self.G2 = NMTF1.G2
        self.G3 = skm3.cluster_centers_.transpose()

        self.S12 = np.linalg.multi_dot(
            [self.G1.transpose(), self.R12_train, self.G2])
        self.S23 = np.linalg.multi_dot(
            [self.G2.transpose(), NMTF2.R23, self.G3])

        #Save G3 for the next models
        NMTF2.G3 = self.G3
Example #15
def get_topic_vecs(model, n_topics=20):
    """ Computes and returns the topic vectors of a doc2vec model. The topic
        vectors are simply the centroids of the classes after the documents
        have been clustered. They are therefore "virtual" documents that are
        an average of a group of similar documents.
        Arguments:
            - (gensim.models.doc2vec.Doc2Vec) model: A doc2vec model
            - (int) n_topics: The number of topics that should be
                found, defaults to 20.
        Returns:
            - (numpy.ndarray) topics: The topic vectors of the model
    """
    from spherecluster import SphericalKMeans
    skm = SphericalKMeans(n_clusters=n_topics)
    # getting the data as a numpy array
    dv = model.docvecs.vectors_docs
    # carrying out K-means to group documents by topic
    skm.fit(dv)
    # extracting topic vectors (centroids of the groups)
    return skm.cluster_centers_
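With the topic vectors in hand, the most representative documents per topic can be ranked by cosine similarity; a sketch assuming dv is the same (n_docs, dim) array used above:

import numpy as np
from sklearn.preprocessing import normalize

def nearest_docs(topics, dv, top_n=5):
    # (n_docs, n_topics) cosine similarities between documents and centroids.
    sims = normalize(dv) @ normalize(topics).T
    # For each topic, the indices of its top_n closest documents.
    return np.argsort(-sims, axis=0)[:top_n].T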
Example #16
    def initialize(self):

        self.R12_train = np.multiply(NMTF5.R12, self.M)
        """spherical k-means"""
        skm5 = SphericalKMeans(n_clusters=self.K[4])
        skm5.fit(NMTF5.R25)

        self.G1 = NMTF1.G1
        self.G2 = NMTF1.G2
        self.G3 = NMTF2.G3
        self.G4 = NMTF3.G4
        self.G5 = skm5.cluster_centers_.transpose()

        self.S12 = np.linalg.multi_dot(
            [self.G1.transpose(), self.R12_train, self.G2])
        self.S23 = np.linalg.multi_dot(
            [self.G2.transpose(), NMTF5.R23, self.G3])
        self.S34 = np.linalg.multi_dot(
            [self.G3.transpose(), NMTF5.R34, self.G4])
        self.S25 = np.linalg.multi_dot(
            [self.G2.transpose(), NMTF5.R25, self.G5])
Example #17
 def cluster(self, docs, k):
     vecs = []
     words = []
     cnt = 0
     for doc in docs:
         cnt += 1
         #print('processing doc {}'.format(cnt), end='\r')
         ws = self.extract_keywords(doc)
         words.append(ws)
         vecs.append(self.sent2vec(ws))
     print('processing doc {} over.'.format(cnt))
     skm = SphericalKMeans(n_clusters=k)
     result = skm.fit(np.array(vecs))
     return result.labels_, words
Example #18
    def initialize(self):

        self.R12_train = np.multiply(NMTF3.R12, self.M)
        """spherical k-means"""
        skm4 = SphericalKMeans(n_clusters=self.K[3])
        skm4.fit(NMTF3.R34)

        self.G4 = skm4.cluster_centers_.transpose()

        #Use the same matrices as those previously computed
        self.G1 = NMTF1.G1
        self.G2 = NMTF1.G2
        self.G3 = NMTF2.G3

        self.S12 = np.linalg.multi_dot(
            [self.G1.transpose(), self.R12_train, self.G2])
        self.S23 = np.linalg.multi_dot(
            [self.G2.transpose(), NMTF3.R23, self.G3])
        self.S34 = np.linalg.multi_dot(
            [self.G3.transpose(), NMTF3.R34, self.G4])

        #Save G4 for next models
        NMTF3.G4 = self.G4
Example #19
    def _calculate_gap(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int,
                       n_clusters: int) -> Tuple[float, int, float, float, float]:
        """
        Calculate the gap value of the given data, n_refs, and number of clusters.
        Return the resulting gap value, n_clusters, log dispersion, reference
        log dispersion, and the gap standard error.
        """
        # Holder for reference dispersion results
        ref_dispersions = np.zeros(n_refs)  # type: np.ndarray

        # For each of the n_refs references, generate a random sample, run
        # k-means, and record the resulting dispersion
        # print(0, n_refs)
        for i in range(n_refs):
            # Create new random reference set
            random_data = random_sample_data(
                X, random_sampling=self.random_sampling)

            # Fit to it, getting the centroids and labels, and add to accumulated reference dispersions array.
            if self.algo == "kmeans2":
                centroids, labels = kmeans2(
                    data=random_data, k=n_clusters, iter=10,
                    minit='points')  # type: Tuple[np.ndarray, np.ndarray]
                dispersion = self._calculate_dispersion(
                    X=random_data, labels=labels,
                    centroids=centroids)  # type: float
            elif self.algo == "kmeans":
                centroids, dispersion = kmeans(
                    obs=random_data, k_or_guess=n_clusters,
                    iter=10)  # type: Tuple[np.ndarray, np.ndarray]
            elif self.algo == "skl-kmeans":
                km = KMeans(n_clusters=n_clusters, random_state=0)
                km.fit(random_data)
                centroids, labels = km.cluster_centers_, km.labels_
                dispersion = km.inertia_
            elif self.algo == "sph-kmeans":
                skm = SphericalKMeans(n_clusters=n_clusters, random_state=0)
                skm.fit(random_data)
                centroids, labels = skm.cluster_centers_, skm.labels_
                dispersion = skm.inertia_

            ref_dispersions[i] = dispersion

        # Fit cluster to original data and create dispersion calc.
        if self.algo == "kmeans2":
            centroids, labels = kmeans2(data=X,
                                        k=n_clusters,
                                        iter=10,
                                        minit='points')
            dispersion = self._calculate_dispersion(X=X,
                                                    labels=labels,
                                                    centroids=centroids)
        elif self.algo == "kmeans":
            centroids, dispersion = kmeans(
                obs=X, k_or_guess=n_clusters,
                iter=10)  # type: Tuple[np.ndarray, np.ndarray]
        elif self.algo == "skl-kmeans":
            km = KMeans(n_clusters=n_clusters, random_state=0)
            km.fit(X)
            centroids, labels = km.cluster_centers_, km.labels_
            dispersion = km.inertia_
        elif self.algo == "sph-kmeans":
            skm = SphericalKMeans(n_clusters=n_clusters, random_state=0)
            skm.fit(X)
            centroids, labels = skm.cluster_centers_, skm.labels_
            dispersion = skm.inertia_

        # Calculate gap statistic
        ref_log_dispersion = np.mean(np.log(ref_dispersions))
        log_dispersion = np.log(dispersion)
        gap_value = ref_log_dispersion - log_dispersion

        # compute standard deviation
        sdk = np.sqrt(
            np.mean((np.log(ref_dispersions) - ref_log_dispersion)**2.))
        sk = np.sqrt(1. + 1. / n_refs) * sdk

        return gap_value, int(
            n_clusters), log_dispersion, ref_log_dispersion, sk
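_calculate_gap returns one gap value and standard error per candidate k; the usual selection rule from Tibshirani et al. picks the smallest k with gap(k) >= gap(k+1) - s(k+1). A sketch of that rule (the helper name is hypothetical):

def select_k_by_gap(ks, gaps, sks):
    # ks, gaps, sks are aligned lists from repeated _calculate_gap calls.
    for i in range(len(ks) - 1):
        if gaps[i] >= gaps[i + 1] - sks[i + 1]:
            return ks[i]
    return ks[-1]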
Example #20
            data = f.read()
        text.append(clean_str(data))
        cat_list.append(start)

result = np.zeros((1, len(cat_list)), dtype=int)
result = result.tolist()[0]

vectorizer = CountVectorizer(min_df=1,
                             stop_words='english',
                             strip_accents='ascii')
count_vectorizer = vectorizer.fit_transform(text)
transformer = TfidfTransformer(smooth_idf=True)
tfidf = transformer.fit_transform(count_vectorizer)

km = SphericalKMeans(n_clusters=len(categories))
clusters = km.fit(tfidf)
centroids = km.cluster_centers_
labels = km.labels_

for c in range(len(categories)):
    idx = np.where(labels == c)[0]
    for l in idx:
        result[l] = c + 1

print(len(result))
print(result)

score = normalized_mutual_info_score(cat_list, result)
adjusted_score = adjusted_mutual_info_score(cat_list, result)
print(score)
print(adjusted_score)
Example #21
def sphe_kmeans(matrix, n_clusters, nb_init):
    labeler = SphericalKMeans(n_clusters=n_clusters,
                              n_init=nb_init,
                              max_iter=100)
    print("sphe_kmeans")
    return labeler.fit(matrix)
Example #22
km_mu_0_idx = np.argmin(cdists)
km_mu_1_idx = 1 - km_mu_0_idx

km_mu_0_error = np.linalg.norm(mus[0] - km.cluster_centers_[km_mu_0_idx])
km_mu_1_error = np.linalg.norm(mus[1] - km.cluster_centers_[km_mu_1_idx])
km_mu_0_error_norm = np.linalg.norm(
    mus[0] - km.cluster_centers_[km_mu_0_idx] /
    np.linalg.norm(km.cluster_centers_[km_mu_0_idx]))
km_mu_1_error_norm = np.linalg.norm(
    mus[1] - km.cluster_centers_[km_mu_1_idx] /
    np.linalg.norm(km.cluster_centers_[km_mu_1_idx]))

###############################################################################
# Spherical K-Means clustering
skm = SphericalKMeans(n_clusters=2, init='k-means++', n_init=20)
skm.fit(X)

cdists = []
for center in skm.cluster_centers_:
    cdists.append(np.linalg.norm(mus[0] - center))

skm_mu_0_idx = np.argmin(cdists)
skm_mu_1_idx = 1 - skm_mu_0_idx

skm_mu_0_error = np.linalg.norm(mus[0] - skm.cluster_centers_[skm_mu_0_idx])
skm_mu_1_error = np.linalg.norm(mus[1] - skm.cluster_centers_[skm_mu_1_idx])

###############################################################################
# Mixture of von Mises Fisher clustering (soft)
vmf_soft = VonMisesFisherMixture(n_clusters=2,
                                 posterior_type='soft')
Example #23
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.manifold import TSNE
from spherecluster import SphericalKMeans

nps = np.load("/home/psrivastava/Intern_Summer/data/tfs_encode.npy")
df = pd.DataFrame(nps, columns=['embds', 'title', 'sets', 'catg'])

embed = [np.array(x) for x in df.iloc[:20000, 0].to_list()]
sets = [x for x in df.iloc[:20000, 2].to_list()]
catg = [x for x in df.iloc[:20000, 3].to_list()]
title = [x for x in df.iloc[:20000, 1].to_list()]
#dim_reduc=KernelPCA(n_components=2000,kernel='cosine').fit_transform(np.array(embed))
#print(dim_reduc.shape)
print(np.squeeze(np.array(embed)).shape)
skm = SphericalKMeans(n_clusters=11)
skm.fit(np.squeeze(np.array(embed)))

X_embeded = TSNE(n_components=2, metric="cosine").fit_transform(
    np.squeeze(np.array(embed)))
dim_reduc = X_embeded
uni, coun = np.unique(np.array(sets), return_counts=True)
print("Dimension reduc", X_embeded.shape)
print(dict(zip(uni, coun)))

la = skm.labels_
print(metrics.silhouette_score(np.squeeze(np.array(embed)), la, metric="cosine"))
#cluster_center=skm.cluster_centers_
#print(cluster_center.shape)
#print(skm.inertia_.shape)
Example #24
    def initialize(self, initialize_strategy, verbose):
        if initialize_strategy == "random":
            if verbose:
                print("Association matrix filename: " + self.filename) 
                print("Used parameters: " + '\033[1m' + " k\u2081 = " + str(self.k1) + " and" + " k\u2082 = " + str(self.k2) + '\033[0m')
                print("Non-zero elements of the association matrix = " + '\033[1m' + "{}".format(np.count_nonzero(self.association_matrix)) + '\033[0m')
            if self.G_left is None:
                self.G_left = np.random.rand(self.association_matrix.shape[0], self.k1)
                self.G_left_primary = True
            if self.G_right is None:
                self.G_right = np.random.rand(self.association_matrix.shape[1], self.k2)
                self.G_right_primary = True
        elif initialize_strategy == "oldkmeans":
            if verbose==True:
                print("Association matrix filename: " + self.filename) 
                print("Used parameters: " + '\033[1m' + " k\u2081 = " + str(self.k1) + " and" + " k\u2082 = " + str(self.k2) + '\033[0m')
                print("Non-zero elements of the association matrix = " + '\033[1m' + "{}".format(np.count_nonzero(self.association_matrix)) + '\033[0m')
            if self.G_left is None:
              with suppress_stdout():
                km = KMeans(n_clusters=self.k1).fit(self.association_matrix)
                self.G_left = np.zeros((self.association_matrix.shape[0], self.k1))
                for row in range(self.association_matrix.shape[0]):
                    for col in range(self.k1):
                        self.G_left[row,col] = np.linalg.norm(self.association_matrix[row] - km.cluster_centers_[col])
                self.G_left_primary = True
            if self.G_right is None:
              with suppress_stdout():
                km = KMeans(n_clusters=self.k2).fit(self.association_matrix.transpose())
                self.G_right = np.zeros((self.association_matrix.shape[1], self.k2))
                for row in range(self.association_matrix.shape[1]):
                    for col in range(self.k2):
                        self.G_right[row,col] = np.linalg.norm(self.association_matrix.transpose()[row] - km.cluster_centers_[col])
                self.G_right_primary = True
        elif initialize_strategy == "kmeans":
            if verbose==True:
                print("Association matrix filename: " + self.filename) 
                print("Used parameters: " + '\033[1m' + " k\u2081 = " + str(self.k1) + " and" + " k\u2082 = " + str(self.k2) + '\033[0m')
                print("Non-zero elements of the association matrix = " + '\033[1m' + "{}".format(np.count_nonzero(self.association_matrix)) + '\033[0m')
            if self.G_left is None:
              with suppress_stdout():
                km = KMeans(n_clusters=self.k1, n_init = 10).fit_predict(self.association_matrix.transpose())
                self.G_left = np.array([np.mean([self.association_matrix[:,i] for i in range(len(km)) if km[i] == p], axis = 0) for p in range(self.k1)]).transpose()
                self.G_left_primary = True
            if self.G_right is None:
              with suppress_stdout():
                km = KMeans(n_clusters=self.k2, n_init = 10).fit_predict(self.association_matrix)
                self.G_right = np.array([np.mean([self.association_matrix[i] for i in range(len(km)) if km[i] == p], axis = 0) for p in range(self.k2)]).transpose()
                self.G_right_primary = True
        elif initialize_strategy == "skmeans":
            if verbose==True:
                print("Association matrix filename: " + self.filename) 
                print("Used parameters: " + '\033[1m' + " k\u2081 = " + str(self.k1) + " and" + " k\u2082 = " + str(self.k2) + '\033[0m')
                print("Non-zero elements of the association matrix = " + '\033[1m' + "{}".format(np.count_nonzero(self.association_matrix)) + '\033[0m')
            #with suppress_stdout():
            if self.G_left is None:
              with suppress_stdout():
                skm = SphericalKMeans(n_clusters=self.k1)
                skm = skm.fit(self.association_matrix.transpose())
            #Factor matrices are initialized with the center coordinates
                self.G_left = skm.cluster_centers_.transpose()
                self.G_left_primary = True
            if self.G_right is None:
              with suppress_stdout():
                skm = SphericalKMeans(n_clusters=self.k2).fit(self.association_matrix) 
            #Factor matrices are initialized with the center coordinates
                self.G_right = skm.cluster_centers_.transpose()
                self.G_right_primary = True



        for am in self.dep_own_left_other_left:
            if am.G_left is None:
                am.G_left = self.G_left
        for am in self.dep_own_left_other_right:
            if am.G_right is None:
                am.G_right = self.G_left
        for am in self.dep_own_right_other_left:
            if am.G_left is None:
                am.G_left = self.G_right
        for am in self.dep_own_right_other_right:
            if am.G_right is None:
                am.G_right = self.G_right
        if verbose:
            print(self.leftds, self.rightds, self.association_matrix.shape)
            print("Shape Factor Matrix left " +  str(self.G_left.shape))
            print("Shape Factor Matrix right " +  str(self.G_right.shape) + "\n")
        self.S = np.linalg.multi_dot([self.G_left.transpose(), self.association_matrix, self.G_right])
Example #25
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    if args.random:
        centers = Xtrain[np.random.choice(Xtrain.shape[0],
                                          replace=False,
                                          size=args.num_centers)]
    else:
        kmeans = SphericalKMeans(args.num_centers, verbose=1, n_init=5)
        dump_path = Path("out") / ("kmeans-cifar-%u-%u.pkl" %
                                   (args.num_centers, args.seed))
        if dump_path.exists():
            with open(dump_path, 'rb') as fp:
                centers = pickle.load(fp)
        else:
            kmeans.fit(Xtrain)
            centers = kmeans.cluster_centers_
            with open(dump_path, 'wb') as fp:
                pickle.dump(centers, fp)

    print("Calculating distances...")
    Xtrain = cdist(Xtrain, centers, "cosine")
    Xtest = cdist(Xtest, centers, "cosine")
    Xtrain, Xtest = 1 - Xtrain, 1 - Xtest

    print("Sorting...")
    if args.top_centers is not None:
        furthest = (-Xtrain).argsort(axis=1)[:, args.top_centers:]
        rows = np.array([[i] * furthest.shape[1]
                         for i in range(Xtrain.shape[0])]).ravel()
        cols = furthest.ravel()
Example #26
vectorizer = CountVectorizer(min_df=1,
                             stop_words='english',
                             strip_accents='ascii')
count_vectorizer = vectorizer.fit_transform(text)
transformer = TfidfTransformer(smooth_idf=True)
tfidf = transformer.fit_transform(count_vectorizer)

#LSA

svd = TruncatedSVD(100)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
reduced_matrix = lsa.fit_transform(tfidf)

print(reduced_matrix.shape)

km = SphericalKMeans(n_clusters=len(categories))
clusters = km.fit(reduced_matrix)
centroids = km.cluster_centers_
labels = km.labels_

for c in range(len(categories)):
    idx = np.where(labels == c)[0]
    for l in idx:
        result[l] = c + 1

print(len(result))
print(result)

score = normalized_mutual_info_score(cat_list, result)
adjusted_score = adjusted_mutual_info_score(cat_list, result)
print(score)
print(adjusted_score)
Example #27
    centroid_file.write(str(len(doc_centroids[0])) + "\n")
    for cen in doc_centroids:
        for i in range(len(cen)):
            centroid_file.write(str(cen[i]) + "\n")
    centroid_file.close()


############################################################################################

centroids = [list() for x in range(11)]
number_of_clusters = 20
for i in range(11):
    number_of_clusters = min(number_of_clusters, len(documents[i]))
kmeans_clustering = SphericalKMeans(number_of_clusters)
for i in range(11):
    idx = kmeans_clustering.fit(documents[i])
    centroids[i] = idx.cluster_centers_
    writeDocVectorCentroids(centroids[i], i)
    print(str(i) + " clustered")

###########################################################################################


###################################Cosine Distance#########################################
def cosineDistance(vector1, vector2, vectorSize):
    res = 0
    norm1 = 0
    norm2 = 0
    for i in range(vectorSize):
        res += vector1[i] * vector2[i]
    for i in range(vectorSize):
        norm1 += vector1[i] * vector1[i]
        norm2 += vector2[i] * vector2[i]
    # Assumed completion: the original snippet is truncated here.
    return res / ((norm1 * norm2) ** 0.5)
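A vectorized numpy equivalent of cosineDistance above (note that, despite its name, the function computes cosine similarity):

import numpy as np

def cosine_similarity(v1, v2):
    v1, v2 = np.asarray(v1, dtype=float), np.asarray(v2, dtype=float)
    return float(v1 @ v2 / (np.linalg.norm(v1) * np.linalg.norm(v2)))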
Example #28
 def initialize(self):
     
     self.R12_train = np.multiply(NMTF.R12, self.M)
     
     if self.init_method == 'random':
         """Random uniform"""
         self.G1 = np.random.rand(NMTF.n1, self.K[0])
         self.G2 = np.random.rand(NMTF.n2, self.K[1])
         self.G3 = np.random.rand(NMTF.n3, self.K[2])
         self.G4 = np.random.rand(NMTF.n4, self.K[3])
         self.G5 = np.random.rand(NMTF.n5, self.K[4])
     
     if self.init_method == 'skmeans':
         """spherical k-means"""
         
         #Spherical k-means clustering is done on the initial data
         skm1 = SphericalKMeans(n_clusters=self.K[0])
         skm1.fit(self.R12_train.transpose())
         skm2 = SphericalKMeans(n_clusters=self.K[1])
         skm2.fit(self.R12_train)
         skm3 = SphericalKMeans(n_clusters=self.K[2])
         skm3.fit(NMTF.R23)
         skm4 = SphericalKMeans(n_clusters=self.K[3])
         skm4.fit(NMTF.R34)
         skm5 = SphericalKMeans(n_clusters=self.K[4])
         skm5.fit(NMTF.R25)
         
         #Factor matrices are initialized with the center coordinates
         self.G1 = skm1.cluster_centers_.transpose()
         self.G2 = skm2.cluster_centers_.transpose()
         self.G3 = skm3.cluster_centers_.transpose()
         self.G4 = skm4.cluster_centers_.transpose()
         self.G5 = skm5.cluster_centers_.transpose()
         
     if self.init_method == 'acol':
         """random ACOL"""
         #We will "shuffle" the columns of R matrices and take the mean of k batches
         Num1 = np.random.permutation(NMTF.n2)
         Num2 = np.random.permutation(NMTF.n1)
         Num3 = np.random.permutation(NMTF.n2)
         Num4 = np.random.permutation(NMTF.n3)
         Num5 = np.random.permutation(NMTF.n2)
         
         G1 = []
         for l in np.array_split(Num1, self.K[0]):
             G1.append(np.mean(self.R12_train[:,l], axis = 1))
         self.G1 = np.array(G1).transpose()
         
         G2 = []
         for l in np.array_split(Num2, self.K[1]):
             G2.append(np.mean(self.R12_train.transpose()[:,l], axis = 1))
         self.G2 = np.array(G2).transpose()
         
         G3 = []
         for l in np.array_split(Num3, self.K[2]):
             G3.append(np.mean(NMTF.R23.transpose()[:,l], axis = 1))
         self.G3 = np.array(G3).transpose()
         
         G4 = []
         for l in np.array_split(Num4, self.K[3]):
             G4.append(np.mean(NMTF.R34.transpose()[:,l], axis = 1))
         self.G4 = np.array(G4).transpose()
         
         G5 = []
         for l in np.array_split(Num5, self.K[4]):
             G5.append(np.mean(NMTF.R25.transpose()[:,l], axis = 1))
         self.G5 = np.array(G5).transpose()
     
     if self.init_method == 'kmeans':
         """k-means with clustering on previous item"""
         #As for spherical k-means, factor matrices will be initialized with the centers of clusters.
         km1 = KMeans(n_clusters=self.K[0], n_init = 10).fit_predict(self.R12_train.transpose())
         km2 = KMeans(n_clusters=self.K[1], n_init = 10).fit_predict(self.R12_train)
         km3 = KMeans(n_clusters=self.K[2], n_init = 10).fit_predict(self.R23)
         km4 = KMeans(n_clusters=self.K[3], n_init = 10).fit_predict(self.R34)
         km5 = KMeans(n_clusters=self.K[4], n_init = 10).fit_predict(self.R25)
         
         self.G1 = np.array([np.mean([self.R12_train[:,i] for i in range(len(km1)) if km1[i] == p], axis = 0) for p in range(self.K[0])]).transpose()
         self.G2 = np.array([np.mean([self.R12_train[i] for i in range(len(km2)) if km2[i] == p], axis = 0) for p in range(self.K[1])]).transpose()
         self.G3 = np.array([np.mean([self.R23[i] for i in range(len(km3)) if km3[i] == p], axis = 0) for p in range(self.K[2])]).transpose()
         self.G4 = np.array([np.mean([self.R34[i] for i in range(len(km4)) if km4[i] == p], axis = 0) for p in range(self.K[3])]).transpose()
         self.G5 = np.array([np.mean([self.R25[i] for i in range(len(km5)) if km5[i] == p], axis = 0) for p in range(self.K[4])]).transpose()
         
     self.S12 = np.linalg.multi_dot([self.G1.transpose(), self.R12_train, self.G2])
     self.S23 = np.linalg.multi_dot([self.G2.transpose(), self.R23, self.G3])
     self.S34 = np.linalg.multi_dot([self.G3.transpose(), self.R34, self.G4])
     self.S25 = np.linalg.multi_dot([self.G2.transpose(), self.R25, self.G5])
Example #29
    docs, terms = doc_term_matrix.nonzero()
    for doc, term in zip(docs, terms):
        # Inefficient (incremental) way of calculating the weighted average
        doc_num_terms[doc] = doc_num_terms[doc] + 1
        doc_embed_matrix[doc, :] = ((doc_embed_matrix[doc, :] *
                                     (doc_num_terms[doc] - 1)) +
                                    (doc_term_matrix[doc, term] * embed[term])
                                    ) / doc_num_terms[doc]

    print('Formed document embedding matrix in {:.6f}s'.format(time() - t0))

    # Cluster documents by spherical K-means (cluster centroids are projected
    # onto the unit hypersphere)
    t0 = time()
    skm = SphericalKMeans(n_clusters=n_clusters)
    skm.fit(doc_embed_matrix)

    print('Clustered documents by spherical K-means in {:.6f}s'.format(time() -
                                                                       t0))

    # Add column to dataframe to hold assigned cluster and output to CSV
    grants['cluster'] = skm.labels_.tolist()
    grants.to_csv(path_or_buf=output_data)

    # Find terms associated with each cluster by applying Tf-idf to a corpus of
    # documents that are the concatenation of all grants assigned to each cluster.
    t0 = time()
    cluster_docs = grants.groupby('cluster')['document'].agg(
        lambda x: ' '.join(x))

    cluster_vectorizer = TfidfVectorizer(stop_words='english')
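The snippet cuts off before the per-cluster term extraction; a sketch of how it plausibly continues, reusing cluster_vectorizer and cluster_docs from above (the continuation itself is an assumption):

import numpy as np

cluster_tfidf = cluster_vectorizer.fit_transform(cluster_docs)
vocab = np.array(cluster_vectorizer.get_feature_names_out())
for c in range(cluster_tfidf.shape[0]):
    row = cluster_tfidf[c].toarray().ravel()
    print(c, ', '.join(vocab[np.argsort(-row)[:10]]))  # top 10 terms per cluster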
Example #30
def kb_doc_clustering(train_data, cluster_num):
    clusterer = SphericalKMeans(cluster_num)
    print('Start clustering...')
    clusterer.fit(train_data)
    print('Done.')
    return clusterer