Ejemplo n.º 1
0
 def cluster(self,
             assignAndReturnDetails=False,
             numberOfTopFeatures=5,
             algorithmSource='nltk',
             **kwargs):
     """Run k-means over ``self.vectors`` and return the cluster assignments.

     Args:
         assignAndReturnDetails: when true, return a dict with the documents
             grouped per cluster, the top features per cluster and the
             clustering error instead of the raw assignment sequence.
         numberOfTopFeatures: how many of the highest-scoring centroid
             dimensions are considered per cluster (non-positive scores
             among them are then dropped).
         algorithmSource: ``'nltk'`` or ``'biopython'``.
         **kwargs: forwarded to ``cluster.KMeansClusterer`` for ``'nltk'``;
             for ``'biopython'`` only ``repeats`` is read (passed as
             ``npass``, defaulting to 1 when absent).

     Returns:
         The per-vector cluster assignments, or, when
         ``assignAndReturnDetails`` is true, a dict with the keys
         ``'clusters'`` (cluster id -> list of doc ids), ``'bestFeatures'``
         (cluster id -> list of ``(dimension, score)`` tuples) and
         ``'error'`` (``None`` for the nltk backend).

     Raises:
         ValueError: if ``algorithmSource`` is not a supported backend.
     """
     if algorithmSource not in ('nltk', 'biopython'):
         # An unknown backend used to fall through both branches and only
         # fail later with an UnboundLocalError on `clusters`.
         raise ValueError(
             'Unsupported algorithmSource: %r' % (algorithmSource,))

     def topFeatures(mean):
         # Label every centroid component, rank by score (descending), keep
         # the first `numberOfTopFeatures` and then drop non-positive ones.
         labels = [
             self.dimensions.get(Clustering.DIMENSION_TO_PHRASE, i)
             for i in range(len(mean))
         ]
         ranked = sorted(zip(labels, mean), key=itemgetter(1), reverse=True)
         return [(dimension, score)
                 for dimension, score in ranked[:numberOfTopFeatures]
                 if score > 0]

     error = None
     if algorithmSource == 'nltk':
         clusterer = cluster.KMeansClusterer(self.numberOfClusters,
                                             euclidean_distance, **kwargs)
         clusters = clusterer.cluster(self.vectors, True)
         means = clusterer.means()
         namedMeans = zip(clusterer.cluster_names(), means)
     else:
         from Bio.Cluster import kcluster, clustercentroids
         # `repeats` is optional; 1 matches kcluster's own npass default.
         clusters, error, _ = kcluster(self.vectors,
                                       nclusters=self.numberOfClusters,
                                       npass=kwargs.get('repeats', 1))
         means, _ = clustercentroids(self.vectors, self.masks, clusters)
         namedMeans = enumerate([unitVector(c) for c in means])

     bestFeatures = {}
     for clusterId, mean in namedMeans:
         bestFeatures[clusterId] = topFeatures(mean)

     if not assignAndReturnDetails:
         return clusters

     # groupby only merges adjacent items, so sort by cluster id first.
     documentAssignments = sorted(zip(self.docIds, clusters),
                                  key=itemgetter(1))
     clusters = dict(
         (clusterId, [t[0] for t in documents])
         for clusterId, documents in groupby(documentAssignments,
                                             key=itemgetter(1)))
     return {
         'clusters': clusters,
         'bestFeatures': bestFeatures,
         'error': error
     }
Ejemplo n.º 2
0
 def cluster(self, assignAndReturnDetails=False, numberOfTopFeatures=5, algorithmSource='nltk', **kwargs):
     """Cluster ``self.vectors`` with k-means using the selected backend.

     Args:
         assignAndReturnDetails: if true, return a details dict (documents
             per cluster, best features per cluster, error) rather than the
             raw per-vector assignments.
         numberOfTopFeatures: number of top-ranked centroid dimensions
             inspected per cluster; those with a score <= 0 are omitted.
         algorithmSource: ``'nltk'`` or ``'biopython'``.
         **kwargs: passed through to ``cluster.KMeansClusterer`` for
             ``'nltk'``; for ``'biopython'`` only ``repeats`` is used (as
             ``npass``, default 1).

     Returns:
         The cluster assignments, or a dict with keys ``'clusters'``,
         ``'bestFeatures'`` and ``'error'`` when ``assignAndReturnDetails``
         is true. ``'error'`` stays ``None`` for the nltk backend.

     Raises:
         ValueError: if ``algorithmSource`` names an unknown backend.
     """
     if algorithmSource not in ('nltk', 'biopython'):
         # Fail early and clearly; otherwise `clusters` is never bound and
         # the method dies with an UnboundLocalError further down.
         raise ValueError('Unsupported algorithmSource: %r' % (algorithmSource,))

     def rankDimensions(mean):
         # Pair each centroid component with its phrase, sort by score
         # descending, take the top slice, then filter out scores <= 0.
         phrases = [self.dimensions.get(Clustering.DIMENSION_TO_PHRASE, i) for i in range(len(mean))]
         ordered = sorted(zip(phrases, mean), key=itemgetter(1), reverse=True)
         return [(dimension, score) for dimension, score in ordered[:numberOfTopFeatures] if score > 0]

     bestFeatures, error = {}, None
     if algorithmSource == 'nltk':
         clusterer = cluster.KMeansClusterer(self.numberOfClusters, euclidean_distance, **kwargs)
         clusters = clusterer.cluster(self.vectors, True)
         means = clusterer.means()
         for clusterName, mean in zip(clusterer.cluster_names(), means):
             bestFeatures[clusterName] = rankDimensions(mean)
     else:
         from Bio.Cluster import kcluster, clustercentroids
         # A missing `repeats` no longer raises KeyError; fall back to a
         # single pass, which is kcluster's own default for npass.
         clusters, error, _ = kcluster(self.vectors, nclusters=self.numberOfClusters, npass=kwargs.get('repeats', 1))
         means, _ = clustercentroids(self.vectors, self.masks, clusters)
         means = [unitVector(c) for c in means]
         for clusterIndex, mean in enumerate(means):
             bestFeatures[clusterIndex] = rankDimensions(mean)

     if assignAndReturnDetails:
         # Sort by cluster id so that groupby sees each cluster as one run.
         documentAssignments = sorted(zip(self.docIds, clusters), key=itemgetter(1))
         clusters = dict((clusterId, [t[0] for t in documents]) for clusterId, documents in groupby(documentAssignments, key=itemgetter(1)))
         return {'clusters': clusters, 'bestFeatures': bestFeatures, 'error': error}
     return clusters
Ejemplo n.º 3
0
           [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
    corr = [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [],
            [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [],
            [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
    dataset = []

    clusterid, error, nfound = kcluster(X_features, cluster_number, None, None,
                                        1, 1, 'a', 'c', None)

    X_features = np.transpose(X_features)

    for i in range(len(clusterid)):
        dst[clusterid[i]].append(X_features[i])

    X_features = np.transpose(X_features)
    cdata, cmask = clustercentroids(X_features, None, clusterid, 'a', 1)
    cdata = np.transpose(cdata)

    for i in range(len(corr)):
        for j in range(len(dst[i])):
            cr, p_val = pearsonr(dst[i][j], cdata[i])
            corr[i].append(cr)

    ddst = []
    accuracy_final = 0.0

    for i in range(cluster_rep):
        done = False
        for j in range(len(dst)):
            if i > len(dst):
                done = True
Ejemplo n.º 4
0
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 26 21:31:00 2019
Bio.Cluster.clustercentroids test
@author: Administrator
"""

from sklearn.preprocessing import MinMaxScaler
from Bio.Cluster import kcluster
from Bio.Cluster import clustercentroids
import pandas as pd

# Load the sample data set and show its first rows and overall shape.
df_full = pd.read_csv('../data/FCM_testdata.csv')
print(df_full.head(5))
print('size of df_full with real_labels:', df_full.shape)

# Keep every column except the last one (presumably the real labels, going
# by the messages printed here) as the feature set.
columns = list(df_full.columns)
features = columns[:-1]
df = df_full[features]
print(df.head(5))
print('size of df without real_labels:', df.shape)

# Rescale the features with MinMaxScaler and preview the first ten rows.
minMax = MinMaxScaler()
print(minMax)
dataset = minMax.fit_transform(df)
print(dataset[:10, :])

# k-means with 3 clusters and 100 passes, then compute the cluster centroids
# for the resulting assignment.
clusterid1, error, nfound = kcluster(
    dataset,
    nclusters=3,
    dist='e',
    npass=100,
)
cdata, cmask = clustercentroids(dataset, clusterid=clusterid1)