Example #1
	def cluster_kmedoids(self, k=2, npass=50):
		# Use the distance matrix to produce a partition into k clusters
		# npass is the number of passes of the k-medoids algorithm
		
		c, err, nfound = pc.kmedoids(self.zd, k, npass=npass)
		
		return partition(c, self.mat)
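A minimal usage sketch, assuming self.zd holds a distance matrix in the ragged lower-triangular layout Pycluster accepts; partition and self.mat are project-specific, so Pycluster.distancematrix stands in for building the distances here.

# hedged sketch: hypothetical 2D data, Euclidean distances via Pycluster
import numpy as np
import Pycluster as pc

points = np.random.rand(10, 3)
zd = pc.distancematrix(points)          # ragged left-lower-triangular distances
clusterid, error, nfound = pc.kmedoids(zd, nclusters=2, npass=50)
print(clusterid)                        # each entry is the index of its cluster's medoid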
def clusterSessionsPre(catQueryDist, featMan, weightMatrix):

  tclusters = {}
  print len(catQueryDist)
  for termCount in range(4, 5):
    tclusters[termCount] = []
    for cat, qSet in catQueryDist.items():
      if len(qSet) > 1:  # and cat in pairs:
        # target roughly termCount queries per cluster
        k = len(qSet) / termCount
        if k == 0:
          k = 1
        #print cat, len(qSet), k
        qList = list(qSet)
        catDist = getWeightMatrixForKMed(qList, weightMatrix)

        clusArray, error, opt = clust.kmedoids(catDist, k, 5, None)
        #print 'Queries', qList
        clusters = {}
        for c in range(len(clusArray)):
          clusId = clusArray[c]
          if clusId not in clusters:
            clusters[clusId] = []
          qc = featMan.returnQuery(qList[c])
          if len(qc) > 1:
            clusters[clusId].append(qc)
        #print cat, len(clusters)
        for entry in clusters.values():
          tclusters[termCount].append(entry)

  print len(tclusters[4])
  return tclusters[4]
    def kmedoids_cluster(self, similarities=None):
        # https://jpcomputing.wordpress.com/2014/05/18/pycluster-kmedoids-example/
        from sklearn.metrics import silhouette_score

        distances = []
        if similarities is None:
            for page_id, sim_vector in self._pages_similarities.iteritems():
                distances.append([1 - x[1][USED_DISTANCE] for x in sim_vector])
        else:
            for x in similarities:
                distances.append([1 - a for a in x])

        np_distances = np.asarray(distances)
        from scipy.spatial.distance import squareform
        squareform_distances = squareform(np_distances)

        import Pycluster
        nb_clusters = 2  # the number of clusters the dataset is supposed to be partitioned into
        clusterid, error, nfound = Pycluster.kmedoids(squareform_distances,
                                                      nclusters=nb_clusters,
                                                      npass=50)
        print 'clusterid: ', len(set(clusterid)), clusterid
        res = silhouette_score(np_distances, clusterid, metric='precomputed')
        print 'Res: ', res

        # group point indexes by the medoid they were assigned to
        clusters_indexes = {}
        for i, medoid in enumerate(clusterid):
            if medoid not in clusters_indexes:
                clusters_indexes[medoid] = [i]
            else:
                clusters_indexes[medoid].append(i)
        return clusters_indexes
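Since the method fixes nb_clusters at 2, a natural extension is to scan several k values and keep the best silhouette; a minimal sketch, assuming a square, symmetric, zero-diagonal distance matrix like np_distances above:

def best_kmedoids(np_distances, k_range=range(2, 8), npass=50):
    # hedged sketch: pick k by silhouette score on precomputed distances
    import Pycluster
    from scipy.spatial.distance import squareform
    from sklearn.metrics import silhouette_score
    condensed = squareform(np_distances)
    best = (None, -1.0, None)
    for k in k_range:
        clusterid, error, nfound = Pycluster.kmedoids(condensed, nclusters=k, npass=npass)
        score = silhouette_score(np_distances, clusterid, metric='precomputed')
        if score > best[1]:
            best = (k, score, clusterid)
    return best  # (k, silhouette, labels)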
def clusterSessionsKmed(featMan, weightFile):

  data = featMan.returnKeys()
  weightList = getWeightMatrixForKMedFromFile(featMan.returnLastId(),
                                              weightFile, data)
  cnt = 0
  kclusters = {}
  for k in range(4, 5, 2):
    # aim for roughly k queries per cluster
    i = (len(weightList) + 1) / k
    if i == 0:
      i = 1
    clusArray, error, opt = clust.kmedoids(weightList, i, 10, None)
    print error, len(clusArray)
    clusters = {}
    for c in range(len(clusArray)):
      clusId = clusArray[c]
      q = featMan.returnQuery(c)
      if len(q) > 1:
        if clusId not in clusters:
          clusters[clusId] = set()
        clusters[clusId].add(q)
        cnt += 1

    kclusters[k] = clusters.values()

    print 'Cluster with kmed ', len(clusters), cnt, ' queries'
  return kclusters[4]
Example #5
def cluster_kmedoids(sessions, clusters, distance_fn=string_similarity.jaccard_distance):
    """
    k-medoids clustering; requires a full pairwise distance matrix, and is therefore slow on large session sets
    """
    distances = compute_distances(sessions, distance_fn)
    clusterids, error, nfound = Pycluster.kmedoids(distances, nclusters=clusters)
    return clusterids, error, nfound
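A usage sketch with stand-ins for the project-local compute_distances and string_similarity.jaccard_distance (both assumed; sessions are modeled as token tuples):

import numpy as np
import Pycluster

def jaccard_distance(a, b):
    # stand-in distance: 1 - |intersection| / |union| over token sets
    a, b = set(a), set(b)
    return 1.0 - len(a & b) / float(len(a | b))

def compute_distances(sessions, distance_fn):
    # full square, symmetric distance matrix
    n = len(sessions)
    d = np.zeros((n, n))
    for i in range(n):
        for j in range(i):
            d[i, j] = d[j, i] = distance_fn(sessions[i], sessions[j])
    return d

sessions = [('buy', 'car'), ('buy', 'used', 'car'), ('python', 'tutorial')]
clusterids, error, nfound = Pycluster.kmedoids(
    compute_distances(sessions, jaccard_distance), nclusters=2, npass=10)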
def clusterCatWithMediods(lowerLimit, upperLimit,featMan, weightMatrix, \
						 samePairsSet, differentPairsSet, catQueryDist, \
						outFile = 'cat-clusters-with-med.txt'):
	
	metrics = {}
	for noTerms in range(lowerLimit, upperLimit):
		#fclusters = []
		cluster_list = []
		i = 0
		oFile = open(outFile+str(noTerms)+'.txt','w')
		for cat, qSet in catQueryDist.items():
			if len(qSet) > 1: # and cat in pairs:
				k = len(qSet)/noTerms
				if k == 0:
					k = 1
			
				qList = sorted(list(qSet),reverse=True)
				catDist = getWeightMatrixForKMed(qList, weightMatrix,'cat_kmediods')
							
				clusArray, error, opt = clust.kmedoids(catDist,k, 5, None)
				clusters = {}
				for c in range(len(clusArray)):
					clusId = clusArray[c]
					if clusId not in clusters:
						clusters[clusId] = set()
					clusters[clusId].add(qList[c])

				
				for entry in clusters.values():
					cluster_list.append(list(entry))
					qStr = toString(entry,featMan)
					#fclusters.append(qStr)
					oFile.write(cat + '\t' + qStr + '\n')
				print 'Clust category', cat, 'length', len(clusters), \
					'Queries', len(qSet), 'k', k, 'error', error, opt
				if i % 5 == 0:
					print i
				i += 1
		predictedSamePairsSet, predictedDifferentPairsSet = \
						getPairLabelsFromClusters(cluster_list, featMan)
		#metrics[noTerms] = getRecallPrecision(samePairsSet, \
		#			differentPairsSet, \
		#			predictedSamePairsSet, \
		#			predictedDifferentPairsSet)
		metrics[noTerms] = getSamePairPrecisionRecallF1Calculator(samePairsSet, \
						predictedSamePairsSet)

		oFile.close()
	for tcount, met in metrics.items():
		print tcount, met
	return metrics
def clusterCatWithMediodsAndNetwork(threshold, \
				    lowerLimit, upperLimit, featMan, \
				    weightMatrix, samePairsSet, \
				    differentPairsSet, catQueryDist, \
				    catNetwork, \
				    outFile = 'cat-clusters-with-med.txt'):
	# cluster each category and find the outliers,
	# then move them to their parent categories
	metrics = {}
	for noTerms in range(lowerLimit, upperLimit, 2):
		cluster_list = []
		#fclusters = []
		i = 0
		oFile = open(outFile+str(noTerms)+'.txt','w')
		for cat, qSet in catQueryDist.items():
			if len(qSet) > 1: # and cat in pairs:
				k = len(qSet)/noTerms
				if k == 0:
					k = 1
				#print cat, len(qSet), k
				qList = list(qSet)
				catDist = getWeightMatrixForKMed(qList, weightMatrix)
				clusArray, error, opt = clust.kmedoids(catDist,k, 5, None)
				#print 'Queries', qList
				clusters = {}
				for c in range(len(clusArray)):
					clusId = clusArray[c]
					if clusId not in clusters:
						clusters[clusId] = set()
					clusters[clusId].add(qList[c])
				#outliers = getOutliers(qList,catDist)
				for entry in clusters.values():
					cluster_list.append(list(entry))
					qStr = toString(entry,featMan)
					oFile.write(cat + '\t' + qStr + '\n')
					#fclusters.append(qStr)
				print 'Clust ',cat, len(clusters), error, opt
				if i % 50 == 0:
					print i
				i+=1
		predictedSamePairsSet, predictedDifferentPairsSet = \
						getPairLabelsFromClusters(cluster_list,featMan)
		key = str(threshold)+'_'+str(noTerms)
		metrics[key] = getRecallPrecision(samePairsSet, differentPairsSet,\
			     		            predictedSamePairsSet,\
			     		            predictedDifferentPairsSet)
		oFile.close()
	for tcount, met in metrics.items():
		print tcount, met
	return metrics
def kmedoids(m):
    labels, error, nfound = Pycluster.kmedoids(m, 16, 5)  # 16 clusters, 5 passes

    # Find the clusters and rename to have same naming convention as affinity propagation
    clusters = []
    for label in labels:
        if label not in clusters:
            clusters.append(label)

    currentCluster = 0
    for cluster in clusters:
        currentLabel = 0
        for label in labels:
            if label == cluster:
                labels[currentLabel] = currentCluster
            currentLabel += 1
            # clusters[currentCluster] = currentCluster
        currentCluster += 1
    return labels, clusters
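The renaming pass above rescans the label array once per cluster; with NumPy the same 0..k-1 renumbering is one call (labels come out in sorted-medoid rather than first-appearance order), a sketch assuming labels is the array Pycluster returned:

import numpy as np
# unique medoid ids, plus for every point the index of its medoid in that list
clusters, labels = np.unique(labels, return_inverse=True)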
Example #10
def cluster(D, k):
    import Pycluster as pcl
    labels, _, _ = pcl.kmedoids(D, nclusters=k, npass=10, initialid=None)
    # per-point error: distance of each point to its assigned medoid
    errors = np.array([D[labels[i], i] for i in range(len(labels))])
    # kmedoids labels points by medoid index; remap to consecutive ids 0..k-1
    centroidids = np.unique(labels)
    cmap = np.zeros(labels.max() + 1)
    for c in centroidids:
        cmap[c] = np.nonzero(centroidids == c)[0][0]
    labels = cmap[labels]
    logger.debug('k-medoids (k=%i): %.2f.' % (k, errors.sum()))
    return labels, {
        'method': 'kmedoids',
        'init': 'random',
        'k': k,
        'centroidids': centroidids,
        'errors': errors,
        'error': errors.sum(),
        'error-label': 'sum of distances'
    }
Example #11
def matchs_ia():

    # if not current_user.admin:
    #     return jsonify({'message' : 'Cannot perform that function!'})

    tests = TestR.query.all()

    dataSet = []
    # strip combining acute/diaeresis accents from the test strings
    trans_tab = dict.fromkeys(map(ord, u"\u0301\u0308"), None)
    for test in tests:
        # test_data = {}
        resident = Residents.query.filter_by(public_id=test.public_id).first()
        test_data = ("-" + str(resident.id) + "-" + test.gender +
                     str(test.age) + test.musicGender + test.sport +
                     test.hobbie + test.movieSeries + test.filmGender +
                     test.tabaco + test.alcohol + test.party +
                     str(test.ordenConvivencia) + str(test.ordenPersonal) +
                     test.personalidad)
        test_data = normalize(
            "NFKC",
            normalize("NFKD", test_data).translate(trans_tab))
        dataSet.append(test_data)

    # flat left lower triangle of the pairwise edit-distance matrix:
    # d(1,0), d(2,0), d(2,1), d(3,0), ...
    distans = [
        distance.edit_distance(dataSet[i], dataSet[j])
        for i in range(1, len(dataSet)) for j in range(0, i)
    ]

    labels, error, nfound = PC.kmedoids(distans, nclusters=5, npass=10)
    cluster = dict()
    output = []
    for roommate, label in zip(dataSet, labels):
        cluster.setdefault(label, []).append(roommate)
    for label, grp in cluster.items():
        cluster_data = {}
        cluster_data["Roommate"] = grp
        cluster_data["Count"] = len(grp)
        cluster_data["label"] = str(label)
        output.append(cluster_data)

    return jsonify({"testsALL": output}, {"error": error}, {"nfound": nfound})
Example #12
    def cluster(self, num_cluster):
        category_tfidf = self.category_tfidf
        categories = list(category_tfidf)
        random.shuffle(categories)

        tfidf_norms = {category: sum(value**2 for value in tfidf.values())
                       for category, tfidf in category_tfidf.items()}

        for category, norm in tfidf_norms.items():
            if not norm:
                raise Exception((category, category_tfidf[category]))

        distances = []
        for i, category1 in enumerate(categories):
            cat1_tfidf = category_tfidf[category1]
            row_array = array([0.0] * i)
            for j, category2 in enumerate(categories):
                if j >= i:
                    break
                row_array[j] = self.compute_distance(cat1_tfidf, category_tfidf[category2], tfidf_norms[category1], tfidf_norms[category2])

            distances.append(row_array)

        clusterids, error, nfound = Pycluster.kmedoids(distances, num_cluster)
        print error

        category_clusters = [[] for _ in range(num_cluster)]

        print len(clusterids)
        print len(categories)
        print clusterids

        clusterid_map = {}

        for i, medoid in enumerate(clusterids):
            category_id = clusterid_map.setdefault(medoid,
                                                   len(clusterid_map))
            category_clusters[category_id].append(categories[i])

        return category_clusters
Example #13
def Kmedoids(num_patches, samples, progress=None):
  """Estimate patches as centroids of samples using k-Medoids.

  This requires the `Pycluster` library to be installed.

  :param int num_patches: number of patches to create
  :type samples: 2D array
  :param samples: example patches
  :param progress: ignored
  :rtype: 2D array with `num_patches` rows and N columns, where N is the number
     of columns in `samples`.
  :return: created patches

  """
  logging.info("Learning %d prototypes per size by k-Medoids clustering" %
      num_patches)
  import Pycluster
  dist = Pycluster.distancematrix(samples)
  cluster_ids, _, _ = Pycluster.kmedoids(dist, nclusters=num_patches)
  # `cluster_ids` contains `num_patches` unique values, each of which is
  # the index of the medoid for a different cluster.
  return samples[np.unique(cluster_ids)].astype(ACTIVATION_DTYPE)
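A usage sketch, assuming random example patches and that ACTIVATION_DTYPE is a float dtype (it is defined elsewhere in the project):

import numpy as np
import Pycluster

samples = np.random.rand(100, 16).astype('float32')  # 100 hypothetical patches
dist = Pycluster.distancematrix(samples)             # Euclidean by default
cluster_ids, _, _ = Pycluster.kmedoids(dist, nclusters=8)
patches = samples[np.unique(cluster_ids)]            # the 8 medoid patches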
def clusterAllWithKMediods(lowerLimit, upperLimit,
				 featMan, weightMatrix,
				 #weightFile,
				 allTaskDict,
				 samePairsSet,
				 differentPairsSet, outDir):
	
	data = featMan.returnKeys()
	#weightList = getWeightMatrixForKMedFromFile(featMan.returnLastId(),\
        #        weightFile,data)
	weightList = getWeightMatrixForKMed(data, weightMatrix, 'kmediods')
	print len(weightList)
	metrics = {}
	
	for k in range(lowerLimit,upperLimit,3):
		print 'Clustering with terms ', k
		cluster_list = []
		i = k # (len(weightList)+1)/k
		if i == 0:
			i = 1
		clusArray, error, opt = clust.kmedoids(weightList, i, 10, None)
		clusters = {}
		for c in range(len(clusArray)):
			clusId = clusArray[c]
			if clusId not in clusters:
				clusters[clusId] = set()
			clusters[clusId].add(c)
		print 'Error and cluster length ', error, len(clusters)
		# debug dump of each cluster's queries and pairwise weights:
		#for clid, ind in clusters.items():
		#	print clid, ind
		#	for qind in sorted(ind):
		#		print 'query', featMan.returnQuery(qind),
		#	print
		#	for i1 in sorted(ind):
		#		for i2 in sorted(ind):
		#			print 'i1 and i2', i1, i2
		#			if i1 in weightMatrix and i2 in weightMatrix[i1]:
		#				print i1, i2, 'matrix', weightMatrix[i1][i2], (weightList[i2])[i1]
		fname = outDir + '_' + str(i) + '.txt'
		oFile = open(fname, 'w')
		for entry in clusters.values():
			cluster_list.append(list(entry))
			qStr = toString(entry,featMan)	
			oFile.write(qStr+'\n')
		oFile.close()
		predictedSamePairsSet, predictedDifferentPairsSet = \
						getPairLabelsFromClusters(cluster_list,featMan)
		#metrics[k] = getRecallPrecision(samePairsSet, \
		#				differentPairsSet,\
		#				predictedSamePairsSet,\
		#				predictedDifferentPairsSet)	
		# metrics[k] = getSamePairPrecisionRecallF1Calculator(samePairsSet, \
		#		predictedSamePairsSet)
		metrics[k] = getSetBasedLabelsAndMetric(cluster_list, \
				allTaskDict, featMan)

	for tcount, met in metrics.items():
		print tcount, met
		
	return metrics
Example #16
def cluster_domains(GDA, missing_da, p, starting_time):
    """Clusters the domain graph, using OPTICS on each connected component
    when p.daonly is set and k-medoids on the full distance matrix otherwise
    
    Parameters
    ----------
        GDA : Graph
            undirected graph of domain similarities
        missing_da : list
            DA not in Graph, no edge (no similarity) no any other DA
        p : argument parser object
            parameter object
        starting_time : int 
            program starting time
    Returns
    -------
        clusters : list
            a list of list containing the clusterised DA
    """
    # add missing da as a self cluster
    clusters = [ [da] for da in missing_da ]       
    
    if p.daonly :
        # to save memory, OPTICS is only run on each connected component
        clusters_comp = nx.connected_components( GDA )
        all_unclustered = [ ]
        for comp in clusters_comp :
            if len( comp ) > p.minpts :
                H = GDA.subgraph( comp )
                # NetworkX returns a numpy matrix rather than an ndarray,
                # hence the np.array conversion; similarity s -> distance 1 - s
                mat = 1.0 - np.array(nx.to_numpy_matrix(H, nodelist=comp))
                mat.flat[ :: mat.shape[0] + 1 ] = 0  # diag to 0
                # run OPTICS on distance matrix
                optics = Optics(  p.minpts, epsilon=p.epsilon )
                ordered, reachability, core_dist = optics.run(mat)
                labels = optics.cluster( p.epsilon_p )
                # group nodes by OPTICS cluster label
                slabels = set( labels )
                for k in slabels :
                    ind = np.where( labels == k )[0]
                    if k == -1 :
                        # -1 marks noise: each noise DA becomes its own cluster
                        for i in ind :
                            clusters.append( [comp[i]] )
                    else :
                        clusters.append( [ comp[i] for i in ind ] )
            else :
                # if the component has fewer members than the minpts cutoff,
                # all of its members are put into a single cluster
                clusters.append( comp )
    else :
        nodes = GDA.nodes()
        # NetworkX returns a numpy matrix rather than an ndarray,
        # not really convenient, hence the np.array conversion
        bigmat = 1.0 - np.array( nx.to_numpy_matrix( GDA, nodelist=nodes ) )
        bigmat.flat[ :: bigmat.shape[0] + 1 ] = 0  # diagonal to 0
        clusterid, error, nfound = Pycluster.kmedoids( bigmat, nclusters=p.kcluster, npass=10 )
        for l in np.unique( clusterid ) :
            tmp_clust =  [nodes[i] for i in range(clusterid.shape[0]) if clusterid[i] == l ] 
            clusters.append( tmp_clust )            

    return clusters
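A condensed sketch of the k-medoids branch, assuming an older NetworkX where to_numpy_matrix is still available, and a toy similarity graph:

import networkx as nx
import numpy as np
import Pycluster

G = nx.Graph()
G.add_weighted_edges_from([('a', 'b', 0.9), ('b', 'c', 0.8), ('d', 'e', 0.7)])
nodes = list(G.nodes())
mat = 1.0 - np.array(nx.to_numpy_matrix(G, nodelist=nodes))  # similarity -> distance
mat.flat[::mat.shape[0] + 1] = 0                             # zero the diagonal
clusterid, error, nfound = Pycluster.kmedoids(mat, nclusters=2, npass=10)
clusters = [[nodes[i] for i in np.where(clusterid == l)[0]]
            for l in np.unique(clusterid)]
print(clusters)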
Example #17
#!/usr/bin/python -tt

import Pycluster as pc
import numpy as np
import sys
import re

filename, n = sys.argv[1], int(sys.argv[2])
dist = np.loadtxt(filename, usecols=range(0, 799))

clustermap, error, nfound = pc.kmedoids(dist, n, npass=1000)
medoids = {}
for i in clustermap:
    medoids[i] = medoids.get(i, 0) + 1

output_filename = re.sub(r'/[^/]+$', '/kmedoids_result', filename)
np.savetxt(output_filename, clustermap, delimiter=" ", fmt="%d")

print clustermap
print "\n"
print medoids
print "nfound: " + str(nfound)
Example #18
import sys

import numpy as np
import Pycluster as pc
from scipy.spatial.distance import euclidean as dist  # assumed: dist was not shown in the snippet

filename, n = sys.argv[1], int(sys.argv[2])

data = np.loadtxt(filename)
k = len(data)

# Calculate the distance matrix
m = np.zeros(k * k)
m.shape = (k, k)

for i in range(0, k):
    for j in range(i, k):
        d = dist(data[i], data[j])
        m[i][j] = d
        m[j][i] = d

# Perform the actual clustering
clustermap, _, _ = pc.kmedoids(m, n, npass=20)

# Find the indices of the points used as medoids, and the cluster masses
medoids = {}
for i in clustermap:
    medoids[i] = medoids.get(i, 0) + 1

# Print points, grouped by cluster
for i in medoids.keys():
    print "Cluster=", i, " Mass=", medoids[i], " Centroid: ", data[i]

    for j in range(0, len(data)):
        if clustermap[j] == i:
            print "\t", data[j]
Example #20
'19HombreHip hopVolleyballEstar con mi familia AmbasRom-comsIntrovertid@',
'19MujerFlamenco Futbol DibujarAmbasThrillerExtrovertid@']

# the td*/tc* timers measure how long each part of the code takes to run
td0 = time.process_time_ns()
# get the pairwise edit distances (flat lower triangle)
dist2 = [distance.edit_distance(DataSetTFG[i], DataSetTFG[j])
        for i in range(1, len(DataSetTFG))
        for j in range(0, i)]
# time spent computing distances
td1 = time.process_time_ns() - td0

# timer for clustering with k=1
tc10 = time.process_time_ns()
# k-medoids with nclusters=1, npass=10
labels1, error1, nfound1 = PC.kmedoids(dist2, nclusters=1, npass=10)
# store the result in the dictionary cluster1
cluster1 = dict()
for roommate, label in zip(DataSetTFG, labels1):
    cluster1.setdefault(label, []).append(roommate)
for label, grp in cluster1.items():
    print(grp)

tc11 = time.process_time_ns() - tc10

# timer for clustering with k=2
tc20 = time.process_time_ns()
# k-medoids with nclusters=2, npass=10
labels2, error2, nfound2 = PC.kmedoids(dist2, nclusters=2, npass=10)
cluster2 = dict()
for roommate, label in zip(DataSetTFG, labels2):
Example #21
# > Rather than using the mean (which is usually not a real datapoint), K-medoids uses a particular datapoint as the centroid. This makes K-medoids less sensitive to outliers, and in the case of text it can give an actual "representative" document for a topic.
# 
# Valid values for metric are:
# from scikit-learn: [‘euclidean’, ‘l2’, ‘l1’, ‘manhattan’, ‘cityblock’]
# from scipy.spatial.distance: [‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘correlation’, ‘cosine’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, ‘matching’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’] See the documentation for scipy.spatial.distance for details on these metrics.
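# A hedged sketch (assuming the same X used below): for tf-idf text vectors, cosine distance is often a better fit than Euclidean.

# <codecell>

import Pycluster
from sklearn.metrics import pairwise_distances

cosine_distances = pairwise_distances(X, metric='cosine', n_jobs=1)
clusterid_cos, error_cos, nfound_cos = Pycluster.kmedoids(cosine_distances, nclusters=20, npass=100)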

# <codecell>

#K-Medoid
#Can pass in other distances for K-Medoids
import Pycluster
from Pycluster import kmedoids
from sklearn.metrics import pairwise_distances

distances = pairwise_distances(X, metric='euclidean', n_jobs=1)
nb_clusters = 20
clusterid, error, nfound = Pycluster.kmedoids(distances, nclusters=nb_clusters, npass=100)

# <codecell>

#Look at the actual comments that are central to each cluster:
medoids = list(set(clusterid))

for m in medoids:
    print "------------------------"
    print "medoid ", m, ":", grouped_questions[m]

# <markdowncell>

# ##Non-Negative Matrix Factorization

# <codecell>
def get_distance_matrix(points, distance_function):
    # build the full square matrix of pairwise distances
    distances = []
    for p1 in points:
        temp = []
        for i in range(0, len(points)):
            p2 = points[i]
            temp.append(distance_function(p1, p2))
        distances.append(temp)
    return distances



# def timespan(list):



nb_clusters = 15  # the number of clusters the dataset is supposed to be partitioned into
distances = get_distance_matrix(vectors, euclidean)
clusterid, error, nfound = Pycluster.kmedoids(distances, nclusters=nb_clusters, npass=100)



uniq_ids = list(set(clusterid))

new_ids = [ uniq_ids.index(val) for val in clusterid]

# print uniq_ids
# print new_ids



#############################################
# new_ids  ->  index:clusterid 				#
# vectors  ->  index:location				#
Example #23
def matchsTestM(current_user):

    if not current_user.admin:
        return jsonify({'message': 'Cannot perform that function!'})
    tiempoTI = time.process_time_ns()
    tests = TestR.query.filter_by(gender="Mujer").all()
    dataSet = []
    # strip accents, diaeresis, etc. from the test strings
    trans_tab = dict.fromkeys(map(ord, u"\u0301\u0308"), None)
    for test in tests:
        resident = Residents.query.filter_by(public_id=test.public_id).first()
        test_data = (
            "-" + str(resident.id) + "-"
            #+ test.gender
            + " " + str(test.age) + " " + test.musicGender + " " + test.sport +
            " " + test.hobbie + " " + test.movieSeries + " " +
            test.filmGender + " " + test.tabaco + " " + test.alcohol + " " +
            test.party + " " + str(test.ordenConvivencia) + " " +
            str(test.ordenPersonal) + test.personalidad)
        test_data = normalize(
            "NFKC",
            normalize("NFKD", test_data).translate(trans_tab))
        dataSet.append(test_data)

    tiempoPI = time.process_time_ns()
    tiempoII = time.process_time_ns()
    distans = [
        distance.edit_distance(dataSet[i], dataSet[j])
        for i in range(1, len(dataSet)) for j in range(0, i)
    ]
    tiempoPF = (time.process_time_ns() - tiempoPI)
    tiempoPF = tiempoPF / 60000000000  # ns -> minutes
    tiempoCI = time.process_time_ns()
    labels, error, nfound = PC.kmedoids(distans, nclusters=10, npass=10)
    cluster = dict()
    datosCluster = []
    outputId = []
    for roommate, label in zip(dataSet, labels):
        cluster.setdefault(label, []).append(roommate)
    for label, grp in cluster.items():
        cluster_dataID = {}
        cluster_dataID["resultados"] = grp
        outputId.append(cluster_dataID)
    tiempoCF = (time.process_time_ns() - tiempoCI)
    tiempoIF = (time.process_time_ns() - tiempoII)
    tiempoCF = tiempoCF / 60000000000
    tiempoIF = tiempoIF / 60000000000

    returnObj = []
    sizeRooms = 4
    # group the clustered residents into rooms of sizeRooms
    for cluster in outputId:
        room = []
        for row in cluster.values():
            num = len(row)
            #number of tests in cluster
            residual = num % sizeRooms
            offset = 1
            for test in row:
                #extract and format data
                x = test.split("-")
                index = row.index(test)
                realI = index + 1
                room.append(x[1])

                if realI % sizeRooms == 0:
                    #add room and create room
                    returnObj.append(room[:])
                    #empty room
                    room.clear()
                if (residual > 0 and realI > sizeRooms * (num // sizeRooms)):
                    # if the remainder is 1, 2 or 3, the leftover tests go
                    # into the last, second-to-last or third-to-last room
                    # number of rooms built so far
                    counter = (len(returnObj)) - offset
                    # print(counter, x[1], "ok")
                    returnObj[counter].append(x[1])
                    offset += 1
    #Write in DB
    for rooms in returnObj:
        new_room = Rooms(state="Completo")
        db.session.add(new_room)
        db.session.commit()
        for x in range(0, len(rooms)):
            new_match = Matches(user_id=rooms[x], room_id=new_room.id)
            db.session.add(new_match)
            db.session.commit()

    tiempoTF = (time.process_time_ns() - tiempoTI)
    tiempoTF = tiempoTF / 60000000000

    datos_data = {}
    datos_data["tiempoT"] = tiempoTF
    datos_data["TimepoIA"] = tiempoIF
    datos_data["tiempoP"] = tiempoPF
    datos_data["tiempoC"] = tiempoCF
    datos_data["error"] = error
    datos_data["nS"] = nfound
    datosCluster.append(datos_data)

    return jsonify({"datosIA": datos_data})
Example #24
	def generate_kmedoid(self,locationid):

		trend_cross_trend_matrix,trends_list = self.get_matrix(locationid)
		clusterid, error, nfound = Pycluster.kmedoids(trend_cross_trend_matrix, nclusters=4, npass=100)
		return clusterid, trends_list
	
import numpy
import Pycluster
from nltk.corpus import wordnet as wn  # assumed source of wn
import readwordlist  # local helper module
# getDistance(i, j) is a local helper assumed to be defined elsewhere

words = readwordlist.read("rt_words.csv")
vectWords = []
sysnet = {}
for i in range(50):
	sysnet.setdefault(i, {'word': words[i], 'sysnet': wn.synsets(words[i])})
	vectWords.append(i)


totalElement=len(vectWords)
totalClusters=10
distMatrix=numpy.ones((totalElement,totalElement),dtype=float)
for i in range(totalElement):
	for j in range(totalElement):
		distMatrix[i,j]=getDistance(i,j)

# kmedoids returns (clusterid, error, nfound); clusters[0] holds the labels
clusters = Pycluster.kmedoids(distMatrix, nclusters=totalClusters, npass=100)
print distMatrix
print clusters
groups = {}
for i in range(len(clusters[0])):
	if clusters[0][i] < totalClusters:
		groups.setdefault(clusters[0][i], []).append(sysnet[i]['word'])

for key,value in groups.items():
	print "\n***********************************\n"
	for v in value:
		print v
		
#print findclusters(vectWords,100)
#plt.show()