def _feedback_membership(terms, userFeedbackTerm, k, M):
    """Build the (k, M) seeding matrix from the user's per-cluster terms.

    Row ``i`` corresponds to ``userFeedbackTerm[i]``. The j-th term of a
    cluster gets weight ``max(1 - j * 0.05, 0.5)``, so the first term gets 1
    and later terms get progressively less, never below 0.5. Terms that do
    not occur in ``terms`` are ignored.
    """
    membership = numpy.zeros((k, M), float)
    step = 0.05  # lower-ranked terms receive a lower weight
    for i, cluster_terms in enumerate(userFeedbackTerm):
        for j, term in enumerate(cluster_terms):
            # terms is indexed as a 2-D (1, M) array, so [1] holds the
            # matching column indices.
            columns = numpy.where(terms == term)[1]
            if columns.size > 0:
                membership[i, columns[0]] = max(1 - j * step, 0.5)
    return membership


def icluster(data, terms, userFeedbackTerm, k, userU=-1):
    """Cluster the rows of ``data`` into ``k`` clusters via fuzzy term clustering.

    Terms (columns) are fuzzy-clustered with ``Fuzzy.FuzzyCMeans``; for each
    term cluster a document centroid is derived, and documents are then
    assigned to centroids by cosine distance. The whole procedure is repeated
    until every one of the ``k`` centroids is the nearest centroid of at
    least one document.

    NOTE(review): there is no retry limit, so the loop does not terminate if
    ``k`` non-empty clusters are never produced.

    Args:
        data: 2-D array, one row per document and one column per term
            (``N, M = data.shape``).
        terms: 2-D array of term strings indexed as ``terms[0, j]``.
        userFeedbackTerm: sequence whose i-th item lists the terms the user
            attached to cluster ``i``. Only read when ``userU == +1``.
        k: requested number of clusters.
        userU: the scalar ``+1`` signals a re-clustering request, in which
            case the seeding matrix is built from ``userFeedbackTerm``. Any
            other value (the default ``-1``, or a non-scalar) is forwarded to
            ``Fuzzy.FuzzyCMeans`` unchanged.

    Returns:
        A tuple ``(clusterDocs, clusterKeyterms, keyterms, silhouette_avg,
        scores)``:
        clusterDocs -- ``k`` lists of 1-based document numbers whose
            normalised closeness to that centroid exceeds 0.95 (a document
            can appear in several lists, or in none).
        clusterKeyterms -- ``k`` lists with the term strings for ``keyterms``.
        keyterms -- ``k`` arrays of term column indices, ordered by
            descending ``attrVals`` score.
        silhouette_avg -- cosine silhouette score of the nearest-centroid
            assignment.
        scores -- dict mapping ``str(cluster label)`` to
            ``scale_score(mean silhouette of that cluster)``.
    """
    N, M = data.shape
    # Guard with isscalar so that an array-valued userU is passed through
    # instead of raising on an ambiguous truth value.
    if numpy.isscalar(userU) and userU == +1:
        userU = _feedback_membership(terms, userFeedbackTerm, k, M)

    docs = numpy.arange(1, N + 1).reshape((1, N))  # 1-based document numbers
    Vars = numpy.var(data, axis=0).transpose()     # per-term variance
    options = (1.1, 25, 0.01, 0)

    # In case fewer clusters than requested come out non-empty, recluster
    # until the right number is reached.
    realK = 0
    while realK < k:
        selectedCentroids = numpy.empty([k, M], dtype=float)
        fcm = Fuzzy.FuzzyCMeans(data.transpose(), k, options[0], 'cosine',
                                userU, options[1], options[2])
        fcm()
        bestU = fcm.mu

        for p in range(k):
            # Terms of cluster p: those whose membership exceeds 1/k.
            # argmax returns 0 when nothing exceeds it, i.e. all terms.
            sortIDX = numpy.argsort(bestU[p, :])
            sortV = numpy.sort(bestU[p, :])
            idpp = sortIDX[numpy.argmax(sortV > (1.0 / k)):]

            # Keep only the terms with at least average variance.
            Varsp = Vars[idpp]
            keyTerms = idpp[numpy.where(Varsp >= numpy.mean(Varsp))[0]]

            # Split documents in two by their mean weight over those terms.
            sumDataset = numpy.mean(data[:, keyTerms], axis=1)
            _, label = scipy.cluster.vq.kmeans2(
                sumDataset, 2, iter=50, thresh=1e-03, minit='random',
                missing='warn')
            group0 = numpy.where(label == 0)[0]
            group1 = numpy.where(label == 1)[0]

            # Use the smaller non-empty group as the relevant documents;
            # on a tie group 1 wins.
            if group0.size == 0:
                relDocs = group1
            elif group1.size == 0:
                relDocs = group0
            elif group0.size >= group1.size:
                relDocs = group1
            else:
                relDocs = group0
            selectedCentroids[p, :] = numpy.mean(data[relDocs, :], axis=0)

        Y = cdist(data, selectedCentroids, 'cosine')
        IDX = numpy.argmin(Y, axis=1)
        # Number of centroids that are nearest to at least one document.
        realK = int(numpy.unique(IDX).size)

    # NOTE(review): the original indentation was lost; everything below is
    # placed after the retry loop so results are built once, from the final
    # successful pass -- confirm against the upstream file.

    # Per document, rescale distances so the nearest centroid scores 1 and
    # the farthest scores 0, then keep the centroids scoring above 0.95.
    minY = numpy.min(Y, axis=1)
    maxMmin = numpy.max(Y, axis=1) - minY
    tempY = 1 - numpy.multiply(Y - minY[:, None],
                               numpy.power(maxMmin, -1.0)[:, None])
    threshold = 0.95
    tempY = (tempY > threshold)
    clusters = [numpy.where(tempY[:, p])[0] for p in range(k)]

    # metric is passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    silhouette_avg = silhouette_score(data, IDX, metric='cosine')
    sample_silhouette_values = silhouette_samples(data, IDX, metric='cosine')
    scores = dict()
    for label in IDX:
        key = str(label)
        if key in scores:
            continue  # one score per cluster, in order of first appearance
        scores[key] = scale_score(
            numpy.mean(sample_silhouette_values[IDX == label]))

    attrVals = numpy.empty([M, k], dtype=float)
    # presumably computeX2 fills attrVals in place -- its definition is not
    # visible here.
    computeX2(attrVals, clusters, data, N)

    # NOTE(review): `f` is not defined in this function; it is expected to be
    # a module-level count of key terms per cluster -- confirm.
    keyterms = [numpy.argsort(attrVals[:, p])[::-1][range(f)]
                for p in range(k)]

    # Build the term lists directly rather than via a quoted string and
    # ast.literal_eval, which broke on terms containing '"' or '\'.
    clusterKeyterms = [[str(terms[0, j]) for j in keyterms[p]]
                       for p in range(k)]
    clusterDocs = [list(docs[0, clusters[p]]) for p in range(k)]

    return clusterDocs, clusterKeyterms, keyterms, silhouette_avg, scores
NMI = [] keyterms = [] #clusterKeyterms = numpy.empty([1,k], dtype=object) clusterKeyterms = [] clusterDocs = [] realK = 0 while (realK < k): idp = [] selectedCentroids = numpy.empty([k, M], dtype=float) attrVals = numpy.empty([M, k], dtype=float) fcm = Fuzzy.FuzzyCMeans(data.transpose(), k, options[0], 'cosine', userU) fcm() bestU = fcm.mu #.transpose() for p in range(k): sortIDX = numpy.argsort(bestU[p, :]) sortV = numpy.sort(bestU[p, :]) tempIndex = numpy.argmax(sortV > (1.0 / k)) idp.append(sortIDX[tempIndex:]) for p in range(k): idx = [] idpp = idp[p] Varsp = Vars[idpp] meanVarsp = numpy.mean(Varsp)