コード例 #1
0
def clusterAllWithKMeans(lowerLimit, upperLimit, featMan, weightMatrix,\
					samePairsSet, differentPairsSet, outDir):
	metrics = {}
	print 'Weight matrix length' ,len(weightMatrix)

	data = featMan.returnKeys()
	for k in range(lowerLimit,upperLimit,2):
		i = len(data)/k
		if i == 0:
			i = 1
		kmeans = KMeans(i,data,weightMatrix,5, 0.1)
		kmeans.cluster();
		clusters = kmeans.getClusters();
		noClus =kmeans.getTermInNoCluster();
		
		if clusters:
                        print 'Found clusters length', len(clusters),'singaltons', len(noClus)
			predictedSamePairsSet, predictedDifferentPairsSet = \
				getPairLabelsFromClusters(clusters,featMan)
			fname = outDir+'_'+str(i)+'.txt'
			oFile = open(fname,'w');
			for entry in clusters:
				if len(entry) > 0:
					oFile.write(toString(entry,featMan)+'\n')
			oFile.write('NO CLUST\t'+toString(noClus,featMan)+'\n');
			oFile.close()
                        print 'Same pair set', len(predictedSamePairsSet)
			#metrics[k] = getRecallPrecision(samePairsSet, \
			#differentPairsSet, predictedSamePairsSet, predictedDifferentPairsSet)
                        metrics[k] = getSamePairPrecisionRecallF1Calculator(samePairsSet,\
                                predictedSamePairsSet)
	for tcount, met in metrics.items():
		print tcount, met
	return metrics
コード例 #2
0
def clusterCatWithKMeans(lowerLimit, upperLimit, featMan, \
						weightMatrix, samePairsSet, \
						differentPairsSet, catQueryDist,\
						outFile = 'cat-clusters-with-mean.txt'):
	metrics = {}
	for termCount in range(lowerLimit, upperLimit):
		i = 1
		fclusters = []
		allCatClusters = []
		oFile = open(outFile+'_'+str(termCount)+'.txt','w')
		for cat, qIdSet in catQueryDist.items():
			if len(qIdSet) > 1:
				k = len(qIdSet)/termCount
				if k == 0:
					k = 1
				print cat, len(qIdSet), k
				if k > 1:
					kmeans = KMeans(k,list(qIdSet),weightMatrix,5, 0.1)
					kmeans.cluster();
					clusters = kmeans.getClusters();
					noClus =kmeans.getTermInNoCluster();
					for entry in clusters:
						if len(entry) > 1:
							allCatClusters.append(entry)
						if len(entry) > 0:
							cStr = toString(entry,featMan)
							fclusters.append(cStr)
							oFile.write(cat+'\t'+cStr+'\n');
					oFile.write(cat+'\t'+'NO CLUST\t'+\
								toString(noClus,featMan)+'\n');
				else:
					cStr = toString(qIdSet,featMan)
					oFile.write(cat+'\t'+cStr+'\n');
					allCatClusters.append(list(qIdSet))
				
				if i % 50 == 0:
					print i
				i+=1	
		predictedSamePairsSet, predictedDifferentPairsSet = \
						getPairLabelsFromClusters(allCatClusters,featMan)
		print 'COUNTS ',termCount, len(allCatClusters), \
		len(predictedSamePairsSet), len(catQueryDist)
		#metrics[termCount] = getRecallPrecision(samePairsSet, \
		#				differentPairsSet,\
		#				predictedSamePairsSet,\
		#				predictedDifferentPairsSet)	
                metrics[termCount] = getSamePairPrecisionRecallF1Calculator(samePairsSet,\
                                predictedSamePairsSet)
		oFile.close()
	for tcount, met in metrics.items():
		print tcount, met
	return metrics
コード例 #3
0
def clusterCatWithMediods(lowerLimit, upperLimit,featMan, weightMatrix, \
						 samePairsSet, differentPairsSet, catQueryDist, \
						outFile = 'cat-clusters-with-med.txt'):
	
	oFile = open(outFile,'w')
	metrics = {}
	for noTerms in range(lowerLimit, upperLimit):
		#fclusters = []
		cluster_list = []
		i = 0
		oFile = open(outFile+str(noTerms)+'.txt','w')
		for cat, qSet in catQueryDist.items():
			if len(qSet) > 1: # and cat in pairs:
				k = len(qSet)/noTerms
				if k == 0:
					k = 1
			
				qList = sorted(list(qSet),reverse=True)
				catDist = getWeightMatrixForKMed(qList, weightMatrix,'cat_kmediods')
							
				clusArray, error, opt = clust.kmedoids(catDist,k, 5, None)
				clusters = {}
				for c in range(1, len(clusArray)):
					clusId = clusArray[c]
					if clusId not in clusters:
						clusters[clusId] = set()
					clusters[clusId].add(qList[c-1])

				
				for entry in clusters.values():
					cluster_list.append(list(entry))
					qStr = toString(entry,featMan)
					#fclusters.append(qStr)
					oFile.write(cat+'\t'+qStr+'\n');
				print 'Clust category',cat, 'length', len(clusters),\
                                        'Queries' , len(qSet),'k', k,  'error', error, opt
				if i % 5 == 0:
					print i
				i+=1	
		predictedSamePairsSet, predictedDifferentPairsSet = \
						getPairLabelsFromClusters(cluster_list,featMan)
		#metrics[noTerms] = getRecallPrecision(samePairsSet, \
		#			differentPairsSet,\
		#			predictedSamePairsSet,\
		#			predictedDifferentPairsSet)	
                metrics[noTerms] = getSamePairPrecisionRecallF1Calculator(samePairsSet,\
                                predictedSamePairsSet)

		oFile.close()
	for tcount, met in metrics.items():
		print tcount, met
	return metrics