def clusterAllWithKMeans(lowerLimit, upperLimit, featMan, weightMatrix,
                         samePairsSet, differentPairsSet, outDir):
    """Cluster all keys of ``featMan`` with KMeans for a range of sizes.

    For every ``k`` in ``range(lowerLimit, upperLimit, 2)`` the number of
    clusters is ``len(data) // k`` (replaced by 1 when that quotient is 0),
    where ``data`` is ``featMan.returnKeys()``.  When the run produces a
    non-empty cluster list, the clusters are written to a text file and the
    same-pair metrics against ``samePairsSet`` are recorded.

    Args:
        lowerLimit: first value of ``k`` (inclusive).  ``k`` is used as a
            divisor, so a range that contains 0 raises ZeroDivisionError.
        upperLimit: end of the ``k`` range (exclusive); the step is 2.
        featMan: object providing ``returnKeys()``; also forwarded to
            ``toString`` and ``getPairLabelsFromClusters``.
        weightMatrix: passed through to ``KMeans``; only its length is
            read here (for the log line).
        samePairsSet: reference pairs handed to
            ``getSamePairPrecisionRecallF1Calculator``.
        differentPairsSet: unused; kept so existing callers keep working.
        outDir: prefix of the output path, which is built as
            ``outDir + '_' + <cluster count> + '.txt'``.

    Returns:
        dict mapping each ``k`` that yielded clusters to the value returned
        by ``getSamePairPrecisionRecallF1Calculator``.
    """
    metrics = {}
    print('Weight matrix length %s' % (len(weightMatrix),))
    data = featMan.returnKeys()

    for k in range(lowerLimit, upperLimit, 2):
        # Floor division keeps the cluster count an integer regardless of
        # the interpreter's division mode.
        clusterCount = len(data) // k
        if clusterCount == 0:
            clusterCount = 1

        # NOTE(review): 5 and 0.1 are presumably the iteration cap and the
        # convergence threshold -- confirm against the KMeans constructor.
        kmeans = KMeans(clusterCount, data, weightMatrix, 5, 0.1)
        kmeans.cluster()
        clusters = kmeans.getClusters()
        noClus = kmeans.getTermInNoCluster()
        if not clusters:
            continue

        print('Found clusters length %s singaltons %s'
              % (len(clusters), len(noClus)))
        predictedSamePairsSet, _ = getPairLabelsFromClusters(clusters,
                                                             featMan)

        # NOTE(review): the file name depends on the cluster count, not on
        # k, so two k values with the same quotient overwrite each other.
        fname = outDir + '_' + str(clusterCount) + '.txt'
        # The context manager closes the file even if a write or
        # toString() call raises.
        with open(fname, 'w') as oFile:
            for entry in clusters:
                if len(entry) > 0:
                    oFile.write(toString(entry, featMan) + '\n')
            oFile.write('NO CLUST\t' + toString(noClus, featMan) + '\n')

        print('Same pair set %s' % (len(predictedSamePairsSet),))
        metrics[k] = getSamePairPrecisionRecallF1Calculator(
            samePairsSet, predictedSamePairsSet)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))
    return metrics
def clusterCatWithKMeans(lowerLimit, upperLimit, featMan,
                         weightMatrix, samePairsSet,
                         differentPairsSet, catQueryDist,
                         outFile='cat-clusters-with-mean.txt'):
    """Run KMeans separately inside each category and score the result.

    For every ``termCount`` in ``range(lowerLimit, upperLimit)`` each
    category of ``catQueryDist`` that holds more than one id is split into
    ``len(qIdSet) // termCount`` clusters (replaced by 1 when the quotient
    is 0).  A category that ends up with a single cluster is kept whole.
    The clusters are written to ``outFile + '_' + str(termCount) + '.txt'``
    and the same-pair metrics against ``samePairsSet`` are recorded.

    Args:
        lowerLimit: first ``termCount`` (inclusive).  ``termCount`` is used
            as a divisor, so a range that contains 0 raises
            ZeroDivisionError.
        upperLimit: end of the ``termCount`` range (exclusive).
        featMan: forwarded to ``toString`` and
            ``getPairLabelsFromClusters``.
        weightMatrix: passed through to ``KMeans``.
        samePairsSet: reference pairs handed to
            ``getSamePairPrecisionRecallF1Calculator``.
        differentPairsSet: unused; kept so existing callers keep working.
        catQueryDist: dict mapping a category string to a collection of
            ids (it is passed to ``len`` and ``list``).
        outFile: prefix of the per-``termCount`` output file name.

    Returns:
        dict mapping each ``termCount`` to the value returned by
        ``getSamePairPrecisionRecallF1Calculator``.
    """
    metrics = {}

    for termCount in range(lowerLimit, upperLimit):
        allCatClusters = []
        fname = outFile + '_' + str(termCount) + '.txt'

        # The context manager closes the file even if clustering or a
        # write raises part-way through the categories.
        with open(fname, 'w') as oFile:
            for seen, (cat, qIdSet) in enumerate(catQueryDist.items(), 1):
                if len(qIdSet) > 1:
                    # Floor division keeps the cluster count an integer
                    # regardless of the interpreter's division mode.
                    k = len(qIdSet) // termCount
                    if k == 0:
                        k = 1
                    print('%s %s %s' % (cat, len(qIdSet), k))

                    if k > 1:
                        # NOTE(review): 5 and 0.1 are presumably the
                        # iteration cap and convergence threshold --
                        # confirm against the KMeans constructor.
                        kmeans = KMeans(k, list(qIdSet), weightMatrix,
                                        5, 0.1)
                        kmeans.cluster()
                        clusters = kmeans.getClusters()
                        noClus = kmeans.getTermInNoCluster()
                        for entry in clusters:
                            # Only clusters with at least two members can
                            # contribute pairs to the evaluation.
                            if len(entry) > 1:
                                allCatClusters.append(entry)
                            if len(entry) > 0:
                                oFile.write(cat + '\t'
                                            + toString(entry, featMan)
                                            + '\n')
                        oFile.write(cat + '\t' + 'NO CLUST\t'
                                    + toString(noClus, featMan) + '\n')
                    else:
                        # A single cluster: keep the category as it is.
                        # NOTE(review): pairing this else with `if k > 1`
                        # was inferred from the data flow -- confirm.
                        oFile.write(cat + '\t' + toString(qIdSet, featMan)
                                    + '\n')
                        allCatClusters.append(list(qIdSet))

                # Progress indicator, one line per 50 categories.
                if seen % 50 == 0:
                    print(seen)

        predictedSamePairsSet, _ = getPairLabelsFromClusters(allCatClusters,
                                                             featMan)
        print('COUNTS  %s %s %s %s'
              % (termCount, len(allCatClusters),
                 len(predictedSamePairsSet), len(catQueryDist)))
        metrics[termCount] = getSamePairPrecisionRecallF1Calculator(
            samePairsSet, predictedSamePairsSet)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))
    return metrics