def clusterAllWithKMeans(lowerLimit, upperLimit, featMan, weightMatrix,
                         samePairsSet, differentPairsSet, outDir):
    """Cluster every key of ``featMan`` with KMeans for a sweep of sizes.

    For each ``k`` in ``range(lowerLimit, upperLimit, 2)`` the keys returned
    by ``featMan.returnKeys()`` are clustered into ``len(keys) // k``
    clusters (at least one).  When the run yields clusters they are written
    to ``outDir + '_' + <cluster count> + '.txt'`` -- one non-empty cluster
    per line, followed by a ``NO CLUST`` line holding the terms that ended
    up in no cluster -- and the same-pair metric is stored under ``k``.

    Note that the file is named after the cluster count, not ``k``: two
    values of ``k`` that produce the same count write to the same file.

    Args:
        lowerLimit: first ``k`` of the sweep (inclusive).
        upperLimit: end of the sweep (exclusive); ``k`` advances by 2.
        featMan: object providing ``returnKeys()``; also forwarded to
            ``toString`` and ``getPairLabelsFromClusters``.
        weightMatrix: passed through to ``KMeans``; only its length is
            inspected here (for the log line).
        samePairsSet: reference pairs handed to
            ``getSamePairPrecisionRecallF1Calculator``.
        differentPairsSet: unused by the metric currently computed; kept
            so existing callers do not break.
        outDir: prefix of the output file names.

    Returns:
        dict mapping each ``k`` that produced clusters to the value
        returned by ``getSamePairPrecisionRecallF1Calculator``.

    Raises:
        ZeroDivisionError: if the sweep contains ``k == 0``.
    """
    metrics = {}
    # Every print passes one pre-formatted string so the emitted line is the
    # same whether ``print`` is the statement or the function.
    print('Weight matrix length %s' % len(weightMatrix))
    data = featMan.returnKeys()

    for k in range(lowerLimit, upperLimit, 2):
        # Explicit floor division; a zero quotient falls back to one cluster.
        clusterCount = (len(data) // k) or 1

        kmeans = KMeans(clusterCount, data, weightMatrix, 5, 0.1)
        kmeans.cluster()
        clusters = kmeans.getClusters()
        noClus = kmeans.getTermInNoCluster()
        if not clusters:
            # Nothing to write or score for this k.
            continue

        print('Found clusters length %s singaltons %s'
              % (len(clusters), len(noClus)))
        # Only the "same" pairs feed the metric; the second value is ignored.
        predictedSamePairsSet, _ignoredDifferentPairs = \
            getPairLabelsFromClusters(clusters, featMan)

        fname = outDir + '_' + str(clusterCount) + '.txt'
        # ``with`` guarantees the handle is closed even if toString() raises.
        with open(fname, 'w') as oFile:
            for entry in clusters:
                if len(entry) > 0:
                    oFile.write(toString(entry, featMan) + '\n')
            oFile.write('NO CLUST\t' + toString(noClus, featMan) + '\n')

        print('Same pair set %s' % len(predictedSamePairsSet))
        metrics[k] = getSamePairPrecisionRecallF1Calculator(
            samePairsSet, predictedSamePairsSet)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))
    return metrics
def clusterCatWithKMeans(lowerLimit, upperLimit, featMan,
                         weightMatrix, samePairsSet,
                         differentPairsSet, catQueryDist,
                         outFile='cat-clusters-with-mean.txt'):
    """Cluster the queries of each category with KMeans and score the result.

    For every ``termCount`` in ``range(lowerLimit, upperLimit)`` a file
    ``outFile + '_' + str(termCount) + '.txt'`` is written.  Categories with
    a single query are skipped.  For the others the cluster count is
    ``len(queries) // termCount`` (at least one):

    * more than one cluster: KMeans is run on the category's queries; each
      non-empty cluster is written as ``<cat>\\t<cluster>``, followed by a
      ``<cat>\\tNO CLUST\\t<terms>`` line.  Only clusters with more than one
      member are kept for scoring.
    * exactly one cluster: the whole category is written and scored as a
      single cluster without running KMeans.

    Args:
        lowerLimit: first ``termCount`` of the sweep (inclusive).
        upperLimit: end of the sweep (exclusive).
        featMan: forwarded to ``toString`` and ``getPairLabelsFromClusters``.
        weightMatrix: passed through to ``KMeans``.
        samePairsSet: reference pairs handed to
            ``getSamePairPrecisionRecallF1Calculator``.
        differentPairsSet: unused by the metric currently computed; kept
            so existing callers do not break.
        catQueryDist: dict mapping a category to its collection of queries.
        outFile: prefix of the per-``termCount`` output files.

    Returns:
        dict mapping each ``termCount`` to the value returned by
        ``getSamePairPrecisionRecallF1Calculator``.

    Raises:
        ZeroDivisionError: if the sweep contains ``termCount == 0`` and some
            category has more than one query.
    """
    metrics = {}
    for termCount in range(lowerLimit, upperLimit):
        allCatClusters = []
        # ``with`` closes the file even when clustering or toString() raises.
        with open(outFile + '_' + str(termCount) + '.txt', 'w') as oFile:
            # ``seen`` counts every category (1-based), skipped ones included.
            for seen, (cat, qIdSet) in enumerate(catQueryDist.items(), 1):
                if len(qIdSet) > 1:
                    # Explicit floor division; zero falls back to one cluster.
                    k = (len(qIdSet) // termCount) or 1
                    # Single pre-formatted argument: same output whether
                    # ``print`` is the statement or the function.
                    print('%s %s %s' % (cat, len(qIdSet), k))
                    if k > 1:
                        kmeans = KMeans(k, list(qIdSet), weightMatrix, 5, 0.1)
                        kmeans.cluster()
                        clusters = kmeans.getClusters()
                        noClus = kmeans.getTermInNoCluster()
                        for entry in clusters:
                            if len(entry) > 1:
                                allCatClusters.append(entry)
                            if len(entry) > 0:
                                oFile.write(
                                    cat + '\t' + toString(entry, featMan) + '\n')
                        oFile.write(cat + '\t' + 'NO CLUST\t'
                                    + toString(noClus, featMan) + '\n')
                    else:
                        # Too few queries to split: one cluster per category.
                        oFile.write(
                            cat + '\t' + toString(qIdSet, featMan) + '\n')
                        allCatClusters.append(list(qIdSet))
                if seen % 50 == 0:
                    print(seen)

        # Only the "same" pairs feed the metric; the second value is ignored.
        predictedSamePairsSet, _ignoredDifferentPairs = \
            getPairLabelsFromClusters(allCatClusters, featMan)
        # Two spaces after COUNTS: the original label carried a trailing
        # blank in addition to the separator.
        print('COUNTS  %s %s %s %s' % (termCount, len(allCatClusters),
                                       len(predictedSamePairsSet),
                                       len(catQueryDist)))
        metrics[termCount] = getSamePairPrecisionRecallF1Calculator(
            samePairsSet, predictedSamePairsSet)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))
    return metrics
def clusterCatWithMediods(lowerLimit, upperLimit, featMan, weightMatrix,
                          samePairsSet, differentPairsSet, catQueryDist,
                          outFile='cat-clusters-with-med.txt'):
    """Cluster the queries of each category with k-medoids and score them.

    For every ``noTerms`` in ``range(lowerLimit, upperLimit)`` a file
    ``outFile + str(noTerms) + '.txt'`` is written.  Categories with a
    single query are skipped; for the others the queries are sorted in
    descending order, a distance matrix is obtained from
    ``getWeightMatrixForKMed`` and ``clust.kmedoids`` is asked for
    ``len(queries) // noTerms`` clusters (at least one).  Each resulting
    cluster is written as ``<cat>\\t<cluster>`` and kept for scoring.

    Args:
        lowerLimit: first ``noTerms`` of the sweep (inclusive).
        upperLimit: end of the sweep (exclusive).
        featMan: forwarded to ``toString`` and ``getPairLabelsFromClusters``.
        weightMatrix: passed through to ``getWeightMatrixForKMed``.
        samePairsSet: reference pairs handed to
            ``getSamePairPrecisionRecallF1Calculator``.
        differentPairsSet: unused by the metric currently computed; kept
            so existing callers do not break.
        catQueryDist: dict mapping a category to its collection of queries.
        outFile: prefix of the per-``noTerms`` output files.  A file with
            exactly this name is also created (and left empty).

    Returns:
        dict mapping each ``noTerms`` to the value returned by
        ``getSamePairPrecisionRecallF1Calculator``.

    Raises:
        ZeroDivisionError: if the sweep contains ``noTerms == 0`` and some
            category has more than one query.
    """
    # The original opened ``outFile`` here and then rebound the variable
    # inside the loop, leaking the handle.  The empty file is still created
    # so the on-disk result is unchanged, but it is closed right away.
    open(outFile, 'w').close()

    metrics = {}
    for noTerms in range(lowerLimit, upperLimit):
        cluster_list = []
        # ``with`` closes the file even when clustering or toString() raises.
        with open(outFile + str(noTerms) + '.txt', 'w') as oFile:
            # ``seen`` counts every category (0-based), skipped ones included.
            for seen, (cat, qSet) in enumerate(catQueryDist.items()):
                if len(qSet) > 1:
                    # Explicit floor division; zero falls back to one cluster.
                    k = (len(qSet) // noTerms) or 1
                    qList = sorted(qSet, reverse=True)
                    catDist = getWeightMatrixForKMed(qList, weightMatrix,
                                                     'cat_kmediods')
                    # presumably (assignment array, error, times the best
                    # solution was found) -- verify against the ``clust``
                    # module in use.
                    clusArray, error, opt = clust.kmedoids(catDist, k, 5, None)

                    # NOTE(review): assignments are read from index 1 and
                    # mapped to qList[c - 1], so clusArray[0] is never used
                    # and, if clusArray has one entry per query, neither is
                    # the last query.  Kept as in the original; confirm this
                    # matches the layout built by getWeightMatrixForKMed.
                    clusters = {}
                    for c in range(1, len(clusArray)):
                        clusters.setdefault(clusArray[c], set()).add(
                            qList[c - 1])

                    for entry in clusters.values():
                        cluster_list.append(list(entry))
                        oFile.write(
                            cat + '\t' + toString(entry, featMan) + '\n')

                    # Single pre-formatted argument: same output whether
                    # ``print`` is the statement or the function.
                    print('Clust category %s length %s Queries %s k %s '
                          'error %s %s' % (cat, len(clusters), len(qSet),
                                           k, error, opt))
                if seen % 5 == 0:
                    print(seen)

        # Only the "same" pairs feed the metric; the second value is ignored.
        predictedSamePairsSet, _ignoredDifferentPairs = \
            getPairLabelsFromClusters(cluster_list, featMan)
        metrics[noTerms] = getSamePairPrecisionRecallF1Calculator(
            samePairsSet, predictedSamePairsSet)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))
    return metrics