def clusterCatWithMediodsAndNetwork(threshold,
                                    lowerLimit, upperLimit, featMan,
                                    weightMatrix, samePairsSet,
                                    differentPairsSet, catQueryDist,
                                    catNetwork,
                                    outFile='cat-clusters-with-med.txt'):
    """Cluster each category's queries with k-medoids and score the clusters.

    For every ``noTerms`` in ``range(lowerLimit, upperLimit, 2)``:

    * each category of ``catQueryDist`` holding more than one query is split
      into ``max(1, len(queries) // noTerms)`` clusters via ``clust.kmedoids``;
    * every cluster is written as one ``<category>\\t<toString(cluster)>`` line
      to the file ``outFile + str(noTerms) + '.txt'``;
    * all clusters of the run are turned into pair labels with
      ``getPairLabelsFromClusters`` and scored with ``getRecallPrecision``.

    Args:
        threshold: Only used as the prefix of the keys of the returned dict.
        lowerLimit: First ``noTerms`` value tried. Must not produce
            ``noTerms == 0``, which would raise ``ZeroDivisionError``.
        upperLimit: Exclusive upper bound for ``noTerms`` (step is 2).
        featMan: Passed through to ``toString`` and
            ``getPairLabelsFromClusters``.
        weightMatrix: Passed to ``getWeightMatrixForKMed`` together with the
            category's query list to build the distance input for k-medoids.
        samePairsSet: Passed to ``getRecallPrecision`` (presumably the gold
            "same" pairs -- confirm against the caller).
        differentPairsSet: Passed to ``getRecallPrecision`` (presumably the
            gold "different" pairs -- confirm against the caller).
        catQueryDist: Mapping of category name (a string, it is concatenated
            into the output line) to a collection of queries.
        catNetwork: Not used by this function; kept for interface
            compatibility.
        outFile: Prefix of the per-``noTerms`` output file names.

    Returns:
        dict mapping ``'<threshold>_<noTerms>'`` to the value returned by
        ``getRecallPrecision`` for that run.
    """
    metrics = {}
    for noTerms in range(lowerLimit, upperLimit, 2):
        cluster_list = []
        processed = 0
        # The context manager releases the handle even when one of the helper
        # calls below raises; the original leaked it in that case.
        with open(outFile + str(noTerms) + '.txt', 'w') as oFile:
            for cat, qSet in catQueryDist.items():
                # A single query cannot be clustered any further.
                if len(qSet) <= 1:
                    continue
                qList = list(qSet)
                # Explicit floor division; always ask for at least one cluster.
                k = max(1, len(qList) // noTerms)
                catDist = getWeightMatrixForKMed(qList, weightMatrix)
                # NOTE(review): looks like Pycluster's
                # kmedoids(distance, nclusters, npass, initialid) -- confirm.
                clusArray, error, opt = clust.kmedoids(catDist, k, 5, None)

                # Group the queries by the cluster id assigned at their index.
                clusters = {}
                for idx, clusId in enumerate(clusArray):
                    clusters.setdefault(clusId, set()).add(qList[idx])

                for entry in clusters.values():
                    cluster_list.append(list(entry))
                    oFile.write(cat + '\t' + toString(entry, featMan) + '\n')

                # Single-argument print keeps the Python 2 output layout and
                # is also valid under Python 3.
                print('Clust  %s %s %s %s' % (cat, len(clusters), error, opt))
                # Progress marker every 50 clustered categories.
                if processed % 50 == 0:
                    print(processed)
                processed += 1

        predictedSamePairsSet, predictedDifferentPairsSet = \
            getPairLabelsFromClusters(cluster_list, featMan)
        key = str(threshold) + '_' + str(noTerms)
        metrics[key] = getRecallPrecision(samePairsSet, differentPairsSet,
                                          predictedSamePairsSet,
                                          predictedDifferentPairsSet)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))
    return metrics
# Example #2
# 0
        else:
            print 'Query feature error ', session[i]
      sessCount += 1
    labels = qcc.getTaskComponents()
    fname = args.outDir + '_'+args.algo+'_' + str(threshold) + '.txt'
    outFile = open(fname, 'w')

    for entry in labels:
      string = ''
      for qid in entry:
        string += featMan.returnQuery(qid) + '\t'
      outFile.write(string.strip() + '\n')
    outFile.close()
    predicted_same_pairs, predicted_different_pairs=\
     getPairLabelsFromClusters(labels,featMan)
    metrics[threshold] = getRecallPrecision(samePairsSet, differentPairsSet, predicted_same_pairs, predicted_different_pairs)
    for tcount, met in metrics.items():
      print tcount, met
    mergeMetrics(total_metrics_dict, metrics)
  computeAverageAndVarianceOfMetrics(args.algo, total_metrics_dict)

  #qcos, ucos, userCos, sessionCos, ngramCos, entCos, \
  #catCos,typeCos = qf1.findCosineDistance(qf2)
  #qjac = qf1.findJacardDistance(qf2)
  ##qedit = qf1.findEditDistance(qf2)
  ##normalized distance
  ##dist = (j - i)#*1.0/len(session)
  ##oFile.write(str(qid1)+'\t'+str(qid2)+'\t'+\
  ##str(round(qcos,2))+'\t'+str(round(qjac,2))+'\t'+\
  ##str(round(ngramCos,2))+'\t'+str(round(userCos,2))+'\t' + \
  ##str(round(entCos,2))+'\t'+ str(round(catCos,2))+\