def main(argv):
    """Build and emit a filtered network from query features and distances.

    argv[1]: argument forwarded to returnFilteredNetwork (second position)
    argv[2]: feature file (read into the FeatureManager, also forwarded)
    argv[3]: pairwise weight/distance matrix file
    """
    feature_manager = FeatureManager()
    feature_manager.readFeatures(argv[2])
    distances = readWeightMatrix(argv[3])
    returnFilteredNetwork(argv[2], argv[1], feature_manager, distances)
def main(argv): featMan = FeatureManager() featMan.readFeatures(argv[1]) data = featMan.returnKeys() #random.sample(list(featMan.returnKeys()),40) print 'To cluster', data #weightMatrix = readWeightMatrix(argv[2]) weightList = readWeightMatrix(argv[2]) catNetwork, catQueryDist = returnFilteredNetwork(argv[3], argv[4], featMan) dp = DPMeans() x = float(argv[5]) while x < 0.85: dp.clusterWithNetwork(featMan, weightList, catNetwork, catQueryDist, x, 0.01) x += 0.10
def sampleQueryPairs(fileName, weightFile, featFile): weightMatrix = readWeightMatrix(weightFile) featMan = FeatureManager() featMan.loadQueries(featFile) idDict = featMan.returnIdDict() qDict = featMan.returnQueryDict() clusters = loadClustersWithQueryFile(fileName, idDict) done = {} for entry in clusters: minPair = None maxPair = None minDist = 1000 maxDist = 0 #print entry if len(entry) > 3: sentry = sorted(entry) for i in range(len(sentry) - 1): for j in range(i + 1, len(sentry)): try: if weightMatrix[sentry[i]][sentry[j]] < minDist: minDist = weightMatrix[sentry[i]][sentry[j]] minPair = (qDict[sentry[i]], qDict[sentry[j]]) if weightMatrix[sentry[i]][sentry[j]] > maxDist: maxDist = weightMatrix[sentry[i]][sentry[j]] maxPair = (qDict[sentry[i]], qDict[sentry[j]]) except: dist = random.uniform(0.8, 1.0) if dist < minDist: minDist = dist minPair = (qDict[sentry[i]], qDict[sentry[j]]) if dist > maxDist: maxDist = dist maxPair = (qDict[sentry[i]], qDict[sentry[j]]) if minPair and minPair[0] not in done and minPair[1] not in done: print 'Min\t' + minPair[0] + '\t' + minPair[1] if maxPair and maxPair[0] not in done and maxPair[1] not in done: print 'Max\t' + maxPair[0] + '\t' + maxPair[1] if minPair: done[minPair[0]] = 1 done[minPair[1]] = 1 if maxPair: done[maxPair[0]] = 1 done[maxPair[1]] = 1
' cluster', required=True,type=float) parser.add_argument('-s', '--sessionFile', help='Session file containing'+\ ' queries', required=True) parser.add_argument('-p', '--pairLabelFile', help='Task labels for a'+\ ' pair of queries, same_task and different_task',\ required=False) args = parser.parse_args() qcc = QCCTasks() featMan = FeatureManager() #stemmer = stem.porter.PorterStemmer() featMan.readFeatures(args.featFile) # Loads the distance between two queries (i.e. 1-similarity) weightMatrix = readWeightMatrix(args.distFile) print len(weightMatrix) samePairsSet = differentPairsSet = None if args.pairLabelFile: samePairsSet , differentPairsSet = loadPairsFromFile(args.pairLabelFile) total_metrics_dict = {} for threshold in np.arange(args.lowerLimit, args.upperLimit, 0.02): sessCount = 0 lastSes = None session = [] metrics = {} qcc = QCCTasks() for session in getSessionWithQuery(args.sessionFile): #calculate the score for i in range(len(session) - 1):
avg_inter_ij[i] = min(avg_inter_ij[i], score) avg_inter_ij[j] = min(avg_inter_ij[j], score) fmin_i = [] for i, mini in avg_inter_ij.items(): #print i, vals.values() fmin_i.append(mini / maxDiam) print 'FMIN ', fmin_i print 'Dunn index ', min(fmin_i) if __name__ == '__main__': argv = sys.argv lbreak = False weightMatrix = readWeightMatrix(argv[2]) featMan = FeatureManager() featMan.loadQueries(argv[3]) for ifile in os.listdir(argv[1]): clusters = loadClustersWithQueryFile(argv[1] + '/' + ifile, featMan.returnIdDict()) print len(clusters), len(featMan.returnIdDict()), len(weightMatrix) #load the cluster-assignments and points Dunn(clusters, weightMatrix) #DB(clusters,weightMatrix) #load the weight matrix #load the centers