def main(argv):
    """Driver: load query features and a weight matrix, then build the
    filtered network.

    argv[1] -- path forwarded as the second argument of
               returnFilteredNetwork (purpose not visible here)
    argv[2] -- feature file (loaded, and also forwarded)
    argv[3] -- weight-matrix file
    """
    manager = FeatureManager()
    manager.readFeatures(argv[2])
    weights = readWeightMatrix(argv[3])
    returnFilteredNetwork(argv[2], argv[1], manager, weights)
def generatePhraseFeatures(featureFile, spotFile, outFile):
    """Build a QueryFeature for every phrase spotted in each query and
    write the resulting feature set to outFile.

    featureFile -- existing per-query features, loaded into qfeatMan
    spotFile    -- input for generatePhrases(), yielding (query, phraseList)
    outFile     -- destination for the generated phrase features

    Each phrase inherits the url/user/session/entity/category/type
    features of its parent query and adds its own term vector + bigrams.
    """
    #load features for queries
    qfeatMan = FeatureManager()
    qfeatMan.readFeatures(featureFile)
    pid = 0  # running id assigned to each generated phrase feature
    pfeatMan = FeatureManager()
    #generate features for phrases
    for query, pList in generatePhrases(spotFile):
        qkey, qfeat = qfeatMan.returnFeature(query)
        #print query, qkey
        # Only phrases whose parent query has a known feature are kept.
        if qkey:
            #print query, pList
            for phrase in pList:
                qVect = getDictFromSet(phrase.split())
                ngrams = getNGramsAsList(phrase, 2)  # character/term bigrams of the phrase
                # Phrase-level features are copied from the parent query.
                url = qfeat.returnUrl()
                user = qfeat.returnUsers()
                ent = qfeat.returnEntities()
                cat = qfeat.returnCategories()
                typ = qfeat.returnType()
                sess = qfeat.returnSessions()
                # NOTE(review): ad-hoc debug output left in for phrases
                # containing 'tournament' — presumably a leftover trace.
                if 'tournament' in phrase:
                    print query, phrase
                    print sess
                    print typ
                    print ent
                nFeature = QueryFeature(phrase, ngrams, qVect, url, user,
                                        sess, ent, cat, typ)
                pfeatMan.addFeature(phrase, pid, nFeature)
                pid += 1
    pfeatMan.writeFeatures(outFile)
def main(argv):
    """Cluster queries with DP-means over a sweep of thresholds.

    argv[1] -- query feature file
    argv[2] -- pairwise weight file
    argv[3], argv[4] -- inputs to returnFilteredNetwork (category network
                        and its query distribution)
    argv[5] -- starting clustering threshold (float)
    """
    featMan = FeatureManager()
    featMan.readFeatures(argv[1])
    data = featMan.returnKeys()  #random.sample(list(featMan.returnKeys()),40)
    print 'To cluster', data
    #weightMatrix = readWeightMatrix(argv[2])
    weightList = readWeightMatrix(argv[2])
    catNetwork, catQueryDist = returnFilteredNetwork(argv[3], argv[4], featMan)
    dp = DPMeans()
    x = float(argv[5])
    # Re-cluster at each threshold from argv[5] up to (not including)
    # 0.85, stepping by 0.10; 0.01 is presumably a convergence epsilon —
    # TODO confirm against DPMeans.clusterWithNetwork.
    while x < 0.85:
        dp.clusterWithNetwork(featMan, weightList, catNetwork,
                              catQueryDist, x, 0.01)
        x += 0.10
def sampleQueryPairs(fileName, weightFile, featFile): weightMatrix = readWeightMatrix(weightFile) featMan = FeatureManager() featMan.loadQueries(featFile) idDict = featMan.returnIdDict() qDict = featMan.returnQueryDict() clusters = loadClustersWithQueryFile(fileName, idDict) done = {} for entry in clusters: minPair = None maxPair = None minDist = 1000 maxDist = 0 #print entry if len(entry) > 3: sentry = sorted(entry) for i in range(len(sentry) - 1): for j in range(i + 1, len(sentry)): try: if weightMatrix[sentry[i]][sentry[j]] < minDist: minDist = weightMatrix[sentry[i]][sentry[j]] minPair = (qDict[sentry[i]], qDict[sentry[j]]) if weightMatrix[sentry[i]][sentry[j]] > maxDist: maxDist = weightMatrix[sentry[i]][sentry[j]] maxPair = (qDict[sentry[i]], qDict[sentry[j]]) except: dist = random.uniform(0.8, 1.0) if dist < minDist: minDist = dist minPair = (qDict[sentry[i]], qDict[sentry[j]]) if dist > maxDist: maxDist = dist maxPair = (qDict[sentry[i]], qDict[sentry[j]]) if minPair and minPair[0] not in done and minPair[1] not in done: print 'Min\t' + minPair[0] + '\t' + minPair[1] if maxPair and maxPair[0] not in done and maxPair[1] not in done: print 'Max\t' + maxPair[0] + '\t' + maxPair[1] if minPair: done[minPair[0]] = 1 done[minPair[1]] = 1 if maxPair: done[maxPair[0]] = 1 done[maxPair[1]] = 1
def findPairwiseDistance(featureFile, outFile): featMan = FeatureManager() featMan.readFeatures(featureFile) featDict = featMan.featureDict oFile = open(outFile, 'w') ids = featDict.keys() keys = sorted(ids) print len(keys), keys[-5:] for i in range(0, len(keys) - 1): qid1, qf1 = featMan.returnFeature(keys[i]) for j in range(i + 1, len(keys)): qid2, qf2 = featMan.returnFeature(keys[j]) qcos, ucos, userCos, sessionCos, ngramCos, entCos, \ catCos,typeCos = qf1.findCosineDistance(qf2) qjac = qf1.findJacardDistance(qf2) #qedit = qf1.findEditDistance(qf2) edgeScore = (15*((qcos + qjac )/2.0) +\ 12.5*ngramCos + 12.5*ucos + 20*sessionCos +\ 20*userCos + 10*((entCos + catCos)/2.0) + 10*typeCos)/100.0 if edgeScore > 0.0: oFile.write( #str(qid1) + ' ' + str(qid2) + ' ' + str(round(edgeScore, 3)) + '\n') featMan.returnQuery(qid1) + '\t' + featMan.returnQuery(qid2) + '\t' + str(round(edgeScore, 3)) + '\n') #oFile1.write(str(qid1)+'\t'+str(qid2)+'\t'+\ #str(round(qcos,2))+'\t'+str(round(qjac,2))+'\t'+\ #str(round(ngramCos,2))+'\t'+str(round(userCos,2))+'\t' + \ #str(round(entCos,2))+'\t'+ str(round(catCos,2))+\ #'\t'+ str(round(sessionCos,2))+'\t'+ str(round(typeCos,2))+'\n') oFile.close()
def printCategoryQueryDictionary(fileName, clusFile, weightFile):
    """Group queries by category, write the groups to clusFile, and write
    a within-category pairwise weight matrix to weightFile.

    fileName   -- query feature file
    clusFile   -- output: one category's query list per line (via toString)
    weightFile -- output: raw group lines, a blank separator line, then
                  'qid1 qid2 score' lines of the weight matrix
    """
    featMan = FeatureManager()
    featMan.readFeatures(fileName)
    categoryDictionary = {}
    # Invert the query->categories mapping into category->set(queries).
    for query, feat in featMan.iterFeatures():
        catDict = feat.returnFeature('cat')
        for entry in catDict:
            if entry not in categoryDictionary:
                categoryDictionary[entry] = set()
            categoryDictionary[entry].add(query)
    outC = open(clusFile,'w')
    outW = open(weightFile,'w')
    for entry, qlist in categoryDictionary.items():
        outC.write(toString(qlist,featMan)+'\n')
        outW.write(str(qlist)+'\n')
    outC.close()
    weightMatrix = {}
    cc = 0  # progress counter over scored pairs
    #calculate the weight matrix
    for entry, qlist in categoryDictionary.items():
        sort = sorted(qlist)
        for i in range(len(sort)-1):
            qid1, qf1 = featMan.returnFeature(sort[i])
            if qf1:
                if sort[i] not in weightMatrix:
                    weightMatrix[sort[i]] = {}
                for j in range(i+1,len(sort)):
                    qid2, qf2 = featMan.returnFeature(sort[j])
                    if qf2:
                        # A pair may appear under several categories; the
                        # membership test ensures it is scored only once.
                        if sort[j] not in weightMatrix[sort[i]]:
                            qcos, ucos, userCos, ngramCos, entCos, catCos = \
                                qf1.findCosineDistance(qf2)
                            qjac = qf1.findJacardDistance(qf2)
                            #qedit = qf1.findEditDistance(qf2)
                            #normalized distance
                            #dist = (j - i)#*1.0/len(session)
                            # NOTE(review): this 6-way unpack differs from
                            # the 8-way unpack used elsewhere — presumably
                            # a different feature class; confirm.
                            edgeScore = (.25*((qcos + qjac )/2.0) +\
                                .15*ngramCos + .15*ucos + \
                                .15*userCos + .15*entCos + .15*catCos)
                            # Sparsify: keep only edges above 0.05.
                            if edgeScore > 0.05:
                                weightMatrix[sort[i]][sort[j]] = edgeScore
                        # Progress trace every 10000 pairs; placement
                        # inside the inner loop inferred from the
                        # collapsed source — TODO confirm.
                        if cc % 10000==0:
                            print cc
                        cc+=1
    # Blank line separates the group dump from the matrix entries.
    outW.write('\n')
    for entry1, scoreList in weightMatrix.items():
        for entry2, score in scoreList.items():
            outW.write(str(entry1)+' '+str(entry2)+' '+str(score)+'\n');
    outW.close();
parser.add_argument('-a', '--algo', help='qcc', \ required=True) parser.add_argument('-l', '--lowerLimit', help='min limit on #terms in '+\ 'cluster', required=True,type=float) parser.add_argument('-u', '--upperLimit', help='upper limit on #terms in'+\ ' cluster', required=True,type=float) parser.add_argument('-s', '--sessionFile', help='Session file containing'+\ ' queries', required=True) parser.add_argument('-p', '--pairLabelFile', help='Task labels for a'+\ ' pair of queries, same_task and different_task',\ required=False) args = parser.parse_args() qcc = QCCTasks() featMan = FeatureManager() #stemmer = stem.porter.PorterStemmer() featMan.readFeatures(args.featFile) # Loads the distance between two queries (i.e. 1-similarity) weightMatrix = readWeightMatrix(args.distFile) print len(weightMatrix) samePairsSet = differentPairsSet = None if args.pairLabelFile: samePairsSet , differentPairsSet = loadPairsFromFile(args.pairLabelFile) total_metrics_dict = {} for threshold in np.arange(args.lowerLimit, args.upperLimit, 0.02): sessCount = 0 lastSes = None session = []
avg_inter_ij[i] = min(avg_inter_ij[i], score) avg_inter_ij[j] = min(avg_inter_ij[j], score) fmin_i = [] for i, mini in avg_inter_ij.items(): #print i, vals.values() fmin_i.append(mini / maxDiam) print 'FMIN ', fmin_i print 'Dunn index ', min(fmin_i) if __name__ == '__main__': argv = sys.argv lbreak = False weightMatrix = readWeightMatrix(argv[2]) featMan = FeatureManager() featMan.loadQueries(argv[3]) for ifile in os.listdir(argv[1]): clusters = loadClustersWithQueryFile(argv[1] + '/' + ifile, featMan.returnIdDict()) print len(clusters), len(featMan.returnIdDict()), len(weightMatrix) #load the cluster-assignments and points Dunn(clusters, weightMatrix) #DB(clusters,weightMatrix) #load the weight matrix #load the centers
parser.add_argument('-u', '--upperLimit', help='upper limit on #terms in'+\ ' cluster', required=True,type=int) parser.add_argument('-p', '--pairLabelFile', help='Task labels for a'+\ ' pair of queries: "same_task"\ or "different_task".',\ required=False) parser.add_argument('-t', '--taskLabelFile', help='Task labels for every'+\ 'query: Task id and query.',\ required=False) parser.add_argument('-s', '--ontFile', help='DBpedia ontology file', required=False) #argv = sys.argv args = parser.parse_args() featMan = FeatureManager() featMan.readFeatures(args.featFile) # weightMatrix = readWeightMatrix(args.distFile) weightMatrix = readWeightMatrixWithQueries(args.distFile,featMan) ##stemmer = stem.porter.PorterStemmer() #print len(catQueryDist) # ##PRE-MERGE WRITE #oFile = open(argv[4],'w') #for cat, entry in catQueryDist.items(): # qStr = toString(entry,featMan) # oFile.write(cat +'\t'+qStr+'\n') #oFile.close() ##CLUSTER PRE-MERGE