def main(argv):
    """Load features and a weight matrix, then call returnFilteredNetwork.

    Args:
        argv: Argument list. ``argv[2]`` is read by
            ``FeatureManager.readFeatures`` and is also forwarded as the
            first argument of ``returnFilteredNetwork``; ``argv[1]`` is
            forwarded as its second argument; ``argv[3]`` is handed to
            ``readWeightMatrix``.

    The value returned by ``returnFilteredNetwork`` is discarded.
    """
    features = FeatureManager()
    feature_path = argv[2]
    features.readFeatures(feature_path)

    weights = readWeightMatrix(argv[3])

    returnFilteredNetwork(feature_path, argv[1], features, weights)
def generatePhraseFeatures(featureFile, spotFile, outFile):
    """Build one feature per phrase from its parent query and write them out.

    Query features are loaded from ``featureFile``. For every
    ``(query, phrases)`` pair yielded by ``generatePhrases(spotFile)`` whose
    query resolves to a truthy key, each phrase gets a new ``QueryFeature``
    made of:

    * the phrase text,
    * ``getNGramsAsList(phrase, 2)``,
    * ``getDictFromSet(phrase.split())``,
    * the url / user / session / entity / category / type values returned
      by the parent query's feature.

    Phrase features are registered under consecutive integer ids starting
    at 0 and finally written with ``writeFeatures(outFile)``.

    Args:
        featureFile: File read by ``FeatureManager.readFeatures``.
        spotFile: File passed to ``generatePhrases``.
        outFile: Destination passed to ``FeatureManager.writeFeatures``.
    """
    # Features of the original queries.
    queryManager = FeatureManager()
    queryManager.readFeatures(featureFile)

    # Features generated for the phrases.
    phraseManager = FeatureManager()
    nextId = 0

    for query, phrases in generatePhrases(spotFile):
        key, parent = queryManager.returnFeature(query)
        if not key:
            # No usable feature for this query: skip all of its phrases.
            continue

        for phrase in phrases:
            tokenDict = getDictFromSet(phrase.split())
            ngramList = getNGramsAsList(phrase, 2)
            urls = parent.returnUrl()
            users = parent.returnUsers()
            entities = parent.returnEntities()
            categories = parent.returnCategories()
            types = parent.returnType()
            sessions = parent.returnSessions()

            # Debug trace for phrases mentioning 'tournament'.
            if 'tournament' in phrase:
                print('%s %s' % (query, phrase))
                print(sessions)
                print(types)
                print(entities)

            phraseFeature = QueryFeature(phrase, ngramList, tokenDict, urls,
                                         users, sessions, entities,
                                         categories, types)
            phraseManager.addFeature(phrase, nextId, phraseFeature)
            nextId += 1

    phraseManager.writeFeatures(outFile)
def findPairwiseDistance(featureFile, outFile):
    """Write a combined edge score for every pair of loaded features.

    Features are read from ``featureFile``. Their keys (taken from
    ``featMan.featureDict``) are sorted, and for each unordered pair the
    eight values returned by ``findCosineDistance`` plus the value returned
    by ``findJacardDistance`` are combined into one weighted score. Pairs
    with a score greater than 0 are written to ``outFile`` as::

        <query 1> TAB <query 2> TAB <score rounded to 3 decimals>

    The number of keys and the last five keys are printed once for
    inspection.

    Args:
        featureFile: File read by ``FeatureManager.readFeatures``.
        outFile: Path of the text file to (over)write.
    """
    featMan = FeatureManager()
    featMan.readFeatures(featureFile)
    featDict = featMan.featureDict

    # The context manager guarantees the file is flushed and closed even if
    # a feature lookup or a distance computation raises half-way through.
    with open(outFile, 'w') as oFile:
        keys = sorted(featDict.keys())
        print('%s %s' % (len(keys), keys[-5:]))

        for i in range(len(keys) - 1):
            qid1, qf1 = featMan.returnFeature(keys[i])
            for j in range(i + 1, len(keys)):
                qid2, qf2 = featMan.returnFeature(keys[j])

                (qcos, ucos, userCos, sessionCos, ngramCos, entCos,
                 catCos, typeCos) = qf1.findCosineDistance(qf2)
                qjac = qf1.findJacardDistance(qf2)

                # Weights are percentages summing to 100; the final division
                # brings the combined score back to that scale.
                edgeScore = (15 * ((qcos + qjac) / 2.0) +
                             12.5 * ngramCos + 12.5 * ucos +
                             20 * sessionCos + 20 * userCos +
                             10 * ((entCos + catCos) / 2.0) +
                             10 * typeCos) / 100.0

                if edgeScore > 0.0:
                    oFile.write(featMan.returnQuery(qid1) + '\t' +
                                featMan.returnQuery(qid2) + '\t' +
                                str(round(edgeScore, 3)) + '\n')
def printCategoryQueryDictionary(fileName, clusFile, weightFile):
    """Group queries by category and write the groups plus pairwise weights.

    Features are read from ``fileName``. Every entry of a feature's
    ``returnFeature('cat')`` value becomes a group holding the queries that
    carry it.

    Output:

    * ``clusFile``: one line per group, ``toString(group, featMan)``.
    * ``weightFile``: one line per group (``str`` of the group set), an
      empty line, then one ``<entry1> <entry2> <score>`` line per stored
      pair weight.

    Within each group every pair of members (in sorted order) with truthy
    features is scored once; the score is stored only when it exceeds 0.05.
    A running pair counter is printed every 10000 pairs.

    Args:
        fileName: File read by ``FeatureManager.readFeatures``.
        clusFile: Path of the group listing to (over)write.
        weightFile: Path of the weight listing to (over)write.
    """
    featMan = FeatureManager()
    featMan.readFeatures(fileName)

    categoryDictionary = {}
    for query, feat in featMan.iterFeatures():
        for entry in feat.returnFeature('cat'):
            categoryDictionary.setdefault(entry, set()).add(query)

    # ``with`` closes both handles even when scoring raises; the original
    # manual close() calls were skipped on any exception.
    with open(weightFile, 'w') as outW:
        with open(clusFile, 'w') as outC:
            for entry, qlist in categoryDictionary.items():
                outC.write(toString(qlist, featMan) + '\n')
                outW.write(str(qlist) + '\n')

        weightMatrix = {}
        cc = 0
        # Calculate the weight matrix.
        for entry, qlist in categoryDictionary.items():
            members = sorted(qlist)
            for i in range(len(members) - 1):
                first = members[i]
                qid1, qf1 = featMan.returnFeature(first)
                if not qf1:
                    continue
                if first not in weightMatrix:
                    weightMatrix[first] = {}
                row = weightMatrix[first]

                for j in range(i + 1, len(members)):
                    second = members[j]
                    qid2, qf2 = featMan.returnFeature(second)
                    # A pair already stored from an earlier group is not
                    # scored again.
                    if qf2 and second not in row:
                        # NOTE(review): findPairwiseDistance in this file
                        # unpacks eight values from findCosineDistance while
                        # six are unpacked here -- confirm which arity
                        # matches the current QueryFeature implementation.
                        (qcos, ucos, userCos, ngramCos, entCos,
                         catCos) = qf1.findCosineDistance(qf2)
                        qjac = qf1.findJacardDistance(qf2)
                        edgeScore = (.25 * ((qcos + qjac) / 2.0) +
                                     .15 * ngramCos + .15 * ucos +
                                     .15 * userCos + .15 * entCos +
                                     .15 * catCos)
                        if edgeScore > 0.05:
                            row[second] = edgeScore

                    # NOTE(review): the source layout was flattened; the
                    # progress counter is assumed to tick once per (i, j)
                    # pair -- confirm against the original indentation.
                    if cc % 10000 == 0:
                        print(cc)
                    cc += 1

        outW.write('\n')
        for entry1, scoreList in weightMatrix.items():
            for entry2, score in scoreList.items():
                outW.write(str(entry1) + ' ' + str(entry2) + ' ' +
                           str(score) + '\n')
def main(argv):
    """Run DPMeans network clustering over a sweep of threshold values.

    Args:
        argv: Argument list. ``argv[1]`` is the feature file, ``argv[2]`` is
            handed to ``readWeightMatrix``, ``argv[3]`` and ``argv[4]`` are
            forwarded to ``returnFilteredNetwork``, and ``argv[5]`` is the
            starting value of the sweep (converted with ``float``).

    ``clusterWithNetwork`` is invoked once per sweep value, starting at
    ``float(argv[5])`` and increasing by 0.10 while the value stays below
    0.85; its last argument is the constant 0.01.
    """
    featMan = FeatureManager()
    featMan.readFeatures(argv[1])

    toCluster = featMan.returnKeys()
    print('To cluster %s' % (toCluster,))

    pairWeights = readWeightMatrix(argv[2])
    network, queryDist = returnFilteredNetwork(argv[3], argv[4], featMan)

    clusterer = DPMeans()
    threshold = float(argv[5])
    while threshold < 0.85:
        clusterer.clusterWithNetwork(featMan, pairWeights, network,
                                     queryDist, threshold, 0.01)
        threshold = threshold + 0.10
'cluster', required=True,type=float) parser.add_argument('-u', '--upperLimit', help='upper limit on #terms in'+\ ' cluster', required=True,type=float) parser.add_argument('-s', '--sessionFile', help='Session file containing'+\ ' queries', required=True) parser.add_argument('-p', '--pairLabelFile', help='Task labels for a'+\ ' pair of queries, same_task and different_task',\ required=False) args = parser.parse_args() qcc = QCCTasks() featMan = FeatureManager() #stemmer = stem.porter.PorterStemmer() featMan.readFeatures(args.featFile) # Loads the distance between two queries (i.e. 1-similarity) weightMatrix = readWeightMatrix(args.distFile) print len(weightMatrix) samePairsSet = differentPairsSet = None if args.pairLabelFile: samePairsSet , differentPairsSet = loadPairsFromFile(args.pairLabelFile) total_metrics_dict = {} for threshold in np.arange(args.lowerLimit, args.upperLimit, 0.02): sessCount = 0 lastSes = None session = [] metrics = {} qcc = QCCTasks() for session in getSessionWithQuery(args.sessionFile):