コード例 #1
0
def main(argv):
  """Load features and a weight matrix, then build the filtered network.

  Args:
    argv: command-line style argument list. argv[2] is read as the feature
      file, argv[3] as the weight-matrix file, and argv[1] is forwarded to
      returnFilteredNetwork together with argv[2].

  The value returned by returnFilteredNetwork is discarded.
  """
  manager = FeatureManager()
  featurePath = argv[2]
  manager.readFeatures(featurePath)

  weights = readWeightMatrix(argv[3])
  returnFilteredNetwork(featurePath, argv[1], manager, weights)
コード例 #2
0
ファイル: __init__.py プロジェクト: vmanisha/QueryExpansion
def generatePhraseFeatures(featureFile, spotFile, outFile):
  #load features for queries
  qfeatMan = FeatureManager()
  qfeatMan.readFeatures(featureFile)

  pid = 0
  pfeatMan = FeatureManager()

  #generate features for phrases
  for query, pList in generatePhrases(spotFile):
    qkey, qfeat = qfeatMan.returnFeature(query)
    #print query, qkey
    if qkey:
      #print query, pList
      for phrase in pList:
        qVect = getDictFromSet(phrase.split())
        ngrams = getNGramsAsList(phrase, 2)
        url = qfeat.returnUrl()
        user = qfeat.returnUsers()
        ent = qfeat.returnEntities()
        cat = qfeat.returnCategories()
        typ = qfeat.returnType()
        sess = qfeat.returnSessions()
        if 'tournament' in phrase:
          print query, phrase
          print sess
          print typ
          print ent
        nFeature = QueryFeature(phrase, ngrams, qVect, url, user, sess, ent,
                                cat, typ)
        pfeatMan.addFeature(phrase, pid, nFeature)
        pid += 1

  pfeatMan.writeFeatures(outFile)
コード例 #3
0
ファイル: __init__.py プロジェクト: vmanisha/QueryExpansion
def findPairwiseDistance(featureFile, outFile):
  featMan = FeatureManager()

  featMan.readFeatures(featureFile)
  featDict = featMan.featureDict

  oFile = open(outFile, 'w')

  ids = featDict.keys()
  keys = sorted(ids)
  print len(keys), keys[-5:]
  for i in range(0, len(keys) - 1):
    qid1, qf1 = featMan.returnFeature(keys[i])
    for j in range(i + 1, len(keys)):
      qid2, qf2 = featMan.returnFeature(keys[j])
      qcos, ucos, userCos, sessionCos, ngramCos, entCos, \
			catCos,typeCos = qf1.findCosineDistance(qf2)
      qjac = qf1.findJacardDistance(qf2)
      #qedit = qf1.findEditDistance(qf2)
      edgeScore = (15*((qcos + qjac )/2.0) +\
			12.5*ngramCos + 12.5*ucos + 20*sessionCos +\
			20*userCos + 10*((entCos + catCos)/2.0) + 10*typeCos)/100.0
      if edgeScore > 0.0:
        oFile.write(
            #str(qid1) + ' ' + str(qid2) + ' ' + str(round(edgeScore, 3)) + '\n')
            featMan.returnQuery(qid1) + '\t' + featMan.returnQuery(qid2) + '\t' + str(round(edgeScore, 3)) + '\n')
        
        #oFile1.write(str(qid1)+'\t'+str(qid2)+'\t'+\
        #str(round(qcos,2))+'\t'+str(round(qjac,2))+'\t'+\
        #str(round(ngramCos,2))+'\t'+str(round(userCos,2))+'\t' + \
        #str(round(entCos,2))+'\t'+ str(round(catCos,2))+\
        #'\t'+ str(round(sessionCos,2))+'\t'+ str(round(typeCos,2))+'\n')
  oFile.close()
コード例 #4
0
def printCategoryQueryDictionary(fileName, clusFile, weightFile):
	
	featMan = FeatureManager()
	
	featMan.readFeatures(fileName)
	categoryDictionary = {}
	for query, feat in featMan.iterFeatures():
		catDict = feat.returnFeature('cat')
		for entry in catDict:
			if entry not in categoryDictionary:
				categoryDictionary[entry] = set()
			categoryDictionary[entry].add(query)
		
	outC = open(clusFile,'w')
	outW = open(weightFile,'w')
	for entry, qlist in categoryDictionary.items():
		outC.write(toString(qlist,featMan)+'\n')
		outW.write(str(qlist)+'\n')
	outC.close()
	
			
	weightMatrix = {}
	cc = 0
	#calculate the weight matrix
	for entry, qlist in categoryDictionary.items():
		sort = sorted(qlist)
		for i in range(len(sort)-1):
			qid1, qf1 = featMan.returnFeature(sort[i])
			if qf1:
				if sort[i]  not in weightMatrix:
					weightMatrix[sort[i]] = {}
				for j in range(i+1,len(sort)):
					qid2, qf2 = featMan.returnFeature(sort[j])
					if qf2:
						if sort[j] not in weightMatrix[sort[i]]:
							qcos, ucos, userCos, ngramCos, entCos, catCos = qf1.findCosineDistance(qf2)
							qjac = qf1.findJacardDistance(qf2)
							#qedit = qf1.findEditDistance(qf2)
							#normalized distance
							#dist = (j - i)#*1.0/len(session)
							
							edgeScore = (.25*((qcos + qjac )/2.0) +\
							.15*ngramCos + .15*ucos + \
							.15*userCos + .15*entCos + .15*catCos)
					
							if edgeScore > 0.05:
								weightMatrix[sort[i]][sort[j]] = edgeScore
		if cc % 10000==0:
			print cc
		cc+=1
		
	outW.write('\n')
	
	
	for entry1, scoreList in weightMatrix.items():
		for entry2, score in scoreList.items():
			outW.write(str(entry1)+' '+str(entry2)+' '+str(score)+'\n');
	outW.close();
コード例 #5
0
ファイル: dpmeans.py プロジェクト: vmanisha/QueryExpansion
def main(argv):
  featMan = FeatureManager()
  featMan.readFeatures(argv[1])

  data = featMan.returnKeys()  #random.sample(list(featMan.returnKeys()),40)
  print 'To cluster', data
  #weightMatrix = readWeightMatrix(argv[2])
  weightList = readWeightMatrix(argv[2])

  catNetwork, catQueryDist = returnFilteredNetwork(argv[3], argv[4], featMan)
  dp = DPMeans()
  x = float(argv[5])
  while x < 0.85:
    dp.clusterWithNetwork(featMan, weightList, catNetwork, catQueryDist, x,
                          0.01)
    x += 0.10
コード例 #6
0
ファイル: qccTasks.py プロジェクト: vmanisha/QueryExpansion
                    'cluster', required=True,type=float)
  parser.add_argument('-u', '--upperLimit', help='upper limit on #terms in'+\
                    ' cluster', required=True,type=float)
  parser.add_argument('-s', '--sessionFile', help='Session file containing'+\
                    ' queries', required=True)
  parser.add_argument('-p', '--pairLabelFile', help='Task labels for a'+\
                    ' pair of queries, same_task and different_task',\
                     required=False)

  args = parser.parse_args()

  qcc = QCCTasks()
  featMan = FeatureManager()

  #stemmer = stem.porter.PorterStemmer()
  featMan.readFeatures(args.featFile)
  # Loads the distance between two queries (i.e. 1-similarity)
  weightMatrix = readWeightMatrix(args.distFile)
  print len(weightMatrix)
  samePairsSet = differentPairsSet = None
  if args.pairLabelFile:
    samePairsSet , differentPairsSet =   loadPairsFromFile(args.pairLabelFile)

  total_metrics_dict = {}
  for threshold in np.arange(args.lowerLimit, args.upperLimit, 0.02):
    sessCount = 0
    lastSes = None
    session = []
    metrics = {}
    qcc = QCCTasks()
    for session in getSessionWithQuery(args.sessionFile):