def main(argv):
  """Load query features and a weight matrix, then filter the network.

  Args:
    argv: indexable argument sequence. argv[2] is handed to
      FeatureManager.readFeatures and is also the first argument of
      returnFilteredNetwork; argv[1] is its second argument; argv[3] is
      handed to readWeightMatrix.

  Whatever returnFilteredNetwork returns is discarded.
  """
  features = FeatureManager()
  featurePath = argv[2]
  features.readFeatures(featurePath)

  weights = readWeightMatrix(argv[3])
  # NOTE(review): the same path (argv[2]) is used both as the feature file
  # and as the first argument of the filter call -- confirm this is intended.
  returnFilteredNetwork(featurePath, argv[1], features, weights)
Esempio n. 2
0
def generatePhraseFeatures(featureFile, spotFile, outFile):
  #load features for queries
  qfeatMan = FeatureManager()
  qfeatMan.readFeatures(featureFile)

  pid = 0
  pfeatMan = FeatureManager()

  #generate features for phrases
  for query, pList in generatePhrases(spotFile):
    qkey, qfeat = qfeatMan.returnFeature(query)
    #print query, qkey
    if qkey:
      #print query, pList
      for phrase in pList:
        qVect = getDictFromSet(phrase.split())
        ngrams = getNGramsAsList(phrase, 2)
        url = qfeat.returnUrl()
        user = qfeat.returnUsers()
        ent = qfeat.returnEntities()
        cat = qfeat.returnCategories()
        typ = qfeat.returnType()
        sess = qfeat.returnSessions()
        if 'tournament' in phrase:
          print query, phrase
          print sess
          print typ
          print ent
        nFeature = QueryFeature(phrase, ngrams, qVect, url, user, sess, ent,
                                cat, typ)
        pfeatMan.addFeature(phrase, pid, nFeature)
        pid += 1

  pfeatMan.writeFeatures(outFile)
Esempio n. 3
0
def main(argv):
  """Run DPMeans network clustering over a sweep of one float parameter.

  Args:
    argv: indexable argument sequence. argv[1] goes to
      FeatureManager.readFeatures, argv[2] to readWeightMatrix, argv[3] and
      argv[4] to returnFilteredNetwork, and argv[5] is parsed as the float
      starting value of the sweep.

  clusterWithNetwork is called repeatedly, with the swept value increased
  by 0.10 after each call, for as long as it stays below 0.85.
  """
  features = FeatureManager()
  features.readFeatures(argv[1])

  # All keys are clustered (the original kept a commented-out alternative
  # that sampled 40 of them at random).
  keys = features.returnKeys()
  print('To cluster %s' % (keys,))
  weights = readWeightMatrix(argv[2])

  network, queryDist = returnFilteredNetwork(argv[3], argv[4], features)
  clusterer = DPMeans()
  current = float(argv[5])
  step = 0.10
  while current < 0.85:
    # NOTE(review): the meaning of the last two arguments (swept value and
    # the constant 0.01) is defined by DPMeans -- confirm there.
    clusterer.clusterWithNetwork(features, weights, network, queryDist,
                                 current, 0.01)
    current += step
Esempio n. 4
0
def sampleQueryPairs(fileName, weightFile, featFile):
  """Print the lowest- and highest-weight query pair of each large cluster.

  Args:
    fileName: cluster file, read by loadClustersWithQueryFile.
    weightFile: weight file, read by readWeightMatrix.
    featFile: query file, read by FeatureManager.loadQueries.

  Only clusters with more than three members are examined. For every
  unordered pair of members (in sorted order) the value
  weightMatrix[a][b] is used. When that entry is missing, a random value
  in [0.8, 1.0] is substituted. The pair with the smallest value is
  printed as 'Min' and the one with the largest as 'Max', each
  tab-separated, unless one of its queries was already part of a pair
  selected for an earlier cluster.

  Only a missing matrix entry (LookupError) triggers the random fallback.
  Any other error, including an id missing from the query dictionary, now
  propagates instead of being silently swallowed.
  """
  from itertools import combinations  # function-local: block stays standalone

  weightMatrix = readWeightMatrix(weightFile)
  featMan = FeatureManager()
  featMan.loadQueries(featFile)
  idDict = featMan.returnIdDict()
  qDict = featMan.returnQueryDict()

  clusters = loadClustersWithQueryFile(fileName, idDict)
  done = set()  # query strings already used in a selected pair
  for entry in clusters:
    if len(entry) <= 3:
      continue

    minPair = None
    maxPair = None
    minDist = 1000
    maxDist = 0
    # combinations() over the sorted members yields exactly the (i, j),
    # i < j pairs in the same order as two nested index loops.
    for first, second in combinations(sorted(entry), 2):
      try:
        dist = weightMatrix[first][second]
      except LookupError:
        # Pair absent from the matrix: substitute a random value.
        dist = random.uniform(0.8, 1.0)

      if dist < minDist:
        minDist = dist
        minPair = (qDict[first], qDict[second])
      if dist > maxDist:
        maxDist = dist
        maxPair = (qDict[first], qDict[second])

    if minPair and minPair[0] not in done and minPair[1] not in done:
      print('Min\t' + minPair[0] + '\t' + minPair[1])

    if maxPair and maxPair[0] not in done and maxPair[1] not in done:
      print('Max\t' + maxPair[0] + '\t' + maxPair[1])

    # Selected pairs are marked as used whether or not they were printed.
    if minPair:
      done.update(minPair)
    if maxPair:
      done.update(maxPair)
Esempio n. 5
0
def findPairwiseDistance(featureFile, outFile):
  """Write a weighted edge score for every pair of loaded features.

  Args:
    featureFile: passed to FeatureManager.readFeatures.
    outFile: path of the text file to (over)write.

  For each unordered pair of keys of featMan.featureDict (in sorted
  order), the eight values returned by findCosineDistance and the value
  of findJacardDistance are combined into one score with fixed weights.
  Pairs whose score is greater than 0 are written as
  '<query1>\t<query2>\t<score rounded to 3 decimals>\n'.

  The output file is managed by a `with` block so it is closed even if a
  feature lookup or distance computation raises.
  """
  featMan = FeatureManager()
  featMan.readFeatures(featureFile)

  with open(outFile, 'w') as oFile:
    keys = sorted(featMan.featureDict)
    print('%d %s' % (len(keys), keys[-5:]))

    for i in range(len(keys) - 1):
      qid1, qf1 = featMan.returnFeature(keys[i])
      for j in range(i + 1, len(keys)):
        qid2, qf2 = featMan.returnFeature(keys[j])
        (qcos, ucos, userCos, sessionCos, ngramCos, entCos, catCos,
         typeCos) = qf1.findCosineDistance(qf2)
        qjac = qf1.findJacardDistance(qf2)

        # The weights add up to 100, hence the final division.
        edgeScore = (15 * ((qcos + qjac) / 2.0) +
                     12.5 * ngramCos + 12.5 * ucos + 20 * sessionCos +
                     20 * userCos + 10 * ((entCos + catCos) / 2.0) +
                     10 * typeCos) / 100.0

        if edgeScore > 0.0:
          oFile.write(featMan.returnQuery(qid1) + '\t' +
                      featMan.returnQuery(qid2) + '\t' +
                      str(round(edgeScore, 3)) + '\n')
def printCategoryQueryDictionary(fileName, clusFile, weightFile):
  """Group queries by category and dump the groups plus pairwise scores.

  Args:
    fileName: passed to FeatureManager.readFeatures.
    clusFile: output path; receives one line per category group, produced
      by toString(group, featMan).
    weightFile: output path; receives str(group) for every group, then an
      empty line, then one 'key1 key2 score' line per stored pair.

  A pair score is computed only for keys that share a category and whose
  features are both truthy. It is stored only when it exceeds 0.05 and
  the pair has not been stored already.

  Both output files are managed by `with` blocks so they are closed even
  if an error occurs part-way through.
  """
  featMan = FeatureManager()
  featMan.readFeatures(fileName)

  # Category -> set of query keys that have that category.
  categoryDictionary = {}
  for query, feat in featMan.iterFeatures():
    for entry in feat.returnFeature('cat'):
      categoryDictionary.setdefault(entry, set()).add(query)

  with open(clusFile, 'w') as outC:
    for qlist in categoryDictionary.values():
      outC.write(toString(qlist, featMan) + '\n')

  with open(weightFile, 'w') as outW:
    for qlist in categoryDictionary.values():
      outW.write(str(qlist) + '\n')

    # Sparse weight matrix: smaller key -> {larger key: score}.
    weightMatrix = {}
    for cc, qlist in enumerate(categoryDictionary.values()):
      sort = sorted(qlist)
      for i in range(len(sort) - 1):
        qid1, qf1 = featMan.returnFeature(sort[i])
        if not qf1:
          continue
        row = weightMatrix.setdefault(sort[i], {})
        for j in range(i + 1, len(sort)):
          qid2, qf2 = featMan.returnFeature(sort[j])
          if not qf2 or sort[j] in row:
            continue
          # NOTE(review): six values are unpacked here, whereas
          # findPairwiseDistance in this file unpacks eight from the same
          # method -- confirm which findCosineDistance signature applies.
          (qcos, ucos, userCos, ngramCos, entCos,
           catCos) = qf1.findCosineDistance(qf2)
          qjac = qf1.findJacardDistance(qf2)

          # The weights add up to 1.0.
          edgeScore = (.25 * ((qcos + qjac) / 2.0) +
                       .15 * ngramCos + .15 * ucos +
                       .15 * userCos + .15 * entCos + .15 * catCos)

          if edgeScore > 0.05:
            row[sort[j]] = edgeScore
      # Progress indicator: one line every 10000 categories.
      if cc % 10000 == 0:
        print(cc)

    outW.write('\n')

    for entry1, scoreList in weightMatrix.items():
      for entry2, score in scoreList.items():
        outW.write(str(entry1) + ' ' + str(entry2) + ' ' + str(score) + '\n')
Esempio n. 7
0
  parser.add_argument('-a', '--algo', help='qcc', \
                    required=True)
  parser.add_argument('-l', '--lowerLimit', help='min limit on #terms in '+\
                    'cluster', required=True,type=float)
  parser.add_argument('-u', '--upperLimit', help='upper limit on #terms in'+\
                    ' cluster', required=True,type=float)
  parser.add_argument('-s', '--sessionFile', help='Session file containing'+\
                    ' queries', required=True)
  parser.add_argument('-p', '--pairLabelFile', help='Task labels for a'+\
                    ' pair of queries, same_task and different_task',\
                     required=False)

  args = parser.parse_args()

  qcc = QCCTasks()
  featMan = FeatureManager()

  #stemmer = stem.porter.PorterStemmer()
  featMan.readFeatures(args.featFile)
  # Loads the distance between two queries (i.e. 1-similarity)
  weightMatrix = readWeightMatrix(args.distFile)
  print len(weightMatrix)
  samePairsSet = differentPairsSet = None
  if args.pairLabelFile:
    samePairsSet , differentPairsSet =   loadPairsFromFile(args.pairLabelFile)

  total_metrics_dict = {}
  for threshold in np.arange(args.lowerLimit, args.upperLimit, 0.02):
    sessCount = 0
    lastSes = None
    session = []
Esempio n. 8
0
              avg_inter_ij[i] = min(avg_inter_ij[i], score)
              avg_inter_ij[j] = min(avg_inter_ij[j], score)

    fmin_i = []
    for i, mini in avg_inter_ij.items():
      #print i, vals.values()
      fmin_i.append(mini / maxDiam)

    print 'FMIN ', fmin_i
    print 'Dunn index ', min(fmin_i)


if __name__ == '__main__':
  # Arguments, as read from the indexing below:
  #   argv[1]  directory whose entries are cluster files
  #   argv[2]  weight-matrix file
  #   argv[3]  query file
  argv = sys.argv
  lbreak = False
  weightMatrix = readWeightMatrix(argv[2])
  featMan = FeatureManager()
  featMan.loadQueries(argv[3])

  # Evaluate every file in the cluster directory with Dunn().
  for ifile in os.listdir(argv[1]):
    clusterPath = argv[1] + '/' + ifile
    clusters = loadClustersWithQueryFile(clusterPath, featMan.returnIdDict())
    # Sizes of: loaded clusters, id dictionary, weight matrix.
    sizes = (len(clusters), len(featMan.returnIdDict()), len(weightMatrix))
    print('%d %d %d' % sizes)

    Dunn(clusters, weightMatrix)
  # A DB(clusters, weightMatrix) call used to follow here; it is disabled.

  #load the weight matrix

  #load the centers
	parser.add_argument('-u', '--upperLimit', help='upper limit on #terms in'+\
						' cluster', required=True,type=int)
	parser.add_argument('-p', '--pairLabelFile', help='Task labels for a'+\
                                                ' pair of queries: "same_task"\
                                                 or "different_task".',\
						 required=False)
	parser.add_argument('-t', '--taskLabelFile', help='Task labels for every'+\
                                                        'query: Task id and query.',\
						 required=False)
	parser.add_argument('-s', '--ontFile', help='DBpedia ontology file', required=False)
	
	#argv = sys.argv
	args = parser.parse_args()
	
	
	featMan = FeatureManager()
	featMan.readFeatures(args.featFile)
	# weightMatrix = readWeightMatrix(args.distFile)
	weightMatrix = readWeightMatrixWithQueries(args.distFile,featMan)
	
	##stemmer =  stem.porter.PorterStemmer()
	#print len(catQueryDist)
	#
	##PRE-MERGE  WRITE
	#oFile = open(argv[4],'w')
	#for cat, entry in catQueryDist.items():
	#	qStr = toString(entry,featMan)
	#	oFile.write(cat +'\t'+qStr+'\n')
	#oFile.close()

	##CLUSTER PRE-MERGE