Esempio n. 1
0
	def __init__(self, catVect, catDir, cclass):
		self.catFileDict = getCats(catDir)
		self.catVectors = catVect;
		self.catObjDict = {}
		self.catClass = cclass;
Esempio n. 2
0
def getStatsPerQuery(argv):
  tagURL = 'http://localhost:8080/rest/annotate'
  catURL = 'http://localhost:8080/rest/graph/get-entity-categories'

  catVector = loadCategoryVector(argv[3])
  f1Dict = getCats(argv[2])
  sFound = 0.0
  sTotal = 0.0
  eTotal = set()
  eRemov = set()
  catFoundNoTerm = set()
  catNotFound = set()
  catTermFound = set()
  catEntity = set()
  outfile = open('match_session_dom.txt', 'w')
  #categoryVectors = {}
  for session in getSessionWithNL(argv[1]):
    catCount = {}
    entCount = {}
    querySpotList = {}
    for query in session:
      #find the entities in query
      try:
        spotDict = None  #tagQueryWithDexter(query, tagURL,catURL)
        querySpotList[query] = spotDict
        for text in spotDict.keys():
          for entry in spotDict[text]['cat'].split():
            catCount[entry] = catCount.setdefault(entry, 1) + 1
          entCount[text] = entCount.setdefault(text, 1) + 1
      except Exception as err:
        print err
        #print 'SESSION', session, 'CATCOUNT', catCount, 'ENTCOUNT',entCount

    found = False
    if len(catCount) > 0:
      #find the dominant entity
      maxEnt = max(entCount.values())
      #sessionQueryMapping = {}
      for query, spotList in querySpotList.iteritems():
        matchl = spotList.keys()
        for entry in matchl:
          eTotal.add(entry)
          if entCount[entry] < maxEnt:
            spotList.pop(entry, None)
            print 'Removing spot', query, entry
            eRemov.add(entry)
          else:
            #get the categories
            #catTermMatch = {}
            rquery = query.replace(entry, '')
            queryTerms = set(rquery.split())
            for cat in spotList[entry]['cat'].lower().split():
              catEntity.add(entry + '_' + cat)
              if cat in f1Dict:
                phrase1 = loadPhrasesWithScore(argv[2] + '/' + f1Dict[cat])
                pVector = catVector[cat]
                queryDict = getDictFromSet(queryTerms)
                pTotal = sum(phrase1.values())
                pset = set(phrase1.keys())
                sint = pset & queryTerms
                score = 0.0
                cscore = get_cosine(queryDict, pVector)

                for iphrase in sint:
                  score += phrase1[iphrase] / pTotal
                if len(queryTerms) > 0:
                  score *= (1.0 * len(sint)) / len(queryTerms)

                if sint:

                  outfile.write(query + '\t' + entry + '\t' + cat + '\t' +
                                str(cscore) + '\t' + ', '.join(sint) + '\n')
                  found = True
                  catTermFound.add(entry + '_' + cat)
                else:
                  outfile.write(query + '\t' + entry + '\t' + cat + '\t0\t0\n')
                  catFoundNoTerm.add(cat + '_' + entry)
              else:
                outfile.write(
                    query + '\t' + entry + '\t' + cat + '\t0\tNOT FOUND\n')
                catNotFound.add(cat + '_' + entry)

                #load the terms for category
                #check if these terms match
    if found:
      sFound += 1
    sTotal += 1
    outfile.write('\n')

  print 'Total Sessions ', sTotal
  print 'Sessions with dominant entity in AOL', sFound
  print '# Unique Entities', len(eTotal)
  print '# Removed Entities (non dominant)', len(eRemov)
  print '# no of entity types', len(catEntity)
  print '# no of entity types with terms match ', len(catTermFound)
  print '# no of entity types with no term match', len(catFoundNoTerm)
  print '# no of entity types with no match in AOL', len(catNotFound)