def getPrecRecall(opt, catList, f1Dict, catVector, queryTerms, aTerms, index):
  """Score each category in catList against the query and answer terms.

  Returns (notFound, bestCat, scores): with opt == 'max' the single
  best-scoring category, otherwise the scores averaged over all categories.
  """

  catScore = {}
  maxQs = -1000
  maxCat = ''

  notFound = set()
  for cat in catList:
    if cat in f1Dict:
      catScore[cat] = {'aP': 0.0,
                       'aR': 0.0,
                       'qS': 0.0,
                       'qInt': set(),
                       'aInt': set()}
      # phrase score: weight of shared phrases, scaled by query coverage
      phrase1 = loadPhrasesWithScore(f1Dict[cat])
      pTotal = sum(phrase1.values())  # total phrase weight in the category
      pset = set(phrase1.keys())  # unique phrases in the category
      qInt = pset & queryTerms  # query terms the category contains
      score = 0.0
      for iphrase in qInt:
        score += phrase1[iphrase] / pTotal
      if len(queryTerms) > 0:
        score *= (1.0 * len(qInt)) / len(queryTerms)

      #cosine score
      queryDict = getDictFromSet(queryTerms)
      cVector = catVector[cat]
      cscore = get_cosine(queryDict, cVector)

      #total score
      catScore[cat]['qS'] = cscore + score
      if maxQs < catScore[cat]['qS']:
        maxQs = catScore[cat]['qS']
        maxCat = cat

      sortP = sorted(phrase1.items(), reverse=True, key=lambda x: x[1])
      apset = set(x[0] for x in sortP[:index])  # top `index` phrases

      aInt = aTerms & apset
      catScore[cat]['aP'] = len(aInt) / len(aTerms) if aTerms else 0.0
      catScore[cat]['aR'] = len(aInt) / len(apset) if apset else 0.0
      catScore[cat]['aInt'] = aInt
      catScore[cat]['qInt'] = qInt
    else:
      notFound.add(cat)

  if opt == 'max':
    if maxCat in catScore:
      return notFound, maxCat, catScore[maxCat]
    else:
      return notFound, None, {
          'aP': 0.0,
          'aR': 0.0,
          'qS': 0.0,
          'qInt': set(),
          'aInt': set()
      }

  else:
    avgScore = {'aP': 0.0, 'aR': 0.0, 'qS': 0.0, 'qInt': set(), 'aInt': set()}
    for entry, cdict in catScore.items():
      avgScore['aP'] += cdict['aP']
      avgScore['aR'] += cdict['aR']
      avgScore['qS'] += cdict['qS']
      avgScore['qInt'] |= cdict['qInt']
      avgScore['aInt'] |= cdict['aInt']

    if catScore:
      avgScore['aP'] /= len(catScore)
      avgScore['aR'] /= len(catScore)
      avgScore['qS'] /= len(catScore)

    return notFound, None, avgScore

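# ---------------------------------------------------------------------------
# Helpers such as loadPhrasesWithScore, getDictFromSet, get_cosine,
# loadCategoryVector, getCats, getSessionWithNL and tagQueryWithDexter are
# not defined in this file. The sketches below are minimal, hypothetical
# versions of three of them, inferred only from how they are called above;
# if the project provides real implementations, use those instead (the
# phrase-file format in particular is an assumption).

import math


def getDictFromSet(terms):
  # Sketch: represent a set of terms as a term -> frequency dict (all 1s).
  return {term: 1 for term in terms}


def get_cosine(vec1, vec2):
  # Sketch: cosine similarity between two sparse term -> weight dicts.
  num = sum(vec1[t] * vec2[t] for t in set(vec1) & set(vec2))
  den = math.sqrt(sum(v * v for v in vec1.values())) * \
        math.sqrt(sum(v * v for v in vec2.values()))
  return num / den if den else 0.0


def loadPhrasesWithScore(path):
  # Sketch: assumes one "<phrase>\t<score>" pair per line of the file.
  phrases = {}
  with open(path) as f:
    for line in f:
      parts = line.rstrip('\n').split('\t')
      if len(parts) == 2:
        phrases[parts[0]] = float(parts[1])
  return phrases
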
def getStatsPerQuery(argv):
  tagURL = 'http://localhost:8080/rest/annotate'
  catURL = 'http://localhost:8080/rest/graph/get-entity-categories'

  catVector = loadCategoryVector(argv[3])  # category -> term-weight vector
  f1Dict = getCats(argv[2])  # category -> phrase file under argv[2]
  sFound = 0.0
  sTotal = 0.0
  eTotal = set()
  eRemov = set()
  catFoundNoTerm = set()
  catNotFound = set()
  catTermFound = set()
  catEntity = set()
  outfile = open('match_session_dom.txt', 'w')
  for session in getSessionWithNL(argv[1]):
    catCount = {}
    entCount = {}
    querySpotList = {}
    for query in session:
      # find the entities in the query
      try:
        spotDict = tagQueryWithDexter(query, tagURL, catURL)
        querySpotList[query] = spotDict
        for text in spotDict.keys():
          for entry in spotDict[text]['cat'].split():
            catCount[entry] = catCount.get(entry, 0) + 1
          entCount[text] = entCount.get(text, 0) + 1
      except Exception as err:
        print(err)

    found = False
    if len(catCount) > 0:
      #find the dominant entity
      maxEnt = max(entCount.values())
      for query, spotList in querySpotList.items():
        matchl = list(spotList.keys())  # copy: entries may be popped below
        for entry in matchl:
          eTotal.add(entry)
          if entCount[entry] < maxEnt:
            spotList.pop(entry, None)
            print('Removing spot', query, entry)
            eRemov.add(entry)
          else:
            #get the categories
            rquery = query.replace(entry, '')
            queryTerms = set(rquery.split())
            for cat in spotList[entry]['cat'].lower().split():
              catEntity.add(entry + '_' + cat)
              if cat in f1Dict:
                phrase1 = loadPhrasesWithScore(argv[2] + '/' + f1Dict[cat])
                pVector = catVector[cat]
                queryDict = getDictFromSet(queryTerms)
                pTotal = sum(phrase1.values())
                pset = set(phrase1.keys())
                sint = pset & queryTerms
                score = 0.0
                cscore = get_cosine(queryDict, pVector)

                for iphrase in sint:
                  score += phrase1[iphrase] / pTotal
                if len(queryTerms) > 0:
                  score *= (1.0 * len(sint)) / len(queryTerms)

                if sint:
                  outfile.write(query + '\t' + entry + '\t' + cat + '\t' +
                                str(cscore) + '\t' + ', '.join(sint) + '\n')
                  found = True
                  catTermFound.add(entry + '_' + cat)
                else:
                  outfile.write(query + '\t' + entry + '\t' + cat + '\t0\t0\n')
                  catFoundNoTerm.add(cat + '_' + entry)
              else:
                outfile.write(
                    query + '\t' + entry + '\t' + cat + '\t0\tNOT FOUND\n')
                catNotFound.add(cat + '_' + entry)

    if found:
      sFound += 1
    sTotal += 1
    outfile.write('\n')

  outfile.close()

  print('Total Sessions ', sTotal)
  print('Sessions with dominant entity in AOL', sFound)
  print('# Unique Entities', len(eTotal))
  print('# Removed Entities (non-dominant)', len(eRemov))
  print('# entity types', len(catEntity))
  print('# entity types with term match', len(catTermFound))
  print('# entity types with no term match', len(catFoundNoTerm))
  print('# entity types with no match in AOL', len(catNotFound))
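
# Minimal usage sketch: getStatsPerQuery reads its inputs from argv, so this
# file is presumably run as a script with three positional arguments:
# <session file> <category phrase directory> <category vector file>.
if __name__ == '__main__':
  import sys
  getStatsPerQuery(sys.argv)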