import math
import re
import time

import numpy

# Project-specific helpers (CoOccurrence, ObjCoOccurrence, CoOcManager, Dexter,
# ProbExpansion, Ranker, getSessionWithNL, addedAndRemovedTerms,
# tagQueryWithDexter, getPrecRecall, SYMB, etc.) are assumed to be defined in
# or imported by the surrounding module.


def main(argv):

  ttCoOc = CoOccurrence()
  teCoOc = ObjCoOccurrence()
  ecCoOc = ObjCoOccurrence()

  termEntMan = CoOcManager(argv[1], teCoOc, '\t')
  entCatMan = CoOcManager(argv[2], ecCoOc, '\t')
  termTermMan = CoOcManager(argv[3], ttCoOc, ' ')

  ipaddress = 'localhost'
  #dexter object
  tagURL = 'http://' + ipaddress + ':8080/rest/annotate'
  catURL = 'http://' + ipaddress + ':8080/rest/graph/get-entity-categories'
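  # tagURL spots and links entities in free text; catURL returns the Wikipedia
  # categories for a linked entity (both served by a local Dexter instance)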
  dexter = Dexter(tagURL, catURL)
  et = ProbExpansion(termTermMan, termEntMan, entCatMan, dexter, Ranker())

  for i, session in getSessionWithNL(argv[4]):
    query = session[0]
    terms = et.expandText(query, 50)
    print i, '\t', query, '\t', terms
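

# Example invocation of main (the argument layout is inferred from the indices
# used above; the script and file names are hypothetical):
#   python expand_queries.py term_entity.tsv entity_category.tsv \
#       term_term.txt sessions.txt
# argv[1..3] feed the three CoOcManager instances; argv[4] is the session file
# read by getSessionWithNL.
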
def getStatsPerSession(catVector, f1Dict, argv):

  tagURL = 'http://localhost:8080/rest/annotate'
  catURL = 'http://localhost:8080/rest/graph/get-entity-categories'

  print 'Cats ', len(f1Dict)
  #stats
  sStat = {'ef': 0, 'total': 0, 'aTerms': 0}
  #eStat = {'total':set(), 'remov':set()}
  catStat = {'nfTerm': set(), 'nf': set(), 'tf': set(), 'total': set()}
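  # Counter semantics (inferred from how they are updated below):
  #   sStat:   'total' = sessions processed, 'aTerms' = sessions with added
  #            terms, 'ef' = sessions where an entity was spotted
  #   catStat: 'total' = every category_entity pair seen, 'nf' = categories
  #            reported as not found, 'nfTerm' = categories found but with no
  #            query-term overlap ('tf' is unused in this function)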
  outfile = open('match_session_' + str(argv[4]) + '.txt', 'w')
  #categoryVectors = {}
  #load the session
  arTotal = 0.0
  apTotal = 0.0
  for session in getSessionWithNL(argv[1]):
    bQuery = session[0].lower()
    bQuery = re.sub(SYMB, ' ', bQuery)
    bQuery = re.sub('\s+', ' ', bQuery).strip()
    aTerms, rTerms = addedAndRemovedTerms(bQuery, session[1:])
    arMax = 0.0
    apMax = 0.0
    try:
      # spot and categorise entities in the query with Dexter; the brief sleep
      # throttles requests to the local service
      spotDict = tagQueryWithDexter(bQuery, tagURL, catURL)
      time.sleep(1)
      if aTerms:
        sStat['aTerms'] += 1.0
        if len(spotDict) > 0:
          sStat['ef'] += 1.0
          print 'Found Entity \t', '\t'.join(session)
        for entry in spotDict.keys():
          rquery = bQuery.replace(entry, '')
          queryTerms = set(rquery.split())
          catList = spotDict[entry]['cat'].lower().split()
          #notFound, maxCat, rDict = getPrecRecall('avg',catList,f1Dict,catVector, queryTerms, argv[2])
          #print 'Avg', notFound, rDict
          notFound, maxCat, rDict = getPrecRecall('max', catList, f1Dict,
                                                  catVector, queryTerms,
                                                  aTerms, int(argv[4]))
          print 'Max', bQuery, 'Ent', entry, 'Cat', maxCat, 'NFC', notFound, rDict
          nf = 0
          for centry in catList:
            catStat['total'].add(centry + '_' + entry)
            if centry in notFound:
              catStat['nf'].add(centry + '_' + entry)
              nf += 1.0
            else:
              if rDict and len(rDict['qInt']) == 0:
                catStat['nfTerm'].add(centry + '_' + entry)
          if nf == len(catList):
            print 'For Query', bQuery, 'With ent list', spotDict.keys(), \
                'for ENT', entry, 'No cat found'

          if rDict:
            #to choose the type with max values
            if arMax < rDict['aR']:
              arMax = rDict['aR']
            if apMax < rDict['aP']:
              apMax = rDict['aP']

            outfile.write(
                bQuery + '\t' + entry + '\t' + str(rDict['qS']) + '\t' +
                ', '.join(rDict['qInt']) + '\t' + ', '.join(rDict['aInt']) +
                '\t' + str(rDict['aR']) + '\t' + str(rDict['aP']) + '\n')
      #else:
      #	outfile.write(bQuery+'\tNOT\tNOT\tNOT\tNO TERMS\n')
    except Exception as err:
      print 'SESSION WITH ERR', session, err, err.args
    if aTerms:
      print 'Prec ', argv[4], bQuery, '\t', apMax
      for query in session[1:]:
        outfile.write(query + '\tNIL\t0.0\tNIL\tNIL\t0.0\t0.0\n')

    sStat['total'] += 1
    outfile.write('\n')
    apTotal += apMax
    arTotal += arMax

  print 'Total Sessions ', sStat['total']
  print 'Sessions with entity in AOL', sStat['ef']
  print '# no of entity types', len(catStat['total'])
  #print '# no of entity types with terms match ', len(catStat['tf'])
  print '# no of entity types present but no qterm match', len(catStat['nfTerm'])
  print '# no of entity types not present in AOL', len(catStat['nf'])
  if sStat['ef'] > 0:
    print argv[4], '\t', 'Prec', apTotal / sStat['ef'], \
        'Recall', arTotal / sStat['ef']
    print argv[4], '\t', 'Prec', apTotal / sStat['aTerms'], \
        'Recall', arTotal / sStat['aTerms']


def findMarkovStats(argv):

  i = 0

  wikiIndexDir = argv[2]
  queryIndexDir = argv[3]

  iFile = argv[1]

  wIndex, wsearcher = loadIndex(wikiIndexDir, wikiIndexDir)
  qIndex, qsearcher = loadIndex(queryIndexDir, queryIndexDir)

  wtlc = loadCollector(wsearcher, 2000, 20)
  qtlc = loadCollector(qsearcher, 2000, 20)

  qqp = loadQueryParser(qIndex, 'session')
  wqp = loadQueryParser(wIndex, 'content')
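  # loadIndex / loadCollector / loadQueryParser are project helpers that open a
  # full-text index and build a searcher, a top-k results collector, and a
  # query parser; the Wikipedia index is searched on its 'content' field and
  # the query-log index on its 'session' field.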

  prec = {}
  recall = {}

  count = 0.0
  for session in getSessionWithNL(iFile):
    #get the query
    query = session[0].lower()
    query = re.sub(SYMB, ' ', query)
    query = re.sub('\d+', ' ', query)
    query = re.sub('\s+', ' ', query).strip()

    aTerms, bTerms = addedAndRemovedTerms(query, session)

    if aTerms:
      count += 1.0
      totalNetwork = {}
      #stemNetwork = {}
      #queryNetwork = {}
      #wikiNetwork = {}
      terms = updateNetwork(query, totalNetwork, wqp, wsearcher, wtlc,
                            'content', 'wiki')
      terms2 = updateNetwork(query, totalNetwork, qqp, qsearcher, qtlc,
                             'session', 'query')
      print len(terms), len(terms2)
      #updateStemNetwork(queryNetwork,stemNetwork, porter)	
      #updateStemNetwork(wikiNetwork,stemNetwork, porter)
      updateStemNetwork(totalNetwork)
      #normalizeNetworks(queryNetwork)			
      #normalizeNetworks(stemNetwork)			
      #normalizeNetworks(wikiNetwork)

      #calculate the mixtures at two stages
      stage1 = {}
      stage2 = {}
      combineNetwork(1.0, stage1, totalNetwork, 'stem')
      combineNetwork(0.5, stage2, totalNetwork, 'query')
      combineNetwork(0.5, stage2, totalNetwork, 'wiki')
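      # stage1 is the stem-based term network alone; stage2 is an equal
      # (0.5 / 0.5) mixture of the query-log and Wikipedia term networks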

      #convert into matrix for multiplication
      totalDim = sorted(list(set(stage1.keys()) | set(stage2.keys())))

      dim = len(totalDim)
      if dim > 0:
        stage1Matrix = toMatrix(totalDim, stage1)
        print 'STAGE1', stage1Matrix[0], stage1Matrix.shape
        stage2Matrix = toMatrix(totalDim, stage2)
        print 'STAGE2', stage2Matrix[0], stage2Matrix.shape

        backSmooth = 1.0 / len(totalDim)
        stage3Matrix = numpy.zeros((dim, dim))
        stage3Matrix.fill(backSmooth)
        print 'STAGE3', stage3Matrix[0], stage3Matrix.shape
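        # stage3 is a uniform background matrix (1/|V| everywhere), used as a
        # smoothing step in the walk schedule below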

        alpha = 0.80
        #matrix = ['stage2','stage2','stage2','stage2','stage2','stage2','stage2','stage2','stage3']
        matrix = ['stage1', 'stage2', 'stage2', 'stage2', 'stage3']
        totalSum = numpy.zeros((dim, dim))
        cK = numpy.ones((dim, dim))

        #start walk!
        for k in range(len(matrix)):
          print k, matrix[k]
          if matrix[k] == 'stage1':
            cK = numpy.dot(stage1Matrix, cK)
          elif matrix[k] == 'stage2':
            cK = numpy.dot(stage2Matrix, cK)
          else:
            cK = numpy.dot(cK, stage3Matrix)
          print 'CK', cK[0]

          totalSum = totalSum + (math.pow(alpha, k) * cK)
        totalSum = totalSum * (1 - alpha)
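        # After the loop, totalSum = (1 - alpha) * sum_k alpha^k * C_k, where
        # C_k is the running product of the scheduled transition matrices
        # applied to the all-ones start matrix, i.e. a geometrically
        # discounted multi-step walk score between terms.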

        #rank Terms
        qList = []
        terms = query.split()  #getQueryTerms(query)
        for term in terms:
          if term in totalDim:
            qList.append(totalDim.index(term))
          else:
            print "ERROR didn't find", query, '\t', term, len(totalDim)

        termScore = {}
        for i in range(len(totalDim)):
          termScore[totalDim[i]] = 0.0
          for j in qList:
            if totalSum[i][j] > 0.0:
              termScore[totalDim[i]] += math.log(totalSum[i][j])
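        # termScore[t] accumulates log(totalSum[t][q]) over the query-term
        # columns q with non-zero mass, measuring how strongly term t is
        # connected to the query under the discounted walk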

        #find the precision for different term sets
        sortTerms = sorted(termScore.iteritems(),
                           reverse=True,
                           key=lambda x: x[1])
        # the string '10000' is not a valid slice index, so that iteration
        # falls into the except branch and scores against all ranked terms
        for i in [1, 3, 5, 10, 20, 30, 40, 50, 60, 100, '10000']:
          try:
            cTerms = set([x[0] for x in sortTerms[:i]])
            print 'CTERMS ', sortTerms[0:10], len(cTerms), 'ATERMS', aTerms
            p = len(aTerms & cTerms) / (len(aTerms) * 1.0)
            r = len(aTerms & cTerms) / (len(cTerms) * 1.0)
            prec[i] = prec.setdefault(i, 0.0) + p
            recall[i] = recall.setdefault(i, 0.0) + r
            print 'Prec', i, '\t', query, '\t', p
          except Exception as err:
            cTerms = set([x[0] for x in sortTerms])
            p = len(aTerms & cTerms) / (len(aTerms) * 1.0)
            r = len(aTerms & cTerms) / (len(cTerms) * 1.0)
            prec[i] = prec.setdefault(i, 0.0) + p
            recall[i] = recall.setdefault(i, 0.0) + r
            print 'Prec', i, '\t', query, '\t', p

      else:
        for i in [1, 3, 5, 10, 20, 30, 40, 50, 60, 100, '10000']:
          print 'Prec', i, '\t', query, '\t', 0.0

    #average the prec & recall
    #print prec and recall
  print 'Printing Precision'
  for entry, value in prec.iteritems():
    print entry, value / count

  print 'Printing Recall'
  for entry, value in recall.iteritems():
    print entry, value / count

  wIndex.close()
  qIndex.close()
def getStatsPerQuery(argv):
  tagURL = 'http://localhost:8080/rest/annotate'
  catURL = 'http://localhost:8080/rest/graph/get-entity-categories'

  catVector = loadCategoryVector(argv[3])
  f1Dict = getCats(argv[2])
  sFound = 0.0
  sTotal = 0.0
  eTotal = set()
  eRemov = set()
  catFoundNoTerm = set()
  catNotFound = set()
  catTermFound = set()
  catEntity = set()
  outfile = open('match_session_dom.txt', 'w')
  #categoryVectors = {}
  for session in getSessionWithNL(argv[1]):
    catCount = {}
    entCount = {}
    querySpotList = {}
    for query in session:
      #find the entities in query
      try:
        # spot and categorise entities in the query with Dexter
        spotDict = tagQueryWithDexter(query, tagURL, catURL)
        querySpotList[query] = spotDict
        for text in spotDict.keys():
          for entry in spotDict[text]['cat'].split():
            catCount[entry] = catCount.setdefault(entry, 1) + 1
          entCount[text] = entCount.setdefault(text, 1) + 1
      except Exception as err:
        print err
        #print 'SESSION', session, 'CATCOUNT', catCount, 'ENTCOUNT',entCount

    found = False
    if len(catCount) > 0:
      #find the dominant entity
      maxEnt = max(entCount.values())
      #sessionQueryMapping = {}
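      # an entity spot is kept only if its mention count across the session
      # equals the maximum (the "dominant" entity); all other spots are
      # removed below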
      for query, spotList in querySpotList.iteritems():
        matchl = spotList.keys()
        for entry in matchl:
          eTotal.add(entry)
          if entCount[entry] < maxEnt:
            spotList.pop(entry, None)
            print 'Removing spot', query, entry
            eRemov.add(entry)
          else:
            #get the categories
            #catTermMatch = {}
            rquery = query.replace(entry, '')
            queryTerms = set(rquery.split())
            for cat in spotList[entry]['cat'].lower().split():
              catEntity.add(entry + '_' + cat)
              if cat in f1Dict:
                phrase1 = loadPhrasesWithScore(argv[2] + '/' + f1Dict[cat])
                pVector = catVector[cat]
                queryDict = getDictFromSet(queryTerms)
                pTotal = sum(phrase1.values())
                pset = set(phrase1.keys())
                sint = pset & queryTerms
                score = 0.0
                cscore = get_cosine(queryDict, pVector)

                for iphrase in sint:
                  score += phrase1[iphrase] / pTotal
                if len(queryTerms) > 0:
                  score *= (1.0 * len(sint)) / len(queryTerms)
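                # score = (sum of matched phrase weights / total phrase mass),
                # scaled by the fraction of query terms that matched; cscore is
                # the cosine similarity between the query's term vector and the
                # category's phrase vector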

                if sint:

                  outfile.write(query + '\t' + entry + '\t' + cat + '\t' +
                                str(cscore) + '\t' + ', '.join(sint) + '\n')
                  found = True
                  catTermFound.add(entry + '_' + cat)
                else:
                  outfile.write(query + '\t' + entry + '\t' + cat + '\t0\t0\n')
                  catFoundNoTerm.add(cat + '_' + entry)
              else:
                outfile.write(
                    query + '\t' + entry + '\t' + cat + '\t0\tNOT FOUND\n')
                catNotFound.add(cat + '_' + entry)

                #load the terms for category
                #check if these terms match
    if found:
      sFound += 1
    sTotal += 1
    outfile.write('\n')

  print 'Total Sessions ', sTotal
  print 'Sessions with dominant entity in AOL', sFound
  print '# Unique Entities', len(eTotal)
  print '# Removed Entities (non dominant)', len(eRemov)
  print '# no of entity types', len(catEntity)
  print '# no of entity types with terms match ', len(catTermFound)
  print '# no of entity types with no term match', len(catFoundNoTerm)
  print '# no of entity types with no match in AOL', len(catNotFound)