コード例 #1
0
def kFoldEvaluation(k, sessFile, featFile, weightFile, percent, typeFile):
  sessions = loadSessions(sessFile)
  #weightMatrix = readWeightMatrix(weightFile)
  #
  #p1 = {}
  #r1 = {}
  #p2 = {}
  #r2 = {}
  #p3 = {}
  #r3 = {}
  #p4 = {}
  #r4 = {}

  #
  amean = []
  ymean = []
  for i in range(k):
    x, y, uniqx, uniqy = sampleSessions(sessions, percent)
    acount = 0.0
    ylen = 0.0
    termList, termDict = getTermList(uniqx)
    for session in y:
      aTerms, rTerms = addedAndRemovedTerms(session[0], session[1:], termDict)
      acount += len(aTerms)
      ylen += len(session)
    print acount, ylen, acount / len(y), ylen / len(y)
    amean.append(acount / len(y))
    ymean.append(ylen / len(y))

  print np.mean(amean), np.mean(ymean)
コード例 #2
0
def predictTerms(queryList, y, qclusters):
  termList, termDict = getTermList(queryList)
  oracle_prec = 0.0
  oracle_mrr = 0.0
  added = 0
  cScorer = ScoreClusterTerms()
  for session in y:
    query = session[0]
    aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict)
    if len(aTerms) > 0:
      prec1, mrr1 = getPrecRecall(termList, aTerms)
      added += 1.0
      oracle_prec += prec1
      oracle_mrr += mrr1
  print 'Oracle prec and recall ', oracle_prec / added, oracle_mrr / added, added
  #porter = stem.porter.PorterStemmer();
  clusters, clusIndex = toTerms(qclusters)
  lim = 5
  i = 0
  prec = {}
  mrr = {}
  pf = 0.0
  pr = 0.0
  for session in y:
    query = session[0].strip()
    qSet = getQueryTerms(query)  #getQueryTermsStemmed(query, porter);
    aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict)
    if len(aTerms) > 0:
      terms = cScorer.scoreWithCosine(qSet, clusters, clusIndex, lim)

      if len(terms) > 0:
        #print len(aTerms), len(terms)
        prec1, mrr1 = getClustPrecRecall(terms, aTerms)  # returns a list
        #print 'METRIC',i, prec1, mrr1
        #print topk , prec1, mrr1
        if sum(prec1) > 0:
          pf += 1.0

        if sum(mrr1) > 0:
          pr += 1.0

        for topk in range(len(prec1)):
          if topk not in prec:
            prec[topk] = []
            mrr[topk] = []

          prec[topk].append(prec1[topk])
          mrr[topk].append(mrr1[topk])
      i += 1

  retPrec = {}
  retRecall = {}

  for entry, ls in prec.items():
    print 'Prec @', entry, np.mean(ls)
    retPrec[entry] = np.mean(ls)

  for entry, ls in mrr.items():
    print 'Recall @', entry, np.mean(ls)
    retRecall[entry] = np.mean(ls)

  print 'Percentage ', pf / i, pr / i

  return retPrec, retRecall
コード例 #3
0
ファイル: __init__.py プロジェクト: vmanisha/QueryExpansion
  entExp1 = CatThesExpansion(dexter, catManage1, ranker, catCoMan, None)
  entExp2 = CatThesExpansion(dexter, catManage2, ranker, catCoMan, None)

  oFile1 = open(argv[1][:argv[1].rfind('.')] + '_ent.txt', 'w')
  oFile2 = open(argv[1][:argv[1].rfind('.')] + '_entSub.txt', 'w')
  i = 0

  porter = stem.porter.PorterStemmer()
  totalVocab = loadFileInList(argv[7])

  stats = {'sess': 0.0, 'aTerms': 0.0, 'ent': 0.0, 'enta': 0.0, 'acount': 0.0}
  for session, documents, clicks, cTitle, scontents in getSessionWithXML(
      argv[1]):
    query = session[0]
    cText = None
    aTerms, rTerms = addedAndRemovedTerms(query, session[1:], totalVocab)
    entStatus2, entExpTerms2 = entExp2.getTopSubclusters(query, cText, 1, 5)
    if entStatus2:
      stats['ent'] += 1

    if len(aTerms) > 0:
      stats['aTerms'] += 1

      if entStatus2:
        stats['enta'] += 1.0

    stats['sess'] += 1.0
    stats['acount'] += len(aTerms)
    '''cText = normalize(' '.join(cTitle[0]),porter);

                i+=1
コード例 #4
0
def main(argv):

  #Scorer
  coSessOccur = CoOccurrence()
  coSessOcMan = CoOcManager(argv[2], coSessOccur, ' ')
  tScorer = CoOccurSimScore(coSessOcMan)
  cScorer = ScoreClusterTerms()

  #vocab = set()
  i = 0
  prec = {}
  mrr = {}
  lim = 55

  queryList = loadFileInList(argv[5])
  termList, termDict = getTermList(queryList)
  print len(termList)
  added = 0
  oracle_prec = 0.0
  oracle_mrr = 0.0
  for tid, session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(
      argv[1]):
    query = session[0].strip()
    aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict)
    if len(aTerms) > 0:
      prec1, mrr1 = getPrecRecall(termList, aTerms)
      added += 1.0
      oracle_prec += prec1
      oracle_mrr += mrr1

  print 'Oracle prec and recall ', oracle_prec / added, oracle_mrr / added

  porter = stem.porter.PorterStemmer()
  ttype = argv[6]

  print ttype

  for iFile in os.listdir(argv[3]):
    qclusters = loadClusters(argv[3] + '/' + iFile)
    clusters, clusIndex = toTerms(qclusters)

    print iFile, len(clusters)
    prec[iFile] = {}
    mrr[iFile] = {}
    added = 0.0
    i = 1
    for tid, session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(
        argv[1]):
      i += 1
      query = session[0].strip()
      qSet = getQueryTermsStemmed(query, porter)

      print 'Query ', query, qSet
      if ttype == 'query':
        aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict)
      elif ttype == 'title':
        aTerms = getTerms(cTitle, qSet, termDict, porter, range(
            1, len(session) - 1))
      else:
        aTerms = getTerms(cTitle, qSet, termDict, porter, range(
            1, len(session) - 1))
        bTerms = getTerms(cSummary, qSet, termDict, porter, range(
            1, len(session) - 1))
        aTerms = aTerms | bTerms
        #aTerms,rTerms = addedAndRemovedTerms(query, session[1:], None )

      if len(aTerms) > 0:
        terms = cScorer.scoreWithIndex(qSet, clusters, clusIndex, tScorer, lim)
        #terms = cScorer.scoreWithClustPos(qSet, clusters,tScorer, lim)
        print 'TERMS', '\t', i, '\t', ttype, '\t', iFile, '\t', len(
            terms), terms
        #for topk in range(1,lim,5):
        prec1, mrr1 = getClustPrecMrr(terms, aTerms)  # returns a list
        print 'METRIC', iFile, i, prec1, mrr1
        #print topk , prec1, mrr1
        for topk in prec1.keys():
          if topk not in prec[iFile]:
            prec[iFile][topk] = []
            mrr[iFile][topk] = []

          prec[iFile][topk].append(prec1[topk])
          mrr[iFile][topk].append(mrr1[topk])

          #prec[iFile][topk] += prec1
          #mrr[iFile][topk] += mrr1
        added += 1.0
      #if i == 3:
      #	break

  for fName, scoreDict in prec.items():
    for pos in scoreDict.keys():
      print 'Prec all', fName, pos, len(scoreDict[pos])
      total = sum(scoreDict[pos])
      prec[fName][pos] = total / added  #len(scoreDict[pos])
      print 'Prec', fName, pos, prec[fName][pos], total

  for fName, scoreDict in mrr.items():
    for pos in scoreDict.keys():
      print 'Mrr all', fName, pos, len(scoreDict[pos])
      total = sum(mrr[fName][pos])
      mrr[fName][pos] = total / added  #len(scoreDict[pos])
      print 'MRR', fName, pos, mrr[fName][pos], total
  #for entry in prec.keys():
  #for t in prec[entry].keys():
  #print 'Prec',entry, t, prec[entry][t], prec[entry][t]/added
  #prec[entry][t]/=added

  #for entry in mrr.keys():
  #for t in mrr[entry].keys():
  #print 'Mrr',entry, t, mrr[entry][t], mrr[entry][t]/added
  #mrr[entry][t]/=added

  print 'Plotting Precision and MRR'

  plotMultipleSys(prec, 'No of Terms', 'Prec', argv[4] + 'prec.png',
                  'Term Prediction Prec Plot')
  plotMultipleSys(mrr, 'No of Terms', 'MRR', argv[4] + 'mrr.png',
                  'Term Prediction MRR Plot')
コード例 #5
0
def getStatsPerSession(catVector, f1Dict, argv):

  tagURL = 'http://localhost:8080/rest/annotate'
  catURL = 'http://localhost:8080/rest/graph/get-entity-categories'

  print 'Cats ', len(f1Dict)
  #stats
  sStat = {'ef': 0, 'total': 0, 'aTerms': 0}
  #eStat = {'total':set(), 'remov':set()}
  catStat = {'nfTerm': set(), 'nf': set(), 'tf': set(), 'total': set()}
  outfile = open('match_session_' + str(argv[4]) + '.txt', 'w')
  #categoryVectors = {}
  #load the session
  arTotal = 0.0
  apTotal = 0.0
  for session in getSessionWithNL(argv[1]):
    bQuery = session[0].lower()
    bQuery = re.sub(SYMB, ' ', bQuery)
    bQuery = re.sub('\s+', ' ', bQuery).strip()
    aTerms, rTerms = addedAndRemovedTerms(bQuery, session[1:])
    arMax = 0.0
    apMax = 0.0
    try:
      spotDict = None  #tagQueryWithDexter(bQuery, tagURL,catURL)
      time.sleep(1)
      if aTerms:
        sStat['aTerms'] += 1.0
        if len(spotDict) > 0:
          sStat['ef'] += 1.0
          print 'Found Entity \t', '\t'.join(session)
        for entry in spotDict.keys():
          rquery = bQuery.replace(entry, '')
          queryTerms = set(rquery.split())
          catList = spotDict[entry]['cat'].lower().split()
          #notFound, maxCat, rDict = getPrecRecall('avg',catList,f1Dict,catVector, queryTerms, argv[2])
          #print 'Avg', notFound, rDict
          notFound, maxCat, rDict = getPrecRecall(
              'max', catList, f1Dict, catVector, queryTerms, aTerms, int(
                  argv[4]))
          print 'Max', bQuery, 'Ent', entry, 'Cat', maxCat, 'NFC', notFound, rDict
          nf = 0
          for centry in catList:
            catStat['total'].add(centry + '_' + entry)
            if centry in notFound:
              catStat['nf'].add(centry + '_' + entry)
              nf += 1.0
            else:
              if rDict and len(rDict['qInt']) == 0:
                catStat['nfTerm'].add(centry + '_' + entry)
          if nf == len(catList):
            print 'For Query', bQuery, 'With ent list', spotDict.keys(
            ), 'for ENT', entry, 'No cat found'

          if rDict:
            #to choose the type with max values
            if arMax < rDict['aR']:
              arMax = rDict['aR']
            if apMax < rDict['aP']:
              apMax = rDict['aP']

            outfile.write(
                bQuery + '\t' + entry + '\t' + str(rDict['qS']) + '\t' +
                ', '.join(rDict['qInt']) + '\t' + ', '.join(rDict['aInt']) +
                '\t' + str(rDict['aR']) + '\t' + str(rDict['aP']) + '\n')
      #else:
      #	outfile.write(bQuery+'\tNOT\tNOT\tNOT\tNO TERMS\n')
    except Exception as err:
      print 'SESSION WITH ERR', session, err, err.args
    if aTerms:
      print 'Prec ', argv[4], bQuery, '\t', apMax
      for query in session[1:]:
        outfile.write(query + '\tNIL\t0.0\tNIL\tNIL\t0.0\t0.0\n')

    sStat['total'] += 1
    outfile.write('\n')
    apTotal += apMax
    arTotal += arMax

  print 'Total Sessions ', sStat['total']
  print 'Sessions with entity in AOL', sStat['ef']
  print '# no of entity types', len(catStat['total'])
  #print '# no of entity types with terms match ', len(catStat['tf'])
  print '# no of entity types present but no qterm match', len(
      catStat['nfTerm'])
  print '# no of entity types not present in AOL', len(catStat['nf'])
  if sStat['ef'] > 0:
    print argv[4], '\t', 'Prec', apTotal / sStat['ef'
                                           ], 'Recall', arTotal / sStat['ef']
    print argv[4], '\t', 'Prec', apTotal / sStat[
        'aTerms'
    ], 'Recall', arTotal / sStat['aTerms']
コード例 #6
0
def main(argv):
    # Compare several query-expansion methods on the sessions in argv[1]:
    # for every session with target terms, expansion terms are produced by
    # co-occurrence, entity-subcluster and task ("qcc") expansion, scored
    # with getPrecRecall, accumulated with updateStats, then printed and
    # plotted.
    #
    # Positional arguments as used below:
    #   argv[1]  session file read with getSessionWithXML (also names plots)
    #   argv[2]  input for loadCategoryVector
    #   argv[3]  second argument of the Category-based CategoryManager
    #   argv[4]  input for the category CoOcManager
    #   argv[5]  third argument of Dexter
    #   argv[6]  vocabulary file loaded with loadFileInList
    #   argv[7]  second argument of the CategorySubcluster-based manager
    #   argv[9]  directory prefix for the output plots
    #   argv[10] target type: "query", "title", or anything else for
    #            title + summary terms
    ipaddress = "localhost"
    # dexter object
    tagURL = "http://" + ipaddress + ":8080/rest/annotate"
    catURL = "http://" + ipaddress + ":8080/rest/graph/get-entity-categories"
    dexter = Dexter(tagURL, catURL, argv[5])

    # load the Category co-occurrence bit
    catCoMan = CoOcManager(argv[4], CoOccurrence(), " ")

    # category vector
    catVect = loadCategoryVector(argv[2])
    # NOTE: catManage1 is only referenced by the commented-out entExp1 below.
    catManage1 = CategoryManager(catVect, argv[3], Category)
    catManage2 = CategoryManager(catVect, argv[7], CategorySubcluster)

    # ranker
    ranker = Ranker()
    totalVocab = loadFileInList(argv[6])
    # task extraction
    # htcTask = TaskExpansion('Indexes/htcIndex',ranker,3000);
    qccTask = TaskExpansion("Indexes/qccIndex", ranker, 3000, totalVocab)
    # taskK = argv[5][argv[5].rfind('/')+1:]

    wordFeatMan = None
    # WordManager(argv[8],False);

    # expansion
    # entExp1 = CatThesExpansion(dexter, catManage1, ranker,catCoMan,wordFeatMan);
    entExp2 = CatThesExpansion(dexter, catManage2, ranker, catCoMan, wordFeatMan)
    # term expansion
    coOccExp = CoOccurExpansion(catCoMan, None, ranker)
    # randomWalk
    # randWalk = RandomWalk(argv[2],argv[3],ranker)
    # Metrics over all sessions with target terms, keyed by method name.
    # Only "entSub", "qccTask" and "co" are updated in the code below; the
    # "ent" and "htcTask" updates are commented out.
    prec = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}}
    mrr = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}}

    # Same metrics, but only updated when entStatus1 is truthy.
    ent_prec = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}}
    ent_mrr = {"ent": {}, "qccTask": {}, "htcTask": {}, "co": {}, "entSub": {}}

    """
	sess_prec = {};
	sess_mrr = {};
	"""
    # Written for every scored query but not read in the visible code.
    covered = {}

    # Running session index, used in the log lines only.
    i = 0

    porter = stem.porter.PorterStemmer()

    ttype = argv[10]

    for session, doc, click, cTitle, cSummary in getSessionWithXML(argv[1]):
        query = session[0]
        qSet = getQueryTerms(query)
        # print 'Title, Summary clicked ',cTitle[0], cSummary[0];
        aTerms = None
        # cText = normalize(' '.join(cTitle[0]),porter);
        # Choose the target terms the expansions are scored against.
        if ttype == "query":
            aTerms, rTerms = addedAndRemovedTerms(query, session[1:], totalVocab)
        elif ttype == "title":
            aTerms = getTerms(cTitle, qSet, totalVocab, porter, range(1, len(session) - 1))
        else:
            # Union of the terms taken from cTitle and from cSummary.
            aTerms = getTerms(cTitle, qSet, totalVocab, porter, range(1, len(session) - 1))
            bTerms = getTerms(cSummary, qSet, totalVocab, porter, range(1, len(session) - 1))
            aTerms = aTerms | bTerms

        print i, "Query", query, aTerms, len(aTerms)

        if len(aTerms) > 0:  # and query not in covered:
            covered[query] = 1

            # Each expansion result is iterated below as a mapping from
            # noTerms to the terms scored for that entry.
            # NOTE(review): the 0, 55, 5 arguments look like a start/stop/step
            # over the number of expansion terms -- confirm against
            # expandTextWithStep.
            coExpTerms = coOccExp.expandTextWithStep(query, 0, 55, 5)

            # entStatus1, entExpTerms1 = entExp1.expandTextWithStep(query,'',1,0,55,5);
            # entStatus1 comes from entExp2 here; it gates the ent_* updates.
            entStatus1, entExpTerms2 = entExp2.expandTextWithStepAndSubcluster(query, "", 1, 0, 55, 5)

            qccTaskTerms = qccTask.expandTextWithStep(query, 0, 55, 5)
            # htcTaskTerms = htcTask.expandTextWithStep(query,0,55,5)
            # randExpTerms = randWalk.expandTextWithStep(query,55,105,5)
            if not entStatus1:
                print i, "Ent False", query

                # addLen = getBand(len(aTerms));
                # if addLen not in sess_prec:
                # 	sess_prec[addLen] = {'ent':{}};#, 'qccTask':{}, 'htcTask':{}, 'co':{} };
                # 	sess_mrr[addLen] = {'ent':{}};#, 'qccTask':{}, 'htcTask':{}, 'co':{} };

                # for noTerms in entExpTerms1.keys():
                # print 'ETerms\t',i,'\t',query,'\t',entExpTerms1[noTerms],'\t',noTerms;
                # prec1 , mrr1 = getPrecRecall(entExpTerms1[noTerms],aTerms);
                # prec = updateStats(noTerms, 'ent',prec1, prec);
                # mrr = updateStats(noTerms, 'ent',mrr1, mrr);
                # if entStatus1:
                # ent_prec = updateStats(noTerms, 'ent',prec1, ent_prec)
                # ent_mrr = updateStats(noTerms, 'ent',mrr1, ent_mrr);
                ##sess_prec[addLen] = updateStats(noTerms, 'ent',prec1, sess_prec[addLen])
                ##sess_mrr[addLen] = updateStats(noTerms, 'ent',mrr1, sess_mrr[addLen]);
                # print 'EMetrics ',i,'\t',noTerms,'\t', len(aTerms), '\t', aTerms, '\t',prec1, '\t',mrr1;
                #
            # Entity-subcluster expansion ("entSub").
            for noTerms in entExpTerms2.keys():
                print "ESubTerms\t", i, "\t", query, "\t", entExpTerms2[noTerms], "\t", noTerms
                prec1, mrr1 = getPrecRecall(entExpTerms2[noTerms], aTerms)
                prec = updateStats(noTerms, "entSub", prec1, prec)
                mrr = updateStats(noTerms, "entSub", mrr1, mrr)
                if entStatus1:
                    ent_prec = updateStats(noTerms, "entSub", prec1, ent_prec)
                    ent_mrr = updateStats(noTerms, "entSub", mrr1, ent_mrr)
                    # sess_prec[addLen] = updateStats(noTerms, 'ent',prec1, sess_prec[addLen])
                    # sess_mrr[addLen] = updateStats(noTerms, 'ent',mrr1, sess_mrr[addLen]);
                print "ESubMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1

            # Task-based expansion ("qccTask").
            for noTerms in qccTaskTerms.keys():
                print "qccTaskTerms\t", i, "\t", query, "\t", qccTaskTerms[noTerms], "\t", noTerms
                prec1, mrr1 = getPrecRecall(qccTaskTerms[noTerms], aTerms)
                prec = updateStats(noTerms, "qccTask", prec1, prec)
                mrr = updateStats(noTerms, "qccTask", mrr1, mrr)
                if entStatus1:
                    ent_prec = updateStats(noTerms, "qccTask", prec1, ent_prec)
                    ent_mrr = updateStats(noTerms, "qccTask", mrr1, ent_mrr)
                """
				sess_prec[addLen] = updateStats(noTerms, 'qccTask',prec1, sess_prec[addLen])
				sess_mrr[addLen] = updateStats(noTerms, 'qccTask',mrr1, sess_mrr[addLen]);
				"""
                print "qccTaskMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1

                # for noTerms in htcTaskTerms.keys():
                # print 'htcTaskTerms\t',i,'\t',query,'\t',htcTaskTerms[noTerms],'\t',noTerms
                # prec1 , mrr1 = getPrecRecall(htcTaskTerms[noTerms],aTerms)
                # prec = updateStats(noTerms, 'htcTask',prec1, prec)
                # mrr = updateStats(noTerms, 'htcTask',mrr1, mrr);
                # if entStatus1:
                # ent_prec = updateStats(noTerms, 'htcTask',prec1, ent_prec)
                # ent_mrr = updateStats(noTerms, 'htcTask',mrr1, ent_mrr);
                ##sess_prec[addLen] = updateStats(noTerms, 'htcTask',prec1, sess_prec[addLen])
                ##sess_mrr[addLen] = updateStats(noTerms, 'htcTask',mrr1, sess_mrr[addLen]);
                #
                # print 'htcTaskMetrics ',i,'\t',noTerms,'\t', len(aTerms), '\t', aTerms, '\t',prec1, '\t',mrr1

            # Co-occurrence expansion ("co").
            for noTerms in coExpTerms.keys():
                print "CoTerms\t", i, "\t", query, "\t", coExpTerms[noTerms], "\t", noTerms
                prec1, mrr1 = getPrecRecall(coExpTerms[noTerms], aTerms)
                prec = updateStats(noTerms, "co", prec1, prec)
                mrr = updateStats(noTerms, "co", mrr1, mrr)
                if entStatus1:
                    ent_prec = updateStats(noTerms, "co", prec1, ent_prec)
                    ent_mrr = updateStats(noTerms, "co", mrr1, ent_mrr)
                """
				sess_prec[addLen] = updateStats(noTerms, 'co',prec1, sess_prec[addLen])
				sess_mrr[addLen] = updateStats(noTerms, 'co' ,mrr1, sess_mrr[addLen]);
				"""
                print "CoMetrics ", i, "\t", noTerms, "\t", len(aTerms), "\t", aTerms, "\t", prec1, "\t", mrr1

        else:
            pass
            # print 'NO ADDED TERMS in', i;
        i += 1

    # Report every method, including "ent" and "htcTask" whose dicts are
    # never updated above.
    printMetric(prec, "entSub", "Prec")
    printMetric(mrr, "entSub", "Mrr")

    printMetric(prec, "ent", "Prec")
    printMetric(mrr, "ent", "Mrr")

    printMetric(prec, "htcTask", "Prec")
    printMetric(mrr, "htcTask", "Mrr")

    printMetric(prec, "qccTask", "Prec")
    printMetric(mrr, "qccTask", "Mrr")

    printMetric(prec, "co", "Prec")
    printMetric(mrr, "co", "Mrr")

    printMetric(ent_prec, "entSub", "EntPrec")
    printMetric(ent_mrr, "entSub", "EntMrr")

    printMetric(ent_prec, "ent", "EntPrec")
    printMetric(ent_mrr, "ent", "EntMrr")

    printMetric(ent_prec, "htcTask", "EntPrec")
    printMetric(ent_mrr, "htcTask", "EntMrr")

    printMetric(ent_prec, "qccTask", "EntPrec")
    printMetric(ent_mrr, "qccTask", "EntMrr")

    printMetric(ent_prec, "co", "EntPrec")
    printMetric(ent_mrr, "co", "EntMrr")

    # Plot names are built from argv[9], the part of argv[1] after the last
    # "/" with its final four characters dropped, and a suffix.  The two
    # entity plots end up with a double underscore ("_" + "_ent_...").
    plotMultipleSys(
        prec,
        "No of Terms",
        "Prec",
        argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "prec.png",
        "Term Prediction Prec Plot",
    )
    plotMultipleSys(
        mrr,
        "No of Terms",
        "MRR",
        argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "mrr.png",
        "Term Prediction MRR Plot",
    )
    plotMultipleSys(
        ent_prec,
        "No of Terms",
        "Prec",
        argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "_ent_prec.png",
        "Term Prediction Prec Plot (Ent queries)",
    )
    plotMultipleSys(
        ent_mrr,
        "No of Terms",
        "MRR",
        argv[9] + "/" + argv[1][argv[1].rfind("/") + 1 : -4] + "_" + "_ent_mrr.png",
        "Term Prediction MRR Plot (Ent queries)",
    )

    # htcTask.closeIndex();
    qccTask.closeIndex()
    """