Ejemplo n.º 1
0
def kFoldEvaluation(k, sessFile, featFile, weightFile, percent, typeFile):
  sessions = loadSessions(sessFile)
  #weightMatrix = readWeightMatrix(weightFile)
  #
  #p1 = {}
  #r1 = {}
  #p2 = {}
  #r2 = {}
  #p3 = {}
  #r3 = {}
  #p4 = {}
  #r4 = {}

  #
  amean = []
  ymean = []
  for i in range(k):
    x, y, uniqx, uniqy = sampleSessions(sessions, percent)
    acount = 0.0
    ylen = 0.0
    termList, termDict = getTermList(uniqx)
    for session in y:
      aTerms, rTerms = addedAndRemovedTerms(session[0], session[1:], termDict)
      acount += len(aTerms)
      ylen += len(session)
    print acount, ylen, acount / len(y), ylen / len(y)
    amean.append(acount / len(y))
    ymean.append(ylen / len(y))

  print np.mean(amean), np.mean(ymean)
Ejemplo n.º 2
0
def predictTerms(queryList, y, qclusters):
  termList, termDict = getTermList(queryList)
  oracle_prec = 0.0
  oracle_mrr = 0.0
  added = 0
  cScorer = ScoreClusterTerms()
  for session in y:
    query = session[0]
    aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict)
    if len(aTerms) > 0:
      prec1, mrr1 = getPrecRecall(termList, aTerms)
      added += 1.0
      oracle_prec += prec1
      oracle_mrr += mrr1
  print 'Oracle prec and recall ', oracle_prec / added, oracle_mrr / added, added
  #porter = stem.porter.PorterStemmer();
  clusters, clusIndex = toTerms(qclusters)
  lim = 5
  i = 0
  prec = {}
  mrr = {}
  pf = 0.0
  pr = 0.0
  for session in y:
    query = session[0].strip()
    qSet = getQueryTerms(query)  #getQueryTermsStemmed(query, porter);
    aTerms, rTerms = addedAndRemovedTerms(query, session[1:], termDict)
    if len(aTerms) > 0:
      terms = cScorer.scoreWithCosine(qSet, clusters, clusIndex, lim)

      if len(terms) > 0:
        #print len(aTerms), len(terms)
        prec1, mrr1 = getClustPrecRecall(terms, aTerms)  # returns a list
        #print 'METRIC',i, prec1, mrr1
        #print topk , prec1, mrr1
        if sum(prec1) > 0:
          pf += 1.0

        if sum(mrr1) > 0:
          pr += 1.0

        for topk in range(len(prec1)):
          if topk not in prec:
            prec[topk] = []
            mrr[topk] = []

          prec[topk].append(prec1[topk])
          mrr[topk].append(mrr1[topk])
      i += 1

  retPrec = {}
  retRecall = {}

  for entry, ls in prec.items():
    print 'Prec @', entry, np.mean(ls)
    retPrec[entry] = np.mean(ls)

  for entry, ls in mrr.items():
    print 'Recall @', entry, np.mean(ls)
    retRecall[entry] = np.mean(ls)

  print 'Percentage ', pf / i, pr / i

  return retPrec, retRecall