Example #1
0
def main(argv):
  """Accumulate stemmed session terms and clicked-result terms per topic.

  For every record yielded by getSessionWithXML(argv[1]) the terms returned
  by getSessionTerms are merged into otherSessions[topicId], and the title and
  summary terms returned by getClickedSummaryTerms are merged into
  otherClicked[topicId].

  Args:
    argv: argument list; argv[1] is passed to getSessionWithXML and argv[2]
      to SearchIndex.
  """
  porter = stem.porter.PorterStemmer()
  searcher = SearchIndex(argv[2])
  try:
    searcher.initializeAnalyzer()

    # Opening with 'w' creates/truncates both files, as before. The previous
    # code closed two handles that were never opened (NameError) and leaked
    # these two; the with-block closes exactly the files that are opened.
    # NOTE(review): nothing visible here writes the accumulated dictionaries
    # to these files -- confirm whether an output step is missing.
    with open('other-session-words.all', 'w') as oOtherSession, \
         open('other-clicked-words.all', 'w') as oOtherClicked:

      # topicId -> merged term dictionary
      otherSessions = {}
      otherClicked = {}

      for topicId, session, doc, click, cTitle, cSummary in getSessionWithXML(
          argv[1]):

        if topicId not in otherSessions:
          otherSessions[topicId] = {}
          otherClicked[topicId] = {}

        #add session terms
        sessionTerms = getSessionTerms(session, porter)
        otherSessions[topicId] = mergeDicts(otherSessions[topicId],
                                            sessionTerms)

        #add clicked terms (title terms and summary terms merged together)
        cTTerms, cSTerms = getClickedSummaryTerms(
            session, joinLists(cSummary.values()),
            joinLists(cTitle.values()), porter)
        nTerms = mergeDicts(cTTerms, cSTerms)
        otherClicked[topicId] = mergeDicts(otherClicked[topicId], nTerms)
  finally:
    # Release the index even when processing fails part-way.
    searcher.close()
def _writeRunFiles(sessionFile, rel, searcher, outFolder, meth, expand,
                   searchStripped=False):
  """Write the ranked-result files of one expansion method.

  Sessions are numbered from 1 in the order getSessionWithXML yields them. A
  session is processed only when its number is a key of `rel` and its stripped
  first query has not been processed before. For each (key, terms) pair
  returned by `expand`, the top 1000 documents are fetched and appended to
  outFolder/<meth>_<key>.RL1, one line per document:
  "<session no> Q0 <doc id> <rank> <score rounded to 2 places> baseline".

  Args:
    sessionFile: value passed to getSessionWithXML.
    rel: container of session numbers that should be processed.
    searcher: object providing getTopDocumentsWithExpansion.
    outFolder: directory prefix for the output files.
    meth: method name used in the output file names.
    expand: callable(query, cTitle) returning a dict whose items are
      (file-name key, expansion terms).
    searchStripped: search with the stripped query when True, otherwise with
      the raw session[0].
  """
  oFile = {}
  covered = set()
  try:
    for i, (session, viewDocs, clickDocs, cTitle, cSummary) in enumerate(
        getSessionWithXML(sessionFile), 1):
      query = session[0].strip()
      if i not in rel or query in covered:
        continue
      covered.add(query)

      # NOTE(review): the original 'co' loop searched with the stripped query
      # while every other method used the raw session[0]; kept as it was.
      if searchStripped:
        searchText = query
      else:
        searchText = session[0]

      for noTerms, terms in expand(query, cTitle).items():
        if noTerms not in oFile:
          oFile[noTerms] = open(
              outFolder + '/' + meth + '_' + str(noTerms) + '.RL1', 'w')
        out = oFile[noTerms]
        docList = searcher.getTopDocumentsWithExpansion(searchText, terms, 1000,
                                                        'content', 'id')
        # NOTE(review): the run tag is 'baseline' for every method, exactly as
        # in the original output -- confirm whether it should be `meth`.
        for k, dtuple in enumerate(docList, 1):
          out.write(str(i) + ' Q0 ' + dtuple[0] + ' ' + str(k) + ' '
                    + str(round(dtuple[1], 2)) + ' baseline\n')
  finally:
    # Close every file that was opened, also when a query fails mid-run.
    for handle in oFile.values():
      handle.close()


def main(argv):
  """Write retrieval run files for five query-expansion methods.

  The methods run in this order: 'co', 'ent', 'entSub', 'qccTask', 'htcTask'.
  Each one writes its results to argv[9]/<method>_<key>.RL1.

  Args:
    argv: argument list, used as follows:
      argv[1]  passed to getSessionWithXML
      argv[2]  passed to SearchIndex
      argv[3]  passed to loadCategoryVector
      argv[4]  second argument of the CategoryManager built with Category
      argv[5]  second argument of the CategoryManager built with
               CategorySubcluster
      argv[6]  first argument of CoOcManager
      argv[7]  third argument of Dexter
      argv[8]  passed to loadRelJudgements
      argv[9]  output folder
  """
  #open the index
  searcher = SearchIndex(argv[2])
  try:
    searcher.initializeAnalyzer()

    ipaddress = 'localhost'
    #dexter object
    tagURL = 'http://' + ipaddress + ':8080/rest/annotate'
    catURL = 'http://' + ipaddress + ':8080/rest/graph/get-entity-categories'
    dexter = Dexter(tagURL, catURL, argv[7])

    #category vector
    catVect = loadCategoryVector(argv[3])
    catManage1 = CategoryManager(catVect, argv[4], Category)
    catManage2 = CategoryManager(catVect, argv[5], CategorySubcluster)

    #load the Category co-occurrence bit
    catCoMan = CoOcManager(argv[6], CoOccurrence(), ' ')

    #ranker
    ranker = Ranker()

    #task extraction
    htcTask = TaskExpansion('Indexes/htcIndex', ranker, 3000)
    qccTask = TaskExpansion('Indexes/qccIndex', ranker, 3000)

    #expansion
    entExp1 = CatThesExpansion(dexter, catManage1, ranker, catCoMan)
    entExp2 = CatThesExpansion(dexter, catManage2, ranker, catCoMan)
    #term expansion
    coOccExp = CoOccurExpansion(catCoMan, None, ranker)

    # Only the first value returned by loadRelJudgements is used here.
    rel, _noRel = loadRelJudgements(argv[8])

    outFolder = argv[9]

    # The entity-based methods read `porter`, which was never bound in this
    # function (its assignment had been commented out).
    porter = stem.porter.PorterStemmer()

    def coTerms(query, cTitle):
      return coOccExp.expandTextWithStep(query, 50, 55, 5)

    def entTerms(query, cTitle):
      cText = normalize(' '.join(cTitle[0]), porter)
      _status, terms = entExp1.expandTextWithStep(query, cText, 1, 50, 55, 5)
      return terms

    def entSubTerms(query, cTitle):
      cText = normalize(' '.join(cTitle[0]), porter)
      _status, terms = entExp2.expandTextWithStepAndSubcluster(
          query, cText, 1, 50, 55, 5)
      return terms

    def qccTerms(query, cTitle):
      return qccTask.expandTextWithStep(query, 50, 55, 5)

    def htcTerms(query, cTitle):
      return htcTask.expandTextWithStep(query, 50, 55, 5)

    # (method name, expansion callable, search with the stripped query?)
    runs = [
        ('co', coTerms, True),
        ('ent', entTerms, False),
        ('entSub', entSubTerms, False),
        ('qccTask', qccTerms, False),
        ('htcTask', htcTerms, False),
    ]
    for meth, expand, searchStripped in runs:
      _writeRunFiles(argv[1], rel, searcher, outFolder, meth, expand,
                     searchStripped)
  finally:
    # Release the index even when one of the runs fails.
    searcher.close()