#These examples assume at least the following imports; the stem.porter.PorterStemmer
#usage suggests NLTK. Project-specific classes (CoOccurrence, Dexter, CategoryManager,
#Ranker, etc.) come from the surrounding codebase and are not shown here.
import ast
import json
from nltk import stem


def main(argv):
  #for each query in the query-frequency file:
  #get its unigrams and update pairwise term co-occurrence counts

  coOccur = CoOccurrence()
  stemmer = stem.porter.PorterStemmer()
  for line in open(argv[1], 'r'):
    split = line.strip().split('\t')

    query = normalize(split[0].strip(), stemmer)
    freq = int(split[1].strip())
    #generate unigrams
    ngrams = getNGramsAsList(query, 1)
    #if it has more than one term
    lngrams = len(ngrams)
    if lngrams > 1:

      for i in range(lngrams - 1):
        if ngrams[i] not in stopSet and len(ngrams[i]) > 2:
          for j in range(i + 1, lngrams):
            if ngrams[j] not in stopSet and len(ngrams[j]) > 2:
              coOccur.updateStats(ngrams[i], ngrams[j], freq)
  coOccur.setTermTotal()
  #for each query, find the terms it co-occurs with most strongly
  for line in open(argv[2], 'r'):
    split = line.split('\t')
    query = normalize(split[1].lower().strip(), stemmer)
    nGrams = getNGramsAsList(query, 1)
    toScore = set()
    result = {}

    for entry in nGrams:
      elist = coOccur.getNeighbours(entry)
      if elist:
        toScore |= set(elist)

    for term1 in toScore:
      #skip candidate terms already contained in the query string
      if term1 not in query:
        result[term1] = 0.0
        for term2 in nGrams:
          pmi = coOccur.getPMI(term1, term2, 50)
          result[term1] += pmi
        result[term1] /= len(nGrams)

    #drop candidate terms whose aggregate PMI is zero
    for entry in list(result.keys()):
      if result[entry] == 0:
        del result[entry]

    sort = sorted(result.items(), reverse=True, key=lambda x: x[1])
    print query, '\t', '\t'.join('{0}:{1}'.format(x[0], round(x[1], 3))
                                 for x in sort[:50])
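#The CoOccurrence class used above is not included in this listing. A minimal
#sketch of the interface main() relies on (updateStats / setTermTotal /
#getNeighbours / getPMI) is given below; the class name, the counting scheme and
#the reading of getPMI's third argument as a minimum pair count are assumptions,
#not the project's actual implementation.
import math
from collections import defaultdict


class CoOccurrenceSketch(object):

  def __init__(self):
    self.pair = defaultdict(float)  #co-occurrence count per unordered term pair
    self.term = defaultdict(float)  #marginal count per term
    self.total = 0.0                #total mass, fixed once counting is finished

  def updateStats(self, t1, t2, freq):
    key = (t1, t2) if t1 < t2 else (t2, t1)
    self.pair[key] += freq
    self.term[t1] += freq
    self.term[t2] += freq

  def setTermTotal(self):
    self.total = sum(self.term.values())

  def getNeighbours(self, term):
    return [a if b == term else b for (a, b) in self.pair if term in (a, b)]

  def getPMI(self, t1, t2, minCount):
    key = (t1, t2) if t1 < t2 else (t2, t1)
    joint = self.pair.get(key, 0.0)
    if joint < minCount or self.total == 0.0:
      return 0.0  #rare pairs contribute nothing, mirroring the cutoff above
    pxy = joint / self.total
    px = self.term[t1] / self.total
    py = self.term[t2] / self.total
    return math.log(pxy / (px * py))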
Example #2
def normalizeDict(idict, stemmer):
  #stem/normalize each key and merge the counts of keys that collapse together
  returnDict = {}
  #tSum = sum(idict.values())*1.0

  for entry, count in idict.items():
    entry = normalize(entry, stemmer)
    if len(entry) > 2:
      try:
        returnDict[entry] += count  #/tSum
      except KeyError:
        returnDict[entry] = count  #/tSum

  return returnDict
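#The normalize() helper used throughout this listing is also not shown. Judging
#from how it is called (lowercased, Porter-stemmed text), it is probably close to
#the sketch below; treat this as an assumption rather than the project's real
#helper.
def normalizeSketch(text, stemmer):
  #lowercase, split on whitespace, stem every token, and rejoin
  return ' '.join(stemmer.stem(tok) for tok in text.lower().split())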
Example #3
def mergeFeatures(featFile, taggedFile, newFile):
  entFeatDict = {}
  stemmer = stem.porter.PorterStemmer()
  for line in open(taggedFile, 'r'):
    spotDict = ast.literal_eval(line.strip())
    normQuery = normalize(spotDict['text'], stemmer)
    if normQuery not in entFeatDict:
      entFeatDict[normQuery] = []
    #convert the entity and category annotations into dictionaries
    entVect = {}
    catVect = {}
    for entry in spotDict['spots']:
      entVect[entry['wikiname']] = 1.0
      cats = ast.literal_eval(entry['cat'])
      for cat in cats:
        if cat not in catVect:
          catVect[cat] = 0.0
        catVect[cat] += 1.0
    entFeatDict[normQuery].append(entVect)
    entFeatDict[normQuery].append(catVect)

  print len(entFeatDict)

  featDict = {}

  outF = open(newFile, 'w')

  for line in open(featFile, 'r'):
    split = line.split('\t')
    query = split[0].strip()
    featDict[query] = []
    for entry in split[1:]:
      featDict[query].append(entry.strip())
    if query in entFeatDict:
      featDict[query] = featDict[query] + entFeatDict[query]
    else:
      featDict[query] = featDict[query] + [{}, {}]
      print 'Query not tagged! ', query

    outF.write(query)
    for entry in featDict[query]:
      outF.write('\t' + str(entry))
    outF.write('\n')

  outF.close()
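#The tagged file read by mergeFeatures (and by the phrase functions below) is
#assumed to hold one Python-literal dict per line exposing the fields the code
#accesses ('text', 'spots', 'mention', 'wikiname', 'cat'); the concrete values in
#this sample are purely illustrative.
sampleTaggedLine = ("{'text': 'barack obama wife', "
                    "'spots': [{'mention': 'barack obama', "
                    "'wikiname': 'Barack_Obama', "
                    "'cat': \"['Presidents_of_the_United_States']\"}]}")
sampleSpotDict = ast.literal_eval(sampleTaggedLine)  #same parsing step as above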
Example #4
def generatePhraseStats(labelFile):
  phrases = {}
  stemmer = stem.porter.PorterStemmer()
  entQuery = 0.0
  for line in open(labelFile, 'r'):

    try:
      entList = []
      spotDict = ast.literal_eval(line)
      text = spotDict['text']
      for entry in spotDict['spots']:
        mention = entry['mention']
        text = text.replace(mention, '<entity>').strip()
        entList.append(entry['wikiname'])

      if len(text) > 2:
        split = text.split('<entity>')
        if len(split) == 1:
          entQuery += 1.0
        for entry in split:
          entry = normalize(entry, stemmer).strip()

          if entry not in phrases:
            phrases[entry] = {}

          for entity in entList:
            if entity not in phrases[entry]:
              phrases[entry][entity] = 0.0
            phrases[entry][entity] += 1.0
          #phrases[entry]+=1.0
    except:
      #skip lines that fail to parse or that lack the expected fields
      pass
      #print line
  print entQuery

  words = {}
  for entry in sorted(phrases.items(), reverse=True, key=lambda x: len(x[1])):
    split = entry[0].split()
    wlen = len(split)
    if wlen not in words:
      words[wlen] = 0.0
    words[wlen] += 1.0
    print entry
  for entry, count in words.items():
    print entry, count
Example #5
def generatePhrases(labelFile):
  stemmer = stem.porter.PorterStemmer()
  for line in open(labelFile, 'r'):
    pList = []
    spotDict = ast.literal_eval(line)
    query = spotDict['text']
    text = spotDict['text']
    for entry in spotDict['spots']:
      mention = entry['mention']
      text = text.replace(mention, '<entity>').strip()

    if len(text) > 2:
      split = text.split('<entity>')
      for entry in split:
        entry = normalize(entry, stemmer).strip()
        if len(entry) > 1:
          pList.append(entry)
      yield query, pList
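#generatePhrases is a generator; a typical (hypothetical) use with a tagged query
#file would be:
#  for query, phrases in generatePhrases('taggedQueries.txt'):
#    print query, phrases
#Each yielded pair is the raw query text plus the non-entity phrases that remain
#once every detected mention has been replaced by the <entity> placeholder.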
Example #6
def filterQueries(queryCountFile, queryFile, trainFile, sessionFile):
  queryCount = loadFileInDict(queryCountFile)

  #print len(queryCount)
  toPrint = set()
  toFilter = loadFileInList(queryFile)
  training = loadFileInList(trainFile)
  session = loadFileInList(sessionFile)
  stemmer = stem.porter.PorterStemmer()
  for entry in toFilter:

    if entry in queryCount and (queryCount[entry] > 15 or
                                entry in training or
                                entry in session):
      entry1 = normalize(entry, stemmer)
      toPrint.add(entry1)
      #print entry, '\t', queryCount[entry]
  for entry in toPrint:
    print entry
def main(argv):

  #load the co-occurrence terms
  coOccurTermList = loadDictFromFile(argv[2], '\t', ':')
  #for each query find the entities
  #score the cats

  ipaddress = 'localhost'
  #dexter object
  tagURL = 'http://' + ipaddress + ':8080/rest/annotate'
  catURL = 'http://' + ipaddress + ':8080/rest/graph/get-entity-categories'
  dexter = Dexter(tagURL, catURL)
  catManage = CategoryManager(argv[3], argv[4])
  catCoMan = CoOcManager(argv[5], CoOccurrence(), ' ')

  ranker = Ranker()
  entExp = CatThesExpansion(dexter, catManage, ranker, catCoMan)
  stemmer = stem.porter.PorterStemmer()
  result = {}
  done = set()
  noEnt = 0
  oFile = open(argv[6], 'w')
  for line in open(argv[1], 'r'):
    split = line.strip().split('\t')
    oquery = split[0].strip()
    query = normalize(oquery, stemmer)

    if query not in done and len(query) > 2:
      result = {'coTerms': {}, 'catTerms': None, 'freq': 0.0}
      result['freq'] = int(split[1])
      #print query, query in coOccurTermList
      if query in coOccurTermList:
        result['coTerms'] = coOccurTermList[query]
        entCatTermDict = entExp.getTopEntityCategoryTerms(oquery, 1, 40)
        result['catTerms'] = entCatTermDict
        if len(result['catTerms']) > 0:
          oFile.write(query + '\t' + json.dumps(result) + '\n')
        else:
          noEnt += 1.0
      done.add(query)

  print 'No of queries with no Ent ', noEnt
  oFile.close()
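#Each line written to argv[6] above holds the normalized query, a tab, and a JSON
#record such as the following (field values illustrative):
#  {"coTerms": {"michell": 0.9}, "catTerms": {"presid": 0.4}, "freq": 120}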
Example #8
def populateDatasetWithBigrams(logFile, bigramSet, queryFile):
  sid = 0

  queryList = buildBigramSet(queryFile)

  stemmer = stem.porter.PorterStemmer()
  for session in getSessionWithQuery(logFile):
    sessionStr = ' '.join(session)
    sessionSet = set(getNGramsAsList(sessionStr, 2))
    inter = sessionSet & bigramSet
    #print len(sessionSet), len(bigramSet), inter

    if len(inter) > 0:
      lastq = None
      for q in session:
        if q in queryList:
          q = normalize(q, stemmer)
          if lastq != q and len(q) > 1:
            print sid, '\t', q
          lastq = q
    sid += 1
Example #9
def findUniqueQueries(fileName, file2, index):

  toCheck = {}
  #for line in open(file2,'r'):
  ##split = line.split('\t');
  ##query = split[0].strip()
  #spotDict = ast.literal_eval(line)
  #query = spotDict['text']
  #toCheck[query] = 1.0
  #
  #print len(toCheck)
  #queryList = {}
  for line in open(fileName, 'r'):
    split = line.strip().split('\t')
    query = split[0].strip()
    #if query in toCheck:
    #	print query
    toCheck[query] = 1.0

  print len(toCheck)
  #if query not in toCheck:
  #rsplit = query.split()
  #if not hasInapWords(rsplit):
  #if query not in queryList:
  #queryList[query] = 1.0
  #else:
  #queryList[query] +=1.0
  ##else:
  #print query
  stemmer = stem.porter.PorterStemmer()

  for line in open(file2, 'r'):
    split = line.split('\t')
    entry = split[index].strip()
    norm = normalize(entry, stemmer)
    if norm in toCheck and len(norm) > 3:
      print line,
def main(argv):
  #open the index
  searcher = SearchIndex(argv[2])
  searcher.initializeAnalyzer()

  ipaddress = 'localhost'
  #dexter object
  tagURL = 'http://' + ipaddress + ':8080/rest/annotate'
  catURL = 'http://' + ipaddress + ':8080/rest/graph/get-entity-categories'
  dexter = Dexter(tagURL, catURL, argv[7])

  #category vector
  catVect = loadCategoryVector(argv[3])
  catManage1 = CategoryManager(catVect, argv[4], Category)
  catManage2 = CategoryManager(catVect, argv[5], CategorySubcluster)

  #load the Category co-occurrence bit
  catCoMan = CoOcManager(argv[6], CoOccurrence(), ' ')

  #ranker
  ranker = Ranker()
  porter = stem.porter.PorterStemmer()  #needed below to normalize the clicked-title text

  #task extraction
  htcTask = TaskExpansion('Indexes/htcIndex', ranker, 3000)
  qccTask = TaskExpansion('Indexes/qccIndex', ranker, 3000)
  #taskK = argv[5][argv[5].rfind('/')+1:]

  #totalVocab = loadFileInList(argv[6]);

  #expansion
  entExp1 = CatThesExpansion(dexter, catManage1, ranker, catCoMan)
  entExp2 = CatThesExpansion(dexter, catManage2, ranker, catCoMan)
  #term expansion
  coOccExp = CoOccurExpansion(catCoMan, None, ranker)

  rel, noRel = loadRelJudgements(argv[8])

  outFolder = argv[9]

  #randomWalk
  #randWalk = RandomWalk(argv[3],argv[4],ranker)
  #randWalk = RandomWalk(catManage,catCoMan,entTermVect, catTermVect,ranker)

  #result String
  #query key terms
  #queryList = loadQueryList(argv[4]);

  #plotMap = {'baseline':{},'ent':{}, 'entSub':{}, 'qccTask':{}, 'htcTask':{},'co':{}};
  #plotNDCG = {'baseline':{},'ent':{}, 'entSub':{}, 'qccTask':{}, 'htcTask':{},'co':{}};

  #viewedFileFolder =  argv[5]
  #i=0
  ##qMap = [];
  ##qNdcg = [];
  #meth = 'baseline'
  #oFile  = open(outFolder+'/baseline.RL1','w');
  #covered = {};
  #porter = stem.porter.PorterStemmer();
  #for session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(argv[1]):
  #i+=1
  #query = session[0].strip();
  #if i in rel and query not in covered:
  #covered[query] = 1.0;
  #docList = searcher.getTopDocuments(query,1000,'content','id');
  #k = 1
  #for dtuple  in docList:
  #oFile.write(str(i)+' Q0 '+dtuple[0]+' '+str(k)+' '+str(round(dtuple[1],2))+' baseline\n');
  #k +=1
  #'''qmap = findAvgPrec(docList,rel[i],noRel[i]);
  #dcg10, idcg10 = findDCG(docList[:10],rel[i]);
  ##print dcg10, idcg10, rel[i].values();
  #ndcg10 = 0.0;
  #if idcg10 > 0:
  #ndcg10 = dcg10/idcg10;
  #qMap.append(qmap);
  #qNdcg.append(ndcg10);
  #oFile.write('ndcg10 '+str(i)+' '+str(ndcg10)+'\n');
  #oFile.write('map '+str(i)+' '+str(qmap)+'\n');
  #'''
  #else:
  #print 'No rel ', i, session[0];
  #oFile.close();
  #'''
  #fmap = sum(qMap)/len(qMap);
  #fnd = sum(qNdcg)/len(qNdcg);
  #oFile.write('all map ' +str(fmap)+'\n');
  #oFile.write('all ndcg10 '+str(fnd)+'\n');
  #for val in range(0,55,5):
  #plotMap[meth][val] = fmap;
  #plotNDCG[meth][val] = fnd;
  #oFile.close();
  #'''
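  #Every expansion variant below writes its rankings in the standard TREC run
  #format, one line per retrieved document:
  #  <query id> Q0 <doc id> <rank> <score> <run tag>
  #A small helper equivalent to the write calls in each loop (the helper name is
  #illustrative and is not used by the original code):
  def formatRunLine(qid, docId, rank, score, tag='baseline'):
    return (str(qid) + ' Q0 ' + docId + ' ' + str(rank) + ' ' +
            str(round(score, 2)) + ' ' + tag + '\n')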

  i = 0
  #qMap = {};
  #qNdcg = {};
  oFile = {}
  meth = 'co'
  covered = {}
  for session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(
      argv[1]):
    i += 1
    query = session[0].strip()

    if i in rel and query not in covered:
      covered[query] = 1.0
      coExpTerms = coOccExp.expandTextWithStep(query, 50, 55, 5)
      for noTerms, terms in coExpTerms.items():
        #if noTerms not in qMap:
        #	qMap[noTerms] = [];	
        #	qNdcg[noTerms] = [];
        if noTerms not in oFile:
          oFile[noTerms] = open(
              outFolder + '/' + meth + '_' + str(noTerms) + '.RL1', 'w')
        docList = searcher.getTopDocumentsWithExpansion(query, terms, 1000,
                                                        'content', 'id')
        k = 1
        for dtuple in docList:
          oFile[noTerms].write(str(i) + ' Q0 ' + dtuple[0] + ' ' + str(k) + ' '
                               + str(round(dtuple[1], 2)) + ' baseline\n')
          k += 1
        '''
        qmap = findAvgPrec(docList,rel[i],noRel[i]);
        dcg10, idcg10 = findDCG(docList[:10],rel[i]);
        ndcg10 = 0.0;
        if idcg10 > 0:
                ndcg10 = dcg10/idcg10;
        qMap[noTerms].append(qmap);
        qNdcg[noTerms].append(ndcg10);
        oFile[noTerms].write('ndcg10 '+str(i)+' '+str(ndcg10)+' '+str(dcg10)+' '+str(idcg10)+'\n');
        oFile[noTerms].write('map '+str(i)+' '+str(qmap)+'\n');
        '''
  '''
        for entry, vlist in qMap.items():
                i = len(vlist);
                fmap = sum(vlist)/i;
                fnd = sum(qNdcg[entry])/i;
                print sum(vlist), len(vlist);
                oFile[entry].write('all map ' +str(fmap)+'\n');
                oFile[entry].write('all ndcg10 '+str(fnd)+'\n');
                plotMap[meth][entry] = fmap;
                plotNDCG[meth][entry] = fnd;
                oFile[entry].close();
        '''
  for entry in oFile.keys():
    oFile[entry].close()

  i = 0
  #qMap = {};
  #qNdcg = {};
  oFile = {}
  meth = 'ent'
  covered = {}
  for session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(
      argv[1]):
    i += 1
    query = session[0].strip()
    cText = normalize(' '.join(cTitle[0]), porter)
    if i in rel and query not in covered:
      covered[query] = 1.0
      entStatus1, entExpTerms1 = entExp1.expandTextWithStep(query, cText, 1, 50,
                                                            55, 5)
      for noTerms, terms in entExpTerms1.items():
        #if noTerms not in qMap:
        #	qMap[noTerms] = [];	
        #	qNdcg[noTerms] = [];
        if noTerms not in oFile:
          oFile[noTerms] = open(
              outFolder + '/' + meth + '_' + str(noTerms) + '.RL1', 'w')
        docList = searcher.getTopDocumentsWithExpansion(session[0], terms, 1000,
                                                        'content', 'id')
        k = 1
        for dtuple in docList:
          oFile[noTerms].write(str(i) + ' Q0 ' + dtuple[0] + ' ' + str(k) + ' '
                               + str(round(dtuple[1], 2)) + ' baseline\n')
          k += 1
        '''
        qmap = findAvgPrec(docList,rel[i],noRel[i]);
        dcg10, idcg10 = findDCG(docList[:10],rel[i]);
        ndcg10 = 0.0;
        if idcg10 > 0:
                ndcg10 = dcg10/idcg10;

        qMap[noTerms].append(qmap);
        qNdcg[noTerms].append(ndcg10);
        oFile[noTerms].write('ndcg10 '+str(i)+' '+str(ndcg10)+' '+str(dcg10)+' '+str(idcg10)+'\n');
        oFile[noTerms].write('map '+str(i)+' '+str(qmap)+'\n');

        for entry, vlist in qMap.items():
                i = len(vlist);
                fmap = sum(qMap[entry])/i;
                fnd = sum(qNdcg[entry])/i;
                oFile[entry].write('all map ' +str(fmap)+'\n');
                oFile[entry].write('all ndcg10 '+str(fnd)+'\n');
                plotMap[meth][entry] = fmap;
                plotNDCG[meth][entry] = fnd;
                oFile[entry].close();
        '''
  for entry in oFile.keys():
    oFile[entry].close()

  i = 0
  #qMap = {};
  #qNdcg = {};
  oFile = {}
  meth = 'entSub'
  covered = {}
  for session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(
      argv[1]):
    i += 1
    query = session[0].strip()
    cText = normalize(' '.join(cTitle[0]), porter)
    if i in rel and query not in covered:
      covered[query] = 1.0
      entStatus2, entExpTerms2 = entExp2.expandTextWithStepAndSubcluster(
          query, cText, 1, 50, 55, 5)
      for noTerms, terms in entExpTerms2.items():
        #if noTerms not in qMap:
        #qMap[noTerms] = [];	
        #qNdcg[noTerms] = [];
        if noTerms not in oFile:
          oFile[noTerms] = open(
              outFolder + '/' + meth + '_' + str(noTerms) + '.RL1', 'w')
        docList = searcher.getTopDocumentsWithExpansion(session[0], terms, 1000,
                                                        'content', 'id')
        k = 1
        for dtuple in docList:
          oFile[noTerms].write(str(i) + ' Q0 ' + dtuple[0] + ' ' + str(k) + ' '
                               + str(round(dtuple[1], 2)) + ' baseline\n')
          k += 1
        '''
        qmap = findAvgPrec(docList,rel[i],noRel[i]);
        dcg10, idcg10 = findDCG(docList[:10],rel[i]);
        ndcg10 = 0.0;
        if idcg10 > 0:
                ndcg10 = dcg10/idcg10;

        qMap[noTerms].append(qmap);
        qNdcg[noTerms].append(ndcg10);
        oFile[noTerms].write('ndcg10 '+str(i)+' '+str(ndcg10)+' '+str(dcg10)+' '+str(idcg10)+'\n');
        oFile[noTerms].write('map '+str(i)+' '+str(qmap)+'\n');

        for entry, vlist in qMap.items():
                i = len(vlist);
                fmap = sum(qMap[entry])/i;
                fnd = sum(qNdcg[entry])/i;
                oFile[entry].write('all map ' +str(fmap)+'\n');
                oFile[entry].write('all ndcg10 '+str(fnd)+'\n');
                plotMap[meth][entry] = fmap;
                plotNDCG[meth][entry] = fnd;
                oFile[entry].close();
        '''
  for entry in oFile.keys():
    oFile[entry].close()

  i = 0
  #qMap = {};
  #qNdcg = {};
  oFile = {}
  meth = 'qccTask'
  covered = {}
  for session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(
      argv[1]):
    i += 1
    query = session[0].strip()

    if i in rel and query not in covered:
      covered[query] = 1.0
      qccTaskTerms = qccTask.expandTextWithStep(query, 50, 55, 5)
      for noTerms, terms in qccTaskTerms.items():
        #if noTerms not in qMap:
        #qMap[noTerms] = [];	
        #qNdcg[noTerms] = [];
        if noTerms not in oFile:
          oFile[noTerms] = open(
              outFolder + '/' + meth + '_' + str(noTerms) + '.RL1', 'w')
        docList = searcher.getTopDocumentsWithExpansion(session[0], terms, 1000,
                                                        'content', 'id')
        k = 1
        for dtuple in docList:
          oFile[noTerms].write(str(i) + ' Q0 ' + dtuple[0] + ' ' + str(k) + ' '
                               + str(round(dtuple[1], 2)) + ' baseline\n')
          k += 1

  #qmap = findAvgPrec(docList,rel[i],noRel[i]);
  #dcg10, idcg10 = findDCG(docList[:10],rel[i]);
  #ndcg10 = 0.0;
  #if idcg10 > 0:
  #ndcg10 = dcg10/idcg10;
  #
  #qMap[noTerms].append(qmap);
  #qNdcg[noTerms].append(ndcg10);
  #oFile[noTerms].write('ndcg10 '+str(i)+' '+str(ndcg10)+' '+str(dcg10)+' '+str(idcg10)+'\n');
  #oFile[noTerms].write('map '+str(i)+' '+str(qmap)+'\n');
  #
  #for entry, vlist in qMap.items():
  #i = len(vlist);
  #fmap = sum(qMap[entry])/i;
  #fnd = sum(qNdcg[entry])/i;
  #oFile[entry].write('all map ' +str(fmap)+'\n');
  #oFile[entry].write('all ndcg10 '+str(fnd)+'\n');
  #plotMap[meth][entry] = fmap;
  #plotNDCG[meth][entry] = fnd;
  #oFile[entry].close();
  #
  for entry in oFile.keys():
    oFile[entry].close()

  i = 0
  #qMap = {};
  #qNdcg = {};
  oFile = {}
  meth = 'htcTask'
  covered = {}
  for session, viewDocs, clickDocs, cTitle, cSummary in getSessionWithXML(
      argv[1]):
    i += 1
    query = session[0].strip()

    if i in rel and query not in covered:
      covered[query] = 1.0
      htcTaskTerms = htcTask.expandTextWithStep(query, 50, 55, 5)
      for noTerms, terms in htcTaskTerms.items():
        #if noTerms not in qMap:
        #qMap[noTerms] = [];	
        #qNdcg[noTerms] = [];
        if noTerms not in oFile:
          oFile[noTerms] = open(
              outFolder + '/' + meth + '_' + str(noTerms) + '.RL1', 'w')
        docList = searcher.getTopDocumentsWithExpansion(session[0], terms, 1000,
                                                        'content', 'id')
        k = 1
        for dtuple in docList:
          oFile[noTerms].write(str(i) + ' Q0 ' + dtuple[0] + ' ' + str(k) + ' '
                               + str(round(dtuple[1], 2)) + ' baseline\n')
          k += 1
        #qmap = findAvgPrec(docList,rel[i],noRel[i]);
        #dcg10, idcg10 = findDCG(docList[:10],rel[i]);
        #ndcg10 = 0.0;
        #if idcg10 > 0:
        #ndcg10 = dcg10/idcg10;
        #qMap[noTerms].append(qmap);
        #qNdcg[noTerms].append(ndcg10);
        #oFile[noTerms].write('ndcg10 '+str(i)+' '+str(ndcg10)+' '+str(dcg10)+' '+str(idcg10)+'\n');
        #oFile[noTerms].write('map '+str(i)+' '+str(qmap)+'\n');
        #
        #for entry, vlist in qMap.items():
        #i = len(vlist);
        #fmap = sum(qMap[entry])/i;
        #fnd = sum(qNdcg[entry])/i;
        #oFile[entry].write('all map ' +str(fmap)+'\n');
        #oFile[entry].write('all ndcg10 '+str(fnd)+'\n');
        #plotMap[meth][entry] = fmap;
        #plotNDCG[meth][entry] = fnd;
        #oFile[entry].close();
  for entry in oFile.keys():
    oFile[entry].close()

  #plotMultipleSys(plotMap,'No of Terms', 'MAP',outFolder+'/map.png','Retrieval MAP Plot');
  #plotMultipleSys(plotNDCG,'No of Terms', 'NDCG@10',outFolder+'/ndcg10.png','Retrieval NDCG Plot');

  searcher.close()