def findBigramCoOccurence(sessionFile, outFile):
  #generate all bigrams from session
  # for every bigram get term and co-occurrence

  bigramCo = {}
  for session in getSessionWithQuery(sessionFile):
    for i in range(len(session) - 1):
      b1 = getNGramsAsList(session[i], 2)
      b2 = getNGramsAsList(session[i + 1], 2)
def main(argv):
  #for each query
  #get bi-grams, unigrams and update frequency
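  #argv[1]: tab-separated "query<TAB>frequency" file used to build term co-occurrence counts
  #argv[2]: file whose second column holds the queries to expand; candidate terms are
  #scored by their average PMI with the query terms and the top 50 are printed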

  coOccur = CoOccurrence()
  stemmer = stem.porter.PorterStemmer()
  for line in open(argv[1], 'r'):
    split = line.strip().split('\t')

    query = normalize(split[0].strip(), stemmer)
    freq = int(split[1].strip())
    #generate ngrams
    ngrams = getNGramsAsList(query, 1)
    #if it has more than one term
    lngrams = len(ngrams)
    if lngrams > 1:

      for i in range(lngrams - 1):
        if ngrams[i] not in stopSet and len(ngrams[i]) > 2:
          for j in range(i + 1, lngrams):
            if ngrams[j] not in stopSet and len(ngrams[j]) > 2:
              coOccur.updateStats(ngrams[i], ngrams[j], freq)
  coOccur.setTermTotal()
  #for each query, find the terms it most strongly co-occurs with
  for line in open(argv[2], 'r'):
    split = line.split('\t')
    query = normalize(split[1].lower().strip(), stemmer)
    nGrams = getNGramsAsList(query, 1)
    toScore = set()
    result = {}

    for entry in nGrams:
      elist = coOccur.getNeighbours(entry)
      if elist:
        toScore |= set(elist)

    for term1 in toScore:
      if term1 not in query:
        result[term1] = 0.0
        for term2 in nGrams:
          pmi = coOccur.getPMI(term1, term2, 50)
          result[term1] += pmi
        result[term1] /= len(nGrams)

    #drop terms with a zero score; iterate over a copy since we delete while looping
    for entry in list(result.keys()):
      if result[entry] == 0:
        del result[entry]

    sort = sorted(result.items(), reverse=True, key=lambda x: x[1])
    print query, '\t', '\t'.join('{0}:{1}'.format(x[0], round(x[1], 3))
                                 for x in sort[:50])
def generatePhraseFeatures(featureFile, spotFile, outFile):
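  #copy each query's url/user/session/entity/category/type features onto the
  #phrases generated from spotFile and write the phrase features to outFile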
  #load features for queries
  qfeatMan = FeatureManager()
  qfeatMan.readFeatures(featureFile)

  pid = 0
  pfeatMan = FeatureManager()

  #generate features for phrases
  for query, pList in generatePhrases(spotFile):
    qkey, qfeat = qfeatMan.returnFeature(query)
    #print query, qkey
    if qkey:
      #print query, pList
      for phrase in pList:
        qVect = getDictFromSet(phrase.split())
        ngrams = getNGramsAsList(phrase, 2)
        url = qfeat.returnUrl()
        user = qfeat.returnUsers()
        ent = qfeat.returnEntities()
        cat = qfeat.returnCategories()
        typ = qfeat.returnType()
        sess = qfeat.returnSessions()
        if 'tournament' in phrase:
          print query, phrase
          print sess
          print typ
          print ent
        nFeature = QueryFeature(phrase, ngrams, qVect, url, user, sess, ent,
                                cat, typ)
        pfeatMan.addFeature(phrase, pid, nFeature)
        pid += 1

  pfeatMan.writeFeatures(outFile)
def findQueryCounts(queryFile):
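  #return a dict mapping a stemmed term pair "t1 t2" to the total frequency of
  #the queries in which both terms appear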
  #coOccur = CoOccurrence();
  pairs = {}
  porter = stem.porter.PorterStemmer()
  qTerms = ''
  for line in open(queryFile, 'r'):
    split = line.strip().lower().split('\t')
    query = split[0].strip()
    freq = float(split[1])
    #for each query get nonEntTerms and update co-occurrence stats
    qTerms = ''
    qTerms = ' '.join(getQueryTerms(query))
    if len(qTerms) > 3:
      ngrams = sorted(getNGramsAsList(qTerms.strip(), 1))
      lngrams = len(ngrams)
      if lngrams > 1:
        for i in range(lngrams - 1):
          if ngrams[i] not in stopSet and len(ngrams[i]) > 2:
            for j in range(i + 1, lngrams):
              if ngrams[j] not in stopSet and len(ngrams[j]) > 2:
                stemd1 = porter.stem(ngrams[i])
                stemd2 = porter.stem(ngrams[j])
                key = stemd1 + ' ' + stemd2
                if key not in pairs:
                  pairs[key] = 0.0
                pairs[key] += freq
                #coOccur.updateStats(stemd1, stemd2, freq);
                #coOccur.setTermTotal();
                #coOccur.writeTermCo(outFile);
  return pairs
def findSessionCountsOfNonEnt(netDict, queryFile, outFile):
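  #accumulate co-occurrence counts over the non-entity terms of each session's
  #queries (looked up in netDict) and write the term co-occurrence table to outFile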

  coOccur = CoOccurrence()

  qTerms = ''
  for session in getSessionWithQuery(queryFile):
    #for each query get nonEntTerms and update co-occurrence stats
    qTerms = ''
    for query in session:
      query = (query.decode('utf-8')).encode('ascii', 'ignore')
      if query in netDict:
        for entry in netDict[query].getNonEntityTerms():
          if entry not in qTerms:
            qTerms += ' ' + entry
    qTerms = qTerms.strip()
    if len(qTerms) > 2:
      ngrams = getNGramsAsList(qTerms.strip(), 1)
      lngrams = len(ngrams)
      if lngrams > 1:
        for i in range(lngrams - 1):
          if ngrams[i] not in stopSet and len(ngrams[i]) > 2:
            for j in range(i + 1, lngrams):
              if ngrams[j] not in stopSet and len(ngrams[j]) > 2:
                coOccur.updateStats(ngrams[i], ngrams[j], 1.0)
  coOccur.setTermTotal()
  coOccur.writeTermCo(outFile)
def sampleSessions(sessiontrackFile, biGramFile, freqFile, sessionFileToPrune):
  #keep a session if it contains a session-track query, one of the top bigrams,
  #or passes a ~20% random draw, provided it has more than three queries and
  #an average query frequency above 90

  sessionTrackQueries = {}
  #load sessionTrack queries
  for line in open(sessiontrackFile, 'r'):
    query = normalizeWithoutStem(line.strip().lower())
    sessionTrackQueries[query] = 1.0
    sessionTrackQueries[line.strip().lower()] = 1.0

  biGrams = set()
  for line in open(biGramFile, 'r'):
    split = line.split('\t')
    biGrams.add(split[0])
    if len(biGrams) == 2500:
      break

  freq = {}
  for line in open(freqFile, 'r'):
    split = line.split('\t')
    freq[split[0].strip()] = float(split[1])

  avgFreq = 0.0
  lastSes = None
  session = []
  hasQuery = False
  hasBigram = False
  for line in open(sessionFileToPrune, 'r'):
    split = line.split('\t')
    sessNo = int(split[0])

    #flush the previous session before folding the current line into its stats
    if lastSes is not None and lastSes != sessNo:
      rnum = random.random()
      if len(session) > 0:
        avgFreq /= len(session)
      if (rnum > 0.80 or hasQuery or hasBigram) and len(session) > 3 and avgFreq > 90:
        for entry in session:
          print entry,
      session = []
      avgFreq = 0.0
      hasQuery = False
      hasBigram = False

    query = split[1].strip()
    if query in sessionTrackQueries:
      hasQuery = True

    nGrams = set(getNGramsAsList(query, 2))
    if len(nGrams & biGrams) > 0:
      hasBigram = True

    #tolerate queries missing from the frequency file
    avgFreq += freq.get(query, 0.0)
    session.append(line)
    lastSes = sessNo

  #emit the final session as well
  if len(session) > 3:
    avgFreq /= len(session)
    if (random.random() > 0.80 or hasQuery or hasBigram) and avgFreq > 90:
      for entry in session:
        print entry,
def getNString(string, glen):
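  #return a tab-separated pair of sparse "feature:count" strings: the n-grams of
  #the input (spaces replaced by '_') and its unigram term counts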
  string = string.strip()

  gString = ''
  ngrams = getNGramsAsList(string, glen)
  #print glen, string, bi, ind

  gString = ' '.join('{0}:{1}'.format(x.replace(' ', '_'), y)
                     for x, y in ngrams.items())

  queryVect = getDictFromSet(string.split())
  qVectString = ' '.join('{0}:{1}'.format(x, y) for x, y in queryVect.items())

  return gString + '\t' + qVectString
def findSessionCounts(queryFile, outFile, wordSet):
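  #count how often pairs of terms from wordSet co-occur within the same session;
  #the counts are periodically flushed to outFile to bound memory use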
  coOccur = {}
  #CoOccurrence();

  qTerms = ''
  sess = 0
  qid = 0.0
  qSet = set()
  for session in getSessionWithQuery(queryFile):
    qSet.clear()
    for query in session:
      qid += 1
      terms = getQueryTerms(query)
      if len(terms) > 0:
        qSet |= set(terms)
      if qid % 1000000 == 0:
        print qid
        print len(coOccur)

        #print len(session)	, len(qSet);
        #for each query get nonEntTerms and update co-occurrence stats
    qTerms = ''
    qTerms = ' '.join(qSet)
    if len(qTerms) > 3 and len(qSet) > 1:
      #print qSet;
      ngrams = sorted(getNGramsAsList(qTerms.strip(), 1))
      lngrams = len(ngrams)
      if lngrams > 1:
        for i in range(lngrams - 1):
          if ngrams[i] not in stopSet and len(
              ngrams[i]) > 2 and ngrams[i] in wordSet:
            for j in range(i + 1, lngrams):
              if ngrams[j] not in stopSet and len(
                  ngrams[j]) > 2 and ngrams[j] in wordSet:
                #coOccur.updateStats(ngrams[i],ngrams[j],1.0);
                key = ngrams[i] + ' ' + ngrams[j]
                if key not in coOccur:
                  coOccur[key] = 0.0
                coOccur[key] += 1.0
                if len(coOccur) >= 9000000:
                  #periodically flush the counts to disk to bound memory use
                  writeDictToFile(outFile, coOccur, sess)
                  coOccur = {}
                  sess += 1

  #write whatever counts remain after the last session
  if len(coOccur) > 0:
    writeDictToFile(outFile, coOccur, sess)
def populateDatasetWithBigrams(logFile, bigramSet, queryFile):
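  #print, with a running session id, the normalized queries of every session whose
  #bigrams overlap bigramSet, keeping only queries that appear in queryFile and
  #skipping consecutive duplicates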
  sid = 0

  queryList = buildBigramSet(queryFile)

  stemmer = stem.porter.PorterStemmer()
  for session in getSessionWithQuery(logFile):
    sessionStr = ' '.join(session)
    sessionSet = set(getNGramsAsList(sessionStr, 2))
    inter = sessionSet & bigramSet
    #print len(sessionSet), len(bigramSet), inter

    if len(inter) > 0:
      lastq = None
      for q in session:
        if q in queryList:
          q = normalize(q, stemmer)
          if lastq != q and len(q) > 1:
            print sid, '\t', q
          lastq = q
    sid += 1
def main():
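    #tag each query in the log with Dexter entity annotations, collect per-query (or
    #per-phrase) url, user, session, entity, category and type lists, and write them
    #as tab-separated dictionaries to the output file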
    parser = ap.ArgumentParser(description = 'Generate features for entity tagged queries')
    parser.add_argument('-i', '--iFile', help='Query log file', required=True)
    parser.add_argument('-o', '--oFile', help='Output feature file', required=True)
    parser.add_argument('-t', '--typeFile', help='DBPedia type file', required=True)
    parser.add_argument('-c', '--catFile', help='DBPedia cat file', required=True)
    #argparse's type=bool treats any non-empty string as True, so take the raw
    #string here and parse it explicitly below
    parser.add_argument('-u', '--uid', help='User id present or not (true/false)', required=True)
    parser.add_argument('-w', '--wtype', help='Phrase (phrase) or query (query) features', required=True)

    args = parser.parse_args()

    boolUid = args.uid.lower() in ('true', '1', 'yes')

    #load the category list
    dbCatList = loadCategories(args.catFile)
    #print 'Categories',len(dbCatList)
    #load the type list
    dbTypeList = loadInstancesInList(args.typeFile)
    #print 'Types',len(dbTypeList)

    #query list
    queryList = {}
    #user list
    userList = {}
    #url list
    urlList = {}
    #session list
    sessionList = {}
    #entity List
    entityList = {}
    #category List
    categoryList = {}
    #type list
    typeList = {}

    ipaddress = 'localhost'
    tagURL = 'http://'+ipaddress+':8080/dexter-webapp/api/rest/annotate'

    cqid = 1
    sid = 1
    qid = None
    for session in getSessionTuples(args.iFile,'\t', 1560):
        print 'Session id and length' , sid, len(session)
        for entry in session:
            query = entry[QUERY]
            #tag it with dexter and get all 3 parameters
            spotDict = tagQueryWithDexter(query,tagURL)
            #reset for every query so a previous query's spots are not reused
            updatedSpotDict = None
            if 'spots' in spotDict:
                updatedSpotDict = getCatAndTypeInfo(spotDict,dbCatList, dbTypeList)
            if args.wtype == 'query':
                #given wtype find the following
                if query not in queryList:
                    #print 'Mapping ', query , 'to ', cqid
                    queryList[query] = cqid
                    qid = cqid
                    cqid+=1
                else:
                    qid = queryList[query]
                updateDict(sessionList,sid, qid)

                if boolUid:
                    updateDict(userList, entry[USER], qid)
                if CLICKU in entry:
                    updateDict(urlList, entry[CLICKU],qid)
                if updatedSpotDict:
                    for spot in updatedSpotDict['spots']:
                        updateDict(categoryList,spot['cat'], qid)
                        updateDict(typeList,spot['type'], qid)
                        updateDict(entityList,encodeUTF(spot['wikiname'].lower()),qid)

            if args.wtype == 'phrase' and updatedSpotDict:
                for spot in updatedSpotDict['spots']:
                    splits = query.split(spot['mention'])
                    for split in splits:
                        split = split.strip()
                        #remove stop words
                        split = filterStopWordsFromQuery(split)
                        if len(split) > 1:
                            if split not in queryList:
                                queryList[split] = cqid
                                qid = cqid
                                cqid+=1
                            else:
                                qid = queryList[split]
                            updateDict(sessionList,sid, qid)

                            if boolUid:
                                updateDict(userList, entry[USER], qid)
                            if CLICKU in entry:
                                updateDict(urlList, entry[CLICKU],qid)
                            if updatedSpotDict:
                                updateDict(categoryList,spot['cat'], qid)
                                updateDict(typeList,spot['type'], qid)
                                updateDict(entityList,encodeUTF(spot['wikiname'].lower()),qid)
        sid+=1

    #write the features to the outfile
    outF = open(args.oFile,'w')

    for query, qid in queryList.items():
        outF.write(query)
        #generate ngrams
        queryVect = getDictFromSet(query.split())
        ngramString = getNGramsAsList(query,3)
        #ngrams = 1
        outF.write('\t'+str(ngramString))
        #query vect = 2
        outF.write('\t'+str(queryVect))


        if qid in urlList:
            outF.write('\t'+str(urlList[qid]))
        else:
            outF.write('\t{}')

        if qid in userList:
            outF.write('\t'+str(userList[qid]))
        else:
            outF.write('\t{}')

        if qid in entityList:
            outF.write('\t'+str(entityList[qid]))
        else:
            outF.write('\t{}')

        if qid in categoryList:
            outF.write('\t'+str(categoryList[qid]))
        else:
            outF.write('\t{}')

        if qid in typeList:
            outF.write('\t'+str(typeList[qid]))
        else:
            outF.write('\t{}')

        if qid in sessionList:
            outF.write('\t'+str(sessionList[qid]))
        else:
            outF.write('\t{}')

        outF.write('\n')

    outF.close()
def combineQueryFeatures(queryFile, spotFile, featFile, newFile):
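  #merge n-gram and term-vector features with the user, url, entity, category and
  #type features of every query and write one tab-separated line per query to newFile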
  #load features
  featDict = {}
  i = 1
  urlDict = {}

  for line in open(featFile, 'r'):
    split = line.strip().split('\t')
    featDict[split[0].strip()] = split[1:]

  querySpots = {}
  for line in open(spotFile, 'r'):
    spotDict = ast.literal_eval(line)
    querySpots[spotDict['text']] = spotDict
  outF = open(newFile, 'w')

  #all queries
  for line in open(queryFile, 'r'):
    query = line.strip()
    queryFeat = []

    #getNString(query,3).decode('utf-8')
    #triString = str(triString.encode('ascii','ignore')).strip()
    triString = getNGramsAsList(query, 3)
    if len(triString) > 0:
      queryFeat.append(triString)
    else:
      queryFeat.append({})

    queryVect = getDictFromSet(query.split())
    if len(queryVect) > 0:
      queryFeat.append(queryVect)
    else:
      queryFeat.append({})

    if query in featDict:
      #normalize the users
      userString = getUserString(featDict[query][0])
      if len(userString) > 0:
        queryFeat.append(userString)
      else:
        queryFeat.append({})

      #normalize the urls
      i, urlDict, linkString = getUrlString(featDict[query][1], urlDict, i)
      if len(linkString) > 0:
        queryFeat.append(linkString)
      else:
        queryFeat.append({})
    else:
      print 'Query not found ', query
      #pad the user and url columns so the output layout stays consistent
      queryFeat += [{}, {}]

    if query in querySpots:
      spotDict = querySpots[query]  #ast.literal_eval(line)
      #cat, ent and type info
      result = getCatEntString(spotDict)
      for entry in result:
        if len(entry) > 0:
          queryFeat.append(entry)
        else:
          queryFeat.append({})
    else:
      queryFeat += [{}, {}, {}]
      #print queryFeat
    try:
      outF.write(query)
      for entry in queryFeat:
        outF.write('\t' + str(entry))
      outF.write('\n')
    except Exception:
      print 'ERROR ', queryFeat

  outF.close()