def findSessionCountsOfNonEnt(netDict, queryFile, outFile):
  """Count co-occurrences of non-entity terms that appear in the same session.

  For every session yielded by getSessionWithQuery(queryFile), the non-entity
  terms of each query found in netDict are pooled (first-seen order, no
  duplicates). The pooled text is split with getNGramsAsList(..., 1) and every
  ordered pair (earlier term, later term) whose members are both outside
  stopSet and longer than two characters is recorded with weight 1.0.
  Finally the totals are computed and the table is written via
  coOccur.writeTermCo(outFile).

  Args:
    netDict: mapping from an ASCII query string to an object exposing
      getNonEntityTerms().
    queryFile: passed unchanged to getSessionWithQuery.
    outFile: passed unchanged to CoOccurrence.writeTermCo.
  """
  coOccur = CoOccurrence()

  for session in getSessionWithQuery(queryFile):
    # Pool the distinct non-entity terms of the whole session. A set is used
    # for the "already collected" test: the previous substring test against
    # the accumulated string dropped any term that happened to be contained
    # in an earlier one (e.g. "art" after "party").
    sessionTerms = []
    seenTerms = set()
    for query in session:
      # Python 2 byte strings: drop every non-ASCII character so the query
      # matches the keys of netDict.
      query = (query.decode('utf-8')).encode('ascii', 'ignore')
      if query in netDict:
        for entry in netDict[query].getNonEntityTerms():
          # Empty entries were never collected before either.
          if entry and entry not in seenTerms:
            seenTerms.add(entry)
            sessionTerms.append(entry)

    qTerms = ' '.join(sessionTerms).strip()
    if len(qTerms) > 2:
      ngrams = getNGramsAsList(qTerms, 1)
      # Keep only informative terms; filtering first yields exactly the same
      # (i < j) pairs, in the same order, as the nested index loops did.
      kept = [term for term in ngrams
              if term not in stopSet and len(term) > 2]
      for pos, first in enumerate(kept):
        for second in kept[pos + 1:]:
          coOccur.updateStats(first, second, 1.0)

  coOccur.setTermTotal()
  coOccur.writeTermCo(outFile)
def main(argv):
  #for each query
  #get bi-grams, unigrams and update frequency

  coOccur = CoOccurrence()
  stemmer = stem.porter.PorterStemmer()
  for line in open(argv[1], 'r'):
    split = line.strip().split('\t')

    query = normalize(split[0].strip(), stemmer)
    freq = int(split[1].strip())
    #generate ngrams
    ngrams = getNGramsAsList(query, 1)
    #if it has more than one term
    lngrams = len(ngrams)
    if lngrams > 1:

      for i in range(lngrams - 1):
        if ngrams[i] not in stopSet and len(ngrams[i]) > 2:
          for j in range(i + 1, lngrams):
            if ngrams[j] not in stopSet and len(ngrams[j]) > 2:
              coOccur.updateStats(ngrams[i], ngrams[j], freq)
  coOccur.setTermTotal()
  #for each query find the terms highly co-occured wth
  for line in open(argv[2], 'r'):
    split = line.split('\t')
    query = normalize(split[1].lower().strip(), stemmer)
    nGrams = getNGramsAsList(query, 1)
    toScore = set()
    result = {}

    for entry in nGrams:
      elist = coOccur.getNeighbours(entry)
      if elist:
        toScore |= set(elist)

    for term1 in toScore:
      if term1 not in query:
        result[term1] = 0.0
        for term2 in nGrams:
          pmi = coOccur.getPMI(term1, term2, 50)
          result[term1] += pmi
        result[term1] /= len(nGrams)

    for entry in result.keys():
      if result[entry] == 0:
        del result[entry]

    sort = sorted(result.items(), reverse=True, key=lambda x: x[1])
    print query, '\t', '\t'.join('{0}:{1}'.format(x[0], round(x[1], 3))
                                 for x in sort[:50])