def findSessionCountsOfNonEnt(netDict, queryFile, outFile):
  """Accumulate co-occurrence counts of non-entity terms within each session.

  netDict   -- maps an ascii-normalized query string to an object exposing
               getNonEntityTerms()
  queryFile -- session log consumed by getSessionWithQuery()
  outFile   -- destination handed to CoOccurrence.writeTermCo()
  """
  coOccur = CoOccurrence()

  for session in getSessionWithQuery(queryFile):
    # Collect the distinct non-entity terms seen across the whole session.
    # A list + set pair keeps first-seen order with O(1) membership; the
    # previous substring test ("cat" in "catalog") wrongly dropped any term
    # that was a substring of an earlier one, and the += concatenation was
    # quadratic.
    termList = []
    seen = set()
    for query in session:
      query = (query.decode('utf-8')).encode('ascii', 'ignore')
      if query in netDict:
        for entry in netDict[query].getNonEntityTerms():
          if entry not in seen:
            seen.add(entry)
            termList.append(entry)
    qTerms = ' '.join(termList).strip()
    if len(qTerms) > 2:
      ngrams = getNGramsAsList(qTerms, 1)
      lngrams = len(ngrams)
      if lngrams > 1:
        # update stats for every ordered pair of non-stopword unigrams
        for i in range(lngrams - 1):
          if ngrams[i] not in stopSet and len(ngrams[i]) > 2:
            for j in range(i + 1, lngrams):
              if ngrams[j] not in stopSet and len(ngrams[j]) > 2:
                coOccur.updateStats(ngrams[i], ngrams[j], 1.0)
  coOccur.setTermTotal()
  coOccur.writeTermCo(outFile)
def findBigramCoOccurence(sessionFile, outFile):
  #generate all bigrams from session
  # for every bigram get term and co-occurrence
  # NOTE(review): this function appears truncated in this copy of the file --
  # b1 and b2 are computed but never used, bigramCo is never populated, and
  # outFile is never written. Recover the missing body from the original
  # source before relying on it.

  bigramCo = {}
  for session in getSessionWithQuery(sessionFile):
    for i in range(len(session) - 1):
      # bigrams of the current query and of the query that follows it
      b1 = getNGramsAsList(session[i], 2)
      b2 = getNGramsAsList(session[i + 1], 2)
# Ejemplo n.º 3
# 0
def loadSessions(sessFile):
  sessions = []
  count = 0.0
  clen = 0
  for session in getSessionWithQuery(sessFile):
    if len(session) > 0:
      sessions.append(session)
      count += 1
      clen += len(session)
  print count, clen / count

  return sessions
def findSessionCounts(queryFile, outFile, wordSet):
  coOccur = {}
  #CoOccurrence();

  qTerms = ''
  sess = 0
  qid = 0.0
  qSet = set()
  for session in getSessionWithQuery(queryFile):
    qSet.clear()
    for query in session:
      qid += 1
      terms = getQueryTerms(query)
      if len(terms) > 0:
        qSet |= getQueryTerms(query)
      if qid % 1000000 == 0:
        print qid
        print len(coOccur)

        #print len(session)	, len(qSet);
        #for each query get nonEntTerms and update co-occurrence stats
    qTerms = ''
    qTerms = ' '.join(qSet)
    if len(qTerms) > 3 and len(qSet) > 1:
      #print qSet;
      ngrams = sorted(getNGramsAsList(qTerms.strip(), 1))
      lngrams = len(ngrams)
      if lngrams > 1:
        for i in range(lngrams - 1):
          if ngrams[i] not in stopSet and len(
              ngrams[i]) > 2 and ngrams[i] in wordSet:
            for j in range(i + 1, lngrams):
              if ngrams[j] not in stopSet and len(
                  ngrams[j]) > 2 and ngrams[j] in wordSet:
                #coOccur.updateStats(ngrams[i],ngrams[j],1.0);
                key = ngrams[i] + ' ' + ngrams[j]
                if key not in coOccur:
                  coOccur[key] = 0.0
                coOccur[key] += 1.0
                if len(coOccur) >= 9000000:
                  writeDictToFile(outFile, coOccur, sess)
                  coOccur.clear()
                  coOccur = {}
                  sess += 1
# Ejemplo n.º 5
# 0
def main(argv):
  """Build a random walk over consecutive stemmed queries in each session.

  argv[1] -- session log file; argv[2] -- file listing entries to filter out.
  """
  simpleWalk = SimpleWalk()
  top50 = loadFileInList(argv[2])
  porter = stem.porter.PorterStemmer()
  for rawSession in getSessionWithQuery(argv[1]):
    cleaned = removeWrongEntries(rawSession, top50)
    # link every query to the one that immediately follows it
    for pos in range(len(cleaned) - 1):
      first = stemQuery(cleaned[pos], porter)
      second = stemQuery(cleaned[pos + 1], porter)
      simpleWalk.addEdge(first, second, 1.0)

  # drop weak edges, then run the walk
  simpleWalk.filter(2)

  simpleWalk.walk()
# Ejemplo n.º 6
# 0
def populateDatasetWithBigrams(logFile, bigramSet, queryFile):
  sid = 0

  queryList = buildBigramSet(queryFile)

  stemmer = stem.porter.PorterStemmer()
  for session in getSessionWithQuery(logFile):
    sessionStr = ' '.join(session)
    sessionSet = set(getNGramsAsList(sessionStr, 2))
    inter = sessionSet & bigramSet
    #print len(sessionSet), len(bigramSet), inter

    if len(inter) > 0:
      lastq = None
      for q in session:
        if q in queryList:
          q = normalize(q, stemmer)
          if lastq != q and len(q) > 1:
            print sid, '\t', q
          lastq = q
    sid += 1
# Ejemplo n.º 7
# 0
def populateDataset(logFile, queryList):
  sid = 1
  sessionList = {}
  for session in getSessionWithQuery(logFile):
    prints = False
    #print len(session)
    for entry in session:
      if entry in queryList:
        #prints = True;
        if entry not in sessionList:
          sessionList[entry] = {}
        if sid not in sessionList[entry]:
          sessionList[entry][sid] = 0.0
        sessionList[entry][sid] += 1.0
      #else:

      #	print 'NOT FOUND ',entry
      #print session
    sid += 1

  for entry, sessionCount in sessionList.iteritems():
    print entry, '\t', sessionCount
# Ejemplo n.º 8
# 0
def main(argv):
  ischema = Schema(session=TEXT(stored=True, phrase=False))
  if not os.path.exists(argv[2]):
    os.mkdir(argv[2])
  qindex = create_in(argv[2], schema=ischema, indexname=argv[2])
  writer = qindex.writer()

  i = 0
  for sess in getSessionWithQuery(argv[1], 1500):
    #print sess
    string = ' '.join(sess)
    try:
      writer.add_document(session=unicode(string.decode(
          'unicode_escape').encode('ascii', 'ignore')))
    except Exception as err:
      print sess, 'problem in indexing'
      print err, err.args
    i += 1
    if i % 100000 == 0:
      print i

  writer.commit()
  qindex.close()
# Ejemplo n.º 9
# 0
  featMan.readFeatures(args.featFile)
  # Loads the distance between two queries (i.e. 1-similarity)
  weightMatrix = readWeightMatrix(args.distFile)
  print len(weightMatrix)
  samePairsSet = differentPairsSet = None
  if args.pairLabelFile:
    samePairsSet , differentPairsSet =   loadPairsFromFile(args.pairLabelFile)

  total_metrics_dict = {}
  for threshold in np.arange(args.lowerLimit, args.upperLimit, 0.02):
    sessCount = 0
    lastSes = None
    session = []
    metrics = {}
    qcc = QCCTasks()
    for session in getSessionWithQuery(args.sessionFile):
      #calculate the score
      for i in range(len(session) - 1):
        qid1, qf1 = featMan.returnFeature(session[i])
        if qf1:
          for j in range(i + 1, len(session)):
            qid2, qf2 = featMan.returnFeature(session[j])
            if qf2:
              try:
                if qid1 < qid2:
                  edgeScore = 1.0 - weightMatrix[qid1][qid2]
                else:
                  edgeScore = 1.0 - weightMatrix[qid2][qid1]
                if edgeScore > threshold:
                  qcc.addEdge(qid1, qid2, edgeScore)
              except: