Example #1
from ast import literal_eval

def loadTasks(fileName, ttype):
  corpus = [[], []]
  tokenDict = {}
  taskVectorDict = {}
  tid = 0
  with open(fileName, 'r') as iFile:
    for line in iFile:
      split = line.strip().split('\t')
      # The last tab-separated field holds a serialized dict of tasks.
      taskDict = literal_eval(split[-1])
      for entry, entDict in taskDict[ttype]['tasks'].items():
        #indexTaskVectors(entry, None, tid, tokenDict, taskVectorDict)
        ttext = SPACE.join(entry)  # SPACE is a module-level constant
        tid += 1
        # Parallel lists: raw task text and its term-frequency vector.
        corpus[0].append(ttext)
        taskTokenDict = text_to_vector(ttext)
        corpus[1].append(taskTokenDict)
  return corpus, tokenDict, taskVectorDict
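
All of these excerpts call a text_to_vector helper (and Example #3 also calls get_cosine) that the snippets do not show. A minimal sketch of compatible implementations, assuming a plain bag-of-words term-frequency representation; the project's real helpers may differ:

import re
import math
from collections import Counter

WORD = re.compile(r"\w+")

def text_to_vector(text):
  # Term-frequency vector as a Counter keyed by token.
  return Counter(WORD.findall(text))

def get_cosine(vec1, vec2):
  # Cosine similarity between two sparse term-frequency dicts.
  common = set(vec1) & set(vec2)
  numerator = sum(vec1[t] * vec2[t] for t in common)
  denominator = (math.sqrt(sum(v * v for v in vec1.values()))
                 * math.sqrt(sum(v * v for v in vec2.values())))
  return numerator / denominator if denominator else 0.0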
Example #2
import re

from nltk.stem import porter  # assuming NLTK's Porter stemmer

def getUserVector(fileName, uIndex, qIndex):
  userVector = {}
  lastUser = None
  porter1 = porter.PorterStemmer()

  with open(fileName, 'r') as iFile:
    for line in iFile:
      split = line.strip().split('\t')
      uId = split[uIndex]
      query = split[qIndex]

      if not lastUser:
        lastUser = uId
      # Strip symbols (SYMB is a module-level regex), then drop stopwords.
      raw_split = re.sub(SYMB, ' ', query.lower()).split(' ')
      query = filterStopWordsFromList(raw_split)
      if lastUser != uId:
        # New user: emit the previous user's accumulated vector.
        yield lastUser, userVector
        userVector = {}

      # Skip queries that look malformed, inappropriate, or too long.
      if (not (hasManyChars(query, raw_split, 1, 4, 70)
               or hasInapWords(raw_split)
               or hasManyWords(raw_split, 15, 40))
          and hasAlpha(query)):
        qDict = text_to_vector(query)
        for entry, val in qDict.items():
          # Accumulate stemmed term weights for this user.
          entry1 = porter1.stem(entry)
          userVector[entry1] = userVector.setdefault(entry1, 0.0) + val

      lastUser = uId
  yield lastUser, userVector
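
getUserVector yields one (userId, vector) pair per contiguous run of rows for the same user, accumulating stemmed term weights. A small sketch of just the accumulation step, assuming the porter module above is NLTK's:

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
userVector = {}
for term, weight in [('running', 2.0), ('runs', 1.0)]:
  stem = stemmer.stem(term)  # both stem to 'run'
  userVector[stem] = userVector.setdefault(stem, 0.0) + weight
print(userVector)  # {'run': 3.0}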
Example #3
import distance  # PyPI 'distance' package (jaccard, nlevenshtein)

def getPairFeatures(session):

  # The +1 second guards against division by zero for one-shot sessions.
  totalTime = 1.0 + (session[-1][QTIME] - session[0][QTIME]).total_seconds()
  for i in range(len(session) - 1):
    for j in range(i + 1, len(session)):
      e1 = session[i]
      e2 = session[j]
      # Turn each distance into a similarity in [0, 1].
      jaccard = 1.0 - distance.jaccard(e1[QUERY].split(), e2[QUERY].split())
      edit = 1.0 - distance.nlevenshtein(e1[QUERY].split(), e2[QUERY].split())
      timeDiff = (e2[QTIME] - e1[QTIME]).total_seconds() / totalTime
      # Normalized positional distance within the session.
      dist = (j - i) / len(session)
      urlMatch = -1  # sentinel when either entry lacks a clicked URL
      if CLICKU in e1 and CLICKU in e2:
        urlMatch = 1.0 - distance.nlevenshtein(e1[CLICKU], e2[CLICKU])
      cosine = get_cosine(text_to_vector(e1[QUERY]), text_to_vector(e2[QUERY]))
      # Fixed-weight combination of the individual similarity signals.
      edgeScore = (.20 * cosine + .20 * jaccard + .20 * edit
                   + .15 * dist + .15 * timeDiff + .10 * urlMatch)
      yield i, j, edgeScore, cosine, jaccard, edit, dist, timeDiff, urlMatch
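
A hedged driver for getPairFeatures, using the text_to_vector/get_cosine sketch from Example #1. The key names and session layout below are assumptions; the source defines the real QTIME/QUERY/CLICKU constants elsewhere:

from datetime import datetime

QTIME, QUERY, CLICKU = 'qtime', 'query', 'clicku'  # hypothetical keys

session = [
  {QTIME: datetime(2024, 1, 1, 10, 0, 0), QUERY: 'cheap flights',
   CLICKU: 'example.com/flights'},
  {QTIME: datetime(2024, 1, 1, 10, 2, 0), QUERY: 'cheap flight deals',
   CLICKU: 'example.com/deals'},
]
for i, j, edgeScore, *rest in getPairFeatures(session):
  print(i, j, round(edgeScore, 3))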
Example #4

def getTermList(queryList):
  termList = {}

  for entry in queryList:
    count = text_to_vector(entry)
    for w, c in count.items():
      # Aggregate raw term counts across all queries.
      if w not in termList:
        termList[w] = 0.0
      termList[w] += c

  # Return (term, count) pairs plus the vocabulary as a set.
  return list(termList.items()), set(termList.keys())
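
A quick usage sketch for Example #4 (again assuming the text_to_vector sketch above): counts aggregate across the whole query list, and the second return value is the vocabulary set:

items, vocab = getTermList(['cheap flights', 'flight deals', 'cheap hotels'])
print(sorted(items))  # [('cheap', 2.0), ('deals', 1.0), ('flight', 1.0), ...]
print(vocab)          # {'cheap', 'deals', 'flight', 'flights', 'hotels'}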
Example #5

def getTaskTermSet(rSort, rank):
  termSet = {}
  # rank is an integer cutoff, or 'all' to use the full ranking.
  index = rank
  if rank == 'all':
    index = len(rSort)
  for entry in rSort[:index]:
    tDict = text_to_vector(entry[0])
    for tentry, value in tDict.items():
      # Accumulate term counts across the top-ranked tasks.
      termSet[tentry] = termSet.setdefault(tentry, 0.0) + value

  return sorted(termSet.items(), reverse=True, key=lambda x: x[1])
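
Example #5 expects rSort as (taskText, score) pairs already sorted by relevance; rank caps how many top tasks contribute terms. A small sketch:

ranked = [('book cheap flights', 0.9),
          ('find cheap hotels', 0.7),
          ('plan road trip', 0.4)]
# Only the top two tasks contribute, so 'cheap' is counted twice.
print(getTaskTermSet(ranked, 2))
# [('cheap', 2.0), ('book', 1.0), ('flights', 1.0), ('find', 1.0), ('hotels', 1.0)]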
Example #6
def loadTasksFromTxt(fileName):
  corpus = [[], []]
  tokenDict = {}
  taskVectorDict = {}
  tid = 0
  with open(fileName, 'r') as iFile:
    for line in iFile:
      line = line.strip()
      #indexTaskVectors(line, None, tid, tokenDict, taskVectorDict)
      tid += 1
      # Parallel lists: raw task text and its term-frequency vector.
      corpus[0].append(line)
      taskTokenDict = text_to_vector(line)
      corpus[1].append(taskTokenDict)
  return corpus, tokenDict, taskVectorDict
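
A minimal round trip for Example #6 (assuming the text_to_vector sketch from Example #1); tasks.txt is a throwaway file created just for the demo:

with open('tasks.txt', 'w') as out:
  out.write('book cheap flights\nfind hotel deals\n')

corpus, tokenDict, taskVectorDict = loadTasksFromTxt('tasks.txt')
print(corpus[0])     # ['book cheap flights', 'find hotel deals']
print(corpus[1][0])  # Counter({'book': 1, 'cheap': 1, 'flights': 1})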
Example #7
  def getTaskTermSet(self, rSort, text):
    termSet = {}
    for entry in rSort:
      tDict = text_to_vector(entry[0])
      for tentry, value in tDict.items():
        stem = self.porter.stem(tentry)
        # Keep informative terms: non-stopword, longer than two characters,
        # alphabetic, absent from the query text, and in the model vocabulary.
        if (tentry not in stopSet and len(tentry) > 2 and hasAlpha(tentry)
            and tentry not in text and stem not in text
            and tentry in self.vocab):
          termSet[stem] = termSet.setdefault(stem, 0.0) + value
    return termSet
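
Because Example #7 is a method, it needs a host object that supplies porter and vocab, plus the module-level stopSet and hasAlpha helpers. A hypothetical harness (the stand-in class, stopword list, hasAlpha, and vocabulary below are assumptions, together with the text_to_vector sketch from Example #1):

from nltk.stem.porter import PorterStemmer

stopSet = {'the', 'a', 'of', 'and'}  # stand-in stopword list

def hasAlpha(token):  # stand-in for the source helper
  return any(ch.isalpha() for ch in token)

class TaskTermRanker:  # hypothetical host class
  def __init__(self, vocab):
    self.porter = PorterStemmer()
    self.vocab = vocab

  # The Example #7 method, attached verbatim:
  def getTaskTermSet(self, rSort, text):
    termSet = {}
    for entry in rSort:
      tDict = text_to_vector(entry[0])
      for tentry, value in tDict.items():
        stem = self.porter.stem(tentry)
        if (tentry not in stopSet and len(tentry) > 2 and hasAlpha(tentry)
            and tentry not in text and stem not in text
            and tentry in self.vocab):
          termSet[stem] = termSet.setdefault(stem, 0.0) + value
    return termSet

ranker = TaskTermRanker(vocab={'flights', 'deals', 'hotels'})
print(ranker.getTaskTermSet([('book cheap flights', 0.9)], 'book trip'))
# {'flight': 1.0}: 'book' appears in the query text, 'cheap' is out of vocabulary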