Esempio n. 1
0
  def setSearcher(self, wikiIndexDir, queryIndexDir):
    """Open the wiki and query indexes and attach their search helpers.

    For each of the two directories this stores on the instance the index
    handle, its searcher, a collector and a query parser (w* attributes for
    the wiki index, q* attributes for the query index).

    Args:
      wikiIndexDir: path of the wiki index; searched on the 'content' field.
      queryIndexDir: path of the query index; searched on the 'session' field.
    """
    # The index name handed to loadIndex is whatever follows the last '/' in
    # the directory path (the whole path when it contains no '/').
    wikiName = wikiIndexDir.rsplit('/', 1)[-1]
    self.wIndex, self.wsearcher = loadIndex(wikiIndexDir, wikiName)
    self.wtlc = loadCollector(self.wsearcher, 2000, 20)

    queryName = queryIndexDir.rsplit('/', 1)[-1]
    self.qIndex, self.qsearcher = loadIndex(queryIndexDir, queryName)
    self.qtlc = loadCollector(self.qsearcher, 2000, 20)

    # One parser per index, each bound to the field that index is queried on.
    self.qqp = loadQueryParser(self.qIndex, 'session')
    self.wqp = loadQueryParser(self.wIndex, 'content')
def SearchYahooQuestionsWithWhoosh(input_file, index_folder, index_name,
                                   questions_limit, output_file):
  """Search the questions index for every query in a file and dump the hits.

  Each line of input_file is stripped and parsed as a query against the
  'question_tokens' field. For every returned document, its 'question_text'
  value is split on tabs and one "<query>\t<question>\n" line is written to
  output_file per piece.

  Args:
    input_file: path of a text file holding one query per line.
    index_folder: index location, passed to loadIndex.
    index_name: index name, passed to loadIndex.
    questions_limit: passed to loadCollector; presumably the maximum number
      of hits kept per query -- confirm against loadCollector.
    output_file: path of the file to write; it is truncated if it exists.
  """
  # Open the index.
  questions_index, questions_searcher = loadIndex(index_folder, index_name)
  # Load the collector.
  # NOTE(review): the trailing 20 is presumably a time limit -- confirm
  # against loadCollector.
  questions_collector = loadCollector(questions_searcher, questions_limit, 20)
  # Search on question field for now.
  query_parser = loadQueryParser(questions_index, 'question_tokens')

  # Both files are managed by `with` so they are closed even when a query
  # fails part-way through. The output is opened first, as before.
  with open(output_file, 'w') as out:
    with open(input_file, 'r') as queries:
      for line in queries:
        query = line.strip()
        query_object = query_parser.parse(unicode(query))
        try:
          questions_searcher.search_with_collector(query_object,
                                                   questions_collector)
        except TimeLimit:
          # Not fatal: results() is still read below, so whatever the
          # collector holds at this point is written out.
          print 'ERROR: Very Long query as input.', query

        for document in questions_collector.results():
          # 'question_text' packs several questions separated by tabs.
          for question in document['question_text'].split('\t'):
            out.write(query + '\t' + question + '\n')
Esempio n. 3
0
 def __init__(self, indexName, rnker, noTasks, wordList):
   """Store the ranker and vocabulary and open the task index.

   Args:
     indexName: path of the index; the part after the last '/' is used as
       the index name.
     rnker: ranker object, kept as self.ranker.
     noTasks: passed to loadCollector as its second argument.
     wordList: vocabulary, kept as self.vocab.
   """
   # Plain values handed in by the caller.
   self.ranker = rnker
   self.vocab = wordList

   # Load the index together with its collector and a parser for the
   # 'task' field.
   shortName = indexName.rsplit('/', 1)[-1]
   self.index, self.searcher = loadIndex(indexName, shortName)
   self.tlc = loadCollector(self.searcher, noTasks, 20)
   self.qp = loadQueryParser(self.index, 'task')

   self.porter = stem.porter.PorterStemmer()
Esempio n. 4
0
  def __init__(self, fileName=None):
    """Start with an empty vector and, if a path is given, open its index.

    Args:
      fileName: optional index path. When it is empty or None only
        self.vector is set; otherwise the index, its searcher, a collector
        and a parser for the 'term' field are stored on the instance as well.
    """
    self.vector = {}
    # An earlier variant that called self.loadVector(fileName) when fileName
    # was not a directory is disabled; only the index-backed path remains.
    if not fileName:
      return

    # The index name is the part of the path after the last '/'.
    indexName = fileName.rsplit('/', 1)[-1]
    self.vIndex, self.vsearcher = loadIndex(fileName, indexName)
    self.vtlc = loadCollector(self.vsearcher, 1, 20)
    self.qqp = loadQueryParser(self.vIndex, 'term')
def getEntitiesWithEIndex(args):

  #oFile = open(args[3],'w')
  index, searcher = loadIndex(args[2], args[3])
  tlc = loadCollector(searcher, 50, 20)
  qp = loadQueryParser(index)
  querySet = set()

  fileName = args[1]
  for query in getQuery(fileName, 1):
    if query not in querySet:
      print query, findTextEntities(query, searcher, tlc, qp)
      querySet.add(query)
Esempio n. 6
0
def findMarkovStats(argv):

  i = 0

  wikiIndexDir = argv[2]
  queryIndexDir = argv[3]

  iFile = argv[1]

  wIndex, wsearcher = loadIndex(wikiIndexDir, wikiIndexDir)
  qIndex, qsearcher = loadIndex(queryIndexDir, queryIndexDir)

  wtlc = loadCollector(wsearcher, 2000, 20)
  qtlc = loadCollector(qsearcher, 2000, 20)

  qqp = loadQueryParser(qIndex, 'session')
  wqp = loadQueryParser(wIndex, 'content')

  prec = {}
  recall = {}

  count = 0.0
  for session in getSessionWithNL(iFile):
    #get the query
    query = session[0].lower()
    query = re.sub(SYMB, ' ', query)
    query = re.sub('\d+', ' ', query)
    query = re.sub('\s+', ' ', query).strip()

    aTerms, bTerms = addedAndRemovedTerms(query, session)

    if aTerms:
      count += 1.0
      totalNetwork = {}
      #stemNetwork = {}
      #queryNetwork = {}
      #wikiNetwork = {}
      terms = updateNetwork(query, totalNetwork, wqp, wsearcher, wtlc,
                            'content', 'wiki')
      terms2 = updateNetwork(query, totalNetwork, qqp, qsearcher, qtlc,
                             'session', 'query')
      print len(terms), len(terms2)
      #updateStemNetwork(queryNetwork,stemNetwork, porter)	
      #updateStemNetwork(wikiNetwork,stemNetwork, porter)
      updateStemNetwork(totalNetwork)
      #normalizeNetworks(queryNetwork)			
      #normalizeNetworks(stemNetwork)			
      #normalizeNetworks(wikiNetwork)

      #calculate the mixtures at two stages
      stage1 = {}
      stage2 = {}
      combineNetwork(1.0, stage1, totalNetwork, 'stem')
      combineNetwork(0.5, stage2, totalNetwork, 'query')
      combineNetwork(0.5, stage2, totalNetwork, 'wiki')

      #convert into matrix for multiplication
      totalDim = sorted(list(set(stage1.keys()) | set(stage2.keys())))

      dim = len(totalDim)
      if dim > 0:
        stage1Matrix = toMatrix(totalDim, stage1)
        print 'STAGE1', stage1Matrix[0], stage1Matrix.shape
        stage2Matrix = toMatrix(totalDim, stage2)
        print 'STAGE2', stage2Matrix[0], stage2Matrix.shape

        backSmooth = 1.0 / len(totalDim)
        stage3Matrix = numpy.zeros((dim, dim))
        stage3Matrix.fill(backSmooth)
        print 'STAGE3', stage3Matrix[0], stage3Matrix.shape

        alpha = 0.80
        #matrix = ['stage2','stage2','stage2','stage2','stage2','stage2','stage2','stage2','stage3']
        matrix = ['stage1', 'stage2', 'stage2', 'stage2', 'stage3']
        totalSum = numpy.zeros((dim, dim))
        cK = numpy.ones((dim, dim))

        #start walk!
        for k in range(len(matrix)):
          print k, matrix[k]
          if matrix[k] == 'stage1':
            cK = numpy.dot(stage1Matrix, cK)
          elif matrix[k] == 'stage2':
            cK = numpy.dot(stage2Matrix, cK)
          else:
            cK = numpy.dot(cK, stage3Matrix)
          print 'CK', cK[0]

          totalSum = totalSum + (math.pow(alpha, k) * cK)
        totalSum = totalSum * (1 - alpha)

        #rank Terms
        qList = []
        terms = query.split()  #getQueryTerms(query)
        for term in terms:
          if term in totalDim:
            qList.append(totalDim.index(term))
          else:
            print 'ERROR dint find ', query, '\t', term, len(totalDim)

        termScore = {}
        for i in range(len(totalDim)):
          termScore[totalDim[i]] = 0.0
          for j in qList:
            if totalSum[i][j] > 0.0:
              termScore[totalDim[i]] += math.log(totalSum[i][j])

        #find the precision for different term sets
        sortTerms = sorted(termScore.iteritems(),
                           reverse=True,
                           key=lambda x: x[1])
        for i in [1, 3, 5, 10, 20, 30, 40, 50, 60, 100, '10000']:
          try:
            cTerms = set([x[0] for x in sortTerms[:i]])
            print 'CTERMS ', sortTerms[0:10], len(cTerms), 'ATERMS', aTerms
            p = len(aTerms & cTerms) / (len(aTerms) * 1.0)
            r = len(aTerms & cTerms) / (len(cTerms) * 1.0)
            prec[i] = prec.setdefault(i, 0.0) + p
            recall[i] = recall.setdefault(i, 0.0) + r
            print 'Prec', i, '\t', query, '\t', p
          except Exception as err:
            cTerms = set([x[0] for x in sortTerms])
            p = len(aTerms & cTerms) / (len(aTerms) * 1.0)
            r = len(aTerms & cTerms) / (len(cTerms) * 1.0)
            prec[i] = prec.setdefault(i, 0.0) + p
            recall[i] = recall.setdefault(i, 0.0) + r
            print 'Prec', i, '\t', query, '\t', p

      else:
        for i in [1, 3, 5, 10, 20, 30, 40, 50, 60, 100, '10000']:
          print 'Prec', i, '\t', query, '\t', 0.0

    #average the prec & recall
    #print prec and recall
  print 'Printing Precison'
  for entry, value in prec.iteritems():
    print entry, value / count

  print 'Printing Precison'
  for entry, value in recall.iteritems():
    print entry, value / count

  wIndex.close()
  qIndex.close()