Example #1
  def setSearcher(self, wikiIndexDir, queryIndexDir):
    """Set up the indexes used to search for terms."""
    # The index name is the last path component of each directory.
    self.wIndex, self.wsearcher = loadIndex(
        wikiIndexDir, wikiIndexDir[wikiIndexDir.rfind('/') + 1:])
    self.wtlc = loadCollector(self.wsearcher, 2000, 20)

    self.qIndex, self.qsearcher = loadIndex(
        queryIndexDir, queryIndexDir[queryIndexDir.rfind('/') + 1:])
    self.qtlc = loadCollector(self.qsearcher, 2000, 20)

    # Queries are parsed against the 'session' field of the query index
    # and the 'content' field of the wiki index.
    self.qqp = loadQueryParser(self.qIndex, 'session')
    self.wqp = loadQueryParser(self.wIndex, 'content')
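All of the examples on this page lean on the same three helpers, whose implementations are not shown. A minimal sketch using the standard Whoosh API, assuming the collector arguments are a hit limit and a search time limit in seconds (the default field name in loadQueryParser is a guess), might look like this:

from whoosh import index
from whoosh.collectors import TimeLimitCollector
from whoosh.qparser import QueryParser


def loadIndex(indexDir, indexName):
  # Open an existing on-disk index and pair it with a searcher.
  ix = index.open_dir(indexDir, indexname=indexName)
  return ix, ix.searcher()


def loadCollector(searcher, limit, timeLimit):
  # Wrap a collector so a long search raises whoosh.searching.TimeLimit
  # after timeLimit seconds.
  return TimeLimitCollector(searcher.collector(limit=limit),
                            timelimit=timeLimit)


def loadQueryParser(ix, fieldName='text'):
  # Parse free-text queries against one field of the index schema.
  return QueryParser(fieldName, schema=ix.schema)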
Example #2
def SearchYahooQuestionsWithWhoosh(input_file, index_folder, index_name,
                                   questions_limit, output_file):
  # Open the index.
  questions_index, questions_searcher = loadIndex(index_folder, index_name)
  # Load the collector.
  questions_collector = loadCollector(questions_searcher, questions_limit, 20)
  # Search on question field for now.
  query_parser = loadQueryParser(questions_index, 'question_tokens')

  # Open the file to write the query and top questions_limit questions.
  out = open(output_file, 'w')

  for line in open(input_file, 'r'):
    query = line.strip()
    query_object = query_parser.parse(unicode(query))
    try:
      questions_searcher.search_with_collector(query_object,
                                               questions_collector)
    except TimeLimit:
      print 'ERROR: query exceeded the search time limit:', query

    # A timed-out search still leaves the partial results collected so far.
    results = questions_collector.results()
    for document in results:
      question_list = document['question_text'].split('\t')
      for question in question_list:
        out.write(query + '\t' + question + '\n')
  out.close()
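A possible invocation, with made-up file and index names (the index would come from an indexer like the one tested in Example #6):

SearchYahooQuestionsWithWhoosh('queries.txt', 'yahoo-index-folder',
                               'yahoo-index', 100, 'query_questions.tsv')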
Example #3
  def __init__(self, indexName, rnker, noTasks, wordList):
    self.ranker = rnker
    # Load the index; the index name is the last path component of indexName.
    self.index, self.searcher = loadIndex(indexName,
                                          indexName[indexName.rfind('/') + 1:])
    self.tlc = loadCollector(self.searcher, noTasks, 20)
    self.qp = loadQueryParser(self.index, 'task')
    self.porter = stem.porter.PorterStemmer()  # stem is presumably nltk.stem
    self.vocab = wordList
Example #4
  def __init__(self, fileName=None):
    self.vector = {}
    if fileName:
      self.vIndex, self.vsearcher = loadIndex(
          fileName, fileName[fileName.rfind('/') + 1:])
      self.vtlc = loadCollector(self.vsearcher, 1, 20)
      self.qqp = loadQueryParser(self.vIndex, 'term')
Example #5
def getEntitiesWithEIndex(args):

  # args[2] is the index directory, args[3] the index name.
  index, searcher = loadIndex(args[2], args[3])
  tlc = loadCollector(searcher, 50, 20)
  qp = loadQueryParser(index)  # relies on the parser's default field
  querySet = set()

  fileName = args[1]
  for query in getQuery(fileName, 1):
    if query not in querySet:
      print query, findTextEntities(query, searcher, tlc, qp)
      querySet.add(query)
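findTextEntities is another assumed helper. A plausible sketch, guessing that the entity index stores each entity under an 'entity' field:

from whoosh.searching import TimeLimit


def findTextEntities(query, searcher, tlc, qp):
  # Collect the top entity hits for the query; a timeout keeps whatever
  # was gathered before the limit was hit.
  try:
    searcher.search_with_collector(qp.parse(unicode(query)), tlc)
  except TimeLimit:
    pass
  return [hit['entity'] for hit in tlc.results()]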
Example #6
  def test_create_index(self):
    stem_analyzer = StemmingAnalyzer()
    test_folder_to_index = 'test-folder'
    test_index_name = 'test-index'
    if not os.path.exists(test_folder_to_index):
      os.mkdir(test_folder_to_index)

    # Index 1 question.
    ofile = open(test_folder_to_index + '/question_indexer_test.xml', 'w')
    ofile.write(
        '<document><uri>432470</uri>'
        '<subject>Why are yawns contagious?</subject>'
        '<content>When people yawn, you see that other people in the room '
        'yawn, too. Why is that?</content>'
        '<bestanswer>When your body</bestanswer></document>')
    ofile.close()

    yahoo_dataset_indexer.IndexYahooQuestionsWithWhoosh(test_folder_to_index,
                                                        test_index_name)
    questions_index, questions_searcher = loadIndex(test_index_name,
                                                    test_index_name)
    # Check the schema.
    expected_schema = Schema(
        question_tokens=TEXT(analyzer=stem_analyzer, stored=False,
                             phrase=False),
        question_text=TEXT(analyzer=stem_analyzer, stored=True, phrase=False),
        answers=TEXT(analyzer=stem_analyzer, stored=False, phrase=False))
    self.assertEqual(expected_schema, questions_index.schema)
    # Check the number of documents.
    self.assertEqual(1, questions_index.doc_count())
    # Check the number of terms in question and answer fields.
    expected_question_terms = ['contagi', 'yawn']
    for i, term_tuple in enumerate(
        questions_index.reader().iter_field('question_tokens')):
      self.assertEqual(term_tuple[0], expected_question_terms[i])
    expected_answer_terms = ['bodi']
    for i, term_tuple in enumerate(
        questions_index.reader().iter_field('answers')):
      self.assertEqual(term_tuple[0], expected_answer_terms[i])
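The indexer under test is defined elsewhere. A minimal sketch of IndexYahooQuestionsWithWhoosh consistent with the schema the test asserts on (the XML handling and field mapping here are assumptions, and the real indexer may do more filtering):

import os
from xml.etree import ElementTree
from whoosh import index
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import Schema, TEXT


def IndexYahooQuestionsWithWhoosh(folder_to_index, index_name):
  stem_analyzer = StemmingAnalyzer()
  schema = Schema(
      question_tokens=TEXT(analyzer=stem_analyzer, stored=False, phrase=False),
      question_text=TEXT(analyzer=stem_analyzer, stored=True, phrase=False),
      answers=TEXT(analyzer=stem_analyzer, stored=False, phrase=False))
  if not os.path.exists(index_name):
    os.mkdir(index_name)
  ix = index.create_in(index_name, schema, indexname=index_name)
  writer = ix.writer()
  for name in os.listdir(folder_to_index):
    doc = ElementTree.parse(os.path.join(folder_to_index, name)).getroot()
    subject = doc.findtext('subject', '')
    content = doc.findtext('content', '')
    # Guess: tokens come from the subject; the stored text keeps the
    # tab-separated form that Example #2 splits on.
    writer.add_document(question_tokens=unicode(subject),
                        question_text=unicode(subject + '\t' + content),
                        answers=unicode(doc.findtext('bestanswer', '')))
  writer.commit()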
Example #7
def main(argv):
  #taskDict, wordTaskDict = loadTasks(argv[1], argv[2])
  index, searcher = loadIndex(argv[1], argv[1])
  tlc = loadCollector(searcher, 1000, 20)  # collector limit/timeout assumed
  qp = QueryParser('task', schema=index.schema, group=OrGroup)

  #rank = int(argv[4]) # rank to calculate the precision
  #topK = int(argv[5]) # top K Tasks to create the term set
  precDict = {}
  recallDict = {}
  sCount = 0
  for session in getSessionWithNL(argv[3]):
    bQuery = session[0].lower()
    bQuery = re.sub(SYMB, ' ', bQuery)
    bQuery = re.sub(r'\s+', ' ', bQuery).strip()
    aTerms, rTerms = addedAndRemovedTerms(bQuery, session[1:])
    if aTerms:
      print bQuery, aTerms
      #resultSet = rankAndRetrieveTasks(bQuery,taskDict, wordTaskDict)
      resultSet = rankAndRetrieveTasks(bQuery, qp, searcher, tlc)
      rSort = sorted(resultSet.items(), reverse=True, key=lambda x: x[1])
      for j in [50, 100, 500, 1000, 'all']:
        if j not in precDict:
          precDict[j] = {}
          recallDict[j] = {}
        #termSet = getTaskTermSet(rSort, taskDict, j)
        termSet = getTaskTermSet(rSort, j)
        for i in [1, 3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 'all']:
          prec, recall = getPrecRecall(termSet, aTerms, i)
          precDict[j][i] = precDict[j].setdefault(i, 0.0) + prec
          recallDict[j][i] = recallDict[j].setdefault(i, 0.0) + recall
          print j, i, bQuery, '\t', prec
      sCount += 1.0

  print 'Printing Precision Stats'
  for j, jdict in precDict.iteritems():
    for i, score in jdict.iteritems():
      print j, i, score, sCount, score / sCount

  print 'Printing Recall Stats'
  for j, jdict in recallDict.iteritems():
    for i, score in jdict.iteritems():
      print j, i, score / sCount
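getTaskTermSet and getPrecRecall are assumed helpers. A plausible reconstruction of getPrecRecall, following the same convention as Example #8 (overlap with the gold added terms over |gold| for 'prec', over |retrieved| for 'recall'):

def getPrecRecall(termList, aTerms, i):
  # termList is the ranked candidate list, aTerms the gold added-term set,
  # i the cutoff ('all' meaning no cutoff).
  top = termList if i == 'all' else termList[:i]
  topSet = set(top)
  hits = len(topSet & aTerms)
  prec = hits / (len(aTerms) * 1.0) if aTerms else 0.0
  recall = hits / (len(topSet) * 1.0) if topSet else 0.0
  return prec, recall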
Example #8
def findMarkovStats(argv):

  iFile = argv[1]
  wikiIndexDir = argv[2]
  queryIndexDir = argv[3]

  wIndex, wsearcher = loadIndex(wikiIndexDir, wikiIndexDir)
  qIndex, qsearcher = loadIndex(queryIndexDir, queryIndexDir)

  wtlc = loadCollector(wsearcher, 2000, 20)
  qtlc = loadCollector(qsearcher, 2000, 20)

  qqp = loadQueryParser(qIndex, 'session')
  wqp = loadQueryParser(wIndex, 'content')

  prec = {}
  recall = {}

  count = 0.0
  for session in getSessionWithNL(iFile):
    #get the query
    query = session[0].lower()
    query = re.sub(SYMB, ' ', query)
    query = re.sub(r'\d+', ' ', query)
    query = re.sub(r'\s+', ' ', query).strip()

    aTerms, bTerms = addedAndRemovedTerms(query, session)

    if aTerms:
      count += 1.0
      totalNetwork = {}
      terms = updateNetwork(query, totalNetwork, wqp, wsearcher, wtlc,
                            'content', 'wiki')
      terms2 = updateNetwork(query, totalNetwork, qqp, qsearcher, qtlc,
                             'session', 'query')
      print len(terms), len(terms2)
      updateStemNetwork(totalNetwork)

      # Build the mixture networks for the two walk stages.
      stage1 = {}
      stage2 = {}
      combineNetwork(1.0, stage1, totalNetwork, 'stem')
      combineNetwork(0.5, stage2, totalNetwork, 'query')
      combineNetwork(0.5, stage2, totalNetwork, 'wiki')

      # Convert into matrices for multiplication; the dimensions are the
      # union of the terms seen in either stage.
      totalDim = sorted(set(stage1.keys()) | set(stage2.keys()))

      dim = len(totalDim)
      if dim > 0:
        stage1Matrix = toMatrix(totalDim, stage1)
        print 'STAGE1', stage1Matrix[0], stage1Matrix.shape
        stage2Matrix = toMatrix(totalDim, stage2)
        print 'STAGE2', stage2Matrix[0], stage2Matrix.shape

        backSmooth = 1.0 / len(totalDim)
        stage3Matrix = numpy.zeros((dim, dim))
        stage3Matrix.fill(backSmooth)
        print 'STAGE3', stage3Matrix[0], stage3Matrix.shape

        alpha = 0.80
        matrix = ['stage1', 'stage2', 'stage2', 'stage2', 'stage3']
        totalSum = numpy.zeros((dim, dim))
        cK = numpy.ones((dim, dim))

        # Start the walk: accumulate the alpha-damped sum of the step products.
        for k in range(len(matrix)):
          print k, matrix[k]
          if matrix[k] == 'stage1':
            cK = numpy.dot(stage1Matrix, cK)
          elif matrix[k] == 'stage2':
            cK = numpy.dot(stage2Matrix, cK)
          else:
            cK = numpy.dot(cK, stage3Matrix)
          print 'CK', cK[0]

          totalSum = totalSum + (math.pow(alpha, k) * cK)
        totalSum = totalSum * (1 - alpha)

        # Rank candidate terms by their walk scores against the query terms.
        qList = []
        terms = query.split()
        for term in terms:
          if term in totalDim:
            qList.append(totalDim.index(term))
          else:
            print 'ERROR: did not find', query, '\t', term, len(totalDim)

        termScore = {}
        for i in range(len(totalDim)):
          termScore[totalDim[i]] = 0.0
          for j in qList:
            if totalSum[i][j] > 0.0:
              termScore[totalDim[i]] += math.log(totalSum[i][j])

        # Measure precision for different top-i term cutoffs; 10000 in
        # effect means "all ranked terms".
        sortTerms = sorted(termScore.iteritems(),
                           reverse=True,
                           key=lambda x: x[1])
        for i in [1, 3, 5, 10, 20, 30, 40, 50, 60, 100, 10000]:
          try:
            cTerms = set([x[0] for x in sortTerms[:i]])
            print 'CTERMS ', sortTerms[0:10], len(cTerms), 'ATERMS', aTerms
            p = len(aTerms & cTerms) / (len(aTerms) * 1.0)
            r = len(aTerms & cTerms) / (len(cTerms) * 1.0)
            prec[i] = prec.setdefault(i, 0.0) + p
            recall[i] = recall.setdefault(i, 0.0) + r
            print 'Prec', i, '\t', query, '\t', p
          except Exception:
            cTerms = set([x[0] for x in sortTerms])
            p = len(aTerms & cTerms) / (len(aTerms) * 1.0)
            r = len(aTerms & cTerms) / (len(cTerms) * 1.0)
            prec[i] = prec.setdefault(i, 0.0) + p
            recall[i] = recall.setdefault(i, 0.0) + r
            print 'Prec', i, '\t', query, '\t', p

      else:
        for i in [1, 3, 5, 10, 20, 30, 40, 50, 60, 100, 10000]:
          print 'Prec', i, '\t', query, '\t', 0.0

  # Average the precision and recall over all sessions and print them.
  print 'Printing Precision'
  for entry, value in prec.iteritems():
    print entry, value / count

  print 'Printing Recall'
  for entry, value in recall.iteritems():
    print entry, value / count

  wIndex.close()
  qIndex.close()
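toMatrix is also assumed. A plausible sketch that maps a nested {source: {target: weight}} network onto a row-stochastic numpy matrix indexed by position in the sorted dimension list:

import numpy


def toMatrix(totalDim, network):
  # Fill a dense matrix from the nested dict, then normalize each row so
  # it forms a probability distribution, as a random-walk step requires.
  dim = len(totalDim)
  pos = dict((term, i) for i, term in enumerate(totalDim))
  m = numpy.zeros((dim, dim))
  for src, edges in network.iteritems():
    for dst, weight in edges.iteritems():
      if dst in pos:
        m[pos[src]][pos[dst]] = weight
  rowSums = m.sum(axis=1)
  for i in range(dim):
    if rowSums[i] > 0.0:
      m[i] /= rowSums[i]
  return m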