def testOutOfOrderDocsScoringSort(self):
    """
    Verify which TopFieldCollector implementation is created for each
    flag combination, using two Sort criteria to instantiate the
    multi/single comparators, and that each collector yields 10 hits.
    """
    sort_criteria = [Sort(SortField.FIELD_DOC), Sort()]

    # All eight combinations of the three boolean flags passed to
    # TopFieldCollector.create(), from [False, False, False] up to
    # [True, True, True] with the last flag varying fastest.
    flags = (False, True)
    flag_combos = [[first, second, third]
                   for first in flags
                   for second in flags
                   for third in flags]

    # Expected collector class-name suffix for each combination above,
    # matched by position.
    expected_classes = [
        "OutOfOrderOneComparatorNonScoringCollector",
        "OutOfOrderOneComparatorScoringMaxScoreCollector",
        "OutOfOrderOneComparatorScoringNoMaxScoreCollector",
        "OutOfOrderOneComparatorScoringMaxScoreCollector",
        "OutOfOrderOneComparatorNonScoringCollector",
        "OutOfOrderOneComparatorScoringMaxScoreCollector",
        "OutOfOrderOneComparatorScoringNoMaxScoreCollector",
        "OutOfOrderOneComparatorScoringMaxScoreCollector",
    ]

    query = BooleanQuery()
    # The clause is added as SHOULD, since bw.scorer() returns
    # BooleanScorer2, which delegates to BS if there are no mandatory
    # clauses.
    query.add(MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)
    # minNrShouldMatch is set to 1 so that BQ will not optimize rewrite
    # to return the clause instead of BQ.
    query.setMinimumNumberShouldMatch(1)

    for sort in sort_criteria:
        for combo, expected in izip(flag_combos, expected_classes):
            first, second, third = combo
            collector = TopFieldCollector.create(sort, 10, first, second,
                                                 third, False)

            # The concrete collector is a nested Java class, hence the
            # "$" separator in front of the expected simple name.
            class_name = collector.getClass().getName()
            self.assert_(class_name.endswith("$" + expected))

            self.full.search(query, collector)

            score_docs = collector.topDocs().scoreDocs
            self.assertEqual(10, len(score_docs))
def searcher_text(text):
    """
    Return up to 25 tag names ranked for ``text``.

    The text is split into sentences and one query is run per sentence.
    Each query is a BooleanQuery of SHOULD clauses on the "Body" field
    (capped at 1000 terms, at least one of which must match), built from
    the sentence's lower-cased whitespace tokens followed by the words of
    whatever getPhrases() returns for it.  For each of the top 5 hits
    with a positive score, the document's "Tags" field and the hit score
    are handed to addTags() together with a per-call dict.

    Afterwards every value in that dict is divided by
    1 + ln(stack_tags_count[tag]) and the keys are returned as a list,
    highest adjusted value first.

    NOTE(review): addTags() is presumably what fills the dict with
    per-tag totals -- confirm against its definition.
    """
    # Attach the calling thread to the JVM before using any Lucene object.
    lucene.getVMEnv().attachCurrentThread()

    max_terms = 1000
    max_hits = 5
    top_n = 25

    tag_scores = {}
    for raw_sentence in tokenize.sent_tokenize(text):
        sentence = raw_sentence.replace('\n', ' ')
        words = [word.lower() for word in sentence.split()]

        query = BooleanQuery()
        query.setMinimumNumberShouldMatch(1)

        # Single tokens first, then the extracted phrases; each entry is
        # split again so multi-word phrases contribute individual terms.
        candidates = words + getPhrases(sentence)
        terms = [term for entry in candidates for term in entry.split()]
        for term in terms[:max_terms]:
            query.add(TermQuery(Term("Body", term)),
                      BooleanClause.Occur.SHOULD)

        for hit in searcher.search(query, max_hits).scoreDocs:
            if not hit.score > 0.0:
                continue
            tags = searcher.doc(hit.doc).get("Tags")
            addTags(tags, hit.score, tag_scores)

    # Scale each accumulated value down by the log of the count recorded
    # for that tag in stack_tags_count.
    for tag in tag_scores:
        divisor = float(1 + math.log(stack_tags_count[tag]))
        tag_scores[tag] = tag_scores[tag] / divisor

    ranked_tags = sorted(tag_scores, key=tag_scores.get, reverse=True)
    return ranked_tags[:top_n]