Example #1
    def get_scores(self, fav_doc):
        """ identify important terms from a liked tweet, determine similarity scores of these terms against all other
         tweets and return dict of all scores.  """

        bm25_scores = {}
        doc_vector = self.doc_vectors[fav_doc]

        # get the 3 most significant terms from favorite tweet then get similar tweets via BM25
        top_terms = dict(sorted(doc_vector.vector.iteritems(), key=itemgetter(1), reverse=True)[:3])
        for key, value in top_terms.iteritems():
            actual_term = doc_vector.terms_dict.keys()[doc_vector.terms_dict.values().index(key)]

            # set up and run query against index
            query = self.queryparser.parse(actual_term)
            collector = TopScoreDocCollector.create(6, True)
            self.searcher.search(query, collector)
            hits = collector.topDocs().scoreDocs

            for hit in hits:
                # skip if this is the favorite tweet being examined
                if hit.doc == fav_doc: continue

                if hit.doc not in bm25_scores:
                    bm25_scores[hit.doc] = 0.0

                # note the final score is weighted depending on top term IDF significance
                bm25_scores[hit.doc] += value * hit.score

        return bm25_scores
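Example #1 uses the pre-Lucene-8 collector signature, where the second argument to `create` meant `docsScoredInOrder`; on current PyLucene builds the second argument is a total-hit-count threshold instead. A minimal sketch of the same collect-then-read pattern under that newer API, assuming the JVM was started with `lucene.initVM()` and that `searcher` and `query` already exist:

from org.apache.lucene.search import TopScoreDocCollector

# keep the 10 best hits; count the total exactly only up to 10 (Lucene 8+ signature)
collector = TopScoreDocCollector.create(10, 10)
searcher.search(query, collector)
for hit in collector.topDocs().scoreDocs:
    # each hit is a ScoreDoc: .doc is the internal document id, .score the similarity score
    print(hit.doc, hit.score)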
Example #2
    def populate_frame(self, date_range, term_vector) -> pd.DataFrame:
        data_frame = pd.DataFrame(data=0,
                                  index=date_range,
                                  columns=term_vector)
        iterator = self.lucene_dictionary.getEntryIterator()

        for term in BytesRefIterator.cast_(iterator):
            term_as_string = term.utf8ToString()
            # print('term:', term_as_string)
            query = QueryParser("contents",
                                self.analyzer).parse(term_as_string)
            # run the query through the collector so its topDocs() actually holds the hits
            collector = TopScoreDocCollector.create(10000, 10000)
            self.searcher.search(query, collector)
            hits = collector.topDocs()

            if not hits.scoreDocs:
                # print("No hit for term: ", term_as_string)
                continue

            print("Found hit: " + term_as_string)

            for hit in hits.scoreDocs:
                document = self.searcher.doc(hit.doc)

                doc_name = document.getField("doc_name")
                date = datetime.datetime.strptime(doc_name.stringValue(),
                                                  '%m%d%y')

                current_value = data_frame.at[date, term_as_string]
                if np.isnan(current_value):
                    current_value = 0
                data_frame.at[date, term_as_string] = current_value + 1

        return data_frame
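Example #2 only touches Lucene to find which documents contain each term; the frame bookkeeping itself is plain pandas. A small self-contained sketch of the same date-by-term frequency frame, using made-up dates and terms:

import datetime
import numpy as np
import pandas as pd

date_range = pd.date_range('2017-01-01', periods=3, freq='D')
term_vector = ['lucene', 'pandas']
frame = pd.DataFrame(data=0, index=date_range, columns=term_vector)

# one hit for the term 'lucene' on a document whose name encodes the date 01/02/17
date = datetime.datetime.strptime('010217', '%m%d%y')
current = frame.at[date, 'lucene']
frame.at[date, 'lucene'] = (0 if np.isnan(current) else current) + 1
print(frame)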
Example #3
    def search(self, q, page = 1, duplicates = False):
        query = self.parser.parse(q)

        if not duplicates:
            query = self.addDuplicatesQuery(query)
        
        perPage = 10
        start = (page - 1) * perPage

        results = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, results)

        highlighter = Highlighter(QueryScorer(query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for scoreDoc in results.topDocs(start, perPage).scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            tokenStream = self.analyzer.tokenStream("contents", StringReader(doc['contents']))
            highlight = highlighter.getBestFragments(tokenStream, doc['contents'], 3, "...")
            
            docs.append({
                'title': doc['title'],
                'url': doc['url'],
                'duplicate': doc['duplicate'],
                'highlight': highlight}
            )

        del self.searcher
        
        totalPages = int(math.ceil(results.getTotalHits()/float(perPage)))

        return totalPages, docs
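The paging in Examples #3 and #5 relies on two small pieces of arithmetic: the zero-based offset handed to `topDocs(start, perPage)` and the rounded-up page count. A quick standalone check of both:

import math

perPage = 10
for page, totalHits in [(1, 0), (1, 25), (3, 25)]:
    start = (page - 1) * perPage                              # 0, 0, 20
    totalPages = int(math.ceil(totalHits / float(perPage)))   # 0, 3, 3
    print(page, totalHits, start, totalPages)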
Example #4
    def searchWithRequestAndQuery(cls, query, indexReader, taxoReader,
                                  indexingParams, facetRequest):
        """
        Search an index with facets for the given query and facet requests.
        Returns a List<FacetResult>.
        """
        # prepare searcher to search against
        searcher = IndexSearcher(indexReader)
        # collect matching documents into a collector
        topDocsCollector = TopScoreDocCollector.create(10, True)
        if not indexingParams:
            indexingParams = FacetIndexingParams.DEFAULT

        # Faceted search parameters indicate which facets we are interested in
        facetRequests = [facetRequest,]
        facetRequests = Arrays.asList(facetRequests)
        # Add the facet request of interest to the search params:
        facetSearchParams = FacetSearchParams(indexingParams, facetRequests)
        # and create a FacetsCollector to use in our facetted search:
        facetsCollector = FacetsCollector.create(facetSearchParams, indexReader, taxoReader)
        # perform documents search and facets accumulation
        searcher.search(query, MultiCollector.wrap([topDocsCollector, facetsCollector]))
        print "\nFound %d Documents for query=%s" % (topDocsCollector.totalHits,
                                                     query.toString().encode('utf-8'))
        # Obtain facets results and print them
        res = facetsCollector.getFacetResults()
        i = 0
        for facetResult in res:
            print "Result #%d has %d descendants" % (i, facetResult.getNumValidDescendants())
            print "Result #%d : %s" % (i, facetResult)
            i += 1

        return res
Example #5
    def search(self, q, page=1, duplicates=False):
        query = self.parser.parse(q)

        if not duplicates:
            query = self.addDuplicatesQuery(query)

        perPage = 10
        start = (page - 1) * perPage

        results = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, results)

        highlighter = Highlighter(QueryScorer(query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for scoreDoc in results.topDocs(start, perPage).scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            tokenStream = self.analyzer.tokenStream(
                "contents", StringReader(doc['contents']))
            highlight = highlighter.getBestFragments(tokenStream,
                                                     doc['contents'], 3, "...")

            docs.append({
                'title': doc['title'],
                'url': doc['url'],
                'duplicate': doc['duplicate'],
                'highlight': highlight
            })

        del self.searcher

        totalPages = int(math.ceil(results.getTotalHits() / float(perPage)))

        return totalPages, docs
Example #6
    def runQuery(self, structured_query, parameters, test_guid, max_results=MAX_RESULTS_RECALL):
        """
            Run the query, return a list of tuples (score,metadata) of top docs
        """

        if not structured_query or len(structured_query) == 0 :
            return []

        self.last_query=structured_query

        query_text=self.rewriteQuery(structured_query["structured_query"],parameters,test_guid)

        try:
            query = self.query_parser(LuceneVersion.LUCENE_CURRENT, "text", self.analyzer).parse(query_text)
        except lucene.JavaError:
            # this is to deal with the "too many boolean clauses" exception
            print("Lucene exception:",sys.exc_info()[:2])
            print("Query:",query_text)
            return []

        structured_query["lucene_query"]=query_text

        if self.useExplainQuery:
            # TODO remove this completely, use DisjunctionMax
            # this should only exist until I fix the lucene bulkScorer to give the same results
            hits=self.runQueryViaExplain(query,max_results)
        else:
            collector=TopScoreDocCollector.create(max_results, True)
            self.searcher.search(query, collector)
            hits = collector.topDocs().scoreDocs
        res=[]

        # explain the query
        if self.logger:
            self.logger.logReport(query_text+"\n")

            if self.logger.full_citation_id in self.logger.citations_extra_info:
                max_explanations=len(hits)
            else:
                max_explanations=1

            for index in range(max_explanations):
                # explain() expects a document id, not the loop index
                self.logger.logReport(self.searcher.explain(query, hits[index].doc))

        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            metadata=json.loads(doc.get("metadata"))
            res.append((hit.score,metadata))

        if self.logger and self.logger.full_citation_id in self.logger.citations_extra_info:
            print(query_text,"\n", hits, "\n", res, "\n")

        del hits
        del query
        del query_text

        return res
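`IndexSearcher.explain` takes an internal document id, which is why the logging loop above passes `hits[index].doc` rather than the loop counter. A minimal sketch of explaining just the top hit, assuming `searcher`, `query` and a non-empty `hits` list as in the example:

# hits holds ScoreDoc objects; explain() wants the internal document id
explanation = searcher.explain(query, hits[0].doc)
print(explanation.toString())   # human-readable breakdown of how the score was computed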
Example #8
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: a different query string for each field,
    not the same words on every field
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of function

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur
    # MUST implies that the keyword must occur
    # SHOULD implies that the keyword is optional but boosts the score when it occurs
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
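This example builds its boolean query with the old mutable `BooleanQuery()` constructor; on recent Lucene releases the query is assembled through a builder instead. A minimal sketch of the same pair of SHOULD clauses under the newer API, reusing `text_query` and `subject_query` from above:

from org.apache.lucene.search import BooleanClause, BooleanQuery

builder = BooleanQuery.Builder()
builder.add(text_query, BooleanClause.Occur.SHOULD)      # optional clause, boosts the score when it matches
builder.add(subject_query, BooleanClause.Occur.SHOULD)
query = builder.build()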
Example #9
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """

    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score

        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list),
                   feature_type) if len(doc_score_list) != 0 else [0] * len(
                       feature_type)  # feature_type is a list of function

    # escape special characters via escape function
    query = QueryParser(version, 'text',
                        analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5,
                                              b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
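`feature_type` here is a list of aggregation functions applied to the raw hit scores, with the guard returning zeros when nothing was retrieved. A tiny pure-Python illustration with hypothetical aggregators:

feature_type = [max, min, sum, len]     # hypothetical feature functions over the hit scores
doc_score_list = [2.5, 1.0, 0.5]

features = (list(map(lambda f: f(doc_score_list), feature_type))
            if doc_score_list else [0] * len(feature_type))
print(features)                         # [2.5, 0.5, 4.0, 3]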
Example #10
    def doc_search(self, field, keywords, numHits):
        if field != 'All':
            analyzer = StandardAnalyzer()
            parser = QueryParser(field, analyzer)
            query = parser.parse(keywords)

            # self.lReader.getDocCount("title");

            try:
                collector = TopScoreDocCollector.create(numHits)
                self.lSearcher.search(query, collector)
                hits = collector.topDocs().scoreDocs

            except RuntimeError:
                print "Score docoment run fail"
            self.hits = hits
            self.field = field
            return hits
        else:
            analyzer = WhitespaceAnalyzer()
            parser = MultiFieldQueryParser(['Title', 'Body'], analyzer)
            query = MultiFieldQueryParser.parse(parser, keywords)

            # self.lReader.getDocCount("title");

            try:
                collector = TopScoreDocCollector.create(numHits)
                self.lSearcher.search(query, collector)
                hits = collector.topDocs().scoreDocs

            except RuntimeError:
                print "Score docoment run fail"
            self.hits = hits
            self.field = field
            return hits

Example #11
    def retrieve(self, query, field, hitsPerPage):
        querystr = query

        # build query
        q_lucene = QueryParser(field, self.analyzer).parse(querystr)
        # collect the top hits
        collector = TopScoreDocCollector.create(hitsPerPage)
        self.searcher.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs

        # return (stored document, doc id) pairs for every hit
        single_query_result = [(self.searcher.doc(hit.doc), hit.doc) for hit in hits]
        return single_query_result
Example #12
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """

    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type) if len(doc_score_list) != 0 else [0]*len(feature_type) # feature_type is a list of function

    # escape special characters via escape function
    query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Example #13
    def doc_search(self, keywords):

        analyzer = StandardAnalyzer()
        parser = QueryParser('Title', analyzer)
        query = parser.parse(keywords)

        try:
            collector = TopScoreDocCollector.create(3000)
            self.lSearcher.search(query, collector)
            hits = collector.topDocs().scoreDocs

        except RuntimeError:
            print "Score docoment run fail"
        self.hits = hits
        return hits
Example #14
def lucene_retrieval(q_string, use_BM25=False):
    """

    :param q_string:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """
        return the text of the retrieved documents (the last hit wins)
        :param hists:
        """
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            # file_name = doc.get("corpus_name")
            # doc_name = doc.get("doc_name")
            text = doc.get("text")
            #score = h.score
            # yield (file_name, doc_name, score, text)
        return text

    result = '_NONE_'

    # escape special characters via escape function
    if q_string and q_string.strip():   # pre-processed answers like `none of the above` become '' and would cause an error here
        #print(q_string)
        query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)

        if use_BM25:
            searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hists
        result = doc_text(hs)

        # reader.close()
    return result  # text: also nodes
Example #15
    def runQuery(self, structured_query, max_results=MAX_RESULTS_RECALL):
        """
            LOTS OF SWEET LUCENE
        """
        original_query=structured_query

        if not structured_query or len(structured_query) == 0 :
            return []

        self.last_query=structured_query
        query_text=self.rewriteQuery(structured_query["structured_query"], ["text"])

        try:
            query = self.query_parser(lucene.Version.LUCENE_CURRENT, "text", self.analyzer).parse(query_text)
        except:
            print("Lucene exception:",sys.exc_info()[:2])
            return None

        structured_query["lucene_query"]=query_text

        if self.useExplainQuery:
            # this should only exist until I fix the lucene bulkScorer to give the same results
            hits=self.runQueryViaExplain(query,max_results)
        else:
            collector=TopScoreDocCollector.create(max_results, True)
            self.searcher.search(query, collector)
            hits = collector.topDocs().scoreDocs

##        print("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
        res=[]


##        if len(hits.scoreDocs) ==0:
##            print "Original query:",original_query
##            print "Query:", query

        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            metadata= json.loads(doc.get("metadata"))
            res.append((hit.score,metadata))
        return res
Example #17
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())

    # read query
    read_query()

    # initialize mongodb client
    mongoObj = Mongo_Object('localhost', 27017)

    # initialize word2vec
    print 'load word2vec model'
    w2vmodel = gensim.models.Word2Vec.load_word2vec_format(
        "F:\\modified_w2v\\w2v_wiki_trigram_phrase_20170101\\wiki.en.text.vector.binary",
        binary=True)
    print 'finish loading word2vec model'

    # search
    global hitsPerPage
    fields = ['name', 'value']
    #parser=MultiFieldQueryParser(fields,analyzer)
    #parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    rec_result = open('pylucene.runs', 'w')

    for i in range(len(queries)):
        query = queries[i]
        print 'processing query ' + str(i) + ':' + query[0]
        querystr = remove_duplicate(stemSentence(query[1]))
        #q_lucene=MultiFieldQueryParser.parse(parser,querystr)
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        print "q_lucene: " + q_lucene.toString()
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs

        # build query object for computeScore
        #queryObj=Query_Object(query,mongoObj,w2vmodel)

        # initialize duplicate remover
        docDup = set()

        # find candidate results after 1st round filter
        candidates = PriorityQueue()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher.doc(docID)
            name = cleanSentence(d['title'].strip())
            if name in docDup:
                continue
            docDup.add(name)
            # build entity object
            entityObj = Entity_Object(d, mongoObj, w2vmodel)
            #score = computeScore(queryObj,entityObj,mongoObj,w2vmodel)
            score = hits[j].score
            candidates.put((-score, j))

        # output results from priority queue larger score first
        rank = 0
        while candidates.empty() == False and rank < 100:
            rank = rank + 1
            item = candidates.get()
            score = -item[0]
            j = item[1]  # index of hits[]
            docID = hits[j].doc
            d = searcher.doc(docID)
            title = '<dbpedia:' + d.get('title') + '>'
            res_line = query[0] + '\t' + 'Q0' + '\t' + title + '\t' + str(
                rank) + '\t' + str(score) + '\t' + 'pylucene_multifield'
            rec_result.writelines(res_line + '\n')
    rec_result.close()
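The `res_line` written above follows the standard TREC run-file layout: query id, the literal `Q0`, document id, rank, score, and a run tag, separated by tabs. A small formatting sketch with placeholder values:

qid, title, rank, score = 'qid-1', '<dbpedia:Example>', 1, 12.3
res_line = '\t'.join([qid, 'Q0', title, str(rank), str(score), 'pylucene_multifield'])
print(res_line)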
Example #18
 def _topCollector(self, start, stop, sortKeys):
     if stop <= start:
         return TotalHitCountSuperCollector() if self._multithreaded else TotalHitCountCollector()
     # fillFields = False # always true for multi-threading/sharding
     trackDocScores = True
     trackMaxScore = False
     docsScoredInOrder = True
     if sortKeys:
         sortFields = [
             self._sortField(fieldname=sortKey['sortBy'], sortDescending=sortKey['sortDescending'])
             for sortKey in sortKeys
         ]
         sort = Sort(sortFields)
     else:
         return TopScoreDocSuperCollector(stop, docsScoredInOrder) if self._multithreaded else TopScoreDocCollector.create(stop, docsScoredInOrder)
     if self._multithreaded:
         return TopFieldSuperCollector(sort, stop, trackDocScores, trackMaxScore, docsScoredInOrder)
     else:
         fillFields = False
         return TopFieldCollector.create(sort, stop, fillFields, trackDocScores, trackMaxScore, docsScoredInOrder)
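When only a single sort key is needed and sharding is not involved, the collector plumbing in Example #18 can be bypassed with the searcher's sorted-search overload. A minimal sketch, assuming an existing `searcher` and `query` and a hypothetical `date` field indexed with doc values:

from org.apache.lucene.search import Sort, SortField

sort = Sort(SortField('date', SortField.Type.LONG, True))   # descending on the hypothetical 'date' field
topFieldDocs = searcher.search(query, 10, sort)             # IndexSearcher.search(Query, int, Sort)
for scoreDoc in topFieldDocs.scoreDocs:
    print(scoreDoc.doc)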
Example #19
def lucene_retrieval_multifield(q_string,
                                q_class,
                                feature_type,
                                use_BM25=False):
    """
    multifield: a different query string for each field,
    not the same words on every field
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score

        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list),
                   feature_type)  # feature_type is a list of function

    text_query = QueryParser(version, 'text',
                             analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name',
                                analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur
    # MUST implies that the keyword must occur
    # SHOULD implies that the keyword is optional but boosts the score when it occurs
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5,
                                              b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Example #20
def main():
    global lucene_vm_init
    if not lucene_vm_init:
       lucene.initVM(vmargs=['-Djava.awt.headless=true'])
       lucene_vm_init = True
    
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path 
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    
    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    
    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher1 = IndexSearcher(reader)
    searcher1.setSimilarity(BM25Similarity())
    searcher2 = IndexSearcher(reader)
    w = IndexWriter(index_mm,config)
    # read query
    read_query()
    
    # initialize mongodb client
    mongoObj=Mongo_Object('localhost',27017)
      
    # search
    docDup=set()
    finalDup={}
    
    for i in xrange(len(queries)):
        print 'process query %d' %(i)
        query = queries[i]
        querystr = stemSentence(query[3])
        # build searcher
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher1.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs
        
        
        # find candidate results after 1st round filter
        docDup.clear()
        for j in xrange(len(hits)):
            docID=hits[j].doc
            d=searcher1.doc(docID)
            if d['title'] in docDup:
               finalDup[d['title']]=d
               continue
            docDup.add(d['title'])
            
        docDup.clear()
        for j in xrange(len(hits)):
            docID=hits[j].doc
            d=searcher1.doc(docID)
            title=d['title']
            if d['title'] in docDup:
               continue
            docDup.add(title)
            
            item=(mongoObj.conn_me).find_one({'title':title})
            if item is None:
               continue
            entitylist=item['entitylist'].split('|')
            for en_title in entitylist:
                if title==en_title:
                   continue
                t=Term('title',en_title)
                q=TermQuery(t)
                docs=searcher2.search(q,2)
                if docs.totalHits<=1:
                   continue
                
                docID2=(docs.scoreDocs)[0].doc
                doc=searcher2.doc(docID2)
                finalDup[doc['title']]=doc

    print 'begin to clean index, there are %d dup records' %(len(finalDup))
    for title in finalDup:
        doc=finalDup[title]
        # title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract
        
        name=doc['name']
        value=doc['value']
        category=doc['category']
        skos_category=doc['skos_category']
        all_text=doc['all_text']
        raw_name=doc['raw_name']
        raw_value=doc['raw_value']
        abstract=doc['abstract']
        
        print 'process '+title
        t=Term('title',title)
        q=TermQuery(t)
        w.deleteDocuments(q)
        addDoc(w,title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract)
    # process remaining records
    #global batch,cnt_batch
    #if cnt_batch>0:
       #w.addDocuments(batch)
       #cnt_batch=0
       #del batch[:]
    w.close()
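The cleanup loop above removes each duplicate with a `TermQuery` before re-adding it; `IndexWriter` can also delete by `Term` directly. A minimal sketch, assuming an open writer `w` as in the example:

from org.apache.lucene.index import Term

# delete every document whose exact 'title' term matches, then make the deletion durable
w.deleteDocuments(Term('title', 'Some duplicate title'))
w.commit()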