Example no. 1
    def __init__(self, root, storedir, isindexing=False, isBM25=True):

        if not os.path.exists(storedir):
            os.mkdir(storedir)

        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)

        if isindexing:
            store = SimpleFSDirectory(Paths.get(storedir))
            config = IndexWriterConfig(self.analyzer)
            # TODO BM25 parameter tuning
            if isBM25:
                config.setSimilarity(BM25Similarity())
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            writer = IndexWriter(store, config)

            self.indexer(root, writer)
            ticker = Ticker()
            print('commit index')
            threading.Thread(target=ticker.run).start()
            writer.commit()
            writer.close()
            ticker.tick = False
            print('done')

        search_dir = SimpleFSDirectory(Paths.get(storedir))
        self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
        if isBM25:
            self.searcher.setSimilarity(BM25Similarity())
Example no. 2
 def __init__(self,
              index_path,
              field,
              similarity="boolean",
              use_relevance_feedback=False,
              feedback_index_path=None):
     self.reader = DirectoryReader.open(
         FSDirectory.open(Paths.get(index_path)))
     self.searcher = IndexSearcher(self.reader)
     if use_relevance_feedback and feedback_index_path is not None:
         self.feedback_reader = DirectoryReader.open(
             FSDirectory.open(Paths.get(feedback_index_path)))
         self.feedback_searcher = IndexSearcher(self.feedback_reader)
     self.similarity = similarity
     self.stopwords = stop_words()
     if similarity == "boolean":
         self.searcher.setSimilarity(BooleanSimilarity())
     elif similarity == "tf":
         self.searcher.setSimilarity(TFSimilarity())
     elif similarity == "tfidf":
         self.searcher.setSimilarity(ClassicSimilarity())
     elif similarity == "BM25":
         self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
     else:
         print("Unknown similarity, so we use BM25(1.2, 0.2) as default")
         self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
     analyzer = StandardAnalyzer()
     print(self.searcher.getSimilarity())
     self.parser = QueryParser(field, analyzer)
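Of the similarities dispatched above, BooleanSimilarity, ClassicSimilarity (the classic TF-IDF scorer), and BM25Similarity ship with Lucene; TFSimilarity is presumably a custom class defined elsewhere in the project. BM25Similarity(1.2, 0.2) keeps Lucene's default k1 = 1.2 but lowers b from its default 0.75 to 0.2, i.e. much weaker document-length normalization.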
Example no. 3
    def __init__(self,
                 LUCENE_INDEX_DIR,
                 similarity='BM25',
                 lucene_vm_flag=False,
                 is_bigram_cache_used=False,
                 mongoObj=None):
        if not lucene_vm_flag:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
        self.index_dir = LUCENE_INDEX_DIR
        self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
        #self.analyzer = StandardAnalyzer()
        self.analyzer = SimpleAnalyzer()
        self.config = IndexWriterConfig(self.analyzer)
        self.reader = DirectoryReader.open(self.index_mm)
        self.searcher = IndexSearcher(self.reader)
        self.dict_term_freq = {}
        if similarity == 'BM25':
            self.searcher.setSimilarity(BM25Similarity())

        # load bigram cache
        self.is_bigram_cache_used = is_bigram_cache_used
        if is_bigram_cache_used:
            separator_char = '/' if self.index_dir.find('/') > -1 else '\\'
            index_name = self.index_dir.split(separator_char)[-1]
            self.index_name = index_name
            self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
            self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
Example no. 4
    def get_most_similar(self, sentence, do_log=False):
        # print('query string is',string)
        # q = QueryParser('pa', self.analyzer).parse(sentence)
        query_builder = BooleanQuery.Builder()
        for token in sentence.split(' '):
            if token not in sw:
                qtq = TermQuery(Term("pa", token))
                query_builder.add(
                    BooleanClause(qtq, BooleanClause.Occur.SHOULD))
        q = query_builder.build()
        hitsPerPage = 2
        reader = DirectoryReader.open(self.w)
        self.searcher = IndexSearcher(reader)
        simi = BM25Similarity(Config.k1, Config.b)
        # simi = ClassicSimilarity()
        self.searcher.setSimilarity(simi)

        docs = self.searcher.search(q, hitsPerPage)
        hits = docs.scoreDocs

        # print("Found " + str(len(hits)) + " hits.")
        if len(hits) > 0:
            mate = self.searcher.doc(hits[0].doc).get("id")
            if do_log:
                print("found something. mate: ", mate, "- score : ",
                      hits[0].score)
            return hits[0], int(mate)
        else:
            return None, -1
Example no. 5
 def publish_services(self, service_list):
     transformer = WSDLTransformer()
     current_document = 1
     indexDir = SimpleFSDirectory(File("index/"))
     writerConfig = IndexWriterConfig(
         Version.LUCENE_CURRENT, EnglishAnalyzer(Version.LUCENE_CURRENT))
     writerConfig.setSimilarity(BM25Similarity())
     index_writer = IndexWriter(indexDir, writerConfig)
     for wsdl in service_list:
         if self._document_expansion:
             #bag_of_words = ' '.join(self._preprocessor(self._semantic_transformer.transform(transformer.transform(wsdl))))
             bag_of_words = ' '.join(
                 self._semantic_transformer.transform(
                     transformer.transform(wsdl)))
         else:
             #bag_of_words = ' '.join(self._preprocessor(transformer.transform(wsdl)))
             bag_of_words = ' '.join(transformer.transform(wsdl))
         doc = Document()
         doc.add(
             Field("content", bag_of_words, Field.Store.YES,
                   Field.Index.ANALYZED))
         doc.add(Field("path", wsdl, Field.Store.YES, Field.Index.NO))
         index_writer.addDocument(doc)
         current_document += 1
     index_writer.close()
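This example targets the old Lucene 4.x API (Version.LUCENE_CURRENT, Field.Index.ANALYZED, File-based directories). On Lucene 6+, the equivalent field setup would look roughly like this sketch (an assumption, not code from the source):

    from org.apache.lucene.document import Document, Field, TextField, StoredField

    doc = Document()
    doc.add(TextField("content", bag_of_words, Field.Store.YES))  # analyzed and stored
    doc.add(StoredField("path", wsdl))  # stored only, never indexed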
Example no. 6
def process_q_test(q, out_q):
    lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()

    index = DirectoryReader.open(SimpleFSDirectory(
        Paths.get(robust_index_dir)))
    searcher = IndexSearcher(index)
    searcher.setSimilarity(BM25Similarity())
    analyzer = EnglishAnalyzer()
    qparser = QueryParser("contents", analyzer)
    preprocessor = Preprocess()

    while not exitFlag:
        qid, query = q.get()
        tname = multiprocessing.current_process().name
        # print(tname, qid, query)
        if query == "DONE":
            break

        try:
            # dids, scores = get_lm_matched_docs(query, searcher, qparser, 2000)
            # if len(dids) >= 10:
            #     out_q.put((qid, dids, scores))
            dids_text = get_lm_doc_snippets(query, searcher, qparser, analyzer,
                                            preprocessor)
            out_q.put((qid, dids_text))
        except Exception:
            print('%s exception %s, %s' % (tname, qid, query))
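A sketch of how this worker might be driven; the (qid, query) tuples and the DONE sentinel mirror the function above, while the queue setup is an assumption:

    import multiprocessing

    q, out_q = multiprocessing.Queue(), multiprocessing.Queue()
    worker = multiprocessing.Process(target=process_q_test, args=(q, out_q))
    worker.start()
    q.put(('301', 'international organized crime'))  # (qid, query)
    q.put((None, 'DONE'))  # sentinel telling the worker to exit
    worker.join()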
Example no. 7
 def __init__(
     self,
     commitTimeout=10,
     commitCount=100000,
     multithreaded=True,
     readonly=False,
     lruTaxonomyWriterCacheSize=4000,
     analyzer=MerescoStandardAnalyzer(),
     similarity=BM25Similarity(),
     fieldRegistry=FieldRegistry(),
     maxMergeAtOnce=2,
     segmentsPerTier=8.0,
     numberOfConcurrentTasks=6,
     verbose=True,
 ):
     self.commitTimeout = commitTimeout
     self.commitCount = commitCount
     self.multithreaded = multithreaded
     self.readonly = readonly
     self.lruTaxonomyWriterCacheSize = lruTaxonomyWriterCacheSize
     self.analyzer = analyzer
     self.similarity = similarity
     self.fieldRegistry = fieldRegistry
     self.maxMergeAtOnce = maxMergeAtOnce
     self.segmentsPerTier = segmentsPerTier
     self.numberOfConcurrentTasks = numberOfConcurrentTasks
     self.verbose = verbose
Example no. 8
    def __init__(self,
                 LUCENE_INDEX_DIR,
                 similarity='BM25',
                 lucene_vm_flag=False,
                 is_bigram_cache_used=False,
                 mongoObj=None):
        if not lucene_vm_flag:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
        self.index_dir = LUCENE_INDEX_DIR
        self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
        self.analyzer = SimpleAnalyzer()
        self.config = IndexWriterConfig(self.analyzer)
        self.reader = DirectoryReader.open(self.index_mm)
        self.searchers = []
        self.searchers.append(IndexSearcher(self.reader))
        if similarity == 'BM25':
            self.searchers[0].setSimilarity(BM25Similarity())

        # load bigram cache
        self.is_bigram_cache_used = is_bigram_cache_used
        if is_bigram_cache_used:
            separator_char = '/' if self.index_dir.find('/') > -1 else '\\'
            index_name = self.index_dir.split(separator_char)[-1]
            self.index_name = index_name
            self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
            self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
            if 'stemmed_wikipedia' in LIST_F or 'wikipedia' in LIST_F:
                self.conn_mapping_prob_cache = mongoObj.db[
                    index_name + '_mapping_prob_cache_with_wikipedia']
            else:
                self.conn_mapping_prob_cache = mongoObj.db[
                    index_name + '_mapping_prob_cache']
Example no. 9
 def __init__(self):
     indexdir = './IndexFiles.index'
     lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     search_dir = SimpleFSDirectory(Paths.get(indexdir))
     self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
     self.searcher.setSimilarity(BM25Similarity())
     self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
     self.lemmatizer = nltk.stem.WordNetLemmatizer()
Example no. 10
 def run(self):
     print("Starting " + self.name)
     lucene.getVMEnv().attachCurrentThread()
     index = DirectoryReader.open(
         SimpleFSDirectory(Paths.get(robust_index_dir)))
     searcher = IndexSearcher(index)
     searcher.setSimilarity(BM25Similarity())
     analyzer = EnglishAnalyzer()
     qparser = QueryParser("contents", analyzer)
     # process_query(self.name, self.q, self.out_q, searcher, qparser)
     print("Exiting " + self.name)
Example no. 11
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """

    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score

        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list),
                   feature_type) if len(doc_score_list) != 0 else [0] * len(
                       feature_type)  # feature_type is a list of function

    # escape special characters via escape function
    query = QueryParser(version, 'text',
                        analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5,
                                              b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
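On the BM25 parameters flagged as a todo: in Lucene's BM25Similarity, k1 controls term-frequency saturation (default 1.2) and b controls document-length normalization (default 0.75), so k1=1.5, b=0.75 only raises the saturation ceiling slightly. Note also that under Python 3 retrieval_scores returns a lazy map object; callers that index into the result need to wrap it in list(...).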
Example no. 12
    def __init__(self):

        self.env = lucene.initVM(initialheap='6g', maxheap='6g', vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print('Creating index at', prm.index_folder)
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print('copying index from', prm.index_folder, 'to', prm.local_index_folder)
            if os.path.exists(prm.local_index_folder):
                print('Folder', prm.local_index_folder, 'already exists! Doing nothing.')
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        self.searcher.setSimilarity(BM25Similarity())

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print('Creating index at', prm.index_folder_term)
                self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True)

            if prm.local_index_folder_term:
                print('copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term)
                if os.path.exists(prm.local_index_folder_term):
                    print('Folder', prm.local_index_folder_term, 'already exists! Doing nothing.')
                else:
                    shutil.copytree(prm.index_folder_term, prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}
        
        print('Loading Text-ID mapping...')
        self.text_id_map, self.id_text_map = self.get_text_id_map()
Example no. 13
    def _setReadSettings(self, similarity=None, numberOfConcurrentTasks=None):
        # This method must be thread-safe
        if similarity is None:
            self._similarity = self._settings.similarity
        else:
            self._similarity = BM25Similarity(similarity["k1"],
                                              similarity["b"])

        if numberOfConcurrentTasks is None:
            self._numberOfConcurrentTasks = self._settings.numberOfConcurrentTasks
        else:
            self._numberOfConcurrentTasks = numberOfConcurrentTasks
        self._reopenSearcher = True
Example no. 14
 def find(self, query):
     transformer = StringTransformer()
     analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)
     reader = IndexReader.open(SimpleFSDirectory(File("index/")))
     searcher = IndexSearcher(reader)
     searcher.setSimilarity(BM25Similarity())
     processed_query = ' '.join(
         self._preprocessor(transformer.transform(query)))
     query = QueryParser(Version.LUCENE_CURRENT, "content",
                         analyzer).parse(processed_query)
     hits = searcher.search(query, 10)
     result_list = []
     for hit in hits.scoreDocs:
         doc = searcher.doc(hit.doc)
         result_list.append(doc.get("path").encode("utf-8"))
     return result_list
Example no. 15
 def __init__(self,
              LUCENE_INDEX_DIR,
              similarity='BM25',
              lucene_vm_flag=False):
     if not lucene_vm_flag:
         lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     self.lucene_vm_init = True
     self.index_dir = LUCENE_INDEX_DIR
     self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
     self.analyzer = StandardAnalyzer()
     self.config = IndexWriterConfig(self.analyzer)
     self.reader = DirectoryReader.open(self.index_mm)
     self.searcher = IndexSearcher(self.reader)
     self.dict_term_freq = {}
     self.dict_doc_field_title = {}
     if similarity == 'BM25':
         self.searcher.setSimilarity(BM25Similarity())
Example no. 16
def lucene_retrieval(q_string, use_BM25=False):
    """

    :param q_string:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """
        return doc_name & score
        :param hists:
        """
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            # file_name = doc.get("corpus_name")
            # doc_name = doc.get("doc_name")
            text = doc.get("text")
            #score = h.score
            # yield (file_name, doc_name, score, text)
        return text

    result = '_NONE_'

    # escape special characters via escape function
    if q_string and q_string.strip():   # when preprocessing answers, `none of the above` -> '' would cause an error here
        #print(q_string)
        query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)

        if use_BM25:
            searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hists
        result = doc_text(hs)

        # reader.close()
    return result  # text: also nodes
Example no. 17
 def __init__(self,
              lang=None,
              dataset=None,
              analyzer=None,
              index_path=None,
              k1=None,
              b=None):
     super().__init__(k1, b)
     print("Searcher k1: {}, b: {}", self.k1, self.b)
     self.similarity = BM25Similarity(self.k1, self.b)
     self.searcher = {}
     self.parser = {}
     self.languages = []
     self.lang = lang
     self.dataset = dataset
     self.__call__ = self.query
     if lang is not None or dataset is not None or analyzer is not None:
         self.addLang(lang, dataset, analyzer, index_path)
Example no. 18
    def get_sorted_results(self, query):
        SHOULD = BooleanClause.Occur.SHOULD
        parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,
                                                   query, ['docno', 'content'],
                                                   [SHOULD, SHOULD],
                                                   self.analyzer)

        reader = IndexReader.open(self.directory)
        searcher = IndexSearcher(reader)

        searcher.setSimilarity(BM25Similarity())
        topDocs = searcher.search(parsed_query, 10)

        j = 0
        for i in topDocs.scoreDocs:
            d = searcher.doc(i.doc)

            print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score)

            j += 1
Example no. 19
    def __init__(self):
        wikidir = './wiki-pages-text'
        indexdir = './IndexFiles.index'
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        if not os.path.exists(indexdir):
            os.mkdir(indexdir)

        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
        store = SimpleFSDirectory(Paths.get(indexdir))
        config = IndexWriterConfig(self.analyzer)
        config.setSimilarity(BM25Similarity())
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.indexer(wikidir, writer)
        ticker = Ticker()
        print('commit index')
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print('done')
Example no. 20
 def open_searcher(self):
     self.reader = DirectoryReader.open(self.directory)
     self.searcher = IndexSearcher(self.reader)
     if (self.similarity == "bm25"):
         self.searcher.setSimilarity(BM25Similarity())
Example no. 21
    import argparse
    parser = argparse.ArgumentParser(
        description='Execute queries on comment body')
    parser.add_argument('user_name', type=str,
                        help="User name (profile to use)")
    parser.add_argument('index_dir', metavar='dir', type=str,
                        help="Index directory")
    parser.add_argument('--sim', type=str, nargs='?',
                        default="tfidf", help="Similarity (in [tfidf, lm, bm25])")
    parser.add_argument('--reorder', type=str, nargs='?',
                        default="no", help="Reordering (in [ups, normups])")
    parser.add_argument('--short', action='store_false',
                        help="Don't show the body of comments")
    args = parser.parse_args()


    if args.sim in ['bm25']:
        similarity = BM25Similarity()
    elif args.sim in ['lm']:
        similarity = LMDirichletSimilarity()
    else:
        similarity = ClassicSimilarity()

    # Sample query
    storeDir = SimpleFSDirectory(Paths.get(args.index_dir))
    searcher = IndexSearcher(DirectoryReader.open(storeDir))
    if similarity is not None:
        searcher.setSimilarity(similarity)
    analyzer = StandardAnalyzer()
    run(searcher, analyzer, args.user_name, reordering=args.reorder, show_bodies=not args.short)
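A hypothetical invocation of this script (the file name is assumed):

    python search_comments.py alice ./reddit_index --sim bm25 --reorder normups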
Example no. 22
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())

    # read query
    read_query()

    # initialize mongodb client
    mongoObj = Mongo_Object('localhost', 27017)

    # initialize word2vec
    print 'load word2vec model'
    w2vmodel = gensim.models.Word2Vec.load_word2vec_format(
        "F:\\modified_w2v\\w2v_wiki_trigram_phrase_20170101\\wiki.en.text.vector.binary",
        binary=True)
    print 'finish loading word2vec model'

    # search
    global hitsPerPage
    fields = ['name', 'value']
    #parser=MultiFieldQueryParser(fields,analyzer)
    #parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    rec_result = open('pylucene.runs', 'w')

    for i in range(len(queries)):
        query = queries[i]
        print 'processing query ' + str(i) + ':' + query[0]
        querystr = remove_duplicate(stemSentence(query[1]))
        #q_lucene=MultiFieldQueryParser.parse(parser,querystr)
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        print "q_lucene: " + q_lucene.toString()
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs

        # build query object for computeScore
        #queryObj=Query_Object(query,mongoObj,w2vmodel)

        # initialize duplicate remover
        docDup = set()

        # find candidate results after 1st round filter
        candidates = PriorityQueue()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher.doc(docID)
            name = cleanSentence(d['title'].strip())
            if name in docDup:
                continue
            docDup.add(name)
            # build entity object
            entityObj = Entity_Object(d, mongoObj, w2vmodel)
            #score = computeScore(queryObj,entityObj,mongoObj,w2vmodel)
            score = hits[j].score
            candidates.put((-score, j))

        # output results from priority queue larger score first
        rank = 0
        while candidates.empty() == False and rank < 100:
            rank = rank + 1
            item = candidates.get()
            score = -item[0]
            j = item[1]  # index of hits[]
            docID = hits[j].doc
            d = searcher.doc(docID)
            title = '<dbpedia:' + d.get('title') + '>'
            res_line = query[0] + '\t' + 'Q0' + '\t' + title + '\t' + str(
                rank) + '\t' + str(score) + '\t' + 'pylucene_multifield'
            rec_result.writelines(res_line + '\n')
    rec_result.close()
Example no. 23
def main():
    global lucene_vm_init
    if not lucene_vm_init:
       lucene.initVM(vmargs=['-Djava.awt.headless=true'])
       lucene_vm_init = True
    
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path 
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    
    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    
    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher1 = IndexSearcher(reader)
    searcher1.setSimilarity(BM25Similarity())
    searcher2 = IndexSearcher(reader)
    w = IndexWriter(index_mm,config)
    # read query
    read_query()
    
    # initialize mongodb client
    mongoObj=Mongo_Object('localhost',27017)
      
    # search
    docDup=set()
    finalDup={}
    
    for i in xrange(len(queries)):
        print 'process query %d' %(i)
        query = queries[i]
        querystr = stemSentence(query[3])
        # build searcher
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher1.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs
        
        
        # find candidate results after 1st round filter
        docDup.clear()
        for j in xrange(len(hits)):
            docID=hits[j].doc
            d=searcher1.doc(docID)
            if d['title'] in docDup:
               finalDup[d['title']]=d
               continue
            docDup.add(d['title'])
            
        docDup.clear()
        for j in xrange(len(hits)):
            docID=hits[j].doc
            d=searcher1.doc(docID)
            title=d['title']
            if d['title'] in docDup:
               continue
            docDup.add(title)
            
            item=(mongoObj.conn_me).find_one({'title':title})
            if item is None:
               continue
            entitylist=item['entitylist'].split('|')
            for en_title in entitylist:
                if title==en_title:
                   continue
                t=Term('title',en_title)
                q=TermQuery(t)
                docs=searcher2.search(q,2)
                if docs.totalHits<=1:
                   continue
                
                docID2=(docs.scoreDocs)[0].doc
                doc=searcher2.doc(docID2)
                finalDup[doc['title']]=doc

    print 'begin to clean index, there are %d dup records' %(len(finalDup))
    for title in finalDup:
        doc=finalDup[title]
        # title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract
        
        name=doc['name']
        value=doc['value']
        category=doc['category']
        skos_category=doc['skos_category']
        all_text=doc['all_text']
        raw_name=doc['raw_name']
        raw_value=doc['raw_value']
        abstract=doc['abstract']
        
        print 'process '+title
        t=Term('title',title)
        q=TermQuery(t)
        w.deleteDocuments(q)
        addDoc(w,title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract)
    # process remaining records
    #global batch,cnt_batch
    #if cnt_batch>0:
       #w.addDocuments(batch)
       #cnt_batch=0
       #del batch[:]
    w.close()
Example no. 24
def lucene_retrieval_multifield(q_string,
                                q_class,
                                feature_type,
                                use_BM25=False):
    """
    multifield: a different query string for each field,
    rather than the same words on every field
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score

        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list),
                   feature_type)  # feature_type is a list of function

    text_query = QueryParser(version, 'text',
                             analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name',
                                analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur
    # MUST implies that the keyword must occur
    #  SHOULD implies that the keyword SHOULD occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5,
                                              b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Example no. 25
def createIndexSearcher(indexDir):
    directory = DirectoryReader.open(FSDirectory.open(Paths.get(indexDir)))
    searcher = IndexSearcher(directory)
    similarity = BM25Similarity(K1, B)
    searcher.setSimilarity(similarity)
    return searcher
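K1 and B are module-level constants here. Usage might look like this (path hypothetical):

    searcher = createIndexSearcher('/data/indexes/robust04')
    print(searcher.getSimilarity())  # should report the configured BM25 similarity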
Example no. 26
"""

import time
import json
import lucene
import nltk
from org.apache.lucene import queryparser, analysis
from org.apache.lucene.search.similarities import BM25Similarity
from lupyne import engine

start_time = time.time()

lucene.initVM()
dest = "lucene_wikis.index"
indexer = engine.Indexer(dest)
indexer.setSimilarity(BM25Similarity())
analyzer = analysis.standard.StandardAnalyzer()
parser = queryparser.classic.QueryParser("text", analyzer)
prediction = {}
word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
nltk.download('averaged_perceptron_tagger')
related_sentences = []


def re_process_string(s):
    s = s.replace(" ", "_")
    s = s.replace("(", "-LRB-")
    s = s.replace(")", "-RRB-")
    return s

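For example, the helper rewrites a page title into the underscore and -LRB-/-RRB- convention used by the indexed wiki dump:

    >>> re_process_string('Colin Kaepernick (quarterback)')
    'Colin_Kaepernick_-LRB-quarterback-RRB-'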
Example no. 27
# for txtName in gutenberg_list:
#   words = nltk.corpus.gutenberg.words(txtName)
#   sents = " ".join(words).split(".")
#   print(sents[:100])
# #   print("Indexing ", txtName, "...")
# #   for i in range(0, len(sents), 10):
# #     text = " ".join(sents[i:i+10])
# #     doc = Document()
# #     doc.add(Field("fieldname", text, TextField.TYPE_STORED))
# #     iwriter.addDocument(doc)
# # iwriter.close()

# now search the index
ireader = DirectoryReader.open(directory)
isearcher = IndexSearcher(ireader)

# set similarity method
bm25 = BM25Similarity()
isearcher.setSimilarity(bm25)

# parse a simple query that searches for "text"
parser = QueryParser("fieldname", analyzer)
query = parser.parse("her sister was reading")
hits = isearcher.search(query, 5).scoreDocs
print(len(hits))

for hit in hits:
    result = isearcher.doc(hit.doc)
    print("[%8.4f] %s" % (hit.score, result.get("fieldname")))
Example no. 28
    ]
    answers = [item["answer"] for item in qas]

    print("Loading Lucene Index ...")
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    analyzer = StandardAnalyzer()
    searchDir = NIOFSDirectory(Paths.get(args.index_path))
    searcher = IndexSearcher(DirectoryReader.open(searchDir))

    # try tuning the hyperparameters of bm25
    for k1 in [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2]:
        for b in [0.5, 0.6, 0.7, 0.8, 0.9]:

            print(f"Grid search.... k1: {k1}; b: {b}")

            searcher.setSimilarity(BM25Similarity(k1, b))

            parser = QueryParser('Context', analyzer)

            retrieved = []
            print("Searching ...")
            for q in tqdm(questions):
                query = parser.parse(QueryParser.escape(q))
                # print(q, "|", QueryParser.escape(q), "|", query)
                # import pdb; pdb.set_trace()
                scoreDocs = searcher.search(query, args.topk).scoreDocs
                topkDocs = []
                for hit in scoreDocs:
                    doc = searcher.doc(hit.doc)
                    topkDocs.append({
                        "title": doc.get("Title"),
Example no. 29

def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print
        print "Searching for:", command
        query = QueryParser("contents", analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name")


if __name__ == '__main__':
    STORE_DIR = "/usr/src/pylucene/aclImdb/index"
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    directory = SimpleFSDirectory(Paths.get(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()
    searcher.setSimilarity(BM25Similarity())
    run(searcher, analyzer)
    del searcher
Example no. 30
    def __init__(self,
                 lang,
                 dataset,
                 analyzer,
                 index_path=None,
                 data_path=None,
                 ram_size=2048):
        """ Returns scored documents in multiple languages.

        Parameters:
        dataset  (str): ['mlqa_dev', 'mlqa_test', 'wiki']
        lang     (str): ['en', 'es', 'de']
        analyzer (str): ['en', 'es', 'de', 'standard']
        ram_size (int): Size of memory used while indexing

        Returns:
        """
        super().__init__()

        idxdir = self.get_index(lang, dataset, index_path)
        self.mlqa = True
        if dataset == 'mlqa_dev':
            self.dataset = MLQADataset('dev', lang, lang, data_path)
        elif dataset == 'mlqa_test':
            self.dataset = MLQADataset('test', lang, lang, data_path)
        elif dataset == 'wiki':
            self.mlqa = False
            self.dataset = Wiki(lang, data_path)
        else:
            raise RuntimeError("No dataloader for {}".format(dataset))

        # stores index files; SimpleFSDirectory has poor concurrency, try NIOFSDirectory instead
        store = SimpleFSDirectory(Paths.get(idxdir))
        # limit max. number of tokens per document.
        # analyzer will not consume more tokens than that
        #analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        # configuration for index writer
        config = IndexWriterConfig(analyzers[analyzer]())
        # creates or overwrites index
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        # setting similarity BM25Similarity(k1=1.2,b=0.75)
        similarity = BM25Similarity(self.k1, self.b)
        config.setSimilarity(similarity)
        config.setRAMBufferSizeMB(float(ram_size))
        # create index writer
        self.writer = IndexWriter(store, config)

        self.ftdata = FieldType()
        self.ftmeta = FieldType()
        # IndexSearcher will return value of the field
        self.ftdata.setStored(True)
        self.ftmeta.setStored(True)
        # will be analyzed by Analyzer
        self.ftdata.setTokenized(True)
        self.ftmeta.setTokenized(False)
        # which postings information is stored (probably DOCS would be sufficient)
        # DOCS: Only documents are indexed: term frequencies and positions are omitted.
        #       Phrase and other positional queries on the field will throw an exception,
        #       and scoring will behave as if any term in the document appears only once.
        # DOCS_AND_FREQS: Only documents and term frequencies are indexed: positions are
        #       omitted. This enables normal scoring, except Phrase and other positional
        #       queries will throw an exception.
        # DOCS_AND_FREQS_AND_POSITIONS: Indexes documents, frequencies and positions.
        #       This is a typical default for full-text search: full scoring is enabled
        #       and positional queries are supported.
        self.ftdata.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        self.ftmeta.setIndexOptions(IndexOptions.DOCS)
        # instantiate some reusable objects
        # TODO: create document, add fields then change only field value and
        # re-add document
        self.doc = Document()
        # Id cannot be reused because there are multiple values
        # I could store a list of fields and add one if it's not enough
        #self.fieldId = Field("id", "dummy", self.ftmeta)
        self.fieldTitle = Field("title", "dummy", self.ftdata)
        self.doc.add(self.fieldTitle)
        self.fieldContext = Field("context", "dummy", self.ftdata)
        self.doc.add(self.fieldContext)
        self.fieldIds = [Field("id", "dummy", self.ftmeta)]