Example #1
    def __init__(self):

        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print('Creating index at', prm.index_folder)
            add_terms = (prm.docs_path == prm.docs_path_term)
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print('copying index from', prm.index_folder, 'to', prm.local_index_folder)
            if os.path.exists(prm.local_index_folder):
                print('Folder', prm.local_index_folder, 'already exists! Doing nothing.')
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        # open the resolved folder (the local copy when one was made)
        fsDir = MMapDirectory(Paths.get(self.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print('Creating index at', prm.index_folder_term)
                self.create_index(prm.index_folder_term,
                                  prm.docs_path_term,
                                  add_terms=True)

            if prm.local_index_folder_term:
                print('copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term)
                if os.path.exists(prm.local_index_folder_term):
                    print('Folder', prm.local_index_folder_term, 'already exists! Doing nothing.')
                else:
                    shutil.copytree(prm.index_folder_term,
                                    prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(self.index_folder_term))
            self.searcher_term = IndexSearcher(
                DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print('Loading Title-ID mapping...')
        self.title_id_map, self.id_title_map = self.get_title_id_map()
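
For reference, a minimal sketch of how a searcher and analyzer built this way are typically queried. The field name "text", the helper name query_index, and the result handling are assumptions, not part of the example above:

from org.apache.lucene.queryparser.classic import QueryParser

def query_index(searcher, analyzer, query_str, n=10):
    # parse the raw query string against an assumed default field and
    # return the stored documents for the top n hits
    query = QueryParser("text", analyzer).parse(query_str)
    hits = searcher.search(query, n).scoreDocs
    return [searcher.doc(hit.doc) for hit in hits]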
Example #2
    def __init__(self, path, settings):
        self._settings = settings
        self._multithreaded = settings.multithreaded
        self._checker = DirectSpellChecker()
        indexDirectory = MMapDirectory(File(join(path, 'index')))
        indexDirectory.setUseUnmap(False)
        taxoDirectory = MMapDirectory(File(join(path, 'taxo')))
        taxoDirectory.setUseUnmap(False)
        conf = IndexWriterConfig(Version.LUCENE_4_10_0, settings.analyzer)
        conf.setSimilarity(settings.similarity)
        mergePolicy = TieredMergePolicy()
        mergePolicy.setMaxMergeAtOnce(settings.maxMergeAtOnce)
        mergePolicy.setSegmentsPerTier(settings.segmentsPerTier)
        conf.setMergePolicy(mergePolicy)

        if not settings.readonly:
            self._indexWriter = IndexWriter(indexDirectory, conf)
            self._indexWriter.commit()
            self._taxoWriter = DirectoryTaxonomyWriter(taxoDirectory, IndexWriterConfig.OpenMode.CREATE_OR_APPEND, LruTaxonomyWriterCache(settings.lruTaxonomyWriterCacheSize))
            self._taxoWriter.commit()

        self._indexAndTaxonomy = IndexAndTaxonomy(settings, indexDirectory, taxoDirectory)
        self._readerSettingsWrapper = self._indexAndTaxonomy._readerSettingsWrapper

        self._facetsConfig = settings.fieldRegistry.facetsConfig

        self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader())
Example #3
def main():
    LUCENE_INDEX_DIR = 'mmapDirectory/trec_v15_wikipedia_stemmed_v2'
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except Exception:
        print('JavaVM already running')

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    #config=config.setRAMBufferSizeMB(1024.0)  # experimental setting !!
    # write data to index

    if not is_index_Exist:
        #if True:
        print('begin backup code files')
        system_flag = platform.system()
        if system_flag == 'Windows':
            os.system(r'robocopy %s %s\code_files *.py' %
                      (r'%cd%', LUCENE_INDEX_DIR))
        else:
            os.system('mkdir %s/code_files' % LUCENE_INDEX_DIR)
            os.system('cp *.py %s/code_files' % LUCENE_INDEX_DIR)

        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
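
The platform-specific shell commands above (robocopy on Windows, mkdir/cp elsewhere) can be replaced with a portable standard-library version. A minimal sketch; the helper name backup_code_files is an assumption:

import glob
import os
import shutil

def backup_code_files(index_dir):
    # copy the working directory's *.py files into <index_dir>/code_files,
    # mirroring what the robocopy/cp branches above do
    target = os.path.join(index_dir, 'code_files')
    os.makedirs(target, exist_ok=True)
    for path in glob.glob('*.py'):
        shutil.copy(path, target)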
Example #4
    def create_index(self, index_folder):
        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(True)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(
            MySimpleAnalyzer(
                CharArraySet(collections.JavaSet(utils.STOPWORDS), True)))
        writerConfig.setSimilarity(MyTFIDFSimilarity())
        writerConfig.setRAMBufferSizeMB(16384.0)  # 16 GB
        self.writer = IndexWriter(fsDir, writerConfig)
        logger.info(f"{self.writer.numDocs()} docs in index")
        logger.info("Indexing documents...")

        doc_ids = self.doc_db.get_doc_ids()
        for doc_id in tqdm(doc_ids, total=len(doc_ids)):
            text = self.doc_db.get_doc_text(doc_id)
            tokens = self.doc_db.get_doc_tokens(doc_id)
            self.add_doc(doc_id, text, tokens)

        logger.info(f"Indexed {self.writer.numDocs()} docs.")
        self.writer.forceMerge(1)  # to increase search performance
        self.writer.close()
Example #5
    def __init__(self,
                 LUCENE_INDEX_DIR,
                 similarity='BM25',
                 lucene_vm_flag=False,
                 is_bigram_cache_used=False,
                 mongoObj=None):
        if not lucene_vm_flag:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
        self.index_dir = LUCENE_INDEX_DIR
        self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
        self.analyzer = SimpleAnalyzer()
        self.config = IndexWriterConfig(self.analyzer)
        self.reader = DirectoryReader.open(self.index_mm)
        self.searchers = []
        self.searchers.append(IndexSearcher(self.reader))
        if similarity == 'BM25':
            self.searchers[0].setSimilarity(BM25Similarity())

        # load bigram cache
        self.is_bigram_cache_used = is_bigram_cache_used
        if is_bigram_cache_used:
            separator = '/' if '/' in self.index_dir else '\\'
            index_name = self.index_dir.split(separator)[-1]
            self.index_name = index_name
            self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
            self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
            if 'stemmed_wikipedia' in LIST_F or 'wikipedia' in LIST_F:
                self.conn_mapping_prob_cache = mongoObj.db[
                    index_name + '_mapping_prob_cache_with_wikipedia']
            else:
                self.conn_mapping_prob_cache = mongoObj.db[
                    index_name + '_mapping_prob_cache']
Example #6
    def __init__(self,
                 LUCENE_INDEX_DIR,
                 similarity='BM25',
                 lucene_vm_flag=False,
                 is_bigram_cache_used=False,
                 mongoObj=None):
        if not lucene_vm_flag:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
        self.index_dir = LUCENE_INDEX_DIR
        self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
        #self.analyzer = StandardAnalyzer()
        self.analyzer = SimpleAnalyzer()
        self.config = IndexWriterConfig(self.analyzer)
        self.reader = DirectoryReader.open(self.index_mm)
        self.searcher = IndexSearcher(self.reader)
        self.dict_term_freq = {}
        if similarity == 'BM25':
            self.searcher.setSimilarity(BM25Similarity())

        # load bigram cache
        self.is_bigram_cache_used = is_bigram_cache_used
        if is_bigram_cache_used:
            separator = '/' if '/' in self.index_dir else '\\'
            index_name = self.index_dir.split(separator)[-1]
            self.index_name = index_name
            self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
            self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
Example #7
def main():
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except Exception:
        print('JavaVM already running')

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)
    config = config.setRAMBufferSizeMB(1024.0)
    # write data to index

    if not is_index_Exist:
        print('begin backup code files')
        system_flag = platform.system()
        cmd = (r'robocopy %s %s\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR)
               if system_flag == 'Windows'
               else 'cp -f *.py %s/code_files' % LUCENE_INDEX_DIR)
        os.system(cmd)

        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
Example #8
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    #index_mm = SimpleFSDirectory(Paths.get(LUCENE_INDEX_DIR))
    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    #config.setCodec(lucene50)
    #config.setSimilarity(BM25Similarity())
    # load index to search engine
    #reader = DirectoryReader.open(index_mm)
    #searcher1 = IndexSearcher(reader)
    #searcher1.setSimilarity(BM25Similarity())
    #searcher2 = IndexSearcher(reader)
    #w = IndexWriter(index_mm,config)
    #upgrader = IndexUpgrader(index_mm,config,True)
    upgrader = IndexUpgrader(index_mm)
    print('begin to upgrade')
    upgrader.upgrade()
    # read query
    #read_query()

    # initialize mongodb client
    #mongoObj=Mongo_Object('localhost',27017)
    print('finish upgrade')
Example #9
    def create_index(self, index_folder, docs_path, add_terms=False):
        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0

        import corpus_hdf5
        corpus = corpus_hdf5.CorpusHDF5(docs_path)
        for txt in corpus.get_text_iter():
            title = corpus.get_article_title(doc_id)
            self.add_doc(doc_id, title, txt, add_terms)
            if doc_id % 1000 == 0:
                print('indexing doc', doc_id)
            doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()
Example #10
def search(command):
    STORE_DIR = "index"
    # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = MMapDirectory(Paths.get(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SmartChineseAnalyzer()
    ans = run(searcher, analyzer, command)
    del searcher
    return ans


# vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
# for y in search('二三四五'):
#     print(y)
Example #11
 def __init__(self,
              LUCENE_INDEX_DIR,
              similarity='BM25',
              lucene_vm_flag=False):
     if not lucene_vm_flag:
         lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     self.lucene_vm_init = True
     self.index_dir = LUCENE_INDEX_DIR
     self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
     self.analyzer = StandardAnalyzer()
     self.config = IndexWriterConfig(self.analyzer)
     self.reader = DirectoryReader.open(self.index_mm)
     self.searcher = IndexSearcher(self.reader)
     self.dict_term_freq = {}
     self.dict_doc_field_title = {}
     if similarity == 'BM25':
         self.searcher.setSimilarity(BM25Similarity())
Example #12
    def __init__(self, args):

        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.args = args

        index_folder = os.path.join(DATA_DIR, args.index_folder)
        if not os.path.exists(index_folder):
            self.doc_db = DocDB()
            logger.info(f'Creating index at {index_folder}')
            self.create_index(index_folder)

        fsDir = MMapDirectory(Paths.get(index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))
        self.searcher.setSimilarity(MyTFIDFSimilarity())
        self.analyzer = MySimpleAnalyzer(
            CharArraySet(collections.JavaSet(utils.STOPWORDS), True))
        self.pool = ThreadPool(processes=args.num_search_workers)
Example #13
    def create_index(self, index_folder, docs_path, add_terms=False):

        print('Loading Vocab...')
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)
       
        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print("%d docs in index" % self.writer.numDocs())
        print("Indexing documents...")


        # import corpus_hdf5
        # corpus = corpus_hdf5.MSMARCOCorpusHDF5(docs_path)
        import pickle
        with open(docs_path, "rb") as read_file:
            corpus = pickle.load(read_file)
        idx_cnt = 0
        # for doc_id, txt in zip(corpus.get_id_iter(), corpus.get_text_iter()):
        # for doc_id, txt in corpus.items():
        for txt in corpus:
            self.add_doc(idx_cnt, txt, add_terms)  # not lowered
            if idx_cnt % 1000 == 0:
                print('indexing doc', idx_cnt)
            idx_cnt += 1
        print("Index of %d docs..." % self.writer.numDocs())
        self.writer.close()
Example #14
    def _create_index(self, index_dir: str) -> None:
        """Index documents

        Parameters
        ----------
        index_dir : str
            The dir to store index
        """
        os.mkdir(index_dir)

        TITLE_FIELD = FieldType()  # pylint: disable=invalid-name
        TITLE_FIELD.setStored(True)
        TITLE_FIELD.setIndexOptions(IndexOptions.DOCS)

        TEXT_FIELD = FieldType()  # pylint: disable=invalid-name
        TEXT_FIELD.setStored(True)
        TEXT_FIELD.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        fs_dir = MMapDirectory(Paths.get(index_dir))
        writer_config = IndexWriterConfig(StandardAnalyzer())
        writer_config.setRAMBufferSizeMB(16384.0)  # 16 GB
        self.writer = IndexWriter(fs_dir, writer_config)
        logger.info("%d docs in index", self.writer.numDocs())
        logger.info("Indexing documents...")

        doc_ids = self.doc_db.get_doc_ids()
        for doc_id in tqdm(doc_ids, total=len(doc_ids)):
            text = self.doc_db.get_doc_text(doc_id)

            doc = Document()
            doc.add(Field("title", doc_id, TITLE_FIELD))
            doc.add(Field("text", text, TEXT_FIELD))

            self.writer.addDocument(doc)

        logger.info("Indexed %d docs.", self.writer.numDocs())
        self.writer.forceMerge(1)  # to increase search performance
        self.writer.close()
Example #15
    def __init__(self,
                 index_dir: str,
                 db_path: str = None,
                 num_search_workers: int = 8) -> None:

        self.env = lucene.getVMEnv()  # pylint: disable=no-member
        if not self.env:
            self.env = lucene.initVM(
                initialheap='28g',  # pylint: disable=no-member
                maxheap='28g',
                vmargs=['-Djava.awt.headless=true'])

        self.num_search_workers = num_search_workers

        if not os.path.exists(index_dir):
            self.doc_db = DocDB(db_path=db_path)
            logger.info('Creating index at %s', index_dir)
            self._create_index(index_dir)

        fs_dir = MMapDirectory(Paths.get(index_dir))
        self.searcher = IndexSearcher(DirectoryReader.open(fs_dir))
        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=num_search_workers)
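
A caveat this pattern depends on: worker threads spawned by the ThreadPool must attach themselves to the JVM before calling into Lucene, via the env handle kept by the constructor. A minimal sketch; the method name _search is an assumption:

    def _search(self, query_str):
        # PyLucene requires each OS thread to attach to the JVM once
        # before it touches any Lucene object
        self.env.attachCurrentThread()
        ...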
Example #16
def main():
    if len(sys.argv) < 2:
        print('error: too few arguments')
        print('command:  python build_index_wikipedia.py FILENAME')
        quit()

    # create file object
    filename = sys.argv[1]
    print('processing ' + filename)

    cnt = 0
    stemmer = SnowballStemmer('english')

    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except Exception:
        print('JavaVM already running')

    LUCENE_INDEX_DIR = 'mmapDirectory\\index_wikipedia_2015'
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    #config=config.setRAMBufferSizeMB(1024.0)  # experimental setting !!

    # write data to index
    if not is_index_Exist:
        print('begin backup code files')
        system_flag = platform.system()
        cmd = (r'robocopy %s %s\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR)
               if system_flag == 'Windows'
               else 'cp *.py %s/code_files' % LUCENE_INDEX_DIR)
        os.system(cmd)

    # the writer is used below for every article, so create it
    # regardless of whether the backup branch ran
    w = IndexWriter(index_mm, config)

    data = {}
    with open(filename, 'r', encoding='utf-8') as src:
        for page_pair in extract_pages(src):
            label, content, page_id = page_pair[0], page_pair[1], page_pair[2]

            pair_tokens = process_article((content, False, label, page_id))
            content = remove_stopwords(' '.join(pair_tokens[0]), ' ')

            if len(content.split()) < 10:
                continue

            stemmed_content = stemSentence(content, stemmer, False)

            if DEBUG_MODE:
                try:
                    print('%s\n%s\n%s\n%s' %
                          (label, page_id, content, stemmed_content))
                except Exception:
                    print('encoding error')

            data.clear()
            data['label'] = (label, 'StringField')
            data['label_lower'] = (label.lower(), 'StringField')
            data['label_lower_text'] = (label.lower(), 'TextField')
            data['wiki_id'] = (page_id, 'StringField')
            #data['content']=(content,'TextField')
            data['stemmed_content'] = (stemmed_content, 'TextField')
            addDoc(w, data)

            cnt += 1
            # if cnt > 20:
            #     break
            if cnt % 5000 == 0:
                print('finish %d' % (cnt))

    w.close()
Example #17
    def create_index(self, index_folder, docs_path, add_terms=False):

        print('Loading Vocab...')
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        if add_terms:
            if prm.top_tfidf > 0 or prm.idf_path:
                print('Creating IDF dictionary...')
                self.idf = defaultdict(int)
                doc_id = 0
                if docs_path.lower().endswith('.hdf5'):
                    import corpus_hdf5
                    corpus = corpus_hdf5.CorpusHDF5(docs_path)
                    for txt in corpus.get_text_iter():
                        self.add_idf(txt)

                        if doc_id % 1000 == 0:
                            print('Creating IDF, doc', doc_id)
                        doc_id += 1

                else:
                    # ClueWeb09
                    import warc
                    import gzip
                    from bs4 import BeautifulSoup
                    # list all files in the folder.
                    paths = []
                    for root, directories, filenames in os.walk(docs_path):
                        for filename in filenames:
                            paths.append(os.path.join(root, filename))

                    for path in paths:
                        with gzip.open(path, mode='rb') as gzf:
                            for record in warc.WARCFile(fileobj=gzf):
                                # remove html tags
                                txt = BeautifulSoup(
                                    record.payload[:1000 * 1000],
                                    "lxml").get_text()
                                # remove WARC headers.
                                txt = '\n'.join(txt.split('\n')[10:])

                                self.add_idf(txt)

                                if doc_id % 1000 == 0:
                                    print('Creating IDF, doc', doc_id)
                                doc_id += 1

                for key, val in self.idf.items():
                    self.idf[key] = math.log(float(doc_id) / val)

                with open(prm.idf_path, 'wb') as f:
                    pkl.dump(self.idf, f)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0
        if docs_path.lower().endswith('.hdf5'):
            import corpus_hdf5
            corpus = corpus_hdf5.CorpusHDF5(docs_path)
            for txt in corpus.get_text_iter():
                title = corpus.get_article_title(doc_id)
                self.add_doc(doc_id, title, txt, add_terms)
                if doc_id % 1000 == 0:
                    print('indexing doc', doc_id)
                doc_id += 1
        else:
            # ClueWeb09
            import warc
            import gzip
            from bs4 import BeautifulSoup

            # list all files in the folder.
            paths = []
            for root, directories, filenames in os.walk(docs_path):
                for filename in filenames:
                    paths.append(os.path.join(root, filename))

            for path in paths:
                with gzip.open(path, mode='rb') as gzf:
                    for record in warc.WARCFile(fileobj=gzf):
                        if 'warc-trec-id' in record:
                            title = record['warc-trec-id']
                        else:
                            title = record['warc-record-id']
                        # remove html tags
                        #txt = BeautifulSoup(record.payload[:1000*1000], "lxml").get_text()
                        txt = record.payload[:1000 * 1000]
                        # remove WARC headers.
                        txt = '\n'.join(txt.split('\n')[10:])

                        self.add_doc(doc_id, title, txt, add_terms)
                        if doc_id % 1000 == 0:
                            print('indexing doc', doc_id)
                        doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()
Example #18
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())

    # read query
    read_query()

    # initialize mongodb client
    mongoObj = Mongo_Object('localhost', 27017)

    # initialize word2vec
    print('load word2vec model')
    w2vmodel = gensim.models.Word2Vec.load_word2vec_format(
        "F:\\modified_w2v\\w2v_wiki_trigram_phrase_20170101\\wiki.en.text.vector.binary",
        binary=True)
    print('finish loading word2vec model')

    # search
    global hitsPerPage
    fields = ['name', 'value']
    #parser=MultiFieldQueryParser(fields,analyzer)
    #parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    rec_result = open('pylucene.runs', 'w')

    for i in range(len(queries)):
        query = queries[i]
        print('processing query ' + str(i) + ':' + query[0])
        querystr = remove_duplicate(stemSentence(query[1]))
        #q_lucene=MultiFieldQueryParser.parse(parser,querystr)
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        print("q_lucene: " + q_lucene.toString())
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs

        # build query object for computeScore
        #queryObj=Query_Object(query,mongoObj,w2vmodel)

        # initialize duplicate remover
        docDup = set()

        # find candidate results after 1st round filter
        candidates = PriorityQueue()
        for j in range(len(hits)):
            docID = hits[j].doc
            d = searcher.doc(docID)
            name = cleanSentence(d['title'].strip())
            if name in docDup:
                continue
            docDup.add(name)
            # build entity object
            entityObj = Entity_Object(d, mongoObj, w2vmodel)
            #score = computeScore(queryObj,entityObj,mongoObj,w2vmodel)
            score = hits[j].score
            candidates.put((-score, j))

        # output results from priority queue larger score first
        rank = 0
        while not candidates.empty() and rank < 100:
            rank = rank + 1
            item = candidates.get()
            score = -item[0]
            j = item[1]  # index of hits[]
            docID = hits[j].doc
            d = searcher.doc(docID)
            title = '<dbpedia:' + d.get('title') + '>'
            res_line = query[0] + '\t' + 'Q0' + '\t' + title + '\t' + str(
                rank) + '\t' + str(score) + '\t' + 'pylucene_multifield'
            rec_result.writelines(res_line + '\n')
    rec_result.close()
Example #19
def main():
    if len(sys.argv) < 2:
        print('error: too few arguments')
        print('command:  python create_category_corpus.py NUMBER_TOP_CATEGORY')
        quit()

    NUMBER_TOP_CATEGORY = int(sys.argv[1])
    print('NUMBER_TOP_CATEGORY=%d' % (NUMBER_TOP_CATEGORY))

    print('loading category profiles')
    profile = load_zipped_pickle('category_profiles_dbpedia_201510.gz')
    print('finish loading category profiles')

    system_flag = platform.system()
    cwd = os.getcwd()

    # initialize mongo client
    if system_flag == 'Windows':
        client = pymongo.MongoClient("localhost", 27017)
    else:
        client = pymongo.MongoClient("localhost", 58903)

    db = client.wiki2015
    wiki_article_categories = db['article_categories']

    category_corpus = {}

    pkl_filename = 'category_dbpedia_corpus_top%d_fsdm3.pkl.gz' % (
        NUMBER_TOP_CATEGORY)
    if system_flag == 'Windows':
        lucene_dbpedia_fsdm = Lucene_Object('mmapDirectory\\dbpedia_v2_FSDM3',
                                            'BM25', True)
    else:
        lucene_dbpedia_fsdm = Lucene_Object(
            '%s/mmapDirectory/dbpedia_v2_FSDM3' % (cwd), 'BM25', True)

    cnt = 0
    if os.path.exists(pkl_filename):
        #if False==True:
        print('loading category corpus')
        category_corpus = load_zipped_pickle(pkl_filename)
    else:

        for item in wiki_article_categories.find():
            list_category = item['categories'].strip().split('|')
            uri_article = item['uri']
            title = findTitle(uri_article)

            entity_content_dict = {}
            doc_entity = lucene_dbpedia_fsdm.findEntityDocFromIndex(
                title, 'title', False)
            if doc_entity is None:
                continue

            for f in [
                    'names', 'attributes', 'categories', 'similar_entities',
                    'related_entities', 'catchall'
            ]:
                entity_content_dict[f] = doc_entity[f]
                entity_content_dict['stemmed_' + f] = doc_entity['stemmed_' + f]

            if len(entity_content_dict['catchall'].strip()) == 0:
                continue

            for cat in list_category[:NUMBER_TOP_CATEGORY]:
                if ('<http://dbpedia.org/resource/Category:' + cat +
                        '>') not in profile:
                    continue
                if cat not in category_corpus:
                    category_corpus[cat] = []
                if len(category_corpus[cat]) < 300:
                    category_corpus[cat].append(entity_content_dict)

            #cnt+=1
            #if cnt>20:
            #break

        print('saving corpus to pkl.gz')
        save_zipped_pickle(category_corpus, pkl_filename)
    client.close()

    # begin write the data into index
    print('begin write into index')
    if system_flag == 'Windows':
        LUCENE_INDEX_DIR = 'mmapDirectory\\category_corpus_dbpedia201510_top' + str(
            NUMBER_TOP_CATEGORY) + '_fsdm3'
    else:
        LUCENE_INDEX_DIR = '%s/mmapDirectory/category_corpus_dbpedia201510_top' % (
            cwd) + str(NUMBER_TOP_CATEGORY) + '_fsdm3'

    # backup code files
    cmd = (r'robocopy %s %s\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR)
           if system_flag == 'Windows'
           else 'cp *.py %s/code_files' % LUCENE_INDEX_DIR)
    os.system(cmd)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)

    # write data to index
    w = IndexWriter(index_mm, config)

    cnt = 0
    data = {}
    max_article_num = 0
    stemmer = SnowballStemmer('english')
    for cat, list_entity_dict in category_corpus.items():
        cat_label = cleanSentence(cat, True)
        data.clear()
        data['category'] = (cat, 'StringField')
        data['label'] = (cat_label, 'CUSTOM_FIELD_TEXT')
        data['stemmed_label'] = (stemSentence(cat_label, stemmer, True),
                                 'CUSTOM_FIELD_TEXT')
        data['num_articles'] = (len(list_entity_dict), 'INTEGER_STORED')

        if data['num_articles'][0] > max_article_num:
            max_article_num = data['num_articles'][0]

        for f in [
                'names', 'attributes', 'categories', 'similar_entities',
                'related_entities', 'catchall'
        ]:
            contents = cleanSentence(
                ' '.join([dic[f] for dic in list_entity_dict]), True, ' ')
            data[f] = (contents, 'CUSTOM_FIELD_TEXT_NOT_STORED')
            data['stemmed_' + f] = (stemSentence(contents, stemmer, False),
                                    'CUSTOM_FIELD_TEXT_NOT_STORED')
        #print ('--------------------')
        # need to calculate corpus average length
        addDoc(w, data)

        #cnt+=1
        #if cnt>20:
        #break

    w.close()
    print('max article num=%d' % (max_article_num))
Example #20
            sys.exit(1)
        try:
            print("creating index:", fname)
            IndexFiles(sys.argv[1], fname, analyzer, not opts.all_line)
            end = datetime.now()
            print(end - start)
        except Exception as e:
            print("Failed: ", e)
            raise e
        print("loading index:", fname)

    else:
        print("creating index...")
        if opts.simple_fs:
            directory = SimpleFSDirectory(File(fname))
        else:
            directory = MMapDirectory.open(File(fname))

        dir_reader = DirectoryReader.open(directory)
        searcher = IndexSearcher(dir_reader)

        index = Index(searcher, analyzer, opts.verbose)

        if opts.server:
            from . import server
            server.run(opts.port, index)
        else:
            index.prompt(opts.max_n)

        del searcher
Example #21
    def openStore(self):

        return MMapDirectory(Paths.get(self.STORE_DIR))
Example #22
    def openStore(self):

        return MMapDirectory(File(self.STORE_DIR))
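
Examples #21 and #22 differ only in how the store path is passed: Lucene 5+ expects a java.nio.file.Path built with Paths.get, while Lucene 4.x took a java.io.File. A minimal sketch of the modern form, with an assumed directory string:

from java.nio.file import Paths
from org.apache.lucene.store import MMapDirectory

store = MMapDirectory(Paths.get('/tmp/lucene-index'))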
Example #23
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path 
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    
    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    
    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher1 = IndexSearcher(reader)
    searcher1.setSimilarity(BM25Similarity())
    searcher2 = IndexSearcher(reader)
    w = IndexWriter(index_mm,config)
    # read query
    read_query()
    
    # initialize mongodb client
    mongoObj = Mongo_Object('localhost', 27017)
      
    # search
    docDup = set()
    finalDup = {}
    
    for i in range(len(queries)):
        print('process query %d' % i)
        query = queries[i]
        querystr = stemSentence(query[3])
        # build searcher
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher1.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs
        
        
        # find candidate results after 1st round filter
        docDup.clear()
        for j in range(len(hits)):
            docID = hits[j].doc
            d = searcher1.doc(docID)
            if d['title'] in docDup:
                finalDup[d['title']] = d
                continue
            docDup.add(d['title'])

        docDup.clear()
        for j in range(len(hits)):
            docID = hits[j].doc
            d = searcher1.doc(docID)
            title = d['title']
            if title in docDup:
                continue
            docDup.add(title)

            item = mongoObj.conn_me.find_one({'title': title})
            if item is None:
                continue
            entitylist = item['entitylist'].split('|')
            for en_title in entitylist:
                if title == en_title:
                    continue
                t = Term('title', en_title)
                q = TermQuery(t)
                docs = searcher2.search(q, 2)
                if docs.totalHits <= 1:
                    continue

                docID2 = docs.scoreDocs[0].doc
                doc = searcher2.doc(docID2)
                finalDup[doc['title']] = doc

    print('begin to clean index, there are %d dup records' % len(finalDup))
    for title in finalDup:
        doc = finalDup[title]
        # title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract

        name = doc['name']
        value = doc['value']
        category = doc['category']
        skos_category = doc['skos_category']
        all_text = doc['all_text']
        raw_name = doc['raw_name']
        raw_value = doc['raw_value']
        abstract = doc['abstract']

        print('process ' + title)
        t = Term('title', title)
        q = TermQuery(t)
        w.deleteDocuments(q)
        addDoc(w, title, name, value, category, skos_category, all_text,
               raw_name, raw_value, abstract)
    # process remaining records
    #global batch,cnt_batch
    #if cnt_batch>0:
       #w.addDocuments(batch)
       #cnt_batch=0
       #del batch[:]
    w.close()
Example #24
 def __init__(self, path, lruTaxonomyWriterCacheSize=100):
     Observable.__init__(self)
     taxoDirectory = MMapDirectory(File(path))
     taxoDirectory.setUseUnmap(False)
     self._taxoWriter = DirectoryTaxonomyWriter(taxoDirectory, IndexWriterConfig.OpenMode.CREATE_OR_APPEND, LruTaxonomyWriterCache(lruTaxonomyWriterCacheSize))
Example #25
 def __init__(self, index_dir_ngram='./ngram', index_dir_vocab='./vocab'):
     self._index_dir_ngram = index_dir_ngram
     self._index_dir_vocab = index_dir_vocab
     fs = MMapDirectory.open(File(index_dir_ngram))
     self._searcher_ngram = IndexSearcher(DirectoryReader.open(fs))
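
Note that MMapDirectory.open here is the static FSDirectory.open inherited from the base class, so it returns whichever FSDirectory implementation Lucene considers best for the platform, not necessarily a memory-mapped one. Constructing MMapDirectory directly guarantees mmap; a minimal sketch against the Lucene 4.x File-based API, with the path taken from the example above:

from java.io import File
from org.apache.lucene.store import MMapDirectory

fs = MMapDirectory(File('./ngram'))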
Example #26
    def __init__(self, DATA_DIR, vocab, n_threads, max_terms_per_doc,
                 index_name, index_name_term, docs_path, docs_path_term,
                 use_cache):
        self.n_threads = n_threads
        # folders to store Lucene's indices; created if they do not exist
        self.index_folder = DATA_DIR + '/data/' + index_name + '/'
        self.index_folder_term = DATA_DIR + '/data/' + index_name_term + '/'
        self.local_index_folder = './' + index_name
        self.local_index_folder_term = './' + index_name_term
        self.use_cache = use_cache
        self.docs_path = docs_path
        self.docs_path_term = docs_path_term
        self.max_terms_per_doc = max_terms_per_doc

        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = vocab

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(self.index_folder):
            print('Creating index at', self.index_folder)
            add_terms = (self.docs_path == self.docs_path_term)
            self.create_index(self.index_folder, self.docs_path, add_terms)

        if self.local_index_folder:
            print('copying index from', self.index_folder, 'to', self.local_index_folder)
            if os.path.exists(self.local_index_folder):
                print('Folder', self.local_index_folder, 'already exists! Doing nothing.')
            else:
                shutil.copytree(self.index_folder, self.local_index_folder)
            self.index_folder = self.local_index_folder

        fsDir = MMapDirectory(Paths.get(self.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        if self.docs_path != self.docs_path_term:
            if not os.path.exists(self.index_folder_term):
                print('Creating index at', self.index_folder_term)
                self.create_index(self.index_folder_term,
                                  self.docs_path_term,
                                  add_terms=True)

            if self.local_index_folder_term:
                print('copying index from', self.index_folder_term, 'to', self.local_index_folder_term)
                if os.path.exists(self.local_index_folder_term):
                    print('Folder', self.local_index_folder_term, 'already exists! Doing nothing.')
                else:
                    shutil.copytree(self.index_folder_term,
                                    self.local_index_folder_term)
                self.index_folder_term = self.local_index_folder_term
            fsDir_term = MMapDirectory(Paths.get(self.index_folder_term))
            self.searcher_term = IndexSearcher(
                DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=self.n_threads)
        self.cache = {}

        print('Loading Title-ID mapping...')
        self.title_id_map, self.id_title_map = self.get_title_id_map()