Example #1
    def create_index(self, index_folder):
        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(True)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(
            MySimpleAnalyzer(
                CharArraySet(collections.JavaSet(utils.STOPWORDS), True)))
        writerConfig.setSimilarity(MyTFIDFSimilarity())
        writerConfig.setRAMBufferSizeMB(16384.0)  # 16 GB RAM buffer
        self.writer = IndexWriter(fsDir, writerConfig)
        logger.info(f"{self.writer.numDocs()} docs in index")
        logger.info("Indexing documents...")

        doc_ids = self.doc_db.get_doc_ids()
        for doc_id in tqdm(doc_ids, total=len(doc_ids)):
            text = self.doc_db.get_doc_text(doc_id)
            tokens = self.doc_db.get_doc_tokens(doc_id)
            self.add_doc(doc_id, text, tokens)

        logger.info(f"Indexed {self.writer.numDocs()} docs.")
        self.writer.forceMerge(1)  # to increase search performance
        self.writer.close()
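Example #1 calls an `add_doc` helper that is not shown. A minimal sketch of what it might look like, assuming `t1` holds an exact-match id, `t2` the searchable text, and `t3` a stored-only token payload; only the `add_doc(doc_id, text, tokens)` signature comes from the call site, the field names are assumptions:

    from org.apache.lucene.document import Document, Field

    def add_doc(self, doc_id, text, tokens):
        doc = Document()
        doc.add(Field("id", doc_id, self.t1))    # DOCS only: exact-match lookups
        doc.add(Field("text", text, self.t2))    # freqs + positions: phrase queries work
        doc.add(Field("tokens", " ".join(tokens), self.t3))  # stored but not indexed
        self.writer.addDocument(doc)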
Example #2
 def publish_services(self, service_list):
     transformer = WSDLTransformer()
     current_document = 1
     indexDir = SimpleFSDirectory(File("index/"))
     writerConfig = IndexWriterConfig(
         Version.LUCENE_CURRENT, EnglishAnalyzer(Version.LUCENE_CURRENT))
     writerConfig.setSimilarity(BM25Similarity())
     index_writer = IndexWriter(indexDir, writerConfig)
     for wsdl in service_list:
         if self._document_expansion:
             #bag_of_words = ' '.join(self._preprocessor(self._semantic_transformer.transform(transformer.transform(wsdl))))
             bag_of_words = ' '.join(
                 self._semantic_transformer.transform(
                     transformer.transform(wsdl)))
         else:
             #bag_of_words = ' '.join(self._preprocessor(transformer.transform(wsdl)))
             bag_of_words = ' '.join(transformer.transform(wsdl))
         doc = Document()
         doc.add(
             Field("content", bag_of_words, Field.Store.YES,
                   Field.Index.ANALYZED))
         doc.add(Field("path", wsdl, Field.Store.YES, Field.Index.NO))
         index_writer.addDocument(doc)
         current_document += 1
     index_writer.close()
Example #3
    def __init__(self, path, settings):
        self._settings = settings
        self._multithreaded = settings.multithreaded
        self._checker = DirectSpellChecker()
        indexDirectory = MMapDirectory(File(join(path, 'index')))
        indexDirectory.setUseUnmap(False)
        taxoDirectory = MMapDirectory(File(join(path, 'taxo')))
        taxoDirectory.setUseUnmap(False)
        conf = IndexWriterConfig(Version.LUCENE_4_10_0, settings.analyzer)
        conf.setSimilarity(settings.similarity)
        mergePolicy = TieredMergePolicy()
        mergePolicy.setMaxMergeAtOnce(settings.maxMergeAtOnce)
        mergePolicy.setSegmentsPerTier(settings.segmentsPerTier)
        conf.setMergePolicy(mergePolicy)

        if not settings.readonly:
            self._indexWriter = IndexWriter(indexDirectory, conf)
            self._indexWriter.commit()
            self._taxoWriter = DirectoryTaxonomyWriter(taxoDirectory, IndexWriterConfig.OpenMode.CREATE_OR_APPEND, LruTaxonomyWriterCache(settings.lruTaxonomyWriterCacheSize))
            self._taxoWriter.commit()

        self._indexAndTaxonomy = IndexAndTaxonomy(settings, indexDirectory, taxoDirectory)
        self._readerSettingsWrapper = self._indexAndTaxonomy._readerSettingsWrapper

        self._facetsConfig = settings.fieldRegistry.facetsConfig

        self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader())
Example #4
    def __init__(self, root, storedir, isindexing=False, isBM25=True):

        if not os.path.exists(storedir):
            os.mkdir(storedir)

        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)

        if isindexing:
            store = SimpleFSDirectory(Paths.get(storedir))
            config = IndexWriterConfig(self.analyzer)
            # TODO BM25 parameter tuning
            if isBM25:
                config.setSimilarity(BM25Similarity())
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            writer = IndexWriter(store, config)

            self.indexer(root, writer)
            ticker = Ticker()
            print('commit index')
            threading.Thread(target=ticker.run).start()
            writer.commit()
            writer.close()
            ticker.tick = False
            print('done')

        search_dir = SimpleFSDirectory(Paths.get(storedir))
        self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
        if isBM25:
            self.searcher.setSimilarity(BM25Similarity())
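Example #4 leaves querying to the caller. A hedged usage sketch for the searcher it builds; the field name "contents" and the query text are assumptions about what `self.indexer` wrote, while `QueryParser` and `scoreDocs` are standard Lucene API:

    from org.apache.lucene.queryparser.classic import QueryParser

    query = QueryParser("contents", self.analyzer).parse("lucene scoring")
    for hit in self.searcher.search(query, 10).scoreDocs:
        doc = self.searcher.doc(hit.doc)
        print(hit.score, doc.get("contents"))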
Example #5
 def _get_writer(self, analyzer=None, create=False):
     config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer or self._analyzer)  # fall back to the instance analyzer if none given
     if create:
         config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     if self._similarity is not None:
         config.setSimilarity(self._similarity)
     writer = IndexWriter(self._store, config)
     return writer
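A minimal usage sketch for `_get_writer` above; the caller owns the writer's lifecycle, so the try/finally close is the important part (the document contents are illustrative):

    from org.apache.lucene.document import Document, Field, TextField

    writer = self._get_writer(create=True)
    try:
        doc = Document()
        doc.add(TextField("content", "hello lucene", Field.Store.YES))
        writer.addDocument(doc)
        writer.commit()
    finally:
        writer.close()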
Example #6
 def __init__(self, indexDir, analyzer):
     lucene.initVM()
     logger.info("RAM index")
     writerConfig = IndexWriterConfig(analyzer)
     writerConfig.setSimilarity(mySimilarity())
     logger.debug('writer similarity func: {}'.format(
         writerConfig.getSimilarity()))
     writer = IndexWriter(indexDir, writerConfig)
     self.writer = writer
Example #7
    def __init__(self):
        indexDir = RAMDirectory()
        analyzer = SmartChineseAnalyzer()
        writerConfig = IndexWriterConfig(analyzer)

        # create new directory, remove previously indexed documents
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writerConfig.setSimilarity(mySimilarity())
        logger.debug('search similarity:{}'.format(
            writerConfig.getSimilarity()))
        self.indexDir = indexDir
        self.writer = IndexWriter(indexDir, writerConfig)
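Since Example #7 keeps a handle on its RAMDirectory, a follow-up search might look like this sketch; the field name "content" and the query string are assumptions, and `DirectoryReader.open` on a Directory needs a prior commit:

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.queryparser.classic import QueryParser

    self.writer.commit()  # make buffered docs visible to readers
    searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
    searcher.setSimilarity(mySimilarity())  # mirror the indexing-time similarity
    query = QueryParser("content", SmartChineseAnalyzer()).parse("检索")
    for hit in searcher.search(query, 5).scoreDocs:
        print(searcher.doc(hit.doc))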
Example #8
    def __init__(self, lang):
        lucene.initVM()

        if lang == 'zh':
            logger.info("index directory:{}".format(config.IDX_SSQA))
            indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_SSQA)))
            analyzer = SmartChineseAnalyzer()
        else:
            raise ValueError(
                'lang should be "zh" or "en", {} is invalid!'.format(lang))
        writerConfig = IndexWriterConfig(analyzer)
        writerConfig.setSimilarity(mySimilarity())
        logger.debug('writer similarity func: {}'.format(
            writerConfig.getSimilarity()))
        writer = IndexWriter(indexDir, writerConfig)
        self.writer = writer
Example #9
    def __init__(self, storeDir, similarity=None):
        """Constructor

        storeDir -- path where to save the index"""

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(Paths.get(storeDir))
        self.dir = store
        analyzer = StandardAnalyzer()
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        if similarity is not None:
            config.setSimilarity(similarity)
        writer = IndexWriter(store, config)

        self.writer = writer
Example #10
    def createIndexWriter(self, actual_dir, max_field_length=20000000):
        """
            Returns an IndexWriter object created for the actual_dir specified
        """
        ensureDirExists(actual_dir)
        index = SimpleFSDirectory(File(actual_dir))
        analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)

        writerConfig = IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, analyzer)
        similarity = FieldAgnosticSimilarity()

        writerConfig.setSimilarity(similarity)
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        ##    res = IndexWriter(index, analyzer, True, IndexWriter.MaxFieldLength(max_field_length))
        res = IndexWriter(index, writerConfig)
        res.deleteAll()
        return res
Example #11
    def __init__(self, fileRoot, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store    = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config   = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setSimilarity(similarities.BM25Similarity())
        # Available similarities: BM25Similarity, MultiSimilarity, PerFieldSimilarityWrapper, SimilarityBase, TFIDFSimilarity
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer   = IndexWriter(store, config)

        self.indexDocs(fileRoot, writer)
        print('commit index', end=' ')
        writer.commit()
        writer.close()
        print('done')
Example #13
class Indexer(object):  # builds the index
    def __init__(self, indexDir, doClear=True, computeLengthNorm=False):
        #         if not jpype.isJVMStarted():
        #         lucene.initVM()
        lucene.getVMEnv().attachCurrentThread()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        # self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 100678)  # needed here?
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setRAMBufferSizeMB(256.0)  # flush automatically once the RAM buffer reaches 256 MB
        self.config.setMaxBufferedDocs(10000)  # flush automatically after 10000 buffered docs
        if not computeLengthNorm:
            sim = CustomSimilarity()
            self.config.setSimilarity(sim)
        self.path = os.path.join(INDEX_PATH, indexDir)
        # print self.path
        # path.mkdir(self.path)
        #         if doClear:
        #             self.clearExistingIndex()
        self.store = SimpleFSDirectory(File(self.path))
        self.writer = IndexWriter(self.store, self.config)

        self.t1 = FieldType()  # field type t1
        self.t1.setIndexed(True)
        self.t1.setStored(True)
        self.t1.setTokenized(False)
        self.t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        self.t2 = FieldType()  # field type t2
        self.t2.setIndexed(True)
        self.t2.setStored(False)
        self.t2.setTokenized(True)
        self.t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    def clearExistingIndex(self):  # delete the existing index?
        indexdir = self.path
        for thefile in os.listdir(indexdir):
            filepath = os.path.join(indexdir, thefile)
            try:
                if os.path.isfile(filepath):
                    os.unlink(filepath)
            except Exception as e:
                logger.error("Delete file %s failed: %s", filepath, str(e))
Example #14
    def __init__(self):
        wikidir = './wiki-pages-text'
        indexdir = './IndexFiles.index'
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        if not os.path.exists(indexdir):
            os.mkdir(indexdir)

        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
        store = SimpleFSDirectory(Paths.get(indexdir))
        config = IndexWriterConfig(self.analyzer)
        config.setSimilarity(BM25Similarity())
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.indexer(wikidir, writer)
        ticker = Ticker()
        print('commit index')
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print('done')
Example #15
    def __init__(self, storeDir, aWrapper):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        aWrapper = LimitTokenCountAnalyzer(aWrapper, 1048576)
        bm25Sim = BM25Similarity(2.0, 0.75)  # BM25 with k1 = 2.0, b = 0.75 (Lucene defaults: k1 = 1.2, b = 0.75)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, aWrapper)
        config.setSimilarity(bm25Sim)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)


        self.indexTable(writer)
        ticker = Ticker()
        print('commit index')
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print('done')
Example #16
class IndexingEngine():

	def __init__(self):

		self.mDocumentDirectory = settings.ADMINS_ENGINE.mDocumentDirectory
		self.mIndexDirectory = settings.ADMINS_ENGINE.mIndexDirectory
		self.mAnalyzers = settings.ADMINS_ENGINE.getIndexingAnalyzers()


		############################# Writer Configuration #####################################
		map = HashMap()
		map.put('name', self.mAnalyzers['name'])
		map.put('parent', self.mAnalyzers['parent'])
		map.put('content', self.mAnalyzers['default'])
		map.put('id', self.mAnalyzers['id'])		

		analyzerWrapper = PerFieldAnalyzerWrapper(self.mAnalyzers['default'], map)

		self.mWriterConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzerWrapper)
		self.mWriterConfig.setOpenMode(settings.ADMINS_ENGINE.mOpenMode)

		if settings.ADMINS_ENGINE.mSimilarity is not None:
			self.mWriterConfig.setSimilarity(settings.ADMINS_ENGINE.mSimilarity)
		########################################################################################


		directory = SimpleFSDirectory(File(self.mIndexDirectory))
		self.mIndexWriter = IndexWriter(directory, self.mWriterConfig)


		############################# FieldType Preparation #####################
		nameField = FieldType()
		nameField.setIndexed(True)
		nameField.setStored(True)
		nameField.setTokenized(True)
		nameField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

		parentField = FieldType()
		parentField.setIndexed(True)
		parentField.setStored(True)
		parentField.setTokenized(True)
		parentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

		contentField = FieldType()
		contentField.setIndexed(True)
		contentField.setStored(True)
		contentField.setTokenized(True)
		contentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

		idField = FieldType()
		idField.setIndexed(True)
		idField.setStored(True)
		idField.setTokenized(False)
		idField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)


		self.mFieldTypes = {
			'name' 		: nameField,
			'parent'	: parentField,
			'content'	: contentField,
			'id'		: idField
		}
		#######################################################################

		self.mLog = ""

	

	def indexing(self, root=settings.ADMINS_ENGINE.mDocumentDirectory, parent=None, docID=1, parentID=0, id=0):

		parent = [] if parent is None else parent  # avoid sharing one mutable default list across calls
		realPath = os.path.abspath(root)
		for i in os.listdir(realPath):

			path = os.path.join(realPath, i)
			if os.path.isfile(path):
				#index this file
				doc = Document()

				doc.add(Field('name', ("%s %s" % (' '.join(parent), i)).strip(), self.mFieldTypes['name']))
				doc.add(Field('parent', ' '.join(parent), self.mFieldTypes['parent']))
				doc.add(Field('id', str(docID), self.mFieldTypes['id']))
				doc.add(Field('parentID', str(parentID), self.mFieldTypes['id']))

				fd = open(path, 'r')
				content = fd.read()
				fd.close()

				if len(content) > 0:
					doc.add(Field('content', content, self.mFieldTypes['content']))

				self.mIndexWriter.addDocument(doc)
				##################### Logging ##############################
				if IS_DEBUG:
					nameDebug = AnalyzerDebug.debug(self.mAnalyzers['name'], ("%s %s" % (' '.join(parent), i)).strip())
					parentDebug = AnalyzerDebug.debug(self.mAnalyzers['parent'], ' '.join(parent))
					contentDebug = AnalyzerDebug.debug(self.mAnalyzers['default'], content)
					self.mLog = self.mLog + ( "File %s\n   {name - %s}: %s\n   {parent - %s}: %s\n   {content}: %s\n\n" % (path, docID, nameDebug, parentID, parentDebug, contentDebug) )

				docID = docID + 1

				################### index sub-commands
				if os.path.isdir(path + ".sub"):
					parent.append(i)
					docID = self.indexing(path + ".sub", parent, docID, docID - 1, id + 1)
					parent.pop()

		if id == 0:
			self.mIndexWriter.commit()
			self.mIndexWriter.close()
			
			if IS_DEBUG:
				loggingBot = LoggingBot(self.mLog, settings.ADMINS_ENGINE.getIndexingLogQueue())
				loggingBot.start()
				self.mLog = ""
		return docID
Example #17
    def __init__(self,
                 lang,
                 dataset,
                 analyzer,
                 index_path=None,
                 data_path=None,
                 ram_size=2048):
        """ Returns scored documents in multiple languages.

        Parameters:
        dataset  (str): ['mlqa_dev', 'mlqa_test', 'wiki']
        lang     (str): ['en', 'es', 'de']
        analyzer (str): ['en', 'es', 'de', 'standard']
        ram_size (int): Size of memory used while indexing

        Returns:
        """
        super().__init__()

        idxdir = self.get_index(lang, dataset, index_path)
        self.mlqa = True
        if dataset == 'mlqa_dev':
            self.dataset = MLQADataset('dev', lang, lang, data_path)
        elif dataset == 'mlqa_test':
            self.dataset = MLQADataset('test', lang, lang, data_path)
        elif dataset == 'wiki':
            self.mlqa = False
            self.dataset = Wiki(lang, data_path)
        else:
            raise RuntimeError("No dataloader for {}".format(dataset))

        # stores index files; SimpleFSDirectory has poor concurrency, try NIOFSDirectory instead
        store = SimpleFSDirectory(Paths.get(idxdir))
        # limit max. number of tokens per document.
        # analyzer will not consume more tokens than that
        #analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        # configuration for index writer
        config = IndexWriterConfig(analyzers[analyzer]())
        # creates or overwrites index
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        # set the similarity to BM25 (Lucene defaults: k1=1.2, b=0.75)
        similarity = BM25Similarity(self.k1, self.b)
        config.setSimilarity(similarity)
        config.setRAMBufferSizeMB(float(ram_size))
        # create index writer
        self.writer = IndexWriter(store, config)

        self.ftdata = FieldType()
        self.ftmeta = FieldType()
        # IndexSearcher will return value of the field
        self.ftdata.setStored(True)
        self.ftmeta.setStored(True)
        # will be analyzed by Analyzer
        self.ftdata.setTokenized(True)
        self.ftmeta.setTokenized(False)
        # what information is stored (probably DOCS would be sufficient)
        # DOCS: Only documents are indexed: term frequencies and positions are omitted.
        #       Phrase and other positional queries on the field will throw an exception,
        #       and scoring will behave as if any term in the document appears only once.
        # DOCS_AND_FREQS: Only documents and term frequencies are indexed: positions are
        #       omitted. This enables normal scoring, except Phrase and other positional
        #       queries will throw an exception.
        # DOCS_AND_FREQS_AND_POSITIONS: Indexes documents, frequencies and positions.
        #       This is a typical default for full-text search: full scoring is enabled
        #       and positional queries are supported.
        self.ftdata.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        self.ftmeta.setIndexOptions(IndexOptions.DOCS)
        # instantiate some reusable objects
        # TODO: create document, add fields then change only field value and
        # re-add document
        self.doc = Document()
        # the id field cannot be pre-created the same way because a document may
        # carry multiple id values; keep a list of fields and add more when needed
        #self.fieldId = Field("id", "dummy", self.ftmeta)
        self.fieldTitle = Field("title", "dummy", self.ftdata)
        self.doc.add(self.fieldTitle)
        self.fieldContext = Field("context", "dummy", self.ftdata)
        self.doc.add(self.fieldContext)
        self.fieldIds = [Field("id", "dummy", self.ftmeta)]
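The reusable-document pattern that the TODO above describes could be completed roughly like this; `Field.setStringValue` is the standard Lucene way to swap a field's value without rebuilding the Document, while the method name `add_instance` and its arguments are assumptions:

    def add_instance(self, title, context, ids):
        self.fieldTitle.setStringValue(title)
        self.fieldContext.setStringValue(context)
        # grow the id-field pool when an instance carries more ids than any before it
        while len(self.fieldIds) < len(ids):
            f = Field("id", "dummy", self.ftmeta)
            self.fieldIds.append(f)
            self.doc.add(f)
        for field, value in zip(self.fieldIds, ids):
            field.setStringValue(value)
        self.writer.addDocument(self.doc)

A real implementation would also have to reset or remove stale id fields left over from a previous, longer instance, which is presumably why the original leaves this as a TODO.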