Esempio n. 1
0
    def import_csv(self, csv_file):
        """Import metadata rows from *csv_file* into the corpus index.

        Opens an IndexWriter in append mode on the corpus directory,
        delegates the row-by-row work to addmetadata.add_metadata_from_csv,
        and reports the outcome through self.parent.write.  Non-UTF-8 input
        aborts the import with an error message instead of raising.
        """
        try:
            writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)), self.analyzer, False,
                                        IndexWriter.MaxFieldLength.LIMITED)
            changed_rows = addmetadata.add_metadata_from_csv(self.searcher, self.reader, writer, csv_file, self.args_dir,
                                                             new_files=True)
            writer.close()
        except UnicodeDecodeError:
            # Best-effort cleanup: the writer may be unopened or already
            # closed; never let cleanup mask the error report below.
            # (Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are not swallowed.)
            try:
                writer.close()
            except Exception:
                pass
            self.parent.write({'error': 'CSV import failed: file contained non-unicode characters. Please save the file with UTF-8 encoding and try again!'})
            return
        self.parent.write({'message': "CSV import complete: %s rows added." % (changed_rows,)})
Esempio n. 2
0
def rollback(collection_name):
    """Discard any uncommitted changes in the index for *collection_name*.

    The special name "DEFAULT" maps to the module-level INDEX_DIR_DEFAULT
    directory.
    """
    index_dir = INDEX_DIR_DEFAULT if collection_name == "DEFAULT" else collection_name

    directory = SimpleFSDirectory(File(index_dir))
    std_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Open in append mode so an existing index is reused, not replaced.
    writer_config = IndexWriterConfig(Version.LUCENE_CURRENT, std_analyzer)
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    index_writer = IndexWriter(directory, writer_config)

    index_writer.rollback()
    index_writer.close()
Esempio n. 3
0
    def __init__(self, lang):
        """Create an IndexWriter for the SSQA index.

        lang -- language code.  Only 'zh' is implemented here, despite the
                error message also mentioning "en" -- NOTE(review): confirm
                whether an English branch was planned or the message is stale.
        """
        # Start the JVM; assumes no prior initVM call in this process.
        lucene.initVM()

        if lang == 'zh':
            logger.info("index directory:{}".format(config.IDX_SSQA))
            indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_SSQA)))
            analyzer = SmartChineseAnalyzer()
        else:
            raise ValueError(
                'lang should be "zh" or "en", {} is invalid!'.format(lang))
        writerConfig = IndexWriterConfig(analyzer)
        # Project-specific scoring function replaces Lucene's default.
        writerConfig.setSimilarity(mySimilarity())
        logger.debug('writer similarity func: {}'.format(
            writerConfig.getSimilarity()))
        writer = IndexWriter(indexDir, writerConfig)
        self.writer = writer
Esempio n. 4
0
def delete(primary_keys_map, collection_name, todelete, commit=False):
    """Delete documents matching *todelete* from a Lucene index.

    primary_keys_map -- container of field names treated as primary keys;
                        only these fields participate in the delete query.
    collection_name  -- index directory name, or "DEFAULT" for the default.
    todelete         -- JSON string of field:value pairs to match.
    commit           -- when True, commit the deletion before closing.

    Returns 0 on success, 100 if *todelete* is not valid JSON, and 105 if
    the index could not be opened (error-code contract kept for callers).
    """
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    INDEX_DIR = collection_name if collection_name != "DEFAULT" else INDEX_DIR_DEFAULT

    try:
        tofind_keyvalue_pairs = json.loads(todelete)
    except ValueError:
        # json raises ValueError (JSONDecodeError subclasses it).
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Any Lucene failure while opening writer/reader maps to code 105.
    try:
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
        ireader = IndexReader.open(direc)
    except Exception:
        return 105

    # Deletion currently only supports matching on indexed primary keys;
    # non-primary fields in *todelete* are ignored.
    tofind_primary_keyvalue_pairs = {}
    for key, value in tofind_keyvalue_pairs.items():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = value

    # AND together one parsed clause per primary-key field.
    query = BooleanQuery()
    for key, value in tofind_primary_keyvalue_pairs.items():
        temp = QueryParser(Version.LUCENE_CURRENT, key,
                           analyzer).parse(value)
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    writer.deleteDocuments(query)
    if commit:
        writer.commit()
    # Close the reader too (it was opened but leaked in the original).
    ireader.close()
    writer.close()
    return 0
Esempio n. 5
0
    def __init__(self, sentences, base_dir):
        """Build a fresh sentence index under base_dir/INDEX_DIR."""
        # Starting the JVM twice raises; if it is already up, just carry on.
        try:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        except:
            pass

        store_dir = os.path.join(base_dir, INDEX_DIR)
        if not os.path.exists(store_dir):
            os.mkdir(store_dir)
        store = SimpleFSDirectory(Paths.get(store_dir))

        # Cap tokens per field, then always create a brand-new index.
        capped_analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
        config = IndexWriterConfig(capped_analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.indexsents(sentences, writer)
Esempio n. 6
0
    def index(self):
        """Index every file under self._dataDir into self._indexDir.

        Returns the number of documents written.  Raises IOError when the
        data directory is missing or is not a directory.
        """
        if not (os.path.exists(self._dataDir)
                and os.path.isdir(self._dataDir)):
            # Python 2 raise syntax -- this snippet predates Python 3.
            raise IOError, "%s isn't existed or is not a directory" % (
                self._dataDir)

        dir = SimpleFSDirectory(Paths.get(self._indexDir))
        # Legacy (pre-4.0) IndexWriter constructor: the True flag creates a
        # new index, wiping any existing one; field length is capped.
        writer = IndexWriter(dir, StandardAnalyzer(), True,
                             IndexWriter.MaxFieldLength.LIMITED)
        writer.setUseCompoundFile(False)
        self.indexDirectory(writer, self._dataDir)
        numIndexed = writer.numDocs()
        # optimize() merges all segments (deprecated in modern Lucene).
        writer.optimize()
        writer.close()
        dir.close()

        return numIndexed
Esempio n. 7
0
    def __init__(self, root, storeDir, analyzer):
        """Index the document tree under *root* into a new index at storeDir."""
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        index_store = SimpleFSDirectory(Paths.get(storeDir))
        writer_cfg = IndexWriterConfig(analyzer)
        writer_cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)  # rebuild from scratch
        index_writer = IndexWriter(index_store, writer_cfg)

        self.indexDocs(root, index_writer)
        # Background ticker gives feedback while the (slow) commit runs.
        ticker = Ticker()
        print('commit index', )
        threading.Thread(target=ticker.run).start()
        index_writer.commit()
        index_writer.close()
        ticker.tick = False
        print('done')
Esempio n. 8
0
def index_wiki(wiki_xmlfile, index_directory_name):
    """Index a wiki XML corpus into a new Lucene index.

    Each article becomes one document with three stored fields --
    contents, title and url -- where url is kept verbatim (not tokenized).
    """
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)  # overwrite existing index
    writer = IndexWriter(store, config)

    def make_fieldtype(tokenized):
        # All three fields share indexed+stored+full-postings settings;
        # only tokenization differs, so build them from one helper.
        fieldtype = FieldType()
        fieldtype.setIndexed(True)
        fieldtype.setStored(True)
        fieldtype.setTokenized(tokenized)
        fieldtype.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        return fieldtype

    content_fieldtype = make_fieldtype(True)
    title_fieldtype = make_fieldtype(True)
    url_fieldtype = make_fieldtype(False)  # URLs are matched exactly

    for xmldoc in wikicorpusxml(wiki_xmlfile):
        # Crude string scraping of the per-article XML fragment: body text
        # after the first '>', attributes pulled from the opening tag.
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
Esempio n. 9
0
    def __init__(self, storeDir, similarity=None):
        """Constructor

        storeDir -- path where to save the index"""

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        self.dir = SimpleFSDirectory(Paths.get(storeDir))
        cfg = IndexWriterConfig(StandardAnalyzer())
        cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)  # fresh index every run
        if similarity is not None:
            # Optional custom scoring function.
            cfg.setSimilarity(similarity)

        self.writer = IndexWriter(self.dir, cfg)
Esempio n. 10
0
    def __init__(self, path=INDEX_DIR, index_limit=LIMIT, training=False):
        """Start the JVM and build/extend the Lucene index at *path*."""
        self.index_limit = index_limit
        self.training = training

        if not Path(path).is_dir():
            os.mkdir(path)

        # Boot Lucene, then wire up directory -> analyzer -> writer.
        lucene.initVM()
        store = SimpleFSDirectory(Paths.get(path))
        # The input was pre-segmented by thulac with spaces between words,
        # so a plain whitespace analyzer is sufficient here.
        tokenizer = WhitespaceAnalyzer(Version.LATEST)
        writer_cfg = IndexWriterConfig(tokenizer)
        writer_cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        index_writer = IndexWriter(store, writer_cfg)

        self.Indexing(index_writer)
Esempio n. 11
0
    def _init_index(self):
        """Open (creating if necessary) the Lucene index for self.corpus and
        set self.lucene_index / self.searcher / self.reader / self.analyzer."""
        if not os.path.exists(self.corpus.path):
            os.mkdir(self.corpus.path)
        try:
            # Probe whether a readable index already exists at the path.
            searcher = IndexSearcher(SimpleFSDirectory(File(self.corpus.path)), True)
        #except lucene.JavaError:
        except:
            # No usable index yet: create an empty one so the readers below
            # can open it.  (Bare except kept from the original; the intended
            # exception was lucene.JavaError, per the commented line above.)
            analyzer = self.corpus.analyzer
            writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)), analyzer, True, IndexWriter.MaxFieldLength.LIMITED)
            writer.setMaxFieldLength(1048576)
            writer.optimize()
            writer.close()

        # Re-open the directory read-only for searching.
        self.lucene_index = SimpleFSDirectory(File(self.corpus.path))
        self.searcher = IndexSearcher(self.lucene_index, True)
        self.reader = IndexReader.open(self.lucene_index, True)
        self.analyzer = self.corpus.analyzer
Esempio n. 12
0
def index(analyzer, index_dest_dir, documents):
    """ Builds Lucene index from provided documents using given analyzer
    :param analyzer:
    :param index_dest_dir:
    :param list[Document] documents:
    :return:
    """
    # Validate up front, reporting the FIRST offending element's type.
    # (The original reported type(documents[0]), which may itself be a
    # perfectly valid Document.)
    for d in documents:
        if not isinstance(d, Document):
            raise TypeError(
                "documents should be iterable of type Document! Given: %s" %
                type(d))

    writer_config = IndexWriterConfig(Version.LUCENE_30, analyzer)
    writer = IndexWriter(SimpleFSDirectory(File(index_dest_dir)),
                         writer_config)
    try:
        for doc in documents:
            writer.addDocument(doc)
    finally:
        # Always release the index write lock, even if addDocument fails.
        writer.close()
def indexing():
    """Index every *.txt file under TEXT_DIR into INDEX_DIR."""
    print("建立索引,文本文件夹 [%s] ..." % TEXT_DIR)
    create_dir(INDEX_DIR)
    store = SimpleFSDirectory(Paths.get(INDEX_DIR))
    writer = IndexWriter(store, IndexWriterConfig(ANALYZER))

    for txt_path in glob.glob(os.path.join(TEXT_DIR, "*.txt")):
        title, post, terms = get_terms(txt_path)
        if not terms:
            continue  # files that yielded no terms are skipped entirely
        doc = Document()
        doc.add(Field("title", title, TextField.TYPE_STORED))
        doc.add(Field("post", post, TextField.TYPE_STORED))
        doc.add(Field("terms", terms, TextField.TYPE_STORED))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
Esempio n. 14
0
    def __init__(self, storeDir, analyzer, function):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)
        eval("self." + function + "(writer)")
        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print '索引建立完成.'
Esempio n. 15
0
def _index_files(storeDir, indexFile):
    """Create a brand-new index in *storeDir* from the entries in *indexFile*."""
    # Warm up jieba's dictionary before any tokenisation happens.
    jieba.initialize()

    store = SimpleFSDirectory(File(storeDir))
    capped = LimitTokenCountAnalyzer(
        SimpleAnalyzer(Version.LUCENE_CURRENT), 1048576)

    cfg = IndexWriterConfig(Version.LUCENE_CURRENT, capped)
    cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, cfg)

    _index_docs(indexFile, writer)

    print('commit index')
    writer.commit()
    writer.close()
    print('done')
Esempio n. 16
0
    def createIndexWriter(self, actual_dir, max_field_length=20000000):
        """
            Returns an IndexWriter object created for the actual_dir specified
        """
        # NOTE: max_field_length is retained for interface compatibility;
        # the legacy constructor that consumed it is commented out below.
        ensureDirExists(actual_dir)
        store = SimpleFSDirectory(File(actual_dir))
        std_analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)

        cfg = IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, std_analyzer)
        # Field-agnostic scoring; always start from an empty index.
        cfg.setSimilarity(FieldAgnosticSimilarity())
        cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        ##    res= IndexWriter(index, analyzer, True, IndexWriter.MaxFieldLength(max_field_length))
        writer = IndexWriter(store, cfg)
        writer.deleteAll()
        return writer
Esempio n. 17
0
    def __init__(self, sourceDir, storeDir, analyzer):
        """Index everything under *sourceDir* into a new index at *storeDir*."""
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        index_store = SimpleFSDirectory(Paths.get(storeDir))
        capped_analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        writer_cfg = IndexWriterConfig(capped_analyzer)
        writer_cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(index_store, writer_cfg)

        self.indexDocs(sourceDir, writer)
        # Run a progress ticker in a background thread during the commit.
        ticker = Ticker()
        print('index ...')
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print('done')
Esempio n. 18
0
    def update(self):
        """Re-index self._data after purging the previous entries."""
        delete(self._dir, self._counter)

        directory = SimpleFSDirectory(Paths.get(self._dir))
        writer = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))

        for doc_id, fields in self._data.items():
            doc = Document()
            doc.add(Field('id', doc_id, StringField.TYPE_STORED))
            for name, value in fields.items():
                if not value:
                    continue  # skip empty / None field values
                # Free text is tokenised; everything else is stored verbatim.
                if name == 'text':
                    doc.add(Field('text', value, TextField.TYPE_STORED))
                else:
                    doc.add(Field(name, value, StringField.TYPE_STORED))
            writer.addDocument(doc)
        writer.commit()
        writer.close()
Esempio n. 19
0
    def get_writer(self, store_dir, analyzer):
        '''
        Generate an `IndexWriter` according to the parameters.

        Input: `store_dir`: directory to store the Lucene index
               `analyzer`: analyzer used to analyze the docs
        Output: `IndexWriter` with the correct parameters
        '''
        # Make sure the index directory exists before opening it.
        if not os.path.exists(store_dir):
            os.mkdir(store_dir)

        # Cap the per-field token count, and always rebuild from scratch.
        capped = LimitTokenCountAnalyzer(analyzer, 1048576)
        cfg = IndexWriterConfig(capped)
        cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        return IndexWriter(SimpleFSDirectory(Paths.get(store_dir)), cfg)
Esempio n. 20
0
    def __init__(self, root, storeDir):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)
        self.indexDocs(root, writer)
        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'
Esempio n. 21
0
    def create_index(self, index_folder, docs_path, add_terms=False):
        """Build an MMap-backed Lucene index over the pickled corpus.

        index_folder -- directory to create (must not already exist; mkdir
                        raises if it does, preventing accidental clobbering)
        docs_path    -- pickle file containing an iterable of document texts
        add_terms    -- forwarded to self.add_doc (index term fields or not)
        """
        print('Loading Vocab...')
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        # t1: stored, docs-only postings (identifiers).
        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        # t2: indexed with frequencies, not stored (searchable text).
        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        # t3: stored only, never indexed.
        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print("%d docs in index" % self.writer.numDocs())
        print("Indexing documents...")

        import pickle
        with open(docs_path, "rb") as read_file:
            corpus = pickle.load(read_file)

        # enumerate() replaces the hand-rolled idx_cnt counter.
        for idx_cnt, txt in enumerate(corpus):
            self.add_doc(idx_cnt, txt, add_terms)  # not lowered
            if idx_cnt % 1000 == 0:
                print('indexing doc', idx_cnt)
        print("Index of %d docs..." % self.writer.numDocs())
        self.writer.close()
Esempio n. 22
0
    def __init__(self, root, index, storeDir):
        """Create *storeDir* if needed and index the documents under *root*."""
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        #store=SimpleFSDirectory(File(storeDir).toPath())
        store = SimpleFSDirectory(Paths.get(storeDir))
        # StandardAnalyzer performs tokenisation / language processing;
        # LimitTokenCountAnalyzer caps the token count per field.
        capped = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
        cfg = IndexWriterConfig(capped)
        cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, cfg)  # the writer that emits index files

        self.indexDocs(root, index, writer)
        # Progress ticker runs in the background while the commit completes.
        ticker = Ticker()
        print('commit index')
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print('done')
Esempio n. 23
0
def build_index(docs_dir, index_dir):
    """
    Indexes files in `docs_root` recursively, placing the built index in `store_dir`
    """
    if not exists(index_dir):
        mkdir(index_dir)
    storage = SimpleFSDirectory(Paths.get(index_dir))  # index kept on disk

    config = IndexWriterConfig(Analyzer())
    config.setOpenMode(
        IndexWriterConfig.OpenMode.CREATE)  # overwrite existing index

    writer = IndexWriter(storage, config)

    print 'Indexing documents:'
    index_docs(docs_dir, writer)

    print 'Writing index...'
    writer.commit()
    writer.close()
Esempio n. 24
0
    def inicializar(self, root, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(Paths.get(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        contador = self.indexDocs(root, writer)
        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'
        return contador
Esempio n. 25
0
def main():
	"""Index Stack Overflow code snippets into a fixed local index directory.

	Builds a per-field analyzer map (code-oriented fields share the Porter
	analyzer, view_count is matched verbatim, code_hints uses the Java code
	analyzer), indexes via index_code_snippet, and prints a Counter summary.
	Lucene/DB exceptions are caught and their Java stack traces printed.
	"""
	INDEX_DIR = "indexes"
	try:
		print "Indexing..."
		#########################################  Path  ####################################
		# NOTE(review): hard-coded machine-specific destination path.
		indexDestination = File("/Users/Falcon/Desktop/New_Indices/Stack_A_Indices")

		#writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
		analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
		a = {	"typed_method_call": analyzer, "extends": analyzer,
				"used_classes": analyzer, "methods": analyzer,
				"class_instance_creation": analyzer, "methods_called": analyzer, "view_count" : KeywordAnalyzer(), "code_hints": JavaCodeAnalyzer() }
		wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
		config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
		writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

		# Alternative all-KeywordAnalyzer configuration kept for reference:
		# analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
		# a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer(),
		# 	 "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
		# 	 "class_instance_creation": KeywordAnalyzer(), "methods_called": KeywordAnalyzer(),
		# 	 "view_count": KeywordAnalyzer(), "code_hints": JavaCodeAnalyzer()}
		# wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
		# config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
		# writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

		counter = Counter()
		index_code_snippet(writer, counter)
		writer.commit()
		writer.close()

		print "Done"
		print str(counter)

	except CorruptIndexException as e:		#when index is corrupt
			e.printStackTrace()
	except LockObtainFailedException as e:	#when other writer is using the index
			e.printStackTrace()
	except IOException as e:	#when directory can't be read/written
			e.printStackTrace()
	except SQLException as e: 	#when Database error occurs
			e.printStackTrace()
Esempio n. 26
0
    def index(self):
        """Rebuild the sentence index from the merged document dumps."""
        # Wipe any previous index and start from an empty directory.
        doc_tool.cleardir(index_root)
        doc_tool.mkdir(index_root)

        store = FSDirectory.open(Paths.get(index_root))
        cfg = IndexWriterConfig(StandardAnalyzer())
        cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, cfg)

        # ft1: stored only (identifiers); ft2: indexed full text, not stored.
        ft1 = FieldType()
        ft1.setStored(True)
        ft1.setIndexOptions(IndexOptions.NONE)

        ft2 = FieldType()
        ft2.setStored(False)
        ft2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        doc_list = self.doc()
        merge_dir = os.path.join(config.SENT_RETRIEVAL_ROOT, "merge_doc")

        num = 0
        for filename in os.listdir(merge_dir):
            docs = doc_tool.load_json_file(merge_dir, filename)
            for page_identifier in docs:
                if page_identifier not in doc_list:
                    continue  # only pages selected by self.doc() are indexed
                num += 1
                for sent_number in docs[page_identifier]:
                    sentence_text = self.process_sent(
                        docs[page_identifier][sent_number])
                    doc = Document()
                    doc.add(Field("page_identifier", page_identifier, ft1))
                    doc.add(Field("sentence_number", sent_number, ft1))
                    doc.add(Field("sentence_text", sentence_text, ft2))
                    writer.addDocument(doc)
                print(num)

        writer.commit()
        writer.close()
        store.close()
 def __init__(self, **kwargs):
     """Build a fresh index from an XML source file.

     Expected kwargs: xmlpath (source XML), storeDir (index directory),
     analyzer (Lucene analyzer), ItemClass (record type passed to indexXML).
     """
     xmlpath = kwargs.get('xmlpath')
     storeDir = kwargs.get('storeDir')
     analyzer = kwargs.get('analyzer')
     ItemClass = kwargs.get('ItemClass')
     if not os.path.exists(storeDir):
         os.mkdir(storeDir)
     store = SimpleFSDirectory(File(storeDir))
     # Cap tokens per field; CREATE mode rebuilds the index from scratch.
     analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
     config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     writer = IndexWriter(store, config)
     # self.indexDocs(xmlpath, writer)
     self.indexXML(xmlpath, writer, ItemClass)
     # Background ticker prints progress while the commit completes.
     ticker = Ticker()
     print 'commit index',
     threading.Thread(target=ticker.run).start()
     writer.commit()
     writer.close()
     ticker.tick = False
     print 'done'
Esempio n. 28
0
 def updateDeleteRec(self, pid1, pid2, personDB, familyDB, relationDB):
     """Replace the index entries for a merged person pair.

     Deletes the documents for pid1 and pid2, re-indexes pid1 from
     personDB, then reopens self.searcher so the change is visible.
     """
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
     writer = IndexWriter(self.indexDir, config)
     mt = matchtext()
     # Remove both old records; only pid1 is re-added below.
     writer.deleteDocuments(Term('uid', pid1))
     writer.deleteDocuments(Term('uid', pid2))
     p = personDB.find_one({'_id': pid1})
     matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
     doc = Document()
     doc.add(Field('uid', str(pid1), StringField.TYPE_STORED))
     doc.add(Field('sex', str(p['sex']), StringField.TYPE_STORED))
     doc.add(Field("match", matchtxt, TextField.TYPE_NOT_STORED))
     doc.add(
         Field("text", mt.luceneFix(self.personText(p)),
               TextField.TYPE_NOT_STORED))
     writer.addDocument(doc)
     writer.commit()
     writer.close()
     # Reopen the searcher over the updated index.
     self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
     return
Esempio n. 29
0
    def __enter__(self):
        """
        Used by "with" statement. Like an "open" / "init" method.
        """
        # Bring the JVM up exactly once per process.
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])

        index_path = Path(self.index_root_loc).joinpath('%s/' %
                                                        self.index_subdir_name)
        index_path.mkdir(parents=True, exist_ok=True)

        store = SimpleFSDirectory(Paths.get(str(index_path)))
        self.analyzer = StandardAnalyzer()
        cfg = IndexWriterConfig(self.analyzer)
        cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

        # Writer first, then a near-real-time reader over it, then a
        # searcher over that reader.
        self.writer = IndexWriter(store, cfg)
        self.reader = DirectoryReader.open(self.writer)
        self.searcher = IndexSearcher(self.reader)

        return self
Esempio n. 30
0
def indexer():
    """Indexer: build a Lucene index over up to 10,000 movie records."""
    lucene.initVM()
    store = SimpleFSDirectory(File("index/"))
    cfg = IndexWriterConfig(Version.LUCENE_CURRENT, StandardAnalyzer())
    writer = IndexWriter(store, cfg)

    movies = MovieInfo.query.limit(10000).all()
    print("Index starting...")
    for n, movie in enumerate(movies):
        doc = Document()
        doc.add(Field("name", movie.name, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(
            Field("shortcut", movie.shortcut, Field.Store.YES,
                  Field.Index.ANALYZED))
        doc.add(Field('url', movie.url, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        print("Item {} indexed...".format(n + 1))
    print("Index finished...")
    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()