Example #1
def createIndex():
    #initialize lucene and jvm
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"

    
    #get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    
    #get index storage
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
   
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    src_dir = 'html_files'
    i = 0
    for l in os.listdir(src_dir):
        l = os.path.join(src_dir, l)
        with open(l, 'r') as myfile:
            data=myfile.read()
        i += 1
        document, errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
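Example #1 indexes the text returned by a parsehtml helper that is not shown. A minimal stand-in, assuming parsehtml only needs to return the page text plus an (unused here) error list, could be built on the Python 2 standard-library HTMLParser; the original project's parser may well do more:

from HTMLParser import HTMLParser

class _TextExtractor(HTMLParser):
    # Collects the text nodes of an HTML document.
    def __init__(self):
        HTMLParser.__init__(self)
        self.chunks = []

    def handle_data(self, data):
        self.chunks.append(data)

def parsehtml(data):
    # Hypothetical sketch: returns (plain_text, errors); errors stays empty here.
    parser = _TextExtractor()
    parser.feed(data)
    return " ".join(parser.chunks), []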
Example #2
 def __init__(self):
     self.__dict__ = self.__shared_state
     if not self.__shared_state:
         self.jccvm = lucene.initVM()
         self.index = SimpleFSDirectory(
             lucene.File(settings.lucene_index_dir))
         self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
Example #3
def main1():
    print "retrieve and display files......"
    direc = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    searcher = lucene.IndexSearcher(direc)
    search(searcher, analyzer)
    search2(searcher, analyzer)
Example #4
 def __init__(self):
     smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
     self.analyzers = {"smartcn": smartcn}
     directory = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
     self.searcher = lucene.IndexSearcher(directory, True)
     self.pgconn = mypass.getConn()
     self.sw = sinaweibooauth.SinaWeiboOauth()
Example #5
 def __init__(self, index_dir):
     '''
     Initialises index parameters
     '''
     lucene.initVM()
     self.index_dir = index_dir
     if not os.path.exists(self.index_dir):
         os.mkdir(self.index_dir)
     store = lucene.SimpleFSDirectory(lucene.File(self.index_dir))
     self.analyser = PorterStemmerAnalyzer()
     self.writer = lucene.IndexWriter(
         store, self.analyser, True,
         lucene.IndexWriter.MaxFieldLength.LIMITED)
     self.writer.setMaxFieldLength(1048576)
     directory = lucene.SimpleFSDirectory(lucene.File(self.index_dir))
     self.reader = lucene.FilterIndexReader.open(directory, True)
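Example #5 references a custom PorterStemmerAnalyzer that is not included in the snippet. In PyLucene 3.x a stemming analyzer is usually written by subclassing PythonAnalyzer; the sketch below follows the pattern from the PyLucene sample code and is an assumption about what the original class looked like:

import lucene

class PorterStemmerAnalyzer(lucene.PythonAnalyzer):
    # Tokenize, lowercase, stem with the Porter stemmer, then drop English stop words.
    def tokenStream(self, fieldName, reader):
        result = lucene.StandardTokenizer(lucene.Version.LUCENE_CURRENT, reader)
        result = lucene.StandardFilter(result)
        result = lucene.LowerCaseFilter(result)
        result = lucene.PorterStemFilter(result)
        return lucene.StopFilter(True, result,
                                 lucene.StopAnalyzer.ENGLISH_STOP_WORDS_SET)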
Example #6
    def search(self, restrictions, destination):
        """ 
        @see: L{NullPrincipalSearcher<datafinder.persistence.search.searcher.NullSearcher>} 
        
        E1101: Pylint cannot detect the internals of the modules solr and lucene. 
        """
        # pylint: disable=E1101

        results = list()
        queryString = search_restriction_mapping.mapSearchRestriction(
            restrictions)
        if self._configuration.luceneIndexUri.startswith("file:///"):
            try:
                self._configuration.env.attachCurrentThread()
                indexDir = lucene.SimpleFSDirectory(
                    lucene.File(
                        self._configuration.luceneIndexUri.replace(
                            "file:///", "")))
                analyzer = lucene.StandardAnalyzer(
                    lucene.Version.LUCENE_CURRENT)
                searcher = lucene.IndexSearcher(indexDir)
                query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT,
                                           "content",
                                           analyzer).parse(queryString)
                hits = searcher.search(query, constants.MAX_RESULTS)
                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    results.append("/%s" % urllib.unquote(
                        doc.get(constants.FILEPATH_FIELD).encode("utf-8")))
                searcher.close()
            except Exception, error:
                errorMessage = "Cannot search items. Reason: '%s'" % error
                raise PersistenceError(errorMessage)
Example #7
 def __init__(self, forumname):
     if not forumname in self.supported_forums:
         sys.exit()
     else:
         self.forum = forumname
     self.STORE_DIR = self.STORE_BASE_DIR + forumname
     smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
     self.analyzers = {"smartcn": smartcn}
     directory = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
     self.searcher = lucene.IndexSearcher(directory, True)
     self.pgconn = mypass.getConn()
Example #8
 def __init__(self, network):
     self.network = network
     smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
     #analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33)
     analyzers = {"smartcn": smartcn}
     self.pgconn = mypass.getConn()
     writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33,
                                             analyzers["smartcn"])
     writerconfig.setWriteLockTimeout(600000L)
     writerconfig.setMaxThreadStates(50)
     writerconfig.setRAMBufferSizeMB(128.0)
     self.storeDir = self.storeDirBase + self.network
     store = lucene.SimpleFSDirectory(lucene.File(self.storeDir))
     self.writer = lucene.IndexWriter(store, writerconfig)
Example #9
	def __init__(self, root, storeDir, analyzer):
		if not os.path.exists(storeDir):
			os.mkdir(storeDir)
		store = lucene.SimpleFSDirectory(lucene.File(storeDir))
		writer = lucene.IndexWriter(store, analyzer, True, lucene.IndexWriter.MaxFieldLength.LIMITED)
		writer.setMaxFieldLength(1048576)
		self.indexDocs(root, writer)
		ticker = Ticker()
		print 'optimizing index',
		threading.Thread(target=ticker.run).start()
		writer.optimize()
		writer.close()
		ticker.tick = False
		print 'done'
Example #10
    def __init__(self):
        smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
        #analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33)
        analyzers = {"smartcn": smartcn}
        self.pgconn = mypass.getConn()
        self.sw = sinaweibooauth.SinaWeiboOauth()
        if not os.path.exists(self.storeDir):
            os.mkdir(self.storeDir)
        store = lucene.SimpleFSDirectory(lucene.File(self.storeDir))
        writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33, analyzers["smartcn"])
        writerconfig.setWriteLockTimeout(600000L)
        writerconfig.setMaxThreadStates(50)
        writerconfig.setRAMBufferSizeMB(128.0)
        self.writer = lucene.IndexWriter(store, writerconfig)
Example #11
def func_pic(command):
    global vm_env
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR="graphIndex"
    directory = lucene.SimpleFSDirectory(lucene.File(STORE_DIR))
    searcher = lucene.IndexSearcher(directory, True)
    analyzer = lucene.SimpleAnalyzer(lucene.Version.LUCENE_CURRENT)
    title = []
    url = []
    imgurl = []
    score = []
    resultInfo, title, url, imgurl, score = run(command, searcher, analyzer)
    searcher.close()
    return resultInfo, title, url, imgurl, score
Example #12
def search(request,
           template_name='reviews/search.html',
           local_site_name=None):
    """
    Searches review requests on Review Board based on a query string.
    """
    query = request.GET.get('q', '')
    siteconfig = SiteConfiguration.objects.get_current()

    if not siteconfig.get("search_enable"):
        # FIXME: show something useful
        raise Http404

    if not query:
        # FIXME: I'm not super thrilled with this
        return HttpResponseRedirect(reverse("root"))

    if query.isdigit():
        query_review_request = get_object_or_none(ReviewRequest, pk=query)
        if query_review_request:
            return HttpResponseRedirect(query_review_request.get_absolute_url())

    import lucene
    lv = [int(x) for x in lucene.VERSION.split('.')]
    lucene_is_2x = lv[0] == 2 and lv[1] < 9
    lucene_is_3x = lv[0] == 3 or (lv[0] == 2 and lv[1] == 9)

    # We may have already initialized lucene
    try:
        lucene.initVM(lucene.CLASSPATH)
    except ValueError:
        pass

    index_file = siteconfig.get("search_index_file")
    if lucene_is_2x:
        store = lucene.FSDirectory.getDirectory(index_file, False)
    elif lucene_is_3x:
        store = lucene.FSDirectory.open(lucene.File(index_file))
    else:
        assert False

    try:
        searcher = lucene.IndexSearcher(store)
    except lucene.JavaError, e:
        # FIXME: show a useful error
        raise e
Example #13
def index_files(board, time_delta):
    store = lucene.SimpleFSDirectory(
        lucene.File(BOARDSPATH + board + '/' + RECENT_INDEX))
    writer = lucene.IndexWriter(
        store, lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT), True,
        lucene.IndexWriter.MaxFieldLength.UNLIMITED)
    #  writer.setMaxFieldLength(1048576) # 1MB

    flist = get_all_files(board, time_delta)
    for filename, owner, title in flist:
        path = BOARDSPATH + board + '/' + filename
        if not os.path.exists(path):
            continue

        f = open(path, 'r')
        contents = filter_file(f)
        debug(contents)
        try:
            title = title.decode('gbk')
            owner = owner.decode('gbk')
            contents = unicode(contents, 'gbk')
        except UnicodeDecodeError:
            f.close()
            debug(filename)
            continue
        f.close()

        if len(contents) > 0:
            doc = lucene.Document()
            doc.add(
                lucene.Field("name", filename, lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
            doc.add(
                lucene.Field("owner", owner, lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
            doc.add(
                lucene.Field("title", title, lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
            doc.add(
                lucene.Field("contents", contents, lucene.Field.Store.NO,
                             lucene.Field.Index.ANALYZED))
            writer.addDocument(doc)
            debug('adding ' + filename)
    writer.optimize()
    writer.close()
Example #14
def main1():
    print "started indexing sample files......"
    direc = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    config = lucene.IndexWriterConfig(lucene.Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(lucene.IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = lucene.IndexWriter(direc, config)

    #fix this later.....FieldType not defined
    #field_type=lucene.FieldType()
    #field_type.setIndexed(True)
    #field_type.setStored(False)
    #field_type.setTokenized(False)

    file1 = open("nitin.json")
    data = file1.read()
    contents = json.loads(data)
    doc = lucene.Document()
    field = lucene.Field("name", contents['name'], lucene.Field.Store.NO,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    field = lucene.Field("data", data, lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
    file1.close()

    file1 = open("nitin2.json")
    data = file1.read()
    contents = json.loads(data)
    doc = lucene.Document()
    field = lucene.Field("name", contents['name'], lucene.Field.Store.NO,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    field = lucene.Field("data", data, lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
    file1.close()

    writer.optimize()
    print "Indexed and optimized %d documents" % writer.numDocs()
    writer.close()
Example #15
    def __init__(self, root, storeDir, analyzer, startDate, endDate):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        store = lucene.SimpleFSDirectory(lucene.File(storeDir))
        # Create the IndexWriter object: the first argument is the Directory, the second
        # is the analyzer, the third says whether to create a new index (if False, the
        # existing index is updated in place), and the fourth sets the maximum field length
        # for tokenization, e.g. new MaxFieldLength(2) indexes only two tokens;
        # IndexWriter.MaxFieldLength.LIMITED is normally used.
        writer = lucene.IndexWriter(store, analyzer, False,
                                    lucene.IndexWriter.MaxFieldLength.LIMITED)
        writer.setMaxFieldLength(1048576)
        self.indexDocs(root, writer, startDate, endDate)
        ticker = Ticker()
        print 'optimizing index',
        threading.Thread(target=ticker.run).start()
        writer.optimize()
        writer.close()
        ticker.tick = False
        print 'done'
Example #16
def Index():
    field_list, conn, _config_dict = _InitIndexer()

    indexDir = _config_dict['indexDir']
    if not os.path.exists(indexDir):
        os.mkdir(indexDir)
    store = SimpleFSDirectory(lucene.File(indexDir))
    #print store
    writer = IndexWriter(store,
                         SmartChineseAnalyzer(lucene.Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)
    writer.setMaxFieldLength(1048576)
    try:
        ticker = Ticker()
        ticker.start()
        _IndexDocs(writer, field_list, conn)
        ticker.end()
        ticker.TimeCost()
    except Exception, e:
        print "Failed in Indexing...", e
        traceback.print_exc()
Example #17
    def search(self, query, field="content", limit=None):
        '''
        Searches the index based on the query supplied.
        '''
        directory = lucene.SimpleFSDirectory(lucene.File(self.index_dir))
        searcher = lucene.IndexSearcher(directory, True)

        query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, field,
                                   self.analyser).parse(query)
        try:
            #if there's no limit then use a collector to retrieve them all
            if limit is None:
                collector = DocumentHitCollector(searcher)
                scoreDocs = searcher.search(query, collector)
                results = collector.get_collected_documents()
            else:
                scoreDocs = searcher.search(query, limit).scoreDocs
                results = []
                for scoreDoc in scoreDocs:
                    results.append(searcher.doc(scoreDoc.doc))
        except lucene.JavaError, e:
            print e
Example #18
    def handle_noargs(self, **options):
        siteconfig = SiteConfiguration.objects.get_current()

        # Refuse to do anything if they haven't turned on search.
        if not siteconfig.get("search_enable"):
            sys.stderr.write('Search is currently disabled. It must be '
                             'enabled in the Review Board administration '
                             'settings to run this command.\n')
            sys.exit(1)

        if not have_lucene:
            sys.stderr.write('PyLucene is required to build the search index.\n')
            sys.exit(1)

        incremental = options.get('incremental', True)

        store_dir = siteconfig.get("search_index_file")
        if not os.path.exists(store_dir):
            os.mkdir(store_dir)
        timestamp_file = os.path.join(store_dir, 'timestamp')

        timestamp = 0
        if incremental:
            try:
                f = open(timestamp_file, 'r')
                timestamp = datetime.utcfromtimestamp(int(f.read()))
                f.close()
            except IOError:
                incremental = False

        f = open(timestamp_file, 'w')
        f.write('%d' % time.time())
        f.close()

        if lucene_is_2x:
            store = lucene.FSDirectory.getDirectory(store_dir, False)
            writer = lucene.IndexWriter(store, False,
                                        lucene.StandardAnalyzer(),
                                        not incremental)
        elif lucene_is_3x:
            store = lucene.FSDirectory.open(lucene.File(store_dir))
            writer = lucene.IndexWriter(store,
                lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT),
                not incremental,
                lucene.IndexWriter.MaxFieldLength.LIMITED)
        else:
            assert False

        status = Q(status='P') | Q(status='S')
        objects = ReviewRequest.objects.filter(status)
        if incremental:
            query = Q(last_updated__gt=timestamp)
            # FIXME: re-index based on reviews once reviews are indexed.  I
            # tried ORing this in, but it doesn't seem to work.
            #        Q(review__timestamp__gt=timestamp)
            objects = objects.filter(query)

        if sys.stdout.isatty():
            print 'Creating Review Request Index'
        totalobjs = objects.count()
        i = 0
        prev_pct = -1

        for request in objects:
            try:
                # Remove the old documents from the index
                if incremental:
                    writer.deleteDocuments(lucene.Term('id', str(request.id)))

                self.index_review_request(writer, request)

                if sys.stdout.isatty():
                    i += 1
                    pct = (i * 100 / totalobjs)
                    if pct != prev_pct:
                        sys.stdout.write("  [%s%%]\r" % pct)
                        sys.stdout.flush()
                        prev_pct = pct

            except Exception, e:
                sys.stderr.write('Error indexing ReviewRequest #%d: %s\n' % \
                                 (request.id, e))
Example #19
def index_ontology_files(oboFile, outDir, xref_map):
    """
    Iterates over our list of ontology files and creates an index for each file.
    """
    lucene.initVM()
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

    # Handle a little bit of lucene setup
    filename, _ext = os.path.splitext(os.path.basename(oboFile))

    indexDir = os.path.join(outDir, filename)
    if os.path.exists(indexDir):
        raise ExistingIndexDirectoryException(
            'Error, attempted to index same file twice or index two files named the same'
        )

    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = lucene.IndexWriter(dir, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))

    for term in oboparser.parse(oboFile, ['is_a']):
        if term.obsolete:
            continue

        doc = lucene.Document()
        add_field_to_document(doc, "term id", term.id, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED)
        add_field_to_document(doc, "name", term.name, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 4.0)

        # Frequently in the definition text we will run into URLs or some sort of hyperlinks that could
        # query hits that we would not want to occur thus errantly increasing the score of the field.
        # We will strip out these hyperlinks and index just the text.
        add_field_to_document(doc, "definition",
                              strip_urls_from_text(term.definition),
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 0.4)

        # Synonyms, relationships, xrefs, subsets, and alternate ID's are all represented as lists
        # in our Ontology object and need to be entered in one at a time
        add_fields_to_document(doc, "synonym",
                               [x[0] for x in term.synonyms if x],
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED, 0.7)

        add_fields_to_document(doc, "alt_id", term.alternateIds,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "xref",
            [replace_xref_identifier(x, xref_map) for x in term.xrefs],
            lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "relationship",
            [" ".join(list(x)) for x in list(term.relationships)],
            lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)
        add_fields_to_document(doc, "subset", term.subsets,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
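Example #19 leans on add_field_to_document, add_fields_to_document and strip_urls_from_text helpers that are defined elsewhere in that project. The following are hedged sketches of what they plausibly do; treating the trailing numeric argument as a per-field boost is an assumption:

import re
import lucene

_URL_RE = re.compile(r'https?://\S+|www\.\S+')

def strip_urls_from_text(text):
    # Drop hyperlinks so they cannot produce spurious hits on the definition field.
    return _URL_RE.sub("", text or "")

def add_field_to_document(doc, name, value, store, index, boost=None):
    # Wrap Field creation and apply an optional boost to weight this field in scoring.
    field = lucene.Field(name, value, store, index)
    if boost is not None:
        field.setBoost(boost)
    doc.add(field)

def add_fields_to_document(doc, name, values, store, index, boost=None):
    # List-valued attributes become one Field per value, all under the same field name.
    for value in values:
        add_field_to_document(doc, name, value, store, index, boost)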
Example #20
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import lucene, sys, os
import traceback

path = os.getcwd()
lucene.initVM()
dict = lucene.File("./myindex")
directory = lucene.FSDirectory.open(dict)
sp = lucene.SpellChecker(directory)
dictionary = lucene.File("%s/pipimovieUTF8.txt" % path)
sp.indexDictionary(lucene.PlainTextDictionary(dictionary))
suggestions = sp.suggestSimilar("天汽预报", 2)
for item in suggestions:
    print item
Example #21
import lucene

import sys
sys.path.append("..")
import util
from util.rake import Rake

print("load vm")
index_dir = '../../data/index/'

#directory holding the collocation data
location_dir = '../../data/location/'

lucene.initVM()

directory = lucene.SimpleFSDirectory(lucene.File(index_dir))
analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

directory1 = lucene.SimpleFSDirectory(lucene.File(location_dir))
analyzer1 = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

rake = Rake("../../data/SmartStoplist.txt")


def search(word):
    print("searching ")

    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    searcher = lucene.IndexSearcher(directory, True)
Example #22
# -*- coding: utf-8 -*-
#
from lucene import *
import lucene
text = ["a b c d" , "c d e e"]
texts = ["Python 是 一个 很有 吸引力 的 语言",
"C++ 语言 也 很 有 吸引力 , 长久 不衰",
"我们 希望 Python 和 C++ 高手加入",
"我们 的 技术 巨牛 ,人人 都是 高手"]

initVM()
INDEX_DIR = '/root/weibo_corpus/post_index'
directory = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))
analyzer = SimpleAnalyzer()


def read(filename):
    text = []
    with open(filename,'r') as f:
        count = 0
        for line in f:
            text.append(line.strip())
            count = count + 1
            if(count%10000==1):
                print(count)
    return text




def search(searcher,qtext):
Example #23
            doc.get("title").encode('gbk')
        ])

    # sort result
    results.sort(lambda x, y: cmp(x[0], y[0]))
    for name, owner, title in results:
        print name, owner, title


def test_fixture():
    global BOARDSPATH
    BOARDSPATH = './'


if __name__ == '__main__':
    #test_fixture()

    board = sys.argv[1]
    querystr = sys.argv[2].decode('gbk').strip()

    lucene.initVM()

    path = BOARDSPATH + board + '/' + RECENT_INDEX
    if not os.path.exists(path) or len(querystr) == 0:
        sys.exit(-1)
    directory = lucene.SimpleFSDirectory(lucene.File(path))
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    run(searcher, analyzer, querystr)
    searcher.close()
Example #24
 def __init__(self, storeDir):
     lucene.initVM()
     print 'lucene', lucene.VERSION
     self.dir = lucene.SimpleFSDirectory(lucene.File(storeDir))
Example #25
""" Creates a sample Lucene index for the full-text search feature. """

import lucene
import sys

if __name__ == "__main__":
    lucene.initVM()
    indexDir = "D:/Downloads/index"
    dir_ = lucene.SimpleFSDirectory(lucene.File(indexDir))
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = lucene.IndexWriter(dir_, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))

    print("Currently there are %d documents in the index..." %
          writer.numDocs())

    content = (
        "Strategische Konzeption, Umsetzung und Betreuung von langfristig " +
        "hochwirksamen und messbar erfolgreichen Maßnahmen in Social Media.")
    doc = lucene.Document()
    doc.add(
        lucene.Field("content", content, lucene.Field.Store.YES,
                     lucene.Field.Index.ANALYZED))
    doc.add(
Example #26
#!/usr/bin/python
#coding: utf-8

#script that builds the index

import lucene
import csv

index_dir = '../../data/index/'
data_dir = '../../data/corpus.csv'

lucene.initVM()
directory = lucene.SimpleFSDirectory(lucene.File(index_dir))
analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)


def build_index():
    f = open(data_dir)
    reader = csv.reader(f)

    print("开始创建索引")

    indx = 0

    writer = lucene.IndexWriter(directory, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength.UNLIMITED)

    for line in reader:
        eng, zh = line[0], line[1]

        doc = lucene.Document()
Example #27
 def __init__(self, dir_file_path):
     lucene.initVM()
     self.directory = lucene.SimpleFSDirectory(lucene.File(dir_file_path))
     self.analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_30)
     self.search = lucene.IndexSearcher(self.directory)