def main1():
    print "retrieve and display files......"
    direc = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    searcher = lucene.IndexSearcher(direc)
    search(searcher, analyzer)
    search2(searcher, analyzer)
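The search() and search2() helpers are defined elsewhere in that project. A minimal sketch of such a helper against the same PyLucene 3.x API (the "contents" and "name" field names are assumptions, not taken from the original code):

def search(searcher, analyzer):
    # read a query from the user, parse it against the assumed "contents"
    # field, and print the stored "name" field of the top 50 hits
    qtext = raw_input("Query: ")
    query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, "contents",
                               analyzer).parse(qtext)
    hits = searcher.search(query, 50).scoreDocs
    print "%s total matching documents." % len(hits)
    for hit in hits:
        doc = searcher.doc(hit.doc)
        print 'name:', doc.get("name")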
Example #2
def createIndex():
    # initialize Lucene and the JVM
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"

    
    #get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    
    #get index storage
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
   
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    src_dir = 'html_files'
    i = 0
    for name in os.listdir(src_dir):
        path = os.path.join(src_dir, name)
        with open(path, 'r') as myfile:
            data = myfile.read()
        i += 1
        document, errors = parsehtml(data)
        doc = lucene.Document()
        doc.add(lucene.Field("text", document, lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
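parsehtml() is also a project-local helper. A plausible stand-in that returns the same (text, errors) pair using only the Python 2 standard library (illustrative, not the original implementation):

from HTMLParser import HTMLParser

class _TextExtractor(HTMLParser):
    # collects the character data of an HTML document, ignoring tags
    def __init__(self):
        HTMLParser.__init__(self)
        self.chunks = []

    def handle_data(self, data):
        self.chunks.append(data)

def parsehtml(data):
    # returns (plain text, list of errors) to match the call site above
    extractor = _TextExtractor()
    errors = []
    try:
        extractor.feed(data)
        extractor.close()
    except Exception, e:
        errors.append(e)
    return ' '.join(extractor.chunks), errors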
Example #3
 def __init__(self):
     smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
     self.analyzers = {"smartcn": smartcn}
     directory = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
     self.searcher = lucene.IndexSearcher(directory, True)
     self.pgconn = mypass.getConn()
     self.sw = sinaweibooauth.SinaWeiboOauth()
Example #4
 def __init__(self, index_dir):
     '''
     Initialises index parameters
     '''
     lucene.initVM()
     self.index_dir = index_dir
     if not os.path.exists(self.index_dir):
         os.mkdir(self.index_dir)
     store = lucene.SimpleFSDirectory(lucene.File(self.index_dir))
     self.analyser = PorterStemmerAnalyzer()
     self.writer = lucene.IndexWriter(
         store, self.analyser, True,
         lucene.IndexWriter.MaxFieldLength.LIMITED)
     self.writer.setMaxFieldLength(1048576)
     directory = lucene.SimpleFSDirectory(lucene.File(self.index_dir))
     self.reader = lucene.IndexReader.open(directory, True)
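PorterStemmerAnalyzer is not part of PyLucene; the project defines it by subclassing PythonAnalyzer. A sketch along the lines of the stock PyLucene sample, assuming the 3.x token-stream API:

class PorterStemmerAnalyzer(lucene.PythonAnalyzer):
    def tokenStream(self, fieldName, reader):
        # standard tokenization, lowercasing, stop-word removal,
        # then Porter stemming
        result = lucene.StandardTokenizer(lucene.Version.LUCENE_CURRENT,
                                          reader)
        result = lucene.StandardFilter(result)
        result = lucene.LowerCaseFilter(result)
        result = lucene.StopFilter(True, result,
                                   lucene.StopAnalyzer.ENGLISH_STOP_WORDS_SET)
        result = lucene.PorterStemFilter(result)
        return result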
Example #5
    def search(self, restrictions, destination):
        """ 
        @see: L{NullPrincipalSearcher<datafinder.persistence.search.searcher.NullSearcher>} 
        
        E1101: Pylint cannot detect the internals of the modules solr and lucene. 
        """
        # pylint: disable=E1101

        results = list()
        queryString = search_restriction_mapping.mapSearchRestriction(
            restrictions)
        if self._configuration.luceneIndexUri.startswith("file:///"):
            try:
                self._configuration.env.attachCurrentThread()
                indexDir = lucene.SimpleFSDirectory(
                    lucene.File(
                        self._configuration.luceneIndexUri.replace(
                            "file:///", "")))
                analyzer = lucene.StandardAnalyzer(
                    lucene.Version.LUCENE_CURRENT)
                searcher = lucene.IndexSearcher(indexDir)
                query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT,
                                           "content",
                                           analyzer).parse(queryString)
                hits = searcher.search(query, constants.MAX_RESULTS)
                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    results.append("/%s" % urllib.unquote(
                        doc.get(constants.FILEPATH_FIELD).encode("utf-8")))
                searcher.close()
            except Exception, error:
                errorMessage = "Cannot search items. Reason: '%s'" % error
                raise PersistenceError(errorMessage)
Example #6
 def __init__(self, forumname):
     if forumname not in self.supported_forums:
         sys.exit("unsupported forum: %s" % forumname)
     else:
         self.forum = forumname
     self.STORE_DIR = self.STORE_BASE_DIR + forumname
     smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
     self.analyzers = {"smartcn": smartcn}
     directory = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
     self.searcher = lucene.IndexSearcher(directory, True)
     self.pgconn = mypass.getConn()
Example #7
 def __init__(self, network):
     self.network = network
     smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
     #analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33)
     analyzers = {"smartcn": smartcn}
     self.pgconn = mypass.getConn()
     writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33,
                                             analyzers["smartcn"])
     writerconfig.setWriteLockTimeout(600000L)
     writerconfig.setMaxThreadStates(50)
     writerconfig.setRAMBufferSizeMB(128.0)
     self.storeDir = self.storeDirBase + self.network
     store = lucene.SimpleFSDirectory(lucene.File(self.storeDir))
     self.writer = lucene.IndexWriter(store, writerconfig)
Example #8
	def __init__(self, root, storeDir, analyzer):
		if not os.path.exists(storeDir):
			os.mkdir(storeDir)
		store = lucene.SimpleFSDirectory(lucene.File(storeDir))
		writer = lucene.IndexWriter(store, analyzer, True, lucene.IndexWriter.MaxFieldLength.LIMITED)
		writer.setMaxFieldLength(1048576)
		self.indexDocs(root, writer)
		ticker = Ticker()
		print 'optimizing index',
		threading.Thread(target=ticker.run).start()
		writer.optimize()
		writer.close()
		ticker.tick = False
		print 'done'
Example #9
    def __init__(self):
        smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
        #analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33)
        analyzers = {"smartcn": smartcn}
        self.pgconn = mypass.getConn()
        self.sw = sinaweibooauth.SinaWeiboOauth()
        if not os.path.exists(self.storeDir):
            os.mkdir(self.storeDir)
        store = lucene.SimpleFSDirectory(lucene.File(self.storeDir))
        writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33,
                                                analyzers["smartcn"])
        writerconfig.setWriteLockTimeout(600000L)
        writerconfig.setMaxThreadStates(50)
        writerconfig.setRAMBufferSizeMB(128.0)
        self.writer = lucene.IndexWriter(store, writerconfig)
Example #10
def func_pic(command):
    global vm_env
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "graphIndex"
    directory = lucene.SimpleFSDirectory(lucene.File(STORE_DIR))
    searcher = lucene.IndexSearcher(directory, True)
    analyzer = lucene.SimpleAnalyzer(lucene.Version.LUCENE_CURRENT)
    resultInfo, title, url, imgurl, score = run(command, searcher, analyzer)
    searcher.close()
    return resultInfo, title, url, imgurl, score
Example #11
def index_files(board, time_delta):
    store = lucene.SimpleFSDirectory(
        lucene.File(BOARDSPATH + board + '/' + RECENT_INDEX))
    writer = lucene.IndexWriter(
        store, lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT), True,
        lucene.IndexWriter.MaxFieldLength.UNLIMITED)
    #  writer.setMaxFieldLength(1048576) # 1MB

    flist = get_all_files(board, time_delta)
    for filename, owner, title in flist:
        path = BOARDSPATH + board + '/' + filename
        if not os.path.exists(path):
            continue

        f = open(path, 'r')
        contents = filter_file(f)
        debug(contents)
        try:
            title = title.decode('gbk')
            owner = owner.decode('gbk')
            contents = unicode(contents, 'gbk')
        except UnicodeDecodeError:
            f.close()
            debug(filename)
            continue
        f.close()

        if len(contents) > 0:
            doc = lucene.Document()
            doc.add(
                lucene.Field("name", filename, lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
            doc.add(
                lucene.Field("owner", owner, lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
            doc.add(
                lucene.Field("title", title, lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
            doc.add(
                lucene.Field("contents", contents, lucene.Field.Store.NO,
                             lucene.Field.Index.ANALYZED))
            writer.addDocument(doc)
            debug('adding ' + filename)
    writer.optimize()
    writer.close()
Example #12
def main1():
    print "started indexing sample files......"
    direc = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    config = lucene.IndexWriterConfig(lucene.Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(lucene.IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = lucene.IndexWriter(direc, config)

    #fix this later.....FieldType not defined
    #field_type=lucene.FieldType()
    #field_type.setIndexed(True)
    #field_type.setStored(False)
    #field_type.setTokenized(False)

    file1 = open("nitin.json")
    data = file1.read()
    contents = json.loads(data)
    doc = lucene.Document()
    field = lucene.Field("name", contents['name'], lucene.Field.Store.NO,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    field = lucene.Field("data", data, lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
    file1.close()

    file1 = open("nitin2.json")
    data = file1.read()
    contents = json.loads(data)
    doc = lucene.Document()
    field = lucene.Field("name", contents['name'], lucene.Field.Store.NO,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    field = lucene.Field("data", data, lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
    file1.close()

    writer.optimize()
    print "Indexed and optimized %d documents" % writer.numDocs()
    writer.close()
Example #13
    def __init__(self, root, storeDir, analyzer, startDate, endDate):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        store = lucene.SimpleFSDirectory(lucene.File(storeDir))
        # Create the IndexWriter: the first argument is the Directory, the
        # second is the analyzer, the third says whether to create a fresh
        # index (False means update the existing one), and the fourth is the
        # maximum field length; MaxFieldLength(2), for instance, indexes only
        # two tokens, and IndexWriter.MaxFieldLength.LIMITED is the usual choice.
        writer = lucene.IndexWriter(store, analyzer, False,
                                    lucene.IndexWriter.MaxFieldLength.LIMITED)
        writer.setMaxFieldLength(1048576)
        self.indexDocs(root, writer, startDate, endDate)
        ticker = Ticker()
        print 'optimizing index',
        threading.Thread(target=ticker.run).start()
        writer.optimize()
        writer.close()
        ticker.tick = False
        print 'done'
Example #14
    def search(self, query, field="content", limit=None):
        '''
        Searches the index based on the query supplied.
        '''
        directory = lucene.SimpleFSDirectory(lucene.File(self.index_dir))
        searcher = lucene.IndexSearcher(directory, True)

        query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, field,
                                   self.analyser).parse(query)
        try:
            # if there's no limit, use a collector to retrieve them all
            if limit is None:
                collector = DocumentHitCollector(searcher)
                searcher.search(query, collector)  # the Collector overload returns None
                results = collector.get_collected_documents()
            else:
                scoreDocs = searcher.search(query, limit).scoreDocs
                results = []
                for scoreDoc in scoreDocs:
                    results.append(searcher.doc(scoreDoc.doc))
        except lucene.JavaError, e:
            print e
            results = []
        return results
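DocumentHitCollector is not a stock PyLucene class; it is defined in that project. PyLucene supports custom collectors via PythonCollector, so a minimal equivalent might look like this (the get_collected_documents name is copied from the call site above; everything else is an assumption):

class DocumentHitCollector(lucene.PythonCollector):
    # collects every matching document (used when no result limit is given)
    def __init__(self, searcher):
        super(DocumentHitCollector, self).__init__()
        self.searcher = searcher
        self.docBase = 0
        self.docs = []

    def setNextReader(self, reader, docBase):
        # collect() receives segment-relative ids; remember the offset
        self.docBase = docBase

    def acceptsDocsOutOfOrder(self):
        return True

    def collect(self, doc, score):
        self.docs.append(self.searcher.doc(doc + self.docBase))

    def get_collected_documents(self):
        return self.docs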
Example #15
# -*- coding: utf-8 -*-
#
from lucene import *
import lucene
text = ["a b c d" , "c d e e"]
texts = ["Python 是 一个 很有 吸引力 的 语言",
"C++ 语言 也 很 有 吸引力 , 长久 不衰",
"我们 希望 Python 和 C++ 高手加入",
"我们 的 技术 巨牛 ,人人 都是 高手"]

initVM()
INDEX_DIR = '/root/weibo_corpus/post_index'
directory = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))
analyzer = SimpleAnalyzer()


def read(filename):
    text = []
    with open(filename, 'r') as f:
        count = 0
        for line in f:
            text.append(line.strip())
            count += 1
            if count % 10000 == 1:
                print(count)
    return text


def search(searcher, qtext):
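    # (the body is cut off in the original listing; a minimal completion,
    #  assuming each indexed post stores its text in a field named "text")
    query = QueryParser(Version.LUCENE_CURRENT, "text", analyzer).parse(qtext)
    hits = searcher.search(query, 10).scoreDocs
    for hit in hits:
        print(searcher.doc(hit.doc).get("text"))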
Example #16
            doc.get("title").encode('gbk')
        ])

    # sort result
    results.sort(key=lambda x: x[0])
    for name, owner, title in results:
        print name, owner, title


def test_fixture():
    global BOARDSPATH
    BOARDSPATH = './'


if __name__ == '__main__':
    #test_fixture()

    board = sys.argv[1]
    querystr = sys.argv[2].decode('gbk').strip()

    lucene.initVM()

    path = BOARDSPATH + board + '/' + RECENT_INDEX
    if not os.path.exists(path) or len(querystr) == 0:
        sys.exit(-1)
    directory = lucene.SimpleFSDirectory(lucene.File(path))
    searcher = lucene.IndexSearcher(directory)
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    run(searcher, analyzer, querystr)
    searcher.close()
Example #17
 def __init__(self, storeDir):
     lucene.initVM()
     print 'lucene', lucene.VERSION
     self.dir = lucene.SimpleFSDirectory(lucene.File(storeDir))
Example #18
""" Creates a sample Lucene index for the full-text search feature. """

import lucene
import sys

if __name__ == "__main__":
    lucene.initVM()
    indexDir = "D:/Downloads/index"
    dir_ = lucene.SimpleFSDirectory(lucene.File(indexDir))
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = lucene.IndexWriter(dir_, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))

    print("Currently there are %d documents in the index..." %
          writer.numDocs())

    content = (
        "Strategische Konzeption, Umsetzung und Betreuung von langfristig " +
        "hochwirksamen und messbar erfolgreichen Maßnahmen in Social Media.")
    doc = lucene.Document()
    doc.add(
        lucene.Field("content", content, lucene.Field.Store.YES,
                     lucene.Field.Index.ANALYZED))
    doc.add(
Example #19
#!/usr/bin/python
#coding: utf-8

# script that builds the index

import lucene
import csv

index_dir = '../../data/index/'
data_dir = '../../data/corpus.csv'

lucene.initVM()
directory = lucene.SimpleFSDirectory(lucene.File(index_dir))
analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)


def build_index():
    f = open(data_dir)
    reader = csv.reader(f)

    print("开始创建索引")

    indx = 0

    writer = lucene.IndexWriter(directory, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength.UNLIMITED)

    for line in reader:
        eng, zh = line[0], line[1]

        doc = lucene.Document()
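        # (the listing breaks off here; presumably each language pair is
        #  indexed as two fields; the names "en" and "zh" are an assumption)
        doc.add(lucene.Field("en", eng, lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field("zh", zh, lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
        writer.addDocument(doc)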
Example #20
import lucene

import sys
sys.path.append("..")
import util
from util.rake import Rake

print("load vm")
index_dir = '../../data/index/'

# path to the collocation index
location_dir = '../../data/location/'

lucene.initVM()

directory = lucene.SimpleFSDirectory(lucene.File(index_dir))
analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

directory1 = lucene.SimpleFSDirectory(lucene.File(location_dir))
analyzer1 = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

rake = Rake("../../data/SmartStoplist.txt")


def search(word):
    print("searching ")

    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    searcher = lucene.IndexSearcher(directory, True)
Example #21
def index_ontology_files(oboFile, outDir, xref_map):
    """
    Iterates over our list of ontology files and creates an index for each file.
    """
    lucene.initVM()
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

    # Handle a little bit of lucene setup
    filename, _ext = os.path.splitext(os.path.basename(oboFile))

    indexDir = os.path.join(outDir, filename)
    if os.path.exists(indexDir):
        raise ExistingIndexDirectoryException(
            'Error: attempted to index the same file twice, or two files share a name'
        )

    store = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = lucene.IndexWriter(store, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))

    for term in oboparser.parse(oboFile, ['is_a']):
        if term.obsolete:
            continue

        doc = lucene.Document()
        add_field_to_document(doc, "term id", term.id, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED)
        add_field_to_document(doc, "name", term.name, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 4.0)

        # Frequently the definition text contains URLs or other hyperlinks
        # that could produce unwanted query hits and errantly inflate the
        # field's score, so we strip the hyperlinks out and index just the text.
        add_field_to_document(doc, "definition",
                              strip_urls_from_text(term.definition),
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 0.4)

        # Synonyms, relationships, xrefs, subsets, and alternate IDs are all
        # represented as lists in our Ontology object and need to be added
        # one at a time.
        add_fields_to_document(doc, "synonym",
                               [x[0] for x in term.synonyms if x],
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED, 0.7)

        add_fields_to_document(doc, "alt_id", term.alternateIds,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "xref",
            [replace_xref_identifier(x, xref_map) for x in term.xrefs],
            lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "relationship",
            [" ".join(list(x)) for x in list(term.relationships)],
            lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)
        add_fields_to_document(doc, "subset", term.subsets,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
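add_field_to_document, add_fields_to_document, and strip_urls_from_text are project helpers that the listing does not include. Sketches consistent with how they are called above (the boost handling and the URL regex are assumptions):

import re

def add_field_to_document(doc, name, value, store, index, boost=None):
    # wraps lucene.Field creation so a per-field boost can be applied
    field = lucene.Field(name, value, store, index)
    if boost is not None:
        field.setBoost(boost)
    doc.add(field)

def add_fields_to_document(doc, name, values, store, index, boost=None):
    # list-valued attributes are added as one field instance per value
    for value in values:
        add_field_to_document(doc, name, value, store, index, boost)

_URL_RE = re.compile(r'https?://\S+|www\.\S+')

def strip_urls_from_text(text):
    # drop hyperlinks so they cannot produce spurious query hits
    return _URL_RE.sub('', text or '')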
Example #22
 def __init__(self, dir_file_path):
     lucene.initVM()
     self.directory = lucene.SimpleFSDirectory(lucene.File(dir_file_path))
     self.analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_30)
     self.search = lucene.IndexSearcher(self.directory)
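A hypothetical companion method for this wrapper, matching the LUCENE_30 analyzer it creates (the "content" field name is an assumption):

 def query(self, text, limit=10):
     # parse the query with the same analyzer used at index time and
     # return the stored "content" field of the top hits
     parser = lucene.QueryParser(lucene.Version.LUCENE_30, "content",
                                 self.analyzer)
     hits = self.search.search(parser.parse(text), limit).scoreDocs
     return [self.search.doc(hit.doc).get("content") for hit in hits]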