Beispiel #1
0
def init_lucene_search():
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    print 'Index ', INDEX_DIR
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))  # current dir
    directory = SimpleFSDirectory(File(INDEX_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT,
                                    SmartChineseAnalyzer.getDefaultStopSet())

    return searcher, analyzer
Beispiel #2
0
 def __init__(self, path=INDEX_DIR):
     """Initialise Lucene and open the index at ``path`` for searching.

     Sets ``self.indir`` (index directory), ``self.analyzer`` (Chinese
     tokenizer/analyzer), ``self.reader`` and ``self.searcher``.
     """
     lucene.initVM()
     index_location = Paths.get(path)
     self.indir = SimpleFSDirectory(index_location)
     self.analyzer = SmartChineseAnalyzer()
     # The searcher is layered on top of the reader kept on the instance.
     opened_reader = DirectoryReader.open(self.indir)
     self.reader = opened_reader
     self.searcher = IndexSearcher(opened_reader)
Beispiel #3
0
def searchResults(command):
    """Run ``command`` against the index in ``./index_2`` and return the hits.

    Args:
        command: Query text handed to ``run`` together with the searcher and
            a ``SmartChineseAnalyzer``.

    Returns:
        The second element of the ``(num, results)`` pair produced by ``run``.
    """
    STORE_DIR = "./index_2"
    directory = SimpleFSDirectory(Paths.get(STORE_DIR))
    reader = DirectoryReader.open(directory)
    try:
        searcher = IndexSearcher(reader)
        analyzer = SmartChineseAnalyzer()
        _num, results = run(searcher, analyzer, command)
    finally:
        # Dropping the searcher does not release the index files; the reader
        # must be closed explicitly or every call leaks open file handles.
        # NOTE(review): assumes ``results`` is fully materialised by ``run``
        # and does not read from the index lazily - confirm.
        reader.close()
    return results
Beispiel #4
0
def init_lucene_search():
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    print 'Index ', INDEX_DIR
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))  # current dir
    directory = SimpleFSDirectory(File(INDEX_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT, SmartChineseAnalyzer.getDefaultStopSet())

    return searcher, analyzer
Beispiel #5
0
    def __init__(self):
        """Create an in-memory index writer that analyses text as Chinese.

        The backing ``RAMDirectory`` is exposed as ``self.indexDir`` and the
        writer as ``self.writer``.
        """
        ram_store = RAMDirectory()
        cfg = IndexWriterConfig(SmartChineseAnalyzer())

        # CREATE mode replaces any previously indexed documents.
        cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        cfg.setSimilarity(mySimilarity())
        similarity_in_use = cfg.getSimilarity()
        logger.debug('search similarity:{}'.format(similarity_in_use))
        self.indexDir = ram_store
        self.writer = IndexWriter(ram_store, cfg)
Beispiel #6
0
def index(request):
    """Answer the request with the best-matching answer from the QA index.

    The query text and the index location are hard-coded; ``request`` itself
    is not inspected.

    Args:
        request: The incoming HTTP request (unused).

    Returns:
        An ``HttpResponse`` whose body is the UTF-8 encoded ``answer`` field
        of the top hit, or an empty 404 response when the query matches
        nothing or the top hit has no ``answer`` field.
    """
    # Either attach this thread to the running JVM or start one; everything
    # after this point is identical in both cases.
    vm_env = lucene.getVMEnv()
    if vm_env:
        vm_env.attachCurrentThread()
    else:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    directory = SimpleFSDirectory(
        Paths.get("/Users/css/nlplearn/yuliao/index1"))
    reader = DirectoryReader.open(directory)
    try:
        searcher = IndexSearcher(reader)
        ana = SmartChineseAnalyzer()
        command = "你好"
        query = QueryParser("question", ana).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        if len(scoreDocs) == 0:
            # Previously this case crashed with IndexError on scoreDocs[0].
            return HttpResponse('', status=404)
        answer = searcher.doc(scoreDocs[0].doc).get("answer")
    finally:
        # Close the reader on every path so requests do not leak file handles.
        reader.close()

    if answer is None:
        # The stored document has no "answer" field to send back.
        return HttpResponse('', status=404)
    return HttpResponse(answer.encode('utf-8'))
Beispiel #7
0
    def __init__(self, root, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        # analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        # use smart chinese analyzer
        analyzer = SmartChineseAnalyzer(
            Version.LUCENE_CURRENT, SmartChineseAnalyzer.getDefaultStopSet())
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)
        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'
def search(command):
    """Search the memory-mapped index in ``index`` for ``command``.

    Args:
        command: Query text forwarded to ``run``.

    Returns:
        Whatever ``run`` returns for the searcher, a ``SmartChineseAnalyzer``
        and ``command``.
    """
    STORE_DIR = "index"
    index_path = Paths.get(STORE_DIR)
    index_reader = DirectoryReader.open(MMapDirectory(index_path))
    searcher = IndexSearcher(index_reader)
    chinese_analyzer = SmartChineseAnalyzer()
    ans = run(searcher, chinese_analyzer, command)
    # Drop the local reference to the searcher before handing back the result.
    del searcher
    return ans


# vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
# for y in search('二三四五'):
#     print(y)
Beispiel #9
0
    def __init__(self, lang):
        """Open the SSQA index for searching in the given language.

        Args:
            lang: Language code. Only ``'zh'`` is accepted by this code, even
                though the error message also mentions ``"en"``.

        Raises:
            ValueError: If ``lang`` is anything other than ``'zh'``.
        """
        lucene.initVM()

        # Guard clause: reject unsupported languages before touching the index.
        if lang != 'zh':
            raise ValueError(
                'lang should be "zh" or "en", {} is invalid!'.format(lang))

        index_dir = SimpleFSDirectory(Paths.get(str(config.IDX_SSQA)))
        zh_analyzer = SmartChineseAnalyzer()

        self.reader = DirectoryReader.open(index_dir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = zh_analyzer
        active_similarity = self.searcher.getSimilarity()
        logger.debug('search similarity func: {}'.format(active_similarity))
Beispiel #10
0
    def __init__(self, lang):
        """Open an index writer on the SSQA index for the given language.

        Args:
            lang: Language code. Only ``'zh'`` is accepted by this code, even
                though the error message also mentions ``"en"``.

        Raises:
            ValueError: If ``lang`` is anything other than ``'zh'``.
        """
        lucene.initVM()

        # Guard clause: reject unsupported languages before touching the index.
        if lang != 'zh':
            raise ValueError(
                'lang should be "zh" or "en", {} is invalid!'.format(lang))

        logger.info("index directory:{}".format(config.IDX_SSQA))
        index_dir = SimpleFSDirectory(Paths.get(str(config.IDX_SSQA)))
        cfg = IndexWriterConfig(SmartChineseAnalyzer())
        cfg.setSimilarity(mySimilarity())
        active_similarity = cfg.getSimilarity()
        logger.debug('writer similarity func: {}'.format(active_similarity))
        self.writer = IndexWriter(index_dir, cfg)
Beispiel #11
0
    def __init__(self, path):
        """Open the index at ``path`` and load or train the word2vec model.

        Args:
            path: Location of the Lucene index directory.
        """
        print('Searcher initialized...')
        self.path = path
        self.analyzer = SmartChineseAnalyzer()
        index_dir = SimpleFSDirectory(Paths.get(self.path))
        self.reader = DirectoryReader.open(index_dir)
        self.searcher = IndexSearcher(self.reader)
        self.thu = thulac.thulac(deli='/')

        # Train only when no saved model file exists in the working directory.
        model_file = Path('w2v.model')
        if not model_file.is_file():
            self.model_train()
            print('Model trained...')
        else:
            print('Model was already trained...loading model')
            self.w2v_model = Word2Vec.load('w2v.model')
Beispiel #12
0
    def __init__(self, Lid, db_path=config.DB_SSQA):
        """Index one lesson in memory and prepare a searcher over it.

        Args:
            Lid: Lesson identifier passed to ``self.db.get_lesson_str``.
            db_path: Location handed to ``SSQA_DB``.
        """
        lucene.initVM()
        self.db = SSQA_DB(db_path)

        parags = str_lesson2parags(self.db.get_lesson_str(Lid))

        # Build the RAM index for this lesson and close the indexer before
        # opening a reader on its directory.
        ram_indexer = _ChineseRamIndexer()
        ram_indexer.index_lesson(parags)
        ram_indexer.close()

        self.reader = DirectoryReader.open(ram_indexer.indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = SmartChineseAnalyzer()
        active_similarity = self.searcher.getSimilarity()
        logger.debug('search similarity:{}'.format(active_similarity))
Beispiel #13
0
    def __init__(self, root, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        # analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        # use smart chinese analyzer
        analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT, SmartChineseAnalyzer.getDefaultStopSet())
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)
        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print 'done'
Beispiel #14
0
def index_and_search_sentence(list_paragraph, question):
    """Index every sentence of the paragraphs in RAM and search for ``question``.

    Each paragraph is split on ``#`` and every piece is added to an in-memory
    index as one sentence.

    Args:
        list_paragraph: Iterable of paragraph strings whose sentences are
            separated by ``#``.
        question: Query passed to the searcher.

    Returns:
        The result of ``SSQA_S_Searcher.search(question, 1)``.
    """
    ramDir = RAMDirectory()
    analyzer = SmartChineseAnalyzer()
    myIndexer = SSQA_S_Indexer(ramDir, analyzer)
    try:
        sent_num = 0
        logger.info("Start indexing sentences...")
        for paragraph in tqdm(list_paragraph):
            for sent in re.split('#', paragraph):
                myIndexer.add(sent)
                sent_num += 1
        logger.info("Indexed {} sentences.".format(sent_num))
    finally:
        # Close exactly once, on success and on failure, and before the
        # searcher is opened on the same directory.
        myIndexer.close()

    # The searcher is created outside the indexing ``try`` so a failure while
    # indexing can no longer hit an unbound ``mySearcher`` during cleanup and
    # mask the real error with a NameError.
    mySearcher = SSQA_S_Searcher(ramDir, analyzer)
    try:
        return mySearcher.search(question, 1)
    finally:
        mySearcher.close()
    def __init__(self, root, storeDir):
        """Index the documents under ``root`` into the index at ``storeDir``.

        The index is created when absent and appended to otherwise.

        Args:
            root: Passed to ``self.indexDocs`` as the source to index.
            storeDir: Directory holding the index; created when missing.
        """
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(Paths.get(storeDir))
        analyzer = SmartChineseAnalyzer()
        # Cap the number of tokens taken from each field at 1048576.
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)
        ticker = Ticker()
        print('commit index', )
        threading.Thread(target=ticker.run).start()
        try:
            writer.commit()
            writer.close()
        finally:
            # Clear the flag even when commit/close fails; otherwise the
            # ticker thread (presumably looping while ``tick`` is true) would
            # keep running after the error.
            ticker.tick = False
        print('done')
def searchResults(command):
    """Run ``command`` against the index in ``index`` and return the hits.

    Args:
        command: Query text handed to ``run`` together with the searcher and
            a ``SmartChineseAnalyzer``.

    Returns:
        The ``(num, results)`` pair produced by ``run``.
    """
    STORE_DIR = "index"
    vm_env = getenv()
    try:
        vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    except Exception:
        # initVM failed (presumably because the JVM is already running), so
        # attach this thread to the existing VM instead. Catching Exception
        # rather than everything lets KeyboardInterrupt/SystemExit propagate.
        vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(Paths.get(STORE_DIR))
    reader = DirectoryReader.open(directory)
    try:
        searcher = IndexSearcher(reader)
        analyzer = SmartChineseAnalyzer()
        num, results = run(searcher, analyzer, command)
    finally:
        # Dropping the searcher does not release the index files; the reader
        # must be closed explicitly or every call leaks open file handles.
        # NOTE(review): assumes ``results`` is fully materialised by ``run``
        # and does not read from the index lazily - confirm.
        reader.close()
    return num, results


# if __name__ == '__main__':
#     STORE_DIR = "index"
#     vm_env = getenv()
#     try:
#         vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
#     except:
#         vm_env.attachCurrentThread()
#     print('lucene', lucene.VERSION)
#     #base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
#     directory = SimpleFSDirectory(Paths.get(STORE_DIR))
#     searcher = IndexSearcher(DirectoryReader.open(directory))
#     analyzer = SmartChineseAnalyzer()
#     command = input("Query:")
#     num,results = run(searcher, analyzer,command)
#     print(num)
#     for result in results:
#         print(result['url'])
#         print(result['Acodes'])
#         print(result['Stockname'])
#         print(result['relative1'])
#         print(result['relative2'])
#     del searcher
Beispiel #17
0
                    contents = unicode(file.read(), 'utf-8')
                    file.close()
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print IndexFiles.__doc__
        sys.exit(1)
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    start = datetime.now()
    try:
        # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        IndexFiles(sys.argv[1], INDEX_DIR,
                   SmartChineseAnalyzer(Version.LUCENE_CURRENT, SmartChineseAnalyzer.getDefaultStopSet()))
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e
        raise e
Beispiel #18
0
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print IndexFiles.__doc__
        sys.exit(1)
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    start = datetime.now()
    try:
        # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        IndexFiles(
            sys.argv[1], INDEX_DIR,
            SmartChineseAnalyzer(Version.LUCENE_CURRENT,
                                 SmartChineseAnalyzer.getDefaultStopSet()))
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e
        raise e
Beispiel #19
0
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'utf-8')
        if command == '':
            return

        print
        print "Searching for:", command
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print doc.get("name1"), doc.get("name2")


if __name__ == '__main__':
    # Script entry point: start the JVM, open the index stored in STORE_DIR
    # and hand a searcher plus a Chinese analyzer to run().
    STORE_DIR = "index"
    vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    vm_env.attachCurrentThread()
    print 'lucene', lucene.VERSION
    #base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT)
    run(searcher, analyzer)
    # Only the Python reference is dropped here; the reader opened above is
    # never closed explicitly.
    del searcher