Code example #1
def main():
    INDEX_DIR = "indexes"
    try:
        print "Indexing..."
        indexDir = File("/home/ubuntu/Desktop/CoCaBu_remote/GitSearch/Indices")

        #writer = IndexWriter(SimpleFSDirectory(indexDir), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = KeywordAnalyzer()  #PorterAnalyzer( StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDir), config)

        index_code_snippet(writer)

        writer.close()
    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
Code example #2
def main():
	try:
		indicesDestination = File(dest_path)
		analyzer = KeywordAnalyzer()
		porter_analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
		a = {"code": porter_analyzer, "description": porter_analyzer, "typed_method_call": KeywordAnalyzer(),
			 "extends": KeywordAnalyzer(), "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
			 "class_instance_creation": KeywordAnalyzer(), "id": KeywordAnalyzer(), "literals": porter_analyzer}
		wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
		config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

		writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
		counter = Counter()
		generate_indices_from_benchmark(writer, counter)
		writer.close()

		print "All jobs are done.."
		print str(counter)

	except CorruptIndexException as e:		#when index is corrupt
			e.printStackTrace()
	except LockObtainFailedException as e:	#when other writer is using the index
			e.printStackTrace()
	except IOException as e:	#when directory can't be read/written
			e.printStackTrace()
Code example #3
def main():
    INDEX_DIR = "indexes"
    try:
        print "Indexing..."
        indexDir = File("/Users/Raphael/Downloads/stackoverflow1107")

        #writer = IndexWriter(SimpleFSDirectory(indexDir), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "methods_called": KeywordAnalyzer(),
            "view_count": KeywordAnalyzer(),
            "code_hints": JavaCodeAnalyzer()
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDir), config)

        index_code_snippet(writer)

        writer.commit()
        writer.close()
        print "Done"
    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
    except SQLException as e:  #when Database error occurs
        e.printStackTrace()
Code example #4
def main():
    try:
        print "Indexing starts..."
        # indicesDestination = File("/Users/Falcon/Desktop/dyclink_2014")############################################

        indicesDestination = File("/Indices/dyclink/2014")

        analyzer = KeywordAnalyzer()  # Treats the whole text as a single token (effectively the same as not analyzing at all)
        a = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }  # Field-to-analyzer map for PerFieldAnalyzerWrapper (a plain dict in Python)
        # See http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PerFieldAnalyzerWrapper.py?revision=1757704&view=co
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)

        counter = Counter()
        generate_indices_from_projects(writer, counter)
        writer.close()

        print "Done"
        print str(counter)

    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
Code example #5
File: Indexer_Question.py  Project: pombredanne/facoy
def main():
	try:
		print "Indexing..."
		indexDestination = File("/Users/Falcon/Desktop/New_Indices/Stack_Q_Indices")
		# writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
		analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
		a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer(),
			 "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
			 "class_instance_creation": KeywordAnalyzer(), "methods_called": KeywordAnalyzer(),
			 "view_count": KeywordAnalyzer(), "code_hints": JavaCodeAnalyzer()}
		# KeywordAnalyzer: treats the field's entire text as a single token
		wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
		# PerFieldAnalyzerWrapper: lets a different analyzer be assigned to each field
		config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
		config.setInfoStream(System.out)  # Debug Lucene indexing; the Luke tool can also be used to inspect and manage the index
		writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

		counter = Counter()
		index_code_snippet(writer, counter)
		writer.commit()

		writer.close()
		print "Done"
		print str(counter)

	except CorruptIndexException as e:  # when index is corrupt
		e.printStackTrace()
	except LockObtainFailedException as e:  # when other writer is using the index
		e.printStackTrace()
	except IOException as e:  # when directory can't be read/written
		e.printStackTrace()
	except SQLException as e:  # when Database error occurs
		e.printStackTrace()
Code example #6
	def load_index(self):
		a = {"code": self.porter_analyzer, "description": self.porter_analyzer, "typed_method_call": KeywordAnalyzer(),
			 "extends": KeywordAnalyzer(), "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
			 "class_instance_creation": KeywordAnalyzer(), "id": KeywordAnalyzer(), "literals": self.porter_analyzer}
		self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
		self.directory = SimpleFSDirectory(self.index_path)
		self.reader = DirectoryReader.open(self.directory)
		self.searcher = IndexSearcher(self.reader)
Code example #7
 def load_index(self):
     indexDir = File(self.index_path)
     a = {"code": self.porter_analyzer}
     self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
     index = SimpleFSDirectory(indexDir)
     self.reader = IndexReader.open(index)
     n_docs = self.reader.numDocs()
     self.searcher = IndexSearcher(self.reader)
     print("Index contains %d documents." % n_docs)
Code example #8
    def testPerField(self):

        perField = HashMap()
        perField.put("special", SimpleAnalyzer())
        analyzer = PerFieldAnalyzerWrapper(WhitespaceAnalyzer(), perField)

        text = "Qwerty"
        tokenStream = analyzer.tokenStream("field", StringReader(text))
        tokenStream.reset()
        termAtt = tokenStream.getAttribute(CharTermAttribute.class_)

        self.assertTrue(tokenStream.incrementToken())
        self.assertEqual("Qwerty", termAtt.toString(),
                         "WhitespaceAnalyzer does not lowercase")

        tokenStream = analyzer.tokenStream("special", StringReader(text))
        tokenStream.reset()
        termAtt = tokenStream.getAttribute(CharTermAttribute.class_)
        self.assertTrue(tokenStream.incrementToken())
        self.assertEqual("qwerty", termAtt.toString(),
                         "SimpleAnalyzer lowercases")
Code example #10
 def load_index(self):
     a = {
         "code": self.porter_analyzer,
         "description": self.porter_analyzer,
         "typed_method_call": KeywordAnalyzer(),
         "extends": KeywordAnalyzer(),
         "used_classes": KeywordAnalyzer(),
         "methods": KeywordAnalyzer(),
         "class_instance_creation": KeywordAnalyzer(),
         "id": KeywordAnalyzer(),
         "literals": self.porter_analyzer
     }
     self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
Code example #11
def main():
    try:
        print "Indexing..."
        #########################################  Path  ####################################
        indexDestination = File(
            "/Users/Falcon/Desktop/New_Indices/Stack_A_Indices")

        #writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": analyzer,
            "extends": analyzer,
            "used_classes": analyzer,
            "methods": analyzer,
            "class_instance_creation": analyzer,
            "methods_called": analyzer,
            "view_count": KeywordAnalyzer(),
            "code_hints": JavaCodeAnalyzer()
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

        # analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        # a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer(),
        # 	 "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
        # 	 "class_instance_creation": KeywordAnalyzer(), "methods_called": KeywordAnalyzer(),
        # 	 "view_count": KeywordAnalyzer(), "code_hints": JavaCodeAnalyzer()}
        # wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        # config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        # writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

        counter = Counter()
        index_code_snippet(writer, counter)
        writer.commit()
        writer.close()

        print "Done"
        print str(counter)

    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
    except SQLException as e:  #when Database error occurs
        e.printStackTrace()
Code example #12
    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)

        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
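This example acquires a searcher from the SearcherManager but never shows it being returned. A minimal sketch of the matching teardown (the close_index method name is a hypothetical addition; the fields are the ones created above):

    def close_index(self):
        # Every acquire() must be paired with a release(); the searcher must not be used afterwards.
        self.searchermgr.release(self.searcher)
        self.searcher = None
        self.directory.close()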
Code example #13
File: Project_Searcher.py  Project: pombredanne/facoy
    def load_index(self):
        indexDir = File(self.index_path)
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer()
        }
        self.analyzer = PerFieldAnalyzerWrapper(self.porter_analyzer, a)

        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(index)
        n_docs = self.reader.numDocs()
        self.searcher = IndexSearcher(self.reader)
        print("\nLoading Indices... GitHub index contains [%d] documents." %
              n_docs)
Code example #14
    def load_index(self):
        indexDir = File(self.index_path)
        porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "code": JavaCodeAnalyzer()
        }

        self.analyzer = PerFieldAnalyzerWrapper(porter_analyzer, a)
        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(index)  # Note: this IndexReader is opened here but never closed
        n_docs = self.reader.numDocs()
        print("Index contains %d documents." % n_docs)
Code example #15
File: Indexer_GitHub.py  Project: pombredanne/facoy
def main(src, dst):
    try:
        start_time = time.time()

        print "Indexing starts..."
        indicesDestination = File(dst)
        #writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        # Analyzer: text such as a title or body must be run through an analyzer and split into terms before it can be indexed.
        # The Analyzer is passed, together with a Directory, to the IndexWriter constructor; it splits the given text into indexable terms and strips out unneeded words.

        analyzer = KeywordAnalyzer()  # Treats the whole text as a single token (effectively the same as not analyzing at all)
        a = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }  # Field-to-analyzer map for PerFieldAnalyzerWrapper (a plain dict in Python)
        # See http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PerFieldAnalyzerWrapper.py?revision=1757704&view=co
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
        # SimpleFSDirectory stores the index files in a specific directory on the file system (the other storage options are DB and RAM).
        # config carries the analyzer and other settings the IndexWriter needs.

        counter = Counter()
        generate_indices_from_projects(src, writer, counter)
        writer.close()
        print "Done"
        print str(counter)
        print "$$$%s\tseconds" % (time.time() - start_time)

    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
Code example #16
def main():
	try:
		print "Indexing starts..."
		indicesDestination = File("/Users/Falcon/Desktop/New_Indices/IJA_Indices")

		analyzer = KeywordAnalyzer()  
		a = {"code": JavaCodeAnalyzer(), "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)}
		wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a) 				
		config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

		writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
		counter = Counter()
		generate_indices_from_projects(writer, counter)
		writer.close()

		print "Done"
		print str(counter)

	except CorruptIndexException as e:		#when index is corrupt
			e.printStackTrace()
	except LockObtainFailedException as e:	#when other writer is using the index
			e.printStackTrace()
	except IOException as e:	#when directory can't be read/written
			e.printStackTrace()
Code example #17
File: SOSearcher.py  Project: pombredanne/CoCaBu
    print apis


indexDir = File("/tmp/stackoverflow")

# 1. open the index
analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
index = SimpleFSDirectory(indexDir)
reader = IndexReader.open(index)
n_docs = reader.numDocs()
print("Index contains %d documents." % n_docs)

# 2. parse the query from the command line
a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer()}
wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)

query_string = "lucene get similar documents to the current one"
query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, ["title"],
                                     wrapper_analyzer)

#base_query = getSpanNearQuery(analyzer, query_string)

base_query = query_parser.parse(query_string)

#http://shaierera.blogspot.com/2013/09/boosting-documents-in-lucene.html
boost_query = FunctionQuery(LongFieldSource("view_count"))
query = CustomScoreQuery(base_query, boost_query)

# queryparser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
# query = queryparser.parse(query_string)
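
Example #17 builds the boosted query but stops before executing it. A minimal continuation under the same setup (the top-10 limit and the result loop are illustrative assumptions; the title and view_count fields come from the snippet itself) might look like:

# Hypothetical continuation: run the boosted query against the reader opened above.
searcher = IndexSearcher(reader)
hits = searcher.search(query, 10).scoreDocs  # top-10 ScoreDoc results
for hit in hits:
    doc = searcher.doc(hit.doc)
    print doc.get("title"), doc.get("view_count")
reader.close()  # release the index once finished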