Beispiel #1
0
def main():
    INDEX_DIR = "indexes"
    try:
        print "Indexing..."
        indexDir = File("/home/ubuntu/Desktop/CoCaBu_remote/GitSearch/Indices")

        #writer = IndexWriter(SimpleFSDirectory(indexDir), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = KeywordAnalyzer(
        )  #PorterAnalyzer( StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDir), config)

        index_code_snippet(writer)

        writer.close()
    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
Beispiel #2
0
def main():
	try:
		indicesDestination = File(dest_path)
		analyzer = KeywordAnalyzer()
		porter_analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
		a = {"code": porter_analyzer, "description": porter_analyzer, "typed_method_call": KeywordAnalyzer(),
			 "extends": KeywordAnalyzer(), "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
			 "class_instance_creation": KeywordAnalyzer(), "id": KeywordAnalyzer(), "literals": porter_analyzer}
		wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
		config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

		writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
		counter = Counter()
		generate_indices_from_benchmark(writer, counter)
		writer.close()

		print "All jobs are done.."
		print str(counter)

	except CorruptIndexException as e:		#when index is corrupt
			e.printStackTrace()
	except LockObtainFailedException as e:	#when other writer is using the index
			e.printStackTrace()
	except IOException as e:	#when directory can't be read/written
			e.printStackTrace()
Beispiel #3
0
def main():
    INDEX_DIR = "indexes"
    try:
        print "Indexing..."
        indexDir = File("/Users/Raphael/Downloads/stackoverflow1107")

        #writer = IndexWriter(SimpleFSDirectory(indexDir), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "methods_called": KeywordAnalyzer(),
            "view_count": KeywordAnalyzer(),
            "code_hints": JavaCodeAnalyzer()
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDir), config)

        index_code_snippet(writer)

        writer.commit()
        writer.close()
        print "Done"
    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
    except SQLException as e:  #when Database error occurs
        e.printStackTrace()
Beispiel #4
0
def main():
    try:
        print "Indexing starts..."
        # indicesDestination = File("/Users/Falcon/Desktop/dyclink_2014")############################################

        indicesDestination = File("/Indices/dyclink/2014")

        analyzer = KeywordAnalyzer(
        )  #전체 텍스트를 하나의 토큰으로 다룬다. (즉, Analyze 하지 않는 것과 결과적으로 동일하다.)
        a = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }  #PerFieldAnalyzerWrapper를 사용하기 위한 map 생성 (Python 에서는 Dict())
        wrapper_analyzer = PerFieldAnalyzerWrapper(
            analyzer, a
        )  #http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PerFieldAnalyzerWrapper.py?revision=1757704&view=co
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)

        counter = Counter()
        generate_indices_from_projects(writer, counter)
        writer.close()

        print "Done"
        print str(counter)

    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
Beispiel #5
0
def main():
	try:
		print "Indexing..."
		indexDestination = File("/Users/Falcon/Desktop/New_Indices/Stack_Q_Indices")
		# writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
		analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
		a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer(),
			 "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
			 "class_instance_creation": KeywordAnalyzer(), "methods_called": KeywordAnalyzer(),
			 "view_count": KeywordAnalyzer(), "code_hints": JavaCodeAnalyzer()}
		#KeywordAnalyzer : 필드의 전체 원문을 하나의 토큰으로 처리
		wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
		#PerFieldAnalyzerWrapper : 필드별로 분석기를 지정하는 기능을 지원하는 클래스
		config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
		config.setInfoStream(System.out)  # 루씬 색인작업 디버깅 // 루크라는 도구를 사용해서 루씬 색인 관리를 할 수도 있음..
		writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

		counter = Counter()
		index_code_snippet(writer, counter)
		writer.commit()

		writer.close()
		print "Done"
		print str(counter)

	except CorruptIndexException as e:  # when index is corrupt
		e.printStackTrace()
	except LockObtainFailedException as e:  # when other writer is using the index
		e.printStackTrace()
	except IOException as e:  # when directory can't be read/written
		e.printStackTrace()
	except SQLException as e:  # when Database error occurs
		e.printStackTrace()
Beispiel #6
0
	def load_index(self):
		a = {"code": self.porter_analyzer, "description": self.porter_analyzer, "typed_method_call": KeywordAnalyzer(),
			 "extends": KeywordAnalyzer(), "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
			 "class_instance_creation": KeywordAnalyzer(), "id": KeywordAnalyzer(), "literals": self.porter_analyzer}
		self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
		self.directory = SimpleFSDirectory(self.index_path)
		self.reader = DirectoryReader.open(self.directory)
		self.searcher = IndexSearcher(self.reader)
Beispiel #7
0
 def load_index(self):
     indexDir = File(self.index_path)
     a = {"code": self.porter_analyzer}
     self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
     index = SimpleFSDirectory(indexDir)
     self.reader = IndexReader.open(index)
     n_docs = self.reader.numDocs()
     self.searcher = IndexSearcher(self.reader)
     print("Index contains %d documents." % n_docs)
    def testPerField(self):

        perField = HashMap()
        perField.put("special", SimpleAnalyzer())
        analyzer = PerFieldAnalyzerWrapper(WhitespaceAnalyzer(), perField)

        text = "Qwerty"
        tokenStream = analyzer.tokenStream("field", StringReader(text))
        tokenStream.reset()
        termAtt = tokenStream.getAttribute(CharTermAttribute.class_)

        self.assertTrue(tokenStream.incrementToken())
        self.assertEqual("Qwerty", termAtt.toString(),
                         "WhitespaceAnalyzer does not lowercase")

        tokenStream = analyzer.tokenStream("special", StringReader(text))
        tokenStream.reset()
        termAtt = tokenStream.getAttribute(CharTermAttribute.class_)
        self.assertTrue(tokenStream.incrementToken())
        self.assertEqual("qwerty", termAtt.toString(),
                         "SimpleAnalyzer lowercases")
Beispiel #9
0
    def testPerField(self):

        perField = HashMap()
        perField.put("special", SimpleAnalyzer())
        analyzer = PerFieldAnalyzerWrapper(WhitespaceAnalyzer(), perField)

        text = "Qwerty"
        tokenStream = analyzer.tokenStream("field", StringReader(text))
        tokenStream.reset()
        termAtt = tokenStream.getAttribute(CharTermAttribute.class_)

        self.assertTrue(tokenStream.incrementToken())
        self.assertEqual("Qwerty", termAtt.toString(),
                         "WhitespaceAnalyzer does not lowercase")

        tokenStream = analyzer.tokenStream("special", StringReader(text))
        tokenStream.reset()
        termAtt = tokenStream.getAttribute(CharTermAttribute.class_)
        self.assertTrue(tokenStream.incrementToken())
        self.assertEqual("qwerty", termAtt.toString(),
                         "SimpleAnalyzer lowercases")
Beispiel #10
0
 def load_index(self):
     a = {
         "code": self.porter_analyzer,
         "description": self.porter_analyzer,
         "typed_method_call": KeywordAnalyzer(),
         "extends": KeywordAnalyzer(),
         "used_classes": KeywordAnalyzer(),
         "methods": KeywordAnalyzer(),
         "class_instance_creation": KeywordAnalyzer(),
         "id": KeywordAnalyzer(),
         "literals": self.porter_analyzer
     }
     self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
Beispiel #11
0
def main():
    try:
        print "Indexing..."
        #########################################  경   로  ####################################
        indexDestination = File(
            "/Users/Falcon/Desktop/New_Indices/Stack_A_Indices")

        #writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": analyzer,
            "extends": analyzer,
            "used_classes": analyzer,
            "methods": analyzer,
            "class_instance_creation": analyzer,
            "methods_called": analyzer,
            "view_count": KeywordAnalyzer(),
            "code_hints": JavaCodeAnalyzer()
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

        # analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        # a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer(),
        # 	 "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(),
        # 	 "class_instance_creation": KeywordAnalyzer(), "methods_called": KeywordAnalyzer(),
        # 	 "view_count": KeywordAnalyzer(), "code_hints": JavaCodeAnalyzer()}
        # wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        # config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        # writer = IndexWriter(SimpleFSDirectory(indexDestination), config)

        counter = Counter()
        index_code_snippet(writer, counter)
        writer.commit()
        writer.close()

        print "Done"
        print str(counter)

    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
    except SQLException as e:  #when Database error occurs
        e.printStackTrace()
Beispiel #12
0
    def load_index(self):
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": self.porter_analyzer
        }
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        self.directory = SimpleFSDirectory(self.index_path)

        self.searchermgr = SearcherManager(self.directory, SearcherFactory())
        self.searchermgr.maybeRefresh()
        self.searcher = self.searchermgr.acquire()
Beispiel #13
0
    def load_index(self):
        indexDir = File(self.index_path)
        a = {
            "code": self.porter_analyzer,
            "description": self.porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer()
        }
        self.analyzer = PerFieldAnalyzerWrapper(self.porter_analyzer, a)

        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(index)
        n_docs = self.reader.numDocs()
        self.searcher = IndexSearcher(self.reader)
        print("\nLoading Indices... GitHub index contains [%d] documents." %
              n_docs)
Beispiel #14
0
    def load_index(self):
        indexDir = File(self.index_path)
        porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "code": JavaCodeAnalyzer()
        }

        self.analyzer = PerFieldAnalyzerWrapper(porter_analyzer, a)
        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(
            index)  #IndexReader 열고 닫지 않았었음...........................
        n_docs = self.reader.numDocs()
        print("Index contains %d documents." % n_docs)
Beispiel #15
0
def main(src, dst):
    try:
        start_time = time.time()

        print "Indexing starts..."
        indicesDestination = File(dst)
        #writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        #Analyzer : 본문이나 제목 등의 텍스트를 색인하기 전에 반드시 분석기를 거쳐 단어로 분리해야 한다. Analyzer 클래스는 Directory와 함께 IndexWrite 클래스의 생성 메소드에 지정하며 지정된 텍슽트를 색인할 단위 단어로 분리하고 필요 없는 단어를 제거하는 등의 역할을 담당

        analyzer = KeywordAnalyzer(
        )  #전체 텍스트를 하나의 토큰으로 다룬다. (즉, Analyze 하지 않는 것과 결과적으로 동일하다.)
        a = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }  #PerFieldAnalyzerWrapper를 사용하기 위한 map 생성 (Python 에서는 Dict())
        wrapper_analyzer = PerFieldAnalyzerWrapper(
            analyzer, a
        )  #http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PerFieldAnalyzerWrapper.py?revision=1757704&view=co
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
        #SimpleFSDirectory 옵션은 파일시스템에 특정 디렉토리에 인덱스 파일을 저장하겠다. DB, RAM, File system 3개가 있음
        #config 는 IndexWriter 사용에 필요한 Analyzed 된 token이다.

        counter = Counter()
        generate_indices_from_projects(src, writer, counter)
        writer.close()
        print "Done"
        print str(counter)
        print "$$$%s\tseconds" % (time.time() - start_time)

    except CorruptIndexException as e:  #when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  #when other writer is using the index
        e.printStackTrace()
    except IOException as e:  #when directory can't be read/written
        e.printStackTrace()
Beispiel #16
0
def main():
	try:
		print "Indexing starts..."
		indicesDestination = File("/Users/Falcon/Desktop/New_Indices/IJA_Indices")

		analyzer = KeywordAnalyzer()  
		a = {"code": JavaCodeAnalyzer(), "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)}
		wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a) 				
		config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

		writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
		counter = Counter()
		generate_indices_from_projects(writer, counter)
		writer.close()

		print "Done"
		print str(counter)

	except CorruptIndexException as e:		#when index is corrupt
			e.printStackTrace()
	except LockObtainFailedException as e:	#when other writer is using the index
			e.printStackTrace()
	except IOException as e:	#when directory can't be read/written
			e.printStackTrace()
Beispiel #17
0
    print apis


indexDir = File("/tmp/stackoverflow")

# 1. open the index
analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
index = SimpleFSDirectory(indexDir)
reader = IndexReader.open(index)
n_docs = reader.numDocs()
print("Index contains %d documents." % n_docs)

# 2. parse the query from the command line
a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer()}
wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)

query_string = "lucene get similar documents to the current one"
query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, ["title"],
                                     wrapper_analyzer)

#base_query = getSpanNearQuery(analyzer, query_string)

base_query = query_parser.parse(query_string)

#http://shaierera.blogspot.com/2013/09/boosting-documents-in-lucene.html
boost_query = FunctionQuery(LongFieldSource("view_count"))
query = CustomScoreQuery(base_query, boost_query)

# queryparser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)
# query = queryparser.parse(query_string)