Example #1
	def __init__(self, writerConfig, indexDir):
		
		lucene.initVM()

		self.mIndexDir = SimpleFSDirectory(File(indexDir))
		self.mConfig = writerConfig
		self.mWriter = IndexWriter(self.mIndexDir, self.mConfig)
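The fragment above is a class method shown without its class or imports. A minimal self-contained sketch of the same writer setup, assuming PyLucene 4.10-era module paths (as used in the later examples) and a hypothetical class name Indexer:

import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

class Indexer(object):
    def __init__(self, writerConfig, indexDir):
        lucene.initVM()
        self.mIndexDir = SimpleFSDirectory(File(indexDir))
        self.mConfig = writerConfig
        self.mWriter = IndexWriter(self.mIndexDir, self.mConfig)

# hypothetical usage
config = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer(Version.LUCENE_4_10_1))
indexer = Indexer(config, "/tmp/luceneindex")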
def index(indexdir):
  lucene.initVM()
  indexDir = SimpleFSDirectory(File(indexdir))
  writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
  writer = IndexWriter(indexDir, writerConfig)

  f = open('data/docid.documento-xml.txt')
  st = PorterStemmer()
  for i, line in enumerate(f.readlines()):
    id, xmltext = line.split('\t')
    xmltext = xmltext.rstrip('\n')
    xmldoc = minidom.parseString(xmltext)
    title = xmldoc.getElementsByTagName("TITLE")
    title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
    authors = xmldoc.getElementsByTagName("AUTHORS")
    authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
    abstract = xmldoc.getElementsByTagName("ABSTRACT")
    abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
    doc = Document()
    doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
    writer.addDocument(doc)
    print "indexed %s docs" % (i+1)

  writer.close()
Example #3
def configure_lucene():
    
    f = open('clique.txt','r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

    print >> sys.stderr, "Reading lines from sys.stdin..."
    for line in f:
        line = line.replace('\t','')
        line = line.replace('\r','')
        line = line.replace('\n','')
        line = line.replace('^','')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
Example #4
	def index( self ):
		lucene.initVM()
		indexdir = SimpleFSDirectory( File( self.INDEX_DIR ) )
		analyzer = StandardAnalyzer( Version.LUCENE_30 )
		index_writer = IndexWriter( indexdir, analyzer, True, IndexWriter.MaxFieldLength( 512 ) )
		# read input files (.xml)
		for in_file in glob.glob( os.path.join( self.DOC_DIR, '*.xml' ) ):
			corpus = codecs.open( in_file, encoding='utf-8' ).read()
			d = pq( corpus, parser='html' )
			for text in d( 'Article' ).items():
				document = Document()
				# find ID
				art_id = str( text.attr( 'articleid' ).encode( 'utf-8' ) ).replace( '+', '-' )
				# find Title
				art_title = self.stem( str( text.attr( 'title' ).encode( 'utf-8' ) ) )
				# find Abstract
				art_abstract = self.stem( str( text.find( 'Abstract' ).html().encode('utf-8') ) )
				# find Keyword
				art_keyword = text.find( 'Keyword' ).html().encode('utf-8')
				# find Content
				art_content = self.stem( str( text.find( 'Content' ).html().encode('utf-8') ) )
				# find Authors
				art_authors = text.find( 'Authors' ).html().encode('utf-8')
				document.add( Field( 'id', art_id, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'title', art_title, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'abstract', art_abstract, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'keyword', art_keyword, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'content', art_content, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'authors', art_authors, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'article', art_title + art_abstract + art_keyword + art_content,\
									 Field.Store.YES,\
									 Field.Index.ANALYZED ) )
				index_writer.addDocument( document )
			index_writer.optimize()
			index_writer.close()
Example #5
    def run(self):
        print "Booting lucene driver worker...."
        lucene.initVM()

        self.fieldType1 = FieldType()
        self.fieldType1.setIndexed(True)
        self.fieldType1.setStored(False)
        self.fieldType1.setTokenized(True)

        self.fieldType2 = FieldType()
        self.fieldType2.setIndexed(True)
        self.fieldType2.setStored(True)
        self.fieldType2.setTokenized(False)

        while(True):
            data = self.queue.get()
            da = data[1]
            response = None
            try:
                self.fil = File(da['data']['indexdir'])
                self.d = NIOFSDirectory(self.fil)
                self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
                self.conf = IndexWriterConfig(
                    Version.LUCENE_CURRENT,
                    self.analyzer)

                response = getattr(self, da['action'])(da['data'])
                self.d.close()
            except Exception as e:
                print e
            if response is None:
                response = {}

            self.ret[data[0]] = response
Example #6
def names():
    lst = []
    
    search = "spax"#request.form['product']
    lucene.initVM()
    
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(lucene.Version.LUCENE_CURRENT, "text", analyzer).parse(search)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)

    for hit in hits.scoreDocs:
        if hit.score >= 1:
            print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            print doc.get("text").encode("utf-8")
            items = doc.get("text").encode("utf-8").split(',')
            for item in items:
                if item == search:
                    pass
                elif item not in lst:
                    lst.append(item)
    #print lst
    data = {"products": lst}
    if request.method == 'POST':
        return jsonify(data)
    else:
        return jsonify(data)
Example #7
	def retrieve( self, query, max_res = 10 ):
		lucene.initVM()
		inDir = SimpleFSDirectory( File( self.INDEX_DIR ) )
		lucene_analyzer = StandardAnalyzer( Version.LUCENE_30 )
		lucene_searcher = IndexSearcher( inDir )
		my_query = QueryParser( Version.LUCENE_30, 'content' , lucene_analyzer ).parse( query )
		MAX = max_res
		total_hits = lucene_searcher.search( my_query, MAX )
		res_head = '{"query":"' + query + '","results":['
		res_tail = ']}'
		result = res_head
		hits = total_hits.totalHits
		if ( hits > 0 ):
			res_body = ''
			it = 0
			for hit in total_hits.scoreDocs:
				it += 1
				doc = lucene_searcher.doc( hit.doc )
				res_body += '{"rank":' +\
							str( it ) +\
							',"score":"' +\
							str( hit.score ) +\
							'","title":"' +\
							doc.get( 'title' ).encode('utf-8') +\
							'","id":"' +\
							doc.get( 'id' ).encode('utf-8') +\
							'"}'
				if ( it < hits ):
					res_body += ','
			result += res_body
		result += res_tail
		return result
Example #8
	def document( self, docId, max_res = 1 ):
		lucene.initVM()
		inDir = SimpleFSDirectory( File( self.INDEX_DIR ) )
		lucene_analyzer = StandardAnalyzer( Version.LUCENE_30 )
		lucene_searcher = IndexSearcher( inDir )
		my_query = QueryParser( Version.LUCENE_30, 'id' , lucene_analyzer ).parse( docId )
		MAX = max_res
		total_hits = lucene_searcher.search( my_query, MAX )
		result = '{'
		hits = total_hits.totalHits
		if ( hits == 1 ):
			for hit in total_hits.scoreDocs:
				doc = lucene_searcher.doc( hit.doc )
				result += '"id":"' +\
						  doc.get( 'id' ) +\
						  '","title":"' +\
						  doc.get( 'title' ) +\
						  '","abstract":"' +\
						  doc.get( 'abstract' ) +\
						  '","keyword":"' +\
						  doc.get( 'keyword' ) +\
						  '","content":"' +\
						  doc.get( 'content' ) +\
						  '","authors":"' +\
						  doc.get( 'authors' ) +\
						  '"'
		result += '}'
		return result
Example #9
def initial_searcher():
    lucene.initVM()
    indexDir = INDEX_DIR 
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)
    return searcher, analyzer
def lucene_search(index_dir, limit, query_text):
    '''
    lucene_search: Search a built index and return upto limit number of responses 
    Arguments: Input index folder, limit value of results returned, query(as string)
    Returns: paths of responsive files as list
    '''
    
    logging.basicConfig(file=os.path.join(index_dir,"lucene_search.log"))
    logger.info("Initializing search....")
    lucene.initVM()
    logger.info("Reading index from "+index_dir)
    index = SimpleFSDirectory(File(index_dir))
    analyzer = StandardAnalyzer(Version.LUCENE_30) #Lucene version used to generate index
    searcher = IndexSearcher(index)
    
    logger.info("Parsing query :"+ query_text)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query_text)
    hits = searcher.search(query, limit)

    logger.info("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
    hit_paths = []

    for hit in hits.scoreDocs:
        # The following code also generates score for responsive/found documents and the 
        # content index which matched
        # print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        hit_paths.append(doc.get("path"))
    
    return hit_paths 
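A minimal usage sketch for lucene_search above; the index folder and query string are hypothetical, and the index is assumed to have been built with the same Lucene version:

# hypothetical call: search an existing index, returning at most 10 file paths
for path in lucene_search("/data/lucene_index", 10, "contract dispute"):
    print path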
Example #11
def luceneRetriver(query):

	lucene.initVM()
	indir = SimpleFSDirectory(File(INDEXDIR))
	lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
	lucene_searcher = IndexSearcher(indir)
	my_query = QueryParser(Version.LUCENE_30, "text", lucene_analyzer).parse(query)
	MAX = 1000
	total_hits = lucene_searcher.search(my_query, MAX)

	print "Hits: ", total_hits.totalHits

	for hit in total_hits.scoreDocs:
		print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, "Hit String:", hit.toString()
		doc = lucene_searcher.doc(hit.doc)
		print doc.get("text").encode("utf-8")
def retrieve(indexdir, queries):
    lucene.initVM()
    f = open("results_lucene.txt", "w")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
    searcher = IndexSearcher(reader)

    fields = ["title", "abstract", "authors"]

    st = PorterStemmer()
    for id, q in queries.iteritems():
        query = q
        tokenizer = RegexpTokenizer(r'\w+')
        qwords = tokenizer.tokenize(query)
        qwords_k = [st.stem(q) for q in qwords]
        query = " ".join(qwords_k)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, query)
        MAX = 1000
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        for i, hit in enumerate(hits.scoreDocs):
            f.write("%s Q0 %s %s %s G17R3\n" % (id, hit.doc+1, i+1, hit.score))
            # print hit.doc+1, hit.score
            # doc = searcher.doc(hit.doc)
            # print doc.get("authors").encode("utf-8")
    f.close()
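A minimal usage sketch for retrieve above, with a hypothetical index folder and query dict (ids mapped to raw query strings); results are written to results_lucene.txt in TREC format:

# hypothetical call
retrieve("lucene_index/", {1: "information retrieval evaluation"})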
def search():

	lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    
	args = []
	if request.method == 'POST':
		if request.form['ies']:
			args.append('+ies:'+request.form['ies'])
		if request.form['area']:
			args.append('+area:'+request.form['area'])
		if request.form['professor']:
			args.append('+professor:'+request.form['professor'])
		if request.form['conceito']:
			#args.append('m:'+request.form['conceito']+'d:'+request.form['conceito']+'f:'+request.form['conceito'])
			args.append('m:'+request.form['conceito'])
			args.append('d:'+request.form['conceito'])
			args.append('f:'+request.form['conceito'])

	table = []
	if(len(args) > 0): 
		scoreDocs = mansearch.buscar('indexer/',args)
		fsDir = SimpleFSDirectory(File(indexDir))
		searcher = IndexSearcher(DirectoryReader.open(fsDir))
		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			table.append(dict((field.name(), field.stringValue()) for field in doc.getFields()))
	return render_template('busca.html', table=table)
Example #14
def create_index(storage, paths) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	import os
	for path in paths :
		for filen in os.listdir(path) :
			text = sent_tokenize(get_data_from_file(path + filen))
			total_sent = len(text)
			for i in range(0, total_sent, 3) :
				doc = Document()
				a = i-5 if i-5 > 0 else 0
				sentence = ' '.join(text[a:i+5])
				doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
				writer.addDocument(doc)
			print("Done %s" % (path+filen))
			print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
Example #15
def irsolver(data_file, index) :
	from questions import get_input_data
	lucene.initVM()
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	reader = IndexReader.open(SimpleFSDirectory(File(index)))
	searcher = IndexSearcher(reader)
	pred = []
	mapp = { 1 : 'A', 2 : 'B', 3 : 'C', 4 : 'D'}

	idx, ques, ans = get_input_data(data_file)
	for acm, (idq, q, a) in enumerate(zip(idx, ques, ans)) :
		max_score = -1000000
		best_ans = 'A'
		for i, ai in enumerate(a):
			sc = query(q, ai, analyzer, searcher)
			print(acm, i, sc)
			if sc > max_score :
				max_score = sc
				best_ans = mapp[i+1]
		pred.append(best_ans)

	return idx, pred
    def build_words_index(self):
        relevant_words = self.process_texts()

        # Initialize lucene and JVM
        lucene.initVM()

        # Get the analyzer
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        # Get index storage
        store = SimpleFSDirectory(File(self.index_words))

        # Get index writer
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        writer = IndexWriter(store, config)

        try:
            for word in relevant_words:
                time_series = TimeSeries(word).get_series()
                series_str  = ''
                for t in time_series:
                    series_str += str(t) + ':' + str(time_series[t]) + '\t'
                doc = Document()
                # Add a fields to this document
                doc.add(Field('word', word, Field.Store.YES, Field.Index.ANALYZED))
                doc.add(Field('series', series_str, Field.Store.YES, Field.Index.ANALYZED))
                # Add the document to the index
                writer.addDocument(doc)
        except Exception, e:
            print "Failed in creating document to add to the index:", e
Example #17
    def __init__(self, root, store_dir):

        if not os.path.exists(store_dir):
            os.mkdir(store_dir, 0777)


        # NOTE: Hardcoded the analyzer instead of passing it
        lucene.initVM()
        '''
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        '''
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        store = SimpleFSDirectory(File(store_dir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)

        # Set the permissions to 777 for the index directory and the write.lock file
        chmod_indexdir_cmd = "chmod 0777 " + store_dir
        writelock_file = store_dir + "/" + "write.lock"
        chmod_writelock_cmd = "chmod 0777 " + writelock_file

        if os.path.exists(store_dir):
            cicmd=os.popen("sudo -S %s"%(chmod_indexdir_cmd), 'w').write('vagrant')

        if os.path.exists(writelock_file):
            cwcmd=os.popen("sudo -S %s"%(chmod_writelock_cmd), 'w').write('vagrant')

        # setting CREATE will rewrite over the existing indexes.
        ###config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        writer = IndexWriter(store, config)

        self.indexDocs(root, writer)
        writer.close()
Example #18
def search(request, template_name="reviews/search.html"):
    """
    Searches review requests on Review Board based on a query string.
    """
    query = request.GET.get("q", "")
    siteconfig = SiteConfiguration.objects.get_current()

    if not siteconfig.get("search_enable"):
        # FIXME: show something useful
        raise Http404

    if not query:
        # FIXME: I'm not super thrilled with this
        return HttpResponseRedirect(reverse("root"))

    import lucene

    # We may have already initialized lucene
    try:
        lucene.initVM(lucene.CLASSPATH)
    except ValueError:
        pass

    index_file = siteconfig.get("search_index_file")
    store = lucene.FSDirectory.getDirectory(index_file, False)
    try:
        searcher = lucene.IndexSearcher(store)
    except lucene.JavaError, e:
        # FIXME: show a useful error
        raise e
Example #19
def ExportIndex(b_print = False,b_write_file = False,b_filter = True):
    _dict = ReadConfig()
    initVM()
    try:
        if(b_write_file == True):
            output_file = _dict['resultDir'] + '/' + sys.argv[1] + '.xls'
            _fw = open(output_file,'w')
        directory = SimpleFSDirectory(File(_dict['indexDir']))
        ireader = IndexReader.open(directory)
        # Enum all the terms
        all_terms = ireader.terms()
        word_dict = {}
        _stopword_set = ImportStopword()
#        SetPrint(_stopword_set)
        while all_terms.next():
            term_elem = all_terms.term()
            if term_elem.field() == sys.argv[1]:
                _temp = term_elem.text().rstrip()
                word_dict[_temp] = all_terms.docFreq()
        if(b_filter == True):
            StopwordFilter(word_dict,_stopword_set)
        if(b_print != False):
            DictPrint(word_dict)
        if(b_write_file != False):
            DictPrint(word_dict,out_file=_fw)
            _fw.close()
        all_terms.close()
        return word_dict
    except Exception,e:
        print "Failed: ",e
        traceback.print_exc(file=sys.stdout)
Example #20
def start_jvm(clspath='', vmargs=''):
    """Starts the JVM - note that only the first initVM() is effective for java VM!
    Make sure you pass important arguments for the first call, because they will
    set the environment of the Java VM
    @keyword clspath: platform-separator separated values that will be passed to
        the Java VM
    @keyword vmargs: other arguments, comma-separated to pass to the java VM
    @return: JVM object (either new one or already existing one)"""


    #initialize the JVM if not already initialized
    jvm = _jcc_module.getVMEnv()
    if not jvm:
        classpath = []
        if clspath:
            classpath.append(clspath)
        if dumeanj:
            classpath.append(dumeanj.CLASSPATH)
        if lucene:
            classpath.append(lucene.CLASSPATH)
        jvm = _jcc_module.initVM(os.pathsep.join(classpath), vmargs=vmargs)

        if lucene != _jcc_module:
            lucene.initVM(lucene.CLASSPATH)
    elif vmargs:
        raise Exception('initVM() was already started, the second call will be ineffective. Please make sure you are initializing components in the right order!')

    return jvm
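A minimal usage sketch for start_jvm above; as the docstring notes, only the first call actually configures the JVM, later calls return the already-running instance (the vmargs value is hypothetical):

# hypothetical call
jvm = start_jvm(vmargs='-Djava.awt.headless=true')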
Example #21
def search(r, keyword=""):
    import logging

    logger = logging.getLogger("search")
    bench = Benchmark(logger)
    from lucene import IndexSearcher, StandardAnalyzer, FSDirectory, QueryParser, File, Hit
    import lucene, os

    os.environ["JAVA_HOME"] = "/usr/local/jdk1.6.0_17"
    lucene.initVM(lucene.CLASSPATH)

    directory = FSDirectory.open(File(CONFIG.INDEX_PATH))
    ROBOT_INDEX = IndexSearcher(directory, True)
    ROBOT_ANALYZER = StandardAnalyzer()

    keyword = keyword or r.GET["keyword"]
    query = QueryParser("context", ROBOT_ANALYZER)
    query = query.parse('"%s"' % keyword)

    bench.start_mark("search")
    hits = ROBOT_INDEX.search(query)
    count = len(hits)
    result = []
    i = 0
    for hit in hits:
        i += 1
        if i > 100:
            break
        doc = Hit.cast_(hit).getDocument()
        result.append(SearchResult(doc, i, keyword))
    ROBOT_INDEX.close()

    et = bench.stop_mark()

    return render_to_response("robot_search_result.html", {"result": result, "count": count, "elaspe": et})
def wikipedia_indexer(storage, wikipedia_file) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	f = open(wikipedia_file)

	for i, line in enumerate(f) :
		text = line.strip().decode('utf-8').split('\t')
		title = text[0]
		if 'disambigu' in text[0] or len(text) < 2:
			continue
		text = text[1]
		doc = Document()
		doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
		doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
		doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
		writer.addDocument(doc)
		if writer.numDocs() % 1000 == 0 :
			print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)
		
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()	
def index(string):
 lucene.initVM()
 indexDir = "REMOVEME.index-dir"
 dir = SimpleFSDirectory(File(indexDir))
 analyzer = StandardAnalyzer(Version.LUCENE_30)
 try:
  writer = IndexWriter(dir, analyzer, False, IndexWriter.MaxFieldLength(512))
 except lucene.JavaError:
  #print 'Inside Index Except'
  writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
#e = sys.exc_info()[0]
#print e
 #print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

 doc = Document()
 doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
 writer.addDocument(doc)
 #print 'In the index function'
 #print writer.numDocs()

#print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
#print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
 writer.optimize()
#print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
#print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
 #print 'ending Indexing'
 #print string 
 #print 'Total indexes'
 #print writer.numDocs() 
 writer.close()
Example #24
	def _init_search(cls):
		"""
		Initializes everything needed for search.
		"""
		config_path = cls.search_config_path
		if not os.path.exists(config_path):
			raise OSError(errno.ENOENT, "Config %r does not exist." % config_path, config_path)
		config_dir = os.path.dirname(config_path)
		
		# Read config.
		with open(config_path, 'rb') as fh:
			config = json.load(fh)
		cls.search_config = config
		
		# Connect to mongo.
		host = config['mongo']['host']
		port = config['mongo'].get('port', None) or 27017
		thread_pool = reactor.getThreadPool()
		pool_size = int(math.ceil((thread_pool.min + thread_pool.max) / 2))
		cls.search_mongo = txmongo.lazyMongoConnectionPool(host=host, port=port, pool_size=pool_size)
		cls.search_order_db = cls.search_mongo[config['mongo']['order_dbname']]
		cls.search_order_tb = cls.search_order_db[config['mongo']['order_tbname']]
		
		# Initialize PyLucene.
		lucene.initVM()
		
		# Open index.
		index_path = os.path.abspath(os.path.join(config_dir, config['lucene']['index_path']))
		if not os.path.exists(index_path):
			raise OSError(errno.ENOENT, "Index %r does not exist." % index_path, index_path)
		elif not os.path.isdir(index_path):
			raise OSError(errno.ENOTDIR, "Index %r is not a directory." % index_path, index_path)
		index_dir = lucene.NIOFSDirectory(lucene.File(index_path))
		#index_dir = lucene.SimpleFSDirectory(lucene.File(index_path)) # windows
		cls.search_searcher = lucene.IndexSearcher(index_dir)
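A hypothetical config file of the shape _init_search above reads (JSON on disk; shown here as the equivalent Python dict, with made-up values that only mirror the keys the method accesses):

search_config = {
    "mongo": {
        "host": "localhost",
        "port": 27017,                  # optional, defaults to 27017
        "order_dbname": "orders",
        "order_tbname": "order_docs",
    },
    "lucene": {
        "index_path": "lucene_index",   # resolved relative to the config file's directory
    },
}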
def build_lda_corpus(index_folder, paths_index_file,  
                     dictionary_file, ldac_file, min_frequency, 
                     min_word_len, max_word_len=20):
    '''
    The main function that does the job! 
    
    '''
    initVM()  
    store = SimpleFSDirectory(File(index_folder))
    index_reader = IndexReader.open(store)

    # Stores the file paths index (for LDA)
    _store_file_paths_index(index_reader, paths_index_file) 
    
    # Creates the dictionary 
    _create_dictionary(index_reader, dictionary_file, min_frequency, 
                       min_word_len, max_word_len)

    # Creates the corpus 
    dictionary = corpora.Dictionary().load(dictionary_file)      
    # doesn't load the corpus into the memory! 
    corpus_memory_friendly = _TextCorpus(dictionary, index_reader) 
    corpora.BleiCorpus.serialize(ldac_file, corpus_memory_friendly, 
                                 id2word=dictionary)
    
    logging.info('The Enron corpus building is completed.')
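A minimal usage sketch for build_lda_corpus above, with hypothetical file names; the index folder is assumed to already contain a Lucene index:

# hypothetical call
build_lda_corpus('enron_index/', 'paths.txt', 'enron.dict', 'enron.ldac',
                 min_frequency=5, min_word_len=3)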
	def search(self):
		''' Searches the given query in the index '''

		lucene.initVM(vmargs=['-Djava.awt.headless=true'])
		# print 'lucene', lucene.VERSION
		# base_dir = os.path.dirname(os.path.abspath('.'))
		base_dir = '.'
		directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
		searcher = IndexSearcher(DirectoryReader.open(directory))
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
		

		while True:
			print
			print "Hit enter with no input to quit."
			command = raw_input("Query:")
			if command == '':
				return

			print
			print "Searching for:", command

			query = QueryParser(Version.LUCENE_CURRENT, "title",
								analyzer).parse(command)
			scoreDocs = searcher.search(query, 50).scoreDocs
			print "%s total matching documents." % len(scoreDocs)

			for scoreDoc in scoreDocs:
				doc = searcher.doc(scoreDoc.doc)
				# print 'path:', doc.get("path"), 'name:', doc.get("name")
				print doc
Example #27
def get_candidates(qatp):

    if prm.create_index:
        create_index()

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q,a,t,p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n+=1

        q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))

        candidates.append(c)
        
    return candidates
Example #28
 def __init__(self):
   # Create index directory
   lucene.initVM(lucene.CLASSPATH)
   if not os.path.exists(STORE_DIR):
     os.mkdir(STORE_DIR)
   self.store = lucene.SimpleFSDirectory(lucene.File(STORE_DIR))
   self.im = IndexManager()
Example #29
def build_index():

    lucene.initVM()

    # post_dir = current_app.config['LOCAL_REPO_PATH'] + '/_posts/'
    post_dir = '/Users/w3/data/github/codeif_backup'
    index_store_dir = current_app.config['INDEX_STORE_DIR']
    print post_dir
    print index_store_dir

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    store = SimpleFSDirectory(File(index_store_dir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    indexDocs(post_dir, writer)
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
Example #30
def create_index():

    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #31
# -*- coding: cp949 -*-
import os, os.path, sys
os.environ['PATH'] = os.path.join(os.environ['JAVA_HOME'], r'jre\bin\client') + ';' + os.environ['PATH']
import lucene
lucene.initVM(lucene.CLASSPATH)		# Initialize  JVM

def	IndexCreate(fileDir, indexDir):
	analyzer = lucene.StandardAnalyzer()	# create the analyzer object that Lucene will use
	store = lucene.FSDirectory.getDirectory(indexDir)
	writer = lucene.IndexWriter(store, analyzer)

	for root, dirnames, filenames in os.walk(fileDir):	# walk the input folder, indexing only the text files
		for filename in filenames:
			if not filename.endswith('.txt'):
				continue
			
			print("Adding: %s" % filename)
			try:
				path = os.path.join(root, filename)
				f = open(path)
				content = f.read()
				f.close()

				content = content.decode('cp949').encode('utf-8')	# convert the encoding to 'utf-8'

				doc = lucene.Document()				# create a Document object
				doc.add(lucene.Field(	"name", 	# file name
										filename,
										lucene.Field.Store.YES,
										lucene.Field.Index.NO))
				doc.add(lucene.Field(	"path", 	# file path
Example #32
def create_index_from_folder(folder, index_file):
    """Lets Lucene create an index of all database files within a specified folder

    :param folder: absolute or relative path to database files
    :param index_file: absolute or relative output location for index

    Notes:
    - Does not go through database folder recursively, i.e. all files have to be at the root of the folder
    - Only CSV files are supported
    - Column headers are hardcoded and should follow:
        ID, text, Reddit ID, subreddit, meta, time, author, ups, downs, authorlinkkarma, authorkarma, authorisgold
    """
    # Set up Lucene
    print()
    print("Starting Lucene ...")
    lucene.initVM()
    index_store = SimpleFSDirectory.open(File(index_file).toPath())
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_store, config)

    print()
    # Go through files, add rows of each as Documents to writer
    for file in os.listdir(folder):
        if file.endswith(".csv"):
            print("Indexing {} ...".format(file), end=" ", flush=True)
            with open(os.path.join(folder, file), newline='') as db:
                reader = csv.reader(db)

                # The Reddit database seems to carry a lot of duplicate posts, so we try to skip those
                post_ids = set()
                duplicate_counter = 0

                # To store term vectors (used for query expansion) we have to use a custom fieldtype
                customfield = FieldType()
                customfield.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
                customfield.setStored(True)
                customfield.setTokenized(True)
                customfield.setStoreTermVectors(True)

                # CSV files have a useless first row...
                skipfirst = True
                # ... and a useless first column. Skip both.
                for _, text, rid, subreddit, meta, time, author, ups, downs, authorlinkkarma, authorkarma, authorisgold in reader:
                    if skipfirst:
                        skipfirst = False
                        continue
                    doc = Document()

                    if rid in post_ids:
                        duplicate_counter += 1
                        continue  # skip
                    else:
                        post_ids.add(rid)

                    # Tokenize, index and store
                    doc.add(Field("text", text, customfield))

                    # Index and store
                    doc.add(StringField("id", rid, Field.Store.YES))
                    doc.add(
                        StringField("subreddit", subreddit, Field.Store.YES))
                    doc.add(StringField("meta", meta, Field.Store.YES))
                    doc.add(StringField("time", time, Field.Store.YES))
                    doc.add(StringField("author", author, Field.Store.YES))

                    # Store only
                    doc.add(StoredField("ups", ups))
                    doc.add(StoredField("downs", downs))
                    doc.add(StoredField("authorlinkkarma", authorlinkkarma))
                    doc.add(StoredField("authorkarma", authorkarma))
                    doc.add(StoredField("authorisgold", authorisgold))

                    writer.addDocument(doc)

            print("DONE!\t(Duplicate posts skipped: {})".format(
                duplicate_counter))

    writer.commit()
    writer.close()

    print()
    print("Finished indexing!")
Example #33
# noinspection PyUnresolvedReferences
from org.apache.lucene.search import IndexSearcher, PhraseQuery, RegexpQuery
# noinspection PyUnresolvedReferences
from org.apache.lucene.search.spans import SpanMultiTermQueryWrapper, SpanNearQuery
# noinspection PyUnresolvedReferences
from org.apache.lucene.index import DirectoryReader, Term
# noinspection PyUnresolvedReferences
from org.apache.lucene.store import FSDirectory
# noinspection PyUnresolvedReferences
from org.apache.lucene.queryparser.classic import QueryParser
# noinspection PyUnresolvedReferences
from org.apache.lucene.analysis.standard import StandardAnalyzer
# assumed imports: the snippet below calls lucene.initVM() and Paths.get()
from java.nio.file import Paths
import lucene

if __name__ == "__main__":
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    file = Paths.get(r"D:\GitHubD\BREDS\wiki_text_index\WIKI_TEXT")
    dir = FSDirectory.open(file)
    reader = DirectoryReader.open(dir)
    searcher = IndexSearcher(reader)

    term = Term("contents", "tiger")
    print(f'Tiger frequency: {reader.totalTermFreq(term)}')

    q_regex = RegexpQuery(Term("contents", r"[0-9]+\.?[0-9]*"))
    print(f'regex results: {searcher.search(q_regex,1000000).totalHits}')

    span1 = SpanMultiTermQueryWrapper(q_regex)
    span2 = SpanMultiTermQueryWrapper(RegexpQuery(Term("contents", "tiger")))
    spannearquery = SpanNearQuery([span1, span2], 20, True)
    print(
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher, TermQuery, PhraseQuery, PrefixQuery, FuzzyQuery
from org.apache.lucene.search import WildcardQuery
import lucene
from org.apache.lucene import analysis, document, index, queryparser, search, store
from org.apache.lucene.document import Document, Field, StringField, TextField
from org.apache.lucene.search import IndexSearcher, TermQuery, PhraseQuery
from org.apache.lucene.index import (IndexWriter, IndexReader, DirectoryReader,
                                     Term, IndexWriterConfig)
from lupyne import engine
from tqdm import tqdm
import unicodedata
assert lucene.getVMEnv() or lucene.initVM()
################################################################
# ENVIRON
################################################################
# This is gpu
server1_homepath = "/home/ubuntu/workspace/codelab/"
server2_homepath = "/home/ubuntu/workspace/codelab/"
gpu_homepath = "/home/shawn/workspace/research/final_codelab/"
jun_homepath = "/home/junw/workspace/codelab/"

# choose from the server1, server2, gpu, jun.
SERVERNAME = 'server1'
HOMEPATH = {
    'server1': server1_homepath,
    'server2': server2_homepath,
    'gpu': gpu_homepath,
Example #35
        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            ##            explanation = searcher.explain(query, scoreDoc.doc)
            print "------------------------"
            print 'path:', doc.get("path")
            print 'name:', doc.get("name")
            print 'title:', doc.get('title')
            print 'author:', doc.get('author')
            print 'language:', doc.get('language')


##            print explanation

if __name__ == '__main__':
    STORE_DIR = "index"
    initVM()
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    run(searcher, analyzer)
    searcher.close()
Example #36
                                             lucene.Field.Index.ANALYZED))
                        doc.add(lucene.Field("url", url,
                                             lucene.Field.Store.YES,
                                             lucene.Field.Index.NOT_ANALYZED))
                        doc.add(lucene.Field("urltitle", title,
                                             lucene.Field.Store.YES,
                                             lucene.Field.Index.NOT_ANALYZED))
                    writer.addDocument(doc)  # IndexWriter.addDocument writes the document into the index directory
                    print "----------------------------------------------------"
                except Exception, e:
                    print "Failed in indexDocs:", e
            else:
                break
        t.close()

if __name__ == '__main__':
##    if len(sys.argv) < 2:
##        print IndexFiles.__doc__
##        sys.exit(1)
    lucene.initVM()  # initialize the Java VM
    print 'lucene', lucene.VERSION
    start = datetime.now()
    try:
##        IndexFiles(sys.argv[1], "index", lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
#        IndexFiles('html', "index", lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
        IndexFiles('html', "index for pic", lucene.WhitespaceAnalyzer(lucene.Version.LUCENE_CURRENT))
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e
Example #37
def initVM():
    vm_env = lucene.getVMEnv()
    if vm_env:
        vm_env.attachCurrentThread()
    else:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
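A minimal usage sketch: call the helper above at the top of any thread that touches Lucene, so the JVM is created once and later threads are merely attached:

# hypothetical worker-thread code
initVM()
# ... safe to construct IndexSearcher, QueryParser, etc. in this thread now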
Example #38
 def __init__(self, dir_file_path):
     lucene.initVM()
     self.directory = lucene.SimpleFSDirectory(lucene.File(dir_file_path))
     self.analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_30)
     self.search = lucene.IndexSearcher(self.directory)
Example #39
 def __init__(self):
     lucene.initVM()
 def __init__(self, xmlpath, indexpath, ItemClass):
     self.jccenv = lucene.initVM()
     self.xmlpath = xmlpath
     self.indexpath = indexpath
     self.ItemClass = ItemClass
Example #41
    help='optional configuration file or json object of global params')
parser.add_argument(
    '--autoreload',
    type=float,
    metavar='SECONDS',
    help='automatically reload modules; replacement for engine.autoreload',
)
parser.add_argument(
    '--autoupdate',
    type=float,
    metavar='SECONDS',
    help='automatically update index version and commit any changes')
parser.add_argument('--real-time',
                    action='store_true',
                    help='search in real-time without committing')

args = parser.parse_args()
read_only = args.read_only or len(args.directories) > 1
kwargs = {'nrt': True} if args.real_time else {}
if read_only and (args.real_time or not args.directories):
    parser.error('incompatible read/write options')
if args.config and not os.path.exists(args.config):
    args.config = {'global': json.loads(args.config)}
assert lucene.initVM(vmargs='-Xrs,-Djava.awt.headless=true')
cls = WebSearcher if read_only else WebIndexer
root = cls(*args.directories, **kwargs)
start(root,
      config=args.config,
      autoreload=args.autoreload,
      autoupdate=args.autoupdate)
Example #42
        elif sys.argv[i] == "-co" or sys.argv[i] == "--check-only":
            sw.checkonly = True
        elif sys.argv[i] == "-u" or sys.argv[i] == "--update":
            sw.doupdate = True
        elif sys.argv[i] == "-srp" or sys.argv[i] == "--save-range-partition":
            sw.saveRP = True
        elif sys.argv[i] == "-i" or sys.argv[i] == "--index":
            sw.index = True

    # bind token and set the api
    sw.setToken(sw.sinaweiboOauth["oauth_token"],
                sw.sinaweiboOauth["oauth_token_secret"])

    # initialize the indexer, if needed
    if sw.index:
        lucene.initVM(lucene.CLASSPATH)
        sw.indexer = sinaweibolucene.IndexSinaWeibo()

    # dispatch
    if sw.force_screenname:
        out = sw.dispatch(opt, fname, output_counts)
        out = [out]
    elif id > 0:
        out = sw.dispatch(opt, id, output_counts)
        out["id"] = id
        out = [out]  # put in an array for consistency with list of ids
    else:
        try:
            f = open(fname, "r")
        except IOError:
            print sw.usage
Example #43
def lucene_indexing():
    lucene.initVM()
    whole_tokenized_db_cursor = wiki_db_tool.get_cursor(
        config.WHOLE_PROCESS_FOR_RINDEX_DB)
    whole_tokenized_db_cursor.execute("SELECT * from unnamed")

    indexDir = SimpleFSDirectory(Paths.get(str(config.LUCENE_INDEXED)))
    analyzer = PorterStemmerAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print("Building lucene index ...")
    with SqliteDict(str(config.WHOLE_WIKI_DB),
                    flag='r',
                    encode=json.dumps,
                    decode=json.loads) as whole_wiki_db:
        for key, value in tqdm(whole_tokenized_db_cursor,
                               total=config.TOTAL_ARTICLE_NUMBER_WHOLE):

            item = json.loads(value)
            article_title = item['title']
            article_clean_text = item['clean_text']
            article_poss = item['poss']

            # TODO: change it to extract abstract wiki?
            # get the first paragraph which has the length >= 50? so weired.
            abs_index = get_first_paragraph_index(whole_wiki_db[article_title])

            if abs_index == -1:  # document too short
                valid_page = False

            # only title
            title_term_list = []
            title_poss_list = []

            # only abstract content
            abstract_term_list = []
            abstract_poss_list = []

            assert len(article_clean_text) == len(article_poss)

            for p_i, (paragraph_text, paragraph_poss) in enumerate(
                    zip(article_clean_text, article_poss)):
                for sent_text, sent_poss in zip(paragraph_text,
                                                paragraph_poss):
                    if p_i == 0:  # In title.
                        title_term_list.extend(sent_text)
                        title_poss_list.extend(sent_poss)
                        continue  # If the terms are in title, we don't include those terms in abstract and article term.
                    else:
                        if p_i == abs_index:  # If the terms are in abstract
                            abstract_term_list.extend(sent_text)
                            abstract_poss_list.extend(sent_poss)

            added_title = article_title
            added_text = " ".join(title_term_list + abstract_term_list)

            doc = Document()
            doc.add(Field("title", added_title, StoredField.TYPE))
            doc.add(Field("text", added_text, TextField.TYPE_STORED))
            writer.addDocument(doc)
    writer.close()
                            print "warning: no content in sentence %d of file %s" % sentence_num, filename
                        writer.addDocument(doc)
                        sentence_num += 1
                except Exception, e:
                    #print "Failed in indexDocs:", e
                    error = 1
        print "Index has Added " + str(docindex_num) + " files..."


if __name__ == '__main__':  # can be run as a main program or imported as a module
    if len(sys.argv) < 5:
        print IndexFiles.__doc__
        sys.exit(1)

    startDate = datetime.strptime(sys.argv[3], '%Y%m%d')
    endDate = datetime.strptime(sys.argv[4], '%Y%m%d')

    lucene.initVM()  # initialize the Java VM
    print 'lucene', lucene.VERSION
    start = datetime.now()
    try:
        IndexFiles(sys.argv[1], sys.argv[2],
                   lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT),
                   startDate, endDate)
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e

#    os.system("pause")
 def indexFile(self):
     self._th=lucene.initVM()
     self._analyzer = StandardAnalyzer(Version.LUCENE_36)
     self._dir = RAMDirectory()
     self._writer = IndexWriter(self._dir, self._analyzer, True, IndexWriter.MaxFieldLength(25000))
Example #46
    def post(self):
        q = self.get_argument("query")
        k = self.get_argument("kTerms")

        # self.write(key)

        # def query(query):
        # query = self.get_argument("q")
        lucene.initVM()
        indexDir = "index"
        dir = SimpleFSDirectory(File(indexDir))
        analyzer = StandardAnalyzer(Version.LUCENE_30)
        searcher = IndexSearcher(dir)

        query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
        MAX = 10
        hits = searcher.search(query, MAX)

        print "Found %d document(s) that matched query '%s':" % (
            hits.totalHits, query)
        items = []
        rQ = []

        #for key, value in doc_urls.iteritems()
        # print (key, value)

        for hit in hits.scoreDocs:
            #items.append({'score':hit.score, 'doc':hit.doc, 'blah':hit.toString(), 'url':doc_urls[str(hit.doc)]})
            print hit.score, hit.doc, hit.toString()
            print(len(doc_urls))
            items.append(doc_urls[str(hit.doc)])
            print(doc_urls[str(hit.doc)])
            doc = searcher.doc(hit.doc)
            print(hit.doc)
            rQ.append("html_files/" + str(hit.doc))

        i = 0
        rqSize = 0
        for url in rQ:
            rqSize = rqSize + 1
            print(url)
            f = codecs.open(url, 'r')
            html = f.read()
            html = html.decode('utf-8')
            tag_free = strip_tags(html)
            path = 'strippedHTML_files'
            if not os.path.exists(path):
                os.makedirs(path)
            filename = str(i)
            with open(os.path.join(path, filename), 'wb') as temp_file:
                temp_file.write(tag_free.encode('utf-8'))
            i = i + 1

        path = 'strippedHTML_files'
        i = 0
        for filename in os.listdir(path):
            with open(os.path.join(path, filename), 'r') as myfile:
                data = myfile.read()
                stripStopWords(data, i)
                i = i + 1
        if k > 0:
            newQuery = calcNewQuery(k, q, rqSize)
            q = newQuery
            print("new query is ")
            print(q)

        self.render("index.html",
                    title="Results",
                    items=items,
                    query=q,
                    kTerms=k)
Example #47
    def GET(self, query):
        data_input = web.input()
        page = 0
        if "page" in data_input:
            page = int(data_input["page"])
        render = web.template.render('templates/')
        anses = []
        num_pages = 0
        if use_elasticsearch:
            # importing libraries for Elasticsearch
            from elasticsearch import Elasticsearch
            from elasticsearch_dsl import Search, document, field, connections, Q
            from elasticsearch_dsl.connections import connections
            from booktype import Book

            es = Elasticsearch()
            es.indices.create(index='book-index', ignore=[400, 404])
            connections.create_connection(hosts=['localhost'], timeout=20)
            connections.add_connection('book', es)
            # print(connections.get_connection().cluster.health())
            s = Search(es).index('book-index').doc_type('book').query(
                Q('match', title=query.strip())
                | Q('match', description=query.strip())
                | Q("match", userreviews_userReview=query.strip()))
            ## Note: this pagination slice must be applied before s.execute()
            s = s[page * 10:page * 10 + 10]
            response = s.execute()
            # print 'total number of hits: ', response.hits.total
            num_pages = (response.hits.total / 10) + 1
            for res in response:
                authors = zip(res.authors_name, res.authors_url)
                anses.append({
                    'title': res.title,
                    'description': res.description.encode('utf-8'),
                    'url': res.url,
                    'cover': res.cover,
                    'authors': authors
                })
        else:
            # importing libraries for Lucene
            import lucene
            from java.io import File
            from org.apache.lucene.index import DirectoryReader, Term
            from org.apache.lucene.queryparser.classic import QueryParser
            from org.apache.lucene.store import SimpleFSDirectory
            from org.apache.lucene.search import IndexSearcher, BooleanClause, BooleanQuery, TermQuery
            from org.apache.lucene.util import Version
            from org.apache.lucene.analysis.standard import StandardAnalyzer
            import os

            # fields
            title_field = 'title'
            description_field = 'description'
            cover_field = 'cover'
            authors_name_field = 'authors_name'
            authors_url_field = 'authors_url'
            url_field = 'url'

            index_folder = '.'
            index_name = 'lucene.index'
            index_path = os.path.join(index_folder, index_name)

            lucene.initVM()
            version = Version.LUCENE_CURRENT
            directory = SimpleFSDirectory(File(index_path))
            searcher = IndexSearcher(DirectoryReader.open(directory))
            analyzer = StandardAnalyzer(version)

            title_tq = TermQuery(Term(title_field, query))
            desc_tq = TermQuery(Term(description_field, query))
            query = BooleanQuery()
            query.add(BooleanClause(title_tq, BooleanClause.Occur.SHOULD))
            query.add(BooleanClause(desc_tq, BooleanClause.Occur.SHOULD))
            scoreDocs = searcher.search(query, 1000).scoreDocs
            num_pages = (len(scoreDocs) / 10) + 1

            for scoreDoc in scoreDocs[page * 10:page * 10 + 10]:
                doc = searcher.doc(scoreDoc.doc)
                authors = zip([doc.get(authors_name_field)],
                              [doc.get(authors_url_field)])
                anses.append({
                    'title':
                    doc.get(title_field),
                    'description':
                    doc.get(description_field).encode('utf-8'),
                    'url':
                    doc.get(url_field),
                    'cover':
                    doc.get(cover_field),
                    'authors':
                    authors
                })

        return render.index(anses, query, num_pages)
def index_ontology_files(oboFile, outDir, xref_map):
    """
    Iterates over our list of ontology files and creates an index for each file.
    """
    lucene.initVM()
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

    # Handle a little bit of lucene setup
    filename, _ext = os.path.splitext(os.path.basename(oboFile))

    indexDir = os.path.join(outDir, filename)
    if os.path.exists(indexDir):
        raise ExistingIndexDirectoryException(
            'Error, attempted to index same file twice or index two files named the same'
        )

    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = lucene.IndexWriter(dir, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))

    for term in oboparser.parse(oboFile, ['is_a']):
        if term.obsolete:
            continue

        doc = lucene.Document()
        add_field_to_document(doc, "term id", term.id, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED)
        add_field_to_document(doc, "name", term.name, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 4.0)

        # Frequently in the definition text we will run into URLs or some sort of hyperlinks that could
        # query hits that we would not want to occur thus errantly increasing the score of the field.
        # We will strip out these hyperlinks and index just the text.
        add_field_to_document(doc, "definition",
                              strip_urls_from_text(term.definition),
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 0.4)

        # Synonyms, relationships, xrefs, subsets, and alternate ID's are all represented as lists
        # in our Ontology object and need to be entered in one at a time
        add_fields_to_document(doc, "synonym",
                               [x[0] for x in term.synonyms if x],
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED, 0.7)

        add_fields_to_document(doc, "alt_id", term.alternateIds,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "xref",
            [replace_xref_identifier(x, xref_map) for x in term.xrefs],
            lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "relationship",
            [" ".join(list(x)) for x in list(term.relationships)],
            lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)
        add_fields_to_document(doc, "subset", term.subsets,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
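A minimal usage sketch for index_ontology_files above, with a hypothetical OBO file, output directory, and xref map:

# hypothetical call
index_ontology_files('go.obo', 'ontology_indexes/', {'GO': 'GeneOntology'})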
Example #49
                                     lucene.Field.Index.NOT_ANALYZED))
                    doc.add(
                        lucene.Field("path", path, lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                    if len(contents) > 0:
                        doc.add(
                            lucene.Field("contents", contents,
                                         lucene.Field.Store.NO,
                                         lucene.Field.Index.ANALYZED))
                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print IndexFiles.__doc__
        sys.exit(1)
    lucene.initVM()
    print 'lucene', lucene.VERSION
    start = datetime.now()
    try:
        IndexFiles(sys.argv[1], "index",
                   lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e
Beispiel #50
0
def ansSearch(command, prior, page, RPP):
    if searcher_ans.vm == None:
        searcher_ans.vm = initVM()
    searcher_ans.vm.attachCurrentThread()
    return searcher_ans.Searchfile(command, prior, page, RPP)
            query1 = ''.join(query)

        command = query1
        os.remove('search.txt')
        if command == '':
            return

        print
        start = datetime.now()
        print("Searching for:", command)
        query = QueryParser("contents", analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print("%s total matching documents." % len(scoreDocs))
        end = datetime.now()

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print(doc.get("path"), 'name:', doc.get("name"))
        print('done...')
        print(end - start)


if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print('lucene', lucene.VERSION)
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(Paths.get(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()
    run(searcher, analyzer)
    del searcher
Beispiel #52
0
    if result:  #DO_NOT_DISTRIBUTE
        exit(result)  #DO_NOT_DISTRIBUTE
sysPath.insert(0, libDir)  #DO_NOT_DISTRIBUTE

from os import getenv
from warnings import warn

maxheap = getenv('PYLUCENE_MAXHEAP')
if not maxheap:
    maxheap = '4g'
    warn(
        "Using '4g' as maxheap for lucene.initVM(). To override use PYLUCENE_MAXHEAP environment variable."
    )
from lucene import initVM, getVMEnv
try:
    VM = initVM(maxheap=maxheap)  #, vmargs='-agentlib:hprof=heap=sites')
except ValueError:
    VM = getVMEnv()
from meresco_lucene import initVM

VMM = initVM()

from fieldregistry import SORTED_PREFIX, UNTOKENIZED_PREFIX, KEY_PREFIX, NUMERIC_PREFIX
from _version import version
from luceneresponse import LuceneResponse
from _lucene import Lucene
from lucenesettings import LuceneSettings
from fields2lucenedoc import Fields2LuceneDoc
from cqltolucenequery import CqlToLuceneQuery
from multilucene import MultiLucene
from termnumerator import TermNumerator
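Because initVM() can run only once per process and the bootstrap module above calls it at import time, the PYLUCENE_MAXHEAP override has to be set before that module is imported. A minimal sketch (module name hypothetical):

import os
os.environ['PYLUCENE_MAXHEAP'] = '8g'    # must be set before the import below triggers initVM()
import meresco_lucene_bootstrap          # hypothetical name for the bootstrap module shown above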
Beispiel #53
0
 def __init__(self, storeDir):
     lucene.initVM(vmargs=['-Djava.awt.headless=true'])
     print 'lucene', lucene.VERSION
     self.dir = SimpleFSDirectory(File(storeDir))
Beispiel #54
0
    op.add_option("-i",
                  dest='create_index',
                  default=False,
                  action='store_true',
                  help="create index; not search")
    op.add_option("--maxheap",
                  dest='maxheap',
                  default='8g',
                  help="min ram for the VM")
    op.add_option("--max_n",
                  dest='max_n',
                  default=MAX_N,
                  help="max return search item")
    opts, args = op.parse_args(sys.argv)

    lucene.initVM(maxheap=opts.maxheap)
    print('lucene', lucene.VERSION)
    start = datetime.now()

    if opts.exact_match:
        print("creating keyworkanalyzer -> exact match on %s" %
              DEFAULT_SEARCH_FIELD)
        analyzer = KeywordAnalyzer(Version.LUCENE_CURRENT)
    else:
        print("creating stdanalyzer -> keyword match on %s" %
              DEFAULT_SEARCH_FIELD)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    fname = os.path.join(base_dir, opts.index_dir)
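The exact-match / keyword-match distinction printed above comes down to tokenization: KeywordAnalyzer emits the entire field value as a single token, while StandardAnalyzer splits it into lower-cased terms. A rough sketch that makes the difference visible (import paths assume the Lucene 4.x package layout used with PyLucene):

from java.io import StringReader
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

def tokens(analyzer, text):
    # Sketch (assumption): collect the terms an analyzer produces for a piece of text.
    stream = analyzer.tokenStream(DEFAULT_SEARCH_FIELD, StringReader(text))
    term = stream.addAttribute(CharTermAttribute.class_)
    stream.reset()
    out = []
    while stream.incrementToken():
        out.append(term.toString())
    stream.close()
    return out

# tokens(KeywordAnalyzer(...), "New York City")  -> ["New York City"]        (whole value: exact match)
# tokens(StandardAnalyzer(...), "New York City") -> ["new", "york", "city"]  (per-word: keyword match)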
Beispiel #55
0
        if ikeyword=="":
            return render.result_text(ikeyword, [[]], 0)
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        results = search_site(ikeyword)
        print len(results[0])
        length = len(results)
        return render.result_text(ikeyword, results, length)



class image:
    def POST(self):
        i = web.input(myfile={})
        f = open('target.jpg', 'wb')  # binary mode so the uploaded image bytes are written verbatim
        f.write(str(i['myfile'].value))
        f.close()
        ikeyword='blank'
        if i.myfile.value == "":
            return render.result_pic(ikeyword,[[]], 0)
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        results = search_img(match_imgs("target.jpg"))
        length = len(results)
        return render.result_pic(ikeyword, results, length)


if __name__ == "__main__":
    vm_env = lucene.initVM()
    app = web.application(urls, globals())
    app.run()
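Both handlers above repeat the getVMEnv().attachCurrentThread() pair because lucene.initVM() runs once in the main thread, while web.py dispatches each request on a worker thread the JVM does not yet know about. A minimal sketch of the pattern, factored into a helper (name hypothetical):

import lucene

def ensure_jvm_attached():
    # Sketch (assumption): attach the current worker thread to the JVM created by lucene.initVM().
    env = lucene.getVMEnv()
    env.attachCurrentThread()   # safe to call on every request; repeated attaches are harmless
    return env

Each POST handler could then start with ensure_jvm_attached() instead of repeating the two lines.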
Beispiel #56
0
	def ready(self):
		settings.JVM = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
Beispiel #57
0
        idfpath = "D:\\ICTCLAS\\wordIDF_MBStrategy.txt"
        totalfile = 659796
        stockcodeflag = 1
    if sourcedata == "股票论坛".decode('utf8').encode('gbk'):
        STORE_DIR = "D:\\DATA\\Index\\text"
        idfpath = "D:\\ICTCLAS\\wordIDF_text.txt"
        totalfile = 1487094
        stockcodeflag = 1
    if sourcedata == "个股新闻".decode('utf8').encode('gbk'):
        STORE_DIR = "D:\\DATA\\Index\\sinaStockNews"
        idfpath = "D:\\ICTCLAS\\wordIDF_sinaStockNews.txt"
        totalfile = 441061
        stockcodeflag = 0

#------------- Lucene Init -----------------
    initVM(maxheap='512m')
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    print "Lucene Search Init Done..."

    #------------- Divide Word Init -----------------
    divide = cdll.LoadLibrary("D:\\ICTCLAS\\ICTCLAS50.dll")
    IfInit = divide.ICTCLAS_Init(c_char_p("D:\\ICTCLAS"))
    wordcount = divide.ICTCLAS_ImportUserDictFile('D:\\ICTCLAS\\userdict.txt',
                                                  "CODE_TYPE_UTF8")
    print "Divde Word Init Done..."
    print

    #------------- Net Word Generate -----------------
Beispiel #58
0
    def __init__(self, DATA_DIR, vocab, n_threads, max_terms_per_doc,
                 index_name, index_name_term, docs_path, docs_path_term,
                 use_cache):
        self.n_threads = n_threads
        self.index_folder = DATA_DIR + '/data/' + index_name + '/'  # folder to store lucene's index. It will be created in case it does not exist.
        self.index_folder_term = DATA_DIR + '/data/' + index_name_term + '/'  # folder to store lucene's index. It will be created in case it does not exist.
        self.local_index_folder = './' + index_name
        self.local_index_folder_term = './' + index_name_term
        self.use_cache = use_cache
        self.docs_path = docs_path
        self.docs_path_term = docs_path_term
        self.max_terms_per_doc = max_terms_per_doc

        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = vocab

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(self.index_folder):
            print 'Creating index at', self.index_folder
            if self.docs_path == self.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(self.index_folder, self.docs_path, add_terms)

        if self.local_index_folder:
            print 'copying index from', self.index_folder, 'to', self.local_index_folder
            if os.path.exists(self.local_index_folder):
                print 'Folder', self.local_index_folder, 'already exists! Doing nothing.'
            else:
                shutil.copytree(self.index_folder, self.local_index_folder)
            self.index_folder = self.local_index_folder
        else:
            self.index_folder = self.index_folder

        fsDir = MMapDirectory(Paths.get(self.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        if self.docs_path != self.docs_path_term:
            if not os.path.exists(self.index_folder_term):
                print 'Creating index at', self.index_folder_term
                self.create_index(self.index_folder_term,
                                  self.docs_path_term,
                                  add_terms=True)

            if self.local_index_folder_term:
                print 'copying index from', self.index_folder_term, 'to', self.local_index_folder_term
                if os.path.exists(self.local_index_folder_term):
                    print 'Folder', self.local_index_folder_term, 'already exists! Doing nothing.'
                else:
                    shutil.copytree(self.index_folder_term,
                                    self.local_index_folder_term)
                self.index_folder_term = self.local_index_folder_term
            else:
                self.index_folder_term = self.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(self.index_folder_term))
            self.searcher_term = IndexSearcher(
                DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=self.n_threads)
        self.cache = {}

        print 'Loading Title-ID mapping...'
        self.title_id_map, self.id_title_map = self.get_title_id_map()
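The constructor ends with self.get_title_id_map(), which is not shown. A plausible method sketch, assuming every indexed document stores an 'id' and a 'title' field, is to walk the index through the searcher's reader:

    def get_title_id_map(self):
        # Sketch (assumption): scan all stored documents and build title<->id lookup tables.
        title_id, id_title = {}, {}
        reader = self.searcher.getIndexReader()
        for i in range(reader.maxDoc()):
            doc = reader.document(i)
            title = doc.get('title')
            doc_id = doc.get('id')
            title_id[title] = doc_id
            id_title[doc_id] = title
        return title_id, id_title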
Beispiel #59
0
def main(use_elasticsearch=True, calculate_PageRank=False, tele_const=0.2):
    """
    main entry for the indexer module.
    """
    jsons_root_dir = 'JSONs/'

    # list of addresses of all json files
    all_json_dirs = glob.glob(jsons_root_dir + '*.json')

    # first reading all json files
    jsons = []
    for jdir in all_json_dirs:
        with open(jdir, 'r') as f:
            jsn = json.load(f)
            jsons.append(jsn)
    print len(jsons), ' json files imported.'

    # now creating a set of all links and then a list of all links in json files
    print 'creating a list of all links'
    links_set = set()
    for js in jsons:
        links_set.add(js["url"])
        for l in js["outlinks"]:
            links_set.add(l)
    print len(links_set), ' links found'
    links = list(links_set)

    ## if user has selected to index documents using Elasticsearch
    # Note that when using Elasticsearch, page rank is ignored
    if use_elasticsearch:
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Search, document, field, connections, Q
        from elasticsearch_dsl.connections import connections

        print 'Using Elasticsearch for indexing, PageRank is ignored'
        es = Elasticsearch()
        es.indices.create(index='book-index', ignore=[400, 404])
        connections.create_connection(hosts=['localhost'], timeout=20)
        connections.add_connection('book', es)
        Book.init('book-index')

        ## adding all document to the index 'book-index'
        for idx, js in enumerate(jsons):
            book = Book(average=js['average'],
                        cover=js['cover'],
                        description=js['description'].encode('utf-8'),
                        ratings=js['ratings'],
                        reviews=js['reviews'],
                        title=js['title'],
                        url=js['url'],
                        outlinks=js['outlinks'])
            book.add_authors(js['authors'])
            book.add_userreviews(js['userreviews'])
            book.id = idx
            book.save()
        print 'Elasticsearch index created'

    ### use pyLucene instead
    else:
        import lucene
        from java.io import File
        from org.apache.lucene.index import IndexWriterConfig, IndexWriter, FieldInfo
        from org.apache.lucene.document import Document, Field, FieldType, IntField, FloatField
        from org.apache.lucene.store import SimpleFSDirectory
        from org.apache.lucene.util import Version
        from org.apache.lucene.analysis.standard import StandardAnalyzer

        print 'Using Lucene for indexing'
        ## if user has selected to calculate the PageRank
        if calculate_PageRank:
            # now creating the unnormalized adjacency matrix
            print 'creating the unnormalized adjacency matrix.'
            adjacency = np.zeros((len(links_set), len(links_set)))
            for js in jsons:
                node_idx = links.index(js["url"])
                for l in js["outlinks"]:
                    out_idx = links.index(l)
                    adjacency[node_idx, out_idx] += 1
            print 'the unnormalized adjacency matrix created.'

            print 'normalizing the adjacency matrix with teleportation constant ', tele_const
            norm_mat = Normalize(adjacency, tele_const)
            print 'calculating the PageRank scores'
            pr_scores = PageRankScore(norm_mat)

        ## here goes the pyLucene code, which means I should switch over to Ubuntu
        index_folder = '.'
        index_name = 'lucene.index'
        index_path = os.path.join(index_folder, index_name)
        print 'initializing Lucene VM'
        lucene.initVM()
        print 'lucene version ', lucene.VERSION
        version = Version.LUCENE_CURRENT
        index_store = SimpleFSDirectory(File(index_path))
        analyzer = StandardAnalyzer(version)
        config = IndexWriterConfig(version, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(index_store, config)

        # Options
        TokenizeFields = True

        # Title field type
        title_field = 'title'
        tft = FieldType()
        tft.setIndexed(True)
        tft.setStored(True)
        tft.setTokenized(TokenizeFields)
        tft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS
                            )  #only index the document and frequency data

        # Authors name field type
        authors_name_field = 'authors_name'
        anft = FieldType()
        anft.setIndexed(True)
        anft.setStored(True)
        anft.setTokenized(TokenizeFields)
        anft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # Authors url field type
        authors_url_field = 'authors_url'
        auft = FieldType()
        auft.setIndexed(False)
        auft.setStored(True)

        # Average rating field type
        average_field = 'average'

        # Cover Image URL field type
        cover_field = 'cover'
        cft = FieldType()
        cft.setIndexed(False)
        cft.setStored(True)

        # Book description field type
        description_field = 'description'
        descft = FieldType()
        descft.setIndexed(True)
        descft.setStored(True)
        descft.setTokenized(TokenizeFields)
        descft.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        # Outlinks field type
        outlinks_field = "outlinks"
        outft = FieldType()
        outft.setIndexed(False)
        outft.setStored(True)

        # Ratings count field type
        ratings_field = 'ratings'

        # Reviews count field type
        reviews_field = 'reviews'

        # URL field type
        url_field = 'url'
        uft = FieldType()
        uft.setIndexed(False)
        uft.setStored(True)

        # userreviews.userName field type
        userreviews_userName_field = 'userreviews_userName'
        usunft = FieldType()
        usunft.setIndexed(False)
        usunft.setStored(True)

        #userreviews.userReview field type
        userreviews_userReview_field = 'userreviews_userReview'
        usurft = FieldType()
        usurft.setIndexed(True)
        usurft.setStored(False)
        usurft.setTokenized(TokenizeFields)
        usurft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        #userreviews.userReviewDate field type
        userreviews_userReviewDate_field = 'userreviews_userReviewDate'
        usudft = FieldType()
        usudft.setIndexed(False)
        usudft.setStored(True)

        #userreviews.userURL field type
        userreviews_userURL_field = 'userreviews_userURL'
        usuuft = FieldType()
        usuuft.setIndexed(False)
        usuuft.setStored(True)

        docid_field = 'docid'

        for idx, js in enumerate(jsons):
            boostVal = js['average']
            if calculate_PageRank:
                boostVal *= pr_scores[links.index(js['url'])]
            doc = Document()
            for author in js['authors']:
                doc.add(Field(authors_name_field, author['name'], anft))
                doc.add(Field(authors_url_field, author['url'], auft))
            doc.add(
                FloatField(average_field, float(js['average']),
                           Field.Store.YES))
            doc.add(Field(cover_field, js['cover'], cft))
            df = Field(description_field, js['description'], descft)
            df.setBoost(boostVal)
            doc.add(df)
            for u in js['outlinks']:
                doc.add(Field(outlinks_field, u, outft))
            doc.add(IntField(ratings_field, js['ratings'], Field.Store.YES))
            doc.add(IntField(reviews_field, js['reviews'], Field.Store.YES))
            tf = Field(title_field, js['title'], tft)
            tf.setBoost(boostVal)
            doc.add(tf)
            doc.add(Field(url_field, js['url'], uft))

            for rev in js['userreviews']:
                doc.add(
                    Field(userreviews_userName_field, rev['userName'], usunft))
                doc.add(
                    Field(userreviews_userReview_field, rev['userReview'],
                          usurft))
                doc.add(
                    Field(userreviews_userReviewDate_field,
                          rev['userReviewDate'], usudft))
                doc.add(
                    Field(userreviews_userURL_field, rev['userURL'], usuuft))
            doc.add(IntField(docid_field, idx, Field.Store.YES))

            writer.addDocument(doc)
        print 'lucene index created'
        writer.commit()
        writer.close()
        print 'writing lucene indexing finished'
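The PageRank branch earlier in this snippet calls Normalize and PageRankScore, which are defined elsewhere. A minimal sketch of the usual formulation, assuming Normalize blends a row-stochastic transition matrix with uniform teleportation and PageRankScore runs power iteration:

import numpy as np

def Normalize(adjacency, tele_const):
    # Sketch (assumption): row-normalize the adjacency matrix, then mix in uniform teleportation.
    n = adjacency.shape[0]
    row_sums = adjacency.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0                    # avoid division by zero for dangling pages
    transition = adjacency / row_sums                # row-stochastic random-walk matrix
    return (1.0 - tele_const) * transition + tele_const / n

def PageRankScore(norm_mat, n_iter=100, tol=1e-10):
    # Sketch (assumption): power iteration until the score vector stops changing.
    n = norm_mat.shape[0]
    scores = np.full(n, 1.0 / n)
    for _ in range(n_iter):
        new_scores = scores.dot(norm_mat)
        converged = np.abs(new_scores - scores).sum() < tol
        scores = new_scores
        if converged:
            break
    return scores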
            return

        print
        print "Searching for:", command
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------------'
            print 'title:', doc.get('title')
            print 'url:', doc.get('url')
            print 'src:', doc.get('src')


if __name__ == '__main__':
    STORE_DIR = "image_index_v3"
    initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    run(searcher, analyzer)
    searcher.close()
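The last fragment builds its BooleanQuery from parseCommand(command), which is not shown. A plausible sketch, assuming queries arrive as space-separated 'field:value' pairs and bare terms fall back to a default field (the default field name here is an assumption):

def parseCommand(command, default_field='contents'):
    # Sketch (assumption): "title:foo url:bar baz" -> {'title': 'foo', 'url': 'bar', 'contents': 'baz'}
    fields = {}
    for token in command.split():
        if ':' in token:
            key, value = token.split(':', 1)
        else:
            key, value = default_field, token
        fields[key] = (fields.get(key, '') + ' ' + value).strip()
    return fields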