def __init__(self, directory):
    self.directory = directory
    # create Directories for the search index and for the taxonomy index
    # in RAM or on Disc
    #indexDir = RAMDirectory()
    #taxoDir = RAMDirectory()
    self.indexDir = FSDirectory.open(File(os.path.join(self.directory,
                                                       'Index')))
    self.taxoDir = FSDirectory.open(File(os.path.join(self.directory,
                                                      'Taxonomy')))
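# Hedged usage sketch for the constructor above. The enclosing class is not shown
# in this snippet, so `FacetIndexer` is an assumed name; instantiating it simply
# opens (or creates) two on-disk stores under the given root, <root>/Index for
# documents and <root>/Taxonomy for the facet taxonomy.
#
#   indexer = FacetIndexer('/tmp/facet-demo')
#   print indexer.indexDir, indexer.taxoDir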
def search(r, keyword=""): import logging logger = logging.getLogger("search") bench = Benchmark(logger) from lucene import IndexSearcher, StandardAnalyzer, FSDirectory, QueryParser, File, Hit import lucene, os os.environ["JAVA_HOME"] = "/usr/local/jdk1.6.0_17" lucene.initVM(lucene.CLASSPATH) directory = FSDirectory.open(File(CONFIG.INDEX_PATH)) ROBOT_INDEX = IndexSearcher(directory, True) ROBOT_ANALYZER = StandardAnalyzer() keyword = keyword or r.GET["keyword"] query = QueryParser("context", ROBOT_ANALYZER) query = query.parse('"%s"' % keyword) bench.start_mark("search") hits = ROBOT_INDEX.search(query) count = len(hits) result = [] i = 0 for hit in hits: i += 1 if i > 100: break doc = Hit.cast_(hit).getDocument() result.append(SearchResult(doc, i, keyword)) ROBOT_INDEX.close() et = bench.stop_mark() return render_to_response("robot_search_result.html", {"result": result, "count": count, "elaspe": et})
def main(cls, argv):
    if len(argv) < 5:
        print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
        return

    docsInIndex = int(argv[1])

    # create an index called 'index-dir' in a temp directory
    indexDir = os.path.join(tempfile.gettempdir(), 'index-dir')
    dir = FSDirectory.open(File(indexDir))
    analyzer = SimpleAnalyzer()
    writer = IndexWriter(dir, analyzer, True)

    # set variables that affect speed of indexing
    writer.setMergeFactor(int(argv[2]))
    writer.setMaxMergeDocs(int(argv[3]))
    writer.setMaxBufferedDocs(int(argv[4]))
    # writer.infoStream = tempfile.out

    print "Merge factor:  ", writer.getMergeFactor()
    print "Max merge docs:", writer.getMaxMergeDocs()
    print "Max buffered docs:", writer.getMaxBufferedDocs()

    start = time()
    for i in xrange(docsInIndex):
        doc = Document()
        doc.add(Field("fieldname", "Bibamus",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.close()
    print "Time: ", timedelta(seconds=time() - start)
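# Illustrative invocation of the tuning demo above (argument order follows its
# usage string; the numbers are placeholders, not recommended settings):
#
#   python IndexTuningDemo.py 100000 10 9999999 100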
def main(cls, argv):
    if len(argv) < 5:
        print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
        return

    docsInIndex = int(argv[1])

    # create an index called 'index-dir' in a temp directory
    indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
                            'index-dir')
    dir = FSDirectory.getDirectory(indexDir, True)
    analyzer = SimpleAnalyzer()
    writer = IndexWriter(dir, analyzer, True)

    # set variables that affect speed of indexing
    writer.setMergeFactor(int(argv[2]))
    writer.setMaxMergeDocs(int(argv[3]))
    writer.setMaxBufferedDocs(int(argv[4]))
    # writer.infoStream = System.out

    print "Merge factor:  ", writer.getMergeFactor()
    print "Max merge docs:", writer.getMaxMergeDocs()
    print "Max buffered docs:", writer.getMaxBufferedDocs()

    start = time()
    for i in xrange(docsInIndex):
        doc = Document()
        doc.add(Field("fieldname", "Bibamus",
                      Field.Store.YES, Field.Index.TOKENIZED))
        writer.addDocument(doc)

    writer.close()
    print "Time: ", timedelta(seconds=time() - start)
def __init__(self, user_loc_string, debug=False):
    analyzer = StopAnalyzer()
    fields = ["name", "alternate_names", "state"]
    directory = FSDirectory.getDirectory("index")

    self.DEBUG = debug
    self.searcher = IndexSearcher(directory)
    self.parser = MultiFieldQueryParser(fields, analyzer)
    self.user_location = self.doSearch(user_loc_string)
def __init__(self, user_loc_string, debug=False):
    analyzer = StopAnalyzer()
    fields = ['name', 'alternate_names', 'state']
    directory = FSDirectory.getDirectory("index")

    self.DEBUG = debug
    self.searcher = IndexSearcher(directory)
    self.parser = MultiFieldQueryParser(fields, analyzer)
    self.user_location = self.doSearch(user_loc_string)
def __init__(self, root, storeDir, analyzer):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    store = FSDirectory.getDirectory(storeDir, True)
    writer = IndexWriter(store, analyzer, True)
    writer.setMaxFieldLength(1048576)

    self.indexDocs(root, writer)
    print 'optimizing index',
    writer.optimize()
    writer.close()
    print 'done'
def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % (dataDir)

    writer = IndexWriter(FSDirectory.open(indexDir),
                         StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(False)

    numIndexed = cls.indexDirectory(writer, dataDir)
    writer.optimize()
    writer.close()

    return numIndexed
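# Hedged driver sketch for the classmethod above. The enclosing class name is not
# shown here, so `Indexer` is assumed; `File` is the java.io.File wrapper used with
# FSDirectory.open elsewhere in these snippets, and the paths are placeholders.
if __name__ == '__main__':
    import sys
    indexDir = File(sys.argv[1])   # directory that will hold the index
    dataDir = sys.argv[2]          # directory tree of files to index
    numIndexed = Indexer.index(indexDir, dataDir)
    print "Indexed %d file(s)" % numIndexed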
def search(q):
    initLucene()
    fsDir = FSDirectory.getDirectory(INDEX_PATH, False)
    searcher = IndexSearcher(fsDir)

    query = QueryParser("contents", StandardAnalyzer()).parse(q)
    start = time()
    hits = searcher.search(query)
    duration = timedelta(seconds=time() - start)

    matchpages = []
    for hit in hits:
        doc = Hit.cast_(hit).getDocument()
        temp = {"title": doc["title"], "url": doc["url"]}
        matchpages.append(temp)

    result = {"no_of_hits": hits.length(),
              "duration": duration,
              "query": q,
              "result": matchpages}
    return result
def run(command):
    if command == '':
        return None

    STORE_DIR = "index"
    initVM(CLASSPATH)
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer()

    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    parser.setFuzzyMinSim(0.2)
    query = parser.parse(command)

    hits = map(transform, searcher.search(query))
    searcher.close()
    return hits
def index(self):
    dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                           "verbose-index")
    dir = FSDirectory.getDirectory(dirPath, True)
    writer = IndexWriter(dir, SimpleAnalyzer(), True)

    writer.setInfoStream(System.out)

    for i in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.UN_TOKENIZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def search(cls, indexDir, q):
    fsDir = FSDirectory.getDirectory(indexDir, False)
    searcher = IndexSearcher(fsDir)

    query = QueryParser("contents", StandardAnalyzer()).parse(q)
    start = time()
    hits = searcher.search(query)
    duration = timedelta(seconds=time() - start)
    # result = {"no_of_hits": hits.length(), "duration": duration, "query": q,}
    # return

    print "Found %d document(s) (in %s) that matched query '%s':" % \
        (hits.length(), duration, q)

    for hit in hits:
        doc = Hit.cast_(hit).getDocument()
        print doc
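# Hedged driver sketch for the classmethod above; `Searcher` is an assumed class
# name, and the index path and query string are placeholders.
if __name__ == '__main__':
    Searcher.search("index", "lucene")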
def __init__(self):
    self.save = Field.Store.YES
    self.ddict = dict()
    lvm.attachCurrentThread()
    if not WorkflowIndexer.store and not WorkflowIndexer.writer:
        try:
            # open lucene index
            WorkflowIndexer.store = FSDirectory.open(File(location))
            # if the index directory does not exist, create it.
            WorkflowIndexer.writer = IndexWriter(
                WorkflowIndexer.store, vistrailAnalyzer(),
                not len(WorkflowIndexer.store.list()))
        except Exception, e:
            print "EXCEPTION", e
            self.close()
            raise
def index(self):
    dirPath = os.path.join(tempfile.gettempdir(), "verbose-index")
    dir = FSDirectory.open(dirPath)
    writer = IndexWriter(dir, SimpleAnalyzer(), True)

    writer.setInfoStream(InfoStreamOut())

    for i in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def similar(command, docno):
    STORE_DIR = "index"
    initVM(CLASSPATH)
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer()

    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    parser.setFuzzyMinSim(0.2)
    query = parser.parse(command)
    hits = searcher.search(query)

    # take the docno-th hit as the seed document and build a
    # MoreLikeThis query from its 'name' and 'contents' fields
    document = hits.id(docno)
    ir = IndexReader.open(STORE_DIR)
    mlt = MoreLikeThis(ir)
    mlt.setFieldNames(['name', 'contents'])
    mlt.setMinWordLen(2)
    mlt.setBoost(True)
    query = mlt.like(document)

    hits = map(transform, searcher.search(query))
    searcher.close()
    return hits
    emo_propagation_by_time = sorted(emoticon_propagation_hash.items(), key=itemgetter(0))

    emoticon_file = open(emoticon_file_name, 'w')
    emoticon_file.write("day," + ",".join(countrylist) +
                        ",total,totalinreplies,alltweets,emoticontweets,httpemoticons,UStweets,JPtweets\n")
    for emo_day_entry in emo_propagation_by_time:
        emoticon_file.write(str(emo_day_entry[0]) + "," +
                            ",".join([str(emo_day_entry[1].get(ctry, 0)) for ctry in countrylist]) + "," +
                            str(emo_day_entry[1]["total"]) + "," +
                            str(emo_day_entry[1]["total_in_replies"]) + "," +
                            str(emo_day_entry[1]['total tweets']) + "," +
                            str(emo_day_entry[1]["total emoticon tweets"]) + "," +
                            str(emo_day_entry[1]["total http emoticons"]) + "," +
                            str(emo_day_entry[1]["total US tweets"]) + "," +
                            str(emo_day_entry[1]["total JP tweets"]) + "\n")
    emoticon_file.close()

    print "done at: ", time.time()

if __name__ == '__main__':
    STORE_DIR = "/Volumes/TerraFirma/SharedData/vdb5/lucene_index"
    initVM(CLASSPATH, maxheap='1024m')
    print 'lucene', VERSION
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = WhitespaceAnalyzer()

    getBaselineStatistics(searcher, analyzer)

    #emoticon_list = [":)", ":(", ";)", ":P", ":0", "^^", "TT", ":p", ":/", "^_^", "T_T"]
    #emoticon_list = [":)", ":(", ":'(", ":-|", "^^", "+_+", "-_-", "T_T"]
    emoticon_list = [":)", ":D", ":(", ";)", ":P", "^^", "^_^", "-_-", "T_T", ":o", "@_@"]
    for prop_emoticon in emoticon_list:
        getEmoticonPropagationCurves(prop_emoticon, searcher, analyzer)
    #getEmoticonPropagationCurves(":)", searcher, analyzer)
    #getEmoticonPropagationCurves(":(", searcher, analyzer)
    #getEmoticonPropagationCurves("^_^", searcher, analyzer)
    #getEmoticonPropagationCurves(";)", searcher, analyzer)
    #getEmoticonPropagationCurves("TT", searcher, analyzer)
    #getEmoticonPropagationCurves("=^", searcher, analyzer)

    searcher.close()
def run(searcher, parser):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query: ")
        if command == '':
            return

        print "Searching for:", command
        query = parser.parse(command)
        hits = searcher.search(query, Sort("population", True))
        print "%s total matching documents." % hits.length()

        for hit in hits:
            doc = Hit.cast_(hit).getDocument()
            print 'name:', doc.get("name"), ' state:', doc.get("state")


if __name__ == '__main__':
    STORE_DIR = "index"
    initVM(CLASSPATH)
    print 'lucene', VERSION
    directory = FSDirectory.getDirectory(STORE_DIR)
    searcher = IndexSearcher(directory)
    analyzer = StopAnalyzer()
    parser = QueryParser("all_names", analyzer)
    parser.setDefaultOperator(parser.AND_OPERATOR)
    run(searcher, parser)
    searcher.close()
import sys, shutil
from itertools import count
from lucene import (IndexWriter, StandardAnalyzer, Document, Field,
                    MultiFieldQueryParser, IndexSearcher, initVM, CLASSPATH,
                    Hit, FSDirectory, BooleanClause)

initVM(CLASSPATH)

DIRECTORY = 'xxindex'
STORE = FSDirectory.getDirectory(DIRECTORY, True)


def indexDoc(writer, d):
    doc = Document()

    name_ = Field("name_", d.name,
                  Field.Store.YES, Field.Index.TOKENIZED)
    name_.setBoost(2.0)
    full_text = Field("full_text", d.full,
                      Field.Store.YES, Field.Index.TOKENIZED)
    id = Field("id", str(d.id),
               Field.Store.YES, Field.Index.UN_TOKENIZED)

    doc.add(name_)
    doc.add(full_text)
    doc.add(id)
    writer.addDocument(doc)


class MyHit(object):
    """One monster search result

    Essentially an adapter for Lucene's hits
    """