def author_search(qry, limit, out_dir=None):
    """Search the index for ``qry`` and keep hits whose primary author matches.

    Parses ``qry`` with Lucene's classic ``QueryParser`` against the
    ``contents`` field of the index stored at ``IDX_DIR``, then keeps only
    the hits whose ``prim_author`` contains ``qry`` (compared
    case-insensitively). The surviving hits are dumped as JSON to
    ``<out_dir>/<qry>.json`` and the same JSON text is returned.

    Args:
        qry: Query string in Lucene classic query syntax. It is also used
            verbatim as the base name of the output file.
        limit: Maximum number of hits to request from Lucene.
        out_dir: Directory that receives the JSON dump. When omitted, the
            original hard-coded ``authorPaths`` directory is used.

    Returns:
        A JSON string encoding ``{entry_id: {'title': ..., 'file': ...}}``.
    """
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()

    # The Lucene classes are only importable once the JVM has been started
    # by helper.initPyLucene(), hence the function-local imports.
    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File

    if out_dir is None:
        out_dir = ('/Users/Nelle/Documents/coding/text_analysis/newsvn/'
                   'RenaissanceNLP/data/dataResults/authorPaths/')

    field = 'contents'
    # The author field is lower-cased before comparison, so the query has to
    # be lower-cased as well or mixed-case queries can never match.
    needle = qry.lower()
    results = {}

    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    try:
        searcher = IndexSearcher(reader)
        analyzer = StandardAnalyzer(Version.LUCENE_40)
        parser = QueryParser(Version.LUCENE_40, field, analyzer)
        query = parser.parse(qry)
        print('Searching for: %s' % query.toString(field))

        raw_results = searcher.search(query, limit)
        print('%s total matching documents' % raw_results.totalHits)

        for hit in raw_results.scoreDocs:
            doc = searcher.doc(hit.doc)
            entry_id = doc.get('entry_id')
            entry = entry_map.get(entry_id)
            if entry is None:
                # The index references an entry the entry map does not know.
                continue
            short_title = entry['short_title']
            print(entry['prim_author'])
            if needle in entry['prim_author'].lower():
                results[entry_id] = {
                    'title': short_title,
                    'file': short_title + CONTENT_EXT,
                }
    finally:
        # Release the index files even when parsing or searching fails.
        reader.close()

    payload = json.dumps(results)
    # NOTE(review): qry goes into the file name unsanitized; confirm callers
    # never pass path separators or '..' segments.
    out_path = out_dir.rstrip('/') + '/' + qry + '.json'
    with open(out_path, 'w') as out_file:
        out_file.write(payload)
    return payload
def custom_search(qry, limit):
    """Search the index for ``qry`` and map each hit's file name to its year.

    Parses ``qry`` with Lucene's classic ``QueryParser`` against the
    ``contents`` field of the index stored at ``IDX_DIR`` and looks every
    hit up in the entry map.

    Args:
        qry: Query string in Lucene classic query syntax.
        limit: Maximum number of hits to request from Lucene.

    Returns:
        A dict mapping ``short_title + CONTENT_EXT`` to the entry's
        ``publ_year``. The original code built this mapping and then
        discarded it, so the function implicitly returned ``None``.
    """
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()
    rootdir = OUT_RAW_DIR

    # The Lucene classes are only importable once the JVM has been started
    # by helper.initPyLucene(), hence the function-local imports.
    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File

    field = 'contents'
    results = {}

    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    try:
        searcher = IndexSearcher(reader)
        analyzer = StandardAnalyzer(Version.LUCENE_40)
        parser = QueryParser(Version.LUCENE_40, field, analyzer)
        query = parser.parse(qry)
        print('Searching for: %s' % query.toString(field))

        raw_results = searcher.search(query, limit)
        print('%s total matching documents' % raw_results.totalHits)
        print(rootdir)

        for hit in raw_results.scoreDocs:
            doc = searcher.doc(hit.doc)
            entry = entry_map.get(doc.get('entry_id'))
            if entry is None:
                # The index references an entry the entry map does not know.
                continue
            fname = entry['short_title'] + CONTENT_EXT
            results[fname] = entry['publ_year']
    finally:
        # Release the index files even when parsing or searching fails.
        reader.close()

    return results
def do_search(qry, limit):
    """Run ``qry`` against the index and return the scored hits.

    Parses ``qry`` with Lucene's classic ``QueryParser`` against the
    ``contents`` field of the index stored at ``IDX_DIR``.

    Args:
        qry: Query string in Lucene classic query syntax.
        limit: Maximum number of hits to request from Lucene.

    Returns:
        A list of ``(score, doc, entry)`` tuples in the order Lucene
        returned the hits. ``doc`` is the stored Lucene document and
        ``entry`` is whatever the entry map holds for the document's
        ``entry_id`` (``None`` if the map's ``get`` finds nothing).
    """
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()

    # The Lucene classes are only importable once the JVM has been started
    # by helper.initPyLucene(), hence the function-local imports.
    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File

    print(os.path.abspath(os.path.pardir))

    field = 'contents'
    results = []

    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    try:
        searcher = IndexSearcher(reader)
        analyzer = StandardAnalyzer(Version.LUCENE_40)
        parser = QueryParser(Version.LUCENE_40, field, analyzer)
        query = parser.parse(qry)
        print('Searching for: %s' % query.toString(field))

        raw_results = searcher.search(query, limit)
        print('%s total matching documents' % raw_results.totalHits)

        for hit in raw_results.scoreDocs:
            # searcher.doc() loads the stored fields, so the documents stay
            # usable after the reader is closed below.
            doc = searcher.doc(hit.doc)
            entry = entry_map.get(doc.get('entry_id'))
            results.append((hit.score, doc, entry))
    finally:
        # Release the index files even when parsing or searching fails.
        reader.close()

    return results