def retrieve_sents(self):
    """Run self.query against the index at self.indexDir and return the
    Lucene document ids of the top 50 matching sentences."""
    fs_dir = SimpleFSDirectory(Paths.get(self.indexDir))
    searcher = IndexSearcher(DirectoryReader.open(fs_dir))
    parser = QueryParser("contents", StandardAnalyzer())
    parser.setDefaultOperator(QueryParser.Operator.OR)
    parsed = parser.parse(self.query)

    started = datetime.now()
    hits = searcher.search(parsed, 50).scoreDocs
    elapsed = datetime.now() - started

    if self.stats:
        print("Found %d sentences (in %s) that matched query '%s':"
              % (len(hits), elapsed, parsed), file=sys.stderr)

    return [hit.doc for hit in hits]
def index(indexdir):
    """Build a Lucene index at `indexdir` from 'data/docid.documento-xml.txt'.

    Each input line is '<id>\\t<xml>'. The TITLE/AUTHORS/ABSTRACT elements are
    extracted from the XML and stored as analyzed fields; the id is stored
    non-analyzed.

    Fixes: converts Python-2 print statement to the print() function used
    elsewhere in this file, closes the input file via `with`, drops the unused
    PorterStemmer, and renames the local `id` (shadowed the builtin).
    """
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(indexdir))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    def _first_text(xmldoc, tag):
        # Text of the first <tag> element, or "" when the tag is absent.
        nodes = xmldoc.getElementsByTagName(tag)
        return "" if len(nodes) == 0 else nodes[0].childNodes[0].nodeValue

    with open('data/docid.documento-xml.txt') as f:
        for i, line in enumerate(f):
            doc_id, xmltext = line.split('\t')
            xmldoc = minidom.parseString(xmltext.rstrip('\n'))
            doc = Document()
            doc.add(Field("title", _first_text(xmldoc, "TITLE"),
                          Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("authors", _first_text(xmldoc, "AUTHORS"),
                          Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("abstract", _first_text(xmldoc, "ABSTRACT"),
                          Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("id", doc_id,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)
            print("indexed %s docs" % (i + 1))
    writer.close()
def createWriter(index_dir):
    """Open (or create) a Lucene index at `index_dir` and return an
    IndexWriter using the default config; prints the available codecs
    and the codec in use for diagnostics."""
    directory = SimpleFSDirectory(File(index_dir).toPath())
    config = IndexWriterConfig()
    print(Codec.availableCodecs())
    print(f"Codec : {config.getCodec()}")
    return IndexWriter(directory, config)
def load_index(self):
    """Open the index at self.index_path, attaching self.reader and
    self.searcher, and report the document count."""
    directory = SimpleFSDirectory(File(self.index_path))
    self.reader = IndexReader.open(directory)
    doc_count = self.reader.numDocs()
    self.searcher = IndexSearcher(self.reader)
    print("Index contains %d documents." % doc_count)
def __init__(self, store_dir, analyzer, db_path):
    """Create a fresh Lucene index writer at `store_dir` and load wiki
    document ids from the DocDB at `db_path`, initialising the
    entity/entity-type vocabularies."""
    # Flag read elsewhere; controls whether entity types are written.
    self.write_type = True
    # spaCy NER labels treated as number-like entity types.
    self.spacy_number_types = ['DATE', 'CARDINAL', 'QUANTITY', 'MONEY', 'TIME', 'PERCENT', 'ORDINAL']
    if not os.path.exists(store_dir):
        os.mkdir(store_dir)
    store = SimpleFSDirectory(Paths.get(store_dir))
    config = IndexWriterConfig(analyzer)
    # CREATE mode: any existing index at store_dir is overwritten.
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.writer = IndexWriter(store, config)
    # TODO checksum
    self.wiki_db = DocDB(db_path=db_path)
    print('Getting docs..', db_path)
    self.doc_ids = self.wiki_db.get_ner_doc_ids(limit=None)
    print('# wiki docs', len(self.doc_ids))
    # NOTE(review): hard-coded corpus size — presumably a specific English
    # Wikipedia dump; this assert aborts on any other dump. Confirm intended.
    assert len(self.doc_ids) == 5075182
    # entity <-> index vocabularies with 'UNK' reserved at index 0.
    self.entity2idx = dict()
    self.idx2entity = dict()
    self.UNK = 'UNK'
    self.entity2idx[self.UNK] = 0
    self.idx2entity[self.entity2idx[self.UNK]] = self.UNK
    # entity-type vocabulary, also with 'UNK' at index 0.
    self.entitytype2idx = dict()
    self.entitytype2idx[self.UNK] = 0
    self.entity_dict = dict()
    # Largest number of entities seen so far (-1 = none processed yet).
    self.num_entities_max = -1
    print('Init. Done')
def __init__(self, indexDir):
    """Open a searcher over `indexDir` and set per-field ranking weights
    (first field weighted 1, second 0.2)."""
    self._dir = SimpleFSDirectory(Paths.get(indexDir))
    self._indexSearcher = IndexSearcher(DirectoryReader.open(self._dir))
    weights = HashMap()
    weights.put(FIELDS[0], 1)
    weights.put(FIELDS[1], 0.2)
    self._weights = weights
def __init__(self, store_dir, hits_dir, frags_dir=None):
    """Prepare output directories and an IndexWriter.

    store_dir -- location of the generated Lucene index
    hits_dir  -- location of the highlighted document hits
    frags_dir -- optional location of the document hit fragments
    """
    self.store_dir = store_dir
    self.hits_dir = hits_dir
    self.frags_dir = frags_dir
    # Create any missing output directories.
    for path in (self.store_dir, self.hits_dir):
        if not os.path.exists(path):
            os.mkdir(path)
    if self.frags_dir is not None and not os.path.exists(self.frags_dir):
        os.mkdir(self.frags_dir)
    self.directory = SimpleFSDirectory(File(self.store_dir))
    # For now I just use the StandardAnalyzer
    self.analyzer = StandardAnalyzer(Version.LUCENE_43)
    self.writer = IndexWriter(
        self.directory, IndexWriterConfig(Version.LUCENE_43, self.analyzer))
def run(command):
    """Search index 'index1' for `command` and return a list of hit dicts.

    Each dict has keys 'url', 'title' and 'highlight'; 'highlight' is the
    best highlighted fragment with spaces removed, or None when no fragment
    was produced.

    Fixes: Python-2 print statements converted to print() (the rest of the
    file uses Python-3 syntax) and `!= None` replaced with `is not None`.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index1"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(analysis(command))
    HighlightFormatter = SimpleHTMLFormatter()
    highlighter = Highlighter(HighlightFormatter, QueryScorer(query))
    scoreDocs = searcher.search(query, 500).scoreDocs
    print("%s total matching documents." % len(scoreDocs))
    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print('path:', doc.get("path"), 'name:', doc.get("name"),
              'url:', doc.get("url"), 'title:', doc.get("title"))
        text = doc.get('contents')
        highLightText = highlighter.getBestFragment(analyzer, "contents", text)
        if highLightText is not None:
            # Rejoin tokens — presumably the indexed text was
            # space-segmented; confirm against the indexer.
            highLightText = ''.join(highLightText.split(' '))
        result.append({'url': doc.get("url"),
                       'title': doc.get('title'),
                       'highlight': highLightText})
    return result
def run_img(command):
    """Search image index 'index2' for `command` over the 'urlcontent' and
    'title' fields (OR semantics) and return up to 50 hits as dicts with
    'title', 'url' and 'imgurl' keys.

    Fix: Python-2 print statements converted to print() for consistency
    with the Python-3 code elsewhere in this file.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index2"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    querys = BooleanQuery()
    query_content = QueryParser(Version.LUCENE_CURRENT, "urlcontent",
                                analyzer).parse(command)
    query_title = QueryParser(Version.LUCENE_CURRENT, "title",
                              analyzer).parse(command)
    # SHOULD + SHOULD: a document matches when either field matches.
    querys.add(query_content, BooleanClause.Occur.SHOULD)
    querys.add(query_title, BooleanClause.Occur.SHOULD)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    if len(scoreDocs) == 0:
        print("WARNING: No result")
    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print(doc.get("title"))
        result.append({'title': doc.get('title'),
                       'url': doc.get('url'),
                       'imgurl': doc.get('imgurl')})
    return result
def search_loop(index_dir, field="contents", explain=False):
    """Interactive query loop over the index at `index_dir`.

    Prompts for queries until an empty line is entered; prints the top 50
    hits, optionally with the scoring explanation.
    """
    reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(index_dir)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer()
    print("Hit enter with no input to quit.")
    while True:
        command = input("Query:")
        if command == '':
            return
        print("Searching for: %s" % command)
        query = QueryParser(field, analyzer).parse(command)
        hits = searcher.search(query, 50).scoreDocs
        print("%s total matching documents." % len(hits))
        for hit in hits:
            doc = searcher.doc(hit.doc)
            if field == 'web':
                print(f'{doc.get("web")} | {doc.get("raw")} | {hit.score}')
            else:
                print('path:', doc.get("path"), 'name:', doc.get("name"))
            if explain:
                print(searcher.explain(query, hit.doc))
                print('------------')
def GET(self, name):
    """web.py GET handler: search the 'good' and 'bad' indexes for the
    user-supplied shop query and render the results page.

    `name` comes from the URL route; the actual query parameters are read
    from the request ('shop' and 'brand').
    """
    STORE_DIR_GOOD = "index_good"
    STORE_DIR_BAD = "index_bad"
    # vm_env is a module-level JVM handle initialised elsewhere in the file.
    vm_env.attachCurrentThread()
    directory_good = SimpleFSDirectory(File(STORE_DIR_GOOD))
    searcher_good = IndexSearcher(DirectoryReader.open(directory_good))
    directory_bad = SimpleFSDirectory(File(STORE_DIR_BAD))
    searcher_bad = IndexSearcher(DirectoryReader.open(directory_bad))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    user_data = web.input(name=None)
    # yourInput presumably normalises the raw shop query — confirm.
    command = yourInput(user_data.shop)
    #command=command+u' '+u'brand:'+xx.decode('utf8')
    res = Run_Score(searcher_good, searcher_bad, analyzer, command, user_data.brand)
    # The processed query is appended so the template can echo it back.
    res.append(command)
    return render.SearchResult(res)
def __init__(self, root, storeDir, analyzer, type="html"):
    """Index the documents under `root` into a fresh Lucene index at
    `storeDir`.

    type -- "html" or "image": selects which indexing routine is run
            (raises KeyError for any other value).
    """
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(Paths.get(storeDir))
    # Cap tokens per field so huge documents cannot exhaust memory.
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(analyzer)
    # CREATE mode: overwrite any existing index at storeDir.
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    self.load_stop_words([
        "CNstopwords.txt",
        "ENstopwords.txt",
    ])
    self.html2text = HTML2Text()
    self.html2text.ignore_links = True
    self.html2text.ignore_images = True
    # Dispatch table mapping the `type` argument to an indexing method.
    type_to_index = {
        "html": self.index_html,
        "image": self.index_image,
    }
    type_to_index[type](root, writer)
    ticker = Ticker()
    print('commit index')
    # The ticker thread prints progress while the (potentially long)
    # commit runs; it is stopped by clearing ticker.tick afterwards.
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print('done')
def get_candidates(qatp):
    """Retrieve candidate document ids for each (question, answer, t, p)
    tuple by querying the Lucene index with the question text.

    Returns a list (one entry per input tuple) of lists of doc id strings.
    Fix: Python-2 print statement converted to print().
    """
    if prm.create_index:
        create_index()
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    for n, (q, a, t, p) in enumerate(qatp):
        if n % 100 == 0:
            print('finding candidates sample', n)
        # Escape boolean operators so they are treated as literal terms.
        q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text",
                            analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        candidates.append([searcher.doc(hit.doc).get("id")
                           for hit in hits.scoreDocs])
    return candidates
def create_index():
    """Build a Lucene index of Wikipedia article text at prm.index_folder,
    deleting any existing index there first.

    Fix: Python-2 print statements converted to print() for consistency
    with the Python-3 code elsewhere in this file.
    """
    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)
    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)
    print("%d docs in index" % writer.numDocs())
    print("Reading files from wikipedia...")
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        # Sequential article number used as the document id.
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print('indexing article', n)
    print("Indexed %d docs from wikipedia (%d docs in index)"
          % (n, writer.numDocs()))
    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
def create_index_for_wiki_sentence(filename, path, firstTime=False):
    """Create (or just open) the '_wiki_sentence' Lucene index from the
    wiki data at `path` and return its directory.

    When firstTime is False, no documents are written — the existing index
    directory is simply returned.

    NOTE(review): the `filename` parameter is immediately overwritten with
    the literal '_wiki_sentence', so the argument has no effect — confirm
    whether that is intended.
    """
    logging.info('Start create wiki_sentence!')
    wiki_dict = get_wiki_data(path)
    logging.info('Start creating index!')
    filename = '_wiki_sentence'
    analyzer = analysis.standard.StandardAnalyzer()
    # # Store the index in memory:
    base_dir = HOMEPATH
    INDEX_DIR = "IndexFiles" + filename + ".index"
    storeDir = os.path.join(base_dir, INDEX_DIR)
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    directory = SimpleFSDirectory(Paths.get(storeDir))
    if firstTime:
        config = index.IndexWriterConfig(analyzer)
        iwriter = index.IndexWriter(directory, config)
        # Keys look like (original_title, preprocessed_title, doc_id)
        # tuples; values are the sentence text — confirm against
        # get_wiki_data.
        for cnt, key in enumerate(wiki_dict.keys()):
            if cnt % 1000 == 0:
                logging.info(
                    'I have preprocessed {} index in creating index by document!'
                    .format(str(cnt)))
            org_title = key[0]
            preprocessed_title = key[1]
            doc_id = key[2]
            sentence = wiki_dict[key]
            doc = create_document_by_document_sentence(org_title,
                                                       preprocessed_title,
                                                       doc_id, sentence)
            iwriter.addDocument(doc)
        iwriter.close()
    logging.info('Finish creating index wiki_sentence!')
    return directory
def irsolver(data_file, index):
    """Answer multiple-choice questions by IR-scoring each option against
    the Lucene index at `index`; returns (question ids, predicted letters)."""
    from questions import get_input_data
    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)
    letters = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}
    idx, ques, ans = get_input_data(data_file)
    pred = []
    for acm, (idq, q, a) in enumerate(zip(idx, ques, ans)):
        best_score = -1000000
        best_ans = 'A'
        # Score every answer option; keep the highest-scoring letter.
        for i, ai in enumerate(a):
            sc = query(q, ai, analyzer, searcher)
            print(acm, i, sc)
            if sc > best_score:
                best_score = sc
                best_ans = letters[i + 1]
        pred.append(best_ans)
    return idx, pred
def func2(command):
    """Search index 'index1' on the 'zhuliao' field and return up to 9
    recipes sorted by collect_num (descending).

    Each result is a tuple: (name, collect_num:int, zhuliao list,
    zuofa steps, img_url, url). Returns None for an empty command.

    Fixes: the bare `except: pass` (which hid every error, including
    KeyboardInterrupt) is narrowed to the exceptions that missing fields
    actually raise, and the Python-2-only `sorted(..., cmp=None)` keyword
    (a TypeError on Python 3) is dropped — `key`/`reverse` already do
    the work.
    """
    STORE_DIR = "index1"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    if command == '':
        return
    query = QueryParser(Version.LUCENE_CURRENT, "zhuliao",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 9).scoreDocs
    res = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        try:
            res.append([
                doc.get("name"),
                doc.get("collect_num"),
                doc.get("zhuliao").split(' '),
                doc.get("zuofa").split('\n'),
                doc.get("img_url"),
                doc.get("url")
            ])
        except AttributeError:
            # doc.get(...) returned None for a field — skip this hit.
            pass
    res1 = []
    for item in res:
        item[1] = int(item[1])
        res1.append(tuple(item))
    return sorted(res1, key=lambda x: x[1], reverse=True)
def __init__(self, indexDir):
    """Open a searcher over `indexDir` and build AND-mode query parsers
    for the 'name' and 'id' fields."""
    self.directory = SimpleFSDirectory(Paths.get(indexDir))
    self.searcher = IndexSearcher(DirectoryReader.open(self.directory))

    def _and_parser(field):
        # Each parser requires all query terms to match (AND semantics).
        parser = QueryParser(field, StandardAnalyzer())
        parser.setDefaultOperator(QueryParser.Operator.AND)
        return parser

    self.nameQueryParser = _and_parser('name')
    self.idQueryParser = _and_parser('id')
def func1(command):
    """Segment `command` with jieba and search the 'contents' field of the
    index in ./index; returns a list of {title, url, sentence} dicts."""
    global vm_env
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    #lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    if command == '':
        return []
    # Space-join jieba tokens so the whitespace analyzer sees them.
    segmented = " ".join(jieba.cut(command))
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(segmented)
    hits = searcher.search(query, 50).scoreDocs
    result = []
    for hit in hits:
        doc = searcher.doc(hit.doc)
        result.append({
            'title': doc.get("title"),
            'url': doc.get("url"),
            "sentence": doc.get("sentence"),
        })
    del searcher
    return result
def process_q_test(q, out_q):
    """Worker loop: consume (qid, query) pairs from queue `q`, retrieve
    document snippets from the robust index, and push (qid, results) onto
    `out_q`.

    Terminates on a "DONE" query or when the module-level exitFlag is set.
    Fix: the bare `except:` (which also swallowed KeyboardInterrupt and
    SystemExit) is narrowed to Exception and now reports the cause.
    """
    lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()
    index = DirectoryReader.open(SimpleFSDirectory(
        Paths.get(robust_index_dir)))
    searcher = IndexSearcher(index)
    searcher.setSimilarity(BM25Similarity())
    analyzer = EnglishAnalyzer()
    qparser = QueryParser("contents", analyzer)
    preprocessor = Preprocess()
    while not exitFlag:
        qid, query = q.get()
        tname = multiprocessing.current_process().name
        # print(tname, qid, query)
        if query == "DONE":
            break
        try:
            # dids, scores = get_lm_matched_docs(query, searcher, qparser, 2000)
            # if len(dids) >= 10:
            #     out_q.put((qid, dids, scores))
            dids_text = get_lm_doc_snippets(query, searcher, qparser,
                                            analyzer, preprocessor)
            out_q.put((qid, dids_text))
        except Exception as e:
            print('%s exception %s, %s: %r' % (tname, qid, query, e))
def wikipedia_indexer(storage, wikipedia_file):
    """Index a tab-separated Wikipedia dump (title<TAB>text per line) into
    a Lucene index at `storage`, skipping disambiguation and malformed
    lines.

    Fixes: Python-2 print statements converted to print(); the input file
    is now closed via `with` and opened in binary mode so the existing
    `.decode('utf-8')` keeps working on Python 3.
    """
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)
    print("%d docs in index" % writer.numDocs())
    print("Reading Documents")
    with open(wikipedia_file, 'rb') as f:
        for i, line in enumerate(f):
            text = line.strip().decode('utf-8').split('\t')
            title = text[0]
            if 'disambigu' in text[0] or len(text) < 2:
                continue
            text = text[1]
            doc = Document()
            doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
            doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
            writer.addDocument(doc)
            if writer.numDocs() % 1000 == 0:
                print("Indexed (%d docs in index) Last %d" % (writer.numDocs(), i))
    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
def main():
    """Index GitHub code snippets with per-field analyzers: KeywordAnalyzer
    by default, JavaCodeAnalyzer for 'code', EnglishAnalyzer for 'comments'.

    Fixes: Python-2 print statement converted to print() and the unused
    INDEX_DIR local removed.
    """
    try:
        print("Indexing...")
        indexDir = File("/Users/Raphael/Downloads/github2")
        #writer = IndexWriter(SimpleFSDirectory(indexDir), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = KeywordAnalyzer()  #PorterAnalyzer( StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDir), config)
        index_code_snippet(writer)
        writer.close()
    except CorruptIndexException as e:  # when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  # when other writer is using the index
        e.printStackTrace()
    except IOException as e:  # when directory can't be read/written
        e.printStackTrace()
def search(self, index_dir):
    """Run self.query against the index at `index_dir` and return the
    matching document paths converted to ints (top self.retrieve_count)."""
    # Open the index and search over a single reader; a searcher should be
    # reused across queries for best performance.
    ireader = DirectoryReader.open(SimpleFSDirectory(File(index_dir)))
    searcher = IndexSearcher(ireader)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
    parsed = queryParser.parse(self.query)
    topDocs = searcher.search(parsed, self.retrieve_count)
    return [int(searcher.doc(sd.doc).get(FIELD_PATH))
            for sd in topDocs.scoreDocs]
def publish_services(self, service_list):
    """Index every WSDL in `service_list` as a bag-of-words document in
    the 'index/' Lucene index, optionally applying semantic expansion."""
    transformer = WSDLTransformer()
    current_document = 1
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(
        Version.LUCENE_CURRENT, EnglishAnalyzer(Version.LUCENE_CURRENT))
    writerConfig.setSimilarity(BM25Similarity())
    index_writer = IndexWriter(indexDir, writerConfig)
    for wsdl in service_list:
        words = transformer.transform(wsdl)
        if self._document_expansion:
            #bag_of_words = ' '.join(self._preprocessor(self._semantic_transformer.transform(transformer.transform(wsdl))))
            words = self._semantic_transformer.transform(words)
        #bag_of_words = ' '.join(self._preprocessor(transformer.transform(wsdl)))
        bag_of_words = ' '.join(words)
        doc = Document()
        doc.add(Field("content", bag_of_words, Field.Store.YES,
                      Field.Index.ANALYZED))
        doc.add(Field("path", wsdl, Field.Store.YES, Field.Index.NO))
        index_writer.addDocument(doc)
        current_document += 1
    index_writer.close()
def func(command):
    """Search the 'Tags' field for `command` and return the hit descriptions
    concatenated into one space-separated string, ordered by the score
    0.6*Likes + 0.4*Views (descending)."""
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index"
    directory = SimpleFSDirectory(File(STORE_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(DirectoryReader.open(directory))
    query = QueryParser(Version.LUCENE_CURRENT, "Tags", analyzer).parse(command)
    scoreDocs = searcher.search(query, 200).scoreDocs
    ranked = {}
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        rank = 0.6 * float(doc.get("Likes")) + 0.4 * float(doc.get("Views"))
        page = doc.get('Page_num')
        parts = [
            page,
            'data/' + page + '.jpg',
            doc.get('Page_link'),
            doc.get('Views'),
            doc.get('Likes'),
            # Replace whitespace in the alt text with underscores.
            '_'.join(doc.get('Img_alt').split()),
        ]
        ranked[' '.join(parts)] = rank
    result = ""
    for entry, _score in sorted(ranked.items(), key=lambda item: item[1],
                                reverse=True):
        result += entry
        result += ' '
    del searcher
    del analyzer
    return result
def __init__(self, folder=None, fields=[], similarity="tfidf"):
    """Set up a Lucene backend: a disk directory when `folder` is given,
    otherwise an in-memory RAMDirectory.

    fields -- objects with .name and .props; each props entry is applied
              to a FieldType through the matching setXxx setter.
    similarity -- ranking model name; stored lower-cased for later use.
    NOTE(review): the mutable default `fields=[]` is shared across calls;
    it is only iterated here, but `fields=None` would be safer.
    """
    self.jcc = lucene.initVM()
    if folder:
        self.directory = SimpleFSDirectory(File(folder))
    else:
        self.directory = RAMDirectory()
    self.fields = {}
    for field in fields:
        ft = FieldType()
        # Map each property name to the FieldType setter by reflection,
        # e.g. props['stored'] -> ft.setStored(value).
        for pname, pvalue in field.props.items():
            setter = getattr(ft, "set" + pname.capitalize())
            setter(pvalue)
        ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        # ft.setOmitNorms(True)
        self.fields[field.name] = ft
    self.similarity = similarity.lower()
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # Writer/searcher are created lazily elsewhere.
    self.writer = None
    self.searcher = None
def retrieve(indexdir, queries):
    """Run Porter-stemmed multi-field queries against the index at
    `indexdir` and write TREC-format result lines to results_lucene.txt.

    queries -- mapping of query id -> raw query string.

    Fixes: Python-2-only `iteritems()` replaced with `items()` (consistent
    with the Python-3 code elsewhere in this file), the output file is
    closed via `with` even on error, and the loop-invariant tokenizer and
    stemmer construction is hoisted out of the query loop.
    """
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
    searcher = IndexSearcher(reader)
    fields = ["title", "abstract", "authors"]
    st = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    MAX = 1000
    with open("results_lucene.txt", "w") as f:
        for qid, q in queries.items():
            # Stem every query token so terms match the stemmed index.
            stemmed = " ".join(st.stem(w) for w in tokenizer.tokenize(q))
            parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                           analyzer)
            parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
            query = MultiFieldQueryParser.parse(parser, stemmed)
            hits = searcher.search(query, MAX)
            for i, hit in enumerate(hits.scoreDocs):
                f.write("%s Q0 %s %s %s G17R3\n"
                        % (qid, hit.doc + 1, i + 1, hit.score))
def main():
    """Build benchmark indices with per-field analyzers and report counts.

    Free-text fields ('code', 'description', 'literals') use a
    Porter-stemming analyzer; identifier-like fields keep their tokens
    verbatim via KeywordAnalyzer.

    Fix: Python-2 print statements converted to print() for consistency
    with the Python-3 code elsewhere in this file.
    """
    try:
        indicesDestination = File(dest_path)
        analyzer = KeywordAnalyzer()
        porter_analyzer = PorterAnalyzer(
            StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "code": porter_analyzer,
            "description": porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": porter_analyzer,
            "word": KeywordAnalyzer()
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
        counter = Counter()
        generate_indices_from_benchmark(writer, counter)
        writer.close()
        print("All jobs are done..")
        print(str(counter))
    except CorruptIndexException as e:  # when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  # when other writer is using the index
        e.printStackTrace()
    except IOException as e:  # when directory can't be read/written
        e.printStackTrace()
def buscar():
    """Flask handler: strip punctuation from the submitted search term, run
    it through Lucene, and render search.html with the matching doc names.

    Fix: six chained .replace() calls collapsed into a single
    str.translate pass removing the same characters (", :, ., ,, ;, ').
    """
    global folder_path, folder_index
    logging.info("Ingresando en la peticion para busqueda")
    # print folder_path
    # print folder_index
    logging.info("palabra buscada: " + request.form['id_entrada'])
    # Remove quote and punctuation characters in one pass.
    palabra = str(request.form['id_entrada']).translate(
        str.maketrans('', '', "\":.,;'"))
    logging.info("Obteniendo ambiente de lucene en busqueda")
    vm_env = lucene.getVMEnv()
    logging.info("Creando hilo en el ambiente en busqueda")
    vm_env.attachCurrentThread()
    #base_dir = os.path.dirname(os.path.abspath(folder_path))
    logging.info("Llamando a SimpleFSDirectory")
    directory = SimpleFSDirectory(Paths.get(INDEX_PATH))
    # directory = SimpleFSDirectory(Paths.get(os.path.join(base_dir, folder_index)))
    logging.info("Llamando a IndexSearcher")
    searcher = IndexSearcher(DirectoryReader.open(directory))
    logging.info("Llamando a StandardAnalyzer")
    analyzer = StandardAnalyzer()
    logging.info("Buscando palabra: " + palabra)
    SearchFiles().buscar(searcher, analyzer, palabra)
    # NOTE(review): results are read from a *second* SearchFiles instance;
    # this only works if getlistanombres reads shared/class state — confirm.
    listanombres = SearchFiles().getlistanombres()
    logging.info("Obteniendo la lista de nombres: " + str(listanombres))
    logging.info("Renderizando template de busqueda con resultado")
    return render_template(
        'search.html', texto=palabra, nombres=listanombres,
        resultado=str("Se encontraron " + str(len(listanombres)) + " documentos!."))
def addLang(self, lang, dataset, analyzer, index_path=None):
    """Register a language: open its index and create the per-language
    searcher and 'context'-field query parser; `lang` becomes current."""
    self.languages.append(lang)
    idxdir = self.get_index(lang, dataset, index_path)
    store = SimpleFSDirectory(Paths.get(idxdir))
    searcher = IndexSearcher(DirectoryReader.open(store))
    searcher.setSimilarity(self.similarity)
    self.searcher[lang] = searcher
    self.parser[lang] = QueryParser("context", analyzers[analyzer]())
    self.lang = lang