def delete(indexDir: str, id: str):
    """Delete an entry and everything nested beneath it from a Lucene index.

    Two delete-by-query requests are issued against the ``id`` field: one
    for terms matching ``id`` itself, and one for terms matching ``id``
    followed by a dot and any suffix. The deletions are committed before
    the writer is closed.

    Args:
        indexDir: Filesystem path of the index directory.
        id: Identifier of the entry to remove. It is handed to
            ``RegexpQuery`` unescaped, so regex metacharacters in it are
            interpreted as such.

    Raises:
        Whatever the underlying Lucene calls raise; pending deletions are
        rolled back before the error propagates.
    """
    index_dir = SimpleFSDirectory(Paths.get(indexDir))
    config = IndexWriterConfig(StandardAnalyzer())
    index_writer = IndexWriter(index_dir, config)
    try:
        delete_term_query = RegexpQuery(Term('id', id))
        delete_reg_query = RegexpQuery(Term('id', id + r'\..*'))
        index_writer.deleteDocuments(delete_term_query)
        index_writer.deleteDocuments(delete_reg_query)
        index_writer.commit()
    except Exception:
        # rollback() discards uncommitted deletions and closes the writer,
        # so a failure neither half-applies the delete nor keeps the
        # index write lock held.
        index_writer.rollback()
        raise
    else:
        index_writer.close()
def query_doc(self):
    """Load the entry ``self._id`` and the entries nested beneath it.

    Three regexp searches are run on the ``id`` field: the id itself
    (at most 1 hit), ``<id>.<digits>`` and ``<id>.<digits>.<digits>``
    (at most 99999 hits each). The merged hits are stored in
    ``self._resDict`` as ``{entry id: {field name: value}}``; a field is
    included only when the stored value is non-empty.

    Returns:
        ``self``, so calls can be chained.
    """
    searcher = self._searcher
    document_id = str(self._id)

    top_docs_doc = searcher.search(
        RegexpQuery(Term('id', document_id)), 1)
    top_docs_section = searcher.search(
        RegexpQuery(Term('id', document_id + r'\.[0-9]+')), 99999)
    top_docs_sentence = searcher.search(
        RegexpQuery(Term('id', document_id + r'\.[0-9]+\.[0-9]+')), 99999)
    top_docs = top_docs_doc.merge(
        1000000, [top_docs_section, top_docs_doc, top_docs_sentence])

    # Stored fields copied into the result, in output order.
    field_names = ('document', 'section', 'author', 'dynasty', 'type',
                   'text', 'color', 'area', 'zhujie', 'detail')

    res_dict = {}
    for hit in top_docs.scoreDocs:
        doc = searcher.doc(hit.doc)
        entry = {}
        for field_name in field_names:
            value = doc.get(field_name)
            if value:
                entry[field_name] = value
        res_dict[doc.get('id')] = entry

    self._resDict = res_dict
    return self
def get_ancient_content(id, dir):
    """Return the last indexed entry nested two levels beneath *id*.

    Searches the ``id`` field for ``<id>.<digits>.<digits>`` (at most 9999
    hits) and returns the stored document of the final hit.

    Args:
        id: Identifier prefix (a string) of the entries to look up.
        dir: Filesystem path of the index directory.

    Returns:
        The Lucene document of the last hit, or ``None`` when nothing
        matches.
    """
    index_dir = SimpleFSDirectory(Paths.get(dir))
    reader = DirectoryReader.open(index_dir)
    try:
        searcher = IndexSearcher(reader)
        query = RegexpQuery(Term('id', id + r'\.[0-9]+\.[0-9]+'))
        score_docs = list(searcher.search(query, 9999).scoreDocs)
        if not score_docs:
            return None
        # NOTE(review): only the final hit is returned, mirroring the
        # previous loop that overwrote `doc` on every iteration — confirm
        # that the last hit (rather than the first, or the joined text of
        # all hits) is what callers expect.
        return searcher.doc(score_docs[-1].doc)
    finally:
        # The stored document is already loaded, so the reader can be
        # released before returning.
        reader.close()
def query_section(self, section):
    """Load the entries under ``self._id`` that belong to *section*.

    An entry matches when its ``id`` field is ``<self._id>.<anything>``
    and its ``section`` field equals *section*. Results are stored in
    ``self._resDict`` as ``{entry id: {field name: value}}`` (non-empty
    stored fields only), plus one summary entry keyed by ``self._id``
    whose ``'document'`` value is the requested section.

    Args:
        section: Exact value of the ``section`` field to match.

    Returns:
        ``self``, so calls can be chained.
    """
    searcher = self._searcher
    query_doc = RegexpQuery(Term('id', str(self._id) + r'\..+'))
    query_section = TermQuery(Term('section', section))
    query = (BooleanQuery.Builder()
             .add(BooleanClause(query_doc, BooleanClause.Occur.MUST))
             .add(BooleanClause(query_section, BooleanClause.Occur.MUST))
             .build())
    top_docs = searcher.search(query, 1000000)

    # Stored fields copied into the result, in output order.
    field_names = ('document', 'section', 'author', 'dynasty', 'type',
                   'text', 'color', 'area', 'zhujie', 'detail')

    res_dict = {}
    for hit in top_docs.scoreDocs:
        doc = searcher.doc(hit.doc)
        entry = {}
        for field_name in field_names:
            value = doc.get(field_name)
            if value:
                entry[field_name] = value
        res_dict[doc.get('id')] = entry

    # Use the caller's argument here; it is no longer overwritten by the
    # per-hit field values read in the loop above.
    res_dict[self._id] = {'document': section}
    self._resDict = res_dict
    return self
def query(self):
    """List every indexed entry whose ``id`` consists only of digits.

    Returns:
        A list of dicts with the keys ``id``, ``document``, ``sections``
        and ``update_user`` (the latter read from the stored ``username``
        field), one per hit, capped at 99999 hits.
    """
    searcher = self._searcher
    digits_only = RegexpQuery(Term('id', '[0-9]+'))
    found = searcher.search(digits_only, 99999)
    entries = (searcher.doc(score_doc.doc) for score_doc in found.scoreDocs)
    return [
        {
            'id': entry.get('id'),
            'document': entry.get('document'),
            'sections': entry.get('sections'),
            'update_user': entry.get('username'),
        }
        for entry in entries
    ]
def new_get_content(dir, id, show_length=300):
    """Fetch one indexed entry together with its preceding and following context.

    The entry is looked up by exact ``id``. Neighbouring entries are then
    collected backwards and forwards until ``list_text_len`` reports at
    least ``show_length`` for each side, and the outermost collected group
    on each side is trimmed back by the excess.

    Args:
        dir: Filesystem path of the index directory.
        id: Dot-separated integer id. Components 1 and 2 are used as
            counters, so at least three components are required
            (presumably document.section.sentence — confirm with callers).
        show_length: Target amount of context per side, as measured by
            ``list_text_len`` (presumably total text length — confirm).

    Returns:
        ``{"prev": ..., "cur": ..., "next": ...}`` where each value is a
        list of groups and each group is a list of dicts mapping stored
        field names to their string values.

    Raises:
        IndexError: if an exact-id lookup returns no hit, or ``id`` has
            fewer than three components.
    """
    index_dir = SimpleFSDirectory(Paths.get(dir))
    searcher = IndexSearcher(DirectoryReader.open(index_dir))
    cur_id_list = [int(x) for x in id.split(".")]
    # Load the requested entry itself.
    query = TermQuery(Term("id", ".".join([str(x) for x in cur_id_list])))
    hit = searcher.search(query, 1)
    doc = searcher.doc(hit.scoreDocs[0].doc)
    info_dict = {}
    for field_info in doc.getFields():
        info_dict[field_info.name()] = field_info.stringValue()
    text_cur = [[info_dict]]
    # ---- Preceding context: walk backwards from the entry before `id`.
    text_prev = []
    cur_id_list[2] -= 1
    while list_text_len(text_prev) < show_length and cur_id_list[1] >= 1:
        para_info = []
        # Collect the remaining entries of the current group, last to first.
        while cur_id_list[2] >= 1:
            query = TermQuery(
                Term("id", ".".join([str(x) for x in cur_id_list])))
            hit = searcher.search(query, 1)
            doc = searcher.doc(hit.scoreDocs[0].doc)
            info_dict = {}
            for field_info in doc.getFields():
                info_dict[field_info.name()] = field_info.stringValue()
            para_info.insert(0, info_dict)
            cur_id_list[2] -= 1
        text_prev.insert(0, para_info)
        # Step to the previous group and find its highest third component
        # so the next pass starts from that group's last entry.
        cur_id_list[1] -= 1
        query = RegexpQuery(
            Term("id", str(cur_id_list[0]) + "\\."
                 + str(cur_id_list[1]) + "\\..+"))
        hits = searcher.search(query, 99999)
        s_id_max = 1
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            s_id = int(doc.get("id").split(".")[2])
            if s_id > s_id_max:
                s_id_max = s_id
        cur_id_list[2] = s_id_max
    if len(text_prev) > 0 and list_text_len(text_prev) >= show_length:
        # Trim the excess from the start of the earliest collected group.
        over_count = list_text_len(text_prev) - show_length
        len_count = 0
        para_count = 0
        new_para = []
        while True:
            text = text_prev[0][para_count]["text"]
            prev_len = len_count
            len_count += len(text)
            if len_count > over_count:
                # This entry straddles the cut: drop its leading part.
                text = text[over_count - prev_len:]
                # Fix up the annotation ("zhujie") offsets after the cut.
                if "zhujie" in text_prev[0][para_count].keys():
                    zj = json.loads(text_prev[0][para_count]["zhujie"])
                    new_zj_offset = zj["offset"][:]
                    new_zj_content = zj["content"][:]
                    # `count` indexes the copies, which shrink as
                    # annotations inside the removed prefix are dropped.
                    count = 0
                    for i in range(len(zj["offset"])):
                        new_offset = zj["offset"][i] - (over_count - prev_len)
                        if new_offset < 0:
                            new_zj_offset.pop(count)
                            new_zj_content.pop(count)
                            count -= 1
                        else:
                            new_zj_offset[count] = new_offset
                        count += 1
                    text_prev[0][para_count]["zhujie"] = json.dumps({
                        "offset": new_zj_offset,
                        "content": new_zj_content
                    })
                text_prev[0][para_count]["text"] = text
                break
            para_count += 1
        # Keep the straddling entry and everything after it.
        # NOTE(review): insert(0, ...) stores the kept entries in reverse
        # order — confirm this is intended rather than append().
        while para_count < len(text_prev[0]):
            new_para.insert(0, text_prev[0][para_count])
            para_count += 1
        text_prev[0] = new_para
    # ---- Following context: walk forwards from the entry after `id`.
    cur_id_list = [int(x) for x in id.split(".")]
    text_next = []
    cur_id_list[2] += 1
    while list_text_len(text_next) < show_length:
        query = RegexpQuery(
            Term("id", str(cur_id_list[0]) + "\\."
                 + str(cur_id_list[1]) + "\\..+"))
        hits = searcher.search(query, 100)
        s_id_max = 1
        # No entries in this group: nothing further to collect.
        # NOTE(review): comparing totalHits with an int assumes a Lucene
        # version where totalHits is a plain number — confirm.
        if hits.totalHits < 1:
            break
        # Highest third component among the (at most 100) returned entries.
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            s_id = int(doc.get("id").split(".")[2])
            if s_id > s_id_max:
                s_id_max = s_id
        para_info = []
        while cur_id_list[2] <= s_id_max:
            query = TermQuery(
                Term("id", ".".join([str(x) for x in cur_id_list])))
            hit = searcher.search(query, 1)
            doc = searcher.doc(hit.scoreDocs[0].doc)
            info_dict = {}
            for field_info in doc.getFields():
                info_dict[field_info.name()] = field_info.stringValue()
            para_info.append(info_dict)
            cur_id_list[2] += 1
        text_next.append(para_info)
        # Continue with the first entry of the next group.
        cur_id_list[2] = 1
        cur_id_list[1] += 1
    if len(text_next) > 0 and list_text_len(text_next) >= show_length:
        # Trim the excess from the end of the last collected group,
        # scanning its entries from last to first.
        over_count = list_text_len(text_next) - show_length
        len_count = 0
        para_count = len(text_next[-1]) - 1
        new_para = []
        while True:
            text = text_next[-1][para_count]["text"]
            len_count += len(text)
            if len_count > over_count:
                # This entry straddles the cut: keep only its leading part.
                text = text[:len_count - over_count]
                # Drop annotations ("zhujie") that fall past the cut.
                if "zhujie" in text_next[-1][para_count].keys():
                    zj = json.loads(text_next[-1][para_count]["zhujie"])
                    new_zj_offset = zj["offset"][:]
                    new_zj_content = zj["content"][:]
                    count = 0
                    for i in range(len(zj["offset"])):
                        if zj["offset"][i] >= len(text):
                            new_zj_offset.pop(count)
                            new_zj_content.pop(count)
                            count -= 1
                        count += 1
                    text_next[-1][para_count]["zhujie"] = json.dumps({
                        "offset": new_zj_offset,
                        "content": new_zj_content
                    })
                text_next[-1][para_count]["text"] = text
                break
            para_count -= 1
        # Keep everything up to and including the straddling entry.
        para_count_new = 0
        while para_count_new <= para_count:
            new_para.append(text_next[-1][para_count_new])
            para_count_new += 1
        text_next[-1] = new_para
    # These reassignments leave the (already empty) lists unchanged.
    if len(text_prev) == 0:
        text_prev = []
    if len(text_cur) == 0:
        text_cur = []
    if len(text_next) == 0:
        text_next = []
    return {"prev": text_prev, "cur": text_cur, "next": text_next}
def search(self, field):
    """Run the parsed user query against *field* and keep regex-confirmed hits.

    Each flag returned by ``getFlagsList()`` is either a word (``type ==
    "word"``, whose ``content`` is a sequence of characters) or an operator
    key that is looked up in ``getFlagsInfo()`` for its ``op`` and ``num``.
    Words become span queries and are chained in order; when an operator
    sits directly before a word, the last element of its ``num`` is used as
    the slop, otherwise the slop is 0. Field filters from ``getFields()``
    are added as mandatory regexp clauses. Every hit is then re-checked by
    matching the pattern from ``get_test_reg`` against its stored ``text``.

    Args:
        field: Name of the index field the word queries run against.

    Returns:
        ``self``. ``self._res`` holds the Lucene doc ids that passed the
        regex check, ``self._reg`` the pattern, and ``self._cur_field``
        the searched field.
    """
    s = self._search
    u = self._userQuery
    zh_to_hant_dict = self._zh_to_hant_dict
    info = u.getFlagsInfo()
    flags_list = u.getFlagsList()

    def char_query(char):
        # A character listed in the variant table matches itself or any of
        # its mapped variants; any other character matches only itself.
        if char in zh_to_hant_dict:
            variants = [SpanTermQuery(Term(field, char))]
            for hant in zh_to_hant_dict[char]:
                variants.append(SpanTermQuery(Term(field, hant)))
            return SpanOrQuery(variants)
        return SpanTermQuery(Term(field, char))

    # sq_list mirrors flags_list: a span query per word, a dict per operator.
    sq_list = []
    word_index_list = []
    for index_count, flag in enumerate(flags_list):
        content = flag["content"]
        if flag["type"] == "word":
            word_index_list.append(index_count)
            if len(content) == 1:
                sq_list.append(char_query(content[0]))
            else:
                # Multi-character word: the characters must be adjacent
                # and in order.
                sq_list.append(SpanNearQuery(
                    [char_query(w) for w in content], 0, True))
        else:
            sq_list.append({
                "op": info[content]["op"],
                "num": info[content]["num"]
            })

    # Chain the words left to right.
    q = None
    for position, index in enumerate(word_index_list):
        if position == 0:
            q = sq_list[index]
            continue
        previous = sq_list[index - 1]
        slop = previous["num"][-1] if isinstance(previous, dict) else 0
        q = SpanNearQuery([q, sq_list[index]], slop, True)

    # Field filters.
    filters = u.getFields()
    bq = BooleanQuery.Builder()
    if q is not None:
        # A query made only of filters has no word clause to add.
        bq.add(BooleanClause(q, BooleanClause.Occur.MUST))
    for key in filters.keys():
        alternatives = "".join(ft + '|' for ft in filters[key])
        # Drop the trailing '|' (or the '(' when there are no values).
        cur_reg = ('(' + alternatives)[0:-1] + ')'
        rq = RegexpQuery(Term(key, cur_reg))
        bq.add(BooleanClause(rq, BooleanClause.Occur.MUST))
    query = bq.build()

    top_docs = s.search(query, 9999)
    self._cur_field = field
    reg = get_test_reg(flags_list, info, zh_to_hant_dict)

    doc_id_list = []
    for hit in top_docs.scoreDocs:
        text = s.doc(hit.doc).get("text")
        # A hit without stored text cannot be confirmed, so it is skipped
        # instead of crashing re.search().
        if text is not None and re.search(reg, text):
            doc_id_list.append(hit.doc)

    self._res = doc_id_list
    self._reg = reg
    return self
def search(self, field):
    """Translate the parsed user query into a Lucene query and execute it.

    The first operator key returned by ``getKey()`` selects the query shape:

    * no words: only the field filters are applied;
    * no keys: a single term query for the first word;
    * ``#``: both words within ``nums[0]`` positions, in any order;
    * ``+`` / ``$``: the words chained in order; ``+`` puts exactly
      ``nums[i]`` single-term wildcards between neighbours, ``$`` allows a
      slop of ``nums[i]``;
    * ``-`` / ``~``: the first word, excluding documents where the two
      words occur in order (``-``) or in reverse order (``~``) within
      ``nums[0] - 1`` positions.

    Field filters from ``getFields()`` are added as mandatory regexp
    clauses.

    Args:
        field: Name of the index field the word queries run against.

    Returns:
        ``self``; the search result (at most 100000 hits) is stored in
        ``self._res`` and the field name in ``self._cur_field``.

    Raises:
        ValueError: if the first operator key is not recognised.
    """
    s = self._search
    u = self._userQuery
    z = self._zh_to_hant_dict
    keys = u.getKey()
    nums = u.getNum()
    word_list = u.getWordList()
    filters = u.getFields()

    if len(word_list) == 0:
        # Filters only.
        query = None
    elif len(keys) == 0:
        # Plain single term.
        query = simple_term_to_query(field, word_list[0], z)
    elif keys[0] == '#':
        first = simple_term_to_query(field, word_list[0], z)
        second = simple_term_to_query(field, word_list[1], z)
        query = SpanNearQuery([first, second], int(nums[0]), False)
    elif keys[0] in ('+', '$'):
        chained = simple_term_to_query(field, word_list[0], z)
        for i, key in enumerate(keys):
            following = simple_term_to_query(field, word_list[i + 1], z)
            if key == '+':
                # Exactly nums[i] arbitrary single-character terms in between.
                gap = [
                    SpanMultiTermQueryWrapper(RegexpQuery(Term(field, '.')))
                    for _ in range(int(nums[i]))
                ]
                chained = SpanNearQuery([chained] + gap + [following], 0, True)
            else:
                chained = SpanNearQuery(
                    [chained, following], int(nums[i]), True)
        query = chained
    elif keys[0] in ('-', '~'):
        first = simple_term_to_query(field, word_list[0], z)
        second = simple_term_to_query(field, word_list[1], z)
        ordered = [first, second] if keys[0] == '-' else [second, first]
        excluded = SpanNearQuery(ordered, int(nums[0]) - 1, True)
        required_clause = BooleanClause(first, BooleanClause.Occur.MUST)
        excluded_clause = BooleanClause(excluded, BooleanClause.Occur.MUST_NOT)
        query = (BooleanQuery.Builder()
                 .add(required_clause)
                 .add(excluded_clause)
                 .build())
    else:
        raise ValueError("检索语句错误!")

    # Field filters.
    combined = BooleanQuery.Builder()
    if query:
        combined.add(BooleanClause(query, BooleanClause.Occur.MUST))
    for key in filters.keys():
        alternatives = ''.join(ft + '|' for ft in filters[key])
        # Drop the trailing '|' (or the '(' when there are no values).
        cur_reg = ('(' + alternatives)[0:-1] + ')'
        combined.add(BooleanClause(
            RegexpQuery(Term(key, cur_reg)), BooleanClause.Occur.MUST))
    query = combined.build()

    self._res = s.search(query, 100000)
    self._cur_field = field
    return self
from org.apache.lucene.queryparser.classic import QueryParser
# noinspection PyUnresolvedReferences
from org.apache.lucene.analysis.standard import StandardAnalyzer

if __name__ == "__main__":
    # Ad-hoc exploration of an existing index: term frequency, regexp,
    # span and phrase queries against the "contents" field.
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    # Raw strings keep the backslashes literal without relying on Python
    # passing unknown escape sequences through unchanged.
    file = Paths.get(r"D:\GitHubD\BREDS\wiki_text_index\WIKI_TEXT")
    dir = FSDirectory.open(file)
    reader = DirectoryReader.open(dir)
    searcher = IndexSearcher(reader)

    term = Term("contents", "tiger")
    print(f'Tiger frequency: {reader.totalTermFreq(term)}')

    # Terms that look like integers or decimals.
    q_regex = RegexpQuery(Term("contents", r"[0-9]+\.?[0-9]*"))
    print(f'regex results: {searcher.search(q_regex,1000000).totalHits}')

    # A number-like term followed by "tiger" with a slop of 20, in order.
    span1 = SpanMultiTermQueryWrapper(q_regex)
    span2 = SpanMultiTermQueryWrapper(RegexpQuery(Term("contents", "tiger")))
    spannearquery = SpanNearQuery([span1, span2], 20, True)
    print(
        f'spanquery results: {searcher.search(spannearquery, 1000000).totalHits}'
    )

    parser = QueryParser('contents', StandardAnalyzer())
    q = parser.parse('"tiger leopard"')
    print(q)  # prints contents:"tiger leopard"
    print(searcher.search(q, 10000000).totalHits)

    # NOTE(review): 'tiger leopard' is passed as one term here; if the
    # field is tokenised per word, the two words probably need to be
    # passed as separate terms — confirm.
    phrase_query = PhraseQuery(10, 'contents', 'tiger leopard')