Exemple #1
0
def delete(indexDir: str, id: str):
    """Delete an entry and all of its dotted sub-entries from a Lucene index.

    Two regular-expression deletions are issued against the ``id`` field:
    one using ``id`` itself as the pattern, and one matching ``id`` followed
    by a literal dot and any suffix.

    Args:
        indexDir: Filesystem path of the Lucene index directory.
        id: Identifier to delete. It is interpolated into the regular
            expressions unescaped, so regex metacharacters in it are
            interpreted as such.

    Raises:
        Whatever the underlying Lucene calls raise; pending changes are
        rolled back before the exception is propagated.
    """
    index_dir = SimpleFSDirectory(Paths.get(indexDir))
    config = IndexWriterConfig(StandardAnalyzer())
    index_writer = IndexWriter(index_dir, config)
    try:
        index_writer.deleteDocuments(RegexpQuery(Term('id', id)))
        # Raw string: the backslash must reach Lucene's regexp parser as-is.
        index_writer.deleteDocuments(RegexpQuery(Term('id', id + r'\..*')))
        index_writer.commit()
    except Exception:
        # rollback() discards uncommitted deletes and closes the writer, so
        # the index write lock is released even when a deletion fails.
        index_writer.rollback()
        raise
    else:
        index_writer.close()
Exemple #2
0
 def query_doc(self):
     """Load the document entry and its section/paragraph entries.

     Three regular-expression searches are run on the ``id`` field: the
     document id itself, ``<id>.<n>`` and ``<id>.<n>.<n>``. The merged hits
     are stored in ``self._resDict`` as ``{entry id: {field: value}}``,
     keeping only the stored fields that have a non-empty value.

     Returns:
         ``self``, so calls can be chained.
     """
     searcher = self._searcher
     document_id = str(self._id)

     top_docs_doc = searcher.search(
         RegexpQuery(Term('id', document_id)), 1)
     # Raw strings keep the backslashes intact for Lucene's regexp parser.
     top_docs_section = searcher.search(
         RegexpQuery(Term('id', document_id + r'\.[0-9]+')), 99999)
     top_docs_sentence = searcher.search(
         RegexpQuery(Term('id', document_id + r'\.[0-9]+\.[0-9]+')), 99999)
     top_docs = top_docs_doc.merge(
         1000000, [top_docs_section, top_docs_doc, top_docs_sentence])

     # Stored fields copied into each result entry, in output order.
     field_names = ('document', 'section', 'author', 'dynasty', 'type',
                    'text', 'color', 'area', 'zhujie', 'detail')
     res_dict = {}
     for hit in top_docs.scoreDocs:
         doc = searcher.doc(hit.doc)
         entry = {}
         for name in field_names:
             value = doc.get(name)
             if value:
                 entry[name] = value
         res_dict[doc.get('id')] = entry
     self._resDict = res_dict
     return self
Exemple #3
0
def get_ancient_content(id, dir):
    """Return the last indexed entry whose id matches ``<id>.<n>.<n>``.

    Args:
        id: Document id prefix; interpolated unescaped into a Lucene regular
            expression.
        dir: Filesystem path of the Lucene index directory.

    Returns:
        The stored Lucene document of the last hit (at most 9999 hits are
        examined), or ``None`` when nothing matches.
    """
    # NOTE(review): the original declared an ``all_text`` accumulator that
    # was never filled or returned; the return value (last matching
    # document) is kept unchanged for existing callers.
    index_dir = SimpleFSDirectory(Paths.get(dir))
    reader = DirectoryReader.open(index_dir)
    try:
        searcher = IndexSearcher(reader)
        query = RegexpQuery(Term('id', id + r'\.[0-9]+\.[0-9]+'))
        doc = None  # stays None when there are no hits
        for hit in searcher.search(query, 9999).scoreDocs:
            doc = searcher.doc(hit.doc)
        return doc
    finally:
        # Stored fields are already loaded into ``doc``; release the reader.
        reader.close()
Exemple #4
0
 def query_section(self, section):
     """Load every sub-entry of this document that belongs to ``section``.

     The query requires both an ``id`` of the form ``<self._id>.<suffix>``
     and an exact ``section`` term. Results are stored in ``self._resDict``
     as ``{entry id: {field: value}}`` (non-empty stored fields only), plus
     one entry keyed by ``self._id`` whose ``'document'`` value is the
     requested section.

     Args:
         section: Section term to match.

     Returns:
         ``self``, so calls can be chained.
     """
     searcher = self._searcher
     query_doc = RegexpQuery(Term('id', self._id + r'\..+'))
     query_section = TermQuery(Term('section', section))
     builder = BooleanQuery.Builder()
     builder = builder.add(BooleanClause(query_doc, BooleanClause.Occur.MUST))
     builder = builder.add(
         BooleanClause(query_section, BooleanClause.Occur.MUST))
     top_docs = searcher.search(builder.build(), 1000000)

     # Stored fields copied into each result entry, in output order.
     field_names = ('document', 'section', 'author', 'dynasty', 'type',
                    'text', 'color', 'area', 'zhujie', 'detail')
     res_dict = {}
     for hit in top_docs.scoreDocs:
         doc = searcher.doc(hit.doc)
         entry = {}
         for name in field_names:
             value = doc.get(name)
             if value:
                 entry[name] = value
         res_dict[doc.get('id')] = entry
     # Use the requested section itself: the loop no longer rebinds the
     # ``section`` parameter, so this cannot pick up the last hit's stored
     # value (or None) by accident.
     res_dict[self._id] = {'document': section}
     self._resDict = res_dict
     return self
Exemple #5
0
 def query(self):
     """List summary records for every entry whose id is all digits.

     Returns:
         A list of dicts with the keys ``'id'``, ``'document'``,
         ``'sections'`` and ``'update_user'`` (the latter read from the
         stored ``'username'`` field), one per hit, at most 99999 hits.
     """
     searcher = self._searcher
     numeric_ids = RegexpQuery(Term('id', '[0-9]+'))
     top_docs = searcher.search(numeric_ids, 99999)

     def to_record(stored):
         # Project one stored document onto the summary shape.
         return {
             'id': stored.get('id'),
             'document': stored.get('document'),
             'sections': stored.get('sections'),
             'update_user': stored.get('username'),
         }

     return [
         to_record(searcher.doc(score_doc.doc))
         for score_doc in top_docs.scoreDocs
     ]
Exemple #6
0
def new_get_content(dir, id, show_length=300):
    """Fetch one indexed entry together with the entries before and after it.

    Args:
        dir: Filesystem path of the Lucene index directory.
        id: Dot-separated numeric id. Components 0, 1 and 2 are used, so at
            least three components are required.
        show_length: Threshold compared against ``list_text_len`` of the
            collected preceding / following entries (presumably a character
            count -- ``list_text_len`` is defined elsewhere; confirm).

    Returns:
        A dict with the keys ``"prev"``, ``"cur"`` and ``"next"``. Each value
        is a list of groups; each group is a list of dicts mapping stored
        field names to their string values. ``"cur"`` holds exactly one group
        with the one entry matching ``id``.
    """
    index_dir = SimpleFSDirectory(Paths.get(dir))
    searcher = IndexSearcher(DirectoryReader.open(index_dir))
    # Integer components of the id; components 1 and 2 are stepped below to
    # walk to neighbouring entries.
    cur_id_list = [int(x) for x in id.split(".")]
    # Exact lookup of the requested entry (normalised through int -> str).
    query = TermQuery(Term("id", ".".join([str(x) for x in cur_id_list])))
    hit = searcher.search(query, 1)
    # NOTE(review): no check for an empty result before indexing scoreDocs.
    doc = searcher.doc(hit.scoreDocs[0].doc)
    info_dict = {}
    for field_info in doc.getFields():
        info_dict[field_info.name()] = field_info.stringValue()
    text_cur = [[info_dict]]

    # ---- Preceding context: walk backwards from the entry before ``id``.
    text_prev = []
    cur_id_list[2] -= 1
    while list_text_len(text_prev) < show_length and cur_id_list[1] >= 1:
        para_info = []
        # Collect the remaining entries of the current group, counting the
        # third component down to 1; insert(0, ...) keeps ascending order.
        while cur_id_list[2] >= 1:
            query = TermQuery(
                Term("id", ".".join([str(x) for x in cur_id_list])))
            hit = searcher.search(query, 1)
            doc = searcher.doc(hit.scoreDocs[0].doc)
            info_dict = {}
            for field_info in doc.getFields():
                info_dict[field_info.name()] = field_info.stringValue()
            para_info.insert(0, info_dict)
            cur_id_list[2] -= 1
        text_prev.insert(0, para_info)
        # Step to the previous group and find its highest third component so
        # the next iteration starts from that group's last entry.
        cur_id_list[1] -= 1
        query = RegexpQuery(
            Term("id",
                 str(cur_id_list[0]) + "\\." + str(cur_id_list[1]) + "\\..+"))
        hits = searcher.search(query, 99999)
        # Defaults to 1 when the group has no hits.
        s_id_max = 1
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            s_id = int(doc.get("id").split(".")[2])
            if s_id > s_id_max:
                s_id_max = s_id
        cur_id_list[2] = s_id_max
    # ---- Trim the earliest preceding group down to ``show_length``.
    if len(text_prev) > 0 and list_text_len(text_prev) >= show_length:
        over_count = list_text_len(text_prev) - show_length
        len_count = 0
        para_count = 0
        new_para = []
        # Scan the first group from its start until the accumulated text
        # length exceeds the surplus; that entry is cut at its front.
        while True:
            text = text_prev[0][para_count]["text"]
            prev_len = len_count
            len_count += len(text)
            if len_count > over_count:
                # Drop the leading (over_count - prev_len) characters.
                text = text[over_count - prev_len:]
                # Fix up the annotation ("zhujie") offsets after the cut.
                if "zhujie" in text_prev[0][para_count].keys():
                    zj = json.loads(text_prev[0][para_count]["zhujie"])
                    new_zj_offset = zj["offset"][:]
                    new_zj_content = zj["content"][:]
                    # ``count`` tracks the position in the shrinking copies,
                    # ``i`` indexes the original lists.
                    count = 0
                    for i in range(len(zj["offset"])):
                        new_offset = zj["offset"][i] - (over_count - prev_len)
                        if new_offset < 0:
                            # Annotation fell inside the removed prefix.
                            new_zj_offset.pop(count)
                            new_zj_content.pop(count)
                            count -= 1
                        else:
                            new_zj_offset[count] = new_offset
                        count += 1
                    text_prev[0][para_count]["zhujie"] = json.dumps({
                        "offset":
                        new_zj_offset,
                        "content":
                        new_zj_content
                    })
                text_prev[0][para_count]["text"] = text
                break
            para_count += 1
        # Keep the cut entry and everything after it.
        # NOTE(review): insert(0, ...) reverses the order of the kept
        # entries relative to text_prev[0] -- confirm this is intended.
        while para_count < len(text_prev[0]):
            new_para.insert(0, text_prev[0][para_count])
            para_count += 1
        text_prev[0] = new_para
    # ---- Following context: walk forwards from the entry after ``id``.
    cur_id_list = [int(x) for x in id.split(".")]
    text_next = []
    cur_id_list[2] += 1
    while list_text_len(text_next) < show_length:
        query = RegexpQuery(
            Term("id",
                 str(cur_id_list[0]) + "\\." + str(cur_id_list[1]) + "\\..+"))
        # NOTE(review): only 100 hits are requested here, whereas the
        # backward walk requests 99999 -- s_id_max may be too low for groups
        # with more than 100 entries; confirm.
        hits = searcher.search(query, 100)
        s_id_max = 1
        # Stop when the group does not exist.
        if hits.totalHits < 1:
            break
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            s_id = int(doc.get("id").split(".")[2])
            if s_id > s_id_max:
                s_id_max = s_id
        para_info = []
        # Collect the entries of this group from the current third component
        # up to the highest one found.
        while cur_id_list[2] <= s_id_max:
            query = TermQuery(
                Term("id", ".".join([str(x) for x in cur_id_list])))
            hit = searcher.search(query, 1)
            doc = searcher.doc(hit.scoreDocs[0].doc)
            info_dict = {}
            for field_info in doc.getFields():
                info_dict[field_info.name()] = field_info.stringValue()
            para_info.append(info_dict)
            # para_text += doc.get("text")
            cur_id_list[2] += 1
        text_next.append(para_info)
        # Continue with the first entry of the next group.
        cur_id_list[2] = 1
        cur_id_list[1] += 1
    # ---- Trim the last following group down to ``show_length``.
    if len(text_next) > 0 and list_text_len(text_next) >= show_length:
        over_count = list_text_len(text_next) - show_length
        len_count = 0
        para_count = len(text_next[-1]) - 1
        new_para = []
        # Scan the last group from its end until the accumulated text length
        # exceeds the surplus; that entry is cut at its tail.
        while True:
            text = text_next[-1][para_count]["text"]
            len_count += len(text)
            if len_count > over_count:
                # Keep only the first (len_count - over_count) characters.
                text = text[:len_count - over_count]
                # Fix up the annotation ("zhujie") offsets after the cut.
                if "zhujie" in text_next[-1][para_count].keys():
                    zj = json.loads(text_next[-1][para_count]["zhujie"])
                    new_zj_offset = zj["offset"][:]
                    new_zj_content = zj["content"][:]
                    count = 0
                    for i in range(len(zj["offset"])):
                        # Drop annotations located at or past the new end.
                        if zj["offset"][i] >= len(text):
                            new_zj_offset.pop(count)
                            new_zj_content.pop(count)
                            count -= 1
                        count += 1
                    text_next[-1][para_count]["zhujie"] = json.dumps({
                        "offset":
                        new_zj_offset,
                        "content":
                        new_zj_content
                    })
                text_next[-1][para_count]["text"] = text
                break
            para_count -= 1
        # Keep the entries from the start of the group through the cut one.
        para_count_new = 0
        while para_count_new <= para_count:
            new_para.append(text_next[-1][para_count_new])
            para_count_new += 1
        text_next[-1] = new_para
    # These three checks replace an empty list with an empty list; they have
    # no effect on the result.
    if len(text_prev) == 0:
        text_prev = []
    if len(text_cur) == 0:
        text_cur = []
    if len(text_next) == 0:
        text_next = []
    return {"prev": text_prev, "cur": text_cur, "next": text_next}
Exemple #7
0
    def search(self, field):
        """Run the parsed user query against ``field`` and keep regex-confirmed hits.

        Each ``"word"`` flag becomes a span query (each character widened
        with the alternatives listed for it in ``self._zh_to_hant_dict``).
        Consecutive words are chained with ordered ``SpanNearQuery`` objects
        whose slop comes from the operator flag placed directly before the
        word, or 0 when there is none. Field filters are added as mandatory
        regular-expression clauses. Every hit's stored ``text`` is then
        checked against the pattern returned by ``get_test_reg``.

        Side effects:
            Sets ``self._cur_field``, ``self._res`` (list of Lucene doc ids
            that passed the regex check) and ``self._reg``.

        Args:
            field: Name of the indexed field to search.

        Returns:
            ``self``, so calls can be chained.
        """
        s = self._search
        u = self._userQuery
        zh_to_hant_dict = self._zh_to_hant_dict
        info = u.getFlagsInfo()
        flags_list = u.getFlagsList()

        def char_query(char):
            # Span query for one character, OR-ed with its mapped variants.
            base = SpanTermQuery(Term(field, char))
            if char not in zh_to_hant_dict:
                return base
            variants = [
                SpanTermQuery(Term(field, hant))
                for hant in zh_to_hant_dict[char]
            ]
            return SpanOrQuery([base] + variants)

        # sq_list mirrors flags_list: a span query per word flag, a dict
        # with "op"/"num" per operator flag.
        sq_list = []
        word_index_list = []
        for index, flag in enumerate(flags_list):
            if flag["type"] == "word":
                word_index_list.append(index)
                content = flag["content"]
                if len(content) == 1:
                    sq_list.append(char_query(content[0]))
                else:
                    # Multi-character word: characters must be adjacent and
                    # in order.
                    sq_list.append(
                        SpanNearQuery([char_query(w) for w in content], 0,
                                      True))
            else:
                sq_list.append({
                    "op": info[flag["content"]]["op"],
                    "num": info[flag["content"]]["num"]
                })

        # Chain the words left to right.
        q = None
        for position, index in enumerate(word_index_list):
            if position == 0:
                q = sq_list[index]
                continue
            separator = sq_list[index - 1]
            slop = separator["num"][-1] if isinstance(separator, dict) else 0
            q = SpanNearQuery([q, sq_list[index]], slop, True)

        # Filter clauses.
        filters = u.getFields()
        bq = BooleanQuery.Builder()
        if q is not None:
            # Without word flags there is no span query; passing None to
            # BooleanClause would fail, so only the filters apply.
            bq.add(BooleanClause(q, BooleanClause.Occur.MUST))
        for key in filters.keys():
            alternatives = list(filters[key])
            if not alternatives:
                # An empty alternative list previously produced the invalid
                # pattern ')'; treat it as "no restriction on this field".
                continue
            rq = RegexpQuery(Term(key, '(' + '|'.join(alternatives) + ')'))
            bq.add(BooleanClause(rq, BooleanClause.Occur.MUST))
        top_docs = s.search(bq.build(), 9999)
        self._cur_field = field

        # Confirm each candidate against the regular expression built from
        # the same flags.
        reg = get_test_reg(flags_list, info, zh_to_hant_dict)
        doc_id_list = []
        for hit in top_docs.scoreDocs:
            text = s.doc(hit.doc).get("text")
            if re.search(reg, text):
                doc_id_list.append(hit.doc)
        self._res = doc_id_list
        self._reg = reg
        return self
Exemple #8
0
 def search(self, field):
     """Translate the parsed user query into a Lucene query and run it.

     The first operator key selects the query shape:

     * no words: only the field filters are applied;
     * no keys: a single term query for the first word;
     * ``'#'``: unordered ``SpanNearQuery`` of two words, slop ``nums[0]``;
     * ``'+'`` / ``'$'``: words chained left to right; ``'+'`` puts exactly
       ``nums[i]`` single-character wildcard terms between the two sides,
       any other key in the chain uses an ordered ``SpanNearQuery`` with
       slop ``nums[i]``;
     * ``'-'`` / ``'~'``: the first word must match, but not within
       ``nums[0] - 1`` positions before (``'-'``) or after (``'~'``) the
       second word.

     Field filters are added as mandatory regular-expression clauses.

     Side effects:
         Sets ``self._res`` (the search result, at most 100000 hits) and
         ``self._cur_field``.

     Args:
         field: Name of the indexed field to search.

     Returns:
         ``self``, so calls can be chained.

     Raises:
         ValueError: If the first operator key is not recognised.
     """
     s = self._search
     u = self._userQuery
     z = self._zh_to_hant_dict
     keys = u.getKey()
     nums = u.getNum()
     word_list = u.getWordList()
     filters = u.getFields()
     if len(word_list) == 0:
         # Filter-only search.
         query = None
     elif len(keys) == 0:
         # Simple term.
         query = simple_term_to_query(field, word_list[0], z)
     elif keys[0] == '#':
         query_left = simple_term_to_query(field, word_list[0], z)
         query_right = simple_term_to_query(field, word_list[1], z)
         query = SpanNearQuery([query_left, query_right], int(nums[0]),
                               False)
     elif keys[0] in ('+', '$'):
         prev_query = simple_term_to_query(field, word_list[0], z)
         for i in range(len(keys)):
             cur_query = simple_term_to_query(field, word_list[i + 1], z)
             if keys[i] == '+':
                 # Exactly nums[i] arbitrary single-character terms in
                 # between, all adjacent and in order.
                 gap = [
                     SpanMultiTermQueryWrapper(
                         RegexpQuery(Term(field, '.')))
                     for _ in range(int(nums[i]))
                 ]
                 prev_query = SpanNearQuery(
                     [prev_query] + gap + [cur_query], 0, True)
             else:
                 prev_query = SpanNearQuery([prev_query, cur_query],
                                            int(nums[i]), True)
         query = prev_query
     elif keys[0] in ('-', '~'):
         query_left = simple_term_to_query(field, word_list[0], z)
         query_right = simple_term_to_query(field, word_list[1], z)
         if keys[0] == '-':
             n_q_list = [query_left, query_right]
         else:
             n_q_list = [query_right, query_left]
         n_query = SpanNearQuery(n_q_list, int(nums[0]) - 1, True)
         exclusion = BooleanQuery.Builder()
         exclusion = exclusion.add(
             BooleanClause(query_left, BooleanClause.Occur.MUST))
         exclusion = exclusion.add(
             BooleanClause(n_query, BooleanClause.Occur.MUST_NOT))
         query = exclusion.build()
     else:
         raise ValueError("检索语句错误!")
     # Filter clauses.
     bq = BooleanQuery.Builder()
     if query is not None:
         bq.add(BooleanClause(query, BooleanClause.Occur.MUST))
     for key in filters.keys():
         alternatives = list(filters[key])
         if not alternatives:
             # An empty alternative list previously produced the invalid
             # pattern ')'; treat it as "no restriction on this field".
             continue
         rq = RegexpQuery(Term(key, '(' + '|'.join(alternatives) + ')'))
         bq.add(BooleanClause(rq, BooleanClause.Occur.MUST))
     self._res = s.search(bq.build(), 100000)
     self._cur_field = field
     return self
Exemple #9
0
from org.apache.lucene.queryparser.classic import QueryParser
# noinspection PyUnresolvedReferences
from org.apache.lucene.analysis.standard import StandardAnalyzer

if __name__ == "__main__":
    # Ad-hoc experiment: compare term, regexp, span and phrase queries on a
    # local index.
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    # Raw string: the backslashes are path separators, not escape sequences.
    file = Paths.get(r"D:\GitHubD\BREDS\wiki_text_index\WIKI_TEXT")
    dir = FSDirectory.open(file)
    reader = DirectoryReader.open(dir)
    searcher = IndexSearcher(reader)

    # Total number of occurrences of one term across the index.
    term = Term("contents", "tiger")
    print(f'Tiger frequency: {reader.totalTermFreq(term)}')

    # Terms that look like integers or decimals.
    q_regex = RegexpQuery(Term("contents", r"[0-9]+\.?[0-9]*"))
    print(f'regex results: {searcher.search(q_regex,1000000).totalHits}')

    # A number followed by "tiger", in order, with a slop of 20.
    span1 = SpanMultiTermQueryWrapper(q_regex)
    span2 = SpanMultiTermQueryWrapper(RegexpQuery(Term("contents", "tiger")))
    spannearquery = SpanNearQuery([span1, span2], 20, True)
    print(
        f'spanquery results: {searcher.search(spannearquery, 1000000).totalHits}'
    )

    # The same phrase through the classic query parser.
    parser = QueryParser('contents', StandardAnalyzer())
    q = parser.parse('"tiger leopard"')
    print(q)  # prints contents:"tiger leopard"
    print(searcher.search(q, 10000000).totalHits)

    # NOTE(review): 'tiger leopard' is passed as a single term here, unlike
    # the parsed phrase above -- confirm that is intended.
    phrase_query = PhraseQuery(10, 'contents', 'tiger leopard')