def search(self, field: str):
    sear = self._search
    if len(self._commandInfo.getKey()) == 0 or self._commandInfo.getKey()[0] in ['-', '~']:
        query = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
    elif self._commandInfo.getKey()[0] == '#':
        query1 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
        query2 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[1]))
        bc1 = BooleanClause(query1, BooleanClause.Occur.MUST)
        bc2 = BooleanClause(query2, BooleanClause.Occur.MUST)
        query = BooleanQuery.Builder().add(bc1).add(bc2).build()
    elif self._commandInfo.getKey()[0] in ['$', '+']:
        bq = BooleanQuery.Builder()
        for w in self._commandInfo.getWordList():
            queryx = QueryParser(field, StandardAnalyzer()).parse(make_parser(w))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    else:
        query = ''
    hits = sear.search(query, 999999)
    for hit in hits.scoreDocs:
        doc = sear.doc(hit.doc)
        res = doc.get(field)
        id = doc.get(field + '_id')
        if doc_hit(res, self._commandInfo):
            sentences = re.split('[!?!?。]', res)
            # drop the empty fragments left behind by the split
            sentences = [s for s in sentences if s != '']
            for sentence in sentences:
                if key_filter(self._commandInfo, sentence):
                    self._doc[id] = res
                    self._resultSentencesList.append((id, sentence))
    return self
def testFlat(self):
    q = BooleanQuery()
    q.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
    q.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))
    q.add(BooleanClause(self.c1, BooleanClause.Occur.SHOULD))
    q.add(BooleanClause(self.c2, BooleanClause.Occur.SHOULD))
    self.assertEqual(1, self.search(q))
def testFlat(self):
    b = BooleanQuery.Builder()
    b.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
    b.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))
    b.add(BooleanClause(self.c1, BooleanClause.Occur.SHOULD))
    b.add(BooleanClause(self.c2, BooleanClause.Occur.SHOULD))
    q = b.build()
    self.assertEqual(1, self.search(q))
def testParenthesisMust2(self):
    q3 = BooleanQuery()
    q3.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
    q3.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))
    q4 = BooleanQuery()
    q4.add(BooleanClause(self.c1, BooleanClause.Occur.SHOULD))
    q4.add(BooleanClause(self.c2, BooleanClause.Occur.SHOULD))
    q2 = BooleanQuery()
    q2.add(q3, BooleanClause.Occur.SHOULD)
    q2.add(q4, BooleanClause.Occur.MUST)
    self.assertEqual(1, self.search(q2))
def testParenthesisMust2(self):
    b3 = BooleanQuery.Builder()
    b3.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
    b3.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))
    q3 = b3.build()
    b4 = BooleanQuery.Builder()
    b4.add(BooleanClause(self.c1, BooleanClause.Occur.SHOULD))
    b4.add(BooleanClause(self.c2, BooleanClause.Occur.SHOULD))
    q4 = b4.build()
    b2 = BooleanQuery.Builder()
    b2.add(q3, BooleanClause.Occur.SHOULD)
    b2.add(q4, BooleanClause.Occur.MUST)
    q2 = b2.build()
    self.assertEqual(1, self.search(q2))
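# NOTE (added): the two versions of each test above exist because the mutable
# BooleanQuery()/add() API was deprecated in favour of BooleanQuery.Builder
# (BooleanQuery became immutable in later Lucene releases). Below is a minimal
# sketch of a helper that assembles a query from (query, occur) pairs with the
# Builder API; the helper name is illustrative and not part of the original code.
from org.apache.lucene.search import BooleanClause, BooleanQuery

def build_boolean_query(clauses):
    """clauses: iterable of (query, BooleanClause.Occur) pairs."""
    builder = BooleanQuery.Builder()
    for sub_query, occur in clauses:
        builder.add(BooleanClause(sub_query, occur))
    return builder.build()

# e.g. build_boolean_query([(self.t1, BooleanClause.Occur.SHOULD),
#                           (self.t2, BooleanClause.Occur.SHOULD)])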
def query_section(self, section):
    searcher = self._searcher
    query_doc = RegexpQuery(Term('id', self._id + '\\..+'))
    query_section = TermQuery(Term('section', section))
    query = BooleanQuery.Builder()
    bc1 = BooleanClause(query_doc, BooleanClause.Occur.MUST)
    bc2 = BooleanClause(query_section, BooleanClause.Occur.MUST)
    query = query.add(bc1).add(bc2).build()
    top_docs = searcher.search(query, 1000000)
    hits = top_docs.scoreDocs
    res_dict = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        id = doc.get('id')
        document = doc.get('document')
        section = doc.get('section')
        author = doc.get('author')
        dynasty = doc.get('dynasty')
        type = doc.get('type')
        text = doc.get('text')
        color = doc.get('color')
        area = doc.get('area')
        zhujie = doc.get('zhujie')
        detail = doc.get('detail')
        res_dict[id] = {}
        if document:
            res_dict[id]['document'] = document
        if section:
            res_dict[id]['section'] = section
        if author:
            res_dict[id]['author'] = author
        if dynasty:
            res_dict[id]['dynasty'] = dynasty
        if type:
            res_dict[id]['type'] = type
        if text:
            res_dict[id]['text'] = text
        if color:
            res_dict[id]['color'] = color
        if area:
            res_dict[id]['area'] = area
        if zhujie:
            res_dict[id]['zhujie'] = zhujie
        if detail:
            res_dict[id]['detail'] = detail
    res_dict[self._id] = {'document': section}
    self._resDict = res_dict
    return self
def getBooleanQuery(self, clauses):
    extra_query = TermQuery(Term("all", "extra_clause"))
    extra_clause = BooleanClause(extra_query, BooleanClause.Occur.SHOULD)
    clauses.add(extra_clause)
    return super(BooleanTestMixin, self).getBooleanQuery(clauses)
def get_most_similar(self, sentence, do_log=False):
    # print('query string is', string)
    # q = QueryParser('pa', self.analyzer).parse(sentence)
    query_builder = BooleanQuery.Builder()
    for token in sentence.split(' '):
        if token not in sw:
            qtq = TermQuery(Term("pa", token))
            query_builder.add(BooleanClause(qtq, BooleanClause.Occur.SHOULD))
    q = query_builder.build()

    hitsPerPage = 2
    reader = DirectoryReader.open(self.w)
    self.searcher = IndexSearcher(reader)
    simi = BM25Similarity(Config.k1, Config.b)
    # simi = ClassicSimilarity()
    self.searcher.setSimilarity(simi)

    docs = self.searcher.search(q, hitsPerPage)
    hits = docs.scoreDocs
    # print("Found " + str(len(hits)) + " hits.")
    if len(hits) > 0:
        mate = self.searcher.doc(hits[0].doc).get("id")
        if do_log:
            print("found something. mate: ", mate, "- score : ", hits[0].score)
        return hits[0], int(mate)
    else:
        return None, -1
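# NOTE (added): get_most_similar() relies on a stopword collection `sw` and a
# `Config` object holding BM25 parameters, neither of which is shown in this
# section. Minimal stand-ins so the snippet can run on its own; the values below
# are illustrative assumptions (Lucene's BM25 defaults), not the project's settings.
sw = set()  # e.g. fill with domain stopwords to skip when building the query

class Config:
    k1 = 1.2   # BM25 term-frequency saturation
    b = 0.75   # BM25 document-length normalization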
def searchAncient(self, field):
    indexDir = SimpleFSDirectory(Paths.get(self._dir))
    sear = IndexSearcher(DirectoryReader.open(indexDir))
    bq = BooleanQuery.Builder()
    q = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._searchWord))
    bc = BooleanClause(q, BooleanClause.Occur.MUST)
    bq.add(bc)
    search_fields = self._fields
    for i in search_fields:
        if i == 'section' or i == 'document':
            continue
        queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(search_fields[i]))
        bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
        bq.add(bc)
    query = bq.build()
    hits = sear.search(query, 9999)
    for hit in hits.scoreDocs:
        doc = sear.doc(hit.doc)
        res = doc.get(field)
        id = doc.get('id')
        detail = get_detail(doc)
        zhujie = detail['zhujie']
        if detail['detail'] and 'detail' in detail['detail'].keys():
            detail['detail'] = detail['detail']['detail']
        detail.pop('zhujie')
        detail.pop('text')
        detail.pop('type')
        detail = json.dumps(detail)
        self._doc[id] = res
        if doc_hit(res, self._words):
            f = key_filter(self._words, self._re, res)
            if f:
                if 'section' in search_fields.keys():
                    if not search_upper_title_filter(id, sear, search_fields['section'], 0):
                        continue
                if 'document' in search_fields.keys():
                    if not search_upper_title_filter(id, sear, search_fields['document'], 1):
                        continue
                self._match.append(f)
                self._resultSentencesList.append((id, res, detail, zhujie))
                print(res)
    print(self._match)
    return self
def delete(primary_keys_map, collection_name, todelete, commit=False):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        tofind_keyvalue_pairs = json.loads(todelete)
    except:
        return 100
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # setting writer configurations
    try:
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
        ireader = IndexReader.open(direc)
    except:
        return 105
    # as of now, deletion of documents is supported only on indexed keys
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}
    # separating out primary and non-primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
    # filtering documents according to primary keys
    query = BooleanQuery()
    for key in tofind_primary_keyvalue_pairs.keys():
        temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(tofind_primary_keyvalue_pairs[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
    a = writer.deleteDocuments(query)
    if commit == True:
        writer.commit()
    writer.close()
    return 0
def search(self, field):
    s = self._search
    u = self._userQuery
    zh_to_hant_dict = self._zh_to_hant_dict
    info = u.getFlagsInfo()
    flags_list = u.getFlagsList()
    sq_list = []
    word_index_list = []
    index_count = 0
    for flag in flags_list:
        if flag["type"] == "word":
            word_index_list.append(index_count)
            if len(flag["content"]) == 1:
                if flag["content"][0] in zh_to_hant_dict:
                    stq_list = [SpanTermQuery(Term(field, flag["content"][0]))]
                    for hant in zh_to_hant_dict[flag["content"][0]]:
                        stq_list.append(SpanTermQuery(Term(field, hant)))
                    sq_list.append(SpanOrQuery(stq_list))
                else:
                    sq_list.append(SpanTermQuery(Term(field, flag["content"][0])))
            else:
                snq_list = []
                for w in flag["content"]:
                    if w in zh_to_hant_dict:
                        stq_list = [SpanTermQuery(Term(field, w))]
                        for hant in zh_to_hant_dict[w]:
                            stq_list.append(SpanTermQuery(Term(field, hant)))
                        snq_list.append(SpanOrQuery(stq_list))
                    else:
                        snq_list.append(SpanTermQuery(Term(field, w)))
                sq_list.append(SpanNearQuery(snq_list, 0, True))
        else:
            sq_list.append({
                "op": info[flag["content"]]["op"],
                "num": info[flag["content"]]["num"]
            })
        index_count += 1
    q = None
    count = 0
    for index in word_index_list:
        if count == 0:
            q = sq_list[index]
            count += 1
        else:
            if not isinstance(sq_list[index - 1], dict):
                q = SpanNearQuery([q, sq_list[index]], 0, True)
            else:
                q = SpanNearQuery([q, sq_list[index]], sq_list[index - 1]["num"][-1], True)
    query = q
    # filter fields
    filters = u.getFields()
    bq = BooleanQuery.Builder()
    bq.add(BooleanClause(query, BooleanClause.Occur.MUST))
    for key in filters.keys():
        cur_reg = '('
        for ft in filters[key]:
            cur_reg += ft + '|'
        cur_reg = cur_reg[0:-1] + ')'
        rq = RegexpQuery(Term(key, cur_reg))
        bq.add(BooleanClause(rq, BooleanClause.Occur.MUST))
    query = bq.build()
    top_docs = s.search(query, 9999)
    self._cur_field = field
    reg = get_test_reg(flags_list, info, zh_to_hant_dict)
    doc_id_list = []
    hits = top_docs.scoreDocs
    for hit in hits:
        doc = s.doc(hit.doc)
        text = doc.get("text")
        match_res = re.search(reg, text)
        if match_res:
            doc_id_list.append(hit.doc)
    self._res = doc_id_list
    self._reg = reg
    return self
import lucene
from java.nio.file import Paths
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher, BooleanClause, BooleanQuery, TermQuery
from org.apache.lucene.index import DirectoryReader, Term

input_q = input().replace('ی', 'ي')

lucene.initVM()

index_path = Paths.get('./lucene.index')
question_field = 'question'
answer_field = 'answer'

directory = SimpleFSDirectory(index_path)
searcher = IndexSearcher(DirectoryReader.open(directory))

query_builder = BooleanQuery.Builder()
for q_word in input_q.split(' '):
    qtq = TermQuery(Term(question_field, q_word))
    query_builder.add(BooleanClause(qtq, BooleanClause.Occur.SHOULD))
query = query_builder.build()

top_n = 5
scoreDocs = searcher.search(query, top_n).scoreDocs
print('found nums: ', len(scoreDocs))
for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    print('Best Match: ', doc.get(question_field), '\n')
    print('Answer: ', doc.get(answer_field))
    print('---------------------\n')
def GET(self, query):
    data_input = web.input()
    page = 0
    if "page" in data_input:
        page = int(data_input["page"])
    render = web.template.render('templates/')
    anses = []
    num_pages = 0
    if use_elasticsearch:
        # importing libraries for Elasticsearch
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Search, document, field, connections, Q
        from elasticsearch_dsl.connections import connections
        from booktype import Book

        es = Elasticsearch()
        es.indices.create(index='book-index', ignore=[400, 404])
        connections.create_connection(hosts=['localhost'], timeout=20)
        connections.add_connection('book', es)
        # print(connections.get_connection().cluster.health())
        s = Search(es).index('book-index').doc_type('book').query(
            Q('match', title=query.strip()) |
            Q('match', description=query.strip()) |
            Q("match", userreviews_userReview=query.strip()))
        # Note: pagination slicing must be applied before s.execute();
        # this is easy to miss in the documentation.
        s = s[page * 10:page * 10 + 10]
        response = s.execute()
        # print 'total number of hits: ', response.hits.total
        num_pages = (response.hits.total / 10) + 1
        for res in response:
            authors = zip(res.authors_name, res.authors_url)
            anses.append({
                'title': res.title,
                'description': res.description.encode('utf-8'),
                'url': res.url,
                'cover': res.cover,
                'authors': authors
            })
    else:
        # importing libraries for Lucene
        import lucene
        from java.io import File
        from org.apache.lucene.index import DirectoryReader, Term
        from org.apache.lucene.queryparser.classic import QueryParser
        from org.apache.lucene.store import SimpleFSDirectory
        from org.apache.lucene.search import IndexSearcher, BooleanClause, BooleanQuery, TermQuery
        from org.apache.lucene.util import Version
        from org.apache.lucene.analysis.standard import StandardAnalyzer
        import os

        # fields
        title_field = 'title'
        description_field = 'description'
        cover_field = 'cover'
        authors_name_field = 'authors_name'
        authors_url_field = 'authors_url'
        url_field = 'url'

        index_folder = '.'
        index_name = 'lucene.index'
        index_path = os.path.join(index_folder, index_name)

        lucene.initVM()
        version = Version.LUCENE_CURRENT
        directory = SimpleFSDirectory(File(index_path))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        analyzer = StandardAnalyzer(version)

        title_tq = TermQuery(Term(title_field, query))
        desc_tq = TermQuery(Term(description_field, query))
        query = BooleanQuery()
        query.add(BooleanClause(title_tq, BooleanClause.Occur.SHOULD))
        query.add(BooleanClause(desc_tq, BooleanClause.Occur.SHOULD))

        scoreDocs = searcher.search(query, 1000).scoreDocs
        num_pages = (len(scoreDocs) / 10) + 1
        for scoreDoc in scoreDocs[page * 10:page * 10 + 10]:
            doc = searcher.doc(scoreDoc.doc)
            authors = zip([doc.get(authors_name_field)], [doc.get(authors_url_field)])
            anses.append({
                'title': doc.get(title_field),
                'description': doc.get(description_field).encode('utf-8'),
                'url': doc.get(url_field),
                'cover': doc.get(cover_field),
                'authors': authors
            })
    return render.index(anses, query, num_pages)
def search(self, field):
    s = self._search
    u = self._userQuery
    z = self._zh_to_hant_dict
    keys = u.getKey()
    nums = u.getNum()
    word_list = u.getWordList()
    filters = u.getFields()
    # filter-only query (no search terms)
    if len(word_list) == 0:
        query = None
    # simple term
    elif len(keys) == 0:
        query = simple_term_to_query(field, word_list[0], z)
    elif keys[0] == '#':
        query_left = simple_term_to_query(field, word_list[0], z)
        query_right = simple_term_to_query(field, word_list[1], z)
        query = SpanNearQuery([query_left, query_right], int(nums[0]), False)
    elif keys[0] == '+' or keys[0] == '$':
        prev_query = simple_term_to_query(field, word_list[0], z)
        for i in range(len(keys)):
            cur_query = simple_term_to_query(field, word_list[i + 1], z)
            if keys[i] == '+':
                span_list = [prev_query]
                for j in range(int(nums[i])):
                    span = SpanMultiTermQueryWrapper(RegexpQuery(Term(field, '.')))
                    span_list.append(span)
                span_list.append(cur_query)
                prev_query = SpanNearQuery(span_list, 0, True)
            else:
                span_list = [prev_query, cur_query]
                prev_query = SpanNearQuery(span_list, int(nums[i]), True)
        query = prev_query
    elif keys[0] == '-' or keys[0] == '~':
        query_left = simple_term_to_query(field, word_list[0], z)
        query_right = simple_term_to_query(field, word_list[1], z)
        if keys[0] == '-':
            n_q_list = [query_left, query_right]
        else:
            n_q_list = [query_right, query_left]
        n_query = SpanNearQuery(n_q_list, int(nums[0]) - 1, True)
        bq = BooleanQuery.Builder()
        bc1 = BooleanClause(query_left, BooleanClause.Occur.MUST)
        bc2 = BooleanClause(n_query, BooleanClause.Occur.MUST_NOT)
        query = bq.add(bc1).add(bc2).build()
    else:
        raise ValueError("Invalid search expression!")
    # filter fields
    bq = BooleanQuery.Builder()
    if query:
        bq.add(BooleanClause(query, BooleanClause.Occur.MUST))
    for key in filters.keys():
        cur_reg = '('
        for ft in filters[key]:
            cur_reg += ft + '|'
        cur_reg = cur_reg[0:-1] + ')'
        rq = RegexpQuery(Term(key, cur_reg))
        bq.add(BooleanClause(rq, BooleanClause.Occur.MUST))
    query = bq.build()
    self._res = s.search(query, 100000)
    self._cur_field = field
    return self
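# NOTE (added): simple_term_to_query() is used above but not defined in this
# section. A minimal sketch of what it might look like, assuming it mirrors the
# inline span-query construction shown earlier: a single character becomes a
# SpanTermQuery (or a SpanOrQuery over its traditional-character variants from
# zh_to_hant_dict), and a multi-character word becomes an ordered SpanNearQuery.
# The import path of the span classes may vary with the Lucene version.
from org.apache.lucene.index import Term
from org.apache.lucene.search.spans import SpanNearQuery, SpanOrQuery, SpanTermQuery

def simple_term_to_query(field, word, zh_to_hant_dict):
    def char_query(ch):
        if ch in zh_to_hant_dict:
            variants = [SpanTermQuery(Term(field, ch))]
            variants += [SpanTermQuery(Term(field, hant)) for hant in zh_to_hant_dict[ch]]
            return SpanOrQuery(variants)
        return SpanTermQuery(Term(field, ch))

    if len(word) == 1:
        return char_query(word)
    return SpanNearQuery([char_query(ch) for ch in word], 0, True)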
def ancientSearch(self, field):
    sear = self._search
    fieldOnly = False
    # field-only search
    if len(self._commandInfo.getWordList()) == 0:
        fieldOnly = True
        bq = BooleanQuery.Builder()
        fields = self._commandInfo.getFields()
        for key in fields:
            queryx = QueryParser(key, KeywordAnalyzer()).parse(fields[key][0])
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    elif len(self._commandInfo.getKey()) == 0 or self._commandInfo.getKey()[0] in ['-', '~']:
        bq = BooleanQuery.Builder()
        q = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
        bc = BooleanClause(q, BooleanClause.Occur.MUST)
        bq.add(bc)
        for i in self._commandInfo.getFields():
            if i == 'section' or i == 'document':
                continue
            queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    elif self._commandInfo.getKey()[0] == '#':
        bq = BooleanQuery.Builder()
        query1 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
        query2 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[1]))
        bc1 = BooleanClause(query1, BooleanClause.Occur.MUST)
        bc2 = BooleanClause(query2, BooleanClause.Occur.MUST)
        bq.add(bc1).add(bc2)
        for i in self._commandInfo.getFields():
            if i == 'section' or i == 'document':
                continue
            queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    elif self._commandInfo.getKey()[0] in ['$', '+']:
        bq = BooleanQuery.Builder()
        for w in self._commandInfo.getWordList():
            queryx = QueryParser(field, StandardAnalyzer()).parse(make_parser(w))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        for i in self._commandInfo.getFields():
            if i == 'section' or i == 'document':
                continue
            queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(self._commandInfo.getFields()[i]))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    else:
        query = ''
    hits = sear.search(query, 9999)
    for hit in hits.scoreDocs:
        doc = sear.doc(hit.doc)
        res = doc.get(field)
        id = doc.get('id')
        detail = get_detail(doc)
        zhujie = detail['zhujie']
        if detail['detail'] and 'detail' in detail['detail'].keys():
            detail['detail'] = detail['detail']['detail']
        detail.pop('zhujie')
        detail.pop('text')
        detail.pop('type')
        detail = json.dumps(detail)
        if fieldOnly:
            if not doc.get("text").strip():
                continue
            if id.count(".") == 2:
                self._doc[id] = doc.get("text")
                self._resultSentencesList.append((id, doc.get("text")))
            elif id.count(".") == 1:
                searcher = self._search
                query = QueryParser('id', KeywordAnalyzer()).parse(id + '.1')
                hits = searcher.search(query, 1)
                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    res = doc.get("text")
                    if res:
                        self._doc[id + ".1"] = doc.get('text')
                        self._resultSentencesList.append((id + ".1", doc.get('text')))
            else:
                searcher = self._search
                query = QueryParser('id', KeywordAnalyzer()).parse(id + '.1.1')
                hits = searcher.search(query, 1)
                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    res = doc.get("text")
                    if not doc.get("text").strip():
                        continue
                    if res:
                        self._doc[id + ".1.1"] = doc.get('text')
                        self._resultSentencesList.append((id + ".1.1", doc.get('text')))
        elif doc_hit(res, self._commandInfo):
            if key_filter(self._commandInfo, res):
                if 'section' in self._commandInfo.getFields().keys():
                    if not search_upper_title_filter(id, sear, self._commandInfo.getFields()['section'], 0):
                        continue
                if 'document' in self._commandInfo.getFields().keys():
                    if not search_upper_title_filter(id, sear, self._commandInfo.getFields()['document'], 1):
                        continue
                self._doc[id] = res
                self._resultSentencesList.append((id, res, detail, zhujie))
    return self
def update(collection_name, tofind, update, commit=False, add_field_if_not_exists=True):
    # As of now, update is implemented as: search, modify the data in the JSON document, delete, and re-write
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        tofind_keyvalue_pairs = json.loads(tofind)
    except:
        return 100
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
        # setting writer configurations
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
    except:
        return 105
    no_of_documents_modified = 0

    # finding the document to update
    # Scope for making this more efficient
    def rewrite(data_string):
        data = json.loads(data_string)
        toupdate = json.loads(update)
        # primary_key_modified=False

        # delete the appropriate document
        query = BooleanQuery()
        for key in primary_keys_map[collection_name]:
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        # print query

        # modify the values
        for key, value in toupdate.items():
            # if such a key is not present, we either add and update that key in data, or just ignore it
            # (add_field_if_not_exists is True by default)
            if add_field_if_not_exists == False:
                if key in data.keys():
                    data[key] = value
            else:
                data[key] = value

        # this deletion statement has been intentionally placed here:
        # the update only continues if the modified data's primary keys do not already exist
        primary_key_update = False
        for key in toupdate.keys():
            if key in primary_keys_map[INDEX_DIR]:
                primary_key_update = True
                break
        if primary_key_update == True:
            query_search = BooleanQuery()
            for key in primary_keys_map[INDEX_DIR]:
                temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
                query_search.add(BooleanClause(temp, BooleanClause.Occur.MUST))
            hits = searcher.search(query_search, MAX_RESULTS).scoreDocs
            if len(hits) > 0:
                return 106
        writer.deleteDocuments(query)

        # add the newly modified document
        doc = Document()
        # index fields wrt primary key
        for primary_key in primary_keys_map[collection_name]:
            try:
                field = Field(primary_key, data[primary_key], Field.Store.NO, Field.Index.NOT_ANALYZED)
                doc.add(field)
            except:
                primary_keys_map.pop(collection_name)
                return 101
        # compress data using snappy if compression is on
        if to_be_compressed_map[collection_name] == True:
            temp = json.dumps(data)
            data_string = base64.b64encode(snappy.compress(temp))
        else:
            temp = json.dumps(data)
            data_string = base64.b64encode(temp)
        field = Field("$DATA$", data_string, Field.Store.YES, Field.Index.ANALYZED)
        doc.add(field)
        writer.addDocument(doc)

    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}
    # separating out primary and non-primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map[collection_name]:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
    # filtering documents
    if len(tofind_primary_keyvalue_pairs) > 0:
        query = BooleanQuery()
        for key in tofind_primary_keyvalue_pairs.keys():
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(tofind_primary_keyvalue_pairs[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs
        for hit in hits:
            doc = searcher.doc(hit.doc)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)
            # non-primary-key filtering (without having to load all the primary-key-filtered values into main memory)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    if rewrite(data) != 106:
                        no_of_documents_modified += 1
                    else:
                        return 106
            else:
                if rewrite(data) != 106:
                    no_of_documents_modified += 1
                else:
                    return 106
    else:
        for i in range(0, ireader.numDocs()):
            doc = searcher.doc(i)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)
            # non-primary-key filtering (without having to load all the primary-key-filtered values into main memory)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    if rewrite(data) != 106:
                        no_of_documents_modified += 1
                    else:
                        return 106
            else:
                if rewrite(data) != 106:
                    no_of_documents_modified += 1
                else:
                    return 106
    ireader.close()
    if commit == True:
        writer.commit()
    writer.close()
    return str(no_of_documents_modified) + " have been modified"
def search(collection_name, tofind):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        tofind_keyvalue_pairs = json.loads(tofind)
    except:
        return 100
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
    except:
        return 105
    # initializing return list
    return_list = []
    # check_list=[]
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}
    # separating out primary and non-primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map[collection_name]:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
    # filtering documents
    if len(tofind_primary_keyvalue_pairs) > 0:
        query = BooleanQuery()
        for key in tofind_primary_keyvalue_pairs.keys():
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(tofind_primary_keyvalue_pairs[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs
        for hit in hits:
            doc = searcher.doc(hit.doc)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)
            # non-primary-key filtering (without having to load all the primary-key-filtered values into main memory)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    return_list.append(data)
            else:
                return_list.append(data)
    else:
        for i in range(0, ireader.numDocs()):
            doc = searcher.doc(i)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)
            # non-primary-key filtering (without having to load all the primary-key-filtered values into main memory)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    return_list.append(data)
            else:
                return_list.append(data)
    ireader.close()
    if len(return_list) == 0:
        return None
    else:
        return return_list
def store(collection_name, data, commit=False):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    print "started indexing input data......"

    # extracting values
    try:
        contents = json.loads(data)
    except:
        return 100
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # checking for existence of a record with the same primary_key set
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
        query = BooleanQuery()
        for key in primary_keys_map[INDEX_DIR]:
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(contents[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs
        if len(hits) > 0:
            return 106
    except:
        pass

    # setting writer configurations
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(direc, config)
    # fix this later.....FieldType not defined
    # field_type=FieldType()
    # field_type.setIndexed(True)
    # field_type.setStored(False)
    # field_type.setTokenized(False)
    try:
        doc = Document()
        # index fields wrt primary key
        for primary_key in primary_keys_map[collection_name]:
            try:
                field = Field(primary_key, contents[primary_key], Field.Store.NO, Field.Index.NOT_ANALYZED)
                doc.add(field)
            except:
                primary_keys_map.pop(collection_name)
                return 101
        # compress data using snappy if compression is on
        if to_be_compressed_map[collection_name] == True:
            # print "here"
            # data=data.encode('utf-8')
            data = base64.b64encode(snappy.compress(data))
            # print data
        else:
            data = base64.b64encode(data)
        field = Field("$DATA$", data, Field.Store.YES, Field.Index.ANALYZED)
        doc.add(field)
        writer.addDocument(doc)
        if commit == True:
            writer.commit()
        writer.close()
        return 0
    except:
        return 102