def get_interior_doc(self, doc, data=None, xap_doc=None):
    """Convert a python dict into a xapian document object.

    doc: mapping of ``{field: value}``. ``int``, ``float`` and ``datetime``
        values are stored in the value slot returned by
        ``self.get_slot(field)``; every other value is indexed as free
        text under the prefix returned by ``self.get_prefix(field)``.
    data: optional dict pickled into the document data. When ``xap_doc``
        is given, it is merged over the data already stored there
        (new keys win).
    xap_doc: optional existing xapian document. When given it is updated
        in place: for each free-text field in ``doc`` the terms that start
        with that field's prefix are removed before the new value is
        indexed.

    return: xapian document object (``xap_doc`` itself when supplied).
    """
    document = xap_doc or xapian.Document()
    termgen = xapian.TermGenerator()
    termgen.set_document(document)

    # Prefixes whose old terms were already stripped during this call.
    removed_prefix = set()

    # New values replace old values.
    for field, value in doc.iteritems():
        # Sortable value
        if isinstance(value, (int, float, datetime)):
            value = clean_value(value)
            slotnum = int(self.get_slot(field))
            if isinstance(value, float):
                document.add_value(
                    slotnum, xapian.sortable_serialise(float(value)))
            else:
                document.add_value(slotnum, str(value))
            continue

        # Term field
        value = clean_value(value)
        prefix = self.get_prefix(field)
        # Only free-text indexing is currently selected; the 'exact'
        # branch below is kept for when a per-field type is available.
        term_type = 'freetext'

        # Remove the old terms of this field.
        if xap_doc and prefix not in removed_prefix:
            # Collect first, remove afterwards, so the document is not
            # modified while its own termlist is being walked.
            stale_terms = []
            termlist = xap_doc.termlist()
            try:
                # skip_to() raises StopIteration when no term sorts at or
                # after the prefix, i.e. there is nothing to remove.
                term = termlist.skip_to(prefix)
                while term.term.startswith(prefix):
                    stale_terms.append(term.term)
                    term = termlist.next()
            except StopIteration:
                pass
            for stale in stale_terms:
                document.remove_term(stale)
            removed_prefix.add(prefix)

        if term_type == 'exact':
            if len(value) > 0:
                # We use the following check, rather than "isupper()" to
                # ensure that we match the check performed by the
                # queryparser, regardless of our locale.
                if ord('A') <= ord(value[0]) <= ord('Z'):
                    prefix = prefix + ':'

            # Note - xapian currently restricts term lengths to about 248
            # characters - except that zero bytes are encoded in two
            # bytes, so in practice a term of length 125 characters could
            # be too long.  Xapian will give an error when commit() is
            # called after such documents have been added to the database.
            # As a simple workaround, we give an error here for terms over
            # 220 characters, which will catch most occurrences of the
            # error early.
            #
            # In future, it might be good to change to a hashing scheme in
            # this situation (or for terms over, say, 64 characters),
            # where the characters after position 64 are hashed (we
            # obviously need to do this hashing at search time, too).
            if len(prefix + value) > 220:
                raise Exception("Field %r is too long: maximum length "
                                "220 - was %d (%r)" %
                                (field, len(prefix + value), prefix + value))

            document.add_term(prefix + value, 1)  # wdfinc default set 1
        elif term_type == 'freetext':
            # No positions, weight default set 1.
            termgen.index_text_without_positions(str(value), 1, prefix)
            termgen.increase_termpos(10)

    # Document data
    if data is not None:
        if xap_doc:
            # NOTE: the stored data is unpickled; it must only ever come
            # from an index this application wrote itself.
            stored = xap_doc.get_data()
            # A document without stored data yields an empty string,
            # which pickle.loads() cannot parse.
            merged = pickle.loads(stored) if stored else {}
            merged.update(data)
            document.set_data(pickle.dumps(merged))
        else:
            document.set_data(pickle.dumps(data))

    return document
def _get_xapian_query(self, querys, exclude=False, database=None):
    """Convert a list of filter tuples into a single xapian query.

    querys: iterable of ``(field, value, op)`` tuples, where ``op`` is:
        'parse'  - ``value`` is parsed by a xapian QueryParser (wildcards
                   enabled) once per entry of ``field`` (an iterable of
                   field names) and the results are OR-ed together;
        'allof'  - or an empty/None op: ``query_field(prefix, value)``;
        'anyof'  - ``query_field`` with ``default_op`` set to OP_OR;
        'range'  - ``self.query_range`` over the field's value slot, using
                   the first two items of ``clean_date(value)`` as bounds.
        Any other op is ignored. Except for 'parse', filters with an
        empty value are skipped.
    exclude: when true the sub-queries are OR-ed together; otherwise they
        are folded left-to-right with ``query_filter``.
    database: optional xapian database handed to the QueryParser.

    return: xapian.Query. ``xapian.Query('')`` is returned when ``querys``
        is empty, and also when ``exclude`` is false and every filter was
        skipped.
    """
    if not querys:
        return xapian.Query('')

    qp = xapian.QueryParser()
    if database is not None:
        qp.set_database(database)
    qp.set_default_op(xapian.Query.OP_AND)

    # Translate every filter into a sub-query.
    queries = []
    for field, value, op in querys:
        if op == 'parse':
            value = clean_value(value, is_query=True)
            parsed = [
                qp.parse_query(
                    value,
                    xapian.QueryParser.FLAG_WILDCARD,
                    self.schema.get_prefix(name, auto_add=False),
                )
                for name in field
            ]
            queries.append(xapian.Query(xapian.Query.OP_OR, parsed))
            continue

        if not value:
            continue

        if op == 'range':
            slot = self.schema.get_slot(field, auto_add=False)
            begin, end = clean_date(value)[:2]
            queries.append(self.query_range(slot, begin, end))
        elif op == 'anyof':
            prefix = self.schema.get_prefix(field, auto_add=False)
            queries.append(query_field(prefix, clean_value(value),
                                       default_op=xapian.Query.OP_OR))
        elif op == 'allof' or not op:
            prefix = self.schema.get_prefix(field, auto_add=False)
            queries.append(query_field(prefix, clean_value(value)))

    if len(queries) == 1:
        return queries[0]

    if exclude:
        return xapian.Query(xapian.Query.OP_OR, queries)

    if not queries:
        # Every filter was skipped: behave as if no filter had been given
        # rather than failing on an empty reduction.
        return xapian.Query('')

    combined = queries[0]
    for query in queries[1:]:
        combined = query_filter(combined, query)
    return combined
def get_interior_doc(self, doc, data=None, old_doc=None):
    """Convert a python dict into a new xapian document object.

    doc: mapping of ``{field: value}``. ``int``, ``float`` and ``datetime``
        values are stored in the value slot returned by
        ``self.schema.get_slot(field)``; other values are indexed as free
        text under ``self.schema.get_prefix(field)``. A key written as
        ``'+field'`` is stored both as a value and as terms.
    data: optional dict pickled into the document data. It is not
        modified; when ``old_doc`` carries data the two are merged into a
        new dict, with keys from ``data`` winning.
    old_doc: optional existing xapian document. Its terms and values are
        carried over for every prefix / slot that ``doc`` did not set.

    return: a freshly built xapian document object.
    """

    def _add_term(target, termgen, prefix, value, type_name='freetext'):
        # Index `value` under `prefix`. Every caller uses the default
        # 'freetext'; the 'exact' branch is kept for future use.
        if type_name == 'exact':
            if len(value) > 0:
                # We use the following check, rather than "isupper()" to
                # ensure that we match the check performed by the
                # queryparser, regardless of our locale.
                if ord('A') <= ord(value[0]) <= ord('Z'):
                    prefix = prefix + ':'

            # Note - xapian currently restricts term lengths to about 248
            # characters - except that zero bytes are encoded in two
            # bytes, so in practice a term of length 125 characters could
            # be too long.  Xapian will give an error when commit() is
            # called after such documents have been added to the database.
            # As a simple workaround, we give an error here for terms over
            # 220 characters, which will catch most occurrences of the
            # error early.
            #
            # In future, it might be good to change to a hashing scheme in
            # this situation (or for terms over, say, 64 characters),
            # where the characters after position 64 are hashed (we
            # obviously need to do this hashing at search time, too).
            if len(prefix + value) > 220:
                raise Exception("Field is too long: maximum length "
                                "220 - was %d (%r)" %
                                (len(prefix + value), prefix + value))

            target.add_term(prefix + value, 1)  # wdfinc default set 1
        elif type_name == 'freetext':
            # No positions, weight default set 1.
            termgen.index_text_without_positions(str(value), 1, prefix)
            termgen.increase_termpos(10)

    def _add_value(target, slotnum, value):
        # Floats are serialised so they sort numerically; anything else
        # is stored as its string form.
        if isinstance(value, float):
            target.add_value(int(slotnum),
                             xapian.sortable_serialise(float(value)))
        else:
            target.add_value(int(slotnum), str(value))

    document = xapian.Document()
    termgen = xapian.TermGenerator()
    termgen.set_document(document)

    terms = set()   # prefixes indexed from `doc`
    values = set()  # slot numbers (as ints) filled from `doc`

    # Build the new xapian object.
    for field, value in doc.iteritems():
        both = field.startswith('+')
        if both:
            field = field[1:]
        is_value = isinstance(value, (int, float, datetime))
        # Clean once and reuse the result for the slot and for the terms.
        cleaned = clean_value(value)

        # Sortable value
        if both or is_value:
            # Normalised to int so it compares equal to `value.num` of
            # the old document's values below.
            slotnum = int(self.schema.get_slot(field))
            if slotnum not in values:
                _add_value(document, slotnum, cleaned)
                values.add(slotnum)

        # Term field
        if both or not is_value:
            prefix = self.schema.get_prefix(field)
            if prefix not in terms:
                _add_term(document, termgen, prefix, cleaned)
                terms.add(prefix)

    # New values replace old values; everything else is carried over.
    if old_doc is not None:
        # NOTE(review): old terms are re-indexed through the term
        # generator, so their original wdf is not preserved - confirm
        # this is intended.
        for term in old_doc.termlist():
            prefix, value = self.schema.split_term(term.term)
            if prefix not in terms:
                _add_term(document, termgen, prefix, value)

        for value in old_doc.values():
            if value.num not in values:
                _add_value(document, value.num, value.value)

        # NOTE: the stored data is unpickled; it must only ever come from
        # an index this application wrote itself.
        old_data = old_doc.get_data()
        if old_data:
            merged = dict(pickle.loads(old_data))
            if data:
                merged.update(data)
            # Rebind instead of mutating the caller's dict.
            data = merged

    # Document data
    if data:
        document.set_data(pickle.dumps(data))

    return document