Ejemplo n.º 1
0
    def get_interior_doc(self, doc, data=None, xap_doc=None):
        """Convert a python dict into a xapian document object.

        doc:
           {field1: value1,
            field2: value2}
           int, float and datetime values are stored in the value slot
           returned by self.get_slot(field); every other value is indexed
           as free text under the prefix returned by self.get_prefix(field).

        data: raw data for the original object; it is pickled and stored
            as the document data. Nothing is stored when it is None.

        xap_doc: optional existing xapian document to update in place.
            Terms already stored under a prefix that `doc` supplies are
            removed before the new text is indexed, and `data` is merged
            over the data already stored (keys in `data` win).

        return: xapian document object (`xap_doc` itself when it is given)
        """
        document = xap_doc or xapian.Document()
        termgen = xapian.TermGenerator()
        termgen.set_document(document)

        def _remove_old_terms(prefix):
            # Drop every term of the old document that starts with `prefix`.
            # The terms are collected first and removed afterwards, because
            # `document` is `xap_doc` here and its term list must not change
            # while it is being walked.
            termlist = xap_doc.termlist()
            stale = []
            try:
                # skip_to() and next() both raise StopIteration once there
                # is no further term, e.g. when nothing is stored at or
                # after `prefix`.
                item = termlist.skip_to(prefix)
                while item.term.startswith(prefix):
                    stale.append(item.term)
                    item = next(termlist)
            except StopIteration:
                pass
            for old_term in stale:
                document.remove_term(old_term)

        removed_prefix = set()
        # new values replace old values
        for field, value in doc.iteritems():

            # sortable
            if isinstance(value, (int, float, datetime)):
                value = clean_value(value)
                slotnum = int(self.get_slot(field))
                if isinstance(value, float):
                    value = xapian.sortable_serialise(value)
                else:
                    value = str(value)
                document.add_value(slotnum, value)
                continue

            # field
            value = clean_value(value)
            prefix = self.get_prefix(field)
            # NOTE: the type is fixed to 'freetext', so the 'exact' branch
            # below is currently never taken.
            types = 'freetext'

            # remove the old terms (only once per prefix)
            if xap_doc and prefix not in removed_prefix:
                _remove_old_terms(prefix)
            removed_prefix.add(prefix)

            if types == 'exact':
                if len(value) > 0:
                    # We use the following check, rather than "isupper()" to ensure
                    # that we match the check performed by the queryparser, regardless
                    # of our locale.
                    if ord(value[0]) >= ord('A') and ord(value[0]) <= ord('Z'):
                        prefix = prefix + ':'

                # Note - xapian currently restricts term lengths to about 248
                # characters - except that zero bytes are encoded in two bytes, so
                # in practice a term of length 125 characters could be too long.
                # Xapian will give an error when commit() is called after such
                # documents have been added to the database.
                # As a simple workaround, we give an error here for terms over 220
                # characters, which will catch most occurrences of the error early.
                #
                # In future, it might be good to change to a hashing scheme in this
                # situation (or for terms over, say, 64 characters), where the
                # characters after position 64 are hashed (we obviously need to do this
                # hashing at search time, too).
                if len(prefix + value) > 220:
                    raise Exception("Field %r is too long: maximum length "
                                               "220 - was %d (%r)" %
                                               (field, len(prefix + value),
                                                prefix + value))

                document.add_term(prefix + value, 1) # wdfinc default set 1

            elif types == 'freetext':
                # no positions, weight default set 1
                termgen.index_text_without_positions(str(value), 1, prefix)
                termgen.increase_termpos(10)

        # data
        if data is not None:
            if xap_doc:
                # A document that never had data stored yields an empty
                # string, which pickle cannot load.
                # NOTE(review): pickle.loads runs whatever was stored in the
                # index; only use this with indexes written by trusted code.
                stored = xap_doc.get_data()
                merged = pickle.loads(stored) if stored else {}
                merged.update(data)
                document.set_data(pickle.dumps(merged))
            else:
                document.set_data(pickle.dumps(data))

        return document
Ejemplo n.º 2
0
    def _get_xapian_query(self, querys, exclude=False, database=None):
        """Convert a list of filters into one xapian query.

        querys: iterable of (field, value, op) tuples. Handling per op:
            'parse'  - `value` is cleaned with clean_value(is_query=True)
                       and parsed by a xapian QueryParser (wildcard flag
                       set) once for every name in `field`; the parsed
                       queries are OR-ed together.
                       NOTE(review): `field` is iterated here, so it is
                       presumably a list of field names - confirm.
            'allof'  - query_field(prefix, value)
            'anyof'  - query_field(prefix, value, default_op=OP_OR)
            'range'  - self.query_range(slot, begin, end), where begin and
                       end are the first two items of clean_date(value)
            falsy op - handled exactly like 'allof'
            Filters with any other op are ignored, as are non-'parse'
            filters whose value is falsy.

        exclude: when true the per-filter queries are OR-ed together;
            otherwise they are combined pairwise with query_filter().

        database: optional xapian database handed to the QueryParser.

        return: a xapian.Query. xapian.Query('') is returned when `querys`
            is empty, and also when `exclude` is false and every filter
            was skipped.

        Usage note kept from the original docstring:
            combined = query_filter(self.query_restricted(), combined)
        """
        if not querys:
            return xapian.Query('')

        qp = xapian.QueryParser()
        if database is not None:
            qp.set_database(database)
        qp.set_default_op(xapian.Query.OP_AND)

        # parse filters
        queries = []
        for field, value, op in querys:
            if op == 'parse':
                value = clean_value(value, is_query=True)
                parsed = []
                for name in field:
                    prefix = self.schema.get_prefix(name, auto_add=False)
                    parsed.append(qp.parse_query(
                        value, xapian.QueryParser.FLAG_WILDCARD, prefix))
                queries.append(xapian.Query(xapian.Query.OP_OR, parsed))
                continue

            # Every op other than 'parse' needs a value to filter on.
            if not value:
                continue

            if op == 'allof' or not op:
                prefix = self.schema.get_prefix(field, auto_add=False)
                value = clean_value(value)
                queries.append(query_field(prefix, value))

            elif op == 'anyof':
                prefix = self.schema.get_prefix(field, auto_add=False)
                value = clean_value(value)
                queries.append(query_field(
                    prefix, value, default_op=xapian.Query.OP_OR))

            elif op == 'range':
                slot = self.schema.get_slot(field, auto_add=False)
                begin, end = clean_date(value)[:2]
                queries.append(self.query_range(slot, begin, end))

        if len(queries) == 1:
            return queries[0]

        if exclude:
            return xapian.Query(xapian.Query.OP_OR, queries)

        if not queries:
            # Every filter was skipped: there is nothing to fold together,
            # so answer as if no filters had been passed at all.
            return xapian.Query('')

        combined = queries[0]
        for query in queries[1:]:
            combined = query_filter(combined, query)
        return combined
Ejemplo n.º 3
0
    def get_interior_doc(self, doc, data=None, old_doc=None):
        """Convert a python dict into a new xapian document object.

        doc:
           {field1: value1,
            field2: value2}
           int, float and datetime values are stored in the value slot
           returned by self.schema.get_slot(field); other values are
           indexed as free text under self.schema.get_prefix(field).
           A field name starting with '+' is stored both ways (the '+'
           is stripped from the name). When two keys resolve to the same
           slot or prefix, only the first one encountered is stored.

        data: raw data for the original object; pickled and stored as the
            document data when it is not empty. It is never modified.

        old_doc: optional existing xapian document. Its terms, values and
            data keys are carried over into the new document unless `doc`
            / `data` supply the same prefix, slot or key.

        return: xapian document object (always a newly created one)
        """
        def _add_term(target, termgen, prefix, value):
            # NOTE: the type is fixed to 'freetext', so the 'exact' branch
            # below is currently never taken.
            type_name = 'freetext'

            if type_name == 'exact':
                if len(value) > 0:
                    # We use the following check, rather than "isupper()" to ensure
                    # that we match the check performed by the queryparser, regardless
                    # of our locale.
                    if ord(value[0]) >= ord('A') and ord(value[0]) <= ord('Z'):
                        prefix = prefix + ':'

                # Note - xapian currently restricts term lengths to about 248
                # characters - except that zero bytes are encoded in two bytes, so
                # in practice a term of length 125 characters could be too long.
                # Xapian will give an error when commit() is called after such
                # documents have been added to the database.
                # As a simple workaround, we give an error here for terms over 220
                # characters, which will catch most occurrences of the error early.
                #
                # In future, it might be good to change to a hashing scheme in this
                # situation (or for terms over, say, 64 characters), where the
                # characters after position 64 are hashed (we obviously need to do this
                # hashing at search time, too).
                if len(prefix + value) > 220:
                    raise Exception("Field is too long: maximum length "
                                               "220 - was %d (%r)" %
                                               (len(prefix + value),
                                                prefix + value))

                target.add_term(prefix + value, 1) # wdfinc default set 1

            elif type_name == 'freetext':
                # no positions, weight default set 1
                termgen.index_text_without_positions(str(value), 1, prefix)
                termgen.increase_termpos(10)

        def _add_value(target, slotnum, value):
            # Floats are serialised with xapian.sortable_serialise; anything
            # else is stored as its str() form.
            if isinstance(value, float):
                value = xapian.sortable_serialise(float(value))
            else:
                value = str(value)
            target.add_value(int(slotnum), value)

        document = xapian.Document()
        termgen = xapian.TermGenerator()
        termgen.set_document(document)

        terms = set()   # prefixes already indexed from `doc`
        values = set()  # slot numbers (as int) already filled from `doc`
        # build new xapian object
        for field, value in doc.iteritems():
            both = field.startswith('+')
            if both:
                field = field[1:]
            is_value = isinstance(value, (int, float, datetime))

            # sortable
            if both or is_value:
                # Slots are tracked as ints so they compare equal to the
                # `num` of the old document's values further down.
                slotnum = int(self.schema.get_slot(field))
                if slotnum not in values:
                    value = clean_value(value)
                    _add_value(document, slotnum, value)
                    values.add(slotnum)

            # field
            if both or not is_value:
                prefix = self.schema.get_prefix(field)
                if prefix not in terms:
                    value = clean_value(value)
                    _add_term(document, termgen, prefix, value)
                    terms.add(prefix)

        # carry over whatever the new input did not replace
        if old_doc is not None:
            for term in old_doc.termlist():
                prefix, value = self.schema.split_term(term.term)
                if prefix not in terms:
                    _add_term(document, termgen, prefix, value)

            for value in old_doc.values():
                if value.num not in values:
                    _add_value(document, value.num, value.value)

            # Merge into a copy so the caller's dict is left untouched.
            merged = dict(data) if data else {}
            old_data = old_doc.get_data()
            if old_data:
                # NOTE(review): pickle.loads runs whatever was stored in the
                # index; only use this with indexes written by trusted code.
                for key, item in pickle.loads(old_data).iteritems():
                    merged.setdefault(key, item)
            data = merged

        # add data
        if data:
            document.set_data(pickle.dumps(data))

        return document