Example #1
    def to_bytes(self, value):
        """Returns a bytes representation of the given value, appropriate to be
        written to disk. The default implementation assumes a unicode value and
        encodes it using UTF-8.
        """

        if isinstance(value, (list, tuple)):
            value = value[0]
        if not isinstance(value, bytes_type):
            value = utf8encode(value)[0]
        return value
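The helpers used here come from Whoosh's compatibility and text-utility modules.
A minimal standalone sketch, assuming bytes_type is the native bytes type and
utf8encode is a codecs UTF-8 encoder returning a (bytes, length) pair (which is
why the method takes element [0]):

import codecs

# Assumed stand-ins for the Whoosh helpers referenced above
bytes_type = bytes
utf8encode = codecs.getencoder("utf-8")   # returns (encoded_bytes, length)

def to_bytes(value):
    # Standalone version of the method above, without the field object
    if isinstance(value, (list, tuple)):
        value = value[0]                  # only the first element is kept
    if not isinstance(value, bytes_type):
        value = utf8encode(value)[0]      # [0] drops the length element
    return value

print(to_bytes(u"caf\xe9"))    # b'caf\xc3\xa9'
print(to_bytes([u"a", u"b"]))  # b'a'
print(to_bytes(b"raw"))        # b'raw' (bytes pass through unchanged)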
Example #2
def to_labels(key):
    """Takes a string and returns a list of bytestrings, suitable for use as
    a key or path in an FSA/FST graph.
    """

    # Convert to tuples of bytestrings (must be tuples so they can be hashed)
    keytype = type(key)

    # I hate the Python 3 bytes object so friggin much
    if keytype is tuple or keytype is list:
        if not all(isinstance(e, bytes_type) for e in key):
            raise TypeError("%r contains a non-bytestring" % key)
        if keytype is list:
            key = tuple(key)
    elif isinstance(key, bytes_type):
        key = tuple(key[i:i + 1] for i in xrange(len(key)))
    elif isinstance(key, text_type):
        key = tuple(utf8encode(key[i:i + 1])[0] for i in xrange(len(key)))
    else:
        raise TypeError("Don't know how to convert %r" % key)
    return key
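A quick sketch of what to_labels produces for each accepted key type. This
assumes the same compat helpers (bytes_type, text_type, utf8encode) are in
scope; on Python 3, xrange is simply range:

# For bytes keys each label is a single byte; for text keys each label is
# the UTF-8 encoding of a single character.
print(to_labels(u"abc"))         # (b'a', b'b', b'c')
print(to_labels(b"abc"))         # (b'a', b'b', b'c')
print(to_labels([b"ab", b"c"]))  # (b'ab', b'c') -- list converted to tuple
# to_labels(123) raises TypeError("Don't know how to convert 123")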
Example #3
    def index(self, value, **kwargs):
        """Returns an iterator of (btext, frequency, weight, encoded_value)
        tuples for each unique word in the input value.

        The default implementation uses the ``analyzer`` attribute to tokenize
        the value into strings, then encodes them into bytes using UTF-8.
        """

        if not self.format:
            raise Exception("%s field %r cannot index without a format"
                            % (self.__class__.__name__, self))
        if not isinstance(value, (text_type, list, tuple)):
            raise ValueError("%r is not unicode or sequence" % value)
        assert isinstance(self.format, formats.Format)

        if "mode" not in kwargs:
            kwargs["mode"] = "index"

        word_values = self.format.word_values
        ana = self.analyzer
        for tstring, freq, wt, vbytes in word_values(value, ana, **kwargs):
            yield (utf8encode(tstring)[0], freq, wt, vbytes)
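On the caller side this method is normally reached through a field object.
A rough sketch with a stock TEXT field (an assumption; the exact weights and
encoded values depend on the field's analyzer and format):

from whoosh import fields

field = fields.TEXT()
for btext, freq, weight, vbytes in field.index(u"hello hello world"):
    print(btext, freq, weight)
# Expected shape of the output (encoded values omitted):
#   b'hello' 2 2.0
#   b'world' 1 1.0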
Example #4
    def add_document(self, **fields):
        self._check_state()
        perdocwriter = self.perdocwriter
        schema = self.schema
        docnum = self.docnum
        add_post = self.pool.add

        docboost = self._doc_boost(fields)
        fieldnames = sorted([name for name in fields.keys()
                             if not name.startswith("_")])
        self._check_fields(schema, fieldnames)

        perdocwriter.start_doc(docnum)
        for fieldname in fieldnames:
            value = fields.get(fieldname)
            if value is None:
                continue
            field = schema[fieldname]

            length = 0
            if field.indexed:
                # TODO: Method for adding progressive field values, ie
                # setting start_pos/start_char?
                fieldboost = self._field_boost(fields, fieldname, docboost)
                # Ask the field to return a list of (text, weight, vbytes)
                # tuples
                items = field.index(value)
                # Only store the length if the field is marked scorable
                scorable = field.scorable
                # Add the terms to the pool
                for tbytes, freq, weight, vbytes in items:
                    weight *= fieldboost
                    if scorable:
                        length += freq
                    add_post((fieldname, tbytes, docnum, weight, vbytes))

            if field.separate_spelling():
                spellfield = field.spelling_fieldname(fieldname)
                for word in field.spellable_words(value):
                    word = utf8encode(word)[0]
                    # item = (fieldname, tbytes, docnum, weight, vbytes)
                    # Use emptybytes rather than reusing vbytes from the
                    # indexing loop above, which may be stale or unbound here
                    add_post((spellfield, word, 0, 1, emptybytes))

            vformat = field.vector
            if vformat:
                analyzer = field.analyzer
                # Call the format's word_values method to get posting values
                vitems = vformat.word_values(value, analyzer, mode="index")
                # Remove unused frequency field from the tuple
                vitems = sorted((text, weight, vbytes)
                                for text, _, weight, vbytes in vitems)
                perdocwriter.add_vector_items(fieldname, field, vitems)

            # Allow a custom value for stored field/column
            customval = fields.get("_stored_%s" % fieldname, value)

            # Add the stored value and length for this field to the per-
            # document writer
            sv = customval if field.stored else None
            perdocwriter.add_field(fieldname, field, sv, length)

            column = field.column_type
            if column and customval is not None:
                cv = field.to_column_value(customval)
                perdocwriter.add_column_value(fieldname, column, cv)

        perdocwriter.finish_doc()
        self._added = True
        self.docnum += 1
Example #5
    def add_document(self, **fields):
        self._check_state()
        perdocwriter = self.perdocwriter
        schema = self.schema
        docnum = self.docnum
        add_post = self.pool.add

        docboost = self._doc_boost(fields)
        fieldnames = sorted(
            [name for name in fields.keys() if not name.startswith("_")])
        self._check_fields(schema, fieldnames)

        perdocwriter.start_doc(docnum)
        for fieldname in fieldnames:
            value = fields.get(fieldname)
            if value is None:
                continue
            field = schema[fieldname]

            length = 0
            if field.indexed:
                # TODO: Method for adding progressive field values, ie
                # setting start_pos/start_char?
                fieldboost = self._field_boost(fields, fieldname, docboost)
                # Ask the field to return a list of (text, weight, vbytes)
                # tuples
                items = field.index(value)
                # Only store the length if the field is marked scorable
                scorable = field.scorable
                # Add the terms to the pool
                for tbytes, freq, weight, vbytes in items:
                    weight *= fieldboost
                    if scorable:
                        length += freq
                    add_post((fieldname, tbytes, docnum, weight, vbytes))

            if field.separate_spelling():
                # For fields which use different morphemes for spelling,
                # insert fake postings for the spellable words, where
                # docnum=-1 means "this is a spelling word"

                # TODO: think of something less hacktacular
                for word in field.spellable_words(value):
                    word = utf8encode(word)[0]
                    add_post((fieldname, word, -1, -1, emptybytes))

            vformat = field.vector
            if vformat:
                analyzer = field.analyzer
                # Call the format's word_values method to get posting values
                vitems = vformat.word_values(value, analyzer, mode="index")
                # Remove unused frequency field from the tuple
                vitems = sorted((text, weight, vbytes)
                                for text, _, weight, vbytes in vitems)
                perdocwriter.add_vector_items(fieldname, field, vitems)

            # Allow a custom value for stored field/column
            customval = fields.get("_stored_%s" % fieldname, value)

            # Add the stored value and length for this field to the per-
            # document writer
            sv = customval if field.stored else None
            perdocwriter.add_field(fieldname, field, sv, length)

            column = field.column_type
            if column and customval is not None:
                cv = field.to_column_value(customval)
                perdocwriter.add_column_value(fieldname, column, cv)

        perdocwriter.finish_doc()
        self._added = True
        self.docnum += 1
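For completeness, the caller-side view: add_document is reached through a
writer obtained from an index, and the _stored_<fieldname> keyword handled
above overrides the value that gets stored. A sketch with a hypothetical
directory name and schema:

import os
from whoosh import fields, index

schema = fields.Schema(title=fields.TEXT(stored=True),
                       content=fields.TEXT)
if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
ix = index.create_in("indexdir", schema)

writer = ix.writer()
writer.add_document(title=u"First document",
                    content=u"Body text of the first document.",
                    # Overrides the stored value for "title", matching the
                    # "_stored_%s" lookup in add_document above
                    _stored_title=u"Display title")
writer.commit()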