Esempio n. 1
0
    def __init__(self, stored=False, field_boost=1.0):
        """
        Record the storage flag and boost, and build the field's format.

        :param stored: Whether the value of this field is stored with the
            document.
        :param field_boost: Boost value kept on the field and also handed to
            the ``Existence`` format.
        """

        # The boost is kept on the instance in addition to being passed on
        # to the format object below.
        self.stored, self.field_boost = stored, field_boost
        self.format = formats.Existence(field_boost=field_boost)
Esempio n. 2
0
    def __init__(self,
                 type=int,
                 stored=False,
                 unique=False,
                 field_boost=1.0,
                 decimal_places=0,
                 shift_step=4,
                 signed=True):
        """
        Pick the text converters and sortable typecode that match the number
        type, then record the remaining field options.

        :param type: the type of numbers that can be stored in this field: one
            of ``int``, ``long``, ``float``, or ``Decimal``.
        :param stored: Whether the value of this field is stored with the
            document.
        :param unique: Whether the value of this field is unique per-document.
        :param field_boost: Boost value handed to the ``Existence`` format.
        :param decimal_places: specifies the number of decimal places to save
            when storing Decimal instances as ``int`` or ``float``.
        :param shift_step: The number of bits of precision to shift away at
            each tiered indexing level. Values should generally be 1-8. Lower
            values yield faster searches but take up more space. A value
            of `0` means no tiered indexing.
        :param signed: Whether the numbers stored in this field may be
            negative. Selects the signed or unsigned typecode for integer
            types.
        :raises TypeError: if ``type`` is ``Decimal`` or any type other than
            the long type, ``int`` or ``float``.
        """

        self.type = numtype = type

        # Choose the converter pair and typecode first, assign them once below.
        if numtype is long_type:
            # Checked before int: this will catch the Python 3 int type
            converters = (long_to_text, text_to_long)
            typecode = "q" if signed else "Q"
        elif numtype is int:
            converters = (int_to_text, text_to_int)
            typecode = "i" if signed else "I"
        elif numtype is float:
            converters = (float_to_text, text_to_float)
            typecode = "f"
        elif numtype is Decimal:
            raise TypeError("To store Decimal instances, set type to int or "
                            "float and use the decimal_places argument")
        else:
            raise TypeError("%s field type can't store %r" %
                            (self.__class__, self.type))

        self._to_text, self._from_text = converters
        self.sortable_typecode = typecode

        self.stored, self.unique = stored, unique
        self.decimal_places = decimal_places
        self.shift_step = shift_step
        self.signed = signed
        self.analyzer = IDAnalyzer()
        self.format = formats.Existence(field_boost=field_boost)
Esempio n. 3
0
    def __init__(self, stored=False, unique=False, field_boost=1.0,
                 spelling=False, sortable=False, analyzer=None):
        """
        Configure the analyzer, format and flags of the field.

        :param stored: Whether the value of this field is stored with the
            document.
        :param unique: Kept on the field as ``unique``.
        :param field_boost: Boost value handed to the ``Existence`` format.
        :param spelling: Kept on the field as ``spelling``.
        :param sortable: Passed to ``set_sortable()``.
        :param analyzer: Analyzer to use; when omitted (or falsy) an
            ``IDAnalyzer`` is created.
        """

        if not analyzer:
            # Nothing usable supplied: fall back to the ID analyzer
            analyzer = analysis.IDAnalyzer()
        self.analyzer = analyzer
        self.format = formats.Existence(field_boost=field_boost)
        self.stored, self.unique, self.spelling = stored, unique, spelling
        self.set_sortable(sortable)
Esempio n. 4
0
    def __init__(self, stored=False, unique=False, expression=None,
                 field_boost=1.0, spelling=False):
        """
        Build a regex-based analyzer and record the field's flags.

        :param stored: Whether the value of this field is stored with the
            document.
        :param unique: Whether the value of this field is unique per-document.
        :param expression: The regular expression object to use to extract
            tokens. The default expression breaks tokens on CRs, LFs, tabs,
            spaces, commas, and semicolons.
        :param field_boost: Boost value handed to the ``Existence`` format.
        :param spelling: Kept on the field as ``spelling``.
        """

        if not expression:
            # Default: a token is any run of characters that is not a CR, LF,
            # tab, space, comma or semicolon
            expression = re.compile(r"[^\r\n\t ,;]+")
        self.analyzer = analysis.RegexAnalyzer(expression=expression)
        self.format = formats.Existence(field_boost=field_boost)
        self.stored, self.unique, self.spelling = stored, unique, spelling
Esempio n. 5
0
 def __init__(self,
              stored=False,
              unique=False,
              separator=None,
              field_boost=1.0,
              spelling=False):
     """
     Initialise the parent field, then replace its analyzer with a
     ``SplitAnalyzer`` that splits on ``separator``.

     :param stored: Whether the value of this field is stored with the
         document.
     :param unique: Whether the value of this field is unique per-document.
     :param separator: The separator handed to ``SplitAnalyzer``. When
         ``None`` (the default) a tab character is used.
     :param field_boost: Boost value handed to the ``Existence`` format.
     :param spelling: Kept on the field as ``spelling``.
     """
     super(SPLITTEDIDLIST, self).__init__(stored, unique, separator,
                                          field_boost, spelling)
     # Previously the separator argument was ignored and a tab was always
     # used; honour the caller's value and keep tab as the default.
     if separator is None:
         separator = "\t"
     self.analyzer = SplitAnalyzer(separator=separator)
     self.format = formats.Existence(field_boost=field_boost)
     self.stored = stored
     self.unique = unique
     self.spelling = spelling
Esempio n. 6
0
from __future__ import with_statement
import os.path, random, string
import sqlite3 as sqlite

from whoosh import fields, formats, index, query, sorting
from whoosh.util import now

# Benchmark parameters: number of distinct tags, number of documents to
# index, and the directory holding the index.
tagcount = 100
doccount = 500000
dirname = "testindex"

# Single stored KEYWORD field with an Existence-format vector.
schema = fields.Schema(
    tags=fields.KEYWORD(stored=True, vector=formats.Existence()))

if not os.path.exists(dirname):
    os.mkdir(dirname)

# Set to True to rebuild the index even when one already exists.
reindex = False
if reindex or not index.exists_in(dirname):
    # Generate the tag vocabulary: random five-letter lowercase words.
    tags = [
        u"".join(random.choice(string.ascii_lowercase) for _ in xrange(5))
        for _ in xrange(tagcount)
    ]

    ix = index.create_in(dirname, schema)
    t = now()
    with ix.writer() as w:
        # Each document gets between 10 and 20 distinct tags, space-joined.
        for _ in xrange(doccount):
            chosen = random.sample(tags, random.randint(10, 20))
            w.add_document(tags=u" ".join(chosen))
Esempio n. 7
0
    def __init__(self, numtype=int, bits=32, stored=False, unique=False,
                 field_boost=1.0, decimal_places=0, shift_step=4, signed=True,
                 sortable=False, default=None):
        """
        Validate the number type and bit size, derive the sortable typecode
        and struct, then record the field options and column default.

        :param numtype: the type of numbers that can be stored in this field,
            either ``int``, ``float``. If you use ``Decimal``,
            use the ``decimal_places`` argument to control how many decimal
            places the field will store. The strings ``"int"`` and
            ``"float"`` are accepted as aliases.
        :param bits: Size of the number: 8, 16, 32 or 64. Forced to 64 when
            ``numtype`` is ``float``.
        :param stored: Whether the value of this field is stored with the
            document.
        :param unique: Whether the value of this field is unique per-document.
        :param field_boost: Boost value handed to the ``Existence`` format.
        :param decimal_places: specifies the number of decimal places to save
            when storing Decimal instances. If you set this, you will always
            get Decimal instances back from the field.
        :param shift_step: The number of bits of precision to shift away at
            each tiered indexing level. Values should generally be 1-8. Lower
            values yield faster searches but take up more space. A value
            of `0` means no tiered indexing.
        :param signed: Whether the numbers stored in this field may be
            negative.
        :param sortable: Passed to ``set_sortable()``.
        :param default: Column default. When ``None``, the ``typecode_max``
            entry for the typecode is used for ``int`` fields and ``NaN``
            for ``float`` fields; any other value must pass ``is_valid()``.
        :raises TypeError: if ``numtype`` is ``Decimal`` or anything other
            than ``int``/``float``.
        :raises Exception: if ``decimal_places`` is combined with ``float``,
            if ``bits`` is not a supported size, or if ``default`` is not
            valid for this field.
        """

        # Allow users to specify strings instead of Python types in case
        # docstring isn't clear
        if numtype == "int":
            numtype = int
        elif numtype == "float":
            numtype = float

        # Only int and float are accepted as the underlying type
        if numtype is Decimal:
            raise TypeError("To store Decimal instances, set type to int use "
                            "the decimal_places argument")
        if numtype not in (int, float):
            raise TypeError("Can't use %r as a type, use int or float"
                            % numtype)

        is_float = numtype is float
        # Sanity check
        if is_float and decimal_places:
            raise Exception("A float type and decimal_places argument %r are "
                            "incompatible" % decimal_places)

        # Set up field configuration based on type and size
        if is_float:
            bits = 64  # Floats are converted to 64 bit ints
        sizes = (8, 16, 32, 64)
        codes = ("B", "H", "I", "Q")
        if bits not in sizes:
            raise Exception("Invalid bits %r, use 8, 16, 32, or 64"
                            % bits)
        # Type code for the *sortable* representation
        typecode = codes[sizes.index(bits)]
        self.sortable_typecode = typecode
        self._struct = struct.Struct(">" + typecode)

        self.numtype, self.bits = numtype, bits
        self.stored, self.unique = stored, unique
        self.decimal_places = decimal_places
        self.shift_step = shift_step
        self.signed = signed
        self.analyzer = analysis.IDAnalyzer()
        self.format = formats.Existence(field_boost=field_boost)

        # Column configuration: an explicit default is validated only after
        # the attributes above have been set.
        if default is not None:
            if not self.is_valid(default):
                raise Exception("The default %r is not a valid number for this "
                                "field" % default)
        elif numtype is int:
            default = typecode_max[self.sortable_typecode]
        else:
            default = NaN

        self.default = default
        self.set_sortable(sortable)