def test_varbytes_offsets():
    """Check VarBytesColumn reads back correctly with and without offsets.

    For each configuration, 5000 documents are indexed with a sortable
    ``name`` field cycling through eight words. The column reader must be a
    ``TranslatingColumnReader``, its raw column must report the expected
    ``had_stored_offsets`` flag, and sampled document values must match.
    """
    words = u("alfa bravo charlie delta echo foxtrot golf hotel").split()
    nwords = len(words)

    # (VarBytesColumn keyword arguments, expected had_stored_offsets)
    scenarios = (
        # Without offsets
        ({"allow_offsets": False}, False),
        # With offsets
        ({"allow_offsets": True, "write_offsets_cutoff": 4096}, True),
    )

    for colargs, expect_offsets in scenarios:
        col = columns.VarBytesColumn(**colargs)
        schema = fields.Schema(name=fields.ID(sortable=col))
        with TempIndex(schema) as ix:
            with ix.writer() as w:
                for docnum in xrange(5000):
                    w.add_document(name=words[docnum % nwords])

            with ix.reader() as r:
                cr = r.column_reader("name")
                assert isinstance(cr, columns.TranslatingColumnReader)

                stored = cr.raw_column().had_stored_offsets
                if expect_offsets:
                    assert stored
                else:
                    assert not stored

                for docnum in (10, 100, 1000, 3000):
                    assert cr[docnum] == words[docnum % nwords]
def test_roundtrip():
    """Pass sample values for each column type through the ``_rt`` helper.

    Every ``_rt`` call receives a column object, a list of sample values and
    a third value (presumably the column's default value -- confirm against
    ``_rt``).
    """
    # Byte-string columns
    _rt(columns.VarBytesColumn(),
        [b("a"), b("ccc"), b("bbb"), b("e"), b("dd")],
        b(""))
    _rt(columns.FixedBytesColumn(5),
        [b("aaaaa"), b("eeeee"), b("ccccc"), b("bbbbb"), b("eeeee")],
        b("\x00") * 5)
    _rt(columns.RefBytesColumn(),
        [b("a"), b("ccc"), b("bb"), b("ccc"), b("a"), b("bb")],
        b(""))
    _rt(columns.RefBytesColumn(3),
        [b("aaa"), b("bbb"), b("ccc"), b("aaa"), b("bbb"), b("ccc")],
        b("\x00") * 3)

    # Struct column
    _rt(columns.StructColumn("ifH", (0, 0.0, 0)),
        [(100, 1.5, 15000), (-100, -5.0, 0), (5820, 6.5, 462),
         (-57829, -1.5, 6), (0, 0, 0)],
        (0, 0.0, 0))

    # Numeric columns, one call per type code
    numcol = columns.NumericColumn
    _rt(numcol("b"), [10, -20, 30, -25, 15], 0)
    _rt(numcol("B"), [10, 20, 30, 25, 15], 0)
    _rt(numcol("h"), [1000, -2000, 3000, -15000, 32000], 0)
    _rt(numcol("H"), [1000, 2000, 3000, 15000, 50000], 0)
    _rt(numcol("i"), [2**16, -(2**20), 2**24, -(2**28), 2**30], 0)
    _rt(numcol("I"), [2**16, 2**20, 2**24, 2**28, 2**31 & 0xFFFFFFFF], 0)
    _rt(numcol("q"), [10, -20, 30, -25, 15], 0)
    _rt(numcol("Q"), [2**35, 2**40, 2**48, 2**52, 2**63], 0)
    _rt(numcol("f"), [1.5, -2.5, 3.5, -4.5, 1.25], 0)
    _rt(numcol("d"), [1.5, -2.5, 3.5, -4.5, 1.25], 0)

    # Bit column: the same column object is exercised with 70 and then 90
    # random booleans
    c = columns.BitColumn(compress_at=10)
    _rt(c, [bool(random.randint(0, 1)) for _ in xrange(70)], False)
    _rt(c, [bool(random.randint(0, 1)) for _ in xrange(90)], False)

    # Pickle column wrapping a VarBytesColumn
    c = columns.PickleColumn(columns.VarBytesColumn())
    _rt(c, [None, True, False, 100, -7, "hello"], None)

    # List-of-bytes columns
    c = columns.VarBytesListColumn()
    _rt(c, [[b('garnet'), b('amethyst')], [b('pearl')]], [])

    # The original re-created a VarBytesListColumn here and overwrote it on
    # the next line without using it; that dead assignment is dropped.
    c = columns.FixedBytesListColumn(4)
    _rt(c, [[b('garn'), b('amet')], [b('pear')]], [])
def test_pickleability():
    """Instantiate every non-base column class and pickle round-trip it.

    Classes are discovered with ``inspect.getmembers`` on the ``columns``
    module. A ``TypeError`` raised while instantiating a class is re-raised
    as a ``TypeError`` whose message names that class.
    """
    # Base classes that are skipped
    skipped = (columns.Column, columns.WrappedColumn, columns.ListColumn)

    # Positional constructor arguments, keyed by class name; classes not
    # listed here are instantiated with no arguments
    required = {
        "ClampedNumericColumn": (columns.NumericColumn("B"), ),
        "FixedBytesColumn": (5, ),
        "FixedBytesListColumn": (5, ),
        "NumericColumn": ("i", ),
        "PickleColumn": (columns.VarBytesColumn(), ),
        "StructColumn": ("=if", (0, 0.0)),
    }

    for _, cls in inspect.getmembers(columns, inspect.isclass):
        if cls in skipped or not issubclass(cls, columns.Column):
            continue

        ctor_args = required.get(cls.__name__, ())
        try:
            instance = cls(*ctor_args)
        except TypeError:
            e = sys.exc_info()[1]
            raise TypeError("Error instantiating %r: %s" % (cls, e))

        # Highest pickle protocol (-1); the result only has to load
        pickled = dumps(instance, -1)
        loads(pickled)
def __init__(self, analyzer=None, phrase=True, chars=False, vector=None,
             stored=False, field_boost=1.0, multitoken_query="default",
             spelling=False, sortable=False, lang=None):
    """
    :param analyzer: The analysis.Analyzer to use to index the field
        contents. See the analysis module for more information. If you
        omit this argument (and ``lang``), the field uses
        analysis.StandardAnalyzer.
    :param phrase: Whether to store positional information to allow
        phrase searching.
    :param chars: Whether to store character ranges along with
        positions. If this is True, "phrase" is also implied.
    :param vector: A :class:`whoosh.formats.Format` object to use to store
        term vectors, or ``True`` to store vectors using the same format as
        the inverted index, or ``None`` or ``False`` to not store vectors.
        By default, fields do not store term vectors.
    :param stored: Whether to store the value of this field with the
        document. Since this field type generally contains a lot of text,
        you should avoid storing it with the document unless you need to,
        for example to allow fast excerpts in the search results.
    :param field_boost: Passed as ``field_boost`` to the format object
        created for this field.
    :param multitoken_query: Stored unchanged on the field as
        ``multitoken_query``.
    :param spelling: Whether to generate word graphs for this field to make
        spelling suggestions much faster.
    :param sortable: If True, make this field sortable using the default
        column type. If you pass a :class:`whoosh.columns.Column` instance
        instead of True, the field will use the given column type.
    :param lang: Automatically configure a
        :class:`whoosh.analysis.LanguageAnalyzer` for the given language.
        This is ignored if you also specify an ``analyzer``.
    """

    # An explicit analyzer wins over ``lang``; otherwise use the standard one
    if analyzer:
        chosen_analyzer = analyzer
    elif lang:
        chosen_analyzer = analysis.LanguageAnalyzer(lang)
    else:
        chosen_analyzer = analysis.StandardAnalyzer()
    self.analyzer = chosen_analyzer

    # ``chars`` takes precedence over ``phrase`` when picking the format
    if chars:
        formatclass = formats.Characters
    elif phrase:
        formatclass = formats.Positions
    else:
        formatclass = formats.Frequency
    self.format = formatclass(field_boost=field_boost)

    # Term vectors: nothing, a class to instantiate, a ready Format object,
    # or (any other true value) a fresh instance of the field's format class
    if not vector:
        self.vector = None
    elif type(vector) is type:
        self.vector = vector()
    elif isinstance(vector, formats.Format):
        self.vector = vector
    else:
        self.vector = formatclass()

    # Sorting column: a given Column instance is used as-is; any other true
    # value selects a VarBytesColumn
    if not sortable:
        self.column_type = None
    elif isinstance(sortable, columns.Column):
        self.column_type = sortable
    else:
        self.column_type = columns.VarBytesColumn()

    self.multitoken_query = multitoken_query
    self.scorable = True
    self.stored = stored
    self.spelling = spelling
def __init__(self, columnobj=None):
    """
    :param columnobj: The :class:`whoosh.columns.Column` instance to store
        as this field's ``column_type``. If ``None``, a new
        ``columns.VarBytesColumn`` is created and used instead.
    :raises TypeError: If the resulting object is not a ``columns.Column``
        instance.
    """
    column = columns.VarBytesColumn() if columnobj is None else columnobj

    if isinstance(column, columns.Column):
        self.column_type = column
    else:
        raise TypeError("%r is not a column object" % (column,))
def default_column(self):
    """Return a newly created ``columns.VarBytesColumn`` instance.

    A fresh object is built on every call; no arguments are passed to the
    column constructor.
    """
    column = columns.VarBytesColumn()
    return column