Example #1
def test_stored_fields():
    codec = default_codec()
    fieldobj = fields.TEXT(stored=True)
    with TempStorage("storedfields") as st:
        seg = codec.new_segment(st, "test")

        dw = codec.per_document_writer(st, seg)
        dw.start_doc(0)
        dw.add_field("a", fieldobj, "hello", 1)
        dw.add_field("b", fieldobj, "there", 1)
        dw.finish_doc()

        dw.start_doc(1)
        dw.add_field("a", fieldobj, "one", 1)
        dw.add_field("b", fieldobj, "two", 1)
        dw.add_field("c", fieldobj, "three", 1)
        dw.finish_doc()

        dw.start_doc(2)
        dw.finish_doc()

        dw.start_doc(3)
        dw.add_field("a", fieldobj, "alfa", 1)
        dw.add_field("b", fieldobj, "bravo", 1)
        dw.finish_doc()

        dw.close()
        seg.set_doc_count(4)

        pdr = codec.per_document_reader(st, seg)
        assert pdr.doc_count_all() == 4
        assert pdr.stored_fields(0) == {"a": "hello", "b": "there"}
        # Note: access out of order
        assert pdr.stored_fields(3) == {"a": "alfa", "b": "bravo"}
        assert pdr.stored_fields(1) == {"a": "one", "b": "two", "c": "three"}

        sfs = list(pdr.all_stored_fields())
        assert len(sfs) == 4
        assert sfs == [
            {
                "a": "hello",
                "b": "there"
            },
            {
                "a": "one",
                "b": "two",
                "c": "three"
            },
            {},
            {
                "a": "alfa",
                "b": "bravo"
            },
        ]
        pdr.close()
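The fourth argument to add_field is the per-document field length, which the codec records alongside the stored value. The sketch below is not part of the original test: it reuses the same unqualified names (default_codec, fields, TempStorage) as the examples on this page and assumes the per-document reader exposes a doc_field_length method in this Whoosh version.

def sketch_field_lengths():
    # Sketch only: same writer/reader pair as Example #1, but checks the
    # recorded field length instead of the stored value.
    codec = default_codec()
    fieldobj = fields.TEXT(stored=True)
    with TempStorage("fieldlengths") as st:
        seg = codec.new_segment(st, "test")

        dw = codec.per_document_writer(st, seg)
        dw.start_doc(0)
        # "hello world" is the stored value; 2 is the recorded field length
        dw.add_field("a", fieldobj, "hello world", 2)
        dw.finish_doc()
        dw.close()
        seg.set_doc_count(1)

        pdr = codec.per_document_reader(st, seg)
        assert pdr.doc_field_length(0, "a") == 2  # assumed reader method
        pdr.close()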
Example #2
def test_stored_fields():
    codec = default_codec()
    fieldobj = fields.TEXT(stored=True)
    with TempStorage("storedfields") as st:
        seg = codec.new_segment(st, "test")

        dw = codec.per_document_writer(st, seg)
        dw.start_doc(0)
        dw.add_field("a", fieldobj, "hello", 1)
        dw.add_field("b", fieldobj, "there", 1)
        dw.finish_doc()

        dw.start_doc(1)
        dw.add_field("a", fieldobj, "one", 1)
        dw.add_field("b", fieldobj, "two", 1)
        dw.add_field("c", fieldobj, "three", 1)
        dw.finish_doc()

        dw.start_doc(2)
        dw.finish_doc()

        dw.start_doc(3)
        dw.add_field("a", fieldobj, "alfa", 1)
        dw.add_field("b", fieldobj, "bravo", 1)
        dw.finish_doc()

        dw.close()

        dr = codec.stored_fields_reader(st, seg)
        assert_equal(dr[0], {"a": "hello", "b": "there"})
        # Note: access out of order
        assert_equal(dr[3], {"a": "alfa", "b": "bravo"})
        assert_equal(dr[1], {"a": "one", "b": "two", "c": "three"})
        dr.close()

        dr = codec.stored_fields_reader(st, seg)
        sfs = list(dr)
        assert_equal(sfs, [
            {
                "a": "hello",
                "b": "there"
            },
            {
                "a": "one",
                "b": "two",
                "c": "three"
            },
            {},
            {
                "a": "alfa",
                "b": "bravo"
            },
        ])
        dr.close()
Example #3
    def __init__(self,
                 ix,
                 poolclass=None,
                 timeout=0.0,
                 delay=0.1,
                 _lk=True,
                 limitmb=128,
                 docbase=0,
                 codec=None,
                 compound=True,
                 **kwargs):
        # Lock the index
        self.writelock = None
        if _lk:
            self.writelock = ix.lock("WRITELOCK")
            if not try_for(
                    self.writelock.acquire, timeout=timeout, delay=delay):
                raise LockError

        if codec is None:
            from whoosh.codec import default_codec
            codec = default_codec()
        self.codec = codec

        # Get info from the index
        self.storage = ix.storage
        self.indexname = ix.indexname
        info = ix._read_toc()
        self.generation = info.generation + 1
        self.schema = info.schema
        self.segments = info.segments
        self.docnum = self.docbase = docbase
        self._setup_doc_offsets()

        # Internals
        self._tempstorage = self.storage.temp_storage("%s.tmp" %
                                                      self.indexname)
        newsegment = codec.new_segment(self.storage, self.indexname)
        self.newsegment = newsegment
        self.compound = compound and newsegment.should_assemble()
        self.is_closed = False
        self._added = False
        self.pool = PostingPool(self._tempstorage,
                                self.newsegment,
                                limitmb=limitmb)

        # Set up writers
        self.perdocwriter = codec.per_document_writer(self.storage, newsegment)
        self.fieldwriter = codec.field_writer(self.storage, newsegment)

        self.merge = True
        self.optimize = False
        self.mergetype = None
Example #4
def test_stored_fields():
    codec = default_codec()
    fieldobj = fields.TEXT(stored=True)
    with TempStorage("storedfields") as st:
        seg = codec.new_segment(st, "test")

        dw = codec.per_document_writer(st, seg)
        dw.start_doc(0)
        dw.add_field("a", fieldobj, "hello", 1)
        dw.add_field("b", fieldobj, "there", 1)
        dw.finish_doc()

        dw.start_doc(1)
        dw.add_field("a", fieldobj, "one", 1)
        dw.add_field("b", fieldobj, "two", 1)
        dw.add_field("c", fieldobj, "three", 1)
        dw.finish_doc()

        dw.start_doc(2)
        dw.finish_doc()

        dw.start_doc(3)
        dw.add_field("a", fieldobj, "alfa", 1)
        dw.add_field("b", fieldobj, "bravo", 1)
        dw.finish_doc()

        dw.close()
        seg.set_doc_count(4)

        pdr = codec.per_document_reader(st, seg)
        assert pdr.doc_count_all() == 4
        assert pdr.stored_fields(0) == {"a": "hello", "b": "there"}
        # Note: access out of order
        assert pdr.stored_fields(3) == {"a": "alfa", "b": "bravo"}
        assert pdr.stored_fields(1) == {"a": "one", "b": "two", "c": "three"}

        sfs = list(pdr.all_stored_fields())
        assert len(sfs) == 4
        assert sfs == [{"a": "hello", "b": "there"},
                       {"a": "one", "b": "two", "c": "three"},
                       {},
                       {"a": "alfa", "b": "bravo"},
                       ]
        pdr.close()
Example #5
    def __init__(self, ix, poolclass=None, timeout=0.0, delay=0.1, _lk=True,
                 limitmb=128, docbase=0, codec=None, compound=True, **kwargs):
        # Lock the index
        self.writelock = None
        if _lk:
            self.writelock = ix.lock("WRITELOCK")
            if not try_for(self.writelock.acquire, timeout=timeout,
                           delay=delay):
                raise LockError

        if codec is None:
            from whoosh.codec import default_codec
            codec = default_codec()
        self.codec = codec

        # Get info from the index
        self.storage = ix.storage
        self.indexname = ix.indexname
        info = ix._read_toc()
        self.generation = info.generation + 1
        self.schema = info.schema
        self.segments = info.segments
        self.docnum = self.docbase = docbase
        self._setup_doc_offsets()

        # Internals
        self._tempstorage = self.storage.temp_storage("%s.tmp" % self.indexname)
        newsegment = codec.new_segment(self.storage, self.indexname)
        self.newsegment = newsegment
        self.compound = compound and newsegment.should_assemble()
        self.is_closed = False
        self._added = False
        self.pool = PostingPool(self._tempstorage, self.newsegment,
                                limitmb=limitmb)

        # Set up writers
        self.perdocwriter = codec.per_document_writer(self.storage, newsegment)
        self.fieldwriter = codec.field_writer(self.storage, newsegment)

        self.merge = True
        self.optimize = False
        self.mergetype = None
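This constructor is normally not called directly; application code obtains a writer from the index, which takes the write lock, picks the codec and creates the new segment as shown above. A minimal sketch using Whoosh's public API (the schema and directory name below are illustrative, not taken from this page):

import os
from whoosh import index
from whoosh.fields import Schema, TEXT, ID

# Illustrative schema and index directory
schema = Schema(path=ID(stored=True), body=TEXT(stored=True))
if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
ix = index.create_in("indexdir", schema)

# ix.writer() ends up running an __init__ like the one above;
# limitmb is passed through to the posting pool.
w = ix.writer(limitmb=128)
w.add_document(path=u"/a", body=u"hello there")
w.commit()  # flushes the segment and releases the WRITELOCK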
Example #6
    def __init__(self, storage, schema, segment, generation=None, codec=None):
        self.storage = storage
        self.schema = schema
        self.segment = segment
        self._gen = generation
        self.is_closed = False
        # Copy info from underlying segment
        self._has_deletions = segment.has_deletions()
        self._dc = segment.doc_count()
        self._dc_all = segment.doc_count_all()
        if hasattr(self.segment, "segment_id"):
            self.segid = self.segment.segment_id()
        else:
            from whoosh.codec.base import Segment
            self.segid = Segment._random_id()

        # self.files is a storage object from which to load the segment files.
        # This is different from the general storage (which will be used for
        # caches) if the segment is in a compound file.
        if segment.is_compound():
            # Use an overlay here instead of just the compound storage because
            # in rare circumstances a segment file may be added after the
            # segment is written
            self.files = OverlayStorage(segment.open_compound_file(storage),
                                        self.storage)
        else:
            self.files = storage

        # Get microreaders from codec
        if codec is None:
            from whoosh.codec import default_codec
            codec = default_codec()
        self._codec = codec
        self._terms = codec.terms_reader(self.files, self.segment)
        self._lengths = codec.lengths_reader(self.files, self.segment)
        self._stored = codec.stored_fields_reader(self.files, self.segment)
        self._vectors = None  # Lazy open with self._open_vectors()
        self._graph = None  # Lazy open with self._open_dawg()

        self.set_caching_policy()
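As with the writer, this reader is usually created indirectly: opening the index and asking it for a reader (or a searcher) builds one segment reader per segment. A brief sketch, assuming an index already exists at the illustrative path used in the previous sketch:

from whoosh import index

ix = index.open_dir("indexdir")  # illustrative path
r = ix.reader()  # constructs segment readers like the one above
try:
    print(r.doc_count(), r.doc_count_all())
    print(r.stored_fields(0))  # served by the stored-fields reader
finally:
    r.close()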
Example #7
def _roundtrip(content, format_, astype, ana=None):
    with TempStorage("roundtrip") as st:
        codec = default_codec()
        seg = codec.new_segment(st, "")
        ana = ana or analysis.StandardAnalyzer()
        field = fields.FieldType(format=format_, analyzer=ana)

        fw = codec.field_writer(st, seg)
        fw.start_field("f1", field)
        for text, _, weight, valuestring in sorted(field.index(content)):
            fw.start_term(text)
            fw.add(0, weight, valuestring, None)
            fw.finish_term()
        fw.finish_field()
        fw.close()

        tr = codec.terms_reader(st, seg)
        ps = []
        for fieldname, btext in tr.terms():
            m = tr.matcher(fieldname, btext, format_)
            ps.append((field.from_bytes(btext), m.value_as(astype)))
        tr.close()
        return ps
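A typical caller pairs this helper with a posting format and an astype name that format can decode. The call below is a sketch, assuming whoosh.formats.Frequency is available in this version and that "frequency" is a valid astype for it; only the rough shape of the result is suggested.

from whoosh import formats

# Sketch only: index a short string with frequency postings and decode
# each posting value back as a term frequency.
content = u"alfa bravo charlie bravo alfa alfa"
postings = _roundtrip(content, formats.Frequency(), "frequency")
# roughly [(u"alfa", 3), (u"bravo", 2), (u"charlie", 1)] if frequencies decode as expected
print(postings)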
Example #8
def _make_codec(**kwargs):
    st = RamStorage()
    codec = default_codec(**kwargs)
    seg = codec.new_segment(st, "test")
    return st, codec, seg
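This helper only bundles the boilerplate the tests above repeat; a caller would use the returned (storage, codec, segment) triple roughly as in Example #1. A short sketch, reusing only calls that appear elsewhere on this page:

def sketch_make_codec_usage():
    # Write one document through the per-document writer, then read it back.
    st, codec, seg = _make_codec()
    fieldobj = fields.TEXT(stored=True)

    dw = codec.per_document_writer(st, seg)
    dw.start_doc(0)
    dw.add_field("a", fieldobj, "hello", 1)
    dw.finish_doc()
    dw.close()
    seg.set_doc_count(1)

    pdr = codec.per_document_reader(st, seg)
    assert pdr.stored_fields(0) == {"a": "hello"}
    pdr.close()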