Example #1
def test_hash_contents():
    samp = [
        ('alfa', 'bravo'),
        ('charlie', 'delta'),
        ('echo', 'foxtrot'),
        ('golf', 'hotel'),
        ('india', 'juliet'),
        ('kilo', 'lima'),
        ('mike', 'november'),
        ('oskar', 'papa'),
        ('quebec', 'romeo'),
        ('sierra', 'tango'),
        ('ultra', 'victor'),
        ('whiskey', 'xray'),
    ]
    # Convert to bytes
    samp = set((b(k), b(v)) for k, v in samp)

    with TempStorage("hashcontents") as st:
        hwf = st.create_file("test.hsh")
        hw = HashWriter(hwf)
        hw.add_all(samp)
        hw.close()

        hrf = st.open_file("test.hsh")
        hr = HashReader(hrf)
        assert_equal(set(hr.items()), samp)
        hr.close()
Example #2
def test_fieldwriter_multiblock():
    field = fields.TEXT()
    st, codec, seg = _make_codec(blocklimit=2)

    fw = codec.field_writer(st, seg)
    fw.start_field("text", field)
    fw.start_term(u("alfa"))
    fw.add(0, 2.0, b("test1"), 2)
    fw.add(1, 5.0, b("test2"), 5)
    fw.add(2, 3.0, b("test3"), 3)
    fw.add(3, 4.0, b("test4"), 4)
    fw.add(4, 1.0, b("test5"), 1)
    fw.finish_term()
    fw.finish_field()
    fw.close()

    tr = codec.terms_reader(st, seg)
    ti = tr.terminfo("text", "alfa")
    assert_equal(ti.weight(), 15.0)
    assert_equal(ti.doc_frequency(), 5)
    assert_equal(ti.min_length(), 1)
    assert_equal(ti.max_length(), 5)
    assert_equal(ti.max_weight(), 5.0)
    assert_equal(ti.min_id(), 0)
    assert_equal(ti.max_id(), 4)

    ps = []
    m = tr.matcher("text", "alfa", field.format)
    while m.is_active():
        ps.append((m.id(), m.weight(), m.value()))
        m.next()
    assert_equal(ps, [(0, 2.0, b("test1")), (1, 5.0, b("test2")),
                      (2, 3.0, b("test3")), (3, 4.0, b("test4")),
                      (4, 1.0, b("test5"))])
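Example #3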
def test_skip():
    _docnums = [1, 3, 12, 34, 43, 67, 68, 102, 145, 212, 283, 291, 412, 900,
                905, 1024, 1800, 2048, 15000]
    st, codec, seg = _make_codec()
    fieldobj = fields.TEXT()
    fw = codec.field_writer(st, seg)
    fw.start_field("f1", fieldobj)
    fw.start_term(b("test"))
    for n in _docnums:
        fw.add(n, 1.0, b(''), None)
    fw.finish_term()
    fw.finish_field()
    fw.close()

    tr = codec.terms_reader(st, seg)
    m = tr.matcher("f1", b("test"), fieldobj.format)
    assert m.id() == 1
    m.skip_to(220)
    assert m.id() == 283
    m.skip_to(1)
    assert m.id() == 283
    m.skip_to(1000)
    assert m.id() == 1024
    m.skip_to(1800)
    assert m.id() == 1800
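
A quick, self-contained sketch of the skip_to() contract this test exercises (an illustration, not Whoosh's actual matcher code): skip_to() advances to the first posting whose document number is >= the target, and skipping backward is a no-op.

import bisect

docnums = [1, 3, 12, 34, 43, 67, 68, 102, 145, 212, 283, 291, 412, 900,
           905, 1024, 1800, 2048, 15000]

def skip_to(i, target):
    # Already at or past the target: skipping backward is a no-op
    if docnums[i] >= target:
        return i
    # Otherwise advance to the first docnum >= target
    return bisect.bisect_left(docnums, target, i)

i = skip_to(0, 220)
assert docnums[i] == 283
i = skip_to(i, 1)        # no-op, stays at 283
assert docnums[i] == 283
i = skip_to(i, 1000)
assert docnums[i] == 1024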
Example #4
    def _btexts(self, ixreader):
        fieldname = self.fieldname
        field = ixreader.schema[fieldname]
        startexcl = self.startexcl
        endexcl = self.endexcl

        if self.start is None:
            start = b("")
        else:
            start = field.to_bytes(self.start)
        if self.end is None:
            end = b("\xFF\xFF\xFF\xFF")
        else:
            end = field.to_bytes(self.end)

        for fname, t in ixreader.terms_from(fieldname, start):
            if fname != fieldname:
                break
            if t == start and startexcl:
                continue
            if t == end and endexcl:
                break
            if t > end:
                break
            yield t
Example #5
def test_skip():
    _docnums = [1, 3, 12, 34, 43, 67, 68, 102, 145, 212, 283, 291, 412, 900,
                905, 1024, 1800, 2048, 15000]
    st, codec, seg = _make_codec()
    fieldobj = fields.TEXT()
    fw = codec.field_writer(st, seg)
    fw.start_field("f1", fieldobj)
    fw.start_term(b("test"))
    for n in _docnums:
        fw.add(n, 1.0, b(''), None)
    fw.finish_term()
    fw.finish_field()
    fw.close()

    tr = codec.terms_reader(st, seg)
    m = tr.matcher("f1", b("test"), fieldobj.format)
    assert m.id() == 1
    m.skip_to(220)
    assert m.id() == 283
    m.skip_to(1)
    assert m.id() == 283
    m.skip_to(1000)
    assert m.id() == 1024
    m.skip_to(1800)
    assert m.id() == 1800
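Example #6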
def test_wildcard_existing_terms():
    s = fields.Schema(key=fields.ID, value=fields.TEXT)
    ix = RamStorage().create_index(s)

    w = ix.writer()
    w.add_document(key=u("a"), value=u("alfa bravo bear charlie delta"))
    w.add_document(key=u("a"), value=u("boggle echo render rendering renders"))
    w.commit()
    r = ix.reader()
    qp = QueryParser("value", ix.schema)

    def words(terms):
        z = []
        for t in terms:
            assert t[0] == "value"
            z.append(t[1])
        return b(" ").join(sorted(z))

    q = qp.parse(u("b*"))
    ts = q.existing_terms(r)
    assert ts == set()
    ts = q.existing_terms(r, expand=True)
    assert words(ts) == b("bear boggle bravo")

    q = qp.parse(u("[a TO f]"))
    ts = q.existing_terms(r)
    assert ts == set()
    ts = q.existing_terms(r, expand=True)
    assert words(ts) == b("alfa bear boggle bravo charlie delta echo")

    q = query.Variations("value", "render")
    ts = q.existing_terms(r, expand=False)
    assert ts == set([("value", b("render"))])
    ts = q.existing_terms(r, expand=True)
    assert words(ts) == b("render rendering renders")
Example #7
    def close(self):
        dbfile = self.dbfile
        order = self.order
        keycount = self.keycount

        # Finish the pickled list of texts
        dbfile.write(b("l."))

        # Compact the order array if possible. Give `code` a default so it
        # is always defined, even when no compaction applies (assuming "i",
        # the typecode of the uncompacted order array).
        code = "i"
        if self.hastexts:
            if keycount < 255:
                code = "B"
                order = array(code, order)
            elif keycount < 65535:
                code = "H"
                order = array(code, order)

        # Write the (possibly compacted) local `order`, not self.order
        dbfile.write(code)
        dbfile.write_array(order)

        # Seek back to the start and write numbers of docs
        dbfile.flush()
        dbfile.seek(self.start)
        dbfile.write_uint(len(order))
        if self.hastexts:
            dbfile.write_uint(keycount)
        dbfile.flush()

        # Seek back and write the finished file tag
        dbfile.seek(self.tagpos)
        dbfile.write(b("+"))

        dbfile.close()
Example #8
def test_hash_contents():
    samp = [('alfa', 'bravo'), ('charlie', 'delta'), ('echo', 'foxtrot'),
            ('golf', 'hotel'), ('india', 'juliet'), ('kilo', 'lima'),
            ('mike', 'november'), ('oskar', 'papa'), ('quebec', 'romeo'),
            ('sierra', 'tango'), ('ultra', 'victor'), ('whiskey', 'xray'),
            ]
    # Convert to bytes
    samp = set((b(k), b(v)) for k, v in samp)

    with TempStorage("hashcontents") as st:
        hw = HashWriter(st.create_file("test.hsh"))
        hw.add_all(samp)
        hw.close()

        hr = HashReader.open(st, "test.hsh")

        probes = list(samp)
        random.shuffle(probes)
        for key, value in probes:
            assert hr[key] == value

        assert set(hr.keys()) == set([k for k, v in samp])
        assert set(hr.values()) == set([v for k, v in samp])
        assert set(hr.items()) == samp

        hr.close()
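Example #9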
    def from_string(cls, s):
        hbyte = ord(s[0:1])
        if hbyte < 2:
            st = cls.struct
            # Freq, Doc freq, min len, max len, max w, max WOL, min ID, max ID
            f, df, ml, xl, xw, xwol, mid, xid = st.unpack(s[1:st.size + 1])
            mid = None if mid == NO_ID else mid
            xid = None if xid == NO_ID else xid
            # Postings
            pstr = s[st.size + 1:]
            if hbyte == 0:
                p = unpack_long(pstr)[0]
            else:
                p = loads(pstr + b("."))
        else:
            # Old format was encoded as a variable length pickled tuple
            v = loads(s + b("."))
            if len(v) == 1:
                f = df = 1
                p = v[0]
            elif len(v) == 2:
                f = df = v[1]
                p = v[0]
            else:
                f, p, df = v
            # Fake values for stats which weren't stored before
            ml = 1
            xl = 106374
            xw = 999999999
            xwol = 999999999
            mid = -1
            xid = -1

        return cls(f, df, ml, xl, xw, xwol, mid, xid, p)
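
A runnable sketch of the truncated-pickle trick used above (and by the decode_* methods elsewhere on this page): the stored payload omits the final "." stop opcode, and loads() needs it re-appended.

from pickle import dumps, loads

data = (1, 2, 3)
stored = dumps(data, 0)[:-1]   # protocol-0 pickles end with b"."
assert loads(stored + b".") == data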
Example #10
    def close(self):
        dbfile = self.dbfile
        order = self.order
        keycount = self.keycount

        # Finish the pickled list of texts
        dbfile.write(b("l."))

        # Compact the order array if possible. Give `code` a default so it
        # is always defined, even when no compaction applies (assuming "i",
        # the typecode of the uncompacted order array).
        code = "i"
        if self.hastexts:
            if keycount < 255:
                code = "B"
                order = array(code, order)
            elif keycount < 65535:
                code = "H"
                order = array(code, order)

        # Write the (possibly compacted) local `order`, not self.order
        dbfile.write(code)
        dbfile.write_array(order)

        # Seek back to the start and write numbers of docs
        dbfile.flush()
        dbfile.seek(self.start)
        dbfile.write_uint(len(order))
        if self.hastexts:
            dbfile.write_uint(keycount)
        dbfile.flush()

        # Seek back and write the finished file tag
        dbfile.seek(self.tagpos)
        dbfile.write(b("+"))

        dbfile.close()
Example #11
    def _btexts(self, ixreader):
        fieldname = self.fieldname
        field = ixreader.schema[fieldname]
        startexcl = self.startexcl
        endexcl = self.endexcl

        if self.start is None:
            start = b("")
        else:
            try:
                start = field.to_bytes(self.start)
            except ValueError:
                return

        if self.end is None:
            end = b("\xFF\xFF\xFF\xFF")
        else:
            try:
                end = field.to_bytes(self.end)
            except ValueError:
                return

        for fname, t in ixreader.terms_from(fieldname, start):
            if fname != fieldname:
                break
            if t == start and startexcl:
                continue
            if t == end and endexcl:
                break
            if t > end:
                break
            yield t
Example #12
def _test_simple_compound(st):
    alist = [1, 2, 3, 5, -5, -4, -3, -2]
    blist = [1, 12, 67, 8, 2, 1023]
    clist = [100, -100, 200, -200]

    with st.create_file("a") as af:
        for x in alist:
            af.write_int(x)
    with st.create_file("b") as bf:
        for x in blist:
            bf.write_varint(x)
    with st.create_file("c") as cf:
        for x in clist:
            cf.write_int(x)

    f = st.create_file("f")
    CompoundStorage.assemble(f, st, ["a", "b", "c"])

    f = CompoundStorage(st.open_file("f"))
    with f.open_file("a") as af:
        for x in alist:
            assert x == af.read_int()
        assert af.read() == b('')

    with f.open_file("b") as bf:
        for x in blist:
            assert x == bf.read_varint()
        assert bf.read() == b('')

    with f.open_file("c") as cf:
        for x in clist:
            assert x == cf.read_int()
        assert cf.read() == b('')
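Example #13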
def test_wildcard_existing_terms():
    s = fields.Schema(key=fields.ID, value=fields.TEXT)
    ix = RamStorage().create_index(s)

    w = ix.writer()
    w.add_document(key=u("a"), value=u("alfa bravo bear charlie delta"))
    w.add_document(key=u("a"), value=u("boggle echo render rendering renders"))
    w.commit()
    r = ix.reader()
    qp = QueryParser("value", ix.schema)

    def words(terms):
        z = []
        for t in terms:
            assert t[0] == "value"
            z.append(t[1])
        return b(" ").join(sorted(z))

    q = qp.parse(u("b*"))
    ts = q.existing_terms(r)
    assert ts == set()
    ts = q.existing_terms(r, expand=True)
    assert words(ts) == b("bear boggle bravo")

    q = qp.parse(u("[a TO f]"))
    ts = q.existing_terms(r)
    assert ts == set()
    ts = q.existing_terms(r, expand=True)
    assert words(ts) == b("alfa bear boggle bravo charlie delta echo")

    q = query.Variations("value", "render")
    ts = q.existing_terms(r, expand=False)
    assert ts == set([("value", b("render"))])
    ts = q.existing_terms(r, expand=True)
    assert words(ts) == b("render rendering renders")
Example #14
def test_hash_contents():
    samp = [
        ('alfa', 'bravo'),
        ('charlie', 'delta'),
        ('echo', 'foxtrot'),
        ('golf', 'hotel'),
        ('india', 'juliet'),
        ('kilo', 'lima'),
        ('mike', 'november'),
        ('oskar', 'papa'),
        ('quebec', 'romeo'),
        ('sierra', 'tango'),
        ('ultra', 'victor'),
        ('whiskey', 'xray'),
    ]
    # Convert to bytes
    samp = set((b(k), b(v)) for k, v in samp)

    with TempStorage("hashcontents") as st:
        hw = HashWriter(st.create_file("test.hsh"))
        hw.add_all(samp)
        hw.close()

        hr = HashReader.open(st, "test.hsh")

        probes = list(samp)
        random.shuffle(probes)
        for key, value in probes:
            assert hr[key] == value

        assert set(hr.keys()) == set([k for k, v in samp])
        assert set(hr.values()) == set([v for k, v in samp])
        assert set(hr.items()) == samp

        hr.close()
Example #15
def _test_simple_compound(st):
    alist = [1, 2, 3, 5, -5, -4, -3, -2]
    blist = [1, 12, 67, 8, 2, 1023]
    clist = [100, -100, 200, -200]

    with st.create_file("a") as af:
        for x in alist:
            af.write_int(x)
    with st.create_file("b") as bf:
        for x in blist:
            bf.write_varint(x)
    with st.create_file("c") as cf:
        for x in clist:
            cf.write_int(x)

    f = st.create_file("f")
    CompoundStorage.assemble(f, st, ["a", "b", "c"])

    f = CompoundStorage(st, "f")
    with f.open_file("a") as af:
        for x in alist:
            assert_equal(x, af.read_int())
        assert_equal(af.read(), b(''))

    with f.open_file("b") as bf:
        for x in blist:
            assert_equal(x, bf.read_varint())
        assert_equal(bf.read(), b(''))

    with f.open_file("c") as cf:
        for x in clist:
            assert_equal(x, cf.read_int())
        assert_equal(cf.read(), b(''))
Example #16
def test_token_boost():
    from whoosh.analysis import RegexTokenizer, DoubleMetaphoneFilter
    ana = RegexTokenizer() | DoubleMetaphoneFilter()
    field = fields.TEXT(analyzer=ana, phrase=False)
    results = list(field.index(u("spruce view")))
    assert_equal(results, [('SPRS', 1, 1.0, b('\x00\x00\x00\x01')),
                           ('FF', 1, 0.5, b('\x00\x00\x00\x01')),
                           ('F', 1, 1.0, b('\x00\x00\x00\x01'))])
Example #17
def test_token_boost():
    from whoosh.analysis import RegexTokenizer, DoubleMetaphoneFilter
    ana = RegexTokenizer() | DoubleMetaphoneFilter()
    field = fields.TEXT(analyzer=ana, phrase=False)
    results = list(field.index(u("spruce view")))
    assert_equal(results, [('SPRS', 1, 1.0, b('\x00\x00\x00\x01')),
                           ('FF', 1, 0.5, b('\x00\x00\x00\x01')),
                           ('F', 1, 1.0, b('\x00\x00\x00\x01'))])
Example #18
def test_hash_single():
    st = RamStorage()
    hw = HashWriter(st.create_file("test.hsh"))
    hw.add(b("alfa"), b("bravo"))
    hw.close()

    hr = HashReader.open(st, "test.hsh")
    assert hr.get(b("alfa")) == b("bravo")
    assert hr.get(b("foo")) is None
Example #19
def test_hash_single():
    st = RamStorage()
    hw = HashWriter(st.create_file("test.hsh"))
    hw.add(b("alfa"), b("bravo"))
    hw.close()

    hr = HashReader.open(st, "test.hsh")
    assert hr.get(b("alfa")) == b("bravo")
    assert hr.get(b("foo")) is None
    def decode_positions(self, valuestring):
        if not valuestring.endswith(b(".")):
            valuestring += b(".")
        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:])
        position = 0
        posns = []
        for code in codes:
            position = code[0] + position
            posns.append(position)
        return posns
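
A minimal sketch of the delta decoding performed above: each stored code holds the gap from the previous position, so the absolute positions come back out of a running sum. The gap values here are hypothetical.

codes = [(1,), (2,), (4,)]   # hypothetical position gaps
position, posns = 0, []
for code in codes:
    position += code[0]      # add the gap to the previous position
    posns.append(position)
assert posns == [1, 3, 7]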
Example #21
def test_shared_suffix():
    st = gwrite(enlist("blowing blue glowing"))

    gr = greader(st)
    cur1 = fst.Cursor(gr)
    cur2 = fst.Cursor(gr)

    cur1.find_path(b("blo"))
    cur2.find_path(b("glo"))
    assert cur1.stack[-1].target == cur2.stack[-1].target
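Example #22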
    def _print_line(self, indent, command, **kwargs):
        self._dbfile.write(b("  ") * indent)
        self._dbfile.write(command.encode("latin1"))
        for k, v in iteritems(kwargs):
            if isinstance(v, memoryview):
                v = bytes(v)
            if v is not None and not isinstance(v, _reprable):
                raise TypeError(type(v))
            self._dbfile.write(("\t%s=%r" % (k, v)).encode("latin1"))
        self._dbfile.write(b("\n"))
Example #23
    def decode_positions(self, valuestring):
        if not valuestring.endswith(b(".")):
            valuestring += b(".")
        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:])
        position = 0
        posns = []
        for code in codes:
            position = code[0] + position
            posns.append(position)
        return posns
Example #24
def test_shared_suffix():
    st = gwrite(enlist("blowing blue glowing"))

    gr = greader(st)
    cur1 = fst.Cursor(gr)
    cur2 = fst.Cursor(gr)

    cur1.find_path(b("blo"))
    cur2.find_path(b("glo"))
    assert cur1.stack[-1].target == cur2.stack[-1].target
Example #25
def minimize_values(postingsize, values, compression=0):
    if postingsize < 0:
        string = dumps(values, -1)[2:]
    elif postingsize == 0:
        string = b('')
    else:
        string = b('').join(values)
    if string and compression:
        string = compress(string, compression)
    return string
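Example #26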
    def _print_line(self, indent, command, **kwargs):
        self._dbfile.write(b("  ") * indent)
        self._dbfile.write(command.encode("latin1"))
        for k, v in iteritems(kwargs):
            if isinstance(v, memoryview):
                v = bytes(v)
            if v is not None and not isinstance(v, _reprable):
                raise TypeError(type(v))
            self._dbfile.write(("\t%s=%r" % (k, v)).encode("latin1"))
        self._dbfile.write(b("\n"))
Example #27
def test_ordered_closest():
    keys = [
        'alfa', 'bravo', 'charlie', 'delta', 'echo', 'foxtrot', 'golf',
        'hotel', 'india', 'juliet', 'kilo', 'lima', 'mike', 'november'
    ]
    values = [''] * len(keys)

    with TempStorage("orderedclosest") as st:
        hwf = st.create_file("test.hsh")
        hw = OrderedHashWriter(hwf)
        hw.add_all(zip(keys, values))
        hw.close()

        hrf = st.open_file("test.hsh")
        hr = OrderedHashReader(hrf)
        ck = hr.closest_key
        assert_equal(ck(''), b('alfa'))
        assert_equal(ck(' '), b('alfa'))
        assert_equal(ck('alfa'), b('alfa'))
        assert_equal(ck('bravot'), b('charlie'))
        assert_equal(ck('charlie'), b('charlie'))
        assert_equal(ck('kiloton'), b('lima'))
        assert_equal(ck('oskar'), None)
        assert_equal(list(hr.keys()), [b(k) for k in keys])
        assert_equal(list(hr.values()), [b(v) for v in values])
        assert_equal(list(hr.keys_from('f')), [b(k) for k in keys[5:]])
        hr.close()
Example #28
def test_ordered_closest():
    keys = ['alfa', 'bravo', 'charlie', 'delta', 'echo', 'foxtrot', 'golf',
            'hotel', 'india', 'juliet', 'kilo', 'lima', 'mike', 'november']
    values = [''] * len(keys)

    with TempStorage("orderedclosest") as st:
        hwf = st.create_file("test.hsh")
        hw = OrderedHashWriter(hwf)
        hw.add_all(zip(keys, values))
        hw.close()

        hrf = st.open_file("test.hsh")
        hr = OrderedHashReader(hrf)
        ck = hr.closest_key
        assert_equal(ck(''), b('alfa'))
        assert_equal(ck(' '), b('alfa'))
        assert_equal(ck('alfa'), b('alfa'))
        assert_equal(ck('bravot'), b('charlie'))
        assert_equal(ck('charlie'), b('charlie'))
        assert_equal(ck('kiloton'), b('lima'))
        assert_equal(ck('oskar'), None)
        assert_equal(list(hr.keys()), [b(k) for k in keys])
        assert_equal(list(hr.values()), [b(v) for v in values])
        assert_equal(list(hr.keys_from('f')), [b(k) for k in keys[5:]])
        hr.close()
Example #29
def test_insert_bytes():
    # This test is only meaningful on Python 3
    domain = [b("alfa"), b("bravo"), b("charlie")]

    st = RamStorage()
    gw = fst.GraphWriter(st.create_file("test"))
    gw.start_field("test")
    for key in domain:
        gw.insert(key)
    gw.close()

    cur = fst.GraphReader(st.open_file("test")).cursor()
    assert list(cur.flatten()) == domain
Example #30
def test_random_access():
    times = 1000
    with TempStorage("orderedhash") as st:
        hw = HashWriter(st.create_file("test.hsh"))
        hw.add_all((b("%08x" % x), b(str(x))) for x in xrange(times))
        hw.close()

        keys = list(range(times))
        random.shuffle(keys)
        hr = HashReader.open(st, "test.hsh")
        for x in keys:
            assert hr[b("%08x" % x)] == b(str(x))
        hr.close()
Example #31
def test_insert_bytes():
    # This test is only meaningful on Python 3
    domain = [b("alfa"), b("bravo"), b("charlie")]

    st = RamStorage()
    gw = fst.GraphWriter(st.create_file("test"))
    gw.start_field("test")
    for key in domain:
        gw.insert(key)
    gw.close()

    cur = fst.GraphReader(st.open_file("test")).cursor()
    assert list(cur.flatten()) == domain
Example #32
    def decode_character_boosts(self, valuestring):
        if not valuestring.endswith(b(".")):
            valuestring += b(".")
        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:])
        position = 0
        endchar = 0
        posn_char_boosts = []
        for code in codes:
            position = position + code[0]
            startchar = endchar + code[1]
            endchar = startchar + code[2]
            posn_char_boosts.append((position, startchar, endchar, code[3]))
        return posn_char_boosts
Example #33
def test_random_access():
    times = 1000
    with TempStorage("orderedhash") as st:
        hw = HashWriter(st.create_file("test.hsh"))
        hw.add_all((b("%08x" % x), b(str(x))) for x in xrange(times))
        hw.close()

        keys = list(range(times))
        random.shuffle(keys)
        hr = HashReader.open(st, "test.hsh")
        for x in keys:
            assert hr[b("%08x" % x)] == b(str(x))
        hr.close()
Example #34
    def decode_characters(self, valuestring):
        if not valuestring.endswith(b(".")):
            valuestring += b(".")
        codes = loads(valuestring[_INT_SIZE:])
        position = 0
        endchar = 0
        posns_chars = []
        for code in codes:
            position = code[0] + position
            startchar = code[1] + endchar
            endchar = code[2] + startchar
            posns_chars.append((position, startchar, endchar))
        return posns_chars
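Example #35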
    def decode_character_boosts(self, valuestring):
        if not valuestring.endswith(b(".")):
            valuestring += b(".")
        codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:])
        position = 0
        endchar = 0
        posn_char_boosts = []
        for code in codes:
            position = position + code[0]
            startchar = endchar + code[1]
            endchar = startchar + code[2]
            posn_char_boosts.append((position, startchar, endchar, code[3]))
        return posn_char_boosts
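Example #36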
    def decode_characters(self, valuestring):
        if not valuestring.endswith(b(".")):
            valuestring += b(".")
        codes = loads(valuestring[_INT_SIZE:])
        position = 0
        endchar = 0
        posns_chars = []
        for code in codes:
            position = code[0] + position
            startchar = code[1] + endchar
            endchar = code[2] + startchar
            posns_chars.append((position, startchar, endchar))
        return posns_chars
Example #37
def parse_glob(pattern,
               _glob_multi=b("*"),
               _glob_single=b("?"),
               _glob_range1=b("["),
               _glob_range2=b("]"),
               _glob_range_not=b("!")):
    parsed = []
    pos = 0
    while pos < len(pattern):
        char = pattern[pos]
        pos += 1
        if char == _glob_multi:  # *
            # (Ignore more than one star in a row)
            if parsed:
                prev = parsed[-1][0]
                if prev == _STAR:
                    continue
            parsed.append((_STAR, ))
        elif char == _glob_single:  # ?
            # (Ignore ? after a star)
            if parsed:
                prev = parsed[-1][0]
                if prev == _STAR:
                    continue
            parsed.append((_QUEST, ))
        elif char == _glob_range1:  # [
            chars = set()
            firstchar = True
            negate = False
            # Take the char range specification until the ]
            while pos < len(pattern):
                char = pattern[pos]
                pos += 1
                if char == _glob_range2:
                    break
                # If first char inside the range is !, negate the list
                if firstchar and char == _glob_range_not:
                    negate = True
                else:
                    chars.add(char)
                firstchar = False
            if chars:
                parsed.append((_RANGE, chars, negate))
        else:
            if parsed and parsed[-1][0] == _LIT:
                parsed[-1][1] += char
            else:
                parsed.append([_LIT, char])
    parsed.append((_END, ))
    return parsed
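
A self-contained sketch of the token stream parse_glob() builds. The tag constants (_STAR, _QUEST, _RANGE, _LIT, _END) are module-level in the original; the placeholder values below are for illustration only.

_STAR, _QUEST, _RANGE, _LIT, _END = "star", "quest", "range", "lit", "end"

# For the pattern "a*[!xy]?" the parser produces, roughly:
expected = [
    [_LIT, "a"],                 # literal run (a list, so it can be extended)
    (_STAR,),                    # "*": match any run of characters
    (_RANGE, {"x", "y"}, True),  # "[!xy]": negated character class
    (_QUEST,),                   # "?": match exactly one character
    (_END,),
]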
Example #38
def test_ordered_hash():
    times = 10000
    with TempStorage("orderedhash") as st:
        hwf = st.create_file("test.hsh")
        hw = HashWriter(hwf)
        hw.add_all((b("%08x" % x), b(str(x))) for x in xrange(times))
        hw.close()

        keys = list(range(times))
        random.shuffle(keys)
        hrf = st.open_file("test.hsh")
        hr = HashReader(hrf)
        for x in keys:
            assert_equal(hr[b("%08x" % x)], b(str(x)))
        hr.close()
Example #39
def test_indentical_fields():
    schema = fields.Schema(id=fields.STORED,
                           f1=fields.TEXT, f2=fields.TEXT, f3=fields.TEXT)
    with TempIndex(schema, "identifields") as ix:
        w = ix.writer()
        w.add_document(id=1, f1=u("alfa"), f2=u("alfa"), f3=u("alfa"))
        w.commit()

        with ix.searcher() as s:
            assert list(s.lexicon("f1")) == [b("alfa")]
            assert list(s.lexicon("f2")) == [b("alfa")]
            assert list(s.lexicon("f3")) == [b("alfa")]
            assert list(s.documents(f1="alfa")) == [{"id": 1}]
            assert list(s.documents(f2="alfa")) == [{"id": 1}]
            assert list(s.documents(f3="alfa")) == [{"id": 1}]
Example #40
    def load(cls, dbfile, expand=True):
        dbfile.seek(0)
        magic = dbfile.read(4)
        if magic != b("GR01"):
            raise Exception("%r does not seem to be a graph file" % dbfile)
        _ = dbfile.read_int()  # File flags (currently unused)
        return DiskNode(dbfile, dbfile.read_uint(), expand=expand)
Example #41
    def __getitem__(self, num):
        if num > self.length - 1:
            raise IndexError("Tried to get document %s, file has %s"
                             % (num, self.length))

        dbfile = self.dbfile
        start = self.directory_offset + num * stored_pointer_size
        dbfile.seek(start)
        ptr = dbfile.read(stored_pointer_size)
        if len(ptr) != stored_pointer_size:
            raise Exception("Error reading %r @%s %s < %s"
                            % (dbfile, start, len(ptr), stored_pointer_size))
        position, length = unpack_stored_pointer(ptr)
        vlist = loads(dbfile.map[position:position + length] + b("."))

        names = self.names
        # Recreate a dictionary by putting the field names and values back
        # together by position. We can't just use dict(zip(...)) because we
        # want to filter out the None values.
        values = dict((names[i], vlist[i]) for i in xrange(len(names))
                      if vlist[i] is not None)

        # Pull any extra stored dynamic field values off the end of the list
        if len(vlist) > len(names):
            values.update(dict(vlist[len(names):]))

        return values
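
A self-contained sketch of the re-pairing step above: positional values are matched to schema field names, Nones are dropped, and any trailing (name, value) pairs are dynamic fields appended past the schema names. The names and values here are hypothetical.

names = ["id", "title"]
vlist = ["doc1", None, ("tag", "extra")]
values = dict((names[i], vlist[i]) for i in range(len(names))
              if vlist[i] is not None)
if len(vlist) > len(names):
    values.update(dict(vlist[len(names):]))
assert values == {"id": "doc1", "tag": "extra"}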
Example #42
    def load(cls, dbfile, expand=True):
        dbfile.seek(0)
        magic = dbfile.read(4)
        if magic != b("GR01"):
            raise Exception("%r does not seem to be a graph file" % dbfile)
        _ = dbfile.read_int()  # File flags (currently unused)
        return DiskNode(dbfile, dbfile.read_uint(), expand=expand)
Example #43
    def add_all(self, items):
        dbfile = self.dbfile
        hashes = self.hashes
        hash_func = self.hash_func
        pos = dbfile.tell()
        write = dbfile.write

        index = self.index
        lk = self.lastkey or b('')

        for key, value in items:
            if isinstance(key, text_type):
                key = key.encode('latin-1')
            if isinstance(value, text_type):
                value = value.encode('latin-1')
            if key <= lk:
                raise ValueError("Keys must increase: %r .. %r" % (lk, key))
            lk = key

            index.append(pos)
            write(pack_lengths(len(key), len(value)))
            write(key)
            write(value)

            h = hash_func(key)
            hashes[h & 255].append((h, pos))

            pos += lengths_size + len(key) + len(value)

        self.lastkey = lk
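
A minimal sketch of the bucketing step above: each key's hash is filed under one of 256 buckets chosen by its low byte, which lets the reader confine a later lookup to a single hash table. Here crc32 stands in for whatever hash_func is configured.

from zlib import crc32

buckets = [[] for _ in range(256)]
key, pos = b"alfa", 0
h = crc32(key) & 0xFFFFFFFF
buckets[h & 255].append((h, pos))   # low byte selects the bucket
assert buckets[h & 255] == [(h, pos)]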
Example #44
def test_boolean():
    schema = fields.Schema(id=fields.ID(stored=True),
                           done=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(id=u("a"), done=True)
    w.add_document(id=u("b"), done=False)
    w.add_document(id=u("c"), done=True)
    w.add_document(id=u("d"), done=False)
    w.add_document(id=u("e"), done=True)
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        r = s.search(qp.parse("done:true"))
        assert sorted([d["id"] for d in r]) == ["a", "c", "e"]
        assert all(d["done"] for d in r)

        r = s.search(qp.parse("done:yes"))
        assert sorted([d["id"] for d in r]) == ["a", "c", "e"]
        assert all(d["done"] for d in r)

        q = qp.parse("done:false")
        assert q.__class__ == query.Term
        assert q.text is False
        assert schema["done"].to_bytes(False) == b("f")
        r = s.search(q)
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)

        r = s.search(qp.parse("done:no"))
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)
Example #45
def test_boolean():
    schema = fields.Schema(id=fields.ID(stored=True),
                           done=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(id=u("a"), done=True)
    w.add_document(id=u("b"), done=False)
    w.add_document(id=u("c"), done=True)
    w.add_document(id=u("d"), done=False)
    w.add_document(id=u("e"), done=True)
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        r = s.search(qp.parse("done:true"))
        assert sorted([d["id"] for d in r]) == ["a", "c", "e"]
        assert all(d["done"] for d in r)

        r = s.search(qp.parse("done:yes"))
        assert sorted([d["id"] for d in r]) == ["a", "c", "e"]
        assert all(d["done"] for d in r)

        q = qp.parse("done:false")
        assert q.__class__ == query.Term
        assert q.text is False
        assert schema["done"].to_bytes(False) == b("f")
        r = s.search(q)
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)

        r = s.search(qp.parse("done:no"))
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)
Example #46
    def __init__(self, dbfile):
        self.dbfile = dbfile
        self.map = dbfile.map

        dbfile.seek(0)
        magic = dbfile.read(4)
        if magic == b("HASH"):
            self.format = 1
            self.header_size = 16 + 256 * header_entry_size
            _pointer_struct = Struct("!Iq")  # Hash value, position
            self.hashtype = dbfile.read_byte()
            dbfile.read(3)  # Unused
            self._end_of_hashes = dbfile.read_long()
            assert self._end_of_hashes >= self.header_size
        else:
            # Old format
            self.format = self.hashtype = 0
            self.header_size = 256 * header_entry_size
            _pointer_struct = Struct("!qq")  # Hash value, position

        self.hash_func = hash_functions[self.hashtype]
        self.buckets = []
        for _ in xrange(256):
            he = unpack_header_entry(dbfile.read(header_entry_size))
            self.buckets.append(he)
        self._start_of_hashes = self.buckets[0][0]

        self.pointer_size = _pointer_struct.size
        self.unpack_pointer = _pointer_struct.unpack

        self.is_closed = False
Example #47
    def add_all(self, items):
        dbfile = self.dbfile
        hashes = self.hashes
        hash_func = self.hash_func
        pos = dbfile.tell()
        write = dbfile.write

        index = self.index
        lk = self.lastkey or b('')

        for key, value in items:
            if not isinstance(key, bytes_type):
                raise TypeError("Key %r should be bytes" % key)
            if not isinstance(value, bytes_type):
                raise TypeError("Value %r should be bytes" % value)
            if key <= lk:
                raise ValueError("Keys must increase: %r .. %r" % (lk, key))
            lk = key

            index.append(pos)
            write(pack_lengths(len(key), len(value)))
            write(key)
            write(value)

            h = hash_func(key)
            hashes[h & 255].append((h, pos))

            pos += lengths_size + len(key) + len(value)

        self.lastkey = lk
Example #48
    def digest(self):
        if self._digest is None:
            d = sha1()
            vtype = self.owner.vtype
            for arc in self.arcs:
                d.update(arc.label)
                if arc.target:
                    d.update(pack_long(arc.target))
                else:
                    d.update(b("z"))
                if arc.value:
                    d.update(vtype.to_bytes(arc.value))
                if arc.accept:
                    d.update(b("T"))
            self._digest = d.digest()
        return self._digest
Example #49
    def add_all(self, items):
        dbfile = self.dbfile
        hashes = self.hashes
        hash_func = self.hash_func
        pos = dbfile.tell()
        write = dbfile.write

        index = self.index
        lk = self.lastkey or b('')

        for key, value in items:
            if isinstance(key, text_type):
                key = key.encode('latin-1')
            if isinstance(value, text_type):
                value = value.encode('latin-1')
            if key <= lk:
                raise ValueError("Keys must increase: %r .. %r" % (lk, key))
            lk = key

            index.append(pos)
            write(pack_lengths(len(key), len(value)))
            write(key)
            write(value)

            h = hash_func(key)
            hashes[h & 255].append((h, pos))

            pos += lengths_size + len(key) + len(value)

        self.lastkey = lk
Example #50
    def digest(self):
        if self._digest is None:
            d = sha1()
            vtype = self.owner.vtype
            for arc in self.arcs:
                d.update(arc.label)
                if arc.target:
                    d.update(pack_long(arc.target))
                else:
                    d.update(b("z"))
                if arc.value:
                    d.update(vtype.to_bytes(arc.value))
                if arc.accept:
                    d.update(b("T"))
            self._digest = d.digest()
        return self._digest
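Example #51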
def test_removefield():
    schema = fields.Schema(id=fields.ID(stored=True),
                           content=fields.TEXT,
                           city=fields.KEYWORD(stored=True))
    with TempIndex(schema, "removefield") as ix:
        w = ix.writer()
        w.add_document(id=u("b"), content=u("bravo"), city=u("baghdad"))
        w.add_document(id=u("c"), content=u("charlie"), city=u("cairo"))
        w.add_document(id=u("d"), content=u("delta"), city=u("dakar"))
        w.commit()

        with ix.searcher() as s:
            assert s.document(id=u("c")) == {"id": "c", "city": "cairo"}

        w = ix.writer()
        w.remove_field("content")
        w.remove_field("city")
        w.commit()

        ixschema = ix._current_schema()
        assert ixschema.names() == ["id"]
        assert ixschema.stored_names() == ["id"]

        with ix.searcher() as s:
            assert ("content", b("charlie")) not in s.reader()
            assert s.document(id=u("c")) == {"id": u("c")}
Example #52
    def sortable_terms(self, ixreader, fieldname):
        zero = b("\x00")
        for token in ixreader.lexicon(fieldname):
            if token[0:1] != zero:
                # Only yield the full-precision values
                break
            yield token
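
A hedged sketch of the convention sortable_terms() relies on: numeric fields index each value at several precisions, each term prefixed by a shift byte, and shift 0 (b"\x00") marks the full-precision terms, which sort first in the lexicon. The lexicon below is made up for illustration.

lexicon = [b"\x00\x00\x01", b"\x00\x00\x02", b"\x04\x00", b"\x08"]
full_precision = []
for token in lexicon:
    if token[0:1] != b"\x00":
        break                    # coarser precisions follow; stop here
    full_precision.append(token)
assert full_precision == [b"\x00\x00\x01", b"\x00\x00\x02"]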
Example #53
    def __init__(self, dbfile, magic=b("HSH3"), hashtype=0):
        """
        :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
            to write to.
        :param magic: the format tag bytes to write at the start of the file.
        :param hashtype: an integer indicating which hashing algorithm to use.
            Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash).
        """

        self.dbfile = dbfile
        self.hashtype = hashtype
        self.hashfn = _hash_functions[self.hashtype]
        # A place for subclasses to put extra metadata
        self.extras = {}

        self.startoffset = dbfile.tell()
        # Write format tag
        dbfile.write(magic)
        # Write hash type
        dbfile.write_byte(self.hashtype)
        # Unused future expansion bits
        dbfile.write_int(0)
        dbfile.write_int(0)

        # 256 lists of hashed keys and positions
        self.buckets = [[] for _ in xrange(256)]
        # List to remember the positions of the hash tables
        self.directory = []
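
A sketch of the 13-byte header the constructor above writes, assuming write_byte emits a single byte and write_int a big-endian 4-byte signed int (struct "!i"); treat the exact widths as an assumption.

import struct

magic, hashtype = b"HSH3", 0
header = magic + struct.pack("!B", hashtype) + struct.pack("!ii", 0, 0)
assert len(header) == 13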
Example #54
def test_removefield():
    schema = fields.Schema(id=fields.ID(stored=True),
                           content=fields.TEXT,
                           city=fields.KEYWORD(stored=True))
    with TempIndex(schema, "removefield") as ix:
        w = ix.writer()
        w.add_document(id=u("b"), content=u("bravo"), city=u("baghdad"))
        w.add_document(id=u("c"), content=u("charlie"), city=u("cairo"))
        w.add_document(id=u("d"), content=u("delta"), city=u("dakar"))
        w.commit()

        with ix.searcher() as s:
            assert s.document(id=u("c")) == {"id": "c", "city": "cairo"}

        w = ix.writer()
        w.remove_field("content")
        w.remove_field("city")
        w.commit()

        ixschema = ix._current_schema()
        assert ixschema.names() == ["id"]
        assert ixschema.stored_names() == ["id"]

        with ix.searcher() as s:
            assert ("content", b("charlie")) not in s.reader()
            assert s.document(id=u("c")) == {"id": u("c")}
Example #55
    def __init__(self, dbfile, magic=b("HSH3"), hashtype=0):
        """
        :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
            to write to.
        :param magic: the format tag bytes to write at the start of the file.
        :param hashtype: an integer indicating which hashing algorithm to use.
            Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash).
        """

        self.dbfile = dbfile
        self.hashtype = hashtype
        self.hashfn = _hash_functions[self.hashtype]
        # A place for subclasses to put extra metadata
        self.extras = {}

        self.startoffset = dbfile.tell()
        # Write format tag
        dbfile.write(magic)
        # Write hash type
        dbfile.write_byte(self.hashtype)
        # Unused future expansion bits
        dbfile.write_int(0)
        dbfile.write_int(0)

        # 256 lists of hashed keys and positions
        self.buckets = [[] for _ in xrange(256)]
        # List to remember the positions of the hash tables
        self.directory = []
Example #56
    def __init__(self, dbfile):
        self.dbfile = dbfile

        dbfile.seek(0)
        magic = dbfile.read(4)
        if magic == b("HASH"):
            self.format = 1
            self.header_size = 16 + 256 * header_entry_size
            _pointer_struct = Struct("!Iq")  # Hash value, position
            self.hashtype = dbfile.read_byte()
            dbfile.read(3)  # Unused
            self._end_of_hashes = dbfile.read_long()
            assert self._end_of_hashes >= self.header_size
        else:
            # Old format
            self.format = self.hashtype = 0
            self.header_size = 256 * header_entry_size
            _pointer_struct = Struct("!qq")  # Hash value, position

        self.hash_func = hash_functions[self.hashtype]
        self.buckets = []
        for _ in xrange(256):
            he = unpack_header_entry(dbfile.read(header_entry_size))
            self.buckets.append(he)
        self._start_of_hashes = self.buckets[0][0]

        self.pointer_size = _pointer_struct.size
        self.unpack_pointer = _pointer_struct.unpack

        self.is_closed = False
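Example #57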
def test_random_termkeys():
    def random_fieldname():
        return "".join(chr(random.randint(65, 90)) for _ in xrange(1, 20))

    def random_btext():
        a = array("H", (random.randint(0, 0xd7ff) for _ in xrange(1, 20)))
        return array_tobytes(a).decode("utf-16")

    domain = sorted(set([(random_fieldname(), random_btext().encode("utf-8"))
                         for _ in xrange(1000)]))

    st, codec, seg = _make_codec()
    fieldobj = fields.TEXT()
    tw = codec.field_writer(st, seg)
    # Stupid ultra-low-level hand-adding of postings just to check handling of
    # random fieldnames and term texts
    lastfield = None
    for fieldname, text in domain:
        if lastfield and fieldname != lastfield:
            tw.finish_field()
            lastfield = None
        if lastfield is None:
            tw.start_field(fieldname, fieldobj)
            lastfield = fieldname
        tw.start_term(text)
        tw.add(0, 1.0, b(""), 1)
        tw.finish_term()
    if lastfield:
        tw.finish_field()
    tw.close()

    tr = codec.terms_reader(st, seg)
    for term in domain:
        assert term in tr