Example #1
0
    def tokenize(self, value):
        """
        Break ``value`` into tokens and weight each distinct token.

        The value is decoded, lower-cased and split with
        :py:meth:`split_phrase`. Stop-words and tokens shorter than the
        configured minimum length are dropped (when those options are
        set), after which the optional stemmer and metaphone passes are
        applied.

        :returns: A ``dict`` mapping token to score. Every occurrence of
            a token adds ``1 / (n + 1)`` to its score, where ``n`` is the
            number of tokens left after filtering.
        """
        tokens = self.split_phrase(decode(value).lower())

        stopwords = self._stopwords
        if stopwords:
            tokens = [tok for tok in tokens if tok not in stopwords]

        min_length = self._min_word_length
        if min_length:
            tokens = [tok for tok in tokens if len(tok) >= min_length]

        # The weight is fixed before stemming/metaphone run; the "+ 1"
        # keeps the denominator non-zero when every token was filtered out.
        weight = 1. / (len(tokens) + 1)

        if self._use_stemmer:
            tokens = self.stem(tokens)
        if self._use_metaphone:
            tokens = self.metaphone(tokens)

        scores = {}
        for tok in tokens:
            scores[tok] = scores.get(tok, 0) + weight
        return scores
Example #2
0
 def tokenize_title(self, phrase, stopwords=True):
     """
     Normalize a title and split it into a list of tokens.

     The phrase is lower-cased and every character other than ``a-z``,
     ``0-9``, underscore, hyphen and whitespace is removed before the
     result is split on whitespace.

     :param phrase: Title text; ``bytes`` input is decoded first.
     :param bool stopwords: When true, tokens found in
         ``self._stopwords`` are left out of the result.
     :returns: A ``list`` of tokens in their original order.
     """
     if isinstance(phrase, bytes):
         phrase = decode(phrase)
     # Raw string: "\-" and "\s" are regex escapes, not string escapes, and
     # a non-raw literal triggers an invalid-escape warning on modern Python.
     phrase = re.sub(r'[^a-z0-9_\-\s]', '', phrase.lower())
     tokens = phrase.split()
     if stopwords:
         return [w for w in tokens if w not in self._stopwords]
     return tokens
Example #3
0
 def pending(self, start='-', stop='+', count=1000, consumer=None):
     """
     Query ``xpending_range`` for this key and group and summarize the
     entries it reports.

     :param start: Lower bound, passed through ``normalize_id``.
     :param stop: Upper bound, passed through ``normalize_id``.
     :param count: Forwarded unchanged to ``xpending_range``.
     :param consumer: Forwarded unchanged to ``xpending_range``.
     :returns: A ``list`` of 4-tuples: the message id converted with
         ``id_to_datetime``, the decoded consumer, the entry's
         ``time_since_delivered`` and its ``times_delivered``.
     """
     lower = normalize_id(start)
     upper = normalize_id(stop)
     entries = self.database.xpending_range(
         self.key, self.group, lower, upper, count, consumer)

     summary = []
     for entry in entries:
         summary.append((
             id_to_datetime(entry['message_id']),
             decode(entry['consumer']),
             entry['time_since_delivered'],
             entry['times_delivered'],
         ))
     return summary
Example #4
0
    def list_data(self):
        """
        Return all the data stored in the autocomplete index. If the data was
        stored as serialized JSON, then it will be de-serialized before being
        returned; otherwise each value is simply decoded.

        A real ``list`` is built (rather than returning ``map(...)``) so the
        documented return type also holds on Python 3, where ``map`` yields
        a lazy, single-use iterator.

        :rtype: list
        """
        values = self._data.values()
        if self._use_json:
            return [json.loads(decode(v)) for v in values]
        return [decode(v) for v in values]
Example #5
0
 def python_value(self, value):
     """
     Convert a stored value back into a Python object.

     The first matching rule wins: unpickle when ``_pickled`` is set;
     parse JSON when ``_as_json`` is set (decoding the value first on
     Python 3); apply the ``_coerce`` callable when one is configured;
     otherwise hand the value back untouched.

     NOTE: ``pickle.loads`` should only ever be fed trusted data.
     """
     if self._pickled:
         return pickle.loads(value)
     if self._as_json:
         return json.loads(decode(value) if PY3 else value)
     if self._coerce:
         return self._coerce(value)
     return value
Example #6
0
 def db_value(self, value):
     """
     Convert a Python object into the form that gets stored.

     The first matching rule wins: pickle when ``_pickled`` is set;
     dump to JSON when ``_as_json`` is set (passing the value through
     ``decode`` first on Python 3); apply the ``_coerce`` callable when
     one is configured; otherwise return the value untouched.
     """
     if self._pickled:
         return pickle.dumps(value)

     if PY3 and self._as_json:
         payload = decode(value)
         return json.dumps(payload)
     if self._as_json:
         return json.dumps(value)

     if self._coerce:
         return self._coerce(value)
     return value
Example #7
0
    def test_slicing(self):
        # Reading a slice returns the stored items (as bytes) in that range.
        self.lst.extend(['i1', 'i2', 'i3', 'i4'])
        expectations = (
            (slice(None, 1), [b'i1']),
            (slice(None, 2), [b'i1', b'i2']),
            (slice(None, -1), [b'i1', b'i2', b'i3']),
            (slice(1, 2), [b'i2']),
            (slice(1, None), [b'i2', b'i3', b'i4']),
        )
        for window, expected in expectations:
            self.assertEqual(self.lst[window], expected)

        numbers = db.List('l1')
        numbers.extend(range(10))

        def as_ints(items):
            return [int(decode(item)) for item in items]

        # LTRIM semantics: the slice names the range that is preserved, so
        # keeping the 1st-to-last items removes only the 0th element.
        del numbers[1:-1]
        self.assertEqual(as_ints(numbers), [1, 2, 3, 4, 5, 6, 7, 8, 9])

        # Trimming again leaves only the values within the given range.
        del numbers[:3]
        self.assertEqual(as_ints(numbers), [1, 2, 3])
Example #8
0
    def _load_objects(self, obj_id_zset, limit, chunk_size=1000):
        ct = i = 0
        while True:
            id_chunk = obj_id_zset[i:i + chunk_size]
            if not id_chunk:
                return

            i += chunk_size
            for raw_data in self._data[id_chunk]:
                if not raw_data:
                    continue
                if self._use_json:
                    yield json.loads(decode(raw_data))
                else:
                    yield raw_data
                ct += 1
                if limit and ct == limit:
                    return
Example #9
0
    def query(self, s=None, p=None, o=None):
        """
        Yield all triples that satisfy the given expression. Any of the
        fields (s, p, and o) may be left unspecified. For instance, if I
        wanted to query for all the people who live in Kansas, I might write:

        .. code-block:: python

            for triple in graph.query(p='lives', o='Kansas'):
                print(triple['s'], 'lives in Kansas!')

        :returns: A generator of ``dict`` objects. When ``keys_for_query``
            reports no end key, at most one dict with the keys ``'s'``,
            ``'p'`` and ``'o'`` is produced; otherwise one dict is built
            per key found in the lexicographic range.
        """
        start, end = self.keys_for_query(s, p, o)
        if end is None:
            # No range to scan: the only possible match is the start key.
            if start in self._z:
                yield {'s': s, 'p': p, 'o': o}
            # Finish with a plain return. Raising StopIteration inside a
            # generator is turned into RuntimeError by PEP 479.
            return

        for key in self._z.range_by_lex('[' + start, '[' + end):
            # Each key is "<names>::<v1>::<v2>::<v3>"; the characters of
            # the first part label the three values that follow.
            keys, p1, p2, p3 = decode(key).split('::')
            yield dict(zip(keys, (p1, p2, p3)))
Example #10
0
 def split_phrase(self, phrase):
     """
     Break a document or search query into a list of tokens.

     Every match of ``self._symbols_re`` in the decoded phrase is replaced
     by a space, and the result is split on whitespace.
     """
     substitute = self._symbols_re.sub
     spaced = substitute(' ', decode(phrase))
     return spaced.split()
Example #11
0
 def python_value(self, value):
     """Decode the stored value and build a ``uuid.UUID`` from it."""
     text = decode(value)
     return uuid.UUID(text)
Example #12
0
    def test_read_api(self):
        # Exercises Stream.read() on individual streams and db.xread() across
        # several streams. Twenty entries are spread round-robin over the
        # streams "a", "b" and "c": entry i goes to streams[i % 3] with
        # id i + 1, and the value returned by add() is kept in docids[i].
        sa = db.Stream('a')
        sb = db.Stream('b')
        sc = db.Stream('c')
        streams = [sa, sb, sc]
        docids = []
        for i in range(20):
            stream = streams[i % 3]
            docids.append(stream.add({'k': 'v%s' % i}, id=i + 1))

        # Helper: build the expected result for the entries at the given
        # indexes and compare it with ``ret``. For multi-stream results the
        # expectation is a dict keyed by encoded stream name; otherwise it is
        # a flat list of (id, data) pairs.
        def assertData(ret, idxs, is_multi=False):
            if is_multi:
                ret = dict(ret)
                accum = {}
                for idx in idxs:
                    sname = encode('abc'[idx % 3])
                    accum.setdefault(sname, [])
                    accum[sname].append((
                        docids[idx], {b'k': encode('v%s' % idx)}))
            else:
                accum = []
                for idx in idxs:
                    accum.append((docids[idx], {b'k': encode('v%s' % idx)}))
            self.assertEqual(ret, accum)

        # With no arguments, read() returns every entry in the stream.
        assertData(sa.read(), [0, 3, 6, 9, 12, 15, 18])
        assertData(sc.read(), [2, 5, 8, 11, 14, 17])

        # We can specify a maximum number of records via "count".
        assertData(sa.read(3), [0, 3, 6])
        assertData(sb.read(2), [1, 4])
        assertData(sc.read(4), [2, 5, 8, 11])

        # We get the same values we read earlier.
        assertData(sa.read(2), [0, 3])

        # We can pass a minimum ID and will get newer data -- even if the ID
        # does not exist in the stream. We can also pass an exact ID and unlike
        # the range function, it is not inclusive.
        assertData(sa.read(2, last_id=docids[3]), [6, 9])
        assertData(sa.read(2, last_id=docids[4]), [6, 9])

        # If the last ID is that of the newest entry in the stream (so there
        # is no newer data), an empty list is returned. This is the same
        # whether or not "count" is specified.
        self.assertEqual(sa.read(last_id=docids[18]), [])
        self.assertEqual(sa.read(2, last_id=docids[18]), [])

        # The count is a maximum, so up to 2 items are returned -- but since
        # only one item in the stream exceeds the given ID, we only get one
        # result.
        assertData(sa.read(2, last_id=docids[17]), [18])

        # If a timeout is set and any stream can return a value, then that
        # value is returned immediately.
        assertData(sa.read(2, block=1, last_id=docids[17]), [18])
        assertData(sb.read(2, block=1, last_id=docids[18]), [19])

        # If no items are available and we timed out, an empty list is
        # returned.
        self.assertEqual(sc.read(block=1, last_id=docids[19]), [])
        self.assertEqual(sc.read(2, block=1, last_id=docids[19]), [])

        # When multiple keys are given, up-to "count" items per stream
        # are returned.
        normalized = _normalize_stream_keys(['a', 'b', 'c'])
        res = db.xread(normalized, count=2)
        assertData(res, [0, 1, 2, 3, 4, 5], True)

        # Specify max-ids for each stream. The last entry added to "c" is
        # docids[17], so nothing will be returned for "c".
        uids = [decode(docid) for docid in docids]
        res = db.xread({'a': uids[15], 'b': uids[16], 'c': uids[17]},
                       count=3)
        assertData(res, [18, 19], True)

        # Now we limit ourselves to being able to pull only a single item from
        # stream "c".
        res = db.xread({'a': uids[18], 'b': uids[19], 'c': uids[16]})
        assertData(res, [17], True)

        # An empty list is returned when no results are present and timeout
        # is None or if we reach the timeout.
        res = db.xread({'a': uids[18], 'b': uids[19], 'c': uids[17]})
        self.assertEqual(res, [])

        res = db.xread({'a': uids[18], 'b': uids[19], 'c': uids[17]},
                       count=1, block=1)
        self.assertEqual(res, [])
Example #13
0
 def __init__(self, stream, message_id, data):
     """
     Store the decoded stream name, message id and data mapping.

     ``timestamp`` and ``sequence`` are unpacked from the result of
     ``id_to_datetime``, which is given the message id as received (not
     the decoded copy).
     """
     self.stream, self.message_id = decode(stream), decode(message_id)
     self.data = decode_dict(data)
     parsed_id = id_to_datetime(message_id)
     self.timestamp, self.sequence = parsed_id
Example #14
0
 def python_value(self, value):
     """Decode the stored value and parse it as JSON."""
     document = decode(value)
     return json.loads(document)
Example #15
0
 def python_value(self, value):
     """Return ``True`` when the decoded value equals the string ``'1'``."""
     text = decode(value)
     return text == '1'