Example #1
    def __getitem__(self, key):
        previous_stop = self.stop
        result = super(ListPrefetch, self).__getitem__(key)
        if self.stop != previous_stop:
            # Cache ahead: also fetch the next prefetch_size items past the key
            try:
                if isinstance(key, six.integer_types):
                    tail = super(ListPrefetch, self).__getitem__(
                        slice(key + 1, key + self.prefetch_size + 1))
                elif isinstance(key, slice):
                    if key.stop:
                        tail = super(ListPrefetch, self).__getitem__(
                            slice(key.stop + 1,
                                  key.stop + self.prefetch_size + 1))
                    else:
                        tail = []
            except StopIteration:
                # If we've finished right at this element, that's not an error
                tail = []

            # Bulk-load the requested objects together with the prefetched tail
            if isinstance(result, list):
                prefetch(result + tail)
            elif isinstance(result, Persistent):
                prefetch([result] + tail)
        return result
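
In this example, __getitem__ wraps a persistent, list-like collection: when an access moves self.stop past its previous value, it also slices out the next prefetch_size items and hands everything to prefetch() so the objects are loaded in one batch rather than one at a time. A minimal, self-contained sketch of that access pattern is below; LazyList and load_batch are hypothetical stand-ins for the persistent list and for prefetch(), not part of the original code.

class LazyList(list):
    """Toy list that batch-"loads" a window of items past every integer access."""

    prefetch_size = 4

    def __init__(self, *args, **kwargs):
        super(LazyList, self).__init__(*args, **kwargs)
        self.loaded = []  # record of what a real prefetch() would bulk-load

    def load_batch(self, items):
        # Stand-in for prefetch(): a real implementation would issue one
        # bulk read to the storage for all of these objects.
        self.loaded.append(list(items))

    def __getitem__(self, key):
        result = super(LazyList, self).__getitem__(key)
        if isinstance(key, int):
            tail = super(LazyList, self).__getitem__(
                slice(key + 1, key + self.prefetch_size + 1))
            self.load_batch([result] + tail)
        return result

items = LazyList(range(10))
items[2]
print(items.loaded)  # [[2, 3, 4, 5, 6]]: the requested item plus the cache-ahead tail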
Example #2
    def index_doc(self, docid, text):
        if docid in self._docwords:
            return self._reindex_doc(docid, text)

        wids = self._lexicon.sourceToWordIds(text)

        # XXX Counter is slow. If it becomes an issue, a C module may be needed:
        # http://stackoverflow.com/questions/2522152/python-is-a-dictionary-slow-to-find-frequency-of-each-character
        widcnt = Counter(wids)
        widset = widcnt.keys()
        widcode = PersistentWid.encode_wid(wids if self.keep_phrases else widset)
        self._docwords[docid] = widcode

        if widset:
            weights, lengths = self._get_doctrees(widset)
            docscores = self._get_widscores(widcnt, docid)
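            # Bulk-load the per-word weight trees and score entries, plus the
            # length counters and the document counter, before updating them
            # (presumably to batch the storage reads).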
            parallel_traversal(*zip(*[(weights[w], docscores[w]) for w in widset]))
            prefetch(list(lengths.values()) + [self.documentCount])

            for w in widset:
                weights[w].add(docscores[w])
                lengths[w].change(1)

        self.documentCount.change(1)

        return len(wids)
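
index_doc maps the document text to word ids through the lexicon, counts term frequencies with Counter, and stores either the full wid sequence (when keep_phrases is set) or just the set of distinct wids before updating the per-word trees. A rough sketch of that counting step, with a hypothetical toy lexicon standing in for self._lexicon.sourceToWordIds(), could look like this:

from collections import Counter

def source_to_word_ids(text, lexicon):
    # Hypothetical stand-in for self._lexicon.sourceToWordIds()
    return [lexicon.setdefault(word, len(lexicon) + 1) for word in text.lower().split()]

lexicon = {}
wids = source_to_word_ids("the quick fox jumps over the lazy fox", lexicon)
widcnt = Counter(wids)   # {wid: term frequency within this document}
widset = set(widcnt)     # distinct word ids, enough when phrases are not kept

print(wids)                    # full sequence, what keep_phrases would store
print(widcnt.most_common(2))   # e.g. [(1, 2), (3, 2)] for "the" and "fox"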
Example #3
 def applyInRange(self, start, end, excludemin=False, excludemax=False):
     # Bulk-load the docid sets in the range before taking their union
     values = self._fwd_index.values(start,
                                     end,
                                     excludemin=excludemin,
                                     excludemax=excludemax)
     prefetch(values)
     return self.family.IF.multiunion(values)
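
applyInRange collects the docid set for every indexed value between start and end from the forward index and unions them; prefetch(values) loads those sets in bulk first. The same range-plus-multiunion pattern on plain BTrees structures (a toy forward-index layout is assumed here, not the original catalog code) might look like:

from BTrees.OOBTree import OOBTree
from BTrees.IFBTree import IFTreeSet, multiunion

# Toy forward index: indexed value -> set of docids holding that value
fwd_index = OOBTree()
fwd_index["apple"] = IFTreeSet([1, 4])
fwd_index["banana"] = IFTreeSet([2])
fwd_index["cherry"] = IFTreeSet([3, 4])

# Docid sets for all values in the closed range ["apple", "banana"]
values = fwd_index.values("apple", "banana", excludemin=False, excludemax=False)
print(list(multiunion(list(values))))   # [1, 2, 4]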
Example #4
def mass_weightedUnion(L):
    """
    Incremental version of mass_weightedUnion
    :param list L: (TreeSet((-score, docid)), weight) elements
    :returns: iterable ordered from large to small sum(score*weight)
    """
    cache_size = 40
    order_size = 15
    order_violation = 3
    cache_updated = None

    if len(L) == 0:
        return

    elif len(L) == 1:
        # Trivial
        tree, weight = L[0]
        # XXX need to make it possible to advance the tree N elements ahead!
        for (score, docid) in tree:
            yield (docid, -score * weight)

    else:
        # XXX make into an iterator class

        trees, weights = zip(*L)
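        # Pre-load the score trees and their first buckets up front
        # (presumably so the initial reads below are batched by the storage).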
        prefetch(trees)
        prefetch([x._firstbucket for x in trees if x._firstbucket is not None])

        unread_max = [-t.minKey()[0] * w for t, w in L]
        lengths = map(len, trees)
        iters = dict(enumerate(map(iter, trees)))
        caches = [{} for i in range(len(L))]
        docid2cacheid = defaultdict(list)
        cache_len = None
        maxscores = [-1] * len(L)
        used = set()
        sorted_mins = SortedSet()  # Contains tuples (-min_weight, docid)
        mins_dict = {}  # {docid -> min_weight}
        docids = []

        def precache(i, size):
            try:
                for j in xrange(size):
                    score, docid = next(iters[i])
                    score = -score * weights[i]
                    if unread_max[i] > score:
                        unread_max[i] = score
                    if docid not in used:
                        caches[i][docid] = score
                        docid2cacheid[docid].append(i)
                        if maxscores[i] < 0:
                            maxscores[i] = score

                        # Documents are not repeated within one iterator, so
                        # for iterator i this is the first time we see docid;
                        # we can simply add its score to the current minscore.
                        total_score = mins_dict.get(docid, 0.0) + score
                        if docid in mins_dict:
                            sorted_mins.remove((-mins_dict[docid], docid))
                        mins_dict[docid] = total_score
                        sorted_mins.add((-total_score, docid))
            except StopIteration:
                del iters[i]
                unread_max[i] = 0

        while True:
            # Main cycle in which we yield values to make an iterator

            # Advance the iterators as needed, filling the caches to keep them
            # long enough. A smarter pre-read policy may be possible; this is
            # the simplest one.

            cache_updated = False
            for i in list(iters.keys()):
                cache = caches[i]
                if len(cache) < cache_size / 2:
                    precache(i, cache_size - len(cache))
                    cache_len = sum(map(len, caches))
                    cache_updated = True

            if cache_updated or (cache_len is not None
                                 and cache_len > order_violation
                                 and len(docids) < order_violation):
                while True:
                    max_sum = sum(unread_max)
                    mins = []
                    docids = []
                    maxs = []
                    for w, docid in islice(iter(sorted_mins), order_size):
                        mins.append(-w)
                        docids.append(docid)
                        cacheids = docid2cacheid[docid]
                        # Slower equivalent:
                        # maxs = sum(c.get(docid, m) for c, m in izip(caches, unread_max))
                        # because
                        # -w   = sum(c.get(docid, 0) for c, m in izip(caches, unread_max))
                        maxs.append(max_sum - sum(unread_max[i] for i in cacheids) - w)
                    violated = False
                    for i in xrange(len(mins) - order_violation - 1):
                        # Naive implementation. Scanning from the tail in
                        # batches of order_violation elements would be faster.
                        for m in maxs[i + order_violation:]:
                            if m > mins[i]:
                                violated = True
                                break
                        if violated:
                            break
                    if len(mins) == order_size:
                        # Last check: can the order be violated by out-of-cache elements?
                        if mins[len(mins) - order_violation] < max_sum:
                            violated = True
                    if not violated:
                        break
                    else:
                        # XXX Instead we could read just enough elements to
                        # change sum(unread_max) sufficiently; something like
                        # an "advance_to" method is needed for that.
                        per_element_max = (
                            m / max(l - len(c), 1)
                            for m, l, c in izip(unread_max, lengths, caches))
                        precache(
                                max(enumerate(per_element_max), key=lambda x: x[1])[0],
                                cache_size / 2)
                        cache_len = sum(map(len, caches))

            if not docids:
                break
            minw = mins.pop(0)
            maxw = maxs.pop(0)
            docid = docids.pop(0)
            for i, c in enumerate(caches):
                if docid in c:
                    if c[docid] == unread_max[i]:
                        del c[docid]
                        unread_max[i] = max(six.itervalues(c))
                    else:
                        del c[docid]
                    cache_len -= 1
            used.add(docid)
            sorted_mins.remove((-minw, docid))
            del mins_dict[docid]
            yield docid, (minw + maxw) / 2.0
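
The generator above yields the same ranking as a plain weighted union, but incrementally: it only reads enough of each score tree to be sure of the next few results, prefetching buckets in bulk along the way. For reference, a naive non-incremental version of the computation (plain Python only; mass_weighted_union_naive is a made-up name, not part of the library) is:

from collections import defaultdict

def mass_weighted_union_naive(L):
    # Read everything, sum score * weight per docid, then sort descending.
    totals = defaultdict(float)
    for tree, weight in L:
        for neg_score, docid in tree:
            totals[docid] += -neg_score * weight
    return sorted(totals.items(), key=lambda item: -item[1])

L = [
    ([(-3.0, 10), (-1.0, 11)], 2.0),   # scores 3.0 and 1.0, weight 2.0
    ([(-2.0, 11), (-0.5, 12)], 1.0),   # scores 2.0 and 0.5, weight 1.0
]
print(mass_weighted_union_naive(L))    # [(10, 6.0), (11, 4.0), (12, 0.5)]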