Exemple #1
 def index(self, lib):
     """Run the indexing benchmark loop and print timing statistics.

     ``chunk``, ``skip`` and ``upto`` are read from ``self.options`` and
     converted with ``int``.  Every ``skip``-th document produced by
     ``self.spec.documents()`` is counted, a progress line is printed each
     time the count reaches a multiple of ``chunk`` (when ``chunk`` is
     non-zero), and the loop stops once the count exceeds ``upto``.

     :param lib: object named in the banner line; no other use of it is
         visible in this block.
     """
     # NOTE(review): nothing in this block hands a document to ``lib`` or
     # commits anything between the timestamps taken below, so some calls
     # appear to be missing from this copy -- confirm against upstream.
     print("Indexing with %s..." % lib)
     options = self.options
     chunk = int(options.chunk)
     skip = int(options.skip)
     upto = int(options.upto)
     count = 0
     skipc = skip
     starttime = chunkstarttime = now()
     for d in self.spec.documents():
         skipc -= 1
         if not skipc:
             count += 1
             skipc = skip
             if chunk and not count % chunk:
                 t = now()
                 sofar = t - starttime
                 print("Done %d docs, %0.3f secs for %d, %0.3f total, %0.3f docs/s" % (count, t - chunkstarttime, chunk, sofar, count/sofar))
                 chunkstarttime = t
             if count > upto:
                 # This ``if`` had no body (a syntax error); leaving the
                 # loop is the only action consistent with an upper bound.
                 break
     spooltime = now()
     # Single-argument print calls produce the same text under Python 2's
     # print statement and Python 3's print function.
     print("Spool time: %s" % (spooltime - starttime))
     committime = now()
     print("Commit time: %s" % (committime - spooltime))
     print("Total time to index %s documents: %s" % (count, committime - starttime))
Exemple #3
def make_index(basedir,
    # NOTE(review): the parameter list is cut off after ``basedir`` in this
    # copy.  The body below reads ixdir, procs, limitmb, multisegment, glob
    # and subjectfields, so those must come from the missing part of the
    # signature or from module scope.
    if not os.path.exists(ixdir):
    # NOTE(review): the body of the ``if`` above is missing from this copy.

    # Multi-lingual stop words
    stoplist = (analysis.STOP_WORDS
                | set("de la der und le die et en al no von di du da "
                      "del zur ein".split()))
    # Schema
    ana = analysis.StemmingAnalyzer(stoplist=stoplist)
    schema = fields.Schema(
        subject=fields.TEXT(analyzer=ana, phrase=False),
    # NOTE(review): the ``fields.Schema(`` call above is never closed in this
    # copy; any further fields and the closing parenthesis are missing.

    # MARC fields to extract
    mfields = set(subjectfields)  # Subjects
    mfields.update("100 110 111".split())  # Author
    mfields.add("245")  # Title

    print("Indexing with %d processor(s) and %d MB per processor" %
          (procs, limitmb))
    c = 0  # number of records read
    t = now()  # start time for the elapsed-minutes report at the end
    ix = index.create_in(ixdir, schema)
    with ix.writer(procs=procs, limitmb=limitmb,
                   multisegment=multisegment) as w:
        # Only files in ``basedir`` whose names match ``glob`` are read.
        filenames = [
            filename for filename in os.listdir(basedir)
            if fnmatch.fnmatch(filename, glob)
        # NOTE(review): the closing ``]`` of the list above is missing.
        for filename in filenames:
            path = os.path.join(basedir, filename)
            print("Indexing", path)
            f = open(path, 'rb')
            # NOTE(review): in the visible code the loop only counts records;
            # nothing is added to ``w`` and ``f`` is never closed.
            for x, pos in read_file(f, mfields):
                c += 1
    print("Indexed %d records in %0.02f minutes" % (c, (now() - t) / 60.0))
Exemple #4
    def search_file(self, lib):
        """Time a batch lookup of the terms listed in the term file.

        The file named by ``self.options.termfile`` is read in binary mode,
        one term per line (surrounding whitespace stripped).  All terms are
        passed to ``lib.findterms`` and the elapsed time and searches per
        second are printed.

        :param lib: object providing ``findterms(terms)``; it is also
            interpolated into the banner message.
        """
        # Close the term file as soon as its lines have been read instead of
        # leaving the handle open for the rest of the benchmark.
        with open(self.options.termfile, "rb") as f:
            terms = [line.strip() for line in f]

        print("Searching %d terms with %s" % (len(terms), lib))
        starttime = now()
        # NOTE(review): the loop body was missing from this copy.  Iterating
        # without doing anything is assumed, since only the elapsed time is
        # reported -- confirm against upstream.
        for r in lib.findterms(terms):
            pass
        searchtime = now() - starttime
        print("Search time:", searchtime, "searches/s:", float(len(terms)) / searchtime)
Exemple #8
 def finish(self, doccount, lengthfile, termtable, postingwriter):
     """Collect the results of the worker tasks and write merged postings.

     Visible steps: put one ``(-1, doccount)`` item on the posting queue per
     process, read one result tuple per task from the results queue while
     accumulating field-length totals and maxima, build a length reader, and
     pass a merged iterator over the runs to ``write_postings``.

     NOTE(review): several statements are missing from this copy (see the
     inline notes), so this description covers only what is visible.
     """
     _fieldlength_totals = self._fieldlength_totals
     if not self.tasks:
     # NOTE(review): the body of the ``if`` above is missing.
     pqueue = self.postingqueue
     rqueue = self.resultsqueue
     # One ``(-1, doccount)`` item is queued per process; presumably a
     # "finish" signal for each worker -- confirm against the task code.
     for _ in xrange(self.procs):
         pqueue.put((-1, doccount))
     #print "Joining..."
     t = now()
     for task in self.tasks:
     # NOTE(review): the body of the loop above is missing.
     #print "Join:", now() - t
     #print "Getting results..."
     t = now()
     runs = []
     lenfilenames = []
     for task in self.tasks:
         taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
         # NOTE(review): ``taskruns`` and ``lenfilename`` are unpacked but
         # never appended to ``runs`` / ``lenfilenames`` in the visible code.
         for fieldnum, total in flentotals.iteritems():
             _fieldlength_totals[fieldnum] += total
         # Keep the largest length reported for each field.
         for fieldnum, length in flenmaxes.iteritems():
             if length > self._fieldlength_maxes.get(fieldnum, 0):
                 self._fieldlength_maxes[fieldnum] = length
     #print "Results:", now() - t
     #print "Writing lengths..."
     t = now()
     lw = LengthWriter(lengthfile, doccount)
     for lenfilename in lenfilenames:
         sublengths = LengthReader(StructFile(open(lenfilename, "rb")), doccount)
         # NOTE(review): ``sublengths`` is not used after being created in
         # the visible code.
     lengths = lw.reader()
     #print "Lengths:", now() - t
     t = now()
     iterator = imerge([read_run(runname, count) for runname, count in runs])
     total = sum(count for runname, count in runs)
     write_postings(self.schema, termtable, lengths, postingwriter, iterator)
     for runname, count in runs:
     # NOTE(review): the body of the loop above is missing.
     #print "Merge:", now() - t
Exemple #9
    def cache_messages(self, archive, cache):
        """Serialize every message from ``archive`` into the file ``cache``.

        Each object yielded by ``self.get_messages(archive)`` is written to
        ``cache`` with ``dump``.  A running count is printed every 1000
        messages and the elapsed time is printed at the end.

        :param archive: path of the archive to read; must exist.
        :param cache: path of the cache file to create (opened with "wb").
        :raises Exception: if ``archive`` does not exist.
        """
        print("Caching messages in %s..." % cache)

        if not os.path.exists(archive):
            raise Exception("Archive file %r does not exist" % archive)

        t = now()
        c = 0
        # The context manager guarantees the cache file is flushed and closed
        # even if reading the archive fails part-way through.
        with open(cache, "wb") as f:
            for d in self.get_messages(archive):
                c += 1
                dump(d, f)
                if not c % 1000:
                    print(c)
        print("Cached messages in ", now() - t, "seconds")
Exemple #11
    def prepare(self, top_searcher, q, context):
        """This method is called before a search.

        Subclasses can override this to perform set-up work, but
        they should still call the superclass's method because it sets several
        necessary attributes on the collector object:

        ``self.top_searcher``
            The top-level searcher.
        ``self.q``
            The query object
        ``self.context``
            ``context.needs_current`` controls whether a wrapping collector
            requires that this collector's matcher be in a valid state at every
            call to ``collect()``. If this is ``False``, the collector is free
            to use faster methods that don't necessarily keep the matcher
            updated, such as ``matcher.all_ids()``.

        The method also records the start time in ``self.starttime``, resets
        ``self.runtime`` to ``None`` and creates an empty ``self.docset``.

        :param top_searcher: the top-level :class:`whoosh.searching.Searcher`
            object.
        :param q: the :class:`whoosh.query.Query` object being searched for.
        :param context: a :class:`whoosh.searching.SearchContext` object
            containing information about the search.
        """

        self.top_searcher = top_searcher
        self.q = q
        self.context = context

        self.starttime = now()
        self.runtime = None
        self.docset = set()
Exemple #14
    def index(self, lib):
        """Run the indexing benchmark loop and print timing statistics.

        Reads ``every``, ``merge``, ``chunk``, ``skip`` and ``upto`` from
        ``self.options``, counts every ``skip``-th document yielded by
        ``self.spec.documents()``, and prints spool, commit and total times.

        NOTE(review): several statements are missing from this copy (see the
        inline notes); no call on ``lib`` is visible apart from the banner.
        """
        print("Indexing with %s..." % lib)

        options = self.options
        every = None if options.every is None else int(options.every)
        merge = options.merge
        chunk = int(options.chunk)
        skip = int(options.skip)
        upto = int(options.upto)
        count = 0
        skipc = skip

        starttime = chunkstarttime = now()


        for d in self.spec.documents():
            skipc -= 1
            if not skipc:
                count += 1
                skipc = skip
                if chunk and not count % chunk:
                    t = now()
                    sofar = t - starttime
                    # NOTE(review): the line opening this call (presumably
                    # ``print(``) is missing; the expression below ends with
                    # an unmatched closing parenthesis.
                        "Done %d docs, %0.3f secs for %d, %0.3f total, %0.3f docs/s"
                        % (count, t - chunkstarttime, chunk, sofar,
                           count / sofar))
                    chunkstarttime = t
                if count > upto:
                # NOTE(review): the body of the ``if`` above is missing.
                if every and not count % every:
                # NOTE(review): the body of the ``if`` above is missing;
                # ``merge`` is read earlier but not used in the visible code.

        spooltime = now()
        print("Spool time:", spooltime - starttime)
        committime = now()
        print("Commit time:", committime - spooltime)
        totaltime = committime - starttime
        print("Total time to index %d documents: %0.3f secs (%0.3f minutes)" %
              (count, totaltime, totaltime / 60.0))
        print("Indexed %0.3f docs/s" % (count / totaltime))
    def sort_query(self, query, sortedby, reverse=False):
        """Return the documents matching ``query`` ordered by ``sortedby``.

        :param query: query object; ``query.docs(self)`` supplies the
            documents to order.
        :param sortedby: a field name (string), a list/tuple of field names,
            or a ``Sorter`` instance.
        :param reverse: passed through to ``sorter.order``.
        :raises ValueError: if ``sortedby`` is none of the accepted types.
        :returns: a ``Results`` object built from the sorted documents and
            the time taken to sort them.
        """
        if isinstance(sortedby, basestring):
            sorter = self._field_sorter(sortedby)
        elif isinstance(sortedby, (list, tuple)):
            sorter = scoring.MultiFieldSorter([self._field_sorter(fname)
                                               for fname in sortedby])
        elif isinstance(sortedby, Sorter):
            sorter = sortedby
        else:
            # Without this ``else`` every Sorter instance was rejected and
            # unsupported types fell through with ``sorter`` unbound.  ``%r``
            # replaces the invalid ``%R`` conversion, and the one-element
            # tuple keeps the formatting safe for any argument type.
            raise ValueError("sortedby argument (%r) must be a string, list,"
                             " or Sorter" % (sortedby,))

        t = now()
        sorted_docs = list(sorter.order(self, query.docs(self), reverse=reverse))
        runtime = now() - t
        return Results(self, query, sorted_docs, None, runtime)
def test_20000_single():
    # Times 20000 iterations that each open a fresh writer on a temporary
    # index, then times an optimize step.
    # NOTE(review): the add/commit and optimize calls are missing from this
    # copy (see the inline notes).
    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000single") as ix:
        # Vocabulary from which each document's text is sampled.
        domain = ["alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                  "golf", "hotel", "india", "juliet", "kilo", "lima"]

        t = now()
        for i in xrange(20000):
            w = ix.writer()
            # NOTE(review): the line that opens the call taking the ``text=``
            # argument below is missing, so the next line is an orphaned
            # continuation.
                           text=u(" ").join(random.sample(domain, 5)))
        print("Write single:", now() - t)

        t = now()
        # NOTE(review): nothing runs between this timestamp and the report.
        print("Optimize single:", now() - t)
Exemple #20
    def finish(self):
        """This method is called after a search.

        Subclasses can override this to perform post-search work, but
        they should still call the superclass's method because it sets a
        necessary attribute on the collector object:

        ``self.runtime``
            The time (in seconds) the search took, measured from
            ``self.starttime``.
        """

        self.runtime = now() - self.starttime
Exemple #21
def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True,
    # NOTE(review): the parameter list is cut off in this copy; the body
    # below also reads ``glob`` and ``subjectfields``, which must come from
    # the missing part of the signature or from module scope.
    if not os.path.exists(ixdir):
    # NOTE(review): the body of the ``if`` above is missing from this copy.

    # Multi-lingual stop words
    stoplist = (analysis.STOP_WORDS
                | set("de la der und le die et en al no von di du da "
                      "del zur ein".split()))
    # Schema
    ana = analysis.StemmingAnalyzer(stoplist=stoplist)
    schema = fields.Schema(title=fields.TEXT(analyzer=ana),
                           subject=fields.TEXT(analyzer=ana, phrase=False),
                           file=fields.STORED, pos=fields.STORED,
    # NOTE(review): the ``fields.Schema(`` call above is never closed in this
    # copy.  ``add_document`` below passes an ``author`` keyword, so an
    # ``author`` field is presumably among the missing lines -- confirm.

    # MARC fields to extract
    mfields = set(subjectfields)  # Subjects
    mfields.update("100 110 111".split())  # Author
    mfields.add("245")  # Title

    print("Indexing with %d processor(s) and %d MB per processor"
          % (procs, limitmb))
    c = 0  # number of records added
    t = now()  # start time for the elapsed-minutes report at the end
    ix = index.create_in(ixdir, schema)
    with ix.writer(procs=procs, limitmb=limitmb,
                   multisegment=multisegment) as w:
        # Only files in ``basedir`` whose names match ``glob`` are indexed.
        filenames = [filename for filename in os.listdir(basedir)
                     if fnmatch.fnmatch(filename, glob)]
        for filename in filenames:
            path = os.path.join(basedir, filename)
            print("Indexing", path)
            f = open(path, 'rb')
            # NOTE(review): ``f`` is not closed anywhere in the visible code.
            for x, pos in read_file(f, mfields):
                # The source file name and ``pos`` are stored with each
                # record alongside its title and author.
                w.add_document(title=uni(title(x)), author=uni(author(x)),
                               file=filename, pos=pos)
                c += 1
    print("Indexed %d records in %0.02f minutes" % (c, (now() - t) / 60.0))
def test_20000_buffered():
    # Times 20000 iterations against a single BufferedWriter on a temporary
    # index, then times an optimize step.
    # NOTE(review): the add and optimize calls are missing from this copy
    # (see the inline notes).
    from whoosh.writing import BufferedWriter

    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000buffered") as ix:
        # Vocabulary from which each document's text is sampled.
        domain = ["alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                  "golf", "hotel", "india", "juliet", "kilo", "lima"]

        t = now()
        w = BufferedWriter(ix, limit=100, period=None)
        for i in xrange(20000):
            # NOTE(review): the line that opens the call taking the ``text=``
            # argument below is missing, so the next line is an orphaned
            # continuation.
                           text=u(" ").join(random.sample(domain, 5)))
        print("Write buffered:", now() - t)

        t = now()
        # NOTE(review): nothing runs between this timestamp and the report,
        # and no close of ``w`` is visible.
        print("Optimize buffered:", now() - t)
    def finish(self):
        """This method is called after a search.

        Subclasses can override this to perform post-search work, but
        they should still call the superclass's method because it sets a
        necessary attribute on the collector object:

        ``self.runtime``
            The time (in seconds) the search took, measured from
            ``self.starttime``.
        """
        # NOTE(review): this copy of the method had an unterminated docstring
        # and no body; the assignment below implements what the docstring
        # describes -- confirm against upstream.
        self.runtime = now() - self.starttime

    def dump_run(self):
        """Write the buffered postings to a new run file and reset the buffer.

        Does nothing when ``self.size`` is not positive.  Otherwise every
        item in ``self.postings`` is written with ``dump`` to a file named by
        ``self.unique_name(".run")``, the pair ``(filename, self.count)`` is
        appended to ``self.runs``, and ``self.postings``, ``self.size`` and
        ``self.count`` are reset.
        """
        if self.size > 0:
            filename = self.unique_name(".run")
            # NOTE(review): postings are written in their current order; if
            # the runs are merged later they presumably need to be sorted
            # first -- confirm whether a sort is missing from this copy.
            # The context manager closes the run file, which the original
            # left open, so the data is flushed before the run is recorded.
            with open(filename, "w+b") as runfile:
                for p in self.postings:
                    dump(p, runfile)

            self.runs.append((filename, self.count))
            self.postings = []
            self.size = 0
            self.count = 0
Exemple #28
def timing(name=None):
    """Print a labelled elapsed time taken from two ``now()`` readings.

    :param name: label for the output line; an empty label is used when it
        is ``None`` or otherwise falsy.
    """
    # NOTE(review): nothing runs between the two readings in this copy, so
    # the measured span is only the cost of the calls themselves.
    started = now()
    elapsed = now() - started
    label = name or ''
    print("%s: %0.06f s" % (label, elapsed))
Exemple #33
    def _simple_sort_query(self, q, limit=None, reverse=False, filter=None):
        # If the direction of all sort fields is the same, we can use field
        # caches to do the sorting
        #
        # NOTE(review): several lines are missing from this copy (an unclosed
        # call and at least two ``else:`` branches); see the inline notes.

        t = now()
        docset = set()
        # Field names come from the first element of each criterion; the
        # direction is taken from the first criterion only and XOR-ed with
        # the ``reverse`` argument.
        sortedby = [c[0] for c in self.criteria]
        reverse = self.criteria[0][1] ^ reverse
        comb = self.searcher._filter_to_comb(filter)

        if self.searcher.subsearchers:
            heap = []

            # I wish I could actually do a heap thing here, but the Python heap
            # queue only works with greater-than, and I haven't thought of a
            # smart way to get around that yet, so I'm being dumb and using
            # nlargest/nsmallest on the heap + each subreader list :(
            op = nlargest if reverse else nsmallest

            for s, offset in self.searcher.subsearchers:
                # This searcher is wrapping a MultiReader, so push the sorting
                # down to the leaf readers and then combine the results.
                docnums = [docnum for docnum in q.docs(s)
                           if (not comb) or docnum + offset in comb]

                # Add the docnums to the docset
                # NOTE(review): the statement that does this is missing.

                # Ask the reader to return a list of (key, docnum) pairs to
                # sort by. If limit=None, the returned list is not sorted. If
                # limit=True, it is sorted.
                r = s.reader()
                srt = r.key_docs_by(sortedby, docnums, limit, reverse=reverse,
                # NOTE(review): the call above is never closed in this copy.
                if limit:
                    # Pick the "limit" smallest/largest items from the current
                    # and new list
                    heap = op(limit, heap + srt)
                    # NOTE(review): an ``else:`` branch appears to be missing
                    # before the comment below.
                    # If limit=None, we'll just add everything to the "heap"
                    # and sort it at the end.

            # Sort the heap and take the docnums
            docnums = [docnum for _, docnum in sorted(heap, reverse=reverse)]

            # NOTE(review): an ``else:`` appears to be missing here; as
            # written, the atomic-reader code below runs in the same branch
            # and overwrites ``docnums``.
            # This searcher is wrapping an atomic reader, so we don't need to
            # get tricky combining the results of multiple readers, just ask
            # the reader to sort the results.
            r = self.searcher.reader()
            docnums = [docnum for docnum in q.docs(self.searcher)
                       if (not comb) or docnum in comb]
            docnums = r.sort_docs_by(sortedby, docnums, reverse=reverse)
            docset = set(docnums)

            # I artificially enforce the limit here, even thought the current
            # implementation can't use it, so that the results don't change
            # based on single- vs- multi-segment.
            docnums = docnums[:limit]

        runtime = now() - t
        return self._results(q, docnums, docset, runtime)
Exemple #35
 def download_archive(self, archive):
     """Download ``self.enron_archive_url`` to ``archive`` and report the time.

     :param archive: destination path handed to ``urlretrieve``.
     """
     print("Downloading Enron email archive to %r..." % archive)
     started = now()
     urlretrieve(self.enron_archive_url, archive)
     elapsed = now() - started
     print("Downloaded in ", elapsed, "seconds")
    def search(self, query, limit=10, sortedby=None, reverse=False,
        # NOTE(review): the parameter list is cut off in this copy; the
        # docstring documents an ``optimize`` parameter that is not visible
        # in the signature.
        """Runs the query represented by the ``query`` object and returns a
        Results object.

        :param query: a :class:`whoosh.query.Query` object.
        :param limit: the maximum number of documents to score. If you're only
            interested in the top N documents, you can set limit=N to limit the
            scoring for a faster search.
        :param sortedby: if this parameter is not None, the results are sorted
            instead of scored. If this value is a string, the results are
            sorted by the field named in the string. If this value is a list or
            tuple, it is assumed to be a sequence of strings and the results
            are sorted by the fieldnames in the sequence. Otherwise 'sortedby'
            should be a scoring.Sorter object.

            The fields you want to sort by must be indexed.

            For example, to sort the results by the 'path' field::

                searcher.find(q, sortedby = "path")

            To sort the results by the 'path' field and then the 'category'
            field::

                searcher.find(q, sortedby = ("path", "category"))

            To use a sorting object::

                searcher.find(q, sortedby = scoring.FieldSorter("path", key=mykeyfn))

            Using a string or tuple simply instantiates a
            :class:`whoosh.scoring.FieldSorter` or
            :class:`whoosh.scoring.MultiFieldSorter` object for you. To get a
            custom sort order, instantiate your own ``FieldSorter`` with a
            ``key`` argument, or write a custom :class:`whoosh.scoring.Sorter`
            class.

            FieldSorter and MultiFieldSorter cache the document order, using 4
            bytes times the number of documents in the index, and taking time
            to cache. To increase performance, instantiate your own sorter and
            re-use it (but remember you need to recreate it if the index
            changes).
        :param reverse: if ``sortedby`` is not None, this reverses the
            direction of the sort.
        :param optimize: use optimizations to get faster results when possible.
        :rtype: :class:`Results`
        """

        if limit is not None and limit < 1:
            raise ValueError("limit must be >= 1")

        # Sorted searches are delegated entirely to sort_query().
        if sortedby is not None:
            return self.sort_query(query, sortedby, reverse=reverse)
        t = now()
        matcher = query.matcher(self)
        if isinstance(matcher, NullMatcher):
            scores = []
            docnums = []
            bitset = None
            # NOTE(review): an ``else:`` appears to be missing before the
            # ``collect`` call below, and that call is never closed.
            scores, docnums, bitset = collect(self, matcher, limit,
        runtime = now() - t

        return Results(self, query, docnums, scores, runtime, docs=bitset)
Exemple #38
    # Text field analyzed with ``ana`` and created with ``spelling=True``.
    content = fields.TEXT(spelling=True, analyzer=ana)

    # Sortable ID field.
    chapter = fields.ID(sortable=True)

    # Sortable numeric field.
    size = fields.NUMERIC(sortable=True)

    # The three fields below are sortable TEXT fields.
    # NOTE(review): presumably they hold formatted timestamp strings (the
    # indexing script builds one from ``st_atime``) -- confirm.
    lastopened = fields.TEXT(sortable=True)

    lastchanged = fields.TEXT(sortable=True)

    created = fields.TEXT(sortable=True)

# Create a new index in ``indexdir`` using PydocSchema, then walk
# ``sourcedir`` and collect per-file metadata inside one writer context.
# NOTE(review): this copy ends before any document is added to ``w``.
ix = index.create_in(indexdir, PydocSchema)
with ix.writer(limitmb=2048) as w:
    t = now()
    for dirpath, dirnames, filenames in os.walk(sourcedir):
        # The chapter is the name of the directory currently being walked.
        chapter = unicode(os.path.basename(dirpath))
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            size = os.path.getsize(filepath)

            path = dirpath

            # Split the name from its extension; bytes that cannot be decoded
            # are dropped (``errors='ignore'``).
            fileName, fileExt = os.path.splitext(filename)
            fileName = unicode(fileName, errors='ignore')
            fileExt = unicode(fileExt, errors='ignore')
            data = None

            # Last-access time as a ctime() string with its first four
            # characters removed.
            lasto = time.ctime(os.stat(filepath).st_atime)
            lasto = unicode(lasto[4:])
Exemple #43
    def results(self, qstring, cat_order, category=None, shortcuts=None,
                limit=None, cat_limit=5):
        # Runs up to three searches for ``qstring`` (on the "bestbet", "grams"
        # and "content" fields), groups hits by category, orders the
        # categories, and returns a dict describing the result page.
        #
        # NOTE(review): many lines are missing from this copy (``else:`` and
        # ``try:`` lines, loop bodies, unclosed calls, and the closing brace
        # of the returned dict); see the inline notes.
        from whoosh.util import now

        t = now()
        s = self.searcher
        limit = limit or self.limit
        showall = False

        if shortcuts:
            qstring, showall = self.expand_shortcuts(qstring, shortcuts)

        if category:
            filter = query.Term("category", category)
            # NOTE(review): an ``else:`` appears to be missing before the
            # next line; as written the filter is always reset to None.
            filter = None

        all_q = self.make_query(qstring, "content")

        # Best bets are only looked up when no category is selected and every
        # leaf of the content query is a Term on the "content" field.
        show_best = (not category and
                     all(isinstance(lq, query.Term) and lq.field() == "content"
                         for lq in all_q.leaves()))
        if show_best:
            best_q = self.make_query(qstring, "bestbet")
            best_r = s.search(best_q, limit=10)
            # NOTE(review): an ``else:`` appears to be missing before the
            # next line; as written ``best_r`` is always reset to None.
            best_r = None

        grams_groups = None
        grams_q = self.make_query(qstring, "grams")
        if any(fn == "grams" for fn, _ in grams_q.iter_all_terms()):
            # NOTE(review): the ``try:`` line is missing, the search call
            # below is never closed, and the ``except`` has no handler body
            # of its own.
                grams_r = s.search(grams_q, limit=limit, groupedby="category",
            except query.QueryError:
                grams_groups = grams_r.groups()

        # NOTE(review): the search call below is never closed in this copy.
        all_r = s.search(all_q, limit=limit, groupedby="category",
        all_groups = all_r.groups()

        # OK, this is complicated... we want to present the categories in the
        # order defined in cat_order, BUT we want categories that have grams
        # matches to come before categories that only have content matches
        final_order = []
        if grams_groups:
            # Add categories in grams_groups in the order defined by cat_order
            for cat in cat_order:
                if cat in grams_groups:
                # NOTE(review): the body of the ``if`` above is missing.
            # Add any categories in grams_groups that aren't in cat_order
            final_order.extend(cat for cat in sorted(grams_groups)
                               if cat not in cat_order)

        seen = set(final_order)
        # Add categories in all_groups in the order defined by cat_order, IF
        # they weren't already added in the previous step
        for cat in cat_order:
            if cat in all_groups and cat not in seen:
            # NOTE(review): the body of the ``if`` above is missing.
        # Add any categories in all_groups that weren't added in the previous
        # steps
        final_order.extend(cat for cat in sorted(all_groups)
                           if cat not in cat_order and cat not in seen)

        # If there's only one category, there's no point in cutting it off,
        # just show all hits
        showall = showall or len(final_order) == 1

        # For each category, pull out the docnums and get their stored fields
        length = 0
        categories = []
        for cat in final_order:
            # Combine the docnums for this category from grams and all
            docnums = []
            seen = set()
            if grams_groups:
                for docnum in grams_groups.get(cat, ()):
                # NOTE(review): the body of the loop above is missing.

            for docnum in all_groups.get(cat, ()):
                if docnum not in seen:
                # NOTE(review): the body of the ``if`` above is missing.

            # If the number of hits is exactly the limit + 1, then there's no
            # point showing a "show more" line instead of that one extra hit,
            # so just increase the limit in that case
            if len(docnums) == cat_limit + 1:
                cutoff = len(docnums)
                # NOTE(review): an ``else:`` appears to be missing before the
                # next line; as written ``cutoff`` is always ``cat_limit``.
                cutoff = cat_limit

            if not showall and len(docnums) > cutoff:
                docnums = docnums[:cutoff]

            length += len(seen)
            docs = [s.stored_fields(docnum) for docnum in docnums]
            categories.append((cat, docs, len(seen)))

        sent = now()
        # Elapsed time is reported in milliseconds.
        runtime_ms = (sent - t) * 1000
        # NOTE(review): the dict literal below is never closed in this copy.
        return {
            "qstring": qstring,
            "best": best_r,
            "category": category,
            "categories": categories,
            "length": length,
            "limit": limit,
            "hits": all_r,
            "sent": sent,
            "runtime": runtime_ms,
Exemple #48
    def search(self, query, limit=10, sortedby=None, reverse=False,
        # NOTE(review): the parameter list is cut off in this copy; the
        # docstring documents an ``optimize`` parameter that is not visible
        # in the signature.
        """Runs the query represented by the ``query`` object and returns a
        Results object.

        :param query: a :class:`whoosh.query.Query` object.
        :param limit: the maximum number of documents to score. If you're only
            interested in the top N documents, you can set limit=N to limit the
            scoring for a faster search.
        :param sortedby: if this parameter is not None, the results are sorted
            instead of scored. If this value is a string, the results are
            sorted by the field named in the string. If this value is a list or
            tuple, it is assumed to be a sequence of strings and the results
            are sorted by the fieldnames in the sequence. Otherwise 'sortedby'
            should be a scoring.Sorter object.

            The fields you want to sort by must be indexed.

            For example, to sort the results by the 'path' field::

                searcher.find(q, sortedby = "path")

            To sort the results by the 'path' field and then the 'category'
            field::

                searcher.find(q, sortedby = ("path", "category"))

            To use a sorting object::

                searcher.find(q, sortedby = scoring.FieldSorter("path", key=mykeyfn))

            Using a string or tuple simply instantiates a
            :class:`whoosh.scoring.FieldSorter` or
            :class:`whoosh.scoring.MultiFieldSorter` object for you. To get a
            custom sort order, instantiate your own ``FieldSorter`` with a
            ``key`` argument, or write a custom :class:`whoosh.scoring.Sorter`
            class.

            FieldSorter and MultiFieldSorter cache the document order, using 4
            bytes times the number of documents in the index, and taking time
            to cache. To increase performance, instantiate your own sorter and
            re-use it (but remember you need to recreate it if the index
            changes).
        :param reverse: if ``sortedby`` is not None, this reverses the
            direction of the sort.
        :param optimize: use optimizations to get faster results when possible.
        :rtype: :class:`Results`
        """

        if limit is not None and limit < 1:
            raise ValueError("limit must be >= 1")

        # Sorted searches are delegated entirely to sort_query().
        if sortedby is not None:
            return self.sort_query(query, sortedby, reverse=reverse)
        t = now()
        matcher = query.matcher(self)
        if isinstance(matcher, NullMatcher):
            scores = []
            docnums = []
            bitset = None
            # NOTE(review): an ``else:`` appears to be missing before the
            # ``collect`` call below, and that call is never closed.
            scores, docnums, bitset = collect(self, matcher, limit,
        runtime = now() - t

        return Results(self, query, docnums, scores, runtime, docs=bitset)
