Example #2
    def index(self, lib):
        print "Indexing with %s..." % lib

        options = self.options
        chunk = int(options.chunk)
        skip = int(options.skip)
        upto = int(options.upto)
        count = 0
        skipc = skip

        starttime = chunkstarttime = now()
        lib.indexer()
        for d in self.spec.documents():
            skipc -= 1
            if not skipc:
                lib.index_document(d)
                count += 1
                skipc = skip
                if chunk and not count % chunk:
                    t = now()
                    sofar = t - starttime
                    print "Done %d docs, %0.3f secs for %d, %0.3f total, %0.3f docs/s" % (
                        count, t - chunkstarttime, chunk, sofar, count / sofar)
                    chunkstarttime = t
                if count > upto:
                    break

        spooltime = now()
        print "Spool time:", spooltime - starttime
        lib.finish()
        committime = now()
        print "Commit time:", committime - spooltime
        print "Total time to index", count, "documents:", committime - starttime
Example #3
def make_index(basedir,
               ixdir,
               procs=4,
               limitmb=128,
               multisegment=True,
               glob="*.mrc"):
    if not os.path.exists(ixdir):
        os.mkdir(ixdir)

    # Multi-lingual stop words
    stoplist = (analysis.STOP_WORDS
                | set("de la der und le die et en al no von di du da "
                      "del zur ein".split()))
    # Schema
    ana = analysis.StemmingAnalyzer(stoplist=stoplist)
    schema = fields.Schema(
        title=fields.TEXT(analyzer=ana),
        author=fields.TEXT(phrase=False),
        subject=fields.TEXT(analyzer=ana, phrase=False),
        file=fields.STORED,
        pos=fields.STORED,
    )

    # MARC fields to extract
    mfields = set(subjectfields)  # Subjects
    mfields.update("100 110 111".split())  # Author
    mfields.add("245")  # Title

    print("Indexing with %d processor(s) and %d MB per processor" %
          (procs, limitmb))
    c = 0
    t = now()
    ix = index.create_in(ixdir, schema)
    with ix.writer(procs=procs, limitmb=limitmb,
                   multisegment=multisegment) as w:
        filenames = [
            filename for filename in os.listdir(basedir)
            if fnmatch.fnmatch(filename, glob)
        ]
        for filename in filenames:
            path = os.path.join(basedir, filename)
            print("Indexing", path)
            f = open(path, 'rb')
            for x, pos in read_file(f, mfields):
                w.add_document(title=uni(title(x)),
                               author=uni(author(x)),
                               subject=uni(subjects(x)),
                               file=filename,
                               pos=pos)
                c += 1
            f.close()
        print("Committing...")
    print("Indexed %d records in %0.02f minutes" % (c, (now() - t) / 60.0))
Example #4
    def search_file(self, lib):
        f = open(self.options.termfile, "rb")
        terms = [line.strip() for line in f]
        f.close()

        print("Searching %d terms with %s" % (len(terms), lib))
        lib.searcher()
        starttime = now()
        for r in lib.findterms(terms):
            pass
        searchtime = now() - starttime
        print("Search time:", searchtime, "searches/s:", float(len(terms)) / searchtime)
Example #5
    def search(self, lib):
        lib.searcher()

        t = now()
        q = lib.query()
        print "Query:", q
        r = lib.find(q)
        print "Search time:", now() - t

        t = now()
        self.spec.print_results(lib.results(r))
        print "Print time:", now() - t
Example #6
    def search_file(self, lib):
        f = open(self.options.termfile, "rb")
        terms = [line.strip() for line in f]
        f.close()

        print "Searching %d terms with %s" % (len(terms), lib)
        lib.searcher()
        starttime = now()
        for r in lib.findterms(terms):
            pass
        searchtime = now() - starttime
        print "Search time:", searchtime, "searches/s:", float(len(terms)) / searchtime
Example #8
    def finish(self, doccount, lengthfile, termtable, postingwriter):
        _fieldlength_totals = self._fieldlength_totals
        if not self.tasks:
            return

        pqueue = self.postingqueue
        rqueue = self.resultsqueue

        for _ in xrange(self.procs):
            pqueue.put((-1, doccount))

        #print "Joining..."
        t = now()
        for task in self.tasks:
            task.join()
        #print "Join:", now() - t

        #print "Getting results..."
        t = now()
        runs = []
        lenfilenames = []
        for task in self.tasks:
            taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
            runs.extend(taskruns)
            lenfilenames.append(lenfilename)
            for fieldnum, total in flentotals.iteritems():
                _fieldlength_totals[fieldnum] += total
            for fieldnum, length in flenmaxes.iteritems():
                if length > self._fieldlength_maxes.get(fieldnum, 0):
                    self._fieldlength_maxes[fieldnum] = length
        #print "Results:", now() - t

        #print "Writing lengths..."
        t = now()
        lw = LengthWriter(lengthfile, doccount)
        for lenfilename in lenfilenames:
            sublengths = LengthReader(StructFile(open(lenfilename, "rb")), doccount)
            lw.add_all(sublengths)
            os.remove(lenfilename)
        lw.close()
        lengths = lw.reader()
        #print "Lengths:", now() - t

        t = now()
        iterator = imerge([read_run(runname, count) for runname, count in runs])
        total = sum(count for runname, count in runs)
        write_postings(self.schema, termtable, lengths, postingwriter, iterator)
        for runname, count in runs:
            os.remove(runname)
        #print "Merge:", now() - t

        self.cleanup()
Example #9
File: enron.py Project: JunjieHu/dl
    def cache_messages(self, archive, cache):
        print("Caching messages in %s..." % cache)

        if not os.path.exists(archive):
            raise Exception("Archive file %r does not exist" % archive)

        t = now()
        f = open(cache, "wb")
        c = 0
        for d in self.get_messages(archive):
            c += 1
            dump(d, f)
            if not c % 1000: print(c)
        f.close()
        print("Cached messages in ", now() - t, "seconds")
Example #11
    def prepare(self, top_searcher, q, context):
        """This method is called before a search.

        Subclasses can override this to perform set-up work, but
        they should still call the superclass's method because it sets several
        necessary attributes on the collector object:

        self.top_searcher
            The top-level searcher.
        self.q
            The query object
        self.context
            ``context.needs_current`` controls whether a wrapping collector
            requires that this collector's matcher be in a valid state at every
            call to ``collect()``. If this is ``False``, the collector is free
            to use faster methods that don't necessarily keep the matcher
            updated, such as ``matcher.all_ids()``.

        :param top_searcher: the top-level :class:`whoosh.searching.Searcher`
            object.
        :param q: the :class:`whoosh.query.Query` object being searched for.
        :param context: a :class:`whoosh.searching.SearchContext` object
            containing information about the search.
        """

        self.top_searcher = top_searcher
        self.q = q
        self.context = context

        self.starttime = now()
        self.runtime = None
        self.docset = set()
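
prepare() and finish() are normally called for you when a collector is handed to a search. A minimal usage sketch, assuming whoosh's documented collector API (Searcher.collector, Searcher.search_with_collector, and TimeLimitCollector/TimeLimit from whoosh.collectors); searcher and q are assumed to already exist:

from whoosh.collectors import TimeLimitCollector, TimeLimit

c = searcher.collector(limit=10)            # plain top-N collector
tlc = TimeLimitCollector(c, timelimit=5.0)  # wrap it with a time limit
try:
    searcher.search_with_collector(q, tlc)  # drives prepare(), collect(), finish()
except TimeLimit:
    print("Search hit the time limit; partial results follow")
results = tlc.results()
print("runtime:", results.runtime)          # set in finish() as now() - starttime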
Example #13
    def _complex_sort_query(self, q, limit=None, reverse=False, filter=None):
        t = now()
        if self.arrays is None:
            self._complex_cache()
        comb = self.searcher._filter_to_comb(filter)
        docnums = [docnum for docnum in self.searcher.docs_for_query(q)
                   if (not comb) or docnum in comb]
        docnums.sort(key=self._complex_key_fn, reverse=reverse)
        docset = set(docnums)

        # I artificially enforce the limit here, even though the current
        # implementation can't use it, so that the results don't change based
        # on single- vs. multi-segment.
        if limit:
            docnums = docnums[:limit]
        runtime = now() - t
        return self._results(q, docnums, docset, runtime)
Example #14
    def index(self, lib):
        print("Indexing with %s..." % lib)

        options = self.options
        every = None if options.every is None else int(options.every)
        merge = options.merge
        chunk = int(options.chunk)
        skip = int(options.skip)
        upto = int(options.upto)
        count = 0
        skipc = skip

        starttime = chunkstarttime = now()

        lib.indexer()

        for d in self.spec.documents():
            skipc -= 1
            if not skipc:
                lib.index_document(d)
                count += 1
                skipc = skip
                if chunk and not count % chunk:
                    t = now()
                    sofar = t - starttime
                    print(
                        "Done %d docs, %0.3f secs for %d, %0.3f total, %0.3f docs/s"
                        % (count, t - chunkstarttime, chunk, sofar,
                           count / sofar))
                    chunkstarttime = t
                if count > upto:
                    break
                if every and not count % every:
                    print("----Commit")
                    lib.finish(merge=merge)
                    lib.indexer(create=False)

        spooltime = now()
        print("Spool time:", spooltime - starttime)
        lib.finish(merge=merge)
        committime = now()
        print("Commit time:", committime - spooltime)
        totaltime = committime - starttime
        print("Total time to index %d documents: %0.3f secs (%0.3f minutes)" %
              (count, totaltime, totaltime / 60.0))
        print("Indexed %0.3f docs/s" % (count / totaltime))
Example #16
    def sort_query(self, query, sortedby, reverse=False):
        if isinstance(sortedby, basestring):
            sorter = self._field_sorter(sortedby)
        elif isinstance(sortedby, (list, tuple)):
            sorter = scoring.MultiFieldSorter([self._field_sorter(fname)
                                               for fname in sortedby])
        elif isinstance(sortedby, Sorter):
            sorter = sortedby
        else:
            raise ValueError("sortedby argument (%R) must be a string, list,"
                             " or Sorter" % sortedby)

        t = now()
        sorted_docs = list(sorter.order(self, query.docs(self), reverse=reverse))
        runtime = now() - t
        
        return Results(self, query, sorted_docs, None, runtime)
Example #17
def test_20000_single():
    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000single") as ix:
        domain = ["alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                  "golf", "hotel", "india", "juliet", "kilo", "lima"]

        t = now()
        for i in xrange(20000):
            w = ix.writer()
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
            w.commit()
        print("Write single:", now() - t)

        t = now()
        ix.optimize()
        print("Optimize single:", now() - t)
Example #19
def search(qstring, ixdir, basedir, limit=None, optimize=True, scores=True):
    ix = index.open_dir(ixdir)
    qp = qparser.QueryParser("title", ix.schema)
    q = qp.parse(qstring)

    with ix.searcher(weighting=scoring.PL2()) as s:
        if scores:
            r = s.search(q, limit=limit, optimize=optimize)
            for hit in r:
                print_record(hit.rank, basedir, hit["file"], hit["pos"])
            print("Found %d records in %0.06f seconds" % (len(r), r.runtime))
        else:
            t = now()
            for i, docnum in enumerate(s.docs_for_query(q)):
                if not limit or i < limit:
                    fields = s.stored_fields(docnum)
                    print_record(i, basedir, fields["file"], fields["pos"])
            print("Found %d records in %0.06f seconds" % (i, now() - t))
Example #20
    def finish(self):
        """This method is called after a search.
        Subclasses can override this to perform clean-up work, but
        they should still call the superclass's method because it sets several
        necessary attributes on the collector object:
        self.runtime
            The time (in seconds) the search took.
        """

        self.runtime = now() - self.starttime
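
A sketch of the override pattern this docstring describes: call the superclass so self.runtime is set, then do any extra work. The class name and print line are illustrative only, and a complete collector subclass would also implement the other collector methods:

from whoosh.collectors import Collector

class TimedCollector(Collector):
    def finish(self):
        Collector.finish(self)  # sets self.runtime = now() - self.starttime
        print("query %r finished in %0.6f s" % (self.q, self.runtime))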
Example #22
def test_20000_buffered():
    from whoosh.writing import BufferedWriter

    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000buffered") as ix:
        domain = ["alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                  "golf", "hotel", "india", "juliet", "kilo", "lima"]

        t = now()
        w = BufferedWriter(ix, limit=100, period=None)
        for i in xrange(20000):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
        w.close()
        print("Write buffered:", now() - t)

        t = now()
        ix.optimize()
        print("Optimize buffered:", now() - t)
Example #23
    def index_document(self, d):
        try:
            self.archive.indexDictionary(str(self.count), d)
        except ValueError:
            print "d=", d
            raise
        self.count += 1
        if not self.count % int(self.options.batch):
            t = now()
            self.archive.store(lazy=True)
            self.indexer(create=False)
Example #24
    def index_document(self, d):
        try:
            self.archive.indexDictionary(str(self.count), d)
        except ValueError:
            print("d=", d)
            raise
        self.count += 1
        if not self.count % int(self.options.batch):
            t = now()
            self.archive.store(lazy=True)
            self.indexer(create=False)
Example #26
    def dump_run(self):
        if self.size > 0:
            #print "Dumping run..."
            t = now()
            filename = self.unique_name(".run")
            runfile = open(filename, "w+b")
            self.postings.sort()
            for p in self.postings:
                dump(p, runfile)
            runfile.close()

            self.runs.append((filename, self.count))
            self.postings = []
            self.size = 0
            self.count = 0
Example #28
from contextlib import contextmanager

@contextmanager
def timing(name=None):
    # Times the enclosed with-block and prints the elapsed seconds.
    t = now()
    yield
    t = now() - t
    print("%s: %0.06f s" % (name or '', t))
Example #29
    content = fields.TEXT(spelling=True, analyzer=ana)

    chapter = fields.ID(sortable=True)

    size = fields.NUMERIC(sortable=True)

    lastopened = fields.TEXT(sortable=True)

    lastchanged = fields.TEXT(sortable=True)

    created = fields.TEXT(sortable=True)


ix = index.create_in(indexdir, PydocSchema)
with ix.writer(limitmb=2048) as w:
    t = now()
    for dirpath, dirnames, filenames in os.walk(sourcedir):
        chapter = unicode(os.path.basename(dirpath))
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            size = os.path.getsize(filepath)

            path = dirpath

            fileName, fileExt = os.path.splitext(filename)
            fileName = unicode(fileName, errors='ignore')
            fileExt = unicode(fileExt, errors='ignore')
            data = None

            lasto = time.ctime(os.stat(filepath).st_atime)
            lasto = unicode(lasto[4:])
Example #30
schema = fields.Schema(
    tags=fields.KEYWORD(stored=True, vector=formats.Existence()))

if not os.path.exists(dirname):
    os.mkdir(dirname)

reindex = False
if reindex or not index.exists_in(dirname):
    tags = []
    for _ in xrange(tagcount):
        tag = u"".join(
            random.choice(string.ascii_lowercase) for _ in xrange(5))
        tags.append(tag)

    ix = index.create_in(dirname, schema)
    t = now()
    with ix.writer() as w:
        for i in xrange(doccount):
            doc = u" ".join(random.sample(tags, random.randint(10, 20)))
            w.add_document(tags=doc)
            if not i % 10000:
                print i
    print now() - t

ix = index.open_dir(dirname)
with ix.searcher() as s:
    tags = list(s.lexicon("tags"))
    facet = sorting.FieldFacet("tags", allow_overlap=True)
    qtag = random.choice(tags)
    print "tag=", qtag
    q = query.Term("tags", qtag)
Example #31
dirname = "testindex"

schema = fields.Schema(tags=fields.KEYWORD(stored=True, vector=formats.Existence()))

if not os.path.exists(dirname):
    os.mkdir(dirname)

reindex = False
if reindex or not index.exists_in(dirname):
    tags = []
    for _ in xrange(tagcount):
        tag = u"".join(random.choice(string.ascii_lowercase) for _ in xrange(5))
        tags.append(tag)

    ix = index.create_in(dirname, schema)
    t = now()
    with ix.writer() as w:
        for i in xrange(doccount):
            doc = u" ".join(random.sample(tags, random.randint(10, 20)))
            w.add_document(tags=doc)
            if not i % 10000:
                print i
    print now() - t


ix = index.open_dir(dirname)
with ix.searcher() as s:
    tags = list(s.lexicon("tags"))
    facet = sorting.FieldFacet("tags", allow_overlap=True)
    qtag = random.choice(tags)
    print "tag=", qtag
Example #32
    def __exit__(self, exc_type, exc_val, exc_tb):
        if not exc_type:
            print "%0.8f" % (now() - self.t)
Example #33
    def _simple_sort_query(self, q, limit=None, reverse=False, filter=None):
        # If the direction of all sort fields is the same, we can use field
        # caches to do the sorting

        t = now()
        docset = set()
        sortedby = [c[0] for c in self.criteria]
        reverse = self.criteria[0][1] ^ reverse
        comb = self.searcher._filter_to_comb(filter)

        if self.searcher.subsearchers:
            heap = []

            # I wish I could actually do a heap thing here, but the Python heap
            # queue only works with greater-than, and I haven't thought of a
            # smart way to get around that yet, so I'm being dumb and using
            # nlargest/nsmallest on the heap + each subreader list :(
            op = nlargest if reverse else nsmallest

            for s, offset in self.searcher.subsearchers:
                # This searcher is wrapping a MultiReader, so push the sorting
                # down to the leaf readers and then combine the results.
                docnums = [docnum for docnum in q.docs(s)
                           if (not comb) or docnum + offset in comb]

                # Add the docnums to the docset
                docset.update(docnums)

                # Ask the reader to return a list of (key, docnum) pairs to
                # sort by. If limit=None, the returned list is not sorted. If
                # limit=True, it is sorted.
                r = s.reader()
                srt = r.key_docs_by(sortedby, docnums, limit, reverse=reverse,
                                    offset=offset)
                if limit:
                    # Pick the "limit" smallest/largest items from the current
                    # and new list
                    heap = op(limit, heap + srt)
                else:
                    # If limit=None, we'll just add everything to the "heap"
                    # and sort it at the end.
                    heap.extend(srt)

            # Sort the heap and take the docnums
            docnums = [docnum for _, docnum in sorted(heap, reverse=reverse)]

        else:
            # This searcher is wrapping an atomic reader, so we don't need to
            # get tricky combining the results of multiple readers, just ask
            # the reader to sort the results.
            r = self.searcher.reader()
            docnums = [docnum for docnum in q.docs(self.searcher)
                       if (not comb) or docnum in comb]
            docnums = r.sort_docs_by(sortedby, docnums, reverse=reverse)
            docset = set(docnums)

            # I artificially enforce the limit here, even though the current
            # implementation can't use it, so that the results don't change
            # based on single- vs. multi-segment.
            docnums = docnums[:limit]

        runtime = now() - t
        return self._results(q, docnums, docset, runtime)
Example #35
File: enron.py Project: JunjieHu/dl
    def download_archive(self, archive):
        print("Downloading Enron email archive to %r..." % archive)
        t = now()
        urlretrieve(self.enron_archive_url, archive)
        print("Downloaded in ", now() - t, "seconds")
Example #36
    def search(self, query, limit=10, sortedby=None, reverse=False,
               optimize=True):
        """Runs the query represented by the ``query`` object and returns a
        Results object.
        
        :param query: a :class:`whoosh.query.Query` object.
        :param limit: the maximum number of documents to score. If you're only
            interested in the top N documents, you can set limit=N to limit the
            scoring for a faster search.
        :param sortedby: if this parameter is not None, the results are sorted
            instead of scored. If this value is a string, the results are
            sorted by the field named in the string. If this value is a list or
            tuple, it is assumed to be a sequence of strings and the results
            are sorted by the fieldnames in the sequence. Otherwise 'sortedby'
            should be a scoring.Sorter object.
            
            The fields you want to sort by must be indexed.
            
            For example, to sort the results by the 'path' field::
            
                searcher.find(q, sortedby = "path")
                
            To sort the results by the 'path' field and then the 'category'
            field::
                
                searcher.find(q, sortedby = ("path", "category"))
                
            To use a sorting object::
            
                searcher.find(q, sortedby = scoring.FieldSorter("path", key=mykeyfn))
            
            Using a string or tuple simply instantiates a
            :class:`whoosh.scoring.FieldSorter` or
            :class:`whoosh.scoring.MultiFieldSorter` object for you. To get a
            custom sort order, instantiate your own ``FieldSorter`` with a
            ``key`` argument, or write a custom :class:`whoosh.scoring.Sorter`
            class.
            
            FieldSorter and MultiFieldSorter cache the document order, using 4
            bytes times the number of documents in the index, and taking time
            to cache. To increase performance, instantiate your own sorter and
            re-use it (but remember you need to recreate it if the index
            changes).
        
        :param reverse: if ``sortedby`` is not None, this reverses the
            direction of the sort.
        :param optimize: use optimizations to get faster results when possible.
        :rtype: :class:`Results`
        """

        if limit is not None and limit < 1:
            raise ValueError("limit must be >= 1")

        if sortedby is not None:
            return self.sort_query(query, sortedby, reverse=reverse)
        
        t = now()
        matcher = query.matcher(self)
        if isinstance(matcher, NullMatcher):
            scores = []
            docnums = []
            bitset = None
        else:
            scores, docnums, bitset = collect(self, matcher, limit,
                                              usequality=optimize)
        runtime = now() - t

        return Results(self, query, docnums, scores, runtime, docs=bitset)
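
Following the docstring's note on sorter caching, a sorter can be built once and reused across searches; a small sketch using the scoring.FieldSorter named above (searcher, q1 and q2 are assumed to exist):

path_sorter = scoring.FieldSorter("path")       # caches document order on first use
r1 = searcher.search(q1, sortedby=path_sorter)
r2 = searcher.search(q2, sortedby=path_sorter)  # reuses the cached order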
Example #37
def test_bigsort():
    times = 30000
    dirname = "testindex"

    df = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=df)

    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)
    ix = index.create_in(dirname, schema)

    print("Writing...")
    t = now()
    w = ix.writer(limitmb=512)
    for i in xrange(times):
        dt = datetime.fromtimestamp(random.randint(15839593, 1294102139))
        w.add_document(id=text_type(i), date=dt)
    w.commit()
    print("Writing took ", now() - t)

    ix = index.open_dir(dirname)
    s = ix.searcher()
    q = query.Wildcard("id", "1?2*")

    t = now()
    x = list(df.sortable_terms(s.reader(), "date"))
    print(now() - t, len(x))

    t = now()
    for y in x:
        p = list(s.postings("date", y).all_ids())
    print(now() - t)



    t = now()
    r = s.search(q, limit=25, sortedby="date", reverse=True)
    print("Search 1 took", now() - t)
    print("len=", r.scored_length())

    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)

    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)

    from heapq import nlargest
    t = now()
    sf = s.stored_fields
    gen = ((sf(n)["date"], n) for n in q.docs(s))
    r = nlargest(25, gen)
    print(now() - t)
Example #39
class PydocSchema(fields.SchemaClass):

    path = fields.STORED
    title = fields.TEXT(stored=True, sortable=True, spelling=True, analyzer=ana)
    tgrams = fields.NGRAMWORDS
    ext = fields.TEXT(stored=True, sortable=True)
    content = fields.TEXT(spelling=True, analyzer=ana)
    chapter = fields.ID(sortable=True)
    size = fields.NUMERIC(sortable=True)
    lastopened = fields.TEXT(sortable=True)
    lastchanged = fields.TEXT(sortable=True)
    created = fields.TEXT(sortable=True)

ix = index.create_in(indexdir, PydocSchema)
with ix.writer(limitmb=2048) as w:
    t = now()
    for dirpath, dirnames, filenames in os.walk(sourcedir):
        try:
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                size = os.path.getsize(filepath)

                path = dirpath
                chapter = unicode(os.path.basename(dirpath))
                fileName, fileExt = os.path.splitext(filename)
                fileName = unicode(fileName)
                fileExt = unicode(fileExt)
                data = None

                print dirpath, filename
                try:
Example #40
    def __enter__(self):
        self.t = now()
Example #42
def test_now():
    from whoosh.util import now

    t1 = now()
    t2 = now()
    assert t1 <= t2
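
now() here is whoosh's timer helper (whoosh.util.now). Outside whoosh, a stand-in with the non-decreasing behaviour this test checks could look like the sketch below; this is an assumption for illustration, not whoosh's actual definition:

import time

def now():
    # Monotonic clock: successive calls never go backwards.
    return time.perf_counter()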
Example #43
    def results(self, qstring, cat_order, category=None, shortcuts=None,
                limit=None, cat_limit=5):
        from whoosh.util import now

        t = now()
        s = self.searcher
        limit = limit or self.limit
        showall = False

        if shortcuts:
            qstring, showall = self.expand_shortcuts(qstring, shortcuts)

        if category:
            filter = query.Term("category", category)
        else:
            filter = None

        all_q = self.make_query(qstring, "content")

        show_best = (not category and
                     all(isinstance(lq, query.Term) and lq.field() == "content"
                         for lq in all_q.leaves()))
        if show_best:
            best_q = self.make_query(qstring, "bestbet")
            best_r = s.search(best_q, limit=10)
        else:
            best_r = None

        grams_groups = None
        grams_q = self.make_query(qstring, "grams")
        if any(fn == "grams" for fn, _ in grams_q.iter_all_terms()):
            try:
                grams_r = s.search(grams_q, limit=limit, groupedby="category",
                                   filter=filter)
            except query.QueryError:
                pass
            else:
                grams_groups = grams_r.groups()

        all_r = s.search(all_q, limit=limit, groupedby="category",
                         filter=filter)
        all_groups = all_r.groups()

        # OK, this is complicated... we want to present the categories in the
        # order defined in cat_order, BUT we want categories that have grams
        # matches to come before categories that only have content matches
        final_order = []
        if grams_groups:
            # Add categories in grams_groups in the order defined by cat_order
            for cat in cat_order:
                if cat in grams_groups:
                    final_order.append(cat)
            # Add any categories in grams_groups that aren't in cat_order
            final_order.extend(cat for cat in sorted(grams_groups)
                               if cat not in cat_order)

        seen = set(final_order)
        # Add categories in all_groups in the order defined by cat_order, IF
        # they weren't already added in the previous step
        for cat in cat_order:
            if cat in all_groups and cat not in seen:
                final_order.append(cat)
        # Add any categories in all_groups that weren't added in the previous
        # steps
        final_order.extend(cat for cat in sorted(all_groups)
                           if cat not in cat_order and cat not in seen)

        # If there's only one category, there's no point in cutting it off,
        # just show all hits
        showall = showall or len(final_order) == 1

        # For each category, pull out the docnums and get their stored fields
        length = 0
        categories = []
        for cat in final_order:
            # Combine the docnums for this category from grams and all
            docnums = []
            seen = set()
            if grams_groups:
                for docnum in grams_groups.get(cat, ()):
                    docnums.append(docnum)
                    seen.add(docnum)

            for docnum in all_groups.get(cat, ()):
                if docnum not in seen:
                    docnums.append(docnum)
                    seen.add(docnum)

            # If the number of hits is exactly the limit + 1, then there's no
            # point showing a "show more" line instead of that one extra hit,
            # so just increase the limit in that case
            if len(docnums) == cat_limit + 1:
                cutoff = len(docnums)
            else:
                cutoff = cat_limit

            if not showall and len(docnums) > cutoff:
                docnums = docnums[:cutoff]

            length += len(seen)
            docs = [s.stored_fields(docnum) for docnum in docnums]
            categories.append((cat, docs, len(seen)))

        sent = now()
        runtime_ms = (sent - t) * 1000
        return {
            "qstring": qstring,
            "best": best_r,
            "category": category,
            "categories": categories,
            "length": length,
            "limit": limit,
            "hits": all_r,
            "sent": sent,
            "runtime": runtime_ms,
        }
Example #44
def test_bigsort():
    times = 30000
    dirname = "testindex"
    
    df = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=df)
    
    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)
    ix = index.create_in(dirname, schema)
    
    print("Writing...")
    t = now()
    w = ix.writer(limitmb=512)
    for i in xrange(times):
        dt = datetime.fromtimestamp(random.randint(15839593, 1294102139))
        w.add_document(id=text_type(i), date=dt)
    w.commit()
    print("Writing took ", now() - t)
    
    ix = index.open_dir(dirname)
    s = ix.searcher()
    q = query.Wildcard("id", "1?2*")
    
    t = now()
    x = list(df.sortable_values(s.reader(), "date"))
    print(now() - t, len(x))
    
    t = now()
    for y in x:
        p = list(s.postings("date", y).all_ids())
    print(now() - t)
    
    
    
    t = now()
    r = s.search(q, limit=25, sortedby="date", reverse=True)
    print("Search 1 took", now() - t)
    print("len=", r.scored_length())
    
    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)
    
    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)
    
    from heapq import nlargest
    t = now()
    sf = s.stored_fields
    gen = ((sf(n)["date"], n) for n in q.docs(s))
    r = nlargest(25, gen)
    print(now() - t)
Example #49
ana = analysis.StemmingAnalyzer(stoplist=stoplists["en"], maxsize=40)


class EmailSchema(fields.SchemaClass):
    subject = fields.TEXT(stored=True, sortable=True, analyzer=ana)
    sgrams = fields.NGRAMWORDS
    body = fields.TEXT(stored=True, spelling=True, analyzer=ana)
    sender = fields.TEXT(phrase=False, stored=True)
    sent = fields.DATETIME(sortable=True)
    filename = fields.STORED
    key = fields.STORED

ix = index.create_in(indexdir, EmailSchema)
with ix.writer(limitmb=1024) as w:
    t = now()
    parser = email.parser.Parser()
    for filename in os.listdir(sourcedir):
        if not filename.endswith(".txt"):
            continue

        filepath = os.path.join(sourcedir, filename)
        print(filepath)
        mbox = mailbox.mbox(filepath)
        for key, msg in mbox.iteritems():
            sent = None
            if msg["date"]:
                sent_tuple = email.utils.parsedate_tz(msg["date"])
                if sent_tuple and sent_tuple[0] >= 2000:
                    tm = email.utils.mktime_tz(sent_tuple)
                    sent = datetime.datetime.fromtimestamp(tm)