Example #1
0
 def open_file(self, name, mode=u'r'):
     """Open ``name`` via ``self.fs`` and wrap the handle in a StructFile.

     :param name: storage-level file name; converted with ``self._topath``.
     :param mode: mode string passed to ``self.fs.open``. When it is exactly
         ``u'w'`` the raw handle gets a content type and ``force_rev = 0``
         before it is wrapped.
     :return: a ``StructFile`` whose ``is_real`` is ``False`` and whose
         ``fileno`` is ``None``.
     """
     handle = self.fs.open(self._topath(name), mode=mode)
     if mode == u'w':
         # Tag handles opened for writing before wrapping them.
         for attr, value in (('content_type', 'application/structfile'),
                             ('force_rev', 0)):
             setattr(handle, attr, value)

     wrapped = StructFile(handle)
     # Mark the wrapper as not backed by a real OS-level file.
     wrapped.is_real = False
     wrapped.fileno = None
     return wrapped
Example #2
0
 def open_file(self, name, *args, **kwargs):
     """Open a stored file as an in-memory StructFile.

     The stored bytes are loaded into a ``BytesIO``; when the returned file
     is closed, the buffer's contents are written back with ``hset``.

     :param name: name of the file to open.
     :param args: extra positional arguments for ``StructFile``.
     :param kwargs: extra keyword arguments for ``StructFile``.
     :raises NameError: if ``self.file_exists(name)`` is false.
     """
     if not self.file_exists(name):
         raise NameError("No such file %r" % name)

     def onclose_fn(sfile):
         # Persist whatever the buffer holds at close time.
         hash_name = "RedisStore:%s" % self.folder
         self.redis.hset(hash_name, name, sfile.file.getvalue())

     stream = BytesIO(self.__file(name))
     return StructFile(stream, name=name, onclose=onclose_fn, *args, **kwargs)
Example #3
0
    def run(self):
        """Process queued jobs, then publish this worker's results.

        Jobs are two-element items read from ``self.jobqueue``. An item whose
        first element is ``None`` ends the loop and carries the document
        count in its second element; every other item is passed to
        ``self._add_file``. Afterwards the field lengths are written to a
        temporary ``.lengths`` file, the pool dumps its run, and a result
        tuple is put on ``self.resultqueue``.
        """
        jobs = self.jobqueue
        results = self.resultqueue
        pool = TempfilePool(self.schema, limitmb=self.limitmb, dir=self.dir)
        self.subpool = pool

        if self.firstjob:
            self._add_file(*self.firstjob)

        while True:
            first, second = jobs.get()
            if first is None:
                # Sentinel item: the second element is the document count.
                doccount = second
                break
            self._add_file(first, second)

        fd, lengths_path = tempfile.mkstemp(".lengths", dir=pool.dir)
        lengths_file = StructFile(os.fdopen(fd, "wb"))
        pool._write_lengths(lengths_file, doccount)
        pool.dump_run()

        outcome = (
            pool.runs,
            pool.fieldlength_totals(),
            pool.fieldlength_mins(),
            pool.fieldlength_maxes(),
            lengths_path,
        )
        results.put(outcome)
Example #4
0
    def create_file(self, name, excl=False, mode="wb", **kwargs):
        """Create a file in this storage and return it as a StructFile.

        :param name: the name for the new file.
        :param excl: if True, the file is opened through ``os.open`` with
            ``O_CREAT | O_EXCL | O_RDWR`` (plus ``O_BINARY`` where the
            platform defines it) instead of the built-in ``open``.
        :param mode: the mode flags with which to open the file. The default
            is ``"wb"``.
        :param kwargs: passed through to the ``StructFile`` initializer.
        :return: a :class:`whoosh.filedb.structfile.StructFile` instance.
        :raises ReadOnlyError: if this storage is read-only.
        """

        if self.readonly:
            raise ReadOnlyError

        path = self._fpath(name)
        if not excl:
            fileobj = open(path, mode)
        else:
            # O_BINARY only exists on some platforms; fall back to no flag.
            flags = os.O_CREAT | os.O_EXCL | os.O_RDWR
            flags |= getattr(os, "O_BINARY", 0)
            fileobj = os.fdopen(os.open(path, flags), mode)

        return StructFile(fileobj, name=name, **kwargs)
Example #5
0
    def create_file(self, name, **kwargs):
        """Create an in-memory file that is saved with ``hset`` on close.

        :param name: name under which the contents are stored.
        :param kwargs: accepted but not forwarded to ``StructFile``.
        :return: a ``StructFile`` wrapping an empty ``StringIO``.
        """
        def onclose_fn(sfile):
            # Store the buffer's final contents under this file's name.
            hash_name = "RedisStore:%s" % self.folder
            self.redis.hset(hash_name, name, sfile.file.getvalue())

        return StructFile(StringIO(), name=name, onclose=onclose_fn)
Example #6
0
def wordlist_to_graph_file(wordlist, dbfile, fieldname="_", strip=True):
    """Writes a word graph file from an iterable of words.

    >>> # Read one word per line from a sorted word list file and write the
    >>> # word graph to a graph file
    >>> with open("mywords.txt") as words:
    ...     wordlist_to_graph_file(words, "mywords.dawg")

    :param wordlist: an iterable containing the words for the graph. The words
        must be in sorted order.
    :param dbfile: a filename string or file-like object to write the word
        graph to. The graph writer is closed before returning (presumably
        closing the file as well; that happens outside this function).
    :param fieldname: the field name passed to the writer's ``start_field``.
    :param strip: if True, ``strip()`` is called on each word before it is
        inserted.
    """

    from whoosh.filedb.structfile import StructFile

    # Normalize the destination: filename -> raw file -> StructFile.
    if isinstance(dbfile, string_type):
        dbfile = open(dbfile, "wb")
    if not isinstance(dbfile, StructFile):
        dbfile = StructFile(dbfile)

    writer = dawg.GraphWriter(dbfile)
    writer.start_field(fieldname)
    words = (w.strip() for w in wordlist) if strip else wordlist
    for entry in words:
        writer.insert(entry)
    writer.finish_field()
    writer.close()
Example #7
0
 def open_file(self, name, *args, **kwargs):
     """Return the stored contents of ``name`` wrapped in a StructFile.

     :param name: key to look up in ``self.files``.
     :param args: extra positional arguments for ``StructFile``.
     :param kwargs: extra keyword arguments for ``StructFile``.
     :raises NameError: if ``name`` is not in ``self.files``.
     """
     if name in self.files:
         stream = StringIO(self.files[name])
         return StructFile(stream, name=name, *args, **kwargs)
     raise NameError("No such file %r" % name)
Example #8
0
    def finish(self, termswriter, doccount, lengthfile):
        """Stop the worker tasks and merge their partial results.

        One ``(None, doccount)`` sentinel is queued per task, the tasks are
        joined, and one result tuple per task is read from the result queue.
        Field length totals and maxima are folded into this object's running
        values, the per-task length files are combined through a
        ``LengthWriter`` on ``lengthfile`` and then deleted, and the merged
        posting runs are passed to ``termswriter`` before the run files are
        deleted. Finally ``self.cleanup()`` is called.

        Returns immediately if there are no tasks.

        :param termswriter: object whose ``add_iter`` receives the merged
            posting iterator and a length lookup callable.
        :param doccount: document count sent to the workers and passed to the
            length reader and writer.
        :param lengthfile: destination passed to ``LengthWriter``.
        """
        _fieldlength_totals = self._fieldlength_totals
        if not self.tasks:
            return

        jobqueue = self.jobqueue
        rqueue = self.resultqueue

        # One sentinel per task tells each worker to finish up.
        for task in self.tasks:
            jobqueue.put((None, doccount))

        # NOTE(review): if these are multiprocessing processes and queues,
        # joining before the result queue is drained can block when a worker
        # still has buffered results to flush -- confirm.
        for task in self.tasks:
            task.join()

        runs = []
        lenfilenames = []
        for task in self.tasks:
            taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
            runs.extend(taskruns)
            lenfilenames.append(lenfilename)
            for fieldnum, total in flentotals.iteritems():
                _fieldlength_totals[fieldnum] += total
            for fieldnum, length in flenmaxes.iteritems():
                if length > self._fieldlength_maxes.get(fieldnum, 0):
                    self._fieldlength_maxes[fieldnum] = length

        jobqueue.close()
        rqueue.close()

        # Combine the per-task length files into the final length file.
        lw = LengthWriter(lengthfile, doccount)
        for lenfilename in lenfilenames:
            sublengths = LengthReader(StructFile(open(lenfilename, "rb")),
                                      doccount)
            lw.add_all(sublengths)
            os.remove(lenfilename)
        lw.close()
        lengths = lw.reader()

        # Merge all runs into one posting stream and write it out.
        iterator = imerge(
            [read_run(runname, count) for runname, count in runs])
        termswriter.add_iter(iterator, lengths.get)
        for runname, count in runs:
            os.remove(runname)

        self.cleanup()
Example #9
0
    def create_file(self, name, excl=False, mode='wb', **kwargs):
        """Create a bucket-backed file and return it as a StructFile.

        :param name: name of the object to create.
        :param excl: ignored; the original comment notes that the SAE bucket
            does not support exclusive mode.
        :param mode: mode string passed to ``SAEFile``.
        :param kwargs: passed through to the ``StructFile`` initializer.
        :raises ReadOnlyError: if this storage is read-only.
        """
        if self.readonly:
            raise ReadOnlyError

        backing = SAEFile(name, mode, self.bucket)
        return StructFile(backing, name=name, **kwargs)
Example #10
0
    def open_file(self, name, **kwargs):
        """Open an existing file in this storage for binary reading.

        :param name: the name of the file to open.
        :param kwargs: additional keyword arguments are passed through to the
            :class:`~whoosh.filedb.structfile.StructFile` initializer.
        :return: a :class:`whoosh.filedb.structfile.StructFile` instance.
        """

        raw = open(self._fpath(name), "rb")
        return StructFile(raw, name=name, **kwargs)
Example #11
0
    def open_file(self, name, **kwargs):
        """Load an object from the bucket into an in-memory StructFile.

        When the returned file is closed, the buffer's contents are written
        back to the bucket under the same path.

        :param name: name of the file to open.
        :param kwargs: accepted but not forwarded to ``StructFile``.
        :raises NameError: if ``stat_object`` returns ``None`` for the path.
        """
        found = self._bucket.stat_object(self._fpath(name)) is not None
        if not found:
            raise NameError(name)
        payload = self._bucket.get_object_contents(self._fpath(name))

        def onclose_fn(sfile):
            # Write the (possibly modified) buffer back on close.
            self._bucket.put_object(self._fpath(name), sfile.file.getvalue())

        stream = BytesIO(payload)
        return StructFile(stream, name=name, onclose=onclose_fn)
Example #12
0
 def open_file(self, name, *args, **kwargs):
     """Open an existing file for binary reading and wrap it in a StructFile.

     :param name: name of the file; resolved with ``self._fpath``.
     :param args: extra positional arguments for ``StructFile``.
     :param kwargs: extra keyword arguments for ``StructFile``.
     :return: the ``StructFile`` wrapping the opened file.
     :raises IOError: re-raised after printing the requested name and the
         result of ``self.list()``.
     """
     try:
         fileobj = open(self._fpath(name), "rb")
         f = StructFile(fileobj, name=name, *args, **kwargs)
     except IOError:
         # Single-argument print(...) parses the same way on Python 2 and 3;
         # the former print statement was a syntax error on Python 3.
         print("Tried to open %r, files=%r" % (name, self.list()))
         raise
     return f
Example #13
0
 def open_file(self, name, *args, **kwargs):
     """Return a file object for the sub-range that holds ``name``.

     The offset and length come from ``self.range(name)``. Three backends
     are tried in order: a buffer view over ``self._source``, the
     ``subset`` method of ``self._file`` if it has one, and finally a
     ``SubFile`` wrapped in a ``StructFile``.

     :param name: name of the contained file.
     :param args: accepted but unused.
     :param kwargs: accepted but unused.
     """
     offset, length = self.range(name)

     if self._source:
         # Create a memoryview/buffer from the mmap
         view = memoryview_(self._source, offset, length)
         return BufferFile(view, name=name)

     if hasattr(self._file, "subset"):
         return self._file.subset(offset, length, name=name)

     return StructFile(SubFile(self._file, offset, length), name=name)
Example #14
0
    def open_file(self, fname, *args, **kwargs):
        """Load a stored value into an in-memory StructFile.

        When the returned file is closed, the buffer's contents are stored
        back under the same key.

        :param fname: key of the file in ``self.kvdb_coll``.
        :param args: accepted but unused.
        :param kwargs: accepted but unused.
        :raises NameError: if the key does not exist.
        """
        if not self.kvdb_coll.exists(fname):
            raise NameError(fname)

        def onclose_fn(sfile):
            # Save the buffer's final contents back to the collection.
            self.kvdb_coll.set_value(fname, sfile.file.getvalue())

        stream = StringIO(self.kvdb_coll.get_value(fname))
        return StructFile(stream, name=fname, onclose=onclose_fn)
Example #15
0
 def finish(self, doccount, lengthfile, termtable, postingwriter):
     """Stop the worker tasks and merge their partial results.

     ``self.procs`` sentinels of the form ``(-1, doccount)`` are queued, the
     tasks are joined, and one result tuple per task is read from the
     results queue. Field length totals and maxima are folded into this
     object's running values, the per-task length files are combined
     through a ``LengthWriter`` on ``lengthfile`` and then deleted, and the
     merged posting runs are written with ``write_postings`` before the run
     files are deleted. Finally ``self.cleanup()`` is called.

     Returns immediately if there are no tasks.

     :param doccount: document count sent to the workers and passed to the
         length reader and writer.
     :param lengthfile: destination passed to ``LengthWriter``.
     :param termtable: passed through to ``write_postings``.
     :param postingwriter: passed through to ``write_postings``.
     """
     _fieldlength_totals = self._fieldlength_totals
     if not self.tasks:
         return

     pqueue = self.postingqueue
     rqueue = self.resultsqueue

     # One sentinel per worker process tells each one to finish up.
     for _ in xrange(self.procs):
         pqueue.put((-1, doccount))

     # The ``t = now()`` checkpoints below belong to commented-out timing
     # prints; they are kept because ``now`` is defined outside this block.
     #print "Joining..."
     t = now()
     for task in self.tasks:
         task.join()
     #print "Join:", now() - t

     #print "Getting results..."
     t = now()
     runs = []
     lenfilenames = []
     for task in self.tasks:
         taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
         runs.extend(taskruns)
         lenfilenames.append(lenfilename)
         for fieldnum, total in flentotals.iteritems():
             _fieldlength_totals[fieldnum] += total
         for fieldnum, length in flenmaxes.iteritems():
             if length > self._fieldlength_maxes.get(fieldnum, 0):
                 self._fieldlength_maxes[fieldnum] = length
     #print "Results:", now() - t

     # Combine the per-task length files into the final length file.
     #print "Writing lengths..."
     t = now()
     lw = LengthWriter(lengthfile, doccount)
     for lenfilename in lenfilenames:
         sublengths = LengthReader(StructFile(open(lenfilename, "rb")), doccount)
         lw.add_all(sublengths)
         os.remove(lenfilename)
     lw.close()
     lengths = lw.reader()
     #print "Lengths:", now() - t

     # Merge all runs into one posting stream and write it out.
     t = now()
     iterator = imerge([read_run(runname, count) for runname, count in runs])
     write_postings(self.schema, termtable, lengths, postingwriter, iterator)
     for runname, count in runs:
         os.remove(runname)
     #print "Merge:", now() - t

     self.cleanup()
Example #16
0
    def open_file(self, name, *args, **kwargs):
        """Return a StructFile over the sub-range that holds ``name``.

        The offset and length are read from ``self.dir[name]``.

        :param name: name of the contained file.
        :param args: accepted but unused.
        :param kwargs: accepted but unused.
        """
        entry = self.dir[name]
        start, size = entry["offset"], entry["length"]

        if not self.source:
            # If mmap is not available, use the slower sub-file implementation
            backing = SubFile(self.file, start, size)
        else:
            # Create a memoryview/buffer from the mmap
            backing = BytesIO(memoryview_(self.source, start, size))
        return StructFile(backing, name=name)
Example #17
0
    def create_file(self, name, excl=False, mode="wb", **kwargs):
        """Create a file in this storage and return it as a StructFile.

        :param name: the name for the new file.
        :param excl: if True, open through ``os.open`` with
            ``O_CREAT | O_EXCL | O_RDWR`` (plus ``O_BINARY`` where defined)
            rather than the built-in ``open``.
        :param mode: mode string used for the resulting file object.
        :param kwargs: passed through to the ``StructFile`` initializer.
        :raises ReadOnlyError: if this storage is read-only.
        """
        if self.readonly:
            raise ReadOnlyError

        target = self._fpath(name)
        if excl:
            open_flags = os.O_CREAT | os.O_EXCL | os.O_RDWR
            # O_BINARY is only defined on some platforms.
            open_flags |= getattr(os, "O_BINARY", 0)
            descriptor = os.open(target, open_flags)
            raw = os.fdopen(descriptor, mode)
        else:
            raw = open(target, mode)

        return StructFile(raw, name=name, **kwargs)
Example #18
0
    def _flush_run(self):
        # Called when the memory buffer (of size self.limit) fills up.
        # Sorts the buffer and writes the current buffer to a "run" on disk.

        if self.size > 0:
            tempfd, tempname = tempfile.mkstemp(".run")
            runfile = StructFile(os.fdopen(tempfd, "w+b"))

            self.postings.sort()
            for p in self.postings:
                runfile.write_string2(p)
            runfile.flush()
            runfile.seek(0)

            self.runs.append((runfile, self.count))
            #print "Flushed run:", self.runs

            self.postings = []
            self.size = 0
            self.count = 0
Example #19
0
 def _flush_run(self):
     # Called when the memory buffer (of size self.limit) fills up.
     # Sorts the buffer and writes the current buffer to a "run" on disk.
     
     if self.size > 0:
         tempfd, tempname = tempfile.mkstemp(".run")
         runfile = StructFile(os.fdopen(tempfd, "w+b"))
         
         self.postings.sort()
         for p in self.postings:
             runfile.write_string2(p)
         runfile.flush()
         runfile.seek(0)
         
         self.runs.append((runfile, self.count))
         #print "Flushed run:", self.runs
         
         self.postings = []
         self.size = 0
         self.count = 0
Example #20
0
 def run(self):
     """Apply queued commands to a private pool, then publish the results.

     Items read from ``self.postingqueue`` are ``(code, args)`` pairs:
     ``-1`` ends the loop (``args`` is then the document count), ``0``
     calls ``add_content``, ``1`` calls ``add_posting`` and ``2`` calls
     ``add_field_length``, each with ``*args``. Other codes are ignored.
     Afterwards the lengths are written to a ``.lengths`` file, the pool
     dumps its run, and a result tuple is put on ``self.resultqueue``.
     """
     commands = self.postingqueue
     results = self.resultqueue

     pool = TempfilePool(self.schema, limitmb=self.limitmb, dir=self.dir)

     while True:
         code, args = commands.get()

         if code == -1:
             # Sentinel: ``args`` is the document count.
             doccount = args
             break

         if code == 0:
             pool.add_content(*args)
         elif code == 1:
             pool.add_posting(*args)
         elif code == 2:
             pool.add_field_length(*args)

     lengths_name = pool.unique_name(".lengths")
     lengths_file = StructFile(open(lengths_name, "wb"))
     pool._write_lengths(lengths_file, doccount)
     pool.dump_run()

     outcome = (
         pool.runs,
         pool.fieldlength_totals(),
         pool.fieldlength_maxes(),
         lengths_name,
     )
     results.put(outcome)
Example #21
0
def wordlist_to_graph_file(wordlist, dbfile, strip=True):
    """Writes a word graph file from an iterable of words.

    >>> # Read one word per line from a sorted word list file and write the
    >>> # word graph to a graph file
    >>> with open("mywords.txt") as words:
    ...     wordlist_to_graph_file(words, "mywords.dawg")

    :param wordlist: an iterable containing the words for the graph. The words
        must be in sorted order.
    :param dbfile: a filename string or file-like object to write the word
        graph to. The earlier documentation states that a file-like object
        is closed when the function completes; any closing would happen
        inside ``to_file``, which is not part of this function.
    :param strip: passed through to ``GraphCorrector.from_word_list``.
    """

    from whoosh.filedb.structfile import StructFile

    # Build the graph first, so the output is only opened once it exists.
    graph = GraphCorrector.from_word_list(wordlist, strip=strip)

    target = dbfile
    if isinstance(target, string_type):
        target = open(target, "wb")
    if not isinstance(target, StructFile):
        target = StructFile(target)

    graph.to_file(target)
Example #22
0
    def create_file(self, name, **kwargs):
        """Create an in-memory file that is uploaded to the bucket on close.

        :param name: name of the file; resolved with ``self._fpath`` at
            close time.
        :param kwargs: accepted but not forwarded to ``StructFile``.
        :return: a ``StructFile`` wrapping an empty ``BytesIO``.
        """
        def onclose_fn(sfile):
            # Upload the buffer's final contents.
            self._bucket.put_object(self._fpath(name), sfile.file.getvalue())

        return StructFile(BytesIO(), name=name, onclose=onclose_fn)
Example #23
0
    def _write_node(self, uncnode):
        """Serialize one node's arcs to ``self.dbfile`` and return its offset.

        Each arc of ``uncnode`` is first encoded into an in-memory buffer,
        which is then appended to ``self.dbfile``. If every arc encoded to
        the same number of bytes, a header made of the byte 255, the per-arc
        size and the arc count is written before the arcs.

        Updates ``self.node_count``, ``self.arc_count`` and (when the header
        is written) ``self.fixed_count``.

        :param uncnode: the node to write; its ``arcs`` and ``accept``
            attributes are read here.
        :return: the ``dbfile`` position taken before anything was written
            for this node, or ``None`` if the node has no arcs and its
            ``accept`` attribute is true.
        :raises Exception: if the node has no arcs and ``accept`` is false.
        """
        vtype = self.vtype
        dbfile = self.dbfile
        arcs = uncnode.arcs
        numarcs = len(arcs)

        # A node without arcs is never written.
        if not numarcs:
            if uncnode.accept:
                return None
            else:
                # What does it mean for an arc to stop but not be accepted?
                raise Exception
        self.node_count += 1

        # Arcs are staged in memory so their sizes can be compared before
        # anything is written to the real file.
        buf = StructFile(BytesIO())
        nodestart = dbfile.tell()
        #self.count += 1
        #self.arccount += numarcs

        # fixedsize: -1 = no arc seen yet, 0 = arc sizes differ,
        # >0 = the size shared by every arc so far.
        fixedsize = -1
        arcstart = buf.tell()
        for i, arc in enumerate(arcs):
            self.arc_count += 1
            target = arc.target
            label = arc.label

            # Build the flags byte describing which optional parts follow.
            flags = 0
            if len(label) > 1:
                flags += MULTIBYTE_LABEL
            if i == numarcs - 1:
                flags += ARC_LAST
            if arc.accept:
                flags += ARC_ACCEPT
            if target is None:
                flags += ARC_STOP
            if arc.value is not None:
                flags += ARC_HAS_VAL
            if arc.acceptval is not None:
                flags += ARC_HAS_ACCEPT_VAL

            # Arc layout: flags, [label length], label, [target], [value],
            # [accept value].
            buf.write(pack_byte(flags))
            if len(label) > 1:
                buf.write(varint(len(label)))
            buf.write(label)
            if target is not None:
                buf.write(pack_uint(target))
            if arc.value is not None:
                vtype.write(buf, arc.value)
            if arc.acceptval is not None:
                vtype.write(buf, arc.acceptval)

            # Compare this arc's encoded size with the previous ones.
            here = buf.tell()
            thissize = here - arcstart
            arcstart = here
            if fixedsize == -1:
                fixedsize = thissize
            elif fixedsize > 0 and thissize != fixedsize:
                fixedsize = 0

        if fixedsize > 0:
            # Write a fake arc containing the fixed size and number of arcs
            dbfile.write_byte(255)  # FIXED_SIZE
            dbfile.write_int(fixedsize)
            dbfile.write_int(numarcs)
            self.fixed_count += 1
        # Append the staged arcs after the optional header.
        dbfile.write(buf.file.getvalue())

        return nodestart
Example #24
0
 def open_file(self, name, *args, **kwargs):
     """Open ``name`` for binary reading and wrap it in a StructFile.

     :param name: name of the file; resolved with ``self._fpath``. It is
         also stored on the result as ``_name``.
     :param args: extra positional arguments for ``StructFile``.
     :param kwargs: extra keyword arguments for ``StructFile``.
     """
     raw = open(self._fpath(name), "rb")
     wrapped = StructFile(raw, *args, **kwargs)
     wrapped._name = name
     return wrapped
Example #25
0
 def create_file(self, name):
     """Create ``name`` for binary writing and wrap it in a StructFile.

     :param name: name of the file; resolved with ``self._fpath``.
     :return: a ``StructFile`` created with ``mapped=self.mapped``.
     """
     raw = open(self._fpath(name), "wb")
     return StructFile(raw, name=name, mapped=self.mapped)
Example #26
0
 def open_file(self, name, *args, **kwargs):
     """Return the stored contents of ``name`` wrapped in a StructFile.

     :param name: key to look up in ``self.files``.
     :param args: extra positional arguments for ``StructFile``.
     :param kwargs: extra keyword arguments for ``StructFile``.
     :raises NameError: if ``name`` is not in ``self.files``.
     """
     if name in self.files:
         stream = StringIO(self.files[name])
         return StructFile(stream, *args, **kwargs)
     raise NameError
Example #27
0
 def open_file(self, name, *args, **kwargs):
     """Load ``name`` through ``DatastoreFile`` and wrap it in a StructFile.

     :param name: name passed to ``DatastoreFile.loadfile``.
     :param args: accepted but unused.
     :param kwargs: accepted but unused.
     """
     loaded = DatastoreFile.loadfile(name)
     return StructFile(loaded)
Example #28
0
 def create_file(self, name):
     """Create a sub-stream for ``name`` and return it as a StructFile.

     The new stream is also recorded in ``self._streams`` under ``name``.

     :param name: key under which the stream is registered.
     """
     stream = self._streams[name] = self.SubStream(self._temp,
                                                   self._buffersize)
     return StructFile(stream)
Example #29
0
 def open_file(self, name, *args, **kwargs):
     """Return the stored bytes of ``name`` wrapped in a StructFile.

     :param name: key to look up in ``self.files``.
     :param args: extra positional arguments for ``StructFile``.
     :param kwargs: extra keyword arguments for ``StructFile``.
     :raises NameError: if ``name`` is not in ``self.files``.
     """
     if name in self.files:
         stream = BytesIO(self.files[name])
         return StructFile(stream, name=name, *args, **kwargs)
     raise NameError(name)
Example #30
0
 def open_file(self, name, *args, **kwargs):
     """Open ``name`` for binary reading and wrap it in a StructFile.

     :param name: name of the file; resolved with ``self._fpath`` and stored
         on the result as ``_name``.
     :param args: extra positional arguments for ``StructFile``.
     :param kwargs: extra keyword arguments for ``StructFile``.
     """
     source = open(self._fpath(name), "rb")
     sfile = StructFile(source, *args, **kwargs)
     sfile._name = name
     return sfile
 def create_file(self, name, excl=False, mode="w+b", **kwargs):
     """Create an in-memory file with an on-close hook for ``name``.

     The hook is whatever ``self._encrypt_index_on_close(name)`` returns.

     :param name: name of the file.
     :param excl: accepted but unused.
     :param mode: accepted but unused.
     :param kwargs: accepted but not forwarded to ``StructFile``.
     :return: a ``StructFile`` over an empty ``BytesIO`` with ``is_real``
         set to ``False``.
     """
     memory = io.BytesIO()
     close_hook = self._encrypt_index_on_close(name)
     sfile = StructFile(memory, name=name, onclose=close_hook)
     sfile.is_real = False
     return sfile
Example #32
0
    def create_file(self, name, **kwargs):
        """Create an in-memory file that is saved to ``self.files`` on close.

        :param name: key under which the contents are stored.
        :param kwargs: accepted but not forwarded to ``StructFile``.
        :return: a ``StructFile`` wrapping an empty ``BytesIO``.
        """
        def onclose_fn(sfile):
            # Keep the buffer's final contents under this file's name.
            contents = sfile.file.getvalue()
            self.files[name] = contents

        return StructFile(BytesIO(), name=name, onclose=onclose_fn)
Example #33
0
    def create_file(self, name):
        """Create an in-memory file that is saved to ``self.files`` on close.

        :param name: key under which the contents are stored.
        :return: a ``StructFile`` wrapping an empty ``StringIO``.
        """
        def onclose_fn(sfile):
            # Keep the buffer's final contents under this file's name.
            contents = sfile.file.getvalue()
            self.files[name] = contents

        return StructFile(StringIO(), name=name, onclose=onclose_fn)