def open_file(self, name, mode=u'r'):
    path = self._topath(name)
    uf = self.fs.open(path, mode=mode)
    if mode == u'w':
        uf.content_type = 'application/structfile'
        uf.force_rev = 0
    sf = StructFile(uf)
    sf.is_real = False
    sf.fileno = None
    return sf
def open_file(self, name, *args, **kwargs):
    if not self.file_exists(name):
        raise NameError("No such file %r" % name)

    def onclose_fn(sfile):
        self.redis.hset("RedisStore:%s" % self.folder, name,
                        sfile.file.getvalue())

    #print "Opened file %s %s " % (name, self.__file(name))
    return StructFile(BytesIO(self.__file(name)), name=name,
                      onclose=onclose_fn, *args, **kwargs)
def run(self):
    jobqueue = self.jobqueue
    rqueue = self.resultqueue
    subpool = self.subpool = TempfilePool(self.schema, limitmb=self.limitmb,
                                          dir=self.dir)

    if self.firstjob:
        self._add_file(*self.firstjob)

    while True:
        arg1, arg2 = jobqueue.get()
        if arg1 is None:
            doccount = arg2
            break
        else:
            self._add_file(arg1, arg2)

    lenfd, lenfilename = tempfile.mkstemp(".lengths", dir=subpool.dir)
    lenf = os.fdopen(lenfd, "wb")
    subpool._write_lengths(StructFile(lenf), doccount)
    subpool.dump_run()
    rqueue.put((subpool.runs, subpool.fieldlength_totals(),
                subpool.fieldlength_mins(), subpool.fieldlength_maxes(),
                lenfilename))
def create_file(self, name, excl=False, mode="wb", **kwargs): """Creates a file with the given name in this storage. :param name: the name for the new file. :param excl: if True, try to open the file in "exclusive" mode. :param mode: the mode flags with which to open the file. The default is ``"wb"``. :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ if self.readonly: raise ReadOnlyError path = self._fpath(name) if excl: flags = os.O_CREAT | os.O_EXCL | os.O_RDWR if hasattr(os, "O_BINARY"): flags |= os.O_BINARY fd = os.open(path, flags) fileobj = os.fdopen(fd, mode) else: fileobj = open(path, mode) f = StructFile(fileobj, name=name, **kwargs) return f
def create_file(self, name, **kwargs):
    def onclose_fn(sfile):
        self.redis.hset("RedisStore:%s" % self.folder, name,
                        sfile.file.getvalue())

    f = StructFile(StringIO(), name=name, onclose=onclose_fn)
    return f
def wordlist_to_graph_file(wordlist, dbfile, fieldname="_", strip=True):
    """Writes a word graph file from a list of words.

    >>> # Open a word list file with one word on each line, and write the
    >>> # word graph to a graph file
    >>> wordlist_to_graph_file(open("mywords.txt"), "mywords.dawg")

    :param wordlist: an iterable containing the words for the graph. The
        words must be in sorted order.
    :param dbfile: a filename string or file-like object to write the word
        graph to. This function will close the file.
    """

    from whoosh.filedb.structfile import StructFile

    if isinstance(dbfile, string_type):
        dbfile = open(dbfile, "wb")
    if not isinstance(dbfile, StructFile):
        dbfile = StructFile(dbfile)

    gw = dawg.GraphWriter(dbfile)
    gw.start_field(fieldname)
    for word in wordlist:
        if strip:
            word = word.strip()
        gw.insert(word)
    gw.finish_field()
    gw.close()
def open_file(self, name, *args, **kwargs): if name not in self.files: raise NameError("No such file %r" % name) return StructFile(StringIO(self.files[name]), name=name, *args, **kwargs)
def finish(self, termswriter, doccount, lengthfile):
    _fieldlength_totals = self._fieldlength_totals
    if not self.tasks:
        return

    jobqueue = self.jobqueue
    rqueue = self.resultqueue

    for task in self.tasks:
        jobqueue.put((None, doccount))
    for task in self.tasks:
        task.join()

    runs = []
    lenfilenames = []
    for task in self.tasks:
        taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
        runs.extend(taskruns)
        lenfilenames.append(lenfilename)
        for fieldnum, total in flentotals.iteritems():
            _fieldlength_totals[fieldnum] += total
        for fieldnum, length in flenmaxes.iteritems():
            if length > self._fieldlength_maxes.get(fieldnum, 0):
                self._fieldlength_maxes[fieldnum] = length

    jobqueue.close()
    rqueue.close()

    lw = LengthWriter(lengthfile, doccount)
    for lenfilename in lenfilenames:
        sublengths = LengthReader(StructFile(open(lenfilename, "rb")),
                                  doccount)
        lw.add_all(sublengths)
        os.remove(lenfilename)
    lw.close()
    lengths = lw.reader()

    # if len(runs) >= self.procs * 2:
    #     pool = Pool(self.procs)
    #     tempname = lambda: tempfile.mktemp(suffix=".run", dir=self.dir)
    #     while len(runs) >= self.procs * 2:
    #         runs2 = [(runs[i:i + 4], tempname())
    #                  for i in xrange(0, len(runs), 4)]
    #         if len(runs) % 4:
    #             last = runs2.pop()[0]
    #             runs2[-1][0].extend(last)
    #         runs = pool.map(merge_runs, runs2)
    #     pool.close()

    iterator = imerge([read_run(runname, count)
                       for runname, count in runs])
    total = sum(count for runname, count in runs)
    termswriter.add_iter(iterator, lengths.get)
    for runname, count in runs:
        os.remove(runname)

    self.cleanup()
def create_file(self, name, excl=False, mode='wb', **kwargs):
    if self.readonly:
        raise ReadOnlyError

    # The SAE storage bucket does not support "exclusive" mode, so the
    # `excl` argument is ignored here.
    # fileobj = self.bucket.get_object(name)
    fileobj = SAEFile(name, mode, self.bucket)
    f = StructFile(fileobj, name=name, **kwargs)
    return f
def open_file(self, name, **kwargs): """Opens an existing file in this storage. :param name: the name of the file to open. :param kwargs: additional keyword arguments are passed through to the :class:`~whoosh.filedb.structfile.StructFile` initializer. :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ return StructFile(open(self._fpath(name), "rb"), name=name, **kwargs)
def open_file(self, name, **kwargs):
    if self._bucket.stat_object(self._fpath(name)) is None:
        raise NameError(name)

    content = self._bucket.get_object_contents(self._fpath(name))

    def onclose_fn(sfile):
        self._bucket.put_object(self._fpath(name), sfile.file.getvalue())

    return StructFile(BytesIO(content), name=name, onclose=onclose_fn)
def open_file(self, name, *args, **kwargs):
    try:
        f = StructFile(open(self._fpath(name), "rb"), name=name,
                       *args, **kwargs)
    except IOError:
        print "Tried to open %r, files=%r" % (name, self.list())
        raise
    return f
def open_file(self, name, *args, **kwargs):
    offset, length = self.range(name)
    if self._source:
        # Create a memoryview/buffer from the mmap
        buf = memoryview_(self._source, offset, length)
        f = BufferFile(buf, name=name)
    elif hasattr(self._file, "subset"):
        f = self._file.subset(offset, length, name=name)
    else:
        f = StructFile(SubFile(self._file, offset, length), name=name)
    return f
def open_file(self, fname, *args, **kwargs):
    if not self.kvdb_coll.exists(fname):
        raise NameError(fname)

    content = self.kvdb_coll.get_value(fname)

    def onclose_fn(sfile):
        value = sfile.file.getvalue()
        self.kvdb_coll.set_value(fname, value)

    return StructFile(StringIO(content), name=fname, onclose=onclose_fn)
def finish(self, doccount, lengthfile, termtable, postingwriter):
    _fieldlength_totals = self._fieldlength_totals
    if not self.tasks:
        return

    pqueue = self.postingqueue
    rqueue = self.resultsqueue

    for _ in xrange(self.procs):
        pqueue.put((-1, doccount))

    #print "Joining..."
    t = now()
    for task in self.tasks:
        task.join()
    #print "Join:", now() - t

    #print "Getting results..."
    t = now()
    runs = []
    lenfilenames = []
    for task in self.tasks:
        taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
        runs.extend(taskruns)
        lenfilenames.append(lenfilename)
        for fieldnum, total in flentotals.iteritems():
            _fieldlength_totals[fieldnum] += total
        for fieldnum, length in flenmaxes.iteritems():
            if length > self._fieldlength_maxes.get(fieldnum, 0):
                self._fieldlength_maxes[fieldnum] = length
    #print "Results:", now() - t

    #print "Writing lengths..."
    t = now()
    lw = LengthWriter(lengthfile, doccount)
    for lenfilename in lenfilenames:
        sublengths = LengthReader(StructFile(open(lenfilename, "rb")),
                                  doccount)
        lw.add_all(sublengths)
        os.remove(lenfilename)
    lw.close()
    lengths = lw.reader()
    #print "Lengths:", now() - t

    t = now()
    iterator = imerge([read_run(runname, count)
                       for runname, count in runs])
    total = sum(count for runname, count in runs)
    write_postings(self.schema, termtable, lengths, postingwriter, iterator)
    for runname, count in runs:
        os.remove(runname)
    #print "Merge:", now() - t

    self.cleanup()
def open_file(self, name, *args, **kwargs):
    info = self.dir[name]
    offset = info["offset"]
    length = info["length"]
    if self.source:
        # Create a memoryview/buffer from the mmap
        buf = memoryview_(self.source, offset, length)
        f = BytesIO(buf)
    else:
        # If mmap is not available, use the slower sub-file implementation
        f = SubFile(self.file, offset, length)
    return StructFile(f, name=name)
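# A minimal sketch of the sub-file idea used above: a read-only file-like
# object exposing an (offset, length) window of an underlying file. The
# class name and API are illustrative, not Whoosh's actual SubFile.
class FileWindow(object):
    def __init__(self, parent, offset, length):
        self._file = parent
        self._offset = offset
        self._length = length
        self._pos = 0

    def read(self, size=None):
        # Clamp reads to the window so callers can't see past its end
        remaining = self._length - self._pos
        if size is None or size > remaining:
            size = remaining
        self._file.seek(self._offset + self._pos)
        data = self._file.read(size)
        self._pos += len(data)
        return data

    def seek(self, pos):
        self._pos = pos

    def tell(self):
        return self._pos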
def create_file(self, name, excl=False, mode="wb", **kwargs): if self.readonly: raise ReadOnlyError path = self._fpath(name) if excl: flags = os.O_CREAT | os.O_EXCL | os.O_RDWR if hasattr(os, "O_BINARY"): flags |= os.O_BINARY fd = os.open(path, flags) fileobj = os.fdopen(fd, mode) else: fileobj = open(path, mode) f = StructFile(fileobj, name=name, **kwargs) return f
def _flush_run(self):
    # Called when the memory buffer (of size self.limit) fills up.
    # Sorts the buffer and writes the current buffer to a "run" on disk.
    if self.size > 0:
        tempfd, tempname = tempfile.mkstemp(".run")
        runfile = StructFile(os.fdopen(tempfd, "w+b"))

        self.postings.sort()
        for p in self.postings:
            runfile.write_string2(p)
        runfile.flush()
        runfile.seek(0)

        self.runs.append((runfile, self.count))
        #print "Flushed run:", self.runs

        self.postings = []
        self.size = 0
        self.count = 0
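# Standalone sketch of the external-sort pattern _flush_run() feeds into:
# each run is sorted in memory and flushed to disk, then all runs are
# merged lazily. Plain lists stand in for the run files here; the real
# code streams pickled postings from disk instead.
import heapq

def merge_runs(runs):
    # heapq.merge yields items from the pre-sorted runs in global order
    # without loading everything into memory at once.
    return heapq.merge(*runs)

run_a = ["apple", "cherry", "melon"]
run_b = ["banana", "date", "fig"]
print(list(merge_runs([run_a, run_b])))
# -> ['apple', 'banana', 'cherry', 'date', 'fig', 'melon']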
def run(self):
    pqueue = self.postingqueue
    rqueue = self.resultqueue
    subpool = TempfilePool(self.schema, limitmb=self.limitmb, dir=self.dir)

    while True:
        code, args = pqueue.get()
        if code == -1:
            doccount = args
            break
        if code == 0:
            subpool.add_content(*args)
        elif code == 1:
            subpool.add_posting(*args)
        elif code == 2:
            subpool.add_field_length(*args)

    lenfilename = subpool.unique_name(".lengths")
    subpool._write_lengths(StructFile(open(lenfilename, "wb")), doccount)
    subpool.dump_run()
    rqueue.put((subpool.runs, subpool.fieldlength_totals(),
                subpool.fieldlength_maxes(), lenfilename))
def wordlist_to_graph_file(wordlist, dbfile, strip=True):
    """Writes a word graph file from a list of words.

    >>> # Open a word list file with one word on each line, and write the
    >>> # word graph to a graph file
    >>> wordlist_to_graph_file(open("mywords.txt"), "mywords.dawg")

    :param wordlist: an iterable containing the words for the graph. The
        words must be in sorted order.
    :param dbfile: a filename string or file-like object to write the word
        graph to. If you pass a file-like object, it will be closed when
        the function completes.
    """

    from whoosh.filedb.structfile import StructFile

    g = GraphCorrector.from_word_list(wordlist, strip=strip)

    if isinstance(dbfile, string_type):
        dbfile = open(dbfile, "wb")
    if not isinstance(dbfile, StructFile):
        dbfile = StructFile(dbfile)
    g.to_file(dbfile)
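# Usage sketch for wordlist_to_graph_file() above, using an in-memory
# word list rather than a file. The words must be in sorted order; the
# output filename is just an example.
words = sorted(["apple", "banana", "cherry", "date"])
wordlist_to_graph_file(words, "mywords.dawg")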
def create_file(self, name, **kwargs):
    def onclose_fn(sfile):
        self._bucket.put_object(self._fpath(name), sfile.file.getvalue())

    f = StructFile(BytesIO(), name=name, onclose=onclose_fn)
    return f
def _write_node(self, uncnode):
    vtype = self.vtype
    dbfile = self.dbfile
    arcs = uncnode.arcs
    numarcs = len(arcs)
    if not numarcs:
        if uncnode.accept:
            return None
        else:
            # What does it mean for an arc to stop but not be accepted?
            raise Exception("Node with no arcs must be an accept node")
    self.node_count += 1

    buf = StructFile(BytesIO())
    nodestart = dbfile.tell()
    #self.count += 1
    #self.arccount += numarcs

    fixedsize = -1
    arcstart = buf.tell()
    for i, arc in enumerate(arcs):
        self.arc_count += 1

        target = arc.target
        label = arc.label

        flags = 0
        if len(label) > 1:
            flags += MULTIBYTE_LABEL
        if i == numarcs - 1:
            flags += ARC_LAST
        if arc.accept:
            flags += ARC_ACCEPT
        if target is None:
            flags += ARC_STOP
        if arc.value is not None:
            flags += ARC_HAS_VAL
        if arc.acceptval is not None:
            flags += ARC_HAS_ACCEPT_VAL

        buf.write(pack_byte(flags))
        if len(label) > 1:
            buf.write(varint(len(label)))
        buf.write(label)
        if target is not None:
            buf.write(pack_uint(target))
        if arc.value is not None:
            vtype.write(buf, arc.value)
        if arc.acceptval is not None:
            vtype.write(buf, arc.acceptval)

        here = buf.tell()
        thissize = here - arcstart
        arcstart = here
        if fixedsize == -1:
            fixedsize = thissize
        elif fixedsize > 0 and thissize != fixedsize:
            fixedsize = 0

    if fixedsize > 0:
        # Write a fake arc containing the fixed size and number of arcs
        dbfile.write_byte(255)  # FIXED_SIZE
        dbfile.write_int(fixedsize)
        dbfile.write_int(numarcs)
        self.fixed_count += 1
    dbfile.write(buf.file.getvalue())

    return nodestart
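# Sketch of how a packed arc flags byte like the one built above can be
# decoded. The bit values below are assumptions chosen for illustration;
# the real constants are defined in Whoosh's word-graph module.
ARC_LAST = 1             # assumed value: this is the node's final arc
ARC_ACCEPT = 2           # assumed value: the arc ends a valid word
ARC_STOP = 4             # assumed value: the arc has no target node
ARC_HAS_VAL = 8          # assumed value: the arc carries a value
ARC_HAS_ACCEPT_VAL = 16  # assumed value: the arc carries an accept value
MULTIBYTE_LABEL = 32     # assumed value: the label is more than one byte

def describe_flags(flags):
    names = [("MULTIBYTE_LABEL", MULTIBYTE_LABEL), ("ARC_LAST", ARC_LAST),
             ("ARC_ACCEPT", ARC_ACCEPT), ("ARC_STOP", ARC_STOP),
             ("ARC_HAS_VAL", ARC_HAS_VAL),
             ("ARC_HAS_ACCEPT_VAL", ARC_HAS_ACCEPT_VAL)]
    return [name for name, bit in names if flags & bit]

print(describe_flags(ARC_LAST | ARC_ACCEPT))  # ['ARC_LAST', 'ARC_ACCEPT']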
def open_file(self, name, *args, **kwargs):
    f = StructFile(open(self._fpath(name), "rb"), *args, **kwargs)
    f._name = name
    return f
def create_file(self, name):
    f = StructFile(open(self._fpath(name), "wb"), name=name,
                   mapped=self.mapped)
    return f
def open_file(self, name, *args, **kwargs):
    if name not in self.files:
        raise NameError(name)
    return StructFile(StringIO(self.files[name]), *args, **kwargs)
def open_file(self, name, *args, **kwargs):
    return StructFile(DatastoreFile.loadfile(name))
def create_file(self, name):
    ss = self.SubStream(self._temp, self._buffersize)
    self._streams[name] = ss
    return StructFile(ss)
def open_file(self, name, *args, **kwargs):
    if name not in self.files:
        raise NameError(name)
    return StructFile(BytesIO(self.files[name]), name=name,
                      *args, **kwargs)
def create_file(self, name, excl=False, mode="w+b", **kwargs): f = StructFile(io.BytesIO(), name=name, onclose=self._encrypt_index_on_close(name)) f.is_real = False return f
def create_file(self, name, **kwargs):
    def onclose_fn(sfile):
        self.files[name] = sfile.file.getvalue()

    f = StructFile(BytesIO(), name=name, onclose=onclose_fn)
    return f
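# Standalone sketch of the onclose pattern used by the in-memory stores
# above: the StructFile wraps a BytesIO buffer, and closing it copies the
# buffer's contents into a dict. RamStore is illustrative, not a Whoosh
# class.
from io import BytesIO
from whoosh.filedb.structfile import StructFile

class RamStore(object):
    def __init__(self):
        self.files = {}

    def create_file(self, name):
        def onclose_fn(sfile):
            self.files[name] = sfile.file.getvalue()
        return StructFile(BytesIO(), name=name, onclose=onclose_fn)

st = RamStore()
f = st.create_file("demo")
f.write(b"hello")
f.close()                # close() fires onclose_fn
print(st.files["demo"])  # -> b'hello'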
def create_file(self, name):
    def onclose_fn(sfile):
        self.files[name] = sfile.file.getvalue()

    f = StructFile(StringIO(), name=name, onclose=onclose_fn)
    return f