def _serialize_data(cls, data, gzip=False, df_column_names=False, gzip_compression_level=6):
    if six.PY3:
        raise Exception("python3 currently not supported for this method! Sorry!")
    # easy, it's already in a stringio ready to go
    if any([isinstance(data, c) for c in already_stringio]):
        # make sure we reset to beginning, just in case, in preparation of read()
        # must be compatible with StringIO and cStringIO, so can't use reset()
        data.seek(SEEK_SET)
        return data
    data_io = StringIO()
    if gzip:
        io = GzipFile(fileobj=data_io, mode='w', compresslevel=gzip_compression_level)
    else:
        io = data_io
    if any([isinstance(data, c) for c in (dict, list, str)]):
        io.write(json.dumps(data))
    elif isinstance(data, DataFrame) or isinstance(data, Series):
        data.to_csv(io, index=False, header=df_column_names)
    else:
        io.write(str(data))
    if gzip:
        io.close()
    data_io.seek(SEEK_SET)
    return data_io
def main():
    args = parse_args()
    with args.source as source:
        reader = csv.DictReader(source)
        for (run, sample), rows in groupby(reader, itemgetter('run', 'enum')):
            sample_name = format_sample_name(run, sample)
            filename1 = os.path.join(args.dest, sample_name + '_R1_001.fastq.gz')
            filename2 = os.path.join(args.dest, sample_name + '_R2_001.fastq.gz')
            print(filename1)
            with open(filename1, 'wb') as dest1, open(filename2, 'wb') as dest2:
                dest1_zip = GzipFile(fileobj=dest1)
                dest2_zip = GzipFile(fileobj=dest2)
                for i, row in enumerate(rows):
                    seq = row['string'].replace('-', '')
                    for j in range(3):  # Three duplicates so that G2P doesn't ignore it.
                        prefix = '@M454:01:000000000-AAAAA:1:1101:{}:{}'.format(
                            10*i + j, row['count'])
                        dest1_zip.write(prefix + ' 1:N:0:1\n')
                        dest2_zip.write(prefix + ' 2:N:0:1\n')
                        dest1_zip.write(seq + '\n')
                        dest2_zip.write(reverse_and_complement(seq) + '\n')
                        dest1_zip.write('+\n')
                        dest2_zip.write('+\n')
                        quality = 'A' * len(seq)
                        dest1_zip.write(quality + '\n')
                        dest2_zip.write(quality + '\n')
                dest1_zip.close()
                dest2_zip.close()
    print('Done.')
def _read_raw_athena(filename):
    """try to read athena project file as plain text, to determine validity
    """
    # try gzip
    text = None
    fh = None
    try:
        fh = GzipFile(filename)
        text = bytes2str(fh.read())
    except Exception:
        errtype, errval, errtb = sys.exc_info()
        text = None
    finally:
        # guard against GzipFile() itself failing, which would leave fh unbound
        if fh is not None:
            fh.close()

    if text is None:
        # try plain text file
        fh = None
        try:
            fh = open(filename, 'r')
            text = bytes2str(fh.read())
        except Exception:
            errtype, errval, errtb = sys.exc_info()
            text = None
        finally:
            if fh is not None:
                fh.close()
    return text
def gzip_generator(string_generator):
    """Return generator for gzipping given string generator.

    Example:
    >>> import StringIO
    >>> z = ''.join(gzip_generator(iter(['hello,', ' ', 'world!'])))
    >>> ''.join(gunzip_generator(StringIO.StringIO(z)))
    'hello, world!'
    """
    # Use gzip and not zlib to make proper gzip header.
    buffer = StringIO()
    gzip = GzipFile(fileobj=buffer, mode='w')
    # Yield header
    yield buffer.getvalue()
    buffer.truncate(0)
    for string in string_generator:
        gzip.write(string)
        gzip.flush()
        yield buffer.getvalue()
        buffer.truncate(0)
    # Flush
    gzip.close()
    yield buffer.getvalue()
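# A standalone sketch of the same streaming idea using zlib's gzip framing
# (wbits = 16 + MAX_WBITS) instead of GzipFile; the names below are
# illustrative and not part of the original module.
import zlib

def gzip_chunks(chunks):
    # compressobj with 16 + MAX_WBITS emits a proper gzip header and trailer.
    compressor = zlib.compressobj(9, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
    for chunk in chunks:
        data = compressor.compress(chunk)
        if data:
            yield data
    yield compressor.flush()

compressed = b''.join(gzip_chunks([b'hello,', b' ', b'world!']))
assert zlib.decompress(compressed, 16 + zlib.MAX_WBITS) == b'hello, world!'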
def get_compressed_file_data(file_path, compresslevel=5):
    compressed_buffer = BytesIO()
    gzip_file = GzipFile(mode='wb', compresslevel=compresslevel,
                         fileobj=compressed_buffer)
    try:
        fileobj = open(file_path, 'rb')
        while True:
            x = fileobj.read(65536)
            if not x:
                break
            gzip_file.write(x)
            x = None
        fileobj.close()
    except IOError as e:
        LOG.error(str(e))
        return None
    gzip_file.close()
    compressed_data = compressed_buffer.getvalue()
    compressed_buffer.close()
    return compressed_data
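# An alternative sketch for the same task using only the standard library:
# shutil.copyfileobj replaces the manual 64 KiB read/write loop above. The
# function name is illustrative, not from the original code.
import shutil
from gzip import GzipFile
from io import BytesIO

def gzip_file_to_bytes(file_path, compresslevel=5):
    buf = BytesIO()
    with open(file_path, 'rb') as src:
        gz = GzipFile(mode='wb', compresslevel=compresslevel, fileobj=buf)
        try:
            shutil.copyfileobj(src, gz, 65536)
        finally:
            gz.close()
    return buf.getvalue()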
def __init__(self, data):
    fd, fname = tempfile.mkstemp()
    gzd = GzipFile(mode='r', fileobj=StringIO(b64decode(data)))
    os.write(fd, gzd.read())
    os.close(fd)
    gzd.close()
    self.name = fname
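# A minimal round-trip check for the pattern used above (gzip wrapped in
# base64), kept entirely in memory; purely illustrative.
from base64 import b64decode, b64encode
from gzip import GzipFile
from io import BytesIO

original = b'example data'
buf = BytesIO()
gz = GzipFile(fileobj=buf, mode='wb')
gz.write(original)
gz.close()                      # close() flushes the gzip trailer into buf
encoded = b64encode(buf.getvalue())

restored = GzipFile(fileobj=BytesIO(b64decode(encoded)), mode='rb').read()
assert restored == original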
def test_content_encoding_gzip(self): kwargs = {'message': 'hello'} message = json.dumps(kwargs) fp = StringIO() try: f = GzipFile(fileobj=fp, mode='w') f.write(message) finally: f.close() key = self.projectkey.public_key secret = self.projectkey.secret_key with self.tasks(): resp = self.client.post( self.path, fp.getvalue(), content_type='application/octet-stream', HTTP_CONTENT_ENCODING='gzip', HTTP_X_SENTRY_AUTH=get_auth_header('_postWithHeader', key, secret), ) assert resp.status_code == 200, resp.content event_id = json.loads(resp.content)['id'] instance = Event.objects.get(event_id=event_id) assert instance.message == 'hello'
def write_sbml_model(cobra_model, filename, use_fbc_package=True, **kwargs):
    if not use_fbc_package:
        if libsbml is None:
            raise Exception("libSBML required to write non-fbc models")
        write_sbml2(cobra_model, filename, use_fbc_package=False, **kwargs)
        return
    # create xml
    xml = model_to_xml(cobra_model, **kwargs)
    write_args = {"encoding": "UTF-8"}
    if _with_lxml:
        write_args["pretty_print"] = True
    else:
        indent_xml(xml)
    # write xml to file
    should_close = True
    if hasattr(filename, "write"):
        xmlfile = filename
        should_close = False
    elif filename.endswith(".gz"):
        xmlfile = GzipFile(filename, "wb")
    elif filename.endswith(".bz2"):
        xmlfile = BZ2File(filename, "wb")
    else:
        xmlfile = open(filename, "wb")
    ElementTree(xml).write(xmlfile, **write_args)
    if should_close:
        xmlfile.close()
def write(self):
    if debug:
        print 'writing to disk'
    gz = GzipFile(database, 'wb')
    dump(db, gz, -1)
    gz.close()
    Pref.writing_to_disk = False
def _compress_string(self, s):
    """Gzip a given string."""
    zbuf = StringIO()
    zfile = GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    zfile.write(s)
    zfile.close()
    return zbuf.getvalue()
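# On Python 3.2+ the same one-shot compression is available as module-level
# helpers, which avoids the explicit buffer bookkeeping; a minimal sketch,
# not a drop-in replacement for the method above.
import gzip

blob = gzip.compress(b'hello world', compresslevel=6)
assert gzip.decompress(blob) == b'hello world'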
def get(self):
    from gzip import GzipFile
    try:
        # cStringIO is the C-accelerated implementation; fall back to StringIO.
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO

    data = self.get_data()
    data['gzipped'] = True
    json_response = self.json_response(data, finish=False)

    tmp_buffer = StringIO()
    gziped_buffer = GzipFile(
        fileobj=tmp_buffer,
        mode="wb",
        compresslevel=7)
    gziped_buffer.write(json_response)
    gziped_buffer.close()

    gzipped_data = tmp_buffer.getvalue()
    self.set_header("Content-Encoding", 'gzip')
    self.set_header("Content-Length", str(len(gzipped_data)))
    tmp_buffer.close()
    self.finish(gzipped_data)
def DecodeProcFile(proc_file): if len(proc_file) < 256: fd = open(proc_file) proc_file = fd.read(1024*1024) fd.close() if proc_file.find('Subsystem Id:') < 0: p = None try: from gzip import GzipFile from StringIO import StringIO s = StringIO(proc_file) gz = GzipFile(mode='r', fileobj=s) p = gz.read(1024*1024) gz.close() except: pass if p is None: try: from bz2 import decompress p = decompress(proc_file) except: pass if not p is None: proc_file = p return proc_file
def open(self): request = Request(self.url) request.add_header('User-Agent','lastfm-lda recommender v.0.0.-1') request.add_header('Accept-encoding', 'gzip') while True: URLLoadListener.num_connections+=1 response = None try: response = urlopen(request,timeout=10) if response.info().get('Content-Encoding') == 'gzip': f = GzipFile(fileobj=StringIO(response.read())) result = f.read() f.close() else: result = response.read() break except Exception, e: if self.retries>2: if isinstance(e, BadStatusLine): raise Exception("last.fm server does not respond (%s)" % e) raise e self.retries+=1 print self.url print "failed with", e print "retry #",self.retries print finally:
def _uncachedgenerate(self): """ Generates the Gzipped sitemap uncached data """ len_brains = len(self._catalogbrains()) if self.index is None: # no index specified in the url if len_brains < self.maxlen: # ok, we have few items, let's generate the standard sitemap xml = self.template() else: # a lot of items, let's generate a sitemap index xml = self.indextemplate() elif int(self.index)*self.maxlen >= len_brains: # bad index specified raise NotFound(self.context, '%s-%s' % (self.index, self.filename), self.request) else: # index specified in the url xml = self.template() if self.index is not None: filename = "%s-%s" % (self.index, self.filename) else: filename = self.filename fp = StringIO() gzip = GzipFile(filename, 'w', 9, fp) gzip.write(xml) gzip.close() data = fp.getvalue() fp.close() return data
class CompressingRequestWrapper(_makeBase()): """ A request wrapper with support for transport encoding compression. @ivar underlying: the request being wrapped. @type underlying: L{IRequest} @ivar encoding: the IANA-assigned name of the encoding. @type encoding: C{str} @ivar compressLevel: the level of gzip compression to apply. @type compressLevel: C{int} """ implements(IRequest) encoding = 'gzip' compressLevel = 6 def __init__(self, underlying): self.underlying = underlying self.setHeader('content-encoding', self.encoding) self._gzipFile = None # See setHeader docstring for more commentary. self.underlying.headers.pop('content-length', None) def setHeader(self, name, value): """ Discard the Content-Length header. When compression encoding is in use, the Content-Length header must indicate the length of the compressed content; since we are doing the compression on the fly, we don't actually know what the length is after compression, so we discard this header. If this is an HTTP/1.1 request, chunked transfer encoding should be used, softening the impact of losing this header. """ if name.lower() == 'content-length': return else: return self.underlying.setHeader(name, value) def write(self, data): """ Pass data through to the gzip layer. """ if self._gzipFile is None: self._gzipFile = GzipFile(fileobj=self.underlying, mode='wb', compresslevel=self.compressLevel) self._gzipFile.write(data) def finishRequest(self, success): """ Finish off gzip stream. """ if self._gzipFile is None: self.write('') self._gzipFile.close() self.underlying.finishRequest(success)
def handle_stackexchange_login(self, data):
    self.send_response(200)
    self.send_header("Content-type", "text/html")
    self.log_message(self.path)
    self.end_headers()

    c = Client(StackExchange, get_config())
    cred = c.flow.authorization_received(data)
    d = c.request("/me", body=urlencode({"site": "stackoverflow"}))

    self.wfile.write("<!DOCTYPE html>")
    self.wfile.write("<head><meta charset=\"utf-8\"/></head><body>")
    self.wfile.write("Access token: %s<br>" % cred.access_token)
    self.wfile.write("Type: %s<br>" % cred.token_type)
    self.wfile.write("Expires in: %d<br>" % cred.expires_in)

    # stackexchange gzips all data
    h = StringIO(d)
    gzip_data = GzipFile(fileobj=h)
    d = gzip_data.read()
    gzip_data.close()

    self.wfile.write(d)
    self.wfile.write("</body></html>")
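# A hedged sketch of the response-decoding step above as a reusable helper:
# only unwrap the body when the server actually declared gzip encoding.
# Names are illustrative and not part of the original handler.
from gzip import GzipFile
from io import BytesIO

def maybe_decompress(body, headers):
    if headers.get('Content-Encoding') == 'gzip':
        return GzipFile(fileobj=BytesIO(body)).read()
    return body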
def save(self, filename, mtime=1300507380.0):
    """
    Serialize this RingData instance to disk.

    :param filename: File into which this instance should be serialized.
    :param mtime: time used to override mtime for gzip, default or None
                  if the caller wants to include time
    """
    # Override the timestamp so that the same ring data creates
    # the same bytes on disk. This makes a checksum comparison a
    # good way to see if two rings are identical.
    #
    # This only works on Python 2.7; on 2.6, we always get the
    # current time in the gzip output.
    tempf = NamedTemporaryFile(dir=".", prefix=filename, delete=False)
    if 'mtime' in inspect.getargspec(GzipFile.__init__).args:
        gz_file = GzipFile(filename, mode='wb', fileobj=tempf, mtime=mtime)
    else:
        gz_file = GzipFile(filename, mode='wb', fileobj=tempf)
    self.serialize_v1(gz_file)
    gz_file.close()
    tempf.flush()
    os.fsync(tempf.fileno())
    tempf.close()
    os.chmod(tempf.name, 0o644)
    os.rename(tempf.name, filename)
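# A quick demonstration of why the mtime override matters: with a fixed mtime
# the same payload always produces byte-identical gzip output, so ring files
# can be compared by checksum. Illustrative only; assumes a GzipFile that
# accepts the mtime keyword (Python 2.7+).
from gzip import GzipFile
from io import BytesIO

def gzip_bytes(data, mtime):
    buf = BytesIO()
    with GzipFile(mode='wb', fileobj=buf, mtime=mtime) as gz:
        gz.write(data)
    return buf.getvalue()

assert gzip_bytes(b'ring data', 1300507380.0) == gzip_bytes(b'ring data', 1300507380.0)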
def compress_string(s):
    # avg_block_size is actually the reciprocal of the average
    # intended interflush distance.
    rnd = Random(s)
    flushes_remaining = FLUSH_LIMIT
    if len(s) < AVERAGE_SPAN_BETWEEN_FLUSHES * APPROX_MIN_FLUSHES:
        avg_block_size = APPROX_MIN_FLUSHES / float(len(s) + 1)
    else:
        avg_block_size = 1.0 / AVERAGE_SPAN_BETWEEN_FLUSHES
    s = StringIO(s) if isinstance(s, six.text_type) else BytesIO(s)
    zbuf = BytesIO()
    zfile = GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    chunk = s.read(MIN_INTERFLUSH_INTERVAL + int(rnd.expovariate(avg_block_size)))
    while chunk and flushes_remaining:
        zfile.write(chunk)
        zfile.flush()
        flushes_remaining -= 1
        chunk = s.read(MIN_INTERFLUSH_INTERVAL + int(rnd.expovariate(avg_block_size)))
    zfile.write(chunk)
    zfile.write(s.read())
    zfile.close()
    return zbuf.getvalue()
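# A small sanity check that interleaved flush() calls (which insert sync
# points, as compress_string above relies on) still yield a stream that
# decompresses back to the original bytes; illustrative only, and
# gzip.decompress needs Python 3.2+.
import gzip
from io import BytesIO

buf = BytesIO()
gz = gzip.GzipFile(mode='wb', fileobj=buf)
for chunk in (b'alpha', b'bravo', b'charlie'):
    gz.write(chunk)
    gz.flush()
gz.close()
assert gzip.decompress(buf.getvalue()) == b'alphabravocharlie'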
class NBTFile(TAG_Compound): """Represents an NBT file object""" def __init__(self, filename=None, mode=None, buffer=None): super(NBTFile,self).__init__() self.__class__.__name__ = "TAG_Compound" if filename: self.file = GzipFile(filename, mode) self.parse_file(self.file) def parse_file(self, file=None): if not file: file = self.file if file: self.type = TAG_Byte(buffer=file) if self.type.value == self.id: name = TAG_String(buffer=file) self._parse_buffer(file) self.name = name self.file.close() else: raise ValueError("First record is not a Compound Tag") def write_file(self, filename=None, file=None): if file: self.file = file elif filename: self.file = GzipFile(filename, "wb") else: raise ValueError("Need to specify either a filename or a file") #Render tree to file self.type._render_buffer(file) self.name._render_buffer(file) self._render_buffer(file)
def build_index_gzip(self): """creates sorted index from gzip-compressed queue. caches object regardless of caccheobj flag. """ self.index = [] zf = GzipFile(fileobj=self.map, mode="rb") while 1: p = zf.tell() # just for diagnosis use try: l = zf.readline() except IOError as ex: # probably CRC error due to truncated file. discard the rest. logging.error("error in %s at %d: %s", self.fn, p, str(ex)) break if not l: break if l[0] != " ": continue try: o = cjson.decode(l[1:]) except Exception as ex: logging.warn("skipping malformed JSON at %s:%d: %s", self.fn, p, l[1:]) continue key = o.get("id") if key is None: try: key = self.urikey(o) except UnicodeEncodeError: pass if key is None: logging.error("urikey->None for %s", str(o)) continue self.index.append((key, o)) zf.close()
def gzip_media(self, filedata):
    """gzip encodes a given stream of data."""
    gzip_data = StringIO()
    gzf = GzipFile(fileobj=gzip_data, mode="wb")
    gzf.write(filedata)
    gzf.close()
    return gzip_data.getvalue()
def main(argv):
    args = argv[1:] or ["-"]

    class TitleExtractor(MWXMLDumpParser):
        def start_revision(self, pageid, title, revid, timestamp):
            print(pageid, title)
            return

    for path in args:
        if path == "-":
            fp = sys.stdin
        elif path.endswith(".gz"):
            from gzip import GzipFile
            fp = GzipFile(path)
        elif path.endswith(".bz2"):
            from bz2 import BZ2File
            fp = BZ2File(path)
        else:
            fp = open(path)
        parser = TitleExtractor()
        parser.feed_file(fp)
        fp.close()
        parser.close()
    return 0
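# The extension-based dispatch above can be factored into a small helper;
# this is a sketch with an illustrative name, not part of the original script.
def open_dump(path):
    if path == '-':
        import sys
        return sys.stdin
    if path.endswith('.gz'):
        from gzip import GzipFile
        return GzipFile(path)
    if path.endswith('.bz2'):
        from bz2 import BZ2File
        return BZ2File(path)
    return open(path)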
def save(self, filename):
    """
    Serialize this RingData instance to disk.

    :param filename: File into which this instance should be serialized.
    """
    # Override the timestamp so that the same ring data creates
    # the same bytes on disk. This makes a checksum comparison a
    # good way to see if two rings are identical.
    #
    # This only works on Python 2.7; on 2.6, we always get the
    # current time in the gzip output.
    tempf = NamedTemporaryFile(dir=".", prefix=filename, delete=False)
    try:
        gz_file = GzipFile(filename, mode='wb', fileobj=tempf,
                           mtime=1300507380.0)
    except TypeError:
        gz_file = GzipFile(filename, mode='wb', fileobj=tempf)
    self.serialize_v1(gz_file)
    gz_file.close()
    tempf.flush()
    os.fsync(tempf.fileno())
    tempf.close()
    os.chmod(tempf.name, 0o644)
    os.rename(tempf.name, filename)
def handleResponse(self, response): if self.quietLoss: return if self.failed: self.factory.noPage( failure.Failure( error.Error( self.status, self.message, response))) elif self.length != None and self.length != 0: self.factory.noPage(failure.Failure( client.PartialDownloadError(self.status, self.message, response))) else: if self.decode: s = StringIO() s.write(response) s.seek(-1) g = GzipFile(fileobj=s, mode='rb') try: response = g.read() except IOError: self.factory.noPage(failure.Failure( client.PartialDownloadError(self.status, self.message, response))) self.transport.loseConnection() return g.close() self.factory.page(response) # server might be stupid and not close connection. self.transport.loseConnection()
def savematch(data, filename=''): """data must have the following format: dictionary from Matrix to Sequence to Index to Score""" #Maybe one should add a security policy for allowed filenames. #e.g. do not allow '/' in filename. if filename=='': a=localtime() filename='eel_'+str(a.tm_year)+'_'+str(a.tm_mon)+'_'+str(a.tm_mday)+'_'+str(a.tm_hour)+'_'+str(a.tm_min)+'.gff' try: if filename[-3:]==".gz": try: F=GzipFile(filename,"w") except NameError: filename=filename[:-3] F=open(filename,'w') else: F=open(filename,'w') ## This is in wrong format Seq and Matr are reversed. ## for Matr in data.keys(): ## for Seq in data[Matr].keys(): ## for Pos,Strand in data[Matr][Seq].keys(): ## F.write("%s\teel\t%s\t%d\t%d\t%f\t%s\t.\n"%(Seq,Matr.getName(),Pos,Pos+len(Matr)-1,data[Matr][Seq][(Pos,Strand)],Strand)) F.write(get(data)) F.close() return filename except IOError, (errno, strerror): print "I/O error(%s): %s" % (errno, strerror) return ''
def run(self): if not Pref.writing_to_disk: Pref.writing_to_disk = True print_line() print_debug('WRITING TO DISK') start = time.time() while len(db) > Pref.max_database_records: db.popitem(last = False) gz = GzipFile(database+'.tmp', 'wb') dump(db, gz, -1) gz.close() try: remove(database) except: pass try: rename(database+'.tmp', database) except: pass print_debug('time expend writting to disk', time.time()-start) Pref.writing_to_disk = False
def write_to(self, out, newline='\x0D\x0A', gzip=False):
    if gzip:
        out = GzipFile(fileobj=out)
    self._write_to(out, newline)
    if gzip:
        out.flush()
        out.close()
def testPostMethodDeCompressesDeflatedBody_gzip(self): self.requestData = None def handler(**kwargs): self.requestData = kwargs reactor = Reactor() server = HttpServer(reactor, self.port, handler, timeout=0.01) server.listen() sok = socket() sok.connect(('localhost', self.port)) bodyData = 'bodydatabodydata' _sio = StringIO() _gzFileObj = GzipFile(filename=None, mode='wb', compresslevel=6, fileobj=_sio) _gzFileObj.write(bodyData); _gzFileObj.close() compressedBodyData = _sio.getvalue() bodyDataCompressed = compress(bodyData) contentLengthCompressed = len(bodyDataCompressed) sok.send(('POST / HTTP/1.0\r\nContent-Type: application/x-www-form-urlencoded\r\nContent-Length: %d\r\nContent-Encoding: gzip\r\n\r\n' % contentLengthCompressed) + bodyDataCompressed) while not self.requestData: reactor.step() self.assertEquals(dict, type(self.requestData)) self.assertTrue('Headers' in self.requestData) headers = self.requestData['Headers'] self.assertEquals('POST', self.requestData['Method']) self.assertEquals('application/x-www-form-urlencoded', headers['Content-Type']) self.assertEquals(contentLengthCompressed, int(headers['Content-Length'])) self.assertTrue('Body' in self.requestData) self.assertEquals('bodydatabodydata', self.requestData['Body'])
def readDerrick(path):
    g = GzipFile(path, "rb")
    messages = []
    for l in g:
        messages.append(urllib.unquote(l.rstrip("\r\n").split(" ", 4)[-1]))
    g.close()
    return messages
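# Since iterating a GzipFile yields decompressed lines, the loop above can be
# written as a comprehension; a sketch in the same Python 2 idiom as the
# original (urllib.unquote), with an illustrative function name.
from gzip import GzipFile
from urllib import unquote  # Python 2

def read_derrick(path):
    with GzipFile(path, 'rb') as g:
        return [unquote(l.rstrip('\r\n').split(' ', 4)[-1]) for l in g]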
def run_analogy_space_lang(lang): # Open files (fail early on errors) tensor_name = tensor_filename(lang) tensor_name_new = tensor_name+'_new' tensor_file = GzipFile(tensor_name_new, 'wb') svd_name = svd_filename(lang) svd_name_new = svd_name + '_new' # Load matrix logging.info('Loading %s'% lang) cnet_2d = conceptnet_2d_from_db(lang, identities=IDENTITIES, cutoff=CUTOFF) logging.info('Normalize %r' % cnet_2d) cnet_2d = cnet_2d.normalized() # Save tensor logging.info('Save tensor as %s' % tensor_name) pickle.dump(cnet_2d, tensor_file, -1) tensor_file.close() os.rename(tensor_name_new, tensor_name) logging.info('Running SVD') svd = cnet_2d.svd(k=100) # Save SVD logging.info('Save as %s' % svd_name) svd.save_pytables(svd_name_new) os.rename(svd_name_new, svd_name)
def _update(self, version): from poetry.utils.helpers import temporary_directory release_name = self._get_release_name(version) checksum = "{}.sha256sum".format(release_name) base_url = self.BASE_URL try: r = urlopen(base_url + "/{}/{}".format(version, checksum)) except HTTPError as e: if e.code == 404: raise RuntimeError("Could not find {} file".format(checksum)) raise checksum = r.read().decode().strip() # We get the payload from the remote host name = "{}.tar.gz".format(release_name) try: r = urlopen(base_url + "/{}/{}".format(version, name)) except HTTPError as e: if e.code == 404: raise RuntimeError("Could not find {} file".format(name)) raise meta = r.info() size = int(meta["Content-Length"]) current = 0 block_size = 8192 bar = self.progress_bar(max=size) bar.set_format(" - Downloading <info>{}</> <comment>%percent%%</>".format(name)) bar.start() sha = hashlib.sha256() with temporary_directory(prefix="poetry-updater-") as dir_: tar = os.path.join(dir_, name) with open(tar, "wb") as f: while True: buffer = r.read(block_size) if not buffer: break current += len(buffer) f.write(buffer) sha.update(buffer) bar.set_progress(current) bar.finish() # Checking hashes if checksum != sha.hexdigest(): raise RuntimeError( "Hashes for {} do not match: {} != {}".format( name, checksum, sha.hexdigest() ) ) gz = GzipFile(tar, mode="rb") try: with tarfile.TarFile(tar, fileobj=gz, format=tarfile.PAX_FORMAT) as f: f.extractall(str(self.lib)) finally: gz.close()
class S3Boto3StorageFile(File): """ The default file object used by the S3Boto3Storage backend. This file implements file streaming using boto's multipart uploading functionality. The file can be opened in read or write mode. This class extends Django's File class. However, the contained data is only the data contained in the current buffer. So you should not access the contained file object directly. You should access the data via this class. Warning: This file *must* be closed using the close() method in order to properly write the file to S3. Be sure to close the file in your application. """ # TODO: Read/Write (rw) mode may be a bit undefined at the moment. Needs testing. # TODO: When Django drops support for Python 2.5, rewrite to use the # BufferedIO streams in the Python 2.6 io module. buffer_size = 5242880 def __init__(self, name, mode, storage, buffer_size=None): self._storage = storage self.name = name[len(self._storage.location):].lstrip("/") self._mode = mode self.obj = storage.bucket.Object(storage._encode_name(name)) # NOTE(mattrobenolt): This is an explicit deviation from # django-storages. This adds an extra HEAD request before # every GET. This effectively doubles the time it takes for # every chunk in our filestore. We explicitly are opting # out of this behavior to avoid this overhead. # # if 'w' not in mode: # # Force early RAII-style exception if object does not exist # self.obj.load() self._is_dirty = False self._file = None self._multipart = None # 5 MB is the minimum part size (if there is more than one part). # Amazon allows up to 10,000 parts. The default supports uploads # up to roughly 50 GB. Increase the part size to accommodate # for files larger than this. if buffer_size is not None: self.buffer_size = buffer_size self._write_counter = 0 @property def size(self): return self.obj.content_length def _get_file(self): if self._file is None: with metrics.timer("filestore.read", instance="s3"): self._file = BytesIO() if "r" in self._mode: self._is_dirty = False self._file.write(self.obj.get()["Body"].read()) self._file.seek(0) if self._storage.gzip and self.obj.content_encoding == "gzip": self._file = GzipFile(mode=self._mode, fileobj=self._file, mtime=0.0) return self._file def _set_file(self, value): self._file = value file = property(_get_file, _set_file) def read(self, *args, **kwargs): if "r" not in self._mode: raise AttributeError("File was not opened in read mode.") return super().read(*args, **kwargs) def write(self, content): if "w" not in self._mode: raise AttributeError("File was not opened in write mode.") self._is_dirty = True if self._multipart is None: parameters = self._storage.object_parameters.copy() parameters["ACL"] = self._storage.default_acl parameters["ContentType"] = (mimetypes.guess_type(self.obj.key)[0] or self._storage.default_content_type) if self._storage.reduced_redundancy: parameters["StorageClass"] = "REDUCED_REDUNDANCY" if self._storage.encryption: parameters["ServerSideEncryption"] = "AES256" self._multipart = self.obj.initiate_multipart_upload(**parameters) if self.buffer_size <= self._buffer_file_size: self._flush_write_buffer() return super().write(force_bytes(content)) @property def _buffer_file_size(self): pos = self.file.tell() self.file.seek(0, os.SEEK_END) length = self.file.tell() self.file.seek(pos) return length def _flush_write_buffer(self): """ Flushes the write buffer. 
""" if self._buffer_file_size: self._write_counter += 1 self.file.seek(0) part = self._multipart.Part(self._write_counter) part.upload(Body=self.file.read()) def close(self): if self._is_dirty: self._flush_write_buffer() # TODO: Possibly cache the part ids as they're being uploaded # instead of requesting parts from server. For now, emulating # s3boto's behavior. parts = [{ "ETag": part.e_tag, "PartNumber": part.part_number } for part in self._multipart.parts.all()] self._multipart.complete(MultipartUpload={"Parts": parts}) else: if self._multipart is not None: self._multipart.abort() if self._file is not None: self._file.close() self._file = None
class S3Boto3StorageFile(File): """ The default file object used by the S3Boto3Storage backend. This file implements file streaming using boto's multipart uploading functionality. The file can be opened in read or write mode. This class extends Django's File class. However, the contained data is only the data contained in the current buffer. So you should not access the contained file object directly. You should access the data via this class. Warning: This file *must* be closed using the close() method in order to properly write the file to S3. Be sure to close the file in your application. """ # TODO: Read/Write (rw) mode may be a bit undefined at the moment. Needs testing. # TODO: When Django drops support for Python 2.5, rewrite to use the # BufferedIO streams in the Python 2.6 io module. buffer_size = setting('AWS_S3_FILE_BUFFER_SIZE', 5242880) def __init__(self, name, mode, storage, buffer_size=None): self._storage = storage self.name = name[len(self._storage.location):].lstrip('/') self._mode = mode self.obj = storage.bucket.Object(storage._encode_name(name)) if 'w' not in mode: # Force early RAII-style exception if object does not exist self.obj.load() self._is_dirty = False self._file = None self._multipart = None # 5 MB is the minimum part size (if there is more than one part). # Amazon allows up to 10,000 parts. The default supports uploads # up to roughly 50 GB. Increase the part size to accommodate # for files larger than this. if buffer_size is not None: self.buffer_size = buffer_size self._write_counter = 0 @property def size(self): return self.obj.content_length def _get_file(self): if self._file is None: self._file = SpooledTemporaryFile( max_size=self._storage.max_memory_size, suffix=".S3Boto3StorageFile", dir=setting("FILE_UPLOAD_TEMP_DIR", None)) if 'r' in self._mode: self._is_dirty = False self._file.write(self.obj.get()['Body'].read()) self._file.seek(0) if self._storage.gzip and self.obj.content_encoding == 'gzip': self._file = GzipFile(mode=self._mode, fileobj=self._file, mtime=0.0) return self._file def _set_file(self, value): self._file = value file = property(_get_file, _set_file) def read(self, *args, **kwargs): if 'r' not in self._mode: raise AttributeError("File was not opened in read mode.") return super(S3Boto3StorageFile, self).read(*args, **kwargs) def write(self, content): if 'w' not in self._mode: raise AttributeError("File was not opened in write mode.") self._is_dirty = True if self._multipart is None: parameters = self._storage.object_parameters.copy() parameters['ACL'] = self._storage.default_acl parameters['ContentType'] = (mimetypes.guess_type(self.obj.key)[0] or self._storage.default_content_type) if self._storage.reduced_redundancy: parameters['StorageClass'] = 'REDUCED_REDUNDANCY' if self._storage.encryption: parameters['ServerSideEncryption'] = 'AES256' self._multipart = self.obj.initiate_multipart_upload(**parameters) if self.buffer_size <= self._buffer_file_size: self._flush_write_buffer() return super(S3Boto3StorageFile, self).write(force_bytes(content)) @property def _buffer_file_size(self): pos = self.file.tell() self.file.seek(0, os.SEEK_END) length = self.file.tell() self.file.seek(pos) return length def _flush_write_buffer(self): """ Flushes the write buffer. 
""" if self._buffer_file_size: self._write_counter += 1 self.file.seek(0) part = self._multipart.Part(self._write_counter) part.upload(Body=self.file.read()) def close(self): if self._is_dirty: self._flush_write_buffer() # TODO: Possibly cache the part ids as they're being uploaded # instead of requesting parts from server. For now, emulating # s3boto's behavior. parts = [{ 'ETag': part.e_tag, 'PartNumber': part.part_number } for part in self._multipart.parts.all()] self._multipart.complete(MultipartUpload={'Parts': parts}) else: if self._multipart is not None: self._multipart.abort() if self._file is not None: self._file.close() self._file = None
line = pkginfos.split(':') if line[0] == 'Filename': filename = line[1].strip() elif line[0] == 'MD5sum': md5 = line[1].strip() elif line[0] == 'SHA1': sha1 = line[1].strip() elif line[0] == 'SHA256': sha256 = line[1].strip() if not md5 in channelPkgs and not sha1 in channelPkgs and not sha256 in channelPkgs: syncPkgs.append(filename) else: syncedPkgCount += 1 gzipfile.close() print "INFO: Packages in repo: %d" % repoPkgCount print "INFO: Packages synced: %d" % syncedPkgCount print "INFO: Packages to sync: %d" % len(syncPkgs) # download and push missing packages synced = 0 for pkg in syncPkgs: synced += 1 print "INFO: %d/%d: %s" % (synced, len(syncPkgs), os.path.basename(pkg)) # download url = urlopen(repoRoot + pkg) pkgFile = open(tempfile.gettempdir() + '/' + os.path.basename(pkg), 'wb')
def compress_string(s):
    zbuf = BytesIO()
    zfile = GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    zfile.write(s)
    zfile.close()
    return zbuf.getvalue()
def putter(put, put_queue, stat_queue, options): pid = current_process().pid log = logging.getLogger(os.path.basename(sys.argv[0])) connection, bucket = None, None file_object_cache = FileObjectCache() # Figure out what content types we want to gzip if not options.gzip_type: # default gzip_content_types = GZIP_CONTENT_TYPES elif 'all' in options.gzip_type: gzip_content_types = GZIP_ALL else: gzip_content_types = options.gzip_type if 'guess' in gzip_content_types: # don't bother removing 'guess' from the list since nothing will match it gzip_content_types.extend(GZIP_CONTENT_TYPES) if options.gzip: pass #log.debug('These content types will be gzipped: %s' % unicode(gzip_content_types)) while True: args = put_queue.get() #print args, pid if args is None: put_queue.task_done() break key_name, value_kwargs = args #print(666,value_kwargs) if options.gzip: key_name = '%s.gz' % key_name value = Value(file_object_cache, **value_kwargs) should_gzip = False try: if connection is None: connection = S3Connection(is_secure=options.secure, host=options.host) if bucket is None: bucket = connection.get_bucket(options.bucket, validate=False) key = put(bucket, key_name, value) if key: if value.should_copy_content(): if options.headers: headers = dict( tuple(header.split(':', 1)) for header in options.headers) else: headers = {} content_type = None if options.content_type: if options.content_type == 'guess': content_type = mimetypes.guess_type(value.path)[0] elif options.content_type == 'magic': content_type = mimetypes.guess_type(value.path)[0] if content_type is None: content_type = magic.from_file(value.path, mime=True) else: content_type = options.content_type headers['Content-Type'] = content_type content = value.get_content() md5 = value.md5 should_gzip = options.gzip and ( content_type and content_type in gzip_content_types or gzip_content_types == GZIP_ALL) if should_gzip: headers['Content-Encoding'] = 'gzip' string_io = StringIO() gzip_file = GzipFile(compresslevel=1, fileobj=string_io, mode='w') gzip_file.write(content) gzip_file.close() content = string_io.getvalue() md5 = compute_md5(StringIO(content)) if not options.dry_run: key.set_contents_from_string( content, headers, md5=md5, policy=options.grant, encrypt_key=options.encrypt_key) #log.info('%s %s> %s' % (value.path, 'z' if should_gzip else '-', key.name)) stat_queue.put(dict(size=value.get_size())) else: log.info('skipping %s -> %s' % (value.path, key_name)) except SSLError as exc: log.error('%s -> %s (%s)' % (value.path, key_name, exc)) put_queue.put(args) connection, bucket = None, None except IOError as exc: log.error('%s -> %s (%s)' % (value.path, key_name, exc)) put_queue.task_done()
class NBTFile(TAG_Compound): """Represent an NBT file object.""" def __init__(self, filename=None, buffer=None, fileobj=None): """ Create a new NBTFile object. Specify either a filename, file object or data buffer. If filename of file object is specified, data should be GZip-compressed. If a data buffer is specified, it is assumed to be uncompressed. If filename is specified, the file is closed after reading and writing. If file object is specified, the caller is responsible for closing the file. """ super(NBTFile, self).__init__() self.filename = filename self.type = TAG_Byte(self.id) closefile = True # make a file object if filename: self.filename = filename self.file = GzipFile(filename, 'rb') elif buffer: if hasattr(buffer, 'name'): self.filename = buffer.name self.file = buffer closefile = False elif fileobj: if hasattr(fileobj, 'name'): self.filename = fileobj.name self.file = GzipFile(fileobj=fileobj) else: self.file = None closefile = False # parse the file given initially if self.file: self.parse_file() if closefile: # Note: GzipFile().close() does NOT close the fileobj, # So we are still responsible for closing that. try: self.file.close() except (AttributeError, IOError): pass self.file = None def parse_file(self, filename=None, buffer=None, fileobj=None): """Completely parse a file, extracting all tags.""" closefile = True if filename: self.file = GzipFile(filename, 'rb') elif buffer: if hasattr(buffer, 'name'): self.filename = buffer.name self.file = buffer closefile = False elif fileobj: if hasattr(fileobj, 'name'): self.filename = fileobj.name self.file = GzipFile(fileobj=fileobj) if self.file: try: type = TAG_Byte(buffer=self.file) if type.value == self.id: name = TAG_String(buffer=self.file).value self._parse_buffer(self.file) self.name = name if closefile: self.file.close() else: raise MalformedFileError( "First record is not a Compound Tag") except StructError as e: raise MalformedFileError( "Partial File Parse: file possibly truncated.") else: raise ValueError("NBTFile.parse_file(): Need to specify either a " "filename or a file object") def write_file(self, filename=None, buffer=None, fileobj=None): """Write this NBT file to a file.""" closefile = True if buffer: self.filename = None self.file = buffer closefile = False elif filename: self.filename = filename self.file = GzipFile(filename, "wb") elif fileobj: self.filename = None self.file = GzipFile(fileobj=fileobj, mode="wb") elif self.filename: self.file = GzipFile(self.filename, "wb") elif not self.file: raise ValueError("NBTFile.write_file(): Need to specify either a " "filename or a file object") # Render tree to file TAG_Byte(self.id)._render_buffer(self.file) TAG_String(self.name)._render_buffer(self.file) self._render_buffer(self.file) # make sure the file is complete try: self.file.flush() except (AttributeError, IOError): pass if closefile: try: self.file.close() except (AttributeError, IOError): pass def __repr__(self): """ Return a string (ascii formated for Python 2, unicode for Python 3) describing the class, name and id for debugging purposes. """ if self.filename: return "<%s(%r) with %s(%r) at 0x%x>" % ( self.__class__.__name__, self.filename, TAG_Compound.__name__, self.name, id(self)) else: return "<%s with %s(%r) at 0x%x>" % (self.__class__.__name__, TAG_Compound.__name__, self.name, id(self))
class S3Boto3StorageFile(File): """ The default file object used by the S3Boto3Storage backend. This file implements file streaming using boto's multipart uploading functionality. The file can be opened in read or write mode. This class extends Django's File class. However, the contained data is only the data contained in the current buffer. So you should not access the contained file object directly. You should access the data via this class. Warning: This file *must* be closed using the close() method in order to properly write the file to S3. Be sure to close the file in your application. """ buffer_size = setting('AWS_S3_FILE_BUFFER_SIZE', 5242880) def __init__(self, name, mode, storage, buffer_size=None): if 'r' in mode and 'w' in mode: raise ValueError("Can't combine 'r' and 'w' in mode.") self._storage = storage self.name = name[len(self._storage.location):].lstrip('/') self._mode = mode self._force_mode = (lambda b: b) if 'b' in mode else force_text self.obj = storage.bucket.Object(storage._encode_name(name)) if 'w' not in mode: # Force early RAII-style exception if object does not exist self.obj.load() self._is_dirty = False self._raw_bytes_written = 0 self._file = None self._multipart = None # 5 MB is the minimum part size (if there is more than one part). # Amazon allows up to 10,000 parts. The default supports uploads # up to roughly 50 GB. Increase the part size to accommodate # for files larger than this. if buffer_size is not None: self.buffer_size = buffer_size self._write_counter = 0 @property def size(self): return self.obj.content_length def _get_file(self): if self._file is None: self._file = SpooledTemporaryFile( max_size=self._storage.max_memory_size, suffix=".S3Boto3StorageFile", dir=setting("FILE_UPLOAD_TEMP_DIR") ) if 'r' in self._mode: self._is_dirty = False self.obj.download_fileobj(self._file) self._file.seek(0) if self._storage.gzip and self.obj.content_encoding == 'gzip': self._file = GzipFile(mode=self._mode, fileobj=self._file, mtime=0.0) return self._file def _set_file(self, value): self._file = value file = property(_get_file, _set_file) def read(self, *args, **kwargs): if 'r' not in self._mode: raise AttributeError("File was not opened in read mode.") return self._force_mode(super(S3Boto3StorageFile, self).read(*args, **kwargs)) def readline(self, *args, **kwargs): if 'r' not in self._mode: raise AttributeError("File was not opened in read mode.") return self._force_mode(super(S3Boto3StorageFile, self).readline(*args, **kwargs)) def write(self, content): if 'w' not in self._mode: raise AttributeError("File was not opened in write mode.") self._is_dirty = True if self._multipart is None: self._multipart = self.obj.initiate_multipart_upload( **self._storage._get_write_parameters(self.obj.key) ) if self.buffer_size <= self._buffer_file_size: self._flush_write_buffer() bstr = force_bytes(content) self._raw_bytes_written += len(bstr) return super(S3Boto3StorageFile, self).write(bstr) @property def _buffer_file_size(self): pos = self.file.tell() self.file.seek(0, os.SEEK_END) length = self.file.tell() self.file.seek(pos) return length def _flush_write_buffer(self): """ Flushes the write buffer. 
""" if self._buffer_file_size: self._write_counter += 1 self.file.seek(0) part = self._multipart.Part(self._write_counter) part.upload(Body=self.file.read()) self.file.seek(0) self.file.truncate() def _create_empty_on_close(self): """ Attempt to create an empty file for this key when this File is closed if no bytes have been written and no object already exists on S3 for this key. This behavior is meant to mimic the behavior of Django's builtin FileSystemStorage, where files are always created after they are opened in write mode: f = storage.open("file.txt", mode="w") f.close() """ assert "w" in self._mode assert self._raw_bytes_written == 0 try: # Check if the object exists on the server; if so, don't do anything self.obj.load() except ClientError as err: if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404: self.obj.put( Body=b"", **self._storage._get_write_parameters(self.obj.key) ) else: raise def close(self): if self._is_dirty: self._flush_write_buffer() # TODO: Possibly cache the part ids as they're being uploaded # instead of requesting parts from server. For now, emulating # s3boto's behavior. parts = [{'ETag': part.e_tag, 'PartNumber': part.part_number} for part in self._multipart.parts.all()] self._multipart.complete( MultipartUpload={'Parts': parts}) else: if self._multipart is not None: self._multipart.abort() if 'w' in self._mode and self._raw_bytes_written == 0: self._create_empty_on_close() if self._file is not None: self._file.close() self._file = None
def _fetch_brute_kddcup99(subset=None, data_home=None, download_if_missing=True, random_state=None, shuffle=False, percent10=False): """Load the kddcup99 dataset, downloading it if necessary. Parameters ---------- subset : None, 'SA', 'SF', 'http', 'smtp' To return the corresponding classical subsets of kddcup 99. If None, return the entire kddcup 99 dataset. data_home : string, optional Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing : boolean, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. random_state : int, RandomState instance or None, optional (default=None) Random state for shuffling the dataset. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. shuffle : bool, default=False Whether to shuffle dataset. percent10 : bool, default=False Whether to load only 10 percent of the data. Returns ------- dataset : dict-like object with the following attributes: dataset.data : numpy array of shape (494021, 41) Each row corresponds to the 41 features in the dataset. dataset.target : numpy array of shape (494021,) Each value corresponds to one of the 21 attack types or to the label 'normal.'. dataset.DESCR : string Description of the kddcup99 dataset. """ data_home = get_data_home(data_home=data_home) if sys.version_info[0] == 3: # The zlib compression format use by joblib is not compatible when # switching from Python 2 to Python 3, let us use a separate folder # under Python 3: dir_suffix = "-py3" else: # Backward compat for Python 2 users dir_suffix = "" if percent10: kddcup_dir = join(data_home, "kddcup99_10" + dir_suffix) else: kddcup_dir = join(data_home, "kddcup99" + dir_suffix) samples_path = join(kddcup_dir, "samples") targets_path = join(kddcup_dir, "targets") available = exists(samples_path) if download_if_missing and not available: _mkdirp(kddcup_dir) URL_ = URL10 if percent10 else URL logger.warning("Downloading %s" % URL_) f = BytesIO(urlopen(URL_).read()) dt = [('duration', int), ('protocol_type', 'S4'), ('service', 'S11'), ('flag', 'S6'), ('src_bytes', int), ('dst_bytes', int), ('land', int), ('wrong_fragment', int), ('urgent', int), ('hot', int), ('num_failed_logins', int), ('logged_in', int), ('num_compromised', int), ('root_shell', int), ('su_attempted', int), ('num_root', int), ('num_file_creations', int), ('num_shells', int), ('num_access_files', int), ('num_outbound_cmds', int), ('is_host_login', int), ('is_guest_login', int), ('count', int), ('srv_count', int), ('serror_rate', float), ('srv_serror_rate', float), ('rerror_rate', float), ('srv_rerror_rate', float), ('same_srv_rate', float), ('diff_srv_rate', float), ('srv_diff_host_rate', float), ('dst_host_count', int), ('dst_host_srv_count', int), ('dst_host_same_srv_rate', float), ('dst_host_diff_srv_rate', float), ('dst_host_same_src_port_rate', float), ('dst_host_srv_diff_host_rate', float), ('dst_host_serror_rate', float), ('dst_host_srv_serror_rate', float), ('dst_host_rerror_rate', float), ('dst_host_srv_rerror_rate', float), ('labels', 'S16')] DT = np.dtype(dt) file_ = GzipFile(fileobj=f, mode='r') Xy = [] for line in file_.readlines(): if six.PY3: line = line.decode() Xy.append(line.replace('\n', '').split(',')) file_.close() print('extraction 
done') Xy = np.asarray(Xy, dtype=object) for j in range(42): Xy[:, j] = Xy[:, j].astype(DT[j]) X = Xy[:, :-1] y = Xy[:, -1] # XXX bug when compress!=0: # (error: 'Incorrect data length while decompressing[...] the file # could be corrupted.') joblib.dump(X, samples_path, compress=0) joblib.dump(y, targets_path, compress=0) elif not available: if not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") try: X, y except NameError: X = joblib.load(samples_path) y = joblib.load(targets_path) if shuffle: X, y = shuffle_method(X, y, random_state=random_state) return Bunch(data=X, target=y, DESCR=__doc__)
"No rna file for genbank ID {}\n".format(genbankID)) continue elif rna_path.endswith('.gz'): handle = GzipFile(rna_path) else: handle = file(rna_path, 'r') for record in SeqIO.parse(handle, "fasta"): if "5S ribosomal RNA" in record.description: for i in record: if i in base_set: base_count += 1 if i in gc_set: gc_count += 1 else: continue handle.close() if base_count == 0: sys.stderr.write("{} has no 5S rRNA sequences.\n".format(goldID)) continue else: frac_gc = gc_count / float(base_count) query = "UPDATE GOLD_FEATURES SET rRNA_5S_GC = {} WHERE GOLD_ID = '{}';".format( frac_gc, goldID) c.execute(query) conn.commit() conn.close()
def _make_lib(self, version): # We get the payload from the remote host platform = sys.platform if platform == "linux2": platform = "linux" url = self._base_url + "{}/".format(version) name = "poetry-{}-{}.tar.gz".format(version, platform) checksum = "poetry-{}-{}.sha256sum".format(version, platform) try: r = urlopen(url + "{}".format(checksum)) except HTTPError as e: if e.code == 404: raise RuntimeError("Could not find {} file".format(checksum)) raise checksum = r.read().decode() try: r = urlopen(url + "{}".format(name)) except HTTPError as e: if e.code == 404: raise RuntimeError("Could not find {} file".format(name)) raise meta = r.info() size = int(meta["Content-Length"]) current = 0 block_size = 8192 print(" - Downloading {} ({:.2f}MB)".format(colorize("comment", name), size / 1024 / 1024)) sha = hashlib.sha256() with temporary_directory(prefix="poetry-installer-") as dir_: tar = os.path.join(dir_, name) with open(tar, "wb") as f: while True: buffer = r.read(block_size) if not buffer: break current += len(buffer) f.write(buffer) sha.update(buffer) # Checking hashes if checksum != sha.hexdigest(): raise RuntimeError( "Hashes for {} do not match: {} != {}".format( name, checksum, sha.hexdigest())) gz = GzipFile(tar, mode="rb") try: with tarfile.TarFile(tar, fileobj=gz, format=tarfile.PAX_FORMAT) as f: f.extractall(POETRY_LIB) finally: gz.close()
def test_reload_old_style_pickled_ring(self): devs = [{ 'id': 0, 'zone': 0, 'weight': 1.0, 'ip': '10.1.1.1', 'port': 6000 }, { 'id': 1, 'zone': 0, 'weight': 1.0, 'ip': '10.1.1.1', 'port': 6000 }, None, { 'id': 3, 'zone': 2, 'weight': 1.0, 'ip': '10.1.2.1', 'port': 6000 }, { 'id': 4, 'zone': 2, 'weight': 1.0, 'ip': '10.1.2.2', 'port': 6000 }] intended_devs = [{ 'id': 0, 'region': 1, 'zone': 0, 'weight': 1.0, 'ip': '10.1.1.1', 'port': 6000, 'replication_ip': '10.1.1.1', 'replication_port': 6000 }, { 'id': 1, 'region': 1, 'zone': 0, 'weight': 1.0, 'ip': '10.1.1.1', 'port': 6000, 'replication_ip': '10.1.1.1', 'replication_port': 6000 }, None, { 'id': 3, 'region': 1, 'zone': 2, 'weight': 1.0, 'ip': '10.1.2.1', 'port': 6000, 'replication_ip': '10.1.2.1', 'replication_port': 6000 }, { 'id': 4, 'region': 1, 'zone': 2, 'weight': 1.0, 'ip': '10.1.2.2', 'port': 6000, 'replication_ip': '10.1.2.2', 'replication_port': 6000 }] # simulate an old-style pickled ring testgz = os.path.join(self.testdir, 'without_replication_or_region.ring.gz') ring_data = ring.RingData(self.intended_replica2part2dev_id, devs, self.intended_part_shift) # an old-style pickled ring won't have region data for dev in ring_data.devs: if dev: del dev["region"] gz_file = GzipFile(testgz, 'wb') pickle.dump(ring_data, gz_file, protocol=2) gz_file.close() self.ring = ring.Ring(self.testdir, reload_time=self.intended_reload_time, ring_name='without_replication_or_region') self.assertEquals(self.ring.devs, intended_devs)
def fromfile(f):
    infile = GzipFile(f)
    result = loads(infile.read())
    infile.close()
    return result
def _fetch_brute_kddcup99(data_home=None, download_if_missing=True, percent10=True): """Load the kddcup99 dataset, downloading it if necessary. Parameters ---------- data_home : str, default=None Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing : bool, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. percent10 : bool, default=True Whether to load only 10 percent of the data. Returns ------- dataset : :class:`~sklearn.utils.Bunch` Dictionary-like object, with the following attributes. data : ndarray of shape (494021, 41) Each row corresponds to the 41 features in the dataset. target : ndarray of shape (494021,) Each value corresponds to one of the 21 attack types or to the label 'normal.'. feature_names : list The names of the dataset columns target_names: list The names of the target columns DESCR : str Description of the kddcup99 dataset. """ data_home = get_data_home(data_home=data_home) dir_suffix = "-py3" if percent10: kddcup_dir = join(data_home, "kddcup99_10" + dir_suffix) archive = ARCHIVE_10_PERCENT else: kddcup_dir = join(data_home, "kddcup99" + dir_suffix) archive = ARCHIVE samples_path = join(kddcup_dir, "samples") targets_path = join(kddcup_dir, "targets") available = exists(samples_path) dt = [ ("duration", int), ("protocol_type", "S4"), ("service", "S11"), ("flag", "S6"), ("src_bytes", int), ("dst_bytes", int), ("land", int), ("wrong_fragment", int), ("urgent", int), ("hot", int), ("num_failed_logins", int), ("logged_in", int), ("num_compromised", int), ("root_shell", int), ("su_attempted", int), ("num_root", int), ("num_file_creations", int), ("num_shells", int), ("num_access_files", int), ("num_outbound_cmds", int), ("is_host_login", int), ("is_guest_login", int), ("count", int), ("srv_count", int), ("serror_rate", float), ("srv_serror_rate", float), ("rerror_rate", float), ("srv_rerror_rate", float), ("same_srv_rate", float), ("diff_srv_rate", float), ("srv_diff_host_rate", float), ("dst_host_count", int), ("dst_host_srv_count", int), ("dst_host_same_srv_rate", float), ("dst_host_diff_srv_rate", float), ("dst_host_same_src_port_rate", float), ("dst_host_srv_diff_host_rate", float), ("dst_host_serror_rate", float), ("dst_host_srv_serror_rate", float), ("dst_host_rerror_rate", float), ("dst_host_srv_rerror_rate", float), ("labels", "S16"), ] column_names = [c[0] for c in dt] target_names = column_names[-1] feature_names = column_names[:-1] if available: try: X = joblib.load(samples_path) y = joblib.load(targets_path) except Exception as e: raise IOError( "The cache for fetch_kddcup99 is invalid, please delete " f"{str(kddcup_dir)} and run the fetch_kddcup99 again") from e elif download_if_missing: _mkdirp(kddcup_dir) logger.info("Downloading %s" % archive.url) _fetch_remote(archive, dirname=kddcup_dir) DT = np.dtype(dt) logger.debug("extracting archive") archive_path = join(kddcup_dir, archive.filename) file_ = GzipFile(filename=archive_path, mode="r") Xy = [] for line in file_.readlines(): line = line.decode() Xy.append(line.replace("\n", "").split(",")) file_.close() logger.debug("extraction done") os.remove(archive_path) Xy = np.asarray(Xy, dtype=object) for j in range(42): Xy[:, j] = Xy[:, j].astype(DT[j]) X = Xy[:, :-1] y = Xy[:, -1] # XXX bug when compress!=0: # (error: 'Incorrect data length while decompressing[...] 
the file # could be corrupted.') joblib.dump(X, samples_path, compress=0) joblib.dump(y, targets_path, compress=0) else: raise IOError("Data not found and `download_if_missing` is False") return Bunch( data=X, target=y, feature_names=feature_names, target_names=[target_names], )
def eval(): files = glob(data + "/" + "*.zip") files.sort() print len(files), "found" for fileName in files[:]: print fileName # s_time = time.time() smp = pp.GestureSample(fileName) # print "loading", (time.time()-s_time)/1000.,"ms" # s_time = time.time() n = smp.data['numFrames'] dv, uv, gv = smp.depth, smp.user, smp.rgb cur_fr = 1 # new_shape = (step,128,128) s = [] d, u, g = [empty((n_f, ) + vid_res + (3, ), "uint8") for _ in range(3)] # take first n_f frames for v in dv, uv, gv: pp.go_to_frame(v, cur_fr) for i, fr in enumerate(range(cur_fr, cur_fr + n_f)): s.append(smp.getSkeleton(fr)) d[i], u[i], g[i] = [v.read()[1] for v in dv, uv, gv] d, u, g = [pp.to_grayscale(v) for v in d, u, g] u[u < 128], u[u >= 128] = 0, 1 depth, user, gray, skelet = d, u, g, s user_o = user.copy() depth_o = depth.copy() gray_o = gray.copy() # user_depth = depth_o[user_o==1] skelet, c = pp.proc_skelet(array(skelet).copy()) user = pp.proc_user(user) _, depth, c = pp.proc_depth(depth.copy(), user.copy(), user_o, array(skelet).copy()) gray, c = pp.proc_gray( gray.copy(), user, array(skelet).copy()) #user.copy!!!!!!!!!!!!!!!!!!! cur_fr += n_f predictions = [] while cur_fr + step < n: # time_start = time.time() sn = [] dn, un, gn = [ empty((step, ) + vid_res + (3, ), "uint8") for _ in range(3) ] # for v in dv,uv,gv: pp.go_to_frame(v, cur_fr) for i, fr in enumerate(range(cur_fr, cur_fr + step)): sn.append(smp.getSkeleton(fr)) dn[i], un[i], gn[i] = [v.read()[1] for v in dv, uv, gv] dn, un, gn = [pp.to_grayscale(v) for v in dn, un, gn] un[un < 128], un[un >= 128] = 0, 1 s = s[step:] + sn # s.extend(sn) skelet, c = pp.proc_skelet(s, _3D=False) # len_dump = len(depth_o[:step][user_o[:step]==1]) # un_d = dn[un==1] user_o[:-step] = user_o[step:] user_o[-step:] = un.copy() un = pp.proc_user(un, 3) user[:-step] = user[step:] user[-step:] = un.copy() depth_o[:-step] = depth_o[step:] depth_o[-step:] = dn.copy() gray_o[:-step] = gray_o[step:] gray_o[-step:] = gn.copy() _, depth, c = pp.proc_depth(depth_o.copy(), user.copy(), user_o, skelet) gray, c = pp.proc_gray(gray_o.copy(), user, skelet) traj2D, traj3D, ori, pheight, hand, center = skelet video = empty(( 1, 2, ) + gray.shape, dtype="uint8") video[0, 0] = gray.copy() video[0, 1] = depth.copy() video = video.swapaxes(1, 2) #(body-hand,gray-depth,fr,h,w) v_new = empty((1, 2, 2) + vid_shape, dtype="uint8") # p = pheight ratio = 0.25 for i in xrange(video.shape[0]): #batch if pheight < 10: pheight = 100 scale = ratio #+randi(2)/100. ofs = pheight * scale mid = video.shape[-1] / 2. sli = None if ofs < mid: start = int(round(mid - ofs)) end = int(round(mid + ofs)) sli = slice(start, end) for j in xrange(video.shape[2]): #maps for k in xrange(video.shape[3]): #frames #body img = video[i, 0, j, k] img = cut_img(img, 5) img = misc.imresize(img, (h, h)) # if j==0: img = 255-misc.imfilter(img,"contour") v_new[i, 0, j, k] = img #hand img = video[i, 1, j, k] img = img[sli, sli] img = misc.imresize(img, (h, h)) v_new[i, 1, j, k] = img # print "put" # pred_loop(v_new,cur_fr,n, fileName) x_.set_value(v_new.astype("float32"), borrow=True) pred = evalu_model()[0][0] predictions.append(pred) cur_fr += step predictions = array(predictions, float32) pred_file_name = fileName.split('/') pred_file_name = pred_file_name[-1].replace(".zip", "_prediction.zip") file = GzipFile(dst + "/" + pred_file_name, 'wb') dump(predictions, file, -1) file.close()
def tofile(f, obj):
    out = GzipFile(f, 'wb')
    out.write(dumps(obj))
    out.close()
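# A self-contained round trip for the tofile/fromfile pair above, assuming the
# dumps/loads in scope come from pickle; the file name is illustrative.
import pickle
from gzip import GzipFile

with GzipFile('obj.pkl.gz', 'wb') as out:
    out.write(pickle.dumps({'answer': 42}))
with GzipFile('obj.pkl.gz', 'rb') as infile:
    assert pickle.loads(infile.read()) == {'answer': 42}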
def close(self):
    GzipFile.close(self)
    self.fileobj_to_close.close()
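# Why a wrapper like the one above is needed: GzipFile.close() finalises the
# gzip stream but deliberately leaves the underlying fileobj open, so the
# caller (or a subclass such as this) must close it separately. Minimal
# demonstration, illustrative only.
from gzip import GzipFile
from io import BytesIO

buf = BytesIO()
gz = GzipFile(fileobj=buf, mode='wb')
gz.write(b'data')
gz.close()
assert not buf.closed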
def add_file(): tags = request.forms.get('tag_list') uploads = request.files.getlist('file') # Set Project project = request.forms.get('project') if project in project_list(): __project__.open(project) else: __project__.open('../') project = 'Main' db = Database() file_list = [] # Write temp file to disk with upload_temp() as temp_dir: for upload in uploads: file_path = os.path.join(temp_dir, upload.filename) with open(file_path, 'w') as tmp_file: tmp_file.write(upload.file.read()) # Zip Files if request.forms.get('compression') == 'zip': zip_pass = request.forms.get('zip_pass') try: with ZipFile(file_path) as zf: zf.extractall(temp_dir, pwd=zip_pass) for root, dirs, files in os.walk(temp_dir, topdown=False): for name in files: if not name == upload.filename: file_list.append(os.path.join(root, name)) except Exception as e: return template('error.tpl', error="Error with zipfile - {0}".format(e)) # GZip Files elif request.forms.get('compression') == 'gz': try: gzf = GzipFile(file_path, 'rb') decompress = gzf.read() gzf.close() with open(file_path[:-3], "wb") as df: df.write(decompress) file_list.append(file_path[:-3]) except Exception as e: return template( 'error.tpl', error="Error with gzipfile - {0}".format(e)) # BZip2 Files elif request.forms.get('compression') == 'bz2': try: bz2f = BZ2File(file_path, 'rb') decompress = bz2f.read() bz2f.close() with open(file_path[:-3], "wb") as df: df.write(decompress) file_list.append(file_path[:-3]) except Exception as e: return template( 'error.tpl', error="Error with bzip2file - {0}".format(e)) # Tar Files (any, including tar.gz tar.bz2) elif request.forms.get('compression') == 'tar': try: if not tarfile.is_tarfile(file_path): return template('error.tpl', error="This is not a tar file") with tarfile.open(file_path, 'r:*') as tarf: tarf.extractall(temp_dir) for root, dirs, files in os.walk(temp_dir, topdown=False): for name in files: if not name == upload.filename: file_list.append(os.path.join(root, name)) except Exception as e: return template('error.tpl', error="Error with tarfile - {0}".format(e)) # Non zip files elif request.forms.get('compression') == 'none': file_list.append(file_path) # Add each file for new_file in file_list: print new_file obj = File(new_file) new_path = store_sample(obj) success = True if new_path: # Add file to the database. success = db.add(obj=obj, tags=tags) if not success: return template( 'error.tpl', error="Unable to Store The File: {0}".format( upload.filename)) redirect("/project/{0}".format(project))
def close(self):
    # GzipFile.close() doesn't actually close anything.
    if self.mode == GZ_WRITE:
        self._write_gzip(None)
        self._reset_buffer()
    return GzipFile.close(self)
def _put_file(self, name, content):
    name = self._path(name)
    placeholder = False
    if self.cache:
        if not self.cache.exists(name):
            self.cache.save(name, 0, 0)
            placeholder = True
    content_type = mimetypes.guess_type(name)[0] or "application/x-octet-stream"
    headers = {}
    for pattern in self.headers:
        if pattern[0].match(name):
            headers = pattern[1].copy()
            break
    file_pos = content.tell()
    content.seek(0, 2)
    content_length = content.tell()
    content.seek(0)
    gz_cts = getattr(
        settings,
        'CUDDLYBUDDLY_STORAGE_S3_GZIP_CONTENT_TYPES',
        (
            'text/css',
            'application/javascript',
            'application/x-javascript'
        )
    )
    gz_content = None
    if content_length > 1024 and content_type in gz_cts:
        gz_content = StringIO()
        gzf = GzipFile(mode='wb', fileobj=gz_content)
        gzf.write(content.read())
        content.seek(0)
        gzf.close()
        gz_content.seek(0, 2)
        gz_content_length = gz_content.tell()
        gz_content.seek(0)
        if gz_content_length < content_length:
            content_length = gz_content_length
            headers.update({
                'Content-Encoding': 'gzip'
            })
        else:
            gz_content = None
    headers.update({
        'Content-Type': content_type,
        'Content-Length': str(content_length)
    })
    # Httplib in < 2.6 doesn't accept file like objects. Meanwhile in
    # >= 2.7 it will try to join a content str object with the headers which
    # results in encoding problems.
    if sys.version_info[0] == 2 and sys.version_info[1] < 6:
        content_to_send = gz_content.read() if gz_content is not None else content.read()
    else:
        content_to_send = gz_content if gz_content is not None else content
    response = self.connection.put(self.bucket, name, content_to_send, headers)
    content.seek(file_pos)
    if response.http_response.status != 200:
        if placeholder:
            self.cache.remove(name)
        raise S3Error(response.message)
    if self.cache:
        date = response.http_response.getheader('Date')
        date = timegm(parsedate(date))
        self.cache.save(name, size=content_length, mtime=date)
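# The helper below is a minimal standalone sketch of the decision _put_file()
# makes above: compress into an in-memory buffer, then keep the gzip body only
# if it is actually smaller than the original. The function name maybe_gzip and
# its bytes-in/bytes-out API are illustrative assumptions, not part of the
# original storage backend.
from gzip import GzipFile
from io import BytesIO


def maybe_gzip(payload):
    # Compress the payload into an in-memory buffer.
    buf = BytesIO()
    gz = GzipFile(mode='wb', fileobj=buf)
    gz.write(payload)
    gz.close()
    compressed = buf.getvalue()
    # Only advertise gzip if it actually shrank the body.
    if len(compressed) < len(payload):
        return compressed, {'Content-Encoding': 'gzip',
                            'Content-Length': str(len(compressed))}
    return payload, {'Content-Length': str(len(payload))}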
def __init__(self, derrickFile):
    self.derrickFile = derrickFile
    g = GzipFile(derrickFile, "rb")
    self.messages = [DerrickPacket(l.rstrip("\r\n")) for l in g]
    g.close()
def save_to_brain():
    print("SAVING TO DISK-----------------")
    print(db)
    gz = GzipFile(db_file, 'wb')
    dump(db, gz, -1)
    gz.close()
def transfer_yaml(): upload_folder = os.path.join(app.root_path, app.config['UPLOAD_FOLDER']) if request.method == 'GET': tarfile_backend = TemporaryFile(mode='wb+') yamlfile = TemporaryFile(mode='wb+') tarball = tarfile.open(fileobj=tarfile_backend, mode='w') yamlfile.write( bytes( export_challenges('export.yaml', 'export.d', upload_folder, tarball), "UTF-8")) tarinfo = tarfile.TarInfo('export.yaml') tarinfo.size = yamlfile.tell() yamlfile.seek(0) tarball.addfile(tarinfo, yamlfile) tarball.close() yamlfile.close() gzipfile_backend = TemporaryFile(mode='wb+') gzipfile = GzipFile(fileobj=gzipfile_backend, mode='wb') tarfile_backend.seek(0) shutil.copyfileobj(tarfile_backend, gzipfile) tarfile_backend.close() gzipfile.close() gzipfile_backend.seek(0) return send_file(gzipfile_backend, as_attachment=True, attachment_filename='export.tar.gz') if request.method == 'POST': if 'file' not in request.files: abort(400) file = request.files['file'] readmode = 'r:gz' if file.filename.endswith('.tar'): readmode = 'r' if file.filename.endswith('.bz2'): readmode = 'r:bz2' tempdir = mkdtemp() try: archive = tarfile.open(fileobj=file.stream, mode=readmode) if 'export.yaml' not in archive.getnames(): shutil.rmtree(tempdir) abort(400) # Check for atttempts to escape to higher dirs for member in archive.getmembers(): memberpath = os.path.normpath(member.name) if memberpath.startswith('/') or '..' in memberpath.split( '/'): shutil.rmtree(tempdir) abort(400) if member.linkname: linkpath = os.path.normpath(member.linkname) if linkpath.startswith('/') or '..' in linkpath.split( '/'): shutil.rmtree(tempdir) abort(400) archive.extractall(path=tempdir) except tarfile.TarError: shutil.rmtree(tempdir) print('b') abort(400) in_file = os.path.join(tempdir, 'export.yaml') import_challenges(in_file, upload_folder, move=True) shutil.rmtree(tempdir) return '1'
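# The member-path check in transfer_yaml() above guards against archive entries
# that try to escape the extraction directory. A minimal standalone sketch of
# the same check, assuming a tarfile.TarFile object; the function name
# safe_extractall is made up for illustration.
import os
import tarfile


def safe_extractall(archive, dest):
    # Reject any member whose name or link target starts with '/' or contains '..'.
    for member in archive.getmembers():
        for candidate in (member.name, member.linkname):
            if not candidate:
                continue
            path = os.path.normpath(candidate)
            if path.startswith('/') or '..' in path.split('/'):
                raise tarfile.TarError('unsafe path in archive: %r' % candidate)
    archive.extractall(path=dest)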
def writePackets(self, messages):
    g = GzipFile(self.derrickFile, "wb")
    for m in messages:
        g.write("%s\n" % str(m))
    g.close()
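# The reader __init__ and writePackets() above both treat a derrick file as
# gzip-compressed text with one record per line. A minimal round trip of that
# pattern in isolation; the file name and record strings are made up, and, like
# the surrounding snippets, this assumes Python 2 string semantics (GzipFile in
# write mode receives str, not bytes).
from gzip import GzipFile

out = GzipFile("records.drk", "wb")
for rec in ("first record", "second record"):
    out.write("%s\n" % rec)
out.close()

inp = GzipFile("records.drk", "rb")
records = [line.rstrip("\r\n") for line in inp]
inp.close()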
from gzip import GzipFile
from io import BytesIO

import boto3

s3 = boto3.client('s3')
bucket = 'bluebucket.mindvessel.net'

# Read in some example text, as unicode
with open("utext.txt") as fi:
    text_body = fi.read().decode("utf-8")

# A GzipFile must wrap a real file or a file-like object. We do not want to
# write to disk, so we use a BytesIO as a buffer.
gz_body = BytesIO()
gz = GzipFile(None, 'wb', 9, gz_body)
gz.write(text_body.encode('utf-8'))  # convert unicode strings to bytes!
gz.close()

# GzipFile has written the compressed bytes into our gz_body
s3.put_object(
    Bucket=bucket,
    Key='gztest.txt',  # Note: NO .gz extension!
    ContentType='text/plain',  # the original type
    ContentEncoding='gzip',  # MUST have or browsers will error
    Body=gz_body.getvalue())

retr = s3.get_object(Bucket=bucket, Key='gztest.txt')

# Now the fun part. Reading it back requires this little dance, because
# GzipFile insists that its underlying file-like thing implement tell and
# seek, but boto3's io stream does not.
bytestream = BytesIO(retr['Body'].read())
got_text = GzipFile(None, 'rb', fileobj=bytestream).read().decode('utf-8')
assert got_text == text_body
print "Failed creating dir: " + os.path.dirname( pack[0]) pass sendFile(os.path.join(options.packdir, packname), dirpack, options.move) else: # Package already exists in repo if options.overwrite: sendFile(os.path.join(options.packdir, packname), dirpack, options.move) else: if options.move: print 'File exists ' + packname + ', deleting...' os.remove(os.path.join(options.packdir, packname)) packfiles.remove(packname) if packlisttext != '': # Only bother with the Packages.gz file if there is a reason if not os.path.exists(os.path.dirname(dirlist)): try: os.makedirs(os.path.dirname(dirlist)) print "Creating dir: " + os.path.dirname(packlist[0]) except: print "Failed creating dir: " + os.path.dirname(packlist[0]) pass print "Writing file: " + packlist[0] + '.gz' packlistfile = file( dirlist + '.gz', 'ab' ) # If repo already has this Packages.gz file then add the new files to it. gzfile = GzipFile(dirlist, 'ab', 9, packlistfile) gzfile.write(packlisttext) gzfile.close() packlistfile.close()
class ezfio_obj(object): def __init__(self, read_only=False): self._filename = "EZFIO_File" self.buffer_rank = -1 self.read_only = read_only self.locks = {} def acquire_lock(self, var): locks = self.locks try: locks[var].acquire() except: locks[var] = threading.Lock() locks[var].acquire() def release_lock(self, var): self.locks[var].release() def set_read_only(self, v): self.read_only = v def get_read_only(self): return self.read_only def exists(self, path): if os.access(path + '/.version', os.F_OK) == 1: file = open(path + '/.version', "r") v = file.readline().strip() file.close() else: return False def mkdir(self, path): if self.read_only: self.error('Read-only file.') if self.exists(path): self.error('mkdir', 'Group ' + path + ' exists') try: os.mkdir(path.strip()) except OSError: pass file = open(path.strip() + '/.version', 'w') print >> file, self.version file.close() def error(self, where, txt): print '------------------------------------------------------------' print 'EZFIO File : ' + self.filename print 'EZFIO Error in : ' + where.strip() print '------------------------------------------------------------' print '' print txt.strip() print '' print '------------------------------------------------------------' raise IOError def get_filename(self): if not self.exists(self._filename): self.mkdir(self._filename) return self._filename def set_filename(self, filename): self._filename = filename filename = property(fset=set_filename, fget=get_filename) def set_file(self, filename): self.filename = filename if not self.exists(filename): self.mkdir(filename) self.mkdir(filename + "/ezfio") os.system(""" LANG= date > %s/ezfio/creation echo $USER > %s/ezfio/user echo %s > %s/ezfio/library""" % (filename, filename, self.LIBRARY, filename)) def open_write_buffer(self, dir, fil, rank): if self.read_only: self.error('Read-only file.') l_filename = dir.strip() + '/' + fil + '.gz' if self.buffer_rank != -1: self.error('open_write_buffer', 'Another buffered file is already open.') self.buffer_rank = rank assert (self.buffer_rank > 0) try: self.file = GzipFile(filename=l_filename, mode='wb7') except IOError: self.error('open_write_buffer', 'Unable to open buffered file.') self.file.write("%2d\n" % (rank, )) def open_read_buffer(self, dir, fil, rank): l_filename = dir.strip() + '/' + fil + '.gz' if self.buffer_rank != -1: self.error('open_read_buffer', 'Another buffered file is already open.') try: self.file = GzipFile(filename=l_filename, mode='rb') except IOError: self.error('open_read_buffer', 'Unable to open buffered file.') try: rank = eval(self.file.readline()) except IOError: self.error('open_read_buffer', 'Unable to read buffered file.') self.buffer_rank = rank assert (self.buffer_rank > 0) return rank def close_buffer(self): assert (self.buffer_rank > 0) self.buffer_rank = -1 self.file.close() def read_buffer(self, isize): if self.buffer_rank == -1: self.error('read_buffer', 'No buffered file is open.') indices = [] values = [] for i in xrange(isize): try: line = self.file.readline().split() except: return indices, values if len(line) == 0: return indices, values indices.append([int(i) for i in line[:-1]]) values.append(eval(line[-1])) return indices, values def write_buffer(self, indices, values, isize): if self.read_only: self.error('Read-only file.') if self.buffer_rank == -1: self.error('write_buffer', 'No buffered file is open.') for i in xrange(isize): for j in indices[i]: self.file.write("%4d " % (j, )) self.file.write("%24.15e\n" % (values[i], ))
class S3BotoStorageFile(File):
    """
    The default file object used by the S3BotoStorage backend.

    This file implements file streaming using boto's multipart
    uploading functionality. The file can be opened in read or
    write mode.

    This class extends Django's File class. However, the contained
    data is only the data contained in the current buffer. So you
    should not access the contained file object directly. You should
    access the data via this class.

    Warning: This file *must* be closed using the close() method in
    order to properly write the file to S3. Be sure to close the file
    in your application.
    """
    # TODO: Read/Write (rw) mode may be a bit undefined at the moment. Needs testing.
    # TODO: When Django drops support for Python 2.5, rewrite to use the
    #       BufferedIO streams in the Python 2.6 io module.
    buffer_size = setting('AWS_S3_FILE_BUFFER_SIZE', 5242880)

    def __init__(self, name, mode, storage, buffer_size=None):
        self._storage = storage
        self.name = name[len(self._storage.location):].lstrip('/')
        self._mode = mode
        self.key = storage.bucket.get_key(self._storage._encode_name(name))
        if not self.key and 'w' in mode:
            self.key = storage.bucket.new_key(storage._encode_name(name))
        self._is_dirty = False
        self._file = None
        self._multipart = None
        # 5 MB is the minimum part size (if there is more than one part).
        # Amazon allows up to 10,000 parts. The default supports uploads
        # up to roughly 50 GB. Increase the part size to accommodate
        # for files larger than this.
        if buffer_size is not None:
            self.buffer_size = buffer_size
        self._write_counter = 0
        if not hasattr(django_settings, 'AWS_DEFAULT_ACL'):
            warnings.warn(
                "The default behavior of S3BotoStorage is insecure. By default files "
                "and new buckets are saved with an ACL of 'public-read' (globally "
                "publicly readable). To change to using the bucket's default ACL "
                "set AWS_DEFAULT_ACL = None, otherwise to silence this warning "
                "explicitly set AWS_DEFAULT_ACL.")

    @property
    def size(self):
        return self.key.size

    def _get_file(self):
        if self._file is None:
            self._file = SpooledTemporaryFile(
                max_size=self._storage.max_memory_size,
                suffix='.S3BotoStorageFile',
                dir=setting('FILE_UPLOAD_TEMP_DIR'))
            if 'r' in self._mode:
                self._is_dirty = False
                self.key.get_contents_to_file(self._file)
                self._file.seek(0)
            if self._storage.gzip and self.key.content_encoding == 'gzip':
                self._file = GzipFile(mode=self._mode, fileobj=self._file)
        return self._file

    def _set_file(self, value):
        self._file = value

    file = property(_get_file, _set_file)

    def read(self, *args, **kwargs):
        if 'r' not in self._mode:
            raise AttributeError('File was not opened in read mode.')
        return super(S3BotoStorageFile, self).read(*args, **kwargs)

    def write(self, content, *args, **kwargs):
        if 'w' not in self._mode:
            raise AttributeError('File was not opened in write mode.')
        self._is_dirty = True
        if self._multipart is None:
            provider = self.key.bucket.connection.provider
            upload_headers = {}
            if self._storage.default_acl:
                upload_headers[provider.acl_header] = self._storage.default_acl
            upload_headers.update({
                'Content-Type': mimetypes.guess_type(self.key.name)[0] or
                self._storage.key_class.DefaultContentType
            })
            upload_headers.update(self._storage.headers)
            self._multipart = self._storage.bucket.initiate_multipart_upload(
                self.key.name,
                headers=upload_headers,
                reduced_redundancy=self._storage.reduced_redundancy,
                encrypt_key=self._storage.encryption,
            )
        if self.buffer_size <= self._buffer_file_size:
            self._flush_write_buffer()
        return super(S3BotoStorageFile, self).write(force_bytes(content), *args, **kwargs)

    @property
    def _buffer_file_size(self):
        pos = self.file.tell()
        self.file.seek(0, os.SEEK_END)
        length = self.file.tell()
        self.file.seek(pos)
        return length

    def _flush_write_buffer(self):
        if self._buffer_file_size:
            self._write_counter += 1
            self.file.seek(0)
            headers = self._storage.headers.copy()
            self._multipart.upload_part_from_file(self.file, self._write_counter,
                                                  headers=headers)
            self.file.seek(0)
            self.file.truncate()

    def close(self):
        if self._is_dirty:
            self._flush_write_buffer()
            self._multipart.complete_upload()
        else:
            if self._multipart is not None:
                self._multipart.cancel_upload()
        self.key.close()
        if self._file is not None:
            self._file.close()
            self._file = None
class ChunkedFile(object):
    """Compressed file writer/reader that stores data in chunks in a zip file.

    Transparently supports reading gzip files.
    """

    def __init__(self, filename, subfile='', mode='r', chunksize=131072,
                 autoflush=True):
        """Create a ChunkedFile object with given filename, I/O mode (r,w,a),
        and preferred chunk size.  If you wish to manually control the chunk
        boundaries using bookmark() or flush(), set autoflush=False."""
        if mode not in 'rwa':
            raise ValueError('Mode must be r or w or a')
        self._is_gzip = False
        if os.path.isdir(filename):
            assert mode == 'r'
            self.zip = UnpackedZipFile(filename, mode)
        else:
            try:
                self.zip = ZipFile(filename, mode, ZIP_DEFLATED)
            except BadZipfile:
                assert mode == 'r'
                # Transparent reading of gzip files
                # (relatively fast, pure-python, some limitations)
                self.zip = GzipFile(filename, mode)
                self._is_gzip = True
        self.prefix = '%s/c.' % str(subfile) if subfile else 'c.'
        self.mode = mode
        self.chunksize = chunksize
        self.autoflush = autoflush
        # List of available chunks
        if not self._is_gzip:
            self.chunks = self._chunks()
        # Determine current position
        if mode == 'r':
            self.eof = False
            self.chunkidx = -1
        else:
            self.eof = True
            self.chunkidx = len(self.chunks)-1
        if self.chunkidx >= 0:
            info = self.zip.getinfo(self.chunks[self.chunkidx].name)
            self.pos = self.chunks[self.chunkidx].pos + info.file_size
        else:
            self.pos = 0
        # Buffers
        self.nextbuf = []
        self.readbuf = ''
        self.writebuf = ''
        self._last_bookmark = None

    def _chunks(self):
        """Return a list of ChunkInfos, one for each chunk in the file."""
        offset = len(self.prefix)
        chunks = []
        for name in self.zip.namelist():
            # Check multifiles
            if not name[0:].startswith(self.prefix):
                continue
            nameinfo = name[offset:].split(',')
            pos = int(nameinfo[0], 16)
            bookmark = None
            if len(nameinfo) > 1:
                bookmark = urlsafe_b64decode(nameinfo[1])
            chunks.append(ChunkInfo(name=name, pos=pos, bookmark=bookmark))
        return sorted(chunks, key=lambda chunk: chunk.pos)

    def _next_chunk(self):
        """Read the next chunk into the read buffer."""
        if self._is_gzip:
            chunk = self.zip.read(self.chunksize)
            if not chunk:
                self.eof = True
                raise EOFError
            else:
                self.readbuf += chunk
            return
        self.chunkidx += 1
        if self.chunkidx >= len(self.chunks):
            self.eof = True
            raise EOFError
        else:
            self.readbuf += self.zip.read(self.chunks[self.chunkidx].name)

    def _flush(self, auto=True, bookmark=None):
        """Flush complete chunks from the write buffer.

        An incomplete chunk may be created (and the write buffer completely
        emptied) if auto=False"""
        if auto and not self.autoflush:
            return
        while self.writebuf and \
                (len(self.writebuf) >= self.chunksize or not auto):
            self.chunkidx += 1
            assert(self.chunkidx == len(self.chunks))
            chunkpos = self.pos-len(self.writebuf)
            chunkname = '%s%08x' % (self.prefix, chunkpos)
            chunkbookmark = None
            if bookmark and len(self.writebuf) <= self.chunksize:
                chunkname += ','+urlsafe_b64encode(bookmark)
                chunkbookmark = bookmark
            self.zip.writestr(chunkname, self.writebuf[:self.chunksize])
            self.writebuf = self.writebuf[self.chunksize:]
            self.chunks.append(ChunkInfo(name=chunkname, pos=chunkpos,
                                         bookmark=chunkbookmark))

    def close(self):
        """Close the file.  Must be called to avoid data loss."""
        self.flush()
        self.zip.close()

    def flush(self):
        """Flush all output to the file."""
        self._flush(auto=False)

    def bookmark(self, bookmark):
        """Possibly flush the file, writing a bookmark if doing so."""
        assert(not self._last_bookmark or bookmark >= self._last_bookmark)
        self._last_bookmark = bookmark
        if len(self.writebuf) >= (self.chunksize-self.chunksize/8):
            # Use 7/8 of a chunksize to avoid creating too many tiny overflow
            # chunks.
            self._flush(auto=False, bookmark=bookmark)

    def write(self, data):
        """Write data to be stored in the file."""
        assert(not self._is_gzip)
        self.writebuf += data
        self.pos += len(data)
        self._flush(auto=True)

    def read(self, size=-1):
        """Read data from the file."""
        try:
            while size < 0 or len(self.readbuf) < size:
                self._next_chunk()
        except EOFError:
            pass
        if size > 0:
            ret = self.readbuf[:size]
            self.readbuf = self.readbuf[size:]
        elif size < 0:
            ret = self.readbuf
            self.readbuf = ''
        elif size == 0:
            ret = ''
        self.pos += len(ret)
        return ret

    def next(self):
        """Return the next line from the file or raise StopIteration."""
        if self.nextbuf:
            self.pos += len(self.nextbuf[0])
            return self.nextbuf.pop(0)
        if self.eof and not self.readbuf:
            raise StopIteration
        # Find next line ending
        try:
            while '\n' not in self.readbuf:
                self._next_chunk()
        except EOFError:
            if '\n' not in self.readbuf:
                if self.readbuf:
                    return self.read(-1)
                else:
                    raise StopIteration
        # Split lines into separate buffer
        self.nextbuf = self.readbuf.splitlines(True)
        if self.readbuf[-1] != '\n':
            self.readbuf = self.nextbuf.pop()
        else:
            self.readbuf = ''
        return self.next()

    def seek(self, offset, whence=0):
        """Seek to a given byte position in the file.

        Currently limited to files opened for mode=r and whence current
        location or beginning of the file."""
        # Only simple writing is supported
        assert(self.mode == 'r')
        if whence == 0:
            pass
        elif whence == 1:
            offset = self.pos+offset
        elif whence == 2:
            raise NotImplementedError
        else:
            raise ValueError
        if self._is_gzip:
            assert(offset >= self.pos)
        else:
            # Find the correct chunk
            self.flush()
            self.nextbuf = []
            self.readbuf = ''
            self.chunkidx = -1
            self.pos = 0
            for idx, data in enumerate(self.chunks):
                if data.pos <= offset:
                    self.chunkidx = idx-1
                    self.pos = data.pos
        delta = offset-self.pos
        assert(delta >= 0)
        self.read(delta)
        assert(delta <= self.chunksize or self.eof or self._is_gzip)
        assert(self.pos == offset)

    def find_bookmark(self, bookmark, give_range=False):
        """Determine an appropriate seek position near bookmark."""
        pos = 0
        for chunk in self.chunks:
            if chunk.bookmark and chunk.bookmark < bookmark:
                pos = chunk.pos
        if give_range:
            ret_next = 0
            for chunk in self.chunks:
                if ret_next == 1:
                    assert(chunk.pos > pos)
                    return pos, chunk.pos
                elif chunk.bookmark and chunk.bookmark > bookmark:
                    ret_next = 1
            return pos, None
        else:
            return pos

    def tell(self):
        """Return the current byte position in the file."""
        return self.pos

    # def __enter__(...): return self
    # def __exit__(...): self.close()

    def __iter__(self):
        return self
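# A small illustrative round trip with ChunkedFile; the file name, record
# strings, and bookmark values are made up, and the module-level names the
# class relies on (ZipFile, ZIP_DEFLATED, BadZipfile, ChunkInfo,
# urlsafe_b64encode/urlsafe_b64decode, UnpackedZipFile) are assumed to be
# imported as in the original module. Like the surrounding snippets, this
# assumes Python 2 string semantics.
cf = ChunkedFile('log.zip', mode='w', chunksize=4096, autoflush=False)
for i in range(10000):
    cf.write('record %d\n' % i)
    cf.bookmark('%08d' % i)   # bookmarks must be non-decreasing
cf.close()                    # must be called, or buffered data is lost

cf = ChunkedFile('log.zip', mode='r')
pos, _ = cf.find_bookmark('00005000', give_range=True)
cf.seek(pos)
print(cf.next())              # a line at or shortly before the bookmarked record
cf.close()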
def _fetch_brute_kddcup99(data_home=None, download_if_missing=True, percent10=True): """Load the kddcup99 dataset, downloading it if necessary. Parameters ---------- data_home : string, optional Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing : boolean, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. percent10 : bool, default=True Whether to load only 10 percent of the data. Returns ------- dataset : dict-like object with the following attributes: dataset.data : numpy array of shape (494021, 41) Each row corresponds to the 41 features in the dataset. dataset.target : numpy array of shape (494021,) Each value corresponds to one of the 21 attack types or to the label 'normal.'. dataset.DESCR : string Description of the kddcup99 dataset. """ data_home = get_data_home(data_home=data_home) dir_suffix = "-py3" if percent10: kddcup_dir = join(data_home, "kddcup99_10" + dir_suffix) archive = ARCHIVE_10_PERCENT else: kddcup_dir = join(data_home, "kddcup99" + dir_suffix) archive = ARCHIVE samples_path = join(kddcup_dir, "samples") targets_path = join(kddcup_dir, "targets") available = exists(samples_path) if download_if_missing and not available: _mkdirp(kddcup_dir) logger.info("Downloading %s" % archive.url) _fetch_remote(archive, dirname=kddcup_dir) dt = [('duration', int), ('protocol_type', 'S4'), ('service', 'S11'), ('flag', 'S6'), ('src_bytes', int), ('dst_bytes', int), ('land', int), ('wrong_fragment', int), ('urgent', int), ('hot', int), ('num_failed_logins', int), ('logged_in', int), ('num_compromised', int), ('root_shell', int), ('su_attempted', int), ('num_root', int), ('num_file_creations', int), ('num_shells', int), ('num_access_files', int), ('num_outbound_cmds', int), ('is_host_login', int), ('is_guest_login', int), ('count', int), ('srv_count', int), ('serror_rate', float), ('srv_serror_rate', float), ('rerror_rate', float), ('srv_rerror_rate', float), ('same_srv_rate', float), ('diff_srv_rate', float), ('srv_diff_host_rate', float), ('dst_host_count', int), ('dst_host_srv_count', int), ('dst_host_same_srv_rate', float), ('dst_host_diff_srv_rate', float), ('dst_host_same_src_port_rate', float), ('dst_host_srv_diff_host_rate', float), ('dst_host_serror_rate', float), ('dst_host_srv_serror_rate', float), ('dst_host_rerror_rate', float), ('dst_host_srv_rerror_rate', float), ('labels', 'S16')] DT = np.dtype(dt) logger.debug("extracting archive") archive_path = join(kddcup_dir, archive.filename) file_ = GzipFile(filename=archive_path, mode='r') Xy = [] for line in file_.readlines(): line = line.decode() Xy.append(line.replace('\n', '').split(',')) file_.close() logger.debug('extraction done') os.remove(archive_path) Xy = np.asarray(Xy, dtype=object) for j in range(42): Xy[:, j] = Xy[:, j].astype(DT[j]) X = Xy[:, :-1] y = Xy[:, -1] # XXX bug when compress!=0: # (error: 'Incorrect data length while decompressing[...] the file # could be corrupted.') joblib.dump(X, samples_path, compress=0) joblib.dump(y, targets_path, compress=0) elif not available: if not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") try: X, y except NameError: X = joblib.load(samples_path) y = joblib.load(targets_path) return Bunch(data=X, target=y)