def _serialize_data(cls, data, gzip=False, df_column_names=False, gzip_compression_level=6):
    if six.PY3:
        raise Exception("python3 currently not supported for this method! Sorry!")
    # easy, it's already in a stringio ready to go
    if any([isinstance(data, c) for c in already_stringio]):
        # make sure we reset to beginning, just in case, in preparation of read()
        # must be compatible with StringIO and cStringIO, so can't use reset()
        data.seek(SEEK_SET)
        return data
    data_io = StringIO()
    if gzip:
        io = GzipFile(fileobj=data_io, mode='w', compresslevel=gzip_compression_level)
    else:
        io = data_io
    if any([isinstance(data, c) for c in (dict, list, str)]):
        io.write(json.dumps(data))
    elif isinstance(data, DataFrame) or isinstance(data, Series):
        data.to_csv(io, index=False, header=df_column_names)
    else:
        io.write(str(data))
    if gzip:
        io.close()
    data_io.seek(SEEK_SET)
    return data_io
def main():
    args = parse_args()
    with args.source as source:
        reader = csv.DictReader(source)
        for (run, sample), rows in groupby(reader, itemgetter('run', 'enum')):
            sample_name = format_sample_name(run, sample)
            filename1 = os.path.join(args.dest, sample_name + '_R1_001.fastq.gz')
            filename2 = os.path.join(args.dest, sample_name + '_R2_001.fastq.gz')
            print(filename1)
            with open(filename1, 'wb') as dest1, open(filename2, 'wb') as dest2:
                dest1_zip = GzipFile(fileobj=dest1)
                dest2_zip = GzipFile(fileobj=dest2)
                for i, row in enumerate(rows):
                    seq = row['string'].replace('-', '')
                    for j in range(3):  # Three duplicates so that G2P doesn't ignore it.
                        prefix = '@M454:01:000000000-AAAAA:1:1101:{}:{}'.format(
                            10*i + j, row['count'])
                        dest1_zip.write(prefix + ' 1:N:0:1\n')
                        dest2_zip.write(prefix + ' 2:N:0:1\n')
                        dest1_zip.write(seq + '\n')
                        dest2_zip.write(reverse_and_complement(seq) + '\n')
                        dest1_zip.write('+\n')
                        dest2_zip.write('+\n')
                        quality = 'A' * len(seq)
                        dest1_zip.write(quality + '\n')
                        dest2_zip.write(quality + '\n')
                dest1_zip.close()
                dest2_zip.close()
    print('Done.')
def _read_raw_athena(filename):
    """try to read athena project file as plain text, to determine validity
    """
    # try gzip
    text = None
    fh = None
    try:
        fh = GzipFile(filename)
        text = bytes2str(fh.read())
    except Exception:
        errtype, errval, errtb = sys.exc_info()
        text = None
    finally:
        # guard against GzipFile() itself failing, which would leave fh unbound
        if fh is not None:
            fh.close()

    if text is None:
        # try plain text file
        fh = None
        try:
            fh = open(filename, 'r')
            text = bytes2str(fh.read())
        except Exception:
            errtype, errval, errtb = sys.exc_info()
            text = None
        finally:
            if fh is not None:
                fh.close()
    return text
def gzip_generator(string_generator):
    """Return generator for gzipping given string generator.

    Example:
    >>> import StringIO
    >>> z = ''.join(gzip_generator(iter(['hello,', ' ', 'world!'])))
    >>> ''.join(gunzip_generator(StringIO.StringIO(z)))
    'hello, world!'
    """
    # Use gzip and not zlib to make proper gzip header.
    buffer = StringIO()
    gzip = GzipFile(fileobj=buffer, mode='w')
    # Yield header
    yield buffer.getvalue()
    buffer.truncate(0)
    for string in string_generator:
        gzip.write(string)
        gzip.flush()
        yield buffer.getvalue()
        buffer.truncate(0)
    # Flush
    gzip.close()
    yield buffer.getvalue()
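# A standalone sketch of the same streaming idea using zlib's gzip framing
# (wbits = 16 + MAX_WBITS) instead of GzipFile; the names below are
# illustrative and not part of the original module.
import zlib

def gzip_chunks(chunks):
    # compressobj with 16 + MAX_WBITS emits a proper gzip header and trailer.
    compressor = zlib.compressobj(9, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
    for chunk in chunks:
        data = compressor.compress(chunk)
        if data:
            yield data
    yield compressor.flush()

compressed = b''.join(gzip_chunks([b'hello,', b' ', b'world!']))
assert zlib.decompress(compressed, 16 + zlib.MAX_WBITS) == b'hello, world!'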
def get_compressed_file_data(file_path, compresslevel=5):
    compressed_buffer = BytesIO()
    gzip_file = GzipFile(mode='wb', compresslevel=compresslevel,
                         fileobj=compressed_buffer)
    try:
        fileobj = open(file_path, 'rb')
        while True:
            x = fileobj.read(65536)
            if not x:
                break
            gzip_file.write(x)
            x = None
        fileobj.close()
    except IOError as e:
        LOG.error(str(e))
        return None
    gzip_file.close()
    compressed_data = compressed_buffer.getvalue()
    compressed_buffer.close()
    return compressed_data
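# An alternative sketch for the same task using only the standard library:
# shutil.copyfileobj replaces the manual 64 KiB read/write loop above. The
# function name is illustrative, not from the original code.
import shutil
from gzip import GzipFile
from io import BytesIO

def gzip_file_to_bytes(file_path, compresslevel=5):
    buf = BytesIO()
    with open(file_path, 'rb') as src:
        gz = GzipFile(mode='wb', compresslevel=compresslevel, fileobj=buf)
        try:
            shutil.copyfileobj(src, gz, 65536)
        finally:
            gz.close()
    return buf.getvalue()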
def __init__(self, data):
    fd, fname = tempfile.mkstemp()
    gzd = GzipFile(mode='r', fileobj=StringIO(b64decode(data)))
    os.write(fd, gzd.read())
    os.close(fd)
    gzd.close()
    self.name = fname
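# A minimal round-trip check for the pattern used above (gzip wrapped in
# base64), kept entirely in memory; purely illustrative.
from base64 import b64decode, b64encode
from gzip import GzipFile
from io import BytesIO

original = b'example data'
buf = BytesIO()
gz = GzipFile(fileobj=buf, mode='wb')
gz.write(original)
gz.close()                      # close() flushes the gzip trailer into buf
encoded = b64encode(buf.getvalue())

restored = GzipFile(fileobj=BytesIO(b64decode(encoded)), mode='rb').read()
assert restored == original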
def test_content_encoding_gzip(self): kwargs = {'message': 'hello'} message = json.dumps(kwargs) fp = StringIO() try: f = GzipFile(fileobj=fp, mode='w') f.write(message) finally: f.close() key = self.projectkey.public_key secret = self.projectkey.secret_key with self.tasks(): resp = self.client.post( self.path, fp.getvalue(), content_type='application/octet-stream', HTTP_CONTENT_ENCODING='gzip', HTTP_X_SENTRY_AUTH=get_auth_header('_postWithHeader', key, secret), ) assert resp.status_code == 200, resp.content event_id = json.loads(resp.content)['id'] instance = Event.objects.get(event_id=event_id) assert instance.message == 'hello'
def write_sbml_model(cobra_model, filename, use_fbc_package=True, **kwargs):
    if not use_fbc_package:
        if libsbml is None:
            raise Exception("libSBML required to write non-fbc models")
        write_sbml2(cobra_model, filename, use_fbc_package=False, **kwargs)
        return
    # create xml
    xml = model_to_xml(cobra_model, **kwargs)
    write_args = {"encoding": "UTF-8"}
    if _with_lxml:
        write_args["pretty_print"] = True
    else:
        indent_xml(xml)
    # write xml to file
    should_close = True
    if hasattr(filename, "write"):
        xmlfile = filename
        should_close = False
    elif filename.endswith(".gz"):
        xmlfile = GzipFile(filename, "wb")
    elif filename.endswith(".bz2"):
        xmlfile = BZ2File(filename, "wb")
    else:
        xmlfile = open(filename, "wb")
    ElementTree(xml).write(xmlfile, **write_args)
    if should_close:
        xmlfile.close()
def write(self):
    if debug:
        print 'writing to disk'
    gz = GzipFile(database, 'wb')
    dump(db, gz, -1)
    gz.close()
    Pref.writing_to_disk = False
def _compress_string(self, s):
    """Gzip a given string."""
    zbuf = StringIO()
    zfile = GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    zfile.write(s)
    zfile.close()
    return zbuf.getvalue()
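# On Python 3.2+ the same one-shot compression is available as module-level
# helpers, which avoids the explicit buffer bookkeeping; a minimal sketch,
# not a drop-in replacement for the method above.
import gzip

blob = gzip.compress(b'hello world', compresslevel=6)
assert gzip.decompress(blob) == b'hello world'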
def get(self):
    from gzip import GzipFile
    try:
        # cStringIO is the C-accelerated implementation; fall back to StringIO.
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO

    data = self.get_data()
    data['gzipped'] = True
    json_response = self.json_response(data, finish=False)

    tmp_buffer = StringIO()
    gziped_buffer = GzipFile(
        fileobj=tmp_buffer,
        mode="wb",
        compresslevel=7)
    gziped_buffer.write(json_response)
    gziped_buffer.close()

    gzipped_data = tmp_buffer.getvalue()
    self.set_header("Content-Encoding", 'gzip')
    self.set_header("Content-Length", str(len(gzipped_data)))
    tmp_buffer.close()
    self.finish(gzipped_data)
def DecodeProcFile(proc_file): if len(proc_file) < 256: fd = open(proc_file) proc_file = fd.read(1024*1024) fd.close() if proc_file.find('Subsystem Id:') < 0: p = None try: from gzip import GzipFile from StringIO import StringIO s = StringIO(proc_file) gz = GzipFile(mode='r', fileobj=s) p = gz.read(1024*1024) gz.close() except: pass if p is None: try: from bz2 import decompress p = decompress(proc_file) except: pass if not p is None: proc_file = p return proc_file
def open(self): request = Request(self.url) request.add_header('User-Agent','lastfm-lda recommender v.0.0.-1') request.add_header('Accept-encoding', 'gzip') while True: URLLoadListener.num_connections+=1 response = None try: response = urlopen(request,timeout=10) if response.info().get('Content-Encoding') == 'gzip': f = GzipFile(fileobj=StringIO(response.read())) result = f.read() f.close() else: result = response.read() break except Exception, e: if self.retries>2: if isinstance(e, BadStatusLine): raise Exception("last.fm server does not respond (%s)" % e) raise e self.retries+=1 print self.url print "failed with", e print "retry #",self.retries print finally:
def _uncachedgenerate(self): """ Generates the Gzipped sitemap uncached data """ len_brains = len(self._catalogbrains()) if self.index is None: # no index specified in the url if len_brains < self.maxlen: # ok, we have few items, let's generate the standard sitemap xml = self.template() else: # a lot of items, let's generate a sitemap index xml = self.indextemplate() elif int(self.index)*self.maxlen >= len_brains: # bad index specified raise NotFound(self.context, '%s-%s' % (self.index, self.filename), self.request) else: # index specified in the url xml = self.template() if self.index is not None: filename = "%s-%s" % (self.index, self.filename) else: filename = self.filename fp = StringIO() gzip = GzipFile(filename, 'w', 9, fp) gzip.write(xml) gzip.close() data = fp.getvalue() fp.close() return data
class CompressingRequestWrapper(_makeBase()): """ A request wrapper with support for transport encoding compression. @ivar underlying: the request being wrapped. @type underlying: L{IRequest} @ivar encoding: the IANA-assigned name of the encoding. @type encoding: C{str} @ivar compressLevel: the level of gzip compression to apply. @type compressLevel: C{int} """ implements(IRequest) encoding = 'gzip' compressLevel = 6 def __init__(self, underlying): self.underlying = underlying self.setHeader('content-encoding', self.encoding) self._gzipFile = None # See setHeader docstring for more commentary. self.underlying.headers.pop('content-length', None) def setHeader(self, name, value): """ Discard the Content-Length header. When compression encoding is in use, the Content-Length header must indicate the length of the compressed content; since we are doing the compression on the fly, we don't actually know what the length is after compression, so we discard this header. If this is an HTTP/1.1 request, chunked transfer encoding should be used, softening the impact of losing this header. """ if name.lower() == 'content-length': return else: return self.underlying.setHeader(name, value) def write(self, data): """ Pass data through to the gzip layer. """ if self._gzipFile is None: self._gzipFile = GzipFile(fileobj=self.underlying, mode='wb', compresslevel=self.compressLevel) self._gzipFile.write(data) def finishRequest(self, success): """ Finish off gzip stream. """ if self._gzipFile is None: self.write('') self._gzipFile.close() self.underlying.finishRequest(success)
def handle_stackexchange_login(self, data):
    self.send_response(200)
    self.send_header("Content-type", "text/html")
    self.log_message(self.path)
    self.end_headers()

    c = Client(StackExchange, get_config())
    cred = c.flow.authorization_received(data)
    d = c.request("/me", body=urlencode({"site": "stackoverflow"}))

    self.wfile.write("<!DOCTYPE html>")
    self.wfile.write("<head><meta charset=\"utf-8\"/></head><body>")
    self.wfile.write("Access token: %s<br>" % cred.access_token)
    self.wfile.write("Type: %s<br>" % cred.token_type)
    self.wfile.write("Expires in: %d<br>" % cred.expires_in)

    # stackexchange gzips all data
    h = StringIO(d)
    gzip_data = GzipFile(fileobj=h)
    d = gzip_data.read()
    gzip_data.close()

    self.wfile.write(d)
    self.wfile.write("</body></html>")
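# A hedged sketch of the response-decoding step above as a reusable helper:
# only unwrap the body when the server actually declared gzip encoding.
# Names are illustrative and not part of the original handler.
from gzip import GzipFile
from io import BytesIO

def maybe_decompress(body, headers):
    if headers.get('Content-Encoding') == 'gzip':
        return GzipFile(fileobj=BytesIO(body)).read()
    return body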
def save(self, filename, mtime=1300507380.0):
    """
    Serialize this RingData instance to disk.

    :param filename: File into which this instance should be serialized.
    :param mtime: time used to override mtime for gzip, default or None
                  if the caller wants to include time
    """
    # Override the timestamp so that the same ring data creates
    # the same bytes on disk. This makes a checksum comparison a
    # good way to see if two rings are identical.
    #
    # This only works on Python 2.7; on 2.6, we always get the
    # current time in the gzip output.
    tempf = NamedTemporaryFile(dir=".", prefix=filename, delete=False)
    if 'mtime' in inspect.getargspec(GzipFile.__init__).args:
        gz_file = GzipFile(filename, mode='wb', fileobj=tempf, mtime=mtime)
    else:
        gz_file = GzipFile(filename, mode='wb', fileobj=tempf)
    self.serialize_v1(gz_file)
    gz_file.close()
    tempf.flush()
    os.fsync(tempf.fileno())
    tempf.close()
    os.chmod(tempf.name, 0o644)
    os.rename(tempf.name, filename)
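# A quick demonstration of why the mtime override matters: with a fixed mtime
# the same payload always produces byte-identical gzip output, so ring files
# can be compared by checksum. Illustrative only; assumes a GzipFile that
# accepts the mtime keyword (Python 2.7+).
from gzip import GzipFile
from io import BytesIO

def gzip_bytes(data, mtime):
    buf = BytesIO()
    with GzipFile(mode='wb', fileobj=buf, mtime=mtime) as gz:
        gz.write(data)
    return buf.getvalue()

assert gzip_bytes(b'ring data', 1300507380.0) == gzip_bytes(b'ring data', 1300507380.0)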
def compress_string(s):
    # avg_block_size is actually the reciprocal of the average
    # intended interflush distance.
    rnd = Random(s)
    flushes_remaining = FLUSH_LIMIT
    if len(s) < AVERAGE_SPAN_BETWEEN_FLUSHES * APPROX_MIN_FLUSHES:
        avg_block_size = APPROX_MIN_FLUSHES / float(len(s) + 1)
    else:
        avg_block_size = 1.0 / AVERAGE_SPAN_BETWEEN_FLUSHES
    s = StringIO(s) if isinstance(s, six.text_type) else BytesIO(s)
    zbuf = BytesIO()
    zfile = GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    chunk = s.read(MIN_INTERFLUSH_INTERVAL + int(rnd.expovariate(avg_block_size)))
    while chunk and flushes_remaining:
        zfile.write(chunk)
        zfile.flush()
        flushes_remaining -= 1
        chunk = s.read(MIN_INTERFLUSH_INTERVAL + int(rnd.expovariate(avg_block_size)))
    zfile.write(chunk)
    zfile.write(s.read())
    zfile.close()
    return zbuf.getvalue()
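# A small sanity check that interleaved flush() calls (which insert sync
# points, as compress_string above relies on) still yield a stream that
# decompresses back to the original bytes; illustrative only, and
# gzip.decompress needs Python 3.2+.
import gzip
from io import BytesIO

buf = BytesIO()
gz = gzip.GzipFile(mode='wb', fileobj=buf)
for chunk in (b'alpha', b'bravo', b'charlie'):
    gz.write(chunk)
    gz.flush()
gz.close()
assert gzip.decompress(buf.getvalue()) == b'alphabravocharlie'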
class NBTFile(TAG_Compound): """Represents an NBT file object""" def __init__(self, filename=None, mode=None, buffer=None): super(NBTFile,self).__init__() self.__class__.__name__ = "TAG_Compound" if filename: self.file = GzipFile(filename, mode) self.parse_file(self.file) def parse_file(self, file=None): if not file: file = self.file if file: self.type = TAG_Byte(buffer=file) if self.type.value == self.id: name = TAG_String(buffer=file) self._parse_buffer(file) self.name = name self.file.close() else: raise ValueError("First record is not a Compound Tag") def write_file(self, filename=None, file=None): if file: self.file = file elif filename: self.file = GzipFile(filename, "wb") else: raise ValueError("Need to specify either a filename or a file") #Render tree to file self.type._render_buffer(file) self.name._render_buffer(file) self._render_buffer(file)
def build_index_gzip(self): """creates sorted index from gzip-compressed queue. caches object regardless of caccheobj flag. """ self.index = [] zf = GzipFile(fileobj=self.map, mode="rb") while 1: p = zf.tell() # just for diagnosis use try: l = zf.readline() except IOError as ex: # probably CRC error due to truncated file. discard the rest. logging.error("error in %s at %d: %s", self.fn, p, str(ex)) break if not l: break if l[0] != " ": continue try: o = cjson.decode(l[1:]) except Exception as ex: logging.warn("skipping malformed JSON at %s:%d: %s", self.fn, p, l[1:]) continue key = o.get("id") if key is None: try: key = self.urikey(o) except UnicodeEncodeError: pass if key is None: logging.error("urikey->None for %s", str(o)) continue self.index.append((key, o)) zf.close()
def gzip_media(self, filedata):
    """gzip encodes a given stream of data."""
    gzip_data = StringIO()
    gzf = GzipFile(fileobj=gzip_data, mode="wb")
    gzf.write(filedata)
    gzf.close()
    return gzip_data.getvalue()
def main(argv):
    args = argv[1:] or ["-"]

    class TitleExtractor(MWXMLDumpParser):
        def start_revision(self, pageid, title, revid, timestamp):
            print(pageid, title)
            return

    for path in args:
        if path == "-":
            fp = sys.stdin
        elif path.endswith(".gz"):
            from gzip import GzipFile
            fp = GzipFile(path)
        elif path.endswith(".bz2"):
            from bz2 import BZ2File
            fp = BZ2File(path)
        else:
            fp = open(path)
        parser = TitleExtractor()
        parser.feed_file(fp)
        fp.close()
        parser.close()
    return 0
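# The extension-based dispatch above can be factored into a small helper;
# this is a sketch with an illustrative name, not part of the original script.
def open_dump(path):
    if path == '-':
        import sys
        return sys.stdin
    if path.endswith('.gz'):
        from gzip import GzipFile
        return GzipFile(path)
    if path.endswith('.bz2'):
        from bz2 import BZ2File
        return BZ2File(path)
    return open(path)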
def save(self, filename):
    """
    Serialize this RingData instance to disk.

    :param filename: File into which this instance should be serialized.
    """
    # Override the timestamp so that the same ring data creates
    # the same bytes on disk. This makes a checksum comparison a
    # good way to see if two rings are identical.
    #
    # This only works on Python 2.7; on 2.6, we always get the
    # current time in the gzip output.
    tempf = NamedTemporaryFile(dir=".", prefix=filename, delete=False)
    try:
        gz_file = GzipFile(filename, mode='wb', fileobj=tempf,
                           mtime=1300507380.0)
    except TypeError:
        gz_file = GzipFile(filename, mode='wb', fileobj=tempf)
    self.serialize_v1(gz_file)
    gz_file.close()
    tempf.flush()
    os.fsync(tempf.fileno())
    tempf.close()
    os.chmod(tempf.name, 0o644)
    os.rename(tempf.name, filename)
def handleResponse(self, response): if self.quietLoss: return if self.failed: self.factory.noPage( failure.Failure( error.Error( self.status, self.message, response))) elif self.length != None and self.length != 0: self.factory.noPage(failure.Failure( client.PartialDownloadError(self.status, self.message, response))) else: if self.decode: s = StringIO() s.write(response) s.seek(-1) g = GzipFile(fileobj=s, mode='rb') try: response = g.read() except IOError: self.factory.noPage(failure.Failure( client.PartialDownloadError(self.status, self.message, response))) self.transport.loseConnection() return g.close() self.factory.page(response) # server might be stupid and not close connection. self.transport.loseConnection()
def savematch(data, filename=''): """data must have the following format: dictionary from Matrix to Sequence to Index to Score""" #Maybe one should add a security policy for allowed filenames. #e.g. do not allow '/' in filename. if filename=='': a=localtime() filename='eel_'+str(a.tm_year)+'_'+str(a.tm_mon)+'_'+str(a.tm_mday)+'_'+str(a.tm_hour)+'_'+str(a.tm_min)+'.gff' try: if filename[-3:]==".gz": try: F=GzipFile(filename,"w") except NameError: filename=filename[:-3] F=open(filename,'w') else: F=open(filename,'w') ## This is in wrong format Seq and Matr are reversed. ## for Matr in data.keys(): ## for Seq in data[Matr].keys(): ## for Pos,Strand in data[Matr][Seq].keys(): ## F.write("%s\teel\t%s\t%d\t%d\t%f\t%s\t.\n"%(Seq,Matr.getName(),Pos,Pos+len(Matr)-1,data[Matr][Seq][(Pos,Strand)],Strand)) F.write(get(data)) F.close() return filename except IOError, (errno, strerror): print "I/O error(%s): %s" % (errno, strerror) return ''
def run(self): if not Pref.writing_to_disk: Pref.writing_to_disk = True print_line() print_debug('WRITING TO DISK') start = time.time() while len(db) > Pref.max_database_records: db.popitem(last = False) gz = GzipFile(database+'.tmp', 'wb') dump(db, gz, -1) gz.close() try: remove(database) except: pass try: rename(database+'.tmp', database) except: pass print_debug('time expend writting to disk', time.time()-start) Pref.writing_to_disk = False
def write_to(self, out, newline='\x0D\x0A', gzip=False):
    if gzip:
        out = GzipFile(fileobj=out)
    self._write_to(out, newline)
    if gzip:
        out.flush()
        out.close()
def testPostMethodDeCompressesDeflatedBody_gzip(self): self.requestData = None def handler(**kwargs): self.requestData = kwargs reactor = Reactor() server = HttpServer(reactor, self.port, handler, timeout=0.01) server.listen() sok = socket() sok.connect(('localhost', self.port)) bodyData = 'bodydatabodydata' _sio = StringIO() _gzFileObj = GzipFile(filename=None, mode='wb', compresslevel=6, fileobj=_sio) _gzFileObj.write(bodyData); _gzFileObj.close() compressedBodyData = _sio.getvalue() bodyDataCompressed = compress(bodyData) contentLengthCompressed = len(bodyDataCompressed) sok.send(('POST / HTTP/1.0\r\nContent-Type: application/x-www-form-urlencoded\r\nContent-Length: %d\r\nContent-Encoding: gzip\r\n\r\n' % contentLengthCompressed) + bodyDataCompressed) while not self.requestData: reactor.step() self.assertEquals(dict, type(self.requestData)) self.assertTrue('Headers' in self.requestData) headers = self.requestData['Headers'] self.assertEquals('POST', self.requestData['Method']) self.assertEquals('application/x-www-form-urlencoded', headers['Content-Type']) self.assertEquals(contentLengthCompressed, int(headers['Content-Length'])) self.assertTrue('Body' in self.requestData) self.assertEquals('bodydatabodydata', self.requestData['Body'])
def readDerrick(path):
    g = GzipFile(path, "rb")
    messages = []
    for l in g:
        messages.append(urllib.unquote(l.rstrip("\r\n").split(" ", 4)[-1]))
    g.close()
    return messages
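# Since iterating a GzipFile yields decompressed lines, the loop above can be
# written as a comprehension; a sketch in the same Python 2 idiom as the
# original (urllib.unquote), with an illustrative function name.
from gzip import GzipFile
from urllib import unquote  # Python 2

def read_derrick(path):
    with GzipFile(path, 'rb') as g:
        return [unquote(l.rstrip('\r\n').split(' ', 4)[-1]) for l in g]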
def run_analogy_space_lang(lang): # Open files (fail early on errors) tensor_name = tensor_filename(lang) tensor_name_new = tensor_name+'_new' tensor_file = GzipFile(tensor_name_new, 'wb') svd_name = svd_filename(lang) svd_name_new = svd_name + '_new' # Load matrix logging.info('Loading %s'% lang) cnet_2d = conceptnet_2d_from_db(lang, identities=IDENTITIES, cutoff=CUTOFF) logging.info('Normalize %r' % cnet_2d) cnet_2d = cnet_2d.normalized() # Save tensor logging.info('Save tensor as %s' % tensor_name) pickle.dump(cnet_2d, tensor_file, -1) tensor_file.close() os.rename(tensor_name_new, tensor_name) logging.info('Running SVD') svd = cnet_2d.svd(k=100) # Save SVD logging.info('Save as %s' % svd_name) svd.save_pytables(svd_name_new) os.rename(svd_name_new, svd_name)
def _update(self, version): from poetry.utils.helpers import temporary_directory release_name = self._get_release_name(version) checksum = "{}.sha256sum".format(release_name) base_url = self.BASE_URL try: r = urlopen(base_url + "/{}/{}".format(version, checksum)) except HTTPError as e: if e.code == 404: raise RuntimeError("Could not find {} file".format(checksum)) raise checksum = r.read().decode().strip() # We get the payload from the remote host name = "{}.tar.gz".format(release_name) try: r = urlopen(base_url + "/{}/{}".format(version, name)) except HTTPError as e: if e.code == 404: raise RuntimeError("Could not find {} file".format(name)) raise meta = r.info() size = int(meta["Content-Length"]) current = 0 block_size = 8192 bar = self.progress_bar(max=size) bar.set_format(" - Downloading <info>{}</> <comment>%percent%%</>".format(name)) bar.start() sha = hashlib.sha256() with temporary_directory(prefix="poetry-updater-") as dir_: tar = os.path.join(dir_, name) with open(tar, "wb") as f: while True: buffer = r.read(block_size) if not buffer: break current += len(buffer) f.write(buffer) sha.update(buffer) bar.set_progress(current) bar.finish() # Checking hashes if checksum != sha.hexdigest(): raise RuntimeError( "Hashes for {} do not match: {} != {}".format( name, checksum, sha.hexdigest() ) ) gz = GzipFile(tar, mode="rb") try: with tarfile.TarFile(tar, fileobj=gz, format=tarfile.PAX_FORMAT) as f: f.extractall(str(self.lib)) finally: gz.close()
class S3Boto3StorageFile(File): """ The default file object used by the S3Boto3Storage backend. This file implements file streaming using boto's multipart uploading functionality. The file can be opened in read or write mode. This class extends Django's File class. However, the contained data is only the data contained in the current buffer. So you should not access the contained file object directly. You should access the data via this class. Warning: This file *must* be closed using the close() method in order to properly write the file to S3. Be sure to close the file in your application. """ # TODO: Read/Write (rw) mode may be a bit undefined at the moment. Needs testing. # TODO: When Django drops support for Python 2.5, rewrite to use the # BufferedIO streams in the Python 2.6 io module. buffer_size = 5242880 def __init__(self, name, mode, storage, buffer_size=None): self._storage = storage self.name = name[len(self._storage.location):].lstrip("/") self._mode = mode self.obj = storage.bucket.Object(storage._encode_name(name)) # NOTE(mattrobenolt): This is an explicit deviation from # django-storages. This adds an extra HEAD request before # every GET. This effectively doubles the time it takes for # every chunk in our filestore. We explicitly are opting # out of this behavior to avoid this overhead. # # if 'w' not in mode: # # Force early RAII-style exception if object does not exist # self.obj.load() self._is_dirty = False self._file = None self._multipart = None # 5 MB is the minimum part size (if there is more than one part). # Amazon allows up to 10,000 parts. The default supports uploads # up to roughly 50 GB. Increase the part size to accommodate # for files larger than this. if buffer_size is not None: self.buffer_size = buffer_size self._write_counter = 0 @property def size(self): return self.obj.content_length def _get_file(self): if self._file is None: with metrics.timer("filestore.read", instance="s3"): self._file = BytesIO() if "r" in self._mode: self._is_dirty = False self._file.write(self.obj.get()["Body"].read()) self._file.seek(0) if self._storage.gzip and self.obj.content_encoding == "gzip": self._file = GzipFile(mode=self._mode, fileobj=self._file, mtime=0.0) return self._file def _set_file(self, value): self._file = value file = property(_get_file, _set_file) def read(self, *args, **kwargs): if "r" not in self._mode: raise AttributeError("File was not opened in read mode.") return super().read(*args, **kwargs) def write(self, content): if "w" not in self._mode: raise AttributeError("File was not opened in write mode.") self._is_dirty = True if self._multipart is None: parameters = self._storage.object_parameters.copy() parameters["ACL"] = self._storage.default_acl parameters["ContentType"] = (mimetypes.guess_type(self.obj.key)[0] or self._storage.default_content_type) if self._storage.reduced_redundancy: parameters["StorageClass"] = "REDUCED_REDUNDANCY" if self._storage.encryption: parameters["ServerSideEncryption"] = "AES256" self._multipart = self.obj.initiate_multipart_upload(**parameters) if self.buffer_size <= self._buffer_file_size: self._flush_write_buffer() return super().write(force_bytes(content)) @property def _buffer_file_size(self): pos = self.file.tell() self.file.seek(0, os.SEEK_END) length = self.file.tell() self.file.seek(pos) return length def _flush_write_buffer(self): """ Flushes the write buffer. 
""" if self._buffer_file_size: self._write_counter += 1 self.file.seek(0) part = self._multipart.Part(self._write_counter) part.upload(Body=self.file.read()) def close(self): if self._is_dirty: self._flush_write_buffer() # TODO: Possibly cache the part ids as they're being uploaded # instead of requesting parts from server. For now, emulating # s3boto's behavior. parts = [{ "ETag": part.e_tag, "PartNumber": part.part_number } for part in self._multipart.parts.all()] self._multipart.complete(MultipartUpload={"Parts": parts}) else: if self._multipart is not None: self._multipart.abort() if self._file is not None: self._file.close() self._file = None
class S3Boto3StorageFile(File): """ The default file object used by the S3Boto3Storage backend. This file implements file streaming using boto's multipart uploading functionality. The file can be opened in read or write mode. This class extends Django's File class. However, the contained data is only the data contained in the current buffer. So you should not access the contained file object directly. You should access the data via this class. Warning: This file *must* be closed using the close() method in order to properly write the file to S3. Be sure to close the file in your application. """ # TODO: Read/Write (rw) mode may be a bit undefined at the moment. Needs testing. # TODO: When Django drops support for Python 2.5, rewrite to use the # BufferedIO streams in the Python 2.6 io module. buffer_size = setting('AWS_S3_FILE_BUFFER_SIZE', 5242880) def __init__(self, name, mode, storage, buffer_size=None): self._storage = storage self.name = name[len(self._storage.location):].lstrip('/') self._mode = mode self.obj = storage.bucket.Object(storage._encode_name(name)) if 'w' not in mode: # Force early RAII-style exception if object does not exist self.obj.load() self._is_dirty = False self._file = None self._multipart = None # 5 MB is the minimum part size (if there is more than one part). # Amazon allows up to 10,000 parts. The default supports uploads # up to roughly 50 GB. Increase the part size to accommodate # for files larger than this. if buffer_size is not None: self.buffer_size = buffer_size self._write_counter = 0 @property def size(self): return self.obj.content_length def _get_file(self): if self._file is None: self._file = SpooledTemporaryFile( max_size=self._storage.max_memory_size, suffix=".S3Boto3StorageFile", dir=setting("FILE_UPLOAD_TEMP_DIR", None)) if 'r' in self._mode: self._is_dirty = False self._file.write(self.obj.get()['Body'].read()) self._file.seek(0) if self._storage.gzip and self.obj.content_encoding == 'gzip': self._file = GzipFile(mode=self._mode, fileobj=self._file, mtime=0.0) return self._file def _set_file(self, value): self._file = value file = property(_get_file, _set_file) def read(self, *args, **kwargs): if 'r' not in self._mode: raise AttributeError("File was not opened in read mode.") return super(S3Boto3StorageFile, self).read(*args, **kwargs) def write(self, content): if 'w' not in self._mode: raise AttributeError("File was not opened in write mode.") self._is_dirty = True if self._multipart is None: parameters = self._storage.object_parameters.copy() parameters['ACL'] = self._storage.default_acl parameters['ContentType'] = (mimetypes.guess_type(self.obj.key)[0] or self._storage.default_content_type) if self._storage.reduced_redundancy: parameters['StorageClass'] = 'REDUCED_REDUNDANCY' if self._storage.encryption: parameters['ServerSideEncryption'] = 'AES256' self._multipart = self.obj.initiate_multipart_upload(**parameters) if self.buffer_size <= self._buffer_file_size: self._flush_write_buffer() return super(S3Boto3StorageFile, self).write(force_bytes(content)) @property def _buffer_file_size(self): pos = self.file.tell() self.file.seek(0, os.SEEK_END) length = self.file.tell() self.file.seek(pos) return length def _flush_write_buffer(self): """ Flushes the write buffer. 
""" if self._buffer_file_size: self._write_counter += 1 self.file.seek(0) part = self._multipart.Part(self._write_counter) part.upload(Body=self.file.read()) def close(self): if self._is_dirty: self._flush_write_buffer() # TODO: Possibly cache the part ids as they're being uploaded # instead of requesting parts from server. For now, emulating # s3boto's behavior. parts = [{ 'ETag': part.e_tag, 'PartNumber': part.part_number } for part in self._multipart.parts.all()] self._multipart.complete(MultipartUpload={'Parts': parts}) else: if self._multipart is not None: self._multipart.abort() if self._file is not None: self._file.close() self._file = None
line = pkginfos.split(':') if line[0] == 'Filename': filename = line[1].strip() elif line[0] == 'MD5sum': md5 = line[1].strip() elif line[0] == 'SHA1': sha1 = line[1].strip() elif line[0] == 'SHA256': sha256 = line[1].strip() if not md5 in channelPkgs and not sha1 in channelPkgs and not sha256 in channelPkgs: syncPkgs.append(filename) else: syncedPkgCount += 1 gzipfile.close() print "INFO: Packages in repo: %d" % repoPkgCount print "INFO: Packages synced: %d" % syncedPkgCount print "INFO: Packages to sync: %d" % len(syncPkgs) # download and push missing packages synced = 0 for pkg in syncPkgs: synced += 1 print "INFO: %d/%d: %s" % (synced, len(syncPkgs), os.path.basename(pkg)) # download url = urlopen(repoRoot + pkg) pkgFile = open(tempfile.gettempdir() + '/' + os.path.basename(pkg), 'wb')
def compress_string(s):
    zbuf = BytesIO()
    zfile = GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    zfile.write(s)
    zfile.close()
    return zbuf.getvalue()
def putter(put, put_queue, stat_queue, options): pid = current_process().pid log = logging.getLogger(os.path.basename(sys.argv[0])) connection, bucket = None, None file_object_cache = FileObjectCache() # Figure out what content types we want to gzip if not options.gzip_type: # default gzip_content_types = GZIP_CONTENT_TYPES elif 'all' in options.gzip_type: gzip_content_types = GZIP_ALL else: gzip_content_types = options.gzip_type if 'guess' in gzip_content_types: # don't bother removing 'guess' from the list since nothing will match it gzip_content_types.extend(GZIP_CONTENT_TYPES) if options.gzip: pass #log.debug('These content types will be gzipped: %s' % unicode(gzip_content_types)) while True: args = put_queue.get() #print args, pid if args is None: put_queue.task_done() break key_name, value_kwargs = args #print(666,value_kwargs) if options.gzip: key_name = '%s.gz' % key_name value = Value(file_object_cache, **value_kwargs) should_gzip = False try: if connection is None: connection = S3Connection(is_secure=options.secure, host=options.host) if bucket is None: bucket = connection.get_bucket(options.bucket, validate=False) key = put(bucket, key_name, value) if key: if value.should_copy_content(): if options.headers: headers = dict( tuple(header.split(':', 1)) for header in options.headers) else: headers = {} content_type = None if options.content_type: if options.content_type == 'guess': content_type = mimetypes.guess_type(value.path)[0] elif options.content_type == 'magic': content_type = mimetypes.guess_type(value.path)[0] if content_type is None: content_type = magic.from_file(value.path, mime=True) else: content_type = options.content_type headers['Content-Type'] = content_type content = value.get_content() md5 = value.md5 should_gzip = options.gzip and ( content_type and content_type in gzip_content_types or gzip_content_types == GZIP_ALL) if should_gzip: headers['Content-Encoding'] = 'gzip' string_io = StringIO() gzip_file = GzipFile(compresslevel=1, fileobj=string_io, mode='w') gzip_file.write(content) gzip_file.close() content = string_io.getvalue() md5 = compute_md5(StringIO(content)) if not options.dry_run: key.set_contents_from_string( content, headers, md5=md5, policy=options.grant, encrypt_key=options.encrypt_key) #log.info('%s %s> %s' % (value.path, 'z' if should_gzip else '-', key.name)) stat_queue.put(dict(size=value.get_size())) else: log.info('skipping %s -> %s' % (value.path, key_name)) except SSLError as exc: log.error('%s -> %s (%s)' % (value.path, key_name, exc)) put_queue.put(args) connection, bucket = None, None except IOError as exc: log.error('%s -> %s (%s)' % (value.path, key_name, exc)) put_queue.task_done()
class NBTFile(TAG_Compound): """Represent an NBT file object.""" def __init__(self, filename=None, buffer=None, fileobj=None): """ Create a new NBTFile object. Specify either a filename, file object or data buffer. If filename of file object is specified, data should be GZip-compressed. If a data buffer is specified, it is assumed to be uncompressed. If filename is specified, the file is closed after reading and writing. If file object is specified, the caller is responsible for closing the file. """ super(NBTFile, self).__init__() self.filename = filename self.type = TAG_Byte(self.id) closefile = True # make a file object if filename: self.filename = filename self.file = GzipFile(filename, 'rb') elif buffer: if hasattr(buffer, 'name'): self.filename = buffer.name self.file = buffer closefile = False elif fileobj: if hasattr(fileobj, 'name'): self.filename = fileobj.name self.file = GzipFile(fileobj=fileobj) else: self.file = None closefile = False # parse the file given initially if self.file: self.parse_file() if closefile: # Note: GzipFile().close() does NOT close the fileobj, # So we are still responsible for closing that. try: self.file.close() except (AttributeError, IOError): pass self.file = None def parse_file(self, filename=None, buffer=None, fileobj=None): """Completely parse a file, extracting all tags.""" closefile = True if filename: self.file = GzipFile(filename, 'rb') elif buffer: if hasattr(buffer, 'name'): self.filename = buffer.name self.file = buffer closefile = False elif fileobj: if hasattr(fileobj, 'name'): self.filename = fileobj.name self.file = GzipFile(fileobj=fileobj) if self.file: try: type = TAG_Byte(buffer=self.file) if type.value == self.id: name = TAG_String(buffer=self.file).value self._parse_buffer(self.file) self.name = name if closefile: self.file.close() else: raise MalformedFileError( "First record is not a Compound Tag") except StructError as e: raise MalformedFileError( "Partial File Parse: file possibly truncated.") else: raise ValueError("NBTFile.parse_file(): Need to specify either a " "filename or a file object") def write_file(self, filename=None, buffer=None, fileobj=None): """Write this NBT file to a file.""" closefile = True if buffer: self.filename = None self.file = buffer closefile = False elif filename: self.filename = filename self.file = GzipFile(filename, "wb") elif fileobj: self.filename = None self.file = GzipFile(fileobj=fileobj, mode="wb") elif self.filename: self.file = GzipFile(self.filename, "wb") elif not self.file: raise ValueError("NBTFile.write_file(): Need to specify either a " "filename or a file object") # Render tree to file TAG_Byte(self.id)._render_buffer(self.file) TAG_String(self.name)._render_buffer(self.file) self._render_buffer(self.file) # make sure the file is complete try: self.file.flush() except (AttributeError, IOError): pass if closefile: try: self.file.close() except (AttributeError, IOError): pass def __repr__(self): """ Return a string (ascii formated for Python 2, unicode for Python 3) describing the class, name and id for debugging purposes. """ if self.filename: return "<%s(%r) with %s(%r) at 0x%x>" % ( self.__class__.__name__, self.filename, TAG_Compound.__name__, self.name, id(self)) else: return "<%s with %s(%r) at 0x%x>" % (self.__class__.__name__, TAG_Compound.__name__, self.name, id(self))
class S3Boto3StorageFile(File): """ The default file object used by the S3Boto3Storage backend. This file implements file streaming using boto's multipart uploading functionality. The file can be opened in read or write mode. This class extends Django's File class. However, the contained data is only the data contained in the current buffer. So you should not access the contained file object directly. You should access the data via this class. Warning: This file *must* be closed using the close() method in order to properly write the file to S3. Be sure to close the file in your application. """ buffer_size = setting('AWS_S3_FILE_BUFFER_SIZE', 5242880) def __init__(self, name, mode, storage, buffer_size=None): if 'r' in mode and 'w' in mode: raise ValueError("Can't combine 'r' and 'w' in mode.") self._storage = storage self.name = name[len(self._storage.location):].lstrip('/') self._mode = mode self._force_mode = (lambda b: b) if 'b' in mode else force_text self.obj = storage.bucket.Object(storage._encode_name(name)) if 'w' not in mode: # Force early RAII-style exception if object does not exist self.obj.load() self._is_dirty = False self._raw_bytes_written = 0 self._file = None self._multipart = None # 5 MB is the minimum part size (if there is more than one part). # Amazon allows up to 10,000 parts. The default supports uploads # up to roughly 50 GB. Increase the part size to accommodate # for files larger than this. if buffer_size is not None: self.buffer_size = buffer_size self._write_counter = 0 @property def size(self): return self.obj.content_length def _get_file(self): if self._file is None: self._file = SpooledTemporaryFile( max_size=self._storage.max_memory_size, suffix=".S3Boto3StorageFile", dir=setting("FILE_UPLOAD_TEMP_DIR") ) if 'r' in self._mode: self._is_dirty = False self.obj.download_fileobj(self._file) self._file.seek(0) if self._storage.gzip and self.obj.content_encoding == 'gzip': self._file = GzipFile(mode=self._mode, fileobj=self._file, mtime=0.0) return self._file def _set_file(self, value): self._file = value file = property(_get_file, _set_file) def read(self, *args, **kwargs): if 'r' not in self._mode: raise AttributeError("File was not opened in read mode.") return self._force_mode(super(S3Boto3StorageFile, self).read(*args, **kwargs)) def readline(self, *args, **kwargs): if 'r' not in self._mode: raise AttributeError("File was not opened in read mode.") return self._force_mode(super(S3Boto3StorageFile, self).readline(*args, **kwargs)) def write(self, content): if 'w' not in self._mode: raise AttributeError("File was not opened in write mode.") self._is_dirty = True if self._multipart is None: self._multipart = self.obj.initiate_multipart_upload( **self._storage._get_write_parameters(self.obj.key) ) if self.buffer_size <= self._buffer_file_size: self._flush_write_buffer() bstr = force_bytes(content) self._raw_bytes_written += len(bstr) return super(S3Boto3StorageFile, self).write(bstr) @property def _buffer_file_size(self): pos = self.file.tell() self.file.seek(0, os.SEEK_END) length = self.file.tell() self.file.seek(pos) return length def _flush_write_buffer(self): """ Flushes the write buffer. 
""" if self._buffer_file_size: self._write_counter += 1 self.file.seek(0) part = self._multipart.Part(self._write_counter) part.upload(Body=self.file.read()) self.file.seek(0) self.file.truncate() def _create_empty_on_close(self): """ Attempt to create an empty file for this key when this File is closed if no bytes have been written and no object already exists on S3 for this key. This behavior is meant to mimic the behavior of Django's builtin FileSystemStorage, where files are always created after they are opened in write mode: f = storage.open("file.txt", mode="w") f.close() """ assert "w" in self._mode assert self._raw_bytes_written == 0 try: # Check if the object exists on the server; if so, don't do anything self.obj.load() except ClientError as err: if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404: self.obj.put( Body=b"", **self._storage._get_write_parameters(self.obj.key) ) else: raise def close(self): if self._is_dirty: self._flush_write_buffer() # TODO: Possibly cache the part ids as they're being uploaded # instead of requesting parts from server. For now, emulating # s3boto's behavior. parts = [{'ETag': part.e_tag, 'PartNumber': part.part_number} for part in self._multipart.parts.all()] self._multipart.complete( MultipartUpload={'Parts': parts}) else: if self._multipart is not None: self._multipart.abort() if 'w' in self._mode and self._raw_bytes_written == 0: self._create_empty_on_close() if self._file is not None: self._file.close() self._file = None
def _fetch_brute_kddcup99(subset=None, data_home=None, download_if_missing=True, random_state=None, shuffle=False, percent10=False): """Load the kddcup99 dataset, downloading it if necessary. Parameters ---------- subset : None, 'SA', 'SF', 'http', 'smtp' To return the corresponding classical subsets of kddcup 99. If None, return the entire kddcup 99 dataset. data_home : string, optional Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing : boolean, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. random_state : int, RandomState instance or None, optional (default=None) Random state for shuffling the dataset. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. shuffle : bool, default=False Whether to shuffle dataset. percent10 : bool, default=False Whether to load only 10 percent of the data. Returns ------- dataset : dict-like object with the following attributes: dataset.data : numpy array of shape (494021, 41) Each row corresponds to the 41 features in the dataset. dataset.target : numpy array of shape (494021,) Each value corresponds to one of the 21 attack types or to the label 'normal.'. dataset.DESCR : string Description of the kddcup99 dataset. """ data_home = get_data_home(data_home=data_home) if sys.version_info[0] == 3: # The zlib compression format use by joblib is not compatible when # switching from Python 2 to Python 3, let us use a separate folder # under Python 3: dir_suffix = "-py3" else: # Backward compat for Python 2 users dir_suffix = "" if percent10: kddcup_dir = join(data_home, "kddcup99_10" + dir_suffix) else: kddcup_dir = join(data_home, "kddcup99" + dir_suffix) samples_path = join(kddcup_dir, "samples") targets_path = join(kddcup_dir, "targets") available = exists(samples_path) if download_if_missing and not available: _mkdirp(kddcup_dir) URL_ = URL10 if percent10 else URL logger.warning("Downloading %s" % URL_) f = BytesIO(urlopen(URL_).read()) dt = [('duration', int), ('protocol_type', 'S4'), ('service', 'S11'), ('flag', 'S6'), ('src_bytes', int), ('dst_bytes', int), ('land', int), ('wrong_fragment', int), ('urgent', int), ('hot', int), ('num_failed_logins', int), ('logged_in', int), ('num_compromised', int), ('root_shell', int), ('su_attempted', int), ('num_root', int), ('num_file_creations', int), ('num_shells', int), ('num_access_files', int), ('num_outbound_cmds', int), ('is_host_login', int), ('is_guest_login', int), ('count', int), ('srv_count', int), ('serror_rate', float), ('srv_serror_rate', float), ('rerror_rate', float), ('srv_rerror_rate', float), ('same_srv_rate', float), ('diff_srv_rate', float), ('srv_diff_host_rate', float), ('dst_host_count', int), ('dst_host_srv_count', int), ('dst_host_same_srv_rate', float), ('dst_host_diff_srv_rate', float), ('dst_host_same_src_port_rate', float), ('dst_host_srv_diff_host_rate', float), ('dst_host_serror_rate', float), ('dst_host_srv_serror_rate', float), ('dst_host_rerror_rate', float), ('dst_host_srv_rerror_rate', float), ('labels', 'S16')] DT = np.dtype(dt) file_ = GzipFile(fileobj=f, mode='r') Xy = [] for line in file_.readlines(): if six.PY3: line = line.decode() Xy.append(line.replace('\n', '').split(',')) file_.close() print('extraction 
done') Xy = np.asarray(Xy, dtype=object) for j in range(42): Xy[:, j] = Xy[:, j].astype(DT[j]) X = Xy[:, :-1] y = Xy[:, -1] # XXX bug when compress!=0: # (error: 'Incorrect data length while decompressing[...] the file # could be corrupted.') joblib.dump(X, samples_path, compress=0) joblib.dump(y, targets_path, compress=0) elif not available: if not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") try: X, y except NameError: X = joblib.load(samples_path) y = joblib.load(targets_path) if shuffle: X, y = shuffle_method(X, y, random_state=random_state) return Bunch(data=X, target=y, DESCR=__doc__)
"No rna file for genbank ID {}\n".format(genbankID)) continue elif rna_path.endswith('.gz'): handle = GzipFile(rna_path) else: handle = file(rna_path, 'r') for record in SeqIO.parse(handle, "fasta"): if "5S ribosomal RNA" in record.description: for i in record: if i in base_set: base_count += 1 if i in gc_set: gc_count += 1 else: continue handle.close() if base_count == 0: sys.stderr.write("{} has no 5S rRNA sequences.\n".format(goldID)) continue else: frac_gc = gc_count / float(base_count) query = "UPDATE GOLD_FEATURES SET rRNA_5S_GC = {} WHERE GOLD_ID = '{}';".format( frac_gc, goldID) c.execute(query) conn.commit() conn.close()
def _make_lib(self, version): # We get the payload from the remote host platform = sys.platform if platform == "linux2": platform = "linux" url = self._base_url + "{}/".format(version) name = "poetry-{}-{}.tar.gz".format(version, platform) checksum = "poetry-{}-{}.sha256sum".format(version, platform) try: r = urlopen(url + "{}".format(checksum)) except HTTPError as e: if e.code == 404: raise RuntimeError("Could not find {} file".format(checksum)) raise checksum = r.read().decode() try: r = urlopen(url + "{}".format(name)) except HTTPError as e: if e.code == 404: raise RuntimeError("Could not find {} file".format(name)) raise meta = r.info() size = int(meta["Content-Length"]) current = 0 block_size = 8192 print(" - Downloading {} ({:.2f}MB)".format(colorize("comment", name), size / 1024 / 1024)) sha = hashlib.sha256() with temporary_directory(prefix="poetry-installer-") as dir_: tar = os.path.join(dir_, name) with open(tar, "wb") as f: while True: buffer = r.read(block_size) if not buffer: break current += len(buffer) f.write(buffer) sha.update(buffer) # Checking hashes if checksum != sha.hexdigest(): raise RuntimeError( "Hashes for {} do not match: {} != {}".format( name, checksum, sha.hexdigest())) gz = GzipFile(tar, mode="rb") try: with tarfile.TarFile(tar, fileobj=gz, format=tarfile.PAX_FORMAT) as f: f.extractall(POETRY_LIB) finally: gz.close()
def test_reload_old_style_pickled_ring(self): devs = [{ 'id': 0, 'zone': 0, 'weight': 1.0, 'ip': '10.1.1.1', 'port': 6000 }, { 'id': 1, 'zone': 0, 'weight': 1.0, 'ip': '10.1.1.1', 'port': 6000 }, None, { 'id': 3, 'zone': 2, 'weight': 1.0, 'ip': '10.1.2.1', 'port': 6000 }, { 'id': 4, 'zone': 2, 'weight': 1.0, 'ip': '10.1.2.2', 'port': 6000 }] intended_devs = [{ 'id': 0, 'region': 1, 'zone': 0, 'weight': 1.0, 'ip': '10.1.1.1', 'port': 6000, 'replication_ip': '10.1.1.1', 'replication_port': 6000 }, { 'id': 1, 'region': 1, 'zone': 0, 'weight': 1.0, 'ip': '10.1.1.1', 'port': 6000, 'replication_ip': '10.1.1.1', 'replication_port': 6000 }, None, { 'id': 3, 'region': 1, 'zone': 2, 'weight': 1.0, 'ip': '10.1.2.1', 'port': 6000, 'replication_ip': '10.1.2.1', 'replication_port': 6000 }, { 'id': 4, 'region': 1, 'zone': 2, 'weight': 1.0, 'ip': '10.1.2.2', 'port': 6000, 'replication_ip': '10.1.2.2', 'replication_port': 6000 }] # simulate an old-style pickled ring testgz = os.path.join(self.testdir, 'without_replication_or_region.ring.gz') ring_data = ring.RingData(self.intended_replica2part2dev_id, devs, self.intended_part_shift) # an old-style pickled ring won't have region data for dev in ring_data.devs: if dev: del dev["region"] gz_file = GzipFile(testgz, 'wb') pickle.dump(ring_data, gz_file, protocol=2) gz_file.close() self.ring = ring.Ring(self.testdir, reload_time=self.intended_reload_time, ring_name='without_replication_or_region') self.assertEquals(self.ring.devs, intended_devs)
def fromfile(f):
    infile = GzipFile(f)
    result = loads(infile.read())
    infile.close()
    return result
def _fetch_brute_kddcup99(data_home=None, download_if_missing=True, percent10=True): """Load the kddcup99 dataset, downloading it if necessary. Parameters ---------- data_home : str, default=None Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing : bool, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. percent10 : bool, default=True Whether to load only 10 percent of the data. Returns ------- dataset : :class:`~sklearn.utils.Bunch` Dictionary-like object, with the following attributes. data : ndarray of shape (494021, 41) Each row corresponds to the 41 features in the dataset. target : ndarray of shape (494021,) Each value corresponds to one of the 21 attack types or to the label 'normal.'. feature_names : list The names of the dataset columns target_names: list The names of the target columns DESCR : str Description of the kddcup99 dataset. """ data_home = get_data_home(data_home=data_home) dir_suffix = "-py3" if percent10: kddcup_dir = join(data_home, "kddcup99_10" + dir_suffix) archive = ARCHIVE_10_PERCENT else: kddcup_dir = join(data_home, "kddcup99" + dir_suffix) archive = ARCHIVE samples_path = join(kddcup_dir, "samples") targets_path = join(kddcup_dir, "targets") available = exists(samples_path) dt = [ ("duration", int), ("protocol_type", "S4"), ("service", "S11"), ("flag", "S6"), ("src_bytes", int), ("dst_bytes", int), ("land", int), ("wrong_fragment", int), ("urgent", int), ("hot", int), ("num_failed_logins", int), ("logged_in", int), ("num_compromised", int), ("root_shell", int), ("su_attempted", int), ("num_root", int), ("num_file_creations", int), ("num_shells", int), ("num_access_files", int), ("num_outbound_cmds", int), ("is_host_login", int), ("is_guest_login", int), ("count", int), ("srv_count", int), ("serror_rate", float), ("srv_serror_rate", float), ("rerror_rate", float), ("srv_rerror_rate", float), ("same_srv_rate", float), ("diff_srv_rate", float), ("srv_diff_host_rate", float), ("dst_host_count", int), ("dst_host_srv_count", int), ("dst_host_same_srv_rate", float), ("dst_host_diff_srv_rate", float), ("dst_host_same_src_port_rate", float), ("dst_host_srv_diff_host_rate", float), ("dst_host_serror_rate", float), ("dst_host_srv_serror_rate", float), ("dst_host_rerror_rate", float), ("dst_host_srv_rerror_rate", float), ("labels", "S16"), ] column_names = [c[0] for c in dt] target_names = column_names[-1] feature_names = column_names[:-1] if available: try: X = joblib.load(samples_path) y = joblib.load(targets_path) except Exception as e: raise IOError( "The cache for fetch_kddcup99 is invalid, please delete " f"{str(kddcup_dir)} and run the fetch_kddcup99 again") from e elif download_if_missing: _mkdirp(kddcup_dir) logger.info("Downloading %s" % archive.url) _fetch_remote(archive, dirname=kddcup_dir) DT = np.dtype(dt) logger.debug("extracting archive") archive_path = join(kddcup_dir, archive.filename) file_ = GzipFile(filename=archive_path, mode="r") Xy = [] for line in file_.readlines(): line = line.decode() Xy.append(line.replace("\n", "").split(",")) file_.close() logger.debug("extraction done") os.remove(archive_path) Xy = np.asarray(Xy, dtype=object) for j in range(42): Xy[:, j] = Xy[:, j].astype(DT[j]) X = Xy[:, :-1] y = Xy[:, -1] # XXX bug when compress!=0: # (error: 'Incorrect data length while decompressing[...] 
the file # could be corrupted.') joblib.dump(X, samples_path, compress=0) joblib.dump(y, targets_path, compress=0) else: raise IOError("Data not found and `download_if_missing` is False") return Bunch( data=X, target=y, feature_names=feature_names, target_names=[target_names], )
def eval(): files = glob(data + "/" + "*.zip") files.sort() print len(files), "found" for fileName in files[:]: print fileName # s_time = time.time() smp = pp.GestureSample(fileName) # print "loading", (time.time()-s_time)/1000.,"ms" # s_time = time.time() n = smp.data['numFrames'] dv, uv, gv = smp.depth, smp.user, smp.rgb cur_fr = 1 # new_shape = (step,128,128) s = [] d, u, g = [empty((n_f, ) + vid_res + (3, ), "uint8") for _ in range(3)] # take first n_f frames for v in dv, uv, gv: pp.go_to_frame(v, cur_fr) for i, fr in enumerate(range(cur_fr, cur_fr + n_f)): s.append(smp.getSkeleton(fr)) d[i], u[i], g[i] = [v.read()[1] for v in dv, uv, gv] d, u, g = [pp.to_grayscale(v) for v in d, u, g] u[u < 128], u[u >= 128] = 0, 1 depth, user, gray, skelet = d, u, g, s user_o = user.copy() depth_o = depth.copy() gray_o = gray.copy() # user_depth = depth_o[user_o==1] skelet, c = pp.proc_skelet(array(skelet).copy()) user = pp.proc_user(user) _, depth, c = pp.proc_depth(depth.copy(), user.copy(), user_o, array(skelet).copy()) gray, c = pp.proc_gray( gray.copy(), user, array(skelet).copy()) #user.copy!!!!!!!!!!!!!!!!!!! cur_fr += n_f predictions = [] while cur_fr + step < n: # time_start = time.time() sn = [] dn, un, gn = [ empty((step, ) + vid_res + (3, ), "uint8") for _ in range(3) ] # for v in dv,uv,gv: pp.go_to_frame(v, cur_fr) for i, fr in enumerate(range(cur_fr, cur_fr + step)): sn.append(smp.getSkeleton(fr)) dn[i], un[i], gn[i] = [v.read()[1] for v in dv, uv, gv] dn, un, gn = [pp.to_grayscale(v) for v in dn, un, gn] un[un < 128], un[un >= 128] = 0, 1 s = s[step:] + sn # s.extend(sn) skelet, c = pp.proc_skelet(s, _3D=False) # len_dump = len(depth_o[:step][user_o[:step]==1]) # un_d = dn[un==1] user_o[:-step] = user_o[step:] user_o[-step:] = un.copy() un = pp.proc_user(un, 3) user[:-step] = user[step:] user[-step:] = un.copy() depth_o[:-step] = depth_o[step:] depth_o[-step:] = dn.copy() gray_o[:-step] = gray_o[step:] gray_o[-step:] = gn.copy() _, depth, c = pp.proc_depth(depth_o.copy(), user.copy(), user_o, skelet) gray, c = pp.proc_gray(gray_o.copy(), user, skelet) traj2D, traj3D, ori, pheight, hand, center = skelet video = empty(( 1, 2, ) + gray.shape, dtype="uint8") video[0, 0] = gray.copy() video[0, 1] = depth.copy() video = video.swapaxes(1, 2) #(body-hand,gray-depth,fr,h,w) v_new = empty((1, 2, 2) + vid_shape, dtype="uint8") # p = pheight ratio = 0.25 for i in xrange(video.shape[0]): #batch if pheight < 10: pheight = 100 scale = ratio #+randi(2)/100. ofs = pheight * scale mid = video.shape[-1] / 2. sli = None if ofs < mid: start = int(round(mid - ofs)) end = int(round(mid + ofs)) sli = slice(start, end) for j in xrange(video.shape[2]): #maps for k in xrange(video.shape[3]): #frames #body img = video[i, 0, j, k] img = cut_img(img, 5) img = misc.imresize(img, (h, h)) # if j==0: img = 255-misc.imfilter(img,"contour") v_new[i, 0, j, k] = img #hand img = video[i, 1, j, k] img = img[sli, sli] img = misc.imresize(img, (h, h)) v_new[i, 1, j, k] = img # print "put" # pred_loop(v_new,cur_fr,n, fileName) x_.set_value(v_new.astype("float32"), borrow=True) pred = evalu_model()[0][0] predictions.append(pred) cur_fr += step predictions = array(predictions, float32) pred_file_name = fileName.split('/') pred_file_name = pred_file_name[-1].replace(".zip", "_prediction.zip") file = GzipFile(dst + "/" + pred_file_name, 'wb') dump(predictions, file, -1) file.close()
def tofile(f, obj):
    out = GzipFile(f, 'wb')
    out.write(dumps(obj))
    out.close()
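# A self-contained round trip for the tofile/fromfile pair above, assuming the
# dumps/loads in scope come from pickle; the file name is illustrative.
import pickle
from gzip import GzipFile

with GzipFile('obj.pkl.gz', 'wb') as out:
    out.write(pickle.dumps({'answer': 42}))
with GzipFile('obj.pkl.gz', 'rb') as infile:
    assert pickle.loads(infile.read()) == {'answer': 42}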
def close(self):
    GzipFile.close(self)
    self.fileobj_to_close.close()
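# Why a wrapper like the one above is needed: GzipFile.close() finalises the
# gzip stream but deliberately leaves the underlying fileobj open, so the
# caller (or a subclass such as this) must close it separately. Minimal
# demonstration, illustrative only.
from gzip import GzipFile
from io import BytesIO

buf = BytesIO()
gz = GzipFile(fileobj=buf, mode='wb')
gz.write(b'data')
gz.close()
assert not buf.closed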
def add_file(): tags = request.forms.get('tag_list') uploads = request.files.getlist('file') # Set Project project = request.forms.get('project') if project in project_list(): __project__.open(project) else: __project__.open('../') project = 'Main' db = Database() file_list = [] # Write temp file to disk with upload_temp() as temp_dir: for upload in uploads: file_path = os.path.join(temp_dir, upload.filename) with open(file_path, 'w') as tmp_file: tmp_file.write(upload.file.read()) # Zip Files if request.forms.get('compression') == 'zip': zip_pass = request.forms.get('zip_pass') try: with ZipFile(file_path) as zf: zf.extractall(temp_dir, pwd=zip_pass) for root, dirs, files in os.walk(temp_dir, topdown=False): for name in files: if not name == upload.filename: file_list.append(os.path.join(root, name)) except Exception as e: return template('error.tpl', error="Error with zipfile - {0}".format(e)) # GZip Files elif request.forms.get('compression') == 'gz': try: gzf = GzipFile(file_path, 'rb') decompress = gzf.read() gzf.close() with open(file_path[:-3], "wb") as df: df.write(decompress) file_list.append(file_path[:-3]) except Exception as e: return template( 'error.tpl', error="Error with gzipfile - {0}".format(e)) # BZip2 Files elif request.forms.get('compression') == 'bz2': try: bz2f = BZ2File(file_path, 'rb') decompress = bz2f.read() bz2f.close() with open(file_path[:-3], "wb") as df: df.write(decompress) file_list.append(file_path[:-3]) except Exception as e: return template( 'error.tpl', error="Error with bzip2file - {0}".format(e)) # Tar Files (any, including tar.gz tar.bz2) elif request.forms.get('compression') == 'tar': try: if not tarfile.is_tarfile(file_path): return template('error.tpl', error="This is not a tar file") with tarfile.open(file_path, 'r:*') as tarf: tarf.extractall(temp_dir) for root, dirs, files in os.walk(temp_dir, topdown=False): for name in files: if not name == upload.filename: file_list.append(os.path.join(root, name)) except Exception as e: return template('error.tpl', error="Error with tarfile - {0}".format(e)) # Non zip files elif request.forms.get('compression') == 'none': file_list.append(file_path) # Add each file for new_file in file_list: print new_file obj = File(new_file) new_path = store_sample(obj) success = True if new_path: # Add file to the database. success = db.add(obj=obj, tags=tags) if not success: return template( 'error.tpl', error="Unable to Store The File: {0}".format( upload.filename)) redirect("/project/{0}".format(project))
def close(self):
    # GzipFile.close() doesn't actually close anything.
    if self.mode == GZ_WRITE:
        self._write_gzip(None)
        self._reset_buffer()
    return GzipFile.close(self)
def _put_file(self, name, content):
    name = self._path(name)
    placeholder = False
    if self.cache:
        if not self.cache.exists(name):
            self.cache.save(name, 0, 0)
            placeholder = True
    content_type = mimetypes.guess_type(name)[0] or "application/x-octet-stream"
    headers = {}
    for pattern in self.headers:
        if pattern[0].match(name):
            headers = pattern[1].copy()
            break
    file_pos = content.tell()
    content.seek(0, 2)
    content_length = content.tell()
    content.seek(0)
    gz_cts = getattr(
        settings,
        'CUDDLYBUDDLY_STORAGE_S3_GZIP_CONTENT_TYPES',
        (
            'text/css',
            'application/javascript',
            'application/x-javascript'
        )
    )
    gz_content = None
    if content_length > 1024 and content_type in gz_cts:
        gz_content = StringIO()
        gzf = GzipFile(mode='wb', fileobj=gz_content)
        gzf.write(content.read())
        content.seek(0)
        gzf.close()
        gz_content.seek(0, 2)
        gz_content_length = gz_content.tell()
        gz_content.seek(0)
        if gz_content_length < content_length:
            content_length = gz_content_length
            headers.update({
                'Content-Encoding': 'gzip'
            })
        else:
            gz_content = None
    headers.update({
        'Content-Type': content_type,
        'Content-Length': str(content_length)
    })
    # Httplib in < 2.6 doesn't accept file like objects. Meanwhile in
    # >= 2.7 it will try to join a content str object with the headers which
    # results in encoding problems.
    if sys.version_info[0] == 2 and sys.version_info[1] < 6:
        content_to_send = gz_content.read() if gz_content is not None else content.read()
    else:
        content_to_send = gz_content if gz_content is not None else content
    response = self.connection.put(self.bucket, name, content_to_send, headers)
    content.seek(file_pos)
    if response.http_response.status != 200:
        if placeholder:
            self.cache.remove(name)
        raise S3Error(response.message)
    if self.cache:
        date = response.http_response.getheader('Date')
        date = timegm(parsedate(date))
        self.cache.save(name, size=content_length, mtime=date)
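# The helper below is a minimal standalone sketch of the decision _put_file()
# makes above: compress into an in-memory buffer, then keep the gzip body only
# if it is actually smaller than the original. The function name maybe_gzip and
# its bytes-in/bytes-out API are illustrative assumptions, not part of the
# original storage backend.
from gzip import GzipFile
from io import BytesIO


def maybe_gzip(payload):
    # Compress the payload into an in-memory buffer.
    buf = BytesIO()
    gz = GzipFile(mode='wb', fileobj=buf)
    gz.write(payload)
    gz.close()
    compressed = buf.getvalue()
    # Only advertise gzip if it actually shrank the body.
    if len(compressed) < len(payload):
        return compressed, {'Content-Encoding': 'gzip',
                            'Content-Length': str(len(compressed))}
    return payload, {'Content-Length': str(len(payload))}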
def __init__(self, derrickFile):
    self.derrickFile = derrickFile
    g = GzipFile(derrickFile, "rb")
    self.messages = [DerrickPacket(l.rstrip("\r\n")) for l in g]
    g.close()
def save_to_brain():
    print("SAVING TO DISK-----------------")
    print(db)
    gz = GzipFile(db_file, 'wb')
    dump(db, gz, -1)
    gz.close()
def transfer_yaml(): upload_folder = os.path.join(app.root_path, app.config['UPLOAD_FOLDER']) if request.method == 'GET': tarfile_backend = TemporaryFile(mode='wb+') yamlfile = TemporaryFile(mode='wb+') tarball = tarfile.open(fileobj=tarfile_backend, mode='w') yamlfile.write( bytes( export_challenges('export.yaml', 'export.d', upload_folder, tarball), "UTF-8")) tarinfo = tarfile.TarInfo('export.yaml') tarinfo.size = yamlfile.tell() yamlfile.seek(0) tarball.addfile(tarinfo, yamlfile) tarball.close() yamlfile.close() gzipfile_backend = TemporaryFile(mode='wb+') gzipfile = GzipFile(fileobj=gzipfile_backend, mode='wb') tarfile_backend.seek(0) shutil.copyfileobj(tarfile_backend, gzipfile) tarfile_backend.close() gzipfile.close() gzipfile_backend.seek(0) return send_file(gzipfile_backend, as_attachment=True, attachment_filename='export.tar.gz') if request.method == 'POST': if 'file' not in request.files: abort(400) file = request.files['file'] readmode = 'r:gz' if file.filename.endswith('.tar'): readmode = 'r' if file.filename.endswith('.bz2'): readmode = 'r:bz2' tempdir = mkdtemp() try: archive = tarfile.open(fileobj=file.stream, mode=readmode) if 'export.yaml' not in archive.getnames(): shutil.rmtree(tempdir) abort(400) # Check for atttempts to escape to higher dirs for member in archive.getmembers(): memberpath = os.path.normpath(member.name) if memberpath.startswith('/') or '..' in memberpath.split( '/'): shutil.rmtree(tempdir) abort(400) if member.linkname: linkpath = os.path.normpath(member.linkname) if linkpath.startswith('/') or '..' in linkpath.split( '/'): shutil.rmtree(tempdir) abort(400) archive.extractall(path=tempdir) except tarfile.TarError: shutil.rmtree(tempdir) print('b') abort(400) in_file = os.path.join(tempdir, 'export.yaml') import_challenges(in_file, upload_folder, move=True) shutil.rmtree(tempdir) return '1'
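# The member-path check in transfer_yaml() above guards against archive entries
# that try to escape the extraction directory. A minimal standalone sketch of
# the same check, assuming a tarfile.TarFile object; the function name
# safe_extractall is made up for illustration.
import os
import tarfile


def safe_extractall(archive, dest):
    # Reject any member whose name or link target starts with '/' or contains '..'.
    for member in archive.getmembers():
        for candidate in (member.name, member.linkname):
            if not candidate:
                continue
            path = os.path.normpath(candidate)
            if path.startswith('/') or '..' in path.split('/'):
                raise tarfile.TarError('unsafe path in archive: %r' % candidate)
    archive.extractall(path=dest)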
def writePackets(self, messages):
    g = GzipFile(self.derrickFile, "wb")
    for m in messages:
        g.write("%s\n" % str(m))
    g.close()
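# The reader __init__ and writePackets() above both treat a derrick file as
# gzip-compressed text with one record per line. A minimal round trip of that
# pattern in isolation; the file name and record strings are made up, and, like
# the surrounding snippets, this assumes Python 2 string semantics (GzipFile in
# write mode receives str, not bytes).
from gzip import GzipFile

out = GzipFile("records.drk", "wb")
for rec in ("first record", "second record"):
    out.write("%s\n" % rec)
out.close()

inp = GzipFile("records.drk", "rb")
records = [line.rstrip("\r\n") for line in inp]
inp.close()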
from gzip import GzipFile
from io import BytesIO

import boto3

s3 = boto3.client('s3')
bucket = 'bluebucket.mindvessel.net'

# Read in some example text, as unicode
with open("utext.txt") as fi:
    text_body = fi.read().decode("utf-8")

# A GzipFile must wrap a real file or a file-like object. We do not want to
# write to disk, so we use a BytesIO as a buffer.
gz_body = BytesIO()
gz = GzipFile(None, 'wb', 9, gz_body)
gz.write(text_body.encode('utf-8'))  # convert unicode strings to bytes!
gz.close()

# GzipFile has written the compressed bytes into our gz_body
s3.put_object(
    Bucket=bucket,
    Key='gztest.txt',  # Note: NO .gz extension!
    ContentType='text/plain',  # the original type
    ContentEncoding='gzip',  # MUST have or browsers will error
    Body=gz_body.getvalue())

retr = s3.get_object(Bucket=bucket, Key='gztest.txt')

# Now the fun part. Reading it back requires this little dance, because
# GzipFile insists that its underlying file-like thing implement tell and
# seek, but boto3's io stream does not.
bytestream = BytesIO(retr['Body'].read())
got_text = GzipFile(None, 'rb', fileobj=bytestream).read().decode('utf-8')
assert got_text == text_body
print "Failed creating dir: " + os.path.dirname( pack[0]) pass sendFile(os.path.join(options.packdir, packname), dirpack, options.move) else: # Package already exists in repo if options.overwrite: sendFile(os.path.join(options.packdir, packname), dirpack, options.move) else: if options.move: print 'File exists ' + packname + ', deleting...' os.remove(os.path.join(options.packdir, packname)) packfiles.remove(packname) if packlisttext != '': # Only bother with the Packages.gz file if there is a reason if not os.path.exists(os.path.dirname(dirlist)): try: os.makedirs(os.path.dirname(dirlist)) print "Creating dir: " + os.path.dirname(packlist[0]) except: print "Failed creating dir: " + os.path.dirname(packlist[0]) pass print "Writing file: " + packlist[0] + '.gz' packlistfile = file( dirlist + '.gz', 'ab' ) # If repo already has this Packages.gz file then add the new files to it. gzfile = GzipFile(dirlist, 'ab', 9, packlistfile) gzfile.write(packlisttext) gzfile.close() packlistfile.close()
class ezfio_obj(object): def __init__(self, read_only=False): self._filename = "EZFIO_File" self.buffer_rank = -1 self.read_only = read_only self.locks = {} def acquire_lock(self, var): locks = self.locks try: locks[var].acquire() except: locks[var] = threading.Lock() locks[var].acquire() def release_lock(self, var): self.locks[var].release() def set_read_only(self, v): self.read_only = v def get_read_only(self): return self.read_only def exists(self, path): if os.access(path + '/.version', os.F_OK) == 1: file = open(path + '/.version', "r") v = file.readline().strip() file.close() else: return False def mkdir(self, path): if self.read_only: self.error('Read-only file.') if self.exists(path): self.error('mkdir', 'Group ' + path + ' exists') try: os.mkdir(path.strip()) except OSError: pass file = open(path.strip() + '/.version', 'w') print >> file, self.version file.close() def error(self, where, txt): print '------------------------------------------------------------' print 'EZFIO File : ' + self.filename print 'EZFIO Error in : ' + where.strip() print '------------------------------------------------------------' print '' print txt.strip() print '' print '------------------------------------------------------------' raise IOError def get_filename(self): if not self.exists(self._filename): self.mkdir(self._filename) return self._filename def set_filename(self, filename): self._filename = filename filename = property(fset=set_filename, fget=get_filename) def set_file(self, filename): self.filename = filename if not self.exists(filename): self.mkdir(filename) self.mkdir(filename + "/ezfio") os.system(""" LANG= date > %s/ezfio/creation echo $USER > %s/ezfio/user echo %s > %s/ezfio/library""" % (filename, filename, self.LIBRARY, filename)) def open_write_buffer(self, dir, fil, rank): if self.read_only: self.error('Read-only file.') l_filename = dir.strip() + '/' + fil + '.gz' if self.buffer_rank != -1: self.error('open_write_buffer', 'Another buffered file is already open.') self.buffer_rank = rank assert (self.buffer_rank > 0) try: self.file = GzipFile(filename=l_filename, mode='wb7') except IOError: self.error('open_write_buffer', 'Unable to open buffered file.') self.file.write("%2d\n" % (rank, )) def open_read_buffer(self, dir, fil, rank): l_filename = dir.strip() + '/' + fil + '.gz' if self.buffer_rank != -1: self.error('open_read_buffer', 'Another buffered file is already open.') try: self.file = GzipFile(filename=l_filename, mode='rb') except IOError: self.error('open_read_buffer', 'Unable to open buffered file.') try: rank = eval(self.file.readline()) except IOError: self.error('open_read_buffer', 'Unable to read buffered file.') self.buffer_rank = rank assert (self.buffer_rank > 0) return rank def close_buffer(self): assert (self.buffer_rank > 0) self.buffer_rank = -1 self.file.close() def read_buffer(self, isize): if self.buffer_rank == -1: self.error('read_buffer', 'No buffered file is open.') indices = [] values = [] for i in xrange(isize): try: line = self.file.readline().split() except: return indices, values if len(line) == 0: return indices, values indices.append([int(i) for i in line[:-1]]) values.append(eval(line[-1])) return indices, values def write_buffer(self, indices, values, isize): if self.read_only: self.error('Read-only file.') if self.buffer_rank == -1: self.error('write_buffer', 'No buffered file is open.') for i in xrange(isize): for j in indices[i]: self.file.write("%4d " % (j, )) self.file.write("%24.15e\n" % (values[i], ))
class S3BotoStorageFile(File):
    """
    The default file object used by the S3BotoStorage backend.

    This file implements file streaming using boto's multipart
    uploading functionality. The file can be opened in read or
    write mode.

    This class extends Django's File class. However, the contained
    data is only the data contained in the current buffer. So you
    should not access the contained file object directly. You should
    access the data via this class.

    Warning: This file *must* be closed using the close() method in
    order to properly write the file to S3. Be sure to close the file
    in your application.
    """
    # TODO: Read/Write (rw) mode may be a bit undefined at the moment. Needs testing.
    # TODO: When Django drops support for Python 2.5, rewrite to use the
    #       BufferedIO streams in the Python 2.6 io module.
    buffer_size = setting('AWS_S3_FILE_BUFFER_SIZE', 5242880)

    def __init__(self, name, mode, storage, buffer_size=None):
        self._storage = storage
        self.name = name[len(self._storage.location):].lstrip('/')
        self._mode = mode
        self.key = storage.bucket.get_key(self._storage._encode_name(name))
        if not self.key and 'w' in mode:
            self.key = storage.bucket.new_key(storage._encode_name(name))
        self._is_dirty = False
        self._file = None
        self._multipart = None
        # 5 MB is the minimum part size (if there is more than one part).
        # Amazon allows up to 10,000 parts. The default supports uploads
        # up to roughly 50 GB. Increase the part size to accommodate
        # for files larger than this.
        if buffer_size is not None:
            self.buffer_size = buffer_size
        self._write_counter = 0
        if not hasattr(django_settings, 'AWS_DEFAULT_ACL'):
            warnings.warn(
                "The default behavior of S3BotoStorage is insecure. By default files "
                "and new buckets are saved with an ACL of 'public-read' (globally "
                "publicly readable). To change to using the bucket's default ACL "
                "set AWS_DEFAULT_ACL = None, otherwise to silence this warning "
                "explicitly set AWS_DEFAULT_ACL.")

    @property
    def size(self):
        return self.key.size

    def _get_file(self):
        if self._file is None:
            self._file = SpooledTemporaryFile(
                max_size=self._storage.max_memory_size,
                suffix='.S3BotoStorageFile',
                dir=setting('FILE_UPLOAD_TEMP_DIR'))
            if 'r' in self._mode:
                self._is_dirty = False
                self.key.get_contents_to_file(self._file)
                self._file.seek(0)
            if self._storage.gzip and self.key.content_encoding == 'gzip':
                self._file = GzipFile(mode=self._mode, fileobj=self._file)
        return self._file

    def _set_file(self, value):
        self._file = value

    file = property(_get_file, _set_file)

    def read(self, *args, **kwargs):
        if 'r' not in self._mode:
            raise AttributeError('File was not opened in read mode.')
        return super(S3BotoStorageFile, self).read(*args, **kwargs)

    def write(self, content, *args, **kwargs):
        if 'w' not in self._mode:
            raise AttributeError('File was not opened in write mode.')
        self._is_dirty = True
        if self._multipart is None:
            provider = self.key.bucket.connection.provider
            upload_headers = {}
            if self._storage.default_acl:
                upload_headers[provider.acl_header] = self._storage.default_acl
            upload_headers.update({
                'Content-Type': mimetypes.guess_type(self.key.name)[0] or
                self._storage.key_class.DefaultContentType
            })
            upload_headers.update(self._storage.headers)
            self._multipart = self._storage.bucket.initiate_multipart_upload(
                self.key.name,
                headers=upload_headers,
                reduced_redundancy=self._storage.reduced_redundancy,
                encrypt_key=self._storage.encryption,
            )
        if self.buffer_size <= self._buffer_file_size:
            self._flush_write_buffer()
        return super(S3BotoStorageFile, self).write(force_bytes(content), *args, **kwargs)

    @property
    def _buffer_file_size(self):
        pos = self.file.tell()
        self.file.seek(0, os.SEEK_END)
        length = self.file.tell()
        self.file.seek(pos)
        return length

    def _flush_write_buffer(self):
        if self._buffer_file_size:
            self._write_counter += 1
            self.file.seek(0)
            headers = self._storage.headers.copy()
            self._multipart.upload_part_from_file(self.file, self._write_counter,
                                                  headers=headers)
            self.file.seek(0)
            self.file.truncate()

    def close(self):
        if self._is_dirty:
            self._flush_write_buffer()
            self._multipart.complete_upload()
        else:
            if self._multipart is not None:
                self._multipart.cancel_upload()
        self.key.close()
        if self._file is not None:
            self._file.close()
            self._file = None
class ChunkedFile(object):
    """Compressed file writer/reader that stores data in chunks in a zip file.

    Transparently supports reading gzip files.
    """

    def __init__(self, filename, subfile='', mode='r', chunksize=131072,
                 autoflush=True):
        """Create a ChunkedFile object with given filename, I/O mode (r,w,a),
        and preferred chunk size.  If you wish to manually control the chunk
        boundaries using bookmark() or flush(), set autoflush=False."""
        if mode not in 'rwa':
            raise ValueError('Mode must be r or w or a')
        self._is_gzip = False
        if os.path.isdir(filename):
            assert mode == 'r'
            self.zip = UnpackedZipFile(filename, mode)
        else:
            try:
                self.zip = ZipFile(filename, mode, ZIP_DEFLATED)
            except BadZipfile:
                assert mode == 'r'
                # Transparent reading of gzip files
                # (relatively fast, pure-python, some limitations)
                self.zip = GzipFile(filename, mode)
                self._is_gzip = True
        self.prefix = '%s/c.' % str(subfile) if subfile else 'c.'
        self.mode = mode
        self.chunksize = chunksize
        self.autoflush = autoflush
        # List of available chunks
        if not self._is_gzip:
            self.chunks = self._chunks()
        # Determine current position
        if mode == 'r':
            self.eof = False
            self.chunkidx = -1
        else:
            self.eof = True
            self.chunkidx = len(self.chunks)-1
        if self.chunkidx >= 0:
            info = self.zip.getinfo(self.chunks[self.chunkidx].name)
            self.pos = self.chunks[self.chunkidx].pos + info.file_size
        else:
            self.pos = 0
        # Buffers
        self.nextbuf = []
        self.readbuf = ''
        self.writebuf = ''
        self._last_bookmark = None

    def _chunks(self):
        """Return a list of ChunkInfos, one for each chunk in the file."""
        offset = len(self.prefix)
        chunks = []
        for name in self.zip.namelist():
            # Check multifiles
            if not name[0:].startswith(self.prefix):
                continue
            nameinfo = name[offset:].split(',')
            pos = int(nameinfo[0], 16)
            bookmark = None
            if len(nameinfo) > 1:
                bookmark = urlsafe_b64decode(nameinfo[1])
            chunks.append(ChunkInfo(name=name, pos=pos, bookmark=bookmark))
        return sorted(chunks, key=lambda chunk: chunk.pos)

    def _next_chunk(self):
        """Read the next chunk into the read buffer."""
        if self._is_gzip:
            chunk = self.zip.read(self.chunksize)
            if not chunk:
                self.eof = True
                raise EOFError
            else:
                self.readbuf += chunk
            return
        self.chunkidx += 1
        if self.chunkidx >= len(self.chunks):
            self.eof = True
            raise EOFError
        else:
            self.readbuf += self.zip.read(self.chunks[self.chunkidx].name)

    def _flush(self, auto=True, bookmark=None):
        """Flush complete chunks from the write buffer.

        An incomplete chunk may be created (and the write buffer completely
        emptied) if auto=False"""
        if auto and not self.autoflush:
            return
        while self.writebuf and \
                (len(self.writebuf) >= self.chunksize or not auto):
            self.chunkidx += 1
            assert(self.chunkidx == len(self.chunks))
            chunkpos = self.pos-len(self.writebuf)
            chunkname = '%s%08x' % (self.prefix, chunkpos)
            chunkbookmark = None
            if bookmark and len(self.writebuf) <= self.chunksize:
                chunkname += ','+urlsafe_b64encode(bookmark)
                chunkbookmark = bookmark
            self.zip.writestr(chunkname, self.writebuf[:self.chunksize])
            self.writebuf = self.writebuf[self.chunksize:]
            self.chunks.append(ChunkInfo(name=chunkname, pos=chunkpos,
                                         bookmark=chunkbookmark))

    def close(self):
        """Close the file.  Must be called to avoid data loss."""
        self.flush()
        self.zip.close()

    def flush(self):
        """Flush all output to the file."""
        self._flush(auto=False)

    def bookmark(self, bookmark):
        """Possibly flush the file, writing a bookmark if doing so."""
        assert(not self._last_bookmark or bookmark >= self._last_bookmark)
        self._last_bookmark = bookmark
        if len(self.writebuf) >= (self.chunksize-self.chunksize/8):
            # Use 7/8 of a chunksize to avoid creating too many tiny overflow
            # chunks.
            self._flush(auto=False, bookmark=bookmark)

    def write(self, data):
        """Write data to be stored in the file."""
        assert(not self._is_gzip)
        self.writebuf += data
        self.pos += len(data)
        self._flush(auto=True)

    def read(self, size=-1):
        """Read data from the file."""
        try:
            while size < 0 or len(self.readbuf) < size:
                self._next_chunk()
        except EOFError:
            pass
        if size > 0:
            ret = self.readbuf[:size]
            self.readbuf = self.readbuf[size:]
        elif size < 0:
            ret = self.readbuf
            self.readbuf = ''
        elif size == 0:
            ret = ''
        self.pos += len(ret)
        return ret

    def next(self):
        """Return the next line from the file or raise StopIteration."""
        if self.nextbuf:
            self.pos += len(self.nextbuf[0])
            return self.nextbuf.pop(0)
        if self.eof and not self.readbuf:
            raise StopIteration
        # Find next line ending
        try:
            while '\n' not in self.readbuf:
                self._next_chunk()
        except EOFError:
            if '\n' not in self.readbuf:
                if self.readbuf:
                    return self.read(-1)
                else:
                    raise StopIteration
        # Split lines into separate buffer
        self.nextbuf = self.readbuf.splitlines(True)
        if self.readbuf[-1] != '\n':
            self.readbuf = self.nextbuf.pop()
        else:
            self.readbuf = ''
        return self.next()

    def seek(self, offset, whence=0):
        """Seek to a given byte position in the file.

        Currently limited to files opened for mode=r and whence current
        location or beginning of the file."""
        # Only simple writing is supported
        assert(self.mode == 'r')
        if whence == 0:
            pass
        elif whence == 1:
            offset = self.pos+offset
        elif whence == 2:
            raise NotImplementedError
        else:
            raise ValueError
        if self._is_gzip:
            assert(offset >= self.pos)
        else:
            # Find the correct chunk
            self.flush()
            self.nextbuf = []
            self.readbuf = ''
            self.chunkidx = -1
            self.pos = 0
            for idx, data in enumerate(self.chunks):
                if data.pos <= offset:
                    self.chunkidx = idx-1
                    self.pos = data.pos
        delta = offset-self.pos
        assert(delta >= 0)
        self.read(delta)
        assert(delta <= self.chunksize or self.eof or self._is_gzip)
        assert(self.pos == offset)

    def find_bookmark(self, bookmark, give_range=False):
        """Determine an appropriate seek position near bookmark."""
        pos = 0
        for chunk in self.chunks:
            if chunk.bookmark and chunk.bookmark < bookmark:
                pos = chunk.pos
        if give_range:
            ret_next = 0
            for chunk in self.chunks:
                if ret_next == 1:
                    assert(chunk.pos > pos)
                    return pos, chunk.pos
                elif chunk.bookmark and chunk.bookmark > bookmark:
                    ret_next = 1
            return pos, None
        else:
            return pos

    def tell(self):
        """Return the current byte position in the file."""
        return self.pos

    # def __enter__(...): return self
    # def __exit__(...): self.close()

    def __iter__(self):
        return self
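# A small illustrative round trip with ChunkedFile; the file name, record
# strings, and bookmark values are made up, and the module-level names the
# class relies on (ZipFile, ZIP_DEFLATED, BadZipfile, ChunkInfo,
# urlsafe_b64encode/urlsafe_b64decode, UnpackedZipFile) are assumed to be
# imported as in the original module. Like the surrounding snippets, this
# assumes Python 2 string semantics.
cf = ChunkedFile('log.zip', mode='w', chunksize=4096, autoflush=False)
for i in range(10000):
    cf.write('record %d\n' % i)
    cf.bookmark('%08d' % i)   # bookmarks must be non-decreasing
cf.close()                    # must be called, or buffered data is lost

cf = ChunkedFile('log.zip', mode='r')
pos, _ = cf.find_bookmark('00005000', give_range=True)
cf.seek(pos)
print(cf.next())              # a line at or shortly before the bookmarked record
cf.close()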
def _fetch_brute_kddcup99(data_home=None, download_if_missing=True, percent10=True): """Load the kddcup99 dataset, downloading it if necessary. Parameters ---------- data_home : string, optional Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing : boolean, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. percent10 : bool, default=True Whether to load only 10 percent of the data. Returns ------- dataset : dict-like object with the following attributes: dataset.data : numpy array of shape (494021, 41) Each row corresponds to the 41 features in the dataset. dataset.target : numpy array of shape (494021,) Each value corresponds to one of the 21 attack types or to the label 'normal.'. dataset.DESCR : string Description of the kddcup99 dataset. """ data_home = get_data_home(data_home=data_home) dir_suffix = "-py3" if percent10: kddcup_dir = join(data_home, "kddcup99_10" + dir_suffix) archive = ARCHIVE_10_PERCENT else: kddcup_dir = join(data_home, "kddcup99" + dir_suffix) archive = ARCHIVE samples_path = join(kddcup_dir, "samples") targets_path = join(kddcup_dir, "targets") available = exists(samples_path) if download_if_missing and not available: _mkdirp(kddcup_dir) logger.info("Downloading %s" % archive.url) _fetch_remote(archive, dirname=kddcup_dir) dt = [('duration', int), ('protocol_type', 'S4'), ('service', 'S11'), ('flag', 'S6'), ('src_bytes', int), ('dst_bytes', int), ('land', int), ('wrong_fragment', int), ('urgent', int), ('hot', int), ('num_failed_logins', int), ('logged_in', int), ('num_compromised', int), ('root_shell', int), ('su_attempted', int), ('num_root', int), ('num_file_creations', int), ('num_shells', int), ('num_access_files', int), ('num_outbound_cmds', int), ('is_host_login', int), ('is_guest_login', int), ('count', int), ('srv_count', int), ('serror_rate', float), ('srv_serror_rate', float), ('rerror_rate', float), ('srv_rerror_rate', float), ('same_srv_rate', float), ('diff_srv_rate', float), ('srv_diff_host_rate', float), ('dst_host_count', int), ('dst_host_srv_count', int), ('dst_host_same_srv_rate', float), ('dst_host_diff_srv_rate', float), ('dst_host_same_src_port_rate', float), ('dst_host_srv_diff_host_rate', float), ('dst_host_serror_rate', float), ('dst_host_srv_serror_rate', float), ('dst_host_rerror_rate', float), ('dst_host_srv_rerror_rate', float), ('labels', 'S16')] DT = np.dtype(dt) logger.debug("extracting archive") archive_path = join(kddcup_dir, archive.filename) file_ = GzipFile(filename=archive_path, mode='r') Xy = [] for line in file_.readlines(): line = line.decode() Xy.append(line.replace('\n', '').split(',')) file_.close() logger.debug('extraction done') os.remove(archive_path) Xy = np.asarray(Xy, dtype=object) for j in range(42): Xy[:, j] = Xy[:, j].astype(DT[j]) X = Xy[:, :-1] y = Xy[:, -1] # XXX bug when compress!=0: # (error: 'Incorrect data length while decompressing[...] the file # could be corrupted.') joblib.dump(X, samples_path, compress=0) joblib.dump(y, targets_path, compress=0) elif not available: if not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") try: X, y except NameError: X = joblib.load(samples_path) y = joblib.load(targets_path) return Bunch(data=X, target=y)