Example #1
def savematch(data, filename=''):
    """data must have the following format:
    dictionary from Matrix to Sequence to Index to Score"""
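    # Purely illustrative (hypothetical) example of that structure:
    #   data = {matrix: {'chr1': {(1200, '+'): 7.3, (1544, '-'): 5.1}}}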
    #Maybe one should add a security policy for allowed filenames.
    #e.g. do not allow '/' in filename.
    if filename=='':
        a=localtime()
        filename='eel_'+str(a.tm_year)+'_'+str(a.tm_mon)+'_'+str(a.tm_mday)+'_'+str(a.tm_hour)+'_'+str(a.tm_min)+'.gff'
    try:
        if filename[-3:]==".gz":
            try:
                F=GzipFile(filename,"w")
            except NameError:
                filename=filename[:-3]
                F=open(filename,'w')
        else:
                F=open(filename,'w')

## This is in the wrong format: Seq and Matr are reversed.
##        for Matr in data.keys():
##            for Seq in data[Matr].keys():
##                for Pos,Strand in data[Matr][Seq].keys():
##                    F.write("%s\teel\t%s\t%d\t%d\t%f\t%s\t.\n"%(Seq,Matr.getName(),Pos,Pos+len(Matr)-1,data[Matr][Seq][(Pos,Strand)],Strand))
        F.write(get(data))
        F.close()
        return filename
        
    except IOError, (errno, strerror):
        print "I/O error(%s): %s" % (errno, strerror)
        return ''
Example #2
def get_compressed_file_data(file_path, compresslevel=5):
    compressed_buffer = BytesIO()

    gzip_file = GzipFile(mode='wb',
                         compresslevel=compresslevel,
                         fileobj=compressed_buffer)

    try:
        fileobj = open(file_path, 'rb')
        while True:
            x = fileobj.read(65536)
            if not x:
                break
            gzip_file.write(x)
            x = None
        fileobj.close()
    except IOError as e:
        LOG.error(str(e))
        return None

    gzip_file.close()

    compressed_data = compressed_buffer.getvalue()
    compressed_buffer.close()

    return compressed_data
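For completeness, the reverse direction can be sketched the same way; the helper name below is hypothetical and simply unwraps what get_compressed_file_data produced:

from gzip import GzipFile
from io import BytesIO

def get_decompressed_data(compressed_data):
    # Stream the compressed bytes back out of an in-memory buffer.
    with GzipFile(mode='rb', fileobj=BytesIO(compressed_data)) as gzip_file:
        return gzip_file.read()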
Example #3
    def __init__(self, filename=None, mode=None, compresslevel=9, fileobj=None, **kwargs):
        """
        Return a buffered gzip file object.

        :param filename: a filesystem path
        :type filename: str
        :param mode: a file mode which can be any of 'r', 'rb', 'a', 'ab',
            'w', or 'wb'
        :type mode: str
        :param compresslevel: The compresslevel argument is an integer from 1
            to 9 controlling the level of compression; 1 is fastest and
            produces the least compression, and 9 is slowest and produces the
            most compression. The default is 9.
        :type compresslevel: int
        :param fileobj: a StringIO stream to read from instead of a file.
        :type fileobj: StringIO
        :param size: number of bytes to buffer during calls to read() and write()
        :type size: int
        :rtype: BufferedGzipFile
        """
        GzipFile.__init__(self, filename, mode, compresslevel, fileobj)
        self._size = kwargs.get("size", self.SIZE)
        self._buffer = StringIO()
        # cStringIO does not support len.
        self._len = 0
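A minimal usage sketch in the same Python 2 idiom, assuming the surrounding class is named BufferedGzipFile and exposes the usual read()/write() calls (the file name and size value are invented):

bgzf = BufferedGzipFile('tokens.txt.gz', mode='wb', size=2 * 1024 * 1024)
bgzf.write('buffered text to compress')
bgzf.close()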
Example #4
    def save(self, filename):
        """
        Serialize this RingData instance to disk.

        :param filename: File into which this instance should be serialized.
        """
        # Override the timestamp so that the same ring data creates
        # the same bytes on disk. This makes a checksum comparison a
        # good way to see if two rings are identical.
        #
        # This only works on Python 2.7; on 2.6, we always get the
        # current time in the gzip output.
        tempf = NamedTemporaryFile(dir=".", prefix=filename, delete=False)
        try:
            gz_file = GzipFile(filename, mode='wb', fileobj=tempf,
                               mtime=1300507380.0)
        except TypeError:
            gz_file = GzipFile(filename, mode='wb', fileobj=tempf)
        self.serialize_v1(gz_file)
        gz_file.close()
        tempf.flush()
        os.fsync(tempf.fileno())
        tempf.close()
        os.chmod(tempf.name, 0o644)
        os.rename(tempf.name, filename)
Example #5
    def test_content_encoding_gzip(self):
        kwargs = {'message': 'hello'}

        message = json.dumps(kwargs)

        fp = StringIO()

        try:
            f = GzipFile(fileobj=fp, mode='w')
            f.write(message)
        finally:
            f.close()

        key = self.projectkey.public_key
        secret = self.projectkey.secret_key

        with self.tasks():
            resp = self.client.post(
                self.path, fp.getvalue(),
                content_type='application/octet-stream',
                HTTP_CONTENT_ENCODING='gzip',
                HTTP_X_SENTRY_AUTH=get_auth_header('_postWithHeader', key, secret),
            )

        assert resp.status_code == 200, resp.content

        event_id = json.loads(resp.content)['id']
        instance = Event.objects.get(event_id=event_id)

        assert instance.message == 'hello'
Example #6
	def write(self):
		if debug:
			print 'writing to disk'
		gz = GzipFile(database, 'wb')
		dump(db, gz, -1)
		gz.close()
		Pref.writing_to_disk = False
Example #7
 def __init__(self, data):
     fd, fname = tempfile.mkstemp()
     gzd = GzipFile(mode='r', fileobj=StringIO(b64decode(data)))
     os.write(fd, gzd.read())
     os.close(fd)
     gzd.close()
     self.name = fname
Example #8
 def write_file(self, filename=None, buffer=None, fileobj=None):
     """Write this NBT file to a file."""
     closefile = True
     if buffer:
         self.filename = None
         self.file = buffer
         closefile = False
     elif filename:
         self.filename = filename
         self.file = GzipFile(filename, "wb")
     elif fileobj:
         self.filename = None
         self.file = GzipFile(fileobj=fileobj, mode="wb")
     elif self.filename:
         self.file = GzipFile(self.filename, "wb")
     elif not self.file:
         raise ValueError(
             "NBTFile.write_file(): Need to specify either a "
             "filename or a file object"
         )
     # Render tree to file
     TAG_Byte(self.id)._render_buffer(self.file)
     TAG_String(self.name)._render_buffer(self.file)
     self._render_buffer(self.file)
     # make sure the file is complete
     try:
         self.file.flush()
     except (AttributeError, IOError):
         pass
     if closefile:
         try:
             self.file.close()
         except (AttributeError, IOError):
             pass
Example #9
def write_sbml_model(cobra_model, filename, use_fbc_package=True, **kwargs):
    if not use_fbc_package:
        if libsbml is None:
            raise Exception("libSBML required to write non-fbc models")
        write_sbml2(cobra_model, filename, use_fbc_package=False, **kwargs)
        return
    # create xml
    xml = model_to_xml(cobra_model, **kwargs)
    write_args = {"encoding": "UTF-8"}
    if _with_lxml:
        write_args["pretty_print"] = True
    else:
        indent_xml(xml)
    # write xml to file
    should_close = True
    if hasattr(filename, "write"):
        xmlfile = filename
        should_close = False
    elif filename.endswith(".gz"):
        xmlfile = GzipFile(filename, "wb")
    elif filename.endswith(".bz2"):
        xmlfile = BZ2File(filename, "wb")
    else:
        xmlfile = open(filename, "wb")
    ElementTree(xml).write(xmlfile, **write_args)
    if should_close:
        xmlfile.close()
Example #10
    def get(self):
        from gzip import GzipFile
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO

        data = self.get_data()
        data['gzipped'] = True
        json_response = self.json_response(data, finish=False)

        tmp_buffer = StringIO()

        gziped_buffer = GzipFile(
            fileobj=tmp_buffer,
            mode="wb",
            compresslevel=7)
        gziped_buffer.write(json_response)
        gziped_buffer.close()

        gzipped_data = tmp_buffer.getvalue()

        self.set_header("Content-Encoding", 'gzip')
        self.set_header("Content-Length", str(len(gzipped_data)))

        tmp_buffer.close()
        self.finish(gzipped_data)
Example #11
 def parse_file(self, filename=None, buffer=None, fileobj=None):
     """Completely parse a file, extracting all tags."""
     if filename:
         self.file = GzipFile(filename, 'rb')
     elif buffer:
         if hasattr(buffer, 'name'):
             self.filename = buffer.name
         self.file = buffer
     elif fileobj:
         if hasattr(fileobj, 'name'):
             self.filename = fileobj.name
         self.file = GzipFile(fileobj=fileobj)
     if self.file:
         try:
             type = TAG_Byte(buffer=self.file)
             if type.value == self.id:
                 name = TAG_String(buffer=self.file).value
                 self._parse_buffer(self.file)
                 self.name = name
                 self.file.close()
             else:
                 raise MalformedFileError(
                     "First record is not a Compound Tag")
         except StructError as e:
             raise MalformedFileError(
                 "Partial File Parse: file possibly truncated.")
     else:
         raise ValueError(
             "NBTFile.parse_file(): Need to specify either a "
             "filename or a file object"
         )
Example #12
 def __init__(self, filename=None, mode=None, compresslevel=9, 
              fileobj=None, **kwargs):
     """
     @return: a buffered gzip file object
     @rtype: C{BufferedGzipFile}
     @param filename: a filesystem path
     @type filename: C{str}
     @param mode: a file mode which can be any of 'r', 'rb', 'a', 'ab', 
         'w', or 'wb'
     @type mode: C{str}
     @param compresslevel: The compresslevel argument is an integer from 1
         to 9 controlling the level of compression; 1 is fastest and 
         produces the least compression, and 9 is slowest and produces the
         most compression. The default is 9.
     @type compresslevel: C{int}
     @param fileobj: a StringIO stream to read from instead of a file.
     @type fileobj: C{StringIO}
     @kwparam size: number of bytes to buffer during calls to
         L{read()} and L{write()}
     @type size: C{int}
     """   
     GzipFile.__init__(self, filename, mode, compresslevel, fileobj)
     self._size = kwargs.get('size', self.SIZE)
     self._buffer = StringIO()
     # cStringIO does not support len.
     self._len = 0
Example #13
 def _compress_string(self, s):
     """Gzip a given string."""
     zbuf = StringIO()
     zfile = GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
     zfile.write(s)
     zfile.close()
     return zbuf.getvalue()
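On Python 3 the same one-shot pattern can be written with gzip.compress; a hedged equivalent of the helper above, not part of the original snippet:

import gzip

def compress_string(s, level=6):
    # gzip.compress expects bytes, so encode text input first.
    return gzip.compress(s.encode('utf-8'), compresslevel=level)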
Example #14
def DecodeProcFile(proc_file):
  if len(proc_file) < 256:
    # a short argument is treated as a path to the proc file; read its contents
    fd = open(proc_file)
    proc_file = fd.read(1024*1024)
    fd.close()
  if proc_file.find('Subsystem Id:') < 0:
      p = None
      try:
        from gzip import GzipFile
        from StringIO import StringIO
        s = StringIO(proc_file)
        gz = GzipFile(mode='r', fileobj=s)
        p = gz.read(1024*1024)
        gz.close()
      except:
        pass
      if p is None:
        try:
          from bz2 import decompress
          p = decompress(proc_file)
        except:
          pass
      if not p is None:
        proc_file = p
  return proc_file
Example #15
 def open(self):
   request = Request(self.url)
   request.add_header('User-Agent','lastfm-lda recommender v.0.0.-1')
   request.add_header('Accept-encoding', 'gzip')
   while True:
     URLLoadListener.num_connections+=1
     response = None
     try:
       response = urlopen(request,timeout=10)
       if response.info().get('Content-Encoding') == 'gzip':
         f = GzipFile(fileobj=StringIO(response.read()))
         result = f.read()
         f.close()
       else:
         result = response.read()
       break
     except Exception, e:
       if self.retries>2: 
         if isinstance(e, BadStatusLine): raise Exception("last.fm server does not respond (%s)" % e)
         raise e
       self.retries+=1
       print self.url
       print "failed with", e
       print "retry #",self.retries
       print
     finally:
Example #16
class CompressingRequestWrapper(_makeBase()):
    """
    A request wrapper with support for transport encoding compression.

    @ivar underlying: the request being wrapped.
    @type underlying: L{IRequest}
    @ivar encoding: the IANA-assigned name of the encoding.
    @type encoding: C{str}
    @ivar compressLevel: the level of gzip compression to apply.
    @type compressLevel: C{int}
    """
    implements(IRequest)

    encoding = 'gzip'
    compressLevel = 6


    def __init__(self, underlying):
        self.underlying = underlying
        self.setHeader('content-encoding', self.encoding)
        self._gzipFile = None

        # See setHeader docstring for more commentary.
        self.underlying.headers.pop('content-length', None)


    def setHeader(self, name, value):
        """
        Discard the Content-Length header.

        When compression encoding is in use, the Content-Length header must
        indicate the length of the compressed content; since we are doing the
        compression on the fly, we don't actually know what the length is after
        compression, so we discard this header. If this is an HTTP/1.1 request,
        chunked transfer encoding should be used, softening the impact of
        losing this header.
        """
        if name.lower() == 'content-length':
            return
        else:
            return self.underlying.setHeader(name, value)


    def write(self, data):
        """
        Pass data through to the gzip layer.
        """
        if self._gzipFile is None:
            self._gzipFile = GzipFile(fileobj=self.underlying, mode='wb', compresslevel=self.compressLevel)
        self._gzipFile.write(data)


    def finishRequest(self, success):
        """
        Finish off gzip stream.
        """
        if self._gzipFile is None:
            self.write('')
        self._gzipFile.close()
        self.underlying.finishRequest(success)
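The wrapper above leans on the fact that GzipFile, when writing, only calls write() on the fileobj it is given; a minimal standalone sketch of that pattern (the Sink class and sample data are invented for illustration):

from gzip import GzipFile

class Sink(object):
    # Minimal file-like object: it only collects whatever GzipFile writes.
    def __init__(self):
        self.chunks = []

    def write(self, data):
        self.chunks.append(data)

sink = Sink()
gz = GzipFile(fileobj=sink, mode='wb', compresslevel=6)
gz.write(b'hello ')
gz.write(b'world')
gz.close()  # flushes the gzip trailer into the sink; the sink itself stays open
compressed = b''.join(sink.chunks)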
Example #17
def start_file(fname, id_=None):
    """Opens a fif file for writing and writes the compulsory header tags

    Parameters
    ----------
    fname : string | fid
        The name of the file to open. It is recommended
        that the name ends with .fif or .fif.gz. Can also be an
        already opened file.
    id_ : dict | None
        ID to use for the FIFF_FILE_ID.
    """
    if isinstance(fname, string_types):
        if op.splitext(fname)[1].lower() == '.gz':
            logger.debug('Writing using gzip')
            # defaults to compression level 9, which is barely smaller but much
            # slower. 2 offers a good compromise.
            fid = GzipFile(fname, "wb", compresslevel=2)
        else:
            logger.debug('Writing using normal I/O')
            fid = open(fname, "wb")
    else:
        logger.debug('Writing using %s I/O' % type(fname))
        fid = fname
        fid.seek(0)
    #   Write the compulsory items
    write_id(fid, FIFF.FIFF_FILE_ID, id_)
    write_int(fid, FIFF.FIFF_DIR_POINTER, -1)
    write_int(fid, FIFF.FIFF_FREE_LIST, -1)
    return fid
Example #18
    def _uncachedgenerate(self):
        """ Generates the Gzipped sitemap uncached data """
        len_brains = len(self._catalogbrains())

        if self.index is None:
            # no index specified in the url
            if len_brains < self.maxlen:
                # ok, we have few items, let's generate the standard sitemap
                xml = self.template()
            else:
                # a lot of items, let's generate a sitemap index
                xml = self.indextemplate()
        elif int(self.index)*self.maxlen >= len_brains:
            # bad index specified
            raise NotFound(self.context, '%s-%s' % (self.index, self.filename), self.request)
        else:
            # index specified in the url
            xml = self.template()

        if self.index is not None:
            filename = "%s-%s" % (self.index, self.filename)
        else:
            filename = self.filename

        fp = StringIO()
        gzip = GzipFile(filename, 'w', 9, fp)
        gzip.write(xml)
        gzip.close()
        data = fp.getvalue()
        fp.close()
        return data
Example #19
 def __init__(self, filename=None, buffer=None, fileobj=None):
     super(NBTFile, self).__init__()
     self.filename = filename
     self.type = TAG_Byte(self.id)
     closefile = True
     #make a file object
     if filename:
         self.file = GzipFile(filename, 'rb')
     elif buffer:
         if hasattr(buffer, 'name'):
             self.filename = buffer.name
         self.file = buffer
         closefile = False
     elif fileobj:
         if hasattr(fileobj, 'name'):
             self.filename = fileobj.name
         self.file = GzipFile(fileobj=fileobj)
     else:
         self.file = None
         closefile = False
     #parse the file given initially
     if self.file:
         self.parse_file()
         if closefile:
             # Note: GzipFile().close() does NOT close the fileobj,
             # so the caller is still responsible for closing that.
             try:
                 self.file.close()
             except (AttributeError, IOError):
                 pass
         self.file = None
Example #20
    def handle_stackexchange_login(self, data):
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.log_message(self.path)
        self.end_headers()

        c = Client(StackExchange, get_config())
        cred = c.flow.authorization_received(data)

        d = c.request("/me", body=urlencode({
            "site": "stackoverflow"
        }))

        self.wfile.write("<!DOCTYPE html>")
        self.wfile.write("<head><meta charset=\"utf-8\"/></head><body>")
        self.wfile.write("Access token: %s<br>" % cred.access_token)
        self.wfile.write("Type: %s<br>" % cred.token_type)
        self.wfile.write("Expires in: %d<br>" % cred.expires_in)

        # stackexchange gzips all data
        h = StringIO(d)
        gzip_data = GzipFile(fileobj=h)
        d = gzip_data.read()
        gzip_data.close()
        self.wfile.write(d)
        self.wfile.write("</body></html>")
Example #21
    def save(self, filename, mtime=1300507380.0):
        """
        Serialize this RingData instance to disk.

        :param filename: File into which this instance should be serialized.
        :param mtime: time used to override mtime for gzip, default or None
                      if the caller wants to include time
        """
        # Override the timestamp so that the same ring data creates
        # the same bytes on disk. This makes a checksum comparison a
        # good way to see if two rings are identical.
        #
        # This only works on Python 2.7; on 2.6, we always get the
        # current time in the gzip output.
        tempf = NamedTemporaryFile(dir=".", prefix=filename, delete=False)
        if 'mtime' in inspect.getargspec(GzipFile.__init__).args:
            gz_file = GzipFile(filename, mode='wb', fileobj=tempf,
                               mtime=mtime)
        else:
            gz_file = GzipFile(filename, mode='wb', fileobj=tempf)
        self.serialize_v1(gz_file)
        gz_file.close()
        tempf.flush()
        os.fsync(tempf.fileno())
        tempf.close()
        os.chmod(tempf.name, 0o644)
        os.rename(tempf.name, filename)
Example #22
 def gzip_media(self, filedata):
     """gzip encodes a given stream of data."""
     gzip_data = StringIO()
     gzf = GzipFile(fileobj=gzip_data, mode="wb")
     gzf.write(filedata)
     gzf.close()
     return gzip_data.getvalue()
Example #23
def run_analogy_space_lang(lang):
    # Open files (fail early on errors)
    tensor_name = tensor_filename(lang)
    tensor_name_new = tensor_name+'_new'
    tensor_file = GzipFile(tensor_name_new, 'wb')

    svd_name = svd_filename(lang)
    svd_name_new = svd_name + '_new'
    
    # Load matrix
    logging.info('Loading %s'% lang)
    cnet_2d = conceptnet_2d_from_db(lang, identities=IDENTITIES, cutoff=CUTOFF)
    logging.info('Normalize %r' % cnet_2d)
    cnet_2d = cnet_2d.normalized()

    # Save tensor
    logging.info('Save tensor as %s' % tensor_name)
    pickle.dump(cnet_2d, tensor_file, -1)
    tensor_file.close()
    os.rename(tensor_name_new, tensor_name)

    logging.info('Running SVD')
    svd = cnet_2d.svd(k=100)

    # Save SVD
    logging.info('Save as %s' % svd_name)
    svd.save_pytables(svd_name_new)
    os.rename(svd_name_new, svd_name)
Example #24
def main(argv):
    args = argv[1:] or ["-"]

    class TitleExtractor(MWXMLDumpParser):
        def start_revision(self, pageid, title, revid, timestamp):
            print(pageid, title)
            return

    for path in args:
        if path == "-":
            fp = sys.stdin
        elif path.endswith(".gz"):
            from gzip import GzipFile

            fp = GzipFile(path)
        elif path.endswith(".bz2"):
            from bz2 import BZ2File

            fp = BZ2File(path)
        else:
            fp = open(path)
        parser = TitleExtractor()
        parser.feed_file(fp)
        fp.close()
        parser.close()
    return 0
Example #25
    def __init__(self, fileobj, name=None, onclose=None, mapped=True,
                 gzip=False):

        if gzip:
            fileobj = GzipFile(fileobj=fileobj)

        self.file = fileobj
        self._name = name
        self.onclose = onclose
        self.is_closed = False

        for attr in ("read", "readline", "write", "tell", "seek", "truncate"):
            if hasattr(fileobj, attr):
                setattr(self, attr, getattr(fileobj, attr))

        # If mapped is True, set the 'map' attribute to a memory-mapped
        # representation of the file. Otherwise, the fake 'map' that set up by
        # the base class will be used.
        if not gzip and mapped and hasattr(fileobj, "mode") and "r" in fileobj.mode:
            fd = fileobj.fileno()
            self.size = os.fstat(fd).st_size
            if self.size > 0:
                import mmap

                try:
                    self.map = mmap.mmap(fd, self.size, access=mmap.ACCESS_READ)
                except OSError:
                    self._setup_fake_map()
        else:
            self._setup_fake_map()

        self.is_real = not gzip and hasattr(fileobj, "fileno")
Example #26
def LoadGuide():
    if Prefs['xmltv'].startswith('http://') or Prefs['xmltv'].startswith('https://'):
        # Plex can't handle compressed files, using standard Python methods instead
        if Prefs['xmltv'].endswith('.gz') or Prefs['xmltv'].endswith('.gz?raw=1'):
            f = BytesIO(urlopen(Prefs['xmltv']).read())
            try:
                g = GzipFile(fileobj = f)
                xmltv = g.read()
            except:
                Log.Error('Provided file %s is not a valid GZIP file' % Prefs['xmltv'])
                xmltv = None
        else:
            xmltv = HTTP.Request(Prefs['xmltv']).content
    else:
        # Local compressed files are not supported at the moment
        xmltv = Resource.Load(Prefs['xmltv'], binary = True)
    if xmltv != None:
        try:
            root = xml.etree.ElementTree.fromstring(xmltv)
        except:
            Log.Error('Provided file %s is not a valid XML file' % Prefs['xmltv'])
            root = None
        if root != None:
            count = 0
            for programme in root.findall("./programme"):
                channel = programme.get('channel')
                start = datetime_from_utc_to_local(programme.get('start'))
                stop = datetime_from_utc_to_local(programme.get('stop'))
                title = programme.find('title').text
                count = count + 1
                item = {'start': start, 'stop': stop, 'title': title, 'order': count}
                GUIDE.setdefault(channel, {})[count] = item
    return None
Example #27
 def decode_content(self, data):
     if web.ctx.env.get('HTTP_CONTENT_ENCODING') == 'gzip':
         ib = StringIO(data)
         zf = GzipFile(fileobj=ib)
         return zf.read()
     else:
         return data
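Under Python 3 the same check can be done with gzip.decompress instead of a StringIO/GzipFile pair; a hedged equivalent of the method above:

import gzip

def decode_content(data, content_encoding):
    # data is the raw request body as bytes; only decompress when the client declared gzip.
    if content_encoding == 'gzip':
        return gzip.decompress(data)
    return data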
Example #28
 def build_index_gzip(self):
     """creates sorted index from gzip-compressed queue.
     caches object regardless of cacheobj flag.
     """
     self.index = []
     zf = GzipFile(fileobj=self.map, mode="rb")
     while 1:
         p = zf.tell()  # just for diagnosis use
         try:
             l = zf.readline()
         except IOError as ex:
             # probably CRC error due to truncated file. discard the rest.
             logging.error("error in %s at %d: %s", self.fn, p, str(ex))
             break
         if not l:
             break
         if l[0] != " ":
             continue
         try:
             o = cjson.decode(l[1:])
         except Exception as ex:
             logging.warn("skipping malformed JSON at %s:%d: %s", self.fn, p, l[1:])
             continue
         key = o.get("id")
         if key is None:
             try:
                 key = self.urikey(o)
             except UnicodeEncodeError:
                 pass
             if key is None:
                 logging.error("urikey->None for %s", str(o))
                 continue
         self.index.append((key, o))
     zf.close()
Example #29
    def testPostMethodDeCompressesDeflatedBody_gzip(self):
        self.requestData = None
        def handler(**kwargs):
            self.requestData = kwargs

        reactor = Reactor()
        server = HttpServer(reactor, self.port, handler, timeout=0.01)
        server.listen()
        sok = socket()
        sok.connect(('localhost', self.port))
        bodyData = 'bodydatabodydata'
        _sio = StringIO()
        _gzFileObj = GzipFile(filename=None, mode='wb', compresslevel=6, fileobj=_sio)
        _gzFileObj.write(bodyData); _gzFileObj.close()
        compressedBodyData = _sio.getvalue()
        bodyDataCompressed = compress(bodyData)
        contentLengthCompressed = len(bodyDataCompressed)
        sok.send(('POST / HTTP/1.0\r\nContent-Type: application/x-www-form-urlencoded\r\nContent-Length: %d\r\nContent-Encoding: gzip\r\n\r\n' % contentLengthCompressed) + bodyDataCompressed)

        while not self.requestData:
            reactor.step()
        self.assertEquals(dict, type(self.requestData))
        self.assertTrue('Headers' in self.requestData)
        headers = self.requestData['Headers']
        self.assertEquals('POST', self.requestData['Method'])
        self.assertEquals('application/x-www-form-urlencoded', headers['Content-Type'])
        self.assertEquals(contentLengthCompressed, int(headers['Content-Length']))

        self.assertTrue('Body' in self.requestData)
        self.assertEquals('bodydatabodydata', self.requestData['Body'])
Example #30
class NBTFile(TAG_Compound):
	"""Represents an NBT file object"""
	
	def __init__(self, filename=None, mode=None, buffer=None):
		super(NBTFile,self).__init__()
		self.__class__.__name__ = "TAG_Compound"
		if filename:
			self.file = GzipFile(filename, mode)
			self.parse_file(self.file)
	
	def parse_file(self, file=None):
		if not file:
			file = self.file
		if file:
			self.type = TAG_Byte(buffer=file)
			if self.type.value == self.id:
				name = TAG_String(buffer=file)
				self._parse_buffer(file)
				self.name = name
				self.file.close()
			else:
				raise ValueError("First record is not a Compound Tag")

	def write_file(self, filename=None, file=None):
		if file:
			self.file = file
		elif filename:
			self.file = GzipFile(filename, "wb")
		else:
			raise ValueError("Need to specify either a filename or a file")
		#Render tree to file
		self.type._render_buffer(file)
		self.name._render_buffer(file)
		self._render_buffer(file)
Example #31
    def test_process_response_gzipped_gzip_file(self):
        """Test that a gzip Content-Encoded .gz file is gunzipped
        only once by the middleware, leaving gunzipping of the file
        to upper layers.
        """
        headers = {
            'Content-Type': 'application/gzip',
            'Content-Encoding': 'gzip',
        }
        # build a gzipped file (here, a sitemap)
        f = BytesIO()
        plainbody = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  <url>
    <loc>http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc>http://www.example.com/Special-Offers.html</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.8</priority>
  </url>
</urlset>"""
        gz_file = GzipFile(fileobj=f, mode='wb')
        gz_file.write(plainbody)
        gz_file.close()

        # build a gzipped response body containing this gzipped file
        r = BytesIO()
        gz_resp = GzipFile(fileobj=r, mode='wb')
        gz_resp.write(f.getvalue())
        gz_resp.close()

        response = Response("http;//www.example.com/", headers=headers, body=r.getvalue())
        request = Request("http://www.example.com/")

        newresponse = self.mw.process_response(request, response, self.spider)
        self.assertEqual(gunzip(newresponse.body), plainbody)
Example #32
class OGDClient(object):
    HTTPError = HTTPError
    BadRequestError = BadRequestError
    UnauthorizedError = UnauthorizedError
    ForbiddenError = ForbiddenError
    NotFoundError = NotFoundError
    NonRetryableHTTPError = NonRetryableHTTPError

    def __init__(self):
        self._json = None
        self.data = b""

    @staticmethod
    def is_logged_in():
        # non-empty ogd_auth means we are logged in (probably, the
        # authorization can in theory have been invalidated on the server)
        return bool(app.settings["database_auth"])

    def login_task(self, username, password):
        return LoginTask(self, username, password)

    def logout_task(self, auth_token):
        return LogoutTask(self, auth_token)

    @retry
    def auth(self, username, password, device_id, device_name):
        result = self.post("/api/auth", {
            "username": username,
            "password": password,
            "device_id": device_id,
            "device_name": device_name
        },
                           auth=False)
        return result

    @retry
    def deauth(self, auth_token):
        result = self.post("/api/deauth", {"auth_token": auth_token},
                           auth=False)
        return result

    @staticmethod
    def url_prefix():
        return openretro_url_prefix()

    def opener(self):
        username, password = self.credentials()
        # FIXME: use cache dict?
        return opener_for_url_prefix(self.url_prefix(), username, password)

    @staticmethod
    def credentials():
        auth_token = app.settings["database_auth"]
        return "auth_token", auth_token

    def post(self, path, params=None, data=None, auth=True):
        headers = {}
        if auth:
            credentials = self.credentials()
            headers[str("Authorization")] = str(
                "Basic " + base64.b64encode("{0}:{1}".format(
                    *credentials).encode("UTF-8")).decode("UTF-8"))
        connection = openretro_http_connection()
        url = "{0}{1}".format(openretro_url_prefix(), path)
        # if params:
        #     url += "?" + urlencode(params)
        if not data and params:
            data = urlencode(params)
            headers[str("Content-Type")] = \
                str("application/x-www-form-urlencoded")
        print(url, headers)
        if isinstance(data, dict):
            data = json.dumps(data)
        # print(data)
        connection.request(str("POST"), str(url), data, headers=headers)
        response = connection.getresponse()
        if response.status not in [200]:
            print(response.status, response.reason)
            if response.status == 400:
                class_ = BadRequestError
            elif response.status == 401:
                class_ = UnauthorizedError
            elif response.status == 403:
                class_ = ForbiddenError
            elif response.status == 404:
                class_ = NotFoundError
            else:
                class_ = HTTPError
            raise class_(url, response.status, response.reason,
                         response.getheaders(), None)
        data = response.read()
        if len(data) > 0 and data[0:1] == b"{":
            doc = json.loads(data.decode("UTF-8"))
            return doc
        return data

    def build_url(self, path, **kwargs):
        url = "{0}{1}".format(self.url_prefix(), path)
        if kwargs:
            url += "?" + urlencode(kwargs)
        return url

    def get_request(self, url):
        request = Request(url)
        print("get_request:", url)
        request.add_header("Accept-Encoding", "gzip")
        response = self.opener().open(request)
        return self.handle_response(response)

    def handle_response(self, response):
        self._json = None
        self.data = response.read()
        # print(dir(response.headers))
        try:
            getheader = response.headers.getheader
        except AttributeError:
            getheader = response.getheader
        content_encoding = getheader("content-encoding", "").lower()
        if content_encoding == "gzip":
            # data = zlib.decompress(data)
            fake_stream = StringIO(self.data)
            self.data = GzipFile(fileobj=fake_stream).read()

    def json_response(self):
        if self._json is None:
            self._json = json.loads(self.data.decode("UTF-8"))
        return self._json

    def rate_variant(self, variant_uuid, like=None, work=None):
        params = {
            "game": variant_uuid,
        }
        if like is not None:
            params["like"] = like
        if work is not None:
            params["work"] = work
        url = self.build_url("/api/1/rate_game", **params)
        self.get_request(url)
        return self.json_response()
Example #33
class HTTPConnection:
    def __init__(self, handler, connection):
        self.handler = handler
        self.connection = connection
        self.buf = ''
        self.closed = False
        self.done = False
        self.donereading = False
        self.next_func = self.read_type

    def get_ip(self):
        return self.connection.get_ip()

    def data_came_in(self, data):
        if self.donereading or self.next_func is None:
            return True
        self.buf += data
        while 1:
            try:
                i = self.buf.index('\n')
            except ValueError:
                return True
            val = self.buf[:i]
            self.buf = self.buf[i + 1:]
            self.next_func = self.next_func(val)
            if self.donereading:
                return True
            if self.next_func is None or self.closed:
                return False

    def read_type(self, data):
        self.header = data.strip()
        words = data.split()
        if len(words) == 3:
            self.command, self.path, garbage = words
            self.pre1 = False
        elif len(words) == 2:
            self.command, self.path = words
            self.pre1 = True
            if self.command != 'GET':
                return None
        else:
            return None
        if self.command not in ('HEAD', 'GET'):
            return None
        self.headers = {}
        return self.read_header

    def read_header(self, data):
        data = data.strip()
        if data == '':
            self.donereading = True
            if self.headers.get('accept-encoding', '').find('gzip') > -1:
                self.encoding = 'gzip'
            else:
                self.encoding = 'identity'
            r = self.handler.getfunc(self, self.path, self.headers)
            if r is not None:
                self.answer(r)
            return None
        try:
            i = data.index(':')
        except ValueError:
            return None
        self.headers[data[:i].strip().lower()] = data[i + 1:].strip()
        if DEBUG:
            print data[:i].strip() + ": " + data[i + 1:].strip()
        return self.read_header

    def answer(self, (responsecode, responsestring, headers, data)):
        if self.closed:
            return
        if self.encoding == 'gzip':
            compressed = StringIO()
            gz = GzipFile(fileobj=compressed, mode='wb', compresslevel=9)
            gz.write(data)
            gz.close()
            cdata = compressed.getvalue()
            if len(cdata) >= len(data):
                self.encoding = 'identity'
            else:
                if DEBUG:
                    print "Compressed: %i  Uncompressed: %i\n" % (len(cdata),
                                                                  len(data))
                data = cdata
                headers['Content-Encoding'] = 'gzip'

        # i'm abusing the identd field here, but this should be ok
        if self.encoding == 'identity':
            ident = '-'
        else:
            ident = self.encoding
        self.handler.log(self.connection.get_ip(), ident, '-', self.header,
                         responsecode, len(data),
                         self.headers.get('referer', '-'),
                         self.headers.get('user-agent', '-'))
        self.done = True
        r = StringIO()
        r.write('HTTP/1.0 ' + str(responsecode) + ' ' + responsestring +
                '\r\n')
        if not self.pre1:
            headers['Content-Length'] = len(data)
            for key, value in headers.items():
                r.write(key + ': ' + str(value) + '\r\n')
            r.write('\r\n')
        if self.command != 'HEAD':
            r.write(data)
        self.connection.write(r.getvalue())
        if self.connection.is_flushed():
            self.connection.shutdown(1)
Example #34
def tofile(f, obj):
    out = GzipFile(f, 'wb')
    out.write(dumps(obj))
    out.close()
Example #35
def fromfile(f):
    infile = GzipFile(f)
    result = loads(infile.read())
    infile.close()
    return result
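Examples #34 and #35 together give a gzip-backed pickle round trip; a small hedged usage sketch (the file name is invented, and dumps/loads are assumed to come from pickle):

obj = {'answer': 42, 'items': [1, 2, 3]}
tofile('obj.pkl.gz', obj)
assert fromfile('obj.pkl.gz') == obj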
Example #36
def openMaybeGzip(file, mode):
    f = open(file, mode)
    if re.search('\.gz$', file):
        f = GzipFile(fileobj=f)
    return f
Example #37
def fetch_covtype(
    *,
    data_home=None,
    download_if_missing=True,
    random_state=None,
    shuffle=False,
    return_X_y=False,
    as_frame=False,
):
    """Load the covertype dataset (classification).

    Download it if necessary.

    =================   ============
    Classes                        7
    Samples total             581012
    Dimensionality                54
    Features                     int
    =================   ============

    Read more in the :ref:`User Guide <covtype_dataset>`.

    Parameters
    ----------
    data_home : str, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing : bool, default=True
        If False, raise a IOError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for dataset shuffling. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    shuffle : bool, default=False
        Whether to shuffle dataset.

    return_X_y : bool, default=False
        If True, returns ``(data.data, data.target)`` instead of a Bunch
        object.

        .. versionadded:: 0.20

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is a pandas DataFrame or
        Series depending on the number of target columns. If `return_X_y` is
        True, then (`data`, `target`) will be pandas DataFrames or Series as
        described below.

        .. versionadded:: 0.24

    Returns
    -------
    dataset : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : ndarray of shape (581012, 54)
            Each row corresponds to the 54 features in the dataset.
        target : ndarray of shape (581012,)
            Each value corresponds to one of
            the 7 forest covertypes with values
            ranging between 1 to 7.
        frame : dataframe of shape (581012, 55)
            Only present when `as_frame=True`. Contains `data` and `target`.
        DESCR : str
            Description of the forest covertype dataset.
        feature_names : list
            The names of the dataset columns.
        target_names: list
            The names of the target columns.

    (data, target) : tuple if ``return_X_y`` is True
        A tuple of two ndarray. The first containing a 2D array of
        shape (n_samples, n_features) with each row representing one
        sample and each column representing the features. The second
        ndarray of shape (n_samples,) containing the target samples.

        .. versionadded:: 0.20
    """
    data_home = get_data_home(data_home=data_home)
    covtype_dir = join(data_home, "covertype")
    samples_path = _pkl_filepath(covtype_dir, "samples")
    targets_path = _pkl_filepath(covtype_dir, "targets")
    available = exists(samples_path) and exists(targets_path)

    if download_if_missing and not available:
        os.makedirs(covtype_dir, exist_ok=True)

        # Creating temp_dir as a direct subdirectory of the target directory
        # guarantees that both reside on the same filesystem, so that we can use
        # os.rename to atomically move the data files to their target location.
        with TemporaryDirectory(dir=covtype_dir) as temp_dir:
            logger.info(f"Downloading {ARCHIVE.url}")
            archive_path = _fetch_remote(ARCHIVE, dirname=temp_dir)
            Xy = np.genfromtxt(GzipFile(filename=archive_path), delimiter=",")

            X = Xy[:, :-1]
            y = Xy[:, -1].astype(np.int32, copy=False)

            samples_tmp_path = _pkl_filepath(temp_dir, "samples")
            joblib.dump(X, samples_tmp_path, compress=9)
            os.rename(samples_tmp_path, samples_path)

            targets_tmp_path = _pkl_filepath(temp_dir, "targets")
            joblib.dump(y, targets_tmp_path, compress=9)
            os.rename(targets_tmp_path, targets_path)

    elif not available and not download_if_missing:
        raise IOError("Data not found and `download_if_missing` is False")
    try:
        X, y
    except NameError:
        X = joblib.load(samples_path)
        y = joblib.load(targets_path)

    if shuffle:
        ind = np.arange(X.shape[0])
        rng = check_random_state(random_state)
        rng.shuffle(ind)
        X = X[ind]
        y = y[ind]

    fdescr = load_descr("covtype.rst")

    frame = None
    if as_frame:
        frame, X, y = _convert_data_dataframe(
            caller_name="fetch_covtype",
            data=X,
            target=y,
            feature_names=FEATURE_NAMES,
            target_names=TARGET_NAMES,
        )
    if return_X_y:
        return X, y

    return Bunch(
        data=X,
        target=y,
        frame=frame,
        target_names=TARGET_NAMES,
        feature_names=FEATURE_NAMES,
        DESCR=fdescr,
    )
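A minimal usage sketch for the loader above (assumes scikit-learn is installed and the dataset is downloadable or already cached):

from sklearn.datasets import fetch_covtype

X, y = fetch_covtype(return_X_y=True)
print(X.shape, y.shape)  # expected: (581012, 54) and (581012,)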
Example #38
class S3BotoStorageFile(File):
    """
    The default file object used by the S3BotoStorage backend.

    This file implements file streaming using boto's multipart
    uploading functionality. The file can be opened in read or
    write mode.

    This class extends Django's File class. However, the contained
    data is only the data contained in the current buffer. So you
    should not access the contained file object directly. You should
    access the data via this class.

    Warning: This file *must* be closed using the close() method in
    order to properly write the file to S3. Be sure to close the file
    in your application.
    """
    # TODO: Read/Write (rw) mode may be a bit undefined at the moment. Needs testing.
    # TODO: When Django drops support for Python 2.5, rewrite to use the
    #       BufferedIO streams in the Python 2.6 io module.
    buffer_size = setting('AWS_S3_FILE_BUFFER_SIZE', 5242880)

    def __init__(self, name, mode, storage, buffer_size=None):
        self._storage = storage
        self.name = name[len(self._storage.location):].lstrip('/')
        self._mode = mode
        self.key = storage.bucket.get_key(self._storage._encode_name(name))
        if not self.key and 'w' in mode:
            self.key = storage.bucket.new_key(storage._encode_name(name))
        self._is_dirty = False
        self._file = None
        self._multipart = None
        # 5 MB is the minimum part size (if there is more than one part).
        # Amazon allows up to 10,000 parts.  The default supports uploads
        # up to roughly 50 GB.  Increase the part size to accommodate
        # files larger than this.
        if buffer_size is not None:
            self.buffer_size = buffer_size
        self._write_counter = 0

    @property
    def size(self):
        return self.key.size

    def _get_file(self):
        if self._file is None:
            self._file = SpooledTemporaryFile(
                max_size=self._storage.max_memory_size,
                suffix=".S3BotoStorageFile",
                dir=setting("FILE_UPLOAD_TEMP_DIR", None))
            if 'r' in self._mode:
                self._is_dirty = False
                self.key.get_contents_to_file(self._file)
                self._file.seek(0)
            if self._storage.gzip and self.key.content_encoding == 'gzip':
                self._file = GzipFile(mode=self._mode, fileobj=self._file)
        return self._file

    def _set_file(self, value):
        self._file = value

    file = property(_get_file, _set_file)

    def read(self, *args, **kwargs):
        if 'r' not in self._mode:
            raise AttributeError("File was not opened in read mode.")
        return super(S3BotoStorageFile, self).read(*args, **kwargs)

    def write(self, content, *args, **kwargs):
        if 'w' not in self._mode:
            raise AttributeError("File was not opened in write mode.")
        self._is_dirty = True
        if self._multipart is None:
            provider = self.key.bucket.connection.provider
            upload_headers = {provider.acl_header: self._storage.default_acl}
            upload_headers.update({
                'Content-Type':
                mimetypes.guess_type(self.key.name)[0]
                or self._storage.key_class.DefaultContentType
            })
            upload_headers.update(self._storage.headers)
            self._multipart = self._storage.bucket.initiate_multipart_upload(
                self.key.name,
                headers=upload_headers,
                reduced_redundancy=self._storage.reduced_redundancy)
        if self.buffer_size <= self._buffer_file_size:
            self._flush_write_buffer()
        return super(S3BotoStorageFile, self).write(force_bytes(content),
                                                    *args, **kwargs)

    @property
    def _buffer_file_size(self):
        pos = self.file.tell()
        self.file.seek(0, os.SEEK_END)
        length = self.file.tell()
        self.file.seek(pos)
        return length

    def _flush_write_buffer(self):
        """
        Flushes the write buffer.
        """
        if self._buffer_file_size:
            self._write_counter += 1
            self.file.seek(0)
            headers = self._storage.headers.copy()
            self._multipart.upload_part_from_file(self.file,
                                                  self._write_counter,
                                                  headers=headers)
            self.file.close()
            self._file = None

    def close(self):
        if self._is_dirty:
            self._flush_write_buffer()
            self._multipart.complete_upload()
        else:
            if not self._multipart is None:
                self._multipart.cancel_upload()
        self.key.close()
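As the class docstring stresses, close() is what completes the multipart upload; a hedged usage sketch (the storage instance and key name are assumed, not part of the snippet):

f = S3BotoStorageFile('logs/app.log.gz', 'w', storage)
f.write(b'log line\n')
f.close()  # required: completes the multipart upload on S3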
Example #39
def GetSinaUSStockList(page, last_update, mysql):
    #http://stock.finance.sina.com.cn/usstock/api/jsonp.php/IO.XSRV2.CallbackList%5B%27fa8Vo3U4TzVRdsLs%27%5D/US_CategoryService.getList?page=1&num=20&sort=&asc=0&market=&id=
    url = 'http://stock.finance.sina.com.cn/usstock/api/jsonp.php/IO.XSRV2.CallbackList%%5B%%27fa8Vo3U4TzVRdsLs%%27%%5D/US_CategoryService.getList?page=%d&num=60&sort=&asc=0&market=&id=' % page
    data = ''
    user_agent = 'Mozilla/5.0 (Windows NT 5.1; rv:12.0) Gecko/20100101 Firefox/12.0'
    headers = {
        'User-Agent': user_agent,
        'Host': 'stock.finance.sina.com.cn',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-us,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        #'Connection': 'keep-alive',
        'Referer': 'http://finance.sina.com.cn/stock/usstock/sector.shtml',
    }

    req = urllib2.Request(url, data, headers)
    resp = urllib2.urlopen(req)
    old_resp = resp

    if resp.headers.get("content-encoding") == "gzip":
        gz = GzipFile(fileobj=StringIO(resp.read()), mode="r")
        #resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
        resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url)
        resp.msg = old_resp.msg
        #json_html = gz.read()
        #print 'xxx'
    # deflate
    if resp.headers.get("content-encoding") == "deflate":
        gz = StringIO(deflate(resp.read()))
        #resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)  # 'class to add info() and
        resp = urllib2.addinfourl(gz, old_resp.headers,
                                  old_resp.url)  # 'class to add info() and
        resp.msg = old_resp.msg
        #json_html = gz.read()
        #print 'YY'
    json_html = resp.read()
    #print json_html
    #
    json_html = re.sub(r'([a-zA-Z_]+):', r'"\1":', json_html)
    #json_html = re.sub(r'(McDonald\')', r"McDonald'", json_html)
    #json_html = json_html.replace("McDonald\\'s", "McDonald_s")
    #json_html = json_html.replace("O\\'Reilly", "O_Reilly")
    json_html = json_html.replace("\\'", "_")

    j_start = json_html.find("IO.XSRV2.CallbackList['fa8Vo3U4TzVRdsLs']((")
    j_start = j_start + len("IO.XSRV2.CallbackList['fa8Vo3U4TzVRdsLs']((")
    j_end = json_html.rfind("));")
    #print j_end
    #print j_start
    your_string = json_html[j_start:j_end]
    your_string = your_string.decode('gbk')
    #data = json.loads(json_html[j_start:j_end])
    #yaml.load('[{id:"1",category:"basic materials"}]')
    #your_string = re.sub(r'([a-zA-Z_]+):', r'"\1":', your_string)
    #print your_string
    #yesterday = datetime.date.today() - datetime.timedelta(1)
    #print yesterday
    s_trade_day = last_update.strftime("%Y-%m-%d")
    print s_trade_day
    #exit(0)
    try:
        json_obj = json.loads(your_string)
        print json_obj['count']
        items = json_obj['data']
        for item in items:
            print item['symbol']
            sql = "SELECT * FROM  `stock_symbols`  WHERE `symbol` = '%s' " % item[
                'symbol'].encode('utf-8')
            result = mysql.query(sql)

            if result:
                #print "item['symbol']:%s in table" % item['symbol']
                result = mysql.query(sql)
                sid = result[0][0]
                #print sid
            else:
                sql = """INSERT INTO `stock_symbols` (`symbol`, `cname`, `fname`, `brief`, `ipodate`, `52weeklow`, `52weekhigh`, `lastpriceopen`, `lastpriceclose`, `lastpricehigh`, `lastpricelow`, `change`, `changepc`, `volumeoftoday`, `marketvalue`, `PE`, `industry`, `exchange`) VALUES
('%s', '%s', '%s', '', '0000-00-00', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', 0, '0.0000', '0.0000', 0, 0);""" % (
                    item['symbol'].encode('utf-8').strip(),
                    item['name'].encode('utf-8').strip(),
                    item['cname'].encode('utf-8').strip())
                #print sql
                result = mysql.query(sql)
                #print "INSERT", mysql.lastrowid()
                sid = mysql.lastrowid()
            try:
                if item['pe'] is None:
                    pe = 0
                else:
                    pe = item['pe'].encode('utf-8').strip()
                sql = "INSERT INTO `smartstrader`.`trade_daily_history` (`sid`, `preclose`, `openprice`, `closeprice`, `highprice`, `lowprice`, `volume`, `mktcap`, `pe`, `tradeday`) VALUES ('%d', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (
                    sid, item['preclose'].encode('utf-8').strip(),
                    item['open'].encode('utf-8').strip(),
                    item['price'].encode('utf-8').strip(),
                    item['high'].encode('utf-8').strip(),
                    item['low'].encode('utf-8').strip(),
                    item['volume'].encode('utf-8').strip(),
                    item['mktcap'].encode('utf-8').strip(), pe, s_trade_day)
            except AttributeError:
                print item
                raise

            #print sql
            try:
                result = mysql.query(sql)
            except MySQLdb.IntegrityError, error:
                print error

        #print json_obj[0]
        #print len(json_obj)
    except JSONDecodeError, e:
        print your_string
        print e
        raise
Example #40
def _fetch_brute_kddcup99(data_home=None,
                          download_if_missing=True,
                          percent10=True):
    """Load the kddcup99 dataset, downloading it if necessary.

    Parameters
    ----------
    data_home : string, optional
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing : boolean, default=True
        If False, raise a IOError if the data is not locally available
        instead of trying to download the data from the source site.

    percent10 : bool, default=True
        Whether to load only 10 percent of the data.

    Returns
    -------
    dataset : dict-like object with the following attributes:
        dataset.data : numpy array of shape (494021, 41)
            Each row corresponds to the 41 features in the dataset.
        dataset.target : numpy array of shape (494021,)
            Each value corresponds to one of the 21 attack types or to the
            label 'normal.'.
        dataset.DESCR : string
            Description of the kddcup99 dataset.

    """

    data_home = get_data_home(data_home=data_home)
    if sys.version_info[0] == 3:
        # The zlib compression format used by joblib is not compatible when
        # switching from Python 2 to Python 3, let us use a separate folder
        # under Python 3:
        dir_suffix = "-py3"
    else:
        # Backward compat for Python 2 users
        dir_suffix = ""

    if percent10:
        kddcup_dir = join(data_home, "kddcup99_10" + dir_suffix)
        archive = ARCHIVE_10_PERCENT
    else:
        kddcup_dir = join(data_home, "kddcup99" + dir_suffix)
        archive = ARCHIVE

    samples_path = join(kddcup_dir, "samples")
    targets_path = join(kddcup_dir, "targets")
    available = exists(samples_path)

    if download_if_missing and not available:
        _mkdirp(kddcup_dir)
        logger.info("Downloading %s" % archive.url)
        _fetch_remote(archive, dirname=kddcup_dir)
        dt = [('duration', int), ('protocol_type', 'S4'), ('service', 'S11'),
              ('flag', 'S6'), ('src_bytes', int), ('dst_bytes', int),
              ('land', int), ('wrong_fragment', int), ('urgent', int),
              ('hot', int), ('num_failed_logins', int), ('logged_in', int),
              ('num_compromised', int), ('root_shell', int),
              ('su_attempted', int), ('num_root', int),
              ('num_file_creations', int), ('num_shells', int),
              ('num_access_files', int), ('num_outbound_cmds', int),
              ('is_host_login', int), ('is_guest_login', int), ('count', int),
              ('srv_count', int), ('serror_rate', float),
              ('srv_serror_rate', float), ('rerror_rate', float),
              ('srv_rerror_rate', float), ('same_srv_rate', float),
              ('diff_srv_rate', float), ('srv_diff_host_rate', float),
              ('dst_host_count', int), ('dst_host_srv_count', int),
              ('dst_host_same_srv_rate', float),
              ('dst_host_diff_srv_rate', float),
              ('dst_host_same_src_port_rate', float),
              ('dst_host_srv_diff_host_rate', float),
              ('dst_host_serror_rate', float),
              ('dst_host_srv_serror_rate', float),
              ('dst_host_rerror_rate', float),
              ('dst_host_srv_rerror_rate', float), ('labels', 'S16')]
        DT = np.dtype(dt)
        logger.debug("extracting archive")
        archive_path = join(kddcup_dir, archive.filename)
        file_ = GzipFile(filename=archive_path, mode='r')
        Xy = []
        for line in file_.readlines():
            if six.PY3:
                line = line.decode()
            Xy.append(line.replace('\n', '').split(','))
        file_.close()
        logger.debug('extraction done')
        os.remove(archive_path)

        Xy = np.asarray(Xy, dtype=object)
        for j in range(42):
            Xy[:, j] = Xy[:, j].astype(DT[j])

        X = Xy[:, :-1]
        y = Xy[:, -1]
        # XXX bug when compress!=0:
        # (error: 'Incorrect data length while decompressing[...] the file
        #  could be corrupted.')

        joblib.dump(X, samples_path, compress=0)
        joblib.dump(y, targets_path, compress=0)
    elif not available:
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")

    try:
        X, y
    except NameError:
        X = joblib.load(samples_path)
        y = joblib.load(targets_path)

    return Bunch(data=X, target=y, DESCR=__doc__)
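# --- Hedged usage sketch (not part of the scikit-learn source): the same
# line-by-line GzipFile read used above, applied to a hypothetical local
# archive. The filename 'kddcup_sample.gz' is an illustrative assumption.
from gzip import GzipFile

def read_gzipped_csv(path='kddcup_sample.gz'):
    rows = []
    with GzipFile(filename=path, mode='rb') as f:
        for line in f:
            # GzipFile yields bytes under Python 3; decode before splitting
            rows.append(line.decode('ascii').rstrip('\n').split(','))
    return rows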
Exemple #41
0
class NBTFile(TAG_Compound):
    """Represent an NBT file object."""
    def __init__(self, filename=None, buffer=None, fileobj=None):
        """
        Create a new NBTFile object.
        Specify either a filename, file object or data buffer.
        If a filename or file object is specified, data should be GZip-compressed.
        If a data buffer is specified, it is assumed to be uncompressed.
        
        If filename is specified, the file is closed after reading and writing.
        If file object is specified, the caller is responsible for closing the file.
        """
        super(NBTFile, self).__init__()
        self.filename = filename
        self.type = TAG_Byte(self.id)
        closefile = True
        #make a file object
        if filename:
            self.filename = filename
            self.file = GzipFile(filename, 'rb')
        elif buffer:
            if hasattr(buffer, 'name'):
                self.filename = buffer.name
            self.file = buffer
            closefile = False
        elif fileobj:
            if hasattr(fileobj, 'name'):
                self.filename = fileobj.name
            self.file = GzipFile(fileobj=fileobj)
        else:
            self.file = None
            closefile = False
        #parse the file given initially
        if self.file:
            self.parse_file()
            if closefile:
                # Note: GzipFile().close() does NOT close the fileobj,
                # so we are still responsible for closing that.
                try:
                    self.file.close()
                except (AttributeError, IOError):
                    pass
            self.file = None

    def parse_file(self, filename=None, buffer=None, fileobj=None):
        """Completely parse a file, extracting all tags."""
        if filename:
            self.file = GzipFile(filename, 'rb')
        elif buffer:
            if hasattr(buffer, 'name'):
                self.filename = buffer.name
            self.file = buffer
        elif fileobj:
            if hasattr(fileobj, 'name'):
                self.filename = fileobj.name
            self.file = GzipFile(fileobj=fileobj)
        if self.file:
            try:
                type = TAG_Byte(buffer=self.file)
                if type.value == self.id:
                    name = TAG_String(buffer=self.file).value
                    self._parse_buffer(self.file)
                    self.name = name
                    self.file.close()
                else:
                    raise MalformedFileError("First record is not a Compound Tag")
            except StructError as e:
                raise MalformedFileError("Partial File Parse: file possibly truncated.")
        else:
            raise ValueError("NBTFile.parse_file(): Need to specify either a filename or a file object")

    def write_file(self, filename=None, buffer=None, fileobj=None):
        """Write this NBT file to a file."""
        closefile = True
        if buffer:
            self.filename = None
            self.file = buffer
            closefile = False
        elif filename:
            self.filename = filename
            self.file = GzipFile(filename, "wb")
        elif fileobj:
            self.filename = None
            self.file = GzipFile(fileobj=fileobj, mode="wb")
        elif self.filename:
            self.file = GzipFile(self.filename, "wb")
        elif not self.file:
            raise ValueError("NBTFile.write_file(): Need to specify either a filename or a file object")
        #Render tree to file
        TAG_Byte(self.id)._render_buffer(self.file)
        TAG_String(self.name)._render_buffer(self.file)
        self._render_buffer(self.file)
        #make sure the file is complete
        try:
            self.file.flush()
        except (AttributeError, IOError):
            pass
        if closefile:
            try:
                self.file.close()
            except (AttributeError, IOError):
                pass

    def __repr__(self):
        """
        Return a string (ascii formatted for Python 2, unicode
        for Python 3) describing the class, name and id for
        debugging purposes.
        """
        if self.filename:
            return "<%s(%r) with %s(%r) at 0x%x>" % (self.__class__.__name__, self.filename, \
                    TAG_Compound.__name__, self.name, id(self))
        else:
            return "<%s with %s(%r) at 0x%x>" % (self.__class__.__name__, \
                    TAG_Compound.__name__, self.name, id(self))
Exemple #42
0
def GetSinaUSStockCategory(mysql):
    url = 'http://stock.finance.sina.com.cn/usstock/api/jsonp.php/var%20category=/US_CategoryService.getCategory'
    data = ''
    user_agent = 'Mozilla/5.0 (Windows NT 5.1; rv:12.0) Gecko/20100101 Firefox/12.0'
    headers = {
        'User-Agent': user_agent,
        'Host': 'stock.finance.sina.com.cn',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-us,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        #'Connection': 'keep-alive',
        'Referer': 'http://finance.sina.com.cn/stock/usstock/sector.shtml',
    }

    req = urllib2.Request(url, data, headers)
    resp = urllib2.urlopen(req)
    old_resp = resp

    if resp.headers.get("content-encoding") == "gzip":
        gz = GzipFile(fileobj=StringIO(resp.read()), mode="r")
        #resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
        resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url)
        resp.msg = old_resp.msg
        #json_html = gz.read()
        #print 'xxx'
    # deflate
    if resp.headers.get("content-encoding") == "deflate":
        gz = StringIO(deflate(resp.read()))
        #resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)  # 'class to add info() and
        resp = urllib2.addinfourl(gz, old_resp.headers,
                                  old_resp.url)  # 'class to add info() and
        resp.msg = old_resp.msg
        #json_html = gz.read()
        #print 'YY'
    json_html = resp.read()
    #
    j_start = json_html.find("var category=(")
    j_start = j_start + len("var category=(")
    j_end = json_html.rfind(");")
    #print j_end
    #print j_start
    #print json_html[j_start:j_end]
    #data = json.loads(json_html[j_start:j_end])
    #yaml.load('[{id:"1",category:"basic materials"}]')
    your_string = re.sub(r'([a-zA-Z_]+):', r'"\1":', json_html[j_start:j_end])
    #print your_string
    json_obj = json.loads(your_string.decode('gbk'))
    #print json_obj[0]
    #print len(json_obj)
    for item in json_obj:
        print item['id']
        print item['category']
        print item['category_cn']
        print item['parent']
        #print item['child']
        for citem in item['child']:
            print '\t', citem['id']
            print '\t', citem['category']
            print '\t', citem['category_cn']
            print '\t', citem['parent']
        print "####"
Exemple #43
0
    def checkin(self, timeout=11):
        """Gather system status."""

        # Compile checkin data
        checkin_start = time.time()
        status = {
            'usage': events.usage(),
            'uptime': system_utilities.uptime(),
            'system_utilization': self.utilization_tracker.get_data(),
        }

        # Append status if we can
        try:
            #get the software versions
            status['versions'] = bts.get_versions()
        except BSSError as e:
            logger.error("bts get_versions error: %s" % e)

        try:
            # Gather camped subscriber list
            status['camped_subscribers'] = bts.active_subscribers()
        except BSSError as e:
            logger.error("bts get active_subscribers error: %s" % e)

        # Gather tower load and noise data.
        # NOTE(matt): these values can vary quite a bit over a minute. It
        #       might be worth capturing data more frequently and sending
        #       something like average or median values.
        status['openbts_load'] = {}
        try:
            status['openbts_load'] = bts.get_load()
        except BSSError as e:
            logger.error("bts get_load error: %s" % e)

        for key, val in self._checkin_load_stats.items():
            status['openbts_load']['checkin.' + key] = val
        self._checkin_load_stats.clear()

        try:
            status['openbts_noise'] = bts.get_noise()
        except BSSError as e:
            logger.error("bts get_noise error: %s" % e)

        status['radio'] = {}
        try:
            status['radio']['band'] = bts.get_band()
            # eventually need to also grab all used channels, not just c0
            # TODO: (kheimerl) T13270338 Add multiband support
            status['radio']['c0'] = bts.get_arfcn_c0()
            #also add power here eventually
            # TODO: (kheimerl) T13270365 Add power level support
        except BSSError as e:
            #delete the key if this failed
            del status['radio']
            logger.error("bts radio error: %s" % e)

        # Add balance sync data
        status['subscribers'] = subscriber.get_subscriber_states(
            imsis=events.EventStore().modified_subs())

        # Add delta protocol context (if available) to let server know,
        # client supports delta optimization & has a prior delta state
        if delta.DeltaProtocol.CTX_KEY not in status:  # just a precaution
            sections_ctx = {}
            for section, ctx in CheckinHandler.section_ctx.items():
                if ctx:
                    sections_ctx[section] = ctx.to_proto_dict()

            if sections_ctx:
                status[delta.DeltaProtocol.CTX_KEY] = {
                    delta.DeltaProtocolOptimizer.SECTIONS_CTX_KEY: sections_ctx
                }

        # Send checkin request.
        uuid = snowflake.snowflake()
        data = {
            'status': status,
            'bts_uuid': uuid,
        }
        headers = dict(self.auth_header)
        # Set content type to app/json & utf-8, compressed or not - JSON should
        # be more efficient than URL-encoded JSON form payload
        headers['Content-Type'] = 'application/json; charset=utf-8'
        data_json = json.dumps(data)
        decompressed_status_len = len(data_json)
        status_len = decompressed_status_len

        if status_len > endaga_ic.MIN_COMPRESSIBLE_REQUEST_SZ:
            # try to gzip payload, send uncompressed if compression failed
            try:
                gzbuf = BytesIO()
                with GzipFile(mode='wb', fileobj=gzbuf) as gzfile:
                    gzfile.write(data_json)
                data_json = gzbuf.getvalue()
                # Using Content-Encoding header since AWS cannot handle
                # Transfer-Encoding header which would be more appropriate here
                headers['Content-Encoding'] = 'gzip'
                status_len = len(data_json)  # set len to reflect compression
            except BaseException as e:
                logger.error("Checkin request Gzip error: %s" % e)

        headers['Content-Length'] = str(status_len)

        post_start = time.time()
        try:
            r = self.session.post(
                self.conf['registry'] + "/checkin?id=" +
                # add part of uuid to the query, it helps with
                # debugging & server side logging and can
                # be used by LBs
                uuid[:8],
                headers=headers,
                data=data_json,
                timeout=timeout,
                cookies=self._session_cookies)

        except BaseException as e:
            logger.error("Endaga: checkin failed , network error: %s." % e)
            self._cleanup_session()
            self._checkin_load_stats['req_sz'] = status_len
            self._checkin_load_stats['raw_req_sz'] = decompressed_status_len
            self._checkin_load_stats['post_lat'] = time.time() - post_start
            raise

        post_end = time.time()

        # Make sure either server sent charset or we set it to utf-8 (JSON
        # default)
        if not r.encoding:
            r.encoding = 'utf-8'

        text = r.text
        decompressed_response_len = len(text)
        response_len = decompressed_response_len

        # Try to get correct content length from HTTP headers, it should
        # reflect correctly compressed length. if it fails - fall back to
        # getting length of returned text
        cont_len = r.headers.get('Content-Length')
        if cont_len:
            try:
                response_len = int(cont_len)
            except BaseException:
                pass

        if r.status_code == 200:
            try:
                CheckinHandler(text)
                logger.info("Endaga: checkin success.")
                if r.cookies is not None:
                    if self._session_cookies is None:
                        # First time cookies are seen from server
                        # initialize the cookies dict
                        self._session_cookies = dict(r.cookies)
                    else:
                        for key, value in r.cookies.items():
                            # if server sent new/updated cookies, update them,
                            # but keep previously set cookies as well. ELBs
                            # do not send AWSELB cookies on every request &
                            # expect clients to 'remember' them
                            self._session_cookies[key] = value
            except BaseException:
                self._cleanup_session()
                raise
        else:
            logger.error("Endaga: checkin failed (%d), reason: %s, body: %s" %
                         (r.status_code, r.reason, r.text))
            # cleanup session on any error
            if r.status_code >= 300:
                self._cleanup_session()

        checkin_end = time.time()

        self._checkin_load_stats['req_sz'] = status_len  # request payload SZ
        self._checkin_load_stats['raw_req_sz'] = decompressed_status_len
        self._checkin_load_stats[
            'rsp_sz'] = response_len  # response payload SZ
        self._checkin_load_stats['raw_rsp_sz'] = decompressed_response_len
        # Checkin Latencies
        self._checkin_load_stats['post_lat'] = post_end - post_start
        self._checkin_load_stats['process_lat'] = checkin_end - post_end
        self._checkin_load_stats['lat'] = checkin_end - checkin_start

        data['response'] = {'status': r.status_code, 'text': r.text}
        return data
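# --- Hedged sketch (not part of the client above): the same "gzip the JSON
# request body and flag it with Content-Encoding" pattern in isolation.
# The URL and the size threshold are illustrative assumptions.
import json
from gzip import GzipFile
from io import BytesIO

import requests

def post_json_gzipped(url, payload, min_compressible=512):
    body = json.dumps(payload).encode('utf-8')
    headers = {'Content-Type': 'application/json; charset=utf-8'}
    if len(body) > min_compressible:
        buf = BytesIO()
        with GzipFile(mode='wb', fileobj=buf) as gz:
            gz.write(body)
        body = buf.getvalue()
        headers['Content-Encoding'] = 'gzip'   # the server must accept gzip bodies
    return requests.post(url, data=body, headers=headers)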
Exemple #44
0
def compress_string(s):
    zbuf = BytesIO()
    with GzipFile(mode='wb', compresslevel=6, fileobj=zbuf, mtime=0) as zfile:
        zfile.write(s)
    return zbuf.getvalue()
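# --- Hedged counterpart sketch (not from the original source): reversing
# compress_string() with the same GzipFile/BytesIO pairing.
from gzip import GzipFile
from io import BytesIO

def decompress_string(data):
    with GzipFile(mode='rb', fileobj=BytesIO(data)) as zfile:
        return zfile.read()

# round trip: decompress_string(compress_string(b'hello')) == b'hello'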
Exemple #45
0
 def _write_compressed(self, fileobj):
     with GzipFile(fileobj=fileobj, mode='w') as gz_f:
         gz_f.writelines(self.data)
Exemple #46
0
 def _ungzip(self, data):
     """
     Un-gzip some data.
     """
     s = StringIO(data)
     return GzipFile(fileobj=s, mode='rb').read()
Exemple #47
0
 def __exit__(self, exc_type, exc_value, traceback):
     if hasattr(GzipFile, '__exit__'):
         return GzipFile.__exit__(self, exc_type, exc_value, traceback)
     else:
         return self.close()
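# --- Hedged sketch (an assumption, not the original class): the back-compat
# __exit__ shim above, paired with a matching __enter__, as one self-contained
# wrapper so "with" works even on Pythons whose GzipFile lacks context-manager
# support.
from gzip import GzipFile

class ContextGzipFile(GzipFile):
    def __enter__(self):
        if hasattr(GzipFile, '__enter__'):
            return GzipFile.__enter__(self)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if hasattr(GzipFile, '__exit__'):
            return GzipFile.__exit__(self, exc_type, exc_value, traceback)
        return self.close()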
Exemple #48
0
def add_file():
    tags = request.forms.get('tag_list')
    uploads = request.files.getlist('file')

    # Set Project
    project = request.forms.get('project')
    if project in project_list():
        __project__.open(project)
    else:
        __project__.open('../')
        project = 'Main'
    db = Database()
    file_list = []
    # Write temp file to disk
    with upload_temp() as temp_dir:
        for upload in uploads:
            file_path = os.path.join(temp_dir, upload.filename)
            with open(file_path, 'w') as tmp_file:
                tmp_file.write(upload.file.read())
            # Zip Files
            if request.forms.get('compression') == 'zip':
                zip_pass = request.forms.get('zip_pass')
                try:
                    with ZipFile(file_path) as zf:
                        zf.extractall(temp_dir, pwd=zip_pass)
                    for root, dirs, files in os.walk(temp_dir, topdown=False):
                        for name in files:
                            if not name == upload.filename:
                                file_list.append(os.path.join(root, name))
                except Exception as e:
                    return template('error.tpl',
                                    error="Error with zipfile - {0}".format(e))
            # GZip Files
            elif request.forms.get('compression') == 'gz':
                try:
                    gzf = GzipFile(file_path, 'rb')
                    decompress = gzf.read()
                    gzf.close()
                    with open(file_path[:-3], "wb") as df:
                        df.write(decompress)
                    file_list.append(file_path[:-3])
                except Exception as e:
                    return template(
                        'error.tpl',
                        error="Error with gzipfile - {0}".format(e))
            # BZip2 Files
            elif request.forms.get('compression') == 'bz2':
                try:
                    bz2f = BZ2File(file_path, 'rb')
                    decompress = bz2f.read()
                    bz2f.close()
                    with open(file_path[:-3], "wb") as df:
                        df.write(decompress)
                    file_list.append(file_path[:-3])
                except Exception as e:
                    return template(
                        'error.tpl',
                        error="Error with bzip2file - {0}".format(e))
            # Tar Files (any, including tar.gz tar.bz2)
            elif request.forms.get('compression') == 'tar':
                try:
                    if not tarfile.is_tarfile(file_path):
                        return template('error.tpl',
                                        error="This is not a tar file")
                    with tarfile.open(file_path, 'r:*') as tarf:
                        tarf.extractall(temp_dir)
                    for root, dirs, files in os.walk(temp_dir, topdown=False):
                        for name in files:
                            if not name == upload.filename:
                                file_list.append(os.path.join(root, name))
                except Exception as e:
                    return template('error.tpl',
                                    error="Error with tarfile - {0}".format(e))
            # Non zip files
            elif request.forms.get('compression') == 'none':
                file_list.append(file_path)

        # Add each file
        for new_file in file_list:
            print new_file
            obj = File(new_file)
            new_path = store_sample(obj)
            success = True
            if new_path:
                # Add file to the database.
                success = db.add(obj=obj, tags=tags)
                if not success:
                    return template(
                        'error.tpl',
                        error="Unable to Store The File: {0}".format(
                            upload.filename))
    redirect("/project/{0}".format(project))
Exemple #49
0
 def __init__(self, file):
     data = GzipFile(fileobj=file, mode="rb").read()
     self.size = len(data)
     self.name = file.name
     super(GzipChunk, self).__init__(data)
Exemple #50
0
 def __enter__(self):
     if hasattr(GzipFile, '__enter__'):
         return GzipFile.__enter__(self)
     else:
         return self
Exemple #51
0
 def __enter__(self):
     file_handle = GzipFile(fileobj=open(self.file_name, 'wb'), mode='wb')
     self.set_file_handle(file_handle)
     self.add_file_to_registry()
     return self
Exemple #52
0
    def _update(self, version):
        from poetry.utils.helpers import temporary_directory

        platform = sys.platform
        if platform == "linux2":
            platform = "linux"

        checksum = "poetry-{}-{}.sha256sum".format(version, platform)

        try:
            r = urlopen(self.BASE_URL + "/{}/{}".format(version, checksum))
        except HTTPError as e:
            if e.code == 404:
                raise RuntimeError("Could not find {} file".format(checksum))

            raise

        checksum = r.read().decode()

        # We get the payload from the remote host
        name = "poetry-{}-{}.tar.gz".format(version, platform)
        try:
            r = urlopen(self.BASE_URL + "/{}/{}".format(version, name))
        except HTTPError as e:
            if e.code == 404:
                raise RuntimeError("Could not find {} file".format(name))

            raise

        meta = r.info()
        size = int(meta["Content-Length"])
        current = 0
        block_size = 8192

        bar = self.progress_bar(max=size)
        bar.set_format(
            " - Downloading <info>{}</> <comment>%percent%%</>".format(name))
        bar.start()

        sha = hashlib.sha256()
        with temporary_directory(prefix="poetry-updater-") as dir_:
            tar = os.path.join(dir_, name)
            with open(tar, "wb") as f:
                while True:
                    buffer = r.read(block_size)
                    if not buffer:
                        break

                    current += len(buffer)
                    f.write(buffer)
                    sha.update(buffer)

                    bar.set_progress(current)

            bar.finish()

            # Checking hashes
            if checksum != sha.hexdigest():
                raise RuntimeError(
                    "Hashes for {} do not match: {} != {}".format(
                        name, checksum, sha.hexdigest()))

            gz = GzipFile(tar, mode="rb")
            try:
                with tarfile.TarFile(tar,
                                     fileobj=gz,
                                     format=tarfile.PAX_FORMAT) as f:
                    f.extractall(str(self.lib))
            finally:
                gz.close()
Exemple #53
0
except:
    old = ''

files = []
for dirs in sys.argv[1:]:
    files += [dirs + '/' + x for x in os.listdir(dirs)]

files = set(files) - (knownfiles)

output = codecs.open('kv7planning.idx', 'w', 'UTF-8')
output.write(old)

for filename in sorted(files):
    localservicelevelcodes = set([])

    for line in GzipFile(filename, 'r'):
        if line[0] == '\\':
            if dumping:
                dumping = False

            if table == 'LOCALSERVICEGROUPPASSTIME' and line[1] == 'L':
                dumping = True

            elif line[1] == 'T':
                table = line[2:].split('|')[0]

        else:
            if dumping:
                line = line.decode('UTF-8').split('|')
                localservicelevelcodes.add(line[0] + "|" + line[1])
Exemple #54
0
def eval():

    files = glob(data + "/" + "*.zip")
    files.sort()

    print len(files), "found"

    for fileName in files[:]:

        print fileName

        # s_time = time.time()
        smp = pp.GestureSample(fileName)
        # print "loading", (time.time()-s_time)/1000.,"ms"
        # s_time = time.time()
        n = smp.data['numFrames']
        dv, uv, gv = smp.depth, smp.user, smp.rgb

        cur_fr = 1
        # new_shape = (step,128,128)

        s = []
        d, u, g = [empty((n_f, ) + vid_res + (3, ), "uint8") for _ in range(3)]
        # take first n_f frames
        for v in dv, uv, gv:
            pp.go_to_frame(v, cur_fr)
        for i, fr in enumerate(range(cur_fr, cur_fr + n_f)):
            s.append(smp.getSkeleton(fr))
            d[i], u[i], g[i] = [v.read()[1] for v in dv, uv, gv]

        d, u, g = [pp.to_grayscale(v) for v in d, u, g]
        u[u < 128], u[u >= 128] = 0, 1
        depth, user, gray, skelet = d, u, g, s
        user_o = user.copy()
        depth_o = depth.copy()
        gray_o = gray.copy()
        # user_depth = depth_o[user_o==1]
        skelet, c = pp.proc_skelet(array(skelet).copy())
        user = pp.proc_user(user)

        _, depth, c = pp.proc_depth(depth.copy(), user.copy(), user_o,
                                    array(skelet).copy())
        gray, c = pp.proc_gray(
            gray.copy(), user,
            array(skelet).copy())  #user.copy!!!!!!!!!!!!!!!!!!!
        cur_fr += n_f

        predictions = []
        while cur_fr + step < n:
            # time_start = time.time()
            sn = []
            dn, un, gn = [
                empty((step, ) + vid_res + (3, ), "uint8") for _ in range(3)
            ]
            # for v in dv,uv,gv: pp.go_to_frame(v, cur_fr)
            for i, fr in enumerate(range(cur_fr, cur_fr + step)):
                sn.append(smp.getSkeleton(fr))
                dn[i], un[i], gn[i] = [v.read()[1] for v in dv, uv, gv]

            dn, un, gn = [pp.to_grayscale(v) for v in dn, un, gn]
            un[un < 128], un[un >= 128] = 0, 1

            s = s[step:] + sn
            # s.extend(sn)
            skelet, c = pp.proc_skelet(s, _3D=False)

            # len_dump = len(depth_o[:step][user_o[:step]==1])
            # un_d = dn[un==1]

            user_o[:-step] = user_o[step:]
            user_o[-step:] = un.copy()
            un = pp.proc_user(un, 3)

            user[:-step] = user[step:]
            user[-step:] = un.copy()

            depth_o[:-step] = depth_o[step:]
            depth_o[-step:] = dn.copy()
            gray_o[:-step] = gray_o[step:]
            gray_o[-step:] = gn.copy()

            _, depth, c = pp.proc_depth(depth_o.copy(), user.copy(), user_o,
                                        skelet)
            gray, c = pp.proc_gray(gray_o.copy(), user, skelet)
            traj2D, traj3D, ori, pheight, hand, center = skelet

            video = empty((
                1,
                2,
            ) + gray.shape, dtype="uint8")
            video[0, 0] = gray.copy()
            video[0, 1] = depth.copy()
            video = video.swapaxes(1, 2)  #(body-hand,gray-depth,fr,h,w)
            v_new = empty((1, 2, 2) + vid_shape, dtype="uint8")
            # p = pheight
            ratio = 0.25
            for i in xrange(video.shape[0]):  #batch

                if pheight < 10: pheight = 100
                scale = ratio  #+randi(2)/100.
                ofs = pheight * scale
                mid = video.shape[-1] / 2.
                sli = None
                if ofs < mid:
                    start = int(round(mid - ofs))
                    end = int(round(mid + ofs))
                    sli = slice(start, end)

                for j in xrange(video.shape[2]):  #maps
                    for k in xrange(video.shape[3]):  #frames
                        #body
                        img = video[i, 0, j, k]
                        img = cut_img(img, 5)
                        img = misc.imresize(img, (h, h))
                        # if j==0: img = 255-misc.imfilter(img,"contour")
                        v_new[i, 0, j, k] = img

                        #hand
                        img = video[i, 1, j, k]
                        img = img[sli, sli]
                        img = misc.imresize(img, (h, h))
                        v_new[i, 1, j, k] = img
            # print "put"
            # pred_loop(v_new,cur_fr,n, fileName)
            x_.set_value(v_new.astype("float32"), borrow=True)
            pred = evalu_model()[0][0]
            predictions.append(pred)

            cur_fr += step

        predictions = array(predictions, float32)
        pred_file_name = fileName.split('/')
        pred_file_name = pred_file_name[-1].replace(".zip", "_prediction.zip")
        file = GzipFile(dst + "/" + pred_file_name, 'wb')
        dump(predictions, file, -1)
        file.close()
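# --- Hedged counterpart sketch (not from the original script): reading back
# one of the gzip-pickled prediction files written above. The path argument is
# an assumption.
from gzip import GzipFile
try:
    import cPickle as pickle   # Python 2
except ImportError:
    import pickle

def load_predictions(path):
    with GzipFile(path, 'rb') as f:
        return pickle.load(f)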
Exemple #55
0
        input_end_time = time()

        input_length = len(body)
        if verbose:
            logger.info("Read '%s' (%d bytes)" % (input_name, input_length))

        logger.debug("File Response Headers: %s" % (str(headers), ))

        output_name = input_name.replace('/', '_')
        output_length = input_length

        # To gzip or not gzip, that is the question
        if not input_name.endswith('.gz') and gzip_data:
            output = StringIO.StringIO()
            output_name += ".gz"
            with GzipFile(filename=output_name, mode='w', fileobj=output) as of:
                of.write(body)
            body = output.getvalue()
            output_length = len(body)
            logger.debug("Gzipped Body is now %d bytes "% (output_length, ))

        output_hdfs_file = hdfs_output_dir + '/' + output_name

        if verbose:
            logger.info("Writing %s (%d bytes)" % (output_name, output_length))


        (tmp_fd, tmp_filename) = tempfile.mkstemp()
        try:
            with open(tmp_filename, "wb") as fp:
                fp.write(body)
Exemple #56
0
def compress_string(s):
    zbuf = BytesIO()
    zfile = GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    zfile.write(s)
    zfile.close()
    return zbuf.getvalue()
Exemple #57
0
 def _put_file(self, name, content):
     name = self._path(name)
     placeholder = False
     if self.cache:
         if not self.cache.exists(name):
             self.cache.save(name, 0, 0)
             placeholder = True
     content_type = mimetypes.guess_type(name)[0] or "application/x-octet-stream"
     headers = {}
     for pattern in self.headers:
         if pattern[0].match(name):
             headers = pattern[1].copy()
             break
     file_pos = content.tell()
     content.seek(0, 2)
     content_length = content.tell()
     content.seek(0)
     gz_cts = getattr(
         settings,
         'CUDDLYBUDDLY_STORAGE_S3_GZIP_CONTENT_TYPES',
         (
             'text/css',
             'application/javascript',
             'application/x-javascript'
         )
     )
     gz_content = None
     if content_length > 1024 and content_type in gz_cts:
         gz_content = StringIO()
         gzf = GzipFile(mode='wb', fileobj=gz_content)
         gzf.write(content.read())
         content.seek(0)
         gzf.close()
         gz_content.seek(0, 2)
         gz_content_length = gz_content.tell()
         gz_content.seek(0)
         if gz_content_length < content_length:
             content_length = gz_content_length
             headers.update({
                 'Content-Encoding': 'gzip'
             })
         else:
             gz_content = None
     headers.update({
         'Content-Type': content_type,
         'Content-Length': str(content_length)
     })
     # Httplib in < 2.6 doesn't accept file like objects. Meanwhile in
     # >= 2.7 it will try to join a content str object with the headers which
     # results in encoding problems.
     if sys.version_info[0] == 2 and sys.version_info[1] < 6:
         content_to_send = gz_content.read() if gz_content is not None else content.read()
     else:
         content_to_send = gz_content if gz_content is not None else content
     response = self.connection.put(self.bucket, name, content_to_send, headers)
     content.seek(file_pos)
     if response.http_response.status != 200:
         if placeholder:
             self.cache.remove(name)
         raise S3Error(response.message)
     if self.cache:
         date = response.http_response.getheader('Date')
         date = timegm(parsedate(date))
         self.cache.save(name, size=content_length, mtime=date)
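# --- Hedged sketch (not from the storage backend above): the "compress, but
# only keep the gzip body if it is actually smaller" decision in isolation.
# The content-type whitelist and 1024-byte floor mirror the example's defaults.
from gzip import GzipFile
from io import BytesIO

def maybe_gzip(data, content_type,
               compressible=('text/css', 'application/javascript')):
    if content_type not in compressible or len(data) <= 1024:
        return data, False
    buf = BytesIO()
    with GzipFile(mode='wb', fileobj=buf) as gz:
        gz.write(data)
    packed = buf.getvalue()
    if len(packed) < len(data):
        return packed, True      # caller should add a Content-Encoding: gzip header
    return data, False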
Exemple #58
0
class S3Boto3StorageFile(File):
    """
    The default file object used by the S3Boto3Storage backend.

    This file implements file streaming using boto's multipart
    uploading functionality. The file can be opened in read or
    write mode.

    This class extends Django's File class. However, the contained
    data is only the data contained in the current buffer. So you
    should not access the contained file object directly. You should
    access the data via this class.

    Warning: This file *must* be closed using the close() method in
    order to properly write the file to S3. Be sure to close the file
    in your application.
    """
    # TODO: Read/Write (rw) mode may be a bit undefined at the moment. Needs testing.
    # TODO: When Django drops support for Python 2.5, rewrite to use the
    #       BufferedIO streams in the Python 2.6 io module.
    buffer_size = setting('AWS_S3_FILE_BUFFER_SIZE', 5242880)

    def __init__(self, name, mode, storage, buffer_size=None):
        self._storage = storage
        self.name = name[len(self._storage.location):].lstrip('/')
        self._mode = mode
        self.obj = storage.bucket.Object(storage._encode_name(name))
        if 'w' not in mode:
            # Force early RAII-style exception if object does not exist
            self.obj.load()
        self._is_dirty = False
        self._file = None
        self._multipart = None
        # 5 MB is the minimum part size (if there is more than one part).
        # Amazon allows up to 10,000 parts.  The default supports uploads
        # up to roughly 50 GB.  Increase the part size to accommodate
        # files larger than this.
        if buffer_size is not None:
            self.buffer_size = buffer_size
        self._write_counter = 0

    @property
    def size(self):
        return self.obj.content_length

    def _get_file(self):
        if self._file is None:
            self._file = SpooledTemporaryFile(
                max_size=self._storage.max_memory_size,
                suffix=".S3Boto3StorageFile",
                dir=setting("FILE_UPLOAD_TEMP_DIR", None))
            if 'r' in self._mode:
                self._is_dirty = False
                self._file.write(self.obj.get()['Body'].read())
                self._file.seek(0)
            if self._storage.gzip and self.obj.content_encoding == 'gzip':
                self._file = GzipFile(mode=self._mode,
                                      fileobj=self._file,
                                      mtime=0.0)
        return self._file

    def _set_file(self, value):
        self._file = value

    file = property(_get_file, _set_file)

    def read(self, *args, **kwargs):
        if 'r' not in self._mode:
            raise AttributeError("File was not opened in read mode.")
        return super(S3Boto3StorageFile, self).read(*args, **kwargs)

    def write(self, content):
        if 'w' not in self._mode:
            raise AttributeError("File was not opened in write mode.")
        self._is_dirty = True
        if self._multipart is None:
            parameters = self._storage.object_parameters.copy()
            parameters['ACL'] = self._storage.default_acl
            parameters['ContentType'] = (mimetypes.guess_type(self.obj.key)[0]
                                         or self._storage.default_content_type)
            if self._storage.reduced_redundancy:
                parameters['StorageClass'] = 'REDUCED_REDUNDANCY'
            if self._storage.encryption:
                parameters['ServerSideEncryption'] = 'AES256'
            self._multipart = self.obj.initiate_multipart_upload(**parameters)
        if self.buffer_size <= self._buffer_file_size:
            self._flush_write_buffer()
        return super(S3Boto3StorageFile, self).write(force_bytes(content))

    @property
    def _buffer_file_size(self):
        pos = self.file.tell()
        self.file.seek(0, os.SEEK_END)
        length = self.file.tell()
        self.file.seek(pos)
        return length

    def _flush_write_buffer(self):
        """
        Flushes the write buffer.
        """
        if self._buffer_file_size:
            self._write_counter += 1
            self.file.seek(0)
            part = self._multipart.Part(self._write_counter)
            part.upload(Body=self.file.read())

    def close(self):
        if self._is_dirty:
            self._flush_write_buffer()
            # TODO: Possibly cache the part ids as they're being uploaded
            # instead of requesting parts from server. For now, emulating
            # s3boto's behavior.
            parts = [{
                'ETag': part.e_tag,
                'PartNumber': part.part_number
            } for part in self._multipart.parts.all()]
            self._multipart.complete(MultipartUpload={'Parts': parts})
        else:
            if self._multipart is not None:
                self._multipart.abort()
        if self._file is not None:
            self._file.close()
            self._file = None
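# --- Hedged usage sketch (not from the django-storages docs): honouring the
# close() requirement called out in the docstring above. 'storage' is assumed
# to be an already-configured S3Boto3Storage instance; the key name is
# illustrative.
def write_report(storage, key='reports/latest.csv'):
    f = S3Boto3StorageFile(key, 'w', storage)
    try:
        f.write(b'col_a,col_b\n1,2\n')
    finally:
        f.close()   # required: flushes the buffer and completes the multipart upload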
Exemple #59
0
 def compress(s):
     buf = BytesIO()
     with GzipFile(mode='wb', fileobj=buf) as zfile:
         zfile.write(s)
     return buf.getvalue()
Exemple #60
0
def fetch_rcv1(*,
               data_home=None,
               subset='all',
               download_if_missing=True,
               random_state=None,
               shuffle=False,
               return_X_y=False):
    """Load the RCV1 multilabel dataset (classification).

    Download it if necessary.

    Version: RCV1-v2, vectors, full sets, topics multilabels.

    =================   =====================
    Classes                               103
    Samples total                      804414
    Dimensionality                      47236
    Features            real, between 0 and 1
    =================   =====================

    Read more in the :ref:`User Guide <rcv1_dataset>`.

    .. versionadded:: 0.17

    Parameters
    ----------
    data_home : string, optional
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    subset : string, 'train', 'test', or 'all', default='all'
        Select the dataset to load: 'train' for the training set
        (23149 samples), 'test' for the test set (781265 samples),
        'all' for both, with the training samples first if shuffle is False.
        This follows the official LYRL2004 chronological split.

    download_if_missing : boolean, default=True
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset shuffling. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    shuffle : bool, default=False
        Whether to shuffle dataset.

    return_X_y : boolean, default=False.
        If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch
        object. See below for more information about the `dataset.data` and
        `dataset.target` object.

        .. versionadded:: 0.20

    Returns
    -------
    dataset : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : scipy csr array, dtype np.float64, shape (804414, 47236)
            The array has 0.16% of non zero values.
        target : scipy csr array, dtype np.uint8, shape (804414, 103)
            Each sample has a value of 1 in its categories, and 0 in others.
            The array has 3.15% of non zero values.
        sample_id : numpy array, dtype np.uint32, shape (804414,)
            Identification number of each sample, as ordered in dataset.data.
        target_names : numpy array, dtype object, length (103)
            Names of each target (RCV1 topics), as ordered in dataset.target.
        DESCR : string
            Description of the RCV1 dataset.

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.20
    """
    N_SAMPLES = 804414
    N_FEATURES = 47236
    N_CATEGORIES = 103
    N_TRAIN = 23149

    data_home = get_data_home(data_home=data_home)
    rcv1_dir = join(data_home, "RCV1")
    if download_if_missing:
        if not exists(rcv1_dir):
            makedirs(rcv1_dir)

    samples_path = _pkl_filepath(rcv1_dir, "samples.pkl")
    sample_id_path = _pkl_filepath(rcv1_dir, "sample_id.pkl")
    sample_topics_path = _pkl_filepath(rcv1_dir, "sample_topics.pkl")
    topics_path = _pkl_filepath(rcv1_dir, "topics_names.pkl")

    # load data (X) and sample_id
    if download_if_missing and (not exists(samples_path)
                                or not exists(sample_id_path)):
        files = []
        for each in XY_METADATA:
            logger.info("Downloading %s" % each.url)
            file_path = _fetch_remote(each, dirname=rcv1_dir)
            files.append(GzipFile(filename=file_path))

        Xy = load_svmlight_files(files, n_features=N_FEATURES)

        # Training data is before testing data
        X = sp.vstack([Xy[8], Xy[0], Xy[2], Xy[4], Xy[6]]).tocsr()
        sample_id = np.hstack((Xy[9], Xy[1], Xy[3], Xy[5], Xy[7]))
        sample_id = sample_id.astype(np.uint32, copy=False)

        joblib.dump(X, samples_path, compress=9)
        joblib.dump(sample_id, sample_id_path, compress=9)

        # delete archives
        for f in files:
            f.close()
            remove(f.name)
    else:
        X = joblib.load(samples_path)
        sample_id = joblib.load(sample_id_path)

    # load target (y), categories, and sample_id_bis
    if download_if_missing and (not exists(sample_topics_path)
                                or not exists(topics_path)):
        logger.info("Downloading %s" % TOPICS_METADATA.url)
        topics_archive_path = _fetch_remote(TOPICS_METADATA, dirname=rcv1_dir)

        # parse the target file
        n_cat = -1
        n_doc = -1
        doc_previous = -1
        y = np.zeros((N_SAMPLES, N_CATEGORIES), dtype=np.uint8)
        sample_id_bis = np.zeros(N_SAMPLES, dtype=np.int32)
        category_names = {}
        with GzipFile(filename=topics_archive_path, mode='rb') as f:
            for line in f:
                line_components = line.decode("ascii").split(" ")
                if len(line_components) == 3:
                    cat, doc, _ = line_components
                    if cat not in category_names:
                        n_cat += 1
                        category_names[cat] = n_cat

                    doc = int(doc)
                    if doc != doc_previous:
                        doc_previous = doc
                        n_doc += 1
                        sample_id_bis[n_doc] = doc
                    y[n_doc, category_names[cat]] = 1

        # delete archive
        remove(topics_archive_path)

        # Samples in X are ordered with sample_id,
        # whereas in y, they are ordered with sample_id_bis.
        permutation = _find_permutation(sample_id_bis, sample_id)
        y = y[permutation, :]

        # save category names in a list, with same order than y
        categories = np.empty(N_CATEGORIES, dtype=object)
        for k in category_names.keys():
            categories[category_names[k]] = k

        # reorder categories in lexicographic order
        order = np.argsort(categories)
        categories = categories[order]
        y = sp.csr_matrix(y[:, order])

        joblib.dump(y, sample_topics_path, compress=9)
        joblib.dump(categories, topics_path, compress=9)
    else:
        y = joblib.load(sample_topics_path)
        categories = joblib.load(topics_path)

    if subset == 'all':
        pass
    elif subset == 'train':
        X = X[:N_TRAIN, :]
        y = y[:N_TRAIN, :]
        sample_id = sample_id[:N_TRAIN]
    elif subset == 'test':
        X = X[N_TRAIN:, :]
        y = y[N_TRAIN:, :]
        sample_id = sample_id[N_TRAIN:]
    else:
        raise ValueError("Unknown subset parameter. Got '%s' instead of one"
                         " of ('all', 'train', test')" % subset)

    if shuffle:
        X, y, sample_id = shuffle_(X, y, sample_id, random_state=random_state)

    module_path = dirname(__file__)
    with open(join(module_path, 'descr', 'rcv1.rst')) as rst_file:
        fdescr = rst_file.read()

    if return_X_y:
        return X, y

    return Bunch(data=X,
                 target=y,
                 sample_id=sample_id,
                 target_names=categories,
                 DESCR=fdescr)
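# --- Hedged usage sketch (not part of the scikit-learn source): exercising the
# loader above on the training split; this downloads the dataset on first use.
rcv1 = fetch_rcv1(subset='train')
print(rcv1.data.shape)          # expected (23149, 47236), sparse CSR
print(rcv1.target_names[:5])    # first five RCV1 topic codes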