def savematch(data, filename=''):
    """data must have the following format:
    dictionary from Matrix to Sequence to Index to Score"""
    # Maybe one should add a security policy for allowed filenames.
    # e.g. do not allow '/' in filename.
    if filename == '':
        a = localtime()
        filename = 'eel_' + str(a.tm_year) + '_' + str(a.tm_mon) + '_' + str(a.tm_mday) + \
                   '_' + str(a.tm_hour) + '_' + str(a.tm_min) + '.gff'
    try:
        if filename[-3:] == ".gz":
            try:
                F = GzipFile(filename, "w")
            except NameError:
                filename = filename[:-3]
                F = open(filename, 'w')
        else:
            F = open(filename, 'w')
        ## This is in wrong format Seq and Matr are reversed.
        ## for Matr in data.keys():
        ##     for Seq in data[Matr].keys():
        ##         for Pos, Strand in data[Matr][Seq].keys():
        ##             F.write("%s\teel\t%s\t%d\t%d\t%f\t%s\t.\n" % (Seq, Matr.getName(), Pos, Pos + len(Matr) - 1, data[Matr][Seq][(Pos, Strand)], Strand))
        F.write(get(data))
        F.close()
        return filename
    except IOError, (errno, strerror):
        print "I/O error(%s): %s" % (errno, strerror)
        return ''
def get_compressed_file_data(file_path, compresslevel=5):
    compressed_buffer = BytesIO()
    gzip_file = GzipFile(mode='wb', compresslevel=compresslevel,
                         fileobj=compressed_buffer)
    try:
        fileobj = open(file_path, 'rb')
        while True:
            x = fileobj.read(65536)
            if not x:
                break
            gzip_file.write(x)
            x = None
        fileobj.close()
    except IOError as e:
        LOG.error(str(e))
        return None
    gzip_file.close()
    compressed_data = compressed_buffer.getvalue()
    compressed_buffer.close()
    return compressed_data
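# Hedged counterpart sketch (not part of the original module): stream the
# gzipped bytes produced by a helper like get_compressed_file_data() back out
# to disk in 64 KiB chunks, mirroring the read loop above. Names here are
# illustrative only.
from gzip import GzipFile
from io import BytesIO

def write_decompressed_file_data(compressed_data, file_path, chunk_size=65536):
    gzip_file = GzipFile(mode='rb', fileobj=BytesIO(compressed_data))
    try:
        with open(file_path, 'wb') as out:
            while True:
                chunk = gzip_file.read(chunk_size)
                if not chunk:
                    break
                out.write(chunk)
    finally:
        gzip_file.close()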
def __init__(self, filename=None, mode=None, compresslevel=9,
             fileobj=None, **kwargs):
    """
    Return a buffered gzip file object.

    :param filename: a filesystem path
    :type filename: str
    :param mode: a file mode which can be any of 'r', 'rb', 'a', 'ab',
        'w', or 'wb'
    :type mode: str
    :param compresslevel: The compresslevel argument is an integer from 1
        to 9 controlling the level of compression; 1 is fastest and
        produces the least compression, and 9 is slowest and produces the
        most compression. The default is 9.
    :type compresslevel: int
    :param fileobj: a StringIO stream to read from instead of a file.
    :type fileobj: StringIO
    :param size: number of bytes to buffer during calls to read() and write()
    :type size: int
    :rtype: BufferedGzipFile
    """
    GzipFile.__init__(self, filename, mode, compresslevel, fileobj)
    self._size = kwargs.get("size", self.SIZE)
    self._buffer = StringIO()  # cStringIO does not support len.
    self._len = 0
def save(self, filename): """ Serialize this RingData instance to disk. :param filename: File into which this instance should be serialized. """ # Override the timestamp so that the same ring data creates # the same bytes on disk. This makes a checksum comparison a # good way to see if two rings are identical. # # This only works on Python 2.7; on 2.6, we always get the # current time in the gzip output. tempf = NamedTemporaryFile(dir=".", prefix=filename, delete=False) try: gz_file = GzipFile(filename, mode='wb', fileobj=tempf, mtime=1300507380.0) except TypeError: gz_file = GzipFile(filename, mode='wb', fileobj=tempf) self.serialize_v1(gz_file) gz_file.close() tempf.flush() os.fsync(tempf.fileno()) tempf.close() os.chmod(tempf.name, 0o644) os.rename(tempf.name, filename)
def test_content_encoding_gzip(self):
    kwargs = {'message': 'hello'}
    message = json.dumps(kwargs)

    fp = StringIO()
    f = GzipFile(fileobj=fp, mode='w')
    try:
        f.write(message)
    finally:
        f.close()

    key = self.projectkey.public_key
    secret = self.projectkey.secret_key

    with self.tasks():
        resp = self.client.post(
            self.path, fp.getvalue(),
            content_type='application/octet-stream',
            HTTP_CONTENT_ENCODING='gzip',
            HTTP_X_SENTRY_AUTH=get_auth_header('_postWithHeader', key, secret),
        )

    assert resp.status_code == 200, resp.content

    event_id = json.loads(resp.content)['id']
    instance = Event.objects.get(event_id=event_id)
    assert instance.message == 'hello'
def write(self):
    if debug:
        print 'writing to disk'
    gz = GzipFile(database, 'wb')
    dump(db, gz, -1)
    gz.close()
    Pref.writing_to_disk = False
def __init__(self, data):
    fd, fname = tempfile.mkstemp()
    gzd = GzipFile(mode='r', fileobj=StringIO(b64decode(data)))
    os.write(fd, gzd.read())
    os.close(fd)
    gzd.close()
    self.name = fname
def write_file(self, filename=None, buffer=None, fileobj=None): """Write this NBT file to a file.""" closefile = True if buffer: self.filename = None self.file = buffer closefile = False elif filename: self.filename = filename self.file = GzipFile(filename, "wb") elif fileobj: self.filename = None self.file = GzipFile(fileobj=fileobj, mode="wb") elif self.filename: self.file = GzipFile(self.filename, "wb") elif not self.file: raise ValueError( "NBTFile.write_file(): Need to specify either a " "filename or a file object" ) # Render tree to file TAG_Byte(self.id)._render_buffer(self.file) TAG_String(self.name)._render_buffer(self.file) self._render_buffer(self.file) # make sure the file is complete try: self.file.flush() except (AttributeError, IOError): pass if closefile: try: self.file.close() except (AttributeError, IOError): pass
def write_sbml_model(cobra_model, filename, use_fbc_package=True, **kwargs): if not use_fbc_package: if libsbml is None: raise Exception("libSBML required to write non-fbc models") write_sbml2(cobra_model, filename, use_fbc_package=False, **kwargs) return # create xml xml = model_to_xml(cobra_model, **kwargs) write_args = {"encoding": "UTF-8"} if _with_lxml: write_args["pretty_print"] = True else: indent_xml(xml) # write xml to file should_close = True if hasattr(filename, "write"): xmlfile = filename should_close = False elif filename.endswith(".gz"): xmlfile = GzipFile(filename, "wb") elif filename.endswith(".bz2"): xmlfile = BZ2File(filename, "wb") else: xmlfile = open(filename, "wb") ElementTree(xml).write(xmlfile, **write_args) if should_close: xmlfile.close()
def get(self):
    from gzip import GzipFile
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO

    data = self.get_data()
    data['gzipped'] = True
    json_response = self.json_response(data, finish=False)

    tmp_buffer = StringIO()
    gzipped_buffer = GzipFile(fileobj=tmp_buffer, mode="wb", compresslevel=7)
    gzipped_buffer.write(json_response)
    gzipped_buffer.close()

    gzipped_data = tmp_buffer.getvalue()
    self.set_header("Content-Encoding", 'gzip')
    self.set_header("Content-Length", str(len(gzipped_data)))
    tmp_buffer.close()
    self.finish(gzipped_data)
def parse_file(self, filename=None, buffer=None, fileobj=None): """Completely parse a file, extracting all tags.""" if filename: self.file = GzipFile(filename, 'rb') elif buffer: if hasattr(buffer, 'name'): self.filename = buffer.name self.file = buffer elif fileobj: if hasattr(fileobj, 'name'): self.filename = fileobj.name self.file = GzipFile(fileobj=fileobj) if self.file: try: type = TAG_Byte(buffer=self.file) if type.value == self.id: name = TAG_String(buffer=self.file).value self._parse_buffer(self.file) self.name = name self.file.close() else: raise MalformedFileError( "First record is not a Compound Tag") except StructError as e: raise MalformedFileError( "Partial File Parse: file possibly truncated.") else: raise ValueError( "NBTFile.parse_file(): Need to specify either a " "filename or a file object" )
def __init__(self, filename=None, mode=None, compresslevel=9, fileobj=None, **kwargs): """ @return: a buffered gzip file object @rtype: C{BufferedGzipFile} @param filename: a filesystem path @type filename: C{str} @param mode: a file mode which can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb' @type mode: C{str} @param compresslevel: The compresslevel argument is an integer from 1 to 9 controlling the level of compression; 1 is fastest and produces the least compression, and 9 is slowest and produces the most compression. The default is 9. @type compresslevel: C{int} @param fileobj: a StringIO stream to read from instead of a file. @type fileobj: C{StringIO} @kwparam size: number of bytes to buffer during calls to L{read()} and L{write()} @type size: C{int} """ GzipFile.__init__(self, filename, mode, compresslevel, fileobj) self._size = kwargs.get('size', self.SIZE) self._buffer = StringIO() # cStringIO does not support len. self._len = 0
def _compress_string(self, s):
    """Gzip a given string."""
    zbuf = StringIO()
    zfile = GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    zfile.write(s)
    zfile.close()
    return zbuf.getvalue()
def DecodeProcFile(proc_file): if len(proc_file) < 256: fd = open(proc_file) proc_file = fd.read(1024*1024) fd.close() if proc_file.find('Subsystem Id:') < 0: p = None try: from gzip import GzipFile from StringIO import StringIO s = StringIO(proc_file) gz = GzipFile(mode='r', fileobj=s) p = gz.read(1024*1024) gz.close() except: pass if p is None: try: from bz2 import decompress p = decompress(proc_file) except: pass if not p is None: proc_file = p return proc_file
def open(self): request = Request(self.url) request.add_header('User-Agent','lastfm-lda recommender v.0.0.-1') request.add_header('Accept-encoding', 'gzip') while True: URLLoadListener.num_connections+=1 response = None try: response = urlopen(request,timeout=10) if response.info().get('Content-Encoding') == 'gzip': f = GzipFile(fileobj=StringIO(response.read())) result = f.read() f.close() else: result = response.read() break except Exception, e: if self.retries>2: if isinstance(e, BadStatusLine): raise Exception("last.fm server does not respond (%s)" % e) raise e self.retries+=1 print self.url print "failed with", e print "retry #",self.retries print finally:
class CompressingRequestWrapper(_makeBase()): """ A request wrapper with support for transport encoding compression. @ivar underlying: the request being wrapped. @type underlying: L{IRequest} @ivar encoding: the IANA-assigned name of the encoding. @type encoding: C{str} @ivar compressLevel: the level of gzip compression to apply. @type compressLevel: C{int} """ implements(IRequest) encoding = 'gzip' compressLevel = 6 def __init__(self, underlying): self.underlying = underlying self.setHeader('content-encoding', self.encoding) self._gzipFile = None # See setHeader docstring for more commentary. self.underlying.headers.pop('content-length', None) def setHeader(self, name, value): """ Discard the Content-Length header. When compression encoding is in use, the Content-Length header must indicate the length of the compressed content; since we are doing the compression on the fly, we don't actually know what the length is after compression, so we discard this header. If this is an HTTP/1.1 request, chunked transfer encoding should be used, softening the impact of losing this header. """ if name.lower() == 'content-length': return else: return self.underlying.setHeader(name, value) def write(self, data): """ Pass data through to the gzip layer. """ if self._gzipFile is None: self._gzipFile = GzipFile(fileobj=self.underlying, mode='wb', compresslevel=self.compressLevel) self._gzipFile.write(data) def finishRequest(self, success): """ Finish off gzip stream. """ if self._gzipFile is None: self.write('') self._gzipFile.close() self.underlying.finishRequest(success)
def start_file(fname, id_=None): """Opens a fif file for writing and writes the compulsory header tags Parameters ---------- fname : string | fid The name of the file to open. It is recommended that the name ends with .fif or .fif.gz. Can also be an already opened file. id_ : dict | None ID to use for the FIFF_FILE_ID. """ if isinstance(fname, string_types): if op.splitext(fname)[1].lower() == '.gz': logger.debug('Writing using gzip') # defaults to compression level 9, which is barely smaller but much # slower. 2 offers a good compromise. fid = GzipFile(fname, "wb", compresslevel=2) else: logger.debug('Writing using normal I/O') fid = open(fname, "wb") else: logger.debug('Writing using %s I/O' % type(fname)) fid = fname fid.seek(0) # Write the compulsory items write_id(fid, FIFF.FIFF_FILE_ID, id_) write_int(fid, FIFF.FIFF_DIR_POINTER, -1) write_int(fid, FIFF.FIFF_FREE_LIST, -1) return fid
def _uncachedgenerate(self): """ Generates the Gzipped sitemap uncached data """ len_brains = len(self._catalogbrains()) if self.index is None: # no index specified in the url if len_brains < self.maxlen: # ok, we have few items, let's generate the standard sitemap xml = self.template() else: # a lot of items, let's generate a sitemap index xml = self.indextemplate() elif int(self.index)*self.maxlen >= len_brains: # bad index specified raise NotFound(self.context, '%s-%s' % (self.index, self.filename), self.request) else: # index specified in the url xml = self.template() if self.index is not None: filename = "%s-%s" % (self.index, self.filename) else: filename = self.filename fp = StringIO() gzip = GzipFile(filename, 'w', 9, fp) gzip.write(xml) gzip.close() data = fp.getvalue() fp.close() return data
def __init__(self, filename=None, buffer=None, fileobj=None): super(NBTFile, self).__init__() self.filename = filename self.type = TAG_Byte(self.id) closefile = True #make a file object if filename: self.file = GzipFile(filename, 'rb') elif buffer: if hasattr(buffer, 'name'): self.filename = buffer.name self.file = buffer closefile = False elif fileobj: if hasattr(fileobj, 'name'): self.filename = fileobj.name self.file = GzipFile(fileobj=fileobj) else: self.file = None closefile = False #parse the file given initially if self.file: self.parse_file() if closefile: # Note: GzipFile().close() does NOT close the fileobj, # So the caller is still responsible for closing that. try: self.file.close() except (AttributeError, IOError): pass self.file = None
def handle_stackexchange_login(self, data): self.send_response(200) self.send_header("Content-type", "text/html") self.log_message(self.path) self.end_headers() c = Client(StackExchange, get_config()) cred = c.flow.authorization_received(data) d = c.request("/me", body=urlencode({ "site": "stackoverflow" })) self.wfile.write("<!DOCTYPE html>") self.wfile.write("<head><meta charset=\"utf-8\"/></head><body>") self.wfile.write("Access token: %s<br>" % cred.access_token) self.wfile.write("Type: %s<br>" % cred.token_type) self.wfile.write("Expires in: %d<br>" % cred.expires_in) # stackexchange gzips all data h = StringIO(d) gzip_data = GzipFile(fileobj=h) d = gzip_data.read() gzip_data.close() self.wfile.write(d) self.wfile.write("</body></html>")
def save(self, filename, mtime=1300507380.0):
    """
    Serialize this RingData instance to disk.

    :param filename: File into which this instance should be serialized.
    :param mtime: time used to override mtime for gzip, default or None
                  if the caller wants to include time
    """
    # Override the timestamp so that the same ring data creates
    # the same bytes on disk. This makes a checksum comparison a
    # good way to see if two rings are identical.
    #
    # This only works on Python 2.7; on 2.6, we always get the
    # current time in the gzip output.
    tempf = NamedTemporaryFile(dir=".", prefix=filename, delete=False)
    if 'mtime' in inspect.getargspec(GzipFile.__init__).args:
        gz_file = GzipFile(filename, mode='wb', fileobj=tempf, mtime=mtime)
    else:
        gz_file = GzipFile(filename, mode='wb', fileobj=tempf)
    self.serialize_v1(gz_file)
    gz_file.close()
    tempf.flush()
    os.fsync(tempf.fileno())
    tempf.close()
    os.chmod(tempf.name, 0o644)
    os.rename(tempf.name, filename)
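# Illustrative sketch (not from the original module) of why save() pins the
# gzip mtime above: with a fixed mtime and no filename recorded in the header,
# identical payloads always compress to identical bytes, so a checksum
# comparison can tell whether two rings are the same.
from gzip import GzipFile
from io import BytesIO

def _gzip_bytes(payload, mtime=1300507380.0):
    buf = BytesIO()
    gz = GzipFile(mode='wb', fileobj=buf, mtime=mtime)
    gz.write(payload)
    gz.close()
    return buf.getvalue()

assert _gzip_bytes(b"ring data") == _gzip_bytes(b"ring data")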
def gzip_media(self, filedata):
    """gzip encodes a given stream of data."""
    gzip_data = StringIO()
    gzf = GzipFile(fileobj=gzip_data, mode="wb")
    gzf.write(filedata)
    gzf.close()
    return gzip_data.getvalue()
def run_analogy_space_lang(lang): # Open files (fail early on errors) tensor_name = tensor_filename(lang) tensor_name_new = tensor_name+'_new' tensor_file = GzipFile(tensor_name_new, 'wb') svd_name = svd_filename(lang) svd_name_new = svd_name + '_new' # Load matrix logging.info('Loading %s'% lang) cnet_2d = conceptnet_2d_from_db(lang, identities=IDENTITIES, cutoff=CUTOFF) logging.info('Normalize %r' % cnet_2d) cnet_2d = cnet_2d.normalized() # Save tensor logging.info('Save tensor as %s' % tensor_name) pickle.dump(cnet_2d, tensor_file, -1) tensor_file.close() os.rename(tensor_name_new, tensor_name) logging.info('Running SVD') svd = cnet_2d.svd(k=100) # Save SVD logging.info('Save as %s' % svd_name) svd.save_pytables(svd_name_new) os.rename(svd_name_new, svd_name)
def main(argv): args = argv[1:] or ["-"] class TitleExtractor(MWXMLDumpParser): def start_revision(self, pageid, title, revid, timestamp): print(pageid, title) return for path in args: if path == "-": fp = sys.stdin elif path.endswith(".gz"): from gzip import GzipFile fp = GzipFile(path) elif path.endswith(".bz2"): from bz2 import BZ2File fp = BZ2File(path) else: fp = open(path) parser = TitleExtractor() parser.feed_file(fp) fp.close() parser.close() return 0
def __init__(self, fileobj, name=None, onclose=None, mapped=True, gzip=False): if gzip: fileobj = GzipFile(fileobj=fileobj) self.file = fileobj self._name = name self.onclose = onclose self.is_closed = False for attr in ("read", "readline", "write", "tell", "seek", "truncate"): if hasattr(fileobj, attr): setattr(self, attr, getattr(fileobj, attr)) # If mapped is True, set the 'map' attribute to a memory-mapped # representation of the file. Otherwise, the fake 'map' that set up by # the base class will be used. if not gzip and mapped and hasattr(fileobj, "mode") and "r" in fileobj.mode: fd = fileobj.fileno() self.size = os.fstat(fd).st_size if self.size > 0: import mmap try: self.map = mmap.mmap(fd, self.size, access=mmap.ACCESS_READ) except OSError: self._setup_fake_map() else: self._setup_fake_map() self.is_real = not gzip and hasattr(fileobj, "fileno")
def LoadGuide(): if Prefs['xmltv'].startswith('http://') or Prefs['xmltv'].startswith('https://'): # Plex can't handle compressed files, using standart Python methods instead if Prefs['xmltv'].endswith('.gz') or Prefs['xmltv'].endswith('.gz?raw=1'): f = BytesIO(urlopen(Prefs['xmltv']).read()) try: g = GzipFile(fileobj = f) xmltv = g.read() except: Log.Error('Provided file %s is not a valid GZIP file' % Prefs['xmltv']) xmltv = None else: xmltv = HTTP.Request(Prefs['xmltv']).content else: # Local compressed files are not supported at the moment xmltv = Resource.Load(Prefs['xmltv'], binary = True) if xmltv != None: try: root = xml.etree.ElementTree.fromstring(xmltv) except: Log.Error('Provided file %s is not a valid XML file' % Prefs['xmltv']) root = None if root != None: count = 0 for programme in root.findall("./programme"): channel = programme.get('channel') start = datetime_from_utc_to_local(programme.get('start')) stop = datetime_from_utc_to_local(programme.get('stop')) title = programme.find('title').text count = count + 1 item = {'start': start, 'stop': stop, 'title': title, 'order': count} GUIDE.setdefault(channel, {})[count] = item return None
def decode_content(self, data):
    if web.ctx.env.get('HTTP_CONTENT_ENCODING') == 'gzip':
        ib = StringIO(data)
        zf = GzipFile(fileobj=ib)
        return zf.read()
    else:
        return data
def build_index_gzip(self): """creates sorted index from gzip-compressed queue. caches object regardless of caccheobj flag. """ self.index = [] zf = GzipFile(fileobj=self.map, mode="rb") while 1: p = zf.tell() # just for diagnosis use try: l = zf.readline() except IOError as ex: # probably CRC error due to truncated file. discard the rest. logging.error("error in %s at %d: %s", self.fn, p, str(ex)) break if not l: break if l[0] != " ": continue try: o = cjson.decode(l[1:]) except Exception as ex: logging.warn("skipping malformed JSON at %s:%d: %s", self.fn, p, l[1:]) continue key = o.get("id") if key is None: try: key = self.urikey(o) except UnicodeEncodeError: pass if key is None: logging.error("urikey->None for %s", str(o)) continue self.index.append((key, o)) zf.close()
def testPostMethodDeCompressesDeflatedBody_gzip(self): self.requestData = None def handler(**kwargs): self.requestData = kwargs reactor = Reactor() server = HttpServer(reactor, self.port, handler, timeout=0.01) server.listen() sok = socket() sok.connect(('localhost', self.port)) bodyData = 'bodydatabodydata' _sio = StringIO() _gzFileObj = GzipFile(filename=None, mode='wb', compresslevel=6, fileobj=_sio) _gzFileObj.write(bodyData); _gzFileObj.close() compressedBodyData = _sio.getvalue() bodyDataCompressed = compress(bodyData) contentLengthCompressed = len(bodyDataCompressed) sok.send(('POST / HTTP/1.0\r\nContent-Type: application/x-www-form-urlencoded\r\nContent-Length: %d\r\nContent-Encoding: gzip\r\n\r\n' % contentLengthCompressed) + bodyDataCompressed) while not self.requestData: reactor.step() self.assertEquals(dict, type(self.requestData)) self.assertTrue('Headers' in self.requestData) headers = self.requestData['Headers'] self.assertEquals('POST', self.requestData['Method']) self.assertEquals('application/x-www-form-urlencoded', headers['Content-Type']) self.assertEquals(contentLengthCompressed, int(headers['Content-Length'])) self.assertTrue('Body' in self.requestData) self.assertEquals('bodydatabodydata', self.requestData['Body'])
class NBTFile(TAG_Compound): """Represents an NBT file object""" def __init__(self, filename=None, mode=None, buffer=None): super(NBTFile,self).__init__() self.__class__.__name__ = "TAG_Compound" if filename: self.file = GzipFile(filename, mode) self.parse_file(self.file) def parse_file(self, file=None): if not file: file = self.file if file: self.type = TAG_Byte(buffer=file) if self.type.value == self.id: name = TAG_String(buffer=file) self._parse_buffer(file) self.name = name self.file.close() else: raise ValueError("First record is not a Compound Tag") def write_file(self, filename=None, file=None): if file: self.file = file elif filename: self.file = GzipFile(filename, "wb") else: raise ValueError("Need to specify either a filename or a file") #Render tree to file self.type._render_buffer(file) self.name._render_buffer(file) self._render_buffer(file)
def test_process_response_gzipped_gzip_file(self):
    """Test that a gzip Content-Encoded .gz file is gunzipped
    only once by the middleware, leaving gunzipping of the file
    to upper layers.
    """
    headers = {
        'Content-Type': 'application/gzip',
        'Content-Encoding': 'gzip',
    }
    # build a gzipped file (here, a sitemap)
    f = BytesIO()
    plainbody = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  <url>
    <loc>http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc>http://www.example.com/Special-Offers.html</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.8</priority>
  </url>
</urlset>"""
    gz_file = GzipFile(fileobj=f, mode='wb')
    gz_file.write(plainbody)
    gz_file.close()

    # build a gzipped response body containing this gzipped file
    r = BytesIO()
    gz_resp = GzipFile(fileobj=r, mode='wb')
    gz_resp.write(f.getvalue())
    gz_resp.close()

    response = Response("http://www.example.com/", headers=headers,
                        body=r.getvalue())
    request = Request("http://www.example.com/")

    newresponse = self.mw.process_response(request, response, self.spider)
    self.assertEqual(gunzip(newresponse.body), plainbody)
class OGDClient(object): HTTPError = HTTPError BadRequestError = BadRequestError UnauthorizedError = UnauthorizedError ForbiddenError = ForbiddenError NotFoundError = NotFoundError NonRetryableHTTPError = NonRetryableHTTPError def __init__(self): self._json = None self.data = b"" @staticmethod def is_logged_in(): # non-empty ogd_auth means we are logged in (probably, the # authorization can in theory have been invalidated on the server return bool(app.settings["database_auth"]) def login_task(self, username, password): return LoginTask(self, username, password) def logout_task(self, auth_token): return LogoutTask(self, auth_token) @retry def auth(self, username, password, device_id, device_name): result = self.post("/api/auth", { "username": username, "password": password, "device_id": device_id, "device_name": device_name }, auth=False) return result @retry def deauth(self, auth_token): result = self.post("/api/deauth", {"auth_token": auth_token}, auth=False) return result @staticmethod def url_prefix(): return openretro_url_prefix() def opener(self): username, password = self.credentials() # FIXME: use cache dict? return opener_for_url_prefix(self.url_prefix(), username, password) @staticmethod def credentials(): auth_token = app.settings["database_auth"] return "auth_token", auth_token def post(self, path, params=None, data=None, auth=True): headers = {} if auth: credentials = self.credentials() headers[str("Authorization")] = str( "Basic " + base64.b64encode("{0}:{1}".format( *credentials).encode("UTF-8")).decode("UTF-8")) connection = openretro_http_connection() url = "{0}{1}".format(openretro_url_prefix(), path) # if params: # url += "?" + urlencode(params) if not data and params: data = urlencode(params) headers[str("Content-Type")] = \ str("application/x-www-form-urlencoded") print(url, headers) if isinstance(data, dict): data = json.dumps(data) # print(data) connection.request(str("POST"), str(url), data, headers=headers) response = connection.getresponse() if response.status not in [200]: print(response.status, response.reason) if response.status == 400: class_ = BadRequestError elif response.status == 401: class_ = UnauthorizedError elif response.status == 403: class_ = ForbiddenError elif response.status == 404: class_ = NotFoundError else: class_ = HTTPError raise class_(url, response.status, response.reason, response.getheaders(), None) data = response.read() if len(data) > 0 and data[0:1] == b"{": doc = json.loads(data.decode("UTF-8")) return doc return data def build_url(self, path, **kwargs): url = "{0}{1}".format(self.url_prefix(), path) if kwargs: url += "?" 
+ urlencode(kwargs) return url def get_request(self, url): request = Request(url) print("get_request:", url) request.add_header("Accept-Encoding", "gzip") response = self.opener().open(request) return self.handle_response(response) def handle_response(self, response): self._json = None self.data = response.read() # print(dir(response.headers)) try: getheader = response.headers.getheader except AttributeError: getheader = response.getheader content_encoding = getheader("content-encoding", "").lower() if content_encoding == "gzip": # data = zlib.decompress(data) fake_stream = StringIO(self.data) self.data = GzipFile(fileobj=fake_stream).read() def json_response(self): if self._json is None: self._json = json.loads(self.data.decode("UTF-8")) return self._json def rate_variant(self, variant_uuid, like=None, work=None): params = { "game": variant_uuid, } if like is not None: params["like"] = like if work is not None: params["work"] = work url = self.build_url("/api/1/rate_game", **params) self.get_request(url) return self.json_response()
class HTTPConnection: def __init__(self, handler, connection): self.handler = handler self.connection = connection self.buf = '' self.closed = False self.done = False self.donereading = False self.next_func = self.read_type def get_ip(self): return self.connection.get_ip() def data_came_in(self, data): if self.donereading or self.next_func is None: return True self.buf += data while 1: try: i = self.buf.index('\n') except ValueError: return True val = self.buf[:i] self.buf = self.buf[i + 1:] self.next_func = self.next_func(val) if self.donereading: return True if self.next_func is None or self.closed: return False def read_type(self, data): self.header = data.strip() words = data.split() if len(words) == 3: self.command, self.path, garbage = words self.pre1 = False elif len(words) == 2: self.command, self.path = words self.pre1 = True if self.command != 'GET': return None else: return None if self.command not in ('HEAD', 'GET'): return None self.headers = {} return self.read_header def read_header(self, data): data = data.strip() if data == '': self.donereading = True if self.headers.get('accept-encoding', '').find('gzip') > -1: self.encoding = 'gzip' else: self.encoding = 'identity' r = self.handler.getfunc(self, self.path, self.headers) if r is not None: self.answer(r) return None try: i = data.index(':') except ValueError: return None self.headers[data[:i].strip().lower()] = data[i + 1:].strip() if DEBUG: print data[:i].strip() + ": " + data[i + 1:].strip() return self.read_header def answer(self, (responsecode, responsestring, headers, data)): if self.closed: return if self.encoding == 'gzip': compressed = StringIO() gz = GzipFile(fileobj=compressed, mode='wb', compresslevel=9) gz.write(data) gz.close() cdata = compressed.getvalue() if len(cdata) >= len(data): self.encoding = 'identity' else: if DEBUG: print "Compressed: %i Uncompressed: %i\n" % (len(cdata), len(data)) data = cdata headers['Content-Encoding'] = 'gzip' # i'm abusing the identd field here, but this should be ok if self.encoding == 'identity': ident = '-' else: ident = self.encoding self.handler.log(self.connection.get_ip(), ident, '-', self.header, responsecode, len(data), self.headers.get('referer', '-'), self.headers.get('user-agent', '-')) self.done = True r = StringIO() r.write('HTTP/1.0 ' + str(responsecode) + ' ' + responsestring + '\r\n') if not self.pre1: headers['Content-Length'] = len(data) for key, value in headers.items(): r.write(key + ': ' + str(value) + '\r\n') r.write('\r\n') if self.command != 'HEAD': r.write(data) self.connection.write(r.getvalue()) if self.connection.is_flushed(): self.connection.shutdown(1)
def tofile(f, obj):
    out = GzipFile(f, 'wb')
    out.write(dumps(obj))
    out.close()
def fromfile(f):
    infile = GzipFile(f)
    result = loads(infile.read())
    infile.close()
    return result
def openMaybeGzip(file, mode):
    f = open(file, mode)
    if re.search('\.gz$', file):
        f = GzipFile(fileobj=f)
    return f
def fetch_covtype( *, data_home=None, download_if_missing=True, random_state=None, shuffle=False, return_X_y=False, as_frame=False, ): """Load the covertype dataset (classification). Download it if necessary. ================= ============ Classes 7 Samples total 581012 Dimensionality 54 Features int ================= ============ Read more in the :ref:`User Guide <covtype_dataset>`. Parameters ---------- data_home : str, default=None Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing : bool, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. random_state : int, RandomState instance or None, default=None Determines random number generation for dataset shuffling. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. shuffle : bool, default=False Whether to shuffle dataset. return_X_y : bool, default=False If True, returns ``(data.data, data.target)`` instead of a Bunch object. .. versionadded:: 0.20 as_frame : bool, default=False If True, the data is a pandas DataFrame including columns with appropriate dtypes (numeric). The target is a pandas DataFrame or Series depending on the number of target columns. If `return_X_y` is True, then (`data`, `target`) will be pandas DataFrames or Series as described below. .. versionadded:: 0.24 Returns ------- dataset : :class:`~sklearn.utils.Bunch` Dictionary-like object, with the following attributes. data : ndarray of shape (581012, 54) Each row corresponds to the 54 features in the dataset. target : ndarray of shape (581012,) Each value corresponds to one of the 7 forest covertypes with values ranging between 1 to 7. frame : dataframe of shape (581012, 55) Only present when `as_frame=True`. Contains `data` and `target`. DESCR : str Description of the forest covertype dataset. feature_names : list The names of the dataset columns. target_names: list The names of the target columns. (data, target) : tuple if ``return_X_y`` is True A tuple of two ndarray. The first containing a 2D array of shape (n_samples, n_features) with each row representing one sample and each column representing the features. The second ndarray of shape (n_samples,) containing the target samples. .. versionadded:: 0.20 """ data_home = get_data_home(data_home=data_home) covtype_dir = join(data_home, "covertype") samples_path = _pkl_filepath(covtype_dir, "samples") targets_path = _pkl_filepath(covtype_dir, "targets") available = exists(samples_path) and exists(targets_path) if download_if_missing and not available: os.makedirs(covtype_dir, exist_ok=True) # Creating temp_dir as a direct subdirectory of the target directory # guarantees that both reside on the same filesystem, so that we can use # os.rename to atomically move the data files to their target location. 
with TemporaryDirectory(dir=covtype_dir) as temp_dir: logger.info(f"Downloading {ARCHIVE.url}") archive_path = _fetch_remote(ARCHIVE, dirname=temp_dir) Xy = np.genfromtxt(GzipFile(filename=archive_path), delimiter=",") X = Xy[:, :-1] y = Xy[:, -1].astype(np.int32, copy=False) samples_tmp_path = _pkl_filepath(temp_dir, "samples") joblib.dump(X, samples_tmp_path, compress=9) os.rename(samples_tmp_path, samples_path) targets_tmp_path = _pkl_filepath(temp_dir, "targets") joblib.dump(y, targets_tmp_path, compress=9) os.rename(targets_tmp_path, targets_path) elif not available and not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") try: X, y except NameError: X = joblib.load(samples_path) y = joblib.load(targets_path) if shuffle: ind = np.arange(X.shape[0]) rng = check_random_state(random_state) rng.shuffle(ind) X = X[ind] y = y[ind] fdescr = load_descr("covtype.rst") frame = None if as_frame: frame, X, y = _convert_data_dataframe( caller_name="fetch_covtype", data=X, target=y, feature_names=FEATURE_NAMES, target_names=TARGET_NAMES, ) if return_X_y: return X, y return Bunch( data=X, target=y, frame=frame, target_names=TARGET_NAMES, feature_names=FEATURE_NAMES, DESCR=fdescr, )
class S3BotoStorageFile(File): """ The default file object used by the S3BotoStorage backend. This file implements file streaming using boto's multipart uploading functionality. The file can be opened in read or write mode. This class extends Django's File class. However, the contained data is only the data contained in the current buffer. So you should not access the contained file object directly. You should access the data via this class. Warning: This file *must* be closed using the close() method in order to properly write the file to S3. Be sure to close the file in your application. """ # TODO: Read/Write (rw) mode may be a bit undefined at the moment. Needs testing. # TODO: When Django drops support for Python 2.5, rewrite to use the # BufferedIO streams in the Python 2.6 io module. buffer_size = setting('AWS_S3_FILE_BUFFER_SIZE', 5242880) def __init__(self, name, mode, storage, buffer_size=None): self._storage = storage self.name = name[len(self._storage.location):].lstrip('/') self._mode = mode self.key = storage.bucket.get_key(self._storage._encode_name(name)) if not self.key and 'w' in mode: self.key = storage.bucket.new_key(storage._encode_name(name)) self._is_dirty = False self._file = None self._multipart = None # 5 MB is the minimum part size (if there is more than one part). # Amazon allows up to 10,000 parts. The default supports uploads # up to roughly 50 GB. Increase the part size to accommodate # for files larger than this. if buffer_size is not None: self.buffer_size = buffer_size self._write_counter = 0 @property def size(self): return self.key.size def _get_file(self): if self._file is None: self._file = SpooledTemporaryFile( max_size=self._storage.max_memory_size, suffix=".S3BotoStorageFile", dir=setting("FILE_UPLOAD_TEMP_DIR", None)) if 'r' in self._mode: self._is_dirty = False self.key.get_contents_to_file(self._file) self._file.seek(0) if self._storage.gzip and self.key.content_encoding == 'gzip': self._file = GzipFile(mode=self._mode, fileobj=self._file) return self._file def _set_file(self, value): self._file = value file = property(_get_file, _set_file) def read(self, *args, **kwargs): if 'r' not in self._mode: raise AttributeError("File was not opened in read mode.") return super(S3BotoStorageFile, self).read(*args, **kwargs) def write(self, content, *args, **kwargs): if 'w' not in self._mode: raise AttributeError("File was not opened in write mode.") self._is_dirty = True if self._multipart is None: provider = self.key.bucket.connection.provider upload_headers = {provider.acl_header: self._storage.default_acl} upload_headers.update({ 'Content-Type': mimetypes.guess_type(self.key.name)[0] or self._storage.key_class.DefaultContentType }) upload_headers.update(self._storage.headers) self._multipart = self._storage.bucket.initiate_multipart_upload( self.key.name, headers=upload_headers, reduced_redundancy=self._storage.reduced_redundancy) if self.buffer_size <= self._buffer_file_size: self._flush_write_buffer() return super(S3BotoStorageFile, self).write(force_bytes(content), *args, **kwargs) @property def _buffer_file_size(self): pos = self.file.tell() self.file.seek(0, os.SEEK_END) length = self.file.tell() self.file.seek(pos) return length def _flush_write_buffer(self): """ Flushes the write buffer. 
""" if self._buffer_file_size: self._write_counter += 1 self.file.seek(0) headers = self._storage.headers.copy() self._multipart.upload_part_from_file(self.file, self._write_counter, headers=headers) self.file.close() self._file = None def close(self): if self._is_dirty: self._flush_write_buffer() self._multipart.complete_upload() else: if not self._multipart is None: self._multipart.cancel_upload() self.key.close()
def GetSinaUSStockList(page, last_update, mysql): #http://stock.finance.sina.com.cn/usstock/api/jsonp.php/IO.XSRV2.CallbackList%5B%27fa8Vo3U4TzVRdsLs%27%5D/US_CategoryService.getList?page=1&num=20&sort=&asc=0&market=&id= url = 'http://stock.finance.sina.com.cn/usstock/api/jsonp.php/IO.XSRV2.CallbackList%%5B%%27fa8Vo3U4TzVRdsLs%%27%%5D/US_CategoryService.getList?page=%d&num=60&sort=&asc=0&market=&id=' % page data = '' user_agent = 'Mozilla/5.0 (Windows NT 5.1; rv:12.0) Gecko/20100101 Firefox/12.0' headers = { 'User-Agent': user_agent, 'Host': 'stock.finance.sina.com.cn', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-us,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', #'Connection': 'keep-alive', 'Referer': 'http://finance.sina.com.cn/stock/usstock/sector.shtml', } req = urllib2.Request(url, data, headers) resp = urllib2.urlopen(req) old_resp = resp if resp.headers.get("content-encoding") == "gzip": gz = GzipFile(fileobj=StringIO(resp.read()), mode="r") #resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url) resp.msg = old_resp.msg #json_html = gz.read() #print 'xxx' # deflate if resp.headers.get("content-encoding") == "deflate": gz = StringIO(deflate(resp.read())) #resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) # 'class to add info() and resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url) # 'class to add info() and resp.msg = old_resp.msg #json_html = gz.read() #print 'YY' json_html = resp.read() #print json_html # json_html = re.sub(r'([a-zA-Z_]+):', r'"\1":', json_html) #json_html = re.sub(r'(McDonald\')', r"McDonald'", json_html) #json_html = json_html.replace("McDonald\\'s", "McDonald_s") #json_html = json_html.replace("O\\'Reilly", "O_Reilly") json_html = json_html.replace("\\'", "_") j_start = json_html.find("IO.XSRV2.CallbackList['fa8Vo3U4TzVRdsLs']((") j_start = j_start + len("IO.XSRV2.CallbackList['fa8Vo3U4TzVRdsLs']((") j_end = json_html.rfind("));") #print j_end #print j_start your_string = json_html[j_start:j_end] your_string = your_string.decode('gbk') #data = json.loads(json_html[j_start:j_end]) #yaml.load('[{id:"1",category:"basic materials"}]') #your_string = re.sub(r'([a-zA-Z_]+):', r'"\1":', your_string) #print your_string #yesterday = datetime.date.today() - datetime.timedelta(1) #print yesterday s_trade_day = last_update.strftime("%Y-%m-%d") print s_trade_day #exit(0) try: json_obj = json.loads(your_string) print json_obj['count'] items = json_obj['data'] for item in items: print item['symbol'] sql = "SELECT * FROM `stock_symbols` WHERE `symbol` = '%s' " % item[ 'symbol'].encode('utf-8') result = mysql.query(sql) if result: #print "item['symbol']:%s in table" % item['symbol'] result = mysql.query(sql) sid = result[0][0] #print sid else: sql = """INSERT INTO `stock_symbols` (`symbol`, `cname`, `fname`, `brief`, `ipodate`, `52weeklow`, `52weekhigh`, `lastpriceopen`, `lastpriceclose`, `lastpricehigh`, `lastpricelow`, `change`, `changepc`, `volumeoftoday`, `marketvalue`, `PE`, `industry`, `exchange`) VALUES ('%s', '%s', '%s', '', '0000-00-00', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', 0, '0.0000', '0.0000', 0, 0);""" % ( item['symbol'].encode('utf-8').strip(), item['name'].encode('utf-8').strip(), item['cname'].encode('utf-8').strip()) #print sql result = mysql.query(sql) #print "INSERT", mysql.lastrowid() sid = mysql.lastrowid() try: if item['pe'] is None: pe = 0 
else: pe = item['pe'].encode('utf-8').strip() sql = "INSERT INTO `smartstrader`.`trade_daily_history` (`sid`, `preclose`, `openprice`, `closeprice`, `highprice`, `lowprice`, `volume`, `mktcap`, `pe`, `tradeday`) VALUES ('%d', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % ( sid, item['preclose'].encode('utf-8').strip(), item['open'].encode('utf-8').strip(), item['price'].encode('utf-8').strip(), item['high'].encode('utf-8').strip(), item['low'].encode('utf-8').strip(), item['volume'].encode('utf-8').strip(), item['mktcap'].encode('utf-8').strip(), pe, s_trade_day) except AttributeError: print item raise #print sql try: result = mysql.query(sql) except MySQLdb.IntegrityError, error: print error #print json_obj[0] #print len(json_obj) except JSONDecodeError, e: print your_string print e raise
def _fetch_brute_kddcup99(data_home=None, download_if_missing=True, percent10=True): """Load the kddcup99 dataset, downloading it if necessary. Parameters ---------- data_home : string, optional Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing : boolean, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. percent10 : bool, default=True Whether to load only 10 percent of the data. Returns ------- dataset : dict-like object with the following attributes: dataset.data : numpy array of shape (494021, 41) Each row corresponds to the 41 features in the dataset. dataset.target : numpy array of shape (494021,) Each value corresponds to one of the 21 attack types or to the label 'normal.'. dataset.DESCR : string Description of the kddcup99 dataset. """ data_home = get_data_home(data_home=data_home) if sys.version_info[0] == 3: # The zlib compression format use by joblib is not compatible when # switching from Python 2 to Python 3, let us use a separate folder # under Python 3: dir_suffix = "-py3" else: # Backward compat for Python 2 users dir_suffix = "" if percent10: kddcup_dir = join(data_home, "kddcup99_10" + dir_suffix) archive = ARCHIVE_10_PERCENT else: kddcup_dir = join(data_home, "kddcup99" + dir_suffix) archive = ARCHIVE samples_path = join(kddcup_dir, "samples") targets_path = join(kddcup_dir, "targets") available = exists(samples_path) if download_if_missing and not available: _mkdirp(kddcup_dir) logger.info("Downloading %s" % archive.url) _fetch_remote(archive, dirname=kddcup_dir) dt = [('duration', int), ('protocol_type', 'S4'), ('service', 'S11'), ('flag', 'S6'), ('src_bytes', int), ('dst_bytes', int), ('land', int), ('wrong_fragment', int), ('urgent', int), ('hot', int), ('num_failed_logins', int), ('logged_in', int), ('num_compromised', int), ('root_shell', int), ('su_attempted', int), ('num_root', int), ('num_file_creations', int), ('num_shells', int), ('num_access_files', int), ('num_outbound_cmds', int), ('is_host_login', int), ('is_guest_login', int), ('count', int), ('srv_count', int), ('serror_rate', float), ('srv_serror_rate', float), ('rerror_rate', float), ('srv_rerror_rate', float), ('same_srv_rate', float), ('diff_srv_rate', float), ('srv_diff_host_rate', float), ('dst_host_count', int), ('dst_host_srv_count', int), ('dst_host_same_srv_rate', float), ('dst_host_diff_srv_rate', float), ('dst_host_same_src_port_rate', float), ('dst_host_srv_diff_host_rate', float), ('dst_host_serror_rate', float), ('dst_host_srv_serror_rate', float), ('dst_host_rerror_rate', float), ('dst_host_srv_rerror_rate', float), ('labels', 'S16')] DT = np.dtype(dt) logger.debug("extracting archive") archive_path = join(kddcup_dir, archive.filename) file_ = GzipFile(filename=archive_path, mode='r') Xy = [] for line in file_.readlines(): if six.PY3: line = line.decode() Xy.append(line.replace('\n', '').split(',')) file_.close() logger.debug('extraction done') os.remove(archive_path) Xy = np.asarray(Xy, dtype=object) for j in range(42): Xy[:, j] = Xy[:, j].astype(DT[j]) X = Xy[:, :-1] y = Xy[:, -1] # XXX bug when compress!=0: # (error: 'Incorrect data length while decompressing[...] 
the file # could be corrupted.') joblib.dump(X, samples_path, compress=0) joblib.dump(y, targets_path, compress=0) elif not available: if not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") try: X, y except NameError: X = joblib.load(samples_path) y = joblib.load(targets_path) return Bunch(data=X, target=y, DESCR=__doc__)
class NBTFile(TAG_Compound): """Represent an NBT file object.""" def __init__(self, filename=None, buffer=None, fileobj=None): """ Create a new NBTFile object. Specify either a filename, file object or data buffer. If filename of file object is specified, data should be GZip-compressed. If a data buffer is specified, it is assumed to be uncompressed. If filename is specified, the file is closed after reading and writing. If file object is specified, the caller is responsible for closing the file. """ super(NBTFile, self).__init__() self.filename = filename self.type = TAG_Byte(self.id) closefile = True #make a file object if filename: self.filename = filename self.file = GzipFile(filename, 'rb') elif buffer: if hasattr(buffer, 'name'): self.filename = buffer.name self.file = buffer closefile = False elif fileobj: if hasattr(fileobj, 'name'): self.filename = fileobj.name self.file = GzipFile(fileobj=fileobj) else: self.file = None closefile = False #parse the file given initially if self.file: self.parse_file() if closefile: # Note: GzipFile().close() does NOT close the fileobj, # So we are still responsible for closing that. try: self.file.close() except (AttributeError, IOError): pass self.file = None def parse_file(self, filename=None, buffer=None, fileobj=None): """Completely parse a file, extracting all tags.""" if filename: self.file = GzipFile(filename, 'rb') elif buffer: if hasattr(buffer, 'name'): self.filename = buffer.name self.file = buffer elif fileobj: if hasattr(fileobj, 'name'): self.filename = fileobj.name self.file = GzipFile(fileobj=fileobj) if self.file: try: type = TAG_Byte(buffer=self.file) if type.value == self.id: name = TAG_String(buffer=self.file).value self._parse_buffer(self.file) self.name = name self.file.close() else: raise MalformedFileError("First record is not a Compound Tag") except StructError as e: raise MalformedFileError("Partial File Parse: file possibly truncated.") else: raise ValueError("NBTFile.parse_file(): Need to specify either a filename or a file object") def write_file(self, filename=None, buffer=None, fileobj=None): """Write this NBT file to a file.""" closefile = True if buffer: self.filename = None self.file = buffer closefile = False elif filename: self.filename = filename self.file = GzipFile(filename, "wb") elif fileobj: self.filename = None self.file = GzipFile(fileobj=fileobj, mode="wb") elif self.filename: self.file = GzipFile(self.filename, "wb") elif not self.file: raise ValueError("NBTFile.write_file(): Need to specify either a filename or a file object") #Render tree to file TAG_Byte(self.id)._render_buffer(self.file) TAG_String(self.name)._render_buffer(self.file) self._render_buffer(self.file) #make sure the file is complete try: self.file.flush() except (AttributeError, IOError): pass if closefile: try: self.file.close() except (AttributeError, IOError): pass def __repr__(self): """ Return a string (ascii formated for Python 2, unicode for Python 3) describing the class, name and id for debugging purposes. """ if self.filename: return "<%s(%r) with %s(%r) at 0x%x>" % (self.__class__.__name__, self.filename, \ TAG_Compound.__name__, self.name, id(self)) else: return "<%s with %s(%r) at 0x%x>" % (self.__class__.__name__, \ TAG_Compound.__name__, self.name, id(self))
def GetSinaUSStockCategory(mysql): url = 'http://stock.finance.sina.com.cn/usstock/api/jsonp.php/var%20category=/US_CategoryService.getCategory' data = '' user_agent = 'Mozilla/5.0 (Windows NT 5.1; rv:12.0) Gecko/20100101 Firefox/12.0' headers = { 'User-Agent': user_agent, 'Host': 'stock.finance.sina.com.cn', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-us,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', #'Connection': 'keep-alive', 'Referer': 'http://finance.sina.com.cn/stock/usstock/sector.shtml', } req = urllib2.Request(url, data, headers) resp = urllib2.urlopen(req) old_resp = resp if resp.headers.get("content-encoding") == "gzip": gz = GzipFile(fileobj=StringIO(resp.read()), mode="r") #resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url) resp.msg = old_resp.msg #json_html = gz.read() #print 'xxx' # deflate if resp.headers.get("content-encoding") == "deflate": gz = StringIO(deflate(resp.read())) #resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) # 'class to add info() and resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url) # 'class to add info() and resp.msg = old_resp.msg #json_html = gz.read() #print 'YY' json_html = resp.read() # j_start = json_html.find("var category=(") j_start = j_start + len("var category=(") j_end = json_html.rfind(");") #print j_end #print j_start #print json_html[j_start:j_end] #data = json.loads(json_html[j_start:j_end]) #yaml.load('[{id:"1",category:"basic materials"}]') your_string = re.sub(r'([a-zA-Z_]+):', r'"\1":', json_html[j_start:j_end]) #print your_string json_obj = json.loads(your_string.decode('gbk')) #print json_obj[0] #print len(json_obj) for item in json_obj: print item['id'] print item['category'] print item['category_cn'] print item['parent'] #print item['child'] for citem in item['child']: print '\t', citem['id'] print '\t', citem['category'] print '\t', citem['category_cn'] print '\t', citem['parent'] print "####"
def checkin(self, timeout=11): """Gather system status.""" # Compile checkin data checkin_start = time.time() status = { 'usage': events.usage(), 'uptime': system_utilities.uptime(), 'system_utilization': self.utilization_tracker.get_data(), } # Append status if we can try: #get the software versions status['versions'] = bts.get_versions() except BSSError as e: logger.error("bts get_versions error: %s" % e) try: # Gather camped subscriber list status['camped_subscribers'] = bts.active_subscribers() except BSSError as e: logger.error("bts get active_subscribers error: %s" % e) # Gather tower load and noise data. # NOTE(matt): these values can vary quite a bit over a minute. It # might be worth capturing data more frequently and sending # something like average or median values. status['openbts_load'] = {} try: status['openbts_load'] = bts.get_load() except BSSError as e: logger.error("bts get_load error: %s" % e) for key, val in self._checkin_load_stats.items(): status['openbts_load']['checkin.' + key] = val self._checkin_load_stats.clear() try: status['openbts_noise'] = bts.get_noise() except BSSError as e: logger.error("bts get_noise error: %s" % e) status['radio'] = {} try: status['radio']['band'] = bts.get_band() # eventually need to also grab all used channels, not just c0 # TODO: (kheimerl) T13270338 Add multiband support status['radio']['c0'] = bts.get_arfcn_c0() #also add power here eventually # TODO: (kheimerl) T13270365 Add power level support except BSSError as e: #delete the key if this failed del status['radio'] logger.error("bts radio error: %s" % e) # Add balance sync data status['subscribers'] = subscriber.get_subscriber_states( imsis=events.EventStore().modified_subs()) # Add delta protocol context (if available) to let server know, # client supports delta optimization & has a prior delta state if delta.DeltaProtocol.CTX_KEY not in status: # just a precaution sections_ctx = {} for section, ctx in CheckinHandler.section_ctx.items(): if ctx: sections_ctx[section] = ctx.to_proto_dict() if sections_ctx: status[delta.DeltaProtocol.CTX_KEY] = { delta.DeltaProtocolOptimizer.SECTIONS_CTX_KEY: sections_ctx } # Send checkin request. uuid = snowflake.snowflake() data = { 'status': status, 'bts_uuid': uuid, } headers = dict(self.auth_header) # Set content type to app/json & utf-8, compressed or not - JSON should # be more efficient then URL encoded JSON form payload headers['Content-Type'] = 'application/json; charset=utf-8' data_json = json.dumps(data) decompressed_status_len = len(data_json) status_len = decompressed_status_len if status_len > endaga_ic.MIN_COMPRESSIBLE_REQUEST_SZ: # try to gzip payload, send uncompressed if compression failed try: gzbuf = BytesIO() with GzipFile(mode='wb', fileobj=gzbuf) as gzfile: gzfile.write(data_json) data_json = gzbuf.getvalue() # Using Content-Encoding header since AWS cannot handle # Transfer-Encoding header which would be more appropriate here headers['Content-Encoding'] = 'gzip' status_len = len(data_json) # set len to reflect compression except BaseException as e: logger.error("Checkin request Gzip error: %s" % e) headers['Content-Length'] = str(status_len) post_start = time.time() try: r = self.session.post( self.conf['registry'] + "/checkin?id=" + # add part of uuid to the query, it helps with # debugging & server side logging and can # be used by LBs uuid[:8], headers=headers, data=data_json, timeout=timeout, cookies=self._session_cookies) except BaseException as e: logger.error("Endaga: checkin failed , network error: %s." 
% e) self._cleanup_session() self._checkin_load_stats['req_sz'] = status_len self._checkin_load_stats['raw_req_sz'] = decompressed_status_len self._checkin_load_stats['post_lat'] = time.time() - post_start raise post_end = time.time() # Make sure either server sent charset or we set it to utf-8 (JSON # default) if not r.encoding: r.encoding = 'utf-8' text = r.text decompressed_response_len = len(text) response_len = decompressed_response_len # Try to get correct content length from HTTP headers, it should # reflect correctly compressed length. if it fails - fall back to # getting length of returned text cont_len = r.headers.get('Content-Length') if cont_len: try: response_len = int(cont_len) except BaseException: pass if r.status_code == 200: try: CheckinHandler(text) logger.info("Endaga: checkin success.") if r.cookies is not None: if self._session_cookies is None: # First time cookies are seen from server # initialize the cookies dict self._session_cookies = dict(r.cookies) else: for key, value in r.cookies.items(): # if server sent new/updated cookies, update them, # but keep previously set cokies as well. ELBs # do not send AWSELB cookies on every request & # expect clients to 'remember' them self._session_cookies[key] = value except BaseException: self._cleanup_session() raise else: logger.error("Endaga: checkin failed (%d), reason: %s, body: %s" % (r.status_code, r.reason, r.text)) # cleanup session on any error if r.status_code >= 300: self._cleanup_session() checkin_end = time.time() self._checkin_load_stats['req_sz'] = status_len # request payload SZ self._checkin_load_stats['raw_req_sz'] = decompressed_status_len self._checkin_load_stats[ 'rsp_sz'] = response_len # response payload SZ self._checkin_load_stats['raw_rsp_sz'] = decompressed_response_len # Checkin Latencies self._checkin_load_stats['post_lat'] = post_end - post_start self._checkin_load_stats['process_lat'] = checkin_end - post_end self._checkin_load_stats['lat'] = checkin_end - checkin_start data['response'] = {'status': r.status_code, 'text': r.text} return data
def compress_string(s):
    zbuf = BytesIO()
    with GzipFile(mode='wb', compresslevel=6, fileobj=zbuf, mtime=0) as zfile:
        zfile.write(s)
    return zbuf.getvalue()
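# Hypothetical inverse of compress_string() above (not part of the original
# source): wrap the gzipped bytes in a BytesIO and read them back through
# GzipFile.
from gzip import GzipFile
from io import BytesIO

def decompress_string(gz_bytes):
    with GzipFile(mode='rb', fileobj=BytesIO(gz_bytes)) as zfile:
        return zfile.read()

assert decompress_string(compress_string(b"hello")) == b"hello"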
def _write_compressed(self, fileobj):
    with GzipFile(fileobj=fileobj, mode='w') as gz_f:
        gz_f.writelines(self.data)
def _ungzip(self, data):
    """ Un-gzip some data. """
    s = StringIO(data)
    return GzipFile(fileobj=s, mode='rb').read()
def __exit__(self, exc_type, exc_value, traceback):
    if hasattr(GzipFile, '__exit__'):
        return GzipFile.__exit__(self, exc_type, exc_value, traceback)
    else:
        return self.close()
def add_file(): tags = request.forms.get('tag_list') uploads = request.files.getlist('file') # Set Project project = request.forms.get('project') if project in project_list(): __project__.open(project) else: __project__.open('../') project = 'Main' db = Database() file_list = [] # Write temp file to disk with upload_temp() as temp_dir: for upload in uploads: file_path = os.path.join(temp_dir, upload.filename) with open(file_path, 'w') as tmp_file: tmp_file.write(upload.file.read()) # Zip Files if request.forms.get('compression') == 'zip': zip_pass = request.forms.get('zip_pass') try: with ZipFile(file_path) as zf: zf.extractall(temp_dir, pwd=zip_pass) for root, dirs, files in os.walk(temp_dir, topdown=False): for name in files: if not name == upload.filename: file_list.append(os.path.join(root, name)) except Exception as e: return template('error.tpl', error="Error with zipfile - {0}".format(e)) # GZip Files elif request.forms.get('compression') == 'gz': try: gzf = GzipFile(file_path, 'rb') decompress = gzf.read() gzf.close() with open(file_path[:-3], "wb") as df: df.write(decompress) file_list.append(file_path[:-3]) except Exception as e: return template( 'error.tpl', error="Error with gzipfile - {0}".format(e)) # BZip2 Files elif request.forms.get('compression') == 'bz2': try: bz2f = BZ2File(file_path, 'rb') decompress = bz2f.read() bz2f.close() with open(file_path[:-3], "wb") as df: df.write(decompress) file_list.append(file_path[:-3]) except Exception as e: return template( 'error.tpl', error="Error with bzip2file - {0}".format(e)) # Tar Files (any, including tar.gz tar.bz2) elif request.forms.get('compression') == 'tar': try: if not tarfile.is_tarfile(file_path): return template('error.tpl', error="This is not a tar file") with tarfile.open(file_path, 'r:*') as tarf: tarf.extractall(temp_dir) for root, dirs, files in os.walk(temp_dir, topdown=False): for name in files: if not name == upload.filename: file_list.append(os.path.join(root, name)) except Exception as e: return template('error.tpl', error="Error with tarfile - {0}".format(e)) # Non zip files elif request.forms.get('compression') == 'none': file_list.append(file_path) # Add each file for new_file in file_list: print new_file obj = File(new_file) new_path = store_sample(obj) success = True if new_path: # Add file to the database. success = db.add(obj=obj, tags=tags) if not success: return template( 'error.tpl', error="Unable to Store The File: {0}".format( upload.filename)) redirect("/project/{0}".format(project))
def __init__(self, file):
    data = GzipFile(fileobj=file, mode="rb").read()
    self.size = len(data)
    self.name = file.name
    super(GzipChunk, self).__init__(data)
def __enter__(self):
    if hasattr(GzipFile, '__enter__'):
        return GzipFile.__enter__(self)
    else:
        return self
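This __enter__ shim, together with the matching __exit__ shim shown earlier, exists because GzipFile only gained context-manager support in Python 2.7/3.2. A sketch of how such a wrapper might be assembled and used; the class name ContextGzipFile is hypothetical:

from gzip import GzipFile

class ContextGzipFile(GzipFile):
    # Hypothetical wrapper combining the __enter__/__exit__ shims shown above.
    def __enter__(self):
        if hasattr(GzipFile, '__enter__'):
            return GzipFile.__enter__(self)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if hasattr(GzipFile, '__exit__'):
            return GzipFile.__exit__(self, exc_type, exc_value, traceback)
        return self.close()

# usage sketch: works in a "with" block even where GzipFile itself does not
with ContextGzipFile('example.txt.gz', 'wb') as f:
    f.write(b'compressed via the compatibility wrapper')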
def __enter__(self):
    file_handle = GzipFile(fileobj=open(self.file_name, 'wb'), mode='wb')
    self.set_file_handle(file_handle)
    self.add_file_to_registry()
    return self
def _update(self, version): from poetry.utils.helpers import temporary_directory platform = sys.platform if platform == "linux2": platform = "linux" checksum = "poetry-{}-{}.sha256sum".format(version, platform) try: r = urlopen(self.BASE_URL + "/{}/{}".format(version, checksum)) except HTTPError as e: if e.code == 404: raise RuntimeError("Could not find {} file".format(checksum)) raise checksum = r.read().decode() # We get the payload from the remote host name = "poetry-{}-{}.tar.gz".format(version, platform) try: r = urlopen(self.BASE_URL + "/{}/{}".format(version, name)) except HTTPError as e: if e.code == 404: raise RuntimeError("Could not find {} file".format(name)) raise meta = r.info() size = int(meta["Content-Length"]) current = 0 block_size = 8192 bar = self.progress_bar(max=size) bar.set_format( " - Downloading <info>{}</> <comment>%percent%%</>".format(name)) bar.start() sha = hashlib.sha256() with temporary_directory(prefix="poetry-updater-") as dir_: tar = os.path.join(dir_, name) with open(tar, "wb") as f: while True: buffer = r.read(block_size) if not buffer: break current += len(buffer) f.write(buffer) sha.update(buffer) bar.set_progress(current) bar.finish() # Checking hashes if checksum != sha.hexdigest(): raise RuntimeError( "Hashes for {} do not match: {} != {}".format( name, checksum, sha.hexdigest())) gz = GzipFile(tar, mode="rb") try: with tarfile.TarFile(tar, fileobj=gz, format=tarfile.PAX_FORMAT) as f: f.extractall(str(self.lib)) finally: gz.close()
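The updater above hashes each downloaded block as it is written, so the archive never needs a second pass for verification. A hypothetical standalone version of that verify-while-downloading pattern (the function name and signature are illustrative, not part of Poetry's API):

import hashlib
from urllib.request import urlopen

def download_and_verify(url, dest, expected_sha256, block_size=8192):
    # Hypothetical helper: stream the download to disk while updating a SHA-256 digest.
    sha = hashlib.sha256()
    with urlopen(url) as r, open(dest, 'wb') as f:
        while True:
            chunk = r.read(block_size)
            if not chunk:
                break
            f.write(chunk)
            sha.update(chunk)
    if sha.hexdigest() != expected_sha256:
        raise RuntimeError('checksum mismatch for %s' % dest)
    return dest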
except:
    old = ''

files = []
for dirs in sys.argv[1:]:
    files += [dirs + '/' + x for x in os.listdir(dirs)]
files = set(files) - (knownfiles)

output = codecs.open('kv7planning.idx', 'w', 'UTF-8')
output.write(old)

for filename in sorted(files):
    localservicelevelcodes = set([])
    for line in GzipFile(filename, 'r'):
        if line[0] == '\\':
            if dumping:
                dumping = False
            if table == 'LOCALSERVICEGROUPPASSTIME' and line[1] == 'L':
                dumping = True
            elif line[1] == 'T':
                table = line[2:].split('|')[0]
        else:
            if dumping:
                line = line.decode('UTF-8').split('|')
                localservicelevelcodes.add(line[0] + "|" + line[1])
def eval(): files = glob(data + "/" + "*.zip") files.sort() print len(files), "found" for fileName in files[:]: print fileName # s_time = time.time() smp = pp.GestureSample(fileName) # print "loading", (time.time()-s_time)/1000.,"ms" # s_time = time.time() n = smp.data['numFrames'] dv, uv, gv = smp.depth, smp.user, smp.rgb cur_fr = 1 # new_shape = (step,128,128) s = [] d, u, g = [empty((n_f, ) + vid_res + (3, ), "uint8") for _ in range(3)] # take first n_f frames for v in dv, uv, gv: pp.go_to_frame(v, cur_fr) for i, fr in enumerate(range(cur_fr, cur_fr + n_f)): s.append(smp.getSkeleton(fr)) d[i], u[i], g[i] = [v.read()[1] for v in dv, uv, gv] d, u, g = [pp.to_grayscale(v) for v in d, u, g] u[u < 128], u[u >= 128] = 0, 1 depth, user, gray, skelet = d, u, g, s user_o = user.copy() depth_o = depth.copy() gray_o = gray.copy() # user_depth = depth_o[user_o==1] skelet, c = pp.proc_skelet(array(skelet).copy()) user = pp.proc_user(user) _, depth, c = pp.proc_depth(depth.copy(), user.copy(), user_o, array(skelet).copy()) gray, c = pp.proc_gray( gray.copy(), user, array(skelet).copy()) #user.copy!!!!!!!!!!!!!!!!!!! cur_fr += n_f predictions = [] while cur_fr + step < n: # time_start = time.time() sn = [] dn, un, gn = [ empty((step, ) + vid_res + (3, ), "uint8") for _ in range(3) ] # for v in dv,uv,gv: pp.go_to_frame(v, cur_fr) for i, fr in enumerate(range(cur_fr, cur_fr + step)): sn.append(smp.getSkeleton(fr)) dn[i], un[i], gn[i] = [v.read()[1] for v in dv, uv, gv] dn, un, gn = [pp.to_grayscale(v) for v in dn, un, gn] un[un < 128], un[un >= 128] = 0, 1 s = s[step:] + sn # s.extend(sn) skelet, c = pp.proc_skelet(s, _3D=False) # len_dump = len(depth_o[:step][user_o[:step]==1]) # un_d = dn[un==1] user_o[:-step] = user_o[step:] user_o[-step:] = un.copy() un = pp.proc_user(un, 3) user[:-step] = user[step:] user[-step:] = un.copy() depth_o[:-step] = depth_o[step:] depth_o[-step:] = dn.copy() gray_o[:-step] = gray_o[step:] gray_o[-step:] = gn.copy() _, depth, c = pp.proc_depth(depth_o.copy(), user.copy(), user_o, skelet) gray, c = pp.proc_gray(gray_o.copy(), user, skelet) traj2D, traj3D, ori, pheight, hand, center = skelet video = empty(( 1, 2, ) + gray.shape, dtype="uint8") video[0, 0] = gray.copy() video[0, 1] = depth.copy() video = video.swapaxes(1, 2) #(body-hand,gray-depth,fr,h,w) v_new = empty((1, 2, 2) + vid_shape, dtype="uint8") # p = pheight ratio = 0.25 for i in xrange(video.shape[0]): #batch if pheight < 10: pheight = 100 scale = ratio #+randi(2)/100. ofs = pheight * scale mid = video.shape[-1] / 2. sli = None if ofs < mid: start = int(round(mid - ofs)) end = int(round(mid + ofs)) sli = slice(start, end) for j in xrange(video.shape[2]): #maps for k in xrange(video.shape[3]): #frames #body img = video[i, 0, j, k] img = cut_img(img, 5) img = misc.imresize(img, (h, h)) # if j==0: img = 255-misc.imfilter(img,"contour") v_new[i, 0, j, k] = img #hand img = video[i, 1, j, k] img = img[sli, sli] img = misc.imresize(img, (h, h)) v_new[i, 1, j, k] = img # print "put" # pred_loop(v_new,cur_fr,n, fileName) x_.set_value(v_new.astype("float32"), borrow=True) pred = evalu_model()[0][0] predictions.append(pred) cur_fr += step predictions = array(predictions, float32) pred_file_name = fileName.split('/') pred_file_name = pred_file_name[-1].replace(".zip", "_prediction.zip") file = GzipFile(dst + "/" + pred_file_name, 'wb') dump(predictions, file, -1) file.close()
input_end_time = time()
input_length = len(body)
if verbose:
    logger.info("Read '%s' (%d bytes)" % (input_name, input_length))
logger.debug("File Response Headers: %s" % (str(headers), ))

output_name = input_name.replace('/', '_')
output_length = input_length

# To gzip or not gzip, that is the question
if not input_name.endswith('.gz') and gzip_data:
    output = StringIO.StringIO()
    output_name += ".gz"
    with GzipFile(filename=output_name, mode='w', fileobj=output) as of:
        of.write(body)
    body = output.getvalue()
    output_length = len(body)
    logger.debug("Gzipped Body is now %d bytes" % (output_length, ))

output_hdfs_file = hdfs_output_dir + '/' + output_name
if verbose:
    logger.info("Writing %s (%d bytes)" % (output_name, output_length))

(tmp_fd, tmp_filename) = tempfile.mkstemp()
try:
    with open(tmp_filename, "wb") as fp:
        fp.write(body)
def compress_string(s):
    zbuf = BytesIO()
    zfile = GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    zfile.write(s)
    zfile.close()
    return zbuf.getvalue()
def _put_file(self, name, content):
    name = self._path(name)
    placeholder = False
    if self.cache:
        if not self.cache.exists(name):
            self.cache.save(name, 0, 0)
            placeholder = True
    content_type = mimetypes.guess_type(name)[0] or "application/x-octet-stream"
    headers = {}
    for pattern in self.headers:
        if pattern[0].match(name):
            headers = pattern[1].copy()
            break
    file_pos = content.tell()
    content.seek(0, 2)
    content_length = content.tell()
    content.seek(0)
    gz_cts = getattr(
        settings,
        'CUDDLYBUDDLY_STORAGE_S3_GZIP_CONTENT_TYPES',
        (
            'text/css',
            'application/javascript',
            'application/x-javascript'
        )
    )
    gz_content = None
    if content_length > 1024 and content_type in gz_cts:
        gz_content = StringIO()
        gzf = GzipFile(mode='wb', fileobj=gz_content)
        gzf.write(content.read())
        content.seek(0)
        gzf.close()
        gz_content.seek(0, 2)
        gz_content_length = gz_content.tell()
        gz_content.seek(0)
        if gz_content_length < content_length:
            content_length = gz_content_length
            headers.update({'Content-Encoding': 'gzip'})
        else:
            gz_content = None
    headers.update({
        'Content-Type': content_type,
        'Content-Length': str(content_length)
    })
    # Httplib in < 2.6 doesn't accept file like objects. Meanwhile in
    # >= 2.7 it will try to join a content str object with the headers which
    # results in encoding problems.
    if sys.version_info[0] == 2 and sys.version_info[1] < 6:
        content_to_send = gz_content.read() if gz_content is not None else content.read()
    else:
        content_to_send = gz_content if gz_content is not None else content
    response = self.connection.put(self.bucket, name, content_to_send, headers)
    content.seek(file_pos)
    if response.http_response.status != 200:
        if placeholder:
            self.cache.remove(name)
        raise S3Error(response.message)
    if self.cache:
        date = response.http_response.getheader('Date')
        date = timegm(parsedate(date))
        self.cache.save(name, size=content_length, mtime=date)
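Note the design choice above: the gzipped body is kept only when it is actually smaller than the original, and only then is the Content-Encoding header added. A minimal sketch of that decision in isolation, with a hypothetical helper name and Python 3 types:

from gzip import GzipFile
from io import BytesIO

def maybe_gzip(data):
    # Hypothetical helper: compress, but keep the result only if it is actually
    # smaller; otherwise return the original bytes and no extra headers.
    buf = BytesIO()
    with GzipFile(mode='wb', fileobj=buf) as gz:
        gz.write(data)
    compressed = buf.getvalue()
    if len(compressed) < len(data):
        return compressed, {'Content-Encoding': 'gzip'}
    return data, {}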
class S3Boto3StorageFile(File): """ The default file object used by the S3Boto3Storage backend. This file implements file streaming using boto's multipart uploading functionality. The file can be opened in read or write mode. This class extends Django's File class. However, the contained data is only the data contained in the current buffer. So you should not access the contained file object directly. You should access the data via this class. Warning: This file *must* be closed using the close() method in order to properly write the file to S3. Be sure to close the file in your application. """ # TODO: Read/Write (rw) mode may be a bit undefined at the moment. Needs testing. # TODO: When Django drops support for Python 2.5, rewrite to use the # BufferedIO streams in the Python 2.6 io module. buffer_size = setting('AWS_S3_FILE_BUFFER_SIZE', 5242880) def __init__(self, name, mode, storage, buffer_size=None): self._storage = storage self.name = name[len(self._storage.location):].lstrip('/') self._mode = mode self.obj = storage.bucket.Object(storage._encode_name(name)) if 'w' not in mode: # Force early RAII-style exception if object does not exist self.obj.load() self._is_dirty = False self._file = None self._multipart = None # 5 MB is the minimum part size (if there is more than one part). # Amazon allows up to 10,000 parts. The default supports uploads # up to roughly 50 GB. Increase the part size to accommodate # for files larger than this. if buffer_size is not None: self.buffer_size = buffer_size self._write_counter = 0 @property def size(self): return self.obj.content_length def _get_file(self): if self._file is None: self._file = SpooledTemporaryFile( max_size=self._storage.max_memory_size, suffix=".S3Boto3StorageFile", dir=setting("FILE_UPLOAD_TEMP_DIR", None)) if 'r' in self._mode: self._is_dirty = False self._file.write(self.obj.get()['Body'].read()) self._file.seek(0) if self._storage.gzip and self.obj.content_encoding == 'gzip': self._file = GzipFile(mode=self._mode, fileobj=self._file, mtime=0.0) return self._file def _set_file(self, value): self._file = value file = property(_get_file, _set_file) def read(self, *args, **kwargs): if 'r' not in self._mode: raise AttributeError("File was not opened in read mode.") return super(S3Boto3StorageFile, self).read(*args, **kwargs) def write(self, content): if 'w' not in self._mode: raise AttributeError("File was not opened in write mode.") self._is_dirty = True if self._multipart is None: parameters = self._storage.object_parameters.copy() parameters['ACL'] = self._storage.default_acl parameters['ContentType'] = (mimetypes.guess_type(self.obj.key)[0] or self._storage.default_content_type) if self._storage.reduced_redundancy: parameters['StorageClass'] = 'REDUCED_REDUNDANCY' if self._storage.encryption: parameters['ServerSideEncryption'] = 'AES256' self._multipart = self.obj.initiate_multipart_upload(**parameters) if self.buffer_size <= self._buffer_file_size: self._flush_write_buffer() return super(S3Boto3StorageFile, self).write(force_bytes(content)) @property def _buffer_file_size(self): pos = self.file.tell() self.file.seek(0, os.SEEK_END) length = self.file.tell() self.file.seek(pos) return length def _flush_write_buffer(self): """ Flushes the write buffer. 
""" if self._buffer_file_size: self._write_counter += 1 self.file.seek(0) part = self._multipart.Part(self._write_counter) part.upload(Body=self.file.read()) def close(self): if self._is_dirty: self._flush_write_buffer() # TODO: Possibly cache the part ids as they're being uploaded # instead of requesting parts from server. For now, emulating # s3boto's behavior. parts = [{ 'ETag': part.e_tag, 'PartNumber': part.part_number } for part in self._multipart.parts.all()] self._multipart.complete(MultipartUpload={'Parts': parts}) else: if self._multipart is not None: self._multipart.abort() if self._file is not None: self._file.close() self._file = None
def compress(s):
    buf = BytesIO()
    with GzipFile(mode='wb', fileobj=buf) as zfile:
        zfile.write(s)
    return buf.getvalue()
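For symmetry, a decompression counterpart is straightforward to sketch (not part of the original snippet); on Python 3 it is equivalent to gzip.decompress:

from gzip import GzipFile
from io import BytesIO

def decompress(blob):
    # Wrap the compressed bytes in BytesIO and read them back through GzipFile.
    with GzipFile(mode='rb', fileobj=BytesIO(blob)) as zfile:
        return zfile.read()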
def fetch_rcv1(*, data_home=None, subset='all', download_if_missing=True, random_state=None, shuffle=False, return_X_y=False): """Load the RCV1 multilabel dataset (classification). Download it if necessary. Version: RCV1-v2, vectors, full sets, topics multilabels. ================= ===================== Classes 103 Samples total 804414 Dimensionality 47236 Features real, between 0 and 1 ================= ===================== Read more in the :ref:`User Guide <rcv1_dataset>`. .. versionadded:: 0.17 Parameters ---------- data_home : string, optional Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. subset : string, 'train', 'test', or 'all', default='all' Select the dataset to load: 'train' for the training set (23149 samples), 'test' for the test set (781265 samples), 'all' for both, with the training samples first if shuffle is False. This follows the official LYRL2004 chronological split. download_if_missing : boolean, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. random_state : int, RandomState instance, default=None Determines random number generation for dataset shuffling. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. shuffle : bool, default=False Whether to shuffle dataset. return_X_y : boolean, default=False. If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch object. See below for more information about the `dataset.data` and `dataset.target` object. .. versionadded:: 0.20 Returns ------- dataset : :class:`~sklearn.utils.Bunch` Dictionary-like object, with the following attributes. data : scipy csr array, dtype np.float64, shape (804414, 47236) The array has 0.16% of non zero values. target : scipy csr array, dtype np.uint8, shape (804414, 103) Each sample has a value of 1 in its categories, and 0 in others. The array has 3.15% of non zero values. sample_id : numpy array, dtype np.uint32, shape (804414,) Identification number of each sample, as ordered in dataset.data. target_names : numpy array, dtype object, length (103) Names of each target (RCV1 topics), as ordered in dataset.target. DESCR : string Description of the RCV1 dataset. (data, target) : tuple if ``return_X_y`` is True .. 
versionadded:: 0.20 """ N_SAMPLES = 804414 N_FEATURES = 47236 N_CATEGORIES = 103 N_TRAIN = 23149 data_home = get_data_home(data_home=data_home) rcv1_dir = join(data_home, "RCV1") if download_if_missing: if not exists(rcv1_dir): makedirs(rcv1_dir) samples_path = _pkl_filepath(rcv1_dir, "samples.pkl") sample_id_path = _pkl_filepath(rcv1_dir, "sample_id.pkl") sample_topics_path = _pkl_filepath(rcv1_dir, "sample_topics.pkl") topics_path = _pkl_filepath(rcv1_dir, "topics_names.pkl") # load data (X) and sample_id if download_if_missing and (not exists(samples_path) or not exists(sample_id_path)): files = [] for each in XY_METADATA: logger.info("Downloading %s" % each.url) file_path = _fetch_remote(each, dirname=rcv1_dir) files.append(GzipFile(filename=file_path)) Xy = load_svmlight_files(files, n_features=N_FEATURES) # Training data is before testing data X = sp.vstack([Xy[8], Xy[0], Xy[2], Xy[4], Xy[6]]).tocsr() sample_id = np.hstack((Xy[9], Xy[1], Xy[3], Xy[5], Xy[7])) sample_id = sample_id.astype(np.uint32, copy=False) joblib.dump(X, samples_path, compress=9) joblib.dump(sample_id, sample_id_path, compress=9) # delete archives for f in files: f.close() remove(f.name) else: X = joblib.load(samples_path) sample_id = joblib.load(sample_id_path) # load target (y), categories, and sample_id_bis if download_if_missing and (not exists(sample_topics_path) or not exists(topics_path)): logger.info("Downloading %s" % TOPICS_METADATA.url) topics_archive_path = _fetch_remote(TOPICS_METADATA, dirname=rcv1_dir) # parse the target file n_cat = -1 n_doc = -1 doc_previous = -1 y = np.zeros((N_SAMPLES, N_CATEGORIES), dtype=np.uint8) sample_id_bis = np.zeros(N_SAMPLES, dtype=np.int32) category_names = {} with GzipFile(filename=topics_archive_path, mode='rb') as f: for line in f: line_components = line.decode("ascii").split(" ") if len(line_components) == 3: cat, doc, _ = line_components if cat not in category_names: n_cat += 1 category_names[cat] = n_cat doc = int(doc) if doc != doc_previous: doc_previous = doc n_doc += 1 sample_id_bis[n_doc] = doc y[n_doc, category_names[cat]] = 1 # delete archive remove(topics_archive_path) # Samples in X are ordered with sample_id, # whereas in y, they are ordered with sample_id_bis. permutation = _find_permutation(sample_id_bis, sample_id) y = y[permutation, :] # save category names in a list, with same order than y categories = np.empty(N_CATEGORIES, dtype=object) for k in category_names.keys(): categories[category_names[k]] = k # reorder categories in lexicographic order order = np.argsort(categories) categories = categories[order] y = sp.csr_matrix(y[:, order]) joblib.dump(y, sample_topics_path, compress=9) joblib.dump(categories, topics_path, compress=9) else: y = joblib.load(sample_topics_path) categories = joblib.load(topics_path) if subset == 'all': pass elif subset == 'train': X = X[:N_TRAIN, :] y = y[:N_TRAIN, :] sample_id = sample_id[:N_TRAIN] elif subset == 'test': X = X[N_TRAIN:, :] y = y[N_TRAIN:, :] sample_id = sample_id[N_TRAIN:] else: raise ValueError("Unknown subset parameter. Got '%s' instead of one" " of ('all', 'train', test')" % subset) if shuffle: X, y, sample_id = shuffle_(X, y, sample_id, random_state=random_state) module_path = dirname(__file__) with open(join(module_path, 'descr', 'rcv1.rst')) as rst_file: fdescr = rst_file.read() if return_X_y: return X, y return Bunch(data=X, target=y, sample_id=sample_id, target_names=categories, DESCR=fdescr)
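A short usage sketch consistent with the docstring above; the first call downloads the dataset and caches it under data_home:

from sklearn.datasets import fetch_rcv1

rcv1 = fetch_rcv1(subset='train')   # 23149 training samples (LYRL2004 split)
print(rcv1.data.shape)              # (23149, 47236), sparse CSR, float64
print(rcv1.target.shape)            # (23149, 103), multilabel indicator matrix
print(rcv1.target_names[:5])        # first few RCV1 topic codes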