def _read_raw_athena(filename):
    """Try to read an Athena project file as plain text, to determine validity.

    First attempts to read *filename* as gzip, then falls back to plain
    text.  Returns the decoded text, or None if both attempts fail.
    """
    # try gzip first
    text = None
    fh = None
    try:
        fh = GzipFile(filename)
        text = bytes2str(fh.read())
    except Exception:
        text = None
    finally:
        # BUG FIX: the original unconditionally called fh.close() in the
        # finally clause, which raised NameError when GzipFile() itself
        # failed and fh was never bound.  Guard the close instead.
        if fh is not None:
            fh.close()
    if text is None:
        # try plain text file
        fh = None
        try:
            fh = open(filename, 'r')
            text = bytes2str(fh.read())
        except Exception:
            text = None
        finally:
            if fh is not None:
                fh.close()
    return text
def generate_log_stream_from_file(path):
    """Yield a sequence of (header, data) tuples from a named gzip file.

    Each frame is a packed fixed-size prefix (protocol version, header
    size, data size) followed by the header and data payloads.  The
    generator ends cleanly on a short read of the frame prefix (EOF).

    :raises LogStreamError: on a protocol-version mismatch or a
        truncated header/data payload.
    """
    input_gzip_file = GzipFile(filename=path)
    while True:
        packed_frame = input_gzip_file.read(_frame_size)
        if len(packed_frame) < _frame_size:
            # BUG FIX: raising StopIteration inside a generator is
            # converted to RuntimeError by PEP 479 (Python 3.7+);
            # a plain return terminates the generator correctly.
            return
        protocol_version, header_size, data_size = struct.unpack(
            _frame_format, packed_frame)
        if protocol_version != _frame_protocol_version:
            raise LogStreamError("Invalid protocol {0} expected {1}".format(
                protocol_version, _frame_protocol_version))
        header = input_gzip_file.read(header_size)
        if len(header) != header_size:
            raise LogStreamError("Invalid header read {0} expected {1}".format(
                len(header), header_size))
        data = input_gzip_file.read(data_size)
        if len(data) != data_size:
            raise LogStreamError("Invalid data read {0} expected {1}".format(
                len(data), data_size))
        yield header, data
def load(cls, filename, metadata_only=False):
    """
    Load ring data from a file.

    :param filename: Path to a file serialized by the save() method.
    :param bool metadata_only: If True, only load `devs` and `part_shift`.
    :returns: A RingData instance containing the loaded data.
    """
    gz_file = GzipFile(filename, 'rb')
    # Python 2.6 GzipFile doesn't support BufferedIO
    if hasattr(gz_file, '_checkReadable'):
        gz_file = BufferedReader(gz_file)

    # New-format files start with a magic marker and a format version.
    if gz_file.read(4) == 'R1NG':
        format_version, = struct.unpack('!H', gz_file.read(2))
        if format_version != 1:
            raise Exception('Unknown ring format version %d' % format_version)
        return cls.deserialize_v1(gz_file, metadata_only=metadata_only)

    # Assume old-style pickled ring
    gz_file.seek(0)
    ring_data = pickle.load(gz_file)
    if hasattr(ring_data, 'devs'):
        return ring_data
    return RingData(ring_data['replica2part2dev_id'],
                    ring_data['devs'], ring_data['part_shift'])
def load(cls, filename):
    """
    Load ring data from a file.

    :param filename: Path to a file serialized by the save() method.
    :returns: A RingData instance containing the loaded data.
    """
    fp = GzipFile(filename, 'rb')
    # Python 2.6 GzipFile doesn't support BufferedIO
    if hasattr(fp, '_checkReadable'):
        fp = BufferedReader(fp)

    # New-format files begin with the 'R1NG' magic plus a big-endian
    # unsigned-short format version.
    if fp.read(4) == 'R1NG':
        # trailing comma: struct.unpack always returns a tuple
        version, = struct.unpack('!H', fp.read(2))
        if version != 1:
            raise Exception('Unknown ring format version %d' % version)
        ring_data = cls.deserialize_v1(fp)
    else:
        # Assume old-style pickled ring
        fp.seek(0)
        ring_data = pickle.load(fp)
        if not hasattr(ring_data, 'devs'):
            ring_data = RingData(ring_data['replica2part2dev_id'],
                                 ring_data['devs'],
                                 ring_data['part_shift'])
    return ring_data
def load(cls, filename):
    """
    Load ring data from a file.

    :param filename: Path to a file serialized by the save() method.
    :returns: A RingData instance containing the loaded data.
    """
    source = GzipFile(filename, 'rb')
    # Python 2.6 GzipFile doesn't support BufferedIO
    if hasattr(source, '_checkReadable'):
        source = BufferedReader(source)

    magic = source.read(4)
    if magic != 'R1NG':
        # Old-style pickled ring: rewind and unpickle the whole stream.
        source.seek(0)
        ring_data = pickle.load(source)
        if not hasattr(ring_data, 'devs'):
            ring_data = RingData(ring_data['replica2part2dev_id'],
                                 ring_data['devs'], ring_data['part_shift'])
        return ring_data

    version, = struct.unpack('!H', source.read(2))
    if version != 1:
        raise Exception('Unknown ring format version %d' % version)
    return cls.deserialize_v1(source)
def load(cls, filename, metadata_only=False):
    """
    Load ring data from a file.

    :param filename: Path to a file serialized by the save() method.
    :param bool metadata_only: If True, only load `devs` and `part_shift`.
    :returns: A RingData instance containing the loaded data.
    """
    fp = GzipFile(filename, "rb")
    if hasattr(fp, "_checkReadable"):
        # Python 2.6 GzipFile doesn't support BufferedIO
        fp = BufferedReader(fp)
    if fp.read(4) != "R1NG":
        # Old-style pickled ring: rewind and unpickle.
        fp.seek(0)
        ring_data = pickle.load(fp)
        if not hasattr(ring_data, "devs"):
            ring_data = RingData(ring_data["replica2part2dev_id"],
                                 ring_data["devs"], ring_data["part_shift"])
        return ring_data
    format_version, = struct.unpack("!H", fp.read(2))
    if format_version != 1:
        raise Exception("Unknown ring format version %d" % format_version)
    return cls.deserialize_v1(fp, metadata_only=metadata_only)
def read_primary_xml(repo_path, primary_xml=os.path.join('repodata', 'primary.xml')):
    """Parse a yum repository's primary.xml metadata (local or remote,
    optionally gzipped) and return its packages.

    :param repo_path: base path or URL of the repository
    :param primary_xml: location of the primary metadata, joined to repo_path
    :returns: set of RpmInfo entries, one per rpm package
    :raises ValueError: when the metadata file has an unsupported extension
    """
    primary_xml = os.path.join(repo_path, primary_xml)
    url = urlparse(primary_xml)
    if url.scheme not in [None, '', 'file']:
        # Remote file: download fully, then parse from memory.
        fdurl = urlopen(primary_xml)
        try:
            primary_xml_str = fdurl.read()
        finally:
            fdurl.close()
        if primary_xml.endswith('.gz'):
            primary_xml_gz = GzipFile(fileobj=StringIO(primary_xml_str), mode='rb')
            try:
                primary_xml_str = primary_xml_gz.read()
            finally:
                primary_xml_gz.close()
        # BUG FIX: the original only parsed remote *.gz files; a remote
        # plain .xml left pkgdb undefined and crashed below.
        pkgdb = minidom.parseString(primary_xml_str)
    else:
        if primary_xml.endswith('.gz'):
            primary_xml_gz = GzipFile(primary_xml, mode='rb')
            try:
                pkgdb = minidom.parseString(primary_xml_gz.read())
            finally:
                # BUG FIX: the gzip handle was never closed.
                primary_xml_gz.close()
        elif primary_xml.endswith('.xml'):
            pkgdb = minidom.parse(primary_xml)
        else:
            # BUG FIX: any other extension previously fell through and
            # crashed later with an undefined pkgdb (NameError).
            raise ValueError('Unsupported primary.xml file: %r' % primary_xml)

    # Parse packages
    pkgs = set()
    for pkg in pkgdb.getElementsByTagName('package'):
        if pkg.getAttribute('type') != 'rpm':
            continue
        rpm_name = pkg.getElementsByTagName('name')[0].firstChild.data
        rpm_version_obj = pkg.getElementsByTagName('version')[0]
        rpm_version = rpm_version_obj.getAttribute('ver')
        rpm_release = rpm_version_obj.getAttribute('rel')
        rpm_arch = pkg.getElementsByTagName('arch')[0].firstChild.data
        try:
            rpm_format_obj = pkg.getElementsByTagName('format')[0]
        except (AttributeError, IndexError):
            rpm_requires = rpm_provides = set()
        else:
            try:
                rpm_requires_obj = rpm_format_obj.getElementsByTagName('rpm:requires')[0]
                rpm_requires = set(r.getAttribute('name')
                                   for r in rpm_requires_obj.getElementsByTagName('rpm:entry'))
            except (AttributeError, IndexError):
                rpm_requires = set()
            try:
                rpm_provides_obj = rpm_format_obj.getElementsByTagName('rpm:provides')[0]
                rpm_provides = set(p.getAttribute('name')
                                   for p in rpm_provides_obj.getElementsByTagName('rpm:entry'))
            except (AttributeError, IndexError):
                rpm_provides = set()
        rpm_location_obj = pkg.getElementsByTagName('location')[0]
        rpm_path = os.path.join(repo_path, rpm_location_obj.getAttribute('href'))
        pkgs.add(RpmInfo(name=rpm_name, version=rpm_version, release=rpm_release,
                         arch=rpm_arch, is_src=bool(rpm_arch == 'src'),
                         requires=rpm_requires, provides=rpm_provides,
                         path=rpm_path))
    return pkgs
def _open_gzip(self, fobj):
    """Return a GzipFile wrapping *fobj*, rewound to the start, or None
    when *fobj* does not hold a valid gzip stream."""
    try:
        gz = GzipFile(fileobj=fobj)
        # a one-byte read forces the gzip header to be validated
        gz.read(1)
        gz.seek(0)
    except IOError:
        return None
    return gz
def decompress(name, data):
    """Return *data* inflated according to *name*'s extension.

    ``.gz`` and ``.bz2`` payloads are decompressed in memory; any other
    name returns the data unchanged.
    """
    if name.endswith('.gz'):
        data = GzipFile(mode='r', fileobj=StringIO(data)).read()
    elif name.endswith('.bz2'):
        data = BZ2File(mode='r', fileobj=StringIO(data)).read()
    return data
def from_url(url, **kwargs):
    """Create an urlset from an url.

    Transparently wraps gzip-compressed responses before handing the
    stream to UrlSet.
    """
    u = urlopen(url)
    # BUG FIX: leftover debug code printed the content-type and, for
    # text/html responses, printed u.read() -- consuming the body and
    # passing an exhausted stream to UrlSet.  The debug prints are gone.
    if u.headers.has_key("content-type") and u.headers["content-type"].lower().startswith("application/x-gzip"):
        u = GzipFile(fileobj=StringIO(u.read()))
    return UrlSet(u, url, **kwargs)
def download_etopo1(etopo_path, version='ice'): url = etopo1_url(version) log.info('%s was not found. Attempting to download from %s.' % (etopo_path, url)) try: response = urlopen(url) content_length = -1 blocksize = 2**22 blocksize_disk = 2**25 info = response.info() headers = info.headers content_lengths = filter(lambda x: x.startswith('Content-Length'), headers) if content_lengths: content_length = content_lengths[0] content_length = int(content_length.strip().split()[1]) log.info('Downloading %d bytes at %d byte chunks' % (content_length, blocksize)) log.info('This will take a while. Go enjoy some fresh air.') with SpooledTemporaryFile(2**30) as s: bytes = 0 last_percent = 0 data = response.read(blocksize) while data: s.write(data) bytes += len(data) data = response.read(blocksize) percent = float(bytes) / content_length if percent > last_percent + 0.05: log.debug('%d / %d = %f' % (bytes, content_length, percent)) last_percent = percent response.close() s.flush() s.seek(0) log.debug('Gunzipping file to %s' % etopo_path) g = GzipFile(fileobj=s) bytes = 0 with open(etopo_path, 'wb') as f: data = g.read(blocksize_disk) while data: f.write(data) data = g.read(blocksize_disk) bytes += len(data) log.debug('%d written' % bytes) except URLError, e: log.critical('Download from %s failed.' % url) raise e
def read(self, size=None):
    """Read *size* bytes, or -- when *size* is falsy -- the whole
    stream, pulled in chunks of ``self._size`` and concatenated."""
    if size:
        return GzipFile.read(self, size)
    size = self._size
    contents = StringIO()
    while True:
        chunk = GzipFile.read(self, size)
        if not chunk:
            contents.flush()
            break
        contents.write(chunk)
    return contents.getvalue()
def from_file(cls, file):
    """Build an instance from an open gzip-compressed vocabulary file.

    Layout: a native-endian uint32 word count, that many uint32
    frequency counts, then the words as newline-separated UTF-8 text.
    """
    stream = GzipFile(mode='rb', fileobj=file)
    size, = np.frombuffer(stream.read(4), np.uint32)
    counts = np.frombuffer(stream.read(4 * size), np.uint32).tolist()
    words = stream.read().decode('utf8').splitlines()
    return cls(words, counts)
def title(text):
    """
    Retrieve titles from URL in text.

    >>> len(title('no url here'))
    0

    TODO This case should ignore the 404.
    >>> print(title('https://hdhoang.space/404 https://hdhoang.space/')) # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
        ...
    urllib.error.HTTPError: HTTP Error 404: Not Found
    >>> print(title('https://hdhoang.space/luser.html https://hdhoang.space/luser.html'))
    IRC bot / IRC bot
    >>> print(title('http://www.nytimes.com/2016/01/26/business/marvin-minsky-pioneer-in-artificial-intelligence-dies-at-88.html'))
    Marvin Minsky, Pioneer in Artificial Intelligence, Dies at 88 - The New York Times
    >>> print(title('http://www.baomoi.com/bao-nhieu-tan-bot-trung-quoc-da-duoc-nhap-ve-lam-tra-o-long-tea-plus/c/18486151.epi'))
    Bao nhiêu tấn bột Trung Quốc đã được nhập về làm trà Ô long TEA Plus? - GĐ&XH;
    >>> print(title('http://news.zing.vn/chi-tiet-ban-do-cam-duong-dip-29-o-ha-noi-post574142.html'))
    Chi tiết bản đồ cấm đường dịp 2/9 ở Hà Nội - Thời sự - Zing.vn
    >>> print(title('https://www.facebook.com/photo.php?fbid=261863914155282&set=a.261860180822322.1073742015.100009950253866&type=3&theater')) # doctest: +ELLIPSIS
    Vo Thanh Thuy - Vo Thanh Thuy ... | Facebook
    >>> print(title('https://imgur.com/M18GYfw?r https://imgur.com/GUFyoUa?r'))
    Glorious new key cap set for my work keyboard!  - Imgur
    """
    # Titles that carry no information; drop them from the output.
    uninteresting = ["XKCDB: The: The #xkcd Quote Database",
                     "Saturday Morning Breakfast Cereal",
                     "Library Genesis"]
    titles = []
    # Crude URL detection: any whitespace-separated token starting 'http'.
    urls = filter(lambda w: w.startswith('http'), text.split())
    for u in urls:
        # Cookie-aware opener; some sites require cookies to serve pages.
        request = build_opener(HTTPCookieProcessor())
        request.addheaders = [('Accept-Encoding', 'gzip'),
                              ('User-Agent', 'Mozilla/5.0')]
        response = request.open(u)
        if response.info().get('Content-Encoding') == 'gzip':
            if sys.version_info.major == 3:
                # Py3: GzipFile can wrap the response stream directly.
                response = GzipFile(fileobj=response)
            else:
                # Py2: buffer the body first, then gunzip it.
                response = GzipFile(fileobj=StringIO(response.read()))
        # Only the first 50 kB are fetched: enough to find <title>.
        title = BeautifulSoup(response.read(50000), 'html.parser').title
        response.close()
        if (title and 'Imgur:' not in title.string
                and title.string not in uninteresting):
            titles.append(title.string.replace('\n', '').strip())
    return ' / '.join(titles)
class Reader:
    """Iterates Snapshot messages from a gzip-compressed stream file.

    File layout: a little-endian uint32 length followed by a serialized
    User message, then repeated (uint32 length, Snapshot message) pairs
    terminated by a zero length (or EOF).
    """

    def __init__(self, file_name):
        self._stream = GzipFile(file_name, 'rb')
        length = int.from_bytes(self._stream.read(4), 'little')
        self.user = User()
        self.user.ParseFromString(self._stream.read(length))

    def __iter__(self):
        raw_len = self._stream.read(4)
        while size := int.from_bytes(raw_len, 'little'):
            snapshot = Snapshot()
            snapshot.ParseFromString(self._stream.read(size))
            yield snapshot
            raw_len = self._stream.read(4)
        self._stream.close()
def read(self, chunk_size=None):
    """
    Reads specified chunk_size or the whole file if chunk_size is None.

    If reading the whole file and the content-encoding is gzip, also
    gunzip the read content.

    If chunk_size is provided, the same chunk_size will be used in all
    further read() calls until the file is reopened or seek() is called.
    """
    if self._pos >= self._get_size() or chunk_size == 0:
        return ""
    if chunk_size is None and self._chunks is None:
        # Whole-file read: fetch everything and transparently gunzip.
        meta, data = self.file.get(include_meta=True)
        if meta.get("content-encoding", None) == "gzip":
            data = GzipFile(mode="rb", fileobj=StringIO(data)).read()
    else:
        if self._chunks is None:
            # When reading by chunks, we're supposed to read the whole file
            # before calling get() again.
            self._chunks = self.file.get(chunk_size=chunk_size)
        try:
            data = self._chunks.next()
        except StopIteration:
            data = ""
    self._pos += len(data)
    return data
def process_request(self, request):
    """Gunzip gzip-encoded request bodies in place, rejecting
    rate-limited or oversized uploads."""
    if request.META.get('HTTP_CONTENT_ENCODING', None) == 'gzip':
        limited = is_ratelimited(request, group='gunzip_request_middleware',
                                 key='ip', rate='300/1m', increment=True)
        if limited:
            return http.HttpResponse('Rate limit exceeded: too many gzipped request bodies', status=429)
        data = request._stream.read()
        if len(data) > settings.GUNZIP_MAX_COMPRESSED_SIZE:
            logger.warning('Compressed request body is too large: %s', request.path,
                           extra={'status_code': 400, 'request': request})
            return http.HttpResponseBadRequest('Compressed request body is too large')
        try:
            body = GzipFile(mode='rb', fileobj=StringIO(data)).read()
        except IOError:
            return http.HttpResponseBadRequest('Invalid content-encoding, could not gunzip')
        request._stream = LimitedStream(StringIO(body), len(body))
        del request.META['HTTP_CONTENT_ENCODING']
    return None
def gzip_decode(data):
    """Gunzip *data* (a compressed byte string) and return the payload."""
    reader = GzipFile(fileobj=StringIO(data))
    try:
        return reader.read()
    finally:
        reader.close()
def main(path, key, force=False):
    """Decrypt (xxtea) and un-gzip every target found under *path*.

    :param path: directory or file to scan for targets
    :param key: xxtea key; a falsy value skips decryption
    :param force: overwrite existing output files when True
    :returns: list of output paths (written or skipped)
    """
    result = []
    targets = find_targets(path)
    for target in targets:
        out = target[:-1]
        # BUG FIX: the original condition was `os.path.exists(out) and
        # force`, i.e. exactly backwards -- passing -f made it *refuse*
        # to overwrite.  Skip only when the file exists and -f is absent.
        if os.path.exists(out) and not force:
            click.secho(
                "file is existed, please use `-f` to overwrite it: {}".format(
                    out), fg="yellow")
            result.append(out)
            continue
        # BUG FIX: the input handle was never closed; use a context manager.
        with open(target, "rb") as fh:
            content = fh.read()
        if key:
            content = xxtea.decrypt(content, key)
        if content[:2] == b'\037\213':  # gzip magic number
            try:
                mock_fp = BytesIO(content)
                gz = GzipFile(fileobj=mock_fp)
                content = gz.read()
            except Exception as e:
                import traceback
                click.secho("ungz fault {} in {}".format(
                    e, traceback.format_tb(sys.exc_info()[2])[-1]), fg="red")
        with open(out, 'wb') as _:
            _.write(content)
        click.secho("decrypt successful: {}".format(out), fg="green")
        result.append(out)
    return result
def getContent(self, url, data=None, referer=None):
    """Fetch *url* (POSTing urlencoded *data* when given) and return the
    response body, transparently gunzipping compressed responses.

    Retries on HTTPError up to ``self.max_retries`` times, then
    re-raises.
    """
    encoded_data = urlencode(data) if data else None
    default_headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.2.9) Gecko/20100824 Firefox/3.6.9 ( .NET CLR 3.5.30729; .NET4.0E)',
                       'Accept-Language': 'pt-br;q=0.5',
                       'Accept-Charset': 'utf-8;q=0.7,*;q=0.7',
                       'Accept-Encoding': 'gzip',
                       'Connection': 'close',
                       'Cache-Control': 'no-cache',
                       'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                       'Referer': referer}
    req = Request(url, encoded_data, default_headers, origin_req_host=referer)
    # BUG FIX: the original "retry" had no loop -- after a swallowed
    # HTTPError it fell through and used an unbound `handle` (NameError).
    # Actually retry the request until it succeeds or retries run out.
    retries = 0
    while True:
        try:
            handle = self._opener.open(req)
            break
        except HTTPError:
            retries += 1
            if retries > self.max_retries:
                raise
    if handle.info().get('Content-Encoding') == 'gzip':
        data = handle.read()
        buf = StringIO(data)
        f = GzipFile(fileobj=buf)
        response = f.read()
    else:
        response = handle.read()
    # return response.decode('utf-8')
    return response
def s3_handles(aws_access_key, aws_secret_key, bucket, keys):
    """
    Return handles for S3 objects matching keys in bucket.

    Parameters
    ----------
    aws_access_key : string
        AWS access key from credentials.
    aws_secret_key : string
        AWS secret key from credentials.
    bucket : string
        S3 bucket to read objects from.
    keys : [string]
        List of keys in bucket to iterate over.
    """
    conn_bucket = S3Connection(aws_access_key, aws_secret_key).get_bucket(bucket)
    handles = []
    for prefix in keys:
        for obj in conn_bucket.list(prefix=prefix):
            raw = obj.get_contents_as_string()
            inflated = GzipFile(fileobj=StringIO(raw)).read()
            handles.append(StringIO(inflated))
    return handles
def restRequest(url): printDebugMessage('restRequest', 'Begin', 11) printDebugMessage('restRequest', 'url: ' + url, 11) url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]") try: user_agent = getUserAgent() http_headers = {'User-Agent': user_agent, 'Accept-Encoding': 'gzip'} req = urllib2.Request(url, None, http_headers) resp = urllib2.urlopen(req) encoding = resp.info().getheader('Content-Encoding') result = None if encoding == None or encoding == 'identity': result = resp.read() elif encoding == 'gzip': result = resp.read() printDebugMessage('restRequest', 'result: ' + result, 21) gz = GzipFile(fileobj=StringIO(result), mode="r") result = gz.read() else: raise Exception('Unsupported Content-Encoding') resp.close() except urllib2.HTTPError, ex: print ex.read() raise
def decompress_gzip(data):
    """Inflate a gzip-compressed byte string and return the payload."""
    buf = StringIO(data)
    return GzipFile(fileobj=buf, mode="rb").read()
def open(self):
    # Fetch self.url with retries, gunzipping compressed responses.
    # NOTE(review): this block is truncated in the visible chunk -- the
    # body of the trailing `finally:` is missing, as is whatever is done
    # with `result` afterwards.
    request = Request(self.url)
    request.add_header('User-Agent', 'lastfm-lda recommender v.0.0.-1')
    request.add_header('Accept-encoding', 'gzip')
    while True:
        URLLoadListener.num_connections += 1
        response = None
        try:
            response = urlopen(request, timeout=10)
            if response.info().get('Content-Encoding') == 'gzip':
                # buffer the body and inflate it in memory
                f = GzipFile(fileobj=StringIO(response.read()))
                result = f.read()
                f.close()
            else:
                result = response.read()
            break
        except Exception, e:
            # give up after 3 attempts; BadStatusLine gets a clearer message
            if self.retries > 2:
                if isinstance(e, BadStatusLine):
                    raise Exception(
                        "last.fm server does not respond (%s)" % e)
                raise e
            self.retries += 1
            print self.url
            print "failed with", e
            print "retry #", self.retries
            print
        finally:
def process_result_value(self, value, dialect):
    """Inflate and deserialize (loads) a gzip-compressed column value;
    empty/None values come back as ''."""
    if not value:
        return ''
    raw = GzipFile(mode='r', fileobj=StringIO(value)).read()
    return loads(raw)
class GzipDecompressStream(object):
    """Stream-like wrapper that inflates a gzip-compressed source.

    Output is buffered so that read(size) can hand back exactly `size`
    bytes (until EOF).
    """

    def __init__(self, fileobj):
        """
        Create a new instance of a gzip stream from the stream-like input

        :param fileobj: stream-like object to compress or decompress
        """
        self.__buf = Buffer()
        self.__gzip = GzipFile(None, mode='rb', fileobj=fileobj)

    # 'with' support; using it is optional -- passing this object around
    # without a context manager works too.
    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.__gzip.close()
        self.__gzip = None
        self.__buf = None
        return

    def read(self, size=-1):
        # Accumulate decompressed chunks until the buffer can satisfy
        # `size` exactly (or the source is exhausted).
        while size < 0 or len(self.__buf) < size:
            chunk = self.__gzip.read(CHUNK)
            if not chunk:
                break
            self.__buf.write(chunk)
        return self.__buf.read(size)
def fasta2dict(filename, want_dict = 'YES',key_func=None): ''' Fasta.fasta2dict(filename, want_dict = 'YES',key_func=None) ---------------------------------------------------------- Very fast Fasta Loader. Used internally. You should be using Fasta.load() or Fasta.seqs() instead. ''' D = {} if filename[-3:] == '.gz': FH = GzipFile(filename) else: FH = open(filename,'r') chunks = FH.read().split('>') for chunk in chunks[1:]: lines = chunk.split('\n') raw_id = lines[0] seq = ''.join(lines[1:]) try: if not key_func: key = raw_id.split()[0] else: key = key_func(raw_id) except: print raw_id sys.stdout.flush() sys.exit(1) D[key] = seq if want_dict: return D else: return D.values()
def gunzip(data, max_length=0):
    '''Gunzip the given data and return as much data as possible.

    This is resilient to CRC checksum errors.
    '''
    f = GzipFile(fileobj=StringIO(data))
    output = ''
    # non-empty sentinel so the loop body runs at least once
    chunk = '.'
    while chunk:
        try:
            # NOTE(review): 8196 looks like a typo for 8192 -- harmless,
            # but worth confirming.
            chunk = f.read(8196)
            output += chunk
            if max_length and len(output) > max_length:
                raise DecompressSizeError('Object exceeded %s bytes' % max_length)
        except (IOError, EOFError, struct.error):
            # complete only if there is some data, otherwise re-raise
            # see issue 87 about catching struct.error
            # some pages are quite small so output is '' and f.extrabuf
            # contains the whole page content
            if output or f.extrabuf:
                output += f.extrabuf
                break
            else:
                raise
    return output
def DecodeProcFile(proc_file):
    """Return the text of a proc dump.

    *proc_file* is either the contents themselves or, when shorter than
    256 characters, a file name to read.  Gzip- or bzip2-compressed
    contents are decompressed on a best-effort basis; contents already
    containing 'Subsystem Id:' are returned untouched.
    """
    if len(proc_file) < 256:
        fd = open(proc_file)
        proc_file = fd.read(1024 * 1024)
        fd.close()
    if proc_file.find('Subsystem Id:') >= 0:
        return proc_file
    decoded = None
    try:
        from gzip import GzipFile
        from StringIO import StringIO
        gz = GzipFile(mode='r', fileobj=StringIO(proc_file))
        decoded = gz.read(1024 * 1024)
        gz.close()
    except:
        pass
    if decoded is None:
        try:
            from bz2 import decompress
            decoded = decompress(proc_file)
        except:
            pass
    if decoded is not None:
        proc_file = decoded
    return proc_file
def handleResponse(self, response):
    # Twisted HTTP client callback: route the finished body to the
    # factory (page/noPage) and close the transport.
    if self.quietLoss:
        return
    if self.failed:
        self.factory.noPage(
            failure.Failure(
                error.Error(
                    self.status, self.message, response)))
    elif self.length != None and self.length != 0:
        # Bytes were still expected: report a partial download.
        self.factory.noPage(failure.Failure(
            client.PartialDownloadError(self.status,
                                        self.message,
                                        response)))
    else:
        if self.decode:
            # Response is gzip-encoded: buffer it and inflate in memory.
            s = StringIO()
            s.write(response)
            # NOTE(review): seek(-1) from the stream start looks wrong --
            # s.seek(0) was presumably intended; confirm against the
            # StringIO implementation in use before changing.
            s.seek(-1)
            g = GzipFile(fileobj=s, mode='rb')
            try:
                response = g.read()
            except IOError:
                self.factory.noPage(failure.Failure(
                    client.PartialDownloadError(self.status,
                                                self.message,
                                                response)))
                self.transport.loseConnection()
                return
            g.close()
        self.factory.page(response)
    # server might be stupid and not close connection.
    self.transport.loseConnection()
def LoadGuide():
    """Load the XMLTV guide configured in Prefs['xmltv'] (remote or
    local, optionally gzipped) and populate the global GUIDE mapping
    with per-channel programme entries."""
    src = Prefs['xmltv']
    if src.startswith('http://') or src.startswith('https://'):
        # Plex can't handle compressed files, using standart Python methods instead
        if src.endswith('.gz') or src.endswith('.gz?raw=1'):
            raw = BytesIO(urlopen(src).read())
            try:
                xmltv = GzipFile(fileobj=raw).read()
            except:
                Log.Error('Provided file %s is not a valid GZIP file' % src)
                xmltv = None
        else:
            xmltv = HTTP.Request(src).content
    else:
        # Local compressed files are not supported at the moment
        xmltv = Resource.Load(src, binary=True)
    if xmltv != None:
        try:
            root = xml.etree.ElementTree.fromstring(xmltv)
        except:
            Log.Error('Provided file %s is not a valid XML file' % src)
            root = None
        if root != None:
            count = 0
            for programme in root.findall("./programme"):
                channel = programme.get('channel')
                count = count + 1
                item = {
                    'start': datetime_from_utc_to_local(programme.get('start')),
                    'stop': datetime_from_utc_to_local(programme.get('stop')),
                    'title': programme.find('title').text,
                    'order': count,
                }
                GUIDE.setdefault(channel, {})[count] = item
    return None
def ungzip_stream(stream):
    """Return the ungzipped stream, or the input unchanged when it is
    not valid gzip data."""
    try:
        return GzipFile(fileobj=StringIO(stream)).read()
    except IOError:
        return stream
def decode_content(self, data):
    """Gunzip *data* when the request declared Content-Encoding: gzip;
    otherwise return it untouched."""
    if web.ctx.env.get('HTTP_CONTENT_ENCODING') != 'gzip':
        return data
    return GzipFile(fileobj=StringIO(data)).read()
class ZipLengthReader(LengthReader):
    """
    Tries to read the body as gzip according to length.
    In case that fails, it disregards the Content-Length and reads it
    normally.
    """

    def __init__(self, length, text):
        # TODO test if this works with gzipped responses in WARC
        try:
            # Treat the first `length` bytes as a gzip member; on success
            # the inflated size becomes the effective length.
            self._file = GzipFile(fileobj=BytesIO(text[:length]), mode='rb')
            self._text = self._file.read()
            super(ZipLengthReader, self).__init__(len(self._text))
        except IOError:
            # Not gzip: fall back to the raw text's full length.
            self._file = None
            super(ZipLengthReader, self).__init__(len(text))

    def __del__(self):
        # Close the gzip handle if one was opened.
        if self._file:
            self._file.close()

    def feed(self, parser, text):
        """Parse the body according to remaining length"""
        if self.remaining > 0:
            if self._file:
                # substitute the inflated body for the raw bytes
                text = self._text
            self.remaining, text = parser.feed_length(text, self.remaining)
            if self.remaining <= 0:
                parser.mode = 'end'
        return text
def scrape(self, url):
    """Fetch *url* with a random User-Agent and return the page text
    (utf-8, falling back to ascii), or an error-description string."""
    global h
    # Make the request with a random user agent string
    h = choice(self.user_agent_list)
    req = r.Request(url, headers={'User-Agent': h})
    # Get the response
    res = r.urlopen(req)
    # Cleanup
    del req
    # Check for an error (HTTP status code >= 400)
    if (int(res.getcode()) >= 400):
        return "%s : Error encountered, : %s" % (url, res.getcode())
    if (("content-encoding" in res.info().keys()) and
            (res.info()["content-encoding"] == "gzip")):
        from io import BytesIO
        # BUG FIX: the original passed the raw byte string to fileobj=;
        # GzipFile needs a file-like object, so every gzip response
        # crashed on read().  Wrap the bytes in BytesIO first.
        buf = BytesIO(res.read())
        f = GzipFile(fileobj=buf)
        res = f.read()
    else:
        res = res.read()
    # Try to decode the page with utf-8 and then ascii.
    # BUG FIX: the ascii decode was dead code -- it sat directly after
    # `return res.decode('utf-8')` and could never run.
    try:
        return res.decode('utf-8')
    except UnicodeDecodeError:
        try:
            return res.decode('ascii')
        # Notify the user on error
        except UnicodeDecodeError:
            print("%s : Encountered encoding error." % (url))
            return "%s : Encountered encoding error." % (url)
def unzip(self, html_data):
    """Gunzip *html_data* when possible; on any failure, return the
    input unchanged."""
    try:
        return GzipFile(fileobj=StringIO(html_data), mode="r").read()
    except:
        return html_data
def handle_stackexchange_login(self, data):
    # OAuth callback handler: exchange the received code for a token,
    # call the /me endpoint and render the result as an HTML page.
    self.send_response(200)
    self.send_header("Content-type", "text/html")
    self.log_message(self.path)
    self.end_headers()
    c = Client(StackExchange, get_config())
    # *data* presumably holds the OAuth authorization-response
    # parameters -- TODO confirm against the dispatching caller.
    cred = c.flow.authorization_received(data)
    d = c.request("/me", body=urlencode({"site": "stackoverflow"}))
    self.wfile.write("<!DOCTYPE html>")
    self.wfile.write("<head><meta charset=\"utf-8\"/></head><body>")
    self.wfile.write("Access token: %s<br>" % cred.access_token)
    self.wfile.write("Type: %s<br>" % cred.token_type)
    self.wfile.write("Expires in: %d<br>" % cred.expires_in)
    # stackexchange gzips all data
    h = StringIO(d)
    gzip_data = GzipFile(fileobj=h)
    d = gzip_data.read()
    gzip_data.close()
    self.wfile.write(d)
    self.wfile.write("</body></html>")
def try_decompress(data):
    """Gunzip *data*; on failure, fall back to whatever the gzip reader
    buffered past the stream (extrabuf)."""
    try:
        gf = GzipFile(fileobj=StringIO(data), mode="r")
        return gf.read()
    except:
        return gf.extrabuf
def process_request(self, request):
    """Transparently gunzip request bodies sent with
    Content-Encoding: gzip, enforcing a rate limit and a maximum
    compressed size."""
    if request.META.get('HTTP_CONTENT_ENCODING', None) != 'gzip':
        return None
    if is_ratelimited(request, group='gunzip_request_middleware',
                      key='ip', rate='300/1m', increment=True):
        return http.HttpResponse(
            'Rate limit exceeded: too many gzipped request bodies',
            status=429)
    data = request._stream.read()
    if len(data) > settings.GUNZIP_MAX_COMPRESSED_SIZE:
        logger.warning('Compressed request body is too large: %s',
                       request.path,
                       extra={
                           'status_code': 400,
                           'request': request,
                       })
        return http.HttpResponseBadRequest(
            'Compressed request body is too large')
    try:
        uncompressed = GzipFile(mode='rb', fileobj=StringIO(data)).read()
        request._stream = LimitedStream(StringIO(uncompressed),
                                        len(uncompressed))
        del request.META['HTTP_CONTENT_ENCODING']
    except IOError:
        return http.HttpResponseBadRequest(
            'Invalid content-encoding, could not gunzip')
    return None
def extract_nodes(self):
    """Second XOR+gzip loop to extract nodes from decoded config

    @return: string of IPs, or None when decoding fails
    """
    # Don't XOR first chunk of random bytes used to mess up file magic
    self.data = self.xor("\x55\xAA")[0x80:]
    try:
        gz = GzipFile(fileobj=StringIO(self.data))
        decoded = gz.read()
        gz.close()
    except IOError:
        log.warning("DridexDecode_v1: Unable to decode <nodes> element: "
                    "data is not gzip")
        return None
    if not self.is_printable(decoded):
        log.warning("DridexDecode_v1: Unable to decode <nodes> element: "
                    "data is not valid")
        return None
    match = self.NODES.search(decoded)
    if match:
        return match.group(1)
    log.warning("DridexDecode_v1: Unable to decode <nodes> element: "
                "<nodes> not found")
    return None
def httxdecompress(response):
    '''
    Decompression of the body from response.

    The supported compression types are gzip, bzip2 and deflate

    @param response: the response from the http server containing the body
    @type response: L{HttxResponse}
    '''
    method = response.getheader('content-encoding')
    if not method:
        return
    if method == 'gzip':
        response.body = GzipFile(fileobj=response.bodyfile).read()
    elif method == 'deflate':
        try:
            response.body = zlibdecompress(response.body)
        except zliberror:
            # Many web sites fail to send the first bytes of the header
            # possibly it is a header-stripped gzip file
            response.body = zlibdecompress(response.body, -zlibMAX_WBITS)
    elif method == 'bzip2':
        response.body = bz2decompress(response.body)
def get_athena_version(fname):
    """Return (ftype, parent, version) for an Athena project file.

    ftype is 'gzip' or 'ascii'; parent is the program that wrote the
    file and version its version string -- both 'unknown' when no
    'athena project file' header line is found in the first 200 chars.
    """
    ftype = 'ascii'
    text = None
    try:
        fh = GzipFile(fname)
        try:
            text = fh.read()
            ftype = 'gzip'
        finally:
            # BUG FIX: the gzip handle was never closed in the original.
            fh.close()
    except Exception:
        text = None
    if text is None:
        # BUG FIX: the plain-text handle leaked too; use a context manager.
        with open(fname, 'r') as fh:
            text = fh.read()
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    version = 'unknown'
    # BUG FIX: `parent` was only bound inside the matching branch, so
    # files without a header line crashed with NameError at the return.
    parent = 'unknown'
    for line in text[:200].split('\n'):
        line = line.lower().replace('#', '').replace('--', '').strip()
        if 'athena project file' in line:
            line = line.replace('version', '')
            line = line.replace('_____header1', '')
            line = line.replace('athena project file', '')
            words = [a.strip() for a in line.split()]
            version = words.pop().replace('"', "").replace(',', "")
            parent = words.pop()
    return (ftype, parent, version)
def _get_avg_views(site, article):
    """Return the article's average pageviews per minute over the last
    30 days via the Wikimedia REST metrics API, or None on any failure."""
    url = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
           "{0}.{1}/all-access/user/{2}/daily/{3}/{4}")
    days = 30
    slug = quote(article, safe="")
    start = (datetime.utcnow() - timedelta(days=days)).strftime("%Y%m%d")
    end = datetime.utcnow().strftime("%Y%m%d")
    query = url.format(site.lang, site.project, slug, start, end)
    try:
        response = site._opener.open(query)  # We're terrible
    except URLError:
        return None
    result = response.read()
    if response.headers.get("Content-Encoding") == "gzip":
        result = GzipFile(fileobj=StringIO(result)).read()
    try:
        res = loads(result)
    except ValueError:
        return None
    if "items" not in res:
        return None
    total_views = sum(item["views"] for item in res["items"])
    return total_views / (float(days) * 24 * 60)
def open(self):
    # Fetch self.url with retries, gunzipping compressed responses.
    # NOTE(review): this block is truncated in the visible chunk -- the
    # trailing `finally:` has no body here, and the use of `result`
    # after the loop is outside this view.
    request = Request(self.url)
    request.add_header('User-Agent', 'lastfm-lda recommender v.0.0.-1')
    request.add_header('Accept-encoding', 'gzip')
    while True:
        URLLoadListener.num_connections += 1
        response = None
        try:
            response = urlopen(request, timeout=10)
            if response.info().get('Content-Encoding') == 'gzip':
                # buffer the body and inflate it in memory
                f = GzipFile(fileobj=StringIO(response.read()))
                result = f.read()
                f.close()
            else:
                result = response.read()
            break
        except Exception, e:
            # after 3 failed attempts give up; BadStatusLine gets a
            # clearer error message
            if self.retries > 2:
                if isinstance(e, BadStatusLine):
                    raise Exception("last.fm server does not respond (%s)" % e)
                raise e
            self.retries += 1
            print self.url
            print "failed with", e
            print "retry #", self.retries
            print
        finally:
def GET(self):
    # Fetch a remote tar.gz archive, extract one named member and return
    # its content (plus size) as JSON.
    try:
        pyDict = {}
        data = param_input()
        # client-cert authenticated download; TLS verification disabled
        response = get(str(data.file_location), cert=config_get('webui', 'usercert'), verify=False)
        if not response.ok:
            response.raise_for_status()
        cont = response.content
        file_like_object = BytesIO(cont)
        # NOTE(review): `open` here appears to be tarfile.open imported
        # under that name (mode='r:gz', fileobj=) -- confirm the import
        # at the top of the file.
        tar = open(mode='r:gz', fileobj=file_like_object)
        for member in tar.getmembers():
            if member.name == str(data.file_name):
                try:
                    f = tar.extractfile(member)
                    # cap reads at ~16 MB
                    pyDict['content'] = f.read(16000000)
                    pyDict['size'] = f.tell()
                    jsonResponse = dumps(pyDict)
                    tar.close()
                    return jsonResponse
                except UnicodeDecodeError:
                    # member itself is gzip-compressed: re-extract and inflate
                    f = tar.extractfile(member)
                    out = GzipFile(fileobj=f)
                    pyDict['content'] = out.read(16000000)
                    pyDict['size'] = out.tell()
                    jsonResponse = dumps(pyDict)
                    tar.close()
                    return jsonResponse
        return "ok"
    except ConnectionError, err:
        raise generate_http_error(503, str(type(err)), str(err))
def from_url(url, **kwargs):
    """ Create an urlset from an url """
    u = urlopen(url)
    headers = u.headers
    gzipped = headers.has_key("content-type") and \
        headers["content-type"].lower() == "application/x-gzip"
    if gzipped:
        u = GzipFile(fileobj=StringIO(u.read()))
    return UrlSet(u, url, **kwargs)
def __init__(self, data):
    """Decode base64-encoded, gzip-compressed *data* into a fresh
    temporary file; its path is stored in self.name."""
    gzd = GzipFile(mode='r', fileobj=StringIO(b64decode(data)))
    fd, fname = tempfile.mkstemp()
    os.write(fd, gzd.read())
    os.close(fd)
    gzd.close()
    self.name = fname
def fasta2dict(filename, want_dict='YES', key_func=None): ''' Fasta.fasta2dict(filename, want_dict = 'YES',key_func=None) ---------------------------------------------------------- Very fast Fasta Loader. Used internally. You should be using Fasta.load() or Fasta.seqs() instead. ''' D = {} if filename[-3:] == '.gz': FH = GzipFile(filename) else: FH = open(filename, 'r') chunks = FH.read().split('>') for chunk in chunks[1:]: lines = chunk.split('\n') raw_id = lines[0] seq = ''.join(lines[1:]) try: if not key_func: key = raw_id.split()[0] else: key = key_func(raw_id) except: print raw_id sys.stdout.flush() sys.exit(1) D[key] = seq if want_dict: return D else: return D.values()
def Request(self,url,data=None,headers={}): """ send a request using specified url params: url : url you want to send data : additional post data headers : headers you want to attach, eg. {'referer' : 'http : //www.baiud.com'} """ #setup request info if url is None or url == '': raise HttpWrapperException("url can't be empty!") if 'user-agent' not in headers: headers['user-agent'] = 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16' #auto add gzip capabilities if 'Accept-Encoding' not in headers: headers['Accept-Encoding'] = 'gzip' self.__opener.addheaders = headers.items() try: if data is not None: req = self.__opener.open(url,data = urllib.urlencode(data)) else: req = self.__opener.open(url) #check if gzip encoding if req.headers.get("content-encoding") == "gzip": gz = GzipFile( fileobj = StringIO(req.read())) resData = HttpWrapperResponseData(gz.read(),req.geturl(),req.info().dict,req.getcode()) else: resData = HttpWrapperResponseData(req.read(),req.geturl(),req.info().dict,req.getcode()) req.close() return resData except urllib2.HTTPError,e: return HttpWrapperResponseData(e.fp.read(),'',e.headers,e.code)
def process(self, wealth, imported_file, account=None):
    """Parse a gzip-compressed KMyMoney XML export and populate this
    importer's accounts, categories, transactions and splits, converting
    split amounts when the file's currency differs from wealth's."""
    decompressed = GzipFile(fileobj=imported_file.file).read()
    model = {
        'accounts': {},
        'categories': {},
        'currency': [],
        'transactions': [],
        'category_splits': [],
        'account_splits': [],
        'wealth': wealth,
    }
    parser = make_parser()
    handler = KMYXmlHandler(model)
    parser.setContentHandler(handler)
    parseString(decompressed, handler)
    category_splits = model['category_splits']
    # if main currencies differ, re-calculate
    if model['currency'] != model['wealth'].currency:
        exchange_rate = get_rate(model['currency'], model['wealth'].currency)
        for split in category_splits:
            split.amount *= exchange_rate
    self.accounts = model['accounts'].values()
    self.categories = self.__build_category_tree(model['categories']).values()
    self.transactions = [t for t in model['transactions'] if t.date]
    self.category_splits = [s for s in category_splits if s.category]
    self.account_splits = [s for s in model['account_splits'] if s.account]
    self.currency = model['currency']