Exemple #1
0
def _read_raw_athena(filename):
    """Try to read an Athena project file as plain text, to determine validity.

    The file is first treated as gzip-compressed; failing that, it is read
    as a plain text file.

    :param filename: path to the candidate Athena project file.
    :returns: decoded file content, or None if it could not be read either way.
    """
    # Try gzip first.  Context managers fix the original bug where an
    # exception raised by GzipFile()/open() left `fh` unbound, so the
    # `finally: fh.close()` raised NameError instead of falling back.
    try:
        with GzipFile(filename) as fh:
            return bytes2str(fh.read())
    except Exception:
        pass

    # Fall back to reading as a plain text file.
    try:
        with open(filename, 'r') as fh:
            return bytes2str(fh.read())
    except Exception:
        return None
Exemple #2
0
def generate_log_stream_from_file(path):
    """
    yield a sequence of (header, data) tuples from a named file

    Each frame is a fixed-size packed header (protocol version, header size,
    data size) followed by the header and data payloads.

    :param path: path to the gzip-compressed log stream file.
    :raises LogStreamError: on protocol mismatch or a short header/data read.
    """
    # `with` ensures the gzip handle is closed (the original leaked it).
    with GzipFile(filename=path) as input_gzip_file:
        while True:
            packed_frame = input_gzip_file.read(_frame_size)
            if len(packed_frame) < _frame_size:
                # End of stream.  A plain `return` ends the generator; raising
                # StopIteration here is a RuntimeError on Python 3.7+ (PEP 479).
                return

            protocol_version, header_size, data_size = struct.unpack(_frame_format,
                                                                     packed_frame)
            if protocol_version != _frame_protocol_version:
                raise LogStreamError("Invalid protocol {0} expected {1}".format(
                    protocol_version, _frame_protocol_version))

            header = input_gzip_file.read(header_size)
            if len(header) != header_size:
                raise LogStreamError("Invalid header read {0} expected {1}".format(
                    len(header), header_size))

            data = input_gzip_file.read(data_size)
            if len(data) != data_size:
                raise LogStreamError("Invalid data read {0} expected {1}".format(
                    len(data), data_size))

            yield header, data
Exemple #3
0
    def load(cls, filename, metadata_only=False):
        """
        Load ring data from a file.

        :param filename: Path to a file serialized by the save() method.
        :param bool metadata_only: If True, only load `devs` and `part_shift`.
        :returns: A RingData instance containing the loaded data.
        """
        gz_file = GzipFile(filename, 'rb')
        # Python 2.6 GzipFile doesn't support BufferedIO
        if hasattr(gz_file, '_checkReadable'):
            gz_file = BufferedReader(gz_file)

        # See if the file is in the new format.  read() returns bytes on
        # Python 3, so compare against a byte literal (b'R1NG' == 'R1NG'
        # on Python 2, where str is bytes).
        magic = gz_file.read(4)
        if magic == b'R1NG':
            format_version, = struct.unpack('!H', gz_file.read(2))
            if format_version == 1:
                ring_data = cls.deserialize_v1(gz_file,
                                               metadata_only=metadata_only)
            else:
                raise Exception('Unknown ring format version %d' %
                                format_version)
        else:
            # Assume old-style pickled ring.  NOTE: unpickling is only safe
            # for trusted ring files.
            gz_file.seek(0)
            ring_data = pickle.load(gz_file)
        # All data has been read by now; close to avoid leaking the handle.
        gz_file.close()

        if not hasattr(ring_data, 'devs'):
            # Plain dict from an old pickle: wrap it into a RingData instance.
            ring_data = RingData(ring_data['replica2part2dev_id'],
                                 ring_data['devs'], ring_data['part_shift'])
        return ring_data
Exemple #4
0
def _read_raw_athena(filename):
    """Try to read an Athena project file as plain text, to determine validity.

    The file is first treated as gzip-compressed; failing that, it is read
    as a plain text file.

    :param filename: path to the candidate Athena project file.
    :returns: decoded file content, or None if it could not be read either way.
    """
    # Try gzip first.  Context managers fix the original bug where an
    # exception raised by GzipFile()/open() left `fh` unbound, so the
    # `finally: fh.close()` raised NameError instead of falling back.
    try:
        with GzipFile(filename) as fh:
            return bytes2str(fh.read())
    except Exception:
        pass

    # Fall back to reading as a plain text file.
    try:
        with open(filename, 'r') as fh:
            return bytes2str(fh.read())
    except Exception:
        return None
Exemple #5
0
    def load(cls, filename):
        """
        Load ring data from a file.

        :param filename: Path to a file serialized by the save() method.
        :returns: A RingData instance containing the loaded data.
        """
        gz_file = GzipFile(filename, 'rb')
        # Python 2.6 GzipFile doesn't support BufferedIO
        if hasattr(gz_file, '_checkReadable'):
            gz_file = BufferedReader(gz_file)

        # See if the file is in the new format.  read() returns bytes on
        # Python 3, so compare against a byte literal (b'R1NG' == 'R1NG'
        # on Python 2, where str is bytes).
        magic = gz_file.read(4)
        if magic == b'R1NG':
            # struct.unpack always returns a tuple; the trailing comma
            # unpacks its single element.
            version, = struct.unpack('!H', gz_file.read(2))
            if version == 1:
                ring_data = cls.deserialize_v1(gz_file)
            else:
                raise Exception('Unknown ring format version %d' % version)
        else:
            # Assume old-style pickled ring.  NOTE: unpickling is only safe
            # for trusted ring files.
            gz_file.seek(0)
            ring_data = pickle.load(gz_file)
        # All data has been read by now; close to avoid leaking the handle.
        gz_file.close()

        if not hasattr(ring_data, 'devs'):
            # Plain dict from an old pickle: wrap it into a RingData instance.
            ring_data = RingData(
                ring_data['replica2part2dev_id'],
                ring_data['devs'],
                ring_data['part_shift']
            )
        return ring_data
Exemple #6
0
    def load(cls, filename):
        """
        Load ring data from a file.

        :param filename: Path to a file serialized by the save() method.
        :returns: A RingData instance containing the loaded data.
        """
        gz_file = GzipFile(filename, 'rb')
        # Python 2.6 GzipFile doesn't support BufferedIO
        if hasattr(gz_file, '_checkReadable'):
            gz_file = BufferedReader(gz_file)

        # See if the file is in the new format.  read() returns bytes on
        # Python 3, so compare against a byte literal (b'R1NG' == 'R1NG'
        # on Python 2, where str is bytes).
        magic = gz_file.read(4)
        if magic == b'R1NG':
            version, = struct.unpack('!H', gz_file.read(2))
            if version == 1:
                ring_data = cls.deserialize_v1(gz_file)
            else:
                raise Exception('Unknown ring format version %d' % version)
        else:
            # Assume old-style pickled ring.  NOTE: unpickling is only safe
            # for trusted ring files.
            gz_file.seek(0)
            ring_data = pickle.load(gz_file)
        # All data has been read by now; close to avoid leaking the handle.
        gz_file.close()
        if not hasattr(ring_data, 'devs'):
            # Plain dict from an old pickle: wrap it into a RingData instance.
            ring_data = RingData(ring_data['replica2part2dev_id'],
                                 ring_data['devs'], ring_data['part_shift'])
        return ring_data
Exemple #7
0
    def load(cls, filename, metadata_only=False):
        """
        Load ring data from a file.

        :param filename: Path to a file serialized by the save() method.
        :param bool metadata_only: If True, only load `devs` and `part_shift`.
        :returns: A RingData instance containing the loaded data.
        """
        gz_file = GzipFile(filename, "rb")
        # Python 2.6 GzipFile doesn't support BufferedIO
        if hasattr(gz_file, "_checkReadable"):
            gz_file = BufferedReader(gz_file)

        # See if the file is in the new format.  read() returns bytes on
        # Python 3, so compare against a byte literal (b"R1NG" == "R1NG"
        # on Python 2, where str is bytes).
        magic = gz_file.read(4)
        if magic == b"R1NG":
            format_version, = struct.unpack("!H", gz_file.read(2))
            if format_version == 1:
                ring_data = cls.deserialize_v1(gz_file, metadata_only=metadata_only)
            else:
                raise Exception("Unknown ring format version %d" % format_version)
        else:
            # Assume old-style pickled ring.  NOTE: unpickling is only safe
            # for trusted ring files.
            gz_file.seek(0)
            ring_data = pickle.load(gz_file)
        # All data has been read by now; close to avoid leaking the handle.
        gz_file.close()

        if not hasattr(ring_data, "devs"):
            # Plain dict from an old pickle: wrap it into a RingData instance.
            ring_data = RingData(ring_data["replica2part2dev_id"], ring_data["devs"], ring_data["part_shift"])
        return ring_data
Exemple #8
0
def read_primary_xml(repo_path, primary_xml = os.path.join('repodata', 'primary.xml')):
    """Parse a createrepo primary.xml (local or remote, optionally gzipped)
    and return the set of RPM package descriptions it contains.

    :param repo_path: base path or URL of the repository.
    :param primary_xml: location of primary.xml relative to repo_path.
    :returns: set of RpmInfo entries, one per 'rpm'-type package element.
    :raises ValueError: if the metadata file has an unsupported extension.
    """
    primary_xml = os.path.join(repo_path, primary_xml)
    url = urlparse(primary_xml)
    if url.scheme not in [None, '', 'file']:
        # Remote repository: download the metadata first.
        fdurl = urlopen(primary_xml)
        primary_xml_str = fdurl.read()
        fdurl.close()

        if primary_xml.endswith('.gz'):
            primary_xml_stream = StringIO(primary_xml_str)
            primary_xml_gz = GzipFile(fileobj=primary_xml_stream, mode='rb')
            primary_xml_str = primary_xml_gz.read()
            primary_xml_gz.close()

        pkgdb = minidom.parseString(primary_xml_str)
    else:
        if primary_xml.endswith('.gz'):
            primary_xml_gz = GzipFile(primary_xml, mode='rb')
            primary_xml_str = primary_xml_gz.read()
            primary_xml_gz.close()
            pkgdb = minidom.parseString(primary_xml_str)
        elif primary_xml.endswith('.xml'):
            pkgdb = minidom.parse(primary_xml)
        else:
            # The original fell through here and crashed below with a
            # NameError on `pkgdb`; fail with an explicit error instead.
            raise ValueError('Unsupported metadata file: %s' % primary_xml)

    # Parse packages
    pkgs = set()
    for pkg in pkgdb.getElementsByTagName('package'):
        if pkg.getAttribute('type') != 'rpm':
            continue
        rpm_name = pkg.getElementsByTagName('name')[0].firstChild.data
        rpm_version_obj = pkg.getElementsByTagName('version')[0]
        rpm_version = rpm_version_obj.getAttribute('ver')
        rpm_release = rpm_version_obj.getAttribute('rel')
        rpm_arch = pkg.getElementsByTagName('arch')[0].firstChild.data
        try:
            rpm_format_obj = pkg.getElementsByTagName('format')[0]
        except (AttributeError, IndexError):
            rpm_requires = rpm_provides = set()
        else:
            # requires/provides are optional; missing sections yield empty sets.
            try:
                rpm_requires_obj = rpm_format_obj.getElementsByTagName('rpm:requires')[0]
                rpm_requires = set(r.getAttribute('name') for r in rpm_requires_obj.getElementsByTagName('rpm:entry'))
            except (AttributeError, IndexError):
                rpm_requires = set()
            try:
                rpm_provides_obj = rpm_format_obj.getElementsByTagName('rpm:provides')[0]
                rpm_provides = set(p.getAttribute('name') for p in rpm_provides_obj.getElementsByTagName('rpm:entry'))
            except (AttributeError, IndexError):
                rpm_provides = set()
        rpm_location_obj = pkg.getElementsByTagName('location')[0]
        rpm_path = os.path.join(repo_path, rpm_location_obj.getAttribute('href'))
        pkgs.add(RpmInfo(name = rpm_name,
                         version = rpm_version,
                         release = rpm_release,
                         arch = rpm_arch,
                         is_src = bool(rpm_arch == 'src'),
                         requires = rpm_requires,
                         provides = rpm_provides,
                         path = rpm_path))

    return pkgs
Exemple #9
0
 def _open_gzip (self, fobj):
     try:
         _gz = GzipFile (fileobj=fobj)
         # read will fail if fobj isn't a proper gzip
         _gz.read (1)
         _gz.seek (0)
     except IOError as e:
         _gz = None
     return _gz
Exemple #10
0
def decompress(name, data):
    """Decompress *data* according to the extension of *name*.

    '.gz' is gunzipped, '.bz2' is bunzip2ed, anything else is returned
    unchanged.
    """
    if name.endswith('.gz'):
        return GzipFile(mode='r', fileobj=StringIO(data)).read()
    if name.endswith('.bz2'):
        return BZ2File(mode='r', fileobj=StringIO(data)).read()
    return data
Exemple #11
0
def decompress(name, data):
    """Decompress *data* based on the file extension in *name*.

    Gzip ('.gz') and bzip2 ('.bz2') payloads are expanded; any other
    name returns *data* as-is.
    """
    if name.endswith('.gz'):
        data = GzipFile(mode='r', fileobj=StringIO(data)).read()
    elif name.endswith('.bz2'):
        data = BZ2File(mode='r', fileobj=StringIO(data)).read()
    return data
Exemple #12
0
 def from_url(url, **kwargs):
     """ Create an urlset from an url """
     # NOTE: Python 2 only (print statements, dict.has_key()).
     u = urlopen(url)
     # print "urlset.py: %s" % url
     print u.headers["content-type"]
     # Transparently unwrap gzip-compressed responses before parsing.
     if u.headers.has_key("content-type") and u.headers["content-type"].lower().startswith("application/x-gzip"):
         u = GzipFile(fileobj=StringIO(u.read()))
     elif u.headers["content-type"].lower().startswith("text/html"):
         # NOTE(review): this prints AND consumes the body, so UrlSet below
         # receives an exhausted stream in the text/html case -- confirm intent.
         print u.read()
     return UrlSet(u, url, **kwargs)
Exemple #13
0
def download_etopo1(etopo_path, version='ice'):
    """Download the gzipped ETOPO1 elevation dataset and gunzip it to
    *etopo_path*.

    :param etopo_path: destination path for the uncompressed dataset.
    :param version: ETOPO1 variant forwarded to etopo1_url() (e.g. 'ice').
    :raises URLError: if the download fails.
    """
    # NOTE: Python 2 only (`except URLError, e` syntax; filter() over headers).
    url = etopo1_url(version)
    log.info('%s was not found. Attempting to download from %s.' %
             (etopo_path, url))
    try:
        response = urlopen(url)

        content_length = -1
        blocksize = 2**22        # 4 MiB download chunks
        blocksize_disk = 2**25   # 32 MiB decompression chunks

        info = response.info()
        headers = info.headers
        # Find the Content-Length header among the raw "Name: value" strings.
        content_lengths = filter(lambda x: x.startswith('Content-Length'),
                                 headers)
        if content_lengths:
            content_length = content_lengths[0]
            content_length = int(content_length.strip().split()[1])

        log.info('Downloading %d bytes at %d byte chunks' %
                 (content_length, blocksize))
        log.info('This will take a while. Go enjoy some fresh air.')

        # Spool the compressed download (up to 1 GiB in memory, then disk).
        with SpooledTemporaryFile(2**30) as s:
            bytes = 0
            last_percent = 0
            data = response.read(blocksize)
            while data:
                s.write(data)
                bytes += len(data)
                data = response.read(blocksize)

                # Log progress roughly every 5%.
                percent = float(bytes) / content_length
                if percent > last_percent + 0.05:
                    log.debug('%d / %d = %f' %
                              (bytes, content_length, percent))
                    last_percent = percent
            response.close()
            s.flush()
            s.seek(0)

            # Decompress the spooled gzip stream to the destination file.
            log.debug('Gunzipping file to %s' % etopo_path)
            g = GzipFile(fileobj=s)
            bytes = 0
            with open(etopo_path, 'wb') as f:
                data = g.read(blocksize_disk)
                while data:
                    f.write(data)
                    data = g.read(blocksize_disk)
                    # NOTE(review): this counts the *next* chunk, so the
                    # logged byte total misses the first chunk -- logging
                    # only, data written is complete.
                    bytes += len(data)
                    log.debug('%d written' % bytes)

    except URLError, e:
        log.critical('Download from %s failed.' % url)
        raise e
Exemple #14
0
 def read(self, size=None):
     """Read *size* bytes; with no (or zero) size, read the entire stream.

     Whole-stream reads pull data in self._size chunks and join them via
     an in-memory buffer.
     """
     if size:
         return GzipFile.read(self, size)
     out = StringIO()
     chunk = GzipFile.read(self, self._size)
     while chunk:
         out.write(chunk)
         chunk = GzipFile.read(self, self._size)
     return out.getvalue()
Exemple #15
0
    def from_file(cls, file):
        """Deserialize a vocabulary from a gzip-compressed binary *file*.

        Layout: a uint32 word count, then that many uint32 frequencies,
        then the UTF-8 encoded words, one per line.
        """
        file = GzipFile(mode='rb', fileobj=file)

        # leading uint32: number of entries
        size, = np.frombuffer(file.read(4), np.uint32)
        # per-entry uint32 frequencies
        counts = np.frombuffer(file.read(4 * size), np.uint32).tolist()
        # remainder: newline-separated UTF-8 words
        words = file.read().decode('utf8').splitlines()

        return cls(words, counts)
Exemple #16
0
 def read(self, size=None):
     """Read *size* bytes from the stream, or everything when *size* is falsy.

     The whole-stream path accumulates self._size-byte chunks in a buffer
     and returns the joined result.
     """
     if size:
         return GzipFile.read(self, size)
     buffered = StringIO()
     while True:
         piece = GzipFile.read(self, self._size)
         if not piece:
             break
         buffered.write(piece)
     return buffered.getvalue()
Exemple #17
0
def title(text):
    """
    Retrieve titles from URL in text.

    >>> len(title('no url here'))
    0

    TODO This case should ignore the 404.
    >>> print(title('https://hdhoang.space/404 https://hdhoang.space/')) # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
      ...
    urllib.error.HTTPError: HTTP Error 404: Not Found

    >>> print(title('https://hdhoang.space/luser.html https://hdhoang.space/luser.html'))
    IRC bot / IRC bot

    >>> print(title('http://www.nytimes.com/2016/01/26/business/marvin-minsky-pioneer-in-artificial-intelligence-dies-at-88.html'))
    Marvin Minsky, Pioneer in Artificial Intelligence, Dies at 88 - The New York Times

    >>> print(title('http://www.baomoi.com/bao-nhieu-tan-bot-trung-quoc-da-duoc-nhap-ve-lam-tra-o-long-tea-plus/c/18486151.epi'))
    Bao nhiêu tấn bột Trung Quốc đã được nhập về làm trà Ô long TEA Plus? - GĐ&XH;

    >>> print(title('http://news.zing.vn/chi-tiet-ban-do-cam-duong-dip-29-o-ha-noi-post574142.html'))
    Chi tiết bản đồ cấm đường dịp 2/9 ở Hà Nội - Thời sự - Zing.vn

    >>> print(title('https://www.facebook.com/photo.php?fbid=261863914155282&set=a.261860180822322.1073742015.100009950253866&type=3&theater')) # doctest: +ELLIPSIS
    Vo Thanh Thuy - Vo Thanh Thuy ... | Facebook

    >>> print(title('https://imgur.com/M18GYfw?r https://imgur.com/GUFyoUa?r'))
    Glorious new key cap set for my work keyboard! - Imgur
    """
    # Titles that add no information and are filtered out of the result.
    uninteresting = ["XKCDB: The: The #xkcd Quote Database", "Saturday Morning Breakfast Cereal", "Library Genesis"]
    titles = []
    # Only whitespace-separated tokens that look like URLs are fetched.
    urls = filter(lambda w: w.startswith('http'), text.split())
    for u in urls:
        # Cookie-aware opener; gzip is requested to save bandwidth.
        request = build_opener(HTTPCookieProcessor())
        request.addheaders = [('Accept-Encoding', 'gzip'), ('User-Agent', 'Mozilla/5.0')]
        response = request.open(u)
        if response.info().get('Content-Encoding') == 'gzip':
            if sys.version_info.major == 3:
                # On Python 3, GzipFile can wrap the response stream directly.
                response = GzipFile(fileobj=response)
            else:
                # Python 2 needs the body buffered into a seekable StringIO.
                response = GzipFile(fileobj=StringIO(response.read()))
        # Read only the first 50 kB -- enough to find <title>.
        title = BeautifulSoup(response.read(50000), 'html.parser').title
        response.close()
        # NOTE(review): title.string can be None (e.g. nested markup inside
        # <title>), which would make the `in` checks below raise TypeError
        # despite `title` being truthy -- confirm and guard if needed.
        if (title
            and 'Imgur:' not in title.string
            and title.string not in uninteresting):
            titles.append(title.string.replace('\n', '').strip())
    return ' / '.join(titles)
class Reader:
    """Iterate Snapshot messages from a gzip-compressed snapshot file.

    File layout: a 4-byte little-endian length followed by a serialized
    User message, then a sequence of [4-byte little-endian length,
    Snapshot message] records terminated by a zero length / EOF.
    """

    def __init__(self, file_name):
        self._stream = GzipFile(file_name, 'rb')
        # The stream starts with one length-prefixed User message.
        size = int.from_bytes(self._stream.read(4), 'little')
        self.user = User()
        self.user.ParseFromString(self._stream.read(size))

    def __iter__(self):
        while True:
            length = int.from_bytes(self._stream.read(4), 'little')
            if not length:
                break
            snapshot = Snapshot()
            snapshot.ParseFromString(self._stream.read(length))
            yield snapshot
        # Exhausted (zero length or EOF): release the file handle.
        self._stream.close()
Exemple #19
0
    def read(self, chunk_size=None):
        """
        Reads specified chunk_size or the whole file if chunk_size is None.

        If reading the whole file and the content-encoding is gzip, also
        gunzip the read content.

        If chunk_size is provided, the same chunk_size will be used in all
        further read() calls until the file is reopened or seek() is called.
        """
        # Past EOF, or an explicit zero-byte request: nothing to return.
        if self._pos >= self._get_size() or chunk_size == 0:
            return ""

        if chunk_size is None and self._chunks is None:
            # Whole-file read: fetch body plus metadata in one call and
            # transparently gunzip gzip-encoded content.
            meta, data = self.file.get(include_meta=True)
            if meta.get("content-encoding", None) == "gzip":
                data = GzipFile(mode="rb", fileobj=StringIO(data)).read()
        else:
            if self._chunks is None:
                # First chunked read: obtain the chunk iterator once; it has
                # to be fully consumed before calling get() again.
                self._chunks = self.file.get(chunk_size=chunk_size)
            try:
                data = self._chunks.next()
            except StopIteration:
                # Iterator exhausted -- signal EOF with an empty string.
                data = ""

        self._pos += len(data)
        return data
Exemple #20
0
    def process_request(self, request):
        """Transparently gunzip gzip-encoded request bodies.

        Returns an error HttpResponse when the client is rate-limited, the
        compressed body is too large, or the body is not valid gzip;
        returns None to let the request continue down the middleware chain.
        """
        encoding = request.META.get('HTTP_CONTENT_ENCODING', None)

        if encoding == 'gzip':
            # Decompression is CPU-heavy, so rate-limit by client IP first.
            if is_ratelimited(request, group='gunzip_request_middleware', key='ip', rate='300/1m', increment=True):
                return http.HttpResponse('Rate limit exceeded: too many gzipped request bodies', status=429)

            data = request._stream.read()
            # Bound the *compressed* size to limit decompression-bomb damage.
            if len(data) > settings.GUNZIP_MAX_COMPRESSED_SIZE:
                logger.warning('Compressed request body is too large: %s', request.path,
                    extra = {
                        'status_code': 400,
                        'request': request,
                    }
                )
                return http.HttpResponseBadRequest('Compressed request body is too large')

            try:
                zfile = GzipFile(mode='rb', fileobj=StringIO(data))
                uncompressed = zfile.read()
                # Replace the request stream with the decompressed body and
                # drop the header so downstream code sees a plain request.
                request._stream = LimitedStream(StringIO(uncompressed), len(uncompressed))
                del request.META['HTTP_CONTENT_ENCODING']
            except IOError:
                return http.HttpResponseBadRequest('Invalid content-encoding, could not gunzip')

        return None
Exemple #21
0
def gzip_decode(data):
    """Gunzip *data* and return the decompressed payload."""
    gz = GzipFile(fileobj=StringIO(data))
    payload = gz.read()
    gz.close()
    return payload
Exemple #22
0
def main(path, key, force=False):
    """Decrypt (and gunzip if needed) every target file under *path*.

    Each target's output file is the target path minus its last character.
    Returns the list of output paths that were written or skipped.

    :param path: directory or file to scan via find_targets().
    :param key: xxtea key; falsy skips decryption.
    :param force: when True, existing outputs are reported and skipped.
    """
    result = []
    targets = find_targets(path)
    for target in targets:
        # Output name: target path without its trailing character.
        out = target[:-1]
        # NOTE(review): the message says "use -f to overwrite" but this
        # branch triggers when force IS set -- the condition looks inverted
        # (expected `and not force`); confirm intent.
        if os.path.exists(out) and force:
            click.secho(
                "file is existed, please use `-f` to overwrite it: {}".format(
                    out),
                fg="yellow")
            result.append(out)
            continue

        content = open(target, "rb").read()
        if key:
            content = xxtea.decrypt(content, key)

        # Gzip magic number: transparently decompress compressed payloads.
        if content[:2] == b'\037\213':
            try:
                mock_fp = BytesIO(content)
                gz = GzipFile(fileobj=mock_fp)
                content = gz.read()
            except Exception as e:
                import traceback
                click.secho("ungz fault {} in {}".format(
                    e,
                    traceback.format_tb(sys.exc_info()[2])[-1]),
                            fg="red")

        with open(out, 'wb') as _:
            _.write(content)

        click.secho("decrypt successful: {}".format(out), fg="green")
        result.append(out)
    return result
    def getContent(self, url, data=None, referer=None):
        """Fetch *url* and return the response body, gunzipped when the
        server sent it gzip-encoded.

        :param url: target URL.
        :param data: optional dict of POST fields (urlencoded when given).
        :param referer: optional referer, also used as origin_req_host.
        :raises HTTPError: after more than self.max_retries failed attempts.
        """
        encoded_data = urlencode(data) if data else None
        # if referer is None: url
        default_headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.2.9) Gecko/20100824 Firefox/3.6.9 ( .NET CLR 3.5.30729; .NET4.0E)',
                           'Accept-Language': 'pt-br;q=0.5',
                           'Accept-Charset': 'utf-8;q=0.7,*;q=0.7',
                           'Accept-Encoding': 'gzip',
                           'Connection': 'close',
                           'Cache-Control': 'no-cache',
                           'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                           'Referer': referer}

        req = Request(url, encoded_data, default_headers, origin_req_host=referer)

        # Retry on HTTP errors.  The original had no loop: after a caught
        # HTTPError, `handle` was unbound and the code below raised
        # NameError instead of retrying.
        retries = 0
        while True:
            try:
                handle = self._opener.open(req)
                break
            except HTTPError:
                retries += 1
                if retries > self.max_retries:
                    raise
        if handle.info().get('Content-Encoding') == 'gzip':
            # Transparently gunzip compressed responses.
            data = handle.read()
            buf = StringIO(data)
            f = GzipFile(fileobj=buf)
            response = f.read()
        else:
            response = handle.read()
        # return response.decode('utf-8')
        return response
Exemple #24
0
    def s3_handles(aws_access_key, aws_secret_key, bucket, keys):
        """
    Return handles for S3 objects matching keys in bucket.

    Each matching object is downloaded, gunzipped in memory, and exposed
    as a StringIO handle.

    Parameters
    ----------
    aws_access_key : string
      AWS access key from credentials.
    aws_secret_key : string
      AWS secret key from credentials.
    bucket : string
      S3 bucket to read objects from.
    keys : [string]
      List of keys in bucket to iterate over.

    """
        container = S3Connection(aws_access_key,
                                 aws_secret_key).get_bucket(bucket)
        handles = []
        for prefix in keys:
            for obj in container.list(prefix=prefix):
                raw = obj.get_contents_as_string()
                unzipped = GzipFile(fileobj=StringIO(raw)).read()
                handles.append(StringIO(unzipped))
        return handles
def restRequest(url):
    """Perform a REST GET on *url*, transparently gunzipping the response.

    NOTE: Python 2 only (urllib2, `except ..., ex` syntax, print statement).
    NOTE(review): `result` is never returned in the visible code -- the
    function may be truncated in this dump; confirm a trailing
    `return result` exists in the original.
    """
    printDebugMessage('restRequest', 'Begin', 11)
    printDebugMessage('restRequest', 'url: ' + url, 11)
    # Percent-encode the URL, keeping the characters listed in `safe`.
    url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]")

    try:
        user_agent = getUserAgent()
        http_headers = {'User-Agent': user_agent, 'Accept-Encoding': 'gzip'}
        req = urllib2.Request(url, None, http_headers)
        resp = urllib2.urlopen(req)
        encoding = resp.info().getheader('Content-Encoding')
        result = None
        if encoding == None or encoding == 'identity':
            # Uncompressed response: use the body as-is.
            result = resp.read()
        elif encoding == 'gzip':
            # Gunzip the buffered body in memory.
            result = resp.read()
            printDebugMessage('restRequest', 'result: ' + result, 21)
            gz = GzipFile(fileobj=StringIO(result), mode="r")
            result = gz.read()
        else:
            raise Exception('Unsupported Content-Encoding')
        resp.close()
    except urllib2.HTTPError, ex:
        print ex.read()
        raise
Exemple #26
0
def decompress_gzip(data):
    """Return the decompressed content of the gzip byte string *data*."""
    buf = StringIO(data)
    gz = GzipFile(fileobj=buf, mode="rb")
    return gz.read()
Exemple #27
0
 def open(self):
     request = Request(self.url)
     request.add_header('User-Agent', 'lastfm-lda recommender v.0.0.-1')
     request.add_header('Accept-encoding', 'gzip')
     while True:
         URLLoadListener.num_connections += 1
         response = None
         try:
             response = urlopen(request, timeout=10)
             if response.info().get('Content-Encoding') == 'gzip':
                 f = GzipFile(fileobj=StringIO(response.read()))
                 result = f.read()
                 f.close()
             else:
                 result = response.read()
             break
         except Exception, e:
             if self.retries > 2:
                 if isinstance(e, BadStatusLine):
                     raise Exception(
                         "last.fm server does not respond (%s)" % e)
                 raise e
             self.retries += 1
             print self.url
             print "failed with", e
             print "retry #", self.retries
             print
         finally:
Exemple #28
0
 def process_result_value(self, value, dialect):
     """Decode a stored column value: gunzip then unpickle.

     Falsy (empty/None) values map to the empty string.
     """
     if not value:
         return ''
     unzipped = GzipFile(mode='r', fileobj=StringIO(value)).read()
     return loads(unzipped)
Exemple #29
0
class GzipDecompressStream(object):
    """Stream-style reader that decompresses gzip data pulled from a
    wrapped file-like object, with exact-size read() semantics."""

    def __init__(self, fileobj):
        """
        Create a new instance of a gzip stream from the stream-like input
        :param fileobj: stream-like object to compress or decompress
        """
        self.__buf = Buffer()
        self.__gzip = GzipFile(None, mode='rb', fileobj=fileobj)

    # included for 'with' support but not really a pythonic way of doing it.
    # using with is not required, passing this object is good enough
    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.__gzip.close()
        self.__gzip = None
        self.__buf = None
        return

    def read(self, size=-1):
        """Return exactly *size* decompressed bytes (or everything when
        *size* is negative)."""
        # Top up the internal buffer until it can satisfy the request; the
        # buffer lets read() hand back exactly `size` bytes.
        while size < 0 or len(self.__buf) < size:
            chunk = self.__gzip.read(CHUNK)
            if not chunk:
                break
            self.__buf.write(chunk)
        return self.__buf.read(size)
Exemple #30
0
def fasta2dict(filename, want_dict = 'YES',key_func=None):
    '''
    Fasta.fasta2dict(filename, want_dict = 'YES',key_func=None)
    ----------------------------------------------------------
    Very fast Fasta Loader.  Used internally.  You should be using
    Fasta.load() or Fasta.seqs() instead.

    NOTE: Python 2 only (print statement).  '.gz' files are read through
    GzipFile; keys default to the first whitespace-separated token of the
    FASTA header unless key_func is given.
    '''
    D = {}
    # Choose the reader based on the filename extension.
    if filename[-3:] == '.gz': FH = GzipFile(filename)
    else:                      FH = open(filename,'r')
    # Records are separated by '>'; the first split element is pre-header junk.
    chunks = FH.read().split('>')
    for chunk in chunks[1:]:
        lines  = chunk.split('\n')
        raw_id = lines[0]
        seq    = ''.join(lines[1:])
        try:
            if not key_func:  key = raw_id.split()[0]
            else:            key = key_func(raw_id)
        except:
            # Unparseable header: report it and abort the whole load.
            print raw_id
            sys.stdout.flush()
            sys.exit(1)
        D[key] = seq
    if want_dict: return D
    else:         return D.values()
Exemple #31
0
def gunzip(data, max_length=0):
    '''Gunzip the given data and return as much data as possible.
    This is resilient to CRC checksum errors.

    :param data: gzip-compressed byte string.
    :param max_length: if nonzero, raise DecompressSizeError once the
        output exceeds this many bytes (decompression-bomb guard).

    NOTE: relies on `f.extrabuf`, an internal attribute of the Python 2
    GzipFile implementation (removed in Python 3) -- not portable.
    '''
    f = GzipFile(fileobj=StringIO(data))
    output = ''
    chunk = '.'
    while chunk:
        try:
            chunk = f.read(8196)
            output += chunk
            if max_length and len(output) > max_length:
                raise DecompressSizeError('Object exceeded %s bytes' %
                                          max_length)
        except (IOError, EOFError, struct.error):
            # complete only if there is some data, otherwise re-raise
            # see issue 87 about catching struct.error
            # some pages are quite small so output is '' and f.extrabuf
            # contains the whole page content
            if output or f.extrabuf:
                output += f.extrabuf
                break
            else:
                raise
    return output
Exemple #32
0
    def read(self, chunk_size=None):
        """
        Reads specified chunk_size or the whole file if chunk_size is None.

        If reading the whole file and the content-encoding is gzip, also
        gunzip the read content.

        If chunk_size is provided, the same chunk_size will be used in all
        further read() calls until the file is reopened or seek() is called.
        """
        # At/after EOF, or a zero-byte request: return the empty string.
        if self._pos >= self._get_size() or chunk_size == 0:
            return ""

        if chunk_size is None and self._chunks is None:
            # Whole-file path: one get() call returns metadata + body, and
            # gzip-encoded bodies are decompressed transparently.
            meta, data = self.file.get(include_meta=True)
            if meta.get("content-encoding", None) == "gzip":
                data = GzipFile(mode="rb", fileobj=StringIO(data)).read()
        else:
            if self._chunks is None:
                # First chunked read: grab the chunk iterator; it must be
                # fully drained before get() may be called again.
                self._chunks = self.file.get(chunk_size=chunk_size)
            try:
                data = self._chunks.next()
            except StopIteration:
                # No chunks left: empty string marks EOF.
                data = ""

        self._pos += len(data)
        return data
Exemple #33
0
def DecodeProcFile(proc_file):
  """Return the content of an (optionally compressed) proc dump.

  *proc_file* is either a path to the dump (heuristically: any string
  shorter than 256 characters is treated as a filename) or the dump
  content itself.  Content without a 'Subsystem Id:' marker is assumed
  to be gzip- or bzip2-compressed and is decompressed when possible;
  otherwise it is returned unchanged.
  """
  if len(proc_file) < 256:
    # Short argument: treat it as a filename and read up to 1 MiB.
    # (The original read/closed `fd` outside this branch, raising
    # NameError whenever content longer than 255 chars was passed in.)
    fd = open(proc_file)
    try:
      proc_file = fd.read(1024*1024)
    finally:
      fd.close()
  if proc_file.find('Subsystem Id:') < 0:
      p = None
      # Try gzip first; import/decode failures just fall through.
      try:
        from gzip import GzipFile
        from StringIO import StringIO
        s = StringIO(proc_file)
        gz = GzipFile(mode='r', fileobj=s)
        p = gz.read(1024*1024)
        gz.close()
      except Exception:
        pass
      if p is None:
        # Fall back to bzip2.
        try:
          from bz2 import decompress
          p = decompress(proc_file)
        except Exception:
          pass
      if p is not None:
        proc_file = p
  return proc_file
Exemple #34
0
 def handleResponse(self, response):
     """Deliver a finished HTTP response to the factory, gunzipping the
     body when self.decode is set, then drop the connection."""
     if self.quietLoss:
         return
     if self.failed:
         self.factory.noPage(
             failure.Failure(
                 error.Error(
                     self.status, self.message, response)))
     elif self.length != None and self.length != 0:
         # Bytes were still expected: treat as a partial download.
         self.factory.noPage(failure.Failure(
             client.PartialDownloadError(self.status, self.message, response)))
     else:
         if self.decode:
             # Buffer the body and gunzip it in memory.
             s = StringIO()
             s.write(response)
             # NOTE(review): seek(-1) from the stream start looks wrong --
             # seek(0) (rewind before reading) is almost certainly the
             # intent; confirm against the original project.
             s.seek(-1)
             g = GzipFile(fileobj=s, mode='rb')
             try:
                 response = g.read()
             except IOError:
                 # Not valid gzip: surface it as a partial download.
                 self.factory.noPage(failure.Failure(
                     client.PartialDownloadError(self.status, self.message, response)))
                 self.transport.loseConnection()
                 return
             g.close()
         self.factory.page(response)
     # server might be stupid and not close connection.
     self.transport.loseConnection()
Exemple #35
0
def LoadGuide():
    """Populate the global GUIDE dict with programme entries parsed from
    the XMLTV source configured in Prefs['xmltv'].

    The source may be a remote URL (optionally gzip-compressed) or a
    bundled resource.  Always returns None; results are stored in GUIDE,
    keyed by channel id, then by a running programme counter.
    """
    if Prefs['xmltv'].startswith('http://') or Prefs['xmltv'].startswith('https://'):
        # Plex can't handle compressed files, using standard Python methods instead
        if Prefs['xmltv'].endswith('.gz') or Prefs['xmltv'].endswith('.gz?raw=1'):
            f = BytesIO(urlopen(Prefs['xmltv']).read())
            try:
                # Close the GzipFile once read (the original leaked it).
                g = GzipFile(fileobj=f)
                try:
                    xmltv = g.read()
                finally:
                    g.close()
            except Exception:
                Log.Error('Provided file %s is not a valid GZIP file' % Prefs['xmltv'])
                xmltv = None
        else:
            xmltv = HTTP.Request(Prefs['xmltv']).content
    else:
        # Local compressed files are not supported at the moment
        xmltv = Resource.Load(Prefs['xmltv'], binary = True)
    if xmltv is not None:
        try:
            root = xml.etree.ElementTree.fromstring(xmltv)
        except Exception:
            Log.Error('Provided file %s is not a valid XML file' % Prefs['xmltv'])
            root = None
        if root is not None:
            count = 0
            for programme in root.findall("./programme"):
                channel = programme.get('channel')
                start = datetime_from_utc_to_local(programme.get('start'))
                stop = datetime_from_utc_to_local(programme.get('stop'))
                title = programme.find('title').text
                count += 1
                item = {'start': start, 'stop': stop, 'title': title, 'order': count}
                GUIDE.setdefault(channel, {})[count] = item
    return None
Exemple #36
0
def DecodeProcFile(proc_file):
  """Return the text of an ALSA proc file.

  ``proc_file`` may be either the file contents themselves or, when it is
  short enough to look like a path (< 256 chars), the name of a file to
  read.  Compressed contents (gzip, then bzip2) are transparently
  decompressed; on any decoding failure the data is returned unchanged.
  """
  # Short arguments are treated as a filename; long ones are already data.
  # (The original read from 'fd' unconditionally, crashing with NameError
  # when the argument was data, because 'fd' was only bound inside the if.)
  if len(proc_file) < 256:
    fd = open(proc_file)
    try:
      proc_file = fd.read(1024 * 1024)
    finally:
      fd.close()
  if proc_file.find('Subsystem Id:') < 0:
      # Does not look like plain proc text -- try gzip, then bzip2.
      p = None
      try:
        from gzip import GzipFile
        from StringIO import StringIO
        s = StringIO(proc_file)
        gz = GzipFile(mode='r', fileobj=s)
        p = gz.read(1024 * 1024)
        gz.close()
      except Exception:
        pass
      if p is None:
        try:
          from bz2 import decompress
          p = decompress(proc_file)
        except Exception:
          pass
      if p is not None:
        proc_file = p
  return proc_file
Exemple #37
0
def ungzip_stream(stream):
    """Returns the ungzipped stream."""
    try:
        gzipped_stream = GzipFile(fileobj=StringIO(stream))
        return gzipped_stream.read()
    except IOError:
        return stream
Exemple #38
0
 def decode_content(self, data):
     """Gunzip the request body when the Content-Encoding header says gzip;
     otherwise return *data* unchanged."""
     encoding = web.ctx.env.get('HTTP_CONTENT_ENCODING')
     if encoding != 'gzip':
         return data
     # Wrap the raw body so GzipFile can stream-decompress it.
     return GzipFile(fileobj=StringIO(data)).read()
class ZipLengthReader(LengthReader):
    """
    Tries to read the body as gzip according to length. In case that fails, it
    disregards the Content-Length and reads it normally.
    """
    def __init__(self, length, text):
        # TODO test if this works with gzipped responses in WARC
        # Try to inflate the first `length` bytes; on success the effective
        # body length (passed to the base class) becomes the *decompressed*
        # size, and the inflated text is kept for feed().
        try:
            self._file = GzipFile(fileobj=BytesIO(text[:length]), mode='rb')
            self._text = self._file.read()
            super(ZipLengthReader, self).__init__(len(self._text))
        except IOError:
            # Not gzip data: fall back to the raw text and its full length.
            self._file = None
            super(ZipLengthReader, self).__init__(len(text))

    def __del__(self):
        # NOTE(review): cleanup in __del__ runs at an unpredictable time (if
        # at all); an explicit close() or context manager would be safer.
        if self._file:
            self._file.close()

    def feed(self, parser, text):
        """Parse the body according to remaining length"""
        # NOTE(review): self.remaining is presumably maintained by the
        # LengthReader base class -- confirm against its definition.
        if self.remaining > 0:
            # When the body was gzipped, parse the decompressed text that
            # was buffered in __init__ instead of the raw input.
            if self._file:
                text = self._text
            self.remaining, text = parser.feed_length(text, self.remaining)
        if self.remaining <= 0:
            parser.mode = 'end'
        return text
Exemple #40
0
    def scrape(self, url):
        """Fetch *url* with a random User-Agent and return the decoded page.

        Returns an error string (not an exception) when the server answers
        with an HTTP status >= 400 or the body cannot be decoded.
        """
        from io import BytesIO  # local: needed for the gzip branch below

        global h
        # Pick a random user agent string for this request.
        h = choice(self.user_agent_list)
        req = r.Request(url, headers={'User-Agent': h})
        # Get the response
        res = r.urlopen(req)

        # Check for an error (HTTP status code >= 400)
        if (int(res.getcode()) >= 400):
            return "%s : Error encountered, : %s" % (url, res.getcode())

        if (("content-encoding" in res.info().keys())
                and (res.info()["content-encoding"] == "gzip")):
            # BUG FIX: GzipFile needs a file-like object; the original passed
            # the raw bytes from res.read() directly as fileobj.
            body = GzipFile(fileobj=BytesIO(res.read())).read()
        else:
            body = res.read()

        # Try utf-8 first, then ascii.  (In the original, the second decode
        # was unreachable dead code after the first `return`.)
        for codec in ('utf-8', 'ascii'):
            try:
                return body.decode(codec)
            except UnicodeDecodeError:
                continue
        # Notify the user on error
        print("%s : Encountered encoding error." % (url))
        return "%s : Encountered encoding error." % (url)
Exemple #41
0
 def unzip(self, html_data):
     """Best-effort gunzip of *html_data*; the input is returned unchanged
     whenever decompression fails for any reason."""
     try:
         buf = StringIO(html_data)
         html_data = GzipFile(fileobj=buf, mode="r").read()
     except:
         # Not gzip (or unreadable) -- keep the original payload.
         pass
     return html_data
Exemple #42
0
    def handle_stackexchange_login(self, data):
        """Complete the StackExchange OAuth flow and render the /me payload
        (plus the obtained credential details) as an HTML page."""
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.log_message(self.path)
        self.end_headers()

        client = Client(StackExchange, get_config())
        credential = client.flow.authorization_received(data)

        payload = client.request("/me", body=urlencode({
            "site": "stackoverflow"
        }))

        self.wfile.write("<!DOCTYPE html>")
        self.wfile.write("<head><meta charset=\"utf-8\"/></head><body>")
        self.wfile.write("Access token: %s<br>" % credential.access_token)
        self.wfile.write("Type: %s<br>" % credential.token_type)
        self.wfile.write("Expires in: %d<br>" % credential.expires_in)

        # StackExchange gzips every response body, so inflate it before
        # echoing it into the page.
        unzipper = GzipFile(fileobj=StringIO(payload))
        body = unzipper.read()
        unzipper.close()
        self.wfile.write(body)
        self.wfile.write("</body></html>")
Exemple #43
0
def LoadGuide():
    """Populate the global GUIDE dict with programme entries parsed from
    the XMLTV source configured in Prefs['xmltv'].

    The source may be a remote URL (optionally gzip-compressed) or a
    bundled resource.  Always returns None; results are stored in GUIDE,
    keyed by channel id, then by a running programme counter.
    """
    if Prefs['xmltv'].startswith('http://') or Prefs['xmltv'].startswith('https://'):
        # Plex can't handle compressed files, using standard Python methods instead
        if Prefs['xmltv'].endswith('.gz') or Prefs['xmltv'].endswith('.gz?raw=1'):
            f = BytesIO(urlopen(Prefs['xmltv']).read())
            try:
                # Close the GzipFile once read (the original leaked it).
                g = GzipFile(fileobj=f)
                try:
                    xmltv = g.read()
                finally:
                    g.close()
            except Exception:
                Log.Error('Provided file %s is not a valid GZIP file' % Prefs['xmltv'])
                xmltv = None
        else:
            xmltv = HTTP.Request(Prefs['xmltv']).content
    else:
        # Local compressed files are not supported at the moment
        xmltv = Resource.Load(Prefs['xmltv'], binary = True)
    if xmltv is not None:
        try:
            root = xml.etree.ElementTree.fromstring(xmltv)
        except Exception:
            Log.Error('Provided file %s is not a valid XML file' % Prefs['xmltv'])
            root = None
        if root is not None:
            count = 0
            for programme in root.findall("./programme"):
                channel = programme.get('channel')
                start = datetime_from_utc_to_local(programme.get('start'))
                stop = datetime_from_utc_to_local(programme.get('stop'))
                title = programme.find('title').text
                count += 1
                item = {'start': start, 'stop': stop, 'title': title, 'order': count}
                GUIDE.setdefault(channel, {})[count] = item
    return None
Exemple #44
0
 def try_decompress(data):
     # Best-effort gunzip: return the fully decompressed payload, or -- for
     # a truncated gzip stream -- whatever was inflated before the failure.
     try:
         gf = GzipFile(fileobj=StringIO(data), mode="r")
         decompressed_data = gf.read()
     except:
         # NOTE(review): GzipFile.extrabuf is a Python 2 internal attribute
         # holding the bytes decoded so far; it does not exist on Python 3.
         # Also, if the GzipFile constructor itself raised, 'gf' is unbound
         # here and this line raises NameError -- confirm intended behavior.
         decompressed_data = gf.extrabuf
     return decompressed_data
Exemple #45
0
    def process_request(self, request):
        """Transparently gunzip a gzip-encoded request body.

        Rate-limits gzipped bodies per client IP, rejects compressed
        payloads larger than settings.GUNZIP_MAX_COMPRESSED_SIZE, and on
        success replaces request._stream with the uncompressed data while
        removing the Content-Encoding header.  Returns an HttpResponse on
        error, or None to continue normal processing.
        """
        encoding = request.META.get('HTTP_CONTENT_ENCODING', None)

        if encoding == 'gzip':
            # Decompression is relatively expensive, so cap the rate per
            # client IP before doing any work.
            if is_ratelimited(request,
                              group='gunzip_request_middleware',
                              key='ip',
                              rate='300/1m',
                              increment=True):
                return http.HttpResponse(
                    'Rate limit exceeded: too many gzipped request bodies',
                    status=429)

            data = request._stream.read()
            # NOTE(review): only the *compressed* size is bounded here; a
            # "zip bomb" could still inflate to something far larger in the
            # read() below -- consider capping the decompressed size too.
            if len(data) > settings.GUNZIP_MAX_COMPRESSED_SIZE:
                logger.warning('Compressed request body is too large: %s',
                               request.path,
                               extra={
                                   'status_code': 400,
                                   'request': request,
                               })
                return http.HttpResponseBadRequest(
                    'Compressed request body is too large')

            try:
                zfile = GzipFile(mode='rb', fileobj=StringIO(data))
                uncompressed = zfile.read()
                request._stream = LimitedStream(StringIO(uncompressed),
                                                len(uncompressed))
                # The body is now plain; drop the header so downstream code
                # does not try to decode it a second time.
                del request.META['HTTP_CONTENT_ENCODING']
            except IOError:
                return http.HttpResponseBadRequest(
                    'Invalid content-encoding, could not gunzip')

        return None
Exemple #46
0
    def extract_nodes(self):
        """Second XOR+gzip loop to extract nodes from decoded config
        @return string of IPs (the <nodes> payload) or None on failure
        """
        # Don't XOR first chunk of random bytes used to mess up file magic
        self.data = self.xor("\x55\xAA")[0x80:]
        try:
            gzfile = GzipFile(fileobj=StringIO(self.data))
            decoded = gzfile.read()
            gzfile.close()

        except IOError:
            log.warning("DridexDecode_v1: Unable to decode <nodes> element: "
                        "data is not gzip")
            return None

        if not self.is_printable(decoded):
            log.warning("DridexDecode_v1: Unable to decode <nodes> element: "
                        "data is not valid")
            return None

        # Reuse the match object instead of running the regex a second time
        # (the original called self.NODES.search(decoded) twice).
        match = self.NODES.search(decoded)
        if match:
            return match.group(1)
        log.warning("DridexDecode_v1: Unable to decode <nodes> element: "
                    "<nodes> not found")
        return None
Exemple #47
0
def httxdecompress(response):
    '''
    Decompression of the body from response.

    The supported compression types are gzip, bzip2 and deflate

    @param response: the response from the http server containing the body
    @type response: L{HttxResponse}
    '''

    decompmethod = response.getheader('content-encoding')

    if not decompmethod:
        # Body is not compressed; nothing to do
        return

    if decompmethod == 'gzip':
        # Close the GzipFile once read (the original leaked it)
        gzipper = GzipFile(fileobj=response.bodyfile)
        try:
            response.body = gzipper.read()
        finally:
            gzipper.close()

    elif decompmethod == 'deflate':
        try:
            response.body = zlibdecompress(response.body)
        except zliberror:
            # Many web sites fail to send the first bytes of the header
            # possibly it is a header-stripped gzip file
            response.body = zlibdecompress(response.body, -zlibMAX_WBITS)

    elif decompmethod == 'bzip2':
        response.body = bz2decompress(response.body)
Exemple #48
0
def get_athena_version(fname):
    """Return (ftype, parent, version) parsed from an Athena project file.

    ftype is 'gzip' or 'ascii' depending on how the file is stored; parent
    and version come from the 'Athena project file' header line, and are
    'unknown' when no such line is found (the original raised NameError
    for 'parent' in that case).
    """
    ftype = 'ascii'
    text = None
    try:
        # GzipFile opens non-gzip files fine and only fails on read().
        fh = GzipFile(fname)
        try:
            text = fh.read()
        finally:
            fh.close()
        ftype = 'gzip'
    except Exception:
        text = None

    if text is None:
        # Fall back to plain text; close the handle (the original leaked it).
        with open(fname, 'r') as fh:
            text = fh.read()

    if isinstance(text, bytes):
        text = text.decode('utf-8')
    version = 'unknown'
    parent = 'unknown'
    for line in text[:200].split('\n'):
        line = line.lower().replace('#', '').replace('--', '').strip()
        if 'athena project file' in line:
            line = line.replace('version', '')
            line = line.replace('_____header1', '')
            line = line.replace('athena project file', '')
            words = [a.strip() for a in line.split()]
            version = words.pop().replace('"', "").replace(',', "")
            parent = words.pop()
    return (ftype, parent, version)
def _get_avg_views(site, article):
    url = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
           "{0}.{1}/all-access/user/{2}/daily/{3}/{4}")
    days = 30
    slug = quote(article, safe="")
    start = (datetime.utcnow() - timedelta(days=days)).strftime("%Y%m%d")
    end = datetime.utcnow().strftime("%Y%m%d")
    query = url.format(site.lang, site.project, slug, start, end)

    try:
        response = site._opener.open(query)  # We're terrible
    except URLError:
        return None

    result = response.read()
    if response.headers.get("Content-Encoding") == "gzip":
        stream = StringIO(result)
        gzipper = GzipFile(fileobj=stream)
        result = gzipper.read()

    try:
        res = loads(result)
    except ValueError:
        return None

    if "items" not in res:
        return None
    total_views = sum(item["views"] for item in res["items"])
    return total_views / (float(days) * 24 * 60)
 def open(self):
   request = Request(self.url)
   request.add_header('User-Agent','lastfm-lda recommender v.0.0.-1')
   request.add_header('Accept-encoding', 'gzip')
   while True:
     URLLoadListener.num_connections+=1
     response = None
     try:
       response = urlopen(request,timeout=10)
       if response.info().get('Content-Encoding') == 'gzip':
         f = GzipFile(fileobj=StringIO(response.read()))
         result = f.read()
         f.close()
       else:
         result = response.read()
       break
     except Exception, e:
       if self.retries>2: 
         if isinstance(e, BadStatusLine): raise Exception("last.fm server does not respond (%s)" % e)
         raise e
       self.retries+=1
       print self.url
       print "failed with", e
       print "retry #",self.retries
       print
     finally:
Exemple #51
0
def httxdecompress(response):
    '''
    Decompression of the body from response.

    The supported compression types are gzip, bzip2 and deflate

    @param response: the response from the http server containing the body
    @type response: L{HttxResponse}
    '''

    decompmethod = response.getheader('content-encoding')

    if not decompmethod:
        # Body is not compressed; nothing to do
        return

    if decompmethod == 'gzip':
        # Close the GzipFile once read (the original leaked it)
        gzipper = GzipFile(fileobj=response.bodyfile)
        try:
            response.body = gzipper.read()
        finally:
            gzipper.close()

    elif decompmethod == 'deflate':
        try:
            response.body = zlibdecompress(response.body)
        except zliberror:
            # Many web sites fail to send the first bytes of the header
            # possibly it is a header-stripped gzip file
            response.body = zlibdecompress(response.body, -zlibMAX_WBITS)

    elif decompmethod == 'bzip2':
        response.body = bz2decompress(response.body)
Exemple #52
0
 def GET(self):
     """Fetch a remote tar.gz archive via HTTPS and return one member's
     content and size as a JSON string.

     The archive URL and member name come from the request parameters.
     A member that is itself gzip-compressed is inflated before being
     returned.  Connection failures become an HTTP 503.
     """
     try:
         pyDict = {}
         data = param_input()
         # NOTE(review): verify=False disables TLS certificate checking --
         # confirm this is intentional for the internal cert setup.
         response = get(str(data.file_location),
                        cert=config_get('webui', 'usercert'),
                        verify=False)
         if not response.ok:
             response.raise_for_status()
         cont = response.content
         file_like_object = BytesIO(cont)
         # NOTE(review): 'open' here is presumably tarfile.open imported
         # under that name at module level -- confirm.
         tar = open(mode='r:gz', fileobj=file_like_object)
         for member in tar.getmembers():
             if member.name == str(data.file_name):
                 try:
                     f = tar.extractfile(member)
                     pyDict['content'] = f.read(16000000)
                     pyDict['size'] = f.tell()
                     jsonResponse = dumps(pyDict)
                     tar.close()
                     return jsonResponse
                 except UnicodeDecodeError:
                     # Binary member: re-extract it and gunzip it instead.
                     f = tar.extractfile(member)
                     out = GzipFile(fileobj=f)
                     pyDict['content'] = out.read(16000000)
                     pyDict['size'] = out.tell()
                     jsonResponse = dumps(pyDict)
                     tar.close()
                     return jsonResponse
                 # NOTE(review): unreachable -- both branches return above.
                 return "ok"
     except ConnectionError, err:
         raise generate_http_error(503, str(type(err)), str(err))
Exemple #53
0
 def from_url(url, **kwargs):
     """ Create an urlset from an url

     Fetches *url* and, when the server labels the payload as
     application/x-gzip, wraps it in a GzipFile before handing it to
     UrlSet.
     """
     u = urlopen(url)
     # Use headers.get() instead of the long-deprecated dict.has_key()
     # (removed in Python 3); behaviour is unchanged on Python 2.
     if u.headers.get("content-type", "").lower() == "application/x-gzip":
         u = GzipFile(fileobj=StringIO(u.read()))
     return UrlSet(u, url, **kwargs)
Exemple #54
0
 def __init__(self, data):
     """Materialize base64-encoded, gzip-compressed *data* into a temp file.

     The temporary file's path is stored in self.name; the caller is
     responsible for removing the file when done.
     """
     # io.BytesIO works on both Python 2 and 3 and is the correct buffer
     # type for the binary gzip payload (StringIO is text-only on py3).
     from io import BytesIO
     gzd = GzipFile(mode='r', fileobj=BytesIO(b64decode(data)))
     # Decompress *before* creating the temp file so a corrupt payload
     # cannot leak the file descriptor (the original opened mkstemp first).
     try:
         payload = gzd.read()
     finally:
         gzd.close()
     fd, fname = tempfile.mkstemp()
     try:
         os.write(fd, payload)
     finally:
         os.close(fd)
     self.name = fname
def fasta2dict(filename, want_dict='YES', key_func=None):
    '''
    Fasta.fasta2dict(filename, want_dict = 'YES',key_func=None)
    ----------------------------------------------------------
    Very fast Fasta Loader.  Used internally.  You should be using
    Fasta.load() or Fasta.seqs() instead.

    Reads plain or gzip-compressed ('.gz') FASTA files.  Keys default to
    the first whitespace-delimited token of each header line, or to
    key_func(header) when given.  Returns a dict of id -> sequence, or
    just the sequences when want_dict is falsy.  Exits the process when a
    key cannot be derived from a header.
    '''
    D = {}
    # Pick the reader from the extension and close it once consumed
    # (the original leaked the file handle).
    if filename[-3:] == '.gz':
        FH = GzipFile(filename)
    else:
        FH = open(filename, 'r')
    try:
        chunks = FH.read().split('>')
    finally:
        FH.close()
    # chunks[0] is whatever precedes the first '>' header; skip it.
    for chunk in chunks[1:]:
        lines = chunk.split('\n')
        raw_id = lines[0]
        seq = ''.join(lines[1:])
        try:
            key = key_func(raw_id) if key_func else raw_id.split()[0]
        except Exception:
            # print() works on Python 2 and 3 alike (the original used the
            # Python-2-only print statement).
            print(raw_id)
            sys.stdout.flush()
            sys.exit(1)
        D[key] = seq
    if want_dict:
        return D
    return D.values()
Exemple #56
0
    def Request(self, url, data=None, headers=None):
        """
        send a request using specified url
        params:
            url     : url you want to send
            data    : additional post data
            headers : headers you want to attach, eg. {'referer' : 'http : //www.baiud.com'}
        """
        # BUG FIX: the original used a mutable default (headers={}) and then
        # mutated it below, so injected headers leaked between calls.
        if headers is None:
            headers = {}
        #setup request info
        if url is None or url == '':
            raise HttpWrapperException("url can't be empty!")
        if 'user-agent' not in headers:
            headers['user-agent'] = 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16'
        #auto add gzip capabilities
        if 'Accept-Encoding' not in headers:
            headers['Accept-Encoding'] = 'gzip'
        self.__opener.addheaders = headers.items()

        try:
            if data is not None:
                req = self.__opener.open(url, data=urllib.urlencode(data))
            else:
                req = self.__opener.open(url)

            #check if gzip encoding
            if req.headers.get("content-encoding") == "gzip":
                gz = GzipFile(fileobj=StringIO(req.read()))
                resData = HttpWrapperResponseData(gz.read(), req.geturl(), req.info().dict, req.getcode())
            else:
                resData = HttpWrapperResponseData(req.read(), req.geturl(), req.info().dict, req.getcode())
            req.close()

            return resData
        except urllib2.HTTPError as e:
            # 'as e' syntax is valid on Python 2.6+ and Python 3 (the
            # original's 'except X,e' form is Python-2-only).
            return HttpWrapperResponseData(e.fp.read(), '', e.headers, e.code)
Exemple #57
0
  def process(self, wealth, imported_file, account=None):
    gzip_file = GzipFile(fileobj=imported_file.file)
    decompressed = gzip_file.read()
    parser = make_parser()
    model = {
        'accounts': {},
        'categories': {},
        'currency': [],
        'transactions': [],
        'category_splits': [],
        'account_splits': [],
        'wealth': wealth }
    handler = KMYXmlHandler(model)
    parser.setContentHandler(handler)
    parseString(decompressed, handler)

    accounts = model['accounts']
    categories = self.__build_category_tree(model['categories'])
    transactions = model['transactions']
    account_splits = model['account_splits']
    category_splits = model['category_splits']

    # if main currencies differ, re-calculate
    if model['currency'] != model['wealth'].currency:
      exchange_rate = get_rate(model['currency'], model['wealth'].currency)
      for split in category_splits:
        split.amount *= exchange_rate

    self.accounts = accounts.values()
    self.categories = categories.values()
    self.transactions = [transaction for transaction in transactions if transaction.date]
    self.category_splits = [split for split in category_splits if split.category ]
    self.account_splits = [split for split in account_splits if split.account ]
    self.currency = model['currency']