Example #1
class DiscoZipFile(ZipFile, object):
    def __init__(self):
        self.buffer = BytesIO()
        super(DiscoZipFile, self).__init__(self.buffer, 'w', ZIP_DEFLATED)

    def writepath(self, pathname, exclude=()):
        for file in files(pathname):
            name, ext = os.path.splitext(file)
            if ext not in exclude:
                self.write(file, file)

    def writemodule(self, module, arcname=None):
        if isinstance(module, basestring):
            module = __import__(module)
        self.write(getsourcefile(module), arcname=arcname)

    def writesource(self, object):
        self.writepath(getsourcefile(getmodule(object)))

    def dump(self, handle):
        handle.write(self.dumps())

    def dumps(self):
        self.buffer.seek(0)
        return self.buffer.read()
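The class above builds a zip archive entirely in memory on top of a BytesIO buffer. A minimal, standard-library-only sketch of the same pattern (bundle_source is a hypothetical helper, not part of Disco, which additionally relies on its files() utility and inspect's getsourcefile/getmodule):

# Stdlib-only sketch of the in-memory zip pattern used by DiscoZipFile;
# bundle_source is an illustrative name, not a Disco API.
from io import BytesIO
from inspect import getsourcefile
from zipfile import ZipFile, ZIP_DEFLATED

def bundle_source(module):
    # Write the module's source file into an in-memory zip and return the bytes.
    buffer = BytesIO()
    with ZipFile(buffer, 'w', ZIP_DEFLATED) as zipped:
        zipped.write(getsourcefile(module), arcname=module.__name__ + '.py')
    buffer.seek(0)
    return buffer.read()

import json
print(len(bundle_source(json)))  # size of the zipped json module source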
Example #2
 def setUp(self):
     self.ddfs.push('disco:test:blobs', [(BytesIO(b'datablob'), 'blobdata')])
     self.ddfs.push('disco:test:blobs', [(BytesIO(b'datablob2'), 'blobdata2')])
     self.ddfs.push('disco:test:emptyblob', [(BytesIO(b''), 'empty')])
     self.ddfs.tag('disco:test:tag', [['urls']])
     self.ddfs.tag('disco:test:metatag',
                   [['tag://disco:test:tag'], ['tag://disco:test:metatag']])
Example #3
class DiscoZipFile(ZipFile, object):
    def __init__(self):
        self.buffer = BytesIO()
        super(DiscoZipFile, self).__init__(self.buffer, 'w', ZIP_DEFLATED)

    def writepath(self, pathname, exclude=()):
        for file in files(pathname):
            name, ext = os.path.splitext(file)
            if ext not in exclude:
                self.write(file, file)

    def writemodule(self, module, arcname=None):
        if isinstance(module, basestring):
            module = __import__(module)
        self.write(getsourcefile(module), arcname=arcname)

    def writesource(self, object):
        self.writepath(getsourcefile(getmodule(object)))

    def dump(self, handle):
        handle.write(self.dumps())

    def dumps(self):
        self.buffer.seek(0)
        return self.buffer.read()
Example #4
 def read(self, size=-1):
     buf = BytesIO()
     while size:
         bytes = self._read_chunk(size if size > 0 else CHUNK_SIZE)
         if not bytes:
             break
         size -= len(bytes)
         buf.write(bytes)
     return buf.getvalue()
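Note that a negative size means read to end of stream: size only ever decreases inside the loop, so the while size: test stays truthy for negative values and the loop ends only when _read_chunk returns nothing. A self-contained sketch of that contract (ChunkedSource and the tiny CHUNK_SIZE are illustrative assumptions, not Disco's real names):

from io import BytesIO

CHUNK_SIZE = 4  # deliberately tiny so several chunks are needed

class ChunkedSource(object):
    # Minimal stand-in for an object exposing the _read_chunk interface.
    def __init__(self, data):
        self._source = BytesIO(data)

    def _read_chunk(self, n):
        return self._source.read(n)

    def read(self, size=-1):
        buf = BytesIO()
        while size:
            bytes = self._read_chunk(size if size > 0 else CHUNK_SIZE)
            if not bytes:
                break
            size -= len(bytes)
            buf.write(bytes)
        return buf.getvalue()

print(ChunkedSource(b'0123456789').read())   # b'0123456789' (size=-1 reads to EOF)
print(ChunkedSource(b'0123456789').read(5))  # b'01234'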
Example #5
 def test_push(self):
     self.ddfs.push('disco:test:blobs', [(BytesIO(b'blobdata'), 'blobdata')])
     self.assert_(self.ddfs.exists('disco:test:blobs'))
     self.ddfs.push('tag://disco:test:blobs2', [(BytesIO(b'blobdata'), 'blobdata')])
     self.assert_(self.ddfs.exists('disco:test:blobs2'))
     self.ddfs.delete('disco:test:blobs')
     self.assert_(not self.ddfs.exists('disco:test:blobs'))
     self.ddfs.delete('disco:test:blobs2')
     self.assert_(not self.ddfs.exists('disco:test:blobs2'))
Example #6
 def read(self, size=-1):
     buf = BytesIO()
     while size:
         bytes = self._read_chunk(size if size > 0 else CHUNK_SIZE)
         if not bytes:
             break
         size -= len(bytes)
         buf.write(bytes)
     return buf.getvalue()
Example #7
 def setUp(self):
     self.ddfs.push('disco:test:authrd', [(BytesIO(b'datablob'), 'blobdata')])
     self.ddfs.push('disco:test:authwr', [(BytesIO(b'datablob'), 'blobdata')])
     self.ddfs.push('disco:test:authempty', [(BytesIO(b'datablob'), 'blobdata')])
     self.ddfs.setattr('disco:test:authrd', 'a', 'v')
     self.ddfs.setattr('disco:test:authwr', 'a', 'v')
     self.ddfs.setattr('disco:test:authrd', 'ddfs:read-token', 'rdr')
     self.ddfs.setattr('disco:test:authwr', 'ddfs:write-token', 'wtr')
     self.ddfs.setattr('disco:test:authempty', 'a', 'v')
     self.ddfs.setattr('disco:test:authempty', 'ddfs:read-token', '')
     self.ddfs.setattr('disco:test:authempty', 'ddfs:write-token', '')
Example #8
class DiscoOutputStream_v1(object):
    def __init__(self, stream,
                 version=1,
                 compression_level=2,
                 min_hunk_size=HUNK_SIZE,
                 max_record_size=None):
        self.stream = stream
        self.version = version
        self.compression_level = compression_level
        self.max_record_size = max_record_size
        self.min_hunk_size = min_hunk_size
        self.size = 0
        self.hunk_size = 0
        self.hunk = BytesIO()

    def add(self, k, v):
        self.append((k, v))

    def append(self, record):
        self.hunk_write(pickle_dumps(record, 1))
        if self.hunk_size > self.min_hunk_size:
            self.flush()

    def close(self):
        if self.hunk_size:
            self.flush()
        self.flush()

    def flush(self):
        hunk = self.hunk.getvalue()
        checksum = crc32(hunk) & 0xFFFFFFFF
        iscompressed = int(self.compression_level > 0)
        if iscompressed:
            hunk = compress(hunk, self.compression_level)
        data = b''.join([struct.pack('<BBIQ',
                                     128 + self.version,
                                     iscompressed,
                                     checksum,
                                     len(hunk)),
                         hunk])

        self.stream.write(data)
        self.size += len(data)
        self.hunk_size = 0
        self.hunk = BytesIO()

    def hunk_write(self, data):
        size = len(data)
        if self.max_record_size and size > self.max_record_size:
            raise ValueError("Record of size " + str(size) +
                             " is larger than max_record_size: " + str(self.max_record_size))
        self.hunk.write(data)
        self.hunk_size += size
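flush() frames each hunk with a little-endian <BBIQ header: the version byte offset by 128, a compressed flag, the CRC32 of the uncompressed hunk, and the hunk length in bytes. A hedged usage sketch, assuming DiscoOutputStream_v1 above is importable with its module-level dependencies (struct, pickle_dumps, compress, crc32, HUNK_SIZE) resolved as in the original Disco module:

from io import BytesIO

# Hedged usage sketch of the output stream defined above.
sink = BytesIO()
out = DiscoOutputStream_v1(sink)
out.add('greeting', 'hello')           # buffers a pickled ('greeting', 'hello') record
out.append(('answer', 42))             # records can also be appended directly
out.close()                            # flushes the buffered hunk to the sink
print(out.size, len(sink.getvalue()))  # both report the total framed bytes written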
Example #10
 def __init__(self, stream,
              version=1,
              compression_level=2,
              min_hunk_size=HUNK_SIZE,
              max_record_size=None):
     self.stream = stream
     self.version = version
     self.compression_level = compression_level
     self.max_record_size = max_record_size
     self.min_hunk_size = min_hunk_size
     self.size = 0
     self.hunk_size = 0
     self.hunk = BytesIO()
Example #11
    def flush(self):
        hunk = self.hunk.getvalue()
        checksum = crc32(hunk) & 0xFFFFFFFF
        iscompressed = int(self.compression_level > 0)
        if iscompressed:
            hunk = compress(hunk, self.compression_level)
        data = b''.join([
            struct.pack('<BBIQ', 128 + self.version, iscompressed, checksum,
                        len(hunk)), hunk
        ])

        self.stream.write(data)
        self.size += len(data)
        self.hunk_size = 0
        self.hunk = BytesIO()
Example #12
 def jobpack(self, jobname):
     """Return the :class:`disco.job.JobPack` submitted for the job."""
     from disco.compat import BytesIO
     from disco.job import JobPack
     return JobPack.load(
         BytesIO(
             self.request('/disco/ctrl/parameters?name={0}'.format(jobname),
                          as_bytes=True)))
Example #13
 def test_create_delete_create_token(self):
     self.ddfs.delete('disco:test:delete2')
     self.assert_(not self.ddfs.exists('disco:test:delete2'))
     self.ddfs.push('disco:test:delete2',
                    [(BytesIO(b'abc'), 'atom')],
                    token='secret1')
     self.assert_(self.ddfs.exists('disco:test:delete2'))
     self.assert_("disco:test:delete2" in self.ddfs.list('disco:test:delete2'))
Example #14
 def test_create_delete_create(self):
     self.ddfs.delete('disco:test:delete1')
     self.assert_(not self.ddfs.exists('disco:test:delete1'))
     self.ddfs.push('disco:test:delete1',
                    [(BytesIO(b'datablob'), 'blobdata')])
     self.assert_(self.ddfs.exists('disco:test:delete1'))
     self.assert_(
         "disco:test:delete1" in self.ddfs.list('disco:test:delete1'))
Example #15
 def __iter__(self):
     chunk = self._read_chunk(CHUNK_SIZE)
     while chunk:
         next_chunk = self._read_chunk(CHUNK_SIZE)
         lines = list(BytesIO(chunk))
         last = lines.pop() if next_chunk else b''
         for line in lines:
             yield line
         chunk = last + next_chunk
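This iterator re-splits lines across chunk boundaries: the last, possibly partial, line of each chunk is held back and prepended to the next chunk, so callers always receive whole lines. A self-contained sketch of the pattern (LineSource and the tiny CHUNK_SIZE are illustrative assumptions):

from io import BytesIO

CHUNK_SIZE = 8  # deliberately small so lines straddle chunk boundaries

class LineSource(object):
    def __init__(self, data):
        self._source = BytesIO(data)

    def _read_chunk(self, n):
        return self._source.read(n)

    def __iter__(self):
        chunk = self._read_chunk(CHUNK_SIZE)
        while chunk:
            next_chunk = self._read_chunk(CHUNK_SIZE)
            lines = list(BytesIO(chunk))             # split the chunk on newlines
            last = lines.pop() if next_chunk else b''
            for line in lines:
                yield line
            chunk = last + next_chunk                # carry the partial line over

print(list(LineSource(b'first line\nsecond line\nthird\n')))
# [b'first line\n', b'second line\n', b'third\n']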
Example #16
 def __init__(self, stream,
              version=1,
              compression_level=2,
              min_hunk_size=HUNK_SIZE,
              max_record_size=None):
     self.stream = stream
     self.version = version
     self.compression_level = compression_level
     self.max_record_size = max_record_size
     self.min_hunk_size = min_hunk_size
     self.size = 0
     self.hunk_size = 0
     self.hunk = BytesIO()
Example #17
    def flush(self):
        hunk = self.hunk.getvalue()
        checksum = crc32(hunk) & 0xFFFFFFFF
        iscompressed = int(self.compression_level > 0)
        if iscompressed:
            hunk = compress(hunk, self.compression_level)
        data = b''.join([struct.pack('<BBIQ',
                                     128 + self.version,
                                     iscompressed,
                                     checksum,
                                     len(hunk)),
                         hunk])

        self.stream.write(data)
        self.size += len(data)
        self.hunk_size = 0
        self.hunk = BytesIO()
Example #18
def disco_input_stream(stream, size, url, ignore_corrupt=False):
    """Input stream for Disco's internal compression format."""
    from disco.compat import BytesIO, int_of_byte
    from disco.compat import pickle_load
    from pickle import UnpicklingError
    import struct, zlib
    # DataError and old_netstr_reader are expected to be provided by the enclosing Disco module.
    offset = 0
    while True:
        header = stream.read(1)
        if not header:
            return
        if int_of_byte(header[0]) < 128:
            for e in old_netstr_reader(stream, size, url, header):
                yield e
            return
        try:
            is_compressed, checksum, hunk_size =\
                struct.unpack('<BIQ', stream.read(13))
        except struct.error:
            raise DataError("Truncated data at {0} bytes".format(offset), url)
        if not hunk_size:
            return
        hunk = stream.read(hunk_size)
        data = b''
        try:
            data = zlib.decompress(hunk) if is_compressed else hunk
            if checksum != (zlib.crc32(data) & 0xFFFFFFFF):
                raise ValueError("Checksum does not match")
        except (ValueError, zlib.error) as e:
            if not ignore_corrupt:
                raise DataError(
                    "Corrupted data between bytes {0}-{1}: {2}".format(
                        offset, offset + hunk_size, e), url)
        offset += hunk_size
        hunk = BytesIO(data)
        while True:
            try:
                yield pickle_load(hunk)
            except EOFError:
                break
            except UnpicklingError as e:
                if not ignore_corrupt:
                    raise DataError(
                        "Corrupted data between bytes {0}-{1}: {2}".format(
                            offset - hunk_size, offset, e), url)
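Together with the flush() method shown earlier, this reader fixes a simple frame format: a version byte with the high bit set, a little-endian <BIQ trio (compressed flag, CRC32 of the uncompressed hunk, hunk length), then the hunk itself, a concatenation of pickled records. The following standard-library-only sketch round-trips one frame purely for illustration; write_frame and read_frames are hypothetical names, not Disco functions:

import struct, zlib, pickle
from io import BytesIO

def write_frame(stream, records, version=1, level=2):
    # Pickle the records into one hunk, checksum the uncompressed bytes,
    # optionally compress, then emit the <BBIQ header followed by the hunk.
    hunk = b''.join(pickle.dumps(r, 1) for r in records)
    checksum = zlib.crc32(hunk) & 0xFFFFFFFF
    if level:
        hunk = zlib.compress(hunk, level)
    stream.write(struct.pack('<BBIQ', 128 + version, int(level > 0),
                             checksum, len(hunk)) + hunk)

def read_frames(stream):
    while True:
        header = stream.read(1)
        if not header or header[0] < 128:
            return  # end of stream (Disco falls back to the old netstr format here)
        is_compressed, checksum, hunk_size = struct.unpack('<BIQ', stream.read(13))
        data = stream.read(hunk_size)
        if is_compressed:
            data = zlib.decompress(data)
        assert checksum == (zlib.crc32(data) & 0xFFFFFFFF), "corrupted hunk"
        hunk = BytesIO(data)
        while True:
            try:
                yield pickle.load(hunk)
            except EOFError:
                break  # hunk exhausted, move on to the next frame

buf = BytesIO()
write_frame(buf, [('key', 'value'), ('answer', 42)])
buf.seek(0)
print(list(read_frames(buf)))  # [('key', 'value'), ('answer', 42)]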
Example #19
    def chunk(self,
              tag,
              urls,
              replicas=None,
              forceon=[],
              retries=10,
              delayed=False,
              update=False,
              token=None,
              chunk_size=CHUNK_SIZE,
              max_record_size=MAX_RECORD_SIZE,
              **kwargs):
        """
        Chunks the contents of `urls`,
        pushes the chunks to ddfs and tags them with `tag`.
        """
        from disco.core import result_iterator

        if 'reader' not in kwargs:
            kwargs['reader'] = None

        def chunk_iter(replicas):
            chunker = Chunker(chunk_size=chunk_size,
                              max_record_size=max_record_size)
            return chunker.chunks(result_iterator([replicas], **kwargs))

        def chunk_name(replicas, n):
            url = listify(replicas)[0]
            return self.safe_name('{0}-{1}'.format(os.path.basename(url), n))

        blobs = [
            self._push((BytesIO(chunk), chunk_name(reps, n)),
                       replicas=replicas,
                       forceon=forceon,
                       retries=retries) for reps in urls
            for n, chunk in enumerate(chunk_iter(reps))
        ]
        return (self.tag(tag,
                         blobs,
                         delayed=delayed,
                         update=update,
                         token=token), blobs)
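A hedged usage sketch of the method above; the master URL, tag name, and input path are placeholders, and whether a plain file:// URL is chunkable depends on the configured reader:

# Placeholder master URL, tag, and input path, shown only to illustrate the call.
from disco.ddfs import DDFS

ddfs = DDFS('http://localhost:8989')
tag_result, blobs = ddfs.chunk('disco:test:chunks', ['file:///tmp/input.txt'])
print(tag_result, len(blobs))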
Example #20
 def test_atomic_token(self):
     self.ddfs.push('disco:test:atomic1',
                     [(BytesIO(b'abc'), 'atom')],
                     update=True,
                     delayed=True,
                     token='secret1')
     getter = lambda: self.ddfs.getattr('disco:test:atomic1', 'foobar')
     self.assertCommErrorCode(401, getter)
     self.assertEquals(self.ddfs.getattr('disco:test:atomic1',
                                         'ddfs:write-token',
                                         token='secret1'), 'secret1')
     self.ddfs.put('disco:test:atomic2', [], token='secret2')
     getter = lambda: self.ddfs.getattr('disco:test:atomic2', 'foobar')
     self.assertCommErrorCode(401, getter)
     self.assertEquals(self.ddfs.getattr('disco:test:atomic2',
                                         'ddfs:write-token',
                                         token='secret2'), 'secret2')
     self.ddfs.put('disco:test:notoken', [])
     self.assertEquals(self.ddfs.getattr('disco:test:notoken',
                                         'ddfs:write-token'), None)
Example #21
 def read(self):
     if self.isopen:
         return BytesIO(str_to_bytes(self.source)).read
     return open(self.source, 'rb').read
Example #22
 def __init__(self):
     self.buffer = BytesIO()
     super(DiscoZipFile, self).__init__(self.buffer, 'w', ZIP_DEFLATED)
Example #23
    def codec(self,
              version=1,
              corrupt=False,
              ignore_corrupt=False,
              **kwargs):
        buf = BytesIO()
        stream = DiscoOutputStream(buf, version=version, **kwargs)
        t = self.encode(stream, self.data)
        final_size = len(buf.getvalue())
        final_mb = final_size / 1024**2
        msg = (("{0:1.2f}MB encoded in {1:1.3f}s ({2:1.2f}MB/s), "
                "encoded size {3:1.3f}MB (version: {4}, {5})")
               .format(self.size, t, self.size / t, final_mb, version, kwargs))
        if corrupt:
            buf.seek(0)
            new = BytesIO()
            new.write(buf.read(100))
            new.write(b'X')
            buf.read(1)
            new.write(buf.read())
            buf = new

        buf.seek(0)
        t, res = self.decode(buf, final_size, "nourl",
                             ignore_corrupt=ignore_corrupt)
        if not ignore_corrupt:
            print("{0}, decoded in {1:1.3f}s ({2:1.2f}MB/s)"
                  .format(msg, t, self.size / t))
        return res
Example #24
 def setUp(self):
     self.ddfs.push('disco:test:attrs', [(BytesIO(b'datablob'), 'blobdata')])
     self.ddfs.setattr('disco:test:attrs', 'a1', 'v1')
     self.ddfs.setattr('disco:test:attrs', 'a2', 'v2')
Example #25
 def makeout(self):
     return DiscoOutputStream(BytesIO(), max_record_size=MAX_RECORD_SIZE)
Example #26
 def __init__(self):
     self.buffer = BytesIO()
     super(DiscoZipFile, self).__init__(self.buffer, 'w', ZIP_DEFLATED)
Example #27
 def setUp(self):
     self.ddfs.push('disco:test:delete1', [(BytesIO(b'datablob'), 'blobdata')])
     self.ddfs.push('disco:test:delete2', [(BytesIO(b'datablob'), 'blobdata')])
Example #28
def save_oob(host, name, key, value, ddfs_token=None):
    from disco.ddfs import DDFS
    DDFS(host).push(DDFS.job_oob(name), [(BytesIO(value), key)], delayed=True)
Example #29
class DDFSUpdateTestCase(TestCase):
    data = BytesIO(b'blobdata')

    def setUp(self):
        self.ddfs.delete('disco:test:blobs')

    def blobnames(self, tag):
        from disco.ddfs import DDFS
        return list(reversed(list(DDFS.blob_name(repl[0])
                                  for repl in self.ddfs.blobs(tag))))

    def test_update_empty_new(self):
        self.ddfs.push('disco:test:blobs', [], update=True)
        self.assertEquals(len(self.blobnames('disco:test:blobs')), 0)
        self.ddfs.delete('disco:test:blobs')

    def test_update(self):
        for i in range(5):
            self.ddfs.push('disco:test:blobs',
                           [(self.data, 'dup')] * 2,
                           update=True)
        self.assertEquals(len(self.blobnames('disco:test:blobs')), 1)
        for i in range(5):
            self.ddfs.push('disco:test:blobs',
                           [(self.data, 'dup2')],
                           update=True,
                           delayed=True)
        self.assertEquals(len(self.blobnames('disco:test:blobs')), 2)
        self.ddfs.delete('disco:test:blobs')

    def test_no_garbage(self):
        self.ddfs.push('disco:test:blobs',
                       [(self.data, 'dup')] * 2,
                       update=True)
        tag_pre = self.ddfs.get('disco:test:blobs')
        self.assertEquals(len(tag_pre['urls']), 1)
        self.ddfs.tag('disco:test:blobs', tag_pre['urls'], update=True)
        self.assertEquals(tag_pre['id'], self.ddfs.get('disco:test:blobs')['id'])
        self.ddfs.delete('disco:test:blobs')

    def test_random(self):
        import random
        keys = [str(random.randint(1, 100)) for i in range(100)]
        ukeys = []
        for key in keys:
            self.ddfs.push('disco:test:blobs', [(self.data, key)], update=True)
            if key not in ukeys:
                ukeys.append(key)
        self.assertEquals(ukeys, self.blobnames('disco:test:blobs'))
        self.ddfs.delete('disco:test:blobs')

    def test_mixed(self):
        keys = []
        for key in map(str, range(10)):
            self.ddfs.push('disco:test:blobs', [(self.data, key)] * 2)
            keys += [key] * 2
        for key in map(str, range(15)):
            self.ddfs.push('disco:test:blobs',
                           [(self.data, key)] * 2,
                           update=True)
            if int(key) > 9:
                keys.append(key)
        for key in map(str, range(10)):
            self.ddfs.push('disco:test:blobs',
                           [(self.data, key)] * 2,
                           delayed=True)
            keys += [key] * 2
        self.assertEquals(keys, self.blobnames('disco:test:blobs'))
        self.ddfs.delete('disco:test:blobs')

    def tearDown(self):
        self.ddfs.delete('disco:test:blobs')
Example #30
def dumps(obj, protocol=None):
    file = BytesIO()
    Pickler(file, protocol).dump(obj)
    return file.getvalue()
 def setUp(self):
     super(AuthTestCase, self).setUp()
     from disco.compat import BytesIO
     self.tag = 'disco:test:authjob'
     self.ddfs.push(self.tag, [(BytesIO(b'blobdata'), 'blob')])
 def makeout(self):
     return DiscoOutputStream(BytesIO(), max_record_size=self.max_record_size)