Example 1
    def __init__(self, con, path=None, fileobj=None):
        self.con = con
        self.path = path
        if path:
            self.reader = open(self.path, 'rb')
            self.size = os.path.getsize(self.path)
        else:
            self.reader = fileobj
            fileobj.seek(0, 2)
            self.size = fileobj.tell()
            fileobj.seek(0)

        self.rs = Rollsum()
        self.blob_size = 0
        # Store the Span instances for the chunks
        self.spans = []
        # Total number of bytes processed so far
        self.n = 0
        # Buffer holding the current chunk data
        self.buf = ''
        self.buf_spans = {}

        # Counters used to generate the final report.
        self.cnt = {
            'skipped': 0,
            'skipped_size': 0,
            'uploaded': 0,
            'uploaded_size': 0
        }
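
A quick aside on the fileobj branch above: it measures the stream size by seeking to the end and back. The same idiom in isolation, as a minimal sketch with a placeholder in-memory file (not part of the original example):

import io

fileobj = io.BytesIO(b'example payload')  # placeholder content
fileobj.seek(0, 2)     # seek to the end (whence=2)
size = fileobj.tell()  # current position == total size in bytes
fileobj.seek(0)        # rewind so later reads start at the beginning
print size             # Python 2 print, matching the snippets above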
Example 2
def benchmark_rollsum():
    """ Roll 5 MB of random bytes through Rollsum and count split points. """
    bytes_size = 1024 * 1024 * 5
    rs = Rollsum()
    splits = 0
    for i in range(bytes_size):
        rs.roll(random.randint(0, 255))
        if rs.on_split():
            rs.bits()
            splits += 1

    every = int(bytes_size / splits)
    print 'num splits: {0}; every {1} bytes.'.format(splits, every)
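
The same measurement can be run over a real file instead of random bytes. Below is a minimal sketch that reuses only the roll()/on_split() calls shown above; it assumes Rollsum is in scope exactly as in the benchmark, and the path argument is a placeholder:

import os

def benchmark_rollsum_file(path):
    """ Count Rollsum split points over the bytes of an existing file. """
    rs = Rollsum()  # assumed in scope, as in benchmark_rollsum() above
    splits = 0
    size = os.path.getsize(path)
    with open(path, 'rb') as f:
        while 1:
            c = f.read(1)
            if not c:
                break
            rs.roll(ord(c))
            if rs.on_split():
                splits += 1
    if splits:
        print 'num splits: {0}; every {1} bytes.'.format(splits, int(size / splits))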
Example 3
def rsum(offset, length):
    """ Test function that returns the Rollsum digest of a slice of buf. """
    # buf is expected to be provided by the enclosing test scope.
    rs = Rollsum()
    for b in buf[offset:length]:
        rs.roll(b)
    return rs.digest()
Example 4
class FileWriter(object):
    def __init__(self, con, path=None, fileobj=None):
        self.con = con
        self.path = path
        if path:
            self.reader = open(self.path, 'rb')
            self.size = os.path.getsize(self.path)
        else:
            self.reader = fileobj
            fileobj.seek(0, 2)
            self.size = fileobj.tell()
            fileobj.seek(0)

        self.rs = Rollsum()
        self.blob_size = 0
        # Store the Span instances for the chunks
        self.spans = []
        # Total number of bytes processed so far
        self.n = 0
        # Buffer holding the current chunk data
        self.buf = ''
        self.buf_spans = {}

        # Counters used to generate the final report.
        self.cnt = {'skipped': 0,
                    'skipped_size': 0,
                    'uploaded': 0,
                    'uploaded_size': 0}

    def _upload_spans(self, force=False):
        """ Actually upload/put the blobs. """
        if len(self.buf_spans) == 10 or force:
            if camlipy.DEBUG:
                log.debug('Upload spans')
            resp = self.con.put_blobs(self.buf_spans.values())
            self.buf_spans = {}
            for rec in resp['received']:
                self.cnt['uploaded'] += 1
                self.cnt['uploaded_size'] += rec['size']
            for rec in resp['skipped']:
                self.cnt['skipped'] += 1
                self.cnt['skipped_size'] += rec['size']

    def upload_last_span(self):
        """ Empty the current blob buffer, prepare the blob,
        and add it to the spans buffer (they are uploaded once there
        are ten blobs in the buffer).
        """
        if camlipy.DEBUG:
            log.debug('Add span to buffer: {0}'.format(self.spans[-1]))

        chunk = self.buf
        self.buf = ''
        blob_ref = camlipy.compute_hash(chunk)
        self.spans[-1].br = blob_ref
        self.buf_spans[blob_ref] = chunk
        executor = futures.ThreadPoolExecutor(max_workers=2)
        executor.submit(self._upload_spans)
        executor.shutdown(wait=False)

    def chunk(self):
        """ Chunk the file with Rollsum to a tree of Spans. """
        if self.size <= FIRST_CHUNK_SIZE:
            if camlipy.DEBUG:
                log.debug('Skip chunking, file size smaller than the first chunk: {0}'.format(self.size))
            buf = self.reader.read(self.size)
            br = self.con.put_blob(buf)
            span = Span(br=br, size=self.size)
            self.spans.append(span)
            return 1

        if camlipy.DEBUG:
            log.debug('Start chunking, total size: {0}'.format(self.size))
        chunk_cnt = 0
        last = 0
        eof = False
        bits = 0
        while 1:
            c = self.reader.read(1)
            if c:
                self.buf += c
                self.n += 1
                self.blob_size += 1
                self.rs.roll(ord(c))
                on_split = self.rs.on_split()

                bits = 0
                if self.blob_size == MAX_BLOB_SIZE:
                    bits = 20
                # Near EOF: don't split, keep reading until the end of the file
                elif self.n > self.size - BUFFER_SIZE:
                    continue
                elif (on_split and self.n > FIRST_CHUNK_SIZE and
                        self.blob_size > TOO_SMALL_THRESHOLD):
                    bits = self.rs.bits()
                # First chunk => 262144 bytes
                elif self.n == FIRST_CHUNK_SIZE:
                    bits = 18  # 1 << 18
                else:
                    continue

                self.blob_size = 0

                # The tricky part: take spans from the end that have a
                # smaller bits score, slice them off and make them children
                # of the current node. That's how we end up with mixed
                # blobRef/bytesRef, and it keeps them ordered, building a
                # kind of depth-first graph.
                children = []
                children_from = len(self.spans)

                while children_from > 0 and \
                        self.spans[children_from - 1].bits < bits:
                    children_from -= 1

                n_copy = len(self.spans) - children_from
                if n_copy:
                    children = self.spans[children_from:]
                    self.spans = self.spans[:children_from]
            else:
                eof = True
                children = []

            current_span = Span(last, self.n, bits, children, chunk_cnt)

            if camlipy.DEBUG:
                log.debug('Current span: {0}, last:{1}, n:{2}'.format(current_span, last, self.n))

            self.spans.append(current_span)
            last = self.n
            self.upload_last_span()

            chunk_cnt += 1

            if eof:
                log.debug('EOF')
                break

        # Upload the remaining chunks
        assert self.n == self.size

        self._upload_spans(force=True)
        return chunk_cnt

    def bytes_writer(self, to_bytes=True):
        """ Transform the span in a blobRef/bytesRef tree.

        if `to_bytes' is True, returns a Bytes schema,
        if False, it returns the list of parts (ready to
        be injected in a File schema.)

        """
        return self._bytes_writer(self.spans, to_bytes=to_bytes)

    def _bytes_writer(self, spans, to_bytes=True):
        """ Actually transform the span in a blobRef/bytesRef tree.

        if `to_bytes' is True, returns a Bytes schema,
        if False, it returns the list of parts (ready to
        be injected in a File schema.)

        """
        schema = Bytes(self.con)
        if camlipy.DEBUG:
            log.debug('Starting spans: {0}'.format(spans))

        for span in spans:
            if camlipy.DEBUG:
                log.debug('Current span: {0}'.format(span))

            # Don't create a bytesRef if there is only one child,
            # make it a blobRef instead.
            if len(span.children) == 1 and span.children[0].single_blob():
                children_size = span.children[0].to - span.children[0]._from
                schema.add_blob_ref(span.children[0].br, children_size)
                span.children = []

                if camlipy.DEBUG:
                    log.debug('Transform this span to blobRef, new span: {0}'.format(span))

            # Create a new bytesRef if the span has children
            elif len(span.children):
                children_size = 0
                for c in span.children:
                    children_size += c.size()

                if camlipy.DEBUG:
                    log.debug('Embedding a bytesRef')
                schema.add_bytes_ref(self._bytes_writer(span.children, True), children_size)

            # Make a blobRef with the span data
            schema.add_blob_ref(span.br, span.to - span._from)
            log.info(schema.json())

        if camlipy.DEBUG:
            log.debug('Resulting Bytes schema: {0}'.format(schema.json()))

        if to_bytes:
            self.con.put_blobs([schema.json()])

            return camlipy.compute_hash(schema.json())

        return schema.data['parts']

    def check_spans(self):
        """ Debug methods. """
        log.debug(self.spans)
        return self._check_spans(self.spans)

    def _check_spans(self, spans):
        """ Debug methods. """
        for span in spans:
            if span.single_blob():
                yield span.chunk_cnt
            else:
                for sp in self._check_spans(span.children):
                    yield sp
                yield span.chunk_cnt
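
Putting it together, a hedged usage sketch for FileWriter: get_camlipy_connection() is a hypothetical helper standing in for however a camlipy connection is obtained (any object exposing put_blob()/put_blobs() as used above would do), and the path is a placeholder:

con = get_camlipy_connection()  # hypothetical helper; any object with put_blob()/put_blobs() works

fw = FileWriter(con, path='/tmp/example.bin')  # placeholder path
chunk_cnt = fw.chunk()         # cut the file into Spans and upload the blobs
bytes_ref = fw.bytes_writer()  # upload the Bytes schema, get back its blobRef
print 'chunks: {0}, bytes ref: {1}'.format(chunk_cnt, bytes_ref)
print fw.cnt                   # uploaded/skipped counters gathered while chunking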