Example #1
def cli(filename, output):
    """calculate Merkle Tree
    """
    hash_value = calculate_tree_hash(filename)
    if output == 'binary':
        hash_value = binascii.unhexlify(hash_value)
    click.echo(hash_value)
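The click decorators are not shown in Example #1. The sketch below is one possible wiring, not the original project's code: the argument and option names are assumptions, and it uses botocore's file-object-taking calculate_tree_hash instead of whatever path-taking helper the snippet above relies on.

import binascii

import click
from botocore.utils import calculate_tree_hash

@click.command()
@click.argument('filename', type=click.Path(exists=True))
@click.option('--output', type=click.Choice(['hex', 'binary']), default='hex')
def cli(filename, output):
    """Calculate the SHA-256 tree hash of FILENAME."""
    # botocore's calculate_tree_hash expects a file-like object, so the
    # file is opened here; the snippet above presumably wraps this itself.
    with open(filename, 'rb') as f:
        hash_value = calculate_tree_hash(f)
    if output == 'binary':
        hash_value = binascii.unhexlify(hash_value)
    click.echo(hash_value)

if __name__ == '__main__':
    cli()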
Example #2
def multipart_upload(filename, part_size=PART_SIZE):
    glacier = boto3.resource('glacier', region_name='us-west-2')
    # There's no error if the vault already exists so we don't
    # need to catch any exceptions here.
    vault = glacier.create_vault(vaultName='botocore-integ-test-vault')
    file_size = os.path.getsize(filename)

    # Initiate a multipart upload
    multipart_upload = vault.initiate_multipart_upload(
        archiveDescription='multipart upload', partSize=str(part_size))
    try:
        # Upload each part
        for i in range(file_size // part_size + 1):
            range_from = i * part_size
            range_to = min((i + 1) * part_size - 1, file_size - 1)
            body = ReadFileChunk.from_filename(filename, range_from, part_size)
            multipart_upload.upload_part(
                body=body, range='bytes %d-%d/*' % (range_from, range_to))

        # Complete a multipart upload transaction
        response = multipart_upload.complete(
            checksum=calculate_tree_hash(open(filename, 'rb')),  # NEEDED
            archiveSize=str(file_size))
        return vault.Archive(response['archiveId'])
    except:
        multipart_upload.abort()
        raise
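The part-range arithmetic in the loop above can be factored into a small generator. The sketch below is only illustrative; byte_ranges is a hypothetical helper, not part of the example.

def byte_ranges(file_size, part_size):
    """Yield inclusive (start, end) byte offsets for each upload part."""
    for start in range(0, file_size, part_size):
        yield start, min(start + part_size, file_size) - 1

# For instance, list(byte_ranges(10, 4)) == [(0, 3), (4, 7), (8, 9)]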
Example #3
def glacier_sync_multipart_upload(ctx, version, file_path, file_size):
    # Reference: https://boto3.readthedocs.io/en/latest/reference/services/glacier.html#Glacier.Vault.initiate_multipart_upload
    multipart_upload = ctx.vault.initiate_multipart_upload(
        archiveDescription=version.location['object'],
        partSize=str(GLACIER_PART_SIZE),
    )

    with open(file_path, 'rb') as fp:
        for byte_offset in range(0, file_size, GLACIER_PART_SIZE):
            part = fp.read(GLACIER_PART_SIZE)
            range_header = 'bytes {}-{}/{}'.format(byte_offset, byte_offset + len(part) - 1, file_size)
            multipart_upload.upload_part(
                range=range_header,
                body=part,
            )

    # TODO: Ideally this would be computed on upload, however this is also a good double check, so we do not incur any off-by-one issues.
    # see https://boto3.readthedocs.io/en/latest/reference/services/glacier.html#Glacier.MultipartUpload.complete
    checksum = calculate_tree_hash(open(file_path, 'rb'))
    response = multipart_upload.complete(
        archiveSize=str(file_size),
        checksum=checksum,
    )
    assert response['checksum'] == checksum

    return response['archiveId']
Example #4
def multipart_upload(filename, part_size=PART_SIZE):
    glacier = boto3.resource('glacier', region_name='us-west-2')
    # There's no error if the vault already exists so we don't
    # need to catch any exceptions here.
    vault = glacier.create_vault(vaultName='botocore-integ-test-vault')
    file_size = os.path.getsize(filename)

    # Initiate a multipart upload
    multipart_upload = vault.initiate_multipart_upload(
        archiveDescription='multipart upload', partSize=str(part_size))
    try:
        # Upload each part
        for i in range(file_size // part_size + 1):
            range_from = i * part_size
            range_to = min((i + 1) * part_size - 1, file_size - 1)
            body = ReadFileChunk.from_filename(filename, range_from, part_size)
            multipart_upload.upload_part(body=body,
                                         range='bytes %d-%d/*' %
                                         (range_from, range_to))

        # Complete a multipart upload transaction
        response = multipart_upload.complete(
            checksum=calculate_tree_hash(open(filename, 'rb')),  # NEEDED
            archiveSize=str(file_size))
        return vault.Archive(response['archiveId'])
    except:
        multipart_upload.abort()
        raise
Example #5
def add_glacier_checksums(params, **kwargs):
    """Add glacier checksums to the http request.

    This will add two headers to the http request:

        * x-amz-content-sha256
        * x-amz-sha256-tree-hash

    These values will only be added if they are not present
    in the HTTP request.

    """
    request_dict = params
    headers = request_dict['headers']
    body = request_dict['body']
    if isinstance(body, six.binary_type):
        # If the user provided a bytes type instead of a file-like
        # object, we temporarily create a BytesIO object so we can
        # use the util functions to calculate the checksums, which
        # assume file-like objects.  Note that we're not actually
        # changing the body in the request_dict.
        body = six.BytesIO(body)
    starting_position = body.tell()
    if 'x-amz-content-sha256' not in headers:
        headers['x-amz-content-sha256'] = utils.calculate_sha256(
            body, as_hex=True)
    body.seek(starting_position)
    if 'x-amz-sha256-tree-hash' not in headers:
        headers['x-amz-sha256-tree-hash'] = utils.calculate_tree_hash(body)
    body.seek(starting_position)
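A minimal sketch of how this handler could be exercised on its own, assuming the function above is importable; the dict below only mimics the 'headers'/'body' shape the handler reads and is not a real botocore request.

import io

params = {'headers': {}, 'body': io.BytesIO(b'a' * 1024)}
add_glacier_checksums(params)
# Both checksum headers should now be present.
print(params['headers']['x-amz-content-sha256'])
print(params['headers']['x-amz-sha256-tree-hash'])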
Example #6
def add_glacier_checksums(params, **kwargs):
    """Add glacier checksums to the http request.

    This will add two headers to the http request:

        * x-amz-content-sha256
        * x-amz-sha256-tree-hash

    These values will only be added if they are not present
    in the HTTP request.

    """
    request_dict = params
    headers = request_dict['headers']
    body = request_dict['body']
    if isinstance(body, six.binary_type):
        # If the user provided a bytes type instead of a file-like
        # object, we temporarily create a BytesIO object so we can
        # use the util functions to calculate the checksums, which
        # assume file-like objects.  Note that we're not actually
        # changing the body in the request_dict.
        body = six.BytesIO(body)
    starting_position = body.tell()
    if 'x-amz-content-sha256' not in headers:
        headers['x-amz-content-sha256'] = utils.calculate_sha256(body,
                                                                 as_hex=True)
    body.seek(starting_position)
    if 'x-amz-sha256-tree-hash' not in headers:
        headers['x-amz-sha256-tree-hash'] = utils.calculate_tree_hash(body)
    body.seek(starting_position)
Example #7
def archive_file_to_glacier_multipart(session, vault_name, archive_desc, archive_file):
    """
    Upload file to AWS glacier vault.
    :param session:
    :param vault_name:
    :param archive_desc:
    :param archive_file:
    :return:
    """
    # use chunk size 10MB
    chunk_size = 1048576 * 32
    glacier = session.resource("glacier")
    logger.info("Upload %s to glacier vault %s" % (archive_file, vault_name))
    vault = glacier.Vault(account_id="-", name=vault_name)
    multipart_upload = vault.initiate_multipart_upload(
        accountId="-", archiveDescription=archive_desc, partSize=str(chunk_size))
    # upload_id = multipart_upload.id
    f = open(archive_file, "rb")
    start_range = 0
    for chunk in read_in_chunks(f, chunk_size):
        range_data = "bytes %s-%s/*" % (start_range, f.tell()-1)
        logger.info("Uploading range %s" % range_data)
        multipart_upload.upload_part(range=range_data, body=chunk)
        start_range = f.tell()

    f.seek(0)
    response = multipart_upload.complete(archiveSize=str(start_range),
                                         checksum=calculate_tree_hash(f))
    f.close()
    # archive_id = response.get('archiveId')
    return response
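read_in_chunks is not defined in this example; a plausible generator matching how it is used here (yielding fixed-size chunks until EOF) might look like this sketch.

def read_in_chunks(file_object, chunk_size):
    """Yield successive chunks of at most chunk_size bytes until EOF."""
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data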
Example #8
 def finalize_upload(self):
     self.file_handle.seek(0)
     filehash = calculate_tree_hash(self.file_handle)
     self.glacier.complete_multipart_upload(
         vaultName=self.vault,
         uploadId=self.multipart_upload_id,
         archiveSize=str(self.file_size),
         checksum=filehash,
     )
Example #9
 def complete_multipart_upload(self, upload_id, archive):
     size = archive.tell()
     archive.seek(0)
     checksum = calculate_tree_hash(archive)
     response = self.client.complete_multipart_upload(
         vaultName=self.vault_name,
         uploadId=upload_id,
         archiveSize=str(size),
         checksum=checksum)
     return response['archiveId']
Example #10
    def checksum(self):
        """ Calculate the checksum for the upload, as needed by Glacier API.

        :rtype: str
        """
        if self._checksum is None:
            self._checksum = calculate_tree_hash(self.data)
        if self.completed:
            self._data = None
        return self._checksum
Example #11
 def complete_multipart_upload(self, upload_id, archive):
     size = archive.tell()
     archive.seek(0)
     checksum = calculate_tree_hash(archive)
     response = self.client.complete_multipart_upload(
         vaultName = self.vault_name,
         uploadId = upload_id,
         archiveSize = str(size),
         checksum = checksum
     )
     return response['archiveId']
Example #12
 def finalize(self):
     step = 'finalize'
     total_checksum = calculate_tree_hash(open(self.filepath, 'rb'))
     self.responses[step] = get_client().complete_multipart_upload(
         archiveSize=str(get_file_size(self.filepath)),
         checksum=total_checksum,
         uploadId=self.upload_id,
         vaultName=self.vault_name,
     )
     if self.responses[step].get('checksum', None) == total_checksum:
         self.archive_id = self.responses[step]['archiveId']
         logging.info('multipart upload %s finished with archive ID %s' %
                      (self.filepath, self.archive_id))
     else:
         logging.info('multipart upload %s failed' % self.filepath)
Example #13
def multipart_upload(account_id, chunk_size, file_name, glacier_vault,
                     out_file):
    file_size = os.path.getsize(file_name)
    parts = math.ceil(file_size / chunk_size)

    client = boto3.client('glacier')
    try:
        # initiate the multipart upload
        upload_obj = client.initiate_multipart_upload(
            accountId=account_id,
            vaultName=glacier_vault,
            archiveDescription=file_name,
            partSize=str(chunk_size))
        print(upload_obj)
        # upload file in chunks
        with open(file_name, 'rb') as upload:
            for p in range(parts):
                lower = p * chunk_size
                upper = min(lower + chunk_size, file_size) - 1
                file_part = upload.read(chunk_size)

                up_part = client.upload_multipart_part(
                    accountId=account_id,
                    vaultName=glacier_vault,
                    uploadId=upload_obj['uploadId'],
                    range='bytes {}-{}/*'.format(lower, upper),
                    body=file_part)
        # complete the upload
        checksum = calculate_tree_hash(open(file_name, 'rb'))
        complete_upload = client.complete_multipart_upload(
            accountId=account_id,
            vaultName=glacier_vault,
            uploadId=upload_obj['uploadId'],
            archiveSize=str(file_size),
            checksum=checksum)
    except ClientError as e:
        logging.error(e)
        sys.exit()

    print("complete archiving " + file_name + " in Glacier vault " +
          glacier_vault)
    print(complete_upload)

    with open(out_file, "w") as file:
        file.write(str(complete_upload))
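For choosing chunk_size, Glacier multipart uploads expect the part size to be a power-of-two multiple of 1 MiB (1 MiB up to 4 GiB) and allow at most 10,000 parts per upload. The helper below is a hedged sketch under those constraints; pick_part_size is hypothetical and not used by the example above.

MIB = 1024 * 1024

def pick_part_size(file_size, max_parts=10000):
    """Smallest power-of-two multiple of 1 MiB that keeps the upload
    within max_parts parts, capped at the 4 GiB per-part limit."""
    part_size = MIB
    while file_size / part_size > max_parts and part_size < 4 * 1024 * MIB:
        part_size *= 2
    return part_size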
Example #14
 def upload(self):
     """
     Upload the archive to Amazon Glacier by chunking it.
     """
     self.start_upload()
     while not self.upload_part():
         continue
     # Move to the front of the file to calculate its hash
     self.zip_file.seek(0)
     zip_hash = calculate_tree_hash(self.zip_file)
     response = self.client.complete_multipart_upload(
             vaultName=self.vault_name,
             uploadId=self.upload_id,
             archiveSize=str(self.zip_file_size),
             checksum=zip_hash
     )
     return response
Example #15
def upload_large_file(vault_name, filepath, description):
    """
    Do a multi part upload to glacier
    :param vault_name:
    :param filepath:
    :param description:
    :return:
    """
    glacier = boto3.resource("glacier")
    vault = glacier.Vault(account_id="-", name=vault_name)

    multipart_upload = vault.initiate_multipart_upload(
        accountId="-",
        archiveDescription=description,
        partSize=str(CHUNK_SIZE))
    upload_id = multipart_upload.id
    print("Upload id: {}".format(upload_id))

    with open(filepath, 'rb') as f:
        retrylist = upload_segments(multipart_upload,
                                    read_in_chunks(f, CHUNK_SIZE))

        f.seek(0, 2)
        fsize = f.tell()

        while len(retrylist) > 0:
            print("Retrying failed parts")
            # syntax turns list into a generator
            retrylist = upload_segments(multipart_upload,
                                        (i for i in retrylist))

        print("Finalizing upload {} ...".format(upload_id))
        f.seek(0)
        s256t_hash = calculate_tree_hash(f)
        response = multipart_upload.complete(archiveSize=str(fsize),
                                             checksum=s256t_hash)
        print("Hash: {}".format(s256t_hash))

    pprint.pprint(response)
    return response
Example #16
def upload_archive(client, vault_name, file_name, description, concurrency):
    file_size = os.stat(file_name).st_size
    part_size = calculate_part_size(file_size)

    info('Initiating multipart upload')
    upload_id = client.initiate_multipart_upload(vaultName=vault_name, partSize=str(part_size),
                                                 archiveDescription=description)['uploadId']

    parts = generate_archive_parts(vault_name, file_name, upload_id, file_size, part_size)
    base_name = os.path.basename(file_name)
    info(f'Uploading {base_name} in {len(parts)} parts')
    with Pool(concurrency) as pool:
        pbar = tqdm(total=file_size, unit="", unit_scale=True, dynamic_ncols=True,
                    bar_format='{percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} ({rate_fmt})')
        for uploaded_size in pool.imap_unordered(upload_archive_part, parts):
            pbar.update(uploaded_size)
        pbar.close()

    info('Verifying checksum')
    checksum = calculate_tree_hash(open(file_name, 'rb'))
    return client.complete_multipart_upload(vaultName=vault_name, uploadId=upload_id, archiveSize=str(file_size),
                                            checksum=checksum)
Example #17
    def multipart_upload(self, path_to_file, part_size=4, description=""):
        """Uploading a file in mutiple parts.

        Args:
            path_to_file (str): Path to the file.
            description (str, optional): Description of what is uploaded.
            part_size (int, optional): Size for the multipart parts. Defaults to 4 megabytes.
        """
        if self.validator.preupload_checks(path_to_file, part_size) and self._vault_exists(self.vault_name):
            total_size = get_file_size(path_to_file)
            part_size_bytes = get_allowed_sizes().get(str(part_size))
            parts = get_needed_parts(path_to_file, part_size_bytes, total_size)
            parts = add_byte_ranges(parts)
            response = self._initiate_multipart_upload(description, part_size_bytes, total_size)
            if self.validator.is_response_ok(response):
                upload_id = response.get("uploadId")
                upload_success = self._do_multipart_upload(upload_id, path_to_file, parts)
                if upload_success:
                    self.logger.info("Calculating tree hash...")
                    with open(path_to_file, 'rb') as file_object:
                        total_hash = calculate_tree_hash(file_object)
                    completed_response = self._complete_multipart_upload(upload_id, total_size, total_hash)
                    if self.validator.is_response_ok(completed_response):
                        self.logger.info("Upload completed.")
Example #18
def update_local_inventory(local_dir, local_filenames, old_local_inventory):
    """Update the local inventory stored as a json file in the local directory.
    The updated local inventory is returned."""

    archive_list = list()

    old_local_inventory_dict = dictify_inventory(old_local_inventory)

    for f in tqdm(local_filenames):
        if f in old_local_inventory_dict:
            checksum = old_local_inventory_dict[f]
        else:
            with open(os.path.join(local_dir, f), 'rb') as target_file:
                checksum = calculate_tree_hash(target_file)

        archive_list.append({
                "ArchiveDescription" : f,
                "CreationDate" : datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                "SHA256TreeHash" : checksum
            })

    res = {"ArchiveList" : archive_list}

    return res
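dictify_inventory is not shown; judging from how it is used, it likely maps each archive description (the local file name) to its stored tree hash. A hedged sketch consistent with the ArchiveList structure built above:

def dictify_inventory(inventory):
    """Map ArchiveDescription -> SHA256TreeHash so unchanged files can
    skip rehashing (illustrative sketch, not the project's helper)."""
    return {entry['ArchiveDescription']: entry['SHA256TreeHash']
            for entry in inventory.get('ArchiveList', [])}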
Example #19
 def test_tree_hash_exactly_one_mb(self):
     one_meg_bytestring = b'a' * (1 * 1024 * 1024)
     one_meg = six.BytesIO(one_meg_bytestring)
     self.assertEqual(
         calculate_tree_hash(one_meg),
         '9bc1b2a288b26af7257a36277ae3816a7d4f16e89c1e7e77d0a5c48bad62b360')
Example #20
 def test_tree_hash_less_than_one_mb(self):
     one_k = six.BytesIO(b'a' * 1024)
     self.assertEqual(
         calculate_tree_hash(one_k),
         '2edc986847e209b4016e141a6dc8716d3207350f416969382d431539bf292e4a')
Example #21
 def test_empty_tree_hash(self):
     self.assertEqual(
         calculate_tree_hash(six.BytesIO(b'')),
         'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855')
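For context, the expected values in these tests follow the SHA-256 tree-hash scheme: hash each 1 MiB chunk, then repeatedly hash concatenated pairs of digests, carrying an odd digest up unchanged. The sketch below is an illustrative re-implementation, not botocore's calculate_tree_hash.

import hashlib

def tree_hash_sketch(fileobj, chunk_size=1024 * 1024):
    # Hash each 1 MiB chunk of the stream.
    digests = []
    while True:
        chunk = fileobj.read(chunk_size)
        if not chunk:
            break
        digests.append(hashlib.sha256(chunk).digest())
    if not digests:
        # Empty input hashes to the SHA-256 of the empty string.
        return hashlib.sha256(b'').hexdigest()
    # Combine digests pairwise until a single root digest remains.
    while len(digests) > 1:
        next_level = []
        for i in range(0, len(digests), 2):
            if i + 1 < len(digests):
                next_level.append(
                    hashlib.sha256(digests[i] + digests[i + 1]).digest())
            else:
                next_level.append(digests[i])  # odd digest carried up as-is
        digests = next_level
    return digests[0].hex()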
Example #22
def mpUpload(fname, vault, desc, acctid, hrglass):
    '''Initiate a multipart upload, split the file into parts, calculate the
    tree-hash checksum for each part, and upload the parts to Glacier in
    parallel, using one worker per CPU core.'''
    startime = time.time()
    chunkscheme = chunksize(fname)  # chunkscheme[0]= size of chunk
    fullsize = getsize(fname)
    size, partcount = chunkscheme[0], chunkscheme[1]
    #graph = ''
    try:
        if size < 42949672960:  # 40 GiB
            try:
                print('calling aws initiate function')
                try:
                    desc = desc.replace('*', '+')
                except:
                    pass
                mpu = glacier_client.initiate_multipart_upload(
                    vaultName=vault,
                    archiveDescription=desc,
                    partSize=str(size))
            except (botocore.exceptions.ParamValidationError,
                    botocore.exceptions.ClientError):
                print(
                    'Multipart Upload operation failed, {} not uploaded to {}'.
                    format(fname, vault))
                return None
            upid = mpu['uploadId']
            print(
                '\nMultipart Upload initiated for {} to {}\n Upload Request ID: {}'
                .format(fname, vault, upid))
            try:
                print(hrglass)
            except:
                pass
            print(
                'This may take a while, {} is {} watching it happen is not recommended...\n'
                .format(fname, size_display(fullsize)))
            print('\nsplitting input file into {} parts\n'.format(partcount))
            presplit = time.time() - startime
            print('time {}'.format(timer(startime, time.time())))
            filist = unxplit(fname, size, PART_PREFIX)
            postplit = time.time() - presplit
            print('time: {}'.format(timer(postplit, time.time())))
            of = len(filist)

            if of != partcount:
                print("of: {} doesn't equal partcount: {}...".format(
                    of, partcount))
                trash(PART_PREFIX)
                sys.exit()

            totrange = 0
            #response = []

            all_params = []

            #  iterate over each file part to compile parameters
            print('Compiling parameters for upload segments...')
            prop = 0
            for part in filist:
                part_params = {
                    'xpart': part,
                    'vault': vault,
                    'upid': upid,
                    'acctid': ACCOUNT_ID
                }
                num = filist.index(part) + 1
                bytestring = open(part, 'rb')
                thash = str(core.calculate_tree_hash(bytestring))
                part_params.update({'xthash': thash})
                bytestring.close()
                # size is the closest power of two that is greater than the size of the part!
                btrange = totrange + getsize(part)
                prtrange = (totrange, btrange - 1)
                part_params.update({'xrange': prtrange})
                all_params.append(part_params)

                # display progress
                comprop = prop
                prop = (len(all_params) * 100) / of
                if prop != comprop:
                    print(bar(prop), ' {} of {}'.format(num, of))
                # increment how much of the file has now been accounted for
                totrange += size  # range is increasing by size, which is larger than the size of the part
            print('Uploading to Amazon Glacier...\n')

            # map part uploads into parallel scheme using upool
            ccount = 0
            errcount = 0
            done = round((ccount / of) * 100)  #  done refers to the percentage
            pool = Pool(processes=cpu_count())
            for partload in pool.imap_unordered(
                    partfeed, all_params):  # pool.imap_unordered() parallelization
                if not partload[2]:  #  upload part failed
                    print('failed to upload range {}-{}'.format(
                        partload[0], partload[1]))
                    errcount += 1  # make errcount a vector of the failed parameters
                else:  # successful part upload
                    pdone = done
                    ccount += 1
                    done = (ccount / of) * 100
                    if done != pdone:
                        print('byterange {}-{} successfully uploaded'.format(
                            partload[0], partload[1]))
                        print(bar(done))
            print('{} of {} successfully uploaded'.format(ccount, of))
            pool.close()
            pool.join()
            pool.terminate()
            pool = Pool(processes=cpu_count())
            print('{} failed parts'.format(errcount))
            if errcount != 0:
                #firstpass = time.time()-postplit
                errcount = 0

                uploaded = glacier_client.list_parts(vaultName=vault,
                                                     uploadId=upid)['Parts']
                while len(uploaded) < len(all_params):
                    #  check to see if all parts have uploaded
                    print('time: {}'.format(timer(postplit, time.time())))
                    postplit = time.time()
                    errcount = 0
                    print('retrying {} failed parts'.format(
                        errcount))  # no .format here
                    rangit = [i['RangeInBytes'] for i in uploaded]
                    checkparts = (tuple(byterange.split('-'))
                                  for byterange in rangit
                                  )  # received byteranges from AWS server
                    rcheckparts = [(int(ad[0]), int(ad[1]))
                                   for ad in checkparts]
                    remains = [
                        left for left in all_params
                        if left['xrange'] not in rcheckparts
                    ]

                    for partload in pool.imap(partfeed, remains):
                        if not partload[2]:
                            print('failed to upload range: {}-{}'.format(
                                partload[0], partload[1]))
                            errcount += 1
                        else:
                            ccount += 1
                            print(
                                'bytes {}-{} (part {} of {}) successfully uploaded!'
                                .format(partload[0], partload[1], ccount, of))
                            print(bar(round((ccount / of) * 100)))
                    print('{} of {} successfully uploaded'.format(ccount, of))
                    print('{} failed parts'.format(errcount))
                    pool.close()
                    pool.join()
                    uploaded = glacier_client.list_parts(
                        vaultName=vault, uploadId=upid)['Parts']
                    pool.terminate()
                    pool = Pool(processes=cpu_count())

            # close pool
            #pool.close()
            #pool.join()  # retry any uploads left over
            with open(fname, 'rb') as f:
                full_tree_hash = core.calculate_tree_hash(f)
                f.close()

            completion = glacier_client.complete_multipart_upload(
                vaultName=vault,
                uploadId=upid,
                archiveSize=str(fullsize),
                checksum=full_tree_hash,
                accountId=acctid)
            # trash(PART_PREFIX)
            print('\nMultipart Upload of Archive: {} to Vault {} Completed\n'.
                  format(fname, vault))
            pool.terminate()
            # for safety, * is marker string for history update
            output = {
                'FileName':
                fname,
                'Description':
                "{}: {} *{}* ".format(
                    completion['ResponseMetadata']['HTTPHeaders']['date'],
                    desc, fname),
                'VaultName':
                vault,
                'ArchiveId':
                completion['archiveId'],
                'Size':
                str(fullsize)
            }
            return output
        else:  # this needs to happen before the success message if
            print('Amazon does not support archive files over 40 GB, {} is {}'.
                  format(fname, size_display(fullsize)))
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        name = split(exc_tb.tb_frame.f_code.co_filename)[1]
        print('Python raised the following exception: {} {} {} {}'.format(
            exc_type, e, name, exc_tb.tb_lineno))
        glacier_client.abort_multipart_upload(vaultName=vault, uploadId=upid)
    finally:
        trash(PART_PREFIX)
Example #23
 def test_tree_hash_offset_of_one_mb_multiple(self):
     offset_four_mb = six.BytesIO(b'a' * (4 * 1024 * 1024) + b'a' * 20)
     self.assertEqual(
         calculate_tree_hash(offset_four_mb),
         '12f3cbd6101b981cde074039f6f728071da8879d6f632de8afc7cdf00661b08f')
Example #24
 def test_tree_hash_multiple_of_one_mb(self):
     four_mb = six.BytesIO(b'a' * (4 * 1024 * 1024))
     self.assertEqual(
         calculate_tree_hash(four_mb),
         '9491cb2ed1d4e7cd53215f4017c23ec4ad21d7050a1e6bb636c4f67e8cddb844')
Example #25
 def test_tree_hash_exactly_one_mb(self):
     one_meg_bytestring = b'a' * (1 * 1024 * 1024)
     one_meg = six.BytesIO(one_meg_bytestring)
     self.assertEqual(
         calculate_tree_hash(one_meg),
         '9bc1b2a288b26af7257a36277ae3816a7d4f16e89c1e7e77d0a5c48bad62b360')
Example #26
 def test_tree_hash_less_than_one_mb(self):
     one_k = six.BytesIO(b'a' * 1024)
     self.assertEqual(
         calculate_tree_hash(one_k),
         '2edc986847e209b4016e141a6dc8716d3207350f416969382d431539bf292e4a')
Example #27
 def test_empty_tree_hash(self):
     self.assertEqual(
         calculate_tree_hash(six.BytesIO(b'')),
         'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855')
Example #28
def upload_archive(transfer_file, vault_name):
    glacier_client = boto3.client('glacier',
                                  region_name='ap-southeast-2',
                                  aws_access_key_id='DUMMY_ID',
                                  aws_secret_access_key='DUMMY_ACCESS')
    total = os.path.getsize(transfer_file)
    size = ceiling_log(total/32, 2) # 32 cores on this NCI machine

    init_mp_upl_resp = glacier_client.initiate_multipart_upload(vaultName=vault_name,
                                                                archiveDescription='2000_062 cwb waveform data',
                                                                partSize=str(size))
    print(init_mp_upl_resp['uploadId'])
    #write_uploadid(os.path.splitext(transfer_file)[0]+'.id', init_mp_upl_resp['uploadId'])

    '''
    # running 32 parallel thread with the code below caused a argument mismatch error in the
    # multiprocessing code! so continuing with serial upload for now. will investigate and
    # try to fix the parallel upload.
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    tasks = []
    start = 0
    while start < total:
        tasks.append( (transfer_file, start, size, init_mp_upl_resp['uploadId']) )
        start += size
    results = [pool.apply_async( transfer_part, t ) for t in tasks]

    all_uploaded=True
    for i, result in enumerate(results):
        if not result.get():
            print('Part number ', i, ' was not uploaded successfully')
            all_uploaded=False
    '''

    all_uploaded = True
    start = 0
    while start < total:
        if transfer_part(transfer_file, start, size, init_mp_upl_resp['uploadId'], vault_name):
            start += size
        else:
            all_uploaded = False

    if all_uploaded:
        print("All Files Uploaded")
        print("Verifying Checksum...")
        complete_up = glacier_client.complete_multipart_upload(
            vaultName=vault_name,
            uploadId=init_mp_upl_resp['uploadId'],
            archiveSize=str(total),
            checksum=calculate_tree_hash(open(transfer_file, 'rb')))
        print("Upload Completed:", complete_up)
    else:
        print("Upload of archive file:", transfer_file, " failed...")
Example #29
def main(argv):
    global logger, aws_region, aws_profile, logging, retrieval_tier, sns_topic
    logger.info('Started')
    topic_arn = ''
    args = parse_args()
    if args.loglevel is False:
        logger.info("Using default logging level: INFO")
    else:
        loglevel = args.loglevel
        logger.info("Changing logging level: " + loglevel.upper())
        if loglevel == "debug":
            logging.getLogger().setLevel(logging.DEBUG)
        elif loglevel == "info":
            logging.getLogger().setLevel(logging.INFO)
        elif loglevel == "error":
            logging.getLogger().setLevel(logging.ERROR)
        elif loglevel == "critical":
            logging.getLogger().setLevel(logging.CRITICAL)

    if args.awsprofile is None:
        logger.error("--aws-profile parameter is missing.")
        return 1
    else:
        aws_profile = args.awsprofile
        logger.info("AWS Profile: " + aws_profile)

    if args.region is not None:
        aws_region = args.region

    if args.vault_name is not None:
        vault_name = args.vault_name
        logger.info("Vault Name: " + vault_name)
    else:
        logger.error("--vault-name parameter is missing.")
        return 1

    if args.sns_topic is not None:
        sns_topic = args.sns_topic
    else:
        sns_topic = None

    if args.archive_id is not None:
        archive_id = args.archive_id
        logger.info("Archive Id: " + archive_id)
    else:
        archive_id = None

    if args.retrieval_tier is not None:
        retrieval_tier = args.retrieval_tier
    logger.info("Glacier Retrieval Tier: %s" % retrieval_tier)

    session = boto3.Session(profile_name=aws_profile, region_name=aws_region)
    sts = session.client('sts')
    account_id = sts.get_caller_identity()["Account"]
    if args.bucket_name is None:
        bucket_name = 'glacier-restored-%s-%s' % (account_id, aws_region)
        logger.info("Default Bucket For Restore: " + bucket_name)
    else:
        bucket_name = args.bucket_name
        logger.info("Specified Bucket For Restore: " + bucket_name)
    s3 = session.client('s3')
    glacier = session.client('glacier')
    response = s3.list_buckets()
    if not any(item['Name'] == bucket_name for item in response['Buckets']):
        logger.info("Bucket name %s not found. Creating new bucket in region %s" % (bucket_name, aws_region))
        if aws_region != 'us-east-1':
            response = s3.create_bucket(Bucket=bucket_name,
                                        CreateBucketConfiguration={'LocationConstraint': aws_region})
        else:
            response = s3.create_bucket(Bucket=bucket_name)
        logger.debug(json_print(response))
        waiter = s3.get_waiter('bucket_exists')
        waiter.wait(Bucket=bucket_name)
        logger.info('Bucket creation successful.')
    logger.info("Found bucket name %s" % bucket_name)
    if sns_topic is not None:
        sns = session.client('sns')
        topic_arn = 'arn:aws:sns:%s:%s:%s' % (aws_region, account_id, sns_topic)
        logger.info("Topic Arn: %s" % topic_arn)
        try:
            response = sns.list_subscriptions_by_topic(TopicArn=topic_arn)
            for endpoint in response['Subscriptions']:
                logger.info("Email notify: %s" % endpoint['Endpoint'])
        except ClientError as e:
            logger.error(e.response['Error']['Message'])
            return 1
    if args.archive_id is not None:
        description = 'Retrieve archive on %s' % str(datetime.datetime.now())
        try:
            response = glacier.initiate_job(vaultName=vault_name, jobParameters={
                'Type': glacier_job_type, 'ArchiveId': archive_id, 'Description': description,
                'SNSTopic': topic_arn, 'Tier': retrieval_tier,
                }
            )
            logger.info(json_print(response))
            job_id = response['jobId']
            logger.info("Open python shelve file %s" % shelve_file)
            shelve_db = shelve.open(shelve_file, flag='c', writeback=True)
            if 'job_ids' not in shelve_db:
                shelve_db['job_ids'] = dict()
            logger.info("Archive Id %s" % archive_id)
            logger.info("Storing job id %s with archive id" % job_id)
            job_ids = shelve_db['job_ids']
            job_ids[archive_id] = job_id
            shelve_db['job_ids'] = job_ids
            for job_id in job_ids:
                logger.info("Job Id: %s" % job_id)
            logger.info("Close python shelve file.")
            shelve_db.close()
        except ClientError as e:
            logger.error(e.response['Error']['Message'])
            return 1
    else:
        logger.info("Open shelve db file %s" % shelve_file)
        shelve_db = shelve.open(shelve_file, flag='c', writeback=True)
        logger.info("Checking on glacier retrieval job that is in progress...")
        if 'job_ids' not in shelve_db:
            logger.info("There are no pending jobs.")
            return
        job_ids = shelve_db['job_ids']
        deleted_archive_ids = list()
        for archive_id in job_ids:
            job_id = job_ids[archive_id]
            try:
                response = glacier.describe_job(vaultName=vault_name, jobId=job_id)
                logger.info(json_print(response))
            except ClientError as e:
                logger.info(e.response['Error']['Message'])
                return 1
            if 'StatusCode' in response:
                status_code = response['StatusCode']
            else:
                status_code = ''
            if 'StatusMessage' in response:
                status_message = response['StatusMessage']
            else:
                status_message = ''
            if status_code == 'Succeeded' and status_message == 'Succeeded':
                logger.info("Archive Retrieval Successful")
                response = glacier.get_job_output(vaultName=vault_name, jobId=job_id)
                # logger.info(response)
                archive_description = response['archiveDescription']
                archive_checksum = response['checksum']
                filename = archive_description.split("/")[-1:][0]
                archive_file = workspace + '/' + filename
                logger.info("Saving data to %s " % archive_file)
                file = open(archive_file, "wb")
                file.write(response['body'].read())
                file.flush()
                file.close()
                file_checksum = calculate_tree_hash(open(archive_file, 'rb'))
                logger.info("Archived File checksum: %s" % archive_checksum)
                logger.info("Download File checksum: %s" % file_checksum)
                if archive_checksum != file_checksum:
                    logger.error("Download archive file has a different checksum.")
                    os.remove(archive_file)
                else:
                    logger.info("Download file checksum validation successful.")
                    logger.info("Removing the archive id from shelve db queue.")
                    deleted_archive_ids.append(archive_id)
                    s3key = archive_description
                    s3_upload_file(s3, bucket_name, s3key, archive_file)
            else:
                logger.info("Job Id: %s" % job_id)
                if status_code != '':
                    logger.info("Status Code: %s" % status_code)
                if status_message != '':
                    logger.info("Status Message: %s" % status_message)
        if len(deleted_archive_ids) != 0:
            logger.info("Removing job ids that has been completed.")
            for archive_id in deleted_archive_ids:
                del job_ids[archive_id]
            shelve_db['job_ids'] = job_ids
        logger.info("Closing shelve db for retrieval job.")
        shelve_db.close()
    return
Example #30
def upload_to_vault(access_key: str,
                    secret_key: str,
                    vault_name: str,
                    file_name: str,
                    log: logging.Logger = None,
                    archive_name: str = None,
                    region: str = 'ap-south-1') -> Optional[dict]:
  """Upload archive to S3 Glacier.

  Uploads files to S3 Glacier for archival.

  Args:
    access_key: AWS access key.
    secret_key: AWS secret key.
    vault_name: Glacier vault to upload to.
    file_name: Local file to upload.
    log: Logger object for logging the status.
    archive_name: Name (default: None) for the uploaded archive.
    region: AWS region to use (default: ap-south-1).

  Returns:
    Dictionary/Response of the uploaded archived file.
  """
  # You can find the reference code here:
  # https://stackoverflow.com/a/52602270
  try:
    glacier = boto3.client('glacier',
                           aws_access_key_id=access_key,
                           aws_secret_access_key=secret_key,
                           region_name=region)
  except (ClientError, NoCredentialsError):
    log.error('Wrong credentials used to access the AWS account.')
    return None
  else:
    if archive_name is None:
      try:
        archive_name = os.path.basename(file_name)
      except FileNotFoundError:
        log.error('File not found.')
        return None

    upload_chunk = 2 ** 25
    mp_upload = glacier.initiate_multipart_upload
    mp_part = glacier.upload_multipart_part
    cp_upload = glacier.complete_multipart_upload
    multipart_archive_upload = mp_upload(vaultName=vault_name,
                                         archiveDescription=file_name,
                                         partSize=str(upload_chunk))

    file_size = os.path.getsize(file_name)
    multiple_parts = math.ceil(file_size / upload_chunk)

    with open(file_name, 'rb') as upload_archive:
      for idx in range(multiple_parts):
        min_size = idx * upload_chunk
        max_size = min_size + upload_chunk - 1

        if max_size >= file_size:
          max_size = file_size - 1
        file_part = upload_archive.read(upload_chunk)
        mp_part(vaultName=vault_name,
                uploadId=multipart_archive_upload['uploadId'],
                range=f'bytes {min_size}-{max_size}/{file_size}',
                body=file_part)

    checksum = calculate_tree_hash(open(file_name, 'rb'))
    complete_upload = cp_upload(vaultName=vault_name,
                                uploadId=multipart_archive_upload['uploadId'],
                                archiveSize=str(file_size),
                                checksum=checksum)

    log.info(f'"{file_name}" file archived on AWS S3 Glacier.')
    return complete_upload
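A hedged usage sketch for the function above; the credentials, vault name, and file path are placeholders, not values from the original project.

import logging

logging.basicConfig(level=logging.INFO)
result = upload_to_vault(access_key='PLACEHOLDER_KEY',
                         secret_key='PLACEHOLDER_SECRET',
                         vault_name='my-backups',
                         file_name='backup-2021-01.tar.gz',
                         log=logging.getLogger(__name__))
if result is not None:
    print(result['archiveId'])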
Example #31
                vaultName=vault_name,
                uploadId=upload['uploadId'],
                range=data_range,
                body=data)
        except Exception as e:
            if retry == max_retry:
                print('Max number of retries reached.')
                response = client.abort_multipart_upload(
                    vaultName=vault_name, uploadId=upload['uploadId'])
                sys.exit(1)
            print(str(e) + ' Retry...')
            retry += 1
            continue
        print "OK"
        break

file.seek(0)
file_checksum = calculate_tree_hash(file)

upload_complete = client.complete_multipart_upload(vaultName=vault_name,
                                                   uploadId=upload['uploadId'],
                                                   archiveSize=str(file_len),
                                                   checksum=file_checksum)
file.close()
if upload_complete['ResponseMetadata']['HTTPStatusCode'] == 201:
    print("Upload complete")
    pprint(upload_complete)
else:
    print("Upload failed")
    sys.exit(1)
Example #32
def main(argv):
    global logger, aws_region, aws_profile, logging
    logger.info('Started')

    args = parse_args()
    if args.loglevel is False:
        logger.info("Using default logging level: INFO")
    else:
        loglevel = args.loglevel
        logger.info("Changing logging level: " + loglevel.upper())
        if loglevel == "debug":
            logging.getLogger().setLevel(logging.DEBUG)
        elif loglevel == "info":
            logging.getLogger().setLevel(logging.INFO)
        elif loglevel == "error":
            logging.getLogger().setLevel(logging.ERROR)
        elif loglevel == "critical":
            logging.getLogger().setLevel(logging.CRITICAL)

    if args.awsprofile is False:
        logger.error("--aws-profile parameter is missing")
        return 1
    else:
        aws_profile = args.awsprofile
        logger.info("AWS Profile: " + aws_profile)

    if args.vault_name is False:
        logger.error("--vault-name parameter is missing")
        return 1
    else:
        vault_name = args.vault_name
        logger.info("Vault Name: " + vault_name)

    if args.bucket_name is False:
        logger.error("--bucket-name parameter is missing")
        return 1
    else:
        bucket_name = args.bucket_name
        logger.info("Bucket Name: " + bucket_name)
    if args.region is False:
        aws_region = 'us-east-1'
    else:
        aws_region = args.region
    logger.info("Number of days: %s" % args.expired_days)
    session = boto3.Session(profile_name=aws_profile)
    s3 = session.client('s3')
    response = s3.list_buckets()
    if not any(item['Name'] == bucket_name for item in response['Buckets']):
        logger.error("Bucket name %s not found" % bucket_name)
        return 1
    logger.info("Found bucket name %s" % bucket_name)
    logger.info("Retrieving a list of files from S3 bucket based on Dynamodb table names")
    list_of_files = []
    dynamodb_client = session.client('dynamodb')
    response = dynamodb_client.list_tables(Limit=100)
    for table_name in response['TableNames']:
        is_truncated = True
        next_continuation_token = None
        while is_truncated:
            if next_continuation_token is None:
                response = s3.list_objects_v2(Bucket=bucket_name, Prefix=table_name, MaxKeys=1000)
            else:
                logger.info("Using next continuation token %s" % next_continuation_token)
                response = s3.list_objects_v2(Bucket=bucket_name, Prefix=table_name,
                                              MaxKeys=1000, ContinuationToken=next_continuation_token)
            for key in response.keys():
                logger.debug("Response: %s" % key)
            is_truncated = response['IsTruncated']
            key_count = response['KeyCount']
            logger.info("There are %s objects with prefix %s in bucket name %s" % (key_count, table_name, bucket_name))
            if 'NextContinuationToken' in response:
                next_continuation_token = response['NextContinuationToken']
            if key_count != 0:
                for item in response['Contents']:
                    # only upload if the archive size is bigger than 0 KB
                    if item['Size'] > 0 and not re.search(r'glacier_archive|logs/|manifest', item['Key'], re.M | re.I):
                        logger.debug("Found backup file: %s" % item['Key'])
                        if item not in list_of_files:
                            list_of_files.append(item)
                logger.debug("Truncated %s" % is_truncated)
                logger.debug(json_print(response['Contents']))
    logger.info("Looping through each file and determine if it needs to be archived.")
    for item in list_of_files:
        logger.debug(item)
        ts = time.time()
        archive_time = datetime.fromtimestamp(ts).strftime('%Y%m%dT%H%M%S')
        s3file_path = item['Key']
        upload_time = item['LastModified']
        file_size = item['Size']
        table_name, backup_time, file_name = s3file_path.split("/")
        bucket_path = table_name + "/" + backup_time
        logger.debug("Bucket Path: %s" % bucket_path)
        logger.debug("File %s | Upload Date %s | Size %s" % (s3file_path,
                                                            upload_time.strftime("%Y-%m-%d"), file_size))
        upload_date = upload_time.date()
        today = date.today()
        delta = today - upload_date
        number_of_days = delta.days
        year, month, day = str(today.strftime("%Y-%m-%d")).split("-")
        logger.debug("Upload date %s | Today: %s | Delta Days %s" % (upload_date,
                                                                     str(today.strftime("%Y-%m-%d")), number_of_days))
        if number_of_days < int(args.expired_days):
            logger.debug("File %s NOT older than %s days" % (file_name, args.expired_days))
            continue
        logger.info("File %s older than %s days will be archived to glacier vault %s." % (file_name, args.expired_days,
                                                                                          vault_name))
        my_file, file_extension = os.path.splitext(file_name)
        if len(file_extension) != 0:
            new_file_name = table_name + "-" + backup_time + file_extension
            logger.info("Rename the backup file %s with table name and extension %s" % (file_name, new_file_name))
        else:
            new_file_name = table_name + "-" + backup_time
            logger.info("Rename the backup file %s with table name %s" % (file_name, new_file_name))
        download_file = workspace + '/' + new_file_name
        manifest_file = bucket_path + '/manifest'
        download_manifest_file = workspace + '/manifest'
        try:
            response = s3_download_file(s3, bucket_name, s3file_path, download_file)
            response = s3_download_file(s3, bucket_name, manifest_file, download_manifest_file)
            logger.info("Download  backup file and manifest file status: Successful")
        except S3ResponseError:
            logger.error("Download %s from S3 Failed." % s3file_path)
            logger.error(json_print(response))
            logger.info("Delete downloaded file.")
            os.remove(download_file)
            os.remove(download_manifest_file)
            return 1
        # description = bucket_path + "/" + new_file_name
        response = archive_file_to_glacier_multipart(session, args.vault_name, s3file_path, download_file)
        logger.debug(json_print(response))
        logger.info("Calculate and compare checksum....")
        checksum = calculate_tree_hash(open(download_file, 'rb'))
        archive_checksum = response['checksum']
        logger.info(" Archive checksum: %s" % archive_checksum)
        logger.info("Expected checksum: %s" % checksum)
        if archive_checksum != checksum:
            logger.error("Upload archive %s has a different checksum.")
            os.remove(download_file)
            os.remove(download_manifest_file)
            return 1
        else:
            logger.info("Archive file %s checksum successful." % download_file)
        response["filename"] = s3file_path
        logger.info("Delete downloaded file")
        logger.info("Saving glacier archive id information to file.")
        manifest_data = open(download_manifest_file).read()
        response['manifest'] = json.loads(manifest_data)
        logger.debug(json_print(response))
        archive_id = response['archiveId']
        filename = response['filename']
        location = response['location']
        glacier_id_file = new_file_name + '-' + str(archive_time) + '.txt'
        with open(workspace + '/' + glacier_id_file, 'w') as outfile:
            json.dump(response, outfile, ensure_ascii=True)
        key_filename = "glacier_archived_id/{0}/{1}/{2}/{3}".format(year, month, day, glacier_id_file)
        with open(workspace + '/' + glacier_id_file) as f:
            object_data = f.read()
        logger.info("Upload archive id to s3 bucket for backup.")
        response = s3.put_object(Body=object_data, Bucket=bucket_name, Key=key_filename, ServerSideEncryption='AES256',
                                 StorageClass='STANDARD', ContentType='text/plain')
        logger.debug(json_print(response))
        os.remove(workspace + '/' + glacier_id_file)
        os.remove(download_manifest_file)
        os.remove(download_file)
        copy_archived_file(session, bucket_name, s3file_path)
        copy_archived_file(session, bucket_name, manifest_file)
        delete_folder_after_archive(s3, bucket_name, bucket_path)
        store_archive_id(archive_id, filename, checksum, location)
    logger.info('Finished')
    return
Example #33
 def test_tree_hash_multiple_of_one_mb(self):
     four_mb = six.BytesIO(b'a' * (4 * 1024 * 1024))
     self.assertEqual(
         calculate_tree_hash(four_mb),
         '9491cb2ed1d4e7cd53215f4017c23ec4ad21d7050a1e6bb636c4f67e8cddb844')
Example #34
 def test_tree_hash_offset_of_one_mb_multiple(self):
     offset_four_mb = six.BytesIO(b'a' * (4 * 1024 * 1024) + b'a' * 20)
     self.assertEqual(
         calculate_tree_hash(offset_four_mb),
         '12f3cbd6101b981cde074039f6f728071da8879d6f632de8afc7cdf00661b08f')
Example #35
 def get_treehash(self):
     treehash = ''
     with open(self.filename, 'rb') as f:
         treehash = calculate_tree_hash(f)
     return treehash
Example #36
#! /usr/bin/env python3
from sys import argv

from botocore.utils import calculate_tree_hash

fname = argv[1]

with open(fname, "rb") as f:
    print(calculate_tree_hash(f))

Example #37
upload_id = response['uploadId']

try:
    with open(filepath, 'rb') as fp:

        fp.seek(0, 2)
        total_size = fp.tell()
        fp.seek(0)
        offsets_start = range(0, total_size, part_size)
        offsets_end = [x + part_size - 1 for x in offsets_start]
        offsets_end[-1] = total_size - 1
        for i, offset_start in enumerate(offsets_start):
            offset_end = offsets_end[i]
            range_str = 'bytes {}-{}/*'.format(offset_start, offset_end)
            data = fp.read(part_size)
            # io.BytesIO wraps the bytes chunk (this assumes "import io" at the top)
            part_tree_hash = calculate_tree_hash(io.BytesIO(data))
            print('Sending part {} of {} ({})...'.format(
                i, len(offsets_start), range_str))
            print('- SHA256 tree hash (local):  {}'.format(part_tree_hash))
            while True:
                try:
                    response = client.upload_multipart_part(
                        vaultName=vault_name,
                        uploadId=upload_id,
                        checksum=part_tree_hash,
                        range=range_str,
                        body=data)
                    break
                except ClientError:
                    print('Trying again...')
            print('- SHA256 tree hash (remote): {}'.format(