def cli(filename, output):
    """Calculate the Merkle (tree) hash of a file."""
    hash_value = calculate_tree_hash(filename)
    if output == 'binary':
        hash_value = binascii.unhexlify(hash_value)
    click.echo(hash_value)

def multipart_upload(filename, part_size=PART_SIZE):
    glacier = boto3.resource('glacier', region_name='us-west-2')
    # There's no error if the vault already exists so we don't
    # need to catch any exceptions here.
    vault = glacier.create_vault(vaultName='botocore-integ-test-vault')
    file_size = os.path.getsize(filename)
    # Initiate a multipart upload
    multipart_upload = vault.initiate_multipart_upload(
        archiveDescription='multipart upload',
        partSize=str(part_size))
    try:
        # Upload each part
        for i in range(file_size // part_size + 1):
            range_from = i * part_size
            range_to = min((i + 1) * part_size - 1, file_size - 1)
            body = ReadFileChunk.from_filename(filename, range_from, part_size)
            multipart_upload.upload_part(
                body=body,
                range='bytes %d-%d/*' % (range_from, range_to))
        # Complete the multipart upload transaction
        response = multipart_upload.complete(
            checksum=calculate_tree_hash(open(filename, 'rb')),  # NEEDED
            archiveSize=str(file_size))
        return vault.Archive(response['archiveId'])
    except Exception:
        multipart_upload.abort()
        raise

def glacier_sync_multipart_upload(ctx, version, file_path, file_size):
    # Reference: https://boto3.readthedocs.io/en/latest/reference/services/glacier.html#Glacier.Vault.initiate_multipart_upload
    multipart_upload = ctx.vault.initiate_multipart_upload(
        archiveDescription=version.location['object'],
        partSize=str(GLACIER_PART_SIZE),
    )
    with open(file_path, 'rb') as fp:
        for byte_offset in range(0, file_size, GLACIER_PART_SIZE):
            part = fp.read(GLACIER_PART_SIZE)
            range_header = 'bytes {}-{}/{}'.format(
                byte_offset, byte_offset + len(part) - 1, file_size)
            multipart_upload.upload_part(
                range=range_header,
                body=part,
            )
    # TODO: Ideally this would be computed on upload; it is also a good
    # double check, so we do not incur any off-by-one issues.
    # See https://boto3.readthedocs.io/en/latest/reference/services/glacier.html#Glacier.MultipartUpload.complete
    checksum = calculate_tree_hash(open(file_path, 'rb'))
    response = multipart_upload.complete(
        archiveSize=str(file_size),
        checksum=checksum,
    )
    assert response['checksum'] == checksum
    return response['archiveId']

def add_glacier_checksums(params, **kwargs):
    """Add glacier checksums to the http request.

    This will add two headers to the http request:

        * x-amz-content-sha256
        * x-amz-sha256-tree-hash

    These values will only be added if they are not present
    in the HTTP request.

    """
    request_dict = params
    headers = request_dict['headers']
    body = request_dict['body']
    if isinstance(body, six.binary_type):
        # If the user provided a bytes type instead of a file
        # like object, we temporarily create a BytesIO object
        # so we can use the util functions to calculate the
        # checksums, which assume file like objects.  Note that
        # we're not actually changing the body in the request_dict.
        body = six.BytesIO(body)
    starting_position = body.tell()
    if 'x-amz-content-sha256' not in headers:
        headers['x-amz-content-sha256'] = utils.calculate_sha256(
            body, as_hex=True)
    body.seek(starting_position)
    if 'x-amz-sha256-tree-hash' not in headers:
        headers['x-amz-sha256-tree-hash'] = utils.calculate_tree_hash(body)
    body.seek(starting_position)

def archive_file_to_glacier_multipart(session, vault_name, archive_desc, archive_file):
    """
    Upload a file to an AWS Glacier vault using a multipart upload.

    :param session: boto3 session
    :param vault_name: name of the Glacier vault
    :param archive_desc: archive description
    :param archive_file: path of the file to upload
    :return: the complete() response
    """
    # use a 32 MB chunk size
    chunk_size = 1048576 * 32
    glacier = session.resource("glacier")
    logger.info("Upload %s to glacier vault %s" % (archive_file, vault_name))
    vault = glacier.Vault(account_id="-", name=vault_name)
    multipart_upload = vault.initiate_multipart_upload(
        accountId="-",
        archiveDescription=archive_desc,
        partSize=str(chunk_size))
    # upload_id = multipart_upload.id
    f = open(archive_file, "rb")
    start_range = 0
    for chunk in read_in_chunks(f, chunk_size):
        range_data = "bytes %s-%s/*" % (start_range, f.tell() - 1)
        logger.info("Uploading range %s" % range_data)
        multipart_upload.upload_part(range=range_data, body=chunk)
        start_range = f.tell()
    f.seek(0)
    response = multipart_upload.complete(
        archiveSize=str(start_range),
        checksum=calculate_tree_hash(f))
    f.close()
    # archive_id = response.get('archiveId')
    return response

def finalize_upload(self):
    self.file_handle.seek(0)
    filehash = calculate_tree_hash(self.file_handle)
    self.glacier.complete_multipart_upload(
        vaultName=self.vault,
        uploadId=self.multipart_upload_id,
        archiveSize=str(self.file_size),
        checksum=filehash,
    )

def complete_multipart_upload(self, upload_id, archive):
    size = archive.tell()
    archive.seek(0)
    checksum = calculate_tree_hash(archive)
    response = self.client.complete_multipart_upload(
        vaultName=self.vault_name,
        uploadId=upload_id,
        archiveSize=str(size),
        checksum=checksum)
    return response['archiveId']

def checksum(self):
    """
    Calculate the checksum for the upload, as needed by the Glacier API.

    :rtype: str
    """
    if self._checksum is None:
        self._checksum = calculate_tree_hash(self.data)
        if self.completed:
            self._data = None
    return self._checksum

def finalize(self):
    step = 'finalize'
    total_checksum = calculate_tree_hash(open(self.filepath, 'rb'))
    self.responses[step] = get_client().complete_multipart_upload(
        archiveSize=str(get_file_size(self.filepath)),
        checksum=total_checksum,
        uploadId=self.upload_id,
        vaultName=self.vault_name,
    )
    if self.responses[step].get('checksum', None) == total_checksum:
        self.archive_id = self.responses[step]['archiveId']
        logging.info('multipart upload %s finished with archive ID %s'
                     % (self.filepath, self.archive_id))
    else:
        logging.info('multipart upload %s failed' % self.filepath)

def multipart_upload(account_id, chunk_size, file_name, glacier_vault, out_file):
    file_size = os.path.getsize(file_name)
    parts = math.ceil(file_size / chunk_size)
    client = boto3.client('glacier')
    try:
        # initiate the multipart upload
        upload_obj = client.initiate_multipart_upload(
            accountId=account_id,
            vaultName=glacier_vault,
            archiveDescription=file_name,
            partSize=str(chunk_size))
        print(upload_obj)
        # upload the file in chunks
        with open(file_name, 'rb') as upload:
            for p in range(parts):
                lower = p * chunk_size
                upper = lower + chunk_size - 1
                if upper >= file_size:
                    upper = file_size - 1
                file_part = upload.read(chunk_size)
                up_part = client.upload_multipart_part(
                    accountId=account_id,
                    vaultName=glacier_vault,
                    uploadId=upload_obj['uploadId'],
                    range='bytes {}-{}/*'.format(lower, upper),
                    body=file_part)
        # complete the upload
        checksum = calculate_tree_hash(open(file_name, 'rb'))
        complete_upload = client.complete_multipart_upload(
            accountId=account_id,
            vaultName=glacier_vault,
            uploadId=upload_obj['uploadId'],
            archiveSize=str(file_size),
            checksum=checksum)
    except ClientError as e:
        logging.error(e)
        sys.exit()
    print("complete archiving " + file_name + " in Glacier vault " + glacier_vault)
    print(complete_upload)
    with open(out_file, "w") as file:
        file.write(str(complete_upload))

def upload(self):
    """
    Upload the archive to Amazon Glacier by chunking it.
    """
    self.start_upload()
    while not self.upload_part():
        continue
    # Move to the front of the file to calculate its hash
    self.zip_file.seek(0)
    zip_hash = calculate_tree_hash(self.zip_file)
    response = self.client.complete_multipart_upload(
        vaultName=self.vault_name,
        uploadId=self.upload_id,
        archiveSize=str(self.zip_file_size),
        checksum=zip_hash
    )
    return response

def upload_large_file(vault_name, filepath, description):
    """
    Do a multipart upload to Glacier.

    :param vault_name: name of the Glacier vault
    :param filepath: path of the file to upload
    :param description: archive description
    :return: the complete() response
    """
    glacier = boto3.resource("glacier")
    vault = glacier.Vault(account_id="-", name=vault_name)
    multipart_upload = vault.initiate_multipart_upload(
        accountId="-",
        archiveDescription=description,
        partSize=str(CHUNK_SIZE))
    upload_id = multipart_upload.id
    print("Upload id: {}".format(upload_id))
    with open(filepath, 'rb') as f:
        retrylist = upload_segments(multipart_upload, read_in_chunks(f, CHUNK_SIZE))
        f.seek(0, 2)
        fsize = f.tell()
        while len(retrylist) > 0:
            print("Retrying failed parts")
            # generator expression turns the list into a generator
            retrylist = upload_segments(multipart_upload, (i for i in retrylist))
        print("Finalizing upload {} ...".format(upload_id))
        f.seek(0)
        s256t_hash = calculate_tree_hash(f)
        response = multipart_upload.complete(archiveSize=str(fsize), checksum=s256t_hash)
        print("Hash: {}".format(s256t_hash))
        pprint.pprint(response)
        return response

def upload_archive(client, vault_name, file_name, description, concurrency):
    file_size = os.stat(file_name).st_size
    part_size = calculate_part_size(file_size)
    info('Initiating multipart upload')
    upload_id = client.initiate_multipart_upload(
        vaultName=vault_name,
        partSize=str(part_size),
        archiveDescription=description)['uploadId']
    parts = generate_archive_parts(vault_name, file_name, upload_id, file_size, part_size)
    base_name = os.path.basename(file_name)
    info(f'Uploading {base_name} in {len(parts)} parts')
    with Pool(concurrency) as pool:
        pbar = tqdm(total=file_size, unit="", unit_scale=True, dynamic_ncols=True,
                    bar_format='{percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} ({rate_fmt})')
        for uploaded_size in pool.imap_unordered(upload_archive_part, parts):
            pbar.update(uploaded_size)
        pbar.close()
    info('Verifying checksum')
    checksum = calculate_tree_hash(open(file_name, 'rb'))
    return client.complete_multipart_upload(
        vaultName=vault_name,
        uploadId=upload_id,
        archiveSize=str(file_size),
        checksum=checksum)

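# calculate_part_size() above is defined elsewhere in that project and not shown
# here. A plausible sketch (hypothetical, not the project's code), assuming
# Glacier's documented constraints that a part size must be a power of two
# between 1 MiB and 4 GiB and that a multipart upload may contain at most
# 10,000 parts, might look like this:
def calculate_part_size(file_size, max_parts=10000):
    part_size = 1024 * 1024  # start at the 1 MiB minimum
    # Double the part size until the part count fits within the limit,
    # capped at the 4 GiB maximum.
    while part_size * max_parts < file_size and part_size < 4 * 1024 * 1024 * 1024:
        part_size *= 2
    return part_size
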
def multipart_upload(self, path_to_file, part_size=4, description=""):
    """Upload a file in multiple parts.

    Args:
        path_to_file (str): Path to the file.
        part_size (int, optional): Size of the multipart parts.
            Defaults to 4 megabytes.
        description (str, optional): Description of what is uploaded.
    """
    if self.validator.preupload_checks(path_to_file, part_size) and self._vault_exists(self.vault_name):
        total_size = get_file_size(path_to_file)
        part_size_bytes = get_allowed_sizes().get(str(part_size))
        parts = get_needed_parts(path_to_file, part_size_bytes, total_size)
        parts = add_byte_ranges(parts)
        response = self._initiate_multipart_upload(description, part_size_bytes, total_size)
        if self.validator.is_response_ok(response):
            upload_id = response.get("uploadId")
            upload_success = self._do_multipart_upload(upload_id, path_to_file, parts)
            if upload_success:
                self.logger.info("Calculating tree hash...")
                with open(path_to_file, 'rb') as file_object:
                    total_hash = calculate_tree_hash(file_object)
                completed_response = self._complete_multipart_upload(upload_id, total_size, total_hash)
                if self.validator.is_response_ok(completed_response):
                    self.logger.info("Upload completed.")

def update_local_inventory(local_dir, local_filenames, old_local_inventory):
    """Update the local inventory stored as a json file in the local
    directory. The updated local inventory is returned."""
    archive_list = list()
    old_local_inventory_dict = dictify_inventory(old_local_inventory)
    for f in tqdm(local_filenames):
        if f in old_local_inventory_dict:
            checksum = old_local_inventory_dict[f]
        else:
            with open(os.path.join(local_dir, f), 'rb') as target_file:
                checksum = calculate_tree_hash(target_file)
        archive_list.append({
            "ArchiveDescription": f,
            "CreationDate": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            "SHA256TreeHash": checksum
        })
    res = {"ArchiveList": archive_list}
    return res

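# For reference, the value returned by update_local_inventory() mirrors the
# shape of a Glacier inventory listing. An illustrative example (the file name
# is made up; the hash is borrowed from the 1 MiB test case below):
example_inventory = {
    "ArchiveList": [
        {
            "ArchiveDescription": "photos-2019.tar",
            "CreationDate": "2020-01-31 12:00:00",
            "SHA256TreeHash": "9bc1b2a288b26af7257a36277ae3816a7d4f16e89c1e7e77d0a5c48bad62b360",
        }
    ]
}
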
def test_tree_hash_exactly_one_mb(self):
    one_meg_bytestring = b'a' * (1 * 1024 * 1024)
    one_meg = six.BytesIO(one_meg_bytestring)
    self.assertEqual(
        calculate_tree_hash(one_meg),
        '9bc1b2a288b26af7257a36277ae3816a7d4f16e89c1e7e77d0a5c48bad62b360')

def test_tree_hash_less_than_one_mb(self):
    one_k = six.BytesIO(b'a' * 1024)
    self.assertEqual(
        calculate_tree_hash(one_k),
        '2edc986847e209b4016e141a6dc8716d3207350f416969382d431539bf292e4a')

def test_empty_tree_hash(self):
    self.assertEqual(
        calculate_tree_hash(six.BytesIO(b'')),
        'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855')

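# The tests above pin down the edge cases of the Glacier SHA-256 tree hash.
# For reference, a minimal sketch of that algorithm (an illustration, not
# botocore's implementation; tree_hash is a hypothetical helper): hash the
# body in 1 MiB chunks, then combine adjacent digests pairwise until a single
# root digest remains.
import hashlib

MEGABYTE = 1024 * 1024


def tree_hash(fileobj):
    # Hash the body in 1 MiB chunks.
    chunks = []
    while True:
        data = fileobj.read(MEGABYTE)
        if not data:
            break
        chunks.append(hashlib.sha256(data).digest())
    if not chunks:
        # An empty body reduces to the SHA-256 of the empty string,
        # which is what test_empty_tree_hash above expects.
        return hashlib.sha256(b'').hexdigest()
    # Combine adjacent digests pairwise until only the root digest remains.
    while len(chunks) > 1:
        paired = []
        for i in range(0, len(chunks), 2):
            if i + 1 < len(chunks):
                paired.append(hashlib.sha256(chunks[i] + chunks[i + 1]).digest())
            else:
                # An odd trailing digest is carried up unchanged.
                paired.append(chunks[i])
        chunks = paired
    return chunks[0].hex()
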
def mpUpload(fname, vault, desc, acctid, hrglass):
    '''Initiate a multipart upload, split the file into parts, calculate
    checksums and tree hash values for each, and upload to Glacier in
    parallel according to the number of cores in the machine.'''
    startime = time.time()
    chunkscheme = chunksize(fname)  # chunkscheme[0] = size of chunk
    fullsize = getsize(fname)
    size, partcount = chunkscheme[0], chunkscheme[1]
    # graph = ''
    try:
        if size < 42949672960:  # 40 GiB
            try:
                print('calling aws initiate function')
                try:
                    desc = desc.replace('*', '+')
                except Exception:
                    pass
                mpu = glacier_client.initiate_multipart_upload(
                    vaultName=vault, archiveDescription=desc, partSize=str(size))
            except (botocore.exceptions.ParamValidationError,
                    botocore.exceptions.ClientError):
                print('Multipart Upload operation failed, {} not uploaded to {}'.format(fname, vault))
                return None
            upid = mpu['uploadId']
            print('\nMultipart Upload initiated for {} to {}\n Upload Request ID: {}'.format(fname, vault, upid))
            try:
                print(hrglass)
            except Exception:
                pass
            print('This may take a while, {} is {}, watching it happen is not recommended...\n'.format(fname, size_display(fullsize)))
            print('\nsplitting input file into {} parts\n'.format(partcount))
            presplit = time.time() - startime
            print('time {}'.format(timer(startime, time.time())))
            filist = unxplit(fname, size, PART_PREFIX)
            postplit = time.time() - presplit
            print('time: {}'.format(timer(postplit, time.time())))
            of = len(filist)
            if of != partcount:
                print("of: {} doesn't equal partcount: {}...".format(of, partcount))
                trash(PART_PREFIX)
                sys.exit()
            totrange = 0
            # response = []
            all_params = []
            # iterate over each file part to compile parameters
            print('Compiling parameters for upload segments...')
            prop = 0
            for part in filist:
                part_params = {
                    'xpart': part,
                    'vault': vault,
                    'upid': upid,
                    'acctid': ACCOUNT_ID
                }
                num = filist.index(part) + 1
                bytestring = open(part, 'rb')
                thash = str(core.calculate_tree_hash(bytestring))
                part_params.update({'xthash': thash})
                bytestring.close()
                # size is the closest power of two that is greater than the size of the part
                btrange = totrange + getsize(part)
                prtrange = (totrange, btrange - 1)
                part_params.update({'xrange': prtrange})
                all_params.append(part_params)
                # display progress
                comprop = prop
                prop = (len(all_params) * 100) / of
                if prop != comprop:
                    print(bar(prop), ' {} of {}'.format(num, of))
                # increment how much of the file has now been accounted for
                totrange += size  # range increases by size, which is larger than the size of the last part
            print('Uploading to Amazon Glacier...\n')
            # map part uploads into a parallel scheme using the pool
            ccount = 0
            errcount = 0
            done = round((ccount / of) * 100)  # done refers to the percentage
            pool = Pool(processes=cpu_count())
            for partload in pool.imap_unordered(partfeed, all_params):
                if not partload[2]:  # upload part failed
                    print('failed to upload range {}-{}'.format(partload[0], partload[1]))
                    errcount += 1
                else:  # successful part upload
                    pdone = done
                    ccount += 1
                    done = (ccount / of) * 100
                    if done != pdone:
                        print('byterange {}-{} successfully uploaded'.format(partload[0], partload[1]))
                        print(bar(done))
                        print('{} of {} successfully uploaded'.format(ccount, of))
            pool.close()
            pool.join()
            pool.terminate()
            pool = Pool(processes=cpu_count())
            print('{} failed parts'.format(errcount))
            if errcount != 0:
                # firstpass = time.time() - postplit
                errcount = 0
                uploaded = glacier_client.list_parts(vaultName=vault, uploadId=upid)['Parts']
                # check to see if all parts have uploaded
                while len(uploaded) < len(all_params):
                    print('time: {}'.format(timer(postplit, time.time())))
                    postplit = time.time()
                    errcount = 0
                    print('retrying {} failed parts'.format(errcount))
                    rangit = [i['RangeInBytes'] for i in uploaded]
                    # byte ranges already received by the AWS server
                    checkparts = (tuple(byterange.split('-')) for byterange in rangit)
                    rcheckparts = [(int(ad[0]), int(ad[1])) for ad in checkparts]
                    remains = [left for left in all_params if left['xrange'] not in rcheckparts]
                    for partload in pool.imap(partfeed, remains):
                        if not partload[2]:
                            print('failed to upload range: {}-{}'.format(partload[0], partload[1]))
                            errcount += 1
                        else:
                            ccount += 1
                            print('bytes {}-{} (part {} of {}) successfully uploaded!'.format(partload[0], partload[1], ccount, of))
                            print(bar(round((ccount / of) * 100)))
                    print('{} of {} successfully uploaded'.format(ccount, of))
                    print('{} failed parts'.format(errcount))
                    pool.close()
                    pool.join()
                    uploaded = glacier_client.list_parts(vaultName=vault, uploadId=upid)['Parts']
                pool.terminate()
                pool = Pool(processes=cpu_count())
            # all parts uploaded; compute the full tree hash and complete the upload
            with open(fname, 'rb') as f:
                full_tree_hash = core.calculate_tree_hash(f)
            completion = glacier_client.complete_multipart_upload(
                vaultName=vault,
                uploadId=upid,
                archiveSize=str(fullsize),
                checksum=full_tree_hash,
                accountId=acctid)
            # trash(PART_PREFIX)
            print('\nMultipart Upload of Archive: {} to Vault {} Completed\n'.format(fname, vault))
            pool.terminate()
            # for safety, * is the marker string for the history update
            output = {
                'FileName': fname,
                'Description': "{}: {} *{}* ".format(
                    completion['ResponseMetadata']['HTTPHeaders']['date'], desc, fname),
                'VaultName': vault,
                'ArchiveId': completion['archiveId'],
                'Size': str(fullsize)
            }
            return output
        else:
            print('Amazon does not support archive files over 40 GB, {} is {}'.format(
                fname, size_display(fullsize)))
            return None
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        name = split(exc_tb.tb_frame.f_code.co_filename)[1]
        print('Python raised the following exception: {} {} {} {}'.format(
            exc_type, e, name, exc_tb.tb_lineno))
        glacier_client.abort_multipart_upload(vaultName=vault, uploadId=upid)
        return None
    finally:
        trash(PART_PREFIX)

def test_tree_hash_offset_of_one_mb_multiple(self):
    offset_four_mb = six.BytesIO(b'a' * (4 * 1024 * 1024) + b'a' * 20)
    self.assertEqual(
        calculate_tree_hash(offset_four_mb),
        '12f3cbd6101b981cde074039f6f728071da8879d6f632de8afc7cdf00661b08f')

def test_tree_hash_multiple_of_one_mb(self):
    four_mb = six.BytesIO(b'a' * (4 * 1024 * 1024))
    self.assertEqual(
        calculate_tree_hash(four_mb),
        '9491cb2ed1d4e7cd53215f4017c23ec4ad21d7050a1e6bb636c4f67e8cddb844')

def upload_archive(transfer_file, vault_name):
    glacier_client = boto3.client('glacier',
                                  region_name='ap-southeast-2',
                                  aws_access_key_id='DUMMY_ID',
                                  aws_secret_access_key='DUMMY_ACCESS')
    total = os.path.getsize(transfer_file)
    size = ceiling_log(total / 32, 2)  # 32 cores on this NCI machine
    init_mp_upl_resp = glacier_client.initiate_multipart_upload(
        vaultName=vault_name,
        archiveDescription='2000_062 cwb waveform data',
        partSize=str(size))
    print(init_mp_upl_resp['uploadId'])
    # write_uploadid(os.path.splitext(transfer_file)[0] + '.id', init_mp_upl_resp['uploadId'])
    '''
    # Running 32 parallel threads with the code below caused an argument mismatch
    # error in the multiprocessing code, so continuing with serial upload for now.
    # Will investigate and try to fix the parallel upload.
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    tasks = []
    start = 0
    while start < total:
        tasks.append((transfer_file, start, size, init_mp_upl_resp['uploadId']))
        start += size
    results = [pool.apply_async(transfer_part, t) for t in tasks]
    all_uploaded = True
    for i, result in enumerate(results):
        if not result.get():
            print('Part number ', i, ' was not uploaded successfully')
            all_uploaded = False
    '''
    all_uploaded = True
    start = 0
    while start < total:
        if transfer_part(transfer_file, start, size, init_mp_upl_resp['uploadId'], vault_name):
            start += size
        else:
            all_uploaded = False
    if all_uploaded:
        print("All Files Uploaded")
        print("Verifying Checksum...")
        complete_up = glacier_client.complete_multipart_upload(
            vaultName=vault_name,
            uploadId=init_mp_upl_resp['uploadId'],
            archiveSize=str(total),
            checksum=calculate_tree_hash(open(transfer_file, 'rb')))
        print("Upload Completed:", complete_up)
    else:
        print("Upload of archive file:", transfer_file, " failed...")

def main(argv):
    global logger, aws_region, aws_profile, logging, retrieval_tier, sns_topic
    logger.info('Started')
    topic_arn = ''
    args = parse_args()
    if args.loglevel is False:
        logger.info("Using default logging level: INFO")
    else:
        loglevel = args.loglevel
        logger.info("Changing logging level: " + loglevel.upper())
        if loglevel == "debug":
            logging.getLogger().setLevel(logging.DEBUG)
        elif loglevel == "info":
            logging.getLogger().setLevel(logging.INFO)
        elif loglevel == "error":
            logging.getLogger().setLevel(logging.ERROR)
        elif loglevel == "critical":
            logging.getLogger().setLevel(logging.CRITICAL)
    if args.awsprofile is None:
        logger.error("--aws-profile parameter is missing.")
        return 1
    else:
        aws_profile = args.awsprofile
        logger.info("AWS Profile: " + aws_profile)
    if args.region is not None:
        aws_region = args.region
    if args.vault_name is not None:
        vault_name = args.vault_name
        logger.info("Vault Name: " + vault_name)
    else:
        logger.error("--vault-name parameter is missing.")
        return 1
    if args.sns_topic is not None:
        sns_topic = args.sns_topic
    else:
        sns_topic = None
    if args.archive_id is not None:
        archive_id = args.archive_id
        logger.info("Archive Id: " + archive_id)
    else:
        archive_id = None
    if args.retrieval_tier is not None:
        retrieval_tier = args.retrieval_tier
        logger.info("Glacier Retrieval Tier: %s" % retrieval_tier)
    session = boto3.Session(profile_name=aws_profile, region_name=aws_region)
    sts = session.client('sts')
    account_id = sts.get_caller_identity()["Account"]
    if args.bucket_name is None:
        bucket_name = 'glacier-restored-%s-%s' % (account_id, aws_region)
        logger.info("Default Bucket For Restore: " + bucket_name)
    else:
        bucket_name = args.bucket_name
        logger.info("Specified Bucket For Restore: " + bucket_name)
    s3 = session.client('s3')
    glacier = session.client('glacier')
    response = s3.list_buckets()
    if not any(item['Name'] == bucket_name for item in response['Buckets']):
        logger.info("Bucket name %s not found. Creating new bucket in region %s" % (bucket_name, aws_region))
        if aws_region != 'us-east-1':
            response = s3.create_bucket(Bucket=bucket_name,
                                        CreateBucketConfiguration={'LocationConstraint': aws_region})
        else:
            response = s3.create_bucket(Bucket=bucket_name)
        logger.debug(json_print(response))
        waiter = s3.get_waiter('bucket_exists')
        waiter.wait(Bucket=bucket_name)
        logger.info('Bucket creation successful.')
    logger.info("Found bucket name %s" % bucket_name)
    if sns_topic is not None:
        sns = session.client('sns')
        topic_arn = 'arn:aws:sns:%s:%s:%s' % (aws_region, account_id, sns_topic)
        logger.info("Topic Arn: %s" % topic_arn)
        try:
            response = sns.list_subscriptions_by_topic(TopicArn=topic_arn)
            for endpoint in response['Subscriptions']:
                logger.info("Email notify: %s" % endpoint['Endpoint'])
        except ClientError as e:
            logger.error(e.response['Error']['Message'])
            return 1
    if args.archive_id is not None:
        description = 'Retrieve archive on %s' % str(datetime.datetime.now())
        try:
            response = glacier.initiate_job(
                vaultName=vault_name,
                jobParameters={
                    'Type': glacier_job_type,
                    'ArchiveId': archive_id,
                    'Description': description,
                    'SNSTopic': topic_arn,
                    'Tier': retrieval_tier,
                })
            logger.info(json_print(response))
            job_id = response['jobId']
            logger.info("Open python shelve file %s" % shelve_file)
            shelve_db = shelve.open(shelve_file, flag='c', writeback=True)
            if 'job_ids' not in shelve_db:
                shelve_db['job_ids'] = dict()
            logger.info("Archive Id %s" % archive_id)
            logger.info("Storing job id %s with archive id" % job_id)
            job_ids = shelve_db['job_ids']
            job_ids[archive_id] = job_id
            shelve_db['job_ids'] = job_ids
            for job_id in job_ids:
                logger.info("Job Id: %s" % job_id)
            logger.info("Close python shelve file.")
            shelve_db.close()
        except ClientError as e:
            logger.error(e.response['Error']['Message'])
            return 1
    else:
        logger.info("Open shelve db file %s" % shelve_file)
        shelve_db = shelve.open(shelve_file, flag='c', writeback=True)
        logger.info("Checking on glacier retrieval job that is in progress...")
        if 'job_ids' not in shelve_db:
            logger.info("There are no pending jobs.")
            return
        job_ids = shelve_db['job_ids']
        deleted_archive_ids = list()
        for archive_id in job_ids:
            job_id = job_ids[archive_id]
            try:
                response = glacier.describe_job(vaultName=vault_name, jobId=job_id)
                logger.info(json_print(response))
            except ClientError as e:
                logger.info(e.response['Error']['Message'])
                return 1
            if 'StatusCode' in response:
                status_code = response['StatusCode']
            else:
                status_code = ''
            if 'StatusMessage' in response:
                status_message = response['StatusMessage']
            else:
                status_message = ''
            if status_code == 'Succeeded' and status_message == 'Succeeded':
                logger.info("Archive Retrieval Successful")
                response = glacier.get_job_output(vaultName=vault_name, jobId=job_id)
                # logger.info(response)
                archive_description = response['archiveDescription']
                archive_checksum = response['checksum']
                filename = archive_description.split("/")[-1:][0]
                archive_file = workspace + '/' + filename
                logger.info("Saving data to %s " % archive_file)
                file = open(archive_file, "wb")
                file.write(response['body'].read())
                file.flush()
                file.close()
                file_checksum = calculate_tree_hash(open(archive_file, 'rb'))
                logger.info("Archived File checksum: %s" % archive_checksum)
                logger.info("Download File checksum: %s" % file_checksum)
                if archive_checksum != file_checksum:
                    logger.error("Download archive file has a different checksum.")
                    os.remove(archive_file)
                else:
                    logger.info("Download file checksum validation successful.")
                    logger.info("Removing the archive id from shelve db queue.")
                    deleted_archive_ids.append(archive_id)
                    s3key = archive_description
                    s3_upload_file(s3, bucket_name, s3key, archive_file)
            else:
                logger.info("Job Id: %s" % job_id)
                if status_code != '':
                    logger.info("Status Code: %s" % status_code)
                if status_message != '':
                    logger.info("Status Message: %s" % status_message)
        if len(deleted_archive_ids) != 0:
            logger.info("Removing job ids that have been completed.")
            for archive_id in deleted_archive_ids:
                del job_ids[archive_id]
            shelve_db['job_ids'] = job_ids
        logger.info("Closing shelve db for retrieval job.")
        shelve_db.close()
    return

def upload_to_vault(access_key: str, secret_key: str, vault_name: str, file_name: str,
                    log: logging.Logger = None, archive_name: str = None,
                    region: str = 'ap-south-1') -> Optional[dict]:
    """Upload archive to S3 Glacier.

    Uploads files to S3 Glacier for archival.

    Args:
        access_key: AWS access key.
        secret_key: AWS secret key.
        vault_name: Glacier vault to upload to.
        file_name: Local file to upload.
        log: Logger object for logging the status.
        archive_name: Name (default: None) for the uploaded archive.
        region: AWS region (default: ap-south-1).

    Returns:
        Dictionary/Response of the uploaded archived file.
    """
    # You can find the reference code here:
    # https://stackoverflow.com/a/52602270
    try:
        glacier = boto3.client('glacier',
                               aws_access_key_id=access_key,
                               aws_secret_access_key=secret_key,
                               region_name=region)
    except (ClientError, NoCredentialsError):
        log.error('Wrong credentials used to access the AWS account.')
        return None
    else:
        if archive_name is None:
            try:
                archive_name = os.path.basename(file_name)
            except FileNotFoundError:
                log.error('File not found.')
                return None
        upload_chunk = 2 ** 25
        mp_upload = glacier.initiate_multipart_upload
        mp_part = glacier.upload_multipart_part
        cp_upload = glacier.complete_multipart_upload
        multipart_archive_upload = mp_upload(vaultName=vault_name,
                                             archiveDescription=file_name,
                                             partSize=str(upload_chunk))
        file_size = os.path.getsize(file_name)
        multiple_parts = math.ceil(file_size / upload_chunk)
        with open(file_name, 'rb') as upload_archive:
            for idx in range(multiple_parts):
                min_size = idx * upload_chunk
                max_size = min_size + upload_chunk - 1
                if max_size >= file_size:
                    max_size = file_size - 1
                file_part = upload_archive.read(upload_chunk)
                mp_part(vaultName=vault_name,
                        uploadId=multipart_archive_upload['uploadId'],
                        range=f'bytes {min_size}-{max_size}/{file_size}',
                        body=file_part)
        checksum = calculate_tree_hash(open(file_name, 'rb'))
        complete_upload = cp_upload(vaultName=vault_name,
                                    uploadId=multipart_archive_upload['uploadId'],
                                    archiveSize=str(file_size),
                                    checksum=checksum)
        log.info(f'"{file_name}" file archived on AWS S3 Glacier.')
        return complete_upload

                vaultName=vault_name,
                uploadId=upload['uploadId'],
                range=data_range,
                body=data)
        except Exception as e:
            if retry == max_retry:
                print 'Max number of retry.'
                response = client.abort_multipart_upload(
                    vaultName=vault_name,
                    uploadId=upload['uploadId'])
                sys.exit(1)
            print e.message + ' Retry...'
            retry += 1
            continue
        print "OK"
        break

file.seek(0)
file_checksum = calculate_tree_hash(file)
upload_complete = client.complete_multipart_upload(vaultName=vault_name,
                                                   uploadId=upload['uploadId'],
                                                   archiveSize=str(file_len),
                                                   checksum=file_checksum)
file.close()
if upload_complete['ResponseMetadata']['HTTPStatusCode'] == 201:
    print "Upload complete"
    pprint(upload_complete)
else:
    print "Upload failed"
    sys.exit(1)

def main(argv):
    global logger, aws_region, aws_profile, logging
    logger.info('Started')
    args = parse_args()
    if args.loglevel is False:
        logger.info("Using default logging level: INFO")
    else:
        loglevel = args.loglevel
        logger.info("Changing logging level: " + loglevel.upper())
        if loglevel == "debug":
            logging.getLogger().setLevel(logging.DEBUG)
        elif loglevel == "info":
            logging.getLogger().setLevel(logging.INFO)
        elif loglevel == "error":
            logging.getLogger().setLevel(logging.ERROR)
        elif loglevel == "critical":
            logging.getLogger().setLevel(logging.CRITICAL)
    if args.awsprofile is False:
        logger.error("--aws-profile parameter is missing")
        return 1
    else:
        aws_profile = args.awsprofile
        logger.info("AWS Profile: " + aws_profile)
    if args.vault_name is False:
        logger.error("--vault-name parameter is missing")
        return 1
    else:
        vault_name = args.vault_name
        logger.info("Vault Name: " + vault_name)
    if args.bucket_name is False:
        logger.error("--bucket-name parameter is missing")
        return 1
    else:
        bucket_name = args.bucket_name
        logger.info("Bucket Name: " + bucket_name)
    if args.region is False:
        aws_region = 'us-east-1'
    else:
        aws_region = args.region
    logger.info("Number of days: %s" % args.expired_days)
    session = boto3.Session(profile_name=aws_profile)
    s3 = session.client('s3')
    response = s3.list_buckets()
    if not any(item['Name'] == bucket_name for item in response['Buckets']):
        logger.error("Bucket name %s not found" % bucket_name)
        return 1
    logger.info("Found bucket name %s" % bucket_name)
    logger.info("Retrieving a list of files from S3 bucket based on Dynamodb table names")
    list_of_files = []
    dynamodb_client = session.client('dynamodb')
    response = dynamodb_client.list_tables(Limit=100)
    for table_name in response['TableNames']:
        is_truncated = True
        next_continuation_token = None
        while is_truncated:
            if next_continuation_token is None:
                response = s3.list_objects_v2(Bucket=bucket_name, Prefix=table_name, MaxKeys=1000)
            else:
                logger.info("Using next continuation token %s" % next_continuation_token)
                response = s3.list_objects_v2(Bucket=bucket_name, Prefix=table_name, MaxKeys=1000,
                                              ContinuationToken=next_continuation_token)
            for key in response.keys():
                logger.debug("Response: %s" % key)
            is_truncated = response['IsTruncated']
            key_count = response['KeyCount']
            logger.info("There are %s objects with prefix %s in bucket name %s" % (key_count, table_name, bucket_name))
            if 'NextContinuationToken' in response:
                next_continuation_token = response['NextContinuationToken']
            if key_count != 0:
                for item in response['Contents']:
                    # only upload if the archive size is bigger than 0 KB
                    if item['Size'] > 0 and not re.search(r'glacier_archive|logs/|manifest', item['Key'], re.M | re.I):
                        logger.debug("Found backup file: %s" % item['Key'])
                        if item not in list_of_files:
                            list_of_files.append(item)
            logger.debug("Truncated %s" % is_truncated)
            logger.debug(json_print(response['Contents']))
    logger.info("Looping through each file to determine if it needs to be archived.")
    for item in list_of_files:
        logger.debug(item)
        ts = time.time()
        archive_time = datetime.fromtimestamp(ts).strftime('%Y%m%dT%H%M%S')
        s3file_path = item['Key']
        upload_time = item['LastModified']
        file_size = item['Size']
        table_name, backup_time, file_name = s3file_path.split("/")
        bucket_path = table_name + "/" + backup_time
        logger.debug("Bucket Path: %s" % bucket_path)
        logger.debug("File %s | Upload Date %s | Size %s" % (s3file_path, upload_time.strftime("%Y-%m-%d"), file_size))
        upload_date = upload_time.date()
        today = date.today()
        delta = today - upload_date
        number_of_days = delta.days
        year, month, day = str(today.strftime("%Y-%m-%d")).split("-")
        logger.debug("Upload date %s | Today: %s | Delta Days %s" % (upload_date, str(today.strftime("%Y-%m-%d")), number_of_days))
        if number_of_days < int(args.expired_days):
            logger.debug("File %s NOT older than %s days" % (file_name, args.expired_days))
            continue
        logger.info("File %s older than %s days will be archived to glacier vault %s." % (file_name, args.expired_days, vault_name))
        my_file, file_extension = os.path.splitext(file_name)
        if len(file_extension) != 0:
            new_file_name = table_name + "-" + backup_time + file_extension
            logger.info("Rename the backup file %s with table name and extension %s" % (file_name, new_file_name))
        else:
            new_file_name = table_name + "-" + backup_time
            logger.info("Rename the backup file %s with table name %s" % (file_name, new_file_name))
        download_file = workspace + '/' + new_file_name
        manifest_file = bucket_path + '/manifest'
        download_manifest_file = workspace + '/manifest'
        try:
            response = s3_download_file(s3, bucket_name, s3file_path, download_file)
            response = s3_download_file(s3, bucket_name, manifest_file, download_manifest_file)
            logger.info("Download backup file and manifest file status: Successful")
        except S3ResponseError:
            logger.error("Download %s from S3 Failed." % s3file_path)
            logger.error(json_print(response))
            logger.info("Delete downloaded file.")
            os.remove(download_file)
            os.remove(download_manifest_file)
            return 1
        # description = bucket_path + "/" + new_file_name
        response = archive_file_to_glacier_multipart(session, args.vault_name, s3file_path, download_file)
        logger.debug(json_print(response))
        logger.info("Calculate and compare checksum....")
        checksum = calculate_tree_hash(open(download_file, 'rb'))
        archive_checksum = response['checksum']
        logger.info(" Archive checksum: %s" % archive_checksum)
        logger.info("Expected checksum: %s" % checksum)
        if archive_checksum != checksum:
            logger.error("Upload archive %s has a different checksum." % download_file)
            os.remove(download_file)
            os.remove(download_manifest_file)
            return 1
        else:
            logger.info("Archive file %s checksum successful." % download_file)
        response["filename"] = s3file_path
        logger.info("Delete downloaded file")
        logger.info("Saving glacier archive id information to file.")
        manifest_data = open(download_manifest_file).read()
        response['manifest'] = json.loads(manifest_data)
        logger.debug(json_print(response))
        archive_id = response['archiveId']
        filename = response['filename']
        location = response['location']
        glacier_id_file = new_file_name + '-' + str(archive_time) + '.txt'
        with open(workspace + '/' + glacier_id_file, 'w') as outfile:
            json.dump(response, outfile, ensure_ascii=True)
        key_filename = "glacier_archived_id/{0}/{1}/{2}/{3}".format(year, month, day, glacier_id_file)
        with open(workspace + '/' + glacier_id_file) as f:
            object_data = f.read()
        logger.info("Upload archive id to s3 bucket for backup.")
        response = s3.put_object(Body=object_data, Bucket=bucket_name, Key=key_filename,
                                 ServerSideEncryption='AES256', StorageClass='STANDARD',
                                 ContentType='text/plain')
        logger.debug(json_print(response))
        os.remove(workspace + '/' + glacier_id_file)
        os.remove(download_manifest_file)
        os.remove(download_file)
        copy_archived_file(session, bucket_name, s3file_path)
        copy_archived_file(session, bucket_name, manifest_file)
        delete_folder_after_archive(s3, bucket_name, bucket_path)
        store_archive_id(archive_id, filename, checksum, location)
    logger.info('Finished')
    return

def get_treehash(self):
    treehash = ''
    with open(self.filename, 'rb') as f:
        treehash = calculate_tree_hash(f)
    return treehash

#! /usr/bin/env python3
from sys import argv

from botocore.utils import calculate_tree_hash

fname = argv[1]

with open(fname, "rb") as f:
    print(calculate_tree_hash(f))

upload_id = response['uploadId']
try:
    with open(filepath, 'r') as fp:
        fp.seek(0, 2)
        total_size = fp.tell()
        fp.seek(0)
        offsets_start = range(0, total_size, part_size)
        offsets_end = [x + part_size - 1 for x in offsets_start]
        offsets_end[-1] = total_size - 1
        for i, offset_start in enumerate(offsets_start):
            offset_end = offsets_end[i]
            range_str = 'bytes {}-{}/*'.format(offset_start, offset_end)
            data = fp.read(part_size)
            part_tree_hash = calculate_tree_hash(cStringIO.StringIO(data))
            print 'Sending part {} of {} ({})...'.format(
                i, len(offsets_start), range_str)
            print '- SHA256 tree hash (local): {}'.format(part_tree_hash)
            while True:
                try:
                    response = client.upload_multipart_part(
                        vaultName=vault_name,
                        uploadId=upload_id,
                        checksum=part_tree_hash,
                        range=range_str,
                        body=data)
                    break
                except ClientError:
                    print 'Trying again...'
            print '- SHA256 tree hash (remote): {}'.format(