def calculate_stats(self, msg):
    start_time = parse_ts(msg['Service-Read'])
    end_time = parse_ts(msg['Service-Write'])
    elapsed_time = end_time - start_time
    if elapsed_time > self.max_time:
        self.max_time = elapsed_time
    if elapsed_time < self.min_time:
        self.min_time = elapsed_time
    self.total_time += elapsed_time.seconds
    if start_time < self.earliest_time:
        self.earliest_time = start_time
    if end_time > self.latest_time:
        self.latest_time = end_time
def parse_ts_extended(ts):
    warnings.warn(
        "parse_ts_extended has been deprecated and will be removed in version "
        "1.3 because boto.utils.parse_ts has subsumed the old functionality.",
        PendingDeprecationWarning
    )
    return parse_ts(ts)
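# For context, a minimal sketch (not taken from any of the projects above) of
# what `boto.utils.parse_ts` does: it parses an ISO 8601 timestamp string,
# the format S3 and EC2 report for fields such as `last_modified`, into a
# naive `datetime.datetime` in UTC. The sample value is illustrative only.
from boto.utils import parse_ts

dt = parse_ts('2015-03-04T17:21:48.000Z')
# dt == datetime.datetime(2015, 3, 4, 17, 21, 48)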
def run(self):
    """
    Backup postgres database for specific `app_code`.
    """
    s3connection = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    s3bucket = s3connection.get_bucket(AWS_BUCKET)

    DBDATESTAMP = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    DBURL = APP_CODES.get(self.__app_code)
    DUMPFILE = 'postgres-{}-{}.pg_dump'.format(self.__app_code, DBDATESTAMP)
    BACKUP_COMMAND = 'pg_dump --format=c --dbname="{}"'.format(DBURL)

    # Determine where to put this backup
    now = datetime.datetime.now()
    for directory in DIRECTORIES:
        prefix = directory['name'] + '/'
        earliest_current_date = now - datetime.timedelta(
            days=directory['days'])
        s3keys = s3bucket.list(prefix=prefix)
        large_enough_backups = filter(lambda x: x.size >= MINIMUM_SIZE, s3keys)
        young_enough_backup_found = False
        for backup in large_enough_backups:
            if parse_ts(backup.last_modified) >= earliest_current_date:
                young_enough_backup_found = True
        if not young_enough_backup_found:
            # This directory doesn't have any current backups; stop here and
            # use it as the destination
            break

    # Perform the backup
    filename = ''.join((prefix, DUMPFILE))
    print('Backing up to "{}"...'.format(filename))

    upload = s3bucket.new_key(filename)
    chunks_done = 0
    with smart_open.smart_open(upload, 'wb') as s3backup:
        process = subprocess.Popen(BACKUP_COMMAND, shell=True,
                                   stdout=subprocess.PIPE)
        while True:
            chunk = process.stdout.read(CHUNK_SIZE)
            if not len(chunk):
                print('Finished! Wrote {} chunks; {}'.format(
                    chunks_done,
                    humanize.naturalsize(chunks_done * CHUNK_SIZE)))
                break
            s3backup.write(chunk)
            chunks_done += 1
            if '--hush' not in sys.argv:
                print('Wrote {} chunks; {}'.format(
                    chunks_done,
                    humanize.naturalsize(chunks_done * CHUNK_SIZE)))

    print('Backup `{}` successfully sent to S3.'.format(filename))
    return  # Close thread
def cleanup():
    aws_lifecycle = os.environ.get(
        "AWS_BACKUP_BUCKET_DELETION_RULE_ENABLED", "False") == "True"
    s3connection = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    s3bucket = s3connection.get_bucket(AWS_BUCKET)

    if not aws_lifecycle:
        # Remove old backups beyond desired retention
        for directory in DIRECTORIES:
            prefix = directory['name'] + '/'
            keeps = directory['keeps']
            s3keys = s3bucket.list(prefix=prefix)
            large_enough_backups = filter(lambda x: x.size >= MINIMUM_SIZE,
                                          s3keys)
            large_enough_backups = sorted(large_enough_backups,
                                          key=lambda x: x.last_modified,
                                          reverse=True)
            for l in large_enough_backups:
                now = datetime.datetime.now()
                delta = now - parse_ts(l.last_modified)
                if delta.days > keeps:
                    print('Deleting old backup "{}"...'.format(l.name))
                    l.delete()
def modified_time(self, name):
    name = self._normalize_name(self._clean_name(name))
    entry = self.entries.get(name)
    if entry is None:
        entry = self.bucket.get_key(self._encode_name(name))
    # Parse the last_modified string to a local datetime object.
    return parse_ts(entry.last_modified)
def set_keys(self, credentials, timeout=None):
    keys = self.stored_keys.get(credentials.path)
    if keys is not None:
        expiration = parse_ts(keys["Expiration"])
        if timeout:
            expiration = keys["LastUpdated"] + timeout
        if datetime.utcnow() > expiration:
            log.info("Keys expired, recreating them")
            keys = None

    if keys is None:
        log.info("Assuming role")
        pair = IamSaml(credentials.provider, credentials.idp_username, "")
        pair.basic_auth = self.basic_auth
        keys = pair.get_result(credentials.role).credentials.to_dict()

        _keys = {
            "Code": "Success",
            "LastUpdated": datetime.utcnow(),
            "AccessKeyId": keys["access_key"],
            "SecretAccessKey": keys["secret_key"],
            "Token": keys["session_token"],
            "Expiration": keys["expiration"]
        }
        self.stored_keys[credentials.path] = _keys
        self.stored_assertions[credentials.path] = pair.assertion

    return (self.stored_keys[credentials.path],
            self.stored_assertions[credentials.path])
def retrieve_response(self, spider, request):
    response = super(S3CacheStorage, self).retrieve_response(spider, request)
    if response is None:  # not in local filesystem cache, so try copying from s3
        local_path = self._get_request_path(spider, request)
        remote_path = os.path.relpath(local_path, self.tmpcachedir).lower()

        bucket = self.conn.get_bucket(self.bucket_name, validate=False)

        def _get_key(filename):
            key_name = os.path.join(remote_path, filename)
            return bucket.get_key(key_name)

        # check if the key exists
        metadata_key = _get_key('pickled_meta')
        if metadata_key is None:
            return None  # key not found

        # check if the cache entry has expired
        mtime = parse_ts(metadata_key.last_modified)
        if 0 < self.expiration_secs < (
                datetime.datetime.utcnow() - mtime).total_seconds():
            return None  # expired

        # deserialise the cached response
        metadata = pickle.loads(metadata_key.get_contents_as_string())
        body = _get_key('response_body').get_contents_as_string()
        rawheaders = _get_key('response_headers').get_contents_as_string()
        url = metadata.get('response_url')
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)

    return response
def endElement(self, name, value, connection):
    if name == 'reservedInstancesModificationId':
        self.modification_id = value
    elif name == 'createDate':
        self.create_date = parse_ts(value)
    elif name == 'updateDate':
        self.update_date = parse_ts(value)
    elif name == 'effectiveDate':
        self.effective_date = parse_ts(value)
    elif name == 'status':
        self.status = value
    elif name == 'statusMessage':
        self.status_message = value
    elif name == 'clientToken':
        self.client_token = value
    else:
        setattr(self, name, value)
def parse_ts_extended(ts):
    RFC1123 = '%a, %d %b %Y %H:%M:%S %Z'
    rv = None
    try:
        rv = parse_ts(ts)
    except ValueError:
        rv = datetime.datetime.strptime(ts, RFC1123)
    return rv
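# Hypothetical calls showing both branches of the fallback above: an ISO 8601
# timestamp goes straight through `parse_ts`, while an RFC 1123 date (which
# `parse_ts` does not accept) is handled by `strptime`. The sample values are
# made up for illustration.
parse_ts_extended('2015-03-04T17:21:48.000Z')       # parse_ts succeeds
parse_ts_extended('Wed, 04 Mar 2015 17:21:48 GMT')  # RFC 1123 fallback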
def endElement(self, name, value, connection):
    if name == "reservedInstancesModificationId":
        self.modification_id = value
    elif name == "createDate":
        self.create_date = parse_ts(value)
    elif name == "updateDate":
        self.update_date = parse_ts(value)
    elif name == "effectiveDate":
        self.effective_date = parse_ts(value)
    elif name == "status":
        self.status = value
    elif name == "statusMessage":
        self.status_message = value
    elif name == "clientToken":
        self.client_token = value
    else:
        setattr(self, name, value)
def key_dt(key):
    from boto.utils import parse_ts
    try:
        modified = time.strptime(key.last_modified,
                                 '%a, %d %b %Y %H:%M:%S %Z')
        dt = datetime.datetime.fromtimestamp(mktime(modified))
        return dt
    except:
        return parse_ts(key.last_modified)
def _save_to_s3(self, data, mime, update=False, compress=True):
    ziped_data = None
    content_encoding = None
    headers = {}
    if compress and mime == 'application/vnd.google-earth.kml+xml':
        ziped_data = self._gzip_data(data)
        content_encoding = 'gzip'
        headers['Content-Encoding'] = 'gzip'
    if not update:
        if content_encoding == 'gzip' and ziped_data is not None:
            data = ziped_data
        try:
            k = Key(bucket=self.bucket)
            k.key = self.file_id
            k.set_metadata('Content-Type', mime)
            k.content_type = mime
            k.content_encoding = content_encoding
            k.set_metadata('Content-Encoding', content_encoding)
            k.set_contents_from_string(data, replace=False)
            key = self.bucket.get_key(k.key)
            last_updated = parse_ts(key.last_modified)
        except Exception as e:
            raise exc.HTTPInternalServerError(
                'Error while configuring S3 key (%s) %s' % (self.file_id, e))
        try:
            _save_item(self.admin_id, file_id=self.file_id,
                       last_updated=last_updated)
        except Exception as e:
            raise exc.HTTPInternalServerError(
                'Cannot create file on Dynamodb (%s)' % e)
    else:
        try:
            if content_encoding == 'gzip' and ziped_data is not None:
                data = ziped_data
            # Inconsistent behaviour with metadata, see
            # https://github.com/boto/boto/issues/2798
            self.key.content_encoding = content_encoding
            self.key.set_metadata('Content-Encoding', content_encoding)
            self.key.set_contents_from_string(data, replace=True)
            key = self.bucket.get_key(self.key.key)
            last_updated = parse_ts(key.last_modified)
        except Exception as e:
            raise exc.HTTPInternalServerError(
                'Error while updating S3 key (%s) %s' % (self.key.key, e))
        try:
            _save_item(self.admin_id, last_updated=last_updated)
        except Exception as e:
            raise exc.HTTPInternalServerError(
                'Cannot update file on Dynamodb (%s) %s' % (self.file_id, e))
def modified_time(self, name):
    name = self._normalize_name(self._clean_name(name))
    entry = self.entries.get(name)
    # only call self.bucket.get_key() if the key is not found
    # in the preloaded metadata.
    if entry is None:
        entry = self.bucket.get_key(self._encode_name(name))
    # Parse the last_modified string to a local datetime object.
    return parse_ts(entry.last_modified)
def test_new_file_modified_time(self):
    self.storage.preload_metadata = True
    name = 'test_storage_save.txt'
    content = ContentFile('new content')
    utcnow = datetime.datetime.utcnow()
    with mock.patch('storages.backends.s3boto.datetime') as mock_datetime:
        mock_datetime.utcnow.return_value = utcnow
        self.storage.save(name, content)
        self.assertEqual(self.storage.modified_time(name),
                         parse_ts(utcnow.strftime(ISO8601)))
def clean_s3(now):
    remove_datetime = now - timedelta(hours=24)
    bucket = getS3Bucket()
    for key in bucket.list():
        key_name = key.name.encode('utf-8')
        key_datetime = parse_ts(key.last_modified)
        if ('data' in key_name and len(key_name) == 34
                and key_datetime < remove_datetime):
            print key_name, 'Remove'
            bucket.delete_key(key_name)
        else:
            print key_name, 'Skipping'
def _save_to_s3(self, data, mime, update=False, compress=True):
    data_payload = data
    content_encoding = None
    headers = {'Cache-Control': 'no-cache, must-revalidate'}
    if compress and mime == 'application/vnd.google-earth.kml+xml':
        data_payload = self._gzip_data(data)
        content_encoding = 'gzip'
    if not update:
        try:
            # Push object to bucket
            replace = False
            k = Key(bucket=self.bucket)
            _push_object_to_s3(k, self.file_id, mime, content_encoding,
                               headers, data_payload, replace)
            key = self.bucket.get_key(k.key)
            last_updated = parse_ts(key.last_modified)
        except Exception as e:
            raise exc.HTTPInternalServerError(
                'Error while configuring S3 key (%s) %s' % (self.file_id, e))
        try:
            # Push to dynamoDB, only one entry per object
            _save_item(self.admin_id, file_id=self.file_id,
                       last_updated=last_updated, bucketname=self.bucket.name)
        except Exception as e:
            raise exc.HTTPInternalServerError(
                'Cannot create file on Dynamodb (%s)' % e)
    else:
        try:
            # Inconsistent behaviour with metadata, see
            # https://github.com/boto/boto/issues/2798
            # Push object to bucket
            replace = True
            _push_object_to_s3(self.key, self.file_id, mime, content_encoding,
                               headers, data_payload, replace)
            key = self.bucket.get_key(self.key.key)
            last_updated = parse_ts(key.last_modified)
        except Exception as e:
            raise exc.HTTPInternalServerError(
                'Error while updating S3 key (%s) %s' % (self.file_id, e))
        try:
            _save_item(self.admin_id, last_updated=last_updated,
                       bucketname=self.bucket.name)
        except Exception as e:
            raise exc.HTTPInternalServerError(
                'Cannot update file on Dynamodb (%s) %s' % (self.file_id, e))
def handle_noargs(self, **options):
    connection = S3Connection(settings.BACKUP_S3_ACCESS_KEY,
                              settings.BACKUP_S3_SECRET_KEY)
    bucket = connection.get_bucket(settings.BACKUP_S3_BUCKET)
    count_deleted = 0
    size_deleted = 0
    for key in bucket.list():
        file_datetime = parse_ts(key.last_modified)
        # Time is apparently two hours earlier than local time
        file_date = (file_datetime + relativedelta(hours=2)).date()
        if not must_keep_file(file_date):
            count_deleted += 1
            size_deleted += key.size
            key.delete()
    print "%d files are deleted with a total size of %s" % (
        count_deleted, size(size_deleted))
def main():
    ## Args
    parser = argparse.ArgumentParser()
    parser.add_argument('--bucket', required=True, help='Bucket')
    parser.add_argument('--endpoint',
                        default=boto.s3.connection.NoHostProvided,
                        help='S3 endpoint')
    parser.add_argument('--profile', help='Boto profile used for connection')
    args = parser.parse_args()

    ## S3 Connection
    bucket = S3Connection(suppress_consec_slashes=False, host=args.endpoint,
                          is_secure=True,
                          profile_name=args.profile).get_bucket(args.bucket)

    ## Hadoop Counters
    totalsize = 0

    ## In a Stream?
    start_index = campanile.stream_index()

    ## Process input
    for line in fileinput.input("-"):
        if line.startswith('#'):
            continue

        delim, prefix = line.rstrip('\n').split('\t')[start_index].split(',')

        for key in bucket.list(prefix=prefix, delimiter=delim):
            if key.__class__.__name__ == "Prefix":
                continue

            ## Don't include glacier objects
            if key.storage_class == 'GLACIER':
                continue

            print "%s\t%s\t%s\t%s" % (key.name.encode('utf-8'),
                                      key.etag.replace("\"", ""),
                                      key.size,
                                      parse_ts(key.last_modified))

            ## Log stats
            campanile.counter(args.bucket, "Bytes", key.size)
def get_last_key(keys, days_ago):
    """
    Loops over the keys and finds the last key that is at least `days_ago` old.
    """
    last_key = None
    cur_time_in_seconds = time.mktime(time.gmtime())
    min_difference_in_days = float('inf')
    SECONDS_IN_DAY = 60 * 60 * 24
    for key in keys:
        # Get the time of last modification of the key, in seconds
        parsed_datetime = utils.parse_ts(key.last_modified)
        time_in_seconds = time.mktime(parsed_datetime.timetuple())
        difference_in_days = (cur_time_in_seconds -
                              time_in_seconds) / float(SECONDS_IN_DAY)
        # We want the latest backup that is at least `days_ago` old
        if (difference_in_days < min_difference_in_days
                and difference_in_days > days_ago):
            last_key = key
            min_difference_in_days = difference_in_days
    return last_key
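# A hypothetical call, assuming `bucket` is an already-opened boto S3 bucket:
# pick the most recent backup key that is at least a week old.
keys = list(bucket.list(prefix='backups/'))
weekly_restore_point = get_last_key(keys, days_ago=7)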
def main():
    ## Args
    parser = argparse.ArgumentParser()
    parser.add_argument('--bucket', required=True, help='Bucket')
    parser.add_argument('--endpoint',
                        default=boto.s3.connection.NoHostProvided,
                        help='S3 endpoint')
    parser.add_argument('--profile', help='Boto profile used for connection')
    args = parser.parse_args()

    ## S3 Connection
    bucket = S3Connection(suppress_consec_slashes=False, host=args.endpoint,
                          is_secure=True,
                          profile_name=args.profile).get_bucket(args.bucket)

    ## Hadoop Counters
    totalsize = 0

    ## In a Stream?
    start_index = campanile.stream_index()

    ## Process input
    for line in fileinput.input("-"):
        if line.startswith('#'):
            continue

        delim, prefix = line.rstrip('\n').split('\t')[start_index].split(',')

        for key in bucket.list(prefix=prefix, delimiter=delim):
            if key.__class__.__name__ == "Prefix":
                continue

            ## Don't include glacier objects
            if key.storage_class == 'GLACIER':
                continue

            print "%s\t%s\t%s\t%s" % (key.name.encode('utf-8'),
                                      key.etag.replace("\"", ""),
                                      key.size,
                                      parse_ts(key.last_modified))

            ## Log stats
            campanile.counter(args.bucket, "Bytes", key.size)
def keys(self):
    keys = getattr(self, "_keys", None)
    if keys is not None:
        expiration = parse_ts(keys["Expiration"])
        if datetime.utcnow() > expiration:
            log.info("Keys expired, recreating them")
            keys = None

    if keys is None:
        log.info("Assuming role")
        pair = IamSaml(self.credentials.keys.provider,
                       self.credentials.keys.idp_username, "")
        pair.basic_auth = self.basic_auth
        keys = pair.get_result(self.credentials.keys.role).credentials.to_dict()

        self._keys = {
            "Code": "Success",
            "LastUpdated": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S:00Z"),
            "AccessKeyId": keys["access_key"],
            "SecretAccessKey": keys["secret_key"],
            "Token": keys["session_token"],
            "Expiration": keys["expiration"]
        }

    return self._keys
def main():
    config = ConfigParser.ConfigParser()
    config.read(filenames=["aws-s3-restore.conf"])
    aws_access_key = config.get(section="aws", option="key")
    aws_access_secret = config.get(section="aws", option="secret")

    conn = S3Connection(aws_access_key, aws_access_secret)
    buckets = conn.get_all_buckets()

    print "The following buckets are available"
    print "\n".join(["- {}".format(bucket.name) for bucket in buckets])

    bucket = None
    while not bucket:
        print "Enter the exact name of the bucket to restore from:",
        name = raw_input().strip()
        bucket = next(
            iter([bucket for bucket in buckets if bucket.name == name]), None)
        if not bucket:
            print "Not a valid bucket"
    print "Using bucket `{bucket_name}`".format(bucket_name=bucket.name)

    restore_before = datetime.today()
    date_set = "n"
    while not date_set == "y":
        print "From how many days ago do you wish to restore? ",
        days = raw_input().strip()
        try:
            day_offset = int(days)
        except exceptions.ValueError:
            print "Error, you must supply an integer"
            continue
        restore_before = datetime.today() - timedelta(days=day_offset)
        print "Use files modified on `{date}` (or nearest preceding version) (y/N)? ".format(
            date=restore_before),
        date_set = raw_input().strip().lower()

    print
    print "Add files/folders for restoration"
    all_folders = "n"
    objects_to_restore = []
    while not all_folders == "y":
        print "Full path of file/folder to restore: ",
        add_folder = raw_input().strip()
        if add_folder[0] != "/":
            print "Error, supplied path does not begin with a `/`; discarding"
        else:
            objects_to_restore.append(add_folder)
        print "Folders currently in restore set: "
        print "\n".join(["- {}".format(f) for f in objects_to_restore])
        print "Done adding folders (y/N)? ",
        all_folders = raw_input().strip().lower()

    print "NOTICE: Files will be restored to *this* working directory (and subdirectories)"
    print "Do you want to run the restore (y/N)? ",
    if not raw_input().strip().lower() == "y":
        sys.exit(-1)
    else:
        valid_prefixes = []
        print "Running restore from bucket `{bucket_name}`".format(
            bucket_name=bucket.name)
        for obj in objects_to_restore:
            prefix = obj[1:]  # Remove the leading slash
            keys = bucket.get_all_versions(prefix=prefix)
            if not keys:
                print "Invalid prefix: `{obj}`".format(obj=obj)
            else:
                valid_prefixes.append(prefix)

        print
        print "Restoring files modified *before* `{restore_date}` (or nearest preceding version)".format(
            restore_date=restore_before)
        print "Aggregating backupset details..."
# Determine the available versions for this file list
all_files = {}
for prefix in valid_prefixes:
    for version in bucket.list_versions(prefix=prefix):
        last_modified = parse_ts(version.last_modified)
        if last_modified < restore_before:
            # Only restore if older than specified date
            if version.name not in all_files or \
                    version.last_modified > all_files[version.name].last_modified:
                # Add to list, or update if newer version available
                all_files[version.name] = version

total_file_count = len(all_files.keys())
print "{count} file(s) to be restored".format(count=total_file_count)
print
print "Beginning Restore: "

i = 0
for file_prefix, version in all_files.iteritems():
    i = i + 1
    print "- ({number}/{total}): `{name}`".format(
        number=i, total=total_file_count, name=file_prefix)
    dirs = os.path.dirname(file_prefix)
    if not os.path.exists(dirs):
        os.makedirs(dirs)
    if isinstance(version, DeleteMarker):
        print " WARNING: File was previously DELETED on {date}; skipping".format(
            date=version.last_modified)
    else:
        if not os.path.exists(file_prefix):
            # Open relative to our working path
            fp = open(file_prefix, "w")
            version.get_file(fp, version_id=version.version_id)
            fp.close()
        else:
            print " WARNING: Already exists at restore location; skipping"
def modified_time(self, name):
    dt = tz.make_aware(parse_ts(self._get_key(name).last_modified), tz.utc)
    return tz.make_naive(dt)
def get_modified_time(self, name):
    dt = tz.make_aware(parse_ts(self._get_key(name).last_modified), tz.utc)
    return dt if setting('USE_TZ') else tz.make_naive(dt)
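# The two storage methods above differ only in what they return: both build a
# timezone-aware UTC datetime from `parse_ts`, and `modified_time` always
# strips the tzinfo while `get_modified_time` keeps it when USE_TZ is enabled.
# A standalone sketch of that conversion, assuming `tz` is
# `django.utils.timezone` as in the snippets:
from django.utils import timezone as tz
from boto.utils import parse_ts

aware = tz.make_aware(parse_ts('2015-03-04T17:21:48.000Z'), tz.utc)
naive = tz.make_naive(aware, tz.utc)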
def backup(mount_point, aws_key, aws_secret_key, lockers=[], dryrun=False,
           keep_old=25):
    devices = [get_device_for_mount(mount_point)]
    mapped_devices = [str.replace(device, '/dev/xvd', '/dev/sd')
                      for device in devices]
    if devices[0].startswith("/dev/md"):
        devices = get_devices_for_raid(devices[0])

    instance_id = boto.utils.get_instance_metadata()['instance-id']
    region = boto.utils.get_instance_metadata()['placement']['availability-zone'][:-1]
    ec2 = connect_to_region(region, aws_access_key_id=aws_key,
                            aws_secret_access_key=aws_secret_key)
    instance = ec2.get_all_instances([instance_id])[0].instances[0]

    all_volumes = ec2.get_all_volumes()
    volumes = []
    for v in all_volumes:
        if v.attach_data.instance_id == instance_id:
            if v.attach_data.device in devices or \
                    v.attach_data.device in mapped_devices:
                volumes.append(v)

    if not volumes:
        sys.stderr.write("No EBS volumes found for devices %s\n" % devices)
        sys.exit(1)

    logging.info("Instance ID: %s", instance_id)
    logging.info("Devices: %s", ", ".join(devices))
    logging.info("Volumes: %s", ", ".join(v.id for v in volumes))

    locker_instances = []
    for l in lockers:
        l = l.split(':')
        cls = LOCKER_CLASSES[l[0]]
        kwargs = {}
        for k, v in (x.split('=') for x in l[1:]):
            if v.lower() == "true":
                v = True
            elif v.lower() == "false":
                v = False
            kwargs[k] = v
        kwargs['dryrun'] = dryrun
        inst = cls(**kwargs)
        locker_instances.append(inst)
        if not inst.validate():
            return

    locker_instances.append(XFSLocker(mount_point, dryrun=dryrun))

    with contextlib.nested(*locker_instances):
        for v in volumes:
            name = v.tags.get('Name')
            logging.info("Snapshoting %s (%s)", v.id, name or 'NONAME')
            if not dryrun:
                snap = v.create_snapshot()
                if name:
                    snap.add_tag('Name', name)

    snapshots = ec2.get_all_snapshots(
        filters={'volume-id': [volume.id for volume in volumes]})
    for s in snapshots:
        start_time = parse_ts(s.start_time)
        if start_time < datetime.now() - timedelta(keep_old):
            logging.info("Deleting Snapshot %s (%s - %s) of %s from %s",
                         s.id, s.description, s.tags, s.volume_id,
                         s.start_time)
            if not dryrun:
                s.delete()
def main():
    config = read_config()

    # Cool! Let's set up everything.
    connect_to_region(config.region,
                      aws_access_key_id=access_key_id,
                      aws_secret_access_key=secret_key)
    glacier = Layer2(aws_access_key_id=access_key_id,
                     aws_secret_access_key=secret_key,
                     region_name=config.region)
    vault = glacier.get_vault(config.vault_name)
    # workaround for UnicodeDecodeError
    # https://github.com/boto/boto/issues/3318
    vault.name = str(vault.name)
    print "Beginning job on " + vault.arn

    # Ah, we don't have a vault listing yet.
    if not config.ls_present:

        # No job yet? Initiate a job.
        if not config.inventory_job:
            config.inventory_job = vault.retrieve_inventory()
            config.write()
            print "Requested an inventory. This usually takes about four hours."
            terminate(0)

        # We have a job, but it's not finished.
        job = vault.get_job(config.inventory_job)
        if not job.completed:
            print "Waiting for an inventory. This usually takes about four hours."
            terminate(0)

        # Finished!
        try:
            data = json.loads(job.get_output().read())
        except ValueError:
            print "Something went wrong interpreting the data Amazon sent!"
            terminate(1)

        config.ls = {}
        for archive in data['ArchiveList']:
            config.ls[archive['ArchiveDescription']] = {
                'id': archive['ArchiveId'],
                'last_modified': int(float(time.mktime(
                    parse_ts(archive['CreationDate']).timetuple()))),
                'size': int(archive['Size']),
                'hash': archive['SHA256TreeHash']
            }

        config.ls_present = '-'
        config.inventory_job = ''
        config.write()
        print "Imported a new inventory from Amazon."

    database = Database(
        host=db_host,
        port=db_port,
        username=db_username,
        password=db_password,
        name=db_name
    )
    print "Connected to database."

    # Let's upload!
    os.stat_float_times(False)
    try:
        i = 0
        transferred = 0
        time_begin = time.time()
        for dir in config.dirs:
            print "Syncing " + dir
            for file in database.files():
                path = dir + os.sep + file

                if not os.path.exists(path):
                    #print >> sys.stderr, "'%s' does not exist" % path
                    print "\n" + "'%s' does not exist" % path
                    continue

                # If it's a directory, then ignore it
                if not os.path.isfile(path):
                    continue

                last_modified = int(os.path.getmtime(path))
                size = os.path.getsize(path)
                updating = False

                if file in config.ls:
                    # Has it not been modified since?
                    if config.ls[file]['last_modified'] >= last_modified \
                            and config.ls[file]['size'] == size:
                        continue
                    # It's been changed... we should delete the old one
                    else:
                        vault.delete_archive(config.ls[file]['id'])
                        del config.ls[file]
                        updating = True
                        config.write()

                try:
                    print file + ": uploading... ",
                    id = vault.concurrent_create_archive_from_file(path, file)
                    config.ls[file] = {
                        'id': id,
                        'size': size,
                        'last_modified': last_modified
                    }
                    config.write()
                    i += 1
                    transferred += size
                    if updating:
                        print "updated."
                    else:
                        print "done."
                    database.update(file, id, vault)
                except UploadArchiveError:
                    print "FAILED TO UPLOAD."
    finally:
        database.close()
        elapsed = time.time() - time_begin
        print "\n" + str(i) + " files successfully uploaded."
        print "Transferred " + format_bytes(transferred) + " in " + \
            format_time(elapsed) + " at rate of " + \
            format_bytes(transferred / elapsed) + "/s."
        terminate(0)
def last_modified(self):
    return mktime(parse_ts(self.key.last_modified).timetuple())
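# Caveat on the epoch conversion above, offered as a side note rather than a
# change to the project's code: `parse_ts` returns a naive UTC datetime, but
# `time.mktime` interprets a struct_time in *local* time, so the result is
# shifted on machines whose clock is not set to UTC. `calendar.timegm` is the
# UTC-safe equivalent:
import calendar
from boto.utils import parse_ts

epoch_utc = calendar.timegm(parse_ts('2015-03-04T17:21:48.000Z').timetuple())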
AWS_BUCKET = os.environ.get('BACKUP_AWS_STORAGE_BUCKET_NAME')

s3connection = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
s3bucket = s3connection.get_bucket(AWS_BUCKET)

###############################################################################

# Determine where to put this backup
now = datetime.datetime.now()
for directory in DIRECTORIES:
    prefix = directory['name'] + '/'
    earliest_current_date = now - datetime.timedelta(days=directory['days'])
    s3keys = s3bucket.list(prefix=prefix)
    large_enough_backups = filter(lambda x: x.size >= MINIMUM_SIZE, s3keys)
    young_enough_backup_found = False
    for backup in large_enough_backups:
        if parse_ts(backup.last_modified) >= earliest_current_date:
            young_enough_backup_found = True
    if not young_enough_backup_found:
        # This directory doesn't have any current backups; stop here and use
        # it as the destination
        break

# Perform the backup
filename = ''.join((prefix, DUMPFILE))
print('Backing up to "{}"...'.format(filename))
os.system(
    "{backup_command} && s3cmd put --multipart-chunk-size-mb={chunk_size}"
    " /srv/backups/{source} s3://{bucket}/{filename}"
    " && rm -rf /srv/backups/{source}".format(backup_command=BACKUP_COMMAND,
                                              bucket=AWS_BUCKET,
# Finished!
try:
    data = json.loads(job.get_output().read())
except ValueError:
    print "Something went wrong interpreting the data Amazon sent!"
    terminate(1)

ls = {}
for archive in data['ArchiveList']:
    ls[archive['ArchiveDescription']] = {
        'id': archive['ArchiveId'],
        'last_modified': int(float(time.mktime(
            parse_ts(archive['CreationDate']).timetuple()))),
        'size': int(archive['Size']),
        'hash': archive['SHA256TreeHash']
    }

ls_present = '-'
inventory_job = ''
write()
print "Imported a new inventory from Amazon."

# Let's upload!
os.stat_float_times(False)
try:
    i = 0
def backup(app_code):
    """
    Backup postgres database for specific `app_code`.

    Args:
        app_code (str): `kc` or `kpi`
    """
    DBDATESTAMP = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

    # `postgis://` isn't recognized by `pg_dump`; replace it with `postgres://`
    DBURL = re.sub(r'^postgis://', 'postgres://', APP_CODES.get(app_code))
    # Because we are running `pg_dump` within the container,
    # we need to replace the hostname ...
    DBURL = DBURL.replace(os.getenv("POSTGRES_HOST"), "127.0.0.1")
    # ... and the port for '127.0.0.1:5432'
    DBURL = re.sub(r"\:(\d+)\/", ":5432/", DBURL)

    DUMPFILE = 'postgres-{}-{}-{}-{}.pg_dump'.format(
        app_code,
        os.environ.get('PG_MAJOR'),
        os.environ.get('PUBLIC_DOMAIN_NAME'),
        DBDATESTAMP,
    )
    BACKUP_COMMAND = 'pg_dump --format=c --dbname="{}"'.format(DBURL)

    # Determine where to put this backup
    now = datetime.datetime.now()
    for directory in DIRECTORIES:
        prefix = directory['name'] + '/'
        earliest_current_date = now - datetime.timedelta(
            days=directory['days'])
        s3keys = s3bucket.list(prefix=prefix)
        large_enough_backups = filter(lambda x: x.size >= MINIMUM_SIZE, s3keys)
        young_enough_backup_found = False
        for backup in large_enough_backups:
            if parse_ts(backup.last_modified) >= earliest_current_date:
                young_enough_backup_found = True
        if not young_enough_backup_found:
            # This directory doesn't have any current backups; stop here and
            # use it as the destination
            break

    # Perform the backup
    filename = ''.join((prefix, DUMPFILE))
    print('Backing up to "{}"...'.format(filename))

    upload = s3bucket.new_key(filename)
    chunks_done = 0
    with smart_open.smart_open(upload, 'wb') as s3backup:
        process = subprocess.Popen(BACKUP_COMMAND, shell=True,
                                   stdout=subprocess.PIPE)
        while True:
            chunk = process.stdout.read(CHUNK_SIZE)
            if not len(chunk):
                print('Finished! Wrote {} chunks; {}'.format(
                    chunks_done,
                    humanize.naturalsize(chunks_done * CHUNK_SIZE)))
                break
            s3backup.write(chunk)
            chunks_done += 1
            if '--hush' not in sys.argv:
                print('Wrote {} chunks; {}'.format(
                    chunks_done,
                    humanize.naturalsize(chunks_done * CHUNK_SIZE)))

    print('Backup `{}` successfully sent to S3.'.format(filename))
if not job.completed:
    print "Waiting for an inventory. This usually takes about four hours."
    terminate(0)

# Finished!
try:
    data = json.loads(job.get_output().read())
except ValueError:
    print "Something went wrong interpreting the data Amazon sent!"
    terminate(1)

ls = {}
for archive in data['ArchiveList']:
    ls[archive['ArchiveDescription']] = {
        'id': archive['ArchiveId'],
        'last_modified': int(float(time.mktime(
            parse_ts(archive['CreationDate']).timetuple()))),
        'size': int(archive['Size']),
        'hash': archive['SHA256TreeHash']
    }

ls_present = '-'
inventory_job = ''
write()
print "Imported a new inventory from Amazon."

db_connection = pymysql.connect(
    host=db_host,
    port=db_port,
    user=db_username,
    password=db_password,
    db=db_name
def get_key_timestamp(self, file_id):
    key = self.get_key(file_id)
    if key:
        last_updated = parse_ts(key.last_modified)
        return last_updated.strftime('%Y-%m-%d %X')
    return time.strftime('%Y-%m-%d %X', time.localtime())
def sync_fbo_weekly(self):
    """
    This task will sync the latest full copy of FBO's xml and any
    intermediary files. It will overwrite the weekly file.

    We make a personal s3 copy of the data since the FBO ftp service is
    unreliable and tends to get hammered during peak hours. Files are stored
    to S3 in a gzipped format. Working files are stored in temp_dir and can
    be processed in other processes.
    """
    conn = connect_s3()
    vitals_bucket = conn.get_bucket(S3_BUCKET)

    storage_path = None
    try:
        self.ftp.connect()
        self.ftp.login()
        sourceModifiedTime = self.ftp.sendcmd('MDTM datagov/FBOFullXML.xml')[4:]
        sourceModifiedDateTime = datetime.strptime(sourceModifiedTime,
                                                   "%Y%m%d%H%M%S")
        sourceModifiedDateTimeStr = sourceModifiedDateTime.strftime("%Y%m%d")
        filename = 'FBOFullXML' + sourceModifiedDateTimeStr + '.xml'
        fullFBOKey = vitals_bucket.get_key(
            S3_EXTRACT_PREFIX + filename + S3_ARCHIVE_FORMAT)
        if not fullFBOKey or \
                parse_ts(fullFBOKey.last_modified) < sourceModifiedDateTime:
            # Update S3 copy with latest
            print "downloading the latest full xml from repository"
            storage_path = path.join(self.temp_dir, filename)
            with open(storage_path, 'wb') as local_file:
                # Download the file a chunk at a time using RET
                self.ftp.retrbinary('RETR datagov/FBOFullXML.xml',
                                    local_file.write)
    finally:
        self.ftp.close()

    if not storage_path:
        return

    print "zipping the fbo full file"
    zipped_storage_path = path.join(self.temp_dir, filename + S3_ARCHIVE_FORMAT)
    with open(storage_path, 'rb') as f_in:
        with gzip.GzipFile(zipped_storage_path, 'wb') as myzip:
            myzip.writelines(f_in)

    print "uploading the latest full xml to S3"
    # Put file to S3
    source_size = os.stat(zipped_storage_path).st_size

    # Create a multipart upload request
    mp = vitals_bucket.initiate_multipart_upload(
        S3_EXTRACT_PREFIX + os.path.basename(zipped_storage_path))

    # Use a chunk size of 50 MiB (feel free to change this)
    chunk_size = 52428800
    chunk_count = int(math.ceil(source_size / chunk_size))

    # Send the file parts, using FileChunkIO to create a file-like object
    # that points to a certain byte range within the original file. We
    # set bytes to never exceed the original file size.
    try:
        for i in range(chunk_count + 1):
            print "uploading chunk {0} of {1}".format(i + 1, chunk_count + 1)
            offset = chunk_size * i
            bytes = min(chunk_size, source_size - offset)
            with FileChunkIO(zipped_storage_path, 'r', offset=offset,
                             bytes=bytes) as fp:
                mp.upload_part_from_file(fp, part_num=i + 1)
    finally:
        # Finish the upload
        mp.complete_upload()

    print "clearing any delta files from s3"
    keys_to_delete = vitals_bucket.list(prefix=S3_EXTRACT_PREFIX)
    for key in keys_to_delete:
        if 'FBOFeed' in key:
            vitals_bucket.delete_key(key)
def run():
    """
    Backup postgres database for specific `app_code`.
    """
    s3connection = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    s3bucket = s3connection.get_bucket(AWS_BUCKET)

    DBDATESTAMP = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    DUMPFILE = 'mongo-{}-{}-{}.gz'.format(
        os.environ.get('MONGO_MAJOR'),
        os.environ.get('PUBLIC_DOMAIN_NAME'),
        DBDATESTAMP,
    )

    MONGO_INITDB_ROOT_USERNAME = os.environ.get('MONGO_INITDB_ROOT_USERNAME')
    MONGO_INITDB_ROOT_PASSWORD = os.environ.get('MONGO_INITDB_ROOT_PASSWORD')

    if MONGO_INITDB_ROOT_USERNAME and MONGO_INITDB_ROOT_PASSWORD:
        BACKUP_COMMAND = ('mongodump --archive --gzip --username="******"'
                          ' --password="******"').format(
            username=MONGO_INITDB_ROOT_USERNAME,
            password=MONGO_INITDB_ROOT_PASSWORD
        )
    else:
        BACKUP_COMMAND = "mongodump --archive --gzip"

    # Determine where to put this backup
    now = datetime.datetime.now()
    for directory in DIRECTORIES:
        prefix = directory['name'] + '/'
        earliest_current_date = now - datetime.timedelta(days=directory['days'])
        s3keys = s3bucket.list(prefix=prefix)
        large_enough_backups = filter(lambda x: x.size >= MINIMUM_SIZE, s3keys)
        young_enough_backup_found = False
        for backup in large_enough_backups:
            if parse_ts(backup.last_modified) >= earliest_current_date:
                young_enough_backup_found = True
        if not young_enough_backup_found:
            # This directory doesn't have any current backups; stop here and
            # use it as the destination
            break

    # Perform the backup
    filename = ''.join((prefix, DUMPFILE))
    print('Backing up to "{}"...'.format(filename))

    upload = s3bucket.new_key(filename)
    chunks_done = 0
    with smart_open.smart_open(upload, 'wb') as s3backup:
        process = subprocess.Popen(
            BACKUP_COMMAND, shell=True, stdout=subprocess.PIPE)
        while True:
            chunk = process.stdout.read(CHUNK_SIZE)
            if not len(chunk):
                print('Finished! Wrote {} chunks; {}'.format(
                    chunks_done,
                    humanize.naturalsize(chunks_done * CHUNK_SIZE)
                ))
                break
            s3backup.write(chunk)
            chunks_done += 1
            if '--hush' not in sys.argv:
                print('Wrote {} chunks; {}'.format(
                    chunks_done,
                    humanize.naturalsize(chunks_done * CHUNK_SIZE)
                ))

    print('Backup `{}` successfully sent to S3.'.format(filename))
if not job.completed:
    print "Waiting for an inventory. This usually takes about four hours."
    terminate(0)

# Finished!
try:
    data = json.loads(job.get_output().read())
except ValueError:
    print "Something went wrong interpreting the data Amazon sent!"
    terminate(1)

ls = {}
for archive in data['ArchiveList']:
    ls[archive['ArchiveDescription']] = {
        'id': archive['ArchiveId'],
        'last_modified': int(float(time.mktime(
            parse_ts(archive['CreationDate']).timetuple()))),
        'size': int(archive['Size']),
        'hash': archive['SHA256TreeHash']
    }

ls_present = '-'
inventory_job = ''
write()
print "Imported a new inventory from Amazon."

# Let's upload!
os.stat_float_times(False)
try:
    i = 0
    transferred = 0
    time_begin = time.time()
def main():
    config = ConfigParser.ConfigParser()
    config.read(filenames=["aws-s3-restore.conf"])
    aws_access_key = config.get(section="aws", option="key")
    aws_access_secret = config.get(section="aws", option="secret")

    conn = S3Connection(aws_access_key, aws_access_secret)
    buckets = conn.get_all_buckets()

    print "The following buckets are available"
    print "\n".join(["- {}".format(bucket.name) for bucket in buckets])

    bucket = None
    while not bucket:
        print "Enter the exact name of the bucket to restore from:",
        name = raw_input().strip()
        bucket = next(
            iter([bucket for bucket in buckets if bucket.name == name]), None)
        if not bucket:
            print "Not a valid bucket"
    print "Using bucket `{bucket_name}`".format(bucket_name=bucket.name)

    restore_before = datetime.today()
    date_set = "n"
    while not date_set == "y":
        print "From how many days ago do you wish to restore? ",
        days = raw_input().strip()
        try:
            day_offset = int(days)
        except exceptions.ValueError:
            print "Error, you must supply an integer"
            continue
        restore_before = datetime.today() - timedelta(days=day_offset)
        print "Use files modified on `{date}` (or nearest preceding version) (y/N)? ".format(date=restore_before),
        date_set = raw_input().strip().lower()

    print
    print "Add files/folders for restoration"
    all_folders = "n"
    objects_to_restore = []
    while not all_folders == "y":
        print "Full path of file/folder to restore: ",
        add_folder = raw_input().strip()
        if add_folder[0] != "/":
            print "Error, supplied path does not begin with a `/`; discarding"
        else:
            objects_to_restore.append(add_folder)
        print "Folders currently in restore set: "
        print "\n".join(["- {}".format(f) for f in objects_to_restore])
        print "Done adding folders (y/N)? ",
        all_folders = raw_input().strip().lower()

    print "NOTICE: Files will be restored to *this* working directory (and subdirectories)"
    print "Do you want to run the restore (y/N)? ",
    if not raw_input().strip().lower() == "y":
        sys.exit(-1)
    else:
        valid_prefixes = []
        print "Running restore from bucket `{bucket_name}`".format(bucket_name=bucket.name)
        for obj in objects_to_restore:
            prefix = obj[1:]  # Remove the leading slash
            keys = bucket.get_all_versions(prefix=prefix)
            if not keys:
                print "Invalid prefix: `{obj}`".format(obj=obj)
            else:
                valid_prefixes.append(prefix)

        print
        print "Restoring files modified *before* `{restore_date}` (or nearest preceding version)".format(restore_date=restore_before)
        print "Aggregating backupset details..."
# Determine the available versions for this file list
all_files = {}
for prefix in valid_prefixes:
    for version in bucket.list_versions(prefix=prefix):
        last_modified = parse_ts(version.last_modified)
        if last_modified < restore_before:
            # Only restore if older than specified date
            if version.name not in all_files or \
                    version.last_modified > all_files[version.name].last_modified:
                # Add to list, or update if newer version available
                all_files[version.name] = version

total_file_count = len(all_files.keys())
print "{count} file(s) to be restored".format(count=total_file_count)
print
print "Beginning Restore: "

i = 0
for file_prefix, version in all_files.iteritems():
    i = i + 1
    print "- ({number}/{total}): `{name}`".format(number=i,
                                                  total=total_file_count,
                                                  name=file_prefix)
    dirs = os.path.dirname(file_prefix)
    if not os.path.exists(dirs):
        os.makedirs(dirs)
    if isinstance(version, DeleteMarker):
        print " WARNING: File was previously DELETED on {date}; skipping".format(date=version.last_modified)
    else:
        if not os.path.exists(file_prefix):
            # Open relative to our working path
            fp = open(file_prefix, "w")
            version.get_file(fp, version_id=version.version_id)
            fp.close()
        else:
            print " WARNING: Already exists at restore location; skipping"