def connect(self):
    """Open the boto S3 connection for this session if none is cached.

    The connection returned by ``boto.s3.connect_to_region`` is stored on
    ``self._conn``; while that attribute is truthy the call does nothing.

    Raises:
        OperationError: if boto answers with an ``S3ResponseError`` or any
            other exception occurs while connecting.
    """
    if self._conn:
        return
    try:
        logging.debug("Connecting to AWS S3 with Access Key: %s" % self.access_key)
        self._conn = boto.s3.connect_to_region(
            self.region,
            aws_access_key_id=self.access_key,
            aws_secret_access_key=self.secret_key,
            is_secure=self.secure,
            calling_format=self.calling_format
        )
        logging.debug("Successfully connected to AWS S3 with Access Key: %s" % self.access_key)
    except boto.exception.S3ResponseError as e:
        if self.is_forbidden_error(e):
            logging.error("Not authorized to connect to AWS S3 with Access Key: %s!" % self.access_key)
        else:
            logging.error("Cannot connect to AWS S3 with Access Key: %s!" % self.access_key)
        # The error must be raised: returning the exception object would make
        # a failed connection look like a successful call to the caller.
        raise OperationError(e)
    except Exception as e:
        logging.error("Cannot connect to AWS S3 with Access Key: %s!" % self.access_key)
        raise OperationError(e)
def __init__(self, manager, config, timer, base_dir, backup_dir, **kwargs):
    """Set up the Zabbix notifier from the ``notify.zabbix`` config section.

    After the base-class initialiser runs, the server, port, use_config, key
    and node settings are copied onto the instance and a ``ZabbixSender`` is
    created.

    Raises:
        OperationError: if the ``key`` setting is empty, or if constructing
            the ``ZabbixSender`` fails.
    """
    super(Zabbix, self).__init__(self.__class__.__name__, manager, config, timer, base_dir, backup_dir, **kwargs)

    zabbix_config = self.config.notify.zabbix
    self.server = zabbix_config.server
    self.port = zabbix_config.port
    self.use_config = zabbix_config.use_config
    self.key = zabbix_config.key
    self.nodename = zabbix_config.node
    # NOTE(review): presumably the status values reported to Zabbix - confirm.
    self.success = 0
    self.failed = 2

    # Settings that must be non-empty before a sender can be created.
    for required in ('key',):
        if not getattr(self, required):
            raise OperationError('Zabbix notifier module requires attribute: %s!' % required)

    try:
        self.notifier = ZabbixSender(
            use_config=self._use_config(),
            zabbix_server=self.server,
            zabbix_port=self.port,
        )
    except Exception as e:
        logging.error("Error initiating ZabbixSender! Error: %s" % e)
        raise OperationError(e)
def wait(self):
    """Block until every thread in ``self.worker_threads`` has exited.

    Finished threads are removed from ``self.worker_threads``; those whose
    ``exitcode`` is 0 count as completed. When all started threads completed,
    the timer named ``self.timer_name`` is stopped.

    Raises:
        OperationError: if ``self.backup_stop`` is set while waiting, or if
            at least one thread exited with a non-zero exit code.
    """
    completed = 0
    start_threads = len(self.worker_threads)

    # wait for all threads to finish
    logging.debug("Waiting for %d oplog threads to finish" % start_threads)
    while len(self.worker_threads) > 0:
        if self.backup_stop and self.backup_stop.is_set():
            logging.error("Received backup stop event due to error(s), stopping backup!")
            raise OperationError("Received backup stop event due to error(s)")
        # Walk a snapshot: removing from the list being iterated would make
        # the iterator skip the element following each removed thread.
        for thread in list(self.worker_threads):
            if not thread.is_alive():
                logging.debug("Thread %s exited with code %d" % (thread, thread.exitcode))
                if thread.exitcode == 0:
                    completed += 1
                self.worker_threads.remove(thread)
            else:
                logging.debug("Waiting for %s to finish" % thread.name)
        # Only pause when there is still something left to wait for.
        if self.worker_threads:
            sleep(1)

    # check if all threads completed
    if completed == start_threads:
        logging.info("All oplog threads completed successfully")
        self.timer.stop(self.timer_name)
    else:
        raise OperationError("%d oplog getter threads failed to complete successfully!" % (start_threads - completed))
def wait(self):
    """Block until every thread in ``self.dump_threads`` has exited.

    Finished threads are removed from ``self.dump_threads``; those whose
    ``exitcode`` is 0 count as completed. Afterwards the summaries are
    gathered via ``self.get_summaries()`` and, when all threads completed,
    the timer named ``self.timer_name`` is stopped.

    Raises:
        OperationError: if ``self.backup_stop`` is set while waiting, or if
            not every thread exited with code 0.
    """
    completed = 0
    start_threads = len(self.dump_threads)

    # wait for all threads to finish
    while len(self.dump_threads) > 0:
        if self.backup_stop and self.backup_stop.is_set():
            logging.error("Received backup stop event due to error(s), stopping backup!")
            raise OperationError("Received backup stop event due to error(s)")
        # Walk a snapshot: removing from the list being iterated would make
        # the iterator skip the element following each removed thread.
        for thread in list(self.dump_threads):
            if not thread.is_alive():
                if thread.exitcode == 0:
                    completed += 1
                self.dump_threads.remove(thread)
        sleep(0.5)

    # sleep for 3 sec to fix logging order before gathering summaries
    sleep(3)
    self.get_summaries()

    # check if all threads completed
    if completed == start_threads:
        logging.info("All mongodump backups completed successfully")
        self.timer.stop(self.timer_name)
    else:
        raise OperationError("Not all mongodump threads completed successfully!")
def run(self):
    """Tail the oplog at ``self.uri`` and hand every new entry to the oplog writer.

    Runs until ``self.tail_stop`` or ``self.backup_stop`` is set. For each
    document read from the cursor the counters ``self.count``,
    ``self.first_ts`` and ``self.last_ts`` are updated and published through
    ``self.state.set``.

    Raises:
        OperationError: on ``NotMasterError``, on ``CursorNotFound``, or when
            the retry counter exceeds ``self._tail_retry_max``; in each case
            ``self.backup_stop`` is set first.
    """
    # NOTE(review): this outer ``try`` has no ``except``/``finally`` clause in
    # the text visible here; its handlers presumably follow in the full file.
    try:
        logging.info("Tailing oplog on %s for changes" % self.uri)
        self.timer.start(self.timer_name)
        self.state.set('running', True)
        self.connect()
        oplog = self.oplog()
        # Outer loop: (re)open a cursor starting at the last timestamp seen.
        while not self.tail_stop.is_set() and not self.backup_stop.is_set():
            try:
                self._cursor = self.db.get_oplog_cursor_since(self.__class__, self.last_ts)
                # Inner loop: drain the cursor for as long as check_cursor() allows.
                while self.check_cursor():
                    try:
                        # get the next oplog doc and write it
                        doc = self._cursor.next()
                        # Skip entries at or before the last timestamp already written.
                        if self.last_ts and self.last_ts >= doc['ts']:
                            continue
                        oplog.add(doc)
                        # update states
                        self.count += 1
                        self.last_ts = doc['ts']
                        if self.first_ts is None:
                            self.first_ts = self.last_ts
                        update = {
                            'count': self.count,
                            'first_ts': self.first_ts,
                            'last_ts': self.last_ts
                        }
                        self.state.set(None, update, True)
                        # print status report every N seconds
                        self.status()
                    except NotMasterError:
                        # pymongo.errors.NotMasterError means a RECOVERING-state when connected to secondary (which should be true)
                        self.backup_stop.set()
                        logging.error("Node %s is in RECOVERING state! Stopping tailer thread" % self.uri)
                        raise OperationError("Node %s is in RECOVERING state! Stopping tailer thread" % self.uri)
                    except CursorNotFound:
                        self.backup_stop.set()
                        logging.error("Cursor disappeared on server %s! Stopping tailer thread" % self.uri)
                        raise OperationError("Cursor disappeared on server %s! Stopping tailer thread" % self.uri)
                    except (AutoReconnect, ConnectionFailure, ExceededMaxWaiters, ExecutionTimeout, NetworkTimeout), e:
                        # Connection-level errors are retried until the retry budget is exceeded.
                        logging.error("Tailer %s received %s exception: %s. Attempting retry" % (self.uri, type(e).__name__, e))
                        if self._tail_retry > self._tail_retry_max:
                            self.backup_stop.set()
                            logging.error("Reconnected to %s %i/%i times, stopping backup!" % (self.uri, self._tail_retry, self._tail_retry_max))
                            raise OperationError("Reconnected to %s %i/%i times, stopping backup!"
                                                 % (self.uri, self._tail_retry, self._tail_retry_max))
                        self._tail_retry += 1
                    except StopIteration:
                        # Nothing to read right now; go back and re-check the cursor.
                        continue
                sleep(1)
            finally:
                # Close the cursor whether the inner loop ended normally or raised.
                if self._cursor:
                    logging.info("Stopping oplog cursor on %s" % self.uri)
                    self._cursor.close()
def init(self):
    """Check the local prerequisites for an rsync upload.

    Raises:
        OperationError: if ``self.host_has_rsync()`` is false, or if
            ``self.backup_dir`` is not an existing directory.
    """
    if not self.host_has_rsync():
        raise OperationError("Cannot find rsync binary on this host!")

    if os.path.isdir(self.backup_dir):
        return

    problem = "The source directory: %s does not exist or is not a directory! Skipping Rsync upload!" % self.backup_dir
    logging.error(problem)
    raise OperationError(problem)
def get_bucket(self, bucket_name):
    """Return the bucket object ``self._conn.get_bucket(bucket_name)`` yields.

    Raises:
        OperationError: wrapping any exception raised during the lookup.
    """
    try:
        logging.debug("Connecting to AWS S3 Bucket: %s" % bucket_name)
        bucket = self._conn.get_bucket(bucket_name)
    except Exception as e:
        logging.error("Cannot connect to AWS S3 Bucket: %s!" % bucket_name)
        raise OperationError(e)
    return bucket
def parse(self):
    """Parse ``self.url`` into a replica set name and a list of addresses.

    Two forms are handled:

    * ``mongodb+srv://...`` URIs: only the ``replicaSet=`` option is read;
      ``self.replset`` and ``self.srv`` are set and no addresses are built.
    * ``[replset/]host[:port][,host[:port]...]``: one ``MongoAddr`` per host
      is appended to ``self.addrs`` after the host name has been validated.

    Returns:
        True when parsing succeeded.

    Raises:
        OperationError: if a ``mongodb+srv://`` URI has no ``replicaSet=``
            option.
    """
    # allow mongodb+srv:// URI
    if self.url.startswith("mongodb+srv://"):
        # Stop at '&' so options following replicaSet (e.g. "&ssl=true") do
        # not end up inside the replica set name; a greedy \S+ swallows them.
        rs_search = re.search(r'replicaSet=([^&\s]+)', self.url)
        if not rs_search:
            raise OperationError("replicaSet=X flag required when using mongodb+srv:// URI")
        self.replset = rs_search.group(1)
        self.srv = True
        return True

    if "/" in self.url:
        self.replset, self.url = self.url.split("/")

    for url in self.url.split(","):
        addr = MongoAddr()
        addr.replset = self.replset
        if ":" in url:
            addr.host, addr.port = url.split(":")
            addr.port = int(addr.port)
        else:
            addr.host = url
        # Fall back to the default port when none was given.
        if not addr.port:
            addr.port = self.default_port
        validate_hostname(addr.host)
        self.addrs.append(addr)
    return True
def run(self):
    """Start one oplog tailer thread per entry in ``self.replsets``.

    Does nothing when ``self.enabled()`` is false. Otherwise, for each shard
    a secondary is chosen, a ``TailThread`` is created and started, and the
    call blocks until that thread's state reports ``running``.

    Raises:
        OperationError: if a tailer thread reports a non-zero exit code
            before its state becomes ``running``.
    """
    if not self.enabled():
        logging.info("Oplog tailer is disabled, skipping")
        return

    logging.info("Starting oplog tailers on all replica sets (options: compression=%s, status_secs=%i)" % (self.compression(), self.status_secs))
    self.timer.start(self.timer_name)

    for shard in self.replsets:
        stop_event = Event()
        uri = self.replsets[shard].find_secondary()['uri']
        oplog_path = self.prepare_oplog_files(uri.replset)
        state = OplogState(self.manager, uri, oplog_path)
        tailer = TailThread(
            self.backup_stop,
            stop_event,
            uri,
            self.config,
            self.timer,
            oplog_path,
            state,
            self.do_gzip()
        )
        self.shards[shard] = {
            'stop': stop_event,
            'thread': tailer,
            'state': state
        }
        self.shards[shard]['thread'].start()

        # Poll until the tailer marks itself as running, failing fast if it
        # has already exited with an error.
        while not state.get('running'):
            if self.shards[shard]['thread'].exitcode:
                raise OperationError("Oplog tailer for %s failed with exit code %i!" % (uri, self.shards[shard]['thread'].exitcode))
            sleep(0.5)
def run(self):
    """Upload ``self.file_path`` to Google Cloud Storage unless already there.

    The upload is skipped when the remote path exists and its checksum
    (``self.gs_md5hash()``) equals the local one (``self.file_md5hash()``).
    After a successful upload ``self.success()`` is called.

    Raises:
        OperationError: wrapping any exception raised while checking or
            uploading.
    """
    try:
        self.configure()
        if self.exists():
            # Fetch the remote checksum once and reuse it below.
            remote_md5 = self.gs_md5hash()
            if remote_md5 and self.file_md5hash() == remote_md5:
                logging.debug("Path %s already exists with the same checksum (%s), skipping" % (self.path, remote_md5))
                return
            logging.debug("Path %s checksum and local checksum differ, re-uploading" % self.path)
        else:
            logging.debug("Path %s does not exist, uploading" % self.path)

        # Binary mode: the file's bytes must be sent unmodified; text mode
        # may translate newlines or decode the content.
        with open(self.file_path, 'rb') as f:
            uri = self.get_uri()
            logging.info("Uploading %s to Google Cloud Storage" % self.path)
            uri.new_key().set_contents_from_file(f)
        self.success()
    except Exception as e:
        logging.error("Uploading to Google Cloud Storage failed! Error: %s" % e)
        raise OperationError(e)
def __init__(self, manager, config, timer, base_dir, backup_dir, **kwargs):
    """Set up the S3 uploader from the ``upload`` / ``upload.s3`` config.

    Copies the upload settings onto the instance and creates the
    ``S3UploadPool`` used for the transfers.

    Raises:
        OperationError: if the access key, secret key or region is ``None``.
    """
    super(S3, self).__init__(self.__class__.__name__, manager, config, timer, base_dir, backup_dir, **kwargs)

    upload_config = self.config.upload
    self.remove_uploaded = upload_config.remove_uploaded
    self.retries = upload_config.retries
    self.thread_count = upload_config.threads

    s3_config = self.config.upload.s3
    self.region = s3_config.region
    # These settings fall back to None when absent from the config.
    self.bucket_name = getattr(s3_config, 'bucket_name', None)
    self.bucket_prefix = getattr(s3_config, 'bucket_prefix', None)
    self.bucket_explicit_key = getattr(s3_config, 'bucket_explicit_key', None)
    self.access_key = getattr(s3_config, 'access_key', None)
    self.secret_key = getattr(s3_config, 'secret_key', None)
    self.chunk_size_mb = s3_config.chunk_size_mb
    # Chunk size converted from megabytes to bytes.
    self.chunk_size = self.chunk_size_mb * 1024 * 1024
    self.s3_acl = s3_config.acl
    self.key_prefix = base_dir
    self._pool = None

    credentials = (self.access_key, self.secret_key, self.region)
    if None in credentials:
        raise OperationError("Invalid or missing AWS S3 access key, secret key or region detected!")

    self._pool = S3UploadPool(
        self.bucket_name,
        self.region,
        self.access_key,
        self.secret_key,
        self.thread_count,
        self.remove_uploaded,
        self.chunk_size,
        self.s3_acl
    )
def __init__(self, bucket_name, region, access_key, secret_key, threads=4, remove_uploaded=False,
             chunk_bytes=50 * 1024 * 1024, key_acl=None):
    """Create the upload pool and connect to the target S3 bucket.

    Args:
        bucket_name: name of the bucket to upload into.
        region: region handed to ``S3Session``.
        access_key: access key handed to ``S3Session``.
        secret_key: secret key handed to ``S3Session``.
        threads: number of processes given to ``Pool``.
        remove_uploaded: stored on the instance for later use.
        chunk_bytes: stored on the instance; defaults to 50 MiB.
        key_acl: stored on the instance for later use.

    Raises:
        OperationError: wrapping any exception raised while creating the
            session or fetching the bucket.
    """
    # Connection parameters.
    self.bucket_name = bucket_name
    self.region = region
    self.access_key = access_key
    self.secret_key = secret_key

    # Upload behaviour.
    self.threads = threads
    self.remove_uploaded = remove_uploaded
    self.chunk_bytes = chunk_bytes
    self.key_acl = key_acl
    self.multipart_min_bytes = 5 * 1024 * 1024

    # Internal bookkeeping.
    self._closed = False
    self._uploads = {}
    self._mp_uploads = {}
    self._pool = Pool(processes=self.threads)

    try:
        self.s3_conn = S3Session(self.region, self.access_key, self.secret_key, self.bucket_name)
        self.bucket = self.s3_conn.get_bucket(self.bucket_name)
    except Exception as e:
        raise OperationError(e)
def __init__(self, config, db):
    """Store the replication settings and obtain a connection from ``db``.

    Args:
        config: configuration object; its ``replication`` section supplies
            the lag, priority and hidden-only settings.
        db: must be an instance of ``DB``; its ``connection()`` result is
            stored on ``self.connection``.

    Raises:
        OperationError: if ``db`` is not a ``DB`` instance or getting the
            connection fails.
    """
    self.config = config
    self.db = db

    replication = self.config.replication
    self.max_lag_secs = replication.max_lag_secs
    self.min_priority = replication.min_priority
    self.max_priority = replication.max_priority
    self.hidden_only = replication.hidden_only

    self.hidden_weight = 0.20
    self.pri0_weight = 0.10
    self.replset = True

    # Values filled in later by other methods.
    self.rs_config = None
    self.rs_status = None
    self.primary = None
    self.secondary = None
    self.mongo_config = None
    self.replset_summary = {}

    # Get a DB connection
    try:
        if not isinstance(self.db, DB):
            raise Error("'db' field is not an instance of class: 'DB'!")
        self.connection = self.db.connection()
    except Exception as e:
        logging.fatal("Could not get DB connection! Error: %s" % e)
        raise OperationError(e)
def done(self, done_dir):
    """Mark ``done_dir`` as finished by removing it from ``self._pooled``.

    Raises:
        OperationError: if ``done_dir`` is not in ``self._pooled``.
    """
    if done_dir not in self._pooled:
        raise OperationError("Unexpected response from tar thread: %s" % done_dir)
    logging.debug("Archiving completed for: %s" % done_dir)
    self._pooled.remove(done_dir)
def run(self):
    """Upload every entry of ``self.backup_dir`` to the rsync destination.

    After ``self.init()`` and ``self.prepare_dest_dir()``, one
    ``RsyncUploadThread`` per directory entry is submitted to ``self._pool``
    with ``self.done`` as callback, and ``self.wait()`` is called.

    Raises:
        OperationError: wrapping any exception raised along the way.
    """
    try:
        self.init()
        self.timer.start(self.timer_name)

        logging.info("Preparing destination path on %s" % self.rsync_host)
        self.prepare_dest_dir()

        # Settings shown in the start-up log line only.
        destination = "%s@%s:%s" % (self.rsync_user, self.rsync_host, self.rsync_path)
        rsync_config = {"dest": destination, "threads": self.threads(), "retries": self.retries}
        rsync_config.update(self.rsync_info())
        logging.info("Starting upload using rsync version %s (%s)" % (self.rsync_info()['version'], config_to_string(rsync_config)))

        for child in os.listdir(self.backup_dir):
            uploader = RsyncUploadThread(
                os.path.join(self.backup_dir, child),
                self.base_dir,
                self.rsync_flags,
                self.rsync_path,
                self.rsync_user,
                self.rsync_host,
                self.rsync_port,
                self.rsync_ssh_key,
                self.remove_uploaded,
                self.retries
            )
            self._pool.apply_async(uploader.run, callback=self.done)
        self.wait()
    except Exception as e:
        logging.error("Rsync upload failed! Error: %s" % e)
        raise OperationError(e)
def run(self):
    """Upload all backup files to Google Cloud Storage through the pool.

    Returns without uploading (after logging an error) when
    ``self.backup_dir`` is not a directory. Otherwise one ``GsUploadThread``
    per file from ``self.get_backup_files()`` is submitted to ``self._pool``,
    the pool is closed and joined, and ``self.exit_code`` / ``self.completed``
    are set.

    Raises:
        OperationError: wrapping any exception raised during the upload.
    """
    source_is_dir = os.path.isdir(self.backup_dir)
    if not source_is_dir:
        logging.error("The source directory: %s does not exist or is not a directory! Skipping Google Cloud Storage upload!" % self.backup_dir)
        return

    try:
        self.running = True
        self.timer.start(self.timer_name)
        logging.info("Uploading %s to Google Cloud Storage (bucket=%s, threads=%i)" % (self.base_dir, self.bucket, self.threads()))

        for file_path in self.get_backup_files():
            # Remote path is the file's path relative to the backup location.
            gs_path = os.path.relpath(file_path, self.backup_location)
            uploader = GsUploadThread(
                self.backup_dir,
                file_path,
                gs_path,
                self.bucket,
                self.project_id,
                self.access_key,
                self.secret_key,
                self.remove_uploaded,
                self.retries
            )
            self._pool.apply_async(uploader.run)

        self._pool.close()
        self._pool.join()
        self.exit_code = 0
        self.completed = True
    except Exception as e:
        logging.error("Uploading to Google Cloud Storage failed! Error: %s" % e)
        raise OperationError(e)
def find_primary(self, force=False, quiet=False):
    """Return the cached PRIMARY member, looking it up when needed.

    A lookup happens when ``force`` is true or nothing is cached. Every
    member of the replica set status whose ``stateStr`` is ``PRIMARY`` and
    whose ``health`` is above 0 is stored as ``self.primary`` (a dict with
    ``uri`` and ``optime``) and recorded in ``self.replset_summary``.

    Args:
        force: look the primary up again even if one is cached.
        quiet: passed to ``get_rs_status``; also suppresses the "Found
            PRIMARY" log line once a primary is already known.

    Raises:
        OperationError: if no primary is known after the lookup.
    """
    if not force and self.primary:
        return self.primary

    rs_status = self.get_rs_status(force, quiet)
    rs_name = rs_status['set']
    for member in rs_status['members']:
        if member['stateStr'] != 'PRIMARY' or not member['health'] > 0:
            continue
        member_uri = MongoUri(member['name'], 27017, rs_name)
        # 'optime' is either the timestamp itself or a dict carrying it under 'ts'.
        optime = member['optime']
        has_ts_field = isinstance(member['optime'], dict) and 'ts' in member['optime']
        optime_ts = member['optime']['ts'] if has_ts_field else optime
        if quiet == False or not self.primary:  # noqa: E712 - exact comparison kept on purpose
            logging.info("Found PRIMARY: %s with optime %s" % (member_uri, str(optime_ts)))
        self.primary = {'uri': member_uri, 'optime': optime_ts}
        self.replset_summary['primary'] = {"member": member, "uri": member_uri.str()}

    if self.primary is None:
        failure = "Unable to locate a PRIMARY member for replset %s, giving up" % rs_name
        logging.error(failure)
        raise OperationError(failure)
    return self.primary
def get_rs_config_member(self, member, force=False, quiet=False):
    """Return the replica set config entry whose ``host`` equals ``member['name']``.

    Args:
        member: dict expected to carry a ``name`` key.
        force: passed through to ``get_rs_config``.
        quiet: passed through to ``get_rs_config``.

    Raises:
        OperationError: if ``member`` has no ``name`` or no config member
            has a matching ``host``.
    """
    rs_config = self.get_rs_config(force, quiet)
    not_found = "Member does not exist in mongo config!"
    if 'name' not in member:
        raise OperationError(not_found)
    for candidate in rs_config['members']:
        if candidate['host'] == member['name']:
            return candidate
    raise OperationError(not_found)
def validate_hostname(hostname):
    """Check that ``hostname`` resolves, ignoring a trailing ``:port``.

    Args:
        hostname: ``host`` or ``host:port``.

    Raises:
        OperationError: if ``socket.gethostbyname`` fails for the host.
    """
    host = hostname
    try:
        if ":" in host:
            # Drop the port; only the host part is resolved.
            host, _port = host.split(":")
        socket.gethostbyname(host)
    except socket.error as e:
        raise OperationError("Could not resolve host '%s', error: %s" % (host, e))
def choose_compression(self):
    """Reconcile the requested compression with what mongodump supports.

    When ``self.can_compress()`` is true and the setting is ``auto``, the
    setting is switched to ``gzip``.

    Raises:
        OperationError: if ``gzip`` is requested but ``self.can_compress()``
            is false.
    """
    gzip_supported = self.can_compress()
    requested = self.compression()
    if gzip_supported and requested == 'auto':
        logging.info("Mongodump binary supports gzip compression, auto-enabling gzip compression")
        self.compression('gzip')
    elif not gzip_supported and requested == 'gzip':
        raise OperationError("mongodump gzip compression requested on binary that does not support gzip!")
def can_compress(self):
    """Return True when the mongodump binary is version 3.2.0 or newer.

    The version is read from ``self.version`` and compared numerically, so
    that e.g. "3.10.0" is correctly treated as newer than "3.2.0" (a
    comparison of the string components would order "10" before "2").

    Returns:
        True if the version is at least 3.2.0, otherwise False.

    Raises:
        OperationError: if ``self.binary`` is not an executable file.
    """
    # Local import: the block cannot rely on the module already importing re.
    import re

    if not (os.path.isfile(self.binary) and os.access(self.binary, os.X_OK)):
        raise OperationError("Cannot find or execute the mongodump binary file %s!" % self.binary)

    def component(part):
        # First run of digits in the component; 0 when there is none.
        found = re.search(r"\d+", part)
        return int(found.group()) if found else 0

    version = [component(part) for part in self.version.split(".")]
    # Pad short versions ("3.2") so they compare like "3.2.0".
    while len(version) < 3:
        version.append(0)

    if tuple(version) >= (3, 2, 0):
        logging.debug("Mongodump binary supports gzip compression")
        return True
    return False
def write(self, file_name):
    """Write ``self._state`` as JSON to ``file_name``.

    The file is opened with a context manager so the handle is flushed and
    closed even when serialising or writing fails.

    Args:
        file_name: path of the file to (over)write.

    Raises:
        OperationError: wrapping any exception raised while writing.
    """
    try:
        with open(file_name, "w+") as f:
            f.write(json.dumps(self._state))
    except Exception as e:
        logging.debug("Writing oplog state to file: '%s'! Error: %s" % (self.oplog_file, e))
        raise OperationError(e)
def get_file_size(self, file_name):
    """Return the size in bytes of ``file_name``.

    Raises:
        OperationError: if ``file_name`` is not an existing regular file.
    """
    if not os.path.isfile(file_name):
        logging.error("Upload file does not exist (or is not a file): %s" % file_name)
        raise OperationError("Upload file does not exist (or is not a file)!")
    file_stat = os.stat(file_name)
    return file_stat.st_size
def get_rs_status(self, force=False, quiet=False):
    """Return the replica set status, fetching it when needed.

    The ``replSetGetStatus`` admin command is run when ``force`` is true or
    nothing is cached; its result is cached on ``self.rs_status`` and
    recorded in ``self.replset_summary['status']``.

    Args:
        force: fetch again even if a status is cached.
        quiet: passed through to ``self.db.admin_command``.

    Raises:
        OperationError: wrapping any exception raised while fetching.
    """
    try:
        if not force and self.rs_status:
            return self.rs_status
        self.rs_status = self.db.admin_command('replSetGetStatus', quiet)
        self.replset_summary['status'] = self.rs_status
        return self.rs_status
    except Exception as e:
        logging.fatal("Error getting replica set status! Error: %s" % e)
        raise OperationError(e)
def run(self):
    """Run mongodump against one secondary per shard, then the config server.

    One ``MongodumpThread`` is created per entry of ``self.replsets``; all of
    them are started and ``self.wait()`` is called. If ``self.sharding``
    yields a config server described by a dict, a single further thread
    dumps it and is joined directly.

    Returns:
        ``self._summary``.

    Raises:
        OperationError: if no dump thread could be created.
    """
    def new_dump_thread(state, uri):
        # Every dump thread differs only in its state object and target URI.
        # NOTE(review): ``self.do_gzip`` is passed without being called here - confirm intended.
        return MongodumpThread(
            state,
            uri,
            self.timer,
            self.user,
            self.password,
            self.authdb,
            self.backup_dir,
            self.binary,
            self.threads(),
            self.do_gzip,
            self.verbose
        )

    self.timer.start(self.timer_name)

    # backup a secondary from each shard:
    for shard in self.replsets:
        shard_uri = self.replsets[shard].find_secondary()['uri']
        self.states[shard] = OplogState(self.manager, shard_uri)
        self.dump_threads.append(new_dump_thread(self.states[shard], shard_uri))

    if len(self.dump_threads) <= 0:
        raise OperationError('No backup threads started!')

    logging.info("Starting backups using mongodump %s (options: compression=%s, threads_per_dump=%i)" % (self.version, self.compression(), self.threads()))
    for dump_thread in self.dump_threads:
        dump_thread.start()
    self.wait()

    # backup a single sccc/non-replset config server, if exists:
    if self.sharding:
        config_server = self.sharding.get_config_server()
        if config_server and isinstance(config_server, dict):
            logging.info("Using non-replset backup method for config server mongodump")
            config_uri = MongoUri(config_server['host'], 27019, 'configsvr')
            self.states['configsvr'] = OplogState(self.manager, config_uri)
            self.dump_threads = [new_dump_thread(self.states['configsvr'], config_uri)]
            self.dump_threads[0].start()
            self.dump_threads[0].join()

    self.completed = True
    return self._summary
def run(self):
    """Fetch, per shard, the oplog entries written since that shard's backup.

    Does nothing when ``self.enabled()`` is false. Otherwise the latest
    ``last_ts`` over all shards in ``self.backup_summary`` is determined and
    one ``SimpleOplogGetterThread`` per shard is started to collect changes
    from that shard's own ``last_ts`` up to the latest one. The call then
    waits for all threads through ``self.wait()``.

    Returns:
        ``self._summary``, or ``None`` when the getter is disabled.

    Raises:
        OperationError: if ``self.backup_summary`` is empty.
    """
    if not self.enabled():
        logging.info("Oplog getter is disabled, skipping")
        return

    logging.info("Starting oplog getter for all replica sets (options: compression=%s, status_secs=%i)" % (self.compression(), self.status_secs))
    self.timer.start(self.timer_name)

    if not len(self.backup_summary):
        raise OperationError("Oplogs cannot gathered without a successful backup first.")

    # Determine the time when the last shard completed its backup, because we need all changes
    # across all other shards since whenever they finished until then
    logging.debug("Finding latest finished backup timestamp")
    need_changes_until_ts = None
    for shard in self.replsets:
        shard_ts = self.backup_summary[shard].get('last_ts')
        logging.debug("Shard %s's has changes up to %s" % (shard, shard_ts))
        if need_changes_until_ts is None or shard_ts > need_changes_until_ts:
            need_changes_until_ts = shard_ts
    logging.info("Getting oplogs for all shards up to %s" % need_changes_until_ts)

    for shard in self.replsets:
        stop_event = Event()
        uri = self.replsets[shard].find_secondary()['uri']
        shard_name = uri.replset
        # Each shard only needs the changes made after its own backup ended.
        since_ts = self.backup_summary[shard].get('last_ts')
        oplog_path = self.prepare_oplog_files(shard_name)
        state = OplogState(self.manager, uri, oplog_path)
        getter = SimpleOplogGetterThread(
            self.backup_stop,
            stop_event,
            uri,
            self.config,
            self.timer,
            oplog_path,
            state,
            self.do_gzip(),
            since_ts,
            need_changes_until_ts
        )
        self.shards[shard] = {
            'stop': stop_event,
            'thread': getter,
            'state': state
        }
        self.worker_threads.append(getter)
        logging.debug("Starting thread %s to write %s oplog to %s" % (getter.name, uri, oplog_path))
        getter.start()

    # Wait for all threads to complete; wait() raises when any of them did
    # not complete normally.
    self.wait()

    self.completed = True
    self.stopped = True
    self.get_summaries()
    return self._summary
def get_mongo_config(self, force=False, quiet=False):
    """Return the server's parsed command-line options, fetching when needed.

    The ``getCmdLineOpts`` admin command is run when ``force`` is true or
    nothing is cached. If its result has a ``parsed`` entry, that entry is
    cached on ``self.mongo_config`` and recorded in
    ``self.replset_summary['mongo_config']``.

    Args:
        force: fetch again even if a config is cached.
        quiet: passed through to ``self.db.admin_command``.

    Raises:
        OperationError: if the command fails with
            ``pymongo.errors.OperationFailure``.
    """
    try:
        if not force and self.mongo_config:
            return self.mongo_config
        cmdline_opts = self.db.admin_command('getCmdLineOpts', quiet)
        if 'parsed' in cmdline_opts:
            self.mongo_config = cmdline_opts['parsed']
            self.replset_summary['mongo_config'] = self.mongo_config
        return self.mongo_config
    except pymongo.errors.OperationFailure as e:
        raise OperationError("Error getting mongo config! Error: %s" % e)
def init(self):
    """Look up the class implementing ``self.task`` among the loaded modules.

    Returns immediately when the task is ``"none"``. Otherwise the module
    ``"<stage>.<Task>"`` is read from ``sys.modules`` and the attribute named
    ``<Task>`` is taken from it, where ``<Task>`` is the capitalised task
    name.

    Raises:
        OperationError: if the module lookup fails with a ``LookupError``.
    """
    mod_class = None
    if self.task == "none":
        logging.info("%s stage disabled, skipping" % self.stage_name)
        return

    class_name = self.task.capitalize()
    module_name = "%s.%s" % (self.stage, class_name)
    try:
        module = sys.modules[module_name]
        # NOTE(review): nothing visible here uses mod_class after this lookup.
        mod_class = getattr(module, class_name)
    except LookupError as e:
        raise OperationError('Could not load task %s: %s' % (self.task, e))
def __init__(self, bucket_name, region, access_key, secret_key, file_name, key_name, byte_count, target_bandwidth,
             multipart_id=None, multipart_num=None, multipart_parts=None, multipart_offset=None, retries=5,
             secure=True, retry_sleep_secs=1):
    """Prepare one upload job: store its settings and connect to the bucket.

    All arguments are stored on the instance under the same names. A
    progress bar is created whose label is the shortened key name, followed
    by ``"<num>/<parts>"`` for multipart uploads, and whose maximum is
    ``byte_count`` expressed in MiB.

    Raises:
        OperationError: if the S3 session or bucket cannot be obtained.
    """
    # Target and credentials.
    self.bucket_name = bucket_name
    self.region = region
    self.access_key = access_key
    self.secret_key = secret_key

    # What to upload.
    self.file_name = file_name
    self.key_name = key_name
    self.byte_count = byte_count
    self.target_bandwidth = target_bandwidth

    # Multipart details (all None for a single-part upload).
    self.multipart_id = multipart_id
    self.multipart_num = multipart_num
    self.multipart_parts = multipart_parts
    self.multipart_offset = multipart_offset

    # Connection behaviour.
    self.retries = retries
    self.secure = secure
    self.retry_sleep_secs = retry_sleep_secs
    self.do_stop = False

    if self.target_bandwidth is not None:
        logging.debug("Target bandwidth: %.2f" % self.target_bandwidth)

    label = self.short_key_name(self.key_name)
    if self.multipart_num and self.multipart_parts:
        label = "%s %d/%d" % (self.short_key_name(self.key_name), self.multipart_num, self.multipart_parts)
    size_mb = float(self.byte_count / 1024.00 / 1024.00)
    self._progress = S3ProgressBar(label, max=size_mb)
    self._last_bytes = None
    self._last_status_ts = None

    try:
        self.s3_conn = S3Session(self.region, self.access_key, self.secret_key, self.bucket_name, self.secure, self.retries)
        self.bucket = self.s3_conn.get_bucket(self.bucket_name)
    except Exception as e:
        logging.fatal("Could not get AWS S3 connection to bucket %s! Error: %s" % (self.bucket_name, e))
        raise OperationError("Could not get AWS S3 connection to bucket")
def get_rs_config(self, force=False, quiet=False):
    """Return the replica set configuration, fetching it when needed.

    A fetch happens when ``force`` is true or nothing is cached. When
    ``self.db.server_version()`` is at least ``("3", "0", "0")`` the
    ``replSetGetConfig`` admin command is used; otherwise the document is
    read from ``local.system.replset``. The result is cached on
    ``self.rs_config`` and recorded in ``self.replset_summary['config']``.

    Args:
        force: fetch again even if a config is cached.
        quiet: passed through to ``self.db.admin_command``.

    Returns:
        The replica set config. Callers such as ``get_rs_config_member``
        index the return value, so it must not be ``None`` on success.

    Raises:
        OperationError: if the lookup fails with
            ``pymongo.errors.OperationFailure``.
    """
    if force or not self.rs_config:
        try:
            # NOTE(review): this compares against a tuple of strings; if
            # server_version() also returns strings, components of 10 or more
            # compare lexicographically - confirm against DB.server_version.
            if self.db.server_version() >= tuple("3.0.0".split(".")):
                output = self.db.admin_command('replSetGetConfig', quiet)
                self.rs_config = output['config']
            else:
                self.rs_config = self.connection['local'].system.replset.find_one()
            self.replset_summary['config'] = self.rs_config
        except pymongo.errors.OperationFailure as e:
            raise OperationError("Error getting replica set config! Error: %s" % e)
    return self.rs_config