Example #1
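This example appears to come from PyMongo's high-availability test suite; `ha_tools`, `use_greenlets`, `self.seed`, and `self.name` are assumed to be supplied by the surrounding test harness. It verifies that `alive()` on direct connections and on a `MongoReplicaSetClient` tracks member state as replica-set members are killed.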
    def test_alive(self):
        primary = ha_tools.get_primary()
        secondary = ha_tools.get_random_secondary()
        primary_cx = MongoClient(primary, use_greenlets=use_greenlets)
        secondary_cx = MongoClient(secondary, use_greenlets=use_greenlets)
        rsc = MongoReplicaSetClient(self.seed,
                                    replicaSet=self.name,
                                    use_greenlets=use_greenlets)

        try:
            self.assertTrue(primary_cx.alive())
            self.assertTrue(secondary_cx.alive())
            self.assertTrue(rsc.alive())

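            # Kill the primary; after a short wait, the direct connection to
            # it and the replica-set client should both report not alive.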
            ha_tools.kill_primary()
            time.sleep(0.5)

            self.assertFalse(primary_cx.alive())
            self.assertTrue(secondary_cx.alive())
            self.assertFalse(rsc.alive())

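            # Kill the remaining secondary as well (the 2 is assumed to be a
            # signal number passed through by ha_tools); now every connection
            # should report not alive.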
            ha_tools.kill_members([secondary], 2)
            time.sleep(0.5)

            self.assertFalse(primary_cx.alive())
            self.assertFalse(secondary_cx.alive())
            self.assertFalse(rsc.alive())
        finally:
            rsc.close()
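The class below backs up a MongoDB replica set by snapshotting the EBS volume that holds a member's `dbpath`. The `backup()` flow is: check replica-set health, pick a hidden member (or freeze a regular secondary so it cannot be elected primary), fsync-and-lock the member, snapshot its data volume, then unlock and unfreeze.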
# Imports reconstructed for this snippet; AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY are assumed to be module-level settings defined
# elsewhere.
import logging
from datetime import datetime

from boto.ec2 import connect_to_region as ec2_connect_to_region
from paramiko import AutoAddPolicy, SSHClient
from pymongo import MongoClient, MongoReplicaSetClient


class AwsMongoBackup(object):
    def __init__(self,
                 replicaset=None,
                 filters=None,
                 instance_ids=None,
                 ssh_opts=None,
                 dryrun=False,
                 region=None,
                 logger=None):
        self.creation_time = datetime.utcnow().strftime("%m-%d-%Y %H:%M:%S")

        # Fall back to a module logger so self.logger is always set.
        if logger is not None:
            self.logger = logger
        else:
            self.logger = logging.getLogger(__name__)

        if region is None:
            region = 'us-east-1'

        if AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY:
            self.ec2 = ec2_connect_to_region(
                region,
                aws_access_key_id=AWS_ACCESS_KEY_ID,
                aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
        else:
            self.ec2 = ec2_connect_to_region(region)

        if replicaset is None:
            raise RuntimeError('replicaset must be provided.')

        self.replicaset = replicaset

        if filters:
            self.instances = self._instances_via_filters(filters=filters)
        elif instance_ids:
            self.instances = self._instances_via_ids(instance_ids=instance_ids)
        else:
            raise RuntimeError('Either an API filter or a list of instance '
                               'IDs must be provided.')
        self.logger.debug("found instances %s" % self.instances)

        self.mongo = self._mongo(instances=self.instances)
        self.logger.debug("connected to mongo %s" % self.mongo)

        self.ssh_opts = ssh_opts
        self.logger.debug("set ssh opts to %s" % self.ssh_opts)

        self.dryrun = dryrun

    def _instances_via_filters(self, filters=None):
        if filters is None:
            raise ValueError('filters must be a dict of valid EC2 API filters')

        reservations = self.ec2.get_all_instances(filters=filters)

        instances = []

        for reservation in reservations:
            instances.extend(reservation.instances)

        return instances

    def _instances_via_ids(self, instance_ids=None):
        if not isinstance(instance_ids, list):
            raise ValueError('instance_ids must be provided in a list')

        raise NotImplementedError("I'll come back to this later.")

    def _mongo(self, instances, force=False):
        # Cache the client on the instance and reconnect only when forced.
        if not hasattr(self, 'mongo') or force:
            mongo_rs_str = ','.join([x.public_dns_name for x in instances])
            self.logger.debug("connecting to mongo URI %s" % mongo_rs_str)
            self.mongo = MongoReplicaSetClient(mongo_rs_str,
                                               replicaSet=self.replicaset)

            if not self.mongo.alive():
                self.mongo = None

        return self.mongo

    def _ssh(self, hostname, ssh_opts):
        self.ssh = SSHClient()
        self.ssh.set_missing_host_key_policy(AutoAddPolicy())
        self.ssh.connect(hostname=hostname, **ssh_opts)

        self.logger.debug("connected via ssh to %s" % hostname)

        return self.ssh

    def test_replicaset(self):
        test_result = True
        err_str = ''

        optime_dates = []
        rs_states = {}
        hidden_members = []

        rs_status = self.mongo.admin.command('replSetGetStatus')
        rs_member_hosts = [z[0] for z in self.mongo.hosts]

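        # Members reported by replSetGetStatus but absent from the driver's
        # host list are hidden members; remember them as backup candidates.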
        for rs_member in rs_status['members']:
            if rs_member['name'].split(':')[0] not in rs_member_hosts:
                hidden_members.append((rs_member['name'].split(':')[0],
                                       int(rs_member['name'].split(':')[1])))
            try:
                rs_states[rs_member['state']] += 1
            except KeyError:
                rs_states[rs_member['state']] = 1

            if rs_member['state'] not in [1, 2, 7]:
                # primary, secondary, arbiter
                err_str = "RS member {rs_member} has a state of {state}, "\
                    "please check RS integrity and try again."\
                    .format(
                        rs_member=rs_member['name'],
                        state=rs_member['stateStr']
                    )
                test_result = False
                return (test_result, err_str)
            self.logger.debug("member %s passed state" % rs_member['name'])

            if rs_member.get('health', 1) != 1:
                err_str = "RS member {rs_member} is marked as unhealthy, "\
                    "please check RS integrity and try again."\
                    .format(rs_member=rs_member['name'])
                test_result = False
                return (test_result, err_str)
            self.logger.debug("member %s passed health" % rs_member['name'])

            if rs_member.get('pingMs', 0) > 10:
                err_str = "ping time for RS member {rs_member} is larger than"\
                    "10ms.  Please check network connectivity and try again."\
                    .format(rs_member=rs_member['name'])
                test_result = False
                return (test_result, err_str)
            self.logger.debug("member %s passed pingMs" % rs_member['name'])

            optime_dates.append(rs_member['optimeDate'])

        self.hidden_members = hidden_members

        if (max(optime_dates) - min(optime_dates)).total_seconds() > 5:
            err_str = "optimeDates is over 5 seconds, there is too much "\
                "replication lag to continue."
            test_result = False
            return (test_result, err_str)
        self.logger.debug("passed replication lag test")

        if len(self.mongo.secondaries) + len(hidden_members) < 2:
            err_str = "There needs to be at least two secondaries or a hidden"\
                " member available to do backups.  Please check RS integrity "\
                "and try again."
            test_result = False
            return (test_result, err_str)

        self.logger.debug("mongo secondaries test passed")

        if rs_states.get(1, 0) != 1:
            err_str = "There needs to be exactly one mongo primary to"\
                " do backups.  Please check RS integrity and try again."
            test_result = False
            return (test_result, err_str)
        self.logger.debug("passed primary mongo test")

        return (test_result, err_str)

    def choose_member(self):
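        # Prefer a hidden member, which can be backed up without freezing;
        # otherwise fall back to a regular secondary.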
        if self.hidden_members:
            return self.hidden_members.pop()
        else:
            return self.mongo.secondaries.pop()

    def backup(self):
        # Test that the replica set is in a good state to perform backups
        test_result, err_str = self.test_replicaset()
        if test_result is False:
            raise RuntimeError(err_str)

        # Choose a member from which to back up
        backup_member = self.choose_member()

        self._ssh(backup_member[0], self.ssh_opts)

        # Get the instance ID
        stdin, stdout, stderr = self.ssh.exec_command(
            '/usr/bin/curl http://169.254.169.254/latest/meta-data/instance-id'
        )

        instance_id = stdout.readline().rstrip()
        self.logger.debug("Working on instance %s" % instance_id)

        reservation = self.ec2.get_all_instances(instance_ids=[
            instance_id,
        ])
        instance = reservation[0].instances[0]

        self.logger.debug("got boto ec2 instance %s" % instance)

        # Connect to the backup target directly
        backup_member_mongo = MongoClient(host=backup_member[0],
                                          port=backup_member[1])
        self.logger.debug("connected to mongo target %s" % backup_member_mongo)

        freeze_rs = True
        if backup_member_mongo.admin.command('isMaster').get('hidden', False):
            # This member is hidden so we can safely take backups without
            # doing any other maintenance work
            freeze_rs = False

        # Find what volume database data is on
        cfg = backup_member_mongo.admin.command('getCmdLineOpts')

        cfg_data_volume = cfg['parsed']['dbpath']

        self.logger.debug("found parsed dbpath of %s" % cfg_data_volume)

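        # df reports the backing device for dbpath's filesystem in its first
        # column; grep -v strips the header row.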
        stdin, stdout, stderr = self.ssh.exec_command(
            '/usr/bin/sudo /bin/df {cfg_data_volume} | '
            '/bin/grep -v "Filesystem"'.format(
                cfg_data_volume=cfg_data_volume))

        mount_info = stdout.readline().rstrip()
        mount_info = mount_info.split(' ')[0]

        self.logger.debug("working on mount %s" % mount_info)

        # Find the matching EBS volume for this mount point
        volumes = self.ec2.get_all_volumes(
            filters={'attachment.instance-id': instance_id})

        data_volume = None
        for volume in volumes:
            # There's a strange thing that happens: /dev/sdh1 can magically
            # become /dev/xvdh1 at boot time on instances.  Check for both.
            volume_mount_point = volume.attach_data.device
            if volume_mount_point == mount_info or \
                    volume_mount_point.replace('sd', 'xvd') == mount_info:
                data_volume = volume

        if data_volume is None:
            raise RuntimeError("Couldn't find EBS data volume!")

        self.logger.debug("found data volume %s" % data_volume)

        # Remove the member from the replicaset (mark as hidden)
        if freeze_rs:
            # Can probably use replSetMaintenance here but not available
            # in my testing version
            if self.dryrun:
                self.logger.debug("Would have frozen replicaset")
            else:
                self.logger.debug('Freezing replicaset')
                backup_member_mongo.admin.command({'replSetFreeze': 86400})

        else:
            self.logger.debug(
                "skipping replicaset freeze, %s is a hidden member" %
                backup_member[0])

        # Fsynclock mongo
        if self.dryrun:
            self.logger.debug("Would have fsynclocked {backup_member}".format(
                backup_member=backup_member))
        else:
            self.logger.debug("fsync/locking {backup_member}".format(
                backup_member=backup_member))
            backup_member_mongo.fsync(lock=True)

        if self.dryrun:
            self.logger.debug(
                "Would have created snapshot of volume {volume}".format(
                    volume=data_volume))
            self.current_snapshot = None
        else:
            self.logger.debug("creating snapshot of %s" % data_volume)
            snapshot = data_volume.create_snapshot(
                description="mongobackup {date} {replicaset}".format(
                    date=self.creation_time, replicaset=self.replicaset))

            self.current_snapshot = snapshot.id

            tags = {
                'replicaset': self.replicaset,
                'sourcehost': backup_member[0],
                'creation_time': self.creation_time
            }
            self.logger.debug("adding tags %s to snapshot %s" %
                              (tags, snapshot))

            self.ec2.create_tags(resource_ids=[
                snapshot.id,
            ], tags=tags)

        # Unlock mongo
        if self.dryrun:
            self.logger.debug("Would have unlocked mongo")
        else:
            if freeze_rs:
                self.logger.debug('unfreezing replicaset')
                backup_member_mongo.admin.command({'replSetFreeze': 0})

            self.logger.debug("unlocking {backup_member}".format(
                backup_member=backup_member))
            backup_member_mongo.unlock()
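A minimal usage sketch, assuming boto can find AWS credentials and that the
replica set name, EC2 tag filter, and SSH options below are hypothetical
placeholders:

    import logging

    logging.basicConfig(level=logging.DEBUG)

    backup = AwsMongoBackup(
        replicaset='rs0',                      # hypothetical replica set name
        filters={'tag:role': 'mongodb'},       # hypothetical EC2 tag filter
        ssh_opts={'username': 'ec2-user',      # forwarded to paramiko connect()
                  'key_filename': '/path/to/key.pem'},
        dryrun=True,                           # log actions without snapshotting
        region='us-east-1',
        logger=logging.getLogger('mongobackup'),
    )
    backup.backup()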