Example #1
0
    def collect(self, api):
        """
        Collect replica set metrics for the node the agent is connected to.

        Runs `replSetGetStatus` on the `admin` database, derives the replication lag, health,
        votes and state of the current node and submits them through `self._submit_payload`.
        When the current node is the primary, member state changes are reported and one
        `mongodb.replset.optime_lag` gauge is emitted per PRIMARY/SECONDARY member.
        Finally, every member's state is stored in `self.check.last_states_by_server`.

        :param api: mapping-like client; `api["admin"]` must expose a `command(name)` method.
        """
        db = api["admin"]
        status = db.command('replSetGetStatus')
        members = status.get('members', [])
        result = {}

        # Find nodes: current node (ourself) and the primary
        current = primary = None
        is_primary = False
        for member in members:
            state = member.get('state')
            # A member document without a state cannot be the primary; do not fail on int(None).
            member_is_primary = state is not None and int(state) == 1
            if member.get('self'):
                current = member
                if member_is_primary:
                    is_primary = True
            if member_is_primary:
                primary = member

        # Compute a lag time
        if current is not None and primary is not None:
            if 'optimeDate' in primary and 'optimeDate' in current:
                lag = primary['optimeDate'] - current['optimeDate']
                result['replicationLag'] = lag.total_seconds()

        if current is not None:
            result['health'] = current['health']

            # Collect the number of votes. This is only possible once the current node has been
            # identified, since its `_id` is needed to find it in the replica set configuration.
            config = db.command('replSetGetConfig')
            votes = 0
            total = 0.0
            for member in config['config']['members']:
                member_votes = member.get('votes', 1)
                total += member_votes
                if member['_id'] == current['_id']:
                    votes = member_votes
            result['votes'] = votes
            # Without any vote in the configuration the fraction is undefined; skip it rather
            # than dividing by zero.
            if total:
                result['voteFraction'] = votes / total

        result['state'] = status['myState']
        self._submit_payload({'replSet': result})
        if is_primary:
            # Submit events
            replset_name = status['set']
            self._report_replica_set_states(members, replset_name)

            # The replset_state tags represents the state of the current node (i.e primary at this point).
            # The next section computes lag time for other nodes, thus `replset_state` is replaced with the
            # state of each node.
            lag_time_tags = [t for t in self.base_tags if not t.startswith('replset_state:')]
            # Compute a lag time
            for member in members:
                if get_state_name(member.get('state')) not in ('SECONDARY', 'PRIMARY'):
                    # Can only compute a meaningful lag time from secondaries and primaries
                    continue
                if 'optimeDate' in primary and 'optimeDate' in member:
                    lag = primary['optimeDate'] - member['optimeDate']
                    tags = lag_time_tags + [
                        'member:{}'.format(member.get('name', 'unknown')),
                        'replset_state:{}'.format(get_state_name(member.get('state')).lower()),
                    ]
                    self.gauge('mongodb.replset.optime_lag', lag.total_seconds(), tags)

        # Remember each member's state so the next run can detect state changes.
        self.check.last_states_by_server = {member['_id']: member['state'] for member in members}
Example #2
0
    def _report_replica_set_states(self, members, replset_name):
        """
        Report all the members' state changes in the replica set.
        This method only runs on the primary.

        For every member whose state differs from the one stored in
        `self.check.last_states_by_server`, an event is submitted through `self.check.event`.
        Members without a stored state are skipped.

        :param members: iterable of member documents; each must provide `_id`, `state` and `name`.
        :param replset_name: replica set name, used in the event title, text and tags.
        """

        for member in members:
            # The id field cannot be changed for a given replica set member.
            member_id = member['_id']
            status_id = member['state']
            old_state = self.check.last_states_by_server.get(member_id)
            # Compare against None explicitly: state 0 (STARTUP) is a valid previous state and
            # must not be mistaken for a member that was never seen.
            if old_state is None:
                # First time the agent sees this replica set member.
                continue

            if old_state == status_id:
                continue
            previous_short_state_str = get_state_name(old_state)
            short_state_str = get_state_name(status_id)
            long_state_str = get_long_state_name(status_id)
            node_hostname = member['name']

            msg_title = "{} is {} for {}".format(node_hostname,
                                                 short_state_str, replset_name)
            msg = (
                "MongoDB {node} (_id: {id}, {uri}) just reported as {status} ({status_short}) "
                "for {replset_name}; it was {old_state} before.".format(
                    node=node_hostname,
                    id=member_id,
                    uri=self.check._config.clean_server_name,
                    status=long_state_str,
                    status_short=short_state_str,
                    replset_name=replset_name,
                    old_state=previous_short_state_str,
                ))

            event_payload = {
                'timestamp':
                int(time.time()),
                'source_type_name':
                SOURCE_TYPE_NAME,
                'msg_title':
                msg_title,
                'msg_text':
                msg,
                'host':
                node_hostname,
                'tags': [
                    'action:mongo_replset_member_status_change',
                    'member_status:' + short_state_str,
                    'previous_member_status:' + previous_short_state_str,
                    'replset:' + replset_name,
                ],
            }
            if node_hostname == 'localhost':
                # Do not submit events with a 'localhost' hostname.
                event_payload['host'] = self.hostname
            self.check.event(event_payload)
def test_state_translation(check, instance):
    """
    Verify that replset member state IDs resolve to their expected short names.
    """
    # The last entry uses an ID outside the known states, which must resolve to 'UNKNOWN'.
    expected_names = (
        (5, 'STARTUP2'),
        (1, 'PRIMARY'),
        (500, 'UNKNOWN'),
    )
    for state_id, expected in expected_names:
        assert expected == get_state_name(state_id)
    def collect(self, client):
        """
        Collect replica set metrics for the node the agent is connected to.

        Runs `replSetGetStatus` on the `admin` database, derives the replication lag, health,
        votes and state of the current node and submits them through `self._submit_payload`.
        When the current node is the primary, member state changes are reported and one
        `mongodb.replset.optime_lag` gauge is emitted for each member exposing an `optimeDate`.
        Finally, every member's state is stored in `self.check.last_states_by_server`.

        :param client: mapping-like client; `client["admin"]` must expose a `command(name)` method.
        """
        db = client["admin"]
        status = db.command('replSetGetStatus')
        members = status.get('members', [])
        result = {}

        # Find nodes: current node (ourself) and the primary
        current = primary = None
        is_primary = False
        for member in members:
            state = member.get('state')
            # A member document without a state cannot be the primary; do not fail on int(None).
            member_is_primary = state is not None and int(state) == 1
            if member.get('self'):
                current = member
                if member_is_primary:
                    is_primary = True
            if member_is_primary:
                primary = member

        # Compute a lag time
        if current is not None and primary is not None:
            if 'optimeDate' in primary and 'optimeDate' in current:
                lag = primary['optimeDate'] - current['optimeDate']
                result['replicationLag'] = lag.total_seconds()

        if current is not None:
            result['health'] = current['health']

            # Collect the number of votes. This is only possible once the current node has been
            # identified, since its `_id` is needed to find it in the replica set configuration.
            config = db.command('replSetGetConfig')
            votes = 0
            total = 0.0
            for member in config['config']['members']:
                member_votes = member.get('votes', 1)
                total += member_votes
                if member['_id'] == current['_id']:
                    votes = member_votes
            result['votes'] = votes
            # Without any vote in the configuration the fraction is undefined; skip it rather
            # than dividing by zero.
            if total:
                result['voteFraction'] = votes / total

        result['state'] = status['myState']
        self._submit_payload({'replSet': result})

        if is_primary:
            # Submit events
            replset_name = status['set']
            self._report_replica_set_states(members, replset_name)

            # Compute a lag time
            for member in members:
                if 'optimeDate' in primary and 'optimeDate' in member:
                    lag = primary['optimeDate'] - member['optimeDate']
                    tags = self.check.base_tags + [
                        'member:{}'.format(member.get('name', 'unknown')),
                        'replset_name:{}'.format(replset_name),
                        'replset_state:{}'.format(
                            get_state_name(member.get('state')).lower()),
                    ]
                    self.gauge('mongodb.replset.optime_lag',
                               lag.total_seconds(), tags)

        # Remember each member's state so the next run can detect state changes.
        self.check.last_states_by_server = {
            member['_id']: member['state']
            for member in members
        }
    def _report_replica_set_state(self, state, replset_name):
        """
        Create an event when this member's replica set state has changed.

        Nothing is submitted when `self._last_state` is unset (None) or equal to `state`.

        :param state: current replica set state ID of the member.
        :param replset_name: replica set name, used in the event title, text and tags.
        """
        previous_state = self._last_state
        # No event without a previous state to compare to, or when the state is unchanged.
        if state == previous_state or previous_state is None:
            return

        # Long description of the state, with a fallback text for IDs missing from the table.
        if state in REPLSET_MEMBER_STATES:
            long_description = REPLSET_MEMBER_STATES[state][1]
        else:
            long_description = 'Replset state %d is unknown to the Datadog agent' % state

        current_name = get_state_name(state)
        previous_name = get_state_name(previous_state)

        title = "%s is %s for %s" % (self.hostname, current_name, replset_name)
        text_template = "MongoDB %s (%s) just reported as %s (%s) for %s; it was %s before."
        text = text_template % (
            self.hostname,
            self.check.clean_server_name,
            long_description,
            current_name,
            replset_name,
            previous_name,
        )

        event = {
            'timestamp': int(time.time()),
            'source_type_name': SOURCE_TYPE_NAME,
            'msg_title': title,
            'msg_text': text,
            'host': self.hostname,
            'tags': [
                'action:mongo_replset_member_status_change',
                'member_status:' + current_name,
                'previous_member_status:' + previous_name,
                'replset:' + replset_name,
            ],
        }
        self.check.event(event)