Example #1
def get_data(key):  # Used to get keys from redis
    try:
        value = redis_connect().get(key)
    except redis.exceptions.ConnectionError:
        log_messages('Could not get ' + key + ' from Redis', 'error')
        return (None)
    return (value)
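These get/set/delete helpers all call a redis_connect() factory that is not shown in the examples; a minimal sketch of what it might look like (the host, port and db values are illustrative assumptions):

import redis

def redis_connect():  # Hypothetical sketch: return a client for a local Redis instance
    return redis.StrictRedis(host='localhost', port=6379, db=0)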
Example #2
def get_data(key): # Used to get keys from redis
    try:
        value = redis_connect().get(key)
    except redis.exceptions.ConnectionError:
        log_messages('Could not get '+key+' from Redis', 'error')
        return(None)
    return(value)
Example #3
def get_all_data(key):  # Used to get all keys with a pattern, e.g. pingdom_*
    try:
        value = redis_connect().keys(key)
    except redis.exceptions.ConnectionError:
        log_messages('Could not get ' + key + ' from Redis', 'error')
        return (None)
    return (value)
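A small usage sketch for the helper above, iterating every key stored for one module (the 'pingdom_*' pattern mirrors the comment, the loop body is illustrative):

pingdom_keys = get_all_data('pingdom_*') or []  # get_all_data() returns None on a connection error, so fall back to an empty list
for key in pingdom_keys:
    value = get_data(key)  # Fetch the stored result for each matching key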
Example #4
def get_all_data(key): # Used to get all keys with a pattern, e.g. pingdom_*
    try:
        value = redis_connect().keys(key)
    except redis.exceptions.ConnectionError:
        log_messages('Could not get '+key+' from Redis', 'error')
        return(None)
    return(value)
Example #5
def store_calendar_items():
    with open(calendar_export) as c_file:
        try:
            c_data = json.load(c_file)
        except ValueError:
            c_data = False
    if c_data != False:
        prune_calendar_items()
        for item in c_data['items']:
            if 'dateTime' in item['start']: # Check if the datetime is set
                item['start']['date'] = item['start']['dateTime'].split('T')[0] # Split the datetime to get the date and set the date parameter
                current_summary = item['summary']
                try:
                    start_time = datetime.datetime.strptime(item['start']['dateTime'].split('T')[1], '%H:%M:%SZ').strftime('%H:%M') # Convert the start time to a nice date
                    end_time = datetime.datetime.strptime(item['end']['dateTime'].split('T')[1], '%H:%M:%SZ').strftime('%H:%M: ') # Convert the end time to a nice date
                except ValueError:
                    start_time = datetime.datetime.strptime(item['start']['dateTime'].split('T')[1], '%H:%M:%S+01:00').strftime('%H:%M') # To work with DST times
                    end_time = datetime.datetime.strptime(item['end']['dateTime'].split('T')[1], '%H:%M:%S+01:00').strftime('%H:%M: ')
                item['summary'] = start_time+' - '+end_time+current_summary # Add the start and end time to the summary
            current = get_data('calendar_'+item['start']['date']) # Check if a key already exists for the date in question
            if current == None:
                set_data('calendar_'+item['start']['date'], item['summary']) # If a date doesn't exist create one
            elif item['summary'] not in current: # If a key exists but it's not the current summary it means we have two items for one date
                set_data('calendar_'+item['start']['date'], current+calendar_split+item['summary']) # Append to the existing item
    else:
        log_messages('Could not parse calendar', 'error')
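The try/except around strptime handles both UTC ('Z') and BST ('+01:00') timestamps; a standalone sketch of that parsing step (the helper name and sample timestamps are made up):

import datetime

def calendar_time_prefix(start_datetime, end_datetime):  # Hypothetical helper mirroring the parsing above
    for time_format in ('%H:%M:%SZ', '%H:%M:%S+01:00'):  # Try the UTC format first, then the DST offset format
        try:
            start_time = datetime.datetime.strptime(start_datetime.split('T')[1], time_format).strftime('%H:%M')
            end_time = datetime.datetime.strptime(end_datetime.split('T')[1], time_format).strftime('%H:%M: ')
            return start_time + ' - ' + end_time
        except ValueError:
            continue
    return ''

calendar_time_prefix('2018-06-01T09:00:00Z', '2018-06-01T10:00:00Z')  # '09:00 - 10:00: '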
Example #6
def store_newrelic_results():
    failed_newrelic = 0
    total_accounts = 0
    newrelic_data = get_newrelic_data() # Get all the newrelic checks
    for account in newrelic_data:
        if newrelic_data[account] != None: # Check we actually got some data for a check
            set_data('newrelic_'+account, newrelic_data[account]) # Store it in redis
        else:
            failed_newrelic +=1
            log_messages('Could not get newrelic data for '+account, 'error')
        total_accounts+=1
    set_data('total_newrelic_accounts', total_accounts)
    set_data('failed_newrelic', failed_newrelic)
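The total/failed counters stored above make it possible to derive a success percentage later; a minimal sketch of that arithmetic (the key names follow the set_data() calls above, the guard against dividing by zero is an assumption):

total = int(get_data('total_newrelic_accounts') or 0)
failed = int(get_data('failed_newrelic') or 0)
working_percentage = 100 - (failed / float(total)) * 100 if total else 0  # e.g. 1 failed out of 4 accounts gives 75%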
Example #7
def store_pingdom_results():
    failed_pingdom = 0
    total_accounts = 0
    pingdom_data = get_pingdom_data() # Get all the pingdom data
    for account in pingdom_data:
        if pingdom_data[account] != None: # If we don't get None back, store it; otherwise log the error
            set_data('pingdom_'+account, pingdom_data[account])
        else:
            failed_pingdom +=1
            log_messages('Could not get pingdom data for '+account, 'error')
        total_accounts +=1
    set_data('total_pingdom_accounts', total_accounts)
    set_data('failed_pingdom', failed_pingdom)
Example #8
def store_pingdom_results():
    failed_pingdom = 0
    total_accounts = 0
    pingdom_data = get_pingdom_data() # Get all the pingdom data
    for account in pingdom_data:
        if pingdom_data[account] != None: # If we don't get None back, store it; otherwise log the error
            set_data('pingdom_'+account, pingdom_data[account])
        else:
            failed_pingdom +=1
            log_messages('Could not get pingdom data for '+account, 'error')
        total_accounts +=1
    set_data('total_pingdom_accounts', total_accounts)
    set_data('failed_pingdom', failed_pingdom)
Example #9
def store_calendar_items():
    with open(calendar_export) as c_file:
        try:
            c_data = json.load(c_file)
        except ValueError:
            c_data = False
    if c_data != False:
        prune_calendar_items()
        for item in c_data['items']:
            if 'dateTime' in item['start']:  # Check if the datetime is set
                item['start']['date'] = item['start']['dateTime'].split(
                    'T'
                )[0]  # Split the datetime to get the date and set the date parameter
                current_summary = item['summary']
                try:
                    start_time = datetime.datetime.strptime(
                        item['start']['dateTime'].split('T')[1],
                        '%H:%M:%SZ').strftime(
                            '%H:%M')  # Convert the start time to a nice date
                    end_time = datetime.datetime.strptime(
                        item['end']['dateTime'].split('T')[1],
                        '%H:%M:%SZ').strftime(
                            '%H:%M: ')  # Convert the end time to a nice date
                except ValueError:
                    start_time = datetime.datetime.strptime(
                        item['start']['dateTime'].split('T')[1],
                        '%H:%M:%S+01:00').strftime(
                            '%H:%M')  # To work with DST times
                    end_time = datetime.datetime.strptime(
                        item['end']['dateTime'].split('T')[1],
                        '%H:%M:%S+01:00').strftime('%H:%M: ')
                item[
                    'summary'] = start_time + ' - ' + end_time + current_summary  # Add the start and end time to the summary
            current = get_data(
                'calendar_' + item['start']['date']
            )  # Check if a key already exists for the date in question
            if current == None:
                set_data('calendar_' + item['start']['date'],
                         item['summary'])  # If a date doesn't exist create one
            elif item[
                    'summary'] not in current:  # If a key exists but it's not the current summary it means we have two items for one date
                set_data('calendar_' + item['start']['date'],
                         current + calendar_split +
                         item['summary'])  # Append to the existing item
    else:
        log_messages('Could not parse calendar', 'error')
Example #10
def get_newrelic_servers_data():
    """
    Collects data for all newrelic servers accounts provided in the config file
    and stores it in redis as json with a key per server with value:
    '[{"orderby": 0, "health_status": "green", "name": "wibble", "summary": {"cpu": 0, "fullest_disk": 0, "disk_io": 0, "memory": 0}}]'
    """
    newrelic_servers_data = {}
    newrelic_servers_data_validity = {}
    newrelic_servers_data_validity['failed_accounts'] = 0
    newrelic_servers_data_validity['total_accounts'] = 0
    newrelic_servers_data_validity['total_checks'] = 0
    newrelic_servers_data_validity['successful_checks'] = 0
    for account in newrelic_servers_keys:
        newrelic_servers_data_validity['total_accounts'] += 1
        try:
            nr_servers_response = requests.get(
                newrelic_servers_endpoint,
                headers={'X-Api-Key': newrelic_servers_keys[account]},
                timeout=newrelic_servers_timeout)
            nr_servers_response.raise_for_status()
        except requests.exceptions.RequestException as e:
            newrelic_servers_data_validity['failed_accounts'] += 1
            log_messages(
                'Could not get NewRelic Servers data for {} - error getting account data from api: Error: {}'
                .format(account, e), 'error')
            continue

        for server in json.loads(nr_servers_response.text)['servers']:
            newrelic_servers_data_validity['total_checks'] += 1
            nr_servers_host = {}
            # servers returns name and host, if no display name is set it returns
            # the host as both name and host
            # I will crop the name in the jinja filter
            nr_servers_host['name'] = server['name']
            # servers which are not reporting have no health_status and no
            # summary of metric data, hence we set them blue with orderby = 0
            nr_servers_host['orderby'] = 0
            nr_servers_host['health_status'] = 'blue'
            if server['reporting'] == True:
                nr_servers_host['health_status'] = server['health_status']
                if nr_servers_host['health_status'] == 'unknown':
                    nr_servers_host['health_status'] = 'green'
                    nr_servers_host['name'] = '* {}'.format(
                        nr_servers_host['name'])

                nr_servers_host['summary'] = {
                    'memory': server['summary']['memory'],
                    'disk_io': server['summary']['disk_io'],
                    'fullest_disk': server['summary']['fullest_disk'],
                    'cpu': server['summary']['cpu']
                }
                nr_servers_host['orderby'] = max(
                    nr_servers_host['summary']['cpu'],
                    nr_servers_host['summary']['memory'],
                    nr_servers_host['summary']['fullest_disk'],
                    nr_servers_host['summary']['disk_io'])

            newrelic_servers_data_validity['successful_checks'] += 1
            newrelic_servers_data[server['name']] = nr_servers_host

    # Data will be valid for 5 minutes after the module runs
    newrelic_servers_data_validity['valid_until'] = time.time() * 1000 + 300000
    return newrelic_servers_data, newrelic_servers_data_validity
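get_newrelic_servers_data() only builds the dictionaries; the caller presumably persists them via set_data() in the single-element-list json format the docstring shows. A hedged sketch of that storage step (the 'resources:newrelic_servers#...' and 'resources_success:...' key formats are taken from the comments in get_resource_results below, the uuid is illustrative):

import json
import uuid

newrelic_servers_data, newrelic_servers_data_validity = get_newrelic_servers_data()
for name in newrelic_servers_data:
    # Wrap each host dict in a one-element list so it survives the round trip through redis as json
    set_data('resources:newrelic_servers#{}'.format(uuid.uuid4()), json.dumps([newrelic_servers_data[name]]))
set_data('resources_success:newrelic_servers', json.dumps([newrelic_servers_data_validity]))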
Example #11
def hourly_tasks():
    store_calendar_items()

def daily_tasks():
    return(False)

def weekly_tasks():
    return(False)

if __name__ == '__main__':
    if getpass.getuser() != warboard_user:
        print('Please run the warboard with the correct user: ' + warboard_user)
    elif len(sys.argv) == 2:
        if 'hourly' == sys.argv[1]:
            hourly_tasks()
            log_messages('Hourly tasks executed', 'info')
        elif 'daily' == sys.argv[1]:
            daily_tasks()
            log_messages('Daily tasks executed', 'info')
        elif 'weekly' == sys.argv[1]:
            weekly_tasks()
            log_messages('Weekly tasks executed', 'info')
        else:
            print('Invalid option!')
            exit(2)
    else:
        print('Valid tasks: hourly|daily|weekly|manual')
        exit(2)
Example #12
def get_resource_results():
    """
    Merges lists returned by resource modules into one list in the correct
    format for warboard.html to display monitored resources

    {% for check in resource_results['checks']|sort(attribute='orderby')|reverse %}

    <tr class="danger lead"><td>{{ check['name'] }}</td><td>{{ check['summary']['cpu'] }}%</td><td>{{ check['summary']['memory'] }}%</td><td>{{ check['summary']['disk_io'] }}%</td><td>{{ check['summary']['fullest_disk'] }}%</td></tr>

    """
    resource_results = {}
    resource_results['checks'] = []
    resource_results['green'] = 0
    resource_results['red'] = 0
    resource_results['orange'] = 0
    resource_results['blue'] = 0
    resource_results['failed_accounts'] = 0
    resource_results['total_accounts'] = 0
    resource_results['total_checks'] = 0
    successful_checks = 0

    # Defaults for when no data is reported, working towards having modules be
    # modular / optional
    resource_results['blue_percent'] = 100
    resource_results['red_percent'] = 0
    resource_results['orange_percent'] = 0
    resource_results['green_percent'] = 0
    resource_results['working_percentage'] = 100

    # Check if the data received from each module is still valid, if it is not
    # then all checks from that module are counted as unsuccessful and all
    # accounts are counted as failed
    milliseconds_since_epoch = time.time() * 1000
    for module in get_all_data('resources_success:*'):
        module_success_json = get_data(module)
        module_success = json.loads(module_success_json)[0]
        resource_results['total_accounts'] += module_success['total_accounts']
        resource_results['total_checks'] += module_success['total_checks']
        milliseconds_since_epoch_module_data_is_valid_until = module_success[
            'valid_until']
        if milliseconds_since_epoch > milliseconds_since_epoch_module_data_is_valid_until:
            resource_results['failed_accounts'] += module_success[
                'total_accounts']
        else:
            resource_results['failed_accounts'] += module_success[
                'failed_accounts']
            successful_checks += module_success['successful_checks']

    resource_results[
        'failed_checks'] = resource_results['total_checks'] - successful_checks

    checks_found = 0
    # Get list of keys in the format resources:module#uuid
    for host in get_all_data('resources:*'):
        try:
            # Storing lists with only one value since when I convert dictionaries
            # to json and store them in redis they come back as strings, I am
            # working around this by storing lists, ast.literal_eval also works
            host_data = json.loads(get_data(host))[0]
            resource_results['checks'].append(host_data)
            checks_found += 1
            # get the health status colour of the current check, and then add
            # one to the number of checks with that health status
            resource_results[host_data['health_status']] += 1
        except Exception as e:
            resource_results['failed_checks'] += 1
            # I would rather log to uwsgi's log but I'll sort this out later
            log_messages(
                'Data for {} is not in a valid format: {}'.format(host, e),
                'error')

    # If we are getting back old checks that are no longer reporting, hence
    # are not in the total_checks variable, then they have failed.
    # If we are getting back fewer checks than we stored then something has
    # gone really wrong or we caught the weekly cron that clears the keys.
    resource_results['failed_checks'] += abs(resource_results['total_checks'] -
                                             checks_found)
    resource_results['total_checks'] = checks_found

    total_results = resource_results['green'] + resource_results[
        'red'] + resource_results['orange'] + resource_results['blue']
    if total_results != 0:
        resource_results['red_percent'] = (resource_results['red'] /
                                           total_results) * 100
        resource_results['orange_percent'] = (resource_results['orange'] /
                                              total_results) * 100
        resource_results['blue_percent'] = (resource_results['blue'] /
                                            total_results) * 100
        # I want the percentage to always be 100 and green seems the most
        # disposable / least affected by any rounding issues
        resource_results['green_percent'] = 100 - (
            resource_results['red_percent'] +
            resource_results['orange_percent'] +
            resource_results['blue_percent'])

    # Set the working percentage to the lowest of accounts and checks, if either
    # have a total of 0 then resources isn't working so the working percentage
    # can be set to 0 to avoid dividing by 0
    if resource_results['total_accounts'] != 0 and resource_results[
            'total_checks'] != 0:
        accounts_working_percentage = 100 - (
            (resource_results['failed_accounts'] /
             resource_results['total_accounts']) * 100)
        if accounts_working_percentage < resource_results['working_percentage']:
            resource_results[
                'working_percentage'] = accounts_working_percentage
        checks_working_percentage = 100 - (
            (resource_results['failed_checks'] /
             resource_results['total_checks']) * 100)
        if checks_working_percentage < resource_results['working_percentage']:
            resource_results['working_percentage'] = checks_working_percentage

    else:
        resource_results['working_percentage'] = 0

    resource_results['working_accounts'] = resource_results[
        'total_accounts'] - resource_results['failed_accounts']
    return resource_results
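The comment about storing one-element lists refers to dictionaries coming back from redis as json strings; a small sketch of that round trip in isolation (the key name and check dict are made up):

import json

check = {'name': 'wibble', 'health_status': 'green', 'orderby': 0}
set_data('resources:example#1234', json.dumps([check]))       # Store the dict wrapped in a list as json
restored = json.loads(get_data('resources:example#1234'))[0]  # json.loads returns the list, [0] is the dict again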
Example #13
def daily_tasks():
    return (False)


def weekly_tasks():
    clear_resources_keys()


if __name__ == '__main__':
    if getpass.getuser() != warboard_user:
        print('Please run the warboard with the correct user: ' + warboard_user)
    elif len(sys.argv) == 2:
        if 'hourly' == sys.argv[1]:
            hourly_tasks()
            log_messages('Hourly tasks executed', 'info')
        elif 'daily' == sys.argv[1]:
            daily_tasks()
            log_messages('Daily tasks executed', 'info')
        elif 'weekly' == sys.argv[1]:
            weekly_tasks()
            log_messages('Weekly tasks executed', 'info')
        else:
            print('Invalid option!')
            exit(2)
    else:
        print('Valid tasks: hourly|daily|weekly|manual')
        exit(2)
Example #14
def delete_data(key):  # Used to delete keys
    try:
        redis_connect().delete(key)
    except redis.exceptions.ConnectionError:
        log_messages('Could not delete ' + key + ' in Redis', 'error')
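store_calendar_items() also calls a prune_calendar_items() helper that is not included in these examples; a plausible sketch built from get_all_data() and delete_data() (the implementation is an assumption):

def prune_calendar_items():  # Hypothetical: drop previously stored calendar keys before re-importing
    for key in get_all_data('calendar_*') or []:  # None is returned on a connection error, so fall back to an empty list
        delete_data(key)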
Example #15
def set_data(key, value): # Used to set keys in redis
    try:
        redis_connect().set(key, value)
    except redis.exceptions.ConnectionError:
        log_messages('Could not set '+key+' in Redis', 'error')
Example #16
def set_data(key, value):  # Used to set keys in redis
    try:
        redis_connect().set(key, value)
    except redis.exceptions.ConnectionError:
        log_messages('Could not set ' + key + ' in Redis', 'error')
Example #17
def get_newrelic_infra_data():
    """
    Collects data for all newrelic infrastructure accounts provided in the
    config file and stores it in redis as json with a key per server with value:
    '[{"orderby": 0, "health_status": "green", "name": "wibble", "summary": {"cpu": 0, "fullest_disk": 0, "disk_io": 0, "memory": 0}}]'
    """
    newrelic_infra_data = {}
    newrelic_infra_data_validity = {}
    newrelic_infra_data_validity['failed_accounts'] = 0
    newrelic_infra_data_validity['total_accounts'] = 0
    newrelic_infra_data_validity['total_checks'] = 0
    newrelic_infra_data_validity['successful_checks'] = 0
    for account in newrelic_main_and_insights_keys:
        newrelic_infra_data_validity['total_accounts'] += 1
        number_of_hosts_url = '{}{}/query?nrql=SELECT%20uniqueCount(fullHostname)%20FROM%20SystemSample'.format(
            newrelic_insights_endpoint,
            newrelic_main_and_insights_keys[account]['account_number'])
        try:
            number_of_hosts_response = requests.get(
                number_of_hosts_url,
                headers={
                    'X-Query-Key':
                    newrelic_main_and_insights_keys[account]
                    ['insights_api_key']
                },
                timeout=newrelic_insights_timeout)
            number_of_hosts_response.raise_for_status()
        except requests.exceptions.RequestException as e:
            newrelic_infra_data_validity['failed_accounts'] += 1
            log_messages(
                'Could not get NewRelic Infrastructure data for {} - error getting number of hosts from insights api: Error: {}'
                .format(account, e), 'error')
            continue

        # It may be possible for 3 servers to be found, one of which has not
        # reported for a long time, so that when limiting by number of results
        # two responses are received for one server, one for another and none
        # for the third; the code doesn't currently check for this and I expect
        # it would pass both results through and cause duplicate rows on the
        # warboard
        number_of_hosts_data = json.loads(number_of_hosts_response.text)
        number_of_hosts = number_of_hosts_data['results'][0]['uniqueCount']
        metric_data_url = '{}{}/query?nrql=SELECT%20displayName%2C%20fullHostname%2C%20cpuPercent%2C%20memoryUsedBytes%2C%20memoryTotalBytes%2C%20diskUtilizationPercent%2C%20diskUsedPercent%2C%20timestamp%20FROM%20SystemSample%20LIMIT%20{}'.format(
            newrelic_insights_endpoint,
            newrelic_main_and_insights_keys[account]['account_number'],
            number_of_hosts)
        try:
            metric_data_response = requests.get(
                metric_data_url,
                headers={
                    'X-Query-Key':
                    newrelic_main_and_insights_keys[account]
                    ['insights_api_key']
                },
                timeout=newrelic_insights_timeout)
            metric_data_response.raise_for_status()
        except requests.exceptions.RequestException as e:
            newrelic_infra_data_validity['failed_accounts'] += 1
            log_messages(
                'Could not get NewRelic Infrastructure data for {}: - error getting metric data from insights api: Error: {}'
                .format(account, e), 'error')
            continue

        account_infra_data = json.loads(metric_data_response.text)
        try:
            violation_data_response = requests.get(
                newrelic_main_api_violation_endpoint,
                headers={
                    'X-Api-Key':
                    newrelic_main_and_insights_keys[account]['main_api_key']
                },
                timeout=newrelic_main_api_timeout)
            violation_data_response.raise_for_status()
        except requests.exceptions.RequestException as e:
            newrelic_infra_data_validity['failed_accounts'] += 1
            log_messages(
                'Could not get NewRelic Alerts violation data for {}: - error getting open violation data from main api: Error: {}'
                .format(account, e), 'error')
            continue

        violation_data = json.loads(violation_data_response.text)['violations']
        for num, host_data in enumerate(
                account_infra_data['results'][0]['events']):
            newrelic_infra_data_validity['total_checks'] += 1
            infrastructure_host = {}
            # name is the display name, if it is not set it is the hostname
            # I will crop the name in the jinja filter
            infrastructure_host['name'] = account_infra_data['results'][0][
                'events'][num]['fullHostname']
            if account_infra_data['results'][0]['events'][num]['displayName']:
                infrastructure_host['name'] = account_infra_data['results'][0][
                    'events'][num]['displayName']

            # Data older than 5 minutes will be flagged as blue
            timestamp = account_infra_data['results'][0]['events'][num][
                'timestamp']
            time_accepted_since = (time.time() -
                                   newrelic_infrastructure_max_data_age) * 1000
            infrastructure_host['orderby'] = 0
            infrastructure_host['health_status'] = 'blue'
            if timestamp > time_accepted_since:
                memory_percentage = None
                if account_infra_data['results'][0]['events'][num][
                        'memoryUsedBytes'] != None and account_infra_data[
                            'results'][0]['events'][num][
                                'memoryTotalBytes'] != None:
                    memory_percentage = (
                        account_infra_data['results'][0]['events'][num]
                        ['memoryUsedBytes'] / account_infra_data['results'][0]
                        ['events'][num]['memoryTotalBytes']) * 100

                infrastructure_host['summary'] = {
                    'memory':
                    memory_percentage,
                    'disk_io':
                    account_infra_data['results'][0]['events'][num]
                    ['diskUtilizationPercent'],
                    'fullest_disk':
                    account_infra_data['results'][0]['events'][num]
                    ['diskUsedPercent'],
                    'cpu':
                    account_infra_data['results'][0]['events'][num]
                    ['cpuPercent']
                }

                # Setting the orderby using the same field as newrelic servers
                infrastructure_host['orderby'] = max(
                    infrastructure_host['summary']['cpu'],
                    infrastructure_host['summary']['memory'],
                    infrastructure_host['summary']['fullest_disk'],
                    infrastructure_host['summary']['disk_io'])

                # Using violation data to determine the health status of servers
                violation_level = 0
                # violation level 0 is green and no violation
                # violation level 1 is orange and Warning
                # violation level 2 is red and Critical
                # I'm giving it a number to make comparisons easier
                for violation in violation_data:
                    if violation['entity']['product'] != 'Infrastructure':
                        continue

                    # We have the option to just flag all servers in the account
                    # orange or red based on Warning or Critical here
                    # This would be a consistently wrong behavior (the best kind of
                    # wrong)
                    # The issue is that in my testing servers are using names of
                    # '<fullhostname> (/)'; why they don't just use <fullhostname>
                    # is beyond me, and I am unsure whether this tracks display names

                    # The best I can do is check whether the server / host we are
                    # currently checking was the cause of the violation we are
                    # currently looping through
                    if infrastructure_host['name'] in violation['entity'][
                            'name']:
                        if violation['priority'] == 'Warning':
                            if violation_level < 1:
                                violation_level = 1
                        elif violation['priority'] == 'Critical':
                            if violation_level < 2:
                                violation_level = 2
                        else:
                            log_messages(
                                'Warning: unrecognised violation {} expected Warning or Critical'
                                .format(violation['priority']), 'error')

                if violation_level == 0:
                    infrastructure_host['health_status'] = 'green'
                elif violation_level == 1:
                    infrastructure_host['health_status'] = 'orange'
                elif violation_level == 2:
                    infrastructure_host['health_status'] = 'red'

            newrelic_infra_data_validity['successful_checks'] += 1
            newrelic_infra_data[
                infrastructure_host['name']] = infrastructure_host

    # Data will be valid for 5 minutes after the module runs
    newrelic_infra_data_validity['valid_until'] = time.time() * 1000 + 300000
    return newrelic_infra_data, newrelic_infra_data_validity
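The code derives a memory percentage from memoryUsedBytes and memoryTotalBytes, guarding against missing values; the same calculation in isolation (the helper name and sample values are made up):

def memory_percent(memory_used_bytes, memory_total_bytes):  # Hypothetical helper mirroring the None guard above
    if memory_used_bytes is None or memory_total_bytes is None:
        return None  # Hosts reporting no memory data keep a None summary value
    return (float(memory_used_bytes) / memory_total_bytes) * 100

memory_percent(2147483648, 8589934592)  # 25.0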
Example #18
def get_tick_data():
    """
    Collects data for all influx users provided in the config file and returns
    it as a tuple: a dictionary of all servers (key: server name, value: server
    data) and a dictionary of metadata about the checks returned
    """
    tick_data = {}
    tick_data_validity = {}
    tick_data_validity['failed_accounts'] = 0
    tick_data_validity['total_accounts'] = 0
    tick_data_validity['total_checks'] = 0
    tick_data_validity['successful_checks'] = 0
    for influx_user in influx_read_users:
        tick_data_validity['total_accounts'] += 1
        influx_query_api = '{}/query'.format(influx_user['influx_url'])
        try:
            list_of_databases_response = requests.get(
                influx_query_api,
                params={
                    'u': influx_user['influx_user'],
                    'p': influx_user['influx_pass'],
                    'q': 'SHOW DATABASES',
                    'epoch': 'ms'
                },
                timeout=influx_timeout)
            list_of_databases_response.raise_for_status()
        except requests.exceptions.RequestException as e:
            tick_data_validity['failed_accounts'] += 1
            log_messages(
                'Could not get TICK data for {} - error listing databases from Influx: Error: {}'
                .format(influx_user['influx_user'], e), 'error')
            continue

        try:
            list_of_databases = json.loads(
                list_of_databases_response.text
            )['results'][0]['series'][0]['values']
        except Exception as e:
            tick_data_validity['failed_accounts'] += 1
            log_messages(
                'Could not parse TICK data for {}: Error: {}'.format(
                    influx_user['influx_user'], e), 'error')
            continue

        queries = {}
        queries[
            'cpu_query'] = 'SELECT 100 - LAST("usage_idle") AS "cpu" FROM "{}"."autogen"."cpu" WHERE time > now() - 1h GROUP BY "host";'
        queries[
            'memory_query'] = 'SELECT LAST("used_percent") AS "memory" FROM "{}"."autogen"."mem" WHERE time > now() - 1h GROUP BY "host";'
        queries[
            'fullest_disk_query'] = 'SELECT MAX("last_used_percent") AS "fullest_disk" FROM (SELECT last("used_percent") AS "last_used_percent" FROM "{}"."autogen"."disk" WHERE time > now() - 1h GROUP BY "path") GROUP BY "host";'
        # I'm not sure the io time is the best way to calculate disk io
        queries[
            'disk_io_query'] = 'SELECT MAX(latest_delta_io) AS "disk_io" FROM (SELECT LAST("delta_io") AS "latest_delta_io" FROM (SELECT derivative(last("io_time"),100ms) AS "delta_io" FROM "{}"."autogen"."diskio" WHERE time > now() - 1h GROUP BY time(1m)) GROUP BY "name") GROUP BY "host"'
        """
        > SELECT MAX("last_combined_io_time") AS "io_time" FROM (SELECT LAST("combined_io_time") AS "last_combined_io_time" FROM (SELECT DERIVATIVE(LAST("read_time"), 1ms)+DERIVATIVE(LAST("write_time"), 1ms) AS "combined_io_time" FROM "diskio" WHERE time > now() - 1h GROUP BY time(1m)) GROUP BY "name") GROUP BY "host"
        """
        # We don't have a tag key for memory, at the moment it is the only thing without a tag so it will be separate
        # We actually want to pull this query from all of time since it gives the most recent alert status, however the db isn't going to appreciate that, so I'll grab the last 28 days for now
        queries[
            'crit_alert_query'] = 'SELECT LAST("crit_duration") AS "crit_duration_before_alerting", LAST("warn_duration") AS "warn_duration_before_alerting" FROM "{}"."autogen"."kapacitor_alerts" WHERE time > now() - 28d GROUP BY "host","cpu","total","device"'
        list_of_queries = []

        # The next two for loops are a little funky, we want to make as few
        # requests to influx as possible whilst keeping the number low enough
        # that we don't go over any timeouts or max request sizes
        for database_as_list in list_of_databases:
            # database is the list ["$database_name"], I can't see how the list
            # will have multiple values and would probably rather break than
            # loop through all of the returned values
            database = database_as_list[0]
            for query in queries:
                list_of_queries.append(queries[query].format(database))

        # Collect in a list in case influx_database_batch_size is not a multiple
        # of the number of queries we are running per server
        batches_response_list = []
        time_accepted_since = (time.time() - influx_max_data_age) * 1000
        for beginning_of_slice in xrange(0, len(list_of_queries),
                                         influx_database_batch_size):
            batch_query = ';'.join(
                list_of_queries[beginning_of_slice:beginning_of_slice +
                                influx_database_batch_size])
            try:
                metric_data_batch_response = requests.get(
                    influx_query_api,
                    params={
                        'u': influx_user['influx_user'],
                        'p': influx_user['influx_pass'],
                        'q': batch_query,
                        'epoch': 'ms'
                    },
                    timeout=influx_timeout)
                metric_data_batch_response.raise_for_status()
            except requests.exceptions.RequestException as e:
                # This is now a failed batch rather than account, incrementing
                # failed accounts here could cause the output to be
                # misinterpreted.  Failed checks will still be picked up by
                # their absence from successful_checks.
                log_messages(
                    'Could not get TICK data for {} - error getting batch of data from Influx: Error: {}'
                    .format(influx_user['influx_user'], e), 'error')
                continue

            try:
                batches_response_list.append(
                    json.loads(metric_data_batch_response.text)['results'])
            except Exception as e:
                log_messages(
                    'Could not parse TICK data for {} - error parsing data received from Influx: Error: {}'
                    .format(influx_user['influx_user'], e), 'error')

        # Key = hostname, Value = data
        hosts_data = {}
        for batch in batches_response_list:
            for statement in batch:
                # If we don't get data back there will be no series
                if 'series' not in statement:
                    continue

                # Catch kapacitor alert data and set the health status accordingly
                if statement['series'][0]['name'] == "kapacitor_alerts":
                    alerts = {}
                    # We will create two lists per host to store the crit_duration and warn_duration values in;
                    # when an alert is a warning its warn_duration will be an integer and its crit_duration
                    # will be None, we will then take the max to check if something is alerting since
                    # crit_duration has value -1 when not alerting and x when alerting, where x is the kapacitor
                    # variable critTime / warnTime
                    for each_measurement_with_an_alerting_status in statement[
                            'series']:
                        hostname = each_measurement_with_an_alerting_status[
                            'tags']['host']
                        if hostname not in alerts:
                            alerts[hostname] = {}
                            alerts[hostname]['critical'] = [None]
                            alerts[hostname]['warning'] = [None]

                        for tag_or_field_position_in_list, tag_or_field in enumerate(
                                each_measurement_with_an_alerting_status[
                                    'columns']):
                            if tag_or_field == "crit_duration_before_alerting":
                                assert len(
                                    each_measurement_with_an_alerting_status[
                                        'values']) == 1
                                alerts[hostname]['critical'].append(
                                    each_measurement_with_an_alerting_status[
                                        'values'][0]
                                    [tag_or_field_position_in_list])
                            if tag_or_field == "warn_duration_before_alerting":
                                assert len(
                                    each_measurement_with_an_alerting_status[
                                        'values']) == 1
                                alerts[hostname]['warning'].append(
                                    each_measurement_with_an_alerting_status[
                                        'values'][0]
                                    [tag_or_field_position_in_list])

                    for hostname in alerts:
                        health_status = 'green'

                        if max(alerts[hostname]['warning']) > 0:
                            health_status = 'orange'

                        if max(alerts[hostname]['critical']) > 0:
                            health_status = 'red'

                        if hostname not in hosts_data:
                            tick_data_validity['total_checks'] += 1
                            hosts_data[hostname] = {}
                            hosts_data[hostname]['name'] = hostname

                        hosts_data[hostname]['health_status'] = health_status

                # for all other data - cpu memory disk diskio
                for host_data in statement['series']:
                    hostname = host_data['tags']['host']
                    if hostname not in hosts_data:
                        tick_data_validity['total_checks'] += 1
                        hosts_data[hostname] = {}
                        hosts_data[hostname]['name'] = hostname

                    # Check if we have old data
                    if host_data['values'][0][0] < time_accepted_since:
                        # No point storing old data since we don't store old
                        # data from the servers module
                        continue
                    if 'summary' not in hosts_data[hostname]:
                        hosts_data[hostname]['summary'] = {}

                    # cpu and fullest_disk will be the first non time column
                    hosts_data[hostname]['summary'][
                        host_data['columns'][1]] = host_data['values'][0][1]

        for host in hosts_data:
            tick_host_data = hosts_data[host]
            if 'health_status' not in tick_host_data:
                tick_host_data['health_status'] = 'green'

            try:
                tick_host_data['orderby'] = max(
                    tick_host_data['summary']['cpu'],
                    tick_host_data['summary']['memory'],
                    tick_host_data['summary']['fullest_disk'],
                    tick_host_data['summary']['disk_io'])
            except KeyError:
                tick_host_data['orderby'] = 0
                tick_host_data['health_status'] = 'blue'

            tick_data_validity['successful_checks'] += 1
            tick_data[tick_host_data['name']] = tick_host_data

    tick_data_validity['valid_until'] = time.time() * 1000 + 300000
    return tick_data, tick_data_validity
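The xrange slicing above batches queries so that only a handful of requests are made per influx user; the same chunking in isolation (the function name, query list and batch size are illustrative):

def batch_queries(list_of_queries, batch_size):  # Hypothetical: yield ';'-joined batches of at most batch_size queries
    for beginning_of_slice in range(0, len(list_of_queries), batch_size):
        yield ';'.join(list_of_queries[beginning_of_slice:beginning_of_slice + batch_size])

list(batch_queries(['q1', 'q2', 'q3', 'q4', 'q5'], 2))  # ['q1;q2', 'q3;q4', 'q5']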
Example #19
def delete_data(key): # Used to delete keys
    try:
        redis_connect().delete(key)
    except redis.exceptions.ConnectionError:
        log_messages('Could not delete '+key+' in Redis', 'error')