def get_filtered_list_of_containers(
    options=defaults.DEFAULT_CRAWL_OPTIONS,
    host_namespace=misc.get_host_ipaddr()
):
    """
    Returns a partition of all the Container objects currently running in the
    system and sets the `namespace` and metadata of these containers.

    The partitioning is given by `partition_strategy`.
    """

    environment = options.get('environment', defaults.DEFAULT_ENVIRONMENT)
    metadata = options.get('metadata', {})
    _map = metadata.get('container_long_id_to_namespace_map', {})
    container_opts = {'host_namespace': host_namespace,
                      'environment': environment,
                      'long_id_to_namespace_map': _map,
                      }

    user_list = options.get('docker_containers_list', 'ALL')
    partition_strategy = options.get('partition_strategy', None)

    assert(partition_strategy['name'] == 'equally_by_pid')
    process_id = partition_strategy['args']['process_id']
    num_processes = partition_strategy['args']['num_processes']

    filtered_list = []
    containers_list = list_all_containers(user_list, container_opts)
    for container in containers_list:

        """
        There are docker and non-docker containers in this list. An example of
        a non-docker container is a chromium-browser process.
        TODO(kollerr): the logic that defines whether a container is acceptable
        to a plugin or not should be in the plugin itself.
        """

        if (environment != defaults.DEFAULT_ENVIRONMENT and
            not container.is_docker_container()):
           continue

        """
        The partition strategy is to split all the containers equally by
        process pid. We do it by hashing the long_id of the container.
        """

        _hash = container.long_id
        num = int(_hash, 16) % int(num_processes)
        if num == process_id:
            filtered_list.append(container)


    return filtered_list
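

# Hedged illustration (not part of the source): the 'equally_by_pid' strategy
# above maps a container's hexadecimal long_id to one of num_processes
# buckets, so each crawler worker gets a disjoint, roughly equal share, and a
# given container always lands on the same worker. The id below is made up.
def _worker_for(long_id, num_processes):
    return int(long_id, 16) % int(num_processes)

# For example, _worker_for('4f2a' * 16, 4) == 2, and it stays 2 on every
# subsequent crawl of that container.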


def get_filtered_list_of_containers(options=defaults.DEFAULT_CRAWL_OPTIONS,
                                    host_namespace=misc.get_host_ipaddr()):
    """
    Returns a partition of all the Container objects currently running in the
    system and sets the `namespace` and metadata of these containers.

    The partitioning is given by `partition_strategy`.
    """

    environment = options.get('environment', defaults.DEFAULT_ENVIRONMENT)
    metadata = options.get('metadata', {})
    _map = metadata.get('container_long_id_to_namespace_map', {})
    container_opts = {
        'host_namespace': host_namespace,
        'environment': environment,
        'long_id_to_namespace_map': _map,
    }

    user_list = options.get('docker_containers_list', 'ALL')
    partition_strategy = options.get('partition_strategy', None)

    assert (partition_strategy['name'] == 'equally_by_pid')
    process_id = partition_strategy['args']['process_id']
    num_processes = partition_strategy['args']['num_processes']

    filtered_list = []
    containers_list = list_all_containers(user_list, container_opts)
    for container in containers_list:
        """
        There are docker and non-docker containers in this list. An example of
        a non-docker container is a chromium-browser process.
        TODO(kollerr): the logic that defines whether a container is acceptable
        to a plugin or not should be in the plugin itself.
        """

        if (environment != defaults.DEFAULT_ENVIRONMENT
                and not container.is_docker_container()):
            continue
        """
        The partition strategy is to split all the containers equally by
        process pid. We do it by hashing the long_id of the container.
        """

        _hash = container.long_id
        num = int(_hash, 16) % int(num_processes)
        if num == process_id:
            filtered_list.append(container)

    return filtered_list
Example 3
def get_filtered_list_of_containers(
        options=defaults.DEFAULT_CRAWL_OPTIONS,
        host_namespace=misc.get_host_ipaddr(),
):
    """
    Returns a partition of all the Container objects currently running in the
    system and sets the `namespace` and metadata of these containers.

    The partitioning is given by `partition_strategy`.
    """
    environment = options.get('environment', defaults.DEFAULT_ENVIRONMENT)
    metadata = options.get('metadata', {})
    _map = metadata.get('container_long_id_to_namespace_map', {})
    namespace_opts = {
        'host_namespace': host_namespace,
        'environment': environment,
        'long_id_to_namespace_map': _map
    }

    user_list = options.get('docker_containers_list', 'ALL')
    partition_strategy = options.get('partition_strategy', None)

    assert (partition_strategy['name'] == 'equally_by_pid')
    process_id = partition_strategy['args']['process_id']
    num_processes = partition_strategy['args']['num_processes']

    filtered_list = []
    containers_list = list_all_containers(user_list, namespace_opts)
    for container in containers_list:

        # The partition strategy is to split all the containers equally by
        # process pid. We do it by hashing the long_id of the container.

        _hash = container.long_id
        num = int(_hash, 16) % int(num_processes)
        if num == process_id:

            try:
                container.setup_namespace_and_metadata(namespace_opts)
            except ContainerInvalidEnvironment:
                continue

            if not container.namespace:
                continue

            filtered_list.append(container)

    return filtered_list


def get_filtered_list_of_containers(
    options=defaults.DEFAULT_CRAWL_OPTIONS,
    host_namespace=misc.get_host_ipaddr(),
):
    """
    Returns a partition of all the Container objects currently running in the
    system and sets the `namespace` and metadata of these containers.

    The partitioning is given by `partition_strategy`.
    """
    environment = options.get('environment', defaults.DEFAULT_ENVIRONMENT)
    metadata = options.get('metadata', {})
    _map = metadata.get('container_long_id_to_namespace_map', {})
    container_opts = {'host_namespace': host_namespace,
                      'environment': environment,
                      'long_id_to_namespace_map': _map,
                      'container_logs': options['logcrawler']['default_log_files']
                      }

    user_list = options.get('docker_containers_list', 'ALL')
    partition_strategy = options.get('partition_strategy', None)

    assert(partition_strategy['name'] == 'equally_by_pid')
    process_id = partition_strategy['args']['process_id']
    num_processes = partition_strategy['args']['num_processes']

    filtered_list = []
    containers_list = list_all_containers(user_list, container_opts)
    for container in containers_list:

        # The partition strategy is to split all the containers equally by
        # process pid. We do it by hashing the long_id of the container.

        _hash = container.long_id
        num = int(_hash, 16) % int(num_processes)
        if num == process_id:

            try:
                container.setup_namespace_and_metadata(container_opts)
            except ContainerInvalidEnvironment:
                continue

            if not container.namespace:
                continue

            filtered_list.append(container)

    return filtered_list


def snapshot(
    urls=['stdout://'],
    namespace=misc.get_host_ipaddr(),
    features=defaults.DEFAULT_FEATURES_TO_CRAWL,
    options=defaults.DEFAULT_CRAWL_OPTIONS,
    since='BOOT',
    frequency=-1,
    crawlmode=Modes.INVM,
    inputfile='Undefined',
    format='csv',
    overwrite=False,
):
    """Entrypoint for crawler functionality.

    This is the function executed by long running crawler processes. It just
    loops sleeping for `frequency` seconds at each crawl interval.  During each
    interval, it collects the features listed in `features`, and sends them to
    the outputs listed in `urls`.

    :param urls: The urls used as outputs of the snapshot.
    :param namespace: This is a pointer to a specific system (e.g. IP for INVM).
    :param features: List of features to crawl.
    :param options: Tree of options with details such as which config files to crawl.
    :param since: Calculate deltas or not. XXX needs some work.
    :param frequency: Sleep duration between iterations. -1 means just one run.
    :param crawlmode: What's the system we want to crawl.
    :param inputfile: Applies to mode.FILE. The frame emitted is this file.
    :param format: The format of the frame, defaults to csv.
    """

    global should_exit
    saved_args = locals()
    logger.debug('snapshot args: %s' % (saved_args))

    assert ('metadata' in options)
    environment = options.get('environment', defaults.DEFAULT_ENVIRONMENT)

    since_timestamp, last_snapshot_time = get_initial_since_values(since)

    snapshot_num = 0

    # Die if the parent dies
    PR_SET_PDEATHSIG = 1
    libc.prctl(PR_SET_PDEATHSIG, signal.SIGHUP)
    signal.signal(signal.SIGHUP, signal_handler_exit)

    if crawlmode == Modes.OUTCONTAINER:
        containers = get_filtered_list_of_containers(options, namespace)

    # This is the main loop of the system, taking a snapshot and sleeping at
    # every iteration.

    while True:

        snapshot_time = int(time.time())

        if crawlmode == Modes.OUTCONTAINER:

            curr_containers = get_filtered_list_of_containers(
                options, namespace)
            deleted = [c for c in containers if c not in curr_containers]
            containers = curr_containers

            for container in deleted:
                if options.get('link_container_log_files', False):
                    container.unlink_logfiles(options)

            logger.debug('Crawling %d containers' % (len(containers)))

            for container in containers:

                logger.info(
                    'Crawling container %s %s %s' %
                    (container.pid, container.short_id, container.namespace))

                if options.get('link_container_log_files', False):
                    # This is a NOP if files are already linked (which is
                    # pretty much always).
                    container.link_logfiles(options=options)

                snapshot_container(urls=urls,
                                   snapshot_num=snapshot_num,
                                   features=features,
                                   options=options,
                                   format=format,
                                   inputfile=inputfile,
                                   container=container,
                                   since=since,
                                   since_timestamp=since_timestamp,
                                   overwrite=overwrite)

        elif crawlmode in (Modes.INVM, Modes.MOUNTPOINT, Modes.DEVICE,
                           Modes.FILE, Modes.ISCSI):

            snapshot_generic(crawlmode=crawlmode,
                             urls=urls,
                             snapshot_num=snapshot_num,
                             features=features,
                             options=options,
                             format=format,
                             inputfile=inputfile,
                             namespace=namespace,
                             since=since,
                             since_timestamp=since_timestamp,
                             overwrite=overwrite)

        else:
            raise RuntimeError('Unknown Mode')

        # Frequency < 0 means only one run.

        if frequency < 0 or should_exit:
            logger.info('Bye')
            break

        if since == 'LASTSNAPSHOT':
            # Subsequent snapshots will update this value.
            since_timestamp = snapshot_time
        time.sleep(frequency)
        snapshot_num += 1
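

# Hedged invocation sketch (argument values are illustrative, not from the
# source): with frequency=-1 the loop above runs exactly once, emitting one
# CSV frame for each filtered container to stdout before returning.
if __name__ == '__main__':
    snapshot(urls=['stdout://'],
             namespace=misc.get_host_ipaddr(),
             features=defaults.DEFAULT_FEATURES_TO_CRAWL,
             options=defaults.DEFAULT_CRAWL_OPTIONS,
             since='BOOT',
             frequency=-1,
             crawlmode=Modes.OUTCONTAINER,
             format='csv')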
Example 6
import os
import logging
import logging.handlers
import time
import multiprocessing
import argparse
import json
import config_parser
from config_parser import (get_config,
                           apply_user_args)


# External dependencies that must be pip install'ed separately

import misc
import crawlutils
from crawlmodes import Modes

CRAWLER_HOST = misc.get_host_ipaddr()

logger = None


def csv_list(string):
    return string.split(',')


def setup_logger(logger_name, logfile='crawler.log', process_id=None):
    _logger = logging.getLogger(logger_name)
    _logger.setLevel(logging.INFO)
    (logfile_name, logfile_xtnsion) = os.path.splitext(logfile)
    if process_id is None:
        fname = logfile
    else:
        fname = '{0}-{1}{2}'.format(logfile_name, process_id, logfile_xtnsion)
    # Assumed completion of the truncated snippet: a rotating file handler;
    # the maxBytes and backupCount values are placeholders.
    handler = logging.handlers.RotatingFileHandler(
        filename=fname, maxBytes=10 * 1024 * 1024, backupCount=1)
    _logger.addHandler(handler)


def snapshot(
        urls=['stdout://'],
        namespace=misc.get_host_ipaddr(),
        features=config_parser.get_config()['general']['features_to_crawl'],
        options={},
        frequency=-1,
        crawlmode=Modes.INVM,
        format='csv',
        overwrite=False,
        first_snapshot_num=0,
        max_snapshots=-1):
    """Entrypoint for crawler functionality.

    This is the function executed by long running crawler processes. It just
    loops sleeping for `frequency` seconds at each crawl interval.  During each
    interval, it collects the features listed in `features`, and sends them to
    the outputs listed in `urls`.

    :param urls: The urls used as outputs of the snapshot.
    :param namespace: This is a pointer to a specific system (e.g. IP for INVM).
    :param features: List of features to crawl.
    :param options: Tree of options with details such as which config files to crawl.
    :param frequency: Target time period for iterations. -1 means just one run.
    :param crawlmode: What's the system we want to crawl.
    :param format: The format of the frame, defaults to csv.
    """

    global should_exit
    saved_args = locals()
    logger.debug('snapshot args: %s' % (saved_args))

    environment = options.get(
        'environment',
        config_parser.get_config()['general']['environment'])
    plugin_places = options.get(
        'plugin_places',
        config_parser.get_config()['general']['plugin_places'])

    plugin_mode = config_parser.get_config()['general']['plugin_mode']

    plugins_manager.reload_env_plugin(plugin_places=plugin_places,
                                      environment=environment)

    plugins_manager.reload_container_crawl_plugins(plugin_places=plugin_places,
                                                   features=features,
                                                   plugin_mode=plugin_mode)

    plugins_manager.reload_vm_crawl_plugins(plugin_places=plugin_places,
                                            features=features,
                                            plugin_mode=plugin_mode)

    plugins_manager.reload_host_crawl_plugins(plugin_places=plugin_places,
                                              features=features,
                                              plugin_mode=plugin_mode)

    next_iteration_time = None

    snapshot_num = first_snapshot_num

    # Die if the parent dies
    PR_SET_PDEATHSIG = 1
    try:
        libc.prctl(PR_SET_PDEATHSIG, signal.SIGHUP)
        signal.signal(signal.SIGHUP, signal_handler_exit)
    except AttributeError:
        logger.warning('prctl is not available. MacOS is not supported.')

    containers = []

    # This is the main loop of the system, taking a snapshot and sleeping at
    # every iteration.

    while True:

        snapshot_time = int(time.time())

        if crawlmode == Modes.OUTCONTAINER:
            containers = snapshot_containers(
                containers=containers,
                urls=urls,
                snapshot_num=snapshot_num,
                features=features,
                options=options,
                format=format,
                overwrite=overwrite,
                host_namespace=namespace,
            )
        elif crawlmode == Modes.MESOS:
            snapshot_mesos(
                crawlmode=crawlmode,
                urls=urls,
                snapshot_num=snapshot_num,
                options=options,
                format=format,
                overwrite=overwrite,
                namespace=namespace,
            )
        elif crawlmode == Modes.OUTVM:
            snapshot_vms(
                urls=urls,
                snapshot_num=snapshot_num,
                features=features,
                options=options,
                format=format,
                overwrite=overwrite,
                namespace=namespace,
            )
        elif crawlmode in [Modes.INVM, Modes.MOUNTPOINT]:
            snapshot_generic(crawlmode=crawlmode,
                             urls=urls,
                             snapshot_num=snapshot_num,
                             features=features,
                             options=options,
                             format=format,
                             namespace=namespace,
                             overwrite=overwrite)
        else:
            raise NotImplementedError('Crawl mode %s is not implemented' %
                                      crawlmode)

        # Frequency < 0 means only one run.
        if (frequency < 0 or should_exit or snapshot_num == max_snapshots):
            logger.info('Bye')
            break

        time_to_sleep, next_iteration_time = _get_next_iteration_time(
            next_iteration_time, frequency, snapshot_time)
        if time_to_sleep > 0:
            time.sleep(time_to_sleep)

        snapshot_num += 1
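

# Hedged sketch (not part of the source): one way to fan the crawl out over
# several worker processes using the multiprocessing module imported above.
# Each worker gets its own log file through setup_logger() and its own share
# of the containers through the 'equally_by_pid' partition_strategy consumed
# by get_filtered_list_of_containers(). The logger name, frequency and helper
# names here are illustrative.
def _crawler_worker(worker_id, num_workers, base_options):
    setup_logger('crawlutils', 'crawler.log', process_id=worker_id)
    opts = dict(base_options)
    opts['partition_strategy'] = {
        'name': 'equally_by_pid',
        'args': {'process_id': worker_id, 'num_processes': num_workers},
    }
    snapshot(options=opts, crawlmode=Modes.OUTCONTAINER, frequency=60)


def _start_crawler_workers(num_workers, base_options):
    workers = [multiprocessing.Process(target=_crawler_worker,
                                       args=(i, num_workers, base_options))
               for i in range(num_workers)]
    for worker in workers:
        worker.start()
    return workers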


def snapshot(
    urls=['stdout://'],
    namespace=misc.get_host_ipaddr(),
    features=defaults.DEFAULT_FEATURES_TO_CRAWL,
    options=defaults.DEFAULT_CRAWL_OPTIONS,
    since='BOOT',
    frequency=-1,
    crawlmode=Modes.INVM,
    inputfile='Undefined',
    format='csv',
    overwrite=False,
):
    """Entrypoint for crawler functionality.

    This is the function executed by long running crawler processes. It just
    loops sleeping for `frequency` seconds at each crawl interval.  During each
    interval, it collects the features listed in `features`, and sends them to
    the outputs listed in `urls`.

    :param urls: The urls used as outputs of the snapshot.
    :param namespace: This is a pointer to a specific system (e.g. IP for INVM).
    :param features: List of features to crawl.
    :param options: Tree of options with details such as which config files to crawl.
    :param since: Calculate deltas or not. XXX needs some work.
    :param frequency: Target time period for iterations. -1 means just one run.
    :param crawlmode: What's the system we want to crawl.
    :param inputfile: Applies to mode.FILE. The frame emitted is this file.
    :param format: The format of the frame, defaults to csv.
    """

    global should_exit
    saved_args = locals()
    logger.debug('snapshot args: %s' % (saved_args))

    assert('metadata' in options)
    environment = options.get('environment', defaults.DEFAULT_ENVIRONMENT)
    plugin_places = options.get('plugin_places',
                                defaults.DEFAULT_PLUGIN_PLACES).split(',')
    plugins_manager.reload_env_plugin(plugin_places=plugin_places,
                                      environment=environment)

    since_timestamp, last_snapshot_time = get_initial_since_values(since)
    next_iteration_time = None

    snapshot_num = 0

    # Die if the parent dies
    PR_SET_PDEATHSIG = 1
    libc.prctl(PR_SET_PDEATHSIG, signal.SIGHUP)
    signal.signal(signal.SIGHUP, signal_handler_exit)

    if crawlmode == Modes.OUTCONTAINER:
        containers = get_filtered_list_of_containers(options, namespace)

    # This is the main loop of the system, taking a snapshot and sleeping at
    # every iteration.

    while True:

        snapshot_time = int(time.time())

        if crawlmode == Modes.OUTCONTAINER:

            curr_containers = get_filtered_list_of_containers(options,
                                                              namespace)
            deleted = [c for c in containers if c not in curr_containers]
            containers = curr_containers

            for container in deleted:
                if options.get('link_container_log_files', False):
                    try:
                        container.unlink_logfiles(options)
                    except NotImplementedError:
                        pass
 
            logger.debug('Crawling %d containers' % (len(containers)))

            for container in containers:

                logger.info(
                    'Crawling container %s %s %s' %
                    (container.pid, container.short_id, container.namespace))

                if options.get('link_container_log_files', False):
                    # This is a NOP if files are already linked (which is
                    # pretty much always).
                    try:
                        container.link_logfiles(options=options)
                    except NotImplementedError:
                        pass

                # no feature crawling
                if 'nofeatures' in features:
                    continue
                snapshot_container(
                    urls=urls,
                    snapshot_num=snapshot_num,
                    features=features,
                    options=options,
                    format=format,
                    inputfile=inputfile,
                    container=container,
                    since=since,
                    since_timestamp=since_timestamp,
                    overwrite=overwrite
                )

        elif crawlmode in (Modes.INVM,
                           Modes.MOUNTPOINT,
                           Modes.DEVICE,
                           Modes.FILE,
                           Modes.ISCSI):

            snapshot_generic(
                crawlmode=crawlmode,
                urls=urls,
                snapshot_num=snapshot_num,
                features=features,
                options=options,
                format=format,
                inputfile=inputfile,
                namespace=namespace,
                since=since,
                since_timestamp=since_timestamp,
                overwrite=overwrite
            )

        else:
            raise RuntimeError('Unknown Mode')

        if since == 'LASTSNAPSHOT':
            # Subsequent snapshots will update this value.
            since_timestamp = snapshot_time

        # Frequency < 0 means only one run; frequency == 0 loops again
        # immediately without sleeping.
        if frequency < 0 or should_exit:
            logger.info('Bye')
            break
        elif frequency == 0:
            continue

        if next_iteration_time is None:
            next_iteration_time = snapshot_time + frequency
        else:
            next_iteration_time = next_iteration_time + frequency

        while next_iteration_time + frequency < time.time():
            next_iteration_time = next_iteration_time + frequency

        time_to_sleep = next_iteration_time - time.time()
        if time_to_sleep > 0:
            time.sleep(time_to_sleep)

        snapshot_num += 1
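

# Hedged sketch (not shown in the source): a helper equivalent to the inline
# scheduling logic above, matching the _get_next_iteration_time() call used
# in an earlier snapshot() variant. It keeps iterations on a fixed period
# and, when a crawl overruns by more than one period, skips the missed
# iterations rather than trying to catch up.
def _get_next_iteration_time(next_iteration_time, frequency, snapshot_time):
    if next_iteration_time is None:
        next_iteration_time = snapshot_time + frequency
    else:
        next_iteration_time += frequency

    # Drop whole periods that were missed while the last crawl was running.
    while next_iteration_time + frequency < time.time():
        next_iteration_time += frequency

    time_to_sleep = next_iteration_time - time.time()
    return time_to_sleep, next_iteration_time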
Example 9
import cPickle as pickle
import json
import copy


# External dependencies that must be pip install'ed separately

import bottle
import defaults
import misc
import crawlutils
from crawlmodes import Modes

app = bottle.Bottle()

CRAWLER_HOST = misc.get_host_ipaddr()
CRAWLER_PORT = 9999

logger = None

# This dict keeps track of active snapshot tasks on this host

tasks = {}

# this string should be the same as the contents of the README.API file

apihelp = \
    '''

Crawler API
-----------