Example #1
def _kill_process(leash):
    leash.terminate()

    # Give the child a grace period, then refresh returncode; it is only
    # populated after poll() (or wait()).
    time.sleep(3)

    leash.poll()

    if leash.returncode is None:
        log.error('Process %s did not stop gracefully, killing...' % leash.pid)
        leash.kill()
        leash.wait()
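
A hedged usage sketch of the terminate-then-kill pattern above; the child command is arbitrary and only for illustration:

# Hypothetical long-running child process.
leash = subprocess.Popen(['sleep', '60'])

# Ask for graceful shutdown first; escalate to SIGKILL if the process is
# still alive after the 3-second grace period.
_kill_process(leash)
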
def get_recording(data_dir,
                  filename,
                  exists=False,
                  not_exists=False,
                  ensure_path=False):
    filename = os.path.abspath(os.path.join(data_dir, filename))

    if not filename.startswith(data_dir):
        log.error('Requested recording %s is outside of data root' % filename)
        raise error.ControlPlaneError('No such recording')

    if exists and not os.path.exists(filename):
        log.error('Requested recording %s does not exist' % filename)
        raise error.ControlPlaneError('No such recording')

    if not_exists and os.path.exists(filename):
        log.error('Requested recording %s unexpectedly exists' % filename)
        raise error.ControlPlaneError('No such recording')

    if ensure_path:
        directory = os.path.dirname(filename)
        if not os.path.exists(directory):
            try:
                os.makedirs(directory)

            except OSError as exc:
                log.error('Failed to create %s: %s', directory, exc)
                raise error.ControlPlaneError('No such recording')

        elif not os.path.isdir(directory):
            raise error.ControlPlaneError('No such recording')

    return os.path.dirname(filename), os.path.basename(filename)
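
A hedged usage sketch of the containment check above; the data directory and recording names are made up, and `data_dir` is assumed to be an absolute path so that the `startswith` test is meaningful:

# Hypothetical data root, for illustration only.
data_dir = '/var/snmpsim/data'

# Resolves inside the data root and returns
# ('/var/snmpsim/data/agents', 'demo.snmprec').
path, name = get_recording(data_dir, 'agents/demo.snmprec')

# Resolves to /etc/passwd, outside the data root, so this raises
# error.ControlPlaneError('No such recording').
get_recording(data_dir, '../../../etc/passwd')
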
Example #3
def import_metrics(jsondoc):
    """Update metrics DB from `dict` data structure.

    The input data structure is expected to be the one produced by SNMP
    simulator's command responder `fulljson` reporting module.
    """
    flavor = jsondoc.get('format')
    importer = KNOWN_IMPORTERS.get(flavor)
    if not importer:
        log.error('Unknown metric flavor %s, '
                  'ignoring' % (flavor or '<unspecified>'))
        return

    try:
        importer(jsondoc)

    except Exception as exc:
        log.error('Metric importer %s failed: %s' % (flavor, exc))
        log.error('JSON document causing failure is: %s' % jsondoc)
        db.session.rollback()
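
A minimal invocation sketch; the document shape is illustrative only, assuming KNOWN_IMPORTERS maps the `fulljson` flavor (the format produced by the SNMP simulator command responder's `fulljson` reporting module) to a handler:

# Illustrative document; real `fulljson` output carries many more fields.
jsondoc = {
    'format': 'fulljson',
    # ... flavor-specific metrics payload ...
}

import_metrics(jsondoc)  # dispatches to KNOWN_IMPORTERS['fulljson'], if registered
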
Example #4
def watch_metrics(watch_dir):

    log.info('Watching directory %s' % watch_dir)

    while True:

        try:
            files = _traverse_dir(watch_dir)

        except Exception as exc:
            log.error('Directory %s traversal failure: %s' % (watch_dir, exc))
            time.sleep(10)
            continue

        for filename in files:

            log.info('Processing %s' % filename)

            try:
                with open(filename) as fl:
                    jsondoc = json.loads(fl.read())

            except Exception as exc:
                log.error('Error reading file %s: %s' % (filename, exc))
                continue

            finally:
                # Consume the metrics file regardless of parse outcome so it
                # is not picked up again on the next pass.
                os.unlink(filename)

            try:
                manager.import_metrics(jsondoc)

            except Exception as exc:
                log.error('Error processing file %s: %s' % (filename, exc))
                continue

        time.sleep(POLL_PERIOD)
Example #5
def manage_executables(watch_dir):
    known_instances = {}

    log.info('Watching directory %s' % watch_dir)

    while True:
        # Collect and log processes output

        rlist = {
            x['pipe'][0]: x['executable']
            for x in known_instances.values() if x['state'] == STATE_RUNNING
        }

        while True:
            try:
                r, w, x = select.select(rlist, [], [], 0.1)

            except Exception as exc:
                log.error(exc)
                break

            if not r:
                break

            timestamp = int(time.time())

            for fd in r:
                executable = rlist[fd]
                instance = known_instances[executable]
                console = instance['console']

                log.msg('Output from process "%s" begins' % executable)

                page_text = os.read(fd, console.MAX_CONSOLE_SIZE)
                page_text = page_text.decode(errors='ignore')

                console.add(page_text, timestamp)

                log.msg(page_text)
                log.msg('Output from process "%s" ends' % executable)

        # Watch executables

        existing_files = set()

        try:
            files = _traverse_dir(watch_dir)

        except Exception as exc:
            log.error('Directory %s traversal failure: %s' % (watch_dir, exc))
            time.sleep(10)
            continue

        for fl in files:
            instance = known_instances.get(fl)

            stat = os.stat(fl).st_mtime

            if not instance:
                instance = {
                    'pid': 0,
                    'executable': fl,
                    'file_info': stat,
                    'leash': None,
                    'pipe': (None, None),
                    'state': STATE_ADDED,
                    'created': time.time(),
                    'started': None,
                    'stopped': None,
                    'runtime': lifecycle.Counter(0),
                    'changes': lifecycle.Counter(0),
                    'exits': lifecycle.Counter(0),
                    'console': lifecycle.ConsoleLog(),
                }
                known_instances[fl] = instance

                log.info('Start tracking executable %s' % fl)

            pid = instance['leash'].pid if instance['leash'] else '?'

            if instance['file_info'] != stat:
                instance['file_info'] = stat
                instance['state'] = STATE_CHANGED
                instance['changes'] += 1

                log.info('Existing executable %s (PID %s) has '
                         'changed' % (fl, pid))

            if instance['state'] == STATE_RUNNING:
                executable = instance['leash']

                executable.poll()

                if executable.returncode is not None:
                    instance['state'] = STATE_DIED
                    instance['stopped'] = time.time()
                    instance['exits'] += 1

                    # Fall back to zero uptime if the start timestamp was
                    # never recorded.
                    uptime = int(
                        time.time() - (instance['started'] or time.time()))

                    log.info('Executable %s (PID %s) has died '
                             '(rc=%s), uptime %s' %
                             (fl, pid, executable.returncode, uptime))

            existing_files.add(fl)

        removed_files = set(known_instances) - existing_files

        for fl in removed_files:
            instance = known_instances[fl]
            instance['state'] = STATE_REMOVED
            instance['changes'] += 1

            log.info('Existing executable %s (PID %s) has been '
                     'removed' % (fl, instance['pid']))

        for fl, instance in tuple(known_instances.items()):
            state = instance['state']

            if state in (STATE_ADDED, STATE_DIED):
                if state == STATE_DIED:
                    r, w = instance['pipe']

                    try:
                        os.close(r)
                        os.close(w)

                    except OSError as exc:
                        log.error(exc)

                r, w = os.pipe()

                leash = _run_process(fl, w)

                instance['leash'] = leash
                instance['pipe'] = r, w

                if leash:
                    instance['state'] = STATE_RUNNING
                    instance['started'] = time.time()
                    instance['pid'] = leash.pid

                    log.info('Executable %s (PID %s) has been '
                             'started' % (fl, leash.pid))

            elif state in (STATE_CHANGED, STATE_REMOVED):
                leash = instance['leash']

                if leash:
                    _kill_process(leash)

                    log.info('Executable %s (PID %s) has been '
                             'stopped' % (fl, leash.pid))

                r, w = instance['pipe']
                if r:
                    try:
                        os.close(r)
                        os.close(w)

                    except OSError as exc:
                        log.error(exc)

                if state == STATE_CHANGED:
                    instance['state'] = STATE_DIED

                else:
                    known_instances.pop(fl)

                    log.info('Stopped tracking executable %s' % fl)

            elif state == STATE_RUNNING:
                leash = instance['leash']
                if _process_is_running(leash):
                    now = time.time()
                    instance['runtime'] = lifecycle.Counter(
                        now - instance['created'])

                else:
                    instance['state'] = STATE_DIED
                    instance['exits'] += 1

                    log.info('Executable %s (PID %s) has '
                             'died' % (fl, leash.pid))

        ReportingManager.process_metrics(watch_dir, *known_instances.values())

        time.sleep(POLL_PERIOD)
Example #6
def _run_process(fl, fd):
    try:
        return subprocess.Popen([fl], stdout=fd, stderr=fd)

    except Exception as exc:
        log.error('Executable %s failed to start: %s' % (fl, exc))
        # Falls through to an implicit None so callers can detect the failure.
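
A hedged sketch of how `_run_process` is wired up with a pipe, mirroring `manage_executables` above; the executable path is hypothetical:

# Hypothetical executable path, for illustration only.
r, w = os.pipe()
leash = _run_process('/opt/snmpsim/scripts/agent.sh', w)

if leash:
    # The manager select()s on the read end and drains child output with
    # os.read(), as in manage_executables above.
    ready, _, _ = select.select([r], [], [], 0.1)
    if ready:
        output = os.read(r, 4096).decode(errors='ignore')
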
Example #7
def collect_metrics(*instances):
    """Collect process metrics.

    Example
    -------

    .. code-block::

        {
            'executable': '/path/to/executable',
            'memory': 0,  # memory being used (MB, gauge)
            'cpu': 0,  # consumed cpu time (ms, cumulative)
            'files': 0,  # number of open files (gauge)
            'runtime': 0,  # total time this executable has been running
                           # (cumulative)
            'exits': 0,  # number of unexpected exits (cumulative)
            'restarts': 0,  # number of restarts because of changes
                            # (cumulative)
            'endpoints': {  # allocated network endpoints (gauge)
                'udpv4': [
                    '127.0.0.1:161',
                    '127.0.0.2:161'
                ]
            },
            'console': [
                {
                    'timestamp': {time},
                    'text': '{text}'
                }
            ]
        }
    """
    all_metrics = []

    for instance in instances:
        pid = instance['pid']
        if not pid:
            continue

        try:
            process = psutil.Process(pid)

            process_info = process.as_dict()

            endpoints = collections.defaultdict(list)

            for kind in ENDPOINT_MAP:
                for conn in process.connections(kind):
                    endpoints[ENDPOINT_MAP[kind]].append(
                        '%s:%s' % (conn.laddr.ip, conn.laddr.port)
                    )

        except psutil.Error as exc:
            log.error(exc)
            continue

        metrics = {
            'memory': lifecycle.Gauge(
                process_info['memory_info'].vms // 1024 // 1024),
            'cpu': lifecycle.Counter(
                (process_info['cpu_times'].user +
                 process_info['cpu_times'].system) * 1000),
            'endpoints': endpoints,
            'files': lifecycle.Gauge(process_info['num_fds']),
        }

        metrics.update(
            **{metric: instance[metric] for metric in LIFECYCLE_METRICS})

        all_metrics.append(metrics)

    return all_metrics
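
A minimal usage sketch, with the current process standing in for a managed executable; the instance record is stripped to illustrative values and LIFECYCLE_METRICS is assumed to name the lifecycle counters kept by manage_executables ('runtime', 'exits', 'changes'):

# Hypothetical, minimal instance record for illustration only.
instance = {
    'pid': os.getpid(),
    'executable': '/proc/self/exe',
    'runtime': lifecycle.Counter(0),
    'exits': lifecycle.Counter(0),
    'changes': lifecycle.Counter(0),
}

for metrics in collect_metrics(instance):
    print(metrics['memory'], metrics['cpu'], metrics['files'])
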