Ejemplo n.º 1
0
    def destroy(self):
        """Tear down the monitoring stack.

        Removes the Grafana, Telegraf and InfluxDB containers, as well
        as the volume backing the InfluxDB data.
        """
        # Grafana lives on the "ui" hosts.
        with play_on(pattern_hosts="ui", roles=self._roles) as p:
            p.docker_container(display_name="Destroying Grafana",
                               name="grafana",
                               state="absent",
                               force_kill=True)

        # Telegraf agents run on every "agent" host.
        with play_on(pattern_hosts="agent", roles=self._roles) as p:
            p.docker_container(
                display_name="Destroying telegraf",
                name="telegraf",
                state="absent",
            )

        # The collector keeps the data: remove both the container and
        # its backing volume.
        with play_on(pattern_hosts="collector", roles=self._roles) as p:
            p.docker_container(display_name="Destroying InfluxDB",
                               name="influxdb",
                               state="absent",
                               force_kill=True)
            p.docker_volume(display_name="Destroying associated volumes",
                            name="influxdb-data",
                            state="absent")
Ejemplo n.º 2
0
    def _get_cpus(self):
        """Collect ``lscpu`` output from every sensored host and build
        the cpu lookup dictionaries (lazy: only runs on first call)."""
        if self.hostname_to_cpu:  # already populated, nothing to do
            return

        local_root = './_tmp_enos_/lscpus'
        remote_file = 'tmp/lscpu'

        # Drop any stale data left over from a previous run.
        local_path = Path(local_root)
        if local_path.exists() and local_path.is_dir():
            shutil.rmtree(local_path)

        # Run lscpu remotely and fetch one copy per host
        # (flat=False keeps a per-host directory layout locally).
        with play_on(pattern_hosts='sensors', roles=self._roles) as p:
            p.shell(f'lscpu > /{remote_file}')
            p.fetch(
                display_name='Retrieving the result of lscpu…',
                src=f'/{remote_file}', dest=f'{local_root}', flat=False,
            )

        # One subdirectory per host: parse each lscpu dump.
        for host_dir in Path(local_root).iterdir():
            cpu = CPU((host_dir / remote_file).resolve())
            cpu.get_cpu()
            self.cpuname_to_cpu[cpu.cpu_name] = cpu
            self.hostname_to_cpu[host_dir.name] = cpu
Ejemplo n.º 3
0
def config(env=None):
    """Record the current date on every host, then print the local copy."""
    with play_on(pattern_hosts="all", roles=env["roles"]) as p:
        p.shell("date > /tmp/date")

    # Read back the file written on the local machine.
    with open("/tmp/date") as f:
        print(f.readlines())
Ejemplo n.º 4
0
def provision(rs: Roles):
    """Prepare the OpenStack hosts: base packages, IP forwarding, root
    ssh access with a password, and a few bash niceties."""
    ensure_python3(roles=rs)

    with play_on(roles=rs, pattern_hosts="OpenStack") as p:
        # Install the bare necessities
        p.apt(update_cache=True,
              pkg=['bat', 'curl', 'htop', 'tcpdump', 'lynx', 'vim', 'kmod'])
        # Workaround ripgrep error
        # https://bugs.launchpad.net/ubuntu/+source/rust-bat/+bug/1868517
        p.raw('apt download ripgrep')
        p.raw('dpkg --force-overwrite -i ripgrep*.deb')

        # IP Forwarding
        p.raw('sysctl -w net.ipv4.ip_forward=1')

        # Setup ssh for root w/ password
        p.raw('echo "root:lab-os" | chpasswd')
        p.blockinfile(path='/etc/ssh/sshd_config',
                      block='''
                      PasswordAuthentication yes
                      PermitRootLogin yes
                      ''')
        p.systemd(name='ssh', state='restarted')

        # Enhance default bash
        bashrc_lines = (
            '. /etc/bash_completion',  # Offer bash completion
            'export PATH=/snap/bin:${PATH}',  # Put /snap/bin in PATH
            'alias cat="bat --style=plain"',  # Better cat
            'alias fgrep="rg --fixed-strings"',  # Better fgrep
        )
        for line in bashrc_lines:
            p.lineinfile(path='/root/.bashrc', line=line)
Ejemplo n.º 5
0
    def destroy(self):
        """Destroy the energy monitoring stack.

        Removes the Grafana, sensor, SmartWatts, MongoDB and InfluxDB
        containers on their respective hosts.
        """
        self._get_cpus()

        with play_on(pattern_hosts="grafana", roles=self._roles) as p:
            p.docker_container(
                display_name="Destroying Grafana…", name="grafana", state="absent",
                force_kill=True,
            )

        with play_on(pattern_hosts="sensors", roles=self._roles) as p:
            p.docker_container(
                display_name="Destroying sensors…", name="powerapi-sensor", state="absent",
                force_kill=True,
            )

        # FIX: the original advanced its counter with `++i`, a no-op in
        # Python (double unary plus), so only formulas[0] was ever
        # targeted. Sweeping every formula host is idempotent
        # (state="absent") and removes each per-cpu SmartWatts container
        # wherever deploy happened to put it.
        for cpu in self.cpuname_to_cpu.values():
            smartwatts_name = self._get_smartwatts_name(cpu)
            for formula in self.formulas:
                with play_on(pattern_hosts=self._get_address(formula),
                             roles=self._roles) as p:
                    p.docker_container(
                        display_name="Destroying SmartWatts…",
                        name=f"{smartwatts_name}", state="absent",
                        force_kill=True,
                    )

        with play_on(pattern_hosts="mongos", roles=self._roles) as p:
            p.docker_container(
                display_name="Destroying MongoDBs…", name="mongodb", state="absent",
                force_kill=True,
            )

        with play_on(pattern_hosts="influxdbs", roles=self._roles) as p:
            p.docker_container(
                display_name="Destroying InfluxDBs…", name="influxdb", state="absent",
                force_kill=True,
            )
Ejemplo n.º 6
0
 def deploy(self):
     """Deploy the Skydive service.

     Installs python/pip prerequisites on every host, then delegates the
     actual deployment to the bundled ``skydive.yml`` playbook.
     """
     # Some requirements
     with play_on(pattern_hosts="all", roles=self.roles) as p:
         p.apt(
             display_name="[Preinstall] Installing python-pip",
             name=["python3", "python-pip", "python3-pip"],
             state="present",
             update_cache=True,
         )
         p.pip(display_name="[Preinstall] Installing pyyaml", name="pyyaml")
     # Hand off to the Skydive playbook shipped alongside this service.
     _playbook = os.path.join(SERVICE_PATH, "skydive", "skydive.yml")
     run_ansible([_playbook], roles=self.roles, extra_vars=self.extra_vars)
Ejemplo n.º 7
0
def bootstrap(rs: Roles):
    """Bootstrap the OpenStack hosts: base packages, root ssh access
    with a password, and /snap/bin on root's PATH."""
    ensure_python3(roles=rs)

    with play_on(roles=rs, pattern_hosts="OpenStack") as p:
        # Install the bare necessities
        packages = [
            'silversearcher-ag', 'curl', 'htop', 'tcpdump', 'lynx', 'vim',
            'kmod'
        ]
        p.apt(pkg=packages)

        # Setup ssh for root w/ password
        p.raw('echo "root:os-imt" | chpasswd')
        p.blockinfile(path='/etc/ssh/sshd_config',
                      block='''
                      PasswordAuthentication yes
                      PermitRootLogin yes
                      ''')
        p.systemd(name='ssh', state='restarted')

        # Put /snap/bin in PATH
        p.lineinfile(path='/root/.bashrc',
                     line='export PATH=/snap/bin:${PATH}')
Ejemplo n.º 8
0
    def backup(self, backup_dir=None):
        """Backup the monitoring stack.

        Args:
            backup_dir (str): path of the backup directory to use.
                Defaults to the current working directory.
        """
        target = Path.cwd() if backup_dir is None else backup_dir
        # Make sure the backup directory exists somewhere on disk.
        _backup_dir = _to_abs(target)
        _backup_dir.mkdir(parents=True, exist_ok=True)

        with play_on(pattern_hosts="collector", roles=self._roles) as p:
            # Stop InfluxDB so the on-disk data is consistent...
            p.docker_container(display_name="Stopping InfluxDB",
                               name="influxdb",
                               state="stopped")
            # ...archive its data volume and fetch the tarball locally...
            p.archive(
                display_name="Archiving the data volume",
                path="/influxdb-data",
                dest="/influxdb-data.tar.gz",
            )
            p.fetch(
                display_name="Fetching the data volume",
                src="/influxdb-data.tar.gz",
                dest=str(Path(_backup_dir, "influxdb-data.tar.gz")),
                flat=True,
            )
            # ...and bring the database back up.
            p.shell("docker start influxdb",
                    display_name="Restarting InfluxDB")
Ejemplo n.º 9
0
# Show the task boxes and log the longest possible task duration.
boxes.print()
longestTimeOfLongest = boxes.getMaxTime()
logging.debug(f"Longest possible task takes {longestTimeOfLongest}ms.")

# Seed the RNG so runs are reproducible.
seed(SEED)



# Reserve Grid'5000 resources and attach network information to roles.
provider = G5k(conf)
roles, networks = provider.init()
roles = discover_networks(roles, networks)



# Install the docker python SDK everywhere (the priors presumably set up
# python3 and docker first — names suggest so).
priors = [__python3__, __default_python3__, __docker__]
with play_on(pattern_hosts='all', roles=roles, priors=priors) as p:
    p.pip(display_name='Installing python-docker…', name='docker')
## #A deploy jaeger, for now, we set up with all in one
with play_on(pattern_hosts='collector', roles=roles) as p:
    p.docker_container(
        display_name=f'Installing jaeger…',
        name='jaeger',
        image='jaegertracing/all-in-one:1.17',
        detach=True, network_mode='host', state='started',
        recreate=True,
        published_ports=['5775:5775/udp',
                         '6831:6831/udp', '6832:6832/udp',
                         '5778:5778',
                         '16686:16686',
                         '14268:14268',
Ejemplo n.º 10
0
# Attach network info to the roles (used by the Energy service below).
roles = discover_networks(roles, networks)

# Energy monitoring stack: sensors on the 'sensored' hosts, everything
# else (mongo, formulas, influxdb, grafana) on the 'collector' host(s).
m = Energy(sensors=roles['sensored'],
           mongos=roles['collector'],
           formulas=roles['collector'],
           influxdbs=roles['collector'],
           grafana=roles['collector'],
           monitor={
               'dram': False,
               'cores': True
           })

m.deploy()

# Install the docker python SDK everywhere.
priors = [__python3__, __default_python3__, __docker__]
with play_on(pattern_hosts='all', roles=roles, priors=priors) as p:
    p.pip(display_name='Installing python-docker…', name='docker')

## #A deploy jaeger, for now, we set up with all in one
with play_on(pattern_hosts='collector', roles=roles) as p:
    p.docker_container(display_name=f'Installing jaeger…',
                       name='jaeger',
                       image='jaegertracing/all-in-one:1.17',
                       detach=True,
                       network_mode='host',
                       state='started',
                       recreate=True,
                       published_ports=[
                           '5775:5775/udp', '6831:6831/udp', '6832:6832/udp',
                           '5778:5778', '16686:16686', '14268:14268',
                           '14250:14250', '9411:9411'
Ejemplo n.º 11
0
           formulas=roles['control'],
           influxdbs=roles['control'],
           grafana=roles['control'],
           monitor={
               'dram': True,
               'cores': True
           })

m.deploy()

# Grafana listens on port 3000 of the first 'control' host.
ui_address = roles['control'][0].extra['my_network_ip']
print("Grafana is available at http://%s:3000" % ui_address)
print("user=admin, password=admin")

## #B deploy a service
with play_on(pattern_hosts='compute', roles=roles) as p:
    # Load a pre-built image from a tarball already on the remote host.
    p.docker_image(  #source='load', # Added in ansible 2.8
        name='meow-world',
        tag='latest',
        load_path='/home/brnedelec/meow-world_latest.tar'
    )  ## (TODO) automatic or configurable

with play_on(pattern_hosts='compute',
             roles=roles,
             extra_vars={
                 'ansible_hostname_to_cpu': m.hostname_to_cpu,
                 'ansible_hostname_to_influxdb': m.hostname_to_influxdb
             }) as p:
    p.docker_container(
        display_name='Installing meow-world service…',
        name='meow-world-{{inventory_hostname_short}}',
Ejemplo n.º 12
0
def distem_bootstrap(roles, path_sshkeys):
    """Bootstrap distem on G5k nodes.

    Args:
        roles (dict): physical machines to start containers on.
        path_sshkeys (dict): ssh keys paths ("private" and "public").

    Returns:
        distem (class): distem client connected to the coordinator
        (the first host of the roles).
    """
    coordinator = _get_all_hosts(roles)[0]
    distem = d.Distem(serveraddr=coordinator)
    got_pnodes = False

    # check if a client is already running
    try:
        got_pnodes = distem.pnodes_info()
    except Exception:
        logger.error("No pnodes detected - Not critical error")

    # FIX: read the public key once with the file properly closed
    # (the original opened it inline and leaked the file handle).
    with open(path_sshkeys["public"]) as f:
        public_key = f.read()

    with play_on(roles=roles) as p:
        # copy ssh keys for each node
        p.copy(dest="/root/.ssh/id_rsa",
               src=path_sshkeys["private"],
               mode="600")
        p.copy(dest="/root/.ssh/id_rsa.pub",
               src=path_sshkeys["public"],
               mode="600")
        p.lineinfile(path="/root/.ssh/authorized_keys", line=public_key)

        repo = "deb [allow_insecure=yes] http://distem.gforge.inria.fr/deb-stretch ./"
        # install Distem from the debian package
        p.apt_repository(repo=repo, update_cache="no", state="present")
        p.shell("apt-get update")
        p.apt(
            name="distem",
            state="present",
            allow_unauthenticated="yes",
            force="yes",
            force_apt_get="yes",
        )
        # tmux is needed to keep distemd alive (see below)
        p.apt(name="tmux", state="present")
        p.apt_repository(repo=repo, update_cache="no", state="absent")

    if got_pnodes:
        distem.pnodes_quit()

    with play_on(roles=roles) as p:
        # kill distem process for each node
        kill_cmd = []
        kill_cmd.append("kill -9 `ps aux|grep \"distemd\"")
        kill_cmd.append("grep -v grep")
        kill_cmd.append("sed \"s/ \\{1,\\}/ /g\"")
        kill_cmd.append("cut -f 2 -d\" \"`")
        p.shell("|".join(kill_cmd) + "|| true")
        p.wait_for(state="stopped", port=4567)
        p.wait_for(state="stopped", port=4568)

    with play_on(pattern_hosts=coordinator, roles=roles) as p:
        p.file(state="directory", dest=PATH_DISTEMD_LOGS)
        # nohup starts distem but 4568 is unreachable (and init-pnodes returns
        # nil) The only thing I found is to start distem in a tmux session...
        # this is weird because distem-bootstrap seems to start correctly
        # distem over SSH without any trouble
        p.shell('tmux new-session -d "exec distemd --verbose -d"')
        p.wait_for(state="started", port=4567, timeout=10)
        p.wait_for(state="started", port=4568, timeout=10)

    distem.pnode_init(_get_all_hosts(roles))

    return distem
Ejemplo n.º 13
0
    def deploy(self):
        """Deploy the monitoring stack.

        Order matters: the InfluxDB collector comes up first, then the
        Telegraf agents (pointed at it), and finally the Grafana UI.
        Does nothing when no collector host was given.
        """
        if self.collector is None:
            return

        # Some requirements
        with play_on(pattern_hosts="all", roles=self._roles) as p:
            p.apt(
                display_name="Installing python-setuptools",
                name="python-pip",
                state="present",
                update_cache=True,
            )
            p.pip(display_name="Installing python-docker", name="docker")
            # Install docker itself only where it is missing.
            p.shell(
                "which docker || (curl -sSL https://get.docker.com/ | sh)",
                display_name="Installing docker",
            )

        # Deploy the collector
        with play_on(pattern_hosts="collector", roles=self._roles) as p:
            p.docker_container(
                display_name="Installing",
                name="influxdb",
                image="influxdb",
                detach=True,
                network_mode="host",
                state="started",
                volumes=["/influxdb-data:/var/lib/influxdb"],
            )
            p.wait_for(
                display_name="Waiting for InfluxDB to be ready",
                host="localhost",
                port="8086",
                state="started",
                delay=2,
                timeout=120,
            )

        # Deploy the agents
        _path = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
        if self.network is not None:
            # This assumes that `discover_network` has been run before
            collector_address = self.collector[0].extra[self.network + "_ip"]
        else:
            collector_address = self.collector[0].address
        extra_vars = {"collector_address": collector_address}
        with play_on(pattern_hosts="agent",
                     roles=self._roles,
                     extra_vars=extra_vars) as p:
            # Render the telegraf configuration with the collector address.
            p.template(
                display_name="Generating the configuration file",
                src=os.path.join(_path, self.agent_conf),
                dest="/telegraf.conf",
            )

            # Bind-mount host /sys and /proc (read-only) so telegraf can
            # report host-level metrics from inside the container.
            # FIX: "sys:/rootfs/sys:ro" created a *named* docker volume
            # called "sys" instead of bind-mounting the host /sys tree
            # (compare with the "/proc" entry right below, and with the
            # HOST_SYS=/rootfs/sys env var that expects the host tree).
            volumes = [
                "/telegraf.conf:/etc/telegraf/telegraf.conf",
                "/sys:/rootfs/sys:ro",
                "/proc:/rootfs/proc:ro",
                "/var/run/docker.sock:/var/run/docker.sock:ro",
            ]

            p.docker_container(
                display_name="Installing Telegraf",
                name="telegraf",
                image="telegraf",
                detach=True,
                state="started",
                network_mode="host",
                volumes=volumes,
                env={
                    "HOST_PROC": "/rootfs/proc",
                    "HOST_SYS": "/rootfs/sys"
                },
            )
        # Deploy the UI
        with play_on(pattern_hosts="ui", roles=self._roles) as p:
            p.docker_container(
                display_name="Installing Grafana",
                name="grafana",
                image="grafana/grafana",
                detach=True,
                network_mode="host",
                state="started",
            )
            p.wait_for(
                display_name="Waiting for grafana to be ready",
                host="localhost",
                port=3000,
                state="started",
                delay=2,
                timeout=120,
            )
Ejemplo n.º 14
0
    def deploy(self):
        """Deploy the energy monitoring stack.

        Pipeline: PowerAPI sensors push HWPC reports to MongoDB; one
        SmartWatts formula per cpu type turns those reports into power
        estimations stored in InfluxDB; an optional Grafana displays
        the results.
        """
        ## #0A Retrieve requirements
        with play_on(pattern_hosts='all', roles=self._roles, priors=self.priors) as p:
            p.pip(display_name='Installing python-docker…', name='docker')

        ## #0B retrieve cpu data from each host then perform a checking
        self._get_cpus()

        logging.debug(self.cpuname_to_cpu)
        logging.debug(self.hostname_to_cpu)

        if (len(self.mongos) > len(self.cpuname_to_cpu) or
            len(self.formulas) > len(self.cpuname_to_cpu) or
            len(self.influxdbs) > len(self.formulas)):
            logging.warning("""There might be an issue with the setup: too many
            collectors (stack dbs and analysis), (or) not enough cpu types.
            It may waste resources.""")

        ## #0C clean everything to make sure that interdependency
        ## conditions are met (needed since restarting without it led
        ## to early crashes of smartwatts formula…)
        self.destroy()

        ## #1 Deploy MongoDB collectors
        with play_on(pattern_hosts='mongos', roles=self._roles) as p:
            p.docker_container(
                display_name='Installing mongodb…',
                name='mongodb',
                image=f'mongo:{MONGODB_VERSION}',
                detach=True, state='started', recreate=True,
                exposed_ports=['27017'],
                published_ports=[f'{MONGODB_PORT}:27017'],
                volumes='/tmp/:/data/db',
            )
            p.wait_for(
                display_name='Waiting for MongoDB to be ready…',
                host='localhost', port='27017', state='started',
                delay=2, timeout=120,
            )

        ## #2 Deploy energy sensors: each sensored host reports to the
        ## mongo/influxdb assigned to its cpu type (round-robin).
        cpunames = list(self.cpuname_to_cpu.keys())
        for hostname, cpu in self.hostname_to_cpu.items():
            mongo_index = cpunames.index(cpu.cpu_name) % len(self.mongos)
            influxdb_index = cpunames.index(cpu.cpu_name) % len(self.influxdbs)
            self.hostname_to_mongo[hostname] = self._get_address(self._roles['mongos'][mongo_index])
            self.hostname_to_influxdb[hostname] = self._get_address(self._roles['influxdbs'][influxdb_index])

        with play_on(pattern_hosts='sensors', roles=self._roles,
                     extra_vars={'ansible_hostname_to_mongo': self.hostname_to_mongo,
                                 'ansible_hostname_to_cpu': self.hostname_to_cpu}) as p:
            # (TODO) check without volumes, it potentially uses volumes to read about
            # events and containers... maybe it is mandatory then.
            volumes = ['/sys:/sys',
                       '/var/lib/docker/containers:/var/lib/docker/containers:ro',
                       '/tmp/powerapi-sensor-reporting:/reporting']
            command = ['-n sensor-{{inventory_hostname_short}}',
                       '-r mongodb -U mongodb://{{ansible_hostname_to_mongo[inventory_hostname]}}:27017',
                       f'-D {SENSORS_OUTPUT_DB_NAME}', '-C col_{{ansible_hostname_to_cpu[inventory_hostname].cpu_shortname}}',
                       '-s rapl -o']  ## RAPL: Running Average Power Limit (need privileged)
            ## (TODO) double check if these options are available at hardware/OS level
            if self.monitor['cores']: command.append('-e RAPL_ENERGY_PKG')  # power consumption of all cores + LLc cache
            if self.monitor['dram']: command.append('-e RAPL_ENERGY_DRAM')  # power consumption of DRAM
            if self.monitor['cores']: command.append('-e RAPL_ENERGY_CORES')  # power consumption of all cores on socket
            # FIX: 'gpu' may be absent from the monitor dict (callers in
            # this project pass only 'dram'/'cores'); a direct lookup
            # raised KeyError.
            if self.monitor.get('gpu', False): command.append('-e RAPL_ENERGY_GPU')  # power consumption of GPU
            command.extend(['-s msr -e TSC -e APERF -e MPERF',
                            '-c core',  ## CORE
                            # (TODO) does not seem to work properly this part
                            # (TODO) check possible event names depending on cpu architecture
                            #'-e "CPU_CLK_THREAD_UNHALTED:REF_P"', ## nehalem & westmere
                            #'-e "CPU_CLK_THREAD_UNHALTED:THREAD_P"', ## nehalem & westmere
                            #'-e "CPU_CLK_THREAD_UNHALTED.REF_XCLK"', # sandy -> broadwell archi, not scaled!
                            #'-e "CPU_CLK_THREAD_UNHALTED.REF_XCLK"', # skylake and newer, must be scale by x4 base ratio.
                            '-e CPU_CLK_UNHALTED',
                            '-e LLC_MISSES -e INSTRUCTIONS_RETIRED'])

            p.docker_container(
                display_name='Installing PowerAPI sensors…',
                name='powerapi-sensor',
                image=f'powerapi/hwpc-sensor:{HWPCSENSOR_VERSION}',
                detach=True, state='started', recreate=True, network_mode='host',
                privileged=True,  # required by the RAPL probes
                volumes=volumes,
                command=command,
            )

        ## #3 deploy InfluxDB, it will be the output of SmartWatts and
        ## the input of the optional Grafana.
        with play_on(pattern_hosts='influxdbs', roles=self._roles) as p:
            p.docker_container(
                display_name='Installing InfluxDB…',
                name='influxdb', image=f'influxdb:{INFLUXDB_VERSION}',
                detach=True, state='started', recreate=True,
                exposed_ports='8086',
                published_ports=f'{INFLUXDB_PORT}:8086',
            )
            p.wait_for(
                display_name='Waiting for InfluxDB to be ready…',
                host='localhost', port='8086', state='started',
                delay=2, timeout=120,
            )

        ## #4 deploy SmartWatts (there may be multiple SmartWatts per machine)
        ## (TODO) start multiple formulas in the same formula container?
        ## (TODO) ansiblify instead of sequentially push commands
        ## FIX: the original advanced its counter with `++i`, a no-op in
        ## Python, so every formula landed on formulas[0] and wrote to
        ## influxdbs[0]; enumerate() restores the intended round-robin.
        for i, cpu in enumerate(self.cpuname_to_cpu.values()):
            mongo_index = cpunames.index(cpu.cpu_name) % len(self.mongos)
            mongo_addr = self._get_address(self._roles['mongos'][mongo_index])
            influxdbs_addr = self._get_address(self.influxdbs[i % len(self.influxdbs)])
            smartwatts_name = self._get_smartwatts_name(cpu)

            with play_on(pattern_hosts=self._get_address(
                             self.formulas[i % len(self.formulas)]),
                         roles=self._roles) as p:
                command = ['-s',
                           '--input mongodb --model HWPCReport',
                           f'--uri mongodb://{mongo_addr}:{MONGODB_PORT}',
                           f'-d {SENSORS_OUTPUT_DB_NAME} -c col_{cpu.cpu_shortname}',
                           # f"--output influxdb --name hwpc --model HPWCReport",
                           # f"--uri {influxdbs_addr} --port {INFLUXDB_PORT} --db hwpc_report",
                           f'--output influxdb --name power_{cpu.cpu_shortname} --model PowerReport',
                           f'--uri {influxdbs_addr} --port {INFLUXDB_PORT} --db power_{cpu.cpu_shortname}',
                           # vvv Formula report does not have to_influxdb (yet?)
                           #f"--output influxdb --name formula --model FormulaReport",
                           #f"--uri {influxdbs_addr} --port {INFLUXDB_PORT} --db formula_report",
                           '--formula smartwatts', f'--cpu-ratio-base {cpu.cpu_nom}',
                           f'--cpu-ratio-min {cpu.cpu_min}', f'--cpu-ratio-max {cpu.cpu_max}',
                           f'--cpu-error-threshold {SMARTWATTS_CPU_ERROR_THRESHOLD}',
                           f'--dram-error-threshold {SMARTWATTS_DRAM_ERROR_THRESHOLD}']
                if not self.monitor['cores']: command.append('--disable-cpu-formula')
                if not self.monitor['dram']: command.append('--disable-dram-formula')
                p.docker_container(
                    display_name='Installing smartwatts formula…',
                    name=f'{smartwatts_name}',
                    image=f'powerapi/smartwatts-formula:{SMARTWATTS_VERSION}',
                    detach=True, network_mode='host', recreate=True,
                    command=command,
                )

        ## #5 Deploy the optional grafana server
        if self.grafana is None:
            return

        ## #A prepare dashboard: one panel target per cpu type
        with open('grafana_dashboard.json', 'r') as f:
            dashboard_json = json.load(f)

        panel_targets = []
        for cpu_name, cpu in self.cpuname_to_cpu.items():
            panel_targets.append({
                'datasource': f'power-{cpu_name}',
                'groupBy': [{'params': ['$__interval'], 'type': 'time'},
                            {'params': ['target'], 'type': 'tag'}],
                'measurement': 'power_consumption',
                'orderByTime': 'ASC',
                'policy': 'default',
                'refId': f'{cpu.cpu_shortname}',
                'resultFormat': 'time_series',
                'select': [[{'params': ['power'], 'type': 'field'},
                            {'params': [], 'type': 'mean'}]],
                'tags': [{'key': 'target', 'operator': '!=', 'value': 'global'},
                         {'key': 'target', 'operator': '!=', 'value': 'powerapi-sensor'},
                         {'key': 'target', 'operator': '!=', 'value': 'rapl'}]})
        dashboard_json['dashboard']['panels'][0]['targets'] = panel_targets

        with play_on(pattern_hosts='grafana', roles=self._roles) as p:
            p.docker_container(
                display_name='Installing Grafana…',
                name='grafana', image=f'grafana/grafana:{GRAFANA_VERSION}',
                detach=True, recreate=True, state='started',
                #exposed_ports='3000',
                network_mode='host', # not very clean "host"
                # published_ports=f'{GRAFANA_PORT}:3000',
            )
            p.wait_for(
                display_name='Waiting for Grafana to be ready…',
                host='localhost', port='3000', state='started',
                delay=2, timeout=120,
            )

            ## #B add datasources and fill the dashboard
            for i, (cpu_name, cpu) in enumerate(self.cpuname_to_cpu.items()):
                influxdbs_addr = self._get_address(self.influxdbs[i % len(self.influxdbs)])
                p.uri(
                    display_name='Add InfluxDB power reports in Grafana…',
                    url=f'http://localhost:{GRAFANA_PORT}/api/datasources',
                    user='******', password='******', force_basic_auth=True,
                    body_format='json', method='POST',
                    status_code=[200, 409], # 409 means: already added
                    body=json.dumps({'name': f'power-{cpu_name}',
                                     'type': 'influxdb',
                                     'url': f'http://{influxdbs_addr}:{INFLUXDB_PORT}',
                                     'access': 'proxy',
                                     'database': f'power_{cpu.cpu_shortname}',
                                     'isDefault': True}),
                )

            p.uri(
                display_name='Create a dashboard with all containers…',
                url='http://localhost:3000/api/dashboards/import',
                user='******', password='******', force_basic_auth=True,
                body_format='json', method='POST', status_code=[200],
                body=json.dumps(dashboard_json)
            )
Ejemplo n.º 15
0
                 cluster="paravance",
                 number=1,
                 flavour="large")\
    .finalize()
provider = Distem(conf)

roles, networks = provider.init()

print(roles)
print(networks)
# The first network carries the gateway used for the default routes below.
gateway = networks[0]['gateway']
print("Gateway : %s" % gateway)

discover_networks(roles, networks)

with play_on(roles=roles, gather_facts=False) as p:
    # We first need internet connectivity
    # Netmask for a subnet in g5k is a /14 netmask
    p.shell("ifconfig if0 $(hostname -I) netmask 255.252.0.0")
    p.shell("route add default gw %s dev if0" % gateway)

# Experimentation logic starts here
with play_on(roles=roles) as p:
    # flent requires python3, so we default python to python3
    p.apt_repository(
        repo="deb http://deb.debian.org/debian stretch main contrib non-free",
        state="present")
    p.apt(name=["flent", "netperf", "python3-setuptools"], state="present")

# Launch the netperf server in the background on the 'server' hosts.
with play_on(pattern_hosts="server", roles=roles) as p:
    p.shell("nohup netperf &")
Ejemplo n.º 16
0
                                   image="/grid5000/virt-images/debian9-x64-std-2019040916.qcow2",
                                   gateway="access.grid5000.fr",
                                   gateway_user="******")\
                    .add_machine(roles=["server"],
                                 cluster="grisou",
                                 number=1)\
                    .add_machine(roles=["client"],
                                 cluster="grisou",
                                 number=1)\
                    .finalize()

provider = VMonG5k(conf)

roles, networks = provider.init()
discover_networks(roles, networks)
# Note: pattern_hosts is passed positionally here ("all").
with play_on("all", roles=roles) as p:
    # flent requires python3, so we default python to python3
    p.shell(
        "update-alternatives --install /usr/bin/python python /usr/bin/python3 1"
    )
    p.apt_repository(
        repo="deb http://deb.debian.org/debian stretch main contrib non-free",
        state="present")
    p.apt(name=["flent", "netperf", "python3-setuptools"], state="present")

# Launch the netperf server in the background on the 'server' hosts.
with play_on("server", roles=roles) as p:
    p.shell("nohup netperf &")

with play_on("client", roles=roles) as p:
    p.shell("flent rrul -p all_scaled " + "-l 60 " +
            "-H {{ hostvars[groups['server'][0]].inventory_hostname }} " +
Ejemplo n.º 17
0
               })

    e.deploy()

    ## #B check if everything has deployed well
    local_sensor_logs = './_tmp_enos_/sensor-logs'
    remote_sensor_logs = 'tmp/sensor-logs'
    ## #1 remove outdated data

    # Local copies live in one directory per host address (fetch flat=False).
    localDirLogs = Path(f"{local_sensor_logs}/{roles['calibrate'][0].address}")

    if localDirLogs.exists() and localDirLogs.is_dir():
        shutil.rmtree(localDirLogs)

    ## #2 retrieve new data
    with play_on(pattern_hosts='calibrate', roles=roles) as p:
        p.shell(
            f'sudo docker container logs powerapi-sensor > /{remote_sensor_logs}'
        )
        p.fetch(
            display_name='Retrieving the logs of powerapi-sensor',
            src=f'/{remote_sensor_logs}',
            dest=f'{local_sensor_logs}',
            flat=False,
        )

    # Print the sensor logs fetched from the first calibrate host.
    pathFileLogs = localDirLogs / remote_sensor_logs

    with pathFileLogs.open('r') as f:
        logs = f.read()
        print(logs)
Ejemplo n.º 18
0
        "networks": [
            {
                "roles": ["local"],
                "start": "172.17.0.0",
                "end": "172.17.255.255",
                "cidr": "172.17.0.0/16",
                "gateway": "172.17.0.1",
                "dns": "172.17.0.1",
            }
        ],
    }
}

# Build a Static provider from the dict configuration and grab its hosts.
inventory = os.path.join(os.getcwd(), "hosts")
conf = Configuration.from_dictionnary(provider_conf)
provider = Static(conf)

roles, networks = provider.init()

with play_on(roles=roles) as p:
    p.shell("date > /tmp/date")

with open("/tmp/date") as f:
    print(f.readlines())


# async
with play_on(pattern_hosts="all", roles=roles) as p:
    for i in range(10):
        # FIX: `async` became a reserved keyword in Python 3.7, so it can
        # no longer be written as a literal keyword argument (SyntaxError);
        # pass it through **kwargs instead (fire-and-forget: poll=0).
        p.shell("sleep 10", poll=0, **{"async": 100})
Ejemplo n.º 19
0
                                   image="/grid5000/virt-images/debian9-x64-std-2019040916.qcow2",
                                   gateway="access.grid5000.fr",
                                   gateway_user="******")\
                    .add_machine(roles=["server"],
                                 cluster="grisou",
                                 number=1)\
                    .add_machine(roles=["client"],
                                 cluster="grisou",
                                 number=1)\
                    .finalize()

provider = VMonG5k(conf)

roles, networks = provider.init()
discover_networks(roles, networks)
with play_on(roles=roles) as p:
    # flent requires python3, so we default python to python3
    p.shell(
        "update-alternatives --install /usr/bin/python python /usr/bin/python3 1"
    )
    p.apt_repository(
        repo="deb http://deb.debian.org/debian stretch main contrib non-free",
        state="present")
    p.apt(name=["flent", "netperf", "python3-setuptools"], state="present")

# Launch the netperf server in the background on the 'server' hosts.
with play_on(pattern_hosts="server", roles=roles) as p:
    p.shell("nohup netperf &")

with play_on(pattern_hosts="client", roles=roles) as p:
    p.shell("flent rrul -p all_scaled " + "-l 60 " +
            "-H {{ hostvars[groups['server'][0]].inventory_hostname }} " +
Ejemplo n.º 20
0
        "machines": [{
            "roles": ["control"],
            "address": "localhost",
            "alias": "test_machine",
            "extra": {
                "ansible_connection": "local"
            }
        }],
        "networks": [{
            "roles": ["local"],
            "start": "172.17.0.0",
            "end": "172.17.255.255",
            "cidr": "172.17.0.0/16",
            "gateway": "172.17.0.1",
            "dns": "172.17.0.1",
        }]
    }
}

# Build a Static provider from the dict configuration above.
inventory = os.path.join(os.getcwd(), "hosts")
conf = Configuration.from_dictionnary(provider_conf)
provider = Static(conf)

roles, networks = provider.init()

# Record the date on every host, then read the local copy (the machine
# above is declared with ansible_connection=local, so /tmp/date is local).
with play_on("all", roles=roles) as p:
    p.shell("date > /tmp/date")

with open("/tmp/date") as f:
    print(f.readlines())
Ejemplo n.º 21
0
def monitor(rs: Roles, nets: List[Network]):
    '''Fig4. Reusable function for monitoring.

    Collect metrics on `monitored` hosts. Store and see metrics on
    `aggregator` hosts. Use the `monitor` network to send metrics.

    Args:
        rs: roles as returned by a provider ``init()``; must contain the
            groups ``monitored`` and ``aggregator``.
        nets: networks as returned by a provider ``init()``; used so
            telegraf.conf.j2 can pick the interface to report on.

    Side effects: installs Docker everywhere, a Telegraf container on
    `monitored` hosts, and InfluxDB + Grafana containers on `aggregator`
    hosts, then logs the Grafana URLs.
    '''
    # Discover networks to use net info in telegraf.conf.j2
    discover_networks(rs, nets)

    # Install Docker (idempotent: the shell is skipped if docker exists)
    with play_on(pattern_hosts="all", roles=rs) as ansible:
        ansible.shell(
            "which docker || (curl -sSL https://get.docker.com/ | sh)",
            display_name="Install docker")
        ansible.apt(
            display_name="Install python-docker (for ansible docker_container)",
            name="python-docker", update_cache=True)

    # Install Telegraf on monitored machines.  gather_facts is needed
    # because the telegraf.conf.j2 template reads host facts.
    with play_on(pattern_hosts="monitored", roles=rs, gather_facts="all") as ansible:
        ansible.template(
            display_name="Generating Telegraf conf",
            src="misc/telegraf.conf.j2",
            dest="/root/telegraf.conf")
        ansible.docker_container(
            display_name="Installing Telegraf",
            name="telegraf", image="telegraf:1.12-alpine",
            detach=True, network_mode="host", state="started",
            volumes=['/root/telegraf.conf:/etc/telegraf/telegraf.conf'])

    # Install InfluxDB and Grafana on `aggregator` machines
    with play_on(pattern_hosts="aggregator", roles=rs) as ansible:
        ansible.docker_container(
            display_name="Install InfluxDB",
            name="influxdb", image="influxdb:1.7-alpine",
            detach=True, state="started", network_mode="host",
            exposed_ports="8086:8086")
        ansible.wait_for(
            display_name="Waiting for InfluxDB to be ready",
            host="localhost", port="8086", state="started",
            delay=2, timeout=120)

        ansible.docker_container(
            display_name="Install Grafana",
            name="grafana", image="grafana/grafana:5.4.3",
            detach=True, state="started", network_mode="host",
            exposed_ports="3000:3000")
        ansible.wait_for(
            display_name="Waiting for Grafana to be ready",
            host="localhost", port="3000", state="started",
            delay=2, timeout=120)
        # Fix: credentials had been redacted to "******", which breaks
        # basic auth.  Grafana ships with admin/admin by default, which is
        # also what the final log message below tells the user to type in.
        ansible.uri(
            display_name="Add InfluxDB in Grafana",
            url="http://localhost:3000/api/datasources",
            user="admin", password="admin", force_basic_auth=True,
            body_format="json", method="POST",
            status_code=[200, 409],  # 409: datasource already exists
            body=json.dumps({
                "name": "telegraf", "type": "influxdb",
                "url": "http://localhost:8086",
                "access": "proxy", "database": "telegraf",
                "isDefault": True}))
        ansible.uri(
            display_name="Import dashboard in Grafana",
            url="http://localhost:3000/api/dashboards/import",
            user="admin", password="admin", force_basic_auth=True,
            body_format="json", method="POST",
            status_code=[200],  # import overwrites, so only 200 is expected
            src="misc/grafana-dashboard.json")

    # Display UI URLs to view metrics
    ui_urls = [f'http://{h.extra["monitor_ip"]}:3000' for h in rs['aggregator']]
    LOG.info(f'View UI on {ui_urls}')
    LOG.info('Connect with `admin` as login and password, '
             'then skip the change password, '
             'and finally select `Host Dashboard`.')