def wait_for_sink():
    sink_joined = is_state('flume-sink.joined')
    sink_ready = is_state('flume-sink.ready')
    if not sink_joined:
        hookenv.status_set('blocked', 'Waiting for connection to Flume Sink')
    elif sink_joined and not sink_ready:
        hookenv.status_set('blocked', 'Waiting for Flume Sink')
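A status helper like this only takes effect when something calls it, typically a reactive handler that fires as relation states change. A minimal sketch of such wiring, assuming the standard charms.reactive decorators (the trigger below is illustrative, not part of the original charm):

from charms.reactive import when_not
from charmhelpers.core import hookenv

@when_not('flume-sink.ready')
def update_sink_status():
    # Re-evaluate the flume-sink states and refresh the unit's status line.
    wait_for_sink()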
Example #2
def main():
    # Modify the behavior of the PostgreSQL package installation
    # before any packages are installed. We do this here, rather than
    # in handlers, so that extra_packages declared by the operator
    # don't drag in the PostgreSQL packages as dependencies before
    # the environment tweaks have been made.
    if (not reactive.is_state('apt.installed.postgresql-common') and
            not reactive.is_state('postgresql.cluster.inhibited')):
        generate_locale()
        inhibit_default_cluster_creation()
        install_postgresql_packages()
        install_extra_packages()  # Deprecated extra-packages option

    # Don't trust this state from the last hook. Daemons may have
    # crashed and servers rebooted since then.
    if reactive.is_state('postgresql.cluster.created'):
        try:
            reactive.toggle_state('postgresql.cluster.is_running',
                                  postgresql.is_running())
        except subprocess.CalledProcessError as x:
            if not reactive.is_state('workloadstatus.blocked'):
                status_set('blocked',
                           'Local PostgreSQL cluster is corrupt: {}'
                           ''.format(x.stderr))

    # Reconfigure PostgreSQL. While we don't strictly speaking need
    # to do this every hook, we do need to do this almost every hook,
    # since even things like the number of peers or number of clients
    # can affect minimum viable configuration settings.
    reactive.remove_state('postgresql.cluster.configured')

    log_states()  # Debug noise.
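reactive.toggle_state(name, value) used above is shorthand for setting the state when the second argument is truthy and removing it otherwise. A rough equivalent in terms of the public primitives (a sketch, not the library's actual implementation):

from charms import reactive

def toggle_state(name, should_set):
    # Mirror a boolean into a reactive state: set when True, remove when False.
    if should_set:
        reactive.set_state(name)
    else:
        reactive.remove_state(name)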
Example #3
def install_spark_standalone(zks, peers):
    """
    Called in local/standalone mode after Juju has elected a leader.
    """
    hosts = {
        'spark-master': leadership.leader_get('master-fqdn'),
    }

    # If zks have changed and we are not handling a departed spark peer,
    # give the ensemble time to settle. Otherwise we might try to start
    # spark master with data from the wrong zk leader. Doing so will cause
    # spark-master to shutdown:
    #  https://issues.apache.org/jira/browse/SPARK-15544
    if (zks and data_changed('zks', zks) and not is_state('sparkpeers.departed')):
        hookenv.status_set('maintenance',
                           'waiting for zookeeper ensemble to settle')
        hookenv.log("Waiting 2m to ensure zk ensemble has settled: {}".format(zks))
        time.sleep(120)

    # Let spark know if we have cuda libs installed.
    # NB: spark packages prereq hadoop (boo), so even in standalone mode, we'll
    # have hadoop libs installed. May as well include them in our lib path.
    extra_libs = ["/usr/lib/hadoop/lib/native"]
    if is_state('cuda.installed'):
        extra_libs.append("/usr/local/cuda/lib64")

    spark = Spark()
    spark.configure(hosts, zk_units=zks, peers=peers, extra_libs=extra_libs)
    set_deployment_mode_state('spark.standalone.installed')
Example #4
 def test_toggle_state(self):
     reactive.toggle_state('foo', True)
     reactive.toggle_state('foo', True)
     reactive.toggle_state('bar', False)
     reactive.toggle_state('bar', False)
     assert reactive.is_state('foo')
     assert not reactive.is_state('bar')
Example #5
def reinstall_spark():
    spark_master_host = leadership.leader_get('master-fqdn')
    peers = []
    zks = []
    if is_state('zookeeper.ready'):
        # If ZK is available, we are in HA mode. We do not want reconfigurations
        # when a leader fails; HA takes care of this.
        spark_master_host = ''
        zk = RelationBase.from_state('zookeeper.ready')
        zks = zk.zookeepers()
        # We need to reconfigure Spark when in HA and the peers change; ignore otherwise.
        peers = get_spark_peers()

    deployment_matrix = {
        'spark_master': spark_master_host,
        'yarn_ready': is_state('hadoop.yarn.ready'),
        'hdfs_ready': is_state('hadoop.hdfs.ready'),
        'zookeepers': zks,
        'peers': peers,
    }

    if not data_changed('deployment_matrix', deployment_matrix):
        return

    hookenv.status_set('maintenance', 'configuring spark')
    hadoop = (RelationBase.from_state('hadoop.yarn.ready') or
              RelationBase.from_state('hadoop.hdfs.ready'))
    if install_spark(hadoop, zks):
        if is_state('hadoop.yarn.ready'):
            set_deployment_mode_state('spark.yarn.installed')
        else:
            set_deployment_mode_state('spark.standalone.installed')

        report_status()
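The early return hinges on data_changed(), which hashes the supplied value, compares it with the hash stored in the unit's key/value store, and returns True only when they differ. A simplified sketch of that behaviour (the real helper lives in charms.reactive.helpers and handles more edge cases):

import hashlib
import json

from charmhelpers.core import unitdata

def data_changed(data_id, data):
    # Hash the serialized value and compare against the previously stored hash.
    key = 'reactive.data_changed.{}'.format(data_id)
    new_hash = hashlib.md5(
        json.dumps(data, sort_keys=True).encode('utf8')).hexdigest()
    old_hash = unitdata.kv().get(key)
    unitdata.kv().set(key, new_hash)
    return old_hash != new_hash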
Example #6
def configure_kubelet(dns):
    layer_options = layer.options('tls-client')
    ca_cert_path = layer_options.get('ca_certificate_path')
    server_cert_path = layer_options.get('server_certificate_path')
    server_key_path = layer_options.get('server_key_path')

    kubelet_opts = {}
    kubelet_opts['require-kubeconfig'] = 'true'
    kubelet_opts['kubeconfig'] = kubeconfig_path
    kubelet_opts['network-plugin'] = 'cni'
    kubelet_opts['v'] = '0'
    kubelet_opts['address'] = '0.0.0.0'
    kubelet_opts['port'] = '10250'
    kubelet_opts['cluster-dns'] = dns['sdn-ip']
    kubelet_opts['cluster-domain'] = dns['domain']
    kubelet_opts['anonymous-auth'] = 'false'
    kubelet_opts['client-ca-file'] = ca_cert_path
    kubelet_opts['tls-cert-file'] = server_cert_path
    kubelet_opts['tls-private-key-file'] = server_key_path
    kubelet_opts['logtostderr'] = 'true'
    kubelet_opts['fail-swap-on'] = 'false'

    privileged = is_state('kubernetes-worker.privileged')
    kubelet_opts['allow-privileged'] = 'true' if privileged else 'false'

    if is_state('kubernetes-worker.gpu.enabled'):
        if get_version('kubelet') < (1, 6):
            hookenv.log('Adding --experimental-nvidia-gpus=1 to kubelet')
            kubelet_opts['experimental-nvidia-gpus'] = '1'
        else:
            hookenv.log('Adding --feature-gates=Accelerators=true to kubelet')
            kubelet_opts['feature-gates'] = 'Accelerators=true'

    configure_kubernetes_service('kubelet', kubelet_opts, 'kubelet-extra-args')
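get_version() here is assumed to return a version tuple, which is what makes the comparison against (1, 6) order releases correctly. A hypothetical helper in that style (parse_version is illustrative, not the charm's actual function):

def parse_version(version_string):
    # '1.5.7' -> (1, 5, 7); tuple comparison then orders releases correctly.
    return tuple(int(part) for part in version_string.split('.'))

assert parse_version('1.5.7') < parse_version('1.6.0')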
Example #7
def reconfigure_spark():
    config = hookenv.config()
    maintenance = config['maintenance_mode']
    if maintenance:
        remove_state('not.upgrading')
        spark = Spark(get_dist_config())
        report_status(spark)
        spark.stop()
        current_version = spark.get_current_version()
        if config['upgrade_immediately'] and config['spark_version'] != current_version:
            upgrade_spark()
        return
    else:
        set_state('not.upgrading')

    mode = hookenv.config()['spark_execution_mode']
    hookenv.status_set('maintenance', 'Configuring Apache Spark')
    spark = Spark(get_dist_config())
    spark.stop()
    if is_state('hadoop.ready') and mode.startswith('yarn') and (not is_state('yarn.configured')):
        # was in a mode other than yarn, going to yarn
        hookenv.status_set('maintenance', 'Setting up Apache Spark for YARN')
        spark.configure_yarn_mode()
        set_state('yarn.configured')

    if is_state('hadoop.ready') and (not mode.startswith('yarn')) and is_state('yarn.configured'):
        # was in a yarn mode and going to another mode
        hookenv.status_set('maintenance', 'Disconnecting Apache Spark from YARN')
        spark.disable_yarn_mode()
        remove_state('yarn.configured')

    spark.configure()
    spark.start()
    report_status(spark)
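The two guards above form a small state machine: 'yarn.configured' records which mode the previous run left Spark in, so each hook performs only the transition that is actually needed. The same logic, condensed into one illustrative helper:

from charms.reactive import is_state, remove_state, set_state

def sync_yarn_state(spark, mode):
    # Reconcile the desired execution mode with the recorded one.
    want_yarn = is_state('hadoop.ready') and mode.startswith('yarn')
    have_yarn = is_state('yarn.configured')
    if want_yarn and not have_yarn:
        spark.configure_yarn_mode()
        set_state('yarn.configured')
    elif is_state('hadoop.ready') and not want_yarn and have_yarn:
        spark.disable_yarn_mode()
        remove_state('yarn.configured')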
Example #8
def install_spark_yarn():
    """
    Called in 'yarn-*' mode after Juju has elected a leader. The
    'hadoop.yarn.ready' state must be set.
    """
    hosts = {
        'spark-master': leadership.leader_get('master-fqdn'),
    }
    hadoop = (RelationBase.from_state('hadoop.yarn.ready') or
              RelationBase.from_state('hadoop.hdfs.ready'))
    rms = hadoop.resourcemanagers()
    hosts['resourcemanager'] = rms[0]

    # Probably don't need to check this since yarn.ready implies hdfs.ready
    # for us, but it doesn't hurt.
    if is_state('hadoop.hdfs.ready'):
        nns = hadoop.namenodes()
        hosts['namenode'] = nns[0]

    # Always include native hadoop libs in yarn mode; add cuda libs if present.
    extra_libs = ["/usr/lib/hadoop/lib/native"]
    if is_state('cuda.installed'):
        extra_libs.append("/usr/local/cuda/lib64")

    spark = Spark()
    spark.configure(hosts, zk_units=None, peers=None, extra_libs=extra_libs)
    set_deployment_mode_state('spark.yarn.installed')
Example #9
def restart_for_cloud():
    if is_state('endpoint.gcp.ready'):
        _write_gcp_snap_config('kubelet')
    elif is_state('endpoint.openstack.ready'):
        _write_openstack_snap_config('kubelet')
    set_state('kubernetes-worker.restarted-for-cloud')
    set_state('kubernetes-worker.restart-needed')
Example #10
def request_integration():
    hookenv.status_set('maintenance', 'requesting cloud integration')
    kube_control = endpoint_from_flag('kube-control.cluster_tag.available')
    cluster_tag = kube_control.get_cluster_tag()
    if is_state('endpoint.aws.joined'):
        cloud = endpoint_from_flag('endpoint.aws.joined')
        cloud.tag_instance({
            'kubernetes.io/cluster/{}'.format(cluster_tag): 'owned',
        })
        cloud.tag_instance_security_group({
            'kubernetes.io/cluster/{}'.format(cluster_tag): 'owned',
        })
        cloud.tag_instance_subnet({
            'kubernetes.io/cluster/{}'.format(cluster_tag): 'owned',
        })
        cloud.enable_object_storage_management(['kubernetes-*'])
    elif is_state('endpoint.gcp.joined'):
        cloud = endpoint_from_flag('endpoint.gcp.joined')
        cloud.label_instance({
            'k8s-io-cluster-name': cluster_tag,
        })
        cloud.enable_object_storage_management()
    cloud.enable_instance_inspection()
    cloud.enable_dns_management()
    set_state('kubernetes-worker.cloud-request-sent')
    hookenv.status_set('waiting', 'waiting for cloud integration')
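endpoint_from_flag() returns the interface instance behind a set flag, which is how the handler obtains the right cloud object. Note that the trailing enable_instance_inspection() and enable_dns_management() calls assume one of the two cloud flags is set; a dispatch helper along these lines (illustrative only) makes that precondition explicit:

from charms.reactive import endpoint_from_flag, is_state

def joined_cloud():
    # Return (name, endpoint) for the first joined cloud, or (None, None).
    for name in ('aws', 'gcp'):
        flag = 'endpoint.{}.joined'.format(name)
        if is_state(flag):
            return name, endpoint_from_flag(flag)
    return None, None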
Example #11
def is_ready():
    nginx_ready = is_state('charm-svg.nginx.configured')
    uwsgi_ready = is_state('charm-svg.uwsgi.configured')
    uwsgi_running = uwsgi.running()

    if nginx_ready and uwsgi_ready and uwsgi_running:
        hookenv.status_set('active', 'running on port 80')
Example #12
def set_deployment_mode_state(state):
    if is_state('spark.yarn.installed'):
        remove_state('spark.yarn.installed')
    if is_state('spark.standalone.installed'):
        remove_state('spark.standalone.installed')
    remove_state('spark.ready.to.install')
    set_state('spark.started')
    set_state(state)
Example #13
def prereq_status():
    hdfs_rel = is_state('namenode.joined')
    hdfs_ready = is_state('namenode.ready')

    if not hdfs_rel:
        status_set('blocked', 'missing required namenode relation')
    elif hdfs_rel and not hdfs_ready:
        status_set('waiting', 'waiting for hdfs to become ready')
Example #14
def set_deployment_mode_state(state):
    if is_state('spark.yarn.installed'):
        remove_state('spark.yarn.installed')
    if is_state('spark.standalone.installed'):
        remove_state('spark.standalone.installed')
    set_state(state)
    # set app version string for juju status output
    spark_version = get_package_version('spark-core') or 'unknown'
    hookenv.application_version_set(spark_version)
Example #15
def update():
    if is_state('statsd.started'):
        host.service_stop('statsd')
    apt_update()
    apt_upgrade(['nodejs', 'npm', 'git'])
    charm_dir = hookenv.charm_dir()
    check_call(['npm', 'update', os.path.join(charm_dir, 'files/statsd-influxdb-backend')])
    if is_state('statsd.started'):
        host.service_start('statsd')
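Stopping statsd before the upgrade and starting it again afterwards, but only while the 'statsd.started' state is set, avoids restarting a service the charm has not yet started. The guard generalizes to a small context manager (a sketch, not part of the original charm):

from contextlib import contextmanager

from charmhelpers.core import host

@contextmanager
def service_paused(name, should_pause):
    # Stop the service for the duration of the block, then start it again.
    if should_pause:
        host.service_stop(name)
    try:
        yield
    finally:
        if should_pause:
            host.service_start(name)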
Example #16
def report_status():
    nn_joined = is_state('namenode.joined')
    rm_joined = is_state('resourcemanager.joined')
    if not nn_joined and not rm_joined:
        hookenv.status_set('blocked', 'waiting for connections to resource manager and namenode')
    elif not nn_joined:
        hookenv.status_set('blocked', 'waiting for connection to namenode')
    elif not rm_joined:
        hookenv.status_set('blocked', 'waiting for connection to resource manager')
Example #17
def report_status():
    """Set juju status based on the deployment topology."""
    giraph_joined = is_state('giraph.joined')
    giraph_installed = is_state('giraph.installed')
    if not giraph_joined:
        hookenv.status_set('blocked',
                           'waiting for relation to a giraph host')
    elif giraph_installed:
        hookenv.status_set('active',
                           'ready')
Example #18
def get_node_name():
    kubelet_extra_args = parse_extra_args('kubelet-extra-args')
    cloud_provider = kubelet_extra_args.get('cloud-provider', '')
    if is_state('endpoint.aws.ready'):
        cloud_provider = 'aws'
    elif is_state('endpoint.gcp.ready'):
        cloud_provider = 'gcp'
    if cloud_provider == 'aws':
        return getfqdn()
    else:
        return gethostname()
Example #19
def prereq_status():
    hdfs_rel = is_state('namenode.joined')
    yarn_rel = is_state('resourcemanager.joined')
    yarn_ready = is_state('resourcemanager.ready')

    if not hdfs_rel:
        status_set('blocked', 'missing required namenode relation')
    elif not yarn_rel:
        status_set('blocked', 'missing required resourcemanager relation')
    elif yarn_rel and not yarn_ready:
        status_set('waiting', 'waiting for yarn to become ready')
Example #20
def report_status():
    mode = hookenv.config()['spark_execution_mode']
    if (not is_state('spark.yarn.installed')) and mode.startswith('yarn'):
        hookenv.status_set('blocked',
                           'Yarn execution mode not available')
        return

    if mode == 'standalone' and is_state('leadership.is_leader'):
        mode = mode + " - master"

    hookenv.status_set('active', 'Ready ({})'.format(mode))
Example #21
def install_hive(hadoop):
    '''
    Anytime our dependencies are available, check to see if we have a valid
    reason to (re)install. These include:
    - initial install
    - HBase has joined/departed
    '''
    # Hive cannot handle '-' in the metastore db name, and
    # mysql uses the service name to name the db.
    if "-" in hookenv.service_name():
        hookenv.status_set('blocked', "application name may not contain '-'; "
                                      "redeploy with a different name")
        return

    # Get hbase connection dict if it's available
    if is_state('hbase.ready'):
        hbase = RelationBase.from_state('hbase.ready')
        hbserver = hbase.hbase_servers()[0]
    else:
        hbserver = None

    # Use this to determine if we need to reinstall
    deployment_matrix = {
        'hbase': hbserver,
    }

    # Handle nuances when installing versus re-installing
    if not is_state('hive.installed'):
        prefix = "installing"

        # On initial install, prime our kv with the current deployment matrix.
        # Subsequent calls will use this to determine if a reinstall is needed.
        data_changed('deployment_matrix', deployment_matrix)
    else:
        prefix = "configuring"

        # Return if our matrix has not changed
        if not data_changed('deployment_matrix', deployment_matrix):
            return

    hookenv.status_set('maintenance', '{} hive'.format(prefix))
    hookenv.log("{} hive with: {}".format(prefix, deployment_matrix))
    hive = Hive()
    hive.install(hbase=hbserver)
    hive.restart()
    hive.open_ports()
    set_state('hive.installed')
    report_status()

    # set app version string for juju status output
    hive_version = get_package_version('hive') or 'unknown'
    hookenv.application_version_set(hive_version)
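Calling data_changed() once on initial install primes the stored hash, so the first reconfiguration after install does not look like a change and trigger a needless reinstall. The priming idiom in isolation (a hedged sketch; matrix_changed is a hypothetical helper name):

from charms.reactive.helpers import data_changed

def matrix_changed(matrix, first_install):
    # On first install, just record the matrix; afterwards, detect changes.
    if first_install:
        data_changed('deployment_matrix', matrix)
        return True
    return data_changed('deployment_matrix', matrix)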
Example #22
def config_changed():
    config = hookenv.config()
    if not reactive.is_state('apache.available') or not config.changed('port'):
        return
    with open('apache.yaml') as fp:
        workload = yaml.safe_load(fp)
    for name, site in workload['sites'].items():
        configure_site(name, site)
    if reactive.is_state('apache.started'):
        hookenv.close_port(config.previous('port'))
        assert host.service_reload('apache2'), 'Failed to reload Apache'
        hookenv.open_port(config['port'])
    hookenv.status_set('maintenance', '')
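hookenv.config() returns charmhelpers' Config object, which remembers the values from the previous hook; config.changed('port') and config.previous('port') are what let the handler close the old port before opening the new one. A standalone usage sketch:

from charmhelpers.core import hookenv

config = hookenv.config()
if config.changed('port'):
    old_port, new_port = config.previous('port'), config['port']
    hookenv.log('port changed from {} to {}'.format(old_port, new_port))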
Example #23
def ganglia_changed():
    hadoop = get_hadoop_base()
    hdfs = HDFS(hadoop)
    yarn = YARN(hadoop)
    if is_state('namenode.started'):
        hdfs.restart_namenode()
    if is_state('datanode.started'):
        hdfs.restart_datanode()
    if is_state('journalnode.started'):
        hdfs.restart_journalnode()
    if is_state('resourcemanager.started'):
        yarn.restart_resourcemanager()
    if is_state('nodemanager.started'):
        yarn.restart_nodemanager()
Example #24
def report_status(spark):
    mode = hookenv.config()['spark_execution_mode']
    if (not is_state('yarn.configured')) and mode.startswith('yarn'):
        hookenv.status_set('blocked',
                           'Yarn execution mode not available')
        return

    if mode == 'standalone':
        if is_state('zookeeper.configured'):
            mode = mode + " HA"
        elif spark.is_master():
            mode = mode + " - master"

    hookenv.status_set('active', 'Ready ({})'.format(mode))
Example #25
    def test_missing_java(self, mock_status, mock_options):
        '''
        Test to verify that our missing_java function kicks us into a
        'waiting' state if 'java.joined' is set, or tells us that
        we're blocked if it is not.

        In the case of install_java being set, verify that we instead
        set the install_java state, and set no status.

        '''
        mock_status.side_effect = self.status_set
        mock_options.return_value = {'install_java': 'foo'}

        missing_java()
        self.assertTrue(is_state('install_java'))
        self.assertFalse(self.last_status[0])

        mock_options.return_value = {'install_java': ''}

        set_state('some.state')
        missing_java()
        self.assertEqual(self.last_status[0], 'blocked')

        set_state('java.joined', 'some.other.state')
        missing_java()
        self.assertEqual(self.last_status[0], 'waiting')

        remove_state('java.joined')
        missing_java()
        self.assertEqual(self.last_status[0], 'blocked')
Example #26
def configure_kubelet(dns, ingress_ip):
    layer_options = layer.options('tls-client')
    ca_cert_path = layer_options.get('ca_certificate_path')
    server_cert_path = layer_options.get('server_certificate_path')
    server_key_path = layer_options.get('server_key_path')

    kubelet_opts = {}
    kubelet_opts['require-kubeconfig'] = 'true'
    kubelet_opts['kubeconfig'] = kubeconfig_path
    kubelet_opts['network-plugin'] = 'cni'
    kubelet_opts['v'] = '0'
    kubelet_opts['address'] = '0.0.0.0'
    kubelet_opts['port'] = '10250'
    kubelet_opts['cluster-domain'] = dns['domain']
    kubelet_opts['anonymous-auth'] = 'false'
    kubelet_opts['client-ca-file'] = ca_cert_path
    kubelet_opts['tls-cert-file'] = server_cert_path
    kubelet_opts['tls-private-key-file'] = server_key_path
    kubelet_opts['logtostderr'] = 'true'
    kubelet_opts['fail-swap-on'] = 'false'
    kubelet_opts['node-ip'] = ingress_ip

    if dns['enable-kube-dns']:
        kubelet_opts['cluster-dns'] = dns['sdn-ip']

    # set --allow-privileged flag for kubelet
    kubelet_opts['allow-privileged'] = set_privileged()

    if is_state('kubernetes-worker.gpu.enabled'):
        hookenv.log('Adding '
                    '--feature-gates=DevicePlugins=true '
                    'to kubelet')
        kubelet_opts['feature-gates'] = 'DevicePlugins=true'

    configure_kubernetes_service('kubelet', kubelet_opts, 'kubelet-extra-args')
Example #27
def upgrade_charm():
    # Trigger removal of PPA docker installation if it was previously set.
    set_state('config.changed.install_from_upstream')
    hookenv.atexit(remove_state, 'config.changed.install_from_upstream')

    cleanup_pre_snap_services()
    check_resources_for_upgrade_needed()

    # Remove the RC for nginx ingress if it exists
    if hookenv.config().get('ingress'):
        kubectl_success('delete', 'rc', 'nginx-ingress-controller')

    # Remove gpu.enabled state so we can reconfigure gpu-related kubelet flags,
    # since they can differ between k8s versions
    if is_state('kubernetes-worker.gpu.enabled'):
        remove_state('kubernetes-worker.gpu.enabled')
        try:
            disable_gpu()
        except ApplyNodeLabelFailed:
            # Removing node label failed. Probably the master is unavailable.
            # Proceed with the upgrade in hope GPUs will still be there.
            hookenv.log('Failed to remove GPU labels. Proceed with upgrade.')

    remove_state('kubernetes-worker.cni-plugins.installed')
    remove_state('kubernetes-worker.config.created')
    remove_state('kubernetes-worker.ingress.available')
    remove_state('worker.auth.bootstrapped')
    set_state('kubernetes-worker.restart-needed')
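hookenv.atexit() registers a callable to run as the hook exits, so the 'config.changed.install_from_upstream' trigger raised at the top lives for exactly one hook. The pulse-a-state-for-one-hook idiom, as a sketch:

from charms.reactive import remove_state, set_state
from charmhelpers.core import hookenv

def pulse_state(name):
    # Raise a state now and schedule its removal for the end of the hook.
    set_state(name)
    hookenv.atexit(remove_state, name)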
Example #28
def config_changed():
    """
    Render the config template
    and restart the service.

    :returns: None
    """
    context = config()
    config_file = 'config.toml'
    config_directory = '/etc/containerd'

    # Parse the JSON input string into a dictionary.
    context['custom_registries'] = \
        json.loads(context['custom_registries'])

    if is_state('containerd.nvidia.available') \
            and context.get('runtime') == 'auto':
        context['runtime'] = 'nvidia-container-runtime'
    else:
        context['runtime'] = 'runc'

    if not os.path.isdir(config_directory):
        os.mkdir(config_directory)

    render(config_file, os.path.join(config_directory, config_file), context)

    host.service_restart('containerd')

    if _check_containerd():
        status_set('active', 'Container runtime available.')
        set_state('containerd.ready')

    else:
        status_set('blocked', 'Container runtime not available.')
Example #29
def configure_vault(context):
    log("Running configure_vault", level=DEBUG)
    context['disable_mlock'] = is_container() or config('disable-mlock')
    context['ssl_available'] = is_state('vault.ssl.available')

    if is_flag_set('etcd.tls.available'):
        etcd = endpoint_from_flag('etcd.available')
        log("Etcd detected, adding to context", level=DEBUG)
        context['etcd_conn'] = etcd.connection_string()
        context['etcd_tls_ca_file'] = '/var/snap/vault/common/etcd-ca.pem'
        context['etcd_tls_cert_file'] = '/var/snap/vault/common/etcd-cert.pem'
        context['etcd_tls_key_file'] = '/var/snap/vault/common/etcd.key'
        save_etcd_client_credentials(etcd,
                                     key=context['etcd_tls_key_file'],
                                     cert=context['etcd_tls_cert_file'],
                                     ca=context['etcd_tls_ca_file'])
        context['api_addr'] = vault.get_api_url()
        context['cluster_addr'] = vault.get_cluster_url()
        log("Etcd detected, setting api_addr to {}".format(
            context['api_addr']))
    else:
        log("Etcd not detected", level=DEBUG)
    log("Rendering vault.hcl.j2", level=DEBUG)
    render('vault.hcl.j2', VAULT_CONFIG, context, perms=0o600)
    log("Rendering vault systemd configuation", level=DEBUG)
    render('vault.service.j2', VAULT_SYSTEMD_CONFIG, {}, perms=0o644)
    service('enable', 'vault')
    log("Opening vault port", level=DEBUG)
    open_port(8200)
    set_flag('configured')
    if any_file_changed([VAULT_CONFIG, VAULT_SYSTEMD_CONFIG]):
        # force a restart if config has changed
        clear_flag('started')
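any_file_changed() keeps file content hashes in unit data, so clearing the 'started' flag (and thus forcing a restart) happens only when the rendered configuration actually changed on disk. The restart-on-change idiom reduces to this (illustrative):

from charms.reactive import clear_flag
from charms.reactive.helpers import any_file_changed

def restart_if_changed(paths, started_flag='started'):
    # Clearing the flag lets the start handler run again with the new config.
    if any_file_changed(paths):
        clear_flag(started_flag)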
Example #30
def purge_containerd():
    """
    Purge Containerd from the cluster.

    :return: None
    """
    status.maintenance('Removing containerd from principal')

    host.service_stop('containerd.service')
    apt_unhold(CONTAINERD_PACKAGE)
    apt_purge(CONTAINERD_PACKAGE, fatal=True)

    if is_state('containerd.nvidia.ready'):
        nvidia_packages = config('nvidia_apt_packages').split()
        apt_purge(nvidia_packages, fatal=True)

    sources = [
        '/etc/apt/sources.list.d/nvidia.list'
    ]

    for f in sources:
        if os.path.isfile(f):
            os.remove(f)

    apt_autoremove(purge=True, fatal=True)

    remove_state('containerd.ready')
    remove_state('containerd.installed')
    remove_state('containerd.nvidia.ready')
    remove_state('containerd.nvidia.checked')
    remove_state('containerd.nvidia.available')
    remove_state('containerd.version-published')
Example #31
 def get_roles(self):
     roles = ['spark-worker', 'spark-client']
     zk_units = unitdata.kv().get('zookeeper.units', [])
     if is_state('leadership.is_leader') or zk_units:
         roles.append('spark-master')
         roles.append('spark-history-server')
     return roles
Example #32
def purge_containerd():
    """
    Purge Containerd from the cluster.

    :return: None
    """
    status_set('maintenance', 'Removing containerd from principal')

    host.service_stop('containerd.service')
    apt_purge('containerd', fatal=True)

    if is_state('containerd.nvidia.ready'):
        apt_purge(NVIDIA_PACKAGES, fatal=True)

    sources = [
        '/etc/apt/sources.list.d/cuda.list',
        '/etc/apt/sources.list.d/nvidia-container-runtime.list'
    ]

    for f in sources:
        if os.path.isfile(f):
            os.remove(f)

    apt_autoremove(purge=True, fatal=True)

    remove_state('containerd.ready')
    remove_state('containerd.installed')
    remove_state('containerd.nvidia.ready')
    remove_state('containerd.nvidia.checked')
    remove_state('containerd.nvidia.available')
Example #33
def send_client_all_info(client):
    """Send clients (plugin, RM, non-DNs) all dfs relation data.

    At this point, the namenode is ready to serve clients. Send all
    dfs relation data so that our 'namenode.ready' state becomes set.
    """
    bigtop = Bigtop()
    fqdn = get_fqdn()
    hdfs_port = get_layer_opts().port('namenode')
    webhdfs_port = get_layer_opts().port('nn_webapp_http')

    client.send_spec(bigtop.spec())
    client.send_namenodes([fqdn])
    client.send_ports(hdfs_port, webhdfs_port)
    # namenode.ready implies we have at least 1 datanode, which means hdfs
    # is ready for use. Inform clients of that with send_ready().
    if is_state('apache-bigtop-namenode.ready'):
        client.send_ready(True)
    else:
        client.send_ready(False)

    # hosts_map and clustername are required by the dfs interface to signify
    # NN's readiness. Send them, even though they are not utilized by bigtop.
    client.send_hosts_map(utils.get_kv_hosts())
    client.send_clustername(hookenv.service_name())
Example #34
def setup_non_leader_authentication():

    service_key = '/root/cdk/serviceaccount.key'
    basic_auth = '/root/cdk/basic_auth.csv'
    known_tokens = '/root/cdk/known_tokens.csv'

    keys = [service_key, basic_auth, known_tokens]
    # The source of truth for non-leaders is the leader.
    # Therefore we overwrite_local with whatever the leader has.
    if not get_keys_from_leader(keys, overwrite_local=True):
        # The keys were not retrieved; non-leaders have to retry.
        return

    if not any_file_changed(keys) and is_state('authentication.setup'):
        # No change detected and we have already set up authentication.
        return

    hookenv.status_set('maintenance', 'Rendering authentication templates.')
    api_opts = FlagManager('kube-apiserver')
    api_opts.add('basic-auth-file', basic_auth)
    api_opts.add('token-auth-file', known_tokens)
    api_opts.add('service-account-key-file', service_key)

    controller_opts = FlagManager('kube-controller-manager')
    controller_opts.add('service-account-private-key-file', service_key)

    remove_state('kubernetes-master.components.started')
    set_state('authentication.setup')
Example #35
def send_client_all_info(client):
    """Send clients (plugin, RM, non-DNs) all dfs relation data.

    At this point, the resourcemanager is ready to serve clients. Send all
    mapred relation data so that our 'resourcemanager.ready' state becomes set.
    """
    bigtop = Bigtop()
    rm_host = get_fqdn()
    rm_ipc = get_layer_opts().port('resourcemanager')
    jh_ipc = get_layer_opts().port('jobhistory')
    jh_http = get_layer_opts().port('jh_webapp_http')

    client.send_resourcemanagers([rm_host])
    client.send_spec(bigtop.spec())
    client.send_ports(rm_ipc, jh_http, jh_ipc)

    # resourcemanager.ready implies we have at least 1 nodemanager, which means
    # yarn is ready for use. Inform clients of that with send_ready().
    if is_state('apache-bigtop-resourcemanager.ready'):
        client.send_ready(True)
    else:
        client.send_ready(False)

    # hosts_map is required by the mapred interface to signify
    # RM's readiness. Send it, even though it is not utilized by bigtop.
    client.send_hosts_map(utils.get_kv_hosts())
Example #36
def client_present(client):
    if is_state('leadership.is_leader'):
        client.set_spark_started()
        spark = Spark()
        master_ip = utils.resolve_private_address(hookenv.unit_private_ip())
        master_url = spark.get_master_url(master_ip)
        client.send_master_info(master_url, master_ip)
Example #37
def configure_cdk_addons():
    ''' Configure CDK addons '''
    remove_state('cdk-addons.configured')
    load_gpu_plugin = hookenv.config('enable-nvidia-plugin').lower()
    gpuEnable = (get_version('kube-apiserver') >= (1, 9) and
                 load_gpu_plugin == "auto" and
                 is_state('kubernetes-master.gpu.enabled'))
    dbEnabled = str(hookenv.config('enable-dashboard-addons')).lower()
    dnsEnabled = str(hookenv.config('enable-kube-dns')).lower()
    metricsEnabled = str(hookenv.config('enable-metrics')).lower()
    args = [
        'arch=' + arch(),
        'dns-ip=' + get_deprecated_dns_ip(),
        'dns-domain=' + hookenv.config('dns_domain'),
        'enable-dashboard=' + dbEnabled,
        'enable-kube-dns=' + dnsEnabled,
        'enable-metrics=' + metricsEnabled,
        'enable-gpu=' + str(gpuEnable).lower()
    ]
    check_call(['snap', 'set', 'cdk-addons'] + args)
    if not addons_ready():
        hookenv.status_set('waiting', 'Waiting to retry addon deployment')
        remove_state('cdk-addons.configured')
        return

    set_state('cdk-addons.configured')
Example #38
def publish_config():
    endpoint = endpoint_from_flag("endpoint.docker.joined")
    endpoint.set_config(
        socket=Docker().socket,
        runtime="docker",
        nvidia_enabled=is_state("nvidia-docker.supported"),
    )
Example #39
def charm_status():
    """
    Set the charm's status after each hook is run.

    :return: None
    """
    if is_state('upgrade.series.in-progress'):
        status.blocked('Series upgrade in progress')
    elif is_state('containerd.nvidia.invalid-option'):
        status.blocked('{} is an invalid option for gpu_driver'.format(
            config().get('gpu_driver')))
    elif _check_containerd():
        status.active('Container runtime available')
        set_state('containerd.ready')
    else:
        status.blocked('Container runtime not available')
Example #40
def check_app_config():
    """
    Check the Ghost application config and possibly update and restart it.
    """
    cfg_changed = is_state('config.changed')
    db_changed = ghost.check_db_changed()
    if cfg_changed or db_changed:
        hookenv.status_set('maintenance', 'updating configuration')

        # Update application
        if config.changed('release') or config.changed('checksum'):
            ghost.update_ghost()

        # Update general config
        if cfg_changed:
            ghost.update_general_config()

        # Update database config
        if db_changed:
            ghost.update_db_config()

        ghost.restart_ghost()
        set_state('ghost.running')
        host.service_restart('nginx')
    hookenv.status_set('active', 'ready')
Example #42
def install_nodemanager(namenode, resourcemanager):
    """Install if we have FQDNs.

    We only need the master FQDNs to perform the nodemanager install, so poll
    for master host data from the appropriate relation. This allows us to
    install asap, even if '<master>.ready' is not set.
    """
    namenodes = namenode.namenodes()
    resourcemanagers = resourcemanager.resourcemanagers()
    masters = namenodes + resourcemanagers
    if namenodes and resourcemanagers and data_changed('nm.masters', masters):
        installed = is_state('apache-bigtop-nodemanager.installed')
        action = 'installing' if not installed else 'configuring'
        hookenv.status_set('maintenance', '%s nodemanager' % action)
        bigtop = Bigtop()
        bigtop.render_site_yaml(
            hosts={
                'namenode': namenodes[0],
                'resourcemanager': resourcemanagers[0],
            },
            roles=[
                'nodemanager',
                'mapred-app',
            ],
        )
        bigtop.queue_puppet()
        set_state('apache-bigtop-nodemanager.pending')
Example #45
def install():
    if is_state('app.installed'):
        return
    adduser('puma')
    fetch.apt_install(fetch.filter_installed_packages(['git', 'libpq-dev', 'nodejs']))

    install_site()
Example #48
def openvim_available(openvim, db):
    for service in openvim.services():
        for endpoint in service['hosts']:
            host = endpoint['hostname']
            port = endpoint['port']
            user = endpoint['user']

            openvim_uri = '{}:{}'.format(host, port)
            if kvdb.get('openvim_uri') == openvim_uri:
                return

            # TODO: encapsulate the logic in create-datacenter.sh into python
            try:
                cmd = './scripts/create-datacenter.sh {} {} {} {}'.format(
                    host, port, user, kvdb.get('openmano-tenant'))
                out, err = _run(cmd)
            except subprocess.CalledProcessError as e:
                # Ignore the error if the datacenter already exists.
                if e.returncode != 153:
                    raise

            kvdb.set('openvim_uri', openvim_uri)
            if not is_state('db.available'):
                status_set('waiting', 'Waiting for database')
            break
        break
Example #49
def get_node_name():
    kubelet_extra_args = parse_extra_args('kubelet-extra-args')
    cloud_provider = kubelet_extra_args.get('cloud-provider', '')
    if is_state('endpoint.aws.ready'):
        cloud_provider = 'aws'
    elif is_state('endpoint.gcp.ready'):
        cloud_provider = 'gce'
    elif is_state('endpoint.openstack.ready'):
        cloud_provider = 'openstack'
    elif is_state('endpoint.vsphere.ready'):
        cloud_provider = 'vsphere'
    elif is_state('endpoint.azure.ready'):
        cloud_provider = 'azure'
    if cloud_provider == 'aws':
        return getfqdn().lower()
    else:
        return gethostname().lower()
Example #50
def installing_hbase(zk, hdfs):
    zks = zk.zookeepers()
    if is_state('hbase.installed') and (not data_changed('zks', zks)):
        return

    msg = "configuring hbase" if is_state(
        'hbase.installed') else "installing hbase"
    hookenv.status_set('maintenance', msg)

    hbase = HBase()
    hosts = {}
    nns = hdfs.namenodes()
    hosts['namenode'] = nns[0]
    hbase.configure(hosts, zks)
    hbase.open_ports()
    set_state('hbase.installed')
    hookenv.status_set('active', 'ready')
Example #51
def install():
    opts = layer.options('snap')
    for snapname, snap_opts in opts.items():
        installed_state = 'snap.installed.{}'.format(snapname)
        if not reactive.is_state(installed_state):
            snap.install(snapname, **snap_opts)
    if data_changed('snap.install.opts', opts):
        snap.connect_all()
Example #52
def get_node_name():
    kubelet_extra_args = parse_extra_args("kubelet-extra-args")
    cloud_provider = kubelet_extra_args.get("cloud-provider", "")
    if is_state("endpoint.aws.ready"):
        cloud_provider = "aws"
    elif is_state("endpoint.gcp.ready"):
        cloud_provider = "gce"
    elif is_state("endpoint.openstack.ready"):
        cloud_provider = "openstack"
    elif is_state("endpoint.vsphere.ready"):
        cloud_provider = "vsphere"
    elif is_state("endpoint.azure.ready"):
        cloud_provider = "azure"
    if cloud_provider == "aws":
        return getfqdn().lower()
    else:
        return gethostname().lower()
Example #53
def report_status():
    hadoop_joined = is_state('hadoop.joined')
    hadoop_ready = is_state('hadoop.ready')
    hbase_joined = is_state('hbase.joined')
    hbase_ready = is_state('hbase.ready')
    database_joined = is_state('database.connected')
    database_ready = is_state('database.available')
    hive_installed = is_state('hive.installed')
    if not hadoop_joined:
        hookenv.status_set('blocked',
                           'waiting for relation to hadoop plugin')
    elif not hadoop_ready:
        hookenv.status_set('waiting',
                           'waiting for hadoop to become ready')
    elif database_joined and not database_ready:
        hookenv.status_set('waiting',
                           'waiting for database to become ready')
    elif hbase_joined and not hbase_ready:
        hookenv.status_set('waiting',
                           'waiting for hbase to become ready')
    elif hive_installed and not database_ready:
        hookenv.status_set('active',
                           'ready (local metastore)')
    elif hive_installed and database_ready:
        hookenv.status_set('active',
                           'ready (remote metastore)')
Example #54
def report_status(spark):
    if not is_state('not.upgrading'):
        hookenv.status_set('maintenance', 'Preparing for an upgrade')
        return

    mode = hookenv.config()['spark_execution_mode']
    if (not is_state('yarn.configured')) and mode.startswith('yarn'):
        hookenv.status_set('blocked',
                           'Yarn execution mode not available')
        return

    if mode == 'standalone':
        if is_state('zookeeper.configured'):
            mode = mode + " HA"
        elif spark.is_master():
            mode = mode + " - master"

    hookenv.status_set('active', 'Ready ({})'.format(mode))
Example #55
def report_status():
    hadoop_joined = is_state('hadoop.joined')
    hdfs_ready = is_state('hadoop.hdfs.ready')
    zk_joined = is_state('zookeeper.joined')
    zk_ready = is_state('zookeeper.ready')
    hbase_installed = is_state('hbase.installed')
    if not hadoop_joined:
        hookenv.status_set('blocked', 'waiting for relation to hadoop plugin')
    elif not hdfs_ready:
        hookenv.status_set('waiting', 'waiting for hdfs to become ready')
    elif not zk_joined:
        hookenv.status_set('blocked', 'waiting for relation to zookeeper')
    elif not zk_ready:
        hookenv.status_set('waiting', 'waiting for zookeeper to become ready')
    elif not hbase_installed:
        hookenv.status_set('waiting', 'waiting to install hbase')
    else:
        hookenv.status_set('active', 'ready')
Example #56
def update_recovery_conf(follow):
    assert follow != hookenv.local_unit()

    peer_rel = helpers.get_peer_relation()
    follow_relinfo = peer_rel.get(follow)
    assert follow_relinfo is not None, "Invalid upstream {}".format(follow)

    current_follow = get_following()

    if follow != current_follow:
        status_set("maintenance", "Following new unit {}".format(follow))
        set_following(follow)
        # Setting the state to defer publication until after restart.
        reactive.set_state("postgresql.replication.publish_following")

    else:
        # Even though the master is unchanged, we still regenerate
        # recovery.conf in case connection details such as IP addresses
        # have changed.
        hookenv.log("Continuing to follow {}".format(follow))

    pg12 = postgresql.has_version("12")
    if pg12:
        path = postgresql.hot_standby_conf_path()
        template = "hot_standby.conf.tmpl"
    else:
        path = postgresql.recovery_conf_path()
        template = "recovery.conf.tmpl"

    config = hookenv.config()

    data = dict(
        streaming_replication=config["streaming_replication"],
        host=follow_relinfo["host"],
        port=follow_relinfo["port"],
        user=replication_username(),
        password=leader_get("replication_password"),
    )

    if reactive.helpers.is_state("postgresql.wal_e.enabled"):
        data["restore_command"] = wal_e.wal_e_restore_command()

    templating.render(template,
                      path,
                      data,
                      owner="postgres",
                      group="postgres",
                      perms=0o600)

    if pg12:
        touch(postgresql.hot_standby_signal_path())

    # Use @when_file_changed for this when Issue #44 is resolved.
    if reactive.helpers.any_file_changed([path]):
        reactive.set_state("postgresql.cluster.needs_restart")
        if reactive.is_state("postgresql.replication.cloned"):
            reactive.set_state("postgresql.replication.check_following")
Example #57
def report_status():
    kafka_joined = is_state('kafka.joined')
    kafka_ready = is_state('kafka.ready')
    sink_joined = is_state('flume-sink.joined')
    sink_ready = is_state('flume-sink.ready')
    if not kafka_joined and not sink_joined:
        hookenv.status_set('blocked',
                           'Waiting for relation to Kafka and Flume sink')
    elif not kafka_joined:
        hookenv.status_set('blocked', 'Waiting for relation to Kafka')
    elif not sink_joined:
        hookenv.status_set('blocked', 'Waiting for relation to Flume sink')
    elif not kafka_ready and not sink_ready:
        hookenv.status_set('waiting', 'Waiting for Kafka and Flume sink')
    elif not kafka_ready:
        hookenv.status_set('waiting', 'Waiting for Kafka')
    elif not sink_ready:
        hookenv.status_set('waiting', 'Waiting for Flume sink')
Example #58
def is_privileged():
    """Return boolean indicating whether or not to set allow-privileged=true.

    """
    privileged = hookenv.config('allow-privileged').lower()
    if privileged == 'auto':
        return is_state('kubernetes-master.gpu.enabled')
    else:
        return privileged == 'true'
Example #59
    def apply_node_labels(self) -> None:
        """
        Parse the `labels` configuration option and apply the labels to the
        node.

        @raises LabelMaker.NodeLabelError: if the label cannot be added or removed
        """
        # Get the user's configured labels.
        config = hookenv.config()
        user_labels = {}
        for item in config.get("labels").split(" "):
            try:
                key, val = item.split("=")
            except ValueError:
                hookenv.log("Skipping malformed option: {}.".format(item))
            else:
                user_labels[key] = val
        # Collect the current label state.
        current_labels = db.get("current_labels") or {}

        try:
            # Remove any labels that the user has removed from the config.
            for key in list(current_labels.keys()):
                if key not in user_labels:
                    self.remove_label(key)
                    del current_labels[key]
                    db.set("current_labels", current_labels)

            # Add any new labels.
            for key, val in user_labels.items():
                self.set_label(key, val)
                current_labels[key] = val
                db.set("current_labels", current_labels)

            # Set the juju-application label.
            self.set_label("juju-application", hookenv.service_name())

            # Set the juju.io/cloud label.
            juju_io_cloud_labels = [
                ("aws", "ec2"),
                ("gcp", "gce"),
                ("openstack", "openstack"),
                ("vsphere", "vsphere"),
                ("azure", "azure"),
            ]
            for endpoint, label in juju_io_cloud_labels:
                if is_state("endpoint.{0}.ready".format(endpoint)):
                    self.set_label("juju.io/cloud", label)
                    break
            else:
                # none of the endpoints matched, remove the label
                self.remove_label("juju.io/cloud")

        except self.NodeLabelError as ex:
            hookenv.log(str(ex))
            raise
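The cloud-label loop above uses Python's for/else: the else branch runs only when the loop completes without hitting break, i.e. when no endpoint flag matched. A standalone illustration of that control flow (values are made up):

matched = None
for endpoint, label in [('aws', 'ec2'), ('gcp', 'gce')]:
    if endpoint == 'gcp':
        matched = label
        break
else:
    matched = 'no-cloud'  # runs only if the loop never hit 'break'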
Example #60
def install():
    if reactive.is_state('apache.available'):
        return
    with open('apache.yaml') as fp:
        workload = yaml.safe_load(fp)
    install_packages(workload)
    for name, site in workload['sites'].items():
        install_site(name, site)
    hookenv.status_set('maintenance', '')
    reactive.set_state('apache.available')