Example 1
def cluster(request):
    log.info(f"All logs located at {log_path}")
    log.info("Running OCS basic installation")
    cluster_path = config.ENV_DATA['cluster_path']
    deploy = config.RUN['cli_params']['deploy']
    teardown = config.RUN['cli_params']['teardown']
    # Add a finalizer to tear down the cluster after test execution finishes
    if teardown:
        request.addfinalizer(cluster_teardown)
        log.info("Will teardown cluster because --teardown was provided")
    # Test cluster access; if the cluster already exists, just skip the deployment.
    if is_cluster_running(cluster_path):
        log.info("The installation is skipped because the cluster is running")
        return
    elif teardown and not deploy:
        log.info("Attempting teardown of non-accessible cluster: %s", cluster_path)
        return
    elif not deploy and not teardown:
        msg = "The given cluster can not be connected to: {}. ".format(cluster_path)
        msg += "Provide a valid --cluster-path or use --deploy to deploy a new cluster"
        pytest.fail(msg)
    elif not system.is_path_empty(cluster_path) and deploy:
        msg = "The given cluster path is not empty: {}. ".format(cluster_path)
        msg += "Provide an empty --cluster-path and --deploy to deploy a new cluster"
        pytest.fail(msg)
    else:
        log.info("A testing cluster will be deployed and cluster information stored at: %s", cluster_path)

    # Generate install-config from template
    log.info("Generating install-config")
    run_cmd(f"mkdir -p {cluster_path}")
    pull_secret_path = os.path.join(
        constants.TOP_DIR,
        "data",
        "pull-secret"
    )

    # TODO: check for supported platform and raise the exception if not
    # supported. Currently we support just AWS.

    _templating = templating.Templating()
    install_config_str = _templating.render_template(
        "install-config.yaml.j2", config.ENV_DATA
    )
    # Log the install config *before* adding the pull secret, so we don't leak
    # sensitive data.
    log.info(f"Install config: \n{install_config_str}")
    # Parse the rendered YAML so that we can manipulate the object directly
    install_config_obj = yaml.safe_load(install_config_str)
    with open(pull_secret_path, "r") as f:
        # Parse, then unparse, the JSON file.
        # We do this for two reasons: to ensure it is well-formatted, and
        # also to ensure it ends up as a single line.
        install_config_obj['pullSecret'] = json.dumps(json.loads(f.read()))
    install_config_str = yaml.safe_dump(install_config_obj)
    install_config = os.path.join(cluster_path, "install-config.yaml")
    with open(install_config, "w") as f:
        f.write(install_config_str)

    # Download installer
    installer = get_openshift_installer(
        config.DEPLOYMENT['installer_version']
    )
    # Download client
    get_openshift_client()

    # Deploy cluster
    log.info("Deploying cluster")
    run_cmd(
        f"{installer} create cluster "
        f"--dir {cluster_path} "
        f"--log-level debug"
    )

    # Test cluster access
    if not OCP.set_kubeconfig(
        os.path.join(cluster_path, config.RUN.get('kubeconfig_location'))
    ):
        pytest.fail("Cluster is not available!")

    # TODO: Create cluster object, add to config.ENV_DATA for other tests to
    # utilize.
    # Determine worker pattern and create ebs volumes
    with open(os.path.join(cluster_path, "terraform.tfvars")) as f:
        tfvars = json.load(f)

    cluster_id = tfvars['cluster_id']
    worker_pattern = f'{cluster_id}-worker*'
    log.info(f'Worker pattern: {worker_pattern}')
    create_ebs_volumes(worker_pattern, region_name=config.ENV_DATA['region'])

    # render templates and create resources
    create_oc_resource('common.yaml', cluster_path, _templating, config.ENV_DATA)
    run_cmd(
        f'oc label namespace {config.ENV_DATA["cluster_namespace"]} '
        f'"openshift.io/cluster-monitoring=true"'
    )
    run_cmd(
        f"oc policy add-role-to-user view "
        f"system:serviceaccount:openshift-monitoring:prometheus-k8s "
        f"-n {config.ENV_DATA['cluster_namespace']}"
    )
    apply_oc_resource(
        'csi-nodeplugin-rbac_rbd.yaml',
        cluster_path,
        _templating,
        config.ENV_DATA,
        template_dir="ocs-deployment/csi/rbd/"
    )
    apply_oc_resource(
        'csi-provisioner-rbac_rbd.yaml',
        cluster_path,
        _templating,
        config.ENV_DATA,
        template_dir="ocs-deployment/csi/rbd/"
    )
    apply_oc_resource(
        'csi-nodeplugin-rbac_cephfs.yaml',
        cluster_path,
        _templating,
        config.ENV_DATA,
        template_dir="ocs-deployment/csi/cephfs/"
    )
    apply_oc_resource(
        'csi-provisioner-rbac_cephfs.yaml',
        cluster_path,
        _templating,
        config.ENV_DATA,
        template_dir="ocs-deployment/csi/cephfs/"
    )
    # Increased to 15 seconds as 10 is not enough
    # TODO: use a sampler function to check whether the resource exists
    wait_time = 15
    log.info(f"Waiting {wait_time} seconds...")
    time.sleep(wait_time)
    create_oc_resource(
        'operator-openshift-with-csi.yaml', cluster_path, _templating, config.ENV_DATA
    )
    log.info(f"Waiting {wait_time} seconds...")
    time.sleep(wait_time)
    run_cmd(
        f"oc wait --for condition=ready pod "
        f"-l app=rook-ceph-operator "
        f"-n {config.ENV_DATA['cluster_namespace']} "
        f"--timeout=120s"
    )
    run_cmd(
        f"oc wait --for condition=ready pod "
        f"-l app=rook-discover "
        f"-n {config.ENV_DATA['cluster_namespace']} "
        f"--timeout=120s"
    )
    create_oc_resource('cluster.yaml', cluster_path, _templating, config.ENV_DATA)

    POD = ocp.OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace'])
    CFS = ocp.OCP(
        kind=constants.CEPHFILESYSTEM, namespace=config.ENV_DATA['cluster_namespace']
    )

    # Check for the Running status of Ceph Pods
    run_cmd(
        f"oc wait --for condition=ready pod "
        f"-l app=rook-ceph-agent "
        f"-n {config.ENV_DATA['cluster_namespace']} "
        f"--timeout=120s"
    )
    assert POD.wait_for_resource(
        condition='Running', selector='app=rook-ceph-mon',
        resource_count=3, timeout=600
    )
    assert POD.wait_for_resource(
        condition='Running', selector='app=rook-ceph-mgr',
        timeout=600
    )
    assert POD.wait_for_resource(
        condition='Running', selector='app=rook-ceph-osd',
        resource_count=3, timeout=600
    )

    create_oc_resource('toolbox.yaml', cluster_path, _templating, config.ENV_DATA)
    log.info(f"Waiting {wait_time} seconds...")
    time.sleep(wait_time)
    create_oc_resource(
        'storage-manifest.yaml', cluster_path, _templating, config.ENV_DATA
    )
    create_oc_resource(
        "service-monitor.yaml", cluster_path, _templating, config.ENV_DATA
    )
    create_oc_resource(
        "prometheus-rules.yaml", cluster_path, _templating, config.ENV_DATA
    )
    log.info(f"Waiting {wait_time} seconds...")
    time.sleep(wait_time)

    # Create MDS pods for CephFileSystem
    fs_data = templating.load_yaml_to_dict(constants.CEPHFILESYSTEM_YAML)
    fs_data['metadata']['namespace'] = config.ENV_DATA['cluster_namespace']

    ceph_obj = OCS(**fs_data)
    ceph_obj.create()
    assert POD.wait_for_resource(
        condition=constants.STATUS_RUNNING, selector='app=rook-ceph-mds',
        resource_count=2, timeout=600
    )

    # Check for CephFilesystem creation in ocp
    cfs_data = CFS.get()
    cfs_name = cfs_data['items'][0]['metadata']['name']

    if helpers.validate_cephfilesystem(cfs_name):
        log.info(f"MDS deployment is successful!")
        defaults.CEPHFILESYSTEM_NAME = cfs_name
    else:
        log.error(
            f"MDS deployment Failed! Please check logs!"
        )

    # Verify health of ceph cluster
    # TODO: move destroy cluster logic to new CLI usage pattern?
    log.info("Done creating rook resources, waiting for HEALTH_OK")
    assert ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'])
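
The fixed time.sleep(wait_time) calls in this example are what the TODO about a "sampler function" refers to: polling until the resource actually exists would be more robust than sleeping. A minimal sketch of such a poller, assuming only the oc CLI is available (the function name and signature are illustrative, not part of ocs-ci):

import subprocess
import time


def wait_for_resource_exists(kind, name, namespace, timeout=60, interval=5):
    """Poll 'oc get' until <kind>/<name> exists in <namespace>, or give up after timeout."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        # A zero return code means the resource was found.
        result = subprocess.run(
            ["oc", "get", kind, name, "-n", namespace],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        if result.returncode == 0:
            return True
        time.sleep(interval)
    return False
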
Example 2
    def deploy_ocs(self):
        """
        Handle OCS deployment, since OCS deployment steps are common to any
        platform, implementing OCS deployment here in base class.
        """
        ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace)
        try:
            ceph_cluster.get().get('items')[0]
            logger.warning("OCS cluster already exists")
            return
        except (IndexError, CommandFailed):
            logger.info("Running OCS basic installation")
        self.deploy_ocs_via_operator()
        pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)
        cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace)
        # Check for Ceph pods
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mon',
                                     resource_count=3,
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mgr',
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-osd',
                                     resource_count=3,
                                     timeout=600)

        # validate ceph mon/osd volumes are backed by pvc
        validate_cluster_on_pvc()

        # validate PDB creation of MON, MDS, OSD pods
        validate_pdb_creation()

        # Creating toolbox pod
        setup_ceph_toolbox()

        assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                     selector='app=rook-ceph-tools',
                                     resource_count=1,
                                     timeout=600)

        # Check for CephFilesystem creation in ocp
        cfs_data = cfs.get()
        cfs_name = cfs_data['items'][0]['metadata']['name']

        if helpers.validate_cephfilesystem(cfs_name):
            logger.info("MDS deployment is successful!")
            defaults.CEPHFILESYSTEM_NAME = cfs_name
        else:
            logger.error("MDS deployment Failed! Please check logs!")

        # Change monitoring backend to OCS
        if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
                'persistent-monitoring'):

            sc = helpers.default_storage_class(
                interface_type=constants.CEPHBLOCKPOOL)

            # Get the list of monitoring pods
            pods_list = get_all_pods(
                namespace=defaults.OCS_MONITORING_NAMESPACE,
                selector=['prometheus', 'alertmanager'])

            # Create configmap cluster-monitoring-config and reconfigure
            # storage class and telemeter server (if the url is specified in a
            # config file)
            create_configmap_cluster_monitoring_pod(
                sc_name=sc.name,
                telemeter_server_url=config.ENV_DATA.get(
                    "telemeter_server_url"))

            # Allow some time for the pods to respin
            waiting_time = 45
            logger.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate the pods are respinned and in running state
            retry((CommandFailed, ResourceWrongStatusException),
                  tries=3,
                  delay=15)(validate_pods_are_respinned_and_running_state)(
                      pods_list)

            # Validate the pvc is created on monitoring pods
            validate_pvc_created_and_bound_on_monitoring_pods()

            # Validate the pvc are mounted on pods
            retry((CommandFailed, AssertionError), tries=3, delay=15)(
                validate_pvc_are_mounted_on_monitoring_pods)(pods_list)
        elif config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
                "telemeter_server_url"):
            # Create configmap cluster-monitoring-config to reconfigure
            # telemeter server url when 'persistent-monitoring' is False
            create_configmap_cluster_monitoring_pod(
                telemeter_server_url=config.ENV_DATA["telemeter_server_url"])

        # Change registry backend to OCS CEPHFS RWX PVC
        registry.change_registry_backend_to_ocs()

        # Verify health of ceph cluster
        # TODO: move destroy cluster logic to new CLI usage pattern?
        logger.info("Done creating rook resources, waiting for HEALTH_OK")
        assert ceph_health_check(namespace=self.namespace)
        # patch gp2/thin storage class as 'non-default'
        self.patch_default_sc_to_non_default()
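
The retry((CommandFailed, ResourceWrongStatusException), tries=3, delay=15)(func)(args) calls above use a decorator factory: retry(...) returns a decorator that re-invokes the wrapped function when one of the listed exceptions is raised. A minimal sketch of such a factory (illustrative only; the real ocs-ci retry helper may differ in details):

import functools
import time


def retry(exceptions, tries=3, delay=5):
    """Return a decorator that retries the wrapped call on the given exceptions."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            last_error = None
            for _ in range(tries):
                try:
                    return func(*args, **kwargs)
                except exceptions as err:
                    last_error = err
                    time.sleep(delay)
            raise last_error
        return wrapper
    return decorator


# Usage mirrors the calls above:
#     retry((ValueError,), tries=3, delay=1)(some_function)(some_argument)
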
Example 3
    def deploy_ocs(self):
        """
        Handle OCS deployment, since OCS deployment steps are common to any
        platform, implementing OCS deployment here in base class.
        """
        ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace)
        try:
            ceph_cluster.get().get('items')[0]
            logger.warning("OCS cluster already exists")
            return
        except (IndexError, CommandFailed):
            logger.info("Running OCS basic installation")

        if config.DEPLOYMENT['external_mode']:
            logger.info("Deploying OCS on external mode RHCS")
            return self.deploy_with_external_mode()

        self.deploy_ocs_via_operator()
        pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)
        cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace)
        # Check for Ceph pods
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mon',
                                     resource_count=3,
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mgr',
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-osd',
                                     resource_count=3,
                                     timeout=600)

        # validate ceph mon/osd volumes are backed by pvc
        validate_cluster_on_pvc()

        # validate PDB creation of MON, MDS, OSD pods
        validate_pdb_creation()

        # Creating toolbox pod
        setup_ceph_toolbox()

        assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                     selector='app=rook-ceph-tools',
                                     resource_count=1,
                                     timeout=600)

        # Check for CephFilesystem creation in ocp
        cfs_data = cfs.get()
        cfs_name = cfs_data['items'][0]['metadata']['name']

        if helpers.validate_cephfilesystem(cfs_name):
            logger.info("MDS deployment is successful!")
            defaults.CEPHFILESYSTEM_NAME = cfs_name
        else:
            logger.error("MDS deployment Failed! Please check logs!")

        # Change monitoring backend to OCS
        if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
                'persistent-monitoring'):

            sc = helpers.default_storage_class(
                interface_type=constants.CEPHBLOCKPOOL)

            # Get the list of monitoring pods
            pods_list = get_all_pods(
                namespace=defaults.OCS_MONITORING_NAMESPACE,
                selector=['prometheus', 'alertmanager'])

            # Create configmap cluster-monitoring-config and reconfigure
            # storage class and telemeter server (if the url is specified in a
            # config file)
            create_configmap_cluster_monitoring_pod(
                sc_name=sc.name,
                telemeter_server_url=config.ENV_DATA.get(
                    "telemeter_server_url"))

            # Allow some time for the pods to respin
            waiting_time = 45
            logger.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate the pods are respinned and in running state
            retry((CommandFailed, ResourceWrongStatusException),
                  tries=3,
                  delay=15)(validate_pods_are_respinned_and_running_state)(
                      pods_list)

            # Validate the pvc is created on monitoring pods
            validate_pvc_created_and_bound_on_monitoring_pods()

            # Validate the pvc are mounted on pods
            retry((CommandFailed, AssertionError), tries=3, delay=15)(
                validate_pvc_are_mounted_on_monitoring_pods)(pods_list)
        elif config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
                "telemeter_server_url"):
            # Create configmap cluster-monitoring-config to reconfigure
            # telemeter server url when 'persistent-monitoring' is False
            create_configmap_cluster_monitoring_pod(
                telemeter_server_url=config.ENV_DATA["telemeter_server_url"])

        # Change registry backend to OCS CEPHFS RWX PVC
        registry.change_registry_backend_to_ocs()

        # Verify health of ceph cluster
        # TODO: move destroy cluster logic to new CLI usage pattern?
        logger.info("Done creating rook resources, waiting for HEALTH_OK")
        try:
            ceph_health_check(namespace=self.namespace, tries=30, delay=10)
        except CephHealthException as ex:
            err = str(ex)
            logger.warning(f"Ceph health check failed with {err}")
            if "clock skew detected" in err:
                logger.info(f"Changing NTP on compute nodes to"
                            f" {constants.RH_NTP_CLOCK}")
                update_ntp_compute_nodes()
                assert ceph_health_check(namespace=self.namespace,
                                         tries=60,
                                         delay=10)

        # patch gp2/thin storage class as 'non-default'
        self.patch_default_sc_to_non_default()

        # Modify Noobaa endpoint auto scale values according to the cluster specs
        if check_nodes_specs(min_cpu=constants.MIN_NODE_CPU,
                             min_memory=constants.MIN_NODE_MEMORY):
            logger.info("The cluster specs meet the minimum requirements and "
                        "therefore, NooBaa auto scale will be enabled")
            min_nb_eps = config.DEPLOYMENT.get('min_noobaa_endpoints')
            max_nb_eps = config.DEPLOYMENT.get('max_noobaa_endpoints')
            change_noobaa_endpoints_count(min_nb_eps=min_nb_eps,
                                          max_nb_eps=max_nb_eps)
        else:
            logger.warning(
                "The cluster specs do not meet the minimum requirements and "
                "therefore, NooBaa auto scale will remain with its default values"
            )
            min_eps = 1
            max_eps = 1 if float(config.ENV_DATA['ocs_version']) < 4.6 else 2
            logger.info(
                f"The Noobaa endpoint auto scale values: min: {min_eps}, max: {max_eps}"
            )
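
One caveat in this example: comparing version strings as floats (float(config.ENV_DATA['ocs_version']) < 4.6) only works while the minor version stays single-digit, since float("4.10") is 4.1. A safer comparison, assuming the packaging library is available:

from packaging.version import Version


def ocs_version_at_least(ocs_version, minimum="4.6"):
    """Version-aware comparison that avoids the float("4.10") == 4.1 pitfall."""
    return Version(str(ocs_version)) >= Version(minimum)


# ocs_version_at_least("4.10")  -> True, while float("4.10") < 4.6 is (wrongly) True
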
Example 4
    def deploy_ocs(self):
        """
        Handle OCS deployment, since OCS deployment steps are common to any
        platform, implementing OCS deployment here in base class.
        """
        _templating = templating.Templating()

        ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace)
        try:
            ceph_cluster.get().get('items')[0]
            logger.warning("OCS cluster already exists")
            return
        except (IndexError, CommandFailed):
            logger.info("Running OCS basic installation")

        if not self.ocs_operator_deployment:
            create_oc_resource('common.yaml', self.cluster_path, _templating,
                               config.ENV_DATA)
            run_cmd(
                f'oc label namespace {config.ENV_DATA["cluster_namespace"]} '
                f'"openshift.io/cluster-monitoring=true"')
            run_cmd(
                f"oc policy add-role-to-user view "
                f"system:serviceaccount:openshift-monitoring:prometheus-k8s "
                f"-n {self.namespace}")
            # HACK: If you would like to drop this hack, make sure that you
            # also update the docs and write appropriate unit/integration tests
            # for config processing.
            if config.ENV_DATA.get('monitoring_enabled') in ("true", "True",
                                                             True):
                # RBAC rules for monitoring, based on documentation change in
                # rook:
                # https://github.com/rook/rook/commit/1b6fe840f6ae7372a9675ba727ecc65326708aa8
                # HACK: This should be dropped when OCS is managed by OLM
                apply_oc_resource('rbac.yaml',
                                  self.cluster_path,
                                  _templating,
                                  config.ENV_DATA,
                                  template_dir="monitoring")
            # Increased to 15 seconds as 10 is not enough
            # TODO: use a sampler function to check whether the resource exists
            wait_time = 15
            logger.info(f"Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            create_oc_resource('operator-openshift.yaml', self.cluster_path,
                               _templating, config.ENV_DATA)
            logger.info(f"Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            run_cmd(f"oc wait --for condition=ready pod "
                    f"-l app=rook-ceph-operator "
                    f"-n {self.namespace} "
                    f"--timeout=120s")
            run_cmd(f"oc wait --for condition=ready pod "
                    f"-l app=rook-discover "
                    f"-n {self.namespace} "
                    f"--timeout=120s")
            create_oc_resource('cluster.yaml', self.cluster_path, _templating,
                               config.ENV_DATA)
        else:
            self.deploy_ocs_via_operator()

        pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)
        cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace)
        # Check for Ceph pods
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mon',
                                     resource_count=3,
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mgr',
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-osd',
                                     resource_count=3,
                                     timeout=600)

        if not self.ocs_operator_deployment:
            # Creating toolbox pod
            create_oc_resource(
                'toolbox.yaml',
                self.cluster_path,
                _templating,
                config.ENV_DATA,
            )
        assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                     selector='app=rook-ceph-tools',
                                     resource_count=1,
                                     timeout=600)

        if not self.ocs_operator_deployment:
            logger.info(f"Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            # HACK: This should be dropped (including service-monitor.yaml and
            # prometheus-rules.yaml files) when OCS is managed by OLM
            if config.ENV_DATA.get('monitoring_enabled') not in ("true",
                                                                 "True", True):
                # HACK: skip creation of rook-ceph-mgr service monitor when
                # monitoring is enabled (if this were not skipped, the step
                # would fail because rook would create the service monitor at
                # this point already)
                create_oc_resource("service-monitor.yaml", self.cluster_path,
                                   _templating, config.ENV_DATA)
                # HACK: skip creation of prometheus-rules, rook-ceph is
                # concerned with its setup now, based on clarification from
                # Umanga Chapagain
                create_oc_resource("prometheus-rules.yaml", self.cluster_path,
                                   _templating, config.ENV_DATA)
            logger.info(f"Waiting {wait_time} seconds...")
            time.sleep(wait_time)

            # Create MDS pods for CephFileSystem
            fs_data = templating.load_yaml(constants.CEPHFILESYSTEM_YAML)
            fs_data['metadata']['namespace'] = self.namespace

            ceph_obj = OCS(**fs_data)
            ceph_obj.create()
            assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                         selector='app=rook-ceph-mds',
                                         resource_count=2,
                                         timeout=600)

        # Check for CephFilesystem creation in ocp
        cfs_data = cfs.get()
        cfs_name = cfs_data['items'][0]['metadata']['name']

        if helpers.validate_cephfilesystem(cfs_name):
            logger.info(f"MDS deployment is successful!")
            defaults.CEPHFILESYSTEM_NAME = cfs_name
        else:
            logger.error(f"MDS deployment Failed! Please check logs!")

        # Verify health of ceph cluster
        # TODO: move destroy cluster logic to new CLI usage pattern?
        logger.info("Done creating rook resources, waiting for HEALTH_OK")
        assert ceph_health_check(namespace=self.namespace)
        # patch gp2/thin storage class as 'non-default'
        self.patch_default_sc_to_non_default()
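
The membership test against ("true", "True", True) in this example works around monitoring_enabled arriving either as a string or as a boolean, depending on how the config was loaded. A small normalization helper would make that intent explicit (sketch; the helper name is illustrative):

def as_bool(value):
    """Coerce config values such as True, "true", "True" or "1" to a boolean."""
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        return value.strip().lower() in ("true", "1", "yes")
    return bool(value)


# e.g. if as_bool(config.ENV_DATA.get('monitoring_enabled')): ...
# instead of the membership test against ("true", "True", True)
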
Example 5
    def deploy_ocs(self):
        """
        Handle OCS deployment, since OCS deployment steps are common to any
        platform, implementing OCS deployment here in base class.
        """
        ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace)
        try:
            ceph_cluster.get().get('items')[0]
            logger.warning("OCS cluster already exists")
            return
        except (IndexError, CommandFailed):
            logger.info("Running OCS basic installation")

        self.deploy_ocs_via_operator()
        if config.DEPLOYMENT.get('ui_deployment'):
            config.ENV_DATA['skip_ocs_deployment'] = True
            return

        pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)
        cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace)
        # Check for Ceph pods
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mon',
                                     resource_count=3,
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mgr',
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-osd',
                                     resource_count=3,
                                     timeout=600)

        # validate ceph mon/osd volumes are backed by pvc
        validate_cluster_on_pvc()

        # Creating toolbox pod
        setup_ceph_toolbox()

        assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                     selector='app=rook-ceph-tools',
                                     resource_count=1,
                                     timeout=600)

        # Check for CephFilesystem creation in ocp
        cfs_data = cfs.get()
        cfs_name = cfs_data['items'][0]['metadata']['name']

        if helpers.validate_cephfilesystem(cfs_name):
            logger.info(f"MDS deployment is successful!")
            defaults.CEPHFILESYSTEM_NAME = cfs_name
        else:
            logger.error(f"MDS deployment Failed! Please check logs!")

        # Change monitoring backend to OCS
        if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
                'persistent-monitoring'):

            sc_name = f"{config.ENV_DATA['storage_cluster_name']}-{constants.DEFAULT_SC_RBD}"

            # Get the list of monitoring pods
            pods_list = get_all_pods(
                namespace=defaults.OCS_MONITORING_NAMESPACE,
                selector=['prometheus', 'alertmanager'])

            # Create configmap cluster-monitoring-config
            create_configmap_cluster_monitoring_pod(sc_name)

            # Allow some time for the pods to respin
            waiting_time = 45
            logger.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate the pods are respinned and in running state
            validate_pods_are_respinned_and_running_state(pods_list)

            # Validate the pvc is created on monitoring pods
            validate_pvc_created_and_bound_on_monitoring_pods()

            # Validate the pvc are mounted on pods
            validate_pvc_are_mounted_on_monitoring_pods(pods_list)

        # Change registry backend to OCS CEPHFS RWX PVC
        registry.change_registry_backend_to_ocs()

        # Verify health of ceph cluster
        # TODO: move destroy cluster logic to new CLI usage pattern?
        logger.info("Done creating rook resources, waiting for HEALTH_OK")
        assert ceph_health_check(namespace=self.namespace)
        # patch gp2/thin storage class as 'non-default'
        self.patch_default_sc_to_non_default()
Example 6
    def deploy_ocs(self):
        """
        Handle OCS deployment, since OCS deployment steps are common to any
        platform, implementing OCS deployment here in base class.
        """
        _templating = templating.Templating()

        ceph_cluster = ocp.OCP(kind='CephCluster',
                               namespace=config.ENV_DATA['cluster_namespace'])
        try:
            ceph_cluster.get().get('items')[0]
            logger.warning("OCS cluster already exists")
            return
        except (IndexError, CommandFailed):
            logger.info("Running OCS basic installation")

        create_oc_resource('common.yaml', self.cluster_path, _templating,
                           config.ENV_DATA)

        run_cmd(f'oc label namespace {config.ENV_DATA["cluster_namespace"]} '
                f'"openshift.io/cluster-monitoring=true"')
        run_cmd(f"oc policy add-role-to-user view "
                f"system:serviceaccount:openshift-monitoring:prometheus-k8s "
                f"-n {config.ENV_DATA['cluster_namespace']}")
        apply_oc_resource('csi-nodeplugin-rbac_rbd.yaml',
                          self.cluster_path,
                          _templating,
                          config.ENV_DATA,
                          template_dir="ocs-deployment/csi/rbd/")
        apply_oc_resource('csi-provisioner-rbac_rbd.yaml',
                          self.cluster_path,
                          _templating,
                          config.ENV_DATA,
                          template_dir="ocs-deployment/csi/rbd/")
        apply_oc_resource('csi-nodeplugin-rbac_cephfs.yaml',
                          self.cluster_path,
                          _templating,
                          config.ENV_DATA,
                          template_dir="ocs-deployment/csi/cephfs/")
        apply_oc_resource('csi-provisioner-rbac_cephfs.yaml',
                          self.cluster_path,
                          _templating,
                          config.ENV_DATA,
                          template_dir="ocs-deployment/csi/cephfs/")
        # Increased to 15 seconds as 10 is not enough
        # TODO: use a sampler function to check whether the resource exists
        wait_time = 15
        logger.info(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)
        create_oc_resource('operator-openshift-with-csi.yaml',
                           self.cluster_path, _templating, config.ENV_DATA)
        logger.info(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)
        run_cmd(f"oc wait --for condition=ready pod "
                f"-l app=rook-ceph-operator "
                f"-n {config.ENV_DATA['cluster_namespace']} "
                f"--timeout=120s")
        run_cmd(f"oc wait --for condition=ready pod "
                f"-l app=rook-discover "
                f"-n {config.ENV_DATA['cluster_namespace']} "
                f"--timeout=120s")
        create_oc_resource('cluster.yaml', self.cluster_path, _templating,
                           config.ENV_DATA)

        pod = ocp.OCP(kind=constants.POD,
                      namespace=config.ENV_DATA['cluster_namespace'])
        cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM,
                      namespace=config.ENV_DATA['cluster_namespace'])

        # Check for the Running status of Ceph Pods
        run_cmd(f"oc wait --for condition=ready pod "
                f"-l app=rook-ceph-agent "
                f"-n {config.ENV_DATA['cluster_namespace']} "
                f"--timeout=120s")
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mon',
                                     resource_count=3,
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mgr',
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-osd',
                                     resource_count=3,
                                     timeout=600)

        create_oc_resource('toolbox.yaml', self.cluster_path, _templating,
                           config.ENV_DATA)
        logger.info(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)
        create_oc_resource('storage-manifest.yaml', self.cluster_path,
                           _templating, config.ENV_DATA)
        create_oc_resource("service-monitor.yaml", self.cluster_path,
                           _templating, config.ENV_DATA)
        create_oc_resource("prometheus-rules.yaml", self.cluster_path,
                           _templating, config.ENV_DATA)
        logger.info(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)

        # Create MDS pods for CephFileSystem
        fs_data = templating.load_yaml_to_dict(constants.CEPHFILESYSTEM_YAML)
        fs_data['metadata']['namespace'] = config.ENV_DATA['cluster_namespace']

        ceph_obj = OCS(**fs_data)
        ceph_obj.create()
        assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                     selector='app=rook-ceph-mds',
                                     resource_count=2,
                                     timeout=600)

        # Check for CephFilesystem creation in ocp
        cfs_data = cfs.get()
        cfs_name = cfs_data['items'][0]['metadata']['name']

        if helpers.validate_cephfilesystem(cfs_name):
            logger.info(f"MDS deployment is successful!")
            defaults.CEPHFILESYSTEM_NAME = cfs_name
        else:
            logger.error(f"MDS deployment Failed! Please check logs!")

        # Verify health of ceph cluster
        # TODO: move destroy cluster logic to new CLI usage pattern?
        logger.info("Done creating rook resources, waiting for HEALTH_OK")
        assert ceph_health_check(
            namespace=config.ENV_DATA['cluster_namespace'])
        # patch gp2 (EBS) storage class as 'non-default'
        logger.info("Patch gp2 storageclass as non-default")
        patch = " '{\"metadata\": {\"annotations\":{\"storageclass.kubernetes.io/is-default-class\":\"false\"}}}' "
        run_cmd(f"oc patch storageclass gp2 "
                f"-p {patch} "
                f"--request-timeout=120s")

    def test_deployment(self):
        log.info("Running OCS basic installation")
        cluster_path = config.ENV_DATA['cluster_path']
        # Test cluster access; if the cluster already exists, just skip the deployment.
        if config.RUN['cli_params'].get('cluster_path') and OCP.set_kubeconfig(
            os.path.join(cluster_path, config.RUN.get('kubeconfig_location'))
        ):
            pytest.skip(
                "The installation is skipped cause the cluster is running"
            )

        # Generate install-config from template
        log.info("Generating install-config")
        run_cmd(f"mkdir -p {cluster_path}")
        pull_secret_path = os.path.join(
            TOP_DIR,
            "data",
            "pull-secret"
        )

        # TODO: check for supported platform and raise the exception if not
        # supported. Currently we support just AWS.

        _templating = templating.Templating()
        install_config_str = _templating.render_template(
            "install-config.yaml.j2", config.ENV_DATA
        )
        # Parse the rendered YAML so that we can manipulate the object directly
        install_config_obj = yaml.safe_load(install_config_str)
        with open(pull_secret_path, "r") as f:
            # Parse, then unparse, the JSON file.
            # We do this for two reasons: to ensure it is well-formatted, and
            # also to ensure it ends up as a single line.
            install_config_obj['pullSecret'] = json.dumps(json.loads(f.read()))
        install_config_str = yaml.safe_dump(install_config_obj)
        log.info(f"Install config: \n{install_config_str}")
        install_config = os.path.join(cluster_path, "install-config.yaml")
        with open(install_config, "w") as f:
            f.write(install_config_str)

        # Download installer
        installer = get_openshift_installer(
            config.DEPLOYMENT['installer_version']
        )
        # Download client
        get_openshift_client()

        # Deploy cluster
        log.info("Deploying cluster")
        run_cmd(
            f"{installer} create cluster "
            f"--dir {cluster_path} "
            f"--log-level debug"
        )

        # Test cluster access
        if not OCP.set_kubeconfig(
            os.path.join(cluster_path, config.RUN.get('kubeconfig_location'))
        ):
            pytest.fail("Cluster is not available!")

        # TODO: Create cluster object, add to config.ENV_DATA for other tests to
        # utilize.
        # Determine worker pattern and create ebs volumes
        with open(os.path.join(cluster_path, "terraform.tfvars")) as f:
            tfvars = json.load(f)

        cluster_id = tfvars['cluster_id']
        worker_pattern = f'{cluster_id}-worker*'
        log.info(f'Worker pattern: {worker_pattern}')
        create_ebs_volumes(worker_pattern, region_name=config.ENV_DATA['region'])

        # render templates and create resources
        create_oc_resource('common.yaml', cluster_path, _templating, config.ENV_DATA)
        run_cmd(
            f'oc label namespace {config.ENV_DATA["cluster_namespace"]} '
            f'"openshift.io/cluster-monitoring=true"'
        )
        run_cmd(
            f"oc policy add-role-to-user view "
            f"system:serviceaccount:openshift-monitoring:prometheus-k8s "
            f"-n {config.ENV_DATA['cluster_namespace']}"
        )
        apply_oc_resource(
            'csi-nodeplugin-rbac_rbd.yaml',
            cluster_path,
            _templating,
            config.ENV_DATA,
            template_dir="ocs-deployment/csi/rbd/"
        )
        apply_oc_resource(
            'csi-provisioner-rbac_rbd.yaml',
            cluster_path,
            _templating,
            config.ENV_DATA,
            template_dir="ocs-deployment/csi/rbd/"
        )
        apply_oc_resource(
            'csi-nodeplugin-rbac_cephfs.yaml',
            cluster_path,
            _templating,
            config.ENV_DATA,
            template_dir="ocs-deployment/csi/cephfs/"
        )
        apply_oc_resource(
            'csi-provisioner-rbac_cephfs.yaml',
            cluster_path,
            _templating,
            config.ENV_DATA,
            template_dir="ocs-deployment/csi/cephfs/"
        )
        # Increased to 15 seconds as 10 is not enough
        # TODO: use a sampler function to check whether the resource exists
        wait_time = 15
        log.info(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)
        create_oc_resource(
            'operator-openshift-with-csi.yaml', cluster_path, _templating, config.ENV_DATA
        )
        log.info(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)
        run_cmd(
            f"oc wait --for condition=ready pod "
            f"-l app=rook-ceph-operator "
            f"-n {config.ENV_DATA['cluster_namespace']} "
            f"--timeout=120s"
        )
        run_cmd(
            f"oc wait --for condition=ready pod "
            f"-l app=rook-discover "
            f"-n {config.ENV_DATA['cluster_namespace']} "
            f"--timeout=120s"
        )
        create_oc_resource('cluster.yaml', cluster_path, _templating, config.ENV_DATA)

        # Check for the Running status of Ceph Pods
        run_cmd(
            f"oc wait --for condition=ready pod "
            f"-l app=rook-ceph-agent "
            f"-n {config.ENV_DATA['cluster_namespace']} "
            f"--timeout=120s"
        )
        assert POD.wait_for_resource(
            condition='Running', selector='app=rook-ceph-mon',
            resource_count=3, timeout=600
        )
        assert POD.wait_for_resource(
            condition='Running', selector='app=rook-ceph-mgr',
            timeout=600
        )
        assert POD.wait_for_resource(
            condition='Running', selector='app=rook-ceph-osd',
            resource_count=3, timeout=600
        )

        create_oc_resource('toolbox.yaml', cluster_path, _templating, config.ENV_DATA)
        log.info(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)
        create_oc_resource(
            'storage-manifest.yaml', cluster_path, _templating, config.ENV_DATA
        )
        create_oc_resource(
            "service-monitor.yaml", cluster_path, _templating, config.ENV_DATA
        )
        create_oc_resource(
            "prometheus-rules.yaml", cluster_path, _templating, config.ENV_DATA
        )
        log.info(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)

        # Create MDS pods for CephFileSystem
        self.fs_data = copy.deepcopy(defaults.CEPHFILESYSTEM_DICT)
        self.fs_data['metadata']['namespace'] = config.ENV_DATA['cluster_namespace']

        global CEPH_OBJ
        CEPH_OBJ = OCS(**self.fs_data)
        CEPH_OBJ.create()
        assert POD.wait_for_resource(
            condition=constants.STATUS_RUNNING, selector='app=rook-ceph-mds',
            resource_count=2, timeout=600
        )

        # Check for CephFilesystem creation in ocp
        cfs_data = CFS.get()
        cfs_name = cfs_data['items'][0]['metadata']['name']

        if helpers.validate_cephfilesystem(cfs_name):
            log.info(f"MDS deployment is successful!")
        else:
            log.error(
                f"MDS deployment Failed! Please check logs!"
            )

        # Verify health of ceph cluster
        # TODO: move destroy cluster logic to new CLI usage pattern?
        log.info("Done creating rook resources, waiting for HEALTH_OK")
        assert ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'])
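
The ceph_health_check assertion that closes each example gates on the cluster reporting HEALTH_OK. Conceptually it boils down to running ceph health inside the rook-ceph-tools pod, roughly as in the sketch below (assumes a rook-ceph-tools deployment and the oc CLI; not the actual ocs-ci implementation):

import subprocess


def ceph_reports_health_ok(namespace):
    """Run 'ceph health' inside the rook-ceph-tools deployment and check for HEALTH_OK."""
    # 'oc exec deploy/<name>' targets a pod of that deployment; adjust if the
    # toolbox is addressed differently in your environment.
    result = subprocess.run(
        ["oc", "-n", namespace, "exec", "deploy/rook-ceph-tools", "--", "ceph", "health"],
        capture_output=True,
        text=True,
        check=True,
    )
    return result.stdout.strip().startswith("HEALTH_OK")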