Example #1
def create_ebs_volumes(
    worker_pattern,
    size=100,
    region_name=None,
):
    """
    Create volumes on workers

    Args:
        worker_pattern (string): Worker name pattern e.g.:
            cluster-55jx2-worker*
        size (int): Size in GB (default: 100)
        region_name (str): Region name (default: config.ENV_DATA['region'])
    """
    region_name = region_name or config.ENV_DATA['region']
    aws = AWS(region_name)
    worker_instances = aws.get_instances_by_name_pattern(worker_pattern)
    with parallel() as p:
        for worker in worker_instances:
            log.info(
                f"Creating and attaching {size} GB volume to {worker['name']}")
            p.spawn(
                aws.create_volume_and_attach,
                availability_zone=worker['avz'],
                instance_id=worker['id'],
                name=f"{worker['name']}_extra_volume",
                size=size,
            )
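A minimal usage sketch for the helper above; the worker pattern follows the format shown in the docstring, while the size and region values below are illustrative and the function is assumed to be already imported:

# Hypothetical invocation of create_ebs_volumes(); the pattern format comes from
# the docstring above, and the size/region values are illustrative.
create_ebs_volumes(
    worker_pattern="cluster-55jx2-worker*",
    size=200,
    region_name="us-east-2",
)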
Example #2
def __init__(self):
    """
    This would be base for both IPI and UPI deployment
    """
    super(AWSBase, self).__init__()
    self.region = config.ENV_DATA['region']
    self.aws = AWSUtil(self.region)
Example #3
def __init__(self):
    """
    This would be base for both IPI and UPI deployment
    """
    super(AWSBase, self).__init__()
    self.aws = AWSUtil(self.region)
    # dict of cluster prefixes with special handling rules (for existence
    # check or during a cluster cleanup)
    self.cluster_prefixes_special_rules = CLUSTER_PREFIXES_SPECIAL_RULES
Example #4
def destroy_cluster(cluster_path, log_level="DEBUG"):
    """
    Destroy existing cluster resources in AWS.

    Args:
        cluster_path (str): filepath to cluster directory to be destroyed
        log_level (str): log level to set for openshift_installer

    """
    # Download installer
    installer = get_openshift_installer()

    destroy_cmd = (f"{installer} destroy cluster "
                   f"--dir {cluster_path} "
                   f"--log-level {log_level}")

    try:
        cluster_path = os.path.normpath(cluster_path)

        # Retrieve cluster name and aws region from metadata
        metadata_file = os.path.join(cluster_path, "metadata.json")
        with open(metadata_file) as f:
            metadata = json.loads(f.read())
        cluster_name = metadata.get("clusterName")
        region_name = metadata.get("aws").get("region")

        # Execute destroy cluster using OpenShift installer
        log.info(f"Destroying cluster defined in {cluster_path}")
        run_cmd(destroy_cmd)

        # Find and delete volumes
        aws = AWS(region_name)
        volume_pattern = f"{cluster_name}*"
        log.debug(f"Finding volumes with pattern: {volume_pattern}")
        volumes = aws.get_volumes_by_name_pattern(volume_pattern)
        log.debug(f"Found volumes: \n {volumes}")
        for volume in volumes:
            aws.detach_and_delete_volume(aws.ec2_resource.Volume(volume['id']))

        # Remove installer
        delete_file(installer)

    except Exception:
        log.error(traceback.format_exc())
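A minimal usage sketch, assuming the function above is importable; the cluster directory path is illustrative and must contain the metadata.json written by the OpenShift installer:

# Hypothetical invocation; the path is illustrative. The directory must hold the
# installer-generated metadata.json so the cluster name and region can be read.
destroy_cluster(cluster_path="/home/user/clusters/my-cluster", log_level="INFO")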
Example #5
def __init__(self):
    """
    This would be base for both IPI and UPI deployment
    """
    super(AWSBase, self).__init__()
    self.region = config.ENV_DATA['region']
    self.aws = AWSUtil(self.region)
    if config.ENV_DATA.get('cluster_name'):
        self.cluster_name = config.ENV_DATA['cluster_name']
    else:
        self.cluster_name = get_cluster_name(self.cluster_path)
Example #6
def get_clusters_to_delete(time_to_delete, region_name,
                           prefixes_hours_to_spare):
    """
    Get all cluster names whose EC2 instances' running time is greater
    than the specified time to delete

    Args:
        time_to_delete (int): The maximum time in seconds that is allowed
            for clusters to continue running
        region_name (str): The name of the AWS region to delete the resources from
        prefixes_hours_to_spare (dict): Dictionary of the cluster prefixes to spare
            along with the maximum time in hours that is allowed for spared
            clusters to continue running

    Returns:
        tuple: List of the cluster names (e.g. ebenahar-cluster-gqtd4) to be provided to the
            ci-cleanup script and a list of VPCs that are part of cloudformations

    """
    aws = AWS(region_name=region_name)
    clusters_to_delete = list()
    cloudformation_vpcs = list()
    vpcs = aws.ec2_client.describe_vpcs()['Vpcs']
    vpc_ids = [vpc['VpcId'] for vpc in vpcs]
    vpc_objs = [aws.ec2_resource.Vpc(vpc_id) for vpc_id in vpc_ids]
    for vpc_obj in vpc_objs:
        vpc_tags = vpc_obj.tags
        vpc_cloudformation = [
            tag['Value'] for tag in vpc_tags
            if tag['Key'] == defaults.AWS_CLOUDFORMATION_TAG
        ]
        if vpc_cloudformation:
            cloudformation_vpcs.append(vpc_cloudformation)
            continue
        vpc_name = [tag['Value'] for tag in vpc_tags
                    if tag['Key'] == 'Name'][0]
        cluster_name = vpc_name[:-4]
        vpc_instances = vpc_obj.instances.all()
        if not vpc_instances:
            clusters_to_delete.append(cluster_name)
        for instance in vpc_instances:
            allowed_running_time = time_to_delete
            if instance.state["Name"] == "running":
                for prefix, hours in prefixes_hours_to_spare.items():
                    if prefix in cluster_name:
                        allowed_running_time = int(hours) * 60 * 60
                        break
                launch_time = instance.launch_time
                current_time = datetime.datetime.now(launch_time.tzinfo)
                running_time = current_time - launch_time
                if running_time.total_seconds() > allowed_running_time:
                    clusters_to_delete.append(cluster_name)
                break
    return clusters_to_delete, cloudformation_vpcs
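A minimal usage sketch, assuming the function above is importable; the prefix key and hour value in prefixes_hours_to_spare are illustrative:

# Hypothetical invocation: flag clusters running longer than 10 hours, but allow
# clusters whose name contains 'jnk' to run for up to 36 hours.
clusters, cloudformation_vpcs = get_clusters_to_delete(
    time_to_delete=10 * 60 * 60,
    region_name="us-east-2",
    prefixes_hours_to_spare={"jnk": 36},
)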
Example #7
class AWSBase(Deployment):
    def __init__(self):
        """
        This would be base for both IPI and UPI deployment
        """
        super(AWSBase, self).__init__()
        self.region = config.ENV_DATA['region']
        self.aws = AWSUtil(self.region)

    def create_ebs_volumes(self, worker_pattern, size=100):
        """
        Add new ebs volumes to the workers

        Args:
            worker_pattern (str):  Worker name pattern e.g.:
                cluster-55jx2-worker*
            size (int): Size in GB (default: 100)
        """
        worker_instances = self.aws.get_instances_by_name_pattern(
            worker_pattern
        )
        with parallel() as p:
            for worker in worker_instances:
                logger.info(
                    f"Creating and attaching {size} GB "
                    f"volume to {worker['name']}"
                )
                p.spawn(
                    self.aws.create_volume_and_attach,
                    availability_zone=worker['avz'],
                    instance_id=worker['id'],
                    name=f"{worker['name']}_extra_volume",
                    size=size,
                )

    def add_volume(self, size=100):
        """
        Add a new volume to all the workers

        Args:
            size (int): Size of volume in GB (default: 100)
        """
        tfvars_file = "terraform.tfvars.json"
        with open(os.path.join(self.cluster_path, tfvars_file)) as f:
            tfvars = json.load(f)

        cluster_id = tfvars['cluster_id']
        worker_pattern = f'{cluster_id}-worker*'
        logger.info(f'Worker pattern: {worker_pattern}')
        self.create_ebs_volumes(worker_pattern, size)

    def add_node(self):
        # TODO: Implement later
        super(AWSBase, self).add_node()
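A minimal usage sketch for the class above, assuming the framework config (config.ENV_DATA, a cluster path containing terraform.tfvars.json) is already populated by the surrounding test framework; the volume size is illustrative:

# Hypothetical deployment-time usage; AWSBase.__init__ reads the region from
# config.ENV_DATA, and add_volume derives the worker pattern from terraform.tfvars.json.
deployment = AWSBase()
deployment.add_volume(size=256)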
Example #8
def get_clusters_to_delete(time_to_delete, region_name, prefixes_to_spare):
    """
    Get all cluster names whose EC2 instances' running time is greater
    than the specified time to delete

    Args:
        time_to_delete (int): The maximum time in seconds that is allowed
            for clusters to continue running
        region_name (str): The name of the AWS region to delete the resources from
        prefixes_to_spare (list): The cluster prefixes to spare

    Returns:
        list: The cluster names (e.g. ebenahar-cluster-gqtd4) to be provided to the
            ci-cleanup script

    """
    aws = AWS(region_name=region_name)
    clusters_to_delete = list()
    vpcs = aws.ec2_client.describe_vpcs()['Vpcs']
    vpc_ids = [vpc['VpcId'] for vpc in vpcs]
    vpc_objs = [aws.ec2_resource.Vpc(vpc_id) for vpc_id in vpc_ids]
    for vpc_obj in vpc_objs:
        vpc_name = [
            tag['Value'] for tag in vpc_obj.tags if tag['Key'] == 'Name'
        ][0]
        cluster_name = vpc_name[:-4]
        # process only clusters whose name matches none of the spared prefixes
        if all(prefix not in cluster_name for prefix in prefixes_to_spare):
            vpc_instances = vpc_obj.instances.all()
            if not vpc_instances:
                clusters_to_delete.append(cluster_name)
            for instance in vpc_instances:
                if instance.state["Name"] == "running":
                    launch_time = instance.launch_time
                    current_time = datetime.datetime.now(launch_time.tzinfo)
                    running_time = current_time - launch_time
                    if running_time.total_seconds() > time_to_delete:
                        clusters_to_delete.append(cluster_name)
                    break
    return clusters_to_delete
Example #9
class AWSBase(Deployment):
    def __init__(self):
        """
        This would be base for both IPI and UPI deployment
        """
        super(AWSBase, self).__init__()
        self.region = config.ENV_DATA['region']
        self.aws = AWSUtil(self.region)
        if config.ENV_DATA.get('cluster_name'):
            self.cluster_name = config.ENV_DATA['cluster_name']
        else:
            self.cluster_name = get_cluster_name(self.cluster_path)

    def create_ebs_volumes(self, worker_pattern, size=100):
        """
        Add new ebs volumes to the workers

        Args:
            worker_pattern (str):  Worker name pattern e.g.:
                cluster-55jx2-worker*
            size (int): Size in GB (default: 100)
        """
        worker_instances = self.aws.get_instances_by_name_pattern(
            worker_pattern)
        with parallel() as p:
            for worker in worker_instances:
                logger.info(f"Creating and attaching {size} GB "
                            f"volume to {worker['name']}")
                p.spawn(
                    self.aws.create_volume_and_attach,
                    availability_zone=worker['avz'],
                    instance_id=worker['id'],
                    name=f"{worker['name']}_extra_volume",
                    size=size,
                )

    def add_volume(self, size=100):
        """
        Add a new volume to all the workers

        Args:
            size (int): Size of volume in GB (default: 100)
        """
        cluster_id = get_infra_id(self.cluster_path)
        worker_pattern = f'{cluster_id}-worker*'
        logger.info(f'Worker pattern: {worker_pattern}')
        self.create_ebs_volumes(worker_pattern, size)

    def host_network_update(self):
        """
        Update security group rules for HostNetwork
        """
        cluster_id = get_infra_id(self.cluster_path)
        worker_pattern = f'{cluster_id}-worker*'
        worker_instances = self.aws.get_instances_by_name_pattern(
            worker_pattern)
        security_groups = worker_instances[0]['security_groups']
        sg_id = security_groups[0]['GroupId']
        security_group = self.aws.ec2_resource.SecurityGroup(sg_id)
        # The ports are not 100 % clear yet. Taken from doc:
        # https://docs.google.com/document/d/1c23ooTkW7cdbHNRbCTztprVU6leDqJxcvFZ1ZvK2qtU/edit#
        security_group.authorize_ingress(
            DryRun=False,
            IpPermissions=[
                {
                    'FromPort': 6800,
                    'ToPort': 7300,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {'Description': 'Ceph OSDs', 'GroupId': sg_id},
                    ],
                },
                {
                    'FromPort': 3300,
                    'ToPort': 3300,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {'Description': 'Ceph MONs rule1', 'GroupId': sg_id},
                    ],
                },
                {
                    'FromPort': 6789,
                    'ToPort': 6789,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {'Description': 'Ceph MONs rule2', 'GroupId': sg_id},
                    ],
                },
                {
                    'FromPort': 8443,
                    'ToPort': 8443,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {'Description': 'Ceph Dashboard rule1', 'GroupId': sg_id},
                    ],
                },
                {
                    'FromPort': 8080,
                    'ToPort': 8080,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {'Description': 'Ceph Dashboard rule2', 'GroupId': sg_id},
                    ],
                },
            ])

    def add_node(self):
        # TODO: Implement later
        super(AWSBase, self).add_node()

    def check_cluster_existence(self, cluster_name_prefix):
        """
        Check cluster existence according to cluster name prefix

        Returns:
            bool: True if a cluster with the same name prefix already exists,
                False otherwise

        """
        instances = self.aws.get_instances_by_name_pattern(cluster_name_prefix)
        instance_objs = [
            self.aws.get_ec2_instance(ins.get('id')) for ins in instances
        ]
        non_terminated_instances = [
            ins for ins in instance_objs
            if ins.state.get('Code') != constants.INSTANCE_TERMINATED
        ]
        if non_terminated_instances:
            logger.error(
                f"Non terminated EC2 instances with the same name prefix were"
                f" found: {[ins.id for ins in non_terminated_instances]}")
            return True
        return False
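A minimal usage sketch of the existence check above, assuming an AWSBase instance can be constructed from the populated framework config; the name prefix is illustrative:

# Hypothetical pre-deployment guard; the cluster name prefix is illustrative.
deployment = AWSBase()
if deployment.check_cluster_existence("jnk-cluster"):
    raise RuntimeError("EC2 instances with this cluster name prefix already exist")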
Example #10
def vsphere_cleanup():
    """
    Deletes the cluster and all the associated resources
    on vSphere environment.

    Cleanup steps:
        1. Delete disks
        2. Delete VMs
        3. Delete resource pool
        4. Remove IPs from IPAM server
        5. Remove resource records from the hosted zone
        6. Remove hosted zone from AWS
        7. Remove records from the base domain

    """
    parser = argparse.ArgumentParser(
        description="vSphere cluster cleanup",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--cluster_name",
        action="store",
        required=True,
        help="The name of the cluster to delete from vSphere",
    )
    parser.add_argument(
        "--vsphere_conf",
        action="store",
        required=True,
        type=argparse.FileType("r", encoding="UTF-8"),
        help="""vSphere configuration file in yaml format.
            Example file:
                ---
                ENV_DATA:
                  # aws region
                  region: 'us-east-2'
                  base_domain: 'qe.rh-ocs.com'
                  # vsphere details
                  vsphere_server: '<your_vcenter.lab.com>'
                  vsphere_user: '******'
                  vsphere_password: '******'
                  vsphere_cluster: '<cluster name>'
                  vsphere_datacenter: '<datacenter name>'
                  ipam: '<IP>'
                  ipam_token: '<IPAM token>'
            """,
    )

    args = parser.parse_args()

    cluster_name = args.cluster_name
    vsphere_conf = args.vsphere_conf

    # load vsphere_conf data to config
    vsphere_config_data = yaml.safe_load(vsphere_conf)
    framework.config.update(vsphere_config_data)
    vsphere_conf.close()

    # get connection to vSphere
    server = config.ENV_DATA["vsphere_server"]
    user = config.ENV_DATA["vsphere_user"]
    password = config.ENV_DATA["vsphere_password"]
    vsphere = get_vsphere_connection(server, user, password)

    # delete the cluster
    delete_cluster(vsphere, cluster_name)

    # release IP's from IPAM server
    ipam = IPAM()
    ipam.delete_ips(cluster_name=cluster_name)

    # Delete AWS route
    aws = AWS()
    aws.delete_hosted_zone(cluster_name=cluster_name)

    # Delete records in base domain
    base_domain = config.ENV_DATA["base_domain"]
    aws.delete_record_from_base_domain(cluster_name=cluster_name,
                                       base_domain=base_domain)
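A minimal usage sketch; vsphere_cleanup() parses sys.argv itself via argparse, so a scripted invocation can set sys.argv before calling it. The cluster name and config file name below are illustrative, and the YAML file must exist because argparse opens it:

import sys

# Hypothetical scripted invocation; names are illustrative.
sys.argv = [
    "vsphere_cleanup",
    "--cluster_name", "my-vsphere-cluster",
    "--vsphere_conf", "vsphere_conf.yaml",
]
vsphere_cleanup()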
Example #11
def __init__(self):
    self.name = self.__class__.__name__
    super(ROSA, self).__init__()
    ocm.download_ocm_cli()
    rosa.download_rosa_cli()
    self.aws = AWSUtil(self.region)
Example #12
def get_clusters(time_to_delete, region_name, prefixes_hours_to_spare):
    """
    Get all cluster names whose EC2 instances' running time is greater
    than the specified time to delete

    Args:
        time_to_delete (int): The maximum time in seconds that is allowed
            for clusters to continue running
        region_name (str): The name of the AWS region to delete the resources from
        prefixes_hours_to_spare (dict): Dictionary of the cluster prefixes to spare
            along with the maximum time in hours that is allowed for spared
            clusters to continue running

    Returns:
        tuple: List of the cluster names (e.g. ebenahar-cluster-gqtd4) to be provided to the
            ci-cleanup script, a list of VPCs that are part of cloudformation,
            and a list of remaining clusters

    """
    def determine_cluster_deletion(ec2_instances, cluster_name):
        for instance in ec2_instances:
            allowed_running_time = time_to_delete
            do_not_delete = False
            if instance.state["Name"] == "running":
                for prefix, hours in prefixes_hours_to_spare.items():
                    # case insensitive 'startswith'
                    if bool(re.match(prefix, cluster_name, re.I)):
                        if hours == 'never':
                            do_not_delete = True
                        else:
                            allowed_running_time = int(hours) * 60 * 60
                        break
                if do_not_delete:
                    logger.info(
                        "%s marked as 'do not delete' and will not be "
                        "destroyed", cluster_name)
                    return False
                else:
                    launch_time = instance.launch_time
                    current_time = datetime.datetime.now(launch_time.tzinfo)
                    running_time = current_time - launch_time
                    logger.info(
                        f"Instance {[tag['Value'] for tag in instance.tags if tag['Key'] == 'Name'][0]} "
                        f"(id: {instance.id}) running time is {running_time} hours while the allowed"
                        f" running time for it is {allowed_running_time/3600} hours"
                    )
                    if running_time.total_seconds() > allowed_running_time:
                        return True
        return False

    aws = AWS(region_name=region_name)
    clusters_to_delete = list()
    remaining_clusters = list()
    cloudformation_vpc_names = list()
    vpcs = aws.ec2_client.describe_vpcs()['Vpcs']
    vpc_ids = [vpc['VpcId'] for vpc in vpcs]
    vpc_objs = [aws.ec2_resource.Vpc(vpc_id) for vpc_id in vpc_ids]
    for vpc_obj in vpc_objs:
        vpc_tags = vpc_obj.tags
        if vpc_tags:
            cloudformation_vpc_name = [
                tag['Value'] for tag in vpc_tags
                if tag['Key'] == defaults.AWS_CLOUDFORMATION_TAG
            ]
            if cloudformation_vpc_name:
                cloudformation_vpc_names.append(cloudformation_vpc_name[0])
                continue
            vpc_name = [
                tag['Value'] for tag in vpc_tags if tag['Key'] == 'Name'
            ][0]
            cluster_name = vpc_name.replace('-vpc', '')
            vpc_instances = vpc_obj.instances.all()
            if not vpc_instances:
                clusters_to_delete.append(cluster_name)
                continue

            # Append to clusters_to_delete if cluster should be deleted
            if determine_cluster_deletion(vpc_instances, cluster_name):
                clusters_to_delete.append(cluster_name)
            else:
                remaining_clusters.append(cluster_name)
        else:
            logger.info("No tags found for VPC")

    # Get all cloudformation based clusters to delete
    cf_clusters_to_delete = list()
    for vpc_name in cloudformation_vpc_names:
        instance_dicts = aws.get_instances_by_name_pattern(
            f"{vpc_name.replace('-vpc', '')}*")
        ec2_instances = [
            aws.get_ec2_instance(instance_dict['id'])
            for instance_dict in instance_dicts
        ]
        if not ec2_instances:
            continue
        cluster_io_tag = None
        for instance in ec2_instances:
            cluster_io_tag = [
                tag['Key'] for tag in instance.tags
                if 'kubernetes.io/cluster' in tag['Key']
            ]
            if cluster_io_tag:
                break
        if not cluster_io_tag:
            logger.warning(
                "Unable to find valid cluster IO tag from ec2 instance tags "
                "for VPC %s. This is probably not an OCS cluster VPC!",
                vpc_name)
            continue
        cluster_name = cluster_io_tag[0].replace('kubernetes.io/cluster/', '')
        if determine_cluster_deletion(ec2_instances, cluster_name):
            cf_clusters_to_delete.append(cluster_name)
        else:
            remaining_clusters.append(cluster_name)

    return clusters_to_delete, cf_clusters_to_delete, remaining_clusters
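A minimal usage sketch, assuming the function above is importable; per the inner determine_cluster_deletion helper, each key in prefixes_hours_to_spare is matched case-insensitively against the start of the cluster name, and the value 'never' spares the cluster entirely. The prefixes and hours below are illustrative:

# Hypothetical invocation; prefix keys and hour values are illustrative.
to_delete, cf_to_delete, remaining = get_clusters(
    time_to_delete=10 * 60 * 60,
    region_name="us-east-2",
    prefixes_hours_to_spare={"dnd": "never", "lr1": 24},
)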
Example #13
class AWSBase(CloudDeploymentBase):

    # default storage class for StorageCluster CRD on AWS platform
    DEFAULT_STORAGECLASS = "gp2"

    def __init__(self):
        """
        This would be base for both IPI and UPI deployment
        """
        super(AWSBase, self).__init__()
        self.aws = AWSUtil(self.region)

    def host_network_update(self):
        """
        Update security group rules for HostNetwork
        """
        cluster_id = get_infra_id(self.cluster_path)
        worker_pattern = f'{cluster_id}-worker*'
        worker_instances = self.aws.get_instances_by_name_pattern(
            worker_pattern)
        security_groups = worker_instances[0]['security_groups']
        sg_id = security_groups[0]['GroupId']
        security_group = self.aws.ec2_resource.SecurityGroup(sg_id)
        # The ports are not 100 % clear yet. Taken from doc:
        # https://docs.google.com/document/d/1c23ooTkW7cdbHNRbCTztprVU6leDqJxcvFZ1ZvK2qtU/edit#
        security_group.authorize_ingress(
            DryRun=False,
            IpPermissions=[
                {
                    'FromPort': 6800,
                    'ToPort': 7300,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {'Description': 'Ceph OSDs', 'GroupId': sg_id},
                    ],
                },
                {
                    'FromPort': 3300,
                    'ToPort': 3300,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {'Description': 'Ceph MONs rule1', 'GroupId': sg_id},
                    ],
                },
                {
                    'FromPort': 6789,
                    'ToPort': 6789,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {'Description': 'Ceph MONs rule2', 'GroupId': sg_id},
                    ],
                },
                {
                    'FromPort': 8443,
                    'ToPort': 8443,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {'Description': 'Ceph Dashboard rule1', 'GroupId': sg_id},
                    ],
                },
                {
                    'FromPort': 8080,
                    'ToPort': 8080,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {'Description': 'Ceph Dashboard rule2', 'GroupId': sg_id},
                    ],
                },
            ])

    def add_node(self):
        # TODO: Implement later
        super(AWSBase, self).add_node()

    def check_cluster_existence(self, cluster_name_prefix):
        """
        Check cluster existence according to cluster name prefix

        Returns:
            bool: True if a cluster with the same name prefix already exists,
                False otherwise

        """
        cluster_name_pattern = cluster_name_prefix + "*"
        instances = self.aws.get_instances_by_name_pattern(
            cluster_name_pattern)
        instance_objs = [
            self.aws.get_ec2_instance(ins.get('id')) for ins in instances
        ]
        non_terminated_instances = [
            ins for ins in instance_objs
            if ins.state.get('Code') != constants.INSTANCE_TERMINATED
        ]
        if non_terminated_instances:
            logger.error(
                f"Non terminated EC2 instances with the same name prefix were"
                f" found: {[ins.id for ins in non_terminated_instances]}")
            return True
        return False
Example #14
def create_cluster(cluster_name, version, region):
    """
    Create OCP cluster.

    Args:
        cluster_name (str): Cluster name
        version (str): cluster version
        region (str): Cluster region

    """

    rosa_ocp_version = config.DEPLOYMENT["installer_version"]
    # Validate ocp version with rosa ocp supported version
    # Select the valid version if given version is invalid
    if not validate_ocp_version(rosa_ocp_version):
        logger.warning(f"Given OCP version {rosa_ocp_version} "
                       f"is not a valid ROSA OCP version. "
                       f"Selecting the latest ROSA version for deployment")
        rosa_ocp_version = get_latest_rosa_version(version)
        logger.info(f"Using OCP version {rosa_ocp_version}")

    create_account_roles(version)
    compute_nodes = config.ENV_DATA["worker_replicas"]
    compute_machine_type = config.ENV_DATA["worker_instance_type"]
    multi_az = "--multi-az " if config.ENV_DATA.get(
        "multi_availability_zones") else ""
    cluster_type = config.ENV_DATA.get("cluster_type", "")
    provider_name = config.ENV_DATA.get("provider_name", "")
    rosa_mode = config.ENV_DATA.get("rosa_mode", "")
    cmd = (
        f"rosa create cluster --cluster-name {cluster_name} --region {region} "
        f"--compute-nodes {compute_nodes} --compute-machine-type "
        f"{compute_machine_type}  --version {rosa_ocp_version} {multi_az}--sts --yes"
    )
    if rosa_mode == "auto":
        cmd += " --mode auto"
    if cluster_type.lower() == "consumer" and config.ENV_DATA.get(
            "provider_name", ""):
        aws = AWSUtil()
        subnet_id = ",".join(aws.get_cluster_subnet_ids(provider_name))
        cmd = f"{cmd} --subnet-ids {subnet_id}"

    utils.run_cmd(cmd, timeout=1200)
    if rosa_mode != "auto":
        logger.info(
            "Waiting for ROSA cluster status to change to waiting or pending state"
        )
        for cluster_info in utils.TimeoutSampler(4500, 30,
                                                 ocm.get_cluster_details,
                                                 cluster_name):
            status = cluster_info["status"]["state"]
            logger.info(f"Current installation status: {status}")
            if status == "waiting" or status == "pending":
                logger.info(f"Cluster is in {status} state")
                break
        create_operator_roles(cluster_name)
        create_oidc_provider(cluster_name)

    logger.info("Waiting for installation of ROSA cluster")
    for cluster_info in utils.TimeoutSampler(4500, 30, ocm.get_cluster_details,
                                             cluster_name):
        status = cluster_info["status"]["state"]
        logger.info(f"Current installation status: {status}")
        if status == "ready":
            logger.info("Cluster was installed")
            break
    cluster_info = ocm.get_cluster_details(cluster_name)
    # Create metadata file to store the cluster name
    cluster_info["clusterName"] = cluster_name
    cluster_info["clusterID"] = cluster_info["id"]
    cluster_path = config.ENV_DATA["cluster_path"]
    metadata_file = os.path.join(cluster_path, "metadata.json")
    with open(metadata_file, "w+") as f:
        json.dump(cluster_info, f)
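A minimal usage sketch, assuming config.DEPLOYMENT and config.ENV_DATA already hold the settings the function reads (installer_version, worker_replicas, worker_instance_type, cluster_path, etc.); the cluster name, version, and region are illustrative:

# Hypothetical invocation; the argument values are illustrative.
create_cluster(cluster_name="my-rosa-cluster", version="4.10", region="us-east-2")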
Example #15
class AWSBase(CloudDeploymentBase):

    # default storage class for StorageCluster CRD on AWS platform
    DEFAULT_STORAGECLASS = "gp2"

    def __init__(self):
        """
        This would be base for both IPI and UPI deployment
        """
        super(AWSBase, self).__init__()
        self.aws = AWSUtil(self.region)
        # dict of cluster prefixes with special handling rules (for existence
        # check or during a cluster cleanup)
        self.cluster_prefixes_special_rules = CLUSTER_PREFIXES_SPECIAL_RULES

    def host_network_update(self):
        """
        Update security group rules for HostNetwork
        """
        cluster_id = get_infra_id(self.cluster_path)
        worker_pattern = f"{cluster_id}-worker*"
        worker_instances = self.aws.get_instances_by_name_pattern(
            worker_pattern)
        security_groups = worker_instances[0]["security_groups"]
        sg_id = security_groups[0]["GroupId"]
        security_group = self.aws.ec2_resource.SecurityGroup(sg_id)
        # The ports are not 100 % clear yet. Taken from doc:
        # https://docs.google.com/document/d/1c23ooTkW7cdbHNRbCTztprVU6leDqJxcvFZ1ZvK2qtU/edit#
        security_group.authorize_ingress(
            DryRun=False,
            IpPermissions=[
                {
                    "FromPort": 6800,
                    "ToPort": 7300,
                    "IpProtocol": "tcp",
                    "UserIdGroupPairs": [
                        {"Description": "Ceph OSDs", "GroupId": sg_id},
                    ],
                },
                {
                    "FromPort": 3300,
                    "ToPort": 3300,
                    "IpProtocol": "tcp",
                    "UserIdGroupPairs": [
                        {"Description": "Ceph MONs rule1", "GroupId": sg_id},
                    ],
                },
                {
                    "FromPort": 6789,
                    "ToPort": 6789,
                    "IpProtocol": "tcp",
                    "UserIdGroupPairs": [
                        {"Description": "Ceph MONs rule2", "GroupId": sg_id},
                    ],
                },
                {
                    "FromPort": 8443,
                    "ToPort": 8443,
                    "IpProtocol": "tcp",
                    "UserIdGroupPairs": [
                        {"Description": "Ceph Dashboard rule1", "GroupId": sg_id},
                    ],
                },
                {
                    "FromPort": 8080,
                    "ToPort": 8080,
                    "IpProtocol": "tcp",
                    "UserIdGroupPairs": [
                        {"Description": "Ceph Dashboard rule2", "GroupId": sg_id},
                    ],
                },
            ],
        )

    def add_node(self):
        # TODO: Implement later
        super(AWSBase, self).add_node()

    def check_cluster_existence(self, cluster_name_prefix):
        """
        Check cluster existence according to cluster name prefix

        Returns:
            bool: True if a cluster with the same name prefix already exists,
                False otherwise

        """
        cluster_name_pattern = cluster_name_prefix + "*"
        instances = self.aws.get_instances_by_name_pattern(
            cluster_name_pattern)
        instance_objs = [
            self.aws.get_ec2_instance(ins.get("id")) for ins in instances
        ]
        non_terminated_instances = [
            ins for ins in instance_objs
            if ins.state.get("Code") != constants.INSTANCE_TERMINATED
        ]
        if non_terminated_instances:
            logger.error(
                f"Non terminated EC2 instances with the same name prefix were"
                f" found: {[ins.id for ins in non_terminated_instances]}")
            return True
        return False
Example #16
def cleanup(cluster_name, cluster_id, upi=False, failed_deletions=None):
    """
    Cleanup existing cluster in AWS

    Args:
        cluster_name (str): Name of the cluster
        cluster_id (str): Cluster id to cleanup
        upi (bool): True for UPI cluster, False otherwise
        failed_deletions (list): list of clusters we failed to delete, used
            for reporting purposes

    """
    data = {'cluster_name': cluster_name, 'cluster_id': cluster_id}
    template = templating.Templating(base_path=TEMPLATE_CLEANUP_DIR)
    cleanup_template = template.render_template(CLEANUP_YAML, data)
    cleanup_path = tempfile.mkdtemp(prefix='cleanup_')
    cleanup_file = os.path.join(cleanup_path, 'metadata.json')
    with open(cleanup_file, "w") as temp:
        temp.write(cleanup_template)
    bin_dir = os.path.expanduser(config.RUN['bin_dir'])
    oc_bin = os.path.join(bin_dir, "openshift-install")

    if upi:
        aws = AWS()
        rhel_workers = get_rhel_worker_instances(cleanup_path)
        logger.info(f"{cluster_name}'s RHEL workers: {rhel_workers}")
        if rhel_workers:
            terminate_rhel_workers(rhel_workers)
        # Destroy extra volumes
        destroy_volumes(cluster_name)

        stack_names = list()
        # Get master, bootstrap and security group stacks
        for stack_type in ['ma', 'bs', 'sg']:
            try:
                stack_names.append(
                    aws.get_cloudformation_stacks(
                        pattern=f"{cluster_name}-{stack_type}")[0]
                    ['StackName'])
            except ClientError:
                continue

        # Get the worker stacks
        worker_index = 0
        worker_stack_exists = True
        while worker_stack_exists:
            try:
                stack_names.append(
                    aws.get_cloudformation_stacks(
                        pattern=f"{cluster_name}-no{worker_index}")[0]
                    ['StackName'])
                worker_index += 1
            except ClientError:
                worker_stack_exists = False

        logger.info(f"Deleting stacks: {stack_names}")
        aws.delete_cloudformation_stacks(stack_names)

        # Destroy the cluster
        logger.info(f"cleaning up {cluster_id}")
        destroy_cluster(installer=oc_bin, cluster_path=cleanup_path)

        for stack_type in ['inf', 'vpc']:
            try:
                stack_names.append(
                    aws.get_cloudformation_stacks(
                        pattern=f"{cluster_name}-{stack_type}")[0]
                    ['StackName'])
            except ClientError:
                continue
        try:
            aws.delete_cloudformation_stacks(stack_names)
        except StackStatusError:
            logger.error('Failed to fully destroy cluster %s', cluster_name)
            if failed_deletions:
                failed_deletions.append(cluster_name)
            raise
    else:
        logger.info(f"cleaning up {cluster_id}")
        try:
            destroy_cluster(installer=oc_bin, cluster_path=cleanup_path)
        except CommandFailed:
            logger.error('Failed to fully destroy cluster %s', cluster_name)
            if failed_deletions:
                failed_deletions.append(cluster_name)
            raise
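A minimal usage sketch, assuming the function above is importable; the cluster name and id are illustrative, and failed_deletions lets the caller collect clusters whose teardown raised an exception:

# Hypothetical invocation for a UPI cluster; names are illustrative.
failed_deletions = []
cleanup(
    cluster_name="jnk-cluster",
    cluster_id="jnk-cluster-gqtd4",
    upi=True,
    failed_deletions=failed_deletions,
)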
Example #17
class ROSA(CloudDeploymentBase):
    """
    Deployment class for ROSA.
    """

    OCPDeployment = ROSAOCP

    def __init__(self):
        self.name = self.__class__.__name__
        super(ROSA, self).__init__()
        ocm.download_ocm_cli()
        rosa.download_rosa_cli()
        self.aws = AWSUtil(self.region)

    def deploy_ocp(self, log_cli_level="DEBUG"):
        """
        Deployment specific to OCP cluster on a cloud platform.

        Args:
            log_cli_level (str): openshift installer's log level
                (default: "DEBUG")

        """
        ocm.login()
        super(ROSA, self).deploy_ocp(log_cli_level)
        if config.DEPLOYMENT.get("host_network"):
            self.host_network_update()

    def check_cluster_existence(self, cluster_name_prefix):
        """
        Check cluster existence based on a cluster name.

        Args:
            cluster_name_prefix (str): name prefix which identifies a cluster

        Returns:
            bool: True if a cluster with the same name prefix already exists,
                False otherwise

        """
        cluster_list = ocm.list_cluster()
        for cluster in cluster_list:
            name, state = cluster
            if state != "uninstalling" and name.startswith(cluster_name_prefix):
                return True
        return False

    def deploy_ocs(self):
        """
        Deployment of ODF Managed Service addon on ROSA.
        """
        ceph_cluster = ocp.OCP(kind="CephCluster", namespace=self.namespace)
        try:
            ceph_cluster.get().get("items")[0]
            logger.warning("OCS cluster already exists")
            return
        except (IndexError, CommandFailed):
            logger.info("Running OCS basic installation")
        rosa.install_odf_addon(self.cluster_name)
        pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)

        if config.ENV_DATA.get("cluster_type") != "consumer":
            # Check for Ceph pods
            assert pod.wait_for_resource(
                condition="Running",
                selector=constants.MON_APP_LABEL,
                resource_count=3,
                timeout=600,
            )
            assert pod.wait_for_resource(
                condition="Running", selector=constants.MGR_APP_LABEL, timeout=600
            )
            assert pod.wait_for_resource(
                condition="Running",
                selector=constants.OSD_APP_LABEL,
                resource_count=3,
                timeout=600,
            )

        if config.DEPLOYMENT.get("pullsecret_workaround"):
            update_pull_secret()
        if config.ENV_DATA.get("cluster_type") == "consumer":
            patch_consumer_toolbox()

        # Verify health of ceph cluster
        ceph_health_check(namespace=self.namespace, tries=60, delay=10)

    def destroy_ocs(self):
        """
        Uninstall ODF Managed Service addon via rosa cli.
        """
        cluster_namespace = config.ENV_DATA["cluster_namespace"]

        # Deleting PVCs
        rbd_pvcs = [
            p
            for p in pvc.get_all_pvcs_in_storageclass(constants.CEPHBLOCKPOOL_SC)
            if not (
                p.data["metadata"]["namespace"] == cluster_namespace
                and p.data["metadata"]["labels"]["app"] == "noobaa"
            )
        ]
        pvc.delete_pvcs(rbd_pvcs)
        cephfs_pvcs = pvc.get_all_pvcs_in_storageclass(constants.CEPHFILESYSTEM_SC)
        pvc.delete_pvcs(cephfs_pvcs)
        rosa.delete_odf_addon(self.cluster_name)

    def host_network_update(self):
        """
        Update security group rules for HostNetwork
        """
        infrastructure_id = ocp.OCP().exec_oc_cmd(
            "get -o jsonpath='{.status.infrastructureName}{\"\\n\"}' infrastructure cluster"
        )
        worker_pattern = f"{infrastructure_id}-worker*"
        worker_instances = self.aws.get_instances_by_name_pattern(worker_pattern)
        security_groups = worker_instances[0]["security_groups"]
        sg_id = security_groups[0]["GroupId"]
        security_group = self.aws.ec2_resource.SecurityGroup(sg_id)
        # The ports are not 100 % clear yet. Taken from doc:
        # https://docs.google.com/document/d/1RM8tmMbvnJcOZFdsqbCl9RvHXBv5K2ZI6ziQ-YTloGk/edit#
        security_group.authorize_ingress(
            DryRun=False,
            IpPermissions=[
                {
                    "FromPort": 6800,
                    "ToPort": 7300,
                    "IpProtocol": "tcp",
                    "IpRanges": [
                        {"CidrIp": "10.0.0.0/16", "Description": "Ceph OSDs"},
                    ],
                },
                {
                    "FromPort": 3300,
                    "ToPort": 3300,
                    "IpProtocol": "tcp",
                    "IpRanges": [
                        {"CidrIp": "10.0.0.0/16", "Description": "Ceph MONs rule1"}
                    ],
                },
                {
                    "FromPort": 6789,
                    "ToPort": 6789,
                    "IpProtocol": "tcp",
                    "IpRanges": [
                        {"CidrIp": "10.0.0.0/16", "Description": "Ceph MONs rule2"},
                    ],
                },
                {
                    "FromPort": 9283,
                    "ToPort": 9283,
                    "IpProtocol": "tcp",
                    "IpRanges": [
                        {"CidrIp": "10.0.0.0/16", "Description": "Ceph Manager"},
                    ],
                },
                {
                    "FromPort": 31659,
                    "ToPort": 31659,
                    "IpProtocol": "tcp",
                    "IpRanges": [
                        {"CidrIp": "10.0.0.0/16", "Description": "API Server"},
                    ],
                },
            ],
        )
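A minimal end-to-end sketch for the class above, assuming the framework config is populated and the OCM/ROSA CLIs can be downloaded in __init__; the log level is illustrative:

# Hypothetical ROSA deployment flow using the methods defined above.
rosa_deployment = ROSA()
rosa_deployment.deploy_ocp(log_cli_level="INFO")
rosa_deployment.deploy_ocs()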