def create_ebs_volumes(
    worker_pattern,
    size=100,
    region_name=None,
):
    """
    Create volumes on workers

    Args:
        worker_pattern (str): Worker name pattern e.g.: cluster-55jx2-worker*
        size (int): Size in GB (default: 100)
        region_name (str): Region name (default: config.ENV_DATA['region'])

    """
    # Resolve the region before constructing the AWS client, otherwise the
    # client would be created with region_name=None
    region_name = region_name or config.ENV_DATA['region']
    aws = AWS(region_name)
    worker_instances = aws.get_instances_by_name_pattern(worker_pattern)
    with parallel() as p:
        for worker in worker_instances:
            log.info(
                f"Creating and attaching {size} GB volume to {worker['name']}"
            )
            p.spawn(
                aws.create_volume_and_attach,
                availability_zone=worker['avz'],
                instance_id=worker['id'],
                name=f"{worker['name']}_extra_volume",
                size=size,
            )
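# Example usage (a minimal sketch; the worker name pattern and region below
# are hypothetical):
#
#     create_ebs_volumes('mycluster-55jx2-worker*', size=200,
#                        region_name='us-east-2')
#
# Each matching worker gets one extra 200 GB EBS volume created in its own
# availability zone, with attachments running in parallel.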
def __init__(self):
    """
    This would be base for both IPI and UPI deployment
    """
    super(AWSBase, self).__init__()
    self.region = config.ENV_DATA['region']
    self.aws = AWSUtil(self.region)
def __init__(self):
    """
    This would be base for both IPI and UPI deployment
    """
    super(AWSBase, self).__init__()
    self.aws = AWSUtil(self.region)
    # dict of cluster prefixes with special handling rules (for existence
    # check or during a cluster cleanup)
    self.cluster_prefixes_special_rules = CLUSTER_PREFIXES_SPECIAL_RULES
def destroy_cluster(cluster_path, log_level="DEBUG"):
    """
    Destroy existing cluster resources in AWS.

    Args:
        cluster_path (str): filepath to cluster directory to be destroyed
        log_level (str): log level to set for openshift_installer

    """
    # Download installer
    installer = get_openshift_installer()

    destroy_cmd = (
        f"{installer} destroy cluster "
        f"--dir {cluster_path} "
        f"--log-level {log_level}"
    )

    try:
        cluster_path = os.path.normpath(cluster_path)

        # Retrieve cluster name and aws region from metadata
        metadata_file = os.path.join(cluster_path, "metadata.json")
        with open(metadata_file) as f:
            metadata = json.loads(f.read())
        cluster_name = metadata.get("clusterName")
        region_name = metadata.get("aws").get("region")

        # Execute destroy cluster using OpenShift installer
        log.info(f"Destroying cluster defined in {cluster_path}")
        run_cmd(destroy_cmd)

        # Find and delete volumes
        aws = AWS(region_name)
        volume_pattern = f"{cluster_name}*"
        log.debug(f"Finding volumes with pattern: {volume_pattern}")
        volumes = aws.get_volumes_by_name_pattern(volume_pattern)
        log.debug(f"Found volumes: \n {volumes}")
        for volume in volumes:
            aws.detach_and_delete_volume(
                aws.ec2_resource.Volume(volume['id'])
            )

        # Remove installer
        delete_file(installer)

    except Exception:
        log.error(traceback.format_exc())
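# Example usage (a sketch; the cluster directory path is hypothetical and is
# expected to contain the metadata.json written by openshift-install):
#
#     destroy_cluster('/home/user/clusters/mycluster', log_level='INFO')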
def __init__(self):
    """
    This would be base for both IPI and UPI deployment
    """
    super(AWSBase, self).__init__()
    self.region = config.ENV_DATA['region']
    self.aws = AWSUtil(self.region)
    if config.ENV_DATA.get('cluster_name'):
        self.cluster_name = config.ENV_DATA['cluster_name']
    else:
        self.cluster_name = get_cluster_name(self.cluster_path)
def get_clusters_to_delete(time_to_delete, region_name, prefixes_hours_to_spare):
    """
    Get the names of all clusters whose EC2 instances have been running
    longer than the specified time to delete

    Args:
        time_to_delete (int): The maximum time in seconds that is allowed
            for clusters to continue running
        region_name (str): The name of the AWS region to delete the
            resources from
        prefixes_hours_to_spare (dict): Dictionary of the cluster prefixes
            to spare along with the maximum time in hours that is allowed
            for spared clusters to continue running

    Returns:
        tuple: List of the cluster names (e.g. ebenahar-cluster-gqtd4) to be
            provided to the ci-cleanup script and a list of VPCs that are
            part of cloudformations

    """
    aws = AWS(region_name=region_name)
    clusters_to_delete = list()
    cloudformation_vpcs = list()
    vpcs = aws.ec2_client.describe_vpcs()['Vpcs']
    vpc_ids = [vpc['VpcId'] for vpc in vpcs]
    vpc_objs = [aws.ec2_resource.Vpc(vpc_id) for vpc_id in vpc_ids]
    for vpc_obj in vpc_objs:
        vpc_tags = vpc_obj.tags
        vpc_cloudformation = [
            tag['Value'] for tag in vpc_tags
            if tag['Key'] == defaults.AWS_CLOUDFORMATION_TAG
        ]
        if vpc_cloudformation:
            cloudformation_vpcs.append(vpc_cloudformation)
            continue
        vpc_name = [
            tag['Value'] for tag in vpc_tags if tag['Key'] == 'Name'
        ][0]
        cluster_name = vpc_name[:-4]
        vpc_instances = vpc_obj.instances.all()
        if not vpc_instances:
            clusters_to_delete.append(cluster_name)
        for instance in vpc_instances:
            allowed_running_time = time_to_delete
            if instance.state["Name"] == "running":
                for prefix, hours in prefixes_hours_to_spare.items():
                    if prefix in cluster_name:
                        allowed_running_time = int(hours) * 60 * 60
                        break
                launch_time = instance.launch_time
                current_time = datetime.datetime.now(launch_time.tzinfo)
                running_time = current_time - launch_time
                # total_seconds() is used here since timedelta.seconds
                # ignores full days of running time
                if running_time.total_seconds() > allowed_running_time:
                    clusters_to_delete.append(cluster_name)
                    break
    return clusters_to_delete, cloudformation_vpcs
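# Example usage (a sketch; the prefix names and hour values are hypothetical,
# and 36000 seconds == 10 hours):
#
#     clusters, cf_vpcs = get_clusters_to_delete(
#         time_to_delete=36000,
#         region_name='us-east-2',
#         prefixes_hours_to_spare={'jnk': '36', 'dnd': '240'},
#     )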
class AWSBase(Deployment):
    def __init__(self):
        """
        This would be base for both IPI and UPI deployment
        """
        super(AWSBase, self).__init__()
        self.region = config.ENV_DATA['region']
        self.aws = AWSUtil(self.region)

    def create_ebs_volumes(self, worker_pattern, size=100):
        """
        Add new ebs volumes to the workers

        Args:
            worker_pattern (str): Worker name pattern e.g.:
                cluster-55jx2-worker*
            size (int): Size in GB (default: 100)
        """
        worker_instances = self.aws.get_instances_by_name_pattern(
            worker_pattern
        )
        with parallel() as p:
            for worker in worker_instances:
                logger.info(
                    f"Creating and attaching {size} GB "
                    f"volume to {worker['name']}"
                )
                p.spawn(
                    self.aws.create_volume_and_attach,
                    availability_zone=worker['avz'],
                    instance_id=worker['id'],
                    name=f"{worker['name']}_extra_volume",
                    size=size,
                )

    def add_volume(self, size=100):
        """
        Add a new volume to all the workers

        Args:
            size (int): Size of volume in GB (default: 100)
        """
        tfvars_file = "terraform.tfvars.json"
        with open(os.path.join(self.cluster_path, tfvars_file)) as f:
            tfvars = json.load(f)

        cluster_id = tfvars['cluster_id']
        worker_pattern = f'{cluster_id}-worker*'
        logger.info(f'Worker pattern: {worker_pattern}')
        self.create_ebs_volumes(worker_pattern, size)

    def add_node(self):
        # TODO: Implement later
        super(AWSBase, self).add_node()
def get_clusters_to_delete(time_to_delete, region_name, prefixes_to_spare):
    """
    Get the names of all clusters whose EC2 instances have been running
    longer than the specified time to delete

    Args:
        time_to_delete (int): The maximum time in seconds that is allowed
            for clusters to continue running
        region_name (str): The name of the AWS region to delete the
            resources from
        prefixes_to_spare (list): The cluster prefixes to spare

    Returns:
        list: The cluster names (e.g. ebenahar-cluster-gqtd4) to be provided
            to the ci-cleanup script

    """
    aws = AWS(region_name=region_name)
    clusters_to_delete = list()
    vpcs = aws.ec2_client.describe_vpcs()['Vpcs']
    vpc_ids = [vpc['VpcId'] for vpc in vpcs]
    vpc_objs = [aws.ec2_resource.Vpc(vpc_id) for vpc_id in vpc_ids]
    for vpc_obj in vpc_objs:
        vpc_name = [
            tag['Value'] for tag in vpc_obj.tags if tag['Key'] == 'Name'
        ][0]
        cluster_name = vpc_name[:-4]
        # Proceed only if the cluster name contains none of the spared
        # prefixes ('all' is used here; 'any(... not in ...)' would match
        # almost every cluster and delete spared ones as well)
        if all(prefix not in cluster_name for prefix in prefixes_to_spare):
            vpc_instances = vpc_obj.instances.all()
            if not vpc_instances:
                clusters_to_delete.append(cluster_name)
            for instance in vpc_instances:
                if instance.state["Name"] == "running":
                    launch_time = instance.launch_time
                    current_time = datetime.datetime.now(launch_time.tzinfo)
                    running_time = current_time - launch_time
                    # total_seconds() is used here since timedelta.seconds
                    # ignores full days of running time
                    if running_time.total_seconds() > time_to_delete:
                        clusters_to_delete.append(cluster_name)
                        break
    return clusters_to_delete
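# Example usage of the list-based variant (a sketch; the spared prefixes are
# hypothetical):
#
#     stale_clusters = get_clusters_to_delete(
#         time_to_delete=36000,
#         region_name='us-east-2',
#         prefixes_to_spare=['jnk', 'dnd'],
#     )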
class AWSBase(Deployment):
    def __init__(self):
        """
        This would be base for both IPI and UPI deployment
        """
        super(AWSBase, self).__init__()
        self.region = config.ENV_DATA['region']
        self.aws = AWSUtil(self.region)
        if config.ENV_DATA.get('cluster_name'):
            self.cluster_name = config.ENV_DATA['cluster_name']
        else:
            self.cluster_name = get_cluster_name(self.cluster_path)

    def create_ebs_volumes(self, worker_pattern, size=100):
        """
        Add new ebs volumes to the workers

        Args:
            worker_pattern (str): Worker name pattern e.g.:
                cluster-55jx2-worker*
            size (int): Size in GB (default: 100)
        """
        worker_instances = self.aws.get_instances_by_name_pattern(
            worker_pattern
        )
        with parallel() as p:
            for worker in worker_instances:
                logger.info(
                    f"Creating and attaching {size} GB "
                    f"volume to {worker['name']}"
                )
                p.spawn(
                    self.aws.create_volume_and_attach,
                    availability_zone=worker['avz'],
                    instance_id=worker['id'],
                    name=f"{worker['name']}_extra_volume",
                    size=size,
                )

    def add_volume(self, size=100):
        """
        Add a new volume to all the workers

        Args:
            size (int): Size of volume in GB (default: 100)
        """
        cluster_id = get_infra_id(self.cluster_path)
        worker_pattern = f'{cluster_id}-worker*'
        logger.info(f'Worker pattern: {worker_pattern}')
        self.create_ebs_volumes(worker_pattern, size)

    def host_network_update(self):
        """
        Update security group rules for HostNetwork
        """
        cluster_id = get_infra_id(self.cluster_path)
        worker_pattern = f'{cluster_id}-worker*'
        worker_instances = self.aws.get_instances_by_name_pattern(
            worker_pattern
        )
        security_groups = worker_instances[0]['security_groups']
        sg_id = security_groups[0]['GroupId']
        security_group = self.aws.ec2_resource.SecurityGroup(sg_id)
        # The ports are not 100% clear yet. Taken from doc:
        # https://docs.google.com/document/d/1c23ooTkW7cdbHNRbCTztprVU6leDqJxcvFZ1ZvK2qtU/edit#
        security_group.authorize_ingress(
            DryRun=False,
            IpPermissions=[
                {
                    'FromPort': 6800,
                    'ToPort': 7300,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {
                            'Description': 'Ceph OSDs',
                            'GroupId': sg_id,
                        },
                    ],
                },
                {
                    'FromPort': 3300,
                    'ToPort': 3300,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {
                            'Description': 'Ceph MONs rule1',
                            'GroupId': sg_id,
                        },
                    ],
                },
                {
                    'FromPort': 6789,
                    'ToPort': 6789,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {
                            'Description': 'Ceph MONs rule2',
                            'GroupId': sg_id,
                        },
                    ],
                },
                {
                    'FromPort': 8443,
                    'ToPort': 8443,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {
                            'Description': 'Ceph Dashboard rule1',
                            'GroupId': sg_id,
                        },
                    ],
                },
                {
                    'FromPort': 8080,
                    'ToPort': 8080,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {
                            'Description': 'Ceph Dashboard rule2',
                            'GroupId': sg_id,
                        },
                    ],
                },
            ],
        )

    def add_node(self):
        # TODO: Implement later
        super(AWSBase, self).add_node()

    def check_cluster_existence(self, cluster_name_prefix):
        """
        Check cluster existence according to cluster name prefix

        Returns:
            bool: True if a cluster with the same name prefix already exists,
                False otherwise
        """
        instances = self.aws.get_instances_by_name_pattern(cluster_name_prefix)
        instance_objs = [
            self.aws.get_ec2_instance(ins.get('id')) for ins in instances
        ]
        non_terminated_instances = [
            ins for ins in instance_objs
            if ins.state.get('Code') != constants.INSTANCE_TERMINATED
        ]
        if non_terminated_instances:
            logger.error(
                f"Non terminated EC2 instances with the same name prefix were"
                f" found: {[ins.id for ins in non_terminated_instances]}"
            )
            return True
        return False
def vsphere_cleanup():
    """
    Deletes the cluster and all the associated resources
    on the vSphere environment.

    Resources that are deleted:
        1. Disks
        2. VM's
        3. Resource Pool
        4. IP's from IPAM server
        5. Resource records from Hosted Zone
        6. Hosted Zone from AWS
        7. Records from Base Domain

    """
    parser = argparse.ArgumentParser(
        description="vSphere cluster cleanup",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--cluster_name",
        action="store",
        required=True,
        help="The name of the cluster to delete from vSphere",
    )
    parser.add_argument(
        "--vsphere_conf",
        action="store",
        required=True,
        type=argparse.FileType("r", encoding="UTF-8"),
        help="""vSphere configuration file in yaml format.
            Example file:
                ---
                ENV_DATA:
                  # aws region
                  region: 'us-east-2'
                  base_domain: 'qe.rh-ocs.com'
                  # vsphere details
                  vsphere_server: '<your_vcenter.lab.com>'
                  vsphere_user: '******'
                  vsphere_password: '******'
                  vsphere_cluster: '<cluster name>'
                  vsphere_datacenter: '<datacenter name>'
                  ipam: '<IP>'
                  ipam_token: '<IPAM token>'
            """,
    )
    args = parser.parse_args()

    cluster_name = args.cluster_name
    vsphere_conf = args.vsphere_conf

    # load vsphere_conf data to config
    vsphere_config_data = yaml.safe_load(vsphere_conf)
    framework.config.update(vsphere_config_data)
    vsphere_conf.close()

    # get connection to vSphere
    server = config.ENV_DATA["vsphere_server"]
    user = config.ENV_DATA["vsphere_user"]
    password = config.ENV_DATA["vsphere_password"]
    vsphere = get_vsphere_connection(server, user, password)

    # delete the cluster
    delete_cluster(vsphere, cluster_name)

    # release IP's from IPAM server
    ipam = IPAM()
    ipam.delete_ips(cluster_name=cluster_name)

    # Delete AWS route
    aws = AWS()
    aws.delete_hosted_zone(cluster_name=cluster_name)

    # Delete records in base domain
    base_domain = config.ENV_DATA["base_domain"]
    aws.delete_record_from_base_domain(
        cluster_name=cluster_name, base_domain=base_domain
    )
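# Example invocation (a sketch; the script and configuration file names are
# hypothetical):
#
#     python vsphere_cleanup.py --cluster_name mycluster \
#         --vsphere_conf vsphere_conf.yaml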
def __init__(self):
    self.name = self.__class__.__name__
    super(ROSA, self).__init__()
    ocm.download_ocm_cli()
    rosa.download_rosa_cli()
    self.aws = AWSUtil(self.region)
def get_clusters(time_to_delete, region_name, prefixes_hours_to_spare):
    """
    Get the names of all clusters whose EC2 instances have been running
    longer than the specified time to delete

    Args:
        time_to_delete (int): The maximum time in seconds that is allowed
            for clusters to continue running
        region_name (str): The name of the AWS region to delete the
            resources from
        prefixes_hours_to_spare (dict): Dictionary of the cluster prefixes
            to spare along with the maximum time in hours that is allowed
            for spared clusters to continue running

    Returns:
        tuple: List of the cluster names (e.g. ebenahar-cluster-gqtd4) to be
            provided to the ci-cleanup script, a list of VPCs that are part
            of cloudformation, and a list of remaining clusters

    """
    def determine_cluster_deletion(ec2_instances, cluster_name):
        for instance in ec2_instances:
            allowed_running_time = time_to_delete
            do_not_delete = False
            if instance.state["Name"] == "running":
                for prefix, hours in prefixes_hours_to_spare.items():
                    # case insensitive 'startswith'
                    if bool(re.match(prefix, cluster_name, re.I)):
                        if hours == 'never':
                            do_not_delete = True
                        else:
                            allowed_running_time = int(hours) * 60 * 60
                        break
                if do_not_delete:
                    logger.info(
                        "%s marked as 'do not delete' and will not be "
                        "destroyed", cluster_name
                    )
                    return False
                else:
                    launch_time = instance.launch_time
                    current_time = datetime.datetime.now(launch_time.tzinfo)
                    running_time = current_time - launch_time
                    instance_name = [
                        tag['Value'] for tag in instance.tags
                        if tag['Key'] == 'Name'
                    ][0]
                    logger.info(
                        f"Instance {instance_name} (id: {instance.id}) "
                        f"running time is {running_time} while the allowed "
                        f"running time for it is "
                        f"{allowed_running_time / 3600} hours"
                    )
                    if running_time.total_seconds() > allowed_running_time:
                        return True
        return False

    aws = AWS(region_name=region_name)
    clusters_to_delete = list()
    remaining_clusters = list()
    cloudformation_vpc_names = list()
    vpcs = aws.ec2_client.describe_vpcs()['Vpcs']
    vpc_ids = [vpc['VpcId'] for vpc in vpcs]
    vpc_objs = [aws.ec2_resource.Vpc(vpc_id) for vpc_id in vpc_ids]
    for vpc_obj in vpc_objs:
        vpc_tags = vpc_obj.tags
        if vpc_tags:
            cloudformation_vpc_name = [
                tag['Value'] for tag in vpc_tags
                if tag['Key'] == defaults.AWS_CLOUDFORMATION_TAG
            ]
            if cloudformation_vpc_name:
                cloudformation_vpc_names.append(cloudformation_vpc_name[0])
                continue
            vpc_name = [
                tag['Value'] for tag in vpc_tags if tag['Key'] == 'Name'
            ][0]
            cluster_name = vpc_name.replace('-vpc', '')
            vpc_instances = vpc_obj.instances.all()
            if not vpc_instances:
                clusters_to_delete.append(cluster_name)
                continue
            # Append to clusters_to_delete if cluster should be deleted
            if determine_cluster_deletion(vpc_instances, cluster_name):
                clusters_to_delete.append(cluster_name)
            else:
                remaining_clusters.append(cluster_name)
        else:
            logger.info("No tags found for VPC")

    # Get all cloudformation based clusters to delete
    cf_clusters_to_delete = list()
    for vpc_name in cloudformation_vpc_names:
        instance_dicts = aws.get_instances_by_name_pattern(
            f"{vpc_name.replace('-vpc', '')}*"
        )
        ec2_instances = [
            aws.get_ec2_instance(instance_dict['id'])
            for instance_dict in instance_dicts
        ]
        if not ec2_instances:
            continue
        cluster_io_tag = None
        for instance in ec2_instances:
            cluster_io_tag = [
                tag['Key'] for tag in instance.tags
                if 'kubernetes.io/cluster' in tag['Key']
            ]
            if cluster_io_tag:
                break
        if not cluster_io_tag:
            logger.warning(
                "Unable to find valid cluster IO tag from ec2 instance tags "
                "for VPC %s. This is probably not an OCS cluster VPC!",
                vpc_name
            )
            continue
        cluster_name = cluster_io_tag[0].replace('kubernetes.io/cluster/', '')
        if determine_cluster_deletion(ec2_instances, cluster_name):
            cf_clusters_to_delete.append(cluster_name)
        else:
            remaining_clusters.append(cluster_name)

    return clusters_to_delete, cf_clusters_to_delete, remaining_clusters
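# Example usage (a sketch; the prefix rules are hypothetical, and the special
# value 'never' spares matching clusters indefinitely):
#
#     to_delete, cf_to_delete, remaining = get_clusters(
#         time_to_delete=36000,
#         region_name='us-east-2',
#         prefixes_hours_to_spare={'jnk': '36', 'dnd': 'never'},
#     )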
class AWSBase(CloudDeploymentBase):

    # default storage class for StorageCluster CRD on AWS platform
    DEFAULT_STORAGECLASS = "gp2"

    def __init__(self):
        """
        This would be base for both IPI and UPI deployment
        """
        super(AWSBase, self).__init__()
        self.aws = AWSUtil(self.region)

    def host_network_update(self):
        """
        Update security group rules for HostNetwork
        """
        cluster_id = get_infra_id(self.cluster_path)
        worker_pattern = f'{cluster_id}-worker*'
        worker_instances = self.aws.get_instances_by_name_pattern(
            worker_pattern
        )
        security_groups = worker_instances[0]['security_groups']
        sg_id = security_groups[0]['GroupId']
        security_group = self.aws.ec2_resource.SecurityGroup(sg_id)
        # The ports are not 100% clear yet. Taken from doc:
        # https://docs.google.com/document/d/1c23ooTkW7cdbHNRbCTztprVU6leDqJxcvFZ1ZvK2qtU/edit#
        security_group.authorize_ingress(
            DryRun=False,
            IpPermissions=[
                {
                    'FromPort': 6800,
                    'ToPort': 7300,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {
                            'Description': 'Ceph OSDs',
                            'GroupId': sg_id,
                        },
                    ],
                },
                {
                    'FromPort': 3300,
                    'ToPort': 3300,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {
                            'Description': 'Ceph MONs rule1',
                            'GroupId': sg_id,
                        },
                    ],
                },
                {
                    'FromPort': 6789,
                    'ToPort': 6789,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {
                            'Description': 'Ceph MONs rule2',
                            'GroupId': sg_id,
                        },
                    ],
                },
                {
                    'FromPort': 8443,
                    'ToPort': 8443,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {
                            'Description': 'Ceph Dashboard rule1',
                            'GroupId': sg_id,
                        },
                    ],
                },
                {
                    'FromPort': 8080,
                    'ToPort': 8080,
                    'IpProtocol': 'tcp',
                    'UserIdGroupPairs': [
                        {
                            'Description': 'Ceph Dashboard rule2',
                            'GroupId': sg_id,
                        },
                    ],
                },
            ],
        )

    def add_node(self):
        # TODO: Implement later
        super(AWSBase, self).add_node()

    def check_cluster_existence(self, cluster_name_prefix):
        """
        Check cluster existence according to cluster name prefix

        Returns:
            bool: True if a cluster with the same name prefix already exists,
                False otherwise
        """
        cluster_name_pattern = cluster_name_prefix + "*"
        instances = self.aws.get_instances_by_name_pattern(
            cluster_name_pattern
        )
        instance_objs = [
            self.aws.get_ec2_instance(ins.get('id')) for ins in instances
        ]
        non_terminated_instances = [
            ins for ins in instance_objs
            if ins.state.get('Code') != constants.INSTANCE_TERMINATED
        ]
        if non_terminated_instances:
            logger.error(
                f"Non terminated EC2 instances with the same name prefix were"
                f" found: {[ins.id for ins in non_terminated_instances]}"
            )
            return True
        return False
def create_cluster(cluster_name, version, region):
    """
    Create OCP cluster.

    Args:
        cluster_name (str): Cluster name
        version (str): cluster version
        region (str): Cluster region

    """
    rosa_ocp_version = config.DEPLOYMENT["installer_version"]
    # Validate ocp version against the rosa supported ocp versions and
    # select the latest valid version if the given version is invalid
    if not validate_ocp_version(rosa_ocp_version):
        logger.warning(
            f"Given OCP version {rosa_ocp_version} "
            f"is not a valid ROSA OCP version. "
            f"Selecting latest rosa version for deployment"
        )
        rosa_ocp_version = get_latest_rosa_version(version)
        logger.info(f"Using OCP version {rosa_ocp_version}")

    create_account_roles(version)
    compute_nodes = config.ENV_DATA["worker_replicas"]
    compute_machine_type = config.ENV_DATA["worker_instance_type"]
    multi_az = (
        "--multi-az " if config.ENV_DATA.get("multi_availability_zones") else ""
    )
    cluster_type = config.ENV_DATA.get("cluster_type", "")
    provider_name = config.ENV_DATA.get("provider_name", "")
    rosa_mode = config.ENV_DATA.get("rosa_mode", "")
    cmd = (
        f"rosa create cluster --cluster-name {cluster_name} --region {region} "
        f"--compute-nodes {compute_nodes} --compute-machine-type "
        f"{compute_machine_type} --version {rosa_ocp_version} {multi_az}--sts --yes"
    )
    if rosa_mode == "auto":
        cmd += " --mode auto"
    if cluster_type.lower() == "consumer" and provider_name:
        aws = AWSUtil()
        subnet_id = ",".join(aws.get_cluster_subnet_ids(provider_name))
        cmd = f"{cmd} --subnet-ids {subnet_id}"

    utils.run_cmd(cmd, timeout=1200)
    if rosa_mode != "auto":
        logger.info(
            "Waiting for ROSA cluster status to change to waiting or "
            "pending state"
        )
        for cluster_info in utils.TimeoutSampler(
            4500, 30, ocm.get_cluster_details, cluster_name
        ):
            status = cluster_info["status"]["state"]
            logger.info(f"Current installation status: {status}")
            if status in ("waiting", "pending"):
                logger.info(f"Cluster is in {status} state")
                break
        create_operator_roles(cluster_name)
        create_oidc_provider(cluster_name)

    logger.info("Waiting for installation of ROSA cluster")
    for cluster_info in utils.TimeoutSampler(
        4500, 30, ocm.get_cluster_details, cluster_name
    ):
        status = cluster_info["status"]["state"]
        logger.info(f"Current installation status: {status}")
        if status == "ready":
            logger.info("Cluster was installed")
            break
    cluster_info = ocm.get_cluster_details(cluster_name)
    # Create metadata file to store the cluster name
    cluster_info["clusterName"] = cluster_name
    cluster_info["clusterID"] = cluster_info["id"]
    cluster_path = config.ENV_DATA["cluster_path"]
    metadata_file = os.path.join(cluster_path, "metadata.json")
    with open(metadata_file, "w+") as f:
        json.dump(cluster_info, f)
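# Example usage (a sketch; assumes the rosa and ocm CLIs are downloaded, AWS
# credentials are configured, and the framework config is populated):
#
#     create_cluster("my-rosa-cluster", version="4.10", region="us-east-2")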
class AWSBase(CloudDeploymentBase):

    # default storage class for StorageCluster CRD on AWS platform
    DEFAULT_STORAGECLASS = "gp2"

    def __init__(self):
        """
        This would be base for both IPI and UPI deployment
        """
        super(AWSBase, self).__init__()
        self.aws = AWSUtil(self.region)
        # dict of cluster prefixes with special handling rules (for existence
        # check or during a cluster cleanup)
        self.cluster_prefixes_special_rules = CLUSTER_PREFIXES_SPECIAL_RULES

    def host_network_update(self):
        """
        Update security group rules for HostNetwork
        """
        cluster_id = get_infra_id(self.cluster_path)
        worker_pattern = f"{cluster_id}-worker*"
        worker_instances = self.aws.get_instances_by_name_pattern(
            worker_pattern
        )
        security_groups = worker_instances[0]["security_groups"]
        sg_id = security_groups[0]["GroupId"]
        security_group = self.aws.ec2_resource.SecurityGroup(sg_id)
        # The ports are not 100% clear yet. Taken from doc:
        # https://docs.google.com/document/d/1c23ooTkW7cdbHNRbCTztprVU6leDqJxcvFZ1ZvK2qtU/edit#
        security_group.authorize_ingress(
            DryRun=False,
            IpPermissions=[
                {
                    "FromPort": 6800,
                    "ToPort": 7300,
                    "IpProtocol": "tcp",
                    "UserIdGroupPairs": [
                        {
                            "Description": "Ceph OSDs",
                            "GroupId": sg_id,
                        },
                    ],
                },
                {
                    "FromPort": 3300,
                    "ToPort": 3300,
                    "IpProtocol": "tcp",
                    "UserIdGroupPairs": [
                        {
                            "Description": "Ceph MONs rule1",
                            "GroupId": sg_id,
                        },
                    ],
                },
                {
                    "FromPort": 6789,
                    "ToPort": 6789,
                    "IpProtocol": "tcp",
                    "UserIdGroupPairs": [
                        {
                            "Description": "Ceph MONs rule2",
                            "GroupId": sg_id,
                        },
                    ],
                },
                {
                    "FromPort": 8443,
                    "ToPort": 8443,
                    "IpProtocol": "tcp",
                    "UserIdGroupPairs": [
                        {
                            "Description": "Ceph Dashboard rule1",
                            "GroupId": sg_id,
                        },
                    ],
                },
                {
                    "FromPort": 8080,
                    "ToPort": 8080,
                    "IpProtocol": "tcp",
                    "UserIdGroupPairs": [
                        {
                            "Description": "Ceph Dashboard rule2",
                            "GroupId": sg_id,
                        },
                    ],
                },
            ],
        )

    def add_node(self):
        # TODO: Implement later
        super(AWSBase, self).add_node()

    def check_cluster_existence(self, cluster_name_prefix):
        """
        Check cluster existence according to cluster name prefix

        Returns:
            bool: True if a cluster with the same name prefix already exists,
                False otherwise
        """
        cluster_name_pattern = cluster_name_prefix + "*"
        instances = self.aws.get_instances_by_name_pattern(
            cluster_name_pattern
        )
        instance_objs = [
            self.aws.get_ec2_instance(ins.get("id")) for ins in instances
        ]
        non_terminated_instances = [
            ins for ins in instance_objs
            if ins.state.get("Code") != constants.INSTANCE_TERMINATED
        ]
        if non_terminated_instances:
            logger.error(
                f"Non terminated EC2 instances with the same name prefix were"
                f" found: {[ins.id for ins in non_terminated_instances]}"
            )
            return True
        return False
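# Example usage within a deployment flow (a sketch; AWSBase is normally
# instantiated through a platform subclass, and the subclass name below is
# hypothetical):
#
#     deployment = AWSIPI()
#     if deployment.check_cluster_existence("mycluster"):
#         raise Exception("Cluster with the same name prefix already exists")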
def cleanup(cluster_name, cluster_id, upi=False, failed_deletions=None):
    """
    Cleanup existing cluster in AWS

    Args:
        cluster_name (str): Name of the cluster
        cluster_id (str): Cluster id to cleanup
        upi (bool): True for UPI cluster, False otherwise
        failed_deletions (list): list of clusters we failed to delete, used
            for reporting purposes

    """
    data = {'cluster_name': cluster_name, 'cluster_id': cluster_id}
    template = templating.Templating(base_path=TEMPLATE_CLEANUP_DIR)
    cleanup_template = template.render_template(CLEANUP_YAML, data)
    cleanup_path = tempfile.mkdtemp(prefix='cleanup_')
    cleanup_file = os.path.join(cleanup_path, 'metadata.json')
    with open(cleanup_file, "w") as temp:
        temp.write(cleanup_template)

    bin_dir = os.path.expanduser(config.RUN['bin_dir'])
    oc_bin = os.path.join(bin_dir, "openshift-install")

    if upi:
        aws = AWS()
        rhel_workers = get_rhel_worker_instances(cleanup_path)
        logger.info(f"{cluster_name}'s RHEL workers: {rhel_workers}")
        if rhel_workers:
            terminate_rhel_workers(rhel_workers)
        # Destroy extra volumes
        destroy_volumes(cluster_name)

        stack_names = list()
        # Get master, bootstrap and security group stacks
        for stack_type in ['ma', 'bs', 'sg']:
            try:
                stack_names.append(
                    aws.get_cloudformation_stacks(
                        pattern=f"{cluster_name}-{stack_type}"
                    )[0]['StackName']
                )
            except ClientError:
                continue

        # Get the worker stacks
        worker_index = 0
        worker_stack_exists = True
        while worker_stack_exists:
            try:
                stack_names.append(
                    aws.get_cloudformation_stacks(
                        pattern=f"{cluster_name}-no{worker_index}"
                    )[0]['StackName']
                )
                worker_index += 1
            except ClientError:
                worker_stack_exists = False

        logger.info(f"Deleting stacks: {stack_names}")
        aws.delete_cloudformation_stacks(stack_names)

        # Destroy the cluster
        logger.info(f"cleaning up {cluster_id}")
        destroy_cluster(installer=oc_bin, cluster_path=cleanup_path)

        for stack_type in ['inf', 'vpc']:
            try:
                stack_names.append(
                    aws.get_cloudformation_stacks(
                        pattern=f"{cluster_name}-{stack_type}"
                    )[0]['StackName']
                )
            except ClientError:
                continue
        try:
            aws.delete_cloudformation_stacks(stack_names)
        except StackStatusError:
            logger.error('Failed to fully destroy cluster %s', cluster_name)
            if failed_deletions:
                failed_deletions.append(cluster_name)
            raise
    else:
        logger.info(f"cleaning up {cluster_id}")
        try:
            destroy_cluster(installer=oc_bin, cluster_path=cleanup_path)
        except CommandFailed:
            logger.error('Failed to fully destroy cluster %s', cluster_name)
            if failed_deletions:
                failed_deletions.append(cluster_name)
            raise
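# Example usage (a sketch; the cluster name and id are hypothetical):
#
#     failed_deletions = []
#     cleanup('mycluster', 'mycluster-x7k2p', upi=False,
#             failed_deletions=failed_deletions)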
class ROSA(CloudDeploymentBase):
    """
    Deployment class for ROSA.
    """

    OCPDeployment = ROSAOCP

    def __init__(self):
        self.name = self.__class__.__name__
        super(ROSA, self).__init__()
        ocm.download_ocm_cli()
        rosa.download_rosa_cli()
        self.aws = AWSUtil(self.region)

    def deploy_ocp(self, log_cli_level="DEBUG"):
        """
        Deployment specific to OCP cluster on a cloud platform.

        Args:
            log_cli_level (str): openshift installer's log level
                (default: "DEBUG")
        """
        ocm.login()
        super(ROSA, self).deploy_ocp(log_cli_level)
        if config.DEPLOYMENT.get("host_network"):
            self.host_network_update()

    def check_cluster_existence(self, cluster_name_prefix):
        """
        Check cluster existence based on a cluster name.

        Args:
            cluster_name_prefix (str): name prefix which identifies a cluster

        Returns:
            bool: True if a cluster with the same name prefix already exists,
                False otherwise
        """
        cluster_list = ocm.list_cluster()
        for cluster in cluster_list:
            name, state = cluster
            if state != "uninstalling" and name.startswith(cluster_name_prefix):
                return True
        return False

    def deploy_ocs(self):
        """
        Deployment of ODF Managed Service addon on ROSA.
        """
        ceph_cluster = ocp.OCP(kind="CephCluster", namespace=self.namespace)
        try:
            ceph_cluster.get().get("items")[0]
            logger.warning("OCS cluster already exists")
            return
        except (IndexError, CommandFailed):
            logger.info("Running OCS basic installation")
        rosa.install_odf_addon(self.cluster_name)
        pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)

        if config.ENV_DATA.get("cluster_type") != "consumer":
            # Check for Ceph pods
            assert pod.wait_for_resource(
                condition="Running",
                selector=constants.MON_APP_LABEL,
                resource_count=3,
                timeout=600,
            )
            assert pod.wait_for_resource(
                condition="Running", selector=constants.MGR_APP_LABEL, timeout=600
            )
            assert pod.wait_for_resource(
                condition="Running",
                selector=constants.OSD_APP_LABEL,
                resource_count=3,
                timeout=600,
            )

        if config.DEPLOYMENT.get("pullsecret_workaround"):
            update_pull_secret()
        if config.ENV_DATA.get("cluster_type") == "consumer":
            patch_consumer_toolbox()

        # Verify health of ceph cluster
        ceph_health_check(namespace=self.namespace, tries=60, delay=10)

    def destroy_ocs(self):
        """
        Uninstall ODF Managed Service addon via rosa cli.
        """
        cluster_namespace = config.ENV_DATA["cluster_namespace"]

        # Deleting PVCs
        rbd_pvcs = [
            p
            for p in pvc.get_all_pvcs_in_storageclass(constants.CEPHBLOCKPOOL_SC)
            if not (
                p.data["metadata"]["namespace"] == cluster_namespace
                and p.data["metadata"]["labels"]["app"] == "noobaa"
            )
        ]
        pvc.delete_pvcs(rbd_pvcs)
        cephfs_pvcs = pvc.get_all_pvcs_in_storageclass(constants.CEPHFILESYSTEM_SC)
        pvc.delete_pvcs(cephfs_pvcs)
        rosa.delete_odf_addon(self.cluster_name)

    def host_network_update(self):
        """
        Update security group rules for HostNetwork
        """
        infrastructure_id = ocp.OCP().exec_oc_cmd(
            "get -o jsonpath='{.status.infrastructureName}{\"\\n\"}' infrastructure cluster"
        )
        worker_pattern = f"{infrastructure_id}-worker*"
        worker_instances = self.aws.get_instances_by_name_pattern(worker_pattern)
        security_groups = worker_instances[0]["security_groups"]
        sg_id = security_groups[0]["GroupId"]
        security_group = self.aws.ec2_resource.SecurityGroup(sg_id)
        # The ports are not 100% clear yet. Taken from doc:
        # https://docs.google.com/document/d/1RM8tmMbvnJcOZFdsqbCl9RvHXBv5K2ZI6ziQ-YTloGk/edit#
        security_group.authorize_ingress(
            DryRun=False,
            IpPermissions=[
                {
                    "FromPort": 6800,
                    "ToPort": 7300,
                    "IpProtocol": "tcp",
                    "IpRanges": [
                        {"CidrIp": "10.0.0.0/16", "Description": "Ceph OSDs"},
                    ],
                },
                {
                    "FromPort": 3300,
                    "ToPort": 3300,
                    "IpProtocol": "tcp",
                    "IpRanges": [
                        {"CidrIp": "10.0.0.0/16", "Description": "Ceph MONs rule1"},
                    ],
                },
                {
                    "FromPort": 6789,
                    "ToPort": 6789,
                    "IpProtocol": "tcp",
                    "IpRanges": [
                        {"CidrIp": "10.0.0.0/16", "Description": "Ceph MONs rule2"},
                    ],
                },
                {
                    "FromPort": 9283,
                    "ToPort": 9283,
                    "IpProtocol": "tcp",
                    "IpRanges": [
                        {"CidrIp": "10.0.0.0/16", "Description": "Ceph Manager"},
                    ],
                },
                {
                    "FromPort": 31659,
                    "ToPort": 31659,
                    "IpProtocol": "tcp",
                    "IpRanges": [
                        {"CidrIp": "10.0.0.0/16", "Description": "API Server"},
                    ],
                },
            ],
        )
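# Example deployment flow (a sketch; assumes the framework config, an OCM
# token, and AWS credentials are already set up):
#
#     rosa_deployment = ROSA()
#     rosa_deployment.deploy_ocp(log_cli_level="INFO")
#     rosa_deployment.deploy_ocs()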