import datetime
import logging
import re

# NOTE: these fragments come from ocs-ci (deployment/aws.py and
# cleanup/aws/cleanup.py); the import paths below assume that layout and
# should be adjusted if this code lives elsewhere.
from ocs_ci.deployment.cloud import CloudDeploymentBase
from ocs_ci.ocs import constants, defaults
from ocs_ci.ocs.constants import CLUSTER_PREFIXES_SPECIAL_RULES
from ocs_ci.ocs.parallel import parallel
from ocs_ci.utility.aws import AWS
from ocs_ci.utility.aws import AWS as AWSUtil
from ocs_ci.utility.utils import get_infra_id

logger = logging.getLogger(__name__)


class AWSBase(CloudDeploymentBase):

    # default storage class for StorageCluster CRD on AWS platform
    DEFAULT_STORAGECLASS = "gp2"

    def __init__(self):
        """
        Base class for both IPI and UPI deployments on AWS
        """
        super(AWSBase, self).__init__()
        self.aws = AWSUtil(self.region)
        # dict of cluster prefixes with special handling rules (for existence
        # check or during a cluster cleanup)
        self.cluster_prefixes_special_rules = CLUSTER_PREFIXES_SPECIAL_RULES

    def create_ebs_volumes(self, worker_pattern, size=100):
        """
        Add new EBS volumes to the workers

        Args:
            worker_pattern (str): Worker name pattern e.g.: cluster-55jx2-worker*
            size (int): Size in GB (default: 100)
        """
        worker_instances = self.aws.get_instances_by_name_pattern(worker_pattern)
        with parallel() as p:
            for worker in worker_instances:
                logger.info(
                    f"Creating and attaching {size} GB volume to {worker['name']}"
                )
                p.spawn(
                    self.aws.create_volume_and_attach,
                    availability_zone=worker["avz"],
                    instance_id=worker["id"],
                    name=f"{worker['name']}_extra_volume",
                    size=size,
                )

    def add_volume(self, size=100):
        """
        Add a new volume to all the workers

        Args:
            size (int): Size of volume in GB (default: 100)
        """
        cluster_id = get_infra_id(self.cluster_path)
        worker_pattern = f"{cluster_id}-worker*"
        logger.info(f"Worker pattern: {worker_pattern}")
        self.create_ebs_volumes(worker_pattern, size)

    def host_network_update(self):
        """
        Update security group rules for HostNetwork
        """
        cluster_id = get_infra_id(self.cluster_path)
        worker_pattern = f"{cluster_id}-worker*"
        worker_instances = self.aws.get_instances_by_name_pattern(worker_pattern)
        security_groups = worker_instances[0]["security_groups"]
        sg_id = security_groups[0]["GroupId"]
        security_group = self.aws.ec2_resource.SecurityGroup(sg_id)
        # The ports are not 100% clear yet. Taken from doc:
        # https://docs.google.com/document/d/1c23ooTkW7cdbHNRbCTztprVU6leDqJxcvFZ1ZvK2qtU/edit#
        security_group.authorize_ingress(
            DryRun=False,
            IpPermissions=[
                {
                    "FromPort": 6800,
                    "ToPort": 7300,
                    "IpProtocol": "tcp",
                    "UserIdGroupPairs": [
                        {"Description": "Ceph OSDs", "GroupId": sg_id},
                    ],
                },
                {
                    "FromPort": 3300,
                    "ToPort": 3300,
                    "IpProtocol": "tcp",
                    "UserIdGroupPairs": [
                        {"Description": "Ceph MONs rule1", "GroupId": sg_id},
                    ],
                },
                {
                    "FromPort": 6789,
                    "ToPort": 6789,
                    "IpProtocol": "tcp",
                    "UserIdGroupPairs": [
                        {"Description": "Ceph MONs rule2", "GroupId": sg_id},
                    ],
                },
                {
                    "FromPort": 8443,
                    "ToPort": 8443,
                    "IpProtocol": "tcp",
                    "UserIdGroupPairs": [
                        {"Description": "Ceph Dashboard rule1", "GroupId": sg_id},
                    ],
                },
                {
                    "FromPort": 8080,
                    "ToPort": 8080,
                    "IpProtocol": "tcp",
                    "UserIdGroupPairs": [
                        {"Description": "Ceph Dashboard rule2", "GroupId": sg_id},
                    ],
                },
            ],
        )

    def add_node(self):
        # TODO: Implement later
        super(AWSBase, self).add_node()

    def check_cluster_existence(self, cluster_name_prefix):
        """
        Check cluster existence according to cluster name prefix

        Args:
            cluster_name_prefix (str): The cluster name prefix to look for

        Returns:
            bool: True if a cluster with the same name prefix already exists,
                False otherwise

        """
        cluster_name_pattern = cluster_name_prefix + "*"
        instances = self.aws.get_instances_by_name_pattern(cluster_name_pattern)
        instance_objs = [
            self.aws.get_ec2_instance(ins.get("id")) for ins in instances
        ]
        non_terminated_instances = [
            ins for ins in instance_objs
            if ins.state.get("Code") != constants.INSTANCE_TERMINATED
        ]
        if non_terminated_instances:
            logger.error(
                f"Non-terminated EC2 instances with the same name prefix were "
                f"found: {[ins.id for ins in non_terminated_instances]}"
            )
            return True
        return False
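

# Usage sketch for the class above, not part of the original module. It
# assumes a concrete AWSBase subclass instance (the `deployer` argument)
# and an already-loaded ocs-ci config with ENV_DATA (region, cluster path,
# etc.); the prefix "jnk-pr1234" is a made-up example value.
def _example_deployment_preflight(deployer):
    """
    Illustrates the intended call order: refuse to reuse a live cluster
    name prefix, attach extra EBS capacity, then open the Ceph ports needed
    when the cluster runs with HostNetwork.
    """
    if deployer.check_cluster_existence("jnk-pr1234"):
        raise RuntimeError("A cluster with this name prefix is still running")
    # one extra 256 GB volume per worker node
    deployer.add_volume(size=256)
    # Ceph OSD/MON/dashboard ingress rules on the worker security group
    deployer.host_network_update()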
def get_clusters(time_to_delete, region_name, prefixes_hours_to_spare):
    """
    Get the names of all clusters whose EC2 instances have been running
    longer than the allowed time

    Args:
        time_to_delete (int): The maximum running time in seconds allowed
            for clusters before they are considered for deletion
        region_name (str): The name of the AWS region to delete the
            resources from
        prefixes_hours_to_spare (dict): Cluster prefixes to spare, mapped
            to the maximum running time in hours allowed for the matching
            clusters, or 'never' to spare them unconditionally

    Returns:
        tuple: List of the cluster names (e.g ebenahar-cluster-gqtd4) to be
            provided to the ci-cleanup script, a list of cluster names
            provisioned via CloudFormation, and a list of the remaining
            (spared) clusters

    """
    def determine_cluster_deletion(ec2_instances, cluster_name):
        for instance in ec2_instances:
            allowed_running_time = time_to_delete
            do_not_delete = False
            if instance.state["Name"] == "running":
                for prefix, hours in prefixes_hours_to_spare.items():
                    # case insensitive 'startswith'
                    if bool(re.match(prefix, cluster_name, re.I)):
                        if hours == "never":
                            do_not_delete = True
                        else:
                            allowed_running_time = int(hours) * 60 * 60
                        break
                if do_not_delete:
                    logger.info(
                        "%s marked as 'do not delete' and will not be "
                        "destroyed", cluster_name
                    )
                    return False
                else:
                    launch_time = instance.launch_time
                    current_time = datetime.datetime.now(launch_time.tzinfo)
                    running_time = current_time - launch_time
                    instance_name = [
                        tag["Value"] for tag in instance.tags
                        if tag["Key"] == "Name"
                    ][0]
                    logger.info(
                        f"Instance {instance_name} (id: {instance.id}) has "
                        f"been running for {running_time} while the allowed "
                        f"running time for it is "
                        f"{allowed_running_time / 3600} hours"
                    )
                    if running_time.total_seconds() > allowed_running_time:
                        return True
        return False

    aws = AWS(region_name=region_name)
    clusters_to_delete = list()
    remaining_clusters = list()
    cloudformation_vpc_names = list()
    vpcs = aws.ec2_client.describe_vpcs()["Vpcs"]
    vpc_ids = [vpc["VpcId"] for vpc in vpcs]
    vpc_objs = [aws.ec2_resource.Vpc(vpc_id) for vpc_id in vpc_ids]
    for vpc_obj in vpc_objs:
        vpc_tags = vpc_obj.tags
        if vpc_tags:
            cloudformation_vpc_name = [
                tag["Value"] for tag in vpc_tags
                if tag["Key"] == defaults.AWS_CLOUDFORMATION_TAG
            ]
            if cloudformation_vpc_name:
                cloudformation_vpc_names.append(cloudformation_vpc_name[0])
                continue
            vpc_name = [
                tag["Value"] for tag in vpc_tags if tag["Key"] == "Name"
            ][0]
            cluster_name = vpc_name.replace("-vpc", "")
            # materialize the collection - a boto3 ResourceCollection is
            # always truthy, so the emptiness check below needs a list
            vpc_instances = list(vpc_obj.instances.all())
            if not vpc_instances:
                clusters_to_delete.append(cluster_name)
                continue
            # Append to clusters_to_delete if cluster should be deleted
            if determine_cluster_deletion(vpc_instances, cluster_name):
                clusters_to_delete.append(cluster_name)
            else:
                remaining_clusters.append(cluster_name)
        else:
            logger.info("No tags found for VPC %s", vpc_obj.id)

    # Get all cloudformation based clusters to delete
    cf_clusters_to_delete = list()
    for vpc_name in cloudformation_vpc_names:
        instance_dicts = aws.get_instances_by_name_pattern(
            f"{vpc_name.replace('-vpc', '')}*"
        )
        ec2_instances = [
            aws.get_ec2_instance(instance_dict["id"])
            for instance_dict in instance_dicts
        ]
        if not ec2_instances:
            continue
        cluster_io_tag = None
        for instance in ec2_instances:
            cluster_io_tag = [
                tag["Key"] for tag in instance.tags
                if "kubernetes.io/cluster" in tag["Key"]
            ]
            if cluster_io_tag:
                break
        if not cluster_io_tag:
            logger.warning(
                "Unable to find valid cluster IO tag from ec2 instance tags "
                "for VPC %s. This is probably not an OCS cluster VPC!",
                vpc_name
            )
            continue
        cluster_name = cluster_io_tag[0].replace("kubernetes.io/cluster/", "")
        if determine_cluster_deletion(ec2_instances, cluster_name):
            cf_clusters_to_delete.append(cluster_name)
        else:
            remaining_clusters.append(cluster_name)

    return clusters_to_delete, cf_clusters_to_delete, remaining_clusters
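

# Example invocation of get_clusters(), not part of the original module;
# the region and prefix map below are illustrative values, not project
# defaults. Prefixes mapped to 'never' are spared unconditionally, while
# numeric values override the allowed running time (in hours) for clusters
# whose names start with that prefix.
if __name__ == "__main__":
    to_delete, cf_to_delete, remaining = get_clusters(
        time_to_delete=10 * 60 * 60,  # default allowance: 10 hours
        region_name="us-east-2",
        prefixes_hours_to_spare={"dnd": "never", "lr1-": 24},
    )
    logger.info("openshift-install clusters to destroy: %s", to_delete)
    logger.info("CloudFormation clusters to destroy: %s", cf_to_delete)
    logger.info("Clusters left running: %s", remaining)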