def create_rhel_instance(self): """ This function does the following: 1. Create RHEL worker instances, copy required AWS tags from existing 2. worker instances to new RHEL instances 3. Copy IAM role from existing worker to new RHEL workers """ cluster_id = get_infra_id(self.cluster_path) num_workers = int(os.environ.get("num_workers", 3)) logging.info(f"Creating {num_workers} RHEL workers") for i in range(num_workers): self.gather_worker_data(f"no{i}") logging.info(f"Creating {i + 1}/{num_workers} worker") response = self.client.run_instances( BlockDeviceMappings=[ { "DeviceName": config.ENV_DATA["root_disk"], "Ebs": { "DeleteOnTermination": True, "VolumeSize": config.ENV_DATA["root_disk_size"], "VolumeType": "gp2", }, }, ], ImageId=config.ENV_DATA["rhel_worker_ami"], SubnetId=self.worker_subnet, InstanceType=config.ENV_DATA["rhel_worker_instance_type"], MaxCount=1, MinCount=1, Monitoring={"Enabled": False}, SecurityGroupIds=[ self.worker_security_group[0]["GroupId"], ], KeyName="openshift-dev", ) inst_id = response["Instances"][0]["InstanceId"] worker_ec2 = boto3.resource("ec2", region_name=self.region) worker_instance = worker_ec2.Instance(inst_id) worker_instance.wait_until_running() worker_name = f"{cluster_id}-rhel-worker-{i}" self.rhel_worker_list[worker_name] = worker_instance worker_ec2.create_tags( Resources=[inst_id], Tags=[ { "Key": "Name", "Value": f"{worker_name}" }, { "Key": self.worker_tag[0], "Value": self.worker_tag[1] }, ], ) logging.info(self.worker_iam_role) self.client.associate_iam_instance_profile( IamInstanceProfile=self.worker_iam_role, InstanceId=inst_id, )
def _prepare_upi_rhel_node(self, node_conf):
    """
    Handle RHEL worker instance creation:
    1. Create a RHEL worker instance and copy the required AWS tags from
       the existing worker instances to the new RHEL instance
    2. Copy the IAM role from an existing worker to the new RHEL worker

    Args:
        node_conf (dict): configuration of the node to create

    Returns:
        ec2.Instance: the created RHEL worker instance

    """
    cluster_id = get_infra_id(self.cluster_path)
    node_id = node_conf['node_id']
    if not node_conf.get('zone'):
        num_zone = get_az_count()
        # randint() is inclusive on both ends, so subtract 1 to stay
        # within the valid zone indices 0..num_zone-1
        zone = random.randint(0, num_zone - 1)
    else:
        zone = node_conf.get('zone')
    logger.info("Creating RHEL worker node")
    self.gather_worker_data(f'no{zone}')
    response = self.client.run_instances(
        BlockDeviceMappings=[
            {
                'DeviceName': node_conf['root_disk'],
                'Ebs': {
                    'DeleteOnTermination': True,
                    'VolumeSize': node_conf['root_disk_size'],
                    'VolumeType': 'gp2',
                },
            },
        ],
        ImageId=node_conf['rhel_worker_ami'],
        SubnetId=self.worker_subnet,
        InstanceType=node_conf['rhel_worker_instance_type'],
        MaxCount=1,
        MinCount=1,
        Monitoring={'Enabled': False},
        SecurityGroupIds=[
            self.worker_security_group[0]['GroupId'],
        ],
        KeyName='openshift-dev',
    )
    inst_id = response['Instances'][0]['InstanceId']
    worker_ec2 = boto3.resource('ec2', region_name=self.region)
    worker_instance = worker_ec2.Instance(inst_id)
    worker_instance.wait_until_running()
    worker_name = f'{cluster_id}-rhel-worker-{node_id}'
    worker_ec2.create_tags(
        Resources=[inst_id],
        Tags=[
            {'Key': 'Name', 'Value': f'{worker_name}'},
            {'Key': self.worker_tag[0], 'Value': self.worker_tag[1]},
        ],
    )
    logger.info(f"Attaching IAM instance profile: {self.worker_iam_role}")
    self.client.associate_iam_instance_profile(
        IamInstanceProfile=self.worker_iam_role,
        InstanceId=inst_id,
    )
    return worker_instance
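# Usage sketch (illustrative): the node_conf keys below are exactly the
# ones _prepare_upi_rhel_node() reads; the values are placeholders, not
# real defaults, and `aws_depl` is assumed to be an initialized
# deployment object.
node_conf = {
    'node_id': 0,
    'zone': 1,  # optional; a random AZ index is chosen when omitted
    'root_disk': '/dev/sda1',
    'root_disk_size': 120,
    'rhel_worker_ami': 'ami-0123456789abcdef0',
    'rhel_worker_instance_type': 'm5.xlarge',
}
rhel_worker = aws_depl._prepare_upi_rhel_node(node_conf)
logger.info(f"Created RHEL worker instance: {rhel_worker.id}")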
def create_rhel_instance(self): """ This function does the following: 1. Create RHEL worker instances, copy required AWS tags from existing 2. worker instances to new RHEL instances 3. Copy IAM role from existing worker to new RHEL workers """ cluster_id = get_infra_id(self.cluster_path) num_workers = int(os.environ.get('num_workers', 3)) logging.info(f"Creating {num_workers} RHEL workers") for i in range(num_workers): self.gather_worker_data(f'no{i}') logging.info(f"Creating {i + 1}/{num_workers} worker") response = self.client.run_instances( BlockDeviceMappings=[ { 'DeviceName': config.ENV_DATA['root_disk'], 'Ebs': { 'DeleteOnTermination': True, 'VolumeSize': config.ENV_DATA['root_disk_size'], 'VolumeType': 'gp2' }, }, ], ImageId=config.ENV_DATA['rhel_worker_ami'], SubnetId=self.worker_subnet, InstanceType=config.ENV_DATA['rhel_worker_instance_type'], MaxCount=1, MinCount=1, Monitoring={ 'Enabled': False }, SecurityGroupIds=[ self.worker_security_group[0]['GroupId'], ], KeyName='openshift-dev' ) inst_id = response['Instances'][0]['InstanceId'] worker_ec2 = boto3.resource('ec2', region_name=self.region) worker_instance = worker_ec2.Instance(inst_id) worker_instance.wait_until_running() worker_name = f'{cluster_id}-rhel-worker-{i}' self.rhel_worker_list[worker_name] = worker_instance worker_ec2.create_tags( Resources=[inst_id], Tags=[ {'Key': 'Name', 'Value': f'{worker_name}'}, {'Key': self.worker_tag[0], 'Value': self.worker_tag[1]} ] ) logging.info(self.worker_iam_role) self.client.associate_iam_instance_profile( IamInstanceProfile=self.worker_iam_role, InstanceId=inst_id, )
def add_volume(self, size=100):
    """
    Add a new volume to all the workers

    Args:
        size (int): Size of the volume in GiB (default: 100)

    """
    cluster_id = get_infra_id(self.cluster_path)
    worker_pattern = f'{cluster_id}-worker*'
    logger.info(f'Worker pattern: {worker_pattern}')
    self.create_ebs_volumes(worker_pattern, size)
def get_node_data_aws():
    """
    Retrieve bootstrap public IP and master node private IPs running in AWS

    Raises:
        NodeNotFoundError: If we are unable to find the bootstrap node or IP

    Returns:
        dict: bootstrap and master node IP data

    """
    session = boto3.Session()
    credentials = session.get_credentials().get_frozen_credentials()
    ec2_driver = get_driver(Provider.EC2)
    driver = ec2_driver(
        credentials.access_key,
        credentials.secret_key,
        region=config.ENV_DATA['region'],
    )
    cluster_path = config.ENV_DATA['cluster_path']
    infra_id = get_infra_id(cluster_path)
    bootstrap_name = f"{infra_id}-bootstrap"
    master_pattern = f"{infra_id}-master"
    data = dict()
    try:
        bootstrap_node = [
            node for node in driver.list_nodes()
            if bootstrap_name == node.name
        ][0]
        bootstrap_ip = bootstrap_node.public_ips[0]
        logger.info(
            "Found bootstrap node %s with IP %s",
            bootstrap_name, bootstrap_ip,
        )
        data['bootstrap_ip'] = bootstrap_ip
    except IndexError:
        raise NodeNotFoundError(
            f"Unable to find bootstrap node with name {bootstrap_name}"
        )
    master_nodes = [
        node for node in driver.list_nodes()
        if master_pattern in node.name
    ]
    # guard against nodes with no private IPs to avoid an IndexError
    master_ips = [
        master.private_ips[0] for master in master_nodes
        if master.private_ips
    ]
    data['master_ips'] = [ip for ip in master_ips if ip is not None]
    if len(data['master_ips']) < config.ENV_DATA['master_replicas']:
        logger.warning('IP data was not found for all master nodes')
    logger.debug(data)
    return data
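# Usage sketch: consume the IP data returned by get_node_data_aws();
# the 'bootstrap_ip' and 'master_ips' keys are the ones the function
# populates above.
node_data = get_node_data_aws()
logger.info(f"Bootstrap public IP: {node_data['bootstrap_ip']}")
for ip in node_data['master_ips']:
    logger.info(f"Master private IP: {ip}")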
def get_rhel_worker_instances(self):
    """
    Get list of RHEL worker instance IDs

    Returns:
        list: list of instance IDs of RHEL workers

    """
    rhel_workers = []
    worker_pattern = get_infra_id(self.cluster_path) + "*rhel-worker*"
    worker_filter = [{'Name': 'tag:Name', 'Values': [worker_pattern]}]
    response = self.client.describe_instances(Filters=worker_filter)
    # each reservation may contain multiple instances, so collect them all
    for reservation in response['Reservations']:
        for instance in reservation['Instances']:
            rhel_workers.append(instance['InstanceId'])
    return rhel_workers
def deploy(self, log_level=""): self.flexy_instance.deploy(log_level) self.test_cluster() # add disks to instances # Get all instances and for each instance add # one disk pattern = "-".join( [get_infra_id(config.ENV_DATA["cluster_path"]), "compute"]) for instance in self.utils.get_instances_with_pattern(pattern): vol = self.utils.create_volume( name=f"{pattern}-disk0-{instance.name[-1]}", size=config.FLEXY["volume_size"], ) # wait till volume is available sample = TimeoutSampler(300, 10, self.utils.check_expected_vol_status, vol, "available") if not sample.wait_for_func_status(True): logger.info("Volume failed to reach 'available'") raise exceptions.PSIVolumeNotInExpectedState # attach the volume self.utils.attach_volume(vol, instance.id)
def get_rhel_worker_instances(cluster_path):
    """
    Get list of RHEL worker instance IDs

    Args:
        cluster_path (str): The cluster path

    Returns:
        list: list of instance IDs of RHEL workers

    """
    aws = AWS()
    rhel_workers = []
    worker_pattern = get_infra_id(cluster_path) + "*rhel-worker*"
    worker_filter = [{'Name': 'tag:Name', 'Values': [worker_pattern]}]
    response = aws.ec2_client.describe_instances(Filters=worker_filter)
    # each reservation may contain multiple instances; an empty list is
    # returned when no RHEL workers match the pattern
    for reservation in response['Reservations']:
        for instance in reservation['Instances']:
            rhel_workers.append(instance['InstanceId'])
    return rhel_workers
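# Usage sketch: collect the cluster's RHEL workers and, for example,
# terminate them; the cluster path is an illustrative placeholder and
# terminate_instances() is the standard boto3 EC2 client call.
workers = get_rhel_worker_instances('/path/to/cluster')
if workers:
    AWS().ec2_client.terminate_instances(InstanceIds=workers)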
def generate_cluster_info(self):
    """
    Generates the cluster information file
    """
    logger.info("Generating cluster information file")

    # get kubeconfig and upload to httpd server
    kubeconfig = os.path.join(
        self.cluster_path, config.RUN.get('kubeconfig_location')
    )
    remote_path = os.path.join(
        config.ENV_DATA.get('path_to_upload'),
        f"{config.RUN.get('run_id')}_kubeconfig",
    )
    upload_file(
        config.ENV_DATA.get('httpd_server'),
        kubeconfig,
        remote_path,
        config.ENV_DATA.get('httpd_server_user'),
        config.ENV_DATA.get('httpd_server_password'),
    )

    # form the kubeconfig URL path; str.lstrip() strips a character
    # set rather than a prefix, so remove the document root with
    # replace() instead
    http_path = remote_path.replace('/var/www/html/', '', 1)
    kubeconfig_url_path = os.path.join(
        'http://', config.ENV_DATA.get('httpd_server'), http_path
    )
    config.ENV_DATA['kubeconfig_url'] = kubeconfig_url_path

    # get the infra_id
    infra_id = get_infra_id(self.cluster_path)
    config.ENV_DATA['infra_id'] = infra_id

    # get the cluster id
    cluster_id = get_cluster_id(self.cluster_path)
    config.ENV_DATA['cluster_id'] = cluster_id

    # fetch the installer version
    installer_version_str = run_cmd(
        f"{config.RUN['bin_dir']}/openshift-install version"
    )
    installer_version = installer_version_str.split()[1]
    config.ENV_DATA['installer_version'] = installer_version

    # get the major and minor version of OCP
    version_obj = Version(installer_version)
    config.ENV_DATA['ocp_version_x'] = version_obj.major
    config.ENV_DATA['ocp_version_y'] = version_obj.minor

    # generate the cluster info yaml file
    terraform_var_template = "cluster_info.yaml.j2"
    terraform_var_template_path = os.path.join(
        "ocp-deployment", terraform_var_template
    )
    terraform_config_str = self._templating.render_template(
        terraform_var_template_path, config.ENV_DATA
    )
    terraform_var_yaml = os.path.join(
        self.cluster_path,
        constants.TERRAFORM_DATA_DIR,
        constants.SCALEUP_TERRAFORM_DATA_DIR,
        "cluster_info.yaml",
    )
    with open(terraform_var_yaml, "w") as f:
        f.write(terraform_config_str)

    template_vars = (
        f"\"dns_server: {config.ENV_DATA['dns']}"
        f"\\nremove_rhcos_worker: 'yes'\\n\""
    )
    replace_content_in_file(terraform_var_yaml, "PLACEHOLDER", template_vars)
    logger.info(f"cluster yaml file: {terraform_var_yaml}")
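# Illustration of why str.lstrip() cannot be used for prefix removal:
# it strips any leading characters that appear in the argument set,
# not the literal prefix (the path below is a made-up example).
>>> '/var/www/html/run_1_kubeconfig'.lstrip('/var/www/html/')
'un_1_kubeconfig'
>>> '/var/www/html/run_1_kubeconfig'.replace('/var/www/html/', '', 1)
'run_1_kubeconfig'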
def host_network_update(self):
    """
    Update security group rules for HostNetwork
    """
    cluster_id = get_infra_id(self.cluster_path)
    worker_pattern = f'{cluster_id}-worker*'
    worker_instances = self.aws.get_instances_by_name_pattern(
        worker_pattern
    )
    security_groups = worker_instances[0]['security_groups']
    sg_id = security_groups[0]['GroupId']
    security_group = self.aws.ec2_resource.SecurityGroup(sg_id)
    # The ports are not 100% clear yet. Taken from doc:
    # https://docs.google.com/document/d/1c23ooTkW7cdbHNRbCTztprVU6leDqJxcvFZ1ZvK2qtU/edit#
    security_group.authorize_ingress(
        DryRun=False,
        IpPermissions=[
            {
                'FromPort': 6800,
                'ToPort': 7300,
                'IpProtocol': 'tcp',
                'UserIdGroupPairs': [
                    {'Description': 'Ceph OSDs', 'GroupId': sg_id},
                ],
            },
            {
                'FromPort': 3300,
                'ToPort': 3300,
                'IpProtocol': 'tcp',
                'UserIdGroupPairs': [
                    {'Description': 'Ceph MONs rule1', 'GroupId': sg_id},
                ],
            },
            {
                'FromPort': 6789,
                'ToPort': 6789,
                'IpProtocol': 'tcp',
                'UserIdGroupPairs': [
                    {'Description': 'Ceph MONs rule2', 'GroupId': sg_id},
                ],
            },
            {
                'FromPort': 8443,
                'ToPort': 8443,
                'IpProtocol': 'tcp',
                'UserIdGroupPairs': [
                    {'Description': 'Ceph Dashboard rule1', 'GroupId': sg_id},
                ],
            },
            {
                'FromPort': 8080,
                'ToPort': 8080,
                'IpProtocol': 'tcp',
                'UserIdGroupPairs': [
                    {'Description': 'Ceph Dashboard rule2', 'GroupId': sg_id},
                ],
            },
        ],
    )
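# Usage sketch: verify the ingress rules that host_network_update()
# added; describe_security_groups() is the standard boto3 EC2 client
# call, and `aws_depl` and `sg_id` are illustrative placeholders for
# the deployment object and the security group ID used above.
resp = aws_depl.aws.ec2_client.describe_security_groups(GroupIds=[sg_id])
for perm in resp['SecurityGroups'][0]['IpPermissions']:
    logger.info(
        f"{perm.get('IpProtocol')} {perm.get('FromPort')}-{perm.get('ToPort')}"
    )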