def _recover_auto_scaling_groups(self):
    """
    Bring the auto scaling groups back to the sizes recorded before pause.

    The cluster status persisted before pause is downloaded first:
        - If it exists, every auto scaling group recorded in it gets its min size,
          max size and desired capacity re-applied, we wait for the groups to reach
          the desired state, and the persisted status is deleted afterwards.
        - If it does not exist, all auto scaling groups are assumed to be configured
          correctly already. This could happen when a previous restart failed in the
          middle but had already passed this stage, or when the cluster was never
          paused. Only the node count is collected in that case.

    In both cases self._total_nodes is incremented by the desired capacity of each
    auto scaling group.
    :return:
    """
    logger.info("Fetching last cluster status ...")
    persisted_status = self._cluster_info.download_cluster_status_before_pause()

    asg_mgr = AXUserASGManager(
        cluster_name_id=self._name_id,
        aws_profile=self._cfg.cloud_profile,
        region=self._cluster_config.get_region(),
    )

    if not persisted_status:
        # Nothing was persisted: just count the nodes the groups currently ask for.
        for group in asg_mgr.get_all_asgs():
            self._total_nodes += group["DesiredCapacity"]
        logger.info(
            "Cannot find last cluster status, cluster already resumed with %s nodes",
            self._total_nodes)
        return

    logger.info("Found last cluster status, restoring cluster ...")
    # NOTE(review): yaml.load() without an explicit Loader can construct arbitrary
    # Python objects and is rejected by recent PyYAML releases. Consider
    # yaml.safe_load() once it is confirmed the persisted document only contains
    # plain YAML types.
    recorded_groups = yaml.load(persisted_status)["asg_status"]

    # Re-apply the recorded size of every auto scaling group
    for asg_name, recorded in recorded_groups.items():
        min_size = recorded["min_size"]
        max_size = recorded["max_size"]
        desired = recorded["desired_capacity"]
        self._total_nodes += desired
        logger.info(
            "Recovering autoscaling group %s. Min: %s, Max: %s, Desired: %s",
            asg_name, min_size, max_size, desired)
        asg_mgr.set_asg_spec(name=asg_name,
                             minsize=min_size,
                             maxsize=max_size,
                             desired=desired)

    logger.info("Waiting for all auto scaling groups to scale up ...")
    asg_mgr.wait_for_desired_asg_state()
    logger.info("%sAll cluster instances are in service%s", COLOR_GREEN, COLOR_NORM)

    # The persisted status is only dropped after the groups reached their state
    self._cluster_info.delete_cluster_status_before_pause()
def modify_asg(self, min, max):
    """
    Update the min / max size of the cluster's variable autoscaling group.

    The requested minimum is passed through to the autoscaling group together
    with the requested maximum.

    NOTE: the parameter names shadow the `min` / `max` builtins; they are kept
    because they are part of this method's calling interface.

    :param min: new minimum size of the variable autoscaling group
    :param max: new maximum size of the variable autoscaling group
    :raises AXPlatformException: if the variable autoscaling group cannot be found,
                                 or if updating its spec fails with a ClientError
    :return:
    """
    logger.info("Modifying autoscaling group ...")

    asg_manager = AXUserASGManager(self._cluster_name_id, self._region, self._aws_profile)

    asg = asg_manager.get_variable_asg()
    if not asg:
        raise AXPlatformException(
            "Failed to get variable autoscaling group for cluster {}".format(self._cluster_name_id))

    asg_name = asg["AutoScalingGroupName"]
    try:
        # Honor the caller-supplied minimum (previously a hard-coded 1 was sent and
        # the `min` argument was silently ignored).
        asg_manager.set_asg_spec(name=asg_name, minsize=min, maxsize=max)
    except ClientError as ce:
        raise AXPlatformException(
            "Failed to set cluster's variable autoscaling group min/max. Error: {}".format(ce))

    logger.info("Modifying cluster autoscaling group ... DONE")
def _scale_down_auto_scaling_groups(self):
    """
    Pause step for the cluster's auto scaling groups:
        - Record the current min size, max size and desired capacity of every auto
          scaling group and persist that record through self._cluster_info,
        - Set min and max size of every auto scaling group to zero,
        - Wait for the auto scaling groups to reach the desired state
    :return:
    """
    logger.info("Discovering autoscaling groups")
    asg_mgr = AXUserASGManager(
        cluster_name_id=self._name_id,
        aws_profile=self._cfg.cloud_profile,
        region=self._cluster_config.get_region(),
    )
    discovered_asgs = asg_mgr.get_all_asgs()

    # Snapshot of the group sizes before pause, keyed by group name. It is what
    # allows a later restart to recover the same amount of nodes.
    snapshot = {
        "asg_status": {
            group["AutoScalingGroupName"]: {
                "min_size": group["MinSize"],
                "max_size": group["MaxSize"],
                "desired_capacity": group["DesiredCapacity"],
            }
            for group in discovered_asgs
        }
    }
    self._cluster_info.upload_cluster_status_before_pause(status=yaml.dump(snapshot))

    # The snapshot is uploaded before anything is resized
    logger.info("Scaling down autoscaling groups ...")
    for group in discovered_asgs:
        asg_mgr.set_asg_spec(name=group["AutoScalingGroupName"], minsize=0, maxsize=0)

    # Block until the groups report the desired state
    logger.info("Waiting for all auto scaling groups to scale down ...")
    asg_mgr.wait_for_desired_asg_state()
    logger.info("%sAll cluster nodes are terminated%s", COLOR_GREEN, COLOR_NORM)