Example #1
0
class Cloud(object):
    """Manage the instances of one cloud through a boto autoscale group.

    The object owns an EC2 connection (used to list instances), an
    autoscale connection, a launch configuration and an autoscale group.
    The launch configuration and the group are looked up by the names in
    the configuration and created when the lookup does not return exactly
    one match.

    Attributes:
        config: the cloud configuration object passed to the constructor.
        all_instances: instances found by the last ``refresh`` that belong
            to the autoscale group and are in a valid run state.
        maxed: True when the last refresh found at least
            ``config.max_instances`` instances.
    """

    def __init__(self, cloud_config):
        """Store the configuration and set up connections and the group.

        Construction is not free of side effects: it ends by calling
        ``_initialize``, which talks to the cloud endpoints.

        Args:
            cloud_config: configuration object; this class reads attributes
                such as ``name``, ``access_id``, ``secret_key``,
                ``lc_name``, ``asg_name`` and ``max_instances`` from it.
        """
        self.config = cloud_config
        self.all_instances = []
        self.failed_launch = False
        self.failed_count = 0
        self.failed_last_valid_count = 0
        self._conn = None
        self._as_conn = None
        self._lc = None
        self._asg = None
        self._last_asg_launch_attempt = None
        self.maxed = False
        self._last_launch_attempt = datetime.datetime.utcnow()
        self._initialize()

    def _create_connection(self):
        """Open the EC2 connection and point it at the configured endpoint."""
        LOG.debug("Creating connection for %s", self.config.name)
        # NOTE(review): certificate validation is switched off here; confirm
        # this is still required for the target endpoints.
        self._conn = boto.connect_ec2(self.config.access_id,
                                      self.config.secret_key,
                                      validate_certs=False)
        self._conn.host = self.config.cloud_uri
        self._conn.port = self.config.cloud_port

    def _create_autoscale_connection(self):
        """Open the autoscale connection for the configured endpoint."""
        LOG.debug("Creating autoscale connection for %s", self.config.name)
        region = RegionInfo(name=self.config.cloud_type,
                            endpoint=self.config.as_uri)
        # NOTE(review): certificate validation is switched off here as well.
        self._as_conn = AutoScaleConnection(
            aws_access_key_id=self.config.access_id,
            aws_secret_access_key=self.config.secret_key,
            is_secure=True,
            port=self.config.as_port,
            region=region,
            validate_certs=False)

    def _load_user_data(self):
        """Return the contents of the configured user-data file, or None."""
        path = self.config.user_data_file
        if path is None:
            return None
        with open(path) as handle:
            return handle.read()

    def _create_or_set_launch_configuration(self):
        """Load the launch configuration by name, creating it if needed.

        An existing configuration is used only when the lookup returns
        exactly one result; otherwise a new one is built from the cloud
        configuration and registered with the autoscale service.
        """
        name = self.config.lc_name
        if not self._lc:
            LOG.debug("Attempting to load launch configuration: %s", name)
            found = self._as_conn.get_all_launch_configurations(names=[name])
            if len(found) == 1:
                LOG.debug("Launch configuration %s found.", name)
                self._lc = found[0]
        if self._lc:
            return

        # TODO(pdmars): key and security groups are hardcoded for now, gross
        user_data = self._load_user_data()
        LOG.debug("Creating launch configuration %s", name)
        LOG.debug("\tname: %s", name)
        LOG.debug("\timage_id: %s", self.config.image_id)
        LOG.debug("\tinstance_type: %s", self.config.instance_type)
        LOG.debug("\tuser_data: %s", user_data)
        self._lc = LaunchConfiguration(
            name=name,
            image_id=self.config.image_id,
            key_name="phantomkey",
            security_groups=['default'],
            instance_type=self.config.instance_type,
            user_data=user_data)
        self._as_conn.create_launch_configuration(self._lc)

    def _guess_cloud_name(self):
        """Return the text after "@" in ``lc_name``, or "hotel" if absent."""
        try:
            return self.config.lc_name.split("@")[1].strip()
        except (AttributeError, IndexError):
            # No "@" in the name (or no name at all): use the fallback.
            LOG.warning("Unable to guess cloud for auto scale tags")
            LOG.warning("Setting cloud to hotel")
            return "hotel"

    def _create_or_set_autoscale_group(self):
        """Load the autoscale group by name, creating it if needed.

        An existing group is used only when the lookup returns exactly one
        result. A newly created group starts with min and max size 0 and
        carries three tags (policy name, ordered clouds, minimum VMs).
        """
        name = self.config.asg_name
        if not self._asg:
            LOG.debug("Attempting to load autoscale group: %s", name)
            found = self._as_conn.get_all_groups(names=[name])
            LOG.debug("Autoscale group: %s", found)
            if len(found) == 1:
                LOG.debug("Autoscale group %s found.", name)
                self._asg = found[0]
        if self._asg:
            return

        # TODO(pdmars): more hard coded grossness, for now
        ordered_clouds = self._guess_cloud_name() + ":-1"
        tag_values = [
            ("PHANTOM_DEFINITION", "error_overflow_n_preserving"),
            ("clouds", ordered_clouds),
            ("minimum_vms", 0),
        ]
        tags = [Tag(connection=self._as_conn, key=key, value=value,
                    resource_id=name)
                for key, value in tag_values]
        zones = [self.config.az]
        LOG.debug("Creating autoscale group %s", name)
        LOG.debug("\tname: %s", name)
        LOG.debug("\tavailability_zones: %s", zones)
        LOG.debug("\tlaunch_config: %s", self._lc)
        self._asg = AutoScalingGroup(group_name=name,
                                     availability_zones=zones,
                                     min_size=0,
                                     max_size=0,
                                     launch_config=self._lc,
                                     tags=tags)
        self._as_conn.create_auto_scaling_group(self._asg)

    def _initialize(self):
        """Create both connections, then the launch config and the group.

        The order matters: the group is created from ``self._lc``, and both
        lookups need the autoscale connection.
        """
        LOG.debug("Initializing %s", self.config.name)
        self._create_connection()
        self._create_autoscale_connection()
        self._create_or_set_launch_configuration()
        self._create_or_set_autoscale_group()
        LOG.debug("Initialization complete for %s", self.config.name)

    def get_valid_instances(self):
        """Return the instances collected by the last refresh."""
        return self.all_instances

    def _desired_capacity(self):
        """Return the group's desired capacity, treating an unset value as 0.

        NOTE(review): a group object that was built locally rather than
        loaded from the service presumably has no desired capacity yet.
        """
        return self._asg.desired_capacity or 0

    def _refresh_instances(self):
        """Rebuild ``all_instances`` and update ``maxed``.

        Keeps only instances that are members of the autoscale group and
        whose state is in ``VALID_RUN_STATES``.
        """
        LOG.debug("%s: getting instance information", self.config.name)
        self.all_instances = []
        # A set keeps the membership test below cheap; a group without an
        # instance list is treated as having no members.
        member_ids = {member.instance_id
                      for member in (self._asg.instances or [])}
        running = []
        for reservation in self._conn.get_all_instances():
            for instance in reservation.instances:
                if (instance.id in member_ids and
                        instance.state in VALID_RUN_STATES):
                    running.append(instance)
        self.all_instances = running

        num_instances = len(running)
        LOG.debug("%s: updated %d instances", self.config.name,
                  num_instances)
        self.maxed = num_instances >= self.config.max_instances
        if self.maxed:
            LOG.warning("%s reached the max (%s) instances: %s",
                        self.config.name, self.config.max_instances,
                        num_instances)

    def _refresh_asg(self):
        """Reload the autoscale group; keep the old object on failure."""
        LOG.debug("%s: refreshing autoscale group", self.config.name)
        asg_name = self.config.asg_name
        groups = self._as_conn.get_all_groups(names=[asg_name])
        if len(groups) == 1:
            self._asg = groups[0]
            LOG.debug("\trefreshed autoscale group: %s", asg_name)
        else:
            LOG.warning("\tunable to refresh autoscale group: %s", asg_name)

    def refresh(self, cluster):
        """Refresh the autoscale group and then the instance list.

        Args:
            cluster: not used by this implementation; kept so the call
                signature stays the same for existing callers.
        """
        self._refresh_asg()
        self._refresh_instances()

    def get_total_num_valid_cores(self):
        """Return the number of cores provided by the valid instances.

        Also logs when that number differs from the core count implied by
        the group's desired capacity.
        """
        LOG.debug("%s: getting number of valid cores", self.config.name)
        cores_per_instance = self.config.instance_cores
        total_valid_cores = len(self.get_valid_instances()) * cores_per_instance
        num_desired_cores = self._desired_capacity() * cores_per_instance
        if num_desired_cores != total_valid_cores:
            LOG.debug("\tmismatching core counts")
            LOG.debug("\tnum_desired_cores: %d", num_desired_cores)
            LOG.debug("\ttotal_valid_cores: %d", total_valid_cores)
        return total_valid_cores

    def get_instance_by_id(self, id):
        """Return the known instance with the given id, or None."""
        LOG.debug("Searching for instance %s", id)
        for instance in self.all_instances:
            if instance.id == id:
                LOG.debug("Found instance %s", id)
                return instance
        return None

    def get_instance_ids_for_public_dns_names(self, public_dns_names):
        """Return ids of known instances whose public DNS name is listed.

        Args:
            public_dns_names: container of DNS names to match against.

        Returns:
            A list of instance ids, in the order of ``all_instances``.
        """
        return [instance.id for instance in self.all_instances
                if instance.public_dns_name in public_dns_names]

    def get_public_dns_names_close_to_charge(self):
        """Return DNS names of instances about to start a new charge period.

        An instance qualifies when the time left in its current charge
        period (``config.charge_time_secs`` long, counted from its launch
        time) is less than three times the loop sleep interval.
        """
        time_fmt = "%Y-%m-%dT%H:%M:%S.%fZ"
        charge_secs = self.config.charge_time_secs
        threshold_secs = 3 * self.config.get_loop_sleep_secs()
        cur_utc_time = datetime.datetime.utcnow()

        instances_close_to_charge = []
        for instance in self.get_valid_instances():
            launch_time = datetime.datetime.strptime(instance.launch_time,
                                                     time_fmt)
            time_diff = cur_utc_time - launch_time
            # Ignores microseconds
            time_diff_secs = time_diff.seconds + time_diff.days * 24 * 3600
            cur_charge_secs = time_diff_secs % charge_secs
            secs_to_charge = charge_secs - cur_charge_secs
            LOG.debug("%s:%s: charge: %d; current: %d; to charge: %d",
                      instance.id, instance.public_dns_name,
                      charge_secs, cur_charge_secs, secs_to_charge)
            if secs_to_charge < threshold_secs:
                instances_close_to_charge.append(instance.public_dns_name)
        return instances_close_to_charge

    def delete_instances(self, instance_ids=None):
        """Terminate the given instances through the autoscale connection.

        Does nothing when ``instance_ids`` is empty or None. Before
        terminating, the desired capacity is lowered to the number of known
        instances if it is higher and at least one instance is known.

        Args:
            instance_ids: iterable of instance ids to terminate.
        """
        if not instance_ids:
            return
        LOG.debug("Deleting instances: %s", instance_ids)
        # TODO(pdmars): this has the potential to kill instances running jobs
        # maybe I should err on the side of having extra instances if the
        # capacity is higher than the cloud can currently support
        num_instances = len(self.all_instances)
        if 0 < num_instances < self._desired_capacity():
            LOG.warning("Desired capacity is greater than num_instances running")
            LOG.warning("Adjusting desired capacity to match")
            self.set_capacity(num_instances)
        for instance_id in instance_ids:
            self._as_conn.terminate_instance(instance_id)
            # TODO(pdmars): due to a bug in phantom, maybe this will help
            # 2013/04/05: this might not be relevant anymore
            time.sleep(.1)

    def launch_autoscale_instances(self, num_instances=1):
        """Raise the desired capacity by ``num_instances``.

        The new capacity is capped at ``config.max_instances``, and the
        time of the attempt is recorded.

        Args:
            num_instances: number of instances to add; converted with
                ``int``.
        """
        new_capacity = self._desired_capacity() + int(num_instances)
        if new_capacity > self.config.max_instances:
            new_capacity = self.config.max_instances
            LOG.warning("%s can launch %s total instances", self.config.name,
                        new_capacity)
        self._last_launch_attempt = datetime.datetime.utcnow()
        LOG.debug("Setting cloud capacity for %s to %s", self.config.name,
                  new_capacity)
        self.set_capacity(new_capacity)

    def set_capacity(self, new_capacity):
        """Set the autoscale group's desired capacity to ``new_capacity``."""
        self._asg.set_capacity(new_capacity)