def _make_spot_request(self, num, instance_type, price, worker_ud):
    """
    Issue a Spot instance request for `num` instance(s) of type
    `instance_type` at the bid `price`, passing the YAML dump of the
    `worker_ud` dict as the instance user data.

    When a subnet ID is configured, the request is made with a network
    interface bound to that subnet and the security group IDs; otherwise
    the security groups are passed directly. Every Spot request returned
    is wrapped in an ``Instance`` and appended to
    ``self.app.manager.worker_instances``.

    Returns ``False`` if an ``EC2ResponseError`` is raised; otherwise the
    method falls through and returns ``None``.
    """
    worker_ud_str = yaml.dump(worker_ud)
    reqs = None
    try:
        ec2_conn = self.get_ec2_connection()
        if self.get_subnet_id():
            log.debug(
                "Making a spot instance request, using groups: %s, subnet=%s"
                % (self.get_security_group_ids(), self.get_subnet_id()))
            interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
                subnet_id=self.get_subnet_id(),
                groups=self.get_security_group_ids(),
                associate_public_ip_address=True)
            interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(
                interface)
            reqs = ec2_conn.request_spot_instances(
                price=price,
                image_id=self.get_ami(),
                count=num,
                key_name=self.get_key_pair_name(),
                instance_type=instance_type,
                placement=self.get_zone(),
                user_data=worker_ud_str,
                network_interfaces=interfaces,
            )
        else:
            # Never write credentials to the log: only a redacted rendering
            # of the user data is logged, while the request itself still
            # carries the complete YAML dump.
            if hasattr(worker_ud, 'items'):
                loggable_ud = "\n".join(
                    '%s: %s' % (key, value)
                    for key, value in worker_ud.items()
                    if key not in ('password', 'freenxpass', 'secret_key'))
            else:
                # Not a mapping, so individual keys cannot be filtered.
                loggable_ud = "<omitted>"
            log.debug(
                "Making a Spot request with the following command: "
                "ec2_conn.request_spot_instances(price='{price}', image_id='{iid}', "
                "count='{num}', key_name='{key}', security_groups=['{sgs}'], "
                "instance_type='{type}', placement='{zone}', "
                "user_data(with sensitive info filtered out)=[{ud}])"
                .format(price=price, iid=self.get_ami(), num=num,
                        key=self.get_key_pair_name(),
                        sgs=", ".join(self.get_security_groups()),
                        type=instance_type, zone=self.get_zone(),
                        ud=loggable_ud))
            reqs = ec2_conn.request_spot_instances(
                price=price,
                image_id=self.get_ami(),
                count=num,
                key_name=self.get_key_pair_name(),
                security_groups=self.get_security_groups(),
                instance_type=instance_type,
                placement=self.get_zone(),
                user_data=worker_ud_str)
        if reqs is not None:
            for req in reqs:
                i = Instance(app=self.app, spot_request_id=req.id)
                log.debug("Adding Spot request {0} as an Instance".format(
                    req.id))
                self.app.manager.worker_instances.append(i)
    except EC2ResponseError as e:
        log.error("Trouble issuing a spot instance request: {0}".format(
            e.message))
        return False
def _launch_instances(self, num, instance_type, worker_ud, min_num=1):
    """
    Actually launch the `num` instance(s) of type `instance_type` and using
    the provided `worker_ud` dict that contains the instance user data.

    At least `min_num` and at most `num` instances are requested. Each
    instance in the returned reservation is wrapped in an ``Instance`` and
    appended to ``self.app.manager.worker_instances``.

    Returns ``True`` when the launch call completes without an
    ``EC2ResponseError`` and ``False`` when that error is raised.
    """
    worker_ud_str = yaml.dump(worker_ud)
    try:
        reservation = None
        ec2_conn = self.get_ec2_connection()
        # Only this redacted rendering of the user data is logged; the
        # instances still receive the complete YAML dump.
        loggable_ud = "\n".join(
            '%s: %s' % (key, value)
            for key, value in worker_ud.items()
            if key not in ('password', 'freenxpass', 'secret_key'))
        log.debug(
            "Starting instance(s) with the following command: ec2_conn.run_instances("
            "image_id='{iid}', min_count='{min_num}', max_count='{num}', "
            "key_name='{key}', security_groups=['{sgs}'], "
            "user_data(with sensitive info filtered out)=[{ud}], "
            "instance_type='{type}', placement='{zone}')".format(
                iid=self.get_ami(), min_num=min_num, num=num,
                key=self.get_key_pair_name(),
                sgs=", ".join(self.get_security_groups()),
                ud=loggable_ud,
                type=instance_type, zone=self.get_zone()))
        reservation = ec2_conn.run_instances(
            image_id=self.get_ami(),
            min_count=min_num,
            max_count=num,
            key_name=self.get_key_pair_name(),
            security_groups=self.get_security_groups(),
            user_data=worker_ud_str,
            instance_type=instance_type,
            placement=self.get_zone())
        # Occasionally, instances take a bit to register, so wait a few seconds
        time.sleep(3)
        if reservation:
            for instance in reservation.instances:
                i = Instance(app=self.app, inst=instance,
                             m_state=instance.state)
                log.debug("Adding Instance %s to the list of workers" % instance)
                self.app.manager.worker_instances.append(i)
        log.debug("Started %s instance(s)" % num)
        return True
    except EC2ResponseError as e:
        err = "EC2 response error when starting worker nodes: %s" % str(e)
        log.error(err)
        return False
def _run_ondemand_instances(self, num, instance_type, spot_price, worker_ud, min_num=1):
    """
    Start between `min_num` and `num` on-demand instance(s) of type
    `instance_type`, passing the YAML dump of the `worker_ud` dict as the
    instance user data.

    When ``self.running_in_vpc`` is set, the instances are launched with a
    network interface bound to the configured subnet and security group
    IDs; otherwise the security groups and placement zone are passed
    directly. Each launched instance is wrapped in an ``Instance`` and
    appended to ``self.app.manager.worker_instances``; on clouds whose
    configured ``cloud_name`` contains 'amazon' the instance is also tagged
    with the cluster name, role and a display name.

    `spot_price` is accepted but not used by this method.

    Returns ``False`` if an ``EC2ResponseError`` is raised; otherwise the
    method falls through and returns ``None``.
    """
    # Debugging tip: to trace boto's API traffic, raise its logger level,
    # e.g. logging.getLogger('boto').setLevel(logging.DEBUG)
    worker_ud_str = yaml.dump(worker_ud)
    try:
        reservation = None
        ec2_conn = self.get_ec2_connection()
        # Rendered once for both branches. Only this redacted form is
        # logged; the instances still receive the complete YAML dump.
        loggable_ud = "\n".join(
            '%s: %s' % (key, value)
            for key, value in worker_ud.items()
            if key not in ('password', 'freenxpass', 'secret_key'))
        if self.running_in_vpc:
            log.debug("Starting instance(s) in VPC with the following command : ec2_conn.run_instances( "
                      "image_id='{iid}', min_count='{min_num}', max_count='{num}', key_name='{key}', "
                      "security_group_ids={sgs}, user_data(with sensitive info filtered out)=[{ud}], "
                      "instance_type='{type}', placement='{zone}', subnet_id='{subnet_id}')"
                      .format(iid=self.get_ami(), min_num=min_num, num=num,
                              key=self.get_key_pair_name(),
                              sgs=self.get_security_group_ids(),
                              ud=loggable_ud,
                              type=instance_type, zone=self.get_zone(),
                              subnet_id=self.get_subnet_id()))
            interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
                subnet_id=self.get_subnet_id(),
                groups=self.get_security_group_ids(),
                associate_public_ip_address=True)
            interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)
            reservation = ec2_conn.run_instances(
                image_id=self.get_ami(),
                min_count=min_num,
                max_count=num,
                key_name=self.get_key_pair_name(),
                user_data=worker_ud_str,
                instance_type=instance_type,
                network_interfaces=interfaces,
            )
        else:
            log.debug("Starting instance(s) with the following command : ec2_conn.run_instances( "
                      "image_id='{iid}', min_count='{min_num}', max_count='{num}', key_name='{key}', "
                      "security_groups=['{sgs}'], user_data(with sensitive info filtered out)=[{ud}], "
                      "instance_type='{type}', placement='{zone}')"
                      .format(iid=self.get_ami(), min_num=min_num, num=num,
                              key=self.get_key_pair_name(),
                              sgs=", ".join(self.get_security_groups()),
                              ud=loggable_ud,
                              type=instance_type, zone=self.get_zone()))
            reservation = ec2_conn.run_instances(
                image_id=self.get_ami(),
                min_count=min_num,
                max_count=num,
                key_name=self.get_key_pair_name(),
                security_groups=self.get_security_groups(),
                user_data=worker_ud_str,
                instance_type=instance_type,
                placement=self.get_zone())
        # Rarely, instances take a bit to register,
        # so wait a few seconds (although this is a very poor
        # 'solution')
        time.sleep(3)
        if reservation:
            for instance in reservation.instances:
                # At this point in the launch, tag only amazon instances
                if 'amazon' in self.app.config.get('cloud_name', 'amazon').lower():
                    self.add_tag(instance, 'clusterName', self.app.config['cluster_name'])
                    self.add_tag(instance, 'role', worker_ud['role'])
                    self.add_tag(instance, 'Name', "Worker: {0}".format(self.app.config['cluster_name']))
                i = Instance(app=self.app, inst=instance, m_state=instance.state)
                log.debug("Adding Instance %s" % instance)
                self.app.manager.worker_instances.append(i)
    except EC2ResponseError as e:
        err = "EC2 response error when starting worker nodes: %s" % str(e)
        log.error(err)
        return False
def __setup_app(self, ud=None):
    """
    Build the test fixture: a ``TestApp`` created from the user data dict
    `ud`, a ``MockBotoInstance`` and an ``Instance`` wrapping it, which
    becomes the only entry in ``self.app.manager.worker_instances``.

    `ud` defaults to a new empty dict on every call.
    """
    # A literal ``{}`` default would be one dict shared by every call, so
    # a mutation made during one test could leak into the next.
    if ud is None:
        ud = {}
    self.app = TestApp(ud=ud)
    self.inst = MockBotoInstance()
    self.instance = Instance(self.app, inst=self.inst)
    self.app.manager.worker_instances = [self.instance]
class MasterInstanceTestCase(TestCase):
    """
    Tests for ``Instance`` as tracked by the master: identity and state
    lookups, reboot and terminate bookkeeping, and the reboot/terminate
    decisions made by ``maintain()`` under a controlled clock.
    """

    def setUp(self):
        self.__setup_app()

    def __setup_app(self, ud=None):
        """
        Create the app, a mock boto instance and the ``Instance`` under
        test, registering the latter as the only worker instance. `ud` is
        the user data dict handed to ``TestApp``.
        """
        # A literal ``{}`` default would be one dict shared by every call.
        if ud is None:
            ud = {}
        self.app = TestApp(ud=ud)
        self.inst = MockBotoInstance()
        self.instance = Instance(self.app, inst=self.inst)
        self.app.manager.worker_instances = [self.instance]

    def test_id(self):
        assert self.instance.id == DEFAULT_MOCK_BOTO_INSTANCE_ID

    def test_get_cloud_instance_object(self):
        instance = self.instance
        # Without deep=True, just returns the cached boto inst object.
        assert instance.get_cloud_instance_object() is self.inst
        # With deep=True should fetch new instance.
        fresh_instance = self.__seed_fresh_instance()
        assert instance.get_cloud_instance_object(deep=True) is fresh_instance

    def test_get_m_state(self):
        assert self.instance.m_state is None
        self.__seed_fresh_instance(state=instance_states.RUNNING)
        assert self.instance.get_m_state() == instance_states.RUNNING
        assert self.instance.m_state == instance_states.RUNNING

    def test_reboot(self):
        """
        Check reboot was called on boto instance, time_rebooted is
        updated, and reboot_count is incremented.
        """
        assert self.instance.time_rebooted is TIME_IN_PAST
        assert self.instance.reboot_count == 0
        self.instance.reboot()
        assert self.inst.was_rebooted
        assert self.instance.time_rebooted is not TIME_IN_PAST
        assert self.instance.reboot_count == 1
        # Check successive calls continue to increment reboot_count.
        self.instance.reboot()
        assert self.instance.reboot_count == 2

    def test_terminate_success(self):
        self.__expect_terminatation(success=True)
        assert self.instance.terminate_attempt_count == 0
        assert self.instance.inst is not None
        assert self.instance in self.app.manager.worker_instances
        thread = self.instance.terminate()
        thread.join()
        assert self.instance.inst is None
        assert self.instance.terminate_attempt_count == 1
        assert self.instance not in self.app.manager.worker_instances

    def test_terminate_failure(self):
        self.__expect_terminatation(success=False)
        assert self.instance.terminate_attempt_count == 0
        self.__seed_fresh_instance()  # Needed for log statement in failure
        thread = self.instance.terminate()
        thread.join()
        # inst is only set to None after success
        assert self.instance.inst is not None
        assert self.instance.terminate_attempt_count == 1
        assert self.instance in self.app.manager.worker_instances

    def test_maintain_reboot_stuck(self):
        """
        Verify the instance is not rebooted early while in PENDING state
        (checked at 0 and 100 seconds) but is rebooted once it has been
        stuck there for 600 seconds.
        """
        with instrument_time() as clock:
            self.__assert_maintain_does_not_reboot(with_state=instance_states.PENDING)
            # Not rebooted after 100 seconds
            clock.set_offset(seconds=100)
            self.__assert_maintain_does_not_reboot(with_state=instance_states.PENDING)
            # Does reboot after 600 seconds
            clock.set_offset(seconds=600)
            self.__assert_maintain_reboots(with_state=instance_states.PENDING)

    def test_maintain_retry_reboot(self):
        with instrument_time() as clock:
            self.__assert_maintain_does_not_reboot(with_state=instance_states.PENDING)
            # Maintain at 500 seconds determines it is stuck, attempts reboot
            clock.set_offset(seconds=500)
            self.__assert_maintain_reboots(with_state=instance_states.PENDING)
            # Maintain at 700 is still stuck, but waiting for reboot.
            clock.set_offset(seconds=700)
            self.__assert_maintain_does_not_reboot(with_state=instance_states.PENDING)
            # Maintain at 900 seconds, still stuck retries reboot
            clock.set_offset(seconds=900)
            self.__assert_maintain_reboots(with_state=instance_states.PENDING)

    def test_maintain_extend_reboot_timeout(self):
        self.__setup_app(ud={"instance_reboot_timeout": 500})
        with instrument_time() as clock:
            self.__assert_maintain_does_not_reboot(with_state=instance_states.PENDING)
            # Maintain at 500 seconds determines it is stuck, attempts reboot
            clock.set_offset(seconds=500)
            self.__assert_maintain_reboots(with_state=instance_states.PENDING)
            # Maintain at 700 is still stuck, but waiting for reboot.
            clock.set_offset(seconds=700)
            self.__assert_maintain_does_not_reboot(with_state=instance_states.PENDING)
            # Maintain at 900 seconds, would normally reboot but timeout is
            # extended so it won't.
            clock.set_offset(seconds=900)
            self.__assert_maintain_does_not_reboot(with_state=instance_states.PENDING)
            # Will eventually reboot again though...
            clock.set_offset(seconds=1200)
            self.__assert_maintain_reboots(with_state=instance_states.PENDING)

    def test_maintain_state_change(self):
        with instrument_time() as clock:
            self.__assert_maintain_does_not_reboot(with_state=instance_states.PENDING)
            # 350 is past reboot timeout (300), but wait for 400 seconds for
            # state change so no reboot.
            clock.set_offset(seconds=350)
            self.__assert_maintain_does_not_reboot(with_state=instance_states.PENDING)

    def test_maintain_extend_state_change_wait(self):
        self.__setup_app(ud={"instance_state_change_wait": 700})
        with instrument_time() as clock:
            self.__assert_maintain_does_not_reboot(with_state=instance_states.PENDING)
            # Does not reboot after 600 seconds, waiting for state change.
            clock.set_offset(seconds=600)
            self.__assert_maintain_does_not_reboot(with_state=instance_states.PENDING)
            # Does eventually reboot though
            clock.set_offset(seconds=800)
            self.__assert_maintain_reboots(with_state=instance_states.PENDING)

    def test_maintain_reboots_on_error(self):
        self.__assert_maintain_reboots(with_state=instance_states.ERROR)

    def test_maintain_reboots_after_comm_loss(self):
        with instrument_time() as clock:
            self.instance.handle_message("TEST")
            self.__assert_maintain_does_not_reboot(with_state=instance_states.RUNNING)
            clock.set_offset(seconds=500)
            self.__assert_maintain_reboots(with_state=instance_states.RUNNING)

    def test_extend_comm_timeout(self):
        # Same test as above, but extend the comm timeout to verify it
        # prevents instance from being rebooted.
        self.__setup_app(ud={"instance_comm_timeout": 700})
        with instrument_time() as clock:
            self.instance.handle_message("TEST")
            self.__assert_maintain_does_not_reboot(with_state=instance_states.RUNNING)
            clock.set_offset(seconds=500)
            self.__assert_maintain_does_not_reboot(with_state=instance_states.RUNNING)

    def test_maintain_no_reboot_if_comm_active(self):
        with instrument_time() as clock:
            self.instance.handle_message("TEST")
            self.__assert_maintain_does_not_reboot(with_state=instance_states.RUNNING)
            clock.set_offset(seconds=350)
            self.instance.handle_message("TEST")
            clock.set_offset(seconds=500)
            self.__assert_maintain_does_not_reboot(with_state=instance_states.RUNNING)

    def test_terminates_after_enough_reboots(self):
        for _ in range(4):
            self.__assert_maintain_reboots(with_state=instance_states.ERROR)
        self.__expect_terminatation(success=False)
        self.__assert_maintain_does_not_reboot(with_state=instance_states.ERROR)

    def test_change_reboot_attempts(self):
        self.__setup_app(ud={"instance_reboot_attempts": 2})
        for _ in range(2):
            self.__assert_maintain_reboots(with_state=instance_states.ERROR)
        self.__expect_terminatation(success=False)
        self.__assert_maintain_does_not_reboot(with_state=instance_states.ERROR)

    # Test is flaky, need to join thread somehow.
    def test_terminate_attempts(self):
        for _ in range(4):
            self.__assert_maintain_reboots(with_state=instance_states.ERROR)
        for _ in range(4):
            self.__expect_terminatation(success=False)
            self.__assert_maintain_does_not_reboot(with_state=instance_states.ERROR)
            assert self.instance in self.app.manager.worker_instances
        # Ultimately gives up on terminating: no further reboot, and the
        # instance is dropped from the list of workers.
        self.__assert_maintain_does_not_reboot(with_state=instance_states.ERROR)
        assert self.instance not in self.app.manager.worker_instances

    def test_modify_terminate_attempts(self):
        self.__setup_app(ud={"instance_terminate_attempts": 2})
        for _ in range(4):
            self.__assert_maintain_reboots(with_state=instance_states.ERROR)
        for _ in range(2):
            self.__expect_terminatation(success=False)
            self.__assert_maintain_does_not_reboot(with_state=instance_states.ERROR)
            assert self.instance in self.app.manager.worker_instances
        # Ultimately gives up on terminating: no further reboot, and the
        # instance is dropped from the list of workers.
        self.__assert_maintain_does_not_reboot(with_state=instance_states.ERROR)
        assert self.instance not in self.app.manager.worker_instances

    def __assert_maintain_reboots(self, with_state):
        """Run maintain() against a fresh mock in `with_state`; expect a reboot."""
        inst = self.__maintain_with_instance(state=with_state)
        assert inst.was_rebooted

    def __assert_maintain_does_not_reboot(self, with_state):
        """Run maintain() against a fresh mock in `with_state`; expect no reboot."""
        inst = self.__maintain_with_instance(state=with_state)
        assert not inst.was_rebooted

    def __maintain_with_instance(self, **instance_kwds):
        """Seed a fresh mock instance, call maintain() and return the mock."""
        inst = self.__seed_fresh_instance(**instance_kwds)
        self.instance.maintain()
        return inst

    def __expect_terminatation(self, success):
        """Tell the mock cloud interface how the next terminate call ends."""
        self.app.cloud_interface.expect_terminatation(
            DEFAULT_MOCK_BOTO_INSTANCE_ID,
            spot_request_id=None,
            success=success
        )

    def __seed_fresh_instance(self, state=None):
        """
        Install a new ``MockBotoInstance`` as the only instance known to the
        mock cloud interface, set its `state` if one is given, and return it.
        """
        fresh_instance = MockBotoInstance()
        self.app.cloud_interface.set_mock_instances([fresh_instance])
        if state:
            fresh_instance.state = state
        return fresh_instance
class MasterInstanceTestCase(TestCase):
    """
    Tests for ``Instance`` as tracked by the master: identity and state
    lookups, reboot and terminate bookkeeping, and the reboot/terminate
    decisions made by ``maintain()`` under a controlled clock.
    """

    def setUp(self):
        self.__setup_app()

    def __setup_app(self, ud=None):
        """
        Create the app, a mock boto instance and the ``Instance`` under
        test, registering the latter as the only worker instance. `ud` is
        the user data dict handed to ``TestApp``.
        """
        # A literal ``{}`` default would be one dict shared by every call.
        if ud is None:
            ud = {}
        self.app = TestApp(ud=ud)
        self.inst = MockBotoInstance()
        self.instance = Instance(self.app, inst=self.inst)
        self.app.manager.worker_instances = [self.instance]

    def test_id(self):
        assert self.instance.id == DEFAULT_MOCK_BOTO_INSTANCE_ID

    def test_get_cloud_instance_object(self):
        instance = self.instance
        # Without deep=True, just returns the cached boto inst object.
        assert instance.get_cloud_instance_object() is self.inst
        # With deep=True should fetch new instance.
        fresh_instance = self.__seed_fresh_instance()
        assert instance.get_cloud_instance_object(deep=True) is fresh_instance

    def test_get_m_state(self):
        assert self.instance.m_state is None
        self.__seed_fresh_instance(state=instance_states.RUNNING)
        assert self.instance.get_m_state() == instance_states.RUNNING
        assert self.instance.m_state == instance_states.RUNNING

    def test_reboot(self):
        """
        Check reboot was called on boto instance, time_rebooted is
        updated, and reboot_count is incremented.
        """
        assert self.instance.time_rebooted is TIME_IN_PAST
        assert self.instance.reboot_count == 0
        self.instance.reboot()
        assert self.inst.was_rebooted
        assert self.instance.time_rebooted is not TIME_IN_PAST
        assert self.instance.reboot_count == 1
        # Check successive calls continue to increment reboot_count.
        self.instance.reboot()
        assert self.instance.reboot_count == 2

    def test_terminate_success(self):
        self.__expect_terminatation(success=True)
        assert self.instance.terminate_attempt_count == 0
        assert self.instance.inst is not None
        assert self.instance in self.app.manager.worker_instances
        thread = self.instance.terminate()
        thread.join()
        assert self.instance.inst is None
        assert self.instance.terminate_attempt_count == 1
        assert self.instance not in self.app.manager.worker_instances

    def test_terminate_failure(self):
        self.__expect_terminatation(success=False)
        assert self.instance.terminate_attempt_count == 0
        self.__seed_fresh_instance()  # Needed for log statement in failure
        thread = self.instance.terminate()
        thread.join()
        # inst is only set to None after success
        assert self.instance.inst is not None
        assert self.instance.terminate_attempt_count == 1
        assert self.instance in self.app.manager.worker_instances

    def test_maintain_reboot_stuck(self):
        """
        Verify the instance is not rebooted early while in PENDING state
        (checked at 0 and 100 seconds) but is rebooted once it has been
        stuck there for 600 seconds.
        """
        with instrument_time() as clock:
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.PENDING)
            # Not rebooted after 100 seconds
            clock.set_offset(seconds=100)
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.PENDING)
            # Does reboot after 600 seconds
            clock.set_offset(seconds=600)
            self.__assert_maintain_reboots(with_state=instance_states.PENDING)

    def test_maintain_retry_reboot(self):
        with instrument_time() as clock:
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.PENDING)
            # Maintain at 500 seconds determines it is stuck, attempts reboot
            clock.set_offset(seconds=500)
            self.__assert_maintain_reboots(with_state=instance_states.PENDING)
            # Maintain at 700 is still stuck, but waiting for reboot.
            clock.set_offset(seconds=700)
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.PENDING)
            # Maintain at 900 seconds, still stuck retries reboot
            clock.set_offset(seconds=900)
            self.__assert_maintain_reboots(with_state=instance_states.PENDING)

    def test_maintain_extend_reboot_timeout(self):
        self.__setup_app(ud={"instance_reboot_timeout": 500})
        with instrument_time() as clock:
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.PENDING)
            # Maintain at 500 seconds determines it is stuck, attempts reboot
            clock.set_offset(seconds=500)
            self.__assert_maintain_reboots(with_state=instance_states.PENDING)
            # Maintain at 700 is still stuck, but waiting for reboot.
            clock.set_offset(seconds=700)
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.PENDING)
            # Maintain at 900 seconds, would normally reboot but timeout is
            # extended so it won't.
            clock.set_offset(seconds=900)
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.PENDING)
            # Will eventually reboot again though...
            clock.set_offset(seconds=1200)
            self.__assert_maintain_reboots(with_state=instance_states.PENDING)

    def test_maintain_state_change(self):
        with instrument_time() as clock:
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.PENDING)
            # 350 is past reboot timeout (300), but wait for 400 seconds for
            # state change so no reboot.
            clock.set_offset(seconds=350)
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.PENDING)

    def test_maintain_extend_state_change_wait(self):
        self.__setup_app(ud={"instance_state_change_wait": 700})
        with instrument_time() as clock:
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.PENDING)
            # Does not reboot after 600 seconds, waiting for state change.
            clock.set_offset(seconds=600)
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.PENDING)
            # Does eventually reboot though
            clock.set_offset(seconds=800)
            self.__assert_maintain_reboots(with_state=instance_states.PENDING)

    def test_maintain_reboots_on_error(self):
        self.__assert_maintain_reboots(with_state=instance_states.ERROR)

    def test_maintain_reboots_after_comm_loss(self):
        with instrument_time() as clock:
            self.instance.handle_message("TEST")
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.RUNNING)
            clock.set_offset(seconds=500)
            self.__assert_maintain_reboots(with_state=instance_states.RUNNING)

    def test_extend_comm_timeout(self):
        # Same test as above, but extend the comm timeout to verify it
        # prevents instance from being rebooted.
        self.__setup_app(ud={"instance_comm_timeout": 700})
        with instrument_time() as clock:
            self.instance.handle_message("TEST")
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.RUNNING)
            clock.set_offset(seconds=500)
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.RUNNING)

    def test_maintain_no_reboot_if_comm_active(self):
        with instrument_time() as clock:
            self.instance.handle_message("TEST")
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.RUNNING)
            clock.set_offset(seconds=350)
            self.instance.handle_message("TEST")
            clock.set_offset(seconds=500)
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.RUNNING)

    def test_terminates_after_enough_reboots(self):
        for _ in range(4):
            self.__assert_maintain_reboots(with_state=instance_states.ERROR)
        self.__expect_terminatation(success=False)
        self.__assert_maintain_does_not_reboot(
            with_state=instance_states.ERROR)

    def test_change_reboot_attempts(self):
        self.__setup_app(ud={"instance_reboot_attempts": 2})
        for _ in range(2):
            self.__assert_maintain_reboots(with_state=instance_states.ERROR)
        self.__expect_terminatation(success=False)
        self.__assert_maintain_does_not_reboot(
            with_state=instance_states.ERROR)

    # Test is flaky, need to join thread somehow.
    def test_terminate_attempts(self):
        for _ in range(4):
            self.__assert_maintain_reboots(with_state=instance_states.ERROR)
        for _ in range(4):
            self.__expect_terminatation(success=False)
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.ERROR)
            assert self.instance in self.app.manager.worker_instances
        # Ultimately gives up on terminating: no further reboot, and the
        # instance is dropped from the list of workers.
        self.__assert_maintain_does_not_reboot(
            with_state=instance_states.ERROR)
        assert self.instance not in self.app.manager.worker_instances

    def test_modify_terminate_attempts(self):
        self.__setup_app(ud={"instance_terminate_attempts": 2})
        for _ in range(4):
            self.__assert_maintain_reboots(with_state=instance_states.ERROR)
        for _ in range(2):
            self.__expect_terminatation(success=False)
            self.__assert_maintain_does_not_reboot(
                with_state=instance_states.ERROR)
            assert self.instance in self.app.manager.worker_instances
        # Ultimately gives up on terminating: no further reboot, and the
        # instance is dropped from the list of workers.
        self.__assert_maintain_does_not_reboot(
            with_state=instance_states.ERROR)
        assert self.instance not in self.app.manager.worker_instances

    def __assert_maintain_reboots(self, with_state):
        """Run maintain() against a fresh mock in `with_state`; expect a reboot."""
        inst = self.__maintain_with_instance(state=with_state)
        assert inst.was_rebooted

    def __assert_maintain_does_not_reboot(self, with_state):
        """Run maintain() against a fresh mock in `with_state`; expect no reboot."""
        inst = self.__maintain_with_instance(state=with_state)
        assert not inst.was_rebooted

    def __maintain_with_instance(self, **instance_kwds):
        """Seed a fresh mock instance, call maintain() and return the mock."""
        inst = self.__seed_fresh_instance(**instance_kwds)
        self.instance.maintain()
        return inst

    def __expect_terminatation(self, success):
        """Tell the mock cloud interface how the next terminate call ends."""
        self.app.cloud_interface.expect_terminatation(
            DEFAULT_MOCK_BOTO_INSTANCE_ID,
            spot_request_id=None,
            success=success)

    def __seed_fresh_instance(self, state=None):
        """
        Install a new ``MockBotoInstance`` as the only instance known to the
        mock cloud interface, set its `state` if one is given, and return it.
        """
        fresh_instance = MockBotoInstance()
        self.app.cloud_interface.set_mock_instances([fresh_instance])
        if state:
            fresh_instance.state = state
        return fresh_instance