def assign_hosts_to_workers(self):
    """Pair host instances with worker instances for each assignment.

    Every entry of ``self.gce_project.host_worker_assignments`` is validated
    first. An entry is logged and skipped when:

    * the host and worker clusters are in different zones,
    * the worker cluster size is not
      ``host instance count * workers_per_host``,
    * the two clusters disagree on ``high_end``,
    * the host instance group does not exist, or
    * the number of workers found differs from the configured worker
      cluster size.

    For the remaining entries, the assignments produced by
    ``do_assign_hosts_to_workers`` are stored with ``ndb_utils.put_multi``.
    Finally ``cleanup_old_assignments`` is called with the names of all host
    instances that were listed.
    """
    project = self.gce_project
    seen_host_names = set()

    for spec in project.host_worker_assignments:
        hosts = project.get_cluster(spec.host)
        workers = project.get_cluster(spec.worker)

        if hosts.gce_zone != workers.gce_zone:
            logging.error('Mismatching zones for %s and %s.', spec.host,
                          spec.worker)
            continue

        expected_worker_count = hosts.instance_count * spec.workers_per_host
        if expected_worker_count != workers.instance_count:
            logging.error('Invalid host/worker cluster size for %s and %s.',
                          spec.host, spec.worker)
            continue

        if hosts.high_end != workers.high_end:
            logging.error('Mismatching high end setting for %s and %s',
                          spec.host, spec.worker)
            continue

        manager = bot_manager.BotManager(project.project_id, hosts.gce_zone)
        host_group = manager.instance_group(hosts.name)
        if not host_group.exists():
            logging.error('Host instance group %s does not exist.',
                          hosts.name)
            continue

        host_names = [
            _instance_name_from_url(managed['instance'])
            for managed in host_group.list_managed_instances()
        ]
        # Recorded before the worker check below, so these hosts are passed
        # to cleanup_old_assignments even if this entry ends up skipped.
        seen_host_names.update(host_names)

        worker_instances = self.get_all_workers_in_cluster(
            manager, workers.name)
        found_worker_count = len(worker_instances)
        if found_worker_count != workers.instance_count:
            logging.error(
                'Actual number of worker instances for %s did not match. '
                'Expected %d, got %d.', workers.name, workers.instance_count,
                found_worker_count)
            continue

        ndb_utils.put_multi(
            self.do_assign_hosts_to_workers(host_names, worker_instances,
                                            spec.workers_per_host))

    self.cleanup_old_assignments(seen_host_names)
def cleanup_resources():
    """Delete the test instance group and the test instance template.

    The group is deleted first, then the template. A resource that is already
    missing (``bot_manager.NotFoundError``) is silently ignored.
    """
    manager = bot_manager.BotManager(TEST_PROJECT, TEST_ZONE)

    removal_order = (
        (manager.instance_group, test_instance_group_name),
        (manager.instance_template, test_instance_template_name),
    )
    for resource_for, name_of in removal_order:
        try:
            resource_for(name_of()).delete()
        except bot_manager.NotFoundError:
            # Nothing left over from a previous run.
            pass
def delete_gce_resources(self, project_info, cluster_info):
    """Delete the instance group and instance template of a cluster.

    Both resources share the name returned by
    ``get_resource_name(cluster_info.cluster, project_info.name)`` and are
    looked up in ``cluster_info.gce_zone`` of this object's GCE project. The
    group is deleted before the template. A resource that is already gone is
    only logged at info level.

    Args:
        project_info: Object whose ``name`` is used to build the resource
            name.
        cluster_info: Object providing ``cluster`` and ``gce_zone``.
    """
    manager = bot_manager.BotManager(self.gce_project.project_id,
                                     cluster_info.gce_zone)
    resource_name = get_resource_name(cluster_info.cluster, project_info.name)

    teardown_steps = (
        (manager.instance_group, 'Instance group %s already deleted.'),
        (manager.instance_template, 'Instance template %s already deleted.'),
    )
    for resource_for, already_gone_message in teardown_steps:
        try:
            resource_for(resource_name).delete()
        except bot_manager.NotFoundError:
            logging.info(already_gone_message, resource_name)
def update_cluster(self,
                   cluster,
                   resource_name,
                   cpu_count,
                   task_tag=None,
                   disk_size_gb=None,
                   service_account=None,
                   tls_cert=None):
    """Bring a cluster's instance template and instance group up to date.

    The expected template body is built with ``get_template_body``. A missing
    template is created. An existing template is compared against the
    expected body with ``_template_needs_update``.

    * Group exists and the template is current: the group is resized to
      ``cpu_count`` if its ``targetSize`` differs, and the method returns.
    * Group exists and the template is outdated: the group is deleted, the
      template is deleted and re-created, and a new group is created.
    * Group does not exist: a new group is created (after re-creating the
      template if it was outdated).

    ``bot_manager.OperationError`` raised by a resize or a group creation is
    logged and not re-raised.

    Args:
        cluster: Cluster description providing ``gce_zone`` and
            ``instance_template``.
        resource_name: Name shared by the instance template and the group.
        cpu_count: Size the instance group should have.
        task_tag: Forwarded to ``get_template_body``.
        disk_size_gb: Forwarded to ``get_template_body``.
        service_account: Forwarded to ``get_template_body``.
        tls_cert: Forwarded to ``get_template_body``.
    """
    bots = bot_manager.BotManager(self.gce_project.project_id,
                                  cluster.gce_zone)
    template = bots.instance_template(resource_name)
    group = bots.instance_group(resource_name)

    # What the template should look like according to the configuration.
    wanted_body = get_template_body(
        self.gce_project,
        cluster.instance_template,
        task_tag=task_tag,
        disk_size_gb=disk_size_gb,
        service_account=service_account,
        tls_cert=tls_cert)

    stale_template = False
    if template.exists():
        stale_template = _template_needs_update(template.get(), wanted_body,
                                                resource_name)
    else:
        logging.info('Creating new instance template: %s', resource_name)
        template.create(wanted_body)

    group_exists = group.exists()

    if group_exists and not stale_template:
        # Only the size can be out of date; adjust it and stop here.
        current_size = group.get()['targetSize']
        if current_size == cpu_count:
            logging.info('No instance group size changes needed.')
            return

        logging.info('Resizing instance group %s from %d to %d.',
                     resource_name, current_size, cpu_count)
        try:
            group.resize(cpu_count, wait_for_instances=False)
        except bot_manager.OperationError as e:
            logging.error('Failed to resize instance group %s: %s',
                          resource_name, str(e))
        return

    if group_exists:
        # The group has to go before its outdated template can be deleted.
        logging.info('Deleting instance group %s for template update.',
                     resource_name)
        try:
            group.delete()
        except bot_manager.NotFoundError:
            # Somebody else removed it in the meantime.
            pass

    if stale_template:
        logging.info('Recreating instance template: %s', resource_name)
        template.delete()
        template.create(wanted_body)

    logging.info('Creating new instance group: %s', resource_name)
    try:
        group.create(
            resource_name,
            resource_name,
            size=cpu_count,
            wait_for_instances=False)
    except bot_manager.OperationError as e:
        logging.error('Failed to create instance group %s: %s', resource_name,
                      str(e))
def update_cluster(self,
                   cluster,
                   resource_name,
                   cpu_count,
                   task_tag=None,
                   disk_size_gb=None,
                   service_account=None,
                   tls_cert=None):
    """Create or update a cluster's instance template and instance group.

    The expected template body is built with ``get_template_body``. A missing
    template is created. An existing one is compared against the expected
    body with ``_template_needs_update``.

    * Group exists and the template is current: the group is resized to
      ``cpu_count`` if its ``targetSize`` differs, its auto-healing policy is
      patched if it differs from ``cluster.auto_healing_policy``, and the
      method returns.
    * Group exists and the template is outdated: the group is deleted, the
      template is deleted and re-created, and a new group is created.
    * Group does not exist: a new group is created (after re-creating the
      template if it was outdated).

    ``bot_manager.OperationError`` raised by a resize, a policy patch or a
    group creation is logged and not re-raised.

    Args:
        cluster: Cluster description providing ``gce_zone``,
            ``instance_template`` and ``auto_healing_policy`` (a mapping with
            ``health_check`` and ``initial_delay_sec``; ``None`` is treated
            as an empty mapping when comparing).
        resource_name: Name shared by the instance template and the group.
        cpu_count: Size the instance group should have.
        task_tag: Forwarded to ``get_template_body``.
        disk_size_gb: Forwarded to ``get_template_body``.
        service_account: Forwarded to ``get_template_body``.
        tls_cert: Forwarded to ``get_template_body``.
    """
    manager = bot_manager.BotManager(self.gce_project.project_id,
                                     cluster.gce_zone)
    instance_template = manager.instance_template(resource_name)
    instance_group = manager.instance_group(resource_name)

    # Load expected template body.
    template_body = get_template_body(
        self.gce_project,
        cluster.instance_template,
        task_tag=task_tag,
        disk_size_gb=disk_size_gb,
        service_account=service_account,
        tls_cert=tls_cert)

    if instance_template.exists():
        # Check for updates.
        current_template_body = instance_template.get()
        template_needs_update = _template_needs_update(
            current_template_body, template_body, resource_name)
    else:
        logging.info('Creating new instance template: %s', resource_name)
        instance_template.create(template_body)
        template_needs_update = False

    if instance_group.exists():
        if template_needs_update:
            # Instance groups need to be deleted first before an instance
            # template can be deleted.
            logging.info('Deleting instance group %s for template update.',
                         resource_name)
            try:
                instance_group.delete()
            except bot_manager.NotFoundError:
                # Already deleted.
                pass
        else:
            instance_group_body = instance_group.get()
            if instance_group_body['targetSize'] != cpu_count:
                logging.info('Resizing instance group %s from %d to %d.',
                             resource_name, instance_group_body['targetSize'],
                             cpu_count)
                try:
                    instance_group.resize(cpu_count, wait_for_instances=False)
                except bot_manager.OperationError as e:
                    logging.error('Failed to resize instance group %s: %s',
                                  resource_name, str(e))
            else:
                logging.info('No instance group size changes needed.')

            # Read the policy currently set on the group. The body is read
            # with GCE field names elsewhere ('targetSize'), and GCE reports
            # auto-healing as a list under 'autoHealingPolicies' with
            # 'healthCheck' / 'initialDelaySec' entries; only the first entry
            # is inspected.
            # NOTE(review): the snake_case lookup used previously is kept as
            # a fallback in case bot_manager normalises the body - confirm
            # and drop it if it never applies.
            reported_policies = instance_group_body.get('autoHealingPolicies')
            if reported_policies:
                old_url = reported_policies[0].get('healthCheck')
                old_delay = reported_policies[0].get('initialDelaySec')
            else:
                legacy_policy = instance_group_body.get(
                    'auto_healing_policy', {})
                old_url = legacy_policy.get('health_check')
                old_delay = legacy_policy.get('initial_delay_sec')

            # A cluster without a policy must not crash the comparison.
            wanted_policy = cluster.auto_healing_policy or {}
            new_url = wanted_policy.get('health_check')
            new_delay = wanted_policy.get('initial_delay_sec')

            url_changed = new_url != old_url
            if url_changed:
                logging.info(
                    'Updating the health check URL in auto_healing_policy '
                    'of instance group %s from %s to %s.', resource_name,
                    old_url, new_url)

            delay_changed = new_delay != old_delay
            if delay_changed:
                logging.info(
                    'Updating the health check initial delay in '
                    'auto_healing_policy of instance group %s from %s '
                    'seconds to %s seconds.', resource_name, old_delay,
                    new_delay)

            # Send one request to update either or both if needed. The
            # request always carries the complete policy (or an empty one),
            # never just the field that changed, so the unchanged field
            # cannot be lost.
            if url_changed or delay_changed:
                if new_url is None or new_delay is None:
                    auto_healing_policy = {}
                    if new_url is not None or new_delay is not None:
                        logging.warning(
                            'Deleting auto_healing_policy '
                            'because its two values (health_check, '
                            'initial_delay_sec) '
                            'should never exist independently: (%s, %s)',
                            new_url, new_delay)
                else:
                    auto_healing_policy = {
                        'healthCheck': new_url,
                        'initialDelaySec': new_delay,
                    }

                try:
                    instance_group.patch_auto_healing_policies(
                        auto_healing_policy=auto_healing_policy,
                        wait_for_instances=False)
                except bot_manager.OperationError as e:
                    logging.error(
                        'Failed to patch auto_healing_policy of instance '
                        'group %s: %s', resource_name, str(e))
            return

    if template_needs_update:
        logging.info('Recreating instance template: %s', resource_name)
        instance_template.delete()
        instance_template.create(template_body)

    logging.info('Creating new instance group: %s', resource_name)
    try:
        instance_group.create(
            resource_name,
            resource_name,
            size=cpu_count,
            auto_healing_policy=cluster.auto_healing_policy,
            wait_for_instances=False)
    except bot_manager.OperationError as e:
        logging.error('Failed to create instance group %s: %s', resource_name,
                      str(e))
def setUp(self):
    """Clean up test resources, then create the manager used by the test."""
    cleanup_resources()
    fresh_manager = bot_manager.BotManager(TEST_PROJECT, TEST_ZONE)
    self.manager = fresh_manager