def _update_db_proxy(db_name, datastore_name, datastore_ro_name, db_password, datastore_password, datastore_ro_password):
    """Refresh the DB proxy configuration and wait until the new dbs/roles are reachable.

    Generator: yields a single progress event once connectivity is confirmed.
    Raises AssertionError if the proxy is still unreachable after all attempts.
    """
    logs.info('Updating db proxy')
    db_proxy_manager.update(wait_updated=False)
    # (user, password, db) triplets to verify; the read-only user connects to the datastore db
    credentials = [
        (db_name, db_password, db_name),
        (datastore_name, datastore_password, datastore_name),
        (datastore_ro_name, datastore_ro_password, datastore_name),
    ]
    connected = False
    for attempt in range(5):
        try:
            for user, password, db in credentials:
                if user:
                    connection_string = db_manager.get_external_connection_string(user, password, db)
                    db_manager.check_connection_string(connection_string)
            connected = True
            break
        except Exception as exc:
            logs.warning(str(exc))
            logs.info(f'Waiting for connection to db proxy...')
            # 40 seconds on first iteration - to ensure secret is updated in pgbouncer volume
            time.sleep(40 if attempt == 0 else 1)
            db_proxy_manager.reload()
            time.sleep(10 if attempt == 2 else 5)
    assert connected, 'failed to get connection to db proxy'
    yield {
        'step': 'update-db-proxy',
        'msg': f'Updated DB Proxy with the new dbs and roles: {db_name}, {datastore_name}, {datastore_ro_name}'
    }
def get_collection_status(collection_name):
    """Return readiness info for a SOLR collection.

    Queries the collection's schema endpoint. Returns a dict with
    ``ready: False`` when SOLR is unreachable, otherwise ``ready: True``
    plus the schema version/name (defaults used when the response cannot
    be parsed or lacks the expected structure).
    """
    output = solr_curl(f'/{collection_name}/schema')
    # solr_curl returns False (not an empty string) on failure - compare by identity
    if output is False:
        return {
            'ready': False,
            'collection_name': collection_name,
            'solr_http_endpoint': get_internal_http_endpoint()
        }
    def_ver, def_name = '2.8', 'ckan'
    res = {'schema': {'version': def_ver, 'name': def_name}}
    try:
        res = json.loads(output)
    except json.decoder.JSONDecodeError:
        logs.warning(
            f'Not able to decode response from SOLR. Using default values for schema version/name - {def_ver}/{def_name}\n SOLR response: \n{output}'
        )
    # guard: valid JSON may still lack the expected 'schema' structure
    schema = res.get('schema') or {}
    return {
        'ready': True,
        'collection_name': collection_name,
        'solr_http_endpoint': get_internal_http_endpoint(),
        'schemaVersion': schema.get('version', def_ver),
        'schemaName': schema.get('name', def_name)
    }
def delete_bucket(instance_id, dry_run=False):
    """Delete the S3 bucket attached to a CKAN instance.

    Empties the bucket first, then removes it (two-step deletion per
    ``aws s3 rb help``). With ``dry_run=True`` only reports what would be
    removed. No-op (with a warning) when no matching bucket exists.
    """
    s3_buckets = [
        name for name in list_s3_buckets(names_only=True)
        if name.startswith(f'{instance_id}-cc')
    ]
    if not s3_buckets:
        logs.warning(
            f'No bucket found for the instance "{instance_id}". Skipping.')
        return
    instance = kubectl.get(f'ckancloudckaninstance {instance_id}')
    # `or {}` guards against a spec without a ckanStorageBucket key
    # (the previous chained .get().get() raised AttributeError in that case)
    bucket = (instance['spec'].get('ckanStorageBucket') or {}).get(PROVIDER_ID)
    if not bucket:
        logs.warning('This instance does not have S3 bucket attached.')
        return
    bucket_name = bucket.get('BUCKET_NAME')
    cmd = f's3 rm {bucket_name} --recursive'
    if dry_run:
        cmd += ' --dryrun'
    # Two steps deletion. See the `aws s3 rb help`
    aws_check_output(cmd)
    if not dry_run:
        aws_check_output(f's3 rb {bucket_name}')
def get(routes, letsencrypt_cloudflare_email, enable_access_log=False, wildcard_ssl_domain=None, external_domains=False, dns_provider=None, force=False):
    """Build the nginx routing configuration for the given routes.

    Only the cloudflare dns provider is supported. With ``force=True``,
    per-route errors are logged and counted instead of aborting.
    """
    assert dns_provider == 'cloudflare'
    logs.info('Generating nginx configuration',
              routes_len=len(routes) if routes else 0,
              letsencrypt_cloudflare_email=letsencrypt_cloudflare_email,
              enable_access_log=enable_access_log,
              wildcard_ssl_domain=wildcard_ssl_domain,
              external_domains=external_domains)
    config, domains = {}, {}
    enable_ssl_redirect = True
    logs.info('Adding routes')
    added, failed = 0, 0
    for route in routes:
        try:
            _add_route(config, domains, route, enable_ssl_redirect)
            added += 1
        except Exception as exc:
            if not force:
                raise
            logs.error(traceback.format_exc())
            logs.error(str(exc))
            failed += 1
    logs.info(f'Added {added} routes')
    if failed > 0:
        logs.warning(f'Encountered {failed} errors')
    return config
def create_volume(disk_size_gb, labels, use_existing_disk_name=None, zone=0):
    """Create (or reuse) a PersistentVolumeClaim and return its volume spec.

    :param disk_size_gb: requested storage size in GB
    :param labels: accepted for interface compatibility but currently NOT
        applied to the PVC. The previous implementation built a
        ``k=v,...`` label string and never used it (dead code, removed
        here) - TODO: apply the labels to ``metadata.labels`` if needed.
    :param use_existing_disk_name: reuse this claim name instead of generating one
    :param zone: deprecated and ignored
    :return: a pod volume spec referencing the claim
    """
    if zone != 0:
        logs.warning(f'variable zone for create_volume has been deprecated.')
    disk_id = use_existing_disk_name or 'cc' + _generate_password(12)
    if use_existing_disk_name:
        logs.info(f'using existing persistent disk {disk_id}')
    else:
        logs.info(
            f'creating persistent disk {disk_id} with size {disk_size_gb}GB')
    kubectl.apply({
        "kind": "PersistentVolumeClaim",
        "apiVersion": "v1",
        "metadata": {
            "name": disk_id,
            "namespace": "ckan-cloud"
        },
        "spec": {
            "accessModes": ["ReadWriteOnce"],
            "resources": {
                "requests": {
                    "storage": f'{disk_size_gb}G'
                }
            },
            "storageClassName": "cca-ckan"
        }
    })
    return {'persistentVolumeClaim': {'claimName': disk_id}}
def create(instance_type, instance_id=None, instance_name=None, values=None, values_filename=None, exists_ok=False, dry_run=False,
           update_=False, wait_ready=False, skip_deployment=False, skip_route=False, force=False):
    """Create a new instance CRD of the given type and return its id.

    The spec comes from ``values`` or from ``values_filename`` ('-' reads
    stdin). An id is generated when not supplied. Raises when the instance
    already exists unless ``exists_ok``. Optionally runs ``update`` afterwards.
    """
    if not instance_id:
        if instance_name:
            instance_id = '{}-{}'.format(instance_name, _generate_password(6))
        else:
            instance_id = _generate_password(12)
    if values_filename:
        assert values is None
        # safe_load: the spec is plain data; yaml.load without a Loader is
        # deprecated and can construct arbitrary Python objects
        if values_filename != '-':
            with open(values_filename) as f:
                values = yaml.safe_load(f.read())
        else:
            values = yaml.safe_load(sys.stdin.read())
    if not exists_ok and crds_manager.get(INSTANCE_CRD_SINGULAR, name=instance_id, required=False):
        raise Exception('instance already exists')
    # guard: no values given / empty YAML document parses to None
    values = values or {}
    values_id = values.get('id')
    if values_id and values_id != instance_id:
        logs.warning(f'changing instance id in spec from {values_id} to the instance id {instance_id}')
    values['id'] = instance_id
    logs.info('Creating instance', instance_id=instance_id)
    kubectl.apply(crds_manager.get_resource(
        INSTANCE_CRD_SINGULAR, instance_id,
        extra_label_suffixes={'instance-type': instance_type},
        spec=values
    ), dry_run=dry_run)
    if instance_name:
        set_name(instance_id, instance_name, dry_run=dry_run)
    if update_:
        update(instance_id, wait_ready=wait_ready, skip_deployment=skip_deployment,
               skip_route=skip_route, force=force, dry_run=dry_run)
    return instance_id
def initialize(interactive=False):
    """Initialize the Minio storage configuration for the operator."""
    config_manager.interactive_set(
        {'default-storage-bucket': 'ckan'},
        secret_name='ckan-storage-config',
        interactive=interactive,
    )
    logs.warning('Minio bucket policy was not applied!')
def get_load_balancer_ip(router_name, failfast=False):
    """Return the load balancer hostname (on AWS) or IP for a router.

    Polls the router's loadbalancer service until an address is available.
    With ``failfast=True`` returns None immediately when the service does
    not exist. Raises AssertionError when no address was obtained after
    all retries.
    """
    resource_name = _get_resource_name(router_name)
    RETRIES = 10
    for retries in range(RETRIES):
        load_balancer = kubectl.get(f'service loadbalancer-{resource_name}', required=False)
        if not load_balancer:
            if failfast:
                return None
            else:
                # service not created yet - wait before retrying
                # (previously continued immediately, busy-looping through all retries)
                time.sleep(60)
                continue
        ingresses = load_balancer.get('status', {}).get('loadBalancer', {}).get('ingress', [])
        if len(ingresses) == 0:
            # no ingress allocated yet - wait before retrying
            time.sleep(60)
            continue
        assert len(ingresses) == 1
        if cluster_manager.get_provider_id() == 'aws':
            # AWS exposes load balancers by hostname rather than IP
            load_balancer_hostname = ingresses[0].get('hostname')
            if load_balancer_hostname:
                return load_balancer_hostname
            logs.warning('Failed to get hostname, retrying %r' % ingresses[0])
        else:
            load_balancer_ip = ingresses[0].get('ip')
            if load_balancer_ip:
                return load_balancer_ip
            logs.warning('Failed to get ip, retrying %r' % ingresses[0])
        time.sleep(60)
    # loop exhausted: retries == RETRIES - 1, so this always fails here
    assert retries < RETRIES - 1, "Gave up on waiting for load balancer IP"
def initialize_zookeeper(interactive=False, dry_run=False):
    """Provision zookeeper volumes/configmap and (interactively) roll deployments.

    Returns the list of zookeeper ``host:2181`` endpoints inside the cluster.
    """
    headless_service_name = _apply_zookeeper_headless_service(dry_run=dry_run)
    zk_instances = {}
    for suffix in _get_zk_suffixes():
        zk_instances[suffix] = {
            'host_name': suffix,
            'volume_spec': _get_or_create_volume(suffix, disk_size_gb=20, dry_run=dry_run),
        }
    zk_host_names = [instance['host_name'] for instance in zk_instances.values()]
    zk_configmap_name = _apply_zookeeper_configmap(zk_host_names)
    if not interactive:
        logs.warning('deployments are not updated in non-interactive mode')
    else:
        logs.info('Starting interactive update of zookeeper deployments')
        print(
            '\nDeployments will be done one by one, you should check if deployment succeeded before moving on to next one'
        )
        for zk_suffix, zk in zk_instances.items():
            answer = input(f'Update zookeeper deployment {zk_suffix}? [y/n]: ')
            if answer == 'y':
                _apply_zookeeper_deployment(zk_suffix, zk['volume_spec'],
                                            zk_configmap_name, headless_service_name,
                                            dry_run=dry_run)
    namespace = cluster_manager.get_operator_namespace_name()
    return [
        f'{h}.{headless_service_name}.{namespace}.svc.cluster.local:2181'
        for h in zk_host_names
    ]
def initialize_solrcloud(zk_host_names, pause_deployment, interactive=False, dry_run=False):
    """Provision solrcloud volumes/configmaps and (interactively) roll deployments.

    Returns the list of solrcloud host names.
    """
    sc_logs_configmap_name = _apply_solrcloud_logs_configmap()
    headless_service_name = _apply_solrcloud_headless_service(dry_run=dry_run)
    sc_instances = {}
    for suffix in _get_sc_suffixes():
        sc_instances[suffix] = {
            'host_name': suffix,
            'volume_spec': _get_or_create_volume(suffix, disk_size_gb=100, dry_run=dry_run),
        }
    sc_host_names = [instance['host_name'] for instance in sc_instances.values()]
    sc_configmap_name = _apply_solrcloud_configmap(zk_host_names)
    if not interactive:
        logs.warning('deployments are not updated in non-interactive mode')
    else:
        logs.info('Starting interactive update of solrcloud deployments')
        print(
            '\nDeployments will be done one by one, you should check if deployment succeeded before moving on to next one'
        )
        for sc_suffix, sc in sc_instances.items():
            answer = input(f'Update solrcloud deployment {sc_suffix}? [y/n]: ')
            if answer == 'y':
                _apply_solrcloud_deployment(sc_suffix, sc['volume_spec'],
                                            sc_configmap_name, sc_logs_configmap_name,
                                            headless_service_name, pause_deployment,
                                            dry_run=dry_run)
    return sc_host_names
def port_forward(db_prefix, all_daemon):
    """Run the DB proxy port-forward.

    With ``all_daemon`` (requires the exact confirmation string), spawn a
    detached port-forward per db prefix and return. Otherwise run a single
    blocking port-forward, restarting it on failure; exit catastrophically
    if it dies within 10 seconds of starting.
    """
    if all_daemon:
        assert not db_prefix and all_daemon == 'I know the risks'
        base_cmd = ['ckan-cloud-operator', 'db', 'proxy', 'port-forward']
        subprocess.Popen(base_cmd,
                         stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)
        for prefix in manager.get_provider().get_all_db_prefixes():
            subprocess.Popen(base_cmd + ['--db-prefix', prefix],
                             stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)
        return
    while True:
        started = datetime.datetime.now()
        try:
            manager.start_port_forward(db_prefix=db_prefix)
        except Exception:
            traceback.print_exc()
        lifetime = (datetime.datetime.now() - started).total_seconds()
        if lifetime < 10:
            logs.critical('DB Proxy failure')
            logs.exit_catastrophic_failure()
        else:
            logs.warning('Restarting the DB proxy')
def pre_update_hook(instance_id, instance, override_spec, skip_route=False, dry_run=False):
    """Prepare an instance for update: namespace, spec overrides, centralized
    infra enforcement, routing and admin user setup.

    Returns a dict populated by the route/admin sub-hooks.
    """
    _init_namespace(instance_id, dry_run=dry_run)
    _pre_update_hook_override_spec(override_spec, instance)
    if not instance['spec'].get('useCentralizedInfra'):
        # centralized infra is mandatory - override the spec when disabled
        logs.warning(
            'Forcing centralized infra even though useCentralizedInfra is disabled'
        )
        _pre_update_hook_modify_spec(instance_id, instance,
                                     lambda i: i.update(useCentralizedInfra=True),
                                     dry_run=dry_run)
    res = {}
    sub_domain, root_domain = _pre_update_hook_route(
        instance_id, skip_route, instance, res, dry_run=dry_run)
    _pre_update_hook_admin_user(
        instance, sub_domain, root_domain, instance_id, res, dry_run=dry_run)
    return res
def get(routes, letsencrypt_cloudflare_email, enable_access_log=False, wildcard_ssl_domain=None, external_domains=False, dns_provider=None, force=False):
    """Build the traefik configuration for the given routes, wiring up
    letsencrypt SSL for supported dns providers (cloudflare with an email,
    or route53)."""
    dns_provider = dns_provider or 'cloudflare'
    logs.info('Generating traefik configuration',
              routes_len=len(routes) if routes else 0,
              letsencrypt_cloudflare_email=letsencrypt_cloudflare_email,
              enable_access_log=enable_access_log,
              wildcard_ssl_domain=wildcard_ssl_domain,
              external_domains=external_domains)
    access_log_kwargs = {}
    if enable_access_log:
        access_log_kwargs = {
            'accessLog': {
                "format": "json",
                "fields": {
                    'defaultMode': "keep"
                }
            },
        }
    config = _get_base_config(**access_log_kwargs)
    domains = {}
    # SSL is possible with cloudflare (given an email) or route53
    ssl_supported = (
        (dns_provider == 'cloudflare' and bool(letsencrypt_cloudflare_email))
        or dns_provider == 'route53'
    )
    enable_ssl_redirect = ssl_supported
    logs.info(enable_ssl_redirect=enable_ssl_redirect)
    logs.info('Adding routes')
    added, failed = 0, 0
    for route in routes:
        try:
            _add_route(config, domains, route, enable_ssl_redirect)
            added += 1
        except Exception as exc:
            if not force:
                raise
            logs.error(traceback.format_exc())
            logs.error(str(exc))
            failed += 1
    logs.info(f'Added {added} routes')
    if failed > 0:
        logs.warning(f'Encountered {failed} errors')
    if ssl_supported:
        _add_letsencrypt(dns_provider, config, letsencrypt_cloudflare_email, domains,
                         wildcard_ssl_domain=wildcard_ssl_domain,
                         external_domains=external_domains)
    else:
        logs.info('No valid dns_provider, will not setup SSL', dns_provider=dns_provider)
    return config
def solr_curl(path, required=False, debug=False, max_retries=15):
    """Run curl against the solrcloud pod for the given solr path.

    Returns the curl output on success, False on failure when not
    ``required``. When ``required``, retries every 30 seconds up to
    ``max_retries`` times and raises after the retries are exhausted.
    """
    deployment_name = _get_resource_name(_get_sc_suffixes()[0])
    if debug:
        kubectl.check_call(
            f'exec deployment-pod::{deployment_name} -- curl \'localhost:8983/solr{path}\'',
            use_first_pod=True)
    else:
        exitcode, output = kubectl.getstatusoutput(
            f'exec deployment-pod::{deployment_name} -- curl -s -f \'localhost:8983/solr{path}\'',
            use_first_pod=True)
        if exitcode == 0:
            return output
        elif required:
            if max_retries > 0:
                logs.info(
                    f'Failed to run solr curl: localhost:8983/solr{path} - retring in 30 seconds'
                )
                time.sleep(30)
                # bug fix: propagate the retry's result - previously the
                # recursive call's value was discarded and we fell through
                # to raise even after a successful retry
                return solr_curl(path, required=required, debug=debug,
                                 max_retries=max_retries - 1)
            logs.critical(output)
            raise Exception(
                f'Failed to run solr curl: localhost:8983/solr{path}')
        else:
            logs.warning(output)
            return False
def update(self, wait_ready=False, skip_solr=False, skip_deployment=False):
    """Ensure the instance is updated to latest spec.

    Updates namespace, dbs, solr, storage, registry and envvars, then the
    deployment itself, waiting for the deployment generation to advance.
    With wait_ready=True, additionally polls self.get() until 'ready'.
    """
    # Capture the current deployment generation so we can detect that the
    # rollout below actually produced a new one.
    old_deployment = kubectl.get(f'deployment {self.id}', required=False, namespace=self.id)
    if old_deployment:
        old_deployment_generation = old_deployment.get('metadata', {}).get('generation')
    else:
        old_deployment_generation = None
    if old_deployment_generation:
        expected_new_deployment_generation = old_deployment_generation + 1
    else:
        # no prior deployment - first applied generation will be 1
        expected_new_deployment_generation = 1
    print(f'old deployment generation = {old_deployment_generation}')
    # Update the supporting resources before touching the deployment.
    DeisCkanInstanceNamespace(self).update()
    DeisCkanInstanceDb(self, 'db').update()
    DeisCkanInstanceDb(self, 'datastore').update()
    if not skip_solr:
        DeisCkanInstanceSolr(self).update()
    DeisCkanInstanceStorage(self).update()
    DeisCkanInstanceRegistry(self).update()
    envvars = DeisCkanInstanceEnvvars(self)
    envvars.update()
    if not skip_deployment:
        DeisCkanInstanceDeployment(self).update()
        # Poll until the deployment generation changes from the old value;
        # any other new generation than the expected one is treated as an error.
        while True:
            time.sleep(.2)
            new_deployment = kubectl.get(f'deployment {self.id}', required=False, namespace=self.id)
            if not new_deployment:
                continue
            new_deployment_generation = new_deployment.get('metadata', {}).get('generation')
            if not new_deployment_generation:
                continue
            if new_deployment_generation == old_deployment_generation:
                continue
            if new_deployment_generation != expected_new_deployment_generation:
                raise Exception(f'Invalid generation: {new_deployment_generation} '
                                f'(expected: {expected_new_deployment_generation}')
            print(f'new deployment generation: {new_deployment_generation}')
            break
        # NOTE(review): wait_ready is only honored when the deployment was
        # updated - confirm this nesting is intended.
        if wait_ready:
            print('Waiting for ready status')
            time.sleep(3)
            while True:
                data = self.get()
                if data.get('ready'):
                    print(yaml.dump(data, default_flow_style=False))
                    break
                else:
                    # show only the not-yet-ready sub-components (and namespace)
                    print(yaml.dump(
                        {
                            k: v for k, v in data.items()
                            if (k not in ['ready'] and type(v) == dict and not v.get('ready'))
                            or k == 'namespace'
                        },
                        default_flow_style=False)
                    )
                    time.sleep(2)
    self.ckan.update()
    # best-effort: datastore permission errors should not fail the update
    try:
        DeisCkanInstanceDb(self, 'datastore').set_datastore_readonly_permissions()
    except Exception:
        logs.warning('Setting datastore permissions failed, continuing anyway')
    # Create/Update uptime monitoring after everything else is ready
    DeisCkanInstanceUptime(self).update(envvars.site_url)
def delete_bucket(instance_id, dry_run=False):
    """Delete the gcloud storage bucket of an instance (no-op when absent)."""
    if instance_id not in list_gcloud_buckets():
        logs.warning(
            f'No bucket found for the instance "{instance_id}". Skipping.')
        return
    logs.info(f'Removing bucket for instance_id {instance_id}')
    if dry_run:
        return
    gcloud_check_output(f'rm -r gs://{instance_id}', gsutil=True)
def delete(instance_id, instance):
    """Delete the app instance via its app-type manager; raise on failure."""
    errors = []
    try:
        app_type = instance['spec'].get('app-type')
        _get_app_type_manager(app_type).delete(instance_id, instance)
    except Exception:
        logs.warning(traceback.format_exc())
        errors.append(f'Failed to delete app')
    # surface all accumulated failures at once
    assert len(errors) == 0, ', '.join(errors)
def get_dynamic(routes, letsencrypt_cloudflare_email, wildcard_ssl_domain=None, external_domains=False, dns_provider=None, force=False):
    """Build the traefik v2 dynamic configuration (routers, services and the
    SSLRedirect middleware) for the given routes."""
    dns_provider = dns_provider or 'cloudflare'
    logs.info('Generating traefik v2 dynamic configuration',
              routes_len=len(routes) if routes else 0,
              letsencrypt_cloudflare_email=letsencrypt_cloudflare_email,
              wildcard_ssl_domain=wildcard_ssl_domain,
              external_domains=external_domains)
    dynamic_config = {
        'http': {
            'routers': {},
            'services': {},
            'middlewares': {
                'SSLRedirect': {
                    'redirectScheme': {
                        'scheme': 'https',
                        'permanent': True
                    }
                }
            }
        }
    }
    domains = {}
    # SSL redirect is enabled for providers that support letsencrypt here
    enable_ssl_redirect = (
        dns_provider in ('route53', 'azure')
        or (dns_provider == 'cloudflare' and bool(letsencrypt_cloudflare_email))
    )
    logs.info(enable_ssl_redirect=enable_ssl_redirect)
    logs.info('Adding routes')
    added, failed = 0, 0
    for route in routes:
        try:
            _add_route(dynamic_config, domains, route, enable_ssl_redirect,
                       external_domains, wildcard_ssl_domain)
            added += 1
        except Exception as exc:
            if not force:
                raise
            logs.error(traceback.format_exc())
            logs.error(str(exc))
            failed += 1
    logs.info(f'Added {added} routes')
    if failed > 0:
        logs.warning(f'Encountered {failed} errors')
    return dynamic_config
def initialize(interactive=False):
    """Initialize storage configuration from the CKAN infra defaults."""
    ckan_infra = CkanInfra(required=False)
    config_manager.interactive_set(
        {'default-storage-bucket': ckan_infra.GCLOUD_STORAGE_BUCKET},
        secret_name='ckan-storage-config',
        interactive=interactive,
    )
    logs.warning('Minio bucket policy was not applied!')
def _get_running_pod_name(instance_id, service='ckan'):
    """Block until a pod of the service's deployment is Running; return its name."""
    while True:
        try:
            # returns on the first successful lookup
            return kubectl.get_deployment_pod_name(service, instance_id,
                                                   use_first_pod=True,
                                                   required_phase='Running')
        except Exception as e:
            logs.warning('Failed to find running ckan pod', str(e))
            time.sleep(20)
def delete_bucket(instance_id, dry_run=False):
    """Delete the Azure storage container of an instance (no-op when absent or dry_run)."""
    if instance_id not in list_azure_buckets():
        logs.warning(f'No bucket found for the instance "{instance_id}". Skipping.')
        return
    if dry_run:
        return
    logs.info(f'Removing Azure storage bucket for instance_id {instance_id}')
    credentials = _get_cred_options()
    az_check_output(f'storage container delete -n {instance_id} {credentials}')
def delete(instance_id, instance):
    """Delete the instance's helm release and namespace; raise if either fails."""
    tiller_namespace_name = _get_resource_name()
    ckan_helm_release_name = f'ckan-cloud-{instance_id}'
    errors = []
    try:
        logs.info(f'Deleting helm release {ckan_helm_release_name}')
        helm_driver.delete(tiller_namespace_name, ckan_helm_release_name)
    except Exception:
        logs.warning(traceback.format_exc())
        errors.append(f'Failed to delete helm release')
    # namespace deletion is fire-and-forget (--wait=false); only the call's
    # exit code is checked
    if kubectl.call(f'delete --wait=false namespace {instance_id}') != 0:
        errors.append(f'Failed to delete namespace')
    assert len(errors) == 0, ', '.join(errors)
def port_forward():
    """Run the DB proxy port-forward forever, restarting it on failure.

    Exits catastrophically if the proxy dies within 10 seconds of starting,
    since a fast repeated crash indicates a configuration problem.
    """
    while True:
        started = datetime.datetime.now()
        try:
            manager.start_port_forward()
        except Exception:
            traceback.print_exc()
        lifetime = (datetime.datetime.now() - started).total_seconds()
        if lifetime < 10:
            logs.critical('DB Proxy failure')
            logs.exit_catastrophic_failure()
        else:
            logs.warning('Restarting the DB proxy')
def _pre_update_hook_route(instance_id, skip_route, instance, res, dry_run=False):
    """Normalize the instance's routing spec before an update.

    Forces domain, withSansSSL, registerSubdomain and siteUrl to the
    canonical values derived from the default root domain, logging a
    warning for every value that is overridden. Mutates `res` in place
    with the chosen root/sub domains and returns (sub_domain, root_domain).
    """
    root_domain = routers_manager.get_default_root_domain()
    # canonical subdomain for this instance on the default root domain
    sub_domain = f'ckan-cloud-{instance_id}'
    if not skip_route:
        # full domain to route to the instance
        instance_domain = instance['spec'].get('domain')
        if instance_domain and instance_domain != f'{sub_domain}.{root_domain}':
            logs.warning(f'instance domain was changed from {instance_domain} to {sub_domain}.{root_domain}')
            _pre_update_hook_modify_spec(instance_id, instance,
                                         lambda i: i.update(domain=f'{sub_domain}.{root_domain}'),
                                         dry_run=dry_run)
        # instance is added to router only if this is true, as all routers must use SSL and may use sans SSL too
        with_sans_ssl = instance['spec'].get('withSansSSL')
        if not with_sans_ssl:
            logs.warning(f'forcing with_sans_ssl, even though withSansSSL is disabled')
            _pre_update_hook_modify_spec(instance_id, instance,
                                         lambda i: i.update(withSansSSL=True),
                                         dry_run=dry_run)
        # subdomain to register on the default root domain
        register_subdomain = instance['spec'].get('registerSubdomain')
        if register_subdomain != sub_domain:
            logs.warning(f'instance register sub domain was changed from {register_subdomain} to {sub_domain}')
            _pre_update_hook_modify_spec(instance_id, instance,
                                         lambda i: i.update(registerSubdomain=sub_domain),
                                         dry_run=dry_run)
        res.update(**{'root-domain': root_domain, 'sub-domain': sub_domain})
        # siteUrl must match the canonical https URL for the routed domain
        site_url = instance['spec'].get('siteUrl')
        if site_url != f'https://{sub_domain}.{root_domain}':
            logs.warning(f'instance siteUrl was changed from {site_url} to https://{sub_domain}.{root_domain}')
            _pre_update_hook_modify_spec(instance_id, instance,
                                         lambda i: i.update(siteUrl=f'https://{sub_domain}.{root_domain}'),
                                         dry_run=dry_run)
    return sub_domain, root_domain
def get_bucket(instance_id):
    """Return the Google Cloud storage bucket spec attached to an instance.

    Returns None (with a warning) when no bucket exists for the instance
    or when the instance spec has no bucket attached.
    """
    if instance_id not in list_gcloud_buckets():
        logs.warning(
            f'No bucket found for the instance "{instance_id}" on Google Cloud. Skipping.'
        )
        return
    instance = kubectl.get(f'ckancloudckaninstance {instance_id}')
    # `or {}` guards against a spec without a ckanStorageBucket key
    # (the previous chained .get().get() raised AttributeError in that case)
    bucket = (instance['spec'].get('ckanStorageBucket') or {}).get(PROVIDER_ID)
    if not bucket:
        logs.warning(
            'This instance does not have Google Cloud bucket attached.')
        return
    return {'instance_id': instance_id, 'ckanStorageBucket': bucket}
def get_bucket(instance_id):
    """Return the Azure storage bucket spec attached to an instance.

    Returns None (with a warning) when no container exists in the storage
    account or when the instance spec has no bucket attached.
    """
    if instance_id not in list_azure_buckets():
        logs.warning(f'No bucket found for the instance "{instance_id}" in this Azure storage account. Skipping.')
        return
    instance = kubectl.get(f'ckancloudckaninstance {instance_id}')
    # `or {}` guards against a spec without a bucket key
    # (the previous chained .get().get() raised AttributeError in that case)
    bucket = (instance['spec'].get('bucket') or {}).get(PROVIDER_ID)
    if not bucket:
        logs.warning('This instance does not have Azure bucket attached.')
        return
    return {
        'instance_id': instance_id,
        'bucket': bucket
    }
def solr_curl(path, required=False, debug=False):
    """Run curl against the solrcloud pod for the given solr path.

    Returns the curl output on success; False on failure when not
    ``required``, otherwise raises. ``debug`` streams the output instead.
    """
    deployment_name = _get_resource_name(_get_sc_suffixes()[0])
    if debug:
        kubectl.check_call(
            f'exec deployment-pod::{deployment_name} -- curl \'localhost:8983/solr{path}\'',
            use_first_pod=True)
        return
    exitcode, output = kubectl.getstatusoutput(
        f'exec deployment-pod::{deployment_name} -- curl -s -f \'localhost:8983/solr{path}\'',
        use_first_pod=True)
    if exitcode == 0:
        return output
    if required:
        logs.critical(output)
        raise Exception(f'Failed to run solr curl: localhost:8983/solr{path}')
    logs.warning(output)
    return False
def create_ckan_admin_user(instance_id, instance, user):
    """Create a CKAN sysadmin user inside a running ckan pod of the instance.

    Waits (retrying every 20s) until a Running ckan pod is found, then runs
    ckan-paster sysadmin add in that pod.

    NOTE(review): name/password/email are interpolated into a shell command
    (shell=True) and the password is written to the info log - consider
    hardening both.
    """
    pod_name = None
    while not pod_name:
        try:
            pod_name = kubectl.get_deployment_pod_name('ckan', instance_id,
                                                       use_first_pod=True,
                                                       required_phase='Running')
            break
        except Exception as e:
            logs.warning('Failed to find running ckan pod', str(e))
            time.sleep(20)
    name, password, email = (user[k] for k in ['name', 'password', 'email'])
    logs.info(f'Creating CKAN admin user with {name} ({email}) and {password} on pod {pod_name}')
    logs.subprocess_check_call(
        f'echo y | kubectl -n {instance_id} exec -i {pod_name} -- ckan-paster --plugin=ckan sysadmin -c /etc/ckan/production.ini add {name} password={password} email={email}',
        shell=True
    )
def solr_curl(path, required=False, debug=False):
    """Dispatch a solr curl either to the self-hosted provider or directly to
    the internal http endpoint."""
    if is_self_hosted():
        return get_provider().solr_curl(path, required=required, debug=debug)
    http_endpoint = get_internal_http_endpoint()
    if debug:
        subprocess.check_call(f'curl \'{http_endpoint}{path}\'')
        return
    exitcode, output = subprocess.getstatusoutput(f'curl -s -f \'{http_endpoint}{path}\'')
    if exitcode == 0:
        return output
    if required:
        logs.critical(output)
        raise Exception(f'Failed to run solr curl: {http_endpoint}{path}')
    logs.warning(output)
    return False
def get_bucket(instance_id):
    """Return the S3 storage bucket spec attached to an instance.

    Returns None (with a warning) when no matching S3 bucket exists or
    when the instance spec has no bucket attached.
    """
    s3_buckets = [
        name for name in list_s3_buckets(names_only=True)
        if name.startswith(f'{instance_id}-cc')
    ]
    if not s3_buckets:
        logs.warning(
            f'No bucket found for the instance "{instance_id}" on S3. Skipping.'
        )
        return
    instance = kubectl.get(f'ckancloudckaninstance {instance_id}')
    # `or {}` guards against a spec without a ckanStorageBucket key
    # (the previous chained .get().get() raised AttributeError in that case)
    bucket = (instance['spec'].get('ckanStorageBucket') or {}).get(PROVIDER_ID)
    if not bucket:
        logs.warning('This instance does not have S3 bucket attached.')
        return
    return {'instance_id': instance_id, 'ckanStorageBucket': bucket}