def get_credentials(cls) -> DefaultAzureCredential:
    """Return the cached Azure credential, creating and smoke-testing it on first call.

    On first use, a ``DefaultAzureCredential`` is built and validated by
    requesting a token for the Azure Storage scope. The SDK's noisy stderr
    chatter during that probe is captured and only surfaced if the probe
    fails with something other than an authentication error.

    Returns:
        The shared ``DefaultAzureCredential`` instance.

    Raises:
        Any non-``ClientAuthenticationError`` exception raised while
        fetching the probe token.
    """
    if cls.credentials is None:
        cls.credentials = DefaultAzureCredential()
        captured_stderr = StringIO()
        try:
            # Probe the credential once so misconfiguration fails early.
            with redirect_stderr(captured_stderr):
                cls.credentials.get_token("https://storage.azure.com/")
        except ClientAuthenticationError:
            # Auth errors are tolerated here; callers handle auth later.
            pass
        except Exception:
            # Surface the SDK's suppressed stderr output, then re-raise.
            logger.error(captured_stderr.getvalue())
            raise
    return cls.credentials
def _upgrade() -> None:
    """Best-effort self-upgrade: download and run the installer if a newer version exists.

    Failures are logged (with manual-install instructions) rather than
    propagated; the temporary installation file is always cleaned up.
    """
    try:
        # Only proceed when the version check reports an available upgrade.
        if check_version_upgrade(is_upgrade_call=True):
            _make_installation_file()
            # Feed "y" to the installer's confirmation prompt.
            nice_run([f"./{TEMP_INSTALLATION_FILENAME}"], input=b"y")
            _upgrade_successful()
    except Exception:
        # Deliberately swallow: upgrading is best-effort, point at the docs.
        logger.error(
            "\nUnable to install latest version of Opta."
            "\nPlease follow the instructions on https://docs.opta.dev/installation"
        )
    finally:
        _cleanup_installation_file()
def tail_pod_log(
    namespace: str, pod: V1Pod, color_idx: int, seconds: Optional[int]
) -> None:
    """Stream a pod's container logs to stdout until the pod goes away.

    Watches the ``k8s-service`` container of *pod* and prints each log line
    colorized with *color_idx*. Transient errors are retried up to 15 times
    with a growing sleep; a 404 from the API means the pod was terminated
    and ends the tail cleanly.

    Args:
        namespace: Kubernetes namespace the pod lives in.
        pod: The pod whose logs to tail.
        color_idx: fg() color index used for this pod's output.
        seconds: If set, only logs newer than this many seconds are shown.
    """
    v1 = CoreV1Api()
    watch = Watch()
    print(
        f"{fg(color_idx)}Showing the logs for server {pod.metadata.name} of your service{attr(0)}"
    )
    retry_count = 0
    while True:
        try:
            for logline in watch.stream(
                v1.read_namespaced_pod_log,
                name=pod.metadata.name,
                namespace=namespace,
                container="k8s-service",
                since_seconds=seconds,
            ):
                print(f"{fg(color_idx)}{pod.metadata.name} {logline}{attr(0)}")
        except Exception as e:
            # Fix: use isinstance() instead of `type(e) == ApiException`.
            # A 404 means the pod is gone, so the tail is finished.
            if isinstance(e, ApiException) and e.status == 404:
                print(
                    f"{fg(color_idx)}Server {pod.metadata.name} has been terminated{attr(0)}"
                )
                return
            if retry_count < 15:
                # Back off proportionally to the number of retries so far.
                print(
                    f"{fg(color_idx)}Couldn't get logs, waiting a bit and retrying{attr(0)}"
                )
                time.sleep(retry_count)
                retry_count += 1
            else:
                logger.error(
                    f"Got the following error while trying to fetch the logs for pod {pod.metadata.name} in namespace {namespace}: {e}"
                )
                return
def _apply(
    config: str,
    env: Optional[str],
    refresh: bool,
    local: bool,
    image_tag: Optional[str],
    test: bool,
    auto_approve: bool,
    input_variables: Dict[str, str],
    image_digest: Optional[str] = None,
    stdout_logs: bool = True,
    detailed_plan: bool = False,
) -> None:
    """Plan and apply the opta layer described by *config*, module block by module block.

    High-level flow: sanity checks -> load/validate layer -> verify no
    terraform lock -> (AWS only) require a region with >= 3 AZs -> create
    state storage -> pick the cloud client -> version-compatibility check ->
    generate terraform per module block, plan, confirm, apply, and upload the
    config. Analytics events are sent at start, per apply, and on finish.

    Args:
        config: Path to the opta yaml file.
        env: Environment name to load from the config, if any.
        refresh: When True, re-applies even blocks with no new modules.
        local: Run against the local (docker-based) environment.
        image_tag / image_digest: Image identifiers forwarded to generation.
        test: Plan only; never apply.
        auto_approve: Skip interactive confirmation and pass -auto-approve.
        input_variables: User-supplied variable overrides.
        stdout_logs: When True, tail service logs/events during apply.
        detailed_plan: Show the full terraform plan instead of a summary.

    Raises:
        UserErrors: On an existing tf lock, too few AZs, or version mismatch.
    """
    pre_check()
    _clean_tf_folder()
    if local and not test:
        config = local_setup(config, input_variables, image_tag, refresh_local_env=True)
    layer = Layer.load_from_yaml(config, env, input_variables=input_variables)
    layer.verify_cloud_credentials()
    layer.validate_required_path_dependencies()
    # Refuse to run while another opta/terraform process holds the state lock.
    if Terraform.download_state(layer):
        tf_lock_exists, _ = Terraform.tf_lock_details(layer)
        if tf_lock_exists:
            raise UserErrors(USER_ERROR_TF_LOCK)
    _verify_parent_layer(layer, auto_approve)

    event_properties: Dict = layer.get_event_properties()
    amplitude_client.send_event(
        amplitude_client.START_GEN_EVENT,
        event_properties=event_properties,
    )

    # We need a region with at least 3 AZs for leader election during failover.
    # Also EKS historically had problems with regions that have fewer than 3 AZs.
    if layer.cloud == "aws":
        providers = layer.gen_providers(0)["provider"]
        aws_region = providers["aws"]["region"]
        azs = _fetch_availability_zones(aws_region)
        if len(azs) < 3:
            raise UserErrors(
                fmt_msg(
                    f"""
                Opta requires a region with at least *3* availability zones like us-east-1 or us-west-2.
                ~You configured {aws_region}, which only has the availability zones: {azs}.
                ~Please choose a different region.
                """
                )
            )

    Terraform.create_state_storage(layer)
    gen_opta_resource_tags(layer)

    # Select the cloud client matching the layer's declared cloud provider.
    cloud_client: CloudClient
    if layer.cloud == "aws":
        cloud_client = AWS(layer)
    elif layer.cloud == "google":
        cloud_client = GCP(layer)
    elif layer.cloud == "azurerm":
        cloud_client = Azure(layer)
    elif layer.cloud == "local":
        if local:  # boolean passed via cli
            pass
        cloud_client = Local(layer)
    elif layer.cloud == "helm":
        cloud_client = HelmCloudClient(layer)
    else:
        raise Exception(f"Cannot handle upload config for cloud {layer.cloud}")

    # Compare the opta version that last touched this config with ours.
    existing_config: Optional[StructuredConfig] = cloud_client.get_remote_config()
    old_semver_string = (
        ""
        if existing_config is None
        else existing_config.get("opta_version", "").strip("v")
    )
    current_semver_string = VERSION.strip("v")
    _verify_semver(old_semver_string, current_semver_string, layer, auto_approve)

    try:
        existing_modules: Set[str] = set()
        first_loop = True
        # gen() yields one (index, modules, total) tuple per module block.
        for module_idx, current_modules, total_block_count in gen(
            layer, existing_config, image_tag, image_digest, test, True, auto_approve
        ):
            if first_loop:
                # This is set during the first iteration, since the tf file must exist.
                existing_modules = Terraform.get_existing_modules(layer)
                first_loop = False
            configured_modules = set([x.name for x in current_modules])
            is_last_module = module_idx == total_block_count - 1
            has_new_modules = not configured_modules.issubset(existing_modules)
            # Skip blocks that are already fully applied (unless refreshing).
            if not is_last_module and not has_new_modules and not refresh:
                continue
            if is_last_module:
                # The final block also targets modules removed from the config
                # so terraform can destroy them.
                untouched_modules = existing_modules - configured_modules
                configured_modules = configured_modules.union(untouched_modules)

            layer.pre_hook(module_idx)
            if layer.cloud == "local":
                if is_last_module:
                    targets = []
            else:
                # NOTE(review): for a "local" cloud non-last block, `targets`
                # is left unbound here — confirm this path cannot be reached.
                targets = list(
                    map(lambda x: f"-target=module.{x}", sorted(configured_modules))
                )

            if test:
                Terraform.plan("-lock=false", *targets, layer=layer)
                print("Plan ran successfully, not applying since this is a test.")
            else:
                current_properties = event_properties.copy()
                current_properties["module_idx"] = module_idx
                amplitude_client.send_event(
                    amplitude_client.APPLY_EVENT,
                    event_properties=current_properties,
                )
                logger.info("Planning your changes (might take a minute)")
                try:
                    Terraform.plan(
                        "-lock=false",
                        "-input=false",
                        f"-out={TF_PLAN_PATH}",
                        layer=layer,
                        *targets,
                        quiet=True,
                    )
                except CalledProcessError as e:
                    # Surface terraform's stderr before propagating.
                    logger.error(e.stderr or "")
                    raise e
                PlanDisplayer.display(detailed_plan=detailed_plan)

                if not auto_approve:
                    click.confirm(
                        "The above are the planned changes for your opta run. Do you approve?",
                        abort=True,
                    )
                logger.info("Applying your changes (might take a minute)")

                # If this block deploys a k8s service and a cluster already
                # exists, tail its logs and namespace events in daemon threads.
                service_modules = (
                    layer.get_module_by_type("k8s-service", module_idx)
                    if layer.cloud == "aws"
                    else layer.get_module_by_type("gcp-k8s-service", module_idx)
                )
                if (
                    len(service_modules) != 0
                    and cluster_exist(layer.root())
                    and stdout_logs
                ):
                    service_module = service_modules[0]
                    # Tailing logs
                    logger.info(
                        f"Identified deployment for kubernetes service module {service_module.name}, tailing logs now."
                    )
                    new_thread = Thread(
                        target=tail_module_log,
                        args=(
                            layer,
                            service_module.name,
                            10,
                            datetime.datetime.utcnow().replace(tzinfo=pytz.UTC),
                            2,
                        ),
                        daemon=True,
                    )
                    # Tailing events
                    new_thread.start()
                    new_thread = Thread(
                        target=tail_namespace_events,
                        args=(
                            layer,
                            datetime.datetime.utcnow().replace(tzinfo=pytz.UTC),
                            3,
                        ),
                        daemon=True,
                    )
                    new_thread.start()

                tf_flags: List[str] = []
                if auto_approve:
                    tf_flags.append("-auto-approve")
                try:
                    Terraform.apply(
                        layer, *tf_flags, TF_PLAN_PATH, no_init=True, quiet=False
                    )
                except Exception as e:
                    # post_hook always runs, with the error (or None) attached.
                    layer.post_hook(module_idx, e)
                    raise e
                else:
                    layer.post_hook(module_idx, None)
                # Persist the applied config remotely after each block.
                cloud_client.upload_opta_config()
        logger.info("Opta updates complete!")
    except Exception as e:
        event_properties["success"] = False
        event_properties["error_name"] = e.__class__.__name__
        raise e
    else:
        event_properties["success"] = True
    finally:
        # Always report the run's outcome to analytics.
        amplitude_client.send_event(
            amplitude_client.FINISH_GEN_EVENT,
            event_properties=event_properties,
        )
def tail_namespace_events(
    layer: "Layer",
    earliest_event_start_time: Optional[datetime.datetime] = None,
    color_idx: int = 15,  # White Color
) -> None:
    """Print past and live Kubernetes events for the layer's namespace.

    First prints existing events (optionally only those newer than
    *earliest_event_start_time*, sorted by last-observed time), then watches
    the namespace and prints new events as they arrive. Events about pods
    that were already reported as deleted are suppressed, as are events
    rejected by do_not_show_event(). API errors are retried up to 5 times
    with exponential backoff; any other error ends the tail.

    Args:
        layer: Layer whose name doubles as the namespace to watch.
        earliest_event_start_time: If given, ignore events observed before it.
        color_idx: fg() color index for this tail's output.
    """
    load_opta_kube_config()
    v1 = EventsV1Api()
    watch = Watch()
    print(f"{fg(color_idx)}Showing events for namespace {layer.name}{attr(0)}")
    retry_count = 0
    old_events: List[EventsV1Event] = v1.list_namespaced_event(
        namespace=layer.name
    ).items
    # Filter by time
    if earliest_event_start_time is not None:
        # Redefine so mypy doesn't complain about earliest_event_start_time being Optional during lambda call
        filter_start_time = earliest_event_start_time
        old_events = list(
            filter(
                lambda x: _event_last_observed(x) > filter_start_time,
                old_events,
            )
        )
    # Sort by timestamp
    old_events = sorted(old_events, key=lambda x: _event_last_observed(x))
    event: EventsV1Event
    for event in old_events:
        if do_not_show_event(event):
            continue
        # Track the newest printed event so the live watch below only
        # reports events strictly after it.
        earliest_event_start_time = _event_last_observed(event)
        print(
            f"{fg(color_idx)}{earliest_event_start_time} Namespace {layer.name} event: {event.note}{attr(0)}"
        )
    deleted_pods = set()
    while True:
        try:
            for stream_obj in watch.stream(
                v1.list_namespaced_event,
                namespace=layer.name,
            ):
                event = stream_obj["object"]
                event_time = _event_last_observed(event)
                if (
                    earliest_event_start_time is None
                    or event_time > earliest_event_start_time
                ):
                    # Remember deleted pods so their later events are muted.
                    if "Deleted pod:" in event.note:
                        deleted_pods.add(event.note.split(" ")[-1])
                    involved_object: Optional[V1ObjectReference] = event.regarding
                    if (
                        involved_object is not None
                        and involved_object.kind == "Pod"
                        and involved_object.name in deleted_pods
                    ):
                        continue
                    if do_not_show_event(event):
                        continue
                    print(
                        f"{fg(color_idx)}{event_time} Namespace {layer.name} event: {event.note}{attr(0)}"
                    )
        except ApiException as e:
            if retry_count < 5:
                # Exponential backoff: 1, 2, 4, 8, 16 seconds.
                print(
                    f"{fg(color_idx)}Couldn't get logs, waiting a bit and retrying{attr(0)}"
                )
                time.sleep(1 << retry_count)
                retry_count += 1
            else:
                logger.error(
                    f"{fg(color_idx)}Got the following error while trying to fetch the events in namespace {layer.name}: {e}"
                )
                return
        except Exception as e:
            # Unexpected failure: log (with traceback at debug level) and stop.
            logger.error(
                f"{fg(color_idx)}Got the following error while trying to fetch the events in namespace {layer.name}: {e}{attr(0)}"
            )
            logger.debug("Event watch exception", exc_info=True)
            return
# Register the remaining subcommands on the click CLI group.
cli.add_command(generate_terraform)
cli.add_command(help)
cli.add_command(show)

if __name__ == "__main__":
    try:
        # In case OPTA_DEBUG is set, local state files may not be cleaned up
        # after the command.
        # However, we should still clean them up before the next command, or
        # else it may interfere with it.
        one_time()
        cleanup_files()
        cli()
    except UserErrors as e:
        # Expected user-facing errors: plain message unless OPTA_DEBUG asks
        # for the full traceback; always exit non-zero.
        if os.environ.get("OPTA_DEBUG") is None:
            logger.error(str(e))
        else:
            logger.exception(str(e))
        logger.info(
            f"{fg('magenta')}If you need more help please reach out to the contributors in our slack channel at: https://slack.opta.dev{attr(0)}"
        )
        sys.exit(1)
    except Exception as e:
        # Unexpected crash: log the traceback and produce a crash-report zip.
        # NOTE(review): this branch does not sys.exit(1), so the process exits
        # 0 after a crash report — confirm that is intentional.
        logger.exception(str(e))
        logger.info(
            f"{fg('red')}Unhandled error encountered -- a crash report zipfile has been created for you. "
            "If you need more help please reach out (passing the crash report) to the contributors in our "
            f"slack channel at: https://slack.opta.dev{attr(0)}"
            "\nHint: As a first step in debugging, try rerunning the command and seeing if it still fails."
        )
        CURRENT_CRASH_REPORTER.generate_report()
def destroy( config: str, env: Optional[str], auto_approve: bool, detailed_plan: bool, local: Optional[bool], var: Dict[str, str], ) -> None: """Destroy all opta resources from the current config To destroy an environment, you have to first destroy all the services first. Examples: opta destroy -c my-service.yaml --auto-approve opta destroy -c my-env.yaml --auto-approve """ try: opta_acquire_lock() pre_check() logger.warning( "You are destroying your cloud infra state. DO NOT, I REPEAT, DO NOT do this as " "an attempt to debug a weird/errored apply. What you have created is not some ephemeral object that can be " "tossed arbitrarily (perhaps some day) and destroying unnecessarily just to reapply typically makes it " "worse. If you're doing this cause you are really trying to destroy the environment entirely, then that's" "perfectly fine-- if not then please reach out to the opta team in the slack workspace " "(https://slack.opta.dev) and I promise that they'll be happy to help debug." ) config = check_opta_file_exists(config) if local: config, _ = _handle_local_flag(config, False) _clean_tf_folder() layer = Layer.load_from_yaml(config, env, input_variables=var) event_properties: Dict = layer.get_event_properties() amplitude_client.send_event( amplitude_client.DESTROY_EVENT, event_properties=event_properties, ) layer.verify_cloud_credentials() layer.validate_required_path_dependencies() if not Terraform.download_state(layer): logger.info( "The opta state could not be found. This may happen if destroy ran successfully before." ) return tf_lock_exists, _ = Terraform.tf_lock_details(layer) if tf_lock_exists: raise UserErrors(USER_ERROR_TF_LOCK) # Any child layers should be destroyed first before the current layer. children_layers = _fetch_children_layers(layer) if children_layers: # TODO: ideally we can just automatically destroy them but it's # complicated... logger.error( "Found the following services that depend on this environment. 
Please run `opta destroy` on them first!\n" + "\n".join(children_layers) ) raise UserErrors("Dependant services found!") tf_flags: List[str] = [] if auto_approve: sleep_time = 5 logger.info( f"{attr('bold')}Opta will now destroy the {attr('underlined')}{layer.name}{attr(0)}" f"{attr('bold')} layer.{attr(0)}\n" f"{attr('bold')}Sleeping for {attr('underlined')}{sleep_time} secs{attr(0)}" f"{attr('bold')}, press Ctrl+C to Abort.{attr(0)}" ) time.sleep(sleep_time) tf_flags.append("-auto-approve") modules = Terraform.get_existing_modules(layer) layer.modules = [x for x in layer.modules if x.name in modules] gen_all(layer) Terraform.init(False, "-reconfigure", layer=layer) Terraform.refresh(layer) idx = len(layer.modules) - 1 for module in reversed(layer.modules): try: module_address_prefix = f"-target=module.{module.name}" logger.info("Planning your changes (might take a minute)") Terraform.plan( "-lock=false", "-input=false", "-destroy", f"-out={TF_PLAN_PATH}", layer=layer, *list([module_address_prefix]), ) PlanDisplayer.display(detailed_plan=detailed_plan) tf_flags = [] if not auto_approve: click.confirm( "The above are the planned changes for your opta run. Do you approve?", abort=True, ) else: tf_flags.append("-auto-approve") Terraform.apply(layer, *tf_flags, TF_PLAN_PATH, no_init=True, quiet=False) layer.post_delete(idx) idx -= 1 except Exception as e: raise e Terraform.delete_state_storage(layer) finally: opta_release_lock()