def __init__(self, twine, handle_monitor_message=None, **kwargs):
    """Set up the analysis from a twine and whatever strand data is given as keyword arguments.

    :param twined.Twine|any twine: a ``Twine`` instance, or a source passed to ``Twine(source=...)`` to build one
    :param callable|None handle_monitor_message: stored as the handler for monitor messages
    :param kwargs: strand data keyed by any name in ``ALL_STRANDS``, an optional ``output_location``, and any remaining arguments, which are forwarded to the parent initialiser
    :return None:
    """
    self.twine = twine if isinstance(twine, Twine) else Twine(source=twine)
    self._handle_monitor_message = handle_monitor_message

    # Take every known strand out of the keyword arguments (defaulting to `None`) so that only non-strand
    # arguments are left to pass on to the parent class.
    strands = {strand_name: kwargs.pop(strand_name, None) for strand_name in ALL_STRANDS}

    # Values strands first, then manifest strands, then the remaining strands.
    for attribute_name in (
        "configuration_values",
        "input_values",
        "output_values",
        "configuration_manifest",
        "input_manifest",
        "output_manifest",
        "children",
    ):
        setattr(self, attribute_name, strands.get(attribute_name))

    # Non-strands.
    self.output_location = kwargs.pop("output_location", None)

    self._calculate_strand_hashes(strands=strands)
    self._finalised = False
    super().__init__(**kwargs)
def test_error_raised_if_datasets_are_missing_from_manifest(self):
    """Test that validating an input manifest that lacks a dataset named in the twine raises an error whose message
    names the missing dataset.
    """
    twine_source = """
        {
            "input_manifest": {
                "datasets": {
                    "cat": {
                        "purpose": "blah"
                    },
                    "dog": {
                        "purpose": "blah"
                    }
                }
            }
        }
    """

    # The manifest provides "dog" and an unrelated dataset, but not "cat".
    manifest_without_cat = {
        "id": "30d2c75c-a7b9-4f16-8627-9c8d5cc04bf4",
        "datasets": {"my-dataset": "gs://my-bucket/my_dataset", "dog": "gs://dog-house/dog"},
    }

    expected_message = "A dataset named 'cat' is expected in the input_manifest but is missing."
    validator = Twine(source=twine_source)

    with self.assertRaises(exceptions.InvalidManifestContents) as context:
        validator.validate_input_manifest(source=manifest_without_cat)

    self.assertEqual(context.exception.message, expected_message)
def test_valid(self):
    """Test that the valid child value validates against the valid twine with children without raising."""
    Twine(source=self.VALID_TWINE_WITH_CHILDREN).validate_children(source=self.VALID_CHILD_VALUE)
def test_invalid_strand(self):
    """Test that validating data against an unrecognised strand name raises ``UnknownStrand``.

    Note: this calls the internal ``_validate_against_schema`` method directly. The current API doesn't allow this
    error to emerge, but this check allows the method to be extended into a generic one.
    """
    twine = Twine(source=VALID_SCHEMA_TWINE)
    configuration = twine._load_json("configuration", source=self.VALID_CONFIGURATION_VALUE)

    with self.assertRaises(exceptions.UnknownStrand):
        twine._validate_against_schema("not_a_strand_name", configuration)
def test_credentials(self):
    """Test that a credential set in the environment keeps its environment value after credentials validation
    (presumably instead of a default given in the twine - confirm against ``VALID_CREDENTIALS_TWINE``).
    """
    environment_overrides = {
        "SECRET_THE_FIRST": "a value",
        "SECRET_THE_SECOND": "another value",
        "SECRET_THE_THIRD": "value",
    }

    twine = Twine(source=self.VALID_CREDENTIALS_TWINE)

    # The assertion has to run while the environment is still patched.
    with mock.patch.dict(os.environ, environment_overrides):
        twine.validate_credentials()
        self.assertEqual(os.environ["SECRET_THE_THIRD"], "value")
def test_valid_values_files(self):
    """Test that configuration values written to a file on disk validate, along with input and output values
    given as JSON strings.
    """
    twine = Twine(source=VALID_SCHEMA_TWINE)

    with TemporaryDirectory() as temporary_directory:
        configuration_path = self._write_json_string_to_file(self.VALID_CONFIGURATION_VALUE, temporary_directory)
        twine.validate_configuration_values(source=configuration_path)
        twine.validate_input_values(source='{"height": 40}')
        twine.validate_output_values(source='{"width": 36}')
def test_valid_children(self):
    """Test that a twine declaring a single child instantiates and exposes exactly one child."""
    twine_with_one_child = """
        {
            "children": [{"key": "gis", "purpose": "The purpose.", "notes": "Some notes.", "filters": "tags:gis"}]
        }
    """
    twine = Twine(source=twine_with_one_child)
    self.assertEqual(len(twine.children), 1)
def test_non_existent_attributes_cannot_be_retrieved(self):
    """Test that accessing an attribute that doesn't exist on an ``Analysis`` raises an ``AttributeError`` rather
    than returning ``None``.

    See https://github.com/octue/octue-sdk-python/issues/45 for the reasoning behind this test.
    """
    empty_twine = Twine(source="{}")
    analysis = Analysis(twine=empty_twine)

    with self.assertRaises(AttributeError):
        getattr(analysis, "furry_purry_cat")
def test_valid_with_extra_values(self):
    """Test that configuration values containing a field beyond the expected ones still validate."""
    configuration_with_extra_field = """
        {
            "n_iterations": 1,
            "another_field": "may or may not be quietly ignored"
        }
    """
    twine = Twine(source=VALID_SCHEMA_TWINE)
    twine.validate_configuration_values(source=configuration_with_extra_field)
def test_extra_key_validation_on_valid_twine(self):
    """Test that a child carrying an extra property validates without error against a non-empty valid twine.

    # TODO review this behaviour - possibly should raise an error but allow for a user specified extra_data property
    """
    child_with_extra_property = """
        [
            {
                "key": "gis",
                "id": "some-id",
                "backend": {
                    "name": "GCPPubSubBackend",
                    "project_name": "my-project"
                },
                "some_extra_property": "should not be a problem if present"
            }
        ]
    """
    Twine(source=self.VALID_TWINE_WITH_CHILDREN).validate_children(source=child_with_extra_property)
def test_extra_key_validation_on_empty_twine(self):
    """Test that validating children values (one of which carries an extra key) against an empty twine raises
    ``InvalidValuesContents``.

    NOTE(review): the test name and the previous docstring suggested that no error is raised, but the assertion
    below expects one - presumably because an empty twine declares no children at all. Confirm the intent.
    """
    children_values_with_extra_data = """
        [
            {"key": "gis", "id": "id", "uri_env_name": "VAR_NAME", "an_extra_key": "not a problem if present"},
            {"key": "some_weird_other_child", "id": "some-other-id", "uri_env_name": "SOME_ENV_VAR_NAME"}
        ]
    """

    with self.assertRaises(exceptions.InvalidValuesContents):
        Twine().validate_children(source=children_values_with_extra_data)
def test_missing_values_files(self):
    """Test that validating each values strand from a path to a file that doesn't exist raises that strand's own
    "file not found" exception.
    """
    twine = Twine(source=VALID_SCHEMA_TWINE)
    missing_path = os.path.join(self.path, "not_a_file.json")

    # Each values validator paired with the exception it should raise for a missing file.
    expectations = (
        (exceptions.ConfigurationValuesFileNotFound, twine.validate_configuration_values),
        (exceptions.InputValuesFileNotFound, twine.validate_input_values),
        (exceptions.OutputValuesFileNotFound, twine.validate_output_values),
    )

    for expected_exception, validate in expectations:
        with self.assertRaises(expected_exception):
            validate(source=missing_path)
def test_missing_manifest_files(self):
    """Test that validating each manifest strand from a path to a file that doesn't exist raises that strand's own
    "file not found" exception.
    """
    twine = Twine(source=self.VALID_MANIFEST_STRAND)
    missing_manifest_path = os.path.join(self.path, "not_a_file.json")

    # Each manifest validator paired with the exception it should raise for a missing file.
    expectations = (
        (exceptions.ConfigurationManifestFileNotFound, twine.validate_configuration_manifest),
        (exceptions.InputManifestFileNotFound, twine.validate_input_manifest),
        (exceptions.OutputManifestFileNotFound, twine.validate_output_manifest),
    )

    for expected_exception, validate in expectations:
        with self.assertRaises(expected_exception):
            validate(source=missing_manifest_path)
def __init__(
    self,
    app_src,
    twine="twine.json",
    configuration_values=None,
    configuration_manifest=None,
    children=None,
    output_location=None,
    project_name=None,
    service_id=None,
):
    """Store the app source and options, instantiate the twine, and validate the configuration against it.

    :param any app_src: stored as ``app_source``
    :param twined.Twine|any twine: a ``Twine`` instance, or a source passed to ``Twine(source=...)`` to build one
    :param any configuration_values: configuration values to validate against the twine
    :param any configuration_manifest: configuration manifest to validate against the twine
    :param any children: stored as ``children``
    :param str|None output_location: if given, must be a Google Cloud Storage path of the form ``gs://...``
    :param any project_name: stored as ``_project_name``
    :param any service_id: stored as ``service_id``
    :raise exceptions.InvalidInputException: if an output location is given that isn't a cloud storage path
    :return None:
    """
    self.app_source = app_src
    self.children = children

    # An output location is optional, but when one is given it has to look like a cloud storage path.
    if output_location:
        is_cloud_path = re.match(r"^gs://[a-z\d][a-z\d_./-]*$", output_location)

        if not is_cloud_path:
            raise exceptions.InvalidInputException(
                "The output location must be a Google Cloud Storage path e.g. 'gs://bucket-name/output_directory'."
            )

    self.output_location = output_location

    # Use the twine as given if it's already instantiated; otherwise build it from the source.
    self.twine = twine if isinstance(twine, Twine) else Twine(source=twine)
    logger.debug("Parsed twine with strands %r", self.twine.available_strands)

    # Validate and initialise the configuration data.
    self.configuration = self.twine.validate(
        configuration_values=configuration_values,
        configuration_manifest=configuration_manifest,
        cls=CLASS_MAP,
    )
    logger.debug("Configuration validated.")

    self.service_id = service_id
    self._project_name = project_name
def test_invalid_children_no_key(self):
    """Test that instantiating a twine whose child lacks the required `key` field raises ``InvalidTwine``."""
    twine_with_keyless_child = """
        {
            "children": [{"purpose": "The purpose.", "notes": "Here are some notes.", "filters": "tags:gis"}]
        }
    """

    with self.assertRaises(exceptions.InvalidTwine):
        Twine(source=twine_with_keyless_child)
def test_error_raised_if_multiple_datasets_have_same_name(self):
    """Test that validating an input manifest whose JSON repeats a dataset name raises a ``KeyError``."""
    twine_source = """
        {
            "input_manifest": {
                "datasets": {
                    "met_mast_data": {
                        "purpose": "A dataset containing meteorological mast data"
                    }
                }
            }
        }
    """

    # The "met_mast_data" key appears twice in the datasets object (with different dataset IDs).
    manifest_with_duplicated_name = """
        {
            "id": "8ead7669-8162-4f64-8cd5-4abe92509e17",
            "datasets": {
                "met_mast_data": {
                    "id": "7ead7669-8162-4f64-8cd5-4abe92509e19",
                    "name": "met_mast_data",
                    "tags": {},
                    "labels": [],
                    "files": []
                },
                "met_mast_data": {
                    "id": "7ead7669-8162-4f64-8cd5-4abe92509e18",
                    "name": "met_mast_data",
                    "tags": {},
                    "labels": [],
                    "files": []
                }
            }
        }
    """

    validator = Twine(source=twine_source)

    with self.assertRaises(KeyError):
        validator.validate_input_manifest(source=manifest_with_duplicated_name)
def test_invalid_env_name(self):
    """Test that a child whose `uri_env_name` isn't in ALL_CAPS_SNAKE_CASE fails children validation."""
    child_with_badly_named_variable = """
        [
            {
                "key": "gis",
                "id": "some-id",
                "uri_env_name": "an environment variable not in CAPS_CASE is invalid per the credentials spec"
            }
        ]
    """
    empty_twine = Twine()

    with self.assertRaises(exceptions.InvalidValuesContents):
        empty_twine.validate_children(source=child_with_badly_named_variable)
def test_fails_on_dict(self):
    """Test that instantiating a twine whose `credentials` strand is a dict rather than an array raises
    ``InvalidTwine``.
    """
    twine_with_credentials_as_dict = """
        {
            "credentials": {
                "name": "MY_API_SECRET_KEY",
                "purpose": "Token for accessing a 3rd party API service"
            }
        }
    """

    with self.assertRaises(exceptions.InvalidTwine):
        Twine(source=twine_with_credentials_as_dict)
def test_fails_on_name_whitespace(self):
    """Test that instantiating a twine with a credential whose name contains spaces raises ``InvalidTwine``."""
    twine_with_spaces_in_credential_name = """
        {
            "credentials": [
                {
                    "name": "MY NAME SHOULD NOT HAVE WHITESPACE",
                    "purpose": "Token for accessing a 3rd party API service"
                }
            ]
        }
    """

    with self.assertRaises(exceptions.InvalidTwine):
        Twine(source=twine_with_spaces_in_credential_name)
def test_fails_on_no_name(self):
    """Test that instantiating a twine with a credential lacking the `name` field raises ``InvalidTwine``."""
    twine_with_nameless_credential = """
        {
            "credentials": [
                {
                    "purpose": "credentials without a name should be invalid"
                }
            ]
        }
    """

    with self.assertRaises(exceptions.InvalidTwine):
        Twine(source=twine_with_nameless_credential)
def test_fails_on_lowercase_name(self):
    """Test that instantiating a twine with a credential whose name is in lowercase raises ``InvalidTwine``."""
    twine_with_lowercase_credential_name = """
        {
            "credentials": [
                {
                    "name": "my_secrets_should_be_uppercase",
                    "purpose": "Token for accessing a 3rd party API service"
                }
            ]
        }
    """

    with self.assertRaises(exceptions.InvalidTwine):
        Twine(source=twine_with_lowercase_credential_name)
def start(service_config, timeout, rm):
    """Start an Octue service or digital twin locally as a child so it can be asked questions by other Octue services."""
    # NOTE(review): the docstring is kept verbatim as it presumably doubles as the CLI help text - confirm.
    #
    # service_config: passed to `load_service_and_app_configuration` to get the service and app configurations
    # timeout: forwarded to `Service.serve` as `timeout`
    # rm: forwarded to `Service.serve` as `delete_topic_and_subscription_on_exit`
    service_configuration, app_configuration = load_service_and_app_configuration(service_config)

    runner = Runner(
        app_src=service_configuration.app_source_path,
        twine=Twine(source=service_configuration.twine_path),
        configuration_values=app_configuration.configuration_values,
        configuration_manifest=app_configuration.configuration_manifest,
        children=app_configuration.children,
        output_location=app_configuration.output_location,
        service_id=service_configuration.service_id,
    )

    # Fix the logging options now so the service only has to supply the per-question arguments.
    run_function = functools.partial(
        runner.run,
        analysis_log_level=global_cli_context["log_level"],
        analysis_log_handler=global_cli_context["log_handler"],
    )

    backend_details = (app_configuration.configuration_values or {}).get("backend")

    if not backend_details:
        # If no backend details are provided, use Google Pub/Sub with the default project.
        _, project_name = auth.default()
        backend = service_backends.get_backend()(project_name=project_name)
    else:
        # Work on a copy so removing the name doesn't alter the app's configuration values.
        backend_options = copy.deepcopy(backend_details)
        backend_name = backend_options.pop("name")
        backend_class = service_backends.get_backend(backend_name)
        backend = backend_class(**backend_options)

    service = Service(
        service_id=service_configuration.service_id,
        backend=backend,
        run_function=run_function,
    )

    service.serve(timeout=timeout, delete_topic_and_subscription_on_exit=rm)
def test_missing_optional_datasets_do_not_raise_error(self):
    """Test that a dataset marked as optional in the twine can be absent from the input manifest without an error
    being raised.
    """
    twine_source = """
        {
            "input_manifest": {
                "datasets": {
                    "cat": {
                        "purpose": "blah",
                        "optional": true
                    },
                    "dog": {
                        "purpose": "blah"
                    }
                }
            }
        }
    """

    # Only the non-optional "dog" dataset is provided.
    manifest_without_optional_dataset = {
        "id": "30d2c75c-a7b9-4f16-8627-9c8d5cc04bf4",
        "datasets": {"dog": "gs://dog-house/dog"},
    }

    validator = Twine(source=twine_source)
    validator.validate_input_manifest(source=manifest_without_optional_dataset)
def test_strand_not_found(self):
    """Test that validating output values against a twine with no output values strand raises ``StrandNotFound``."""
    twine_with_configuration_schema_only = """
        {
            "configuration_values_schema": {
                "$schema": "https://json-schema.org/draft/2020-12/schema",
                "title": "The example configuration form",
                "description": "The configuration strand of an example twine",
                "type": "object",
                "properties": {
                    "n_iterations": {
                        "description": "An example of an integer configuration variable, called 'n_iterations'.",
                        "type": "integer",
                        "minimum": 1,
                        "maximum": 10,
                        "default": 5
                    }
                }
            }
        }
    """

    with self.assertRaises(exceptions.StrandNotFound):
        twine = Twine(source=twine_with_configuration_schema_only)
        twine.validate_output_values(source="{}")
def run(service_config, input_dir, output_file, output_manifest_file, monitor_messages_file):
    """Run an analysis on the given input data using an Octue service or digital twin locally. The output values are
    printed to `stdout`. If an output manifest is produced, it will be saved locally (see the `--output-manifest-file`
    option).
    """
    # NOTE(review): the docstring is kept verbatim as it presumably doubles as the CLI help text - confirm.
    #
    # service_config: passed to `load_service_and_app_configuration` to get the service and app configurations
    # input_dir: directory searched for the input values file and the input manifest file
    # output_file: if truthy (and there are output values), the path the output values are written to as JSON
    # output_manifest_file: path to write any output manifest to; if falsy, `output_manifest_<analysis id>.json` is used
    # monitor_messages_file: if truthy, monitor messages are handed to `_add_monitor_message_to_file` with this path
    #
    # Always returns 0.

    def _ensure_parent_directory_exists(path):
        """Create the directory containing `path` if needed. A bare filename has no directory part, so there's
        nothing to create for it (and `os.makedirs("")` would raise).
        """
        directory = os.path.dirname(path)

        if directory:
            os.makedirs(directory, exist_ok=True)

    service_configuration, app_configuration = load_service_and_app_configuration(service_config)

    # Only provide the input values and manifest if their files exist in the input directory.
    input_values_path = os.path.join(input_dir, VALUES_FILENAME)
    input_manifest_path = os.path.join(input_dir, MANIFEST_FILENAME)
    input_values = input_values_path if os.path.exists(input_values_path) else None
    input_manifest = input_manifest_path if os.path.exists(input_manifest_path) else None

    runner = Runner(
        app_src=service_configuration.app_source_path,
        twine=Twine(source=service_configuration.twine_path),
        configuration_values=app_configuration.configuration_values,
        configuration_manifest=app_configuration.configuration_manifest,
        children=app_configuration.children,
        output_location=app_configuration.output_location,
    )

    if monitor_messages_file:
        _ensure_parent_directory_exists(monitor_messages_file)

        def monitor_message_handler(message):
            return _add_monitor_message_to_file(monitor_messages_file, message)

    else:
        monitor_message_handler = None

    analysis = runner.run(
        analysis_id=global_cli_context["analysis_id"],
        input_values=input_values,
        input_manifest=input_manifest,
        analysis_log_level=global_cli_context["log_level"],
        analysis_log_handler=global_cli_context["log_handler"],
        handle_monitor_message=monitor_message_handler,
    )

    click.echo(json.dumps(analysis.output_values))

    if analysis.output_values and output_file:
        _ensure_parent_directory_exists(output_file)

        with open(output_file, "w") as f:
            json.dump(analysis.output_values, f, cls=OctueJSONEncoder, indent=4)

    if analysis.output_manifest:
        # Resolve the fallback filename *before* touching the path so a missing `output_manifest_file` doesn't
        # cause `os.path.dirname(None)` to raise a `TypeError`.
        output_manifest_path = output_manifest_file or f"output_manifest_{analysis.id}.json"
        _ensure_parent_directory_exists(output_manifest_path)

        with open(output_manifest_path, "w") as f:
            json.dump(analysis.output_manifest.to_primitive(), f, cls=OctueJSONEncoder, indent=4)

    return 0
def test_valid_manifest_files(self):
    """Test that valid configuration, input, and output manifests validate against a twine with manifest strands.

    Despite the test's name, the manifests are given to the validators as JSON strings, not as paths to files. The
    test passes if none of the three validation calls raises.
    """
    # A configuration manifest with one dataset containing two files.
    valid_configuration_manifest = """
        {
            "id": "3ead7669-8162-4f64-8cd5-4abe92509e17",
            "datasets": {
                "configuration_files_data": {
                    "id": "34ad7669-8162-4f64-8cd5-4abe92509e17",
                    "name": "configuration_files_data",
                    "tags": {},
                    "labels": ["the", "config", "labels"],
                    "files": [
                        {
                            "path": "configuration/datasets/7ead7669/file_1.csv",
                            "cluster": 0,
                            "sequence": 0,
                            "extension": "csv",
                            "tags": {},
                            "labels": [],
                            "posix_timestamp": 0,
                            "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86",
                            "last_modified": "2019-02-28T22:40:30.533005Z",
                            "name": "file_1.csv",
                            "size_bytes": 59684813,
                            "sha-512/256": "somesha"
                        },
                        {
                            "path": "configuration/datasets/7ead7669/file_2.csv",
                            "cluster": 0,
                            "sequence": 1,
                            "extension": "csv",
                            "tags": {},
                            "labels": [],
                            "posix_timestamp": 0,
                            "id": "bbff07bc-7c19-4ed5-be6d-a6546eae8e45",
                            "last_modified": "2019-02-28T22:40:40.633001Z",
                            "name": "file_2.csv",
                            "size_bytes": 59684813,
                            "sha-512/256": "someothersha"
                        }
                    ]
                }
            }
        }
    """

    # An input manifest with one fully-specified dataset and one dataset given only as a cloud path.
    valid_input_manifest = """
        {
            "id": "8ead7669-8162-4f64-8cd5-4abe92509e17",
            "datasets": {
                "met_mast_data": {
                    "id": "7ead7669-8162-4f64-8cd5-4abe92509e17",
                    "name": "met_mast_data",
                    "tags": {},
                    "labels": ["met", "mast", "wind"],
                    "files": [
                        {
                            "path": "input/datasets/7ead7669/file_1.csv",
                            "cluster": 0,
                            "sequence": 0,
                            "extension": "csv",
                            "tags": {},
                            "labels": [],
                            "posix_timestamp": 0,
                            "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86",
                            "last_modified": "2019-02-28T22:40:30.533005Z",
                            "name": "file_1.csv",
                            "size_bytes": 59684813,
                            "sha-512/256": "somesha"
                        },
                        {
                            "path": "input/datasets/7ead7669/file_2.csv",
                            "cluster": 0,
                            "sequence": 1,
                            "extension": "csv",
                            "tags": {},
                            "labels": [],
                            "posix_timestamp": 0,
                            "id": "bbff07bc-7c19-4ed5-be6d-a6546eae8e45",
                            "last_modified": "2019-02-28T22:40:40.633001Z",
                            "name": "file_2.csv",
                            "size_bytes": 59684813,
                            "sha-512/256": "someothersha"
                        }
                    ]
                },
                "scada_data": "gs://my-bucket/scada-data"
            }
        }
    """

    # An output manifest with one dataset containing two files.
    valid_output_manifest = """
        {
            "id": "2ead7669-8162-4f64-8cd5-4abe92509e17",
            "datasets": {
                "output_files_data": {
                    "id": "1ead7669-8162-4f64-8cd5-4abe92509e17",
                    "name": "output_files_data",
                    "tags": {},
                    "labels": ["the", "output", "labels"],
                    "files": [
                        {
                            "path": "input/datasets/7ead7669/file_1.csv",
                            "cluster": 0,
                            "sequence": 0,
                            "extension": "csv",
                            "tags": {},
                            "labels": [],
                            "posix_timestamp": 0,
                            "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86",
                            "last_modified": "2019-02-28T22:40:30.533005Z",
                            "name": "file_1.csv",
                            "size_bytes": 59684813,
                            "sha-512/256": "somesha"
                        },
                        {
                            "path": "input/datasets/7ead7669/file_2.csv",
                            "cluster": 0,
                            "sequence": 1,
                            "extension": "csv",
                            "tags": {},
                            "labels": [],
                            "posix_timestamp": 0,
                            "id": "bbff07bc-7c19-4ed5-be6d-a6546eae8e45",
                            "last_modified": "2019-02-28T22:40:40.633001Z",
                            "name": "file_2.csv",
                            "size_bytes": 59684813,
                            "sha-512/256": "someothersha"
                        }
                    ]
                }
            }
        }
    """

    # Each manifest is validated by the validator for its own strand.
    twine = Twine(source=self.VALID_MANIFEST_STRAND)
    twine.validate_configuration_manifest(source=valid_configuration_manifest)
    twine.validate_input_manifest(source=valid_input_manifest)
    twine.validate_output_manifest(source=valid_output_manifest)
class Analysis(Identifiable, Serialisable, Labelable, Taggable):
    """A class representing a scientific or computational analysis. It holds references to all configuration, input,
    and output data, logs, connections to child services, credentials, etc. It's essentially the "Internal API" for
    your service - a single point of contact where you can get or update anything you need.

    An ``Analysis`` instance is automatically provided to the app in an Octue service when a question is received. Its
    attributes include every strand that can be added to a ``Twine``, although only the strands specified in the
    service's twine will be non-``None``. Incoming data is validated before it's added to the analysis.

    All input and configuration attributes are hashed using a `BLAKE3 hash <https://github.com/BLAKE3-team/BLAKE3>`_ so
    the inputs and configuration that produced a given output in your app can always be verified. These hashes exist on
    the following attributes:

    - ``input_values_hash``
    - ``input_manifest_hash``
    - ``configuration_values_hash``
    - ``configuration_manifest_hash``

    If a strand is ``None``, so will its corresponding hash attribute be. The hash of a datafile is the hash of its
    file, while the hash of a manifest or dataset is the cumulative hash of the files it refers to.

    :param twined.Twine|dict|str twine: the twine, dictionary defining a twine, or path to "twine.json" file defining the service's data interface
    :param callable|None handle_monitor_message: an optional function for sending monitor messages to the parent that requested the analysis
    :param any configuration_values: the configuration values for the analysis - this can be expressed as a python primitive (e.g. dict), a path to a JSON file, or a JSON string.
    :param octue.resources.manifest.Manifest configuration_manifest: a manifest of configuration datasets for the analysis if required
    :param any input_values: the input values for the analysis - this can be expressed as a python primitive (e.g. dict), a path to a JSON file, or a JSON string.
    :param octue.resources.manifest.Manifest input_manifest: a manifest of input datasets for the analysis if required
    :param any output_values: any output values the analysis produces
    :param octue.resources.manifest.Manifest output_manifest: a manifest of output dataset from the analysis if it produces any
    :param dict children: a mapping of string key to ``Child`` instance for all the children used by the service
    :param str id: Optional UUID for the analysis
    :return None:
    """

    def __init__(self, twine, handle_monitor_message=None, **kwargs):
        if isinstance(twine, Twine):
            self.twine = twine
        else:
            self.twine = Twine(source=twine)

        self._handle_monitor_message = handle_monitor_message

        # Pop every known strand (defaulting to `None`) so only non-strand arguments reach the parent classes.
        strand_kwargs = {name: kwargs.pop(name, None) for name in ALL_STRANDS}

        # Values strands.
        self.configuration_values = strand_kwargs.get("configuration_values")
        self.input_values = strand_kwargs.get("input_values")
        self.output_values = strand_kwargs.get("output_values")

        # Manifest strands.
        self.configuration_manifest = strand_kwargs.get("configuration_manifest")
        self.input_manifest = strand_kwargs.get("input_manifest")
        self.output_manifest = strand_kwargs.get("output_manifest")

        # Other strands.
        self.children = strand_kwargs.get("children")

        # Non-strands.
        self.output_location = kwargs.pop("output_location", None)

        self._calculate_strand_hashes(strands=strand_kwargs)
        self._finalised = False
        super().__init__(**kwargs)

    @property
    def finalised(self):
        """Check whether the analysis has been finalised (i.e. whether its outputs have been validated and, if an output
        manifest is produced, its datasets uploaded).

        :return bool:
        """
        return self._finalised

    def send_monitor_message(self, data):
        """Send a monitor message to the parent that requested the analysis. The message is validated against the
        twine first. If no handler was given at instantiation, a warning is logged and the message is dropped.

        :param any data: any JSON-compatible data structure
        :raise InvalidMonitorMessage: if the message fails validation against the twine
        :return None:
        """
        try:
            self.twine.validate_monitor_message(source=data)
        except twined.exceptions.InvalidValuesContents as e:
            raise InvalidMonitorMessage(e) from e

        if self._handle_monitor_message is None:
            logger.warning("Attempted to send a monitor message but no handler is specified.")
            return

        self._handle_monitor_message(data)

    def finalise(self, upload_output_datasets_to=None):
        """Validate the output values and output manifest, optionally uploading the output manifest's datasets to the
        cloud and updating its dataset paths to signed URLs. If there's no output manifest, nothing is uploaded.

        :param str|None upload_output_datasets_to: if provided, upload any output datasets to this cloud directory and update the output manifest with their locations
        :return None:
        """
        serialised_strands = {"output_values": None, "output_manifest": None}

        if self.output_values:
            serialised_strands["output_values"] = json.dumps(self.output_values, cls=OctueJSONEncoder)

        if self.output_manifest:
            serialised_strands["output_manifest"] = self.output_manifest.to_primitive()

        self.twine.validate(**serialised_strands)
        self._finalised = True
        logger.info("Validated output values and output manifest against the twine.")

        # The `output_manifest` attribute is always set by `__init__` (to `None` if there's no manifest), so its
        # value has to be checked rather than its existence - otherwise a missing manifest would cause an
        # `AttributeError` below when an upload location is given.
        if not upload_output_datasets_to or getattr(self, "output_manifest", None) is None:
            return

        for name, dataset in self.output_manifest.datasets.items():
            dataset.upload(cloud_path=storage.path.join(upload_output_datasets_to, name))

        self.output_manifest.use_signed_urls_for_datasets()
        logger.info("Uploaded output datasets to %r.", upload_output_datasets_to)

    def _calculate_strand_hashes(self, strands):
        """Calculate the hashes of the strands specified in the HASH_FUNCTIONS constant, storing each one on the
        ``<strand_name>_hash`` attribute. The hash attribute of a strand whose data is ``None`` is set to ``None``.

        :param dict strands: strand names mapped to strand data
        :return None:
        """
        for strand_name, strand_data in strands.items():
            if strand_name not in HASH_FUNCTIONS:
                continue

            strand_hash_name = f"{strand_name}_hash"

            if strand_data is not None:
                setattr(self, strand_hash_name, HASH_FUNCTIONS[strand_name](strand_data))
            else:
                setattr(self, strand_hash_name, None)
def test_backend_cannot_be_empty(self):
    """Test that a child whose `backend` field is an empty object fails children validation."""
    child_with_empty_backend = """[{"key": "gis", "id": "some-id", "backend": {}}]"""
    empty_twine = Twine()

    with self.assertRaises(exceptions.InvalidValuesContents):
        empty_twine.validate_children(source=child_with_empty_backend)
def test_extra_children(self):
    """Test that validating a non-empty children input against a twine with no children raises an error."""
    twine_without_children = Twine()

    with self.assertRaises(exceptions.InvalidValuesContents):
        twine_without_children.validate_children(source=self.VALID_CHILD_VALUE)
def test_missing_children(self):
    """Test that validating an empty children input against a twine that has children raises an error."""
    no_children = []

    with self.assertRaises(exceptions.InvalidValuesContents):
        twine_with_children = Twine(source=self.VALID_TWINE_WITH_CHILDREN)
        twine_with_children.validate_children(source=no_children)