Example #1
0
def test_protocol_graph_execution(calculation_backend, compute_resources):
    """Execute a two protocol chain and check the downstream protocol
    receives the upstream protocol's output, both via the serialized
    results file and (for local execution) in memory."""

    if calculation_backend is not None:
        calculation_backend.start()

    # Chain two dummy protocols: the sink consumes the source's output.
    source_protocol = DummyInputOutputProtocol("protocol_a")
    source_protocol.input_value = 1
    sink_protocol = DummyInputOutputProtocol("protocol_b")
    sink_protocol.input_value = ProtocolPath("output_value", source_protocol.id)

    graph = ProtocolGraph()
    graph.add_protocols(source_protocol, sink_protocol)

    with tempfile.TemporaryDirectory() as directory:

        results = graph.execute(directory, calculation_backend,
                                compute_resources)

        final_result = results[sink_protocol.id]

        if calculation_backend is not None:
            # Backend execution yields futures which must be resolved.
            final_result = final_result.result()

        # The second element of the result is the path to the serialized
        # protocol outputs.
        with open(final_result[1]) as file:
            results_b = json.load(file, cls=TypedJSONDecoder)

    assert results_b[".output_value"] == source_protocol.input_value

    if compute_resources is not None:
        # Local (in-process) execution also updates the protocol objects.
        assert sink_protocol.output_value == source_protocol.input_value

    if calculation_backend is not None:
        calculation_backend.stop()
Example #2
0
def test_protocol_group_merging():
    """Check that two identically structured (but differently prefixed)
    sets of grouped protocols merge into a single copy whose schema is
    left untouched by the merge."""

    def build_protocols(prefix):
        # Topology under construction:
        #     .-------------------.
        #     |          / i - j -|- b
        # a - | g - h - |         |
        #     |          \ k - l -|- c
        #     .-------------------.
        root = DummyInputOutputProtocol(prefix + "protocol_a")
        root.input_value = 1

        fork_protocols = build_fork(prefix)
        fork_protocols[0].input_value = ProtocolPath("output_value", root.id)

        group = ProtocolGroup(prefix + "protocol_group")
        group.add_protocols(*fork_protocols)

        leaf_b = DummyInputOutputProtocol(prefix + "protocol_b")
        leaf_b.input_value = ProtocolPath(
            "output_value", group.id, "protocol_j")
        leaf_c = DummyInputOutputProtocol(prefix + "protocol_c")
        leaf_c.input_value = ProtocolPath(
            "output_value", group.id, "protocol_l")

        return [root, group, leaf_b, leaf_c]

    protocols_a = build_protocols("a_")
    protocols_b = build_protocols("b_")

    graph = ProtocolGraph()
    graph.add_protocols(*protocols_a)
    graph.add_protocols(*protocols_b)

    # The second, identical set should have merged into the first.
    assert len(graph.protocols) == len(protocols_a)
    assert "a_protocol_group" in graph.protocols

    original_group = protocols_a[1]
    merged_group = graph.protocols["a_protocol_group"]

    # Merging must not have altered the surviving group's schema.
    assert original_group.schema.json() == merged_group.schema.json()
Example #3
0
def test_protocol_group_resume():
    """A test that protocol groups can recover after being killed
    (e.g. by a worker being killed due to hitting a wallclock limit)
    """

    compute_resources = ComputeResources()

    def chain_protocols(names):
        # Build a linear chain of dummy protocols where each protocol
        # consumes the previous one's output; the head gets a constant.
        chain = []
        for name in names:
            protocol = DummyInputOutputProtocol(name)
            if chain:
                protocol.input_value = ProtocolPath(
                    "output_value", chain[-1].id)
            else:
                protocol.input_value = 1
            chain.append(protocol)
        return chain

    # Fake a protocol group which executes the first
    # two protocols and then 'gets killed'.
    partial_group = ProtocolGroup("group_a")
    partial_group.add_protocols(*chain_protocols(["protocol_a",
                                                  "protocol_b"]))

    graph = ProtocolGraph()
    graph.add_protocols(partial_group)
    graph.execute("graph_a", compute_resources=compute_resources)

    # Remove the output file so it appears that the protocol group
    # had not completed.
    os.unlink(
        os.path.join("graph_a", partial_group.id,
                     f"{partial_group.id}_output.json"))

    # Build the 'full' group with the last two protocols which
    # 'had not been executed' after the group was 'killed'
    full_group = ProtocolGroup("group_a")
    full_group.add_protocols(*chain_protocols(
        ["protocol_a", "protocol_b", "protocol_c", "protocol_d"]))

    graph = ProtocolGraph()
    graph.add_protocols(full_group)
    graph.execute("graph_a", compute_resources=compute_resources)

    # Every output should have been populated on resume.
    assert all(x != UNDEFINED for x in full_group.outputs.values())
Example #4
0
def test_protocol_graph_simple(protocols_a, protocols_b):
    """Check the merging behaviour of ``ProtocolGraph`` when identical
    protocol sets are added sequentially versus in a single call."""

    def dependants_count(graph, reduce):
        # Size of the dependants graph built from the graph's protocols.
        built = graph._build_dependants_graph(
            graph.protocols, False, apply_reduction=reduce)
        return len(built)

    # Make sure that the graph can merge simple protocols
    # when they are added one after the other.
    graph = ProtocolGraph()
    graph.add_protocols(*protocols_a)

    assert len(graph.protocols) == len(protocols_a)
    assert dependants_count(graph, True) == len(protocols_a)
    n_root_protocols = len(graph.root_protocols)

    graph.add_protocols(*protocols_b)

    # The second, identical set should have merged away entirely.
    assert len(graph.protocols) == len(protocols_a)
    assert dependants_count(graph, False) == len(protocols_a)
    assert len(graph.root_protocols) == n_root_protocols

    # Currently the graph shouldn't merge with an
    # addition
    graph = ProtocolGraph()
    graph.add_protocols(*protocols_a, *protocols_b)

    expected_total = len(protocols_a) + len(protocols_b)
    assert len(graph.protocols) == expected_total
    assert dependants_count(graph, False) == expected_total
    assert len(graph.root_protocols) == 2 * n_root_protocols
Example #5
0
    def __init__(self):
        """Creates a new, empty workflow graph."""

        super(WorkflowGraph, self).__init__()

        # Maps a workflow's uuid to the workflow awaiting execution.
        self._workflows_to_execute = {}
        # The underlying graph of (possibly merged) protocols.
        self._protocol_graph = ProtocolGraph()
Example #6
0
class WorkflowGraph:
    """A hierarchical structure for storing and submitting the workflows
    which will estimate a set of physical properties.
    """
    @property
    def protocols(self):
        """dict of str and Protocol: The protocols in this graph."""
        return self._protocol_graph.protocols

    @property
    def root_protocols(self):
        """list of str: The ids of the protocols in the group which do not
        take input from the other grouped protocols."""
        return self._protocol_graph.root_protocols

    def __init__(self):
        """Creates a new, empty workflow graph."""

        super(WorkflowGraph, self).__init__()

        # Maps a workflow's uuid to the workflow awaiting execution.
        self._workflows_to_execute = {}
        # The underlying graph of (possibly merged) protocols.
        self._protocol_graph = ProtocolGraph()

    def add_workflows(self, *workflows):
        """Insert a set of workflows into the workflow graph.

        Parameters
        ----------
        workflows: Workflow
            The workflows to insert.

        Raises
        ------
        ValueError
            If two of the given workflows share a uuid, or a workflow
            with the same uuid is already present in the graph.
        """

        workflow_uuids = [x.uuid for x in workflows]

        # Reject duplicate uuids within the incoming set.
        if len(set(workflow_uuids)) != len(workflow_uuids):
            raise ValueError("A number of workflows have the same uuid.")

        existing_uuids = [
            x for x in workflow_uuids if x in self._workflows_to_execute
        ]

        if len(existing_uuids) > 0:

            raise ValueError(
                f"Workflows with the uuids {existing_uuids} are already in the graph."
            )

        original_protocols = []

        for workflow in workflows:

            original_protocols.extend(workflow.protocols.values())
            self._workflows_to_execute[workflow.uuid] = workflow

        # Add the workflow protocols to the graph.
        merged_protocol_ids = self._protocol_graph.add_protocols(
            *original_protocols, allow_external_dependencies=False)

        # Update the workflow to use the possibly merged protocols
        for original_id, new_id in merged_protocol_ids.items():

            original_protocol = original_id
            new_protocol = new_id

            for workflow in workflows:

                # Skip workflows which this protocol does not belong to —
                # ownership is determined by the uuid embedded in the
                # protocol's id.
                if (retrieve_uuid(original_protocol if isinstance(
                        original_protocol, str) else original_protocol.id) !=
                        workflow.uuid):
                    continue

                if original_protocol in workflow.protocols:
                    # Only retrieve the actual protocol if it isn't nested in
                    # a group.
                    original_protocol = workflow.protocols[original_id]
                    new_protocol = self._protocol_graph.protocols[new_id]

                workflow.replace_protocol(original_protocol, new_protocol,
                                          True)

    def execute(self,
                root_directory="",
                calculation_backend=None,
                compute_resources=None):
        """Executes the workflow graph.

        Parameters
        ----------
        root_directory: str
            The directory to execute the graph in.
        calculation_backend: CalculationBackend, optional.
            The backend to execute the graph on. This parameter
            is mutually exclusive with `compute_resources`.
        compute_resources: ComputeResources, optional.
            The compute resources to run using. If None and no
            `calculation_backend` is specified, the workflow will
            be executed on a single CPU thread. This parameter
            is mutually exclusive with `calculation_backend`.

        Returns
        -------
        list of WorkflowResult or list of Future of WorkflowResult:
            The results of executing the graph. If a `calculation_backend`
            is specified, these results will be wrapped in a `Future`.
        """
        # Default to a single local thread when neither execution target
        # was provided.
        if calculation_backend is None and compute_resources is None:
            compute_resources = ComputeResources(number_of_threads=1)

        protocol_outputs = self._protocol_graph.execute(
            root_directory, calculation_backend, compute_resources)

        value_futures = []

        for workflow_id in self._workflows_to_execute:

            workflow = self._workflows_to_execute[workflow_id]
            data_futures = []

            # Make sure we keep track of all of the futures which we
            # will use to populate things such as a final property value
            # or gradient keys.
            if workflow.final_value_source != UNDEFINED:

                protocol_id = workflow.final_value_source.start_protocol
                data_futures.append(protocol_outputs[protocol_id])

            if workflow.gradients_sources != UNDEFINED:

                for gradient_source in workflow.gradients_sources:

                    protocol_id = gradient_source.start_protocol
                    data_futures.append(protocol_outputs[protocol_id])

            if workflow.outputs_to_store != UNDEFINED:

                for output_label, output_to_store in workflow.outputs_to_store.items(
                ):

                    # Only outputs whose attribute values reference a
                    # protocol output need that protocol's result.
                    for attribute_name in output_to_store.get_attributes(
                            StorageAttribute):

                        attribute_value = getattr(output_to_store,
                                                  attribute_name)

                        if not isinstance(attribute_value, ProtocolPath):
                            continue

                        data_futures.append(
                            protocol_outputs[attribute_value.start_protocol])

            # Fall back to depending on every protocol output when no
            # specific outputs were requested.
            if len(data_futures) == 0:
                data_futures = [*protocol_outputs.values()]

            if calculation_backend is None:

                # Local execution: gather results synchronously.
                value_futures.append(
                    WorkflowGraph._gather_results(
                        root_directory,
                        workflow.uuid,
                        workflow.final_value_source,
                        workflow.gradients_sources,
                        workflow.outputs_to_store,
                        *data_futures,
                    ))

            else:

                # Backend execution: submit the gather step as a task so
                # it runs once the protocol futures resolve.
                value_futures.append(
                    calculation_backend.submit_task(
                        WorkflowGraph._gather_results,
                        root_directory,
                        workflow.uuid,
                        workflow.final_value_source,
                        workflow.gradients_sources,
                        workflow.outputs_to_store,
                        *data_futures,
                    ))

        return value_futures

    @staticmethod
    def _gather_results(
        directory,
        workflow_id,
        value_reference,
        gradient_sources,
        outputs_to_store,
        *protocol_result_paths,
        **_,
    ):
        """Gather the data associated with the workflows in this graph.

        Parameters
        ----------
        directory: str
            The directory to store any working files in.
        workflow_id: str
            The id of the workflow associated with this result.
        value_reference: ProtocolPath, optional
            A reference to which property in the output dictionary is the actual value.
        gradient_sources: list of ProtocolPath
            A list of references to those entries in the output dictionaries which correspond
            to parameter gradients.
        outputs_to_store: dict of str and WorkflowOutputToStore
            A list of references to data which should be stored on the storage backend.
        protocol_result_paths: tuple of str and str
            Pairs of (protocol id, path to that protocol's serialized results).

        Returns
        -------
        CalculationLayerResult, optional
            The result of attempting to estimate this property from a workflow graph. `None`
            will be returned if the target uncertainty is set but not met.
        """

        return_object = WorkflowResult()
        return_object.workflow_id = workflow_id

        try:

            results_by_id = {}

            for protocol_id, protocol_result_path in protocol_result_paths:

                with open(protocol_result_path, "r") as file:
                    protocol_results = json.load(file, cls=TypedJSONDecoder)

                # Make sure none of the protocols failed and we actually have a value
                # and uncertainty.
                if isinstance(protocol_results, EvaluatorException):

                    return_object.exceptions.append(protocol_results)
                    return return_object

                # Store the protocol results in a dictionary, with keys of the
                # path to the original protocol output.
                for protocol_path, output_value in protocol_results.items():

                    protocol_path = ProtocolPath.from_string(protocol_path)

                    # Make the stored path fully qualified so it matches the
                    # workflow's own `ProtocolPath` references.
                    if (protocol_path.start_protocol is None
                            or protocol_path.start_protocol != protocol_id):
                        protocol_path.prepend_protocol_id(protocol_id)

                    results_by_id[protocol_path] = output_value

            # NOTE(review): `execute` filters on `!= UNDEFINED` but this
            # checks `is not None` — confirm `value_reference` can never be
            # UNDEFINED here (an UNDEFINED key lookup would be caught by the
            # except block below and recorded as an exception).
            if value_reference is not None:
                return_object.value = results_by_id[value_reference]

            for gradient_source in gradient_sources:
                return_object.gradients.append(results_by_id[gradient_source])

            return_object.data_to_store = []

            for output_to_store in outputs_to_store.values():

                # A fresh unique id per stored output keeps the data files
                # from colliding within the shared directory.
                unique_id = str(uuid.uuid4()).replace("-", "")

                data_object_path = path.join(directory,
                                             f"data_{unique_id}.json")
                data_directory = path.join(directory, f"data_{unique_id}")

                WorkflowGraph._store_output_data(
                    data_object_path,
                    data_directory,
                    output_to_store,
                    results_by_id,
                )

                return_object.data_to_store.append(
                    (data_object_path, data_directory))

        except Exception as e:
            # Record (rather than raise) failures so callers receive a
            # result object describing what went wrong.
            return_object.exceptions.append(
                EvaluatorException.from_exception(e))

        return return_object

    @staticmethod
    def _store_output_data(
        data_object_path,
        data_directory,
        output_to_store,
        results_by_id,
    ):
        """Collects all of the simulation to store, and saves it into a directory
        whose path will be passed to the storage backend to process.

        Parameters
        ----------
        data_object_path: str
            The file path to serialize the data object to.
        data_directory: str
            The path of the directory to store ancillary data in.
        output_to_store: BaseStoredData
            An object which contains `ProtocolPath`s pointing to the
            data to store.
        results_by_id: dict of ProtocolPath and any
            The results of the protocols which formed the property
            estimation workflow.
        """

        makedirs(data_directory, exist_ok=True)

        for attribute_name in output_to_store.get_attributes(StorageAttribute):

            attribute = getattr(output_to_store.__class__, attribute_name)
            attribute_value = getattr(output_to_store, attribute_name)

            # Only attributes referencing protocol outputs get resolved;
            # everything else is left untouched.
            if not isinstance(attribute_value, ProtocolPath):
                continue

            attribute_value = results_by_id[attribute_value]

            # File-valued attributes are copied into the data directory and
            # stored by base name so the directory is self-contained.
            if issubclass(attribute.type_hint, FilePath):
                file_copy(attribute_value, data_directory)
                attribute_value = path.basename(attribute_value)

            setattr(output_to_store, attribute_name, attribute_value)

        with open(data_object_path, "w") as file:
            json.dump(output_to_store, file, cls=TypedJSONEncoder)