Example #1
0
def test_hive_task_query_generation():
    with _common_utils.AutoDeletingTempDir(
            "user_dir") as user_working_directory:
        context = _common_engine.EngineContext(
            execution_id=WorkflowExecutionIdentifier(project="unit_test",
                                                     domain="unit_test",
                                                     name="unit_test"),
            execution_date=_datetime.utcnow(),
            stats=None,  # TODO: A mock stats object that we can read later.
            logging=
            _logging,  # TODO: A mock logging object that we can read later.
            tmp_dir=user_working_directory,
        )
        references = {
            name: _task_output.OutputReference(
                _type_helpers.get_sdk_type_from_literal_type(variable.type))
            for name, variable in _six.iteritems(two_queries.interface.outputs)
        }

        qubole_hive_jobs = two_queries._generate_plugin_objects(
            context, references)
        assert len(qubole_hive_jobs) == 2

        # deprecated, collection is only here for backwards compatibility
        assert len(qubole_hive_jobs[0].query_collection.queries) == 1
        assert len(qubole_hive_jobs[1].query_collection.queries) == 1

        # The output references should now have the same fake S3 path as the formatted queries
        assert references["hive_results"].value[0].uri != ""
        assert references["hive_results"].value[1].uri != ""
        assert references["hive_results"].value[0].uri in qubole_hive_jobs[
            0].query.query
        assert references["hive_results"].value[1].uri in qubole_hive_jobs[
            1].query.query
Example #2
0
    def execute(self, context, inputs):
        """
        :param flytekit.engines.common.EngineContext context:
        :param flytekit.models.literals.LiteralMap inputs:
        :rtype: dict[Text, flytekit.models.common.FlyteIdlEntity]
        :returns: This function must return a dictionary mapping 'filenames' to Flyte Interface Entities.  These
            entities will be used by the engine to pass data from node to node, populate metadata, etc. etc..  Each
            engine will have different behavior.  For instance, the Flyte engine will upload the entities to a remote
            working directory (with the names provided), which will in turn allow Flyte Propeller to push along the
            workflow.  Where as local engine will merely feed the outputs directly into the next node.
        """
        inputs_dict = _type_helpers.unpack_literal_map_to_sdk_python_std(
            inputs,
            {
                k: _type_helpers.get_sdk_type_from_literal_type(v.type)
                for k, v in _six.iteritems(self.interface.inputs)
            },
        )
        outputs_dict = {
            name: _task_output.OutputReference(
                _type_helpers.get_sdk_type_from_literal_type(variable.type))
            for name, variable in _six.iteritems(self.interface.outputs)
        }
        inputs_dict.update(outputs_dict)

        self._execute_user_code(context, inputs_dict)

        return {
            _constants.OUTPUT_FILE_NAME:
            _literal_models.LiteralMap(literals={
                k: v.sdk_value
                for k, v in _six.iteritems(outputs_dict)
            })
        }
Example #3
0
    def _produce_dynamic_job_spec(self, context, inputs):
        """
        Runs user code and and produces future task nodes to run sub-tasks.
        :param context:
        :param flytekit.models.literals.LiteralMap literal_map inputs:
        :rtype: flytekit.models.dynamic_job.DynamicJobSpec
        """
        inputs_dict = _type_helpers.unpack_literal_map_to_sdk_python_std(
            inputs,
            {
                k: _type_helpers.get_sdk_type_from_literal_type(v.type)
                for k, v in _six.iteritems(self.interface.inputs)
            },
        )
        outputs_dict = {
            name: _task_output.OutputReference(
                _type_helpers.get_sdk_type_from_literal_type(variable.type))
            for name, variable in _six.iteritems(self.interface.outputs)
        }

        # Add outputs to inputs
        inputs_dict.update(outputs_dict)

        nodes = []
        tasks = []
        # One node per query
        generated_queries = self._generate_plugin_objects(context, inputs_dict)

        # Create output bindings always - this has to happen after user code has run
        output_bindings = [
            _literal_models.Binding(
                var=name,
                binding=_interface.BindingData.from_python_std(
                    b.sdk_type.to_flyte_literal_type(), b.value),
            ) for name, b in _six.iteritems(outputs_dict)
        ]

        i = 0
        for quboleHiveJob in generated_queries:
            hive_job_node = _create_hive_job_node("HiveQuery_{}".format(i),
                                                  quboleHiveJob.to_flyte_idl(),
                                                  self.metadata)
            nodes.append(hive_job_node)
            tasks.append(hive_job_node.executable_sdk_object)
            i += 1

        dynamic_job_spec = _dynamic_job.DynamicJobSpec(
            min_successes=len(nodes),
            tasks=tasks,
            nodes=nodes,
            outputs=output_bindings,
            subworkflows=[],
        )

        return dynamic_job_spec
Example #4
0
def test_sdk_output_references_construction():
    references = {
        name: _task_output.OutputReference(_type_helpers.get_sdk_type_from_literal_type(variable.type))
        for name, variable in _six.iteritems(two_queries.interface.outputs)
    }
    # Before user code is run, the outputs passed to the user code should not have values
    assert references['hive_results'].sdk_value == _base_sdk_types.Void()

    # Should be a list of schemas
    assert isinstance(references['hive_results'].sdk_type, _containers.TypedCollectionType)
    assert isinstance(references['hive_results'].sdk_type.sub_type, _schema.SchemaInstantiator)
Example #5
0
    def _produce_dynamic_job_spec(self, context, inputs):
        """
        Runs user code and and produces future task nodes to run sub-tasks.
        :param context:
        :param flytekit.models.literals.LiteralMap literal_map inputs:
        :rtype: flytekit.models.dynamic_job.DynamicJobSpec
        """
        inputs_dict = _type_helpers.unpack_literal_map_to_sdk_python_std(
            inputs, {
                k: _type_helpers.get_sdk_type_from_literal_type(v.type)
                for k, v in _six.iteritems(self.interface.inputs)
            })
        outputs_dict = {
            name: _task_output.OutputReference(
                _type_helpers.get_sdk_type_from_literal_type(variable.type))
            for name, variable in _six.iteritems(self.interface.outputs)
        }

        # Add outputs to inputs
        inputs_dict.update(outputs_dict)

        # Note: Today a hive task corresponds to a dynamic job spec with one node, which contains multiple
        # queries. We may change this in future.
        nodes = []
        tasks = []
        generated_queries = self._generate_hive_queries(context, inputs_dict)

        # Create output bindings always - this has to happen after user code has run
        output_bindings = [
            _literal_models.Binding(
                var=name,
                binding=_interface.BindingData.from_python_std(
                    b.sdk_type.to_flyte_literal_type(), b.value))
            for name, b in _six.iteritems(outputs_dict)
        ]

        if len(generated_queries.query_collection.queries) > 0:
            hive_job_node = _create_hive_job_node(
                "HiveQueries", generated_queries.to_flyte_idl(), self.metadata)
            nodes.append(hive_job_node)
            tasks.append(hive_job_node.executable_sdk_object)

        dynamic_job_spec = _dynamic_job.DynamicJobSpec(
            min_successes=len(
                nodes
            ),  # At most we only have one node for now, see above comment
            tasks=tasks,
            nodes=nodes,
            outputs=output_bindings,
            subworkflows=[])

        return dynamic_job_spec
Example #6
0
    def execute(self, context, inputs):
        """
        :param flytekit.engines.common.EngineContext context:
        :param flytekit.models.literals.LiteralMap inputs:
        :rtype: dict[Text, flytekit.models.common.FlyteIdlEntity]
        :returns: This function must return a dictionary mapping 'filenames' to Flyte Interface Entities.  These
            entities will be used by the engine to pass data from node to node, populate metadata, etc. etc..  Each
            engine will have different behavior.  For instance, the Flyte engine will upload the entities to a remote
            working directory (with the names provided), which will in turn allow Flyte Propeller to push along the
            workflow.  Where as local engine will merely feed the outputs directly into the next node.
        """
        inputs_dict = _type_helpers.unpack_literal_map_to_sdk_python_std(
            inputs,
            {
                k: _type_helpers.get_sdk_type_from_literal_type(v.type)
                for k, v in _six.iteritems(self.interface.inputs)
            },
        )

        input_notebook_path = self._notebook_path
        # Execute Notebook via Papermill.
        output_notebook_path = input_notebook_path.split(
            ".ipynb")[0] + "-out.ipynb"
        _pm.execute_notebook(input_notebook_path,
                             output_notebook_path,
                             parameters=inputs_dict)

        # Parse Outputs from Notebook.
        outputs = None
        with open(output_notebook_path) as json_file:
            data = _json.load(json_file)
            for p in data["cells"]:
                meta = p["metadata"]
                if "outputs" in meta["tags"]:
                    outputs = " ".join(p["outputs"][0]["data"]["text/plain"])

        if outputs is not None:
            dict = _literal_models._literals_pb2.LiteralMap()
            _text_format.Parse(outputs, dict)

        # Add output_notebook as an output to the task.
        output_notebook = _task_output.OutputReference(
            _type_helpers.get_sdk_type_from_literal_type(
                _Types.Blob.to_flyte_literal_type()))
        output_notebook.set(output_notebook_path)

        output_literal_map = _literal_models.LiteralMap.from_flyte_idl(dict)
        output_literal_map.literals[
            OUTPUT_NOTEBOOK] = output_notebook.sdk_value

        return {_constants.OUTPUT_FILE_NAME: output_literal_map}