Example #1
def test_single_step_entrypoint_in_proc():
    with _TemporaryConfiguration(os.path.join(os.path.dirname(__file__), 'fake.config'),
                                 internal_overrides={
                                     'project': 'test',
                                     'domain': 'development'
                                 }):
        with _utils.AutoDeletingTempDir("in") as input_dir:
            literal_map = _type_helpers.pack_python_std_map_to_literal_map(
                {'a': 9}, _type_map_from_variable_map(_task_defs.add_one.interface.inputs))
            input_file = os.path.join(input_dir.name, "inputs.pb")
            _utils.write_proto_to_file(literal_map.to_flyte_idl(), input_file)

            with _utils.AutoDeletingTempDir("out") as output_dir:
                _execute_task(
                    _task_defs.add_one.task_module,
                    _task_defs.add_one.task_function_name,
                    input_file,
                    output_dir.name,
                    False
                )

                p = _utils.load_proto_from_file(
                    _literals_pb2.LiteralMap,
                    os.path.join(output_dir.name, _constants.OUTPUT_FILE_NAME)
                )
                raw_map = _type_helpers.unpack_literal_map_to_sdk_python_std(
                    _literal_models.LiteralMap.from_flyte_idl(p),
                    _type_map_from_variable_map(_task_defs.add_one.interface.outputs)
                )
                assert raw_map['b'] == 10
                assert len(raw_map) == 1
Example #2
def evaluate_on_datasets(
    wf_params,
    model,
    evaluation_clean_mpblob,
    evaluation_dirty_mpblob,
    ground_truths_out,
    predictions_out,
):
    """ Map prediction task on a set of zip files of images to sub tasks"""

    with flytekit_utils.AutoDeletingTempDir("results") as output_models_dir:
        with flytekit_utils.AutoDeletingTempDir(
                "evaluation") as evaluation_dir:
            download_data(evaluation_dir.name, {
                "clean": evaluation_clean_mpblob,
                "dirty": evaluation_dirty_mpblob
            })
            model.download()
            ground_truths, predictions = predict_with_resnet50_model(
                model_path=model.local_path,
                evaluation_dataset=evaluation_dir.name,
                batch_size=DEFAULT_BATCH_SIZE,
                img_size=DEFAULT_IMG_SIZE,
            )

            ground_truths_out.set(ground_truths)
            predictions_out.set(predictions)
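download_data is not defined in this snippet. A minimal sketch of what it plausibly does, assuming each value behaves like the flytekit multi-part blobs above (exposing download() and local_path); the copy step and the helper's shape are assumptions:

import os
import shutil

def download_data(base_dir, named_blobs):
    # Assumed helper: fetch each multi-part blob and lay its files out under
    # base_dir/<name>/ so the evaluation code can find the "clean" and "dirty" sets.
    for name, blob in named_blobs.items():
        blob.download()  # populates blob.local_path
        shutil.copytree(blob.local_path, os.path.join(base_dir, name))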
Example #3
def test_fetch(value_type_pair):
    column_name, flyte_type, values = value_type_pair
    values = [tuple([value]) for value in values]
    schema_type = _schema_impl.SchemaType(columns=[(column_name, flyte_type)])

    with _utils.AutoDeletingTempDir("test") as tmpdir:
        for i in _six_moves.range(3):
            _pd.DataFrame.from_records(values, columns=[
                column_name
            ]).to_parquet(tmpdir.get_named_tempfile(str(i).zfill(6)),
                          coerce_timestamps='us')

        with _utils.AutoDeletingTempDir("test2") as local_dir:
            schema_obj = _schema_impl.Schema.fetch(
                tmpdir.name,
                local_path=local_dir.get_named_tempfile('schema_test'),
                schema_type=schema_type)
            with schema_obj as reader:
                for df in reader.iter_chunks():
                    for check, actual in _six_moves.zip(
                            values, df[column_name].tolist()):
                        assert check[0] == actual
                assert reader.read() is None
                reader.seek(0)
                df = reader.read(concat=True)
                for iter_count, actual in enumerate(df[column_name].tolist()):
                    assert values[iter_count % len(values)][0] == actual
Example #4
def test_single_step_entrypoint_out_of_proc():
    with _TemporaryConfiguration(os.path.join(os.path.dirname(__file__), 'fake.config'),
                                 internal_overrides={
                                     'project': 'test',
                                     'domain': 'development'
                                 }):
        with _utils.AutoDeletingTempDir("in") as input_dir:
            literal_map = _type_helpers.pack_python_std_map_to_literal_map({'a': 9}, _type_map_from_variable_map(
                _task_defs.add_one.interface.inputs))
            input_file = os.path.join(input_dir.name, "inputs.pb")
            _utils.write_proto_to_file(literal_map.to_flyte_idl(), input_file)

            with _utils.AutoDeletingTempDir("out") as output_dir:
                cmd = []
                cmd.extend(["--task-module", _task_defs.add_one.task_module])
                cmd.extend(["--task-name", _task_defs.add_one.task_function_name])
                cmd.extend(["--inputs", input_file])
                cmd.extend(["--output-prefix", output_dir.name])
                result = CliRunner().invoke(execute_task_cmd, cmd)

                assert result.exit_code == 0
                p = _utils.load_proto_from_file(
                    _literals_pb2.LiteralMap,
                    os.path.join(output_dir.name, _constants.OUTPUT_FILE_NAME)
                )
                raw_map = _type_helpers.unpack_literal_map_to_sdk_python_std(
                    _literal_models.LiteralMap.from_flyte_idl(p),
                    _type_map_from_variable_map(_task_defs.add_one.interface.outputs)
                )
                assert raw_map['b'] == 10
                assert len(raw_map) == 1
Example #5
def convert_to_sagemaker_csv(ctx, x_train, y_train, x_test, y_test, train,
                             validation):
    _train = read_and_merge(y_train, x_train)
    _validate = read_and_merge(y_test, x_test)

    with utils.AutoDeletingTempDir("train") as t:
        f = t.get_named_tempfile("train.csv")
        _train.to_csv(f, header=False, index=False)
        train.set(t.name)

    with utils.AutoDeletingTempDir("validate") as t:
        f = t.get_named_tempfile("validate.csv")
        _validate.to_csv(f, header=False, index=False)
        validation.set(t.name)
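read_and_merge is not shown here. SageMaker's built-in algorithms expect CSV training data with the label in the first column and no header, which matches the header=False writes above; a sketch under the assumption that the inputs are pandas-readable CSV paths (names and types are assumptions):

import pandas as pd

def read_and_merge(labels_path, features_path):
    # Assumed helper: put the label column first, then the features,
    # matching the CSV layout SageMaker's built-in algorithms expect.
    y = pd.read_csv(labels_path, header=None)
    x = pd.read_csv(features_path, header=None)
    return pd.concat([y, x], axis=1)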
Example #6
def luminance_select_collection_worker(
    wf_params,
    raw_frames_mpblob,
    n_clusters,
    sample_size,
    random_seed,
    selected_image_mpblob,
    selected_file_names,
):

    with flytekit_utils.AutoDeletingTempDir("output_images") as local_output_dir:
        raw_frames_mpblob.download()

        luminance_sample_collection(
            raw_frames_dir=raw_frames_mpblob.local_path,
            sampled_frames_out_dir=local_output_dir.name,
            n_clusters=n_clusters,
            sample_size=sample_size,
            logger=wf_params.logging,
            random_seed=random_seed,
        )

        # List the names of the files (excluding sub-directories) directly under the output directory
        selected_file_names_in_folder = [
            f for f in sorted(listdir(local_output_dir.name))
            if isfile(join(local_output_dir.name, f))
        ]

        selected_image_mpblob.set(local_output_dir.name)
        selected_file_names.set(selected_file_names_in_folder)
Example #7
    def inputs(self) -> Dict[str, Any]:
        """
        Returns the inputs to the execution in the standard python format as dictated by the type engine.
        """
        if self._inputs is None:
            client = _flyte_engine.get_client()
            execution_data = client.get_execution_data(self.id)

            # Inputs are returned inline unless they are too big, in which case a url blob pointing to them is returned.
            input_map: _literal_models.LiteralMap = _literal_models.LiteralMap(
                {})
            if bool(execution_data.full_inputs.literals):
                input_map = execution_data.full_inputs
            elif execution_data.inputs.bytes > 0:
                with _common_utils.AutoDeletingTempDir() as tmp_dir:
                    tmp_name = _os.path.join(tmp_dir.name, "inputs.pb")
                    _data_proxy.Data.get_data(execution_data.inputs.url,
                                              tmp_name)
                    input_map = _literal_models.LiteralMap.from_flyte_idl(
                        _common_utils.load_proto_from_file(
                            _literals_pb2.LiteralMap, tmp_name))
            lp_id = self.spec.launch_plan
            workflow = _workflow.FlyteWorkflow.fetch(lp_id.project,
                                                     lp_id.domain, lp_id.name,
                                                     lp_id.version)
            self._inputs = TypeEngine.literal_map_to_kwargs(
                ctx=FlyteContextManager.current_context(),
                lm=input_map,
                python_types=TypeEngine.guess_python_types(
                    workflow.interface.inputs),
            )
        return self._inputs
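The inline-vs-offloaded fallback described in the comment above recurs in several of the inputs/outputs properties in these examples. A minimal, dependency-injected sketch of just that logic; every name here is hypothetical, and the flytekit-specific calls are passed in rather than imported:

import os
import tempfile

def resolve_literal_map(full_literals, io_meta, get_data, load_proto, proto_cls, from_idl, empty):
    # Small payloads come back inline as literals; large ones are offloaded
    # behind a URL and have to be downloaded and decoded from a .pb file.
    if full_literals.literals:
        return full_literals
    if io_meta.bytes > 0:
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_name = os.path.join(tmp_dir, "literals.pb")
            get_data(io_meta.url, tmp_name)  # e.g. _data_proxy.Data.get_data
            return from_idl(load_proto(proto_cls, tmp_name))
    return empty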
Example #8
def get_traintest_splitdatabase(ctx, dataset, seed, test_split_ratio, x_train,
                                x_test, y_train, y_test):
    """
    Retrieves the training dataset from the given blob location, splits it using the given split ratio, and returns the result.
    This splitter only works for datasets in the format of the example CSV: the last column is assumed to be the class and
    the preceding columns the features.

    The data is returned as a schema, which is converted to a parquet file behind the scenes.
    """
    with flytekit_utils.AutoDeletingTempDir("dataset_dir"):
        dataset_blob = Types.Blob.fetch(remote_path=dataset)
        column_names = [k for k in DATASET_SCHEMA.columns.keys()]
        df = pd.read_csv(dataset_blob.local_path, names=column_names)

        # Select all features
        x = df[column_names[:8]]
        # Select only the classes
        y = df[[column_names[-1]]]

        # split data into train and test sets
        _x_train, _x_test, _y_train, _y_test = train_test_split(
            x, y, test_size=test_split_ratio, random_state=seed)

        # TODO also add support for Spark dataframe, but make the pyspark dependency optional
        x_train.set(_x_train)
        x_test.set(_x_test)
        y_train.set(_y_train)
        y_test.set(_y_test)
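As the docstring notes, the last column is treated as the class and the preceding columns as features. A tiny self-contained illustration of that split; the column names and data below are made up:

import pandas as pd
from sklearn.model_selection import train_test_split

# Toy frame in the assumed layout: feature columns first, class label last.
df = pd.DataFrame({"f0": [1, 2, 3, 4], "f1": [5, 6, 7, 8], "label": [0, 1, 0, 1]})
x = df[["f0", "f1"]]  # all feature columns
y = df[["label"]]     # only the class column
x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.25, random_state=7)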
Example #9
def object_detection(wf_params, url, result, parsed_image):
    with utils.AutoDeletingTempDir('tmp') as tmpdir:
        request = urllib2.Request(url, headers=HEADERS)
        fname = '{}/image.jpg'.format(tmpdir.name)
        d = urllib2.urlopen(request)
        with open(fname, 'wb') as opfile:
            data = d.read()
            opfile.write(data)
        wf_params.logging.info("downloaded image")

        output_file = '{}/output.jpg'.format(tmpdir.name)
        output = download.detect(fname, output_file)
        scores = output["detection_scores"]
        classes = output["detection_classes"]
        category_index = output["category_index"]
        
        results = []
        for i in range(len(scores)):
            if scores[i] > MIN_SCORE:
                if classes[i] in six.viewkeys(category_index):
                    class_name = category_index[classes[i]]['name']
                else:
                    class_name = 'N/A'
                display_str = str(class_name)
                display_str = '{}: {}%'.format(display_str, int(100 * scores[i]))
                results.append(display_str)

        parsed_image.set(output_file)
        result.set("\n".join(results))
Example #10
def test_datetime_coercion():
    values = [
        tuple(
            [
                _datetime.datetime(day=1, month=1, year=2017, hour=1, minute=1, second=1, microsecond=1)
                - _datetime.timedelta(days=x)
            ]
        )
        for x in _six_moves.range(5)
    ]
    schema_type = _schema_impl.SchemaType(columns=[("testname", _primitives.Datetime)])

    with _test_utils.LocalTestFileSystem():
        with _utils.AutoDeletingTempDir("test") as t:
            a = _schema_impl.Schema.create_at_known_location(t.name, mode="wb", schema_type=schema_type)
            with a as writer:
                for _ in _six_moves.range(5):
                    # us to ms coercion segfaults unless we explicitly allow truncation.
                    writer.write(
                        _pd.DataFrame.from_records(values, columns=["testname"]),
                        coerce_timestamps="ms",
                        allow_truncated_timestamps=True,
                    )

                    # TODO: Uncomment when segfault bug is resolved
                    # with _pytest.raises(Exception):
                    #    writer.write(
                    #        _pd.DataFrame.from_records(values, columns=['testname']),
                    #        coerce_timestamps='ms')

            b = _schema_impl.Schema.create_at_known_location(t.name, mode="wb", schema_type=schema_type)
            with b as writer:
                for _ in _six_moves.range(5):
                    writer.write(_pd.DataFrame.from_records(values, columns=["testname"]))
Example #11
def test_simple_read_and_write_with_different_types(value_type_pair):
    column_name, flyte_type, values = value_type_pair
    values = [tuple([value]) for value in values]
    schema_type = _schema_impl.SchemaType(columns=[(column_name, flyte_type)])

    with _test_utils.LocalTestFileSystem() as sandbox:
        with _utils.AutoDeletingTempDir("test") as t:
            a = _schema_impl.Schema.create_at_known_location(
                t.name, mode='wb', schema_type=schema_type)
            assert a.local_path is None
            with a as writer:
                for _ in _six_moves.range(5):
                    writer.write(
                        _pd.DataFrame.from_records(values,
                                                   columns=[column_name]))
                assert a.local_path.startswith(sandbox.name)
            assert a.local_path is None

            b = _schema_impl.Schema.create_at_known_location(
                t.name, mode='rb', schema_type=schema_type)
            assert b.local_path is None
            with b as reader:
                for df in reader.iter_chunks():
                    for check, actual in _six_moves.zip(
                            values, df[column_name].tolist()):
                        assert check[0] == actual
                assert reader.read() is None
                reader.seek(0)
                df = reader.read(concat=True)
                for iter_count, actual in enumerate(df[column_name].tolist()):
                    assert values[iter_count % len(values)][0] == actual
                assert b.local_path.startswith(sandbox.name)
            assert b.local_path is None
Example #12
    def outputs(self):
        """
        Returns the outputs of the task execution, if available, in the standard Python format that is produced by
        the type engine. If not available, perhaps due to execution being in progress or an error being produced,
        this will raise an exception.
        :rtype: dict[Text, T]
        """
        if not self.is_complete:
            raise _user_exceptions.FlyteAssertion(
                "Please what until the task execution has completed before requesting the outputs."
            )
        if self.error:
            raise _user_exceptions.FlyteAssertion("Outputs could not be found because the execution ended in failure.")

        if self._outputs is None:
            client = _flyte_engine.get_client()
            execution_data = client.get_task_execution_data(self.id)

            # Outputs are returned inline unless they are too big, in which case a url blob pointing to them is returned.
            if bool(execution_data.full_outputs.literals):
                output_map = execution_data.full_outputs

            elif execution_data.outputs.bytes > 0:
                with _common_utils.AutoDeletingTempDir() as t:
                    tmp_name = _os.path.join(t.name, "outputs.pb")
                    _data_proxy.Data.get_data(execution_data.outputs.url, tmp_name)
                    output_map = _literal_models.LiteralMap.from_flyte_idl(
                        _common_utils.load_proto_from_file(_literals_pb2.LiteralMap, tmp_name)
                    )
            else:
                output_map = _literal_models.LiteralMap({})
            self._outputs = _type_helpers.unpack_literal_map_to_sdk_python_std(output_map)
        return self._outputs
Example #13
def test_hive_task_query_generation():
    with _common_utils.AutoDeletingTempDir(
            "user_dir") as user_working_directory:
        context = _common_engine.EngineContext(
            execution_id=WorkflowExecutionIdentifier(project="unit_test",
                                                     domain="unit_test",
                                                     name="unit_test"),
            execution_date=_datetime.utcnow(),
            stats=None,  # TODO: A mock stats object that we can read later.
            logging=_logging,  # TODO: A mock logging object that we can read later.
            tmp_dir=user_working_directory,
        )
        references = {
            name: _task_output.OutputReference(
                _type_helpers.get_sdk_type_from_literal_type(variable.type))
            for name, variable in _six.iteritems(two_queries.interface.outputs)
        }

        qubole_hive_jobs = two_queries._generate_plugin_objects(
            context, references)
        assert len(qubole_hive_jobs) == 2

        # deprecated, collection is only here for backwards compatibility
        assert len(qubole_hive_jobs[0].query_collection.queries) == 1
        assert len(qubole_hive_jobs[1].query_collection.queries) == 1

        # The output references should now have the same fake S3 path as the formatted queries
        assert references["hive_results"].value[0].uri != ""
        assert references["hive_results"].value[1].uri != ""
        assert references["hive_results"].value[0].uri in qubole_hive_jobs[
            0].query.query
        assert references["hive_results"].value[1].uri in qubole_hive_jobs[
            1].query.query
Example #14
def test_task_system_failure():
    with TemporaryConfiguration(os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            '../../../common/configs/local.config'),
                                internal_overrides={
                                    'image':
                                    'myflyteimage:{}'.format(
                                        os.environ.get('IMAGE_VERSION',
                                                       'sha')),
                                    'project':
                                    'myflyteproject',
                                    'domain':
                                    'development'
                                }):
        m = MagicMock()
        m.execute = _raise_system_exception

        with utils.AutoDeletingTempDir("test") as tmp:
            engine.FlyteTask(m).execute(None, {'output_prefix': tmp.name})

            doc = errors.ErrorDocument.from_flyte_idl(
                utils.load_proto_from_file(
                    errors_pb2.ErrorDocument,
                    os.path.join(tmp.name, constants.ERROR_FILE_NAME)))
            assert doc.error.code == "SYSTEM:Unknown"
            assert doc.error.kind == errors.ContainerError.Kind.RECOVERABLE
            assert "errorERRORerror" in doc.error.message
Example #15
    def outputs(self) -> Dict[str, Any]:
        """
        Returns the outputs to the execution in the standard python format as dictated by the type engine.

        :raises: ``FlyteAssertion`` error if execution is in progress or execution ended in error.
        """
        if not self.is_complete:
            raise _user_exceptions.FlyteAssertion(
                "Please wait until the node execution has completed before requesting the outputs."
            )
        if self.error:
            raise _user_exceptions.FlyteAssertion(
                "Outputs could not be found because the execution ended in failure."
            )

        if self._outputs is None:
            client = _flyte_engine.get_client()
            execution_data = client.get_execution_data(self.id)
            # Outputs are returned inline unless they are too big, in which case a url blob pointing to them is returned.
            output_map: _literal_models.LiteralMap = _literal_models.LiteralMap({})
            if bool(execution_data.full_outputs.literals):
                output_map = execution_data.full_outputs
            elif execution_data.outputs.bytes > 0:
                with _common_utils.AutoDeletingTempDir() as tmp_dir:
                    tmp_name = _os.path.join(tmp_dir.name, "outputs.pb")
                    _data_proxy.Data.get_data(execution_data.outputs.url,
                                              tmp_name)
                    output_map = _literal_models.LiteralMap.from_flyte_idl(
                        _common_utils.load_proto_from_file(
                            _literals_pb2.LiteralMap, tmp_name))
            # TODO: need to convert flyte literals to python types. For now just use literals
            # self._outputs = TypeEngine.literal_map_to_kwargs(ctx=FlyteContext.current_context(), lm=output_map)
            self._outputs = output_map
        return self._outputs
Example #16
    def inputs(self):
        """
        Returns the inputs of the task execution in the standard Python format that is produced by
        the type engine.
        :rtype: dict[Text, T]
        """
        if self._inputs is None:
            client = _flyte_engine.get_client()
            execution_data = client.get_task_execution_data(self.id)

            # Inputs are returned inline unless they are too big, in which case a url blob pointing to them is returned.
            if bool(execution_data.full_inputs.literals):
                input_map = execution_data.full_inputs
            elif execution_data.inputs.bytes > 0:
                with _common_utils.AutoDeletingTempDir() as t:
                    tmp_name = _os.path.join(t.name, "inputs.pb")
                    _data_proxy.Data.get_data(execution_data.inputs.url, tmp_name)
                    input_map = _literal_models.LiteralMap.from_flyte_idl(
                        _common_utils.load_proto_from_file(_literals_pb2.LiteralMap, tmp_name)
                    )
            else:
                input_map = _literal_models.LiteralMap({})

            self._inputs = _type_helpers.unpack_literal_map_to_sdk_python_std(input_map)
        return self._inputs
Example #17
    def inputs(self) -> Dict[str, Any]:
        """
        Returns the inputs to the execution in the standard python format as dictated by the type engine.
        """
        if self._inputs is None:
            client = _flyte_engine.get_client()
            execution_data = client.get_node_execution_data(self.id)

            # Inputs are returned inline unless they are too big, in which case a url blob pointing to them is returned.
            input_map: _literal_models.LiteralMap = _literal_models.LiteralMap(
                {})
            if bool(execution_data.full_inputs.literals):
                input_map = execution_data.full_inputs
            elif execution_data.inputs.bytes > 0:
                with _common_utils.AutoDeletingTempDir() as tmp_dir:
                    tmp_name = _os.path.join(tmp_dir.name, "inputs.pb")
                    _data_proxy.Data.get_data(execution_data.inputs.url,
                                              tmp_name)
                    input_map = _literal_models.LiteralMap.from_flyte_idl(
                        _common_utils.load_proto_from_file(
                            _literals_pb2.LiteralMap, tmp_name))

            # TODO: need to convert flyte literals to python types. For now just use literals
            # self._inputs = TypeEngine.literal_map_to_kwargs(ctx=FlyteContext.current_context(), lm=input_map)
            self._inputs = input_map
        return self._inputs
Example #18
def test_module_loading():
    with _utils.AutoDeletingTempDir("mypackage") as pkg:
        path = pkg.name
        # Create directories
        top_level = os.path.join(path, "top")
        middle_level = os.path.join(top_level, "middle")
        bottom_level = os.path.join(middle_level, "bottom")
        os.makedirs(bottom_level)

        # Create init files
        with open(os.path.join(path, "__init__.py"), "w"):
            pass
        with open(os.path.join(top_level, "__init__.py"), "w"):
            pass
        with open(os.path.join(top_level, "a.py"), "w"):
            pass
        with open(os.path.join(middle_level, "__init__.py"), "w"):
            pass
        with open(os.path.join(middle_level, "a.py"), "w"):
            pass
        with open(os.path.join(bottom_level, "__init__.py"), "w"):
            pass
        with open(os.path.join(bottom_level, "a.py"), "w"):
            pass

        sys.path.append(path)

        # Not a sufficient test but passes for now: 6 = the three packages plus their three a.py modules
        assert sum(1 for _ in module_loader.iterate_modules(["top"])) == 6
        assert [
            pkg.__file__ for pkg in module_loader.iterate_modules(["top.a", "top.middle.a", "top.middle.bottom.a"])
        ] == [os.path.join(lvl, "a.py") for lvl in (top_level, middle_level, bottom_level)]
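module_loader.iterate_modules is not shown in this example. A rough stand-in that matches the counts asserted above, assuming it yields each named module and, for packages, every submodule beneath it (the function name is illustrative):

import importlib
import pkgutil

def iterate_modules_sketch(names):
    for name in names:
        module = importlib.import_module(name)
        yield module
        if hasattr(module, "__path__"):  # packages: recurse into submodules
            for info in pkgutil.walk_packages(module.__path__, prefix=name + "."):
                yield importlib.import_module(info.name)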
Example #19
def execute_task(task_module, task_name, inputs, output_prefix, test):
    with _TemporaryConfiguration(_internal_config.CONFIGURATION_PATH.get()):
        with _utils.AutoDeletingTempDir('input_dir') as input_dir:
            # Load user code
            task_module = _importlib.import_module(task_module)
            task_def = getattr(task_module, task_name)

            if not test:
                local_inputs_file = input_dir.get_named_tempfile('inputs.pb')

                # Handle inputs/outputs for array job.
                if _os.environ.get('BATCH_JOB_ARRAY_INDEX_VAR_NAME'):
                    job_index = _compute_array_job_index()

                    # TODO: Perhaps remove.  This is a workaround to an issue we perceived with limited entropy in
                    # TODO: AWS batch array jobs.
                    _flyte_random.seed_flyte_random("{} {} {}".format(
                        _random.random(), _datetime.datetime.utcnow(),
                        job_index))

                    # If an ArrayTask is discoverable, the original job index may be different than the one specified in
                    # the environment variable. Look up the correct input/outputs in the index lookup mapping file.
                    job_index = _map_job_index_to_child_index(
                        input_dir, inputs, job_index)

                    inputs = _os.path.join(inputs, str(job_index), 'inputs.pb')
                    output_prefix = _os.path.join(output_prefix,
                                                  str(job_index))

                _data_proxy.Data.get_data(inputs, local_inputs_file)
                input_proto = _utils.load_proto_from_file(
                    _literals_pb2.LiteralMap, local_inputs_file)
                _engine_loader.get_engine().get_task(task_def).execute(
                    _literal_models.LiteralMap.from_flyte_idl(input_proto),
                    context={'output_prefix': output_prefix})
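_compute_array_job_index is not defined here. Based on the environment variables exercised by the array-job test below, the index computation plausibly amounts to the following; any offset handling is omitted and the whole sketch is an assumption:

import os

def _compute_array_job_index():
    # BATCH_JOB_ARRAY_INDEX_VAR_NAME names the environment variable that holds
    # this container's index within the array job (e.g. AWS_BATCH_JOB_ARRAY_INDEX).
    index_var_name = os.environ["BATCH_JOB_ARRAY_INDEX_VAR_NAME"]
    return int(os.environ[index_var_name])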
Example #20
def test_hive_task_dynamic_job_spec_generation():
    with _common_utils.AutoDeletingTempDir(
            "user_dir") as user_working_directory:
        context = _common_engine.EngineContext(
            execution_id=WorkflowExecutionIdentifier(project="unit_test",
                                                     domain="unit_test",
                                                     name="unit_test"),
            execution_date=_datetime.utcnow(),
            stats=None,  # TODO: A mock stats object that we can read later.
            logging=_logging,  # TODO: A mock logging object that we can read later.
            tmp_dir=user_working_directory,
        )
        dj_spec = two_queries._produce_dynamic_job_spec(
            context, _literals.LiteralMap(literals={}))

        # Bindings
        assert len(dj_spec.outputs[0].binding.collection.bindings) == 2
        assert isinstance(
            dj_spec.outputs[0].binding.collection.bindings[0].scalar.schema,
            Schema)
        assert isinstance(
            dj_spec.outputs[0].binding.collection.bindings[1].scalar.schema,
            Schema)

        # Custom field is filled in
        assert len(dj_spec.tasks[0].custom) > 0
Example #21
    def inputs(self) -> Dict[str, Any]:
        """
        Returns the inputs of the task execution in the standard Python format that is produced by
        the type engine.
        """
        from flytekit.control_plane.tasks.task import FlyteTask

        if self._inputs is None:
            client = _flyte_engine.get_client()
            execution_data = client.get_task_execution_data(self.id)

            # Inputs are returned inline unless they are too big, in which case a url blob pointing to them is returned.
            input_map = _literal_models.LiteralMap({})
            if bool(execution_data.full_inputs.literals):
                input_map = execution_data.full_inputs
            elif execution_data.inputs.bytes > 0:
                with _common_utils.AutoDeletingTempDir() as tmp_dir:
                    tmp_name = os.path.join(tmp_dir.name, "inputs.pb")
                    _data_proxy.Data.get_data(execution_data.inputs.url,
                                              tmp_name)
                    input_map = _literal_models.LiteralMap.from_flyte_idl(
                        _common_utils.load_proto_from_file(
                            _literals_pb2.LiteralMap, tmp_name))

            task = FlyteTask.fetch(self.id.task_id.project,
                                   self.id.task_id.domain,
                                   self.id.task_id.name,
                                   self.id.task_id.version)
            self._inputs = TypeEngine.literal_map_to_kwargs(
                ctx=FlyteContextManager.current_context(),
                lm=input_map,
                python_types=TypeEngine.guess_python_types(
                    task.interface.inputs),
            )
        return self._inputs
Example #22
def test_module_loading():
    with _utils.AutoDeletingTempDir("mypackage") as pkg:
        path = pkg.name
        # Create directories
        top_level = os.path.join(path, 'top')
        middle_level = os.path.join(top_level, 'middle')
        bottom_level = os.path.join(middle_level, 'bottom')
        os.makedirs(bottom_level)

        # Create init files
        with open(os.path.join(path, '__init__.py'), 'w'):
            pass
        with open(os.path.join(top_level, '__init__.py'), 'w'):
            pass
        with open(os.path.join(top_level, 'a.py'), 'w'):
            pass
        with open(os.path.join(middle_level, '__init__.py'), 'w'):
            pass
        with open(os.path.join(middle_level, 'a.py'), 'w'):
            pass
        with open(os.path.join(bottom_level, '__init__.py'), 'w'):
            pass
        with open(os.path.join(bottom_level, 'a.py'), 'w'):
            pass

        sys.path.append(path)

        # Not a sufficient test but passes for now: 6 = the three packages plus their three a.py modules
        assert sum(1 for _ in module_loader.iterate_modules(['top'])) == 6
Example #23
def test_datetime_coercion_explicitly():
    """
    Sanity check that we're using a version of pyarrow that allows us to
    truncate timestamps
    """
    dt = _datetime.datetime(day=1,
                            month=1,
                            year=2017,
                            hour=1,
                            minute=1,
                            second=1,
                            microsecond=1)
    values = [(dt, )]
    df = _pd.DataFrame.from_records(values, columns=['testname'])
    assert df['testname'][0] == dt

    with _utils.AutoDeletingTempDir('test') as tmpdir:
        tmpfile = tmpdir.get_named_tempfile('repro.parquet')
        df.to_parquet(tmpfile,
                      coerce_timestamps='ms',
                      allow_truncated_timestamps=True)
        df2 = _pd.read_parquet(tmpfile)

    dt2 = _datetime.datetime(day=1,
                             month=1,
                             year=2017,
                             hour=1,
                             minute=1,
                             second=1)
    assert df2['testname'][0] == dt2
Example #24
def download_video_worker(
    wf_params, video_external_path, video_blob,
):
    # avi_local = wf_params.working_directory.get_named_tempfile("input.avi")
    with flytekit_utils.AutoDeletingTempDir("stream") as download_dir:
        local_path = join(download_dir.name, basename(video_external_path))
        b = Types.Blob.fetch(remote_path=video_external_path, local_path=local_path)
        video_blob.set(b)
Example #25
def test_arrayjob_entrypoint_in_proc():
    with _TemporaryConfiguration(os.path.join(os.path.dirname(__file__),
                                              'fake.config'),
                                 internal_overrides={
                                     'project': 'test',
                                     'domain': 'development'
                                 }):
        with _utils.AutoDeletingTempDir("dir") as dir:
            literal_map = _type_helpers.pack_python_std_map_to_literal_map(
                {'a': 9},
                _type_map_from_variable_map(
                    _task_defs.add_one.interface.inputs))

            input_dir = os.path.join(dir.name, "1")
            os.mkdir(
                input_dir)  # auto cleanup will take this subdir into account

            input_file = os.path.join(input_dir, "inputs.pb")
            _utils.write_proto_to_file(literal_map.to_flyte_idl(), input_file)

            # construct indexlookup.pb which has array: [1]
            mapped_index = _literals.Literal(
                _literals.Scalar(primitive=_literals.Primitive(integer=1)))
            index_lookup_collection = _literals.LiteralCollection(
                [mapped_index])
            index_lookup_file = os.path.join(dir.name, "indexlookup.pb")
            _utils.write_proto_to_file(index_lookup_collection.to_flyte_idl(),
                                       index_lookup_file)

            # fake arrayjob task by setting environment variables
            orig_env_index_var_name = os.environ.get(
                'BATCH_JOB_ARRAY_INDEX_VAR_NAME')
            orig_env_array_index = os.environ.get('AWS_BATCH_JOB_ARRAY_INDEX')
            os.environ[
                'BATCH_JOB_ARRAY_INDEX_VAR_NAME'] = 'AWS_BATCH_JOB_ARRAY_INDEX'
            os.environ['AWS_BATCH_JOB_ARRAY_INDEX'] = '0'

            execute_task(_task_defs.add_one.task_module,
                         _task_defs.add_one.task_function_name, dir.name,
                         dir.name, False)

            raw_map = _type_helpers.unpack_literal_map_to_sdk_python_std(
                _literal_models.LiteralMap.from_flyte_idl(
                    _utils.load_proto_from_file(
                        _literals_pb2.LiteralMap,
                        os.path.join(input_dir, _constants.OUTPUT_FILE_NAME))),
                _type_map_from_variable_map(
                    _task_defs.add_one.interface.outputs))
            assert raw_map['b'] == 10
            assert len(raw_map) == 1

            # reset the env vars
            if orig_env_index_var_name:
                os.environ[
                    'BATCH_JOB_ARRAY_INDEX_VAR_NAME'] = orig_env_index_var_name
            if orig_env_array_index:
                os.environ['AWS_BATCH_JOB_ARRAY_INDEX'] = orig_env_array_index
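The index-lookup step exercised above (indexlookup.pb holding a LiteralCollection) suggests that _map_job_index_to_child_index works roughly as follows. The exact flytekit imports, calls, and the helper's signature are assumptions inferred from how execute_task invokes it:

import os as _os

from flyteidl.core import literals_pb2 as _literals_pb2
from flytekit.common import utils as _utils
from flytekit.interfaces.data import data_proxy as _data_proxy
from flytekit.models import literals as _literal_models

def _map_job_index_to_child_index(local_input_dir, inputs_prefix, raw_index):
    # Download indexlookup.pb from the inputs prefix and translate the raw
    # array-job index into the child index recorded for this sub-task.
    lookup_file = local_input_dir.get_named_tempfile("indexlookup.pb")
    _data_proxy.Data.get_data(_os.path.join(inputs_prefix, "indexlookup.pb"), lookup_file)
    lookup = _literal_models.LiteralCollection.from_flyte_idl(
        _utils.load_proto_from_file(_literals_pb2.LiteralCollection, lookup_file))
    return lookup.literals[raw_index].scalar.primitive.integer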
Example #26
def confusion_matrix(wf_params, y_true, y_pred, title, normalize, classes, matrix, visual):
    with utils.AutoDeletingTempDir('test') as tmpdir:
        f_path = tmpdir.get_named_tempfile("visual.png")
        cm = _plot_confusion_matrix(np.asarray(y_true), np.asarray(y_pred), classes=np.asarray(classes), title=title, normalize=normalize, to_file_path=f_path)
        m = []
        for i in range(cm.shape[0]):
            m.append([])
            for j in range(cm.shape[1]):
                m[i].append(cm[i][j])
        visual.set(f_path)
        matrix.set(m)
Example #27
    def get_outputs(self):
        """
        :rtype: flytekit.models.literals.LiteralMap
        """
        with _common_utils.AutoDeletingTempDir() as t:
            tmp_name = _os.path.join(t.name, "outputs.pb")
            _data_proxy.Data.get_data(
                self.sdk_task_execution.closure.output_uri, tmp_name)
            return _literals.LiteralMap.from_flyte_idl(
                _common_utils.load_proto_from_file(_literals_pb2.LiteralMap,
                                                   tmp_name))
Example #28
def test_backwards_compatible_replacement(mock_execute_task):
    def return_args(*args, **kwargs):
        assert args[4] is None

    mock_execute_task.side_effect = return_args

    with _TemporaryConfiguration(
        os.path.join(os.path.dirname(__file__), "fake.config"),
        internal_overrides={"project": "test", "domain": "development"},
    ):
        with _utils.AutoDeletingTempDir("in"):
            with _utils.AutoDeletingTempDir("out"):
                cmd = []
                cmd.extend(["--task-module", "fake"])
                cmd.extend(["--task-name", "fake"])
                cmd.extend(["--inputs", "fake"])
                cmd.extend(["--output-prefix", "fake"])
                cmd.extend(["--raw-output-data-prefix", "{{.rawOutputDataPrefix}}"])
                result = CliRunner().invoke(execute_task_cmd, cmd)
                assert result.exit_code == 0
Example #29
    def __enter__(self):
        """
        :rtype: flytekit.common.utils.AutoDeletingTempDir
        """
        self._exit_stack.__enter__()
        temp_dir = self._exit_stack.enter_context(
            _utils.AutoDeletingTempDir("local_test_filesystem"))
        self._exit_stack.enter_context(
            _data_proxy.LocalDataContext(temp_dir.name))
        self._exit_stack.enter_context(
            _data_proxy.LocalWorkingDirectoryContext(temp_dir))
        return temp_dir
Example #30
def test_download(value_type_pair):
    column_name, flyte_type, values = value_type_pair
    values = [tuple([value]) for value in values]
    schema_type = _schema_impl.SchemaType(columns=[(column_name, flyte_type)])

    with _utils.AutoDeletingTempDir("test") as tmpdir:
        for i in _six_moves.range(3):
            _pd.DataFrame.from_records(values, columns=[column_name]).to_parquet(
                tmpdir.get_named_tempfile(str(i).zfill(6)), coerce_timestamps="us"
            )

        with _utils.AutoDeletingTempDir("test2") as local_dir:
            schema_obj = _schema_impl.Schema(tmpdir.name, schema_type=schema_type)
            schema_obj.download(local_dir.get_named_tempfile(_uuid.uuid4().hex))
            with schema_obj as reader:
                for df in reader.iter_chunks():
                    for check, actual in _six_moves.zip(values, df[column_name].tolist()):
                        assert check[0] == actual
                assert reader.read() is None
                reader.seek(0)
                df = reader.read(concat=True)
                for iter_count, actual in enumerate(df[column_name].tolist()):
                    assert values[iter_count % len(values)][0] == actual

        with _pytest.raises(Exception):
            schema_obj = _schema_impl.Schema(tmpdir.name, schema_type=schema_type)
            schema_obj.download()

        with _test_utils.LocalTestFileSystem():
            schema_obj = _schema_impl.Schema(tmpdir.name, schema_type=schema_type)
            schema_obj.download()
            with schema_obj as reader:
                for df in reader.iter_chunks():
                    for check, actual in _six_moves.zip(values, df[column_name].tolist()):
                        assert check[0] == actual
                assert reader.read() is None
                reader.seek(0)
                df = reader.read(concat=True)
                for iter_count, actual in enumerate(df[column_name].tolist()):
                    assert values[iter_count % len(values)][0] == actual