def test_bad_confirm(self, sane_config, dataset_name, pattern):
    """Test confirming a non-existent dataset or one that does not have a `confirm` method"""
    data_catalog = DataCatalog.from_config(**sane_config)
    with pytest.raises(DataSetError, match=re.escape(pattern)):
        data_catalog.confirm(dataset_name)
def _run(  # pylint: disable=too-many-locals,useless-suppression
    self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None
) -> None:
    """The abstract interface for running pipelines.

    Args:
        pipeline: The ``Pipeline`` to run.
        catalog: The ``DataCatalog`` from which to fetch data.
        run_id: The id of the run.

    Raises:
        AttributeError: When the provided pipeline is not suitable for
            parallel execution.
        RuntimeError: If the runner is unable to schedule the execution of
            all pipeline nodes.
        Exception: In case of any downstream node failure.
    """
    # pylint: disable=import-outside-toplevel,cyclic-import
    from kedro.framework.session.session import get_current_session

    nodes = pipeline.nodes
    self._validate_catalog(catalog, pipeline)
    self._validate_nodes(nodes)

    load_counts = Counter(chain.from_iterable(n.inputs for n in nodes))
    node_dependencies = pipeline.node_dependencies
    todo_nodes = set(node_dependencies.keys())
    done_nodes = set()  # type: Set[Node]
    futures = set()
    done = None
    max_workers = self._get_required_workers_count(pipeline)

    from kedro.framework.project import PACKAGE_NAME

    session = get_current_session(silent=True)
    # pylint: disable=protected-access
    conf_logging = session._get_logging_config() if session else None

    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        while True:
            ready = {n for n in todo_nodes if node_dependencies[n] <= done_nodes}
            todo_nodes -= ready
            for node in ready:
                futures.add(
                    pool.submit(
                        _run_node_synchronization,
                        node,
                        catalog,
                        self._is_async,
                        run_id,
                        package_name=PACKAGE_NAME,
                        conf_logging=conf_logging,
                    )
                )
            if not futures:
                if todo_nodes:
                    debug_data = {
                        "todo_nodes": todo_nodes,
                        "done_nodes": done_nodes,
                        "ready_nodes": ready,
                        "done_futures": done,
                    }
                    debug_data_str = "\n".join(
                        f"{k} = {v}" for k, v in debug_data.items()
                    )
                    raise RuntimeError(
                        f"Unable to schedule new tasks although some nodes "
                        f"have not been run:\n{debug_data_str}"
                    )
                break  # pragma: no cover
            done, futures = wait(futures, return_when=FIRST_COMPLETED)
            for future in done:
                try:
                    node = future.result()
                except Exception:
                    self._suggest_resume_scenario(pipeline, done_nodes)
                    raise
                done_nodes.add(node)

                # Decrement load counts and release any data sets we've finished with;
                # this is particularly important for the shared datasets we create above.
                for data_set in node.inputs:
                    load_counts[data_set] -= 1
                    if load_counts[data_set] < 1 and data_set not in pipeline.inputs():
                        catalog.release(data_set)
                for data_set in node.outputs:
                    if load_counts[data_set] < 1 and data_set not in pipeline.outputs():
                        catalog.release(data_set)
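For orientation, a minimal, hypothetical usage sketch of the runner this `_run` method backs; the pipeline, node function, and dataset names below are illustrative assumptions, not taken from the snippet above.

# Hypothetical usage sketch: the public AbstractRunner.run() entry point
# eventually delegates to the _run() method shown above. Node functions must
# be importable/picklable for process-based execution, hence the top-level
# definition and the __main__ guard.
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import ParallelRunner


def double(x):
    return x * 2


if __name__ == "__main__":
    pipeline = Pipeline(
        [
            node(double, inputs="numbers", outputs="doubled"),
            node(double, inputs="doubled", outputs="quadrupled"),
        ]
    )
    catalog = DataCatalog({"numbers": MemoryDataSet(21)})
    print(ParallelRunner().run(pipeline, catalog))  # expected: {"quadrupled": 84}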
def data_catalog_from_config(sane_config):
    return DataCatalog.from_config(**sane_config)
def register_catalog(
    self, catalog, credentials, load_versions, save_version, journal
) -> DataCatalog:
    return DataCatalog.from_config(
        catalog, credentials, load_versions, save_version, journal
    )
def catalog():
    return DataCatalog()
def test_not_found_error(self, fake_transformer):
    catalog = DataCatalog()
    with pytest.raises(DataSetNotFoundError):
        catalog.add_transformer(fake_transformer, "test")
def catalog(fake_data_set):
    return DataCatalog({"test": fake_data_set})
def test_missing_credentials(self, sane_config):
    """Check the error if credentials can't be located"""
    sane_config["catalog"]["cars"]["credentials"] = "missing"
    with pytest.raises(KeyError, match=r"Unable to find credentials"):
        DataCatalog.from_config(**sane_config)
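An illustrative sketch of why the test above raises: the string under a dataset's "credentials" key must match a key in the credentials mapping passed to DataCatalog.from_config. The entry names and the dataset type string below are assumptions (the type shorthand assumes a Kedro version that ships kedro.extras.datasets), not the test fixtures themselves.

# Assumed, minimal config sketch: "dev_s3" appears in both mappings, so
# from_config can resolve it; replacing it with a missing name reproduces the
# KeyError asserted above. Loading an s3:// path additionally requires s3fs.
from kedro.io import DataCatalog

catalog_config = {
    "cars": {
        "type": "pandas.CSVDataSet",
        "filepath": "s3://my-bucket/cars.csv",
        "credentials": "dev_s3",
    }
}
credentials = {"dev_s3": {"key": "AKIA...", "secret": "***"}}

catalog = DataCatalog.from_config(catalog_config, credentials)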
def test_idempotent_catalog(self, sane_config):
    """Test that data catalog instantiations are idempotent"""
    _ = DataCatalog.from_config(**sane_config)  # NOQA
    catalog = DataCatalog.from_config(**sane_config)
    assert catalog
def test_config_invalid_module(self, sane_config):
    """Check the error if the type points to a nonexistent module"""
    sane_config["catalog"]["boats"][
        "type"
    ] = "kedro.invalid_module_name.io.CSVLocalDataSet"
    with pytest.raises(DataSetError, match=r"Cannot import module"):
        DataCatalog.from_config(**sane_config)
def test_empty_config(self):
    """Test that an empty config still produces a (truthy) catalog"""
    assert DataCatalog.from_config(None)
def test_save_to_unregistered(self, dummy_dataframe):
    """Check the error when attempting to save to an unregistered data set"""
    catalog = DataCatalog(data_sets={})
    pattern = r"DataSet 'test' not found in the catalog"
    with pytest.raises(DataSetNotFoundError, match=pattern):
        catalog.save("test", dummy_dataframe)
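A hedged counterpart sketch (dataset name and DataFrame are assumed for illustration): once a dataset is registered under the name being saved to, DataCatalog.save succeeds instead of raising DataSetNotFoundError.

# Registering the missing dataset first makes the save succeed.
import pandas as pd
from kedro.io import DataCatalog, MemoryDataSet

catalog = DataCatalog(data_sets={})
catalog.add("test", MemoryDataSet())            # register the dataset
catalog.save("test", pd.DataFrame({"a": [1]}))  # now succeeds
print(catalog.load("test"))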
def test_mlflow_hook_metrics_dataset_with_run_id(
    kedro_project_with_mlflow_conf, dummy_pipeline, dummy_run_params
):
    bootstrap_project(kedro_project_with_mlflow_conf)
    with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
        context = session.load_context()

        with mlflow.start_run():
            existing_run_id = mlflow.active_run().info.run_id

        dummy_catalog_with_run_id = DataCatalog(
            {
                "raw_data": MemoryDataSet(pd.DataFrame(data=[1], columns=["a"])),
                "params:unused_param": MemoryDataSet("blah"),
                "data": MemoryDataSet(),
                "model": PickleDataSet(
                    (kedro_project_with_mlflow_conf / "data" / "model.csv").as_posix()
                ),
                "my_metrics": MlflowMetricsDataSet(run_id=existing_run_id),
                "another_metrics": MlflowMetricsDataSet(
                    run_id=existing_run_id, prefix="foo"
                ),
                "my_metric": MlflowMetricDataSet(run_id=existing_run_id),
                "another_metric": MlflowMetricDataSet(
                    run_id=existing_run_id, key="foo"
                ),
                "my_metric_history": MlflowMetricHistoryDataSet(run_id=existing_run_id),
                "another_metric_history": MlflowMetricHistoryDataSet(
                    run_id=existing_run_id, key="bar"
                ),
            }
        )

        mlflow_hook = MlflowHook()
        runner = SequentialRunner()

        mlflow_hook.after_context_created(context)
        mlflow_hook.after_catalog_created(
            catalog=dummy_catalog_with_run_id,
            # `after_catalog_created` is not using any of the arguments below,
            # so we are setting them to empty values.
            conf_catalog={},
            conf_creds={},
            feed_dict={},
            save_version="",
            load_versions="",
        )
        mlflow_hook.before_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline,
            catalog=dummy_catalog_with_run_id,
        )
        runner.run(dummy_pipeline, dummy_catalog_with_run_id, session._hook_manager)

        current_run_id = mlflow.active_run().info.run_id

        mlflow_hook.after_pipeline_run(
            run_params=dummy_run_params,
            pipeline=dummy_pipeline,
            catalog=dummy_catalog_with_run_id,
        )

        mlflow_client = MlflowClient(context.mlflow.server.mlflow_tracking_uri)
        # The first run is created in the Default experiment (id 0), but the one
        # initialised in before_pipeline_run is created in the kedro_project
        # experiment (id 1).
        all_runs_id = {
            run.run_id
            for k in range(2)
            for run in mlflow_client.list_run_infos(experiment_id=f"{k}")
        }

        # The metrics are supposed to have been logged inside existing_run_id.
        run_data = mlflow_client.get_run(existing_run_id).data

        # Check that metric datasets are prefixed with their names.
        assert all_runs_id == {current_run_id, existing_run_id}
        assert run_data.metrics["my_metrics.metric_key"] == 1.1
        assert run_data.metrics["foo.metric_key"] == 1.1
        assert run_data.metrics["my_metric"] == 1.1
        assert run_data.metrics["foo"] == 1.1
        # The full history is stored, but only the last value is retrieved.
        assert run_data.metrics["my_metric_history"] == 0.2
        assert run_data.metrics["bar"] == 0.2
def preprocess_raw_data(parameters: Dict): import glob, os os.chdir(parameters["path_raw"]) files = [] for file in glob.glob("*.csv"): files.append(file) filenames = [] wells_data = [] for file in files: filename, extension = file.split('.') filenames.append(filename) for file, filename in zip(files, filenames): io = DataCatalog({ filename: CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file), }) well_data = io.load(filename) wells_data.append(well_data) Raw_Data_preprocessed = [] Raw_Data_dated = [] wells_life = [] wells_data_ = [] for well in wells_data: # well = well[['Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer', # 'Bottom Hole Heel Temperature', 'Emulsion Pressure', 'Producer Bottom Hole Pressure', # 'ESP Current', 'Emulsion Flow Rate']] # well = well[['Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer', 'Emulsion Flow Rate']] well = well[[ 'Date', 'IBHP', 'PBHP', 'Steam [m3/d]', 'Emulsion [m3/d]' ]] for i in range(1, len(well.columns)): well[well.columns[i]] = pd.to_numeric(well[well.columns[i]], errors='coerce') well['Date'] = pd.to_datetime(well['Date']) well = well.set_index('Date') # well = well.resample('7D').mean() # weekly data # well = well.resample('30D').mean() # monthly data wells_life.append(len(well)) wells_data_.append(well) min_well_length = np.min(np.array(wells_life)) timesteps = 983 for well, file, filename in zip(wells_data_, files, filenames): # well = well.iloc[:min_well_length] # use minimum well life well = well.iloc[:timesteps] # daily, weekly, monthly data # well = well.fillna(0) well = well.fillna(well.rolling(30, min_periods=1).median()) well = well.fillna(well.median()) well_dated = well.copy() well_dated["Well"] = filename # create a column for well name data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] + "/pre_processed_data_" + file) data_set.save(well_dated) Raw_Data_dated.append(well_dated) Raw_Data_preprocessed.append(well) os.chdir(parameters["path_raw_static"]) static_files = [] for static_file in glob.glob("*.csv"): static_files.append(static_file) static_filenames = [] statics_data = [] for static_file in static_files: static_filename, others = static_file.split('_') static_filenames.append(static_filename) for static_file, static_filename in zip(static_files, static_filenames): io = DataCatalog({ static_filename: CSVLocalDataSet(filepath=parameters["path_raw_static"] + "/" + static_file), }) static_data = io.load(static_filename) statics_data.append(static_data) statics_data_new = [] well_name_list = [] for pad_static in statics_data: well_name = pad_static['WELLPAIR_NAME'].values well_name_list.append(well_name) pad_static = pad_static.set_index('WELLPAIR_NAME') pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE']) statics_data_new.append(pad_static) properties = [] probabilities = [] asset_names = [] for pad_static, names in zip(statics_data_new, well_name_list): for well in names: prob = pad_static.loc[well, 'Forecast_Prob'] probabilities.append(prob) pad_code = pad_static.loc[well, 'PAD_CODE'] asset_name, pad = pad_code.split('_') asset_names.append(asset_name) property_ = pad_static.loc[ well, 'Effective_Length':'BottomWater_Oil_Saturation'].values properties.append(property_) properties = np.array(properties) return [ timesteps, Raw_Data_preprocessed, Raw_Data_dated, files, filenames, probabilities, asset_names, properties ]
def preprocess_raw_data(parameters: Dict): import glob, os os.chdir(parameters["path_raw"]) files = [] for file in glob.glob("*.csv"): files.append(file) filenames = [] wells_data = [] for file in files: filename, extension = file.split('.') filenames.append(filename) for file, filename in zip(files, filenames): io = DataCatalog({ filename: CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file), }) well_data = io.load(filename) wells_data.append(well_data) Raw_Data_preprocessed = [] Raw_Data_dated = [] for well, file, filename in zip(wells_data, files, filenames): well = well[[ 'Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer', 'Bottom Hole Heel Temperature', 'Emulsion Pressure', 'Producer Bottom Hole Pressure', 'ESP Speed', 'Emulsion Flow Rate' ]] for i in range(1, len(well.columns)): well[well.columns[i]] = pd.to_numeric(well[well.columns[i]], errors='coerce') well = well.iloc[:1399] well = well.fillna(well.rolling(30, min_periods=1).median()) well = well.fillna(well.median()) well_dated = well.copy() well_dated["Well"] = filename # create a column for well name data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] + "/pre_processed_data_" + file) data_set.save(well_dated) Raw_Data_dated.append(well_dated) well['Date'] = pd.to_datetime(well['Date']) well = well.set_index('Date') Raw_Data_preprocessed.append(well) os.chdir(parameters["path_raw_static"]) static_files = [] for static_file in glob.glob("*.csv"): static_files.append(static_file) static_filenames = [] statics_data = [] for static_file in static_files: static_filename, others = static_file.split('_') static_filenames.append(static_filename) for static_file, static_filename in zip(static_files, static_filenames): io = DataCatalog({ static_filename: CSVLocalDataSet(filepath=parameters["path_raw_static"] + "/" + static_file), }) static_data = io.load(static_filename) statics_data.append(static_data) statics_data_new = [] well_name_list = [] for pad_static in statics_data: well_name = pad_static['WELLPAIR_NAME'].values well_name_list.append(well_name) pad_static = pad_static.set_index('WELLPAIR_NAME') pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE']) statics_data_new.append(pad_static) properties = [] probabilities = [] asset_names = [] for pad_static, names in zip(statics_data_new, well_name_list): for well in names: prob = pad_static.loc[well, 'Forecast_Prob'] probabilities.append(prob) pad_code = pad_static.loc[well, 'PAD_CODE'] asset_name, pad = pad_code.split('_') asset_names.append(asset_name) property_ = pad_static.loc[ well, 'SAGD_PRESSURE':'BOTTOM_WATER_THICKNESS'].values properties.append(property_) properties = np.array(properties) return [ Raw_Data_preprocessed, Raw_Data_dated, files, filenames, probabilities, asset_names, properties ]
def test_error_dataset_init(self, bad_config):
    """Check the error when trying to instantiate an erroneous data set"""
    pattern = (
        r"Failed to instantiate DataSet \'bad\' "
        r"of type `.*BadDataSet`"
    )
    with pytest.raises(DataSetError, match=pattern):
        DataCatalog.from_config(bad_config, None)
def register_catalog(  # pylint: disable=no-self-use, too-many-arguments
    self, catalog, credentials, load_versions, save_version, journal
):
    return DataCatalog.from_config(
        catalog, credentials, load_versions, save_version, journal
    )
def multi_catalog(mocker):
    csv = CSVLocalDataSet(filepath="abc.csv")
    parq = ParquetLocalDataSet(filepath="xyz.parq")
    journal = mocker.Mock()
    return DataCatalog({"abc": csv, "xyz": parq}, journal=journal)
def test_not_found_error_in_constructor(self):
    with pytest.raises(DataSetNotFoundError):
        DataCatalog(transformers={"test": []})
def memory_catalog():
    ds1 = MemoryDataSet({"data": 42})
    ds2 = MemoryDataSet([1, 2, 3, 4, 5])
    return DataCatalog({"ds1": ds1, "ds2": ds2})
"""Contents of hello_kedro.py""" # prerequisite # kedro jupyter convert notebooks/generationCopy1.ipynb from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import node, Pipeline from kedro.runner import SequentialRunner from src.kedro_bioimage.nodes.generationCopy1 import generation # Prepare a data catalog data_catalog = DataCatalog({"example_data": MemoryDataSet()}) # Prepare first node def return_greeting(): return "Hello" return_greeting_node = node( return_greeting, inputs=None, outputs="my_salutation" ) # Prepare second node def join_statements(greeting): return f"{greeting} Kedro!" join_statements_node = node( join_statements, inputs="my_salutation", outputs="my_message" ) #
"""Contents of hello_kedro.py""" from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import node, Pipeline from kedro.runner import SequentialRunner # Prepare a data catalog data_catalog = DataCatalog({"my_salutation": MemoryDataSet()}) # Prepare first node def return_greeting(): return "Hello" return_greeting_node = node(return_greeting, inputs=None, outputs="my_salutation") # Prepare second node def join_statements(greeting): return f"{greeting} Kedro!" join_statements_node = node( join_statements, inputs="my_salutation", outputs="my_message" ) # Assemble nodes into a pipeline pipeline = Pipeline([return_greeting_node, join_statements_node]) # Create a runner to run the pipeline runner = SequentialRunner()
def test_node_returning_none(self):
    pipeline = Pipeline([node(identity, "A", "B"), node(return_none, "B", "C")])
    catalog = DataCatalog({"A": MemoryDataSet("42")})
    pattern = "Saving `None` to a `DataSet` is not allowed"
    with pytest.raises(DataSetError, match=pattern):
        ThreadRunner().run(pipeline, catalog)
def test_config_good_version(self):
    config = yaml.safe_load(StringIO(YML_CONFIG_VERSIONED))
    catalog = DataCatalog.from_config(config, load_versions={"test_ds": "42"})
    assert catalog._data_sets["test_ds"]._dataset._version.load == "42"
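YML_CONFIG_VERSIONED itself is not shown above; the sketch below uses an assumed, minimal versioned config of a similar shape (the `._dataset` attribute in the assertion suggests a wrapping dataset such as CachedDataSet, so that shape is assumed here) to show how load_versions pins which version gets loaded.

# Hypothetical stand-in for a versioned catalog config; this is NOT the actual
# YML_CONFIG_VERSIONED constant from the test module.
from io import StringIO

import yaml
from kedro.io import DataCatalog

ASSUMED_YML_CONFIG_VERSIONED = """
test_ds:
  type: CachedDataSet
  versioned: true
  dataset:
    type: pandas.CSVDataSet
    filepath: data/01_raw/test.csv
"""

config = yaml.safe_load(StringIO(ASSUMED_YML_CONFIG_VERSIONED))
# Pin the load version of "test_ds" to "42"; saves still get a fresh version.
catalog = DataCatalog.from_config(config, load_versions={"test_ds": "42"})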
def test_memory_data_set_input(self, fan_out_fan_in):
    catalog = DataCatalog({"A": MemoryDataSet("42")})
    result = ThreadRunner().run(fan_out_fan_in, catalog)
    assert "Z" in result
    assert result["Z"] == ("42", "42", "42")
def load_well_validation_data(dummy2, timesteps, parameters: Dict):
    import glob, os

    os.chdir(parameters["path_model_input"])
    files_val = []
    for file in glob.glob("*.csv"):
        files_val.append(file)

    filenames_val = []
    wells_data = []
    for file in files_val:
        filename, extension = file.split('.')
        filenames_val.append(filename)
    for file, filename in zip(files_val, filenames_val):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_model_input"] + "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed_val = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        # well = well[['Date', 'Injector Bottom Hole Pressure',
        #              'Producer Bottom Hole Pressure', 'ESP Speed',
        #              'Steam Flow Rate - Outer', 'Emulsion Flow Rate']]
        well = well[[
            'Date', 'Speed [Hz]', 'Current [A]', 'IBHP', 'PBHP',
            'Co-Injection [E3m3/d]', 'Oil [bbl/d]', 'Steam [m3/d]',
            'Emulsion [m3/d]'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Prod_Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Prod_Date')
        # well = well.dropna(axis=0)  # may change
        # well = well.resample('7D').mean()   # weekly data
        # well = well.resample('30D').mean()  # monthly data
        # well = well.rolling(30, min_periods=1).mean()
        data = well['Oil [bbl/d]'] / 6.28981
        well.insert(4, 'Oil [m3/d]', data)
        time_data = np.arange(len(well))
        well.insert(0, 'Timestep', time_data)
        wells_life.append(len(well))
        wells_data_.append(well)

    min_well_length = np.min(np.array(wells_life))
    if min_well_length < timesteps:
        timesteps_validation = min_well_length
    else:
        timesteps_validation = timesteps
    for well, file, filename in zip(wells_data_, files_val, filenames_val):
        well = well.iloc[:timesteps_validation]  # daily data
        # well = well.fillna(0)
        # well = well.fillna(well.rolling(30, min_periods=1).median())
        # well = well.fillna(well.median())
        Raw_Data_preprocessed_val.append(well)

    stats_validation = CSVLocalDataSet(filepath=parameters["path_val_stats"] +
                                       "/static_P50_data_validation.csv")
    stats_val = stats_validation.load()
    stats_val_ROIP = stats_val.loc[:, 'ROIP']
    stats_val = stats_val.loc[:, 'Effective_Length':'BottomWater_Oil_Saturation']
    # # using only rich geostats and no bottom water properties
    # stats_val = stats_val.loc[:, 'Effective_Length':'Rich_Oil_Saturation']
    # # using "Effective_Rich_Pay_Thickness" to account for standoff and rich thickness
    # data = stats_val['Rich_Pay_Thickness'] - stats_val['Stand_Off']
    # stats_val.insert(3, 'Effective_Rich_Pay_Thickness', data)
    # stats_val = stats_val.drop(columns=['Rich_Pay_Thickness', 'Stand_Off'])
    property_names_val = list(stats_val.columns)
    properties_val = list(stats_val.values)
    # properties_val = stats.loc[:, ['Effective_Length', 'Spacing', 'Effective_Rich_Pay_Thickness',
    #                                'Non_Rich_Pay_Thickness', 'Rich_Vertical_Permeability',
    #                                'Non_Rich_Vertical_Permeability', 'Rich_Porosity',
    #                                'Non_Rich_Porosity', 'Rich_Oil_Saturation',
    #                                'Non_Rich_Oil_Saturation']].values
    properties_val = np.array(properties_val)
    dummy11 = files_val

    return [
        dummy11, timesteps_validation, Raw_Data_preprocessed_val, files_val,
        filenames_val, properties_val, stats_val_ROIP, property_names_val
    ]
def data_catalog(data_set):
    return DataCatalog(data_sets={"test": data_set})
from kedro.io import DataCatalog
from kedro.extras.datasets.pandas import CSVDataSet

io = DataCatalog({"titanic_training_data": CSVDataSet(filepath="train.csv")})

# Load your file and print the output
df = io.load("titanic_training_data")
print(df.head())
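As a hedged follow-on to the snippet above, the same catalog API can also write a DataFrame back out through a registered dataset; the "titanic_cleaned" entry and its output path are assumptions added for illustration, not part of the original example.

# Assumed example: register an output CSVDataSet and save a modified DataFrame.
from kedro.io import DataCatalog
from kedro.extras.datasets.pandas import CSVDataSet

io = DataCatalog(
    {
        "titanic_training_data": CSVDataSet(filepath="train.csv"),
        "titanic_cleaned": CSVDataSet(filepath="train_cleaned.csv"),  # assumed path
    }
)

df = io.load("titanic_training_data")
io.save("titanic_cleaned", df.dropna())  # writes train_cleaned.csv to disk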
def test_load_from_unregistered(self):
    """Check the error when attempting to load an unregistered data set"""
    catalog = DataCatalog(data_sets={})
    pattern = r"DataSet 'test' not found in the catalog"
    with pytest.raises(DataSetNotFoundError, match=pattern):
        catalog.load("test")
def test_missing_nested_credentials(self, sane_config_with_nested_creds):
    del sane_config_with_nested_creds["credentials"]["other_credentials"]
    pattern = "Unable to find credentials 'other_credentials'"
    with pytest.raises(KeyError, match=pattern):
        DataCatalog.from_config(**sane_config_with_nested_creds)