def _prep_dir(source, target_dir, suffix, secrets, clone):
    if not target_dir:
        raise ValueError(
            "please specify a target (context) directory for clone"
        )
    if clone and path.exists(target_dir) and path.isdir(target_dir):
        shutil.rmtree(target_dir)

    tmpfile = mktemp(suffix)
    mlrun.get_dataitem(source, secrets).download(tmpfile)
    return tmpfile
import shutil
import tempfile
from os import path

import mlrun


def _prep_dir(source, target_dir, suffix, secrets, clone):
    if not target_dir:
        raise ValueError(
            "please specify a target (context) directory for clone"
        )
    if clone and path.exists(target_dir) and path.isdir(target_dir):
        shutil.rmtree(target_dir)

    # download the remote source into a named temporary file with the given suffix
    temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False).name
    mlrun.get_dataitem(source, secrets).download(temp_file)
    return temp_file
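# Hedged usage sketch (not from the source): stage a local file through the
# _prep_dir helper defined above. The paths and the ".zip" suffix are
# illustrative placeholders, not real project inputs.
import pathlib

pathlib.Path("/tmp/source.zip").write_bytes(b"dummy archive bytes")
staged = _prep_dir(
    source="/tmp/source.zip",
    target_dir="/tmp/context",
    suffix=".zip",
    secrets=None,
    clone=False,
)
print("staged copy at:", staged)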
def dataitems(self) -> List["mlrun.DataItem"]:
    """return as a list of DataItem objects"""
    dataitems = []
    for item in self:
        artifact = get_artifact_target(item)
        if artifact:
            dataitems.append(mlrun.get_dataitem(artifact))
    return dataitems
def artifact(self, key) -> "mlrun.DataItem":
    """return artifact DataItem by key"""
    if self.outputs_wait_for_completion:
        self.wait_for_completion()
    artifact = self._artifact(key)
    if artifact:
        uri = get_artifact_target(artifact, self.metadata.project)
        if uri:
            return mlrun.get_dataitem(uri)
    return None
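# Hedged usage sketch (assumptions, not from the source): the artifact() method
# above lives on a run object, so a typical call site looks like the following.
# The project name "demo", the function name "train", and the artifact key
# "model" are hypothetical and require a configured MLRun environment.
import mlrun

project = mlrun.get_or_create_project("demo", context="./")
run = project.run_function("train", local=True)
model_item = run.artifact("model")  # a mlrun.DataItem, or None if not logged
if model_item:
    print(model_item.url)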
from tempfile import TemporaryDirectory

import mlrun


def test_fsspec():
    with TemporaryDirectory() as tmpdir:
        print(tmpdir)
        store, _ = mlrun.store_manager.get_or_create_store(tmpdir)
        fs = store.get_filesystem(False)
        with store.open(tmpdir + "/1x.txt", "w") as fp:
            fp.write("123")
        with mlrun.get_dataitem(tmpdir + "/2x.txt").open("w") as fp:
            fp.write("456")
        files = fs.ls(tmpdir)
        assert len(files) == 2, "2 test files were not written"
        assert files[0].endswith("x.txt"), "wrong file name"
        assert fs.open(tmpdir + "/1x.txt", "r").read() == "123", "wrong file content"
def as_df(
    self,
    columns=None,
    df_module=None,
    entities=None,
    start_time=None,
    end_time=None,
    time_column=None,
):
    """return the target data as dataframe"""
    return mlrun.get_dataitem(self._target_path).as_df(
        columns=columns,
        df_module=df_module,
        start_time=start_time,
        end_time=end_time,
        time_column=time_column,
    )
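# Minimal sketch of the underlying call used above (an assumption: any readable
# path behaves the same way). Writes a small parquet file locally, then reads it
# back through mlrun.get_dataitem(...).as_df(). The /tmp path is illustrative.
import pandas as pd

import mlrun

pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}).to_parquet("/tmp/example.parquet")
df = mlrun.get_dataitem("/tmp/example.parquet").as_df()
print(df.head())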
def test_local_coxph_train():
    ctx = get_or_create_ctx(name="tasks survive trainer")
    data_url = "https://raw.githubusercontent.com/mlrun/demos/0.6.x/customer-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"
    src = mlrun.get_dataitem(data_url)
    data_clean(
        context=ctx,
        src=src,
        cleaned_key="artifacts/inputs/cleaned-data",
        encoded_key="artifacts/inputs/encoded-data",
    )
    fn = import_function("function.yaml")
    fn.run(
        params={
            "strata_cols": [
                "InternetService",
                "StreamingMovies",
                "StreamingTV",
                "PhoneService",
            ],
            "encode_cols": {"Contract": "Contract", "PaymentMethod": "Payment"},
            "models_dest": "models/cox",
        },
        inputs={"dataset": "artifacts/inputs/encoded-data.csv"},
        local=True,
    )
    model = load(open("models/cox/km/model.pkl", "rb"))
    ans = model.predict([1, 10, 30, 100, 200])
    assert list(np.around(ans, 3)) == [0.969, 0.869, 0.781, 0.668, 0.668]
def as_df(self, columns=None, df_module=None):
    """return the target data as dataframe"""
    return mlrun.get_dataitem(self._target_path).as_df(
        columns=columns, df_module=df_module
    )
def to_dataframe(self):
    """return result as dataframe"""
    return mlrun.get_dataitem(self.target_uri).as_df()
def to_dataitem(self):
    """return a DataItem object (if available) representing the artifact content"""
    uri = self.get_store_url()
    if uri:
        return mlrun.get_dataitem(uri)
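# Hedged usage sketch (assumptions, not from the source): to_dataitem() above is
# an artifact method, so one plausible flow is to log an artifact in a context and
# read it back. The key "my-data" and the body are hypothetical, and resolving the
# store:// URI assumes a reachable MLRun API/DB.
import mlrun

context = mlrun.get_or_create_ctx("to-dataitem-demo")
artifact = context.log_artifact("my-data", body="hello world", format="txt")
item = artifact.to_dataitem()
if item:
    print(item.get())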
def test_log_large_dask_dataframe_to_azure(auth_method):
    # Configure the Azure credentials environment variables
    verify_auth_parameters_and_configure_env(auth_method)

    A = np.random.random_sample(size=(25000000, 6))
    df = pd.DataFrame(data=A, columns=list("ABCDEF"))
    ddf = dd.from_pandas(df, npartitions=10).persist()

    size = ddf.memory_usage().sum().compute()
    print(f"demo data has size: {size // 1e6} MB")
    # Verify that the size of the dataframe is > 1GB, and so
    # will write a collection of files, instead of a single file
    assert (size // 1e6) > 1100

    # Create an MLRun context and define the artifact location
    context = mlrun.get_or_create_ctx("test")
    target_path = "az://" + config["env"].get("AZURE_CONTAINER") + "/"

    context.log_dataset(
        key="demo_data",
        df=ddf,
        format="parquet",
        artifact_path=target_path,
        stats=True,
    )
    data_item2 = mlrun.get_dataitem(f"{target_path}demo_data.parquet")
    ddf2 = data_item2.as_df(df_module=dd)

    # Check that a collection of files is written to Azure,
    # rather than a single parquet file
    from adlfs import AzureBlobFileSystem

    fs = AzureBlobFileSystem(
        account_name=os.getenv("AZURE_STORAGE_ACCOUNT_NAME"),
        account_key=os.getenv("AZURE_STORAGE_ACCOUNT_KEY"),
        connection_string=os.getenv("AZURE_STORAGE_CONNECTION_STRING"),
        tenant_id=os.getenv("AZURE_STORAGE_TENANT_ID"),
        client_id=os.getenv("AZURE_STORAGE_CLIENT_ID"),
        client_secret=os.getenv("AZURE_STORAGE_CLIENT_SECRET"),
        sas_token=os.getenv("AZURE_STORAGE_SAS_TOKEN"),
    )

    # Verify that a directory was created, rather than a file
    path = target_path.partition("//")[2]
    path = os.path.join(path, "demo_data.parquet")
    assert fs.isdir(path) is True

    # Verify that a collection of files was written
    files = fs.ls(path)
    assert len(files) > 4

    df2 = ddf2.compute()
    df2 = df2.reset_index(drop=True)
    df = ddf.compute()
    df = df.reset_index(drop=True)
    # Verify that the returned dataframe matches the original
    pd.testing.assert_frame_equal(
        df, df2, check_index_type=False, check_less_precise=True
    )