Example #1
import shutil
from os import path
from tempfile import mktemp

import mlrun

def _prep_dir(source, target_dir, suffix, secrets, clone):
    if not target_dir:
        raise ValueError(
            "please specify a target (context) directory for clone")
    # wipe a pre-existing target directory before cloning into it
    if clone and path.exists(target_dir) and path.isdir(target_dir):
        shutil.rmtree(target_dir)
    tmpfile = mktemp(suffix)
    mlrun.get_dataitem(source, secrets).download(tmpfile)
    return tmpfile
Example #2
import shutil
import tempfile
from os import path

import mlrun

def _prep_dir(source, target_dir, suffix, secrets, clone):
    if not target_dir:
        raise ValueError(
            "please specify a target (context) directory for clone")
    if clone and path.exists(target_dir) and path.isdir(target_dir):
        shutil.rmtree(target_dir)

    # NamedTemporaryFile(delete=False) avoids the race inherent in mktemp()
    temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False).name
    mlrun.get_dataitem(source, secrets).download(temp_file)
    return temp_file
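A minimal usage sketch for the helper above; the source URL, target directory, and suffix are hypothetical, and secrets is left empty:

# hypothetical call: download a remote archive to a temp file, clearing
# any previous clone target first
tmp_archive = _prep_dir(
    source="s3://my-bucket/src.zip",  # hypothetical remote source
    target_dir="./context",           # hypothetical local clone target
    suffix=".zip",
    secrets=None,
    clone=True,
)
print(tmp_archive)  # path of the downloaded temp file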
Example #3
File: lists.py Project: mlrun/mlrun
 def dataitems(self) -> List["mlrun.DataItem"]:
     """return as a list of DataItem objects"""
     dataitems = []
     for item in self:
         artifact = get_artifact_target(item)
         if artifact:
             dataitems.append(mlrun.get_dataitem(artifact))
     return dataitems
Example #4
 def artifact(self, key) -> "mlrun.DataItem":
     """return artifact DataItem by key"""
     if self.outputs_wait_for_completion:
         self.wait_for_completion()
     artifact = self._artifact(key)
     if artifact:
         uri = get_artifact_target(artifact, self.metadata.project)
         if uri:
             return mlrun.get_dataitem(uri)
     return None
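A hedged usage sketch for the method above; the function object fn and the artifact key are hypothetical:

run = fn.run(local=True)        # hypothetical mlrun function run
item = run.artifact("my_data")  # hypothetical key; returns mlrun.DataItem or None
if item is not None:
    df = item.as_df()           # load the artifact content as a dataframe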
Example #5
from tempfile import TemporaryDirectory

import mlrun

def test_fsspec():
    with TemporaryDirectory() as tmpdir:
        print(tmpdir)
        store, _ = mlrun.store_manager.get_or_create_store(tmpdir)
        fs = store.get_filesystem(False)
        # write one file through the store and one through a DataItem
        with store.open(tmpdir + "/1x.txt", "w") as fp:
            fp.write("123")
        with mlrun.get_dataitem(tmpdir + "/2x.txt").open("w") as fp:
            fp.write("456")
        files = fs.ls(tmpdir)
        assert len(files) == 2, "2 test files were not written"
        assert files[0].endswith("x.txt"), "wrong file name"
        with fs.open(tmpdir + "/1x.txt", "r") as fp:
            assert fp.read() == "123", "wrong file content"
Example #6
 def as_df(
     self,
     columns=None,
     df_module=None,
     entities=None,
     start_time=None,
     end_time=None,
     time_column=None,
 ):
     """return the target data as dataframe"""
     # note: `entities` is accepted for interface compatibility but is not
     # forwarded to DataItem.as_df below
     return mlrun.get_dataitem(self._target_path).as_df(
         columns=columns,
         df_module=df_module,
         start_time=start_time,
         end_time=end_time,
         time_column=time_column,
     )
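A hedged sketch of calling the method above on a target object; target, the column names, and the time window are hypothetical:

import datetime

df = target.as_df(                   # hypothetical target instance
    columns=["sensor_id", "value"],  # hypothetical columns
    start_time=datetime.datetime(2021, 1, 1),
    end_time=datetime.datetime(2021, 1, 2),
    time_column="timestamp",         # hypothetical time column
)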
Example #7
import numpy as np

import mlrun
from mlrun import get_or_create_ctx, import_function
from pickle import load  # assuming the model was saved with stdlib pickle
# `data_clean` is the data-cleaning handler that accompanies this test in
# the source repo

def test_local_coxph_train():
    ctx = get_or_create_ctx(name="tasks survive trainer")
    data_url = "https://raw.githubusercontent.com/mlrun/demos/0.6.x/customer-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"
    src = mlrun.get_dataitem(data_url)
    # clean and encode the raw churn dataset
    data_clean(context=ctx,
               src=src,
               cleaned_key="artifacts/inputs/cleaned-data",
               encoded_key="artifacts/inputs/encoded-data")
    fn = import_function("function.yaml")
    fn.run(
        params={
            "strata_cols": [
                "InternetService", "StreamingMovies", "StreamingTV",
                "PhoneService",
            ],
            "encode_cols": {"Contract": "Contract", "PaymentMethod": "Payment"},
            "models_dest": "models/cox",
        },
        inputs={"dataset": "artifacts/inputs/encoded-data.csv"},
        local=True,
    )
    # load the trained model and check its predictions
    with open("models/cox/km/model.pkl", "rb") as f:
        model = load(f)
    ans = model.predict([1, 10, 30, 100, 200])
    assert list(np.around(ans, 3)) == [0.969, 0.869, 0.781, 0.668, 0.668]
Example #8
 def as_df(self, columns=None, df_module=None):
     """return the target data as dataframe"""
     return mlrun.get_dataitem(self._target_path).as_df(columns=columns,
                                                        df_module=df_module)
Example #9
 def to_dataframe(self):
     """return result as dataframe"""
     return mlrun.get_dataitem(self.target_uri).as_df()
Example #10
File: base.py Project: mlrun/mlrun
 def to_dataitem(self):
     """return a DataItem object (if available) representing the artifact content"""
     uri = self.get_store_url()
     if uri:
         return mlrun.get_dataitem(uri)

import os

import dask.dataframe as dd
import numpy as np
import pandas as pd

import mlrun
# `config` and `verify_auth_parameters_and_configure_env` come from the
# accompanying test helpers in the source repo

def test_log_large_dask_dataframe_to_azure(auth_method):
    # Configure the Azure credential environment variables for this auth method
    verify_auth_parameters_and_configure_env(auth_method)

    A = np.random.random_sample(size=(25000000, 6))
    df = pd.DataFrame(data=A, columns=list("ABCDEF"))
    ddf = dd.from_pandas(df, npartitions=10).persist()

    size = ddf.memory_usage().sum().compute()
    print(f"demo data has size:  {size // 1e6} MB")
    # Verify that the size of the dataframe is > 1GB, and so
    # will write a collection of files, instead of a single
    # file
    assert (size // 1e6) > 1100

    # Create environmental vars
    context = mlrun.get_or_create_ctx("test")

    # Define the artifact location
    target_path = "az://" + config["env"].get("AZURE_CONTAINER") + "/"

    context.log_dataset(
        key="demo_data",
        df=ddf,
        format="parquet",
        artifact_path=target_path,
        stats=True,
    )

    data_item2 = mlrun.get_dataitem(f"{target_path}demo_data.parquet")
    ddf2 = data_item2.as_df(df_module=dd)

    # Check that a collection of files is written to Azure,
    # rather than a single parquet file
    from adlfs import AzureBlobFileSystem

    fs = AzureBlobFileSystem(
        account_name=os.getenv("AZURE_STORAGE_ACCOUNT_NAME"),
        account_key=os.getenv("AZURE_STORAGE_ACCOUNT_KEY"),
        connection_string=os.getenv("AZURE_STORAGE_CONNECTION_STRING"),
        tenant_id=os.getenv("AZURE_STORAGE_TENANT_ID"),
        client_id=os.getenv("AZURE_STORAGE_CLIENT_ID"),
        client_secret=os.getenv("AZURE_STORAGE_CLIENT_SECRET"),
        sas_token=os.getenv("AZURE_STORAGE_SAS_TOKEN"),
    )
    # Verify that a directory was created, rather than a file
    path = target_path.partition("//")[2]
    path = os.path.join(path, "demo_data.parquet")
    assert fs.isdir(path)

    # Verify that a collection of files was written
    files = fs.ls(path)
    assert len(files) > 4

    df2 = ddf2.compute()
    df2 = df2.reset_index(drop=True)
    df = ddf.compute()
    df = df.reset_index(drop=True)
    # Verify that the returned dataframe matches the original
    # (check_less_precise is deprecated in pandas >= 1.1; rtol/atol is the
    # documented replacement)
    pd.testing.assert_frame_equal(
        df, df2, check_index_type=False, rtol=0.5e-3, atol=0.5e-3
    )
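
A short follow-up sketch, reusing the names from the test above: the same partitioned parquet directory can also be read back with pandas, the default df_module (feasible only if the dataframe fits in memory):

df_pd = mlrun.get_dataitem(f"{target_path}demo_data.parquet").as_df()
print(df_pd.shape)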