def load_dataset(
    context: MLClientCtx,
    dataset: str,
    name: str = "",
    file_ext: str = "parquet",
    params: dict = None,
) -> None:
    """Load a scikit-learn toy dataset and log it as a dataset artifact.

    The following datasets are available ('name' : description):

        'boston'          : boston house-prices dataset (regression)
        'iris'            : iris dataset (classification)
        'diabetes'        : diabetes dataset (regression)
        'digits'          : digits dataset (classification)
        'linnerud'        : linnerud dataset (multivariate regression)
        'wine'            : wine dataset (classification)
        'breast_cancer'   : breast cancer wisconsin dataset (classification)

    The scikit-learn functions return a data bunch including the following
    items:
    - data              the features matrix
    - target            the ground truth labels
    - DESCR             a description of the dataset
    - feature_names     header for data

    The features (and their names) are stored with the target labels in a
    single DataFrame. A one-dimensional target becomes one column named
    "labels". A two-dimensional target (e.g. 'linnerud') keeps one column per
    output, named after the bunch's ``target_names`` when their count matches,
    otherwise "labels_0", "labels_1", ...

    For further details see
    https://scikit-learn.org/stable/datasets/index.html#toy-datasets

    :param context:  function execution context
    :param dataset:  name of the dataset to load
    :param name:     artifact name (defaults to dataset)
    :param file_ext: output file_ext: parquet or csv
    :param params:   params of the sklearn load_data method (defaults to none)
    :raises AttributeError: if ``sklearn.datasets`` has no ``load_<dataset>``
    """
    dataset = str(dataset)
    loader_name = f"load_{dataset}"

    # resolve sklearn.datasets.load_<dataset> dynamically
    datasets_module = __import__("sklearn.datasets", fromlist=[loader_name])
    load_data_fn = getattr(datasets_module, loader_name)

    data = load_data_fn(**(params or {}))

    features = np.asarray(data["data"])
    target = np.asarray(data["target"])

    if target.ndim == 1:
        # single output: make it a column vector so it can sit next to X
        target = target.reshape(-1, 1)
        target_columns = ["labels"]
    else:
        # multi-output target: keep one column per output instead of
        # flattening, which would break the row alignment with the features
        target_columns = list(data.get("target_names", []))
        if len(target_columns) != target.shape[1]:
            target_columns = [f"labels_{i}" for i in range(target.shape[1])]

    # build a fresh header rather than appending to the bunch's own list
    columns = list(data["feature_names"]) + target_columns

    xy = np.concatenate([features, target], axis=1)
    df = pd.DataFrame(data=xy, columns=columns)

    context.log_dataset(name or dataset, df=df, format=file_ext, index=False)
def training(context: MLClientCtx, p1: int = 1, p2: int = 2) -> None:
    """Example training handler showing how results and artifacts are logged.

    No real training happens here: the logged values are derived from ``p1``
    or are fixed sample payloads.

    :param context: The runtime context object.
    :param p1: A model parameter (also used to derive the logged results).
    :param p2: Another model parameter (only printed).
    """
    # report run identity and the parameters received
    print(f"Run: {context.name} (uid={context.uid})")
    print(f"Params: p1={p1}, p2={p2}")
    context.logger.info("started training")

    # <insert training code here>

    # scalar run results, logged in a fixed order
    for result_key, factor in (("accuracy", 2), ("loss", 3)):
        context.log_result(result_key, p1 * factor)

    # tag the run
    context.set_label("category", "tests")

    # simple in-memory artifact; to upload a local file to the artifact repo
    # pass src_path=<local-path> instead of a body
    context.log_artifact("somefile",
                         body=b"abc is 123",
                         local_path="myfile.txt")

    # dataframe artifact
    sample_rows = [{"A": a, "B": a * 10} for a in (10, 11, 12)]
    sample_df = pd.DataFrame(sample_rows)
    context.log_dataset("mydf", df=sample_df)

    # ML model artifact with metrics, parameters and labels, stored in the
    # 'models' subdir of the artifact path
    models_path = context.artifact_subpath("models")
    context.log_model(
        "mymodel",
        body=b"abc is 123",
        model_file="model.txt",
        metrics={"accuracy": 0.85},
        parameters={"xx": "abc"},
        labels={"framework": "xgboost"},
        artifact_path=models_path,
    )
def gen_class_data(context: MLClientCtx,
                   n_samples: int,
                   m_features: int,
                   k_classes: int,
                   header: Optional[List[str]],
                   label_column: Optional[str] = "labels",
                   weight: float = 0.5,
                   random_state: int = 1,
                   key: str = "classifier-data",
                   file_ext: str = "parquet",
                   sk_params=None):
    """Create a simulated classification dataset and log it as an artifact.

    The dataset is generated with `sklearn.datasets.make_classification` and
    logged under ``key`` in the format given by ``file_ext``.

    Additional scikit-learn parameters can be set using **sk_params, please see
    https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
    for more details.

    :param context:       function context
    :param n_samples:     number of rows/samples
    :param m_features:    number of cols/features
    :param k_classes:     number of classes
    :param header:        header for features array; when empty/None the
                          columns are named "feat_0", "feat_1", ...
    :param label_column:  column name of ground-truth series
    :param weight:        fraction of sample negative value (ground-truth=0);
                          a scalar is wrapped in a list, a sequence is passed
                          through unchanged
    :param random_state:  rng seed (see https://scikit-learn.org/stable/glossary.html#term-random-state)
    :param key:           key of data in artifact store
    :param file_ext:      format of the logged dataset (e.g. parquet, csv)
    :param sk_params:     additional parameters for `sklearn.datasets.make_classification`
    """
    # make_classification expects an array-like of class proportions; a bare
    # float (the default here) is not accepted, so wrap scalars in a list
    weights = [weight] if isinstance(weight, (int, float)) else weight

    features, labels = make_classification(n_samples=n_samples,
                                           n_features=m_features,
                                           weights=weights,
                                           n_classes=k_classes,
                                           random_state=random_state,
                                           **(sk_params or {}))

    # make dataframes, add column names, concatenate (X, y)
    if header:
        columns = header
    else:
        columns = ["feat_" + str(x) for x in range(m_features)]
    X = pd.DataFrame(features)
    X.columns = columns

    y = pd.DataFrame(labels, columns=[label_column])
    data = pd.concat([X, y], axis=1)

    context.log_dataset(key, df=data, format=file_ext, index=False)
def sql_to_file(
    context: MLClientCtx,
    sql_query: str,
    database_url: str,
    file_ext: str = "parquet",
) -> None:
    """SQL Ingest - Ingest data using SQL query

    Runs ``sql_query`` against ``database_url`` and logs the result set as the
    "query result" dataset under the "data" artifact subpath.

    :param context:      the function context
    :param sql_query:    the sql query used to retrieve the data
                         (executed as given - callers are responsible for
                         passing a trusted query)
    :param database_url: database connection URL
    :param file_ext:     ("parquet") format for result file
    """
    engine = create_engine(database_url)
    try:
        df = pd.read_sql(sql_query, engine)
    finally:
        # release pooled connections even when the query fails
        engine.dispose()

    context.log_dataset(
        "query result",
        df=df,
        format=file_ext,
        artifact_path=context.artifact_subpath("data"),
    )
def fit(context: MLClientCtx,
        dataset: DataItem,
        num_boost_round: int = 10,
        evals: List[Tuple[DMatrix, str]] = None,
        obj: Union[Callable, str] = "",
        feval: Union[Callable, str] = None,
        maximize: bool = False,
        early_stopping_rounds: int = None,
        evals_result: dict = None,
        verbose_eval: bool = True,
        xgb_model: DataItem = None,
        callbacks: List[Callable] = None,
        label_column: str = "labels",
        encode_cols: dict = None,
        sample: int = -1,
        test_size: float = 0.25,
        valid_size: float = 0.75,
        random_state: int = 1994,
        models_dest: str = "models",
        plots_dest: str = "plots",
        file_ext: str = "csv",
        test_set_key: str = "test-set",
        gpus: bool = False) -> None:
    """low level xgboost train api

    for the xgboost `train` params see:
    https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.train

    Note: the first parameter of xgboost's `train` method is a dict of
    parameters supplied to the booster (engine). To modify one of those
    simply add a task parameter (when running you supply an mlrun NewTask)
    with the prefix "XGB_". So for example, to set the 'tree_method'
    parameter to 'approx', add {"XGB_tree_method":"approx"} to the task
    params key.

    Note: ``evals``, ``obj``, ``feval``, ``evals_result``, ``xgb_model``,
    ``callbacks`` and ``plots_dest`` are accepted but not used by this
    implementation: the evaluation sets are always (train, valid), and the
    objective / metric are always the module's ``squared_log`` / ``rmsle``.

    :param context:          the function context
    :param dataset:          the full data set, train, valid and test will be
                             extracted and each converted to a DMatrix for
                             input to xgboost's `train`
    :param num_boost_round:  (10) number of boosting rounds
    :param maximize:         (False) passed to xgboost's `train`
    :param early_stopping_rounds: (None) passed to xgboost's `train`
    :param verbose_eval:     (True) passed to xgboost's `train`
    :param label_column:     ground-truth (y) labels
    :param encode_cols:      dictionary of names and prefixes for columns
                             that are to hot be encoded.
    :param sample:           Selects the first n rows, or select a sample
                             starting from the first. If negative <-1, select
                             a random sample
    :param test_size:        (0.25) test set size
    :param valid_size:       (0.75) Once the test set has been removed the
                             training set gets this proportion.
    :param random_state:     (1994) rng seed, used for the splits and as the
                             booster "seed"
    :param models_dest:      destination subfolder for model artifacts
    :param plots_dest:       destination subfolder for plot artifacts
    :param file_ext:         (csv) format for test_set_key hold out data
    :param test_set_key:     (test-set), key of held out data in artifact
                             store
    :param gpus:             (False), run on gpus
    """
    raw, labels, header = get_sample(dataset, sample, label_column)

    # hot-encode
    if encode_cols:
        raw = pd.get_dummies(raw,
                             columns=list(encode_cols.keys()),
                             prefix=list(encode_cols.values()),
                             drop_first=True)

    # split the sample into train, validate and test sets
    (xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = \
        get_splits(raw, labels, 3, test_size, valid_size, random_state)

    # save test data as regular dataframe as it may be used by other process
    context.log_dataset(test_set_key,
                        df=pd.concat([xtest, ytest], axis=1),
                        format=file_ext,
                        index=False)

    # convert to xgboost DMatrix (todo - dask, gpu)
    dtrain = DMatrix(xtrain, label=ytrain)
    dvalid = DMatrix(xvalid, label=yvalid)

    boost_params = {
        "tree_method": "gpu_hist" if gpus else "hist",
        "seed": random_state,
        "disable_default_eval_metric": 1,
        "objective": "reg:squaredlogerror",
        "eval_metric": "rmsle"
    }

    # enable user to customize `booster param` parameters
    for k, v in context.parameters.items():
        if k.startswith('XGB_'):
            boost_params[k[4:]] = v

    # collect learning curves / training history
    results = {}

    booster = train(
        boost_params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=[(dtrain, "train"), (dvalid, "valid")],
        evals_result=results,
        obj=squared_log,
        feval=rmsle,
        maximize=maximize,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=verbose_eval,
    )

    # store the model under the run's artifact path (models_dest subfolder)
    # instead of a hard-coded absolute directory
    context.log_model("model",
                      body=dumps(booster),
                      model_file="model.pkl",
                      artifact_path=context.artifact_subpath(models_dest))

    learning_curves(context, results)
def train_model(
    context: MLClientCtx,
    model_pkg_class: str,
    dataset: DataItem,
    label_column: str = "labels",
    encode_cols: dict = None,
    sample: int = -1,
    test_size: float = 0.30,
    train_val_split: float = 0.75,
    test_set_key: str = "test_set",
    model_evaluator=None,
    models_dest: str = "",
    plots_dest: str = "plots",
    file_ext: str = "parquet",
    model_pkg_file: str = "",
    random_state: int = 1,
) -> None:
    """train a classifier

    An optional custom model evaluator can be supplied that should have the
    signature:
    `my_custom_evaluator(context, xvalid, yvalid, model, plots_artifact_path=...)`
    and return a dictionary of scalar "results", a "plots" keys with a list
    of PlotArtifacts, and a "tables" key containing a returned list of
    TableArtifacts.

    :param context:           the function context
    :param model_pkg_class:   the model to train, e.g,
                              "sklearn.neural_networks.MLPClassifier", or
                              json model config
    :param dataset:           ("data") name of raw data file
    :param label_column:      ground-truth (y) labels
    :param encode_cols:       dictionary of names and prefixes for columns
                              that are to hot be encoded.
    :param sample:            Selects the first n rows, or select a sample
                              starting from the first. If negative <-1,
                              select a random sample
    :param test_size:         (0.30) test set size
    :param train_val_split:   (0.75) Once the test set has been removed the
                              training set gets this proportion.
    :param test_set_key:      key of held out data in artifact store
    :param model_evaluator:   (None) a custom model evaluator can be
                              specified
    :param models_dest:       ("") models subfolder on artifact path,
                              "model" when empty
    :param plots_dest:        plot subfolder on artifact path
    :param file_ext:          ("parquet") format for test_set_key hold out
                              data
    :param model_pkg_file:    ("") not used by this implementation
    :param random_state:      (1) sklearn rng seed
    """
    models_dest = models_dest or "model"

    raw, labels, header = get_sample(dataset, sample, label_column)

    # one-hot encode the requested columns ({column: prefix})
    if encode_cols:
        raw = pd.get_dummies(raw,
                             columns=list(encode_cols.keys()),
                             prefix=list(encode_cols.values()),
                             drop_first=True)

    (xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = get_splits(
        raw, labels, 3, test_size, 1 - train_val_split, random_state)

    # keep the held-out set so later steps can test against it
    context.log_dataset(test_set_key,
                        df=pd.concat([xtest, ytest.to_frame()], axis=1),
                        format=file_ext,
                        index=False,
                        labels={"data-type": "held-out"},
                        artifact_path=context.artifact_subpath('data'))

    model_config = gen_sklearn_model(model_pkg_class,
                                     context.parameters.items())

    model_config["FIT"].update({"X": xtrain, "y": ytrain.values})

    ClassifierClass = create_class(model_config["META"]["class"])

    model = ClassifierClass(**model_config["CLASS"])

    model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)

    # a custom evaluator takes precedence over the default one
    evaluator = model_evaluator or eval_model_v2
    eval_metrics = evaluator(context, xvalid, yvalid, model,
                             plots_artifact_path=plots_path)

    context.set_label('class', model_pkg_class)
    context.log_model("model",
                      body=dumps(model),
                      artifact_path=artifact_path,
                      extra_data=eval_metrics,
                      model_file="model.pkl",
                      metrics=context.results,
                      labels={"class": model_pkg_class})
def describe_spark(context: MLClientCtx,
                   dataset: DataItem,
                   artifact_path,
                   bins: int = 30,
                   describe_extended: bool = True):
    """Compute summary statistics of a CSV dataset with Spark and log them.

    The dataset is read with Spark's CSV reader (header row, inferred schema)
    and a "summary_stats" CSV dataset is logged under the "data" artifact
    subpath. The Spark session is stopped when the function exits, including
    on error.

    :param context:           the function context
    :param dataset:           input data item; its local path is read as CSV
    :param artifact_path:     not used by this implementation
    :param bins:              (30) passed to the `describe` helper
    :param describe_extended: (True) when True use the `describe` helper and
                              also log its table as run results, otherwise
                              log the transposed output of Spark's
                              `DataFrame.describe()`
    """
    location = dataset.local()

    spark = SparkSession.builder.appName("Spark job").getOrCreate()
    try:
        df = spark.read.csv(location, header=True, inferSchema=True)

        kwargs = []

        float_cols = [
            item[0] for item in df.dtypes
            if item[1].startswith('float') or item[1].startswith('double')
        ]

        # explicit comparison kept on purpose: only values equal to True
        # select the extended path, exactly as before
        extended = describe_extended == True  # noqa: E712

        if extended:
            table, variables, freq = describe(df, bins, float_cols, kwargs)

            tbl_1 = variables.reset_index()

            if len(freq) != 0:
                # collapse per-column frequencies into one tuple of
                # {key: val} dicts per column and attach them to the stats
                tbl_2 = pd.DataFrame.from_dict(
                    freq, orient="index").sort_index().stack().reset_index()
                tbl_2.columns = ['col', 'key', 'val']
                tbl_2['Merged'] = [{
                    key: val
                } for key, val in zip(tbl_2.key, tbl_2.val)]
                tbl_2 = tbl_2.groupby(
                    'col',
                    as_index=False).agg(lambda x: tuple(x))[['col', 'Merged']]

                summary = pd.merge(tbl_1,
                                   tbl_2,
                                   how='left',
                                   left_on='index',
                                   right_on='col')
            else:
                summary = tbl_1
        else:
            summary = df.describe().toPandas().T

        context.log_dataset("summary_stats",
                            df=summary,
                            format="csv",
                            index=False,
                            artifact_path=context.artifact_subpath('data'))

        if extended:
            context.log_results(table)
    finally:
        # always release the Spark session, even when describing fails
        spark.stop()
def train_model(context: MLClientCtx,
                dataset: DataItem,
                model_pkg_class: str,
                label_column: str = "label",
                train_validation_size: float = 0.75,
                sample: float = 1.0,
                models_dest: str = "models",
                test_set_key: str = "test_set",
                plots_dest: str = "plots",
                dask_key: str = "dask_key",
                dask_persist: bool = False,
                scheduler_key: str = '',
                file_ext: str = "parquet",
                random_state: int = 42) -> None:
    """
    Train a sklearn classifier with Dask

    Steps: connect a Dask client, read the dataset as a Dask dataframe, keep
    numeric columns only, label-encode the target, split, standard-scale,
    fit the model inside a joblib "dask" backend, log evaluation plots and
    results, then log the model, the scaler, the encoder and the held-out
    set.

    :param context:                 Function context.
    :param dataset:                 Raw data file.
    :param model_pkg_class:         Model to train, e.g, "sklearn.ensemble.RandomForestClassifier",
                                    or json model config.
    :param label_column:            (label) Ground-truth y labels.
    :param train_validation_size:   (0.75) Train validation set proportion out of the full dataset.
    :param sample:                  (1.0) Fraction of rows to sample (passed as `frac`);
                                    rows are randomized by the sampling.
    :param models_dest:             (models) Models subfolder on artifact path.
    :param test_set_key:            (test_set) Mlrun db key of held out data in artifact store.
    :param plots_dest:              (plots) Plot subfolder on artifact path.
    :param dask_key:                (dask key) Key of dataframe in dask client "datasets" attribute.
                                    Not referenced in this function body.
    :param dask_persist:            (False) Should the data be persisted (through the `client.persist`).
                                    Not referenced in this function body.
    :param scheduler_key:           (scheduler) Dask scheduler configuration, json also logged as an artifact.
                                    When empty a default `Client()` is created.
    :param file_ext:                (parquet) format for test_set_key hold out data
    :param random_state:            (42) sklearn seed
    :raises Exception:              when the numeric columns contain NA values
    """
    # NOTE(review): the client is never closed in this function - confirm
    # that this is intentional (e.g. shared cluster).
    if scheduler_key:
        client = Client(scheduler_key)
    else:
        client = Client()

    context.logger.info("Read Data")

    df = dataset.as_df(df_module=dd)

    context.logger.info("Prep Data")

    # only numeric columns are kept; everything else is silently dropped
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = df.select_dtypes(include=numerics)

    if df.isna().any().any().compute() == True:
        raise Exception('NAs valus found')

    # header captured before sampling; reused for plot labels and for the
    # held-out dataframe columns
    df_header = df.columns

    df = df.sample(frac=sample).reset_index(drop=True)
    encoder = LabelEncoder()
    encoder = encoder.fit(df[label_column])
    X = df.drop(label_column, axis=1).to_dask_array(lengths=True)
    y = encoder.transform(df[label_column])

    classes = df[label_column].drop_duplicates()  # no unique values in dask
    classes = [str(i) for i in classes]

    context.logger.info("Split and Train")

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, train_size=train_validation_size, random_state=random_state)

    # the scaler is fitted on the training split only and then applied to
    # both splits
    scaler = StandardScaler()
    scaler = scaler.fit(X_train)
    X_train_transformed = scaler.transform(X_train)
    X_test_transformed = scaler.transform(X_test)

    model_config = gen_sklearn_model(model_pkg_class,
                                     context.parameters.items())

    model_config["FIT"].update({"X": X_train_transformed, "y": y_train})

    ClassifierClass = create_class(model_config["META"]["class"])

    model = ClassifierClass(**model_config["CLASS"])

    with joblib.parallel_backend("dask"):
        model = model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.logger.info("Evaluate")

    extra_data_dict = {}
    for report in (ROCAUC, ClassificationReport, ConfusionMatrix):
        report_name = str(report.__name__)

        # reset matplotlib state so each report is drawn on a clean figure
        plt.cla()
        plt.clf()
        plt.close()

        viz = report(model, classes=classes, per_class=True, is_fitted=True)
        viz.fit(X_train_transformed, y_train)  # Fit the training data to the visualizer
        viz.score(X_test_transformed, y_test.compute())  # Evaluate the model on the test data

        plot = context.log_artifact(PlotArtifact(report_name,
                                                 body=viz.fig,
                                                 title=report_name),
                                    db_key=False)
        # NOTE(review): the key is str(report) (the class repr), not
        # report_name - confirm this is the intended extra_data key.
        extra_data_dict[str(report)] = plot

        if report_name == 'ROCAUC':
            context.log_results({
                "micro": viz.roc_auc.get("micro"),
                "macro": viz.roc_auc.get("macro")
            })

        elif report_name == 'ClassificationReport':
            # one result per (score name, class) pair
            for score_name in viz.scores_:
                for score_class in viz.scores_[score_name]:
                    context.log_results({
                        score_name + "-" + score_class:
                        viz.scores_[score_name].get(score_class)
                    })

    # feature labels are the original header without the label column
    viz = FeatureImportances(model,
                             classes=classes,
                             per_class=True,
                             is_fitted=True,
                             labels=df_header.delete(
                                 df_header.get_loc(label_column)))
    viz.fit(X_train_transformed, y_train)
    # NOTE(review): y_test is passed here without .compute(), unlike the
    # reports above - confirm this is intended.
    viz.score(X_test_transformed, y_test)

    plot = context.log_artifact(PlotArtifact("FeatureImportances",
                                             body=viz.fig,
                                             title="FeatureImportances"),
                                db_key=False)
    extra_data_dict[str("FeatureImportances")] = plot

    plt.cla()
    plt.clf()
    plt.close()

    context.logger.info("Log artifacts")
    # recomputed with the same arguments as above
    artifact_path = context.artifact_subpath(models_dest)

    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.set_label('class', model_pkg_class)

    context.log_model("model",
                      body=dumps(model),
                      artifact_path=artifact_path,
                      model_file="model.pkl",
                      extra_data=extra_data_dict,
                      metrics=context.results,
                      labels={"class": model_pkg_class})

    # NOTE(review): `model_file` and `label` are passed to log_artifact here;
    # verify that log_artifact accepts these keyword arguments.
    context.log_artifact("standard_scaler",
                         body=dumps(scaler),
                         artifact_path=artifact_path,
                         model_file="scaler.gz",
                         label="standard_scaler")

    context.log_artifact("label_encoder",
                         body=dumps(encoder),
                         artifact_path=artifact_path,
                         model_file="encoder.gz",
                         label="label_encoder")

    # held-out set: unscaled test features with the encoded labels appended
    # as the last column
    # NOTE(review): columns=df_header assumes the label column is last in the
    # original header - confirm.
    df_to_save = delayed(np.column_stack)((X_test, y_test)).compute()
    context.log_dataset(
        test_set_key,
        df=pd.DataFrame(df_to_save,
                        columns=df_header),  # improve log dataset ability
        format=file_ext,
        index=False,
        labels={"data-type": "held-out"},
        artifact_path=context.artifact_subpath('data'))

    context.logger.info("Done!")
def permutation_importance(
    context: MLClientCtx,
    model: DataItem,
    dataset: DataItem,
    labels: str,
    figsz=(10, 5),
    plots_dest: str = "plots",
    fitype: str = "permute",
) -> None:
    """calculate change in metric

    type 'permute' uses a pre-estimated model
    type 'dropcol' uses a re-estimated model (a clone is refit without each
    column and its ``oob_score_`` is used, so the estimator must expose one)

    The importances are logged as a bar-plot artifact and as a
    "feature-importances-<fitype>-tbl" dataset; nothing is returned.

    :param context:     the function's execution context
    :param model:       a trained model
    :param dataset:     features and ground truths, regression targets
    :param labels:      name of the ground truths column
    :param figsz:       matplotlib figure size
    :param plots_dest:  path within artifact store
    :param fitype:      ("permute") importance type, 'permute' or 'dropcol'
    :raises ValueError: if ``fitype`` is not 'permute' or 'dropcol'
    """
    # validate up front (by value, not identity) before doing any work
    if fitype not in ("permute", "dropcol"):
        raise ValueError(
            "unknown fitype, only 'permute' or 'dropcol' permitted")

    model_file, _, _ = get_model(model.url, suffix=".pkl")
    # NOTE: this deserializes a pickled model - only load trusted artifacts
    with open(str(model_file), "rb") as model_fp:
        model = load(model_fp)

    X = dataset.as_df()
    y = X.pop(labels)
    header = X.columns

    metric = _oob_classifier_accuracy

    baseline = metric(model, X, y)

    imp = []
    for col in X.columns:
        if fitype == "permute":
            # shuffle one column, score, then restore the original values
            save = X[col].copy()
            X[col] = np.random.permutation(X[col])
            m = metric(model, X, y)
            X[col] = save
            imp.append(baseline - m)
        else:  # "dropcol"
            X_ = X.drop(col, axis=1)
            model_ = clone(model)
            model_.fit(X_, y)
            o = model_.oob_score_
            imp.append(baseline - o)

    zipped = zip(imp, header)
    feature_imp = pd.DataFrame(sorted(zipped),
                               columns=["importance", "feature"])
    feature_imp.sort_values(by="importance", ascending=False, inplace=True)

    plt.clf()
    plt.figure(figsize=figsz)
    sns.barplot(x="importance", y="feature", data=feature_imp)
    plt.title(f"feature importances-{fitype}")
    plt.tight_layout()

    context.log_artifact(
        PlotArtifact(f"feature importances-{fitype}", body=plt.gcf()),
        local_path=f"{plots_dest}/feature-permutations.html",
    )
    context.log_dataset(f"feature-importances-{fitype}-tbl",
                        df=feature_imp,
                        index=False)
def data_clean(context: MLClientCtx,
               src: DataItem,
               file_ext: str = "csv",
               models_dest: str = "models/encoders",
               cleaned_key: str = "cleaned-data",
               encoded_key: str = "encoded-data"):
    """Clean and label-encode a raw churn data table.

    Drops and renames columns, truncates "No ..." answers to "No", bins
    `tenure` into an ordinal `tenure_map` column, logs the cleaned table,
    then label-encodes the categorical columns and logs the encoded table
    together with the fitted encoders.

    :param context:      the function execution context
    :param src:          an artifact or file path
    :param file_ext:     file type for artifacts
    :param models_dest:  subfolder for the encoders and the preprocessing
                         maps
    :param cleaned_key:  key of cleaned data table in artifact store
    :param encoded_key:  key of encoded data table in artifact store
    """
    df = src.as_df()

    # drop columns
    drop_cols_list = ["customerID", "TotalCharges"]
    df.drop(drop_cols_list, axis=1, inplace=True)

    # header transformations
    rename_cols_map = {
        "SeniorCitizen": "senior",
        "Partner": "partner",
        "Dependents": "deps",
        "Churn": "labels"
    }
    df.rename(rename_cols_map, axis=1, inplace=True)

    # add drop column to logs:
    for col in drop_cols_list:
        rename_cols_map.update({col: "_DROPPED_"})

    # log the op
    tp = os.path.join(models_dest, "preproc-column_map.json")
    context.log_artifact("preproc-column_map.json",
                         body=json.dumps(rename_cols_map),
                         local_path=tp)

    # truncate replies such as "No internet service" to "No"
    df = df.applymap(lambda x: "No" if str(x).startswith("No ") else x)

    # encode numerical type as category bins (ordinal)
    bins = [0, 12, 24, 36, 48, 60, np.inf]
    labels = [0, 1, 2, 3, 4, 5]
    df["tenure_map"] = pd.cut(df.tenure, bins, labels=False)
    # maps each bin's lower edge to its ordinal code
    tenure_map = dict(zip(bins, labels))
    # save this transformation
    tp = os.path.join(models_dest, "preproc-numcat_map.json")
    context.log_artifact("preproc-numcat_map.json",
                         body=bytes(json.dumps(tenure_map).encode("utf-8")),
                         local_path=tp)

    context.log_dataset(cleaned_key, df=df, format=file_ext, index=False)

    # label encoding - one encoder per column, kept in a dict
    fix_cols = [
        "gender", "partner", "deps", "OnlineSecurity", "OnlineBackup",
        "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
        "PhoneService", "MultipleLines", "PaperlessBilling", "InternetService",
        "Contract", "PaymentMethod", "labels"
    ]

    d = defaultdict(LabelEncoder)
    df[fix_cols] = df[fix_cols].apply(
        lambda x: d[x.name].fit_transform(x.astype(str)))
    context.log_dataset(encoded_key, df=df, format=file_ext, index=False)

    # the fitted encoders are stored as a single model artifact
    model_bin = dumps(d)
    context.log_model("model",
                      body=model_bin,
                      artifact_path=os.path.join(context.artifact_path,
                                                 models_dest),
                      model_file="model.pkl")
def data_clean(
    context: MLClientCtx,
    src: DataItem,
    file_ext: str = "csv",
    models_dest: str = "models/encoders",
    cleaned_key: str = "cleaned-data",
    encoded_key: str = "encoded-data",
):
    """Turn a raw churn data file into cleaned and encoded tables.

    The data passes through three states: `raw`, `cleaned` and `encoded`

    * `raw` is the artifact the pipeline starts from and is kept as is
    * `cleaned` is kept for charts and presentations
    * `encoded` feeds the cross validation and training steps

    What happens here (not necessarily in this order):
    * column names are mapped
    * nans and other kinds of missing/junk values are dealt with
    * binary and ordinal category columns are label encoded
    * category ranges are created from numerical columns

    One-hot encoding is deliberately left to the training step: not every
    algorithm needs it, and it can produce a very large feature matrix that
    there is no reason to serialize (even when sparse).

    Scaling of numerical columns is skipped for the same reason. It should
    also not be done before the train-test split, since that leaks
    information, and many estimators are insensitive to the monotonic
    transformations scaling implies.

    TODO:
    * parallelize where possible
    * more abstraction (more parameters, chain sklearn transformers)
    * convert to marketplace function

    :param context:      the function execution context
    :param src:          an artifact or file path
    :param file_ext:     file type for artifacts
    :param models_dest:  label encoders and other preprocessing steps should
                         be saved together with other pipeline models
    :param cleaned_key:  key of cleaned data table in artifact store
    :param encoded_key:  key of encoded data table in artifact store
    """
    frame = src.as_df()

    # --- columns: drop, then rename -------------------------------------
    dropped_columns = ["customerID", "TotalCharges"]
    frame.drop(dropped_columns, axis=1, inplace=True)

    column_map = {
        "SeniorCitizen": "senior",
        "Partner": "partner",
        "Dependents": "deps",
        "Churn": "labels",
    }
    frame.rename(column_map, axis=1, inplace=True)

    # record the dropped columns in the same map before logging it
    column_map.update({name: "_DROPPED_" for name in dropped_columns})
    context.log_artifact(
        "preproc-column_map.json",
        body=json.dumps(column_map),
        local_path=os.path.join(models_dest, "preproc-column_map.json"),
    )

    # --- values: shorten every "No ..." reply to plain "No" -------------
    def _shorten_no(value):
        if str(value).startswith("No "):
            return "No"
        return value

    frame = frame.applymap(_shorten_no)

    # --- tenure: numerical values to ordinal bins -----------------------
    bin_edges = [0, 12, 24, 36, 48, 60, np.inf]
    bin_codes = [0, 1, 2, 3, 4, 5]
    frame["tenure_map"] = pd.cut(frame.tenure, bin_edges, labels=False)

    # keep a record of the binning next to the other preprocessing maps
    tenure_map = dict(zip(bin_edges, bin_codes))
    context.log_artifact(
        "preproc-numcat_map.json",
        body=json.dumps(tenure_map).encode("utf-8"),
        local_path=os.path.join(models_dest, "preproc-numcat_map.json"),
    )

    context.log_dataset(cleaned_key, df=frame, format=file_ext, index=False)

    # --- label encoding: one encoder per column, collected in a dict ----
    # (some of these columns may still be one-hot encoded when training)
    categorical_columns = [
        "gender",
        "partner",
        "deps",
        "OnlineSecurity",
        "OnlineBackup",
        "DeviceProtection",
        "TechSupport",
        "StreamingTV",
        "StreamingMovies",
        "PhoneService",
        "MultipleLines",
        "PaperlessBilling",
        "InternetService",
        "Contract",
        "PaymentMethod",
        "labels",
    ]

    encoders = defaultdict(LabelEncoder)

    def _encode_column(column):
        return encoders[column.name].fit_transform(column.astype(str))

    frame[categorical_columns] = frame[categorical_columns].apply(
        _encode_column)
    context.log_dataset(encoded_key, df=frame, format=file_ext, index=False)

    # all fitted encoders go into a single model artifact
    encoders_path = os.path.join(context.artifact_path, models_dest)
    context.log_model(
        "model",
        body=dumps(encoders),
        artifact_path=encoders_path,
        model_file="model.pkl",
    )
def arc_to_parquet(context: MLClientCtx,
                   archive_url: DataItem,
                   header: List[str] = None,
                   chunksize: int = 0,
                   dtype=None,
                   encoding: str = "latin-1",
                   key: str = "data",
                   dataset: str = "None",
                   part_cols=None,
                   file_ext: str = "parquet",
                   index: bool = False,
                   refresh_data: bool = False,
                   stats: bool = False) -> None:
    """Open a file/object archive and save as a parquet file or dataset

    Notes
    -----
    * this function is typically for large files, please be sure to check
      all settings
    * partitioning requires precise specification of column types.
    * the archive_url can be any file readable by pandas read_csv, which
      includes tar files
    * if the `dataset` parameter is set (anything other than None, "" or the
      placeholder string "None"), a partitioned dataset is created in the
      folder `dataset` instead of a single file
    * if the destination exists already then nothing is read or logged.
      NOTE(review): `refresh_data` is documented as forcing a re-acquire but
      is not consulted by this implementation.
    * `header`, `part_cols` and `refresh_data` are currently unused; `dtype`
      and `encoding` are only applied on the chunked (chunksize > 0) path.

    :param context:        the function context
    :param archive_url:    MLRun data input (DataItem object)
    :param header:         (None) currently unused
    :param chunksize:      (0) when > 0, row size (chunk) to retrieve
                           per iteration
    :param dtype:          destination data type of specified columns
    :param encoding:       ("latin-1") file encoding
    :param key:            key in artifact store (when log_data=True)
    :param dataset:        ("None") if set then "target_path/dataset"
                           is folder for partitioned files
    :param part_cols:      (None) list of partitioning columns
    :param file_ext:       (parquet) csv/parquet file extension
    :param index:          (False) pandas save index option
    :param refresh_data:   (False) overwrite existing data at that location
    :param stats:          (False) calculate table stats when logging
                           artifact
    """
    base_path = context.artifact_path
    os.makedirs(base_path, exist_ok=True)

    archive_url = archive_url.local()

    # the signature default is the *string* "None"; treat it (and the empty
    # string) as "no partitioned dataset" so the single-file branch is
    # reachable
    if dataset is not None and str(dataset).strip() in ("", "None"):
        dataset = None

    if dataset is not None:
        dest_path = os.path.join(base_path, dataset)
        exists = os.path.isdir(dest_path)
    else:
        dest_path = os.path.join(base_path, key + f".{file_ext}")
        exists = os.path.isfile(dest_path)

    if exists:
        context.logger.info("destination file already exists, nothing done")
        return

    context.logger.info("destination file does not exist, downloading")
    if chunksize > 0:
        # stream the archive to parquet chunk by chunk, then register the
        # written target as the dataset artifact
        _chunk_readwrite(archive_url, dest_path, chunksize, encoding, dtype,
                         dataset)
        context.log_dataset(key=key,
                            stats=stats,
                            format='parquet',
                            target_path=dest_path)
    else:
        df = pd.read_csv(archive_url)
        context.log_dataset(key, df=df, format=file_ext, index=index)
def train_model(
    context: MLClientCtx,
    dataset: DataItem,
    event_column: str = "labels",
    time_column: str = "tenure",
    encode_cols: dict = None,
    strata_cols: list = None,
    plot_cov_groups: bool = False,
    p_value: float = 0.005,
    sample: int = -1,
    test_size: float = 0.25,
    valid_size: float = 0.75,  # (after test removed)
    random_state: int = 1,
    models_dest: str = "",
    plots_dest: str = "",
    file_ext: str = "csv",
) -> None:
    """train models to predict the timing of events

    Although identical in structure to other training functions, this one
    requires generating a 'Y' that represents the age/duration/tenure of
    the observation, designated 'tenure' here, and a binary labels column
    that represents the event of interest, churned/not-churned.

    In addition, there is a strata_cols parameter, representing a list of
    stratification (aka grouping) variables.

    A Kaplan-Meier model and a Cox proportional hazards model are fitted on
    the training split and logged; the test split is logged as
    "tenured-test-set".

    :param context:         the function context
    :param dataset:         ("data") name of raw data file
    :param event_column:    ground-truth (y) labels (considered as events in
                            this model)
    :param time_column:     age or tenure column
    :param encode_cols:     dictionary of names and prefixes for columns that
                            are to hot be encoded.
    :param strata_cols:     columns used to stratify predictors
    :param plot_cov_groups: passed to the Cox model logging helper
    :param p_value:         (0.005) max p value for coefficients selected
                            (not used by this implementation)
    :param sample:          Selects the first n rows, or select a sample
                            starting from the first. If negative <-1, select
                            a random sample
    :param test_size:       (0.25) test set size
    :param valid_size:      (0.75) Once the test set has been removed the
                            training set gets this proportion.
    :param random_state:    (1) sklearn rng seed
    :param models_dest:     destination subfolder for model artifacts
                            ("models" when empty)
    :param plots_dest:      destination subfolder for plot artifacts
                            (not used by this implementation)
    :param file_ext:        format for test_set_key hold out data
    """
    models_dest = models_dest or "models"
    plots_dest = plots_dest or f"plots/{context.name}"
    # an omitted strata list is passed on as an empty list, as before
    strata_cols = [] if strata_cols is None else strata_cols

    # the duration column plays the role of "y" for the splits
    raw, tenure, header = get_sample(dataset, sample, time_column)

    if encode_cols:
        raw = pd.get_dummies(
            raw,
            columns=list(encode_cols.keys()),
            prefix=list(encode_cols.values()),
            drop_first=True,
        )

    (xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = get_splits(
        raw, tenure, 3, test_size, valid_size, random_state)

    # remove every feature derived from the duration column (name prefix)
    for X in [xtrain, xvalid, xtest]:
        drop_cols = X.columns.str.startswith(time_column)
        X.drop(X.columns[drop_cols], axis=1, inplace=True)
    for Y in [ytrain, yvalid, ytest]:
        Y.name = time_column

    context.log_dataset(
        "tenured-test-set",
        df=pd.concat([xtest, ytest.to_frame()], axis=1),
        format=file_ext,
        index=False,
    )

    # use the configured event column rather than a hard-coded "labels"
    # attribute so a non-default event_column works for both models
    km_model = KaplanMeierFitter().fit(ytrain, xtrain[event_column])
    _kaplan_meier_log_model(context, km_model, models_dest=models_dest)

    coxdata = pd.concat([xtrain, ytrain.to_frame()], axis=1)
    cx_model = CoxPHFitter().fit(coxdata,
                                 time_column,
                                 event_column,
                                 strata=strata_cols)
    _coxph_log_model(
        context,
        cx_model,
        models_dest=models_dest,
        plot_cov_groups=plot_cov_groups,
        extra_data={"km": f"{models_dest}/km"},
    )
def train_model(
    context: MLClientCtx,
    model_type: str,
    dataset: Union[DataItem, pd.core.frame.DataFrame],
    label_column: str = "labels",
    encode_cols: dict = None,
    sample: int = -1,
    imbal_vec=None,
    test_size: float = 0.25,
    valid_size: float = 0.75,
    random_state: int = 1,
    models_dest: str = "models",
    plots_dest: str = "plots",
    eval_metrics: list = None,
    file_ext: str = "parquet",
    test_set: str = "test_set",
) -> None:
    """train an xgboost model.

    Note on imbalanced data:  the `imbal_vec` parameter represents the
    measured class representations in the sample and can be used as a first
    step in tuning an XGBoost model. This isn't a hyperparameter, merely an
    estimate that should be set as 'constant' throughout tuning process.
    (`imbal_vec` is accepted but not used by this implementation.)

    :param context:           the function context
    :param model_type:        the model type to train, "classifier",
                              "regressor"...
    :param dataset:           ("data") name of raw data file
    :param label_column:      ground-truth (y) labels
    :param encode_cols:       dictionary of names and prefixes for columns
                              that are to hot be encoded.
    :param sample:            Selects the first n rows, or select a sample
                              starting from the first. If negative <-1,
                              select a random sample
    :param imbal_vec:         (None) vector of class weights seen in sample
    :param test_size:         (0.25) test set size
    :param valid_size:        (0.75) Once the test set has been removed the
                              training set gets this proportion.
    :param random_state:      (1) sklearn rng seed
    :param models_dest:       destination subfolder for model artifacts
    :param plots_dest:        destination subfolder for plot artifacts
                              (not used by this implementation)
    :param eval_metrics:      (["error", "auc"]) learning curve metrics
    :param file_ext:          format for test_set hold out data
    :param test_set:          (test_set) key of held out data in artifact
                              store
    """
    models_dest = models_dest or "models"
    plots_dest = plots_dest or f"plots/{context.name}"
    # fresh list per call instead of a shared mutable default
    if eval_metrics is None:
        eval_metrics = ["error", "auc"]

    raw, labels, header = get_sample(dataset, sample, label_column)

    if encode_cols:
        raw = pd.get_dummies(
            raw,
            columns=list(encode_cols.keys()),
            prefix=list(encode_cols.values()),
            drop_first=True,
        )

    (xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = get_splits(
        raw, labels, 3, test_size, valid_size, random_state)

    # keep the held-out set for later test steps
    context.log_dataset(test_set,
                        df=pd.concat([xtest, ytest], axis=1),
                        format=file_ext,
                        index=False)

    model_config = _gen_xgb_model(model_type, context.parameters.items())

    XGBBoostClass = create_class(model_config["META"]["class"])
    model = XGBBoostClass(**model_config["CLASS"])

    # both the train and the validation split are tracked during fitting
    model_config["FIT"].update({
        "X": xtrain,
        "y": ytrain.values,
        "eval_set": [(xtrain, ytrain), (xvalid, yvalid)],
        "eval_metric": eval_metrics,
    })

    model.fit(**model_config["FIT"])

    # called as before; its return value is not used by this function
    eval_model_v2(context, xvalid, yvalid, model)

    model_bin = dumps(model)
    context.log_model(
        "model",
        body=model_bin,
        artifact_path=os.path.join(context.artifact_path, models_dest),
        model_file="model.pkl",
    )