def _run_train(self, *args, **kwargs):
    """
    When an algorithm runs it always takes in a dataframe with training data,
    may optionally take a dataframe of validation data, and returns a dictionary
    with information on the trained model plus a number of artifacts.
    """
    assert isinstance(args[0], pd.DataFrame)
    started_on = time_ms()
    results = collections.OrderedDict(
        {
            "type": "analitico/training",
            "plugins": {
                "training": self.Meta.name,  # plugin used to train the model
                "prediction": self.Meta.name,  # plugin to be used for predictions (usually the same)
            },
            "data": {},  # number of records, etc.
            "parameters": {},  # model parameters, hyperparameters
            "scores": {},  # training scores
            "performance": get_runtime_brief(),  # time elapsed, cpu, gpu, memory, disk, etc.
        }
    )

    train = args[0]
    test = args[1] if len(args) > 1 else None
    results = self.train(train, test, results, *args, **kwargs)

    # finalize results and save as metadata.json
    results["performance"]["total_ms"] = time_ms(started_on)
    artifacts_path = self.factory.get_artifacts_directory()
    results_path = os.path.join(artifacts_path, "metadata.json")
    save_json(results, results_path)
    self.info("saved %s (%d bytes)", results_path, os.path.getsize(results_path))
    return results
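# `time_ms` is used throughout these plugins but is not defined in this section.
# A minimal sketch of its assumed behavior: current time in milliseconds when
# called without arguments, elapsed milliseconds when given a start time
# (a hypothetical helper, not necessarily the actual implementation):
import time

def time_ms(started_ms=None):
    """ Returns current time in ms, or ms elapsed since started_ms """
    now_ms = int(round(time.time() * 1000))
    return now_ms if started_ms is None else now_ms - started_ms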
def run(self, *args, action=None, **kwargs):
    """ Process plugins in sequence, return combined result """
    try:
        pipeline_on = time_ms()

        # logging is expensive so we don't track everything in prediction mode
        predicting = action and ACTION_PREDICT in action
        if not predicting:
            self.factory.status(self, status.STATUS_RUNNING)

        output = None
        for p, plugin in enumerate(self.plugins):
            plugin_on = time_ms()
            if not predicting:
                self.factory.status(plugin, status.STATUS_RUNNING)

            # a plugin can have one or more input parameters and one or more
            # output parameters. results from a call are passed as a tuple to
            # the next plugin in the chain. when we finally return, a single
            # result is unpacked, otherwise we return a tuple. this allows a
            # pipeline to chain plugins with a variable number of parameters.
            # each plugin is responsible for validating the type of its input
            # positional parameters and named parameters.
            try:
                args = plugin.run(*args, action=action, **kwargs)
                if not isinstance(args, tuple):
                    args = (args,)
            except Exception as e:
                self.factory.status(plugin, status.STATUS_FAILED, exception=e)
                raise

            # log outputs of plugin (skipped when predicting)
            if not predicting:
                output = self.get_metadata(*args)
                self.factory.status(plugin, status.STATUS_COMPLETED, elapsed_ms=time_ms(plugin_on), output=output)

        if not predicting:
            # log outputs of pipeline
            self.factory.status(self, status.STATUS_COMPLETED, elapsed_ms=time_ms(pipeline_on), output=output)

        return args if len(args) > 1 else args[0]
    except Exception as e:
        self.factory.status(self, status.STATUS_FAILED)
        self.factory.exception(self.Meta.name + " failed while processing", item=self, exception=e)
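# To illustrate the tuple-chaining contract in run() above: a plugin returning a
# single value gets wrapped in a 1-tuple, while a plugin returning a tuple feeds
# each element as a positional argument to the next plugin in the chain.
# Hypothetical sketch (the real plugin base class is not shown in this section):
import pandas as pd

class SplitColumnsPlugin:
    """ Example plugin: receives one dataframe, returns two dataframes """

    def run(self, df: pd.DataFrame, *args, action=None, **kwargs):
        numeric = df.select_dtypes(include="number")
        other = df.select_dtypes(exclude="number")
        # the pipeline will call the next plugin as next.run(numeric, other, ...)
        return numeric, other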
def retrieve_df(self, *args, action=None, **kwargs):
    """ Retrieve dataframe from dataset with id set in plugin's configuration """
    try:
        dataset_id = self.get_attribute("dataset_id")
        if not dataset_id:
            dataset_id = self.get_attribute("source.dataset_id")
        if not dataset_id:
            self.exception("DatasetSourcePlugin - must specify 'dataset_id'")

        info_url = "analitico://datasets/" + dataset_id + "/data/info"
        self.info("reading: %s", info_url)
        info = self.factory.get_url_json(info_url)
        schema = get_dict_dot(info, "data.schema", None)
        if not schema:
            self.warning("DatasetSourcePlugin - %s does not contain schema information", info_url)

        # save the schema for the source so it can be used to enforce it on prediction
        self.set_attribute("source.schema", schema)

        # stream data from dataset endpoint or storage as csv
        csv_url = "analitico://datasets/" + dataset_id + "/data/csv"
        csv_stream = self.factory.get_url_stream(csv_url, binary=False)

        reading_on = time_ms()
        self.info("reading: %s", csv_url)
        df = analitico.pandas.pd_read_csv(csv_stream, schema)
        self.info("%d rows in %d ms", len(df), time_ms(reading_on))

        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html
        sample = self.get_attribute("sample", 0)
        if sample > 0:
            rows_before = len(df)
            df = analitico.pandas.pd_sample(df, sample)
            self.info("sample: %f, rows before: %d, rows after: %d", sample, rows_before, len(df))

        tail = self.get_attribute("tail", 0)
        if tail > 0:
            rows_before = len(df)
            df = df.tail(tail)
            self.info("tail: %d, rows before: %d, rows after: %d", tail, rows_before, len(df))

        return df
    except Exception:
        raise
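# For reference, the attributes read by retrieve_df shown as a hypothetical
# plugin configuration (the exact settings format is an assumption):
dataset_source_settings = {
    "dataset_id": "ds_titanic",  # dataset to stream records from
    "sample": 0.10,  # optional: sample the data (fraction here; pd_sample semantics assumed)
    "tail": 1000,  # optional: keep only the last n rows
}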
def drop_na_rows(self, df, column):
    """ Drops rows with null values in given column, logs action """
    started_on = time_ms()
    rows_before = len(df.index)
    if rows_before < 1:
        self.warning("Can't drop null '%s' rows because dataframe is empty", column)
        return df

    df.dropna(subset=[column], inplace=True)
    rows_after = len(df.index)
    rows_dropped = rows_before - rows_after

    msg = "Dropped rows where '%s' is null, rows before: %d, after: %d, dropped: %d (%.2f%%) in %d ms"
    self.info(msg, column, rows_before, rows_after, rows_dropped, (100.0 * rows_dropped) / rows_before, time_ms(started_on))
    return df
def _run_predict(self, *args, **kwargs):
    """
    When an algorithm predicts it takes in a dataframe (or custom json) with the
    records to be scored and returns a dictionary with the predictions plus a
    number of artifacts.
    """
    # assert isinstance(args[0], pandas.DataFrame)  # custom models may take json as input
    data = args[0]
    artifacts_path = self.factory.get_artifacts_directory()
    training = read_json(os.path.join(artifacts_path, "metadata.json"))
    assert training

    started_on = time_ms()
    results = collections.OrderedDict(
        {
            "type": "analitico/prediction",
            # "endpoint_id": None,
            # "model_id": None,
            # "job_id": None,
            # "records": None,  # processed (augmented) data will be added by IAlgorithm
            # "predictions": None,  # predictions
            # "probabilities": None,
            "performance": get_runtime_brief(),  # time elapsed, cpu, gpu, memory, disk, etc.
        }
    )

    # force schema like in training data
    if isinstance(data, pd.DataFrame):
        schema = training["data"]["schema"]
        data = apply_schema(data, schema)

    # load model, calculate predictions
    results = self.predict(data, training, results, *args, **kwargs)
    results["performance"]["total_ms"] = time_ms(started_on)

    results_path = os.path.join(artifacts_path, "results.json")
    save_json(results, results_path)
    return results
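# apply_schema is not shown in this section; it is assumed to coerce incoming
# columns back to the dtypes recorded in the training schema so predictions see
# the same types the model was trained on. A hedged sketch of the idea, using
# the schema layout from the train() method below (hypothetical helper name):
import pandas as pd

def apply_schema_sketch(df: pd.DataFrame, schema: dict) -> pd.DataFrame:
    """ Hypothetical: coerce df columns to the types recorded at training time """
    for column in schema["columns"]:
        name, ctype = column["name"], column["type"]
        if ctype == "category":
            df[name] = df[name].astype("category")
        elif ctype in ("integer", "float"):
            df[name] = pd.to_numeric(df[name], errors="coerce")
    return df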
def drop_selected_rows(self, df, df_dropped, message=None):
    """ Drops df_dropped rows from df in place, logs action """
    started_on = time_ms()
    rows_before = len(df.index)
    if rows_before < 1:
        self.warning("Can't drop rows where '%s' because dataframe is empty", message)
        return df

    df.drop(df_dropped.index, inplace=True)

    if message:
        rows_after = len(df.index)
        rows_dropped = rows_before - rows_after
        msg = "Dropped rows where '%s', rows before: %d, after: %d, dropped: %d (%.2f%%) in %d ms"
        self.info(msg, message, rows_before, rows_after, rows_dropped, (100.0 * rows_dropped) / rows_before, time_ms(started_on))
    return df
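# Example use of drop_selected_rows (sketch only, inside a plugin method):
# select unwanted rows with a boolean mask, then drop them with a logged message.
#   outliers = df[df["fare"] > 500]  # "fare" is a hypothetical column
#   df = self.drop_selected_rows(df, outliers, "fare > 500")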
def predict(self, data, training, results, *args, **kwargs):
    """ Return predictions from trained model """
    # data should already come in as a pd.DataFrame, but if it's just a dictionary we convert it
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame.from_dict(data, orient="columns")

    # the records we're predicting on are added to the results after augmentation.
    # if the endpoint or the jupyter notebook in charge of communicating with the
    # caller does not want to send this information back, it can always take it out.
    # in the future we may want to optimize here and make this optional instead.
    results["records"] = analitico.pandas.pd_to_dict(data)

    # initialize data pool to be tested
    categorical_idx = self.get_categorical_idx(data)
    data_pool = catboost.Pool(data, cat_features=categorical_idx)

    # create model object from stored file
    loading_on = time_ms()
    model_path = os.path.join(self.factory.get_artifacts_directory(), "model.cbm")
    if not os.path.isfile(model_path):
        self.exception("CatBoostPlugin.predict - cannot find saved model in %s", model_path)
    model = self.create_model(training)
    model.load_model(model_path)
    results["performance"]["loading_ms"] = time_ms(loading_on)

    algo = training.get("algorithm", ALGORITHM_TYPE_REGRESSION)
    if algo == ALGORITHM_TYPE_REGRESSION:
        y_predictions = model.predict(data_pool)
        y_predictions = np.around(y_predictions, decimals=3)
        results["predictions"] = list(y_predictions)
    else:
        # predict class and probabilities of each class
        y_predictions = model.predict(data_pool, prediction_type="Class")  # array of arrays of 1 element with class index
        y_probabilities = model.predict(data_pool, prediction_type="Probability")  # array of arrays of probabilities
        y_classes = training["data"]["classes"]  # list of possible classes

        preds = results["predictions"] = []
        probs = results["probabilities"] = []

        # create predictions with assigned class and probabilities
        if algo == ALGORITHM_TYPE_MULTICLASS_CLASSIFICATION:
            for i in range(0, len(data)):
                preds.append(y_classes[int(y_predictions[i][0])])
                probs.append({y_classes[j]: y_probabilities[i][j] for j in range(0, len(y_classes))})
        elif algo == ALGORITHM_TYPE_BINARY_CLASSICATION:
            for i in range(0, len(data)):
                preds.append(y_classes[int(y_predictions[i])])
                probs.append({y_classes[0]: y_probabilities[i][0], y_classes[1]: y_probabilities[i][1]})

    return results
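# For reference, a binary classification call to predict() above produces
# results of roughly this shape (keys from the code above, values illustrative):
#
# {
#     "type": "analitico/prediction",
#     "records": [{"age": 22, "fare": 7.25}],
#     "predictions": ["no"],
#     "probabilities": [{"no": 0.83, "yes": 0.17}],
#     "performance": {"loading_ms": 4, "total_ms": 12}
# }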
def train(self, train, test, results, *args, **kwargs):
    """ Train with algorithm and given data to produce a trained model """
    try:
        assert isinstance(train, pd.DataFrame) and len(train.columns) > 1
        train_df = train
        test_df = test

        # if not specified, the prediction target is the last column of the dataset
        label = self.get_attribute("data.label")
        if not label:
            label = train_df.columns[len(train_df.columns) - 1]
        results["data"]["label"] = label

        # choose between regression, binary classification and multiclass classification
        label_type = analitico.schema.get_column_type(train_df, label)
        self.info("label: %s", label)
        self.info("label_type: %s", label_type)
        if label_type == analitico.schema.ANALITICO_TYPE_CATEGORY:
            label_classes = list(train_df[label].cat.categories)
            results["data"]["classes"] = label_classes
            train_df[label] = train_df[label].cat.codes
            results["algorithm"] = (
                ALGORITHM_TYPE_BINARY_CLASSICATION if len(label_classes) == 2 else ALGORITHM_TYPE_MULTICLASS_CLASSIFICATION
            )
            self.info("classes: %s", label_classes)
        else:
            results["algorithm"] = ALGORITHM_TYPE_REGRESSION
        self.info("algorithm: %s", results["algorithm"])

        # remove rows with missing label from training and test sets
        # (note: a plain `if test_df:` would raise, DataFrame truthiness is ambiguous)
        train_rows = len(train_df)
        train_df = train_df.dropna(subset=[label])
        if len(train_df) < train_rows:
            self.warning("Training data has %s rows without '%s' label", train_rows - len(train_df), label)
        if test_df is not None:
            test_rows = len(test_df)
            test_df = test_df.dropna(subset=[label])
            if len(test_df) < test_rows:
                self.warning("Test data has %s rows without '%s' label", test_rows - len(test_df), label)

        # make sure schemas match
        train_schema = self.validate_schema(train_df, test_df)

        # shortened training was requested?
        tail = self.get_attribute("parameters.tail", 0)
        if tail > 0:
            self.info("Tail: %d, cutting training data", tail)
            train_df = train_df.tail(tail).copy()

        # create test set from training set if not provided
        if test_df is None:
            # decide how to create test set from settings variable
            # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html
            chronological = self.get_attribute("data.chronological", False)
            test_size = self.get_attribute("parameters.test_size", 0.20)
            results["data"]["chronological"] = chronological
            results["parameters"]["test_size"] = test_size
            if chronological:
                # test set is taken from the last rows (chronological order)
                self.info("Test set split: chronological")
                test_rows = int(len(train_df) * test_size)
                test_df = train_df[-test_rows:]
                train_df = train_df[:-test_rows]
            else:
                # test set is taken from a random assortment of rows
                self.info("Test set split: random")
                train_df, test_df = train_test_split(train_df, test_size=test_size, random_state=42)

        self.info("training: %d rows", len(train_df))
        self.info("testing: %d rows", len(test_df))

        # validate data types
        for column in train_schema["columns"]:
            if column["type"] not in ("integer", "float", "boolean", "category"):
                self.warning("Column '%s' of type '%s' is incompatible and will be dropped", column["name"], column["type"])
                train_df = train_df.drop(column["name"], axis=1)
                test_df = test_df.drop(column["name"], axis=1)

        # save schema after dropping unused columns
        results["data"]["schema"] = generate_schema(train_df)
        results["data"]["source_records"] = len(train)
        results["data"]["training_records"] = len(train_df)
        results["data"]["test_records"] = len(test_df)
        results["data"]["dropped_records"] = len(train) - len(train_df) - len(test_df)

        # save some training data for debugging
        artifacts_path = self.factory.get_artifacts_directory()
        self.info("artifacts_path: %s", artifacts_path)

        samples_df = analitico.pandas.pd_sample(train_df, 200)
        samples_path = os.path.join(artifacts_path, "training-samples.json")
        samples_df.to_json(samples_path, orient="records")
        self.info("saved: %s (%d bytes)", samples_path, os.path.getsize(samples_path))
        samples_path = os.path.join(artifacts_path, "training-samples.csv")
        samples_df.to_csv(samples_path)
        self.info("saved: %s (%d bytes)", samples_path, os.path.getsize(samples_path))

        # split data and labels
        train_labels = train_df[label]
        train_df = train_df.drop([label], axis=1)
        test_labels = test_df[label]
        test_df = test_df.drop([label], axis=1)

        # indexes of columns that should be considered categorical
        categorical_idx = self.get_categorical_idx(train_df)
        train_pool = catboost.Pool(train_df, train_labels, cat_features=categorical_idx)
        test_pool = catboost.Pool(test_df, test_labels, cat_features=categorical_idx)

        # create regressor or classifier, then train
        training_on = time_ms()
        model = self.create_model(results)
        model.fit(train_pool, eval_set=test_pool)
        results["performance"]["training_ms"] = time_ms(training_on)

        # score test set, add related metrics to results
        self.score_training(model, test_df, test_pool, test_labels, results)
        if results["algorithm"] == ALGORITHM_TYPE_REGRESSION:
            self.score_regressor_training(model, test_df, test_pool, test_labels, results)
        else:
            self.score_classifier_training(model, test_df, test_pool, test_labels, results)

        # save model file and training results
        model_path = os.path.join(artifacts_path, "model.cbm")
        model.save_model(model_path)
        results["scores"]["model_size"] = os.path.getsize(model_path)
        self.info("saved: %s (%d bytes)", model_path, os.path.getsize(model_path))
        return results
    except Exception as exc:
        self.exception("CatBoostPlugin - error while training: %s", str(exc), exception=exc)
def upload_random_rainbows(self, item: Item, size: int):
    """ Uploads random bytes to test upload limits, timeouts, etc. Size of upload is specified by caller. """
    # random directory to test subdirectory generation
    remotepath = f"tst_dir_{id_generator(12)}/abc/def/ghi/unicorns.data"
    logger.info(f"\nsdk upload {remotepath}")

    # random bytes to avoid compression, etc.
    data1 = bytearray(os.urandom(size))

    # upload data directly to item's storage
    with tempfile.NamedTemporaryFile() as f1:
        f1.write(data1)
        f1.flush()  # make sure all bytes are on disk before uploading
        started_ms = time_ms()
        item.upload(filepath=f1.name, remotepath=remotepath, direct=True)
        elapsed_ms = max(1, time_ms(started_ms))
        kb_sec = (size / 1024.0) / (elapsed_ms / 1000.0)
        logger.info(f"sdk upload (direct): {size / MB_SIZE} MB in {elapsed_ms} ms, {kb_sec:.0f} KB/s")

    # download (streaming)
    started_ms = time_ms()
    stream2 = item.download(remotepath, stream=True)
    with tempfile.NamedTemporaryFile() as f2:
        for chunk in iter(stream2):
            f2.write(chunk)
        elapsed_ms = max(1, time_ms(started_ms))
        kb_sec = (size / 1024.0) / (elapsed_ms / 1000.0)
        logger.info(f"sdk download (streaming): {size / MB_SIZE} MB in {elapsed_ms} ms, {kb_sec:.0f} KB/s")
        f2.seek(0)
        data2 = f2.read()
        self.assertEqual(data1, data2)

    # upload data to /files APIs
    with tempfile.NamedTemporaryFile() as f1:
        f1.write(data1)
        f1.flush()
        started_ms = time_ms()
        item.upload(filepath=f1.name, remotepath=remotepath, direct=False)
        elapsed_ms = max(1, time_ms(started_ms))
        kb_sec = (size / 1024.0) / (elapsed_ms / 1000.0)
        logger.info(f"sdk upload (server): {size / MB_SIZE} MB in {elapsed_ms} ms, {kb_sec:.0f} KB/s")

    # download data from item's storage
    with tempfile.NamedTemporaryFile() as f3:
        started_ms = time_ms()
        item.download(remotepath, f3.name)
        elapsed_ms = max(1, time_ms(started_ms))
        kb_sec = (size / 1024.0) / (elapsed_ms / 1000.0)
        logger.info(f"sdk download (file): {size / MB_SIZE} MB in {elapsed_ms} ms, {kb_sec:.0f} KB/s")
        data3 = f3.read()
        self.assertEqual(data1, data3)
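# `id_generator` and MB_SIZE are used above but not defined in this section.
# Plausible sketches of both (assumptions, not the actual implementations):
import random
import string

MB_SIZE = 1024 * 1024  # bytes per megabyte

def id_generator(size=8, chars=string.ascii_lowercase + string.digits):
    """ Returns a random identifier of the given length """
    return "".join(random.choice(chars) for _ in range(size))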