Example #1
0
    def test_get_ml_job_result(self):
        """Verify a created ML Job's result record can be fetched back."""

        dataset = self.build_ml_job_dataset()
        # validate jwt tokens work
        self.login_user()

        # create the job (and its result record) through the ViewSet
        create_req = self.factory.post(self.ml_run_url,
                                       dataset,
                                       HTTP_AUTHORIZATION=self.jwt_auth,
                                       format="json")
        create_view = MLJobViewSet.as_view({"post": "create"})
        create_resp = create_view(create_req)

        self.assertEqual(MLJob.objects.count(), 1)
        self.assertEqual(MLJobResult.objects.count(), 1)
        self.assertEqual(create_resp.status_code, status.HTTP_201_CREATED)
        self.assertEqual(create_resp.data["job"]["user_name"],
                         self.test_username)
        self.assertEqual(create_resp.data["job"]["title"], dataset["title"])

        job_id = int(create_resp.data["job"]["id"])
        created_result = create_resp.data["results"]
        result_id = int(created_result["id"])
        error_data = created_result["error_data"]

        # fetch the result record back by its primary key
        fetch_req = self.factory.get(self.ml_get_result_url,
                                     HTTP_AUTHORIZATION=self.jwt_auth,
                                     format="json")
        fetch_view = MLJobResultViewSet.as_view({"get": "retrieve"})
        get_response = fetch_view(fetch_req, pk=result_id)
        print(ppj(get_response.data))

        self.assertEqual(get_response.data["user_name"], self.test_username)
        self.assertEqual(get_response.data["job_id"], job_id)
        self.assertEqual(get_response.data["id"], result_id)
        self.assertEqual(get_response.data["error_data"], error_data)
        self.assertContains(get_response, "model_json")
        self.assertContains(get_response, "model_weights")
        self.assertTrue(
            len(json.dumps(get_response.data["model_json"])) > 0)
        self.assertTrue(
            len(json.dumps(get_response.data["model_weights"])) > 0)
Example #2
0
    def build_dataset_regression_request(
        self,
        data_file=("./tests/datasets/regression/"
                   "dataset_prediction.json"),
        predict_rows_file=("./tests/datasets/regression/"
                           "stock.csv")):
        """Build a regression prediction request dictionary.

        Loads the dataset manifest from ``data_file``, attaches a
        unique label and the prediction rows parsed out of
        ``predict_rows_file``.

        :param data_file: path to the dataset manifest json file
        :param predict_rows_file: path to the csv file holding the
                                  rows to predict
        :return: dict ready to submit as a prediction request
        """

        predict_rows = self.build_prediction_rows(data_file=predict_rows_file)

        # json.load reads and parses in one step; the context manager
        # closes the file even if parsing fails
        with open(data_file) as cur_file:
            dataset_manifest = json.load(cur_file)

        prediction_req = dataset_manifest
        # unique label so repeated test runs do not collide
        prediction_req["label"] = "testing_{}".format(str(uuid.uuid4()))
        prediction_req["predict_rows"] = predict_rows

        if os.getenv("TEST_DEBUG", "0") == "1":
            print(ppj(prediction_req))

        return prediction_req
# Handle the POST response: fail fast on any non-2xx create status,
# otherwise report success and show how to train on the cleaned data.
if post_response.status_code not in (200, 201):
    log.error(("Failed with Post response status={} reason={}")
              .format(post_response.status_code,
                      post_response.reason))
    log.error("Details:\n{}".format(post_response.text))
    sys.exit(1)

log.info(("SUCCESS - Post Response status={} reason={}")
         .format(post_response.status_code,
                 post_response.reason))

as_json = True
record = {}
if as_json:
    record = json.loads(post_response.text)
    log.info(ppj(record))
if using_named_files or custom_output_dir:
    print("")
    print("Train a Neural Network with:")
    print(("./create-keras-dnn.py "
           "{} {}cleaned_{}").format(
                record["clean_file"],
                record["output_dir"],
                record["meta_suffix"]))
    print("")
# end of post for running an ML Job

sys.exit(0)
# Configure the Celery app from the Django settings module and let it
# discover tasks across the installed apps.
app.config_from_object("django.conf:settings", namespace="CELERY")
app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)

# load the payload that will be published to the core worker
datafile = "./drf_network_pipeline/tests/pubsub/publish-to-core.json"
with open(datafile, "r") as f:
    data = json.load(f)

# Celery task routing and queue
parent_route = "drf_network_pipeline.pipeline.tasks"
task_name = "{}.task_publish_to_core".format(parent_route)
queue_name = "{}.task_publish_to_core".format(parent_route)

log.info(("sending args={} to broker={} task={}").format(
    ppj(data),
    app.conf["BROKER_URL"],
    task_name))

# fire-and-forget publish - no result is awaited here
app.send_task(
    task_name,
    args=[data],
    queue=queue_name)
# Point Django at the pipeline settings before creating the Celery app.
os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                      "drf_network_pipeline.settings")

log.info("creating celery app")
app = Celery("test-app")
app.config_from_object("django.conf:settings", namespace="CELERY")
app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)

# load the get-user request payload
datafile = "./drf_network_pipeline/tests/pubsub/get-user.json"
with open(datafile, "r") as f:
    data = json.load(f)

# Celery task routing and queue
parent_route = "drf_network_pipeline.users.tasks"
task_name = "{}.task_get_user".format(parent_route)
queue_name = "{}.task_get_user".format(parent_route)

log.info(
    ("sending args={} to broker={} task={}").format(ppj(data),
                                                    app.conf["BROKER_URL"],
                                                    task_name))

# send the task and block on the result so it can be logged
task_res = app.send_task(task_name, args=[data], queue=queue_name)

log.info(("task={} task.id={} result={}").format(
    task_name, task_res.id, ppj(task_res.get())))
Example #6
0
 def show_diagnostics(self):
     """Log this consumer's name, loaded models and each received message."""
     log.info("{} - models={}".format(self.name, self.models))
     for idx, msg in enumerate(self.recv_msgs):
         log.info("msg={} contents={}".format(idx, ppj(msg)))
Example #7
0
def run_task(task_method=None,
             task_name="please-set-name",
             req_data=None,
             get_result=False,
             delay_timeout=settings.CELERY_GET_RESULT_TIMEOUT,
             use_cache=settings.CACHEOPS_ENABLED,
             cache_record=False,
             cache_key=None):
    """run_task

    Handles Celery sync/async task processing

    :param task_method: requested method
    :param task_name: name of the task for logging
    :param req_data: requested data
    :param get_result: get the result from task
    :param delay_timeout: seconds to wait for the task to finish
    :param use_cache: use the cached record if available
    :param cache_record: cache the result in redis after done
    :param cache_key: cache the result in this redis key
    """

    req_node = build_task_request(task_name=task_name,
                                  use_cache=use_cache,
                                  cache_record=cache_record,
                                  cache_key=cache_key,
                                  data=req_data)
    # default response in case handle_task_method raises before returning
    res_node = build_task_response(status=NOTRUN, data=None, err="not-run")

    try:

        res_node = handle_task_method(req_node=req_node,
                                      get_result=get_result,
                                      delay_timeout=delay_timeout,
                                      task_method=task_method)

        if "celery_enabled" not in res_node:
            log.error(("Invalid return node from task={} "
                       "task_method={} with req_node={} "
                       "returned data={}").format(task_name, task_method,
                                                  ppj(req_node),
                                                  ppj(res_node)))

        # NOTDONE is acceptable only when the caller did not ask
        # for the task result (fire-and-forget); both good paths
        # log the same message so the branches are merged here
        if (res_node["status"] == SUCCESS
                or (not get_result and res_node["status"] == NOTDONE)):
            log.info(("celery={} - running task with data={}").format(
                res_node["celery_enabled"],
                str(res_node["data"])[0:32]))
        else:
            # failure: clear the data and fold the original err
            # message into a task-scoped one
            res_node["data"] = None
            res_node["err"] = ("task={} method={} "
                               "status={} err={}").format(
                                   task_name, task_method, res_node["status"],
                                   res_node["err"])
            log.error(("Failed {}").format(res_node["err"]))
        # end of handling success/failure

    except Exception as e:
        res_node = build_task_response(
            status=ERR,
            data=None,
            err=("Failed to run {} celery={} "
                 "with ex={}").format(task_name,
                                      res_node.get("celery_enabled", None), e))
        log.error(res_node["err"])
    # try/ex handling Celery task

    return res_node
Example #8
0
    def run_train_and_predict(self, req):
        """run_train_and_predict

        Run a full train + predict cycle for one queued request via
        antinex_utils, log every sample prediction, optionally publish
        the results back to the REST API broker, and cache the final
        results on ``self.models`` keyed by the request's model name.

        :param req: message dict consumed from a queue
        :return: the result dict from ``make_predictions``
        """

        log.info(("{} loading predict_rows into a df").format(
            req["use_model_name"]))

        # the REST API can ask for the worker to publish
        # results to the broker details from the manifest
        # which is stored in the REST API db
        worker_result_node = None
        if "manifest" in req:
            worker_result_node = req["manifest"].get("worker_result_node",
                                                     None)
        # end of getting 'where to send the results' from
        # the manifest

        # NOTE(review): assumes req["predict_rows"] is a json string
        # pandas can parse - confirm against the producer
        predict_df = pd.read_json(req["predict_rows"])
        show_model_json = False
        try:
            # optional flag; anything that is not an int-like "1"
            # (including missing) leaves the flag False
            show_model_json = bool(int(req.get("show_model_json", "0")) == 1)
        except Exception as e:
            show_model_json = False
            log.error(("{} - Set show_model_json to 0 or 1 ex={}").format(
                req["label"], e))
        ml_type = req["ml_type"].lower()
        predict_feature = req["predict_feature"]
        predictions = []
        # run training + prediction in antinex-utils
        res = antinex_utils.make_predictions.make_predictions(req)
        if res["status"] == SUCCESS:
            log.info(("{} - processing results").format(req["label"]))
            res_data = res["data"]
            model = res_data["model"]
            acc_data = res_data["acc"]
            are_predictions_merged = res_data["are_predicts_merged"]
            predictions = res_data["sample_predictions"]

            accuracy = acc_data.get("accuracy", None)
            if are_predictions_merged:
                # merged path: predictions live in a dataframe with a
                # "_predicted_<feature>" column
                log.info(("{} - processing merged predictions").format(
                    req["label"]))
                merge_df = res_data["merge_df"]
                model_predict_feature = "_predicted_{}".format(predict_feature)
                if model_predict_feature not in merge_df:
                    log.error(
                        ("{} missing predicted feature={} "
                         "from res={}").format(req["label"],
                                               model_predict_feature, res))
                    return res

                for idx, row in merge_df.iterrows():
                    log.info(("cur_sample={} - {}={} predicted={}").format(
                        idx, predict_feature, float(row[predict_feature]),
                        float(row[model_predict_feature])))
                # same as the merge method in antinex-utils
            else:
                # unmerged path: per-sample prediction dicts; the log
                # format depends on regression vs classification
                for idx, node in enumerate(predictions):
                    label_name = None
                    if "label_name" in node:
                        label_name = node["label_name"]
                    org_feature = "_original_{}".format(predict_feature)
                    original_value = None
                    if org_feature in node:
                        original_value = node[org_feature]
                    if "regression" in ml_type:
                        log.info(("sample={} - {}={} predicted={}").format(
                            node["_row_idx"], predict_feature,
                            float(original_value),
                            float(node[predict_feature])))
                    elif "classification" in ml_type:
                        log.info(
                            ("sample={} - {}={} predicted={} label={}").format(
                                node["_row_idx"], predict_feature,
                                original_value, node[predict_feature],
                                label_name))
                    else:
                        log.info(("sample={} - {}={} predicted={}").format(
                            node["_row_idx"], predict_feature, original_value,
                            node[predict_feature]))
            # end of predicting predictions

            if show_model_json:
                log.info(("{} made predictions={} model={} ").format(
                    req["label"], len(predict_df.index),
                    ppj(json.loads(model.model.to_json()))))

            log.info(
                ("{} made predictions={} found={} "
                 "accuracy={}").format(req["label"], len(predict_df.index),
                                       len(res_data["sample_predictions"]),
                                       accuracy))

            # timestamped payload that is cached below and optionally
            # published back to the REST API
            final_results = {
                "data": res_data,
                "created":
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }

            if worker_result_node:

                log.info(("CORERES {} publishing results back to api").format(
                    req["label"]))

                status = send_results_to_broker(loc=worker_result_node,
                                                final_results=final_results)

                log.info(("CORERES {} publishing results success={}").format(
                    req["label"], bool(status == SUCCESS)))
            # end of sending the results back

            log.info(("{} - model={} finished processing").format(
                req["label"], req["use_model_name"]))

            # cache results by model name for later reuse/diagnostics
            self.models[req["use_model_name"]] = final_results
        else:
            log.info(
                ("{} failed predictions={}").format(req["label"],
                                                    len(predict_df.index)))
        # end of if good train and predict

        return res
def task_ml_prepare(self=None, req_node=None):
    """task_ml_prepare

    Find an MLPrepare record, run the csv build pipeline over its
    dataset glob, and save the generated file paths and rules back
    onto the record.

    :param self: parent task object for bind=True
    :param req_node: job utils dictionary for passing a dictionary
    :return: task response dict with status/err/data
    """

    log.info(("task - {} - start "
              "req_node={}").format(req_node["task_name"], ppj(req_node)))

    ml_prepare_data = req_node["data"].get("ml_prepare_data", None)

    user_obj = None
    ml_prepare_obj = None
    if req_node["use_cache"]:
        ml_prepare_obj = MLPrepare.objects.select_related().filter(
            Q(id=int(ml_prepare_data["id"]))).cache().first()
    else:
        ml_prepare_obj = MLPrepare.objects.select_related().filter(
            Q(id=int(ml_prepare_data["id"]))).first()
    # end of finding the MLPrepare record

    # placeholder flag - record creation is currently disabled
    create_new_record = False

    # create the response node from request
    res = build_task_response(use_cache=req_node["use_cache"],
                              celery_enabled=req_node["celery_enabled"],
                              cache_key=req_node["cache_key"])

    # response payload - initialized up front so the csv-failure
    # branch below can populate it without a NameError
    data = {}

    try:

        if create_new_record:
            create_res = create_ml_prepare_record(req_node=req_node)
            user_obj = create_res.get("user_obj", None)
            ml_prepare_obj = create_res.get("ml_prepare_obj", None)
            if not user_obj:
                res["status"] = ERR
                res["error"] = create_res.get("err", "error not set")
                res["data"] = None
                log.error(res["error"])
                return res
            if not ml_prepare_obj:
                res["status"] = ERR
                res["error"] = create_res.get("err", "error not set")
                res["data"] = None
                log.error(res["error"])
            # always return after attempting to create a new record
            return res
        # end of create_new_record

        last_step = ("starting user={} prepare={} "
                     "pipeline={} clean={} full={} "
                     "post={} label={} tracking={}").format(
                         ml_prepare_obj.user_id, ml_prepare_obj.id,
                         ml_prepare_obj.pipeline_files,
                         ml_prepare_obj.clean_file, ml_prepare_obj.full_file,
                         ml_prepare_obj.post_proc, ml_prepare_obj.label_rules,
                         ml_prepare_obj.tracking_id)
        log.info(last_step)

        log_id = "job={}".format(ml_prepare_obj.id)

        log.info(("prepare={} csvs={}").format(ml_prepare_obj.id,
                                               ml_prepare_obj.ds_glob_path))

        # find every csv under the record's dataset glob path
        ml_prepare_obj.pipeline_files = find_all_pipeline_csvs(
            use_log_id=log_id, csv_glob_path=ml_prepare_obj.ds_glob_path)

        log.info(
            ("preparing={} clean={} full={} "
             "meta_suffix={} files={}").format(ml_prepare_obj.id,
                                               ml_prepare_obj.clean_file,
                                               ml_prepare_obj.full_file,
                                               ml_prepare_obj.meta_suffix,
                                               ml_prepare_obj.pipeline_files))

        # build the full + cleaned csv files and metadata
        save_node = build_csv(use_log_id=log_id,
                              pipeline_files=ml_prepare_obj.pipeline_files,
                              fulldata_file=ml_prepare_obj.full_file,
                              clean_file=ml_prepare_obj.clean_file,
                              post_proc_rules=ml_prepare_obj.post_proc,
                              label_rules=ml_prepare_obj.label_rules,
                              meta_suffix=ml_prepare_obj.meta_suffix)

        if save_node["status"] == VALID:

            log.info("successfully processed datasets:")

            # persist the generated rules and file paths on the record
            ml_prepare_obj.post_proc = save_node["post_proc_rules"]
            ml_prepare_obj.post_proc["features_to_process"] = \
                save_node["features_to_process"]
            ml_prepare_obj.post_proc["ignore_features"] = \
                save_node["ignore_features"]
            ml_prepare_obj.post_proc["feature_to_predict"] = \
                save_node["feature_to_predict"]
            ml_prepare_obj.label_rules = save_node["label_rules"]
            ml_prepare_obj.pipeline_files = save_node["pipeline_files"]
            ml_prepare_obj.full_file = save_node["fulldata_file"]
            ml_prepare_obj.clean_file = save_node["clean_file"]
            ml_prepare_obj.status = "finished"
            ml_prepare_obj.control_state = "finished"
            ml_prepare_obj.save()
            log.info(("saved prepare={}").format(ml_prepare_obj.id))

            if ev("SHOW_SUMMARY", "0") == "1":
                log.info(("Full csv: {}").format(save_node["fulldata_file"]))
                log.info(("Full meta: {}").format(
                    save_node["fulldata_metadata_file"]))
                log.info(("Clean csv: {}").format(save_node["clean_file"]))
                log.info(("Clean meta: {}").format(
                    save_node["clean_metadata_file"]))
                log.info("------------------------------------------")
                log.info(("Predicting Feature: {}").format(
                    save_node["feature_to_predict"]))
                log.info(("Features to Process: {}").format(
                    ppj(save_node["features_to_process"])))
                log.info(("Ignored Features: {}").format(
                    ppj(save_node["ignore_features"])))
                log.info("------------------------------------------")
            # end of show summary

            log.info("Full: {}".format(save_node["fulldata_file"]))
            log.info("Cleaned (no-NaNs in columns): {}".format(
                save_node["clean_file"]))
            data = ml_prepare_obj.get_public()
            res["status"] = SUCCESS
            res["err"] = ""
            res["data"] = data
        else:
            last_step = ("failed to prepare csv status={} "
                         "errors: {}").format(save_node["status"],
                                              save_node["err"])
            log.error(last_step)
            ml_prepare_obj.status = "error"
            ml_prepare_obj.control_state = "error"
            ml_prepare_obj.save()
            # data is the dict initialized before the try block; the
            # original code referenced it here without defining it,
            # which raised a NameError that masked the real error
            data["prepare"] = ml_prepare_obj.get_public()
            data["ready"] = {}
            res["status"] = ERR
            res["error"] = last_step
            res["data"] = data
            return res
        # end of checking it started

    except Exception as e:
        res["status"] = ERR
        res["err"] = ("Failed task={} with "
                      "ex={}").format(req_node["task_name"], e)
        res["data"] = None
        log.error(res["err"])
    # end of try/ex

    log.info(("task - {} - done").format(req_node["task_name"]))

    return res
def task_ml_job(self=None, req_node=None):
    """task_ml_job

    :param self: parent task object for bind=True
    :param req_node: job utils dictionary for passing a dictionary
    """

    log.info(("task - {} - start "
              "req_node={}").format(req_node["task_name"], ppj(req_node)))

    user_data = req_node["data"].get("user_data", None)
    ml_job = req_node["data"].get("ml_job_data", None)
    ml_result = req_node["data"].get("ml_result_data", None)
    model_desc = req_node["data"].get("model_desc", None)
    label_rules = req_node["data"].get("label_rules", None)
    predict_rows = req_node["data"].get("predict_rows", None)

    user_res = db_lookup_user(user_id=user_data["id"])
    user_obj = user_res.get("user_obj", None)
    ml_job_id = None
    ml_result_id = None
    ml_job_obj = None
    found_predictions = []
    found_accuracy = None

    if req_node["use_cache"]:
        ml_job_obj = MLJob.objects.select_related().filter(
            Q(id=int(ml_job["id"])) & Q(user=user_obj)).cache().first()
    else:
        ml_job_obj = MLJob.objects.select_related().filter(
            Q(id=int(ml_job["id"])) & Q(user=user_obj)).first()
    # end of finding the MLJob record

    ml_result_obj = None
    if req_node["use_cache"]:
        ml_result_obj = MLJobResult.objects.select_related().filter(
            Q(id=int(ml_result["id"])) & Q(user=user_obj)).cache().first()
    else:
        ml_result_obj = MLJobResult.objects.select_related().filter(
            Q(id=int(ml_result["id"])) & Q(user=user_obj)).first()
    # end of finding the MLJobResult record

    res = build_task_response(use_cache=req_node["use_cache"],
                              celery_enabled=req_node["celery_enabled"],
                              cache_key=req_node["cache_key"])

    last_step = "not started"
    data = {}
    data["job"] = {}
    data["results"] = {}
    try:

        res["status"] = ERR
        res["error"] = ""

        predict_manifest = ml_job_obj.predict_manifest
        csv_file = predict_manifest.get("csv_file", None)
        meta_file = predict_manifest.get("meta_file", None)
        epochs = int(predict_manifest.get("epochs", "5"))
        test_size = float(predict_manifest.get("test_size", "0.2"))
        batch_size = int(predict_manifest.get("batch_size", "32"))
        verbose = int(predict_manifest.get("verbose", "1"))

        # use pre-trained models in memory by label
        use_model_name = ml_job_obj.predict_manifest.get(
            "use_model_name", None)
        dataset = ml_job_obj.predict_manifest.get("dataset", None)
        predict_rows = ml_job_obj.predict_manifest.get("predict_rows", None)
        predict_feature = ml_job_obj.predict_manifest.get(
            "predict_feature", None)
        features_to_process = ml_job_obj.predict_manifest.get(
            "features_to_process", None)
        ignore_features = ml_job_obj.predict_manifest.get(
            "ignore_features", None)
        publish_to_core = ml_job_obj.predict_manifest.get(
            "publish_to_core", None)
        apply_scaler = ml_job_obj.predict_manifest.get("apply_scaler", True)
        sort_values = ml_job_obj.predict_manifest.get("sort_values", None)
        max_records = int(
            ml_job_obj.predict_manifest.get("max_records", "100000"))
        loss = ml_job_obj.predict_manifest.get("loss", "binary_crossentropy")
        metrics = ml_job_obj.predict_manifest.get("metrics", ["accuracy"])
        optimizer = ml_job_obj.predict_manifest.get("optimizer", "adam")
        histories = ml_job_obj.predict_manifest.get(
            "histories", ["val_loss", "val_acc", "loss", "acc"])

        needs_local_builder = True
        if ((dataset or predict_rows) and features_to_process):
            log.info(("using antinex builder dataset={} predict_rows={} "
                      "features_to_process={}").format(dataset, predict_rows,
                                                       features_to_process))

            needs_local_builder = False
        # flag for bypassing build inside django instead of antinex-utils

        image_file = ml_result_obj.acc_image_file
        version = ml_job_obj.version

        ml_job_id = ml_job_obj.id
        ml_result_id = ml_result_obj.id

        last_step = ("starting user={} "
                     "job.id={} result.id={} predict={} "
                     "model_desc={} "
                     "csv={} meta={}").format(ml_job_obj.user.id, ml_job_id,
                                              ml_result_id,
                                              ml_job_obj.predict_feature,
                                              model_desc, csv_file, meta_file)
        log.info(last_step)

        ml_job_obj.status = "analyzing"
        ml_job_obj.save()

        if needs_local_builder:

            log.info("starting local build_training_request")

            ml_req = build_training_request(
                csv_file=csv_file,
                meta_file=meta_file,
                predict_feature=ml_job_obj.predict_feature,
                test_size=test_size)

            if ml_req["status"] != VALID:
                last_step = ("Stopping for status={} "
                             "errors: {}").format(ml_req["status"],
                                                  ml_req["err"])
                log.error(last_step)
                ml_job_obj.status = "error"
                ml_job_obj.control_state = "error"
                log.info(("saving job={}").format(ml_job_id))
                ml_job_obj.save()
                data["job"] = ml_job_obj.get_public()
                error_data = {"status": ml_req["status"], "err": ml_req["err"]}
                data["results"] = error_data
                res["status"] = ERR
                res["error"] = last_step
                res["data"] = data
                return res
            else:

                predict_manifest["ignore_features"] = \
                    ml_req.get("ignore_features", [])
                predict_manifest["features_to_process"] = \
                    ml_req.get("features_to_process", [])
                if label_rules:
                    predict_manifest["label_rules"] = \
                        label_rules
                else:
                    predict_manifest["label_rules"] = \
                        ml_req["meta_data"]["label_rules"]
                predict_manifest["post_proc_rules"] = \
                    ml_req["meta_data"]["post_proc_rules"]
                predict_manifest["version"] = version

                last_step = ("job.id={} built_training_request={} "
                             "predict={} features={} ignore={} "
                             "label_rules={} post_proc={}").format(
                                 ml_job_obj.id, ml_req["status"],
                                 predict_manifest["predict_feature"],
                                 predict_manifest["features_to_process"],
                                 predict_manifest["ignore_features"],
                                 predict_manifest["label_rules"],
                                 predict_manifest["post_proc_rules"])

                log.info(last_step)

                if ml_job_obj.ml_type == "regression":
                    log.info(("using Keras - regression - "
                              "sequential model ml_type={}").format(
                                  ml_job_obj.ml_type))

                    loss = "mse"
                    metrics = ["mse", "mae", "mape", "cosine"]

                    histories = [
                        "mean_squared_error", "mean_absolute_error",
                        "mean_absolute_percentage_error", "cosine_proximity"
                    ]
                else:
                    log.info(("using Keras - sequential model "
                              "ml_type={}").format(ml_job_obj.ml_type))
                # end of classification vs regression

                ml_job_obj.predict_manifest["epochs"] = epochs
                ml_job_obj.predict_manifest["batch_size"] = batch_size
                ml_job_obj.predict_manifest["verbose"] = verbose
                ml_job_obj.predict_manifest["loss"] = loss
                ml_job_obj.predict_manifest["metrics"] = metrics
                ml_job_obj.predict_manifest["optimizer"] = optimizer
                ml_job_obj.predict_manifest["histories"] = histories
                ml_job_obj.predict_manifest = predict_manifest

        # end of updating without antinex-utils
        # end of if needs_local_builder:

        ml_job_obj.status = "started"
        ml_job_obj.save()

        scores = None
        prediction_req = {
            "label": "job_{}_result_{}".format(ml_job_id, ml_result_id),
            "manifest": ml_job_obj.predict_manifest,
            "model_json": ml_result_obj.model_json,
            "model_desc": model_desc,
            "weights_json": ml_result_obj.model_weights,
        }

        if dataset:
            prediction_req["dataset"] = dataset
        if max_records:
            prediction_req["max_records"] = max_records
        if predict_rows:
            prediction_req["predict_rows"] = json.dumps(predict_rows)
        if features_to_process:
            prediction_req["features_to_process"] = features_to_process
        if ignore_features:
            prediction_req["ignore_features"] = ignore_features
        if apply_scaler:
            prediction_req["apply_scaler"] = apply_scaler
        if sort_values:
            prediction_req["sort_values"] = sort_values
        if loss:
            prediction_req["loss"] = loss
        if metrics:
            prediction_req["metrics"] = metrics
        if optimizer:
            prediction_req["optimizer"] = optimizer
        if histories:
            prediction_req["histories"] = histories
        if predict_feature:
            prediction_req["predict_feature"] = predict_feature
        if csv_file:
            prediction_req["csv_file"] = csv_file
        if meta_file:
            prediction_req["meta_file"] = meta_file

        already_predicted = False

        # if you just want to use the core without django training:
        if publish_to_core or settings.ANTINEX_WORKER_ONLY:
            log.info(("model_name={} only publish={} worker={}").format(
                use_model_name, publish_to_core, settings.ANTINEX_WORKER_ONLY))
            ml_job_obj.status = "launched"
            ml_job_obj.control_state = "launched"
            ml_job_obj.save()
            ml_result_obj.status = "launched"
            ml_result_obj.control_state = "launched"
            ml_result_obj.save()
        else:
            log.info(
                ("start make_predictions req={}").format(ppj(prediction_req)))

            prediction_res = make_predictions(req=prediction_req)

            if prediction_res["status"] != SUCCESS:
                last_step = ("Stopping for prediction_status={} "
                             "errors: {}").format(prediction_res["status"],
                                                  prediction_res["err"])
                log.error(last_step)
                ml_job_obj.status = "error"
                ml_job_obj.control_state = "error"
                log.info(("saving job={}").format(ml_job_id))
                ml_job_obj.save()
                data["job"] = ml_job_obj.get_public()
                error_data = {
                    "status": prediction_res["status"],
                    "err": prediction_res["err"]
                }
                data["results"] = error_data
                res["status"] = ERR
                res["error"] = last_step
                res["data"] = data
                return res

            already_predicted = True
            res_data = prediction_res["data"]
            model = res_data["model"]
            model_weights = res_data["weights"]
            scores = res_data["scores"]
            acc_data = res_data["acc"]
            error_data = res_data["err"]
            predictions_json = {
                "predictions":
                json.loads(
                    pd.Series(res_data["sample_predictions"]).to_json(
                        orient="records"))
            }
            found_predictions = res_data["sample_predictions"]
            found_accuracy = acc_data.get("accuracy", None)

            last_step = ("job={} accuracy={}").format(ml_job_id,
                                                      scores[1] * 100)
            log.info(last_step)

            ml_job_obj.status = "finished"
            ml_job_obj.control_state = "finished"
            ml_job_obj.save()
            log.info(("saved job={}").format(ml_job_id))

            data["job"] = ml_job_obj.get_public()
            acc_data = {"accuracy": scores[1] * 100}
            error_data = None
            log.info(("converting job={} model to json").format(ml_job_id))
            model_json = json.loads(model.to_json())
            log.info(("saving job={} weights_file={}").format(
                ml_job_id, ml_result_obj.model_weights_file))

            log.info(("building job={} results").format(ml_job_id))

            ml_result_obj.status = "finished"
            ml_result_obj.acc_data = acc_data
            ml_result_obj.error_data = error_data
            ml_result_obj.model_json = model_json
            ml_result_obj.model_weights = model_weights
            ml_result_obj.acc_image_file = image_file
            ml_result_obj.predictions_json = predictions_json
            ml_result_obj.version = version
        # end of handing off to core worker without a database connection

        log.info(("saving job={} results").format(ml_job_id))

        # OpenShift 9.6 Postgres container killed the worker
        # here. Interested to see if this is a jsonb/jsonfield problem
        # 2018-05-20
        try:
            ml_result_obj.save()
        except Exception as e:
            res["error"] = ("Failed saving model job.id={} with ex={}").format(
                ml_job_id, e)
            res["status"] = ERR
            res["data"] = data
            log.error(res["error"])
            return res
        # end try/ex

        log.info(("done saving job={} results").format(ml_job_id))
        data["job"] = ml_job_obj.get_public()
        data["results"] = ml_result_obj.get_public()
        res["status"] = SUCCESS
        res["error"] = ""
        res["data"] = data

        if settings.ANTINEX_WORKER_ENABLED and not already_predicted:

            if use_model_name:
                prediction_req["label"] = use_model_name

            log.info(("publishing to core use_model_name={} "
                      "worker={} already_predicted={}").format(
                          use_model_name, settings.ANTINEX_WORKER_ENABLED,
                          already_predicted))

            publish_req = {"body": prediction_req}
            if settings.CELERY_ENABLED:
                task_publish_to_core.delay(publish_node=publish_req)
            else:
                task_publish_to_core(publish_node=publish_req)
        else:
            log.info(("skip - worker={} already_predicted={}").format(
                settings.ANTINEX_WORKER_ENABLED, already_predicted))
        # send to core

    except Exception as e:
        res["status"] = ERR
        res["err"] = ("Failed task={} with "
                      "ex={}").format(req_node["task_name"], e)
        if ml_job_obj:
            data["job"] = ml_job_obj.get_public()
        else:
            data["job"] = None

        if ml_result_obj:
            data["results"] = ml_result_obj.get_public()
        else:
            data["results"] = None
        log.error(res["err"])
    # end of try/ex

    log.info(
        ("task - {} - done - "
         "ml_job.id={} ml_result.id={} "
         "accuracy={} predictions={}").format(req_node["task_name"], ml_job_id,
                                              ml_result_id, found_accuracy,
                                              len(found_predictions)))

    return res
def _headers_metadata(headers, output_type, pipeline_files, post_proc_rules,
                      label_rules, features_to_process, feature_to_predict,
                      ignore_features):
    """_headers_metadata

    Build the metadata dict that is written next to a generated csv
    file (both the "fulldata" and the "clean" variants share this
    layout).

    :param headers: iterable of column names in the csv
    :param output_type: "fulldata" or "clean"
    :param pipeline_files: list of source csv files
    :param post_proc_rules: post processing rules used for this build
    :param label_rules: labeling rules used for this build
    :param features_to_process: feature columns for training
    :param feature_to_predict: target column name (or None)
    :param ignore_features: columns excluded from training
    """
    return {
        "headers": list(headers),
        "output_type": output_type,
        "pipeline_files": pipeline_files,
        "post_proc_rules": post_proc_rules,
        "label_rules": label_rules,
        "features_to_process": features_to_process,
        "feature_to_predict": feature_to_predict,
        "ignore_features": ignore_features,
        "created": rnow()
    }


def _write_metadata_file(path, header_data):
    """_write_metadata_file

    Write ``header_data`` as pretty-printed json to ``path``.

    :param path: output file path
    :param header_data: dict built by ``_headers_metadata``
    """
    with open(path, "w") as otfile:
        otfile.write(str(ppj(header_data)))


def _merge_rows(pipeline_files, hdrs, mark_default_value, label_rules,
                log_id):
    """_merge_rows

    Merge every pipeline csv into one list of row dicts that all share
    the common header set, optionally tagging each row with a
    classification label.

    :param pipeline_files: list of csv files to merge
    :param hdrs: dict of common header name to default cell value
    :param mark_default_value: value used to fill empty/missing cells
    :param label_rules: optional labeling rules (classification only)
    :param log_id: label for tracking the job in the logs
    """
    set_if_above = None
    labels = []
    label_values = []
    if label_rules:
        set_if_above = label_rules["set_if_above"]
        labels = label_rules["labels"]
        label_values = label_rules["label_values"]

    all_rows = []
    num_done = 1
    total_files = len(pipeline_files)
    for c in pipeline_files:
        log.info(("{} merging={}/{} csv={}").format(log_id, num_done,
                                                    total_files, c))
        cf = pd.read_csv(c)
        if mark_default_value:
            log.info(("{} filling nan with value={}").format(
                log_id, mark_default_value))
            cf.fillna(value=mark_default_value, inplace=True)
        # end of making sure fillna is done if requested

        log.info(("{} processing rows={}").format(log_id, len(cf.index)))
        for _, row in cf.iterrows():
            new_row = copy.deepcopy(hdrs)
            new_row["src_file"] = c
            for k in hdrs:
                if k in row:
                    new_row[k] = row[k]
                elif mark_default_value:
                    new_row[k] = mark_default_value
            # end of for all headers to copy in

            if label_rules:
                # randomly tag the row with one of the two labels;
                # a "set above" greater than 100 tags the entire
                # dataset with just label index 0 - nice if your
                # data is the same
                use_idx = 1 if random.randint(0, 100) > set_if_above else 0
                new_row["label_value"] = label_values[use_idx]
                new_row["label_name"] = labels[use_idx]
            # end of applying label rules

            all_rows.append(new_row)
        # end of for all rows in this file

        num_done += 1
    # end of building all files into one list
    return all_rows


def _write_clean_outputs(df, log_id, fulldata_metadata_file, clean_file,
                         clean_metadata_file, pipeline_files,
                         post_proc_rules, label_rules, feature_to_predict,
                         ignore_features, save_node):
    """_write_clean_outputs

    Derive the feature columns, write the fulldata metadata file, the
    cleaned csv file and the clean metadata file, and record headers on
    ``save_node``. Returns the list of features to process (without the
    predict feature appended).

    :param df: merged fulldata DataFrame
    :param log_id: label for tracking the job in the logs
    :param fulldata_metadata_file: path for the fulldata metadata json
    :param clean_file: output path for the cleaned csv
    :param clean_metadata_file: path for the clean metadata json
    :param pipeline_files: list of source csv files
    :param post_proc_rules: post processing rules
    :param label_rules: labeling rules
    :param feature_to_predict: target column name (or None)
    :param ignore_features: columns excluded from training
    :param save_node: result dict mutated with header lists
    """
    ignore_set = set(ignore_features)
    # for all df columns we're not ignoring... add them as features
    features_to_process = [
        d for d in df.columns.values if d not in ignore_set
    ]

    log.info(("{} writing fulldata metadata file={}").format(
        log_id, fulldata_metadata_file))
    header_data = _headers_metadata(
        headers=df.columns.values,
        output_type="fulldata",
        pipeline_files=pipeline_files,
        post_proc_rules=post_proc_rules,
        label_rules=label_rules,
        features_to_process=features_to_process,
        feature_to_predict=feature_to_predict,
        ignore_features=ignore_features)

    save_node["full_headers"] = list(df.columns.values)
    _write_metadata_file(fulldata_metadata_file, header_data)

    # fix: copy the list so appending the predict feature does not
    # leak into the features_to_process list already written to the
    # metadata file and later stored on save_node
    keep_these = list(features_to_process)
    if feature_to_predict:
        keep_these.append(feature_to_predict)

    log.info(("{} creating new clean_file={} "
              "keep_these={} "
              "predict={}").format(log_id, clean_file, keep_these,
                                   feature_to_predict))

    # need to remove all columns that are all nan
    if "keep_nans" not in post_proc_rules:
        clean_df = df[keep_these].dropna(axis=1, how='all').dropna()
    else:
        # allow keeping empty columns
        clean_df = df[keep_these].dropna(axis=1, how='all')

    log.info(("{} clean_df colums={} rows={}").format(
        log_id, clean_df.columns.values, len(clean_df.index)))

    if len(clean_df.columns.values) == 0:
        log.error(("{} The clean df has no columns").format(log_id))
    if len(clean_df.index) == 0:
        log.error(("{} The clean df has no rows").format(log_id))

    cleaned_to_process = []
    cleaned_ignore_features = []
    for c in clean_df.columns.values:
        if feature_to_predict:
            # NOTE(review): when a predict feature is set, only the
            # predict column is ignored and cleaned_to_process stays
            # empty - preserved from the original implementation,
            # confirm this is intentional
            if c == feature_to_predict:
                cleaned_ignore_features.append(c)
        elif c in ignore_set:
            cleaned_ignore_features.append(c)
        else:
            cleaned_to_process.append(c)
    # end of new feature columns

    log.info(("{} writing DROPPED clean_file={} "
              "features_to_process={} "
              "ignore_features={} "
              "predict={}").format(log_id, clean_file,
                                   cleaned_to_process,
                                   cleaned_ignore_features,
                                   feature_to_predict))

    write_clean_df = clean_df.drop(columns=cleaned_ignore_features)
    log.info(("{} cleaned_df rows={}").format(
        log_id, len(write_clean_df.index)))
    write_clean_df.to_csv(clean_file,
                          sep=',',
                          encoding='utf-8',
                          index=False)

    log.info(("{} writing clean metadata file={}").format(
        log_id, clean_metadata_file))
    header_data = _headers_metadata(
        headers=write_clean_df.columns.values,
        output_type="clean",
        pipeline_files=pipeline_files,
        post_proc_rules=post_proc_rules,
        label_rules=label_rules,
        features_to_process=cleaned_to_process,
        feature_to_predict=feature_to_predict,
        ignore_features=cleaned_ignore_features)

    save_node["clean_headers"] = list(write_clean_df.columns.values)
    _write_metadata_file(clean_metadata_file, header_data)

    return features_to_process


def build_csv(pipeline_files=None,
              fulldata_file=None,
              clean_file=None,
              post_proc_rules=None,
              label_rules=None,
              use_log_id=None,
              meta_suffix="metadata.json"):
    """build_csv

    Merge a list of pipeline csv files into one fulldata csv, then
    write a cleaned csv (nan columns dropped, ignored features removed)
    plus a metadata json file next to each output.

    Fixes vs the previous version: ``pipeline_files`` no longer uses a
    mutable default, ``post_proc_rules=None`` no longer crashes on the
    ``mark_empty`` lookup, and the returned ``features_to_process`` no
    longer has the predict feature accidentally appended to it.

    :param pipeline_files: list of files to process
    :param fulldata_file: output of non-edited merged data
    :param clean_file: cleaned csv file should be ready for training
    :param post_proc_rules: apply these rules to post processing (clean)
    :param label_rules: apply labeling rules (classification only)
    :param use_log_id: label for tracking the job in the logs
    :param meta_suffix: file suffix
    """

    # avoid the shared mutable-default-argument pitfall
    pipeline_files = pipeline_files if pipeline_files is not None else []

    save_node = {
        "status": INVALID,
        "pipeline_files": pipeline_files,
        "post_proc_rules": post_proc_rules,
        "label_rules": label_rules,
        "fulldata_file": fulldata_file,
        "fulldata_metadata_file": None,
        "clean_file": clean_file,
        "clean_metadata_file": None,
        "features_to_process": [],
        "feature_to_predict": None,
        "ignore_features": [],
        "full_headers": [],
        "clean_headers": [],
        "df_json": {},
        "version": 1
    }

    log_id = use_log_id if use_log_id else ""

    # both output paths are required
    if not fulldata_file:
        log.error("missing fulldata_file - stopping")
        save_node["status"] = INVALID
        return save_node
    if not clean_file:
        log.error("missing clean_file - stopping")
        save_node["status"] = INVALID
        return save_node

    # metadata files live in the same directory as their csv
    fulldata_metadata_file = "{}/fulldata_{}".format(
        "/".join(fulldata_file.split("/")[:-1]), meta_suffix)
    clean_metadata_file = "{}/cleaned_{}".format(
        "/".join(clean_file.split("/")[:-1]), meta_suffix)

    log.info(("{} build_csv - START").format(log_id))

    common_headers, \
        headers_dict = find_all_headers(
                            use_log_id=log_id,
                            pipeline_files=pipeline_files)

    log.info(
        ("{} num common_headers={} headers={}").format(log_id,
                                                       len(common_headers),
                                                       common_headers))

    # since the headers can be different we rebuild a new one:
    mark_default_value = None
    # fix: post_proc_rules defaults to None - guard before the
    # membership test instead of crashing with a TypeError
    if post_proc_rules and "mark_empty" in post_proc_rules:
        mark_default_value = post_proc_rules["mark_empty"]
        log.info(("{} using mark_empty={}").format(log_id,
                                                   mark_default_value))

    hdrs = {h: mark_default_value for h in common_headers}

    features_to_process = []
    feature_to_predict = None
    ignore_features = []

    if post_proc_rules and "predict_feature" in post_proc_rules:
        feature_to_predict = post_proc_rules["predict_feature"]
    if not feature_to_predict and "label_name" in hdrs:
        feature_to_predict = "label_name"

    all_rows = _merge_rows(pipeline_files=pipeline_files,
                           hdrs=hdrs,
                           mark_default_value=mark_default_value,
                           label_rules=label_rules,
                           log_id=log_id)

    log.info(
        ("{} fulldata rows={} generating df").format(log_id, len(all_rows)))

    df = pd.DataFrame(all_rows)
    log.info(("{} df rows={} headers={}").format(log_id, len(df.index),
                                                 df.columns.values))

    if ev("CONVERT_DF", "0") == "1":
        log.info(("{} converting df to json").format(log_id))
        save_node["df_json"] = df.to_json()

    # clean_file and fulldata_file were validated above, so the
    # fulldata csv is always written at this point
    log.info(("{} writing fulldata_file={}").format(log_id, fulldata_file))
    df.to_csv(fulldata_file, sep=',', encoding='utf-8', index=False)
    log.info(
        ("{} done writing fulldata_file={}").format(log_id, fulldata_file))

    if post_proc_rules:

        ignore_features = []
        if label_rules and feature_to_predict:
            ignore_features = [feature_to_predict]

        # post proc can filter more features out
        if "drop_columns" in post_proc_rules:
            for p in post_proc_rules["drop_columns"]:
                if p in headers_dict:
                    ignore_features.append(p)

        features_to_process = _write_clean_outputs(
            df=df,
            log_id=log_id,
            fulldata_metadata_file=fulldata_metadata_file,
            clean_file=clean_file,
            clean_metadata_file=clean_metadata_file,
            pipeline_files=pipeline_files,
            post_proc_rules=post_proc_rules,
            label_rules=label_rules,
            feature_to_predict=feature_to_predict,
            ignore_features=ignore_features,
            save_node=save_node)

        save_node["clean_file"] = clean_file
        save_node["clean_metadata_file"] = clean_metadata_file

        log.info(
            ("{} done writing clean_file={}").format(log_id, clean_file))
    # end of post_proc_rules

    save_node["fulldata_file"] = fulldata_file
    save_node["fulldata_metadata_file"] = fulldata_metadata_file
    save_node["status"] = VALID
    # end of writing the file

    save_node["features_to_process"] = features_to_process
    save_node["feature_to_predict"] = feature_to_predict
    save_node["ignore_features"] = ignore_features

    log.info(("{} build_csv - END").format(log_id))

    return save_node
# Example #12
"""
Standalone publisher: load a recorded publish-to-core request payload
and send it straight to the Celery broker's task queue, bypassing the
django REST API.
"""

# fix: the snippet referenced json/os/Celery without importing them
import json
import os

from celery import Celery

# TODO(review): confirm the logger factory's module path against the
# other publisher scripts in this repo
from spylunking.log.setup_logging import build_colorized_logger

from antinex_utils.utils import ppj

name = 'send-worker-publish-to-core'
log = build_colorized_logger(name=name)

log.info("creating celery app")
app = Celery("test-decoupled-app")

# broker is configurable through the environment with a local
# redis fallback (db 9)
broker_settings = {
    "broker_url":
    os.getenv("ANTINEX_REST_API_BROKER_URL", "redis://localhost:6379/9")
}
app.conf.update(**broker_settings)

# sample request payload recorded by the django test suite
datafile = "../webapp/drf_network_pipeline/tests/pubsub/publish-to-core.json"
with open(datafile, "r") as f:
    data = json.load(f)

# Celery task routing and queue
parent_route = "drf_network_pipeline.pipeline.tasks"
task_name = ("{}.task_publish_to_core").format(parent_route)
queue_name = ("{}.task_publish_to_core").format(parent_route)

# NOTE(review): conf was set with lowercase "broker_url" above;
# reading the uppercase alias relies on Celery's compat setting map
log.info(
    ("sending args={} to broker={} task={}").format(ppj(data),
                                                    app.conf["BROKER_URL"],
                                                    task_name))

app.send_task(task_name, args=[data], queue=queue_name)