def retrieve(self, request, pk=None):
    """
    GET request for the data of a dataset, specified by dataset id
    :param request: GET request, containing the dataset id
    :param pk: Dataset id
    :return: Dataset data and relevant statistics
    """
    if pk:
        try:
            dataset = Dataset.objects.get(pk=pk)
        except Dataset.DoesNotExist:
            return Response("No dataset found for id: {}".format(pk),
                            status=status.HTTP_400_BAD_REQUEST)
        if not IsOwner().has_object_permission(request, self, dataset):
            return Response(status=status.HTTP_401_UNAUTHORIZED)
        serializer = self.serializer_class(dataset, many=False)
        response_data = serializer.data
        m = Metadata(dataset)
        meta = m.get_metadata("DatasetMetadata")
        response = "Response"
        if meta:
            response_data["metadata"] = meta
            # Fall back to the default label if the metadata lacks a target.
            response = meta.get("target", response)
        response_data["data"] = load_dataset(pk)
        if response not in response_data["data"]:
            response = response_data["data"].columns.tolist()[0]
        response_data["statistics"] = DatasetStatistics(
            response_data["data"]).calculate_statistics(response)
        return Response(response_data, status=status.HTTP_200_OK)
    return Response("Required id for the dataset was not found.",
                    status=status.HTTP_400_BAD_REQUEST)
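# A minimal sketch of exercising the retrieve endpoint above through DRF's
# test client. The /api/datasets/<pk>/ route, the dataset id, and the test
# user are assumptions for illustration, not part of the original code.
from django.contrib.auth.models import User
from rest_framework.test import APIClient

owner = User.objects.create_user("tester")  # hypothetical owner of dataset 42
client = APIClient()
client.force_authenticate(user=owner)
resp = client.get("/api/datasets/42/")
if resp.status_code == 200:
    print(resp.data["metadata"])    # DatasetMetadata, when set
    print(resp.data["statistics"])  # column statistics for the response variable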
def update(self, request, pk=None):
    """
    PUT request to update a dataset
    :param request: PUT request
    :param pk: dataset ID to be updated
    :return: 200/details of updated dataset, 400/bad request, or 401/unauthorized
    """
    dataset_inputs = load_request(request)
    serializer = self.serializer_class(data=dataset_inputs,
                                       context={'request': request})
    if serializer.is_valid() and pk is not None:
        try:
            original_dataset = Dataset.objects.get(id=int(pk))
        except Dataset.DoesNotExist:
            return Response("No dataset found for id: {}".format(pk),
                            status=status.HTTP_400_BAD_REQUEST)
        if IsOwner().has_object_permission(request, self, original_dataset):
            amodel = serializer.update(original_dataset,
                                       serializer.validated_data)
            # Guard against requests that omit the optional metadata field.
            if "metadata" not in dataset_inputs.keys():
                dataset_inputs["metadata"] = None
            m = Metadata(amodel, dataset_inputs["metadata"])
            meta = m.set_metadata("DatasetMetadata")
            if amodel:
                response_status = status.HTTP_201_CREATED
                response_data = serializer.data
                response_data["id"] = amodel.id
                del response_data["data"]
                if meta:
                    response_data["metadata"] = meta
                if int(pk) == amodel.id:
                    response_status = status.HTTP_200_OK
                return Response(response_data, status=response_status)
        else:
            return Response(status=status.HTTP_401_UNAUTHORIZED)
    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def execute_task(df, model_id, model_name, dataset_id):
    logger.info(
        "Starting VB task -------- Model ID: {}; Model Type: {}; step 1/{}"
        .format(model_id, model_name, step_count[model_name]))
    DaskTasks.update_status(model_id, "Loading and validating data",
                            "1/{}".format(step_count[model_name]))
    dataset_m = Metadata(parent=Dataset.objects.get(
        id=dataset_id)).get_metadata("DatasetMetadata")
    target = dataset_m.get("response", "Response")
    attributes = dataset_m.get("attributes")
    y = df[target]
    if attributes:
        attributes_list = json.loads(attributes.replace("\'", "\""))
        x = df[attributes_list]
    else:
        x = df.drop(target, axis=1)
    logger.info("Model ID: {}, loading hyper-parameters step 2/{}".format(
        model_id, step_count[model_name]))
    DaskTasks.update_status(model_id, "Loading hyper-parameters",
                            "2/{}".format(step_count[model_name]))
    parameters = Metadata(parent=AnalyticalModel.objects.get(
        id=model_id)).get_metadata("ModelMetadata")
    if model_name == "lra":
        DaskTasks.execute_lra(model_id, parameters, x, y,
                              step_count[model_name])
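# The `attributes` metadata value is stored as a Python-style single-quoted
# list rendered to a string, so the code above swaps quote characters before
# handing it to json.loads. A short sketch of the pattern, with
# ast.literal_eval as a more robust alternative when the value is a Python
# literal; the sample list below is illustrative only.
import ast
import json

attributes = "['tide', 'water_temp', 'turbidity']"  # example metadata value

# Pattern used above: rewrite single quotes as double quotes, parse as JSON.
attributes_list = json.loads(attributes.replace("\'", "\""))

# Equivalent, and safer if a value ever contains an embedded quote:
attributes_list = ast.literal_eval(attributes)
print(attributes_list)  # ['tide', 'water_temp', 'turbidity']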
def create(self, request):
    """
    POST request that creates a new Pipeline.
    :param request: POST request
    :return: New pipeline object
    """
    pipeline_inputs = load_request(request)
    serializer = self.serializer_class(data=pipeline_inputs,
                                       context={'request': request})
    try:
        project = Project.objects.get(id=int(pipeline_inputs["project"]))
    except Project.DoesNotExist:
        return Response("No project found for id: {}".format(
            int(pipeline_inputs["project"])),
            status=status.HTTP_400_BAD_REQUEST)
    if project.owner != request.user:
        return Response(status=status.HTTP_401_UNAUTHORIZED)
    if serializer.is_valid():
        serializer.save()
        pipeline = serializer.data
        if "metadata" not in pipeline_inputs.keys():
            pipeline_inputs["metadata"] = None
        a = Pipeline.objects.get(pk=int(pipeline["id"]))
        m = Metadata(a, pipeline_inputs["metadata"])
        meta = m.set_metadata("PipelineMetadata")
        pipeline["metadata"] = meta
        if pipeline:
            return Response(pipeline, status=status.HTTP_201_CREATED)
    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def retrieve(self, request, pk=None):
    """
    GET request for the data of a dataset, specified by dataset id
    :param request: GET request, containing the dataset id
    :param pk: Dataset id
    :return: Dataset data and relevant statistics
    """
    if pk:
        try:
            dataset = Dataset.objects.get(pk=pk)
        except Dataset.DoesNotExist:
            return Response("No dataset found for id: {}".format(pk),
                            status=status.HTTP_400_BAD_REQUEST)
        serializer = self.serializer_class(dataset, many=False)
        response_data = serializer.data
        m = Metadata(dataset)
        meta = m.get_metadata("DatasetMetadata")
        response = "Response"
        if meta:
            response_data["metadata"] = meta
            response = meta.get("response", response)
        response_data["data"] = pd.read_csv(
            StringIO(bytes(dataset.data).decode()))
        if response not in response_data["data"]:
            response = response_data["data"].columns.tolist()[0]
        response_data["statistics"] = DatasetStatistics(
            response_data["data"]).calculate_statistics(response)
        return Response(response_data, status=status.HTTP_200_OK)
    return Response("Required id for the dataset was not found.",
                    status=status.HTTP_400_BAD_REQUEST)
def update(self, request, pk=None):
    """
    PUT request to update an analytical model.
    :param request: PUT request
    :param pk: analytical model ID to be updated
    :return: 200/details of updated model, 400/bad request, or 401/unauthorized
    """
    amodel_inputs = request.data.dict()
    serializer = self.serializer_class(data=amodel_inputs,
                                       context={'request': request})
    if serializer.is_valid() and pk is not None:
        try:
            original_amodel = AnalyticalModel.objects.get(id=int(pk))
        except AnalyticalModel.DoesNotExist:
            return Response(
                "No analytical model found for id: {}".format(pk),
                status=status.HTTP_400_BAD_REQUEST)
        if IsOwnerOfWorkflowChild().has_object_permission(
                request, self, original_amodel):
            amodel = serializer.update(original_amodel,
                                       serializer.validated_data)
            if amodel:
                response_status = status.HTTP_201_CREATED
                response_data = serializer.data
                response_data["id"] = amodel.id
                if int(pk) == amodel.id:
                    response_status = status.HTTP_200_OK
                # Guard against requests that omit the optional metadata
                # field, and attach metadata to the model instance rather
                # than the inputs dict.
                if "metadata" not in amodel_inputs.keys():
                    amodel_inputs["metadata"] = None
                m = Metadata(amodel, amodel_inputs["metadata"])
                m.set_metadata("ModelMetadata")
                response_data["metadata"] = m.get_metadata("ModelMetadata")
                return Response(response_data, status=response_status)
        else:
            return Response(status=status.HTTP_401_UNAUTHORIZED)
    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def create(self, request):
    """
    POST request that creates a new Dataset.
    :param request: POST request
    :return: New dataset
    """
    dataset_inputs = request.data.dict()
    serializer = self.serializer_class(data=dataset_inputs,
                                       context={'request': request})
    if serializer.is_valid():
        serializer.save()
        dataset = serializer.data
        if dataset:
            d = Dataset.objects.get(id=dataset["id"])
            if "metadata" not in dataset_inputs.keys():
                dataset_inputs["metadata"] = None
            m = Metadata(d, dataset_inputs["metadata"])
            meta = m.set_metadata("DatasetMetadata")
            response = "Response"
            if meta:
                dataset["metadata"] = meta
                response = meta.get("response", response)
            data = pd.read_csv(StringIO(bytes(d.data).decode()))
            if response not in data:
                response = data.columns.tolist()[0]
            dataset["statistics"] = DatasetStatistics(
                data).calculate_statistics(response)
            del dataset["data"]
            return Response(dataset, status=status.HTTP_201_CREATED)
    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def update(self, request, pk=None):
    """
    PUT request to update a dataset.
    :param request: PUT request
    :param pk: dataset ID to be updated
    :return: 200/details of updated dataset, 400/bad request, or 401/unauthorized
    """
    dataset_inputs = request.data.dict()
    serializer = self.serializer_class(data=dataset_inputs,
                                       context={'request': request})
    if serializer.is_valid() and pk is not None:
        try:
            original_dataset = Dataset.objects.get(id=int(pk))
        except Dataset.DoesNotExist:
            return Response("No dataset found for id: {}".format(pk),
                            status=status.HTTP_400_BAD_REQUEST)
        if IsOwnerOfWorkflowChild().has_object_permission(
                request, self, original_dataset):
            amodel = serializer.update(original_dataset,
                                       serializer.validated_data)
            # Guard against requests that omit the optional metadata field.
            if "metadata" not in dataset_inputs.keys():
                dataset_inputs["metadata"] = None
            m = Metadata(amodel, dataset_inputs["metadata"])
            meta = m.set_metadata("DatasetMetadata")
            if amodel:
                response_status = status.HTTP_201_CREATED
                response_data = serializer.data
                response_data["id"] = amodel.id
                del response_data["data"]
                if meta:
                    response_data["metadata"] = meta
                if int(pk) == amodel.id:
                    response_status = status.HTTP_200_OK
                return Response(response_data, status=response_status)
        else:
            return Response(status=status.HTTP_401_UNAUTHORIZED)
    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def update(self, request, pk=None):
    """
    PUT request to update an existing location.
    :param request: PUT request
    :param pk: Location ID
    :return: 200/details of updated location, 400/bad request, or 401/unauthorized
    """
    dataset_inputs = load_request(request)
    serializer = self.serializer_class(data=dataset_inputs,
                                       context={'request': request})
    if serializer.is_valid() and pk is not None:
        try:
            original_location = Location.objects.get(id=int(pk))
        except Location.DoesNotExist:
            return Response("No location found for id: {}".format(pk),
                            status=status.HTTP_400_BAD_REQUEST)
        if original_location.owner == request.user:
            location = serializer.update(original_location,
                                         serializer.validated_data)
            if location:
                location_data = serializer.data
                # Guard against requests that omit the optional metadata field.
                if "metadata" not in dataset_inputs.keys():
                    dataset_inputs["metadata"] = None
                m = Metadata(location, dataset_inputs["metadata"])
                meta = m.set_metadata("LocationMetadata")
                if meta:
                    location_data["metadata"] = meta
                request_status = status.HTTP_201_CREATED
                if int(pk) == location.id:
                    request_status = status.HTTP_200_OK
                return Response(location_data, status=request_status)
        else:
            return Response(status=status.HTTP_401_UNAUTHORIZED)
    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def update(self, request, pk=None):
    """
    PUT request for updating a project.
    :param request: PUT request
    :param pk: Project ID
    :return: The updated project/200, 400/bad request, or 401/unauthorized
    """
    dataset_inputs = load_request(request)
    serializer = self.serializer_class(data=dataset_inputs,
                                       context={'request': request})
    if serializer.is_valid() and pk is not None:
        try:
            project = Project.objects.get(id=int(pk))
        except Project.DoesNotExist:
            return Response("No project found for id: {}".format(pk),
                            status=status.HTTP_400_BAD_REQUEST)
        if IsOwner().has_object_permission(request, self, project):
            project = serializer.update(project, serializer.validated_data)
            if "metadata" not in dataset_inputs.keys():
                dataset_inputs["metadata"] = None
            m = Metadata(project, dataset_inputs["metadata"])
            meta = m.set_metadata("ProjectMetadata")
            response_data = serializer.data
            if meta:
                response_data["metadata"] = meta
            return Response(response_data, status=status.HTTP_200_OK)
        else:
            return Response(status=status.HTTP_401_UNAUTHORIZED)
    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def set_prediction_estimators(project_id, model_id, selected_models: dict):
    project = Project.objects.get(id=int(project_id))
    model = Model.objects.get(id=int(model_id))
    m = load_model(model.id, model.model)
    model_metadata = Metadata(parent=model).get_metadata("ModelMetadata")
    m.prediction_model_type = model_metadata.get("prediction_model_type",
                                                 "single")
    m.refitPredictionModels(selected_models=selected_models)
    m.save(n=4, model_id=model_id, message="Model selection")
def refit_model(self, request):
    input_data = load_request(request)
    required_parameters = ["project_id", "model_id", "predictive_models"]
    if set(required_parameters).issubset(input_data.keys()):
        permissions = []
        try:
            project = Project.objects.get(id=int(input_data["project_id"]))
            if not IsOwnerOfProject().has_object_permission(
                    request, self, project):
                permissions.append("Unauthorized to access project.")
        except Project.DoesNotExist:
            project = None
        try:
            model = Model.objects.get(id=int(input_data["model_id"]))
            if not IsOwnerOfModel().has_object_permission(
                    request, self, model):
                permissions.append("Unauthorized to access model.")
        except Model.DoesNotExist:
            model = None
        if len(permissions) > 0:
            return Response(permissions,
                            status=status.HTTP_401_UNAUTHORIZED)
        if model is None or project is None:
            message = []
            if project is None:
                message.append("No project found for id: {}".format(
                    input_data["project_id"]))
            if model is None:
                message.append("No model found for id: {}".format(
                    input_data["model_id"]))
            return Response(", ".join(message),
                            status=status.HTTP_400_BAD_REQUEST)
        # predictive_models arrives as a JSON list of [name, value] pairs.
        p_models = {}
        for p in json.loads(input_data["predictive_models"]):
            p_models[p[0]] = int(p[1])
        m = Metadata(model, json.dumps({"predictive_models": p_models}))
        meta = m.set_metadata("ModelMetadata")
        DaskTasks.refit_task(project.id, model.id, p_models)
        response = {
            "project_id": project.id,
            "pipeline_id": model.pipeline.id,
            "model_id": model.id,
            "dataset_id": project.dataset,
            "model_metadata": meta,
            "status": "Initiated refit for specified models for prediction"
        }
        return Response(response, status=status.HTTP_200_OK)
    data = "Missing required parameters: {}".format(
        ", ".join(required_parameters))
    return Response(data, status=status.HTTP_400_BAD_REQUEST)
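# For reference, a sketch of the payload shape refit_model expects:
# "predictive_models" is a JSON-encoded list of [estimator_name, value]
# pairs that the loop above folds into a dict. The names and values here
# are illustrative assumptions, not taken from the original code.
import json

payload = {
    "project_id": 3,
    "model_id": 7,
    "predictive_models": json.dumps([["gradient-boosting-reg", 1],
                                     ["linear-svr", 0]]),
}
# The view parses this into:
# p_models == {"gradient-boosting-reg": 1, "linear-svr": 0}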
def list(self, request):
    """
    GET request that lists all the locations owned by the user.
    :param request: GET request
    :return: List of locations
    """
    locations = Location.objects.filter(owner=request.user)
    # TODO: Add ACL access objects
    serializer = self.serializer_class(locations, many=True)
    response_data = serializer.data
    for location_data in response_data:
        loc = Location.objects.get(pk=int(location_data["id"]))
        m = Metadata(loc, None)
        location_data["metadata"] = m.get_metadata("LocationMetadata")
    return Response(response_data, status=status.HTTP_200_OK)
def update_status(_id, status, stage, message=None, retry=5, log=None):
    if _id == -1:
        return
    # Stop recursing once the retry budget is exhausted.
    if retry == 0:
        return
    meta = 'PipelineMetadata'
    try:
        amodel = Pipeline.objects.get(id=int(_id))
        m = Metadata(parent=amodel,
                     metadata=json.dumps({"status": status,
                                          "stage": stage,
                                          "message": message}))
        m.set_metadata(meta)
        if log:
            logger.info(log)
        # save() is required to persist the log entry; constructing the
        # model instance alone does not write it to the database.
        PipelineLog(parent=amodel,
                    logtype=status,
                    log=f"Stage: {stage}, Message: {message}",
                    timestamp=str(datetime.datetime.now().timestamp())).save()
    except Exception as ex:
        logger.warning("Error attempting to save status update: {}".format(ex))
        update_status(_id, status, stage, None, retry - 1)
def list(self, request, pk=None):
    """
    GET request that lists all the projects
    :param request: GET request
    :return: List of projects
    """
    projects = Project.objects.filter(owner=request.user)
    # TODO: Add ACL access objects
    serializer = self.serializer_class(projects, many=True)
    response_data = serializer.data
    for d in response_data:
        p = Project.objects.get(id=d["id"])
        m = Metadata(p, None)
        d["metadata"] = m.get_metadata("ProjectMetadata")
    # Return the list augmented with metadata above.
    return Response(response_data, status=status.HTTP_200_OK)
def predict(self, request):
    input_data = load_request(request)
    required_parameters = ["project_id", "model_id", "data"]
    if set(required_parameters).issubset(input_data.keys()):
        permissions = []
        try:
            project = Project.objects.get(id=int(input_data["project_id"]))
            if not IsOwnerOfProject().has_object_permission(
                    request, self, project):
                permissions.append("Unauthorized to access project.")
        except Project.DoesNotExist:
            project = None
        try:
            model = Model.objects.get(id=int(input_data["model_id"]))
            if not IsOwnerOfModel().has_object_permission(
                    request, self, model):
                permissions.append("Unauthorized to access model.")
        except Model.DoesNotExist:
            model = None
        if len(permissions) > 0:
            return Response(permissions,
                            status=status.HTTP_401_UNAUTHORIZED)
        if model is None or project is None:
            message = []
            if project is None:
                message.append("No project found for id: {}".format(
                    input_data["project_id"]))
            if model is None:
                message.append("No model found for id: {}".format(
                    input_data["model_id"]))
            return Response(", ".join(message),
                            status=status.HTTP_400_BAD_REQUEST)
        data = str(input_data["data"])
        results = DaskTasks.predict(project.id, model.id, data)
        m = Metadata(model)
        response = {
            "project_id": project.id,
            "pipeline_id": model.pipeline.id,
            "model_id": model.id,
            "model_metadata": m.get_metadata("ModelMetadata"),
            "dataset_id": project.dataset,
            "results": results
        }
        return Response(response, status=status.HTTP_200_OK)
    data = "Missing required parameters: {}".format(
        ", ".join(required_parameters))
    return Response(data, status=status.HTTP_400_BAD_REQUEST)
def update(self, request, pk=None):
    """
    PUT request to update an existing pipeline.
    :param request: PUT request
    :param pk: Pipeline ID
    :return: 200/details of updated pipeline, 400/bad request, or 401/unauthorized
    """
    pipeline_inputs = load_request(request)
    serializer = self.serializer_class(data=pipeline_inputs,
                                       context={'request': request})
    try:
        project = Project.objects.get(id=int(pipeline_inputs["project"]))
    except Project.DoesNotExist:
        return Response("No project found for id: {}".format(
            int(pipeline_inputs["project"])),
            status=status.HTTP_400_BAD_REQUEST)
    if project.owner != request.user:
        return Response(status=status.HTTP_401_UNAUTHORIZED)
    if serializer.is_valid() and pk is not None:
        try:
            original_pipeline = Pipeline.objects.get(id=int(pk))
        except Pipeline.DoesNotExist:
            return Response("No pipeline found for id: {}".format(pk),
                            status=status.HTTP_400_BAD_REQUEST)
        if IsOwnerOfPipeline().has_object_permission(
                request, self, original_pipeline):
            experiment = serializer.update(original_pipeline,
                                           serializer.validated_data)
            if experiment:
                response_status = status.HTTP_201_CREATED
                response_data = serializer.data
                response_data["id"] = experiment.id
                if int(pk) == experiment.id:
                    response_status = status.HTTP_200_OK
                if "metadata" not in pipeline_inputs.keys():
                    pipeline_inputs["metadata"] = None
                a = Pipeline.objects.get(pk=experiment.id)
                m = Metadata(a, pipeline_inputs["metadata"])
                response_data["metadata"] = m.set_metadata(
                    "PipelineMetadata")
                return Response(response_data, status=response_status)
        else:
            return Response(status=status.HTTP_401_UNAUTHORIZED)
    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def update_status(_id, status, stage, message=None, retry=5):
    # Stop recursing once the retry budget is exhausted; the original
    # `pass` here allowed the recursion to continue indefinitely.
    if retry == 0:
        return
    meta = 'ModelMetadata'
    try:
        amodel = AnalyticalModel.objects.get(id=int(_id))
        m = Metadata(parent=amodel,
                     metadata=json.dumps({
                         "status": status,
                         "stage": stage,
                         "message": message
                     }))
        m.set_metadata(meta)
    except Exception as ex:
        logger.warning(
            "Error attempting to save metadata update: {}".format(ex))
        DaskTasks.update_status(_id, status, stage, None, retry - 1)
def destroy(self, request, pk=None):
    """
    DEL to delete an existing dataset specified by dataset ID
    :param request: DEL request
    :param pk: dataset ID to be deleted
    :return: 200/success, 400/bad request, or 401/unauthorized
    """
    if pk is not None:
        try:
            dataset = Dataset.objects.get(id=int(pk))
        except Dataset.DoesNotExist:
            return Response("No dataset found for id: {}".format(pk),
                            status=status.HTTP_400_BAD_REQUEST)
        if IsOwnerOfWorkflowChild().has_object_permission(
                request, self, dataset):
            m = Metadata(dataset)
            m.delete_metadata("DatasetMetadata")
            dataset.delete()
            return Response(status=status.HTTP_200_OK)
        else:
            return Response(status=status.HTTP_401_UNAUTHORIZED)
    return Response("No dataset 'id' in request.",
                    status=status.HTTP_400_BAD_REQUEST)
def list(self, request):
    """
    GET request that lists all the pipelines for a specific project id
    :param request: GET request, containing the project id as 'project'
    :return: List of pipelines
    """
    if 'project' in self.request.query_params.keys():
        pipelines = Pipeline.objects.filter(
            project=int(self.request.query_params.get('project')))
        serializer = self.serializer_class(pipelines, many=True)
        response_data = serializer.data
        for pipeline_data in response_data:
            a = Pipeline.objects.get(pk=int(pipeline_data["id"]))
            m = Metadata(a, None)
            pipeline_data["metadata"] = m.get_metadata("PipelineMetadata")
        return Response(response_data, status=status.HTTP_200_OK)
    return Response(
        "Required 'project' parameter for the pipeline was not found.",
        status=status.HTTP_400_BAD_REQUEST)
def create(self, request):
    """
    POST request that creates a new project.
    :param request: POST request
    :return: New project object
    """
    dataset_inputs = load_request(request)
    serializer = self.serializer_class(data=dataset_inputs,
                                       context={'request': request})
    if serializer.is_valid():
        serializer.save()
        project = serializer.data
        p = Project.objects.get(id=project["id"])
        if "metadata" not in dataset_inputs.keys():
            dataset_inputs["metadata"] = None
        m = Metadata(p, dataset_inputs["metadata"])
        meta = m.set_metadata("ProjectMetadata")
        project["metadata"] = meta
        return Response(project, status=status.HTTP_201_CREATED)
    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def set_prediction_estimators(project_id, model_id, selected_models: dict):
    project = Project.objects.get(id=int(project_id))
    dataset = Dataset.objects.get(id=project.dataset)
    df = load_dataset(dataset.id, dataset)
    project_metadata = Metadata(
        parent=project).get_metadata("ProjectMetadata")
    target_label = project_metadata.get("target", "response")
    features_label = project_metadata.get("features")
    target = df[target_label]
    if features_label:
        features_list = json.loads(features_label.replace("\'", "\""))
        features = df[features_list]
    else:
        features = df.drop(target_label, axis=1)
    model = Model.objects.get(id=int(model_id))
    m = load_model(model.id, model.model)
    # TODO: update predictive_model_type from model metadata
    m.refitPredictiveModels(selected_models=selected_models,
                            y_df=target, x_df=features)
    m.save(n=4, model_id=model_id)
def create(self, request):
    """
    POST request that creates a new analytical model.
    :param request: POST request
    :return: New analytical object
    """
    amodel_inputs = request.data.dict()
    serializer = self.serializer_class(data=amodel_inputs,
                                       context={'request': request})
    if serializer.is_valid():
        serializer.save()
        amodel_data = serializer.data
        # Guard against requests that omit the optional metadata field, and
        # attach metadata to the saved model instance rather than the
        # serialized data dict (mirroring the Dataset create endpoint).
        if "metadata" not in amodel_inputs.keys():
            amodel_inputs["metadata"] = None
        amodel = AnalyticalModel.objects.get(id=amodel_data["id"])
        m = Metadata(amodel, amodel_inputs["metadata"])
        m.set_metadata("ModelMetadata")
        amodel_data["metadata"] = m.get_metadata("ModelMetadata")
        if amodel_data:
            return Response(amodel_data, status=status.HTTP_201_CREATED)
    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def destroy(self, request, pk=None):
    """
    DEL to delete an existing dataset specified by dataset ID
    :param request: DEL request
    :param pk: dataset ID to be deleted
    :return: 200/success, 400/bad request, or 401/unauthorized
    """
    if pk is not None:
        try:
            dataset = Dataset.objects.get(id=int(pk))
        except Dataset.DoesNotExist:
            return Response("No dataset found for id: {}".format(pk),
                            status=status.HTTP_400_BAD_REQUEST)
        if IsOwner().has_object_permission(request, self, dataset):
            m = Metadata(dataset)
            m.delete_metadata("DatasetMetadata")
            dataset.delete()
            return Response(status=status.HTTP_200_OK)
        else:
            return Response(status=status.HTTP_401_UNAUTHORIZED)
    return Response("No dataset 'id' in request.",
                    status=status.HTTP_400_BAD_REQUEST)
def data(self, request):
    inputs = request.data.dict()
    required_parameters = ["workflow_id", "model_id"]
    if set(required_parameters).issubset(inputs.keys()):
        try:
            workflow = Workflow.objects.get(id=int(inputs["workflow_id"]))
        except ObjectDoesNotExist:
            workflow = None
        try:
            amodel = AnalyticalModel.objects.get(
                id=int(inputs["model_id"]))
        except ObjectDoesNotExist:
            amodel = None
        if workflow is None or amodel is None:
            # list.append returns None, so build the message list with
            # explicit statements rather than conditional expressions.
            message = []
            if workflow is None:
                message.append("No workflow found for id: {}".format(
                    inputs["workflow_id"]))
            if amodel is None:
                message.append(
                    "No analytical model found for id: {}".format(
                        inputs["model_id"]))
            return Response(",".join(message),
                            status=status.HTTP_400_BAD_REQUEST)
        elif IsOwnerOfLocationChild().has_object_permission(
                request, self, workflow):
            response = {}
            meta = Metadata(parent=amodel)
            metadata = meta.get_metadata("ModelMetadata",
                                         ['status', 'stage', 'message'])
            response["metadata"] = metadata
            completed = False
            if "stage" in metadata.keys():
                i = metadata["stage"].split("/")
                if int(i[0]) == int(i[1]):
                    completed = True
            if completed and amodel.model:
                data = None
                if "data" in inputs.keys():
                    data = pd.read_csv(StringIO(inputs["data"]))
                response["data"] = DaskTasks.make_prediction(amodel.id,
                                                             data)
            response["dataset_id"] = amodel.dataset
            response["analytical_model_id"] = amodel.id
            response["workflow_id"] = workflow.id
            return Response(response, status=status.HTTP_200_OK)
        else:
            return Response(status=status.HTTP_401_UNAUTHORIZED)
    data = "Missing required parameters: {}".format(
        ", ".join(required_parameters))
    return Response(data, status=status.HTTP_400_BAD_REQUEST)
def create(self, request):
    """
    POST request that creates a new location.
    :param request: POST request
    :return: New location object
    """
    dataset_inputs = load_request(request)
    serializer = self.serializer_class(data=dataset_inputs,
                                       context={'request': request})
    # TODO: Add project existence and ownership check
    if serializer.is_valid():
        location = serializer.save()
        location_data = serializer.data
        if "metadata" not in dataset_inputs.keys():
            dataset_inputs["metadata"] = None
        m = Metadata(location, dataset_inputs["metadata"])
        meta = m.set_metadata("LocationMetadata")
        if meta:
            location_data["metadata"] = meta
        if location:
            return Response(location_data, status=status.HTTP_201_CREATED)
    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def make_prediction(amodel_id, data=None):
    amodel = AnalyticalModel.objects.get(id=int(amodel_id))
    dataset = Dataset.objects.get(id=int(amodel.dataset))
    df = pd.read_csv(StringIO(bytes(dataset.data).decode()))
    dataset_m = Metadata(parent=dataset).get_metadata("DatasetMetadata")
    target = dataset_m.get("response", "Response")
    attributes = dataset_m.get("attributes")
    y = df[target]
    if attributes:
        attributes_list = json.loads(attributes.replace("\'", "\""))
        x = df[attributes_list]
    else:
        x = df.drop(target, axis=1)
    t = LinearRegressionAutomatedVB()
    t.set_data(x, y)
    x_train = t.x_train
    y_train = t.y_train
    x_data = t.x_test
    y_test = t.y_test.to_numpy().flatten()
    if data is not None:
        x_data = data
    model = pickle.loads(amodel.model)
    response = {
        "results": model.predict(x_data),
        "train_score": model.score(x_train, y_train)
    }
    if data is None:
        response["residuals"] = y_test - response["results"]
        response["test_score"] = model.score(x_data, y_test)
    return response
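# A sketch of calling make_prediction directly with new observations. The
# model id and column names below are assumptions; the frame must carry the
# same attribute columns the stored model was trained on. Passing `data`
# returns predictions and the training score only, while omitting it scores
# the unpickled model against the stored test split.
from io import StringIO

import pandas as pd

new_obs = pd.read_csv(StringIO("tide,water_temp,turbidity\n0.4,18.2,5.1\n"))
out = DaskTasks.make_prediction(7, data=new_obs)  # hypothetical model id
print(out["results"])      # predictions for the supplied rows
print(out["train_score"])  # score of the stored model on its training split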
def execute_task(project_id, dataset_id, pipeline_id):
    # STAGE 1 - Data and parameter load from db
    update_status(pipeline_id,
                  "Data and Model Setup: Retrieving dataset and pipeline",
                  "1/{}".format(pre_processing_steps),
                  log="Pipeline: {}, Type: {}, Setup: 1/{}".format(
                      pipeline_id, None, pre_processing_steps),
                  message="Cross validation")
    project = Project.objects.get(id=int(project_id))
    dataset = Dataset.objects.get(id=int(dataset_id))
    pipeline = Pipeline.objects.get(id=int(pipeline_id))
    project.dataset = int(dataset_id)
    project.save()
    df = load_dataset(dataset_id, dataset)
    dataset_metadata = Metadata(
        parent=dataset).get_metadata("DatasetMetadata")
    pipeline_metadata = Metadata(
        parent=pipeline).get_metadata("PipelineMetadata")
    project_metadata = Metadata(
        parent=project).get_metadata("ProjectMetadata")
    # Dataset metadata takes precedence over project metadata for the target
    # and feature labels; fall back to "target" when neither provides one.
    target_label = project_metadata.get("target")
    features_label = project_metadata.get("features")
    if "target" in dataset_metadata.keys():
        target_label = dataset_metadata["target"]
    elif target_label is None:
        target_label = "target"
    if "features" in dataset_metadata.keys():
        features_label = dataset_metadata["features"]
    if features_label is None or features_label == "*":
        features_label = list(df.columns)
        features_label.remove(target_label)
    else:
        features_label = json.loads(features_label)
    drop_vars = ([] if "drop_features" not in project_metadata.keys()
                 else json.loads(project_metadata["drop_features"]
                                 .replace("\'", "\"")))
    for d in drop_vars:
        features_label.remove(d)

    # STAGE 2 - Data prep
    update_status(pipeline_id, "Data and Model Setup: Loading data",
                  "2/{}".format(pre_processing_steps),
                  log="Pipeline: {}, Type: {}, Setup: 2/{}".format(
                      pipeline_id, pipeline.name, pre_processing_steps),
                  message="Cross validation")
    target = df[target_label].to_frame()
    if features_label:
        features = df[features_label]
    else:
        features = df.drop(target_label, axis=1)

    # STAGE 3 - VBHelper execution
    update_status(pipeline_id,
                  "Data and Model Setup: Loading all parameters and settings",
                  "3/{}".format(pre_processing_steps),
                  log="Pipeline: {}, Type: {}, Setup: 3/{}".format(
                      pipeline_id, pipeline.name, pre_processing_steps),
                  message="Cross validation")
    pipeline_metadata = pipeline_metadata or {}
    # Default to an empty dict (not None) so pipeline_id can be assigned.
    if "parameters" in pipeline_metadata.keys():
        vbhelper_parameters = json.loads(
            pipeline_metadata["parameters"].replace("'", "\""))
    else:
        vbhelper_parameters = {}
    vbhelper_parameters["pipeline_id"] = pipeline_id
    outer_cv = pipeline_metadata.get("outer_cv", "True")
    try:
        vbhelper = VBHelper(**vbhelper_parameters)
        if "estimators" in pipeline_metadata.keys():
            est_str = pipeline_metadata["estimators"].replace("\'", "\"")
            estimators = json.loads(est_str)
        else:
            update_status(pipeline_id,
                          "Error: VB Helper requires an estimator.",
                          "-1/{}".format(pre_processing_steps),
                          log="Pipeline: {}, Type: {}, Setup: -1/{}".format(
                              pipeline_id, pipeline.name,
                              pre_processing_steps),
                          message="Cross validation")
            return
        vbhelper.setData(X_df=features, y_df=target)
        inner_cv_dict = {'cv_reps': 1, 'cv_folds': 5,
                         'cv_strategy': ('quantile', 5)}
        inner_cv = vbhelper.getCV(cv_dict=inner_cv_dict)
        # prep_dict = {'cat_approach': 'together',
        #              'impute_strategy': 'IterativeImputer',
        #              'cat_idx': vbhelper.cat_idx}
        prep_dict = {'cat_approach': 'together',
                     'impute_strategy': 'impute_middle',
                     'cat_idx': vbhelper.cat_idx}
        pipe_kwargs = dict(do_prep=not vbhelper.run_stacked,
                           prep_dict=prep_dict,
                           inner_cv=inner_cv,
                           cat_idx=vbhelper.cat_idx,
                           float_idx=vbhelper.float_idx,
                           bestT=False)
        estimators_dict = {}
        e_i = 0
        for e in estimators:
            name = (e["name"] if "name" in e.keys()
                    else e["type"] + "-{}".format(e_i))
            # De-duplicate estimator names with "-1", "-2", ... suffixes.
            n_i = 1
            n_name = name
            while n_name in estimators_dict.keys():
                n_name = name + "-{}".format(n_i)
                n_i += 1
            name = n_name
            estimator = DaskTasks.get_estimator(e["type"])
            e_kwargs = copy.copy(pipe_kwargs)
            for k, p in e["parameters"].items():
                e_kwargs[k] = p
            estimators_dict[name] = {"pipe": estimator,
                                     "pipe_kwargs": e_kwargs}
            e_i += 1
        vbhelper.setPipeDict(estimators_dict)
        vbhelper.setModelDict()
        if outer_cv == "True":
            vbhelper.runCrossValidate(verbose=True)
            vbhelper.buildCVScoreDict()
        else:
            # TODO: check processing for non-outer-cv instance for data cleanup
            vbhelper.fitEstimators()
        try:
            model = Model.objects.get(pipeline=pipeline)
            model_id = model.id
        except Model.DoesNotExist:
            model_id = None
        vbhelper.save(message="Completed.")
    except Exception as e:
        update_status(pipeline_id,
                      "Error: Unknown error executing pipeline",
                      "-1/{}".format(pre_processing_steps),
                      log="Pipeline: {}, Type: {}, Error: {}".format(
                          pipeline_id, pipeline.name, e),
                      message="Cross validation")
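# A stand-alone illustration of the estimator-name de-duplication in
# execute_task above: duplicate names pick up "-1", "-2", ... suffixes until
# they are unique. The estimator types here are illustrative only.
names = {}
for e_type in ["lasso", "lasso", "gbr"]:
    name, n_i = e_type, 1
    while name in names:
        name = "{}-{}".format(e_type, n_i)
        n_i += 1
    names[name] = {}
print(list(names))  # ['lasso', 'lasso-1', 'gbr']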
class DataExploration:

    def __init__(self, dataset_id):
        # TODO: replace the need for the project_id with providing the target variable
        self.dataset_id = dataset_id
        self.dataset = Dataset.objects.get(pk=dataset_id)
        self.df = load_dataset(dataset_id, self.dataset)
        self.dataset_metadata = Metadata(
            parent=self.dataset).get_metadata("DatasetMetadata")
        self.target_label = self.dataset_metadata.get("target", "target")
        self.features_label = self.dataset_metadata.get("features")
        if self.features_label is None or self.features_label == "*":
            self.features_label = list(self.df.columns)
            self.features_label.remove(self.target_label)
        else:
            self.features_label = json.loads(self.features_label)
        self.y_df = self.df[self.target_label].to_frame()
        self.X_df = self.df[self.features_label]
        self.vbhelper = VBHelper(pipeline_id=-1)
        self.vbhelper.setData(X_df=self.X_df, y_df=self.y_df)

    def _summary_data(self):
        # Shared setup for the VBSummary views below; factored out of the
        # four methods that previously repeated this call verbatim.
        return VBHelper.saveFullFloatXy(
            X_df=self.X_df, y_df=self.y_df,
            X_df_s=self.vbhelper.X_df_start_order,
            y_df_s=self.vbhelper.y_df_start_order)

    def get_missing_vals(self):
        vbs = VBSummary()
        vbs.setData(self._summary_data())
        return vbs.missingVals()

    def get_components(self, num_cols, keep_cats=False):
        try:
            if "," in num_cols:
                num_cols = [int(n) for n in num_cols.split(",")]
            else:
                num_cols = [int(num_cols)]
        except Exception:
            num_cols = [1]
        vbs = VBSummary()
        vbs.setData(self._summary_data())
        return vbs.viewComponents(num_cols=num_cols, keep_cats=keep_cats)

    def get_kerneldensity(self):
        vbs = VBSummary()
        vbs.setData(self._summary_data())
        return vbs.kernelDensityPie()

    def get_dendrogram(self, linkage='ward', dist='spearmanr'):
        vbs = VBSummary()
        vbs.setData(self._summary_data())
        return vbs.hierarchicalDendrogram(linkage=linkage, dist=dist)
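# A sketch of driving DataExploration for a stored dataset. The dataset id
# is an assumption, and its DatasetMetadata must name a valid "target"
# column for the constructor to split features from the response.
explorer = DataExploration(dataset_id=42)
print(explorer.get_missing_vals())                     # missing-value summary
print(explorer.get_components("2,3", keep_cats=False)) # component views
print(explorer.get_kerneldensity())                    # kernel density pie
print(explorer.get_dendrogram(linkage="ward", dist="spearmanr"))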