def create(self, request):
    """
    POST request that creates a new Dataset.
    :param request: POST request.
    :return: New dataset
    """
    dataset_inputs = load_request(request)
    serializer = self.serializer_class(data=dataset_inputs, context={'request': request})
    if serializer.is_valid():
        serializer.save()
        dataset = serializer.data
        if dataset:
            d = Dataset.objects.get(id=dataset["id"])
            if "metadata" not in dataset_inputs.keys():
                dataset_inputs["metadata"] = None
            m = Metadata(d, dataset_inputs["metadata"])
            meta = m.set_metadata("DatasetMetadata")
            # "response" holds the name of the target column used for the statistics.
            response = "Response"
            if meta:
                dataset["metadata"] = meta
                response = meta["target"]
            data = load_dataset(d.id)
            if response not in data:
                # Fall back to the first column when no target column is found.
                response = data.columns.tolist()[0]
            dataset["statistics"] = DatasetStatistics(data).calculate_statistics(response)
            del dataset["data"]
            return Response(dataset, status=status.HTTP_201_CREATED)
    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
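# Illustrative only: a minimal sketch of a payload this create() view could consume.
# The serializer field names ("name", "data") are assumptions, not confirmed by the view;
# the optional "metadata" dict is what gets forwarded to Metadata.set_metadata(), and its
# "target" entry selects the response column used for the statistics.
example_dataset_inputs = {
    "name": "beach_site_2021",                                       # hypothetical serializer field
    "data": "temp,turbidity,response\n21.0,3.2,104\n19.5,4.1,87\n",  # hypothetical CSV payload
    "metadata": {"target": "response"},
}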
def retrieve(self, request, pk=None):
    """
    GET request for the data of a dataset, specified by dataset id
    :param request: GET request, containing the dataset id
    :param pk: Dataset id
    :return: Dataset data and relevant statistics
    """
    if pk:
        try:
            dataset = Dataset.objects.get(pk=pk)
        except Dataset.DoesNotExist:
            return Response("No dataset found for id: {}".format(pk), status=status.HTTP_400_BAD_REQUEST)
        if not IsOwner().has_object_permission(request, self, dataset):
            return Response(status=status.HTTP_401_UNAUTHORIZED)
        serializer = self.serializer_class(dataset, many=False)
        response_data = serializer.data
        m = Metadata(dataset)
        meta = m.get_metadata("DatasetMetadata")
        response = "Response"
        if meta:
            response_data["metadata"] = meta
            response = meta["target"]
        response_data["data"] = load_dataset(pk)
        if response not in response_data["data"]:
            response = response_data["data"].columns.tolist()[0]
        response_data["statistics"] = DatasetStatistics(response_data["data"]).calculate_statistics(response)
        return Response(response_data, status=status.HTTP_200_OK)
    else:
        return Response(
            "Required id for the dataset was not found.",
            status=status.HTTP_400_BAD_REQUEST
        )
def set_prediction_estimators(project_id, model_id, selected_models: dict):
    project = Project.objects.get(id=int(project_id))
    dataset = Dataset.objects.get(id=project.dataset)
    df = load_dataset(dataset.id, dataset)
    project_metadata = Metadata(parent=Project.objects.get(id=project.id)).get_metadata("ProjectMetadata")
    target_label = "response" if "target" not in project_metadata.keys() else project_metadata["target"]
    features_label = None if "features" not in project_metadata.keys() else project_metadata["features"]
    target = df[target_label]
    if features_label:
        features_list = json.loads(features_label.replace("\'", "\""))
        features = df[features_list]
    else:
        features = df.drop(target_label, axis=1)
    model = Model.objects.get(id=int(model_id))
    m = load_model(model.id, model.model)
    # TODO: update predictive_model_type from model metadata
    m.refitPredictiveModels(selected_models=selected_models, y_df=target, x_df=features)
    m.save(n=4, model_id=model_id)
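# A minimal usage sketch for set_prediction_estimators(). The ids and the shape of
# selected_models are assumptions for illustration; the function only forwards
# selected_models to refitPredictiveModels() on the loaded model.
selected_models = {"neg_mean_squared_error": "gradient-boosting-reg"}  # hypothetical selection
set_prediction_estimators(project_id=12, model_id=7, selected_models=selected_models)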
def __init__(self, dataset_id):
    # TODO: replace the need for the project_id with providing the target variable
    self.dataset_id = dataset_id
    self.dataset = Dataset.objects.get(pk=dataset_id)
    self.df = load_dataset(dataset_id, self.dataset)
    self.dataset_metadata = Metadata(parent=self.dataset).get_metadata("DatasetMetadata")
    self.target_label = "target" if "target" not in self.dataset_metadata.keys() else self.dataset_metadata["target"]
    self.features_label = None if "features" not in self.dataset_metadata.keys() else self.dataset_metadata["features"]
    if self.features_label is None or self.features_label == "*":
        self.features_label = list(self.df.columns)
        self.features_label.remove(self.target_label)
    else:
        self.features_label = json.loads(self.features_label)
    self.y_df = self.df[self.target_label].to_frame()
    self.X_df = self.df[self.features_label]
    self.vbhelper = VBHelper(pipeline_id=-1)
    self.vbhelper.setData(X_df=self.X_df, y_df=self.y_df)
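# For reference, a sketch of the DatasetMetadata values this constructor reads, based on the
# parsing above: "features" is a JSON-encoded list (or "*"/absent to use every non-target
# column). The column names are hypothetical.
example_dataset_metadata = {
    "target": "response",
    "features": '["temp", "turbidity", "rainfall"]',  # json.loads() is applied to this string
}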
def execute_task(project_id, dataset_id, pipeline_id):
    # STAGE 1 - Data and parameter load from db
    update_status(
        pipeline_id,
        "Data and Model Setup: Retrieving dataset and pipeline",
        "1/{}".format(pre_processing_steps),
        log="Pipeline: {}, Type: {}, Setup: 1/{}".format(pipeline_id, None, pre_processing_steps),
        message="Cross validation")
    project = Project.objects.get(id=int(project_id))
    dataset = Dataset.objects.get(id=int(dataset_id))
    pipeline = Pipeline.objects.get(id=int(pipeline_id))
    project.dataset = int(dataset_id)
    project.save()
    df = load_dataset(dataset_id, dataset)
    dataset_metadata = Metadata(parent=dataset).get_metadata("DatasetMetadata")
    pipeline_metadata = Metadata(parent=pipeline).get_metadata("PipelineMetadata")
    project_metadata = Metadata(parent=project).get_metadata("ProjectMetadata")
    target_label = None if "target" not in project_metadata.keys() else project_metadata["target"]
    features_label = None if "features" not in project_metadata.keys() else project_metadata["features"]
    # Dataset metadata takes precedence; otherwise fall back to the project metadata,
    # then to the default "target" label.
    if "target" in dataset_metadata.keys():
        target_label = dataset_metadata["target"]
    elif target_label is None:
        target_label = "target"
    if "features" in dataset_metadata.keys():
        features_label = dataset_metadata["features"]
    if features_label is None or features_label == "*":
        features_label = list(df.columns)
        features_label.remove(target_label)
    else:
        features_label = json.loads(features_label)
    drop_vars = [] if "drop_features" not in project_metadata.keys() else json.loads(
        project_metadata["drop_features"].replace("\'", "\""))
    for d in drop_vars:
        features_label.remove(d)

    # STAGE 2 - Data prep
    update_status(
        pipeline_id,
        "Data and Model Setup: Loading data",
        "2/{}".format(pre_processing_steps),
        log="Pipeline: {}, Type: {}, Setup: 2/{}".format(pipeline_id, pipeline.name, pre_processing_steps),
        message="Cross validation")
    target = df[target_label].to_frame()
    if features_label:
        features = df[features_label]
    else:
        features = df.drop(target_label, axis=1)

    # STAGE 3 - VBHelper execution
    update_status(
        pipeline_id,
        "Data and Model Setup: Loading all parameters and settings",
        "3/{}".format(pre_processing_steps),
        log="Pipeline: {}, Type: {}, Setup: 3/{}".format(pipeline_id, pipeline.name, pre_processing_steps),
        message="Cross validation")
    if pipeline_metadata:
        vbhelper_parameters = {} if "parameters" not in pipeline_metadata.keys() else json.loads(
            pipeline_metadata["parameters"].replace("'", "\""))
    else:
        vbhelper_parameters = {}
    vbhelper_parameters["pipeline_id"] = pipeline_id
    outer_cv = pipeline_metadata["outer_cv"] if "outer_cv" in pipeline_metadata.keys() else "True"
    vbhelper = None
    try:
        vbhelper = VBHelper(**vbhelper_parameters)
        if "estimators" in pipeline_metadata.keys():
            est_str = pipeline_metadata["estimators"].replace("\'", "\"")
            estimators = json.loads(est_str)
        else:
            update_status(
                pipeline_id,
                "Error: VB Helper requires an estimator.",
                "-1/{}".format(pre_processing_steps),
                log="Pipeline: {}, Type: {}, Setup: -1/{}".format(pipeline_id, pipeline.name, pre_processing_steps),
                message="Cross validation")
            return
        vbhelper.setData(X_df=features, y_df=target)
        inner_cv_dict = {'cv_reps': 1, 'cv_folds': 5, 'cv_strategy': ('quantile', 5)}
        inner_cv = vbhelper.getCV(cv_dict=inner_cv_dict)
        # prep_dict = {'cat_approach': 'together', 'impute_strategy': 'IterativeImputer', 'cat_idx': vbhelper.cat_idx}
        prep_dict = {'cat_approach': 'together', 'impute_strategy': 'impute_middle', 'cat_idx': vbhelper.cat_idx}
        pipe_kwargs = dict(do_prep=not vbhelper.run_stacked, prep_dict=prep_dict, inner_cv=inner_cv,
                           cat_idx=vbhelper.cat_idx, float_idx=vbhelper.float_idx, bestT=False)
        # Build a uniquely named pipeline entry for each requested estimator.
        estimators_dict = {}
        e_i = 0
        for e in estimators:
            name = e["name"] if "name" in e.keys() else e["type"] + "-{}".format(e_i)
            n_i = 1
            n_name = name
            while n_name in estimators_dict.keys():
                n_name = name + "-{}".format(n_i)
                n_i += 1
            name = n_name
            estimator = DaskTasks.get_estimator(e["type"])
            e_kwargs = copy.copy(pipe_kwargs)
            for k, p in e["parameters"].items():
                e_kwargs[k] = p
            estimators_dict[name] = {"pipe": estimator, "pipe_kwargs": e_kwargs}
            e_i += 1
        vbhelper.setPipeDict(estimators_dict)
        vbhelper.setModelDict()
        if outer_cv == "True":
            vbhelper.runCrossValidate(verbose=True)
            vbhelper.buildCVScoreDict()
        else:
            # TODO: check processing for non-outer-cv instance for data cleanup
            vbhelper.fitEstimators()
        try:
            model = Model.objects.get(pipeline=pipeline)
            model_id = model.id
        except Model.DoesNotExist:
            model = None
            model_id = None
        vbhelper.save(message="Completed.")
        del model
    except Exception as e:
        update_status(
            pipeline_id,
            "Error: Unknown error executing pipeline",
            "-0/16",
            log="Pipeline: {}, Type: {}, Error: {}".format(pipeline_id, pipeline.name, e),
            message="Cross validation")
    del vbhelper
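# For reference, a sketch of the PipelineMetadata entries execute_task() consumes, inferred from
# the parsing above. Values are stored as strings and single quotes are swapped for double quotes
# before json.loads(); the estimator type and parameter names shown here are hypothetical.
example_pipeline_metadata = {
    "parameters": "{'run_stacked': 'True'}",   # kwargs unpacked into VBHelper(**vbhelper_parameters)
    "outer_cv": "True",                        # the string "True" triggers runCrossValidate()
    "estimators": "[{'type': 'elastic-net', 'name': 'enet', 'parameters': {}}]",
}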
def execute_task(project_id, dataset_id, pipeline_id):
    # STAGE 1 - Data and parameter load from db
    update_status(
        pipeline_id,
        "Data and Model Setup: Retrieving dataset and pipeline",
        "1/{}".format(pre_processing_steps),
        log="Pipeline: {}, Type: {}, Setup: 1/{}".format(pipeline_id, None, pre_processing_steps)
    )
    project = Project.objects.get(id=int(project_id))
    dataset = Dataset.objects.get(id=int(dataset_id))
    pipeline = Pipeline.objects.get(id=int(pipeline_id))
    project.dataset = int(dataset_id)
    project.save()
    df = load_dataset(dataset_id, dataset)
    pipeline_metadata = Metadata(parent=Pipeline.objects.get(id=pipeline_id)).get_metadata("PipelineMetadata")
    project_metadata = Metadata(parent=Project.objects.get(id=project_id)).get_metadata("ProjectMetadata")
    target_label = "response" if "target" not in project_metadata.keys() else project_metadata["target"]
    features_label = None if "features" not in project_metadata.keys() else project_metadata["features"]

    # STAGE 2 - Data prep
    update_status(
        pipeline_id,
        "Data and Model Setup: Loading data",
        "2/{}".format(pre_processing_steps),
        log="Pipeline: {}, Type: {}, Setup: 2/{}".format(pipeline_id, pipeline.name, pre_processing_steps)
    )
    target = df[target_label]
    if features_label:
        features_list = json.loads(features_label.replace("\'", "\""))
        features = df[features_list]
    else:
        features = df.drop(target_label, axis=1)

    # STAGE 3 - VBHelper execution
    update_status(
        pipeline_id,
        "Data and Model Setup: Loading all parameters and settings",
        "3/{}".format(pre_processing_steps),
        log="Pipeline: {}, Type: {}, Setup: 3/{}".format(pipeline_id, pipeline.name, pre_processing_steps)
    )
    if pipeline_metadata:
        vbhelper_parameters = {} if "parameters" not in pipeline_metadata.keys() else json.loads(
            pipeline_metadata["parameters"].replace("'", "\""))
    else:
        vbhelper_parameters = {}
    vbhelper_parameters["pipeline_id"] = pipeline_id
    outer_cv = pipeline_metadata["outer_cv"] if "outer_cv" in pipeline_metadata.keys() else "True"
    try:
        vbhelper = VBHelper(**vbhelper_parameters)
        if "estimators" in pipeline_metadata.keys():
            estimators = json.loads(pipeline_metadata["estimators"].replace("\'", "\""))
        else:
            update_status(
                pipeline_id,
                "Error: VB Helper requires an estimator.",
                "-1/{}".format(pre_processing_steps),
                log="Pipeline: {}, Type: {}, Setup: -1/{}".format(pipeline_id, pipeline.name, pre_processing_steps)
            )
            return
        vbhelper.setData(X_df=features, y_df=target)
        inner_cv_dict = {'cv_reps': 1, 'cv_folds': 5, 'cv_strategy': ('quantile', 5)}
        inner_cv = vbhelper.getCV(cv_dict=inner_cv_dict)
        prep_dict = {'impute_strategy': 'impute_knn5', 'cat_idx': vbhelper.cat_idx}
        pipe_kwargs = dict(do_prep=not vbhelper.run_stacked, prep_dict=prep_dict, inner_cv=inner_cv, gridpoints=4,
                           cat_idx=vbhelper.cat_idx, float_idx=vbhelper.float_idx, bestT=False)
        estimators_dict = {}
        e_i = 0
        for e in estimators:
            name = e["name"] if "name" in e.keys() else e["type"] + "-{}".format(e_i)
            n_i = 1
            n_name = name
            while n_name in estimators_dict.keys():
                n_name = name + "-{}".format(n_i)
                n_i += 1
            name = n_name
            estimator = DaskTasks.get_estimator(e["type"])
            e_kwargs = copy.copy(pipe_kwargs)
            for k, p in e["parameters"].items():
                e_kwargs[k] = p
            estimators_dict[name] = {"pipe": estimator, "pipe_kwargs": e_kwargs}
            e_i += 1
        vbhelper.setPipeDict(estimators_dict)
        vbhelper.setModelDict()
        if outer_cv == "True":
            vbhelper.runCrossValidate()
            vbhelper.buildCVScoreDict()
        else:
            vbhelper.fitEstimators()
        vbhelper.save()
    except Exception as e:
        update_status(
            pipeline_id,
            "Error: Unknown error executing pipeline",
            "-0/16",
            log="Pipeline: {}, Type: {}, Error: {}".format(pipeline_id, pipeline.name, e)
        )