Code example #1
0
 def create(self, request):
     """
     POST request that creates a new Dataset.

     Validates the payload, saves the dataset, attaches its metadata and
     summary statistics, and strips the raw data from the response body.

     :param request: POST request containing the dataset fields.
     :return: 201 Response with the new dataset (metadata + statistics) on
              success, 400 Response with serializer errors otherwise.
     """
     dataset_inputs = load_request(request)
     serializer = self.serializer_class(data=dataset_inputs, context={'request': request})
     if serializer.is_valid():
         serializer.save()
         dataset = serializer.data
         if dataset:
             d = Dataset.objects.get(id=dataset["id"])
             # Metadata may be absent from the request; pass None in that case
             # (avoids mutating the caller's input dict as the old code did).
             m = Metadata(d, dataset_inputs.get("metadata"))
             meta = m.set_metadata("DatasetMetadata")
             response = "Response"
             if meta:
                 dataset["metadata"] = meta
                 response = meta["target"]
             data = load_dataset(d.id)
             # Fall back to the first column when the target column is absent.
             if response not in data:
                 response = data.columns.tolist()[0]
             dataset["statistics"] = DatasetStatistics(data).calculate_statistics(response)
             # Raw data is not echoed back on create; only summary info.
             del dataset["data"]
             return Response(dataset, status=status.HTTP_201_CREATED)
     return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
Code example #2
0
 def retrieve(self, request, pk=None):
     """
     GET request for the data of a dataset, specified by dataset id.

     :param request: GET request, containing the dataset id.
     :param pk: Dataset id.
     :return: Dataset data and relevant statistics.
     """
     # Guard: the dataset id is mandatory.
     if not pk:
         return Response(
             "Required id for the dataset was not found.",
             status=status.HTTP_400_BAD_REQUEST
         )
     try:
         dataset = Dataset.objects.get(pk=pk)
     except Dataset.DoesNotExist:
         return Response("No dataset found for id: {}".format(pk), status=status.HTTP_400_BAD_REQUEST)
     # Only the owner may read the dataset.
     if not IsOwner().has_object_permission(request, self, dataset):
         return Response(status=status.HTTP_401_UNAUTHORIZED)
     result = self.serializer_class(dataset, many=False).data
     meta = Metadata(dataset).get_metadata("DatasetMetadata")
     target = "Response"
     if meta:
         result["metadata"] = meta
         target = meta["target"]
     result["data"] = load_dataset(pk)
     # Fall back to the first column when the target column is absent.
     if target not in result["data"]:
         target = result["data"].columns.tolist()[0]
     result["statistics"] = DatasetStatistics(result["data"]).calculate_statistics(target)
     return Response(result, status=status.HTTP_200_OK)
Code example #3
0
    def set_prediction_estimators(project_id, model_id, selected_models: dict):
        """
        Refit a saved Model's predictive estimators using only the
        user-selected estimators, then re-save the model.

        :param project_id: id of the Project providing target/feature metadata.
        :param model_id: id of the Model to load, refit and save.
        :param selected_models: mapping of estimators chosen by the user.
        """
        project = Project.objects.get(id=int(project_id))
        dataset = Dataset.objects.get(id=project.dataset)
        df = load_dataset(dataset.id, dataset)
        # Reuse the already-fetched project row instead of re-querying it.
        project_metadata = Metadata(parent=project).get_metadata("ProjectMetadata")

        # Project metadata may omit target/features; defaults: "response"
        # column and all non-target columns respectively.
        target_label = project_metadata.get("target", "response")
        features_label = project_metadata.get("features")
        target = df[target_label]
        if features_label:
            # Feature list is stored single-quoted; normalize to valid JSON.
            features_list = json.loads(features_label.replace("\'", "\""))
            features = df[features_list]
        else:
            features = df.drop(target_label, axis=1)
        model = Model.objects.get(id=int(model_id))
        m = load_model(model.id, model.model)
        # TODO: update predictive_model_type from model metadata
        m.refitPredictiveModels(selected_models=selected_models, y_df=target, x_df=features)
        m.save(n=4, model_id=model_id)
Code example #4
0
    def __init__(self, dataset_id):
        """
        Load a dataset and its metadata, resolve target/feature columns,
        and initialize a VBHelper with the resulting frames.

        :param dataset_id: primary key of the Dataset to load.
        """
        # TODO: replace the need for the project_id with providing the target variable
        self.dataset_id = dataset_id
        self.dataset = Dataset.objects.get(pk=dataset_id)

        self.df = load_dataset(dataset_id, self.dataset)
        self.dataset_metadata = Metadata(
            parent=self.dataset).get_metadata("DatasetMetadata")

        # Defaults: target column named "target"; features default to all
        # remaining columns when unspecified or "*".
        self.target_label = self.dataset_metadata.get("target", "target")
        self.features_label = self.dataset_metadata.get("features")
        if self.features_label is None or self.features_label == "*":
            self.features_label = list(self.df.columns)
            self.features_label.remove(self.target_label)
        else:
            self.features_label = json.loads(self.features_label)

        self.y_df = self.df[self.target_label].to_frame()
        self.X_df = self.df[self.features_label]

        self.vbhelper = VBHelper(pipeline_id=-1)
        self.vbhelper.setData(X_df=self.X_df, y_df=self.y_df)
Code example #5
0
    def execute_task(project_id, dataset_id, pipeline_id):
        """
        Run a full cross-validation pipeline for a project/dataset pair.

        Loads the dataset, resolves target/feature columns from project and
        dataset metadata, configures a VBHelper with the pipeline's
        estimators, and runs (outer) cross validation, reporting progress
        through update_status.

        :param project_id: id of the Project being executed.
        :param dataset_id: id of the Dataset to load.
        :param pipeline_id: id of the Pipeline whose metadata drives the run.
        """
        # STAGE 1 - Data and parameter load from db
        update_status(pipeline_id,
                      "Data and Model Setup: Retrieving dataset and pipeline",
                      "1/{}".format(pre_processing_steps),
                      log="Pipeline: {}, Type: {}, Setup: 1/{}".format(
                          pipeline_id, None, pre_processing_steps),
                      message="Cross validation")
        project = Project.objects.get(id=int(project_id))
        dataset = Dataset.objects.get(id=int(dataset_id))
        pipeline = Pipeline.objects.get(id=int(pipeline_id))

        project.dataset = int(dataset_id)
        project.save()

        df = load_dataset(dataset_id, dataset)
        dataset_metadata = Metadata(
            parent=dataset).get_metadata("DatasetMetadata")
        pipeline_metadata = Metadata(
            parent=pipeline).get_metadata("PipelineMetadata")
        project_metadata = Metadata(
            parent=project).get_metadata("ProjectMetadata")

        # Resolution order: dataset metadata overrides project metadata;
        # final fallbacks are a "target" column and all non-target features.
        # (The old code indexed dataset_metadata["target"]/["features"]
        # unconditionally and raised KeyError when only the project had them.)
        target_label = project_metadata.get("target")
        features_label = project_metadata.get("features")
        target_label = dataset_metadata.get(
            "target", target_label if target_label is not None else "target")
        features_label = dataset_metadata.get("features", features_label)
        if features_label is None or features_label == "*":
            features_label = list(df.columns)
            features_label.remove(target_label)
        else:
            features_label = json.loads(features_label)
        # Features the project explicitly excludes (stored single-quoted).
        drop_vars = json.loads(
            project_metadata.get("drop_features", "[]").replace("\'", "\""))
        for d in drop_vars:
            features_label.remove(d)

        # STAGE 2 - Data prep
        update_status(pipeline_id,
                      "Data and Model Setup: Loading data",
                      "2/{}".format(pre_processing_steps),
                      log="Pipeline: {}, Type: {}, Setup: 2/{}".format(
                          pipeline_id, pipeline.name, pre_processing_steps),
                      message="Cross validation")

        target = df[target_label].to_frame()
        if features_label:
            features = df[features_label]
        else:
            features = df.drop(target_label, axis=1)

        # STAGE 3 - VBHelper execution
        update_status(
            pipeline_id,
            "Data and Model Setup: Loading all parameters and settings",
            "3/{}".format(pre_processing_steps),
            log="Pipeline: {}, Type: {}, Setup: 3/{}".format(
                pipeline_id, pipeline.name, pre_processing_steps),
            message="Cross validation")
        # Default to an empty dict (never None) so pipeline_id can always be
        # set below; the old code produced None and then crashed indexing it.
        if pipeline_metadata and "parameters" in pipeline_metadata:
            vbhelper_parameters = json.loads(
                pipeline_metadata["parameters"].replace("'", "\""))
        else:
            vbhelper_parameters = {}

        vbhelper_parameters["pipeline_id"] = pipeline_id
        outer_cv = pipeline_metadata.get(
            "outer_cv", "True") if pipeline_metadata else "True"
        # Pre-bind so the final `del vbhelper` cannot raise NameError when
        # VBHelper construction itself fails.
        vbhelper = None
        try:
            vbhelper = VBHelper(**vbhelper_parameters)
            if pipeline_metadata and "estimators" in pipeline_metadata:
                est_str = pipeline_metadata["estimators"].replace("\'", "\"")
                estimators = json.loads(est_str)
            else:
                update_status(
                    pipeline_id,
                    "Error: VB Helper requires an estimator.",
                    "-1/{}".format(pre_processing_steps),
                    log="Pipeline: {}, Type: {}, Setup: -1/{}".format(
                        pipeline_id, pipeline.name, pre_processing_steps),
                    message="Cross validation")
                return
            vbhelper.setData(X_df=features, y_df=target)
            inner_cv_dict = {
                'cv_reps': 1,
                'cv_folds': 5,
                'cv_strategy': ('quantile', 5)
            }
            inner_cv = vbhelper.getCV(cv_dict=inner_cv_dict)
            prep_dict = {
                'cat_approach': 'together',
                'impute_strategy': 'impute_middle',
                'cat_idx': vbhelper.cat_idx
            }
            pipe_kwargs = dict(do_prep=not vbhelper.run_stacked,
                               prep_dict=prep_dict,
                               inner_cv=inner_cv,
                               cat_idx=vbhelper.cat_idx,
                               float_idx=vbhelper.float_idx,
                               bestT=False)
            # Build a uniquely-named pipe spec per requested estimator.
            estimators_dict = {}
            for e_i, e in enumerate(estimators):
                name = e.get("name", e["type"] + "-{}".format(e_i))
                n_i = 1
                n_name = name
                while n_name in estimators_dict:
                    n_name = name + "-{}".format(n_i)
                    n_i += 1
                name = n_name
                estimator = DaskTasks.get_estimator(e["type"])
                e_kwargs = copy.copy(pipe_kwargs)
                # Per-estimator parameters override the shared pipe kwargs.
                e_kwargs.update(e["parameters"])
                estimators_dict[name] = {
                    "pipe": estimator,
                    "pipe_kwargs": e_kwargs
                }
            vbhelper.setPipeDict(estimators_dict)
            vbhelper.setModelDict()
            if outer_cv == "True":
                vbhelper.runCrossValidate(verbose=True)
                vbhelper.buildCVScoreDict()
            else:
                # TODO: check processing for non-outer-cv instance for data cleanup
                vbhelper.fitEstimators()
            try:
                model = Model.objects.get(pipeline=pipeline)
                model_id = model.id
            except Model.DoesNotExist:
                # No existing model for this pipeline yet; bind model so the
                # `del model` below cannot raise NameError (old code did).
                model = None
                model_id = None
            # NOTE(review): model_id is currently unused after lookup —
            # presumably intended for save(); confirm against VBHelper.save.
            vbhelper.save(message="Completed.")
            del model
        except Exception as e:
            update_status(pipeline_id,
                          "Error: Unknown error executing pipeline",
                          "-0/16",
                          log="Pipeline: {}, Type: {}, Error: {}".format(
                              pipeline_id, pipeline.name, e),
                          message="Cross validation")
        del vbhelper
Code example #6
0
    def execute_task(project_id, dataset_id, pipeline_id):
        """
        Run a cross-validation pipeline for a project/dataset pair.

        Loads the dataset, resolves target/feature columns from project
        metadata, configures a VBHelper with the pipeline's estimators, and
        runs (outer) cross validation, reporting progress via update_status.

        :param project_id: id of the Project being executed.
        :param dataset_id: id of the Dataset to load.
        :param pipeline_id: id of the Pipeline whose metadata drives the run.
        """
        # STAGE 1 - Data and parameter load from db
        update_status(
            pipeline_id,
            "Data and Model Setup: Retrieving dataset and pipeline", "1/{}".format(pre_processing_steps),
            log="Pipeline: {}, Type: {}, Setup: 1/{}".format(pipeline_id, None, pre_processing_steps)
        )
        project = Project.objects.get(id=int(project_id))
        dataset = Dataset.objects.get(id=int(dataset_id))
        pipeline = Pipeline.objects.get(id=int(pipeline_id))

        project.dataset = int(dataset_id)
        project.save()

        df = load_dataset(dataset_id, dataset)
        # Reuse the already-fetched pipeline/project rows instead of re-querying.
        pipeline_metadata = Metadata(parent=pipeline).get_metadata("PipelineMetadata")
        project_metadata = Metadata(parent=project).get_metadata("ProjectMetadata")

        # Defaults: "response" target column; features default to all
        # non-target columns when unspecified.
        target_label = project_metadata.get("target", "response")
        features_label = project_metadata.get("features")

        # STAGE 2 - Data prep
        update_status(
            pipeline_id,
            "Data and Model Setup: Loading data", "2/{}".format(pre_processing_steps),
            log="Pipeline: {}, Type: {}, Setup: 2/{}".format(pipeline_id, pipeline.name, pre_processing_steps)
        )

        target = df[target_label]
        if features_label:
            # Feature list is stored single-quoted; normalize to valid JSON.
            features_list = json.loads(features_label.replace("\'", "\""))
            features = df[features_list]
        else:
            features = df.drop(target_label, axis=1)

        # STAGE 3 - VBHelper execution
        update_status(
            pipeline_id,
            "Data and Model Setup: Loading all parameters and settings", "3/{}".format(pre_processing_steps),
            log="Pipeline: {}, Type: {}, Setup: 3/{}".format(pipeline_id, pipeline.name, pre_processing_steps)
        )
        # Default to an empty dict (never None) so pipeline_id can always be
        # set below; the old code produced None and then crashed indexing it.
        if pipeline_metadata and "parameters" in pipeline_metadata:
            vbhelper_parameters = json.loads(pipeline_metadata["parameters"].replace("'", "\""))
        else:
            vbhelper_parameters = {}

        vbhelper_parameters["pipeline_id"] = pipeline_id
        outer_cv = pipeline_metadata.get("outer_cv", "True") if pipeline_metadata else "True"
        try:
            vbhelper = VBHelper(**vbhelper_parameters)
            if pipeline_metadata and "estimators" in pipeline_metadata:
                estimators = json.loads(pipeline_metadata["estimators"].replace("\'", "\""))
            else:
                update_status(pipeline_id, "Error: VB Helper requires an estimator.",
                              "-1/{}".format(pre_processing_steps),
                              log="Pipeline: {}, Type: {}, Setup: -1/{}".format(pipeline_id, pipeline.name,
                                                                                pre_processing_steps)
                              )
                return
            vbhelper.setData(X_df=features, y_df=target)
            inner_cv_dict = {'cv_reps': 1, 'cv_folds': 5, 'cv_strategy': ('quantile', 5)}
            inner_cv = vbhelper.getCV(cv_dict=inner_cv_dict)
            prep_dict = {'impute_strategy': 'impute_knn5', 'cat_idx': vbhelper.cat_idx}
            pipe_kwargs = dict(do_prep=not vbhelper.run_stacked, prep_dict=prep_dict, inner_cv=inner_cv,
                               gridpoints=4, cat_idx=vbhelper.cat_idx, float_idx=vbhelper.float_idx,
                               bestT=False)
            # Build a uniquely-named pipe spec per requested estimator.
            estimators_dict = {}
            for e_i, e in enumerate(estimators):
                name = e.get("name", e["type"] + "-{}".format(e_i))
                n_i = 1
                n_name = name
                while n_name in estimators_dict:
                    n_name = name + "-{}".format(n_i)
                    n_i += 1
                name = n_name
                estimator = DaskTasks.get_estimator(e["type"])
                e_kwargs = copy.copy(pipe_kwargs)
                # Per-estimator parameters override the shared pipe kwargs.
                e_kwargs.update(e["parameters"])
                estimators_dict[name] = {"pipe": estimator, "pipe_kwargs": e_kwargs}
            vbhelper.setPipeDict(estimators_dict)
            vbhelper.setModelDict()
            if outer_cv == "True":
                vbhelper.runCrossValidate()
                vbhelper.buildCVScoreDict()
            else:
                vbhelper.fitEstimators()
            vbhelper.save()
        except Exception as e:
            update_status(pipeline_id, "Error: Unknown error executing pipeline",
                          "-0/16",
                          log="Pipeline: {}, Type: {}, Error: {}".format(pipeline_id, pipeline.name, e)
                          )