Example #1
    def get_dendrogram(self, linkage='ward', dist='spearmanr'):
        # Serialize the current X/y frames and render a hierarchical
        # clustering dendrogram of the features.
        data = VBHelper.saveFullFloatXy(X_df=self.X_df,
                                        y_df=self.y_df,
                                        X_df_s=self.vbhelper.X_df_start_order,
                                        y_df_s=self.vbhelper.y_df_start_order)
        vbs = VBSummary()
        vbs.setData(data)
        return vbs.hierarchicalDendrogram(linkage=linkage, dist=dist)
Example #2
    def get_kerneldensity(self):
        # Serialize the current X/y frames and render the kernel-density pie.
        data = VBHelper.saveFullFloatXy(X_df=self.X_df,
                                        y_df=self.y_df,
                                        X_df_s=self.vbhelper.X_df_start_order,
                                        y_df_s=self.vbhelper.y_df_start_order)
        vbs = VBSummary()
        vbs.setData(data)
        return vbs.kernelDensityPie()
Example #3
    def get_missing_vals(self):
        # Serialize the current X/y frames and summarize missing values.
        data = VBHelper.saveFullFloatXy(X_df=self.X_df,
                                        y_df=self.y_df,
                                        X_df_s=self.vbhelper.X_df_start_order,
                                        y_df_s=self.vbhelper.y_df_start_order)
        vbs = VBSummary()
        vbs.setData(data)
        return vbs.missingVals()
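Examples #1-#3 repeat the same setup: serialize the full float X/y payload with VBHelper.saveFullFloatXy, load it into a VBSummary, then call a single plot method. A minimal refactor sketch of that shared pattern as a private helper (the _summary name is hypothetical; the VBHelper and VBSummary calls are the ones shown above):

    def _summary(self):
        # Shared setup used by every plot method: serialize the current
        # X/y frames and feed them to a fresh VBSummary instance.
        data = VBHelper.saveFullFloatXy(X_df=self.X_df,
                                        y_df=self.y_df,
                                        X_df_s=self.vbhelper.X_df_start_order,
                                        y_df_s=self.vbhelper.y_df_start_order)
        vbs = VBSummary()
        vbs.setData(data)
        return vbs

    def get_missing_vals(self):
        return self._summary().missingVals()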
Example #4
    def __init__(self, dataset_id):
        # TODO: replace the need for the project_id with providing the target variable
        self.dataset_id = dataset_id
        self.dataset = Dataset.objects.get(pk=dataset_id)

        self.df = load_dataset(dataset_id, self.dataset)
        self.dataset_metadata = Metadata(
            parent=self.dataset).get_metadata("DatasetMetadata")

        # Fall back to a column named "target" when the metadata omits one.
        self.target_label = self.dataset_metadata.get("target", "target")
        # "features" may be absent, "*" (all columns), or a JSON list of names.
        self.features_label = self.dataset_metadata.get("features")
        if self.features_label is None or self.features_label == "*":
            self.features_label = list(self.df.columns)
            self.features_label.remove(self.target_label)
        else:
            self.features_label = json.loads(self.features_label)

        self.y_df = self.df[self.target_label].to_frame()
        self.X_df = self.df[self.features_label]

        self.vbhelper = VBHelper(pipeline_id=-1)
        self.vbhelper.setData(X_df=self.X_df, y_df=self.y_df)
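The label resolution in this constructor is self-contained enough to test in isolation. A sketch of the same rules as a pure function (resolve_labels is a hypothetical name; the metadata dict mirrors what get_metadata returns above):

    import json

    def resolve_labels(metadata, columns):
        """Apply the rules above: default target name "target"; a missing or
        "*" features entry means all columns except the target; otherwise
        the features entry is a JSON list of column names."""
        target = metadata.get("target", "target")
        features = metadata.get("features")
        if features is None or features == "*":
            features = [c for c in columns if c != target]
        else:
            features = json.loads(features)
        return target, features

    # resolve_labels({"features": "*"}, ["a", "b", "target"])
    # -> ("target", ["a", "b"])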
Example #5
    def get_components(self, num_cols, keep_cats=False):
        # num_cols arrives as a string holding one integer or a
        # comma-separated list of integers; fall back to [1] on bad input.
        try:
            num_cols = [int(n) for n in str(num_cols).split(",")]
        except ValueError:
            num_cols = [1]
        data = VBHelper.saveFullFloatXy(X_df=self.X_df,
                                        y_df=self.y_df,
                                        X_df_s=self.vbhelper.X_df_start_order,
                                        y_df_s=self.vbhelper.y_df_start_order)
        vbs = VBSummary()
        vbs.setData(data)
        return vbs.viewComponents(num_cols=num_cols, keep_cats=keep_cats)
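num_cols reaches this method as a string from the request layer. The same parsing as a standalone function, for reference (parse_num_cols is a hypothetical name):

    def parse_num_cols(num_cols, default=(1,)):
        """Parse '3' or '1,2,3' into a list of ints; fall back on bad input."""
        try:
            return [int(n) for n in str(num_cols).split(",")]
        except ValueError:
            return list(default)

    # parse_num_cols("1,2,3") -> [1, 2, 3]
    # parse_num_cols("oops")  -> [1]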
Example #6
    def execute_task(project_id, dataset_id, pipeline_id):
        # STAGE 1 - Data and parameter load from db
        update_status(pipeline_id,
                      "Data and Model Setup: Retrieving dataset and pipeline",
                      "1/{}".format(pre_processing_steps),
                      log="Pipeline: {}, Type: {}, Setup: 1/{}".format(
                          pipeline_id, None, pre_processing_steps),
                      message="Cross validation")
        project = Project.objects.get(id=int(project_id))
        dataset = Dataset.objects.get(id=int(dataset_id))
        pipeline = Pipeline.objects.get(id=int(pipeline_id))

        project.dataset = int(dataset_id)
        project.save()

        df = load_dataset(dataset_id, dataset)
        dataset_metadata = Metadata(
            parent=dataset).get_metadata("DatasetMetadata")
        pipeline_metadata = Metadata(
            parent=pipeline).get_metadata("PipelineMetadata")
        project_metadata = Metadata(
            parent=project).get_metadata("ProjectMetadata")

        # Project metadata supplies defaults; dataset metadata, when present,
        # takes precedence.
        target_label = project_metadata.get("target")
        features_label = project_metadata.get("features")

        if "target" in dataset_metadata.keys():
            target_label = dataset_metadata["target"]
        elif target_label is None:
            target_label = "target"

        if "features" in dataset_metadata.keys():
            features_label = dataset_metadata["features"]
        if features_label is None or features_label == "*":
            features_label = list(df.columns)
            features_label.remove(target_label)
        else:
            features_label = json.loads(features_label)
        # "drop_features" is stored as JSON with single quotes.
        drop_vars = json.loads(
            project_metadata.get("drop_features", "[]").replace("'", "\""))
        for d in drop_vars:
            features_label.remove(d)

        # STAGE 2 - Data prep
        update_status(pipeline_id,
                      "Data and Model Setup: Loading data",
                      "2/{}".format(pre_processing_steps),
                      log="Pipeline: {}, Type: {}, Setup: 2/{}".format(
                          pipeline_id, pipeline.name, pre_processing_steps),
                      message="Cross validation")

        target = df[target_label].to_frame()
        if features_label:
            features = df[features_label]
        else:
            features = df.drop(target_label, axis=1)

        # STAGE 3 - VBHelper execution
        update_status(
            pipeline_id,
            "Data and Model Setup: Loading all parameters and settings",
            "3/{}".format(pre_processing_steps),
            log="Pipeline: {}, Type: {}, Setup: 3/{}".format(
                pipeline_id, pipeline.name, pre_processing_steps),
            message="Cross validation")
        # "parameters" is stored as JSON with single quotes; default to an
        # empty dict so the pipeline_id assignment below always works.
        if pipeline_metadata and "parameters" in pipeline_metadata.keys():
            vbhelper_parameters = json.loads(
                pipeline_metadata["parameters"].replace("'", "\""))
        else:
            vbhelper_parameters = {}

        vbhelper_parameters["pipeline_id"] = pipeline_id
        outer_cv = pipeline_metadata.get(
            "outer_cv", "True") if pipeline_metadata else "True"
        vbhelper = None  # guard so the cleanup after the try block cannot fail
        try:
            vbhelper = VBHelper(**vbhelper_parameters)
            if "estimators" in pipeline_metadata.keys():
                est_str = pipeline_metadata["estimators"].replace("\'", "\"")
                estimators = json.loads(est_str)
            else:
                update_status(
                    pipeline_id,
                    "Error: VB Helper requires an estimator.",
                    "-1/{}".format(pre_processing_steps),
                    log="Pipeline: {}, Type: {}, Setup: -1/{}".format(
                        pipeline_id, pipeline.name, pre_processing_steps),
                    message="Cross validation")
                return
            vbhelper.setData(X_df=features, y_df=target)
            inner_cv_dict = {
                'cv_reps': 1,
                'cv_folds': 5,
                'cv_strategy': ('quantile', 5)
            }
            inner_cv = vbhelper.getCV(cv_dict=inner_cv_dict)
            # prep_dict = {'cat_approach': 'together', 'impute_strategy': 'IterativeImputer', 'cat_idx': vbhelper.cat_idx}
            prep_dict = {
                'cat_approach': 'together',
                'impute_strategy': 'impute_middle',
                'cat_idx': vbhelper.cat_idx
            }
            pipe_kwargs = dict(do_prep=not vbhelper.run_stacked,
                               prep_dict=prep_dict,
                               inner_cv=inner_cv,
                               cat_idx=vbhelper.cat_idx,
                               float_idx=vbhelper.float_idx,
                               bestT=False)
            estimators_dict = {}
            e_i = 0
            for e in estimators:
                name = e["name"] if "name" in e.keys(
                ) else e["type"] + "-{}".format(e_i)
                n_i = 1
                n_name = name
                while n_name in estimators_dict.keys():
                    n_name = name + "-{}".format(n_i)
                    n_i += 1
                name = n_name
                estimator = DaskTasks.get_estimator(e["type"])
                e_kwargs = copy.copy(pipe_kwargs)
                for k, p in e["parameters"].items():
                    e_kwargs[k] = p
                estimators_dict[name] = {
                    "pipe": estimator,
                    "pipe_kwargs": e_kwargs
                }
                e_i += 1
            vbhelper.setPipeDict(estimators_dict)
            vbhelper.setModelDict()
            if outer_cv == "True":
                vbhelper.runCrossValidate(verbose=True)
                vbhelper.buildCVScoreDict()
            else:
                #TODO: check processing for non-outer-cv instance for data cleanup
                vbhelper.fitEstimators()
            try:
                model_id = Model.objects.get(pipeline=pipeline).id
            except Model.DoesNotExist:
                model_id = None
            vbhelper.save(message="Completed.")
        except Exception as e:
            update_status(pipeline_id,
                          "Error: Unknown error executing pipeline",
                          "-0/16",
                          log="Pipeline: {}, Type: {}, Error: {}".format(
                              pipeline_id, pipeline.name, e),
                          message="Cross validation")
        del vbhelper
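The while loop in execute_task that de-duplicates estimator names is easy to lift out and unit-test. A sketch as a pure function (unique_name is a hypothetical name):

    def unique_name(name, taken):
        """Append '-1', '-2', ... until the name does not collide."""
        candidate, i = name, 1
        while candidate in taken:
            candidate = "{}-{}".format(name, i)
            i += 1
        return candidate

    # unique_name("rf", {"rf", "rf-1"}) -> "rf-2"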
Example #7
class DataExploration:
    def __init__(self, dataset_id):
        # TODO: replace the need for the project_id with providing the target variable
        self.dataset_id = dataset_id
        self.dataset = Dataset.objects.get(pk=dataset_id)

        self.df = load_dataset(dataset_id, self.dataset)
        self.dataset_metadata = Metadata(
            parent=self.dataset).get_metadata("DatasetMetadata")

        # Fall back to a column named "target" when the metadata omits one.
        self.target_label = self.dataset_metadata.get("target", "target")
        # "features" may be absent, "*" (all columns), or a JSON list of names.
        self.features_label = self.dataset_metadata.get("features")
        if self.features_label is None or self.features_label == "*":
            self.features_label = list(self.df.columns)
            self.features_label.remove(self.target_label)
        else:
            self.features_label = json.loads(self.features_label)

        self.y_df = self.df[self.target_label].to_frame()
        self.X_df = self.df[self.features_label]

        self.vbhelper = VBHelper(pipeline_id=-1)
        self.vbhelper.setData(X_df=self.X_df, y_df=self.y_df)

    def get_missing_vals(self):
        data = VBHelper.saveFullFloatXy(X_df=self.X_df,
                                        y_df=self.y_df,
                                        X_df_s=self.vbhelper.X_df_start_order,
                                        y_df_s=self.vbhelper.y_df_start_order)
        vbs = VBSummary()
        vbs.setData(data)
        return vbs.missingVals()

    def get_components(self, num_cols, keep_cats=False):
        # num_cols arrives as a string holding one integer or a
        # comma-separated list of integers; fall back to [1] on bad input.
        try:
            num_cols = [int(n) for n in str(num_cols).split(",")]
        except ValueError:
            num_cols = [1]
        data = VBHelper.saveFullFloatXy(X_df=self.X_df,
                                        y_df=self.y_df,
                                        X_df_s=self.vbhelper.X_df_start_order,
                                        y_df_s=self.vbhelper.y_df_start_order)
        vbs = VBSummary()
        vbs.setData(data)
        return vbs.viewComponents(num_cols=num_cols, keep_cats=keep_cats)

    def get_kerneldensity(self):
        data = VBHelper.saveFullFloatXy(X_df=self.X_df,
                                        y_df=self.y_df,
                                        X_df_s=self.vbhelper.X_df_start_order,
                                        y_df_s=self.vbhelper.y_df_start_order)
        vbs = VBSummary()
        vbs.setData(data)
        return vbs.kernelDensityPie()

    def get_dendrogram(self, linkage='ward', dist='spearmanr'):
        data = VBHelper.saveFullFloatXy(X_df=self.X_df,
                                        y_df=self.y_df,
                                        X_df_s=self.vbhelper.X_df_start_order,
                                        y_df_s=self.vbhelper.y_df_start_order)
        vbs = VBSummary()
        vbs.setData(data)
        return vbs.hierarchicalDendrogram(linkage=linkage, dist=dist)
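A usage sketch, assuming a configured Django environment and an existing Dataset row (the id 42 is illustrative):

    explorer = DataExploration(dataset_id=42)
    explorer.get_missing_vals()                      # missing-value summary
    explorer.get_components("2,3", keep_cats=True)   # 2- and 3-column components
    explorer.get_kerneldensity()
    explorer.get_dendrogram(linkage='ward', dist='spearmanr')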
Example #8
    def execute_task(project_id, dataset_id, pipeline_id):
        # STAGE 1 - Data and parameter load from db
        update_status(
            pipeline_id,
            "Data and Model Setup: Retrieving dataset and pipeline", "1/{}".format(pre_processing_steps),
            log="Pipeline: {}, Type: {}, Setup: 1/{}".format(pipeline_id, None, pre_processing_steps)
        )
        project = Project.objects.get(id=int(project_id))
        dataset = Dataset.objects.get(id=int(dataset_id))
        pipeline = Pipeline.objects.get(id=int(pipeline_id))

        project.dataset = int(dataset_id)
        project.save()

        df = load_dataset(dataset_id, dataset)
        pipeline_metadata = Metadata(parent=pipeline).get_metadata("PipelineMetadata")
        project_metadata = Metadata(parent=project).get_metadata("ProjectMetadata")

        target_label = project_metadata.get("target", "response")
        features_label = project_metadata.get("features")

        # STAGE 2 - Data prep
        update_status(
            pipeline_id,
            "Data and Model Setup: Loading data", "2/{}".format(pre_processing_steps),
            log="Pipeline: {}, Type: {}, Setup: 2/{}".format(pipeline_id, pipeline.name, pre_processing_steps)
        )

        # Keep y as a single-column frame, matching the setData usage elsewhere.
        target = df[target_label].to_frame()
        if features_label:
            features_list = json.loads(features_label.replace("\'", "\""))
            features = df[features_list]
        else:
            features = df.drop(target_label, axis=1)

        # STAGE 3 - VBHelper execution
        update_status(
            pipeline_id,
            "Data and Model Setup: Loading all parameters and settings", "3/{}".format(pre_processing_steps),
            log="Pipeline: {}, Type: {}, Setup: 3/{}".format(pipeline_id, pipeline.name, pre_processing_steps)
        )
        # "parameters" is stored as JSON with single quotes; default to an
        # empty dict so the pipeline_id assignment below always works.
        if pipeline_metadata and "parameters" in pipeline_metadata.keys():
            vbhelper_parameters = json.loads(pipeline_metadata["parameters"].replace("'", "\""))
        else:
            vbhelper_parameters = {}

        vbhelper_parameters["pipeline_id"] = pipeline_id
        outer_cv = pipeline_metadata.get("outer_cv", "True") if pipeline_metadata else "True"
        try:
            vbhelper = VBHelper(**vbhelper_parameters)
            if "estimators" in pipeline_metadata.keys():
                estimators = json.loads(pipeline_metadata["estimators"].replace("\'", "\""))
            else:
                update_status(pipeline_id, "Error: VB Helper requires an estimator.",
                              "-1/{}".format(pre_processing_steps),
                              log="Pipeline: {}, Type: {}, Setup: -1/{}".format(pipeline_id, pipeline.name,
                                                                                pre_processing_steps)
                              )
                return
            vbhelper.setData(X_df=features, y_df=target)
            inner_cv_dict = {'cv_reps': 1, 'cv_folds': 5, 'cv_strategy': ('quantile', 5)}
            inner_cv = vbhelper.getCV(cv_dict=inner_cv_dict)
            prep_dict = {'impute_strategy': 'impute_knn5', 'cat_idx': vbhelper.cat_idx}
            pipe_kwargs = dict(do_prep=not vbhelper.run_stacked, prep_dict=prep_dict, inner_cv=inner_cv,
                               gridpoints=4, cat_idx=vbhelper.cat_idx, float_idx=vbhelper.float_idx,
                               bestT=False)
            estimators_dict = {}
            e_i = 0
            for e in estimators:
                name = e["name"] if "name" in e.keys() else e["type"] + "-{}".format(e_i)
                n_i = 1
                n_name = name
                while n_name in estimators_dict.keys():
                    n_name = name + "-{}".format(n_i)
                    n_i += 1
                name = n_name
                estimator = DaskTasks.get_estimator(e["type"])
                e_kwargs = copy.copy(pipe_kwargs)
                for k, p in e["parameters"].items():
                    e_kwargs[k] = p
                estimators_dict[name] = {"pipe": estimator, "pipe_kwargs": e_kwargs}
                e_i += 1
            vbhelper.setPipeDict(estimators_dict)
            vbhelper.setModelDict()
            if outer_cv == "True":
                vbhelper.runCrossValidate()
                vbhelper.buildCVScoreDict()
            else:
                vbhelper.fitEstimators()
            vbhelper.save()
        except Exception as e:
            update_status(pipeline_id, "Error: Unknown error executing pipeline",
                          "-0/16",
                          log="Pipeline: {}, Type: {}, Error: {}".format(pipeline_id, pipeline.name, e)
                          )
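Both execute_task variants parse metadata stored with single quotes by replacing them with double quotes before json.loads, which breaks if any value itself contains an apostrophe. A more forgiving sketch that tries JSON first and falls back to Python literal syntax (loads_loose is a hypothetical name):

    import ast
    import json

    def loads_loose(text):
        """Parse JSON-ish metadata that may use Python-style single quotes."""
        try:
            return json.loads(text)
        except (ValueError, TypeError):
            # Fall back to Python literal syntax, e.g. "{'a': 1}".
            return ast.literal_eval(text)

    # loads_loose('{"a": 1}') -> {'a': 1}
    # loads_loose("{'a': 1}") -> {'a': 1}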