import copy
import json

# Dataset, Pipeline, Project, Model, Metadata, load_dataset, update_status,
# VBHelper, VBSummary, DaskTasks, and the pre_processing_steps constant are
# assumed to be imported or defined elsewhere in this module.


def execute_task(project_id, dataset_id, pipeline_id):
    # STAGE 1 - Data and parameter load from db
    update_status(pipeline_id,
                  "Data and Model Setup: Retrieving dataset and pipeline",
                  "1/{}".format(pre_processing_steps),
                  log="Pipeline: {}, Type: {}, Setup: 1/{}".format(
                      pipeline_id, None, pre_processing_steps),
                  message="Cross validation")
    project = Project.objects.get(id=int(project_id))
    dataset = Dataset.objects.get(id=int(dataset_id))
    pipeline = Pipeline.objects.get(id=int(pipeline_id))
    project.dataset = int(dataset_id)
    project.save()
    df = load_dataset(dataset_id, dataset)
    dataset_metadata = Metadata(parent=dataset).get_metadata("DatasetMetadata")
    pipeline_metadata = Metadata(parent=pipeline).get_metadata("PipelineMetadata")
    project_metadata = Metadata(parent=project).get_metadata("ProjectMetadata")

    # Resolve target/feature labels: dataset metadata takes precedence over
    # project metadata; "target" is the final fallback for the target label.
    target_label = project_metadata["target"] if "target" in project_metadata.keys() else None
    features_label = project_metadata["features"] if "features" in project_metadata.keys() else None
    if "target" in dataset_metadata.keys():
        target_label = dataset_metadata["target"]
    elif target_label is None:
        target_label = "target"
    if "features" in dataset_metadata.keys():
        features_label = dataset_metadata["features"]
    if features_label is None or features_label == "*":
        # No explicit feature list: use every column except the target.
        features_label = list(df.columns)
        features_label.remove(target_label)
    else:
        features_label = json.loads(features_label)
    drop_vars = json.loads(project_metadata["drop_features"].replace(
        "\'", "\"")) if "drop_features" in project_metadata.keys() else []
    for d in drop_vars:
        if d in features_label:
            features_label.remove(d)

    # STAGE 2 - Data prep
    update_status(pipeline_id,
                  "Data and Model Setup: Loading data",
                  "2/{}".format(pre_processing_steps),
                  log="Pipeline: {}, Type: {}, Setup: 2/{}".format(
                      pipeline_id, pipeline.name, pre_processing_steps),
                  message="Cross validation")
    target = df[target_label].to_frame()
    if features_label:
        features = df[features_label]
    else:
        features = df.drop(target_label, axis=1)

    # STAGE 3 - VBHelper execution
    update_status(pipeline_id,
                  "Data and Model Setup: Loading all parameters and settings",
                  "3/{}".format(pre_processing_steps),
                  log="Pipeline: {}, Type: {}, Setup: 3/{}".format(
                      pipeline_id, pipeline.name, pre_processing_steps),
                  message="Cross validation")
    # Default to an empty dict so pipeline_id can always be attached below.
    if pipeline_metadata and "parameters" in pipeline_metadata.keys():
        vbhelper_parameters = json.loads(
            pipeline_metadata["parameters"].replace("'", "\""))
    else:
        vbhelper_parameters = {}
    vbhelper_parameters["pipeline_id"] = pipeline_id
    outer_cv = pipeline_metadata["outer_cv"] if "outer_cv" in pipeline_metadata.keys() else "True"
    try:
        vbhelper = VBHelper(**vbhelper_parameters)
        if "estimators" in pipeline_metadata.keys():
            est_str = pipeline_metadata["estimators"].replace("\'", "\"")
            estimators = json.loads(est_str)
        else:
            update_status(pipeline_id,
                          "Error: VB Helper requires an estimator.",
                          "-1/{}".format(pre_processing_steps),
                          log="Pipeline: {}, Type: {}, Setup: -1/{}".format(
                              pipeline_id, pipeline.name, pre_processing_steps),
                          message="Cross validation")
            return
        vbhelper.setData(X_df=features, y_df=target)
        inner_cv_dict = {
            'cv_reps': 1,
            'cv_folds': 5,
            'cv_strategy': ('quantile', 5)
        }
        inner_cv = vbhelper.getCV(cv_dict=inner_cv_dict)
        # prep_dict = {'cat_approach': 'together', 'impute_strategy': 'IterativeImputer', 'cat_idx': vbhelper.cat_idx}
        prep_dict = {
            'cat_approach': 'together',
            'impute_strategy': 'impute_middle',
            'cat_idx': vbhelper.cat_idx
        }
        pipe_kwargs = dict(do_prep=not vbhelper.run_stacked,
                           prep_dict=prep_dict,
                           inner_cv=inner_cv,
                           cat_idx=vbhelper.cat_idx,
                           float_idx=vbhelper.float_idx,
                           bestT=False)
        # Build a uniquely named pipe spec for each requested estimator,
        # overlaying any per-estimator parameters on the shared kwargs.
        estimators_dict = {}
        e_i = 0
        for e in estimators:
            name = e["name"] if "name" in e.keys() else e["type"] + "-{}".format(e_i)
            n_i = 1
            n_name = name
            while n_name in estimators_dict.keys():
                n_name = name + "-{}".format(n_i)
                n_i += 1
            name = n_name
            estimator = DaskTasks.get_estimator(e["type"])
            e_kwargs = copy.copy(pipe_kwargs)
            for k, p in e["parameters"].items():
                e_kwargs[k] = p
            estimators_dict[name] = {"pipe": estimator, "pipe_kwargs": e_kwargs}
            e_i += 1
        vbhelper.setPipeDict(estimators_dict)
        vbhelper.setModelDict()
        if outer_cv == "True":
            vbhelper.runCrossValidate(verbose=True)
            vbhelper.buildCVScoreDict()
        else:
            # TODO: check processing for non-outer-cv instance for data cleanup
            vbhelper.fitEstimators()
        try:
            model = Model.objects.get(pipeline=pipeline)
            model_id = model.id
            del model
        except Model.DoesNotExist:
            model_id = None
        vbhelper.save(message="Completed.")
    except Exception as e:
        update_status(pipeline_id,
                      "Error: Unknown error executing pipeline",
                      "-0/{}".format(pre_processing_steps),
                      log="Pipeline: {}, Type: {}, Error: {}".format(
                          pipeline_id, pipeline.name, e),
                      message="Cross validation")
    if "vbhelper" in locals():
        del vbhelper
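# Illustrative sketch of the "estimators" metadata consumed above. The JSON
# shape (a list of {"type", optional "name", "parameters"} objects) follows
# the parsing loop in execute_task; the type string and parameter names below
# are hypothetical and depend on what DaskTasks.get_estimator() accepts.
#
#   pipeline_metadata["estimators"] = (
#       '[{"type": "elastic-net", "name": "enet-1", "parameters": {"gridpoints": 4}}]'
#   )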
class DataExploration:
    def __init__(self, dataset_id):
        # TODO: replace the need for the project_id with providing the target variable
        self.dataset_id = dataset_id
        self.dataset = Dataset.objects.get(pk=dataset_id)
        self.df = load_dataset(dataset_id, self.dataset)
        self.dataset_metadata = Metadata(
            parent=self.dataset).get_metadata("DatasetMetadata")
        self.target_label = self.dataset_metadata["target"] \
            if "target" in self.dataset_metadata.keys() else "target"
        self.features_label = self.dataset_metadata["features"] \
            if "features" in self.dataset_metadata.keys() else None
        if self.features_label is None or self.features_label == "*":
            # No explicit feature list: use every column except the target.
            self.features_label = list(self.df.columns)
            self.features_label.remove(self.target_label)
        else:
            self.features_label = json.loads(self.features_label)
        self.y_df = self.df[self.target_label].to_frame()
        self.X_df = self.df[self.features_label]
        self.vbhelper = VBHelper(pipeline_id=-1)
        self.vbhelper.setData(X_df=self.X_df, y_df=self.y_df)

    def _summary(self):
        # Shared setup for all exploration views: build a VBSummary over the
        # full float-converted X/y data.
        data = VBHelper.saveFullFloatXy(
            X_df=self.X_df,
            y_df=self.y_df,
            X_df_s=self.vbhelper.X_df_start_order,
            y_df_s=self.vbhelper.y_df_start_order)
        vbs = VBSummary()
        vbs.setData(data)
        return vbs

    def get_missing_vals(self):
        return self._summary().missingVals()

    def get_components(self, num_cols, keep_cats=False):
        # num_cols arrives as a string such as "3" or "1,2,3"; fall back to
        # [1] if it cannot be parsed.
        try:
            num_cols = [int(n) for n in str(num_cols).split(",")]
        except ValueError:
            num_cols = [1]
        return self._summary().viewComponents(num_cols=num_cols, keep_cats=keep_cats)

    def get_kerneldensity(self):
        return self._summary().kernelDensityPie()

    def get_dendrogram(self, linkage='ward', dist='spearmanr'):
        return self._summary().hierarchicalDendrogram(linkage=linkage, dist=dist)
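# Minimal usage sketch for DataExploration (assumes Django is configured and
# that a Dataset row with the given primary key exists; dataset_id=1 and the
# component counts passed to get_components are hypothetical):
#
#   explorer = DataExploration(dataset_id=1)
#   missing = explorer.get_missing_vals()          # missing-value summary
#   components = explorer.get_components("2,3")    # views with 2 and 3 components
#   density = explorer.get_kerneldensity()         # kernel density pie chart
#   dendrogram = explorer.get_dendrogram(linkage='ward', dist='spearmanr')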