def dbnd_doctor(
    python_sanity=parameter.value(True)[bool],
    airflow_sanity=parameter.value(True)[bool],
    logs=parameter.value(None)[bool],
    python_packages=parameter.value(None)[bool],
    check_time=datetime.datetime.now(),
    all=False,
):
    if all:
        # change only "None" params
        logs = True if logs is None else logs
        python_packages = True if python_packages is None else python_packages

    main_report = DoctorStatusReportBuilder("Dbnd Doctor")
    main_report.log("check_time", check_time)

    system_report = system_dbnd.dbnd_status()
    logger.debug("system_report: %s", system_report)

    if python_sanity:
        system_python_report = system_python.python_status(
            python_packages=python_packages
        )
        main_report.add_sub_report(system_python_report)
    if airflow_sanity:
        airflow_report = system_airflow.airflow_status()
        main_report.add_sub_report(airflow_report)
    if logs:
        system_logging_report = system_logging.logging_status()
        main_report.add_sub_report(system_logging_report)

    logger.info("Your system is good to go! Enjoy Databand!")
    return main_report.get_status_str()
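# A run sketch for the function above (an assumption, not taken from the original:
# the function is registered as a dbnd task, so its parameter.value(...) defaults
# are resolved by the runner rather than by a plain Python call; the CLI line is
# a hypothetical invocation using dbnd's `--set` override syntax):
#
#   dbnd run dbnd_doctor --set all=True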
class PredictWineQualityParameterSearch(PipelineTask):
    data = data(default=test_data_csv).target
    alpha_step = parameter.value(0.3)
    l1_ratio_step = parameter.value(0.4)

    results = output

    def band(self):
        result = {}
        variants = list(
            itertools.product(
                np.arange(0, 1, self.alpha_step), np.arange(0, 1, self.l1_ratio_step)
            )
        )
        # variants = list(itertools.product([0.1, 0.5], [0.2, 0.3]))
        logger.info("All Variants: %s", variants)
        for alpha_value, l1_ratio in variants:
            predict = PredictWineQuality(
                data=self.data, alpha=alpha_value, l1_ratio=l1_ratio
            )
            exp_name = "%f_%f" % (alpha_value, l1_ratio)
            result[exp_name] = (predict.model, predict.validation)

        self.results = result
class MyTask(Task):
    p_int = parameter.value(3)
    p_str = parameter.value("check")
    p_int_with_default = parameter.value(0)

    output_str = parameter.output[str]

    def run(self):
        logging.info("I am running")
        self.output_str = "success"
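# A minimal usage sketch for the task above (assumption: dbnd Task instances
# expose dbnd_run(), mirroring the decorated-task example later in this file):

task = MyTask(p_int=5, p_str="hello")  # parameters left unset keep their parameter.value(...) defaults
task.dbnd_run()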
class DatabricksConfig(SparkEngineConfig):
    """Databricks cloud for Apache Spark"""

    _conf__task_family = "databricks"
    cluster_type = SparkClusters.databricks
    stop_session_on_finish = False

    cluster_id = parameter(default=None).help("existing cluster id")[str]

    cloud_type = parameter().help("cloud type: aws/azure")[str]

    conn_id = parameter.value(default="databricks_default").help(
        "databricks connection settings"
    )[str]

    connection_retry_limit = parameter.value(default=3).help(
        "databricks connection - retry limit"
    )[int]

    connection_retry_delay = parameter.value(default=1).help(
        "databricks connection - delay in between retries"
    )[int]

    status_polling_interval_seconds = parameter(default=10).help(
        "seconds to sleep between polling databricks for job status."
    )[int]

    cluster_log_conf = parameter(default={}).help(
        'location for logs, like: {"s3": {"destination": "s3://<BUCKET>/<KEY>", "region": "us-east-1"}}'
    )

    # new cluster config
    num_workers = parameter(default=0).help("number of workers as in databricks api.")[
        int
    ]

    init_scripts = parameter(default=[]).help(
        "init script list, e.g. { 's3': { 'destination': 's3://init_script_bucket/prefix', 'region': 'us-west-2' } }"
    )[List]

    spark_version = parameter().help("spark version")[str]
    spark_conf = parameter(default={}).help("spark config")[Dict]
    node_type_id = parameter(default="").help("nodes for spark machines")[str]
    spark_env_vars = parameter(default={}).help("spark env vars")[Dict]

    def get_spark_ctrl(self, task_run):
        from dbnd_databricks.databricks import DatabricksCtrl

        return DatabricksCtrl(task_run=task_run)

    def _validate(self):
        super(DatabricksConfig, self)._validate()
        if not self.cluster_id:
            logger.warning(
                "no databricks.cluster_id is set, will create a new databricks cluster - please remember"
                " to configure your cluster parameters."
            )
class MyMultipleOutputs(Task):
    p_str = parameter.value("some_string")
    p_int_with_default = parameter.value(0)

    output_str = parameter.output[str]
    output_int = parameter.output[int]

    def run(self):
        logging.info("I am running")
        self.output_str = "success"
        self.output_int = 2
class MXNetTask(PythonTask):
    seed = parameter.value(1)
    batch_size = parameter.value(100)

    def run(self):
        mx.random.seed(self.seed)
        ctx = mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()
        self.main(ctx=ctx)

    def to_ndarray_iterator(self, data_file, label_file, shuffle=False):
        return mnist_to_ndarray_iterator(
            data_file.path, label_file.path, self.batch_size, shuffle
        )
class MyExpTask(DecoratedPythonTask):
    custom_name = parameter.value("aa")

    previous_exp = parameter.value(1)
    score_card = output.csv.data
    my_ratio = output.csv.data

    def run(self):
        # wrapping code
        score = self._invoke_func()
        self.score_card.write(str(score))
        self.my_ratio.write_pickle(self.previous_exp + 1)
class ApacheBeamConfig(Config):
    """Apache Beam (-s [TASK].beam.[PARAM]=[VAL] for specific tasks)"""

    # we don't want the Spark class to inherit from this one, as it should have Config behaviour
    _conf__task_family = "beam"

    jar = parameter.value(None, description="Main application jar")[str]

    verbose = parameter.value(
        False,
        description="Whether to pass the verbose flag to spark-submit process for debugging",
    )

    options = parameter(empty_default=True)[Dict[str, str]]
class SparkLogParserConfig(Config):
    """(Advanced) Apache Spark log parser"""

    _conf__task_family = "spark_log_parser"

    error_regex_pattern = parameter.value(
        default="([A-Z][a-z]+)+Error| Error | Exception ",
        description="regular expression to find errors in spark logs.",
    )

    lines_to_show = parameter.value(
        default=4, description="log lines to show for each error snippet"
    )

    snippets_to_show = parameter.value(
        default=3, description="error snippets to show in error message"
    )
class SimplestTask(PythonTask):
    simplest_param = parameter.value("1")
    simplest_output = output

    def run(self):
        logger.info("We are running some simplest code!")
        self.simplest_output.write(self.simplest_param)
class SleepyTask(SimplestTask):
    sleep_time = parameter.value(0.1, significant=False)

    def run(self):
        if self.sleep_time:
            time.sleep(self.sleep_time)
        super(SleepyTask, self).run()
class TaskInfoParamsTask(TTask):
    str_param = parameter[str]
    num_param = parameter[int]
    list_param = parameter[List[int]]
    date_param = parameter.value(DateValueType().parse_from_str("2015-04-03"))
    false_param = parameter.value(False)
    true_param = parameter.value(True)

    def run(self):
        super(TaskInfoParamsTask, self).run()
        assert self.str_param == "15"
        assert self.num_param == 12
        assert self.list_param == [1, 2, 3]
        assert self.date_param == datetime.date(2015, 4, 3)
        assert not self.false_param
        assert self.true_param
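# The asserts above only pass when the parameters are overridden at invocation
# time, e.g. from the CLI (a sketch, assuming dbnd's standard `--set` override
# syntax; exact quoting may vary by shell):
#
#   dbnd run TaskInfoParamsTask --set str_param=15 --set num_param=12 --set "list_param=[1, 2, 3]"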
class MyTask(TTask):
    param1 = parameter[int]
    param2 = parameter.value(default=False)

    def run(self):
        super(MyTask, self).run()
        assert self.param1 == 1 and self.param2
class TMultipleInjectPipeline(PipelineTask):
    t_types = parameter.value([1, 2])
    t_output = output

    def band(self):
        t_inputs = {t: TTask(t_param=t).t_output for t in self.t_types}
        self.t_output = TTaskCombineInputs(t_inputs=t_inputs).t_output
class ParallelTasksPipeline(PipelineTask):
    num_of_tasks = parameter.value(3)

    def band(self):
        tasks = []
        for i in range(self.num_of_tasks):
            tasks.append(SleepyTask(simplest_param=str(i)))
        return tasks
class TLongTimeRunning(TTask):
    sleep = parameter.value(default=0)

    def run(self):
        if self.sleep:
            sleep(self.sleep)
        super(TLongTimeRunning, self).run()
        raise Exception("Some user error")
class B_F4Task(PythonTask):
    t_param = parameter.value(default="B")
    a1_input = data
    o_output = output

    def run(self):
        self.o_output.write("done %s\n" % self.t_param)
def test_calculate_alpha_value_factory(self):
    #### DOC START
    @task(alpha=parameter.value(0.5))
    def calculate_alpha(alpha) -> float:
        return alpha

    #### DOC END
    calculate_alpha.dbnd_run()
class A(TTask):
    task_namespace = "mynamespace"
    t = parameter.value(((1, 2), (3, 4)))
    expected = parameter[Tuple]

    def complete(self):
        if self.t != self.expected:
            raise ValueError
        return True
class A(TTask):
    task_namespace = "mynamespace"
    l_param = parameter.value([1, 2, 3])
    expected = parameter[List[int]]

    def complete(self):
        if self.l_param != self.expected:
            raise ValueError
        return True
class A(TTask):
    task_namespace = "mynamespace"
    p1 = parameter.value(100)
    expected = parameter[int]

    def complete(self):
        if self.p1 != self.expected:
            raise ValueError
        return True
class TMultipleOutputsPipeline(PipelineTask):
    t_types = parameter.value([1, 2])
    t_output = output

    def band(self):
        self.t_output = {t: TTask(t_param=t).t_output for t in self.t_types}
class DataflowConfig(EngineConfig):
    """Google Dataflow"""

    _conf__task_family = "dataflow"

    project = parameter[str]
    region = parameter(default=DEFAULT_DATAFLOW_LOCATION)[str]
    temp_location = parameter(default=None).folder[Target]

    poll_sleep = parameter.value(5)
    options = parameter(empty_default=True)[Dict[str, str]]
    runner = parameter.value("DataflowRunner")

    def get_beam_ctrl(self, task_run):
        from dbnd_gcp.dataflow.dataflow import DataFlowJobCtrl

        return DataFlowJobCtrl(task_run)
class DataSplitIntoMultipleOutputs(PythonTask):
    parts = parameter.value(3)
    splits = output.csv.folder(output_factory=_custom_outputs_factory)

    def run(self):
        for key, split in self.splits.items():
            train, test = split
            train.write(key)
            test.write(key)
class TGeneratedOutputs(PythonTask):
    parts = parameter.value(3)
    splits = output.csv.folder(output_factory=_get_all_splits)

    def run(self):
        for key, split in self.splits.items():
            train, test = split
            train.write(key)
            test.write(key)
class C_F4Task(PythonTask):
    t_param = parameter.value("C")
    b_input = parameter.data
    a2_input = parameter.data
    o_output = output

    def run(self):
        self.o_output.write("done %s\n" % self.t_param)
class PrepareSaladAtSpark(PipelineTask):
    vegetables = data(default=data_repo.vegetables)
    dressing = parameter.value("oil")

    salad = output.data

    def band(self):
        s1 = CutAtSpark(vegetables=self.vegetables)
        self.salad = AddDressingAtSpark(
            chopped_vegetables=s1.chopped_vegetables, dressing=self.dressing
        )
class TComplicatedTask(SimplestTask):
    specific_input = data.target
    task_input = data.target
    some_param = parameter.value(1)

    def run(self):
        self.log_metric("some_metric", 1)
        self.log_metric("some_metric1", 2.0)
        self.log_metric("m_string", "my_metric")
        self.log_metric("m_tuple", (1, 2, "complicated"))
        super(TComplicatedTask, self).run()
class PrepareSalad(PipelineTask):
    vegetables = data(default=data_repo.vegetables).target
    dressing = parameter.value(default="oil", description="dressing for the salad")

    salad = output

    def band(self):
        s1 = Cut(vegetables=self.vegetables)
        self.salad = AddDressing(
            chopped_vegetables=s1.chopped_vegetables, dressing=self.dressing
        ).salad
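# A usage sketch for the pipeline above (assumptions: dbnd Task instances expose
# dbnd_run(), and "my_vegetables.csv" is a hypothetical input path standing in
# for the data_repo default):

PrepareSalad(vegetables="my_vegetables.csv", dressing="vinegar").dbnd_run()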
class AggregateTopArtists(PipelineTask):
    period = parameter.value(timedelta(days=2))

    def band(self):
        streams = [
            Stream(task_name="Stream_%s" % i, task_target_date=d).stream
            for i, d in enumerate(period_dates(self.task_target_date, self.period))
        ]
        artists = ArtistAggregator(streams=streams)
        top_n = TopNArtists(artists=artists.index)
        return top_n