Example #1
def dbnd_doctor(
    python_sanity=parameter.value(True)[bool],
    airflow_sanity=parameter.value(True)[bool],
    logs=parameter.value(None)[bool],
    python_packages=parameter.value(None)[bool],
    check_time=datetime.datetime.now(),
    all=False,
):
    if all:
        # override only the params that are still None
        logs = True if logs is None else logs
        python_packages = True if python_packages is None else python_packages

    main_report = DoctorStatusReportBuilder("Dbnd Doctor")
    main_report.log("check_time", check_time)

    system_report = system_dbnd.dbnd_status()
    logger.debug("system_report: %s", system_report)

    if python_sanity:
        system_python_report = system_python.python_status(
            python_packages=python_packages
        )
        main_report.add_sub_report(system_python_report)
    if airflow_sanity:
        airflow_report = system_airflow.airflow_status()
        main_report.add_sub_report(airflow_report)
    if logs:
        system_logging_report = system_logging.logging_status()
        main_report.add_sub_report(system_logging_report)

    logger.info("Your system is good to go! Enjoy Databand!")
    return main_report.get_status_str()
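A hedged invocation sketch, assuming dbnd_doctor is wrapped with dbnd's @task decorator (mirroring the calculate_alpha.dbnd_run() pattern shown in Example #18):

# Enable every optional check; flags set explicitly by the caller are kept as-is.
dbnd_doctor.dbnd_run(all=True)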
Example #2
class PredictWineQualityParameterSearch(PipelineTask):
    data = data(default=test_data_csv).target
    alpha_step = parameter.value(0.3)
    l1_ratio_step = parameter.value(0.4)

    results = output

    def band(self):
        result = {}
        variants = list(
            itertools.product(
                np.arange(0, 1, self.alpha_step), np.arange(0, 1, self.l1_ratio_step)
            )
        )

        # variants = list(itertools.product([0.1, 0.5], [0.2, 0.3]))
        logger.info("All Variants: %s", variants)
        for alpha_value, l1_ratio in variants:
            predict = PredictWineQuality(
                data=self.data, alpha=alpha_value, l1_ratio=l1_ratio
            )

            exp_name = "%f_%f" % (alpha_value, l1_ratio)
            result[exp_name] = (predict.model, predict.validation)
        self.results = result
Example #3
class MyTask(Task):
    p_int = parameter.value(3)
    p_str = parameter.value("check")
    p_int_with_default = parameter.value(0)

    output_str = parameter.output[str]

    def run(self):
        logging.info("I am running")
        self.output_str = "success"
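A hedged usage sketch: overriding the defaults declared with parameter.value, assuming Task classes expose dbnd's dbnd_run() entry point (the parameter values here are hypothetical):

# p_int_with_default keeps its declared default of 0.
MyTask(p_int=5, p_str="other").dbnd_run()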
Example #4
class DatabricksConfig(SparkEngineConfig):
    """Databricks cloud for Apache Spark """

    _conf__task_family = "databricks"
    cluster_type = SparkClusters.databricks

    stop_session_on_finish = False

    cluster_id = parameter(default=None).help("existing cluster id")[str]
    cloud_type = parameter().help("cloud type: aws/azure")[str]

    conn_id = parameter.value(default="databricks_default").help(
        "databricks connection settings"
    )[str]

    connection_retry_limit = parameter.value(default=3).help(
        "databricks connection - retry limit"
    )[int]

    connection_retry_delay = parameter.value(default=1).help(
        "databricks connection - delay in between retries"
    )[int]

    status_polling_interval_seconds = parameter(default=10).help(
        "seconds to sleep between polling databricks for job status."
    )[int]

    cluster_log_conf = parameter(default={}).help(
        'location for logs, like: {"s3": {"destination": "s3://<BUCKET>/<KEY>", "region": "us-east-1"}}'
    )

    # new cluster config
    num_workers = parameter(default=0).help("number of workers, as in the Databricks API")[
        int
    ]
    init_scripts = parameter(default=[]).help(
        "init script list, default:{ 's3': { 'destination' : 's3://init_script_bucket/prefix', 'region' : 'us-west-2' } }'"
    )[List]
    spark_version = parameter().help("spark version")[str]
    spark_conf = parameter(default={}).help("spark config")[Dict]
    node_type_id = parameter(default="").help("nodes for spark machines")[str]
    spark_env_vars = parameter(default={}).help("spark env vars")[Dict]

    def get_spark_ctrl(self, task_run):
        from dbnd_databricks.databricks import DatabricksCtrl

        return DatabricksCtrl(task_run=task_run)

    def _validate(self):
        super(DatabricksConfig, self)._validate()
        if not self.cluster_id:
            logger.warning(
                "no databricks.cluster_id is set, will create a new databricks cluster - please remember"
                " to configure your cluster parameters."
            )
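These config values are normally supplied externally rather than hard-coded; a hedged sketch of overriding them in Python, assuming dbnd's config context manager accepts a {section: {param: value}} mapping (the cluster id below is a placeholder):

from dbnd import config

with config({"databricks": {"cluster_id": "1234-567890-abcde1", "cloud_type": "aws"}}):
    pass  # build and run the Spark task here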
Example #5
class MyMultipleOutputs(Task):
    p_str = parameter.value("some_string")
    p_int_with_default = parameter.value(0)

    output_str = parameter.output[str]
    output_int = parameter.output[int]

    def run(self):
        logging.info("I am running")
        self.output_str = "success"
        self.output_int = 2
Example #6
class MXNetTask(PythonTask):
    seed = parameter.value(1)
    batch_size = parameter.value(100)

    def run(self):
        mx.random.seed(42)
        ctx = mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()
        self.main(ctx=ctx)

    def to_ndarray_iterator(self, data_file, label_file, shuffle=False):
        return mnist_to_ndarray_iterator(data_file.path, label_file.path,
                                         self.batch_size, shuffle)
Example #7
class MyExpTask(DecoratedPythonTask):
    custom_name = parameter.value("aa")

    previous_exp = parameter.value(1)
    score_card = output.csv.data
    my_ratio = output.csv.data

    def run(self):
        # wrapping code
        score = self._invoke_func()

        self.score_card.write(str(score))
        self.my_ratio.write_pickle(self.previous_exp + 1)
Example #8
class ApacheBeamConfig(Config):
    """Apache Beam (-s [TASK].spark.[PARAM]=[VAL] for specific tasks)"""

    # we don't want the Spark class to inherit from this one, as it should have Config behaviour
    _conf__task_family = "beam"

    jar = parameter.value(None, description="Main application jar")[str]

    verbose = parameter.value(
        False,
        description="Whether to pass the verbose flag to spark-submit process for debugging",
    )

    options = parameter(empty_default=True)[Dict[str, str]]
Example #9
class SparkLogParserConfig(Config):
    """(Advanced) Apache Spark log parser"""

    _conf__task_family = "spark_log_parser"

    error_regex_pattern = parameter.value(
        default="([A-Z][a-z]+)+Error| Error | Exception ",
        description="regular expression to find errors in spark logs.",
    )

    lines_to_show = parameter.value(
        default=4, description="log lines to show for each error snippet")

    snippets_to_show = parameter.value(
        default=3, description="error snippets to show in error message")
Example #10
class SimplestTask(PythonTask):
    simplest_param = parameter.value("1")
    simplest_output = output

    def run(self):
        logger.info("We are running some simplest code!")
        self.simplest_output.write(self.simplest_param)
Example #11
class SleepyTask(SimplestTask):
    sleep_time = parameter.value(0.1, significant=False)

    def run(self):
        if self.sleep_time:
            time.sleep(self.sleep_time)
        super(SleepyTask, self).run()
Example #12
class TaskInfoParamsTask(TTask):
    str_param = parameter[str]
    num_param = parameter[int]
    list_param = parameter[List[int]]
    date_param = parameter.value(DateValueType().parse_from_str("2015-04-03"))
    false_param = parameter.value(False)
    true_param = parameter.value(True)

    def run(self):
        super(TaskInfoParamsTask, self).run()
        assert self.str_param == "15"
        assert self.num_param == 12
        assert self.list_param == [1, 2, 3]
        assert self.date_param == datetime.date(2015, 4, 3)
        assert not self.false_param
        assert self.true_param
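run() asserts specific values, so the task only passes when instantiated with them; a hedged sketch (Task.dbnd_run() assumed as the entry point; date_param and the boolean params rely on their declared defaults):

TaskInfoParamsTask(
    str_param="15", num_param=12, list_param=[1, 2, 3]
).dbnd_run()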
Example #13
class MyTask(TTask):
    param1 = parameter[int]
    param2 = parameter.value(default=False)

    def run(self):
        super(MyTask, self).run()
        assert self.param1 == 1 and self.param2
Example #14
class TMultipleInjectPipeline(PipelineTask):
    t_types = parameter.value([1, 2])
    t_output = output

    def band(self):
        t_inputs = {t: TTask(t_param=t).t_output for t in self.t_types}
        self.t_output = TTaskCombineInputs(t_inputs=t_inputs).t_output
Example #15
class ParallelTasksPipeline(PipelineTask):
    num_of_tasks = parameter.value(3)

    def band(self):
        tasks = []
        for i in range(self.num_of_tasks):
            tasks.append(SleepyTask(simplest_param=str(i)))
        return tasks
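A hedged usage sketch: band() fans out num_of_tasks SleepyTask instances, so overriding the parameter changes the pipeline's width (Task.dbnd_run() assumed as the entry point):

ParallelTasksPipeline(num_of_tasks=5).dbnd_run()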
Example #16
class TLongTimeRunning(TTask):
    sleep = parameter.value(default=0)

    def run(self):
        if self.sleep:
            sleep(self.sleep)
        super(TLongTimeRunning, self).run()
        raise Exception("Some user error")
Example #17
class B_F4Task(PythonTask):
    t_param = parameter.value(default="B")
    a1_input = data

    o_output = output

    def run(self):
        self.o_output.write("done %s\n" % self.t_param)
Example #18
    def test_calculate_alpha_value_factory(self):
        #### DOC START
        @task(alpha=parameter.value(0.5))
        def calculate_alpha(alpha) -> float:
            return alpha

        #### DOC END
        calculate_alpha.dbnd_run()
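The declared default can presumably be overridden per run as well; a hedged sketch, assuming dbnd_run forwards keyword arguments to the decorated function:

calculate_alpha.dbnd_run(alpha=0.7)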
Example #19
class A(TTask):
    task_namespace = "mynamespace"
    t = parameter.value(((1, 2), (3, 4)))
    expected = parameter[Tuple]

    def complete(self):
        if self.t != self.expected:
            raise ValueError
        return True
Example #20
class A(TTask):
    task_namespace = "mynamespace"
    l_param = parameter.value([1, 2, 3])
    expected = parameter[List[int]]

    def complete(self):
        if self.l_param != self.expected:
            raise ValueError
        return True
Example #21
class A(TTask):
    task_namespace = "mynamespace"
    p1 = parameter.value(100)
    expected = parameter[int]

    def complete(self):
        if self.p1 != self.expected:
            raise ValueError
        return True
Example #22
class TMultipleOutputsPipeline(PipelineTask):
    t_types = parameter.value([1, 2])
    t_output = output

    def band(self):
        self.t_output = {
            t: TTask(t_param=t).t_output
            for t in self.t_types
        }
Example #23
class DataflowConfig(EngineConfig):
    """Google Dataflow"""

    _conf__task_family = "dataflow"

    project = parameter[str]
    region = parameter(default=DEFAULT_DATAFLOW_LOCATION)[str]
    temp_location = parameter(default=None).folder[Target]

    poll_sleep = parameter.value(5)

    options = parameter(empty_default=True)[Dict[str, str]]
    runner = parameter.value("DataflowRunner")

    def get_beam_ctrl(self, task_run):
        from dbnd_gcp.dataflow.dataflow import DataFlowJobCtrl

        return DataFlowJobCtrl(task_run)
Example #24
class DataSplitIntoMultipleOutputs(PythonTask):
    parts = parameter.value(3)
    splits = output.csv.folder(output_factory=_custom_outputs_factory)

    def run(self):
        for key, split in self.splits.items():
            train, test = split
            train.write(key)
            test.write(key)
Example #25
class TGeneratedOutputs(PythonTask):
    parts = parameter.value(3)
    splits = output.csv.folder(output_factory=_get_all_splits)

    def run(self):
        for key, split in self.splits.items():
            train, test = split
            train.write(key)
            test.write(key)
Example #26
class C_F4Task(PythonTask):
    t_param = parameter.value("C")
    b_input = parameter.data
    a2_input = parameter.data

    o_output = output

    def run(self):
        self.o_output.write("done %s\n" % self.t_param)
Example #27
class PrepareSaladAtSpark(PipelineTask):
    vegetables = data(default=data_repo.vegetables)
    dressing = parameter.value("oil")

    salad = output.data

    def band(self):
        s1 = CutAtSpark(vegetables=self.vegetables)
        self.salad = AddDressingAtSpark(
            chopped_vegetables=s1.chopped_vegetables, dressing=self.dressing)
Example #28
class TComplicatedTask(SimplestTask):
    specific_input = data.target
    task_input = data.target
    some_param = parameter.value(1)

    def run(self):
        self.log_metric("some_metric", 1)
        self.log_metric("some_metric1", 2.0)
        self.log_metric("m_string", "my_metric")
        self.log_metric("m_tuple", (1, 2, "complicated"))
        super(TComplicatedTask, self).run()
Example #29
class PrepareSalad(PipelineTask):
    vegetables = data(default=data_repo.vegetables).target
    dressing = parameter.value(default="oil",
                               description="dressing for the salad")

    salad = output

    def band(self):
        s1 = Cut(vegetables=self.vegetables)
        self.salad = AddDressing(chopped_vegetables=s1.chopped_vegetables,
                                 dressing=self.dressing).salad
Example #30
class AggregateTopArtists(PipelineTask):
    period = parameter.value(timedelta(days=2))

    def band(self):
        streams = [
            Stream(task_name="Stream_%s" % i, task_target_date=d).stream
            for i, d in enumerate(
                period_dates(self.task_target_date, self.period))
        ]
        artists = ArtistAggregator(streams=streams)
        top_n = TopNArtists(artists=artists.index)
        return top_n