def test_copy(self):
    src = os.path.join(self.path, "src.txt")
    dest = os.path.join(self.path, "newdir", "dest.txt")
    target(src).open("w").close()
    self.fs.copy(src, dest)
    assert os.path.exists(src)
    assert os.path.exists(dest)

def t_f_pathlib(a, b):
    # type: (Path, Path) -> str
    assert isinstance(a, Path), type(a)
    assert isinstance(b, Path), type(b)
    target(str(b)).mkdir_parent()
    with open(str(b), "w") as fp:
        fp.write("ok")
    return str(a)

def _process_xcom_value(value):
    if isinstance(value, BaseOperator):
        value = build_xcom_str(value)
        upstream_task_ids.append(value.task_id)
    if isinstance(value, XComStr):
        upstream_task_ids.append(value.task_id)
        return target("xcom://%s" % value)
    if self._is_jinja_arg(value):
        return target("jinja://%s" % value)
    return value

def test_create_dataset(self):
    sample = target(sample_path("sample_data.csv")).as_pandas.read_csv()
    for x in range(2, 5, 1):
        replication = pow(100, x)
        logger.info("Sampling %s", replication)
        sample = pd.concat([sample] * 100)
        t = target(sample_generated(replication, file.csv))
        t.as_pandas.to_csv(sample)
        logger.info("Sample %s -> %s ", replication, os.stat(t.path).st_size)

def test_cache(self, pandas_data_frame):
    t = target(self.path)
    df = pandas_data_frame
    t.as_pandas.to_parquet(df, cache=True)
    df_cache = t.as_pandas.read_parquet(no_copy_on_read=True)
    assert id(df) == id(df_cache)

    # validate real read
    actual = target(t.path).as_pandas.read_parquet()
    assert_frame_equal(actual, df)

def test_dict_parse(self):
    p = parameter[Dict[str, Path]]._p  # type: ParameterDefinition
    data = {"a": target("/a"), "b": target("/b")}

    actual = p.calc_init_value(data)
    for k, v in actual.items():
        assert isinstance(v, Target)

    runtime_value = p.calc_runtime_value(actual, None)
    for k, v in runtime_value.items():
        assert isinstance(v, Path)

def test_list_parse(self):
    p = parameter[List[Path]]._p  # type: ParameterDefinition
    data = [target("/a"), target("/b")]

    actual = p.calc_init_value(data)
    for l in actual:
        assert isinstance(l, Target)

    runtime_value = p.calc_runtime_value(actual, None)
    for l in runtime_value:
        assert isinstance(l, Path)

def test_copy(self):
    t = target(self.path)
    f = t.open("w")
    test_data = "test"
    f.write(test_data)
    f.close()
    assert os.path.exists(self.path)
    assert not os.path.exists(self.copy)

    t.copy(self.copy)
    assert os.path.exists(self.path)
    assert os.path.exists(self.copy)
    assert t.open("r").read() == target(self.copy).open("r").read()

def test_task_metrics_simple(self, tmpdir, pandas_data_frame):
    metrics_folder = target(str(tmpdir))
    task_run = Mock()
    task_run.meta_files = TaskRunMetaFiles(metrics_folder)
    t = FileTrackingStore()
    tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
    tr_tracker.settings.features.get_value_meta_conf = Mock(
        return_value=ValueMetaConf.enabled()
    )
    tr_tracker.log_metric("a", 1)
    tr_tracker.log_metric("a_string", "1")
    tr_tracker.log_metric("a_list", [1, 3])
    tr_tracker.log_metric("a_tuple", (1, 2))

    user_metrics = TaskRunMetricsFileStoreReader(
        metrics_folder
    ).get_all_metrics_values(MetricSource.user)
    assert user_metrics == {
        "a": 1.0,
        "a_list": "[1, 3]",
        "a_string": 1.0,
        "a_tuple": "(1, 2)",
    }

def __init__(self, task_run):
    super(TaskRunLogManager, self).__init__(task_run)
    if hasattr(task_run.task, "airflow_log_file"):
        self.local_log_file = target(task_run.task.airflow_log_file)
    else:
        self.local_log_file = self.task_run.local_task_run_root.partition(
            name="%s.log" % task_run.attempt_number
        )

    if os.getenv("DBND__LOG_SPARK"):
        self.local_spark_log_file = self.task_run.local_task_run_root.partition(
            name="%s-spark.log" % task_run.attempt_number
        )

    self.local_heartbeat_log_file = self.task_run.local_task_run_root.partition(
        name="%s.heartbeat.log" % task_run.attempt_number
    )

    self.remote_log_file = None
    if not isinstance(self.task.task_env, LocalEnvConfig):
        self.remote_log_file = self.task_run.attempt_folder.partition(
            "%s.log" % task_run.attempt_number
        )

    # file handler for task log
    # if set -> we are in the context of capturing
    self._log_task_run_into_file_active = False

def test_pass_on_injected_format(self):
    task = TTextDataTask(
        text_data=target(
            scenario_path("data/some_unknown_ext.myext"), config=file.txt
        )
    )
    assert_run_task(task)

def raise_failure(failure):
    if failure == "missing_params":
        return TTaskMissingParamsMultiple()
    elif failure == "read_error":
        return target("not_exists").read()
    else:
        raise Exception("just an error")

def test_pandas_task(self, tmpdir, pandas_data_frame):
    class PandasTask(PythonTask):
        some_param = parameter[str]
        p_input = data.target
        p_output = output.json

        def run(self):
            data = self.p_input.read_df()
            logger.warning("Data at run %s: %s", self.task_name, data)
            data.to_target(self.p_output)

    pandas_target = target(str(tmpdir.join("pandas.csv")))
    pandas_data_frame.to_target(pandas_target)

    task = PandasTask(p_input=pandas_target.path, some_param=1, task_name="first")
    task_second = PandasTask(p_input=task.p_output, some_param=2, task_name="second")
    task_second.dbnd_run()
    assert task_second.p_output
    print(task_second.p_output.read_df())

def main(): parser = argparse.ArgumentParser( description="Test that gdb can talk to a RISC-V target.", epilog=""" Example command line from the real world: Run all RegsTest cases against a physical FPGA, with custom openocd command: ./gdbserver.py --freedom-e300 --server_cmd "$HOME/SiFive/openocd/src/openocd -s $HOME/SiFive/openocd/tcl -d" Simple """) targets.add_target_options(parser) testlib.add_test_run_options(parser) # TODO: remove global global parsed # pylint: disable=global-statement parsed = parser.parse_args() target = targets.target(parsed) testlib.print_log_names = parsed.print_log_names if parsed.xlen: target.xlen = parsed.xlen module = sys.modules[__name__] return testlib.run_all_tests(module, target, parsed)
def load_task_params_from_task_band(self, task_band, task_params):
    task_band_value = target(task_band).as_object.read_json()

    new_params = []
    found = []
    source = "task_band.json"
    for p_value in task_params:
        if p_value.name not in task_band_value or p_value.name == "result":
            new_params.append(p_value)
            continue

        value = p_value.parameter.calc_init_value(task_band_value[p_value.name])
        found.append(p_value.name)
        new_parameter_value = ParameterValue(
            parameter=p_value.parameter,
            source=source,
            source_value=value,
            value=value,
        )
        new_params.append(new_parameter_value)

    logger.info(
        "Loading task '{task_family}' from {task_band}:\n"
        "\tfields taken:\t{found}".format(
            task_family=self.task_family, task_band=task_band, found=",".join(found)
        )
    )

    return new_params

def test_task_metrics_simple(self, tmpdir, pandas_data_frame):
    metrics_folder = target(str(tmpdir))
    task_run = Mock()
    task_run.meta_files = TaskRunMetaFiles(metrics_folder)
    t = FileTrackingStore()
    tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
    tr_tracker.log_metric("a", 1)
    tr_tracker.log_metric("a_string", "1")
    tr_tracker.log_metric("a_list", [1, 3])
    tr_tracker.log_metric("a_tuple", (1, 2))
    tr_tracker.log_dataframe("df", pandas_data_frame)

    actual = TaskRunMetricsFileStoreReader(metrics_folder).get_all_metrics_values()
    print(actual)
    assert actual == {
        "a": 1.0,
        "a_list": "[1, 3]",
        "a_string": 1.0,
        "a_tuple": "(1, 2)",
        "df.preview": "Names Births",
        "df.schema": "{",
        "df.shape": "[5, 2]",
        "df.shape_0_": 5.0,
        "df.shape_1_": 2.0,
    }

def validate(self, df, target_config, read_kwargs=None, write_kwargs=None):
    target_read_f, target_write_f = _get_read_write_functions(target_config)
    write_kwargs = write_kwargs or {}
    read_kwargs = read_kwargs or {}

    t = self.get_target(target_config)

    # regular write - should automatically select format
    df.to_target(t, **write_kwargs)

    # now we read with automatic read
    actual_1 = t.read_df(**read_kwargs)
    assert_frame_equal(actual_1, df)
    assert id(actual_1) != id(df)

    # use explicit function
    actual_2 = target_read_f(t, **read_kwargs)
    assert_frame_equal(actual_2, df)
    assert id(actual_2) != id(df)

    # validate with different target
    t_2 = target(t.path)
    actual = target_read_f(t_2, **read_kwargs)
    assert_frame_equal(actual, df)

    # validate write via pandas ctrl
    t_3 = self.get_target(target_config, name="local_file_2")
    assert not t_3.exists()
    target_write_f(t_3, df, **write_kwargs)
    actual = target_read_f(t_3, **read_kwargs)
    assert_frame_equal(actual, df)

def test_task_metrics_histograms(self, tmpdir, pandas_data_frame):
    metrics_folder = target(str(tmpdir))
    task_run = Mock()
    task_run.meta_files = TaskRunMetaFiles(metrics_folder)
    t = FileTrackingStore()
    tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
    tr_tracker.settings.tracking.get_value_meta_conf = Mock(
        return_value=ValueMetaConf.enabled()
    )
    tr_tracker.log_data("df", pandas_data_frame, meta_conf=ValueMetaConf.enabled())

    hist_metrics = TaskRunMetricsFileStoreReader(
        metrics_folder
    ).get_all_metrics_values(MetricSource.histograms)

    expected_preview = (
        " Names Births Married\n"
        " Bob 968 True\n"
        " Jessica 155 False\n"
        " Mary 77 True\n"
        " John 578 False\n"
        " Mel 973 True"
    )

    # std value varies in different py versions due to float precision fluctuation
    df_births_std = hist_metrics["df.Births.std"]
    assert df_births_std == pytest.approx(428.4246)

def log_artifact(self, task_run, name, artifact, artifact_target):
    artifact_target.mkdir_parent()
    if isinstance(artifact, six.string_types):
        from targets.dir_target import DirTarget

        artifact_target_source = target(artifact)
        if isinstance(artifact_target_source, DirTarget):
            artifact_target_source.copy(artifact_target)
        else:
            data = artifact_target_source.read()
            artifact_target.write(data)
        return artifact_target

    if PYPLOT_INSTALLED and isinstance(artifact, Figure):
        temp = BytesIO()
        artifact.savefig(temp)
        temp.seek(0)
        artifact_target.write(temp.read(), mode="wb")
        return artifact_target

    raise DatabandRuntimeError(
        "Could not recognize artifact of type %s, must be string or matplotlib Figure"
        % type(artifact)
    )

def execute(
    ctx, dbnd_run, disable_tracking_api, expected_dbnd_version, expected_python_version
):
    """Execute databand primitives"""
    if expected_python_version and expected_dbnd_version:
        spark_python_version = get_python_version()
        spark_dbnd_version = get_dbnd_version()
        if expected_python_version != spark_python_version:
            warn(
                "You submitted the job using Python {} but the Spark cluster uses Python {}. "
                "To ensure execution consistency use the same version in both places. "
                "Execution will continue but it may fail due to the version mismatch.".format(
                    expected_python_version, spark_python_version
                ),
                DbndVersionsClashWarning,
            )
        if expected_dbnd_version != spark_dbnd_version:
            warn(
                "You submitted the job using dbnd {} but the Spark cluster uses dbnd {}. "
                "To ensure execution consistency use the same version in both places. "
                "Execution will continue but it may fail due to the version mismatch.".format(
                    expected_dbnd_version, spark_dbnd_version
                ),
                DbndVersionsClashWarning,
            )

    from targets import target

    with env_context(**{ENV_DBND__TRACKING: "False"}):
        run = RunExecutor.load_run(
            dump_file=target(dbnd_run), disable_tracking_api=disable_tracking_api
        )
        ctx.obj = {"run": run, "disable_tracking_api": disable_tracking_api}

def _local_multitarget():
    return MultiTarget(
        [
            target(
                os.path.join(DBND_LOCAL_ROOT, LOCAL_SYNC_CACHE_NAME),
                subtarget.path.lstrip("/"),
            )
            for subtarget in my_multitarget.targets
        ]
    )

def load_task_params_from_task_band(self, task_band, task_params):
    task_band_value = target(task_band).as_object.read_json()

    new_params = {}
    found = []
    source = "task_band.json"
    for name, p_value in iteritems(task_params):
        if name not in task_band_value or name == RESULT_PARAM:
            new_params[name] = p_value
            continue

        value = p_value.parameter.calc_init_value(task_band_value[name])
        found.append(name)
        new_parameter_value = ParameterValue(
            parameter=p_value.parameter,
            source=source,
            source_value=value,
            value=value,
        )
        new_params[new_parameter_value.name] = new_parameter_value

    logger.info(
        "Loading task '{task_family}' from {task_band}:\n"
        "\tfields taken:\t{found}".format(
            task_family=self.task_family, task_band=task_band, found=",".join(found)
        )
    )

    return new_params

def test_no_cache_for_csv(self, pandas_data_frame):
    t = target(self.path)
    df = pandas_data_frame
    t.as_pandas.to_csv(df, index=False, cache=True)
    from_cache = t.as_pandas.read_csv(no_copy_on_read=True)
    assert id(from_cache) != id(df)

def test_word_count_pyspark(self):
    logging.info("Running %s", WordCountPySparkTask)
    actual = WordCountPySparkTask(
        text=TEXT_FILE, task_version=str(random.random()), override=conf_override
    )
    actual.dbnd_run()
    print(target(actual.counters.path, "part-00000").read())

def test_dbnd_run(self, tmpdir):
    t = targets.target(tmpdir.join("task_output"))
    args = [
        TTask.get_task_family(),
        "-r",
        "t_param=10",
        "-r",
        "t_output=" + t.path,
    ]
    run_dbnd_subprocess_test(args)
    assert t.exists()

def _build_base_pod(self) -> k8s.V1Pod:
    from kubernetes.client import ApiClient

    basis_pod_yaml = target(self.pod_yaml).read()
    basis_pod_dict = yaml.safe_load(basis_pod_yaml) or {}
    api_client = ApiClient()
    return api_client._ApiClient__deserialize_model(basis_pod_dict, k8s.V1Pod)

def partner_file_data_location(name, task_target_date):
    rand = random()
    if rand < 0.2:
        partner_file = "data/big_file.csv"
    else:
        partner_file = "data/small_file.csv"
    return target(partner_file)

def test_universal_feature_store_hdf5(self, tmpdir, sample_feature_store):
    write_target = target(tmpdir, "output.hdf5")
    universal = MyFeatureStoreValueTypeUniversal()
    universal.save_to_target(target=write_target, value=sample_feature_store)

    actual = universal.load_from_target(target=write_target)
    assert_frame_equal(actual.features, sample_feature_store.features)

def sync(self, local_file):
    if not local_file:
        # should return None, not empty string to be compatible with airflow code
        return None

    if not isinstance(local_file, Target):
        local_file = target(local_file)

    return str(self._sync(local_file))

def execute(ctx, dbnd_run, disable_tracking_api):
    """Execute databand primitives"""
    from dbnd._core.run.databand_run import DatabandRun
    from targets import target

    run = DatabandRun.load_run(
        dump_file=target(dbnd_run), disable_tracking_api=disable_tracking_api
    )
    ctx.obj = {"run": run, "disable_tracking_api": disable_tracking_api}

def main(): parser = argparse.ArgumentParser( description="Test that gdb can talk to a RISC-V target.", epilog=""" Example command line from the real world: Run all RegsTest cases against a physical FPGA, with custom openocd command: ./gdbserver.py --freedom-e300 --server_cmd "$HOME/SiFive/openocd/src/openocd -s $HOME/SiFive/openocd/tcl -d" Simple """) targets.add_target_options(parser) testlib.add_test_run_options(parser) # TODO: remove global global parsed # pylint: disable=global-statement parsed = parser.parse_args() target = targets.target(parsed) testlib.print_log_names = parsed.print_log_names module = sys.modules[__name__] return testlib.run_all_tests(module, target, parsed)