def test_mixed_db_sql_spec(tmp_pipeline_sql, add_current_to_sys_path,
                           pg_client_and_schema, monkeypatch):
    """Build a pipeline whose tasks use both a Postgres and a SQLite client."""
    _, schema = pg_client_and_schema

    with open('pipeline-multiple-dbs.yaml') as f:
        # safe_load is the idiomatic equivalent of load(f, Loader=SafeLoader)
        dag_spec = yaml.safe_load(f)

    # clients for this pipeline are initialized without custom create_engine
    # args but we need to set the default schema, mock the call so it
    # includes that info
    monkeypatch.setattr(db, 'create_engine', create_engine_with_schema(schema))

    dates = _random_date_from(datetime(2016, 1, 1), 365, 100)
    df = pd.DataFrame({
        'customer_id': np.random.randint(0, 5, 100),
        'value': np.random.rand(100),
        'purchase_date': dates
    })

    # make sales data for pg and sqlite
    loader = load_dotted_path(dag_spec['clients']['PostgresRelation'])
    client = loader()
    df.to_sql('sales', client.engine, if_exists='replace')
    client.engine.dispose()

    # make sales data for pg and sqlite
    loader = load_dotted_path(dag_spec['clients']['SQLiteRelation'])
    client = loader()
    df.to_sql('sales', client.engine)
    client.engine.dispose()

    dag = DAGSpec(dag_spec).to_dag()

    # FIXME: this does not show the custom Upstream key missing error
    dag.build()
def test_load_dotted_path_if_attribute_not_found(path, err_msg, root,
                                                 tmp_directory, tmp_imports):
    """The AttributeError must reference the resolved module location."""
    # an importable but empty module, plus a module inside a plain directory
    Path('another').mkdir()
    Path('another', 'sub.py').touch()
    Path('my_module.py').write_text('')

    with pytest.raises(AttributeError) as exc:
        dotted_path.load_dotted_path(path)

    assert str(exc.value) == err_msg.format(repr(os.path.abspath(root)))
def test_load_dotted_path_if_import_fails(path, err_msg, tmp_directory,
                                          tmp_imports):
    """When the top-level module resolves, the error names its origin."""
    Path('my_module.py').write_text('import something')

    mod_name, _, _ = path.partition('.')
    spec = importlib.util.find_spec(mod_name)

    # an importable top-level module extends the expected message with
    # the path it was loaded from
    if spec:
        err_msg += f' (loaded {mod_name!r} from {spec.origin!r})'

    with pytest.raises(ModuleNotFoundError) as exc:
        dotted_path.load_dotted_path(path)

    assert str(exc.value) == err_msg
def process_factory_dotted_path(self, dotted_path):
    """Parse a factory entry point, returns initialized dag and parsed args

    Loads the callable behind ``dotted_path``, registers CLI arguments
    derived from its signature (and from env.yaml keys when the callable
    was decorated with @with_env), parses the command line, then calls
    the factory with the collected values.
    """
    # raise_=True: a broken entry point should fail loudly here
    entry = load_dotted_path(str(dotted_path), raise_=True)

    # add args using the function's signature
    required, _ = _add_args_from_callable(self, entry)

    # if entry point was decorated with @with_env, add arguments
    # to replace declared variables in env.yaml
    if hasattr(entry, '_env_dict'):
        _add_cli_args_from_env_dict_keys(self, entry._env_dict)

    args = self.parse_args()

    # configure logging level when the user passed --log
    if hasattr(args, 'log'):
        if args.log is not None:
            logging.basicConfig(level=args.log.upper())

    # extract required (by using function signature) params from the cli
    # args
    kwargs = {key: getattr(args, key) for key in required}

    # env and function defaults replaced
    replaced = _env_keys_to_override(args, self.static_args)

    # TODO: add a way of test this by the parameters it will use to
    # call the function, have an aux function to get those then another
    # to execute, test using the first one
    dag = entry(**{**kwargs, **replaced})

    return dag, args
def _to_dag(self):
    """
    Internal method to manage the different cases to convert to a DAG
    object
    """
    # a 'location' entry delegates DAG construction entirely to a
    # user-provided function
    if 'location' in self:
        return dotted_path.call_dotted_path(self['location'])

    dag = DAG()

    if 'config' in self:
        dag._params = DAGConfiguration.from_dict(self['config'])

    clients = self.get('clients')

    if clients:
        for name, spec in clients.items():
            dag.clients[name] = dotted_path.call_spec(spec)

    # FIXME: this violates lazy_import, we must change DAG's implementation
    # to accept strings as attribute and load them until they are called
    for attr in ('serializer', 'unserializer'):
        if attr in self:
            setattr(dag, attr, dotted_path.load_dotted_path(self[attr]))

    process_tasks(dag, self, root_path=self._parent_path)

    return dag
def exists(self):
    """Whether this entry point exists.

    Pattern entries always count as existing; file/directory entries are
    checked on disk; dotted-path entries exist when they can be loaded.
    Returns None for any other type.
    """
    if self.type == self.Pattern:
        return True

    if self.type in {self.Directory, self.File}:
        return Path(self.value).exists()

    if self.type == self.DottedPath:
        return load_dotted_path(self.value, raise_=False) is not None
def test_postgres_sql_spec(tmp_pipeline_sql, pg_client_and_schema,
                           add_current_to_sys_path, monkeypatch):
    """End-to-end build of the Postgres SQL pipeline spec."""
    _, schema = pg_client_and_schema

    with open('pipeline-postgres.yaml') as f:
        # safe_load is the idiomatic equivalent of load(f, Loader=SafeLoader)
        dag_spec = yaml.safe_load(f)

    # clients for this pipeline are initialized without custom create_engine
    # args but we need to set the default schema, mock the call so it
    # includes that info
    monkeypatch.setattr(db, 'create_engine', create_engine_with_schema(schema))

    dates = _random_date_from(datetime(2016, 1, 1), 365, 100)
    df = pd.DataFrame({
        'customer_id': np.random.randint(0, 5, 100),
        'value': np.random.rand(100),
        'purchase_date': dates
    })

    loader = load_dotted_path(dag_spec['clients']['SQLScript'])
    client = loader()
    df.to_sql('sales', client.engine, if_exists='replace')
    client.engine.dispose()

    dag = DAGSpec(dag_spec).to_dag()

    # FIXME: this does not show the custom Upstream key missing error
    dag.build()

    assert not dag['load'].upstream
    assert list(dag['filter'].upstream.keys()) == ['load']
    assert list(dag['transform'].upstream.keys()) == ['filter']
def load(self):
    """Return the callable this source wraps.

    Dotted-path primitives are resolved on each call; otherwise the
    stored callable is returned directly, re-importing its module first
    when hot_reload is enabled.
    """
    if self._from_dotted_path:
        return load_dotted_path(self._primitive)

    if not self.hot_reload:
        return self._primitive

    # re-import so edits made to the module since the first import
    # are picked up
    module = importlib.import_module(self.module_name)
    importlib.reload(module)
    return getattr(module, self.fn_name)
def test_lazily_located_dotted_path(dotted_path_str, tmp_imports):
    """Lazy location/source must match what inspect reports after a real
    import."""
    loc, source = dotted_path.lazily_locate_dotted_path(dotted_path_str)

    # import for real and derive the expected values with inspect
    obj = dotted_path.load_dotted_path(dotted_path_str)
    source_lines, first_line = inspect.getsourcelines(obj)

    assert loc == f'{getfile(obj)}:{first_line}'
    assert source == ''.join(source_lines)
def test_load_dotted_path_with_reload(tmp_directory, add_current_to_sys_path):
    """reload=True must expose symbols added after the first import."""
    # write a sample module
    Path('dotted_path_with_reload.py').write_text("""
def x():
    pass
""")

    # load the module
    dotted_path.load_dotted_path('dotted_path_with_reload.x')

    # add a new function
    Path('dotted_path_with_reload.py').write_text("""
def x():
    pass


def y():
    pass
""")

    # the new function should be importable since we are using reload=True
    assert dotted_path.load_dotted_path('dotted_path_with_reload.y',
                                        reload=True)
def test_defined_name_twice(tmp_directory, add_current_to_sys_path,
                            no_sys_modules_cache):
    """loc must agree whether the source is built from a callable or from a
    dotted-path string, even when the module defines the name twice."""
    Path('a.py').write_text("""
def b():
    pass


def b():
    pass
""")

    from_callable = PythonCallableSource(
        dotted_path.load_dotted_path('a.b')).loc
    from_string = PythonCallableSource('a.b').loc

    assert str(Path(from_string).resolve()) == str(
        Path(from_callable).resolve())
def test_sql_spec_w_products_in_source(tmp_pipeline_sql_products_in_source,
                                       add_current_to_sys_path):
    """Build a SQL pipeline whose products are declared in the source files."""
    with open('pipeline.yaml') as f:
        # safe_load is the idiomatic equivalent of load(f, Loader=SafeLoader)
        dag_spec = yaml.safe_load(f)

    dates = _random_date_from(datetime(2016, 1, 1), 365, 100)
    df = pd.DataFrame({
        'customer_id': np.random.randint(0, 5, 100),
        'value': np.random.rand(100),
        'purchase_date': dates
    })

    loader = load_dotted_path(dag_spec['clients']['SQLScript'])
    client = loader()
    df.to_sql('sales', client.engine, if_exists='replace')
    client.engine.dispose()

    dag = DAGSpec(dag_spec).to_dag()
    dag.build()
def source_for_task_class(source_str, task_class, project_root, lazy_import,
                          make_absolute):
    """Resolve a task's source string.

    PythonCallable tasks keep the string under lazy imports (or load the
    callable right away); every other task class treats the string as a
    file path, made absolute relative to project_root when requested.
    """
    if task_class is tasks.PythonCallable:
        if lazy_import:
            return source_str

        return dotted_path.load_dotted_path(source_str)

    path = Path(source_str)

    # NOTE: there is some inconsistent behavior here. project_root
    # will be none if DAGSpec was initialized with a dictionary, hence
    # this won't resolve to absolute paths - this is a bit confusing.
    # maybe always convert to absolute?
    if project_root and not path.is_absolute() and make_absolute:
        return Path(project_root, source_str)

    return path
def test_sqlite_sql_spec(spec, tmp_pipeline_sql, add_current_to_sys_path):
    """End-to-end build of the SQLite SQL pipeline spec."""
    with open(spec) as f:
        # safe_load is the idiomatic equivalent of load(f, Loader=SafeLoader)
        dag_spec = yaml.safe_load(f)

    dates = _random_date_from(datetime(2016, 1, 1), 365, 100)
    df = pd.DataFrame({
        'customer_id': np.random.randint(0, 5, 100),
        'value': np.random.rand(100),
        'purchase_date': dates
    })

    loader = load_dotted_path(dag_spec['clients']['SQLScript'])
    client = loader()
    df.to_sql('sales', client.engine)
    client.engine.dispose()

    dag = DAGSpec(dag_spec).to_dag()

    # FIXME: this does not show the custom Upstream key missing error
    dag.build()

    assert not dag['load'].upstream
    assert list(dag['filter'].upstream.keys()) == ['load']
    assert list(dag['transform'].upstream.keys()) == ['filter']
def test_loc_is_consisent_when_initialized_from_str(tmp_directory,
                                                    add_current_to_sys_path,
                                                    no_sys_modules_cache,
                                                    target_file,
                                                    dotted_path_str):
    """loc computed lazily from a string must match the one computed from
    the already-imported callable."""
    target = Path(target_file)
    target.parent.mkdir(parents=True, exist_ok=True)

    # make every ancestor directory a package
    for parent in target.parents:
        (parent / '__init__.py').touch()

    target.write_text("""
def symbol():
    pass
""")

    from_string = PythonCallableSource(dotted_path_str).loc

    # check that a.py hasn't been imported
    assert 'a' not in sys.modules

    from_callable = PythonCallableSource(
        dotted_path.load_dotted_path(dotted_path_str)).loc

    assert str(Path(from_string).resolve()) == str(
        Path(from_callable).resolve())
def test_load_dotted_path_custom_error_message():
    """The AttributeError names both the missing attribute and the module."""
    with pytest.raises(AttributeError) as exc:
        dotted_path.load_dotted_path('test_pkg.not_a_function')

    expected = 'Could not get "not_a_function" from module "test_pkg"'
    assert expected in str(exc.value)