def main(run_on_schedule: bool = True):
    """
    Main function. Creates output directories, initialises the database,
    parses a workflows definition file to define workflows and configure
    the available dates sensor, and runs the available dates sensor.

    Parameters
    ----------
    run_on_schedule : bool, default True
        Set run_on_schedule=False to run the sensor only once, ignoring the
        schedule. (useful for testing)
    """
    # Set up a stream handler on this module's logger.
    # TODO: Use structlog (not sure whether it will be possible for the prefect logger)
    level_name = os.environ["AUTOFLOW_LOG_LEVEL"]
    log = logging.getLogger(__name__)
    stream_handler = logging.StreamHandler()
    # Match prefect format for now
    fmt = logging.Formatter("[%(asctime)s] %(levelname)s - %(name)s | %(message)s")
    fmt.converter = time.gmtime  # timestamps in UTC
    stream_handler.setFormatter(fmt)
    log.addHandler(stream_handler)
    log.setLevel(level_name)
    log.info(f"Log level for logger '{__name__}' set to '{level_name}'.")

    # Create the directories that notebooks and reports will be written to.
    outputs_dir = Path(os.environ["AUTOFLOW_OUTPUTS_DIR"])
    log.info(
        f"Creating output directories '{outputs_dir/'notebooks'}' and '{outputs_dir/'reports'}'."
    )
    for subdir in ("notebooks", "reports"):
        (outputs_dir / subdir).mkdir(exist_ok=True)

    # Initialise the database.
    # Note: AUTOFLOW_DB_URI must be an env var so that it can be used in prefect.config, so we read it using os.environ.
    # AUTOFLOW_DB_PASSWORD can (and should) be a docker secret, so we read it using get_secret_or_env_var.
    connection_uri = os.environ["AUTOFLOW_DB_URI"]
    log.info(f"Initialising database '{connection_uri}'.")
    init_db(connection_uri.format(getenv("AUTOFLOW_DB_PASSWORD", "")))

    # Define workflows and the sensor configuration from the definition file.
    inputs_dir = os.environ["AUTOFLOW_INPUTS_DIR"]
    log.info(f"Creating workflows defined in '{Path(inputs_dir)/'workflows.yml'}'.")
    workflow_storage, sensor_config = parse_workflows_yaml("workflows.yml", inputs_dir)

    # Start the available dates sensor with the parsed configuration.
    log.info("Running available dates sensor.")
    available_dates_sensor.schedule = sensor_config["schedule"]
    available_dates_sensor.run(
        workflow_configs=sensor_config["workflows"],
        cdr_types=sensor_config["cdr_types"],
        workflow_storage=workflow_storage,
        run_on_schedule=run_on_schedule,
    )
def test_parse_workflows_yaml_missing_workflows(tmp_path):
    """
    Test that parse_workflows_yaml raises a ValueError if the input file
    doesn't have a 'workflows' key.
    """
    # Input file with an 'available_dates_sensor' section but no top-level
    # 'workflows' section.
    input_file_content = dedent(
        """\
        available_dates_sensor:
          schedule: "0 0 * * *"
          workflows:
            - workflow_name: workflow1
        """
    )
    (tmp_path / "dummy_input.yml").write_text(input_file_content)
    with pytest.raises(
        ValueError, match="Input file does not have a 'workflows' section."
    ):
        parse_workflows_yaml(filename="dummy_input.yml", inputs_dir=str(tmp_path))
def test_parse_workflows_yaml_missing_available_dates_sensor(tmp_path):
    """
    Test that parse_workflows_yaml raises a ValueError if the input file
    doesn't have an 'available_dates_sensor' key.
    """
    # The referenced notebook file must exist so that the 'workflows' section
    # parses successfully; the error under test is the missing
    # 'available_dates_sensor' section.
    (tmp_path / "notebook1.ipynb").touch()
    (tmp_path / "dummy_input.yml").write_text(
        dedent(
            """\
            workflows:
              - name: workflow1
                notebooks:
                  notebook1:
                    filename: notebook1.ipynb
            """
        )
    )
    with pytest.raises(
        ValueError,
        match="Input file does not have an 'available_dates_sensor' section.",
    ):
        workflow_storage, sensor_config = parse_workflows_yaml(
            filename="dummy_input.yml", inputs_dir=str(tmp_path)
        )
def test_parse_workflows_yaml(tmp_path):
    """
    Test that parse_workflows_yaml correctly parses an example input file.
    """
    # The notebooks and report template referenced by the definition file must
    # exist on disk for parsing to succeed.
    for referenced_file in (
        "notebook1.ipynb",
        "notebook2.ipynb",
        "notebook3.ipynb",
        "custom_template.tpl",
    ):
        (tmp_path / referenced_file).touch()
    (tmp_path / "dummy_input.yml").write_text(
        dedent(
            """\
            workflows:
              - name: workflow1
                notebooks:
                  notebook1:
                    filename: notebook1.ipynb
                    parameters:
                      url: flowapi_url
                      date: reference_date
                      extra: dummy_param
                  notebook2:
                    filename: notebook2.ipynb
                    parameters:
                      ranges: date_ranges
                      other: notebook1
                    output:
                      format: pdf
                      template: custom_template.tpl
              - name: workflow2
                notebooks:
                  the_notebook:
                    filename: notebook3.ipynb
                    output:
                      format: pdf
            available_dates_sensor:
              schedule: "0 0 * * *"
              cdr_types:
                - calls
                - sms
              workflows:
                - workflow_name: workflow1
                  parameters:
                    dummy_param: 123
                  earliest_date: 2016-01-01
                  date_stencil: [[2016-01-01, 2016-01-03], -1, 0]
                - workflow_name: workflow2
            """
        )
    )

    parsed_storage, sensor_config = parse_workflows_yaml(
        filename="dummy_input.yml", inputs_dir=str(tmp_path)
    )

    # Both named workflows should have been stored in a prefect Storage object.
    assert isinstance(parsed_storage, storage.Storage)
    assert "workflow1" in parsed_storage
    assert "workflow2" in parsed_storage
    # The sensor config should contain the schedule, CDR types and per-workflow
    # configurations from the 'available_dates_sensor' section.
    assert isinstance(sensor_config["schedule"], Schedule)
    assert sensor_config["cdr_types"] == ["calls", "sms"]
    expected_workflow_configs = [
        WorkflowConfig(
            workflow_name="workflow1",
            parameters={"dummy_param": 123},
            earliest_date=datetime.date(2016, 1, 1),
            date_stencil=DateStencil(
                [[datetime.date(2016, 1, 1), datetime.date(2016, 1, 3)], -1, 0]
            ),
        ),
        WorkflowConfig(workflow_name="workflow2"),
    ]
    assert sensor_config["workflows"] == expected_workflow_configs