Example n. 1
0
def get_config():
    """
    Build the FlowAuth configuration dict from environment variables
    (or docker secrets surfaced through them).

    Returns
    -------
    dict
        Configuration mapping suitable for ``app.config.update``.

    Raises
    ------
    UndefinedConfigOption
        If any required environment variable / docker secret is missing.
    """
    try:
        flowauth_fernet_key = environ["FLOWAUTH_FERNET_KEY"].encode()
        _ = Fernet(flowauth_fernet_key)  # Error if fernet key is bad
        # Unrecognised level names fall back to logging.ERROR.
        log_level = getattr(logging,
                            getenv("FLOWAUTH_LOG_LEVEL", "error").upper(),
                            logging.ERROR)
        # The original nested a second, identical lookup of DB_URI inside the
        # default; a single lookup with the literal default is equivalent.
        db_uri = getenv("DB_URI", "sqlite:////tmp/test.db")
        # The URI may contain a '{}' placeholder for the db password.
        db_uri = db_uri.format(getenv("FLOWAUTH_DB_PASSWORD", ""))

        return dict(
            PRIVATE_JWT_SIGNING_KEY=load_private_key(
                environ["PRIVATE_JWT_SIGNING_KEY"]),
            LOG_LEVEL=log_level,
            ADMIN_USER=environ["FLOWAUTH_ADMIN_USERNAME"],
            ADMIN_PASSWORD=environ["FLOWAUTH_ADMIN_PASSWORD"],
            SQLALCHEMY_DATABASE_URI=db_uri,
            SQLALCHEMY_ENGINE_OPTIONS=dict(pool_recycle=3600),
            SECRET_KEY=environ["SECRET_KEY"],
            SESSION_PROTECTION="strong",
            SQLALCHEMY_TRACK_MODIFICATIONS=False,
            FLOWAUTH_FERNET_KEY=flowauth_fernet_key,
            # Presence of the env var (any value) enables these flags;
            # the comparison already yields a bool.
            DEMO_MODE=getenv("DEMO_MODE") is not None,
            RESET_DB=getenv("RESET_FLOWAUTH_DB") is not None,
            DB_IS_SET_UP=Event(),
            CACHE_BACKEND=get_cache_backend(),
        )
    except KeyError as e:
        raise UndefinedConfigOption(
            f"Undefined configuration option: '{e.args[0]}'. Please set docker secret or environment variable."
        )
Example n. 2
0
def get_cache_backend() -> CacheRegion:
    """
    Construct the dogpile.cache region used for caching, choosing the
    backend from the FLOWAUTH_CACHE_BACKEND env var (REDIS, FILE, or
    anything else for an in-memory fallback).

    Returns
    -------
    CacheRegion
    """
    chosen = getenv("FLOWAUTH_CACHE_BACKEND", "FILE").upper()

    if chosen == "REDIS":
        backend_name = "dogpile.cache.redis"
        backend_args = {
            "host": environ["FLOWAUTH_REDIS_HOST"],
            "port": int(getenv("FLOWAUTH_REDIS_PORT", "6379")),
            "db": int(getenv("FLOWAUTH_REDIS_DB", "0")),
            "redis_expiration_time": 32,
            "distributed_lock": True,
            "password": getenv("FLOWAUTH_REDIS_PASSWORD", None),
        }
    elif chosen == "FILE":
        backend_name = "dogpile.cache.dbm"
        backend_args = {"filename": environ["FLOWAUTH_CACHE_FILE"]}
    else:
        backend_name = "dogpile.cache.memory"
        backend_args = {}

    return make_region().configure(
        backend=backend_name, expiration_time=30, arguments=backend_args
    )
Example n. 3
0
def get_config():
    """
    Build the FlowAPI configuration dict from environment variables
    (or docker secrets surfaced through them).

    Returns
    -------
    dict
        Configuration mapping for the FlowAPI app.

    Raises
    ------
    UndefinedConfigOption
        If any required environment variable / docker secret is missing.
    """
    try:
        jwt_public_key = load_public_key(environ["PUBLIC_JWT_SIGNING_KEY"])

        # Use getattr with a default rather than logging.getLevelName:
        # getLevelName returns the *string* "Level <NAME>" for unknown
        # names, which would silently propagate a bad level downstream.
        # Falling back to ERROR matches the FlowAuth config's behaviour.
        log_level = getattr(logging,
                            getenv("FLOWAPI_LOG_LEVEL", "error").upper(),
                            logging.ERROR)

        flowmachine_host = environ["FLOWMACHINE_HOST"]
        flowmachine_port = environ["FLOWMACHINE_PORT"]

        flowdb_user = environ["FLOWAPI_FLOWDB_USER"]
        flowdb_password = environ["FLOWAPI_FLOWDB_PASSWORD"]
        flowdb_host = environ["FLOWDB_HOST"]
        flowdb_port = environ["FLOWDB_PORT"]
        flowapi_server_id = environ["FLOWAPI_IDENTIFIER"]
    except KeyError as e:
        raise UndefinedConfigOption(
            f"Undefined configuration option: '{e.args[0]}'. Please set docker secret or environment variable."
        )

    return dict(
        JWT_PUBLIC_KEY=jwt_public_key,
        JWT_ALGORITHM="RS256",
        FLOWAPI_LOG_LEVEL=log_level,
        FLOWMACHINE_HOST=flowmachine_host,
        FLOWMACHINE_PORT=flowmachine_port,
        FLOWDB_DSN=
        f"postgres://{flowdb_user}:{flowdb_password}@{flowdb_host}:{flowdb_port}/flowdb",
        JWT_DECODE_AUDIENCE=flowapi_server_id,
    )
Example n. 4
0
def main(run_on_schedule: bool = True):
    """
    Entry point: set up logging, create the output directories, initialise
    the database, parse the workflows definition file to build workflows and
    configure the available dates sensor, then run the sensor.

    Parameters
    ----------
    run_on_schedule : bool, default True
        Set run_on_schedule=False to run the sensor only once, ignoring the schedule.
        (useful for testing)
    """
    # Initialise logger
    # TODO: Use structlog (not sure whether it will be possible for the prefect logger)
    level = os.environ["AUTOFLOW_LOG_LEVEL"]
    logger = logging.getLogger(__name__)
    stream_handler = logging.StreamHandler()
    fmt = logging.Formatter(
        "[%(asctime)s] %(levelname)s - %(name)s | %(message)s"
    )  # Match prefect format for now
    fmt.converter = time.gmtime  # log timestamps in UTC
    stream_handler.setFormatter(fmt)
    logger.addHandler(stream_handler)
    logger.setLevel(level)
    logger.info(f"Log level for logger '{__name__}' set to '{level}'.")

    # Make output directories
    outputs_path = Path(os.environ["AUTOFLOW_OUTPUTS_DIR"])
    logger.info(
        f"Creating output directories '{outputs_path/'notebooks'}' and '{outputs_path/'reports'}'."
    )
    for subdir in ("notebooks", "reports"):
        (outputs_path / subdir).mkdir(exist_ok=True)

    # Init DB
    # Note: AUTOFLOW_DB_URI must be an env var so that it can be used in prefect.config, so we read it using os.environ.
    # AUTOFLOW_DB_PASSWORD can (and should) be a docker secret, so we read it using get_secret_or_env_var.
    db_uri = os.environ["AUTOFLOW_DB_URI"]
    logger.info(f"Initialising database '{db_uri}'.")
    init_db(db_uri.format(getenv("AUTOFLOW_DB_PASSWORD", "")))

    # Create workflows according to workflow definition file
    inputs_dir = os.environ["AUTOFLOW_INPUTS_DIR"]
    logger.info(f"Creating workflows defined in '{Path(inputs_dir)/'workflows.yml'}'.")
    workflow_storage, sensor_config = parse_workflows_yaml("workflows.yml", inputs_dir)

    # Run available dates sensor
    logger.info("Running available dates sensor.")
    available_dates_sensor.schedule = sensor_config["schedule"]
    available_dates_sensor.run(
        workflow_configs=sensor_config["workflows"],
        cdr_types=sensor_config["cdr_types"],
        workflow_storage=workflow_storage,
        run_on_schedule=run_on_schedule,
    )
Example n. 5
0
def get_session(db_uri: str) -> "sqlalchemy.orm.session.Session":
    """
    Create a sqlalchemy session.

    Parameters
    ----------
    db_uri : str
        Database URI; may contain a '{}' placeholder for the password.

    Returns
    -------
    Session
        A sqlalchemy session
    """
    # TODO: This seems like the wrong place to be reading a secret / env var,
    # but we can't put a docker secret in the prefect config.
    password = getenv("AUTOFLOW_DB_PASSWORD", "")
    session_factory = sessionmaker(bind=create_engine(db_uri.format(password)))
    return session_factory()
Example n. 6
0
def get_available_dates(
    cdr_types: Optional[Sequence[str]] = None,
) -> List[pendulum.Date]:
    """
    Task to return a union of the dates for which data is available in FlowDB for the specified set of CDR types.

    Parameters
    ----------
    cdr_types : list of str, optional
        Subset of CDR types for which to find available dates.
        If not provided, the union of available dates for all CDR types will be returned.

    Returns
    -------
    list of pendulum.Date
        List of available dates, in chronological order
    """
    prefect.context.logger.info(
        f"Getting available dates from FlowAPI at '{prefect.config.flowapi_url}'."
    )
    conn = flowclient.connect(
        url=prefect.config.flowapi_url,
        token=environ["FLOWAPI_TOKEN"],
        ssl_certificate=getenv("SSL_CERTIFICATE_FILE"),
    )
    dates = flowclient.get_available_dates(connection=conn)
    prefect.context.logger.debug(f"Available dates: {dates}")
    if cdr_types is None:
        prefect.context.logger.debug(
            "No CDR types provided. Will return available dates for all CDR types."
        )
        cdr_types = dates.keys()
    else:
        prefect.context.logger.debug(
            f"Returning available dates for CDR types {cdr_types}.")
        unknown_cdr_types = set(cdr_types).difference(dates.keys())
        if unknown_cdr_types:
            warnings.warn(
                f"No data available for CDR types {unknown_cdr_types}.")
    # Start the union from an empty set: the original `set.union(*[...])`
    # raises TypeError when no requested CDR type is present in `dates`
    # (unbound set.union needs at least one argument); this form returns
    # an empty list instead.
    dates_union = set().union(*(
        {pendulum.parse(date, exact=True) for date in dates[cdr_type]}
        for cdr_type in cdr_types if cdr_type in dates
    ))
    # sorted() already returns a list; no intermediate list() needed.
    return sorted(dates_union)
Example n. 7
0
def _do_connect(
    *,
    log_level: Optional[str] = None,
    flowdb_port: Optional[int] = None,
    flowdb_user: Optional[str] = None,
    flowdb_password: Optional[str] = None,
    flowdb_host: Optional[str] = None,
    flowdb_connection_pool_size: Optional[int] = None,
    flowdb_connection_pool_overflow: Optional[int] = None,
    redis_host: Optional[str] = None,
    redis_port: Optional[int] = None,
    redis_password: Optional[str] = None,
    conn: Optional[Connection] = None,
) -> Tuple[Connection, ThreadPoolExecutor, StrictRedis]:
    """
    Connect flowmachine to flowdb and redis and perform initial set-up.

    Parameters
    ----------
    log_level : str, default "error"
        Level to log at
    flowdb_port : int, default 9000
        Port number to connect to flowdb
    flowdb_user : str, default "flowmachine"
        Name of user to connect to flowdb as
    flowdb_password : str
        Password to connect to flowdb
    flowdb_host : str, default "localhost"
        Hostname of flowdb server
    flowdb_connection_pool_size : int, default 5
        Default number of database connections to use
    flowdb_connection_pool_overflow : int, default 1
        Number of extra database connections to allow
    redis_host : str, default "localhost"
        Hostname for redis server.
    redis_port : int, default 6379
        Port the redis server is available on
    redis_password : str
        Password for the redis instance
    conn : flowmachine.core.Connection
        Optionally provide an existing Connection object to use, overriding any the db options specified here.

    Returns
    -------
    Connection

    Raises
    ------
    ValueError
        If a required password is neither passed as a parameter nor
        available as a secret / environment variable.

    Notes
    -----
    Every parameter may instead come from an environment variable (or a
    docker secret at /run/secrets/THE_PARAM, which takes precedence over
    the environment). An explicitly passed argument always wins over both.
    """
    try:
        # Explicit arguments win; otherwise fall back to the environment
        # (the two passwords are mandatory and raise KeyError if absent).
        if log_level is None:
            log_level = getenv("FLOWMACHINE_LOG_LEVEL", "error")
        if flowdb_port is None:
            flowdb_port = getenv("FLOWDB_PORT", "9000")
        flowdb_port = int(flowdb_port)
        if flowdb_user is None:
            flowdb_user = getenv("FLOWMACHINE_FLOWDB_USER", "flowmachine")

        if flowdb_password is None:
            flowdb_password = environ["FLOWMACHINE_FLOWDB_PASSWORD"]
        if flowdb_host is None:
            flowdb_host = getenv("FLOWDB_HOST", "localhost")
        if flowdb_connection_pool_size is None:
            # int() is applied to the environment value only; an explicitly
            # passed pool size is used as-is (matches original behaviour).
            flowdb_connection_pool_size = int(
                getenv("DB_CONNECTION_POOL_SIZE", "5"))
        if flowdb_connection_pool_overflow is None:
            flowdb_connection_pool_overflow = getenv(
                "DB_CONNECTION_POOL_OVERFLOW", "1")
        flowdb_connection_pool_overflow = int(flowdb_connection_pool_overflow)

        if redis_host is None:
            redis_host = getenv("REDIS_HOST", "localhost")
        if redis_port is None:
            redis_port = getenv("REDIS_PORT", "6379")
        redis_port = int(redis_port)
        if redis_password is None:
            redis_password = environ["REDIS_PASSWORD"]
    except KeyError as e:
        raise ValueError(
            f"You must provide a secret named {e.args[0]}, set an environment variable named {e.args[0]}, or provide the value as a parameter."
        )

    set_log_level("flowmachine.debug", log_level)

    if conn is None:
        conn = Connection(
            host=flowdb_host,
            port=flowdb_port,
            user=flowdb_user,
            password=flowdb_password,
            database="flowdb",
            pool_size=flowdb_connection_pool_size,
            overflow=flowdb_connection_pool_overflow,
        )

    redis_connection = redis.StrictRedis(
        host=redis_host, port=redis_port, password=redis_password
    )
    thread_pool = ThreadPoolExecutor(flowdb_connection_pool_size)
    # Evaluated for its side effect only (result discarded) — presumably
    # triggers the initial available-dates query; confirm against Connection.
    conn.available_dates

    print(f"FlowMachine version: {flowmachine.__version__}")

    print(
        f"Flowdb running on: {flowdb_host}:{flowdb_port}/flowdb (connecting user: {flowdb_user})"
    )
    return conn, thread_pool, redis_connection