Example #1
def get_data_from_sql(partition: table_batch, index: str,
                      secrets: Dict[str, str]) -> Union[table_data, None]:
    stmt = f"""SELECT *
FROM "{secrets['database_schema']}"."{partition.table_name}" 
ORDER BY "{index}" ASC
OFFSET %(offset)s
LIMIT %(limit)s"""
    get_logger().debug(stmt)
    con = get_rds_engine(secrets=secrets)
    df = pd.read_sql(sql=stmt,
                     con=con,
                     params=dict(offset=partition.offset,
                                 limit=partition.limit))

    if len(df) == 0:
        get_logger().warning(
            f'Unable to locate any records in table_name: {partition.table_name} OFFSET: {partition.offset} LIMIT: {partition.limit}'
        )
        con.dispose()
        return None

    assert len(df.columns) == len(
        partition.data_types
    ), f'Mismatched columns in table_name: {partition.table_name}'

    # ensure we're using the appropriate data types for parquet
    # this is especially important for columns which allow NULL in Postgres and NaN in pandas
    for c in df.columns:
        assert c in partition.data_types, f"Missing data_type for query: {c}"
        # Since pandas represents timestamps in nanosecond resolution
        # the timespan that can be represented using a 64-bit integer is limited to approximately 584 years
        # ref: http://pandas-docs.github.io/pandas-docs-travis/user_guide/timeseries.html#timeseries-timestamp-limits
        if partition.data_types[c] == 'datetime64[ms]' and df[c].notna().any():
            # coerce values outside the representable min/max range to NaT
            df[c] = pd.to_datetime(
                arg=df[c],
                errors='coerce',
                unit='ms',
                origin='unix',
            )
        else:
            df[c] = df[c].astype(partition.data_types[c])

    # create the placeholders for partitioning the parquet file later on
    assert 'created_at' in df.columns, f'Missing column "created_at" in table_name: {partition.table_name}'
    df['year'] = df['created_at'].dt.year.astype(np.int16)
    df['month'] = df['created_at'].dt.month.astype(np.int8)
    df['day'] = df['created_at'].dt.day.astype(np.int8)

    ntf = NamedTemporaryFile(prefix=f"{partition.table_name}_",
                             suffix='.parquet',
                             delete=False)
    filename = Path(ntf.name)
    df.to_parquet(path=filename,
                  engine='pyarrow',
                  compression='snappy',
                  index=False)

    con.dispose()
    return table_data(partition.table_name, filename)
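
The errors='coerce' conversion above matters because pandas stores timestamps at nanosecond resolution, which limits the representable range to roughly the years 1677-2262; anything outside that window cannot live in a datetime64[ns] column. A minimal standalone sketch (not part of the original module) of that coercion:

import pandas as pd

s = pd.Series(['2021-01-01', '9999-12-31'])  # second value is out of bounds for datetime64[ns]
print(pd.to_datetime(s, errors='coerce'))
# 0   2021-01-01
# 1          NaT
# dtype: datetime64[ns]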
Example #2
def flatten_nested_list(
    nested_list: List[List[Union[table_batch, table_data]]],
    max_concurrent_connections: int = 0
) -> List[Union[table_batch, table_data]]:
    results = [item for sublist in nested_list for item in sublist]
    get_logger().info(
        f"Discovered: {len(results)} total records from nested_list: {len(nested_list)}"
    )
    return results
Example #3
def prepare_table_data_for_parquet_directory(
        grouped_table_data: List[table_data],
        first_index: int,
        num_of_records_in_batch: int,
        destination_directory: str = None) -> Union[List[table_data], None]:
    total_records = 0
    if len(grouped_table_data) == 0:
        get_logger().warning('Unable to locate any data to insert...')
        return None

    table_name = grouped_table_data[-1].table_name
    total_partitions = len(grouped_table_data)
    directory = Path(destination_directory or mkdtemp(suffix=f"_{table_name}"))

    # if this is an incremental load, we should download the _metadata files from s3 locally
    # such that the appends will update that correctly
    # if first_index != 0:
    #     s3 = get_s3_connection(secrets=secrets)
    #     for filename in ['_metadata', '_common_metadata']:
    #         # _metadata must always exist locally
    #         path = Path(secrets['s3_bucket'] + s3.sep + table_name + s3.sep + filename)
    #         s3.get(rpath=path.as_posix(), lpath=(directory / filename).as_posix())

    for i, data in enumerate(grouped_table_data):
        if data is None:
            get_logger().debug(
                "Unable to locate a filename_list object to attempt an insert")
            continue

        # the first batch must re-create the parquet dataset (first_index == 0 and i == 0);
        # every other batch appends to that dataset one-at-a-time
        recreate_metadata = (first_index == 0) and (i == 0)
        size = pandas_to_local_parquet(
            directory=directory,
            data=data,
            num_of_records_in_batch=num_of_records_in_batch,
            append=not recreate_metadata,
            idx=f"{i+1:05}/{total_partitions:05}")
        if size == 0:
            get_logger().warning(
                f"Unable to upload contents of file: {data.filename}")

        total_records += size

    get_logger().info(
        f"Prepared: {total_records} from table_name: {table_name}")

    # return list of all files under this directory
    results = [
        table_data(table_name, _) for _ in directory.glob('**/*')
        if _.is_file()
    ]
    # assert sorted(results)[0].filename.as_posix().endswith('_metadata'), 'Missing _metadata file in directory!'
    get_logger().info(
        f"Discovered {len(results)} parquet files to upload for {table_name}")
    return results
Example #4
def purge_transient_folders(filename_list: List[List[table_data]]) -> bool:
    data = filename_list[-1]
    if data is None:
        get_logger().warning("Unable to locate any folders to purge")
        return False

    directory = get_parent_folder_name(data=data)
    get_logger().info(f"Purging directory: {directory}")
    rmtree(directory)
    return True
Example #5
def pandas_to_local_parquet(directory: os.PathLike,
                            data: table_data,
                            num_of_records_in_batch: int,
                            append: bool = True,
                            idx: str = '') -> int:
    df = pd.read_parquet(path=data.filename)
    if len(df) == 0:
        get_logger().warning(
            f"Unable to locate any filename_list in file: {data.filename}")
        return 0

    partition_cols = ['year', 'month', 'day']
    for p in partition_cols:
        assert p in df.columns, f'Missing column "{p}", cannot continue with S3 upload'

    get_logger().info(
        f"[{idx}] Attempting to prepare {len(df)} records, "
        f"{data.filename.stat().st_size / (1024 * 1024):.2f} MB from file: {data.filename}"
    )
    # df.to_parquet(
    #     # path=f"s3://{path}",
    #     path=directory,
    #     engine='pyarrow',
    #     compression='snappy',
    #     partition_cols=partition_cols,
    #     index=False,
    #     allow_truncated_timestamps=True,
    #     # flavor='spark',
    #     # filesystem=s3
    # )

    # use Dask to write the _metadata and _common_metadata files
    # write to local disk first, then use aws cli to sync the filename to s3
    # TODO: ref https://github.com/dask/dask/issues/6867
    dd.from_pandas(data=df, chunksize=num_of_records_in_batch).to_parquet(
        # path=f"s3://{path}",
        path=directory,
        append=append,
        engine='pyarrow',
        compression='snappy',
        partition_on=partition_cols,
        ignore_divisions=True,
        # storage_options=dict(
        #     anon=False,
        #     key=secrets['s3_access_key'],
        #     secret=secrets['s3_secret_key'],
        #     use_ssl=True,
        #     client_kwargs=dict(
        #         endpoint_url=secrets['s3_server'],
        #     )
        # ),
        write_index=False)
    return len(df)
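
For reference, a self-contained sketch (assuming dask[dataframe] and pyarrow are installed; the toy frame and output directory are illustrative only) of what partition_on does: each row lands in a year=/month=/day= sub-directory, which is the layout the upload and purge steps rely on.

import pandas as pd
import dask.dataframe as dd
from tempfile import mkdtemp

toy = pd.DataFrame({
    'value': [1, 2, 3],
    'year': [2021, 2021, 2022],
    'month': [1, 2, 1],
    'day': [10, 11, 12],
})
out_dir = mkdtemp()
dd.from_pandas(toy, npartitions=1).to_parquet(
    path=out_dir,
    engine='pyarrow',
    compression='snappy',
    partition_on=['year', 'month', 'day'],
    write_index=False)
# out_dir now contains parquet files under e.g. year=2021/month=1/day=10/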
Example #6
def get_table_data_types(
    table: str,
    secrets: Dict[str, str],
) -> Dict[str, Any]:
    # ensure pandas uses the same data types as SQL
    stmt = """SELECT 
    column_name,
    is_nullable,
    data_type,
    udt_name
FROM information_schema.columns
WHERE 
    table_catalog = %(database)s
    AND table_schema = %(schema)s
    AND table_name = %(table_name)s
ORDER BY ordinal_position ASC"""
    get_logger().debug(stmt)
    con = get_rds_engine(secrets=secrets)
    dt = pd.read_sql(sql=stmt,
                     con=con,
                     params=dict(database=secrets['database_name'],
                                 schema=secrets['database_schema'],
                                 table_name=table))
    dt['is_nullable'] = dt['is_nullable'].apply(lambda _: _ == 'YES')
    get_logger().info(f"Discovered {len(dt)} columns for table_name: {table}")

    mapper = dict()
    # nullable integer columns must be mapped to a dtype that can hold NaN (i.e. floats)
    for idx, row in dt.iterrows():
        if row.data_type == 'bigint':
            mapper[row.column_name] = np.float64 if row.is_nullable else np.int64
        elif row.data_type == 'integer':
            mapper[row.column_name] = np.float32 if row.is_nullable else np.int32
        elif row.data_type == 'smallint':
            mapper[row.column_name] = np.float32 if row.is_nullable else np.int16
        elif row.data_type == 'boolean':
            mapper[row.column_name] = 'bool'
        elif row.data_type in ['double precision', 'numeric']:
            mapper[row.column_name] = np.float64
        elif row.udt_name in ['timestamp', 'date']:
            mapper[row.column_name] = 'datetime64[ms]'
        elif row.udt_name in ['varchar', 'text', 'json', 'jsonb']:
            mapper[row.column_name] = str
        else:
            raise RuntimeError(f'Unknown data_type: {row.data_type}')

    con.dispose()
    return mapper
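
A short standalone sketch (not from the original source) of the NaN problem this mapper works around: a NumPy integer column cannot represent NULL, so nullable integer columns have to be read back as floats instead.

import numpy as np
import pandas as pd

s = pd.Series([1, 2, None])
print(s.dtype)  # float64 -- the NULL forced an upcast to float
try:
    s.astype(np.int64)  # casting NaN to an integer dtype is not allowed
except ValueError as err:
    print(f'Cannot cast to int64: {err}')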
Example #7
def wait_on_visible(driver: RemoteWebDriver, xpath: str, timeout: int = 60):
    try:
        resolved = WebDriverWait(driver, timeout=timeout).until(
            EC.visibility_of_element_located((By.XPATH, xpath))
        )
        return resolved
    except (TimeoutException, ) as ex:
        get_logger().error(f'URL: {driver.current_url} unable to locate XPATH: {xpath} in timeout: {timeout}')
        raise ex
    except (InvalidSelectorException, ) as ex:
        raise ex
    except (NoSuchElementException, ElementNotVisibleException, InvalidElementStateException, ) as ex:
        get_logger().error(f'URL: {driver.current_url} unable to locate XPATH: {xpath}')
        raise ex
Example #8
def sync_with_s3(data: table_data, secrets: Dict[str, str]) -> Path:
    s3 = get_s3_connection(secrets=secrets)
    directory = get_parent_folder_name(data=data)
    destination = data.filename.as_posix().split(directory.as_posix())[-1]
    assert len(
        destination
    ) > 0, f'Unable to determine destination from directory: {directory} filename: {data.filename}'
    path = Path(secrets['s3_bucket'] + s3.sep + data.table_name + s3.sep +
                destination).as_posix()

    get_logger().debug(
        f"Upload {data.filename.stat().st_size/(1024*1024)} MB from file: {data.filename} to {path}"
    )
    s3.put(lpath=data.filename.as_posix(), rpath=path)
    return data.filename
Example #9
def click_on_xpath(driver: RemoteWebDriver, xpath: str, timeout: int = 60):
    time.sleep(random.uniform(0.5, 1.))
    try:
        resolved = WebDriverWait(driver, timeout=timeout).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )
        resolved.click()
        return resolved
    except (TimeoutException, ) as ex:
        get_logger().error(f'Unable to locate element: {xpath} within {timeout} seconds')
        raise ex
    except (InvalidSelectorException, ) as ex:
        raise ex
    except (NoSuchElementException, ElementNotVisibleException, InvalidElementStateException, ) as ex:
        raise ex
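
A hypothetical usage sketch for the two Selenium helpers above (assumes a local chromedriver is available; the URL and XPath are illustrative only):

from selenium import webdriver

driver = webdriver.Chrome()
try:
    driver.get('https://example.com')
    # block until the element is rendered, then click it
    wait_on_visible(driver, xpath="//a[@id='more-info']", timeout=30)
    click_on_xpath(driver, xpath="//a[@id='more-info']", timeout=30)
finally:
    driver.quit()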
Example #10
 def __init__(self, state_handlers: Iterable[Callable] = None):
     if state_handlers is not None and not isinstance(
         state_handlers, collections.abc.Sequence
     ):
         raise TypeError("state_handlers should be iterable.")
     self.state_handlers = state_handlers or []
     self.logger = logging.get_logger(type(self).__name__)
Example #11
def identify_s3_files_to_purge(
        table_name: str, first_index: int,
        secrets: Dict[str, str]) -> Union[List[str], None]:
    s3 = get_s3_connection(secrets=secrets)
    path = Path(secrets['s3_bucket'] + s3.sep + table_name).as_posix()

    if first_index > 0:
        get_logger().warning(
            f"starting_index: {first_index} implies we don't want to purge any existing data from s3 for: {path}"
        )
        return []

    # results = s3.glob(path=path + s3.sep + '**' + s3.sep + '*metadata')
    results = s3.glob(path=path + s3.sep + '**' + s3.sep + '*.parquet')
    get_logger().info(f"Discovered {len(results)} files to purge from {path}")
    return results
Example #12
def test_temporary_config_sets_and_resets(caplog):
    with temporary_logger_config(
            level=logging.CRITICAL,
            stream_fmt="%(message)s",
            stream_datefmt="%H:%M:%S",
    ):
        logger = get_logger()
        assert logger.level == logging.CRITICAL
        for handler in logger.handlers:
            if isinstance(handler, logging.StreamHandler):
                assert handler.formatter._fmt == "%(message)s"
                assert handler.formatter.datefmt == "%H:%M:%S"
        logger.info("Info log not shown")
        logger.critical("Critical log shown")

    logger.info("Info log shown")
    for handler in logger.handlers:
        handler.flush()

    output = caplog.text
    assert "Info log not shown" not in output
    assert "Critical log shown" in output
    assert "Info log shown" in output

    assert logger.level == logging.DEBUG
    for handler in logger.handlers:
        if isinstance(handler, logging.StreamHandler):
            assert handler.formatter._fmt != "%(message)s"
            assert handler.formatter.datefmt != "%H:%M:%S"
Example #13
def test_temporary_config_resets_on_exception(caplog):
    with pytest.raises(ValueError):
        with temporary_logger_config(level=logging.CRITICAL):
            raise ValueError()

    logger = get_logger()
    assert logger.level == logging.DEBUG
Example #14
 def __init__(self,
              private_registry: bool = False,
              docker_secret: str = None) -> None:
     self.identifier_label = str(uuid.uuid4())
     self.private_registry = private_registry
     self.docker_secret = docker_secret or "DOCKER_REGISTRY_CREDENTIALS"
     self.logger = logging.get_logger("CloudEnvironment")
Example #15
    def ensure_oauth_access_token(self):
        """Retrieves OAuth 2.0 access token using the client credentials grant and stores it in the request session."""
        logger = get_logger()
        now = datetime.utcnow()
        if self._expires_at is None or now >= self._expires_at:
            logger.info('Token is expired or missing, requesting a new one.')

            data = {
                'grant_type': 'client_credentials',
                'client_id': self.client_id,
                'client_secret': self.client_secret,
                'token_type': self.token_type,
            }

            response = requests.post(
                self.auth_url,
                data=data,
                hooks={
                    'response': log_response_hook
                }
            )
            data = response.json()
            self._session.auth = SuppliedAuth(data['access_token'], data.get('token_type', self.token_type))
            self._expires_at = now + timedelta(seconds=data['expires_in'])
            logger.info("Acquired a token that expires at {}".format(self._expires_at.isoformat()))
Example #16
def get_s3_connection(secrets: Dict[str, str]) -> S3FileSystem:
    # ref: https://s3fs.readthedocs.io/en/latest/#credentials
    conn = S3FileSystem(anon=False,
                        key=secrets['s3_access_key'],
                        secret=secrets['s3_secret_key'],
                        use_ssl=True,
                        client_kwargs=dict(
                            endpoint_url=secrets['s3_server'], ))

    # verify the bucket exists
    if not conn.exists(secrets['s3_bucket']):
        get_logger().warning(
            f"Unable to lcoate bucket, will attempt to create it now: {secrets['s3_bucket']}"
        )
        conn.mkdirs(secrets['s3_bucket'])

    return conn
Example #17
def log_response_hook(response, *args, **kwargs):  # pylint: disable=unused-argument
    """Log summary information about every request made."""
    logger = get_logger()
    logger.info(
        "[{}] [{}] [{}] {}".format(
            response.request.method, response.status_code, response.elapsed.total_seconds(), response.url
        )
    )
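
A small usage sketch (not part of the original module): requests accepts response hooks per call or per session, so every request made through the session below gets logged by log_response_hook. The URL is illustrative only.

import requests

session = requests.Session()
session.hooks['response'].append(log_response_hook)
session.get('https://httpbin.org/get')  # emits e.g. "[GET] [200] [0.123456] https://httpbin.org/get"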
Example #18
    def __init__(
        self,
        name: str,
        schedule: prefect.schedules.Schedule = None,
        environment: Environment = None,
        storage: Storage = None,
        tasks: Iterable[Task] = None,
        edges: Iterable[Edge] = None,
        reference_tasks: Iterable[Task] = None,
        state_handlers: List[Callable] = None,
        on_failure: Callable = None,
        validate: bool = None,
        result_handler: ResultHandler = None,
    ):
        self._cache = {}  # type: dict

        if not name:
            raise ValueError("A name must be provided for the flow.")

        self.name = name
        self.logger = logging.get_logger("Flow: {}".format(self.name))
        self.schedule = schedule
        self.environment = environment or prefect.environments.RemoteEnvironment(
        )
        self.storage = storage
        self.result_handler = (
            result_handler
            or prefect.engine.get_default_result_handler_class()())

        self.tasks = set()  # type: Set[Task]
        self.edges = set()  # type: Set[Edge]
        self.constants = collections.defaultdict(
            dict)  # type: Dict[Task, Dict[str, Any]]

        for t in tasks or []:
            self.add_task(t)

        self.set_reference_tasks(reference_tasks or [])
        for e in edges or []:
            self.add_edge(
                upstream_task=e.upstream_task,
                downstream_task=e.downstream_task,
                key=e.key,
                mapped=e.mapped,
                validate=validate,
            )

        self._prefect_version = prefect.__version__

        if state_handlers and not isinstance(state_handlers,
                                             collections.abc.Sequence):
            raise TypeError("state_handlers should be iterable.")
        self.state_handlers = state_handlers or []
        if on_failure is not None:
            self.state_handlers.append(
                callback_factory(on_failure, check=lambda s: s.is_failed()))

        super().__init__()
Example #19
 def __init__(self,
              value: Any = None,
              bucket: str = None,
              credentials_secret: str = None,
              **kwargs: Any) -> None:
     self.bucket = bucket
     self.credentials_secret = credentials_secret
     self.logger = logging.get_logger(type(self).__name__)
     super().__init__(value, **kwargs)
Example #20
def task_filter_links(
        links: T.Union[T.List[str], Result],
        gaming_platform: T.Union[str, Parameter],
        tbl: T.Union[sa.Table, Result]) -> T.Union[T.List[str], Result]:
    """
    Remove any links which we have 'recently' scraped
    """
    stmt = sa.select([tbl.c.source_url]).where(
        sa.and_(
            tbl.c.platform == gaming_platform,
            tbl.c.source_url.in_(links),
            # tbl.c.scraped_on > datetime.datetime.utcnow() - datetime.timedelta(days=1)
        ))
    rp = tbl.bind.execute(stmt)
    results = {row[0] for row in rp.fetchall()}
    output = list(set(links).difference(results))
    get_logger().info(f'Discovered {len(output)} links to parse')
    return output
Example #21
def default_logger():
    handler = logging.StreamHandler()
    handler.setFormatter(DatadogFormatter())

    logger = get_logger()

    logger.addHandler(handler)
    logger.info(f'Beginning Flow run for \'{prefect.context.flow_name}\'')
    logger.info(f'Task \'{prefect.context.task_name}\': Starting task run...')
Example #22
 def __init__(
     self,
     api_key_id: str,
     api_token: str,
 ) -> None:
     self.api_key_id = api_key_id
     self.api_token = api_token
     self.logger = get_logger()
     self._api_url = "https://api.getmontecarlo.com/graphql"
Example #23
 def __init__(
     self,
     labels: Iterable[str] = None,
     on_start: Callable = None,
     on_exit: Callable = None,
 ) -> None:
     self.labels = set(labels) if labels else set()
     self.on_start = on_start
     self.on_exit = on_exit
     self.logger = logging.get_logger(type(self).__name__)
Example #24
def load_and_run_flow() -> None:
    """
    Loads a flow (and the corresponding environment), then runs the flow with
    the environment.

    This is useful for environments whose `execute` method schedules a job that
    later needs to run the flow.

    Raises:
        - ValueError: if no `flow_run_id` is found in context
    """
    logger = logging.get_logger("Environment")
    try:
        flow_run_id = prefect.context.get("flow_run_id")

        if not flow_run_id:
            raise ValueError("No flow run ID found in context.")

        query = {
            "query": {
                with_args("flow_run", {"where": {
                    "id": {
                        "_eq": flow_run_id
                    }
                }}): {
                    "flow": {
                        "name": True,
                        "storage": True
                    },
                }
            }
        }

        client = Client()
        result = client.graphql(query)
        flow_run = result.data.flow_run[0]

        flow_data = flow_run.flow
        storage_schema = prefect.serialization.storage.StorageSchema()
        storage = storage_schema.load(flow_data.storage)

        # populate global secrets
        secrets = prefect.context.get("secrets", {})
        for secret in storage.secrets:
            secrets[secret] = prefect.tasks.secrets.PrefectSecret(
                name=secret).run()

        with prefect.context(secrets=secrets):
            flow = storage.get_flow(flow_data.name)
            flow.environment.run(flow)
    except Exception as exc:
        logger.exception(
            "Unexpected error raised during flow run: {}".format(exc))
        raise exc
Example #25
def create_data_partitions(table_name: str, first_index: int, last_index: int,
                           secrets: Dict[str, str],
                           num_of_records_in_batch: int) -> List[table_batch]:
    """Create partition for table_name based on number of days in the date range"""
    dt = get_table_data_types(table=table_name, secrets=secrets)

    stmt = f"""SELECT reltuples::BIGINT AS estimate 
FROM pg_class 
WHERE relname=%(table_name)s 
ORDER BY reltuples DESC
LIMIT 1"""
    get_logger().debug(stmt)
    con = get_rds_engine(secrets=secrets)
    df = pd.read_sql(sql=stmt, con=con, params=dict(table_name=table_name))
    row_estimate = df.iloc[0]['estimate'] + num_of_records_in_batch
    get_logger().info(
        f"Row estimate for table_name: {table_name} {row_estimate}")
    rows_to_pull = last_index if 0 < last_index < row_estimate else row_estimate
    assert first_index < row_estimate, f'first_index: {first_index} must be less than row_estimate: {row_estimate}'
    batch_size = last_index - first_index if 0 < last_index < num_of_records_in_batch else num_of_records_in_batch
    # TODO: validate batch size

    directory = Path(mkdtemp(suffix=f"_{table_name}"))
    table_partitions = [
        table_batch(table_name, dt, i, batch_size, directory)
        for i in range(first_index, rows_to_pull, batch_size)
    ]

    get_logger().info(
        f"Created {len(table_partitions)} partitions from table_name: {table_name}"
    )
    con.dispose()
    return table_partitions
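
Illustrative-only arithmetic for the list comprehension above: with first_index=0, rows_to_pull of 10,000 and a batch_size of 2,500, each table_batch is created with one of the OFFSET values below and a LIMIT of batch_size.

offsets = list(range(0, 10_000, 2_500))
print(offsets)  # [0, 2500, 5000, 7500] -> four table_batch partitions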
Example #26
def run_with_thread_timeout(
    fn: Callable,
    args: Sequence = (),
    kwargs: Mapping = None,
    timeout: int = None,
    logger: Logger = None,
    name: str = None,
) -> Any:
    """
    Helper function for implementing timeouts on function executions.
    Implemented by setting a `signal` alarm on a timer. Must be run in the main thread.

    Args:
        - fn (callable): the function to execute
        - args (Sequence): arguments to pass to the function
        - kwargs (Mapping): keyword arguments to pass to the function
        - timeout (int): the length of time to allow for execution before raising a
            `TimeoutError`, represented as an integer in seconds
        - logger (Logger): an optional logger to use. If not passed, a logger for the
            `prefect.executors.run_with_thread_timeout` namespace will be created.
        - name (str): an optional name to attach to logs for this function run, defaults
            to the name of the given function. Provides an interface for passing task
            names for logs.

    Returns:
        - the result of `fn(*args, **kwargs)`

    Raises:
        - TimeoutError: if function execution exceeds the allowed timeout
        - ValueError: if run from outside the main thread
    """
    logger = logger or get_logger()
    name = name or f"Function '{fn.__name__}'"
    kwargs = kwargs or {}

    if timeout is None:
        return fn(*args, **kwargs)

    def error_handler(signum, frame):  # type: ignore
        raise TimeoutError("Execution timed out.")

    try:
        # Set the signal handler for alarms
        signal.signal(signal.SIGALRM, error_handler)
        # Raise the alarm if `timeout` seconds pass
        logger.debug(f"{name}: Sending alarm with {timeout}s timeout...")
        signal.alarm(timeout)
        logger.debug(f"{name}: Executing function in main thread...")
        return fn(*args, **kwargs)
    finally:
        signal.alarm(0)
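
A hedged usage sketch for run_with_thread_timeout; per the docstring it must run on the main thread (it relies on SIGALRM), so this assumes a Unix main-thread context and uses time.sleep purely for illustration.

import time

run_with_thread_timeout(time.sleep, args=(1,), timeout=5)  # completes normally
try:
    run_with_thread_timeout(time.sleep, args=(10,), timeout=2)
except TimeoutError:
    print('sleep(10) was interrupted after roughly 2 seconds')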
Example #27
 def run(self) -> None:
     logger = get_logger("threaded_heartbeat")
     client = Client()
     iter_count = 0
     with prefect.context(
         {"flow_run_id": self.flow_run_id, "running_with_backend": True}
     ):
         with log_heartbeat_failure(logger):
             while iter_count < (self.num or 1) and (
                 self.stop_event.is_set() is False
             ):
                 send_heartbeat(self.flow_run_id, client, logger)
                 iter_count += 1 if self.num else 0
                 self.stop_event.wait(timeout=config.cloud.heartbeat_interval)
Example #28
 def __init__(
     self,
     value: Any = None,
     result_handler: ResultHandler = None,
     validators: Iterable[Callable] = None,
     run_validators: bool = True,
     location: str = None,
 ):
     self.value = value
     self.safe_value = NoResult  # type: SafeResult
     self.result_handler = result_handler  # type: ignore
     self.validators = validators
     self.run_validators = run_validators
     self.location = location
     self.logger = logging.get_logger(type(self).__name__)
Example #29
 def __init__(
     self,
     min_workers: int = 1,
     max_workers: int = 2,
     private_registry: bool = False,
     docker_secret: str = None,
 ) -> None:
     self.min_workers = min_workers
     self.max_workers = max_workers
     self.identifier_label = str(uuid.uuid4())
     self.private_registry = private_registry
     if self.private_registry:
         self.docker_secret = docker_secret or "DOCKER_REGISTRY_CREDENTIALS"
     else:
         self.docker_secret = None  # type: ignore
     self.logger = logging.get_logger("CloudEnvironment")
Example #30
def multiprocessing_safe_run_and_retrieve(
    queue: multiprocessing.Queue,
    payload: bytes,
) -> None:
    """
    Gets the return value from a function and puts it in a multiprocessing-safe
    container. Helper function for `run_with_multiprocess_timeout`, must be defined
    top-level so it can be pickled and sent to `multiprocessing.Process`

    Passing the payload serialized allows us to escape the limitations of the python
    native pickler which will fail on tasks defined in scripts because of name
    mismatches. Whilst this particular example only affects the `fn` arg, any of the
    others could be affected by other pickle limitations as well.

    Args:
        - queue (multiprocessing.Queue): The queue to pass the resulting payload to
        - payload (bytes): A serialized dictionary containing the data required to run
            the function. Should be serialized with `cloudpickle.dumps`
            Expects the following keys:
            - fn (Callable): The function to call
            - args (list): Positional argument values to call the function with
            - kwargs (Mapping): Keyword arguments to call the function with
            - context (dict): The prefect context dictionary to use during execution
            - name (str): an optional name to attach to logs for this function run,
                defaults to the name of the given function. Provides an interface for
                passing task names for logs.
            - logger (Logger): the logger to use
    """
    request = cloudpickle.loads(payload)

    fn: Callable = request["fn"]
    context: dict = request.get("context", {})
    args: Sequence = request.get("args", [])
    kwargs: Mapping = request.get("kwargs", {})
    name: str = request.get("name", f"Function '{fn.__name__}'")
    logger: Logger = request.get("logger") or get_logger()

    try:
        with prefect.context(context):
            logger.debug(f"{name}: Executing...")
            return_val = fn(*args, **kwargs)
    except Exception as exc:
        return_val = exc

    logger.debug(f"{name}: Passing result back to main process...")
    queue.put(cloudpickle.dumps(return_val))
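
A hypothetical driver sketch for the helper above (assumes prefect and cloudpickle are installed; slow_add and the payload values are illustrative only). The parent serializes the request with cloudpickle, runs the helper in a child process, and reads the pickled result back off the queue before joining.

import multiprocessing

import cloudpickle


def slow_add(x: int, y: int) -> int:
    return x + y


if __name__ == '__main__':
    queue = multiprocessing.Queue()
    payload = cloudpickle.dumps({
        'fn': slow_add,
        'args': [1, 2],
        'kwargs': {},
        'context': {},
        'name': "Function 'slow_add'",
    })
    proc = multiprocessing.Process(
        target=multiprocessing_safe_run_and_retrieve, args=(queue, payload))
    proc.start()
    result = cloudpickle.loads(queue.get())  # 3, or the exception that was raised
    proc.join()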