def __init__(
    self,
    bucket: str,
    key: str = None,
    stored_as_script: bool = False,
    local_script_path: str = None,
    client_options: dict = None,
    upload_options: dict = None,
    **kwargs: Any,
) -> None:
    self.bucket = bucket
    self.key = key
    self.upload_options = upload_options
    self.local_script_path = local_script_path or prefect.context.get(
        "local_script_path", None
    )
    self.client_options = client_options

    result = S3Result(bucket=bucket, boto3_kwargs=client_options)
    super().__init__(
        result=result,
        stored_as_script=stored_as_script,
        **kwargs,
    )
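The constructor above forwards client_options into the bound S3Result as boto3_kwargs. A minimal usage sketch, assuming Prefect 0.14+ where this storage class lives at prefect.storage.S3; the bucket name and MinIO-style endpoint below are placeholders, not values from the snippet:

# Hedged usage sketch: bucket and endpoint are assumptions, not taken from the code above.
from prefect.storage import S3

storage = S3(
    bucket="my-flow-storage",                              # bucket holding serialized flows
    client_options={"endpoint_url": "http://minio:9000"},  # forwarded to S3Result as boto3_kwargs
)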
def test_forwards_boto3_kwargs(self, mock_boto3):
    result = S3Result(bucket="mybucket", boto3_kwargs={"region_name": "a-region"})
    assert result.client is not None
    assert mock_boto3.client.call_args[1] == {
        "aws_access_key_id": "access_key",
        "aws_session_token": None,
        "aws_secret_access_key": "secret_access_key",
        "region_name": "a-region",
    }
def test_s3_result_does_not_exist(self, mock_boto3):
    import botocore

    exc = botocore.exceptions.ClientError(
        {"Error": {"Code": "NoSuchKey"}}, "list_objects"
    )
    mock_boto3.client.return_value.get_object.side_effect = exc

    result = S3Result(bucket="bob", location="stuff")
    result = result.format()
    assert result.exists("stuff") is False
def test_s3_writes_to_blob_with_rendered_filename(self, mock_boto3):
    result = S3Result(bucket="foo", location="{thing}/here.txt")

    with prefect.context(thing="yes!"):
        new_result = result.write("so-much-data", **prefect.context)

    used_uri = mock_boto3.client.return_value.upload_fileobj.call_args[1]["Key"]

    assert used_uri == new_result.location
    assert new_result.location.startswith("yes!/here.txt")
def test_s3_result_is_pickleable(self, mock_boto3):
    class NoPickle:
        def __getstate__(self):
            raise ValueError("I cannot be pickled.")

    mock_boto3.client.return_value = NoPickle()

    result = S3Result(bucket="foo")
    assert result.client is not None

    res = cloudpickle.loads(cloudpickle.dumps(result))
    assert isinstance(res, S3Result)
def test_s3_client_init_uses_secrets(self, session):
    result = S3Result(bucket="bob", credentials_secret="AWS_CREDENTIALS")
    assert result.bucket == "bob"
    assert session.Session().client.called is False

    with prefect.context(
        secrets=dict(AWS_CREDENTIALS=dict(ACCESS_KEY=1, SECRET_ACCESS_KEY=42))
    ):
        with set_temporary_config({"cloud.use_local_secrets": True}):
            result.initialize_client()

    assert session.Session().client.call_args[1] == {
        "aws_access_key_id": 1,
        "aws_secret_access_key": 42,
    }
def test_s3_client_init_uses_custom_secrets(self, session):
    result = S3Result(bucket="bob", credentials_secret="MY_FOO")

    with prefect.context(
        secrets=dict(MY_FOO=dict(ACCESS_KEY=1, SECRET_ACCESS_KEY=999))
    ):
        with set_temporary_config({"cloud.use_local_secrets": True}):
            result.initialize_client()

    assert result.bucket == "bob"
    assert session.Session().client.call_args[1] == {
        "aws_access_key_id": 1,
        "aws_secret_access_key": 999,
    }
def test_s3_client_init_uses_secrets(self, mock_boto3):
    result = S3Result(bucket="bob")
    assert result.bucket == "bob"
    assert mock_boto3.client.called is False

    assert result.client is not None
    assert mock_boto3.client.call_args[1] == {
        "aws_access_key_id": "access_key",
        "aws_session_token": None,
        "aws_secret_access_key": "secret_access_key",
        "region_name": None,
    }
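The mock_boto3 fixture these tests depend on is not shown in the snippets. A plausible sketch, assuming the fixture patches the boto3 module used by prefect.utilities.aws and supplies the fake credentials seen in the assertions via context secrets (this is an inference, not the project's actual conftest):

# Hypothetical sketch of the missing `mock_boto3` fixture; the patch target and the
# secret values are assumptions inferred from the assertions above.
import pytest
from unittest.mock import MagicMock

import prefect


@pytest.fixture
def mock_boto3(monkeypatch):
    boto3 = MagicMock()
    # S3Result builds its client through prefect.utilities.aws.get_boto_client,
    # which falls back to AWS_CREDENTIALS from context secrets when none are passed.
    monkeypatch.setattr("prefect.utilities.aws.boto3", boto3)
    with prefect.context(
        secrets=dict(
            AWS_CREDENTIALS=dict(
                ACCESS_KEY="access_key", SECRET_ACCESS_KEY="secret_access_key"
            )
        )
    ):
        yield boto3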
def __init__(self, bucket: str, client_options: dict = None, key: str = None, **kwargs: Any) -> None:
    self.flows = dict()  # type: Dict[str, str]
    self._flows = dict()  # type: Dict[str, "Flow"]
    self.bucket = bucket
    self.key = key
    self.client_options = client_options

    result = S3Result(bucket=bucket)
    super().__init__(result=result, **kwargs)
def test_s3_writes_to_blob_with_rendered_filename(self, session):
    result = S3Result(bucket="foo", filepath="{thing}/here.txt")

    with prefect.context(
        secrets=dict(AWS_CREDENTIALS=dict(ACCESS_KEY=1, SECRET_ACCESS_KEY=42)),
        thing="yes!",
    ) as ctx:
        with set_temporary_config({"cloud.use_local_secrets": True}):
            new_result = result.write("so-much-data", **ctx)

    used_uri = session.Session().client.return_value.upload_fileobj.call_args[1]["Key"]

    assert used_uri == new_result.filepath
    assert new_result.filepath.startswith("yes!/here.txt")
def test_s3_result_exists(self, session):
    import botocore

    exc = botocore.exceptions.ClientError(
        {"Error": {"Code": "NoSuchKey"}}, "list_objects"
    )

    class _client:
        def __init__(self, *args, **kwargs):
            pass

        def get_object(self, *args, **kwargs):
            return MagicMock()

    session.Session().client = _client

    result = S3Result(bucket="bob", location="stuff")
    result = result.format()
    assert result.exists("stuff") is True
def test_s3_result_does_not_exist(self, session):
    import botocore

    exc = botocore.exceptions.ClientError(
        {"Error": {"Code": "404"}}, "list_objects"
    )

    class _client:
        def __init__(self, *args, **kwargs):
            pass

        def get_object(self, *args, **kwargs):
            raise exc

    session.Session().client = _client

    result = S3Result(bucket="bob", filepath="stuff")
    result = result.format()
    assert result.exists("stuff") is False
def test_s3_result_is_pickleable(self, monkeypatch):
    class client:
        def __init__(self, *args, **kwargs):
            pass

        def __getstate__(self):
            raise ValueError("I cannot be pickled.")

    import boto3

    with patch.dict("sys.modules", {"boto3": MagicMock()}):
        boto3.session.Session().client = client
        with prefect.context(
            secrets=dict(AWS_CREDENTIALS=dict(ACCESS_KEY=1, SECRET_ACCESS_KEY=42))
        ):
            with set_temporary_config({"cloud.use_local_secrets": True}):
                result = S3Result(bucket="foo")
        res = cloudpickle.loads(cloudpickle.dumps(result))
        assert isinstance(res, S3Result)
def __init__(
    self,
    bucket: str,
    key: str = None,
    stored_as_script: bool = False,
    local_script_path: str = None,
    client_options: dict = None,
    **kwargs: Any,
) -> None:
    self.flows = dict()  # type: Dict[str, str]
    self._flows = dict()  # type: Dict[str, "Flow"]
    self.bucket = bucket
    self.key = key
    self.local_script_path = local_script_path or prefect.context.get(
        "local_script_path", None
    )
    self.client_options = client_options

    result = S3Result(bucket=bucket)
    super().__init__(
        result=result,
        stored_as_script=stored_as_script,
        **kwargs,
    )
from prefect import task, Flow, Parameter, unmapped
from prefect.engine.results import S3Result, LocalResult
from prefect.tasks.secrets.base import PrefectSecret

from moc_data_tasks import get_tsx_moc_imb, partition_df, df_to_db

import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)

# For other template options https://docs.prefect.io/api/latest/utilities/context.html#context-2
s3_result = S3Result(
    bucket="results-prefect-tst",
    location="{flow_name}-{today}/results.prefect",
)
# lcl_result = LocalResult(dir="~/prefect_guide/results/", location="{flow_name}/{today}")
result_h = s3_result

# A flow has no particular order unless the data is bound (shown) or explicitly set (not shown).
with Flow(name="Test-Get-Imbalances", result=result_h) as tsx_imb_fl:

    tsx_url = Parameter(
        "tsx_url", default="https://api.tmxmoney.com/mocimbalance/en/TSX/moc.html"
    )
    imb_tbl_nm = Parameter("imb_tbl_nm", default="moc_tst")
    n_conn = Parameter("n_conn", default=1)

    # Scrape the website
    tsx_imb_df = get_tsx_moc_imb(tsx_url)
    df = (
        source_df.loc[:, ~source_df.columns.str.contains("url")]
        .drop(["image", "episode"], axis="columns")
        .set_index("id")
    )
    df.to_csv(str_buffer)
    csv_string = str_buffer.getvalue()
    return csv_string


# Instantiate tasks from the Prefect task library
s3_upload = S3Upload(name="Upload")

# Flow definition
with Flow("docker-dependencias", result=S3Result(bucket="prefect-grupyrp")) as flow:
    bucket = Parameter("Nome do bucket", required=True)
    credenciais_aws = PrefectSecret("AWS_CREDENTIALS")
    personagens = fetch_characters()
    dados = build_csv(source=personagens)
    csv_upload = s3_upload(
        data=dados,
        bucket=bucket,
        credentials=credenciais_aws,
        key=f"rick_and_morty/{data}_characters.csv",
    )

if __name__ == "__main__":
    flow.run_config = DockerRun(
        env={"EXTRA_PIP_PACKAGES": "pandas requests prefect[aws]"}
    )
    flow.storage = GitHub(
flow.storage = Docker(
    python_dependencies=[
        'boto3',
        'pandas',
    ],
    env_vars={
        'PREFECT__CONTEXT__MINIO_URL': 'http://minio:9000',
    },
)
flow.environment = LocalEnvironment(executor=LocalDaskExecutor())
flow.result = S3Result(
    bucket='prefect',
    boto3_kwargs={
        'endpoint_url': 'http://minio:9000',
        'aws_access_key_id': 'accesskey',
        'aws_secret_access_key': 'secretkey',
    },
)
flow.register(project_name="My Project", labels=['docker'])

# warehouse
from flows.warehouse import flow

flow.storage = Docker(
    base_image='wdpbigdata/spark:latest',
    files={
        abspath('scripts/warehouse.py'): '/opt/scripts/warehouse.py',
    },
def test_s3_result_exists(self, mock_boto3):
    result = S3Result(bucket="bob", location="stuff")
    result = result.format()
    assert result.exists("stuff") is True
from prefect import task, Flow
from prefect.engine.results import S3Result
import time


@task(target="{task_name}-{today}")
def extract():
    """Get a list of data"""
    return [1, 2, 3]


@task(target="{task_name}-{today}")
def transform(data):
    """Multiply the input by 10"""
    return [i * 10 for i in data]


@task
def load(data):
    """Print the data to indicate it was received"""
    print("Here's your data: {}".format(data))


with Flow("ETL", result=S3Result(bucket="")) as flow:
    e = extract()
    t = transform(e)
    l = load(t)

flow.run()
FLOWS_DIR_PATH = '/opt/server/src/flows'

storage_kwargs = {
    'dockerfile': 'server/Dockerfile',
    'registry_url': REGISTRY_URL,
    'stored_as_script': True,
}

# Executor
local_executor = LocalExecutor()
dask_executor = DaskExecutor(address=DASK_SCHEDULER_ADDR)

# Result
if RESULT_SUBCLASS == 'azure':
    result = AzureResult(container=AZURE_RESULT_CONTAINER)
elif RESULT_SUBCLASS == 's3':
    result = S3Result(bucket=S3_RESULT_BUCKET)
else:
    result = LocalResult(dir=LOCAL_RESULT_DIR)

# Set flow run config
mapreduce_wordcount.run_config = run_config

# Set flow storage
mapreduce_wordcount.storage = Docker(
    path=f'{FLOWS_DIR_PATH}/mock.py',
    **storage_kwargs,
)
import prefect

print(prefect.__version__)

from prefect import task, Flow
from prefect.engine.results import S3Result


@task(target="{task_name}-{today}")
def extract():
    return [1, 2, 3]


@task(target="{task_name}-{today}")
def transform(data):
    return [i * 10 for i in data]


@task
def load(data):
    print("Here's your data: {}".format(data))


with Flow("ETL", result=S3Result(bucket="prefect-test-bucket")) as flow:
    e = extract()
    t = transform(e)
    l = load(t)

flow.run()
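Because extract and transform declare targets, a second flow.run() should find the rendered keys already in the bucket and skip recomputation. A hedged follow-up sketch for checking that; it assumes "{today}" renders as an ISO UTC date and that default or context-provided AWS credentials can read the bucket:

# Hedged sketch: verifies the templated targets from the run above now exist in S3.
from datetime import datetime, timezone

from prefect.engine.results import S3Result

result = S3Result(bucket="prefect-test-bucket")
today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
print(result.exists(f"extract-{today}"))    # True once the first run has written the target
print(result.exists(f"transform-{today}"))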
print(f"\nHere's your data: {data}") # Parameterized Scheduling from prefect.schedules import Schedule from prefect.schedules.clocks import IntervalClock, CronClock, DatesClock clock1, clock2 = IntervalClock( start_date=datetime.utcnow() + timedelta(seconds=10), interval=timedelta(hours=12), parameter_defaults={"length": 15}), IntervalClock( start_date=datetime.utcnow() + timedelta(seconds=10), interval=timedelta(hours=24), parameter_defaults={"length": 20}) schedule = Schedule(clocks=[clock1, clock2]) # Define Tasks in a Flow Context with Flow('Evolving ETL', result=S3Result(bucket="flow-result-storage"), state_handlers=[my_state_handler], schedule=schedule) as flow: with case(length, 5): e = extract(length) with case(length, 50): e = extract(length) t = transform.map(e) l = load(t) flow.run(parameters={'length': 50}) # Prints data
def requested_dates_task(start_date: str, end_date: str) -> list:
    return get_open_market_dates(start_date, end_date)


@task(
    checkpoint=True,
    target="checkpoints/{tick_type}/symbol={symbol_date[0]}/date={symbol_date[1]}/data.prefect",
)
def backfill_task(symbol_date: tuple, tick_type: str) -> pd.DataFrame:
    return backfill_date_tos3(
        symbol=symbol_date[0], date=symbol_date[1], tick_type=tick_type
    )


result_store = S3Result(
    bucket='polygon-equities',
    boto3_kwargs={
        'aws_access_key_id': environ['B2_ACCESS_KEY_ID'],
        'aws_secret_access_key': environ['B2_SECRET_ACCESS_KEY'],
        'endpoint_url': environ['B2_ENDPOINT_URL'],
    },
)

with Flow(name='backfill-flow', result=result_store) as flow:
    start_date = Parameter('start_date', default='2020-01-01')
    end_date = Parameter('end_date', default='2020-02-01')
    tick_type = Parameter('tick_type', default='trades')
    symbols = Parameter('symbols', default=['GLD'])

    req_dates = requested_dates_task(start_date, end_date)
    symbol_date = cross_product_task(symbols, req_dates)
# Configurable value taken at runtime
length = Parameter(name="length", default=5, required=False)


# ETL Pipeline Tasks
@task(result=LocalResult(), target="{date:%A}/{task_name}.prefect")
def extract(length):
    # Extract the data
    return sample(range(100), length)


@task(max_retries=3, retry_delay=timedelta(seconds=5))
def transform(data):
    # Transform the data
    return data * 10


@task(trigger=some_successful(at_least=1, at_most=6))
def load(data):
    # Load the data
    print(f"\nHere's your data: {data}")


# Define Tasks in a Flow Context
with Flow('Evolving ETL', result=S3Result(bucket="flow-result-storage")) as flow:
    with case(length, 5):
        e = extract(length)
    with case(length, 50):
        e = extract(length)

    t = transform.map(e)
    l = load(t)

flow.run(parameters={'length': 50})  # Prints data
import prefect

print(prefect.__version__)

from prefect import task, Flow
from prefect.engine.results import LocalResult, S3Result
from prefect.tasks.secrets.base import PrefectSecret
from prefect.engine.executors import DaskExecutor, LocalExecutor
from prefect.environments.storage import Docker
from prefect.environments import RemoteEnvironment

template = '{flow_name}-{today}/{task_name}/{map_index}.prefect'

s3_result = S3Result(bucket="results-prefect-tst")


@task()
def gen_list():
    return [x for x in range(10)]


@task(target=template)
def add(x, y):
    return x + y


@task(target=template)
def multiply(x, y):
    return x * y
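The snippet defines templated targets but stops before building a flow. A minimal sketch, assuming the tasks above; the flow name and the mapped wiring are illustrative, not from the original source:

# Minimal sketch, not part of the original snippet: wiring the templated tasks into a flow
# that persists each mapped result under the {flow_name}-{today}/{task_name}/{map_index} key.
from prefect import Flow, unmapped

with Flow("target-template-demo", result=s3_result) as flow:
    xs = gen_list()
    sums = add.map(xs, y=unmapped(1))
    products = multiply.map(sums, y=unmapped(2))

flow.run()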