Example #1
    def __init__(
        self,
        bucket: str,
        key: str = None,
        stored_as_script: bool = False,
        local_script_path: str = None,
        client_options: dict = None,
        upload_options: dict = None,
        **kwargs: Any,
    ) -> None:
        self.bucket = bucket
        self.key = key
        self.upload_options = upload_options
        self.local_script_path = local_script_path or prefect.context.get(
            "local_script_path", None
        )

        self.client_options = client_options

        result = S3Result(bucket=bucket, boto3_kwargs=client_options)
        super().__init__(
            result=result,
            stored_as_script=stored_as_script,
            **kwargs,
        )
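The constructor above comes from an S3 flow-storage class. As a rough usage sketch (assuming Prefect 0.x; the bucket and key below are placeholders, and in newer 0.x releases this class is exposed as prefect.storage.S3):

from prefect import Flow, task
from prefect.environments.storage import S3


@task
def say_hello():
    print("hello")


with Flow("s3-storage-example") as flow:
    say_hello()

# serialize the flow to the named bucket; bucket and key are placeholders
flow.storage = S3(bucket="my-flow-bucket", key="flows/s3-storage-example.prefect")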
Example #2
    def test_forwards_boto3_kwargs(self, mock_boto3):
        result = S3Result(bucket="mybucket",
                          boto3_kwargs={"region_name": "a-region"})
        assert result.client is not None

        assert mock_boto3.client.call_args[1] == {
            "aws_access_key_id": "access_key",
            "aws_session_token": None,
            "aws_secret_access_key": "secret_access_key",
            "region_name": "a-region",
        }
Example #3
    def test_s3_result_does_not_exist(self, mock_boto3):
        import botocore

        exc = botocore.exceptions.ClientError({"Error": {
            "Code": "NoSuchKey"
        }}, "list_objects")
        mock_boto3.client.return_value.get_object.side_effect = exc

        result = S3Result(bucket="bob", location="stuff")
        result = result.format()
        assert result.exists("stuff") is False
Example #4
    def test_s3_writes_to_blob_with_rendered_filename(self, mock_boto3):
        result = S3Result(bucket="foo", location="{thing}/here.txt")

        with prefect.context(thing="yes!"):
            new_result = result.write("so-much-data", **prefect.context)

        used_uri = mock_boto3.client.return_value.upload_fileobj.call_args[1][
            "Key"]

        assert used_uri == new_result.location
        assert new_result.location.startswith("yes!/here.txt")
Example #5
    def test_s3_result_is_pickleable(self, mock_boto3):
        class NoPickle:
            def __getstate__(self):
                raise ValueError("I cannot be pickled.")

        mock_boto3.client.return_value = NoPickle()

        result = S3Result(bucket="foo")
        assert result.client is not None

        res = cloudpickle.loads(cloudpickle.dumps(result))
        assert isinstance(res, S3Result)
Example #6
    def test_s3_client_init_uses_secrets(self, session):
        result = S3Result(bucket="bob", credentials_secret="AWS_CREDENTIALS")
        assert result.bucket == "bob"
        assert session.Session().client.called is False

        with prefect.context(secrets=dict(
                AWS_CREDENTIALS=dict(ACCESS_KEY=1, SECRET_ACCESS_KEY=42))):
            with set_temporary_config({"cloud.use_local_secrets": True}):
                result.initialize_client()
        assert session.Session().client.call_args[1] == {
            "aws_access_key_id": 1,
            "aws_secret_access_key": 42,
        }
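Outside of a test, the same local-secret path can be exercised directly. A minimal sketch mirroring the test above (placeholder bucket and credential values; this uses the older S3Result API with credentials_secret):

import prefect
from prefect.engine.results import S3Result
from prefect.utilities.configuration import set_temporary_config

result = S3Result(bucket="my-bucket", credentials_secret="AWS_CREDENTIALS")
# supply the secret from the local context instead of Prefect Cloud
with prefect.context(secrets={"AWS_CREDENTIALS": {"ACCESS_KEY": "...", "SECRET_ACCESS_KEY": "..."}}):
    with set_temporary_config({"cloud.use_local_secrets": True}):
        result.initialize_client()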
Example #7
    def test_s3_client_init_uses_custom_secrets(self, session):
        result = S3Result(bucket="bob", credentials_secret="MY_FOO")

        with prefect.context(secrets=dict(
                MY_FOO=dict(ACCESS_KEY=1, SECRET_ACCESS_KEY=999))):
            with set_temporary_config({"cloud.use_local_secrets": True}):
                result.initialize_client()

        assert result.bucket == "bob"
        assert session.Session().client.call_args[1] == {
            "aws_access_key_id": 1,
            "aws_secret_access_key": 999,
        }
Example #8
    def test_s3_client_init_uses_secrets(self, mock_boto3):
        result = S3Result(bucket="bob")
        assert result.bucket == "bob"
        assert mock_boto3.client.called is False

        assert result.client is not None

        assert mock_boto3.client.call_args[1] == {
            "aws_access_key_id": "access_key",
            "aws_session_token": None,
            "aws_secret_access_key": "secret_access_key",
            "region_name": None,
        }
Example #9
    def __init__(self,
                 bucket: str,
                 client_options: dict = None,
                 key: str = None,
                 **kwargs: Any) -> None:
        self.flows = dict()  # type: Dict[str, str]
        self._flows = dict()  # type: Dict[str, "Flow"]
        self.bucket = bucket
        self.key = key

        self.client_options = client_options

        result = S3Result(bucket=bucket)
        super().__init__(result=result, **kwargs)
Example #10
    def test_s3_writes_to_blob_with_rendered_filename(self, session):
        result = S3Result(bucket="foo", filepath="{thing}/here.txt")

        with prefect.context(
                secrets=dict(
                    AWS_CREDENTIALS=dict(ACCESS_KEY=1, SECRET_ACCESS_KEY=42)),
                thing="yes!",
        ) as ctx:
            with set_temporary_config({"cloud.use_local_secrets": True}):
                new_result = result.write("so-much-data", **ctx)

        used_uri = session.Session(
        ).client.return_value.upload_fileobj.call_args[1]["Key"]

        assert used_uri == new_result.filepath
        assert new_result.filepath.startswith("yes!/here.txt")
Example #11
    def test_s3_result_exists(self, session):
        import botocore

        exc = botocore.exceptions.ClientError({"Error": {
            "Code": "NoSuchKey"
        }}, "list_objects")

        class _client:
            def __init__(self, *args, **kwargs):
                pass

            def get_object(self, *args, **kwargs):
                return MagicMock()

        session.Session().client = _client
        result = S3Result(bucket="bob", location="stuff")
        result = result.format()
        assert result.exists("stuff") is True
Example #12
    def test_s3_result_does_not_exist(self, session):
        import botocore

        exc = botocore.exceptions.ClientError({"Error": {
            "Code": "404"
        }}, "list_objects")

        class _client:
            def __init__(self, *args, **kwargs):
                pass

            def get_object(self, *args, **kwargs):
                raise exc

        session.Session().client = _client
        result = S3Result(bucket="bob", filepath="stuff")
        result = result.format()
        assert result.exists("stuff") == False
Example #13
    def test_s3_result_is_pickleable(self, monkeypatch):
        class client:
            def __init__(self, *args, **kwargs):
                pass

            def __getstate__(self):
                raise ValueError("I cannot be pickled.")

        import boto3

        with patch.dict("sys.modules", {"boto3": MagicMock()}):
            boto3.session.Session().client = client

            with prefect.context(secrets=dict(
                    AWS_CREDENTIALS=dict(ACCESS_KEY=1, SECRET_ACCESS_KEY=42))):
                with set_temporary_config({"cloud.use_local_secrets": True}):
                    result = S3Result(bucket="foo")
            res = cloudpickle.loads(cloudpickle.dumps(result))
            assert isinstance(res, S3Result)
Example #14
    def __init__(
        self,
        bucket: str,
        key: str = None,
        stored_as_script: bool = False,
        local_script_path: str = None,
        client_options: dict = None,
        **kwargs: Any,
    ) -> None:
        self.flows = dict()  # type: Dict[str, str]
        self._flows = dict()  # type: Dict[str, "Flow"]
        self.bucket = bucket
        self.key = key
        self.local_script_path = local_script_path or prefect.context.get(
            "local_script_path", None)

        self.client_options = client_options

        result = S3Result(bucket=bucket)
        super().__init__(
            result=result,
            stored_as_script=stored_as_script,
            **kwargs,
        )
Example #15
from prefect import task, Flow, Parameter, unmapped
from prefect.engine.results import S3Result, LocalResult

from prefect.tasks.secrets.base import PrefectSecret

from moc_data_tasks import get_tsx_moc_imb, partition_df, df_to_db


import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)


# For other template options, see https://docs.prefect.io/api/latest/utilities/context.html#context-2
s3_result = S3Result(bucket="results-prefect-tst", location="{flow_name}-{today}/results.prefect")

#lcl_result = LocalResult(dir="~/prefect_guide/results/", location="{flow_name}/{today}")

result_h = s3_result

# A flow imposes no particular task order unless dependencies are created by passing data (shown) or set explicitly (not shown).
with Flow(name="Test-Get-Imbalances", result=result_h) as tsx_imb_fl:
    
    tsx_url = Parameter("tsx_url", default="https://api.tmxmoney.com/mocimbalance/en/TSX/moc.html")
    imb_tbl_nm = Parameter("imb_tbl_nm", default="moc_tst")
    n_conn = Parameter("n_conn", default=1) 
    
    # Scrape the website
    tsx_imb_df = get_tsx_moc_imb(tsx_url)
Example #16
    df = (source_df.loc[:, ~source_df.columns.str.contains("url")].drop(
        ["image", "episode"], axis="columns").set_index("id"))

    df.to_csv(str_buffer)
    csv_string = str_buffer.getvalue()

    return csv_string


# Instantiate tasks from the Prefect library
s3_upload = S3Upload(name="Upload")

# Flow definition
with Flow("docker-dependencias",
          result=S3Result(bucket="prefect-grupyrp")) as flow:
    bucket = Parameter("Nome do bucket", required=True)
    credenciais_aws = PrefectSecret("AWS_CREDENTIALS")
    personagens = fetch_characters()
    dados = build_csv(source=personagens)
    csv_upload = s3_upload(
        data=dados,
        bucket=bucket,
        credentials=credenciais_aws,
        key=f"rick_and_morty/{data}_characters.csv",
    )

if __name__ == "__main__":
    flow.run_config = DockerRun(
        env={"EXTRA_PIP_PACKAGES": "pandas requests prefect[aws]"})
    flow.storage = GitHub(
Example #17
flow.storage = Docker(
    python_dependencies=[
        'boto3',
        'pandas',
    ],
    env_vars={
        'PREFECT__CONTEXT__MINIO_URL': 'http://minio:9000',
    },
)

flow.environment = LocalEnvironment(executor=LocalDaskExecutor())

flow.result = S3Result(
    bucket='prefect',
    boto3_kwargs={
        'endpoint_url': 'http://minio:9000',
        'aws_access_key_id': 'accesskey',
        'aws_secret_access_key': 'secretkey',
    },
)

flow.register(project_name="My Project", labels=['docker'])

# warehouse

from flows.warehouse import flow

flow.storage = Docker(
    base_image='wdpbigdata/spark:latest',
    files={
        abspath('scripts/warehouse.py'): '/opt/scripts/warehouse.py',
    },
Example #18
    def test_s3_result_exists(self, mock_boto3):
        result = S3Result(bucket="bob", location="stuff")
        result = result.format()
        assert result.exists("stuff") is True
Example #19
from prefect import task, Flow
from prefect.engine.results import S3Result
import time


@task(target="{task_name}-{today}")
def extract():
    """Get a list of data"""
    return [1, 2, 3]


@task(target="{task_name}-{today}")
def transform(data):
    """Multiply the input by 10"""
    return [i * 10 for i in data]


@task
def load(data):
    """Print the data to indicate it was received"""
    print("Here's your data: {}".format(data))


with Flow("ETL", result=S3Result(bucket="")) as flow:
    e = extract()
    t = transform(e)
    l = load(t)

flow.run()
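To confirm that the targets above were actually checkpointed, one result can be read back through the same result type. A sketch with a placeholder bucket, assuming the flow has already run and that {today} renders as the UTC date:

import pendulum
from prefect.engine.results import S3Result

result = S3Result(bucket="my-results-bucket")  # placeholder bucket
location = "extract-{}".format(pendulum.now("utc").strftime("%Y-%m-%d"))
if result.exists(location):
    print(result.read(location).value)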
Example #20
FLOWS_DIR_PATH = '/opt/server/src/flows'
storage_kwargs = {
    'dockerfile': 'server/Dockerfile',
    'registry_url': REGISTRY_URL,
    'stored_as_script': True,
}

# Executor
local_executor = LocalExecutor()
dask_executor = DaskExecutor(address=DASK_SCHEDULER_ADDR)

# Result
if RESULT_SUBCLASS == 'azure':
    result = AzureResult(container=AZURE_RESULT_CONTAINER)
elif RESULT_SUBCLASS == 's3':
    result = S3Result(bucket=S3_RESULT_BUCKET)
else:
    result = LocalResult(dir=LOCAL_RESULT_DIR)


# Set flow run configs
mapreduce_wordcount.run_config = run_config


# Set flow storage
mapreduce_wordcount.storage = Docker(
    path=f'{FLOWS_DIR_PATH}/mock.py',
    **storage_kwargs
)
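Note that the result object selected above is never attached to the flow in this snippet; assuming the same mapreduce_wordcount flow object, wiring it up would be one more line:

# attach the selected result backend to the flow (sketch)
mapreduce_wordcount.result = result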

Example #21
import prefect
print(prefect.__version__)

from prefect import task, Flow
from prefect.engine.results import S3Result


@task(target="{task_name}-{today}")
def extract():
    return [1, 2, 3]


@task(target="{task_name}-{today}")
def transform(data):
    return [i * 10 for i in data]


@task
def load(data):
    print("Here's your data: {}".format(data))


with Flow("ETL", result=S3Result(bucket="prefect-test-bucket")) as flow:
    e = extract()
    t = transform(e)
    l = load(t)

flow.run()
Example #22
    print(f"\nHere's your data: {data}")


# Parameterized Scheduling
from prefect.schedules import Schedule
from prefect.schedules.clocks import IntervalClock, CronClock, DatesClock

clock1 = IntervalClock(
    start_date=datetime.utcnow() + timedelta(seconds=10),
    interval=timedelta(hours=12),
    parameter_defaults={"length": 15})
clock2 = IntervalClock(
    start_date=datetime.utcnow() + timedelta(seconds=10),
    interval=timedelta(hours=24),
    parameter_defaults={"length": 20})

schedule = Schedule(clocks=[clock1, clock2])

# Define Tasks in a Flow Context
with Flow('Evolving ETL',
          result=S3Result(bucket="flow-result-storage"),
          state_handlers=[my_state_handler],
          schedule=schedule) as flow:
    with case(length, 5):
        e = extract(length)
    with case(length, 50):
        e = extract(length)

    t = transform.map(e)
    l = load(t)

flow.run(parameters={'length': 50})  # Prints data
Example #23
def requested_dates_task(start_date: str, end_date: str) -> list:
    return get_open_market_dates(start_date, end_date)


@task(
    checkpoint=True,
    target="checkpoints/{tick_type}/symbol={symbol_date[0]}/date={symbol_date[1]}/data.prefect"
)
def backfill_task(symbol_date: tuple, tick_type: str) -> pd.DataFrame:
    return backfill_date_tos3(symbol=symbol_date[0], date=symbol_date[1], tick_type=tick_type)
    

result_store = S3Result(
    bucket='polygon-equities', 
    boto3_kwargs={
        'aws_access_key_id': environ['B2_ACCESS_KEY_ID'],
        'aws_secret_access_key': environ['B2_SECRET_ACCESS_KEY'],
        'endpoint_url':  environ['B2_ENDPOINT_URL']
    }
)


with Flow(name='backfill-flow', result=result_store) as flow:
    
    start_date = Parameter('start_date', default='2020-01-01')
    end_date = Parameter('end_date', default='2020-02-01')
    tick_type = Parameter('tick_type', default='trades')
    symbols = Parameter('symbols', default=['GLD'])

    req_dates = requested_dates_task(start_date, end_date)

    symbol_date = cross_product_task(symbols, req_dates)
Example #24
# Configurable value taken at runtime
length = Parameter(name="length", default=5, required=False)

# ETL Pipeline Tasks
@task(result=LocalResult(), target="{date:%A}/{task_name}.prefect")
def extract(length):
    # Extract the data
    return sample(range(100), length)

@task(max_retries=3, retry_delay=timedelta(seconds=5))
def transform(data):
    # Transform the data
    return data * 10

@task(trigger=some_successful(at_least=1, at_most=6))
def load(data):
    # Load the data
    print(f"\nHere's your data: {data}")

# Define Tasks in a Flow Context
with Flow('Evolving ETL', result=S3Result(bucket="flow-result-storage")) as flow:
    with case(length, 5):
        e = extract(length)
    with case(length, 50):
        e = extract(length)

    t = transform.map(e)
    l = load(t)

flow.run(parameters={'length': 50}) # Prints data
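For reference, the "{date:%A}/{task_name}.prefect" target used above is an ordinary Python format string filled from the run context, so each weekday gets its own checkpoint folder. A standalone sketch of how it renders:

from datetime import datetime

# 2021-03-01 was a Monday
template = "{date:%A}/{task_name}.prefect"
print(template.format(date=datetime(2021, 3, 1), task_name="extract"))
# -> Monday/extract.prefect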
Example #25
import prefect

print(prefect.__version__)

from prefect import task, Flow
from prefect.engine.results import LocalResult, S3Result
from prefect.tasks.secrets.base import PrefectSecret
from prefect.engine.executors import DaskExecutor, LocalExecutor
from prefect.environments.storage import Docker
from prefect.environments import RemoteEnvironment

template = '{flow_name}-{today}/{task_name}/{map_index}.prefect'

s3_result = S3Result(bucket="results-prefect-tst", )


@task()
def gen_list():
    return [x for x in range(10)]


@task(target=template)
def add(x, y):
    return x + y


@task(target=template)
def multiply(x, y):
    return x * y