Code Example #1
from airtunnel import PandasDataAsset, PandasDataAssetIO


def rebuild_for_store(asset: PandasDataAsset, airflow_context):
    # read the raw source files that ingestion picked up for this asset
    student_data = PandasDataAssetIO.read_data_asset(
        asset=asset, source_files=asset.pickedup_files(airflow_context))

    # apply the column renames declared for this asset
    student_data = asset.rename_fields_as_declared(student_data)

    # persist the transformed data to the asset's staging area
    PandasDataAssetIO.write_data_asset(asset=asset, data=student_data)
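Presumably, airtunnel discovers a rebuild_for_store() script like this one by asset name and runs it through the PandasTransformationOperator shown in Code Example #7; a minimal usage sketch (the script-discovery convention is an assumption here):

from airtunnel import PandasDataAsset
from airtunnel.operators.transformation import PandasTransformationOperator

# inside a DAG: runs the rebuild_for_store() script registered for "student"
transform_student = PandasTransformationOperator(asset=PandasDataAsset("student"))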
Code Example #2
import pandas as pd

from airtunnel import PandasDataAsset, PandasDataAssetIO


def rebuild_for_store(asset: PandasDataAsset, airflow_context):
    # the three upstream assets this summary is derived from
    student = PandasDataAsset(name="student")
    programme = PandasDataAsset(name="programme")
    enrollment = PandasDataAsset(name="enrollment")

    # retrieve each upstream asset from the store, registering this asset
    # as the consumer (which captures data asset lineage)
    student_df = student.retrieve_from_store(
        airflow_context=airflow_context, consuming_asset=asset
    )
    programme_df = programme.retrieve_from_store(
        airflow_context=airflow_context, consuming_asset=asset
    )
    enrollment_df = enrollment.retrieve_from_store(
        airflow_context=airflow_context, consuming_asset=asset
    )

    # join enrollments with students and programmes on their declared key columns
    enrollment_summary: pd.DataFrame = enrollment_df.merge(
        right=student_df, on=student.declarations.key_columns
    ).merge(right=programme_df, on=programme.declarations.key_columns)

    # count enrollments per (student_major, programme_name) combination
    enrollment_summary = (
        enrollment_summary.loc[:, ["student_major", "programme_name", "student_id"]]
        .groupby(by=["student_major", "programme_name"])
        .count()
    )

    PandasDataAssetIO.write_data_asset(asset=asset, data=enrollment_summary)
Code Example #3
from typing import Dict

import pandas as pd

from airtunnel import PandasDataAsset


def test_data_asset_paths(
    test_parquet_asset: PandasDataAsset,
    test_parquet_asset_df: pd.DataFrame,
    fake_airflow_context: Dict,
) -> None:
    # exercise the various path getters/properties - each should yield a string:
    test_path = test_parquet_asset.staging_pickedup_path(fake_airflow_context)
    assert isinstance(test_path, str)
    test_path = test_parquet_asset.ingest_archive_path
    assert isinstance(test_path, str)
    test_path = test_parquet_asset.ready_path
    assert isinstance(test_path, str)
    test_path = test_parquet_asset.staging_ready_path
    assert isinstance(test_path, str)
    test_path = test_parquet_asset.landing_path
    assert isinstance(test_path, str)
    test_path = test_parquet_asset.ready_archive_path(fake_airflow_context)
    assert isinstance(test_path, str)
Code Example #4
from typing import Dict

import pandas as pd

from airtunnel import PandasDataAsset
from airtunnel.data_asset import ShellDataAsset


def test_pandas_data_asset(
    fake_airflow_context: Dict,
    test_parquet_asset: PandasDataAsset,
    test_parquet_asset_df: pd.DataFrame,
) -> None:
    # retrieve_from_store() should behave the same for every combination
    # of its two optional arguments:

    # neither argument set
    d1 = test_parquet_asset.retrieve_from_store()

    # only airflow_context
    d2 = test_parquet_asset.retrieve_from_store(
        airflow_context=fake_airflow_context)

    # only consuming_asset
    d3 = test_parquet_asset.retrieve_from_store(
        consuming_asset=ShellDataAsset(name="test_consumer"))

    # both arguments set
    d4 = test_parquet_asset.retrieve_from_store(
        airflow_context=fake_airflow_context,
        consuming_asset=ShellDataAsset(name="test_consumer"),
    )

    # all four retrievals should yield identical data
    assert d1.equals(d2) and d2.equals(d3) and d3.equals(d4)
Code Example #5
import os
from os import path

import pandas as pd

from airtunnel import PandasDataAsset, PandasDataAssetIO


def test_read_write_parquet(test_parquet_in_asset: PandasDataAsset,
                            iris: pd.DataFrame, fake_airflow_context) -> None:
    # place the iris test frame as a parquet file in the asset's pickedup staging area
    p = path.join(
        test_parquet_in_asset.staging_pickedup_path(fake_airflow_context),
        "test_parquet_in.parquet",
    )
    os.makedirs(path.dirname(p), exist_ok=True)
    iris.to_parquet(p)

    # try reading without extra kwargs:
    PandasDataAssetIO.read_data_asset(test_parquet_in_asset, source_files=[p])

    # try with additional kwargs, passed through to pandas.read_parquet:
    PandasDataAssetIO.read_data_asset(asset=test_parquet_in_asset,
                                      source_files=[p],
                                      engine="auto")
Code Example #6
import os
from os import path

import pandas as pd

from airtunnel import PandasDataAsset, PandasDataAssetIO


def test_read_write_xlsx(test_xlsx_in_asset: PandasDataAsset,
                         iris: pd.DataFrame, fake_airflow_context) -> None:
    # place the iris test frame as an Excel file in the asset's pickedup staging area
    # (note: writing ".xls" requires an engine such as xlwt; recent pandas
    # versions only write ".xlsx", e.g. via openpyxl)
    p = path.join(
        test_xlsx_in_asset.staging_pickedup_path(fake_airflow_context),
        "test_xlsx_in.xls",
    )
    os.makedirs(path.dirname(p), exist_ok=True)
    iris.to_excel(p)

    # try without any extra kwargs:
    PandasDataAssetIO.read_data_asset(asset=test_xlsx_in_asset,
                                      source_files=[p])
    # try with additional kwargs, passed through to pandas.read_excel:
    PandasDataAssetIO.read_data_asset(asset=test_xlsx_in_asset,
                                      source_files=[p],
                                      sheet_name=0)
Code Example #7
from datetime import datetime

from airflow.models import DAG

from airtunnel import PandasDataAsset
from airtunnel.operators.archival import DataAssetArchiveOperator, IngestArchiveOperator
from airtunnel.operators.ingestion import IngestOperator
from airtunnel.operators.loading import StagingToReadyOperator
from airtunnel.operators.transformation import PandasTransformationOperator
from airtunnel.sensors.ingestion import SourceFileIsReadySensor

student = PandasDataAsset("student")
programme = PandasDataAsset("programme")
enrollment = PandasDataAsset("enrollment")
enrollment_summary = PandasDataAsset("enrollment_summary")

with DAG(
        dag_id="university",
        schedule_interval=None,
        start_date=datetime(year=2019, month=9, day=1),
) as dag:
    ingested_ready_tasks = set()

    # a common stream of tasks for all ingested assets:
    for ingested_asset in (student, programme, enrollment):
        source_is_ready = SourceFileIsReadySensor(
            # the poke interval is reduced to only 3 seconds so that the
            # example runs complete faster - do not do this in production! :)
            asset=ingested_asset,
            poke_interval=3,
            no_of_required_static_pokes=2,
        )
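        # NOTE: the original example is truncated at this point. Based on the
        # operators imported above, a plausible continuation (a sketch, not the
        # verbatim original) wires the per-asset task stream and, downstream,
        # the summary build:
        ingest = IngestOperator(asset=ingested_asset)
        transform = PandasTransformationOperator(asset=ingested_asset)
        archive = DataAssetArchiveOperator(asset=ingested_asset)
        staging_to_ready = StagingToReadyOperator(asset=ingested_asset)
        ingest_archival = IngestArchiveOperator(asset=ingested_asset)

        source_is_ready >> ingest >> transform >> archive >> staging_to_ready >> ingest_archival
        ingested_ready_tasks.add(staging_to_ready)

    # once every ingested asset is ready, rebuild and load the summary asset:
    build_enrollment_summary = PandasTransformationOperator(asset=enrollment_summary)
    build_enrollment_summary.set_upstream(list(ingested_ready_tasks))
    build_enrollment_summary >> StagingToReadyOperator(asset=enrollment_summary)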
Code Example #8
# presumably registered as a pytest fixture (e.g. in conftest.py)
def test_pandas_asset() -> PandasDataAsset:
    return PandasDataAsset("test_parquet_in_asset")
Code Example #9
from airtunnel import PandasDataAsset, PandasDataAssetIO


def rebuild_for_store(asset: PandasDataAsset, airflow_context):
    # read the picked-up source files for this asset
    programme_data = PandasDataAssetIO.read_data_asset(
        asset=asset, source_files=asset.pickedup_files(airflow_context))
    # drop duplicates based on the asset's declared key columns
    programme_data = programme_data.drop_duplicates(
        subset=asset.declarations.key_columns)
    PandasDataAssetIO.write_data_asset(asset=asset, data=programme_data)
Code Example #10
from datetime import datetime, timedelta

from airflow.models import DAG

from airtunnel import PandasDataAsset
from airtunnel.sensors.metadata import (
    AwaitLoadStatusSensor,
    AwaitAssetAncestorsUpdatedSensor,
)

enrollment_summary = PandasDataAsset("enrollment_summary")

with DAG(
    dag_id="metadata_sensors",
    schedule_interval=None,
    start_date=datetime(year=2019, month=9, day=1),
) as dag:
    await_load_status = AwaitLoadStatusSensor(
        asset=enrollment_summary,
        refreshed_within=timedelta(days=1),
        poke_interval=5,
        timeout=120,
    )

    await_load_status_refreshed_after = AwaitLoadStatusSensor(
        asset=enrollment_summary,
        task_id="enrollment_summary_load_status_2",
        refreshed_after=datetime.now() - timedelta(days=1),
        poke_interval=5,
        timeout=120,
    )
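    # NOTE: the original example is truncated at this point. It presumably
    # continues with the AwaitAssetAncestorsUpdatedSensor imported above; a
    # sketch (any arguments beyond the standard Airflow sensor ones are
    # assumptions here):
    await_ancestors_updated = AwaitAssetAncestorsUpdatedSensor(
        asset=enrollment_summary,
        poke_interval=5,
        timeout=120,
    )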
Code Example #11
from typing import Dict

import pytest
from airtunnel import PandasDataAsset

def test_pandas_data_asset_exceptions(
        fake_airflow_context: Dict,
        test_parquet_asset: PandasDataAsset) -> None:
    # either the rename or the subsequent rebuild is expected to raise
    with pytest.raises(Exception):
        test_parquet_asset.name = "fail"
        test_parquet_asset.rebuild_for_store(fake_airflow_context)
Code Example #12
# presumably registered as a pytest fixture (e.g. in conftest.py)
def test_xlsx_in_asset() -> PandasDataAsset:
    return PandasDataAsset("test_xlsx_in_asset")
Code Example #13
# presumably registered as a pytest fixture (e.g. in conftest.py)
def test_csv_asset() -> PandasDataAsset:
    return PandasDataAsset("test_csv_out_asset_pandas")
Code Example #14
from airtunnel import PandasDataAsset, PandasDataAssetIO


def rebuild_for_store(asset: PandasDataAsset, airflow_context):
    # the enrollment asset needs no transformation: read the picked-up files...
    enrollment_data = PandasDataAssetIO.read_data_asset(
        asset=asset, source_files=asset.pickedup_files(airflow_context)
    )
    # ...and write them back to the store unchanged
    PandasDataAssetIO.write_data_asset(asset=asset, data=enrollment_data)