from airtunnel import PandasDataAsset, PandasDataAssetIO


def rebuild_for_store(asset: PandasDataAsset, airflow_context):
    # read the raw files this run picked up from the landing area
    student_data = PandasDataAssetIO.read_data_asset(
        asset=asset, source_files=asset.pickedup_files(airflow_context)
    )
    # rename columns according to the asset's declaration file
    student_data = asset.rename_fields_as_declared(student_data)
    PandasDataAssetIO.write_data_asset(asset=asset, data=student_data)
import pandas as pd

from airtunnel import PandasDataAsset, PandasDataAssetIO


def rebuild_for_store(asset: PandasDataAsset, airflow_context):
    student = PandasDataAsset(name="student")
    programme = PandasDataAsset(name="programme")
    enrollment = PandasDataAsset(name="enrollment")

    # retrieve the three upstream assets from the ready layer,
    # declaring this asset as the consumer:
    student_df = student.retrieve_from_store(
        airflow_context=airflow_context, consuming_asset=asset
    )
    programme_df = programme.retrieve_from_store(
        airflow_context=airflow_context, consuming_asset=asset
    )
    enrollment_df = enrollment.retrieve_from_store(
        airflow_context=airflow_context, consuming_asset=asset
    )

    # join on the declared key columns of each dimension:
    enrollment_summary: pd.DataFrame = enrollment_df.merge(
        right=student_df, on=student.declarations.key_columns
    ).merge(right=programme_df, on=programme.declarations.key_columns)

    # count students per (student_major, programme_name):
    enrollment_summary = (
        enrollment_summary.loc[:, ["student_major", "programme_name", "student_id"]]
        .groupby(by=["student_major", "programme_name"])
        .count()
    )

    PandasDataAssetIO.write_data_asset(asset=asset, data=enrollment_summary)
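For reference, a downstream script could read this derived asset back in the same way. A minimal, hypothetical sketch (the consuming script is made up; it only reuses the retrieve_from_store pattern shown above):

from airtunnel import PandasDataAsset


def rebuild_for_store(asset: PandasDataAsset, airflow_context):
    # hypothetical consumer of enrollment_summary; passing consuming_asset
    # links the reading asset to the one being read (lineage)
    summary = PandasDataAsset(name="enrollment_summary")
    summary_df = summary.retrieve_from_store(
        airflow_context=airflow_context, consuming_asset=asset
    )
    # ... transform summary_df and write the consuming asset as needed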
def test_data_asset_paths(
    test_parquet_asset: PandasDataAsset,
    test_parquet_asset_df: pd.DataFrame,
    fake_airflow_context: Dict,
) -> None:
    # test various path getters/properties:
    test_path = test_parquet_asset.staging_pickedup_path(fake_airflow_context)
    assert isinstance(test_path, str)

    test_path = test_parquet_asset.ingest_archive_path
    assert isinstance(test_path, str)

    test_path = test_parquet_asset.ready_path
    assert isinstance(test_path, str)

    test_path = test_parquet_asset.staging_ready_path
    assert isinstance(test_path, str)

    test_path = test_parquet_asset.landing_path
    assert isinstance(test_path, str)

    test_path = test_parquet_asset.ready_archive_path(fake_airflow_context)
    assert isinstance(test_path, str)
def test_pandas_data_asset(
    fake_airflow_context: Dict,
    test_parquet_asset: PandasDataAsset,
    test_parquet_asset_df: pd.DataFrame,
) -> None:
    # neither parameter set
    d1 = test_parquet_asset.retrieve_from_store()

    # only airflow_context
    d2 = test_parquet_asset.retrieve_from_store(airflow_context=fake_airflow_context)

    # only consuming_asset
    d3 = test_parquet_asset.retrieve_from_store(
        consuming_asset=ShellDataAsset(name="test_consumer")
    )

    # both parameters set
    d4 = test_parquet_asset.retrieve_from_store(
        airflow_context=fake_airflow_context,
        consuming_asset=ShellDataAsset(name="test_consumer"),
    )

    # all four calls should yield the same data:
    assert d1.equals(d2) and d2.equals(d3) and d3.equals(d4)
def test_read_write_parquet(
    test_parquet_in_asset: PandasDataAsset, iris: pd.DataFrame, fake_airflow_context
) -> None:
    p = path.join(
        test_parquet_in_asset.staging_pickedup_path(fake_airflow_context),
        "test_parquet_in.parquet",
    )
    os.makedirs(path.dirname(p), exist_ok=True)
    iris.to_parquet(p)

    # try without any extra kwargs:
    PandasDataAssetIO.read_data_asset(asset=test_parquet_in_asset, source_files=[p])

    # try with additional kwargs:
    PandasDataAssetIO.read_data_asset(
        asset=test_parquet_in_asset, source_files=[p], engine="auto"
    )
def test_read_write_xlsx(
    test_xlsx_in_asset: PandasDataAsset, iris: pd.DataFrame, fake_airflow_context
) -> None:
    p = path.join(
        test_xlsx_in_asset.staging_pickedup_path(fake_airflow_context),
        "test_xlsx_in.xls",
    )
    os.makedirs(path.dirname(p), exist_ok=True)
    iris.to_excel(p)

    # try without any extra kwargs:
    PandasDataAssetIO.read_data_asset(asset=test_xlsx_in_asset, source_files=[p])

    # try with additional kwargs:
    PandasDataAssetIO.read_data_asset(
        asset=test_xlsx_in_asset, source_files=[p], sheet_name=0
    )
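Both tests above only exercise the read path. A hedged sketch of the write half, using the write_data_asset(asset=..., data=...) signature from the rebuild scripts and assuming (not confirmed by these snippets) that it persists to the asset's staging_ready_path property seen in test_data_asset_paths:

def test_write_xlsx(test_xlsx_in_asset: PandasDataAsset, iris: pd.DataFrame) -> None:
    # the call signature matches the rebuild_for_store scripts above;
    # the target-path assertion below is an assumption
    PandasDataAssetIO.write_data_asset(asset=test_xlsx_in_asset, data=iris)
    assert path.exists(test_xlsx_in_asset.staging_ready_path)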
from datetime import datetime

from airflow.models import DAG

from airtunnel import PandasDataAsset
from airtunnel.operators.archival import DataAssetArchiveOperator, IngestArchiveOperator
from airtunnel.operators.ingestion import IngestOperator
from airtunnel.operators.loading import StagingToReadyOperator
from airtunnel.operators.transformation import PandasTransformationOperator
from airtunnel.sensors.ingestion import SourceFileIsReadySensor

student = PandasDataAsset("student")
programme = PandasDataAsset("programme")
enrollment = PandasDataAsset("enrollment")
enrollment_summary = PandasDataAsset("enrollment_summary")

with DAG(
    dag_id="university",
    schedule_interval=None,
    start_date=datetime(year=2019, month=9, day=1),
) as dag:
    ingested_ready_tasks = set()

    # a common stream of tasks for all ingested assets:
    for ingested_asset in (student, programme, enrollment):
        source_is_ready = SourceFileIsReadySensor(
            asset=ingested_asset,
            # we reduce the poke interval to only 3 seconds so that our example
            # runs complete faster -- do not do this in production!! :)
            poke_interval=3,
            no_of_required_static_pokes=2,
        )
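The snippet ends mid-stream inside the for-loop. As a hedged sketch of how the remaining, imported operators could be wired (not the verbatim example DAG; it assumes each operator takes the asset just as the sensor above does):

        ingest = IngestOperator(asset=ingested_asset)
        transform = PandasTransformationOperator(asset=ingested_asset)
        archive = DataAssetArchiveOperator(asset=ingested_asset)
        staging_to_ready = StagingToReadyOperator(asset=ingested_asset)
        ingest_archival = IngestArchiveOperator(asset=ingested_asset)

        # chain the per-asset stream and remember the loading task,
        # e.g. to fan in a downstream transformation later:
        source_is_ready >> ingest >> transform >> archive >> staging_to_ready >> ingest_archival
        ingested_ready_tasks.add(staging_to_ready)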
@pytest.fixture
def test_pandas_asset() -> PandasDataAsset:
    # assumed: a pytest fixture (it returns an asset rather than asserting anything)
    return PandasDataAsset("test_parquet_in_asset")
from airtunnel import PandasDataAsset, PandasDataAssetIO


def rebuild_for_store(asset: PandasDataAsset, airflow_context):
    programme_data = PandasDataAssetIO.read_data_asset(
        asset=asset, source_files=asset.pickedup_files(airflow_context)
    )
    # de-duplicate on the declared key columns:
    programme_data = programme_data.drop_duplicates(
        subset=asset.declarations.key_columns
    )
    PandasDataAssetIO.write_data_asset(asset=asset, data=programme_data)
from datetime import datetime, timedelta

import pytest
from airflow.models import DAG

from airtunnel import PandasDataAsset
from airtunnel.sensors.metadata import (
    AwaitLoadStatusSensor,
    AwaitAssetAncestorsUpdatedSensor,
)

enrollment_summary = PandasDataAsset("enrollment_summary")

with DAG(
    dag_id="metadata_sensors",
    schedule_interval=None,
    start_date=datetime(year=2019, month=9, day=1),
) as dag:
    # wait until the asset was loaded within the last day:
    await_load_status = AwaitLoadStatusSensor(
        asset=enrollment_summary,
        refreshed_within=timedelta(days=1),
        poke_interval=5,
        timeout=120,
    )

    # the same condition, expressed as an absolute point in time:
    await_load_status_refreshed_after = AwaitLoadStatusSensor(
        asset=enrollment_summary,
        task_id="enrollment_summary_load_status_2",
        refreshed_after=datetime.now() - timedelta(days=1),
        poke_interval=5,
        timeout=120,
    )
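AwaitAssetAncestorsUpdatedSensor is imported above but its usage is cut off. A hedged sketch, assuming it accepts the same asset/poke_interval/timeout parameters as the load-status sensors (not confirmed by the snippet):

    # hedged sketch -- parameters assumed analogous to the sensors above:
    await_ancestors_updated = AwaitAssetAncestorsUpdatedSensor(
        asset=enrollment_summary,
        poke_interval=5,
        timeout=120,
    )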
def test_pandas_data_asset_exceptions(
    fake_airflow_context: Dict, test_parquet_asset: PandasDataAsset
) -> None:
    # the asset name is read-only after construction:
    with pytest.raises(Exception):
        test_parquet_asset.name = "fail"

    test_parquet_asset.rebuild_for_store(fake_airflow_context)
@pytest.fixture
def test_xlsx_in_asset() -> PandasDataAsset:
    # assumed: a pytest fixture -- it is injected into test_read_write_xlsx above
    return PandasDataAsset("test_xlsx_in_asset")
@pytest.fixture
def test_csv_asset() -> PandasDataAsset:
    # assumed: a pytest fixture, like the others above
    return PandasDataAsset("test_csv_out_asset_pandas")
from airtunnel import PandasDataAsset, PandasDataAssetIO


def rebuild_for_store(asset: PandasDataAsset, airflow_context):
    # a pass-through rebuild: read the picked-up files and write them as-is
    enrollment_data = PandasDataAssetIO.read_data_asset(
        asset=asset, source_files=asset.pickedup_files(airflow_context)
    )
    PandasDataAssetIO.write_data_asset(asset=asset, data=enrollment_data)