from dataflows import Flow, aggregate


def process_data(rows):
    """Group incoming rows by customer id and sum their revenue.

    Yields the aggregated rows produced by ``aggregate``.

    NOTE(review): this usage (``aggregate(rows, fields)``, ``Flow`` as a
    context manager with ``transform``/``process`` methods) does not match
    the published dataflows API, where flows are typically built as
    ``Flow(source, step, ...).process()`` — confirm against the library
    version actually in use.
    """
    # Aggregation spec: group on customer_id, reduce revenue with sum.
    spec = {
        'customer_id': ['customer_id'],
        'total_revenue': ('revenue', sum),
    }
    yield from aggregate(rows, spec)


# Wire the transform into a named flow and run it.
with Flow("customer-revenue") as flow:
    flow.transform(process_data)
    flow.process()
from dataflows import Flow, validate, update_schema


def process_data(rows):
    """Validate customer rows and apply schema updates.

    Yields the rows produced by ``validate`` after applying per-field
    schema updates (missing-value markers, types, defaults) and a
    minimum-length check on ``name``.

    NOTE(review): ``validate(rows, schema_updates=..., validation=...)``
    and the ``Flow`` context-manager usage below do not match the
    published dataflows API — confirm against the library version in use.
    """
    # Per-field schema adjustments: missing-value markers and defaults.
    updates = {
        'name': {'missingValues': ['']},
        'age': {'type': 'integer', 'default': 0},
        'email': {'missingValues': [''], 'default': '[email protected]'},
    }
    # Validation rule: name must be a string of at least 3 characters.
    rules = {'name': {'type': 'string', 'minLength': 3}}
    yield from validate(rows, schema_updates=updates, validation=rules)


# Wire the cleaning transform into a named flow and run it.
with Flow("customer-cleaning") as flow:
    flow.transform(process_data)
    flow.process()

# These examples illustrate the usefulness of Python Dataflows for
# processing data in a structured and scalable way. By using a declarative
# syntax, you can easily define complex data processing flows and maintain
# them over time as your data changes.