The first example loads a CSV file, cleans two of its columns in place, renames the resource, and writes the result out as a data package:

```python
from dataflows import Flow, load, dump_to_path, update_resource

def clean_data(row):
    # Row processors mutate each row in place.
    row['age'] = int(row['age'])
    row['city'] = row['city'].strip().title()

def data_processing():
    return Flow(
        load('data.csv'),                              # resource named 'data'
        update_resource('data', name='cleaned_data'),  # rename the resource
        clean_data,
        dump_to_path('cleaned_data'),
    )

if __name__ == '__main__':
    data_processing().process()
```
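During development it can be handy to run the flow in memory and eyeball a few rows before dumping anything. Here is a minimal sketch, assuming the same hypothetical `data.csv` with `age` and `city` columns; `Flow(...).results()` returns the processed rows along with the generated datapackage and processing stats:

```python
from dataflows import Flow, load

def clean_data(row):
    row['age'] = int(row['age'])
    row['city'] = row['city'].strip().title()

# Run the cleaning step without writing any output; results holds
# one list of rows per resource in the flow.
results, datapackage, stats = Flow(
    load('data.csv'),
    clean_data,
).results()

print(results[0][:3])  # inspect the first few cleaned rows
```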
The second example joins two CSV files and aggregates the result. In dataflows, both resources live in the same flow, and `join` takes the source and target resource names plus their key fields. Here `products.csv` is assumed to have `id` and `name` columns, and `sales.csv` to have `product_id`, `sales`, and `revenue`:

```python
from dataflows import Flow, load, join, update_resource, dump_to_path

def aggregate_data(rows):
    # Rows processors are generator functions: consume the joined rows,
    # group them by product, and emit one row of totals per product.
    data = {}
    for row in rows:
        key = row['product']
        if key not in data:
            data[key] = {'product': key, 'total_sales': 0, 'total_revenue': 0}
        data[key]['total_sales'] += row['sales']
        data[key]['total_revenue'] += row['revenue']
    yield from data.values()

def data_processing():
    return Flow(
        load('products.csv'),  # resource named 'products'
        load('sales.csv'),     # resource named 'sales'
        # Merge on products.id == sales.product_id; the 'products' resource
        # is consumed, and each sales row gains a 'product' field holding
        # the matching product's name.
        join(
            'products', ['id'],
            'sales', ['product_id'],
            fields={'product': {'name': 'name'}},
        ),
        aggregate_data,
        # The aggregation changed the row shape, so declare the new schema
        # (adjust the types to your data) before dumping.
        update_resource('sales', name='aggregated_data', schema={
            'fields': [
                {'name': 'product', 'type': 'string'},
                {'name': 'total_sales', 'type': 'integer'},
                {'name': 'total_revenue', 'type': 'number'},
            ],
        }),
        dump_to_path('aggregated_data'),
    )

if __name__ == '__main__':
    data_processing().process()
```

In this example, the two CSV files are loaded as separate resources in a single flow. The `join` step merges them on a common key and pulls the product name into each sales row; `aggregate_data` then groups the rows by product, totals sales and revenue, and the aggregated data is dumped to an output folder. For simple sums like this, the built-in `join_with_self` processor is a declarative alternative to the hand-rolled aggregation.

Package library: `dataflows` (the related `datapackage` and `datapackage-pipelines` projects come from the same Frictionless Data toolchain)
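To confirm the join wiring before pointing it at real data, a quick throwaway run helps. The sketch below is hypothetical: it writes two tiny fixture files matching the column names assumed above and uses `Flow(...).results()` to read the joined rows back in memory instead of dumping to disk:

```python
import csv
from dataflows import Flow, load, join

# Hypothetical fixture data matching the columns assumed above.
with open('products.csv', 'w', newline='') as f:
    csv.writer(f).writerows(
        [('id', 'name'), (1, 'Widget'), (2, 'Gadget')])
with open('sales.csv', 'w', newline='') as f:
    csv.writer(f).writerows(
        [('product_id', 'sales', 'revenue'), (1, 3, 30.0), (2, 5, 75.0)])

# Run just the load/join steps; .results() returns the processed rows,
# the generated datapackage, and processing stats.
rows, datapackage, stats = Flow(
    load('products.csv'),
    load('sales.csv'),
    join('products', ['id'], 'sales', ['product_id'],
         fields={'product': {'name': 'name'}}),
).results()

print(rows[0])  # one joined row per sale, each carrying its product name
```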