def create_view_task(ds, **kwargs):
    """Create or replace the ``amended_tokens`` view in BigQuery.

    Intended as an Airflow PythonOperator callable (``ds`` and ``kwargs``
    come from the task context). Reads the view SQL from the enrich-stage
    resources and delegates creation to ``create_view``, attaching a
    description that points at the token-amendments CSV.
    """
    view_name = 'amended_tokens'
    bq = bigquery.Client()
    target_ref = bq.dataset(
        dataset_name,
        project=destination_dataset_project_id).table(view_name)

    query_path = os.path.join(
        dags_folder, 'resources/stages/enrich/sqls/amended_tokens.sql')
    query = read_file(query_path)
    print('amended_tokens view: \n' + query)

    # Adjacent literals concatenate: the runtime description string is
    # identical to the original single literal.
    note = (
        'Tokens amended with data from '
        'https://github.com/blockchain-etl/ethereum-etl-airflow/blob/master/dags/resources/stages/seed/data/token_amendments.csv'
    )
    create_view(bq, query, target_ref, description=note)
def read_bigquery_schema_from_file(filepath):
    """Load a BigQuery table schema from a JSON definition file.

    Args:
        filepath: Path to a JSON file containing the schema definition.

    Returns:
        Whatever ``read_bigquery_schema_from_json_recursive`` produces for
        the parsed JSON content (a list of schema fields, built recursively).

    Raises:
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Fix: the original initialized an unused local (`result = []`) that was
    # never read — dead code removed; parsing is delegated directly.
    file_content = read_file(filepath)
    json_content = json.loads(file_content)
    return read_bigquery_schema_from_json_recursive(json_content)
def build_parse_dag(dag_id,
                    dataset_folder,
                    parse_destination_dataset_project_id,
                    notification_emails=None,
                    parse_start_date=datetime(2018, 7, 1),
                    schedule_interval='0 0 * * *',
                    parse_all_partitions=None,
                    send_success_email=False):
    """Build an Airflow DAG that parses Ethereum contract event/call tables.

    For every ``*.json`` table definition in ``dataset_folder`` a parse task
    is created; tasks are wired after an ExternalTaskSensor on the partition
    DAG, joined at a checkpoint, and followed by dataset-sharing and
    view-creation tasks built from ``*.sql`` files in the same folder.

    Args:
        dag_id: Base DAG id; '_FULL' is appended when ``parse_all_partitions``
            is truthy.
        dataset_folder: Folder holding the JSON table definitions and SQL
            view files; its basename becomes the dataset-name suffix.
        parse_destination_dataset_project_id: GCP project that receives the
            parsed tables/views.
        notification_emails: Comma-separated address list for failure (and
            optionally success) notifications.
        parse_start_date: DAG start_date.
        schedule_interval: Cron schedule for the DAG.
        parse_all_partitions: When truthy, a full re-parse DAG is built
            (reflected in the '_FULL' dag_id suffix and passed to ``parse``).
        send_success_email: When True (and emails are set), append an
            EmailOperator after all final tasks.

    Returns:
        The constructed ``models.DAG`` (possibly containing only a
        validation-error task when definition files are invalid).
    """
    logging.info('parse_all_partitions is {}'.format(parse_all_partitions))

    if parse_all_partitions:
        # Mark full-reparse DAGs with a distinct id.
        dag_id = dag_id + '_FULL'

    # Select source project/dataset and the upstream partition DAG based on
    # whether this is the Kovan testnet variant.
    if 'ethereum_kovan_parse' in dag_id:
        SOURCE_PROJECT_ID = 'public-data-finance'
        SOURCE_DATASET_NAME = 'crypto_ethereum_kovan'
        PARTITION_DAG_ID = 'ethereum_kovan_partition_dag'
    else:
        SOURCE_PROJECT_ID = 'bigquery-public-data'
        SOURCE_DATASET_NAME = 'crypto_ethereum'
        PARTITION_DAG_ID = 'ethereum_partition_dag'

    default_dag_args = {
        'depends_on_past': False,
        'start_date': parse_start_date,
        'email_on_failure': True,
        'email_on_retry': False,
        'retries': 5,
        'retry_delay': timedelta(minutes=5)
    }

    if notification_emails and len(notification_emails) > 0:
        default_dag_args['email'] = [
            email.strip() for email in notification_emails.split(',')
        ]

    dag = models.DAG(dag_id,
                     catchup=False,
                     schedule_interval=schedule_interval,
                     default_args=default_dag_args)

    validation_error = None
    try:
        validate_definition_files(dataset_folder)
    except ValueError as e:
        validation_error = e

    # This prevents failing all dags as they are constructed in a loop in ethereum_parse_dag.py
    if validation_error is not None:
        # Surface the validation failure as a single failing task instead of
        # raising at DAG-construction time.
        def raise_validation_error(ds, **kwargs):
            raise validation_error

        # NOTE(review): assigned but never referenced afterwards; the
        # operator registers itself on `dag` via the dag= kwarg.
        validation_error_operator = PythonOperator(
            task_id='validation_error',
            python_callable=raise_validation_error,
            provide_context=True,
            execution_timeout=timedelta(minutes=10),
            dag=dag)
        return dag

    def create_parse_task(table_definition):
        # Build one PythonOperator that parses a single table, plus the list
        # of ref() dependencies extracted from its contract_address field.
        def parse_task(ds, **kwargs):
            client = bigquery.Client()
            parse(bigquery_client=client,
                  table_definition=table_definition,
                  ds=ds,
                  source_project_id=SOURCE_PROJECT_ID,
                  source_dataset_name=SOURCE_DATASET_NAME,
                  destination_project_id=parse_destination_dataset_project_id,
                  sqls_folder=os.path.join(dags_folder,
                                           'resources/stages/parse/sqls'),
                  parse_all_partitions=parse_all_partitions)

        table_name = table_definition['table']['table_name']
        parsing_operator = PythonOperator(
            task_id=table_name,
            python_callable=parse_task,
            provide_context=True,
            execution_timeout=timedelta(minutes=60),
            dag=dag)
        contract_address = table_definition['parser']['contract_address']
        if contract_address is not None:
            # contract_address may contain ref() expressions naming other
            # parse tables this one depends on.
            ref_dependencies = ref_regex.findall(
                table_definition['parser']['contract_address'])
        else:
            ref_dependencies = []
        return parsing_operator, ref_dependencies

    def create_add_view_task(dataset_name, view_name, sql):
        # Build a PythonOperator that (re)creates one view from raw SQL.
        def create_view_task(ds, **kwargs):
            client = bigquery.Client()
            dest_table_name = view_name
            dest_table_ref = create_dataset(
                client, dataset_name,
                parse_destination_dataset_project_id).table(dest_table_name)
            print('View sql: \n' + sql)
            create_view(client, sql, dest_table_ref)

        create_view_operator = PythonOperator(
            task_id=f'create_view_{view_name}',
            python_callable=create_view_task,
            provide_context=True,
            execution_timeout=timedelta(minutes=10),
            dag=dag)
        return create_view_operator

    def create_share_dataset_task(dataset_name):
        # Grant allUsers read on the destination dataset — only performed in
        # the 'blockchain-etl' project; otherwise the task is a no-op.
        def share_dataset_task(**kwargs):
            if parse_destination_dataset_project_id != 'blockchain-etl':
                logging.info('Skipping sharing dataset.')
            else:
                client = bigquery.Client()
                share_dataset_all_users_read(
                    client,
                    f'{parse_destination_dataset_project_id}.{dataset_name}')
                share_dataset_all_users_read(
                    client,
                    f'{parse_destination_dataset_project_id}-internal.{dataset_name}'
                )

        share_dataset_operator = PythonOperator(
            task_id='share_dataset',
            python_callable=share_dataset_task,
            provide_context=True,
            execution_timeout=timedelta(minutes=10),
            dag=dag)
        return share_dataset_operator

    # Gate all parse tasks on the 'done' task of the partition DAG.
    # NOTE(review): the variable name says "load_dag" but the sensor targets
    # the partition DAG — presumably historical naming; confirm.
    # execution_delta=30min implies the partition DAG is scheduled 30 minutes
    # earlier than this one — TODO confirm against its schedule.
    wait_for_ethereum_load_dag_task = ExternalTaskSensor(
        task_id='wait_for_ethereum_partition_dag',
        external_dag_id=PARTITION_DAG_ID,
        external_task_id='done',
        execution_delta=timedelta(minutes=30),
        priority_weight=0,
        mode='reschedule',
        poke_interval=5 * 60,
        timeout=60 * 60 * 12,
        dag=dag)

    json_files = get_list_of_files(dataset_folder, '*.json')
    logging.info(json_files)

    all_parse_tasks = {}
    task_dependencies = {}
    for json_file in json_files:
        table_definition = read_json_file(json_file)
        task, dependencies = create_parse_task(table_definition)
        wait_for_ethereum_load_dag_task >> task
        all_parse_tasks[task.task_id] = task
        task_dependencies[task.task_id] = dependencies

    # Join point after all parse tasks; high priority_weight presumably so
    # downstream work is scheduled promptly — confirm intent.
    checkpoint_task = BashOperator(task_id='parse_all_checkpoint',
                                   bash_command='echo parse_all_checkpoint',
                                   priority_weight=1000,
                                   dag=dag)

    # Wire inter-table ref() dependencies, then funnel every parse task into
    # the checkpoint.
    for task, dependencies in task_dependencies.items():
        for dependency in dependencies:
            if dependency not in all_parse_tasks:
                # NOTE(review): "the the" typo in this runtime message —
                # left untouched here (documentation-only change).
                raise ValueError(
                    'Table {} is not found in the the dataset. Check your ref() in contract_address field.'
                    .format(dependency))
            all_parse_tasks[dependency] >> all_parse_tasks[task]
        all_parse_tasks[task] >> checkpoint_task

    final_tasks = [checkpoint_task]

    # Destination dataset name is derived from the folder name.
    dataset_name = os.path.basename(dataset_folder)
    full_dataset_name = 'ethereum_' + dataset_name

    share_dataset_task = create_share_dataset_task(full_dataset_name)
    checkpoint_task >> share_dataset_task
    final_tasks.append(share_dataset_task)

    # Create views
    sql_files = get_list_of_files(dataset_folder, '*.sql')
    logging.info(sql_files)
    for sql_file in sql_files:
        sql = read_file(sql_file)
        base_name = os.path.basename(sql_file)
        view_name = os.path.splitext(base_name)[0]
        create_view_task = create_add_view_task(full_dataset_name, view_name,
                                                sql)
        checkpoint_task >> create_view_task
        final_tasks.append(create_view_task)

    # Optional success notification after every final task completes.
    if notification_emails and len(
            notification_emails) > 0 and send_success_email:
        send_email_task = EmailOperator(
            task_id='send_email',
            to=[email.strip() for email in notification_emails.split(',')],
            subject='Ethereum ETL Airflow Parse DAG Succeeded',
            html_content='Ethereum ETL Airflow Parse DAG Succeeded for {}'.
            format(dag_id),
            dag=dag)
        for final_task in final_tasks:
            final_task >> send_email_task
    return dag