def validate_definition_files(dataset_folder):
    """Validate every table definition JSON file in dataset_folder: each file
    name must match its table_name, each dataset_name must match the folder
    name, and table names must be unique case-insensitively."""
    json_files = get_list_of_files(dataset_folder, '*.json')
    dataset_folder_name = dataset_folder.split('/')[-1]

    all_lowercase_table_names = []
    for json_file in json_files:
        file_name = json_file.split('/')[-1].replace('.json', '')
        table_definition = read_json_file(json_file)

        table = table_definition.get('table')
        if not table:
            raise ValueError(f'table is empty in file {json_file}')

        dataset_name = table.get('dataset_name')
        if not dataset_name:
            raise ValueError(f'dataset_name is empty in file {json_file}')
        if dataset_folder_name != dataset_name:
            raise ValueError(
                f'dataset_name {dataset_name} is not equal to dataset_folder_name {dataset_folder_name}')

        table_name = table.get('table_name')
        if not table_name:
            raise ValueError(f'table_name is empty in file {json_file}')
        if file_name != table_name:
            raise ValueError(f'file_name {file_name} does not match the table_name {table_name}')

        all_lowercase_table_names.append(table_name.lower())

    # Table names become Airflow task ids and BigQuery table names, so they
    # must be unique even when compared case-insensitively.
    table_name_counts = collections.defaultdict(int)
    for table_name in all_lowercase_table_names:
        table_name_counts[table_name] += 1
    non_unique_table_names = [name for name, count in table_name_counts.items() if count > 1]
    if len(non_unique_table_names) > 0:
        raise ValueError(f'The following table names are not unique: {",".join(non_unique_table_names)}')
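# A minimal pytest sketch (hypothetical, not from the repo) of the contract
# validate_definition_files enforces: the JSON file name must match the
# table_name field, and dataset_name must match the folder name.
import json
import pytest

def test_rejects_file_name_table_name_mismatch(tmp_path):
    dataset_folder = tmp_path / 'my_dataset'
    dataset_folder.mkdir()
    # File is named tokens.json but declares table_name 'Transfers'.
    definition = {'table': {'dataset_name': 'my_dataset', 'table_name': 'Transfers'}}
    (dataset_folder / 'tokens.json').write_text(json.dumps(definition))

    with pytest.raises(ValueError, match='does not match'):
        validate_definition_files(str(dataset_folder))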
def test_create_or_update_table_from_table_definition(table_definition_file):
    bigquery_client = MockBigqueryClient()
    table_definition = read_json_file(
        os.path.join(table_definitions_folder, table_definition_file))

    create_or_update_table_from_table_definition(
        bigquery_client=bigquery_client,
        table_definition=table_definition,
        ds='2020-01-01',
        source_project_id='bigquery-public-data',
        source_dataset_name='crypto_ethereum',
        destination_project_id='blockchain-etl',
        sqls_folder=sqls_folder,
        parse_all_partitions=True,
        airflow_task=create_dummy_airflow_task())

    assert len(bigquery_client.queries) == 1
    expected_filename = table_definition_file_to_expected_file(table_definition_file)
    assert trim(bigquery_client.queries[0]) == trim(read_resource(expected_filename))
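# Hypothetical sketch of the test double used above (the real
# MockBigqueryClient lives in the test suite). The only behaviour these
# assertions rely on is that submitted SQL ends up in `queries` instead of
# being sent to BigQuery; the real mock also stubs whatever dataset and table
# calls the code under test makes.
class MockBigqueryClient:
    def __init__(self):
        self.queries = []

    def query(self, query, job_config=None):
        # Record the SQL text rather than executing it.
        self.queries.append(query)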
def test_create_or_update_table_from_table_definition(table_definition_file, parse_all_partitions):
    bigquery_client = MockBigqueryClient()
    table_definition = read_json_file(
        os.path.join(table_definitions_folder, table_definition_file))

    parse(bigquery_client=bigquery_client,
          table_definition=table_definition,
          ds='2020-01-01',
          source_project_id='bigquery-public-data',
          source_dataset_name='crypto_ethereum',
          destination_project_id='blockchain-etl',
          sqls_folder=sqls_folder,
          parse_all_partitions=parse_all_partitions,
          # A fixed timestamp keeps the generated SQL deterministic across runs.
          time_func=lambda: 1587556654.993)

    # parse() may issue several queries; each is compared against its own
    # expected SQL resource, indexed by position.
    assert len(bigquery_client.queries) > 0
    for ind, query in enumerate(bigquery_client.queries):
        expected_filename = table_definition_file_to_expected_file(
            table_definition_file, parse_all_partitions, ind)
        assert trim(query) == trim(read_resource(expected_filename))
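# The fixtures feeding the tests above are not shown in this excerpt; in a
# typical pytest setup they come from a parametrize marker along these lines
# (the file name is illustrative, not a real definition file). Stored as a
# reusable decorator, it would be applied to the test as @parse_test_params:
import pytest

parse_test_params = pytest.mark.parametrize(
    'table_definition_file,parse_all_partitions',
    [
        ('my_dataset/MyContract_event_MyEvent.json', True),
        ('my_dataset/MyContract_event_MyEvent.json', False),
    ])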
def build_parse_dag(dag_id,
                    dataset_folder,
                    parse_destination_dataset_project_id,
                    notification_emails=None,
                    parse_start_date=datetime(2018, 7, 1),
                    schedule_interval='0 0 * * *',
                    parse_all_partitions=None,
                    send_success_email=False):
    """Build an Airflow DAG that parses the tables defined by the JSON files
    in dataset_folder, then creates views from the SQL files and shares the
    destination dataset."""
    logging.info('parse_all_partitions is {}'.format(parse_all_partitions))

    if parse_all_partitions:
        dag_id = dag_id + '_FULL'

    if 'ethereum_kovan_parse' in dag_id:
        SOURCE_PROJECT_ID = 'public-data-finance'
        SOURCE_DATASET_NAME = 'crypto_ethereum_kovan'
        PARTITION_DAG_ID = 'ethereum_kovan_partition_dag'
    else:
        SOURCE_PROJECT_ID = 'bigquery-public-data'
        SOURCE_DATASET_NAME = 'crypto_ethereum'
        PARTITION_DAG_ID = 'ethereum_partition_dag'

    default_dag_args = {
        'depends_on_past': False,
        'start_date': parse_start_date,
        'email_on_failure': True,
        'email_on_retry': False,
        'retries': 5,
        'retry_delay': timedelta(minutes=5)
    }

    if notification_emails and len(notification_emails) > 0:
        default_dag_args['email'] = [
            email.strip() for email in notification_emails.split(',')
        ]

    dag = models.DAG(dag_id,
                     catchup=False,
                     schedule_interval=schedule_interval,
                     default_args=default_dag_args)

    validation_error = None
    try:
        validate_definition_files(dataset_folder)
    except ValueError as e:
        validation_error = e

    # Surface the validation error through a failing task instead of raising
    # here. This prevents failing all DAGs, as they are constructed in a loop
    # in ethereum_parse_dag.py.
    if validation_error is not None:
        def raise_validation_error(ds, **kwargs):
            raise validation_error

        validation_error_operator = PythonOperator(
            task_id='validation_error',
            python_callable=raise_validation_error,
            provide_context=True,
            execution_timeout=timedelta(minutes=10),
            dag=dag)

        return dag

    def create_parse_task(table_definition):
        def parse_task(ds, **kwargs):
            client = bigquery.Client()
            parse(bigquery_client=client,
                  table_definition=table_definition,
                  ds=ds,
                  source_project_id=SOURCE_PROJECT_ID,
                  source_dataset_name=SOURCE_DATASET_NAME,
                  destination_project_id=parse_destination_dataset_project_id,
                  sqls_folder=os.path.join(dags_folder, 'resources/stages/parse/sqls'),
                  parse_all_partitions=parse_all_partitions)

        table_name = table_definition['table']['table_name']
        parsing_operator = PythonOperator(
            task_id=table_name,
            python_callable=parse_task,
            provide_context=True,
            execution_timeout=timedelta(minutes=60),
            dag=dag)

        contract_address = table_definition['parser']['contract_address']
        if contract_address is not None:
            ref_dependencies = ref_regex.findall(contract_address)
        else:
            ref_dependencies = []

        return parsing_operator, ref_dependencies

    def create_add_view_task(dataset_name, view_name, sql):
        def create_view_task(ds, **kwargs):
            client = bigquery.Client()
            dest_table_name = view_name
            dest_table_ref = create_dataset(
                client, dataset_name, parse_destination_dataset_project_id).table(dest_table_name)
            print('View sql: \n' + sql)
            create_view(client, sql, dest_table_ref)

        create_view_operator = PythonOperator(
            task_id=f'create_view_{view_name}',
            python_callable=create_view_task,
            provide_context=True,
            execution_timeout=timedelta(minutes=10),
            dag=dag)

        return create_view_operator

    def create_share_dataset_task(dataset_name):
        def share_dataset_task(**kwargs):
            if parse_destination_dataset_project_id != 'blockchain-etl':
                logging.info('Skipping sharing dataset.')
            else:
                client = bigquery.Client()
                share_dataset_all_users_read(
                    client, f'{parse_destination_dataset_project_id}.{dataset_name}')
                share_dataset_all_users_read(
                    client, f'{parse_destination_dataset_project_id}-internal.{dataset_name}')

        share_dataset_operator = PythonOperator(
            task_id='share_dataset',
            python_callable=share_dataset_task,
            provide_context=True,
            execution_timeout=timedelta(minutes=10),
            dag=dag)

        return share_dataset_operator

    wait_for_ethereum_load_dag_task = ExternalTaskSensor(
        task_id='wait_for_ethereum_partition_dag',
        external_dag_id=PARTITION_DAG_ID,
        external_task_id='done',
        execution_delta=timedelta(minutes=30),
        priority_weight=0,
        mode='reschedule',
        poke_interval=5 * 60,
        timeout=60 * 60 * 12,
        dag=dag)

    json_files = get_list_of_files(dataset_folder, '*.json')
    logging.info(json_files)

    all_parse_tasks = {}
    task_dependencies = {}
    for json_file in json_files:
        table_definition = read_json_file(json_file)
        task, dependencies = create_parse_task(table_definition)
        wait_for_ethereum_load_dag_task >> task
        all_parse_tasks[task.task_id] = task
        task_dependencies[task.task_id] = dependencies

    checkpoint_task = BashOperator(task_id='parse_all_checkpoint',
                                   bash_command='echo parse_all_checkpoint',
                                   priority_weight=1000,
                                   dag=dag)

    # Wire up ref() dependencies between parse tasks; every parse task also
    # feeds the checkpoint task.
    for task, dependencies in task_dependencies.items():
        for dependency in dependencies:
            if dependency not in all_parse_tasks:
                raise ValueError(
                    'Table {} is not found in the dataset. Check your ref() in contract_address field.'
                    .format(dependency))
            all_parse_tasks[dependency] >> all_parse_tasks[task]
        all_parse_tasks[task] >> checkpoint_task

    final_tasks = [checkpoint_task]

    dataset_name = os.path.basename(dataset_folder)
    full_dataset_name = 'ethereum_' + dataset_name
    share_dataset_task = create_share_dataset_task(full_dataset_name)
    checkpoint_task >> share_dataset_task
    final_tasks.append(share_dataset_task)

    # Create views
    sql_files = get_list_of_files(dataset_folder, '*.sql')
    logging.info(sql_files)

    for sql_file in sql_files:
        sql = read_file(sql_file)
        base_name = os.path.basename(sql_file)
        view_name = os.path.splitext(base_name)[0]
        create_view_task = create_add_view_task(full_dataset_name, view_name, sql)
        checkpoint_task >> create_view_task
        final_tasks.append(create_view_task)

    if notification_emails and len(notification_emails) > 0 and send_success_email:
        send_email_task = EmailOperator(
            task_id='send_email',
            to=[email.strip() for email in notification_emails.split(',')],
            subject='Ethereum ETL Airflow Parse DAG Succeeded',
            html_content='Ethereum ETL Airflow Parse DAG Succeeded for {}'.format(dag_id),
            dag=dag)
        for final_task in final_tasks:
            final_task >> send_email_task

    return dag
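# Sketch of how this factory is consumed (assumed shape; the comment above
# about ethereum_parse_dag.py says the DAGs are constructed in a loop). The
# table_definitions path and destination project id are illustrative:
import os
from glob import glob

for folder in sorted(glob(os.path.join(dags_folder, 'resources/stages/parse/table_definitions/*'))):
    dataset = os.path.basename(folder)
    parse_dag = build_parse_dag(
        dag_id='ethereum_{}_parse_dag'.format(dataset),
        dataset_folder=folder,
        parse_destination_dataset_project_id='blockchain-etl')
    # Airflow discovers dynamically built DAGs through module globals.
    globals()[parse_dag.dag_id] = parse_dag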
# Earlier version of build_parse_dag: no validation step, no checkpoint,
# views, or dataset sharing, and the sensor waits on ethereum_load_dag rather
# than the partition DAG.
def build_parse_dag(dag_id,
                    dataset_folder,
                    parse_destination_dataset_project_id,
                    notification_emails=None,
                    parse_start_date=datetime(2018, 7, 1),
                    schedule_interval='0 0 * * *',
                    parse_all_partitions=True,
                    send_success_email=False):
    logging.info('parse_all_partitions is {}'.format(parse_all_partitions))

    if parse_all_partitions:
        dag_id = dag_id + '_FULL'

    SOURCE_PROJECT_ID = 'bigquery-public-data'
    SOURCE_DATASET_NAME = 'crypto_ethereum'

    default_dag_args = {
        'depends_on_past': False,
        'start_date': parse_start_date,
        'email_on_failure': True,
        'email_on_retry': False,
        'retries': 5,
        'retry_delay': timedelta(minutes=5)
    }

    if notification_emails and len(notification_emails) > 0:
        default_dag_args['email'] = [
            email.strip() for email in notification_emails.split(',')
        ]

    dag = models.DAG(dag_id,
                     catchup=False,
                     schedule_interval=schedule_interval,
                     default_args=default_dag_args)

    def create_task_and_add_to_dag(table_definition):
        def parse_task(ds, **kwargs):
            client = bigquery.Client()
            create_or_update_table_from_table_definition(
                bigquery_client=client,
                table_definition=table_definition,
                ds=ds,
                source_project_id=SOURCE_PROJECT_ID,
                source_dataset_name=SOURCE_DATASET_NAME,
                destination_project_id=parse_destination_dataset_project_id,
                sqls_folder=os.path.join(dags_folder, 'resources/stages/parse/sqls'),
                parse_all_partitions=parse_all_partitions,
                airflow_task=kwargs['task'])

        table_name = table_definition['table']['table_name']
        parsing_operator = PythonOperator(
            task_id=table_name,
            python_callable=parse_task,
            provide_context=True,
            execution_timeout=timedelta(minutes=60),
            dag=dag)

        ref_dependencies = ref_regex.findall(table_definition['parser']['contract_address'])
        return parsing_operator, ref_dependencies

    wait_for_ethereum_load_dag_task = ExternalTaskSensor(
        task_id='wait_for_ethereum_load_dag',
        external_dag_id='ethereum_load_dag',
        external_task_id='verify_logs_have_latest',
        execution_delta=timedelta(hours=1),
        priority_weight=0,
        mode='reschedule',
        poke_interval=5 * 60,
        timeout=60 * 60 * 12,
        dag=dag)

    files = get_list_of_json_files(dataset_folder)
    logging.info('files')
    logging.info(files)

    all_parse_tasks = {}
    task_dependencies = {}
    for f in files:
        table_definition = read_json_file(f)
        task, dependencies = create_task_and_add_to_dag(table_definition)
        wait_for_ethereum_load_dag_task >> task
        all_parse_tasks[task.task_id] = task
        task_dependencies[task.task_id] = dependencies

    for task, dependencies in task_dependencies.items():
        for dependency in dependencies:
            if dependency not in all_parse_tasks:
                raise ValueError(
                    'Table {} is not found in the dataset. Check your ref() in contract_address field.'
                    .format(dependency))
            all_parse_tasks[dependency] >> all_parse_tasks[task]

    if notification_emails and len(notification_emails) > 0 and send_success_email:
        send_email_task = EmailOperator(
            task_id='send_email',
            to=[email.strip() for email in notification_emails.split(',')],
            subject='Ethereum ETL Airflow Parse DAG Succeeded',
            html_content='Ethereum ETL Airflow Parse DAG Succeeded for {}'.format(dag_id),
            dag=dag)
        for task in all_parse_tasks.values():
            task >> send_email_task

    return dag
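# Both versions of build_parse_dag resolve inter-table dependencies with
# ref_regex, which is defined elsewhere in the module. A pattern of roughly
# this shape (an assumption, not copied from the repo) matches ref('...')
# references embedded in the contract_address field:
import re

ref_regex = re.compile(r"ref\(\s*'([^']+)'\s*\)")

# e.g. a contract_address of "ref('Factory_event_NewExchange')" yields the
# task id of the table this one depends on.
assert ref_regex.findall("ref('Factory_event_NewExchange')") == ['Factory_event_NewExchange']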