Code example #1
import os

from google.cloud import bigquery

# dataset_name, destination_dataset_project_id, dags_folder, read_file and
# create_view are module-level names defined elsewhere in the DAG builder.
def create_view_task(ds, **kwargs):
    client = bigquery.Client()

    # Reference to the destination view in the enrich dataset.
    dest_table_name = 'amended_tokens'
    dest_table_ref = client.dataset(
        dataset_name, project=destination_dataset_project_id).table(dest_table_name)

    # Read the view definition SQL shipped alongside the DAG.
    sql_path = os.path.join(dags_folder, 'resources/stages/enrich/sqls/amended_tokens.sql')
    sql = read_file(sql_path)
    print('amended_tokens view: \n' + sql)

    description = 'Tokens amended with data from https://github.com/blockchain-etl/ethereum-etl-airflow/blob/master/dags/resources/stages/seed/data/token_amendments.csv'
    create_view(client, sql, dest_table_ref, description=description)
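This callable is meant to be wrapped in an Airflow PythonOperator by the surrounding DAG factory, in the same pattern as code example #3 below. A minimal wiring sketch, assuming an Airflow 1.x-style import, an existing `dag` object, and a task_id chosen here purely for illustration:

# Sketch only: `dag`, the task_id and the timeout are assumptions, not the repo's exact values.
from datetime import timedelta
from airflow.operators.python_operator import PythonOperator

create_amended_tokens_view_operator = PythonOperator(
    task_id='create_amended_tokens_view',
    python_callable=create_view_task,
    provide_context=True,
    execution_timeout=timedelta(minutes=10),
    dag=dag)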
Code example #2
import json

# read_file and read_bigquery_schema_from_json_recursive are helpers defined
# elsewhere in the repo; the latter builds the BigQuery schema fields from the
# parsed JSON definition.
def read_bigquery_schema_from_file(filepath):
    file_content = read_file(filepath)
    json_content = json.loads(file_content)
    return read_bigquery_schema_from_json_recursive(json_content)
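A quick usage sketch, assuming a schema file in the standard BigQuery JSON layout (name/type/mode per field) and that the recursive helper returns a list of schema field objects; the path below is hypothetical:

# Sketch only: the schema file path is an assumption, not a confirmed repo path.
schema = read_bigquery_schema_from_file(
    os.path.join(dags_folder, 'resources/stages/raw/schemas/blocks.json'))
for field in schema:
    print(field.name, field.field_type)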
Code example #3
def build_parse_dag(dag_id,
                    dataset_folder,
                    parse_destination_dataset_project_id,
                    notification_emails=None,
                    parse_start_date=datetime(2018, 7, 1),
                    schedule_interval='0 0 * * *',
                    parse_all_partitions=None,
                    send_success_email=False):

    logging.info('parse_all_partitions is {}'.format(parse_all_partitions))

    if parse_all_partitions:
        dag_id = dag_id + '_FULL'

    if 'ethereum_kovan_parse' in dag_id:
        SOURCE_PROJECT_ID = 'public-data-finance'
        SOURCE_DATASET_NAME = 'crypto_ethereum_kovan'

        PARTITION_DAG_ID = 'ethereum_kovan_partition_dag'
    else:
        SOURCE_PROJECT_ID = 'bigquery-public-data'
        SOURCE_DATASET_NAME = 'crypto_ethereum'

        PARTITION_DAG_ID = 'ethereum_partition_dag'

    default_dag_args = {
        'depends_on_past': False,
        'start_date': parse_start_date,
        'email_on_failure': True,
        'email_on_retry': False,
        'retries': 5,
        'retry_delay': timedelta(minutes=5)
    }

    if notification_emails and len(notification_emails) > 0:
        default_dag_args['email'] = [
            email.strip() for email in notification_emails.split(',')
        ]

    dag = models.DAG(dag_id,
                     catchup=False,
                     schedule_interval=schedule_interval,
                     default_args=default_dag_args)

    validation_error = None
    try:
        validate_definition_files(dataset_folder)
    except ValueError as e:
        validation_error = e

    # This prevents all DAGs from failing, since they are constructed in a loop in ethereum_parse_dag.py
    if validation_error is not None:

        def raise_validation_error(ds, **kwargs):
            raise validation_error

        validation_error_operator = PythonOperator(
            task_id='validation_error',
            python_callable=raise_validation_error,
            provide_context=True,
            execution_timeout=timedelta(minutes=10),
            dag=dag)

        return dag

    def create_parse_task(table_definition):
        def parse_task(ds, **kwargs):
            client = bigquery.Client()

            parse(bigquery_client=client,
                  table_definition=table_definition,
                  ds=ds,
                  source_project_id=SOURCE_PROJECT_ID,
                  source_dataset_name=SOURCE_DATASET_NAME,
                  destination_project_id=parse_destination_dataset_project_id,
                  sqls_folder=os.path.join(dags_folder,
                                           'resources/stages/parse/sqls'),
                  parse_all_partitions=parse_all_partitions)

        table_name = table_definition['table']['table_name']
        parsing_operator = PythonOperator(
            task_id=table_name,
            python_callable=parse_task,
            provide_context=True,
            execution_timeout=timedelta(minutes=60),
            dag=dag)

        contract_address = table_definition['parser']['contract_address']
        if contract_address is not None:
            ref_dependencies = ref_regex.findall(
                table_definition['parser']['contract_address'])
        else:
            ref_dependencies = []
        return parsing_operator, ref_dependencies

    def create_add_view_task(dataset_name, view_name, sql):
        def create_view_task(ds, **kwargs):
            client = bigquery.Client()

            dest_table_name = view_name
            dest_table_ref = create_dataset(
                client, dataset_name,
                parse_destination_dataset_project_id).table(dest_table_name)

            print('View sql: \n' + sql)

            create_view(client, sql, dest_table_ref)

        create_view_operator = PythonOperator(
            task_id=f'create_view_{view_name}',
            python_callable=create_view_task,
            provide_context=True,
            execution_timeout=timedelta(minutes=10),
            dag=dag)

        return create_view_operator

    def create_share_dataset_task(dataset_name):
        def share_dataset_task(**kwargs):
            if parse_destination_dataset_project_id != 'blockchain-etl':
                logging.info('Skipping sharing dataset.')
            else:
                client = bigquery.Client()
                share_dataset_all_users_read(
                    client,
                    f'{parse_destination_dataset_project_id}.{dataset_name}')
                share_dataset_all_users_read(
                    client,
                    f'{parse_destination_dataset_project_id}-internal.{dataset_name}'
                )

        share_dataset_operator = PythonOperator(
            task_id='share_dataset',
            python_callable=share_dataset_task,
            provide_context=True,
            execution_timeout=timedelta(minutes=10),
            dag=dag)

        return share_dataset_operator

    wait_for_ethereum_load_dag_task = ExternalTaskSensor(
        task_id='wait_for_ethereum_partition_dag',
        external_dag_id=PARTITION_DAG_ID,
        external_task_id='done',
        execution_delta=timedelta(minutes=30),
        priority_weight=0,
        mode='reschedule',
        poke_interval=5 * 60,
        timeout=60 * 60 * 12,
        dag=dag)

    json_files = get_list_of_files(dataset_folder, '*.json')
    logging.info(json_files)

    all_parse_tasks = {}
    task_dependencies = {}
    for json_file in json_files:
        table_definition = read_json_file(json_file)
        task, dependencies = create_parse_task(table_definition)
        wait_for_ethereum_load_dag_task >> task
        all_parse_tasks[task.task_id] = task
        task_dependencies[task.task_id] = dependencies

    checkpoint_task = BashOperator(task_id='parse_all_checkpoint',
                                   bash_command='echo parse_all_checkpoint',
                                   priority_weight=1000,
                                   dag=dag)

    for task, dependencies in task_dependencies.items():
        for dependency in dependencies:
            if dependency not in all_parse_tasks:
                raise ValueError(
                    'Table {} is not found in the dataset. Check the ref() in the contract_address field.'
                    .format(dependency))
            all_parse_tasks[dependency] >> all_parse_tasks[task]

        all_parse_tasks[task] >> checkpoint_task

    final_tasks = [checkpoint_task]

    dataset_name = os.path.basename(dataset_folder)
    full_dataset_name = 'ethereum_' + dataset_name

    share_dataset_task = create_share_dataset_task(full_dataset_name)
    checkpoint_task >> share_dataset_task
    final_tasks.append(share_dataset_task)

    # Create views

    sql_files = get_list_of_files(dataset_folder, '*.sql')
    logging.info(sql_files)

    for sql_file in sql_files:
        sql = read_file(sql_file)
        base_name = os.path.basename(sql_file)
        view_name = os.path.splitext(base_name)[0]
        create_view_task = create_add_view_task(full_dataset_name, view_name,
                                                sql)
        checkpoint_task >> create_view_task
        final_tasks.append(create_view_task)

    if notification_emails and len(
            notification_emails) > 0 and send_success_email:
        send_email_task = EmailOperator(
            task_id='send_email',
            to=[email.strip() for email in notification_emails.split(',')],
            subject='Ethereum ETL Airflow Parse DAG Succeeded',
            html_content='Ethereum ETL Airflow Parse DAG Succeeded for {}'.
            format(dag_id),
            dag=dag)
        for final_task in final_tasks:
            final_task >> send_email_task
    return dag
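The comment near the validation step indicates this factory is called in a loop in ethereum_parse_dag.py, with one DAG per dataset folder registered in the module's globals so Airflow can discover it. A minimal sketch of such a loop, with the folder layout, project id and naming convention assumed for illustration:

# Sketch only: the paths, the project id and the dag_id convention are assumptions.
from glob import glob

for folder in glob(os.path.join(dags_folder, 'resources/stages/parse/table_definitions/*')):
    dataset = os.path.basename(folder)
    dag_id = 'ethereum_parse_{}_dag'.format(dataset)
    globals()[dag_id] = build_parse_dag(
        dag_id=dag_id,
        dataset_folder=folder,
        parse_destination_dataset_project_id='my-gcp-project',
        notification_emails=None,
        parse_all_partitions=None)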