def bq_to_s3():
    s3 = boto3.resource('s3')
    logger.info("Download google bigquery and google client dependencies from S3")
    s3.Bucket(BUCKET_NAME).download_file(LIB_KEY, '/tmp/site-packages.zip')
    with zipfile.ZipFile('/tmp/site-packages.zip', 'r') as zip_ref:
        zip_ref.extractall('/tmp/python3.7/site-packages')
    sys.path.insert(1, "/tmp/python3.7/site-packages/site-packages/")

    # Import google bigquery and google client dependencies
    import pyarrow
    from google.cloud import bigquery
    from airflow.contrib.hooks.bigquery_hook import BigQueryHook

    bq_hook = BigQueryHook(bigquery_conn_id="bigquery_default", use_legacy_sql=False)
    bq_client = bigquery.Client(project=bq_hook._get_field("project"),
                                credentials=bq_hook._get_credentials())
    events_df = bq_client.query(BQ_SQL).result().to_dataframe(create_bqstorage_client=False)
    logger.info(f'google analytics events dataframe head - {events_df.head()}')
    wr.s3.to_csv(events_df, OUTPUT_PATH, index=False)
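# For context, a minimal sketch of the module-level setup bq_to_s3() assumes.
# All bucket, key, SQL, and path values below are placeholders, not taken from
# the source; only the names (BUCKET_NAME, LIB_KEY, BQ_SQL, OUTPUT_PATH, logger,
# wr) are grounded in the function body above.
import logging
import sys
import zipfile

import awswrangler as wr
import boto3

logger = logging.getLogger(__name__)

BUCKET_NAME = 'my-lambda-artifacts'   # placeholder: S3 bucket holding the zipped dependencies
LIB_KEY = 'libs/site-packages.zip'    # placeholder: key of the zipped google-cloud libraries
BQ_SQL = 'SELECT * FROM `my-project.my_dataset.ga_events` LIMIT 1000'  # placeholder query
OUTPUT_PATH = 's3://my-data-bucket/ga/events.csv'                      # placeholder output location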
def execute(self, context):
    hook = BigQueryHook(
        bigquery_conn_id=self.gcp_conn_id,
        use_legacy_sql=self.use_legacy_sql,
        location=self.location,
    )
    records = self.run_query(project=hook._get_field("project"),
                             credentials=hook._get_credentials())
    if not records:
        raise AirflowException("Query returned no results.")
    elif not all(bool(record) for record in records):
        raise AirflowException(
            f"Test failed\nQuery: {self.sql}\nRecords: {records}")
    self.log.info(f"Test passed\nQuery: {self.sql}\nRecords: {records}")
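# A minimal sketch of a custom check-operator class the execute() above could
# live on. The class name and the run_query() body are assumptions for
# illustration; only the attribute names (sql, gcp_conn_id, use_legacy_sql,
# location) and the run_query signature are grounded in the method above.
from airflow.exceptions import AirflowException
from airflow.models import BaseOperator
from airflow.contrib.hooks.bigquery_hook import BigQueryHook
from google.cloud import bigquery


class BigQueryDataCheckOperator(BaseOperator):  # hypothetical name
    def __init__(self, sql, gcp_conn_id='bigquery_default',
                 use_legacy_sql=False, location='US', **kwargs):
        super().__init__(**kwargs)
        self.sql = sql
        self.gcp_conn_id = gcp_conn_id
        self.use_legacy_sql = use_legacy_sql
        self.location = location

    def run_query(self, project, credentials):
        # Hypothetical helper: run self.sql and return the first result row
        # as a tuple of column values (None if the query returned no rows).
        client = bigquery.Client(project=project, credentials=credentials)
        rows = list(client.query(self.sql).result(max_results=1))
        return tuple(rows[0]) if rows else None

    # execute(self, context) as defined above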
def create_dag(pipeline):
    # Request configuration
    DAG_NAME = pipeline['dag']['dag_name']
    ARGS = pipeline['args']
    START_DATE = pipeline['start_date']
    SCHEDULE_INTERVAL = pipeline['dag']['schedule_interval']
    BQ_CONN_ID = pipeline['connection']['bq_conn_id']
    BQ_PROJECT_DESTINATION = pipeline['connection']['bq_project_destination']
    BQ_DATASET_DESTINATION = pipeline['connection']['bq_dataset_destination']
    BQ_TABLE_DESTINATION = pipeline['connection']['bq_table_destination']
    BUCKET_DESTINATION = pipeline['connection']['bucket_destination']
    FOLDER_IN_BUCKET = pipeline['connection']['folder_in_bucket']
    IS_USING_ML = pipeline['model']['is_using_ml']
    TARGET_FORECAST = pipeline['model']['target_forecast']
    CATEGORY = pipeline['model']['category']
    YLABEL = pipeline['viz']['ylabel']
    QUERY = pipeline['query']
    START_DATETIME_QUERY = pipeline['start_datetime_query']

    if IS_USING_ML:
        ATTACHED_FILE = ['forecast_result.pdf']
    else:
        ATTACHED_FILE = None

    # Connection hook and clients
    bq_hook = BigQueryHook(bigquery_conn_id=BQ_CONN_ID, use_legacy_sql=False)
    storage_client = storage.Client(project=bq_hook._get_field("project"),
                                    credentials=bq_hook._get_credentials())
    bq_client = bigquery.Client(project=bq_hook._get_field("project"),
                                credentials=bq_hook._get_credentials())

    dag = DAG(
        dag_id=DAG_NAME,
        default_args=ARGS,
        schedule_interval=timedelta(days=SCHEDULE_INTERVAL),
        start_date=datetime.strptime(START_DATE, '%Y-%m-%d'),
        dagrun_timeout=timedelta(minutes=60),
        max_active_runs=1,
        catchup=False,
    )

    def if_table_exists(**kwargs):
        # Check whether any shard of the destination table already exists
        bq_table = bq_client.dataset(BQ_DATASET_DESTINATION).table(BQ_TABLE_DESTINATION + '_' + '*')
        try:
            bq_client.get_table(bq_table)
            kwargs['ti'].xcom_push(key='created_flag', value=True)
        except Exception:  # table not found
            kwargs['ti'].xcom_push(key='created_flag', value=False)

    def branch_table(**kwargs):
        is_table_exists = kwargs['ti'].xcom_pull(key='created_flag', task_ids='chck_table')
        print('is_table_exists: ' + str(is_table_exists))
        if not IS_USING_ML:
            return 'mail'
        elif IS_USING_ML and is_table_exists:
            return 'predict'
        else:
            return 'train'

    def to_table(**kwargs):
        is_table_exists = kwargs['ti'].xcom_pull(key='created_flag', task_ids='chck_table')
        # table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(BQ_TABLE_DESTINATION + '_' + str(kwargs['execution_date'].date().strftime('%Y%m%d')))
        if is_table_exists:
            table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(BQ_TABLE_DESTINATION + '_' + '20180914')
        else:
            table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(BQ_TABLE_DESTINATION + '_' + '20180831')

        job_config = bigquery.QueryJobConfig()
        job_config.create_disposition = 'CREATE_IF_NEEDED'
        job_config.write_disposition = 'WRITE_TRUNCATE'
        job_config.destination = table_ref
        # Uncomment to create a partitioned table
        # job_config.time_partitioning = bigquery.table.TimePartitioning()

        if is_table_exists:
            # sql = QUERY.format(str(datetime.fromtimestamp(kwargs['execution_date'].timestamp()) - timedelta(SCHEDULE_INTERVAL - 1)), str(datetime.fromtimestamp(kwargs['execution_date'].timestamp())))
            sql = QUERY.format('2018-09-01 00:00:00', '2018-09-14 23:59:59')
            kwargs['ti'].xcom_push(key='csv_file', value='20180914')
        else:
            # sql = QUERY.format(START_DATETIME_QUERY, str(datetime.fromtimestamp(kwargs['execution_date'].timestamp())))
            sql = QUERY.format('2018-01-01 00:00:00', '2018-08-31 23:59:59')
            kwargs['ti'].xcom_push(key='csv_file', value='20180831')

        query_job = bq_client.query(
            sql,
            location='US',
            job_config=job_config,
        )
        result = query_job.result()
        kwargs['ti'].xcom_push(key='row_num', value=result.total_rows)

    def query_to_csv(**kwargs):
        is_table_exists = kwargs['ti'].xcom_pull(key='created_flag', task_ids='chck_table')
        # table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(BQ_TABLE_DESTINATION + '_' + str(kwargs['execution_date'].date().strftime('%Y%m%d')))
        if is_table_exists:
            table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(BQ_TABLE_DESTINATION + '_' + '20180914')
            uri = 'gs://' + BUCKET_DESTINATION + '/' + FOLDER_IN_BUCKET + BQ_TABLE_DESTINATION + '_' + '20180914.csv'
        else:
            table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(BQ_TABLE_DESTINATION + '_' + '20180831')
            uri = 'gs://' + BUCKET_DESTINATION + '/' + FOLDER_IN_BUCKET + BQ_TABLE_DESTINATION + '_' + '20180831.csv'

        csv_job = bq_client.extract_table(
            source=table_ref,
            # Change the destination URI if needed, e.g.
            # 'gs://' + BUCKET_DESTINATION + '/' + FOLDER_IN_BUCKET + BQ_TABLE_DESTINATION + '_' + str(kwargs['execution_date'].date().strftime('%Y%m%d')) + '.csv'
            destination_uris=uri,
            location='US',
        )
        csv_job.result()

    def gcs_csv_to_df(bq_hook, execution_date):
        # TODO: fill the blob folder from request.json
        blob = storage_client.get_bucket(BUCKET_DESTINATION).get_blob(
            FOLDER_IN_BUCKET + BQ_TABLE_DESTINATION + '_' + execution_date + '.csv')
        byte_stream = io.BytesIO()
        blob.download_to_file(byte_stream)
        byte_stream.seek(0)
        df = pd.read_csv(byte_stream)
        return df

    def save_to_gcs(bq_hook, stream, path_file):
        blob = storage_client.get_bucket(BUCKET_DESTINATION).blob(FOLDER_IN_BUCKET + path_file)
        byte_stream = io.BytesIO()
        pickle.dump(stream, byte_stream, pickle.HIGHEST_PROTOCOL)
        byte_stream.seek(0)
        blob.upload_from_file(byte_stream)

    def train_mdl(**kwargs):
        # df = gcs_csv_to_df(bq_hook, kwargs['execution_date'].date())
        df = gcs_csv_to_df(bq_hook, '20180831')
        # Rename the forecast target column to 'y' (the name the model expects)
        df = df.rename(columns={TARGET_FORECAST: 'y'})
        model = prophet.train(df, category_cols=CATEGORY)
        save_to_gcs(bq_hook, model, 'model/model.pickle')
        # prediction = predict_mdl(kwargs['execution_date'].date())
        prediction = predict_mdl(datetime(2018, 8, 31).date())
        return prediction

    def predict_mdl(execution_date):
        blob = storage_client.get_bucket(BUCKET_DESTINATION).get_blob(FOLDER_IN_BUCKET + 'model/model.pickle')
        byte_stream = io.BytesIO()
        blob.download_to_file(byte_stream)
        byte_stream.seek(0)
        models = pickle.load(byte_stream)
        prediction = prophet.predict(execution_date, models=models,
                                     schedule_interval=SCHEDULE_INTERVAL,
                                     category_cols=CATEGORY)
        plot.plotvis(prediction, YLABEL, SCHEDULE_INTERVAL, category_cols=CATEGORY)
        prediction.to_csv('prediction.csv', index=False)
        # blob = storage_client.get_bucket(BUCKET_DESTINATION).blob(FOLDER_IN_BUCKET + 'prediction/prediction_' + str((kwargs['execution_date'] + timedelta(days=1)).date().strftime('%Y%m%d')) + '.csv')
        blob = storage_client.get_bucket(BUCKET_DESTINATION).blob(
            FOLDER_IN_BUCKET + 'prediction/prediction_' + execution_date.strftime('%Y%m%d') + '.csv')
        blob.upload_from_filename('prediction.csv')
        return prediction

    def predict(**kwargs):
        # prediction = predict_mdl(kwargs['execution_date'].date())
        prediction = predict_mdl(datetime(2018, 9, 14).date())
        return prediction

    with dag:
        chck_table = PythonOperator(
            task_id='chck_table',
            dag=dag,
            python_callable=if_table_exists,
        )
        chck_table_branch = BranchPythonOperator(
            task_id='chck_table_branch',
            dag=dag,
            python_callable=branch_table,
        )
        crt_table = PythonOperator(
            task_id='crt_table',
            dag=dag,
            python_callable=to_table,
        )
        save_to_csv = PythonOperator(
            task_id='save_to_csv',
            dag=dag,
            python_callable=query_to_csv,
        )
        train = PythonOperator(
            task_id='train',
            dag=dag,
            python_callable=train_mdl,
        )
        # Change the hard-coded 20180831/20180914 dates to {{ ds_nodash }} for real runs
        mail = EmailOperator(
            task_id='mail',
            dag=dag,
            trigger_rule='none_failed',
            to='*****@*****.**',
            subject='Reporting: {{ params.dag_name }} {{ ds }}',
            params={
                'dag_name': DAG_NAME,
                'table': BQ_TABLE_DESTINATION,
                'dataset': BQ_DATASET_DESTINATION,
                'project': BQ_PROJECT_DESTINATION,
                'bucket': BUCKET_DESTINATION,
                'using_ml': IS_USING_ML,
            },
            html_content='''
                DAG name : {{ params.dag_name }} <br>
                Table : {{ params.table }} <br>
                Dataset : {{ params.dataset }} <br>
                Project : {{ params.project }} <br>
                CSV link in GCS : https://storage.cloud.google.com/{{ params.bucket }}/{{ params.dataset }}/{{ params.table }}_{{ task_instance.xcom_pull(task_ids='crt_table', key='csv_file') }}.csv <br>
                Number of recorded rows : {{ task_instance.xcom_pull(task_ids='crt_table', key='row_num') }}
            ''',
            files=ATTACHED_FILE,
            cc=['*****@*****.**', '*****@*****.**'],
        )
        # Distinct variable name so the operator does not shadow the predict() callable
        predict_task = PythonOperator(
            task_id='predict',
            dag=dag,
            python_callable=predict,
        )

        chck_table >> crt_table >> save_to_csv >> chck_table_branch >> [train, predict_task, mail]
        train >> mail
        predict_task >> mail

    return dag
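# A hypothetical request configuration matching the keys create_dag() reads.
# The key structure is grounded in the lookups at the top of create_dag();
# every value below is a placeholder.
pipeline = {
    'dag': {
        'dag_name': 'daily_forecast_dag',
        'schedule_interval': 1,  # days, fed into timedelta(days=...)
    },
    'args': {'owner': 'airflow'},
    'start_date': '2018-01-01',
    'start_datetime_query': '2018-01-01 00:00:00',
    'connection': {
        'bq_conn_id': 'bigquery_default',
        'bq_project_destination': 'my-project',
        'bq_dataset_destination': 'my_dataset',
        'bq_table_destination': 'my_table',
        'bucket_destination': 'my-bucket',
        'folder_in_bucket': 'exports/',
    },
    'model': {
        'is_using_ml': True,
        'target_forecast': 'sessions',
        'category': ['channel'],
    },
    'viz': {'ylabel': 'Sessions'},
    # QUERY.format(start, end) fills the two {} placeholders with datetimes
    'query': 'SELECT * FROM `my-project.source.events` WHERE ts BETWEEN "{}" AND "{}"',
}

dag = create_dag(pipeline)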