Example #1
    def poke(self, context):
        table_uri = '{0}:{1}.{2}'.format(self.project_id, self.dataset_id, self.table_id)
        self.log.info('Sensor checks existence of table: %s', table_uri)
        hook = BigQueryHook(
            bigquery_conn_id=self.bigquery_conn_id,
            delegate_to=self.delegate_to)
        return hook.table_exists(self.project_id, self.dataset_id, self.table_id)
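
For context, this poke() method belongs inside a sensor class. A minimal sketch of the surrounding class, assuming Airflow 1.10-era import paths and constructor arguments that match the attributes used above (the class name and default connection ID are placeholders, not taken from the example):

from airflow.contrib.hooks.bigquery_hook import BigQueryHook
from airflow.sensors.base_sensor_operator import BaseSensorOperator
from airflow.utils.decorators import apply_defaults


class BigQueryTableSensor(BaseSensorOperator):
    """Waits until the given BigQuery table exists (wrapper sketch for poke() above)."""

    @apply_defaults
    def __init__(self, project_id, dataset_id, table_id,
                 bigquery_conn_id='bigquery_default',  # placeholder connection ID
                 delegate_to=None, *args, **kwargs):
        super(BigQueryTableSensor, self).__init__(*args, **kwargs)
        self.project_id = project_id
        self.dataset_id = dataset_id
        self.table_id = table_id
        self.bigquery_conn_id = bigquery_conn_id
        self.delegate_to = delegate_to

    # the poke() method shown above goes here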
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        logging.info('start_date_str = %s', self.start_date_str)
        logging.info('end_date_str = %s', self.end_date_str)
        logging.info('Date conversion starts')
        start = str2date(self.start_date_str)
        end = str2date(self.end_date_str)
        logging.info('Date conversion ends')
        logging.info('time_partitioning = %s', self.time_partitioning)

        for i in daterange(start, end):
            date_no_dash = i.strftime("%Y%m%d")
            partitioned_table_id = self.table_id + date_no_dash
            logging.info("Partitioned table {0}".format(partitioned_table_id))

            logging.info('Checking if table exists <%s:%s.%s>',
                         self.project_id, self.dataset_id,
                         partitioned_table_id)
            table_exists = bq_hook.table_exists(self.project_id,
                                                self.dataset_id,
                                                partitioned_table_id)
            if not table_exists:
                logging.info('Table <%s> does not exist',
                             partitioned_table_id)
                logging.info('Connecting to BigQuery')
                cursor = BigQueryHelperCursor(bq_hook.get_service(),
                                              self.project_id)

                logging.info('Creating the empty table %s with the schema %s',
                             partitioned_table_id, self.schema_fields)
                cursor.create_empty_table(
                    project_id=self.project_id,
                    dataset_id=self.dataset_id,
                    table_id=partitioned_table_id,
                    schema_fields=self.schema_fields,
                    time_partitioning=self.time_partitioning)
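
The execute() snippet above calls str2date() and daterange() helpers that are not shown. A minimal sketch of what they might look like, assuming 'YYYY-MM-DD' input strings and an end-exclusive range (both the date format and the range convention are assumptions):

import datetime as dt


def str2date(date_str):
    # assumed input format: 'YYYY-MM-DD'
    return dt.datetime.strptime(date_str, '%Y-%m-%d').date()


def daterange(start, end):
    # yields each date from start up to, but not including, end
    for offset in range((end - start).days):
        yield start + dt.timedelta(days=offset)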
class GeotabToBigQueryOperator(BaseOperator):

    @apply_defaults
    def __init__(self,
                 gcs_conn_id,
                 gcs_bucket_name,
                 bq_project_name,
                 bq_dataset_name,
                 bq_table_name,
                 bq_table_schema,
                 update_info_dataset_id,
                 update_info_table_id,
                 geotab_conn_id,
                 geotab_data_type_name,
                 partition_column=None,
                 is_append_mode=True,
                 add_snapshot_time_column=False,
                 selected_column_list=[],
                 fields_preprocessing_map=[],
                 parse_data_field = False,
                 *args,
                 **kwargs):
        super(GeotabToBigQueryOperator, self).__init__(*args, **kwargs)
        self.gcs_conn_id = gcs_conn_id
        self.gcs_bucket_name = gcs_bucket_name
        self.bq_project_name = bq_project_name
        self.bq_dataset_name = bq_dataset_name
        self.bq_table_name = bq_table_name
        self.bq_table_schema = bq_table_schema
        self.update_info_dataset_id = update_info_dataset_id
        self.update_info_table_id = update_info_table_id
        self.geotab_conn_id = geotab_conn_id
        self.geotab_data_type_name = geotab_data_type_name
        self.partition_column = partition_column
        self.is_append_mode = is_append_mode
        self.add_snapshot_time_column = add_snapshot_time_column
        self.selected_column_list = selected_column_list
        self.fields_preprocessing_map = fields_preprocessing_map
        self.parse_data_field = parse_data_field

    def execute(self, context):
        self.log.info(f"start execute")
        try:
            self.init()
            self.calc_patch_interval()
            self.get_geotab_data()
            self.do_preprocess()
            self.write_to_csv()
            self.send_to_gcs()
            self.push_to_bigquery()
            self.set_last_updated_time()
        except Exception as e:
            self.log.exception(e)
            raise
        finally:
            self.clean_up()

    def init(self):
        self.log.info(f"init() is started")

        # bucket connection
        self.gcs_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.gcs_conn_id, delegate_to=None)

        # bigquery connection (reuses the same GCP connection ID as the GCS hook)
        self.bq_hook = BigQueryHook(bigquery_conn_id=self.gcs_conn_id, use_legacy_sql=False)
        bq_conn = self.bq_hook.get_conn()
        self.bq_cursor = bq_conn.cursor()

        # geotab connection
        self.geotab_hook = GeotabHook(geotab_conn_id=self.geotab_conn_id)

        params = self.geotab_hook.get_connection(self.geotab_conn_id)
        self.log.info("login: ******, password: ******, schema: %s", params.schema)

    def calc_patch_interval(self):
        self.log.info(f"calc_patch_interval() is started")
        self.interval_start_at = self.get_bigquery_last_updated_time()
        self.log.info(f"last updated: [{self.interval_start_at}]")
        # add 1 second to interval_start_at for [ )
        self.interval_start_at = self.add_seconds(self.interval_start_at, 1)
        self.interval_end_at = self.calc_interval_end_time(self.interval_start_at, PATCH_INTERVAL_DAY)
        self.log.info(f"patch interval range: [{self.interval_start_at}, {self.interval_end_at})")

    def get_geotab_data(self):
        self.log.info(f"get_geotab_data() is started")
        date_params = {'fromDate': self.interval_start_at, 'toDate': self.interval_end_at}
        if self.geotab_data_type_name == 'DeviceStatusInfo':
            self.geotab_json_data = self.geotab_hook.get(type_name=self.geotab_data_type_name)
        else:
            self.geotab_json_data = self.geotab_hook.get(type_name=self.geotab_data_type_name, params=date_params)

    def do_preprocess(self):
        self.log.info(f"do_preprocess() is started")
        self.geotab_df = pd.DataFrame(self.geotab_json_data)
        # refine columns
        if len(self.selected_column_list) > 0:
            self.geotab_df = self.geotab_df[self.selected_column_list]
        # extract nested fields: each mapping is (source column, key, target column)
        for replace_set in self.fields_preprocessing_map:
            if len(replace_set) != 3:
                continue
            self.geotab_df[replace_set[2]] = self.geotab_df[replace_set[0]].map(lambda s: s[replace_set[1]])
        # add snapshotAt column
        if self.add_snapshot_time_column:
            self.geotab_df.insert(loc=0, column='snapshotAt', value=self.interval_end_at)
        # optionally expand the Geotab CustomData payload in the 'data' field
        if self.parse_data_field:
            self.parse_data_field_for_customdata()

    def parse_data_field_for_customdata(self):
        self.geotab_df['device'] = self.geotab_df['device'].map(lambda s: s['id'])
        self.geotab_df['data'] = self.geotab_df['data'].apply(lambda x: base64.b64decode(x).hex())

        customdata_name = ["Pkt Sequence", "Pkt Type", "Temperature", "Humidity", "PM1.0(1st)", "PM1.0(2nd)", "PM2.5(1st)", "PM2.5(2nd)", "PM10(1st)", "PM10(2nd)", "CO(1st)", "CO(2nd)", "CO2(1st)", "TBD(1st)", "TBD(2nd)"]
        # customdata_unit = ["[-]", "[-]", "[℃]", "[%]", "[mg/㎥]", "[mg/㎥]", "[mg/㎥]", "[mg/㎥]", "[mg/㎥]", "[mg/㎥]", "[ppm]", "[ppm]", "[ppm]", "[minute]", "[index]"]
        customdata_byte = [2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1]

        list_ = []
        for index, row in self.geotab_df.iterrows():
            # decode the fixed-width hex payload into one dict per row
            customdata_set = {}
            idx_to = 0
            for i in range(len(customdata_name)):
                idx_from = idx_to
                idx_to = idx_from + customdata_byte[i] * 2
                customdata_set[customdata_name[i]] = int(row['data'][idx_from:idx_to], 16)

            # apply scaling factors to the raw readings
            customdata_set["Temperature"] = customdata_set["Temperature"] * 0.1 - 100
            customdata_set["Humidity"] *= 0.1
            customdata_set["CO(1st)"] *= 0.1
            customdata_set["CO(2nd)"] *= 0.1
            list_.append(customdata_set)

        self.geotab_df = pd.merge(self.geotab_df.loc[:, self.geotab_df.columns != 'data'], pd.DataFrame(list_, columns=customdata_name), left_index=True, right_index=True)

    def write_to_csv(self):
        self.log.info(f"write_to_csv() is started")
        self.csv_file_name = self.geotab_data_type_name + '.csv'
        self.geotab_df.to_csv(self.csv_file_name, header=False, index=False)

    def send_to_gcs(self):
        self.log.info(f"send_to_gcs() is started")
        self.gcs_hook.upload(self.gcs_bucket_name, self.csv_file_name, self.csv_file_name)

    def push_to_bigquery(self):
        self.log.info(f"push_to_bigquery() is started")
        tp_dictionary = None

        if self.partition_column is not None:
            tp = bigquery.table.TimePartitioning()
            tp.expiration_ms = None
            tp.field = self.partition_column
            tp_dictionary = tp.to_api_repr()
        
        # check table existence
        is_table_exist = False
        if self.bq_hook.table_exists(self.bq_project_name, self.bq_dataset_name, self.bq_table_name):
            is_table_exist = True

        create_disposition='CREATE_IF_NEEDED'
        write_disposition='WRITE_TRUNCATE'
        if self.is_append_mode and is_table_exist:
            create_disposition='CREATE_NEVER'
            write_disposition='WRITE_APPEND'

        self.bq_cursor.run_load(
            destination_project_dataset_table=self.bq_table_id(self.bq_project_name, self.bq_dataset_name, self.bq_table_name),
            schema_fields=self.bq_table_schema,
            source_uris=[self.gs_uri(self.gcs_bucket_name, self.csv_file_name)],
            create_disposition=create_disposition,
            write_disposition=write_disposition,
            max_bad_records=0,
            allow_quoted_newlines=True,
            field_delimiter=',',
            src_fmt_configs={'nullMarker': 'NULL'},
            time_partitioning=tp_dictionary
        )

    def set_last_updated_time(self):
        self.log.info(f"set_last_updated_time() is started")
        self.set_bigquery_last_updated_time()

    def clean_up(self):
        self.log.info(f"clean_up() is started")
        if hasattr(self, 'csv_file_name') and os.path.isfile(self.csv_file_name):
            os.remove(self.csv_file_name)
        #self.gcs_hook.delete(self.gcs_bucket_name, self.csv_file_name)

    ###############################################################################
    # helper
    def add_seconds(self, target_time, second_value):
        time_seconds_added = dt.datetime.strptime(target_time, '%Y-%m-%d %H:%M:%S')
        time_seconds_added = time_seconds_added + dt.timedelta(seconds=second_value)
        return time_seconds_added.strftime('%Y-%m-%d %H:%M:%S')

    def calc_interval_end_time(self, interval_start_time, interval_day):
        start_time = dt.datetime.strptime(interval_start_time, '%Y-%m-%d %H:%M:%S').replace(tzinfo = dt.timezone.utc)
        end_time = dt.datetime.now(dt.timezone.utc)
        if interval_day >= 1:
            end_time = start_time + dt.timedelta(days=interval_day)
        # subtract 1 second from interval_end_at for [ )
        end_time = end_time + dt.timedelta(seconds=-1)

        current_time = dt.datetime.now(dt.timezone.utc)
        if (end_time > current_time):
            end_time = current_time

        interval_end_time =  end_time.strftime('%Y-%m-%d %H:%M:%S')
        return interval_end_time

    def bq_table_id(self, project, dataset, table):
        return f"{project}:{dataset}.{table}"

    def gs_uri(self, bucket, file_key):
        return f"gs://{bucket}/{file_key}"

    def replace_escape_char(self, file_name):
        with open(file_name, 'r') as file:
            data = file.read()
        data = data.replace("'", "\"")
        with open(file_name, 'w') as file:
            file.write(data)

    def get_bigquery_last_updated_time(self):
        bq_query = self.get_last_updated_time_value_query(self.update_info_dataset_id, self.update_info_table_id,
                                                          self.bq_dataset_name, self.bq_table_name)
        self.log.info(f"get_bigquery_last_updated_time() - debug query: {bq_query}")
        bq_conn = self.bq_hook.get_conn()
        temp_bq_cursor = bq_conn.cursor()
        temp_bq_cursor.execute(bq_query)
        last_updated_time_value_row = temp_bq_cursor.fetchone()
        if last_updated_time_value_row is None:
            self.log.info(f"there is no bigquery table: {self.bq_table_name}")
            return PATCH_START_TIME
        else:
            last_updated = last_updated_time_value_row[0]
            self.log.info(f"get bigquery last updated time value: {last_updated}")
            last_updated_formatted = dt.datetime.strptime(last_updated, '%Y%m%d%H%M%S').strftime('%Y-%m-%d %H:%M:%S')
            return last_updated_formatted

    def set_bigquery_last_updated_time(self):
        # first, check the inserted data
        bq_query_get = self.get_last_updated_time_value_query(self.update_info_dataset_id, self.update_info_table_id,
                                                              self.bq_dataset_name, self.bq_table_name)
        bq_conn_get = self.bq_hook.get_conn()
        bq_cursor_get = bq_conn_get.cursor()
        bq_cursor_get.execute(bq_query_get)
        last_updated_time_value_row = bq_cursor_get.fetchone()
       
        # insert or update last_updated time
        last_updated_formatted = dt.datetime.strptime(self.interval_end_at, '%Y-%m-%d %H:%M:%S').strftime('%Y%m%d%H%M%S')
        bq_query_set = ''
        if last_updated_time_value_row is None:
            self.log.info(f"insert last_updated: {self.bq_table_name}")
            bq_query_set = self.insert_last_updated_time_value_query(self.update_info_dataset_id, self.update_info_table_id,
                                                                     self.bq_dataset_name, self.bq_table_name, last_updated_formatted)
        else:
            self.log.info(f"update last_updated: {self.bq_table_name}")
            bq_query_set = self.update_last_updated_time_value_query(self.update_info_dataset_id, self.update_info_table_id,
                                                                     self.bq_dataset_name, self.bq_table_name, last_updated_formatted)
        bq_conn_set = self.bq_hook.get_conn()
        bq_cursor_set = bq_conn_set.cursor()
        bq_cursor_set.execute(bq_query_set)

    # queries for last_updated column
    def get_last_updated_time_value_query(self, ref_dataset, ref_table, target_dataset, target_table):
        return f"SELECT last_updated FROM {ref_dataset}.{ref_table} WHERE dataset_id = '{target_dataset}' AND table_id = '{target_table}';"

    def insert_last_updated_time_value_query(self, ref_dataset, ref_table, target_dataset, target_table, last_updated_value):
        return f"INSERT INTO {ref_dataset}.{ref_table} (dataset_id, table_id, last_updated) VALUES ('{target_dataset}', '{target_table}', '{last_updated_value}');"

    def update_last_updated_time_value_query(self, ref_dataset, ref_table, target_dataset, target_table, last_updated_value):
        return f"UPDATE {ref_dataset}.{ref_table} SET last_updated = '{last_updated_value}' WHERE dataset_id = '{target_dataset}' AND table_id = '{target_table}';"
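
A sketch of how GeotabToBigQueryOperator might be wired into a DAG; the connection IDs, bucket, dataset, table names, and schema below are placeholders, not values taken from this example:

from datetime import datetime

from airflow import DAG

dag = DAG('geotab_to_bigquery',
          start_date=datetime(2020, 1, 1),
          schedule_interval='@hourly',
          catchup=False)

load_device_status = GeotabToBigQueryOperator(
    task_id='load_device_status',
    gcs_conn_id='google_cloud_default',          # placeholder GCP connection
    gcs_bucket_name='geotab-staging-bucket',     # placeholder bucket
    bq_project_name='my-project',
    bq_dataset_name='geotab',
    bq_table_name='device_status_info',
    bq_table_schema=[{'name': 'device', 'type': 'STRING', 'mode': 'NULLABLE'}],
    update_info_dataset_id='geotab_meta',
    update_info_table_id='last_updated',
    geotab_conn_id='geotab_default',             # placeholder Geotab connection
    geotab_data_type_name='DeviceStatusInfo',
    add_snapshot_time_column=True,
    dag=dag)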
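
The fixed-width decoding done in parse_data_field_for_customdata() can also be tried in isolation. The snippet below decodes a single made-up CustomData payload using the field layout and scaling from the operator (the payload bytes are fabricated purely for illustration):

import base64

customdata_name = ["Pkt Sequence", "Pkt Type", "Temperature", "Humidity",
                   "PM1.0(1st)", "PM1.0(2nd)", "PM2.5(1st)", "PM2.5(2nd)",
                   "PM10(1st)", "PM10(2nd)", "CO(1st)", "CO(2nd)", "CO2(1st)",
                   "TBD(1st)", "TBD(2nd)"]
customdata_byte = [2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1]

# made-up 27-byte payload, base64-encoded the way it would arrive from Geotab
payload = base64.b64encode(bytes(range(27))).decode()

hex_data = base64.b64decode(payload).hex()
decoded, idx_to = {}, 0
for name, width in zip(customdata_name, customdata_byte):
    idx_from, idx_to = idx_to, idx_to + width * 2  # two hex characters per byte
    decoded[name] = int(hex_data[idx_from:idx_to], 16)

# same scaling as in the operator
decoded["Temperature"] = decoded["Temperature"] * 0.1 - 100
decoded["Humidity"] *= 0.1
decoded["CO(1st)"] *= 0.1
decoded["CO(2nd)"] *= 0.1
print(decoded)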