# Imports assumed by the snippets below (Airflow 1.x contrib paths); the exact
# module paths are assumptions, not part of the original source.
import base64
import datetime as dt
import logging
import os

import pandas as pd
from google.cloud import bigquery

from airflow.contrib.hooks.bigquery_hook import BigQueryHook
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults

# GeotabHook and BigQueryHelperCursor are project-local helpers; their import
# paths are not shown in the original source, e.g.:
# from plugins.hooks.geotab_hook import GeotabHook
# from plugins.helpers.bigquery_helper import BigQueryHelperCursor


def poke(self, context):
    table_uri = '{0}:{1}.{2}'.format(self.project_id, self.dataset_id, self.table_id)
    self.log.info('Sensor checks existence of table: %s', table_uri)
    hook = BigQueryHook(
        bigquery_conn_id=self.bigquery_conn_id,
        delegate_to=self.delegate_to)
    return hook.table_exists(self.project_id, self.dataset_id, self.table_id)
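# Hypothetical usage sketch, not from the original source: poke() above is a
# sensor method, so its enclosing class (name assumed here as
# BigQueryTableSensor) would be wired into a DAG roughly like this:
#
# wait_for_table = BigQueryTableSensor(
#     task_id='wait_for_table',
#     project_id='my-project',              # assumed placeholder
#     dataset_id='my_dataset',              # assumed placeholder
#     table_id='my_table20190101',          # assumed placeholder
#     bigquery_conn_id='bigquery_default',
#     poke_interval=60,                     # re-check every 60 seconds
#     dag=dag)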
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)
    logging.info('start_date_str = %s', self.start_date_str)
    logging.info('end_date_str = %s', self.end_date_str)
    logging.info('Date conversion starts')
    start = str2date(self.start_date_str)
    end = str2date(self.end_date_str)
    logging.info('Date conversion ends')
    logging.info('time_partitioning = %s', self.time_partitioning)
    for i in daterange(start, end):
        date_no_dash = i.strftime("%Y%m%d")
        partitioned_table_id = self.table_id + date_no_dash
        logging.info("Partitioned table %s", partitioned_table_id)
        logging.info('Hooks to check if table exists <%s:%s.%s>',
                     self.project_id, self.dataset_id, partitioned_table_id)
        table_exists = bq_hook.table_exists(self.project_id, self.dataset_id,
                                            partitioned_table_id)
        if not table_exists:
            logging.info('Table <%s> does not exist', partitioned_table_id)
            logging.info('Connects to BigQuery')
            cursor = BigQueryHelperCursor(bq_hook.get_service(), self.project_id)
            logging.info('Creates the empty table %s with the schema %s',
                         partitioned_table_id, self.schema_fields)
            cursor.create_empty_table(
                project_id=self.project_id,
                dataset_id=self.dataset_id,
                table_id=partitioned_table_id,
                schema_fields=self.schema_fields,
                time_partitioning=self.time_partitioning)
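# str2date(), daterange(), PATCH_INTERVAL_DAY and PATCH_START_TIME are
# referenced in this file but not defined in the snippet. Minimal sketches
# under assumed formats ('%Y-%m-%d' dates, '%Y-%m-%d %H:%M:%S' timestamps);
# the real project code may differ.

PATCH_INTERVAL_DAY = 7                      # assumed batch window in days
PATCH_START_TIME = '2019-01-01 00:00:00'    # assumed initial watermark


def str2date(date_str):
    """Assumed helper: parse a 'YYYY-MM-DD' string into a datetime.date."""
    return dt.datetime.strptime(date_str, '%Y-%m-%d').date()


def daterange(start, end):
    """Assumed helper: yield each date from start to end, inclusive."""
    for offset in range((end - start).days + 1):
        yield start + dt.timedelta(days=offset)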
class GeotabToBigQueryOperator(BaseOperator):
    @apply_defaults
    def __init__(self,
                 gcs_conn_id,
                 gcs_bucket_name,
                 bq_project_name,
                 bq_dataset_name,
                 bq_table_name,
                 bq_table_schema,
                 update_info_dataset_id,
                 update_info_table_id,
                 geotab_conn_id,
                 geotab_data_type_name,
                 partition_column=None,
                 is_append_mode=True,
                 add_snapshot_time_column=False,
                 selected_column_list=None,
                 fields_preprocessing_map=None,
                 parse_data_field=False,
                 *args, **kwargs):
        super(GeotabToBigQueryOperator, self).__init__(*args, **kwargs)
        self.gcs_conn_id = gcs_conn_id
        self.gcs_bucket_name = gcs_bucket_name
        self.bq_project_name = bq_project_name
        self.bq_dataset_name = bq_dataset_name
        self.bq_table_name = bq_table_name
        self.bq_table_schema = bq_table_schema
        self.update_info_dataset_id = update_info_dataset_id
        self.update_info_table_id = update_info_table_id
        self.geotab_conn_id = geotab_conn_id
        self.geotab_data_type_name = geotab_data_type_name
        self.partition_column = partition_column
        self.is_append_mode = is_append_mode
        self.add_snapshot_time_column = add_snapshot_time_column
        # None defaults avoid shared mutable default arguments
        self.selected_column_list = selected_column_list or []
        self.fields_preprocessing_map = fields_preprocessing_map or []
        self.parse_data_field = parse_data_field

    def execute(self, context):
        self.log.info("start execute")
        try:
            self.init()
            self.calc_patch_interval()
            self.get_geotab_data()
            self.do_preprocess()
            self.write_to_csv()
            self.send_to_gcs()
            self.push_to_bigquery()
            self.set_last_updated_time()
        except Exception as e:
            self.log.exception(e)
            raise
        finally:
            self.clean_up()

    def init(self):
        self.log.info("init() is started")
        # bucket connection
        self.gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.gcs_conn_id, delegate_to=None)
        # bigquery connection; reuses the same GCP connection id as GCS
        self.bq_hook = BigQueryHook(bigquery_conn_id=self.gcs_conn_id,
                                    use_legacy_sql=False)
        bq_conn = self.bq_hook.get_conn()
        self.bq_cursor = bq_conn.cursor()
        # geotab connection; log only the schema, never the credentials
        self.geotab_hook = GeotabHook(geotab_conn_id=self.geotab_conn_id)
        params = self.geotab_hook.get_connection(self.geotab_conn_id)
        self.log.info(f"geotab connection schema: {params.schema}")

    def calc_patch_interval(self):
        self.log.info("calc_patch_interval() is started")
        self.interval_start_at = self.get_bigquery_last_updated_time()
        self.log.info(f"last updated: [{self.interval_start_at}]")
        # add 1 second to interval_start_at for a half-open range [start, end)
        self.interval_start_at = self.add_seconds(self.interval_start_at, 1)
        self.interval_end_at = self.calc_interval_end_time(
            self.interval_start_at, PATCH_INTERVAL_DAY)
        self.log.info(f"patch interval range: "
                      f"[{self.interval_start_at}, {self.interval_end_at})")

    def get_geotab_data(self):
        self.log.info("get_geotab_data() is started")
        date_params = {'fromDate': self.interval_start_at,
                       'toDate': self.interval_end_at}
        # DeviceStatusInfo is a snapshot type and takes no date range
        if self.geotab_data_type_name == 'DeviceStatusInfo':
            self.geotab_json_data = self.geotab_hook.get(
                type_name=self.geotab_data_type_name)
        else:
            self.geotab_json_data = self.geotab_hook.get(
                type_name=self.geotab_data_type_name, params=date_params)

    def do_preprocess(self):
        self.log.info("do_preprocess() is started")
        self.geotab_df = pd.DataFrame(self.geotab_json_data)
        # refine columns
        if len(self.selected_column_list) > 0:
            self.geotab_df = self.geotab_df[self.selected_column_list]
        # parse json: each map entry is (source_column, key, target_column)
        for replace_set in self.fields_preprocessing_map:
            if len(replace_set) != 3:
                continue
            self.geotab_df[replace_set[2]] = \
                self.geotab_df[replace_set[0]].map(lambda s: s[replace_set[1]])
        # add snapshotAt column
        if self.add_snapshot_time_column:
            self.geotab_df.insert(loc=0, column='snapshotAt',
                                  value=self.interval_end_at)
        if self.parse_data_field:
            self.parse_data_field_for_customdata()

    def parse_data_field_for_customdata(self):
        self.geotab_df['device'] = self.geotab_df['device'].map(lambda s: s['id'])
        self.geotab_df['data'] = self.geotab_df['data'].apply(
            lambda x: base64.b64decode(x).hex())
        customdata_name = ["Pkt Sequence", "Pkt Type", "Temperature", "Humidity",
                           "PM1.0(1st)", "PM1.0(2nd)", "PM2.5(1st)", "PM2.5(2nd)",
                           "PM10(1st)", "PM10(2nd)", "CO(1st)", "CO(2nd)",
                           "CO2(1st)", "TBD(1st)", "TBD(2nd)"]
        # customdata_unit = ["[-]", "[-]", "[℃]", "[%]", "[mg/㎥]", "[mg/㎥]",
        #                    "[mg/㎥]", "[mg/㎥]", "[mg/㎥]", "[mg/㎥]", "[ppm]",
        #                    "[ppm]", "[ppm]", "[minute]", "[index]"]
        customdata_byte = [2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1]
        list_ = []
        for index, row in self.geotab_df.iterrows():
            # build a fresh dict per row; a single shared dict would make
            # every appended row alias the last row's values
            customdata_set = {}
            idx_to = 0
            for i in range(len(customdata_name)):
                # each field spans customdata_byte[i] bytes = 2 hex chars per byte
                idx_from = idx_to
                idx_to = idx_from + customdata_byte[i] * 2
                customdata_set[customdata_name[i]] = int(row['data'][idx_from:idx_to], 16)
            # apply unit scaling before storing the row
            customdata_set["Temperature"] = customdata_set["Temperature"] * 0.1 - 100
            customdata_set["Humidity"] *= 0.1
            customdata_set["CO(1st)"] *= 0.1
            customdata_set["CO(2nd)"] *= 0.1
            list_.append(customdata_set)
        self.geotab_df = pd.merge(
            self.geotab_df.loc[:, self.geotab_df.columns != 'data'],
            pd.DataFrame(list_, columns=customdata_name),
            left_index=True, right_index=True)

    def write_to_csv(self):
        self.log.info("write_to_csv() is started")
        self.csv_file_name = self.geotab_data_type_name + '.csv'
        self.geotab_df.to_csv(self.csv_file_name, header=False, index=False)

    def send_to_gcs(self):
        self.log.info("send_to_gcs() is started")
        self.gcs_hook.upload(self.gcs_bucket_name, self.csv_file_name,
                             self.csv_file_name)

    def push_to_bigquery(self):
        self.log.info("push_to_bigquery() is started")
        tp_dictionary = None
        if self.partition_column is not None:
            tp = bigquery.table.TimePartitioning()
            tp.expiration_ms = None
            tp.field = self.partition_column
            tp_dictionary = tp.to_api_repr()
        # check table existence
        is_table_exist = self.bq_hook.table_exists(
            self.bq_project_name, self.bq_dataset_name, self.bq_table_name)
        create_disposition = 'CREATE_IF_NEEDED'
        write_disposition = 'WRITE_TRUNCATE'
        if self.is_append_mode and is_table_exist:
            create_disposition = 'CREATE_NEVER'
            write_disposition = 'WRITE_APPEND'
        self.bq_cursor.run_load(
            destination_project_dataset_table=self.bq_table_id(
                self.bq_project_name, self.bq_dataset_name, self.bq_table_name),
            schema_fields=self.bq_table_schema,
            source_uris=[self.gs_uri(self.gcs_bucket_name, self.csv_file_name)],
            create_disposition=create_disposition,
            write_disposition=write_disposition,
            max_bad_records=0,
            allow_quoted_newlines=True,
            field_delimiter=',',
            src_fmt_configs={'nullMarker': 'NULL'},
            time_partitioning=tp_dictionary)

    def set_last_updated_time(self):
        self.log.info("set_last_updated_time() is started")
        self.set_bigquery_last_updated_time()

    def clean_up(self):
        self.log.info("clean_up() is started")
        # clean_up() runs in the finally block, so the csv file may not exist yet
        csv_file_name = getattr(self, 'csv_file_name', None)
        if csv_file_name and os.path.isfile(csv_file_name):
            os.remove(csv_file_name)
        # self.gcs_hook.delete(self.gcs_bucket_name, self.csv_file_name)

    ###########################################################################
    # helpers

    def add_seconds(self, target_time, second_value):
        time_seconds_added = dt.datetime.strptime(target_time, '%Y-%m-%d %H:%M:%S')
        time_seconds_added = time_seconds_added + dt.timedelta(seconds=second_value)
        return time_seconds_added.strftime('%Y-%m-%d %H:%M:%S')
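    # Worked example of the patch window below, assuming PATCH_INTERVAL_DAY = 7
    # and a stored last_updated of '2019-01-01 00:00:00':
    #   interval start = add_seconds(last_updated, 1)  -> '2019-01-01 00:00:01'
    #   interval end   = start + 7 days - 1 second     -> '2019-01-08 00:00:00'
    # so consecutive runs cover non-overlapping [start, end) windows, with the
    # end capped at the current UTC time.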
    def calc_interval_end_time(self, interval_start_time, interval_day):
        start_time = dt.datetime.strptime(
            interval_start_time, '%Y-%m-%d %H:%M:%S').replace(tzinfo=dt.timezone.utc)
        end_time = dt.datetime.now(dt.timezone.utc)
        if interval_day >= 1:
            end_time = start_time + dt.timedelta(days=interval_day)
            # subtract 1 second from interval_end_at for a half-open range [start, end)
            end_time = end_time + dt.timedelta(seconds=-1)
        current_time = dt.datetime.now(dt.timezone.utc)
        if end_time > current_time:
            end_time = current_time
        interval_end_time = end_time.strftime('%Y-%m-%d %H:%M:%S')
        return interval_end_time

    def bq_table_id(self, project, dataset, table):
        return f"{project}:{dataset}.{table}"

    def gs_uri(self, bucket, file_key):
        return f"gs://{bucket}/{file_key}"

    def replace_escape_char(self, file_name):
        with open(file_name, 'r') as file:
            data = file.read()
        data = data.replace("'", "\"")
        with open(file_name, 'w') as file:
            file.write(data)

    def get_bigquery_last_updated_time(self):
        bq_query = self.get_last_updated_time_value_query(
            self.update_info_dataset_id, self.update_info_table_id,
            self.bq_dataset_name, self.bq_table_name)
        self.log.info(f"get_bigquery_last_updated_time() - debug query: {bq_query}")
        bq_conn = self.bq_hook.get_conn()
        temp_bq_cursor = bq_conn.cursor()
        temp_bq_cursor.execute(bq_query)
        last_updated_time_value_row = temp_bq_cursor.fetchone()
        if last_updated_time_value_row is None:
            self.log.info(f"there is no bigquery table: {self.bq_table_name}")
            return PATCH_START_TIME
        last_updated = last_updated_time_value_row[0]
        self.log.info(f"get bigquery last updated time value: {last_updated}")
        last_updated_formatted = dt.datetime.strptime(
            last_updated, '%Y%m%d%H%M%S').strftime('%Y-%m-%d %H:%M:%S')
        return last_updated_formatted

    def set_bigquery_last_updated_time(self):
        # first, check whether a row for this table already exists
        bq_query_get = self.get_last_updated_time_value_query(
            self.update_info_dataset_id, self.update_info_table_id,
            self.bq_dataset_name, self.bq_table_name)
        bq_conn_get = self.bq_hook.get_conn()
        bq_cursor_get = bq_conn_get.cursor()
        bq_cursor_get.execute(bq_query_get)
        last_updated_time_value_row = bq_cursor_get.fetchone()
        # insert or update the last_updated time
        last_updated_formatted = dt.datetime.strptime(
            self.interval_end_at, '%Y-%m-%d %H:%M:%S').strftime('%Y%m%d%H%M%S')
        if last_updated_time_value_row is None:
            self.log.info(f"insert last_updated: {self.bq_table_name}")
            bq_query_set = self.insert_last_updated_time_value_query(
                self.update_info_dataset_id, self.update_info_table_id,
                self.bq_dataset_name, self.bq_table_name, last_updated_formatted)
        else:
            self.log.info(f"update last_updated: {self.bq_table_name}")
            bq_query_set = self.update_last_updated_time_value_query(
                self.update_info_dataset_id, self.update_info_table_id,
                self.bq_dataset_name, self.bq_table_name, last_updated_formatted)
        bq_conn_set = self.bq_hook.get_conn()
        bq_cursor_set = bq_conn_set.cursor()
        bq_cursor_set.execute(bq_query_set)

    # queries for the last_updated column
    def get_last_updated_time_value_query(self, ref_dataset, ref_table,
                                          target_dataset, target_table):
        return (f"SELECT last_updated FROM {ref_dataset}.{ref_table} "
                f"WHERE dataset_id = '{target_dataset}' "
                f"AND table_id = '{target_table}';")

    def insert_last_updated_time_value_query(self, ref_dataset, ref_table,
                                             target_dataset, target_table,
                                             last_updated_value):
        return (f"INSERT INTO {ref_dataset}.{ref_table} "
                f"(dataset_id, table_id, last_updated) "
                f"VALUES ('{target_dataset}', '{target_table}', "
                f"'{last_updated_value}');")

    def update_last_updated_time_value_query(self, ref_dataset, ref_table,
                                             target_dataset, target_table,
                                             last_updated_value):
        return (f"UPDATE {ref_dataset}.{ref_table} "
                f"SET last_updated = '{last_updated_value}' "
                f"WHERE dataset_id = '{target_dataset}' "
                f"AND table_id = '{target_table}';")
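# Hypothetical usage sketch, not from the original source: wiring the operator
# into a DAG. All ids, names, and the two-field schema below are assumed
# placeholders; bq_table_schema follows the BigQuery schema_fields
# list-of-dicts format that run_load() expects.
#
# geotab_to_bq = GeotabToBigQueryOperator(
#     task_id='geotab_log_record_to_bq',
#     gcs_conn_id='google_cloud_default',
#     gcs_bucket_name='my-geotab-staging',
#     bq_project_name='my-project',
#     bq_dataset_name='geotab',
#     bq_table_name='log_record',
#     bq_table_schema=[
#         {'name': 'id', 'type': 'STRING', 'mode': 'REQUIRED'},
#         {'name': 'dateTime', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'},
#     ],
#     update_info_dataset_id='etl_meta',
#     update_info_table_id='table_update_info',
#     geotab_conn_id='geotab_default',
#     geotab_data_type_name='LogRecord',
#     partition_column='dateTime',
#     dag=dag)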