def prepare_data(self, df_ef, attr_list):
    """
    Data preparation: filter the data before the main processing step.
    Only keep probe_mac values that exist in the attr table and select the
    fields needed downstream.
    """
    # Normalize field names according to the source table type.
    if self.table_name == 'probe_type':
        df_ef = df_ef.rename(columns=probe_fields_change)
    elif self.table_name == 'wifi_type':
        df_ef = df_ef.rename(columns=wifi_fields_change)
    elif self.table_name == 'audit_type':
        df_ef = df_ef.rename(columns=audit_fields_change)
    elif self.table_name == 'im_type':
        df_ef = df_ef.rename(columns=im_fields_change)
    else:
        return 1

    df_ef = df_ef[df_ef['probe_mac'].isin(attr_list)]
    if df_ef.empty:
        logger.info("ETL->no alert data, task finished")
        return df_ef
    logger.info("ETL->alert message found")

    df_ef["probe_mac"] = df_ef["probe_mac"].map(udf_mac_o2h)
    # Legacy path: the probe_mac whitelist used to come from the attr table.
    # df_attr = df_attr[df_attr["attr_type_id"] == 5]
    # if len(df_attr) == 0: return len(df_attr)
    # df_attr = df_attr[["attr_value"]].rename(columns={"attr_value": "probe_mac"})
    # Only process probe_mac values present in the attr table.
    # df_ef = pd.merge(df_attr, df_ef, on="probe_mac", how="left")
    df_ef = df_ef[['time_on', 'probe_mac', 'place_code', 'collect_mac']]
    # df_ef = df_ef.dropna(how="any")
    # if len(df_ef) == 0: return len(df_ef)
    df_ef["place_code"] = df_ef["place_code"].map(udf_null_to_zero)
    df_ef = df_ef[df_ef["place_code"] != "0"]
    return df_ef
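# The helpers udf_mac_o2h and udf_null_to_zero are imported elsewhere in this module and
# are not shown here. A minimal sketch of the assumed behavior (an integer MAC turned
# into a colon-separated hex string, and null-like values normalized to the string "0"
# so they can be filtered out above); the bodies below are illustrative only:
#
# def udf_mac_o2h(mac):
#     """Assumed: convert an integer MAC value to a hex string like 'aa:bb:cc:dd:ee:ff'."""
#     hex_str = format(int(mac), '012x')
#     return ':'.join(hex_str[i:i + 2] for i in range(0, 12, 2))
#
# def udf_null_to_zero(value):
#     """Assumed: map None/NaN/empty values to '0', everything else to its string form."""
#     if value is None or value == "" or (isinstance(value, float) and value != value):
#         return "0"
#     return str(value)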
def update_attr(self, df):
    """
    Check whether new attr records need to be created.
    """
    if df.empty:
        return None
    attr_mac_list = list(set(df['probe_data'].tolist()))
    mac_in_clause = "(" + ",".join(f"'{mac}'" for mac in attr_mac_list) + ")"
    current_ts = int(time.time() * 1000)
    conn = self.db_conn_pool.getconn()
    cursor = conn.cursor()
    # Touch existing attr records and collect their ids.
    cursor.execute(
        f"UPDATE {ETL_SCHEMA}.attr SET update_time = {current_ts} "
        f"WHERE attr_type_id = 5 AND attr_value IN {mac_in_clause} RETURNING id, attr_value"
    )
    conn.commit()
    query_result = cursor.fetchall()
    exist_attr_df = None
    insert_attr_df = None
    if query_result:
        exist_attr_df = pd.DataFrame(query_result, columns=['id', 'probe_data'])
        # MACs that the UPDATE did not return still need to be inserted.
        insert_mac_list = list((collections.Counter(attr_mac_list)
                                - collections.Counter(exist_attr_df['probe_data'].tolist())).elements())
    else:
        insert_mac_list = attr_mac_list
    insert_df = pd.DataFrame(insert_mac_list, columns=['attr_value'])
    if not insert_df.empty:
        insert_df['attr_type_id'] = 5
        insert_df['create_time'] = current_ts
        insert_df['update_time'] = current_ts
        logger.info("attr data to insert:")
        logger.info(insert_df)
        insert_data = insert_df.to_dict(orient="records")
        insert_values = ",".join(
            f"({data['attr_type_id']},'{data['attr_value']}',{data['create_time']},{data['update_time']})"
            for data in insert_data
        )
        # Keep the id sequence in sync with the current max id before inserting.
        cursor.execute(
            f"SELECT setval('{ETL_SCHEMA}.attr_id_seq', (SELECT max(id) FROM {ETL_SCHEMA}.attr));")
        cursor.execute(
            f"INSERT INTO {ETL_SCHEMA}.attr (attr_type_id,attr_value,create_time,update_time) "
            f"VALUES {insert_values} RETURNING id, attr_value")
        conn.commit()
        insert_query_result = cursor.fetchall()
        insert_attr_df = pd.DataFrame(insert_query_result, columns=['id', 'probe_data'])
    self.db_conn_pool.putconn(conn)
    if exist_attr_df is not None and insert_attr_df is not None:
        return pd.concat([exist_attr_df, insert_attr_df])
    if exist_attr_df is not None:
        return exist_attr_df
    return insert_attr_df
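# The Counter subtraction above computes a multiset difference; in practice it just
# yields the MACs that the UPDATE ... RETURNING query did not find, i.e. the ones that
# still need an INSERT. A small illustration (values are made up):
#
#   attr_mac_list               = ['aa:bb', 'cc:dd', 'ee:ff']
#   exist_attr_df['probe_data'] = ['aa:bb']
#   list((collections.Counter(attr_mac_list) - collections.Counter(['aa:bb'])).elements())
#   -> ['cc:dd', 'ee:ff']   # only these still need attr rows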
def on_mq_entry(self, deliver_data, precision_span, time_stack_span):
    transfer_df = pd.DataFrame(deliver_data.get('content', None))
    table_name = deliver_data.get('dataSrc', None)
    logger.info("ETL->datasync start")
    start_time = time.time()
    try:
        AnalysisEleFence().etl_real_time(
            table_name,
            transfer_df,
            precision_span,
            time_stack_span,
        )
    except Exception as e:
        logger.exception(f"ETL->error occurred {e}")
    end_time = time.time()
    logger.info(f"ETL->elapsed {decimal_four_place(end_time - start_time)} seconds")
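# decimal_four_place is defined elsewhere; from its name and the log message above it is
# assumed to round the elapsed seconds to four decimal places. A minimal sketch of that
# assumption:
#
# def decimal_four_place(value):
#     return round(value, 4)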
def update_attr_record(self, df_track_append, df_attr, track_id):
    # df_track_append [place_code, probe_time, probe_data, flag, datasource_table_name,
    #                  create_time, datasource_id, probe_device_id]
    # df_track        [id, probe_time, place_code, create_time, datasource_id,
    #                  datasource_table_name, base_person_id, probe_device_id]
    # Too many columns otherwise; they could not be consumed downstream.
    # Legacy path: df_attr used to be filtered and renamed here.
    # df_attr = df_attr[df_attr["attr_type_id"] == 5]
    # df_attr = df_attr[["id", "attr_value"]].rename(columns={"attr_value": "probe_data"})
    df_track_append = pd.merge(df_track_append, df_attr, on=["probe_data"], how="left")
    df_track_append = df_track_append.rename(columns={"id": "attr_id"})
    df_track_append["create_time"] = int(time.time() * 1000)
    df_track_append = df_track_append[["track_id", "attr_id", "create_time"]]
    logger.info("attr_record data to insert:")
    logger.info(df_track_append)
    logger.info("ETL->start to insert")
    self.db_opr.pg_insert_return_id(
        self.attr_record_table,
        ["track_id", "attr_id", "create_time"],
        pandas_dataframe_to_string_sql_insert_values(df_track_append))
    logger.info("ETL->insert finish")
    # Notify downstream consumers that new track records are available.
    publisher = RabbitPublisher(
        exchange=RABBIT_MQ_EXCHANGE,
        route_key=RABBIT_MQ_PERSON_TRACK_ROUTE_KEY,
        queue=RABBIT_MQ_PERSON_TRACK_QUEUE
    )
    publisher.publish(json.dumps(track_id))
def etl_real_time(self, table_name, df_ef, precision_span, time_stack_span):
    """
    Clean all probe_mac records and write them to the database.
    """
    self.table_name = table_name
    etl_job = AnalysisEleFence()
    clue_rule_df = self.db_opr.read_clue_rule_data()
    if clue_rule_df.empty:
        logger.info("ETL->alert rules are empty, nothing to process")
        return
    attr_list = clue_rule_df['clue_value'].tolist()
    # Legacy path: the attr whitelist used to be read from PostgreSQL.
    # conn_attr, df_attr = self.db_opr.read_pgsql_to_pandas_dataframe(self.PGSQL_214, self.PGSQL_214_ATTR)
    # df_attr = self.db_opr.read_pgsql_to_pandas_dataframe(self.PGSQL_PROP, self.PGSQL_ATTR)
    # logger.info("preprocess : read_sql success")
    # df_ef = etl_job.prepare_data(df_ef, df_attr)
    # logger.info("preprocess : prepare_data success")
    df_ef = etl_job.prepare_data(df_ef, attr_list)
    if isinstance(df_ef, int):
        return 0
    df_ef["place_code"] = df_ef["place_code"].astype(str)
    df_ef["place_code"] = df_ef["place_code"].map(udf_string_float_to_string)
    df_ef_dev = df_ef[["place_code", "collect_mac"]].drop_duplicates(subset=["place_code"])
    # ele[probe_mac, place_code, time_start, time_end, count] -- etl_2_track
    df_ef = etl_job.etl_2_track(df_ef, precision_span, time_stack_span)
    logger.info("ETL->etl_2_track success")
    if isinstance(df_ef, int):
        return 0
    # Old track data is currently read from the atrack table, staged in atrack_tmp and
    # written back to atrack; this should eventually be moved to HDFS.
    df_track_source = etl_job.union_track_2_hdfs(df_ef, time_stack_span)
    logger.info("ETL->union_track_2_hdfs success")
    if isinstance(df_track_source, int):
        return 0
    df_track_output = etl_job.standardize_etl_data(df_track_source)
    logger.info("ETL->output success")
    df_track_append, track_id = etl_job.update_track_table(df_track_output, df_ef_dev)
    logger.info("ETL->update_track_table success")
    if isinstance(df_track_append, int):
        return 0
    attr_df = self.db_opr.update_attr(df_track_append)
    etl_job.update_attr_record(df_track_append, attr_df, track_id)
    logger.info("ETL->update_attr_record success")
def update_track_table(self, df_track_output, df_ef_dev):
    """
    Perform the update: determine which rows are new and append them to the track table.
    """
    if self.hdfs_opr.check_path_is_exist(self.HDFS_ST_TRACK_INFO_SUB):
        try:
            df_track_old = self.hdfs_opr.read_csv_to_df(self.HDFS_ST_TRACK_INFO_SUB)
            df_track_old = df_track_old[
                ["probe_time", "probe_data", "place_code", "flag", "datasource_table_name"]]
        except Exception:
            # Unreadable file: drop it and start from an empty frame.
            self.hdfs_opr.delete_path(path=self.HDFS_ST_TRACK_INFO_SUB)
            df_track_old = pd.DataFrame(
                columns=['probe_time', 'probe_data', 'place_code', 'flag', 'datasource_table_name'])
    else:
        logger.info("ETL->st_track_info ready to create")
        df_track_old = pd.DataFrame(
            columns=['probe_time', 'probe_data', 'place_code', 'flag', 'datasource_table_name'])
    logger.info("ETL->update_track client init success")
    df_track_old = df_track_old[df_track_old["datasource_table_name"] == "probe_type"]
    # Append the old rows twice so every row already present is duplicated and removed by
    # drop_duplicates(keep=False); only genuinely new rows remain.
    df_track_append = df_track_output.append(df_track_old).append(df_track_old).drop_duplicates(keep=False)
    df_track_append["create_time"] = int(time.time() * 1000)
    df_track_append["datasource_id"] = range(len(df_track_append))
    df_track_append["probe_time"] = df_track_append["probe_time"].astype("long").map(lambda x: x * 1000)
    df_ef_dev["place_code"] = df_ef_dev["place_code"].astype(str)
    df_ef_dev["collect_mac"] = df_ef_dev["collect_mac"].astype(np.int64)
    df_ef_dev["collect_mac"] = df_ef_dev["collect_mac"].map(udf_mac_o2h)
    df_track_append['place_code'] = df_track_append['place_code'].astype(str)
    df_track_append = pd.merge(df_track_append, df_ef_dev, on="place_code", how="left")
    # conn_dev, df_dev = self.db_opr.read_pgsql_to_pandas_dataframe(self.PGSQL_214, self.PGSQL_214_DEV)
    logger.info("ETL->ready to exec read_pgsql_to_pandas_dataframe")
    df_dev = self.db_opr.read_pgsql_to_pandas_dataframe(self.device_table_name)
    logger.info("ETL->exec read_pgsql_to_pandas_dataframe finish")
    df_track_append = pd.merge(df_track_append, df_dev[["id", "collect_mac"]], how="left", on="collect_mac")
    df_track_append = df_track_append.rename(columns={"id": "probe_device_id"})
    df_probe_data = df_track_append[["datasource_id", "probe_data"]]
    if len(df_track_append) == 0:
        return 0, None
    logger.info("ETL->st_track_info ready to append")
    if self.hdfs_opr.check_path_is_exist(self.HDFS_ST_TRACK_INFO_SUB):
        self.hdfs_opr.delete_path(path=self.HDFS_ST_TRACK_INFO_SUB)
    self.hdfs_opr.push_csv_data(
        hdfs_dir=self.HDFS_TRACK_PATH,
        filename=self.HDFS_TRACK_INFO_FILENAME,
        df=df_track_append.append(df_track_old)
    )
    logger.info("ETL->st_track_info append success")
    df_track_append = df_track_append[
        ["probe_time", "place_code", "create_time", "datasource_id", "datasource_table_name",
         "probe_device_id"]]
    df_track_append["datasource_table_name"] = df_track_append["datasource_table_name"].map(
        lambda x: "'" + x + "'")
    df_track_append["place_code"] = df_track_append["place_code"].map(lambda x: "'" + x + "'")
    df_track_append = df_track_append.where(df_track_append.notnull(), None)
    df_track_append = df_track_append[~(df_track_append['probe_device_id'].isnull())].reset_index(drop=True)
    logger.info("data to be written to track:")
    logger.info(df_track_append)
    if df_track_append.empty:
        return 0, None
    id_list = self.db_opr.pg_insert_return_id(
        self.track_table_name,
        ["probe_time", "place_code", "create_time", "datasource_id", "datasource_table_name",
         "probe_device_id"],
        pandas_dataframe_to_string_sql_insert_values(df_track_append)
    )
    logger.info("ETL->df_track_append_insert_sql success")
    pd_id_list = pd.DataFrame(id_list, columns=["track_id"])
    df_track_append["track_id"] = pd_id_list["track_id"]
    df_track_append = pd.merge(df_track_append, df_probe_data, on=["datasource_id"], how="left")
    return df_track_append, pd_id_list['track_id'].tolist()
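# pandas_dataframe_to_string_sql_insert_values is defined elsewhere. The quoting done
# above (wrapping place_code and datasource_table_name in single quotes) suggests it
# simply joins the raw cell values of each row into one VALUES string, with the column
# list passed separately to pg_insert_return_id. A minimal sketch of that assumption:
#
# def pandas_dataframe_to_string_sql_insert_values(df):
#     rows = (",".join("NULL" if v is None else str(v) for v in row)
#             for row in df.itertuples(index=False))
#     return ",".join(f"({row})" for row in rows)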
def union_track_2_hdfs(self, df_ef, time_stack_span):
    """
    Union the old track data in HDFS with the new track data, merge the tracks,
    and write the merged result back to HDFS.
    """
    df_ef["place_code"] = df_ef["place_code"].astype(str)
    # ele[probe_mac, place_code, time_start, time_end, count]
    # If the old track file exists, pull it in so tracks can be merged below.
    if self.hdfs_opr.check_path_is_exist(self.HDFS_ST_TRACK_SUB):
        # [probe_mac, place_code, time_start, time_end, count]
        df_track = self.hdfs_opr.read_csv_to_df(self.HDFS_ST_TRACK_SUB)
        df_track["time_start"] = df_track["time_start"].astype("long")
        df_track["time_end"] = df_track["time_end"].astype("long")
        df_track["count"] = df_track["count"].astype("long")
        df_ef = pd.concat([df_ef, df_track], axis=0)
    else:
        logger.info("ETL->client ready to create")
        self.hdfs_opr.push_csv_data(
            hdfs_dir=self.HDFS_TRACK_PATH,
            filename=self.HDFS_TRACK_FILENAME,
            df=df_ef
        )
        logger.info("ETL->new data success")
        return df_ef
    logger.info("ETL->hdfs connect success")
    df_ef = df_ef.sort_values("time_start").reset_index().drop(["index"], axis=1)
    # Pack (time_start, time_end, count) into one "[start,end,count]" string per row.
    df_ef["time_start"] = df_ef["time_start"].astype(str)
    df_ef["time_end"] = df_ef["time_end"].astype(str)
    df_ef["count"] = df_ef["count"].astype(str)
    df_ef["time_on"] = df_ef["time_start"] + ',' + df_ef["time_end"] + ',' + df_ef["count"]
    df_ef = df_ef[["time_on", "probe_mac", "place_code"]]
    df_ef["time_on"] = df_ef["time_on"].map(lambda x: "[" + x + "]")
    df_ef = df_ef.groupby(["probe_mac", "place_code"])["time_on"].apply(
        lambda time_on: [','.join(time_on)]).reset_index().rename(columns={"time_on": "time_list"})
    # ele[probe_mac, place_code, time_list[(time_start, time_end, count), ...]]
    df_ef["time_list"] = df_ef.apply(udf_track_union, axis=1, args=(time_stack_span,))
    df_ef["time_list"] = df_ef["time_list"].map(lambda x: str(x.replace(" ", "")))
    # Explode the merged time windows back into one row per window.
    df_ef = df_ef.drop(["time_list"], axis=1).join(
        df_ef["time_list"].str.split("\\],\\[", expand=True).stack().reset_index(level=1, drop=True).rename(
            "time_list"))
    df_ef = df_ef.reset_index()
    df_ef = df_ef.join(df_ef["time_list"].str.split(',', expand=True)).rename(
        columns={0: "time_start", 1: "time_end", 2: "count"}).drop(["time_list"], axis=1)
    if len(df_ef) == 0:
        return len(df_ef)
    # Replace the old track file with the merged result.
    self.hdfs_opr.delete_path(path=self.HDFS_ST_TRACK_SUB)
    self.hdfs_opr.push_csv_data(
        hdfs_dir=self.HDFS_TRACK_PATH,
        filename=self.HDFS_TRACK_FILENAME,
        df=df_ef
    )
    df_ef = df_ef[["probe_mac", "place_code", "time_start", "time_end"]]
    df_track = df_track[["probe_mac", "place_code", "time_start", "time_end"]]
    df_track["probe_mac"] = df_track["probe_mac"].astype(str)
    df_track["place_code"] = df_track["place_code"].astype(str)
    df_track["time_start"] = df_track["time_start"].astype(str)
    df_track["time_end"] = df_track["time_end"].astype(str)
    # Append the old windows twice so unchanged windows are dropped and only new or
    # updated windows remain.
    df_ef = df_ef.append(df_track).append(df_track).drop_duplicates(
        subset=["probe_mac", "place_code", "time_start", "time_end"], keep=False)
    if len(df_ef) == 0:
        return 0
    return df_ef
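# udf_track_union is the track-merge UDF imported elsewhere. It receives a row whose
# time_list holds a single string of "[start,end,count]" groups and, judging from the
# split on "],[" afterwards, returns the same format with windows closer than
# time_stack_span merged. A minimal sketch of that assumption (names and parsing
# details are illustrative only):
#
# def udf_track_union(row, time_stack_span):
#     windows = [list(map(int, part.strip("[]").split(",")))
#                for part in row["time_list"][0].split("],[")]
#     windows.sort(key=lambda w: w[0])
#     merged = [windows[0]]
#     for start, end, count in windows[1:]:
#         last = merged[-1]
#         if start - last[1] <= time_stack_span:   # close enough: extend previous window
#             last[1] = max(last[1], end)
#             last[2] += count
#         else:
#             merged.append([start, end, count])
#     return ",".join(f"[{s},{e},{c}]" for s, e, c in merged)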