import csv
import hashlib
import json
import logging
from pathlib import Path

import numpy as np
import pandas as pd
import pendulum

# Airflow 1.10-style trigger API assumed for the trigger_* callables below.
from airflow.api.common.experimental.trigger_dag import trigger_dag

# Session, ETL, Fact, ETLStatus and the AIRFLOW_RAW / AIRFLOW_IMPORT / RAW_GLOB
# constants are project-specific; import them from wherever this code base defines them.


def trigger_consolidate_callable(**kwargs):
    """ trigger consolidation tasks. """
    task_instance = kwargs["ti"]
    tables = task_instance.xcom_pull(key="tables", task_ids="sense")
    triggered = 0
    session = Session()
    for table in tables:
        table_task_dict = task_instance.xcom_pull(key=table, task_ids="sense")
        table_config = table_task_dict["config"]
        date = table_config["date"]
        table_name = table_config["table_name"]
        run_id = table_task_dict["run_id"]
        if ETL.can_process("consolidation", table_name, date, session):
            ETL.commit_new("consolidation", table_name, date, session)
            logging.info(f"Triggering {run_id}.")
            triggered += 1
            trigger_dag(
                "consolidate",
                run_id,
                conf=json.dumps(table_config),
                execution_date=pendulum.now(),
                replace_microseconds=False,
            )
    logging.info(f"A total of {triggered} consolidate tasks triggered.")
    session.close()
def trigger_etl_callable(**kwargs):
    """ trigger etl tasks. """
    task_instance = kwargs["ti"]
    files = task_instance.xcom_pull(key="files", task_ids="sense")
    session = Session()
    for file_name in files:
        file_task_dict = task_instance.xcom_pull(key=file_name, task_ids="sense")
        file_config = file_task_dict["config"]
        file_name = file_config["file_name"]
        pulltime = file_config["pulltime"]
        run_id = file_task_dict["run_id"]
        if ETL.can_process("pull_file", file_name, pulltime, session):
            ETL.commit_new("pull_file", file_name, pulltime, session)
            logging.info(f"Triggering {run_id}.")
            trigger_dag(
                "etl",
                run_id,
                conf=json.dumps(file_config),
                execution_date=pendulum.now(),
                replace_microseconds=False,
            )
    session.close()
def sense_callable(**kwargs): """ look for tables to consolidate. """ task_instance = kwargs["ti"] session = Session() dates = ETL.ready_for_consolidation(session) queue = [] for date in dates: table_name = f"fact.session_{date.format('YYYY_MM_DD')}" hash = hashlib.sha1() hash.update(str(pendulum.now()).encode("utf-8")) hex = hash.hexdigest() table_task_dict = { "config": { "date": str(date), "table_name": table_name, }, "run_id": f"{hex[:10]}-consolidation-{date}", } if ETL.can_process("consolidation", table_name, date, session): queue.append(table_name) task_instance.xcom_push(table_name, table_task_dict) task_instance.xcom_push("tables", queue) logging.info(f"Queued tables: {queue}")
def success_callable(**kwargs):
    """ mark etl as completed. """
    task_instance = kwargs["ti"]
    file_config = task_instance.xcom_pull(key="config", task_ids="init")
    file_name = file_config["file_name"]
    pulltime = file_config["pulltime"]
    session = Session()
    ETL.set_status("pull_file", file_name, pulltime, "completed", session)
    session.close()
def fail_callable(**kwargs):
    """ quarantine file if any previous etl task fails. """
    task_instance = kwargs["ti"]
    file_config = task_instance.xcom_pull(key="config", task_ids="init")
    file_name = file_config["file_name"]
    pulltime = file_config["pulltime"]
    session = Session()
    ETL.set_status("pull_file", file_name, pulltime, "quarantine", session)
    session.close()
def consolidate_callable(**kwargs):
    """ consolidate session table. """
    task_instance = kwargs["ti"]
    table_config = task_instance.xcom_pull(key="config", task_ids="init")
    date = table_config["date"]
    date = pendulum.from_format(date, "YYYY-MM-DD[T]HH:mm:ss").naive()
    table_name = table_config["table_name"]
    session = Session()
    try:
        Fact.consolidate(date)
        ETL.set_status("consolidation", table_name, date, "completed", session)
        session.close()
    except Exception as e:
        ETL.set_status("consolidation", table_name, date, "quarantine", session)
        session.close()
        raise e
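# The callables above (and the etl callables below) pull key="config" from a task with
# task_ids="init", which is not shown in this section. A hypothetical sketch of such an
# init callable, assuming the triggering conf (passed as JSON by the trigger callables)
# is exposed on the dag_run object:
def init_callable(**kwargs):
    """ republish the triggering conf as the "config" XCom for downstream tasks. """
    task_instance = kwargs["ti"]
    conf = kwargs["dag_run"].conf
    if isinstance(conf, str):
        # defensive: handle a conf that arrives as a raw JSON string
        conf = json.loads(conf)
    task_instance.xcom_push("config", conf)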
def sense_callable(**kwargs): """ look for files to process. """ task_instance = kwargs["ti"] logging.info( f"Looking for files with the following pattern: {AIRFLOW_RAW.resolve()}/{RAW_GLOB}" ) files = list(AIRFLOW_RAW.glob(RAW_GLOB)) session = Session() queue = [] for file_name in files: file_config = {} file_config["file_name"] = str(file_name) file_config["file_stem"] = file_name.stem file_config["pulltime"] = str( pendulum.from_format(file_name.stem, "YYYY_MM_DD_HH_mm_ss[-v2]").naive()) hash = hashlib.sha1() hash.update(str(pendulum.now()).encode("utf-8")) hex = hash.hexdigest() file_config["extract_table"] = f"etl.x{hex}" file_config["load_table"] = f"etl.l{hex}" run_id = f"{hex[:10]}-{file_name}" file_name = file_config["file_name"] pulltime = file_config["pulltime"] file_task_dict = {"config": file_config, "run_id": run_id} if ETL.can_process("pull_file", file_name, pulltime, session): queue.append(file_name) task_instance.xcom_push(file_name, file_task_dict) task_instance.xcom_push("files", queue) logging.info(f"Queued files: {queue}") session.close()
def ingest_callable(**kwargs):
    """ ingest preprocessed wifi log files to database. """
    task_instance = kwargs["ti"]
    file_config = task_instance.xcom_pull(key="config", task_ids="init")
    file_stem = file_config["file_stem"]
    extract_table_name = file_config["extract_table"]
    load_table_name = file_config["load_table"]
    logging.info(f"Looping through '{file_stem}*.csv'")
    ingest_errors = []
    for file_path in AIRFLOW_IMPORT.glob(f"{file_stem}*.csv"):
        logging.info(f"Ingesting {file_path}.")
        # the preprocessed stem is "<pulltime stem>_<YYYY_MM_DD>"; slice off the
        # 23-character prefix to recover the group date
        date = pendulum.from_format(file_path.stem[23:], "YYYY_MM_DD").naive()
        session = Session()
        if ETL.can_process("session_file", file_path, date, session):
            try:
                ETL.commit_new("session_file", file_path, date, session)
                Fact.etl(date, file_path.name, extract_table_name, load_table_name)
                ETL.set_status("session_file", file_path, date, "completed", session)
            except Exception:
                ingest_errors.append(file_path)
                ETL.set_status("session_file", file_path, date, "quarantine", session)
        session.close()
    if len(ingest_errors) > 0:
        logging.info(
            f"The following files could not be ingested: {ingest_errors}.")
        raise Exception(
            f"A total of {len(ingest_errors)} files could not be ingested. Failing DAG run"
        )
def preprocess_callable(**kwargs):
    """ preprocess raw wifi log files. """
    task_instance = kwargs["ti"]
    file_config = task_instance.xcom_pull(key="config", task_ids="init")
    file_name = file_config["file_name"]
    file_stem = file_config["file_stem"]
    columns = [
        "username",
        "macaddress",
        "protocol",
        "apname",
        "location",
        "ssid",
        "sessionstarttime",
        "sessionendtime",
        "pulltime",
        "rssi",
    ]
    df = pd.read_csv(file_name, header=None, sep="\t", quoting=csv.QUOTE_NONE)
    df.columns = columns
    df.fillna("N/A", inplace=True)
    df["sessionstarttime"] = pd.to_datetime(df["sessionstarttime"], unit="ms").dt.floor("s")
    df["sessionendtime"] = pd.to_datetime(df["sessionendtime"], unit="ms").dt.floor("s")
    df["pulltime"] = pd.to_datetime(df["pulltime"], unit="s")

    # the data is collected in GMT+00:00, so shift all timestamps by the 8-hour offset
    timezone_offset = pd.Timedelta("8 hours")
    df["sessionstarttime"] = df["sessionstarttime"] + timezone_offset
    df["sessionendtime"] = df["sessionendtime"] + timezone_offset
    df["pulltime"] = df["pulltime"] + timezone_offset

    logging.info(f"Original file, number of rows: {len(df)}.")
    df = df[~df.duplicated()]
    logging.info(f"After removal of duplicates, number of rows: {len(df)}.")

    # assign null values to missing end times (encoded as 2100-01-01 in the raw data)
    missing_time = pd.to_datetime("2100-01-01 00:00:00") + timezone_offset
    df.loc[df.sessionendtime == missing_time, "sessionendtime"] = np.nan

    # save each date in a separate file
    session = Session()
    for date, group in df.groupby(
            df["sessionstarttime"].map(lambda x: x.date())):
        date_str = date.strftime("%Y_%m_%d")
        group_file_path = Path(AIRFLOW_IMPORT / f"{file_stem}_{date_str}.csv")
        if ETL.can_process("session_file", f"{group_file_path}", date_str, session):
            group.loc[:, "sessionstarttime"] = group[
                "sessionstarttime"].dt.strftime("%Y-%m-%d %H:%M:%S")
            group.loc[:, "sessionendtime"] = group["sessionendtime"].dt.strftime(
                "%Y-%m-%d %H:%M:%S")
            group.loc[group.sessionendtime == "NaT", "sessionendtime"] = ""
            group.loc[:, "pulltime"] = group["pulltime"].dt.strftime(
                "%Y-%m-%d %H:%M:%S")
            group.to_csv(group_file_path, index=False)
            logging.info(
                f"Preprocessed group, {group_file_path}:\n{group.head()}")
    session.close()
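# One possible arrangement of the "etl" DAG that these callables suggest: init publishes
# the config, preprocess and ingest do the work, success marks the pull as completed,
# and fail quarantines the file if anything upstream fails. Every task_id other than
# "init", the trigger_rule on the fail task, and the dependency layout are assumptions;
# this is a sketch, not the project's actual DAG definition.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

with DAG(
        "etl",
        schedule_interval=None,  # triggered externally by trigger_etl_callable
        start_date=datetime(2020, 4, 1),
        catchup=False,
) as etl_dag:
    init = PythonOperator(task_id="init", python_callable=init_callable,
                          provide_context=True)
    preprocess = PythonOperator(task_id="preprocess", python_callable=preprocess_callable,
                                provide_context=True)
    ingest = PythonOperator(task_id="ingest", python_callable=ingest_callable,
                            provide_context=True)
    success = PythonOperator(task_id="success", python_callable=success_callable,
                             provide_context=True)
    fail = PythonOperator(task_id="fail", python_callable=fail_callable,
                          provide_context=True, trigger_rule="one_failed")
    init >> preprocess >> ingest >> success
    [preprocess, ingest] >> fail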
# Test-suite code below; assumes pytest and the project fixtures (session, task_instance).
def test_etl_states(session, task_instance):
    ti = task_instance(AIRFLOW_RAW / "2020_04_01_00_00_00-v2.tsv")
    file_config = ti.xcom_pull("config", "init")
    file_name = file_config["file_name"]
    pulltime = file_config["pulltime"]

    # a new task can be processed until it is committed as ongoing
    assert ETL.can_process("pull_file", file_name, pulltime, session)
    ETL.commit_new("pull_file", file_name, pulltime, session)
    assert not ETL.can_process("pull_file", file_name, pulltime, session)
    q = ETL.get_most_recent("pull_file", file_name, pulltime, session)
    assert q.task_type == "pull_file"
    assert q.task_name == file_name
    assert pendulum.instance(q.task_timestamp) == pendulum.from_format(
        pulltime, "YYYY-MM-DD[T]HH:mm:ss")
    assert q.status == ETLStatus.ongoing

    # a quarantined task can be processed again
    ETL.set_status("pull_file", file_name, pulltime, "quarantine", session)
    assert ETL.can_process("pull_file", file_name, pulltime, session)
    q = ETL.get_most_recent("pull_file", file_name, pulltime, session)
    assert q.task_type == "pull_file"
    assert q.task_name == file_name
    assert pendulum.instance(q.task_timestamp) == pendulum.from_format(
        pulltime, "YYYY-MM-DD[T]HH:mm:ss")
    assert q.status == ETLStatus.quarantine

    # a completed task can never be processed again
    ETL.set_status("pull_file", file_name, pulltime, "completed", session)
    assert not ETL.can_process("pull_file", file_name, pulltime, session)
    q = ETL.get_most_recent("pull_file", file_name, pulltime, session)
    assert q.task_type == "pull_file"
    assert q.task_name == file_name
    assert pendulum.instance(q.task_timestamp) == pendulum.from_format(
        pulltime, "YYYY-MM-DD[T]HH:mm:ss")
    assert q.status == ETLStatus.completed

    # neither re-committing nor re-statusing a completed task is allowed
    with pytest.raises(Exception, match="Once a task is completed"):
        ETL.commit_new("pull_file", file_name, pulltime, session)
    with pytest.raises(Exception, match="Once a task is completed"):
        ETL.set_status("pull_file", file_name, pulltime, "quarantine", session)
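# For reference, a minimal sketch of the ETLStatus values the assertions above rely on.
# Only the three states exercised by the test are shown; the real project definition
# (for example a database-backed enum) may differ:
import enum


class ETLStatus(enum.Enum):
    ongoing = "ongoing"
    quarantine = "quarantine"
    completed = "completed"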
def _mock_etl(task_type, task_name, task_timestamp, status):
    # etl_entries and session are closed over from the enclosing test/fixture:
    # record the call, then delegate to the real status update
    etl_entries.append((task_type, task_name, task_timestamp, status))
    ETL.set_status(task_type, task_name, task_timestamp, status, session)