Example #1
def trigger_consolidate_callable(**kwargs):
    """ trigger consolidation tasks. """

    task_instance = kwargs["ti"]
    tables = task_instance.xcom_pull(key="tables", task_ids="sense")
    triggered = 0

    session = Session()

    for table in tables:
        table_task_dict = task_instance.xcom_pull(key=table, task_ids="sense")
        table_config = table_task_dict["config"]
        date = table_config["date"]
        table_name = table_config["table_name"]
        run_id = table_task_dict["run_id"]
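        # ETL.can_process gates re-runs: ongoing/completed entries are skipped,
        # quarantined ones may be retried (see the state tests in Example #10)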
        if ETL.can_process("consolidation", table_name, date, session):
            ETL.commit_new("consolidation", table_name, date, session)
            logging.info(f"Triggering {run_id}.")
            triggered += 1
            trigger_dag(
                "consolidate",
                run_id,
                conf=json.dumps(table_config),
                execution_date=pendulum.now(),
                replace_microseconds=False,
            )

    logging.info(f"A total of {triggered} consolidate tasks triggered.")

    session.close()
Example #2
def trigger_etl_callable(**kwargs):
    """ trigger etl tasks. """

    task_instance = kwargs["ti"]
    files = task_instance.xcom_pull(key="files", task_ids="sense")

    session = Session()

    for file_name in files:
        file_task_dict = task_instance.xcom_pull(key=file_name,
                                                 task_ids="sense")
        file_config = file_task_dict["config"]
        file_name = file_config["file_name"]
        pulltime = file_config["pulltime"]
        run_id = file_task_dict["run_id"]
        if ETL.can_process("pull_file", file_name, pulltime, session):
            ETL.commit_new("pull_file", file_name, pulltime, session)
            logging.info(f"Triggering {run_id}.")
            trigger_dag(
                "etl",
                run_id,
                conf=json.dumps(file_config),
                execution_date=pendulum.now(),
                replace_microseconds=False,
            )

    session.close()
Example #3
def sense_callable(**kwargs):
    """ look for tables to consolidate. """

    task_instance = kwargs["ti"]
    session = Session()
    dates = ETL.ready_for_consolidation(session)
    queue = []

    for date in dates:
        table_name = f"fact.session_{date.format('YYYY_MM_DD')}"
        hash = hashlib.sha1()
        hash.update(str(pendulum.now()).encode("utf-8"))
        hex = hash.hexdigest()
        table_task_dict = {
            "config": {
                "date": str(date),
                "table_name": table_name,
            },
            "run_id": f"{hex[:10]}-consolidation-{date}",
        }
        if ETL.can_process("consolidation", table_name, date, session):
            queue.append(table_name)
            task_instance.xcom_push(table_name, table_task_dict)

    task_instance.xcom_push("tables", queue)
    logging.info(f"Queued tables: {queue}")
Example #4
def success_callable(**kwargs):
    """ mark etl as completed. """

    task_instance = kwargs["ti"]
    file_config = task_instance.xcom_pull(key="config", task_ids="init")

    file_name = file_config["file_name"]
    pulltime = file_config["pulltime"]

    session = Session()
    ETL.set_status("pull_file", file_name, pulltime, "completed", session)
    session.close()
Example #5
def fail_callable(**kwargs):
    """ quarantine file if any previous etl task fails. """

    task_instance = kwargs["ti"]
    file_config = task_instance.xcom_pull(key="config", task_ids="init")

    file_name = file_config["file_name"]
    pulltime = file_config["pulltime"]

    session = Session()
    ETL.set_status("pull_file", file_name, pulltime, "quarantine", session)
    session.close()
Example #6
def consolidate_callable(**kwargs):
    """ consolidate session table. """

    task_instance = kwargs["ti"]
    table_config = task_instance.xcom_pull(key="config", task_ids="init")

    date = table_config["date"]
    date = pendulum.from_format(date, "YYYY-MM-DD[T]HH:mm:ss").naive()

    table_name = table_config["table_name"]

    session = Session()
    try:
        Fact.consolidate(date)
        ETL.set_status("consolidation", table_name, date, "completed", session)
        session.close()
    except Exception as e:
        ETL.set_status("consolidation", table_name, date, "quarantine",
                       session)
        session.close()
        raise e
Example #7
def sense_callable(**kwargs):
    """ look for files to process. """

    task_instance = kwargs["ti"]

    logging.info(
        f"Looking for files with the following pattern: {AIRFLOW_RAW.resolve()}/{RAW_GLOB}"
    )
    files = list(AIRFLOW_RAW.glob(RAW_GLOB))
    session = Session()
    queue = []

    for file_name in files:

        file_config = {}

        file_config["file_name"] = str(file_name)
        file_config["file_stem"] = file_name.stem
        file_config["pulltime"] = str(
            pendulum.from_format(file_name.stem,
                                 "YYYY_MM_DD_HH_mm_ss[-v2]").naive())

        # a SHA-1 of the current timestamp serves as a unique id for this
        # file's staging tables and its DAG run id
        hash = hashlib.sha1()
        hash.update(str(pendulum.now()).encode("utf-8"))
        hex = hash.hexdigest()

        file_config["extract_table"] = f"etl.x{hex}"
        file_config["load_table"] = f"etl.l{hex}"

        run_id = f"{hex[:10]}-{file_name}"

        file_name = file_config["file_name"]
        pulltime = file_config["pulltime"]

        file_task_dict = {"config": file_config, "run_id": run_id}

        if ETL.can_process("pull_file", file_name, pulltime, session):
            queue.append(file_name)
            task_instance.xcom_push(file_name, file_task_dict)

    task_instance.xcom_push("files", queue)
    logging.info(f"Queued files: {queue}")

    session.close()
Example #8
def ingest_callable(**kwargs):
    """ ingest preprocessed wifi log files to database. """

    task_instance = kwargs["ti"]
    file_config = task_instance.xcom_pull(key="config", task_ids="init")

    file_stem = file_config["file_stem"]
    extract_table_name = file_config["extract_table"]
    load_table_name = file_config["load_table"]

    logging.info(f"Looping through '{file_stem}*.csv'")

    ingest_errors = []

    for file_path in AIRFLOW_IMPORT.glob(f"{file_stem}*.csv"):
        logging.info(f"Ingesting {file_path}.")
        date = pendulum.from_format(file_path.stem[23:], "YYYY_MM_DD").naive()
        session = Session()
        if ETL.can_process("session_file", file_path, date, session):
            try:
                ETL.commit_new("session_file", file_path, date, session)
                Fact.etl(date, file_path.name, extract_table_name,
                         load_table_name)
                ETL.set_status("session_file", file_path, date, "completed",
                               session)
                session.close()
            except Exception:
                ingest_errors.append(file_path)
                ETL.set_status("session_file", file_path, date, "quarantine",
                               session)
                session.close()

    if len(ingest_errors) > 0:
        logging.info(
            f"The following files could not be ingested: {ingest_errors}.")
        raise Exception(
            f"A total of {len(ingest_errors)} files could not be ingested. Failing DAG run"
        )
Example #9
def preprocess_callable(**kwargs):
    """ preprocess raw wifi log files. """

    task_instance = kwargs["ti"]
    file_config = task_instance.xcom_pull(key="config", task_ids="init")
    file_name = file_config["file_name"]
    file_stem = file_config["file_stem"]

    columns = [
        "username",
        "macaddress",
        "protocol",
        "apname",
        "location",
        "ssid",
        "sessionstarttime",
        "sessionendtime",
        "pulltime",
        "rssi",
    ]

    df = pd.read_csv(file_name,
                     header=None,
                     sep="\t",
                     quoting=csv.QUOTE_NONE)
    df.columns = columns
    df.fillna("N/A", inplace=True)

    df["sessionstarttime"] = pd.to_datetime(df["sessionstarttime"],
                                            unit="ms").dt.floor("s")
    df["sessionendtime"] = pd.to_datetime(df["sessionendtime"],
                                          unit="ms").dt.floor("s")
    df["pulltime"] = pd.to_datetime(df["pulltime"], unit="s")

    # we add the timezone offset as the data is collected in GMT+00:00
    timezone_offset = pd.Timedelta("8 hours")
    df["sessionstarttime"] = df["sessionstarttime"] + timezone_offset
    df["sessionendtime"] = df["sessionendtime"] + timezone_offset
    df["pulltime"] = df["pulltime"] + timezone_offset

    logging.info(f"Original file, number of rows: {len(df)}.")
    df = df[~df.duplicated()]
    logging.info(f"After removal of duplicates, number of rows: {len(df)}.")

    # assign null values to missing end time
    missing_time = pd.to_datetime("2100-01-01 00:00:00") + timezone_offset
    df.loc[df.sessionendtime == missing_time, "sessionendtime"] = np.nan

    # save each date in a separate file
    session = Session()
    for date, group in df.groupby(
            df["sessionstarttime"].map(lambda x: x.date())):

        date_str = date.strftime("%Y_%m_%d")
        group_file_path = AIRFLOW_IMPORT / f"{file_stem}_{date_str}.csv"

        if ETL.can_process("session_file", f"{group_file_path}", date_str,
                           session):
            group.loc[:, "sessionstarttime"] = group[
                "sessionstarttime"].dt.strftime("%Y-%m-%d %H:%M:%S")
            group.loc[:,
                      "sessionendtime"] = group["sessionendtime"].dt.strftime(
                          "%Y-%m-%d %H:%M:%S")
            group.loc[group.sessionendtime == "NaT", "sessionendtime"] = ""
            group.loc[:, "pulltime"] = group["pulltime"].dt.strftime(
                "%Y-%m-%d %H:%M:%S")
            group.to_csv(group_file_path, index=False)
            logging.info(
                f"Preprocessed group, {group_file_path}:\n{group.head()}")

    session.close()
Example #10
def test_etl_states(session, task_instance):

    ti = task_instance(AIRFLOW_RAW / "2020_04_01_00_00_00-v2.tsv")
    file_config = ti.xcom_pull("config", "init")
    file_name = file_config["file_name"]
    pulltime = file_config["pulltime"]

    assert ETL.can_process("pull_file", file_name, pulltime, session)

    ETL.commit_new("pull_file", file_name, pulltime, session)
    assert not ETL.can_process("pull_file", file_name, pulltime, session)
    q = ETL.get_most_recent("pull_file", file_name, pulltime, session)
    assert q.task_type == "pull_file"
    assert q.task_name == file_name
    assert pendulum.instance(q.task_timestamp) == pendulum.from_format(
        pulltime, "YYYY-MM-DD[T]HH:mm:ss")
    assert q.status == ETLStatus.ongoing

    ETL.set_status("pull_file", file_name, pulltime, "quarantine", session)
    assert ETL.can_process("pull_file", file_name, pulltime, session)
    q = ETL.get_most_recent("pull_file", file_name, pulltime, session)
    assert q.task_type == "pull_file"
    assert q.task_name == file_name
    assert pendulum.instance(q.task_timestamp) == pendulum.from_format(
        pulltime, "YYYY-MM-DD[T]HH:mm:ss")
    assert q.status == ETLStatus.quarantine

    ETL.set_status("pull_file", file_name, pulltime, "completed", session)
    assert not ETL.can_process("pull_file", file_name, pulltime, session)
    q = ETL.get_most_recent("pull_file", file_name, pulltime, session)
    assert q.task_type == "pull_file"
    assert q.task_name == file_name
    assert pendulum.instance(q.task_timestamp) == pendulum.from_format(
        pulltime, "YYYY-MM-DD[T]HH:mm:ss")
    assert q.status == ETLStatus.completed

    with pytest.raises(Exception, match="Once a task is completed"):
        ETL.commit_new("pull_file", file_name, pulltime, session)
        ETL.set_status("quarantine", file_name, pulltime, "completed", session)
Example #11
def _mock_etl(task_type, task_name, task_timestamp, status):
    # record the call, then delegate to the real ETL.set_status;
    # etl_entries and session come from the enclosing scope
    etl_entries.append((task_type, task_name, task_timestamp, status))
    ETL.set_status(task_type, task_name, task_timestamp, status, session)
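
For context, these callables are attached to Airflow DAGs as PythonOperator tasks: the trigger callables pull the XComs pushed by the "sense" task of the same DAG run and fire one downstream run per unprocessed item. A minimal sketch of the file-sensing DAG under Airflow 1.x follows; the dag_id, schedule, and import path are illustrative assumptions, not taken from the project.

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

# hypothetical module path; the callables are the ones shown in the examples above
from etl_callables import sense_callable, trigger_etl_callable

default_args = {"owner": "airflow", "retries": 0}

with DAG(
        dag_id="sense_files",                     # illustrative name
        default_args=default_args,
        start_date=datetime(2020, 4, 1),
        schedule_interval=timedelta(minutes=15),  # illustrative schedule
        catchup=False,
) as dag:
    # pushes the "files" list and one config dict per file to XCom
    sense = PythonOperator(
        task_id="sense",          # must match task_ids="sense" in the xcom_pull calls
        python_callable=sense_callable,
        provide_context=True,     # Airflow 1.x: exposes the task instance as kwargs["ti"]
    )
    # pulls those XComs and triggers one "etl" DAG run per new file
    trigger = PythonOperator(
        task_id="trigger_etl",
        python_callable=trigger_etl_callable,
        provide_context=True,
    )

    sense >> trigger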