Example #1
def create_or_append_table(context, dataset_id, table_id, path):
    tb = Table(table_id=table_id, dataset_id=dataset_id)
    if not tb.table_exists('staging'):
        context.log.info("Table does not exist in STAGING, creating table...")
        tb.create(
            path=path,
            if_table_exists="pass",
            if_storage_data_exists="replace",
            if_table_config_exists="pass",
        )
        context.log.info("Table created in STAGING")
    else:
        context.log.info("Table already exists in STAGING, appending to it...")
        tb.append(filepath=path,
                  if_exists="replace",
                  timeout=600,
                  chunk_size=1024 * 1024 * 10)
        context.log.info("Appended to table on STAGING successfully.")

    if not tb.table_exists("prod"):
        context.log.info("Table does not exist in PROD, publishing...")
        tb.publish(if_exists="pass")
        context.log.info("Published table in PROD successfully.")
    else:
        context.log.info("Table already published in PROD.")
Example #2
def test_update_raises(metadatadir, sample_data, capsys):
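    """Swap in a broken table_config.yaml and check that Table.update raises,
    with the error output naming the offending files."""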

    table_part = Table(
        dataset_id=DATASET_ID,
        table_id=TABLE_ID + "_partitioned",
        metadata_path=metadatadir,
    )

    shutil.copy(
        sample_data / "table_config_part_wrong.yaml",
        metadatadir / DATASET_ID / "pytest_partitioned" / "table_config.yaml",
    )

    with pytest.raises(Exception):
        table_part.update("all")
    out, _ = capsys.readouterr()
    assert "publish.sql" in out

    shutil.copy(
        sample_data / "publish_part.sql",
        table_part.table_folder / "publish.sql",
    )
    shutil.copy(
        sample_data / "table_config.yaml",
        table_part.table_folder / "table_config.yaml",
    )

    with pytest.raises(Exception):
        table_part.update("all")
    out, _ = capsys.readouterr()
    assert "table_config.yaml" in out
Example #3
def table(metadatadir):
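    """Build a Table bound to the temporary metadata directory and refresh
    its local template files before returning it."""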

    t = Table(dataset_id=DATASET_ID,
              table_id=TABLE_ID,
              metadata_path=metadatadir)
    t._refresh_templates()
    return t
Example #4
def upload_logs_to_bq(context, timestamp, error):

    dataset_id = context.resources.basedosdados_config['dataset_id']
    table_id = context.resources.basedosdados_config['table_id'] + "_logs"

    filepath = Path(
        f"{timestamp}/{table_id}/data={pendulum.parse(timestamp).date()}/{table_id}_{timestamp}.csv")
    # create partition directory
    filepath.parent.mkdir(exist_ok=True, parents=True)
    # create dataframe to be uploaded
    df = pd.DataFrame(
        {
            "timestamp_captura": [pd.to_datetime(timestamp)],
            "sucesso": [error is None],
            "erro": [error],
        }
    )
    # save local
    df.to_csv(filepath, index=False)
    # BD Table object
    tb = Table(table_id, dataset_id)
    # create and publish if table does not exist, append to it otherwise
    if not tb.table_exists("staging"):
        tb.create(
            path=f"{timestamp}/{table_id}",
            if_table_exists="replace",
            if_storage_data_exists="replace",
            if_table_config_exists="pass",
        )
    elif not tb.table_exists("prod"):
        tb.publish(if_exists="replace")
    else:
        tb.append(filepath=f"{timestamp}/{table_id}", if_exists='replace')

    # delete local file
    shutil.rmtree(f"{timestamp}")
Example #5
def test_create_auto_partitions(metadatadir, data_path, sample_data):
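    """Rebuild the partitioned table end to end: wipe old state, re-init the
    metadata, drop in a valid config and publish query, lay out Hive-style
    partition folders, then create the table in staging and publish to prod."""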
    shutil.rmtree(metadatadir / "partitions", ignore_errors=True)

    table_part = Table(
        dataset_id=DATASET_ID,
        table_id=TABLE_ID + "_partitioned",
        metadata_path=metadatadir,
    )

    table_part.delete("all")

    table_part.init(
        data_sample_path=data_path,
        if_folder_exists="replace",
        if_table_config_exists="replace",
    )

    Path(metadatadir / "partitions").mkdir()

    shutil.copy(
        sample_data / "table_config_part.yaml",
        table_part.table_folder / "table_config.yaml",
    )
    shutil.copy(
        sample_data / "publish_part.sql",
        table_part.table_folder / "publish.sql",
    )
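    # create two Hive-style partition folders (keys=1 and keys=2), each holding the sample CSV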
    for n in [1, 2]:
        Path(metadatadir / "partitions" / f"keys={n}").mkdir()
        shutil.copy(
            metadatadir / "municipios.csv",
            metadatadir / "partitions" / f"keys={n}" / "municipios.csv",
        )

    table_part.create(
        metadatadir / "partitions",
        partitioned=True,
        if_table_exists="replace",
        if_table_config_exists="pass",
        if_storage_data_exists="replace",
    )
    assert table_exists(table_part, "staging")

    table_part.publish()

    assert table_exists(table_part, "prod")
Example #6
def upload_to_bq(context, paths):
    for key in paths.keys():
        context.log.info("#" * 80)
        context.log.info(f"KEY = {key}")
        tb = Table(
            key,
            context.resources.basedosdados_config["dataset_id"],
        )
        tb_dir = paths[key].parent.parent
        context.log.info(f"tb_dir = {tb_dir}")

        if not tb.table_exists("staging"):
            context.log.info(
                "Table does not exist in STAGING, creating table...")
            tb.create(
                path=tb_dir,
                if_table_exists="pass",
                if_storage_data_exists="replace",
                if_table_config_exists="pass",
            )
            context.log.info("Table created in STAGING")
        else:
            context.log.info(
                "Table already exists in STAGING, appending to it...")
            tb.append(filepath=tb_dir,
                      if_exists="replace",
                      timeout=600,
                      chunk_size=1024 * 1024 * 10)
            context.log.info("Appended to table on STAGING successfully.")

        if not tb.table_exists("prod"):
            context.log.info("Table does not exist in PROD, publishing...")
            tb.publish(if_exists="pass")
            context.log.info("Published table in PROD successfully.")
        else:
            context.log.info("Table already published in PROD.")
    context.log.info(f"Returning -> {tb_dir.parent}")

    return tb_dir.parent
Example #7
def create_or_append_table(context, csv_path, which_table, _df, date):
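    """Append new rows to an existing prod table after dropping records that
    overlap with what is already stored for the previous day; if the table
    does not exist yet, create it from the full dataframe and publish it."""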
    table_obj = Table(
        dataset_id=context.resources.basedosdados_config["dataset_id"],
        table_id=which_table,
    )
    query = f"""SELECT * FROM {table_obj.table_full_name['prod']} as t
            """
    if which_table == "realized_trips":
        query += f"""WHERE EXTRACT(DATE FROM t.departure_datetime) = DATE_SUB(DATE("{date}"), INTERVAL 1 DAY)"""
    if which_table == "unplanned":
        query += f"""WHERE DATE(t.dia) = DATE_SUB(DATE("{date}"), INTERVAL 1 DAY)"""

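    # check whether the table already exists in prod (NotFound means it does not)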
    try:
        ref = table_obj._get_table_obj("prod")
    except google.api_core.exceptions.NotFound:
        ref = None
    if ref:
        savepath = f"tmp_data/{which_table}_{date}_from_bq.csv"
        bd.download(
            savepath=savepath,
            query=query,
            billing_project_id=context.resources.bd_client.project,
            from_file=True,
            index=False,
        )

        tb = pd.read_csv(savepath)
        df = drop_overlap(tb, _df)
        df.to_csv(csv_path, index=False)

        table_obj.append(csv_path, if_exists="replace")
    else:
        _df.to_csv(csv_path, index=False)
        table_obj.create(csv_path,
                         if_table_config_exists="pass",
                         if_storage_data_exists="replace")
        table_obj.publish(if_exists="replace")
Example #8
def test_create_auto_partitions(metadatadir):
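    """Create the table directly from a folder of pre-built partitions and
    publish it to prod."""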

    table_part = Table(
        dataset_id=DATASET_ID,
        table_id=TABLE_ID + "_autopartitioned",
        metadata_path=metadatadir,
    )

    table_part.create("tests/sample_data/partitions",
                      partitioned=True,
                      if_exists="replace")

    table_part.publish()