Exemple #1
0
    def run(params, **kwargs):
        source_connector = grab_connector(__file__,
                                          params.get("source-connector"))
        postgres_connector = grab_connector(__file__, "postgres-local")

        step1 = DownloadStep(connector=source_connector)
        step2 = TransformStep()
        step3 = LoadStep("brazil_ncm", postgres_connector, if_exists="append")

        pipeline = AdvancedPipelineExecutor(params)
        pipeline = pipeline.next(step1).next(step2).next(step3)

        return pipeline.run_pipeline()
Exemple #2
0
    def run(params, **kwargs):
        source_connector = grab_connector(__file__,
                                          params.get("source-connector"))
        postgres_connector = grab_connector(__file__, "postgres-local")

        step1 = DownloadStep(connector=source_connector)
        step2 = LoadStep(params.get("dim") + "_table",
                         postgres_connector,
                         if_exists="replace")

        pipeline = AdvancedPipelineExecutor(params)
        pipeline = pipeline.next(step1).next(step2)

        return pipeline.run_pipeline()
Exemple #3
0
    def run(params, **kwargs):
        source_connector = grab_connector(__file__, params.get("source"))
        db_connector = grab_connector(__file__, params.get("db"))

        step1 = DownloadStep(connector=source_connector)
        step2 = TransformStep()
        step3 = LoadStep("trade_i_mdic_m_hs",
                         db_connector,
                         if_exists="append",
                         pk=["time_id"])

        pipeline = AdvancedPipelineExecutor(params)
        pipeline = pipeline.next(step1).next(step2).next(step3)

        return pipeline.run_pipeline()
    def steps():
        source = grab_connector(__file__, "http-local")
        db = grab_connector(__file__, "clickhouse-local")

        step1 = DownloadStep(connector=source, connector_path=__file__)
        step2 = TransformationStep()

        dtypes = {"int_column": "UInt32", "str_column": "String"}

        step3 = LoadStep("table_name",
                         connector=db,
                         pk=["primary_key"],
                         dtype=dtypes,
                         nullable_list=["null_col"])

        return [step1, step2, step3]
Exemple #5
0
    def steps(params):
        db_connector = grab_connector(__file__, params.get("output-db"))

        open_step = OpenStep()
        tidy_step = TidyStep()

        region_step = RegionDimensionStep()
        load_region = LoadStep(table_name="tic_dim_region",
                               connector=db_connector,
                               if_exists="drop",
                               pk=["region_id"],
                               dtype={
                                   "region_id": "UInt8",
                                   "region_name": "String"
                               },
                               nullable_list=[])

        variable_step = VariableDimensionStep()
        load_variable = LoadStep(table_name="tic_dim_variable",
                                 connector=db_connector,
                                 if_exists="drop",
                                 pk=["response_id"],
                                 dtype={
                                     "response_id": "UInt8",
                                     "variable_name": "String",
                                     "response_name": "String",
                                     "combined": "String"
                                 },
                                 nullable_list=[])

        fact_step = FactTableStep()
        load_fact = LoadStep(table_name="tic_fact",
                             connector=db_connector,
                             if_exists="drop",
                             pk=["region_id"],
                             dtype={
                                 "region_id": "UInt8",
                                 "data_origin_id": "UInt8",
                                 "response_id": "UInt8",
                                 "year": "UInt8",
                                 "percentage": "Float64"
                             },
                             nullable_list=[])

        if params.get("ingest") == True:
            steps = [
                open_step, tidy_step, region_step, load_region, variable_step,
                load_variable, fact_step, load_fact
            ]
        else:
            steps = [
                open_step, tidy_step, region_step, variable_step, fact_step
            ]

        return steps
Exemple #6
0
    def steps(params):
        connector = grab_connector('etl/budget_transparency/conns.yaml',
                                   'budget-transparency-wildcard')
        db_connector = Connector.fetch("clickhouse-database",
                                       open("etl/conns.yaml"))

        download_step = WildcardDownloadStep(connector=connector)

        read_step = ReadStep(save_result_key="global_df")

        dim_fun_step = DimFunStep(save_result_key="dim_fun")

        dim_dep_step = DimDepStep(save_result_key="dim_dep")

        dim_exp_step = DimExpTypeStep(save_result_key="dim_exp")

        dim_cap_step = DimCapStep(save_result_key="dim_cap")

        fact_step = FactStep()

        ld_fun = LoadStep('dim_fun_budget_transparency',
                          db_connector,
                          if_exists="drop",
                          pk=['subfunction_id'],
                          dtype=dt_fun)

        ld_dep = LoadStep('dim_dep_budget_transparency',
                          db_connector,
                          if_exists="drop",
                          pk=['department_id'],
                          dtype=dt_dep)

        ld_exp = LoadStep('dim_exp_budget_transparency',
                          db_connector,
                          if_exists="drop",
                          pk=['exp_type_id'],
                          dtype=dt_exp)

        ld_cap = LoadStep('dim_cap_budget_transparency',
                          db_connector,
                          if_exists="drop",
                          pk=['concept_id'],
                          dtype=dt_cap)

        ld_fact = LoadStep('budget_transparency',
                           db_connector,
                           if_exists="drop",
                           pk=['quarter_id'],
                           dtype=dt_fact)

        return [
            download_step, read_step, dim_fun_step, ld_fun, dim_dep_step,
            ld_dep, dim_exp_step, ld_exp, dim_cap_step, ld_cap, fact_step,
            ld_fact
        ]
Exemple #7
0
    def steps(params):
        db_connector = grab_connector(__file__, params.get("output-db"))

        open_step = OpenStep()
        transform_step = TransformStep()
        load_step = LoadStep(table_name="wakanda_trade",
                             connector=db_connector,
                             if_exists="append",
                             pk=["time_id"])

        return [open_step, transform_step, load_step]
    def steps(params):
        source_connector = grab_connector(__file__, params.get("input-file"))
        db_connector = grab_connector(__file__, params.get("output-db"))

        country_step = CountryDimStep()

        load_country = LoadStep(table_name="dim_country",
                                connector=db_connector,
                                if_exists="drop",
                                pk=["country_id"],
                                dtype={
                                    "country_id": "Int64",
                                    "country_name": "String",
                                    "country_code": "String"
                                },
                                nullable_list=[])

        download_step = DownloadStep(connector=source_connector, force=False)

        transform_step = TransformStep()

        load_step = LoadStep(table_name="gdp_fact",
                             connector=db_connector,
                             if_exists="drop",
                             pk=["year"],
                             dtype={
                                 "year": "Int64",
                                 "country_id": "Int64",
                                 "gdp_value": "Float64"
                             },
                             nullable_list=[])

        steps = [
            country_step, load_country, download_step, transform_step,
            load_step
        ] if params.get("ingest") else [
            country_step, download_step, transform_step
        ]

        return steps
    def steps(params):
        db_connector = grab_connector(__file__, params.get("db"))

        open_step = OpenStep()
        transform_step = TransformStep()
        load_step = LoadStep(table_name="unemployment_claims",
                             connector=db_connector,
                             if_exists="append",
                             pk=["week_ended"])

        steps = [open_step, transform_step, load_step
                 ] if params.get("ingest") else [open_step, transform_step]

        return steps
Exemple #10
0
    def steps(params):

        connector = grab_connector('etl/insurers/conns.yaml', 'insurers')
        db_connector = Connector.fetch("clickhouse-database", open("etl/conns.yaml"))
        
        dl_step = DownloadStep(
            connector=connector
        )

        read_step = ReadStep(save_result_key="global_df") 

        dim_med_supply_step = DimMedicalSupply(save_result_key="dim_medical_supply")

        ld_dim_step = LoadStep('dim_insurers_med_supply', db_connector, if_exists="drop", pk=['supply_id'], dtype=dt_dim_med_supply)

        x_step = XformStep()

        ld_fact_step = LoadStep('insurers', db_connector, if_exists="drop", pk=['patient_id', 'ingress_date', 'egress_date', 'supply_id', 'hospital_group', 'diagnosis'], dtype=dt_insurers_fact)

        return [dl_step, read_step, dim_med_supply_step, ld_dim_step, x_step, ld_fact_step]
    def run(params, **kwargs):
        db_connector = grab_connector(__file__, params.get("db"))

        step1 = TimeCreationStep()
        step2 = LoadStep("dim_shared_time",
                         db_connector,
                         if_exists="replace",
                         pk=["time_id"])
        step3 = ProductCreationStep()
        step4 = LoadStep("dim_shared_products",
                         db_connector,
                         if_exists="replace",
                         pk=["hs4_id"])
        step5 = CountryCreationStep()
        step6 = LoadStep("dim_shared_countries",
                         db_connector,
                         if_exists="replace",
                         pk=["id"])
        step7 = StateCreationStep()
        step8 = LoadStep("dim_shared_states",
                         db_connector,
                         if_exists="replace",
                         pk=["state_id"])
        step9 = MunicipalityCreationStep()
        step10 = LoadStep("dim_shared_municipalities",
                          db_connector,
                          if_exists="replace",
                          pk=["municipality_id"])
        step11 = FlowCreationStep()
        step12 = LoadStep("dim_shared_flow",
                          db_connector,
                          if_exists="replace",
                          pk=["flow_id"])

        pipeline = AdvancedPipelineExecutor(params)
        pipeline = pipeline.next(step1).next(step2).next(step3).next(
            step4).next(step5).next(step6).next(step7).next(step8).next(
                step9).next(step10).next(step11).next(step12)

        return pipeline.run_pipeline()
Exemple #12
0
    def steps(params):
        source_connector = grab_connector(__file__, "resources-data")
        db_connector = Connector.fetch("clickhouse-database", open("../conns.yaml"))
        dtype = {
            "mun_id":                           "UInt16",
            "clues_id":                         "String",
            "type_id":                          "String",
            "year":                             "UInt16",
            "resources_subcategories_id":       "UInt16",
            "total":                            "UInt32"
        }

        download_step = DownloadStep(
            connector=source_connector,
            force=True
        )

        transform_step = TransformStep()

        load_step = LoadStep("health_resources", db_connector, if_exists="append", pk=["mun_id"], dtype=dtype)

        return [download_step, transform_step, load_step]
Exemple #13
0
    def steps(params):
        source_connector = grab_connector("../conns.yaml", "fincen-source")
        #db_connector = grab_connector("../conns.yaml", params.get("output-db"))

        dl_step = DownloadStep(connector=source_connector)
        unzip_step = UnzipStep(pattern=r"\.csv$")
        date_step = DateStep()

        #load_fact = LoadStep(
        #    table_name="tic_fact",
        #    connector=db_connector,
        #    if_exists="drop",
        #    pk=["region_id"],
        #    dtype={"region_id":"UInt8","data_origin_id":"UInt8","response_id":"UInt8","year":"UInt8","percentage":"Float64"},
        #    nullable_list=[]
        #)

        if params.get("ingest") == True:
            steps = []
            #[open_step, tidy_step, region_step, load_region, variable_step, load_variable, fact_step, load_fact]
        else:
            steps = [dl_step, unzip_step, date_step]

        return steps
import pandas as pd
from tabulate import tabulate
from bamboo_lib.helpers import grab_connector, query_to_df

db_connector = grab_connector(__file__, "clickhouse-remote")

# Get table names
names_df = query_to_df(db_connector, "SHOW TABLES")
tables = names_df["name"].tolist()

# Print row count
for t in [table for table in tables if "acs" in table]:
    print("\nTABLE: {}".format(t))

    query = "SELECT year, COUNT(*) FROM {} GROUP BY year ORDER BY year;".format(
        t)
    df = query_to_df(db_connector, query, col_headers=["year", "count"])
    new_df = pd.DataFrame({
        "year": df["year"].tolist(),
        "count": ["{:,}".format(int(c)) for c in df["count"]]
    })
    print(tabulate(new_df, headers="keys", tablefmt="psql", showindex=False))